diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index c0500ec003f6..da6744b76b86 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -1,7546 +1,7550 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2011, 2015 by Delphix. All rights reserved. * Copyright (c) 2012 by Frederik Wessels. All rights reserved. * Copyright (c) 2012 by Cyril Plisko. All rights reserved. * Copyright (c) 2013 by Prasad Joshi (sTec). All rights reserved. * Copyright 2016 Igor Kozhukhov . */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zpool_util.h" #include "zfs_comutil.h" #include "zfeature_common.h" #include "statcommon.h" static int zpool_do_create(int, char **); static int zpool_do_destroy(int, char **); static int zpool_do_add(int, char **); static int zpool_do_remove(int, char **); static int zpool_do_labelclear(int, char **); static int zpool_do_list(int, char **); static int zpool_do_iostat(int, char **); static int zpool_do_status(int, char **); static int zpool_do_online(int, char **); static int zpool_do_offline(int, char **); static int zpool_do_clear(int, char **); static int zpool_do_reopen(int, char **); static int zpool_do_reguid(int, char **); static int zpool_do_attach(int, char **); static int zpool_do_detach(int, char **); static int zpool_do_replace(int, char **); static int zpool_do_split(int, char **); static int zpool_do_scrub(int, char **); static int zpool_do_import(int, char **); static int zpool_do_export(int, char **); static int zpool_do_upgrade(int, char **); static int zpool_do_history(int, char **); static int zpool_do_events(int, char **); static int zpool_do_get(int, char **); static int zpool_do_set(int, char **); /* * These libumem hooks provide a reasonable set of defaults for the allocator's * debugging facilities. 
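 * These hooks are picked up by libumem in debug builds; the $UMEM_DEBUG
 * and $UMEM_LOGGING environment variables, when set, would be expected
 * to take precedence over the defaults returned below.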
*/ #ifdef DEBUG const char * _umem_debug_init(void) { return ("default,verbose"); /* $UMEM_DEBUG setting */ } const char * _umem_logging_init(void) { return ("fail,contents"); /* $UMEM_LOGGING setting */ } #endif typedef enum { HELP_ADD, HELP_ATTACH, HELP_CLEAR, HELP_CREATE, HELP_DESTROY, HELP_DETACH, HELP_EXPORT, HELP_HISTORY, HELP_IMPORT, HELP_IOSTAT, HELP_LABELCLEAR, HELP_LIST, HELP_OFFLINE, HELP_ONLINE, HELP_REPLACE, HELP_REMOVE, HELP_SCRUB, HELP_STATUS, HELP_UPGRADE, HELP_EVENTS, HELP_GET, HELP_SET, HELP_SPLIT, HELP_REGUID, HELP_REOPEN } zpool_help_t; /* * Flags for stats to display with "zpool iostats" */ enum iostat_type { IOS_DEFAULT = 0, IOS_LATENCY = 1, IOS_QUEUES = 2, IOS_L_HISTO = 3, IOS_RQ_HISTO = 4, IOS_COUNT, /* always last element */ }; /* iostat_type entries as bitmasks */ #define IOS_DEFAULT_M (1ULL << IOS_DEFAULT) #define IOS_LATENCY_M (1ULL << IOS_LATENCY) #define IOS_QUEUES_M (1ULL << IOS_QUEUES) #define IOS_L_HISTO_M (1ULL << IOS_L_HISTO) #define IOS_RQ_HISTO_M (1ULL << IOS_RQ_HISTO) /* Mask of all the histo bits */ #define IOS_ANYHISTO_M (IOS_L_HISTO_M | IOS_RQ_HISTO_M) /* * Lookup table for iostat flags to nvlist names. Basically a list * of all the nvlists a flag requires. Also specifies the order in * which data gets printed in zpool iostat. */ static const char *vsx_type_to_nvlist[IOS_COUNT][11] = { [IOS_L_HISTO] = { ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_SYNC_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_SYNC_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO, NULL}, [IOS_LATENCY] = { ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO, NULL}, [IOS_QUEUES] = { ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_SYNC_W_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE, NULL}, [IOS_RQ_HISTO] = { ZPOOL_CONFIG_VDEV_SYNC_IND_R_HISTO, ZPOOL_CONFIG_VDEV_SYNC_AGG_R_HISTO, ZPOOL_CONFIG_VDEV_SYNC_IND_W_HISTO, ZPOOL_CONFIG_VDEV_SYNC_AGG_W_HISTO, ZPOOL_CONFIG_VDEV_ASYNC_IND_R_HISTO, ZPOOL_CONFIG_VDEV_ASYNC_AGG_R_HISTO, ZPOOL_CONFIG_VDEV_ASYNC_IND_W_HISTO, ZPOOL_CONFIG_VDEV_ASYNC_AGG_W_HISTO, ZPOOL_CONFIG_VDEV_IND_SCRUB_HISTO, ZPOOL_CONFIG_VDEV_AGG_SCRUB_HISTO, NULL}, }; /* * Given a cb->cb_flags with a histogram bit set, return the iostat_type. * Right now, only one histo bit is ever set at one time, so we can * just do a highbit64(a) */ #define IOS_HISTO_IDX(a) (highbit64(a & IOS_ANYHISTO_M) - 1) typedef struct zpool_command { const char *name; int (*func)(int, char **); zpool_help_t usage; } zpool_command_t; /* * Master command table. Each ZFS command has a name, associated function, and * usage message. The usage messages need to be internationalized, so we have * to have a function to return the usage message based on a command index. * * These commands are organized according to how they are displayed in the usage * message. An empty command (one with a NULL name) indicates an empty line in * the generic usage message. 
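 * For example, the { NULL } entry between "destroy" and "add" in the
 * table below prints as a blank line in the full usage output.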
*/ static zpool_command_t command_table[] = { { "create", zpool_do_create, HELP_CREATE }, { "destroy", zpool_do_destroy, HELP_DESTROY }, { NULL }, { "add", zpool_do_add, HELP_ADD }, { "remove", zpool_do_remove, HELP_REMOVE }, { NULL }, { "labelclear", zpool_do_labelclear, HELP_LABELCLEAR }, { NULL }, { "list", zpool_do_list, HELP_LIST }, { "iostat", zpool_do_iostat, HELP_IOSTAT }, { "status", zpool_do_status, HELP_STATUS }, { NULL }, { "online", zpool_do_online, HELP_ONLINE }, { "offline", zpool_do_offline, HELP_OFFLINE }, { "clear", zpool_do_clear, HELP_CLEAR }, { "reopen", zpool_do_reopen, HELP_REOPEN }, { NULL }, { "attach", zpool_do_attach, HELP_ATTACH }, { "detach", zpool_do_detach, HELP_DETACH }, { "replace", zpool_do_replace, HELP_REPLACE }, { "split", zpool_do_split, HELP_SPLIT }, { NULL }, { "scrub", zpool_do_scrub, HELP_SCRUB }, { NULL }, { "import", zpool_do_import, HELP_IMPORT }, { "export", zpool_do_export, HELP_EXPORT }, { "upgrade", zpool_do_upgrade, HELP_UPGRADE }, { "reguid", zpool_do_reguid, HELP_REGUID }, { NULL }, { "history", zpool_do_history, HELP_HISTORY }, { "events", zpool_do_events, HELP_EVENTS }, { NULL }, { "get", zpool_do_get, HELP_GET }, { "set", zpool_do_set, HELP_SET }, }; #define NCOMMAND (ARRAY_SIZE(command_table)) static zpool_command_t *current_command; static char history_str[HIS_MAX_RECORD_LEN]; static boolean_t log_history = B_TRUE; static uint_t timestamp_fmt = NODATE; static const char * get_usage(zpool_help_t idx) { switch (idx) { case HELP_ADD: return (gettext("\tadd [-fgLnP] [-o property=value] " " ...\n")); case HELP_ATTACH: return (gettext("\tattach [-f] [-o property=value] " " \n")); case HELP_CLEAR: return (gettext("\tclear [-nF] [device]\n")); case HELP_CREATE: return (gettext("\tcreate [-fnd] [-o property=value] ... \n" "\t [-O file-system-property=value] ... \n" "\t [-m mountpoint] [-R root] ...\n")); case HELP_DESTROY: return (gettext("\tdestroy [-f] \n")); case HELP_DETACH: return (gettext("\tdetach \n")); case HELP_EXPORT: return (gettext("\texport [-af] ...\n")); case HELP_HISTORY: return (gettext("\thistory [-il] [] ...\n")); case HELP_IMPORT: return (gettext("\timport [-d dir] [-D]\n" "\timport [-d dir | -c cachefile] [-F [-n]] \n" "\timport [-o mntopts] [-o property=value] ... \n" "\t [-d dir | -c cachefile] [-D] [-f] [-m] [-N] " "[-R root] [-F [-n]] -a\n" "\timport [-o mntopts] [-o property=value] ... \n" "\t [-d dir | -c cachefile] [-D] [-f] [-m] [-N] " "[-R root] [-F [-n]]\n" "\t [newpool]\n")); case HELP_IOSTAT: return (gettext("\tiostat [-c CMD] [-T d | u] [-ghHLpPvy] " "[[-lq]|[-r|-w]]\n" "\t [[pool ...]|[pool vdev ...]|[vdev ...]] " "[interval [count]]\n")); case HELP_LABELCLEAR: return (gettext("\tlabelclear [-f] \n")); case HELP_LIST: return (gettext("\tlist [-gHLpPv] [-o property[,...]] " "[-T d|u] [pool] ... [interval [count]]\n")); case HELP_OFFLINE: return (gettext("\toffline [-t] ...\n")); case HELP_ONLINE: return (gettext("\tonline ...\n")); case HELP_REPLACE: return (gettext("\treplace [-f] [-o property=value] " " [new-device]\n")); case HELP_REMOVE: return (gettext("\tremove ...\n")); case HELP_REOPEN: return (gettext("\treopen \n")); case HELP_SCRUB: return (gettext("\tscrub [-s] ...\n")); case HELP_STATUS: return (gettext("\tstatus [-c CMD] [-gLPvxD] [-T d|u] [pool]" " ... 
[interval [count]]\n")); case HELP_UPGRADE: return (gettext("\tupgrade\n" "\tupgrade -v\n" "\tupgrade [-V version] <-a | pool ...>\n")); case HELP_EVENTS: return (gettext("\tevents [-vHfc]\n")); case HELP_GET: return (gettext("\tget [-Hp] [-o \"all\" | field[,...]] " "<\"all\" | property[,...]> ...\n")); case HELP_SET: return (gettext("\tset \n")); case HELP_SPLIT: return (gettext("\tsplit [-gLnP] [-R altroot] [-o mntopts]\n" "\t [-o property=value] " "[ ...]\n")); case HELP_REGUID: return (gettext("\treguid \n")); } abort(); /* NOTREACHED */ } /* * Callback routine that will print out a pool property value. */ static int print_prop_cb(int prop, void *cb) { FILE *fp = cb; (void) fprintf(fp, "\t%-15s ", zpool_prop_to_name(prop)); if (zpool_prop_readonly(prop)) (void) fprintf(fp, " NO "); else (void) fprintf(fp, " YES "); if (zpool_prop_values(prop) == NULL) (void) fprintf(fp, "-\n"); else (void) fprintf(fp, "%s\n", zpool_prop_values(prop)); return (ZPROP_CONT); } /* * Display usage message. If we're inside a command, display only the usage for * that command. Otherwise, iterate over the entire command table and display * a complete usage message. */ void usage(boolean_t requested) { FILE *fp = requested ? stdout : stderr; if (current_command == NULL) { int i; (void) fprintf(fp, gettext("usage: zpool command args ...\n")); (void) fprintf(fp, gettext("where 'command' is one of the following:\n\n")); for (i = 0; i < NCOMMAND; i++) { if (command_table[i].name == NULL) (void) fprintf(fp, "\n"); else (void) fprintf(fp, "%s", get_usage(command_table[i].usage)); } } else { (void) fprintf(fp, gettext("usage:\n")); (void) fprintf(fp, "%s", get_usage(current_command->usage)); } if (current_command != NULL && ((strcmp(current_command->name, "set") == 0) || (strcmp(current_command->name, "get") == 0) || (strcmp(current_command->name, "list") == 0))) { (void) fprintf(fp, gettext("\nthe following properties are supported:\n")); (void) fprintf(fp, "\n\t%-15s %s %s\n\n", "PROPERTY", "EDIT", "VALUES"); /* Iterate over all properties */ (void) zprop_iter(print_prop_cb, fp, B_FALSE, B_TRUE, ZFS_TYPE_POOL); (void) fprintf(fp, "\t%-15s ", "feature@..."); (void) fprintf(fp, "YES disabled | enabled | active\n"); (void) fprintf(fp, gettext("\nThe feature@ properties must be " "appended with a feature name.\nSee zpool-features(5).\n")); } /* * See comments at end of main(). */ if (getenv("ZFS_ABORT") != NULL) { (void) printf("dumping core by request\n"); abort(); } exit(requested ? 0 : 2); } void print_vdev_tree(zpool_handle_t *zhp, const char *name, nvlist_t *nv, int indent, boolean_t print_logs, int name_flags) { nvlist_t **child; uint_t c, children; char *vname; if (name != NULL) (void) printf("\t%*s%s\n", indent, "", name); if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) return; for (c = 0; c < children; c++) { uint64_t is_log = B_FALSE; (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &is_log); if ((is_log && !print_logs) || (!is_log && print_logs)) continue; vname = zpool_vdev_name(g_zfs, zhp, child[c], name_flags); print_vdev_tree(zhp, vname, child[c], indent + 2, B_FALSE, name_flags); free(vname); } } static boolean_t prop_list_contains_feature(nvlist_t *proplist) { nvpair_t *nvp; for (nvp = nvlist_next_nvpair(proplist, NULL); NULL != nvp; nvp = nvlist_next_nvpair(proplist, nvp)) { if (zpool_prop_feature(nvpair_name(nvp))) return (B_TRUE); } return (B_FALSE); } /* * Add a property pair (name, string-value) into a property nvlist. 
*/ static int add_prop_list(const char *propname, char *propval, nvlist_t **props, boolean_t poolprop) { zpool_prop_t prop = ZPROP_INVAL; zfs_prop_t fprop; nvlist_t *proplist; const char *normnm; char *strval; if (*props == NULL && nvlist_alloc(props, NV_UNIQUE_NAME, 0) != 0) { (void) fprintf(stderr, gettext("internal error: out of memory\n")); return (1); } proplist = *props; if (poolprop) { const char *vname = zpool_prop_to_name(ZPOOL_PROP_VERSION); if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL && !zpool_prop_feature(propname)) { (void) fprintf(stderr, gettext("property '%s' is " "not a valid pool property\n"), propname); return (2); } /* * feature@ properties and version should not be specified * at the same time. */ if ((prop == ZPROP_INVAL && zpool_prop_feature(propname) && nvlist_exists(proplist, vname)) || (prop == ZPOOL_PROP_VERSION && prop_list_contains_feature(proplist))) { (void) fprintf(stderr, gettext("'feature@' and " "'version' properties cannot be specified " "together\n")); return (2); } if (zpool_prop_feature(propname)) normnm = propname; else normnm = zpool_prop_to_name(prop); } else { if ((fprop = zfs_name_to_prop(propname)) != ZPROP_INVAL) { normnm = zfs_prop_to_name(fprop); } else { normnm = propname; } } if (nvlist_lookup_string(proplist, normnm, &strval) == 0 && prop != ZPOOL_PROP_CACHEFILE) { (void) fprintf(stderr, gettext("property '%s' " "specified multiple times\n"), propname); return (2); } if (nvlist_add_string(proplist, normnm, propval) != 0) { (void) fprintf(stderr, gettext("internal " "error: out of memory\n")); return (1); } return (0); } /* * Set a default property pair (name, string-value) in a property nvlist */ static int add_prop_list_default(const char *propname, char *propval, nvlist_t **props, boolean_t poolprop) { char *pval; if (nvlist_lookup_string(*props, propname, &pval) == 0) return (0); return (add_prop_list(propname, propval, props, B_TRUE)); } /* * zpool add [-fgLnP] [-o property=value] ... * * -f Force addition of devices, even if they appear in use * -g Display guid for individual vdev name. * -L Follow links when resolving vdev path name. * -n Do not add the devices, but display the resulting layout if * they were to be added. * -o Set property=value. * -P Display full path for vdev name. * * Adds the given vdevs to 'pool'. As with create, the bulk of this work is * handled by get_vdev_spec(), which constructs the nvlist needed to pass to * libzfs. 
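 *
 * For example, 'zpool add -n tank mirror sdb sdc' (device names
 * illustrative) only prints the layout that would result, since -n
 * suppresses the actual addition.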
*/ int zpool_do_add(int argc, char **argv) { boolean_t force = B_FALSE; boolean_t dryrun = B_FALSE; int name_flags = 0; int c; nvlist_t *nvroot; char *poolname; int ret; zpool_handle_t *zhp; nvlist_t *config; nvlist_t *props = NULL; char *propval; /* check options */ while ((c = getopt(argc, argv, "fgLno:P")) != -1) { switch (c) { case 'f': force = B_TRUE; break; case 'g': name_flags |= VDEV_NAME_GUID; break; case 'L': name_flags |= VDEV_NAME_FOLLOW_LINKS; break; case 'n': dryrun = B_TRUE; break; case 'o': if ((propval = strchr(optarg, '=')) == NULL) { (void) fprintf(stderr, gettext("missing " "'=' for -o option\n")); usage(B_FALSE); } *propval = '\0'; propval++; if ((strcmp(optarg, ZPOOL_CONFIG_ASHIFT) != 0) || (add_prop_list(optarg, propval, &props, B_TRUE))) usage(B_FALSE); break; case 'P': name_flags |= VDEV_NAME_PATH; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); usage(B_FALSE); } if (argc < 2) { (void) fprintf(stderr, gettext("missing vdev specification\n")); usage(B_FALSE); } poolname = argv[0]; argc--; argv++; if ((zhp = zpool_open(g_zfs, poolname)) == NULL) return (1); if ((config = zpool_get_config(zhp, NULL)) == NULL) { (void) fprintf(stderr, gettext("pool '%s' is unavailable\n"), poolname); zpool_close(zhp); return (1); } /* pass off to get_vdev_spec for processing */ nvroot = make_root_vdev(zhp, props, force, !force, B_FALSE, dryrun, argc, argv); if (nvroot == NULL) { zpool_close(zhp); return (1); } if (dryrun) { nvlist_t *poolnvroot; nvlist_t **l2child; uint_t l2children, c; char *vname; boolean_t hadcache = B_FALSE; verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &poolnvroot) == 0); (void) printf(gettext("would update '%s' to the following " "configuration:\n"), zpool_get_name(zhp)); /* print original main pool and new tree */ print_vdev_tree(zhp, poolname, poolnvroot, 0, B_FALSE, name_flags); print_vdev_tree(zhp, NULL, nvroot, 0, B_FALSE, name_flags); /* Do the same for the logs */ if (num_logs(poolnvroot) > 0) { print_vdev_tree(zhp, "logs", poolnvroot, 0, B_TRUE, name_flags); print_vdev_tree(zhp, NULL, nvroot, 0, B_TRUE, name_flags); } else if (num_logs(nvroot) > 0) { print_vdev_tree(zhp, "logs", nvroot, 0, B_TRUE, name_flags); } /* Do the same for the caches */ if (nvlist_lookup_nvlist_array(poolnvroot, ZPOOL_CONFIG_L2CACHE, &l2child, &l2children) == 0 && l2children) { hadcache = B_TRUE; (void) printf(gettext("\tcache\n")); for (c = 0; c < l2children; c++) { vname = zpool_vdev_name(g_zfs, NULL, l2child[c], name_flags); (void) printf("\t %s\n", vname); free(vname); } } if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2child, &l2children) == 0 && l2children) { if (!hadcache) (void) printf(gettext("\tcache\n")); for (c = 0; c < l2children; c++) { vname = zpool_vdev_name(g_zfs, NULL, l2child[c], name_flags); (void) printf("\t %s\n", vname); free(vname); } } ret = 0; } else { ret = (zpool_add(zhp, nvroot) != 0); } nvlist_free(props); nvlist_free(nvroot); zpool_close(zhp); return (ret); } /* * zpool remove ... * * Removes the given vdev from the pool. Currently, this supports removing * spares, cache, and log devices from the pool. 
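 *
 * For example, 'zpool remove tank sdc' (names illustrative) removes
 * the hot spare, cache, or log device 'sdc' from pool 'tank'.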
*/ int zpool_do_remove(int argc, char **argv) { char *poolname; int i, ret = 0; zpool_handle_t *zhp = NULL; argc--; argv++; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); usage(B_FALSE); } if (argc < 2) { (void) fprintf(stderr, gettext("missing device\n")); usage(B_FALSE); } poolname = argv[0]; if ((zhp = zpool_open(g_zfs, poolname)) == NULL) return (1); for (i = 1; i < argc; i++) { if (zpool_vdev_remove(zhp, argv[i]) != 0) ret = 1; } zpool_close(zhp); return (ret); } /* * zpool labelclear * * Verifies that the vdev is not active and zeros out the label information * on the device. */ int zpool_do_labelclear(int argc, char **argv) { char *vdev, *name; int c, fd = -1, ret = 0; pool_state_t state; boolean_t inuse = B_FALSE; boolean_t force = B_FALSE; /* check options */ while ((c = getopt(argc, argv, "f")) != -1) { switch (c) { case 'f': force = B_TRUE; break; default: (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* get vdev name */ if (argc < 1) { (void) fprintf(stderr, gettext("missing vdev device name\n")); usage(B_FALSE); } vdev = argv[0]; if ((fd = open(vdev, O_RDWR)) < 0) { (void) fprintf(stderr, gettext("Unable to open %s\n"), vdev); return (B_FALSE); } name = NULL; if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) != 0) { if (force) goto wipe_label; (void) fprintf(stderr, gettext("Unable to determine pool state for %s\n" "Use -f to force the clearing any label data\n"), vdev); return (1); } if (inuse) { switch (state) { default: case POOL_STATE_ACTIVE: case POOL_STATE_SPARE: case POOL_STATE_L2CACHE: (void) fprintf(stderr, gettext("labelclear operation failed.\n" "\tVdev %s is a member (%s), of pool \"%s\".\n" "\tTo remove label information from this device, " "export or destroy\n\tthe pool, or remove %s from " "the configuration of this pool\n\tand retry the " "labelclear operation.\n"), vdev, zpool_pool_state_to_name(state), name, vdev); ret = 1; goto errout; case POOL_STATE_EXPORTED: if (force) break; (void) fprintf(stderr, gettext("labelclear operation failed.\n\tVdev " "%s is a member of the exported pool \"%s\".\n" "\tUse \"zpool labelclear -f %s\" to force the " "removal of label\n\tinformation.\n"), vdev, name, vdev); ret = 1; goto errout; case POOL_STATE_POTENTIALLY_ACTIVE: if (force) break; (void) fprintf(stderr, gettext("labelclear operation failed.\n" "\tVdev %s is a member of the pool \"%s\".\n" "\tThis pool is unknown to this system, but may " "be active on\n\tanother system. Use " "\'zpool labelclear -f %s\' to force the\n" "\tremoval of label information.\n"), vdev, name, vdev); ret = 1; goto errout; case POOL_STATE_DESTROYED: /* inuse should never be set for a destroyed pool... */ break; } } wipe_label: if (zpool_clear_label(fd) != 0) { (void) fprintf(stderr, gettext("Label clear failed on vdev %s\n"), vdev); ret = 1; } errout: close(fd); if (name != NULL) free(name); return (ret); } /* * zpool create [-fnd] [-o property=value] ... * [-O file-system-property=value] ... * [-R root] [-m mountpoint] ... * * -f Force creation, even if devices appear in use * -n Do not create the pool, but display the resulting layout if it * were to be created. * -R Create a pool under an alternate root * -m Set default mountpoint for the root dataset. By default it's * '/' * -o Set property=value. * -o Set feature@feature=enabled|disabled. 
* -d Don't automatically enable all supported pool features * (individual features can be enabled with -o). * -O Set fsproperty=value in the pool's root file system * * Creates the named pool according to the given vdev specification. The * bulk of the vdev processing is done in get_vdev_spec() in zpool_vdev.c. Once * we get the nvlist back from get_vdev_spec(), we either print out the contents * (if '-n' was specified), or pass it to libzfs to do the creation. */ int zpool_do_create(int argc, char **argv) { boolean_t force = B_FALSE; boolean_t dryrun = B_FALSE; boolean_t enable_all_pool_feat = B_TRUE; int c; nvlist_t *nvroot = NULL; char *poolname; char *tname = NULL; int ret = 1; char *altroot = NULL; char *mountpoint = NULL; nvlist_t *fsprops = NULL; nvlist_t *props = NULL; char *propval; /* check options */ while ((c = getopt(argc, argv, ":fndR:m:o:O:t:")) != -1) { switch (c) { case 'f': force = B_TRUE; break; case 'n': dryrun = B_TRUE; break; case 'd': enable_all_pool_feat = B_FALSE; break; case 'R': altroot = optarg; if (add_prop_list(zpool_prop_to_name( ZPOOL_PROP_ALTROOT), optarg, &props, B_TRUE)) goto errout; if (add_prop_list_default(zpool_prop_to_name( ZPOOL_PROP_CACHEFILE), "none", &props, B_TRUE)) goto errout; break; case 'm': /* Equivalent to -O mountpoint=optarg */ mountpoint = optarg; break; case 'o': if ((propval = strchr(optarg, '=')) == NULL) { (void) fprintf(stderr, gettext("missing " "'=' for -o option\n")); goto errout; } *propval = '\0'; propval++; if (add_prop_list(optarg, propval, &props, B_TRUE)) goto errout; /* * If the user is creating a pool that doesn't support * feature flags, don't enable any features. */ if (zpool_name_to_prop(optarg) == ZPOOL_PROP_VERSION) { char *end; u_longlong_t ver; ver = strtoull(propval, &end, 10); if (*end == '\0' && ver < SPA_VERSION_FEATURES) { enable_all_pool_feat = B_FALSE; } } if (zpool_name_to_prop(optarg) == ZPOOL_PROP_ALTROOT) altroot = propval; break; case 'O': if ((propval = strchr(optarg, '=')) == NULL) { (void) fprintf(stderr, gettext("missing " "'=' for -O option\n")); goto errout; } *propval = '\0'; propval++; /* * Mountpoints are checked and then added later. * Uniquely among properties, they can be specified * more than once, to avoid conflict with -m. */ if (0 == strcmp(optarg, zfs_prop_to_name(ZFS_PROP_MOUNTPOINT))) { mountpoint = propval; } else if (add_prop_list(optarg, propval, &fsprops, B_FALSE)) { goto errout; } break; case 't': /* * Sanity check temporary pool name. 
*/ if (strchr(optarg, '/') != NULL) { (void) fprintf(stderr, gettext("cannot create " "'%s': invalid character '/' in temporary " "name\n"), optarg); (void) fprintf(stderr, gettext("use 'zfs " "create' to create a dataset\n")); goto errout; } if (add_prop_list(zpool_prop_to_name( ZPOOL_PROP_TNAME), optarg, &props, B_TRUE)) goto errout; if (add_prop_list_default(zpool_prop_to_name( ZPOOL_PROP_CACHEFILE), "none", &props, B_TRUE)) goto errout; tname = optarg; break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); goto badusage; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); goto badusage; } } argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); goto badusage; } if (argc < 2) { (void) fprintf(stderr, gettext("missing vdev specification\n")); goto badusage; } poolname = argv[0]; /* * As a special case, check for use of '/' in the name, and direct the * user to use 'zfs create' instead. */ if (strchr(poolname, '/') != NULL) { (void) fprintf(stderr, gettext("cannot create '%s': invalid " "character '/' in pool name\n"), poolname); (void) fprintf(stderr, gettext("use 'zfs create' to " "create a dataset\n")); goto errout; } /* pass off to get_vdev_spec for bulk processing */ nvroot = make_root_vdev(NULL, props, force, !force, B_FALSE, dryrun, argc - 1, argv + 1); if (nvroot == NULL) goto errout; /* make_root_vdev() allows 0 toplevel children if there are spares */ if (!zfs_allocatable_devs(nvroot)) { (void) fprintf(stderr, gettext("invalid vdev " "specification: at least one toplevel vdev must be " "specified\n")); goto errout; } if (altroot != NULL && altroot[0] != '/') { (void) fprintf(stderr, gettext("invalid alternate root '%s': " "must be an absolute path\n"), altroot); goto errout; } /* * Check the validity of the mountpoint and direct the user to use the * '-m' mountpoint option if it looks like its in use. */ if (mountpoint == NULL || (strcmp(mountpoint, ZFS_MOUNTPOINT_LEGACY) != 0 && strcmp(mountpoint, ZFS_MOUNTPOINT_NONE) != 0)) { char buf[MAXPATHLEN]; DIR *dirp; if (mountpoint && mountpoint[0] != '/') { (void) fprintf(stderr, gettext("invalid mountpoint " "'%s': must be an absolute path, 'legacy', or " "'none'\n"), mountpoint); goto errout; } if (mountpoint == NULL) { if (altroot != NULL) (void) snprintf(buf, sizeof (buf), "%s/%s", altroot, poolname); else (void) snprintf(buf, sizeof (buf), "/%s", poolname); } else { if (altroot != NULL) (void) snprintf(buf, sizeof (buf), "%s%s", altroot, mountpoint); else (void) snprintf(buf, sizeof (buf), "%s", mountpoint); } if ((dirp = opendir(buf)) == NULL && errno != ENOENT) { (void) fprintf(stderr, gettext("mountpoint '%s' : " "%s\n"), buf, strerror(errno)); (void) fprintf(stderr, gettext("use '-m' " "option to provide a different default\n")); goto errout; } else if (dirp) { int count = 0; while (count < 3 && readdir(dirp) != NULL) count++; (void) closedir(dirp); if (count > 2) { (void) fprintf(stderr, gettext("mountpoint " "'%s' exists and is not empty\n"), buf); (void) fprintf(stderr, gettext("use '-m' " "option to provide a " "different default\n")); goto errout; } } } /* * Now that the mountpoint's validity has been checked, ensure that * the property is set appropriately prior to creating the pool. 
*/ if (mountpoint != NULL) { ret = add_prop_list(zfs_prop_to_name(ZFS_PROP_MOUNTPOINT), mountpoint, &fsprops, B_FALSE); if (ret != 0) goto errout; } ret = 1; if (dryrun) { /* * For a dry run invocation, print out a basic message and run * through all the vdevs in the list and print out in an * appropriate hierarchy. */ (void) printf(gettext("would create '%s' with the " "following layout:\n\n"), poolname); print_vdev_tree(NULL, poolname, nvroot, 0, B_FALSE, 0); if (num_logs(nvroot) > 0) print_vdev_tree(NULL, "logs", nvroot, 0, B_TRUE, 0); ret = 0; } else { /* * Hand off to libzfs. */ spa_feature_t i; for (i = 0; i < SPA_FEATURES; i++) { char propname[MAXPATHLEN]; char *propval; zfeature_info_t *feat = &spa_feature_table[i]; (void) snprintf(propname, sizeof (propname), "feature@%s", feat->fi_uname); /* * Only features contained in props will be enabled: * remove from the nvlist every ZFS_FEATURE_DISABLED * value and add every missing ZFS_FEATURE_ENABLED if * enable_all_pool_feat is set. */ if (!nvlist_lookup_string(props, propname, &propval)) { if (strcmp(propval, ZFS_FEATURE_DISABLED) == 0) (void) nvlist_remove_all(props, propname); } else if (enable_all_pool_feat) { ret = add_prop_list(propname, ZFS_FEATURE_ENABLED, &props, B_TRUE); if (ret != 0) goto errout; } } ret = 1; if (zpool_create(g_zfs, poolname, nvroot, props, fsprops) == 0) { zfs_handle_t *pool = zfs_open(g_zfs, tname ? tname : poolname, ZFS_TYPE_FILESYSTEM); if (pool != NULL) { if (zfs_mount(pool, NULL, 0) == 0) ret = zfs_shareall(pool); zfs_close(pool); } } else if (libzfs_errno(g_zfs) == EZFS_INVALIDNAME) { (void) fprintf(stderr, gettext("pool name may have " "been omitted\n")); } } errout: nvlist_free(nvroot); nvlist_free(fsprops); nvlist_free(props); return (ret); badusage: nvlist_free(fsprops); nvlist_free(props); usage(B_FALSE); return (2); } /* * zpool destroy * * -f Forcefully unmount any datasets * * Destroy the given pool. Automatically unmounts any datasets in the pool. */ int zpool_do_destroy(int argc, char **argv) { boolean_t force = B_FALSE; int c; char *pool; zpool_handle_t *zhp; int ret; /* check options */ while ((c = getopt(argc, argv, "f")) != -1) { switch (c) { case 'f': force = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* check arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool argument\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } pool = argv[0]; if ((zhp = zpool_open_canfail(g_zfs, pool)) == NULL) { /* * As a special case, check for use of '/' in the name, and * direct the user to use 'zfs destroy' instead. 
*/ if (strchr(pool, '/') != NULL) (void) fprintf(stderr, gettext("use 'zfs destroy' to " "destroy a dataset\n")); return (1); } if (zpool_disable_datasets(zhp, force) != 0) { (void) fprintf(stderr, gettext("could not destroy '%s': " "could not unmount datasets\n"), zpool_get_name(zhp)); zpool_close(zhp); return (1); } /* The history must be logged as part of the export */ log_history = B_FALSE; ret = (zpool_destroy(zhp, history_str) != 0); zpool_close(zhp); return (ret); } typedef struct export_cbdata { boolean_t force; boolean_t hardforce; } export_cbdata_t; /* * Export one pool */ int zpool_export_one(zpool_handle_t *zhp, void *data) { export_cbdata_t *cb = data; if (zpool_disable_datasets(zhp, cb->force) != 0) return (1); /* The history must be logged as part of the export */ log_history = B_FALSE; if (cb->hardforce) { if (zpool_export_force(zhp, history_str) != 0) return (1); } else if (zpool_export(zhp, cb->force, history_str) != 0) { return (1); } return (0); } /* * zpool export [-f] ... * * -a Export all pools * -f Forcefully unmount datasets * * Export the given pools. By default, the command will attempt to cleanly * unmount any active datasets within the pool. If the '-f' flag is specified, * then the datasets will be forcefully unmounted. */ int zpool_do_export(int argc, char **argv) { export_cbdata_t cb; boolean_t do_all = B_FALSE; boolean_t force = B_FALSE; boolean_t hardforce = B_FALSE; int c, ret; /* check options */ while ((c = getopt(argc, argv, "afF")) != -1) { switch (c) { case 'a': do_all = B_TRUE; break; case 'f': force = B_TRUE; break; case 'F': hardforce = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } cb.force = force; cb.hardforce = hardforce; argc -= optind; argv += optind; if (do_all) { if (argc != 0) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } return (for_each_pool(argc, argv, B_TRUE, NULL, zpool_export_one, &cb)); } /* check arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool argument\n")); usage(B_FALSE); } ret = for_each_pool(argc, argv, B_TRUE, NULL, zpool_export_one, &cb); return (ret); } /* * Given a vdev configuration, determine the maximum width needed for the device * name column. 
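 * For example, a child vdev printed as "mirror-0" at depth 2 contributes
 * strlen("mirror-0") + 2 == 10 to the computed width.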
*/ static int max_width(zpool_handle_t *zhp, nvlist_t *nv, int depth, int max, int name_flags) { char *name; nvlist_t **child; uint_t c, children; int ret; name = zpool_vdev_name(g_zfs, zhp, nv, name_flags); if (strlen(name) + depth > max) max = strlen(name) + depth; free(name); if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, &child, &children) == 0) { for (c = 0; c < children; c++) if ((ret = max_width(zhp, child[c], depth + 2, max, name_flags)) > max) max = ret; } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, &child, &children) == 0) { for (c = 0; c < children; c++) if ((ret = max_width(zhp, child[c], depth + 2, max, name_flags)) > max) max = ret; } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) == 0) { for (c = 0; c < children; c++) if ((ret = max_width(zhp, child[c], depth + 2, max, name_flags)) > max) max = ret; } return (max); } typedef struct spare_cbdata { uint64_t cb_guid; zpool_handle_t *cb_zhp; } spare_cbdata_t; static boolean_t find_vdev(nvlist_t *nv, uint64_t search) { uint64_t guid; nvlist_t **child; uint_t c, children; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) == 0 && search == guid) return (B_TRUE); if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) == 0) { for (c = 0; c < children; c++) if (find_vdev(child[c], search)) return (B_TRUE); } return (B_FALSE); } static int find_spare(zpool_handle_t *zhp, void *data) { spare_cbdata_t *cbp = data; nvlist_t *config, *nvroot; config = zpool_get_config(zhp, NULL); verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); if (find_vdev(nvroot, cbp->cb_guid)) { cbp->cb_zhp = zhp; return (1); } zpool_close(zhp); return (0); } typedef struct status_cbdata { int cb_count; int cb_name_flags; int cb_namewidth; boolean_t cb_allpools; boolean_t cb_verbose; boolean_t cb_explain; boolean_t cb_first; boolean_t cb_dedup_stats; boolean_t cb_print_status; vdev_cmd_data_list_t *vcdl; } status_cbdata_t; /* Print output line for specific vdev in a specific pool */ static void zpool_print_cmd(vdev_cmd_data_list_t *vcdl, const char *pool, char *path) { int i; for (i = 0; i < vcdl->count; i++) { if ((strcmp(vcdl->data[i].path, path) == 0) && (strcmp(vcdl->data[i].pool, pool) == 0)) { printf("%s", vcdl->data[i].line); break; } } } /* * Print out configuration state as requested by status_callback. */ static void print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, nvlist_t *nv, int depth, boolean_t isspare) { nvlist_t **child; uint_t c, children; pool_scan_stat_t *ps = NULL; vdev_stat_t *vs; char rbuf[6], wbuf[6], cbuf[6]; char *vname; uint64_t notpresent; spare_cbdata_t spare_cb; char *state; char *path = NULL; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) children = 0; verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &c) == 0); state = zpool_state_to_name(vs->vs_state, vs->vs_aux); if (isspare) { /* * For hot spares, we use the terms 'INUSE' and 'AVAILABLE' for * online drives. 
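 *
 * For example, an unclaimed hot spare is shown with state "AVAIL" and,
 * like all spares, without the read/write/checksum error counters.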
 */
		if (vs->vs_aux == VDEV_AUX_SPARED)
			state = "INUSE";
		else if (vs->vs_state == VDEV_STATE_HEALTHY)
			state = "AVAIL";
	}

	(void) printf("\t%*s%-*s %-8s", depth, "", cb->cb_namewidth - depth,
	    name, state);

	if (!isspare) {
		zfs_nicenum(vs->vs_read_errors, rbuf, sizeof (rbuf));
		zfs_nicenum(vs->vs_write_errors, wbuf, sizeof (wbuf));
		zfs_nicenum(vs->vs_checksum_errors, cbuf, sizeof (cbuf));
		(void) printf(" %5s %5s %5s", rbuf, wbuf, cbuf);
	}

	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
	    &notpresent) == 0) {
		verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH,
		    &path) == 0);
		(void) printf(" was %s", path);
	} else if (vs->vs_aux != 0) {
		(void) printf(" ");

		switch (vs->vs_aux) {
		case VDEV_AUX_OPEN_FAILED:
			(void) printf(gettext("cannot open"));
			break;

		case VDEV_AUX_BAD_GUID_SUM:
			(void) printf(gettext("missing device"));
			break;

		case VDEV_AUX_NO_REPLICAS:
			(void) printf(gettext("insufficient replicas"));
			break;

		case VDEV_AUX_VERSION_NEWER:
			(void) printf(gettext("newer version"));
			break;

		case VDEV_AUX_UNSUP_FEAT:
			(void) printf(gettext("unsupported feature(s)"));
			break;

		case VDEV_AUX_SPARED:
			verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID,
			    &spare_cb.cb_guid) == 0);
			if (zpool_iter(g_zfs, find_spare, &spare_cb) == 1) {
				if (strcmp(zpool_get_name(spare_cb.cb_zhp),
				    zpool_get_name(zhp)) == 0)
					(void) printf(gettext("currently in "
					    "use"));
				else
					(void) printf(gettext("in use by "
					    "pool '%s'"),
					    zpool_get_name(spare_cb.cb_zhp));
				zpool_close(spare_cb.cb_zhp);
			} else {
				(void) printf(gettext("currently in use"));
			}
			break;

		case VDEV_AUX_ERR_EXCEEDED:
			(void) printf(gettext("too many errors"));
			break;

		case VDEV_AUX_IO_FAILURE:
			(void) printf(gettext("experienced I/O failures"));
			break;

		case VDEV_AUX_BAD_LOG:
			(void) printf(gettext("bad intent log"));
			break;

		case VDEV_AUX_EXTERNAL:
			(void) printf(gettext("external device fault"));
			break;

		case VDEV_AUX_SPLIT_POOL:
			(void) printf(gettext("split into new pool"));
			break;

		default:
			(void) printf(gettext("corrupted data"));
			break;
		}
	}

	(void) nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_SCAN_STATS,
	    (uint64_t **)&ps, &c);

	if (ps && ps->pss_state == DSS_SCANNING &&
	    vs->vs_scan_processed != 0 && children == 0) {
		(void) printf(gettext(" (%s)"),
		    (ps->pss_func == POOL_SCAN_RESILVER) ? "resilvering" :
		    "repairing");
	}

	if (cb->vcdl != NULL) {
		if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0) {
			printf(" ");
			zpool_print_cmd(cb->vcdl, zpool_get_name(zhp), path);
		}
	}

	(void) printf("\n");

	for (c = 0; c < children; c++) {
		uint64_t islog = B_FALSE, ishole = B_FALSE;

		/* Don't print logs or holes here */
		(void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
		    &islog);
		(void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE,
		    &ishole);
		if (islog || ishole)
			continue;
		vname = zpool_vdev_name(g_zfs, zhp, child[c],
		    cb->cb_name_flags | VDEV_NAME_TYPE_ID);
		print_status_config(zhp, cb, vname, child[c], depth + 2,
		    isspare);
		free(vname);
	}
}

/*
 * Print the configuration of an exported pool. Iterate over all vdevs in the
 * pool, printing out the name and status for each one.
*/ static void print_import_config(status_cbdata_t *cb, const char *name, nvlist_t *nv, int depth) { nvlist_t **child; uint_t c, children; vdev_stat_t *vs; char *type, *vname; verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); if (strcmp(type, VDEV_TYPE_MISSING) == 0 || strcmp(type, VDEV_TYPE_HOLE) == 0) return; verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &c) == 0); (void) printf("\t%*s%-*s", depth, "", cb->cb_namewidth - depth, name); (void) printf(" %s", zpool_state_to_name(vs->vs_state, vs->vs_aux)); if (vs->vs_aux != 0) { (void) printf(" "); switch (vs->vs_aux) { case VDEV_AUX_OPEN_FAILED: (void) printf(gettext("cannot open")); break; case VDEV_AUX_BAD_GUID_SUM: (void) printf(gettext("missing device")); break; case VDEV_AUX_NO_REPLICAS: (void) printf(gettext("insufficient replicas")); break; case VDEV_AUX_VERSION_NEWER: (void) printf(gettext("newer version")); break; case VDEV_AUX_UNSUP_FEAT: (void) printf(gettext("unsupported feature(s)")); break; case VDEV_AUX_ERR_EXCEEDED: (void) printf(gettext("too many errors")); break; default: (void) printf(gettext("corrupted data")); break; } } (void) printf("\n"); if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) return; for (c = 0; c < children; c++) { uint64_t is_log = B_FALSE; (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &is_log); if (is_log) continue; vname = zpool_vdev_name(g_zfs, NULL, child[c], cb->cb_name_flags | VDEV_NAME_TYPE_ID); print_import_config(cb, vname, child[c], depth + 2); free(vname); } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, &child, &children) == 0) { (void) printf(gettext("\tcache\n")); for (c = 0; c < children; c++) { vname = zpool_vdev_name(g_zfs, NULL, child[c], cb->cb_name_flags); (void) printf("\t %s\n", vname); free(vname); } } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, &child, &children) == 0) { (void) printf(gettext("\tspares\n")); for (c = 0; c < children; c++) { vname = zpool_vdev_name(g_zfs, NULL, child[c], cb->cb_name_flags); (void) printf("\t %s\n", vname); free(vname); } } } /* * Print log vdevs. * Logs are recorded as top level vdevs in the main pool child array * but with "is_log" set to 1. We use either print_status_config() or * print_import_config() to print the top level logs then any log * children (eg mirrored slogs) are printed recursively - which * works because only the top level vdev is marked "is_log" */ static void print_logs(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t *nv) { uint_t c, children; nvlist_t **child; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) return; (void) printf(gettext("\tlogs\n")); for (c = 0; c < children; c++) { uint64_t is_log = B_FALSE; char *name; (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &is_log); if (!is_log) continue; name = zpool_vdev_name(g_zfs, zhp, child[c], cb->cb_name_flags | VDEV_NAME_TYPE_ID); if (cb->cb_print_status) print_status_config(zhp, cb, name, child[c], 2, B_FALSE); else print_import_config(cb, name, child[c], 2); free(name); } } /* * Display the status for the given pool. 
*/ static void show_import(nvlist_t *config) { uint64_t pool_state; vdev_stat_t *vs; char *name; uint64_t guid; char *msgid; nvlist_t *nvroot; zpool_status_t reason; zpool_errata_t errata; const char *health; uint_t vsc; char *comment; status_cbdata_t cb = { 0 }; verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, &name) == 0); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) == 0); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &pool_state) == 0); verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) == 0); health = zpool_state_to_name(vs->vs_state, vs->vs_aux); reason = zpool_import_status(config, &msgid, &errata); (void) printf(gettext(" pool: %s\n"), name); (void) printf(gettext(" id: %llu\n"), (u_longlong_t)guid); (void) printf(gettext(" state: %s"), health); if (pool_state == POOL_STATE_DESTROYED) (void) printf(gettext(" (DESTROYED)")); (void) printf("\n"); switch (reason) { case ZPOOL_STATUS_MISSING_DEV_R: case ZPOOL_STATUS_MISSING_DEV_NR: case ZPOOL_STATUS_BAD_GUID_SUM: (void) printf(gettext(" status: One or more devices are " "missing from the system.\n")); break; case ZPOOL_STATUS_CORRUPT_LABEL_R: case ZPOOL_STATUS_CORRUPT_LABEL_NR: (void) printf(gettext(" status: One or more devices contains " "corrupted data.\n")); break; case ZPOOL_STATUS_CORRUPT_DATA: (void) printf( gettext(" status: The pool data is corrupted.\n")); break; case ZPOOL_STATUS_OFFLINE_DEV: (void) printf(gettext(" status: One or more devices " "are offlined.\n")); break; case ZPOOL_STATUS_CORRUPT_POOL: (void) printf(gettext(" status: The pool metadata is " "corrupted.\n")); break; case ZPOOL_STATUS_VERSION_OLDER: (void) printf(gettext(" status: The pool is formatted using a " "legacy on-disk version.\n")); break; case ZPOOL_STATUS_VERSION_NEWER: (void) printf(gettext(" status: The pool is formatted using an " "incompatible version.\n")); break; case ZPOOL_STATUS_FEAT_DISABLED: (void) printf(gettext(" status: Some supported features are " "not enabled on the pool.\n")); break; case ZPOOL_STATUS_UNSUP_FEAT_READ: (void) printf(gettext("status: The pool uses the following " "feature(s) not supported on this system:\n")); zpool_print_unsup_feat(config); break; case ZPOOL_STATUS_UNSUP_FEAT_WRITE: (void) printf(gettext("status: The pool can only be accessed " "in read-only mode on this system. It\n\tcannot be " "accessed in read-write mode because it uses the " "following\n\tfeature(s) not supported on this system:\n")); zpool_print_unsup_feat(config); break; case ZPOOL_STATUS_HOSTID_MISMATCH: (void) printf(gettext(" status: The pool was last accessed by " "another system.\n")); break; case ZPOOL_STATUS_FAULTED_DEV_R: case ZPOOL_STATUS_FAULTED_DEV_NR: (void) printf(gettext(" status: One or more devices are " "faulted.\n")); break; case ZPOOL_STATUS_BAD_LOG: (void) printf(gettext(" status: An intent log record cannot be " "read.\n")); break; case ZPOOL_STATUS_RESILVERING: (void) printf(gettext(" status: One or more devices were being " "resilvered.\n")); break; case ZPOOL_STATUS_ERRATA: (void) printf(gettext(" status: Errata #%d detected.\n"), errata); break; default: /* * No other status can be seen when importing pools. */ assert(reason == ZPOOL_STATUS_OK); } /* * Print out an action according to the overall state of the pool. 
*/ if (vs->vs_state == VDEV_STATE_HEALTHY) { if (reason == ZPOOL_STATUS_VERSION_OLDER || reason == ZPOOL_STATUS_FEAT_DISABLED) { (void) printf(gettext(" action: The pool can be " "imported using its name or numeric identifier, " "though\n\tsome features will not be available " "without an explicit 'zpool upgrade'.\n")); } else if (reason == ZPOOL_STATUS_HOSTID_MISMATCH) { (void) printf(gettext(" action: The pool can be " "imported using its name or numeric " "identifier and\n\tthe '-f' flag.\n")); } else if (reason == ZPOOL_STATUS_ERRATA) { switch (errata) { case ZPOOL_ERRATA_NONE: break; case ZPOOL_ERRATA_ZOL_2094_SCRUB: (void) printf(gettext(" action: The pool can " "be imported using its name or numeric " "identifier,\n\thowever there is a compat" "ibility issue which should be corrected" "\n\tby running 'zpool scrub'\n")); break; case ZPOOL_ERRATA_ZOL_2094_ASYNC_DESTROY: (void) printf(gettext(" action: The pool can" "not be imported with this version of ZFS " "due to\n\tan active asynchronous destroy. " "Revert to an earlier version\n\tand " "allow the destroy to complete before " "updating.\n")); break; default: /* * All errata must contain an action message. */ assert(0); } } else { (void) printf(gettext(" action: The pool can be " "imported using its name or numeric " "identifier.\n")); } } else if (vs->vs_state == VDEV_STATE_DEGRADED) { (void) printf(gettext(" action: The pool can be imported " "despite missing or damaged devices. The\n\tfault " "tolerance of the pool may be compromised if imported.\n")); } else { switch (reason) { case ZPOOL_STATUS_VERSION_NEWER: (void) printf(gettext(" action: The pool cannot be " "imported. Access the pool on a system running " "newer\n\tsoftware, or recreate the pool from " "backup.\n")); break; case ZPOOL_STATUS_UNSUP_FEAT_READ: (void) printf(gettext("action: The pool cannot be " "imported. Access the pool on a system that " "supports\n\tthe required feature(s), or recreate " "the pool from backup.\n")); break; case ZPOOL_STATUS_UNSUP_FEAT_WRITE: (void) printf(gettext("action: The pool cannot be " "imported in read-write mode. Import the pool " "with\n" "\t\"-o readonly=on\", access the pool on a system " "that supports the\n\trequired feature(s), or " "recreate the pool from backup.\n")); break; case ZPOOL_STATUS_MISSING_DEV_R: case ZPOOL_STATUS_MISSING_DEV_NR: case ZPOOL_STATUS_BAD_GUID_SUM: (void) printf(gettext(" action: The pool cannot be " "imported. Attach the missing\n\tdevices and try " "again.\n")); break; default: (void) printf(gettext(" action: The pool cannot be " "imported due to damaged devices or data.\n")); } } /* Print the comment attached to the pool. 
 */
	if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0)
		(void) printf(gettext("comment: %s\n"), comment);

	/*
	 * If the state is "closed" or "can't open", and the aux state
	 * is "corrupt data":
	 */
	if (((vs->vs_state == VDEV_STATE_CLOSED) ||
	    (vs->vs_state == VDEV_STATE_CANT_OPEN)) &&
	    (vs->vs_aux == VDEV_AUX_CORRUPT_DATA)) {
		if (pool_state == POOL_STATE_DESTROYED)
			(void) printf(gettext("\tThe pool was destroyed, "
			    "but can be imported using the '-Df' flags.\n"));
		else if (pool_state != POOL_STATE_EXPORTED)
			(void) printf(gettext("\tThe pool may be active on "
			    "another system, but can be imported using\n\t"
			    "the '-f' flag.\n"));
	}

	if (msgid != NULL)
		(void) printf(gettext(" see: http://zfsonlinux.org/msg/%s\n"),
		    msgid);

	(void) printf(gettext(" config:\n\n"));

	cb.cb_namewidth = max_width(NULL, nvroot, 0, 0, VDEV_NAME_TYPE_ID);
	if (cb.cb_namewidth < 10)
		cb.cb_namewidth = 10;

	print_import_config(&cb, name, nvroot, 0);
	if (num_logs(nvroot) > 0)
		print_logs(NULL, &cb, nvroot);

	if (reason == ZPOOL_STATUS_BAD_GUID_SUM) {
		(void) printf(gettext("\n\tAdditional devices are known to "
		    "be part of this pool, though their\n\texact "
		    "configuration cannot be determined.\n"));
	}
}

/*
 * Perform the import for the given configuration. This passes the heavy
 * lifting off to zpool_import_props(), and then mounts the datasets contained
 * within the pool.
 */
static int
do_import(nvlist_t *config, const char *newname, const char *mntopts,
    nvlist_t *props, int flags)
{
	zpool_handle_t *zhp;
	char *name;
	uint64_t state;
	uint64_t version;

	verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
	    &name) == 0);
	verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
	    &state) == 0);
	verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
	    &version) == 0);

	if (!SPA_VERSION_IS_SUPPORTED(version)) {
		(void) fprintf(stderr, gettext("cannot import '%s': pool "
		    "is formatted using an unsupported ZFS version\n"), name);
		return (1);
	} else if (state != POOL_STATE_EXPORTED &&
	    !(flags & ZFS_IMPORT_ANY_HOST)) {
		uint64_t hostid = 0;
		unsigned long system_hostid = get_system_hostid();

		(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_HOSTID,
		    &hostid);

		if (hostid != 0 && (unsigned long)hostid != system_hostid) {
			char *hostname;
			uint64_t timestamp;
			time_t t;

			verify(nvlist_lookup_string(config,
			    ZPOOL_CONFIG_HOSTNAME, &hostname) == 0);
			verify(nvlist_lookup_uint64(config,
			    ZPOOL_CONFIG_TIMESTAMP, &timestamp) == 0);
			t = timestamp;
			(void) fprintf(stderr, gettext("cannot import "
			    "'%s': pool may be in use from other "
			    "system, it was last accessed by %s "
			    "(hostid: 0x%lx) on %s"), name, hostname,
			    (unsigned long)hostid, asctime(localtime(&t)));
			(void) fprintf(stderr, gettext("use '-f' to "
			    "import anyway\n"));
			return (1);
		}
	}

	if (zpool_import_props(g_zfs, config, newname, props, flags) != 0)
		return (1);

	if (newname != NULL)
		name = (char *)newname;

	if ((zhp = zpool_open_canfail(g_zfs, name)) == NULL)
		return (1);

	if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL &&
	    !(flags & ZFS_IMPORT_ONLY) &&
	    zpool_enable_datasets(zhp, mntopts, 0) != 0) {
		zpool_close(zhp);
		return (1);
	}

	zpool_close(zhp);
	return (0);
}

/*
 * zpool import [-d dir] [-D]
 *       import [-o mntopts] [-o prop=value] ... [-R root] [-D]
 *              [-d dir | -c cachefile] [-f] -a
 *       import [-o mntopts] [-o prop=value] ... [-R root] [-D]
 *              [-d dir | -c cachefile] [-f] [-n] [-F] [newpool]
 *
 *	-c	Read pool information from a cachefile instead of searching
 *		devices.
 *
 *	-d	Scan in a specific directory, other than /dev/. More than
 *		one directory can be specified using multiple '-d' options.
* * -D Scan for previously destroyed pools or import all or only * specified destroyed pools. * * -R Temporarily import the pool, with all mountpoints relative to * the given root. The pool will remain exported when the machine * is rebooted. * * -V Import even in the presence of faulted vdevs. This is an * intentionally undocumented option for testing purposes, and * treats the pool configuration as complete, leaving any bad * vdevs in the FAULTED state. In other words, it does verbatim * import. * * -f Force import, even if it appears that the pool is active. * * -F Attempt rewind if necessary. * * -n See if rewind would work, but don't actually rewind. * * -N Import the pool but don't mount datasets. * * -T Specify a starting txg to use for import. This option is * intentionally undocumented option for testing purposes. * * -a Import all pools found. * * -o Set property=value and/or temporary mount options (without '='). * * -s Scan using the default search path, the libblkid cache will * not be consulted. * * The import command scans for pools to import, and import pools based on pool * name and GUID. The pool can also be renamed as part of the import process. */ int zpool_do_import(int argc, char **argv) { char **searchdirs = NULL; char *env, *envdup = NULL; int nsearch = 0; int c; int err = 0; nvlist_t *pools = NULL; boolean_t do_all = B_FALSE; boolean_t do_destroyed = B_FALSE; char *mntopts = NULL; nvpair_t *elem; nvlist_t *config; uint64_t searchguid = 0; char *searchname = NULL; char *propval; nvlist_t *found_config; nvlist_t *policy = NULL; nvlist_t *props = NULL; boolean_t first; int flags = ZFS_IMPORT_NORMAL; uint32_t rewind_policy = ZPOOL_NO_REWIND; boolean_t dryrun = B_FALSE; boolean_t do_rewind = B_FALSE; boolean_t xtreme_rewind = B_FALSE; boolean_t do_scan = B_FALSE; uint64_t pool_state, txg = -1ULL; char *cachefile = NULL; importargs_t idata = { 0 }; char *endptr; /* check options */ while ((c = getopt(argc, argv, ":aCc:d:DEfFmnNo:R:stT:VX")) != -1) { switch (c) { case 'a': do_all = B_TRUE; break; case 'c': cachefile = optarg; break; case 'd': if (searchdirs == NULL) { searchdirs = safe_malloc(sizeof (char *)); } else { char **tmp = safe_malloc((nsearch + 1) * sizeof (char *)); bcopy(searchdirs, tmp, nsearch * sizeof (char *)); free(searchdirs); searchdirs = tmp; } searchdirs[nsearch++] = optarg; break; case 'D': do_destroyed = B_TRUE; break; case 'f': flags |= ZFS_IMPORT_ANY_HOST; break; case 'F': do_rewind = B_TRUE; break; case 'm': flags |= ZFS_IMPORT_MISSING_LOG; break; case 'n': dryrun = B_TRUE; break; case 'N': flags |= ZFS_IMPORT_ONLY; break; case 'o': if ((propval = strchr(optarg, '=')) != NULL) { *propval = '\0'; propval++; if (add_prop_list(optarg, propval, &props, B_TRUE)) goto error; } else { mntopts = optarg; } break; case 'R': if (add_prop_list(zpool_prop_to_name( ZPOOL_PROP_ALTROOT), optarg, &props, B_TRUE)) goto error; if (add_prop_list_default(zpool_prop_to_name( ZPOOL_PROP_CACHEFILE), "none", &props, B_TRUE)) goto error; break; case 's': do_scan = B_TRUE; break; case 't': flags |= ZFS_IMPORT_TEMP_NAME; if (add_prop_list_default(zpool_prop_to_name( ZPOOL_PROP_CACHEFILE), "none", &props, B_TRUE)) goto error; break; case 'T': errno = 0; txg = strtoull(optarg, &endptr, 0); if (errno != 0 || *endptr != '\0') { (void) fprintf(stderr, gettext("invalid txg value\n")); usage(B_FALSE); } rewind_policy = ZPOOL_DO_REWIND | ZPOOL_EXTREME_REWIND; break; case 'V': flags |= ZFS_IMPORT_VERBATIM; break; case 'X': xtreme_rewind = B_TRUE; break; case ':': (void) 
fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (cachefile && nsearch != 0) { (void) fprintf(stderr, gettext("-c is incompatible with -d\n")); usage(B_FALSE); } if ((dryrun || xtreme_rewind) && !do_rewind) { (void) fprintf(stderr, gettext("-n or -X only meaningful with -F\n")); usage(B_FALSE); } if (dryrun) rewind_policy = ZPOOL_TRY_REWIND; else if (do_rewind) rewind_policy = ZPOOL_DO_REWIND; if (xtreme_rewind) rewind_policy |= ZPOOL_EXTREME_REWIND; /* In the future, we can capture further policy and include it here */ if (nvlist_alloc(&policy, NV_UNIQUE_NAME, 0) != 0 || nvlist_add_uint64(policy, ZPOOL_REWIND_REQUEST_TXG, txg) != 0 || nvlist_add_uint32(policy, ZPOOL_REWIND_REQUEST, rewind_policy) != 0) goto error; /* check argument count */ if (do_all) { if (argc != 0) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } } else { if (argc > 2) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } } /* * Check for the effective uid. We do this explicitly here because * otherwise any attempt to discover pools will silently fail. */ if (argc == 0 && geteuid() != 0) { (void) fprintf(stderr, gettext("cannot " "discover pools: permission denied\n")); if (searchdirs != NULL) free(searchdirs); nvlist_free(props); nvlist_free(policy); return (1); } /* * Depending on the arguments given, we do one of the following: * * Iterate through all pools and display information about * each one. * * -a Iterate through all pools and try to import each one. * * Find the pool that corresponds to the given GUID/pool * name and import that one. * * -D Above options applies only to destroyed pools. */ if (argc != 0) { char *endptr; errno = 0; searchguid = strtoull(argv[0], &endptr, 10); if (errno != 0 || *endptr != '\0') { searchname = argv[0]; searchguid = 0; } found_config = NULL; /* * User specified a name or guid. Ensure it's unique. */ idata.unique = B_TRUE; } /* * Check the environment for the preferred search path. */ if ((searchdirs == NULL) && (env = getenv("ZPOOL_IMPORT_PATH"))) { char *dir; envdup = strdup(env); dir = strtok(envdup, ":"); while (dir != NULL) { if (searchdirs == NULL) { searchdirs = safe_malloc(sizeof (char *)); } else { char **tmp = safe_malloc((nsearch + 1) * sizeof (char *)); bcopy(searchdirs, tmp, nsearch * sizeof (char *)); free(searchdirs); searchdirs = tmp; } searchdirs[nsearch++] = dir; dir = strtok(NULL, ":"); } } idata.path = searchdirs; idata.paths = nsearch; idata.poolname = searchname; idata.guid = searchguid; idata.cachefile = cachefile; idata.scan = do_scan; /* * Under Linux the zpool_find_import_impl() function leverages the * taskq implementation to parallelize device scanning. It is * therefore necessary to initialize this functionality for the * duration of the zpool_search_import() function. 
*/ thread_init(); pools = zpool_search_import(g_zfs, &idata); thread_fini(); if (pools != NULL && idata.exists && (argc == 1 || strcmp(argv[0], argv[1]) == 0)) { (void) fprintf(stderr, gettext("cannot import '%s': " "a pool with that name already exists\n"), argv[0]); (void) fprintf(stderr, gettext("use the form '%s " "<pool | id> <newpool>' to give it a new name\n"), "zpool import"); err = 1; } else if (pools == NULL && idata.exists) { (void) fprintf(stderr, gettext("cannot import '%s': " "a pool with that name is already created/imported,\n"), argv[0]); (void) fprintf(stderr, gettext("and no additional pools " "with that name were found\n")); err = 1; } else if (pools == NULL) { if (argc != 0) { (void) fprintf(stderr, gettext("cannot import '%s': " "no such pool available\n"), argv[0]); } err = 1; } if (err == 1) { if (searchdirs != NULL) free(searchdirs); if (envdup != NULL) free(envdup); nvlist_free(policy); nvlist_free(pools); nvlist_free(props); return (1); } /* * At this point we have a list of import candidate configs. Even if * we were searching by pool name or guid, we still need to * post-process the list to deal with pool state and possible * duplicate names. */ err = 0; elem = NULL; first = B_TRUE; while ((elem = nvlist_next_nvpair(pools, elem)) != NULL) { verify(nvpair_value_nvlist(elem, &config) == 0); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &pool_state) == 0); if (!do_destroyed && pool_state == POOL_STATE_DESTROYED) continue; if (do_destroyed && pool_state != POOL_STATE_DESTROYED) continue; verify(nvlist_add_nvlist(config, ZPOOL_REWIND_POLICY, policy) == 0); if (argc == 0) { if (first) first = B_FALSE; else if (!do_all) (void) printf("\n"); if (do_all) { err |= do_import(config, NULL, mntopts, props, flags); } else { show_import(config); } } else if (searchname != NULL) { char *name; /* * We are searching for a pool based on name. */ verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, &name) == 0); if (strcmp(name, searchname) == 0) { if (found_config != NULL) { (void) fprintf(stderr, gettext( "cannot import '%s': more than " "one matching pool\n"), searchname); (void) fprintf(stderr, gettext( "import by numeric ID instead\n")); err = B_TRUE; } found_config = config; } } else { uint64_t guid; /* * Search for a pool by guid. */ verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) == 0); if (guid == searchguid) found_config = config; } } /* * If we were searching for a specific pool, verify that we found a * pool, and then do the import. */ if (argc != 0 && err == 0) { if (found_config == NULL) { (void) fprintf(stderr, gettext("cannot import '%s': " "no such pool available\n"), argv[0]); err = B_TRUE; } else { err |= do_import(found_config, argc == 1 ? NULL : argv[1], mntopts, props, flags); } } /* * If we were just looking for pools, report an error if none were * found. */ if (argc == 0 && first) (void) fprintf(stderr, gettext("no pools available to import\n")); error: nvlist_free(props); nvlist_free(pools); nvlist_free(policy); if (searchdirs != NULL) free(searchdirs); if (envdup != NULL) free(envdup); return (err ?
1 : 0); } typedef struct iostat_cbdata { uint64_t cb_flags; int cb_name_flags; int cb_namewidth; int cb_iteration; char **cb_vdev_names; /* Only show these vdevs */ unsigned int cb_vdev_names_count; boolean_t cb_verbose; boolean_t cb_literal; boolean_t cb_scripted; zpool_list_t *cb_list; vdev_cmd_data_list_t *vcdl; } iostat_cbdata_t; /* iostat labels */ typedef struct name_and_columns { const char *name; /* Column name */ unsigned int columns; /* Center name to this number of columns */ } name_and_columns_t; #define IOSTAT_MAX_LABELS 11 /* Max number of labels on one line */ static const name_and_columns_t iostat_top_labels[][IOSTAT_MAX_LABELS] = { [IOS_DEFAULT] = {{"capacity", 2}, {"operations", 2}, {"bandwidth", 2}, {NULL}}, [IOS_LATENCY] = {{"total_wait", 2}, {"disk_wait", 2}, {"syncq_wait", 2}, {"asyncq_wait", 2}, {"scrub"}}, [IOS_QUEUES] = {{"syncq_read", 2}, {"syncq_write", 2}, {"asyncq_read", 2}, {"asyncq_write", 2}, {"scrubq_read", 2}, {NULL}}, [IOS_L_HISTO] = {{"total_wait", 2}, {"disk_wait", 2}, {"sync_queue", 2}, {"async_queue", 2}, {NULL}}, [IOS_RQ_HISTO] = {{"sync_read", 2}, {"sync_write", 2}, {"async_read", 2}, {"async_write", 2}, {"scrub", 2}, {NULL}}, }; /* Shorthand - if "columns" field not set, default to 1 column */ static const name_and_columns_t iostat_bottom_labels[][IOSTAT_MAX_LABELS] = { [IOS_DEFAULT] = {{"alloc"}, {"free"}, {"read"}, {"write"}, {"read"}, {"write"}, {NULL}}, [IOS_LATENCY] = {{"read"}, {"write"}, {"read"}, {"write"}, {"read"}, {"write"}, {"read"}, {"write"}, {"wait"}, {NULL}}, [IOS_QUEUES] = {{"pend"}, {"activ"}, {"pend"}, {"activ"}, {"pend"}, {"activ"}, {"pend"}, {"activ"}, {"pend"}, {"activ"}, {NULL}}, [IOS_L_HISTO] = {{"read"}, {"write"}, {"read"}, {"write"}, {"read"}, {"write"}, {"read"}, {"write"}, {"scrub"}, {NULL}}, [IOS_RQ_HISTO] = {{"ind"}, {"agg"}, {"ind"}, {"agg"}, {"ind"}, {"agg"}, {"ind"}, {"agg"}, {"ind"}, {"agg"}, {NULL}}, }; static const char *histo_to_title[] = { [IOS_L_HISTO] = "latency", [IOS_RQ_HISTO] = "req_size", }; /* * Return the number of labels in a null-terminated name_and_columns_t * array. * */ static unsigned int label_array_len(const name_and_columns_t *labels) { int i = 0; while (labels[i].name) i++; return (i); } /* * Return the number of strings in a null-terminated string array. * For example: * * const char foo[] = {"bar", "baz", NULL} * * returns 2 */ static uint64_t str_array_len(const char *array[]) { uint64_t i = 0; while (array[i]) i++; return (i); } /* * Return a default column width for default/latency/queue columns. This does * not include histograms, which have their columns autosized. */ static unsigned int default_column_width(iostat_cbdata_t *cb, enum iostat_type type) { unsigned long column_width = 5; /* Normal niceprint */ static unsigned long widths[] = { /* * Choose some sane default column sizes for printing the * raw numbers. */ [IOS_DEFAULT] = 15, /* 1PB capacity */ [IOS_LATENCY] = 10, /* 1B ns = 10sec */ [IOS_QUEUES] = 6, /* 1M queue entries */ }; if (cb->cb_literal) column_width = widths[type]; return (column_width); } /* * Print the column labels, i.e: * * capacity operations bandwidth * alloc free read write read write ... * * If force_column_width is set, use it for the column width. If not set, use * the default column width. 
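 *
 * Worked example (illustrative, not from the original comment): a
 * two-column group with column_width 5 spans (5 * 2) + 2 = 12
 * characters, so centering the 10-character label "operations" gives
 * text_start = 12/2 - 10/2 = 1, i.e. the label begins one space into
 * the group.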
*/ void print_iostat_labels(iostat_cbdata_t *cb, unsigned int force_column_width, const name_and_columns_t labels[][IOSTAT_MAX_LABELS]) { int i, idx, s; unsigned int text_start, rw_column_width, spaces_to_end; uint64_t flags = cb->cb_flags; uint64_t f; unsigned int column_width = force_column_width; /* For each bit set in flags */ for (f = flags; f; f &= ~(1ULL << idx)) { idx = lowbit64(f) - 1; if (!force_column_width) column_width = default_column_width(cb, idx); /* Print our top labels centered over "read write" label. */ for (i = 0; i < label_array_len(labels[idx]); i++) { const char *name = labels[idx][i].name; /* * We treat labels[][].columns == 0 as shorthand * for one column. It makes writing out the label * tables more concise. */ unsigned int columns = MAX(1, labels[idx][i].columns); unsigned int slen = strlen(name); rw_column_width = (column_width * columns) + (2 * (columns - 1)); text_start = (int)((rw_column_width)/columns - slen/columns); printf(" "); /* Two spaces between columns */ /* Space from beginning of column to label */ for (s = 0; s < text_start; s++) printf(" "); printf("%s", name); /* Print space after label to end of column */ spaces_to_end = rw_column_width - text_start - slen; for (s = 0; s < spaces_to_end; s++) printf(" "); } } printf("\n"); } /* * Utility function to print out a line of dashes like: * * -------------------------------- ----- ----- ----- ----- ----- * * ...or a dashed named-row line like: * * logs - - - - - * * @cb: iostat data * * @force_column_width If non-zero, use the value as the column width. * Otherwise use the default column widths. * * @name: Print a dashed named-row line starting * with @name. Otherwise, print a regular * dashed line. */ static void print_iostat_dashes(iostat_cbdata_t *cb, unsigned int force_column_width, const char *name) { int i; unsigned int namewidth; uint64_t flags = cb->cb_flags; uint64_t f; int idx; const name_and_columns_t *labels; const char *title; if (cb->cb_flags & IOS_ANYHISTO_M) { title = histo_to_title[IOS_HISTO_IDX(cb->cb_flags)]; } else if (cb->cb_vdev_names_count) { title = "vdev"; } else { title = "pool"; } namewidth = MAX(MAX(strlen(title), cb->cb_namewidth), name ? strlen(name) : 0); if (name) { printf("%-*s", namewidth, name); } else { for (i = 0; i < namewidth; i++) (void) printf("-"); } /* For each bit in flags */ for (f = flags; f; f &= ~(1ULL << idx)) { unsigned int column_width; idx = lowbit64(f) - 1; if (force_column_width) column_width = force_column_width; else column_width = default_column_width(cb, idx); labels = iostat_bottom_labels[idx]; for (i = 0; i < label_array_len(labels); i++) { if (name) printf(" %*s-", column_width - 1, " "); else printf(" %.*s", column_width, "--------------------"); } } printf("\n"); } static void print_iostat_separator_impl(iostat_cbdata_t *cb, unsigned int force_column_width) { print_iostat_dashes(cb, force_column_width, NULL); } static void print_iostat_separator(iostat_cbdata_t *cb) { print_iostat_separator_impl(cb, 0); } static void print_iostat_header_impl(iostat_cbdata_t *cb, unsigned int force_column_width, const char *histo_vdev_name) { unsigned int namewidth; const char *title; if (cb->cb_flags & IOS_ANYHISTO_M) { title = histo_to_title[IOS_HISTO_IDX(cb->cb_flags)]; } else if (cb->cb_vdev_names_count) { title = "vdev"; } else { title = "pool"; } namewidth = MAX(MAX(strlen(title), cb->cb_namewidth), histo_vdev_name ? 
strlen(histo_vdev_name) : 0); if (histo_vdev_name) printf("%-*s", namewidth, histo_vdev_name); else printf("%*s", namewidth, ""); print_iostat_labels(cb, force_column_width, iostat_top_labels); printf("%-*s", namewidth, title); print_iostat_labels(cb, force_column_width, iostat_bottom_labels); print_iostat_separator_impl(cb, force_column_width); } static void print_iostat_header(iostat_cbdata_t *cb) { print_iostat_header_impl(cb, 0, NULL); } /* * Display a single statistic. */ static void print_one_stat(uint64_t value, enum zfs_nicenum_format format, unsigned int column_size, boolean_t scripted) { char buf[64]; zfs_nicenum_format(value, buf, sizeof (buf), format); if (scripted) printf("\t%s", buf); else printf(" %*s", column_size, buf); } /* * Calculate the default vdev stats * * Subtract oldvs from newvs, apply a scaling factor, and save the resulting * stats into calcvs. */ static void calc_default_iostats(vdev_stat_t *oldvs, vdev_stat_t *newvs, vdev_stat_t *calcvs) { int i; memcpy(calcvs, newvs, sizeof (*calcvs)); for (i = 0; i < ARRAY_SIZE(calcvs->vs_ops); i++) calcvs->vs_ops[i] = (newvs->vs_ops[i] - oldvs->vs_ops[i]); for (i = 0; i < ARRAY_SIZE(calcvs->vs_bytes); i++) calcvs->vs_bytes[i] = (newvs->vs_bytes[i] - oldvs->vs_bytes[i]); } /* * Internal representation of the extended iostats data. * * The extended iostat stats are exported in nvlists as either uint64_t arrays * or single uint64_t's. We make both look like arrays to make them easier * to process. In order to make single uint64_t's look like arrays, we set * __data to the stat data, and then set *data = &__data with count = 1. Then, * we can just use *data and count. */ struct stat_array { uint64_t *data; uint_t count; /* Number of entries in data[] */ uint64_t __data; /* Only used when data is a single uint64_t */ }; static uint64_t -stat_histo_max(struct stat_array *nva, unsigned int len) { +stat_histo_max(struct stat_array *nva, unsigned int len) +{ uint64_t max = 0; int i; for (i = 0; i < len; i++) max = MAX(max, array64_max(nva[i].data, nva[i].count)); return (max); } /* * Helper function to lookup a uint64_t array or uint64_t value and store its * data as a stat_array. If the nvpair is a single uint64_t value, then we make * it look like a one element array to make it easier to process. */ static int nvpair64_to_stat_array(nvlist_t *nvl, const char *name, - struct stat_array *nva) { + struct stat_array *nva) +{ nvpair_t *tmp; int ret; verify(nvlist_lookup_nvpair(nvl, name, &tmp) == 0); switch (nvpair_type(tmp)) { case DATA_TYPE_UINT64_ARRAY: ret = nvpair_value_uint64_array(tmp, &nva->data, &nva->count); break; case DATA_TYPE_UINT64: ret = nvpair_value_uint64(tmp, &nva->__data); nva->data = &nva->__data; nva->count = 1; break; default: /* Not a uint64_t */ ret = EINVAL; break; } return (ret); } /* * Given a list of nvlist names, look up the extended stats in newnv and oldnv, * subtract them, and return the results in a newly allocated stat_array. * You must free the returned array after you are done with it with * free_calc_stats(). * * Additionally, you can set "oldnv" to NULL if you simply want the newnv * values. 
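 */

/*
 * Minimal sketch (an assumption, not original code) of the scalar
 * wrapping trick described for struct stat_array above: a single
 * uint64_t is parked in __data so callers can always iterate
 * data[0..count-1] regardless of the underlying nvpair type.
 */
static void
stat_array_wrap_scalar(struct stat_array *nva, uint64_t value)
{
	nva->__data = value;		/* keep the scalar alive */
	nva->data = &nva->__data;	/* expose it as an array */
	nva->count = 1;			/* ... of exactly one element */
}

/*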
*/ static struct stat_array * calc_and_alloc_stats_ex(const char **names, unsigned int len, nvlist_t *oldnv, nvlist_t *newnv) { nvlist_t *oldnvx = NULL, *newnvx; struct stat_array *oldnva, *newnva, *calcnva; int i, j; unsigned int alloc_size = (sizeof (struct stat_array)) * len; /* Extract our extended stats nvlist from the main list */ verify(nvlist_lookup_nvlist(newnv, ZPOOL_CONFIG_VDEV_STATS_EX, &newnvx) == 0); if (oldnv) { verify(nvlist_lookup_nvlist(oldnv, ZPOOL_CONFIG_VDEV_STATS_EX, &oldnvx) == 0); } newnva = safe_malloc(alloc_size); oldnva = safe_malloc(alloc_size); calcnva = safe_malloc(alloc_size); for (j = 0; j < len; j++) { verify(nvpair64_to_stat_array(newnvx, names[j], &newnva[j]) == 0); calcnva[j].count = newnva[j].count; alloc_size = calcnva[j].count * sizeof (calcnva[j].data[0]); calcnva[j].data = safe_malloc(alloc_size); memcpy(calcnva[j].data, newnva[j].data, alloc_size); if (oldnvx) { verify(nvpair64_to_stat_array(oldnvx, names[j], &oldnva[j]) == 0); for (i = 0; i < oldnva[j].count; i++) calcnva[j].data[i] -= oldnva[j].data[i]; } } free(newnva); free(oldnva); return (calcnva); } static void free_calc_stats(struct stat_array *nva, unsigned int len) { int i; for (i = 0; i < len; i++) free(nva[i].data); free(nva); } static void print_iostat_histo(struct stat_array *nva, unsigned int len, iostat_cbdata_t *cb, unsigned int column_width, unsigned int namewidth, double scale) { int i, j; char buf[6]; uint64_t val; enum zfs_nicenum_format format; unsigned int buckets; unsigned int start_bucket; if (cb->cb_literal) format = ZFS_NICENUM_RAW; else format = ZFS_NICENUM_1024; /* All these histos are the same size, so just use nva[0].count */ buckets = nva[0].count; if (cb->cb_flags & IOS_RQ_HISTO_M) { /* Start at 512 - req size should never be lower than this */ start_bucket = 9; } else { start_bucket = 0; } for (j = start_bucket; j < buckets; j++) { /* Print histogram bucket label */ if (cb->cb_flags & IOS_L_HISTO_M) { /* Ending range of this bucket */ val = (1UL << (j + 1)) - 1; zfs_nicetime(val, buf, sizeof (buf)); } else { /* Request size (starting range of bucket) */ val = (1UL << j); zfs_nicenum(val, buf, sizeof (buf)); } if (cb->cb_scripted) printf("%llu", (u_longlong_t)val); else printf("%-*s", namewidth, buf); /* Print the values on the line */ for (i = 0; i < len; i++) { print_one_stat(nva[i].data[j] * scale, format, column_width, cb->cb_scripted); } printf("\n"); } } static void print_solid_separator(unsigned int length) { while (length--) printf("-"); printf("\n"); } static void print_iostat_histos(iostat_cbdata_t *cb, nvlist_t *oldnv, nvlist_t *newnv, double scale, const char *name) { unsigned int column_width; unsigned int namewidth; unsigned int entire_width; enum iostat_type type; struct stat_array *nva; const char **names; unsigned int names_len; /* What type of histo are we? */ type = IOS_HISTO_IDX(cb->cb_flags); /* Get NULL-terminated array of nvlist names for our histo */ names = vsx_type_to_nvlist[type]; names_len = str_array_len(names); /* num of names */ nva = calc_and_alloc_stats_ex(names, names_len, oldnv, newnv); if (cb->cb_literal) { column_width = MAX(5, (unsigned int) log10(stat_histo_max(nva, names_len)) + 1); } else { column_width = 5; } namewidth = MAX(cb->cb_namewidth, strlen(histo_to_title[IOS_HISTO_IDX(cb->cb_flags)])); /* * Calculate the entire line width of what we're printing. 
The * +2 is for the two spaces between columns: */ /* read write */ /* ----- ----- */ /* |___| <---------- column_width */ /* */ /* |__________| <--- entire_width */ /* */ entire_width = namewidth + (column_width + 2) * label_array_len(iostat_bottom_labels[type]); if (cb->cb_scripted) printf("%s\n", name); else print_iostat_header_impl(cb, column_width, name); print_iostat_histo(nva, names_len, cb, column_width, namewidth, scale); free_calc_stats(nva, names_len); if (!cb->cb_scripted) print_solid_separator(entire_width); } /* * Calculate the average latency of a power-of-two latency histogram */ static uint64_t single_histo_average(uint64_t *histo, unsigned int buckets) { int i; uint64_t count = 0, total = 0; for (i = 0; i < buckets; i++) { /* * Our buckets are power-of-two latency ranges. Use the * midpoint latency of each bucket to calculate the average. * For example: * * Bucket Midpoint * 8ns-15ns: 12ns * 16ns-31ns: 24ns * ... */ if (histo[i] != 0) { total += histo[i] * (((1UL << i) + ((1UL << i)/2))); count += histo[i]; } } /* Prevent divide by zero */ return (count == 0 ? 0 : total / count); } static void print_iostat_queues(iostat_cbdata_t *cb, nvlist_t *oldnv, nvlist_t *newnv, double scale) { int i; uint64_t val; const char *names[] = { ZPOOL_CONFIG_VDEV_SYNC_R_PEND_QUEUE, ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_SYNC_W_PEND_QUEUE, ZPOOL_CONFIG_VDEV_SYNC_W_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_ASYNC_R_PEND_QUEUE, ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_ASYNC_W_PEND_QUEUE, ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_SCRUB_PEND_QUEUE, ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE, }; struct stat_array *nva; unsigned int column_width = default_column_width(cb, IOS_QUEUES); enum zfs_nicenum_format format; nva = calc_and_alloc_stats_ex(names, ARRAY_SIZE(names), NULL, newnv); if (cb->cb_literal) format = ZFS_NICENUM_RAW; else format = ZFS_NICENUM_1024; for (i = 0; i < ARRAY_SIZE(names); i++) { val = nva[i].data[0] * scale; print_one_stat(val, format, column_width, cb->cb_scripted); } free_calc_stats(nva, ARRAY_SIZE(names)); } static void print_iostat_latency(iostat_cbdata_t *cb, nvlist_t *oldnv, nvlist_t *newnv, double scale) { int i; uint64_t val; const char *names[] = { ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_SYNC_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_SYNC_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO, }; struct stat_array *nva; unsigned int column_width = default_column_width(cb, IOS_LATENCY); enum zfs_nicenum_format format; nva = calc_and_alloc_stats_ex(names, ARRAY_SIZE(names), oldnv, newnv); if (cb->cb_literal) format = ZFS_NICENUM_RAW; else format = ZFS_NICENUM_TIME; /* Print our avg latencies on the line */ for (i = 0; i < ARRAY_SIZE(names); i++) { /* Compute average latency for a latency histo */ val = single_histo_average(nva[i].data, nva[i].count) * scale; print_one_stat(val, format, column_width, cb->cb_scripted); } free_calc_stats(nva, ARRAY_SIZE(names)); } /* * Print default statistics (capacity/operations/bandwidth) */ static void print_iostat_default(vdev_stat_t *vs, iostat_cbdata_t *cb, double scale) { unsigned int column_width = default_column_width(cb, IOS_DEFAULT); enum zfs_nicenum_format format; char na; /* char to print for "not applicable" values */ if (cb->cb_literal) { format = ZFS_NICENUM_RAW; na = '0'; } else { format = ZFS_NICENUM_1024; na = 
'-'; } /* only toplevel vdevs have capacity stats */ if (vs->vs_space == 0) { if (cb->cb_scripted) printf("\t%c\t%c", na, na); else printf(" %*c %*c", column_width, na, column_width, na); } else { print_one_stat(vs->vs_alloc, format, column_width, cb->cb_scripted); print_one_stat(vs->vs_space - vs->vs_alloc, format, column_width, cb->cb_scripted); } print_one_stat((uint64_t)(vs->vs_ops[ZIO_TYPE_READ] * scale), format, column_width, cb->cb_scripted); print_one_stat((uint64_t)(vs->vs_ops[ZIO_TYPE_WRITE] * scale), format, column_width, cb->cb_scripted); print_one_stat((uint64_t)(vs->vs_bytes[ZIO_TYPE_READ] * scale), format, column_width, cb->cb_scripted); print_one_stat((uint64_t)(vs->vs_bytes[ZIO_TYPE_WRITE] * scale), format, column_width, cb->cb_scripted); } /* * Print out all the statistics for the given vdev. This can either be the * toplevel configuration, or called recursively. If 'name' is NULL, then this * is a verbose output, and we don't want to display the toplevel pool stats. * * Returns the number of stat lines printed. */ unsigned int print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv, nvlist_t *newnv, iostat_cbdata_t *cb, int depth) { nvlist_t **oldchild, **newchild; uint_t c, children; vdev_stat_t *oldvs, *newvs, *calcvs; vdev_stat_t zerovs = { 0 }; char *vname; int i; int ret = 0; uint64_t tdelta; double scale; calcvs = safe_malloc(sizeof (*calcvs)); if (oldnv != NULL) { verify(nvlist_lookup_uint64_array(oldnv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&oldvs, &c) == 0); } else { oldvs = &zerovs; } /* Do we only want to see a specific vdev? */ for (i = 0; i < cb->cb_vdev_names_count; i++) { /* Yes we do. Is this the vdev? */ if (strcmp(name, cb->cb_vdev_names[i]) == 0) { /* * This is our vdev. Since it is the only vdev we * will be displaying, make depth = 0 so that it * doesn't get indented. */ depth = 0; break; } } if (cb->cb_vdev_names_count && (i == cb->cb_vdev_names_count)) { /* Couldn't match the name */ goto children; } verify(nvlist_lookup_uint64_array(newnv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&newvs, &c) == 0); /* * Print the vdev name unless it's a histogram. Histograms * display the vdev name in the header itself. */ if (!(cb->cb_flags & IOS_ANYHISTO_M)) { if (cb->cb_scripted) { printf("%s", name); } else { if (strlen(name) + depth > cb->cb_namewidth) (void) printf("%*s%s", depth, "", name); else (void) printf("%*s%s%*s", depth, "", name, (int)(cb->cb_namewidth - strlen(name) - depth), ""); } } /* Calculate our scaling factor */ tdelta = newvs->vs_timestamp - oldvs->vs_timestamp; if ((oldvs->vs_timestamp == 0) && (cb->cb_flags & IOS_ANYHISTO_M)) { /* * If we specify printing histograms with no time interval, then * print the histogram numbers over the entire lifetime of the * vdev.
*/ scale = 1; } else { if (tdelta == 0) scale = 1.0; else scale = (double)NANOSEC / tdelta; } if (cb->cb_flags & IOS_DEFAULT_M) { calc_default_iostats(oldvs, newvs, calcvs); print_iostat_default(calcvs, cb, scale); } if (cb->cb_flags & IOS_LATENCY_M) print_iostat_latency(cb, oldnv, newnv, scale); if (cb->cb_flags & IOS_QUEUES_M) print_iostat_queues(cb, oldnv, newnv, scale); if (cb->cb_flags & IOS_ANYHISTO_M) { printf("\n"); print_iostat_histos(cb, oldnv, newnv, scale, name); } if (cb->vcdl != NULL) { char *path; if (nvlist_lookup_string(newnv, ZPOOL_CONFIG_PATH, &path) == 0) { if (!(cb->cb_flags & IOS_ANYHISTO_M)) printf(" "); zpool_print_cmd(cb->vcdl, zpool_get_name(zhp), path); if (cb->cb_flags & IOS_ANYHISTO_M) printf("\n"); } } if (!(cb->cb_flags & IOS_ANYHISTO_M)) printf("\n"); ret++; children: free(calcvs); if (!cb->cb_verbose) return (ret); if (nvlist_lookup_nvlist_array(newnv, ZPOOL_CONFIG_CHILDREN, &newchild, &children) != 0) return (ret); if (oldnv && nvlist_lookup_nvlist_array(oldnv, ZPOOL_CONFIG_CHILDREN, &oldchild, &c) != 0) return (ret); for (c = 0; c < children; c++) { uint64_t ishole = B_FALSE, islog = B_FALSE; (void) nvlist_lookup_uint64(newchild[c], ZPOOL_CONFIG_IS_HOLE, &ishole); (void) nvlist_lookup_uint64(newchild[c], ZPOOL_CONFIG_IS_LOG, &islog); if (ishole || islog) continue; vname = zpool_vdev_name(g_zfs, zhp, newchild[c], cb->cb_name_flags); ret += print_vdev_stats(zhp, vname, oldnv ? oldchild[c] : NULL, newchild[c], cb, depth + 2); free(vname); } /* * Log device section */ if (num_logs(newnv) > 0) { if ((!(cb->cb_flags & IOS_ANYHISTO_M)) && !cb->cb_scripted && !cb->cb_vdev_names) { print_iostat_dashes(cb, 0, "logs"); } for (c = 0; c < children; c++) { uint64_t islog = B_FALSE; (void) nvlist_lookup_uint64(newchild[c], ZPOOL_CONFIG_IS_LOG, &islog); if (islog) { vname = zpool_vdev_name(g_zfs, zhp, newchild[c], cb->cb_name_flags); ret += print_vdev_stats(zhp, vname, oldnv ? oldchild[c] : NULL, newchild[c], cb, depth + 2); free(vname); } } } /* * Include level 2 ARC devices in iostat output */ if (nvlist_lookup_nvlist_array(newnv, ZPOOL_CONFIG_L2CACHE, &newchild, &children) != 0) return (ret); if (oldnv && nvlist_lookup_nvlist_array(oldnv, ZPOOL_CONFIG_L2CACHE, &oldchild, &c) != 0) return (ret); if (children > 0) { if ((!(cb->cb_flags & IOS_ANYHISTO_M)) && !cb->cb_scripted && !cb->cb_vdev_names) { print_iostat_dashes(cb, 0, "cache"); } for (c = 0; c < children; c++) { vname = zpool_vdev_name(g_zfs, zhp, newchild[c], cb->cb_name_flags); ret += print_vdev_stats(zhp, vname, oldnv ? oldchild[c] : NULL, newchild[c], cb, depth + 2); free(vname); } } return (ret); } static int refresh_iostat(zpool_handle_t *zhp, void *data) { iostat_cbdata_t *cb = data; boolean_t missing; /* * If the pool has disappeared, remove it from the list and continue. */ if (zpool_refresh_stats(zhp, &missing) != 0) return (-1); if (missing) pool_list_remove(cb->cb_list, zhp); return (0); } /* * Callback to print out the iostats for the given pool. 
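 *
 * Scaling note with a worked example (illustrative): print_vdev_stats()
 * above computes scale = NANOSEC / tdelta, so for samples taken five
 * seconds apart (tdelta = 5 * NANOSEC) scale is 0.2, and a raw delta of
 * 500 reads prints as 100 ops/sec.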
*/ int print_iostat(zpool_handle_t *zhp, void *data) { iostat_cbdata_t *cb = data; nvlist_t *oldconfig, *newconfig; nvlist_t *oldnvroot, *newnvroot; int ret; newconfig = zpool_get_config(zhp, &oldconfig); if (cb->cb_iteration == 1) oldconfig = NULL; verify(nvlist_lookup_nvlist(newconfig, ZPOOL_CONFIG_VDEV_TREE, &newnvroot) == 0); if (oldconfig == NULL) oldnvroot = NULL; else verify(nvlist_lookup_nvlist(oldconfig, ZPOOL_CONFIG_VDEV_TREE, &oldnvroot) == 0); ret = print_vdev_stats(zhp, zpool_get_name(zhp), oldnvroot, newnvroot, cb, 0); if ((ret != 0) && !(cb->cb_flags & IOS_ANYHISTO_M) && !cb->cb_scripted && cb->cb_verbose && !cb->cb_vdev_names_count) { print_iostat_separator(cb); } return (ret); } static int get_columns(void) { struct winsize ws; int columns = 80; int error; if (isatty(STDOUT_FILENO)) { error = ioctl(STDOUT_FILENO, TIOCGWINSZ, &ws); if (error == 0) columns = ws.ws_col; } else { columns = 999; } return (columns); } int get_namewidth(zpool_handle_t *zhp, void *data) { iostat_cbdata_t *cb = data; nvlist_t *config, *nvroot; int columns; if ((config = zpool_get_config(zhp, NULL)) != NULL) { verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); unsigned int poolname_len = strlen(zpool_get_name(zhp)); if (!cb->cb_verbose) cb->cb_namewidth = poolname_len; else cb->cb_namewidth = MAX(poolname_len, max_width(zhp, nvroot, 0, cb->cb_namewidth, cb->cb_name_flags)); } /* * The width must be at least 10, but may be as large as the * column width - 42 so that we can still fit in one line. */ columns = get_columns(); if (cb->cb_namewidth < 10) cb->cb_namewidth = 10; if (cb->cb_namewidth > columns - 42) cb->cb_namewidth = columns - 42; return (0); } /* * Parse the input string, get the 'interval' and 'count' value if there is one. */ static void get_interval_count(int *argcp, char **argv, float *iv, unsigned long *cnt) { float interval = 0; unsigned long count = 0; int argc = *argcp; /* * Determine if the last argument is an integer or a pool name */ if (argc > 0 && isnumber(argv[argc - 1])) { char *end; errno = 0; interval = strtof(argv[argc - 1], &end); if (*end == '\0' && errno == 0) { if (interval == 0) { (void) fprintf(stderr, gettext("interval " "cannot be zero\n")); usage(B_FALSE); } /* * Ignore the last parameter */ argc--; } else { /* * If this is not a valid number, just plow on. The * user will get a more informative error message later * on. */ interval = 0; } } /* * If the last argument is also an integer, then we have both a count * and an interval. */ if (argc > 0 && isnumber(argv[argc - 1])) { char *end; errno = 0; count = interval; interval = strtof(argv[argc - 1], &end); if (*end == '\0' && errno == 0) { if (interval == 0) { (void) fprintf(stderr, gettext("interval " "cannot be zero\n")); usage(B_FALSE); } /* * Ignore the last parameter */ argc--; } else { interval = 0; } } *iv = interval; *cnt = count; *argcp = argc; } static void get_timestamp_arg(char c) { if (c == 'u') timestamp_fmt = UDATE; else if (c == 'd') timestamp_fmt = DDATE; else usage(B_FALSE); } /* * Return stat flags that are supported by all pools by both the module and * zpool iostat. "*data" should be initialized to all 0xFFs before running. * It will get ANDed down until only the flags that are supported on all pools * remain. 
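 *
 * For example (illustrative numbers): a pool supporting default,
 * latency, and queue stats contributes 0x7, while an older pool
 * supporting only the default stats contributes 0x1; ANDing leaves
 * 0x1, so only the universally supported stats are shown.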
*/ static int get_stat_flags_cb(zpool_handle_t *zhp, void *data) { uint64_t *mask = data; nvlist_t *config, *nvroot, *nvx; uint64_t flags = 0; int i, j; config = zpool_get_config(zhp, NULL); verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); /* Default stats are always supported, but for completeness.. */ if (nvlist_exists(nvroot, ZPOOL_CONFIG_VDEV_STATS)) flags |= IOS_DEFAULT_M; /* Get our extended stats nvlist from the main list */ if (nvlist_lookup_nvlist(nvroot, ZPOOL_CONFIG_VDEV_STATS_EX, &nvx) != 0) { /* * No extended stats; they're probably running an older * module. No big deal, we support that too. */ goto end; } /* For each extended stat, make sure all its nvpairs are supported */ for (j = 0; j < ARRAY_SIZE(vsx_type_to_nvlist); j++) { if (!vsx_type_to_nvlist[j][0]) continue; /* Start off by assuming the flag is supported, then check */ flags |= (1ULL << j); for (i = 0; vsx_type_to_nvlist[j][i]; i++) { if (!nvlist_exists(nvx, vsx_type_to_nvlist[j][i])) { /* flag isn't supported */ flags = flags & ~(1ULL << j); break; } } } end: *mask = *mask & flags; return (0); } /* * Return a bitmask of stats that are supported on all pools by both the module * and zpool iostat. */ static uint64_t get_stat_flags(zpool_list_t *list) { uint64_t mask = -1; /* * get_stat_flags_cb() will lop off bits from "mask" until only the * flags that are supported on all pools remain. */ pool_list_iter(list, B_FALSE, get_stat_flags_cb, &mask); return (mask); } /* * Return 1 if cb_data->cb_vdev_names[0] is this vdev's name, 0 otherwise. */ static int is_vdev_cb(zpool_handle_t *zhp, nvlist_t *nv, void *cb_data) { iostat_cbdata_t *cb = cb_data; char *name = NULL; int ret = 0; name = zpool_vdev_name(g_zfs, zhp, nv, cb->cb_name_flags); if (strcmp(name, cb->cb_vdev_names[0]) == 0) ret = 1; /* match */ free(name); return (ret); } /* * Returns 1 if cb_data->cb_vdev_names[0] is a vdev name, 0 otherwise. */ static int is_vdev(zpool_handle_t *zhp, void *cb_data) { return (for_each_vdev(zhp, is_vdev_cb, cb_data)); } /* * Check if vdevs are in a pool * * Return 1 if all argv[] strings are vdev names in pool "pool_name". Otherwise * return 0. If pool_name is NULL, then search all pools. */ static int are_vdevs_in_pool(int argc, char **argv, char *pool_name, iostat_cbdata_t *cb) { char **tmp_name; int ret = 0; int i; int pool_count = 0; if ((argc == 0) || !*argv) return (0); if (pool_name) pool_count = 1; /* Temporarily hijack cb_vdev_names for a second... */ tmp_name = cb->cb_vdev_names; /* Go through our list of prospective vdev names */ for (i = 0; i < argc; i++) { cb->cb_vdev_names = argv + i; /* Is this name a vdev in our pools? */ ret = for_each_pool(pool_count, &pool_name, B_TRUE, NULL, is_vdev, cb); if (!ret) { /* No match */ break; } } cb->cb_vdev_names = tmp_name; return (ret); } static int is_pool_cb(zpool_handle_t *zhp, void *data) { char *name = data; if (strcmp(name, zpool_get_name(zhp)) == 0) return (1); return (0); } /* * Do we have a pool named *name? If so, return 1, otherwise 0. */ static int is_pool(char *name) { return (for_each_pool(0, NULL, B_TRUE, NULL, is_pool_cb, name)); } /* Are all our argv[] strings pool names? If so return 1, 0 otherwise. */ static int -are_all_pools(int argc, char **argv) { +are_all_pools(int argc, char **argv) +{ if ((argc == 0) || !*argv) return (0); while (--argc >= 0) if (!is_pool(argv[argc])) return (0); return (1); } /* * Helper function to print out vdev/pool names we can't resolve. Used for an * error message.
*/ static void error_list_unresolved_vdevs(int argc, char **argv, char *pool_name, iostat_cbdata_t *cb) { int i; char *name; char *str; for (i = 0; i < argc; i++) { name = argv[i]; if (is_pool(name)) str = gettext("pool"); else if (are_vdevs_in_pool(1, &name, pool_name, cb)) str = gettext("vdev in this pool"); else if (are_vdevs_in_pool(1, &name, NULL, cb)) str = gettext("vdev in another pool"); else str = gettext("unknown"); fprintf(stderr, "\t%s (%s)\n", name, str); } } /* * Same as get_interval_count(), but with additional checks to not misinterpret * guids as interval/count values. Assumes VDEV_NAME_GUID is set in * cb.cb_name_flags. */ static void get_interval_count_filter_guids(int *argc, char **argv, float *interval, unsigned long *count, iostat_cbdata_t *cb) { char **tmpargv = argv; int argc_for_interval = 0; /* Is the last arg an interval value? Or a guid? */ if (*argc >= 1 && !are_vdevs_in_pool(1, &argv[*argc - 1], NULL, cb)) { /* * The last arg is not a guid, so it's probably an * interval value. */ argc_for_interval++; if (*argc >= 2 && !are_vdevs_in_pool(1, &argv[*argc - 2], NULL, cb)) { /* * The 2nd to last arg is not a guid, so it's probably * an interval value. */ argc_for_interval++; } } /* Point to our list of possible intervals */ tmpargv = &argv[*argc - argc_for_interval]; *argc = *argc - argc_for_interval; get_interval_count(&argc_for_interval, tmpargv, interval, count); } /* * Floating point sleep(). Allows you to pass in a floating point value for * seconds. */ static void -fsleep(float sec) { +fsleep(float sec) +{ struct timespec req; req.tv_sec = floor(sec); req.tv_nsec = (sec - (float)req.tv_sec) * NANOSEC; nanosleep(&req, NULL); } /* * zpool iostat [-c CMD] [-ghHLpPvy] [[-lq]|[-r|-w]] [-n name] [-T d|u] * [[ pool ...]|[pool vdev ...]|[vdev ...]] * [interval [count]] * * -c CMD For each vdev, run command CMD * -g Display guid for individual vdev name. * -L Follow links when resolving vdev path name. * -P Display full path for vdev name. * -v Display statistics for individual vdevs * -h Display help * -p Display values in parsable (exact) format. * -H Scripted mode. Don't display headers, and separate properties * by a single tab. * -l Display average latency * -q Display queue depths * -w Display latency histograms * -r Display request size histogram * -T Display a timestamp in date(1) or Unix format * * This command can be tricky because we want to be able to deal with pool * creation/destruction as well as vdev configuration changes. The bulk of this * processing is handled by the pool_list_* routines in zpool_iter.c. We rely * on pool_list_update() to detect the addition of new pools. Configuration * changes are all handled within libzfs. 
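 *
 * Example invocations (illustrative; "tank" is an assumed pool name):
 *
 *	zpool iostat -v tank 5		per-vdev stats every 5 seconds
 *	zpool iostat -l tank 2 10	average latencies, 10 samples
 *	zpool iostat -w tank		latency histograms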
*/ int zpool_do_iostat(int argc, char **argv) { int c; int ret; int npools; float interval = 0; unsigned long count = 0; zpool_list_t *list; boolean_t verbose = B_FALSE; boolean_t latency = B_FALSE, l_histo = B_FALSE, rq_histo = B_FALSE; boolean_t queues = B_FALSE, parsable = B_FALSE, scripted = B_FALSE; boolean_t omit_since_boot = B_FALSE; boolean_t guid = B_FALSE; boolean_t follow_links = B_FALSE; boolean_t full_name = B_FALSE; iostat_cbdata_t cb = { 0 }; char *cmd = NULL; /* Used for printing error message */ const char flag_to_arg[] = {[IOS_LATENCY] = 'l', [IOS_QUEUES] = 'q', [IOS_L_HISTO] = 'w', [IOS_RQ_HISTO] = 'r'}; uint64_t unsupported_flags; /* check options */ while ((c = getopt(argc, argv, "c:gLPT:vyhplqrwH")) != -1) { switch (c) { case 'c': cmd = optarg; break; case 'g': guid = B_TRUE; break; case 'L': follow_links = B_TRUE; break; case 'P': full_name = B_TRUE; break; case 'T': get_timestamp_arg(*optarg); break; case 'v': verbose = B_TRUE; break; case 'p': parsable = B_TRUE; break; case 'l': latency = B_TRUE; break; case 'q': queues = B_TRUE; break; case 'H': scripted = B_TRUE; break; case 'w': l_histo = B_TRUE; break; case 'r': rq_histo = B_TRUE; break; case 'y': omit_since_boot = B_TRUE; break; case 'h': usage(B_FALSE); break; case '?': if (optopt == 'c') { fprintf(stderr, gettext("Missing CMD for -c\n")); } else { fprintf(stderr, gettext("invalid option '%c'\n"), optopt); } usage(B_FALSE); } } argc -= optind; argv += optind; cb.cb_literal = parsable; cb.cb_scripted = scripted; if (guid) cb.cb_name_flags |= VDEV_NAME_GUID; if (follow_links) cb.cb_name_flags |= VDEV_NAME_FOLLOW_LINKS; if (full_name) cb.cb_name_flags |= VDEV_NAME_PATH; cb.cb_iteration = 0; cb.cb_namewidth = 0; cb.cb_verbose = verbose; /* Get our interval and count values (if any) */ if (guid) { get_interval_count_filter_guids(&argc, argv, &interval, &count, &cb); } else { get_interval_count(&argc, argv, &interval, &count); } if (argc == 0) { /* No args, so just print the defaults. */ } else if (are_all_pools(argc, argv)) { /* All the args are pool names */ } else if (are_vdevs_in_pool(argc, argv, NULL, &cb)) { /* All the args are vdevs */ cb.cb_vdev_names = argv; cb.cb_vdev_names_count = argc; argc = 0; /* No pools to process */ } else if (are_all_pools(1, argv)) { /* The first arg is a pool name */ if (are_vdevs_in_pool(argc - 1, argv + 1, argv[0], &cb)) { /* ...and the rest are vdev names */ cb.cb_vdev_names = argv + 1; cb.cb_vdev_names_count = argc - 1; argc = 1; /* One pool to process */ } else { fprintf(stderr, gettext("Expected either a list of ")); fprintf(stderr, gettext("pools, or list of vdevs in")); fprintf(stderr, " \"%s\", ", argv[0]); fprintf(stderr, gettext("but got:\n")); error_list_unresolved_vdevs(argc - 1, argv + 1, argv[0], &cb); fprintf(stderr, "\n"); usage(B_FALSE); return (1); } } else { /* * The args don't make sense. The first arg isn't a pool name, * nor are all the args vdevs. */ fprintf(stderr, gettext("Unable to parse pools/vdevs list.\n")); fprintf(stderr, "\n"); return (1); } if (cb.cb_vdev_names_count != 0) { /* * If user specified vdevs, it implies verbose. */ cb.cb_verbose = B_TRUE; } /* * Construct the list of all interesting pools. 
*/ ret = 0; if ((list = pool_list_get(argc, argv, NULL, &ret)) == NULL) return (1); if (pool_list_count(list) == 0 && argc != 0) { pool_list_free(list); return (1); } if (pool_list_count(list) == 0 && interval == 0) { pool_list_free(list); (void) fprintf(stderr, gettext("no pools available\n")); return (1); } if ((l_histo || rq_histo) && (queues || latency)) { pool_list_free(list); (void) fprintf(stderr, gettext("[-r|-w] isn't allowed with [-q|-l]\n")); usage(B_FALSE); return (1); } if (l_histo && rq_histo) { pool_list_free(list); (void) fprintf(stderr, gettext("Only one of [-r|-w] can be passed at a time\n")); usage(B_FALSE); return (1); } /* * Enter the main iostat loop. */ cb.cb_list = list; if (l_histo) { /* * Histograms tables look out of place when you try to display * them with the other stats, so make a rule that you can only * print histograms by themselves. */ cb.cb_flags = IOS_L_HISTO_M; } else if (rq_histo) { cb.cb_flags = IOS_RQ_HISTO_M; } else { cb.cb_flags = IOS_DEFAULT_M; if (latency) cb.cb_flags |= IOS_LATENCY_M; if (queues) cb.cb_flags |= IOS_QUEUES_M; } /* * See if the module supports all the stats we want to display. */ unsupported_flags = cb.cb_flags & ~get_stat_flags(list); if (unsupported_flags) { uint64_t f; int idx; fprintf(stderr, gettext("The loaded zfs module doesn't support:")); /* for each bit set in unsupported_flags */ for (f = unsupported_flags; f; f &= ~(1ULL << idx)) { idx = lowbit64(f) - 1; fprintf(stderr, " -%c", flag_to_arg[idx]); } fprintf(stderr, ". Try running a newer module.\n"); pool_list_free(list); return (1); } for (;;) { if ((npools = pool_list_count(list)) == 0) (void) fprintf(stderr, gettext("no pools available\n")); else { /* * If this is the first iteration and -y was supplied * we skip any printing. */ boolean_t skip = (omit_since_boot && cb.cb_iteration == 0); /* * Refresh all statistics. This is done as an * explicit step before calculating the maximum name * width, so that any * configuration changes are * properly accounted for. */ (void) pool_list_iter(list, B_FALSE, refresh_iostat, &cb); /* * Iterate over all pools to determine the maximum width * for the pool / device name column across all pools. */ cb.cb_namewidth = 0; (void) pool_list_iter(list, B_FALSE, get_namewidth, &cb); if (timestamp_fmt != NODATE) print_timestamp(timestamp_fmt); /* * If it's the first time and we're not skipping it, * or either skip or verbose mode, print the header. * * The histogram code explicitly prints its header on * every vdev, so skip this for histograms. */ if (((++cb.cb_iteration == 1 && !skip) || (skip != verbose)) && (!(cb.cb_flags & IOS_ANYHISTO_M)) && !cb.cb_scripted) print_iostat_header(&cb); if (skip) { (void) fsleep(interval); continue; } if (cmd != NULL && cb.cb_verbose) cb.vcdl = all_pools_for_each_vdev_run(argc, argv, cmd, g_zfs, cb.cb_vdev_names, cb.cb_vdev_names_count, cb.cb_name_flags); pool_list_iter(list, B_FALSE, print_iostat, &cb); if (cb.vcdl != NULL) free_vdev_cmd_data_list(cb.vcdl); /* * If there's more than one pool, and we're not in * verbose mode (which prints a separator for us), * then print a separator. * * In addition, if we're printing specific vdevs then * we also want an ending separator. */ if (((npools > 1 && !verbose && !(cb.cb_flags & IOS_ANYHISTO_M)) || (!(cb.cb_flags & IOS_ANYHISTO_M) && cb.cb_vdev_names_count)) && !cb.cb_scripted) { print_iostat_separator(&cb); } } /* * Flush the output so that redirection to a file isn't buffered * indefinitely. 
*/ (void) fflush(stdout); if (interval == 0) break; if (count != 0 && --count == 0) break; (void) fsleep(interval); } pool_list_free(list); return (ret); } typedef struct list_cbdata { boolean_t cb_verbose; int cb_name_flags; int cb_namewidth; boolean_t cb_scripted; zprop_list_t *cb_proplist; boolean_t cb_literal; } list_cbdata_t; /* * Given a list of columns to display, output appropriate headers for each one. */ static void print_header(list_cbdata_t *cb) { zprop_list_t *pl = cb->cb_proplist; char headerbuf[ZPOOL_MAXPROPLEN]; const char *header; boolean_t first = B_TRUE; boolean_t right_justify; size_t width = 0; for (; pl != NULL; pl = pl->pl_next) { width = pl->pl_width; if (first && cb->cb_verbose) { /* * Reset the width to accommodate the verbose listing * of devices. */ width = cb->cb_namewidth; } if (!first) (void) printf(" "); else first = B_FALSE; right_justify = B_FALSE; if (pl->pl_prop != ZPROP_INVAL) { header = zpool_prop_column_name(pl->pl_prop); right_justify = zpool_prop_align_right(pl->pl_prop); } else { int i; for (i = 0; pl->pl_user_prop[i] != '\0'; i++) headerbuf[i] = toupper(pl->pl_user_prop[i]); headerbuf[i] = '\0'; header = headerbuf; } if (pl->pl_next == NULL && !right_justify) (void) printf("%s", header); else if (right_justify) (void) printf("%*s", (int)width, header); else (void) printf("%-*s", (int)width, header); } (void) printf("\n"); } /* * Given a pool and a list of properties, print out all the properties according * to the described layout. */ static void print_pool(zpool_handle_t *zhp, list_cbdata_t *cb) { zprop_list_t *pl = cb->cb_proplist; boolean_t first = B_TRUE; char property[ZPOOL_MAXPROPLEN]; char *propstr; boolean_t right_justify; size_t width; for (; pl != NULL; pl = pl->pl_next) { width = pl->pl_width; if (first && cb->cb_verbose) { /* * Reset the width to accommodate the verbose listing * of devices. */ width = cb->cb_namewidth; } if (!first) { if (cb->cb_scripted) (void) printf("\t"); else (void) printf(" "); } else { first = B_FALSE; } right_justify = B_FALSE; if (pl->pl_prop != ZPROP_INVAL) { if (zpool_get_prop(zhp, pl->pl_prop, property, sizeof (property), NULL, cb->cb_literal) != 0) propstr = "-"; else propstr = property; right_justify = zpool_prop_align_right(pl->pl_prop); } else if ((zpool_prop_feature(pl->pl_user_prop) || zpool_prop_unsupported(pl->pl_user_prop)) && zpool_prop_get_feature(zhp, pl->pl_user_prop, property, sizeof (property)) == 0) { propstr = property; } else { propstr = "-"; } /* * If this is being called in scripted mode, or if this is the * last column and it is left-justified, don't include a width * format specifier. 
*/ if (cb->cb_scripted || (pl->pl_next == NULL && !right_justify)) (void) printf("%s", propstr); else if (right_justify) (void) printf("%*s", (int)width, propstr); else (void) printf("%-*s", (int)width, propstr); } (void) printf("\n"); } static void print_one_column(zpool_prop_t prop, uint64_t value, boolean_t scripted, boolean_t valid, enum zfs_nicenum_format format) { char propval[64]; boolean_t fixed; size_t width = zprop_width(prop, &fixed, ZFS_TYPE_POOL); switch (prop) { case ZPOOL_PROP_EXPANDSZ: if (value == 0) (void) strlcpy(propval, "-", sizeof (propval)); else zfs_nicenum_format(value, propval, sizeof (propval), format); break; case ZPOOL_PROP_FRAGMENTATION: if (value == ZFS_FRAG_INVALID) { (void) strlcpy(propval, "-", sizeof (propval)); } else if (format == ZFS_NICENUM_RAW) { (void) snprintf(propval, sizeof (propval), "%llu", (unsigned long long)value); } else { (void) snprintf(propval, sizeof (propval), "%llu%%", (unsigned long long)value); } break; case ZPOOL_PROP_CAPACITY: if (format == ZFS_NICENUM_RAW) (void) snprintf(propval, sizeof (propval), "%llu", (unsigned long long)value); else (void) snprintf(propval, sizeof (propval), "%llu%%", (unsigned long long)value); break; default: zfs_nicenum_format(value, propval, sizeof (propval), format); } if (!valid) (void) strlcpy(propval, "-", sizeof (propval)); if (scripted) (void) printf("\t%s", propval); else (void) printf(" %*s", (int)width, propval); } void print_list_stats(zpool_handle_t *zhp, const char *name, nvlist_t *nv, list_cbdata_t *cb, int depth) { nvlist_t **child; vdev_stat_t *vs; uint_t c, children; char *vname; boolean_t scripted = cb->cb_scripted; uint64_t islog = B_FALSE; boolean_t haslog = B_FALSE; char *dashes = "%-*s - - - - - -\n"; verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &c) == 0); if (name != NULL) { boolean_t toplevel = (vs->vs_space != 0); uint64_t cap; enum zfs_nicenum_format format; if (cb->cb_literal) format = ZFS_NICENUM_RAW; else format = ZFS_NICENUM_1024; if (scripted) (void) printf("\t%s", name); else if (strlen(name) + depth > cb->cb_namewidth) (void) printf("%*s%s", depth, "", name); else (void) printf("%*s%s%*s", depth, "", name, (int)(cb->cb_namewidth - strlen(name) - depth), ""); /* * Print the properties for the individual vdevs. Some * properties are only applicable to toplevel vdevs. The * 'toplevel' boolean value is passed to the print_one_column() * to indicate that the value is valid. */ print_one_column(ZPOOL_PROP_SIZE, vs->vs_space, scripted, toplevel, format); print_one_column(ZPOOL_PROP_ALLOCATED, vs->vs_alloc, scripted, toplevel, format); print_one_column(ZPOOL_PROP_FREE, vs->vs_space - vs->vs_alloc, scripted, toplevel, format); print_one_column(ZPOOL_PROP_EXPANDSZ, vs->vs_esize, scripted, B_TRUE, format); print_one_column(ZPOOL_PROP_FRAGMENTATION, vs->vs_fragmentation, scripted, (vs->vs_fragmentation != ZFS_FRAG_INVALID && toplevel), format); cap = (vs->vs_space == 0) ? 
0 : (vs->vs_alloc * 100 / vs->vs_space); print_one_column(ZPOOL_PROP_CAPACITY, cap, scripted, toplevel, format); (void) printf("\n"); } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) return; for (c = 0; c < children; c++) { uint64_t ishole = B_FALSE; if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, &ishole) == 0 && ishole) continue; if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &islog) == 0 && islog) { haslog = B_TRUE; continue; } vname = zpool_vdev_name(g_zfs, zhp, child[c], cb->cb_name_flags); print_list_stats(zhp, vname, child[c], cb, depth + 2); free(vname); } if (haslog == B_TRUE) { /* LINTED E_SEC_PRINTF_VAR_FMT */ (void) printf(dashes, cb->cb_namewidth, "log"); for (c = 0; c < children; c++) { if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &islog) != 0 || !islog) continue; vname = zpool_vdev_name(g_zfs, zhp, child[c], cb->cb_name_flags); print_list_stats(zhp, vname, child[c], cb, depth + 2); free(vname); } } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, &child, &children) == 0 && children > 0) { /* LINTED E_SEC_PRINTF_VAR_FMT */ (void) printf(dashes, cb->cb_namewidth, "cache"); for (c = 0; c < children; c++) { vname = zpool_vdev_name(g_zfs, zhp, child[c], cb->cb_name_flags); print_list_stats(zhp, vname, child[c], cb, depth + 2); free(vname); } } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, &child, &children) == 0 && children > 0) { /* LINTED E_SEC_PRINTF_VAR_FMT */ (void) printf(dashes, cb->cb_namewidth, "spare"); for (c = 0; c < children; c++) { vname = zpool_vdev_name(g_zfs, zhp, child[c], cb->cb_name_flags); print_list_stats(zhp, vname, child[c], cb, depth + 2); free(vname); } } } /* * Generic callback function to list a pool. */ int list_callback(zpool_handle_t *zhp, void *data) { list_cbdata_t *cbp = data; nvlist_t *config; nvlist_t *nvroot; config = zpool_get_config(zhp, NULL); print_pool(zhp, cbp); if (!cbp->cb_verbose) return (0); verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); print_list_stats(zhp, NULL, nvroot, cbp, 0); return (0); } /* * zpool list [-gHLpP] [-o prop[,prop]*] [-T d|u] [pool] ... [interval [count]] * * -g Display guid for individual vdev name. * -H Scripted mode. Don't display headers, and separate properties * by a single tab. * -L Follow links when resolving vdev path name. * -o List of properties to display. Defaults to * "name,size,allocated,free,expandsize,fragmentation,capacity," * "dedupratio,health,altroot" * -p Display values in parsable (exact) format. * -P Display full path for vdev name. * -T Display a timestamp in date(1) or Unix format * * List all pools in the system, whether or not they're healthy. Output space * statistics for each one, as well as health status summary. 
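 *
 * Example (illustrative): "zpool list -o name,size,capacity 5" prints
 * just those columns for every pool and repeats every five seconds.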
*/ int zpool_do_list(int argc, char **argv) { int c; int ret = 0; list_cbdata_t cb = { 0 }; static char default_props[] = "name,size,allocated,free,expandsize,fragmentation,capacity," "dedupratio,health,altroot"; char *props = default_props; float interval = 0; unsigned long count = 0; zpool_list_t *list; boolean_t first = B_TRUE; /* check options */ while ((c = getopt(argc, argv, ":gHLo:pPT:v")) != -1) { switch (c) { case 'g': cb.cb_name_flags |= VDEV_NAME_GUID; break; case 'H': cb.cb_scripted = B_TRUE; break; case 'L': cb.cb_name_flags |= VDEV_NAME_FOLLOW_LINKS; break; case 'o': props = optarg; break; case 'P': cb.cb_name_flags |= VDEV_NAME_PATH; break; case 'p': cb.cb_literal = B_TRUE; break; case 'T': get_timestamp_arg(*optarg); break; case 'v': cb.cb_verbose = B_TRUE; break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; get_interval_count(&argc, argv, &interval, &count); if (zprop_get_list(g_zfs, props, &cb.cb_proplist, ZFS_TYPE_POOL) != 0) usage(B_FALSE); for (;;) { if ((list = pool_list_get(argc, argv, &cb.cb_proplist, &ret)) == NULL) return (1); if (pool_list_count(list) == 0) break; if (timestamp_fmt != NODATE) print_timestamp(timestamp_fmt); if (!cb.cb_scripted && (first || cb.cb_verbose)) { print_header(&cb); first = B_FALSE; } ret = pool_list_iter(list, B_TRUE, list_callback, &cb); if (interval == 0) break; if (count != 0 && --count == 0) break; pool_list_free(list); (void) fsleep(interval); } if (argc == 0 && !cb.cb_scripted && pool_list_count(list) == 0) { (void) printf(gettext("no pools available\n")); ret = 0; } pool_list_free(list); zprop_free_list(cb.cb_proplist); return (ret); } static int zpool_do_attach_or_replace(int argc, char **argv, int replacing) { boolean_t force = B_FALSE; int c; nvlist_t *nvroot; char *poolname, *old_disk, *new_disk; zpool_handle_t *zhp; nvlist_t *props = NULL; char *propval; int ret; /* check options */ while ((c = getopt(argc, argv, "fo:")) != -1) { switch (c) { case 'f': force = B_TRUE; break; case 'o': if ((propval = strchr(optarg, '=')) == NULL) { (void) fprintf(stderr, gettext("missing " "'=' for -o option\n")); usage(B_FALSE); } *propval = '\0'; propval++; if ((strcmp(optarg, ZPOOL_CONFIG_ASHIFT) != 0) || (add_prop_list(optarg, propval, &props, B_TRUE))) usage(B_FALSE); break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); usage(B_FALSE); } poolname = argv[0]; if (argc < 2) { (void) fprintf(stderr, gettext("missing <device> specification\n")); usage(B_FALSE); } old_disk = argv[1]; if (argc < 3) { if (!replacing) { (void) fprintf(stderr, gettext("missing <new_device> specification\n")); usage(B_FALSE); } new_disk = old_disk; argc -= 1; argv += 1; } else { new_disk = argv[2]; argc -= 2; argv += 2; } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } if ((zhp = zpool_open(g_zfs, poolname)) == NULL) { nvlist_free(props); return (1); } if (zpool_get_config(zhp, NULL) == NULL) { (void) fprintf(stderr, gettext("pool '%s' is unavailable\n"), poolname); zpool_close(zhp); nvlist_free(props); return (1); } nvroot = make_root_vdev(zhp, props, force, B_FALSE, replacing, B_FALSE, argc, argv); if (nvroot == NULL) { zpool_close(zhp);
nvlist_free(props); return (1); } ret = zpool_vdev_attach(zhp, old_disk, new_disk, nvroot, replacing); nvlist_free(props); nvlist_free(nvroot); zpool_close(zhp); return (ret); } /* * zpool replace [-f] <pool> <device> [new_device] * * -f Force attach, even if <new_device> appears to be in use. * * Replace <device> with <new_device>. */ /* ARGSUSED */ int zpool_do_replace(int argc, char **argv) { return (zpool_do_attach_or_replace(argc, argv, B_TRUE)); } /* * zpool attach [-f] [-o property=value] <pool> <device> <new_device> * * -f Force attach, even if <new_device> appears to be in use. * -o Set property=value. * * Attach <new_device> to the mirror containing <device>. If <device> is not * part of a mirror, then <device> will be transformed into a mirror of * <device> and <new_device>. In either case, <new_device> will begin life * with a DTL of [0, now], and will immediately begin to resilver itself. */ int zpool_do_attach(int argc, char **argv) { return (zpool_do_attach_or_replace(argc, argv, B_FALSE)); } /* * zpool detach [-f] <pool> <device> * * -f Force detach of <device>, even if DTLs argue against it * (not supported yet) * * Detach a device from a mirror. The operation will be refused if <device> * is the last device in the mirror, or if the DTLs indicate that this device * has the only valid copy of some data. */ /* ARGSUSED */ int zpool_do_detach(int argc, char **argv) { int c; char *poolname, *path; zpool_handle_t *zhp; int ret; /* check options */ while ((c = getopt(argc, argv, "f")) != -1) { switch (c) { case 'f': case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); usage(B_FALSE); } if (argc < 2) { (void) fprintf(stderr, gettext("missing <device> specification\n")); usage(B_FALSE); } poolname = argv[0]; path = argv[1]; if ((zhp = zpool_open(g_zfs, poolname)) == NULL) return (1); ret = zpool_vdev_detach(zhp, path); zpool_close(zhp); return (ret); } /* * zpool split [-gLnP] [-o prop=val] ... * [-o mntopt] ... * [-R altroot] <pool> <newpool> [<device> ...] * * -g Display guid for individual vdev name. * -L Follow links when resolving vdev path name. * -n Do not split the pool, but display the resulting layout if * it were to be split. * -o Set property=value, or set mount options. * -P Display full path for vdev name. * -R Mount the split-off pool under an alternate root. * * Splits the named pool and gives it the new pool name. Devices to be split * off may be listed, provided that no more than one device is specified * per top-level vdev mirror. The newly split pool is left in an exported * state unless -R is specified. * * Restrictions: the top-level of the pool must only be made up of * mirrors; all devices in the pool must be healthy; no device may be * undergoing a resilvering operation.
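 *
 * Example (illustrative; pool names are assumptions): "zpool split -R
 * /mnt tank tank2" detaches one half of each mirror in "tank", names
 * the new pool "tank2", and imports it under the altroot "/mnt".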
*/ int zpool_do_split(int argc, char **argv) { char *srcpool, *newpool, *propval; char *mntopts = NULL; splitflags_t flags; int c, ret = 0; zpool_handle_t *zhp; nvlist_t *config, *props = NULL; flags.dryrun = B_FALSE; flags.import = B_FALSE; flags.name_flags = 0; /* check options */ while ((c = getopt(argc, argv, ":gLR:no:P")) != -1) { switch (c) { case 'g': flags.name_flags |= VDEV_NAME_GUID; break; case 'L': flags.name_flags |= VDEV_NAME_FOLLOW_LINKS; break; case 'R': flags.import = B_TRUE; if (add_prop_list( zpool_prop_to_name(ZPOOL_PROP_ALTROOT), optarg, &props, B_TRUE) != 0) { nvlist_free(props); usage(B_FALSE); } break; case 'n': flags.dryrun = B_TRUE; break; case 'o': if ((propval = strchr(optarg, '=')) != NULL) { *propval = '\0'; propval++; if (add_prop_list(optarg, propval, &props, B_TRUE) != 0) { nvlist_free(props); usage(B_FALSE); } } else { mntopts = optarg; } break; case 'P': flags.name_flags |= VDEV_NAME_PATH; break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); break; } } if (!flags.import && mntopts != NULL) { (void) fprintf(stderr, gettext("setting mntopts is only " "valid when importing the pool\n")); usage(B_FALSE); } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("Missing pool name\n")); usage(B_FALSE); } if (argc < 2) { (void) fprintf(stderr, gettext("Missing new pool name\n")); usage(B_FALSE); } srcpool = argv[0]; newpool = argv[1]; argc -= 2; argv += 2; if ((zhp = zpool_open(g_zfs, srcpool)) == NULL) { nvlist_free(props); return (1); } config = split_mirror_vdev(zhp, newpool, props, flags, argc, argv); if (config == NULL) { ret = 1; } else { if (flags.dryrun) { (void) printf(gettext("would create '%s' with the " "following layout:\n\n"), newpool); print_vdev_tree(NULL, newpool, config, 0, B_FALSE, flags.name_flags); } } zpool_close(zhp); if (ret != 0 || flags.dryrun || !flags.import) { nvlist_free(config); nvlist_free(props); return (ret); } /* * The split was successful. Now we need to open the new * pool and import it. */ if ((zhp = zpool_open_canfail(g_zfs, newpool)) == NULL) { nvlist_free(config); nvlist_free(props); return (1); } if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL && zpool_enable_datasets(zhp, mntopts, 0) != 0) { ret = 1; (void) fprintf(stderr, gettext("Split was successful, but " "the datasets could not all be mounted\n")); (void) fprintf(stderr, gettext("Try doing '%s' with a " "different altroot\n"), "zpool import"); } zpool_close(zhp); nvlist_free(config); nvlist_free(props); return (ret); } /* * zpool online <pool> <device> ...
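 *
 * Example (illustrative; device name is an assumption): "zpool online
 * -e tank sda" brings "sda" back online and expands it to use any
 * additional capacity (-e sets ZFS_ONLINE_EXPAND below).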
*/ int zpool_do_online(int argc, char **argv) { int c, i; char *poolname; zpool_handle_t *zhp; int ret = 0; vdev_state_t newstate; int flags = 0; /* check options */ while ((c = getopt(argc, argv, "et")) != -1) { switch (c) { case 'e': flags |= ZFS_ONLINE_EXPAND; break; case 't': case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name\n")); usage(B_FALSE); } if (argc < 2) { (void) fprintf(stderr, gettext("missing device name\n")); usage(B_FALSE); } poolname = argv[0]; if ((zhp = zpool_open(g_zfs, poolname)) == NULL) return (1); for (i = 1; i < argc; i++) { if (zpool_vdev_online(zhp, argv[i], flags, &newstate) == 0) { if (newstate != VDEV_STATE_HEALTHY) { (void) printf(gettext("warning: device '%s' " "onlined, but remains in faulted state\n"), argv[i]); if (newstate == VDEV_STATE_FAULTED) (void) printf(gettext("use 'zpool " "clear' to restore a faulted " "device\n")); else (void) printf(gettext("use 'zpool " "replace' to replace devices " "that are no longer present\n")); } } else { ret = 1; } } zpool_close(zhp); return (ret); } /* * zpool offline [-ft] <pool> <device> ... * * -f Force the device into the offline state, even if doing * so would appear to compromise pool availability. * (not supported yet) * * -t Only take the device off-line temporarily. The offline * state will not be persistent across reboots. */ /* ARGSUSED */ int zpool_do_offline(int argc, char **argv) { int c, i; char *poolname; zpool_handle_t *zhp; int ret = 0; boolean_t istmp = B_FALSE; /* check options */ while ((c = getopt(argc, argv, "ft")) != -1) { switch (c) { case 't': istmp = B_TRUE; break; case 'f': case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name\n")); usage(B_FALSE); } if (argc < 2) { (void) fprintf(stderr, gettext("missing device name\n")); usage(B_FALSE); } poolname = argv[0]; if ((zhp = zpool_open(g_zfs, poolname)) == NULL) return (1); for (i = 1; i < argc; i++) { if (zpool_vdev_offline(zhp, argv[i], istmp) != 0) ret = 1; } zpool_close(zhp); return (ret); } /* * zpool clear <pool> [device] * * Clear all errors associated with a pool or a particular device.
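 *
 * Example invocations (editor's illustration):
 *
 *	zpool clear tank        # clear error counts for every device in 'tank'
 *	zpool clear -nF tank    # check whether a rewind recovery would work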
*/ int zpool_do_clear(int argc, char **argv) { int c; int ret = 0; boolean_t dryrun = B_FALSE; boolean_t do_rewind = B_FALSE; boolean_t xtreme_rewind = B_FALSE; uint32_t rewind_policy = ZPOOL_NO_REWIND; nvlist_t *policy = NULL; zpool_handle_t *zhp; char *pool, *device; /* check options */ while ((c = getopt(argc, argv, "FnX")) != -1) { switch (c) { case 'F': do_rewind = B_TRUE; break; case 'n': dryrun = B_TRUE; break; case 'X': xtreme_rewind = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name\n")); usage(B_FALSE); } if (argc > 2) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } if ((dryrun || xtreme_rewind) && !do_rewind) { (void) fprintf(stderr, gettext("-n or -X only meaningful with -F\n")); usage(B_FALSE); } if (dryrun) rewind_policy = ZPOOL_TRY_REWIND; else if (do_rewind) rewind_policy = ZPOOL_DO_REWIND; if (xtreme_rewind) rewind_policy |= ZPOOL_EXTREME_REWIND; /* In future, further rewind policy choices can be passed along here */ if (nvlist_alloc(&policy, NV_UNIQUE_NAME, 0) != 0 || nvlist_add_uint32(policy, ZPOOL_REWIND_REQUEST, rewind_policy) != 0) return (1); pool = argv[0]; device = argc == 2 ? argv[1] : NULL; if ((zhp = zpool_open_canfail(g_zfs, pool)) == NULL) { nvlist_free(policy); return (1); } if (zpool_clear(zhp, device, policy) != 0) ret = 1; zpool_close(zhp); nvlist_free(policy); return (ret); } /* * zpool reguid <pool> */ int zpool_do_reguid(int argc, char **argv) { int c; char *poolname; zpool_handle_t *zhp; int ret = 0; /* check options */ while ((c = getopt(argc, argv, "")) != -1) { switch (c) { case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } poolname = argv[0]; if ((zhp = zpool_open(g_zfs, poolname)) == NULL) return (1); ret = zpool_reguid(zhp); zpool_close(zhp); return (ret); } /* * zpool reopen <pool> * * Reopen the pool so that the kernel can update the sizes of all vdevs. */ int zpool_do_reopen(int argc, char **argv) { int c; int ret = 0; zpool_handle_t *zhp; char *pool; /* check options */ while ((c = getopt(argc, argv, "")) != -1) { switch (c) { case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc--; argv++; if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } pool = argv[0]; if ((zhp = zpool_open_canfail(g_zfs, pool)) == NULL) return (1); ret = zpool_reopen(zhp); zpool_close(zhp); return (ret); } typedef struct scrub_cbdata { int cb_type; int cb_argc; char **cb_argv; } scrub_cbdata_t; int scrub_callback(zpool_handle_t *zhp, void *data) { scrub_cbdata_t *cb = data; int err; /* * Ignore faulted pools. */ if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) { (void) fprintf(stderr, gettext("cannot scrub '%s': pool is " "currently unavailable\n"), zpool_get_name(zhp)); return (1); } err = zpool_scan(zhp, cb->cb_type); return (err != 0); } /* * zpool scrub [-s] <pool> ... * * -s Stop. Stops any in-progress scrub.
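 *
 * Example invocations (editor's illustration):
 *
 *	zpool scrub tank        # start scrubbing 'tank'
 *	zpool scrub -s tank     # stop the scrub currently in progress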
*/ int zpool_do_scrub(int argc, char **argv) { int c; scrub_cbdata_t cb; cb.cb_type = POOL_SCAN_SCRUB; /* check options */ while ((c = getopt(argc, argv, "s")) != -1) { switch (c) { case 's': cb.cb_type = POOL_SCAN_NONE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } cb.cb_argc = argc; cb.cb_argv = argv; argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); usage(B_FALSE); } return (for_each_pool(argc, argv, B_TRUE, NULL, scrub_callback, &cb)); } /* * Print out detailed scrub status. */ void print_scan_status(pool_scan_stat_t *ps) { time_t start, end; uint64_t elapsed, mins_left, hours_left; uint64_t pass_exam, examined, total; uint_t rate; double fraction_done; char processed_buf[7], examined_buf[7], total_buf[7], rate_buf[7]; (void) printf(gettext(" scan: ")); /* If there's never been a scan, there's not much to say. */ if (ps == NULL || ps->pss_func == POOL_SCAN_NONE || ps->pss_func >= POOL_SCAN_FUNCS) { (void) printf(gettext("none requested\n")); return; } start = ps->pss_start_time; end = ps->pss_end_time; zfs_nicenum(ps->pss_processed, processed_buf, sizeof (processed_buf)); assert(ps->pss_func == POOL_SCAN_SCRUB || ps->pss_func == POOL_SCAN_RESILVER); /* * Scan is finished or canceled. */ if (ps->pss_state == DSS_FINISHED) { uint64_t minutes_taken = (end - start) / 60; char *fmt = NULL; if (ps->pss_func == POOL_SCAN_SCRUB) { fmt = gettext("scrub repaired %s in %lluh%um with " "%llu errors on %s"); } else if (ps->pss_func == POOL_SCAN_RESILVER) { fmt = gettext("resilvered %s in %lluh%um with " "%llu errors on %s"); } /* LINTED */ (void) printf(fmt, processed_buf, (u_longlong_t)(minutes_taken / 60), (uint_t)(minutes_taken % 60), (u_longlong_t)ps->pss_errors, ctime((time_t *)&end)); return; } else if (ps->pss_state == DSS_CANCELED) { if (ps->pss_func == POOL_SCAN_SCRUB) { (void) printf(gettext("scrub canceled on %s"), ctime(&end)); } else if (ps->pss_func == POOL_SCAN_RESILVER) { (void) printf(gettext("resilver canceled on %s"), ctime(&end)); } return; } assert(ps->pss_state == DSS_SCANNING); /* * Scan is in progress. */ if (ps->pss_func == POOL_SCAN_SCRUB) { (void) printf(gettext("scrub in progress since %s"), ctime(&start)); } else if (ps->pss_func == POOL_SCAN_RESILVER) { (void) printf(gettext("resilver in progress since %s"), ctime(&start)); } examined = ps->pss_examined ? ps->pss_examined : 1; total = ps->pss_to_examine; fraction_done = (double)examined / total; /* elapsed time for this pass */ elapsed = time(NULL) - ps->pss_pass_start; elapsed = elapsed ? elapsed : 1; pass_exam = ps->pss_pass_exam ? ps->pss_pass_exam : 1; rate = pass_exam / elapsed; rate = rate ? 
rate : 1; mins_left = ((total - examined) / rate) / 60; hours_left = mins_left / 60; zfs_nicenum(examined, examined_buf, sizeof (examined_buf)); zfs_nicenum(total, total_buf, sizeof (total_buf)); zfs_nicenum(rate, rate_buf, sizeof (rate_buf)); /* * do not print estimated time if hours_left is more than 30 days */ (void) printf(gettext("\t%s scanned out of %s at %s/s"), examined_buf, total_buf, rate_buf); if (hours_left < (30 * 24)) { (void) printf(gettext(", %lluh%um to go\n"), (u_longlong_t)hours_left, (uint_t)(mins_left % 60)); } else { (void) printf(gettext( ", (scan is slow, no estimated time)\n")); } if (ps->pss_func == POOL_SCAN_RESILVER) { (void) printf(gettext("\t%s resilvered, %.2f%% done\n"), processed_buf, 100 * fraction_done); } else if (ps->pss_func == POOL_SCAN_SCRUB) { (void) printf(gettext("\t%s repaired, %.2f%% done\n"), processed_buf, 100 * fraction_done); } } static void print_error_log(zpool_handle_t *zhp) { nvlist_t *nverrlist = NULL; nvpair_t *elem; char *pathname; size_t len = MAXPATHLEN * 2; if (zpool_get_errlog(zhp, &nverrlist) != 0) { (void) printf("errors: List of errors unavailable " "(insufficient privileges)\n"); return; } (void) printf("errors: Permanent errors have been " "detected in the following files:\n\n"); pathname = safe_malloc(len); elem = NULL; while ((elem = nvlist_next_nvpair(nverrlist, elem)) != NULL) { nvlist_t *nv; uint64_t dsobj, obj; verify(nvpair_value_nvlist(elem, &nv) == 0); verify(nvlist_lookup_uint64(nv, ZPOOL_ERR_DATASET, &dsobj) == 0); verify(nvlist_lookup_uint64(nv, ZPOOL_ERR_OBJECT, &obj) == 0); zpool_obj_to_path(zhp, dsobj, obj, pathname, len); (void) printf("%7s %s\n", "", pathname); } free(pathname); nvlist_free(nverrlist); } static void print_spares(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t **spares, uint_t nspares) { uint_t i; char *name; if (nspares == 0) return; (void) printf(gettext("\tspares\n")); for (i = 0; i < nspares; i++) { name = zpool_vdev_name(g_zfs, zhp, spares[i], cb->cb_name_flags); print_status_config(zhp, cb, name, spares[i], 2, B_TRUE); free(name); } } static void print_l2cache(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t **l2cache, uint_t nl2cache) { uint_t i; char *name; if (nl2cache == 0) return; (void) printf(gettext("\tcache\n")); for (i = 0; i < nl2cache; i++) { name = zpool_vdev_name(g_zfs, zhp, l2cache[i], cb->cb_name_flags); print_status_config(zhp, cb, name, l2cache[i], 2, B_FALSE); free(name); } } static void print_dedup_stats(nvlist_t *config) { ddt_histogram_t *ddh; ddt_stat_t *dds; ddt_object_t *ddo; uint_t c; /* * If the pool was faulted then we may not have been able to * obtain the config. Otherwise, if we have anything in the dedup * table continue processing the stats. */ if (nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_OBJ_STATS, (uint64_t **)&ddo, &c) != 0) return; (void) printf("\n"); (void) printf(gettext(" dedup: ")); if (ddo->ddo_count == 0) { (void) printf(gettext("no DDT entries\n")); return; } (void) printf("DDT entries %llu, size %llu on disk, %llu in core\n", (u_longlong_t)ddo->ddo_count, (u_longlong_t)ddo->ddo_dspace, (u_longlong_t)ddo->ddo_mspace); verify(nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_STATS, (uint64_t **)&dds, &c) == 0); verify(nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_HISTOGRAM, (uint64_t **)&ddh, &c) == 0); zpool_dump_ddt(dds, ddh); } /* * Display a summary of pool status. Displays a summary such as: * * pool: tank * status: DEGRADED * reason: One or more devices ... 
* see: http://zfsonlinux.org/msg/ZFS-xxxx-01 * config: * mirror DEGRADED * c1t0d0 OK * c2t0d0 UNAVAIL * * When given the '-v' option, we print out the complete config. If the '-e' * option is specified, then we print out error rate information as well. */ int status_callback(zpool_handle_t *zhp, void *data) { status_cbdata_t *cbp = data; nvlist_t *config, *nvroot; char *msgid; zpool_status_t reason; zpool_errata_t errata; const char *health; uint_t c; vdev_stat_t *vs; config = zpool_get_config(zhp, NULL); reason = zpool_get_status(zhp, &msgid, &errata); cbp->cb_count++; /* * If we were given 'zpool status -x', only report those pools with * problems. */ if (cbp->cb_explain && (reason == ZPOOL_STATUS_OK || reason == ZPOOL_STATUS_VERSION_OLDER || reason == ZPOOL_STATUS_FEAT_DISABLED)) { if (!cbp->cb_allpools) { (void) printf(gettext("pool '%s' is healthy\n"), zpool_get_name(zhp)); if (cbp->cb_first) cbp->cb_first = B_FALSE; } return (0); } if (cbp->cb_first) cbp->cb_first = B_FALSE; else (void) printf("\n"); verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &c) == 0); health = zpool_state_to_name(vs->vs_state, vs->vs_aux); (void) printf(gettext(" pool: %s\n"), zpool_get_name(zhp)); (void) printf(gettext(" state: %s\n"), health); switch (reason) { case ZPOOL_STATUS_MISSING_DEV_R: (void) printf(gettext("status: One or more devices could not " "be opened. Sufficient replicas exist for\n\tthe pool to " "continue functioning in a degraded state.\n")); (void) printf(gettext("action: Attach the missing device and " "online it using 'zpool online'.\n")); break; case ZPOOL_STATUS_MISSING_DEV_NR: (void) printf(gettext("status: One or more devices could not " "be opened. There are insufficient\n\treplicas for the " "pool to continue functioning.\n")); (void) printf(gettext("action: Attach the missing device and " "online it using 'zpool online'.\n")); break; case ZPOOL_STATUS_CORRUPT_LABEL_R: (void) printf(gettext("status: One or more devices could not " "be used because the label is missing or\n\tinvalid. " "Sufficient replicas exist for the pool to continue\n\t" "functioning in a degraded state.\n")); (void) printf(gettext("action: Replace the device using " "'zpool replace'.\n")); break; case ZPOOL_STATUS_CORRUPT_LABEL_NR: (void) printf(gettext("status: One or more devices could not " "be used because the label is missing \n\tor invalid. " "There are insufficient replicas for the pool to " "continue\n\tfunctioning.\n")); zpool_explain_recover(zpool_get_handle(zhp), zpool_get_name(zhp), reason, config); break; case ZPOOL_STATUS_FAILING_DEV: (void) printf(gettext("status: One or more devices has " "experienced an unrecoverable error. An\n\tattempt was " "made to correct the error. 
Applications are " "unaffected.\n")); (void) printf(gettext("action: Determine if the device needs " "to be replaced, and clear the errors\n\tusing " "'zpool clear' or replace the device with 'zpool " "replace'.\n")); break; case ZPOOL_STATUS_OFFLINE_DEV: (void) printf(gettext("status: One or more devices has " "been taken offline by the administrator.\n\tSufficient " "replicas exist for the pool to continue functioning in " "a\n\tdegraded state.\n")); (void) printf(gettext("action: Online the device using " "'zpool online' or replace the device with\n\t'zpool " "replace'.\n")); break; case ZPOOL_STATUS_REMOVED_DEV: (void) printf(gettext("status: One or more devices has " "been removed by the administrator.\n\tSufficient " "replicas exist for the pool to continue functioning in " "a\n\tdegraded state.\n")); (void) printf(gettext("action: Online the device using " "'zpool online' or replace the device with\n\t'zpool " "replace'.\n")); break; case ZPOOL_STATUS_RESILVERING: (void) printf(gettext("status: One or more devices is " "currently being resilvered. The pool will\n\tcontinue " "to function, possibly in a degraded state.\n")); (void) printf(gettext("action: Wait for the resilver to " "complete.\n")); break; case ZPOOL_STATUS_CORRUPT_DATA: (void) printf(gettext("status: One or more devices has " "experienced an error resulting in data\n\tcorruption. " "Applications may be affected.\n")); (void) printf(gettext("action: Restore the file in question " "if possible. Otherwise restore the\n\tentire pool from " "backup.\n")); break; case ZPOOL_STATUS_CORRUPT_POOL: (void) printf(gettext("status: The pool metadata is corrupted " "and the pool cannot be opened.\n")); zpool_explain_recover(zpool_get_handle(zhp), zpool_get_name(zhp), reason, config); break; case ZPOOL_STATUS_VERSION_OLDER: (void) printf(gettext("status: The pool is formatted using a " "legacy on-disk format. The pool can\n\tstill be used, " "but some features are unavailable.\n")); (void) printf(gettext("action: Upgrade the pool using 'zpool " "upgrade'. Once this is done, the\n\tpool will no longer " "be accessible on software that does not support\n\t" "feature flags.\n")); break; case ZPOOL_STATUS_VERSION_NEWER: (void) printf(gettext("status: The pool has been upgraded to a " "newer, incompatible on-disk version.\n\tThe pool cannot " "be accessed on this system.\n")); (void) printf(gettext("action: Access the pool from a system " "running more recent software, or\n\trestore the pool from " "backup.\n")); break; case ZPOOL_STATUS_FEAT_DISABLED: (void) printf(gettext("status: Some supported features are not " "enabled on the pool. The pool can\n\tstill be used, but " "some features are unavailable.\n")); (void) printf(gettext("action: Enable all features using " "'zpool upgrade'. Once this is done,\n\tthe pool may no " "longer be accessible by software that does not support\n\t" "the features. See zpool-features(5) for details.\n")); break; case ZPOOL_STATUS_UNSUP_FEAT_READ: (void) printf(gettext("status: The pool cannot be accessed on " "this system because it uses the\n\tfollowing feature(s) " "not supported on this system:\n")); zpool_print_unsup_feat(config); (void) printf("\n"); (void) printf(gettext("action: Access the pool from a system " "that supports the required feature(s),\n\tor restore the " "pool from backup.\n")); break; case ZPOOL_STATUS_UNSUP_FEAT_WRITE: (void) printf(gettext("status: The pool can only be accessed " "in read-only mode on this system. 
It\n\tcannot be " "accessed in read-write mode because it uses the " "following\n\tfeature(s) not supported on this system:\n")); zpool_print_unsup_feat(config); (void) printf("\n"); (void) printf(gettext("action: The pool cannot be accessed in " "read-write mode. Import the pool with\n" "\t\"-o readonly=on\", access the pool from a system that " "supports the\n\trequired feature(s), or restore the " "pool from backup.\n")); break; case ZPOOL_STATUS_FAULTED_DEV_R: (void) printf(gettext("status: One or more devices are " "faulted in response to persistent errors.\n\tSufficient " "replicas exist for the pool to continue functioning " "in a\n\tdegraded state.\n")); (void) printf(gettext("action: Replace the faulted device, " "or use 'zpool clear' to mark the device\n\trepaired.\n")); break; case ZPOOL_STATUS_FAULTED_DEV_NR: (void) printf(gettext("status: One or more devices are " "faulted in response to persistent errors. There are " "insufficient replicas for the pool to\n\tcontinue " "functioning.\n")); (void) printf(gettext("action: Destroy and re-create the pool " "from a backup source. Manually marking the device\n" "\trepaired using 'zpool clear' may allow some data " "to be recovered.\n")); break; case ZPOOL_STATUS_IO_FAILURE_WAIT: case ZPOOL_STATUS_IO_FAILURE_CONTINUE: (void) printf(gettext("status: One or more devices are " "faulted in response to IO failures.\n")); (void) printf(gettext("action: Make sure the affected devices " "are connected, then run 'zpool clear'.\n")); break; case ZPOOL_STATUS_BAD_LOG: (void) printf(gettext("status: An intent log record " "could not be read.\n" "\tWaiting for administrator intervention to fix the " "faulted pool.\n")); (void) printf(gettext("action: Either restore the affected " "device(s) and run 'zpool online',\n" "\tor ignore the intent log records by running " "'zpool clear'.\n")); break; case ZPOOL_STATUS_HOSTID_MISMATCH: (void) printf(gettext("status: Mismatch between pool hostid " "and system hostid on imported pool.\n\tThis pool was " "previously imported into a system with a different " "hostid,\n\tand then was verbatim imported into this " "system.\n")); (void) printf(gettext("action: Export this pool on all systems " "on which it is imported.\n" "\tThen import it to correct the mismatch.\n")); break; case ZPOOL_STATUS_ERRATA: (void) printf(gettext("status: Errata #%d detected.\n"), errata); switch (errata) { case ZPOOL_ERRATA_NONE: break; case ZPOOL_ERRATA_ZOL_2094_SCRUB: (void) printf(gettext("action: To correct the issue " "run 'zpool scrub'.\n")); break; default: /* * All errata which allow the pool to be imported * must contain an action message. */ assert(0); } break; default: /* * The remaining errors can't actually be generated, yet. 
*/ assert(reason == ZPOOL_STATUS_OK); } if (msgid != NULL) (void) printf(gettext(" see: http://zfsonlinux.org/msg/%s\n"), msgid); if (config != NULL) { uint64_t nerr; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; pool_scan_stat_t *ps = NULL; (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &c); print_scan_status(ps); cbp->cb_namewidth = max_width(zhp, nvroot, 0, 0, cbp->cb_name_flags | VDEV_NAME_TYPE_ID); if (cbp->cb_namewidth < 10) cbp->cb_namewidth = 10; (void) printf(gettext("config:\n\n")); (void) printf(gettext("\t%-*s %-8s %5s %5s %5s\n"), cbp->cb_namewidth, "NAME", "STATE", "READ", "WRITE", "CKSUM"); print_status_config(zhp, cbp, zpool_get_name(zhp), nvroot, 0, B_FALSE); if (num_logs(nvroot) > 0) print_logs(zhp, cbp, nvroot); if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0) print_l2cache(zhp, cbp, l2cache, nl2cache); if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) print_spares(zhp, cbp, spares, nspares); if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_ERRCOUNT, &nerr) == 0) { nvlist_t *nverrlist = NULL; /* * If the approximate error count is small, get a * precise count by fetching the entire log and * uniquifying the results. */ if (nerr > 0 && nerr < 100 && !cbp->cb_verbose && zpool_get_errlog(zhp, &nverrlist) == 0) { nvpair_t *elem; elem = NULL; nerr = 0; while ((elem = nvlist_next_nvpair(nverrlist, elem)) != NULL) { nerr++; } } nvlist_free(nverrlist); (void) printf("\n"); if (nerr == 0) (void) printf(gettext("errors: No known data " "errors\n")); else if (!cbp->cb_verbose) (void) printf(gettext("errors: %llu data " "errors, use '-v' for a list\n"), (u_longlong_t)nerr); else print_error_log(zhp); } if (cbp->cb_dedup_stats) print_dedup_stats(config); } else { (void) printf(gettext("config: The configuration cannot be " "determined.\n")); } return (0); } /* * zpool status [-c CMD] [-gLPvx] [-T d|u] [pool] ... [interval [count]] * * -c CMD For each vdev, run command CMD * -g Display guid for individual vdev name. * -L Follow links when resolving vdev path name. * -P Display full path for vdev name. * -v Display complete error logs * -x Display only pools with potential problems * -D Display dedup status (undocumented) * -T Display a timestamp in date(1) or Unix format * * Describes the health status of all pools or some subset. 
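 *
 * Example invocations (editor's illustration):
 *
 *	zpool status -x         # report only pools with potential problems
 *	zpool status -v tank    # also list files affected by known data errors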
*/ int zpool_do_status(int argc, char **argv) { int c; int ret; float interval = 0; unsigned long count = 0; status_cbdata_t cb = { 0 }; char *cmd = NULL; /* check options */ while ((c = getopt(argc, argv, "c:gLPvxDT:")) != -1) { switch (c) { case 'c': cmd = optarg; break; case 'g': cb.cb_name_flags |= VDEV_NAME_GUID; break; case 'L': cb.cb_name_flags |= VDEV_NAME_FOLLOW_LINKS; break; case 'P': cb.cb_name_flags |= VDEV_NAME_PATH; break; case 'v': cb.cb_verbose = B_TRUE; break; case 'x': cb.cb_explain = B_TRUE; break; case 'D': cb.cb_dedup_stats = B_TRUE; break; case 'T': get_timestamp_arg(*optarg); break; case '?': if (optopt == 'c') { fprintf(stderr, gettext("Missing CMD for -c\n")); } else { fprintf(stderr, gettext("invalid option '%c'\n"), optopt); } usage(B_FALSE); } } argc -= optind; argv += optind; get_interval_count(&argc, argv, &interval, &count); if (argc == 0) cb.cb_allpools = B_TRUE; cb.cb_first = B_TRUE; cb.cb_print_status = B_TRUE; for (;;) { if (timestamp_fmt != NODATE) print_timestamp(timestamp_fmt); if (cmd != NULL) cb.vcdl = all_pools_for_each_vdev_run(argc, argv, cmd, NULL, NULL, 0, 0); ret = for_each_pool(argc, argv, B_TRUE, NULL, status_callback, &cb); if (cb.vcdl != NULL) free_vdev_cmd_data_list(cb.vcdl); if (argc == 0 && cb.cb_count == 0) (void) fprintf(stderr, gettext("no pools available\n")); else if (cb.cb_explain && cb.cb_first && cb.cb_allpools) (void) printf(gettext("all pools are healthy\n")); if (ret != 0) return (ret); if (interval == 0) break; if (count != 0 && --count == 0) break; (void) fsleep(interval); } return (0); } typedef struct upgrade_cbdata { int cb_first; int cb_argc; uint64_t cb_version; char **cb_argv; } upgrade_cbdata_t; static int check_unsupp_fs(zfs_handle_t *zhp, void *unsupp_fs) { int zfs_version = (int)zfs_prop_get_int(zhp, ZFS_PROP_VERSION); int *count = (int *)unsupp_fs; if (zfs_version > ZPL_VERSION) { (void) printf(gettext("%s (v%d) is not supported by this " "implementation of ZFS.\n"), zfs_get_name(zhp), zfs_version); (*count)++; } zfs_iter_filesystems(zhp, check_unsupp_fs, unsupp_fs); zfs_close(zhp); return (0); } static int upgrade_version(zpool_handle_t *zhp, uint64_t version) { int ret; nvlist_t *config; uint64_t oldversion; int unsupp_fs = 0; config = zpool_get_config(zhp, NULL); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &oldversion) == 0); assert(SPA_VERSION_IS_SUPPORTED(oldversion)); assert(oldversion < version); ret = zfs_iter_root(zpool_get_handle(zhp), check_unsupp_fs, &unsupp_fs); if (ret != 0) return (ret); if (unsupp_fs) { (void) fprintf(stderr, gettext("Upgrade not performed due " "to %d unsupported filesystems (max v%d).\n"), unsupp_fs, (int)ZPL_VERSION); return (1); } ret = zpool_upgrade(zhp, version); if (ret != 0) return (ret); if (version >= SPA_VERSION_FEATURES) { (void) printf(gettext("Successfully upgraded " "'%s' from version %llu to feature flags.\n"), zpool_get_name(zhp), (u_longlong_t)oldversion); } else { (void) printf(gettext("Successfully upgraded " "'%s' from version %llu to version %llu.\n"), zpool_get_name(zhp), (u_longlong_t)oldversion, (u_longlong_t)version); } return (0); } static int upgrade_enable_all(zpool_handle_t *zhp, int *countp) { int i, ret, count; boolean_t firstff = B_TRUE; nvlist_t *enabled = zpool_get_features(zhp); count = 0; for (i = 0; i < SPA_FEATURES; i++) { const char *fname = spa_feature_table[i].fi_uname; const char *fguid = spa_feature_table[i].fi_guid; if (!nvlist_exists(enabled, fguid)) { char *propname; verify(-1 != asprintf(&propname, "feature@%s", fname)); 
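/* editor's note: enabling a feature is just a pool property set, i.e. feature@<name>=enabled */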
ret = zpool_set_prop(zhp, propname, ZFS_FEATURE_ENABLED); if (ret != 0) { free(propname); return (ret); } count++; if (firstff) { (void) printf(gettext("Enabled the " "following features on '%s':\n"), zpool_get_name(zhp)); firstff = B_FALSE; } (void) printf(gettext(" %s\n"), fname); free(propname); } } if (countp != NULL) *countp = count; return (0); } static int upgrade_cb(zpool_handle_t *zhp, void *arg) { upgrade_cbdata_t *cbp = arg; nvlist_t *config; uint64_t version; boolean_t printnl = B_FALSE; int ret; config = zpool_get_config(zhp, NULL); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) == 0); assert(SPA_VERSION_IS_SUPPORTED(version)); if (version < cbp->cb_version) { cbp->cb_first = B_FALSE; ret = upgrade_version(zhp, cbp->cb_version); if (ret != 0) return (ret); printnl = B_TRUE; /* * If they did "zpool upgrade -a", then we could * be doing ioctls to different pools. We need * to log this history once to each pool, and bypass * the normal history logging that happens in main(). */ (void) zpool_log_history(g_zfs, history_str); log_history = B_FALSE; } if (cbp->cb_version >= SPA_VERSION_FEATURES) { int count; ret = upgrade_enable_all(zhp, &count); if (ret != 0) return (ret); if (count > 0) { cbp->cb_first = B_FALSE; printnl = B_TRUE; } } if (printnl) { (void) printf(gettext("\n")); } return (0); } static int upgrade_list_older_cb(zpool_handle_t *zhp, void *arg) { upgrade_cbdata_t *cbp = arg; nvlist_t *config; uint64_t version; config = zpool_get_config(zhp, NULL); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) == 0); assert(SPA_VERSION_IS_SUPPORTED(version)); if (version < SPA_VERSION_FEATURES) { if (cbp->cb_first) { (void) printf(gettext("The following pools are " "formatted with legacy version numbers and can\n" "be upgraded to use feature flags. After " "being upgraded, these pools\nwill no " "longer be accessible by software that does not " "support feature\nflags.\n\n")); (void) printf(gettext("VER POOL\n")); (void) printf(gettext("--- ------------\n")); cbp->cb_first = B_FALSE; } (void) printf("%2llu %s\n", (u_longlong_t)version, zpool_get_name(zhp)); } return (0); } static int upgrade_list_disabled_cb(zpool_handle_t *zhp, void *arg) { upgrade_cbdata_t *cbp = arg; nvlist_t *config; uint64_t version; config = zpool_get_config(zhp, NULL); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) == 0); if (version >= SPA_VERSION_FEATURES) { int i; boolean_t poolfirst = B_TRUE; nvlist_t *enabled = zpool_get_features(zhp); for (i = 0; i < SPA_FEATURES; i++) { const char *fguid = spa_feature_table[i].fi_guid; const char *fname = spa_feature_table[i].fi_uname; if (!nvlist_exists(enabled, fguid)) { if (cbp->cb_first) { (void) printf(gettext("\nSome " "supported features are not " "enabled on the following pools. " "Once a\nfeature is enabled the " "pool may become incompatible with " "software\nthat does not support " "the feature. See " "zpool-features(5) for " "details.\n\n")); (void) printf(gettext("POOL " "FEATURE\n")); (void) printf(gettext("------" "---------\n")); cbp->cb_first = B_FALSE; } if (poolfirst) { (void) printf(gettext("%s\n"), zpool_get_name(zhp)); poolfirst = B_FALSE; } (void) printf(gettext(" %s\n"), fname); } /* * If they did "zpool upgrade -a", then we could * be doing ioctls to different pools. We need * to log this history once to each pool, and bypass * the normal history logging that happens in main(). 
*/ (void) zpool_log_history(g_zfs, history_str); log_history = B_FALSE; } } return (0); } /* ARGSUSED */ static int upgrade_one(zpool_handle_t *zhp, void *data) { boolean_t printnl = B_FALSE; upgrade_cbdata_t *cbp = data; uint64_t cur_version; int ret; if (strcmp("log", zpool_get_name(zhp)) == 0) { (void) fprintf(stderr, gettext("'log' is now a reserved word\n" "Pool 'log' must be renamed using export and import" " to upgrade.\n")); return (1); } cur_version = zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL); if (cur_version > cbp->cb_version) { (void) printf(gettext("Pool '%s' is already formatted " "using more current version '%llu'.\n\n"), zpool_get_name(zhp), (u_longlong_t)cur_version); return (0); } if (cbp->cb_version != SPA_VERSION && cur_version == cbp->cb_version) { (void) printf(gettext("Pool '%s' is already formatted " "using version %llu.\n\n"), zpool_get_name(zhp), (u_longlong_t)cbp->cb_version); return (0); } if (cur_version != cbp->cb_version) { printnl = B_TRUE; ret = upgrade_version(zhp, cbp->cb_version); if (ret != 0) return (ret); } if (cbp->cb_version >= SPA_VERSION_FEATURES) { int count = 0; ret = upgrade_enable_all(zhp, &count); if (ret != 0) return (ret); if (count != 0) { printnl = B_TRUE; } else if (cur_version == SPA_VERSION) { (void) printf(gettext("Pool '%s' already has all " "supported features enabled.\n"), zpool_get_name(zhp)); } } if (printnl) { (void) printf(gettext("\n")); } return (0); } /* * zpool upgrade * zpool upgrade -v * zpool upgrade [-V version] <-a | pool ...> * * With no arguments, display downrev'd ZFS pool available for upgrade. * Individual pools can be upgraded by specifying the pool, and '-a' will * upgrade all pools. */ int zpool_do_upgrade(int argc, char **argv) { int c; upgrade_cbdata_t cb = { 0 }; int ret = 0; boolean_t showversions = B_FALSE; boolean_t upgradeall = B_FALSE; char *end; /* check options */ while ((c = getopt(argc, argv, ":avV:")) != -1) { switch (c) { case 'a': upgradeall = B_TRUE; break; case 'v': showversions = B_TRUE; break; case 'V': cb.cb_version = strtoll(optarg, &end, 10); if (*end != '\0' || !SPA_VERSION_IS_SUPPORTED(cb.cb_version)) { (void) fprintf(stderr, gettext("invalid version '%s'\n"), optarg); usage(B_FALSE); } break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } cb.cb_argc = argc; cb.cb_argv = argv; argc -= optind; argv += optind; if (cb.cb_version == 0) { cb.cb_version = SPA_VERSION; } else if (!upgradeall && argc == 0) { (void) fprintf(stderr, gettext("-V option is " "incompatible with other arguments\n")); usage(B_FALSE); } if (showversions) { if (upgradeall || argc != 0) { (void) fprintf(stderr, gettext("-v option is " "incompatible with other arguments\n")); usage(B_FALSE); } } else if (upgradeall) { if (argc != 0) { (void) fprintf(stderr, gettext("-a option should not " "be used along with a pool name\n")); usage(B_FALSE); } } (void) printf(gettext("This system supports ZFS pool feature " "flags.\n\n")); if (showversions) { int i; (void) printf(gettext("The following features are " "supported:\n\n")); (void) printf(gettext("FEAT DESCRIPTION\n")); (void) printf("----------------------------------------------" "---------------\n"); for (i = 0; i < SPA_FEATURES; i++) { zfeature_info_t *fi = &spa_feature_table[i]; const char *ro = (fi->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ? 
" (read-only compatible)" : ""; (void) printf("%-37s%s\n", fi->fi_uname, ro); (void) printf(" %s\n", fi->fi_desc); } (void) printf("\n"); (void) printf(gettext("The following legacy versions are also " "supported:\n\n")); (void) printf(gettext("VER DESCRIPTION\n")); (void) printf("--- -----------------------------------------" "---------------\n"); (void) printf(gettext(" 1 Initial ZFS version\n")); (void) printf(gettext(" 2 Ditto blocks " "(replicated metadata)\n")); (void) printf(gettext(" 3 Hot spares and double parity " "RAID-Z\n")); (void) printf(gettext(" 4 zpool history\n")); (void) printf(gettext(" 5 Compression using the gzip " "algorithm\n")); (void) printf(gettext(" 6 bootfs pool property\n")); (void) printf(gettext(" 7 Separate intent log devices\n")); (void) printf(gettext(" 8 Delegated administration\n")); (void) printf(gettext(" 9 refquota and refreservation " "properties\n")); (void) printf(gettext(" 10 Cache devices\n")); (void) printf(gettext(" 11 Improved scrub performance\n")); (void) printf(gettext(" 12 Snapshot properties\n")); (void) printf(gettext(" 13 snapused property\n")); (void) printf(gettext(" 14 passthrough-x aclinherit\n")); (void) printf(gettext(" 15 user/group space accounting\n")); (void) printf(gettext(" 16 stmf property support\n")); (void) printf(gettext(" 17 Triple-parity RAID-Z\n")); (void) printf(gettext(" 18 Snapshot user holds\n")); (void) printf(gettext(" 19 Log device removal\n")); (void) printf(gettext(" 20 Compression using zle " "(zero-length encoding)\n")); (void) printf(gettext(" 21 Deduplication\n")); (void) printf(gettext(" 22 Received properties\n")); (void) printf(gettext(" 23 Slim ZIL\n")); (void) printf(gettext(" 24 System attributes\n")); (void) printf(gettext(" 25 Improved scrub stats\n")); (void) printf(gettext(" 26 Improved snapshot deletion " "performance\n")); (void) printf(gettext(" 27 Improved snapshot creation " "performance\n")); (void) printf(gettext(" 28 Multiple vdev replacements\n")); (void) printf(gettext("\nFor more information on a particular " "version, including supported releases,\n")); (void) printf(gettext("see the ZFS Administration Guide.\n\n")); } else if (argc == 0 && upgradeall) { cb.cb_first = B_TRUE; ret = zpool_iter(g_zfs, upgrade_cb, &cb); if (ret == 0 && cb.cb_first) { if (cb.cb_version == SPA_VERSION) { (void) printf(gettext("All pools are already " "formatted using feature flags.\n\n")); (void) printf(gettext("Every feature flags " "pool already has all supported features " "enabled.\n")); } else { (void) printf(gettext("All pools are already " "formatted with version %llu or higher.\n"), (u_longlong_t)cb.cb_version); } } } else if (argc == 0) { cb.cb_first = B_TRUE; ret = zpool_iter(g_zfs, upgrade_list_older_cb, &cb); assert(ret == 0); if (cb.cb_first) { (void) printf(gettext("All pools are formatted " "using feature flags.\n\n")); } else { (void) printf(gettext("\nUse 'zpool upgrade -v' " "for a list of available legacy versions.\n")); } cb.cb_first = B_TRUE; ret = zpool_iter(g_zfs, upgrade_list_disabled_cb, &cb); assert(ret == 0); if (cb.cb_first) { (void) printf(gettext("Every feature flags pool has " "all supported features enabled.\n")); } else { (void) printf(gettext("\n")); } } else { ret = for_each_pool(argc, argv, B_FALSE, NULL, upgrade_one, &cb); } return (ret); } typedef struct hist_cbdata { boolean_t first; boolean_t longfmt; boolean_t internal; } hist_cbdata_t; /* * Print out the command history for a specific pool. 
*/ static int get_history_one(zpool_handle_t *zhp, void *data) { nvlist_t *nvhis; nvlist_t **records; uint_t numrecords; int ret, i; hist_cbdata_t *cb = (hist_cbdata_t *)data; cb->first = B_FALSE; (void) printf(gettext("History for '%s':\n"), zpool_get_name(zhp)); if ((ret = zpool_get_history(zhp, &nvhis)) != 0) return (ret); verify(nvlist_lookup_nvlist_array(nvhis, ZPOOL_HIST_RECORD, &records, &numrecords) == 0); for (i = 0; i < numrecords; i++) { nvlist_t *rec = records[i]; char tbuf[30] = ""; if (nvlist_exists(rec, ZPOOL_HIST_TIME)) { time_t tsec; struct tm t; tsec = fnvlist_lookup_uint64(records[i], ZPOOL_HIST_TIME); (void) localtime_r(&tsec, &t); (void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t); } if (nvlist_exists(rec, ZPOOL_HIST_CMD)) { (void) printf("%s %s", tbuf, fnvlist_lookup_string(rec, ZPOOL_HIST_CMD)); } else if (nvlist_exists(rec, ZPOOL_HIST_INT_EVENT)) { int ievent = fnvlist_lookup_uint64(rec, ZPOOL_HIST_INT_EVENT); if (!cb->internal) continue; if (ievent >= ZFS_NUM_LEGACY_HISTORY_EVENTS) { (void) printf("%s unrecognized record:\n", tbuf); dump_nvlist(rec, 4); continue; } (void) printf("%s [internal %s txg:%lld] %s", tbuf, zfs_history_event_names[ievent], (longlong_t)fnvlist_lookup_uint64( rec, ZPOOL_HIST_TXG), fnvlist_lookup_string(rec, ZPOOL_HIST_INT_STR)); } else if (nvlist_exists(rec, ZPOOL_HIST_INT_NAME)) { if (!cb->internal) continue; (void) printf("%s [txg:%lld] %s", tbuf, (longlong_t)fnvlist_lookup_uint64( rec, ZPOOL_HIST_TXG), fnvlist_lookup_string(rec, ZPOOL_HIST_INT_NAME)); if (nvlist_exists(rec, ZPOOL_HIST_DSNAME)) { (void) printf(" %s (%llu)", fnvlist_lookup_string(rec, ZPOOL_HIST_DSNAME), (u_longlong_t)fnvlist_lookup_uint64(rec, ZPOOL_HIST_DSID)); } (void) printf(" %s", fnvlist_lookup_string(rec, ZPOOL_HIST_INT_STR)); } else if (nvlist_exists(rec, ZPOOL_HIST_IOCTL)) { if (!cb->internal) continue; (void) printf("%s ioctl %s\n", tbuf, fnvlist_lookup_string(rec, ZPOOL_HIST_IOCTL)); if (nvlist_exists(rec, ZPOOL_HIST_INPUT_NVL)) { (void) printf(" input:\n"); dump_nvlist(fnvlist_lookup_nvlist(rec, ZPOOL_HIST_INPUT_NVL), 8); } if (nvlist_exists(rec, ZPOOL_HIST_OUTPUT_NVL)) { (void) printf(" output:\n"); dump_nvlist(fnvlist_lookup_nvlist(rec, ZPOOL_HIST_OUTPUT_NVL), 8); } } else { if (!cb->internal) continue; (void) printf("%s unrecognized record:\n", tbuf); dump_nvlist(rec, 4); } if (!cb->longfmt) { (void) printf("\n"); continue; } (void) printf(" ["); if (nvlist_exists(rec, ZPOOL_HIST_WHO)) { uid_t who = fnvlist_lookup_uint64(rec, ZPOOL_HIST_WHO); struct passwd *pwd = getpwuid(who); (void) printf("user %d ", (int)who); if (pwd != NULL) (void) printf("(%s) ", pwd->pw_name); } if (nvlist_exists(rec, ZPOOL_HIST_HOST)) { (void) printf("on %s", fnvlist_lookup_string(rec, ZPOOL_HIST_HOST)); } if (nvlist_exists(rec, ZPOOL_HIST_ZONE)) { (void) printf(":%s", fnvlist_lookup_string(rec, ZPOOL_HIST_ZONE)); } (void) printf("]"); (void) printf("\n"); } (void) printf("\n"); nvlist_free(nvhis); return (ret); } /* * zpool history * * Displays the history of commands that modified pools. 
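 *
 * Example invocations (editor's illustration):
 *
 *	zpool history tank        # pool-modifying commands only
 *	zpool history -il tank    # add internal events plus user/host detail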
*/ int zpool_do_history(int argc, char **argv) { hist_cbdata_t cbdata = { 0 }; int ret; int c; cbdata.first = B_TRUE; /* check options */ while ((c = getopt(argc, argv, "li")) != -1) { switch (c) { case 'l': cbdata.longfmt = B_TRUE; break; case 'i': cbdata.internal = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; ret = for_each_pool(argc, argv, B_FALSE, NULL, get_history_one, &cbdata); if (argc == 0 && cbdata.first == B_TRUE) { (void) fprintf(stderr, gettext("no pools available\n")); return (0); } return (ret); } typedef struct ev_opts { int verbose; int scripted; int follow; int clear; } ev_opts_t; static void zpool_do_events_short(nvlist_t *nvl) { char ctime_str[26], str[32], *ptr; int64_t *tv; uint_t n; verify(nvlist_lookup_int64_array(nvl, FM_EREPORT_TIME, &tv, &n) == 0); memset(str, ' ', 32); (void) ctime_r((const time_t *)&tv[0], ctime_str); (void) strncpy(str, ctime_str+4, 6); /* 'Jun 30' */ (void) strncpy(str+7, ctime_str+20, 4); /* '1993' */ (void) strncpy(str+12, ctime_str+11, 8); /* '21:49:08' */ (void) sprintf(str+20, ".%09lld", (longlong_t)tv[1]); /* '.123456789' */ (void) printf(gettext("%s "), str); verify(nvlist_lookup_string(nvl, FM_CLASS, &ptr) == 0); (void) printf(gettext("%s\n"), ptr); } static void zpool_do_events_nvprint(nvlist_t *nvl, int depth) { nvpair_t *nvp; for (nvp = nvlist_next_nvpair(nvl, NULL); nvp != NULL; nvp = nvlist_next_nvpair(nvl, nvp)) { data_type_t type = nvpair_type(nvp); const char *name = nvpair_name(nvp); boolean_t b; uint8_t i8; uint16_t i16; uint32_t i32; uint64_t i64; char *str; nvlist_t *cnv; printf(gettext("%*s%s = "), depth, "", name); switch (type) { case DATA_TYPE_BOOLEAN: printf(gettext("%s"), "1"); break; case DATA_TYPE_BOOLEAN_VALUE: (void) nvpair_value_boolean_value(nvp, &b); printf(gettext("%s"), b ? "1" : "0"); break; case DATA_TYPE_BYTE: (void) nvpair_value_byte(nvp, &i8); printf(gettext("0x%x"), i8); break; case DATA_TYPE_INT8: (void) nvpair_value_int8(nvp, (void *)&i8); printf(gettext("0x%x"), i8); break; case DATA_TYPE_UINT8: (void) nvpair_value_uint8(nvp, &i8); printf(gettext("0x%x"), i8); break; case DATA_TYPE_INT16: (void) nvpair_value_int16(nvp, (void *)&i16); printf(gettext("0x%x"), i16); break; case DATA_TYPE_UINT16: (void) nvpair_value_uint16(nvp, &i16); printf(gettext("0x%x"), i16); break; case DATA_TYPE_INT32: (void) nvpair_value_int32(nvp, (void *)&i32); printf(gettext("0x%x"), i32); break; case DATA_TYPE_UINT32: (void) nvpair_value_uint32(nvp, &i32); printf(gettext("0x%x"), i32); break; case DATA_TYPE_INT64: (void) nvpair_value_int64(nvp, (void *)&i64); printf(gettext("0x%llx"), (u_longlong_t)i64); break; case DATA_TYPE_UINT64: (void) nvpair_value_uint64(nvp, &i64); /* * translate vdev state values to readable * strings to aid zpool events consumers */ if (strcmp(name, FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE) == 0 || strcmp(name, FM_EREPORT_PAYLOAD_ZFS_VDEV_LASTSTATE) == 0) { printf(gettext("\"%s\" (0x%llx)"), zpool_state_to_name(i64, VDEV_AUX_NONE), (u_longlong_t)i64); } else { printf(gettext("0x%llx"), (u_longlong_t)i64); } break; case DATA_TYPE_HRTIME: (void) nvpair_value_hrtime(nvp, (void *)&i64); printf(gettext("0x%llx"), (u_longlong_t)i64); break; case DATA_TYPE_STRING: (void) nvpair_value_string(nvp, &str); printf(gettext("\"%s\""), str ?
str : ""); break; case DATA_TYPE_NVLIST: printf(gettext("(embedded nvlist)\n")); (void) nvpair_value_nvlist(nvp, &cnv); zpool_do_events_nvprint(cnv, depth + 8); printf(gettext("%*s(end %s)"), depth, "", name); break; case DATA_TYPE_NVLIST_ARRAY: { nvlist_t **val; uint_t i, nelem; (void) nvpair_value_nvlist_array(nvp, &val, &nelem); printf(gettext("(%d embedded nvlists)\n"), nelem); for (i = 0; i < nelem; i++) { printf(gettext("%*s%s[%d] = %s\n"), depth, "", name, i, "(embedded nvlist)"); zpool_do_events_nvprint(val[i], depth + 8); printf(gettext("%*s(end %s[%i])\n"), depth, "", name, i); } printf(gettext("%*s(end %s)\n"), depth, "", name); } break; case DATA_TYPE_INT8_ARRAY: { int8_t *val; uint_t i, nelem; (void) nvpair_value_int8_array(nvp, &val, &nelem); for (i = 0; i < nelem; i++) printf(gettext("0x%x "), val[i]); break; } case DATA_TYPE_UINT8_ARRAY: { uint8_t *val; uint_t i, nelem; (void) nvpair_value_uint8_array(nvp, &val, &nelem); for (i = 0; i < nelem; i++) printf(gettext("0x%x "), val[i]); break; } case DATA_TYPE_INT16_ARRAY: { int16_t *val; uint_t i, nelem; (void) nvpair_value_int16_array(nvp, &val, &nelem); for (i = 0; i < nelem; i++) printf(gettext("0x%x "), val[i]); break; } case DATA_TYPE_UINT16_ARRAY: { uint16_t *val; uint_t i, nelem; (void) nvpair_value_uint16_array(nvp, &val, &nelem); for (i = 0; i < nelem; i++) printf(gettext("0x%x "), val[i]); break; } case DATA_TYPE_INT32_ARRAY: { int32_t *val; uint_t i, nelem; (void) nvpair_value_int32_array(nvp, &val, &nelem); for (i = 0; i < nelem; i++) printf(gettext("0x%x "), val[i]); break; } case DATA_TYPE_UINT32_ARRAY: { uint32_t *val; uint_t i, nelem; (void) nvpair_value_uint32_array(nvp, &val, &nelem); for (i = 0; i < nelem; i++) printf(gettext("0x%x "), val[i]); break; } case DATA_TYPE_INT64_ARRAY: { int64_t *val; uint_t i, nelem; (void) nvpair_value_int64_array(nvp, &val, &nelem); for (i = 0; i < nelem; i++) printf(gettext("0x%llx "), (u_longlong_t)val[i]); break; } case DATA_TYPE_UINT64_ARRAY: { uint64_t *val; uint_t i, nelem; (void) nvpair_value_uint64_array(nvp, &val, &nelem); for (i = 0; i < nelem; i++) printf(gettext("0x%llx "), (u_longlong_t)val[i]); break; } case DATA_TYPE_STRING_ARRAY: { char **str; uint_t i, nelem; (void) nvpair_value_string_array(nvp, &str, &nelem); for (i = 0; i < nelem; i++) printf(gettext("\"%s\" "), str[i] ? str[i] : ""); break; } case DATA_TYPE_BOOLEAN_ARRAY: case DATA_TYPE_BYTE_ARRAY: case DATA_TYPE_DOUBLE: case DATA_TYPE_UNKNOWN: printf(gettext("")); break; } printf(gettext("\n")); } } static int zpool_do_events_next(ev_opts_t *opts) { nvlist_t *nvl; int zevent_fd, ret, dropped; zevent_fd = open(ZFS_DEV, O_RDWR); VERIFY(zevent_fd >= 0); if (!opts->scripted) (void) printf(gettext("%-30s %s\n"), "TIME", "CLASS"); while (1) { ret = zpool_events_next(g_zfs, &nvl, &dropped, (opts->follow ? ZEVENT_NONE : ZEVENT_NONBLOCK), zevent_fd); if (ret || nvl == NULL) break; if (dropped > 0) (void) printf(gettext("dropped %d events\n"), dropped); zpool_do_events_short(nvl); if (opts->verbose) { zpool_do_events_nvprint(nvl, 8); printf(gettext("\n")); } (void) fflush(stdout); nvlist_free(nvl); } VERIFY(0 == close(zevent_fd)); return (ret); } static int zpool_do_events_clear(ev_opts_t *opts) { int count, ret; ret = zpool_events_clear(g_zfs, &count); if (!ret) (void) printf(gettext("cleared %d events\n"), count); return (ret); } /* * zpool events [-vfc] * * Displays events logs by ZFS. 
*/ int zpool_do_events(int argc, char **argv) { ev_opts_t opts = { 0 }; int ret; int c; /* check options */ while ((c = getopt(argc, argv, "vHfc")) != -1) { switch (c) { case 'v': opts.verbose = 1; break; case 'H': opts.scripted = 1; break; case 'f': opts.follow = 1; break; case 'c': opts.clear = 1; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (opts.clear) ret = zpool_do_events_clear(&opts); else ret = zpool_do_events_next(&opts); return (ret); } static int get_callback(zpool_handle_t *zhp, void *data) { zprop_get_cbdata_t *cbp = (zprop_get_cbdata_t *)data; char value[MAXNAMELEN]; zprop_source_t srctype; zprop_list_t *pl; for (pl = cbp->cb_proplist; pl != NULL; pl = pl->pl_next) { /* * Skip the special fake placeholder. This will also skip * over the name property when 'all' is specified. */ if (pl->pl_prop == ZPOOL_PROP_NAME && pl == cbp->cb_proplist) continue; if (pl->pl_prop == ZPROP_INVAL && (zpool_prop_feature(pl->pl_user_prop) || zpool_prop_unsupported(pl->pl_user_prop))) { srctype = ZPROP_SRC_LOCAL; if (zpool_prop_get_feature(zhp, pl->pl_user_prop, value, sizeof (value)) == 0) { zprop_print_one_property(zpool_get_name(zhp), cbp, pl->pl_user_prop, value, srctype, NULL, NULL); } } else { if (zpool_get_prop(zhp, pl->pl_prop, value, sizeof (value), &srctype, cbp->cb_literal) != 0) continue; zprop_print_one_property(zpool_get_name(zhp), cbp, zpool_prop_to_name(pl->pl_prop), value, srctype, NULL, NULL); } } return (0); } /* * zpool get [-Hp] [-o "all" | field[,...]] <"all" | property[,...]> <pool> ... * * -H Scripted mode. Don't display headers, and separate properties * by a single tab. * -o List of columns to display. Defaults to * "name,property,value,source". * -p Display values in parsable (exact) format. * * Get properties of pools in the system. Output space statistics * for each one as well as other attributes. */ int zpool_do_get(int argc, char **argv) { zprop_get_cbdata_t cb = { 0 }; zprop_list_t fake_name = { 0 }; int ret; int c, i; char *value; cb.cb_first = B_TRUE; /* * Set up default columns and sources.
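 * With the default columns a query looks like (editor's illustration;
 * pool name and value are placeholders):
 *
 *	$ zpool get size tank
 *	NAME  PROPERTY  VALUE  SOURCE
 *	tank  size      2.72T  -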
*/ cb.cb_sources = ZPROP_SRC_ALL; cb.cb_columns[0] = GET_COL_NAME; cb.cb_columns[1] = GET_COL_PROPERTY; cb.cb_columns[2] = GET_COL_VALUE; cb.cb_columns[3] = GET_COL_SOURCE; cb.cb_type = ZFS_TYPE_POOL; /* check options */ while ((c = getopt(argc, argv, ":Hpo:")) != -1) { switch (c) { case 'p': cb.cb_literal = B_TRUE; break; case 'H': cb.cb_scripted = B_TRUE; break; case 'o': bzero(&cb.cb_columns, sizeof (cb.cb_columns)); i = 0; while (*optarg != '\0') { static char *col_subopts[] = { "name", "property", "value", "source", "all", NULL }; if (i == ZFS_GET_NCOLS) { (void) fprintf(stderr, gettext("too " "many fields given to -o " "option\n")); usage(B_FALSE); } switch (getsubopt(&optarg, col_subopts, &value)) { case 0: cb.cb_columns[i++] = GET_COL_NAME; break; case 1: cb.cb_columns[i++] = GET_COL_PROPERTY; break; case 2: cb.cb_columns[i++] = GET_COL_VALUE; break; case 3: cb.cb_columns[i++] = GET_COL_SOURCE; break; case 4: if (i > 0) { (void) fprintf(stderr, gettext("\"all\" conflicts " "with specific fields " "given to -o option\n")); usage(B_FALSE); } cb.cb_columns[0] = GET_COL_NAME; cb.cb_columns[1] = GET_COL_PROPERTY; cb.cb_columns[2] = GET_COL_VALUE; cb.cb_columns[3] = GET_COL_SOURCE; i = ZFS_GET_NCOLS; break; default: (void) fprintf(stderr, gettext("invalid column name " "'%s'\n"), value); usage(B_FALSE); } } break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing property " "argument\n")); usage(B_FALSE); } if (zprop_get_list(g_zfs, argv[0], &cb.cb_proplist, ZFS_TYPE_POOL) != 0) usage(B_FALSE); argc--; argv++; if (cb.cb_proplist != NULL) { fake_name.pl_prop = ZPOOL_PROP_NAME; fake_name.pl_width = strlen(gettext("NAME")); fake_name.pl_next = cb.cb_proplist; cb.cb_proplist = &fake_name; } ret = for_each_pool(argc, argv, B_TRUE, &cb.cb_proplist, get_callback, &cb); if (cb.cb_proplist == &fake_name) zprop_free_list(fake_name.pl_next); else zprop_free_list(cb.cb_proplist); return (ret); } typedef struct set_cbdata { char *cb_propname; char *cb_value; boolean_t cb_any_successful; } set_cbdata_t; int set_callback(zpool_handle_t *zhp, void *data) { int error; set_cbdata_t *cb = (set_cbdata_t *)data; error = zpool_set_prop(zhp, cb->cb_propname, cb->cb_value); if (!error) cb->cb_any_successful = B_TRUE; return (error); } int zpool_do_set(int argc, char **argv) { set_cbdata_t cb = { 0 }; int error; if (argc > 1 && argv[1][0] == '-') { (void) fprintf(stderr, gettext("invalid option '%c'\n"), argv[1][1]); usage(B_FALSE); } if (argc < 2) { (void) fprintf(stderr, gettext("missing property=value " "argument\n")); usage(B_FALSE); } if (argc < 3) { (void) fprintf(stderr, gettext("missing pool name\n")); usage(B_FALSE); } if (argc > 3) { (void) fprintf(stderr, gettext("too many pool names\n")); usage(B_FALSE); } cb.cb_propname = argv[1]; cb.cb_value = strchr(cb.cb_propname, '='); if (cb.cb_value == NULL) { (void) fprintf(stderr, gettext("missing value in " "property=value argument\n")); usage(B_FALSE); } *(cb.cb_value) = '\0'; cb.cb_value++; error = for_each_pool(argc - 2, argv + 2, B_TRUE, NULL, set_callback, &cb); return (error); } static int find_command_idx(char *command, int *idx) { int i; for (i = 0; i < NCOMMAND; i++) { if (command_table[i].name == NULL) continue; if (strcmp(command, command_table[i].name) == 0) { *idx = i; return (0); } } return (1); } int main(int argc, char **argv) { int ret = 0; int i = 0; char *cmdname; (void) setlocale(LC_ALL, ""); (void) 
textdomain(TEXT_DOMAIN); srand(time(NULL)); dprintf_setup(&argc, argv); opterr = 0; /* * Make sure the user has specified some command. */ if (argc < 2) { (void) fprintf(stderr, gettext("missing command\n")); usage(B_FALSE); } cmdname = argv[1]; /* * Special case '-?' */ if ((strcmp(cmdname, "-?") == 0) || strcmp(cmdname, "--help") == 0) usage(B_TRUE); if ((g_zfs = libzfs_init()) == NULL) { (void) fprintf(stderr, "%s", libzfs_error_init(errno)); return (1); } libzfs_print_on_error(g_zfs, B_TRUE); zfs_save_arguments(argc, argv, history_str, sizeof (history_str)); /* * Run the appropriate command. */ if (find_command_idx(cmdname, &i) == 0) { current_command = &command_table[i]; ret = command_table[i].func(argc - 1, argv + 1); } else if (strchr(cmdname, '=')) { verify(find_command_idx("set", &i) == 0); current_command = &command_table[i]; ret = command_table[i].func(argc, argv); } else if (strcmp(cmdname, "freeze") == 0 && argc == 3) { /* * 'freeze' is a vile debugging abomination, so we treat * it as such. */ char buf[16384]; int fd = open(ZFS_DEV, O_RDWR); (void) strlcpy((void *)buf, argv[2], sizeof (buf)); return (!!ioctl(fd, ZFS_IOC_POOL_FREEZE, buf)); } else { (void) fprintf(stderr, gettext("unrecognized " "command '%s'\n"), cmdname); usage(B_FALSE); ret = 1; } if (ret == 0 && log_history) (void) zpool_log_history(g_zfs, history_str); libzfs_fini(g_zfs); /* * The 'ZFS_ABORT' environment variable causes us to dump core on exit * for the purposes of running ::findleaks. */ if (getenv("ZFS_ABORT") != NULL) { (void) printf("dumping core by request\n"); abort(); } return (ret); } diff --git a/cmd/zpool/zpool_util.c b/cmd/zpool/zpool_util.c index df3f9bf834f4..43abfa23b0b8 100644 --- a/cmd/zpool/zpool_util.c +++ b/cmd/zpool/zpool_util.c @@ -1,111 +1,113 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #include #include #include #include #include #include #include #include "zpool_util.h" /* * Utility function to guarantee malloc() success. */ void * safe_malloc(size_t size) { void *data; if ((data = calloc(1, size)) == NULL) { (void) fprintf(stderr, "internal error: out of memory\n"); exit(1); } return (data); } /* * Display an out of memory error message and abort the current program. 
*/ void zpool_no_memory(void) { assert(errno == ENOMEM); (void) fprintf(stderr, gettext("internal error: out of memory\n")); exit(1); } /* * Return the number of logs in the supplied nvlist */ uint_t num_logs(nvlist_t *nv) { uint_t nlogs = 0; uint_t c, children; nvlist_t **child; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) return (0); for (c = 0; c < children; c++) { uint64_t is_log = B_FALSE; (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &is_log); if (is_log) nlogs++; } return (nlogs); } /* Find the max element in an array of uint64_t values */ uint64_t -array64_max(uint64_t array[], unsigned int len) { +array64_max(uint64_t array[], unsigned int len) +{ uint64_t max = 0; int i; for (i = 0; i < len; i++) max = MAX(max, array[i]); return (max); } /* * Return 1 if "str" is a number string, 0 otherwise. Works for integer and * floating point numbers. */ int -isnumber(char *str) { +isnumber(char *str) +{ for (; *str; str++) if (!(isdigit(*str) || (*str == '.'))) return (0); return (1); } diff --git a/include/linux/vfs_compat.h b/include/linux/vfs_compat.h index 7a1cb967b96c..baa29801822b 100644 --- a/include/linux/vfs_compat.h +++ b/include/linux/vfs_compat.h @@ -1,453 +1,457 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (C) 2011 Lawrence Livermore National Security, LLC. * Copyright (C) 2015 Jörg Thalheim. */ #ifndef _ZFS_VFS_H #define _ZFS_VFS_H #include #include #include /* * 2.6.28 API change, * Added insert_inode_locked() helper function, prior to this most callers * used insert_inode_hash(). The older method doesn't check for collisions * in the inode_hashtable but it is still acceptable for use. */ #ifndef HAVE_INSERT_INODE_LOCKED static inline int insert_inode_locked(struct inode *ip) { insert_inode_hash(ip); return (0); } #endif /* HAVE_INSERT_INODE_LOCKED */ /* * 2.6.35 API change, * Add truncate_setsize() if it is not exported by the Linux kernel. * * Truncate the inode and pages associated with the inode. The pages are * unmapped and removed from cache. */ #ifndef HAVE_TRUNCATE_SETSIZE static inline void truncate_setsize(struct inode *ip, loff_t new) { struct address_space *mapping = ip->i_mapping; i_size_write(ip, new); unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); truncate_inode_pages(mapping, new); unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); } #endif /* HAVE_TRUNCATE_SETSIZE */ /* * 2.6.32 - 2.6.33, bdi_setup_and_register() is not available. * 2.6.34 - 3.19, bdi_setup_and_register() takes 3 arguments. * 4.0 - x.y, bdi_setup_and_register() takes 2 arguments.
*/ #if defined(HAVE_2ARGS_BDI_SETUP_AND_REGISTER) static inline int zpl_bdi_setup_and_register(struct backing_dev_info *bdi, char *name) { return (bdi_setup_and_register(bdi, name)); } #elif defined(HAVE_3ARGS_BDI_SETUP_AND_REGISTER) static inline int zpl_bdi_setup_and_register(struct backing_dev_info *bdi, char *name) { return (bdi_setup_and_register(bdi, name, BDI_CAP_MAP_COPY)); } #else extern atomic_long_t zfs_bdi_seq; static inline int zpl_bdi_setup_and_register(struct backing_dev_info *bdi, char *name) { char tmp[32]; int error; bdi->name = name; bdi->capabilities = BDI_CAP_MAP_COPY; error = bdi_init(bdi); if (error) return (error); sprintf(tmp, "%.28s%s", name, "-%d"); error = bdi_register(bdi, NULL, tmp, atomic_long_inc_return(&zfs_bdi_seq)); if (error) { bdi_destroy(bdi); return (error); } return (error); } #endif /* * 2.6.38 API change, * LOOKUP_RCU flag introduced to distinguish rcu-walk from ref-walk cases. */ #ifndef LOOKUP_RCU #define LOOKUP_RCU 0x0 #endif /* LOOKUP_RCU */ /* * 3.2-rc1 API change, * Add set_nlink() if it is not exported by the Linux kernel. * * i_nlink is read-only in Linux 3.2, but it can be set directly in * earlier kernels. */ #ifndef HAVE_SET_NLINK static inline void set_nlink(struct inode *inode, unsigned int nlink) { inode->i_nlink = nlink; } #endif /* HAVE_SET_NLINK */ /* * 3.3 API change, * The VFS .create, .mkdir and .mknod callbacks were updated to take a * umode_t type rather than an int. To cleanly handle both definitions * the zpl_umode_t type is introduced and set accordingly. */ #ifdef HAVE_MKDIR_UMODE_T typedef umode_t zpl_umode_t; #else typedef int zpl_umode_t; #endif /* * 3.5 API change, * The clear_inode() function replaces end_writeback() and introduces an * ordering change regarding when the inode_sync_wait() occurs. See the * configure check in config/kernel-clear-inode.m4 for full details. */ #if defined(HAVE_EVICT_INODE) && !defined(HAVE_CLEAR_INODE) #define clear_inode(ip) end_writeback(ip) #endif /* HAVE_EVICT_INODE && !HAVE_CLEAR_INODE */ /* * 3.6 API change, * The sget() helper function now takes the mount flags as an argument. */ #ifdef HAVE_5ARG_SGET #define zpl_sget(type, cmp, set, fl, mtd) sget(type, cmp, set, fl, mtd) #else #define zpl_sget(type, cmp, set, fl, mtd) sget(type, cmp, set, mtd) #endif /* HAVE_5ARG_SGET */ #if defined(SEEK_HOLE) && defined(SEEK_DATA) && !defined(HAVE_LSEEK_EXECUTE) static inline loff_t lseek_execute( struct file *filp, struct inode *inode, loff_t offset, loff_t maxsize) { if (offset < 0 && !(filp->f_mode & FMODE_UNSIGNED_OFFSET)) return (-EINVAL); if (offset > maxsize) return (-EINVAL); if (offset != filp->f_pos) { spin_lock(&filp->f_lock); filp->f_pos = offset; filp->f_version = 0; spin_unlock(&filp->f_lock); } return (offset); } #endif /* SEEK_HOLE && SEEK_DATA && !HAVE_LSEEK_EXECUTE */ #if defined(CONFIG_FS_POSIX_ACL) /* * These functions safely approximate the behavior of posix_acl_release() * which cannot be used because it calls the GPL-only symbol kfree_rcu(). * The in-kernel version, which can access the RCU, frees the ACLs after * the grace period expires. Because we're unsure how long that grace * period may be, this implementation conservatively delays for 60 seconds. * This is several orders of magnitude larger than the expected grace period. * At 60 seconds the kernel will also begin issuing RCU stall warnings.
*/ #include #if defined(HAVE_POSIX_ACL_RELEASE) && !defined(HAVE_POSIX_ACL_RELEASE_GPL_ONLY) #define zpl_posix_acl_release(arg) posix_acl_release(arg) #else void zpl_posix_acl_release_impl(struct posix_acl *); static inline void zpl_posix_acl_release(struct posix_acl *acl) { if ((acl == NULL) || (acl == ACL_NOT_CACHED)) return; if (atomic_dec_and_test(&acl->a_refcount)) zpl_posix_acl_release_impl(acl); } #endif /* HAVE_POSIX_ACL_RELEASE */ #ifdef HAVE_SET_CACHED_ACL_USABLE #define zpl_set_cached_acl(ip, ty, n) set_cached_acl(ip, ty, n) #define zpl_forget_cached_acl(ip, ty) forget_cached_acl(ip, ty) #else static inline void -zpl_set_cached_acl(struct inode *ip, int type, struct posix_acl *newer) { +zpl_set_cached_acl(struct inode *ip, int type, struct posix_acl *newer) +{ struct posix_acl *older = NULL; spin_lock(&ip->i_lock); if ((newer != ACL_NOT_CACHED) && (newer != NULL)) posix_acl_dup(newer); switch (type) { case ACL_TYPE_ACCESS: older = ip->i_acl; rcu_assign_pointer(ip->i_acl, newer); break; case ACL_TYPE_DEFAULT: older = ip->i_default_acl; rcu_assign_pointer(ip->i_default_acl, newer); break; } spin_unlock(&ip->i_lock); zpl_posix_acl_release(older); } static inline void -zpl_forget_cached_acl(struct inode *ip, int type) { +zpl_forget_cached_acl(struct inode *ip, int type) +{ zpl_set_cached_acl(ip, type, (struct posix_acl *)ACL_NOT_CACHED); } #endif /* HAVE_SET_CACHED_ACL_USABLE */ #ifndef HAVE___POSIX_ACL_CHMOD #ifdef HAVE_POSIX_ACL_CHMOD #define __posix_acl_chmod(acl, gfp, mode) posix_acl_chmod(acl, gfp, mode) #define __posix_acl_create(acl, gfp, mode) posix_acl_create(acl, gfp, mode) #else static inline int -__posix_acl_chmod(struct posix_acl **acl, int flags, umode_t umode) { +__posix_acl_chmod(struct posix_acl **acl, int flags, umode_t umode) +{ struct posix_acl *oldacl = *acl; mode_t mode = umode; int error; *acl = posix_acl_clone(*acl, flags); zpl_posix_acl_release(oldacl); if (!(*acl)) return (-ENOMEM); error = posix_acl_chmod_masq(*acl, mode); if (error) { zpl_posix_acl_release(*acl); *acl = NULL; } return (error); } static inline int -__posix_acl_create(struct posix_acl **acl, int flags, umode_t *umodep) { +__posix_acl_create(struct posix_acl **acl, int flags, umode_t *umodep) +{ struct posix_acl *oldacl = *acl; mode_t mode = *umodep; int error; *acl = posix_acl_clone(*acl, flags); zpl_posix_acl_release(oldacl); if (!(*acl)) return (-ENOMEM); error = posix_acl_create_masq(*acl, &mode); *umodep = mode; if (error < 0) { zpl_posix_acl_release(*acl); *acl = NULL; } return (error); } #endif /* HAVE_POSIX_ACL_CHMOD */ #endif /* HAVE___POSIX_ACL_CHMOD */ #ifdef HAVE_POSIX_ACL_EQUIV_MODE_UMODE_T typedef umode_t zpl_equivmode_t; #else typedef mode_t zpl_equivmode_t; #endif /* HAVE_POSIX_ACL_EQUIV_MODE_UMODE_T */ /* * 4.8 API change, * posix_acl_valid() now must be passed a namespace; the namespace from * the super block associated with the given inode is used for this purpose. */ #ifdef HAVE_POSIX_ACL_VALID_WITH_NS #define zpl_posix_acl_valid(ip, acl) posix_acl_valid(ip->i_sb->s_user_ns, acl) #else #define zpl_posix_acl_valid(ip, acl) posix_acl_valid(acl) #endif #endif /* CONFIG_FS_POSIX_ACL */ /* * 2.6.38 API change, * The is_owner_or_cap() function was renamed to inode_owner_or_capable().
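 *
 * Callers stay version-agnostic by using the wrapper below; an
 * illustrative permission check (hypothetical call site, not code from
 * this change):
 *
 *	if (!zpl_inode_owner_or_capable(ip))
 *		return (-EPERM);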
*/ #ifdef HAVE_INODE_OWNER_OR_CAPABLE #define zpl_inode_owner_or_capable(ip) inode_owner_or_capable(ip) #else #define zpl_inode_owner_or_capable(ip) is_owner_or_cap(ip) #endif /* HAVE_INODE_OWNER_OR_CAPABLE */ /* * 3.19 API change * struct access f->f_dentry->d_inode was replaced by accessor function * file_inode(f) */ #ifndef HAVE_FILE_INODE static inline struct inode *file_inode(const struct file *f) { return (f->f_dentry->d_inode); } #endif /* HAVE_FILE_INODE */ /* * 4.1 API change * struct access file->f_path.dentry was replaced by accessor function * file_dentry(f) */ #ifndef HAVE_FILE_DENTRY static inline struct dentry *file_dentry(const struct file *f) { return (f->f_path.dentry); } #endif /* HAVE_FILE_DENTRY */ #ifdef HAVE_KUID_HELPERS static inline uid_t zfs_uid_read_impl(struct inode *ip) { #ifdef HAVE_SUPER_USER_NS return (from_kuid(ip->i_sb->s_user_ns, ip->i_uid)); #else return (from_kuid(kcred->user_ns, ip->i_uid)); #endif } static inline uid_t zfs_uid_read(struct inode *ip) { return (zfs_uid_read_impl(ip)); } static inline gid_t zfs_gid_read_impl(struct inode *ip) { #ifdef HAVE_SUPER_USER_NS return (from_kgid(ip->i_sb->s_user_ns, ip->i_gid)); #else return (from_kgid(kcred->user_ns, ip->i_gid)); #endif } static inline gid_t zfs_gid_read(struct inode *ip) { return (zfs_gid_read_impl(ip)); } static inline void zfs_uid_write(struct inode *ip, uid_t uid) { #ifdef HAVE_SUPER_USER_NS ip->i_uid = make_kuid(ip->i_sb->s_user_ns, uid); #else ip->i_uid = make_kuid(kcred->user_ns, uid); #endif } static inline void zfs_gid_write(struct inode *ip, gid_t gid) { #ifdef HAVE_SUPER_USER_NS ip->i_gid = make_kgid(ip->i_sb->s_user_ns, gid); #else ip->i_gid = make_kgid(kcred->user_ns, gid); #endif } #else static inline uid_t zfs_uid_read(struct inode *ip) { return (ip->i_uid); } static inline gid_t zfs_gid_read(struct inode *ip) { return (ip->i_gid); } static inline void zfs_uid_write(struct inode *ip, uid_t uid) { ip->i_uid = uid; } static inline void zfs_gid_write(struct inode *ip, gid_t gid) { ip->i_gid = gid; } #endif /* * 2.6.38 API change */ #ifdef HAVE_FOLLOW_DOWN_ONE #define zpl_follow_down_one(path) follow_down_one(path) #define zpl_follow_up(path) follow_up(path) #else #define zpl_follow_down_one(path) follow_down(path) #define zpl_follow_up(path) follow_up(path) #endif /* * 4.9 API change */ #ifndef HAVE_SETATTR_PREPARE static inline int setattr_prepare(struct dentry *dentry, struct iattr *ia) { return (inode_change_ok(dentry->d_inode, ia)); } #endif #endif /* _ZFS_VFS_H */ diff --git a/include/sys/trace_acl.h b/include/sys/trace_acl.h index 1d6e15c640ba..80c63f743e0f 100644 --- a/include/sys/trace_acl.h +++ b/include/sys/trace_acl.h @@ -1,156 +1,158 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
* If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ #if defined(_KERNEL) && defined(HAVE_DECLARE_EVENT_CLASS) #undef TRACE_SYSTEM #define TRACE_SYSTEM zfs #undef TRACE_SYSTEM_VAR #define TRACE_SYSTEM_VAR zfs_acl #if !defined(_TRACE_ACL_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_ACL_H #include #include #include /* * Generic support for three argument tracepoints of the form: * * DTRACE_PROBE3(..., * znode_t *, ..., * zfs_ace_hdr_t *, ..., * uint32_t, ...); */ /* BEGIN CSTYLED */ DECLARE_EVENT_CLASS(zfs_ace_class, TP_PROTO(znode_t *zn, zfs_ace_hdr_t *ace, uint32_t mask_matched), TP_ARGS(zn, ace, mask_matched), TP_STRUCT__entry( __field(uint64_t, z_id) __field(uint8_t, z_unlinked) __field(uint8_t, z_atime_dirty) __field(uint8_t, z_zn_prefetch) __field(uint8_t, z_moved) __field(uint_t, z_blksz) __field(uint_t, z_seq) __field(uint64_t, z_mapcnt) __field(uint64_t, z_size) __field(uint64_t, z_pflags) __field(uint32_t, z_sync_cnt) __field(mode_t, z_mode) __field(boolean_t, z_is_sa) __field(boolean_t, z_is_mapped) __field(boolean_t, z_is_ctldir) __field(boolean_t, z_is_stale) __field(uint32_t, i_uid) __field(uint32_t, i_gid) __field(unsigned long, i_ino) __field(unsigned int, i_nlink) __field(u64, i_version) __field(loff_t, i_size) __field(unsigned int, i_blkbits) __field(unsigned short, i_bytes) __field(umode_t, i_mode) __field(__u32, i_generation) __field(uint16_t, z_type) __field(uint16_t, z_flags) __field(uint32_t, z_access_mask) __field(uint32_t, mask_matched) ), TP_fast_assign( __entry->z_id = zn->z_id; __entry->z_unlinked = zn->z_unlinked; __entry->z_atime_dirty = zn->z_atime_dirty; __entry->z_zn_prefetch = zn->z_zn_prefetch; __entry->z_moved = zn->z_moved; __entry->z_blksz = zn->z_blksz; __entry->z_seq = zn->z_seq; __entry->z_mapcnt = zn->z_mapcnt; __entry->z_size = zn->z_size; __entry->z_pflags = zn->z_pflags; __entry->z_sync_cnt = zn->z_sync_cnt; __entry->z_mode = zn->z_mode; __entry->z_is_sa = zn->z_is_sa; __entry->z_is_mapped = zn->z_is_mapped; __entry->z_is_ctldir = zn->z_is_ctldir; __entry->z_is_stale = zn->z_is_stale; __entry->i_uid = KUID_TO_SUID(ZTOI(zn)->i_uid); __entry->i_gid = KGID_TO_SGID(ZTOI(zn)->i_gid); __entry->i_ino = zn->z_inode.i_ino; __entry->i_nlink = zn->z_inode.i_nlink; __entry->i_version = zn->z_inode.i_version; __entry->i_size = zn->z_inode.i_size; __entry->i_blkbits = zn->z_inode.i_blkbits; __entry->i_bytes = zn->z_inode.i_bytes; __entry->i_mode = zn->z_inode.i_mode; __entry->i_generation = zn->z_inode.i_generation; __entry->z_type = ace->z_type; __entry->z_flags = ace->z_flags; __entry->z_access_mask = ace->z_access_mask; __entry->mask_matched = mask_matched; ), TP_printk("zn { id %llu unlinked %u atime_dirty %u " "zn_prefetch %u moved %u blksz %u seq %u " "mapcnt %llu size %llu pflags %llu " "sync_cnt %u mode 0x%x is_sa %d " "is_mapped %d is_ctldir %d is_stale %d inode { " "uid %u gid %u ino %lu nlink %u version %llu size %lli " "blkbits %u bytes %u mode 0x%x generation %x } } " "ace { type %u flags %u access_mask %u } mask_matched %u", __entry->z_id, __entry->z_unlinked, __entry->z_atime_dirty, __entry->z_zn_prefetch, __entry->z_moved, __entry->z_blksz, __entry->z_seq, __entry->z_mapcnt, __entry->z_size, __entry->z_pflags, __entry->z_sync_cnt, __entry->z_mode, __entry->z_is_sa, __entry->z_is_mapped, __entry->z_is_ctldir, __entry->z_is_stale, __entry->i_uid, __entry->i_gid, 
__entry->i_ino, __entry->i_nlink, __entry->i_version, __entry->i_size, __entry->i_blkbits, __entry->i_bytes, __entry->i_mode, __entry->i_generation, __entry->z_type, __entry->z_flags, __entry->z_access_mask, __entry->mask_matched) ); /* END CSTYLED */ #define DEFINE_ACE_EVENT(name) \ +/* BEGIN CSTYLED */ DEFINE_EVENT(zfs_ace_class, name, \ TP_PROTO(znode_t *zn, zfs_ace_hdr_t *ace, uint32_t mask_matched), \ TP_ARGS(zn, ace, mask_matched)) +/* END CSTYLED */ DEFINE_ACE_EVENT(zfs_zfs__ace__denies); DEFINE_ACE_EVENT(zfs_zfs__ace__allows); #endif /* _TRACE_ACL_H */ #undef TRACE_INCLUDE_PATH #undef TRACE_INCLUDE_FILE #define TRACE_INCLUDE_PATH sys #define TRACE_INCLUDE_FILE trace_acl #include #endif /* _KERNEL && HAVE_DECLARE_EVENT_CLASS */ diff --git a/include/sys/trace_arc.h b/include/sys/trace_arc.h index 9756bd458cc7..b4edb514201c 100644 --- a/include/sys/trace_arc.h +++ b/include/sys/trace_arc.h @@ -1,354 +1,364 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ #include #if defined(_KERNEL) && defined(HAVE_DECLARE_EVENT_CLASS) #undef TRACE_SYSTEM #define TRACE_SYSTEM zfs #undef TRACE_SYSTEM_VAR #define TRACE_SYSTEM_VAR zfs_arc #if !defined(_TRACE_ARC_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_ARC_H #include #include #include /* For ZIO macros */ /* * Generic support for one argument tracepoints of the form: * * DTRACE_PROBE1(..., * arc_buf_hdr_t *, ...); */ /* BEGIN CSTYLED */ DECLARE_EVENT_CLASS(zfs_arc_buf_hdr_class, TP_PROTO(arc_buf_hdr_t *ab), TP_ARGS(ab), TP_STRUCT__entry( __array(uint64_t, hdr_dva_word, 2) __field(uint64_t, hdr_birth) __field(uint32_t, hdr_flags) __field(uint32_t, hdr_bufcnt) __field(arc_buf_contents_t, hdr_type) __field(uint16_t, hdr_psize) __field(uint16_t, hdr_lsize) __field(uint64_t, hdr_spa) __field(arc_state_type_t, hdr_state_type) __field(clock_t, hdr_access) __field(uint32_t, hdr_mru_hits) __field(uint32_t, hdr_mru_ghost_hits) __field(uint32_t, hdr_mfu_hits) __field(uint32_t, hdr_mfu_ghost_hits) __field(uint32_t, hdr_l2_hits) __field(int64_t, hdr_refcount) ), TP_fast_assign( __entry->hdr_dva_word[0] = ab->b_dva.dva_word[0]; __entry->hdr_dva_word[1] = ab->b_dva.dva_word[1]; __entry->hdr_birth = ab->b_birth; __entry->hdr_flags = ab->b_flags; __entry->hdr_bufcnt = ab->b_l1hdr.b_bufcnt; __entry->hdr_psize = ab->b_psize; __entry->hdr_lsize = ab->b_lsize; __entry->hdr_spa = ab->b_spa; __entry->hdr_state_type = ab->b_l1hdr.b_state->arcs_state; __entry->hdr_access = ab->b_l1hdr.b_arc_access; __entry->hdr_mru_hits = ab->b_l1hdr.b_mru_hits; __entry->hdr_mru_ghost_hits = ab->b_l1hdr.b_mru_ghost_hits; __entry->hdr_mfu_hits = ab->b_l1hdr.b_mfu_hits; __entry->hdr_mfu_ghost_hits = ab->b_l1hdr.b_mfu_ghost_hits; __entry->hdr_l2_hits = ab->b_l1hdr.b_l2_hits; __entry->hdr_refcount = ab->b_l1hdr.b_refcnt.rc_count; ), 
TP_printk("hdr { dva 0x%llx:0x%llx birth %llu " "flags 0x%x bufcnt %u type %u psize %u lsize %u spa %llu " "state_type %u access %lu mru_hits %u mru_ghost_hits %u " "mfu_hits %u mfu_ghost_hits %u l2_hits %u refcount %lli }", __entry->hdr_dva_word[0], __entry->hdr_dva_word[1], __entry->hdr_birth, __entry->hdr_flags, __entry->hdr_bufcnt, __entry->hdr_type, __entry->hdr_psize, __entry->hdr_lsize, __entry->hdr_spa, __entry->hdr_state_type, __entry->hdr_access, __entry->hdr_mru_hits, __entry->hdr_mru_ghost_hits, __entry->hdr_mfu_hits, __entry->hdr_mfu_ghost_hits, __entry->hdr_l2_hits, __entry->hdr_refcount) ); /* END CSTYLED */ #define DEFINE_ARC_BUF_HDR_EVENT(name) \ +/* BEGIN CSTYLED */ DEFINE_EVENT(zfs_arc_buf_hdr_class, name, \ TP_PROTO(arc_buf_hdr_t *ab), \ TP_ARGS(ab)) +/* END CSTYLED */ DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__hit); DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__evict); DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__delete); DEFINE_ARC_BUF_HDR_EVENT(zfs_new_state__mru); DEFINE_ARC_BUF_HDR_EVENT(zfs_new_state__mfu); DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__sync__wait__for__async); DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__demand__hit__predictive__prefetch); DEFINE_ARC_BUF_HDR_EVENT(zfs_l2arc__hit); DEFINE_ARC_BUF_HDR_EVENT(zfs_l2arc__miss); /* * Generic support for two argument tracepoints of the form: * * DTRACE_PROBE2(..., * vdev_t *, ..., * zio_t *, ...); */ /* BEGIN CSTYLED */ DECLARE_EVENT_CLASS(zfs_l2arc_rw_class, TP_PROTO(vdev_t *vd, zio_t *zio), TP_ARGS(vd, zio), TP_STRUCT__entry( __field(uint64_t, vdev_id) __field(uint64_t, vdev_guid) __field(uint64_t, vdev_state) ZIO_TP_STRUCT_ENTRY ), TP_fast_assign( __entry->vdev_id = vd->vdev_id; __entry->vdev_guid = vd->vdev_guid; __entry->vdev_state = vd->vdev_state; ZIO_TP_FAST_ASSIGN ), TP_printk("vdev { id %llu guid %llu state %llu } " ZIO_TP_PRINTK_FMT, __entry->vdev_id, __entry->vdev_guid, __entry->vdev_state, ZIO_TP_PRINTK_ARGS) ); /* END CSTYLED */ #define DEFINE_L2ARC_RW_EVENT(name) \ +/* BEGIN CSTYLED */ DEFINE_EVENT(zfs_l2arc_rw_class, name, \ TP_PROTO(vdev_t *vd, zio_t *zio), \ TP_ARGS(vd, zio)) +/* END CSTYLED */ DEFINE_L2ARC_RW_EVENT(zfs_l2arc__read); DEFINE_L2ARC_RW_EVENT(zfs_l2arc__write); /* * Generic support for two argument tracepoints of the form: * * DTRACE_PROBE2(..., * zio_t *, ..., * l2arc_write_callback_t *, ...); */ /* BEGIN CSTYLED */ DECLARE_EVENT_CLASS(zfs_l2arc_iodone_class, TP_PROTO(zio_t *zio, l2arc_write_callback_t *cb), TP_ARGS(zio, cb), TP_STRUCT__entry(ZIO_TP_STRUCT_ENTRY), TP_fast_assign(ZIO_TP_FAST_ASSIGN), TP_printk(ZIO_TP_PRINTK_FMT, ZIO_TP_PRINTK_ARGS) ); /* END CSTYLED */ #define DEFINE_L2ARC_IODONE_EVENT(name) \ +/* BEGIN CSTYLED */ DEFINE_EVENT(zfs_l2arc_iodone_class, name, \ TP_PROTO(zio_t *zio, l2arc_write_callback_t *cb), \ TP_ARGS(zio, cb)) +/* END CSTYLED */ DEFINE_L2ARC_IODONE_EVENT(zfs_l2arc__iodone); /* * Generic support for four argument tracepoints of the form: * * DTRACE_PROBE4(..., * arc_buf_hdr_t *, ..., * const blkptr_t *, * uint64_t, * const zbookmark_phys_t *); */ /* BEGIN CSTYLED */ DECLARE_EVENT_CLASS(zfs_arc_miss_class, TP_PROTO(arc_buf_hdr_t *hdr, const blkptr_t *bp, uint64_t size, const zbookmark_phys_t *zb), TP_ARGS(hdr, bp, size, zb), TP_STRUCT__entry( __array(uint64_t, hdr_dva_word, 2) __field(uint64_t, hdr_birth) __field(uint32_t, hdr_flags) __field(uint32_t, hdr_bufcnt) __field(arc_buf_contents_t, hdr_type) __field(uint16_t, hdr_psize) __field(uint16_t, hdr_lsize) __field(uint64_t, hdr_spa) __field(arc_state_type_t, hdr_state_type) __field(clock_t, hdr_access) __field(uint32_t, hdr_mru_hits) 
__field(uint32_t, hdr_mru_ghost_hits) __field(uint32_t, hdr_mfu_hits) __field(uint32_t, hdr_mfu_ghost_hits) __field(uint32_t, hdr_l2_hits) __field(int64_t, hdr_refcount) __array(uint64_t, bp_dva0, 2) __array(uint64_t, bp_dva1, 2) __array(uint64_t, bp_dva2, 2) __array(uint64_t, bp_cksum, 4) __field(uint64_t, bp_lsize) __field(uint64_t, zb_objset) __field(uint64_t, zb_object) __field(int64_t, zb_level) __field(uint64_t, zb_blkid) ), TP_fast_assign( __entry->hdr_dva_word[0] = hdr->b_dva.dva_word[0]; __entry->hdr_dva_word[1] = hdr->b_dva.dva_word[1]; __entry->hdr_birth = hdr->b_birth; __entry->hdr_flags = hdr->b_flags; __entry->hdr_bufcnt = hdr->b_l1hdr.b_bufcnt; __entry->hdr_psize = hdr->b_psize; __entry->hdr_lsize = hdr->b_lsize; __entry->hdr_spa = hdr->b_spa; __entry->hdr_state_type = hdr->b_l1hdr.b_state->arcs_state; __entry->hdr_access = hdr->b_l1hdr.b_arc_access; __entry->hdr_mru_hits = hdr->b_l1hdr.b_mru_hits; __entry->hdr_mru_ghost_hits = hdr->b_l1hdr.b_mru_ghost_hits; __entry->hdr_mfu_hits = hdr->b_l1hdr.b_mfu_hits; __entry->hdr_mfu_ghost_hits = hdr->b_l1hdr.b_mfu_ghost_hits; __entry->hdr_l2_hits = hdr->b_l1hdr.b_l2_hits; __entry->hdr_refcount = hdr->b_l1hdr.b_refcnt.rc_count; __entry->bp_dva0[0] = bp->blk_dva[0].dva_word[0]; __entry->bp_dva0[1] = bp->blk_dva[0].dva_word[1]; __entry->bp_dva1[0] = bp->blk_dva[1].dva_word[0]; __entry->bp_dva1[1] = bp->blk_dva[1].dva_word[1]; __entry->bp_dva2[0] = bp->blk_dva[2].dva_word[0]; __entry->bp_dva2[1] = bp->blk_dva[2].dva_word[1]; __entry->bp_cksum[0] = bp->blk_cksum.zc_word[0]; __entry->bp_cksum[1] = bp->blk_cksum.zc_word[1]; __entry->bp_cksum[2] = bp->blk_cksum.zc_word[2]; __entry->bp_cksum[3] = bp->blk_cksum.zc_word[3]; __entry->bp_lsize = size; __entry->zb_objset = zb->zb_objset; __entry->zb_object = zb->zb_object; __entry->zb_level = zb->zb_level; __entry->zb_blkid = zb->zb_blkid; ), TP_printk("hdr { dva 0x%llx:0x%llx birth %llu " "flags 0x%x bufcnt %u psize %u lsize %u spa %llu state_type %u " "access %lu mru_hits %u mru_ghost_hits %u mfu_hits %u " "mfu_ghost_hits %u l2_hits %u refcount %lli } " "bp { dva0 0x%llx:0x%llx dva1 0x%llx:0x%llx dva2 " "0x%llx:0x%llx cksum 0x%llx:0x%llx:0x%llx:0x%llx " "lsize %llu } zb { objset %llu object %llu level %lli " "blkid %llu }", __entry->hdr_dva_word[0], __entry->hdr_dva_word[1], __entry->hdr_birth, __entry->hdr_flags, __entry->hdr_bufcnt, __entry->hdr_psize, __entry->hdr_lsize, __entry->hdr_spa, __entry->hdr_state_type, __entry->hdr_access, __entry->hdr_mru_hits, __entry->hdr_mru_ghost_hits, __entry->hdr_mfu_hits, __entry->hdr_mfu_ghost_hits, __entry->hdr_l2_hits, __entry->hdr_refcount, __entry->bp_dva0[0], __entry->bp_dva0[1], __entry->bp_dva1[0], __entry->bp_dva1[1], __entry->bp_dva2[0], __entry->bp_dva2[1], __entry->bp_cksum[0], __entry->bp_cksum[1], __entry->bp_cksum[2], __entry->bp_cksum[3], __entry->bp_lsize, __entry->zb_objset, __entry->zb_object, __entry->zb_level, __entry->zb_blkid) ); /* END CSTYLED */ #define DEFINE_ARC_MISS_EVENT(name) \ +/* BEGIN CSTYLED */ DEFINE_EVENT(zfs_arc_miss_class, name, \ TP_PROTO(arc_buf_hdr_t *hdr, \ const blkptr_t *bp, uint64_t size, const zbookmark_phys_t *zb), \ TP_ARGS(hdr, bp, size, zb)) +/* END CSTYLED */ DEFINE_ARC_MISS_EVENT(zfs_arc__miss); /* * Generic support for four argument tracepoints of the form: * * DTRACE_PROBE4(..., * l2arc_dev_t *, ..., * list_t *, ..., * uint64_t, ..., * boolean_t, ...); */ /* BEGIN CSTYLED */ DECLARE_EVENT_CLASS(zfs_l2arc_evict_class, TP_PROTO(l2arc_dev_t *dev, list_t *buflist, uint64_t taddr, boolean_t all), TP_ARGS(dev, 
buflist, taddr, all), TP_STRUCT__entry( __field(uint64_t, vdev_id) __field(uint64_t, vdev_guid) __field(uint64_t, vdev_state) __field(uint64_t, l2ad_hand) __field(uint64_t, l2ad_start) __field(uint64_t, l2ad_end) __field(boolean_t, l2ad_first) __field(boolean_t, l2ad_writing) __field(uint64_t, taddr) __field(boolean_t, all) ), TP_fast_assign( __entry->vdev_id = dev->l2ad_vdev->vdev_id; __entry->vdev_guid = dev->l2ad_vdev->vdev_guid; __entry->vdev_state = dev->l2ad_vdev->vdev_state; __entry->l2ad_hand = dev->l2ad_hand; __entry->l2ad_start = dev->l2ad_start; __entry->l2ad_end = dev->l2ad_end; __entry->l2ad_first = dev->l2ad_first; __entry->l2ad_writing = dev->l2ad_writing; __entry->taddr = taddr; __entry->all = all; ), TP_printk("l2ad { vdev { id %llu guid %llu state %llu } " "hand %llu start %llu end %llu " "first %d writing %d } taddr %llu all %d", __entry->vdev_id, __entry->vdev_guid, __entry->vdev_state, __entry->l2ad_hand, __entry->l2ad_start, __entry->l2ad_end, __entry->l2ad_first, __entry->l2ad_writing, __entry->taddr, __entry->all) ); /* END CSTYLED */ #define DEFINE_L2ARC_EVICT_EVENT(name) \ +/* BEGIN CSTYLED */ DEFINE_EVENT(zfs_l2arc_evict_class, name, \ TP_PROTO(l2arc_dev_t *dev, \ list_t *buflist, uint64_t taddr, boolean_t all), \ TP_ARGS(dev, buflist, taddr, all)) +/* END CSTYLED */ DEFINE_L2ARC_EVICT_EVENT(zfs_l2arc__evict); #endif /* _TRACE_ARC_H */ #undef TRACE_INCLUDE_PATH #undef TRACE_INCLUDE_FILE #define TRACE_INCLUDE_PATH sys #define TRACE_INCLUDE_FILE trace_arc #include #endif /* _KERNEL && HAVE_DECLARE_EVENT_CLASS */ diff --git a/include/sys/trace_dbgmsg.h b/include/sys/trace_dbgmsg.h index e5b79f2ff8c9..27abe703f490 100644 --- a/include/sys/trace_dbgmsg.h +++ b/include/sys/trace_dbgmsg.h @@ -1,124 +1,130 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* Do not include this file directly. Please use <sys/trace.h> instead. */ #ifndef _SYS_TRACE_DBGMSG_INDIRECT #error "trace_dbgmsg.h included directly" #endif /* * This file defines tracepoint events for use by the dbgmsg(), * dprintf(), and SET_ERROR() interfaces. These are grouped here because * they all provide a way to store simple messages in the debug log (as * opposed to events used by the DTRACE_PROBE interfaces which typically * dump structured data). * * This header is included inside the trace.h multiple inclusion guard, * and it is guarded above against direct inclusion, so it need not * be guarded separately.
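 *
 * For illustration (hypothetical call site), a message logged with
 *
 *	dprintf("txg %llu: error %d\n", (u_longlong_t)txg, err);
 *
 * is emitted through the zfs_zfs__dprintf event defined below and
 * rendered by its TP_printk() as "file:line:function(): message".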
*/ /* * Generic support for four argument tracepoints of the form: * * DTRACE_PROBE4(..., * const char *, ..., * const char *, ..., * int, ..., * const char *, ...); */ /* BEGIN CSTYLED */ DECLARE_EVENT_CLASS(zfs_dprintf_class, TP_PROTO(const char *file, const char *function, int line, const char *msg), TP_ARGS(file, function, line, msg), TP_STRUCT__entry( __string(file, file) __string(function, function) __field(int, line) __string(msg, msg) ), TP_fast_assign( __assign_str(file, strchr(file, '/') ? strrchr(file, '/') + 1 : file) __assign_str(function, function); __entry->line = line; __assign_str(msg, msg); ), TP_printk("%s:%d:%s(): %s", __get_str(file), __entry->line, __get_str(function), __get_str(msg)) ); /* END CSTYLED */ #define DEFINE_DPRINTF_EVENT(name) \ +/* BEGIN CSTYLED */ DEFINE_EVENT(zfs_dprintf_class, name, \ TP_PROTO(const char *file, const char *function, int line, \ const char *msg), \ TP_ARGS(file, function, line, msg)) +/* END CSTYLED */ DEFINE_DPRINTF_EVENT(zfs_zfs__dprintf); /* * Generic support for four argument tracepoints of the form: * * DTRACE_PROBE4(..., * const char *, ..., * const char *, ..., * int, ..., * uintptr_t, ...); */ /* BEGIN CSTYLED */ DECLARE_EVENT_CLASS(zfs_set_error_class, TP_PROTO(const char *file, const char *function, int line, uintptr_t error), TP_ARGS(file, function, line, error), TP_STRUCT__entry( __string(file, file) __string(function, function) __field(int, line) __field(uintptr_t, error) ), TP_fast_assign( __assign_str(file, strchr(file, '/') ? strrchr(file, '/') + 1 : file) __assign_str(function, function); __entry->line = line; __entry->error = error; ), TP_printk("%s:%d:%s(): error 0x%lx", __get_str(file), __entry->line, __get_str(function), __entry->error) ); /* END CSTYLED */ #ifdef TP_CONDITION #define DEFINE_SET_ERROR_EVENT(name) \ +/* BEGIN CSTYLED */ DEFINE_EVENT_CONDITION(zfs_set_error_class, name, \ TP_PROTO(const char *file, const char *function, int line, \ uintptr_t error), \ TP_ARGS(file, function, line, error), \ TP_CONDITION(error)) +/* END CSTYLED */ #else #define DEFINE_SET_ERROR_EVENT(name) \ +/* BEGIN CSTYLED */ DEFINE_EVENT(zfs_set_error_class, name, \ TP_PROTO(const char *file, const char *function, int line, \ uintptr_t error), \ TP_ARGS(file, function, line, error)) +/* END CSTYLED */ #endif DEFINE_SET_ERROR_EVENT(zfs_set__error); diff --git a/include/sys/trace_dbuf.h b/include/sys/trace_dbuf.h index 76274d152575..4c5e51ebe3b1 100644 --- a/include/sys/trace_dbuf.h +++ b/include/sys/trace_dbuf.h @@ -1,117 +1,125 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
* If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ #if defined(_KERNEL) && defined(HAVE_DECLARE_EVENT_CLASS) #undef TRACE_SYSTEM #define TRACE_SYSTEM zfs #undef TRACE_SYSTEM_VAR #define TRACE_SYSTEM_VAR zfs_dbuf #if !defined(_TRACE_DBUF_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_DBUF_H #include #include /* * Generic support for two argument tracepoints of the form: * * DTRACE_PROBE2(..., * dmu_buf_impl_t *, ..., * zio_t *, ...); */ #define DBUF_TP_STRUCT_ENTRY \ __string(os_spa, \ spa_name(DB_DNODE(db)->dn_objset->os_spa)) \ __field(uint64_t, ds_object) \ __field(uint64_t, db_object) \ __field(uint64_t, db_level) \ __field(uint64_t, db_blkid) \ __field(uint64_t, db_offset) \ __field(uint64_t, db_size) \ __field(uint64_t, db_state) \ __field(int64_t, db_holds) \ #define DBUF_TP_FAST_ASSIGN \ __assign_str(os_spa, \ spa_name(DB_DNODE(db)->dn_objset->os_spa)); \ \ __entry->ds_object = db->db_objset->os_dsl_dataset ? \ db->db_objset->os_dsl_dataset->ds_object : 0; \ \ __entry->db_object = db->db.db_object; \ __entry->db_level = db->db_level; \ __entry->db_blkid = db->db_blkid; \ __entry->db_offset = db->db.db_offset; \ __entry->db_size = db->db.db_size; \ __entry->db_state = db->db_state; \ __entry->db_holds = refcount_count(&db->db_holds); #define DBUF_TP_PRINTK_FMT \ "dbuf { spa \"%s\" objset %llu object %llu level %llu " \ "blkid %llu offset %llu size %llu state %llu holds %lld }" #define DBUF_TP_PRINTK_ARGS \ __get_str(os_spa), __entry->ds_object, \ __entry->db_object, __entry->db_level, \ __entry->db_blkid, __entry->db_offset, \ __entry->db_size, __entry->db_state, __entry->db_holds +/* BEGIN CSTYLED */ DECLARE_EVENT_CLASS(zfs_dbuf_class, TP_PROTO(dmu_buf_impl_t *db, zio_t *zio), TP_ARGS(db, zio), TP_STRUCT__entry(DBUF_TP_STRUCT_ENTRY), TP_fast_assign(DBUF_TP_FAST_ASSIGN), TP_printk(DBUF_TP_PRINTK_FMT, DBUF_TP_PRINTK_ARGS) ); +/* END CSTYLED */ #define DEFINE_DBUF_EVENT(name) \ +/* BEGIN CSTYLED */ DEFINE_EVENT(zfs_dbuf_class, name, \ TP_PROTO(dmu_buf_impl_t *db, zio_t *zio), \ TP_ARGS(db, zio)) +/* END CSTYLED */ DEFINE_DBUF_EVENT(zfs_blocked__read); +/* BEGIN CSTYLED */ DECLARE_EVENT_CLASS(zfs_dbuf_evict_one_class, TP_PROTO(dmu_buf_impl_t *db, multilist_sublist_t *mls), TP_ARGS(db, mls), TP_STRUCT__entry(DBUF_TP_STRUCT_ENTRY), TP_fast_assign(DBUF_TP_FAST_ASSIGN), TP_printk(DBUF_TP_PRINTK_FMT, DBUF_TP_PRINTK_ARGS) ); +/* END CSTYLED */ #define DEFINE_DBUF_EVICT_ONE_EVENT(name) \ +/* BEGIN CSTYLED */ DEFINE_EVENT(zfs_dbuf_evict_one_class, name, \ TP_PROTO(dmu_buf_impl_t *db, multilist_sublist_t *mls), \ TP_ARGS(db, mls)) +/* END CSTYLED */ DEFINE_DBUF_EVICT_ONE_EVENT(zfs_dbuf__evict__one); #endif /* _TRACE_DBUF_H */ #undef TRACE_INCLUDE_PATH #undef TRACE_INCLUDE_FILE #define TRACE_INCLUDE_PATH sys #define TRACE_INCLUDE_FILE trace_dbuf #include #endif /* _KERNEL && HAVE_DECLARE_EVENT_CLASS */ diff --git a/include/sys/trace_dmu.h b/include/sys/trace_dmu.h index 0f2f49921a0e..844746a9c41e 100644 --- a/include/sys/trace_dmu.h +++ b/include/sys/trace_dmu.h @@ -1,121 +1,123 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. 
* See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ #if defined(_KERNEL) && defined(HAVE_DECLARE_EVENT_CLASS) #undef TRACE_SYSTEM #define TRACE_SYSTEM zfs #undef TRACE_SYSTEM_VAR #define TRACE_SYSTEM_VAR zfs_dmu #if !defined(_TRACE_DMU_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_DMU_H #include #include /* * Generic support for three argument tracepoints of the form: * * DTRACE_PROBE3(..., * dmu_tx_t *, ..., * uint64_t, ..., * uint64_t, ...); */ /* BEGIN CSTYLED */ DECLARE_EVENT_CLASS(zfs_delay_mintime_class, TP_PROTO(dmu_tx_t *tx, uint64_t dirty, uint64_t min_tx_time), TP_ARGS(tx, dirty, min_tx_time), TP_STRUCT__entry( __field(uint64_t, tx_txg) __field(uint64_t, tx_lastsnap_txg) __field(uint64_t, tx_lasttried_txg) __field(boolean_t, tx_anyobj) __field(boolean_t, tx_waited) __field(hrtime_t, tx_start) __field(boolean_t, tx_wait_dirty) __field(int, tx_err) #ifdef DEBUG_DMU_TX __field(uint64_t, tx_space_towrite) __field(uint64_t, tx_space_tofree) __field(uint64_t, tx_space_tooverwrite) __field(uint64_t, tx_space_tounref) __field(int64_t, tx_space_written) __field(int64_t, tx_space_freed) #endif __field(uint64_t, min_tx_time) __field(uint64_t, dirty) ), TP_fast_assign( __entry->tx_txg = tx->tx_txg; __entry->tx_lastsnap_txg = tx->tx_lastsnap_txg; __entry->tx_lasttried_txg = tx->tx_lasttried_txg; __entry->tx_anyobj = tx->tx_anyobj; __entry->tx_waited = tx->tx_waited; __entry->tx_start = tx->tx_start; __entry->tx_wait_dirty = tx->tx_wait_dirty; __entry->tx_err = tx->tx_err; #ifdef DEBUG_DMU_TX __entry->tx_space_towrite = tx->tx_space_towrite; __entry->tx_space_tofree = tx->tx_space_tofree; __entry->tx_space_tooverwrite = tx->tx_space_tooverwrite; __entry->tx_space_tounref = tx->tx_space_tounref; __entry->tx_space_written = tx->tx_space_written.rc_count; __entry->tx_space_freed = tx->tx_space_freed.rc_count; #endif __entry->dirty = dirty; __entry->min_tx_time = min_tx_time; ), TP_printk("tx { txg %llu lastsnap_txg %llu tx_lasttried_txg %llu " "anyobj %d waited %d start %llu wait_dirty %d err %i " #ifdef DEBUG_DMU_TX "space_towrite %llu space_tofree %llu space_tooverwrite %llu " "space_tounref %llu space_written %lli space_freed %lli " #endif "} dirty %llu min_tx_time %llu", __entry->tx_txg, __entry->tx_lastsnap_txg, __entry->tx_lasttried_txg, __entry->tx_anyobj, __entry->tx_waited, __entry->tx_start, __entry->tx_wait_dirty, __entry->tx_err, #ifdef DEBUG_DMU_TX __entry->tx_space_towrite, __entry->tx_space_tofree, __entry->tx_space_tooverwrite, __entry->tx_space_tounref, __entry->tx_space_written, __entry->tx_space_freed, #endif __entry->dirty, __entry->min_tx_time) ); /* END CSTYLED */ #define DEFINE_DELAY_MINTIME_EVENT(name) \ +/* BEGIN CSTYLED */ DEFINE_EVENT(zfs_delay_mintime_class, name, \ TP_PROTO(dmu_tx_t *tx, uint64_t dirty, uint64_t min_tx_time), \ TP_ARGS(tx, dirty, min_tx_time)) +/* END CSTYLED */ DEFINE_DELAY_MINTIME_EVENT(zfs_delay__mintime); #endif /* _TRACE_DMU_H */ #undef TRACE_INCLUDE_PATH #undef TRACE_INCLUDE_FILE #define TRACE_INCLUDE_PATH sys #define TRACE_INCLUDE_FILE trace_dmu #include #endif /* _KERNEL && HAVE_DECLARE_EVENT_CLASS */ diff --git 
a/include/sys/trace_dnode.h b/include/sys/trace_dnode.h index 292f8e2b7b3b..a651a56cf2eb 100644 --- a/include/sys/trace_dnode.h +++ b/include/sys/trace_dnode.h @@ -1,121 +1,123 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ #if defined(_KERNEL) && defined(HAVE_DECLARE_EVENT_CLASS) #undef TRACE_SYSTEM #define TRACE_SYSTEM zfs #undef TRACE_SYSTEM_VAR #define TRACE_SYSTEM_VAR zfs_dnode #if !defined(_TRACE_DNODE_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_DNODE_H #include #include /* * Generic support for three argument tracepoints of the form: * * DTRACE_PROBE3(..., * dnode_t *, ..., * int64_t, ..., * uint32_t, ...); */ /* BEGIN CSTYLED */ DECLARE_EVENT_CLASS(zfs_dnode_move_class, TP_PROTO(dnode_t *dn, int64_t refcount, uint32_t dbufs), TP_ARGS(dn, refcount, dbufs), TP_STRUCT__entry( __field(uint64_t, dn_object) __field(dmu_object_type_t, dn_type) __field(uint16_t, dn_bonuslen) __field(uint8_t, dn_bonustype) __field(uint8_t, dn_nblkptr) __field(uint8_t, dn_checksum) __field(uint8_t, dn_compress) __field(uint8_t, dn_nlevels) __field(uint8_t, dn_indblkshift) __field(uint8_t, dn_datablkshift) __field(uint8_t, dn_moved) __field(uint16_t, dn_datablkszsec) __field(uint32_t, dn_datablksz) __field(uint64_t, dn_maxblkid) __field(int64_t, dn_tx_holds) __field(int64_t, dn_holds) __field(boolean_t, dn_have_spill) __field(int64_t, refcount) __field(uint32_t, dbufs) ), TP_fast_assign( __entry->dn_object = dn->dn_object; __entry->dn_type = dn->dn_type; __entry->dn_bonuslen = dn->dn_bonuslen; __entry->dn_bonustype = dn->dn_bonustype; __entry->dn_nblkptr = dn->dn_nblkptr; __entry->dn_checksum = dn->dn_checksum; __entry->dn_compress = dn->dn_compress; __entry->dn_nlevels = dn->dn_nlevels; __entry->dn_indblkshift = dn->dn_indblkshift; __entry->dn_datablkshift = dn->dn_datablkshift; __entry->dn_moved = dn->dn_moved; __entry->dn_datablkszsec = dn->dn_datablkszsec; __entry->dn_datablksz = dn->dn_datablksz; __entry->dn_maxblkid = dn->dn_maxblkid; __entry->dn_tx_holds = dn->dn_tx_holds.rc_count; __entry->dn_holds = dn->dn_holds.rc_count; __entry->dn_have_spill = dn->dn_have_spill; __entry->refcount = refcount; __entry->dbufs = dbufs; ), TP_printk("dn { object %llu type %d bonuslen %u bonustype %u " "nblkptr %u checksum %u compress %u nlevels %u indblkshift %u " "datablkshift %u moved %u datablkszsec %u datablksz %u " "maxblkid %llu tx_holds %lli holds %lli have_spill %d } " "refcount %lli dbufs %u", __entry->dn_object, __entry->dn_type, __entry->dn_bonuslen, __entry->dn_bonustype, __entry->dn_nblkptr, __entry->dn_checksum, __entry->dn_compress, __entry->dn_nlevels, __entry->dn_indblkshift, __entry->dn_datablkshift, __entry->dn_moved, __entry->dn_datablkszsec, __entry->dn_datablksz, __entry->dn_maxblkid, __entry->dn_tx_holds, __entry->dn_holds, 
__entry->dn_have_spill, __entry->refcount, __entry->dbufs) ); /* END CSTYLED */ #define DEFINE_DNODE_MOVE_EVENT(name) \ +/* BEGIN CSTYLED */ DEFINE_EVENT(zfs_dnode_move_class, name, \ TP_PROTO(dnode_t *dn, int64_t refcount, uint32_t dbufs), \ TP_ARGS(dn, refcount, dbufs)) +/* END CSTYLED */ DEFINE_DNODE_MOVE_EVENT(zfs_dnode__move); #endif /* _TRACE_DNODE_H */ #undef TRACE_INCLUDE_PATH #undef TRACE_INCLUDE_FILE #define TRACE_INCLUDE_PATH sys #define TRACE_INCLUDE_FILE trace_dnode #include #endif /* _KERNEL && HAVE_DECLARE_EVENT_CLASS */ diff --git a/include/sys/trace_multilist.h b/include/sys/trace_multilist.h index 816ba5b0a831..7cf4dc39d626 100644 --- a/include/sys/trace_multilist.h +++ b/include/sys/trace_multilist.h @@ -1,80 +1,82 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ #if defined(_KERNEL) && defined(HAVE_DECLARE_EVENT_CLASS) #undef TRACE_SYSTEM #define TRACE_SYSTEM zfs #undef TRACE_SYSTEM_VAR #define TRACE_SYSTEM_VAR zfs_multilist #if !defined(_TRACE_MULTILIST_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_MULTILIST_H #include #include /* * Generic support for three argument tracepoints of the form: * * DTRACE_PROBE3(..., * multilist_t *, ..., * unsigned int, ..., * void *, ...); */ /* BEGIN CSTYLED */ DECLARE_EVENT_CLASS(zfs_multilist_insert_remove_class, TP_PROTO(multilist_t *ml, unsigned sublist_idx, void *obj), TP_ARGS(ml, sublist_idx, obj), TP_STRUCT__entry( __field(size_t, ml_offset) __field(uint64_t, ml_num_sublists) __field(unsigned int, sublist_idx) ), TP_fast_assign( __entry->ml_offset = ml->ml_offset; __entry->ml_num_sublists = ml->ml_num_sublists; __entry->sublist_idx = sublist_idx; ), TP_printk("ml { offset %ld numsublists %llu sublistidx %u } ", __entry->ml_offset, __entry->ml_num_sublists, __entry->sublist_idx) ); /* END CSTYLED */ #define DEFINE_MULTILIST_INSERT_REMOVE_EVENT(name) \ +/* BEGIN CSTYLED */ DEFINE_EVENT(zfs_multilist_insert_remove_class, name, \ TP_PROTO(multilist_t *ml, unsigned int sublist_idx, void *obj), \ TP_ARGS(ml, sublist_idx, obj)) +/* END CSTYLED */ DEFINE_MULTILIST_INSERT_REMOVE_EVENT(zfs_multilist__insert); DEFINE_MULTILIST_INSERT_REMOVE_EVENT(zfs_multilist__remove); #endif /* _TRACE_MULTILIST_H */ #undef TRACE_INCLUDE_PATH #undef TRACE_INCLUDE_FILE #define TRACE_INCLUDE_PATH sys #define TRACE_INCLUDE_FILE trace_multilist #include #endif /* _KERNEL && HAVE_DECLARE_EVENT_CLASS */ diff --git a/include/sys/trace_txg.h b/include/sys/trace_txg.h index a408761f9367..6c414bfce7a0 100644 --- a/include/sys/trace_txg.h +++ b/include/sys/trace_txg.h @@ -1,76 +1,78 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. 
* * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ #if defined(_KERNEL) && defined(HAVE_DECLARE_EVENT_CLASS) #undef TRACE_SYSTEM #define TRACE_SYSTEM zfs #undef TRACE_SYSTEM_VAR #define TRACE_SYSTEM_VAR zfs_txg #if !defined(_TRACE_TXG_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_TXG_H #include #include /* * Generic support for two argument tracepoints of the form: * * DTRACE_PROBE2(..., * dsl_pool_t *, ..., * uint64_t, ...); */ /* BEGIN CSTYLED */ DECLARE_EVENT_CLASS(zfs_txg_class, TP_PROTO(dsl_pool_t *dp, uint64_t txg), TP_ARGS(dp, txg), TP_STRUCT__entry( __field(uint64_t, txg) ), TP_fast_assign( __entry->txg = txg; ), TP_printk("txg %llu", __entry->txg) ); /* END CSTYLED */ #define DEFINE_TXG_EVENT(name) \ +/* BEGIN CSTYLED */ DEFINE_EVENT(zfs_txg_class, name, \ TP_PROTO(dsl_pool_t *dp, uint64_t txg), \ TP_ARGS(dp, txg)) +/* END CSTYLED */ DEFINE_TXG_EVENT(zfs_dsl_pool_sync__done); DEFINE_TXG_EVENT(zfs_txg__quiescing); DEFINE_TXG_EVENT(zfs_txg__opened); DEFINE_TXG_EVENT(zfs_txg__syncing); DEFINE_TXG_EVENT(zfs_txg__synced); DEFINE_TXG_EVENT(zfs_txg__quiesced); #endif /* _TRACE_TXG_H */ #undef TRACE_INCLUDE_PATH #undef TRACE_INCLUDE_FILE #define TRACE_INCLUDE_PATH sys #define TRACE_INCLUDE_FILE trace_txg #include #endif /* _KERNEL && HAVE_DECLARE_EVENT_CLASS */ diff --git a/include/sys/trace_zil.h b/include/sys/trace_zil.h index 389037c21c44..fbceee643e0f 100644 --- a/include/sys/trace_zil.h +++ b/include/sys/trace_zil.h @@ -1,131 +1,133 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
* If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ #if defined(_KERNEL) && defined(HAVE_DECLARE_EVENT_CLASS) #undef TRACE_SYSTEM #define TRACE_SYSTEM zfs #undef TRACE_SYSTEM_VAR #define TRACE_SYSTEM_VAR zfs_zil #if !defined(_TRACE_ZIL_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_ZIL_H #include #include /* * Generic support for one argument tracepoints of the form: * * DTRACE_PROBE1(..., * zilog_t *, ...); */ /* BEGIN CSTYLED */ DECLARE_EVENT_CLASS(zfs_zil_class, TP_PROTO(zilog_t *zilog), TP_ARGS(zilog), TP_STRUCT__entry( __field(uint64_t, zl_lr_seq) __field(uint64_t, zl_commit_lr_seq) __field(uint64_t, zl_destroy_txg) __field(uint64_t, zl_replaying_seq) __field(uint32_t, zl_suspend) __field(uint8_t, zl_suspending) __field(uint8_t, zl_keep_first) __field(uint8_t, zl_replay) __field(uint8_t, zl_stop_sync) __field(uint8_t, zl_writer) __field(uint8_t, zl_logbias) __field(uint8_t, zl_sync) __field(int, zl_parse_error) __field(uint64_t, zl_parse_blk_seq) __field(uint64_t, zl_parse_lr_seq) __field(uint64_t, zl_parse_blk_count) __field(uint64_t, zl_parse_lr_count) __field(uint64_t, zl_next_batch) __field(uint64_t, zl_com_batch) __field(uint64_t, zl_itx_list_sz) __field(uint64_t, zl_cur_used) __field(clock_t, zl_replay_time) __field(uint64_t, zl_replay_blks) ), TP_fast_assign( __entry->zl_lr_seq = zilog->zl_lr_seq; __entry->zl_commit_lr_seq = zilog->zl_commit_lr_seq; __entry->zl_destroy_txg = zilog->zl_destroy_txg; __entry->zl_replaying_seq = zilog->zl_replaying_seq; __entry->zl_suspend = zilog->zl_suspend; __entry->zl_suspending = zilog->zl_suspending; __entry->zl_keep_first = zilog->zl_keep_first; __entry->zl_replay = zilog->zl_replay; __entry->zl_stop_sync = zilog->zl_stop_sync; __entry->zl_writer = zilog->zl_writer; __entry->zl_logbias = zilog->zl_logbias; __entry->zl_sync = zilog->zl_sync; __entry->zl_parse_error = zilog->zl_parse_error; __entry->zl_parse_blk_seq = zilog->zl_parse_blk_seq; __entry->zl_parse_lr_seq = zilog->zl_parse_lr_seq; __entry->zl_parse_blk_count = zilog->zl_parse_blk_count; __entry->zl_parse_lr_count = zilog->zl_parse_lr_count; __entry->zl_next_batch = zilog->zl_next_batch; __entry->zl_com_batch = zilog->zl_com_batch; __entry->zl_itx_list_sz = zilog->zl_itx_list_sz; __entry->zl_cur_used = zilog->zl_cur_used; __entry->zl_replay_time = zilog->zl_replay_time; __entry->zl_replay_blks = zilog->zl_replay_blks; ), TP_printk("zl { lr_seq %llu commit_lr_seq %llu destroy_txg %llu " "replaying_seq %llu suspend %u suspending %u keep_first %u " "replay %u stop_sync %u writer %u logbias %u sync %u " "parse_error %u parse_blk_seq %llu parse_lr_seq %llu " "parse_blk_count %llu parse_lr_count %llu next_batch %llu " "com_batch %llu itx_list_sz %llu cur_used %llu replay_time %lu " "replay_blks %llu }", __entry->zl_lr_seq, __entry->zl_commit_lr_seq, __entry->zl_destroy_txg, __entry->zl_replaying_seq, __entry->zl_suspend, __entry->zl_suspending, __entry->zl_keep_first, __entry->zl_replay, __entry->zl_stop_sync, __entry->zl_writer, __entry->zl_logbias, __entry->zl_sync, __entry->zl_parse_error, __entry->zl_parse_blk_seq, __entry->zl_parse_lr_seq, __entry->zl_parse_blk_count, __entry->zl_parse_lr_count, __entry->zl_next_batch, __entry->zl_com_batch, __entry->zl_itx_list_sz, __entry->zl_cur_used, __entry->zl_replay_time, __entry->zl_replay_blks) ); /* END CSTYLED */ #define DEFINE_ZIL_EVENT(name) \ +/* BEGIN 
CSTYLED */ DEFINE_EVENT(zfs_zil_class, name, \ TP_PROTO(zilog_t *zilog), \ TP_ARGS(zilog)) DEFINE_ZIL_EVENT(zfs_zil__cw1); DEFINE_ZIL_EVENT(zfs_zil__cw2); +/* END CSTYLED */ #endif /* _TRACE_ZIL_H */ #undef TRACE_INCLUDE_PATH #undef TRACE_INCLUDE_FILE #define TRACE_INCLUDE_PATH sys #define TRACE_INCLUDE_FILE trace_zil #include #endif /* _KERNEL && HAVE_DECLARE_EVENT_CLASS */ diff --git a/lib/libnvpair/libnvpair.c b/lib/libnvpair/libnvpair.c index b852cb6170b1..7e24dd8445d4 100644 --- a/lib/libnvpair/libnvpair.c +++ b/lib/libnvpair/libnvpair.c @@ -1,1274 +1,1275 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. */ #include #include #include #include #include #include #include #include "libnvpair.h" /* * libnvpair - A tools library for manipulating name-value pairs. * * This library provides routines for packing and unpacking nv pairs * for transporting data across process boundaries, transporting * between kernel and userland, and possibly saving onto disk files. */ /* * Print control structure.
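 *
 * The structures below drive nvlist_print() and its customizable
 * variants. A typical caller looks like this (illustrative sketch,
 * error checking omitted):
 *
 *	nvlist_t *nvl = NULL;
 *	(void) nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0);
 *	(void) nvlist_add_uint64(nvl, "guid", 0x1234ULL);
 *	nvlist_print(stdout, nvl);
 *	nvlist_free(nvl);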
*/ #define DEFINEOP(opname, vtype) \ struct { \ int (*op)(struct nvlist_prtctl *, void *, nvlist_t *, \ const char *, vtype); \ void *arg; \ } opname #define DEFINEARROP(opname, vtype) \ struct { \ int (*op)(struct nvlist_prtctl *, void *, nvlist_t *, \ const char *, vtype, uint_t); \ void *arg; \ } opname struct nvlist_printops { DEFINEOP(print_boolean, int); DEFINEOP(print_boolean_value, boolean_t); DEFINEOP(print_byte, uchar_t); DEFINEOP(print_int8, int8_t); DEFINEOP(print_uint8, uint8_t); DEFINEOP(print_int16, int16_t); DEFINEOP(print_uint16, uint16_t); DEFINEOP(print_int32, int32_t); DEFINEOP(print_uint32, uint32_t); DEFINEOP(print_int64, int64_t); DEFINEOP(print_uint64, uint64_t); DEFINEOP(print_double, double); DEFINEOP(print_string, char *); DEFINEOP(print_hrtime, hrtime_t); DEFINEOP(print_nvlist, nvlist_t *); DEFINEARROP(print_boolean_array, boolean_t *); DEFINEARROP(print_byte_array, uchar_t *); DEFINEARROP(print_int8_array, int8_t *); DEFINEARROP(print_uint8_array, uint8_t *); DEFINEARROP(print_int16_array, int16_t *); DEFINEARROP(print_uint16_array, uint16_t *); DEFINEARROP(print_int32_array, int32_t *); DEFINEARROP(print_uint32_array, uint32_t *); DEFINEARROP(print_int64_array, int64_t *); DEFINEARROP(print_uint64_array, uint64_t *); DEFINEARROP(print_string_array, char **); DEFINEARROP(print_nvlist_array, nvlist_t **); }; struct nvlist_prtctl { FILE *nvprt_fp; /* output destination */ enum nvlist_indent_mode nvprt_indent_mode; /* see above */ int nvprt_indent; /* absolute indent, or tab depth */ int nvprt_indentinc; /* indent or tab increment */ const char *nvprt_nmfmt; /* member name format, max one %s */ const char *nvprt_eomfmt; /* after member format, e.g. "\n" */ const char *nvprt_btwnarrfmt; /* between array members */ int nvprt_btwnarrfmt_nl; /* nvprt_btwnarrfmt includes newline?
*/ struct nvlist_printops *nvprt_dfltops; struct nvlist_printops *nvprt_custops; }; #define DFLTPRTOP(pctl, type) \ ((pctl)->nvprt_dfltops->print_##type.op) #define DFLTPRTOPARG(pctl, type) \ ((pctl)->nvprt_dfltops->print_##type.arg) #define CUSTPRTOP(pctl, type) \ ((pctl)->nvprt_custops->print_##type.op) #define CUSTPRTOPARG(pctl, type) \ ((pctl)->nvprt_custops->print_##type.arg) #define RENDER(pctl, type, nvl, name, val) \ { \ int done = 0; \ if ((pctl)->nvprt_custops && CUSTPRTOP(pctl, type)) { \ done = CUSTPRTOP(pctl, type)(pctl, \ CUSTPRTOPARG(pctl, type), nvl, name, val); \ } \ if (!done) { \ (void) DFLTPRTOP(pctl, type)(pctl, \ DFLTPRTOPARG(pctl, type), nvl, name, val); \ } \ (void) fprintf(pctl->nvprt_fp, "%s", pctl->nvprt_eomfmt); \ } #define ARENDER(pctl, type, nvl, name, arrp, count) \ { \ int done = 0; \ if ((pctl)->nvprt_custops && CUSTPRTOP(pctl, type)) { \ done = CUSTPRTOP(pctl, type)(pctl, \ CUSTPRTOPARG(pctl, type), nvl, name, arrp, count); \ } \ if (!done) { \ (void) DFLTPRTOP(pctl, type)(pctl, \ DFLTPRTOPARG(pctl, type), nvl, name, arrp, count); \ } \ (void) fprintf(pctl->nvprt_fp, "%s", pctl->nvprt_eomfmt); \ } static void nvlist_print_with_indent(nvlist_t *, nvlist_prtctl_t); /* * ====================================================================== * | | * | Indentation | * | | * ====================================================================== */ static void indent(nvlist_prtctl_t pctl, int onemore) { int depth; switch (pctl->nvprt_indent_mode) { case NVLIST_INDENT_ABS: (void) fprintf(pctl->nvprt_fp, "%*s", pctl->nvprt_indent + onemore * pctl->nvprt_indentinc, ""); break; case NVLIST_INDENT_TABBED: depth = pctl->nvprt_indent + onemore; while (depth-- > 0) (void) fprintf(pctl->nvprt_fp, "\t"); } } /* * ====================================================================== * | | * | Default nvlist member rendering functions. | * | | * ====================================================================== */ /* * Generate functions to print single-valued nvlist members. * * type_and_variant - suffix to form function name * vtype - C type for the member value * ptype - C type to cast value to for printing * vfmt - format string for pair value, e.g "%d" or "0x%llx" */ #define NVLIST_PRTFUNC(type_and_variant, vtype, ptype, vfmt) \ static int \ nvprint_##type_and_variant(nvlist_prtctl_t pctl, void *private, \ nvlist_t *nvl, const char *name, vtype value) \ { \ FILE *fp = pctl->nvprt_fp; \ NOTE(ARGUNUSED(private)) \ NOTE(ARGUNUSED(nvl)) \ indent(pctl, 1); \ (void) fprintf(fp, pctl->nvprt_nmfmt, name); \ (void) fprintf(fp, vfmt, (ptype)value); \ return (1); \ } NVLIST_PRTFUNC(boolean, int, int, "%d") NVLIST_PRTFUNC(boolean_value, boolean_t, int, "%d") NVLIST_PRTFUNC(byte, uchar_t, uchar_t, "0x%2.2x") NVLIST_PRTFUNC(int8, int8_t, int, "%d") NVLIST_PRTFUNC(uint8, uint8_t, uint8_t, "0x%x") NVLIST_PRTFUNC(int16, int16_t, int16_t, "%d") NVLIST_PRTFUNC(uint16, uint16_t, uint16_t, "0x%x") NVLIST_PRTFUNC(int32, int32_t, int32_t, "%d") NVLIST_PRTFUNC(uint32, uint32_t, uint32_t, "0x%x") NVLIST_PRTFUNC(int64, int64_t, longlong_t, "%lld") NVLIST_PRTFUNC(uint64, uint64_t, u_longlong_t, "0x%llx") NVLIST_PRTFUNC(double, double, double, "0x%f") NVLIST_PRTFUNC(string, char *, char *, "%s") NVLIST_PRTFUNC(hrtime, hrtime_t, hrtime_t, "0x%llx") /* * Generate functions to print array-valued nvlist members. 
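 *
 * For example, with the default formats ("%s = " for the member name,
 * a single space between array members, and "0x%llx" for uint64
 * values), a three-element uint64 array named "guids" renders as:
 *
 *	guids = 0x1 0x2 0x3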
*/ #define NVLIST_ARRPRTFUNC(type_and_variant, vtype, ptype, vfmt) \ static int \ nvaprint_##type_and_variant(nvlist_prtctl_t pctl, void *private, \ nvlist_t *nvl, const char *name, vtype *valuep, uint_t count) \ { \ FILE *fp = pctl->nvprt_fp; \ uint_t i; \ NOTE(ARGUNUSED(private)) \ NOTE(ARGUNUSED(nvl)) \ for (i = 0; i < count; i++) { \ if (i == 0 || pctl->nvprt_btwnarrfmt_nl) { \ indent(pctl, 1); \ (void) fprintf(fp, pctl->nvprt_nmfmt, name); \ if (pctl->nvprt_btwnarrfmt_nl) \ (void) fprintf(fp, "[%d]: ", i); \ } \ if (i != 0) \ (void) fprintf(fp, "%s", pctl->nvprt_btwnarrfmt); \ (void) fprintf(fp, vfmt, (ptype)valuep[i]); \ } \ return (1); \ } NVLIST_ARRPRTFUNC(boolean_array, boolean_t, boolean_t, "%d") NVLIST_ARRPRTFUNC(byte_array, uchar_t, uchar_t, "0x%2.2x") NVLIST_ARRPRTFUNC(int8_array, int8_t, int8_t, "%d") NVLIST_ARRPRTFUNC(uint8_array, uint8_t, uint8_t, "0x%x") NVLIST_ARRPRTFUNC(int16_array, int16_t, int16_t, "%d") NVLIST_ARRPRTFUNC(uint16_array, uint16_t, uint16_t, "0x%x") NVLIST_ARRPRTFUNC(int32_array, int32_t, int32_t, "%d") NVLIST_ARRPRTFUNC(uint32_array, uint32_t, uint32_t, "0x%x") NVLIST_ARRPRTFUNC(int64_array, int64_t, longlong_t, "%lld") NVLIST_ARRPRTFUNC(uint64_array, uint64_t, u_longlong_t, "0x%llx") NVLIST_ARRPRTFUNC(string_array, char *, char *, "%s") /*ARGSUSED*/ static int nvprint_nvlist(nvlist_prtctl_t pctl, void *private, nvlist_t *nvl, const char *name, nvlist_t *value) { FILE *fp = pctl->nvprt_fp; indent(pctl, 1); (void) fprintf(fp, "%s = (embedded nvlist)\n", name); pctl->nvprt_indent += pctl->nvprt_indentinc; nvlist_print_with_indent(value, pctl); pctl->nvprt_indent -= pctl->nvprt_indentinc; indent(pctl, 1); (void) fprintf(fp, "(end %s)\n", name); return (1); } /*ARGSUSED*/ static int nvaprint_nvlist_array(nvlist_prtctl_t pctl, void *private, nvlist_t *nvl, const char *name, nvlist_t **valuep, uint_t count) { FILE *fp = pctl->nvprt_fp; uint_t i; indent(pctl, 1); (void) fprintf(fp, "%s = (array of embedded nvlists)\n", name); for (i = 0; i < count; i++) { indent(pctl, 1); (void) fprintf(fp, "(start %s[%d])\n", name, i); pctl->nvprt_indent += pctl->nvprt_indentinc; nvlist_print_with_indent(valuep[i], pctl); pctl->nvprt_indent -= pctl->nvprt_indentinc; indent(pctl, 1); (void) fprintf(fp, "(end %s[%d])\n", name, i); } return (1); } /* * ====================================================================== * | | * | Interfaces that allow control over formatting. 
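/*
 * nvprint_nvlist() above recurses by widening nvprt_indent before
 * descending, so an embedded list renders one level deeper, bracketed
 * by "(embedded nvlist)" and "(end <name>)" lines.  A small usage
 * sketch (member names assumed, error handling elided for brevity):
 */
static void
nested_print_example(void)
{
	nvlist_t *parent, *child;

	(void) nvlist_alloc(&parent, NV_UNIQUE_NAME, 0);
	(void) nvlist_alloc(&child, NV_UNIQUE_NAME, 0);
	(void) nvlist_add_uint64(child, "guid", 0x1234ULL);
	(void) nvlist_add_nvlist(parent, "vdev", child);
	nvlist_print(stdout, parent);	/* "vdev" members print one tab deeper */
	nvlist_free(child);
	nvlist_free(parent);
}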
| * | | * ====================================================================== */ void nvlist_prtctl_setdest(nvlist_prtctl_t pctl, FILE *fp) { pctl->nvprt_fp = fp; } FILE * nvlist_prtctl_getdest(nvlist_prtctl_t pctl) { return (pctl->nvprt_fp); } void nvlist_prtctl_setindent(nvlist_prtctl_t pctl, enum nvlist_indent_mode mode, int start, int inc) { if (mode < NVLIST_INDENT_ABS || mode > NVLIST_INDENT_TABBED) mode = NVLIST_INDENT_TABBED; if (start < 0) start = 0; if (inc < 0) inc = 1; pctl->nvprt_indent_mode = mode; pctl->nvprt_indent = start; pctl->nvprt_indentinc = inc; } void nvlist_prtctl_doindent(nvlist_prtctl_t pctl, int onemore) { indent(pctl, onemore); } void nvlist_prtctl_setfmt(nvlist_prtctl_t pctl, enum nvlist_prtctl_fmt which, const char *fmt) { switch (which) { case NVLIST_FMT_MEMBER_NAME: if (fmt == NULL) fmt = "%s = "; pctl->nvprt_nmfmt = fmt; break; case NVLIST_FMT_MEMBER_POSTAMBLE: if (fmt == NULL) fmt = "\n"; pctl->nvprt_eomfmt = fmt; break; case NVLIST_FMT_BTWN_ARRAY: if (fmt == NULL) { pctl->nvprt_btwnarrfmt = " "; pctl->nvprt_btwnarrfmt_nl = 0; } else { pctl->nvprt_btwnarrfmt = fmt; pctl->nvprt_btwnarrfmt_nl = (strstr(fmt, "\n") != NULL); } break; default: break; } } void nvlist_prtctl_dofmt(nvlist_prtctl_t pctl, enum nvlist_prtctl_fmt which, ...) { FILE *fp = pctl->nvprt_fp; va_list ap; char *name; va_start(ap, which); switch (which) { case NVLIST_FMT_MEMBER_NAME: name = va_arg(ap, char *); (void) fprintf(fp, pctl->nvprt_nmfmt, name); break; case NVLIST_FMT_MEMBER_POSTAMBLE: (void) fprintf(fp, "%s", pctl->nvprt_eomfmt); break; case NVLIST_FMT_BTWN_ARRAY: (void) fprintf(fp, "%s", pctl->nvprt_btwnarrfmt); break; default: break; } va_end(ap); } /* * ====================================================================== * | | * | Interfaces to allow appointment of replacement rendering functions.| * | | * ====================================================================== */ #define NVLIST_PRINTCTL_REPLACE(type, vtype) \ void \ nvlist_prtctlop_##type(nvlist_prtctl_t pctl, \ int (*func)(nvlist_prtctl_t, void *, nvlist_t *, const char *, vtype), \ void *private) \ { \ CUSTPRTOP(pctl, type) = func; \ CUSTPRTOPARG(pctl, type) = private; \ } NVLIST_PRINTCTL_REPLACE(boolean, int) NVLIST_PRINTCTL_REPLACE(boolean_value, boolean_t) NVLIST_PRINTCTL_REPLACE(byte, uchar_t) NVLIST_PRINTCTL_REPLACE(int8, int8_t) NVLIST_PRINTCTL_REPLACE(uint8, uint8_t) NVLIST_PRINTCTL_REPLACE(int16, int16_t) NVLIST_PRINTCTL_REPLACE(uint16, uint16_t) NVLIST_PRINTCTL_REPLACE(int32, int32_t) NVLIST_PRINTCTL_REPLACE(uint32, uint32_t) NVLIST_PRINTCTL_REPLACE(int64, int64_t) NVLIST_PRINTCTL_REPLACE(uint64, uint64_t) NVLIST_PRINTCTL_REPLACE(double, double) NVLIST_PRINTCTL_REPLACE(string, char *) NVLIST_PRINTCTL_REPLACE(hrtime, hrtime_t) NVLIST_PRINTCTL_REPLACE(nvlist, nvlist_t *) #define NVLIST_PRINTCTL_AREPLACE(type, vtype) \ void \ nvlist_prtctlop_##type(nvlist_prtctl_t pctl, \ int (*func)(nvlist_prtctl_t, void *, nvlist_t *, const char *, vtype, \ uint_t), void *private) \ { \ CUSTPRTOP(pctl, type) = func; \ CUSTPRTOPARG(pctl, type) = private; \ } NVLIST_PRINTCTL_AREPLACE(boolean_array, boolean_t *) NVLIST_PRINTCTL_AREPLACE(byte_array, uchar_t *) NVLIST_PRINTCTL_AREPLACE(int8_array, int8_t *) NVLIST_PRINTCTL_AREPLACE(uint8_array, uint8_t *) NVLIST_PRINTCTL_AREPLACE(int16_array, int16_t *) NVLIST_PRINTCTL_AREPLACE(uint16_array, uint16_t *) NVLIST_PRINTCTL_AREPLACE(int32_array, int32_t *) NVLIST_PRINTCTL_AREPLACE(uint32_array, uint32_t *) NVLIST_PRINTCTL_AREPLACE(int64_array, int64_t *) 
NVLIST_PRINTCTL_AREPLACE(uint64_array, uint64_t *) NVLIST_PRINTCTL_AREPLACE(string_array, char **) NVLIST_PRINTCTL_AREPLACE(nvlist_array, nvlist_t **) /* * ====================================================================== * | | * | Interfaces to manage nvlist_prtctl_t cookies. | * | | * ====================================================================== */ -static const struct nvlist_printops defprtops = { +static const struct nvlist_printops defprtops = +{ { nvprint_boolean, NULL }, { nvprint_boolean_value, NULL }, { nvprint_byte, NULL }, { nvprint_int8, NULL }, { nvprint_uint8, NULL }, { nvprint_int16, NULL }, { nvprint_uint16, NULL }, { nvprint_int32, NULL }, { nvprint_uint32, NULL }, { nvprint_int64, NULL }, { nvprint_uint64, NULL }, { nvprint_double, NULL }, { nvprint_string, NULL }, { nvprint_hrtime, NULL }, { nvprint_nvlist, NULL }, { nvaprint_boolean_array, NULL }, { nvaprint_byte_array, NULL }, { nvaprint_int8_array, NULL }, { nvaprint_uint8_array, NULL }, { nvaprint_int16_array, NULL }, { nvaprint_uint16_array, NULL }, { nvaprint_int32_array, NULL }, { nvaprint_uint32_array, NULL }, { nvaprint_int64_array, NULL }, { nvaprint_uint64_array, NULL }, { nvaprint_string_array, NULL }, { nvaprint_nvlist_array, NULL }, }; static void prtctl_defaults(FILE *fp, struct nvlist_prtctl *pctl, struct nvlist_printops *ops) { pctl->nvprt_fp = fp; pctl->nvprt_indent_mode = NVLIST_INDENT_TABBED; pctl->nvprt_indent = 0; pctl->nvprt_indentinc = 1; pctl->nvprt_nmfmt = "%s = "; pctl->nvprt_eomfmt = "\n"; pctl->nvprt_btwnarrfmt = " "; pctl->nvprt_btwnarrfmt_nl = 0; pctl->nvprt_dfltops = (struct nvlist_printops *)&defprtops; pctl->nvprt_custops = ops; } nvlist_prtctl_t nvlist_prtctl_alloc(void) { struct nvlist_prtctl *pctl; struct nvlist_printops *ops; if ((pctl = malloc(sizeof (*pctl))) == NULL) return (NULL); if ((ops = calloc(1, sizeof (*ops))) == NULL) { free(pctl); return (NULL); } prtctl_defaults(stdout, pctl, ops); return (pctl); } void nvlist_prtctl_free(nvlist_prtctl_t pctl) { if (pctl != NULL) { free(pctl->nvprt_custops); free(pctl); } } /* * ====================================================================== * | | * | Top-level print request interfaces. 
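/*
 * Putting the cookie interfaces together -- a usage sketch that assumes
 * the print_uint64_decimal() callback sketched earlier: allocate a
 * print-control cookie, switch to four-space absolute indentation and
 * one-array-element-per-line output, register the custom renderer, and
 * print.  Note that passing a format containing '\n' to
 * NVLIST_FMT_BTWN_ARRAY sets nvprt_btwnarrfmt_nl, which makes the
 * generated array printers re-indent and tag each element "[i]: ".
 */
static void
prtctl_example(nvlist_t *nvl)
{
	nvlist_prtctl_t pctl;

	if ((pctl = nvlist_prtctl_alloc()) == NULL)
		return;
	nvlist_prtctl_setindent(pctl, NVLIST_INDENT_ABS, 0, 4);
	nvlist_prtctl_setfmt(pctl, NVLIST_FMT_BTWN_ARRAY, "\n");
	nvlist_prtctlop_uint64(pctl, print_uint64_decimal, NULL);
	nvlist_prt(nvl, pctl);
	nvlist_prtctl_free(pctl);
}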
| * | | * ====================================================================== */ /* * nvlist_print - Prints elements in an event buffer */ static void nvlist_print_with_indent(nvlist_t *nvl, nvlist_prtctl_t pctl) { FILE *fp = pctl->nvprt_fp; char *name; uint_t nelem; nvpair_t *nvp; if (nvl == NULL) return; indent(pctl, 0); (void) fprintf(fp, "nvlist version: %d\n", NVL_VERSION(nvl)); nvp = nvlist_next_nvpair(nvl, NULL); while (nvp) { data_type_t type = nvpair_type(nvp); name = nvpair_name(nvp); nelem = 0; switch (type) { case DATA_TYPE_BOOLEAN: { RENDER(pctl, boolean, nvl, name, 1); break; } case DATA_TYPE_BOOLEAN_VALUE: { boolean_t val; (void) nvpair_value_boolean_value(nvp, &val); RENDER(pctl, boolean_value, nvl, name, val); break; } case DATA_TYPE_BYTE: { uchar_t val; (void) nvpair_value_byte(nvp, &val); RENDER(pctl, byte, nvl, name, val); break; } case DATA_TYPE_INT8: { int8_t val; (void) nvpair_value_int8(nvp, &val); RENDER(pctl, int8, nvl, name, val); break; } case DATA_TYPE_UINT8: { uint8_t val; (void) nvpair_value_uint8(nvp, &val); RENDER(pctl, uint8, nvl, name, val); break; } case DATA_TYPE_INT16: { int16_t val; (void) nvpair_value_int16(nvp, &val); RENDER(pctl, int16, nvl, name, val); break; } case DATA_TYPE_UINT16: { uint16_t val; (void) nvpair_value_uint16(nvp, &val); RENDER(pctl, uint16, nvl, name, val); break; } case DATA_TYPE_INT32: { int32_t val; (void) nvpair_value_int32(nvp, &val); RENDER(pctl, int32, nvl, name, val); break; } case DATA_TYPE_UINT32: { uint32_t val; (void) nvpair_value_uint32(nvp, &val); RENDER(pctl, uint32, nvl, name, val); break; } case DATA_TYPE_INT64: { int64_t val; (void) nvpair_value_int64(nvp, &val); RENDER(pctl, int64, nvl, name, val); break; } case DATA_TYPE_UINT64: { uint64_t val; (void) nvpair_value_uint64(nvp, &val); RENDER(pctl, uint64, nvl, name, val); break; } case DATA_TYPE_DOUBLE: { double val; (void) nvpair_value_double(nvp, &val); RENDER(pctl, double, nvl, name, val); break; } case DATA_TYPE_STRING: { char *val; (void) nvpair_value_string(nvp, &val); RENDER(pctl, string, nvl, name, val); break; } case DATA_TYPE_BOOLEAN_ARRAY: { boolean_t *val; (void) nvpair_value_boolean_array(nvp, &val, &nelem); ARENDER(pctl, boolean_array, nvl, name, val, nelem); break; } case DATA_TYPE_BYTE_ARRAY: { uchar_t *val; (void) nvpair_value_byte_array(nvp, &val, &nelem); ARENDER(pctl, byte_array, nvl, name, val, nelem); break; } case DATA_TYPE_INT8_ARRAY: { int8_t *val; (void) nvpair_value_int8_array(nvp, &val, &nelem); ARENDER(pctl, int8_array, nvl, name, val, nelem); break; } case DATA_TYPE_UINT8_ARRAY: { uint8_t *val; (void) nvpair_value_uint8_array(nvp, &val, &nelem); ARENDER(pctl, uint8_array, nvl, name, val, nelem); break; } case DATA_TYPE_INT16_ARRAY: { int16_t *val; (void) nvpair_value_int16_array(nvp, &val, &nelem); ARENDER(pctl, int16_array, nvl, name, val, nelem); break; } case DATA_TYPE_UINT16_ARRAY: { uint16_t *val; (void) nvpair_value_uint16_array(nvp, &val, &nelem); ARENDER(pctl, uint16_array, nvl, name, val, nelem); break; } case DATA_TYPE_INT32_ARRAY: { int32_t *val; (void) nvpair_value_int32_array(nvp, &val, &nelem); ARENDER(pctl, int32_array, nvl, name, val, nelem); break; } case DATA_TYPE_UINT32_ARRAY: { uint32_t *val; (void) nvpair_value_uint32_array(nvp, &val, &nelem); ARENDER(pctl, uint32_array, nvl, name, val, nelem); break; } case DATA_TYPE_INT64_ARRAY: { int64_t *val; (void) nvpair_value_int64_array(nvp, &val, &nelem); ARENDER(pctl, int64_array, nvl, name, val, nelem); break; } case DATA_TYPE_UINT64_ARRAY: { uint64_t *val; (void) 
nvpair_value_uint64_array(nvp, &val, &nelem); ARENDER(pctl, uint64_array, nvl, name, val, nelem); break; } case DATA_TYPE_STRING_ARRAY: { char **val; (void) nvpair_value_string_array(nvp, &val, &nelem); ARENDER(pctl, string_array, nvl, name, val, nelem); break; } case DATA_TYPE_HRTIME: { hrtime_t val; (void) nvpair_value_hrtime(nvp, &val); RENDER(pctl, hrtime, nvl, name, val); break; } case DATA_TYPE_NVLIST: { nvlist_t *val; (void) nvpair_value_nvlist(nvp, &val); RENDER(pctl, nvlist, nvl, name, val); break; } case DATA_TYPE_NVLIST_ARRAY: { nvlist_t **val; (void) nvpair_value_nvlist_array(nvp, &val, &nelem); ARENDER(pctl, nvlist_array, nvl, name, val, nelem); break; } default: (void) fprintf(fp, " unknown data type (%d)", type); break; } nvp = nvlist_next_nvpair(nvl, nvp); } } void nvlist_print(FILE *fp, nvlist_t *nvl) { struct nvlist_prtctl pc; prtctl_defaults(fp, &pc, NULL); nvlist_print_with_indent(nvl, &pc); } void nvlist_prt(nvlist_t *nvl, nvlist_prtctl_t pctl) { nvlist_print_with_indent(nvl, pctl); } #define NVP(elem, type, vtype, ptype, format) { \ vtype value; \ \ (void) nvpair_value_##type(elem, &value); \ (void) printf("%*s%s: " format "\n", indent, "", \ nvpair_name(elem), (ptype)value); \ } #define NVPA(elem, type, vtype, ptype, format) { \ uint_t i, count; \ vtype *value; \ \ (void) nvpair_value_##type(elem, &value, &count); \ for (i = 0; i < count; i++) { \ (void) printf("%*s%s[%d]: " format "\n", indent, "", \ nvpair_name(elem), i, (ptype)value[i]); \ } \ } /* * Similar to nvlist_print() but handles arrays slightly differently. */ void dump_nvlist(nvlist_t *list, int indent) { nvpair_t *elem = NULL; boolean_t bool_value; nvlist_t *nvlist_value; nvlist_t **nvlist_array_value; uint_t i, count; if (list == NULL) { return; } while ((elem = nvlist_next_nvpair(list, elem)) != NULL) { switch (nvpair_type(elem)) { case DATA_TYPE_BOOLEAN: (void) printf("%*s%s\n", indent, "", nvpair_name(elem)); break; case DATA_TYPE_BOOLEAN_VALUE: (void) nvpair_value_boolean_value(elem, &bool_value); (void) printf("%*s%s: %s\n", indent, "", nvpair_name(elem), bool_value ? 
"true" : "false"); break; case DATA_TYPE_BYTE: NVP(elem, byte, uchar_t, int, "%u"); break; case DATA_TYPE_INT8: NVP(elem, int8, int8_t, int, "%d"); break; case DATA_TYPE_UINT8: NVP(elem, uint8, uint8_t, int, "%u"); break; case DATA_TYPE_INT16: NVP(elem, int16, int16_t, int, "%d"); break; case DATA_TYPE_UINT16: NVP(elem, uint16, uint16_t, int, "%u"); break; case DATA_TYPE_INT32: NVP(elem, int32, int32_t, long, "%ld"); break; case DATA_TYPE_UINT32: NVP(elem, uint32, uint32_t, ulong_t, "%lu"); break; case DATA_TYPE_INT64: NVP(elem, int64, int64_t, longlong_t, "%lld"); break; case DATA_TYPE_UINT64: NVP(elem, uint64, uint64_t, u_longlong_t, "%llu"); break; case DATA_TYPE_STRING: NVP(elem, string, char *, char *, "'%s'"); break; case DATA_TYPE_BYTE_ARRAY: NVPA(elem, byte_array, uchar_t, int, "%u"); break; case DATA_TYPE_INT8_ARRAY: NVPA(elem, int8_array, int8_t, int, "%d"); break; case DATA_TYPE_UINT8_ARRAY: NVPA(elem, uint8_array, uint8_t, int, "%u"); break; case DATA_TYPE_INT16_ARRAY: NVPA(elem, int16_array, int16_t, int, "%d"); break; case DATA_TYPE_UINT16_ARRAY: NVPA(elem, uint16_array, uint16_t, int, "%u"); break; case DATA_TYPE_INT32_ARRAY: NVPA(elem, int32_array, int32_t, long, "%ld"); break; case DATA_TYPE_UINT32_ARRAY: NVPA(elem, uint32_array, uint32_t, ulong_t, "%lu"); break; case DATA_TYPE_INT64_ARRAY: NVPA(elem, int64_array, int64_t, longlong_t, "%lld"); break; case DATA_TYPE_UINT64_ARRAY: NVPA(elem, uint64_array, uint64_t, u_longlong_t, "%llu"); break; case DATA_TYPE_STRING_ARRAY: NVPA(elem, string_array, char *, char *, "'%s'"); break; case DATA_TYPE_NVLIST: (void) nvpair_value_nvlist(elem, &nvlist_value); (void) printf("%*s%s:\n", indent, "", nvpair_name(elem)); dump_nvlist(nvlist_value, indent + 4); break; case DATA_TYPE_NVLIST_ARRAY: (void) nvpair_value_nvlist_array(elem, &nvlist_array_value, &count); for (i = 0; i < count; i++) { (void) printf("%*s%s[%u]:\n", indent, "", nvpair_name(elem), i); dump_nvlist(nvlist_array_value[i], indent + 4); } break; default: (void) printf(dgettext(TEXT_DOMAIN, "bad config type " "%d for %s\n"), nvpair_type(elem), nvpair_name(elem)); } } } /* * ====================================================================== * | | * | Misc private interface. | * | | * ====================================================================== */ /* * Determine if string 'value' matches 'nvp' value. The 'value' string is * converted, depending on the type of 'nvp', prior to match. For numeric * types, a radix independent sscanf conversion of 'value' is used. If 'nvp' * is an array type, 'ai' is the index into the array against which we are * checking for match. If nvp is of DATA_TYPE_STRING*, the caller can pass * in a regex_t compilation of value in 'value_regex' to trigger regular * expression string match instead of simple strcmp(). * * Return 1 on match, 0 on no-match, and -1 on error. If the error is * related to value syntax error and 'ep' is non-NULL, *ep will point into * the 'value' string at the location where the error exists. * * NOTE: It may be possible to move the non-regex_t version of this into * common code used by library/kernel/boot. 
*/ int nvpair_value_match_regex(nvpair_t *nvp, int ai, char *value, regex_t *value_regex, char **ep) { char *evalue; uint_t a_len; int sr; if (ep) *ep = NULL; if ((nvp == NULL) || (value == NULL)) return (-1); /* error fail match - invalid args */ /* make sure array and index combination make sense */ if ((nvpair_type_is_array(nvp) && (ai < 0)) || (!nvpair_type_is_array(nvp) && (ai >= 0))) return (-1); /* error fail match - bad index */ /* non-string values should be single 'chunk' */ if ((nvpair_type(nvp) != DATA_TYPE_STRING) && (nvpair_type(nvp) != DATA_TYPE_STRING_ARRAY)) { value += strspn(value, " \t"); evalue = value + strcspn(value, " \t"); if (*evalue) { if (ep) *ep = evalue; return (-1); /* error fail match - syntax */ } } sr = EOF; switch (nvpair_type(nvp)) { case DATA_TYPE_STRING: { char *val; /* check string value for match */ if (nvpair_value_string(nvp, &val) == 0) { if (value_regex) { if (regexec(value_regex, val, (size_t)0, NULL, 0) == 0) return (1); /* match */ } else { if (strcmp(value, val) == 0) return (1); /* match */ } } break; } case DATA_TYPE_STRING_ARRAY: { char **val_array; /* check indexed string value of array for match */ if ((nvpair_value_string_array(nvp, &val_array, &a_len) == 0) && (ai < a_len)) { if (value_regex) { if (regexec(value_regex, val_array[ai], (size_t)0, NULL, 0) == 0) return (1); } else { if (strcmp(value, val_array[ai]) == 0) return (1); } } break; } case DATA_TYPE_BYTE: { uchar_t val, val_arg; /* scanf uchar_t from value and check for match */ sr = sscanf(value, "%c", &val_arg); if ((sr == 1) && (nvpair_value_byte(nvp, &val) == 0) && (val == val_arg)) return (1); break; } case DATA_TYPE_BYTE_ARRAY: { uchar_t *val_array, val_arg; /* check indexed value of array for match */ sr = sscanf(value, "%c", &val_arg); if ((sr == 1) && (nvpair_value_byte_array(nvp, &val_array, &a_len) == 0) && (ai < a_len) && (val_array[ai] == val_arg)) return (1); break; } case DATA_TYPE_INT8: { int8_t val, val_arg; /* scanf int8_t from value and check for match */ sr = sscanf(value, "%"SCNi8, &val_arg); if ((sr == 1) && (nvpair_value_int8(nvp, &val) == 0) && (val == val_arg)) return (1); break; } case DATA_TYPE_INT8_ARRAY: { int8_t *val_array, val_arg; /* check indexed value of array for match */ sr = sscanf(value, "%"SCNi8, &val_arg); if ((sr == 1) && (nvpair_value_int8_array(nvp, &val_array, &a_len) == 0) && (ai < a_len) && (val_array[ai] == val_arg)) return (1); break; } case DATA_TYPE_UINT8: { uint8_t val, val_arg; /* scanf uint8_t from value and check for match */ sr = sscanf(value, "%"SCNi8, (int8_t *)&val_arg); if ((sr == 1) && (nvpair_value_uint8(nvp, &val) == 0) && (val == val_arg)) return (1); break; } case DATA_TYPE_UINT8_ARRAY: { uint8_t *val_array, val_arg; /* check indexed value of array for match */ sr = sscanf(value, "%"SCNi8, (int8_t *)&val_arg); if ((sr == 1) && (nvpair_value_uint8_array(nvp, &val_array, &a_len) == 0) && (ai < a_len) && (val_array[ai] == val_arg)) return (1); break; } case DATA_TYPE_INT16: { int16_t val, val_arg; /* scanf int16_t from value and check for match */ sr = sscanf(value, "%"SCNi16, &val_arg); if ((sr == 1) && (nvpair_value_int16(nvp, &val) == 0) && (val == val_arg)) return (1); break; } case DATA_TYPE_INT16_ARRAY: { int16_t *val_array, val_arg; /* check indexed value of array for match */ sr = sscanf(value, "%"SCNi16, &val_arg); if ((sr == 1) && (nvpair_value_int16_array(nvp, &val_array, &a_len) == 0) && (ai < a_len) && (val_array[ai] == val_arg)) return (1); break; } case DATA_TYPE_UINT16: { uint16_t val, val_arg; /* scanf 
uint16_t from value and check for match */ sr = sscanf(value, "%"SCNi16, (int16_t *)&val_arg); if ((sr == 1) && (nvpair_value_uint16(nvp, &val) == 0) && (val == val_arg)) return (1); break; } case DATA_TYPE_UINT16_ARRAY: { uint16_t *val_array, val_arg; /* check indexed value of array for match */ sr = sscanf(value, "%"SCNi16, (int16_t *)&val_arg); if ((sr == 1) && (nvpair_value_uint16_array(nvp, &val_array, &a_len) == 0) && (ai < a_len) && (val_array[ai] == val_arg)) return (1); break; } case DATA_TYPE_INT32: { int32_t val, val_arg; /* scanf int32_t from value and check for match */ sr = sscanf(value, "%"SCNi32, &val_arg); if ((sr == 1) && (nvpair_value_int32(nvp, &val) == 0) && (val == val_arg)) return (1); break; } case DATA_TYPE_INT32_ARRAY: { int32_t *val_array, val_arg; /* check indexed value of array for match */ sr = sscanf(value, "%"SCNi32, &val_arg); if ((sr == 1) && (nvpair_value_int32_array(nvp, &val_array, &a_len) == 0) && (ai < a_len) && (val_array[ai] == val_arg)) return (1); break; } case DATA_TYPE_UINT32: { uint32_t val, val_arg; /* scanf uint32_t from value and check for match */ sr = sscanf(value, "%"SCNi32, (int32_t *)&val_arg); if ((sr == 1) && (nvpair_value_uint32(nvp, &val) == 0) && (val == val_arg)) return (1); break; } case DATA_TYPE_UINT32_ARRAY: { uint32_t *val_array, val_arg; /* check indexed value of array for match */ sr = sscanf(value, "%"SCNi32, (int32_t *)&val_arg); if ((sr == 1) && (nvpair_value_uint32_array(nvp, &val_array, &a_len) == 0) && (ai < a_len) && (val_array[ai] == val_arg)) return (1); break; } case DATA_TYPE_INT64: { int64_t val, val_arg; /* scanf int64_t from value and check for match */ sr = sscanf(value, "%"SCNi64, &val_arg); if ((sr == 1) && (nvpair_value_int64(nvp, &val) == 0) && (val == val_arg)) return (1); break; } case DATA_TYPE_INT64_ARRAY: { int64_t *val_array, val_arg; /* check indexed value of array for match */ sr = sscanf(value, "%"SCNi64, &val_arg); if ((sr == 1) && (nvpair_value_int64_array(nvp, &val_array, &a_len) == 0) && (ai < a_len) && (val_array[ai] == val_arg)) return (1); break; } case DATA_TYPE_UINT64: { uint64_t val_arg, val; /* scanf uint64_t from value and check for match */ sr = sscanf(value, "%"SCNi64, (int64_t *)&val_arg); if ((sr == 1) && (nvpair_value_uint64(nvp, &val) == 0) && (val == val_arg)) return (1); break; } case DATA_TYPE_UINT64_ARRAY: { uint64_t *val_array, val_arg; /* check indexed value of array for match */ sr = sscanf(value, "%"SCNi64, (int64_t *)&val_arg); if ((sr == 1) && (nvpair_value_uint64_array(nvp, &val_array, &a_len) == 0) && (ai < a_len) && (val_array[ai] == val_arg)) return (1); break; } case DATA_TYPE_BOOLEAN_VALUE: { boolean_t val, val_arg; /* scanf boolean_t from value and check for match */ sr = sscanf(value, "%"SCNi32, (int32_t *)&val_arg); if ((sr == 1) && (nvpair_value_boolean_value(nvp, &val) == 0) && (val == val_arg)) return (1); break; } case DATA_TYPE_BOOLEAN_ARRAY: { boolean_t *val_array, val_arg; /* check indexed value of array for match */ sr = sscanf(value, "%"SCNi32, (int32_t *)&val_arg); if ((sr == 1) && (nvpair_value_boolean_array(nvp, &val_array, &a_len) == 0) && (ai < a_len) && (val_array[ai] == val_arg)) return (1); break; } case DATA_TYPE_HRTIME: case DATA_TYPE_NVLIST: case DATA_TYPE_NVLIST_ARRAY: case DATA_TYPE_BOOLEAN: case DATA_TYPE_DOUBLE: case DATA_TYPE_UNKNOWN: default: /* * unknown/unsupported data type */ return (-1); /* error fail match */ } /* * check to see if sscanf failed conversion, return approximate * pointer to problem */ if (sr != 1) { if (ep) *ep = 
value; return (-1); /* error fail match - syntax */ } return (0); /* fail match */ } int nvpair_value_match(nvpair_t *nvp, int ai, char *value, char **ep) { return (nvpair_value_match_regex(nvp, ai, value, NULL, ep)); } diff --git a/lib/libshare/libshare.c b/lib/libshare/libshare.c index 41fe8bb6cb26..aa565ca82862 100644 --- a/lib/libshare/libshare.c +++ b/lib/libshare/libshare.c @@ -1,804 +1,806 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011 Gunnar Beutner */ #include #include #include #include #include #include #include #include #include #include #include "libshare_impl.h" #include "nfs.h" #include "smb.h" static sa_share_impl_t find_share(sa_handle_impl_t handle, const char *sharepath); static sa_share_impl_t alloc_share(const char *sharepath); static void free_share(sa_share_impl_t share); static void parse_sharetab(sa_handle_impl_t impl_handle); static int process_share(sa_handle_impl_t impl_handle, sa_share_impl_t impl_share, char *pathname, char *resource, char *fstype, char *options, char *description, char *dataset, boolean_t from_sharetab); static void update_sharetab(sa_handle_impl_t impl_handle); static int update_zfs_share(sa_share_impl_t impl_handle, const char *proto); static int update_zfs_shares(sa_handle_impl_t impl_handle, const char *proto); static int fstypes_count; static sa_fstype_t *fstypes; sa_fstype_t * register_fstype(const char *name, const sa_share_ops_t *ops) { sa_fstype_t *fstype; fstype = calloc(sizeof (sa_fstype_t), 1); if (fstype == NULL) return (NULL); fstype->name = name; fstype->ops = ops; fstype->fsinfo_index = fstypes_count; fstypes_count++; fstype->next = fstypes; fstypes = fstype; return (fstype); } sa_handle_t sa_init(int init_service) { sa_handle_impl_t impl_handle; impl_handle = calloc(sizeof (struct sa_handle_impl), 1); if (impl_handle == NULL) return (NULL); impl_handle->zfs_libhandle = libzfs_init(); if (impl_handle->zfs_libhandle != NULL) { libzfs_print_on_error(impl_handle->zfs_libhandle, B_TRUE); } parse_sharetab(impl_handle); update_zfs_shares(impl_handle, NULL); return ((sa_handle_t)impl_handle); } __attribute__((constructor)) static void libshare_init(void) { libshare_nfs_init(); libshare_smb_init(); } static void -parse_sharetab(sa_handle_impl_t impl_handle) { +parse_sharetab(sa_handle_impl_t impl_handle) +{ FILE *fp; char line[512]; char *eol, *pathname, *resource, *fstype, *options, *description; fp = fopen(ZFS_SHARETAB, "r"); if (fp == NULL) return; while (fgets(line, sizeof (line), fp) != NULL) { eol = line + strlen(line) - 1; while (eol >= line) { if (*eol != '\r' && *eol != '\n') break; *eol = '\0'; eol--; } pathname = line; if ((resource = strchr(pathname, '\t')) == NULL) continue; 
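/*
 * A usage sketch for the matching interface above (the helper name and
 * traversal policy are assumptions): scalars are matched with ai == -1,
 * array members with an element index.  nvpair_value_match() returns
 * 1 on match, 0 on no-match and -1 on error, so the result must be
 * compared against 1 rather than treated as a boolean.
 */
static boolean_t
nvlist_has_match(nvlist_t *nvl, char *value)
{
	nvpair_t *nvp = NULL;

	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
		int ai = nvpair_type_is_array(nvp) ? 0 : -1;

		/* for arrays, check only element 0, for brevity */
		if (nvpair_value_match(nvp, ai, value, NULL) == 1)
			return (B_TRUE);
	}
	return (B_FALSE);
}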
*resource = '\0'; resource++; if ((fstype = strchr(resource, '\t')) == NULL) continue; *fstype = '\0'; fstype++; if ((options = strchr(fstype, '\t')) == NULL) continue; *options = '\0'; options++; if ((description = strchr(fstype, '\t')) != NULL) { *description = '\0'; description++; } if (strcmp(resource, "-") == 0) resource = NULL; (void) process_share(impl_handle, NULL, pathname, resource, fstype, options, description, NULL, B_TRUE); } fclose(fp); } static void update_sharetab(sa_handle_impl_t impl_handle) { sa_share_impl_t impl_share; int temp_fd; FILE *temp_fp; char tempfile[] = ZFS_SHARETAB".XXXXXX"; sa_fstype_t *fstype; const char *resource; if (mkdir("/etc/dfs", 0755) < 0 && errno != EEXIST) { return; } temp_fd = mkstemp(tempfile); if (temp_fd < 0) return; temp_fp = fdopen(temp_fd, "w"); if (temp_fp == NULL) return; impl_share = impl_handle->shares; while (impl_share != NULL) { fstype = fstypes; while (fstype != NULL) { if (FSINFO(impl_share, fstype)->active && FSINFO(impl_share, fstype)->shareopts != NULL) { resource = FSINFO(impl_share, fstype)->resource; if (resource == NULL) resource = "-"; fprintf(temp_fp, "%s\t%s\t%s\t%s\n", impl_share->sharepath, resource, fstype->name, FSINFO(impl_share, fstype)->shareopts); } fstype = fstype->next; } impl_share = impl_share->next; } fflush(temp_fp); fsync(temp_fd); fclose(temp_fp); (void) rename(tempfile, ZFS_SHARETAB); } typedef struct update_cookie_s { sa_handle_impl_t handle; const char *proto; } update_cookie_t; static int update_zfs_shares_cb(zfs_handle_t *zhp, void *pcookie) { update_cookie_t *udata = (update_cookie_t *)pcookie; char mountpoint[ZFS_MAXPROPLEN]; char shareopts[ZFS_MAXPROPLEN]; char *dataset; zfs_type_t type = zfs_get_type(zhp); if (type == ZFS_TYPE_FILESYSTEM && zfs_iter_filesystems(zhp, update_zfs_shares_cb, pcookie) != 0) { zfs_close(zhp); return (1); } if (type != ZFS_TYPE_FILESYSTEM) { zfs_close(zhp); return (0); } if (zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, mountpoint, sizeof (mountpoint), NULL, NULL, 0, B_FALSE) != 0) { zfs_close(zhp); return (0); } dataset = (char *)zfs_get_name(zhp); if (dataset == NULL) { zfs_close(zhp); return (0); } if (!zfs_is_mounted(zhp, NULL)) { zfs_close(zhp); return (0); } if ((udata->proto == NULL || strcmp(udata->proto, "nfs") == 0) && zfs_prop_get(zhp, ZFS_PROP_SHARENFS, shareopts, sizeof (shareopts), NULL, NULL, 0, B_FALSE) == 0 && strcmp(shareopts, "off") != 0) { (void) process_share(udata->handle, NULL, mountpoint, NULL, "nfs", shareopts, NULL, dataset, B_FALSE); } if ((udata->proto == NULL || strcmp(udata->proto, "smb") == 0) && zfs_prop_get(zhp, ZFS_PROP_SHARESMB, shareopts, sizeof (shareopts), NULL, NULL, 0, B_FALSE) == 0 && strcmp(shareopts, "off") != 0) { (void) process_share(udata->handle, NULL, mountpoint, NULL, "smb", shareopts, NULL, dataset, B_FALSE); } zfs_close(zhp); return (0); } static int update_zfs_share(sa_share_impl_t impl_share, const char *proto) { sa_handle_impl_t impl_handle = impl_share->handle; zfs_handle_t *zhp; update_cookie_t udata; if (impl_handle->zfs_libhandle == NULL) return (SA_SYSTEM_ERR); assert(impl_share->dataset != NULL); zhp = zfs_open(impl_share->handle->zfs_libhandle, impl_share->dataset, ZFS_TYPE_FILESYSTEM); if (zhp == NULL) return (SA_SYSTEM_ERR); udata.handle = impl_handle; udata.proto = proto; (void) update_zfs_shares_cb(zhp, &udata); return (SA_OK); } static int update_zfs_shares(sa_handle_impl_t impl_handle, const char *proto) { update_cookie_t udata; if (impl_handle->zfs_libhandle == NULL) return (SA_SYSTEM_ERR); udata.handle = 
impl_handle; udata.proto = proto; (void) zfs_iter_root(impl_handle->zfs_libhandle, update_zfs_shares_cb, &udata); return (SA_OK); } static int process_share(sa_handle_impl_t impl_handle, sa_share_impl_t impl_share, char *pathname, char *resource, char *proto, char *options, char *description, char *dataset, boolean_t from_sharetab) { struct stat statbuf; int rc; char *resource_dup = NULL, *dataset_dup = NULL; boolean_t new_share; sa_fstype_t *fstype; new_share = B_FALSE; if (impl_share == NULL) impl_share = find_share(impl_handle, pathname); if (impl_share == NULL) { if (lstat(pathname, &statbuf) != 0 || !S_ISDIR(statbuf.st_mode)) return (SA_BAD_PATH); impl_share = alloc_share(pathname); if (impl_share == NULL) { rc = SA_NO_MEMORY; goto err; } new_share = B_TRUE; } if (dataset != NULL) { dataset_dup = strdup(dataset); if (dataset_dup == NULL) { rc = SA_NO_MEMORY; goto err; } } free(impl_share->dataset); impl_share->dataset = dataset_dup; rc = SA_INVALID_PROTOCOL; fstype = fstypes; while (fstype != NULL) { if (strcmp(fstype->name, proto) == 0) { if (resource != NULL) { resource_dup = strdup(resource); if (resource_dup == NULL) { rc = SA_NO_MEMORY; goto err; } } free(FSINFO(impl_share, fstype)->resource); FSINFO(impl_share, fstype)->resource = resource_dup; rc = fstype->ops->update_shareopts(impl_share, resource, options); if (rc == SA_OK && from_sharetab) FSINFO(impl_share, fstype)->active = B_TRUE; break; } fstype = fstype->next; } if (rc != SA_OK) goto err; if (new_share) { impl_share->handle = impl_handle; impl_share->next = impl_handle->shares; impl_handle->shares = impl_share; } err: if (rc != SA_OK) { if (new_share) free_share(impl_share); } return (rc); } void sa_fini(sa_handle_t handle) { sa_handle_impl_t impl_handle = (sa_handle_impl_t)handle; sa_share_impl_t impl_share, next; sa_share_impl_t *pcurr; if (impl_handle == NULL) return; /* * clean up shares which don't have a non-NULL dataset property, * which means they're in sharetab but we couldn't find their * ZFS dataset. 
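/*
 * update_sharetab() earlier relies on the classic write-temp-then-
 * rename idiom, so concurrent readers always see either the old
 * sharetab or the complete new one, never a partial write.  A
 * standalone sketch of the same pattern (needs <stdio.h>, <stdlib.h>
 * and <unistd.h>; the helper name is an assumption):
 */
static int
atomic_replace(const char *path, const char *contents)
{
	char tmppath[256];
	int fd;
	FILE *fp;

	(void) snprintf(tmppath, sizeof (tmppath), "%s.XXXXXX", path);
	if ((fd = mkstemp(tmppath)) < 0)
		return (-1);
	if ((fp = fdopen(fd, "w")) == NULL) {
		(void) close(fd);
		(void) unlink(tmppath);
		return (-1);
	}
	(void) fputs(contents, fp);
	(void) fflush(fp);
	(void) fsync(fd);	/* contents reach disk before the rename */
	(void) fclose(fp);
	return (rename(tmppath, path));	/* atomic within one filesystem */
}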
*/ pcurr = &(impl_handle->shares); impl_share = *pcurr; while (impl_share != NULL) { next = impl_share->next; if (impl_share->dataset == NULL) { /* remove item from the linked list */ *pcurr = next; sa_disable_share(impl_share, NULL); free_share(impl_share); } else { pcurr = &(impl_share->next); } impl_share = next; } update_sharetab(impl_handle); if (impl_handle->zfs_libhandle != NULL) libzfs_fini(impl_handle->zfs_libhandle); impl_share = impl_handle->shares; while (impl_share != NULL) { next = impl_share->next; free_share(impl_share); impl_share = next; } free(impl_handle); } static sa_share_impl_t find_share(sa_handle_impl_t impl_handle, const char *sharepath) { sa_share_impl_t impl_share; impl_share = impl_handle->shares; while (impl_share != NULL) { if (strcmp(impl_share->sharepath, sharepath) == 0) { break; } impl_share = impl_share->next; } return (impl_share); } sa_share_t sa_find_share(sa_handle_t handle, char *sharepath) { return ((sa_share_t)find_share((sa_handle_impl_t)handle, sharepath)); } int sa_enable_share(sa_share_t share, char *protocol) { sa_share_impl_t impl_share = (sa_share_impl_t)share; int rc, ret; boolean_t found_protocol; sa_fstype_t *fstype; #ifdef DEBUG fprintf(stderr, "sa_enable_share: share->sharepath=%s, protocol=%s\n", impl_share->sharepath, protocol); #endif assert(impl_share->handle != NULL); ret = SA_OK; found_protocol = B_FALSE; fstype = fstypes; while (fstype != NULL) { if (protocol == NULL || strcmp(fstype->name, protocol) == 0) { update_zfs_share(impl_share, fstype->name); rc = fstype->ops->enable_share(impl_share); if (rc != SA_OK) ret = rc; else FSINFO(impl_share, fstype)->active = B_TRUE; found_protocol = B_TRUE; } fstype = fstype->next; } update_sharetab(impl_share->handle); return (found_protocol ? ret : SA_INVALID_PROTOCOL); } int sa_disable_share(sa_share_t share, char *protocol) { sa_share_impl_t impl_share = (sa_share_impl_t)share; int rc, ret; boolean_t found_protocol; sa_fstype_t *fstype; #ifdef DEBUG fprintf(stderr, "sa_disable_share: share->sharepath=%s, protocol=%s\n", impl_share->sharepath, protocol); #endif ret = SA_OK; found_protocol = B_FALSE; fstype = fstypes; while (fstype != NULL) { if (protocol == NULL || strcmp(fstype->name, protocol) == 0) { rc = fstype->ops->disable_share(impl_share); if (rc == SA_OK) { fstype->ops->clear_shareopts(impl_share); FSINFO(impl_share, fstype)->active = B_FALSE; } else ret = rc; found_protocol = B_TRUE; } fstype = fstype->next; } update_sharetab(impl_share->handle); return (found_protocol ? 
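/*
 * sa_fini() above walks the share list with a pointer to the previous
 * link ("pcurr"), the standard idiom for deleting from a singly linked
 * list without special-casing the head.  A generic sketch of the same
 * idiom (needs <stdlib.h>; the node type is an assumption):
 */
struct node {
	struct node *next;
	int key;
};

static void
list_remove_all(struct node **head, int key)
{
	struct node **pcurr = head;
	struct node *curr;

	while ((curr = *pcurr) != NULL) {
		if (curr->key == key) {
			*pcurr = curr->next;	/* splice the node out */
			free(curr);
		} else {
			pcurr = &curr->next;	/* advance the link pointer */
		}
	}
}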
ret : SA_INVALID_PROTOCOL); } /* * sa_errorstr(err) * * convert an error value to an error string */ char * sa_errorstr(int err) { static char errstr[32]; char *ret = NULL; switch (err) { case SA_OK: ret = dgettext(TEXT_DOMAIN, "ok"); break; case SA_NO_SUCH_PATH: ret = dgettext(TEXT_DOMAIN, "path doesn't exist"); break; case SA_NO_MEMORY: ret = dgettext(TEXT_DOMAIN, "no memory"); break; case SA_DUPLICATE_NAME: ret = dgettext(TEXT_DOMAIN, "name in use"); break; case SA_BAD_PATH: ret = dgettext(TEXT_DOMAIN, "bad path"); break; case SA_NO_SUCH_GROUP: ret = dgettext(TEXT_DOMAIN, "no such group"); break; case SA_CONFIG_ERR: ret = dgettext(TEXT_DOMAIN, "configuration error"); break; case SA_SYSTEM_ERR: ret = dgettext(TEXT_DOMAIN, "system error"); break; case SA_SYNTAX_ERR: ret = dgettext(TEXT_DOMAIN, "syntax error"); break; case SA_NO_PERMISSION: ret = dgettext(TEXT_DOMAIN, "no permission"); break; case SA_BUSY: ret = dgettext(TEXT_DOMAIN, "busy"); break; case SA_NO_SUCH_PROP: ret = dgettext(TEXT_DOMAIN, "no such property"); break; case SA_INVALID_NAME: ret = dgettext(TEXT_DOMAIN, "invalid name"); break; case SA_INVALID_PROTOCOL: ret = dgettext(TEXT_DOMAIN, "invalid protocol"); break; case SA_NOT_ALLOWED: ret = dgettext(TEXT_DOMAIN, "operation not allowed"); break; case SA_BAD_VALUE: ret = dgettext(TEXT_DOMAIN, "bad property value"); break; case SA_INVALID_SECURITY: ret = dgettext(TEXT_DOMAIN, "invalid security type"); break; case SA_NO_SUCH_SECURITY: ret = dgettext(TEXT_DOMAIN, "security type not found"); break; case SA_VALUE_CONFLICT: ret = dgettext(TEXT_DOMAIN, "property value conflict"); break; case SA_NOT_IMPLEMENTED: ret = dgettext(TEXT_DOMAIN, "not implemented"); break; case SA_INVALID_PATH: ret = dgettext(TEXT_DOMAIN, "invalid path"); break; case SA_NOT_SUPPORTED: ret = dgettext(TEXT_DOMAIN, "operation not supported"); break; case SA_PROP_SHARE_ONLY: ret = dgettext(TEXT_DOMAIN, "property not valid for group"); break; case SA_NOT_SHARED: ret = dgettext(TEXT_DOMAIN, "not shared"); break; case SA_NO_SUCH_RESOURCE: ret = dgettext(TEXT_DOMAIN, "no such resource"); break; case SA_RESOURCE_REQUIRED: ret = dgettext(TEXT_DOMAIN, "resource name required"); break; case SA_MULTIPLE_ERROR: ret = dgettext(TEXT_DOMAIN, "errors from multiple protocols"); break; case SA_PATH_IS_SUBDIR: ret = dgettext(TEXT_DOMAIN, "path is a subpath of share"); break; case SA_PATH_IS_PARENTDIR: ret = dgettext(TEXT_DOMAIN, "path is parent of a share"); break; case SA_NO_SECTION: ret = dgettext(TEXT_DOMAIN, "protocol requires a section"); break; case SA_NO_PROPERTIES: ret = dgettext(TEXT_DOMAIN, "properties not found"); break; case SA_NO_SUCH_SECTION: ret = dgettext(TEXT_DOMAIN, "section not found"); break; case SA_PASSWORD_ENC: ret = dgettext(TEXT_DOMAIN, "passwords must be encrypted"); break; case SA_SHARE_EXISTS: ret = dgettext(TEXT_DOMAIN, "path or file is already shared"); break; default: (void) snprintf(errstr, sizeof (errstr), dgettext(TEXT_DOMAIN, "unknown %d"), err); ret = errstr; } return (ret); } int sa_parse_legacy_options(sa_group_t group, char *options, char *proto) { sa_fstype_t *fstype; #ifdef DEBUG fprintf(stderr, "sa_parse_legacy_options: options=%s, proto=%s\n", options, proto); #endif fstype = fstypes; while (fstype != NULL) { if (strcmp(fstype->name, proto) != 0) { fstype = fstype->next; continue; } return (fstype->ops->validate_shareopts(options)); } return (SA_INVALID_PROTOCOL); } boolean_t sa_needs_refresh(sa_handle_t handle) { return (B_TRUE); } libzfs_handle_t * sa_get_zfs_handle(sa_handle_t 
handle) { sa_handle_impl_t impl_handle = (sa_handle_impl_t)handle; if (impl_handle == NULL) return (NULL); return (impl_handle->zfs_libhandle); } static sa_share_impl_t alloc_share(const char *sharepath) { sa_share_impl_t impl_share; impl_share = calloc(sizeof (struct sa_share_impl), 1); if (impl_share == NULL) return (NULL); impl_share->sharepath = strdup(sharepath); if (impl_share->sharepath == NULL) { free(impl_share); return (NULL); } impl_share->fsinfo = calloc(sizeof (sa_share_fsinfo_t), fstypes_count); if (impl_share->fsinfo == NULL) { free(impl_share->sharepath); free(impl_share); return (NULL); } return (impl_share); } static void -free_share(sa_share_impl_t impl_share) { +free_share(sa_share_impl_t impl_share) +{ sa_fstype_t *fstype; fstype = fstypes; while (fstype != NULL) { fstype->ops->clear_shareopts(impl_share); free(FSINFO(impl_share, fstype)->resource); fstype = fstype->next; } free(impl_share->sharepath); free(impl_share->dataset); free(impl_share->fsinfo); free(impl_share); } int sa_zfs_process_share(sa_handle_t handle, sa_group_t group, sa_share_t share, char *mountpoint, char *proto, zprop_source_t source, char *shareopts, char *sourcestr, char *dataset) { sa_handle_impl_t impl_handle = (sa_handle_impl_t)handle; sa_share_impl_t impl_share = (sa_share_impl_t)share; #ifdef DEBUG fprintf(stderr, "sa_zfs_process_share: mountpoint=%s, proto=%s, " "shareopts=%s, sourcestr=%s, dataset=%s\n", mountpoint, proto, shareopts, sourcestr, dataset); #endif return (process_share(impl_handle, impl_share, mountpoint, NULL, proto, shareopts, NULL, dataset, B_FALSE)); } void sa_update_sharetab_ts(sa_handle_t handle) { sa_handle_impl_t impl_handle = (sa_handle_impl_t)handle; update_sharetab(impl_handle); } diff --git a/lib/libspl/include/sys/byteorder.h b/lib/libspl/include/sys/byteorder.h index 7ef1c42959f8..72d40b1643a8 100644 --- a/lib/libspl/include/sys/byteorder.h +++ b/lib/libspl/include/sys/byteorder.h @@ -1,221 +1,225 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ /* All Rights Reserved */ /* * University Copyright- Copyright (c) 1982, 1986, 1988 * The Regents of the University of California * All Rights Reserved * * University Acknowledgment- Portions of this document are derived from * software developed by the University of California, Berkeley, and its * contributors. 
*/ #ifndef _SYS_BYTEORDER_H #define _SYS_BYTEORDER_H #include #include #if defined(__GNUC__) && defined(_ASM_INLINES) && \ (defined(__i386) || defined(__amd64)) #include #endif #ifdef __cplusplus extern "C" { #endif /* * macros for conversion between host and (internet) network byte order */ #if defined(_BIG_ENDIAN) && !defined(ntohl) && !defined(__lint) /* big-endian */ #define ntohl(x) (x) #define ntohs(x) (x) #define htonl(x) (x) #define htons(x) (x) #elif !defined(ntohl) /* little-endian */ #ifndef _IN_PORT_T #define _IN_PORT_T typedef uint16_t in_port_t; #endif #ifndef _IN_ADDR_T #define _IN_ADDR_T typedef uint32_t in_addr_t; #endif #if !defined(_XPG4_2) || defined(__EXTENSIONS__) || defined(_XPG5) extern uint32_t htonl(uint32_t); extern uint16_t htons(uint16_t); extern uint32_t ntohl(uint32_t); extern uint16_t ntohs(uint16_t); #else extern in_addr_t htonl(in_addr_t); extern in_port_t htons(in_port_t); extern in_addr_t ntohl(in_addr_t); extern in_port_t ntohs(in_port_t); #endif /* !defined(_XPG4_2) || defined(__EXTENSIONS__) || defined(_XPG5) */ #endif #if !defined(_XPG4_2) || defined(__EXTENSIONS__) /* * Macros to reverse byte order */ #define BSWAP_8(x) ((x) & 0xff) #define BSWAP_16(x) ((BSWAP_8(x) << 8) | BSWAP_8((x) >> 8)) #define BSWAP_32(x) ((BSWAP_16(x) << 16) | BSWAP_16((x) >> 16)) #define BSWAP_64(x) ((BSWAP_32(x) << 32) | BSWAP_32((x) >> 32)) #define BMASK_8(x) ((x) & 0xff) #define BMASK_16(x) ((x) & 0xffff) #define BMASK_32(x) ((x) & 0xffffffff) #define BMASK_64(x) (x) /* * Macros to convert from a specific byte order to/from native byte order */ #ifdef _BIG_ENDIAN #define BE_8(x) BMASK_8(x) #define BE_16(x) BMASK_16(x) #define BE_32(x) BMASK_32(x) #define BE_64(x) BMASK_64(x) #define LE_8(x) BSWAP_8(x) #define LE_16(x) BSWAP_16(x) #define LE_32(x) BSWAP_32(x) #define LE_64(x) BSWAP_64(x) #else #define LE_8(x) BMASK_8(x) #define LE_16(x) BMASK_16(x) #define LE_32(x) BMASK_32(x) #define LE_64(x) BMASK_64(x) #define BE_8(x) BSWAP_8(x) #define BE_16(x) BSWAP_16(x) #define BE_32(x) BSWAP_32(x) #define BE_64(x) BSWAP_64(x) #endif #ifdef _BIG_ENDIAN static __inline__ uint64_t -htonll(uint64_t n) { +htonll(uint64_t n) +{ return (n); } static __inline__ uint64_t -ntohll(uint64_t n) { +ntohll(uint64_t n) +{ return (n); } #else static __inline__ uint64_t -htonll(uint64_t n) { +htonll(uint64_t n) +{ return ((((uint64_t)htonl(n)) << 32) + htonl(n >> 32)); } static __inline__ uint64_t -ntohll(uint64_t n) { +ntohll(uint64_t n) +{ return ((((uint64_t)ntohl(n)) << 32) + ntohl(n >> 32)); } #endif /* * Macros to read unaligned values from a specific byte order to * native byte order */ #define BE_IN8(xa) \ *((uint8_t *)(xa)) #define BE_IN16(xa) \ (((uint16_t)BE_IN8(xa) << 8) | BE_IN8((uint8_t *)(xa)+1)) #define BE_IN32(xa) \ (((uint32_t)BE_IN16(xa) << 16) | BE_IN16((uint8_t *)(xa)+2)) #define BE_IN64(xa) \ (((uint64_t)BE_IN32(xa) << 32) | BE_IN32((uint8_t *)(xa)+4)) #define LE_IN8(xa) \ *((uint8_t *)(xa)) #define LE_IN16(xa) \ (((uint16_t)LE_IN8((uint8_t *)(xa) + 1) << 8) | LE_IN8(xa)) #define LE_IN32(xa) \ (((uint32_t)LE_IN16((uint8_t *)(xa) + 2) << 16) | LE_IN16(xa)) #define LE_IN64(xa) \ (((uint64_t)LE_IN32((uint8_t *)(xa) + 4) << 32) | LE_IN32(xa)) /* * Macros to write unaligned values from native byte order to a specific byte * order. 
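/*
 * On little-endian hosts the htonll() above swaps each 32-bit half
 * with htonl() and exchanges the halves; on big-endian hosts it is the
 * identity.  Either way the result is laid out in big-endian (network)
 * order in memory, which a short sketch can demonstrate:
 */
static __inline__ int
htonll_example(void)
{
	uint64_t wire = htonll(0x0102030405060708ULL);
	uint8_t *p = (uint8_t *)&wire;

	/* The most significant byte, 0x01, now comes first in memory. */
	return (p[0] == 0x01 && p[7] == 0x08);
}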
*/ #define BE_OUT8(xa, yv) *((uint8_t *)(xa)) = (uint8_t)(yv); #define BE_OUT16(xa, yv) \ BE_OUT8((uint8_t *)(xa) + 1, yv); \ BE_OUT8((uint8_t *)(xa), (yv) >> 8); #define BE_OUT32(xa, yv) \ BE_OUT16((uint8_t *)(xa) + 2, yv); \ BE_OUT16((uint8_t *)(xa), (yv) >> 16); #define BE_OUT64(xa, yv) \ BE_OUT32((uint8_t *)(xa) + 4, yv); \ BE_OUT32((uint8_t *)(xa), (yv) >> 32); #define LE_OUT8(xa, yv) *((uint8_t *)(xa)) = (uint8_t)(yv); #define LE_OUT16(xa, yv) \ LE_OUT8((uint8_t *)(xa), yv); \ LE_OUT8((uint8_t *)(xa) + 1, (yv) >> 8); #define LE_OUT32(xa, yv) \ LE_OUT16((uint8_t *)(xa), yv); \ LE_OUT16((uint8_t *)(xa) + 2, (yv) >> 16); #define LE_OUT64(xa, yv) \ LE_OUT32((uint8_t *)(xa), yv); \ LE_OUT32((uint8_t *)(xa) + 4, (yv) >> 32); #endif /* !defined(_XPG4_2) || defined(__EXTENSIONS__) */ #ifdef __cplusplus } #endif #endif /* _SYS_BYTEORDER_H */ diff --git a/lib/libspl/include/umem.h b/lib/libspl/include/umem.h index a89cb4950db2..c63026ceed1a 100644 --- a/lib/libspl/include/umem.h +++ b/lib/libspl/include/umem.h @@ -1,203 +1,204 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License, Version 1.0 only * (the "License"). You may not use this file except in compliance * with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _LIBSPL_UMEM_H #define _LIBSPL_UMEM_H /* * XXX: We should use the real portable umem library if it is detected * at configure time. However, if the library is not available, we can * use a trivial malloc based implementation. This obviously impacts * performance, but unless you are using a full userspace build of zpool for * something other than ztest, you are likely not going to notice or care. 
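/*
 * The BE_OUT* and LE_OUT* macros below store one byte at a time, so
 * the target address needs no alignment.  Note that they expand to a
 * sequence of statements rather than a single expression, so they must
 * be braced when used in an un-braced if/else.  A round-trip sketch
 * over a deliberately misaligned offset (buffer must hold at least
 * seven bytes):
 */
static __inline__ uint32_t
be32_roundtrip(uint8_t *buf, uint32_t len)
{
	BE_OUT32(buf + 3, len);		/* four byte stores, any alignment */
	return (BE_IN32(buf + 3));	/* reassembles the same value */
}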
* * https://labs.omniti.com/trac/portableumem */ #include #include #ifdef __cplusplus extern "C" { #endif typedef void vmem_t; /* * Flags for umem_alloc/umem_free */ #define UMEM_DEFAULT 0x0000 /* normal -- may fail */ #define UMEM_NOFAIL 0x0100 /* Never fails */ /* * Flags for umem_cache_create() */ #define UMC_NOTOUCH 0x00010000 #define UMC_NODEBUG 0x00020000 #define UMC_NOMAGAZINE 0x00040000 #define UMC_NOHASH 0x00080000 #define UMEM_CACHE_NAMELEN 31 typedef int umem_nofail_callback_t(void); typedef int umem_constructor_t(void *, void *, int); typedef void umem_destructor_t(void *, void *); typedef void umem_reclaim_t(void *); typedef struct umem_cache { char cache_name[UMEM_CACHE_NAMELEN + 1]; size_t cache_bufsize; size_t cache_align; umem_constructor_t *cache_constructor; umem_destructor_t *cache_destructor; umem_reclaim_t *cache_reclaim; void *cache_private; void *cache_arena; int cache_cflags; } umem_cache_t; static inline void * umem_alloc(size_t size, int flags) { void *ptr = NULL; do { ptr = malloc(size); } while (ptr == NULL && (flags & UMEM_NOFAIL)); return (ptr); } static inline void * umem_alloc_aligned(size_t size, size_t align, int flags) { void *ptr = NULL; int rc = EINVAL; do { rc = posix_memalign(&ptr, align, size); } while (rc == ENOMEM && (flags & UMEM_NOFAIL)); if (rc == EINVAL) { fprintf(stderr, "%s: invalid memory alignment (%zd)\n", __func__, align); if (flags & UMEM_NOFAIL) abort(); return (NULL); } return (ptr); } static inline void * umem_zalloc(size_t size, int flags) { void *ptr = NULL; ptr = umem_alloc(size, flags); if (ptr) memset(ptr, 0, size); return (ptr); } static inline void umem_free(void *ptr, size_t size) { free(ptr); } static inline void -umem_nofail_callback(umem_nofail_callback_t *cb) {} +umem_nofail_callback(umem_nofail_callback_t *cb) +{} static inline umem_cache_t * umem_cache_create( char *name, size_t bufsize, size_t align, umem_constructor_t *constructor, umem_destructor_t *destructor, umem_reclaim_t *reclaim, void *priv, void *vmp, int cflags) { umem_cache_t *cp; cp = umem_alloc(sizeof (umem_cache_t), UMEM_DEFAULT); if (cp) { strncpy(cp->cache_name, name, UMEM_CACHE_NAMELEN); cp->cache_bufsize = bufsize; cp->cache_align = align; cp->cache_constructor = constructor; cp->cache_destructor = destructor; cp->cache_reclaim = reclaim; cp->cache_private = priv; cp->cache_arena = vmp; cp->cache_cflags = cflags; } return (cp); } static inline void umem_cache_destroy(umem_cache_t *cp) { umem_free(cp, sizeof (umem_cache_t)); } static inline void * umem_cache_alloc(umem_cache_t *cp, int flags) { void *ptr = NULL; if (cp->cache_align != 0) ptr = umem_alloc_aligned( cp->cache_bufsize, cp->cache_align, flags); else ptr = umem_alloc(cp->cache_bufsize, flags); if (ptr && cp->cache_constructor) cp->cache_constructor(ptr, cp->cache_private, UMEM_DEFAULT); return (ptr); } static inline void umem_cache_free(umem_cache_t *cp, void *ptr) { if (cp->cache_destructor) cp->cache_destructor(ptr, cp->cache_private); umem_free(ptr, cp->cache_bufsize); } static inline void umem_cache_reap_now(umem_cache_t *cp) { } #ifdef __cplusplus } #endif #endif diff --git a/lib/libzpool/taskq.c b/lib/libzpool/taskq.c index 791d509819ea..ae67906fe008 100644 --- a/lib/libzpool/taskq.c +++ b/lib/libzpool/taskq.c @@ -1,368 +1,368 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. 
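/*
 * A usage sketch for the malloc-backed cache shim above (the object
 * type and names are assumptions).  With this trivial implementation
 * the constructor runs on every umem_cache_alloc() rather than only
 * when a buffer first enters the cache, which is good enough for
 * userspace consumers such as ztest.
 */
typedef struct widget {
	int w_state;
} widget_t;

static inline int
widget_ctor(void *buf, void *priv, int kmflags)
{
	((widget_t *)buf)->w_state = 0;	/* priv and kmflags unused here */
	return (0);
}

static inline void
widget_example(void)
{
	umem_cache_t *wc;
	widget_t *w;

	wc = umem_cache_create("widget_cache", sizeof (widget_t), 0,
	    widget_ctor, NULL, NULL, NULL, NULL, 0);
	if (wc == NULL)
		return;
	w = umem_cache_alloc(wc, UMEM_NOFAIL);
	w->w_state = 1;
	umem_cache_free(wc, w);
	umem_cache_destroy(wc);
}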
* * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright 2012 Garrett D'Amore . All rights reserved. * Copyright (c) 2014 by Delphix. All rights reserved. */ #include int taskq_now; taskq_t *system_taskq; taskq_t *system_delay_taskq; #define TASKQ_ACTIVE 0x00010000 static taskq_ent_t * task_alloc(taskq_t *tq, int tqflags) { taskq_ent_t *t; int rv; again: if ((t = tq->tq_freelist) != NULL && tq->tq_nalloc >= tq->tq_minalloc) { ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC)); tq->tq_freelist = t->tqent_next; } else { if (tq->tq_nalloc >= tq->tq_maxalloc) { if (!(tqflags & KM_SLEEP)) return (NULL); /* * We don't want to exceed tq_maxalloc, but we can't * wait for other tasks to complete (and thus free up * task structures) without risking deadlock with * the caller. So, we just delay for one second * to throttle the allocation rate. If we have tasks * complete before one second timeout expires then * taskq_ent_free will signal us and we will * immediately retry the allocation. */ tq->tq_maxalloc_wait++; rv = cv_timedwait(&tq->tq_maxalloc_cv, &tq->tq_lock, ddi_get_lbolt() + hz); tq->tq_maxalloc_wait--; if (rv > 0) goto again; /* signaled */ } mutex_exit(&tq->tq_lock); t = kmem_alloc(sizeof (taskq_ent_t), tqflags); mutex_enter(&tq->tq_lock); if (t != NULL) { /* Make sure we start without any flags */ t->tqent_flags = 0; tq->tq_nalloc++; } } return (t); } static void task_free(taskq_t *tq, taskq_ent_t *t) { if (tq->tq_nalloc <= tq->tq_minalloc) { t->tqent_next = tq->tq_freelist; tq->tq_freelist = t; } else { tq->tq_nalloc--; mutex_exit(&tq->tq_lock); kmem_free(t, sizeof (taskq_ent_t)); mutex_enter(&tq->tq_lock); } if (tq->tq_maxalloc_wait) cv_signal(&tq->tq_maxalloc_cv); } taskqid_t taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t tqflags) { taskq_ent_t *t; if (taskq_now) { func(arg); return (1); } mutex_enter(&tq->tq_lock); ASSERT(tq->tq_flags & TASKQ_ACTIVE); if ((t = task_alloc(tq, tqflags)) == NULL) { mutex_exit(&tq->tq_lock); return (0); } if (tqflags & TQ_FRONT) { t->tqent_next = tq->tq_task.tqent_next; t->tqent_prev = &tq->tq_task; } else { t->tqent_next = &tq->tq_task; t->tqent_prev = tq->tq_task.tqent_prev; } t->tqent_next->tqent_prev = t; t->tqent_prev->tqent_next = t; t->tqent_func = func; t->tqent_arg = arg; t->tqent_flags = 0; cv_signal(&tq->tq_dispatch_cv); mutex_exit(&tq->tq_lock); return (1); } taskqid_t taskq_dispatch_delay(taskq_t *tq, task_func_t func, void *arg, uint_t tqflags, clock_t expire_time) { return (0); } int taskq_empty_ent(taskq_ent_t *t) { return (t->tqent_next == NULL); } void taskq_init_ent(taskq_ent_t *t) { t->tqent_next = NULL; t->tqent_prev = NULL; t->tqent_func = NULL; t->tqent_arg = NULL; t->tqent_flags = 0; } void taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags, taskq_ent_t *t) { ASSERT(func != NULL); /* * Mark it as a 
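/*
 * taskq_dispatch() below threads entries onto a circular, doubly
 * linked list whose head (tq_task) is a sentinel, so front and tail
 * insertion are the same four pointer stores with no NULL checks.  A
 * generic sketch of the idiom (type and function names are
 * assumptions): inserting before the sentinel appends at the tail (the
 * default case), while inserting before the sentinel's successor
 * prepends at the front (the TQ_FRONT case).
 */
typedef struct qent {
	struct qent *next;
	struct qent *prev;
} qent_t;

static void
list_insert_before(qent_t *pos, qent_t *e)
{
	e->next = pos;
	e->prev = pos->prev;
	e->next->prev = e;	/* fix the neighbor behind us */
	e->prev->next = e;	/* fix the neighbor ahead of us */
}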
prealloc'd task. This is important * to ensure that we don't free it later. */ t->tqent_flags |= TQENT_FLAG_PREALLOC; /* * Enqueue the task to the underlying queue. */ mutex_enter(&tq->tq_lock); if (flags & TQ_FRONT) { t->tqent_next = tq->tq_task.tqent_next; t->tqent_prev = &tq->tq_task; } else { t->tqent_next = &tq->tq_task; t->tqent_prev = tq->tq_task.tqent_prev; } t->tqent_next->tqent_prev = t; t->tqent_prev->tqent_next = t; t->tqent_func = func; t->tqent_arg = arg; cv_signal(&tq->tq_dispatch_cv); mutex_exit(&tq->tq_lock); } void taskq_wait(taskq_t *tq) { mutex_enter(&tq->tq_lock); while (tq->tq_task.tqent_next != &tq->tq_task || tq->tq_active != 0) cv_wait(&tq->tq_wait_cv, &tq->tq_lock); mutex_exit(&tq->tq_lock); } void taskq_wait_id(taskq_t *tq, taskqid_t id) { taskq_wait(tq); } void taskq_wait_outstanding(taskq_t *tq, taskqid_t id) { taskq_wait(tq); } static void taskq_thread(void *arg) { taskq_t *tq = arg; taskq_ent_t *t; boolean_t prealloc; mutex_enter(&tq->tq_lock); while (tq->tq_flags & TASKQ_ACTIVE) { if ((t = tq->tq_task.tqent_next) == &tq->tq_task) { if (--tq->tq_active == 0) cv_broadcast(&tq->tq_wait_cv); cv_wait(&tq->tq_dispatch_cv, &tq->tq_lock); tq->tq_active++; continue; } t->tqent_prev->tqent_next = t->tqent_next; t->tqent_next->tqent_prev = t->tqent_prev; t->tqent_next = NULL; t->tqent_prev = NULL; prealloc = t->tqent_flags & TQENT_FLAG_PREALLOC; mutex_exit(&tq->tq_lock); rw_enter(&tq->tq_threadlock, RW_READER); t->tqent_func(t->tqent_arg); rw_exit(&tq->tq_threadlock); mutex_enter(&tq->tq_lock); if (!prealloc) task_free(tq, t); } tq->tq_nthreads--; cv_broadcast(&tq->tq_wait_cv); mutex_exit(&tq->tq_lock); thread_exit(); } /*ARGSUSED*/ taskq_t * taskq_create(const char *name, int nthreads, pri_t pri, - int minalloc, int maxalloc, uint_t flags) + int minalloc, int maxalloc, uint_t flags) { taskq_t *tq = kmem_zalloc(sizeof (taskq_t), KM_SLEEP); int t; if (flags & TASKQ_THREADS_CPU_PCT) { int pct; ASSERT3S(nthreads, >=, 0); ASSERT3S(nthreads, <=, 100); pct = MIN(nthreads, 100); pct = MAX(pct, 0); nthreads = (sysconf(_SC_NPROCESSORS_ONLN) * pct) / 100; nthreads = MAX(nthreads, 1); /* need at least 1 thread */ } else { ASSERT3S(nthreads, >=, 1); } rw_init(&tq->tq_threadlock, NULL, RW_DEFAULT, NULL); mutex_init(&tq->tq_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&tq->tq_dispatch_cv, NULL, CV_DEFAULT, NULL); cv_init(&tq->tq_wait_cv, NULL, CV_DEFAULT, NULL); cv_init(&tq->tq_maxalloc_cv, NULL, CV_DEFAULT, NULL); (void) strncpy(tq->tq_name, name, TASKQ_NAMELEN); tq->tq_flags = flags | TASKQ_ACTIVE; tq->tq_active = nthreads; tq->tq_nthreads = nthreads; tq->tq_minalloc = minalloc; tq->tq_maxalloc = maxalloc; tq->tq_task.tqent_next = &tq->tq_task; tq->tq_task.tqent_prev = &tq->tq_task; tq->tq_threadlist = kmem_alloc(nthreads * sizeof (kthread_t *), KM_SLEEP); if (flags & TASKQ_PREPOPULATE) { mutex_enter(&tq->tq_lock); while (minalloc-- > 0) task_free(tq, task_alloc(tq, KM_SLEEP)); mutex_exit(&tq->tq_lock); } for (t = 0; t < nthreads; t++) VERIFY((tq->tq_threadlist[t] = thread_create(NULL, 0, taskq_thread, tq, 0, &p0, TS_RUN, pri)) != NULL); return (tq); } void taskq_destroy(taskq_t *tq) { int nthreads = tq->tq_nthreads; taskq_wait(tq); mutex_enter(&tq->tq_lock); tq->tq_flags &= ~TASKQ_ACTIVE; cv_broadcast(&tq->tq_dispatch_cv); while (tq->tq_nthreads != 0) cv_wait(&tq->tq_wait_cv, &tq->tq_lock); tq->tq_minalloc = 0; while (tq->tq_nalloc != 0) { ASSERT(tq->tq_freelist != NULL); task_free(tq, task_alloc(tq, KM_SLEEP)); } mutex_exit(&tq->tq_lock); kmem_free(tq->tq_threadlist, nthreads * 
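/*
 * With TASKQ_THREADS_CPU_PCT, taskq_create() above reinterprets its
 * nthreads argument as a percentage of online CPUs: on an 8-CPU
 * machine a "75%" taskq gets (8 * 75) / 100 == 6 threads, and the
 * MAX() floor guarantees at least one.  The same computation as a
 * standalone sketch (needs <unistd.h> for sysconf(); clamping of pct
 * to 0..100 is omitted here):
 */
static int
taskq_threads_from_pct(int pct)
{
	long ncpus = sysconf(_SC_NPROCESSORS_ONLN);
	int nthreads = (int)((ncpus * pct) / 100);

	return (nthreads >= 1 ? nthreads : 1);
}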
diff --git a/module/icp/algs/aes/aes_impl.c b/module/icp/algs/aes/aes_impl.c
index a68a02cdfddd..8592386ddbe7 100644
--- a/module/icp/algs/aes/aes_impl.c
+++ b/module/icp/algs/aes/aes_impl.c
@@ -1,1617 +1,1619 @@
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
 */

#include
#include
#include
#include

#ifdef __amd64

#ifdef _KERNEL
/* Workaround for no XMM kernel thread save/restore */
#define	KPREEMPT_DISABLE	kpreempt_disable()
#define	KPREEMPT_ENABLE		kpreempt_enable()
#else
#define	KPREEMPT_DISABLE
#define	KPREEMPT_ENABLE
#endif	/* _KERNEL */
#endif	/* __amd64 */

/*
 * This file is derived from the file rijndael-alg-fst.c taken from the
 * "optimized C code v3.0" on the "rijndael home page"
 * http://www.iaik.tu-graz.ac.at/research/krypto/AES/old/~rijmen/rijndael/
 * pointed to by the NIST web-site http://csrc.nist.gov/archive/aes/
 *
 * The following note is from the original file:
 */

/*
 * rijndael-alg-fst.c
 *
 * @version 3.0 (December 2000)
 *
 * Optimised ANSI C code for the Rijndael cipher (now AES)
 *
 * @author Vincent Rijmen
 * @author Antoon Bosselaers
 * @author Paulo Barreto
 *
 * This code is hereby placed in the public domain.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #if defined(__amd64) /* These functions are used to execute amd64 instructions for AMD or Intel: */ extern int rijndael_key_setup_enc_amd64(uint32_t rk[], const uint32_t cipherKey[], int keyBits); extern int rijndael_key_setup_dec_amd64(uint32_t rk[], const uint32_t cipherKey[], int keyBits); extern void aes_encrypt_amd64(const uint32_t rk[], int Nr, const uint32_t pt[4], uint32_t ct[4]); extern void aes_decrypt_amd64(const uint32_t rk[], int Nr, const uint32_t ct[4], uint32_t pt[4]); /* These functions are used to execute Intel-specific AES-NI instructions: */ extern int rijndael_key_setup_enc_intel(uint32_t rk[], const uint32_t cipherKey[], uint64_t keyBits); extern int rijndael_key_setup_dec_intel(uint32_t rk[], const uint32_t cipherKey[], uint64_t keyBits); extern void aes_encrypt_intel(const uint32_t rk[], int Nr, const uint32_t pt[4], uint32_t ct[4]); extern void aes_decrypt_intel(const uint32_t rk[], int Nr, const uint32_t ct[4], uint32_t pt[4]); static int intel_aes_instructions_present(void); #define AES_ENCRYPT_IMPL(a, b, c, d, e) rijndael_encrypt(a, b, c, d, e) #define AES_DECRYPT_IMPL(a, b, c, d, e) rijndael_decrypt(a, b, c, d, e) #else /* Generic C implementation */ #define AES_ENCRYPT_IMPL(a, b, c, d, e) rijndael_encrypt(a, b, c, d) #define AES_DECRYPT_IMPL(a, b, c, d, e) rijndael_decrypt(a, b, c, d) #define rijndael_key_setup_enc_raw rijndael_key_setup_enc #endif /* __amd64 */ #if defined(_LITTLE_ENDIAN) && !defined(__amd64) #define AES_BYTE_SWAP #endif #if !defined(__amd64) /* * Constant tables */ /* * Te0[x] = S [x].[02, 01, 01, 03]; * Te1[x] = S [x].[03, 02, 01, 01]; * Te2[x] = S [x].[01, 03, 02, 01]; * Te3[x] = S [x].[01, 01, 03, 02]; * Te4[x] = S [x].[01, 01, 01, 01]; * * Td0[x] = Si[x].[0e, 09, 0d, 0b]; * Td1[x] = Si[x].[0b, 0e, 09, 0d]; * Td2[x] = Si[x].[0d, 0b, 0e, 09]; * Td3[x] = Si[x].[09, 0d, 0b, 0e]; * Td4[x] = Si[x].[01, 01, 01, 01]; */ /* Encrypt Sbox constants (for the substitute bytes operation) */ static const uint32_t Te0[256] = { 0xc66363a5U, 0xf87c7c84U, 0xee777799U, 0xf67b7b8dU, 0xfff2f20dU, 0xd66b6bbdU, 0xde6f6fb1U, 0x91c5c554U, 0x60303050U, 0x02010103U, 0xce6767a9U, 0x562b2b7dU, 0xe7fefe19U, 0xb5d7d762U, 0x4dababe6U, 0xec76769aU, 0x8fcaca45U, 0x1f82829dU, 0x89c9c940U, 0xfa7d7d87U, 0xeffafa15U, 0xb25959ebU, 0x8e4747c9U, 0xfbf0f00bU, 0x41adadecU, 0xb3d4d467U, 0x5fa2a2fdU, 0x45afafeaU, 0x239c9cbfU, 0x53a4a4f7U, 0xe4727296U, 0x9bc0c05bU, 0x75b7b7c2U, 0xe1fdfd1cU, 0x3d9393aeU, 0x4c26266aU, 0x6c36365aU, 0x7e3f3f41U, 0xf5f7f702U, 0x83cccc4fU, 0x6834345cU, 0x51a5a5f4U, 0xd1e5e534U, 0xf9f1f108U, 0xe2717193U, 0xabd8d873U, 0x62313153U, 0x2a15153fU, 0x0804040cU, 0x95c7c752U, 0x46232365U, 0x9dc3c35eU, 0x30181828U, 0x379696a1U, 0x0a05050fU, 0x2f9a9ab5U, 0x0e070709U, 0x24121236U, 0x1b80809bU, 0xdfe2e23dU, 0xcdebeb26U, 0x4e272769U, 0x7fb2b2cdU, 0xea75759fU, 0x1209091bU, 0x1d83839eU, 0x582c2c74U, 0x341a1a2eU, 0x361b1b2dU, 0xdc6e6eb2U, 0xb45a5aeeU, 0x5ba0a0fbU, 0xa45252f6U, 0x763b3b4dU, 0xb7d6d661U, 0x7db3b3ceU, 0x5229297bU, 0xdde3e33eU, 0x5e2f2f71U, 
0x13848497U, 0xa65353f5U, 0xb9d1d168U, 0x00000000U, 0xc1eded2cU, 0x40202060U, 0xe3fcfc1fU, 0x79b1b1c8U, 0xb65b5bedU, 0xd46a6abeU, 0x8dcbcb46U, 0x67bebed9U, 0x7239394bU, 0x944a4adeU, 0x984c4cd4U, 0xb05858e8U, 0x85cfcf4aU, 0xbbd0d06bU, 0xc5efef2aU, 0x4faaaae5U, 0xedfbfb16U, 0x864343c5U, 0x9a4d4dd7U, 0x66333355U, 0x11858594U, 0x8a4545cfU, 0xe9f9f910U, 0x04020206U, 0xfe7f7f81U, 0xa05050f0U, 0x783c3c44U, 0x259f9fbaU, 0x4ba8a8e3U, 0xa25151f3U, 0x5da3a3feU, 0x804040c0U, 0x058f8f8aU, 0x3f9292adU, 0x219d9dbcU, 0x70383848U, 0xf1f5f504U, 0x63bcbcdfU, 0x77b6b6c1U, 0xafdada75U, 0x42212163U, 0x20101030U, 0xe5ffff1aU, 0xfdf3f30eU, 0xbfd2d26dU, 0x81cdcd4cU, 0x180c0c14U, 0x26131335U, 0xc3ecec2fU, 0xbe5f5fe1U, 0x359797a2U, 0x884444ccU, 0x2e171739U, 0x93c4c457U, 0x55a7a7f2U, 0xfc7e7e82U, 0x7a3d3d47U, 0xc86464acU, 0xba5d5de7U, 0x3219192bU, 0xe6737395U, 0xc06060a0U, 0x19818198U, 0x9e4f4fd1U, 0xa3dcdc7fU, 0x44222266U, 0x542a2a7eU, 0x3b9090abU, 0x0b888883U, 0x8c4646caU, 0xc7eeee29U, 0x6bb8b8d3U, 0x2814143cU, 0xa7dede79U, 0xbc5e5ee2U, 0x160b0b1dU, 0xaddbdb76U, 0xdbe0e03bU, 0x64323256U, 0x743a3a4eU, 0x140a0a1eU, 0x924949dbU, 0x0c06060aU, 0x4824246cU, 0xb85c5ce4U, 0x9fc2c25dU, 0xbdd3d36eU, 0x43acacefU, 0xc46262a6U, 0x399191a8U, 0x319595a4U, 0xd3e4e437U, 0xf279798bU, 0xd5e7e732U, 0x8bc8c843U, 0x6e373759U, 0xda6d6db7U, 0x018d8d8cU, 0xb1d5d564U, 0x9c4e4ed2U, 0x49a9a9e0U, 0xd86c6cb4U, 0xac5656faU, 0xf3f4f407U, 0xcfeaea25U, 0xca6565afU, 0xf47a7a8eU, 0x47aeaee9U, 0x10080818U, 0x6fbabad5U, 0xf0787888U, 0x4a25256fU, 0x5c2e2e72U, 0x381c1c24U, 0x57a6a6f1U, 0x73b4b4c7U, 0x97c6c651U, 0xcbe8e823U, 0xa1dddd7cU, 0xe874749cU, 0x3e1f1f21U, 0x964b4bddU, 0x61bdbddcU, 0x0d8b8b86U, 0x0f8a8a85U, 0xe0707090U, 0x7c3e3e42U, 0x71b5b5c4U, 0xcc6666aaU, 0x904848d8U, 0x06030305U, 0xf7f6f601U, 0x1c0e0e12U, 0xc26161a3U, 0x6a35355fU, 0xae5757f9U, 0x69b9b9d0U, 0x17868691U, 0x99c1c158U, 0x3a1d1d27U, 0x279e9eb9U, 0xd9e1e138U, 0xebf8f813U, 0x2b9898b3U, 0x22111133U, 0xd26969bbU, 0xa9d9d970U, 0x078e8e89U, 0x339494a7U, 0x2d9b9bb6U, 0x3c1e1e22U, 0x15878792U, 0xc9e9e920U, 0x87cece49U, 0xaa5555ffU, 0x50282878U, 0xa5dfdf7aU, 0x038c8c8fU, 0x59a1a1f8U, 0x09898980U, 0x1a0d0d17U, 0x65bfbfdaU, 0xd7e6e631U, 0x844242c6U, 0xd06868b8U, 0x824141c3U, 0x299999b0U, 0x5a2d2d77U, 0x1e0f0f11U, 0x7bb0b0cbU, 0xa85454fcU, 0x6dbbbbd6U, 0x2c16163aU }; static const uint32_t Te1[256] = { 0xa5c66363U, 0x84f87c7cU, 0x99ee7777U, 0x8df67b7bU, 0x0dfff2f2U, 0xbdd66b6bU, 0xb1de6f6fU, 0x5491c5c5U, 0x50603030U, 0x03020101U, 0xa9ce6767U, 0x7d562b2bU, 0x19e7fefeU, 0x62b5d7d7U, 0xe64dababU, 0x9aec7676U, 0x458fcacaU, 0x9d1f8282U, 0x4089c9c9U, 0x87fa7d7dU, 0x15effafaU, 0xebb25959U, 0xc98e4747U, 0x0bfbf0f0U, 0xec41adadU, 0x67b3d4d4U, 0xfd5fa2a2U, 0xea45afafU, 0xbf239c9cU, 0xf753a4a4U, 0x96e47272U, 0x5b9bc0c0U, 0xc275b7b7U, 0x1ce1fdfdU, 0xae3d9393U, 0x6a4c2626U, 0x5a6c3636U, 0x417e3f3fU, 0x02f5f7f7U, 0x4f83ccccU, 0x5c683434U, 0xf451a5a5U, 0x34d1e5e5U, 0x08f9f1f1U, 0x93e27171U, 0x73abd8d8U, 0x53623131U, 0x3f2a1515U, 0x0c080404U, 0x5295c7c7U, 0x65462323U, 0x5e9dc3c3U, 0x28301818U, 0xa1379696U, 0x0f0a0505U, 0xb52f9a9aU, 0x090e0707U, 0x36241212U, 0x9b1b8080U, 0x3ddfe2e2U, 0x26cdebebU, 0x694e2727U, 0xcd7fb2b2U, 0x9fea7575U, 0x1b120909U, 0x9e1d8383U, 0x74582c2cU, 0x2e341a1aU, 0x2d361b1bU, 0xb2dc6e6eU, 0xeeb45a5aU, 0xfb5ba0a0U, 0xf6a45252U, 0x4d763b3bU, 0x61b7d6d6U, 0xce7db3b3U, 0x7b522929U, 0x3edde3e3U, 0x715e2f2fU, 0x97138484U, 0xf5a65353U, 0x68b9d1d1U, 0x00000000U, 0x2cc1ededU, 0x60402020U, 0x1fe3fcfcU, 0xc879b1b1U, 0xedb65b5bU, 0xbed46a6aU, 0x468dcbcbU, 0xd967bebeU, 0x4b723939U, 0xde944a4aU, 
0xd4984c4cU, 0xe8b05858U, 0x4a85cfcfU, 0x6bbbd0d0U, 0x2ac5efefU, 0xe54faaaaU, 0x16edfbfbU, 0xc5864343U, 0xd79a4d4dU, 0x55663333U, 0x94118585U, 0xcf8a4545U, 0x10e9f9f9U, 0x06040202U, 0x81fe7f7fU, 0xf0a05050U, 0x44783c3cU, 0xba259f9fU, 0xe34ba8a8U, 0xf3a25151U, 0xfe5da3a3U, 0xc0804040U, 0x8a058f8fU, 0xad3f9292U, 0xbc219d9dU, 0x48703838U, 0x04f1f5f5U, 0xdf63bcbcU, 0xc177b6b6U, 0x75afdadaU, 0x63422121U, 0x30201010U, 0x1ae5ffffU, 0x0efdf3f3U, 0x6dbfd2d2U, 0x4c81cdcdU, 0x14180c0cU, 0x35261313U, 0x2fc3ececU, 0xe1be5f5fU, 0xa2359797U, 0xcc884444U, 0x392e1717U, 0x5793c4c4U, 0xf255a7a7U, 0x82fc7e7eU, 0x477a3d3dU, 0xacc86464U, 0xe7ba5d5dU, 0x2b321919U, 0x95e67373U, 0xa0c06060U, 0x98198181U, 0xd19e4f4fU, 0x7fa3dcdcU, 0x66442222U, 0x7e542a2aU, 0xab3b9090U, 0x830b8888U, 0xca8c4646U, 0x29c7eeeeU, 0xd36bb8b8U, 0x3c281414U, 0x79a7dedeU, 0xe2bc5e5eU, 0x1d160b0bU, 0x76addbdbU, 0x3bdbe0e0U, 0x56643232U, 0x4e743a3aU, 0x1e140a0aU, 0xdb924949U, 0x0a0c0606U, 0x6c482424U, 0xe4b85c5cU, 0x5d9fc2c2U, 0x6ebdd3d3U, 0xef43acacU, 0xa6c46262U, 0xa8399191U, 0xa4319595U, 0x37d3e4e4U, 0x8bf27979U, 0x32d5e7e7U, 0x438bc8c8U, 0x596e3737U, 0xb7da6d6dU, 0x8c018d8dU, 0x64b1d5d5U, 0xd29c4e4eU, 0xe049a9a9U, 0xb4d86c6cU, 0xfaac5656U, 0x07f3f4f4U, 0x25cfeaeaU, 0xafca6565U, 0x8ef47a7aU, 0xe947aeaeU, 0x18100808U, 0xd56fbabaU, 0x88f07878U, 0x6f4a2525U, 0x725c2e2eU, 0x24381c1cU, 0xf157a6a6U, 0xc773b4b4U, 0x5197c6c6U, 0x23cbe8e8U, 0x7ca1ddddU, 0x9ce87474U, 0x213e1f1fU, 0xdd964b4bU, 0xdc61bdbdU, 0x860d8b8bU, 0x850f8a8aU, 0x90e07070U, 0x427c3e3eU, 0xc471b5b5U, 0xaacc6666U, 0xd8904848U, 0x05060303U, 0x01f7f6f6U, 0x121c0e0eU, 0xa3c26161U, 0x5f6a3535U, 0xf9ae5757U, 0xd069b9b9U, 0x91178686U, 0x5899c1c1U, 0x273a1d1dU, 0xb9279e9eU, 0x38d9e1e1U, 0x13ebf8f8U, 0xb32b9898U, 0x33221111U, 0xbbd26969U, 0x70a9d9d9U, 0x89078e8eU, 0xa7339494U, 0xb62d9b9bU, 0x223c1e1eU, 0x92158787U, 0x20c9e9e9U, 0x4987ceceU, 0xffaa5555U, 0x78502828U, 0x7aa5dfdfU, 0x8f038c8cU, 0xf859a1a1U, 0x80098989U, 0x171a0d0dU, 0xda65bfbfU, 0x31d7e6e6U, 0xc6844242U, 0xb8d06868U, 0xc3824141U, 0xb0299999U, 0x775a2d2dU, 0x111e0f0fU, 0xcb7bb0b0U, 0xfca85454U, 0xd66dbbbbU, 0x3a2c1616U }; static const uint32_t Te2[256] = { 0x63a5c663U, 0x7c84f87cU, 0x7799ee77U, 0x7b8df67bU, 0xf20dfff2U, 0x6bbdd66bU, 0x6fb1de6fU, 0xc55491c5U, 0x30506030U, 0x01030201U, 0x67a9ce67U, 0x2b7d562bU, 0xfe19e7feU, 0xd762b5d7U, 0xabe64dabU, 0x769aec76U, 0xca458fcaU, 0x829d1f82U, 0xc94089c9U, 0x7d87fa7dU, 0xfa15effaU, 0x59ebb259U, 0x47c98e47U, 0xf00bfbf0U, 0xadec41adU, 0xd467b3d4U, 0xa2fd5fa2U, 0xafea45afU, 0x9cbf239cU, 0xa4f753a4U, 0x7296e472U, 0xc05b9bc0U, 0xb7c275b7U, 0xfd1ce1fdU, 0x93ae3d93U, 0x266a4c26U, 0x365a6c36U, 0x3f417e3fU, 0xf702f5f7U, 0xcc4f83ccU, 0x345c6834U, 0xa5f451a5U, 0xe534d1e5U, 0xf108f9f1U, 0x7193e271U, 0xd873abd8U, 0x31536231U, 0x153f2a15U, 0x040c0804U, 0xc75295c7U, 0x23654623U, 0xc35e9dc3U, 0x18283018U, 0x96a13796U, 0x050f0a05U, 0x9ab52f9aU, 0x07090e07U, 0x12362412U, 0x809b1b80U, 0xe23ddfe2U, 0xeb26cdebU, 0x27694e27U, 0xb2cd7fb2U, 0x759fea75U, 0x091b1209U, 0x839e1d83U, 0x2c74582cU, 0x1a2e341aU, 0x1b2d361bU, 0x6eb2dc6eU, 0x5aeeb45aU, 0xa0fb5ba0U, 0x52f6a452U, 0x3b4d763bU, 0xd661b7d6U, 0xb3ce7db3U, 0x297b5229U, 0xe33edde3U, 0x2f715e2fU, 0x84971384U, 0x53f5a653U, 0xd168b9d1U, 0x00000000U, 0xed2cc1edU, 0x20604020U, 0xfc1fe3fcU, 0xb1c879b1U, 0x5bedb65bU, 0x6abed46aU, 0xcb468dcbU, 0xbed967beU, 0x394b7239U, 0x4ade944aU, 0x4cd4984cU, 0x58e8b058U, 0xcf4a85cfU, 0xd06bbbd0U, 0xef2ac5efU, 0xaae54faaU, 0xfb16edfbU, 0x43c58643U, 0x4dd79a4dU, 0x33556633U, 0x85941185U, 0x45cf8a45U, 0xf910e9f9U, 0x02060402U, 
0x7f81fe7fU, 0x50f0a050U, 0x3c44783cU, 0x9fba259fU, 0xa8e34ba8U, 0x51f3a251U, 0xa3fe5da3U, 0x40c08040U, 0x8f8a058fU, 0x92ad3f92U, 0x9dbc219dU, 0x38487038U, 0xf504f1f5U, 0xbcdf63bcU, 0xb6c177b6U, 0xda75afdaU, 0x21634221U, 0x10302010U, 0xff1ae5ffU, 0xf30efdf3U, 0xd26dbfd2U, 0xcd4c81cdU, 0x0c14180cU, 0x13352613U, 0xec2fc3ecU, 0x5fe1be5fU, 0x97a23597U, 0x44cc8844U, 0x17392e17U, 0xc45793c4U, 0xa7f255a7U, 0x7e82fc7eU, 0x3d477a3dU, 0x64acc864U, 0x5de7ba5dU, 0x192b3219U, 0x7395e673U, 0x60a0c060U, 0x81981981U, 0x4fd19e4fU, 0xdc7fa3dcU, 0x22664422U, 0x2a7e542aU, 0x90ab3b90U, 0x88830b88U, 0x46ca8c46U, 0xee29c7eeU, 0xb8d36bb8U, 0x143c2814U, 0xde79a7deU, 0x5ee2bc5eU, 0x0b1d160bU, 0xdb76addbU, 0xe03bdbe0U, 0x32566432U, 0x3a4e743aU, 0x0a1e140aU, 0x49db9249U, 0x060a0c06U, 0x246c4824U, 0x5ce4b85cU, 0xc25d9fc2U, 0xd36ebdd3U, 0xacef43acU, 0x62a6c462U, 0x91a83991U, 0x95a43195U, 0xe437d3e4U, 0x798bf279U, 0xe732d5e7U, 0xc8438bc8U, 0x37596e37U, 0x6db7da6dU, 0x8d8c018dU, 0xd564b1d5U, 0x4ed29c4eU, 0xa9e049a9U, 0x6cb4d86cU, 0x56faac56U, 0xf407f3f4U, 0xea25cfeaU, 0x65afca65U, 0x7a8ef47aU, 0xaee947aeU, 0x08181008U, 0xbad56fbaU, 0x7888f078U, 0x256f4a25U, 0x2e725c2eU, 0x1c24381cU, 0xa6f157a6U, 0xb4c773b4U, 0xc65197c6U, 0xe823cbe8U, 0xdd7ca1ddU, 0x749ce874U, 0x1f213e1fU, 0x4bdd964bU, 0xbddc61bdU, 0x8b860d8bU, 0x8a850f8aU, 0x7090e070U, 0x3e427c3eU, 0xb5c471b5U, 0x66aacc66U, 0x48d89048U, 0x03050603U, 0xf601f7f6U, 0x0e121c0eU, 0x61a3c261U, 0x355f6a35U, 0x57f9ae57U, 0xb9d069b9U, 0x86911786U, 0xc15899c1U, 0x1d273a1dU, 0x9eb9279eU, 0xe138d9e1U, 0xf813ebf8U, 0x98b32b98U, 0x11332211U, 0x69bbd269U, 0xd970a9d9U, 0x8e89078eU, 0x94a73394U, 0x9bb62d9bU, 0x1e223c1eU, 0x87921587U, 0xe920c9e9U, 0xce4987ceU, 0x55ffaa55U, 0x28785028U, 0xdf7aa5dfU, 0x8c8f038cU, 0xa1f859a1U, 0x89800989U, 0x0d171a0dU, 0xbfda65bfU, 0xe631d7e6U, 0x42c68442U, 0x68b8d068U, 0x41c38241U, 0x99b02999U, 0x2d775a2dU, 0x0f111e0fU, 0xb0cb7bb0U, 0x54fca854U, 0xbbd66dbbU, 0x163a2c16U }; static const uint32_t Te3[256] = { 0x6363a5c6U, 0x7c7c84f8U, 0x777799eeU, 0x7b7b8df6U, 0xf2f20dffU, 0x6b6bbdd6U, 0x6f6fb1deU, 0xc5c55491U, 0x30305060U, 0x01010302U, 0x6767a9ceU, 0x2b2b7d56U, 0xfefe19e7U, 0xd7d762b5U, 0xababe64dU, 0x76769aecU, 0xcaca458fU, 0x82829d1fU, 0xc9c94089U, 0x7d7d87faU, 0xfafa15efU, 0x5959ebb2U, 0x4747c98eU, 0xf0f00bfbU, 0xadadec41U, 0xd4d467b3U, 0xa2a2fd5fU, 0xafafea45U, 0x9c9cbf23U, 0xa4a4f753U, 0x727296e4U, 0xc0c05b9bU, 0xb7b7c275U, 0xfdfd1ce1U, 0x9393ae3dU, 0x26266a4cU, 0x36365a6cU, 0x3f3f417eU, 0xf7f702f5U, 0xcccc4f83U, 0x34345c68U, 0xa5a5f451U, 0xe5e534d1U, 0xf1f108f9U, 0x717193e2U, 0xd8d873abU, 0x31315362U, 0x15153f2aU, 0x04040c08U, 0xc7c75295U, 0x23236546U, 0xc3c35e9dU, 0x18182830U, 0x9696a137U, 0x05050f0aU, 0x9a9ab52fU, 0x0707090eU, 0x12123624U, 0x80809b1bU, 0xe2e23ddfU, 0xebeb26cdU, 0x2727694eU, 0xb2b2cd7fU, 0x75759feaU, 0x09091b12U, 0x83839e1dU, 0x2c2c7458U, 0x1a1a2e34U, 0x1b1b2d36U, 0x6e6eb2dcU, 0x5a5aeeb4U, 0xa0a0fb5bU, 0x5252f6a4U, 0x3b3b4d76U, 0xd6d661b7U, 0xb3b3ce7dU, 0x29297b52U, 0xe3e33eddU, 0x2f2f715eU, 0x84849713U, 0x5353f5a6U, 0xd1d168b9U, 0x00000000U, 0xeded2cc1U, 0x20206040U, 0xfcfc1fe3U, 0xb1b1c879U, 0x5b5bedb6U, 0x6a6abed4U, 0xcbcb468dU, 0xbebed967U, 0x39394b72U, 0x4a4ade94U, 0x4c4cd498U, 0x5858e8b0U, 0xcfcf4a85U, 0xd0d06bbbU, 0xefef2ac5U, 0xaaaae54fU, 0xfbfb16edU, 0x4343c586U, 0x4d4dd79aU, 0x33335566U, 0x85859411U, 0x4545cf8aU, 0xf9f910e9U, 0x02020604U, 0x7f7f81feU, 0x5050f0a0U, 0x3c3c4478U, 0x9f9fba25U, 0xa8a8e34bU, 0x5151f3a2U, 0xa3a3fe5dU, 0x4040c080U, 0x8f8f8a05U, 0x9292ad3fU, 0x9d9dbc21U, 0x38384870U, 0xf5f504f1U, 0xbcbcdf63U, 
0xb6b6c177U, 0xdada75afU, 0x21216342U, 0x10103020U, 0xffff1ae5U, 0xf3f30efdU, 0xd2d26dbfU, 0xcdcd4c81U, 0x0c0c1418U, 0x13133526U, 0xecec2fc3U, 0x5f5fe1beU, 0x9797a235U, 0x4444cc88U, 0x1717392eU, 0xc4c45793U, 0xa7a7f255U, 0x7e7e82fcU, 0x3d3d477aU, 0x6464acc8U, 0x5d5de7baU, 0x19192b32U, 0x737395e6U, 0x6060a0c0U, 0x81819819U, 0x4f4fd19eU, 0xdcdc7fa3U, 0x22226644U, 0x2a2a7e54U, 0x9090ab3bU, 0x8888830bU, 0x4646ca8cU, 0xeeee29c7U, 0xb8b8d36bU, 0x14143c28U, 0xdede79a7U, 0x5e5ee2bcU, 0x0b0b1d16U, 0xdbdb76adU, 0xe0e03bdbU, 0x32325664U, 0x3a3a4e74U, 0x0a0a1e14U, 0x4949db92U, 0x06060a0cU, 0x24246c48U, 0x5c5ce4b8U, 0xc2c25d9fU, 0xd3d36ebdU, 0xacacef43U, 0x6262a6c4U, 0x9191a839U, 0x9595a431U, 0xe4e437d3U, 0x79798bf2U, 0xe7e732d5U, 0xc8c8438bU, 0x3737596eU, 0x6d6db7daU, 0x8d8d8c01U, 0xd5d564b1U, 0x4e4ed29cU, 0xa9a9e049U, 0x6c6cb4d8U, 0x5656faacU, 0xf4f407f3U, 0xeaea25cfU, 0x6565afcaU, 0x7a7a8ef4U, 0xaeaee947U, 0x08081810U, 0xbabad56fU, 0x787888f0U, 0x25256f4aU, 0x2e2e725cU, 0x1c1c2438U, 0xa6a6f157U, 0xb4b4c773U, 0xc6c65197U, 0xe8e823cbU, 0xdddd7ca1U, 0x74749ce8U, 0x1f1f213eU, 0x4b4bdd96U, 0xbdbddc61U, 0x8b8b860dU, 0x8a8a850fU, 0x707090e0U, 0x3e3e427cU, 0xb5b5c471U, 0x6666aaccU, 0x4848d890U, 0x03030506U, 0xf6f601f7U, 0x0e0e121cU, 0x6161a3c2U, 0x35355f6aU, 0x5757f9aeU, 0xb9b9d069U, 0x86869117U, 0xc1c15899U, 0x1d1d273aU, 0x9e9eb927U, 0xe1e138d9U, 0xf8f813ebU, 0x9898b32bU, 0x11113322U, 0x6969bbd2U, 0xd9d970a9U, 0x8e8e8907U, 0x9494a733U, 0x9b9bb62dU, 0x1e1e223cU, 0x87879215U, 0xe9e920c9U, 0xcece4987U, 0x5555ffaaU, 0x28287850U, 0xdfdf7aa5U, 0x8c8c8f03U, 0xa1a1f859U, 0x89898009U, 0x0d0d171aU, 0xbfbfda65U, 0xe6e631d7U, 0x4242c684U, 0x6868b8d0U, 0x4141c382U, 0x9999b029U, 0x2d2d775aU, 0x0f0f111eU, 0xb0b0cb7bU, 0x5454fca8U, 0xbbbbd66dU, 0x16163a2cU }; static const uint32_t Te4[256] = { 0x63636363U, 0x7c7c7c7cU, 0x77777777U, 0x7b7b7b7bU, 0xf2f2f2f2U, 0x6b6b6b6bU, 0x6f6f6f6fU, 0xc5c5c5c5U, 0x30303030U, 0x01010101U, 0x67676767U, 0x2b2b2b2bU, 0xfefefefeU, 0xd7d7d7d7U, 0xababababU, 0x76767676U, 0xcacacacaU, 0x82828282U, 0xc9c9c9c9U, 0x7d7d7d7dU, 0xfafafafaU, 0x59595959U, 0x47474747U, 0xf0f0f0f0U, 0xadadadadU, 0xd4d4d4d4U, 0xa2a2a2a2U, 0xafafafafU, 0x9c9c9c9cU, 0xa4a4a4a4U, 0x72727272U, 0xc0c0c0c0U, 0xb7b7b7b7U, 0xfdfdfdfdU, 0x93939393U, 0x26262626U, 0x36363636U, 0x3f3f3f3fU, 0xf7f7f7f7U, 0xccccccccU, 0x34343434U, 0xa5a5a5a5U, 0xe5e5e5e5U, 0xf1f1f1f1U, 0x71717171U, 0xd8d8d8d8U, 0x31313131U, 0x15151515U, 0x04040404U, 0xc7c7c7c7U, 0x23232323U, 0xc3c3c3c3U, 0x18181818U, 0x96969696U, 0x05050505U, 0x9a9a9a9aU, 0x07070707U, 0x12121212U, 0x80808080U, 0xe2e2e2e2U, 0xebebebebU, 0x27272727U, 0xb2b2b2b2U, 0x75757575U, 0x09090909U, 0x83838383U, 0x2c2c2c2cU, 0x1a1a1a1aU, 0x1b1b1b1bU, 0x6e6e6e6eU, 0x5a5a5a5aU, 0xa0a0a0a0U, 0x52525252U, 0x3b3b3b3bU, 0xd6d6d6d6U, 0xb3b3b3b3U, 0x29292929U, 0xe3e3e3e3U, 0x2f2f2f2fU, 0x84848484U, 0x53535353U, 0xd1d1d1d1U, 0x00000000U, 0xededededU, 0x20202020U, 0xfcfcfcfcU, 0xb1b1b1b1U, 0x5b5b5b5bU, 0x6a6a6a6aU, 0xcbcbcbcbU, 0xbebebebeU, 0x39393939U, 0x4a4a4a4aU, 0x4c4c4c4cU, 0x58585858U, 0xcfcfcfcfU, 0xd0d0d0d0U, 0xefefefefU, 0xaaaaaaaaU, 0xfbfbfbfbU, 0x43434343U, 0x4d4d4d4dU, 0x33333333U, 0x85858585U, 0x45454545U, 0xf9f9f9f9U, 0x02020202U, 0x7f7f7f7fU, 0x50505050U, 0x3c3c3c3cU, 0x9f9f9f9fU, 0xa8a8a8a8U, 0x51515151U, 0xa3a3a3a3U, 0x40404040U, 0x8f8f8f8fU, 0x92929292U, 0x9d9d9d9dU, 0x38383838U, 0xf5f5f5f5U, 0xbcbcbcbcU, 0xb6b6b6b6U, 0xdadadadaU, 0x21212121U, 0x10101010U, 0xffffffffU, 0xf3f3f3f3U, 0xd2d2d2d2U, 0xcdcdcdcdU, 0x0c0c0c0cU, 0x13131313U, 0xececececU, 0x5f5f5f5fU, 0x97979797U, 0x44444444U, 
0x17171717U, 0xc4c4c4c4U, 0xa7a7a7a7U, 0x7e7e7e7eU, 0x3d3d3d3dU, 0x64646464U, 0x5d5d5d5dU, 0x19191919U, 0x73737373U, 0x60606060U, 0x81818181U, 0x4f4f4f4fU, 0xdcdcdcdcU, 0x22222222U, 0x2a2a2a2aU, 0x90909090U, 0x88888888U, 0x46464646U, 0xeeeeeeeeU, 0xb8b8b8b8U, 0x14141414U, 0xdedededeU, 0x5e5e5e5eU, 0x0b0b0b0bU, 0xdbdbdbdbU, 0xe0e0e0e0U, 0x32323232U, 0x3a3a3a3aU, 0x0a0a0a0aU, 0x49494949U, 0x06060606U, 0x24242424U, 0x5c5c5c5cU, 0xc2c2c2c2U, 0xd3d3d3d3U, 0xacacacacU, 0x62626262U, 0x91919191U, 0x95959595U, 0xe4e4e4e4U, 0x79797979U, 0xe7e7e7e7U, 0xc8c8c8c8U, 0x37373737U, 0x6d6d6d6dU, 0x8d8d8d8dU, 0xd5d5d5d5U, 0x4e4e4e4eU, 0xa9a9a9a9U, 0x6c6c6c6cU, 0x56565656U, 0xf4f4f4f4U, 0xeaeaeaeaU, 0x65656565U, 0x7a7a7a7aU, 0xaeaeaeaeU, 0x08080808U, 0xbabababaU, 0x78787878U, 0x25252525U, 0x2e2e2e2eU, 0x1c1c1c1cU, 0xa6a6a6a6U, 0xb4b4b4b4U, 0xc6c6c6c6U, 0xe8e8e8e8U, 0xddddddddU, 0x74747474U, 0x1f1f1f1fU, 0x4b4b4b4bU, 0xbdbdbdbdU, 0x8b8b8b8bU, 0x8a8a8a8aU, 0x70707070U, 0x3e3e3e3eU, 0xb5b5b5b5U, 0x66666666U, 0x48484848U, 0x03030303U, 0xf6f6f6f6U, 0x0e0e0e0eU, 0x61616161U, 0x35353535U, 0x57575757U, 0xb9b9b9b9U, 0x86868686U, 0xc1c1c1c1U, 0x1d1d1d1dU, 0x9e9e9e9eU, 0xe1e1e1e1U, 0xf8f8f8f8U, 0x98989898U, 0x11111111U, 0x69696969U, 0xd9d9d9d9U, 0x8e8e8e8eU, 0x94949494U, 0x9b9b9b9bU, 0x1e1e1e1eU, 0x87878787U, 0xe9e9e9e9U, 0xcecececeU, 0x55555555U, 0x28282828U, 0xdfdfdfdfU, 0x8c8c8c8cU, 0xa1a1a1a1U, 0x89898989U, 0x0d0d0d0dU, 0xbfbfbfbfU, 0xe6e6e6e6U, 0x42424242U, 0x68686868U, 0x41414141U, 0x99999999U, 0x2d2d2d2dU, 0x0f0f0f0fU, 0xb0b0b0b0U, 0x54545454U, 0xbbbbbbbbU, 0x16161616U }; /* Decrypt Sbox constants (for the substitute bytes operation) */ static const uint32_t Td0[256] = { 0x51f4a750U, 0x7e416553U, 0x1a17a4c3U, 0x3a275e96U, 0x3bab6bcbU, 0x1f9d45f1U, 0xacfa58abU, 0x4be30393U, 0x2030fa55U, 0xad766df6U, 0x88cc7691U, 0xf5024c25U, 0x4fe5d7fcU, 0xc52acbd7U, 0x26354480U, 0xb562a38fU, 0xdeb15a49U, 0x25ba1b67U, 0x45ea0e98U, 0x5dfec0e1U, 0xc32f7502U, 0x814cf012U, 0x8d4697a3U, 0x6bd3f9c6U, 0x038f5fe7U, 0x15929c95U, 0xbf6d7aebU, 0x955259daU, 0xd4be832dU, 0x587421d3U, 0x49e06929U, 0x8ec9c844U, 0x75c2896aU, 0xf48e7978U, 0x99583e6bU, 0x27b971ddU, 0xbee14fb6U, 0xf088ad17U, 0xc920ac66U, 0x7dce3ab4U, 0x63df4a18U, 0xe51a3182U, 0x97513360U, 0x62537f45U, 0xb16477e0U, 0xbb6bae84U, 0xfe81a01cU, 0xf9082b94U, 0x70486858U, 0x8f45fd19U, 0x94de6c87U, 0x527bf8b7U, 0xab73d323U, 0x724b02e2U, 0xe31f8f57U, 0x6655ab2aU, 0xb2eb2807U, 0x2fb5c203U, 0x86c57b9aU, 0xd33708a5U, 0x302887f2U, 0x23bfa5b2U, 0x02036abaU, 0xed16825cU, 0x8acf1c2bU, 0xa779b492U, 0xf307f2f0U, 0x4e69e2a1U, 0x65daf4cdU, 0x0605bed5U, 0xd134621fU, 0xc4a6fe8aU, 0x342e539dU, 0xa2f355a0U, 0x058ae132U, 0xa4f6eb75U, 0x0b83ec39U, 0x4060efaaU, 0x5e719f06U, 0xbd6e1051U, 0x3e218af9U, 0x96dd063dU, 0xdd3e05aeU, 0x4de6bd46U, 0x91548db5U, 0x71c45d05U, 0x0406d46fU, 0x605015ffU, 0x1998fb24U, 0xd6bde997U, 0x894043ccU, 0x67d99e77U, 0xb0e842bdU, 0x07898b88U, 0xe7195b38U, 0x79c8eedbU, 0xa17c0a47U, 0x7c420fe9U, 0xf8841ec9U, 0x00000000U, 0x09808683U, 0x322bed48U, 0x1e1170acU, 0x6c5a724eU, 0xfd0efffbU, 0x0f853856U, 0x3daed51eU, 0x362d3927U, 0x0a0fd964U, 0x685ca621U, 0x9b5b54d1U, 0x24362e3aU, 0x0c0a67b1U, 0x9357e70fU, 0xb4ee96d2U, 0x1b9b919eU, 0x80c0c54fU, 0x61dc20a2U, 0x5a774b69U, 0x1c121a16U, 0xe293ba0aU, 0xc0a02ae5U, 0x3c22e043U, 0x121b171dU, 0x0e090d0bU, 0xf28bc7adU, 0x2db6a8b9U, 0x141ea9c8U, 0x57f11985U, 0xaf75074cU, 0xee99ddbbU, 0xa37f60fdU, 0xf701269fU, 0x5c72f5bcU, 0x44663bc5U, 0x5bfb7e34U, 0x8b432976U, 0xcb23c6dcU, 0xb6edfc68U, 0xb8e4f163U, 0xd731dccaU, 0x42638510U, 0x13972240U, 0x84c61120U, 
0x854a247dU, 0xd2bb3df8U, 0xaef93211U, 0xc729a16dU, 0x1d9e2f4bU, 0xdcb230f3U, 0x0d8652ecU, 0x77c1e3d0U, 0x2bb3166cU, 0xa970b999U, 0x119448faU, 0x47e96422U, 0xa8fc8cc4U, 0xa0f03f1aU, 0x567d2cd8U, 0x223390efU, 0x87494ec7U, 0xd938d1c1U, 0x8ccaa2feU, 0x98d40b36U, 0xa6f581cfU, 0xa57ade28U, 0xdab78e26U, 0x3fadbfa4U, 0x2c3a9de4U, 0x5078920dU, 0x6a5fcc9bU, 0x547e4662U, 0xf68d13c2U, 0x90d8b8e8U, 0x2e39f75eU, 0x82c3aff5U, 0x9f5d80beU, 0x69d0937cU, 0x6fd52da9U, 0xcf2512b3U, 0xc8ac993bU, 0x10187da7U, 0xe89c636eU, 0xdb3bbb7bU, 0xcd267809U, 0x6e5918f4U, 0xec9ab701U, 0x834f9aa8U, 0xe6956e65U, 0xaaffe67eU, 0x21bccf08U, 0xef15e8e6U, 0xbae79bd9U, 0x4a6f36ceU, 0xea9f09d4U, 0x29b07cd6U, 0x31a4b2afU, 0x2a3f2331U, 0xc6a59430U, 0x35a266c0U, 0x744ebc37U, 0xfc82caa6U, 0xe090d0b0U, 0x33a7d815U, 0xf104984aU, 0x41ecdaf7U, 0x7fcd500eU, 0x1791f62fU, 0x764dd68dU, 0x43efb04dU, 0xccaa4d54U, 0xe49604dfU, 0x9ed1b5e3U, 0x4c6a881bU, 0xc12c1fb8U, 0x4665517fU, 0x9d5eea04U, 0x018c355dU, 0xfa877473U, 0xfb0b412eU, 0xb3671d5aU, 0x92dbd252U, 0xe9105633U, 0x6dd64713U, 0x9ad7618cU, 0x37a10c7aU, 0x59f8148eU, 0xeb133c89U, 0xcea927eeU, 0xb761c935U, 0xe11ce5edU, 0x7a47b13cU, 0x9cd2df59U, 0x55f2733fU, 0x1814ce79U, 0x73c737bfU, 0x53f7cdeaU, 0x5ffdaa5bU, 0xdf3d6f14U, 0x7844db86U, 0xcaaff381U, 0xb968c43eU, 0x3824342cU, 0xc2a3405fU, 0x161dc372U, 0xbce2250cU, 0x283c498bU, 0xff0d9541U, 0x39a80171U, 0x080cb3deU, 0xd8b4e49cU, 0x6456c190U, 0x7bcb8461U, 0xd532b670U, 0x486c5c74U, 0xd0b85742U }; static const uint32_t Td1[256] = { 0x5051f4a7U, 0x537e4165U, 0xc31a17a4U, 0x963a275eU, 0xcb3bab6bU, 0xf11f9d45U, 0xabacfa58U, 0x934be303U, 0x552030faU, 0xf6ad766dU, 0x9188cc76U, 0x25f5024cU, 0xfc4fe5d7U, 0xd7c52acbU, 0x80263544U, 0x8fb562a3U, 0x49deb15aU, 0x6725ba1bU, 0x9845ea0eU, 0xe15dfec0U, 0x02c32f75U, 0x12814cf0U, 0xa38d4697U, 0xc66bd3f9U, 0xe7038f5fU, 0x9515929cU, 0xebbf6d7aU, 0xda955259U, 0x2dd4be83U, 0xd3587421U, 0x2949e069U, 0x448ec9c8U, 0x6a75c289U, 0x78f48e79U, 0x6b99583eU, 0xdd27b971U, 0xb6bee14fU, 0x17f088adU, 0x66c920acU, 0xb47dce3aU, 0x1863df4aU, 0x82e51a31U, 0x60975133U, 0x4562537fU, 0xe0b16477U, 0x84bb6baeU, 0x1cfe81a0U, 0x94f9082bU, 0x58704868U, 0x198f45fdU, 0x8794de6cU, 0xb7527bf8U, 0x23ab73d3U, 0xe2724b02U, 0x57e31f8fU, 0x2a6655abU, 0x07b2eb28U, 0x032fb5c2U, 0x9a86c57bU, 0xa5d33708U, 0xf2302887U, 0xb223bfa5U, 0xba02036aU, 0x5ced1682U, 0x2b8acf1cU, 0x92a779b4U, 0xf0f307f2U, 0xa14e69e2U, 0xcd65daf4U, 0xd50605beU, 0x1fd13462U, 0x8ac4a6feU, 0x9d342e53U, 0xa0a2f355U, 0x32058ae1U, 0x75a4f6ebU, 0x390b83ecU, 0xaa4060efU, 0x065e719fU, 0x51bd6e10U, 0xf93e218aU, 0x3d96dd06U, 0xaedd3e05U, 0x464de6bdU, 0xb591548dU, 0x0571c45dU, 0x6f0406d4U, 0xff605015U, 0x241998fbU, 0x97d6bde9U, 0xcc894043U, 0x7767d99eU, 0xbdb0e842U, 0x8807898bU, 0x38e7195bU, 0xdb79c8eeU, 0x47a17c0aU, 0xe97c420fU, 0xc9f8841eU, 0x00000000U, 0x83098086U, 0x48322bedU, 0xac1e1170U, 0x4e6c5a72U, 0xfbfd0effU, 0x560f8538U, 0x1e3daed5U, 0x27362d39U, 0x640a0fd9U, 0x21685ca6U, 0xd19b5b54U, 0x3a24362eU, 0xb10c0a67U, 0x0f9357e7U, 0xd2b4ee96U, 0x9e1b9b91U, 0x4f80c0c5U, 0xa261dc20U, 0x695a774bU, 0x161c121aU, 0x0ae293baU, 0xe5c0a02aU, 0x433c22e0U, 0x1d121b17U, 0x0b0e090dU, 0xadf28bc7U, 0xb92db6a8U, 0xc8141ea9U, 0x8557f119U, 0x4caf7507U, 0xbbee99ddU, 0xfda37f60U, 0x9ff70126U, 0xbc5c72f5U, 0xc544663bU, 0x345bfb7eU, 0x768b4329U, 0xdccb23c6U, 0x68b6edfcU, 0x63b8e4f1U, 0xcad731dcU, 0x10426385U, 0x40139722U, 0x2084c611U, 0x7d854a24U, 0xf8d2bb3dU, 0x11aef932U, 0x6dc729a1U, 0x4b1d9e2fU, 0xf3dcb230U, 0xec0d8652U, 0xd077c1e3U, 0x6c2bb316U, 0x99a970b9U, 0xfa119448U, 0x2247e964U, 0xc4a8fc8cU, 0x1aa0f03fU, 
0xd8567d2cU, 0xef223390U, 0xc787494eU, 0xc1d938d1U, 0xfe8ccaa2U, 0x3698d40bU, 0xcfa6f581U, 0x28a57adeU, 0x26dab78eU, 0xa43fadbfU, 0xe42c3a9dU, 0x0d507892U, 0x9b6a5fccU, 0x62547e46U, 0xc2f68d13U, 0xe890d8b8U, 0x5e2e39f7U, 0xf582c3afU, 0xbe9f5d80U, 0x7c69d093U, 0xa96fd52dU, 0xb3cf2512U, 0x3bc8ac99U, 0xa710187dU, 0x6ee89c63U, 0x7bdb3bbbU, 0x09cd2678U, 0xf46e5918U, 0x01ec9ab7U, 0xa8834f9aU, 0x65e6956eU, 0x7eaaffe6U, 0x0821bccfU, 0xe6ef15e8U, 0xd9bae79bU, 0xce4a6f36U, 0xd4ea9f09U, 0xd629b07cU, 0xaf31a4b2U, 0x312a3f23U, 0x30c6a594U, 0xc035a266U, 0x37744ebcU, 0xa6fc82caU, 0xb0e090d0U, 0x1533a7d8U, 0x4af10498U, 0xf741ecdaU, 0x0e7fcd50U, 0x2f1791f6U, 0x8d764dd6U, 0x4d43efb0U, 0x54ccaa4dU, 0xdfe49604U, 0xe39ed1b5U, 0x1b4c6a88U, 0xb8c12c1fU, 0x7f466551U, 0x049d5eeaU, 0x5d018c35U, 0x73fa8774U, 0x2efb0b41U, 0x5ab3671dU, 0x5292dbd2U, 0x33e91056U, 0x136dd647U, 0x8c9ad761U, 0x7a37a10cU, 0x8e59f814U, 0x89eb133cU, 0xeecea927U, 0x35b761c9U, 0xede11ce5U, 0x3c7a47b1U, 0x599cd2dfU, 0x3f55f273U, 0x791814ceU, 0xbf73c737U, 0xea53f7cdU, 0x5b5ffdaaU, 0x14df3d6fU, 0x867844dbU, 0x81caaff3U, 0x3eb968c4U, 0x2c382434U, 0x5fc2a340U, 0x72161dc3U, 0x0cbce225U, 0x8b283c49U, 0x41ff0d95U, 0x7139a801U, 0xde080cb3U, 0x9cd8b4e4U, 0x906456c1U, 0x617bcb84U, 0x70d532b6U, 0x74486c5cU, 0x42d0b857U }; static const uint32_t Td2[256] = { 0xa75051f4U, 0x65537e41U, 0xa4c31a17U, 0x5e963a27U, 0x6bcb3babU, 0x45f11f9dU, 0x58abacfaU, 0x03934be3U, 0xfa552030U, 0x6df6ad76U, 0x769188ccU, 0x4c25f502U, 0xd7fc4fe5U, 0xcbd7c52aU, 0x44802635U, 0xa38fb562U, 0x5a49deb1U, 0x1b6725baU, 0x0e9845eaU, 0xc0e15dfeU, 0x7502c32fU, 0xf012814cU, 0x97a38d46U, 0xf9c66bd3U, 0x5fe7038fU, 0x9c951592U, 0x7aebbf6dU, 0x59da9552U, 0x832dd4beU, 0x21d35874U, 0x692949e0U, 0xc8448ec9U, 0x896a75c2U, 0x7978f48eU, 0x3e6b9958U, 0x71dd27b9U, 0x4fb6bee1U, 0xad17f088U, 0xac66c920U, 0x3ab47dceU, 0x4a1863dfU, 0x3182e51aU, 0x33609751U, 0x7f456253U, 0x77e0b164U, 0xae84bb6bU, 0xa01cfe81U, 0x2b94f908U, 0x68587048U, 0xfd198f45U, 0x6c8794deU, 0xf8b7527bU, 0xd323ab73U, 0x02e2724bU, 0x8f57e31fU, 0xab2a6655U, 0x2807b2ebU, 0xc2032fb5U, 0x7b9a86c5U, 0x08a5d337U, 0x87f23028U, 0xa5b223bfU, 0x6aba0203U, 0x825ced16U, 0x1c2b8acfU, 0xb492a779U, 0xf2f0f307U, 0xe2a14e69U, 0xf4cd65daU, 0xbed50605U, 0x621fd134U, 0xfe8ac4a6U, 0x539d342eU, 0x55a0a2f3U, 0xe132058aU, 0xeb75a4f6U, 0xec390b83U, 0xefaa4060U, 0x9f065e71U, 0x1051bd6eU, 0x8af93e21U, 0x063d96ddU, 0x05aedd3eU, 0xbd464de6U, 0x8db59154U, 0x5d0571c4U, 0xd46f0406U, 0x15ff6050U, 0xfb241998U, 0xe997d6bdU, 0x43cc8940U, 0x9e7767d9U, 0x42bdb0e8U, 0x8b880789U, 0x5b38e719U, 0xeedb79c8U, 0x0a47a17cU, 0x0fe97c42U, 0x1ec9f884U, 0x00000000U, 0x86830980U, 0xed48322bU, 0x70ac1e11U, 0x724e6c5aU, 0xfffbfd0eU, 0x38560f85U, 0xd51e3daeU, 0x3927362dU, 0xd9640a0fU, 0xa621685cU, 0x54d19b5bU, 0x2e3a2436U, 0x67b10c0aU, 0xe70f9357U, 0x96d2b4eeU, 0x919e1b9bU, 0xc54f80c0U, 0x20a261dcU, 0x4b695a77U, 0x1a161c12U, 0xba0ae293U, 0x2ae5c0a0U, 0xe0433c22U, 0x171d121bU, 0x0d0b0e09U, 0xc7adf28bU, 0xa8b92db6U, 0xa9c8141eU, 0x198557f1U, 0x074caf75U, 0xddbbee99U, 0x60fda37fU, 0x269ff701U, 0xf5bc5c72U, 0x3bc54466U, 0x7e345bfbU, 0x29768b43U, 0xc6dccb23U, 0xfc68b6edU, 0xf163b8e4U, 0xdccad731U, 0x85104263U, 0x22401397U, 0x112084c6U, 0x247d854aU, 0x3df8d2bbU, 0x3211aef9U, 0xa16dc729U, 0x2f4b1d9eU, 0x30f3dcb2U, 0x52ec0d86U, 0xe3d077c1U, 0x166c2bb3U, 0xb999a970U, 0x48fa1194U, 0x642247e9U, 0x8cc4a8fcU, 0x3f1aa0f0U, 0x2cd8567dU, 0x90ef2233U, 0x4ec78749U, 0xd1c1d938U, 0xa2fe8ccaU, 0x0b3698d4U, 0x81cfa6f5U, 0xde28a57aU, 0x8e26dab7U, 0xbfa43fadU, 0x9de42c3aU, 0x920d5078U, 0xcc9b6a5fU, 0x4662547eU, 
0x13c2f68dU, 0xb8e890d8U, 0xf75e2e39U, 0xaff582c3U, 0x80be9f5dU, 0x937c69d0U, 0x2da96fd5U, 0x12b3cf25U, 0x993bc8acU, 0x7da71018U, 0x636ee89cU, 0xbb7bdb3bU, 0x7809cd26U, 0x18f46e59U, 0xb701ec9aU, 0x9aa8834fU, 0x6e65e695U, 0xe67eaaffU, 0xcf0821bcU, 0xe8e6ef15U, 0x9bd9bae7U, 0x36ce4a6fU, 0x09d4ea9fU, 0x7cd629b0U, 0xb2af31a4U, 0x23312a3fU, 0x9430c6a5U, 0x66c035a2U, 0xbc37744eU, 0xcaa6fc82U, 0xd0b0e090U, 0xd81533a7U, 0x984af104U, 0xdaf741ecU, 0x500e7fcdU, 0xf62f1791U, 0xd68d764dU, 0xb04d43efU, 0x4d54ccaaU, 0x04dfe496U, 0xb5e39ed1U, 0x881b4c6aU, 0x1fb8c12cU, 0x517f4665U, 0xea049d5eU, 0x355d018cU, 0x7473fa87U, 0x412efb0bU, 0x1d5ab367U, 0xd25292dbU, 0x5633e910U, 0x47136dd6U, 0x618c9ad7U, 0x0c7a37a1U, 0x148e59f8U, 0x3c89eb13U, 0x27eecea9U, 0xc935b761U, 0xe5ede11cU, 0xb13c7a47U, 0xdf599cd2U, 0x733f55f2U, 0xce791814U, 0x37bf73c7U, 0xcdea53f7U, 0xaa5b5ffdU, 0x6f14df3dU, 0xdb867844U, 0xf381caafU, 0xc43eb968U, 0x342c3824U, 0x405fc2a3U, 0xc372161dU, 0x250cbce2U, 0x498b283cU, 0x9541ff0dU, 0x017139a8U, 0xb3de080cU, 0xe49cd8b4U, 0xc1906456U, 0x84617bcbU, 0xb670d532U, 0x5c74486cU, 0x5742d0b8U }; static const uint32_t Td3[256] = { 0xf4a75051U, 0x4165537eU, 0x17a4c31aU, 0x275e963aU, 0xab6bcb3bU, 0x9d45f11fU, 0xfa58abacU, 0xe303934bU, 0x30fa5520U, 0x766df6adU, 0xcc769188U, 0x024c25f5U, 0xe5d7fc4fU, 0x2acbd7c5U, 0x35448026U, 0x62a38fb5U, 0xb15a49deU, 0xba1b6725U, 0xea0e9845U, 0xfec0e15dU, 0x2f7502c3U, 0x4cf01281U, 0x4697a38dU, 0xd3f9c66bU, 0x8f5fe703U, 0x929c9515U, 0x6d7aebbfU, 0x5259da95U, 0xbe832dd4U, 0x7421d358U, 0xe0692949U, 0xc9c8448eU, 0xc2896a75U, 0x8e7978f4U, 0x583e6b99U, 0xb971dd27U, 0xe14fb6beU, 0x88ad17f0U, 0x20ac66c9U, 0xce3ab47dU, 0xdf4a1863U, 0x1a3182e5U, 0x51336097U, 0x537f4562U, 0x6477e0b1U, 0x6bae84bbU, 0x81a01cfeU, 0x082b94f9U, 0x48685870U, 0x45fd198fU, 0xde6c8794U, 0x7bf8b752U, 0x73d323abU, 0x4b02e272U, 0x1f8f57e3U, 0x55ab2a66U, 0xeb2807b2U, 0xb5c2032fU, 0xc57b9a86U, 0x3708a5d3U, 0x2887f230U, 0xbfa5b223U, 0x036aba02U, 0x16825cedU, 0xcf1c2b8aU, 0x79b492a7U, 0x07f2f0f3U, 0x69e2a14eU, 0xdaf4cd65U, 0x05bed506U, 0x34621fd1U, 0xa6fe8ac4U, 0x2e539d34U, 0xf355a0a2U, 0x8ae13205U, 0xf6eb75a4U, 0x83ec390bU, 0x60efaa40U, 0x719f065eU, 0x6e1051bdU, 0x218af93eU, 0xdd063d96U, 0x3e05aeddU, 0xe6bd464dU, 0x548db591U, 0xc45d0571U, 0x06d46f04U, 0x5015ff60U, 0x98fb2419U, 0xbde997d6U, 0x4043cc89U, 0xd99e7767U, 0xe842bdb0U, 0x898b8807U, 0x195b38e7U, 0xc8eedb79U, 0x7c0a47a1U, 0x420fe97cU, 0x841ec9f8U, 0x00000000U, 0x80868309U, 0x2bed4832U, 0x1170ac1eU, 0x5a724e6cU, 0x0efffbfdU, 0x8538560fU, 0xaed51e3dU, 0x2d392736U, 0x0fd9640aU, 0x5ca62168U, 0x5b54d19bU, 0x362e3a24U, 0x0a67b10cU, 0x57e70f93U, 0xee96d2b4U, 0x9b919e1bU, 0xc0c54f80U, 0xdc20a261U, 0x774b695aU, 0x121a161cU, 0x93ba0ae2U, 0xa02ae5c0U, 0x22e0433cU, 0x1b171d12U, 0x090d0b0eU, 0x8bc7adf2U, 0xb6a8b92dU, 0x1ea9c814U, 0xf1198557U, 0x75074cafU, 0x99ddbbeeU, 0x7f60fda3U, 0x01269ff7U, 0x72f5bc5cU, 0x663bc544U, 0xfb7e345bU, 0x4329768bU, 0x23c6dccbU, 0xedfc68b6U, 0xe4f163b8U, 0x31dccad7U, 0x63851042U, 0x97224013U, 0xc6112084U, 0x4a247d85U, 0xbb3df8d2U, 0xf93211aeU, 0x29a16dc7U, 0x9e2f4b1dU, 0xb230f3dcU, 0x8652ec0dU, 0xc1e3d077U, 0xb3166c2bU, 0x70b999a9U, 0x9448fa11U, 0xe9642247U, 0xfc8cc4a8U, 0xf03f1aa0U, 0x7d2cd856U, 0x3390ef22U, 0x494ec787U, 0x38d1c1d9U, 0xcaa2fe8cU, 0xd40b3698U, 0xf581cfa6U, 0x7ade28a5U, 0xb78e26daU, 0xadbfa43fU, 0x3a9de42cU, 0x78920d50U, 0x5fcc9b6aU, 0x7e466254U, 0x8d13c2f6U, 0xd8b8e890U, 0x39f75e2eU, 0xc3aff582U, 0x5d80be9fU, 0xd0937c69U, 0xd52da96fU, 0x2512b3cfU, 0xac993bc8U, 0x187da710U, 0x9c636ee8U, 0x3bbb7bdbU, 0x267809cdU, 0x5918f46eU, 
0x9ab701ecU, 0x4f9aa883U, 0x956e65e6U, 0xffe67eaaU, 0xbccf0821U, 0x15e8e6efU, 0xe79bd9baU, 0x6f36ce4aU, 0x9f09d4eaU, 0xb07cd629U, 0xa4b2af31U, 0x3f23312aU, 0xa59430c6U, 0xa266c035U, 0x4ebc3774U, 0x82caa6fcU, 0x90d0b0e0U, 0xa7d81533U, 0x04984af1U, 0xecdaf741U, 0xcd500e7fU, 0x91f62f17U, 0x4dd68d76U, 0xefb04d43U, 0xaa4d54ccU, 0x9604dfe4U, 0xd1b5e39eU, 0x6a881b4cU, 0x2c1fb8c1U, 0x65517f46U, 0x5eea049dU, 0x8c355d01U, 0x877473faU, 0x0b412efbU, 0x671d5ab3U, 0xdbd25292U, 0x105633e9U, 0xd647136dU, 0xd7618c9aU, 0xa10c7a37U, 0xf8148e59U, 0x133c89ebU, 0xa927eeceU, 0x61c935b7U, 0x1ce5ede1U, 0x47b13c7aU, 0xd2df599cU, 0xf2733f55U, 0x14ce7918U, 0xc737bf73U, 0xf7cdea53U, 0xfdaa5b5fU, 0x3d6f14dfU, 0x44db8678U, 0xaff381caU, 0x68c43eb9U, 0x24342c38U, 0xa3405fc2U, 0x1dc37216U, 0xe2250cbcU, 0x3c498b28U, 0x0d9541ffU, 0xa8017139U, 0x0cb3de08U, 0xb4e49cd8U, 0x56c19064U, 0xcb84617bU, 0x32b670d5U, 0x6c5c7448U, 0xb85742d0U }; static const uint32_t Td4[256] = { 0x52525252U, 0x09090909U, 0x6a6a6a6aU, 0xd5d5d5d5U, 0x30303030U, 0x36363636U, 0xa5a5a5a5U, 0x38383838U, 0xbfbfbfbfU, 0x40404040U, 0xa3a3a3a3U, 0x9e9e9e9eU, 0x81818181U, 0xf3f3f3f3U, 0xd7d7d7d7U, 0xfbfbfbfbU, 0x7c7c7c7cU, 0xe3e3e3e3U, 0x39393939U, 0x82828282U, 0x9b9b9b9bU, 0x2f2f2f2fU, 0xffffffffU, 0x87878787U, 0x34343434U, 0x8e8e8e8eU, 0x43434343U, 0x44444444U, 0xc4c4c4c4U, 0xdedededeU, 0xe9e9e9e9U, 0xcbcbcbcbU, 0x54545454U, 0x7b7b7b7bU, 0x94949494U, 0x32323232U, 0xa6a6a6a6U, 0xc2c2c2c2U, 0x23232323U, 0x3d3d3d3dU, 0xeeeeeeeeU, 0x4c4c4c4cU, 0x95959595U, 0x0b0b0b0bU, 0x42424242U, 0xfafafafaU, 0xc3c3c3c3U, 0x4e4e4e4eU, 0x08080808U, 0x2e2e2e2eU, 0xa1a1a1a1U, 0x66666666U, 0x28282828U, 0xd9d9d9d9U, 0x24242424U, 0xb2b2b2b2U, 0x76767676U, 0x5b5b5b5bU, 0xa2a2a2a2U, 0x49494949U, 0x6d6d6d6dU, 0x8b8b8b8bU, 0xd1d1d1d1U, 0x25252525U, 0x72727272U, 0xf8f8f8f8U, 0xf6f6f6f6U, 0x64646464U, 0x86868686U, 0x68686868U, 0x98989898U, 0x16161616U, 0xd4d4d4d4U, 0xa4a4a4a4U, 0x5c5c5c5cU, 0xccccccccU, 0x5d5d5d5dU, 0x65656565U, 0xb6b6b6b6U, 0x92929292U, 0x6c6c6c6cU, 0x70707070U, 0x48484848U, 0x50505050U, 0xfdfdfdfdU, 0xededededU, 0xb9b9b9b9U, 0xdadadadaU, 0x5e5e5e5eU, 0x15151515U, 0x46464646U, 0x57575757U, 0xa7a7a7a7U, 0x8d8d8d8dU, 0x9d9d9d9dU, 0x84848484U, 0x90909090U, 0xd8d8d8d8U, 0xababababU, 0x00000000U, 0x8c8c8c8cU, 0xbcbcbcbcU, 0xd3d3d3d3U, 0x0a0a0a0aU, 0xf7f7f7f7U, 0xe4e4e4e4U, 0x58585858U, 0x05050505U, 0xb8b8b8b8U, 0xb3b3b3b3U, 0x45454545U, 0x06060606U, 0xd0d0d0d0U, 0x2c2c2c2cU, 0x1e1e1e1eU, 0x8f8f8f8fU, 0xcacacacaU, 0x3f3f3f3fU, 0x0f0f0f0fU, 0x02020202U, 0xc1c1c1c1U, 0xafafafafU, 0xbdbdbdbdU, 0x03030303U, 0x01010101U, 0x13131313U, 0x8a8a8a8aU, 0x6b6b6b6bU, 0x3a3a3a3aU, 0x91919191U, 0x11111111U, 0x41414141U, 0x4f4f4f4fU, 0x67676767U, 0xdcdcdcdcU, 0xeaeaeaeaU, 0x97979797U, 0xf2f2f2f2U, 0xcfcfcfcfU, 0xcecececeU, 0xf0f0f0f0U, 0xb4b4b4b4U, 0xe6e6e6e6U, 0x73737373U, 0x96969696U, 0xacacacacU, 0x74747474U, 0x22222222U, 0xe7e7e7e7U, 0xadadadadU, 0x35353535U, 0x85858585U, 0xe2e2e2e2U, 0xf9f9f9f9U, 0x37373737U, 0xe8e8e8e8U, 0x1c1c1c1cU, 0x75757575U, 0xdfdfdfdfU, 0x6e6e6e6eU, 0x47474747U, 0xf1f1f1f1U, 0x1a1a1a1aU, 0x71717171U, 0x1d1d1d1dU, 0x29292929U, 0xc5c5c5c5U, 0x89898989U, 0x6f6f6f6fU, 0xb7b7b7b7U, 0x62626262U, 0x0e0e0e0eU, 0xaaaaaaaaU, 0x18181818U, 0xbebebebeU, 0x1b1b1b1bU, 0xfcfcfcfcU, 0x56565656U, 0x3e3e3e3eU, 0x4b4b4b4bU, 0xc6c6c6c6U, 0xd2d2d2d2U, 0x79797979U, 0x20202020U, 0x9a9a9a9aU, 0xdbdbdbdbU, 0xc0c0c0c0U, 0xfefefefeU, 0x78787878U, 0xcdcdcdcdU, 0x5a5a5a5aU, 0xf4f4f4f4U, 0x1f1f1f1fU, 0xddddddddU, 0xa8a8a8a8U, 0x33333333U, 0x88888888U, 0x07070707U, 0xc7c7c7c7U, 0x31313131U, 
0xb1b1b1b1U, 0x12121212U, 0x10101010U, 0x59595959U, 0x27272727U, 0x80808080U, 0xececececU, 0x5f5f5f5fU, 0x60606060U, 0x51515151U, 0x7f7f7f7fU, 0xa9a9a9a9U, 0x19191919U, 0xb5b5b5b5U, 0x4a4a4a4aU, 0x0d0d0d0dU, 0x2d2d2d2dU, 0xe5e5e5e5U, 0x7a7a7a7aU, 0x9f9f9f9fU, 0x93939393U, 0xc9c9c9c9U, 0x9c9c9c9cU, 0xefefefefU, 0xa0a0a0a0U, 0xe0e0e0e0U, 0x3b3b3b3bU, 0x4d4d4d4dU, 0xaeaeaeaeU, 0x2a2a2a2aU, 0xf5f5f5f5U, 0xb0b0b0b0U, 0xc8c8c8c8U, 0xebebebebU, 0xbbbbbbbbU, 0x3c3c3c3cU, 0x83838383U, 0x53535353U, 0x99999999U, 0x61616161U, 0x17171717U, 0x2b2b2b2bU, 0x04040404U, 0x7e7e7e7eU, 0xbabababaU, 0x77777777U, 0xd6d6d6d6U, 0x26262626U, 0xe1e1e1e1U, 0x69696969U, 0x14141414U, 0x63636363U, 0x55555555U, 0x21212121U, 0x0c0c0c0cU, 0x7d7d7d7dU }; /* Rcon is Round Constant; used for encryption key expansion */ static const uint32_t rcon[RC_LENGTH] = { /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */ 0x01000000, 0x02000000, 0x04000000, 0x08000000, 0x10000000, 0x20000000, 0x40000000, 0x80000000, 0x1B000000, 0x36000000 }; /* * Expand the cipher key into the encryption key schedule. * * Return the number of rounds for the given cipher key size. * The size of the key schedule depends on the number of rounds * (which can be computed from the size of the key), i.e. 4*(Nr + 1). * * Parameters: * rk AES key schedule 32-bit array to be initialized * cipherKey User key * keyBits AES key size (128, 192, or 256 bits) */ static int rijndael_key_setup_enc_raw(uint32_t rk[], const uint32_t cipherKey[], int keyBits) { int i = 0; uint32_t temp; rk[0] = cipherKey[0]; rk[1] = cipherKey[1]; rk[2] = cipherKey[2]; rk[3] = cipherKey[3]; if (keyBits == 128) { for (;;) { temp = rk[3]; rk[4] = rk[0] ^ (Te4[(temp >> 16) & 0xff] & 0xff000000) ^ (Te4[(temp >> 8) & 0xff] & 0x00ff0000) ^ (Te4[temp & 0xff] & 0x0000ff00) ^ (Te4[temp >> 24] & 0x000000ff) ^ rcon[i]; rk[5] = rk[1] ^ rk[4]; rk[6] = rk[2] ^ rk[5]; rk[7] = rk[3] ^ rk[6]; if (++i == 10) { return (10); } rk += 4; } } rk[4] = cipherKey[4]; rk[5] = cipherKey[5]; if (keyBits == 192) { for (;;) { temp = rk[5]; rk[6] = rk[0] ^ (Te4[(temp >> 16) & 0xff] & 0xff000000) ^ (Te4[(temp >> 8) & 0xff] & 0x00ff0000) ^ (Te4[temp & 0xff] & 0x0000ff00) ^ (Te4[temp >> 24] & 0x000000ff) ^ rcon[i]; rk[7] = rk[1] ^ rk[6]; rk[8] = rk[2] ^ rk[7]; rk[9] = rk[3] ^ rk[8]; if (++i == 8) { return (12); } rk[10] = rk[4] ^ rk[9]; rk[11] = rk[5] ^ rk[10]; rk += 6; } } rk[6] = cipherKey[6]; rk[7] = cipherKey[7]; if (keyBits == 256) { for (;;) { temp = rk[7]; rk[8] = rk[0] ^ (Te4[(temp >> 16) & 0xff] & 0xff000000) ^ (Te4[(temp >> 8) & 0xff] & 0x00ff0000) ^ (Te4[temp & 0xff] & 0x0000ff00) ^ (Te4[temp >> 24] & 0x000000ff) ^ rcon[i]; rk[9] = rk[1] ^ rk[8]; rk[10] = rk[2] ^ rk[9]; rk[11] = rk[3] ^ rk[10]; if (++i == 7) { return (14); } temp = rk[11]; rk[12] = rk[4] ^ (Te4[temp >> 24] & 0xff000000) ^ (Te4[(temp >> 16) & 0xff] & 0x00ff0000) ^ (Te4[(temp >> 8) & 0xff] & 0x0000ff00) ^ (Te4[temp & 0xff] & 0x000000ff); rk[13] = rk[5] ^ rk[12]; rk[14] = rk[6] ^ rk[13]; rk[15] = rk[7] ^ rk[14]; rk += 8; } } return (0); } #endif /* !__amd64 */ #if defined(__amd64) /* * Expand the 32-bit AES cipher key array into the encryption and decryption * key schedules. 
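 * On AES-NI-capable Intel hardware the expansion below runs with kernel
 * preemption disabled (KPREEMPT_DISABLE) because, as noted at the top of
 * this file, kernel threads do not save and restore the XMM registers that
 * the AES-NI routines use.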
 *
 * Parameters:
 * key		AES key schedule to be initialized
 * keyarr32	User key
 * keyBits	AES key size (128, 192, or 256 bits)
 */
static void
aes_setupkeys(aes_key_t *key, const uint32_t *keyarr32, int keybits)
{
	if (intel_aes_instructions_present()) {
		key->flags = INTEL_AES_NI_CAPABLE;
		KPREEMPT_DISABLE;
		key->nr = rijndael_key_setup_enc_intel(&(key->encr_ks.ks32[0]),
		    keyarr32, keybits);
		key->nr = rijndael_key_setup_dec_intel(&(key->decr_ks.ks32[0]),
		    keyarr32, keybits);
		KPREEMPT_ENABLE;
	} else {
		key->flags = 0;
		key->nr = rijndael_key_setup_enc_amd64(&(key->encr_ks.ks32[0]),
		    keyarr32, keybits);
		key->nr = rijndael_key_setup_dec_amd64(&(key->decr_ks.ks32[0]),
		    keyarr32, keybits);
	}

	key->type = AES_32BIT_KS;
}

/*
 * Encrypt one block of data.  The block is assumed to be an array
 * of four uint32_t values, so a copy for alignment (and byte-order
 * reversal for little-endian systems) might be necessary on the
 * input and output byte streams.
 * The size of the key schedule depends on the number of rounds
 * (which can be computed from the size of the key), i.e. 4*(Nr + 1).
 *
 * Parameters:
 * rk		Key schedule, of aes_ks_t (60 32-bit integers)
 * Nr		Number of rounds
 * pt		Input block (plain text)
 * ct		Output block (crypto text).  Can overlap with pt
 * flags	Indicates whether we're on Intel AES-NI-capable hardware
 */
static void
rijndael_encrypt(const uint32_t rk[], int Nr, const uint32_t pt[4],
-    uint32_t ct[4], int flags) {
+    uint32_t ct[4], int flags)
+{
	if (flags & INTEL_AES_NI_CAPABLE) {
		KPREEMPT_DISABLE;
		aes_encrypt_intel(rk, Nr, pt, ct);
		KPREEMPT_ENABLE;
	} else {
		aes_encrypt_amd64(rk, Nr, pt, ct);
	}
}

/*
 * Decrypt one block of data.  The block is assumed to be an array
 * of four uint32_t values, so a copy for alignment (and byte-order
 * reversal for little-endian systems) might be necessary on the
 * input and output byte streams.
 * The size of the key schedule depends on the number of rounds
 * (which can be computed from the size of the key), i.e. 4*(Nr + 1).
 *
 * Parameters:
 * rk		Key schedule, of aes_ks_t (60 32-bit integers)
 * Nr		Number of rounds
 * ct		Input block (crypto text)
 * pt		Output block (plain text).  Can overlap with ct
 * flags	Indicates whether we're on Intel AES-NI-capable hardware
 */
static void
rijndael_decrypt(const uint32_t rk[], int Nr, const uint32_t ct[4],
-    uint32_t pt[4], int flags) {
+    uint32_t pt[4], int flags)
+{
	if (flags & INTEL_AES_NI_CAPABLE) {
		KPREEMPT_DISABLE;
		aes_decrypt_intel(rk, Nr, ct, pt);
		KPREEMPT_ENABLE;
	} else {
		aes_decrypt_amd64(rk, Nr, ct, pt);
	}
}

#else /* generic C implementation */

/*
 * Expand the cipher key into the decryption key schedule.
 * Return the number of rounds for the given cipher key size.
 * The size of the key schedule depends on the number of rounds
 * (which can be computed from the size of the key), i.e. 4*(Nr + 1).
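 * For example, worked out: a 128-bit key gives Nr = 10, so the schedule
 * holds 4*(10+1) = 44 32-bit words; a 192-bit key gives Nr = 12 (52 words);
 * a 256-bit key gives Nr = 14 (60 words), matching the 60-word aes_ks_t
 * mentioned in the encrypt/decrypt headers above.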
* * Parameters: * rk AES key schedule 32-bit array to be initialized * cipherKey User key * keyBits AES key size (128, 192, or 256 bits) */ static int rijndael_key_setup_dec(uint32_t rk[], const uint32_t cipherKey[], int keyBits) { int Nr, i, j; uint32_t temp; /* expand the cipher key: */ Nr = rijndael_key_setup_enc_raw(rk, cipherKey, keyBits); /* invert the order of the round keys: */ for (i = 0, j = 4 * Nr; i < j; i += 4, j -= 4) { temp = rk[i]; rk[i] = rk[j]; rk[j] = temp; temp = rk[i + 1]; rk[i + 1] = rk[j + 1]; rk[j + 1] = temp; temp = rk[i + 2]; rk[i + 2] = rk[j + 2]; rk[j + 2] = temp; temp = rk[i + 3]; rk[i + 3] = rk[j + 3]; rk[j + 3] = temp; } /* * apply the inverse MixColumn transform to all * round keys but the first and the last: */ for (i = 1; i < Nr; i++) { rk += 4; rk[0] = Td0[Te4[rk[0] >> 24] & 0xff] ^ Td1[Te4[(rk[0] >> 16) & 0xff] & 0xff] ^ Td2[Te4[(rk[0] >> 8) & 0xff] & 0xff] ^ Td3[Te4[rk[0] & 0xff] & 0xff]; rk[1] = Td0[Te4[rk[1] >> 24] & 0xff] ^ Td1[Te4[(rk[1] >> 16) & 0xff] & 0xff] ^ Td2[Te4[(rk[1] >> 8) & 0xff] & 0xff] ^ Td3[Te4[rk[1] & 0xff] & 0xff]; rk[2] = Td0[Te4[rk[2] >> 24] & 0xff] ^ Td1[Te4[(rk[2] >> 16) & 0xff] & 0xff] ^ Td2[Te4[(rk[2] >> 8) & 0xff] & 0xff] ^ Td3[Te4[rk[2] & 0xff] & 0xff]; rk[3] = Td0[Te4[rk[3] >> 24] & 0xff] ^ Td1[Te4[(rk[3] >> 16) & 0xff] & 0xff] ^ Td2[Te4[(rk[3] >> 8) & 0xff] & 0xff] ^ Td3[Te4[rk[3] & 0xff] & 0xff]; } return (Nr); } /* * Expand the 32-bit AES cipher key array into the encryption and decryption * key schedules. * * Parameters: * key AES key schedule to be initialized * keyarr32 User key * keyBits AES key size (128, 192, or 256 bits) */ static void aes_setupkeys(aes_key_t *key, const uint32_t *keyarr32, int keybits) { key->nr = rijndael_key_setup_enc(&(key->encr_ks.ks32[0]), keyarr32, keybits); key->nr = rijndael_key_setup_dec(&(key->decr_ks.ks32[0]), keyarr32, keybits); key->type = AES_32BIT_KS; } /* * Encrypt one block of data. The block is assumed to be an array * of four uint32_t values, so copy for alignment (and byte-order * reversal for little endian systems might be necessary on the * input and output byte streams. * The size of the key schedule depends on the number of rounds * (which can be computed from the size of the key), i.e. 4*(Nr + 1). * * Parameters: * rk Key schedule, of aes_ks_t (60 32-bit integers) * Nr Number of rounds * pt Input block (plain text) * ct Output block (crypto text). 
Can overlap with pt */ static void rijndael_encrypt(const uint32_t rk[], int Nr, const uint32_t pt[4], uint32_t ct[4]) { uint32_t s0, s1, s2, s3, t0, t1, t2, t3; int r; /* * map byte array block to cipher state * and add initial round key: */ s0 = pt[0] ^ rk[0]; s1 = pt[1] ^ rk[1]; s2 = pt[2] ^ rk[2]; s3 = pt[3] ^ rk[3]; /* * Nr - 1 full rounds: */ r = Nr >> 1; for (;;) { t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[4]; t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[5]; t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[6]; t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[7]; rk += 8; if (--r == 0) { break; } s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[0]; s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[1]; s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[2]; s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[3]; } /* * apply last round and * map cipher state to byte array block: */ s0 = (Te4[(t0 >> 24)] & 0xff000000) ^ (Te4[(t1 >> 16) & 0xff] & 0x00ff0000) ^ (Te4[(t2 >> 8) & 0xff] & 0x0000ff00) ^ (Te4[t3 & 0xff] & 0x000000ff) ^ rk[0]; ct[0] = s0; s1 = (Te4[(t1 >> 24)] & 0xff000000) ^ (Te4[(t2 >> 16) & 0xff] & 0x00ff0000) ^ (Te4[(t3 >> 8) & 0xff] & 0x0000ff00) ^ (Te4[t0 & 0xff] & 0x000000ff) ^ rk[1]; ct[1] = s1; s2 = (Te4[(t2 >> 24)] & 0xff000000) ^ (Te4[(t3 >> 16) & 0xff] & 0x00ff0000) ^ (Te4[(t0 >> 8) & 0xff] & 0x0000ff00) ^ (Te4[t1 & 0xff] & 0x000000ff) ^ rk[2]; ct[2] = s2; s3 = (Te4[(t3 >> 24)] & 0xff000000) ^ (Te4[(t0 >> 16) & 0xff] & 0x00ff0000) ^ (Te4[(t1 >> 8) & 0xff] & 0x0000ff00) ^ (Te4[t2 & 0xff] & 0x000000ff) ^ rk[3]; ct[3] = s3; } /* * Decrypt one block of data. The block is assumed to be an array * of four uint32_t values, so copy for alignment (and byte-order * reversal for little endian systems might be necessary on the * input and output byte streams. * The size of the key schedule depends on the number of rounds * (which can be computed from the size of the key), i.e. 4*(Nr + 1). * * Parameters: * rk Key schedule, of aes_ks_t (60 32-bit integers) * Nr Number of rounds * ct Input block (crypto text) * pt Output block (plain text). 
Can overlap with ct
 */
static void
rijndael_decrypt(const uint32_t rk[], int Nr, const uint32_t ct[4],
    uint32_t pt[4])
{
	uint32_t s0, s1, s2, s3, t0, t1, t2, t3;
	int r;

	/*
	 * map byte array block to cipher state
	 * and add initial round key:
	 */
	s0 = ct[0] ^ rk[0];
	s1 = ct[1] ^ rk[1];
	s2 = ct[2] ^ rk[2];
	s3 = ct[3] ^ rk[3];

	/*
	 * Nr - 1 full rounds:
	 */
	r = Nr >> 1;
	for (;;) {
		t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^
		    Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[4];
		t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^
		    Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[5];
		t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^
		    Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[6];
		t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^
		    Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[7];

		rk += 8;

		if (--r == 0) {
			break;
		}

		s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^
		    Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[0];
		s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^
		    Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[1];
		s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^
		    Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[2];
		s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^
		    Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[3];
	}

	/*
	 * apply last round and
	 * map cipher state to byte array block:
	 */
	s0 = (Td4[t0 >> 24] & 0xff000000) ^
	    (Td4[(t3 >> 16) & 0xff] & 0x00ff0000) ^
	    (Td4[(t2 >> 8) & 0xff] & 0x0000ff00) ^
	    (Td4[t1 & 0xff] & 0x000000ff) ^ rk[0];
	pt[0] = s0;
	s1 = (Td4[t1 >> 24] & 0xff000000) ^
	    (Td4[(t0 >> 16) & 0xff] & 0x00ff0000) ^
	    (Td4[(t3 >> 8) & 0xff] & 0x0000ff00) ^
	    (Td4[t2 & 0xff] & 0x000000ff) ^ rk[1];
	pt[1] = s1;
	s2 = (Td4[t2 >> 24] & 0xff000000) ^
	    (Td4[(t1 >> 16) & 0xff] & 0x00ff0000) ^
	    (Td4[(t0 >> 8) & 0xff] & 0x0000ff00) ^
	    (Td4[t3 & 0xff] & 0x000000ff) ^ rk[2];
	pt[2] = s2;
	s3 = (Td4[t3 >> 24] & 0xff000000) ^
	    (Td4[(t2 >> 16) & 0xff] & 0x00ff0000) ^
	    (Td4[(t1 >> 8) & 0xff] & 0x0000ff00) ^
	    (Td4[t0 & 0xff] & 0x000000ff) ^ rk[3];
	pt[3] = s3;
}
#endif	/* __amd64 */

/*
 * Initialize AES encryption and decryption key schedules.
 *
 * Parameters:
 * cipherKey	User key
 * keyBits	AES key size (128, 192, or 256 bits)
 * keysched	AES key schedule to be initialized, of type aes_key_t.
 *		Allocated by aes_alloc_keysched().
 */
void
aes_init_keysched(const uint8_t *cipherKey, uint_t keyBits, void *keysched)
{
	aes_key_t *newbie = keysched;
	uint_t keysize, i, j;
	union {
		uint64_t ka64[4];
		uint32_t ka32[8];
	} keyarr;

	switch (keyBits) {
	case 128:
		newbie->nr = 10;
		break;
	case 192:
		newbie->nr = 12;
		break;
	case 256:
		newbie->nr = 14;
		break;
	default:
		/* should never get here */
		return;
	}
	keysize = CRYPTO_BITS2BYTES(keyBits);

	/*
	 * For _LITTLE_ENDIAN machines (except AMD64), reverse every
	 * 4 bytes in the key.  On _BIG_ENDIAN and AMD64, copy the key
	 * without reversing bytes.
	 * For AMD64, do not byte swap for aes_setupkeys().
	 *
	 * SPARCv8/v9 uses a key schedule array with 64-bit elements.
	 * X86/AMD64 uses a key schedule array with 32-bit elements.
	 */
#ifndef	AES_BYTE_SWAP
	if (IS_P2ALIGNED(cipherKey, sizeof (uint64_t))) {
		for (i = 0, j = 0; j < keysize; i++, j += 8) {
			/* LINTED: pointer alignment */
			keyarr.ka64[i] = *((uint64_t *)&cipherKey[j]);
		}
	} else {
		bcopy(cipherKey, keyarr.ka32, keysize);
	}

#else	/* byte swap */
	for (i = 0, j = 0; j < keysize; i++, j += 4) {
		keyarr.ka32[i] = htonl(*(uint32_t *)(void *)&cipherKey[j]);
	}
#endif

	aes_setupkeys(newbie, keyarr.ka32, keyBits);
}

/*
 * Encrypt one block using AES.
 * Align if needed and (for x86 32-bit only) byte-swap.
 *
 * Parameters:
 * ks	Key schedule, of type aes_key_t
 * pt	Input block (plain text)
 * ct	Output block (crypto text).  Can overlap with pt
 */
int
aes_encrypt_block(const void *ks, const uint8_t *pt, uint8_t *ct)
{
	aes_key_t *ksch = (aes_key_t *)ks;

#ifndef	AES_BYTE_SWAP
	if (IS_P2ALIGNED2(pt, ct, sizeof (uint32_t))) {
		/* LINTED: pointer alignment */
		AES_ENCRYPT_IMPL(&ksch->encr_ks.ks32[0], ksch->nr,
		    /* LINTED: pointer alignment */
		    (uint32_t *)pt, (uint32_t *)ct, ksch->flags);
	} else {
#endif
		uint32_t buffer[AES_BLOCK_LEN / sizeof (uint32_t)];

		/* Copy input block into buffer */
#ifndef	AES_BYTE_SWAP
		bcopy(pt, &buffer, AES_BLOCK_LEN);
#else	/* byte swap */
		buffer[0] = htonl(*(uint32_t *)(void *)&pt[0]);
		buffer[1] = htonl(*(uint32_t *)(void *)&pt[4]);
		buffer[2] = htonl(*(uint32_t *)(void *)&pt[8]);
		buffer[3] = htonl(*(uint32_t *)(void *)&pt[12]);
#endif

		AES_ENCRYPT_IMPL(&ksch->encr_ks.ks32[0], ksch->nr,
		    buffer, buffer, ksch->flags);

		/* Copy result from buffer to output block */
#ifndef	AES_BYTE_SWAP
		bcopy(&buffer, ct, AES_BLOCK_LEN);
	}
#else	/* byte swap */
		*(uint32_t *)(void *)&ct[0] = htonl(buffer[0]);
		*(uint32_t *)(void *)&ct[4] = htonl(buffer[1]);
		*(uint32_t *)(void *)&ct[8] = htonl(buffer[2]);
		*(uint32_t *)(void *)&ct[12] = htonl(buffer[3]);
#endif
	return (CRYPTO_SUCCESS);
}

/*
 * Decrypt one block using AES.
 * Align and byte-swap if needed.
 *
 * Parameters:
 * ks	Key schedule, of type aes_key_t
 * ct	Input block (crypto text)
 * pt	Output block (plain text).  Can overlap with ct
 */
int
aes_decrypt_block(const void *ks, const uint8_t *ct, uint8_t *pt)
{
	aes_key_t *ksch = (aes_key_t *)ks;

#ifndef	AES_BYTE_SWAP
	if (IS_P2ALIGNED2(ct, pt, sizeof (uint32_t))) {
		/* LINTED: pointer alignment */
		AES_DECRYPT_IMPL(&ksch->decr_ks.ks32[0], ksch->nr,
		    /* LINTED: pointer alignment */
		    (uint32_t *)ct, (uint32_t *)pt, ksch->flags);
	} else {
#endif
		uint32_t buffer[AES_BLOCK_LEN / sizeof (uint32_t)];

		/* Copy input block into buffer */
#ifndef	AES_BYTE_SWAP
		bcopy(ct, &buffer, AES_BLOCK_LEN);
#else	/* byte swap */
		buffer[0] = htonl(*(uint32_t *)(void *)&ct[0]);
		buffer[1] = htonl(*(uint32_t *)(void *)&ct[4]);
		buffer[2] = htonl(*(uint32_t *)(void *)&ct[8]);
		buffer[3] = htonl(*(uint32_t *)(void *)&ct[12]);
#endif

		AES_DECRYPT_IMPL(&ksch->decr_ks.ks32[0], ksch->nr,
		    buffer, buffer, ksch->flags);

		/* Copy result from buffer to output block */
#ifndef	AES_BYTE_SWAP
		bcopy(&buffer, pt, AES_BLOCK_LEN);
	}
#else	/* byte swap */
		*(uint32_t *)(void *)&pt[0] = htonl(buffer[0]);
		*(uint32_t *)(void *)&pt[4] = htonl(buffer[1]);
		*(uint32_t *)(void *)&pt[8] = htonl(buffer[2]);
		*(uint32_t *)(void *)&pt[12] = htonl(buffer[3]);
#endif
	return (CRYPTO_SUCCESS);
}

/*
 * Allocate key schedule for AES.
 *
 * Return the pointer and set size to the number of bytes allocated.
 * Memory allocated must be freed by the caller when done.
 *
 * Parameters:
 * size		Size of key schedule allocated, in bytes
 * kmflag	Flag passed to kmem_alloc(9F); ignored in userland.
 */
/* ARGSUSED */
void *
aes_alloc_keysched(size_t *size, int kmflag)
{
	aes_key_t *keysched;

	keysched = (aes_key_t *)kmem_alloc(sizeof (aes_key_t), kmflag);
	if (keysched != NULL) {
		*size = sizeof (aes_key_t);
		return (keysched);
	}
	return (NULL);
}

#ifdef	__amd64

#define	INTEL_AESNI_FLAG	(1 << 25)

/*
 * Return 1 if executing on Intel with AES-NI instructions,
 * otherwise 0 (i.e., Intel without AES-NI or AMD64).
 * Cache the result, as the CPU can't change.
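 *
 * Editor's note: leaf-0 CPUID returns the vendor string "GenuineIntel"
 * split across registers as EBX = "Genu", EDX = "ineI", ECX = "ntel",
 * which is the order the memcmp() calls below check; AES-NI support is
 * reported in CPUID leaf 1, ECX bit 25, matching INTEL_AESNI_FLAG above.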
*/ static int intel_aes_instructions_present(void) { static int cached_result = -1; unsigned eax, ebx, ecx, edx; unsigned func, subfunc; if (cached_result == -1) { /* first time */ /* check for an intel cpu */ func = 0; subfunc = 0; __asm__ __volatile__( "cpuid" : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx) : "a"(func), "c"(subfunc)); if (memcmp((char *)(&ebx), "Genu", 4) == 0 && memcmp((char *)(&edx), "ineI", 4) == 0 && memcmp((char *)(&ecx), "ntel", 4) == 0) { func = 1; subfunc = 0; /* check for aes-ni instruction set */ __asm__ __volatile__( "cpuid" : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx) : "a"(func), "c"(subfunc)); cached_result = !!(ecx & INTEL_AESNI_FLAG); } else { cached_result = 0; } } return (cached_result); } #endif /* __amd64 */ diff --git a/module/icp/algs/modes/ctr.c b/module/icp/algs/modes/ctr.c index 77ba28dddfc0..e3b0e1238232 100644 --- a/module/icp/algs/modes/ctr.c +++ b/module/icp/algs/modes/ctr.c @@ -1,238 +1,238 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #include #include #include #include #include /* * Encrypt and decrypt multiple blocks of data in counter mode. */ int ctr_mode_contiguous_blocks(ctr_ctx_t *ctx, char *data, size_t length, crypto_data_t *out, size_t block_size, int (*cipher)(const void *ks, const uint8_t *pt, uint8_t *ct), void (*xor_block)(uint8_t *, uint8_t *)) { size_t remainder = length; size_t need = 0; uint8_t *datap = (uint8_t *)data; uint8_t *blockp; uint8_t *lastp; void *iov_or_mp; offset_t offset; uint8_t *out_data_1; uint8_t *out_data_2; size_t out_data_1_len; uint64_t lower_counter, upper_counter; if (length + ctx->ctr_remainder_len < block_size) { /* accumulate bytes here and return */ bcopy(datap, (uint8_t *)ctx->ctr_remainder + ctx->ctr_remainder_len, length); ctx->ctr_remainder_len += length; ctx->ctr_copy_to = datap; return (CRYPTO_SUCCESS); } lastp = (uint8_t *)ctx->ctr_cb; if (out != NULL) crypto_init_ptrs(out, &iov_or_mp, &offset); do { /* Unprocessed data from last call. */ if (ctx->ctr_remainder_len > 0) { need = block_size - ctx->ctr_remainder_len; if (need > remainder) return (CRYPTO_DATA_LEN_RANGE); bcopy(datap, &((uint8_t *)ctx->ctr_remainder) [ctx->ctr_remainder_len], need); blockp = (uint8_t *)ctx->ctr_remainder; } else { blockp = datap; } /* ctr_cb is the counter block */ cipher(ctx->ctr_keysched, (uint8_t *)ctx->ctr_cb, (uint8_t *)ctx->ctr_tmp); lastp = (uint8_t *)ctx->ctr_tmp; /* * Increment Counter. 
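 * The counter occupies only the bits selected by ctr_lower_mask and
 * ctr_upper_mask; the remaining bits of the 128-bit counter block hold
 * the nonce and must not change.  The block is stored big-endian, so each
 * 64-bit half is byte-swapped to host order, incremented, masked, and
 * merged back.  Worked example (editor's illustration, assuming a 96-bit
 * counter field): lower_mask covers all of ctr_cb[1]; once that word
 * wraps from all-ones to zero, the branch below carries the increment
 * into the 32 masked bits of ctr_cb[0].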
*/ lower_counter = ntohll(ctx->ctr_cb[1] & ctx->ctr_lower_mask); lower_counter = htonll(lower_counter + 1); lower_counter &= ctx->ctr_lower_mask; ctx->ctr_cb[1] = (ctx->ctr_cb[1] & ~(ctx->ctr_lower_mask)) | lower_counter; /* wrap around */ if (lower_counter == 0) { upper_counter = ntohll(ctx->ctr_cb[0] & ctx->ctr_upper_mask); upper_counter = htonll(upper_counter + 1); upper_counter &= ctx->ctr_upper_mask; ctx->ctr_cb[0] = (ctx->ctr_cb[0] & ~(ctx->ctr_upper_mask)) | upper_counter; } /* * XOR encrypted counter block with the current clear block. */ xor_block(blockp, lastp); if (out == NULL) { if (ctx->ctr_remainder_len > 0) { bcopy(lastp, ctx->ctr_copy_to, ctx->ctr_remainder_len); bcopy(lastp + ctx->ctr_remainder_len, datap, need); } } else { crypto_get_ptrs(out, &iov_or_mp, &offset, &out_data_1, &out_data_1_len, &out_data_2, block_size); /* copy block to where it belongs */ bcopy(lastp, out_data_1, out_data_1_len); if (out_data_2 != NULL) { bcopy(lastp + out_data_1_len, out_data_2, block_size - out_data_1_len); } /* update offset */ out->cd_offset += block_size; } /* Update pointer to next block of data to be processed. */ if (ctx->ctr_remainder_len != 0) { datap += need; ctx->ctr_remainder_len = 0; } else { datap += block_size; } remainder = (size_t)&data[length] - (size_t)datap; /* Incomplete last block. */ if (remainder > 0 && remainder < block_size) { bcopy(datap, ctx->ctr_remainder, remainder); ctx->ctr_remainder_len = remainder; ctx->ctr_copy_to = datap; goto out; } ctx->ctr_copy_to = NULL; } while (remainder > 0); out: return (CRYPTO_SUCCESS); } int ctr_mode_final(ctr_ctx_t *ctx, crypto_data_t *out, int (*encrypt_block)(const void *, const uint8_t *, uint8_t *)) { uint8_t *lastp; void *iov_or_mp; offset_t offset; uint8_t *out_data_1; uint8_t *out_data_2; size_t out_data_1_len; uint8_t *p; int i; if (out->cd_length < ctx->ctr_remainder_len) return (CRYPTO_DATA_LEN_RANGE); encrypt_block(ctx->ctr_keysched, (uint8_t *)ctx->ctr_cb, (uint8_t *)ctx->ctr_tmp); lastp = (uint8_t *)ctx->ctr_tmp; p = (uint8_t *)ctx->ctr_remainder; for (i = 0; i < ctx->ctr_remainder_len; i++) { p[i] ^= lastp[i]; } crypto_init_ptrs(out, &iov_or_mp, &offset); crypto_get_ptrs(out, &iov_or_mp, &offset, &out_data_1, &out_data_1_len, &out_data_2, ctx->ctr_remainder_len); bcopy(p, out_data_1, out_data_1_len); if (out_data_2 != NULL) { bcopy((uint8_t *)p + out_data_1_len, out_data_2, ctx->ctr_remainder_len - out_data_1_len); } out->cd_offset += ctx->ctr_remainder_len; ctx->ctr_remainder_len = 0; return (CRYPTO_SUCCESS); } int ctr_init_ctx(ctr_ctx_t *ctr_ctx, ulong_t count, uint8_t *cb, -void (*copy_block)(uint8_t *, uint8_t *)) + void (*copy_block)(uint8_t *, uint8_t *)) { uint64_t upper_mask = 0; uint64_t lower_mask = 0; if (count == 0 || count > 128) { return (CRYPTO_MECHANISM_PARAM_INVALID); } /* upper 64 bits of the mask */ if (count >= 64) { count -= 64; upper_mask = (count == 64) ? 
UINT64_MAX : (1ULL << count) - 1; lower_mask = UINT64_MAX; } else { /* now the lower 63 bits */ lower_mask = (1ULL << count) - 1; } ctr_ctx->ctr_lower_mask = htonll(lower_mask); ctr_ctx->ctr_upper_mask = htonll(upper_mask); copy_block(cb, (uchar_t *)ctr_ctx->ctr_cb); ctr_ctx->ctr_lastp = (uint8_t *)&ctr_ctx->ctr_cb[0]; ctr_ctx->ctr_flags |= CTR_MODE; return (CRYPTO_SUCCESS); } /* ARGSUSED */ void * ctr_alloc_ctx(int kmflag) { ctr_ctx_t *ctr_ctx; if ((ctr_ctx = kmem_zalloc(sizeof (ctr_ctx_t), kmflag)) == NULL) return (NULL); ctr_ctx->ctr_flags = CTR_MODE; return (ctr_ctx); } diff --git a/module/icp/algs/skein/skein_block.c b/module/icp/algs/skein/skein_block.c index d2e81196389e..6d85cb7d9e98 100644 --- a/module/icp/algs/skein/skein_block.c +++ b/module/icp/algs/skein/skein_block.c @@ -1,793 +1,790 @@ /* * Implementation of the Skein block functions. * Source code author: Doug Whiting, 2008. * This algorithm and source code is released to the public domain. * Compile-time switches: * SKEIN_USE_ASM -- set bits (256/512/1024) to select which * versions use ASM code for block processing * [default: use C for all block sizes] */ /* Copyright 2013 Doug Whiting. This code is released to the public domain. */ #include #include "skein_impl.h" #include /* for _ILP32 */ #ifndef SKEIN_USE_ASM #define SKEIN_USE_ASM (0) /* default is all C code (no ASM) */ #endif #ifndef SKEIN_LOOP /* * The low-level checksum routines use a lot of stack space. On systems where * small stack frames are enforced (like 32-bit kernel builds), do not unroll * checksum calculations to save stack space. * * Even with no loops unrolled, we can still exceed the 1k stack frame limit * in Skein1024_Process_Block() (it hits 1272 bytes on ARM32). We can * safely ignore it though, since the checksum functions will be called * from a worker thread that won't be using much stack. That's why we have * the #pragma here to ignore the warning. */ #if defined(_ILP32) || defined(__powerpc) /* Assume small stack */ #pragma GCC diagnostic ignored "-Wframe-larger-than=" /* * We're running on 32-bit, so don't unroll loops, to save stack frame space * * Due to the way the calculations on SKEIN_LOOP are done in * Skein_*_Process_Block(), a value of 111 disables loop unrolling * in any of those functions.
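/*
 * Editor's sketch (not part of the diff): a host-order model of the
 * masked counter increment that ctr_mode_contiguous_blocks() performs
 * and that ctr_init_ctx() above parameterizes. The real code keeps the
 * counter block big-endian and byte-swaps with ntohll()/htonll(); here
 * the two 64-bit words are treated as plain host integers. Only the
 * low `count` bits (1..128) of the block may count, so the masks split
 * at the 64-bit word boundary and the upper word is bumped only when
 * the lower one wraps to zero.
 */
#include <stdint.h>

static void
ctr_increment(uint64_t cb[2], uint64_t lower_mask, uint64_t upper_mask)
{
	uint64_t lo = ((cb[1] & lower_mask) + 1) & lower_mask;

	cb[1] = (cb[1] & ~lower_mask) | lo;
	if (lo == 0) {		/* carry into the upper counter word */
		uint64_t hi = ((cb[0] & upper_mask) + 1) & upper_mask;

		cb[0] = (cb[0] & ~upper_mask) | hi;
	}
}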
*/ #define SKEIN_LOOP 111 #else /* We're compiling with large stacks */ #define SKEIN_LOOP 001 /* default: unroll 256 and 512, but not 1024 */ #endif #endif /* some useful definitions for code here */ #define BLK_BITS (WCNT*64) #define KW_TWK_BASE (0) #define KW_KEY_BASE (3) #define ks (kw + KW_KEY_BASE) #define ts (kw + KW_TWK_BASE) /* no debugging in Illumos version */ #define DebugSaveTweak(ctx) /* Skein_256 */ #if !(SKEIN_USE_ASM & 256) - void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx, const uint8_t *blkPtr, size_t blkCnt, size_t byteCntAdd) -{ /* do it in C */ +{ enum { WCNT = SKEIN_256_STATE_WORDS }; #undef RCNT #define RCNT (SKEIN_256_ROUNDS_TOTAL / 8) #ifdef SKEIN_LOOP /* configure how much to unroll the loop */ #define SKEIN_UNROLL_256 (((SKEIN_LOOP) / 100) % 10) #else #define SKEIN_UNROLL_256 (0) #endif #if SKEIN_UNROLL_256 #if (RCNT % SKEIN_UNROLL_256) #error "Invalid SKEIN_UNROLL_256" /* sanity check on unroll count */ #endif size_t r; /* key schedule words : chaining vars + tweak + "rotation" */ uint64_t kw[WCNT + 4 + RCNT * 2]; #else uint64_t kw[WCNT + 4]; /* key schedule words : chaining vars + tweak */ #endif /* local copy of context vars, for speed */ uint64_t X0, X1, X2, X3; uint64_t w[WCNT]; /* local copy of input block */ #ifdef SKEIN_DEBUG /* use for debugging (help compiler put Xn in registers) */ const uint64_t *Xptr[4]; Xptr[0] = &X0; Xptr[1] = &X1; Xptr[2] = &X2; Xptr[3] = &X3; #endif Skein_assert(blkCnt != 0); /* never call with blkCnt == 0! */ ts[0] = ctx->h.T[0]; ts[1] = ctx->h.T[1]; do { /* * this implementation only supports 2**64 input bytes * (no carry out here) */ ts[0] += byteCntAdd; /* update processed length */ /* precompute the key schedule for this block */ ks[0] = ctx->X[0]; ks[1] = ctx->X[1]; ks[2] = ctx->X[2]; ks[3] = ctx->X[3]; ks[4] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^ SKEIN_KS_PARITY; ts[2] = ts[0] ^ ts[1]; /* get input block in little-endian format */ Skein_Get64_LSB_First(w, blkPtr, WCNT); DebugSaveTweak(ctx); Skein_Show_Block(BLK_BITS, &ctx->h, ctx->X, blkPtr, w, ks, ts); X0 = w[0] + ks[0]; /* do the first full key injection */ X1 = w[1] + ks[1] + ts[0]; X2 = w[2] + ks[2] + ts[1]; X3 = w[3] + ks[3]; Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INITIAL, Xptr); /* show starting state values */ blkPtr += SKEIN_256_BLOCK_BYTES; /* run the rounds */ #define Round256(p0, p1, p2, p3, ROT, rNum) \ - X##p0 += X##p1; X##p1 = RotL_64(X##p1, ROT##_0); X##p1 ^= X##p0; \ - X##p2 += X##p3; X##p3 = RotL_64(X##p3, ROT##_1); X##p3 ^= X##p2; \ + X##p0 += X##p1; X##p1 = RotL_64(X##p1, ROT##_0); X##p1 ^= X##p0; \ + X##p2 += X##p3; X##p3 = RotL_64(X##p3, ROT##_1); X##p3 ^= X##p2; \ #if SKEIN_UNROLL_256 == 0 #define R256(p0, p1, p2, p3, ROT, rNum) /* fully unrolled */ \ - Round256(p0, p1, p2, p3, ROT, rNum) \ - Skein_Show_R_Ptr(BLK_BITS, &ctx->h, rNum, Xptr); + Round256(p0, p1, p2, p3, ROT, rNum) \ + Skein_Show_R_Ptr(BLK_BITS, &ctx->h, rNum, Xptr); #define I256(R) \ - X0 += ks[((R) + 1) % 5]; /* inject the key schedule value */ \ - X1 += ks[((R) + 2) % 5] + ts[((R) + 1) % 3]; \ - X2 += ks[((R) + 3) % 5] + ts[((R) + 2) % 3]; \ - X3 += ks[((R) + 4) % 5] + (R) + 1; \ - Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr); + X0 += ks[((R) + 1) % 5]; /* inject the key schedule value */ \ + X1 += ks[((R) + 2) % 5] + ts[((R) + 1) % 3]; \ + X2 += ks[((R) + 3) % 5] + ts[((R) + 2) % 3]; \ + X3 += ks[((R) + 4) % 5] + (R) + 1; \ + Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr); #else /* looping version */ #define R256(p0, p1, p2, p3, ROT, rNum) 
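/*
 * Editor's sketch (not part of the diff): how the SKEIN_LOOP digits are
 * consumed. Each decimal digit selects the loop-unroll factor for one
 * block size (hundreds: 256, tens: 512, ones: 1024); 0 means "emit the
 * fully unrolled code path". So 111 loops everything with no unrolling,
 * and the default 001 fully unrolls 256 and 512 while looping 1024.
 */
#include <stdio.h>

int
main(void)
{
	int skein_loop = 111;	/* the small-stack setting chosen above */

	printf("unroll_256  = %d\n", (skein_loop / 100) % 10);
	printf("unroll_512  = %d\n", (skein_loop / 10) % 10);
	printf("unroll_1024 = %d\n", skein_loop % 10);
	return (0);
}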
\ - Round256(p0, p1, p2, p3, ROT, rNum) \ - Skein_Show_R_Ptr(BLK_BITS, &ctx->h, 4 * (r - 1) + rNum, Xptr); + Round256(p0, p1, p2, p3, ROT, rNum) \ + Skein_Show_R_Ptr(BLK_BITS, &ctx->h, 4 * (r - 1) + rNum, Xptr); #define I256(R) \ X0 += ks[r + (R) + 0]; /* inject the key schedule value */ \ X1 += ks[r + (R) + 1] + ts[r + (R) + 0]; \ X2 += ks[r + (R) + 2] + ts[r + (R) + 1]; \ X3 += ks[r + (R) + 3] + r + (R); \ ks[r + (R) + 4] = ks[r + (R) - 1]; /* rotate key schedule */ \ - ts[r + (R) + 2] = ts[r + (R) - 1]; \ - Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr); + ts[r + (R) + 2] = ts[r + (R) - 1]; \ + Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr); /* loop thru it */ for (r = 1; r < 2 * RCNT; r += 2 * SKEIN_UNROLL_256) #endif { #define R256_8_rounds(R) \ R256(0, 1, 2, 3, R_256_0, 8 * (R) + 1); \ R256(0, 3, 2, 1, R_256_1, 8 * (R) + 2); \ R256(0, 1, 2, 3, R_256_2, 8 * (R) + 3); \ R256(0, 3, 2, 1, R_256_3, 8 * (R) + 4); \ I256(2 * (R)); \ R256(0, 1, 2, 3, R_256_4, 8 * (R) + 5); \ R256(0, 3, 2, 1, R_256_5, 8 * (R) + 6); \ R256(0, 1, 2, 3, R_256_6, 8 * (R) + 7); \ R256(0, 3, 2, 1, R_256_7, 8 * (R) + 8); \ I256(2 * (R) + 1); R256_8_rounds(0); #define R256_Unroll_R(NN) \ ((SKEIN_UNROLL_256 == 0 && SKEIN_256_ROUNDS_TOTAL / 8 > (NN)) || \ (SKEIN_UNROLL_256 > (NN))) #if R256_Unroll_R(1) R256_8_rounds(1); #endif #if R256_Unroll_R(2) R256_8_rounds(2); #endif #if R256_Unroll_R(3) R256_8_rounds(3); #endif #if R256_Unroll_R(4) R256_8_rounds(4); #endif #if R256_Unroll_R(5) R256_8_rounds(5); #endif #if R256_Unroll_R(6) R256_8_rounds(6); #endif #if R256_Unroll_R(7) R256_8_rounds(7); #endif #if R256_Unroll_R(8) R256_8_rounds(8); #endif #if R256_Unroll_R(9) R256_8_rounds(9); #endif #if R256_Unroll_R(10) R256_8_rounds(10); #endif #if R256_Unroll_R(11) R256_8_rounds(11); #endif #if R256_Unroll_R(12) R256_8_rounds(12); #endif #if R256_Unroll_R(13) R256_8_rounds(13); #endif #if R256_Unroll_R(14) R256_8_rounds(14); #endif #if (SKEIN_UNROLL_256 > 14) #error "need more unrolling in Skein_256_Process_Block" #endif } /* * do the final "feedforward" xor, update context chaining vars */ ctx->X[0] = X0 ^ w[0]; ctx->X[1] = X1 ^ w[1]; ctx->X[2] = X2 ^ w[2]; ctx->X[3] = X3 ^ w[3]; Skein_Show_Round(BLK_BITS, &ctx->h, SKEIN_RND_FEED_FWD, ctx->X); ts[1] &= ~SKEIN_T1_FLAG_FIRST; - } - while (--blkCnt); + } while (--blkCnt); ctx->h.T[0] = ts[0]; ctx->h.T[1] = ts[1]; } #if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF) size_t Skein_256_Process_Block_CodeSize(void) { return ((uint8_t *)Skein_256_Process_Block_CodeSize) - ((uint8_t *)Skein_256_Process_Block); } uint_t Skein_256_Unroll_Cnt(void) { return (SKEIN_UNROLL_256); } #endif #endif /* Skein_512 */ #if !(SKEIN_USE_ASM & 512) void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx, const uint8_t *blkPtr, size_t blkCnt, size_t byteCntAdd) -{ /* do it in C */ +{ enum { WCNT = SKEIN_512_STATE_WORDS }; #undef RCNT #define RCNT (SKEIN_512_ROUNDS_TOTAL / 8) #ifdef SKEIN_LOOP /* configure how much to unroll the loop */ #define SKEIN_UNROLL_512 (((SKEIN_LOOP) / 10) % 10) #else #define SKEIN_UNROLL_512 (0) #endif #if SKEIN_UNROLL_512 #if (RCNT % SKEIN_UNROLL_512) #error "Invalid SKEIN_UNROLL_512" /* sanity check on unroll count */ #endif size_t r; /* key schedule words : chaining vars + tweak + "rotation" */ uint64_t kw[WCNT + 4 + RCNT * 2]; #else uint64_t kw[WCNT + 4]; /* key schedule words : chaining vars + tweak */ #endif /* local copy of vars, for speed */ uint64_t X0, X1, X2, X3, X4, X5, X6, X7; uint64_t w[WCNT]; /* local copy of input block */ #ifdef 
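/*
 * Editor's sketch (not part of the diff): what one I256() key injection
 * computes. The 4-word key is extended with a parity word so the
 * schedule can be indexed cyclically mod 5 (and the tweak mod 3), which
 * is what lets the looping variant rotate the schedule in place instead
 * of recomputing it. SKEIN_KS_PARITY is the C240 constant from the
 * Skein 1.3 specification.
 */
#include <stdint.h>

#define SKEIN_KS_PARITY	0x1BD11BDAA9FC1A22ULL

static void
skein256_inject(uint64_t X[4], const uint64_t key[4],
    const uint64_t tweak[2], unsigned s)	/* s = subkey index */
{
	uint64_t ks[5], ts[3];

	ks[0] = key[0]; ks[1] = key[1]; ks[2] = key[2]; ks[3] = key[3];
	ks[4] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^ SKEIN_KS_PARITY;
	ts[0] = tweak[0]; ts[1] = tweak[1]; ts[2] = ts[0] ^ ts[1];

	X[0] += ks[(s + 1) % 5];
	X[1] += ks[(s + 2) % 5] + ts[(s + 1) % 3];
	X[2] += ks[(s + 3) % 5] + ts[(s + 2) % 3];
	X[3] += ks[(s + 4) % 5] + s + 1;	/* subkey counter */
}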
SKEIN_DEBUG /* use for debugging (help compiler put Xn in registers) */ const uint64_t *Xptr[8]; Xptr[0] = &X0; Xptr[1] = &X1; Xptr[2] = &X2; Xptr[3] = &X3; Xptr[4] = &X4; Xptr[5] = &X5; Xptr[6] = &X6; Xptr[7] = &X7; #endif Skein_assert(blkCnt != 0); /* never call with blkCnt == 0! */ ts[0] = ctx->h.T[0]; ts[1] = ctx->h.T[1]; do { /* * this implementation only supports 2**64 input bytes * (no carry out here) */ ts[0] += byteCntAdd; /* update processed length */ /* precompute the key schedule for this block */ ks[0] = ctx->X[0]; ks[1] = ctx->X[1]; ks[2] = ctx->X[2]; ks[3] = ctx->X[3]; ks[4] = ctx->X[4]; ks[5] = ctx->X[5]; ks[6] = ctx->X[6]; ks[7] = ctx->X[7]; ks[8] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^ ks[4] ^ ks[5] ^ ks[6] ^ ks[7] ^ SKEIN_KS_PARITY; ts[2] = ts[0] ^ ts[1]; /* get input block in little-endian format */ Skein_Get64_LSB_First(w, blkPtr, WCNT); DebugSaveTweak(ctx); Skein_Show_Block(BLK_BITS, &ctx->h, ctx->X, blkPtr, w, ks, ts); X0 = w[0] + ks[0]; /* do the first full key injection */ X1 = w[1] + ks[1]; X2 = w[2] + ks[2]; X3 = w[3] + ks[3]; X4 = w[4] + ks[4]; X5 = w[5] + ks[5] + ts[0]; X6 = w[6] + ks[6] + ts[1]; X7 = w[7] + ks[7]; blkPtr += SKEIN_512_BLOCK_BYTES; Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INITIAL, Xptr); /* run the rounds */ #define Round512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum) \ X##p0 += X##p1; X##p1 = RotL_64(X##p1, ROT##_0); X##p1 ^= X##p0;\ X##p2 += X##p3; X##p3 = RotL_64(X##p3, ROT##_1); X##p3 ^= X##p2;\ X##p4 += X##p5; X##p5 = RotL_64(X##p5, ROT##_2); X##p5 ^= X##p4;\ X##p6 += X##p7; X##p7 = RotL_64(X##p7, ROT##_3); X##p7 ^= X##p6; #if SKEIN_UNROLL_512 == 0 #define R512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum) /* unrolled */ \ Round512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum) \ Skein_Show_R_Ptr(BLK_BITS, &ctx->h, rNum, Xptr); #define I512(R) \ X0 += ks[((R) + 1) % 9]; /* inject the key schedule value */\ X1 += ks[((R) + 2) % 9]; \ X2 += ks[((R) + 3) % 9]; \ X3 += ks[((R) + 4) % 9]; \ X4 += ks[((R) + 5) % 9]; \ X5 += ks[((R) + 6) % 9] + ts[((R) + 1) % 3]; \ X6 += ks[((R) + 7) % 9] + ts[((R) + 2) % 3]; \ X7 += ks[((R) + 8) % 9] + (R) + 1; \ Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr); #else /* looping version */ #define R512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum) \ Round512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum) \ Skein_Show_R_Ptr(BLK_BITS, &ctx->h, 4 * (r - 1) + rNum, Xptr); #define I512(R) \ X0 += ks[r + (R) + 0]; /* inject the key schedule value */ \ X1 += ks[r + (R) + 1]; \ X2 += ks[r + (R) + 2]; \ X3 += ks[r + (R) + 3]; \ X4 += ks[r + (R) + 4]; \ X5 += ks[r + (R) + 5] + ts[r + (R) + 0]; \ X6 += ks[r + (R) + 6] + ts[r + (R) + 1]; \ X7 += ks[r + (R) + 7] + r + (R); \ ks[r + (R)+8] = ks[r + (R) - 1]; /* rotate key schedule */\ ts[r + (R)+2] = ts[r + (R) - 1]; \ Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr); /* loop thru it */ for (r = 1; r < 2 * RCNT; r += 2 * SKEIN_UNROLL_512) #endif /* end of looped code definitions */ { #define R512_8_rounds(R) /* do 8 full rounds */ \ R512(0, 1, 2, 3, 4, 5, 6, 7, R_512_0, 8 * (R) + 1); \ R512(2, 1, 4, 7, 6, 5, 0, 3, R_512_1, 8 * (R) + 2); \ R512(4, 1, 6, 3, 0, 5, 2, 7, R_512_2, 8 * (R) + 3); \ R512(6, 1, 0, 7, 2, 5, 4, 3, R_512_3, 8 * (R) + 4); \ I512(2 * (R)); \ R512(0, 1, 2, 3, 4, 5, 6, 7, R_512_4, 8 * (R) + 5); \ R512(2, 1, 4, 7, 6, 5, 0, 3, R_512_5, 8 * (R) + 6); \ R512(4, 1, 6, 3, 0, 5, 2, 7, R_512_6, 8 * (R) + 7); \ R512(6, 1, 0, 7, 2, 5, 4, 3, R_512_7, 8 * (R) + 8); \ I512(2*(R) + 1); /* and key injection */ R512_8_rounds(0); #define R512_Unroll_R(NN) \ 
((SKEIN_UNROLL_512 == 0 && SKEIN_512_ROUNDS_TOTAL / 8 > (NN)) || \ (SKEIN_UNROLL_512 > (NN))) #if R512_Unroll_R(1) R512_8_rounds(1); #endif #if R512_Unroll_R(2) R512_8_rounds(2); #endif #if R512_Unroll_R(3) R512_8_rounds(3); #endif #if R512_Unroll_R(4) R512_8_rounds(4); #endif #if R512_Unroll_R(5) R512_8_rounds(5); #endif #if R512_Unroll_R(6) R512_8_rounds(6); #endif #if R512_Unroll_R(7) R512_8_rounds(7); #endif #if R512_Unroll_R(8) R512_8_rounds(8); #endif #if R512_Unroll_R(9) R512_8_rounds(9); #endif #if R512_Unroll_R(10) R512_8_rounds(10); #endif #if R512_Unroll_R(11) R512_8_rounds(11); #endif #if R512_Unroll_R(12) R512_8_rounds(12); #endif #if R512_Unroll_R(13) R512_8_rounds(13); #endif #if R512_Unroll_R(14) R512_8_rounds(14); #endif #if (SKEIN_UNROLL_512 > 14) #error "need more unrolling in Skein_512_Process_Block" #endif } /* * do the final "feedforward" xor, update context chaining vars */ ctx->X[0] = X0 ^ w[0]; ctx->X[1] = X1 ^ w[1]; ctx->X[2] = X2 ^ w[2]; ctx->X[3] = X3 ^ w[3]; ctx->X[4] = X4 ^ w[4]; ctx->X[5] = X5 ^ w[5]; ctx->X[6] = X6 ^ w[6]; ctx->X[7] = X7 ^ w[7]; Skein_Show_Round(BLK_BITS, &ctx->h, SKEIN_RND_FEED_FWD, ctx->X); ts[1] &= ~SKEIN_T1_FLAG_FIRST; - } - while (--blkCnt); + } while (--blkCnt); ctx->h.T[0] = ts[0]; ctx->h.T[1] = ts[1]; } #if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF) size_t Skein_512_Process_Block_CodeSize(void) { return ((uint8_t *)Skein_512_Process_Block_CodeSize) - ((uint8_t *)Skein_512_Process_Block); } uint_t Skein_512_Unroll_Cnt(void) { return (SKEIN_UNROLL_512); } #endif #endif /* Skein1024 */ #if !(SKEIN_USE_ASM & 1024) void Skein1024_Process_Block(Skein1024_Ctxt_t *ctx, const uint8_t *blkPtr, size_t blkCnt, size_t byteCntAdd) { /* do it in C, always looping (unrolled is bigger AND slower!) */ enum { WCNT = SKEIN1024_STATE_WORDS }; #undef RCNT #define RCNT (SKEIN1024_ROUNDS_TOTAL/8) #ifdef SKEIN_LOOP /* configure how much to unroll the loop */ #define SKEIN_UNROLL_1024 ((SKEIN_LOOP)%10) #else #define SKEIN_UNROLL_1024 (0) #endif #if (SKEIN_UNROLL_1024 != 0) #if (RCNT % SKEIN_UNROLL_1024) #error "Invalid SKEIN_UNROLL_1024" /* sanity check on unroll count */ #endif size_t r; /* key schedule words : chaining vars + tweak + "rotation" */ uint64_t kw[WCNT + 4 + RCNT * 2]; #else uint64_t kw[WCNT + 4]; /* key schedule words : chaining vars + tweak */ #endif /* local copy of vars, for speed */ uint64_t X00, X01, X02, X03, X04, X05, X06, X07, X08, X09, X10, X11, X12, X13, X14, X15; uint64_t w[WCNT]; /* local copy of input block */ #ifdef SKEIN_DEBUG /* use for debugging (help compiler put Xn in registers) */ const uint64_t *Xptr[16]; Xptr[0] = &X00; Xptr[1] = &X01; Xptr[2] = &X02; Xptr[3] = &X03; Xptr[4] = &X04; Xptr[5] = &X05; Xptr[6] = &X06; Xptr[7] = &X07; Xptr[8] = &X08; Xptr[9] = &X09; Xptr[10] = &X10; Xptr[11] = &X11; Xptr[12] = &X12; Xptr[13] = &X13; Xptr[14] = &X14; Xptr[15] = &X15; #endif Skein_assert(blkCnt != 0); /* never call with blkCnt == 0! 
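/*
 * Editor's sketch (not part of the diff): the single Threefish MIX
 * operation that every Round256()/Round512()/Round1024() line above
 * expands to: 64-bit add, rotate-left by a round-specific constant,
 * then XOR. The rotation constants (R_512_0 and friends) are all in
 * 1..63, so the shifts below stay well-defined.
 */
#include <stdint.h>

static inline void
threefish_mix(uint64_t *x0, uint64_t *x1, unsigned rot)
{
	*x0 += *x1;					/* add */
	*x1 = (*x1 << rot) | (*x1 >> (64 - rot));	/* RotL_64 */
	*x1 ^= *x0;					/* xor */
}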
*/ ts[0] = ctx->h.T[0]; ts[1] = ctx->h.T[1]; do { /* * this implementation only supports 2**64 input bytes * (no carry out here) */ ts[0] += byteCntAdd; /* update processed length */ /* precompute the key schedule for this block */ ks[0] = ctx->X[0]; ks[1] = ctx->X[1]; ks[2] = ctx->X[2]; ks[3] = ctx->X[3]; ks[4] = ctx->X[4]; ks[5] = ctx->X[5]; ks[6] = ctx->X[6]; ks[7] = ctx->X[7]; ks[8] = ctx->X[8]; ks[9] = ctx->X[9]; ks[10] = ctx->X[10]; ks[11] = ctx->X[11]; ks[12] = ctx->X[12]; ks[13] = ctx->X[13]; ks[14] = ctx->X[14]; ks[15] = ctx->X[15]; ks[16] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^ ks[4] ^ ks[5] ^ ks[6] ^ ks[7] ^ ks[8] ^ ks[9] ^ ks[10] ^ ks[11] ^ ks[12] ^ ks[13] ^ ks[14] ^ ks[15] ^ SKEIN_KS_PARITY; ts[2] = ts[0] ^ ts[1]; /* get input block in little-endian format */ Skein_Get64_LSB_First(w, blkPtr, WCNT); DebugSaveTweak(ctx); Skein_Show_Block(BLK_BITS, &ctx->h, ctx->X, blkPtr, w, ks, ts); X00 = w[0] + ks[0]; /* do the first full key injection */ X01 = w[1] + ks[1]; X02 = w[2] + ks[2]; X03 = w[3] + ks[3]; X04 = w[4] + ks[4]; X05 = w[5] + ks[5]; X06 = w[6] + ks[6]; X07 = w[7] + ks[7]; X08 = w[8] + ks[8]; X09 = w[9] + ks[9]; X10 = w[10] + ks[10]; X11 = w[11] + ks[11]; X12 = w[12] + ks[12]; X13 = w[13] + ks[13] + ts[0]; X14 = w[14] + ks[14] + ts[1]; X15 = w[15] + ks[15]; Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INITIAL, Xptr); #define Round1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, \ pD, pE, pF, ROT, rNum) \ X##p0 += X##p1; X##p1 = RotL_64(X##p1, ROT##_0); X##p1 ^= X##p0;\ X##p2 += X##p3; X##p3 = RotL_64(X##p3, ROT##_1); X##p3 ^= X##p2;\ X##p4 += X##p5; X##p5 = RotL_64(X##p5, ROT##_2); X##p5 ^= X##p4;\ X##p6 += X##p7; X##p7 = RotL_64(X##p7, ROT##_3); X##p7 ^= X##p6;\ X##p8 += X##p9; X##p9 = RotL_64(X##p9, ROT##_4); X##p9 ^= X##p8;\ X##pA += X##pB; X##pB = RotL_64(X##pB, ROT##_5); X##pB ^= X##pA;\ X##pC += X##pD; X##pD = RotL_64(X##pD, ROT##_6); X##pD ^= X##pC;\ X##pE += X##pF; X##pF = RotL_64(X##pF, ROT##_7); X##pF ^= X##pE; #if SKEIN_UNROLL_1024 == 0 #define R1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, pD, \ pE, pF, ROT, rn) \ Round1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, \ pD, pE, pF, ROT, rn) \ Skein_Show_R_Ptr(BLK_BITS, &ctx->h, rn, Xptr); #define I1024(R) \ X00 += ks[((R) + 1) % 17]; /* inject the key schedule value */\ X01 += ks[((R) + 2) % 17]; \ X02 += ks[((R) + 3) % 17]; \ X03 += ks[((R) + 4) % 17]; \ X04 += ks[((R) + 5) % 17]; \ X05 += ks[((R) + 6) % 17]; \ X06 += ks[((R) + 7) % 17]; \ X07 += ks[((R) + 8) % 17]; \ X08 += ks[((R) + 9) % 17]; \ X09 += ks[((R) + 10) % 17]; \ X10 += ks[((R) + 11) % 17]; \ X11 += ks[((R) + 12) % 17]; \ X12 += ks[((R) + 13) % 17]; \ X13 += ks[((R) + 14) % 17] + ts[((R) + 1) % 3]; \ X14 += ks[((R) + 15) % 17] + ts[((R) + 2) % 3]; \ X15 += ks[((R) + 16) % 17] + (R) +1; \ Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr); #else /* looping version */ #define R1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, pD, \ pE, pF, ROT, rn) \ Round1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, \ pD, pE, pF, ROT, rn) \ Skein_Show_R_Ptr(BLK_BITS, &ctx->h, 4 * (r - 1) + rn, Xptr); #define I1024(R) \ X00 += ks[r + (R) + 0]; /* inject the key schedule value */ \ X01 += ks[r + (R) + 1]; \ X02 += ks[r + (R) + 2]; \ X03 += ks[r + (R) + 3]; \ X04 += ks[r + (R) + 4]; \ X05 += ks[r + (R) + 5]; \ X06 += ks[r + (R) + 6]; \ X07 += ks[r + (R) + 7]; \ X08 += ks[r + (R) + 8]; \ X09 += ks[r + (R) + 9]; \ X10 += ks[r + (R) + 10]; \ X11 += ks[r + (R) + 11]; \ X12 += ks[r + (R) + 12]; \ X13 += ks[r + (R) + 
13] + ts[r + (R) + 0]; \ X14 += ks[r + (R) + 14] + ts[r + (R) + 1]; \ X15 += ks[r + (R) + 15] + r + (R); \ ks[r + (R) + 16] = ks[r + (R) - 1]; /* rotate key schedule */\ ts[r + (R) + 2] = ts[r + (R) - 1]; \ Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr); /* loop thru it */ for (r = 1; r <= 2 * RCNT; r += 2 * SKEIN_UNROLL_1024) #endif { #define R1024_8_rounds(R) /* do 8 full rounds */ \ R1024(00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12, 13, \ 14, 15, R1024_0, 8 * (R) + 1); \ R1024(00, 09, 02, 13, 06, 11, 04, 15, 10, 07, 12, 03, 14, 05, \ 08, 01, R1024_1, 8 * (R) + 2); \ R1024(00, 07, 02, 05, 04, 03, 06, 01, 12, 15, 14, 13, 08, 11, \ 10, 09, R1024_2, 8 * (R) + 3); \ R1024(00, 15, 02, 11, 06, 13, 04, 09, 14, 01, 08, 05, 10, 03, \ 12, 07, R1024_3, 8 * (R) + 4); \ I1024(2 * (R)); \ R1024(00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12, 13, \ 14, 15, R1024_4, 8 * (R) + 5); \ R1024(00, 09, 02, 13, 06, 11, 04, 15, 10, 07, 12, 03, 14, 05, \ 08, 01, R1024_5, 8 * (R) + 6); \ R1024(00, 07, 02, 05, 04, 03, 06, 01, 12, 15, 14, 13, 08, 11, \ 10, 09, R1024_6, 8 * (R) + 7); \ R1024(00, 15, 02, 11, 06, 13, 04, 09, 14, 01, 08, 05, 10, 03, \ 12, 07, R1024_7, 8 * (R) + 8); \ I1024(2 * (R) + 1); R1024_8_rounds(0); #define R1024_Unroll_R(NN) \ ((SKEIN_UNROLL_1024 == 0 && SKEIN1024_ROUNDS_TOTAL/8 > (NN)) || \ (SKEIN_UNROLL_1024 > (NN))) #if R1024_Unroll_R(1) R1024_8_rounds(1); #endif #if R1024_Unroll_R(2) R1024_8_rounds(2); #endif #if R1024_Unroll_R(3) R1024_8_rounds(3); #endif #if R1024_Unroll_R(4) R1024_8_rounds(4); #endif #if R1024_Unroll_R(5) R1024_8_rounds(5); #endif #if R1024_Unroll_R(6) R1024_8_rounds(6); #endif #if R1024_Unroll_R(7) R1024_8_rounds(7); #endif #if R1024_Unroll_R(8) R1024_8_rounds(8); #endif #if R1024_Unroll_R(9) R1024_8_rounds(9); #endif #if R1024_Unroll_R(10) R1024_8_rounds(10); #endif #if R1024_Unroll_R(11) R1024_8_rounds(11); #endif #if R1024_Unroll_R(12) R1024_8_rounds(12); #endif #if R1024_Unroll_R(13) R1024_8_rounds(13); #endif #if R1024_Unroll_R(14) R1024_8_rounds(14); #endif #if (SKEIN_UNROLL_1024 > 14) #error "need more unrolling in Skein_1024_Process_Block" #endif } /* * do the final "feedforward" xor, update context chaining vars */ ctx->X[0] = X00 ^ w[0]; ctx->X[1] = X01 ^ w[1]; ctx->X[2] = X02 ^ w[2]; ctx->X[3] = X03 ^ w[3]; ctx->X[4] = X04 ^ w[4]; ctx->X[5] = X05 ^ w[5]; ctx->X[6] = X06 ^ w[6]; ctx->X[7] = X07 ^ w[7]; ctx->X[8] = X08 ^ w[8]; ctx->X[9] = X09 ^ w[9]; ctx->X[10] = X10 ^ w[10]; ctx->X[11] = X11 ^ w[11]; ctx->X[12] = X12 ^ w[12]; ctx->X[13] = X13 ^ w[13]; ctx->X[14] = X14 ^ w[14]; ctx->X[15] = X15 ^ w[15]; Skein_Show_Round(BLK_BITS, &ctx->h, SKEIN_RND_FEED_FWD, ctx->X); ts[1] &= ~SKEIN_T1_FLAG_FIRST; blkPtr += SKEIN1024_BLOCK_BYTES; } while (--blkCnt); ctx->h.T[0] = ts[0]; ctx->h.T[1] = ts[1]; } #if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF) size_t Skein1024_Process_Block_CodeSize(void) { return ((uint8_t *)Skein1024_Process_Block_CodeSize) - ((uint8_t *)Skein1024_Process_Block); } uint_t Skein1024_Unroll_Cnt(void) { return (SKEIN_UNROLL_1024); } #endif #endif diff --git a/module/icp/asm-x86_64/aes/aeskey.c b/module/icp/asm-x86_64/aes/aeskey.c index 96767fbea06a..c3d1f2990874 100644 --- a/module/icp/asm-x86_64/aes/aeskey.c +++ b/module/icp/asm-x86_64/aes/aeskey.c @@ -1,580 +1,580 @@ /* * --------------------------------------------------------------------------- * Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved. 
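/*
 * Editor's sketch (not part of the diff): the "feedforward" step that
 * closes each block in all three Process_Block functions above. XORing
 * the input words back into the permuted state turns the (invertible)
 * Threefish permutation into a one-way compression function, in the
 * spirit of the Matyas-Meyer-Oseas construction. chain[] holds the
 * final round state on entry and the new chaining value on return.
 */
#include <stddef.h>
#include <stdint.h>

static void
skein_feedforward(uint64_t chain[], const uint64_t w[], size_t words)
{
	size_t i;

	for (i = 0; i < words; i++)
		chain[i] ^= w[i];	/* ctx->X[i] = Xi ^ w[i] */
}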
* * LICENSE TERMS * * The free distribution and use of this software is allowed (with or without * changes) provided that: * * 1. source code distributions include the above copyright notice, this * list of conditions and the following disclaimer; * * 2. binary distributions include the above copyright notice, this list * of conditions and the following disclaimer in their documentation; * * 3. the name of the copyright holder is not used to endorse products * built using this software without specific written permission. * * DISCLAIMER * * This software is provided 'as is' with no explicit or implied warranties * in respect of its properties, including, but not limited to, correctness * and/or fitness for purpose. * --------------------------------------------------------------------------- * Issue Date: 20/12/2007 */ #include #include "aesopt.h" #include "aestab.h" #include "aestab2.h" /* * Initialise the key schedule from the user supplied key. The key * length can be specified in bytes, with legal values of 16, 24 * and 32, or in bits, with legal values of 128, 192 and 256. These * values correspond with Nk values of 4, 6 and 8 respectively. * * The following macros implement a single cycle in the key * schedule generation process. The number of cycles needed * for each cx->n_col and nk value is: * * nk = 4 5 6 7 8 * ------------------------------ * cx->n_col = 4 10 9 8 7 7 * cx->n_col = 5 14 11 10 9 9 * cx->n_col = 6 19 15 12 11 11 * cx->n_col = 7 21 19 16 13 14 * cx->n_col = 8 29 23 19 17 14 */ /* * OpenSolaris changes * 1. Added header files aes_impl.h and aestab2.h * 2. Changed uint_8t and uint_32t to uint8_t and uint32_t * 3. Remove code under ifdef USE_VIA_ACE_IF_PRESENT (always undefined) * 4. Removed always-defined ifdefs FUNCS_IN_C, ENC_KEYING_IN_C, * AES_128, AES_192, AES_256, AES_VAR defines * 5. Changed aes_encrypt_key* aes_decrypt_key* functions to "static void" * 6. Changed N_COLS to MAX_AES_NB * 7. Replaced functions aes_encrypt_key and aes_decrypt_key with * OpenSolaris-compatible functions rijndael_key_setup_enc_amd64 and * rijndael_key_setup_dec_amd64 * 8. 
cstyled code and removed lint warnings */ #if defined(REDUCE_CODE_SIZE) #define ls_box ls_sub uint32_t ls_sub(const uint32_t t, const uint32_t n); #define inv_mcol im_sub uint32_t im_sub(const uint32_t x); #ifdef ENC_KS_UNROLL #undef ENC_KS_UNROLL #endif #ifdef DEC_KS_UNROLL #undef DEC_KS_UNROLL #endif #endif /* REDUCE_CODE_SIZE */ #define ke4(k, i) \ { k[4 * (i) + 4] = ss[0] ^= ls_box(ss[3], 3) ^ t_use(r, c)[i]; \ k[4 * (i) + 5] = ss[1] ^= ss[0]; \ k[4 * (i) + 6] = ss[2] ^= ss[1]; \ k[4 * (i) + 7] = ss[3] ^= ss[2]; \ } static void aes_encrypt_key128(const unsigned char *key, uint32_t rk[]) { uint32_t ss[4]; rk[0] = ss[0] = word_in(key, 0); rk[1] = ss[1] = word_in(key, 1); rk[2] = ss[2] = word_in(key, 2); rk[3] = ss[3] = word_in(key, 3); #ifdef ENC_KS_UNROLL ke4(rk, 0); ke4(rk, 1); ke4(rk, 2); ke4(rk, 3); ke4(rk, 4); ke4(rk, 5); ke4(rk, 6); ke4(rk, 7); ke4(rk, 8); #else { uint32_t i; for (i = 0; i < 9; ++i) ke4(rk, i); } #endif /* ENC_KS_UNROLL */ ke4(rk, 9); } #define kef6(k, i) \ { k[6 * (i) + 6] = ss[0] ^= ls_box(ss[5], 3) ^ t_use(r, c)[i]; \ k[6 * (i) + 7] = ss[1] ^= ss[0]; \ k[6 * (i) + 8] = ss[2] ^= ss[1]; \ k[6 * (i) + 9] = ss[3] ^= ss[2]; \ } #define ke6(k, i) \ { kef6(k, i); \ k[6 * (i) + 10] = ss[4] ^= ss[3]; \ k[6 * (i) + 11] = ss[5] ^= ss[4]; \ } static void aes_encrypt_key192(const unsigned char *key, uint32_t rk[]) { uint32_t ss[6]; rk[0] = ss[0] = word_in(key, 0); rk[1] = ss[1] = word_in(key, 1); rk[2] = ss[2] = word_in(key, 2); rk[3] = ss[3] = word_in(key, 3); rk[4] = ss[4] = word_in(key, 4); rk[5] = ss[5] = word_in(key, 5); #ifdef ENC_KS_UNROLL ke6(rk, 0); ke6(rk, 1); ke6(rk, 2); ke6(rk, 3); ke6(rk, 4); ke6(rk, 5); ke6(rk, 6); #else { uint32_t i; for (i = 0; i < 7; ++i) ke6(rk, i); } #endif /* ENC_KS_UNROLL */ kef6(rk, 7); } #define kef8(k, i) \ { k[8 * (i) + 8] = ss[0] ^= ls_box(ss[7], 3) ^ t_use(r, c)[i]; \ k[8 * (i) + 9] = ss[1] ^= ss[0]; \ k[8 * (i) + 10] = ss[2] ^= ss[1]; \ k[8 * (i) + 11] = ss[3] ^= ss[2]; \ } #define ke8(k, i) \ { kef8(k, i); \ k[8 * (i) + 12] = ss[4] ^= ls_box(ss[3], 0); \ k[8 * (i) + 13] = ss[5] ^= ss[4]; \ k[8 * (i) + 14] = ss[6] ^= ss[5]; \ k[8 * (i) + 15] = ss[7] ^= ss[6]; \ } static void aes_encrypt_key256(const unsigned char *key, uint32_t rk[]) { uint32_t ss[8]; rk[0] = ss[0] = word_in(key, 0); rk[1] = ss[1] = word_in(key, 1); rk[2] = ss[2] = word_in(key, 2); rk[3] = ss[3] = word_in(key, 3); rk[4] = ss[4] = word_in(key, 4); rk[5] = ss[5] = word_in(key, 5); rk[6] = ss[6] = word_in(key, 6); rk[7] = ss[7] = word_in(key, 7); #ifdef ENC_KS_UNROLL ke8(rk, 0); ke8(rk, 1); ke8(rk, 2); ke8(rk, 3); ke8(rk, 4); ke8(rk, 5); #else { uint32_t i; for (i = 0; i < 6; ++i) ke8(rk, i); } #endif /* ENC_KS_UNROLL */ kef8(rk, 6); } /* * Expand the cipher key into the encryption key schedule. * * Return the number of rounds for the given cipher key size. * The size of the key schedule depends on the number of rounds * (which can be computed from the size of the key), i.e. 4 * (Nr + 1). 
* * Parameters: * rk AES key schedule 32-bit array to be initialized * cipherKey User key * keyBits AES key size (128, 192, or 256 bits) */ int rijndael_key_setup_enc_amd64(uint32_t rk[], const uint32_t cipherKey[], - int keyBits) + int keyBits) { switch (keyBits) { case 128: aes_encrypt_key128((unsigned char *)&cipherKey[0], rk); return (10); case 192: aes_encrypt_key192((unsigned char *)&cipherKey[0], rk); return (12); case 256: aes_encrypt_key256((unsigned char *)&cipherKey[0], rk); return (14); default: /* should never get here */ break; } return (0); } /* this is used to store the decryption round keys */ /* in forward or reverse order */ #ifdef AES_REV_DKS #define v(n, i) ((n) - (i) + 2 * ((i) & 3)) #else #define v(n, i) (i) #endif #if DEC_ROUND == NO_TABLES #define ff(x) (x) #else #define ff(x) inv_mcol(x) #if defined(dec_imvars) #define d_vars dec_imvars #endif #endif /* FUNCS_IN_C & DEC_KEYING_IN_C */ #define k4e(k, i) \ { k[v(40, (4 * (i)) + 4)] = ss[0] ^= ls_box(ss[3], 3) ^ t_use(r, c)[i]; \ k[v(40, (4 * (i)) + 5)] = ss[1] ^= ss[0]; \ k[v(40, (4 * (i)) + 6)] = ss[2] ^= ss[1]; \ k[v(40, (4 * (i)) + 7)] = ss[3] ^= ss[2]; \ } #if 1 #define kdf4(k, i) \ { ss[0] = ss[0] ^ ss[2] ^ ss[1] ^ ss[3]; \ ss[1] = ss[1] ^ ss[3]; \ ss[2] = ss[2] ^ ss[3]; \ ss[4] = ls_box(ss[(i + 3) % 4], 3) ^ t_use(r, c)[i]; \ ss[i % 4] ^= ss[4]; \ ss[4] ^= k[v(40, (4 * (i)))]; k[v(40, (4 * (i)) + 4)] = ff(ss[4]); \ ss[4] ^= k[v(40, (4 * (i)) + 1)]; k[v(40, (4 * (i)) + 5)] = ff(ss[4]); \ ss[4] ^= k[v(40, (4 * (i)) + 2)]; k[v(40, (4 * (i)) + 6)] = ff(ss[4]); \ ss[4] ^= k[v(40, (4 * (i)) + 3)]; k[v(40, (4 * (i)) + 7)] = ff(ss[4]); \ } #define kd4(k, i) \ { ss[4] = ls_box(ss[(i + 3) % 4], 3) ^ t_use(r, c)[i]; \ ss[i % 4] ^= ss[4]; ss[4] = ff(ss[4]); \ k[v(40, (4 * (i)) + 4)] = ss[4] ^= k[v(40, (4 * (i)))]; \ k[v(40, (4 * (i)) + 5)] = ss[4] ^= k[v(40, (4 * (i)) + 1)]; \ k[v(40, (4 * (i)) + 6)] = ss[4] ^= k[v(40, (4 * (i)) + 2)]; \ k[v(40, (4 * (i)) + 7)] = ss[4] ^= k[v(40, (4 * (i)) + 3)]; \ } #define kdl4(k, i) \ { ss[4] = ls_box(ss[(i + 3) % 4], 3) ^ t_use(r, c)[i]; \ ss[i % 4] ^= ss[4]; \ k[v(40, (4 * (i)) + 4)] = (ss[0] ^= ss[1]) ^ ss[2] ^ ss[3]; \ k[v(40, (4 * (i)) + 5)] = ss[1] ^ ss[3]; \ k[v(40, (4 * (i)) + 6)] = ss[0]; \ k[v(40, (4 * (i)) + 7)] = ss[1]; \ } #else #define kdf4(k, i) \ { ss[0] ^= ls_box(ss[3], 3) ^ t_use(r, c)[i]; \ k[v(40, (4 * (i)) + 4)] = ff(ss[0]); \ ss[1] ^= ss[0]; k[v(40, (4 * (i)) + 5)] = ff(ss[1]); \ ss[2] ^= ss[1]; k[v(40, (4 * (i)) + 6)] = ff(ss[2]); \ ss[3] ^= ss[2]; k[v(40, (4 * (i)) + 7)] = ff(ss[3]); \ } #define kd4(k, i) \ { ss[4] = ls_box(ss[3], 3) ^ t_use(r, c)[i]; \ ss[0] ^= ss[4]; \ ss[4] = ff(ss[4]); \ k[v(40, (4 * (i)) + 4)] = ss[4] ^= k[v(40, (4 * (i)))]; \ ss[1] ^= ss[0]; \ k[v(40, (4 * (i)) + 5)] = ss[4] ^= k[v(40, (4 * (i)) + 1)]; \ ss[2] ^= ss[1]; \ k[v(40, (4 * (i)) + 6)] = ss[4] ^= k[v(40, (4 * (i)) + 2)]; \ ss[3] ^= ss[2]; \ k[v(40, (4 * (i)) + 7)] = ss[4] ^= k[v(40, (4 * (i)) + 3)]; \ } #define kdl4(k, i) \ { ss[0] ^= ls_box(ss[3], 3) ^ t_use(r, c)[i]; \ k[v(40, (4 * (i)) + 4)] = ss[0]; \ ss[1] ^= ss[0]; k[v(40, (4 * (i)) + 5)] = ss[1]; \ ss[2] ^= ss[1]; k[v(40, (4 * (i)) + 6)] = ss[2]; \ ss[3] ^= ss[2]; k[v(40, (4 * (i)) + 7)] = ss[3]; \ } #endif static void aes_decrypt_key128(const unsigned char *key, uint32_t rk[]) { uint32_t ss[5]; #if defined(d_vars) d_vars; #endif rk[v(40, (0))] = ss[0] = word_in(key, 0); rk[v(40, (1))] = ss[1] = word_in(key, 1); rk[v(40, (2))] = ss[2] = word_in(key, 2); rk[v(40, (3))] = ss[3] = word_in(key, 3); #ifdef DEC_KS_UNROLL 
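/*
 * Editor's sketch (not part of the diff): the 10/12/14 round counts
 * returned above follow directly from the key length, and the schedule
 * size follows from the round count, 4 * (Nr + 1) words.
 */
static int
aes_rounds(int key_bits)	/* 128 -> 10, 192 -> 12, 256 -> 14 */
{
	return (key_bits / 32 + 6);
}

static int
aes_sched_words(int key_bits)	/* 44, 52 or 60 32-bit words */
{
	return (4 * (aes_rounds(key_bits) + 1));
}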
kdf4(rk, 0); kd4(rk, 1); kd4(rk, 2); kd4(rk, 3); kd4(rk, 4); kd4(rk, 5); kd4(rk, 6); kd4(rk, 7); kd4(rk, 8); kdl4(rk, 9); #else { uint32_t i; for (i = 0; i < 10; ++i) k4e(rk, i); #if !(DEC_ROUND == NO_TABLES) for (i = MAX_AES_NB; i < 10 * MAX_AES_NB; ++i) rk[i] = inv_mcol(rk[i]); #endif } #endif /* DEC_KS_UNROLL */ } #define k6ef(k, i) \ { k[v(48, (6 * (i)) + 6)] = ss[0] ^= ls_box(ss[5], 3) ^ t_use(r, c)[i]; \ k[v(48, (6 * (i)) + 7)] = ss[1] ^= ss[0]; \ k[v(48, (6 * (i)) + 8)] = ss[2] ^= ss[1]; \ k[v(48, (6 * (i)) + 9)] = ss[3] ^= ss[2]; \ } #define k6e(k, i) \ { k6ef(k, i); \ k[v(48, (6 * (i)) + 10)] = ss[4] ^= ss[3]; \ k[v(48, (6 * (i)) + 11)] = ss[5] ^= ss[4]; \ } #define kdf6(k, i) \ { ss[0] ^= ls_box(ss[5], 3) ^ t_use(r, c)[i]; \ k[v(48, (6 * (i)) + 6)] = ff(ss[0]); \ ss[1] ^= ss[0]; k[v(48, (6 * (i)) + 7)] = ff(ss[1]); \ ss[2] ^= ss[1]; k[v(48, (6 * (i)) + 8)] = ff(ss[2]); \ ss[3] ^= ss[2]; k[v(48, (6 * (i)) + 9)] = ff(ss[3]); \ ss[4] ^= ss[3]; k[v(48, (6 * (i)) + 10)] = ff(ss[4]); \ ss[5] ^= ss[4]; k[v(48, (6 * (i)) + 11)] = ff(ss[5]); \ } #define kd6(k, i) \ { ss[6] = ls_box(ss[5], 3) ^ t_use(r, c)[i]; \ ss[0] ^= ss[6]; ss[6] = ff(ss[6]); \ k[v(48, (6 * (i)) + 6)] = ss[6] ^= k[v(48, (6 * (i)))]; \ ss[1] ^= ss[0]; \ k[v(48, (6 * (i)) + 7)] = ss[6] ^= k[v(48, (6 * (i)) + 1)]; \ ss[2] ^= ss[1]; \ k[v(48, (6 * (i)) + 8)] = ss[6] ^= k[v(48, (6 * (i)) + 2)]; \ ss[3] ^= ss[2]; \ k[v(48, (6 * (i)) + 9)] = ss[6] ^= k[v(48, (6 * (i)) + 3)]; \ ss[4] ^= ss[3]; \ k[v(48, (6 * (i)) + 10)] = ss[6] ^= k[v(48, (6 * (i)) + 4)]; \ ss[5] ^= ss[4]; \ k[v(48, (6 * (i)) + 11)] = ss[6] ^= k[v(48, (6 * (i)) + 5)]; \ } #define kdl6(k, i) \ { ss[0] ^= ls_box(ss[5], 3) ^ t_use(r, c)[i]; \ k[v(48, (6 * (i)) + 6)] = ss[0]; \ ss[1] ^= ss[0]; k[v(48, (6 * (i)) + 7)] = ss[1]; \ ss[2] ^= ss[1]; k[v(48, (6 * (i)) + 8)] = ss[2]; \ ss[3] ^= ss[2]; k[v(48, (6 * (i)) + 9)] = ss[3]; \ } static void aes_decrypt_key192(const unsigned char *key, uint32_t rk[]) { uint32_t ss[7]; #if defined(d_vars) d_vars; #endif rk[v(48, (0))] = ss[0] = word_in(key, 0); rk[v(48, (1))] = ss[1] = word_in(key, 1); rk[v(48, (2))] = ss[2] = word_in(key, 2); rk[v(48, (3))] = ss[3] = word_in(key, 3); #ifdef DEC_KS_UNROLL ss[4] = word_in(key, 4); rk[v(48, (4))] = ff(ss[4]); ss[5] = word_in(key, 5); rk[v(48, (5))] = ff(ss[5]); kdf6(rk, 0); kd6(rk, 1); kd6(rk, 2); kd6(rk, 3); kd6(rk, 4); kd6(rk, 5); kd6(rk, 6); kdl6(rk, 7); #else rk[v(48, (4))] = ss[4] = word_in(key, 4); rk[v(48, (5))] = ss[5] = word_in(key, 5); { uint32_t i; for (i = 0; i < 7; ++i) k6e(rk, i); k6ef(rk, 7); #if !(DEC_ROUND == NO_TABLES) for (i = MAX_AES_NB; i < 12 * MAX_AES_NB; ++i) rk[i] = inv_mcol(rk[i]); #endif } #endif } #define k8ef(k, i) \ { k[v(56, (8 * (i)) + 8)] = ss[0] ^= ls_box(ss[7], 3) ^ t_use(r, c)[i]; \ k[v(56, (8 * (i)) + 9)] = ss[1] ^= ss[0]; \ k[v(56, (8 * (i)) + 10)] = ss[2] ^= ss[1]; \ k[v(56, (8 * (i)) + 11)] = ss[3] ^= ss[2]; \ } #define k8e(k, i) \ { k8ef(k, i); \ k[v(56, (8 * (i)) + 12)] = ss[4] ^= ls_box(ss[3], 0); \ k[v(56, (8 * (i)) + 13)] = ss[5] ^= ss[4]; \ k[v(56, (8 * (i)) + 14)] = ss[6] ^= ss[5]; \ k[v(56, (8 * (i)) + 15)] = ss[7] ^= ss[6]; \ } #define kdf8(k, i) \ { ss[0] ^= ls_box(ss[7], 3) ^ t_use(r, c)[i]; \ k[v(56, (8 * (i)) + 8)] = ff(ss[0]); \ ss[1] ^= ss[0]; k[v(56, (8 * (i)) + 9)] = ff(ss[1]); \ ss[2] ^= ss[1]; k[v(56, (8 * (i)) + 10)] = ff(ss[2]); \ ss[3] ^= ss[2]; k[v(56, (8 * (i)) + 11)] = ff(ss[3]); \ ss[4] ^= ls_box(ss[3], 0); k[v(56, (8 * (i)) + 12)] = ff(ss[4]); \ ss[5] ^= ss[4]; k[v(56, (8 * (i)) + 13)] = ff(ss[5]); \ ss[6] ^= ss[5]; 
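/*
 * Editor's sketch (not part of the diff): what the inv_mcol() loops in
 * the non-unrolled decrypt paths above are doing. The "equivalent
 * inverse cipher" applies InvMixColumns to every round key except the
 * first and the last, so decryption can reuse the same round structure
 * as encryption. inv_mcol() is declared here as a stand-in for the
 * table-driven InvMixColumns in this file.
 */
#include <stdint.h>

uint32_t inv_mcol(uint32_t w);		/* InvMixColumns on one word */

static void
equiv_inv_schedule(uint32_t rk[], int nr)	/* nr = 10, 12 or 14 */
{
	int i;

	/* skip round 0 (words 0..3) and round nr (the final 4 words) */
	for (i = 4; i < 4 * nr; i++)
		rk[i] = inv_mcol(rk[i]);
}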
k[v(56, (8 * (i)) + 14)] = ff(ss[6]); \ ss[7] ^= ss[6]; k[v(56, (8 * (i)) + 15)] = ff(ss[7]); \ } #define kd8(k, i) \ { ss[8] = ls_box(ss[7], 3) ^ t_use(r, c)[i]; \ ss[0] ^= ss[8]; \ ss[8] = ff(ss[8]); \ k[v(56, (8 * (i)) + 8)] = ss[8] ^= k[v(56, (8 * (i)))]; \ ss[1] ^= ss[0]; \ k[v(56, (8 * (i)) + 9)] = ss[8] ^= k[v(56, (8 * (i)) + 1)]; \ ss[2] ^= ss[1]; \ k[v(56, (8 * (i)) + 10)] = ss[8] ^= k[v(56, (8 * (i)) + 2)]; \ ss[3] ^= ss[2]; \ k[v(56, (8 * (i)) + 11)] = ss[8] ^= k[v(56, (8 * (i)) + 3)]; \ ss[8] = ls_box(ss[3], 0); \ ss[4] ^= ss[8]; \ ss[8] = ff(ss[8]); \ k[v(56, (8 * (i)) + 12)] = ss[8] ^= k[v(56, (8 * (i)) + 4)]; \ ss[5] ^= ss[4]; \ k[v(56, (8 * (i)) + 13)] = ss[8] ^= k[v(56, (8 * (i)) + 5)]; \ ss[6] ^= ss[5]; \ k[v(56, (8 * (i)) + 14)] = ss[8] ^= k[v(56, (8 * (i)) + 6)]; \ ss[7] ^= ss[6]; \ k[v(56, (8 * (i)) + 15)] = ss[8] ^= k[v(56, (8 * (i)) + 7)]; \ } #define kdl8(k, i) \ { ss[0] ^= ls_box(ss[7], 3) ^ t_use(r, c)[i]; \ k[v(56, (8 * (i)) + 8)] = ss[0]; \ ss[1] ^= ss[0]; k[v(56, (8 * (i)) + 9)] = ss[1]; \ ss[2] ^= ss[1]; k[v(56, (8 * (i)) + 10)] = ss[2]; \ ss[3] ^= ss[2]; k[v(56, (8 * (i)) + 11)] = ss[3]; \ } static void aes_decrypt_key256(const unsigned char *key, uint32_t rk[]) { uint32_t ss[9]; #if defined(d_vars) d_vars; #endif rk[v(56, (0))] = ss[0] = word_in(key, 0); rk[v(56, (1))] = ss[1] = word_in(key, 1); rk[v(56, (2))] = ss[2] = word_in(key, 2); rk[v(56, (3))] = ss[3] = word_in(key, 3); #ifdef DEC_KS_UNROLL ss[4] = word_in(key, 4); rk[v(56, (4))] = ff(ss[4]); ss[5] = word_in(key, 5); rk[v(56, (5))] = ff(ss[5]); ss[6] = word_in(key, 6); rk[v(56, (6))] = ff(ss[6]); ss[7] = word_in(key, 7); rk[v(56, (7))] = ff(ss[7]); kdf8(rk, 0); kd8(rk, 1); kd8(rk, 2); kd8(rk, 3); kd8(rk, 4); kd8(rk, 5); kdl8(rk, 6); #else rk[v(56, (4))] = ss[4] = word_in(key, 4); rk[v(56, (5))] = ss[5] = word_in(key, 5); rk[v(56, (6))] = ss[6] = word_in(key, 6); rk[v(56, (7))] = ss[7] = word_in(key, 7); { uint32_t i; for (i = 0; i < 6; ++i) k8e(rk, i); k8ef(rk, 6); #if !(DEC_ROUND == NO_TABLES) for (i = MAX_AES_NB; i < 14 * MAX_AES_NB; ++i) rk[i] = inv_mcol(rk[i]); #endif } #endif /* DEC_KS_UNROLL */ } /* * Expand the cipher key into the decryption key schedule. * * Return the number of rounds for the given cipher key size. * The size of the key schedule depends on the number of rounds * (which can be computed from the size of the key), i.e. 4 * (Nr + 1). * * Parameters: * rk AES key schedule 32-bit array to be initialized * cipherKey User key * keyBits AES key size (128, 192, or 256 bits) */ int rijndael_key_setup_dec_amd64(uint32_t rk[], const uint32_t cipherKey[], - int keyBits) + int keyBits) { switch (keyBits) { case 128: aes_decrypt_key128((unsigned char *)&cipherKey[0], rk); return (10); case 192: aes_decrypt_key192((unsigned char *)&cipherKey[0], rk); return (12); case 256: aes_decrypt_key256((unsigned char *)&cipherKey[0], rk); return (14); default: /* should never get here */ break; } return (0); } diff --git a/module/icp/io/aes.c b/module/icp/io/aes.c index 7fd66be3e063..12d57ed79eee 100644 --- a/module/icp/io/aes.c +++ b/module/icp/io/aes.c @@ -1,1437 +1,1439 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. 
* See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. */ /* * AES provider for the Kernel Cryptographic Framework (KCF) */ #include #include #include #include #include #include #include #define _AES_IMPL #include #define CRYPTO_PROVIDER_NAME "aes" extern struct mod_ops mod_cryptoops; /* * Module linkage information for the kernel. */ static struct modlcrypto modlcrypto = { &mod_cryptoops, "AES Kernel SW Provider" }; static struct modlinkage modlinkage = { MODREV_1, { (void *)&modlcrypto, NULL } }; /* * Mechanism info structure passed to KCF during registration. */ static crypto_mech_info_t aes_mech_info_tab[] = { /* AES_ECB */ {SUN_CKM_AES_ECB, AES_ECB_MECH_INFO_TYPE, CRYPTO_FG_ENCRYPT | CRYPTO_FG_ENCRYPT_ATOMIC | CRYPTO_FG_DECRYPT | CRYPTO_FG_DECRYPT_ATOMIC, AES_MIN_KEY_BYTES, AES_MAX_KEY_BYTES, CRYPTO_KEYSIZE_UNIT_IN_BYTES}, /* AES_CBC */ {SUN_CKM_AES_CBC, AES_CBC_MECH_INFO_TYPE, CRYPTO_FG_ENCRYPT | CRYPTO_FG_ENCRYPT_ATOMIC | CRYPTO_FG_DECRYPT | CRYPTO_FG_DECRYPT_ATOMIC, AES_MIN_KEY_BYTES, AES_MAX_KEY_BYTES, CRYPTO_KEYSIZE_UNIT_IN_BYTES}, /* AES_CTR */ {SUN_CKM_AES_CTR, AES_CTR_MECH_INFO_TYPE, CRYPTO_FG_ENCRYPT | CRYPTO_FG_ENCRYPT_ATOMIC | CRYPTO_FG_DECRYPT | CRYPTO_FG_DECRYPT_ATOMIC, AES_MIN_KEY_BYTES, AES_MAX_KEY_BYTES, CRYPTO_KEYSIZE_UNIT_IN_BYTES}, /* AES_CCM */ {SUN_CKM_AES_CCM, AES_CCM_MECH_INFO_TYPE, CRYPTO_FG_ENCRYPT | CRYPTO_FG_ENCRYPT_ATOMIC | CRYPTO_FG_DECRYPT | CRYPTO_FG_DECRYPT_ATOMIC, AES_MIN_KEY_BYTES, AES_MAX_KEY_BYTES, CRYPTO_KEYSIZE_UNIT_IN_BYTES}, /* AES_GCM */ {SUN_CKM_AES_GCM, AES_GCM_MECH_INFO_TYPE, CRYPTO_FG_ENCRYPT | CRYPTO_FG_ENCRYPT_ATOMIC | CRYPTO_FG_DECRYPT | CRYPTO_FG_DECRYPT_ATOMIC, AES_MIN_KEY_BYTES, AES_MAX_KEY_BYTES, CRYPTO_KEYSIZE_UNIT_IN_BYTES}, /* AES_GMAC */ {SUN_CKM_AES_GMAC, AES_GMAC_MECH_INFO_TYPE, CRYPTO_FG_ENCRYPT | CRYPTO_FG_ENCRYPT_ATOMIC | CRYPTO_FG_DECRYPT | CRYPTO_FG_DECRYPT_ATOMIC | CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC | CRYPTO_FG_SIGN | CRYPTO_FG_SIGN_ATOMIC | CRYPTO_FG_VERIFY | CRYPTO_FG_VERIFY_ATOMIC, AES_MIN_KEY_BYTES, AES_MAX_KEY_BYTES, CRYPTO_KEYSIZE_UNIT_IN_BYTES} }; /* operations are in-place if the output buffer is NULL */ #define AES_ARG_INPLACE(input, output) \ if ((output) == NULL) \ (output) = (input); static void aes_provider_status(crypto_provider_handle_t, uint_t *); static crypto_control_ops_t aes_control_ops = { aes_provider_status }; static int aes_encrypt_init(crypto_ctx_t *, crypto_mechanism_t *, crypto_key_t *, crypto_spi_ctx_template_t, crypto_req_handle_t); static int aes_decrypt_init(crypto_ctx_t *, crypto_mechanism_t *, crypto_key_t *, crypto_spi_ctx_template_t, crypto_req_handle_t); static int aes_common_init(crypto_ctx_t *, crypto_mechanism_t *, crypto_key_t *, crypto_spi_ctx_template_t, crypto_req_handle_t, boolean_t); static int aes_common_init_ctx(aes_ctx_t *, crypto_spi_ctx_template_t *, crypto_mechanism_t *, crypto_key_t *, int, boolean_t); static int aes_encrypt_final(crypto_ctx_t *, crypto_data_t *, crypto_req_handle_t); static int aes_decrypt_final(crypto_ctx_t *, crypto_data_t *, crypto_req_handle_t); static int 
aes_encrypt(crypto_ctx_t *, crypto_data_t *, crypto_data_t *, crypto_req_handle_t); static int aes_encrypt_update(crypto_ctx_t *, crypto_data_t *, crypto_data_t *, crypto_req_handle_t); static int aes_encrypt_atomic(crypto_provider_handle_t, crypto_session_id_t, crypto_mechanism_t *, crypto_key_t *, crypto_data_t *, crypto_data_t *, crypto_spi_ctx_template_t, crypto_req_handle_t); static int aes_decrypt(crypto_ctx_t *, crypto_data_t *, crypto_data_t *, crypto_req_handle_t); static int aes_decrypt_update(crypto_ctx_t *, crypto_data_t *, crypto_data_t *, crypto_req_handle_t); static int aes_decrypt_atomic(crypto_provider_handle_t, crypto_session_id_t, crypto_mechanism_t *, crypto_key_t *, crypto_data_t *, crypto_data_t *, crypto_spi_ctx_template_t, crypto_req_handle_t); static crypto_cipher_ops_t aes_cipher_ops = { aes_encrypt_init, aes_encrypt, aes_encrypt_update, aes_encrypt_final, aes_encrypt_atomic, aes_decrypt_init, aes_decrypt, aes_decrypt_update, aes_decrypt_final, aes_decrypt_atomic }; static int aes_mac_atomic(crypto_provider_handle_t, crypto_session_id_t, crypto_mechanism_t *, crypto_key_t *, crypto_data_t *, crypto_data_t *, crypto_spi_ctx_template_t, crypto_req_handle_t); static int aes_mac_verify_atomic(crypto_provider_handle_t, crypto_session_id_t, crypto_mechanism_t *, crypto_key_t *, crypto_data_t *, crypto_data_t *, crypto_spi_ctx_template_t, crypto_req_handle_t); static crypto_mac_ops_t aes_mac_ops = { NULL, NULL, NULL, NULL, aes_mac_atomic, aes_mac_verify_atomic }; static int aes_create_ctx_template(crypto_provider_handle_t, crypto_mechanism_t *, crypto_key_t *, crypto_spi_ctx_template_t *, size_t *, crypto_req_handle_t); static int aes_free_context(crypto_ctx_t *); static crypto_ctx_ops_t aes_ctx_ops = { aes_create_ctx_template, aes_free_context }; static crypto_ops_t aes_crypto_ops = {{{{{ &aes_control_ops, NULL, &aes_cipher_ops, &aes_mac_ops, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, &aes_ctx_ops }}}}}; static crypto_provider_info_t aes_prov_info = {{{{ CRYPTO_SPI_VERSION_1, "AES Software Provider", CRYPTO_SW_PROVIDER, NULL, &aes_crypto_ops, sizeof (aes_mech_info_tab)/sizeof (crypto_mech_info_t), aes_mech_info_tab }}}}; static crypto_kcf_provider_handle_t aes_prov_handle = 0; static crypto_data_t null_crypto_data = { CRYPTO_DATA_RAW }; int aes_mod_init(void) { int ret; if ((ret = mod_install(&modlinkage)) != 0) return (ret); /* Register with KCF. If the registration fails, remove the module. 
*/ if (crypto_register_provider(&aes_prov_info, &aes_prov_handle)) { (void) mod_remove(&modlinkage); return (EACCES); } return (0); } int aes_mod_fini(void) { /* Unregister from KCF if module is registered */ if (aes_prov_handle != 0) { if (crypto_unregister_provider(aes_prov_handle)) return (EBUSY); aes_prov_handle = 0; } return (mod_remove(&modlinkage)); } static int aes_check_mech_param(crypto_mechanism_t *mechanism, aes_ctx_t **ctx, int kmflag) { void *p = NULL; boolean_t param_required = B_TRUE; size_t param_len; void *(*alloc_fun)(int); int rv = CRYPTO_SUCCESS; switch (mechanism->cm_type) { case AES_ECB_MECH_INFO_TYPE: param_required = B_FALSE; alloc_fun = ecb_alloc_ctx; break; case AES_CBC_MECH_INFO_TYPE: param_len = AES_BLOCK_LEN; alloc_fun = cbc_alloc_ctx; break; case AES_CTR_MECH_INFO_TYPE: param_len = sizeof (CK_AES_CTR_PARAMS); alloc_fun = ctr_alloc_ctx; break; case AES_CCM_MECH_INFO_TYPE: param_len = sizeof (CK_AES_CCM_PARAMS); alloc_fun = ccm_alloc_ctx; break; case AES_GCM_MECH_INFO_TYPE: param_len = sizeof (CK_AES_GCM_PARAMS); alloc_fun = gcm_alloc_ctx; break; case AES_GMAC_MECH_INFO_TYPE: param_len = sizeof (CK_AES_GMAC_PARAMS); alloc_fun = gmac_alloc_ctx; break; default: rv = CRYPTO_MECHANISM_INVALID; return (rv); } if (param_required && mechanism->cm_param != NULL && mechanism->cm_param_len != param_len) { rv = CRYPTO_MECHANISM_PARAM_INVALID; } if (ctx != NULL) { p = (alloc_fun)(kmflag); *ctx = p; } return (rv); } /* * Initialize key schedules for AES */ static int init_keysched(crypto_key_t *key, void *newbie) { /* * Only keys by value are supported by this module. */ switch (key->ck_format) { case CRYPTO_KEY_RAW: if (key->ck_length < AES_MINBITS || key->ck_length > AES_MAXBITS) { return (CRYPTO_KEY_SIZE_RANGE); } /* key length must be either 128, 192, or 256 */ if ((key->ck_length & 63) != 0) return (CRYPTO_KEY_SIZE_RANGE); break; default: return (CRYPTO_KEY_TYPE_INCONSISTENT); } aes_init_keysched(key->ck_data, key->ck_length, newbie); return (CRYPTO_SUCCESS); } /* * KCF software provider control entry points. */ /* ARGSUSED */ static void aes_provider_status(crypto_provider_handle_t provider, uint_t *status) { *status = CRYPTO_PROVIDER_READY; } static int aes_encrypt_init(crypto_ctx_t *ctx, crypto_mechanism_t *mechanism, crypto_key_t *key, crypto_spi_ctx_template_t template, - crypto_req_handle_t req) { + crypto_req_handle_t req) +{ return (aes_common_init(ctx, mechanism, key, template, req, B_TRUE)); } static int aes_decrypt_init(crypto_ctx_t *ctx, crypto_mechanism_t *mechanism, crypto_key_t *key, crypto_spi_ctx_template_t template, - crypto_req_handle_t req) { + crypto_req_handle_t req) +{ return (aes_common_init(ctx, mechanism, key, template, req, B_FALSE)); } /* * KCF software provider encrypt entry points. */ static int aes_common_init(crypto_ctx_t *ctx, crypto_mechanism_t *mechanism, crypto_key_t *key, crypto_spi_ctx_template_t template, crypto_req_handle_t req, boolean_t is_encrypt_init) { aes_ctx_t *aes_ctx; int rv; int kmflag; /* * Only keys by value are supported by this module. 
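/*
 * Editor's sketch (not part of the diff): the raw-key check performed
 * by init_keysched() above, assuming AES_MINBITS is 128 and AES_MAXBITS
 * is 256. Within that range, requiring a multiple of 64 bits leaves
 * exactly the three legal AES key sizes.
 */
#include <stdint.h>

static int
aes_key_bits_valid(uint32_t bits)
{
	if (bits < 128 || bits > 256)
		return (0);
	return ((bits & 63) == 0);	/* 128, 192 or 256 only */
}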
*/ if (key->ck_format != CRYPTO_KEY_RAW) { return (CRYPTO_KEY_TYPE_INCONSISTENT); } kmflag = crypto_kmflag(req); if ((rv = aes_check_mech_param(mechanism, &aes_ctx, kmflag)) != CRYPTO_SUCCESS) return (rv); rv = aes_common_init_ctx(aes_ctx, template, mechanism, key, kmflag, is_encrypt_init); if (rv != CRYPTO_SUCCESS) { crypto_free_mode_ctx(aes_ctx); return (rv); } ctx->cc_provider_private = aes_ctx; return (CRYPTO_SUCCESS); } static void aes_copy_block64(uint8_t *in, uint64_t *out) { if (IS_P2ALIGNED(in, sizeof (uint64_t))) { /* LINTED: pointer alignment */ out[0] = *(uint64_t *)&in[0]; /* LINTED: pointer alignment */ out[1] = *(uint64_t *)&in[8]; } else { uint8_t *iv8 = (uint8_t *)&out[0]; AES_COPY_BLOCK(in, iv8); } } static int aes_encrypt(crypto_ctx_t *ctx, crypto_data_t *plaintext, crypto_data_t *ciphertext, crypto_req_handle_t req) { int ret = CRYPTO_FAILED; aes_ctx_t *aes_ctx; size_t saved_length, saved_offset, length_needed; ASSERT(ctx->cc_provider_private != NULL); aes_ctx = ctx->cc_provider_private; /* * For block ciphers, plaintext must be a multiple of AES block size. * This test is only valid for ciphers whose blocksize is a power of 2. */ if (((aes_ctx->ac_flags & (CTR_MODE|CCM_MODE|GCM_MODE|GMAC_MODE)) == 0) && (plaintext->cd_length & (AES_BLOCK_LEN - 1)) != 0) return (CRYPTO_DATA_LEN_RANGE); AES_ARG_INPLACE(plaintext, ciphertext); /* * We need to just return the length needed to store the output. * We should not destroy the context for the following case. */ switch (aes_ctx->ac_flags & (CCM_MODE|GCM_MODE|GMAC_MODE)) { case CCM_MODE: length_needed = plaintext->cd_length + aes_ctx->ac_mac_len; break; case GCM_MODE: length_needed = plaintext->cd_length + aes_ctx->ac_tag_len; break; case GMAC_MODE: if (plaintext->cd_length != 0) return (CRYPTO_ARGUMENTS_BAD); length_needed = aes_ctx->ac_tag_len; break; default: length_needed = plaintext->cd_length; } if (ciphertext->cd_length < length_needed) { ciphertext->cd_length = length_needed; return (CRYPTO_BUFFER_TOO_SMALL); } saved_length = ciphertext->cd_length; saved_offset = ciphertext->cd_offset; /* * Do an update on the specified input data. */ ret = aes_encrypt_update(ctx, plaintext, ciphertext, req); if (ret != CRYPTO_SUCCESS) { return (ret); } /* * For CCM mode, aes_ccm_encrypt_final() will take care of any * left-over unprocessed data, and compute the MAC */ if (aes_ctx->ac_flags & CCM_MODE) { /* * ccm_encrypt_final() will compute the MAC and append * it to existing ciphertext. So, need to adjust the left over * length value accordingly */ /* order of following 2 lines MUST not be reversed */ ciphertext->cd_offset = ciphertext->cd_length; ciphertext->cd_length = saved_length - ciphertext->cd_length; ret = ccm_encrypt_final((ccm_ctx_t *)aes_ctx, ciphertext, AES_BLOCK_LEN, aes_encrypt_block, aes_xor_block); if (ret != CRYPTO_SUCCESS) { return (ret); } if (plaintext != ciphertext) { ciphertext->cd_length = ciphertext->cd_offset - saved_offset; } ciphertext->cd_offset = saved_offset; } else if (aes_ctx->ac_flags & (GCM_MODE|GMAC_MODE)) { /* * gcm_encrypt_final() will compute the MAC and append * it to existing ciphertext. 
So, need to adjust the left over * length value accordingly */ /* order of following 2 lines MUST not be reversed */ ciphertext->cd_offset = ciphertext->cd_length; ciphertext->cd_length = saved_length - ciphertext->cd_length; ret = gcm_encrypt_final((gcm_ctx_t *)aes_ctx, ciphertext, AES_BLOCK_LEN, aes_encrypt_block, aes_copy_block, aes_xor_block); if (ret != CRYPTO_SUCCESS) { return (ret); } if (plaintext != ciphertext) { ciphertext->cd_length = ciphertext->cd_offset - saved_offset; } ciphertext->cd_offset = saved_offset; } ASSERT(aes_ctx->ac_remainder_len == 0); (void) aes_free_context(ctx); return (ret); } static int aes_decrypt(crypto_ctx_t *ctx, crypto_data_t *ciphertext, crypto_data_t *plaintext, crypto_req_handle_t req) { int ret = CRYPTO_FAILED; aes_ctx_t *aes_ctx; off_t saved_offset; size_t saved_length, length_needed; ASSERT(ctx->cc_provider_private != NULL); aes_ctx = ctx->cc_provider_private; /* * For block ciphers, plaintext must be a multiple of AES block size. * This test is only valid for ciphers whose blocksize is a power of 2. */ if (((aes_ctx->ac_flags & (CTR_MODE|CCM_MODE|GCM_MODE|GMAC_MODE)) == 0) && (ciphertext->cd_length & (AES_BLOCK_LEN - 1)) != 0) { return (CRYPTO_ENCRYPTED_DATA_LEN_RANGE); } AES_ARG_INPLACE(ciphertext, plaintext); /* * Return length needed to store the output. * Do not destroy context when plaintext buffer is too small. * * CCM: plaintext is MAC len smaller than cipher text * GCM: plaintext is TAG len smaller than cipher text * GMAC: plaintext length must be zero */ switch (aes_ctx->ac_flags & (CCM_MODE|GCM_MODE|GMAC_MODE)) { case CCM_MODE: length_needed = aes_ctx->ac_processed_data_len; break; case GCM_MODE: length_needed = ciphertext->cd_length - aes_ctx->ac_tag_len; break; case GMAC_MODE: if (plaintext->cd_length != 0) return (CRYPTO_ARGUMENTS_BAD); length_needed = 0; break; default: length_needed = ciphertext->cd_length; } if (plaintext->cd_length < length_needed) { plaintext->cd_length = length_needed; return (CRYPTO_BUFFER_TOO_SMALL); } saved_offset = plaintext->cd_offset; saved_length = plaintext->cd_length; /* * Do an update on the specified input data. 
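/*
 * Editor's sketch (not part of the diff): the encrypt-side output
 * sizing rules from aes_encrypt() above (the decrypt side inverts
 * them, subtracting the tag or MAC instead). The mode enum and the
 * tag_len parameter are illustrative, not the KCF types.
 */
#include <stddef.h>

enum aead_mode { MODE_BLOCK, MODE_CCM, MODE_GCM, MODE_GMAC };

static size_t
ct_len_needed(enum aead_mode m, size_t pt_len, size_t tag_len)
{
	switch (m) {
	case MODE_CCM:		/* MAC appended to the ciphertext */
	case MODE_GCM:		/* tag appended to the ciphertext */
		return (pt_len + tag_len);
	case MODE_GMAC:		/* no payload allowed; tag only */
		return (tag_len);
	default:		/* ECB/CBC/CTR: same length as input */
		return (pt_len);
	}
}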
*/ ret = aes_decrypt_update(ctx, ciphertext, plaintext, req); if (ret != CRYPTO_SUCCESS) { goto cleanup; } if (aes_ctx->ac_flags & CCM_MODE) { ASSERT(aes_ctx->ac_processed_data_len == aes_ctx->ac_data_len); ASSERT(aes_ctx->ac_processed_mac_len == aes_ctx->ac_mac_len); /* order of following 2 lines MUST not be reversed */ plaintext->cd_offset = plaintext->cd_length; plaintext->cd_length = saved_length - plaintext->cd_length; ret = ccm_decrypt_final((ccm_ctx_t *)aes_ctx, plaintext, AES_BLOCK_LEN, aes_encrypt_block, aes_copy_block, aes_xor_block); if (ret == CRYPTO_SUCCESS) { if (plaintext != ciphertext) { plaintext->cd_length = plaintext->cd_offset - saved_offset; } } else { plaintext->cd_length = saved_length; } plaintext->cd_offset = saved_offset; } else if (aes_ctx->ac_flags & (GCM_MODE|GMAC_MODE)) { /* order of following 2 lines MUST not be reversed */ plaintext->cd_offset = plaintext->cd_length; plaintext->cd_length = saved_length - plaintext->cd_length; ret = gcm_decrypt_final((gcm_ctx_t *)aes_ctx, plaintext, AES_BLOCK_LEN, aes_encrypt_block, aes_xor_block); if (ret == CRYPTO_SUCCESS) { if (plaintext != ciphertext) { plaintext->cd_length = plaintext->cd_offset - saved_offset; } } else { plaintext->cd_length = saved_length; } plaintext->cd_offset = saved_offset; } ASSERT(aes_ctx->ac_remainder_len == 0); cleanup: (void) aes_free_context(ctx); return (ret); } /* ARGSUSED */ static int aes_encrypt_update(crypto_ctx_t *ctx, crypto_data_t *plaintext, crypto_data_t *ciphertext, crypto_req_handle_t req) { off_t saved_offset; size_t saved_length, out_len; int ret = CRYPTO_SUCCESS; aes_ctx_t *aes_ctx; ASSERT(ctx->cc_provider_private != NULL); aes_ctx = ctx->cc_provider_private; AES_ARG_INPLACE(plaintext, ciphertext); /* compute number of bytes that will hold the ciphertext */ out_len = aes_ctx->ac_remainder_len; out_len += plaintext->cd_length; out_len &= ~(AES_BLOCK_LEN - 1); /* return length needed to store the output */ if (ciphertext->cd_length < out_len) { ciphertext->cd_length = out_len; return (CRYPTO_BUFFER_TOO_SMALL); } saved_offset = ciphertext->cd_offset; saved_length = ciphertext->cd_length; /* * Do the AES update on the specified input data. */ switch (plaintext->cd_format) { case CRYPTO_DATA_RAW: ret = crypto_update_iov(ctx->cc_provider_private, plaintext, ciphertext, aes_encrypt_contiguous_blocks, aes_copy_block64); break; case CRYPTO_DATA_UIO: ret = crypto_update_uio(ctx->cc_provider_private, plaintext, ciphertext, aes_encrypt_contiguous_blocks, aes_copy_block64); break; default: ret = CRYPTO_ARGUMENTS_BAD; } /* * Since AES counter mode is a stream cipher, we call * ctr_mode_final() to pick up any remaining bytes. * It is an internal function that does not destroy * the context like *normal* final routines. */ if ((aes_ctx->ac_flags & CTR_MODE) && (aes_ctx->ac_remainder_len > 0)) { ret = ctr_mode_final((ctr_ctx_t *)aes_ctx, ciphertext, aes_encrypt_block); } if (ret == CRYPTO_SUCCESS) { if (plaintext != ciphertext) ciphertext->cd_length = ciphertext->cd_offset - saved_offset; } else { ciphertext->cd_length = saved_length; } ciphertext->cd_offset = saved_offset; return (ret); } static int aes_decrypt_update(crypto_ctx_t *ctx, crypto_data_t *ciphertext, crypto_data_t *plaintext, crypto_req_handle_t req) { off_t saved_offset; size_t saved_length, out_len; int ret = CRYPTO_SUCCESS; aes_ctx_t *aes_ctx; ASSERT(ctx->cc_provider_private != NULL); aes_ctx = ctx->cc_provider_private; AES_ARG_INPLACE(ciphertext, plaintext); /* * Compute number of bytes that will hold the plaintext. 
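/*
 * Editor's sketch (not part of the diff): the masking arithmetic the
 * update paths above use to size their output. AES_BLOCK_LEN is a
 * power of two, so the AND rounds down to whole blocks; the sub-block
 * tail stays in ac_remainder_len until more data or a final arrives.
 */
#include <stddef.h>

#define AES_BLOCK_LEN	16

static size_t
update_out_len(size_t carried, size_t input)
{
	return ((carried + input) & ~((size_t)AES_BLOCK_LEN - 1));
}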
* This is not necessary for CCM, GCM, and GMAC since these * mechanisms never return plaintext for update operations. */ if ((aes_ctx->ac_flags & (CCM_MODE|GCM_MODE|GMAC_MODE)) == 0) { out_len = aes_ctx->ac_remainder_len; out_len += ciphertext->cd_length; out_len &= ~(AES_BLOCK_LEN - 1); /* return length needed to store the output */ if (plaintext->cd_length < out_len) { plaintext->cd_length = out_len; return (CRYPTO_BUFFER_TOO_SMALL); } } saved_offset = plaintext->cd_offset; saved_length = plaintext->cd_length; if (aes_ctx->ac_flags & (GCM_MODE|GMAC_MODE)) gcm_set_kmflag((gcm_ctx_t *)aes_ctx, crypto_kmflag(req)); /* * Do the AES update on the specified input data. */ switch (ciphertext->cd_format) { case CRYPTO_DATA_RAW: ret = crypto_update_iov(ctx->cc_provider_private, ciphertext, plaintext, aes_decrypt_contiguous_blocks, aes_copy_block64); break; case CRYPTO_DATA_UIO: ret = crypto_update_uio(ctx->cc_provider_private, ciphertext, plaintext, aes_decrypt_contiguous_blocks, aes_copy_block64); break; default: ret = CRYPTO_ARGUMENTS_BAD; } /* * Since AES counter mode is a stream cipher, we call * ctr_mode_final() to pick up any remaining bytes. * It is an internal function that does not destroy * the context like *normal* final routines. */ if ((aes_ctx->ac_flags & CTR_MODE) && (aes_ctx->ac_remainder_len > 0)) { ret = ctr_mode_final((ctr_ctx_t *)aes_ctx, plaintext, aes_encrypt_block); if (ret == CRYPTO_DATA_LEN_RANGE) ret = CRYPTO_ENCRYPTED_DATA_LEN_RANGE; } if (ret == CRYPTO_SUCCESS) { if (ciphertext != plaintext) plaintext->cd_length = plaintext->cd_offset - saved_offset; } else { plaintext->cd_length = saved_length; } plaintext->cd_offset = saved_offset; return (ret); } /* ARGSUSED */ static int aes_encrypt_final(crypto_ctx_t *ctx, crypto_data_t *data, crypto_req_handle_t req) { aes_ctx_t *aes_ctx; int ret; ASSERT(ctx->cc_provider_private != NULL); aes_ctx = ctx->cc_provider_private; if (data->cd_format != CRYPTO_DATA_RAW && data->cd_format != CRYPTO_DATA_UIO) { return (CRYPTO_ARGUMENTS_BAD); } if (aes_ctx->ac_flags & CTR_MODE) { if (aes_ctx->ac_remainder_len > 0) { ret = ctr_mode_final((ctr_ctx_t *)aes_ctx, data, aes_encrypt_block); if (ret != CRYPTO_SUCCESS) return (ret); } } else if (aes_ctx->ac_flags & CCM_MODE) { ret = ccm_encrypt_final((ccm_ctx_t *)aes_ctx, data, AES_BLOCK_LEN, aes_encrypt_block, aes_xor_block); if (ret != CRYPTO_SUCCESS) { return (ret); } } else if (aes_ctx->ac_flags & (GCM_MODE|GMAC_MODE)) { size_t saved_offset = data->cd_offset; ret = gcm_encrypt_final((gcm_ctx_t *)aes_ctx, data, AES_BLOCK_LEN, aes_encrypt_block, aes_copy_block, aes_xor_block); if (ret != CRYPTO_SUCCESS) { return (ret); } data->cd_length = data->cd_offset - saved_offset; data->cd_offset = saved_offset; } else { /* * There must be no unprocessed plaintext. * This happens if the length of the last data is * not a multiple of the AES block length. */ if (aes_ctx->ac_remainder_len > 0) { return (CRYPTO_DATA_LEN_RANGE); } data->cd_length = 0; } (void) aes_free_context(ctx); return (CRYPTO_SUCCESS); } /* ARGSUSED */ static int aes_decrypt_final(crypto_ctx_t *ctx, crypto_data_t *data, crypto_req_handle_t req) { aes_ctx_t *aes_ctx; int ret; off_t saved_offset; size_t saved_length; ASSERT(ctx->cc_provider_private != NULL); aes_ctx = ctx->cc_provider_private; if (data->cd_format != CRYPTO_DATA_RAW && data->cd_format != CRYPTO_DATA_UIO) { return (CRYPTO_ARGUMENTS_BAD); } /* * There must be no unprocessed ciphertext. 
* This happens if the length of the last ciphertext is * not a multiple of the AES block length. */ if (aes_ctx->ac_remainder_len > 0) { if ((aes_ctx->ac_flags & CTR_MODE) == 0) return (CRYPTO_ENCRYPTED_DATA_LEN_RANGE); else { ret = ctr_mode_final((ctr_ctx_t *)aes_ctx, data, aes_encrypt_block); if (ret == CRYPTO_DATA_LEN_RANGE) ret = CRYPTO_ENCRYPTED_DATA_LEN_RANGE; if (ret != CRYPTO_SUCCESS) return (ret); } } if (aes_ctx->ac_flags & CCM_MODE) { /* * This is where all the plaintext is returned, make sure * the plaintext buffer is big enough */ size_t pt_len = aes_ctx->ac_data_len; if (data->cd_length < pt_len) { data->cd_length = pt_len; return (CRYPTO_BUFFER_TOO_SMALL); } ASSERT(aes_ctx->ac_processed_data_len == pt_len); ASSERT(aes_ctx->ac_processed_mac_len == aes_ctx->ac_mac_len); saved_offset = data->cd_offset; saved_length = data->cd_length; ret = ccm_decrypt_final((ccm_ctx_t *)aes_ctx, data, AES_BLOCK_LEN, aes_encrypt_block, aes_copy_block, aes_xor_block); if (ret == CRYPTO_SUCCESS) { data->cd_length = data->cd_offset - saved_offset; } else { data->cd_length = saved_length; } data->cd_offset = saved_offset; if (ret != CRYPTO_SUCCESS) { return (ret); } } else if (aes_ctx->ac_flags & (GCM_MODE|GMAC_MODE)) { /* * This is where all the plaintext is returned, make sure * the plaintext buffer is big enough */ gcm_ctx_t *ctx = (gcm_ctx_t *)aes_ctx; size_t pt_len = ctx->gcm_processed_data_len - ctx->gcm_tag_len; if (data->cd_length < pt_len) { data->cd_length = pt_len; return (CRYPTO_BUFFER_TOO_SMALL); } saved_offset = data->cd_offset; saved_length = data->cd_length; ret = gcm_decrypt_final((gcm_ctx_t *)aes_ctx, data, AES_BLOCK_LEN, aes_encrypt_block, aes_xor_block); if (ret == CRYPTO_SUCCESS) { data->cd_length = data->cd_offset - saved_offset; } else { data->cd_length = saved_length; } data->cd_offset = saved_offset; if (ret != CRYPTO_SUCCESS) { return (ret); } } if ((aes_ctx->ac_flags & (CTR_MODE|CCM_MODE|GCM_MODE|GMAC_MODE)) == 0) { data->cd_length = 0; } (void) aes_free_context(ctx); return (CRYPTO_SUCCESS); } /* ARGSUSED */ static int aes_encrypt_atomic(crypto_provider_handle_t provider, crypto_session_id_t session_id, crypto_mechanism_t *mechanism, crypto_key_t *key, crypto_data_t *plaintext, crypto_data_t *ciphertext, crypto_spi_ctx_template_t template, crypto_req_handle_t req) { aes_ctx_t aes_ctx; /* on the stack */ off_t saved_offset; size_t saved_length; size_t length_needed; int ret; AES_ARG_INPLACE(plaintext, ciphertext); /* * CTR, CCM, GCM, and GMAC modes do not require that plaintext * be a multiple of AES block size. 
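 *
 * (Editorial aside, not part of the patch: for the remaining block
 * modes, the multiple-of-block-size test below uses the usual bitmask
 * shortcut, valid only because AES_BLOCK_LEN is a power of two:
 *
 *	// 1 if len is a multiple of block; block must be 2^n (16 for AES)
 *	static int
 *	toy_is_block_multiple(size_t len, size_t block)
 *	{
 *		return ((len & (block - 1)) == 0);
 *	}
 *
 * e.g. toy_is_block_multiple(32, 16) == 1 but toy_is_block_multiple(33,
 * 16) == 0.)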
*/ switch (mechanism->cm_type) { case AES_CTR_MECH_INFO_TYPE: case AES_CCM_MECH_INFO_TYPE: case AES_GCM_MECH_INFO_TYPE: case AES_GMAC_MECH_INFO_TYPE: break; default: if ((plaintext->cd_length & (AES_BLOCK_LEN - 1)) != 0) return (CRYPTO_DATA_LEN_RANGE); } if ((ret = aes_check_mech_param(mechanism, NULL, 0)) != CRYPTO_SUCCESS) return (ret); bzero(&aes_ctx, sizeof (aes_ctx_t)); ret = aes_common_init_ctx(&aes_ctx, template, mechanism, key, crypto_kmflag(req), B_TRUE); if (ret != CRYPTO_SUCCESS) return (ret); switch (mechanism->cm_type) { case AES_CCM_MECH_INFO_TYPE: length_needed = plaintext->cd_length + aes_ctx.ac_mac_len; break; case AES_GMAC_MECH_INFO_TYPE: if (plaintext->cd_length != 0) return (CRYPTO_ARGUMENTS_BAD); /* FALLTHRU */ case AES_GCM_MECH_INFO_TYPE: length_needed = plaintext->cd_length + aes_ctx.ac_tag_len; break; default: length_needed = plaintext->cd_length; } /* return size of buffer needed to store output */ if (ciphertext->cd_length < length_needed) { ciphertext->cd_length = length_needed; ret = CRYPTO_BUFFER_TOO_SMALL; goto out; } saved_offset = ciphertext->cd_offset; saved_length = ciphertext->cd_length; /* * Do an update on the specified input data. */ switch (plaintext->cd_format) { case CRYPTO_DATA_RAW: ret = crypto_update_iov(&aes_ctx, plaintext, ciphertext, aes_encrypt_contiguous_blocks, aes_copy_block64); break; case CRYPTO_DATA_UIO: ret = crypto_update_uio(&aes_ctx, plaintext, ciphertext, aes_encrypt_contiguous_blocks, aes_copy_block64); break; default: ret = CRYPTO_ARGUMENTS_BAD; } if (ret == CRYPTO_SUCCESS) { if (mechanism->cm_type == AES_CCM_MECH_INFO_TYPE) { ret = ccm_encrypt_final((ccm_ctx_t *)&aes_ctx, ciphertext, AES_BLOCK_LEN, aes_encrypt_block, aes_xor_block); if (ret != CRYPTO_SUCCESS) goto out; ASSERT(aes_ctx.ac_remainder_len == 0); } else if (mechanism->cm_type == AES_GCM_MECH_INFO_TYPE || mechanism->cm_type == AES_GMAC_MECH_INFO_TYPE) { ret = gcm_encrypt_final((gcm_ctx_t *)&aes_ctx, ciphertext, AES_BLOCK_LEN, aes_encrypt_block, aes_copy_block, aes_xor_block); if (ret != CRYPTO_SUCCESS) goto out; ASSERT(aes_ctx.ac_remainder_len == 0); } else if (mechanism->cm_type == AES_CTR_MECH_INFO_TYPE) { if (aes_ctx.ac_remainder_len > 0) { ret = ctr_mode_final((ctr_ctx_t *)&aes_ctx, ciphertext, aes_encrypt_block); if (ret != CRYPTO_SUCCESS) goto out; } } else { ASSERT(aes_ctx.ac_remainder_len == 0); } if (plaintext != ciphertext) { ciphertext->cd_length = ciphertext->cd_offset - saved_offset; } } else { ciphertext->cd_length = saved_length; } ciphertext->cd_offset = saved_offset; out: if (aes_ctx.ac_flags & PROVIDER_OWNS_KEY_SCHEDULE) { bzero(aes_ctx.ac_keysched, aes_ctx.ac_keysched_len); kmem_free(aes_ctx.ac_keysched, aes_ctx.ac_keysched_len); } return (ret); } /* ARGSUSED */ static int aes_decrypt_atomic(crypto_provider_handle_t provider, crypto_session_id_t session_id, crypto_mechanism_t *mechanism, crypto_key_t *key, crypto_data_t *ciphertext, crypto_data_t *plaintext, crypto_spi_ctx_template_t template, crypto_req_handle_t req) { aes_ctx_t aes_ctx; /* on the stack */ off_t saved_offset; size_t saved_length; size_t length_needed; int ret; AES_ARG_INPLACE(ciphertext, plaintext); /* * CCM, GCM, CTR, and GMAC modes do not require that ciphertext * be a multiple of AES block size. 
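 *
 * (Editorial aside, not part of the patch: note the recurring "report
 * the size, keep going" convention below: when the output descriptor is
 * too small, the required size is written back into cd_length and
 * CRYPTO_BUFFER_TOO_SMALL is returned, so the caller can retry with a
 * larger buffer. Toy sketch, names invented:
 *
 *	static int
 *	toy_reserve(struct toy_cd *out, size_t needed)
 *	{
 *		if (out->cd_length < needed) {
 *			out->cd_length = needed;  // tell caller how much
 *			return (TOY_BUFFER_TOO_SMALL);
 *		}
 *		return (TOY_SUCCESS);
 *	}
 * )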
*/ switch (mechanism->cm_type) { case AES_CTR_MECH_INFO_TYPE: case AES_CCM_MECH_INFO_TYPE: case AES_GCM_MECH_INFO_TYPE: case AES_GMAC_MECH_INFO_TYPE: break; default: if ((ciphertext->cd_length & (AES_BLOCK_LEN - 1)) != 0) return (CRYPTO_ENCRYPTED_DATA_LEN_RANGE); } if ((ret = aes_check_mech_param(mechanism, NULL, 0)) != CRYPTO_SUCCESS) return (ret); bzero(&aes_ctx, sizeof (aes_ctx_t)); ret = aes_common_init_ctx(&aes_ctx, template, mechanism, key, crypto_kmflag(req), B_FALSE); if (ret != CRYPTO_SUCCESS) return (ret); switch (mechanism->cm_type) { case AES_CCM_MECH_INFO_TYPE: length_needed = aes_ctx.ac_data_len; break; case AES_GCM_MECH_INFO_TYPE: length_needed = ciphertext->cd_length - aes_ctx.ac_tag_len; break; case AES_GMAC_MECH_INFO_TYPE: if (plaintext->cd_length != 0) return (CRYPTO_ARGUMENTS_BAD); length_needed = 0; break; default: length_needed = ciphertext->cd_length; } /* return size of buffer needed to store output */ if (plaintext->cd_length < length_needed) { plaintext->cd_length = length_needed; ret = CRYPTO_BUFFER_TOO_SMALL; goto out; } saved_offset = plaintext->cd_offset; saved_length = plaintext->cd_length; if (mechanism->cm_type == AES_GCM_MECH_INFO_TYPE || mechanism->cm_type == AES_GMAC_MECH_INFO_TYPE) gcm_set_kmflag((gcm_ctx_t *)&aes_ctx, crypto_kmflag(req)); /* * Do an update on the specified input data. */ switch (ciphertext->cd_format) { case CRYPTO_DATA_RAW: ret = crypto_update_iov(&aes_ctx, ciphertext, plaintext, aes_decrypt_contiguous_blocks, aes_copy_block64); break; case CRYPTO_DATA_UIO: ret = crypto_update_uio(&aes_ctx, ciphertext, plaintext, aes_decrypt_contiguous_blocks, aes_copy_block64); break; default: ret = CRYPTO_ARGUMENTS_BAD; } if (ret == CRYPTO_SUCCESS) { if (mechanism->cm_type == AES_CCM_MECH_INFO_TYPE) { ASSERT(aes_ctx.ac_processed_data_len == aes_ctx.ac_data_len); ASSERT(aes_ctx.ac_processed_mac_len == aes_ctx.ac_mac_len); ret = ccm_decrypt_final((ccm_ctx_t *)&aes_ctx, plaintext, AES_BLOCK_LEN, aes_encrypt_block, aes_copy_block, aes_xor_block); ASSERT(aes_ctx.ac_remainder_len == 0); if ((ret == CRYPTO_SUCCESS) && (ciphertext != plaintext)) { plaintext->cd_length = plaintext->cd_offset - saved_offset; } else { plaintext->cd_length = saved_length; } } else if (mechanism->cm_type == AES_GCM_MECH_INFO_TYPE || mechanism->cm_type == AES_GMAC_MECH_INFO_TYPE) { ret = gcm_decrypt_final((gcm_ctx_t *)&aes_ctx, plaintext, AES_BLOCK_LEN, aes_encrypt_block, aes_xor_block); ASSERT(aes_ctx.ac_remainder_len == 0); if ((ret == CRYPTO_SUCCESS) && (ciphertext != plaintext)) { plaintext->cd_length = plaintext->cd_offset - saved_offset; } else { plaintext->cd_length = saved_length; } } else if (mechanism->cm_type != AES_CTR_MECH_INFO_TYPE) { ASSERT(aes_ctx.ac_remainder_len == 0); if (ciphertext != plaintext) plaintext->cd_length = plaintext->cd_offset - saved_offset; } else { if (aes_ctx.ac_remainder_len > 0) { ret = ctr_mode_final((ctr_ctx_t *)&aes_ctx, plaintext, aes_encrypt_block); if (ret == CRYPTO_DATA_LEN_RANGE) ret = CRYPTO_ENCRYPTED_DATA_LEN_RANGE; if (ret != CRYPTO_SUCCESS) goto out; } if (ciphertext != plaintext) plaintext->cd_length = plaintext->cd_offset - saved_offset; } } else { plaintext->cd_length = saved_length; } plaintext->cd_offset = saved_offset; out: if (aes_ctx.ac_flags & PROVIDER_OWNS_KEY_SCHEDULE) { bzero(aes_ctx.ac_keysched, aes_ctx.ac_keysched_len); kmem_free(aes_ctx.ac_keysched, aes_ctx.ac_keysched_len); } if (aes_ctx.ac_flags & CCM_MODE) { if (aes_ctx.ac_pt_buf != NULL) { vmem_free(aes_ctx.ac_pt_buf, aes_ctx.ac_data_len); } } else if 
(aes_ctx.ac_flags & (GCM_MODE|GMAC_MODE)) { if (((gcm_ctx_t *)&aes_ctx)->gcm_pt_buf != NULL) { vmem_free(((gcm_ctx_t *)&aes_ctx)->gcm_pt_buf, ((gcm_ctx_t *)&aes_ctx)->gcm_pt_buf_len); } } return (ret); } /* * KCF software provider context template entry points. */ /* ARGSUSED */ static int aes_create_ctx_template(crypto_provider_handle_t provider, crypto_mechanism_t *mechanism, crypto_key_t *key, crypto_spi_ctx_template_t *tmpl, size_t *tmpl_size, crypto_req_handle_t req) { void *keysched; size_t size; int rv; if (mechanism->cm_type != AES_ECB_MECH_INFO_TYPE && mechanism->cm_type != AES_CBC_MECH_INFO_TYPE && mechanism->cm_type != AES_CTR_MECH_INFO_TYPE && mechanism->cm_type != AES_CCM_MECH_INFO_TYPE && mechanism->cm_type != AES_GCM_MECH_INFO_TYPE && mechanism->cm_type != AES_GMAC_MECH_INFO_TYPE) return (CRYPTO_MECHANISM_INVALID); if ((keysched = aes_alloc_keysched(&size, crypto_kmflag(req))) == NULL) { return (CRYPTO_HOST_MEMORY); } /* * Initialize key schedule. Key length information is stored * in the key. */ if ((rv = init_keysched(key, keysched)) != CRYPTO_SUCCESS) { bzero(keysched, size); kmem_free(keysched, size); return (rv); } *tmpl = keysched; *tmpl_size = size; return (CRYPTO_SUCCESS); } static int aes_free_context(crypto_ctx_t *ctx) { aes_ctx_t *aes_ctx = ctx->cc_provider_private; if (aes_ctx != NULL) { if (aes_ctx->ac_flags & PROVIDER_OWNS_KEY_SCHEDULE) { ASSERT(aes_ctx->ac_keysched_len != 0); bzero(aes_ctx->ac_keysched, aes_ctx->ac_keysched_len); kmem_free(aes_ctx->ac_keysched, aes_ctx->ac_keysched_len); } crypto_free_mode_ctx(aes_ctx); ctx->cc_provider_private = NULL; } return (CRYPTO_SUCCESS); } static int aes_common_init_ctx(aes_ctx_t *aes_ctx, crypto_spi_ctx_template_t *template, crypto_mechanism_t *mechanism, crypto_key_t *key, int kmflag, boolean_t is_encrypt_init) { int rv = CRYPTO_SUCCESS; void *keysched; size_t size = 0; if (template == NULL) { if ((keysched = aes_alloc_keysched(&size, kmflag)) == NULL) return (CRYPTO_HOST_MEMORY); /* * Initialize key schedule. * Key length is stored in the key. 
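 *
 * (Editorial aside, not part of the patch: on failure below, and in
 * aes_free_context() above, the schedule is wiped with bzero() before
 * kmem_free() so key material never lingers in freed memory. A
 * userland analogue must keep the compiler from eliding the wipe as a
 * dead store, e.g. with a volatile walk or explicit_bzero() where
 * available:
 *
 *	static void
 *	toy_free_keysched(void *ks, size_t len)
 *	{
 *		volatile unsigned char *p = ks;
 *
 *		while (len-- > 0)
 *			*p++ = 0;	// cannot be optimized away
 *		free(ks);
 *	}
 * )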
*/ if ((rv = init_keysched(key, keysched)) != CRYPTO_SUCCESS) { kmem_free(keysched, size); return (rv); } aes_ctx->ac_flags |= PROVIDER_OWNS_KEY_SCHEDULE; aes_ctx->ac_keysched_len = size; } else { keysched = template; } aes_ctx->ac_keysched = keysched; switch (mechanism->cm_type) { case AES_CBC_MECH_INFO_TYPE: rv = cbc_init_ctx((cbc_ctx_t *)aes_ctx, mechanism->cm_param, mechanism->cm_param_len, AES_BLOCK_LEN, aes_copy_block64); break; case AES_CTR_MECH_INFO_TYPE: { CK_AES_CTR_PARAMS *pp; if (mechanism->cm_param == NULL || mechanism->cm_param_len != sizeof (CK_AES_CTR_PARAMS)) { return (CRYPTO_MECHANISM_PARAM_INVALID); } pp = (CK_AES_CTR_PARAMS *)(void *)mechanism->cm_param; rv = ctr_init_ctx((ctr_ctx_t *)aes_ctx, pp->ulCounterBits, pp->cb, aes_copy_block); break; } case AES_CCM_MECH_INFO_TYPE: if (mechanism->cm_param == NULL || mechanism->cm_param_len != sizeof (CK_AES_CCM_PARAMS)) { return (CRYPTO_MECHANISM_PARAM_INVALID); } rv = ccm_init_ctx((ccm_ctx_t *)aes_ctx, mechanism->cm_param, kmflag, is_encrypt_init, AES_BLOCK_LEN, aes_encrypt_block, aes_xor_block); break; case AES_GCM_MECH_INFO_TYPE: if (mechanism->cm_param == NULL || mechanism->cm_param_len != sizeof (CK_AES_GCM_PARAMS)) { return (CRYPTO_MECHANISM_PARAM_INVALID); } rv = gcm_init_ctx((gcm_ctx_t *)aes_ctx, mechanism->cm_param, AES_BLOCK_LEN, aes_encrypt_block, aes_copy_block, aes_xor_block); break; case AES_GMAC_MECH_INFO_TYPE: if (mechanism->cm_param == NULL || mechanism->cm_param_len != sizeof (CK_AES_GMAC_PARAMS)) { return (CRYPTO_MECHANISM_PARAM_INVALID); } rv = gmac_init_ctx((gcm_ctx_t *)aes_ctx, mechanism->cm_param, AES_BLOCK_LEN, aes_encrypt_block, aes_copy_block, aes_xor_block); break; case AES_ECB_MECH_INFO_TYPE: aes_ctx->ac_flags |= ECB_MODE; } if (rv != CRYPTO_SUCCESS) { if (aes_ctx->ac_flags & PROVIDER_OWNS_KEY_SCHEDULE) { bzero(keysched, size); kmem_free(keysched, size); } } return (rv); } static int process_gmac_mech(crypto_mechanism_t *mech, crypto_data_t *data, CK_AES_GCM_PARAMS *gcm_params) { /* LINTED: pointer alignment */ CK_AES_GMAC_PARAMS *params = (CK_AES_GMAC_PARAMS *)mech->cm_param; if (mech->cm_type != AES_GMAC_MECH_INFO_TYPE) return (CRYPTO_MECHANISM_INVALID); if (mech->cm_param_len != sizeof (CK_AES_GMAC_PARAMS)) return (CRYPTO_MECHANISM_PARAM_INVALID); if (params->pIv == NULL) return (CRYPTO_MECHANISM_PARAM_INVALID); gcm_params->pIv = params->pIv; gcm_params->ulIvLen = AES_GMAC_IV_LEN; gcm_params->ulTagBits = AES_GMAC_TAG_BITS; if (data == NULL) return (CRYPTO_SUCCESS); if (data->cd_format != CRYPTO_DATA_RAW) return (CRYPTO_ARGUMENTS_BAD); gcm_params->pAAD = (uchar_t *)data->cd_raw.iov_base; gcm_params->ulAADLen = data->cd_length; return (CRYPTO_SUCCESS); } static int aes_mac_atomic(crypto_provider_handle_t provider, crypto_session_id_t session_id, crypto_mechanism_t *mechanism, crypto_key_t *key, crypto_data_t *data, crypto_data_t *mac, crypto_spi_ctx_template_t template, crypto_req_handle_t req) { CK_AES_GCM_PARAMS gcm_params; crypto_mechanism_t gcm_mech; int rv; if ((rv = process_gmac_mech(mechanism, data, &gcm_params)) != CRYPTO_SUCCESS) return (rv); gcm_mech.cm_type = AES_GCM_MECH_INFO_TYPE; gcm_mech.cm_param_len = sizeof (CK_AES_GCM_PARAMS); gcm_mech.cm_param = (char *)&gcm_params; return (aes_encrypt_atomic(provider, session_id, &gcm_mech, key, &null_crypto_data, mac, template, req)); } static int aes_mac_verify_atomic(crypto_provider_handle_t provider, crypto_session_id_t session_id, crypto_mechanism_t *mechanism, crypto_key_t *key, crypto_data_t *data, crypto_data_t *mac, 
crypto_spi_ctx_template_t template, crypto_req_handle_t req) { CK_AES_GCM_PARAMS gcm_params; crypto_mechanism_t gcm_mech; int rv; if ((rv = process_gmac_mech(mechanism, data, &gcm_params)) != CRYPTO_SUCCESS) return (rv); gcm_mech.cm_type = AES_GCM_MECH_INFO_TYPE; gcm_mech.cm_param_len = sizeof (CK_AES_GCM_PARAMS); gcm_mech.cm_param = (char *)&gcm_params; return (aes_decrypt_atomic(provider, session_id, &gcm_mech, key, mac, &null_crypto_data, template, req)); } diff --git a/module/icp/io/edonr_mod.c b/module/icp/io/edonr_mod.c index 19b5c963d805..cb748a954e25 100644 --- a/module/icp/io/edonr_mod.c +++ b/module/icp/io/edonr_mod.c @@ -1,62 +1,63 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2013 Saso Kiselkov. All rights reserved. */ #include #include #include #include #include #include /* * Unlike sha2 or skein, we won't expose edonr via the Kernel Cryptographic * Framework (KCF), because Edon-R is *NOT* suitable for general-purpose * cryptographic use. Users of Edon-R must interface directly to this module. */ static struct modlmisc modlmisc = { &mod_cryptoops, "Edon-R Message-Digest Algorithm" }; static struct modlinkage modlinkage = { MODREV_1, {&modlmisc, NULL} }; int edonr_mod_init(void) { int error; if ((error = mod_install(&modlinkage)) != 0) return (error); return (0); } int -edonr_mod_fini(void) { +edonr_mod_fini(void) +{ return (mod_remove(&modlinkage)); } diff --git a/module/icp/io/skein_mod.c b/module/icp/io/skein_mod.c index 705b1e819078..6db31c3559ac 100644 --- a/module/icp/io/skein_mod.c +++ b/module/icp/io/skein_mod.c @@ -1,734 +1,735 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2013 Saso Kiselkov. All rights reserved. */ #include #include #include #include #include #define SKEIN_MODULE_IMPL #include /* * Like the sha2 module, we create the skein module with two modlinkages: * - modlmisc to allow direct calls to Skein_* API functions. * - modlcrypto to integrate well into the Kernel Crypto Framework (KCF). 
*/ static struct modlmisc modlmisc = { &mod_cryptoops, "Skein Message-Digest Algorithm" }; static struct modlcrypto modlcrypto = { &mod_cryptoops, "Skein Kernel SW Provider" }; static struct modlinkage modlinkage = { MODREV_1, {&modlmisc, &modlcrypto, NULL} }; static crypto_mech_info_t skein_mech_info_tab[] = { {CKM_SKEIN_256, SKEIN_256_MECH_INFO_TYPE, CRYPTO_FG_DIGEST | CRYPTO_FG_DIGEST_ATOMIC, 0, 0, CRYPTO_KEYSIZE_UNIT_IN_BITS}, {CKM_SKEIN_256_MAC, SKEIN_256_MAC_MECH_INFO_TYPE, CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC, 1, INT_MAX, CRYPTO_KEYSIZE_UNIT_IN_BYTES}, {CKM_SKEIN_512, SKEIN_512_MECH_INFO_TYPE, CRYPTO_FG_DIGEST | CRYPTO_FG_DIGEST_ATOMIC, 0, 0, CRYPTO_KEYSIZE_UNIT_IN_BITS}, {CKM_SKEIN_512_MAC, SKEIN_512_MAC_MECH_INFO_TYPE, CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC, 1, INT_MAX, CRYPTO_KEYSIZE_UNIT_IN_BYTES}, {CKM_SKEIN1024, SKEIN1024_MECH_INFO_TYPE, CRYPTO_FG_DIGEST | CRYPTO_FG_DIGEST_ATOMIC, 0, 0, CRYPTO_KEYSIZE_UNIT_IN_BITS}, {CKM_SKEIN1024_MAC, SKEIN1024_MAC_MECH_INFO_TYPE, CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC, 1, INT_MAX, CRYPTO_KEYSIZE_UNIT_IN_BYTES} }; static void skein_provider_status(crypto_provider_handle_t, uint_t *); static crypto_control_ops_t skein_control_ops = { skein_provider_status }; static int skein_digest_init(crypto_ctx_t *, crypto_mechanism_t *, crypto_req_handle_t); static int skein_digest(crypto_ctx_t *, crypto_data_t *, crypto_data_t *, crypto_req_handle_t); static int skein_update(crypto_ctx_t *, crypto_data_t *, crypto_req_handle_t); static int skein_final(crypto_ctx_t *, crypto_data_t *, crypto_req_handle_t); static int skein_digest_atomic(crypto_provider_handle_t, crypto_session_id_t, crypto_mechanism_t *, crypto_data_t *, crypto_data_t *, crypto_req_handle_t); static crypto_digest_ops_t skein_digest_ops = { skein_digest_init, skein_digest, skein_update, NULL, skein_final, skein_digest_atomic }; static int skein_mac_init(crypto_ctx_t *, crypto_mechanism_t *, crypto_key_t *, crypto_spi_ctx_template_t, crypto_req_handle_t); static int skein_mac_atomic(crypto_provider_handle_t, crypto_session_id_t, crypto_mechanism_t *, crypto_key_t *, crypto_data_t *, crypto_data_t *, crypto_spi_ctx_template_t, crypto_req_handle_t); static crypto_mac_ops_t skein_mac_ops = { skein_mac_init, NULL, skein_update, /* using regular digest update is OK here */ skein_final, /* using regular digest final is OK here */ skein_mac_atomic, NULL }; static int skein_create_ctx_template(crypto_provider_handle_t, crypto_mechanism_t *, crypto_key_t *, crypto_spi_ctx_template_t *, size_t *, crypto_req_handle_t); static int skein_free_context(crypto_ctx_t *); static crypto_ctx_ops_t skein_ctx_ops = { skein_create_ctx_template, skein_free_context }; static crypto_ops_t skein_crypto_ops = {{{{{ &skein_control_ops, &skein_digest_ops, NULL, &skein_mac_ops, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, &skein_ctx_ops, }}}}}; static crypto_provider_info_t skein_prov_info = {{{{ CRYPTO_SPI_VERSION_1, "Skein Software Provider", CRYPTO_SW_PROVIDER, NULL, &skein_crypto_ops, sizeof (skein_mech_info_tab) / sizeof (crypto_mech_info_t), skein_mech_info_tab }}}}; static crypto_kcf_provider_handle_t skein_prov_handle = 0; typedef struct skein_ctx { skein_mech_type_t sc_mech_type; size_t sc_digest_bitlen; /*LINTED(E_ANONYMOUS_UNION_DECL)*/ union { Skein_256_Ctxt_t sc_256; Skein_512_Ctxt_t sc_512; Skein1024_Ctxt_t sc_1024; }; } skein_ctx_t; #define SKEIN_CTX(_ctx_) ((skein_ctx_t *)((_ctx_)->cc_provider_private)) #define SKEIN_CTX_LVALUE(_ctx_) (_ctx_)->cc_provider_private #define SKEIN_OP(_skein_ctx, _op, ...) 
\ do { \ skein_ctx_t *sc = (_skein_ctx); \ switch (sc->sc_mech_type) { \ case SKEIN_256_MECH_INFO_TYPE: \ case SKEIN_256_MAC_MECH_INFO_TYPE: \ (void) Skein_256_ ## _op(&sc->sc_256, __VA_ARGS__);\ break; \ case SKEIN_512_MECH_INFO_TYPE: \ case SKEIN_512_MAC_MECH_INFO_TYPE: \ (void) Skein_512_ ## _op(&sc->sc_512, __VA_ARGS__);\ break; \ case SKEIN1024_MECH_INFO_TYPE: \ case SKEIN1024_MAC_MECH_INFO_TYPE: \ (void) Skein1024_ ## _op(&sc->sc_1024, __VA_ARGS__);\ break; \ } \ _NOTE(CONSTCOND) \ } while (0) static int skein_get_digest_bitlen(const crypto_mechanism_t *mechanism, size_t *result) { if (mechanism->cm_param != NULL) { /*LINTED(E_BAD_PTR_CAST_ALIGN)*/ skein_param_t *param = (skein_param_t *)mechanism->cm_param; if (mechanism->cm_param_len != sizeof (*param) || param->sp_digest_bitlen == 0) { return (CRYPTO_MECHANISM_PARAM_INVALID); } *result = param->sp_digest_bitlen; } else { switch (mechanism->cm_type) { case SKEIN_256_MECH_INFO_TYPE: *result = 256; break; case SKEIN_512_MECH_INFO_TYPE: *result = 512; break; case SKEIN1024_MECH_INFO_TYPE: *result = 1024; break; default: return (CRYPTO_MECHANISM_INVALID); } } return (CRYPTO_SUCCESS); } int skein_mod_init(void) { int error; if ((error = mod_install(&modlinkage)) != 0) return (error); /* * Try to register with KCF - failure shouldn't unload us, since we * still may want to continue providing misc/skein functionality. */ (void) crypto_register_provider(&skein_prov_info, &skein_prov_handle); return (0); } int -skein_mod_fini(void) { +skein_mod_fini(void) +{ int ret; if (skein_prov_handle != 0) { if ((ret = crypto_unregister_provider(skein_prov_handle)) != CRYPTO_SUCCESS) { cmn_err(CE_WARN, "skein _fini: crypto_unregister_provider() " "failed (0x%x)", ret); return (EBUSY); } skein_prov_handle = 0; } return (mod_remove(&modlinkage)); } /* * KCF software provider control entry points. */ /* ARGSUSED */ static void skein_provider_status(crypto_provider_handle_t provider, uint_t *status) { *status = CRYPTO_PROVIDER_READY; } /* * General Skein hashing helper functions. */ /* * Performs an Update on a context with uio input data. */ static int skein_digest_update_uio(skein_ctx_t *ctx, const crypto_data_t *data) { off_t offset = data->cd_offset; size_t length = data->cd_length; uint_t vec_idx; size_t cur_len; const uio_t *uio = data->cd_uio; /* we support only kernel buffer */ if (uio->uio_segflg != UIO_SYSSPACE) return (CRYPTO_ARGUMENTS_BAD); /* * Jump to the first iovec containing data to be * digested. */ for (vec_idx = 0; vec_idx < uio->uio_iovcnt && offset >= uio->uio_iov[vec_idx].iov_len; offset -= uio->uio_iov[vec_idx++].iov_len) ; if (vec_idx == uio->uio_iovcnt) { /* * The caller specified an offset that is larger than the * total size of the buffers it provided. */ return (CRYPTO_DATA_LEN_RANGE); } /* * Now do the digesting on the iovecs. */ while (vec_idx < uio->uio_iovcnt && length > 0) { cur_len = MIN(uio->uio_iov[vec_idx].iov_len - offset, length); SKEIN_OP(ctx, Update, (uint8_t *)uio->uio_iov[vec_idx].iov_base + offset, cur_len); length -= cur_len; vec_idx++; offset = 0; } if (vec_idx == uio->uio_iovcnt && length > 0) { /* * The end of the specified iovec's was reached but * the length requested could not be processed, i.e. * The caller requested to digest more data than it provided. */ return (CRYPTO_DATA_LEN_RANGE); } return (CRYPTO_SUCCESS); } /* * Performs a Final on a context and writes to a uio digest output. 
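 */

/*
 * Editorial sketch, not part of the patch: the two-phase iovec walk
 * used by skein_digest_update_uio() above (and by
 * skein_digest_final_uio() below), reduced to a POSIX userland routine:
 * skip whole iovecs until the logical offset falls inside one, then
 * consume from successive iovecs. Returns -1 where the kernel code
 * returns CRYPTO_DATA_LEN_RANGE; toy_* names are illustrative.
 */
#include <stddef.h>
#include <string.h>
#include <sys/uio.h>

static int
toy_uio_copy(const struct iovec *iov, int iovcnt, size_t off,
    unsigned char *dst, size_t len)
{
	int i = 0;

	/* phase 1: find the iovec containing logical offset `off' */
	while (i < iovcnt && off >= iov[i].iov_len)
		off -= iov[i++].iov_len;
	if (i == iovcnt)
		return (-1);	/* offset past the data provided */

	/* phase 2: copy across iovec boundaries */
	while (i < iovcnt && len > 0) {
		size_t cur = iov[i].iov_len - off;

		if (cur > len)
			cur = len;
		memcpy(dst, (const unsigned char *)iov[i].iov_base + off,
		    cur);
		dst += cur;
		len -= cur;
		off = 0;
		i++;
	}
	return (len > 0 ? -1 : 0);	/* -1: more requested than given */
}

/*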
 */
static int
skein_digest_final_uio(skein_ctx_t *ctx, crypto_data_t *digest,
    crypto_req_handle_t req)
{
	off_t offset = digest->cd_offset;
	uint_t vec_idx;
	uio_t *uio = digest->cd_uio;

	/* we support only kernel buffer */
	if (uio->uio_segflg != UIO_SYSSPACE)
		return (CRYPTO_ARGUMENTS_BAD);

	/*
	 * Jump to the first iovec containing ptr to the digest to be
	 * returned. (Bounds-check the index before dereferencing the
	 * iovec, as skein_digest_update_uio() does.)
	 */
	for (vec_idx = 0; vec_idx < uio->uio_iovcnt &&
	    offset >= uio->uio_iov[vec_idx].iov_len;
	    offset -= uio->uio_iov[vec_idx++].iov_len)
		;
	if (vec_idx == uio->uio_iovcnt) {
		/*
		 * The caller specified an offset that is larger than the
		 * total size of the buffers it provided.
		 */
		return (CRYPTO_DATA_LEN_RANGE);
	}
	if (offset + CRYPTO_BITS2BYTES(ctx->sc_digest_bitlen) <=
	    uio->uio_iov[vec_idx].iov_len) {
		/* The computed digest will fit in the current iovec. */
		SKEIN_OP(ctx, Final,
		    (uchar_t *)uio->uio_iov[vec_idx].iov_base + offset);
	} else {
		uint8_t *digest_tmp;
		off_t scratch_offset = 0;
		size_t length = CRYPTO_BITS2BYTES(ctx->sc_digest_bitlen);
		size_t cur_len;

		digest_tmp = kmem_alloc(CRYPTO_BITS2BYTES(
		    ctx->sc_digest_bitlen), crypto_kmflag(req));
		if (digest_tmp == NULL)
			return (CRYPTO_HOST_MEMORY);
		SKEIN_OP(ctx, Final, digest_tmp);
		while (vec_idx < uio->uio_iovcnt && length > 0) {
			cur_len = MIN(uio->uio_iov[vec_idx].iov_len - offset,
			    length);
			bcopy(digest_tmp + scratch_offset,
			    uio->uio_iov[vec_idx].iov_base + offset, cur_len);

			length -= cur_len;
			vec_idx++;
			scratch_offset += cur_len;
			offset = 0;
		}
		kmem_free(digest_tmp,
		    CRYPTO_BITS2BYTES(ctx->sc_digest_bitlen));

		if (vec_idx == uio->uio_iovcnt && length > 0) {
			/*
			 * The end of the specified iovecs was reached but
			 * the length requested could not be processed, i.e.,
			 * the caller requested to digest more data than it
			 * provided.
			 */
			return (CRYPTO_DATA_LEN_RANGE);
		}
	}

	return (CRYPTO_SUCCESS);
}

/*
 * KCF software provider digest entry points.
 */

/*
 * Initializes a skein digest context to the configuration in `mechanism'.
 * The mechanism cm_type must be one of SKEIN_*_MECH_INFO_TYPE. The cm_param
 * field may contain a skein_param_t structure indicating the length of the
 * digest the algorithm should produce. Otherwise the default output lengths
 * are applied (32 bytes for Skein-256, 64 bytes for Skein-512 and 128 bytes
 * for Skein-1024).
 */
static int
skein_digest_init(crypto_ctx_t *ctx, crypto_mechanism_t *mechanism,
    crypto_req_handle_t req)
{
	int error = CRYPTO_SUCCESS;

	if (!VALID_SKEIN_DIGEST_MECH(mechanism->cm_type))
		return (CRYPTO_MECHANISM_INVALID);

	SKEIN_CTX_LVALUE(ctx) = kmem_alloc(sizeof (*SKEIN_CTX(ctx)),
	    crypto_kmflag(req));
	if (SKEIN_CTX(ctx) == NULL)
		return (CRYPTO_HOST_MEMORY);

	SKEIN_CTX(ctx)->sc_mech_type = mechanism->cm_type;
	error = skein_get_digest_bitlen(mechanism,
	    &SKEIN_CTX(ctx)->sc_digest_bitlen);
	if (error != CRYPTO_SUCCESS)
		goto errout;
	SKEIN_OP(SKEIN_CTX(ctx), Init, SKEIN_CTX(ctx)->sc_digest_bitlen);

	return (CRYPTO_SUCCESS);
errout:
	bzero(SKEIN_CTX(ctx), sizeof (*SKEIN_CTX(ctx)));
	kmem_free(SKEIN_CTX(ctx), sizeof (*SKEIN_CTX(ctx)));
	SKEIN_CTX_LVALUE(ctx) = NULL;
	return (error);
}

/*
 * Executes a skein_update and skein_digest on a pre-initialized crypto
 * context in a single step. See the documentation to these functions to
 * see what to pass here.
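 */

/*
 * Editorial sketch, not part of the patch: skein_digest() and
 * skein_final() below size-check the output buffer against the digest
 * length in bits converted with CRYPTO_BITS2BYTES. A round-up
 * conversion along those lines (the exact ICP definition may differ in
 * form), with the default digest sizes as examples:
 */
#define	TOY_BITS2BYTES(b)	(((b) + 7) >> 3)
/* TOY_BITS2BYTES(256) == 32, (512) == 64, (1024) == 128 */

/*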
*/ static int skein_digest(crypto_ctx_t *ctx, crypto_data_t *data, crypto_data_t *digest, crypto_req_handle_t req) { int error = CRYPTO_SUCCESS; ASSERT(SKEIN_CTX(ctx) != NULL); if (digest->cd_length < CRYPTO_BITS2BYTES(SKEIN_CTX(ctx)->sc_digest_bitlen)) { digest->cd_length = CRYPTO_BITS2BYTES(SKEIN_CTX(ctx)->sc_digest_bitlen); return (CRYPTO_BUFFER_TOO_SMALL); } error = skein_update(ctx, data, req); if (error != CRYPTO_SUCCESS) { bzero(SKEIN_CTX(ctx), sizeof (*SKEIN_CTX(ctx))); kmem_free(SKEIN_CTX(ctx), sizeof (*SKEIN_CTX(ctx))); SKEIN_CTX_LVALUE(ctx) = NULL; digest->cd_length = 0; return (error); } error = skein_final(ctx, digest, req); return (error); } /* * Performs a skein Update with the input message in `data' (successive calls * can push more data). This is used both for digest and MAC operation. * Supported input data formats are raw, uio and mblk. */ /*ARGSUSED*/ static int skein_update(crypto_ctx_t *ctx, crypto_data_t *data, crypto_req_handle_t req) { int error = CRYPTO_SUCCESS; ASSERT(SKEIN_CTX(ctx) != NULL); switch (data->cd_format) { case CRYPTO_DATA_RAW: SKEIN_OP(SKEIN_CTX(ctx), Update, (uint8_t *)data->cd_raw.iov_base + data->cd_offset, data->cd_length); break; case CRYPTO_DATA_UIO: error = skein_digest_update_uio(SKEIN_CTX(ctx), data); break; default: error = CRYPTO_ARGUMENTS_BAD; } return (error); } /* * Performs a skein Final, writing the output to `digest'. This is used both * for digest and MAC operation. * Supported output digest formats are raw, uio and mblk. */ /*ARGSUSED*/ static int skein_final(crypto_ctx_t *ctx, crypto_data_t *digest, crypto_req_handle_t req) { int error = CRYPTO_SUCCESS; ASSERT(SKEIN_CTX(ctx) != NULL); if (digest->cd_length < CRYPTO_BITS2BYTES(SKEIN_CTX(ctx)->sc_digest_bitlen)) { digest->cd_length = CRYPTO_BITS2BYTES(SKEIN_CTX(ctx)->sc_digest_bitlen); return (CRYPTO_BUFFER_TOO_SMALL); } switch (digest->cd_format) { case CRYPTO_DATA_RAW: SKEIN_OP(SKEIN_CTX(ctx), Final, (uint8_t *)digest->cd_raw.iov_base + digest->cd_offset); break; case CRYPTO_DATA_UIO: error = skein_digest_final_uio(SKEIN_CTX(ctx), digest, req); break; default: error = CRYPTO_ARGUMENTS_BAD; } if (error == CRYPTO_SUCCESS) digest->cd_length = CRYPTO_BITS2BYTES(SKEIN_CTX(ctx)->sc_digest_bitlen); else digest->cd_length = 0; bzero(SKEIN_CTX(ctx), sizeof (*SKEIN_CTX(ctx))); kmem_free(SKEIN_CTX(ctx), sizeof (*(SKEIN_CTX(ctx)))); SKEIN_CTX_LVALUE(ctx) = NULL; return (error); } /* * Performs a full skein digest computation in a single call, configuring the * algorithm according to `mechanism', reading the input to be digested from * `data' and writing the output to `digest'. * Supported input/output formats are raw, uio and mblk. 
 */
/*ARGSUSED*/
static int
skein_digest_atomic(crypto_provider_handle_t provider,
    crypto_session_id_t session_id, crypto_mechanism_t *mechanism,
    crypto_data_t *data, crypto_data_t *digest, crypto_req_handle_t req)
{
	int error;
	skein_ctx_t skein_ctx;
	crypto_ctx_t ctx;
	SKEIN_CTX_LVALUE(&ctx) = &skein_ctx;

	/* Init */
	if (!VALID_SKEIN_DIGEST_MECH(mechanism->cm_type))
		return (CRYPTO_MECHANISM_INVALID);
	skein_ctx.sc_mech_type = mechanism->cm_type;
	error = skein_get_digest_bitlen(mechanism,
	    &skein_ctx.sc_digest_bitlen);
	if (error != CRYPTO_SUCCESS)
		goto out;
	SKEIN_OP(&skein_ctx, Init, skein_ctx.sc_digest_bitlen);

	if ((error = skein_update(&ctx, data, req)) != CRYPTO_SUCCESS)
		goto out;
	if ((error = skein_final(&ctx, digest, req)) != CRYPTO_SUCCESS)
		goto out;
out:
	if (error == CRYPTO_SUCCESS)
		digest->cd_length =
		    CRYPTO_BITS2BYTES(skein_ctx.sc_digest_bitlen);
	else
		digest->cd_length = 0;
	bzero(&skein_ctx, sizeof (skein_ctx));

	return (error);
}

/*
 * Helper function that builds a Skein MAC context from the provided
 * mechanism and key.
 */
static int
skein_mac_ctx_build(skein_ctx_t *ctx, crypto_mechanism_t *mechanism,
    crypto_key_t *key)
{
	int error;

	if (!VALID_SKEIN_MAC_MECH(mechanism->cm_type))
		return (CRYPTO_MECHANISM_INVALID);
	if (key->ck_format != CRYPTO_KEY_RAW)
		return (CRYPTO_ARGUMENTS_BAD);
	ctx->sc_mech_type = mechanism->cm_type;
	error = skein_get_digest_bitlen(mechanism, &ctx->sc_digest_bitlen);
	if (error != CRYPTO_SUCCESS)
		return (error);
	SKEIN_OP(ctx, InitExt, ctx->sc_digest_bitlen, 0, key->ck_data,
	    CRYPTO_BITS2BYTES(key->ck_length));

	return (CRYPTO_SUCCESS);
}

/*
 * KCF software provider mac entry points.
 */

/*
 * Initializes a skein MAC context. You may pass a ctx_template, in which
 * case the template will be reused to make initialization more efficient.
 * Otherwise a new context will be constructed. The mechanism cm_type must
 * be one of SKEIN_*_MAC_MECH_INFO_TYPE. As in skein_digest_init, you
 * may pass a skein_param_t in cm_param to configure the length of the
 * digest. The key must be in raw format.
 */
static int
skein_mac_init(crypto_ctx_t *ctx, crypto_mechanism_t *mechanism,
    crypto_key_t *key, crypto_spi_ctx_template_t ctx_template,
    crypto_req_handle_t req)
{
	int error;

	SKEIN_CTX_LVALUE(ctx) = kmem_alloc(sizeof (*SKEIN_CTX(ctx)),
	    crypto_kmflag(req));
	if (SKEIN_CTX(ctx) == NULL)
		return (CRYPTO_HOST_MEMORY);

	if (ctx_template != NULL) {
		bcopy(ctx_template, SKEIN_CTX(ctx),
		    sizeof (*SKEIN_CTX(ctx)));
	} else {
		error = skein_mac_ctx_build(SKEIN_CTX(ctx), mechanism, key);
		if (error != CRYPTO_SUCCESS)
			goto errout;
	}

	return (CRYPTO_SUCCESS);
errout:
	bzero(SKEIN_CTX(ctx), sizeof (*SKEIN_CTX(ctx)));
	kmem_free(SKEIN_CTX(ctx), sizeof (*SKEIN_CTX(ctx)));
	return (error);
}

/*
 * The MAC update and final calls are reused from the regular digest code.
 */

/*ARGSUSED*/
/*
 * Same as skein_digest_atomic, performs an atomic Skein MAC operation in
 * one step. All the same properties apply to the arguments of this
 * function as to those of the partial operations above.
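 */

/*
 * Editorial sketch, not part of the patch: skein_digest_atomic() above
 * and skein_mac_atomic() below share a trick: a "faux" KCF context is
 * built on the stack and pointed at stack state, so the ordinary
 * update/final entry points can be reused with no allocation. The shape
 * of it, with invented toy_* types and trivial stand-in callees:
 */
#include <stddef.h>

struct toy_kcf_ctx { void *cc_provider_private; };
struct toy_state { int t_bits; };

static int
toy_update(struct toy_kcf_ctx *c, const void *in, size_t n)
{
	(void) c; (void) in; (void) n;	/* stand-in for skein_update() */
	return (0);
}

static int
toy_final(struct toy_kcf_ctx *c, void *out, size_t n)
{
	(void) c; (void) out; (void) n;	/* stand-in for skein_final() */
	return (0);
}

static int
toy_atomic(const void *in, size_t inlen, void *out, size_t outlen)
{
	struct toy_state st = { 0 };
	struct toy_kcf_ctx ctx;
	int err;

	ctx.cc_provider_private = &st;	/* faux context, all on the stack */
	if ((err = toy_update(&ctx, in, inlen)) != 0)
		return (err);
	return (toy_final(&ctx, out, outlen));
}

/*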
*/ static int skein_mac_atomic(crypto_provider_handle_t provider, crypto_session_id_t session_id, crypto_mechanism_t *mechanism, crypto_key_t *key, crypto_data_t *data, crypto_data_t *mac, crypto_spi_ctx_template_t ctx_template, crypto_req_handle_t req) { /* faux crypto context just for skein_digest_{update,final} */ int error; crypto_ctx_t ctx; skein_ctx_t skein_ctx; SKEIN_CTX_LVALUE(&ctx) = &skein_ctx; if (ctx_template != NULL) { bcopy(ctx_template, &skein_ctx, sizeof (skein_ctx)); } else { error = skein_mac_ctx_build(&skein_ctx, mechanism, key); if (error != CRYPTO_SUCCESS) goto errout; } if ((error = skein_update(&ctx, data, req)) != CRYPTO_SUCCESS) goto errout; if ((error = skein_final(&ctx, mac, req)) != CRYPTO_SUCCESS) goto errout; return (CRYPTO_SUCCESS); errout: bzero(&skein_ctx, sizeof (skein_ctx)); return (error); } /* * KCF software provider context management entry points. */ /* * Constructs a context template for the Skein MAC algorithm. The same * properties apply to the arguments of this function as to those of * skein_mac_init. */ /*ARGSUSED*/ static int skein_create_ctx_template(crypto_provider_handle_t provider, crypto_mechanism_t *mechanism, crypto_key_t *key, crypto_spi_ctx_template_t *ctx_template, size_t *ctx_template_size, crypto_req_handle_t req) { int error; skein_ctx_t *ctx_tmpl; ctx_tmpl = kmem_alloc(sizeof (*ctx_tmpl), crypto_kmflag(req)); if (ctx_tmpl == NULL) return (CRYPTO_HOST_MEMORY); error = skein_mac_ctx_build(ctx_tmpl, mechanism, key); if (error != CRYPTO_SUCCESS) goto errout; *ctx_template = ctx_tmpl; *ctx_template_size = sizeof (*ctx_tmpl); return (CRYPTO_SUCCESS); errout: bzero(ctx_tmpl, sizeof (*ctx_tmpl)); kmem_free(ctx_tmpl, sizeof (*ctx_tmpl)); return (error); } /* * Frees a skein context in a parent crypto context. */ static int skein_free_context(crypto_ctx_t *ctx) { if (SKEIN_CTX(ctx) != NULL) { bzero(SKEIN_CTX(ctx), sizeof (*SKEIN_CTX(ctx))); kmem_free(SKEIN_CTX(ctx), sizeof (*SKEIN_CTX(ctx))); SKEIN_CTX_LVALUE(ctx) = NULL; } return (CRYPTO_SUCCESS); } diff --git a/module/unicode/u8_textprep.c b/module/unicode/u8_textprep.c index 74253c50d9b0..0330032fa0ef 100644 --- a/module/unicode/u8_textprep.c +++ b/module/unicode/u8_textprep.c @@ -1,2157 +1,2156 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* * UTF-8 text preparation functions (PSARC/2007/149, PSARC/2007/458). * * Man pages: u8_textprep_open(9F), u8_textprep_buf(9F), u8_textprep_close(9F), * u8_textprep_str(9F), u8_strcmp(9F), and u8_validate(9F). See also * the section 3C man pages. * Interface stability: Committed. 
*/ #include #ifdef _KERNEL #include #include #include #include #include #include #include #else #include #include #endif /* _KERNEL */ #include #include #include /* The maximum possible number of bytes in a UTF-8 character. */ #define U8_MB_CUR_MAX (4) /* * The maximum number of bytes needed for a UTF-8 character to cover * U+0000 - U+FFFF, i.e., the coding space of now deprecated UCS-2. */ #define U8_MAX_BYTES_UCS2 (3) /* The maximum possible number of bytes in a Stream-Safe Text. */ #define U8_STREAM_SAFE_TEXT_MAX (128) /* * The maximum number of characters in a combining/conjoining sequence and * the actual upperbound limit of a combining/conjoining sequence. */ #define U8_MAX_CHARS_A_SEQ (32) #define U8_UPPER_LIMIT_IN_A_SEQ (31) /* The combining class value for Starter. */ #define U8_COMBINING_CLASS_STARTER (0) /* * Some Hangul related macros at below. * * The first and the last of Hangul syllables, Hangul Jamo Leading consonants, * Vowels, and optional Trailing consonants in Unicode scalar values. * * Please be noted that the U8_HANGUL_JAMO_T_FIRST is 0x11A7 at below not * the actual U+11A8. This is due to that the trailing consonant is optional * and thus we are doing a pre-calculation of subtracting one. * * Each of 19 modern leading consonants has total 588 possible syllables since * Hangul has 21 modern vowels and 27 modern trailing consonants plus 1 for * no trailing consonant case, i.e., 21 x 28 = 588. * * We also have bunch of Hangul related macros at below. Please bear in mind * that the U8_HANGUL_JAMO_1ST_BYTE can be used to check whether it is * a Hangul Jamo or not but the value does not guarantee that it is a Hangul * Jamo; it just guarantee that it will be most likely. */ #define U8_HANGUL_SYL_FIRST (0xAC00U) #define U8_HANGUL_SYL_LAST (0xD7A3U) #define U8_HANGUL_JAMO_L_FIRST (0x1100U) #define U8_HANGUL_JAMO_L_LAST (0x1112U) #define U8_HANGUL_JAMO_V_FIRST (0x1161U) #define U8_HANGUL_JAMO_V_LAST (0x1175U) #define U8_HANGUL_JAMO_T_FIRST (0x11A7U) #define U8_HANGUL_JAMO_T_LAST (0x11C2U) #define U8_HANGUL_V_COUNT (21) #define U8_HANGUL_VT_COUNT (588) #define U8_HANGUL_T_COUNT (28) #define U8_HANGUL_JAMO_1ST_BYTE (0xE1U) #define U8_SAVE_HANGUL_AS_UTF8(s, i, j, k, b) \ (s)[(i)] = (uchar_t)(0xE0U | ((uint32_t)(b) & 0xF000U) >> 12); \ (s)[(j)] = (uchar_t)(0x80U | ((uint32_t)(b) & 0x0FC0U) >> 6); \ (s)[(k)] = (uchar_t)(0x80U | ((uint32_t)(b) & 0x003FU)); #define U8_HANGUL_JAMO_L(u) \ ((u) >= U8_HANGUL_JAMO_L_FIRST && (u) <= U8_HANGUL_JAMO_L_LAST) #define U8_HANGUL_JAMO_V(u) \ ((u) >= U8_HANGUL_JAMO_V_FIRST && (u) <= U8_HANGUL_JAMO_V_LAST) #define U8_HANGUL_JAMO_T(u) \ ((u) > U8_HANGUL_JAMO_T_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST) #define U8_HANGUL_JAMO(u) \ ((u) >= U8_HANGUL_JAMO_L_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST) #define U8_HANGUL_SYLLABLE(u) \ ((u) >= U8_HANGUL_SYL_FIRST && (u) <= U8_HANGUL_SYL_LAST) #define U8_HANGUL_COMPOSABLE_L_V(s, u) \ ((s) == U8_STATE_HANGUL_L && U8_HANGUL_JAMO_V((u))) #define U8_HANGUL_COMPOSABLE_LV_T(s, u) \ ((s) == U8_STATE_HANGUL_LV && U8_HANGUL_JAMO_T((u))) /* The types of decomposition mappings. */ #define U8_DECOMP_BOTH (0xF5U) #define U8_DECOMP_CANONICAL (0xF6U) /* The indicator for 16-bit table. */ #define U8_16BIT_TABLE_INDICATOR (0x8000U) /* The following are some convenience macros. 
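 */

/*
 * Editorial sketch, not part of the patch: U8_SAVE_HANGUL_AS_UTF8 above
 * (and U8_PUT_3BYTES_INTO_UTF32 just below, in reverse) implement the
 * three-byte UTF-8 form 1110xxxx 10xxxxxx 10xxxxxx. The encoder as a
 * plain function, valid for scalar values in U+0800..U+FFFF; the toy_
 * name is illustrative:
 */
static void
toy_put_3bytes_utf8(unsigned char *s, unsigned int u)
{
	s[0] = (unsigned char)(0xE0U | (u >> 12));		/* 1110xxxx */
	s[1] = (unsigned char)(0x80U | ((u >> 6) & 0x3FU));	/* 10xxxxxx */
	s[2] = (unsigned char)(0x80U | (u & 0x3FU));		/* 10xxxxxx */
}
/* e.g. U+AC00 (the first Hangul syllable) encodes as EA B0 80 */

/*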
*/ #define U8_PUT_3BYTES_INTO_UTF32(u, b1, b2, b3) \ (u) = ((((uint32_t)(b1) & 0x0F) << 12) | \ (((uint32_t)(b2) & 0x3F) << 6) | \ ((uint32_t)(b3) & 0x3F)); #define U8_SIMPLE_SWAP(a, b, t) \ (t) = (a); \ (a) = (b); \ (b) = (t); #define U8_ASCII_TOUPPER(c) \ (((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 'A' : (c)) #define U8_ASCII_TOLOWER(c) \ (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' + 'a' : (c)) #define U8_ISASCII(c) (((uchar_t)(c)) < 0x80U) /* * The following macro assumes that the two characters that are to be * swapped are adjacent to each other and 'a' comes before 'b'. * * If the assumptions are not met, then, the macro will fail. */ #define U8_SWAP_COMB_MARKS(a, b) \ for (k = 0; k < disp[(a)]; k++) \ u8t[k] = u8s[start[(a)] + k]; \ for (k = 0; k < disp[(b)]; k++) \ u8s[start[(a)] + k] = u8s[start[(b)] + k]; \ start[(b)] = start[(a)] + disp[(b)]; \ for (k = 0; k < disp[(a)]; k++) \ u8s[start[(b)] + k] = u8t[k]; \ U8_SIMPLE_SWAP(comb_class[(a)], comb_class[(b)], tc); \ U8_SIMPLE_SWAP(disp[(a)], disp[(b)], tc); /* The possible states during normalization. */ typedef enum { U8_STATE_START = 0, U8_STATE_HANGUL_L = 1, U8_STATE_HANGUL_LV = 2, U8_STATE_HANGUL_LVT = 3, U8_STATE_HANGUL_V = 4, U8_STATE_HANGUL_T = 5, U8_STATE_COMBINING_MARK = 6 } u8_normalization_states_t; /* * The three vectors at below are used to check bytes of a given UTF-8 * character are valid and not containing any malformed byte values. * * We used to have a quite relaxed UTF-8 binary representation but then there * was some security related issues and so the Unicode Consortium defined * and announced the UTF-8 Corrigendum at Unicode 3.1 and then refined it * one more time at the Unicode 3.2. The following three tables are based on * that. */ #define U8_ILLEGAL_NEXT_BYTE_COMMON(c) ((c) < 0x80 || (c) > 0xBF) #define I_ U8_ILLEGAL_CHAR #define O_ U8_OUT_OF_RANGE_CHAR const int8_t u8_number_of_bytes[0x100] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 80 81 82 83 84 85 86 87 88 89 8A 8B 8C 8D 8E 8F */ I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, /* 90 91 92 93 94 95 96 97 98 99 9A 9B 9C 9D 9E 9F */ I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, /* A0 A1 A2 A3 A4 A5 A6 A7 A8 A9 AA AB AC AD AE AF */ I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, /* B0 B1 B2 B3 B4 B5 B6 B7 B8 B9 BA BB BC BD BE BF */ I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, /* C0 C1 C2 C3 C4 C5 C6 C7 C8 C9 CA CB CC CD CE CF */ I_, I_, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0 D1 D2 D3 D4 D5 D6 D7 D8 D9 DA DB DC DD DE DF */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* E0 E1 E2 E3 E4 E5 E6 E7 E8 E9 EA EB EC ED EE EF */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 FA FB FC FD FE FF */ 4, 4, 4, 4, 4, O_, O_, O_, O_, O_, O_, O_, O_, O_, O_, O_, }; #undef I_ #undef O_ const uint8_t u8_valid_min_2nd_byte[0x100] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* C0 C1 C2 C3 C4 C5 C6 C7 */ 0, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, /* C8 C9 CA CB CC CD CE CF */ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, /* D0 D1 D2 D3 D4 D5 D6 D7 */ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, /* D8 D9 DA DB DC DD DE DF */ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, /* E0 E1 E2 E3 E4 E5 E6 E7 */ 0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, /* E8 E9 EA EB EC ED EE EF */ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, /* F0 F1 F2 F3 F4 F5 F6 F7 */ 0x90, 0x80, 0x80, 0x80, 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; const uint8_t u8_valid_max_2nd_byte[0x100] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* C0 C1 C2 C3 C4 C5 C6 C7 */ 0, 0, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, /* C8 C9 CA CB CC CD CE CF */ 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, /* D0 D1 D2 D3 D4 D5 D6 D7 */ 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, /* D8 D9 DA DB DC DD DE DF */ 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, /* E0 E1 E2 E3 E4 E5 E6 E7 */ 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, /* E8 E9 EA EB EC ED EE EF */ 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf, /* F0 F1 F2 F3 F4 F5 F6 F7 */ 0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; /* * The u8_validate() validates on the given UTF-8 character string and * calculate the byte length. It is quite similar to mblen(3C) except that * this will validate against the list of characters if required and * specific to UTF-8 and Unicode. */ int u8_validate(char *u8str, size_t n, char **list, int flag, int *errnum) { uchar_t *ib; uchar_t *ibtail; uchar_t **p; uchar_t *s1; uchar_t *s2; uchar_t f; int sz; size_t i; int ret_val; boolean_t second; boolean_t no_need_to_validate_entire; boolean_t check_additional; boolean_t validate_ucs2_range_only; if (! u8str) return (0); ib = (uchar_t *)u8str; ibtail = ib + n; ret_val = 0; no_need_to_validate_entire = ! (flag & U8_VALIDATE_ENTIRE); check_additional = flag & U8_VALIDATE_CHECK_ADDITIONAL; validate_ucs2_range_only = flag & U8_VALIDATE_UCS2_RANGE; while (ib < ibtail) { /* * The first byte of a UTF-8 character tells how many * bytes will follow for the character. If the first byte * is an illegal byte value or out of range value, we just * return -1 with an appropriate error number. */ sz = u8_number_of_bytes[*ib]; if (sz == U8_ILLEGAL_CHAR) { *errnum = EILSEQ; return (-1); } if (sz == U8_OUT_OF_RANGE_CHAR || (validate_ucs2_range_only && sz > U8_MAX_BYTES_UCS2)) { *errnum = ERANGE; return (-1); } /* * If we don't have enough bytes to check on, that's also * an error. As you can see, we give illegal byte sequence * checking higher priority then EINVAL cases. 
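 *
 * (Editorial aside, not part of the patch: the per-character check that
 * follows is table-driven. The lead byte gives the length, the second
 * byte is range-checked per lead byte -- this is what rejects overlong
 * forms and surrogates, e.g. E0 requires A0..BF while ED allows only
 * 80..9F -- and later bytes need only the common 80..BF test. Reduced
 * sketch with an invented toy_ signature over the tables above:
 *
 *	static int
 *	toy_u8_char_len(const unsigned char *s, size_t avail)
 *	{
 *		int i, sz = u8_number_of_bytes[s[0]];
 *
 *		if (sz <= 0)
 *			return (-1);		// EILSEQ/ERANGE lead byte
 *		if ((size_t)sz > avail)
 *			return (0);		// incomplete: EINVAL
 *		for (i = 1; i < sz; i++) {
 *			if (i == 1 ? (s[1] < u8_valid_min_2nd_byte[s[0]] ||
 *			    s[1] > u8_valid_max_2nd_byte[s[0]]) :
 *			    U8_ILLEGAL_NEXT_BYTE_COMMON(s[i]))
 *				return (-1);	// EILSEQ trail byte
 *		}
 *		return (sz);
 *	}
 * )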
*/ if ((ibtail - ib) < sz) { *errnum = EINVAL; return (-1); } if (sz == 1) { ib++; ret_val++; } else { /* * Check on the multi-byte UTF-8 character. For more * details on this, see comment added for the used * data structures at the beginning of the file. */ f = *ib++; ret_val++; second = B_TRUE; for (i = 1; i < sz; i++) { if (second) { if (*ib < u8_valid_min_2nd_byte[f] || *ib > u8_valid_max_2nd_byte[f]) { *errnum = EILSEQ; return (-1); } second = B_FALSE; } else if (U8_ILLEGAL_NEXT_BYTE_COMMON(*ib)) { *errnum = EILSEQ; return (-1); } ib++; ret_val++; } } if (check_additional) { for (p = (uchar_t **)list, i = 0; p[i]; i++) { s1 = ib - sz; s2 = p[i]; while (s1 < ib) { if (*s1 != *s2 || *s2 == '\0') break; s1++; s2++; } if (s1 >= ib && *s2 == '\0') { *errnum = EBADF; return (-1); } } } if (no_need_to_validate_entire) break; } return (ret_val); } /* * The do_case_conv() looks at the mapping tables and returns found * bytes if any. If not found, the input bytes are returned. The function * always terminate the return bytes with a null character assuming that * there are plenty of room to do so. * * The case conversions are simple case conversions mapping a character to * another character as specified in the Unicode data. The byte size of * the mapped character could be different from that of the input character. * * The return value is the byte length of the returned character excluding * the terminating null byte. */ static size_t do_case_conv(int uv, uchar_t *u8s, uchar_t *s, int sz, boolean_t is_it_toupper) { size_t i; uint16_t b1 = 0; uint16_t b2 = 0; uint16_t b3 = 0; uint16_t b3_tbl; uint16_t b3_base; uint16_t b4 = 0; size_t start_id; size_t end_id; /* * At this point, the only possible values for sz are 2, 3, and 4. * The u8s should point to a vector that is well beyond the size of * 5 bytes. */ if (sz == 2) { b3 = u8s[0] = s[0]; b4 = u8s[1] = s[1]; } else if (sz == 3) { b2 = u8s[0] = s[0]; b3 = u8s[1] = s[1]; b4 = u8s[2] = s[2]; } else if (sz == 4) { b1 = u8s[0] = s[0]; b2 = u8s[1] = s[1]; b3 = u8s[2] = s[2]; b4 = u8s[3] = s[3]; } else { /* This is not possible but just in case as a fallback. */ if (is_it_toupper) *u8s = U8_ASCII_TOUPPER(*s); else *u8s = U8_ASCII_TOLOWER(*s); u8s[1] = '\0'; return (1); } u8s[sz] = '\0'; /* * Let's find out if we have a corresponding character. */ b1 = u8_common_b1_tbl[uv][b1]; if (b1 == U8_TBL_ELEMENT_NOT_DEF) return ((size_t)sz); b2 = u8_case_common_b2_tbl[uv][b1][b2]; if (b2 == U8_TBL_ELEMENT_NOT_DEF) return ((size_t)sz); if (is_it_toupper) { b3_tbl = u8_toupper_b3_tbl[uv][b2][b3].tbl_id; if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF) return ((size_t)sz); start_id = u8_toupper_b4_tbl[uv][b3_tbl][b4]; end_id = u8_toupper_b4_tbl[uv][b3_tbl][b4 + 1]; /* Either there is no match or an error at the table. */ if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX) return ((size_t)sz); b3_base = u8_toupper_b3_tbl[uv][b2][b3].base; for (i = 0; start_id < end_id; start_id++) u8s[i++] = u8_toupper_final_tbl[uv][b3_base + start_id]; } else { b3_tbl = u8_tolower_b3_tbl[uv][b2][b3].tbl_id; if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF) return ((size_t)sz); start_id = u8_tolower_b4_tbl[uv][b3_tbl][b4]; end_id = u8_tolower_b4_tbl[uv][b3_tbl][b4 + 1]; if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX) return ((size_t)sz); b3_base = u8_tolower_b3_tbl[uv][b2][b3].base; for (i = 0; start_id < end_id; start_id++) u8s[i++] = u8_tolower_final_tbl[uv][b3_base + start_id]; } /* * If i is still zero, that means there is no corresponding character. 
*/ if (i == 0) return ((size_t)sz); u8s[i] = '\0'; return (i); } /* * The do_case_compare() function compares the two input strings, s1 and s2, * one character at a time doing case conversions if applicable and return * the comparison result as like strcmp(). * * Since, in empirical sense, most of text data are 7-bit ASCII characters, * we treat the 7-bit ASCII characters as a special case trying to yield * faster processing time. */ static int do_case_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1, - size_t n2, boolean_t is_it_toupper, int *errnum) + size_t n2, boolean_t is_it_toupper, int *errnum) { int f; int sz1; int sz2; size_t j; size_t i1; size_t i2; uchar_t u8s1[U8_MB_CUR_MAX + 1]; uchar_t u8s2[U8_MB_CUR_MAX + 1]; i1 = i2 = 0; while (i1 < n1 && i2 < n2) { /* * Find out what would be the byte length for this UTF-8 * character at string s1 and also find out if this is * an illegal start byte or not and if so, issue a proper * error number and yet treat this byte as a character. */ sz1 = u8_number_of_bytes[*s1]; if (sz1 < 0) { *errnum = EILSEQ; sz1 = 1; } /* * For 7-bit ASCII characters mainly, we do a quick case * conversion right at here. * * If we don't have enough bytes for this character, issue * an EINVAL error and use what are available. * * If we have enough bytes, find out if there is * a corresponding uppercase character and if so, copy over * the bytes for a comparison later. If there is no * corresponding uppercase character, then, use what we have * for the comparison. */ if (sz1 == 1) { if (is_it_toupper) u8s1[0] = U8_ASCII_TOUPPER(*s1); else u8s1[0] = U8_ASCII_TOLOWER(*s1); s1++; u8s1[1] = '\0'; } else if ((i1 + sz1) > n1) { *errnum = EINVAL; for (j = 0; (i1 + j) < n1; ) u8s1[j++] = *s1++; u8s1[j] = '\0'; } else { (void) do_case_conv(uv, u8s1, s1, sz1, is_it_toupper); s1 += sz1; } /* Do the same for the string s2. */ sz2 = u8_number_of_bytes[*s2]; if (sz2 < 0) { *errnum = EILSEQ; sz2 = 1; } if (sz2 == 1) { if (is_it_toupper) u8s2[0] = U8_ASCII_TOUPPER(*s2); else u8s2[0] = U8_ASCII_TOLOWER(*s2); s2++; u8s2[1] = '\0'; } else if ((i2 + sz2) > n2) { *errnum = EINVAL; for (j = 0; (i2 + j) < n2; ) u8s2[j++] = *s2++; u8s2[j] = '\0'; } else { (void) do_case_conv(uv, u8s2, s2, sz2, is_it_toupper); s2 += sz2; } /* Now compare the two characters. */ if (sz1 == 1 && sz2 == 1) { if (*u8s1 > *u8s2) return (1); if (*u8s1 < *u8s2) return (-1); } else { f = strcmp((const char *)u8s1, (const char *)u8s2); if (f != 0) return (f); } /* * They were the same. Let's move on to the next * characters then. */ i1 += sz1; i2 += sz2; } /* * We compared until the end of either or both strings. * * If we reached to or went over the ends for the both, that means * they are the same. * * If we reached only one of the two ends, that means the other string * has something which then the fact can be used to determine * the return value. */ if (i1 >= n1) { if (i2 >= n2) return (0); return (-1); } return (1); } /* * The combining_class() function checks on the given bytes and find out * the corresponding Unicode combining class value. The return value 0 means * it is a Starter. Any illegal UTF-8 character will also be treated as * a Starter. 
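 */

/*
 * Editorial sketch, not part of the patch: combining_class() below,
 * like do_case_conv() and do_decomp(), walks staged lookup tables, one
 * level per byte of the character, bailing out to a default whenever a
 * level holds the not-defined sentinel. A two-level toy version of the
 * walk; the names and the sentinel value are illustrative:
 */
#define	TOY_NOT_DEF	0xFFFFU

static unsigned int
toy_staged_lookup(const unsigned short l1[256],
    const unsigned char (*l2)[256], unsigned char b1, unsigned char b2)
{
	unsigned short idx = l1[b1];	/* first byte picks an l2 table */

	if (idx == TOY_NOT_DEF)
		return (0);		/* no data: e.g. class Starter */
	return (l2[idx][b2]);		/* second byte indexes within it */
}

/*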
*/ static uchar_t combining_class(size_t uv, uchar_t *s, size_t sz) { uint16_t b1 = 0; uint16_t b2 = 0; uint16_t b3 = 0; uint16_t b4 = 0; if (sz == 1 || sz > 4) return (0); if (sz == 2) { b3 = s[0]; b4 = s[1]; } else if (sz == 3) { b2 = s[0]; b3 = s[1]; b4 = s[2]; } else if (sz == 4) { b1 = s[0]; b2 = s[1]; b3 = s[2]; b4 = s[3]; } b1 = u8_common_b1_tbl[uv][b1]; if (b1 == U8_TBL_ELEMENT_NOT_DEF) return (0); b2 = u8_combining_class_b2_tbl[uv][b1][b2]; if (b2 == U8_TBL_ELEMENT_NOT_DEF) return (0); b3 = u8_combining_class_b3_tbl[uv][b2][b3]; if (b3 == U8_TBL_ELEMENT_NOT_DEF) return (0); return (u8_combining_class_b4_tbl[uv][b3][b4]); } /* * The do_decomp() function finds out a matching decomposition if any * and return. If there is no match, the input bytes are copied and returned. * The function also checks if there is a Hangul, decomposes it if necessary * and returns. * * To save time, a single byte 7-bit ASCII character should be handled by * the caller. * * The function returns the number of bytes returned sans always terminating * the null byte. It will also return a state that will tell if there was * a Hangul character decomposed which then will be used by the caller. */ static size_t do_decomp(size_t uv, uchar_t *u8s, uchar_t *s, int sz, - boolean_t canonical_decomposition, u8_normalization_states_t *state) + boolean_t canonical_decomposition, u8_normalization_states_t *state) { uint16_t b1 = 0; uint16_t b2 = 0; uint16_t b3 = 0; uint16_t b3_tbl; uint16_t b3_base; uint16_t b4 = 0; size_t start_id; size_t end_id; size_t i; uint32_t u1; if (sz == 2) { b3 = u8s[0] = s[0]; b4 = u8s[1] = s[1]; u8s[2] = '\0'; } else if (sz == 3) { /* Convert it to a Unicode scalar value. */ U8_PUT_3BYTES_INTO_UTF32(u1, s[0], s[1], s[2]); /* * If this is a Hangul syllable, we decompose it into * a leading consonant, a vowel, and an optional trailing * consonant and then return. */ if (U8_HANGUL_SYLLABLE(u1)) { u1 -= U8_HANGUL_SYL_FIRST; b1 = U8_HANGUL_JAMO_L_FIRST + u1 / U8_HANGUL_VT_COUNT; b2 = U8_HANGUL_JAMO_V_FIRST + (u1 % U8_HANGUL_VT_COUNT) / U8_HANGUL_T_COUNT; b3 = u1 % U8_HANGUL_T_COUNT; U8_SAVE_HANGUL_AS_UTF8(u8s, 0, 1, 2, b1); U8_SAVE_HANGUL_AS_UTF8(u8s, 3, 4, 5, b2); if (b3) { b3 += U8_HANGUL_JAMO_T_FIRST; U8_SAVE_HANGUL_AS_UTF8(u8s, 6, 7, 8, b3); u8s[9] = '\0'; *state = U8_STATE_HANGUL_LVT; return (9); } u8s[6] = '\0'; *state = U8_STATE_HANGUL_LV; return (6); } b2 = u8s[0] = s[0]; b3 = u8s[1] = s[1]; b4 = u8s[2] = s[2]; u8s[3] = '\0'; /* * If this is a Hangul Jamo, we know there is nothing * further that we can decompose. */ if (U8_HANGUL_JAMO_L(u1)) { *state = U8_STATE_HANGUL_L; return (3); } if (U8_HANGUL_JAMO_V(u1)) { if (*state == U8_STATE_HANGUL_L) *state = U8_STATE_HANGUL_LV; else *state = U8_STATE_HANGUL_V; return (3); } if (U8_HANGUL_JAMO_T(u1)) { if (*state == U8_STATE_HANGUL_LV) *state = U8_STATE_HANGUL_LVT; else *state = U8_STATE_HANGUL_T; return (3); } } else if (sz == 4) { b1 = u8s[0] = s[0]; b2 = u8s[1] = s[1]; b3 = u8s[2] = s[2]; b4 = u8s[3] = s[3]; u8s[4] = '\0'; } else { /* * This is a fallback and should not happen if the function * was called properly. */ u8s[0] = s[0]; u8s[1] = '\0'; *state = U8_STATE_START; return (1); } /* * At this point, this routine does not know what it would get. * The caller should sort it out if the state isn't a Hangul one. */ *state = U8_STATE_START; /* Try to find matching decomposition mapping byte sequence. 
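* The lookup below walks a four-level trie indexed by the raw UTF-8 bytes (b1 -> b2 -> b3 -> b4); any level may yield U8_TBL_ELEMENT_NOT_DEF, meaning the character has no decomposition mapping.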
*/ b1 = u8_common_b1_tbl[uv][b1]; if (b1 == U8_TBL_ELEMENT_NOT_DEF) return ((size_t)sz); b2 = u8_decomp_b2_tbl[uv][b1][b2]; if (b2 == U8_TBL_ELEMENT_NOT_DEF) return ((size_t)sz); b3_tbl = u8_decomp_b3_tbl[uv][b2][b3].tbl_id; if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF) return ((size_t)sz); /* * If b3_tbl is bigger than or equal to U8_16BIT_TABLE_INDICATOR * which is 0x8000, this means we couldn't fit the mappings into * the cardinality of a unsigned byte. */ if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) { b3_tbl -= U8_16BIT_TABLE_INDICATOR; start_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4]; end_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4 + 1]; } else { start_id = u8_decomp_b4_tbl[uv][b3_tbl][b4]; end_id = u8_decomp_b4_tbl[uv][b3_tbl][b4 + 1]; } /* This also means there wasn't any matching decomposition. */ if (start_id >= end_id) return ((size_t)sz); /* * The final table for decomposition mappings has three types of * byte sequences depending on whether a mapping is for compatibility * decomposition, canonical decomposition, or both like the following: * * (1) Compatibility decomposition mappings: * * +---+---+-...-+---+ * | B0| B1| ... | Bm| * +---+---+-...-+---+ * * The first byte, B0, is always less then 0xF5 (U8_DECOMP_BOTH). * * (2) Canonical decomposition mappings: * * +---+---+---+-...-+---+ * | T | b0| b1| ... | bn| * +---+---+---+-...-+---+ * * where the first byte, T, is 0xF6 (U8_DECOMP_CANONICAL). * * (3) Both mappings: * * +---+---+---+---+-...-+---+---+---+-...-+---+ * | T | D | b0| b1| ... | bn| B0| B1| ... | Bm| * +---+---+---+---+-...-+---+---+---+-...-+---+ * * where T is 0xF5 (U8_DECOMP_BOTH) and D is a displacement * byte, b0 to bn are canonical mapping bytes and B0 to Bm are * compatibility mapping bytes. * * Note that compatibility decomposition means doing recursive * decompositions using both compatibility decomposition mappings and * canonical decomposition mappings. On the other hand, canonical * decomposition means doing recursive decompositions using only * canonical decomposition mappings. Since the table we have has gone * through the recursions already, we do not need to do so during * runtime, i.e., the table has been completely flattened out * already. */ b3_base = u8_decomp_b3_tbl[uv][b2][b3].base; /* Get the type, T, of the byte sequence. */ b1 = u8_decomp_final_tbl[uv][b3_base + start_id]; /* * If necessary, adjust start_id, end_id, or both. Note that if * this is compatibility decomposition mapping, there is no * adjustment. */ if (canonical_decomposition) { /* Is the mapping only for compatibility decomposition? */ if (b1 < U8_DECOMP_BOTH) return ((size_t)sz); start_id++; if (b1 == U8_DECOMP_BOTH) { end_id = start_id + u8_decomp_final_tbl[uv][b3_base + start_id]; start_id++; } } else { /* * Unless this is a compatibility decomposition mapping, * we adjust the start_id. */ if (b1 == U8_DECOMP_BOTH) { start_id++; start_id += u8_decomp_final_tbl[uv][b3_base + start_id]; } else if (b1 == U8_DECOMP_CANONICAL) { start_id++; } } for (i = 0; start_id < end_id; start_id++) u8s[i++] = u8_decomp_final_tbl[uv][b3_base + start_id]; u8s[i] = '\0'; return (i); } /* * The find_composition_start() function uses the character bytes given and * find out the matching composition mappings if any and return the address * to the composition mappings as explained in the do_composition(). 
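* For example, the pair U+0065 'e' and U+0301 COMBINING ACUTE ACCENT has the composite U+00E9 LATIN SMALL LETTER E WITH ACUTE, so looking up 'e' here yields the mapping table in which U+0301 is then searched for.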
*/ static uchar_t * find_composition_start(size_t uv, uchar_t *s, size_t sz) { uint16_t b1 = 0; uint16_t b2 = 0; uint16_t b3 = 0; uint16_t b3_tbl; uint16_t b3_base; uint16_t b4 = 0; size_t start_id; size_t end_id; if (sz == 1) { b4 = s[0]; } else if (sz == 2) { b3 = s[0]; b4 = s[1]; } else if (sz == 3) { b2 = s[0]; b3 = s[1]; b4 = s[2]; } else if (sz == 4) { b1 = s[0]; b2 = s[1]; b3 = s[2]; b4 = s[3]; } else { /* * This is a fallback and should not happen if the function * was called properly. */ return (NULL); } b1 = u8_composition_b1_tbl[uv][b1]; if (b1 == U8_TBL_ELEMENT_NOT_DEF) return (NULL); b2 = u8_composition_b2_tbl[uv][b1][b2]; if (b2 == U8_TBL_ELEMENT_NOT_DEF) return (NULL); b3_tbl = u8_composition_b3_tbl[uv][b2][b3].tbl_id; if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF) return (NULL); if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) { b3_tbl -= U8_16BIT_TABLE_INDICATOR; start_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4]; end_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4 + 1]; } else { start_id = u8_composition_b4_tbl[uv][b3_tbl][b4]; end_id = u8_composition_b4_tbl[uv][b3_tbl][b4 + 1]; } if (start_id >= end_id) return (NULL); b3_base = u8_composition_b3_tbl[uv][b2][b3].base; return ((uchar_t *)&(u8_composition_final_tbl[uv][b3_base + start_id])); } /* * The blocked() function checks on the combining class values of previous * characters in this sequence and return whether it is blocked or not. */ static boolean_t blocked(uchar_t *comb_class, size_t last) { uchar_t my_comb_class; size_t i; my_comb_class = comb_class[last]; for (i = 1; i < last; i++) if (comb_class[i] >= my_comb_class || comb_class[i] == U8_COMBINING_CLASS_STARTER) return (B_TRUE); return (B_FALSE); } /* * The do_composition() reads the character string pointed by 's' and * do necessary canonical composition and then copy over the result back to * the 's'. * * The input argument 's' cannot contain more than 32 characters. */ static size_t do_composition(size_t uv, uchar_t *s, uchar_t *comb_class, uchar_t *start, - uchar_t *disp, size_t last, uchar_t **os, uchar_t *oslast) + uchar_t *disp, size_t last, uchar_t **os, uchar_t *oslast) { uchar_t t[U8_STREAM_SAFE_TEXT_MAX + 1]; uchar_t tc[U8_MB_CUR_MAX] = { '\0' }; uint8_t saved_marks[U8_MAX_CHARS_A_SEQ]; size_t saved_marks_count; uchar_t *p; uchar_t *saved_p; uchar_t *q; size_t i; size_t saved_i; size_t j; size_t k; size_t l; size_t C; size_t saved_l; size_t size; uint32_t u1; uint32_t u2; boolean_t match_not_found = B_TRUE; /* * This should never happen unless the callers are doing some strange * and unexpected things. * * The "last" is the index pointing to the last character not last + 1. */ if (last >= U8_MAX_CHARS_A_SEQ) last = U8_UPPER_LIMIT_IN_A_SEQ; for (i = l = 0; i <= last; i++) { /* * The last or any non-Starters at the beginning, we don't * have any chance to do composition and so we just copy them * to the temporary buffer. */ if (i >= last || comb_class[i] != U8_COMBINING_CLASS_STARTER) { SAVE_THE_CHAR: p = s + start[i]; size = disp[i]; for (k = 0; k < size; k++) t[l++] = *p++; continue; } /* * If this could be a start of Hangul Jamos, then, we try to * conjoin them. 
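* As a worked example of the arithmetic below: L = U+1100, V = U+1161, and T = U+11A8 conjoin to 0xAC00 + (0 * 21 + 0) * 28 + 1 = U+AC01 HANGUL SYLLABLE GAG.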
*/ if (s[start[i]] == U8_HANGUL_JAMO_1ST_BYTE) { U8_PUT_3BYTES_INTO_UTF32(u1, s[start[i]], s[start[i] + 1], s[start[i] + 2]); U8_PUT_3BYTES_INTO_UTF32(u2, s[start[i] + 3], s[start[i] + 4], s[start[i] + 5]); if (U8_HANGUL_JAMO_L(u1) && U8_HANGUL_JAMO_V(u2)) { u1 -= U8_HANGUL_JAMO_L_FIRST; u2 -= U8_HANGUL_JAMO_V_FIRST; u1 = U8_HANGUL_SYL_FIRST + (u1 * U8_HANGUL_V_COUNT + u2) * U8_HANGUL_T_COUNT; i += 2; if (i <= last) { U8_PUT_3BYTES_INTO_UTF32(u2, s[start[i]], s[start[i] + 1], s[start[i] + 2]); if (U8_HANGUL_JAMO_T(u2)) { u1 += u2 - U8_HANGUL_JAMO_T_FIRST; i++; } } U8_SAVE_HANGUL_AS_UTF8(t + l, 0, 1, 2, u1); i--; l += 3; continue; } } /* * Let's then find out if this Starter has composition * mapping. */ p = find_composition_start(uv, s + start[i], disp[i]); if (p == NULL) goto SAVE_THE_CHAR; /* * We have a Starter with composition mapping and the next * character is a non-Starter. Let's try to find out if * we can do composition. */ saved_p = p; saved_i = i; saved_l = l; saved_marks_count = 0; TRY_THE_NEXT_MARK: q = s + start[++i]; size = disp[i]; /* * The next for() loop compares the non-Starter pointed by * 'q' with the possible (joinable) characters pointed by 'p'. * * The composition final table entry pointed by the 'p' * looks like the following: * * +---+---+---+-...-+---+---+---+---+-...-+---+---+ * | C | b0| b2| ... | bn| F | B0| B1| ... | Bm| F | * +---+---+---+-...-+---+---+---+---+-...-+---+---+ * * where C is the count byte indicating the number of * mapping pairs where each pair would be look like * (b0-bn F, B0-Bm F). The b0-bn are the bytes of the second * character of a canonical decomposition and the B0-Bm are * the bytes of a matching composite character. The F is * a filler byte after each character as the separator. */ match_not_found = B_TRUE; for (C = *p++; C > 0; C--) { for (k = 0; k < size; p++, k++) if (*p != q[k]) break; /* Have we found it? */ if (k >= size && *p == U8_TBL_ELEMENT_FILLER) { match_not_found = B_FALSE; l = saved_l; while (*++p != U8_TBL_ELEMENT_FILLER) t[l++] = *p; break; } /* We didn't find; skip to the next pair. */ if (*p != U8_TBL_ELEMENT_FILLER) while (*++p != U8_TBL_ELEMENT_FILLER) ; while (*++p != U8_TBL_ELEMENT_FILLER) ; p++; } /* * If there was no match, we will need to save the combining * mark for later appending. After that, if the next one * is a non-Starter and not blocked, then, we try once * again to do composition with the next non-Starter. * * If there was no match and this was a Starter, then, * this is a new start. * * If there was a match and a composition done and we have * more to check on, then, we retrieve a new composition final * table entry for the composite and then try to do the * composition again. */ if (match_not_found) { if (comb_class[i] == U8_COMBINING_CLASS_STARTER) { i--; goto SAVE_THE_CHAR; } saved_marks[saved_marks_count++] = i; } if (saved_l == l) { while (i < last) { if (blocked(comb_class, i + 1)) saved_marks[saved_marks_count++] = ++i; else break; } if (i < last) { p = saved_p; goto TRY_THE_NEXT_MARK; } } else if (i < last) { p = find_composition_start(uv, t + saved_l, l - saved_l); if (p != NULL) { saved_p = p; goto TRY_THE_NEXT_MARK; } } /* * There is no more composition possible. * * If there was no composition what so ever then we copy * over the original Starter and then append any non-Starters * remaining at the target string sequentially after that. 
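* (For instance, 'c' + U+0327 COMBINING CEDILLA + U+0301 COMBINING ACUTE ACCENT first composes to U+00E7, whose own mapping is then retried with the acute to yield U+1E09 LATIN SMALL LETTER C WITH CEDILLA AND ACUTE.)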
*/ if (saved_l == l) { p = s + start[saved_i]; size = disp[saved_i]; for (j = 0; j < size; j++) t[l++] = *p++; } for (k = 0; k < saved_marks_count; k++) { p = s + start[saved_marks[k]]; size = disp[saved_marks[k]]; for (j = 0; j < size; j++) t[l++] = *p++; } } /* * If the last character is a Starter and if we have a character * (possibly another Starter) that can be turned into a composite, * we do so and we do so until there is no more of composition * possible. */ if (comb_class[last] == U8_COMBINING_CLASS_STARTER) { p = *os; saved_l = l - disp[last]; while (p < oslast) { size = u8_number_of_bytes[*p]; if (size <= 1 || (p + size) > oslast) break; saved_p = p; for (i = 0; i < size; i++) tc[i] = *p++; q = find_composition_start(uv, t + saved_l, l - saved_l); if (q == NULL) { p = saved_p; break; } match_not_found = B_TRUE; for (C = *q++; C > 0; C--) { for (k = 0; k < size; q++, k++) if (*q != tc[k]) break; if (k >= size && *q == U8_TBL_ELEMENT_FILLER) { match_not_found = B_FALSE; l = saved_l; while (*++q != U8_TBL_ELEMENT_FILLER) { /* * This is practically * impossible but we don't * want to take any chances. */ if (l >= U8_STREAM_SAFE_TEXT_MAX) { p = saved_p; goto SAFE_RETURN; } t[l++] = *q; } break; } if (*q != U8_TBL_ELEMENT_FILLER) while (*++q != U8_TBL_ELEMENT_FILLER) ; while (*++q != U8_TBL_ELEMENT_FILLER) ; q++; } if (match_not_found) { p = saved_p; break; } } SAFE_RETURN: *os = p; } /* * Now we copy over the temporary string to the target string. * Since composition always reduces the number of characters or * the number of characters stay, we don't need to worry about * the buffer overflow here. */ for (i = 0; i < l; i++) s[i] = t[i]; s[l] = '\0'; return (l); } /* * The collect_a_seq() function checks on the given string s, collect * a sequence of characters at u8s, and return the sequence. While it collects * a sequence, it also applies case conversion, canonical or compatibility * decomposition, canonical decomposition, or some or all of them and * in that order. * * The collected sequence cannot be bigger than 32 characters since if * it is having more than 31 characters, the sequence will be terminated * with a U+034F COMBINING GRAPHEME JOINER (CGJ) character and turned into * a Stream-Safe Text. The collected sequence is always terminated with * a null byte and the return value is the byte length of the sequence * including 0. The return value does not include the terminating * null byte. */ static size_t collect_a_seq(size_t uv, uchar_t *u8s, uchar_t **source, uchar_t *slast, - boolean_t is_it_toupper, - boolean_t is_it_tolower, - boolean_t canonical_decomposition, - boolean_t compatibility_decomposition, - boolean_t canonical_composition, - int *errnum, u8_normalization_states_t *state) + boolean_t is_it_toupper, boolean_t is_it_tolower, + boolean_t canonical_decomposition, boolean_t compatibility_decomposition, + boolean_t canonical_composition, + int *errnum, u8_normalization_states_t *state) { uchar_t *s; int sz; int saved_sz; size_t i; size_t j; size_t k; size_t l; uchar_t comb_class[U8_MAX_CHARS_A_SEQ]; uchar_t disp[U8_MAX_CHARS_A_SEQ]; uchar_t start[U8_MAX_CHARS_A_SEQ]; uchar_t u8t[U8_MB_CUR_MAX] = { '\0' }; uchar_t uts[U8_STREAM_SAFE_TEXT_MAX + 1]; uchar_t tc; size_t last; size_t saved_last; uint32_t u1; /* * Save the source string pointer which we will return a changed * pointer if we do processing. */ s = *source; /* * The following is a fallback for just in case callers are not * checking the string boundaries before the calling. 
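* (In that case nothing is collected: u8s becomes an empty string and the function returns 0.)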
*/ if (s >= slast) { u8s[0] = '\0'; return (0); } /* * As the first thing, let's collect a character and do case * conversion if necessary. */ sz = u8_number_of_bytes[*s]; if (sz < 0) { *errnum = EILSEQ; u8s[0] = *s++; u8s[1] = '\0'; *source = s; return (1); } if (sz == 1) { if (is_it_toupper) u8s[0] = U8_ASCII_TOUPPER(*s); else if (is_it_tolower) u8s[0] = U8_ASCII_TOLOWER(*s); else u8s[0] = *s; s++; u8s[1] = '\0'; } else if ((s + sz) > slast) { *errnum = EINVAL; for (i = 0; s < slast; ) u8s[i++] = *s++; u8s[i] = '\0'; *source = s; return (i); } else { if (is_it_toupper || is_it_tolower) { i = do_case_conv(uv, u8s, s, sz, is_it_toupper); s += sz; sz = i; } else { for (i = 0; i < sz; ) u8s[i++] = *s++; u8s[i] = '\0'; } } /* * And then canonical/compatibility decomposition followed by * an optional canonical composition. Please be noted that * canonical composition is done only when a decomposition is * done. */ if (canonical_decomposition || compatibility_decomposition) { if (sz == 1) { *state = U8_STATE_START; saved_sz = 1; comb_class[0] = 0; start[0] = 0; disp[0] = 1; last = 1; } else { saved_sz = do_decomp(uv, u8s, u8s, sz, canonical_decomposition, state); last = 0; for (i = 0; i < saved_sz; ) { sz = u8_number_of_bytes[u8s[i]]; comb_class[last] = combining_class(uv, u8s + i, sz); start[last] = i; disp[last] = sz; last++; i += sz; } /* * Decomposition yields various Hangul related * states but not on combining marks. We need to * find out at here by checking on the last * character. */ if (*state == U8_STATE_START) { if (comb_class[last - 1]) *state = U8_STATE_COMBINING_MARK; } } saved_last = last; while (s < slast) { sz = u8_number_of_bytes[*s]; /* * If this is an illegal character, an incomplete * character, or an 7-bit ASCII Starter character, * then we have collected a sequence; break and let * the next call deal with the two cases. * * Note that this is okay only if you are using this * function with a fixed length string, not on * a buffer with multiple calls of one chunk at a time. */ if (sz <= 1) { break; } else if ((s + sz) > slast) { break; } else { /* * If the previous character was a Hangul Jamo * and this character is a Hangul Jamo that * can be conjoined, we collect the Jamo. */ if (*s == U8_HANGUL_JAMO_1ST_BYTE) { U8_PUT_3BYTES_INTO_UTF32(u1, *s, *(s + 1), *(s + 2)); if (U8_HANGUL_COMPOSABLE_L_V(*state, u1)) { i = 0; *state = U8_STATE_HANGUL_LV; goto COLLECT_A_HANGUL; } if (U8_HANGUL_COMPOSABLE_LV_T(*state, u1)) { i = 0; *state = U8_STATE_HANGUL_LVT; goto COLLECT_A_HANGUL; } } /* * Regardless of whatever it was, if this is * a Starter, we don't collect the character * since that's a new start and we will deal * with it at the next time. */ i = combining_class(uv, s, sz); if (i == U8_COMBINING_CLASS_STARTER) break; /* * We know the current character is a combining * mark. If the previous character wasn't * a Starter (not Hangul) or a combining mark, * then, we don't collect this combining mark. */ if (*state != U8_STATE_START && *state != U8_STATE_COMBINING_MARK) break; *state = U8_STATE_COMBINING_MARK; COLLECT_A_HANGUL: /* * If we collected a Starter and combining * marks up to 30, i.e., total 31 characters, * then, we terminate this degenerately long * combining sequence with a U+034F COMBINING * GRAPHEME JOINER (CGJ) which is 0xCD 0x8F in * UTF-8 and turn this into a Stream-Safe * Text. This will be extremely rare but * possible. * * The following will also guarantee that * we are not writing more than 32 characters * plus a NULL at u8s[]. 
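* (This mirrors the Stream-Safe Text Format of Unicode UAX #15, which caps an over-long combining sequence by inserting a CGJ.)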
*/ if (last >= U8_UPPER_LIMIT_IN_A_SEQ) { TURN_STREAM_SAFE: *state = U8_STATE_START; comb_class[last] = 0; start[last] = saved_sz; disp[last] = 2; last++; u8s[saved_sz++] = 0xCD; u8s[saved_sz++] = 0x8F; break; } /* * Some combining marks themselves decompose into * another combining mark or marks. */ if (*state == U8_STATE_COMBINING_MARK) { k = last; l = sz; i = do_decomp(uv, uts, s, sz, canonical_decomposition, state); for (j = 0; j < i; ) { sz = u8_number_of_bytes[uts[j]]; comb_class[last] = combining_class(uv, uts + j, sz); start[last] = saved_sz + j; disp[last] = sz; last++; if (last >= U8_UPPER_LIMIT_IN_A_SEQ) { last = k; goto TURN_STREAM_SAFE; } j += sz; } *state = U8_STATE_COMBINING_MARK; sz = i; s += l; for (i = 0; i < sz; i++) u8s[saved_sz++] = uts[i]; } else { comb_class[last] = i; start[last] = saved_sz; disp[last] = sz; last++; for (i = 0; i < sz; i++) u8s[saved_sz++] = *s++; } /* * If this is U+0345 COMBINING GREEK * YPOGEGRAMMENI (0xCD 0x85 in UTF-8), a.k.a. * iota subscript, and needs to be converted to * an uppercase letter, convert it to U+0399 GREEK * CAPITAL LETTER IOTA (0xCE 0x99 in UTF-8), * i.e., convert to the capital adscript form as * specified in the Unicode standard. * * This is the only special case of (ambiguous) * case conversion at combining marks, and the * standard will probably never have anything * similar in the future. */ if (is_it_toupper && sz >= 2 && u8s[saved_sz - 2] == 0xCD && u8s[saved_sz - 1] == 0x85) { u8s[saved_sz - 2] = 0xCE; u8s[saved_sz - 1] = 0x99; } } } /* * Let's try to ensure a canonical ordering for the collected * combining marks. We do this only if we have collected * at least one more non-Starter. (The decomposition mapping * data tables have fully (and recursively) expanded and * canonically ordered decompositions.) * * The U8_SWAP_COMB_MARKS() convenience macro makes some * assumptions, and we satisfy them here. */ last--; if (last >= saved_last) { for (i = 0; i < last; i++) for (j = last; j > i; j--) if (comb_class[j] && comb_class[j - 1] > comb_class[j]) { U8_SWAP_COMB_MARKS(j - 1, j); } } *source = s; if (! canonical_composition) { u8s[saved_sz] = '\0'; return (saved_sz); } /* * Now do the canonical composition. Note that we do this * only after a canonical or compatibility decomposition to * finish up NFC or NFKC. */ sz = do_composition(uv, u8s, comb_class, start, disp, last, &s, slast); } *source = s; return ((size_t)sz); } /* * The do_norm_compare() function does string comparison based on Unicode * simple case mappings and Unicode Normalization definitions. * * It does so by collecting a sequence of characters at a time and comparing * the collected sequences from the two strings. * * The meanings of the return values are the same as for the usual strcmp().
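* For example, when the NFC flags are in effect, the two-character input 'e' + U+0301 compares equal to the single precomposed character U+00E9 and 0 is returned.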
*/ static int do_norm_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1, size_t n2, - int flag, int *errnum) + int flag, int *errnum) { int result; size_t sz1; size_t sz2; uchar_t u8s1[U8_STREAM_SAFE_TEXT_MAX + 1]; uchar_t u8s2[U8_STREAM_SAFE_TEXT_MAX + 1]; uchar_t *s1last; uchar_t *s2last; boolean_t is_it_toupper; boolean_t is_it_tolower; boolean_t canonical_decomposition; boolean_t compatibility_decomposition; boolean_t canonical_composition; u8_normalization_states_t state; s1last = s1 + n1; s2last = s2 + n2; is_it_toupper = flag & U8_TEXTPREP_TOUPPER; is_it_tolower = flag & U8_TEXTPREP_TOLOWER; canonical_decomposition = flag & U8_CANON_DECOMP; compatibility_decomposition = flag & U8_COMPAT_DECOMP; canonical_composition = flag & U8_CANON_COMP; while (s1 < s1last && s2 < s2last) { /* * If the current character is a 7-bit ASCII and the last * character, or, if the current character and the next * character are both some 7-bit ASCII characters then * we treat the current character as a sequence. * * In any other cases, we need to call collect_a_seq(). */ if (U8_ISASCII(*s1) && ((s1 + 1) >= s1last || ((s1 + 1) < s1last && U8_ISASCII(*(s1 + 1))))) { if (is_it_toupper) u8s1[0] = U8_ASCII_TOUPPER(*s1); else if (is_it_tolower) u8s1[0] = U8_ASCII_TOLOWER(*s1); else u8s1[0] = *s1; u8s1[1] = '\0'; sz1 = 1; s1++; } else { state = U8_STATE_START; sz1 = collect_a_seq(uv, u8s1, &s1, s1last, is_it_toupper, is_it_tolower, canonical_decomposition, compatibility_decomposition, canonical_composition, errnum, &state); } if (U8_ISASCII(*s2) && ((s2 + 1) >= s2last || ((s2 + 1) < s2last && U8_ISASCII(*(s2 + 1))))) { if (is_it_toupper) u8s2[0] = U8_ASCII_TOUPPER(*s2); else if (is_it_tolower) u8s2[0] = U8_ASCII_TOLOWER(*s2); else u8s2[0] = *s2; u8s2[1] = '\0'; sz2 = 1; s2++; } else { state = U8_STATE_START; sz2 = collect_a_seq(uv, u8s2, &s2, s2last, is_it_toupper, is_it_tolower, canonical_decomposition, compatibility_decomposition, canonical_composition, errnum, &state); } /* * Now compare the two characters. If they are the same, * we move on to the next character sequences. */ if (sz1 == 1 && sz2 == 1) { if (*u8s1 > *u8s2) return (1); if (*u8s1 < *u8s2) return (-1); } else { result = strcmp((const char *)u8s1, (const char *)u8s2); if (result != 0) return (result); } } /* * We compared until the end of either or both strings. * * If we reached to or went over the ends for the both, that means * they are the same. * * If we reached only one end, that means the other string has * something which then can be used to determine the return value. */ if (s1 >= s1last) { if (s2 >= s2last) return (0); return (-1); } return (1); } /* * The u8_strcmp() function compares two UTF-8 strings quite similar to * the strcmp(). For the comparison, however, Unicode Normalization specific * equivalency and Unicode simple case conversion mappings based equivalency * can be requested and checked against. */ int u8_strcmp(const char *s1, const char *s2, size_t n, int flag, size_t uv, - int *errnum) + int *errnum) { int f; size_t n1; size_t n2; *errnum = 0; /* * Check on the requested Unicode version, case conversion, and * normalization flag values. 
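* Exactly one of U8_STRCMP_CS, U8_STRCMP_CI_UPPER, and U8_STRCMP_CI_LOWER may be in effect, and any normalization bits must together form one of NFD, NFC, NFKD, or NFKC; anything else sets EBADF and falls back to a case-sensitive comparison.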
*/ if (uv > U8_UNICODE_LATEST) { *errnum = ERANGE; uv = U8_UNICODE_LATEST; } if (flag == 0) { flag = U8_STRCMP_CS; } else { f = flag & (U8_STRCMP_CS | U8_STRCMP_CI_UPPER | U8_STRCMP_CI_LOWER); if (f == 0) { flag |= U8_STRCMP_CS; } else if (f != U8_STRCMP_CS && f != U8_STRCMP_CI_UPPER && f != U8_STRCMP_CI_LOWER) { *errnum = EBADF; flag = U8_STRCMP_CS; } f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP); if (f && f != U8_STRCMP_NFD && f != U8_STRCMP_NFC && f != U8_STRCMP_NFKD && f != U8_STRCMP_NFKC) { *errnum = EBADF; flag = U8_STRCMP_CS; } } if (flag == U8_STRCMP_CS) { return (n == 0 ? strcmp(s1, s2) : strncmp(s1, s2, n)); } n1 = strlen(s1); n2 = strlen(s2); if (n != 0) { if (n < n1) n1 = n; if (n < n2) n2 = n; } /* * Simple case conversion can be done much faster and so we do * them separately here. */ if (flag == U8_STRCMP_CI_UPPER) { return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2, n1, n2, B_TRUE, errnum)); } else if (flag == U8_STRCMP_CI_LOWER) { return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2, n1, n2, B_FALSE, errnum)); } return (do_norm_compare(uv, (uchar_t *)s1, (uchar_t *)s2, n1, n2, flag, errnum)); } size_t u8_textprep_str(char *inarray, size_t *inlen, char *outarray, size_t *outlen, - int flag, size_t unicode_version, int *errnum) + int flag, size_t unicode_version, int *errnum) { int f; int sz; uchar_t *ib; uchar_t *ibtail; uchar_t *ob; uchar_t *obtail; boolean_t do_not_ignore_null; boolean_t do_not_ignore_invalid; boolean_t is_it_toupper; boolean_t is_it_tolower; boolean_t canonical_decomposition; boolean_t compatibility_decomposition; boolean_t canonical_composition; size_t ret_val; size_t i; size_t j; uchar_t u8s[U8_STREAM_SAFE_TEXT_MAX + 1]; u8_normalization_states_t state; if (unicode_version > U8_UNICODE_LATEST) { *errnum = ERANGE; return ((size_t)-1); } f = flag & (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER); if (f == (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER)) { *errnum = EBADF; return ((size_t)-1); } f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP); if (f && f != U8_TEXTPREP_NFD && f != U8_TEXTPREP_NFC && f != U8_TEXTPREP_NFKD && f != U8_TEXTPREP_NFKC) { *errnum = EBADF; return ((size_t)-1); } if (inarray == NULL || *inlen == 0) return (0); if (outarray == NULL) { *errnum = E2BIG; return ((size_t)-1); } ib = (uchar_t *)inarray; ob = (uchar_t *)outarray; ibtail = ib + *inlen; obtail = ob + *outlen; do_not_ignore_null = !(flag & U8_TEXTPREP_IGNORE_NULL); do_not_ignore_invalid = !(flag & U8_TEXTPREP_IGNORE_INVALID); is_it_toupper = flag & U8_TEXTPREP_TOUPPER; is_it_tolower = flag & U8_TEXTPREP_TOLOWER; ret_val = 0; /* * If we don't have a normalization flag set, we do the simple case * conversion based text preparation separately below. Text * preparation involving Normalization will be done in the false task * block, again, separately since it will take much more time and * resource than doing simple case conversions. 
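* (f holds the normalization bits extracted from the flag above, so f == 0 selects the fast, case-conversion-only loop.)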
*/ if (f == 0) { while (ib < ibtail) { if (*ib == '\0' && do_not_ignore_null) break; sz = u8_number_of_bytes[*ib]; if (sz < 0) { if (do_not_ignore_invalid) { *errnum = EILSEQ; ret_val = (size_t)-1; break; } sz = 1; ret_val++; } if (sz == 1) { if (ob >= obtail) { *errnum = E2BIG; ret_val = (size_t)-1; break; } if (is_it_toupper) *ob = U8_ASCII_TOUPPER(*ib); else if (is_it_tolower) *ob = U8_ASCII_TOLOWER(*ib); else *ob = *ib; ib++; ob++; } else if ((ib + sz) > ibtail) { if (do_not_ignore_invalid) { *errnum = EINVAL; ret_val = (size_t)-1; break; } if ((obtail - ob) < (ibtail - ib)) { *errnum = E2BIG; ret_val = (size_t)-1; break; } /* * We treat the remaining incomplete character * bytes as a character. */ ret_val++; while (ib < ibtail) *ob++ = *ib++; } else { if (is_it_toupper || is_it_tolower) { i = do_case_conv(unicode_version, u8s, ib, sz, is_it_toupper); if ((obtail - ob) < i) { *errnum = E2BIG; ret_val = (size_t)-1; break; } ib += sz; for (sz = 0; sz < i; sz++) *ob++ = u8s[sz]; } else { if ((obtail - ob) < sz) { *errnum = E2BIG; ret_val = (size_t)-1; break; } for (i = 0; i < sz; i++) *ob++ = *ib++; } } } } else { canonical_decomposition = flag & U8_CANON_DECOMP; compatibility_decomposition = flag & U8_COMPAT_DECOMP; canonical_composition = flag & U8_CANON_COMP; while (ib < ibtail) { if (*ib == '\0' && do_not_ignore_null) break; /* * If the current character is a 7-bit ASCII * character and it is the last character, or, * if the current character is a 7-bit ASCII * character and the next character is also a 7-bit * ASCII character, then, we copy over this * character without going through collect_a_seq(). * * In any other cases, we need to look further with * the collect_a_seq() function. */ if (U8_ISASCII(*ib) && ((ib + 1) >= ibtail || ((ib + 1) < ibtail && U8_ISASCII(*(ib + 1))))) { if (ob >= obtail) { *errnum = E2BIG; ret_val = (size_t)-1; break; } if (is_it_toupper) *ob = U8_ASCII_TOUPPER(*ib); else if (is_it_tolower) *ob = U8_ASCII_TOLOWER(*ib); else *ob = *ib; ib++; ob++; } else { *errnum = 0; state = U8_STATE_START; j = collect_a_seq(unicode_version, u8s, &ib, ibtail, is_it_toupper, is_it_tolower, canonical_decomposition, compatibility_decomposition, canonical_composition, errnum, &state); if (*errnum && do_not_ignore_invalid) { ret_val = (size_t)-1; break; } if ((obtail - ob) < j) { *errnum = E2BIG; ret_val = (size_t)-1; break; } for (i = 0; i < j; i++) *ob++ = u8s[i]; } } } *inlen = ibtail - ib; *outlen = obtail - ob; return (ret_val); } #if defined(_KERNEL) && defined(HAVE_SPL) static int __init -unicode_init(void) { +unicode_init(void) +{ return (0); } static void __exit unicode_fini(void) { } module_init(unicode_init); module_exit(unicode_fini); MODULE_DESCRIPTION("Unicode implementation"); MODULE_AUTHOR(ZFS_META_AUTHOR); MODULE_LICENSE(ZFS_META_LICENSE); MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE); EXPORT_SYMBOL(u8_validate); EXPORT_SYMBOL(u8_strcmp); EXPORT_SYMBOL(u8_textprep_str); #endif diff --git a/module/zcommon/zfs_fletcher_sse.c b/module/zcommon/zfs_fletcher_sse.c index df8f80ee4de7..90b7d7d4ef72 100644 --- a/module/zcommon/zfs_fletcher_sse.c +++ b/module/zcommon/zfs_fletcher_sse.c @@ -1,229 +1,231 @@ /* * Implement fast Fletcher4 with SSE2,SSSE3 instructions. (x86) * * Use the 128-bit SSE2/SSSE3 SIMD instructions and registers to compute * Fletcher4 in two incremental 64-bit parallel accumulator streams, * and then combine the streams to form the final four checksum words. 
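* Each 128-bit register holds two 64-bit lanes: one lane accumulates the even-indexed 32-bit words of the input and the other the odd-indexed words, which the final combining step must account for.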
* This implementation is a derivative of the AVX SIMD implementation by * James Guilford and Jinshan Xiong from Intel (see zfs_fletcher_intel.c). * * Copyright (C) 2016 Tyler J. Stachecki. * * Authors: * Tyler J. Stachecki * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #if defined(HAVE_SSE2) #include #include #include #include #include static void -fletcher_4_sse2_init(fletcher_4_ctx_t *ctx) { +fletcher_4_sse2_init(fletcher_4_ctx_t *ctx) +{ bzero(ctx->sse, 4 * sizeof (zfs_fletcher_sse_t)); } static void -fletcher_4_sse2_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp) { +fletcher_4_sse2_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp) +{ uint64_t A, B, C, D; /* * The mixing matrix for checksum calculation is: * a = a0 + a1 * b = 2b0 + 2b1 - a1 * c = 4c0 - b0 + 4c1 -3b1 * d = 8d0 - 4c0 + 8d1 - 8c1 + b1; * * c and d are multiplied by 4 and 8, respectively, * before spilling the vectors out to memory. 
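* (These coefficients come from re-expressing the serial Fletcher-4 recurrences in terms of the two interleaved word streams; a four-stream analogue of the same matrix appears in zfs_fletcher_intel.c.)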
*/ A = ctx->sse[0].v[0] + ctx->sse[0].v[1]; B = 2 * ctx->sse[1].v[0] + 2 * ctx->sse[1].v[1] - ctx->sse[0].v[1]; C = 4 * ctx->sse[2].v[0] - ctx->sse[1].v[0] + 4 * ctx->sse[2].v[1] - 3 * ctx->sse[1].v[1]; D = 8 * ctx->sse[3].v[0] - 4 * ctx->sse[2].v[0] + 8 * ctx->sse[3].v[1] - 8 * ctx->sse[2].v[1] + ctx->sse[1].v[1]; ZIO_SET_CHECKSUM(zcp, A, B, C, D); } #define FLETCHER_4_SSE_RESTORE_CTX(ctx) \ { \ asm volatile("movdqu %0, %%xmm0" :: "m" ((ctx)->sse[0])); \ asm volatile("movdqu %0, %%xmm1" :: "m" ((ctx)->sse[1])); \ asm volatile("movdqu %0, %%xmm2" :: "m" ((ctx)->sse[2])); \ asm volatile("movdqu %0, %%xmm3" :: "m" ((ctx)->sse[3])); \ } #define FLETCHER_4_SSE_SAVE_CTX(ctx) \ { \ asm volatile("movdqu %%xmm0, %0" : "=m" ((ctx)->sse[0])); \ asm volatile("movdqu %%xmm1, %0" : "=m" ((ctx)->sse[1])); \ asm volatile("movdqu %%xmm2, %0" : "=m" ((ctx)->sse[2])); \ asm volatile("movdqu %%xmm3, %0" : "=m" ((ctx)->sse[3])); \ } static void fletcher_4_sse2_native(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size) { const uint64_t *ip = buf; const uint64_t *ipend = (uint64_t *)((uint8_t *)ip + size); kfpu_begin(); FLETCHER_4_SSE_RESTORE_CTX(ctx); asm volatile("pxor %xmm4, %xmm4"); for (; ip < ipend; ip += 2) { asm volatile("movdqu %0, %%xmm5" :: "m"(*ip)); asm volatile("movdqa %xmm5, %xmm6"); asm volatile("punpckldq %xmm4, %xmm5"); asm volatile("punpckhdq %xmm4, %xmm6"); asm volatile("paddq %xmm5, %xmm0"); asm volatile("paddq %xmm0, %xmm1"); asm volatile("paddq %xmm1, %xmm2"); asm volatile("paddq %xmm2, %xmm3"); asm volatile("paddq %xmm6, %xmm0"); asm volatile("paddq %xmm0, %xmm1"); asm volatile("paddq %xmm1, %xmm2"); asm volatile("paddq %xmm2, %xmm3"); } FLETCHER_4_SSE_SAVE_CTX(ctx); kfpu_end(); } static void fletcher_4_sse2_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size) { const uint32_t *ip = buf; const uint32_t *ipend = (uint32_t *)((uint8_t *)ip + size); kfpu_begin(); FLETCHER_4_SSE_RESTORE_CTX(ctx); for (; ip < ipend; ip += 2) { uint32_t scratch1 = BSWAP_32(ip[0]); uint32_t scratch2 = BSWAP_32(ip[1]); asm volatile("movd %0, %%xmm5" :: "r"(scratch1)); asm volatile("movd %0, %%xmm6" :: "r"(scratch2)); asm volatile("punpcklqdq %xmm6, %xmm5"); asm volatile("paddq %xmm5, %xmm0"); asm volatile("paddq %xmm0, %xmm1"); asm volatile("paddq %xmm1, %xmm2"); asm volatile("paddq %xmm2, %xmm3"); } FLETCHER_4_SSE_SAVE_CTX(ctx); kfpu_end(); } static boolean_t fletcher_4_sse2_valid(void) { return (zfs_sse2_available()); } const fletcher_4_ops_t fletcher_4_sse2_ops = { .init_native = fletcher_4_sse2_init, .fini_native = fletcher_4_sse2_fini, .compute_native = fletcher_4_sse2_native, .init_byteswap = fletcher_4_sse2_init, .fini_byteswap = fletcher_4_sse2_fini, .compute_byteswap = fletcher_4_sse2_byteswap, .valid = fletcher_4_sse2_valid, .name = "sse2" }; #endif /* defined(HAVE_SSE2) */ #if defined(HAVE_SSE2) && defined(HAVE_SSSE3) static void fletcher_4_ssse3_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size) { static const zfs_fletcher_sse_t mask = { .v = { 0x0405060700010203, 0x0C0D0E0F08090A0B } }; const uint64_t *ip = buf; const uint64_t *ipend = (uint64_t *)((uint8_t *)ip + size); kfpu_begin(); FLETCHER_4_SSE_RESTORE_CTX(ctx); asm volatile("movdqu %0, %%xmm7"::"m" (mask)); asm volatile("pxor %xmm4, %xmm4"); for (; ip < ipend; ip += 2) { asm volatile("movdqu %0, %%xmm5"::"m" (*ip)); asm volatile("pshufb %xmm7, %xmm5"); asm volatile("movdqa %xmm5, %xmm6"); asm volatile("punpckldq %xmm4, %xmm5"); asm volatile("punpckhdq %xmm4, %xmm6"); asm volatile("paddq %xmm5, %xmm0"); asm 
volatile("paddq %xmm0, %xmm1"); asm volatile("paddq %xmm1, %xmm2"); asm volatile("paddq %xmm2, %xmm3"); asm volatile("paddq %xmm6, %xmm0"); asm volatile("paddq %xmm0, %xmm1"); asm volatile("paddq %xmm1, %xmm2"); asm volatile("paddq %xmm2, %xmm3"); } FLETCHER_4_SSE_SAVE_CTX(ctx); kfpu_end(); } static boolean_t fletcher_4_ssse3_valid(void) { return (zfs_sse2_available() && zfs_ssse3_available()); } const fletcher_4_ops_t fletcher_4_ssse3_ops = { .init_native = fletcher_4_sse2_init, .fini_native = fletcher_4_sse2_fini, .compute_native = fletcher_4_sse2_native, .init_byteswap = fletcher_4_sse2_init, .fini_byteswap = fletcher_4_sse2_fini, .compute_byteswap = fletcher_4_ssse3_byteswap, .valid = fletcher_4_ssse3_valid, .name = "ssse3" }; #endif /* defined(HAVE_SSE2) && defined(HAVE_SSSE3) */ diff --git a/module/zfs/abd.c b/module/zfs/abd.c index dca70d6f2bca..6ff266bd6ef8 100644 --- a/module/zfs/abd.c +++ b/module/zfs/abd.c @@ -1,1544 +1,1545 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2014 by Chunwei Chen. All rights reserved. * Copyright (c) 2016 by Delphix. All rights reserved. */ /* * ARC buffer data (ABD). * * ABDs are an abstract data structure for the ARC which can use two * different ways of storing the underlying data: * * (a) Linear buffer. In this case, all the data in the ABD is stored in one * contiguous buffer in memory (from a zio_[data_]buf_* kmem cache). * * +-------------------+ * | ABD (linear) | * | abd_flags = ... | * | abd_size = ... | +--------------------------------+ * | abd_buf ------------->| raw buffer of size abd_size | * +-------------------+ +--------------------------------+ * no abd_chunks * * (b) Scattered buffer. In this case, the data in the ABD is split into * equal-sized chunks (from the abd_chunk_cache kmem_cache), with pointers * to the chunks recorded in an array at the end of the ABD structure. * * +-------------------+ * | ABD (scattered) | * | abd_flags = ... | * | abd_size = ... | * | abd_offset = 0 | +-----------+ * | abd_chunks[0] ----------------------------->| chunk 0 | * | abd_chunks[1] ---------------------+ +-----------+ * | ... | | +-----------+ * | abd_chunks[N-1] ---------+ +------->| chunk 1 | * +-------------------+ | +-----------+ * | ... * | +-----------+ * +----------------->| chunk N-1 | * +-----------+ * * Linear buffers act exactly like normal buffers and are always mapped into the * kernel's virtual memory space, while scattered ABD data chunks are allocated * as physical pages and then mapped in only while they are actually being * accessed through one of the abd_* library functions. 
Using scattered ABDs * provides several benefits: * * (1) They avoid use of kmem_*, preventing performance problems where running * kmem_reap on very large memory systems never finishes and causes * constant TLB shootdowns. * * (2) Fragmentation is less of an issue since when we are at the limit of * allocatable space, we won't have to search around for a long free * hole in the VA space for large ARC allocations. Each chunk is mapped in * individually, so even if we weren't using segkpm (see next point) we * wouldn't need to worry about finding a contiguous address range. * * (3) Use of segkpm will avoid the need for map / unmap / TLB shootdown costs * on each ABD access. (If segkpm isn't available then we use all linear * ABDs to avoid this penalty.) See seg_kpm.c for more details. * * It is possible to make all ABDs linear by setting zfs_abd_scatter_enabled to * B_FALSE. However, it is not possible to use scattered ABDs if segkpm is not * available, which is the case on all 32-bit systems and any 64-bit systems * where kpm_enable is turned off. * * In addition to directly allocating a linear or scattered ABD, it is also * possible to create an ABD by requesting the "sub-ABD" starting at an offset * within an existing ABD. In linear buffers this is simple (set abd_buf of * the new ABD to the starting point within the original raw buffer), but * scattered ABDs are a little more complex. The new ABD makes a copy of the * relevant abd_chunks pointers (but not the underlying data). However, to * provide arbitrary rather than only chunk-aligned starting offsets, it also * tracks an abd_offset field which represents the starting point of the data * within the first chunk in abd_chunks. For both linear and scattered ABDs, * creating an offset ABD marks the original ABD as the offset's parent, and the * original ABD's abd_children refcount is incremented. This data allows us to * ensure the root ABD isn't deleted before its children. * * Most consumers should never need to know what type of ABD they're using -- * the ABD public API ensures that it's possible to transparently switch from * using a linear ABD to a scattered one when doing so would be beneficial. * * If you need to use the data within an ABD directly, if you know it's linear * (because you allocated it) you can use abd_to_buf() to access the underlying * raw buffer. Otherwise, you should use one of the abd_borrow_buf* functions * which will allocate a raw buffer if necessary. Use the abd_return_buf* * functions to return any raw buffers that are no longer necessary when you're * done using them. * * There are a variety of ABD APIs that implement basic buffer operations: * compare, copy, read, write, and fill with zeroes. If you need a custom * function which progressively accesses the whole ABD, use the abd_iterate_* * functions. 
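* As a sketch of a typical borrow/return round trip (consume() stands in for a hypothetical consumer): buf = abd_borrow_buf_copy(abd, size); consume(buf, size); abd_return_buf(abd, buf, size);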
*/ #include #include #include #include #include #ifdef _KERNEL #include #include #else #define MAX_ORDER 1 #endif typedef struct abd_stats { kstat_named_t abdstat_struct_size; kstat_named_t abdstat_linear_cnt; kstat_named_t abdstat_linear_data_size; kstat_named_t abdstat_scatter_cnt; kstat_named_t abdstat_scatter_data_size; kstat_named_t abdstat_scatter_chunk_waste; kstat_named_t abdstat_scatter_orders[MAX_ORDER]; kstat_named_t abdstat_scatter_page_multi_chunk; kstat_named_t abdstat_scatter_page_multi_zone; kstat_named_t abdstat_scatter_page_alloc_retry; kstat_named_t abdstat_scatter_sg_table_retry; } abd_stats_t; static abd_stats_t abd_stats = { /* Amount of memory occupied by all of the abd_t struct allocations */ { "struct_size", KSTAT_DATA_UINT64 }, /* * The number of linear ABDs which are currently allocated, excluding * ABDs which don't own their data (for instance the ones which were * allocated through abd_get_offset() and abd_get_from_buf()). If an * ABD takes ownership of its buf then it will become tracked. */ { "linear_cnt", KSTAT_DATA_UINT64 }, /* Amount of data stored in all linear ABDs tracked by linear_cnt */ { "linear_data_size", KSTAT_DATA_UINT64 }, /* * The number of scatter ABDs which are currently allocated, excluding * ABDs which don't own their data (for instance the ones which were * allocated through abd_get_offset()). */ { "scatter_cnt", KSTAT_DATA_UINT64 }, /* Amount of data stored in all scatter ABDs tracked by scatter_cnt */ { "scatter_data_size", KSTAT_DATA_UINT64 }, /* * The amount of space wasted at the end of the last chunk across all * scatter ABDs tracked by scatter_cnt. */ { "scatter_chunk_waste", KSTAT_DATA_UINT64 }, /* * The number of compound allocations of a given order. These * allocations are spread over all currently allocated ABDs, and * act as a measure of memory fragmentation. */ { { "scatter_order_N", KSTAT_DATA_UINT64 } }, /* * The number of scatter ABDs which contain multiple chunks. * ABDs are preferentially allocated from the minimum number of * contiguous multi-page chunks, a single chunk is optimal. */ { "scatter_page_multi_chunk", KSTAT_DATA_UINT64 }, /* * The number of scatter ABDs which are split across memory zones. * ABDs are preferentially allocated using pages from a single zone. */ { "scatter_page_multi_zone", KSTAT_DATA_UINT64 }, /* * The total number of retries encountered when attempting to * allocate the pages to populate the scatter ABD. */ { "scatter_page_alloc_retry", KSTAT_DATA_UINT64 }, /* * The total number of retries encountered when attempting to * allocate the sg table for an ABD. 
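* (All of the counters above are published through the kstat framework as zfs/abdstats; see abd_init() below.)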
*/ { "scatter_sg_table_retry", KSTAT_DATA_UINT64 }, }; #define ABDSTAT(stat) (abd_stats.stat.value.ui64) #define ABDSTAT_INCR(stat, val) \ atomic_add_64(&abd_stats.stat.value.ui64, (val)) #define ABDSTAT_BUMP(stat) ABDSTAT_INCR(stat, 1) #define ABDSTAT_BUMPDOWN(stat) ABDSTAT_INCR(stat, -1) #define ABD_SCATTER(abd) (abd->abd_u.abd_scatter) #define ABD_BUF(abd) (abd->abd_u.abd_linear.abd_buf) #define abd_for_each_sg(abd, sg, n, i) \ for_each_sg(ABD_SCATTER(abd).abd_sgl, sg, n, i) /* see block comment above for description */ int zfs_abd_scatter_enabled = B_TRUE; unsigned zfs_abd_scatter_max_order = MAX_ORDER - 1; static kmem_cache_t *abd_cache = NULL; static kstat_t *abd_ksp; static inline size_t abd_chunkcnt_for_bytes(size_t size) { return (P2ROUNDUP(size, PAGESIZE) / PAGESIZE); } #ifdef _KERNEL #ifndef CONFIG_HIGHMEM #ifndef __GFP_RECLAIM #define __GFP_RECLAIM __GFP_WAIT #endif static unsigned long abd_alloc_chunk(int nid, gfp_t gfp, unsigned int order) { struct page *page; page = alloc_pages_node(nid, gfp, order); if (!page) return (0); return ((unsigned long) page_address(page)); } /* * The goal is to minimize fragmentation by preferentially populating ABDs * with higher order compound pages from a single zone. Allocation size is * progressively decreased until it can be satisfied without performing * reclaim or compaction. When necessary this function will degenerate to * allocating individual pages and allowing reclaim to satisfy allocations. */ static void abd_alloc_pages(abd_t *abd, size_t size) { struct list_head pages; struct sg_table table; struct scatterlist *sg; struct page *page, *tmp_page; gfp_t gfp = __GFP_NOWARN | GFP_NOIO; gfp_t gfp_comp = (gfp | __GFP_NORETRY | __GFP_COMP) & ~__GFP_RECLAIM; int max_order = MIN(zfs_abd_scatter_max_order, MAX_ORDER - 1); int nr_pages = abd_chunkcnt_for_bytes(size); int chunks = 0, zones = 0; size_t remaining_size; int nid = NUMA_NO_NODE; int alloc_pages = 0; int order; INIT_LIST_HEAD(&pages); while (alloc_pages < nr_pages) { unsigned long paddr; unsigned chunk_pages; order = MIN(highbit64(nr_pages - alloc_pages) - 1, max_order); chunk_pages = (1U << order); paddr = abd_alloc_chunk(nid, order ? gfp_comp : gfp, order); if (paddr == 0) { if (order == 0) { ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry); schedule_timeout_interruptible(1); } else { max_order = MAX(0, order - 1); } continue; } page = virt_to_page(paddr); list_add_tail(&page->lru, &pages); if ((nid != NUMA_NO_NODE) && (page_to_nid(page) != nid)) zones++; nid = page_to_nid(page); ABDSTAT_BUMP(abdstat_scatter_orders[order]); chunks++; alloc_pages += chunk_pages; } ASSERT3S(alloc_pages, ==, nr_pages); while (sg_alloc_table(&table, chunks, gfp)) { ABDSTAT_BUMP(abdstat_scatter_sg_table_retry); schedule_timeout_interruptible(1); } sg = table.sgl; remaining_size = size; list_for_each_entry_safe(page, tmp_page, &pages, lru) { size_t sg_size = MIN(PAGESIZE << compound_order(page), remaining_size); sg_set_page(sg, page, sg_size, 0); remaining_size -= sg_size; sg = sg_next(sg); list_del(&page->lru); } if (chunks > 1) { ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk); abd->abd_flags |= ABD_FLAG_MULTI_CHUNK; if (zones) { ABDSTAT_BUMP(abdstat_scatter_page_multi_zone); abd->abd_flags |= ABD_FLAG_MULTI_ZONE; } } ABD_SCATTER(abd).abd_sgl = table.sgl; ABD_SCATTER(abd).abd_nents = table.nents; } #else /* * Allocate N individual pages to construct a scatter ABD. This function * makes no attempt to request contiguous pages and requires the minimal * number of kernel interfaces. 
It's designed for maximum compatibility. */ static void abd_alloc_pages(abd_t *abd, size_t size) { struct scatterlist *sg; struct sg_table table; struct page *page; gfp_t gfp = __GFP_NOWARN | GFP_NOIO; int nr_pages = abd_chunkcnt_for_bytes(size); int i; while (sg_alloc_table(&table, nr_pages, gfp)) { ABDSTAT_BUMP(abdstat_scatter_sg_table_retry); schedule_timeout_interruptible(1); } ASSERT3U(table.nents, ==, nr_pages); ABD_SCATTER(abd).abd_sgl = table.sgl; ABD_SCATTER(abd).abd_nents = nr_pages; abd_for_each_sg(abd, sg, nr_pages, i) { while ((page = __page_cache_alloc(gfp)) == NULL) { ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry); schedule_timeout_interruptible(1); } ABDSTAT_BUMP(abdstat_scatter_orders[0]); sg_set_page(sg, page, PAGESIZE, 0); } if (nr_pages > 1) { ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk); abd->abd_flags |= ABD_FLAG_MULTI_CHUNK; } } #endif /* !CONFIG_HIGHMEM */ static void abd_free_pages(abd_t *abd) { struct scatterlist *sg; struct sg_table table; struct page *page; int nr_pages = ABD_SCATTER(abd).abd_nents; int order, i, j; if (abd->abd_flags & ABD_FLAG_MULTI_ZONE) ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_zone); if (abd->abd_flags & ABD_FLAG_MULTI_CHUNK) ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_chunk); abd_for_each_sg(abd, sg, nr_pages, i) { for (j = 0; j < sg->length; ) { page = nth_page(sg_page(sg), j >> PAGE_SHIFT); order = compound_order(page); __free_pages(page, order); j += (PAGESIZE << order); ABDSTAT_BUMPDOWN(abdstat_scatter_orders[order]); } } table.sgl = ABD_SCATTER(abd).abd_sgl; table.nents = table.orig_nents = nr_pages; sg_free_table(&table); } #else /* _KERNEL */ #ifndef PAGE_SHIFT #define PAGE_SHIFT (highbit64(PAGESIZE)-1) #endif struct page; #define kpm_enable 1 #define abd_alloc_chunk(o) \ ((struct page *)umem_alloc_aligned(PAGESIZE << (o), 64, KM_SLEEP)) #define abd_free_chunk(chunk, o) umem_free(chunk, PAGESIZE << (o)) #define zfs_kmap_atomic(chunk, km) ((void *)chunk) #define zfs_kunmap_atomic(addr, km) do { (void)(addr); } while (0) #define local_irq_save(flags) do { (void)(flags); } while (0) #define local_irq_restore(flags) do { (void)(flags); } while (0) #define nth_page(pg, i) \ ((struct page *)((void *)(pg) + (i) * PAGESIZE)) struct scatterlist { struct page *page; int length; int end; }; static void -sg_init_table(struct scatterlist *sg, int nr) { +sg_init_table(struct scatterlist *sg, int nr) +{ memset(sg, 0, nr * sizeof (struct scatterlist)); sg[nr - 1].end = 1; } #define for_each_sg(sgl, sg, nr, i) \ for ((i) = 0, (sg) = (sgl); (i) < (nr); (i)++, (sg) = sg_next(sg)) static inline void sg_set_page(struct scatterlist *sg, struct page *page, unsigned int len, unsigned int offset) { /* currently we don't use offset */ ASSERT(offset == 0); sg->page = page; sg->length = len; } static inline struct page * sg_page(struct scatterlist *sg) { return (sg->page); } static inline struct scatterlist * sg_next(struct scatterlist *sg) { if (sg->end) return (NULL); return (sg + 1); } static void abd_alloc_pages(abd_t *abd, size_t size) { unsigned nr_pages = abd_chunkcnt_for_bytes(size); struct scatterlist *sg; int i; ABD_SCATTER(abd).abd_sgl = vmem_alloc(nr_pages * sizeof (struct scatterlist), KM_SLEEP); sg_init_table(ABD_SCATTER(abd).abd_sgl, nr_pages); abd_for_each_sg(abd, sg, nr_pages, i) { struct page *p = abd_alloc_chunk(0); sg_set_page(sg, p, PAGESIZE, 0); } ABD_SCATTER(abd).abd_nents = nr_pages; } static void abd_free_pages(abd_t *abd) { int i, n = ABD_SCATTER(abd).abd_nents; struct scatterlist *sg; int j; abd_for_each_sg(abd, sg, n, i) { for 
(j = 0; j < sg->length; j += PAGESIZE) { struct page *p = nth_page(sg_page(sg), j>>PAGE_SHIFT); abd_free_chunk(p, 0); } } vmem_free(ABD_SCATTER(abd).abd_sgl, n * sizeof (struct scatterlist)); } #endif /* _KERNEL */ void abd_init(void) { int i; abd_cache = kmem_cache_create("abd_t", sizeof (abd_t), 0, NULL, NULL, NULL, NULL, NULL, 0); abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED, sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (abd_ksp != NULL) { abd_ksp->ks_data = &abd_stats; kstat_install(abd_ksp); for (i = 0; i < MAX_ORDER; i++) { snprintf(abd_stats.abdstat_scatter_orders[i].name, KSTAT_STRLEN, "scatter_order_%d", i); abd_stats.abdstat_scatter_orders[i].data_type = KSTAT_DATA_UINT64; } } } void abd_fini(void) { if (abd_ksp != NULL) { kstat_delete(abd_ksp); abd_ksp = NULL; } if (abd_cache) { kmem_cache_destroy(abd_cache); abd_cache = NULL; } } static inline void abd_verify(abd_t *abd) { ASSERT3U(abd->abd_size, >, 0); ASSERT3U(abd->abd_size, <=, SPA_MAXBLOCKSIZE); ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR | ABD_FLAG_OWNER | ABD_FLAG_META | ABD_FLAG_MULTI_ZONE | ABD_FLAG_MULTI_CHUNK)); IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER)); IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER); if (abd_is_linear(abd)) { ASSERT3P(abd->abd_u.abd_linear.abd_buf, !=, NULL); } else { size_t n; int i; struct scatterlist *sg; ASSERT3U(ABD_SCATTER(abd).abd_nents, >, 0); ASSERT3U(ABD_SCATTER(abd).abd_offset, <, ABD_SCATTER(abd).abd_sgl->length); n = ABD_SCATTER(abd).abd_nents; abd_for_each_sg(abd, sg, n, i) { ASSERT3P(sg_page(sg), !=, NULL); } } } static inline abd_t * abd_alloc_struct(void) { abd_t *abd = kmem_cache_alloc(abd_cache, KM_PUSHPAGE); ASSERT3P(abd, !=, NULL); ABDSTAT_INCR(abdstat_struct_size, sizeof (abd_t)); return (abd); } static inline void abd_free_struct(abd_t *abd) { kmem_cache_free(abd_cache, abd); ABDSTAT_INCR(abdstat_struct_size, -sizeof (abd_t)); } /* * Allocate an ABD, along with its own underlying data buffers. Use this if you * don't care whether the ABD is linear or not. */ abd_t * abd_alloc(size_t size, boolean_t is_metadata) { abd_t *abd; if (!zfs_abd_scatter_enabled || size <= PAGESIZE) return (abd_alloc_linear(size, is_metadata)); VERIFY3U(size, <=, SPA_MAXBLOCKSIZE); abd = abd_alloc_struct(); abd->abd_flags = ABD_FLAG_OWNER; abd_alloc_pages(abd, size); if (is_metadata) { abd->abd_flags |= ABD_FLAG_META; } abd->abd_size = size; abd->abd_parent = NULL; refcount_create(&abd->abd_children); abd->abd_u.abd_scatter.abd_offset = 0; ABDSTAT_BUMP(abdstat_scatter_cnt); ABDSTAT_INCR(abdstat_scatter_data_size, size); ABDSTAT_INCR(abdstat_scatter_chunk_waste, P2ROUNDUP(size, PAGESIZE) - size); return (abd); } static void abd_free_scatter(abd_t *abd) { abd_free_pages(abd); refcount_destroy(&abd->abd_children); ABDSTAT_BUMPDOWN(abdstat_scatter_cnt); ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size); ABDSTAT_INCR(abdstat_scatter_chunk_waste, abd->abd_size - P2ROUNDUP(abd->abd_size, PAGESIZE)); abd_free_struct(abd); } /* * Allocate an ABD that must be linear, along with its own underlying data * buffer. Only use this when it would be very annoying to write your ABD * consumer with a scattered ABD. 
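* (Note that abd_alloc() above already degenerates to a linear ABD for requests of PAGESIZE or less, or when zfs_abd_scatter_enabled is B_FALSE.)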
*/ abd_t * abd_alloc_linear(size_t size, boolean_t is_metadata) { abd_t *abd = abd_alloc_struct(); VERIFY3U(size, <=, SPA_MAXBLOCKSIZE); abd->abd_flags = ABD_FLAG_LINEAR | ABD_FLAG_OWNER; if (is_metadata) { abd->abd_flags |= ABD_FLAG_META; } abd->abd_size = size; abd->abd_parent = NULL; refcount_create(&abd->abd_children); if (is_metadata) { abd->abd_u.abd_linear.abd_buf = zio_buf_alloc(size); } else { abd->abd_u.abd_linear.abd_buf = zio_data_buf_alloc(size); } ABDSTAT_BUMP(abdstat_linear_cnt); ABDSTAT_INCR(abdstat_linear_data_size, size); return (abd); } static void abd_free_linear(abd_t *abd) { if (abd->abd_flags & ABD_FLAG_META) { zio_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size); } else { zio_data_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size); } refcount_destroy(&abd->abd_children); ABDSTAT_BUMPDOWN(abdstat_linear_cnt); ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size); abd_free_struct(abd); } /* * Free an ABD. Only use this on ABDs allocated with abd_alloc() or * abd_alloc_linear(). */ void abd_free(abd_t *abd) { abd_verify(abd); ASSERT3P(abd->abd_parent, ==, NULL); ASSERT(abd->abd_flags & ABD_FLAG_OWNER); if (abd_is_linear(abd)) abd_free_linear(abd); else abd_free_scatter(abd); } /* * Allocate an ABD of the same format (same metadata flag, same scatterize * setting) as another ABD. */ abd_t * abd_alloc_sametype(abd_t *sabd, size_t size) { boolean_t is_metadata = (sabd->abd_flags & ABD_FLAG_META) != 0; if (abd_is_linear(sabd)) { return (abd_alloc_linear(size, is_metadata)); } else { return (abd_alloc(size, is_metadata)); } } /* * If we're going to use this ABD for doing I/O using the block layer, the * consumer of the ABD data doesn't care if it's scattered or not, and we don't * plan to store this ABD in memory for a long period of time, we should * allocate the ABD type that requires the least data copying to do the I/O. * * On Illumos this is linear ABDs, however if ldi_strategy() can ever issue I/Os * using a scatter/gather list we should switch to that and replace this call * with vanilla abd_alloc(). * * On Linux the optimal thing to do would be to use abd_get_offset() and * construct a new ABD which shares the original pages thereby eliminating * the copy. But for the moment a new linear ABD is allocated until this * performance optimization can be implemented. */ abd_t * abd_alloc_for_io(size_t size, boolean_t is_metadata) { return (abd_alloc_linear(size, is_metadata)); } /* * Allocate a new ABD to point to offset off of sabd. It shares the underlying * buffer data with sabd. Use abd_put() to free. sabd must not be freed while * any derived ABDs exist. */ static inline abd_t * abd_get_offset_impl(abd_t *sabd, size_t off, size_t size) { abd_t *abd; abd_verify(sabd); ASSERT3U(off, <=, sabd->abd_size); if (abd_is_linear(sabd)) { abd = abd_alloc_struct(); /* * Even if this buf is filesystem metadata, we only track that * if we own the underlying data buffer, which is not true in * this case. Therefore, we don't ever use ABD_FLAG_META here. */ abd->abd_flags = ABD_FLAG_LINEAR; abd->abd_u.abd_linear.abd_buf = (char *)sabd->abd_u.abd_linear.abd_buf + off; } else { int i; struct scatterlist *sg; size_t new_offset = sabd->abd_u.abd_scatter.abd_offset + off; abd = abd_alloc_struct(); /* * Even if this buf is filesystem metadata, we only track that * if we own the underlying data buffer, which is not true in * this case. Therefore, we don't ever use ABD_FLAG_META here. 
*/ abd->abd_flags = 0; abd_for_each_sg(sabd, sg, ABD_SCATTER(sabd).abd_nents, i) { if (new_offset < sg->length) break; new_offset -= sg->length; } ABD_SCATTER(abd).abd_sgl = sg; ABD_SCATTER(abd).abd_offset = new_offset; ABD_SCATTER(abd).abd_nents = ABD_SCATTER(sabd).abd_nents - i; } abd->abd_size = size; abd->abd_parent = sabd; refcount_create(&abd->abd_children); (void) refcount_add_many(&sabd->abd_children, abd->abd_size, abd); return (abd); } abd_t * abd_get_offset(abd_t *sabd, size_t off) { size_t size = sabd->abd_size > off ? sabd->abd_size - off : 0; VERIFY3U(size, >, 0); return (abd_get_offset_impl(sabd, off, size)); } abd_t * abd_get_offset_size(abd_t *sabd, size_t off, size_t size) { ASSERT3U(off + size, <=, sabd->abd_size); return (abd_get_offset_impl(sabd, off, size)); } /* * Allocate a linear ABD structure for buf. You must free this with abd_put() * since the resulting ABD doesn't own its own buffer. */ abd_t * abd_get_from_buf(void *buf, size_t size) { abd_t *abd = abd_alloc_struct(); VERIFY3U(size, <=, SPA_MAXBLOCKSIZE); /* * Even if this buf is filesystem metadata, we only track that if we * own the underlying data buffer, which is not true in this case. * Therefore, we don't ever use ABD_FLAG_META here. */ abd->abd_flags = ABD_FLAG_LINEAR; abd->abd_size = size; abd->abd_parent = NULL; refcount_create(&abd->abd_children); abd->abd_u.abd_linear.abd_buf = buf; return (abd); } /* * Free an ABD allocated from abd_get_offset() or abd_get_from_buf(). Will not * free the underlying scatterlist or buffer. */ void abd_put(abd_t *abd) { abd_verify(abd); ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER)); if (abd->abd_parent != NULL) { (void) refcount_remove_many(&abd->abd_parent->abd_children, abd->abd_size, abd); } refcount_destroy(&abd->abd_children); abd_free_struct(abd); } /* * Get the raw buffer associated with a linear ABD. */ void * abd_to_buf(abd_t *abd) { ASSERT(abd_is_linear(abd)); abd_verify(abd); return (abd->abd_u.abd_linear.abd_buf); } /* * Borrow a raw buffer from an ABD without copying the contents of the ABD * into the buffer. If the ABD is scattered, this will allocate a raw buffer * whose contents are undefined. To copy over the existing data in the ABD, use * abd_borrow_buf_copy() instead. */ void * abd_borrow_buf(abd_t *abd, size_t n) { void *buf; abd_verify(abd); ASSERT3U(abd->abd_size, >=, n); if (abd_is_linear(abd)) { buf = abd_to_buf(abd); } else { buf = zio_buf_alloc(n); } (void) refcount_add_many(&abd->abd_children, n, buf); return (buf); } void * abd_borrow_buf_copy(abd_t *abd, size_t n) { void *buf = abd_borrow_buf(abd, n); if (!abd_is_linear(abd)) { abd_copy_to_buf(buf, abd, n); } return (buf); } /* * Return a borrowed raw buffer to an ABD. If the ABD is scattered, this will * not change the contents of the ABD and will ASSERT that you didn't modify * the buffer since it was borrowed. If you want any changes you made to buf to * be copied back to abd, use abd_return_buf_copy() instead. */ void abd_return_buf(abd_t *abd, void *buf, size_t n) { abd_verify(abd); ASSERT3U(abd->abd_size, >=, n); if (abd_is_linear(abd)) { ASSERT3P(buf, ==, abd_to_buf(abd)); } else { ASSERT0(abd_cmp_buf(abd, buf, n)); zio_buf_free(buf, n); } (void) refcount_remove_many(&abd->abd_children, n, buf); } void abd_return_buf_copy(abd_t *abd, void *buf, size_t n) { if (!abd_is_linear(abd)) { abd_copy_from_buf(abd, buf, n); } abd_return_buf(abd, buf, n); } /* * Give this ABD ownership of the buffer that it's storing. 
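 * For instance (a sketch; assumes "buf" was obtained from
 * zio_data_buf_alloc()):
 *
 *	abd_t *abd = abd_get_from_buf(buf, size);
 *	...
 *	abd_take_ownership_of_buf(abd, B_FALSE);
 *	abd_free(abd);	(legal now; the ABD owns and frees buf)
 *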
Can only be used on * linear ABDs which were allocated via abd_get_from_buf(), or ones allocated * with abd_alloc_linear() which subsequently released ownership of their buf * with abd_release_ownership_of_buf(). */ void abd_take_ownership_of_buf(abd_t *abd, boolean_t is_metadata) { ASSERT(abd_is_linear(abd)); ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER)); abd_verify(abd); abd->abd_flags |= ABD_FLAG_OWNER; if (is_metadata) { abd->abd_flags |= ABD_FLAG_META; } ABDSTAT_BUMP(abdstat_linear_cnt); ABDSTAT_INCR(abdstat_linear_data_size, abd->abd_size); } void abd_release_ownership_of_buf(abd_t *abd) { ASSERT(abd_is_linear(abd)); ASSERT(abd->abd_flags & ABD_FLAG_OWNER); abd_verify(abd); abd->abd_flags &= ~ABD_FLAG_OWNER; /* Disable this flag since we no longer own the data buffer */ abd->abd_flags &= ~ABD_FLAG_META; ABDSTAT_BUMPDOWN(abdstat_linear_cnt); ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size); } #ifndef HAVE_1ARG_KMAP_ATOMIC #define NR_KM_TYPE (6) #ifdef _KERNEL int km_table[NR_KM_TYPE] = { KM_USER0, KM_USER1, KM_BIO_SRC_IRQ, KM_BIO_DST_IRQ, KM_PTE0, KM_PTE1, }; #endif #endif struct abd_iter { /* public interface */ void *iter_mapaddr; /* addr corresponding to iter_pos */ size_t iter_mapsize; /* length of data valid at mapaddr */ /* private */ abd_t *iter_abd; /* ABD being iterated through */ size_t iter_pos; size_t iter_offset; /* offset in current sg/abd_buf, */ /* abd_offset included */ struct scatterlist *iter_sg; /* current sg */ #ifndef HAVE_1ARG_KMAP_ATOMIC int iter_km; /* KM_* for kmap_atomic */ #endif }; /* * Initialize the abd_iter. */ static void abd_iter_init(struct abd_iter *aiter, abd_t *abd, int km_type) { abd_verify(abd); aiter->iter_abd = abd; aiter->iter_mapaddr = NULL; aiter->iter_mapsize = 0; aiter->iter_pos = 0; if (abd_is_linear(abd)) { aiter->iter_offset = 0; aiter->iter_sg = NULL; } else { aiter->iter_offset = ABD_SCATTER(abd).abd_offset; aiter->iter_sg = ABD_SCATTER(abd).abd_sgl; } #ifndef HAVE_1ARG_KMAP_ATOMIC ASSERT3U(km_type, <, NR_KM_TYPE); aiter->iter_km = km_type; #endif } /* * Advance the iterator by a certain amount. Cannot be called when a chunk is * in use. This can be safely called when the aiter has already exhausted, in * which case this does nothing. */ static void abd_iter_advance(struct abd_iter *aiter, size_t amount) { ASSERT3P(aiter->iter_mapaddr, ==, NULL); ASSERT0(aiter->iter_mapsize); /* There's nothing left to advance to, so do nothing */ if (aiter->iter_pos == aiter->iter_abd->abd_size) return; aiter->iter_pos += amount; aiter->iter_offset += amount; if (!abd_is_linear(aiter->iter_abd)) { while (aiter->iter_offset >= aiter->iter_sg->length) { aiter->iter_offset -= aiter->iter_sg->length; aiter->iter_sg = sg_next(aiter->iter_sg); if (aiter->iter_sg == NULL) { ASSERT0(aiter->iter_offset); break; } } } } /* * Map the current chunk into aiter. This can be safely called when the aiter * has already exhausted, in which case this does nothing. 
*/ static void abd_iter_map(struct abd_iter *aiter) { void *paddr; size_t offset = 0; ASSERT3P(aiter->iter_mapaddr, ==, NULL); ASSERT0(aiter->iter_mapsize); /* There's nothing left to iterate over, so do nothing */ if (aiter->iter_pos == aiter->iter_abd->abd_size) return; if (abd_is_linear(aiter->iter_abd)) { ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset); offset = aiter->iter_offset; aiter->iter_mapsize = aiter->iter_abd->abd_size - offset; paddr = aiter->iter_abd->abd_u.abd_linear.abd_buf; } else { offset = aiter->iter_offset; aiter->iter_mapsize = MIN(aiter->iter_sg->length - offset, aiter->iter_abd->abd_size - aiter->iter_pos); paddr = zfs_kmap_atomic(sg_page(aiter->iter_sg), km_table[aiter->iter_km]); } aiter->iter_mapaddr = (char *)paddr + offset; } /* * Unmap the current chunk from aiter. This can be safely called when the aiter * has already exhausted, in which case this does nothing. */ static void abd_iter_unmap(struct abd_iter *aiter) { /* There's nothing left to unmap, so do nothing */ if (aiter->iter_pos == aiter->iter_abd->abd_size) return; if (!abd_is_linear(aiter->iter_abd)) { /* LINTED E_FUNC_SET_NOT_USED */ zfs_kunmap_atomic(aiter->iter_mapaddr - aiter->iter_offset, km_table[aiter->iter_km]); } ASSERT3P(aiter->iter_mapaddr, !=, NULL); ASSERT3U(aiter->iter_mapsize, >, 0); aiter->iter_mapaddr = NULL; aiter->iter_mapsize = 0; } int abd_iterate_func(abd_t *abd, size_t off, size_t size, abd_iter_func_t *func, void *private) { int ret = 0; struct abd_iter aiter; abd_verify(abd); ASSERT3U(off + size, <=, abd->abd_size); abd_iter_init(&aiter, abd, 0); abd_iter_advance(&aiter, off); while (size > 0) { size_t len; abd_iter_map(&aiter); len = MIN(aiter.iter_mapsize, size); ASSERT3U(len, >, 0); ret = func(aiter.iter_mapaddr, len, private); abd_iter_unmap(&aiter); if (ret != 0) break; size -= len; abd_iter_advance(&aiter, len); } return (ret); } struct buf_arg { void *arg_buf; }; static int abd_copy_to_buf_off_cb(void *buf, size_t size, void *private) { struct buf_arg *ba_ptr = private; (void) memcpy(ba_ptr->arg_buf, buf, size); ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size; return (0); } /* * Copy abd to buf. (off is the offset in abd.) */ void abd_copy_to_buf_off(void *buf, abd_t *abd, size_t off, size_t size) { struct buf_arg ba_ptr = { buf }; (void) abd_iterate_func(abd, off, size, abd_copy_to_buf_off_cb, &ba_ptr); } static int abd_cmp_buf_off_cb(void *buf, size_t size, void *private) { int ret; struct buf_arg *ba_ptr = private; ret = memcmp(buf, ba_ptr->arg_buf, size); ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size; return (ret); } /* * Compare the contents of abd to buf. (off is the offset in abd.) */ int abd_cmp_buf_off(abd_t *abd, const void *buf, size_t off, size_t size) { struct buf_arg ba_ptr = { (void *) buf }; return (abd_iterate_func(abd, off, size, abd_cmp_buf_off_cb, &ba_ptr)); } static int abd_copy_from_buf_off_cb(void *buf, size_t size, void *private) { struct buf_arg *ba_ptr = private; (void) memcpy(buf, ba_ptr->arg_buf, size); ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size; return (0); } /* * Copy from buf to abd. (off is the offset in abd.) */ void abd_copy_from_buf_off(abd_t *abd, const void *buf, size_t off, size_t size) { struct buf_arg ba_ptr = { (void *) buf }; (void) abd_iterate_func(abd, off, size, abd_copy_from_buf_off_cb, &ba_ptr); } /*ARGSUSED*/ static int abd_zero_off_cb(void *buf, size_t size, void *private) { (void) memset(buf, 0, size); return (0); } /* * Zero out the abd from a particular offset to the end. 
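 * Like the copy and compare helpers above, this is a thin wrapper around
 * abd_iterate_func(). For example (a sketch), clearing everything past
 * the first 4K of a buffer would be:
 *
 *	abd_zero_off(abd, 4096, abd->abd_size - 4096);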
*/ void abd_zero_off(abd_t *abd, size_t off, size_t size) { (void) abd_iterate_func(abd, off, size, abd_zero_off_cb, NULL); } /* * Iterate over two ABDs and call func incrementally on the two ABDs' data in * equal-sized chunks (passed to func as raw buffers). func could be called many * times during this iteration. */ int abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, size_t size, abd_iter_func2_t *func, void *private) { int ret = 0; struct abd_iter daiter, saiter; abd_verify(dabd); abd_verify(sabd); ASSERT3U(doff + size, <=, dabd->abd_size); ASSERT3U(soff + size, <=, sabd->abd_size); abd_iter_init(&daiter, dabd, 0); abd_iter_init(&saiter, sabd, 1); abd_iter_advance(&daiter, doff); abd_iter_advance(&saiter, soff); while (size > 0) { size_t dlen, slen, len; abd_iter_map(&daiter); abd_iter_map(&saiter); dlen = MIN(daiter.iter_mapsize, size); slen = MIN(saiter.iter_mapsize, size); len = MIN(dlen, slen); ASSERT(dlen > 0 || slen > 0); ret = func(daiter.iter_mapaddr, saiter.iter_mapaddr, len, private); abd_iter_unmap(&saiter); abd_iter_unmap(&daiter); if (ret != 0) break; size -= len; abd_iter_advance(&daiter, len); abd_iter_advance(&saiter, len); } return (ret); } /*ARGSUSED*/ static int abd_copy_off_cb(void *dbuf, void *sbuf, size_t size, void *private) { (void) memcpy(dbuf, sbuf, size); return (0); } /* * Copy from sabd to dabd starting from soff and doff. */ void abd_copy_off(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, size_t size) { (void) abd_iterate_func2(dabd, sabd, doff, soff, size, abd_copy_off_cb, NULL); } /*ARGSUSED*/ static int abd_cmp_cb(void *bufa, void *bufb, size_t size, void *private) { return (memcmp(bufa, bufb, size)); } /* * Compares the contents of two ABDs. */ int abd_cmp(abd_t *dabd, abd_t *sabd) { ASSERT3U(dabd->abd_size, ==, sabd->abd_size); return (abd_iterate_func2(dabd, sabd, 0, 0, dabd->abd_size, abd_cmp_cb, NULL)); } /* * Iterate over code ABDs and a data ABD and call @func_raidz_gen. * * @cabds parity ABDs, must have equal size * @dabd data ABD. Can be NULL (in this case @dsize = 0) * @func_raidz_gen should be implemented so that its behaviour * is the same when taking linear and when taking scatter */ void abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, - ssize_t csize, ssize_t dsize, const unsigned parity, - void (*func_raidz_gen)(void **, const void *, size_t, size_t)) + ssize_t csize, ssize_t dsize, const unsigned parity, + void (*func_raidz_gen)(void **, const void *, size_t, size_t)) { int i; ssize_t len, dlen; struct abd_iter caiters[3]; struct abd_iter daiter = {0}; void *caddrs[3]; unsigned long flags; ASSERT3U(parity, <=, 3); for (i = 0; i < parity; i++) abd_iter_init(&caiters[i], cabds[i], i); if (dabd) abd_iter_init(&daiter, dabd, i); ASSERT3S(dsize, >=, 0); local_irq_save(flags); while (csize > 0) { len = csize; if (dabd && dsize > 0) abd_iter_map(&daiter); for (i = 0; i < parity; i++) { abd_iter_map(&caiters[i]); caddrs[i] = caiters[i].iter_mapaddr; } switch (parity) { case 3: len = MIN(caiters[2].iter_mapsize, len); case 2: len = MIN(caiters[1].iter_mapsize, len); case 1: len = MIN(caiters[0].iter_mapsize, len); } /* must be progressive */ ASSERT3S(len, >, 0); if (dabd && dsize > 0) { /* this needs precise iter.length */ len = MIN(daiter.iter_mapsize, len); dlen = len; } else dlen = 0; /* must be progressive */ ASSERT3S(len, >, 0); /* * The iterated function likely will not do well if each * segment except the last one is not multiple of 512 (raidz). 
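 * The ASSERT immediately below enforces this by requiring (len & 511)
 * to be zero on every pass through the loop.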
*/ ASSERT3U(((uint64_t)len & 511ULL), ==, 0); func_raidz_gen(caddrs, daiter.iter_mapaddr, len, dlen); for (i = parity-1; i >= 0; i--) { abd_iter_unmap(&caiters[i]); abd_iter_advance(&caiters[i], len); } if (dabd && dsize > 0) { abd_iter_unmap(&daiter); abd_iter_advance(&daiter, dlen); dsize -= dlen; } csize -= len; ASSERT3S(dsize, >=, 0); ASSERT3S(csize, >=, 0); } local_irq_restore(flags); } /* * Iterate over code ABDs and data reconstruction target ABDs and call * @func_raidz_rec. Function maps at most 6 pages atomically. * * @cabds parity ABDs, must have equal size * @tabds rec target ABDs, at most 3 * @tsize size of data target columns * @func_raidz_rec expects syndrome data in target columns. Function * reconstructs data and overwrites target columns. */ void abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds, - ssize_t tsize, const unsigned parity, - void (*func_raidz_rec)(void **t, const size_t tsize, void **c, - const unsigned *mul), - const unsigned *mul) + ssize_t tsize, const unsigned parity, + void (*func_raidz_rec)(void **t, const size_t tsize, void **c, + const unsigned *mul), + const unsigned *mul) { int i; ssize_t len; struct abd_iter citers[3]; struct abd_iter xiters[3]; void *caddrs[3], *xaddrs[3]; unsigned long flags; ASSERT3U(parity, <=, 3); for (i = 0; i < parity; i++) { abd_iter_init(&citers[i], cabds[i], 2*i); abd_iter_init(&xiters[i], tabds[i], 2*i+1); } local_irq_save(flags); while (tsize > 0) { for (i = 0; i < parity; i++) { abd_iter_map(&citers[i]); abd_iter_map(&xiters[i]); caddrs[i] = citers[i].iter_mapaddr; xaddrs[i] = xiters[i].iter_mapaddr; } len = tsize; switch (parity) { case 3: len = MIN(xiters[2].iter_mapsize, len); len = MIN(citers[2].iter_mapsize, len); case 2: len = MIN(xiters[1].iter_mapsize, len); len = MIN(citers[1].iter_mapsize, len); case 1: len = MIN(xiters[0].iter_mapsize, len); len = MIN(citers[0].iter_mapsize, len); } /* must be progressive */ ASSERT3S(len, >, 0); /* * The iterated function likely will not do well if each * segment except the last one is not multiple of 512 (raidz). */ ASSERT3U(((uint64_t)len & 511ULL), ==, 0); func_raidz_rec(xaddrs, len, caddrs, mul); for (i = parity-1; i >= 0; i--) { abd_iter_unmap(&xiters[i]); abd_iter_unmap(&citers[i]); abd_iter_advance(&xiters[i], len); abd_iter_advance(&citers[i], len); } tsize -= len; ASSERT3S(tsize, >=, 0); } local_irq_restore(flags); } #if defined(_KERNEL) && defined(HAVE_SPL) /* * bio_nr_pages for ABD. * @off is the offset in @abd */ unsigned long abd_nr_pages_off(abd_t *abd, unsigned int size, size_t off) { unsigned long pos; if (abd_is_linear(abd)) pos = (unsigned long)abd_to_buf(abd) + off; else pos = abd->abd_u.abd_scatter.abd_offset + off; return ((pos + size + PAGESIZE - 1) >> PAGE_SHIFT) - (pos >> PAGE_SHIFT); } /* * bio_map for scatter ABD. 
* @off is the offset in @abd * Remaining IO size is returned */ unsigned int abd_scatter_bio_map_off(struct bio *bio, abd_t *abd, - unsigned int io_size, size_t off) + unsigned int io_size, size_t off) { int i; struct abd_iter aiter; ASSERT(!abd_is_linear(abd)); ASSERT3U(io_size, <=, abd->abd_size - off); abd_iter_init(&aiter, abd, 0); abd_iter_advance(&aiter, off); for (i = 0; i < bio->bi_max_vecs; i++) { struct page *pg; size_t len, sgoff, pgoff; struct scatterlist *sg; if (io_size <= 0) break; sg = aiter.iter_sg; sgoff = aiter.iter_offset; pgoff = sgoff & (PAGESIZE - 1); len = MIN(io_size, PAGESIZE - pgoff); ASSERT(len > 0); pg = nth_page(sg_page(sg), sgoff >> PAGE_SHIFT); if (bio_add_page(bio, pg, len, pgoff) != len) break; io_size -= len; abd_iter_advance(&aiter, len); } return (io_size); } /* Tunable Parameters */ module_param(zfs_abd_scatter_enabled, int, 0644); MODULE_PARM_DESC(zfs_abd_scatter_enabled, "Toggle whether ABD allocations must be linear."); /* CSTYLED */ module_param(zfs_abd_scatter_max_order, uint, 0644); MODULE_PARM_DESC(zfs_abd_scatter_max_order, "Maximum order allocation used for a scatter ABD."); #endif diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 2ce8cf628f1d..351e50e404a3 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -1,7791 +1,7792 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright (c) 2011, 2016 by Delphix. All rights reserved. * Copyright (c) 2014 by Saso Kiselkov. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. */ /* * DVA-based Adjustable Replacement Cache * * While much of the theory of operation used here is * based on the self-tuning, low overhead replacement cache * presented by Megiddo and Modha at FAST 2003, there are some * significant differences: * * 1. The Megiddo and Modha model assumes any page is evictable. * Pages in its cache cannot be "locked" into memory. This makes * the eviction algorithm simple: evict the last page in the list. * This also makes the performance characteristics easy to reason * about. Our cache is not so simple. At any given moment, some * subset of the blocks in the cache are un-evictable because we * have handed out a reference to them. Blocks are only evictable * when there are no external references active. This makes * eviction far more problematic: we choose to evict the evictable * blocks that are the "lowest" in the list. * * There are times when it is not possible to evict the requested * space. In these circumstances we are unable to adjust the cache * size.
To prevent the cache growing unbounded at these times we * implement a "cache throttle" that slows the flow of new data * into the cache until we can make space available. * * 2. The Megiddo and Modha model assumes a fixed cache size. * Pages are evicted when the cache is full and there is a cache * miss. Our model has a variable sized cache. It grows with * high use, but also tries to react to memory pressure from the * operating system: decreasing its size when system memory is * tight. * * 3. The Megiddo and Modha model assumes a fixed page size. All * elements of the cache are therefore exactly the same size. So * when adjusting the cache size following a cache miss, it's simply * a matter of choosing a single page to evict. In our model, we * have variable sized cache blocks (ranging from 512 bytes to * 128K bytes). We therefore choose a set of blocks to evict to make * space for a cache miss that approximates as closely as possible * the space used by the new block. * * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache" * by N. Megiddo & D. Modha, FAST 2003 */ /* * The locking model: * * A new reference to a cache buffer can be obtained in two * ways: 1) via a hash table lookup using the DVA as a key, * or 2) via one of the ARC lists. The arc_read() interface * uses method 1, while the internal ARC algorithms for * adjusting the cache use method 2. We therefore provide two * types of locks: 1) the hash table lock array, and 2) the * ARC list locks. * * Buffers do not have their own mutexes; rather, they rely on the * hash table mutexes for the bulk of their protection (i.e. most * fields in the arc_buf_hdr_t are protected by these mutexes). * * buf_hash_find() returns the appropriate mutex (held) when it * locates the requested buffer in the hash table. It returns * NULL for the mutex if the buffer was not in the table. * * buf_hash_remove() expects the appropriate hash mutex to be * already held before it is invoked. * * Each ARC state also has a mutex which is used to protect the * buffer list associated with the state. When attempting to * obtain a hash table lock while holding an ARC list lock you * must use mutex_tryenter() to avoid deadlock. Also note that * the active state mutex must be held before the ghost state mutex. * * It is also possible to register a callback which is run when the * arc_meta_limit is reached and no buffers can be safely evicted. In * this case the arc user should drop a reference on some arc buffers so * they can be reclaimed and the arc_meta_limit honored. For example, * when using the ZPL each dentry holds a reference on a znode. These * dentries must be pruned before the arc buffer holding the znode can * be safely evicted. * * Note that the majority of the performance stats are manipulated * with atomic operations. * * The L2ARC uses the l2ad_mtx on each vdev for the following: * * - L2ARC buflist creation * - L2ARC buflist eviction * - L2ARC write completion, which walks L2ARC buflists * - ARC header destruction, as it removes from L2ARC buflists * - ARC header release, as it removes from L2ARC buflists */ /* * ARC operation: * * Every block that is in the ARC is tracked by an arc_buf_hdr_t structure. * This structure can point either to a block that is still in the cache or to * one that is only accessible in an L2 ARC device, or it can provide * information about a block that was recently evicted. If a block is * only accessible in the L2ARC, then the arc_buf_hdr_t only has enough * information to retrieve it from the L2ARC device.
This information is * stored in the l2arc_buf_hdr_t sub-structure of the arc_buf_hdr_t. A block * that is in this state cannot access the data directly. * * Blocks that are actively being referenced or have not been evicted * are cached in the L1ARC. The L1ARC (l1arc_buf_hdr_t) is a structure within * the arc_buf_hdr_t that will point to the data block in memory. A block can * only be read by a consumer if it has an l1arc_buf_hdr_t. The L1ARC * caches data in two ways -- in a list of ARC buffers (arc_buf_t) and * also in the arc_buf_hdr_t's private physical data block pointer (b_pabd). * * The L1ARC's data pointer may or may not be uncompressed. The ARC has the * ability to store the physical data (b_pabd) associated with the DVA of the * arc_buf_hdr_t. Since the b_pabd is a copy of the on-disk physical block, * it will match its on-disk compression characteristics. This behavior can be * disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. When the * compressed ARC functionality is disabled, the b_pabd will point to an * uncompressed version of the on-disk data. * * Data in the L1ARC is not accessed by consumers of the ARC directly. Each * arc_buf_hdr_t can have multiple ARC buffers (arc_buf_t) which reference it. * Each ARC buffer (arc_buf_t) is being actively accessed by a specific ARC * consumer. The ARC will provide references to this data and will keep it * cached until it is no longer in use. The ARC caches only the L1ARC's physical * data block and will evict any arc_buf_t that is no longer referenced. The * amount of memory consumed by the arc_buf_ts' data buffers can be seen via the * "overhead_size" kstat. * * Depending on the consumer, an arc_buf_t can be requested in uncompressed or * compressed form. The typical case is that consumers will want uncompressed * data, and when that happens a new data buffer is allocated where the data is * decompressed for them to use. Currently the only consumer who wants * compressed arc_buf_t's is "zfs send", when it streams data exactly as it * exists on disk. When this happens, the arc_buf_t's data buffer is shared * with the arc_buf_hdr_t. * * Here is a diagram showing an arc_buf_hdr_t referenced by two arc_buf_t's. The * first one is owned by a compressed send consumer (and therefore references * the same compressed data buffer as the arc_buf_hdr_t) and the second could be * used by any other consumer (and has its own uncompressed copy of the data * buffer). * * arc_buf_hdr_t * +-----------+ * | fields | * | common to | * | L1- and | * | L2ARC | * +-----------+ * | l2arc_buf_hdr_t * | | * +-----------+ * | l1arc_buf_hdr_t * | | arc_buf_t * | b_buf +------------>+-----------+ arc_buf_t * | b_pabd +-+ |b_next +---->+-----------+ * +-----------+ | |-----------| |b_next +-->NULL * | |b_comp = T | +-----------+ * | |b_data +-+ |b_comp = F | * | +-----------+ | |b_data +-+ * +->+------+ | +-----------+ | * compressed | | | | * data | |<--------------+ | uncompressed * +------+ compressed, | data * shared +-->+------+ * data | | * | | * +------+ * * When a consumer reads a block, the ARC must first look to see if the * arc_buf_hdr_t is cached. If the hdr is cached then the ARC allocates a new * arc_buf_t and either copies uncompressed data into a new data buffer from an * existing uncompressed arc_buf_t, decompresses the hdr's b_pabd buffer into a * new data buffer, or shares the hdr's b_pabd buffer, depending on whether the * hdr is compressed and the desired compression characteristics of the * arc_buf_t consumer. 
If the arc_buf_t ends up sharing data with the * arc_buf_hdr_t and both of them are uncompressed then the arc_buf_t must be * the last buffer in the hdr's b_buf list, however a shared compressed buf can * be anywhere in the hdr's list. * * The diagram below shows an example of an uncompressed ARC hdr that is * sharing its data with an arc_buf_t (note that the shared uncompressed buf is * the last element in the buf list): * * arc_buf_hdr_t * +-----------+ * | | * | | * | | * +-----------+ * l2arc_buf_hdr_t| | * | | * +-----------+ * l1arc_buf_hdr_t| | * | | arc_buf_t (shared) * | b_buf +------------>+---------+ arc_buf_t * | | |b_next +---->+---------+ * | b_pabd +-+ |---------| |b_next +-->NULL * +-----------+ | | | +---------+ * | |b_data +-+ | | * | +---------+ | |b_data +-+ * +->+------+ | +---------+ | * | | | | * uncompressed | | | | * data +------+ | | * ^ +->+------+ | * | uncompressed | | | * | data | | | * | +------+ | * +---------------------------------+ * * Writing to the ARC requires that the ARC first discard the hdr's b_pabd * since the physical block is about to be rewritten. The new data contents * will be contained in the arc_buf_t. As the I/O pipeline performs the write, * it may compress the data before writing it to disk. The ARC will be called * with the transformed data and will bcopy the transformed on-disk block into * a newly allocated b_pabd. Writes are always done into buffers which have * either been loaned (and hence are new and don't have other readers) or * buffers which have been released (and hence have their own hdr, if there * were originally other readers of the buf's original hdr). This ensures that * the ARC only needs to update a single buf and its hdr after a write occurs. * * When the L2ARC is in use, it will also take advantage of the b_pabd. The * L2ARC will always write the contents of b_pabd to the L2ARC. This means * that when compressed ARC is enabled that the L2ARC blocks are identical * to the on-disk block in the main data pool. This provides a significant * advantage since the ARC can leverage the bp's checksum when reading from the * L2ARC to determine if the contents are valid. However, if the compressed * ARC is disabled, then the L2ARC's block must be transformed to look * like the physical block in the main data pool before comparing the * checksum and determining its validity. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef _KERNEL #include #include #include #include #include #endif #include #include #include #include #include #include #ifndef _KERNEL /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */ boolean_t arc_watch = B_FALSE; #endif static kmutex_t arc_reclaim_lock; static kcondvar_t arc_reclaim_thread_cv; static boolean_t arc_reclaim_thread_exit; static kcondvar_t arc_reclaim_waiters_cv; /* * The number of headers to evict in arc_evict_state_impl() before * dropping the sublist lock and evicting from another sublist. A lower * value means we're more likely to evict the "correct" header (i.e. the * oldest header in the arc state), but comes with higher overhead * (i.e. more invocations of arc_evict_state_impl()). */ int zfs_arc_evict_batch_limit = 10; /* * The number of sublists used for each of the arc state lists. If this * is not set to a suitable value by the user, it will be configured to * the number of CPUs on the system in arc_init(). 
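 * For example, a machine with 16 CPUs that leaves this at 0 ends up
 * with 16 sublists per state (an illustration of the default, not a
 * recommendation).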
*/ int zfs_arc_num_sublists_per_state = 0; /* number of seconds before growing cache again */ static int arc_grow_retry = 5; /* shift of arc_c for calculating overflow limit in arc_get_data_impl */ int zfs_arc_overflow_shift = 8; /* shift of arc_c for calculating both min and max arc_p */ static int arc_p_min_shift = 4; /* log2(fraction of arc to reclaim) */ static int arc_shrink_shift = 7; /* * log2(fraction of ARC which must be free to allow growing). * I.e. if there is less than arc_c >> arc_no_grow_shift free memory, * when reading a new block into the ARC, we will evict an equal-sized block * from the ARC. * * This must be less than arc_shrink_shift, so that when we shrink the ARC, * we will still not allow it to grow. */ int arc_no_grow_shift = 5; /* * minimum lifespan of a prefetch block in clock ticks * (initialized in arc_init()) */ static int arc_min_prefetch_lifespan; /* * If this percent of memory is free, don't throttle. */ int arc_lotsfree_percent = 10; static int arc_dead; /* * The arc has filled available memory and has now warmed up. */ static boolean_t arc_warm; /* * log2 fraction of the zio arena to keep free. */ int arc_zio_arena_free_shift = 2; /* * These tunables are for performance analysis. */ unsigned long zfs_arc_max = 0; unsigned long zfs_arc_min = 0; unsigned long zfs_arc_meta_limit = 0; unsigned long zfs_arc_meta_min = 0; unsigned long zfs_arc_dnode_limit = 0; unsigned long zfs_arc_dnode_reduce_percent = 10; int zfs_arc_grow_retry = 0; int zfs_arc_shrink_shift = 0; int zfs_arc_p_min_shift = 0; int zfs_arc_average_blocksize = 8 * 1024; /* 8KB */ int zfs_compressed_arc_enabled = B_TRUE; /* * ARC will evict meta buffers that exceed arc_meta_limit. This * tunable makes arc_meta_limit adjustable for different workloads. */ unsigned long zfs_arc_meta_limit_percent = 75; /* * Percentage that can be consumed by dnodes of ARC meta buffers. */ unsigned long zfs_arc_dnode_limit_percent = 10; /* * These tunables are Linux specific */ unsigned long zfs_arc_sys_free = 0; int zfs_arc_min_prefetch_lifespan = 0; int zfs_arc_p_aggressive_disable = 1; int zfs_arc_p_dampener_disable = 1; int zfs_arc_meta_prune = 10000; int zfs_arc_meta_strategy = ARC_STRATEGY_META_BALANCED; int zfs_arc_meta_adjust_restarts = 4096; int zfs_arc_lotsfree_percent = 10; /* The 6 states: */ static arc_state_t ARC_anon; static arc_state_t ARC_mru; static arc_state_t ARC_mru_ghost; static arc_state_t ARC_mfu; static arc_state_t ARC_mfu_ghost; static arc_state_t ARC_l2c_only; typedef struct arc_stats { kstat_named_t arcstat_hits; kstat_named_t arcstat_misses; kstat_named_t arcstat_demand_data_hits; kstat_named_t arcstat_demand_data_misses; kstat_named_t arcstat_demand_metadata_hits; kstat_named_t arcstat_demand_metadata_misses; kstat_named_t arcstat_prefetch_data_hits; kstat_named_t arcstat_prefetch_data_misses; kstat_named_t arcstat_prefetch_metadata_hits; kstat_named_t arcstat_prefetch_metadata_misses; kstat_named_t arcstat_mru_hits; kstat_named_t arcstat_mru_ghost_hits; kstat_named_t arcstat_mfu_hits; kstat_named_t arcstat_mfu_ghost_hits; kstat_named_t arcstat_deleted; /* * Number of buffers that could not be evicted because the hash lock * was held by another thread. The lock may not necessarily be held * by something using the same buffer, since hash locks are shared * by multiple buffers.
*/ kstat_named_t arcstat_mutex_miss; /* * Number of buffers skipped because they have I/O in progress, are * indirect prefetch buffers that have not lived long enough, or are * not from the spa we're trying to evict from. */ kstat_named_t arcstat_evict_skip; /* * Number of times arc_evict_state() was unable to evict enough * buffers to reach its target amount. */ kstat_named_t arcstat_evict_not_enough; kstat_named_t arcstat_evict_l2_cached; kstat_named_t arcstat_evict_l2_eligible; kstat_named_t arcstat_evict_l2_ineligible; kstat_named_t arcstat_evict_l2_skip; kstat_named_t arcstat_hash_elements; kstat_named_t arcstat_hash_elements_max; kstat_named_t arcstat_hash_collisions; kstat_named_t arcstat_hash_chains; kstat_named_t arcstat_hash_chain_max; kstat_named_t arcstat_p; kstat_named_t arcstat_c; kstat_named_t arcstat_c_min; kstat_named_t arcstat_c_max; kstat_named_t arcstat_size; /* * Number of compressed bytes stored in the arc_buf_hdr_t's b_pabd. * Note that the compressed bytes may match the uncompressed bytes * if the block is either not compressed or compressed arc is disabled. */ kstat_named_t arcstat_compressed_size; /* * Uncompressed size of the data stored in b_pabd. If compressed * arc is disabled then this value will be identical to the stat * above. */ kstat_named_t arcstat_uncompressed_size; /* * Number of bytes stored in all the arc_buf_t's. This is classified * as "overhead" since this data is typically short-lived and will * be evicted from the arc when it becomes unreferenced unless the * zfs_keep_uncompressed_metadata or zfs_keep_uncompressed_level * values have been set (see comment in dbuf.c for more information). */ kstat_named_t arcstat_overhead_size; /* * Number of bytes consumed by internal ARC structures necessary * for tracking purposes; these structures are not actually * backed by ARC buffers. This includes arc_buf_hdr_t structures * (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only * caches), and arc_buf_t structures (allocated via arc_buf_t * cache). */ kstat_named_t arcstat_hdr_size; /* * Number of bytes consumed by ARC buffers of type equal to * ARC_BUFC_DATA. This is generally consumed by buffers backing * on disk user data (e.g. plain file contents). */ kstat_named_t arcstat_data_size; /* * Number of bytes consumed by ARC buffers of type equal to * ARC_BUFC_METADATA. This is generally consumed by buffers * backing on disk data that is used for internal ZFS * structures (e.g. ZAP, dnode, indirect blocks, etc). */ kstat_named_t arcstat_metadata_size; /* * Number of bytes consumed by dmu_buf_impl_t objects. */ kstat_named_t arcstat_dbuf_size; /* * Number of bytes consumed by dnode_t objects. */ kstat_named_t arcstat_dnode_size; /* * Number of bytes consumed by bonus buffers. */ kstat_named_t arcstat_bonus_size; /* * Total number of bytes consumed by ARC buffers residing in the * arc_anon state. This includes *all* buffers in the arc_anon * state; e.g. data, metadata, evictable, and unevictable buffers * are all included in this value. */ kstat_named_t arcstat_anon_size; /* * Number of bytes consumed by ARC buffers that meet the * following criteria: backing buffers of type ARC_BUFC_DATA, * residing in the arc_anon state, and are eligible for eviction * (e.g. have no outstanding holds on the buffer). */ kstat_named_t arcstat_anon_evictable_data; /* * Number of bytes consumed by ARC buffers that meet the * following criteria: backing buffers of type ARC_BUFC_METADATA, * residing in the arc_anon state, and are eligible for eviction * (e.g.
have no outstanding holds on the buffer). */ kstat_named_t arcstat_anon_evictable_metadata; /* * Total number of bytes consumed by ARC buffers residing in the * arc_mru state. This includes *all* buffers in the arc_mru * state; e.g. data, metadata, evictable, and unevictable buffers * are all included in this value. */ kstat_named_t arcstat_mru_size; /* * Number of bytes consumed by ARC buffers that meet the * following criteria: backing buffers of type ARC_BUFC_DATA, * residing in the arc_mru state, and are eligible for eviction * (e.g. have no outstanding holds on the buffer). */ kstat_named_t arcstat_mru_evictable_data; /* * Number of bytes consumed by ARC buffers that meet the * following criteria: backing buffers of type ARC_BUFC_METADATA, * residing in the arc_mru state, and are eligible for eviction * (e.g. have no outstanding holds on the buffer). */ kstat_named_t arcstat_mru_evictable_metadata; /* * Total number of bytes that *would have been* consumed by ARC * buffers in the arc_mru_ghost state. The key thing to note * here is that this size doesn't actually indicate * RAM consumption. The ghost lists only consist of headers and * don't actually have ARC buffers linked off of these headers. * Thus, *if* the headers had associated ARC buffers, these * buffers *would have* consumed this number of bytes. */ kstat_named_t arcstat_mru_ghost_size; /* * Number of bytes that *would have been* consumed by ARC * buffers that are eligible for eviction, of type * ARC_BUFC_DATA, and linked off the arc_mru_ghost state. */ kstat_named_t arcstat_mru_ghost_evictable_data; /* * Number of bytes that *would have been* consumed by ARC * buffers that are eligible for eviction, of type * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state. */ kstat_named_t arcstat_mru_ghost_evictable_metadata; /* * Total number of bytes consumed by ARC buffers residing in the * arc_mfu state. This includes *all* buffers in the arc_mfu * state; e.g. data, metadata, evictable, and unevictable buffers * are all included in this value. */ kstat_named_t arcstat_mfu_size; /* * Number of bytes consumed by ARC buffers that are eligible for * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu * state. */ kstat_named_t arcstat_mfu_evictable_data; /* * Number of bytes consumed by ARC buffers that are eligible for * eviction, of type ARC_BUFC_METADATA, and reside in the * arc_mfu state. */ kstat_named_t arcstat_mfu_evictable_metadata; /* * Total number of bytes that *would have been* consumed by ARC * buffers in the arc_mfu_ghost state. See the comment above * arcstat_mru_ghost_size for more details. */ kstat_named_t arcstat_mfu_ghost_size; /* * Number of bytes that *would have been* consumed by ARC * buffers that are eligible for eviction, of type * ARC_BUFC_DATA, and linked off the arc_mfu_ghost state. */ kstat_named_t arcstat_mfu_ghost_evictable_data; /* * Number of bytes that *would have been* consumed by ARC * buffers that are eligible for eviction, of type * ARC_BUFC_METADATA, and linked off the arc_mfu_ghost state.
*/ kstat_named_t arcstat_mfu_ghost_evictable_metadata; kstat_named_t arcstat_l2_hits; kstat_named_t arcstat_l2_misses; kstat_named_t arcstat_l2_feeds; kstat_named_t arcstat_l2_rw_clash; kstat_named_t arcstat_l2_read_bytes; kstat_named_t arcstat_l2_write_bytes; kstat_named_t arcstat_l2_writes_sent; kstat_named_t arcstat_l2_writes_done; kstat_named_t arcstat_l2_writes_error; kstat_named_t arcstat_l2_writes_lock_retry; kstat_named_t arcstat_l2_evict_lock_retry; kstat_named_t arcstat_l2_evict_reading; kstat_named_t arcstat_l2_evict_l1cached; kstat_named_t arcstat_l2_free_on_write; kstat_named_t arcstat_l2_abort_lowmem; kstat_named_t arcstat_l2_cksum_bad; kstat_named_t arcstat_l2_io_error; kstat_named_t arcstat_l2_size; kstat_named_t arcstat_l2_asize; kstat_named_t arcstat_l2_hdr_size; kstat_named_t arcstat_memory_throttle_count; kstat_named_t arcstat_memory_direct_count; kstat_named_t arcstat_memory_indirect_count; kstat_named_t arcstat_no_grow; kstat_named_t arcstat_tempreserve; kstat_named_t arcstat_loaned_bytes; kstat_named_t arcstat_prune; kstat_named_t arcstat_meta_used; kstat_named_t arcstat_meta_limit; kstat_named_t arcstat_dnode_limit; kstat_named_t arcstat_meta_max; kstat_named_t arcstat_meta_min; kstat_named_t arcstat_sync_wait_for_async; kstat_named_t arcstat_demand_hit_predictive_prefetch; kstat_named_t arcstat_need_free; kstat_named_t arcstat_sys_free; } arc_stats_t; static arc_stats_t arc_stats = { { "hits", KSTAT_DATA_UINT64 }, { "misses", KSTAT_DATA_UINT64 }, { "demand_data_hits", KSTAT_DATA_UINT64 }, { "demand_data_misses", KSTAT_DATA_UINT64 }, { "demand_metadata_hits", KSTAT_DATA_UINT64 }, { "demand_metadata_misses", KSTAT_DATA_UINT64 }, { "prefetch_data_hits", KSTAT_DATA_UINT64 }, { "prefetch_data_misses", KSTAT_DATA_UINT64 }, { "prefetch_metadata_hits", KSTAT_DATA_UINT64 }, { "prefetch_metadata_misses", KSTAT_DATA_UINT64 }, { "mru_hits", KSTAT_DATA_UINT64 }, { "mru_ghost_hits", KSTAT_DATA_UINT64 }, { "mfu_hits", KSTAT_DATA_UINT64 }, { "mfu_ghost_hits", KSTAT_DATA_UINT64 }, { "deleted", KSTAT_DATA_UINT64 }, { "mutex_miss", KSTAT_DATA_UINT64 }, { "evict_skip", KSTAT_DATA_UINT64 }, { "evict_not_enough", KSTAT_DATA_UINT64 }, { "evict_l2_cached", KSTAT_DATA_UINT64 }, { "evict_l2_eligible", KSTAT_DATA_UINT64 }, { "evict_l2_ineligible", KSTAT_DATA_UINT64 }, { "evict_l2_skip", KSTAT_DATA_UINT64 }, { "hash_elements", KSTAT_DATA_UINT64 }, { "hash_elements_max", KSTAT_DATA_UINT64 }, { "hash_collisions", KSTAT_DATA_UINT64 }, { "hash_chains", KSTAT_DATA_UINT64 }, { "hash_chain_max", KSTAT_DATA_UINT64 }, { "p", KSTAT_DATA_UINT64 }, { "c", KSTAT_DATA_UINT64 }, { "c_min", KSTAT_DATA_UINT64 }, { "c_max", KSTAT_DATA_UINT64 }, { "size", KSTAT_DATA_UINT64 }, { "compressed_size", KSTAT_DATA_UINT64 }, { "uncompressed_size", KSTAT_DATA_UINT64 }, { "overhead_size", KSTAT_DATA_UINT64 }, { "hdr_size", KSTAT_DATA_UINT64 }, { "data_size", KSTAT_DATA_UINT64 }, { "metadata_size", KSTAT_DATA_UINT64 }, { "dbuf_size", KSTAT_DATA_UINT64 }, { "dnode_size", KSTAT_DATA_UINT64 }, { "bonus_size", KSTAT_DATA_UINT64 }, { "anon_size", KSTAT_DATA_UINT64 }, { "anon_evictable_data", KSTAT_DATA_UINT64 }, { "anon_evictable_metadata", KSTAT_DATA_UINT64 }, { "mru_size", KSTAT_DATA_UINT64 }, { "mru_evictable_data", KSTAT_DATA_UINT64 }, { "mru_evictable_metadata", KSTAT_DATA_UINT64 }, { "mru_ghost_size", KSTAT_DATA_UINT64 }, { "mru_ghost_evictable_data", KSTAT_DATA_UINT64 }, { "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 }, { "mfu_size", KSTAT_DATA_UINT64 }, { "mfu_evictable_data", KSTAT_DATA_UINT64 }, { 
"mfu_evictable_metadata", KSTAT_DATA_UINT64 }, { "mfu_ghost_size", KSTAT_DATA_UINT64 }, { "mfu_ghost_evictable_data", KSTAT_DATA_UINT64 }, { "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 }, { "l2_hits", KSTAT_DATA_UINT64 }, { "l2_misses", KSTAT_DATA_UINT64 }, { "l2_feeds", KSTAT_DATA_UINT64 }, { "l2_rw_clash", KSTAT_DATA_UINT64 }, { "l2_read_bytes", KSTAT_DATA_UINT64 }, { "l2_write_bytes", KSTAT_DATA_UINT64 }, { "l2_writes_sent", KSTAT_DATA_UINT64 }, { "l2_writes_done", KSTAT_DATA_UINT64 }, { "l2_writes_error", KSTAT_DATA_UINT64 }, { "l2_writes_lock_retry", KSTAT_DATA_UINT64 }, { "l2_evict_lock_retry", KSTAT_DATA_UINT64 }, { "l2_evict_reading", KSTAT_DATA_UINT64 }, { "l2_evict_l1cached", KSTAT_DATA_UINT64 }, { "l2_free_on_write", KSTAT_DATA_UINT64 }, { "l2_abort_lowmem", KSTAT_DATA_UINT64 }, { "l2_cksum_bad", KSTAT_DATA_UINT64 }, { "l2_io_error", KSTAT_DATA_UINT64 }, { "l2_size", KSTAT_DATA_UINT64 }, { "l2_asize", KSTAT_DATA_UINT64 }, { "l2_hdr_size", KSTAT_DATA_UINT64 }, { "memory_throttle_count", KSTAT_DATA_UINT64 }, { "memory_direct_count", KSTAT_DATA_UINT64 }, { "memory_indirect_count", KSTAT_DATA_UINT64 }, { "arc_no_grow", KSTAT_DATA_UINT64 }, { "arc_tempreserve", KSTAT_DATA_UINT64 }, { "arc_loaned_bytes", KSTAT_DATA_UINT64 }, { "arc_prune", KSTAT_DATA_UINT64 }, { "arc_meta_used", KSTAT_DATA_UINT64 }, { "arc_meta_limit", KSTAT_DATA_UINT64 }, { "arc_dnode_limit", KSTAT_DATA_UINT64 }, { "arc_meta_max", KSTAT_DATA_UINT64 }, { "arc_meta_min", KSTAT_DATA_UINT64 }, { "sync_wait_for_async", KSTAT_DATA_UINT64 }, { "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 }, { "arc_need_free", KSTAT_DATA_UINT64 }, { "arc_sys_free", KSTAT_DATA_UINT64 } }; #define ARCSTAT(stat) (arc_stats.stat.value.ui64) #define ARCSTAT_INCR(stat, val) \ atomic_add_64(&arc_stats.stat.value.ui64, (val)) #define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1) #define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1) #define ARCSTAT_MAX(stat, val) { \ uint64_t m; \ while ((val) > (m = arc_stats.stat.value.ui64) && \ (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \ continue; \ } #define ARCSTAT_MAXSTAT(stat) \ ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64) /* * We define a macro to allow ARC hits/misses to be easily broken down by * two separate conditions, giving a total of four different subtypes for * each of hits and misses (so eight statistics total). */ #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \ if (cond1) { \ if (cond2) { \ ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \ } else { \ ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \ } \ } else { \ if (cond2) { \ ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \ } else { \ ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\ } \ } kstat_t *arc_ksp; static arc_state_t *arc_anon; static arc_state_t *arc_mru; static arc_state_t *arc_mru_ghost; static arc_state_t *arc_mfu; static arc_state_t *arc_mfu_ghost; static arc_state_t *arc_l2c_only; /* * There are several ARC variables that are critical to export as kstats -- * but we don't want to have to grovel around in the kstat whenever we wish to * manipulate them. For these variables, we therefore define them to be in * terms of the statistic variable. This assures that we are not introducing * the possibility of inconsistency by having shadow copies of the variables, * while still allowing the code to be readable. 
*/ #define arc_size ARCSTAT(arcstat_size) /* actual total arc size */ #define arc_p ARCSTAT(arcstat_p) /* target size of MRU */ #define arc_c ARCSTAT(arcstat_c) /* target size of cache */ #define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */ #define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */ #define arc_no_grow ARCSTAT(arcstat_no_grow) /* do not grow cache size */ #define arc_tempreserve ARCSTAT(arcstat_tempreserve) #define arc_loaned_bytes ARCSTAT(arcstat_loaned_bytes) #define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */ #define arc_dnode_limit ARCSTAT(arcstat_dnode_limit) /* max size for dnodes */ #define arc_meta_min ARCSTAT(arcstat_meta_min) /* min size for metadata */ #define arc_meta_used ARCSTAT(arcstat_meta_used) /* size of metadata */ #define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */ #define arc_dbuf_size ARCSTAT(arcstat_dbuf_size) /* dbuf metadata */ #define arc_dnode_size ARCSTAT(arcstat_dnode_size) /* dnode metadata */ #define arc_bonus_size ARCSTAT(arcstat_bonus_size) /* bonus buffer metadata */ #define arc_need_free ARCSTAT(arcstat_need_free) /* bytes to be freed */ #define arc_sys_free ARCSTAT(arcstat_sys_free) /* target system free bytes */ /* compressed size of entire arc */ #define arc_compressed_size ARCSTAT(arcstat_compressed_size) /* uncompressed size of entire arc */ #define arc_uncompressed_size ARCSTAT(arcstat_uncompressed_size) /* number of bytes in the arc from arc_buf_t's */ #define arc_overhead_size ARCSTAT(arcstat_overhead_size) static list_t arc_prune_list; static kmutex_t arc_prune_mtx; static taskq_t *arc_prune_taskq; #define GHOST_STATE(state) \ ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \ (state) == arc_l2c_only) #define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE) #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) #define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_FLAG_IO_ERROR) #define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_FLAG_PREFETCH) #define HDR_COMPRESSION_ENABLED(hdr) \ ((hdr)->b_flags & ARC_FLAG_COMPRESSED_ARC) #define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_FLAG_L2CACHE) #define HDR_L2_READING(hdr) \ (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) && \ ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)) #define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITING) #define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_FLAG_L2_EVICTED) #define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD) #define HDR_SHARED_DATA(hdr) ((hdr)->b_flags & ARC_FLAG_SHARED_DATA) #define HDR_ISTYPE_METADATA(hdr) \ ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA) #define HDR_ISTYPE_DATA(hdr) (!HDR_ISTYPE_METADATA(hdr)) #define HDR_HAS_L1HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L1HDR) #define HDR_HAS_L2HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR) /* For storing compression mode in b_flags */ #define HDR_COMPRESS_OFFSET (highbit64(ARC_FLAG_COMPRESS_0) - 1) #define HDR_GET_COMPRESS(hdr) ((enum zio_compress)BF32_GET((hdr)->b_flags, \ HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS)) #define HDR_SET_COMPRESS(hdr, cmp) BF32_SET((hdr)->b_flags, \ HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS, (cmp)); #define ARC_BUF_LAST(buf) ((buf)->b_next == NULL) #define ARC_BUF_SHARED(buf) ((buf)->b_flags & ARC_BUF_FLAG_SHARED) #define ARC_BUF_COMPRESSED(buf) ((buf)->b_flags & ARC_BUF_FLAG_COMPRESSED) /* * Other sizes */ #define HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) #define HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr)) /* * Hash table routines */ 
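/*
 * A sketch of how the pieces below fit together (illustrative only):
 * buf_hash() hashes the (spa, dva, birth) identity, BUF_HASH_INDEX()
 * masks that into a bucket, and BUF_HASH_LOCK() maps the bucket onto
 * one of BUF_LOCKS cacheline-padded mutexes, so independent buckets
 * rarely contend on the same lock.
 */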
#define HT_LOCK_ALIGN 64 #define HT_LOCK_PAD (P2NPHASE(sizeof (kmutex_t), (HT_LOCK_ALIGN))) struct ht_lock { kmutex_t ht_lock; #ifdef _KERNEL unsigned char pad[HT_LOCK_PAD]; #endif }; #define BUF_LOCKS 8192 typedef struct buf_hash_table { uint64_t ht_mask; arc_buf_hdr_t **ht_table; struct ht_lock ht_locks[BUF_LOCKS]; } buf_hash_table_t; static buf_hash_table_t buf_hash_table; #define BUF_HASH_INDEX(spa, dva, birth) \ (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask) #define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)]) #define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock)) #define HDR_LOCK(hdr) \ (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth))) uint64_t zfs_crc64_table[256]; /* * Level 2 ARC */ #define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */ #define L2ARC_HEADROOM 2 /* num of writes */ /* * If we discover during ARC scan any buffers to be compressed, we boost * our headroom for the next scanning cycle by this percentage multiple. */ #define L2ARC_HEADROOM_BOOST 200 #define L2ARC_FEED_SECS 1 /* caching interval secs */ #define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */ /* * We can feed L2ARC from two states of ARC buffers, mru and mfu, * and each of the states has two types: data and metadata. */ #define L2ARC_FEED_TYPES 4 #define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent) #define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done) /* L2ARC Performance Tunables */ unsigned long l2arc_write_max = L2ARC_WRITE_SIZE; /* def max write size */ unsigned long l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra warmup write */ unsigned long l2arc_headroom = L2ARC_HEADROOM; /* # of dev writes */ unsigned long l2arc_headroom_boost = L2ARC_HEADROOM_BOOST; unsigned long l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ unsigned long l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval msecs */ int l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ int l2arc_feed_again = B_TRUE; /* turbo warmup */ int l2arc_norw = B_FALSE; /* no reads during writes */ /* * L2ARC Internals */ static list_t L2ARC_dev_list; /* device list */ static list_t *l2arc_dev_list; /* device list pointer */ static kmutex_t l2arc_dev_mtx; /* device list mutex */ static l2arc_dev_t *l2arc_dev_last; /* last device used */ static list_t L2ARC_free_on_write; /* free after write buf list */ static list_t *l2arc_free_on_write; /* free after write list ptr */ static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */ static uint64_t l2arc_ndev; /* number of devices */ typedef struct l2arc_read_callback { arc_buf_hdr_t *l2rcb_hdr; /* read header */ blkptr_t l2rcb_bp; /* original blkptr */ zbookmark_phys_t l2rcb_zb; /* original bookmark */ int l2rcb_flags; /* original flags */ } l2arc_read_callback_t; typedef struct l2arc_data_free { /* protected by l2arc_free_on_write_mtx */ abd_t *l2df_abd; size_t l2df_size; arc_buf_contents_t l2df_type; list_node_t l2df_list_node; } l2arc_data_free_t; static kmutex_t l2arc_feed_thr_lock; static kcondvar_t l2arc_feed_thr_cv; static uint8_t l2arc_thread_exit; static abd_t *arc_get_data_abd(arc_buf_hdr_t *, uint64_t, void *); static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, void *); static void arc_get_data_impl(arc_buf_hdr_t *, uint64_t, void *); static void arc_free_data_abd(arc_buf_hdr_t *, abd_t *, uint64_t, void *); static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, void *); static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag); static void
arc_hdr_free_pabd(arc_buf_hdr_t *); static void arc_hdr_alloc_pabd(arc_buf_hdr_t *); static void arc_access(arc_buf_hdr_t *, kmutex_t *); static boolean_t arc_is_overflowing(void); static void arc_buf_watch(arc_buf_t *); static void arc_tuning_update(void); static void arc_prune_async(int64_t); static uint64_t arc_all_memory(void); static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *); static uint32_t arc_bufc_to_flags(arc_buf_contents_t); static inline void arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags); static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags); static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *); static void l2arc_read_done(zio_t *); static uint64_t buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) { uint8_t *vdva = (uint8_t *)dva; uint64_t crc = -1ULL; int i; ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); for (i = 0; i < sizeof (dva_t); i++) crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF]; crc ^= (spa>>8) ^ birth; return (crc); } #define HDR_EMPTY(hdr) \ ((hdr)->b_dva.dva_word[0] == 0 && \ (hdr)->b_dva.dva_word[1] == 0) #define HDR_EQUAL(spa, dva, birth, hdr) \ ((hdr)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ ((hdr)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ ((hdr)->b_birth == birth) && ((hdr)->b_spa == spa) static void buf_discard_identity(arc_buf_hdr_t *hdr) { hdr->b_dva.dva_word[0] = 0; hdr->b_dva.dva_word[1] = 0; hdr->b_birth = 0; } static arc_buf_hdr_t * buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp) { const dva_t *dva = BP_IDENTITY(bp); uint64_t birth = BP_PHYSICAL_BIRTH(bp); uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); kmutex_t *hash_lock = BUF_HASH_LOCK(idx); arc_buf_hdr_t *hdr; mutex_enter(hash_lock); for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL; hdr = hdr->b_hash_next) { if (HDR_EQUAL(spa, dva, birth, hdr)) { *lockp = hash_lock; return (hdr); } } mutex_exit(hash_lock); *lockp = NULL; return (NULL); } /* * Insert an entry into the hash table. If there is already an element * equal to elem in the hash table, then the already existing element * will be returned and the new element will not be inserted. * Otherwise returns NULL. * If lockp == NULL, the caller is assumed to already hold the hash lock. 
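* * For example, arc_hdr_realloc() passes lockp == NULL when re-inserting the reallocated header, since it already holds the header's hash lock at that point.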
*/ static arc_buf_hdr_t * buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp) { uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); kmutex_t *hash_lock = BUF_HASH_LOCK(idx); arc_buf_hdr_t *fhdr; uint32_t i; ASSERT(!DVA_IS_EMPTY(&hdr->b_dva)); ASSERT(hdr->b_birth != 0); ASSERT(!HDR_IN_HASH_TABLE(hdr)); if (lockp != NULL) { *lockp = hash_lock; mutex_enter(hash_lock); } else { ASSERT(MUTEX_HELD(hash_lock)); } for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL; fhdr = fhdr->b_hash_next, i++) { if (HDR_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr)) return (fhdr); } hdr->b_hash_next = buf_hash_table.ht_table[idx]; buf_hash_table.ht_table[idx] = hdr; arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE); /* collect some hash table performance data */ if (i > 0) { ARCSTAT_BUMP(arcstat_hash_collisions); if (i == 1) ARCSTAT_BUMP(arcstat_hash_chains); ARCSTAT_MAX(arcstat_hash_chain_max, i); } ARCSTAT_BUMP(arcstat_hash_elements); ARCSTAT_MAXSTAT(arcstat_hash_elements); return (NULL); } static void buf_hash_remove(arc_buf_hdr_t *hdr) { arc_buf_hdr_t *fhdr, **hdrp; uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx))); ASSERT(HDR_IN_HASH_TABLE(hdr)); hdrp = &buf_hash_table.ht_table[idx]; while ((fhdr = *hdrp) != hdr) { ASSERT3P(fhdr, !=, NULL); hdrp = &fhdr->b_hash_next; } *hdrp = hdr->b_hash_next; hdr->b_hash_next = NULL; arc_hdr_clear_flags(hdr, ARC_FLAG_IN_HASH_TABLE); /* collect some hash table performance data */ ARCSTAT_BUMPDOWN(arcstat_hash_elements); if (buf_hash_table.ht_table[idx] && buf_hash_table.ht_table[idx]->b_hash_next == NULL) ARCSTAT_BUMPDOWN(arcstat_hash_chains); } /* * Global data structures and functions for the buf kmem cache. */ static kmem_cache_t *hdr_full_cache; static kmem_cache_t *hdr_l2only_cache; static kmem_cache_t *buf_cache; static void buf_fini(void) { int i; #if defined(_KERNEL) && defined(HAVE_SPL) /* * Large allocations which do not require contiguous pages * should be using vmem_free() in the linux kernel. */ vmem_free(buf_hash_table.ht_table, (buf_hash_table.ht_mask + 1) * sizeof (void *)); #else kmem_free(buf_hash_table.ht_table, (buf_hash_table.ht_mask + 1) * sizeof (void *)); #endif for (i = 0; i < BUF_LOCKS; i++) mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); kmem_cache_destroy(hdr_full_cache); kmem_cache_destroy(hdr_l2only_cache); kmem_cache_destroy(buf_cache); } /* * Constructor callback - called when the cache is empty * and a new buf is requested. */ /* ARGSUSED */ static int hdr_full_cons(void *vbuf, void *unused, int kmflag) { arc_buf_hdr_t *hdr = vbuf; bzero(hdr, HDR_FULL_SIZE); cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL); refcount_create(&hdr->b_l1hdr.b_refcnt); mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); list_link_init(&hdr->b_l1hdr.b_arc_node); list_link_init(&hdr->b_l2hdr.b_l2node); multilist_link_init(&hdr->b_l1hdr.b_arc_node); arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS); return (0); } /* ARGSUSED */ static int hdr_l2only_cons(void *vbuf, void *unused, int kmflag) { arc_buf_hdr_t *hdr = vbuf; bzero(hdr, HDR_L2ONLY_SIZE); arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); return (0); } /* ARGSUSED */ static int buf_cons(void *vbuf, void *unused, int kmflag) { arc_buf_t *buf = vbuf; bzero(buf, sizeof (arc_buf_t)); mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL); arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS); return (0); } /* * Destructor callback - called when a cached buf is * no longer required.
*/ /* ARGSUSED */ static void hdr_full_dest(void *vbuf, void *unused) { arc_buf_hdr_t *hdr = vbuf; ASSERT(HDR_EMPTY(hdr)); cv_destroy(&hdr->b_l1hdr.b_cv); refcount_destroy(&hdr->b_l1hdr.b_refcnt); mutex_destroy(&hdr->b_l1hdr.b_freeze_lock); ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS); } /* ARGSUSED */ static void hdr_l2only_dest(void *vbuf, void *unused) { ASSERTV(arc_buf_hdr_t *hdr = vbuf); ASSERT(HDR_EMPTY(hdr)); arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); } /* ARGSUSED */ static void buf_dest(void *vbuf, void *unused) { arc_buf_t *buf = vbuf; mutex_destroy(&buf->b_evict_lock); arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS); } /* * Reclaim callback -- invoked when memory is low. */ /* ARGSUSED */ static void hdr_recl(void *unused) { dprintf("hdr_recl called\n"); /* * umem calls the reclaim func when we destroy the buf cache, * which is after we do arc_fini(). */ if (!arc_dead) cv_signal(&arc_reclaim_thread_cv); } static void buf_init(void) { uint64_t *ct = NULL; uint64_t hsize = 1ULL << 12; int i, j; /* * The hash table is big enough to fill all of physical memory * with an average block size of zfs_arc_average_blocksize (default 8K). * By default, the table will take up * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers). */ while (hsize * zfs_arc_average_blocksize < arc_all_memory()) hsize <<= 1; retry: buf_hash_table.ht_mask = hsize - 1; #if defined(_KERNEL) && defined(HAVE_SPL) /* * Large allocations which do not require contiguous pages * should be using vmem_alloc() in the linux kernel */ buf_hash_table.ht_table = vmem_zalloc(hsize * sizeof (void*), KM_SLEEP); #else buf_hash_table.ht_table = kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP); #endif if (buf_hash_table.ht_table == NULL) { ASSERT(hsize > (1ULL << 8)); hsize >>= 1; goto retry; } hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE, 0, hdr_full_cons, hdr_full_dest, hdr_recl, NULL, NULL, 0); hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only", HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, hdr_recl, NULL, NULL, 0); buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), 0, buf_cons, buf_dest, NULL, NULL, NULL, 0); for (i = 0; i < 256; i++) for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); for (i = 0; i < BUF_LOCKS; i++) { mutex_init(&buf_hash_table.ht_locks[i].ht_lock, NULL, MUTEX_DEFAULT, NULL); } } #define ARC_MINTIME (hz>>4) /* 62 ms */ /* * This is the size that the buf occupies in memory. If the buf is compressed, * it will correspond to the compressed size. You should use this method of * getting the buf size unless you explicitly need the logical size. */ uint64_t arc_buf_size(arc_buf_t *buf) { return (ARC_BUF_COMPRESSED(buf) ? HDR_GET_PSIZE(buf->b_hdr) : HDR_GET_LSIZE(buf->b_hdr)); } uint64_t arc_buf_lsize(arc_buf_t *buf) { return (HDR_GET_LSIZE(buf->b_hdr)); } enum zio_compress arc_get_compression(arc_buf_t *buf) { return (ARC_BUF_COMPRESSED(buf) ? 
HDR_GET_COMPRESS(buf->b_hdr) : ZIO_COMPRESS_OFF); } static inline boolean_t arc_buf_is_shared(arc_buf_t *buf) { boolean_t shared = (buf->b_data != NULL && buf->b_hdr->b_l1hdr.b_pabd != NULL && abd_is_linear(buf->b_hdr->b_l1hdr.b_pabd) && buf->b_data == abd_to_buf(buf->b_hdr->b_l1hdr.b_pabd)); IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr)); IMPLY(shared, ARC_BUF_SHARED(buf)); IMPLY(shared, ARC_BUF_COMPRESSED(buf) || ARC_BUF_LAST(buf)); /* * It would be nice to assert arc_can_share() too, but the "hdr isn't * already being shared" requirement prevents us from doing that. */ return (shared); } static inline void arc_cksum_free(arc_buf_hdr_t *hdr) { ASSERT(HDR_HAS_L1HDR(hdr)); mutex_enter(&hdr->b_l1hdr.b_freeze_lock); if (hdr->b_l1hdr.b_freeze_cksum != NULL) { kmem_free(hdr->b_l1hdr.b_freeze_cksum, sizeof (zio_cksum_t)); hdr->b_l1hdr.b_freeze_cksum = NULL; } mutex_exit(&hdr->b_l1hdr.b_freeze_lock); } /* * If we've turned on the ZFS_DEBUG_MODIFY flag, verify that the buf's data * matches the checksum that is stored in the hdr. If there is no checksum, * or if the buf is compressed, this is a no-op. */ static void arc_cksum_verify(arc_buf_t *buf) { arc_buf_hdr_t *hdr = buf->b_hdr; zio_cksum_t zc; if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; if (ARC_BUF_COMPRESSED(buf)) { return; } ASSERT(HDR_HAS_L1HDR(hdr)); mutex_enter(&hdr->b_l1hdr.b_freeze_lock); if (hdr->b_l1hdr.b_freeze_cksum == NULL || HDR_IO_ERROR(hdr)) { mutex_exit(&hdr->b_l1hdr.b_freeze_lock); return; } fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL, &zc); if (!ZIO_CHECKSUM_EQUAL(*hdr->b_l1hdr.b_freeze_cksum, zc)) panic("buffer modified while frozen!"); mutex_exit(&hdr->b_l1hdr.b_freeze_lock); } static boolean_t arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio) { enum zio_compress compress = BP_GET_COMPRESS(zio->io_bp); boolean_t valid_cksum; ASSERT(!BP_IS_EMBEDDED(zio->io_bp)); VERIFY3U(BP_GET_PSIZE(zio->io_bp), ==, HDR_GET_PSIZE(hdr)); /* * We rely on the blkptr's checksum to determine if the block * is valid or not. When compressed arc is enabled, the l2arc * writes the block to the l2arc just as it appears in the pool. * This allows us to use the blkptr's checksum to validate the * data that we just read off of the l2arc without having to store * a separate checksum in the arc_buf_hdr_t. However, if compressed * arc is disabled, then the data written to the l2arc is always * uncompressed and won't match the block as it exists in the main * pool. When this is the case, we must first compress it if it is * compressed on the main pool before we can validate the checksum. */ if (!HDR_COMPRESSION_ENABLED(hdr) && compress != ZIO_COMPRESS_OFF) { uint64_t lsize; uint64_t csize; void *cbuf; ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); cbuf = zio_buf_alloc(HDR_GET_PSIZE(hdr)); lsize = HDR_GET_LSIZE(hdr); csize = zio_compress_data(compress, zio->io_abd, cbuf, lsize); ASSERT3U(csize, <=, HDR_GET_PSIZE(hdr)); if (csize < HDR_GET_PSIZE(hdr)) { /* * Compressed blocks are always a multiple of the * smallest ashift in the pool. Ideally, we would * like to round up the csize to the next * spa_min_ashift but that value may have changed * since the block was last written. Instead, * we rely on the fact that the hdr's psize * was set to the psize of the block when it was * last written. We set the csize to that value * and zero out any part that should not contain * data. 
*/ bzero((char *)cbuf + csize, HDR_GET_PSIZE(hdr) - csize); csize = HDR_GET_PSIZE(hdr); } zio_push_transform(zio, cbuf, csize, HDR_GET_PSIZE(hdr), NULL); } /* * Block pointers always store the checksum for the logical data. * If the block pointer has the gang bit set, then the checksum * it represents is for the reconstituted data and not for an * individual gang member. The zio pipeline, however, must be able to * determine the checksum of each of the gang constituents so it * treats the checksum comparison differently than what we need * for l2arc blocks. This prevents us from using the * zio_checksum_error() interface directly. Instead we must call the * zio_checksum_error_impl() so that we can ensure the checksum is * generated using the correct checksum algorithm and accounts for the * logical I/O size and not just a gang fragment. */ valid_cksum = (zio_checksum_error_impl(zio->io_spa, zio->io_bp, BP_GET_CHECKSUM(zio->io_bp), zio->io_abd, zio->io_size, zio->io_offset, NULL) == 0); zio_pop_transforms(zio); return (valid_cksum); } /* * Given a buf full of data, if ZFS_DEBUG_MODIFY is enabled this computes a * checksum and attaches it to the buf's hdr so that we can ensure that the buf * isn't modified later on. If buf is compressed or there is already a checksum * on the hdr, this is a no-op (we only checksum uncompressed bufs). */ static void arc_cksum_compute(arc_buf_t *buf) { arc_buf_hdr_t *hdr = buf->b_hdr; if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; ASSERT(HDR_HAS_L1HDR(hdr)); mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); if (hdr->b_l1hdr.b_freeze_cksum != NULL) { mutex_exit(&hdr->b_l1hdr.b_freeze_lock); return; } else if (ARC_BUF_COMPRESSED(buf)) { mutex_exit(&hdr->b_l1hdr.b_freeze_lock); return; } ASSERT(!ARC_BUF_COMPRESSED(buf)); hdr->b_l1hdr.b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL, hdr->b_l1hdr.b_freeze_cksum); mutex_exit(&hdr->b_l1hdr.b_freeze_lock); arc_buf_watch(buf); } #ifndef _KERNEL void arc_buf_sigsegv(int sig, siginfo_t *si, void *unused) { panic("Got SIGSEGV at address: 0x%lx\n", (long)si->si_addr); } #endif /* ARGSUSED */ static void arc_buf_unwatch(arc_buf_t *buf) { #ifndef _KERNEL if (arc_watch) { ASSERT0(mprotect(buf->b_data, HDR_GET_LSIZE(buf->b_hdr), PROT_READ | PROT_WRITE)); } #endif } /* ARGSUSED */ static void arc_buf_watch(arc_buf_t *buf) { #ifndef _KERNEL if (arc_watch) ASSERT0(mprotect(buf->b_data, arc_buf_size(buf), PROT_READ)); #endif } static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *hdr) { arc_buf_contents_t type; if (HDR_ISTYPE_METADATA(hdr)) { type = ARC_BUFC_METADATA; } else { type = ARC_BUFC_DATA; } VERIFY3U(hdr->b_type, ==, type); return (type); } boolean_t arc_is_metadata(arc_buf_t *buf) { return (HDR_ISTYPE_METADATA(buf->b_hdr) != 0); } static uint32_t arc_bufc_to_flags(arc_buf_contents_t type) { switch (type) { case ARC_BUFC_DATA: /* metadata field is 0 if buffer contains normal data */ return (0); case ARC_BUFC_METADATA: return (ARC_FLAG_BUFC_METADATA); default: break; } panic("undefined ARC buffer type!"); return ((uint32_t)-1); } void arc_buf_thaw(arc_buf_t *buf) { arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); arc_cksum_verify(buf); /* * Compressed buffers do not manipulate the b_freeze_cksum or * allocate b_thawed. 
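* * (Only uncompressed bufs are checksummed, see arc_cksum_compute(), so for a compressed buf there is nothing to free or unwatch here.)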
*/ if (ARC_BUF_COMPRESSED(buf)) { return; } ASSERT(HDR_HAS_L1HDR(hdr)); arc_cksum_free(hdr); arc_buf_unwatch(buf); } void arc_buf_freeze(arc_buf_t *buf) { arc_buf_hdr_t *hdr = buf->b_hdr; kmutex_t *hash_lock; if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; if (ARC_BUF_COMPRESSED(buf)) { return; } hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(hdr->b_l1hdr.b_freeze_cksum != NULL || hdr->b_l1hdr.b_state == arc_anon); arc_cksum_compute(buf); mutex_exit(hash_lock); } /* * The arc_buf_hdr_t's b_flags should never be modified directly. Instead, * the following functions should be used to ensure that the flags are * updated in a thread-safe way. When manipulating the flags either * the hash_lock must be held or the hdr must be undiscoverable. This * ensures that we're not racing with any other threads when updating * the flags. */ static inline void arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags) { ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); hdr->b_flags |= flags; } static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags) { ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); hdr->b_flags &= ~flags; } /* * Setting the compression bits in the arc_buf_hdr_t's b_flags is * done in a special way since we have to clear and set bits * at the same time. Consumers that wish to set the compression bits * must use this function to ensure that the flags are updated in * a thread-safe manner. */ static void arc_hdr_set_compress(arc_buf_hdr_t *hdr, enum zio_compress cmp) { ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); /* * Holes and embedded blocks will always have a psize = 0, so * we ignore the compression of the blkptr and mark them as * uncompressed. */ if (!zfs_compressed_arc_enabled || HDR_GET_PSIZE(hdr) == 0) { arc_hdr_clear_flags(hdr, ARC_FLAG_COMPRESSED_ARC); HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF); ASSERT(!HDR_COMPRESSION_ENABLED(hdr)); ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); } else { arc_hdr_set_flags(hdr, ARC_FLAG_COMPRESSED_ARC); HDR_SET_COMPRESS(hdr, cmp); ASSERT3U(HDR_GET_COMPRESS(hdr), ==, cmp); ASSERT(HDR_COMPRESSION_ENABLED(hdr)); } } /* * Looks for another buf on the same hdr which has the data decompressed, copies * from it, and returns true. If no such buf exists, returns false. */ static boolean_t arc_buf_try_copy_decompressed_data(arc_buf_t *buf) { arc_buf_hdr_t *hdr = buf->b_hdr; arc_buf_t *from; boolean_t copied = B_FALSE; ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT3P(buf->b_data, !=, NULL); ASSERT(!ARC_BUF_COMPRESSED(buf)); for (from = hdr->b_l1hdr.b_buf; from != NULL; from = from->b_next) { /* can't use our own data buffer */ if (from == buf) { continue; } if (!ARC_BUF_COMPRESSED(from)) { bcopy(from->b_data, buf->b_data, arc_buf_size(buf)); copied = B_TRUE; break; } } /* * There were no decompressed bufs, so there should not be a * checksum on the hdr either. */ EQUIV(!copied, hdr->b_l1hdr.b_freeze_cksum == NULL); return (copied); } /* * Given a buf that has a data buffer attached to it, this function will * efficiently fill the buf with data of the specified compression setting from * the hdr and update the hdr's b_freeze_cksum if necessary. If the buf and hdr * are already sharing a data buf, no copy is performed. * * If the buf is marked as compressed but uncompressed data was requested, this * will allocate a new data buffer for the buf, remove that flag, and fill the * buf with uncompressed data.
You can't request a compressed buf on a hdr with * uncompressed data, and (since we haven't added support for it yet) if you * want compressed data your buf must already be marked as compressed and have * the correct-sized data buffer. */ static int arc_buf_fill(arc_buf_t *buf, boolean_t compressed) { arc_buf_hdr_t *hdr = buf->b_hdr; boolean_t hdr_compressed = (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF); dmu_object_byteswap_t bswap = hdr->b_l1hdr.b_byteswap; ASSERT3P(buf->b_data, !=, NULL); IMPLY(compressed, hdr_compressed); IMPLY(compressed, ARC_BUF_COMPRESSED(buf)); if (hdr_compressed == compressed) { if (!arc_buf_is_shared(buf)) { abd_copy_to_buf(buf->b_data, hdr->b_l1hdr.b_pabd, arc_buf_size(buf)); } } else { ASSERT(hdr_compressed); ASSERT(!compressed); ASSERT3U(HDR_GET_LSIZE(hdr), !=, HDR_GET_PSIZE(hdr)); /* * If the buf is sharing its data with the hdr, unlink it and * allocate a new data buffer for the buf. */ if (arc_buf_is_shared(buf)) { ASSERT(ARC_BUF_COMPRESSED(buf)); /* We need to give the buf its own b_data */ buf->b_flags &= ~ARC_BUF_FLAG_SHARED; buf->b_data = arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf); arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); /* Previously overhead was 0; just add new overhead */ ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr)); } else if (ARC_BUF_COMPRESSED(buf)) { /* We need to reallocate the buf's b_data */ arc_free_data_buf(hdr, buf->b_data, HDR_GET_PSIZE(hdr), buf); buf->b_data = arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf); /* We increased the size of b_data; update overhead */ ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr) - HDR_GET_PSIZE(hdr)); } /* * Regardless of the buf's previous compression settings, it * should not be compressed at the end of this function. */ buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED; /* * Try copying the data from another buf which already has a * decompressed version. If that's not possible, it's time to * bite the bullet and decompress the data from the hdr. */ if (arc_buf_try_copy_decompressed_data(buf)) { /* Skip byteswapping and checksumming (already done) */ ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, !=, NULL); return (0); } else { int error = zio_decompress_data(HDR_GET_COMPRESS(hdr), hdr->b_l1hdr.b_pabd, buf->b_data, HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr)); /* * Absent hardware errors or software bugs, this should * be impossible, but log it anyway so we can debug it. */ if (error != 0) { zfs_dbgmsg( "hdr %p, compress %d, psize %d, lsize %d", hdr, HDR_GET_COMPRESS(hdr), HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr)); return (SET_ERROR(EIO)); } } } /* Byteswap the buf's data if necessary */ if (bswap != DMU_BSWAP_NUMFUNCS) { ASSERT(!HDR_SHARED_DATA(hdr)); ASSERT3U(bswap, <, DMU_BSWAP_NUMFUNCS); dmu_ot_byteswap[bswap].ob_func(buf->b_data, HDR_GET_LSIZE(hdr)); } /* Compute the hdr's checksum if necessary */ arc_cksum_compute(buf); return (0); } int arc_decompress(arc_buf_t *buf) { return (arc_buf_fill(buf, B_FALSE)); } /* * Return the size of the block, b_pabd, that is stored in the arc_buf_hdr_t. */ static uint64_t arc_hdr_size(arc_buf_hdr_t *hdr) { uint64_t size; if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && HDR_GET_PSIZE(hdr) > 0) { size = HDR_GET_PSIZE(hdr); } else { ASSERT3U(HDR_GET_LSIZE(hdr), !=, 0); size = HDR_GET_LSIZE(hdr); } return (size); } /* * Increment the amount of evictable space in the arc_state_t's refcount. * We account for the space used by the hdr and the arc buf individually * so that we can add and remove them from the refcount individually.
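* * Shared bufs are skipped in the loop below: their b_data is the hdr's b_pabd, so counting them separately would double-count that space.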
*/ static void arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state) { arc_buf_contents_t type = arc_buf_type(hdr); arc_buf_t *buf; ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(state)) { ASSERT0(hdr->b_l1hdr.b_bufcnt); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); (void) refcount_add_many(&state->arcs_esize[type], HDR_GET_LSIZE(hdr), hdr); return; } ASSERT(!GHOST_STATE(state)); if (hdr->b_l1hdr.b_pabd != NULL) { (void) refcount_add_many(&state->arcs_esize[type], arc_hdr_size(hdr), hdr); } for (buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { if (arc_buf_is_shared(buf)) continue; (void) refcount_add_many(&state->arcs_esize[type], arc_buf_size(buf), buf); } } /* * Decrement the amount of evictable space in the arc_state_t's refcount. * We account for the space used by the hdr and the arc buf individually * so that we can add and remove them from the refcount individually. */ static void arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state) { arc_buf_contents_t type = arc_buf_type(hdr); arc_buf_t *buf; ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(state)) { ASSERT0(hdr->b_l1hdr.b_bufcnt); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); (void) refcount_remove_many(&state->arcs_esize[type], HDR_GET_LSIZE(hdr), hdr); return; } ASSERT(!GHOST_STATE(state)); if (hdr->b_l1hdr.b_pabd != NULL) { (void) refcount_remove_many(&state->arcs_esize[type], arc_hdr_size(hdr), hdr); } for (buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { if (arc_buf_is_shared(buf)) continue; (void) refcount_remove_many(&state->arcs_esize[type], arc_buf_size(buf), buf); } } /* * Add a reference to this hdr indicating that someone is actively * referencing that memory. When the refcount transitions from 0 to 1, * we remove it from the respective arc_state_t list to indicate that * it is not evictable. */ static void add_reference(arc_buf_hdr_t *hdr, void *tag) { arc_state_t *state; ASSERT(HDR_HAS_L1HDR(hdr)); if (!MUTEX_HELD(HDR_LOCK(hdr))) { ASSERT(hdr->b_l1hdr.b_state == arc_anon); ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); } state = hdr->b_l1hdr.b_state; if ((refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) && (state != arc_anon)) { /* We don't use the L2-only state list. */ if (state != arc_l2c_only) { multilist_remove(&state->arcs_list[arc_buf_type(hdr)], hdr); arc_evictable_space_decrement(hdr, state); } /* remove the prefetch flag if we get a reference */ arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH); } } /* * Remove a reference from this hdr. When the reference transitions from * 1 to 0 and we're not anonymous, then we add this hdr to the arc_state_t's * list making it eligible for eviction. */ static int remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) { int cnt; arc_state_t *state = hdr->b_l1hdr.b_state; ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(state == arc_anon || MUTEX_HELD(hash_lock)); ASSERT(!GHOST_STATE(state)); /* * arc_l2c_only counts as a ghost state so we don't need to explicitly * check to prevent usage of the arc_l2c_only list. */ if (((cnt = refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) && (state != arc_anon)) { multilist_insert(&state->arcs_list[arc_buf_type(hdr)], hdr); ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0); arc_evictable_space_increment(hdr, state); } return (cnt); } /* * Returns detailed information about a specific arc buffer. 
When the * state_index argument is set, the function will calculate the arc header * list position for its arc state. Since this requires a linear traversal, * callers are strongly encouraged not to do this. However, it can be helpful * for targeted analysis so the functionality is provided. */ void arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index) { arc_buf_hdr_t *hdr = ab->b_hdr; l1arc_buf_hdr_t *l1hdr = NULL; l2arc_buf_hdr_t *l2hdr = NULL; arc_state_t *state = NULL; memset(abi, 0, sizeof (arc_buf_info_t)); if (hdr == NULL) return; abi->abi_flags = hdr->b_flags; if (HDR_HAS_L1HDR(hdr)) { l1hdr = &hdr->b_l1hdr; state = l1hdr->b_state; } if (HDR_HAS_L2HDR(hdr)) l2hdr = &hdr->b_l2hdr; if (l1hdr) { abi->abi_bufcnt = l1hdr->b_bufcnt; abi->abi_access = l1hdr->b_arc_access; abi->abi_mru_hits = l1hdr->b_mru_hits; abi->abi_mru_ghost_hits = l1hdr->b_mru_ghost_hits; abi->abi_mfu_hits = l1hdr->b_mfu_hits; abi->abi_mfu_ghost_hits = l1hdr->b_mfu_ghost_hits; abi->abi_holds = refcount_count(&l1hdr->b_refcnt); } if (l2hdr) { abi->abi_l2arc_dattr = l2hdr->b_daddr; abi->abi_l2arc_hits = l2hdr->b_hits; } abi->abi_state_type = state ? state->arcs_state : ARC_STATE_ANON; abi->abi_state_contents = arc_buf_type(hdr); abi->abi_size = arc_hdr_size(hdr); } /* * Move the supplied buffer to the indicated state. The hash lock * for the buffer must be held by the caller. */ static void arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, kmutex_t *hash_lock) { arc_state_t *old_state; int64_t refcnt; uint32_t bufcnt; boolean_t update_old, update_new; arc_buf_contents_t buftype = arc_buf_type(hdr); /* * We almost always have an L1 hdr here, since we call arc_hdr_realloc() * in arc_read() when bringing a buffer out of the L2ARC. However, the * L1 hdr doesn't always exist when we change state to arc_anon before * destroying a header, in which case reallocating to add the L1 hdr is * pointless. */ if (HDR_HAS_L1HDR(hdr)) { old_state = hdr->b_l1hdr.b_state; refcnt = refcount_count(&hdr->b_l1hdr.b_refcnt); bufcnt = hdr->b_l1hdr.b_bufcnt; update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pabd != NULL); } else { old_state = arc_l2c_only; refcnt = 0; bufcnt = 0; update_old = B_FALSE; } update_new = update_old; ASSERT(MUTEX_HELD(hash_lock)); ASSERT3P(new_state, !=, old_state); ASSERT(!GHOST_STATE(new_state) || bufcnt == 0); ASSERT(old_state != arc_anon || bufcnt <= 1); /* * If this buffer is evictable, transfer it from the * old state list to the new state list. */ if (refcnt == 0) { if (old_state != arc_anon && old_state != arc_l2c_only) { ASSERT(HDR_HAS_L1HDR(hdr)); multilist_remove(&old_state->arcs_list[buftype], hdr); if (GHOST_STATE(old_state)) { ASSERT0(bufcnt); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); update_old = B_TRUE; } arc_evictable_space_decrement(hdr, old_state); } if (new_state != arc_anon && new_state != arc_l2c_only) { /* * An L1 header always exists here, since if we're * moving to some L1-cached state (i.e. not l2c_only or * anonymous), we realloc the header to add an L1hdr * beforehand.
*/ ASSERT(HDR_HAS_L1HDR(hdr)); multilist_insert(&new_state->arcs_list[buftype], hdr); if (GHOST_STATE(new_state)) { ASSERT0(bufcnt); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); update_new = B_TRUE; } arc_evictable_space_increment(hdr, new_state); } } ASSERT(!HDR_EMPTY(hdr)); if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr)) buf_hash_remove(hdr); /* adjust state sizes (ignore arc_l2c_only) */ if (update_new && new_state != arc_l2c_only) { ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(new_state)) { ASSERT0(bufcnt); /* * When moving a header to a ghost state, we first * remove all arc buffers. Thus, we'll have a * bufcnt of zero, and no arc buffer to use for * the reference. As a result, we use the arc * header pointer for the reference. */ (void) refcount_add_many(&new_state->arcs_size, HDR_GET_LSIZE(hdr), hdr); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); } else { arc_buf_t *buf; uint32_t buffers = 0; /* * Each individual buffer holds a unique reference, * thus we must remove each of these references one * at a time. */ for (buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { ASSERT3U(bufcnt, !=, 0); buffers++; /* * When the arc_buf_t is sharing the data * block with the hdr, the owner of the * reference belongs to the hdr. Only * add to the refcount if the arc_buf_t is * not shared. */ if (arc_buf_is_shared(buf)) continue; (void) refcount_add_many(&new_state->arcs_size, arc_buf_size(buf), buf); } ASSERT3U(bufcnt, ==, buffers); if (hdr->b_l1hdr.b_pabd != NULL) { (void) refcount_add_many(&new_state->arcs_size, arc_hdr_size(hdr), hdr); } else { ASSERT(GHOST_STATE(old_state)); } } } if (update_old && old_state != arc_l2c_only) { ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(old_state)) { ASSERT0(bufcnt); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); /* * When moving a header off of a ghost state, * the header will not contain any arc buffers. * We use the arc header pointer for the reference * which is exactly what we did when we put the * header on the ghost state. */ (void) refcount_remove_many(&old_state->arcs_size, HDR_GET_LSIZE(hdr), hdr); } else { arc_buf_t *buf; uint32_t buffers = 0; /* * Each individual buffer holds a unique reference, * thus we must remove each of these references one * at a time. */ for (buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { ASSERT3U(bufcnt, !=, 0); buffers++; /* * When the arc_buf_t is sharing the data * block with the hdr, the owner of the * reference belongs to the hdr. Only * add to the refcount if the arc_buf_t is * not shared. */ if (arc_buf_is_shared(buf)) continue; (void) refcount_remove_many( &old_state->arcs_size, arc_buf_size(buf), buf); } ASSERT3U(bufcnt, ==, buffers); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); (void) refcount_remove_many( &old_state->arcs_size, arc_hdr_size(hdr), hdr); } } if (HDR_HAS_L1HDR(hdr)) hdr->b_l1hdr.b_state = new_state; /* * L2 headers should never be on the L2 state list since they don't * have L1 headers allocated. 
*/ ASSERT(multilist_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]) && multilist_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA])); } void arc_space_consume(uint64_t space, arc_space_type_t type) { ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); switch (type) { default: break; case ARC_SPACE_DATA: ARCSTAT_INCR(arcstat_data_size, space); break; case ARC_SPACE_META: ARCSTAT_INCR(arcstat_metadata_size, space); break; case ARC_SPACE_BONUS: ARCSTAT_INCR(arcstat_bonus_size, space); break; case ARC_SPACE_DNODE: ARCSTAT_INCR(arcstat_dnode_size, space); break; case ARC_SPACE_DBUF: ARCSTAT_INCR(arcstat_dbuf_size, space); break; case ARC_SPACE_HDRS: ARCSTAT_INCR(arcstat_hdr_size, space); break; case ARC_SPACE_L2HDRS: ARCSTAT_INCR(arcstat_l2_hdr_size, space); break; } if (type != ARC_SPACE_DATA) ARCSTAT_INCR(arcstat_meta_used, space); atomic_add_64(&arc_size, space); } void arc_space_return(uint64_t space, arc_space_type_t type) { ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); switch (type) { default: break; case ARC_SPACE_DATA: ARCSTAT_INCR(arcstat_data_size, -space); break; case ARC_SPACE_META: ARCSTAT_INCR(arcstat_metadata_size, -space); break; case ARC_SPACE_BONUS: ARCSTAT_INCR(arcstat_bonus_size, -space); break; case ARC_SPACE_DNODE: ARCSTAT_INCR(arcstat_dnode_size, -space); break; case ARC_SPACE_DBUF: ARCSTAT_INCR(arcstat_dbuf_size, -space); break; case ARC_SPACE_HDRS: ARCSTAT_INCR(arcstat_hdr_size, -space); break; case ARC_SPACE_L2HDRS: ARCSTAT_INCR(arcstat_l2_hdr_size, -space); break; } if (type != ARC_SPACE_DATA) { ASSERT(arc_meta_used >= space); if (arc_meta_max < arc_meta_used) arc_meta_max = arc_meta_used; ARCSTAT_INCR(arcstat_meta_used, -space); } ASSERT(arc_size >= space); atomic_add_64(&arc_size, -space); } /* * Given a hdr and a buf, returns whether that buf can share its b_data buffer * with the hdr's b_pabd. */ static boolean_t arc_can_share(arc_buf_hdr_t *hdr, arc_buf_t *buf) { boolean_t hdr_compressed, buf_compressed; /* * The criteria for sharing a hdr's data are: * 1. the hdr's compression matches the buf's compression * 2. the hdr doesn't need to be byteswapped * 3. the hdr isn't already being shared * 4. the buf is either compressed or it is the last buf in the hdr list * * Criterion #4 maintains the invariant that shared uncompressed * bufs must be the final buf in the hdr's b_buf list. Reading this, you * might ask, "if a compressed buf is allocated first, won't that be the * last thing in the list?", but in that case it's impossible to create * a shared uncompressed buf anyway (because the hdr must be compressed * to have the compressed buf). You might also think that #3 is * sufficient to make this guarantee, however it's possible * (specifically in the rare L2ARC write race mentioned in * arc_buf_alloc_impl()) there will be an existing uncompressed buf that * is sharable, but wasn't at the time of its allocation. Rather than * allow a new shared uncompressed buf to be created and then shuffle * the list around to make it the last element, this simply disallows * sharing if the new buf isn't the first to be added. */ ASSERT3P(buf->b_hdr, ==, hdr); hdr_compressed = HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF; buf_compressed = ARC_BUF_COMPRESSED(buf) != 0; return (buf_compressed == hdr_compressed && hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS && !HDR_SHARED_DATA(hdr) && (ARC_BUF_LAST(buf) || ARC_BUF_COMPRESSED(buf))); } /* * Allocate a buf for this hdr. If you care about the data that's in the hdr, * or if you want a compressed buffer, pass those flags in. 
Returns 0 if the * copy was made successfully, or an error code otherwise. */ static int arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag, boolean_t compressed, boolean_t fill, arc_buf_t **ret) { arc_buf_t *buf; boolean_t can_share; ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT3U(HDR_GET_LSIZE(hdr), >, 0); VERIFY(hdr->b_type == ARC_BUFC_DATA || hdr->b_type == ARC_BUFC_METADATA); ASSERT3P(ret, !=, NULL); ASSERT3P(*ret, ==, NULL); hdr->b_l1hdr.b_mru_hits = 0; hdr->b_l1hdr.b_mru_ghost_hits = 0; hdr->b_l1hdr.b_mfu_hits = 0; hdr->b_l1hdr.b_mfu_ghost_hits = 0; hdr->b_l1hdr.b_l2_hits = 0; buf = *ret = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); buf->b_hdr = hdr; buf->b_data = NULL; buf->b_next = hdr->b_l1hdr.b_buf; buf->b_flags = 0; add_reference(hdr, tag); /* * We're about to change the hdr's b_flags. We must either * hold the hash_lock or be undiscoverable. */ ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); /* * Only honor requests for compressed bufs if the hdr is actually * compressed. */ if (compressed && HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) buf->b_flags |= ARC_BUF_FLAG_COMPRESSED; /* * Although the ARC should handle it correctly, levels above the ARC * should prevent us from having multiple compressed bufs off the same * hdr. To ensure we notice it if this behavior changes, we assert this * here the best we can. */ IMPLY(ARC_BUF_COMPRESSED(buf), !HDR_SHARED_DATA(hdr)); /* * If the hdr's data can be shared then we share the data buffer and * set the appropriate bit in the hdr's b_flags to indicate the hdr is * sharing its b_pabd with the arc_buf_t. Otherwise, we allocate a new * buffer to store the buf's data. * * There are two additional restrictions here because we're sharing * hdr -> buf instead of the usual buf -> hdr. First, the hdr can't be * actively involved in an L2ARC write, because if this buf is used by * an arc_write() then the hdr's data buffer will be released when the * write completes, even though the L2ARC write might still be using it. * Second, the hdr's ABD must be linear so that the buf's user doesn't * need to be ABD-aware. */ can_share = arc_can_share(hdr, buf) && !HDR_L2_WRITING(hdr) && abd_is_linear(hdr->b_l1hdr.b_pabd); /* Set up b_data and sharing */ if (can_share) { buf->b_data = abd_to_buf(hdr->b_l1hdr.b_pabd); buf->b_flags |= ARC_BUF_FLAG_SHARED; arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA); } else { buf->b_data = arc_get_data_buf(hdr, arc_buf_size(buf), buf); ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf)); } VERIFY3P(buf->b_data, !=, NULL); hdr->b_l1hdr.b_buf = buf; hdr->b_l1hdr.b_bufcnt += 1; /* * If the user wants the data from the hdr, we need to either copy or * decompress the data. */ if (fill) { return (arc_buf_fill(buf, ARC_BUF_COMPRESSED(buf) != 0)); } return (0); } static char *arc_onloan_tag = "onloan"; /* * Loan out an anonymous arc buffer. Loaned buffers are not counted as in * flight data by arc_tempreserve_space() until they are "returned". Loaned * buffers must be returned to the arc before they can be used by the DMU or * freed. */ arc_buf_t * arc_loan_buf(spa_t *spa, boolean_t is_metadata, int size) { arc_buf_t *buf = arc_alloc_buf(spa, arc_onloan_tag, is_metadata ? ARC_BUFC_METADATA : ARC_BUFC_DATA, size); atomic_add_64(&arc_loaned_bytes, size); return (buf); } arc_buf_t * arc_loan_compressed_buf(spa_t *spa, uint64_t psize, uint64_t lsize, enum zio_compress compression_type) { arc_buf_t *buf = arc_alloc_compressed_buf(spa, arc_onloan_tag, psize, lsize, compression_type); atomic_add_64(&arc_loaned_bytes, psize); return (buf); } /* * Return a loaned arc buffer to the arc.
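* * The refcount tag moves from arc_onloan_tag back to the caller's tag, and arc_loaned_bytes is decremented by the buf's size.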
*/ void arc_return_buf(arc_buf_t *buf, void *tag) { arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT3P(buf->b_data, !=, NULL); ASSERT(HDR_HAS_L1HDR(hdr)); (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag); (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); atomic_add_64(&arc_loaned_bytes, -arc_buf_size(buf)); } /* Detach an arc_buf from a dbuf (tag) */ void arc_loan_inuse_buf(arc_buf_t *buf, void *tag) { arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT3P(buf->b_data, !=, NULL); ASSERT(HDR_HAS_L1HDR(hdr)); (void) refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, tag); atomic_add_64(&arc_loaned_bytes, -arc_buf_size(buf)); } static void l2arc_free_abd_on_write(abd_t *abd, size_t size, arc_buf_contents_t type) { l2arc_data_free_t *df = kmem_alloc(sizeof (*df), KM_SLEEP); df->l2df_abd = abd; df->l2df_size = size; df->l2df_type = type; mutex_enter(&l2arc_free_on_write_mtx); list_insert_head(l2arc_free_on_write, df); mutex_exit(&l2arc_free_on_write_mtx); } static void arc_hdr_free_on_write(arc_buf_hdr_t *hdr) { arc_state_t *state = hdr->b_l1hdr.b_state; arc_buf_contents_t type = arc_buf_type(hdr); uint64_t size = arc_hdr_size(hdr); /* protected by hash lock, if in the hash table */ if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT(state != arc_anon && state != arc_l2c_only); (void) refcount_remove_many(&state->arcs_esize[type], size, hdr); } (void) refcount_remove_many(&state->arcs_size, size, hdr); l2arc_free_abd_on_write(hdr->b_l1hdr.b_pabd, size, type); } /* * Share the arc_buf_t's data with the hdr. Whenever we are sharing the * data buffer, we transfer the refcount ownership to the hdr and update * the appropriate kstats. */ static void arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) { ASSERT(arc_can_share(hdr, buf)); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); /* * Start sharing the data buffer. We transfer the * refcount ownership to the hdr since it always owns * the refcount whenever an arc_buf_t is shared. */ refcount_transfer_ownership(&hdr->b_l1hdr.b_state->arcs_size, buf, hdr); hdr->b_l1hdr.b_pabd = abd_get_from_buf(buf->b_data, arc_buf_size(buf)); abd_take_ownership_of_buf(hdr->b_l1hdr.b_pabd, HDR_ISTYPE_METADATA(hdr)); arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA); buf->b_flags |= ARC_BUF_FLAG_SHARED; /* * Since we've transferred ownership to the hdr we need * to increment its compressed and uncompressed kstats and * decrement the overhead size. */ ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr)); ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr)); ARCSTAT_INCR(arcstat_overhead_size, -arc_buf_size(buf)); } static void arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) { ASSERT(arc_buf_is_shared(buf)); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); /* * We are no longer sharing this buffer so we need * to transfer its ownership to the rightful owner. */ refcount_transfer_ownership(&hdr->b_l1hdr.b_state->arcs_size, hdr, buf); arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); abd_release_ownership_of_buf(hdr->b_l1hdr.b_pabd); abd_put(hdr->b_l1hdr.b_pabd); hdr->b_l1hdr.b_pabd = NULL; buf->b_flags &= ~ARC_BUF_FLAG_SHARED; /* * Since the buffer is no longer shared between * the arc buf and the hdr, count it as overhead. 
*/ ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr)); ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr)); ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf)); } /* * Remove an arc_buf_t from the hdr's buf list and return the last * arc_buf_t on the list. If no buffers remain on the list then return * NULL. */ static arc_buf_t * arc_buf_remove(arc_buf_hdr_t *hdr, arc_buf_t *buf) { arc_buf_t **bufp = &hdr->b_l1hdr.b_buf; arc_buf_t *lastbuf = NULL; ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); /* * Remove the buf from the hdr list and locate the last * remaining buffer on the list. */ while (*bufp != NULL) { if (*bufp == buf) *bufp = buf->b_next; /* * If we've removed a buffer in the middle of * the list then update the lastbuf and update * bufp. */ if (*bufp != NULL) { lastbuf = *bufp; bufp = &(*bufp)->b_next; } } buf->b_next = NULL; ASSERT3P(lastbuf, !=, buf); IMPLY(hdr->b_l1hdr.b_bufcnt > 0, lastbuf != NULL); IMPLY(hdr->b_l1hdr.b_bufcnt > 0, hdr->b_l1hdr.b_buf != NULL); IMPLY(lastbuf != NULL, ARC_BUF_LAST(lastbuf)); return (lastbuf); } /* * Free up buf->b_data and pull the arc_buf_t off of the arc_buf_hdr_t's * list and free it. */ static void arc_buf_destroy_impl(arc_buf_t *buf) { arc_buf_t *lastbuf; arc_buf_hdr_t *hdr = buf->b_hdr; /* * Free up the data associated with the buf but only if we're not * sharing this with the hdr. If we are sharing it with the hdr, the * hdr is responsible for doing the free. */ if (buf->b_data != NULL) { /* * We're about to change the hdr's b_flags. We must either * hold the hash_lock or be undiscoverable. */ ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); arc_cksum_verify(buf); arc_buf_unwatch(buf); if (arc_buf_is_shared(buf)) { arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); } else { uint64_t size = arc_buf_size(buf); arc_free_data_buf(hdr, buf->b_data, size, buf); ARCSTAT_INCR(arcstat_overhead_size, -size); } buf->b_data = NULL; ASSERT(hdr->b_l1hdr.b_bufcnt > 0); hdr->b_l1hdr.b_bufcnt -= 1; } lastbuf = arc_buf_remove(hdr, buf); if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) { /* * If the current arc_buf_t is sharing its data buffer with the * hdr, then reassign the hdr's b_pabd to share it with the new * buffer at the end of the list. The shared buffer is always * the last one on the hdr's buffer list. * * There is an equivalent case for compressed bufs, but since * they aren't guaranteed to be the last buf in the list and * that is an exceedingly rare case, we just allow that space * to be wasted temporarily. */ if (lastbuf != NULL) { /* Only one buf can be shared at once */ VERIFY(!arc_buf_is_shared(lastbuf)); /* hdr is uncompressed so can't have compressed buf */ VERIFY(!ARC_BUF_COMPRESSED(lastbuf)); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); arc_hdr_free_pabd(hdr); /* * We must set up a new shared block between the * last buffer and the hdr. The data would have * been allocated by the arc buf so we need to transfer * ownership to the hdr since it's now being shared. */ arc_share_buf(hdr, lastbuf); } } else if (HDR_SHARED_DATA(hdr)) { /* * Uncompressed shared buffers are always at the end * of the list. Compressed buffers don't have the * same requirements. This makes it hard to * simply assert that the lastbuf is shared so * we rely on the hdr's compression flags to determine * if we have a compressed, shared buffer.
*/ ASSERT3P(lastbuf, !=, NULL); ASSERT(arc_buf_is_shared(lastbuf) || HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF); } if (hdr->b_l1hdr.b_bufcnt == 0) arc_cksum_free(hdr); /* clean up the buf */ buf->b_hdr = NULL; kmem_cache_free(buf_cache, buf); } static void arc_hdr_alloc_pabd(arc_buf_hdr_t *hdr) { ASSERT3U(HDR_GET_LSIZE(hdr), >, 0); ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(!HDR_SHARED_DATA(hdr)); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); hdr->b_l1hdr.b_pabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr); hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr)); ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr)); } static void arc_hdr_free_pabd(arc_buf_hdr_t *hdr) { ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); /* * If the hdr is currently being written to the l2arc then * we defer freeing the data by adding it to the l2arc_free_on_write * list. The l2arc will free the data once it's finished * writing it to the l2arc device. */ if (HDR_L2_WRITING(hdr)) { arc_hdr_free_on_write(hdr); ARCSTAT_BUMP(arcstat_l2_free_on_write); } else { arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, arc_hdr_size(hdr), hdr); } hdr->b_l1hdr.b_pabd = NULL; hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr)); ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr)); } static arc_buf_hdr_t * arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize, enum zio_compress compression_type, arc_buf_contents_t type) { arc_buf_hdr_t *hdr; VERIFY(type == ARC_BUFC_DATA || type == ARC_BUFC_METADATA); hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); ASSERT(HDR_EMPTY(hdr)); ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); HDR_SET_PSIZE(hdr, psize); HDR_SET_LSIZE(hdr, lsize); hdr->b_spa = spa; hdr->b_type = type; hdr->b_flags = 0; arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L1HDR); arc_hdr_set_compress(hdr, compression_type); hdr->b_l1hdr.b_state = arc_anon; hdr->b_l1hdr.b_arc_access = 0; hdr->b_l1hdr.b_bufcnt = 0; hdr->b_l1hdr.b_buf = NULL; /* * Allocate the hdr's buffer. This will contain either * the compressed or uncompressed data depending on the block * it references and compressed arc enablement. */ arc_hdr_alloc_pabd(hdr); ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); return (hdr); } /* * Transition between the two allocation states for the arc_buf_hdr struct. * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller * version is used when a cache buffer is only in the L2ARC in order to reduce * memory usage. */ static arc_buf_hdr_t * arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) { arc_buf_hdr_t *nhdr; l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; ASSERT(HDR_HAS_L2HDR(hdr)); ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) || (old == hdr_l2only_cache && new == hdr_full_cache)); nhdr = kmem_cache_alloc(new, KM_PUSHPAGE); ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); buf_hash_remove(hdr); bcopy(hdr, nhdr, HDR_L2ONLY_SIZE); if (new == hdr_full_cache) { arc_hdr_set_flags(nhdr, ARC_FLAG_HAS_L1HDR); /* * arc_access and arc_change_state need to be aware that a * header has just come out of L2ARC, so we set its state to * l2c_only even though it's about to change. 
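* * (arc_change_state() special-cases arc_l2c_only, so the header is not placed on, or removed from, any eviction list while in this transient state.)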
*/ nhdr->b_l1hdr.b_state = arc_l2c_only; /* Verify previous threads set to NULL before freeing */ ASSERT3P(nhdr->b_l1hdr.b_pabd, ==, NULL); } else { ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); ASSERT0(hdr->b_l1hdr.b_bufcnt); ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); /* * If we've reached here, we must have been called from * arc_evict_hdr(), as such we should have already been * removed from any ghost list we were previously on * (which protects us from racing with arc_evict_state), * thus no locking is needed during this check. */ ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); /* * A buffer must not be moved into the arc_l2c_only * state if it's not finished being written out to the * l2arc device. Otherwise, the b_l1hdr.b_pabd field * might try to be accessed, even though it was removed. */ VERIFY(!HDR_L2_WRITING(hdr)); VERIFY3P(hdr->b_l1hdr.b_pabd, ==, NULL); arc_hdr_clear_flags(nhdr, ARC_FLAG_HAS_L1HDR); } /* * The header has been reallocated so we need to re-insert it into any * lists it was on. */ (void) buf_hash_insert(nhdr, NULL); ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node)); mutex_enter(&dev->l2ad_mtx); /* * We must place the realloc'ed header back into the list at * the same spot. Otherwise, if it's placed earlier in the list, * l2arc_write_buffers() could find it during the function's * write phase, and try to write it out to the l2arc. */ list_insert_after(&dev->l2ad_buflist, hdr, nhdr); list_remove(&dev->l2ad_buflist, hdr); mutex_exit(&dev->l2ad_mtx); /* * Since we're using the pointer address as the tag when * incrementing and decrementing the l2ad_alloc refcount, we * must remove the old pointer (that we're about to destroy) and * add the new pointer to the refcount. Otherwise we'd remove * the wrong pointer address when calling arc_hdr_destroy() later. */ (void) refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr); (void) refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(nhdr), nhdr); buf_discard_identity(hdr); kmem_cache_free(old, hdr); return (nhdr); } /* * Allocate a new arc_buf_hdr_t and arc_buf_t and return the buf to the caller. * The buf is returned thawed since we expect the consumer to modify it. */ arc_buf_t * arc_alloc_buf(spa_t *spa, void *tag, arc_buf_contents_t type, int32_t size) { arc_buf_t *buf; arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), size, size, ZIO_COMPRESS_OFF, type); ASSERT(!MUTEX_HELD(HDR_LOCK(hdr))); buf = NULL; VERIFY0(arc_buf_alloc_impl(hdr, tag, B_FALSE, B_FALSE, &buf)); arc_buf_thaw(buf); return (buf); } /* * Allocate a compressed buf in the same manner as arc_alloc_buf. Don't use this * for bufs containing metadata. */ arc_buf_t * arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize, enum zio_compress compression_type) { arc_buf_hdr_t *hdr; arc_buf_t *buf; ASSERT3U(lsize, >, 0); ASSERT3U(lsize, >=, psize); ASSERT(compression_type > ZIO_COMPRESS_OFF); ASSERT(compression_type < ZIO_COMPRESS_FUNCTIONS); hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, compression_type, ARC_BUFC_DATA); ASSERT(!MUTEX_HELD(HDR_LOCK(hdr))); buf = NULL; VERIFY0(arc_buf_alloc_impl(hdr, tag, B_TRUE, B_FALSE, &buf)); arc_buf_thaw(buf); ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); if (!arc_buf_is_shared(buf)) { /* * To ensure that the hdr has the correct data in it if we call * arc_decompress() on this buf before it's been written to * disk, it's easiest if we just set up sharing between the * buf and the hdr.
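* * That is what happens below: arc_hdr_free_pabd() releases the hdr's own allocation, and arc_share_buf() points b_pabd at this buf's b_data instead.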
*/ ASSERT(!abd_is_linear(hdr->b_l1hdr.b_pabd)); arc_hdr_free_pabd(hdr); arc_share_buf(hdr, buf); } return (buf); } static void arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr) { l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr; l2arc_dev_t *dev = l2hdr->b_dev; uint64_t asize = arc_hdr_size(hdr); ASSERT(MUTEX_HELD(&dev->l2ad_mtx)); ASSERT(HDR_HAS_L2HDR(hdr)); list_remove(&dev->l2ad_buflist, hdr); ARCSTAT_INCR(arcstat_l2_asize, -asize); ARCSTAT_INCR(arcstat_l2_size, -HDR_GET_LSIZE(hdr)); vdev_space_update(dev->l2ad_vdev, -asize, 0, 0); (void) refcount_remove_many(&dev->l2ad_alloc, asize, hdr); arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR); } static void arc_hdr_destroy(arc_buf_hdr_t *hdr) { if (HDR_HAS_L1HDR(hdr)) { ASSERT(hdr->b_l1hdr.b_buf == NULL || hdr->b_l1hdr.b_bufcnt > 0); ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); } ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT(!HDR_IN_HASH_TABLE(hdr)); if (!HDR_EMPTY(hdr)) buf_discard_identity(hdr); if (HDR_HAS_L2HDR(hdr)) { l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx); if (!buflist_held) mutex_enter(&dev->l2ad_mtx); /* * Even though we checked this conditional above, we * need to check this again now that we have the * l2ad_mtx. This is because we could be racing with * another thread calling l2arc_evict() which might have * destroyed this header's L2 portion as we were waiting * to acquire the l2ad_mtx. If that happens, we don't * want to re-destroy the header's L2 portion. */ if (HDR_HAS_L2HDR(hdr)) arc_hdr_l2hdr_destroy(hdr); if (!buflist_held) mutex_exit(&dev->l2ad_mtx); } if (HDR_HAS_L1HDR(hdr)) { arc_cksum_free(hdr); while (hdr->b_l1hdr.b_buf != NULL) arc_buf_destroy_impl(hdr->b_l1hdr.b_buf); if (hdr->b_l1hdr.b_pabd != NULL) arc_hdr_free_pabd(hdr); } ASSERT3P(hdr->b_hash_next, ==, NULL); if (HDR_HAS_L1HDR(hdr)) { ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); kmem_cache_free(hdr_full_cache, hdr); } else { kmem_cache_free(hdr_l2only_cache, hdr); } } void arc_buf_destroy(arc_buf_t *buf, void* tag) { arc_buf_hdr_t *hdr = buf->b_hdr; kmutex_t *hash_lock = HDR_LOCK(hdr); if (hdr->b_l1hdr.b_state == arc_anon) { ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); VERIFY0(remove_reference(hdr, NULL, tag)); arc_hdr_destroy(hdr); return; } mutex_enter(hash_lock); ASSERT3P(hdr, ==, buf->b_hdr); ASSERT(hdr->b_l1hdr.b_bufcnt > 0); ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); ASSERT3P(hdr->b_l1hdr.b_state, !=, arc_anon); ASSERT3P(buf->b_data, !=, NULL); (void) remove_reference(hdr, hash_lock, tag); arc_buf_destroy_impl(buf); mutex_exit(hash_lock); } /* * Evict the arc_buf_hdr that is provided as a parameter. The resultant * state of the header is dependent on its state prior to entering this * function. The following transitions are possible: * * - arc_mru -> arc_mru_ghost * - arc_mfu -> arc_mfu_ghost * - arc_mru_ghost -> arc_l2c_only * - arc_mru_ghost -> deleted * - arc_mfu_ghost -> arc_l2c_only * - arc_mfu_ghost -> deleted */ static int64_t arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) { arc_state_t *evicted_state, *state; int64_t bytes_evicted = 0; ASSERT(MUTEX_HELD(hash_lock)); ASSERT(HDR_HAS_L1HDR(hdr)); state = hdr->b_l1hdr.b_state; if (GHOST_STATE(state)) { ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); /* * l2arc_write_buffers() relies on a header's L1 portion * (i.e. its b_pabd field) during its write phase.
* Thus, we cannot push a header onto the arc_l2c_only * state (removing its L1 piece) until the header is * done being written to the l2arc. */ if (HDR_HAS_L2HDR(hdr) && HDR_L2_WRITING(hdr)) { ARCSTAT_BUMP(arcstat_evict_l2_skip); return (bytes_evicted); } ARCSTAT_BUMP(arcstat_deleted); bytes_evicted += HDR_GET_LSIZE(hdr); DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr); if (HDR_HAS_L2HDR(hdr)) { ASSERT(hdr->b_l1hdr.b_pabd == NULL); /* * This buffer is cached on the 2nd Level ARC; * don't destroy the header. */ arc_change_state(arc_l2c_only, hdr, hash_lock); /* * dropping from L1+L2 cached to L2-only, * realloc to remove the L1 header. */ hdr = arc_hdr_realloc(hdr, hdr_full_cache, hdr_l2only_cache); } else { arc_change_state(arc_anon, hdr, hash_lock); arc_hdr_destroy(hdr); } return (bytes_evicted); } ASSERT(state == arc_mru || state == arc_mfu); evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; /* prefetch buffers have a minimum lifespan */ if (HDR_IO_IN_PROGRESS(hdr) || ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) && ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access < arc_min_prefetch_lifespan)) { ARCSTAT_BUMP(arcstat_evict_skip); return (bytes_evicted); } ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt)); while (hdr->b_l1hdr.b_buf) { arc_buf_t *buf = hdr->b_l1hdr.b_buf; if (!mutex_tryenter(&buf->b_evict_lock)) { ARCSTAT_BUMP(arcstat_mutex_miss); break; } if (buf->b_data != NULL) bytes_evicted += HDR_GET_LSIZE(hdr); mutex_exit(&buf->b_evict_lock); arc_buf_destroy_impl(buf); } if (HDR_HAS_L2HDR(hdr)) { ARCSTAT_INCR(arcstat_evict_l2_cached, HDR_GET_LSIZE(hdr)); } else { if (l2arc_write_eligible(hdr->b_spa, hdr)) { ARCSTAT_INCR(arcstat_evict_l2_eligible, HDR_GET_LSIZE(hdr)); } else { ARCSTAT_INCR(arcstat_evict_l2_ineligible, HDR_GET_LSIZE(hdr)); } } if (hdr->b_l1hdr.b_bufcnt == 0) { arc_cksum_free(hdr); bytes_evicted += arc_hdr_size(hdr); /* * If this hdr is being evicted and has a compressed * buffer then we discard it here before we change states. * This ensures that the accounting is updated correctly * in arc_free_data_impl(). */ arc_hdr_free_pabd(hdr); arc_change_state(evicted_state, hdr, hash_lock); ASSERT(HDR_IN_HASH_TABLE(hdr)); arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE); DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr); } return (bytes_evicted); } static uint64_t arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, uint64_t spa, int64_t bytes) { multilist_sublist_t *mls; uint64_t bytes_evicted = 0; arc_buf_hdr_t *hdr; kmutex_t *hash_lock; int evict_count = 0; ASSERT3P(marker, !=, NULL); IMPLY(bytes < 0, bytes == ARC_EVICT_ALL); mls = multilist_sublist_lock(ml, idx); for (hdr = multilist_sublist_prev(mls, marker); hdr != NULL; hdr = multilist_sublist_prev(mls, marker)) { if ((bytes != ARC_EVICT_ALL && bytes_evicted >= bytes) || (evict_count >= zfs_arc_evict_batch_limit)) break; /* * To keep our iteration location, move the marker * forward. Since we're not holding hdr's hash lock, we * must be very careful and not remove 'hdr' from the * sublist. Otherwise, other consumers might mistake the * 'hdr' as not being on a sublist when they call the * multilist_link_active() function (they all rely on * the hash lock protecting concurrent insertions and * removals). multilist_sublist_move_forward() was * specifically implemented to ensure this is the case * (only 'marker' will be removed and re-inserted). 
*/ multilist_sublist_move_forward(mls, marker); /* * The only case where the b_spa field should ever be * zero, is the marker headers inserted by * arc_evict_state(). It's possible for multiple threads * to be calling arc_evict_state() concurrently (e.g. * dsl_pool_close() and zio_inject_fault()), so we must * skip any markers we see from these other threads. */ if (hdr->b_spa == 0) continue; /* we're only interested in evicting buffers of a certain spa */ if (spa != 0 && hdr->b_spa != spa) { ARCSTAT_BUMP(arcstat_evict_skip); continue; } hash_lock = HDR_LOCK(hdr); /* * We aren't calling this function from any code path * that would already be holding a hash lock, so we're * asserting on this assumption to be defensive in case * this ever changes. Without this check, it would be * possible to incorrectly increment arcstat_mutex_miss * below (e.g. if the code changed such that we called * this function with a hash lock held). */ ASSERT(!MUTEX_HELD(hash_lock)); if (mutex_tryenter(hash_lock)) { uint64_t evicted = arc_evict_hdr(hdr, hash_lock); mutex_exit(hash_lock); bytes_evicted += evicted; /* * If evicted is zero, arc_evict_hdr() must have * decided to skip this header, don't increment * evict_count in this case. */ if (evicted != 0) evict_count++; /* * If arc_size isn't overflowing, signal any * threads that might happen to be waiting. * * For each header evicted, we wake up a single * thread. If we used cv_broadcast, we could * wake up "too many" threads causing arc_size * to significantly overflow arc_c; since * arc_get_data_impl() doesn't check for overflow * when it's woken up (it doesn't because it's * possible for the ARC to be overflowing while * full of un-evictable buffers, and the * function should proceed in this case). * * If threads are left sleeping, due to not * using cv_broadcast, they will be woken up * just before arc_reclaim_thread() sleeps. */ mutex_enter(&arc_reclaim_lock); if (!arc_is_overflowing()) cv_signal(&arc_reclaim_waiters_cv); mutex_exit(&arc_reclaim_lock); } else { ARCSTAT_BUMP(arcstat_mutex_miss); } } multilist_sublist_unlock(mls); return (bytes_evicted); } /* * Evict buffers from the given arc state, until we've removed the * specified number of bytes. Move the removed buffers to the * appropriate evict state. * * This function makes a "best effort". It skips over any buffers * it can't get a hash_lock on, and so, may not catch all candidates. * It may also return without evicting as much space as requested. * * If bytes is specified using the special value ARC_EVICT_ALL, this * will evict all available (i.e. unlocked and evictable) buffers from * the given arc state; which is used by arc_flush(). */ static uint64_t arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes, arc_buf_contents_t type) { uint64_t total_evicted = 0; multilist_t *ml = &state->arcs_list[type]; int num_sublists; arc_buf_hdr_t **markers; int i; IMPLY(bytes < 0, bytes == ARC_EVICT_ALL); num_sublists = multilist_get_num_sublists(ml); /* * If we've tried to evict from each sublist, made some * progress, but still have not hit the target number of bytes * to evict, we want to keep trying. The markers allow us to * pick up where we left off for each individual sublist, rather * than starting from the tail each time. */ markers = kmem_zalloc(sizeof (*markers) * num_sublists, KM_SLEEP); for (i = 0; i < num_sublists; i++) { multilist_sublist_t *mls; markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP); /* * A b_spa of 0 is used to indicate that this header is * a marker. 
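 *
 * (A pthreads sketch of the single-wakeup policy described in
 * arc_evict_state_impl() above appears below.)
 */

/*
 * Illustrative only: reclaim_lock, waiters_cv and after_one_eviction()
 * are hypothetical stand-ins for arc_reclaim_lock,
 * arc_reclaim_waiters_cv and the per-header bookkeeping; they are not
 * ZFS interfaces. One pthread_cond_signal() per evicted header wakes a
 * single waiter, where a broadcast could release far more waiters than
 * the freed space can accommodate.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t reclaim_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t waiters_cv = PTHREAD_COND_INITIALIZER;
static long cache_size, cache_target = 100;

static int
is_overflowing(void)
{
	return (cache_size >= cache_target);
}

/* Called once per header evicted. */
static void
after_one_eviction(long evicted)
{
	pthread_mutex_lock(&reclaim_lock);
	cache_size -= evicted;
	if (!is_overflowing())
		pthread_cond_signal(&waiters_cv);	/* wake one waiter */
	pthread_mutex_unlock(&reclaim_lock);
}

int
main(void)
{
	cache_size = 150;
	after_one_eviction(60);		/* below target: one thread woken */
	printf("cache_size now %ld\n", cache_size);
	return (0);
}

/*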
This fact is used in arc_adjust_type() and * arc_evict_state_impl(). */ markers[i]->b_spa = 0; mls = multilist_sublist_lock(ml, i); multilist_sublist_insert_tail(mls, markers[i]); multilist_sublist_unlock(mls); } /* * While we haven't hit our target number of bytes to evict, or * we're evicting all available buffers. */ while (total_evicted < bytes || bytes == ARC_EVICT_ALL) { int sublist_idx = multilist_get_random_index(ml); uint64_t scan_evicted = 0; /* * Try to reduce pinned dnodes with a floor of arc_dnode_limit. * Request that 10% of the LRUs be scanned by the superblock * shrinker. */ if (type == ARC_BUFC_DATA && arc_dnode_size > arc_dnode_limit) arc_prune_async((arc_dnode_size - arc_dnode_limit) / sizeof (dnode_t) / zfs_arc_dnode_reduce_percent); /* * Start eviction using a randomly selected sublist, * this is to try and evenly balance eviction across all * sublists. Always starting at the same sublist * (e.g. index 0) would cause evictions to favor certain * sublists over others. */ for (i = 0; i < num_sublists; i++) { uint64_t bytes_remaining; uint64_t bytes_evicted; if (bytes == ARC_EVICT_ALL) bytes_remaining = ARC_EVICT_ALL; else if (total_evicted < bytes) bytes_remaining = bytes - total_evicted; else break; bytes_evicted = arc_evict_state_impl(ml, sublist_idx, markers[sublist_idx], spa, bytes_remaining); scan_evicted += bytes_evicted; total_evicted += bytes_evicted; /* we've reached the end, wrap to the beginning */ if (++sublist_idx >= num_sublists) sublist_idx = 0; } /* * If we didn't evict anything during this scan, we have * no reason to believe we'll evict more during another * scan, so break the loop. */ if (scan_evicted == 0) { /* This isn't possible, let's make that obvious */ ASSERT3S(bytes, !=, 0); /* * When bytes is ARC_EVICT_ALL, the only way to * break the loop is when scan_evicted is zero. * In that case, we actually have evicted enough, * so we don't want to increment the kstat. */ if (bytes != ARC_EVICT_ALL) { ASSERT3S(total_evicted, <, bytes); ARCSTAT_BUMP(arcstat_evict_not_enough); } break; } } for (i = 0; i < num_sublists; i++) { multilist_sublist_t *mls = multilist_sublist_lock(ml, i); multilist_sublist_remove(mls, markers[i]); multilist_sublist_unlock(mls); kmem_cache_free(hdr_full_cache, markers[i]); } kmem_free(markers, sizeof (*markers) * num_sublists); return (total_evicted); } /* * Flush all "evictable" data of the given type from the arc state * specified. This will not evict any "active" buffers (i.e. referenced). * * When 'retry' is set to B_FALSE, the function will make a single pass * over the state and evict any buffers that it can. Since it doesn't * continually retry the eviction, it might end up leaving some buffers * in the ARC due to lock misses. * * When 'retry' is set to B_TRUE, the function will continually retry the * eviction until *all* evictable buffers have been removed from the * state. As a result, if concurrent insertions into the state are * allowed (e.g. if the ARC isn't shutting down), this function might * wind up in an infinite loop, continually trying to evict buffers. */ static uint64_t arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type, boolean_t retry) { uint64_t evicted = 0; while (refcount_count(&state->arcs_esize[type]) != 0) { evicted += arc_evict_state(state, spa, ARC_EVICT_ALL, type); if (!retry) break; } return (evicted); } /* * Helper function for arc_prune_async() it is responsible for safely * handling the execution of a registered arc_prune_func_t. 
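 *
 * (A userland sketch of the randomized, wrapping sublist scan used by
 * arc_evict_state() above appears below.)
 */

/*
 * Illustrative only: NUM_SUBLISTS, evict_from_sublist() and
 * evict_pass() are hypothetical names, and the per-sublist evictor just
 * pretends to free half of what is asked of it. The scan starts at a
 * random index and wraps, so repeated passes do not favor sublist 0.
 */
#include <stdio.h>
#include <stdlib.h>

#define	NUM_SUBLISTS	4

static long
evict_from_sublist(int idx, long wanted)
{
	printf("scanning sublist %d for %ld bytes\n", idx, wanted);
	return (wanted / 2);		/* pretend half was evictable */
}

static long
evict_pass(long target)
{
	int idx = rand() % NUM_SUBLISTS;	/* random starting point */
	long total = 0;
	int i;

	for (i = 0; i < NUM_SUBLISTS; i++) {
		if (total >= target)
			break;
		total += evict_from_sublist(idx, target - total);
		if (++idx >= NUM_SUBLISTS)	/* wrap to the beginning */
			idx = 0;
	}
	return (total);
}

int
main(void)
{
	printf("evicted %ld of 64\n", evict_pass(64));
	return (0);
}

/*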
 */
static void
arc_prune_task(void *ptr)
{
	arc_prune_t *ap = (arc_prune_t *)ptr;
	arc_prune_func_t *func = ap->p_pfunc;

	if (func != NULL)
		func(ap->p_adjust, ap->p_private);

	refcount_remove(&ap->p_refcnt, func);
}

/*
 * Notify registered consumers they must drop holds on a portion of the ARC
 * buffers they reference. This provides a mechanism to ensure the ARC can
 * honor the arc_meta_limit and reclaim otherwise pinned ARC buffers. This
 * is analogous to dnlc_reduce_cache() but more generic.
 *
 * This operation is performed asynchronously so it may be safely called
 * in the context of the arc_reclaim_thread(). A reference is taken here
 * for each registered arc_prune_t and the arc_prune_task() is responsible
 * for releasing it once the registered arc_prune_func_t has completed.
 */
static void
arc_prune_async(int64_t adjust)
{
	arc_prune_t *ap;

	mutex_enter(&arc_prune_mtx);
	for (ap = list_head(&arc_prune_list); ap != NULL;
	    ap = list_next(&arc_prune_list, ap)) {

		if (refcount_count(&ap->p_refcnt) >= 2)
			continue;

		refcount_add(&ap->p_refcnt, ap->p_pfunc);
		ap->p_adjust = adjust;
		if (taskq_dispatch(arc_prune_taskq, arc_prune_task,
		    ap, TQ_SLEEP) == TASKQID_INVALID) {
			refcount_remove(&ap->p_refcnt, ap->p_pfunc);
			continue;
		}
		ARCSTAT_BUMP(arcstat_prune);
	}
	mutex_exit(&arc_prune_mtx);
}

/*
 * Evict the specified number of bytes from the state specified,
 * restricting eviction to the spa and type given. This function
 * prevents us from trying to evict more from a state's list than
 * is "evictable", and to skip evicting altogether when passed a
 * negative value for "bytes". In contrast, arc_evict_state() will
 * evict everything it can, when passed a negative value for "bytes".
 */
static uint64_t
arc_adjust_impl(arc_state_t *state, uint64_t spa, int64_t bytes,
    arc_buf_contents_t type)
{
	int64_t delta;

	if (bytes > 0 && refcount_count(&state->arcs_esize[type]) > 0) {
		delta = MIN(refcount_count(&state->arcs_esize[type]), bytes);
		return (arc_evict_state(state, spa, delta, type));
	}

	return (0);
}

/*
 * The goal of this function is to evict enough meta data buffers from the
 * ARC in order to enforce the arc_meta_limit. Achieving this is slightly
 * more complicated than it appears because it is common for data buffers
 * to have holds on meta data buffers. In addition, dnode meta data buffers
 * will be held by the dnodes in the block preventing them from being freed.
 * This means we can't simply traverse the ARC and expect to always find
 * enough unheld meta data buffer to release.
 *
 * Therefore, this function has been updated to make alternating passes
 * over the ARC releasing data buffers and then newly unheld meta data
 * buffers. This ensures forward progress is maintained and arc_meta_used
 * will decrease. Normally this is sufficient, but if required the ARC
 * will call the registered prune callbacks causing dentries and inodes to
 * be dropped from the VFS cache. This will make dnode meta data buffers
 * available for reclaim.
 */
static uint64_t
arc_adjust_meta_balanced(void)
{
	int64_t delta, prune = 0, adjustmnt;
	uint64_t total_evicted = 0;
	arc_buf_contents_t type = ARC_BUFC_DATA;
	int restarts = MAX(zfs_arc_meta_adjust_restarts, 0);

restart:
	/*
	 * This slightly differs from the way we evict from the mru in
	 * arc_adjust because we don't have a "target" value (i.e. no
	 * "meta" arc_p). As a result, I think we can completely
	 * cannibalize the metadata in the MRU before we evict the
	 * metadata from the MFU. I think we probably need to implement a
	 * "metadata arc_p" value to do this properly.
*/ adjustmnt = arc_meta_used - arc_meta_limit; if (adjustmnt > 0 && refcount_count(&arc_mru->arcs_esize[type]) > 0) { delta = MIN(refcount_count(&arc_mru->arcs_esize[type]), adjustmnt); total_evicted += arc_adjust_impl(arc_mru, 0, delta, type); adjustmnt -= delta; } /* * We can't afford to recalculate adjustmnt here. If we do, * new metadata buffers can sneak into the MRU or ANON lists, * thus penalize the MFU metadata. Although the fudge factor is * small, it has been empirically shown to be significant for * certain workloads (e.g. creating many empty directories). As * such, we use the original calculation for adjustmnt, and * simply decrement the amount of data evicted from the MRU. */ if (adjustmnt > 0 && refcount_count(&arc_mfu->arcs_esize[type]) > 0) { delta = MIN(refcount_count(&arc_mfu->arcs_esize[type]), adjustmnt); total_evicted += arc_adjust_impl(arc_mfu, 0, delta, type); } adjustmnt = arc_meta_used - arc_meta_limit; if (adjustmnt > 0 && refcount_count(&arc_mru_ghost->arcs_esize[type]) > 0) { delta = MIN(adjustmnt, refcount_count(&arc_mru_ghost->arcs_esize[type])); total_evicted += arc_adjust_impl(arc_mru_ghost, 0, delta, type); adjustmnt -= delta; } if (adjustmnt > 0 && refcount_count(&arc_mfu_ghost->arcs_esize[type]) > 0) { delta = MIN(adjustmnt, refcount_count(&arc_mfu_ghost->arcs_esize[type])); total_evicted += arc_adjust_impl(arc_mfu_ghost, 0, delta, type); } /* * If after attempting to make the requested adjustment to the ARC * the meta limit is still being exceeded then request that the * higher layers drop some cached objects which have holds on ARC * meta buffers. Requests to the upper layers will be made with * increasingly large scan sizes until the ARC is below the limit. */ if (arc_meta_used > arc_meta_limit) { if (type == ARC_BUFC_DATA) { type = ARC_BUFC_METADATA; } else { type = ARC_BUFC_DATA; if (zfs_arc_meta_prune) { prune += zfs_arc_meta_prune; arc_prune_async(prune); } } if (restarts > 0) { restarts--; goto restart; } } return (total_evicted); } /* * Evict metadata buffers from the cache, such that arc_meta_used is * capped by the arc_meta_limit tunable. */ static uint64_t arc_adjust_meta_only(void) { uint64_t total_evicted = 0; int64_t target; /* * If we're over the meta limit, we want to evict enough * metadata to get back under the meta limit. We don't want to * evict so much that we drop the MRU below arc_p, though. If * we're over the meta limit more than we're over arc_p, we * evict some from the MRU here, and some from the MFU below. */ target = MIN((int64_t)(arc_meta_used - arc_meta_limit), (int64_t)(refcount_count(&arc_anon->arcs_size) + refcount_count(&arc_mru->arcs_size) - arc_p)); total_evicted += arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA); /* * Similar to the above, we want to evict enough bytes to get us * below the meta limit, but not so much as to drop us below the * space allotted to the MFU (which is defined as arc_c - arc_p). */ target = MIN((int64_t)(arc_meta_used - arc_meta_limit), (int64_t)(refcount_count(&arc_mfu->arcs_size) - (arc_c - arc_p))); total_evicted += arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); return (total_evicted); } static uint64_t arc_adjust_meta(void) { if (zfs_arc_meta_strategy == ARC_STRATEGY_META_ONLY) return (arc_adjust_meta_only()); else return (arc_adjust_meta_balanced()); } /* * Return the type of the oldest buffer in the given arc state * * This function will select a random sublist of type ARC_BUFC_DATA and * a random sublist of type ARC_BUFC_METADATA. 
The tail of each sublist * is compared, and the type which contains the "older" buffer will be * returned. */ static arc_buf_contents_t arc_adjust_type(arc_state_t *state) { multilist_t *data_ml = &state->arcs_list[ARC_BUFC_DATA]; multilist_t *meta_ml = &state->arcs_list[ARC_BUFC_METADATA]; int data_idx = multilist_get_random_index(data_ml); int meta_idx = multilist_get_random_index(meta_ml); multilist_sublist_t *data_mls; multilist_sublist_t *meta_mls; arc_buf_contents_t type; arc_buf_hdr_t *data_hdr; arc_buf_hdr_t *meta_hdr; /* * We keep the sublist lock until we're finished, to prevent * the headers from being destroyed via arc_evict_state(). */ data_mls = multilist_sublist_lock(data_ml, data_idx); meta_mls = multilist_sublist_lock(meta_ml, meta_idx); /* * These two loops are to ensure we skip any markers that * might be at the tail of the lists due to arc_evict_state(). */ for (data_hdr = multilist_sublist_tail(data_mls); data_hdr != NULL; data_hdr = multilist_sublist_prev(data_mls, data_hdr)) { if (data_hdr->b_spa != 0) break; } for (meta_hdr = multilist_sublist_tail(meta_mls); meta_hdr != NULL; meta_hdr = multilist_sublist_prev(meta_mls, meta_hdr)) { if (meta_hdr->b_spa != 0) break; } if (data_hdr == NULL && meta_hdr == NULL) { type = ARC_BUFC_DATA; } else if (data_hdr == NULL) { ASSERT3P(meta_hdr, !=, NULL); type = ARC_BUFC_METADATA; } else if (meta_hdr == NULL) { ASSERT3P(data_hdr, !=, NULL); type = ARC_BUFC_DATA; } else { ASSERT3P(data_hdr, !=, NULL); ASSERT3P(meta_hdr, !=, NULL); /* The headers can't be on the sublist without an L1 header */ ASSERT(HDR_HAS_L1HDR(data_hdr)); ASSERT(HDR_HAS_L1HDR(meta_hdr)); if (data_hdr->b_l1hdr.b_arc_access < meta_hdr->b_l1hdr.b_arc_access) { type = ARC_BUFC_DATA; } else { type = ARC_BUFC_METADATA; } } multilist_sublist_unlock(meta_mls); multilist_sublist_unlock(data_mls); return (type); } /* * Evict buffers from the cache, such that arc_size is capped by arc_c. */ static uint64_t arc_adjust(void) { uint64_t total_evicted = 0; uint64_t bytes; int64_t target; /* * If we're over arc_meta_limit, we want to correct that before * potentially evicting data buffers below. */ total_evicted += arc_adjust_meta(); /* * Adjust MRU size * * If we're over the target cache size, we want to evict enough * from the list to get back to our target size. We don't want * to evict too much from the MRU, such that it drops below * arc_p. So, if we're over our target cache size more than * the MRU is over arc_p, we'll evict enough to get back to * arc_p here, and then evict more from the MFU below. */ target = MIN((int64_t)(arc_size - arc_c), (int64_t)(refcount_count(&arc_anon->arcs_size) + refcount_count(&arc_mru->arcs_size) + arc_meta_used - arc_p)); /* * If we're below arc_meta_min, always prefer to evict data. * Otherwise, try to satisfy the requested number of bytes to * evict from the type which contains older buffers; in an * effort to keep newer buffers in the cache regardless of their * type. If we cannot satisfy the number of bytes from this * type, spill over into the next type. */ if (arc_adjust_type(arc_mru) == ARC_BUFC_METADATA && arc_meta_used > arc_meta_min) { bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA); total_evicted += bytes; /* * If we couldn't evict our target number of bytes from * metadata, we try to get the rest from data. 
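 * A compact userland sketch of this spill-over policy appears below.
 */

/*
 * Illustrative only: bufc_t, evict_type() and evict_with_spill() are
 * hypothetical names, and *data / *meta model the evictable bytes of
 * each type. The preferred (older) type is drained first and any
 * shortfall spills over into the other type, as in arc_adjust().
 */
#include <stdio.h>

typedef enum { BUFC_DATA, BUFC_METADATA } bufc_t;

static long
evict_type(bufc_t type, long want, long *avail)
{
	long got = (want < *avail) ? want : *avail;

	(void) type;
	if (got < 0)
		got = 0;
	*avail -= got;
	return (got);
}

static long
evict_with_spill(bufc_t older, long target, long *data, long *meta)
{
	long got;

	if (older == BUFC_METADATA) {
		got = evict_type(BUFC_METADATA, target, meta);
		got += evict_type(BUFC_DATA, target - got, data);
	} else {
		got = evict_type(BUFC_DATA, target, data);
		got += evict_type(BUFC_METADATA, target - got, meta);
	}
	return (got);
}

int
main(void)
{
	long data = 30, meta = 50;

	/* wants 70: takes all 50 of metadata, spills 20 into data */
	printf("evicted %ld\n",
	    evict_with_spill(BUFC_METADATA, 70, &data, &meta));
	return (0);
}

/*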
	 */
		target -= bytes;

		total_evicted +=
		    arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA);
	} else {
		bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA);
		total_evicted += bytes;

		/*
		 * If we couldn't evict our target number of bytes from
		 * data, we try to get the rest from metadata.
		 */
		target -= bytes;

		total_evicted +=
		    arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
	}

	/*
	 * Adjust MFU size
	 *
	 * Now that we've tried to evict enough from the MRU to get its
	 * size back to arc_p, if we're still above the target cache
	 * size, we evict the rest from the MFU.
	 */
	target = arc_size - arc_c;

	if (arc_adjust_type(arc_mfu) == ARC_BUFC_METADATA &&
	    arc_meta_used > arc_meta_min) {
		bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
		total_evicted += bytes;

		/*
		 * If we couldn't evict our target number of bytes from
		 * metadata, we try to get the rest from data.
		 */
		target -= bytes;

		total_evicted +=
		    arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
	} else {
		bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
		total_evicted += bytes;

		/*
		 * If we couldn't evict our target number of bytes from
		 * data, we try to get the rest from metadata.
		 */
		target -= bytes;

		total_evicted +=
		    arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
	}

	/*
	 * Adjust ghost lists
	 *
	 * In addition to the above, the ARC also defines target values
	 * for the ghost lists. The sum of the mru list and mru ghost
	 * list should never exceed the target size of the cache, and
	 * the sum of the mru list, mfu list, mru ghost list, and mfu
	 * ghost list should never exceed twice the target size of the
	 * cache. The following logic enforces these limits on the ghost
	 * caches, and evicts from them as needed.
	 */
	target = refcount_count(&arc_mru->arcs_size) +
	    refcount_count(&arc_mru_ghost->arcs_size) - arc_c;

	bytes = arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_DATA);
	total_evicted += bytes;

	target -= bytes;

	total_evicted +=
	    arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_METADATA);

	/*
	 * We assume the sum of the mru list and mfu list is less than
	 * or equal to arc_c (we enforced this above), which means we
	 * can use the simpler of the two equations below:
	 *
	 *	mru + mfu + mru ghost + mfu ghost <= 2 * arc_c
	 *		    mru ghost + mfu ghost <= arc_c
	 */
	target = refcount_count(&arc_mru_ghost->arcs_size) +
	    refcount_count(&arc_mfu_ghost->arcs_size) - arc_c;

	bytes = arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_DATA);
	total_evicted += bytes;

	target -= bytes;

	total_evicted +=
	    arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_METADATA);

	return (total_evicted);
}

void
arc_flush(spa_t *spa, boolean_t retry)
{
	uint64_t guid = 0;

	/*
	 * If retry is B_TRUE, a spa must not be specified since we have
	 * no good way to determine if all of a spa's buffers have been
	 * evicted from an arc state.
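	 *
	 * (A compact sketch of the ghost-list target arithmetic used
	 * above appears below.)
	 */

/*
 * Illustrative only: ghost_targets() is a hypothetical helper showing
 * the arithmetic behind the two invariants enforced above,
 * mru + mru_ghost <= c and mru_ghost + mfu_ghost <= c; it is not a ZFS
 * function. The second target is computed after the first eviction is
 * assumed to have happened, much as arc_adjust() re-uses "target".
 */
#include <stdio.h>

static void
ghost_targets(long mru, long mru_ghost, long mfu_ghost, long c,
    long *mru_ghost_t, long *mfu_ghost_t)
{
	long t;

	t = mru + mru_ghost - c;
	*mru_ghost_t = (t > 0) ? t : 0;

	t = (mru_ghost - *mru_ghost_t) + mfu_ghost - c;
	*mfu_ghost_t = (t > 0) ? t : 0;
}

int
main(void)
{
	long mg, fg;

	ghost_targets(60, 70, 80, 100, &mg, &fg);
	printf("evict %ld from mru ghost, %ld from mfu ghost\n", mg, fg);
	return (0);
}

/*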
*/ ASSERT(!retry || spa == 0); if (spa != NULL) guid = spa_load_guid(spa); (void) arc_flush_state(arc_mru, guid, ARC_BUFC_DATA, retry); (void) arc_flush_state(arc_mru, guid, ARC_BUFC_METADATA, retry); (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_DATA, retry); (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_METADATA, retry); (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_DATA, retry); (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_METADATA, retry); (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry); (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry); } void arc_shrink(int64_t to_free) { uint64_t c = arc_c; if (c > to_free && c - to_free > arc_c_min) { arc_c = c - to_free; atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift)); if (arc_c > arc_size) arc_c = MAX(arc_size, arc_c_min); if (arc_p > arc_c) arc_p = (arc_c >> 1); ASSERT(arc_c >= arc_c_min); ASSERT((int64_t)arc_p >= 0); } else { arc_c = arc_c_min; } if (arc_size > arc_c) (void) arc_adjust(); } /* * Return maximum amount of memory that we could possibly use. Reduced * to half of all memory in user space which is primarily used for testing. */ static uint64_t arc_all_memory(void) { #ifdef _KERNEL return (MIN(ptob(physmem), vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC))); #else return (ptob(physmem) / 2); #endif } typedef enum free_memory_reason_t { FMR_UNKNOWN, FMR_NEEDFREE, FMR_LOTSFREE, FMR_SWAPFS_MINFREE, FMR_PAGES_PP_MAXIMUM, FMR_HEAP_ARENA, FMR_ZIO_ARENA, } free_memory_reason_t; int64_t last_free_memory; free_memory_reason_t last_free_reason; #ifdef _KERNEL /* * Additional reserve of pages for pp_reserve. */ int64_t arc_pages_pp_reserve = 64; /* * Additional reserve of pages for swapfs. */ int64_t arc_swapfs_reserve = 64; #endif /* _KERNEL */ /* * Return the amount of memory that can be consumed before reclaim will be * needed. Positive if there is sufficient free memory, negative indicates * the amount of memory that needs to be freed up. */ static int64_t arc_available_memory(void) { int64_t lowest = INT64_MAX; free_memory_reason_t r = FMR_UNKNOWN; #ifdef _KERNEL uint64_t available_memory = ptob(freemem); int64_t n; #ifdef __linux__ pgcnt_t needfree = btop(arc_need_free); pgcnt_t lotsfree = btop(arc_sys_free); pgcnt_t desfree = 0; #endif #if defined(__i386) available_memory = MIN(available_memory, vmem_size(heap_arena, VMEM_FREE)); #endif if (needfree > 0) { n = PAGESIZE * (-needfree); if (n < lowest) { lowest = n; r = FMR_NEEDFREE; } } /* * check that we're out of range of the pageout scanner. It starts to * schedule paging if freemem is less than lotsfree and needfree. * lotsfree is the high-water mark for pageout, and needfree is the * number of needed free pages. We add extra pages here to make sure * the scanner doesn't start up while we're freeing memory. */ n = PAGESIZE * (btop(available_memory) - lotsfree - needfree - desfree); if (n < lowest) { lowest = n; r = FMR_LOTSFREE; } #ifndef __linux__ /* * check to make sure that swapfs has enough space so that anon * reservations can still succeed. anon_resvmem() checks that the * availrmem is greater than swapfs_minfree, and the number of reserved * swap pages. We also add a bit of extra here just to prevent * circumstances from getting really dire. */ n = PAGESIZE * (availrmem - swapfs_minfree - swapfs_reserve - desfree - arc_swapfs_reserve); if (n < lowest) { lowest = n; r = FMR_SWAPFS_MINFREE; } /* * Check that we have enough availrmem that memory locking (e.g., via * mlock(3C) or memcntl(2)) can still succeed. 
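 *
 * (A reduced sketch of the lowest-margin bookkeeping used throughout
 * arc_available_memory() appears below.)
 */

/*
 * Illustrative only: fmr_t and available_memory() are hypothetical
 * reductions of the real free_memory_reason_t checks; the two margins
 * are passed in rather than measured. Each check lowers "lowest" and
 * records why, so a negative result also explains which limit fired.
 */
#include <stdio.h>
#include <limits.h>

typedef enum { FMR_UNKNOWN, FMR_NEEDFREE, FMR_LOTSFREE } fmr_t;

static long
available_memory(long needfree_margin, long lotsfree_margin, fmr_t *why)
{
	long lowest = LONG_MAX;
	fmr_t r = FMR_UNKNOWN;

	if (needfree_margin < lowest) {
		lowest = needfree_margin;
		r = FMR_NEEDFREE;
	}
	if (lotsfree_margin < lowest) {
		lowest = lotsfree_margin;
		r = FMR_LOTSFREE;
	}
	*why = r;
	return (lowest);	/* negative means reclaim is needed */
}

int
main(void)
{
	fmr_t why;
	long n = available_memory(4096, -8192, &why);

	printf("lowest margin %ld, reason %d\n", n, (int)why);
	return (0);
}

/*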
(pages_pp_maximum * stores the number of pages that cannot be locked; when availrmem * drops below pages_pp_maximum, page locking mechanisms such as * page_pp_lock() will fail.) */ n = PAGESIZE * (availrmem - pages_pp_maximum - arc_pages_pp_reserve); if (n < lowest) { lowest = n; r = FMR_PAGES_PP_MAXIMUM; } #endif #if defined(__i386) /* * If we're on an i386 platform, it's possible that we'll exhaust the * kernel heap space before we ever run out of available physical * memory. Most checks of the size of the heap_area compare against * tune.t_minarmem, which is the minimum available real memory that we * can have in the system. However, this is generally fixed at 25 pages * which is so low that it's useless. In this comparison, we seek to * calculate the total heap-size, and reclaim if more than 3/4ths of the * heap is allocated. (Or, in the calculation, if less than 1/4th is * free) */ n = vmem_size(heap_arena, VMEM_FREE) - (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2); if (n < lowest) { lowest = n; r = FMR_HEAP_ARENA; } #endif /* * If zio data pages are being allocated out of a separate heap segment, * then enforce that the size of available vmem for this arena remains * above about 1/4th (1/(2^arc_zio_arena_free_shift)) free. * * Note that reducing the arc_zio_arena_free_shift keeps more virtual * memory (in the zio_arena) free, which can avoid memory * fragmentation issues. */ if (zio_arena != NULL) { n = (int64_t)vmem_size(zio_arena, VMEM_FREE) - (vmem_size(zio_arena, VMEM_ALLOC) >> arc_zio_arena_free_shift); if (n < lowest) { lowest = n; r = FMR_ZIO_ARENA; } } #else /* _KERNEL */ /* Every 100 calls, free a small amount */ if (spa_get_random(100) == 0) lowest = -1024; #endif /* _KERNEL */ last_free_memory = lowest; last_free_reason = r; return (lowest); } /* * Determine if the system is under memory pressure and is asking * to reclaim memory. A return value of B_TRUE indicates that the system * is under memory pressure and that the arc should adjust accordingly. */ static boolean_t arc_reclaim_needed(void) { return (arc_available_memory() < 0); } static void arc_kmem_reap_now(void) { size_t i; kmem_cache_t *prev_cache = NULL; kmem_cache_t *prev_data_cache = NULL; extern kmem_cache_t *zio_buf_cache[]; extern kmem_cache_t *zio_data_buf_cache[]; extern kmem_cache_t *range_seg_cache; if ((arc_meta_used >= arc_meta_limit) && zfs_arc_meta_prune) { /* * We are exceeding our meta-data cache limit. * Prune some entries to release holds on meta-data. */ arc_prune_async(zfs_arc_meta_prune); } for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { #ifdef _ILP32 /* reach upper limit of cache size on 32-bit */ if (zio_buf_cache[i] == NULL) break; #endif if (zio_buf_cache[i] != prev_cache) { prev_cache = zio_buf_cache[i]; kmem_cache_reap_now(zio_buf_cache[i]); } if (zio_data_buf_cache[i] != prev_data_cache) { prev_data_cache = zio_data_buf_cache[i]; kmem_cache_reap_now(zio_data_buf_cache[i]); } } kmem_cache_reap_now(buf_cache); kmem_cache_reap_now(hdr_full_cache); kmem_cache_reap_now(hdr_l2only_cache); kmem_cache_reap_now(range_seg_cache); if (zio_arena != NULL) { /* * Ask the vmem arena to reclaim unused memory from its * quantum caches. */ vmem_qcache_reap(zio_arena); } } /* * Threads can block in arc_get_data_impl() waiting for this thread to evict * enough data and signal them to proceed. When this happens, the threads in * arc_get_data_impl() are sleeping while holding the hash lock for their * particular arc header. 
Thus, we must be careful to never sleep on a * hash lock in this thread. This is to prevent the following deadlock: * * - Thread A sleeps on CV in arc_get_data_impl() holding hash lock "L", * waiting for the reclaim thread to signal it. * * - arc_reclaim_thread() tries to acquire hash lock "L" using mutex_enter, * fails, and goes to sleep forever. * * This possible deadlock is avoided by always acquiring a hash lock * using mutex_tryenter() from arc_reclaim_thread(). */ static void arc_reclaim_thread(void) { fstrans_cookie_t cookie = spl_fstrans_mark(); hrtime_t growtime = 0; callb_cpr_t cpr; CALLB_CPR_INIT(&cpr, &arc_reclaim_lock, callb_generic_cpr, FTAG); mutex_enter(&arc_reclaim_lock); while (!arc_reclaim_thread_exit) { int64_t to_free; int64_t free_memory = arc_available_memory(); uint64_t evicted = 0; arc_tuning_update(); /* * This is necessary in order for the mdb ::arc dcmd to * show up to date information. Since the ::arc command * does not call the kstat's update function, without * this call, the command may show stale stats for the * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even * with this change, the data might be up to 1 second * out of date; but that should suffice. The arc_state_t * structures can be queried directly if more accurate * information is needed. */ #ifndef __linux__ if (arc_ksp != NULL) arc_ksp->ks_update(arc_ksp, KSTAT_READ); #endif mutex_exit(&arc_reclaim_lock); if (free_memory < 0) { arc_no_grow = B_TRUE; arc_warm = B_TRUE; /* * Wait at least zfs_grow_retry (default 5) seconds * before considering growing. */ growtime = gethrtime() + SEC2NSEC(arc_grow_retry); arc_kmem_reap_now(); /* * If we are still low on memory, shrink the ARC * so that we have arc_shrink_min free space. */ free_memory = arc_available_memory(); to_free = (arc_c >> arc_shrink_shift) - free_memory; if (to_free > 0) { #ifdef _KERNEL to_free = MAX(to_free, arc_need_free); #endif arc_shrink(to_free); } } else if (free_memory < arc_c >> arc_no_grow_shift) { arc_no_grow = B_TRUE; } else if (gethrtime() >= growtime) { arc_no_grow = B_FALSE; } evicted = arc_adjust(); mutex_enter(&arc_reclaim_lock); /* * If evicted is zero, we couldn't evict anything via * arc_adjust(). This could be due to hash lock * collisions, but more likely due to the majority of * arc buffers being unevictable. Therefore, even if * arc_size is above arc_c, another pass is unlikely to * be helpful and could potentially cause us to enter an * infinite loop. */ if (arc_size <= arc_c || evicted == 0) { /* * We're either no longer overflowing, or we * can't evict anything more, so we should wake * up any threads before we go to sleep and clear * arc_need_free since nothing more can be done. */ cv_broadcast(&arc_reclaim_waiters_cv); arc_need_free = 0; /* * Block until signaled, or after one second (we * might need to perform arc_kmem_reap_now() * even if we aren't being signalled) */ CALLB_CPR_SAFE_BEGIN(&cpr); (void) cv_timedwait_sig_hires(&arc_reclaim_thread_cv, &arc_reclaim_lock, SEC2NSEC(1), MSEC2NSEC(1), 0); CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_lock); } } arc_reclaim_thread_exit = B_FALSE; cv_broadcast(&arc_reclaim_thread_cv); CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_lock */ spl_fstrans_unmark(cookie); thread_exit(); } #ifdef _KERNEL /* * Determine the amount of memory eligible for eviction contained in the * ARC. All clean data reported by the ghost lists can always be safely * evicted. Due to arc_c_min, the same does not hold for all clean data * contained by the regular mru and mfu lists. 
* * In the case of the regular mru and mfu lists, we need to report as * much clean data as possible, such that evicting that same reported * data will not bring arc_size below arc_c_min. Thus, in certain * circumstances, the total amount of clean data in the mru and mfu * lists might not actually be evictable. * * The following two distinct cases are accounted for: * * 1. The sum of the amount of dirty data contained by both the mru and * mfu lists, plus the ARC's other accounting (e.g. the anon list), * is greater than or equal to arc_c_min. * (i.e. amount of dirty data >= arc_c_min) * * This is the easy case; all clean data contained by the mru and mfu * lists is evictable. Evicting all clean data can only drop arc_size * to the amount of dirty data, which is greater than arc_c_min. * * 2. The sum of the amount of dirty data contained by both the mru and * mfu lists, plus the ARC's other accounting (e.g. the anon list), * is less than arc_c_min. * (i.e. arc_c_min > amount of dirty data) * * 2.1. arc_size is greater than or equal arc_c_min. * (i.e. arc_size >= arc_c_min > amount of dirty data) * * In this case, not all clean data from the regular mru and mfu * lists is actually evictable; we must leave enough clean data * to keep arc_size above arc_c_min. Thus, the maximum amount of * evictable data from the two lists combined, is exactly the * difference between arc_size and arc_c_min. * * 2.2. arc_size is less than arc_c_min * (i.e. arc_c_min > arc_size > amount of dirty data) * * In this case, none of the data contained in the mru and mfu * lists is evictable, even if it's clean. Since arc_size is * already below arc_c_min, evicting any more would only * increase this negative difference. */ static uint64_t -arc_evictable_memory(void) { +arc_evictable_memory(void) +{ uint64_t arc_clean = refcount_count(&arc_mru->arcs_esize[ARC_BUFC_DATA]) + refcount_count(&arc_mru->arcs_esize[ARC_BUFC_METADATA]) + refcount_count(&arc_mfu->arcs_esize[ARC_BUFC_DATA]) + refcount_count(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]); uint64_t ghost_clean = refcount_count(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]) + refcount_count(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]) + refcount_count(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]) + refcount_count(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]); uint64_t arc_dirty = MAX((int64_t)arc_size - (int64_t)arc_clean, 0); if (arc_dirty >= arc_c_min) return (ghost_clean + arc_clean); return (ghost_clean + MAX((int64_t)arc_size - (int64_t)arc_c_min, 0)); } /* * If sc->nr_to_scan is zero, the caller is requesting a query of the * number of objects which can potentially be freed. If it is nonzero, * the request is to free that many objects. * * Linux kernels >= 3.12 have the count_objects and scan_objects callbacks * in struct shrinker and also require the shrinker to return the number * of objects freed. * * Older kernels require the shrinker to return the number of freeable * objects following the freeing of nr_to_free. 
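 *
 * (A reduced sketch of the arc_evictable_memory() case analysis above
 * appears below.)
 */

/*
 * Illustrative only: evictable_memory() is a hypothetical reduction of
 * arc_evictable_memory() with the sizes passed in. Ghost-list bytes are
 * always evictable; clean bytes on the regular lists are evictable only
 * down to the arc_c_min floor, matching cases 1, 2.1 and 2.2 above.
 */
#include <stdio.h>

static long
evictable_memory(long arc_size, long arc_clean, long ghost_clean,
    long arc_c_min)
{
	long arc_dirty = arc_size - arc_clean;

	if (arc_dirty < 0)
		arc_dirty = 0;

	/* Case 1: dirty data alone keeps arc_size above arc_c_min. */
	if (arc_dirty >= arc_c_min)
		return (ghost_clean + arc_clean);

	/* Cases 2.1/2.2: only the slice above arc_c_min may go. */
	return (ghost_clean +
	    ((arc_size > arc_c_min) ? arc_size - arc_c_min : 0));
}

int
main(void)
{
	/* dirty = 20 < c_min = 50: 10 ghost + (80 - 50) = 40 */
	printf("%ld\n", evictable_memory(80, 60, 10, 50));
	return (0);
}

/*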
*/ static spl_shrinker_t __arc_shrinker_func(struct shrinker *shrink, struct shrink_control *sc) { int64_t pages; /* The arc is considered warm once reclaim has occurred */ if (unlikely(arc_warm == B_FALSE)) arc_warm = B_TRUE; /* Return the potential number of reclaimable pages */ pages = btop((int64_t)arc_evictable_memory()); if (sc->nr_to_scan == 0) return (pages); /* Not allowed to perform filesystem reclaim */ if (!(sc->gfp_mask & __GFP_FS)) return (SHRINK_STOP); /* Reclaim in progress */ if (mutex_tryenter(&arc_reclaim_lock) == 0) return (SHRINK_STOP); mutex_exit(&arc_reclaim_lock); /* * Evict the requested number of pages by shrinking arc_c the * requested amount. If there is nothing left to evict just * reap whatever we can from the various arc slabs. */ if (pages > 0) { arc_shrink(ptob(sc->nr_to_scan)); arc_kmem_reap_now(); #ifdef HAVE_SPLIT_SHRINKER_CALLBACK pages = MAX(pages - btop(arc_evictable_memory()), 0); #else pages = btop(arc_evictable_memory()); #endif } else { arc_kmem_reap_now(); pages = SHRINK_STOP; } /* * We've reaped what we can, wake up threads. */ cv_broadcast(&arc_reclaim_waiters_cv); /* * When direct reclaim is observed it usually indicates a rapid * increase in memory pressure. This occurs because the kswapd * threads were unable to asynchronously keep enough free memory * available. In this case set arc_no_grow to briefly pause arc * growth to avoid compounding the memory pressure. */ if (current_is_kswapd()) { ARCSTAT_BUMP(arcstat_memory_indirect_count); } else { arc_no_grow = B_TRUE; arc_need_free = ptob(sc->nr_to_scan); ARCSTAT_BUMP(arcstat_memory_direct_count); } return (pages); } SPL_SHRINKER_CALLBACK_WRAPPER(arc_shrinker_func); SPL_SHRINKER_DECLARE(arc_shrinker, arc_shrinker_func, DEFAULT_SEEKS); #endif /* _KERNEL */ /* * Adapt arc info given the number of bytes we are trying to add and * the state that we are coming from. This function is only called * when we are adding new content to the cache. */ static void arc_adapt(int bytes, arc_state_t *state) { int mult; uint64_t arc_p_min = (arc_c >> arc_p_min_shift); int64_t mrug_size = refcount_count(&arc_mru_ghost->arcs_size); int64_t mfug_size = refcount_count(&arc_mfu_ghost->arcs_size); if (state == arc_l2c_only) return; ASSERT(bytes > 0); /* * Adapt the target size of the MRU list: * - if we just hit in the MRU ghost list, then increase * the target size of the MRU list. * - if we just hit in the MFU ghost list, then increase * the target size of the MFU list by decreasing the * target size of the MRU list. */ if (state == arc_mru_ghost) { mult = (mrug_size >= mfug_size) ? 1 : (mfug_size / mrug_size); if (!zfs_arc_p_dampener_disable) mult = MIN(mult, 10); /* avoid wild arc_p adjustment */ arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult); } else if (state == arc_mfu_ghost) { uint64_t delta; mult = (mfug_size >= mrug_size) ? 
	    1 : (mrug_size / mfug_size);
		if (!zfs_arc_p_dampener_disable)
			mult = MIN(mult, 10);

		delta = MIN(bytes * mult, arc_p);
		arc_p = MAX(arc_p_min, arc_p - delta);
	}
	ASSERT((int64_t)arc_p >= 0);

	if (arc_reclaim_needed()) {
		cv_signal(&arc_reclaim_thread_cv);
		return;
	}

	if (arc_no_grow)
		return;

	if (arc_c >= arc_c_max)
		return;

	/*
	 * If we're within (2 * maxblocksize) bytes of the target
	 * cache size, increment the target cache size
	 */
	ASSERT3U(arc_c, >=, 2ULL << SPA_MAXBLOCKSHIFT);
	if (arc_size >= arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
		atomic_add_64(&arc_c, (int64_t)bytes);
		if (arc_c > arc_c_max)
			arc_c = arc_c_max;
		else if (state == arc_anon)
			atomic_add_64(&arc_p, (int64_t)bytes);
		if (arc_p > arc_c)
			arc_p = arc_c;
	}
	ASSERT((int64_t)arc_p >= 0);
}

/*
 * Check if arc_size has grown past our upper threshold, determined by
 * zfs_arc_overflow_shift.
 */
static boolean_t
arc_is_overflowing(void)
{
	/* Always allow at least one block of overflow */
	uint64_t overflow = MAX(SPA_MAXBLOCKSIZE,
	    arc_c >> zfs_arc_overflow_shift);

	return (arc_size >= arc_c + overflow);
}

static abd_t *
arc_get_data_abd(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
{
	arc_buf_contents_t type = arc_buf_type(hdr);

	arc_get_data_impl(hdr, size, tag);
	if (type == ARC_BUFC_METADATA) {
		return (abd_alloc(size, B_TRUE));
	} else {
		ASSERT(type == ARC_BUFC_DATA);
		return (abd_alloc(size, B_FALSE));
	}
}

static void *
arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
{
	arc_buf_contents_t type = arc_buf_type(hdr);

	arc_get_data_impl(hdr, size, tag);
	if (type == ARC_BUFC_METADATA) {
		return (zio_buf_alloc(size));
	} else {
		ASSERT(type == ARC_BUFC_DATA);
		return (zio_data_buf_alloc(size));
	}
}

/*
 * Allocate a block and return it to the caller. If we are hitting the
 * hard limit for the cache size, we must sleep, waiting for the eviction
 * thread to catch up. If we're past the target size but below the hard
 * limit, we'll only signal the reclaim thread and continue on.
 */
static void
arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
{
	arc_state_t *state = hdr->b_l1hdr.b_state;
	arc_buf_contents_t type = arc_buf_type(hdr);

	arc_adapt(size, state);

	/*
	 * If arc_size is currently overflowing, and has grown past our
	 * upper limit, we must be adding data faster than the evict
	 * thread can evict. Thus, to ensure we don't compound the
	 * problem by adding more data and forcing arc_size to grow even
	 * further past its target size, we halt and wait for the
	 * eviction thread to catch up.
	 *
	 * It's also possible that the reclaim thread is unable to evict
	 * enough buffers to get arc_size below the overflow limit (e.g.
	 * due to buffers being un-evictable, or hash lock collisions).
	 * In this case, we want to proceed regardless if we're
	 * overflowing; thus we don't use a while loop here.
	 */
	if (arc_is_overflowing()) {
		mutex_enter(&arc_reclaim_lock);

		/*
		 * Now that we've acquired the lock, we may no longer be
		 * over the overflow limit, let's check.
		 *
		 * We're ignoring the case of spurious wake ups. If that
		 * were to happen, it'd let this thread consume an ARC
		 * buffer before it should have (i.e. before we're under
		 * the overflow limit and were signalled by the reclaim
		 * thread). As long as that is a rare occurrence, it
		 * shouldn't cause any harm.
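		 *
		 * A pthreads sketch of this overflow handshake appears
		 * below.
		 */

/*
 * Illustrative only: the names mirror the ARC ones but the types are
 * plain pthreads, and get_data() stands in for arc_get_data_impl().
 * Note the deliberate single wait, not a loop, for the reason given
 * above; a general-purpose program would normally re-check in a loop.
 */
#include <pthread.h>

static pthread_mutex_t reclaim_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t reclaim_thread_cv = PTHREAD_COND_INITIALIZER;
static pthread_cond_t waiters_cv = PTHREAD_COND_INITIALIZER;
static long size, limit = 100;

static int
is_overflowing(void)
{
	return (size >= limit);
}

static void
get_data(long nbytes)
{
	if (is_overflowing()) {
		pthread_mutex_lock(&reclaim_lock);

		/* re-check now that the lock is held */
		if (is_overflowing()) {
			pthread_cond_signal(&reclaim_thread_cv);
			pthread_cond_wait(&waiters_cv, &reclaim_lock);
		}
		pthread_mutex_unlock(&reclaim_lock);
	}
	size += nbytes;		/* proceed even if still overflowing */
}

int
main(void)
{
	get_data(10);	/* below the limit: no waiting happens */
	return (0);
}

/*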
*/ if (arc_is_overflowing()) { cv_signal(&arc_reclaim_thread_cv); cv_wait(&arc_reclaim_waiters_cv, &arc_reclaim_lock); } mutex_exit(&arc_reclaim_lock); } VERIFY3U(hdr->b_type, ==, type); if (type == ARC_BUFC_METADATA) { arc_space_consume(size, ARC_SPACE_META); } else { arc_space_consume(size, ARC_SPACE_DATA); } /* * Update the state size. Note that ghost states have a * "ghost size" and so don't need to be updated. */ if (!GHOST_STATE(state)) { (void) refcount_add_many(&state->arcs_size, size, tag); /* * If this is reached via arc_read, the link is * protected by the hash lock. If reached via * arc_buf_alloc, the header should not be accessed by * any other thread. And, if reached via arc_read_done, * the hash lock will protect it if it's found in the * hash table; otherwise no other thread should be * trying to [add|remove]_reference it. */ if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); (void) refcount_add_many(&state->arcs_esize[type], size, tag); } /* * If we are growing the cache, and we are adding anonymous * data, and we have outgrown arc_p, update arc_p */ if (arc_size < arc_c && hdr->b_l1hdr.b_state == arc_anon && (refcount_count(&arc_anon->arcs_size) + refcount_count(&arc_mru->arcs_size) > arc_p)) arc_p = MIN(arc_c, arc_p + size); } } static void arc_free_data_abd(arc_buf_hdr_t *hdr, abd_t *abd, uint64_t size, void *tag) { arc_free_data_impl(hdr, size, tag); abd_free(abd); } static void arc_free_data_buf(arc_buf_hdr_t *hdr, void *buf, uint64_t size, void *tag) { arc_buf_contents_t type = arc_buf_type(hdr); arc_free_data_impl(hdr, size, tag); if (type == ARC_BUFC_METADATA) { zio_buf_free(buf, size); } else { ASSERT(type == ARC_BUFC_DATA); zio_data_buf_free(buf, size); } } /* * Free the arc data buffer. */ static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag) { arc_state_t *state = hdr->b_l1hdr.b_state; arc_buf_contents_t type = arc_buf_type(hdr); /* protected by hash lock, if in the hash table */ if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT(state != arc_anon && state != arc_l2c_only); (void) refcount_remove_many(&state->arcs_esize[type], size, tag); } (void) refcount_remove_many(&state->arcs_size, size, tag); VERIFY3U(hdr->b_type, ==, type); if (type == ARC_BUFC_METADATA) { arc_space_return(size, ARC_SPACE_META); } else { ASSERT(type == ARC_BUFC_DATA); arc_space_return(size, ARC_SPACE_DATA); } } /* * This routine is called whenever a buffer is accessed. * NOTE: the hash lock is dropped in this function. */ static void arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) { clock_t now; ASSERT(MUTEX_HELD(hash_lock)); ASSERT(HDR_HAS_L1HDR(hdr)); if (hdr->b_l1hdr.b_state == arc_anon) { /* * This buffer is not in the cache, and does not * appear in our "ghost" list. Add the new buffer * to the MRU state. */ ASSERT0(hdr->b_l1hdr.b_arc_access); hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); arc_change_state(arc_mru, hdr, hash_lock); } else if (hdr->b_l1hdr.b_state == arc_mru) { now = ddi_get_lbolt(); /* * If this buffer is here because of a prefetch, then either: * - clear the flag if this is a "referencing" read * (any subsequent access will bump this into the MFU state). * or * - move the buffer to the head of the list if this is * another prefetch (to make it less likely to be evicted). 
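 *
 * (A reduced sketch of these on-access state transitions appears
 * below.)
 */

/*
 * Illustrative only: state_t, MINTIME and on_access() are hypothetical
 * reductions of arc_access() that ignore the prefetch special cases.
 * Anonymous buffers enter the MRU; a re-access promotes MRU to MFU once
 * MINTIME has passed; ghost hits land in the MFU.
 */
#include <stdio.h>

typedef enum { S_ANON, S_MRU, S_MRU_GHOST, S_MFU, S_MFU_GHOST } state_t;

#define	MINTIME	125	/* ms, standing in for ARC_MINTIME */

static state_t
on_access(state_t cur, long ms_since_last)
{
	switch (cur) {
	case S_ANON:
		return (S_MRU);
	case S_MRU:
		return ((ms_since_last > MINTIME) ? S_MFU : S_MRU);
	case S_MRU_GHOST:
	case S_MFU:
	case S_MFU_GHOST:
		return (S_MFU);
	}
	return (cur);
}

int
main(void)
{
	printf("%d\n", (int)on_access(S_MRU, 200));	/* 3 == S_MFU */
	return (0);
}

/*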
*/ if (HDR_PREFETCH(hdr)) { if (refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { /* link protected by hash lock */ ASSERT(multilist_link_active( &hdr->b_l1hdr.b_arc_node)); } else { arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH); atomic_inc_32(&hdr->b_l1hdr.b_mru_hits); ARCSTAT_BUMP(arcstat_mru_hits); } hdr->b_l1hdr.b_arc_access = now; return; } /* * This buffer has been "accessed" only once so far, * but it is still in the cache. Move it to the MFU * state. */ if (ddi_time_after(now, hdr->b_l1hdr.b_arc_access + ARC_MINTIME)) { /* * More than 125ms have passed since we * instantiated this buffer. Move it to the * most frequently used state. */ hdr->b_l1hdr.b_arc_access = now; DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); arc_change_state(arc_mfu, hdr, hash_lock); } atomic_inc_32(&hdr->b_l1hdr.b_mru_hits); ARCSTAT_BUMP(arcstat_mru_hits); } else if (hdr->b_l1hdr.b_state == arc_mru_ghost) { arc_state_t *new_state; /* * This buffer has been "accessed" recently, but * was evicted from the cache. Move it to the * MFU state. */ if (HDR_PREFETCH(hdr)) { new_state = arc_mru; if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH); DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); } else { new_state = arc_mfu; DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); } hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); arc_change_state(new_state, hdr, hash_lock); atomic_inc_32(&hdr->b_l1hdr.b_mru_ghost_hits); ARCSTAT_BUMP(arcstat_mru_ghost_hits); } else if (hdr->b_l1hdr.b_state == arc_mfu) { /* * This buffer has been accessed more than once and is * still in the cache. Keep it in the MFU state. * * NOTE: an add_reference() that occurred when we did * the arc_read() will have kicked this off the list. * If it was a prefetch, we will explicitly move it to * the head of the list now. */ if ((HDR_PREFETCH(hdr)) != 0) { ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); /* link protected by hash_lock */ ASSERT(multilist_link_active(&hdr->b_l1hdr.b_arc_node)); } atomic_inc_32(&hdr->b_l1hdr.b_mfu_hits); ARCSTAT_BUMP(arcstat_mfu_hits); hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); } else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) { arc_state_t *new_state = arc_mfu; /* * This buffer has been accessed more than once but has * been evicted from the cache. Move it back to the * MFU state. */ if (HDR_PREFETCH(hdr)) { /* * This is a prefetch access... * move this block back to the MRU state. */ ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt)); new_state = arc_mru; } hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); arc_change_state(new_state, hdr, hash_lock); atomic_inc_32(&hdr->b_l1hdr.b_mfu_ghost_hits); ARCSTAT_BUMP(arcstat_mfu_ghost_hits); } else if (hdr->b_l1hdr.b_state == arc_l2c_only) { /* * This buffer is on the 2nd Level ARC. 
*/ hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); arc_change_state(arc_mfu, hdr, hash_lock); } else { cmn_err(CE_PANIC, "invalid arc state 0x%p", hdr->b_l1hdr.b_state); } } /* a generic arc_done_func_t which you can use */ /* ARGSUSED */ void arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) { if (zio == NULL || zio->io_error == 0) bcopy(buf->b_data, arg, arc_buf_size(buf)); arc_buf_destroy(buf, arg); } /* a generic arc_done_func_t */ void arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) { arc_buf_t **bufp = arg; if (zio && zio->io_error) { arc_buf_destroy(buf, arg); *bufp = NULL; } else { *bufp = buf; ASSERT(buf->b_data); } } static void arc_hdr_verify(arc_buf_hdr_t *hdr, blkptr_t *bp) { if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) { ASSERT3U(HDR_GET_PSIZE(hdr), ==, 0); ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); } else { if (HDR_COMPRESSION_ENABLED(hdr)) { ASSERT3U(HDR_GET_COMPRESS(hdr), ==, BP_GET_COMPRESS(bp)); } ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp)); ASSERT3U(HDR_GET_PSIZE(hdr), ==, BP_GET_PSIZE(bp)); } } static void arc_read_done(zio_t *zio) { arc_buf_hdr_t *hdr = zio->io_private; kmutex_t *hash_lock = NULL; arc_callback_t *callback_list; arc_callback_t *acb; boolean_t freeable = B_FALSE; boolean_t no_zio_error = (zio->io_error == 0); int callback_cnt = 0; /* * The hdr was inserted into hash-table and removed from lists * prior to starting I/O. We should find this header, since * it's in the hash table, and it should be legit since it's * not possible to evict it during the I/O. The only possible * reason for it not to be found is if we were freed during the * read. */ if (HDR_IN_HASH_TABLE(hdr)) { arc_buf_hdr_t *found; ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp)); ASSERT3U(hdr->b_dva.dva_word[0], ==, BP_IDENTITY(zio->io_bp)->dva_word[0]); ASSERT3U(hdr->b_dva.dva_word[1], ==, BP_IDENTITY(zio->io_bp)->dva_word[1]); found = buf_hash_find(hdr->b_spa, zio->io_bp, &hash_lock); ASSERT((found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) || (found == hdr && HDR_L2_READING(hdr))); ASSERT3P(hash_lock, !=, NULL); } if (no_zio_error) { /* byteswap if necessary */ if (BP_SHOULD_BYTESWAP(zio->io_bp)) { if (BP_GET_LEVEL(zio->io_bp) > 0) { hdr->b_l1hdr.b_byteswap = DMU_BSWAP_UINT64; } else { hdr->b_l1hdr.b_byteswap = DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp)); } } else { hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; } } arc_hdr_clear_flags(hdr, ARC_FLAG_L2_EVICTED); if (l2arc_noprefetch && HDR_PREFETCH(hdr)) arc_hdr_clear_flags(hdr, ARC_FLAG_L2CACHE); callback_list = hdr->b_l1hdr.b_acb; ASSERT3P(callback_list, !=, NULL); if (hash_lock && no_zio_error && hdr->b_l1hdr.b_state == arc_anon) { /* * Only call arc_access on anonymous buffers. This is because * if we've issued an I/O for an evicted buffer, we've already * called arc_access (to prevent any simultaneous readers from * getting confused). */ arc_access(hdr, hash_lock); } /* * If a read request has a callback (i.e. acb_done is not NULL), then we * make a buf containing the data according to the parameters which were * passed in. The implementation of arc_buf_alloc_impl() ensures that we * aren't needlessly decompressing the data multiple times. 
*/ for (acb = callback_list; acb != NULL; acb = acb->acb_next) { int error; if (!acb->acb_done) continue; /* This is a demand read since prefetches don't use callbacks */ callback_cnt++; error = arc_buf_alloc_impl(hdr, acb->acb_private, acb->acb_compressed, no_zio_error, &acb->acb_buf); if (no_zio_error) { zio->io_error = error; } } hdr->b_l1hdr.b_acb = NULL; arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); if (callback_cnt == 0) { ASSERT(HDR_PREFETCH(hdr)); ASSERT0(hdr->b_l1hdr.b_bufcnt); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); } ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) || callback_list != NULL); if (no_zio_error) { arc_hdr_verify(hdr, zio->io_bp); } else { arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR); if (hdr->b_l1hdr.b_state != arc_anon) arc_change_state(arc_anon, hdr, hash_lock); if (HDR_IN_HASH_TABLE(hdr)) buf_hash_remove(hdr); freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt); } /* * Broadcast before we drop the hash_lock to avoid the possibility * that the hdr (and hence the cv) might be freed before we get to * the cv_broadcast(). */ cv_broadcast(&hdr->b_l1hdr.b_cv); if (hash_lock != NULL) { mutex_exit(hash_lock); } else { /* * This block was freed while we waited for the read to * complete. It has been removed from the hash table and * moved to the anonymous state (so that it won't show up * in the cache). */ ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt); } /* execute each callback and free its structure */ while ((acb = callback_list) != NULL) { if (acb->acb_done) acb->acb_done(zio, acb->acb_buf, acb->acb_private); if (acb->acb_zio_dummy != NULL) { acb->acb_zio_dummy->io_error = zio->io_error; zio_nowait(acb->acb_zio_dummy); } callback_list = acb->acb_next; kmem_free(acb, sizeof (arc_callback_t)); } if (freeable) arc_hdr_destroy(hdr); } /* * "Read" the block at the specified DVA (in bp) via the * cache. If the block is found in the cache, invoke the provided * callback immediately and return. Note that the `zio' parameter * in the callback will be NULL in this case, since no IO was * required. If the block is not in the cache pass the read request * on to the spa with a substitute callback function, so that the * requested block will be added to the cache. * * If a read request arrives for a block that has a read in-progress, * either wait for the in-progress read to complete (and return the * results); or, if this is a read with a "done" func, add a record * to the read to invoke the "done" func when the read completes, * and return; or just return. * * arc_read_done() will invoke all the requested "done" functions * for readers of this block. */ int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, void *private, zio_priority_t priority, int zio_flags, arc_flags_t *arc_flags, const zbookmark_phys_t *zb) { arc_buf_hdr_t *hdr = NULL; kmutex_t *hash_lock = NULL; zio_t *rzio; uint64_t guid = spa_load_guid(spa); boolean_t compressed_read = (zio_flags & ZIO_FLAG_RAW) != 0; int rc = 0; ASSERT(!BP_IS_EMBEDDED(bp) || BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA); top: if (!BP_IS_EMBEDDED(bp)) { /* * Embedded BP's have no DVA and require no I/O to "read". * Create an anonymous arc buf to back it. 
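 *
 * (A toy read-through sketch of the hit/miss flow described above
 * appears below.)
 */

/*
 * Illustrative only: a one-slot table stands in for the buffer hash
 * table, issue_read() fakes the zio layer, and cached_read() plays the
 * role of arc_read(). On a hit the callback runs at once with no I/O
 * (hence arc_read()'s NULL zio); on a miss the completion path fills
 * the cache and then fires the callback.
 */
#include <stdio.h>

typedef void done_fn_t(int data, void *priv);

static unsigned long cached_key;	/* 0 means the slot is empty */
static int cached_data;

static void
issue_read(unsigned long key, done_fn_t *done, void *priv)
{
	int data = (int)(key * 2);	/* fabricate block contents */

	cached_key = key;		/* completion caches the block */
	cached_data = data;
	done(data, priv);
}

static void
cached_read(unsigned long key, done_fn_t *done, void *priv)
{
	if (key == cached_key) {
		done(cached_data, priv);	/* hit: no I/O needed */
		return;
	}
	issue_read(key, done, priv);		/* miss: go to "disk" */
}

static void
print_done(int data, void *priv)
{
	printf("%s: got %d\n", (const char *)priv, data);
}

int
main(void)
{
	cached_read(7, print_done, "first read (miss)");
	cached_read(7, print_done, "second read (hit)");
	return (0);
}

/*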
*/ hdr = buf_hash_find(guid, bp, &hash_lock); } if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_pabd != NULL) { arc_buf_t *buf = NULL; *arc_flags |= ARC_FLAG_CACHED; if (HDR_IO_IN_PROGRESS(hdr)) { if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) && priority == ZIO_PRIORITY_SYNC_READ) { /* * This sync read must wait for an * in-progress async read (e.g. a predictive * prefetch). Async reads are queued * separately at the vdev_queue layer, so * this is a form of priority inversion. * Ideally, we would "inherit" the demand * i/o's priority by moving the i/o from * the async queue to the synchronous queue, * but there is currently no mechanism to do * so. Track this so that we can evaluate * the magnitude of this potential performance * problem. * * Note that if the prefetch i/o is already * active (has been issued to the device), * the prefetch improved performance, because * we issued it sooner than we would have * without the prefetch. */ DTRACE_PROBE1(arc__sync__wait__for__async, arc_buf_hdr_t *, hdr); ARCSTAT_BUMP(arcstat_sync_wait_for_async); } if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) { arc_hdr_clear_flags(hdr, ARC_FLAG_PREDICTIVE_PREFETCH); } if (*arc_flags & ARC_FLAG_WAIT) { cv_wait(&hdr->b_l1hdr.b_cv, hash_lock); mutex_exit(hash_lock); goto top; } ASSERT(*arc_flags & ARC_FLAG_NOWAIT); if (done) { arc_callback_t *acb = NULL; acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); acb->acb_done = done; acb->acb_private = private; if (pio != NULL) acb->acb_zio_dummy = zio_null(pio, spa, NULL, NULL, NULL, zio_flags); ASSERT3P(acb->acb_done, !=, NULL); acb->acb_next = hdr->b_l1hdr.b_acb; hdr->b_l1hdr.b_acb = acb; mutex_exit(hash_lock); goto out; } mutex_exit(hash_lock); goto out; } ASSERT(hdr->b_l1hdr.b_state == arc_mru || hdr->b_l1hdr.b_state == arc_mfu); if (done) { if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) { /* * This is a demand read which does not have to * wait for i/o because we did a predictive * prefetch i/o for it, which has completed. */ DTRACE_PROBE1( arc__demand__hit__predictive__prefetch, arc_buf_hdr_t *, hdr); ARCSTAT_BUMP( arcstat_demand_hit_predictive_prefetch); arc_hdr_clear_flags(hdr, ARC_FLAG_PREDICTIVE_PREFETCH); } ASSERT(!BP_IS_EMBEDDED(bp) || !BP_IS_HOLE(bp)); /* Get a buf with the desired data in it. */ VERIFY0(arc_buf_alloc_impl(hdr, private, compressed_read, B_TRUE, &buf)); } else if (*arc_flags & ARC_FLAG_PREFETCH && refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); } DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); arc_access(hdr, hash_lock); if (*arc_flags & ARC_FLAG_L2CACHE) arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_hits); ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, hits); if (done) done(NULL, buf, private); } else { uint64_t lsize = BP_GET_LSIZE(bp); uint64_t psize = BP_GET_PSIZE(bp); arc_callback_t *acb; vdev_t *vd = NULL; uint64_t addr = 0; boolean_t devw = B_FALSE; uint64_t size; /* * Gracefully handle a damaged logical block size as a * checksum error. 
*/ if (lsize > spa_maxblocksize(spa)) { rc = SET_ERROR(ECKSUM); goto out; } if (hdr == NULL) { /* this block is not in the cache */ arc_buf_hdr_t *exists = NULL; arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, BP_GET_COMPRESS(bp), type); if (!BP_IS_EMBEDDED(bp)) { hdr->b_dva = *BP_IDENTITY(bp); hdr->b_birth = BP_PHYSICAL_BIRTH(bp); exists = buf_hash_insert(hdr, &hash_lock); } if (exists != NULL) { /* somebody beat us to the hash insert */ mutex_exit(hash_lock); buf_discard_identity(hdr); arc_hdr_destroy(hdr); goto top; /* restart the IO request */ } } else { /* * This block is in the ghost cache. If it was L2-only * (and thus didn't have an L1 hdr), we realloc the * header to add an L1 hdr. */ if (!HDR_HAS_L1HDR(hdr)) { hdr = arc_hdr_realloc(hdr, hdr_l2only_cache, hdr_full_cache); } ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); /* * This is a delicate dance that we play here. * This hdr is in the ghost list so we access it * to move it out of the ghost list before we * initiate the read. If it's a prefetch then * it won't have a callback so we'll remove the * reference that arc_buf_alloc_impl() created. We * do this after we've called arc_access() to * avoid hitting an assert in remove_reference(). */ arc_access(hdr, hash_lock); arc_hdr_alloc_pabd(hdr); } ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); size = arc_hdr_size(hdr); /* * If compression is enabled on the hdr, then will do * RAW I/O and will store the compressed data in the hdr's * data block. Otherwise, the hdr's data block will contain * the uncompressed data. */ if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) { zio_flags |= ZIO_FLAG_RAW; } if (*arc_flags & ARC_FLAG_PREFETCH) arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); if (*arc_flags & ARC_FLAG_L2CACHE) arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); if (BP_GET_LEVEL(bp) > 0) arc_hdr_set_flags(hdr, ARC_FLAG_INDIRECT); if (*arc_flags & ARC_FLAG_PREDICTIVE_PREFETCH) arc_hdr_set_flags(hdr, ARC_FLAG_PREDICTIVE_PREFETCH); ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state)); acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); acb->acb_done = done; acb->acb_private = private; acb->acb_compressed = compressed_read; ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); hdr->b_l1hdr.b_acb = acb; arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); if (HDR_HAS_L2HDR(hdr) && (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) { devw = hdr->b_l2hdr.b_dev->l2ad_writing; addr = hdr->b_l2hdr.b_daddr; /* * Lock out device removal. */ if (vdev_is_dead(vd) || !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER)) vd = NULL; } if (priority == ZIO_PRIORITY_ASYNC_READ) arc_hdr_set_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ); else arc_hdr_clear_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ); if (hash_lock != NULL) mutex_exit(hash_lock); /* * At this point, we have a level 1 cache miss. Try again in * L2ARC if possible. */ ASSERT3U(HDR_GET_LSIZE(hdr), ==, lsize); DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp, uint64_t, lsize, zbookmark_phys_t *, zb); ARCSTAT_BUMP(arcstat_misses); ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, misses); if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) { /* * Read from the L2ARC if the following are true: * 1. The L2ARC vdev was previously cached. * 2. This buffer still has L2ARC metadata. * 3. 
This buffer isn't currently writing to the L2ARC.
			 * 4. The L2ARC entry wasn't evicted, which may
			 *    also have invalidated the vdev.
			 * 5. This isn't a prefetch or l2arc_noprefetch is
			 *    disabled (prefetch reads are otherwise not
			 *    served from the L2ARC).
			 */
			if (HDR_HAS_L2HDR(hdr) &&
			    !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
			    !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
				l2arc_read_callback_t *cb;

				DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
				ARCSTAT_BUMP(arcstat_l2_hits);
				atomic_inc_32(&hdr->b_l2hdr.b_hits);

				cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
				    KM_SLEEP);
				cb->l2rcb_hdr = hdr;
				cb->l2rcb_bp = *bp;
				cb->l2rcb_zb = *zb;
				cb->l2rcb_flags = zio_flags;

				ASSERT(addr >= VDEV_LABEL_START_SIZE &&
				    addr + lsize < vd->vdev_psize -
				    VDEV_LABEL_END_SIZE);

				/*
				 * l2arc read.  The SCL_L2ARC lock will be
				 * released by l2arc_read_done().
				 * Issue a null zio if the underlying buffer
				 * was squashed to zero size by compression.
				 */
				ASSERT3U(HDR_GET_COMPRESS(hdr), !=,
				    ZIO_COMPRESS_EMPTY);
				rzio = zio_read_phys(pio, vd, addr,
				    size, hdr->b_l1hdr.b_pabd,
				    ZIO_CHECKSUM_OFF,
				    l2arc_read_done, cb, priority,
				    zio_flags | ZIO_FLAG_DONT_CACHE |
				    ZIO_FLAG_CANFAIL |
				    ZIO_FLAG_DONT_PROPAGATE |
				    ZIO_FLAG_DONT_RETRY, B_FALSE);

				DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
				    zio_t *, rzio);
				ARCSTAT_INCR(arcstat_l2_read_bytes, size);

				if (*arc_flags & ARC_FLAG_NOWAIT) {
					zio_nowait(rzio);
					goto out;
				}

				ASSERT(*arc_flags & ARC_FLAG_WAIT);
				if (zio_wait(rzio) == 0)
					goto out;

				/* l2arc read error; goto zio_read() */
			} else {
				DTRACE_PROBE1(l2arc__miss,
				    arc_buf_hdr_t *, hdr);
				ARCSTAT_BUMP(arcstat_l2_misses);
				if (HDR_L2_WRITING(hdr))
					ARCSTAT_BUMP(arcstat_l2_rw_clash);
				spa_config_exit(spa, SCL_L2ARC, vd);
			}
		} else {
			if (vd != NULL)
				spa_config_exit(spa, SCL_L2ARC, vd);
			if (l2arc_ndev != 0) {
				DTRACE_PROBE1(l2arc__miss,
				    arc_buf_hdr_t *, hdr);
				ARCSTAT_BUMP(arcstat_l2_misses);
			}
		}

		rzio = zio_read(pio, spa, bp, hdr->b_l1hdr.b_pabd, size,
		    arc_read_done, hdr, priority, zio_flags, zb);

		if (*arc_flags & ARC_FLAG_WAIT) {
			rc = zio_wait(rzio);
			goto out;
		}

		ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
		zio_nowait(rzio);
	}

out:
	spa_read_history_add(spa, zb, *arc_flags);
	return (rc);
}

arc_prune_t *
arc_add_prune_callback(arc_prune_func_t *func, void *private)
{
	arc_prune_t *p;

	p = kmem_alloc(sizeof (*p), KM_SLEEP);
	p->p_pfunc = func;
	p->p_private = private;
	list_link_init(&p->p_node);
	refcount_create(&p->p_refcnt);

	mutex_enter(&arc_prune_mtx);
	refcount_add(&p->p_refcnt, &arc_prune_list);
	list_insert_head(&arc_prune_list, p);
	mutex_exit(&arc_prune_mtx);

	return (p);
}

void
arc_remove_prune_callback(arc_prune_t *p)
{
	boolean_t wait = B_FALSE;
	mutex_enter(&arc_prune_mtx);
	list_remove(&arc_prune_list, p);
	if (refcount_remove(&p->p_refcnt, &arc_prune_list) > 0)
		wait = B_TRUE;
	mutex_exit(&arc_prune_mtx);

	/* wait for arc_prune_task to finish */
	if (wait)
		taskq_wait_outstanding(arc_prune_taskq, 0);
	ASSERT0(refcount_count(&p->p_refcnt));
	refcount_destroy(&p->p_refcnt);
	kmem_free(p, sizeof (*p));
}

/*
 * Notify the arc that a block was freed, and thus will never be used again.
 */
void
arc_freed(spa_t *spa, const blkptr_t *bp)
{
	arc_buf_hdr_t *hdr;
	kmutex_t *hash_lock;
	uint64_t guid = spa_load_guid(spa);

	ASSERT(!BP_IS_EMBEDDED(bp));

	hdr = buf_hash_find(guid, bp, &hash_lock);
	if (hdr == NULL)
		return;

	/*
	 * We might be trying to free a block that is still doing I/O
	 * (i.e. prefetch) or has a reference (i.e. a dedup-ed,
	 * dmu_sync-ed block). If this block is being prefetched, then it
	 * would still have the ARC_FLAG_IO_IN_PROGRESS flag set on the hdr
	 * until the I/O completes.
A block may also have a reference if it is * part of a dedup-ed, dmu_synced write. The dmu_sync() function would * have written the new block to its final resting place on disk but * without the dedup flag set. This would have left the hdr in the MRU * state and discoverable. When the txg finally syncs it detects that * the block was overridden in open context and issues an override I/O. * Since this is a dedup block, the override I/O will determine if the * block is already in the DDT. If so, then it will replace the io_bp * with the bp from the DDT and allow the I/O to finish. When the I/O * reaches the done callback, dbuf_write_override_done, it will * check to see if the io_bp and io_bp_override are identical. * If they are not, then it indicates that the bp was replaced with * the bp in the DDT and the override bp is freed. This allows * us to arrive here with a reference on a block that is being * freed. So if we have an I/O in progress, or a reference to * this hdr, then we don't destroy the hdr. */ if (!HDR_HAS_L1HDR(hdr) || (!HDR_IO_IN_PROGRESS(hdr) && refcount_is_zero(&hdr->b_l1hdr.b_refcnt))) { arc_change_state(arc_anon, hdr, hash_lock); arc_hdr_destroy(hdr); mutex_exit(hash_lock); } else { mutex_exit(hash_lock); } } /* * Release this buffer from the cache, making it an anonymous buffer. This * must be done after a read and prior to modifying the buffer contents. * If the buffer has more than one reference, we must make * a new hdr for the buffer. */ void arc_release(arc_buf_t *buf, void *tag) { kmutex_t *hash_lock; arc_state_t *state; arc_buf_hdr_t *hdr = buf->b_hdr; /* * It would be nice to assert that if its DMU metadata (level > * 0 || it's the dnode file), then it must be syncing context. * But we don't know that information at this level. */ mutex_enter(&buf->b_evict_lock); ASSERT(HDR_HAS_L1HDR(hdr)); /* * We don't grab the hash lock prior to this check, because if * the buffer's header is in the arc_anon state, it won't be * linked into the hash table. */ if (hdr->b_l1hdr.b_state == arc_anon) { mutex_exit(&buf->b_evict_lock); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT(!HDR_IN_HASH_TABLE(hdr)); ASSERT(!HDR_HAS_L2HDR(hdr)); ASSERT(HDR_EMPTY(hdr)); ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1); ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); hdr->b_l1hdr.b_arc_access = 0; /* * If the buf is being overridden then it may already * have a hdr that is not empty. */ buf_discard_identity(hdr); arc_buf_thaw(buf); return; } hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); /* * This assignment is only valid as long as the hash_lock is * held, we must be careful not to reference state or the * b_state field after dropping the lock. */ state = hdr->b_l1hdr.b_state; ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); ASSERT3P(state, !=, arc_anon); /* this buffer is not on any list */ ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), >, 0); if (HDR_HAS_L2HDR(hdr)) { mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx); /* * We have to recheck this conditional again now that * we're holding the l2ad_mtx to prevent a race with * another thread which might be concurrently calling * l2arc_evict(). In that case, l2arc_evict() might have * destroyed the header's L2 portion as we were waiting * to acquire the l2ad_mtx. */ if (HDR_HAS_L2HDR(hdr)) arc_hdr_l2hdr_destroy(hdr); mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx); } /* * Do we have more than one buf? 
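	 * If so, the buf being released still shares this hdr with other
	 * bufs, so it is peeled off onto a freshly allocated anonymous
	 * hdr below; otherwise the existing hdr can simply be moved to
	 * the arc_anon state.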
*/ if (hdr->b_l1hdr.b_bufcnt > 1) { arc_buf_hdr_t *nhdr; uint64_t spa = hdr->b_spa; uint64_t psize = HDR_GET_PSIZE(hdr); uint64_t lsize = HDR_GET_LSIZE(hdr); enum zio_compress compress = HDR_GET_COMPRESS(hdr); arc_buf_contents_t type = arc_buf_type(hdr); arc_buf_t *lastbuf = NULL; VERIFY3U(hdr->b_type, ==, type); ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL); (void) remove_reference(hdr, hash_lock, tag); if (arc_buf_is_shared(buf) && !ARC_BUF_COMPRESSED(buf)) { ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf); ASSERT(ARC_BUF_LAST(buf)); } /* * Pull the data off of this hdr and attach it to * a new anonymous hdr. Also find the last buffer * in the hdr's buffer list. */ lastbuf = arc_buf_remove(hdr, buf); ASSERT3P(lastbuf, !=, NULL); /* * If the current arc_buf_t and the hdr are sharing their data * buffer, then we must stop sharing that block. */ if (arc_buf_is_shared(buf)) { ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf); VERIFY(!arc_buf_is_shared(lastbuf)); /* * First, sever the block sharing relationship between * buf and the arc_buf_hdr_t. Then, setup a new * block sharing relationship with the last buffer * on the arc_buf_t list. */ arc_unshare_buf(hdr, buf); /* * Now we need to recreate the hdr's b_pabd. Since we * have lastbuf handy, we try to share with it, but if * we can't then we allocate a new b_pabd and copy the * data from buf into it. */ if (arc_can_share(hdr, lastbuf)) { arc_share_buf(hdr, lastbuf); } else { arc_hdr_alloc_pabd(hdr); abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data, psize); } VERIFY3P(lastbuf->b_data, !=, NULL); } else if (HDR_SHARED_DATA(hdr)) { /* * Uncompressed shared buffers are always at the end * of the list. Compressed buffers don't have the * same requirements. This makes it hard to * simply assert that the lastbuf is shared so * we rely on the hdr's compression flags to determine * if we have a compressed, shared buffer. */ ASSERT(arc_buf_is_shared(lastbuf) || HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF); ASSERT(!ARC_BUF_SHARED(buf)); } ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); ASSERT3P(state, !=, arc_l2c_only); (void) refcount_remove_many(&state->arcs_size, arc_buf_size(buf), buf); if (refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { ASSERT3P(state, !=, arc_l2c_only); (void) refcount_remove_many(&state->arcs_esize[type], arc_buf_size(buf), buf); } hdr->b_l1hdr.b_bufcnt -= 1; arc_cksum_verify(buf); arc_buf_unwatch(buf); mutex_exit(hash_lock); /* * Allocate a new hdr. The new hdr will contain a b_pabd * buffer which will be freed in arc_write(). 
*/ nhdr = arc_hdr_alloc(spa, psize, lsize, compress, type); ASSERT3P(nhdr->b_l1hdr.b_buf, ==, NULL); ASSERT0(nhdr->b_l1hdr.b_bufcnt); ASSERT0(refcount_count(&nhdr->b_l1hdr.b_refcnt)); VERIFY3U(nhdr->b_type, ==, type); ASSERT(!HDR_SHARED_DATA(nhdr)); nhdr->b_l1hdr.b_buf = buf; nhdr->b_l1hdr.b_bufcnt = 1; nhdr->b_l1hdr.b_mru_hits = 0; nhdr->b_l1hdr.b_mru_ghost_hits = 0; nhdr->b_l1hdr.b_mfu_hits = 0; nhdr->b_l1hdr.b_mfu_ghost_hits = 0; nhdr->b_l1hdr.b_l2_hits = 0; (void) refcount_add(&nhdr->b_l1hdr.b_refcnt, tag); buf->b_hdr = nhdr; mutex_exit(&buf->b_evict_lock); (void) refcount_add_many(&arc_anon->arcs_size, HDR_GET_LSIZE(nhdr), buf); } else { mutex_exit(&buf->b_evict_lock); ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) == 1); /* protected by hash lock, or hdr is on arc_anon */ ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); hdr->b_l1hdr.b_mru_hits = 0; hdr->b_l1hdr.b_mru_ghost_hits = 0; hdr->b_l1hdr.b_mfu_hits = 0; hdr->b_l1hdr.b_mfu_ghost_hits = 0; hdr->b_l1hdr.b_l2_hits = 0; arc_change_state(arc_anon, hdr, hash_lock); hdr->b_l1hdr.b_arc_access = 0; mutex_exit(hash_lock); buf_discard_identity(hdr); arc_buf_thaw(buf); } } int arc_released(arc_buf_t *buf) { int released; mutex_enter(&buf->b_evict_lock); released = (buf->b_data != NULL && buf->b_hdr->b_l1hdr.b_state == arc_anon); mutex_exit(&buf->b_evict_lock); return (released); } #ifdef ZFS_DEBUG int arc_referenced(arc_buf_t *buf) { int referenced; mutex_enter(&buf->b_evict_lock); referenced = (refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt)); mutex_exit(&buf->b_evict_lock); return (referenced); } #endif static void arc_write_ready(zio_t *zio) { arc_write_callback_t *callback = zio->io_private; arc_buf_t *buf = callback->awcb_buf; arc_buf_hdr_t *hdr = buf->b_hdr; uint64_t psize = BP_IS_HOLE(zio->io_bp) ? 0 : BP_GET_PSIZE(zio->io_bp); enum zio_compress compress; fstrans_cookie_t cookie = spl_fstrans_mark(); ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt)); ASSERT(hdr->b_l1hdr.b_bufcnt > 0); /* * If we're reexecuting this zio because the pool suspended, then * cleanup any state that was previously set the first time the * callback was invoked. */ if (zio->io_flags & ZIO_FLAG_REEXECUTED) { arc_cksum_free(hdr); arc_buf_unwatch(buf); if (hdr->b_l1hdr.b_pabd != NULL) { if (arc_buf_is_shared(buf)) { arc_unshare_buf(hdr, buf); } else { arc_hdr_free_pabd(hdr); } } } ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_SHARED_DATA(hdr)); ASSERT(!arc_buf_is_shared(buf)); callback->awcb_ready(zio, buf, callback->awcb_private); if (HDR_IO_IN_PROGRESS(hdr)) ASSERT(zio->io_flags & ZIO_FLAG_REEXECUTED); arc_cksum_compute(buf); arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) { compress = ZIO_COMPRESS_OFF; } else { ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(zio->io_bp)); compress = BP_GET_COMPRESS(zio->io_bp); } HDR_SET_PSIZE(hdr, psize); arc_hdr_set_compress(hdr, compress); /* * Fill the hdr with data. If the hdr is compressed, the data we want * is available from the zio, otherwise we can take it from the buf. * * We might be able to share the buf's data with the hdr here. However, * doing so would cause the ARC to be full of linear ABDs if we write a * lot of shareable data. As a compromise, we check whether scattered * ABDs are allowed, and assume that if they are then the user wants * the ARC to be primarily filled with them regardless of the data being * written. 
Therefore, if they're allowed then we allocate one and copy * the data into it; otherwise, we share the data directly if we can. */ if (zfs_abd_scatter_enabled || !arc_can_share(hdr, buf)) { arc_hdr_alloc_pabd(hdr); /* * Ideally, we would always copy the io_abd into b_pabd, but the * user may have disabled compressed ARC, thus we must check the * hdr's compression setting rather than the io_bp's. */ if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) { ASSERT3U(BP_GET_COMPRESS(zio->io_bp), !=, ZIO_COMPRESS_OFF); ASSERT3U(psize, >, 0); abd_copy(hdr->b_l1hdr.b_pabd, zio->io_abd, psize); } else { ASSERT3U(zio->io_orig_size, ==, arc_hdr_size(hdr)); abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data, arc_buf_size(buf)); } } else { ASSERT3P(buf->b_data, ==, abd_to_buf(zio->io_orig_abd)); ASSERT3U(zio->io_orig_size, ==, arc_buf_size(buf)); ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); arc_share_buf(hdr, buf); } arc_hdr_verify(hdr, zio->io_bp); spl_fstrans_unmark(cookie); } static void arc_write_children_ready(zio_t *zio) { arc_write_callback_t *callback = zio->io_private; arc_buf_t *buf = callback->awcb_buf; callback->awcb_children_ready(zio, buf, callback->awcb_private); } /* * The SPA calls this callback for each physical write that happens on behalf * of a logical write. See the comment in dbuf_write_physdone() for details. */ static void arc_write_physdone(zio_t *zio) { arc_write_callback_t *cb = zio->io_private; if (cb->awcb_physdone != NULL) cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private); } static void arc_write_done(zio_t *zio) { arc_write_callback_t *callback = zio->io_private; arc_buf_t *buf = callback->awcb_buf; arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); if (zio->io_error == 0) { arc_hdr_verify(hdr, zio->io_bp); if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) { buf_discard_identity(hdr); } else { hdr->b_dva = *BP_IDENTITY(zio->io_bp); hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp); } } else { ASSERT(HDR_EMPTY(hdr)); } /* * If the block to be written was all-zero or compressed enough to be * embedded in the BP, no write was performed so there will be no * dva/birth/checksum. The buffer must therefore remain anonymous * (and uncached). */ if (!HDR_EMPTY(hdr)) { arc_buf_hdr_t *exists; kmutex_t *hash_lock; ASSERT3U(zio->io_error, ==, 0); arc_cksum_verify(buf); exists = buf_hash_insert(hdr, &hash_lock); if (exists != NULL) { /* * This can only happen if we overwrite for * sync-to-convergence, because we remove * buffers from the hash table when we arc_free(). 
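			 * (Sync-to-convergence rewrites the same logical
			 * block in place on a later sync pass, so a hdr
			 * with a matching dva/birth may legitimately still
			 * be present in the hash table.)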
*/ if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) panic("bad overwrite, hdr=%p exists=%p", (void *)hdr, (void *)exists); ASSERT(refcount_is_zero( &exists->b_l1hdr.b_refcnt)); arc_change_state(arc_anon, exists, hash_lock); mutex_exit(hash_lock); arc_hdr_destroy(exists); exists = buf_hash_insert(hdr, &hash_lock); ASSERT3P(exists, ==, NULL); } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) { /* nopwrite */ ASSERT(zio->io_prop.zp_nopwrite); if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) panic("bad nopwrite, hdr=%p exists=%p", (void *)hdr, (void *)exists); } else { /* Dedup */ ASSERT(hdr->b_l1hdr.b_bufcnt == 1); ASSERT(hdr->b_l1hdr.b_state == arc_anon); ASSERT(BP_GET_DEDUP(zio->io_bp)); ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); } } arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); /* if it's not anon, we are doing a scrub */ if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon) arc_access(hdr, hash_lock); mutex_exit(hash_lock); } else { arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); } ASSERT(!refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); callback->awcb_done(zio, buf, callback->awcb_private); abd_put(zio->io_abd); kmem_free(callback, sizeof (arc_write_callback_t)); } zio_t * arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *children_ready, arc_done_func_t *physdone, arc_done_func_t *done, void *private, zio_priority_t priority, int zio_flags, const zbookmark_phys_t *zb) { arc_buf_hdr_t *hdr = buf->b_hdr; arc_write_callback_t *callback; zio_t *zio; ASSERT3P(ready, !=, NULL); ASSERT3P(done, !=, NULL); ASSERT(!HDR_IO_ERROR(hdr)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0); if (l2arc) arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); if (ARC_BUF_COMPRESSED(buf)) { ASSERT3U(zp->zp_compress, !=, ZIO_COMPRESS_OFF); zio_flags |= ZIO_FLAG_RAW; } callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); callback->awcb_ready = ready; callback->awcb_children_ready = children_ready; callback->awcb_physdone = physdone; callback->awcb_done = done; callback->awcb_private = private; callback->awcb_buf = buf; /* * The hdr's b_pabd is now stale, free it now. A new data block * will be allocated when the zio pipeline calls arc_write_ready(). */ if (hdr->b_l1hdr.b_pabd != NULL) { /* * If the buf is currently sharing the data block with * the hdr then we need to break that relationship here. * The hdr will remain with a NULL data pointer and the * buf will take sole ownership of the block. */ if (arc_buf_is_shared(buf)) { arc_unshare_buf(hdr, buf); } else { arc_hdr_free_pabd(hdr); } VERIFY3P(buf->b_data, !=, NULL); arc_hdr_set_compress(hdr, ZIO_COMPRESS_OFF); } ASSERT(!arc_buf_is_shared(buf)); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); zio = zio_write(pio, spa, txg, bp, abd_get_from_buf(buf->b_data, HDR_GET_LSIZE(hdr)), HDR_GET_LSIZE(hdr), arc_buf_size(buf), zp, arc_write_ready, (children_ready != NULL) ? 
arc_write_children_ready : NULL, arc_write_physdone, arc_write_done, callback, priority, zio_flags, zb); return (zio); } static int arc_memory_throttle(uint64_t reserve, uint64_t txg) { #ifdef _KERNEL uint64_t available_memory = ptob(freemem); static uint64_t page_load = 0; static uint64_t last_txg = 0; #ifdef __linux__ pgcnt_t minfree = btop(arc_sys_free / 4); #endif #if defined(__i386) available_memory = MIN(available_memory, vmem_size(heap_arena, VMEM_FREE)); #endif if (available_memory > arc_all_memory() * arc_lotsfree_percent / 100) return (0); if (txg > last_txg) { last_txg = txg; page_load = 0; } /* * If we are in pageout, we know that memory is already tight, * the arc is already going to be evicting, so we just want to * continue to let page writes occur as quickly as possible. */ if (current_is_kswapd()) { if (page_load > MAX(ptob(minfree), available_memory) / 4) { DMU_TX_STAT_BUMP(dmu_tx_memory_reclaim); return (SET_ERROR(ERESTART)); } /* Note: reserve is inflated, so we deflate */ page_load += reserve / 8; return (0); } else if (page_load > 0 && arc_reclaim_needed()) { /* memory is low, delay before restarting */ ARCSTAT_INCR(arcstat_memory_throttle_count, 1); DMU_TX_STAT_BUMP(dmu_tx_memory_reclaim); return (SET_ERROR(EAGAIN)); } page_load = 0; #endif return (0); } void arc_tempreserve_clear(uint64_t reserve) { atomic_add_64(&arc_tempreserve, -reserve); ASSERT((int64_t)arc_tempreserve >= 0); } int arc_tempreserve_space(uint64_t reserve, uint64_t txg) { int error; uint64_t anon_size; if (!arc_no_grow && reserve > arc_c/4 && reserve * 4 > (2ULL << SPA_MAXBLOCKSHIFT)) arc_c = MIN(arc_c_max, reserve * 4); /* * Throttle when the calculated memory footprint for the TXG * exceeds the target ARC size. */ if (reserve > arc_c) { DMU_TX_STAT_BUMP(dmu_tx_memory_reserve); return (SET_ERROR(ERESTART)); } /* * Don't count loaned bufs as in flight dirty data to prevent long * network delays from blocking transactions that are ready to be * assigned to a txg. */ anon_size = MAX((int64_t)(refcount_count(&arc_anon->arcs_size) - arc_loaned_bytes), 0); /* * Writes will, almost always, require additional memory allocations * in order to compress/encrypt/etc the data. We therefore need to * make sure that there is sufficient available memory for this. */ error = arc_memory_throttle(reserve, txg); if (error != 0) return (error); /* * Throttle writes when the amount of dirty data in the cache * gets too large. We try to keep the cache less than half full * of dirty blocks so that our sync times don't grow too large. * Note: if two requests come in concurrently, we might let them * both succeed, when one of them should fail. Not a huge deal. 
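	 * To make the bound below concrete: with arc_c at 4G, a new
	 * reserve is refused once reserve + arc_tempreserve + anon_size
	 * would exceed 2G while anonymous data alone exceeds 1G.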
*/ if (reserve + arc_tempreserve + anon_size > arc_c / 2 && anon_size > arc_c / 4) { uint64_t meta_esize = refcount_count(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); uint64_t data_esize = refcount_count(&arc_anon->arcs_esize[ARC_BUFC_DATA]); dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK " "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n", arc_tempreserve >> 10, meta_esize >> 10, data_esize >> 10, reserve >> 10, arc_c >> 10); DMU_TX_STAT_BUMP(dmu_tx_dirty_throttle); return (SET_ERROR(ERESTART)); } atomic_add_64(&arc_tempreserve, reserve); return (0); } static void arc_kstat_update_state(arc_state_t *state, kstat_named_t *size, kstat_named_t *evict_data, kstat_named_t *evict_metadata) { size->value.ui64 = refcount_count(&state->arcs_size); evict_data->value.ui64 = refcount_count(&state->arcs_esize[ARC_BUFC_DATA]); evict_metadata->value.ui64 = refcount_count(&state->arcs_esize[ARC_BUFC_METADATA]); } static int arc_kstat_update(kstat_t *ksp, int rw) { arc_stats_t *as = ksp->ks_data; if (rw == KSTAT_WRITE) { return (EACCES); } else { arc_kstat_update_state(arc_anon, &as->arcstat_anon_size, &as->arcstat_anon_evictable_data, &as->arcstat_anon_evictable_metadata); arc_kstat_update_state(arc_mru, &as->arcstat_mru_size, &as->arcstat_mru_evictable_data, &as->arcstat_mru_evictable_metadata); arc_kstat_update_state(arc_mru_ghost, &as->arcstat_mru_ghost_size, &as->arcstat_mru_ghost_evictable_data, &as->arcstat_mru_ghost_evictable_metadata); arc_kstat_update_state(arc_mfu, &as->arcstat_mfu_size, &as->arcstat_mfu_evictable_data, &as->arcstat_mfu_evictable_metadata); arc_kstat_update_state(arc_mfu_ghost, &as->arcstat_mfu_ghost_size, &as->arcstat_mfu_ghost_evictable_data, &as->arcstat_mfu_ghost_evictable_metadata); } return (0); } /* * This function *must* return indices evenly distributed between all * sublists of the multilist. This is needed due to how the ARC eviction * code is laid out; arc_evict_state() assumes ARC buffers are evenly * distributed between all sublists and uses this assumption when * deciding which sublist to evict from and how much to evict from it. */ unsigned int arc_state_multilist_index_func(multilist_t *ml, void *obj) { arc_buf_hdr_t *hdr = obj; /* * We rely on b_dva to generate evenly distributed index * numbers using buf_hash below. So, as an added precaution, * let's make sure we never add empty buffers to the arc lists. */ ASSERT(!HDR_EMPTY(hdr)); /* * The assumption here, is the hash value for a given * arc_buf_hdr_t will remain constant throughout its lifetime * (i.e. its b_spa, b_dva, and b_birth fields don't change). * Thus, we don't need to store the header's sublist index * on insertion, as this index can be recalculated on removal. * * Also, the low order bits of the hash value are thought to be * distributed evenly. Otherwise, in the case that the multilist * has a power of two number of sublists, each sublists' usage * would not be evenly distributed. */ return (buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth) % multilist_get_num_sublists(ml)); } /* * Called during module initialization and periodically thereafter to * apply reasonable changes to the exposed performance tunings. Non-zero * zfs_* values which differ from the currently set values will be applied. 
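 *
 * For example, on Linux the tunables surface as module parameters, so
 * something like
 *
 *	echo 4294967296 > /sys/module/zfs/parameters/zfs_arc_max
 *
 * should be picked up here on the next pass and clamp arc_c_max to 4G,
 * provided the new value passes the range checks below.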
 */
static void
arc_tuning_update(void)
{
	uint64_t percent, allmem = arc_all_memory();

	/* Valid range: 64M - <all physical memory> */
	if ((zfs_arc_max) && (zfs_arc_max != arc_c_max) &&
	    (zfs_arc_max > 64 << 20) && (zfs_arc_max < allmem) &&
	    (zfs_arc_max > arc_c_min)) {
		arc_c_max = zfs_arc_max;
		arc_c = arc_c_max;
		arc_p = (arc_c >> 1);
		/* Valid range of arc_meta_limit: arc_meta_min - arc_c_max */
		percent = MIN(zfs_arc_meta_limit_percent, 100);
		arc_meta_limit = MAX(arc_meta_min, (percent * arc_c_max) / 100);
		percent = MIN(zfs_arc_dnode_limit_percent, 100);
		arc_dnode_limit = (percent * arc_meta_limit) / 100;
	}

	/* Valid range: 32M - <arc_c_max> */
	if ((zfs_arc_min) && (zfs_arc_min != arc_c_min) &&
	    (zfs_arc_min >= 2ULL << SPA_MAXBLOCKSHIFT) &&
	    (zfs_arc_min <= arc_c_max)) {
		arc_c_min = zfs_arc_min;
		arc_c = MAX(arc_c, arc_c_min);
	}

	/* Valid range: 16M - <arc_c_max> */
	if ((zfs_arc_meta_min) && (zfs_arc_meta_min != arc_meta_min) &&
	    (zfs_arc_meta_min >= 1ULL << SPA_MAXBLOCKSHIFT) &&
	    (zfs_arc_meta_min <= arc_c_max)) {
		arc_meta_min = zfs_arc_meta_min;
		arc_meta_limit = MAX(arc_meta_limit, arc_meta_min);
		arc_dnode_limit = arc_meta_limit / 10;
	}

	/* Valid range: <arc_meta_min> - <arc_c_max> */
	if ((zfs_arc_meta_limit) && (zfs_arc_meta_limit != arc_meta_limit) &&
	    (zfs_arc_meta_limit >= zfs_arc_meta_min) &&
	    (zfs_arc_meta_limit <= arc_c_max))
		arc_meta_limit = zfs_arc_meta_limit;

	/* Valid range: <arc_meta_min> - <arc_c_max> */
	if ((zfs_arc_dnode_limit) && (zfs_arc_dnode_limit != arc_dnode_limit) &&
	    (zfs_arc_dnode_limit >= zfs_arc_meta_min) &&
	    (zfs_arc_dnode_limit <= arc_c_max))
		arc_dnode_limit = zfs_arc_dnode_limit;

	/* Valid range: 1 - N */
	if (zfs_arc_grow_retry)
		arc_grow_retry = zfs_arc_grow_retry;

	/* Valid range: 1 - N */
	if (zfs_arc_shrink_shift) {
		arc_shrink_shift = zfs_arc_shrink_shift;
		arc_no_grow_shift = MIN(arc_no_grow_shift, arc_shrink_shift - 1);
	}

	/* Valid range: 1 - N */
	if (zfs_arc_p_min_shift)
		arc_p_min_shift = zfs_arc_p_min_shift;

	/* Valid range: 1 - N ticks */
	if (zfs_arc_min_prefetch_lifespan)
		arc_min_prefetch_lifespan = zfs_arc_min_prefetch_lifespan;

	/* Valid range: 0 - 100 */
	if ((zfs_arc_lotsfree_percent >= 0) &&
	    (zfs_arc_lotsfree_percent <= 100))
		arc_lotsfree_percent = zfs_arc_lotsfree_percent;

	/* Valid range: 0 - <all physical memory> */
	if ((zfs_arc_sys_free) && (zfs_arc_sys_free != arc_sys_free))
		arc_sys_free = MIN(MAX(zfs_arc_sys_free, 0), allmem);
}

static void
arc_state_init(void)
{
	arc_anon = &ARC_anon;
	arc_mru = &ARC_mru;
	arc_mru_ghost = &ARC_mru_ghost;
	arc_mfu = &ARC_mfu;
	arc_mfu_ghost = &ARC_mfu_ghost;
	arc_l2c_only = &ARC_l2c_only;

	multilist_create(&arc_mru->arcs_list[ARC_BUFC_METADATA],
	    sizeof (arc_buf_hdr_t),
	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
	    zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
	multilist_create(&arc_mru->arcs_list[ARC_BUFC_DATA],
	    sizeof (arc_buf_hdr_t),
	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
	    zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
	multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA],
	    sizeof (arc_buf_hdr_t),
	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
	    zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
	multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA],
	    sizeof (arc_buf_hdr_t),
	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
	    zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
	multilist_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA],
	    sizeof (arc_buf_hdr_t),
	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
	    zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
	multilist_create(&arc_mfu->arcs_list[ARC_BUFC_DATA],
	    sizeof (arc_buf_hdr_t),
	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA], sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA], sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA], sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA], sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); refcount_create(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); refcount_create(&arc_anon->arcs_esize[ARC_BUFC_DATA]); refcount_create(&arc_mru->arcs_esize[ARC_BUFC_METADATA]); refcount_create(&arc_mru->arcs_esize[ARC_BUFC_DATA]); refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]); refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]); refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]); refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_DATA]); refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]); refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]); refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]); refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]); refcount_create(&arc_anon->arcs_size); refcount_create(&arc_mru->arcs_size); refcount_create(&arc_mru_ghost->arcs_size); refcount_create(&arc_mfu->arcs_size); refcount_create(&arc_mfu_ghost->arcs_size); refcount_create(&arc_l2c_only->arcs_size); arc_anon->arcs_state = ARC_STATE_ANON; arc_mru->arcs_state = ARC_STATE_MRU; arc_mru_ghost->arcs_state = ARC_STATE_MRU_GHOST; arc_mfu->arcs_state = ARC_STATE_MFU; arc_mfu_ghost->arcs_state = ARC_STATE_MFU_GHOST; arc_l2c_only->arcs_state = ARC_STATE_L2C_ONLY; } static void arc_state_fini(void) { refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_DATA]); refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_METADATA]); refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_DATA]); refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]); refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]); refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]); refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_DATA]); refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]); refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]); refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]); refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]); refcount_destroy(&arc_anon->arcs_size); refcount_destroy(&arc_mru->arcs_size); refcount_destroy(&arc_mru_ghost->arcs_size); refcount_destroy(&arc_mfu->arcs_size); refcount_destroy(&arc_mfu_ghost->arcs_size); refcount_destroy(&arc_l2c_only->arcs_size); multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]); multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]); multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]); multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]); multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); 
multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]); multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]); } uint64_t arc_max_bytes(void) { return (arc_c_max); } void arc_init(void) { uint64_t percent, allmem = arc_all_memory(); mutex_init(&arc_reclaim_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&arc_reclaim_thread_cv, NULL, CV_DEFAULT, NULL); cv_init(&arc_reclaim_waiters_cv, NULL, CV_DEFAULT, NULL); /* Convert seconds to clock ticks */ arc_min_prefetch_lifespan = 1 * hz; #ifdef _KERNEL /* * Register a shrinker to support synchronous (direct) memory * reclaim from the arc. This is done to prevent kswapd from * swapping out pages when it is preferable to shrink the arc. */ spl_register_shrinker(&arc_shrinker); /* Set to 1/64 of all memory or a minimum of 512K */ arc_sys_free = MAX(allmem / 64, (512 * 1024)); arc_need_free = 0; #endif /* Set max to 1/2 of all memory */ arc_c_max = allmem / 2; /* * In userland, there's only the memory pressure that we artificially * create (see arc_available_memory()). Don't let arc_c get too * small, because it can cause transactions to be larger than * arc_c, causing arc_tempreserve_space() to fail. */ #ifndef _KERNEL arc_c_min = MAX(arc_c_max / 2, 2ULL << SPA_MAXBLOCKSHIFT); #else arc_c_min = 2ULL << SPA_MAXBLOCKSHIFT; #endif arc_c = arc_c_max; arc_p = (arc_c >> 1); arc_size = 0; /* Set min to 1/2 of arc_c_min */ arc_meta_min = 1ULL << SPA_MAXBLOCKSHIFT; /* Initialize maximum observed usage to zero */ arc_meta_max = 0; /* * Set arc_meta_limit to a percent of arc_c_max with a floor of * arc_meta_min, and a ceiling of arc_c_max. */ percent = MIN(zfs_arc_meta_limit_percent, 100); arc_meta_limit = MAX(arc_meta_min, (percent * arc_c_max) / 100); percent = MIN(zfs_arc_dnode_limit_percent, 100); arc_dnode_limit = (percent * arc_meta_limit) / 100; /* Apply user specified tunings */ arc_tuning_update(); if (zfs_arc_num_sublists_per_state < 1) zfs_arc_num_sublists_per_state = MAX(boot_ncpus, 1); /* if kmem_flags are set, lets try to use less memory */ if (kmem_debugging()) arc_c = arc_c / 2; if (arc_c < arc_c_min) arc_c = arc_c_min; arc_state_init(); buf_init(); list_create(&arc_prune_list, sizeof (arc_prune_t), offsetof(arc_prune_t, p_node)); mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL); arc_prune_taskq = taskq_create("arc_prune", max_ncpus, defclsyspri, max_ncpus, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC); arc_reclaim_thread_exit = B_FALSE; arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (arc_ksp != NULL) { arc_ksp->ks_data = &arc_stats; arc_ksp->ks_update = arc_kstat_update; kstat_install(arc_ksp); } (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0, TS_RUN, defclsyspri); arc_dead = B_FALSE; arc_warm = B_FALSE; /* * Calculate maximum amount of dirty data per pool. * * If it has been set by a module parameter, take that. * Otherwise, use a percentage of physical memory defined by * zfs_dirty_data_max_percent (default 10%) with a cap at * zfs_dirty_data_max_max (default 25% of physical memory). 
*/ if (zfs_dirty_data_max_max == 0) zfs_dirty_data_max_max = allmem * zfs_dirty_data_max_max_percent / 100; if (zfs_dirty_data_max == 0) { zfs_dirty_data_max = allmem * zfs_dirty_data_max_percent / 100; zfs_dirty_data_max = MIN(zfs_dirty_data_max, zfs_dirty_data_max_max); } } void arc_fini(void) { arc_prune_t *p; #ifdef _KERNEL spl_unregister_shrinker(&arc_shrinker); #endif /* _KERNEL */ mutex_enter(&arc_reclaim_lock); arc_reclaim_thread_exit = B_TRUE; /* * The reclaim thread will set arc_reclaim_thread_exit back to * B_FALSE when it is finished exiting; we're waiting for that. */ while (arc_reclaim_thread_exit) { cv_signal(&arc_reclaim_thread_cv); cv_wait(&arc_reclaim_thread_cv, &arc_reclaim_lock); } mutex_exit(&arc_reclaim_lock); /* Use B_TRUE to ensure *all* buffers are evicted */ arc_flush(NULL, B_TRUE); arc_dead = B_TRUE; if (arc_ksp != NULL) { kstat_delete(arc_ksp); arc_ksp = NULL; } taskq_wait(arc_prune_taskq); taskq_destroy(arc_prune_taskq); mutex_enter(&arc_prune_mtx); while ((p = list_head(&arc_prune_list)) != NULL) { list_remove(&arc_prune_list, p); refcount_remove(&p->p_refcnt, &arc_prune_list); refcount_destroy(&p->p_refcnt); kmem_free(p, sizeof (*p)); } mutex_exit(&arc_prune_mtx); list_destroy(&arc_prune_list); mutex_destroy(&arc_prune_mtx); mutex_destroy(&arc_reclaim_lock); cv_destroy(&arc_reclaim_thread_cv); cv_destroy(&arc_reclaim_waiters_cv); arc_state_fini(); buf_fini(); ASSERT0(arc_loaned_bytes); } /* * Level 2 ARC * * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk. * It uses dedicated storage devices to hold cached data, which are populated * using large infrequent writes. The main role of this cache is to boost * the performance of random read workloads. The intended L2ARC devices * include short-stroked disks, solid state disks, and other media with * substantially faster read latency than disk. * * +-----------------------+ * | ARC | * +-----------------------+ * | ^ ^ * | | | * l2arc_feed_thread() arc_read() * | | | * | l2arc read | * V | | * +---------------+ | * | L2ARC | | * +---------------+ | * | ^ | * l2arc_write() | | * | | | * V | | * +-------+ +-------+ * | vdev | | vdev | * | cache | | cache | * +-------+ +-------+ * +=========+ .-----. * : L2ARC : |-_____-| * : devices : | Disks | * +=========+ `-_____-' * * Read requests are satisfied from the following sources, in order: * * 1) ARC * 2) vdev cache of L2ARC devices * 3) L2ARC devices * 4) vdev cache of disks * 5) disks * * Some L2ARC device types exhibit extremely slow write performance. * To accommodate for this there are some significant differences between * the L2ARC and traditional cache design: * * 1. There is no eviction path from the ARC to the L2ARC. Evictions from * the ARC behave as usual, freeing buffers and placing headers on ghost * lists. The ARC does not send buffers to the L2ARC during eviction as * this would add inflated write latencies for all ARC memory pressure. * * 2. The L2ARC attempts to cache data from the ARC before it is evicted. * It does this by periodically scanning buffers from the eviction-end of * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are * not already there. It scans until a headroom of buffers is satisfied, * which itself is a buffer for ARC eviction. 
If a compressible buffer is * found during scanning and selected for writing to an L2ARC device, we * temporarily boost scanning headroom during the next scan cycle to make * sure we adapt to compression effects (which might significantly reduce * the data volume we write to L2ARC). The thread that does this is * l2arc_feed_thread(), illustrated below; example sizes are included to * provide a better sense of ratio than this diagram: * * head --> tail * +---------------------+----------+ * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC * +---------------------+----------+ | o L2ARC eligible * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer * +---------------------+----------+ | * 15.9 Gbytes ^ 32 Mbytes | * headroom | * l2arc_feed_thread() * | * l2arc write hand <--[oooo]--' * | 8 Mbyte * | write max * V * +==============================+ * L2ARC dev |####|#|###|###| |####| ... | * +==============================+ * 32 Gbytes * * 3. If an ARC buffer is copied to the L2ARC but then hit instead of * evicted, then the L2ARC has cached a buffer much sooner than it probably * needed to, potentially wasting L2ARC device bandwidth and storage. It is * safe to say that this is an uncommon case, since buffers at the end of * the ARC lists have moved there due to inactivity. * * 4. If the ARC evicts faster than the L2ARC can maintain a headroom, * then the L2ARC simply misses copying some buffers. This serves as a * pressure valve to prevent heavy read workloads from both stalling the ARC * with waits and clogging the L2ARC with writes. This also helps prevent * the potential for the L2ARC to churn if it attempts to cache content too * quickly, such as during backups of the entire pool. * * 5. After system boot and before the ARC has filled main memory, there are * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru * lists can remain mostly static. Instead of searching from tail of these * lists as pictured, the l2arc_feed_thread() will search from the list heads * for eligible buffers, greatly increasing its chance of finding them. * * The L2ARC device write speed is also boosted during this time so that * the L2ARC warms up faster. Since there have been no ARC evictions yet, * there are no L2ARC reads, and no fear of degrading read performance * through increased writes. * * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that * the vdev queue can aggregate them into larger and fewer writes. Each * device is written to in a rotor fashion, sweeping writes through * available space then repeating. * * 7. The L2ARC does not store dirty content. It never needs to flush * write buffers back to disk based storage. * * 8. If an ARC buffer is written (and dirtied) which also exists in the * L2ARC, the now stale L2ARC buffer is immediately dropped. 
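 *
 * To put rough numbers on the mechanism above: with the default
 * l2arc_write_max of 8 Mbytes and l2arc_headroom of 2, each feed cycle
 * scans on the order of 16 Mbytes from the eviction end of the lists
 * and writes at most 8 Mbytes (plus l2arc_write_boost while arc_warm
 * is still false).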
* * The performance of the L2ARC can be tweaked by a number of tunables, which * may be necessary for different workloads: * * l2arc_write_max max write bytes per interval * l2arc_write_boost extra write bytes during device warmup * l2arc_noprefetch skip caching prefetched buffers * l2arc_headroom number of max device writes to precache * l2arc_headroom_boost when we find compressed buffers during ARC * scanning, we multiply headroom by this * percentage factor for the next scan cycle, * since more compressed buffers are likely to * be present * l2arc_feed_secs seconds between L2ARC writing * * Tunables may be removed or added as future performance improvements are * integrated, and also may become zpool properties. * * There are three key functions that control how the L2ARC warms up: * * l2arc_write_eligible() check if a buffer is eligible to cache * l2arc_write_size() calculate how much to write * l2arc_write_interval() calculate sleep delay between writes * * These three functions determine what to write, how much, and how quickly * to send writes. */ static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr) { /* * A buffer is *not* eligible for the L2ARC if it: * 1. belongs to a different spa. * 2. is already cached on the L2ARC. * 3. has an I/O in progress (it may be an incomplete read). * 4. is flagged not eligible (zfs property). */ if (hdr->b_spa != spa_guid || HDR_HAS_L2HDR(hdr) || HDR_IO_IN_PROGRESS(hdr) || !HDR_L2CACHE(hdr)) return (B_FALSE); return (B_TRUE); } static uint64_t l2arc_write_size(void) { uint64_t size; /* * Make sure our globals have meaningful values in case the user * altered them. */ size = l2arc_write_max; if (size == 0) { cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must " "be greater than zero, resetting it to the default (%d)", L2ARC_WRITE_SIZE); size = l2arc_write_max = L2ARC_WRITE_SIZE; } if (arc_warm == B_FALSE) size += l2arc_write_boost; return (size); } static clock_t l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote) { clock_t interval, next, now; /* * If the ARC lists are busy, increase our write rate; if the * lists are stale, idle back. This is achieved by checking * how much we previously wrote - if it was more than half of * what we wanted, schedule the next write much sooner. */ if (l2arc_feed_again && wrote > (wanted / 2)) interval = (hz * l2arc_feed_min_ms) / 1000; else interval = hz * l2arc_feed_secs; now = ddi_get_lbolt(); next = MAX(now, MIN(now + interval, began + interval)); return (next); } /* * Cycle through L2ARC devices. This is how L2ARC load balances. * If a device is returned, this also returns holding the spa config lock. */ static l2arc_dev_t * l2arc_dev_get_next(void) { l2arc_dev_t *first, *next = NULL; /* * Lock out the removal of spas (spa_namespace_lock), then removal * of cache devices (l2arc_dev_mtx). Once a device has been selected, * both locks will be dropped and a spa config lock held instead. 
 */
	mutex_enter(&spa_namespace_lock);
	mutex_enter(&l2arc_dev_mtx);

	/* if there are no vdevs, there is nothing to do */
	if (l2arc_ndev == 0)
		goto out;

	first = NULL;
	next = l2arc_dev_last;
	do {
		/* loop around the list looking for a non-faulted vdev */
		if (next == NULL) {
			next = list_head(l2arc_dev_list);
		} else {
			next = list_next(l2arc_dev_list, next);
			if (next == NULL)
				next = list_head(l2arc_dev_list);
		}

		/* if we have come back to the start, bail out */
		if (first == NULL)
			first = next;
		else if (next == first)
			break;

	} while (vdev_is_dead(next->l2ad_vdev));

	/* if we were unable to find any usable vdevs, return NULL */
	if (vdev_is_dead(next->l2ad_vdev))
		next = NULL;

	l2arc_dev_last = next;

out:
	mutex_exit(&l2arc_dev_mtx);

	/*
	 * Grab the config lock to prevent the 'next' device from being
	 * removed while we are writing to it.
	 */
	if (next != NULL)
		spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
	mutex_exit(&spa_namespace_lock);

	return (next);
}

/*
 * Free buffers that were tagged for destruction.
 */
static void
l2arc_do_free_on_write(void)
{
	list_t *buflist;
	l2arc_data_free_t *df, *df_prev;

	mutex_enter(&l2arc_free_on_write_mtx);
	buflist = l2arc_free_on_write;

	for (df = list_tail(buflist); df; df = df_prev) {
		df_prev = list_prev(buflist, df);
		ASSERT3P(df->l2df_abd, !=, NULL);
		abd_free(df->l2df_abd);
		list_remove(buflist, df);
		kmem_free(df, sizeof (l2arc_data_free_t));
	}

	mutex_exit(&l2arc_free_on_write_mtx);
}

/*
 * A write to a cache device has completed.  Update all headers to allow
 * reads from these buffers to begin.
 */
static void
l2arc_write_done(zio_t *zio)
{
	l2arc_write_callback_t *cb;
	l2arc_dev_t *dev;
	list_t *buflist;
	arc_buf_hdr_t *head, *hdr, *hdr_prev;
	kmutex_t *hash_lock;
	int64_t bytes_dropped = 0;

	cb = zio->io_private;
	ASSERT3P(cb, !=, NULL);
	dev = cb->l2wcb_dev;
	ASSERT3P(dev, !=, NULL);
	head = cb->l2wcb_head;
	ASSERT3P(head, !=, NULL);
	buflist = &dev->l2ad_buflist;
	ASSERT3P(buflist, !=, NULL);
	DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
	    l2arc_write_callback_t *, cb);

	if (zio->io_error != 0)
		ARCSTAT_BUMP(arcstat_l2_writes_error);

	/*
	 * All writes completed, or an error was hit.
	 */
top:
	mutex_enter(&dev->l2ad_mtx);
	for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) {
		hdr_prev = list_prev(buflist, hdr);

		hash_lock = HDR_LOCK(hdr);

		/*
		 * We cannot use mutex_enter or else we can deadlock
		 * with l2arc_write_buffers (due to swapping the order
		 * in which the hash lock and l2ad_mtx are taken).
		 */
		if (!mutex_tryenter(hash_lock)) {
			/*
			 * Missed the hash lock.  We must retry so we
			 * don't leave the ARC_FLAG_L2_WRITING bit set.
			 */
			ARCSTAT_BUMP(arcstat_l2_writes_lock_retry);

			/*
			 * We don't want to rescan the headers we've
			 * already marked as having been written out, so
			 * we reinsert the head node so we can pick up
			 * where we left off.
			 */
			list_remove(buflist, head);
			list_insert_after(buflist, hdr, head);

			mutex_exit(&dev->l2ad_mtx);

			/*
			 * We wait for the hash lock to become available
			 * to try and prevent busy waiting, and increase
			 * the chance we'll be able to acquire the lock
			 * the next time around.
			 */
			mutex_enter(hash_lock);
			mutex_exit(hash_lock);
			goto top;
		}

		/*
		 * We could not have been moved into the arc_l2c_only
		 * state while in-flight due to our ARC_FLAG_L2_WRITING
		 * bit being set. Let's just ensure that's being enforced.
		 */
		ASSERT(HDR_HAS_L1HDR(hdr));

		/*
		 * Skipped - drop L2ARC entry and mark the header as no
		 * longer L2 eligible.
		 */
		if (zio->io_error != 0) {
			/*
			 * Error - drop L2ARC entry.
*/ list_remove(buflist, hdr); arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR); ARCSTAT_INCR(arcstat_l2_asize, -arc_hdr_size(hdr)); ARCSTAT_INCR(arcstat_l2_size, -HDR_GET_LSIZE(hdr)); bytes_dropped += arc_hdr_size(hdr); (void) refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr); } /* * Allow ARC to begin reads and ghost list evictions to * this L2ARC entry. */ arc_hdr_clear_flags(hdr, ARC_FLAG_L2_WRITING); mutex_exit(hash_lock); } atomic_inc_64(&l2arc_writes_done); list_remove(buflist, head); ASSERT(!HDR_HAS_L1HDR(head)); kmem_cache_free(hdr_l2only_cache, head); mutex_exit(&dev->l2ad_mtx); vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0); l2arc_do_free_on_write(); kmem_free(cb, sizeof (l2arc_write_callback_t)); } /* * A read to a cache device completed. Validate buffer contents before * handing over to the regular ARC routines. */ static void l2arc_read_done(zio_t *zio) { l2arc_read_callback_t *cb; arc_buf_hdr_t *hdr; kmutex_t *hash_lock; boolean_t valid_cksum; ASSERT3P(zio->io_vd, !=, NULL); ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE); spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd); cb = zio->io_private; ASSERT3P(cb, !=, NULL); hdr = cb->l2rcb_hdr; ASSERT3P(hdr, !=, NULL); hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); ASSERT3P(zio->io_abd, !=, NULL); /* * Check this survived the L2ARC journey. */ ASSERT3P(zio->io_abd, ==, hdr->b_l1hdr.b_pabd); zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */ zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */ valid_cksum = arc_cksum_is_equal(hdr, zio); if (valid_cksum && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) { mutex_exit(hash_lock); zio->io_private = hdr; arc_read_done(zio); } else { mutex_exit(hash_lock); /* * Buffer didn't survive caching. Increment stats and * reissue to the original storage device. */ if (zio->io_error != 0) { ARCSTAT_BUMP(arcstat_l2_io_error); } else { zio->io_error = SET_ERROR(EIO); } if (!valid_cksum) ARCSTAT_BUMP(arcstat_l2_cksum_bad); /* * If there's no waiter, issue an async i/o to the primary * storage now. If there *is* a waiter, the caller must * issue the i/o in a context where it's OK to block. */ if (zio->io_waiter == NULL) { zio_t *pio = zio_unique_parent(zio); ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL); zio_nowait(zio_read(pio, zio->io_spa, zio->io_bp, hdr->b_l1hdr.b_pabd, zio->io_size, arc_read_done, hdr, zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb)); } } kmem_free(cb, sizeof (l2arc_read_callback_t)); } /* * This is the list priority from which the L2ARC will search for pages to * cache. This is used within loops (0..3) to cycle through lists in the * desired order. This order can have a significant effect on cache * performance. * * Currently the metadata lists are hit first, MFU then MRU, followed by * the data lists. This function returns a locked list, and also returns * the lock pointer. */ static multilist_sublist_t * l2arc_sublist_lock(int list_num) { multilist_t *ml = NULL; unsigned int idx; ASSERT(list_num >= 0 && list_num < L2ARC_FEED_TYPES); switch (list_num) { case 0: ml = &arc_mfu->arcs_list[ARC_BUFC_METADATA]; break; case 1: ml = &arc_mru->arcs_list[ARC_BUFC_METADATA]; break; case 2: ml = &arc_mfu->arcs_list[ARC_BUFC_DATA]; break; case 3: ml = &arc_mru->arcs_list[ARC_BUFC_DATA]; break; default: return (NULL); } /* * Return a randomly-selected sublist. This is acceptable * because the caller feeds only a little bit of data for each * call (8MB). 
Subsequent calls will result in different * sublists being selected. */ idx = multilist_get_random_index(ml); return (multilist_sublist_lock(ml, idx)); } /* * Evict buffers from the device write hand to the distance specified in * bytes. This distance may span populated buffers, it may span nothing. * This is clearing a region on the L2ARC device ready for writing. * If the 'all' boolean is set, every buffer is evicted. */ static void l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) { list_t *buflist; arc_buf_hdr_t *hdr, *hdr_prev; kmutex_t *hash_lock; uint64_t taddr; buflist = &dev->l2ad_buflist; if (!all && dev->l2ad_first) { /* * This is the first sweep through the device. There is * nothing to evict. */ return; } if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) { /* * When nearing the end of the device, evict to the end * before the device write hand jumps to the start. */ taddr = dev->l2ad_end; } else { taddr = dev->l2ad_hand + distance; } DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist, uint64_t, taddr, boolean_t, all); top: mutex_enter(&dev->l2ad_mtx); for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) { hdr_prev = list_prev(buflist, hdr); hash_lock = HDR_LOCK(hdr); /* * We cannot use mutex_enter or else we can deadlock * with l2arc_write_buffers (due to swapping the order * the hash lock and l2ad_mtx are taken). */ if (!mutex_tryenter(hash_lock)) { /* * Missed the hash lock. Retry. */ ARCSTAT_BUMP(arcstat_l2_evict_lock_retry); mutex_exit(&dev->l2ad_mtx); mutex_enter(hash_lock); mutex_exit(hash_lock); goto top; } if (HDR_L2_WRITE_HEAD(hdr)) { /* * We hit a write head node. Leave it for * l2arc_write_done(). */ list_remove(buflist, hdr); mutex_exit(hash_lock); continue; } if (!all && HDR_HAS_L2HDR(hdr) && (hdr->b_l2hdr.b_daddr > taddr || hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) { /* * We've evicted to the target address, * or the end of the device. */ mutex_exit(hash_lock); break; } ASSERT(HDR_HAS_L2HDR(hdr)); if (!HDR_HAS_L1HDR(hdr)) { ASSERT(!HDR_L2_READING(hdr)); /* * This doesn't exist in the ARC. Destroy. * arc_hdr_destroy() will call list_remove() * and decrement arcstat_l2_size. */ arc_change_state(arc_anon, hdr, hash_lock); arc_hdr_destroy(hdr); } else { ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only); ARCSTAT_BUMP(arcstat_l2_evict_l1cached); /* * Invalidate issued or about to be issued * reads, since we may be about to write * over this location. */ if (HDR_L2_READING(hdr)) { ARCSTAT_BUMP(arcstat_l2_evict_reading); arc_hdr_set_flags(hdr, ARC_FLAG_L2_EVICTED); } /* Ensure this header has finished being written */ ASSERT(!HDR_L2_WRITING(hdr)); arc_hdr_l2hdr_destroy(hdr); } mutex_exit(hash_lock); } mutex_exit(&dev->l2ad_mtx); } /* * Find and write ARC buffers to the L2ARC device. * * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid * for reading until they have completed writing. * The headroom_boost is an in-out parameter used to maintain headroom boost * state between calls to this function. * * Returns the number of bytes actually written (which may be smaller than * the delta by which the device hand has changed due to alignment). 
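 *
 * For example, on a cache device with ashift=12 a 5 Kbyte buffer
 * consumes two 4 Kbyte sectors, so the write hand advances by 8 Kbytes
 * while only 5 Kbytes are counted as written.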
*/ static uint64_t l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) { arc_buf_hdr_t *hdr, *hdr_prev, *head; uint64_t write_asize, write_psize, write_sz, headroom; boolean_t full; l2arc_write_callback_t *cb; zio_t *pio, *wzio; uint64_t guid = spa_load_guid(spa); int try; ASSERT3P(dev->l2ad_vdev, !=, NULL); pio = NULL; write_sz = write_asize = write_psize = 0; full = B_FALSE; head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE); arc_hdr_set_flags(head, ARC_FLAG_L2_WRITE_HEAD | ARC_FLAG_HAS_L2HDR); /* * Copy buffers for L2ARC writing. */ for (try = 0; try < L2ARC_FEED_TYPES; try++) { multilist_sublist_t *mls = l2arc_sublist_lock(try); uint64_t passed_sz = 0; VERIFY3P(mls, !=, NULL); /* * L2ARC fast warmup. * * Until the ARC is warm and starts to evict, read from the * head of the ARC lists rather than the tail. */ if (arc_warm == B_FALSE) hdr = multilist_sublist_head(mls); else hdr = multilist_sublist_tail(mls); headroom = target_sz * l2arc_headroom; if (zfs_compressed_arc_enabled) headroom = (headroom * l2arc_headroom_boost) / 100; for (; hdr; hdr = hdr_prev) { kmutex_t *hash_lock; uint64_t asize, size; abd_t *to_write; if (arc_warm == B_FALSE) hdr_prev = multilist_sublist_next(mls, hdr); else hdr_prev = multilist_sublist_prev(mls, hdr); hash_lock = HDR_LOCK(hdr); if (!mutex_tryenter(hash_lock)) { /* * Skip this buffer rather than waiting. */ continue; } passed_sz += HDR_GET_LSIZE(hdr); if (passed_sz > headroom) { /* * Searched too far. */ mutex_exit(hash_lock); break; } if (!l2arc_write_eligible(guid, hdr)) { mutex_exit(hash_lock); continue; } if ((write_asize + HDR_GET_LSIZE(hdr)) > target_sz) { full = B_TRUE; mutex_exit(hash_lock); break; } if (pio == NULL) { /* * Insert a dummy header on the buflist so * l2arc_write_done() can find where the * write buffers begin without searching. */ mutex_enter(&dev->l2ad_mtx); list_insert_head(&dev->l2ad_buflist, head); mutex_exit(&dev->l2ad_mtx); cb = kmem_alloc( sizeof (l2arc_write_callback_t), KM_SLEEP); cb->l2wcb_dev = dev; cb->l2wcb_head = head; pio = zio_root(spa, l2arc_write_done, cb, ZIO_FLAG_CANFAIL); } hdr->b_l2hdr.b_dev = dev; hdr->b_l2hdr.b_hits = 0; hdr->b_l2hdr.b_daddr = dev->l2ad_hand; arc_hdr_set_flags(hdr, ARC_FLAG_L2_WRITING | ARC_FLAG_HAS_L2HDR); mutex_enter(&dev->l2ad_mtx); list_insert_head(&dev->l2ad_buflist, hdr); mutex_exit(&dev->l2ad_mtx); /* * We rely on the L1 portion of the header below, so * it's invalid for this header to have been evicted out * of the ghost cache, prior to being written out. The * ARC_FLAG_L2_WRITING bit ensures this won't happen. */ ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT3U(HDR_GET_PSIZE(hdr), >, 0); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); ASSERT3U(arc_hdr_size(hdr), >, 0); size = arc_hdr_size(hdr); (void) refcount_add_many(&dev->l2ad_alloc, size, hdr); /* * Normally the L2ARC can use the hdr's data, but if * we're sharing data between the hdr and one of its * bufs, L2ARC needs its own copy of the data so that * the ZIO below can't race with the buf consumer. To * ensure that this copy will be available for the * lifetime of the ZIO and be cleaned up afterwards, we * add it to the l2arc_free_on_write queue. 
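			 * (For example, a consumer could arc_release() its
			 * buf and begin modifying the data while the L2ARC
			 * write is still in flight; writing from a private
			 * copy makes the zio immune to that.)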
*/ if (!HDR_SHARED_DATA(hdr)) { to_write = hdr->b_l1hdr.b_pabd; } else { to_write = abd_alloc_for_io(size, HDR_ISTYPE_METADATA(hdr)); abd_copy(to_write, hdr->b_l1hdr.b_pabd, size); l2arc_free_abd_on_write(to_write, size, arc_buf_type(hdr)); } wzio = zio_write_phys(pio, dev->l2ad_vdev, hdr->b_l2hdr.b_daddr, size, to_write, ZIO_CHECKSUM_OFF, NULL, hdr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE); write_sz += HDR_GET_LSIZE(hdr); DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, zio_t *, wzio); write_asize += size; /* * Keep the clock hand suitably device-aligned. */ asize = vdev_psize_to_asize(dev->l2ad_vdev, size); write_psize += asize; dev->l2ad_hand += asize; mutex_exit(hash_lock); (void) zio_nowait(wzio); } multilist_sublist_unlock(mls); if (full == B_TRUE) break; } /* No buffers selected for writing? */ if (pio == NULL) { ASSERT0(write_sz); ASSERT(!HDR_HAS_L1HDR(head)); kmem_cache_free(hdr_l2only_cache, head); return (0); } ASSERT3U(write_asize, <=, target_sz); ARCSTAT_BUMP(arcstat_l2_writes_sent); ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize); ARCSTAT_INCR(arcstat_l2_size, write_sz); ARCSTAT_INCR(arcstat_l2_asize, write_asize); vdev_space_update(dev->l2ad_vdev, write_asize, 0, 0); /* * Bump device hand to the device start if it is approaching the end. * l2arc_evict() will already have evicted ahead for this case. */ if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) { dev->l2ad_hand = dev->l2ad_start; dev->l2ad_first = B_FALSE; } dev->l2ad_writing = B_TRUE; (void) zio_wait(pio); dev->l2ad_writing = B_FALSE; return (write_asize); } /* * This thread feeds the L2ARC at regular intervals. This is the beating * heart of the L2ARC. */ static void l2arc_feed_thread(void) { callb_cpr_t cpr; l2arc_dev_t *dev; spa_t *spa; uint64_t size, wrote; clock_t begin, next = ddi_get_lbolt(); fstrans_cookie_t cookie; CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG); mutex_enter(&l2arc_feed_thr_lock); cookie = spl_fstrans_mark(); while (l2arc_thread_exit == 0) { CALLB_CPR_SAFE_BEGIN(&cpr); (void) cv_timedwait_sig(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock, next); CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock); next = ddi_get_lbolt() + hz; /* * Quick check for L2ARC devices. */ mutex_enter(&l2arc_dev_mtx); if (l2arc_ndev == 0) { mutex_exit(&l2arc_dev_mtx); continue; } mutex_exit(&l2arc_dev_mtx); begin = ddi_get_lbolt(); /* * This selects the next l2arc device to write to, and in * doing so the next spa to feed from: dev->l2ad_spa. This * will return NULL if there are now no l2arc devices or if * they are all faulted. * * If a device is returned, its spa's config lock is also * held to prevent device removal. l2arc_dev_get_next() * will grab and release l2arc_dev_mtx. */ if ((dev = l2arc_dev_get_next()) == NULL) continue; spa = dev->l2ad_spa; ASSERT3P(spa, !=, NULL); /* * If the pool is read-only then force the feed thread to * sleep a little longer. */ if (!spa_writeable(spa)) { next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz; spa_config_exit(spa, SCL_L2ARC, dev); continue; } /* * Avoid contributing to memory pressure. */ if (arc_reclaim_needed()) { ARCSTAT_BUMP(arcstat_l2_abort_lowmem); spa_config_exit(spa, SCL_L2ARC, dev); continue; } ARCSTAT_BUMP(arcstat_l2_feeds); size = l2arc_write_size(); /* * Evict L2ARC buffers that will be overwritten. */ l2arc_evict(dev, size, B_FALSE); /* * Write ARC buffers. */ wrote = l2arc_write_buffers(spa, dev, size); /* * Calculate interval between writes. 
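
/*
 * Illustrative sketch: the device-aligned hand advance performed after
 * each zio_write_phys() above. The real rounding is done by
 * vdev_psize_to_asize(), which can be more involved (e.g. for raidz);
 * this is the single-disk power-of-two case, with 'ashift' assumed to
 * be the vdev's allocation shift (9 for 512-byte sectors, 12 for 4K).
 */
#include <stdint.h>

static uint64_t
advance_hand(uint64_t hand, uint64_t psize, unsigned ashift)
{
	uint64_t mask = (1ULL << ashift) - 1;
	uint64_t asize = (psize + mask) & ~mask;	/* round up to asize */

	return (hand + asize);
}
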
*/ next = l2arc_write_interval(begin, size, wrote); spa_config_exit(spa, SCL_L2ARC, dev); } spl_fstrans_unmark(cookie); l2arc_thread_exit = 0; cv_broadcast(&l2arc_feed_thr_cv); CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */ thread_exit(); } boolean_t l2arc_vdev_present(vdev_t *vd) { l2arc_dev_t *dev; mutex_enter(&l2arc_dev_mtx); for (dev = list_head(l2arc_dev_list); dev != NULL; dev = list_next(l2arc_dev_list, dev)) { if (dev->l2ad_vdev == vd) break; } mutex_exit(&l2arc_dev_mtx); return (dev != NULL); } /* * Add a vdev for use by the L2ARC. By this point the spa has already * validated the vdev and opened it. */ void l2arc_add_vdev(spa_t *spa, vdev_t *vd) { l2arc_dev_t *adddev; ASSERT(!l2arc_vdev_present(vd)); /* * Create a new l2arc device entry. */ adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP); adddev->l2ad_spa = spa; adddev->l2ad_vdev = vd; adddev->l2ad_start = VDEV_LABEL_START_SIZE; adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd); adddev->l2ad_hand = adddev->l2ad_start; adddev->l2ad_first = B_TRUE; adddev->l2ad_writing = B_FALSE; list_link_init(&adddev->l2ad_node); mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL); /* * This is a list of all ARC buffers that are still valid on the * device. */ list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node)); vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand); refcount_create(&adddev->l2ad_alloc); /* * Add device to global list */ mutex_enter(&l2arc_dev_mtx); list_insert_head(l2arc_dev_list, adddev); atomic_inc_64(&l2arc_ndev); mutex_exit(&l2arc_dev_mtx); } /* * Remove a vdev from the L2ARC. */ void l2arc_remove_vdev(vdev_t *vd) { l2arc_dev_t *dev, *nextdev, *remdev = NULL; /* * Find the device by vdev */ mutex_enter(&l2arc_dev_mtx); for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) { nextdev = list_next(l2arc_dev_list, dev); if (vd == dev->l2ad_vdev) { remdev = dev; break; } } ASSERT3P(remdev, !=, NULL); /* * Remove device from global list */ list_remove(l2arc_dev_list, remdev); l2arc_dev_last = NULL; /* may have been invalidated */ atomic_dec_64(&l2arc_ndev); mutex_exit(&l2arc_dev_mtx); /* * Clear all buflists and ARC references. L2ARC device flush. */ l2arc_evict(remdev, 0, B_TRUE); list_destroy(&remdev->l2ad_buflist); mutex_destroy(&remdev->l2ad_mtx); refcount_destroy(&remdev->l2ad_alloc); kmem_free(remdev, sizeof (l2arc_dev_t)); } void l2arc_init(void) { l2arc_thread_exit = 0; l2arc_ndev = 0; l2arc_writes_sent = 0; l2arc_writes_done = 0; mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL); mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL); mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL); l2arc_dev_list = &L2ARC_dev_list; l2arc_free_on_write = &L2ARC_free_on_write; list_create(l2arc_dev_list, sizeof (l2arc_dev_t), offsetof(l2arc_dev_t, l2ad_node)); list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t), offsetof(l2arc_data_free_t, l2df_list_node)); } void l2arc_fini(void) { /* * This is called from dmu_fini(), which is called from spa_fini(); * Because of this, we can assume that all l2arc devices have * already been removed when the pools themselves were removed. 
l2arc_do_free_on_write(); mutex_destroy(&l2arc_feed_thr_lock); cv_destroy(&l2arc_feed_thr_cv); mutex_destroy(&l2arc_dev_mtx); mutex_destroy(&l2arc_free_on_write_mtx); list_destroy(l2arc_dev_list); list_destroy(l2arc_free_on_write); } void l2arc_start(void) { if (!(spa_mode_global & FWRITE)) return; (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0, TS_RUN, defclsyspri); } void l2arc_stop(void) { if (!(spa_mode_global & FWRITE)) return; mutex_enter(&l2arc_feed_thr_lock); cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */ l2arc_thread_exit = 1; while (l2arc_thread_exit != 0) cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock); mutex_exit(&l2arc_feed_thr_lock); } #if defined(_KERNEL) && defined(HAVE_SPL) EXPORT_SYMBOL(arc_buf_size); EXPORT_SYMBOL(arc_write); EXPORT_SYMBOL(arc_read); EXPORT_SYMBOL(arc_buf_info); EXPORT_SYMBOL(arc_getbuf_func); EXPORT_SYMBOL(arc_add_prune_callback); EXPORT_SYMBOL(arc_remove_prune_callback); /* BEGIN CSTYLED */ module_param(zfs_arc_min, ulong, 0644); MODULE_PARM_DESC(zfs_arc_min, "Min arc size"); module_param(zfs_arc_max, ulong, 0644); MODULE_PARM_DESC(zfs_arc_max, "Max arc size"); module_param(zfs_arc_meta_limit, ulong, 0644); MODULE_PARM_DESC(zfs_arc_meta_limit, "Meta limit for arc size"); module_param(zfs_arc_meta_limit_percent, ulong, 0644); MODULE_PARM_DESC(zfs_arc_meta_limit_percent, "Percent of arc size for arc meta limit"); module_param(zfs_arc_meta_min, ulong, 0644); MODULE_PARM_DESC(zfs_arc_meta_min, "Min arc metadata"); module_param(zfs_arc_meta_prune, int, 0644); MODULE_PARM_DESC(zfs_arc_meta_prune, "Meta objects to scan for prune"); module_param(zfs_arc_meta_adjust_restarts, int, 0644); MODULE_PARM_DESC(zfs_arc_meta_adjust_restarts, "Limit number of restarts in arc_adjust_meta"); module_param(zfs_arc_meta_strategy, int, 0644); MODULE_PARM_DESC(zfs_arc_meta_strategy, "Meta reclaim strategy"); module_param(zfs_arc_grow_retry, int, 0644); MODULE_PARM_DESC(zfs_arc_grow_retry, "Seconds before growing arc size"); module_param(zfs_arc_p_aggressive_disable, int, 0644); MODULE_PARM_DESC(zfs_arc_p_aggressive_disable, "disable aggressive arc_p grow"); module_param(zfs_arc_p_dampener_disable, int, 0644); MODULE_PARM_DESC(zfs_arc_p_dampener_disable, "disable arc_p adapt dampener"); module_param(zfs_arc_shrink_shift, int, 0644); MODULE_PARM_DESC(zfs_arc_shrink_shift, "log2(fraction of arc to reclaim)"); module_param(zfs_arc_p_min_shift, int, 0644); MODULE_PARM_DESC(zfs_arc_p_min_shift, "arc_c shift to calc min/max arc_p"); module_param(zfs_arc_average_blocksize, int, 0444); MODULE_PARM_DESC(zfs_arc_average_blocksize, "Target average block size"); module_param(zfs_compressed_arc_enabled, int, 0644); MODULE_PARM_DESC(zfs_compressed_arc_enabled, "Disable compressed arc buffers"); module_param(zfs_arc_min_prefetch_lifespan, int, 0644); MODULE_PARM_DESC(zfs_arc_min_prefetch_lifespan, "Min life of prefetch block"); module_param(zfs_arc_num_sublists_per_state, int, 0644); MODULE_PARM_DESC(zfs_arc_num_sublists_per_state, "Number of sublists used in each of the ARC state lists"); module_param(l2arc_write_max, ulong, 0644); MODULE_PARM_DESC(l2arc_write_max, "Max write bytes per interval"); module_param(l2arc_write_boost, ulong, 0644); MODULE_PARM_DESC(l2arc_write_boost, "Extra write bytes during device warmup"); module_param(l2arc_headroom, ulong, 0644); MODULE_PARM_DESC(l2arc_headroom, "Number of max device writes to precache"); module_param(l2arc_headroom_boost, ulong, 0644); MODULE_PARM_DESC(l2arc_headroom_boost, "Compressed l2arc_headroom
multiplier"); module_param(l2arc_feed_secs, ulong, 0644); MODULE_PARM_DESC(l2arc_feed_secs, "Seconds between L2ARC writing"); module_param(l2arc_feed_min_ms, ulong, 0644); MODULE_PARM_DESC(l2arc_feed_min_ms, "Min feed interval in milliseconds"); module_param(l2arc_noprefetch, int, 0644); MODULE_PARM_DESC(l2arc_noprefetch, "Skip caching prefetched buffers"); module_param(l2arc_feed_again, int, 0644); MODULE_PARM_DESC(l2arc_feed_again, "Turbo L2ARC warmup"); module_param(l2arc_norw, int, 0644); MODULE_PARM_DESC(l2arc_norw, "No reads during writes"); module_param(zfs_arc_lotsfree_percent, int, 0644); MODULE_PARM_DESC(zfs_arc_lotsfree_percent, "System free memory I/O throttle in bytes"); module_param(zfs_arc_sys_free, ulong, 0644); MODULE_PARM_DESC(zfs_arc_sys_free, "System free memory target size in bytes"); module_param(zfs_arc_dnode_limit, ulong, 0644); MODULE_PARM_DESC(zfs_arc_dnode_limit, "Minimum bytes of dnodes in arc"); module_param(zfs_arc_dnode_limit_percent, ulong, 0644); MODULE_PARM_DESC(zfs_arc_dnode_limit_percent, "Percent of ARC meta buffers for dnodes"); module_param(zfs_arc_dnode_reduce_percent, ulong, 0644); MODULE_PARM_DESC(zfs_arc_dnode_reduce_percent, "Percentage of excess dnodes to try to unpin"); /* END CSTYLED */ #endif diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index b7dfb858721a..ca1a443032c8 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -1,3905 +1,3905 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012, 2015 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct dbuf_hold_impl_data { /* Function arguments */ dnode_t *dh_dn; uint8_t dh_level; uint64_t dh_blkid; boolean_t dh_fail_sparse; boolean_t dh_fail_uncached; void *dh_tag; dmu_buf_impl_t **dh_dbp; /* Local variables */ dmu_buf_impl_t *dh_db; dmu_buf_impl_t *dh_parent; blkptr_t *dh_bp; int dh_err; dbuf_dirty_record_t *dh_dr; arc_buf_contents_t dh_type; int dh_depth; }; static void __dbuf_hold_impl_init(struct dbuf_hold_impl_data *dh, dnode_t *dn, uint8_t level, uint64_t blkid, boolean_t fail_sparse, boolean_t fail_uncached, void *tag, dmu_buf_impl_t **dbp, int depth); static int __dbuf_hold_impl(struct dbuf_hold_impl_data *dh); uint_t zfs_dbuf_evict_key; /* * Number of times that zfs_free_range() took the slow path while doing * a zfs receive. 
A nonzero value indicates a potential performance problem. */ uint64_t zfs_free_range_recv_miss; static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx); #ifndef __lint extern inline void dmu_buf_init_user(dmu_buf_user_t *dbu, dmu_buf_evict_func_t *evict_func, dmu_buf_t **clear_on_evict_dbufp); #endif /* ! __lint */ /* * Global data structures and functions for the dbuf cache. */ static kmem_cache_t *dbuf_kmem_cache; static taskq_t *dbu_evict_taskq; static kthread_t *dbuf_cache_evict_thread; static kmutex_t dbuf_evict_lock; static kcondvar_t dbuf_evict_cv; static boolean_t dbuf_evict_thread_exit; /* * LRU cache of dbufs. The dbuf cache maintains a list of dbufs that * are not currently held but have been recently released. These dbufs * are not eligible for arc eviction until they are aged out of the cache. * Dbufs are added to the dbuf cache once the last hold is released. If a * dbuf is later accessed and still exists in the dbuf cache, then it will * be removed from the cache and later re-added to the head of the cache. * Dbufs that are aged out of the cache will be immediately destroyed and * become eligible for arc eviction. */ static multilist_t dbuf_cache; static refcount_t dbuf_cache_size; unsigned long dbuf_cache_max_bytes = 100 * 1024 * 1024; /* Cap the size of the dbuf cache to log2 fraction of arc size. */ int dbuf_cache_max_shift = 5; /* * The dbuf cache uses a three-stage eviction policy: * - A low water marker designates when the dbuf eviction thread * should stop evicting from the dbuf cache. * - When we reach the maximum size (aka mid water mark), we * signal the eviction thread to run. * - The high water mark indicates when the eviction thread * is unable to keep up with the incoming load and eviction must * happen in the context of the calling thread. * * The dbuf cache: * (max size) * low water mid water hi water * +----------------------------------------+----------+----------+ * | | | | * | | | | * | | | | * | | | | * +----------------------------------------+----------+----------+ * stop signal evict * evicting eviction directly * thread * * The high and low water marks indicate the operating range for the eviction * thread. The low water mark is, by default, 90% of the total size of the * cache and the high water mark is at 110% (both of these percentages can be * changed by setting dbuf_cache_lowater_pct and dbuf_cache_hiwater_pct, * respectively). The eviction thread will try to ensure that the cache remains * within this range by waking up every second and checking if the cache is * above the low water mark. The thread can also be woken up by callers adding * elements into the cache if the cache is larger than the mid water (i.e max * cache size). Once the eviction thread is woken up and eviction is required, * it will continue evicting buffers until it's able to reduce the cache size * to the low water mark. If the cache size continues to grow and hits the high * water mark, then callers adding elements to the cache will begin to evict * directly from the cache until the cache is no longer above the high water * mark. */ /* * The percentage above and below the maximum cache size. 
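
/*
 * Illustrative sketch: the three thresholds described above, computed
 * the same way as dbuf_cache_above_hiwater()/_lowater() below. With the
 * defaults (max = 100 MiB, hiwater_pct = lowater_pct = 10) this yields
 * lo = 90 MiB, mid = 100 MiB, hi = 110 MiB.
 */
#include <stdint.h>

struct cache_marks {
	uint64_t lo;	/* eviction thread stops here */
	uint64_t mid;	/* max size; eviction thread is signalled */
	uint64_t hi;	/* callers start evicting directly */
};

static struct cache_marks
compute_marks(uint64_t max_bytes, unsigned hi_pct, unsigned lo_pct)
{
	struct cache_marks m;

	m.mid = max_bytes;
	m.hi = max_bytes + (max_bytes * hi_pct) / 100;
	m.lo = max_bytes - (max_bytes * lo_pct) / 100;
	return (m);
}
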
*/ uint_t dbuf_cache_hiwater_pct = 10; uint_t dbuf_cache_lowater_pct = 10; /* ARGSUSED */ static int dbuf_cons(void *vdb, void *unused, int kmflag) { dmu_buf_impl_t *db = vdb; bzero(db, sizeof (dmu_buf_impl_t)); mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL); cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL); multilist_link_init(&db->db_cache_link); refcount_create(&db->db_holds); return (0); } /* ARGSUSED */ static void dbuf_dest(void *vdb, void *unused) { dmu_buf_impl_t *db = vdb; mutex_destroy(&db->db_mtx); cv_destroy(&db->db_changed); ASSERT(!multilist_link_active(&db->db_cache_link)); refcount_destroy(&db->db_holds); } /* * dbuf hash table routines */ static dbuf_hash_table_t dbuf_hash_table; static uint64_t dbuf_hash_count; static uint64_t dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid) { uintptr_t osv = (uintptr_t)os; uint64_t crc = -1ULL; ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF]; crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF]; crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF]; crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF]; crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF]; crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF]; crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16); return (crc); } #define DBUF_EQUAL(dbuf, os, obj, level, blkid) \ ((dbuf)->db.db_object == (obj) && \ (dbuf)->db_objset == (os) && \ (dbuf)->db_level == (level) && \ (dbuf)->db_blkid == (blkid)) dmu_buf_impl_t * dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid) { dbuf_hash_table_t *h = &dbuf_hash_table; uint64_t hv; uint64_t idx; dmu_buf_impl_t *db; hv = dbuf_hash(os, obj, level, blkid); idx = hv & h->hash_table_mask; mutex_enter(DBUF_HASH_MUTEX(h, idx)); for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) { if (DBUF_EQUAL(db, os, obj, level, blkid)) { mutex_enter(&db->db_mtx); if (db->db_state != DB_EVICTING) { mutex_exit(DBUF_HASH_MUTEX(h, idx)); return (db); } mutex_exit(&db->db_mtx); } } mutex_exit(DBUF_HASH_MUTEX(h, idx)); return (NULL); } static dmu_buf_impl_t * dbuf_find_bonus(objset_t *os, uint64_t object) { dnode_t *dn; dmu_buf_impl_t *db = NULL; if (dnode_hold(os, object, FTAG, &dn) == 0) { rw_enter(&dn->dn_struct_rwlock, RW_READER); if (dn->dn_bonus != NULL) { db = dn->dn_bonus; mutex_enter(&db->db_mtx); } rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); } return (db); } /* * Insert an entry into the hash table. If there is already an element * equal to elem in the hash table, then the already existing element * will be returned and the new element will not be inserted. * Otherwise returns NULL.
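
/*
 * Illustrative sketch: bucket selection as done in dbuf_find() above.
 * Because the table size is kept a power of two (see the sizing loop in
 * dbuf_init()), hash_table_mask = size - 1 and "hv & mask" is
 * equivalent to "hv % size" without a 64-bit divide.
 */
#include <assert.h>
#include <stdint.h>

static uint64_t
bucket_index(uint64_t hv, uint64_t table_size)
{
	assert((table_size & (table_size - 1)) == 0);	/* power of two */
	return (hv & (table_size - 1));	/* == hv % table_size */
}
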
*/ static dmu_buf_impl_t * dbuf_hash_insert(dmu_buf_impl_t *db) { dbuf_hash_table_t *h = &dbuf_hash_table; objset_t *os = db->db_objset; uint64_t obj = db->db.db_object; int level = db->db_level; uint64_t blkid, hv, idx; dmu_buf_impl_t *dbf; blkid = db->db_blkid; hv = dbuf_hash(os, obj, level, blkid); idx = hv & h->hash_table_mask; mutex_enter(DBUF_HASH_MUTEX(h, idx)); for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) { if (DBUF_EQUAL(dbf, os, obj, level, blkid)) { mutex_enter(&dbf->db_mtx); if (dbf->db_state != DB_EVICTING) { mutex_exit(DBUF_HASH_MUTEX(h, idx)); return (dbf); } mutex_exit(&dbf->db_mtx); } } mutex_enter(&db->db_mtx); db->db_hash_next = h->hash_table[idx]; h->hash_table[idx] = db; mutex_exit(DBUF_HASH_MUTEX(h, idx)); atomic_inc_64(&dbuf_hash_count); return (NULL); } /* * Remove an entry from the hash table. It must be in the EVICTING state. */ static void dbuf_hash_remove(dmu_buf_impl_t *db) { dbuf_hash_table_t *h = &dbuf_hash_table; uint64_t hv, idx; dmu_buf_impl_t *dbf, **dbp; hv = dbuf_hash(db->db_objset, db->db.db_object, db->db_level, db->db_blkid); idx = hv & h->hash_table_mask; /* * We mustn't hold db_mtx to maintain lock ordering: * DBUF_HASH_MUTEX > db_mtx. */ ASSERT(refcount_is_zero(&db->db_holds)); ASSERT(db->db_state == DB_EVICTING); ASSERT(!MUTEX_HELD(&db->db_mtx)); mutex_enter(DBUF_HASH_MUTEX(h, idx)); dbp = &h->hash_table[idx]; while ((dbf = *dbp) != db) { dbp = &dbf->db_hash_next; ASSERT(dbf != NULL); } *dbp = db->db_hash_next; db->db_hash_next = NULL; mutex_exit(DBUF_HASH_MUTEX(h, idx)); atomic_dec_64(&dbuf_hash_count); } typedef enum { DBVU_EVICTING, DBVU_NOT_EVICTING } dbvu_verify_type_t; static void dbuf_verify_user(dmu_buf_impl_t *db, dbvu_verify_type_t verify_type) { #ifdef ZFS_DEBUG int64_t holds; if (db->db_user == NULL) return; /* Only data blocks support the attachment of user data. */ ASSERT(db->db_level == 0); /* Clients must resolve a dbuf before attaching user data. */ ASSERT(db->db.db_data != NULL); ASSERT3U(db->db_state, ==, DB_CACHED); holds = refcount_count(&db->db_holds); if (verify_type == DBVU_EVICTING) { /* * Immediate eviction occurs when holds == dirtycnt. * For normal eviction buffers, holds is zero on * eviction, except when dbuf_fix_old_data() calls * dbuf_clear_data(). However, the hold count can grow * during eviction even though db_mtx is held (see * dmu_bonus_hold() for an example), so we can only * test the generic invariant that holds >= dirtycnt. */ ASSERT3U(holds, >=, db->db_dirtycnt); } else { if (db->db_user_immediate_evict == TRUE) ASSERT3U(holds, >=, db->db_dirtycnt); else ASSERT3U(holds, >, 0); } #endif } static void dbuf_evict_user(dmu_buf_impl_t *db) { dmu_buf_user_t *dbu = db->db_user; ASSERT(MUTEX_HELD(&db->db_mtx)); if (dbu == NULL) return; dbuf_verify_user(db, DBVU_EVICTING); db->db_user = NULL; #ifdef ZFS_DEBUG if (dbu->dbu_clear_on_evict_dbufp != NULL) *dbu->dbu_clear_on_evict_dbufp = NULL; #endif /* * Invoke the callback from a taskq to avoid lock order reversals * and limit stack depth. */ taskq_dispatch_ent(dbu_evict_taskq, dbu->dbu_evict_func, dbu, 0, &dbu->dbu_tqent); } boolean_t dbuf_is_metadata(dmu_buf_impl_t *db) { /* * Consider indirect blocks and spill blocks to be meta data. 
*/ if (db->db_level > 0 || db->db_blkid == DMU_SPILL_BLKID) { return (B_TRUE); } else { boolean_t is_metadata; DB_DNODE_ENTER(db); is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type); DB_DNODE_EXIT(db); return (is_metadata); } } /* * This function *must* return indices evenly distributed between all * sublists of the multilist. This is needed due to how the dbuf eviction * code is laid out; dbuf_evict_thread() assumes dbufs are evenly * distributed between all sublists and uses this assumption when * deciding which sublist to evict from and how much to evict from it. */ unsigned int dbuf_cache_multilist_index_func(multilist_t *ml, void *obj) { dmu_buf_impl_t *db = obj; /* * The assumption here is that the hash value for a given * dmu_buf_impl_t will remain constant throughout its lifetime * (i.e. its objset, object, level and blkid fields don't change). * Thus, we don't need to store the dbuf's sublist index * on insertion, as this index can be recalculated on removal. * * Also, the low order bits of the hash value are thought to be * distributed evenly. Otherwise, in the case that the multilist * has a power of two number of sublists, each sublist's usage * would not be evenly distributed. */ return (dbuf_hash(db->db_objset, db->db.db_object, db->db_level, db->db_blkid) % multilist_get_num_sublists(ml)); } static inline boolean_t dbuf_cache_above_hiwater(void) { uint64_t dbuf_cache_hiwater_bytes = (dbuf_cache_max_bytes * dbuf_cache_hiwater_pct) / 100; return (refcount_count(&dbuf_cache_size) > dbuf_cache_max_bytes + dbuf_cache_hiwater_bytes); } static inline boolean_t dbuf_cache_above_lowater(void) { uint64_t dbuf_cache_lowater_bytes = (dbuf_cache_max_bytes * dbuf_cache_lowater_pct) / 100; return (refcount_count(&dbuf_cache_size) > dbuf_cache_max_bytes - dbuf_cache_lowater_bytes); } /* * Evict the oldest eligible dbuf from the dbuf cache. */ static void dbuf_evict_one(void) { int idx = multilist_get_random_index(&dbuf_cache); multilist_sublist_t *mls = multilist_sublist_lock(&dbuf_cache, idx); dmu_buf_impl_t *db; ASSERT(!MUTEX_HELD(&dbuf_evict_lock)); /* * Set the thread's tsd to indicate that it's processing evictions. * Once a thread stops evicting from the dbuf cache it will * reset its tsd to NULL. */ ASSERT3P(tsd_get(zfs_dbuf_evict_key), ==, NULL); (void) tsd_set(zfs_dbuf_evict_key, (void *)B_TRUE); db = multilist_sublist_tail(mls); while (db != NULL && mutex_tryenter(&db->db_mtx) == 0) { db = multilist_sublist_prev(mls, db); } DTRACE_PROBE2(dbuf__evict__one, dmu_buf_impl_t *, db, multilist_sublist_t *, mls); if (db != NULL) { multilist_sublist_remove(mls, db); multilist_sublist_unlock(mls); (void) refcount_remove_many(&dbuf_cache_size, db->db.db_size, db); dbuf_destroy(db); } else { multilist_sublist_unlock(mls); } (void) tsd_set(zfs_dbuf_evict_key, NULL); } /* * The dbuf evict thread is responsible for aging out dbufs from the * cache. Once the cache has reached its maximum size, dbufs are removed * and destroyed. The eviction thread will continue running until the size * of the dbuf cache is at or below the maximum size. Once the dbuf is aged * out of the cache it is destroyed and becomes eligible for arc eviction.
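
/*
 * Illustrative sketch: the sublist-index contract described above. The
 * index is a pure function of the dbuf's immutable identity (objset,
 * object, level, blkid), so it never needs to be stored; recomputing it
 * at removal returns the sublist chosen at insertion. The multiply/xor
 * fold below is only a stand-in; the real code reuses dbuf_hash(),
 * whose low bits are assumed well distributed.
 */
#include <stdint.h>

struct dbuf_id {
	uint64_t objset;	/* stand-ins for the identity fields */
	uint64_t object;
	uint64_t blkid;
	uint8_t	 level;
};

static unsigned int
sublist_index(const struct dbuf_id *id, unsigned int num_sublists)
{
	uint64_t h = id->objset ^ (id->object * 2654435761ULL) ^
	    (id->blkid * 0x9e3779b97f4a7c15ULL) ^ id->level;

	return ((unsigned int)(h % num_sublists));
}
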
*/ static void dbuf_evict_thread(void) { callb_cpr_t cpr; CALLB_CPR_INIT(&cpr, &dbuf_evict_lock, callb_generic_cpr, FTAG); mutex_enter(&dbuf_evict_lock); while (!dbuf_evict_thread_exit) { while (!dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) { CALLB_CPR_SAFE_BEGIN(&cpr); (void) cv_timedwait_sig_hires(&dbuf_evict_cv, &dbuf_evict_lock, SEC2NSEC(1), MSEC2NSEC(1), 0); CALLB_CPR_SAFE_END(&cpr, &dbuf_evict_lock); } mutex_exit(&dbuf_evict_lock); /* * Keep evicting as long as we're above the low water mark * for the cache. We do this without holding the locks to * minimize lock contention. */ while (dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) { dbuf_evict_one(); } mutex_enter(&dbuf_evict_lock); } dbuf_evict_thread_exit = B_FALSE; cv_broadcast(&dbuf_evict_cv); CALLB_CPR_EXIT(&cpr); /* drops dbuf_evict_lock */ thread_exit(); } /* * Wake up the dbuf eviction thread if the dbuf cache is at its max size. * If the dbuf cache is at its high water mark, then evict a dbuf from the * dbuf cache using the callers context. */ static void dbuf_evict_notify(void) { /* * We use thread specific data to track when a thread has * started processing evictions. This allows us to avoid deeply * nested stacks that would have a call flow similar to this: * * dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify() * ^ | * | | * +-----dbuf_destroy()<--dbuf_evict_one()<--------+ * * The dbuf_eviction_thread will always have its tsd set until * that thread exits. All other threads will only set their tsd * if they are participating in the eviction process. This only * happens if the eviction thread is unable to process evictions * fast enough. To keep the dbuf cache size in check, other threads * can evict from the dbuf cache directly. Those threads will set * their tsd values so that we ensure that they only evict one dbuf * from the dbuf cache. */ if (tsd_get(zfs_dbuf_evict_key) != NULL) return; if (refcount_count(&dbuf_cache_size) > dbuf_cache_max_bytes) { boolean_t evict_now = B_FALSE; mutex_enter(&dbuf_evict_lock); if (refcount_count(&dbuf_cache_size) > dbuf_cache_max_bytes) { evict_now = dbuf_cache_above_hiwater(); cv_signal(&dbuf_evict_cv); } mutex_exit(&dbuf_evict_lock); if (evict_now) { dbuf_evict_one(); } } } void dbuf_init(void) { uint64_t hsize = 1ULL << 16; dbuf_hash_table_t *h = &dbuf_hash_table; int i; /* * The hash table is big enough to fill all of physical memory * with an average block size of zfs_arc_average_blocksize (default 8K). * By default, the table will take up * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers). */ while (hsize * zfs_arc_average_blocksize < physmem * PAGESIZE) hsize <<= 1; retry: h->hash_table_mask = hsize - 1; #if defined(_KERNEL) && defined(HAVE_SPL) /* * Large allocations which do not require contiguous pages * should be using vmem_alloc() in the linux kernel */ h->hash_table = vmem_zalloc(hsize * sizeof (void *), KM_SLEEP); #else h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP); #endif if (h->hash_table == NULL) { /* XXX - we should really return an error instead of assert */ ASSERT(hsize > (1ULL << 10)); hsize >>= 1; goto retry; } dbuf_kmem_cache = kmem_cache_create("dmu_buf_impl_t", sizeof (dmu_buf_impl_t), 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0); for (i = 0; i < DBUF_MUTEXES; i++) mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); dbuf_stats_init(h); /* * Setup the parameters for the dbuf cache. We cap the size of the * dbuf cache to 1/32nd (default) of the size of the ARC. 
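
/*
 * Illustrative sketch: the table-sizing loop from dbuf_init() above,
 * restated. Example: with 4 GiB of physical memory and the default
 * 8 KiB zfs_arc_average_blocksize, the loop settles on 2^19 = 524288
 * buckets, i.e. a 4 MiB table of 8-byte pointers, matching the
 * "1MB per GB" noted in the comment above.
 */
#include <stdint.h>

static uint64_t
hash_buckets(uint64_t physmem_bytes, uint64_t avg_blocksize)
{
	uint64_t hsize = 1ULL << 16;	/* same starting point as dbuf_init */

	while (hsize * avg_blocksize < physmem_bytes)
		hsize <<= 1;
	return (hsize);
}
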
*/ dbuf_cache_max_bytes = MIN(dbuf_cache_max_bytes, arc_max_bytes() >> dbuf_cache_max_shift); /* * All entries are queued via taskq_dispatch_ent(), so min/maxalloc * configuration is not required. */ dbu_evict_taskq = taskq_create("dbu_evict", 1, defclsyspri, 0, 0, 0); multilist_create(&dbuf_cache, sizeof (dmu_buf_impl_t), offsetof(dmu_buf_impl_t, db_cache_link), zfs_arc_num_sublists_per_state, dbuf_cache_multilist_index_func); refcount_create(&dbuf_cache_size); tsd_create(&zfs_dbuf_evict_key, NULL); dbuf_evict_thread_exit = B_FALSE; mutex_init(&dbuf_evict_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&dbuf_evict_cv, NULL, CV_DEFAULT, NULL); dbuf_cache_evict_thread = thread_create(NULL, 0, dbuf_evict_thread, NULL, 0, &p0, TS_RUN, minclsyspri); } void dbuf_fini(void) { dbuf_hash_table_t *h = &dbuf_hash_table; int i; dbuf_stats_destroy(); for (i = 0; i < DBUF_MUTEXES; i++) mutex_destroy(&h->hash_mutexes[i]); #if defined(_KERNEL) && defined(HAVE_SPL) /* * Large allocations which do not require contiguous pages * should be using vmem_free() in the linux kernel */ vmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *)); #else kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *)); #endif kmem_cache_destroy(dbuf_kmem_cache); taskq_destroy(dbu_evict_taskq); mutex_enter(&dbuf_evict_lock); dbuf_evict_thread_exit = B_TRUE; while (dbuf_evict_thread_exit) { cv_signal(&dbuf_evict_cv); cv_wait(&dbuf_evict_cv, &dbuf_evict_lock); } mutex_exit(&dbuf_evict_lock); tsd_destroy(&zfs_dbuf_evict_key); mutex_destroy(&dbuf_evict_lock); cv_destroy(&dbuf_evict_cv); refcount_destroy(&dbuf_cache_size); multilist_destroy(&dbuf_cache); } /* * Other stuff. */ #ifdef ZFS_DEBUG static void dbuf_verify(dmu_buf_impl_t *db) { dnode_t *dn; dbuf_dirty_record_t *dr; ASSERT(MUTEX_HELD(&db->db_mtx)); if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY)) return; ASSERT(db->db_objset != NULL); DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (dn == NULL) { ASSERT(db->db_parent == NULL); ASSERT(db->db_blkptr == NULL); } else { ASSERT3U(db->db.db_object, ==, dn->dn_object); ASSERT3P(db->db_objset, ==, dn->dn_objset); ASSERT3U(db->db_level, <, dn->dn_nlevels); ASSERT(db->db_blkid == DMU_BONUS_BLKID || db->db_blkid == DMU_SPILL_BLKID || !avl_is_empty(&dn->dn_dbufs)); } if (db->db_blkid == DMU_BONUS_BLKID) { ASSERT(dn != NULL); ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID); } else if (db->db_blkid == DMU_SPILL_BLKID) { ASSERT(dn != NULL); ASSERT0(db->db.db_offset); } else { ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size); } for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next) ASSERT(dr->dr_dbuf == db); for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next) ASSERT(dr->dr_dbuf == db); /* * We can't assert that db_size matches dn_datablksz because it * can be momentarily different when another thread is doing * dnode_set_blksz(). */ if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) { dr = db->db_data_pending; /* * It should only be modified in syncing context, so * make sure we only have one copy of the data. 
*/ ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf); } /* verify db->db_blkptr */ if (db->db_blkptr) { if (db->db_parent == dn->dn_dbuf) { /* db is pointed to by the dnode */ /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */ if (DMU_OBJECT_IS_SPECIAL(db->db.db_object)) ASSERT(db->db_parent == NULL); else ASSERT(db->db_parent != NULL); if (db->db_blkid != DMU_SPILL_BLKID) ASSERT3P(db->db_blkptr, ==, &dn->dn_phys->dn_blkptr[db->db_blkid]); } else { /* db is pointed to by an indirect block */ ASSERTV(int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT); ASSERT3U(db->db_parent->db_level, ==, db->db_level+1); ASSERT3U(db->db_parent->db.db_object, ==, db->db.db_object); /* * dnode_grow_indblksz() can make this fail if we don't * have the struct_rwlock. XXX indblksz no longer * grows. safe to do this now? */ if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) { ASSERT3P(db->db_blkptr, ==, ((blkptr_t *)db->db_parent->db.db_data + db->db_blkid % epb)); } } } if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) && (db->db_buf == NULL || db->db_buf->b_data) && db->db.db_data && db->db_blkid != DMU_BONUS_BLKID && db->db_state != DB_FILL && !dn->dn_free_txg) { /* * If the blkptr isn't set but they have nonzero data, * it had better be dirty, otherwise we'll lose that * data when we evict this buffer. * * There is an exception to this rule for indirect blocks; in * this case, if the indirect block is a hole, we fill in a few * fields on each of the child blocks (importantly, birth time) * to prevent hole birth times from being lost when you * partially fill in a hole. */ if (db->db_dirtycnt == 0) { if (db->db_level == 0) { uint64_t *buf = db->db.db_data; int i; for (i = 0; i < db->db.db_size >> 3; i++) { ASSERT(buf[i] == 0); } } else { int i; blkptr_t *bps = db->db.db_data; ASSERT3U(1 << DB_DNODE(db)->dn_indblkshift, ==, db->db.db_size); /* * We want to verify that all the blkptrs in the * indirect block are holes, but we may have * automatically set up a few fields for them. * We iterate through each blkptr and verify * they only have those fields set. */ for (i = 0; i < db->db.db_size / sizeof (blkptr_t); i++) { blkptr_t *bp = &bps[i]; ASSERT(ZIO_CHECKSUM_IS_ZERO( &bp->blk_cksum)); ASSERT( DVA_IS_EMPTY(&bp->blk_dva[0]) && DVA_IS_EMPTY(&bp->blk_dva[1]) && DVA_IS_EMPTY(&bp->blk_dva[2])); ASSERT0(bp->blk_fill); ASSERT0(bp->blk_pad[0]); ASSERT0(bp->blk_pad[1]); ASSERT(!BP_IS_EMBEDDED(bp)); ASSERT(BP_IS_HOLE(bp)); ASSERT0(bp->blk_phys_birth); } } } } DB_DNODE_EXIT(db); } #endif static void dbuf_clear_data(dmu_buf_impl_t *db) { ASSERT(MUTEX_HELD(&db->db_mtx)); dbuf_evict_user(db); ASSERT3P(db->db_buf, ==, NULL); db->db.db_data = NULL; if (db->db_state != DB_NOFILL) db->db_state = DB_UNCACHED; } static void dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf) { ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(buf != NULL); db->db_buf = buf; ASSERT(buf->b_data != NULL); db->db.db_data = buf->b_data; } /* * Loan out an arc_buf for read. Return the loaned arc_buf. 
*/ arc_buf_t * dbuf_loan_arcbuf(dmu_buf_impl_t *db) { arc_buf_t *abuf; ASSERT(db->db_blkid != DMU_BONUS_BLKID); mutex_enter(&db->db_mtx); if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) { int blksz = db->db.db_size; spa_t *spa = db->db_objset->os_spa; mutex_exit(&db->db_mtx); abuf = arc_loan_buf(spa, B_FALSE, blksz); bcopy(db->db.db_data, abuf->b_data, blksz); } else { abuf = db->db_buf; arc_loan_inuse_buf(abuf, db); db->db_buf = NULL; dbuf_clear_data(db); mutex_exit(&db->db_mtx); } return (abuf); } /* * Calculate which level n block references the data at the level 0 offset * provided. */ uint64_t dbuf_whichblock(const dnode_t *dn, const int64_t level, const uint64_t offset) { if (dn->dn_datablkshift != 0 && dn->dn_indblkshift != 0) { /* * The level n blkid is equal to the level 0 blkid divided by * the number of level 0s in a level n block. * * The level 0 blkid is offset >> datablkshift = * offset / 2^datablkshift. * * The number of level 0s in a level n is the number of block * pointers in an indirect block, raised to the power of level. * This is 2^(indblkshift - SPA_BLKPTRSHIFT)^level = * 2^(level*(indblkshift - SPA_BLKPTRSHIFT)). * * Thus, the level n blkid is: offset / * ((2^datablkshift)*(2^(level*(indblkshift - SPA_BLKPTRSHIFT))) * = offset / 2^(datablkshift + level * * (indblkshift - SPA_BLKPTRSHIFT)) * = offset >> (datablkshift + level * * (indblkshift - SPA_BLKPTRSHIFT)) */ const unsigned exp = dn->dn_datablkshift + level * (dn->dn_indblkshift - SPA_BLKPTRSHIFT); if (exp >= 8 * sizeof (offset)) { /* This only happens on the highest indirection level */ ASSERT3U(level, ==, dn->dn_nlevels - 1); return (0); } ASSERT3U(exp, <, 8 * sizeof (offset)); return (offset >> exp); } else { ASSERT3U(offset, <, dn->dn_datablksz); return (0); } } static void dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb) { dmu_buf_impl_t *db = vdb; mutex_enter(&db->db_mtx); ASSERT3U(db->db_state, ==, DB_READ); /* * All reads are synchronous, so we must have a hold on the dbuf */ ASSERT(refcount_count(&db->db_holds) > 0); ASSERT(db->db_buf == NULL); ASSERT(db->db.db_data == NULL); if (db->db_level == 0 && db->db_freed_in_flight) { /* we were freed in flight; disregard any error */ arc_release(buf, db); bzero(buf->b_data, db->db.db_size); arc_buf_freeze(buf); db->db_freed_in_flight = FALSE; dbuf_set_data(db, buf); db->db_state = DB_CACHED; } else if (zio == NULL || zio->io_error == 0) { dbuf_set_data(db, buf); db->db_state = DB_CACHED; } else { ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT3P(db->db_buf, ==, NULL); arc_buf_destroy(buf, db); db->db_state = DB_UNCACHED; } cv_broadcast(&db->db_changed); dbuf_rele_and_unlock(db, NULL); } static int dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) { dnode_t *dn; zbookmark_phys_t zb; uint32_t aflags = ARC_FLAG_NOWAIT; int err; DB_DNODE_ENTER(db); dn = DB_DNODE(db); ASSERT(!refcount_is_zero(&db->db_holds)); /* We need the struct_rwlock to prevent db_blkptr from changing. */ ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(db->db_state == DB_UNCACHED); ASSERT(db->db_buf == NULL); if (db->db_blkid == DMU_BONUS_BLKID) { /* * The bonus length stored in the dnode may be less than * the maximum available space in the bonus buffer. 
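
/*
 * Illustrative sketch: dbuf_whichblock() above for the power-of-two
 * case. Worked example: 16 KiB data blocks (datablkshift = 14) and
 * 128 KiB indirect blocks (indblkshift = 17, so each indirect holds
 * 2^(17-7) = 1024 blkptrs). The level-1 block covering byte offset
 * 52428800 (50 MiB) is 52428800 >> (14 + 1 * 10) = 3.
 */
#include <stdint.h>

#define	EX_BLKPTRSHIFT	7	/* log2(sizeof (blkptr_t)) */

static uint64_t
which_block(unsigned datablkshift, unsigned indblkshift, unsigned level,
    uint64_t offset)
{
	unsigned exp = datablkshift + level * (indblkshift - EX_BLKPTRSHIFT);

	if (exp >= 8 * sizeof (offset))
		return (0);	/* past the top indirection level */
	return (offset >> exp);
}
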
int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen); int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots); ASSERT3U(bonuslen, <=, db->db.db_size); db->db.db_data = kmem_alloc(max_bonuslen, KM_SLEEP); arc_space_consume(max_bonuslen, ARC_SPACE_BONUS); if (bonuslen < max_bonuslen) bzero(db->db.db_data, max_bonuslen); if (bonuslen) bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen); DB_DNODE_EXIT(db); db->db_state = DB_CACHED; mutex_exit(&db->db_mtx); return (0); } /* * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync() * processes the delete record and clears the bp while we are waiting * for the dn_mtx (resulting in a "no" from block_freed). */ if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) || (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) || BP_IS_HOLE(db->db_blkptr)))) { arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); dbuf_set_data(db, arc_alloc_buf(db->db_objset->os_spa, db, type, db->db.db_size)); bzero(db->db.db_data, db->db.db_size); if (db->db_blkptr != NULL && db->db_level > 0 && BP_IS_HOLE(db->db_blkptr) && db->db_blkptr->blk_birth != 0) { blkptr_t *bps = db->db.db_data; int i; for (i = 0; i < ((1 << DB_DNODE(db)->dn_indblkshift) / sizeof (blkptr_t)); i++) { blkptr_t *bp = &bps[i]; ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, 1 << dn->dn_indblkshift); BP_SET_LSIZE(bp, BP_GET_LEVEL(db->db_blkptr) == 1 ? dn->dn_datablksz : BP_GET_LSIZE(db->db_blkptr)); BP_SET_TYPE(bp, BP_GET_TYPE(db->db_blkptr)); BP_SET_LEVEL(bp, BP_GET_LEVEL(db->db_blkptr) - 1); BP_SET_BIRTH(bp, db->db_blkptr->blk_birth, 0); } } DB_DNODE_EXIT(db); db->db_state = DB_CACHED; mutex_exit(&db->db_mtx); return (0); } DB_DNODE_EXIT(db); db->db_state = DB_READ; mutex_exit(&db->db_mtx); if (DBUF_IS_L2CACHEABLE(db)) aflags |= ARC_FLAG_L2CACHE; SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ? db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET, db->db.db_object, db->db_level, db->db_blkid); dbuf_add_ref(db, NULL); err = arc_read(zio, db->db_objset->os_spa, db->db_blkptr, dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, (flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED, &aflags, &zb); return (err); } /* * This is our just-in-time copy function. It makes a copy of buffers that * have been modified in a previous transaction group before we access them in * the current active group. * * This function is used in three places: when we are dirtying a buffer for the * first time in a txg, when we are freeing a range in a dnode that includes * this buffer, and when we are accessing a buffer which was received compressed * and later referenced in a WRITE_BYREF record. * * Note that when we are called from dbuf_free_range() we do not put a hold on * the buffer, we just traverse the active dbuf list for the dnode. */ static void dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) { dbuf_dirty_record_t *dr = db->db_last_dirty; ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(db->db.db_data != NULL); ASSERT(db->db_level == 0); ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT); if (dr == NULL || (dr->dt.dl.dr_data != ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf))) return; /* * If the last dirty record for this dbuf has not yet synced * and it's referencing the dbuf data, either: * reset the reference to point to a new copy, * or (if there are no active holders) * just null out the current db_data pointer.
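
/*
 * Illustrative sketch: the three outcomes of the decision made just
 * below, as a table. Plain flags stand in for the real checks: bonus
 * buffers are copied with kmem, actively held buffers are cloned into a
 * fresh arc buf, and otherwise the dirty record simply keeps the
 * existing buffer while db_data is nulled out.
 */
enum fix_action {
	FIX_COPY_BONUS,	/* bonus buffer: kmem copy for the dirty record */
	FIX_COPY_ARC,	/* holds > dirtycnt: clone into a new arc buf */
	FIX_STEAL	/* no other holders: dirty record keeps the buf */
};

static enum fix_action
fix_old_data_action(int is_bonus, long holds, long dirtycnt)
{
	if (is_bonus)
		return (FIX_COPY_BONUS);
	if (holds > dirtycnt)
		return (FIX_COPY_ARC);
	return (FIX_STEAL);
}
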
*/ ASSERT(dr->dr_txg >= txg - 2); if (db->db_blkid == DMU_BONUS_BLKID) { dnode_t *dn = DB_DNODE(db); int bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots); dr->dt.dl.dr_data = kmem_alloc(bonuslen, KM_SLEEP); arc_space_consume(bonuslen, ARC_SPACE_BONUS); bcopy(db->db.db_data, dr->dt.dl.dr_data, bonuslen); } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) { int size = arc_buf_size(db->db_buf); arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); spa_t *spa = db->db_objset->os_spa; enum zio_compress compress_type = arc_get_compression(db->db_buf); if (compress_type == ZIO_COMPRESS_OFF) { dr->dt.dl.dr_data = arc_alloc_buf(spa, db, type, size); } else { ASSERT3U(type, ==, ARC_BUFC_DATA); dr->dt.dl.dr_data = arc_alloc_compressed_buf(spa, db, size, arc_buf_lsize(db->db_buf), compress_type); } bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size); } else { db->db_buf = NULL; dbuf_clear_data(db); } } int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) { int err = 0; boolean_t havepzio = (zio != NULL); boolean_t prefetch; dnode_t *dn; /* * We don't have to hold the mutex to check db_state because it * can't be freed while we have a hold on the buffer. */ ASSERT(!refcount_is_zero(&db->db_holds)); if (db->db_state == DB_NOFILL) return (SET_ERROR(EIO)); DB_DNODE_ENTER(db); dn = DB_DNODE(db); if ((flags & DB_RF_HAVESTRUCT) == 0) rw_enter(&dn->dn_struct_rwlock, RW_READER); prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL && DBUF_IS_CACHEABLE(db); mutex_enter(&db->db_mtx); if (db->db_state == DB_CACHED) { /* * If the arc buf is compressed, we need to decompress it to * read the data. This could happen during the "zfs receive" of * a stream which is compressed and deduplicated. */ if (db->db_buf != NULL && arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF) { dbuf_fix_old_data(db, spa_syncing_txg(dmu_objset_spa(db->db_objset))); err = arc_decompress(db->db_buf); dbuf_set_data(db, db->db_buf); } mutex_exit(&db->db_mtx); if (prefetch) dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE); if ((flags & DB_RF_HAVESTRUCT) == 0) rw_exit(&dn->dn_struct_rwlock); DB_DNODE_EXIT(db); } else if (db->db_state == DB_UNCACHED) { spa_t *spa = dn->dn_objset->os_spa; if (zio == NULL && db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); err = dbuf_read_impl(db, zio, flags); /* dbuf_read_impl has dropped db_mtx for us */ if (!err && prefetch) dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE); if ((flags & DB_RF_HAVESTRUCT) == 0) rw_exit(&dn->dn_struct_rwlock); DB_DNODE_EXIT(db); if (!err && !havepzio && zio != NULL) err = zio_wait(zio); } else { /* * Another reader came in while the dbuf was in flight * between UNCACHED and CACHED. Either a writer will finish * writing the buffer (sending the dbuf to CACHED) or the * first reader's request will reach the read_done callback * and send the dbuf to CACHED. Otherwise, a failure * occurred and the dbuf went to UNCACHED. */ mutex_exit(&db->db_mtx); if (prefetch) dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE); if ((flags & DB_RF_HAVESTRUCT) == 0) rw_exit(&dn->dn_struct_rwlock); DB_DNODE_EXIT(db); /* Skip the wait per the caller's request. 
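
/*
 * Illustrative sketch: the blocked-read wait implemented just below,
 * restated for userland with pthreads standing in for cv_wait() and
 * db_mtx. Callers that did not pass DB_RF_NEVERWAIT sleep until the
 * buffer leaves its transitional READ/FILL states, then treat a final
 * UNCACHED state as an I/O error. The state values are a simplified
 * mirror of db_state.
 */
#include <errno.h>
#include <pthread.h>

enum { ST_UNCACHED, ST_READ, ST_FILL, ST_CACHED };

static int
wait_for_cached(pthread_mutex_t *mtx, pthread_cond_t *changed,
    const int *state)
{
	int err = 0;

	pthread_mutex_lock(mtx);
	while (*state == ST_READ || *state == ST_FILL)
		pthread_cond_wait(changed, mtx);
	if (*state == ST_UNCACHED)
		err = EIO;
	pthread_mutex_unlock(mtx);
	return (err);
}
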
*/ mutex_enter(&db->db_mtx); if ((flags & DB_RF_NEVERWAIT) == 0) { while (db->db_state == DB_READ || db->db_state == DB_FILL) { ASSERT(db->db_state == DB_READ || (flags & DB_RF_HAVESTRUCT) == 0); DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *, db, zio_t *, zio); cv_wait(&db->db_changed, &db->db_mtx); } if (db->db_state == DB_UNCACHED) err = SET_ERROR(EIO); } mutex_exit(&db->db_mtx); } ASSERT(err || havepzio || db->db_state == DB_CACHED); return (err); } static void dbuf_noread(dmu_buf_impl_t *db) { ASSERT(!refcount_is_zero(&db->db_holds)); ASSERT(db->db_blkid != DMU_BONUS_BLKID); mutex_enter(&db->db_mtx); while (db->db_state == DB_READ || db->db_state == DB_FILL) cv_wait(&db->db_changed, &db->db_mtx); if (db->db_state == DB_UNCACHED) { arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); spa_t *spa = db->db_objset->os_spa; ASSERT(db->db_buf == NULL); ASSERT(db->db.db_data == NULL); dbuf_set_data(db, arc_alloc_buf(spa, db, type, db->db.db_size)); db->db_state = DB_FILL; } else if (db->db_state == DB_NOFILL) { dbuf_clear_data(db); } else { ASSERT3U(db->db_state, ==, DB_CACHED); } mutex_exit(&db->db_mtx); } void dbuf_unoverride(dbuf_dirty_record_t *dr) { dmu_buf_impl_t *db = dr->dr_dbuf; blkptr_t *bp = &dr->dt.dl.dr_overridden_by; uint64_t txg = dr->dr_txg; ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC); ASSERT(db->db_level == 0); if (db->db_blkid == DMU_BONUS_BLKID || dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN) return; ASSERT(db->db_data_pending != dr); /* free this block */ if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite) zio_free(db->db_objset->os_spa, txg, bp); dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; dr->dt.dl.dr_nopwrite = B_FALSE; /* * Release the already-written buffer, so we leave it in * a consistent dirty state. Note that all callers are * modifying the buffer, so they will immediately do * another (redundant) arc_release(). Therefore, leave * the buf thawed to save the effort of freezing & * immediately re-thawing it. */ arc_release(dr->dt.dl.dr_data, db); } /* * Evict (if it's unreferenced) or clear (if it's referenced) any level-0 * data blocks in the free range, so that any future readers will find * empty blocks. * * This is a no-op if the dataset is in the middle of an incremental * receive; see comment below for details. */ void dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, dmu_tx_t *tx) { dmu_buf_impl_t *db_search; dmu_buf_impl_t *db, *db_next; uint64_t txg = tx->tx_txg; avl_index_t where; boolean_t freespill = (start_blkid == DMU_SPILL_BLKID || end_blkid == DMU_SPILL_BLKID); if (end_blkid > dn->dn_maxblkid && !freespill) end_blkid = dn->dn_maxblkid; dprintf_dnode(dn, "start=%llu end=%llu\n", start_blkid, end_blkid); db_search = kmem_alloc(sizeof (dmu_buf_impl_t), KM_SLEEP); db_search->db_level = 0; db_search->db_blkid = start_blkid; db_search->db_state = DB_SEARCH; mutex_enter(&dn->dn_dbufs_mtx); if (start_blkid >= dn->dn_unlisted_l0_blkid && !freespill) { /* There can't be any dbufs in this range; no need to search. */ #ifdef DEBUG db = avl_find(&dn->dn_dbufs, db_search, &where); ASSERT3P(db, ==, NULL); db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER); ASSERT(db == NULL || db->db_level > 0); #endif goto out; } else if (dmu_objset_is_receiving(dn->dn_objset)) { /* * If we are receiving, we expect there to be no dbufs in * the range to be freed, because receive modifies each * block at most once, and in offset order.
If this is * not the case, it can lead to performance problems, * so note that we unexpectedly took the slow path. */ atomic_inc_64(&zfs_free_range_recv_miss); } db = avl_find(&dn->dn_dbufs, db_search, &where); ASSERT3P(db, ==, NULL); db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER); for (; db != NULL; db = db_next) { db_next = AVL_NEXT(&dn->dn_dbufs, db); ASSERT(db->db_blkid != DMU_BONUS_BLKID); if (db->db_level != 0 || db->db_blkid > end_blkid) { break; } ASSERT3U(db->db_blkid, >=, start_blkid); /* found a level 0 buffer in the range */ mutex_enter(&db->db_mtx); if (dbuf_undirty(db, tx)) { /* mutex has been dropped and dbuf destroyed */ continue; } if (db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL || db->db_state == DB_EVICTING) { ASSERT(db->db.db_data == NULL); mutex_exit(&db->db_mtx); continue; } if (db->db_state == DB_READ || db->db_state == DB_FILL) { /* will be handled in dbuf_read_done or dbuf_rele */ db->db_freed_in_flight = TRUE; mutex_exit(&db->db_mtx); continue; } if (refcount_count(&db->db_holds) == 0) { ASSERT(db->db_buf); dbuf_destroy(db); continue; } /* The dbuf is referenced */ if (db->db_last_dirty != NULL) { dbuf_dirty_record_t *dr = db->db_last_dirty; if (dr->dr_txg == txg) { /* * This buffer is "in-use", re-adjust the file * size to reflect that this buffer may * contain new data when we sync. */ if (db->db_blkid != DMU_SPILL_BLKID && db->db_blkid > dn->dn_maxblkid) dn->dn_maxblkid = db->db_blkid; dbuf_unoverride(dr); } else { /* * This dbuf is not dirty in the open context. * Either uncache it (if it's not referenced in * the open context) or reset its contents to * empty. */ dbuf_fix_old_data(db, txg); } } /* clear the contents if it's cached */ if (db->db_state == DB_CACHED) { ASSERT(db->db.db_data != NULL); arc_release(db->db_buf, db); bzero(db->db.db_data, db->db.db_size); arc_buf_freeze(db->db_buf); } mutex_exit(&db->db_mtx); } out: kmem_free(db_search, sizeof (dmu_buf_impl_t)); mutex_exit(&dn->dn_dbufs_mtx); } static int dbuf_block_freeable(dmu_buf_impl_t *db) { dsl_dataset_t *ds = db->db_objset->os_dsl_dataset; uint64_t birth_txg = 0; /* * We don't need any locking to protect db_blkptr: * If it's syncing, then db_last_dirty will be set * so we'll ignore db_blkptr. * * This logic ensures that only block births for * filled blocks are considered. */ ASSERT(MUTEX_HELD(&db->db_mtx)); if (db->db_last_dirty && (db->db_blkptr == NULL || !BP_IS_HOLE(db->db_blkptr))) { birth_txg = db->db_last_dirty->dr_txg; } else if (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) { birth_txg = db->db_blkptr->blk_birth; } /* * If this block doesn't exist or is in a snapshot, it can't be freed. * Don't pass the bp to dsl_dataset_block_freeable() since we * are holding the db_mtx lock and might deadlock if we are * prefetching a dedup-ed block. */ if (birth_txg != 0) return (ds == NULL || dsl_dataset_block_freeable(ds, NULL, birth_txg)); else return (B_FALSE); } void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) { arc_buf_t *buf, *obuf; int osize = db->db.db_size; arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); dnode_t *dn; ASSERT(db->db_blkid != DMU_BONUS_BLKID); DB_DNODE_ENTER(db); dn = DB_DNODE(db); /* XXX does *this* func really need the lock? */ ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); /* * This call to dmu_buf_will_dirty() with the dn_struct_rwlock held * is OK, because there can be no other references to the db * when we are changing its size, so no concurrent DB_FILL can * be happening.
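
/*
 * Illustrative sketch: the birth-txg selection inside
 * dbuf_block_freeable() above, with plain flags standing in for the
 * dirty-record and block-pointer checks. A pending dirty record takes
 * precedence over the on-disk block pointer, holes never yield a birth
 * txg, and a return of 0 means the block does not exist and so cannot
 * be freed.
 */
#include <stdint.h>

static uint64_t
select_birth_txg(int has_dirty_nonhole, uint64_t dirty_txg,
    int has_bp_nonhole, uint64_t bp_birth)
{
	if (has_dirty_nonhole)
		return (dirty_txg);
	if (has_bp_nonhole)
		return (bp_birth);
	return (0);	/* hole or nonexistent block */
}
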
*/ /* * XXX we should be doing a dbuf_read, checking the return * value and returning that up to our callers */ dmu_buf_will_dirty(&db->db, tx); /* create the data buffer for the new block */ buf = arc_alloc_buf(dn->dn_objset->os_spa, db, type, size); /* copy old block data to the new block */ obuf = db->db_buf; bcopy(obuf->b_data, buf->b_data, MIN(osize, size)); /* zero the remainder */ if (size > osize) bzero((uint8_t *)buf->b_data + osize, size - osize); mutex_enter(&db->db_mtx); dbuf_set_data(db, buf); arc_buf_destroy(obuf, db); db->db.db_size = size; if (db->db_level == 0) { ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg); db->db_last_dirty->dt.dl.dr_data = buf; } mutex_exit(&db->db_mtx); dnode_willuse_space(dn, size-osize, tx); DB_DNODE_EXIT(db); } void dbuf_release_bp(dmu_buf_impl_t *db) { ASSERTV(objset_t *os = db->db_objset); ASSERT(dsl_pool_sync_context(dmu_objset_pool(os))); ASSERT(arc_released(os->os_phys_buf) || list_link_active(&os->os_dsl_dataset->ds_synced_link)); ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf)); (void) arc_release(db->db_buf, db); } /* * We already have a dirty record for this TXG, and we are being * dirtied again. */ static void dbuf_redirty(dbuf_dirty_record_t *dr) { dmu_buf_impl_t *db = dr->dr_dbuf; ASSERT(MUTEX_HELD(&db->db_mtx)); if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) { /* * If this buffer has already been written out, * we now need to reset its state. */ dbuf_unoverride(dr); if (db->db.db_object != DMU_META_DNODE_OBJECT && db->db_state != DB_NOFILL) { /* Already released on initial dirty, so just thaw. */ ASSERT(arc_released(db->db_buf)); arc_buf_thaw(db->db_buf); } } } dbuf_dirty_record_t * dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) { dnode_t *dn; objset_t *os; dbuf_dirty_record_t **drp, *dr; int drop_struct_lock = FALSE; boolean_t do_free_accounting = B_FALSE; int txgoff = tx->tx_txg & TXG_MASK; ASSERT(tx->tx_txg != 0); ASSERT(!refcount_is_zero(&db->db_holds)); DMU_TX_DIRTY_BUF(tx, db); DB_DNODE_ENTER(db); dn = DB_DNODE(db); /* * Shouldn't dirty a regular buffer in syncing context. Private * objects may be dirtied in syncing context, but only if they * were already pre-dirtied in open context. */ ASSERT(!dmu_tx_is_syncing(tx) || BP_IS_HOLE(dn->dn_objset->os_rootbp) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) || dn->dn_objset->os_dsl_dataset == NULL); /* * We make this assert for private objects as well, but after we * check if we're already dirty. They are allowed to re-dirty * in syncing context. */ ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); mutex_enter(&db->db_mtx); /* * XXX make this true for indirects too? The problem is that * transactions created with dmu_tx_create_assigned() from * syncing context don't bother holding ahead. */ ASSERT(db->db_level != 0 || db->db_state == DB_CACHED || db->db_state == DB_FILL || db->db_state == DB_NOFILL); mutex_enter(&dn->dn_mtx); /* * Don't set dirtyctx to SYNC if we're just modifying this as we * initialize the objset. */ if (dn->dn_dirtyctx == DN_UNDIRTIED && !BP_IS_HOLE(dn->dn_objset->os_rootbp)) { dn->dn_dirtyctx = (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN); ASSERT(dn->dn_dirtyctx_firstset == NULL); dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP); } mutex_exit(&dn->dn_mtx); if (db->db_blkid == DMU_SPILL_BLKID) dn->dn_have_spill = B_TRUE; /* * If this buffer is already dirty, we're done. 
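
/*
 * Illustrative sketch: the dirty-record lookup just below, as a
 * standalone pattern. The records form a singly linked list sorted by
 * descending txg; walking it with a pointer-to-pointer cursor both
 * finds the first record with dr_txg <= txg and leaves *drp as the
 * splice point for inserting a new record, with no separate "previous"
 * variable. Minimal stand-in node type.
 */
#include <stddef.h>
#include <stdint.h>

struct dirty_rec {
	uint64_t dr_txg;
	struct dirty_rec *dr_next;
};

static struct dirty_rec **
find_dirty_slot(struct dirty_rec **drp, uint64_t txg)
{
	while (*drp != NULL && (*drp)->dr_txg > txg)
		drp = &(*drp)->dr_next;
	return (drp);	/* *drp is NULL or has dr_txg <= txg */
}
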
*/ drp = &db->db_last_dirty; ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg || db->db.db_object == DMU_META_DNODE_OBJECT); while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg) drp = &dr->dr_next; if (dr && dr->dr_txg == tx->tx_txg) { DB_DNODE_EXIT(db); dbuf_redirty(dr); mutex_exit(&db->db_mtx); return (dr); } /* * Only valid if not already dirty. */ ASSERT(dn->dn_object == 0 || dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); ASSERT3U(dn->dn_nlevels, >, db->db_level); ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) || dn->dn_phys->dn_nlevels > db->db_level || dn->dn_next_nlevels[txgoff] > db->db_level || dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level || dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level); /* * We should only be dirtying in syncing context if it's the * mos or we're initializing the os or it's a special object. * However, we are allowed to dirty in syncing context provided * we already dirtied it in open context. Hence we must make * this assertion only if we're not already dirty. */ os = dn->dn_objset; ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) || os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp)); ASSERT(db->db.db_size != 0); dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); if (db->db_blkid != DMU_BONUS_BLKID) { /* * Update the accounting. * Note: we delay "free accounting" until after we drop * the db_mtx. This keeps us from grabbing other locks * (and possibly deadlocking) in bp_get_dsize() while * also holding the db_mtx. */ dnode_willuse_space(dn, db->db.db_size, tx); do_free_accounting = dbuf_block_freeable(db); } /* * If this buffer is dirty in an old transaction group we need * to make a copy of it so that the changes we make in this * transaction group won't leak out when we sync the older txg. */ dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP); list_link_init(&dr->dr_dirty_node); if (db->db_level == 0) { void *data_old = db->db_buf; if (db->db_state != DB_NOFILL) { if (db->db_blkid == DMU_BONUS_BLKID) { dbuf_fix_old_data(db, tx->tx_txg); data_old = db->db.db_data; } else if (db->db.db_object != DMU_META_DNODE_OBJECT) { /* * Release the data buffer from the cache so * that we can modify it without impacting * possible other users of this cached data * block. Note that indirect blocks and * private objects are not released until the * syncing state (since they are only modified * then). */ arc_release(db->db_buf, db); dbuf_fix_old_data(db, tx->tx_txg); data_old = db->db_buf; } ASSERT(data_old != NULL); } dr->dt.dl.dr_data = data_old; } else { mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_NOLOCKDEP, NULL); list_create(&dr->dt.di.dr_children, sizeof (dbuf_dirty_record_t), offsetof(dbuf_dirty_record_t, dr_dirty_node)); } if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL) dr->dr_accounted = db->db.db_size; dr->dr_dbuf = db; dr->dr_txg = tx->tx_txg; dr->dr_next = *drp; *drp = dr; /* * We could have been freed_in_flight between the dbuf_noread * and dbuf_dirty. We win, as though the dbuf_noread() had * happened after the free. 
*/ if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && db->db_blkid != DMU_SPILL_BLKID) { mutex_enter(&dn->dn_mtx); if (dn->dn_free_ranges[txgoff] != NULL) { range_tree_clear(dn->dn_free_ranges[txgoff], db->db_blkid, 1); } mutex_exit(&dn->dn_mtx); db->db_freed_in_flight = FALSE; } /* * This buffer is now part of this txg */ dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg); db->db_dirtycnt += 1; ASSERT3U(db->db_dirtycnt, <=, 3); mutex_exit(&db->db_mtx); if (db->db_blkid == DMU_BONUS_BLKID || db->db_blkid == DMU_SPILL_BLKID) { mutex_enter(&dn->dn_mtx); ASSERT(!list_link_active(&dr->dr_dirty_node)); list_insert_tail(&dn->dn_dirty_records[txgoff], dr); mutex_exit(&dn->dn_mtx); dnode_setdirty(dn, tx); DB_DNODE_EXIT(db); return (dr); } /* * The dn_struct_rwlock prevents db_blkptr from changing * due to a write from syncing context completing * while we are running, so we want to acquire it before * looking at db_blkptr. */ if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { rw_enter(&dn->dn_struct_rwlock, RW_READER); drop_struct_lock = TRUE; } if (do_free_accounting) { blkptr_t *bp = db->db_blkptr; int64_t willfree = (bp && !BP_IS_HOLE(bp)) ? bp_get_dsize(os->os_spa, bp) : db->db.db_size; /* * This is only a guess -- if the dbuf is dirty * in a previous txg, we don't know how much * space it will use on disk yet. We should * really have the struct_rwlock to access * db_blkptr, but since this is just a guess, * it's OK if we get an odd answer. */ ddt_prefetch(os->os_spa, bp); dnode_willuse_space(dn, -willfree, tx); } if (db->db_level == 0) { dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock); ASSERT(dn->dn_maxblkid >= db->db_blkid); } if (db->db_level+1 < dn->dn_nlevels) { dmu_buf_impl_t *parent = db->db_parent; dbuf_dirty_record_t *di; int parent_held = FALSE; if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) { int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; parent = dbuf_hold_level(dn, db->db_level+1, db->db_blkid >> epbs, FTAG); ASSERT(parent != NULL); parent_held = TRUE; } if (drop_struct_lock) rw_exit(&dn->dn_struct_rwlock); ASSERT3U(db->db_level+1, ==, parent->db_level); di = dbuf_dirty(parent, tx); if (parent_held) dbuf_rele(parent, FTAG); mutex_enter(&db->db_mtx); /* * Since we've dropped the mutex, it's possible that * dbuf_undirty() might have changed this out from under us. */ if (db->db_last_dirty == dr || dn->dn_object == DMU_META_DNODE_OBJECT) { mutex_enter(&di->dt.di.dr_mtx); ASSERT3U(di->dr_txg, ==, tx->tx_txg); ASSERT(!list_link_active(&dr->dr_dirty_node)); list_insert_tail(&di->dt.di.dr_children, dr); mutex_exit(&di->dt.di.dr_mtx); dr->dr_parent = di; } mutex_exit(&db->db_mtx); } else { ASSERT(db->db_level+1 == dn->dn_nlevels); ASSERT(db->db_blkid < dn->dn_nblkptr); ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf); mutex_enter(&dn->dn_mtx); ASSERT(!list_link_active(&dr->dr_dirty_node)); list_insert_tail(&dn->dn_dirty_records[txgoff], dr); mutex_exit(&dn->dn_mtx); if (drop_struct_lock) rw_exit(&dn->dn_struct_rwlock); } dnode_setdirty(dn, tx); DB_DNODE_EXIT(db); return (dr); } /* * Undirty a buffer in the transaction group referenced by the given * transaction. Return whether this evicted the dbuf. */ static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) { dnode_t *dn; uint64_t txg = tx->tx_txg; dbuf_dirty_record_t *dr, **drp; ASSERT(txg != 0); /* * Due to our use of dn_nlevels below, this can only be called * in open context, unless we are operating on the MOS. 
* From syncing context, dn_nlevels may be different from the * dn_nlevels used when dbuf was dirtied. */ ASSERT(db->db_objset == dmu_objset_pool(db->db_objset)->dp_meta_objset || txg != spa_syncing_txg(dmu_objset_spa(db->db_objset))); ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT0(db->db_level); ASSERT(MUTEX_HELD(&db->db_mtx)); /* * If this buffer is not dirty, we're done. */ for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next) if (dr->dr_txg <= txg) break; if (dr == NULL || dr->dr_txg < txg) return (B_FALSE); ASSERT(dr->dr_txg == txg); ASSERT(dr->dr_dbuf == db); DB_DNODE_ENTER(db); dn = DB_DNODE(db); dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); ASSERT(db->db.db_size != 0); dsl_pool_undirty_space(dmu_objset_pool(dn->dn_objset), dr->dr_accounted, txg); *drp = dr->dr_next; /* * Note that there are three places in dbuf_dirty() * where this dirty record may be put on a list. * Make sure to do a list_remove corresponding to * every one of those list_insert calls. */ if (dr->dr_parent) { mutex_enter(&dr->dr_parent->dt.di.dr_mtx); list_remove(&dr->dr_parent->dt.di.dr_children, dr); mutex_exit(&dr->dr_parent->dt.di.dr_mtx); } else if (db->db_blkid == DMU_SPILL_BLKID || db->db_level + 1 == dn->dn_nlevels) { ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf); mutex_enter(&dn->dn_mtx); list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr); mutex_exit(&dn->dn_mtx); } DB_DNODE_EXIT(db); if (db->db_state != DB_NOFILL) { dbuf_unoverride(dr); ASSERT(db->db_buf != NULL); ASSERT(dr->dt.dl.dr_data != NULL); if (dr->dt.dl.dr_data != db->db_buf) arc_buf_destroy(dr->dt.dl.dr_data, db); } kmem_free(dr, sizeof (dbuf_dirty_record_t)); ASSERT(db->db_dirtycnt > 0); db->db_dirtycnt -= 1; if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) { ASSERT(db->db_state == DB_NOFILL || arc_released(db->db_buf)); dbuf_destroy(db); return (B_TRUE); } return (B_FALSE); } void dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH; dbuf_dirty_record_t *dr; ASSERT(tx->tx_txg != 0); ASSERT(!refcount_is_zero(&db->db_holds)); /* * Quick check for dirtiness. For already dirty blocks, this * reduces runtime of this function by >90%, and overall performance * by 50% for some workloads (e.g. file deletion with indirect blocks * cached). */ mutex_enter(&db->db_mtx); for (dr = db->db_last_dirty; dr != NULL && dr->dr_txg >= tx->tx_txg; dr = dr->dr_next) { /* * It's possible that it is already dirty but not cached, * because there are some calls to dbuf_dirty() that don't * go through dmu_buf_will_dirty(). */ if (dr->dr_txg == tx->tx_txg && db->db_state == DB_CACHED) { /* This dbuf is already dirty and cached. 
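* dbuf_redirty() below only resets any override state and thaws the * buffer; no new dirty record needs to be allocated.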
*/ dbuf_redirty(dr); mutex_exit(&db->db_mtx); return; } } mutex_exit(&db->db_mtx); DB_DNODE_ENTER(db); if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock)) rf |= DB_RF_HAVESTRUCT; DB_DNODE_EXIT(db); (void) dbuf_read(db, NULL, rf); (void) dbuf_dirty(db, tx); } void dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; db->db_state = DB_NOFILL; dmu_buf_will_fill(db_fake, tx); } void dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(tx->tx_txg != 0); ASSERT(db->db_level == 0); ASSERT(!refcount_is_zero(&db->db_holds)); ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx)); dbuf_noread(db); (void) dbuf_dirty(db, tx); } #pragma weak dmu_buf_fill_done = dbuf_fill_done /* ARGSUSED */ void dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx) { mutex_enter(&db->db_mtx); DBUF_VERIFY(db); if (db->db_state == DB_FILL) { if (db->db_level == 0 && db->db_freed_in_flight) { ASSERT(db->db_blkid != DMU_BONUS_BLKID); /* we were freed while filling */ /* XXX dbuf_undirty? */ bzero(db->db.db_data, db->db.db_size); db->db_freed_in_flight = FALSE; } db->db_state = DB_CACHED; cv_broadcast(&db->db_changed); } mutex_exit(&db->db_mtx); } void dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data, bp_embedded_type_t etype, enum zio_compress comp, int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf; struct dirty_leaf *dl; dmu_object_type_t type; if (etype == BP_EMBEDDED_TYPE_DATA) { ASSERT(spa_feature_is_active(dmu_objset_spa(db->db_objset), SPA_FEATURE_EMBEDDED_DATA)); } DB_DNODE_ENTER(db); type = DB_DNODE(db)->dn_type; DB_DNODE_EXIT(db); ASSERT0(db->db_level); ASSERT(db->db_blkid != DMU_BONUS_BLKID); dmu_buf_will_not_fill(dbuf, tx); ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg); dl = &db->db_last_dirty->dt.dl; encode_embedded_bp_compressed(&dl->dr_overridden_by, data, comp, uncompressed_size, compressed_size); BPE_SET_ETYPE(&dl->dr_overridden_by, etype); BP_SET_TYPE(&dl->dr_overridden_by, type); BP_SET_LEVEL(&dl->dr_overridden_by, 0); BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder); dl->dr_override_state = DR_OVERRIDDEN; dl->dr_overridden_by.blk_birth = db->db_last_dirty->dr_txg; } /* * Directly assign a provided arc buf to a given dbuf if it's not referenced * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf. 
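* The copy path below is taken when anyone else holds the dbuf (holds * exceed our own plus the dirty records), since they may still be * reading db_data.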
*/ void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) { ASSERT(!refcount_is_zero(&db->db_holds)); ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(db->db_level == 0); ASSERT3U(dbuf_is_metadata(db), ==, arc_is_metadata(buf)); ASSERT(buf != NULL); ASSERT(arc_buf_lsize(buf) == db->db.db_size); ASSERT(tx->tx_txg != 0); arc_return_buf(buf, db); ASSERT(arc_released(buf)); mutex_enter(&db->db_mtx); while (db->db_state == DB_READ || db->db_state == DB_FILL) cv_wait(&db->db_changed, &db->db_mtx); ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED); if (db->db_state == DB_CACHED && refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) { mutex_exit(&db->db_mtx); (void) dbuf_dirty(db, tx); bcopy(buf->b_data, db->db.db_data, db->db.db_size); arc_buf_destroy(buf, db); xuio_stat_wbuf_copied(); return; } xuio_stat_wbuf_nocopy(); if (db->db_state == DB_CACHED) { dbuf_dirty_record_t *dr = db->db_last_dirty; ASSERT(db->db_buf != NULL); if (dr != NULL && dr->dr_txg == tx->tx_txg) { ASSERT(dr->dt.dl.dr_data == db->db_buf); if (!arc_released(db->db_buf)) { ASSERT(dr->dt.dl.dr_override_state == DR_OVERRIDDEN); arc_release(db->db_buf, db); } dr->dt.dl.dr_data = buf; arc_buf_destroy(db->db_buf, db); } else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) { arc_release(db->db_buf, db); arc_buf_destroy(db->db_buf, db); } db->db_buf = NULL; } ASSERT(db->db_buf == NULL); dbuf_set_data(db, buf); db->db_state = DB_FILL; mutex_exit(&db->db_mtx); (void) dbuf_dirty(db, tx); dmu_buf_fill_done(&db->db, tx); } void dbuf_destroy(dmu_buf_impl_t *db) { dnode_t *dn; dmu_buf_impl_t *parent = db->db_parent; dmu_buf_impl_t *dndb; ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(refcount_is_zero(&db->db_holds)); if (db->db_buf != NULL) { arc_buf_destroy(db->db_buf, db); db->db_buf = NULL; } if (db->db_blkid == DMU_BONUS_BLKID) { int slots = DB_DNODE(db)->dn_num_slots; int bonuslen = DN_SLOTS_TO_BONUSLEN(slots); ASSERT(db->db.db_data != NULL); kmem_free(db->db.db_data, bonuslen); arc_space_return(bonuslen, ARC_SPACE_BONUS); db->db_state = DB_UNCACHED; } dbuf_clear_data(db); if (multilist_link_active(&db->db_cache_link)) { multilist_remove(&dbuf_cache, db); (void) refcount_remove_many(&dbuf_cache_size, db->db.db_size, db); } ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); ASSERT(db->db_data_pending == NULL); db->db_state = DB_EVICTING; db->db_blkptr = NULL; /* * Now that db_state is DB_EVICTING, nobody else can find this via * the hash table. We can now drop db_mtx, which allows us to * acquire the dn_dbufs_mtx. */ mutex_exit(&db->db_mtx); DB_DNODE_ENTER(db); dn = DB_DNODE(db); dndb = dn->dn_dbuf; if (db->db_blkid != DMU_BONUS_BLKID) { boolean_t needlock = !MUTEX_HELD(&dn->dn_dbufs_mtx); if (needlock) mutex_enter(&dn->dn_dbufs_mtx); avl_remove(&dn->dn_dbufs, db); atomic_dec_32(&dn->dn_dbufs_count); membar_producer(); DB_DNODE_EXIT(db); if (needlock) mutex_exit(&dn->dn_dbufs_mtx); /* * Decrementing the dbuf count means that the hold corresponding * to the removed dbuf is no longer discounted in dnode_move(), * so the dnode cannot be moved until after we release the hold. * The membar_producer() ensures visibility of the decremented * value in dnode_move(), since DB_DNODE_EXIT doesn't actually * release any lock. 
*/ dnode_rele(dn, db); db->db_dnode_handle = NULL; dbuf_hash_remove(db); } else { DB_DNODE_EXIT(db); } ASSERT(refcount_is_zero(&db->db_holds)); db->db_parent = NULL; ASSERT(db->db_buf == NULL); ASSERT(db->db.db_data == NULL); ASSERT(db->db_hash_next == NULL); ASSERT(db->db_blkptr == NULL); ASSERT(db->db_data_pending == NULL); ASSERT(!multilist_link_active(&db->db_cache_link)); kmem_cache_free(dbuf_kmem_cache, db); arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF); /* * If this dbuf is referenced from an indirect dbuf, * decrement the ref count on the indirect dbuf. */ if (parent && parent != dndb) dbuf_rele(parent, db); } /* * Note: While bpp will always be updated if the function returns success, * parentp will not be updated if the dnode does not have dn_dbuf filled in; * this happens when the dnode is the meta-dnode, or a userused or groupused * object. */ __attribute__((always_inline)) static inline int dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse, dmu_buf_impl_t **parentp, blkptr_t **bpp, struct dbuf_hold_impl_data *dh) { int nlevels, epbs; *parentp = NULL; *bpp = NULL; ASSERT(blkid != DMU_BONUS_BLKID); if (blkid == DMU_SPILL_BLKID) { mutex_enter(&dn->dn_mtx); if (dn->dn_have_spill && (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) *bpp = DN_SPILL_BLKPTR(dn->dn_phys); else *bpp = NULL; dbuf_add_ref(dn->dn_dbuf, NULL); *parentp = dn->dn_dbuf; mutex_exit(&dn->dn_mtx); return (0); } nlevels = (dn->dn_phys->dn_nlevels == 0) ? 1 : dn->dn_phys->dn_nlevels; epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; ASSERT3U(level * epbs, <, 64); ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); /* * This assertion shouldn't trip as long as the max indirect block size * is less than 1M. The reason for this is that up to that point, * the number of levels required to address an entire object with blocks * of size SPA_MINBLOCKSIZE satisfies nlevels * epbs + 1 <= 64. In * other words, if N * epbs + 1 > 64, then if (N-1) * epbs + 1 > 55 * (i.e. we can address the entire object), objects will all use at most * N-1 levels and the assertion won't overflow. However, once epbs is * 13, 4 * 13 + 1 = 53, but 5 * 13 + 1 = 66. Then, 4 levels will not be * enough to address an entire object, so objects will have 5 levels, * but then this assertion will overflow. * * All this is to say that if we ever increase DN_MAX_INDBLKSHIFT, we * need to redo this logic to handle overflows. 
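* For example, with today's 128K maximum indirect block size, epbs = * 17 - 7 = 10: the smallest N with N * 10 + 1 > 64 is 7, and * (7 - 1) * 10 + 1 = 61 > 55, so objects use at most 6 levels and the * assertion computes at most 61, which fits in 64 bits.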
*/ ASSERT(level >= nlevels || ((nlevels - level - 1) * epbs) + highbit64(dn->dn_phys->dn_nblkptr) <= 64); if (level >= nlevels || blkid >= ((uint64_t)dn->dn_phys->dn_nblkptr << ((nlevels - level - 1) * epbs)) || (fail_sparse && blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) { /* the buffer has no parent yet */ return (SET_ERROR(ENOENT)); } else if (level < nlevels-1) { /* this block is referenced from an indirect block */ int err; if (dh == NULL) { err = dbuf_hold_impl(dn, level+1, blkid >> epbs, fail_sparse, FALSE, NULL, parentp); } else { __dbuf_hold_impl_init(dh + 1, dn, dh->dh_level + 1, blkid >> epbs, fail_sparse, FALSE, NULL, parentp, dh->dh_depth + 1); err = __dbuf_hold_impl(dh + 1); } if (err) return (err); err = dbuf_read(*parentp, NULL, (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL)); if (err) { dbuf_rele(*parentp, NULL); *parentp = NULL; return (err); } *bpp = ((blkptr_t *)(*parentp)->db.db_data) + (blkid & ((1ULL << epbs) - 1)); if (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs))) ASSERT(BP_IS_HOLE(*bpp)); return (0); } else { /* the block is referenced from the dnode */ ASSERT3U(level, ==, nlevels-1); ASSERT(dn->dn_phys->dn_nblkptr == 0 || blkid < dn->dn_phys->dn_nblkptr); if (dn->dn_dbuf) { dbuf_add_ref(dn->dn_dbuf, NULL); *parentp = dn->dn_dbuf; } *bpp = &dn->dn_phys->dn_blkptr[blkid]; return (0); } } static dmu_buf_impl_t * dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, dmu_buf_impl_t *parent, blkptr_t *blkptr) { objset_t *os = dn->dn_objset; dmu_buf_impl_t *db, *odb; ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); ASSERT(dn->dn_type != DMU_OT_NONE); db = kmem_cache_alloc(dbuf_kmem_cache, KM_SLEEP); db->db_objset = os; db->db.db_object = dn->dn_object; db->db_level = level; db->db_blkid = blkid; db->db_last_dirty = NULL; db->db_dirtycnt = 0; db->db_dnode_handle = dn->dn_handle; db->db_parent = parent; db->db_blkptr = blkptr; db->db_user = NULL; db->db_user_immediate_evict = FALSE; db->db_freed_in_flight = FALSE; db->db_pending_evict = FALSE; if (blkid == DMU_BONUS_BLKID) { ASSERT3P(parent, ==, dn->dn_dbuf); db->db.db_size = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) - (dn->dn_nblkptr-1) * sizeof (blkptr_t); ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); db->db.db_offset = DMU_BONUS_BLKID; db->db_state = DB_UNCACHED; /* the bonus dbuf is not placed in the hash table */ arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF); return (db); } else if (blkid == DMU_SPILL_BLKID) { db->db.db_size = (blkptr != NULL) ? BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE; db->db.db_offset = 0; } else { int blocksize = db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz; db->db.db_size = blocksize; db->db.db_offset = db->db_blkid * blocksize; } /* * Hold the dn_dbufs_mtx while we get the new dbuf * in the hash table *and* added to the dbufs list. * This prevents a possible deadlock with someone * trying to look up this dbuf before it's added to the * dn_dbufs list. 
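* Both dbuf_hash_insert() and the avl_add() below complete under * dn_dbufs_mtx, so the two structures never disagree about a visible dbuf.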
*/ mutex_enter(&dn->dn_dbufs_mtx); db->db_state = DB_EVICTING; if ((odb = dbuf_hash_insert(db)) != NULL) { /* someone else inserted it first */ kmem_cache_free(dbuf_kmem_cache, db); mutex_exit(&dn->dn_dbufs_mtx); return (odb); } avl_add(&dn->dn_dbufs, db); if (db->db_level == 0 && db->db_blkid >= dn->dn_unlisted_l0_blkid) dn->dn_unlisted_l0_blkid = db->db_blkid + 1; db->db_state = DB_UNCACHED; mutex_exit(&dn->dn_dbufs_mtx); arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF); if (parent && parent != dn->dn_dbuf) dbuf_add_ref(parent, db); ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || refcount_count(&dn->dn_holds) > 0); (void) refcount_add(&dn->dn_holds, db); atomic_inc_32(&dn->dn_dbufs_count); dprintf_dbuf(db, "db=%p\n", db); return (db); } typedef struct dbuf_prefetch_arg { spa_t *dpa_spa; /* The spa to issue the prefetch in. */ zbookmark_phys_t dpa_zb; /* The target block to prefetch. */ int dpa_epbs; /* Entries (blkptr_t's) Per Block Shift. */ int dpa_curlevel; /* The current level that we're reading */ dnode_t *dpa_dnode; /* The dnode associated with the prefetch */ zio_priority_t dpa_prio; /* The priority I/Os should be issued at. */ zio_t *dpa_zio; /* The parent zio_t for all prefetches. */ arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch. */ } dbuf_prefetch_arg_t; /* * Actually issue the prefetch read for the block given. */ static void dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp) { arc_flags_t aflags; if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) return; aflags = dpa->dpa_aflags | ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp)); ASSERT3U(dpa->dpa_curlevel, ==, dpa->dpa_zb.zb_level); ASSERT(dpa->dpa_zio != NULL); (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp, NULL, NULL, dpa->dpa_prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &aflags, &dpa->dpa_zb); } /* * Called when an indirect block above our prefetch target is read in. This * will either read in the next indirect block down the tree or issue the actual * prefetch if the next block down is our target. */ static void dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private) { dbuf_prefetch_arg_t *dpa = private; uint64_t nextblkid; blkptr_t *bp; ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel); ASSERT3S(dpa->dpa_curlevel, >, 0); /* * The dpa_dnode is only valid if we are called with a NULL * zio. This indicates that the arc_read() returned without * first calling zio_read() to issue a physical read. Once * a physical read is made the dpa_dnode must be invalidated * as the locks guarding it may have been dropped. If the * dpa_dnode is still valid, then we want to add it to the dbuf * cache. To do so, we must hold the dbuf associated with the block * we just prefetched, read its contents so that we associate it * with an arc_buf_t, and then release it. 
*/ if (zio != NULL) { ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel); if (zio->io_flags & ZIO_FLAG_RAW) { ASSERT3U(BP_GET_PSIZE(zio->io_bp), ==, zio->io_size); } else { ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size); } ASSERT3P(zio->io_spa, ==, dpa->dpa_spa); dpa->dpa_dnode = NULL; } else if (dpa->dpa_dnode != NULL) { uint64_t curblkid = dpa->dpa_zb.zb_blkid >> (dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level)); dmu_buf_impl_t *db = dbuf_hold_level(dpa->dpa_dnode, dpa->dpa_curlevel, curblkid, FTAG); (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_HAVESTRUCT); dbuf_rele(db, FTAG); } dpa->dpa_curlevel--; nextblkid = dpa->dpa_zb.zb_blkid >> (dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level)); bp = ((blkptr_t *)abuf->b_data) + P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs); if (BP_IS_HOLE(bp) || (zio != NULL && zio->io_error != 0)) { kmem_free(dpa, sizeof (*dpa)); } else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) { ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid); dbuf_issue_final_prefetch(dpa, bp); kmem_free(dpa, sizeof (*dpa)); } else { arc_flags_t iter_aflags = ARC_FLAG_NOWAIT; zbookmark_phys_t zb; ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp)); SET_BOOKMARK(&zb, dpa->dpa_zb.zb_objset, dpa->dpa_zb.zb_object, dpa->dpa_curlevel, nextblkid); (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp, dbuf_prefetch_indirect_done, dpa, dpa->dpa_prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &iter_aflags, &zb); } arc_buf_destroy(abuf, private); } /* * Issue prefetch reads for the given block on the given level. If the indirect * blocks above that block are not in memory, we will read them in * asynchronously. As a result, this call never blocks waiting for a read to * complete. */ void dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio, arc_flags_t aflags) { blkptr_t bp; int epbs, nlevels, curlevel; uint64_t curblkid; dmu_buf_impl_t *db; zio_t *pio; dbuf_prefetch_arg_t *dpa; dsl_dataset_t *ds; ASSERT(blkid != DMU_BONUS_BLKID); ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); if (blkid > dn->dn_maxblkid) return; if (dnode_block_freed(dn, blkid)) return; /* * This dnode hasn't been written to disk yet, so there's nothing to * prefetch. */ nlevels = dn->dn_phys->dn_nlevels; if (level >= nlevels || dn->dn_phys->dn_nblkptr == 0) return; epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; if (dn->dn_phys->dn_maxblkid < blkid << (epbs * level)) return; db = dbuf_find(dn->dn_objset, dn->dn_object, level, blkid); if (db != NULL) { mutex_exit(&db->db_mtx); /* * This dbuf already exists. It is either CACHED, or * (we assume) about to be read or filled. */ return; } /* * Find the closest ancestor (indirect block) of the target block * that is present in the cache. In this indirect block, we will * find the bp that is at curlevel, curblkid. */ curlevel = level; curblkid = blkid; while (curlevel < nlevels - 1) { int parent_level = curlevel + 1; uint64_t parent_blkid = curblkid >> epbs; dmu_buf_impl_t *db; if (dbuf_hold_impl(dn, parent_level, parent_blkid, FALSE, TRUE, FTAG, &db) == 0) { blkptr_t *bpp = db->db_buf->b_data; bp = bpp[P2PHASE(curblkid, 1 << epbs)]; dbuf_rele(db, FTAG); break; } curlevel = parent_level; curblkid = parent_blkid; } if (curlevel == nlevels - 1) { /* No cached indirect blocks found. 
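* Fall back to the block pointer array embedded in the dnode itself.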
*/ ASSERT3U(curblkid, <, dn->dn_phys->dn_nblkptr); bp = dn->dn_phys->dn_blkptr[curblkid]; } if (BP_IS_HOLE(&bp)) return; ASSERT3U(curlevel, ==, BP_GET_LEVEL(&bp)); pio = zio_root(dmu_objset_spa(dn->dn_objset), NULL, NULL, ZIO_FLAG_CANFAIL); dpa = kmem_zalloc(sizeof (*dpa), KM_SLEEP); ds = dn->dn_objset->os_dsl_dataset; SET_BOOKMARK(&dpa->dpa_zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET, dn->dn_object, level, blkid); dpa->dpa_curlevel = curlevel; dpa->dpa_prio = prio; dpa->dpa_aflags = aflags; dpa->dpa_spa = dn->dn_objset->os_spa; dpa->dpa_dnode = dn; dpa->dpa_epbs = epbs; dpa->dpa_zio = pio; /* * If we have the indirect just above us, no need to do the asynchronous * prefetch chain; we'll just run the last step ourselves. If we're at * a higher level, though, we want to issue the prefetches for all the * indirect blocks asynchronously, so we can go on with whatever we were * doing. */ if (curlevel == level) { ASSERT3U(curblkid, ==, blkid); dbuf_issue_final_prefetch(dpa, &bp); kmem_free(dpa, sizeof (*dpa)); } else { arc_flags_t iter_aflags = ARC_FLAG_NOWAIT; zbookmark_phys_t zb; SET_BOOKMARK(&zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET, dn->dn_object, curlevel, curblkid); (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, &bp, dbuf_prefetch_indirect_done, dpa, prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &iter_aflags, &zb); } /* * We use pio here instead of dpa_zio since it's possible that * dpa may have already been freed. */ zio_nowait(pio); } #define DBUF_HOLD_IMPL_MAX_DEPTH 20 /* * Returns with db_holds incremented, and db_mtx not held. * Note: dn_struct_rwlock must be held. */ static int __dbuf_hold_impl(struct dbuf_hold_impl_data *dh) { ASSERT3S(dh->dh_depth, <, DBUF_HOLD_IMPL_MAX_DEPTH); dh->dh_parent = NULL; ASSERT(dh->dh_blkid != DMU_BONUS_BLKID); ASSERT(RW_LOCK_HELD(&dh->dh_dn->dn_struct_rwlock)); ASSERT3U(dh->dh_dn->dn_nlevels, >, dh->dh_level); *(dh->dh_dbp) = NULL; /* dbuf_find() returns with db_mtx held */ dh->dh_db = dbuf_find(dh->dh_dn->dn_objset, dh->dh_dn->dn_object, dh->dh_level, dh->dh_blkid); if (dh->dh_db == NULL) { dh->dh_bp = NULL; if (dh->dh_fail_uncached) return (SET_ERROR(ENOENT)); ASSERT3P(dh->dh_parent, ==, NULL); dh->dh_err = dbuf_findbp(dh->dh_dn, dh->dh_level, dh->dh_blkid, dh->dh_fail_sparse, &dh->dh_parent, &dh->dh_bp, dh); if (dh->dh_fail_sparse) { if (dh->dh_err == 0 && dh->dh_bp && BP_IS_HOLE(dh->dh_bp)) dh->dh_err = SET_ERROR(ENOENT); if (dh->dh_err) { if (dh->dh_parent) dbuf_rele(dh->dh_parent, NULL); return (dh->dh_err); } } if (dh->dh_err && dh->dh_err != ENOENT) return (dh->dh_err); dh->dh_db = dbuf_create(dh->dh_dn, dh->dh_level, dh->dh_blkid, dh->dh_parent, dh->dh_bp); } if (dh->dh_fail_uncached && dh->dh_db->db_state != DB_CACHED) { mutex_exit(&dh->dh_db->db_mtx); return (SET_ERROR(ENOENT)); } if (dh->dh_db->db_buf != NULL) ASSERT3P(dh->dh_db->db.db_data, ==, dh->dh_db->db_buf->b_data); ASSERT(dh->dh_db->db_buf == NULL || arc_referenced(dh->dh_db->db_buf)); /* * If this buffer is currently syncing out, and we are * still referencing it from db_data, we need to make a copy * of it in case we decide we want to dirty it again in this txg. 
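* The copy below (arc_alloc_buf() plus bcopy()) leaves the buffer that * is being synced out untouched.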
*/ if (dh->dh_db->db_level == 0 && dh->dh_db->db_blkid != DMU_BONUS_BLKID && dh->dh_dn->dn_object != DMU_META_DNODE_OBJECT && dh->dh_db->db_state == DB_CACHED && dh->dh_db->db_data_pending) { dh->dh_dr = dh->dh_db->db_data_pending; if (dh->dh_dr->dt.dl.dr_data == dh->dh_db->db_buf) { dh->dh_type = DBUF_GET_BUFC_TYPE(dh->dh_db); dbuf_set_data(dh->dh_db, arc_alloc_buf(dh->dh_dn->dn_objset->os_spa, dh->dh_db, dh->dh_type, dh->dh_db->db.db_size)); bcopy(dh->dh_dr->dt.dl.dr_data->b_data, dh->dh_db->db.db_data, dh->dh_db->db.db_size); } } if (multilist_link_active(&dh->dh_db->db_cache_link)) { ASSERT(refcount_is_zero(&dh->dh_db->db_holds)); multilist_remove(&dbuf_cache, dh->dh_db); (void) refcount_remove_many(&dbuf_cache_size, dh->dh_db->db.db_size, dh->dh_db); } (void) refcount_add(&dh->dh_db->db_holds, dh->dh_tag); DBUF_VERIFY(dh->dh_db); mutex_exit(&dh->dh_db->db_mtx); /* NOTE: we can't rele the parent until after we drop the db_mtx */ if (dh->dh_parent) dbuf_rele(dh->dh_parent, NULL); ASSERT3P(DB_DNODE(dh->dh_db), ==, dh->dh_dn); ASSERT3U(dh->dh_db->db_blkid, ==, dh->dh_blkid); ASSERT3U(dh->dh_db->db_level, ==, dh->dh_level); *(dh->dh_dbp) = dh->dh_db; return (0); } /* * The following code preserves the recursive function dbuf_hold_impl() * but moves the local variables AND function arguments to the heap to * minimize the stack frame size. Enough space is initially allocated * on the stack for 20 levels of recursion. */ int dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, boolean_t fail_sparse, boolean_t fail_uncached, void *tag, dmu_buf_impl_t **dbp) { struct dbuf_hold_impl_data *dh; int error; dh = kmem_alloc(sizeof (struct dbuf_hold_impl_data) * DBUF_HOLD_IMPL_MAX_DEPTH, KM_SLEEP); __dbuf_hold_impl_init(dh, dn, level, blkid, fail_sparse, fail_uncached, tag, dbp, 0); error = __dbuf_hold_impl(dh); kmem_free(dh, sizeof (struct dbuf_hold_impl_data) * DBUF_HOLD_IMPL_MAX_DEPTH); return (error); } static void __dbuf_hold_impl_init(struct dbuf_hold_impl_data *dh, dnode_t *dn, uint8_t level, uint64_t blkid, - boolean_t fail_sparse, boolean_t fail_uncached, - void *tag, dmu_buf_impl_t **dbp, int depth) + boolean_t fail_sparse, boolean_t fail_uncached, + void *tag, dmu_buf_impl_t **dbp, int depth) { dh->dh_dn = dn; dh->dh_level = level; dh->dh_blkid = blkid; dh->dh_fail_sparse = fail_sparse; dh->dh_fail_uncached = fail_uncached; dh->dh_tag = tag; dh->dh_dbp = dbp; dh->dh_db = NULL; dh->dh_parent = NULL; dh->dh_bp = NULL; dh->dh_err = 0; dh->dh_dr = NULL; dh->dh_type = 0; dh->dh_depth = depth; } dmu_buf_impl_t * dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag) { return (dbuf_hold_level(dn, 0, blkid, tag)); } dmu_buf_impl_t * dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag) { dmu_buf_impl_t *db; int err = dbuf_hold_impl(dn, level, blkid, FALSE, FALSE, tag, &db); return (err ? 
NULL : db); } void dbuf_create_bonus(dnode_t *dn) { ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); ASSERT(dn->dn_bonus == NULL); dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL); } int dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; if (db->db_blkid != DMU_SPILL_BLKID) return (SET_ERROR(ENOTSUP)); if (blksz == 0) blksz = SPA_MINBLOCKSIZE; ASSERT3U(blksz, <=, spa_maxblocksize(dmu_objset_spa(db->db_objset))); blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE); DB_DNODE_ENTER(db); dn = DB_DNODE(db); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); dbuf_new_size(db, blksz, tx); rw_exit(&dn->dn_struct_rwlock); DB_DNODE_EXIT(db); return (0); } void dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx) { dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx); } #pragma weak dmu_buf_add_ref = dbuf_add_ref void dbuf_add_ref(dmu_buf_impl_t *db, void *tag) { int64_t holds = refcount_add(&db->db_holds, tag); VERIFY3S(holds, >, 1); } #pragma weak dmu_buf_try_add_ref = dbuf_try_add_ref boolean_t dbuf_try_add_ref(dmu_buf_t *db_fake, objset_t *os, uint64_t obj, uint64_t blkid, void *tag) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dmu_buf_impl_t *found_db; boolean_t result = B_FALSE; if (blkid == DMU_BONUS_BLKID) found_db = dbuf_find_bonus(os, obj); else found_db = dbuf_find(os, obj, 0, blkid); if (found_db != NULL) { if (db == found_db && dbuf_refcount(db) > db->db_dirtycnt) { (void) refcount_add(&db->db_holds, tag); result = B_TRUE; } mutex_exit(&found_db->db_mtx); } return (result); } /* * If you call dbuf_rele() you had better not be referencing the dnode handle * unless you have some other direct or indirect hold on the dnode. (An indirect * hold is a hold on one of the dnode's dbufs, including the bonus buffer.) * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the * dnode's parent dbuf evicting its dnode handles. */ void dbuf_rele(dmu_buf_impl_t *db, void *tag) { mutex_enter(&db->db_mtx); dbuf_rele_and_unlock(db, tag); } void dmu_buf_rele(dmu_buf_t *db, void *tag) { dbuf_rele((dmu_buf_impl_t *)db, tag); } /* * dbuf_rele() for an already-locked dbuf. This is necessary to allow * db_dirtycnt and db_holds to be updated atomically. */ void dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) { int64_t holds; ASSERT(MUTEX_HELD(&db->db_mtx)); DBUF_VERIFY(db); /* * Remove the reference to the dbuf before removing its hold on the * dnode so we can guarantee in dnode_move() that a referenced bonus * buffer has a corresponding dnode hold. */ holds = refcount_remove(&db->db_holds, tag); ASSERT(holds >= 0); /* * We can't freeze indirects if there is a possibility that they * may be modified in the current syncing context. */ if (db->db_buf != NULL && holds == (db->db_level == 0 ? db->db_dirtycnt : 0)) { arc_buf_freeze(db->db_buf); } if (holds == db->db_dirtycnt && db->db_level == 0 && db->db_user_immediate_evict) dbuf_evict_user(db); if (holds == 0) { if (db->db_blkid == DMU_BONUS_BLKID) { dnode_t *dn; boolean_t evict_dbuf = db->db_pending_evict; /* * If the dnode moves here, we cannot cross this * barrier until the move completes. */ DB_DNODE_ENTER(db); dn = DB_DNODE(db); atomic_dec_32(&dn->dn_dbufs_count); /* * Decrementing the dbuf count means that the bonus * buffer's dnode hold is no longer discounted in * dnode_move(). The dnode cannot move until after * the dnode_rele() below. */ DB_DNODE_EXIT(db); /* * Do not reference db after its lock is dropped. * Another thread may evict it. 
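* Only the dn pointer saved above remains safe to use from here on.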
*/ mutex_exit(&db->db_mtx); if (evict_dbuf) dnode_evict_bonus(dn); dnode_rele(dn, db); } else if (db->db_buf == NULL) { /* * This is a special case: we never associated this * dbuf with any data allocated from the ARC. */ ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); dbuf_destroy(db); } else if (arc_released(db->db_buf)) { /* * This dbuf has anonymous data associated with it. */ dbuf_destroy(db); } else { boolean_t do_arc_evict = B_FALSE; blkptr_t bp; spa_t *spa = dmu_objset_spa(db->db_objset); if (!DBUF_IS_CACHEABLE(db) && db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr) && !BP_IS_EMBEDDED(db->db_blkptr)) { do_arc_evict = B_TRUE; bp = *db->db_blkptr; } if (!DBUF_IS_CACHEABLE(db) || db->db_pending_evict) { dbuf_destroy(db); } else if (!multilist_link_active(&db->db_cache_link)) { multilist_insert(&dbuf_cache, db); (void) refcount_add_many(&dbuf_cache_size, db->db.db_size, db); mutex_exit(&db->db_mtx); dbuf_evict_notify(); } if (do_arc_evict) arc_freed(spa, &bp); } } else { mutex_exit(&db->db_mtx); } } #pragma weak dmu_buf_refcount = dbuf_refcount uint64_t dbuf_refcount(dmu_buf_impl_t *db) { return (refcount_count(&db->db_holds)); } void * dmu_buf_replace_user(dmu_buf_t *db_fake, dmu_buf_user_t *old_user, dmu_buf_user_t *new_user) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; mutex_enter(&db->db_mtx); dbuf_verify_user(db, DBVU_NOT_EVICTING); if (db->db_user == old_user) db->db_user = new_user; else old_user = db->db_user; dbuf_verify_user(db, DBVU_NOT_EVICTING); mutex_exit(&db->db_mtx); return (old_user); } void * dmu_buf_set_user(dmu_buf_t *db_fake, dmu_buf_user_t *user) { return (dmu_buf_replace_user(db_fake, NULL, user)); } void * dmu_buf_set_user_ie(dmu_buf_t *db_fake, dmu_buf_user_t *user) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; db->db_user_immediate_evict = TRUE; return (dmu_buf_set_user(db_fake, user)); } void * dmu_buf_remove_user(dmu_buf_t *db_fake, dmu_buf_user_t *user) { return (dmu_buf_replace_user(db_fake, user, NULL)); } void * dmu_buf_get_user(dmu_buf_t *db_fake) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dbuf_verify_user(db, DBVU_NOT_EVICTING); return (db->db_user); } void dmu_buf_user_evict_wait() { taskq_wait(dbu_evict_taskq); } boolean_t dmu_buf_freeable(dmu_buf_t *dbuf) { boolean_t res = B_FALSE; dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf; if (db->db_blkptr) res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset, db->db_blkptr, db->db_blkptr->blk_birth); return (res); } blkptr_t * dmu_buf_get_blkptr(dmu_buf_t *db) { dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; return (dbi->db_blkptr); } objset_t * dmu_buf_get_objset(dmu_buf_t *db) { dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; return (dbi->db_objset); } dnode_t * dmu_buf_dnode_enter(dmu_buf_t *db) { dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; DB_DNODE_ENTER(dbi); return (DB_DNODE(dbi)); } void dmu_buf_dnode_exit(dmu_buf_t *db) { dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; DB_DNODE_EXIT(dbi); } static void dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db) { /* ASSERT(dmu_tx_is_syncing(tx) */ ASSERT(MUTEX_HELD(&db->db_mtx)); if (db->db_blkptr != NULL) return; if (db->db_blkid == DMU_SPILL_BLKID) { db->db_blkptr = DN_SPILL_BLKPTR(dn->dn_phys); BP_ZERO(db->db_blkptr); return; } if (db->db_level == dn->dn_phys->dn_nlevels-1) { /* * This buffer was allocated at a time when there was * no available blkptrs from the dnode, or it was * inappropriate to hook it in (i.e., nlevels mis-match). 
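* Hook it up now; at this level the blkptr lives in the dnode's own * dn_blkptr[] array (deeper levels get theirs from the parent indirect).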
*/ ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr); ASSERT(db->db_parent == NULL); db->db_parent = dn->dn_dbuf; db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid]; DBUF_VERIFY(db); } else { dmu_buf_impl_t *parent = db->db_parent; int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; ASSERT(dn->dn_phys->dn_nlevels > 1); if (parent == NULL) { mutex_exit(&db->db_mtx); rw_enter(&dn->dn_struct_rwlock, RW_READER); parent = dbuf_hold_level(dn, db->db_level + 1, db->db_blkid >> epbs, db); rw_exit(&dn->dn_struct_rwlock); mutex_enter(&db->db_mtx); db->db_parent = parent; } db->db_blkptr = (blkptr_t *)parent->db.db_data + (db->db_blkid & ((1ULL << epbs) - 1)); DBUF_VERIFY(db); } } /* * dbuf_sync_indirect() is called recursively from dbuf_sync_list() so it * is critical that we not allow the compiler to inline this function into * dbuf_sync_list() thereby drastically bloating the stack usage. */ noinline static void dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) { dmu_buf_impl_t *db = dr->dr_dbuf; dnode_t *dn; zio_t *zio; ASSERT(dmu_tx_is_syncing(tx)); dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); mutex_enter(&db->db_mtx); ASSERT(db->db_level > 0); DBUF_VERIFY(db); /* Read the block if it hasn't been read yet. */ if (db->db_buf == NULL) { mutex_exit(&db->db_mtx); (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED); mutex_enter(&db->db_mtx); } ASSERT3U(db->db_state, ==, DB_CACHED); ASSERT(db->db_buf != NULL); DB_DNODE_ENTER(db); dn = DB_DNODE(db); /* Indirect block size must match what the dnode thinks it is. */ ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); dbuf_check_blkptr(dn, db); DB_DNODE_EXIT(db); /* Provide the pending dirty record to child dbufs */ db->db_data_pending = dr; mutex_exit(&db->db_mtx); dbuf_write(dr, db->db_buf, tx); zio = dr->dr_zio; mutex_enter(&dr->dt.di.dr_mtx); dbuf_sync_list(&dr->dt.di.dr_children, db->db_level - 1, tx); ASSERT(list_head(&dr->dt.di.dr_children) == NULL); mutex_exit(&dr->dt.di.dr_mtx); zio_nowait(zio); } /* * dbuf_sync_leaf() is called recursively from dbuf_sync_list() so it is * critical that we not allow the compiler to inline this function into * dbuf_sync_list() thereby drastically bloating the stack usage. */ noinline static void dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) { arc_buf_t **datap = &dr->dt.dl.dr_data; dmu_buf_impl_t *db = dr->dr_dbuf; dnode_t *dn; objset_t *os; uint64_t txg = tx->tx_txg; ASSERT(dmu_tx_is_syncing(tx)); dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); mutex_enter(&db->db_mtx); /* * To be synced, we must be dirtied. But we * might have been freed after the dirty. */ if (db->db_state == DB_UNCACHED) { /* This buffer has been freed since it was dirtied */ ASSERT(db->db.db_data == NULL); } else if (db->db_state == DB_FILL) { /* This buffer was freed and is now being re-filled */ ASSERT(db->db.db_data != dr->dt.dl.dr_data); } else { ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL); } DBUF_VERIFY(db); DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (db->db_blkid == DMU_SPILL_BLKID) { mutex_enter(&dn->dn_mtx); if (!(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) { /* * In the previous transaction group, the bonus buffer * was entirely used to store the attributes for the * dnode which overrode the dn_spill field. However, * when adding more attributes to the file a spill * block was required to hold the extra attributes. * * Make sure to clear the garbage left in the dn_spill * field from the previous attributes in the bonus * buffer. 
Otherwise, after writing out the spill * block to the newly allocated dva, it will free * the old block pointed to by the invalid dn_spill. */ db->db_blkptr = NULL; } dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR; mutex_exit(&dn->dn_mtx); } /* * If this is a bonus buffer, simply copy the bonus data into the * dnode. It will be written out when the dnode is synced (and it * will be synced, since it must have been dirty for dbuf_sync to * be called). */ if (db->db_blkid == DMU_BONUS_BLKID) { dbuf_dirty_record_t **drp; ASSERT(*datap != NULL); ASSERT0(db->db_level); ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_SLOTS_TO_BONUSLEN(dn->dn_phys->dn_extra_slots + 1)); bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen); DB_DNODE_EXIT(db); if (*datap != db->db.db_data) { int slots = DB_DNODE(db)->dn_num_slots; int bonuslen = DN_SLOTS_TO_BONUSLEN(slots); kmem_free(*datap, bonuslen); arc_space_return(bonuslen, ARC_SPACE_BONUS); } db->db_data_pending = NULL; drp = &db->db_last_dirty; while (*drp != dr) drp = &(*drp)->dr_next; ASSERT(dr->dr_next == NULL); ASSERT(dr->dr_dbuf == db); *drp = dr->dr_next; if (dr->dr_dbuf->db_level != 0) { mutex_destroy(&dr->dt.di.dr_mtx); list_destroy(&dr->dt.di.dr_children); } kmem_free(dr, sizeof (dbuf_dirty_record_t)); ASSERT(db->db_dirtycnt > 0); db->db_dirtycnt -= 1; dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg); return; } os = dn->dn_objset; /* * This function may have dropped the db_mtx lock allowing a dmu_sync * operation to sneak in. As a result, we need to ensure that we * don't check the dr_override_state until we have returned from * dbuf_check_blkptr. */ dbuf_check_blkptr(dn, db); /* * If this buffer is in the middle of an immediate write, * wait for the synchronous IO to complete. */ while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) { ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); cv_wait(&db->db_changed, &db->db_mtx); ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN); } if (db->db_state != DB_NOFILL && dn->dn_object != DMU_META_DNODE_OBJECT && refcount_count(&db->db_holds) > 1 && dr->dt.dl.dr_override_state != DR_OVERRIDDEN && *datap == db->db_buf) { /* * If this buffer is currently "in use" (i.e., there * are active holds and db_data still references it), * then make a copy before we start the write so that * any modifications from the open txg will not leak * into this write. * * NOTE: this copy does not need to be made for * objects only modified in the syncing context (e.g. * DNODE blocks). */ int psize = arc_buf_size(*datap); arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); enum zio_compress compress_type = arc_get_compression(*datap); if (compress_type == ZIO_COMPRESS_OFF) { *datap = arc_alloc_buf(os->os_spa, db, type, psize); } else { int lsize = arc_buf_lsize(*datap); ASSERT3U(type, ==, ARC_BUFC_DATA); *datap = arc_alloc_compressed_buf(os->os_spa, db, psize, lsize, compress_type); } bcopy(db->db.db_data, (*datap)->b_data, psize); } db->db_data_pending = dr; mutex_exit(&db->db_mtx); dbuf_write(dr, *datap, tx); ASSERT(!list_link_active(&dr->dr_dirty_node)); if (dn->dn_object == DMU_META_DNODE_OBJECT) { list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr); DB_DNODE_EXIT(db); } else { /* * Although zio_nowait() does not "wait for an IO", it does * initiate the IO. If this is an empty write it seems plausible * that the IO could actually be completed before the nowait * returns. We need to DB_DNODE_EXIT() first in case * zio_nowait() invalidates the dbuf. 
*/ DB_DNODE_EXIT(db); zio_nowait(dr->dr_zio); } } void dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx) { dbuf_dirty_record_t *dr; while ((dr = list_head(list))) { if (dr->dr_zio != NULL) { /* * If we find an already initialized zio then we * are processing the meta-dnode, and we have finished. * The dbufs for all dnodes are put back on the list * during processing, so that we can zio_wait() * these IOs after initiating all child IOs. */ ASSERT3U(dr->dr_dbuf->db.db_object, ==, DMU_META_DNODE_OBJECT); break; } if (dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID && dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) { VERIFY3U(dr->dr_dbuf->db_level, ==, level); } list_remove(list, dr); if (dr->dr_dbuf->db_level > 0) dbuf_sync_indirect(dr, tx); else dbuf_sync_leaf(dr, tx); } } /* ARGSUSED */ static void dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) { dmu_buf_impl_t *db = vdb; dnode_t *dn; blkptr_t *bp = zio->io_bp; blkptr_t *bp_orig = &zio->io_bp_orig; spa_t *spa = zio->io_spa; int64_t delta; uint64_t fill = 0; int i; ASSERT3P(db->db_blkptr, !=, NULL); ASSERT3P(&db->db_data_pending->dr_bp_copy, ==, bp); DB_DNODE_ENTER(db); dn = DB_DNODE(db); delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig); dnode_diduse_space(dn, delta - zio->io_prev_space_delta); zio->io_prev_space_delta = delta; if (bp->blk_birth != 0) { ASSERT((db->db_blkid != DMU_SPILL_BLKID && BP_GET_TYPE(bp) == dn->dn_type) || (db->db_blkid == DMU_SPILL_BLKID && BP_GET_TYPE(bp) == dn->dn_bonustype) || BP_IS_EMBEDDED(bp)); ASSERT(BP_GET_LEVEL(bp) == db->db_level); } mutex_enter(&db->db_mtx); #ifdef ZFS_DEBUG if (db->db_blkid == DMU_SPILL_BLKID) { ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); ASSERT(!(BP_IS_HOLE(bp)) && db->db_blkptr == DN_SPILL_BLKPTR(dn->dn_phys)); } #endif if (db->db_level == 0) { mutex_enter(&dn->dn_mtx); if (db->db_blkid > dn->dn_phys->dn_maxblkid && db->db_blkid != DMU_SPILL_BLKID) dn->dn_phys->dn_maxblkid = db->db_blkid; mutex_exit(&dn->dn_mtx); if (dn->dn_type == DMU_OT_DNODE) { i = 0; while (i < db->db.db_size) { dnode_phys_t *dnp = db->db.db_data + i; i += DNODE_MIN_SIZE; if (dnp->dn_type != DMU_OT_NONE) { fill++; i += dnp->dn_extra_slots * DNODE_MIN_SIZE; } } } else { if (BP_IS_HOLE(bp)) { fill = 0; } else { fill = 1; } } } else { blkptr_t *ibp = db->db.db_data; ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) { if (BP_IS_HOLE(ibp)) continue; fill += BP_GET_FILL(ibp); } } DB_DNODE_EXIT(db); if (!BP_IS_EMBEDDED(bp)) bp->blk_fill = fill; mutex_exit(&db->db_mtx); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); *db->db_blkptr = *bp; rw_exit(&dn->dn_struct_rwlock); } /* ARGSUSED */ /* * This function gets called just prior to running through the compression * stage of the zio pipeline. If we're an indirect block comprised of only * holes, then we want this indirect to be compressed away to a hole. In * order to do that we must zero out any information about the holes that * this indirect points to before we try to compress it. 
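* With its payload zeroed, an all-hole indirect block then compresses * away to a hole.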
*/ static void dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb) { dmu_buf_impl_t *db = vdb; dnode_t *dn; blkptr_t *bp; uint64_t i; int epbs; ASSERT3U(db->db_level, >, 0); DB_DNODE_ENTER(db); dn = DB_DNODE(db); epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; /* Determine if all our children are holes */ for (i = 0, bp = db->db.db_data; i < 1ULL << epbs; i++, bp++) { if (!BP_IS_HOLE(bp)) break; } /* * If all the children are holes, then zero them all out so that * we may get compressed away. */ if (i == 1ULL << epbs) { /* didn't find any non-holes */ bzero(db->db.db_data, db->db.db_size); } DB_DNODE_EXIT(db); } /* * The SPA will call this callback several times for each zio - once * for every physical child i/o (zio->io_phys_children times). This * allows the DMU to monitor the progress of each logical i/o. For example, * there may be 2 copies of an indirect block, or many fragments of a RAID-Z * block. There may be a long delay before all copies/fragments are completed, * so this callback allows us to retire dirty space gradually, as the physical * i/os complete. */ /* ARGSUSED */ static void dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg) { dmu_buf_impl_t *db = arg; objset_t *os = db->db_objset; dsl_pool_t *dp = dmu_objset_pool(os); dbuf_dirty_record_t *dr; int delta = 0; dr = db->db_data_pending; ASSERT3U(dr->dr_txg, ==, zio->io_txg); /* * The callback will be called io_phys_children times. Retire one * portion of our dirty space each time we are called. Any rounding * error will be cleaned up by dsl_pool_sync()'s call to * dsl_pool_undirty_space(). */ delta = dr->dr_accounted / zio->io_phys_children; dsl_pool_undirty_space(dp, delta, zio->io_txg); } /* ARGSUSED */ static void dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) { dmu_buf_impl_t *db = vdb; blkptr_t *bp_orig = &zio->io_bp_orig; blkptr_t *bp = db->db_blkptr; objset_t *os = db->db_objset; dmu_tx_t *tx = os->os_synctx; dbuf_dirty_record_t **drp, *dr; ASSERT0(zio->io_error); ASSERT(db->db_blkptr == bp); /* * For nopwrites and rewrites we ensure that the bp matches our * original and bypass all the accounting. 
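* No new block was born in either case, so there is nothing to kill or * to credit to the dataset.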
*/ if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) { ASSERT(BP_EQUAL(bp, bp_orig)); } else { dsl_dataset_t *ds = os->os_dsl_dataset; (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE); dsl_dataset_block_born(ds, bp, tx); } mutex_enter(&db->db_mtx); DBUF_VERIFY(db); drp = &db->db_last_dirty; while ((dr = *drp) != db->db_data_pending) drp = &dr->dr_next; ASSERT(!list_link_active(&dr->dr_dirty_node)); ASSERT(dr->dr_dbuf == db); ASSERT(dr->dr_next == NULL); *drp = dr->dr_next; #ifdef ZFS_DEBUG if (db->db_blkid == DMU_SPILL_BLKID) { dnode_t *dn; DB_DNODE_ENTER(db); dn = DB_DNODE(db); ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); ASSERT(!(BP_IS_HOLE(db->db_blkptr)) && db->db_blkptr == DN_SPILL_BLKPTR(dn->dn_phys)); DB_DNODE_EXIT(db); } #endif if (db->db_level == 0) { ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); if (db->db_state != DB_NOFILL) { if (dr->dt.dl.dr_data != db->db_buf) arc_buf_destroy(dr->dt.dl.dr_data, db); } } else { dnode_t *dn; DB_DNODE_ENTER(db); dn = DB_DNODE(db); ASSERT(list_head(&dr->dt.di.dr_children) == NULL); ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift); if (!BP_IS_HOLE(db->db_blkptr)) { ASSERTV(int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT); ASSERT3U(db->db_blkid, <=, dn->dn_phys->dn_maxblkid >> (db->db_level * epbs)); ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, db->db.db_size); } DB_DNODE_EXIT(db); mutex_destroy(&dr->dt.di.dr_mtx); list_destroy(&dr->dt.di.dr_children); } kmem_free(dr, sizeof (dbuf_dirty_record_t)); cv_broadcast(&db->db_changed); ASSERT(db->db_dirtycnt > 0); db->db_dirtycnt -= 1; db->db_data_pending = NULL; dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg); } static void dbuf_write_nofill_ready(zio_t *zio) { dbuf_write_ready(zio, NULL, zio->io_private); } static void dbuf_write_nofill_done(zio_t *zio) { dbuf_write_done(zio, NULL, zio->io_private); } static void dbuf_write_override_ready(zio_t *zio) { dbuf_dirty_record_t *dr = zio->io_private; dmu_buf_impl_t *db = dr->dr_dbuf; dbuf_write_ready(zio, NULL, db); } static void dbuf_write_override_done(zio_t *zio) { dbuf_dirty_record_t *dr = zio->io_private; dmu_buf_impl_t *db = dr->dr_dbuf; blkptr_t *obp = &dr->dt.dl.dr_overridden_by; mutex_enter(&db->db_mtx); if (!BP_EQUAL(zio->io_bp, obp)) { if (!BP_IS_HOLE(obp)) dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp); arc_release(dr->dt.dl.dr_data, db); } mutex_exit(&db->db_mtx); dbuf_write_done(zio, NULL, db); if (zio->io_abd != NULL) abd_put(zio->io_abd); } /* Issue I/O to commit a dirty buffer to disk. */ static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) { dmu_buf_impl_t *db = dr->dr_dbuf; dnode_t *dn; objset_t *os; dmu_buf_impl_t *parent = db->db_parent; uint64_t txg = tx->tx_txg; zbookmark_phys_t zb; zio_prop_t zp; zio_t *zio; int wp_flag = 0; ASSERT(dmu_tx_is_syncing(tx)); DB_DNODE_ENTER(db); dn = DB_DNODE(db); os = dn->dn_objset; if (db->db_state != DB_NOFILL) { if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) { /* * Private object buffers are released here rather * than in dbuf_dirty() since they are only modified * in the syncing context and we don't want the * overhead of making multiple copies of the data. */ if (BP_IS_HOLE(db->db_blkptr)) { arc_buf_thaw(data); } else { dbuf_release_bp(db); } } } if (parent != dn->dn_dbuf) { /* Our parent is an indirect block. */ /* We have a dirty parent that has been scheduled for write. 
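* Chain our zio under the parent's pending write zio below so the * parent I/O completes only after all of its children do.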
*/ ASSERT(parent && parent->db_data_pending); /* Our parent's buffer is one level closer to the dnode. */ ASSERT(db->db_level == parent->db_level-1); /* * We're about to modify our parent's db_data by modifying * our block pointer, so the parent must be released. */ ASSERT(arc_released(parent->db_buf)); zio = parent->db_data_pending->dr_zio; } else { /* Our parent is the dnode itself. */ ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 && db->db_blkid != DMU_SPILL_BLKID) || (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0)); if (db->db_blkid != DMU_SPILL_BLKID) ASSERT3P(db->db_blkptr, ==, &dn->dn_phys->dn_blkptr[db->db_blkid]); zio = dn->dn_zio; } ASSERT(db->db_level == 0 || data == db->db_buf); ASSERT3U(db->db_blkptr->blk_birth, <=, txg); ASSERT(zio); SET_BOOKMARK(&zb, os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : DMU_META_OBJSET, db->db.db_object, db->db_level, db->db_blkid); if (db->db_blkid == DMU_SPILL_BLKID) wp_flag = WP_SPILL; wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0; dmu_write_policy(os, dn, db->db_level, wp_flag, (data != NULL && arc_get_compression(data) != ZIO_COMPRESS_OFF) ? arc_get_compression(data) : ZIO_COMPRESS_INHERIT, &zp); DB_DNODE_EXIT(db); /* * We copy the blkptr now (rather than when we instantiate the dirty * record), because its value can change between open context and * syncing context. We do not need to hold dn_struct_rwlock to read * db_blkptr because we are in syncing context. */ dr->dr_bp_copy = *db->db_blkptr; if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { /* * The BP for this block has been provided by open context * (by dmu_sync() or dmu_buf_write_embedded()). */ abd_t *contents = (data != NULL) ? abd_get_from_buf(data->b_data, arc_buf_size(data)) : NULL; dr->dr_zio = zio_write(zio, os->os_spa, txg, &dr->dr_bp_copy, contents, db->db.db_size, db->db.db_size, &zp, dbuf_write_override_ready, NULL, NULL, dbuf_write_override_done, dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); mutex_enter(&db->db_mtx); dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by, dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite); mutex_exit(&db->db_mtx); } else if (db->db_state == DB_NOFILL) { ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF || zp.zp_checksum == ZIO_CHECKSUM_NOPARITY); dr->dr_zio = zio_write(zio, os->os_spa, txg, &dr->dr_bp_copy, NULL, db->db.db_size, db->db.db_size, &zp, dbuf_write_nofill_ready, NULL, NULL, dbuf_write_nofill_done, db, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb); } else { arc_done_func_t *children_ready_cb = NULL; ASSERT(arc_released(data)); /* * For indirect blocks, we want to setup the children * ready callback so that we can properly handle an indirect * block that only contains holes. 
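* (dbuf_write_children_ready() above zeroes the block when every child * bp is a hole, letting it compress away.)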
*/ if (db->db_level != 0) children_ready_cb = dbuf_write_children_ready; dr->dr_zio = arc_write(zio, os->os_spa, txg, &dr->dr_bp_copy, data, DBUF_IS_L2CACHEABLE(db), &zp, dbuf_write_ready, children_ready_cb, dbuf_write_physdone, dbuf_write_done, db, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); } } #if defined(_KERNEL) && defined(HAVE_SPL) EXPORT_SYMBOL(dbuf_find); EXPORT_SYMBOL(dbuf_is_metadata); EXPORT_SYMBOL(dbuf_destroy); EXPORT_SYMBOL(dbuf_loan_arcbuf); EXPORT_SYMBOL(dbuf_whichblock); EXPORT_SYMBOL(dbuf_read); EXPORT_SYMBOL(dbuf_unoverride); EXPORT_SYMBOL(dbuf_free_range); EXPORT_SYMBOL(dbuf_new_size); EXPORT_SYMBOL(dbuf_release_bp); EXPORT_SYMBOL(dbuf_dirty); EXPORT_SYMBOL(dmu_buf_will_dirty); EXPORT_SYMBOL(dmu_buf_will_not_fill); EXPORT_SYMBOL(dmu_buf_will_fill); EXPORT_SYMBOL(dmu_buf_fill_done); EXPORT_SYMBOL(dmu_buf_rele); EXPORT_SYMBOL(dbuf_assign_arcbuf); EXPORT_SYMBOL(dbuf_prefetch); EXPORT_SYMBOL(dbuf_hold_impl); EXPORT_SYMBOL(dbuf_hold); EXPORT_SYMBOL(dbuf_hold_level); EXPORT_SYMBOL(dbuf_create_bonus); EXPORT_SYMBOL(dbuf_spill_set_blksz); EXPORT_SYMBOL(dbuf_rm_spill); EXPORT_SYMBOL(dbuf_add_ref); EXPORT_SYMBOL(dbuf_rele); EXPORT_SYMBOL(dbuf_rele_and_unlock); EXPORT_SYMBOL(dbuf_refcount); EXPORT_SYMBOL(dbuf_sync_list); EXPORT_SYMBOL(dmu_buf_set_user); EXPORT_SYMBOL(dmu_buf_set_user_ie); EXPORT_SYMBOL(dmu_buf_get_user); EXPORT_SYMBOL(dmu_buf_freeable); EXPORT_SYMBOL(dmu_buf_get_blkptr); /* BEGIN CSTYLED */ module_param(dbuf_cache_max_bytes, ulong, 0644); MODULE_PARM_DESC(dbuf_cache_max_bytes, "Maximum size in bytes of the dbuf cache."); module_param(dbuf_cache_hiwater_pct, uint, 0644); MODULE_PARM_DESC(dbuf_cache_hiwater_pct, "Percentage over dbuf_cache_max_bytes when dbufs must be evicted " "directly."); module_param(dbuf_cache_lowater_pct, uint, 0644); MODULE_PARM_DESC(dbuf_cache_lowater_pct, "Percentage below dbuf_cache_max_bytes when the evict thread stops " "evicting dbufs."); module_param(dbuf_cache_max_shift, int, 0644); MODULE_PARM_DESC(dbuf_cache_max_shift, "Cap the size of the dbuf cache to a log2 fraction of arc size."); /* END CSTYLED */ #endif diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index fec02eb8b61d..30c3b2cd07f7 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -1,2172 +1,2172 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2016 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright (c) 2014, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2015 by Chunwei Chen. All rights reserved. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef _KERNEL #include #include #endif /* * Enable/disable nopwrite feature. */ int zfs_nopwrite_enabled = 1; const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { { DMU_BSWAP_UINT8, TRUE, "unallocated" }, { DMU_BSWAP_ZAP, TRUE, "object directory" }, { DMU_BSWAP_UINT64, TRUE, "object array" }, { DMU_BSWAP_UINT8, TRUE, "packed nvlist" }, { DMU_BSWAP_UINT64, TRUE, "packed nvlist size" }, { DMU_BSWAP_UINT64, TRUE, "bpobj" }, { DMU_BSWAP_UINT64, TRUE, "bpobj header" }, { DMU_BSWAP_UINT64, TRUE, "SPA space map header" }, { DMU_BSWAP_UINT64, TRUE, "SPA space map" }, { DMU_BSWAP_UINT64, TRUE, "ZIL intent log" }, { DMU_BSWAP_DNODE, TRUE, "DMU dnode" }, { DMU_BSWAP_OBJSET, TRUE, "DMU objset" }, { DMU_BSWAP_UINT64, TRUE, "DSL directory" }, { DMU_BSWAP_ZAP, TRUE, "DSL directory child map"}, { DMU_BSWAP_ZAP, TRUE, "DSL dataset snap map" }, { DMU_BSWAP_ZAP, TRUE, "DSL props" }, { DMU_BSWAP_UINT64, TRUE, "DSL dataset" }, { DMU_BSWAP_ZNODE, TRUE, "ZFS znode" }, { DMU_BSWAP_OLDACL, TRUE, "ZFS V0 ACL" }, { DMU_BSWAP_UINT8, FALSE, "ZFS plain file" }, { DMU_BSWAP_ZAP, TRUE, "ZFS directory" }, { DMU_BSWAP_ZAP, TRUE, "ZFS master node" }, { DMU_BSWAP_ZAP, TRUE, "ZFS delete queue" }, { DMU_BSWAP_UINT8, FALSE, "zvol object" }, { DMU_BSWAP_ZAP, TRUE, "zvol prop" }, { DMU_BSWAP_UINT8, FALSE, "other uint8[]" }, { DMU_BSWAP_UINT64, FALSE, "other uint64[]" }, { DMU_BSWAP_ZAP, TRUE, "other ZAP" }, { DMU_BSWAP_ZAP, TRUE, "persistent error log" }, { DMU_BSWAP_UINT8, TRUE, "SPA history" }, { DMU_BSWAP_UINT64, TRUE, "SPA history offsets" }, { DMU_BSWAP_ZAP, TRUE, "Pool properties" }, { DMU_BSWAP_ZAP, TRUE, "DSL permissions" }, { DMU_BSWAP_ACL, TRUE, "ZFS ACL" }, { DMU_BSWAP_UINT8, TRUE, "ZFS SYSACL" }, { DMU_BSWAP_UINT8, TRUE, "FUID table" }, { DMU_BSWAP_UINT64, TRUE, "FUID table size" }, { DMU_BSWAP_ZAP, TRUE, "DSL dataset next clones"}, { DMU_BSWAP_ZAP, TRUE, "scan work queue" }, { DMU_BSWAP_ZAP, TRUE, "ZFS user/group used" }, { DMU_BSWAP_ZAP, TRUE, "ZFS user/group quota" }, { DMU_BSWAP_ZAP, TRUE, "snapshot refcount tags"}, { DMU_BSWAP_ZAP, TRUE, "DDT ZAP algorithm" }, { DMU_BSWAP_ZAP, TRUE, "DDT statistics" }, { DMU_BSWAP_UINT8, TRUE, "System attributes" }, { DMU_BSWAP_ZAP, TRUE, "SA master node" }, { DMU_BSWAP_ZAP, TRUE, "SA attr registration" }, { DMU_BSWAP_ZAP, TRUE, "SA attr layouts" }, { DMU_BSWAP_ZAP, TRUE, "scan translations" }, { DMU_BSWAP_UINT8, FALSE, "deduplicated block" }, { DMU_BSWAP_ZAP, TRUE, "DSL deadlist map" }, { DMU_BSWAP_UINT64, TRUE, "DSL deadlist map hdr" }, { DMU_BSWAP_ZAP, TRUE, "DSL dir clones" }, { DMU_BSWAP_UINT64, TRUE, "bpobj subobj" } }; const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = { { byteswap_uint8_array, "uint8" }, { byteswap_uint16_array, "uint16" }, { byteswap_uint32_array, "uint32" }, { byteswap_uint64_array, "uint64" }, { zap_byteswap, "zap" }, { dnode_buf_byteswap, "dnode" }, { dmu_objset_byteswap, "objset" }, { zfs_znode_byteswap, "znode" }, { zfs_oldacl_byteswap, "oldacl" }, { zfs_acl_byteswap, "acl" } }; int dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset, void *tag, dmu_buf_t **dbp) { uint64_t blkid; dmu_buf_impl_t *db; blkid = dbuf_whichblock(dn, 0, offset); rw_enter(&dn->dn_struct_rwlock, RW_READER); db = dbuf_hold(dn, blkid, tag); rw_exit(&dn->dn_struct_rwlock); if (db == NULL) { *dbp = NULL; return (SET_ERROR(EIO)); } *dbp = &db->db; return (0); } 
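/*
 * Illustrative usage sketch (hypothetical caller, not part of this
 * source): a no-read hold pairs with an explicit dbuf_read() and a
 * release on the same tag, which is exactly the pattern the
 * dmu_buf_hold*() wrappers below implement:
 *
 *	dmu_buf_t *db;
 *	int err = dmu_buf_hold_noread_by_dnode(dn, offset, FTAG, &db);
 *	if (err == 0) {
 *		err = dbuf_read((dmu_buf_impl_t *)db, NULL, DB_RF_CANFAIL);
 *		dmu_buf_rele(db, FTAG);
 *	}
 */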
int dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset, void *tag, dmu_buf_t **dbp) { dnode_t *dn; uint64_t blkid; dmu_buf_impl_t *db; int err; err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); blkid = dbuf_whichblock(dn, 0, offset); rw_enter(&dn->dn_struct_rwlock, RW_READER); db = dbuf_hold(dn, blkid, tag); rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); if (db == NULL) { *dbp = NULL; return (SET_ERROR(EIO)); } *dbp = &db->db; return (err); } int dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset, void *tag, dmu_buf_t **dbp, int flags) { int err; int db_flags = DB_RF_CANFAIL; if (flags & DMU_READ_NO_PREFETCH) db_flags |= DB_RF_NOPREFETCH; err = dmu_buf_hold_noread_by_dnode(dn, offset, tag, dbp); if (err == 0) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp); err = dbuf_read(db, NULL, db_flags); if (err != 0) { dbuf_rele(db, tag); *dbp = NULL; } } return (err); } int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, void *tag, dmu_buf_t **dbp, int flags) { int err; int db_flags = DB_RF_CANFAIL; if (flags & DMU_READ_NO_PREFETCH) db_flags |= DB_RF_NOPREFETCH; err = dmu_buf_hold_noread(os, object, offset, tag, dbp); if (err == 0) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp); err = dbuf_read(db, NULL, db_flags); if (err != 0) { dbuf_rele(db, tag); *dbp = NULL; } } return (err); } int dmu_bonus_max(void) { return (DN_OLD_MAX_BONUSLEN); } int dmu_set_bonus(dmu_buf_t *db_fake, int newsize, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; int error; DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (dn->dn_bonus != db) { error = SET_ERROR(EINVAL); } else if (newsize < 0 || newsize > db_fake->db_size) { error = SET_ERROR(EINVAL); } else { dnode_setbonuslen(dn, newsize, tx); error = 0; } DB_DNODE_EXIT(db); return (error); } int dmu_set_bonustype(dmu_buf_t *db_fake, dmu_object_type_t type, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; int error; DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (!DMU_OT_IS_VALID(type)) { error = SET_ERROR(EINVAL); } else if (dn->dn_bonus != db) { error = SET_ERROR(EINVAL); } else { dnode_setbonus_type(dn, type, tx); error = 0; } DB_DNODE_EXIT(db); return (error); } dmu_object_type_t dmu_get_bonustype(dmu_buf_t *db_fake) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; dmu_object_type_t type; DB_DNODE_ENTER(db); dn = DB_DNODE(db); type = dn->dn_bonustype; DB_DNODE_EXIT(db); return (type); } int dmu_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx) { dnode_t *dn; int error; error = dnode_hold(os, object, FTAG, &dn); if (error != 0) return (error); dbuf_rm_spill(dn, tx); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); dnode_rm_spill(dn, tx); rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); return (error); } /* * returns ENOENT, EIO, or 0. */ int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp) { dnode_t *dn; dmu_buf_impl_t *db; int error; error = dnode_hold(os, object, FTAG, &dn); if (error) return (error); rw_enter(&dn->dn_struct_rwlock, RW_READER); if (dn->dn_bonus == NULL) { rw_exit(&dn->dn_struct_rwlock); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); if (dn->dn_bonus == NULL) dbuf_create_bonus(dn); } db = dn->dn_bonus; /* as long as the bonus buf is held, the dnode will be held */ if (refcount_add(&db->db_holds, tag) == 1) { VERIFY(dnode_add_ref(dn, db)); atomic_inc_32(&dn->dn_dbufs_count); } /* * Wait to drop dn_struct_rwlock until after adding the bonus dbuf's * hold and incrementing the dbuf count to ensure that dnode_move() sees * a dnode hold for every dbuf.
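 *
 * (The pairing above: the first bonus hold, i.e.
 * refcount_add(&db->db_holds, tag) == 1, takes a matching dnode hold
 * via dnode_add_ref(), so every bonus dbuf hold is backed by a dnode
 * hold before dn_struct_rwlock is dropped.)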
*/ rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH)); *dbp = &db->db; return (0); } /* * returns ENOENT, EIO, or 0. * * This interface will allocate a blank spill dbuf when a spill blk * doesn't already exist on the dnode. * * If you only want to find an already existing spill db, then * dmu_spill_hold_existing() should be used. */ int dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, void *tag, dmu_buf_t **dbp) { dmu_buf_impl_t *db = NULL; int err; if ((flags & DB_RF_HAVESTRUCT) == 0) rw_enter(&dn->dn_struct_rwlock, RW_READER); db = dbuf_hold(dn, DMU_SPILL_BLKID, tag); if ((flags & DB_RF_HAVESTRUCT) == 0) rw_exit(&dn->dn_struct_rwlock); if (db == NULL) { *dbp = NULL; return (SET_ERROR(EIO)); } err = dbuf_read(db, NULL, flags); if (err == 0) *dbp = &db->db; else { dbuf_rele(db, tag); *dbp = NULL; } return (err); } int dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus; dnode_t *dn; int err; DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_SA) { err = SET_ERROR(EINVAL); } else { rw_enter(&dn->dn_struct_rwlock, RW_READER); if (!dn->dn_have_spill) { err = SET_ERROR(ENOENT); } else { err = dmu_spill_hold_by_dnode(dn, DB_RF_HAVESTRUCT | DB_RF_CANFAIL, tag, dbp); } rw_exit(&dn->dn_struct_rwlock); } DB_DNODE_EXIT(db); return (err); } int dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus; dnode_t *dn; int err; DB_DNODE_ENTER(db); dn = DB_DNODE(db); err = dmu_spill_hold_by_dnode(dn, DB_RF_CANFAIL, tag, dbp); DB_DNODE_EXIT(db); return (err); } /* * Note: longer-term, we should modify all of the dmu_buf_*() interfaces * to take a held dnode rather than <os, object> -- the lookup is wasteful, * and can induce severe lock contention when writing to several files * whose dnodes are in the same block. */ static int dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags) { dmu_buf_t **dbp; uint64_t blkid, nblks, i; uint32_t dbuf_flags; int err; zio_t *zio; ASSERT(length <= DMU_MAX_ACCESS); /* * Note: We directly notify the prefetch code of this read, so that * we can tell it about the multi-block read. dbuf_read() only knows * about the one block it is accessing.
*/ dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH; rw_enter(&dn->dn_struct_rwlock, RW_READER); if (dn->dn_datablkshift) { int blkshift = dn->dn_datablkshift; nblks = (P2ROUNDUP(offset + length, 1ULL << blkshift) - P2ALIGN(offset, 1ULL << blkshift)) >> blkshift; } else { if (offset + length > dn->dn_datablksz) { zfs_panic_recover("zfs: accessing past end of object " "%llx/%llx (size=%u access=%llu+%llu)", (longlong_t)dn->dn_objset-> os_dsl_dataset->ds_object, (longlong_t)dn->dn_object, dn->dn_datablksz, (longlong_t)offset, (longlong_t)length); rw_exit(&dn->dn_struct_rwlock); return (SET_ERROR(EIO)); } nblks = 1; } dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP); zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); blkid = dbuf_whichblock(dn, 0, offset); for (i = 0; i < nblks; i++) { dmu_buf_impl_t *db = dbuf_hold(dn, blkid + i, tag); if (db == NULL) { rw_exit(&dn->dn_struct_rwlock); dmu_buf_rele_array(dbp, nblks, tag); zio_nowait(zio); return (SET_ERROR(EIO)); } /* initiate async i/o */ if (read) (void) dbuf_read(db, zio, dbuf_flags); dbp[i] = &db->db; } if ((flags & DMU_READ_NO_PREFETCH) == 0 && DNODE_META_IS_CACHEABLE(dn) && length <= zfetch_array_rd_sz) { dmu_zfetch(&dn->dn_zfetch, blkid, nblks, read && DNODE_IS_CACHEABLE(dn)); } rw_exit(&dn->dn_struct_rwlock); /* wait for async i/o */ err = zio_wait(zio); if (err) { dmu_buf_rele_array(dbp, nblks, tag); return (err); } /* wait for other io to complete */ if (read) { for (i = 0; i < nblks; i++) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i]; mutex_enter(&db->db_mtx); while (db->db_state == DB_READ || db->db_state == DB_FILL) cv_wait(&db->db_changed, &db->db_mtx); if (db->db_state == DB_UNCACHED) err = SET_ERROR(EIO); mutex_exit(&db->db_mtx); if (err) { dmu_buf_rele_array(dbp, nblks, tag); return (err); } } } *numbufsp = nblks; *dbpp = dbp; return (0); } static int dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) { dnode_t *dn; int err; err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, numbufsp, dbpp, DMU_READ_PREFETCH); dnode_rele(dn, FTAG); return (err); } int dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset, uint64_t length, boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; int err; DB_DNODE_ENTER(db); dn = DB_DNODE(db); err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, numbufsp, dbpp, DMU_READ_PREFETCH); DB_DNODE_EXIT(db); return (err); } void dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag) { int i; dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake; if (numbufs == 0) return; for (i = 0; i < numbufs; i++) { if (dbp[i]) dbuf_rele(dbp[i], tag); } kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs); } /* * Issue prefetch i/os for the given blocks. If level is greater than 0, the * indirect blocks prefetched will be those that point to the blocks containing * the data starting at offset, and continuing to offset + len. * * Note that if the indirect blocks above the blocks being prefetched are not in * cache, they will be asynchronously read in.
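 *
 * Illustrative usage sketch (hypothetical caller): prefetch the
 * level-0 blocks backing the first 1MB of an object at asynchronous
 * read priority; the call queues the i/os and returns without
 * waiting for them:
 *
 *	dmu_prefetch(os, object, 0, 0, 1ULL << 20, ZIO_PRIORITY_ASYNC_READ);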
*/ void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, uint64_t len, zio_priority_t pri) { dnode_t *dn; uint64_t blkid; int nblks, err; if (len == 0) { /* they're interested in the bonus buffer */ dn = DMU_META_DNODE(os); if (object == 0 || object >= DN_MAX_OBJECT) return; rw_enter(&dn->dn_struct_rwlock, RW_READER); blkid = dbuf_whichblock(dn, level, object * sizeof (dnode_phys_t)); dbuf_prefetch(dn, level, blkid, pri, 0); rw_exit(&dn->dn_struct_rwlock); return; } /* * XXX - Note, if the dnode for the requested object is not * already cached, we will do a *synchronous* read in the * dnode_hold() call. The same is true for any indirects. */ err = dnode_hold(os, object, FTAG, &dn); if (err != 0) return; rw_enter(&dn->dn_struct_rwlock, RW_READER); /* * offset + len - 1 is the last byte we want to prefetch for, and offset * is the first. Then dbuf_whichblock(dn, level, offset + len - 1) is the * last block we want to prefetch, and dbuf_whichblock(dn, level, * offset) is the first. Then the number we need to prefetch is the * last - first + 1. */ if (level > 0 || dn->dn_datablkshift != 0) { nblks = dbuf_whichblock(dn, level, offset + len - 1) - dbuf_whichblock(dn, level, offset) + 1; } else { nblks = (offset < dn->dn_datablksz); } if (nblks != 0) { int i; blkid = dbuf_whichblock(dn, level, offset); for (i = 0; i < nblks; i++) dbuf_prefetch(dn, level, blkid + i, pri, 0); } rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); } /* * Get the next "chunk" of file data to free. We traverse the file from * the end so that the file gets shorter over time (if we crash in the * middle, this will leave us in a better state). We find allocated file * data by simply searching the allocated level 1 indirects. * * On input, *start should be the first offset that does not need to be * freed (e.g. "offset + length"). On return, *start will be the first * offset that should be freed. */ static int get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum) { uint64_t maxblks = DMU_MAX_ACCESS >> (dn->dn_indblkshift + 1); /* bytes of data covered by a level-1 indirect block */ uint64_t iblkrange = dn->dn_datablksz * EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT); uint64_t blks; ASSERT3U(minimum, <=, *start); if (*start - minimum <= iblkrange * maxblks) { *start = minimum; return (0); } ASSERT(ISP2(iblkrange)); for (blks = 0; *start > minimum && blks < maxblks; blks++) { int err; /* * dnode_next_offset(BACKWARDS) will find an allocated L1 * indirect block at or before the input offset. We must * decrement *start so that it is at the end of the region * to search.
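 *
 * (Worked example with illustrative values: if *start == 0x4000000 on
 * entry, the region that may still need freeing ends at 0x3ffffff, so
 * decrementing *start positions the backwards search on that last
 * byte rather than on the boundary itself.)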
*/ (*start)--; err = dnode_next_offset(dn, DNODE_FIND_BACKWARDS, start, 2, 1, 0); /* if there are no indirect blocks before start, we are done */ if (err == ESRCH) { *start = minimum; break; } else if (err != 0) { return (err); } /* set start to the beginning of this L1 indirect */ *start = P2ALIGN(*start, iblkrange); } if (*start < minimum) *start = minimum; return (0); } static int dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset, uint64_t length) { uint64_t object_size; int err; if (dn == NULL) return (SET_ERROR(EINVAL)); object_size = (dn->dn_maxblkid + 1) * dn->dn_datablksz; if (offset >= object_size) return (0); if (length == DMU_OBJECT_END || offset + length > object_size) length = object_size - offset; while (length != 0) { uint64_t chunk_end, chunk_begin; dmu_tx_t *tx; chunk_end = chunk_begin = offset + length; /* move chunk_begin backwards to the beginning of this chunk */ err = get_next_chunk(dn, &chunk_begin, offset); if (err) return (err); ASSERT3U(chunk_begin, >=, offset); ASSERT3U(chunk_begin, <=, chunk_end); tx = dmu_tx_create(os); dmu_tx_hold_free(tx, dn->dn_object, chunk_begin, chunk_end - chunk_begin); /* * Mark this transaction as typically resulting in a net * reduction in space used. */ dmu_tx_mark_netfree(tx); err = dmu_tx_assign(tx, TXG_WAIT); if (err) { dmu_tx_abort(tx); return (err); } dnode_free_range(dn, chunk_begin, chunk_end - chunk_begin, tx); dmu_tx_commit(tx); length -= chunk_end - chunk_begin; } return (0); } int dmu_free_long_range(objset_t *os, uint64_t object, uint64_t offset, uint64_t length) { dnode_t *dn; int err; err = dnode_hold(os, object, FTAG, &dn); if (err != 0) return (err); err = dmu_free_long_range_impl(os, dn, offset, length); /* * It is important to zero out the maxblkid when freeing the entire * file, so that (a) subsequent calls to dmu_free_long_range_impl() * will take the fast path, and (b) dnode_reallocate() can verify * that the entire file has been freed. */ if (err == 0 && offset == 0 && length == DMU_OBJECT_END) dn->dn_maxblkid = 0; dnode_rele(dn, FTAG); return (err); } int dmu_free_long_object(objset_t *os, uint64_t object) { dmu_tx_t *tx; int err; err = dmu_free_long_range(os, object, 0, DMU_OBJECT_END); if (err != 0) return (err); tx = dmu_tx_create(os); dmu_tx_hold_bonus(tx, object); dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END); dmu_tx_mark_netfree(tx); err = dmu_tx_assign(tx, TXG_WAIT); if (err == 0) { err = dmu_object_free(os, object, tx); dmu_tx_commit(tx); } else { dmu_tx_abort(tx); } return (err); } int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx) { dnode_t *dn; int err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); ASSERT(offset < UINT64_MAX); ASSERT(size == -1ULL || size <= UINT64_MAX - offset); dnode_free_range(dn, offset, size, tx); dnode_rele(dn, FTAG); return (0); } static int dmu_read_impl(dnode_t *dn, uint64_t offset, uint64_t size, void *buf, uint32_t flags) { dmu_buf_t **dbp; int numbufs, err = 0; /* * Deal with odd block sizes, where there can't be data past the first * block. If we ever do the tail block optimization, we will need to * handle that here as well. */ if (dn->dn_maxblkid == 0) { uint64_t newsz = offset > dn->dn_datablksz ? 0 : MIN(size, dn->dn_datablksz - offset); bzero((char *)buf + newsz, size - newsz); size = newsz; } while (size > 0) { uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2); int i; /* * NB: we could do this block-at-a-time, but it's nice * to be reading in parallel. 
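 *
 * (Each pass of the loop holds at most DMU_MAX_ACCESS / 2 bytes worth
 * of dbufs at once, per the MIN() above, so the reads within a chunk
 * are issued in parallel while the dbuf array stays bounded.)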
*/ err = dmu_buf_hold_array_by_dnode(dn, offset, mylen, TRUE, FTAG, &numbufs, &dbp, flags); if (err) break; for (i = 0; i < numbufs; i++) { uint64_t tocpy; int64_t bufoff; dmu_buf_t *db = dbp[i]; ASSERT(size > 0); bufoff = offset - db->db_offset; tocpy = MIN(db->db_size - bufoff, size); (void) memcpy(buf, (char *)db->db_data + bufoff, tocpy); offset += tocpy; size -= tocpy; buf = (char *)buf + tocpy; } dmu_buf_rele_array(dbp, numbufs, FTAG); } return (err); } int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, void *buf, uint32_t flags) { dnode_t *dn; int err; err = dnode_hold(os, object, FTAG, &dn); if (err != 0) return (err); err = dmu_read_impl(dn, offset, size, buf, flags); dnode_rele(dn, FTAG); return (err); } int dmu_read_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void *buf, uint32_t flags) { return (dmu_read_impl(dn, offset, size, buf, flags)); } static void dmu_write_impl(dmu_buf_t **dbp, int numbufs, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx) { int i; for (i = 0; i < numbufs; i++) { uint64_t tocpy; int64_t bufoff; dmu_buf_t *db = dbp[i]; ASSERT(size > 0); bufoff = offset - db->db_offset; tocpy = MIN(db->db_size - bufoff, size); ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); if (tocpy == db->db_size) dmu_buf_will_fill(db, tx); else dmu_buf_will_dirty(db, tx); (void) memcpy((char *)db->db_data + bufoff, buf, tocpy); if (tocpy == db->db_size) dmu_buf_fill_done(db, tx); offset += tocpy; size -= tocpy; buf = (char *)buf + tocpy; } } void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx) { dmu_buf_t **dbp; int numbufs; if (size == 0) return; VERIFY0(dmu_buf_hold_array(os, object, offset, size, FALSE, FTAG, &numbufs, &dbp)); dmu_write_impl(dbp, numbufs, offset, size, buf, tx); dmu_buf_rele_array(dbp, numbufs, FTAG); } void dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx) { dmu_buf_t **dbp; int numbufs; if (size == 0) return; VERIFY0(dmu_buf_hold_array_by_dnode(dn, offset, size, FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH)); dmu_write_impl(dbp, numbufs, offset, size, buf, tx); dmu_buf_rele_array(dbp, numbufs, FTAG); } void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx) { dmu_buf_t **dbp; int numbufs, i; if (size == 0) return; VERIFY(0 == dmu_buf_hold_array(os, object, offset, size, FALSE, FTAG, &numbufs, &dbp)); for (i = 0; i < numbufs; i++) { dmu_buf_t *db = dbp[i]; dmu_buf_will_not_fill(db, tx); } dmu_buf_rele_array(dbp, numbufs, FTAG); } void dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset, void *data, uint8_t etype, uint8_t comp, int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx) { dmu_buf_t *db; ASSERT3U(etype, <, NUM_BP_EMBEDDED_TYPES); ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS); VERIFY0(dmu_buf_hold_noread(os, object, offset, FTAG, &db)); dmu_buf_write_embedded(db, data, (bp_embedded_type_t)etype, (enum zio_compress)comp, uncompressed_size, compressed_size, byteorder, tx); dmu_buf_rele(db, FTAG); } /* * DMU support for xuio */ kstat_t *xuio_ksp = NULL; typedef struct xuio_stats { /* loaned yet not returned arc_buf */ kstat_named_t xuiostat_onloan_rbuf; kstat_named_t xuiostat_onloan_wbuf; /* whether a copy is made when loaning out a read buffer */ kstat_named_t xuiostat_rbuf_copied; kstat_named_t xuiostat_rbuf_nocopy; /* whether a copy is made when assigning a write buffer */ kstat_named_t xuiostat_wbuf_copied; kstat_named_t 
xuiostat_wbuf_nocopy; } xuio_stats_t; static xuio_stats_t xuio_stats = { { "onloan_read_buf", KSTAT_DATA_UINT64 }, { "onloan_write_buf", KSTAT_DATA_UINT64 }, { "read_buf_copied", KSTAT_DATA_UINT64 }, { "read_buf_nocopy", KSTAT_DATA_UINT64 }, { "write_buf_copied", KSTAT_DATA_UINT64 }, { "write_buf_nocopy", KSTAT_DATA_UINT64 } }; #define XUIOSTAT_INCR(stat, val) \ atomic_add_64(&xuio_stats.stat.value.ui64, (val)) #define XUIOSTAT_BUMP(stat) XUIOSTAT_INCR(stat, 1) #ifdef HAVE_UIO_ZEROCOPY int dmu_xuio_init(xuio_t *xuio, int nblk) { dmu_xuio_t *priv; uio_t *uio = &xuio->xu_uio; uio->uio_iovcnt = nblk; uio->uio_iov = kmem_zalloc(nblk * sizeof (iovec_t), KM_SLEEP); priv = kmem_zalloc(sizeof (dmu_xuio_t), KM_SLEEP); priv->cnt = nblk; priv->bufs = kmem_zalloc(nblk * sizeof (arc_buf_t *), KM_SLEEP); priv->iovp = (iovec_t *)uio->uio_iov; XUIO_XUZC_PRIV(xuio) = priv; if (XUIO_XUZC_RW(xuio) == UIO_READ) XUIOSTAT_INCR(xuiostat_onloan_rbuf, nblk); else XUIOSTAT_INCR(xuiostat_onloan_wbuf, nblk); return (0); } void dmu_xuio_fini(xuio_t *xuio) { dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); int nblk = priv->cnt; kmem_free(priv->iovp, nblk * sizeof (iovec_t)); kmem_free(priv->bufs, nblk * sizeof (arc_buf_t *)); kmem_free(priv, sizeof (dmu_xuio_t)); if (XUIO_XUZC_RW(xuio) == UIO_READ) XUIOSTAT_INCR(xuiostat_onloan_rbuf, -nblk); else XUIOSTAT_INCR(xuiostat_onloan_wbuf, -nblk); } /* * Initialize iov[priv->next] and priv->bufs[priv->next] with { off, n, abuf } * and increase priv->next by 1. */ int dmu_xuio_add(xuio_t *xuio, arc_buf_t *abuf, offset_t off, size_t n) { struct iovec *iov; uio_t *uio = &xuio->xu_uio; dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); int i = priv->next++; ASSERT(i < priv->cnt); ASSERT(off + n <= arc_buf_lsize(abuf)); iov = (iovec_t *)uio->uio_iov + i; iov->iov_base = (char *)abuf->b_data + off; iov->iov_len = n; priv->bufs[i] = abuf; return (0); } int dmu_xuio_cnt(xuio_t *xuio) { dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); return (priv->cnt); } arc_buf_t * dmu_xuio_arcbuf(xuio_t *xuio, int i) { dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); ASSERT(i < priv->cnt); return (priv->bufs[i]); } void dmu_xuio_clear(xuio_t *xuio, int i) { dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); ASSERT(i < priv->cnt); priv->bufs[i] = NULL; } #endif /* HAVE_UIO_ZEROCOPY */ static void xuio_stat_init(void) { xuio_ksp = kstat_create("zfs", 0, "xuio_stats", "misc", KSTAT_TYPE_NAMED, sizeof (xuio_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (xuio_ksp != NULL) { xuio_ksp->ks_data = &xuio_stats; kstat_install(xuio_ksp); } } static void xuio_stat_fini(void) { if (xuio_ksp != NULL) { kstat_delete(xuio_ksp); xuio_ksp = NULL; } } void xuio_stat_wbuf_copied(void) { XUIOSTAT_BUMP(xuiostat_wbuf_copied); } void xuio_stat_wbuf_nocopy(void) { XUIOSTAT_BUMP(xuiostat_wbuf_nocopy); } #ifdef _KERNEL static int dmu_read_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size) { dmu_buf_t **dbp; int numbufs, i, err; #ifdef HAVE_UIO_ZEROCOPY xuio_t *xuio = NULL; #endif /* * NB: we could do this block-at-a-time, but it's nice * to be reading in parallel. 
*/ err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size, TRUE, FTAG, &numbufs, &dbp, 0); if (err) return (err); for (i = 0; i < numbufs; i++) { uint64_t tocpy; int64_t bufoff; dmu_buf_t *db = dbp[i]; ASSERT(size > 0); bufoff = uio->uio_loffset - db->db_offset; tocpy = MIN(db->db_size - bufoff, size); #ifdef HAVE_UIO_ZEROCOPY if (xuio) { dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; arc_buf_t *dbuf_abuf = dbi->db_buf; arc_buf_t *abuf = dbuf_loan_arcbuf(dbi); err = dmu_xuio_add(xuio, abuf, bufoff, tocpy); if (!err) { uio->uio_resid -= tocpy; uio->uio_loffset += tocpy; } if (abuf == dbuf_abuf) XUIOSTAT_BUMP(xuiostat_rbuf_nocopy); else XUIOSTAT_BUMP(xuiostat_rbuf_copied); } else #endif err = uiomove((char *)db->db_data + bufoff, tocpy, UIO_READ, uio); if (err) break; size -= tocpy; } dmu_buf_rele_array(dbp, numbufs, FTAG); return (err); } /* * Read 'size' bytes into the uio buffer. * From object zdb->db_object. * Starting at offset uio->uio_loffset. * * If the caller already has a dbuf in the target object * (e.g. its bonus buffer), this routine is faster than dmu_read_uio(), * because we don't have to find the dnode_t for the object. */ int dmu_read_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb; dnode_t *dn; int err; if (size == 0) return (0); DB_DNODE_ENTER(db); dn = DB_DNODE(db); err = dmu_read_uio_dnode(dn, uio, size); DB_DNODE_EXIT(db); return (err); } /* * Read 'size' bytes into the uio buffer. * From the specified object. * Starting at offset uio->uio_loffset. */ int dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size) { dnode_t *dn; int err; if (size == 0) return (0); err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); err = dmu_read_uio_dnode(dn, uio, size); dnode_rele(dn, FTAG); return (err); } static int dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx) { dmu_buf_t **dbp; int numbufs; int err = 0; int i; err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size, FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH); if (err) return (err); for (i = 0; i < numbufs; i++) { uint64_t tocpy; int64_t bufoff; dmu_buf_t *db = dbp[i]; ASSERT(size > 0); bufoff = uio->uio_loffset - db->db_offset; tocpy = MIN(db->db_size - bufoff, size); ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); if (tocpy == db->db_size) dmu_buf_will_fill(db, tx); else dmu_buf_will_dirty(db, tx); /* * XXX uiomove could block forever (e.g. nfs-backed * pages). There needs to be a uiolockdown() function * to lock the pages in memory, so that uiomove won't * block. */ err = uiomove((char *)db->db_data + bufoff, tocpy, UIO_WRITE, uio); if (tocpy == db->db_size) dmu_buf_fill_done(db, tx); if (err) break; size -= tocpy; } dmu_buf_rele_array(dbp, numbufs, FTAG); return (err); } /* * Write 'size' bytes from the uio buffer. * To object zdb->db_object. * Starting at offset uio->uio_loffset. * * If the caller already has a dbuf in the target object * (e.g. its bonus buffer), this routine is faster than dmu_write_uio(), * because we don't have to find the dnode_t for the object. */ int dmu_write_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb; dnode_t *dn; int err; if (size == 0) return (0); DB_DNODE_ENTER(db); dn = DB_DNODE(db); err = dmu_write_uio_dnode(dn, uio, size, tx); DB_DNODE_EXIT(db); return (err); } /* * Write 'size' bytes from the uio buffer. * To the specified object. * Starting at offset uio->uio_loffset.
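 *
 * Illustrative usage sketch (hypothetical caller): the write must be
 * covered by an assigned transaction, e.g.:
 *
 *	dmu_tx_t *tx = dmu_tx_create(os);
 *	dmu_tx_hold_write(tx, object, uio->uio_loffset, size);
 *	error = dmu_tx_assign(tx, TXG_WAIT);
 *	if (error != 0) {
 *		dmu_tx_abort(tx);
 *		return (error);
 *	}
 *	error = dmu_write_uio(os, object, uio, size, tx);
 *	dmu_tx_commit(tx);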
*/ int dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size, dmu_tx_t *tx) { dnode_t *dn; int err; if (size == 0) return (0); err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); err = dmu_write_uio_dnode(dn, uio, size, tx); dnode_rele(dn, FTAG); return (err); } #endif /* _KERNEL */ /* * Allocate a loaned anonymous arc buffer. */ arc_buf_t * dmu_request_arcbuf(dmu_buf_t *handle, int size) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle; return (arc_loan_buf(db->db_objset->os_spa, B_FALSE, size)); } /* * Free a loaned arc buffer. */ void dmu_return_arcbuf(arc_buf_t *buf) { arc_return_buf(buf, FTAG); arc_buf_destroy(buf, FTAG); } /* * When possible directly assign passed loaned arc buffer to a dbuf. * If this is not possible copy the contents of passed arc buf via * dmu_write(). */ void dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf, dmu_tx_t *tx) { dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle; dnode_t *dn; dmu_buf_impl_t *db; uint32_t blksz = (uint32_t)arc_buf_lsize(buf); uint64_t blkid; DB_DNODE_ENTER(dbuf); dn = DB_DNODE(dbuf); rw_enter(&dn->dn_struct_rwlock, RW_READER); blkid = dbuf_whichblock(dn, 0, offset); VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL); rw_exit(&dn->dn_struct_rwlock); DB_DNODE_EXIT(dbuf); /* * We can only assign if the offset is aligned, the arc buf is the * same size as the dbuf, and the dbuf is not metadata. */ if (offset == db->db.db_offset && blksz == db->db.db_size) { dbuf_assign_arcbuf(db, buf, tx); dbuf_rele(db, FTAG); } else { objset_t *os; uint64_t object; /* compressed bufs must always be assignable to their dbuf */ ASSERT3U(arc_get_compression(buf), ==, ZIO_COMPRESS_OFF); ASSERT(!(buf->b_flags & ARC_BUF_FLAG_COMPRESSED)); DB_DNODE_ENTER(dbuf); dn = DB_DNODE(dbuf); os = dn->dn_objset; object = dn->dn_object; DB_DNODE_EXIT(dbuf); dbuf_rele(db, FTAG); dmu_write(os, object, offset, blksz, buf->b_data, tx); dmu_return_arcbuf(buf); XUIOSTAT_BUMP(xuiostat_wbuf_copied); } } typedef struct { dbuf_dirty_record_t *dsa_dr; dmu_sync_cb_t *dsa_done; zgd_t *dsa_zgd; dmu_tx_t *dsa_tx; } dmu_sync_arg_t; /* ARGSUSED */ static void dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg) { dmu_sync_arg_t *dsa = varg; dmu_buf_t *db = dsa->dsa_zgd->zgd_db; blkptr_t *bp = zio->io_bp; if (zio->io_error == 0) { if (BP_IS_HOLE(bp)) { /* * A block of zeros may compress to a hole, but the * block size still needs to be known for replay. 
*/ BP_SET_LSIZE(bp, db->db_size); } else if (!BP_IS_EMBEDDED(bp)) { ASSERT(BP_GET_LEVEL(bp) == 0); bp->blk_fill = 1; } } } static void dmu_sync_late_arrival_ready(zio_t *zio) { dmu_sync_ready(zio, NULL, zio->io_private); } /* ARGSUSED */ static void dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg) { dmu_sync_arg_t *dsa = varg; dbuf_dirty_record_t *dr = dsa->dsa_dr; dmu_buf_impl_t *db = dr->dr_dbuf; mutex_enter(&db->db_mtx); ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC); if (zio->io_error == 0) { dr->dt.dl.dr_nopwrite = !!(zio->io_flags & ZIO_FLAG_NOPWRITE); if (dr->dt.dl.dr_nopwrite) { ASSERTV(blkptr_t *bp = zio->io_bp); ASSERTV(blkptr_t *bp_orig = &zio->io_bp_orig); ASSERTV(uint8_t chksum = BP_GET_CHECKSUM(bp_orig)); ASSERT(BP_EQUAL(bp, bp_orig)); ASSERT(zio->io_prop.zp_compress != ZIO_COMPRESS_OFF); ASSERT(zio_checksum_table[chksum].ci_flags & ZCHECKSUM_FLAG_NOPWRITE); } dr->dt.dl.dr_overridden_by = *zio->io_bp; dr->dt.dl.dr_override_state = DR_OVERRIDDEN; dr->dt.dl.dr_copies = zio->io_prop.zp_copies; /* * Old style holes are filled with all zeros, whereas * new-style holes maintain their lsize, type, level, * and birth time (see zio_write_compress). While we * need to reset the BP_SET_LSIZE() call that happened * in dmu_sync_ready for old style holes, we do *not* * want to wipe out the information contained in new * style holes. Thus, only zero out the block pointer if * it's an old style hole. */ if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by) && dr->dt.dl.dr_overridden_by.blk_birth == 0) BP_ZERO(&dr->dt.dl.dr_overridden_by); } else { dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; } cv_broadcast(&db->db_changed); mutex_exit(&db->db_mtx); dsa->dsa_done(dsa->dsa_zgd, zio->io_error); kmem_free(dsa, sizeof (*dsa)); } static void dmu_sync_late_arrival_done(zio_t *zio) { blkptr_t *bp = zio->io_bp; dmu_sync_arg_t *dsa = zio->io_private; ASSERTV(blkptr_t *bp_orig = &zio->io_bp_orig); if (zio->io_error == 0 && !BP_IS_HOLE(bp)) { /* * If we didn't allocate a new block (i.e. ZIO_FLAG_NOPWRITE) * then there is nothing to do here. Otherwise, free the * newly allocated block in this txg. */ if (zio->io_flags & ZIO_FLAG_NOPWRITE) { ASSERT(BP_EQUAL(bp, bp_orig)); } else { ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig)); ASSERT(zio->io_bp->blk_birth == zio->io_txg); ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa)); zio_free(zio->io_spa, zio->io_txg, zio->io_bp); } } dmu_tx_commit(dsa->dsa_tx); dsa->dsa_done(dsa->dsa_zgd, zio->io_error); abd_put(zio->io_abd); kmem_free(dsa, sizeof (*dsa)); } static int dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd, zio_prop_t *zp, zbookmark_phys_t *zb) { dmu_sync_arg_t *dsa; dmu_tx_t *tx; tx = dmu_tx_create(os); dmu_tx_hold_space(tx, zgd->zgd_db->db_size); if (dmu_tx_assign(tx, TXG_WAIT) != 0) { dmu_tx_abort(tx); /* Make zl_get_data do txg_wait_synced() */ return (SET_ERROR(EIO)); } dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP); dsa->dsa_dr = NULL; dsa->dsa_done = done; dsa->dsa_zgd = zgd; dsa->dsa_tx = tx; zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp, abd_get_from_buf(zgd->zgd_db->db_data, zgd->zgd_db->db_size), zgd->zgd_db->db_size, zgd->zgd_db->db_size, zp, dmu_sync_late_arrival_ready, NULL, NULL, dmu_sync_late_arrival_done, dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb)); return (0); } /* * Intent log support: sync the block associated with db to disk. * N.B. and XXX: the caller is responsible for making sure that the * data isn't changing while dmu_sync() is writing it.
* * Return values: * * EEXIST: this txg has already been synced, so there's nothing to do. * The caller should not log the write. * * ENOENT: the block was dbuf_free_range()'d, so there's nothing to do. * The caller should not log the write. * * EALREADY: this block is already in the process of being synced. * The caller should track its progress (somehow). * * EIO: could not do the I/O. * The caller should do a txg_wait_synced(). * * 0: the I/O has been initiated. * The caller should log this blkptr in the done callback. * It is possible that the I/O will fail, in which case * the error will be reported to the done callback and * propagated to pio from zio_done(). */ int dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) { blkptr_t *bp = zgd->zgd_bp; dmu_buf_impl_t *db = (dmu_buf_impl_t *)zgd->zgd_db; objset_t *os = db->db_objset; dsl_dataset_t *ds = os->os_dsl_dataset; dbuf_dirty_record_t *dr; dmu_sync_arg_t *dsa; zbookmark_phys_t zb; zio_prop_t zp; dnode_t *dn; ASSERT(pio != NULL); ASSERT(txg != 0); SET_BOOKMARK(&zb, ds->ds_object, db->db.db_object, db->db_level, db->db_blkid); DB_DNODE_ENTER(db); dn = DB_DNODE(db); dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC, ZIO_COMPRESS_INHERIT, &zp); DB_DNODE_EXIT(db); /* * If we're frozen (running ziltest), we always need to generate a bp. */ if (txg > spa_freeze_txg(os->os_spa)) return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb)); /* * Grabbing db_mtx now provides a barrier between dbuf_sync_leaf() * and us. If we determine that this txg is not yet syncing, * but it begins to sync a moment later, that's OK because the * sync thread will block in dbuf_sync_leaf() until we drop db_mtx. */ mutex_enter(&db->db_mtx); if (txg <= spa_last_synced_txg(os->os_spa)) { /* * This txg has already synced. There's nothing to do. */ mutex_exit(&db->db_mtx); return (SET_ERROR(EEXIST)); } if (txg <= spa_syncing_txg(os->os_spa)) { /* * This txg is currently syncing, so we can't mess with * the dirty record anymore; just write a new log block. */ mutex_exit(&db->db_mtx); return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb)); } dr = db->db_last_dirty; while (dr && dr->dr_txg != txg) dr = dr->dr_next; if (dr == NULL) { /* * There's no dr for this dbuf, so it must have been freed. * There's no need to log writes to freed blocks, so we're done. */ mutex_exit(&db->db_mtx); return (SET_ERROR(ENOENT)); } ASSERT(dr->dr_next == NULL || dr->dr_next->dr_txg < txg); /* * Assume the on-disk data is X, the current syncing data (in * txg - 1) is Y, and the current in-memory data is Z (currently * in dmu_sync). * * We usually want to perform a nopwrite if X and Z are the * same. However, if Y is different (i.e. the BP is going to * change before this write takes effect), then a nopwrite will * be incorrect - we would override with X, which could have * been freed when Y was written. * * (Note that this is not a concern when we are nop-writing from * syncing context, because X and Y must be identical, because * all previous txgs have been synced.) * * Therefore, we disable nopwrite if the current BP could change * before this TXG. There are two ways it could change: by * being dirty (dr_next is non-NULL), or by being freed * (dnode_block_freed()). This behavior is verified by * zio_done(), which VERIFYs that the override BP is identical * to the on-disk BP. 
*/ DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (dr->dr_next != NULL || dnode_block_freed(dn, db->db_blkid)) zp.zp_nopwrite = B_FALSE; DB_DNODE_EXIT(db); ASSERT(dr->dr_txg == txg); if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC || dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { /* * We have already issued a sync write for this buffer, * or this buffer has already been synced. It could not * have been dirtied since, or we would have cleared the state. */ mutex_exit(&db->db_mtx); return (SET_ERROR(EALREADY)); } ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC; mutex_exit(&db->db_mtx); dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP); dsa->dsa_dr = dr; dsa->dsa_done = done; dsa->dsa_zgd = zgd; dsa->dsa_tx = NULL; zio_nowait(arc_write(pio, os->os_spa, txg, bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db), &zp, dmu_sync_ready, NULL, NULL, dmu_sync_done, dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb)); return (0); } int dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs, - dmu_tx_t *tx) + dmu_tx_t *tx) { dnode_t *dn; int err; err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); err = dnode_set_blksz(dn, size, ibs, tx); dnode_rele(dn, FTAG); return (err); } void dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum, - dmu_tx_t *tx) + dmu_tx_t *tx) { dnode_t *dn; /* * Send streams include each object's checksum function. This * check ensures that the receiving system can understand the * checksum function transmitted. */ ASSERT3U(checksum, <, ZIO_CHECKSUM_LEGACY_FUNCTIONS); VERIFY0(dnode_hold(os, object, FTAG, &dn)); ASSERT3U(checksum, <, ZIO_CHECKSUM_FUNCTIONS); dn->dn_checksum = checksum; dnode_setdirty(dn, tx); dnode_rele(dn, FTAG); } void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, - dmu_tx_t *tx) + dmu_tx_t *tx) { dnode_t *dn; /* * Send streams include each object's compression function. This * check ensures that the receiving system can understand the * compression function transmitted. */ ASSERT3U(compress, <, ZIO_COMPRESS_LEGACY_FUNCTIONS); VERIFY0(dnode_hold(os, object, FTAG, &dn)); dn->dn_compress = compress; dnode_setdirty(dn, tx); dnode_rele(dn, FTAG); } int zfs_mdcomp_disable = 0; /* * When the "redundant_metadata" property is set to "most", only indirect * blocks of this level and higher will have an additional ditto block. */ int zfs_redundant_metadata_most_ditto_level = 2; void dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, enum zio_compress override_compress, zio_prop_t *zp) { dmu_object_type_t type = dn ? dn->dn_type : DMU_OT_OBJSET; boolean_t ismd = (level > 0 || DMU_OT_IS_METADATA(type) || (wp & WP_SPILL)); enum zio_checksum checksum = os->os_checksum; enum zio_compress compress = os->os_compress; enum zio_checksum dedup_checksum = os->os_dedup_checksum; boolean_t dedup = B_FALSE; boolean_t nopwrite = B_FALSE; boolean_t dedup_verify = os->os_dedup_verify; int copies = os->os_copies; /* * We maintain different write policies for each of the following * types of data: * 1. metadata * 2. preallocated blocks (i.e. level-0 blocks of a dump device) * 3. all other level 0 blocks */ if (ismd) { if (zfs_mdcomp_disable) { compress = ZIO_COMPRESS_EMPTY; } else { /* * XXX -- we should design a compression algorithm * that specializes in arrays of bps. */ compress = zio_compress_select(os->os_spa, ZIO_COMPRESS_ON, ZIO_COMPRESS_ON); } /* * Metadata always gets checksummed. 
If the data * checksum is multi-bit correctable, and it's not a * ZBT-style checksum, then it's suitable for metadata * as well. Otherwise, the metadata checksum defaults * to fletcher4. */ if (!(zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_METADATA) || (zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_EMBEDDED)) checksum = ZIO_CHECKSUM_FLETCHER_4; if (os->os_redundant_metadata == ZFS_REDUNDANT_METADATA_ALL || (os->os_redundant_metadata == ZFS_REDUNDANT_METADATA_MOST && (level >= zfs_redundant_metadata_most_ditto_level || DMU_OT_IS_METADATA(type) || (wp & WP_SPILL)))) copies++; } else if (wp & WP_NOFILL) { ASSERT(level == 0); /* * If we're writing preallocated blocks, we aren't actually * writing them so don't set any policy properties. These * blocks are currently only used by an external subsystem * outside of zfs (i.e. dump) and not written by the zio * pipeline. */ compress = ZIO_COMPRESS_OFF; checksum = ZIO_CHECKSUM_OFF; } else { compress = zio_compress_select(os->os_spa, dn->dn_compress, compress); checksum = (dedup_checksum == ZIO_CHECKSUM_OFF) ? zio_checksum_select(dn->dn_checksum, checksum) : dedup_checksum; /* * Determine dedup setting. If we are in dmu_sync(), * we won't actually dedup now because that's all * done in syncing context; but we do want to use the * dedup checksum. If the checksum is not strong * enough to ensure unique signatures, force * dedup_verify. */ if (dedup_checksum != ZIO_CHECKSUM_OFF) { dedup = (wp & WP_DMU_SYNC) ? B_FALSE : B_TRUE; if (!(zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_DEDUP)) dedup_verify = B_TRUE; } /* * Enable nopwrite if we have a secure enough checksum * algorithm (see comment in zio_nop_write) and * compression is enabled. We don't enable nopwrite if * dedup is enabled as the two features are mutually * exclusive. */ nopwrite = (!dedup && (zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_NOPWRITE) && compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled); } zp->zp_checksum = checksum; /* * If we're writing a pre-compressed buffer, the compression type we use * must match the data. If it hasn't been compressed yet, then we should * use the value dictated by the policies above. */ zp->zp_compress = override_compress != ZIO_COMPRESS_INHERIT ? override_compress : compress; ASSERT3U(zp->zp_compress, !=, ZIO_COMPRESS_INHERIT); zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type; zp->zp_level = level; zp->zp_copies = MIN(copies, spa_max_replication(os->os_spa)); zp->zp_dedup = dedup; zp->zp_dedup_verify = dedup && dedup_verify; zp->zp_nopwrite = nopwrite; } int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off) { dnode_t *dn; int i, err; err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); /* * Sync any current changes before * we go trundling through the block pointers. */ for (i = 0; i < TXG_SIZE; i++) { if (list_link_active(&dn->dn_dirty_link[i])) break; } if (i != TXG_SIZE) { dnode_rele(dn, FTAG); txg_wait_synced(dmu_objset_pool(os), 0); err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); } err = dnode_next_offset(dn, (hole ? DNODE_FIND_HOLE : 0), off, 1, 1, 0); dnode_rele(dn, FTAG); return (err); } void __dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) { dnode_phys_t *dnp = dn->dn_phys; int i; doi->doi_data_block_size = dn->dn_datablksz; doi->doi_metadata_block_size = dn->dn_indblkshift ?
1ULL << dn->dn_indblkshift : 0; doi->doi_type = dn->dn_type; doi->doi_bonus_type = dn->dn_bonustype; doi->doi_bonus_size = dn->dn_bonuslen; doi->doi_dnodesize = dn->dn_num_slots << DNODE_SHIFT; doi->doi_indirection = dn->dn_nlevels; doi->doi_checksum = dn->dn_checksum; doi->doi_compress = dn->dn_compress; doi->doi_nblkptr = dn->dn_nblkptr; doi->doi_physical_blocks_512 = (DN_USED_BYTES(dnp) + 256) >> 9; doi->doi_max_offset = (dn->dn_maxblkid + 1) * dn->dn_datablksz; doi->doi_fill_count = 0; for (i = 0; i < dnp->dn_nblkptr; i++) doi->doi_fill_count += BP_GET_FILL(&dnp->dn_blkptr[i]); } void dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) { rw_enter(&dn->dn_struct_rwlock, RW_READER); mutex_enter(&dn->dn_mtx); __dmu_object_info_from_dnode(dn, doi); mutex_exit(&dn->dn_mtx); rw_exit(&dn->dn_struct_rwlock); } /* * Get information on a DMU object. * If doi is NULL, just indicates whether the object exists. */ int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi) { dnode_t *dn; int err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); if (doi != NULL) dmu_object_info_from_dnode(dn, doi); dnode_rele(dn, FTAG); return (0); } /* * As above, but faster; can be used when you have a held dbuf in hand. */ void dmu_object_info_from_db(dmu_buf_t *db_fake, dmu_object_info_t *doi) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; DB_DNODE_ENTER(db); dmu_object_info_from_dnode(DB_DNODE(db), doi); DB_DNODE_EXIT(db); } /* * Faster still when you only care about the size. * This is specifically optimized for zfs_getattr(). */ void dmu_object_size_from_db(dmu_buf_t *db_fake, uint32_t *blksize, u_longlong_t *nblk512) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; DB_DNODE_ENTER(db); dn = DB_DNODE(db); *blksize = dn->dn_datablksz; /* add in number of slots used for the dnode itself */ *nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >> SPA_MINBLOCKSHIFT) + dn->dn_num_slots; DB_DNODE_EXIT(db); } void dmu_object_dnsize_from_db(dmu_buf_t *db_fake, int *dnsize) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; DB_DNODE_ENTER(db); dn = DB_DNODE(db); *dnsize = dn->dn_num_slots << DNODE_SHIFT; DB_DNODE_EXIT(db); } void byteswap_uint64_array(void *vbuf, size_t size) { uint64_t *buf = vbuf; size_t count = size >> 3; int i; ASSERT((size & 7) == 0); for (i = 0; i < count; i++) buf[i] = BSWAP_64(buf[i]); } void byteswap_uint32_array(void *vbuf, size_t size) { uint32_t *buf = vbuf; size_t count = size >> 2; int i; ASSERT((size & 3) == 0); for (i = 0; i < count; i++) buf[i] = BSWAP_32(buf[i]); } void byteswap_uint16_array(void *vbuf, size_t size) { uint16_t *buf = vbuf; size_t count = size >> 1; int i; ASSERT((size & 1) == 0); for (i = 0; i < count; i++) buf[i] = BSWAP_16(buf[i]); } /* ARGSUSED */ void byteswap_uint8_array(void *vbuf, size_t size) { } void dmu_init(void) { abd_init(); zfs_dbgmsg_init(); sa_cache_init(); xuio_stat_init(); dmu_objset_init(); dnode_init(); zfetch_init(); dmu_tx_init(); l2arc_init(); arc_init(); dbuf_init(); } void dmu_fini(void) { arc_fini(); /* arc depends on l2arc, so arc must go first */ l2arc_fini(); dmu_tx_fini(); zfetch_fini(); dbuf_fini(); dnode_fini(); dmu_objset_fini(); xuio_stat_fini(); sa_cache_fini(); zfs_dbgmsg_fini(); abd_fini(); } #if defined(_KERNEL) && defined(HAVE_SPL) EXPORT_SYMBOL(dmu_bonus_hold); EXPORT_SYMBOL(dmu_buf_hold_array_by_bonus); EXPORT_SYMBOL(dmu_buf_rele_array); EXPORT_SYMBOL(dmu_prefetch); EXPORT_SYMBOL(dmu_free_range); EXPORT_SYMBOL(dmu_free_long_range); 
EXPORT_SYMBOL(dmu_free_long_object); EXPORT_SYMBOL(dmu_read); EXPORT_SYMBOL(dmu_read_by_dnode); EXPORT_SYMBOL(dmu_write); EXPORT_SYMBOL(dmu_write_by_dnode); EXPORT_SYMBOL(dmu_prealloc); EXPORT_SYMBOL(dmu_object_info); EXPORT_SYMBOL(dmu_object_info_from_dnode); EXPORT_SYMBOL(dmu_object_info_from_db); EXPORT_SYMBOL(dmu_object_size_from_db); EXPORT_SYMBOL(dmu_object_dnsize_from_db); EXPORT_SYMBOL(dmu_object_set_blocksize); EXPORT_SYMBOL(dmu_object_set_checksum); EXPORT_SYMBOL(dmu_object_set_compress); EXPORT_SYMBOL(dmu_write_policy); EXPORT_SYMBOL(dmu_sync); EXPORT_SYMBOL(dmu_request_arcbuf); EXPORT_SYMBOL(dmu_return_arcbuf); EXPORT_SYMBOL(dmu_assign_arcbuf); EXPORT_SYMBOL(dmu_buf_hold); EXPORT_SYMBOL(dmu_ot); module_param(zfs_mdcomp_disable, int, 0644); MODULE_PARM_DESC(zfs_mdcomp_disable, "Disable meta data compression"); module_param(zfs_nopwrite_enabled, int, 0644); MODULE_PARM_DESC(zfs_nopwrite_enabled, "Enable NOP writes"); #endif diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c index d73993428e81..e31f6e3c04a6 100644 --- a/module/zfs/dmu_send.c +++ b/module/zfs/dmu_send.c @@ -1,3373 +1,3373 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2011, 2015 by Delphix. All rights reserved. * Copyright (c) 2014, Joyent, Inc. All rights reserved. * Copyright 2014 HybridCluster. All rights reserved. * Copyright 2016 RackTop Systems. * Copyright (c) 2016 Actifio, Inc. All rights reserved. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */ int zfs_send_corrupt_data = B_FALSE; int zfs_send_queue_length = 16 * 1024 * 1024; int zfs_recv_queue_length = 16 * 1024 * 1024; /* Set this tunable to FALSE to disable setting of DRR_FLAG_FREERECORDS */ int zfs_send_set_freerecords_bit = B_TRUE; static char *dmu_recv_tag = "dmu_recv_tag"; const char *recv_clone_name = "%recv"; #define BP_SPAN(datablkszsec, indblkshift, level) \ (((uint64_t)datablkszsec) << (SPA_MINBLOCKSHIFT + \ (level) * (indblkshift - SPA_BLKPTRSHIFT))) static void byteswap_record(dmu_replay_record_t *drr); struct send_thread_arg { bqueue_t q; dsl_dataset_t *ds; /* Dataset to traverse */ uint64_t fromtxg; /* Traverse from this txg */ int flags; /* flags to pass to traverse_dataset */ int error_code; boolean_t cancel; zbookmark_phys_t resume; }; struct send_block_record { boolean_t eos_marker; /* Marks the end of the stream */ blkptr_t bp; zbookmark_phys_t zb; uint8_t indblkshift; uint16_t datablkszsec; bqueue_node_t ln; }; typedef struct dump_bytes_io { dmu_sendarg_t *dbi_dsp; void *dbi_buf; int dbi_len; } dump_bytes_io_t; static void dump_bytes_cb(void *arg) { dump_bytes_io_t *dbi = (dump_bytes_io_t *)arg; dmu_sendarg_t *dsp = dbi->dbi_dsp; dsl_dataset_t *ds = dmu_objset_ds(dsp->dsa_os); ssize_t resid; /* have to get resid to get detailed errno */ /* * The code does not rely on this (len being a multiple of 8). We keep * this assertion because of the corresponding assertion in * receive_read(). Keeping this assertion ensures that we do not * inadvertently break backwards compatibility (causing the assertion * in receive_read() to trigger on old software). * * Removing the assertions could be rolled into a new feature that uses * data that isn't 8-byte aligned; if the assertions were removed, a * feature flag would have to be added. */ ASSERT0(dbi->dbi_len % 8); dsp->dsa_err = vn_rdwr(UIO_WRITE, dsp->dsa_vp, (caddr_t)dbi->dbi_buf, dbi->dbi_len, 0, UIO_SYSSPACE, FAPPEND, RLIM64_INFINITY, CRED(), &resid); mutex_enter(&ds->ds_sendstream_lock); *dsp->dsa_off += dbi->dbi_len; mutex_exit(&ds->ds_sendstream_lock); } static int dump_bytes(dmu_sendarg_t *dsp, void *buf, int len) { dump_bytes_io_t dbi; dbi.dbi_dsp = dsp; dbi.dbi_buf = buf; dbi.dbi_len = len; #if defined(HAVE_LARGE_STACKS) dump_bytes_cb(&dbi); #else /* * The vn_rdwr() call is performed in a taskq to ensure that there is * always enough stack space to write safely to the target filesystem. * The ZIO_TYPE_FREE threads are used because there can be a lot of * them and they are used in vdev_file.c for a similar purpose. */ spa_taskq_dispatch_sync(dmu_objset_spa(dsp->dsa_os), ZIO_TYPE_FREE, ZIO_TASKQ_ISSUE, dump_bytes_cb, &dbi, TQ_SLEEP); #endif /* HAVE_LARGE_STACKS */ return (dsp->dsa_err); } /* * For all record types except BEGIN, fill in the checksum (overlaid in * drr_u.drr_checksum.drr_checksum). The checksum verifies everything * up to the start of the checksum itself. 
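 *
 * (Sketch of the flow below: the running fletcher-4 checksum is
 * accumulated in up to three incremental steps, the record up to
 * drr_checksum, the drr_checksum field itself, and then the payload,
 * if any, with dump_bytes() emitting the record and payload after
 * their contributions have been folded in.)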
*/ static int dump_record(dmu_sendarg_t *dsp, void *payload, int payload_len) { ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t)); (void) fletcher_4_incremental_native(dsp->dsa_drr, offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), &dsp->dsa_zc); if (dsp->dsa_drr->drr_type == DRR_BEGIN) { dsp->dsa_sent_begin = B_TRUE; } else { ASSERT(ZIO_CHECKSUM_IS_ZERO(&dsp->dsa_drr->drr_u. drr_checksum.drr_checksum)); dsp->dsa_drr->drr_u.drr_checksum.drr_checksum = dsp->dsa_zc; } if (dsp->dsa_drr->drr_type == DRR_END) { dsp->dsa_sent_end = B_TRUE; } (void) fletcher_4_incremental_native(&dsp->dsa_drr-> drr_u.drr_checksum.drr_checksum, sizeof (zio_cksum_t), &dsp->dsa_zc); if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0) return (SET_ERROR(EINTR)); if (payload_len != 0) { (void) fletcher_4_incremental_native(payload, payload_len, &dsp->dsa_zc); if (dump_bytes(dsp, payload, payload_len) != 0) return (SET_ERROR(EINTR)); } return (0); } /* * Fill in the drr_free struct, or perform aggregation if the previous record is * also a free record, and the two are adjacent. * * Note that we send free records even for a full send, because we want to be * able to receive a full send as a clone, which requires a list of all the free * and freeobject records that were generated on the source. */ static int dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, uint64_t length) { struct drr_free *drrf = &(dsp->dsa_drr->drr_u.drr_free); /* * When we receive a free record, dbuf_free_range() assumes * that the receiving system doesn't have any dbufs in the range * being freed. This is always true because there is a one-record * constraint: we only send one WRITE record for any given * object,offset. We know that the one-record constraint is * true because we always send data in increasing order by * object,offset. * * If the increasing-order constraint ever changes, we should find * another way to assert that the one-record constraint is still * satisfied. */ ASSERT(object > dsp->dsa_last_data_object || (object == dsp->dsa_last_data_object && offset > dsp->dsa_last_data_offset)); if (length != -1ULL && offset + length < offset) length = -1ULL; /* * If there is a pending op, but it's not PENDING_FREE, push it out, * since free block aggregation can only be done for blocks of the * same type (i.e., DRR_FREE records can only be aggregated with * other DRR_FREE records. DRR_FREEOBJECTS records can only be * aggregated with other DRR_FREEOBJECTS records. */ if (dsp->dsa_pending_op != PENDING_NONE && dsp->dsa_pending_op != PENDING_FREE) { if (dump_record(dsp, NULL, 0) != 0) return (SET_ERROR(EINTR)); dsp->dsa_pending_op = PENDING_NONE; } if (dsp->dsa_pending_op == PENDING_FREE) { /* * There should never be a PENDING_FREE if length is -1 * (because dump_dnode is the only place where this * function is called with a -1, and only after flushing * any pending record). */ ASSERT(length != -1ULL); /* * Check to see whether this free block can be aggregated * with pending one. */ if (drrf->drr_object == object && drrf->drr_offset + drrf->drr_length == offset) { drrf->drr_length += length; return (0); } else { /* not a continuation. 
Push out pending record */ if (dump_record(dsp, NULL, 0) != 0) return (SET_ERROR(EINTR)); dsp->dsa_pending_op = PENDING_NONE; } } /* create a FREE record and make it pending */ bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); dsp->dsa_drr->drr_type = DRR_FREE; drrf->drr_object = object; drrf->drr_offset = offset; drrf->drr_length = length; drrf->drr_toguid = dsp->dsa_toguid; if (length == -1ULL) { if (dump_record(dsp, NULL, 0) != 0) return (SET_ERROR(EINTR)); } else { dsp->dsa_pending_op = PENDING_FREE; } return (0); } static int dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type, uint64_t object, uint64_t offset, int lsize, int psize, const blkptr_t *bp, void *data) { uint64_t payload_size; struct drr_write *drrw = &(dsp->dsa_drr->drr_u.drr_write); /* * We send data in increasing object, offset order. * See comment in dump_free() for details. */ ASSERT(object > dsp->dsa_last_data_object || (object == dsp->dsa_last_data_object && offset > dsp->dsa_last_data_offset)); dsp->dsa_last_data_object = object; dsp->dsa_last_data_offset = offset + lsize - 1; /* * If there is any kind of pending aggregation (currently either * a grouping of free objects or free blocks), push it out to * the stream, since aggregation can't be done across operations * of different types. */ if (dsp->dsa_pending_op != PENDING_NONE) { if (dump_record(dsp, NULL, 0) != 0) return (SET_ERROR(EINTR)); dsp->dsa_pending_op = PENDING_NONE; } /* write a WRITE record */ bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); dsp->dsa_drr->drr_type = DRR_WRITE; drrw->drr_object = object; drrw->drr_type = type; drrw->drr_offset = offset; drrw->drr_toguid = dsp->dsa_toguid; drrw->drr_logical_size = lsize; /* only set the compression fields if the buf is compressed */ if (lsize != psize) { ASSERT(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_COMPRESSED); ASSERT(!BP_IS_EMBEDDED(bp)); ASSERT(!BP_SHOULD_BYTESWAP(bp)); ASSERT(!DMU_OT_IS_METADATA(BP_GET_TYPE(bp))); ASSERT3U(BP_GET_COMPRESS(bp), !=, ZIO_COMPRESS_OFF); ASSERT3S(psize, >, 0); ASSERT3S(lsize, >=, psize); drrw->drr_compressiontype = BP_GET_COMPRESS(bp); drrw->drr_compressed_size = psize; payload_size = drrw->drr_compressed_size; } else { payload_size = drrw->drr_logical_size; } if (bp == NULL || BP_IS_EMBEDDED(bp)) { /* * There's no pre-computed checksum for partial-block * writes or embedded BP's, so (like * fletcher4-checksummed blocks) userland will have to * compute a dedup-capable checksum itself.
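 *
 * A consumer would do, in outline (compute_cksum is a stand-in for
 * whatever dedup-capable checksum it picks, not a libzfs routine):
 *
 *     if (drrw->drr_checksumtype == ZIO_CHECKSUM_OFF)
 *             compute_cksum(payload, payload_size,
 *                 &drrw->drr_key.ddk_cksum);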
*/ drrw->drr_checksumtype = ZIO_CHECKSUM_OFF; } else { drrw->drr_checksumtype = BP_GET_CHECKSUM(bp); if (zio_checksum_table[drrw->drr_checksumtype].ci_flags & ZCHECKSUM_FLAG_DEDUP) drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP; DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp)); DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp)); DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp)); drrw->drr_key.ddk_cksum = bp->blk_cksum; } if (dump_record(dsp, data, payload_size) != 0) return (SET_ERROR(EINTR)); return (0); } static int dump_write_embedded(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, int blksz, const blkptr_t *bp) { char buf[BPE_PAYLOAD_SIZE]; struct drr_write_embedded *drrw = &(dsp->dsa_drr->drr_u.drr_write_embedded); if (dsp->dsa_pending_op != PENDING_NONE) { if (dump_record(dsp, NULL, 0) != 0) return (EINTR); dsp->dsa_pending_op = PENDING_NONE; } ASSERT(BP_IS_EMBEDDED(bp)); bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); dsp->dsa_drr->drr_type = DRR_WRITE_EMBEDDED; drrw->drr_object = object; drrw->drr_offset = offset; drrw->drr_length = blksz; drrw->drr_toguid = dsp->dsa_toguid; drrw->drr_compression = BP_GET_COMPRESS(bp); drrw->drr_etype = BPE_GET_ETYPE(bp); drrw->drr_lsize = BPE_GET_LSIZE(bp); drrw->drr_psize = BPE_GET_PSIZE(bp); decode_embedded_bp_compressed(bp, buf); if (dump_record(dsp, buf, P2ROUNDUP(drrw->drr_psize, 8)) != 0) return (EINTR); return (0); } static int dump_spill(dmu_sendarg_t *dsp, uint64_t object, int blksz, void *data) { struct drr_spill *drrs = &(dsp->dsa_drr->drr_u.drr_spill); if (dsp->dsa_pending_op != PENDING_NONE) { if (dump_record(dsp, NULL, 0) != 0) return (SET_ERROR(EINTR)); dsp->dsa_pending_op = PENDING_NONE; } /* write a SPILL record */ bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); dsp->dsa_drr->drr_type = DRR_SPILL; drrs->drr_object = object; drrs->drr_length = blksz; drrs->drr_toguid = dsp->dsa_toguid; if (dump_record(dsp, data, blksz) != 0) return (SET_ERROR(EINTR)); return (0); } static int dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs) { struct drr_freeobjects *drrfo = &(dsp->dsa_drr->drr_u.drr_freeobjects); /* * If there is a pending op, but it's not PENDING_FREEOBJECTS, * push it out, since free block aggregation can only be done for * blocks of the same type (i.e., DRR_FREE records can only be * aggregated with other DRR_FREE records. DRR_FREEOBJECTS records * can only be aggregated with other DRR_FREEOBJECTS records. */ if (dsp->dsa_pending_op != PENDING_NONE && dsp->dsa_pending_op != PENDING_FREEOBJECTS) { if (dump_record(dsp, NULL, 0) != 0) return (SET_ERROR(EINTR)); dsp->dsa_pending_op = PENDING_NONE; } if (dsp->dsa_pending_op == PENDING_FREEOBJECTS) { /* * See whether this free object array can be aggregated * with pending one */ if (drrfo->drr_firstobj + drrfo->drr_numobjs == firstobj) { drrfo->drr_numobjs += numobjs; return (0); } else { /* can't be aggregated. 
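 *
 * e.g. a pending record for objects [100, 120) absorbs a run with
 * drr_firstobj == 120 (growing to [100, 130) for ten objects), but a
 * run starting at 130 would leave [120, 130) unaccounted for.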
Push out pending record */ if (dump_record(dsp, NULL, 0) != 0) return (SET_ERROR(EINTR)); dsp->dsa_pending_op = PENDING_NONE; } } /* write a FREEOBJECTS record */ bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); dsp->dsa_drr->drr_type = DRR_FREEOBJECTS; drrfo->drr_firstobj = firstobj; drrfo->drr_numobjs = numobjs; drrfo->drr_toguid = dsp->dsa_toguid; dsp->dsa_pending_op = PENDING_FREEOBJECTS; return (0); } static int dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp) { struct drr_object *drro = &(dsp->dsa_drr->drr_u.drr_object); if (object < dsp->dsa_resume_object) { /* * Note: when resuming, we will visit all the dnodes in * the block of dnodes that we are resuming from. In * this case it's unnecessary to send the dnodes prior to * the one we are resuming from. We should be at most one * block's worth of dnodes behind the resume point. */ ASSERT3U(dsp->dsa_resume_object - object, <, 1 << (DNODE_BLOCK_SHIFT - DNODE_SHIFT)); return (0); } if (dnp == NULL || dnp->dn_type == DMU_OT_NONE) return (dump_freeobjects(dsp, object, 1)); if (dsp->dsa_pending_op != PENDING_NONE) { if (dump_record(dsp, NULL, 0) != 0) return (SET_ERROR(EINTR)); dsp->dsa_pending_op = PENDING_NONE; } /* write an OBJECT record */ bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); dsp->dsa_drr->drr_type = DRR_OBJECT; drro->drr_object = object; drro->drr_type = dnp->dn_type; drro->drr_bonustype = dnp->dn_bonustype; drro->drr_blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT; drro->drr_bonuslen = dnp->dn_bonuslen; drro->drr_dn_slots = dnp->dn_extra_slots + 1; drro->drr_checksumtype = dnp->dn_checksum; drro->drr_compress = dnp->dn_compress; drro->drr_toguid = dsp->dsa_toguid; if (!(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) && drro->drr_blksz > SPA_OLD_MAXBLOCKSIZE) drro->drr_blksz = SPA_OLD_MAXBLOCKSIZE; if (dump_record(dsp, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8)) != 0) { return (SET_ERROR(EINTR)); } /* Free anything past the end of the file. */ if (dump_free(dsp, object, (dnp->dn_maxblkid + 1) * (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL) != 0) return (SET_ERROR(EINTR)); if (dsp->dsa_err != 0) return (SET_ERROR(EINTR)); return (0); } static boolean_t backup_do_embed(dmu_sendarg_t *dsp, const blkptr_t *bp) { if (!BP_IS_EMBEDDED(bp)) return (B_FALSE); /* * Compression function must be legacy, or explicitly enabled. */ if ((BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_LEGACY_FUNCTIONS && !(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_LZ4))) return (B_FALSE); /* * Embed type must be explicitly enabled. */ switch (BPE_GET_ETYPE(bp)) { case BP_EMBEDDED_TYPE_DATA: if (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) return (B_TRUE); break; default: return (B_FALSE); } return (B_FALSE); } /* * This is the callback function to traverse_dataset that acts as the worker * thread for dmu_send_impl. 
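 *
 * The traversal is the producer half of the bounded-queue pipeline
 * that dmu_send_impl() sets up:
 *
 *     traverse_dataset() -> send_cb() -> bqueue_enqueue(&sta->q)
 *     bqueue_dequeue(&to_arg.q) -> do_dump() -> dump_bytes()
 *
 * so prefetched block records accumulate ahead of the consumer that
 * formats and writes the stream.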
*/ /*ARGSUSED*/ static int send_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_phys_t *zb, const struct dnode_phys *dnp, void *arg) { struct send_thread_arg *sta = arg; struct send_block_record *record; uint64_t record_size; int err = 0; ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT || zb->zb_object >= sta->resume.zb_object); if (sta->cancel) return (SET_ERROR(EINTR)); if (bp == NULL) { ASSERT3U(zb->zb_level, ==, ZB_DNODE_LEVEL); return (0); } else if (zb->zb_level < 0) { return (0); } record = kmem_zalloc(sizeof (struct send_block_record), KM_SLEEP); record->eos_marker = B_FALSE; record->bp = *bp; record->zb = *zb; record->indblkshift = dnp->dn_indblkshift; record->datablkszsec = dnp->dn_datablkszsec; record_size = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT; bqueue_enqueue(&sta->q, record, record_size); return (err); } /* * This function kicks off the traverse_dataset. It also handles setting the * error code of the thread in case something goes wrong, and pushes the End of * Stream record when the traverse_dataset call has finished. If there is no * dataset to traverse, the thread immediately pushes End of Stream marker. */ static void send_traverse_thread(void *arg) { struct send_thread_arg *st_arg = arg; int err; struct send_block_record *data; fstrans_cookie_t cookie = spl_fstrans_mark(); if (st_arg->ds != NULL) { err = traverse_dataset_resume(st_arg->ds, st_arg->fromtxg, &st_arg->resume, st_arg->flags, send_cb, st_arg); if (err != EINTR) st_arg->error_code = err; } data = kmem_zalloc(sizeof (*data), KM_SLEEP); data->eos_marker = B_TRUE; bqueue_enqueue(&st_arg->q, data, 1); spl_fstrans_unmark(cookie); thread_exit(); } /* * This function actually handles figuring out what kind of record needs to be * dumped, reading the data (which has hopefully been prefetched), and calling * the appropriate helper function. */ static int do_dump(dmu_sendarg_t *dsa, struct send_block_record *data) { dsl_dataset_t *ds = dmu_objset_ds(dsa->dsa_os); const blkptr_t *bp = &data->bp; const zbookmark_phys_t *zb = &data->zb; uint8_t indblkshift = data->indblkshift; uint16_t dblkszsec = data->datablkszsec; spa_t *spa = ds->ds_dir->dd_pool->dp_spa; dmu_object_type_t type = bp ? 
BP_GET_TYPE(bp) : DMU_OT_NONE; int err = 0; uint64_t dnobj; ASSERT3U(zb->zb_level, >=, 0); ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT || zb->zb_object >= dsa->dsa_resume_object); if (zb->zb_object != DMU_META_DNODE_OBJECT && DMU_OBJECT_IS_SPECIAL(zb->zb_object)) { return (0); } else if (BP_IS_HOLE(bp) && zb->zb_object == DMU_META_DNODE_OBJECT) { uint64_t span = BP_SPAN(dblkszsec, indblkshift, zb->zb_level); uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT; err = dump_freeobjects(dsa, dnobj, span >> DNODE_SHIFT); } else if (BP_IS_HOLE(bp)) { uint64_t span = BP_SPAN(dblkszsec, indblkshift, zb->zb_level); uint64_t offset = zb->zb_blkid * span; err = dump_free(dsa, zb->zb_object, offset, span); } else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) { return (0); } else if (type == DMU_OT_DNODE) { dnode_phys_t *blk; int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; arc_flags_t aflags = ARC_FLAG_WAIT; arc_buf_t *abuf; int i; ASSERT0(zb->zb_level); if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &aflags, zb) != 0) return (SET_ERROR(EIO)); blk = abuf->b_data; dnobj = zb->zb_blkid * epb; for (i = 0; i < epb; i += blk[i].dn_extra_slots + 1) { err = dump_dnode(dsa, dnobj + i, blk + i); if (err != 0) break; } arc_buf_destroy(abuf, &abuf); } else if (type == DMU_OT_SA) { arc_flags_t aflags = ARC_FLAG_WAIT; arc_buf_t *abuf; int blksz = BP_GET_LSIZE(bp); if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &aflags, zb) != 0) return (SET_ERROR(EIO)); err = dump_spill(dsa, zb->zb_object, blksz, abuf->b_data); arc_buf_destroy(abuf, &abuf); } else if (backup_do_embed(dsa, bp)) { /* it's an embedded level-0 block of a regular object */ int blksz = dblkszsec << SPA_MINBLOCKSHIFT; ASSERT0(zb->zb_level); err = dump_write_embedded(dsa, zb->zb_object, zb->zb_blkid * blksz, blksz, bp); } else { /* it's a level-0 block of a regular object */ arc_flags_t aflags = ARC_FLAG_WAIT; arc_buf_t *abuf; int blksz = dblkszsec << SPA_MINBLOCKSHIFT; uint64_t offset; enum zio_flag zioflags = ZIO_FLAG_CANFAIL; /* * If we have large blocks stored on disk but the send flags * don't allow us to send large blocks, we split the data from * the arc buf into chunks. 
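 *
 * e.g. a 1M block sent without DMU_BACKUP_FEATURE_LARGE_BLOCKS
 * becomes eight WRITE records of SPA_OLD_MAXBLOCKSIZE (128K) each,
 * at offsets 0, 128K, ..., 896K, with lsize == psize (split data is
 * never sent compressed; see the arc_get_compression() assert below).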
*/ boolean_t split_large_blocks = data->datablkszsec > SPA_OLD_MAXBLOCKSIZE && !(dsa->dsa_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS); /* * We should only request compressed data from the ARC if all * the following are true: * - stream compression was requested * - we aren't splitting large blocks into smaller chunks * - the data won't need to be byteswapped before sending * - this isn't an embedded block * - this isn't metadata (if receiving on a different endian * system it can be byteswapped more easily) */ boolean_t request_compressed = (dsa->dsa_featureflags & DMU_BACKUP_FEATURE_COMPRESSED) && !split_large_blocks && !BP_SHOULD_BYTESWAP(bp) && !BP_IS_EMBEDDED(bp) && !DMU_OT_IS_METADATA(BP_GET_TYPE(bp)); ASSERT0(zb->zb_level); ASSERT(zb->zb_object > dsa->dsa_resume_object || (zb->zb_object == dsa->dsa_resume_object && zb->zb_blkid * blksz >= dsa->dsa_resume_offset)); if (request_compressed) zioflags |= ZIO_FLAG_RAW; if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, zioflags, &aflags, zb) != 0) { if (zfs_send_corrupt_data) { uint64_t *ptr; /* Send a block filled with 0x"zfs badd bloc" */ abuf = arc_alloc_buf(spa, &abuf, ARC_BUFC_DATA, blksz); for (ptr = abuf->b_data; (char *)ptr < (char *)abuf->b_data + blksz; ptr++) *ptr = 0x2f5baddb10cULL; } else { return (SET_ERROR(EIO)); } } offset = zb->zb_blkid * blksz; if (split_large_blocks) { char *buf = abuf->b_data; ASSERT3U(arc_get_compression(abuf), ==, ZIO_COMPRESS_OFF); while (blksz > 0 && err == 0) { int n = MIN(blksz, SPA_OLD_MAXBLOCKSIZE); err = dump_write(dsa, type, zb->zb_object, offset, n, n, NULL, buf); offset += n; buf += n; blksz -= n; } } else { err = dump_write(dsa, type, zb->zb_object, offset, blksz, arc_buf_size(abuf), bp, abuf->b_data); } arc_buf_destroy(abuf, &abuf); } ASSERT(err == 0 || err == EINTR); return (err); } /* * Pop the new data off the queue, and free the old data. */ static struct send_block_record * get_next_record(bqueue_t *bq, struct send_block_record *data) { struct send_block_record *tmp = bqueue_dequeue(bq); kmem_free(data, sizeof (*data)); return (tmp); } /* * Actually do the bulk of the work in a zfs send. * * Note: Releases dp using the specified tag. 
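 *
 * In outline (each step names a function defined above):
 *
 *     dump_record(BEGIN record [+ resume nvlist payload]);
 *     thread_create(send_traverse_thread)    - fills to_arg.q
 *     while (!to_data->eos_marker)
 *             do_dump(dsp, to_data);         - drains to_arg.q
 *     dump_record(END record carrying the running checksum);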
*/ static int dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds, zfs_bookmark_phys_t *ancestor_zb, boolean_t is_clone, boolean_t embedok, boolean_t large_block_ok, boolean_t compressok, int outfd, uint64_t resumeobj, uint64_t resumeoff, vnode_t *vp, offset_t *off) { objset_t *os; dmu_replay_record_t *drr; dmu_sendarg_t *dsp; int err; uint64_t fromtxg = 0; uint64_t featureflags = 0; struct send_thread_arg to_arg; void *payload = NULL; size_t payload_len = 0; struct send_block_record *to_data; err = dmu_objset_from_ds(to_ds, &os); if (err != 0) { dsl_pool_rele(dp, tag); return (err); } drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP); drr->drr_type = DRR_BEGIN; drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC; DMU_SET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo, DMU_SUBSTREAM); bzero(&to_arg, sizeof (to_arg)); #ifdef _KERNEL if (dmu_objset_type(os) == DMU_OST_ZFS) { uint64_t version; if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &version) != 0) { kmem_free(drr, sizeof (dmu_replay_record_t)); dsl_pool_rele(dp, tag); return (SET_ERROR(EINVAL)); } if (version >= ZPL_VERSION_SA) { featureflags |= DMU_BACKUP_FEATURE_SA_SPILL; } } #endif if (large_block_ok && to_ds->ds_feature_inuse[SPA_FEATURE_LARGE_BLOCKS]) featureflags |= DMU_BACKUP_FEATURE_LARGE_BLOCKS; if (to_ds->ds_feature_inuse[SPA_FEATURE_LARGE_DNODE]) featureflags |= DMU_BACKUP_FEATURE_LARGE_DNODE; if (embedok && spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) { featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA; } if (compressok) { featureflags |= DMU_BACKUP_FEATURE_COMPRESSED; } if ((featureflags & (DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_COMPRESSED)) != 0 && spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) { featureflags |= DMU_BACKUP_FEATURE_LZ4; } if (resumeobj != 0 || resumeoff != 0) { featureflags |= DMU_BACKUP_FEATURE_RESUMING; } DMU_SET_FEATUREFLAGS(drr->drr_u.drr_begin.drr_versioninfo, featureflags); drr->drr_u.drr_begin.drr_creation_time = dsl_dataset_phys(to_ds)->ds_creation_time; drr->drr_u.drr_begin.drr_type = dmu_objset_type(os); if (is_clone) drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE; drr->drr_u.drr_begin.drr_toguid = dsl_dataset_phys(to_ds)->ds_guid; if (dsl_dataset_phys(to_ds)->ds_flags & DS_FLAG_CI_DATASET) drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA; if (zfs_send_set_freerecords_bit) drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_FREERECORDS; if (ancestor_zb != NULL) { drr->drr_u.drr_begin.drr_fromguid = ancestor_zb->zbm_guid; fromtxg = ancestor_zb->zbm_creation_txg; } dsl_dataset_name(to_ds, drr->drr_u.drr_begin.drr_toname); if (!to_ds->ds_is_snapshot) { (void) strlcat(drr->drr_u.drr_begin.drr_toname, "@--head--", sizeof (drr->drr_u.drr_begin.drr_toname)); } dsp = kmem_zalloc(sizeof (dmu_sendarg_t), KM_SLEEP); dsp->dsa_drr = drr; dsp->dsa_vp = vp; dsp->dsa_outfd = outfd; dsp->dsa_proc = curproc; dsp->dsa_os = os; dsp->dsa_off = off; dsp->dsa_toguid = dsl_dataset_phys(to_ds)->ds_guid; dsp->dsa_pending_op = PENDING_NONE; dsp->dsa_featureflags = featureflags; dsp->dsa_resume_object = resumeobj; dsp->dsa_resume_offset = resumeoff; mutex_enter(&to_ds->ds_sendstream_lock); list_insert_head(&to_ds->ds_sendstreams, dsp); mutex_exit(&to_ds->ds_sendstream_lock); dsl_dataset_long_hold(to_ds, FTAG); dsl_pool_rele(dp, tag); if (resumeobj != 0 || resumeoff != 0) { dmu_object_info_t to_doi; nvlist_t *nvl; err = dmu_object_info(os, resumeobj, &to_doi); if (err != 0) goto out; SET_BOOKMARK(&to_arg.resume, to_ds->ds_object, resumeobj, 0, resumeoff / 
to_doi.doi_data_block_size); nvl = fnvlist_alloc(); fnvlist_add_uint64(nvl, "resume_object", resumeobj); fnvlist_add_uint64(nvl, "resume_offset", resumeoff); payload = fnvlist_pack(nvl, &payload_len); drr->drr_payloadlen = payload_len; fnvlist_free(nvl); } err = dump_record(dsp, payload, payload_len); fnvlist_pack_free(payload, payload_len); if (err != 0) { err = dsp->dsa_err; goto out; } err = bqueue_init(&to_arg.q, zfs_send_queue_length, offsetof(struct send_block_record, ln)); to_arg.error_code = 0; to_arg.cancel = B_FALSE; to_arg.ds = to_ds; to_arg.fromtxg = fromtxg; to_arg.flags = TRAVERSE_PRE | TRAVERSE_PREFETCH; (void) thread_create(NULL, 0, send_traverse_thread, &to_arg, 0, curproc, TS_RUN, minclsyspri); to_data = bqueue_dequeue(&to_arg.q); while (!to_data->eos_marker && err == 0) { err = do_dump(dsp, to_data); to_data = get_next_record(&to_arg.q, to_data); if (issig(JUSTLOOKING) && issig(FORREAL)) err = EINTR; } if (err != 0) { to_arg.cancel = B_TRUE; while (!to_data->eos_marker) { to_data = get_next_record(&to_arg.q, to_data); } } kmem_free(to_data, sizeof (*to_data)); bqueue_destroy(&to_arg.q); if (err == 0 && to_arg.error_code != 0) err = to_arg.error_code; if (err != 0) goto out; if (dsp->dsa_pending_op != PENDING_NONE) if (dump_record(dsp, NULL, 0) != 0) err = SET_ERROR(EINTR); if (err != 0) { if (err == EINTR && dsp->dsa_err != 0) err = dsp->dsa_err; goto out; } bzero(drr, sizeof (dmu_replay_record_t)); drr->drr_type = DRR_END; drr->drr_u.drr_end.drr_checksum = dsp->dsa_zc; drr->drr_u.drr_end.drr_toguid = dsp->dsa_toguid; if (dump_record(dsp, NULL, 0) != 0) err = dsp->dsa_err; out: mutex_enter(&to_ds->ds_sendstream_lock); list_remove(&to_ds->ds_sendstreams, dsp); mutex_exit(&to_ds->ds_sendstream_lock); VERIFY(err != 0 || (dsp->dsa_sent_begin && dsp->dsa_sent_end)); kmem_free(drr, sizeof (dmu_replay_record_t)); kmem_free(dsp, sizeof (dmu_sendarg_t)); dsl_dataset_long_rele(to_ds, FTAG); return (err); } int dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap, boolean_t embedok, boolean_t large_block_ok, boolean_t compressok, int outfd, vnode_t *vp, offset_t *off) { dsl_pool_t *dp; dsl_dataset_t *ds; dsl_dataset_t *fromds = NULL; int err; err = dsl_pool_hold(pool, FTAG, &dp); if (err != 0) return (err); err = dsl_dataset_hold_obj(dp, tosnap, FTAG, &ds); if (err != 0) { dsl_pool_rele(dp, FTAG); return (err); } if (fromsnap != 0) { zfs_bookmark_phys_t zb; boolean_t is_clone; err = dsl_dataset_hold_obj(dp, fromsnap, FTAG, &fromds); if (err != 0) { dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); return (err); } if (!dsl_dataset_is_before(ds, fromds, 0)) err = SET_ERROR(EXDEV); zb.zbm_creation_time = dsl_dataset_phys(fromds)->ds_creation_time; zb.zbm_creation_txg = dsl_dataset_phys(fromds)->ds_creation_txg; zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid; is_clone = (fromds->ds_dir != ds->ds_dir); dsl_dataset_rele(fromds, FTAG); err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, embedok, large_block_ok, compressok, outfd, 0, 0, vp, off); } else { err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, embedok, large_block_ok, compressok, outfd, 0, 0, vp, off); } dsl_dataset_rele(ds, FTAG); return (err); } int dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok, boolean_t large_block_ok, boolean_t compressok, int outfd, uint64_t resumeobj, uint64_t resumeoff, vnode_t *vp, offset_t *off) { dsl_pool_t *dp; dsl_dataset_t *ds; int err; boolean_t owned = B_FALSE; if (fromsnap != NULL && strpbrk(fromsnap, "@#") == NULL) return (SET_ERROR(EINVAL)); err = 
dsl_pool_hold(tosnap, FTAG, &dp); if (err != 0) return (err); if (strchr(tosnap, '@') == NULL && spa_writeable(dp->dp_spa)) { /* * We are sending a filesystem or volume. Ensure * that it doesn't change by owning the dataset. */ err = dsl_dataset_own(dp, tosnap, FTAG, &ds); owned = B_TRUE; } else { err = dsl_dataset_hold(dp, tosnap, FTAG, &ds); } if (err != 0) { dsl_pool_rele(dp, FTAG); return (err); } if (fromsnap != NULL) { zfs_bookmark_phys_t zb; boolean_t is_clone = B_FALSE; int fsnamelen = strchr(tosnap, '@') - tosnap; /* * If the fromsnap is in a different filesystem, then * mark the send stream as a clone. */ if (strncmp(tosnap, fromsnap, fsnamelen) != 0 || (fromsnap[fsnamelen] != '@' && fromsnap[fsnamelen] != '#')) { is_clone = B_TRUE; } if (strchr(fromsnap, '@')) { dsl_dataset_t *fromds; err = dsl_dataset_hold(dp, fromsnap, FTAG, &fromds); if (err == 0) { if (!dsl_dataset_is_before(ds, fromds, 0)) err = SET_ERROR(EXDEV); zb.zbm_creation_time = dsl_dataset_phys(fromds)->ds_creation_time; zb.zbm_creation_txg = dsl_dataset_phys(fromds)->ds_creation_txg; zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid; is_clone = (ds->ds_dir != fromds->ds_dir); dsl_dataset_rele(fromds, FTAG); } } else { err = dsl_bookmark_lookup(dp, fromsnap, ds, &zb); } if (err != 0) { dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); return (err); } err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, embedok, large_block_ok, compressok, outfd, resumeobj, resumeoff, vp, off); } else { err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, embedok, large_block_ok, compressok, outfd, resumeobj, resumeoff, vp, off); } if (owned) dsl_dataset_disown(ds, FTAG); else dsl_dataset_rele(ds, FTAG); return (err); } static int dmu_adjust_send_estimate_for_indirects(dsl_dataset_t *ds, uint64_t uncompressed, uint64_t compressed, boolean_t stream_compressed, uint64_t *sizep) { int err; uint64_t size; /* * Assume that space (both on-disk and in-stream) is dominated by * data. We will adjust for indirect blocks and the copies property, * but ignore per-object space used (eg, dnodes and DRR_OBJECT records). */ uint64_t recordsize; uint64_t record_count; /* Assume all (uncompressed) blocks are recordsize. */ err = dsl_prop_get_int_ds(ds, zfs_prop_to_name(ZFS_PROP_RECORDSIZE), &recordsize); if (err != 0) return (err); record_count = uncompressed / recordsize; /* * If we're estimating a send size for a compressed stream, use the * compressed data size to estimate the stream size. Otherwise, use the * uncompressed data size. */ size = stream_compressed ? compressed : uncompressed; /* * Subtract out approximate space used by indirect blocks. * Assume most space is used by data blocks (non-indirect, non-dnode). * Assume no ditto blocks or internal fragmentation. * * Therefore, space used by indirect blocks is sizeof(blkptr_t) per * block. */ size -= record_count * sizeof (blkptr_t); /* Add in the space for the record associated with each block. */ size += record_count * sizeof (dmu_replay_record_t); *sizep = size; return (0); } int dmu_send_estimate(dsl_dataset_t *ds, dsl_dataset_t *fromds, boolean_t stream_compressed, uint64_t *sizep) { int err; uint64_t uncomp, comp; ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); /* tosnap must be a snapshot */ if (!ds->ds_is_snapshot) return (SET_ERROR(EINVAL)); /* fromsnap, if provided, must be a snapshot */ if (fromds != NULL && !fromds->ds_is_snapshot) return (SET_ERROR(EINVAL)); /* * fromsnap must be an earlier snapshot from the same fs as tosnap, * or the origin's fs. 
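 *
 * For scale on the adjustment made above: 1 GiB of changed data at
 * the default 128K recordsize is 8192 records; the estimate drops
 * 8192 * sizeof (blkptr_t) (1 MiB of indirect blocks) and adds
 * 8192 * sizeof (dmu_replay_record_t) of stream headers, so it moves
 * by only a few MiB either way.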
*/ if (fromds != NULL && !dsl_dataset_is_before(ds, fromds, 0)) return (SET_ERROR(EXDEV)); /* Get compressed and uncompressed size estimates of changed data. */ if (fromds == NULL) { uncomp = dsl_dataset_phys(ds)->ds_uncompressed_bytes; comp = dsl_dataset_phys(ds)->ds_compressed_bytes; } else { uint64_t used; err = dsl_dataset_space_written(fromds, ds, &used, &comp, &uncomp); if (err != 0) return (err); } err = dmu_adjust_send_estimate_for_indirects(ds, uncomp, comp, stream_compressed, sizep); return (err); } struct calculate_send_arg { uint64_t uncompressed; uint64_t compressed; }; /* * Simple callback used to traverse the blocks of a snapshot and sum their * uncompressed and compressed sizes. */ /* ARGSUSED */ static int dmu_calculate_send_traversal(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) { struct calculate_send_arg *space = arg; if (bp != NULL && !BP_IS_HOLE(bp)) { space->uncompressed += BP_GET_UCSIZE(bp); space->compressed += BP_GET_PSIZE(bp); } return (0); } /* * Given a destination snapshot and a TXG, calculate the approximate size of a * send stream sent from that TXG. from_txg may be zero, indicating that the * whole snapshot will be sent. */ int dmu_send_estimate_from_txg(dsl_dataset_t *ds, uint64_t from_txg, boolean_t stream_compressed, uint64_t *sizep) { int err; struct calculate_send_arg size = { 0 }; ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); /* tosnap must be a snapshot */ if (!dsl_dataset_is_snapshot(ds)) return (SET_ERROR(EINVAL)); /* verify that from_txg is before the provided snapshot was taken */ if (from_txg >= dsl_dataset_phys(ds)->ds_creation_txg) { return (SET_ERROR(EXDEV)); } /* * traverse the blocks of the snapshot with birth times after * from_txg, summing their uncompressed and compressed sizes */ err = traverse_dataset(ds, from_txg, TRAVERSE_POST, dmu_calculate_send_traversal, &size); if (err) return (err); err = dmu_adjust_send_estimate_for_indirects(ds, size.uncompressed, size.compressed, stream_compressed, sizep); return (err); } typedef struct dmu_recv_begin_arg { const char *drba_origin; dmu_recv_cookie_t *drba_cookie; cred_t *drba_cred; uint64_t drba_snapobj; } dmu_recv_begin_arg_t; static int recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds, uint64_t fromguid) { uint64_t val; int error; dsl_pool_t *dp = ds->ds_dir->dd_pool; /* temporary clone name must not exist */ error = zap_lookup(dp->dp_meta_objset, dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, recv_clone_name, 8, 1, &val); if (error != ENOENT) return (error == 0 ? EBUSY : error); /* new snapshot name must not exist */ error = zap_lookup(dp->dp_meta_objset, dsl_dataset_phys(ds)->ds_snapnames_zapobj, drba->drba_cookie->drc_tosnap, 8, 1, &val); if (error != ENOENT) return (error == 0 ? EEXIST : error); /* * Check snapshot limit before receiving. We'll recheck again at the * end, but might as well abort before receiving if we're already over * the limit. * * Note that we do not check the file system limit with * dsl_dir_fscount_check because the temporary %clones don't count * against that limit. */ error = dsl_fs_ss_limit_check(ds->ds_dir, 1, ZFS_PROP_SNAPSHOT_LIMIT, NULL, drba->drba_cred); if (error != 0) return (error); if (fromguid != 0) { dsl_dataset_t *snap; uint64_t obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; /* Find snapshot in this dir that matches fromguid.
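 *
 * (The loop below follows ds_prev_snap_obj links, so for a chain
 * fs@a -> fs@b -> fs@c with ds == fs it checks @c, then @b, then @a,
 * stopping at the first GUID match.)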
*/ while (obj != 0) { error = dsl_dataset_hold_obj(dp, obj, FTAG, &snap); if (error != 0) return (SET_ERROR(ENODEV)); if (snap->ds_dir != ds->ds_dir) { dsl_dataset_rele(snap, FTAG); return (SET_ERROR(ENODEV)); } if (dsl_dataset_phys(snap)->ds_guid == fromguid) break; obj = dsl_dataset_phys(snap)->ds_prev_snap_obj; dsl_dataset_rele(snap, FTAG); } if (obj == 0) return (SET_ERROR(ENODEV)); if (drba->drba_cookie->drc_force) { drba->drba_snapobj = obj; } else { /* * If we are not forcing, there must be no * changes since fromsnap. */ if (dsl_dataset_modified_since_snap(ds, snap)) { dsl_dataset_rele(snap, FTAG); return (SET_ERROR(ETXTBSY)); } drba->drba_snapobj = ds->ds_prev->ds_object; } dsl_dataset_rele(snap, FTAG); } else { /* if full, then must be forced */ if (!drba->drba_cookie->drc_force) return (SET_ERROR(EEXIST)); /* start from $ORIGIN@$ORIGIN, if supported */ drba->drba_snapobj = dp->dp_origin_snap != NULL ? dp->dp_origin_snap->ds_object : 0; } return (0); } static int dmu_recv_begin_check(void *arg, dmu_tx_t *tx) { dmu_recv_begin_arg_t *drba = arg; dsl_pool_t *dp = dmu_tx_pool(tx); struct drr_begin *drrb = drba->drba_cookie->drc_drrb; uint64_t fromguid = drrb->drr_fromguid; int flags = drrb->drr_flags; int error; uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); dsl_dataset_t *ds; const char *tofs = drba->drba_cookie->drc_tofs; /* already checked */ ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC); ASSERT(!(featureflags & DMU_BACKUP_FEATURE_RESUMING)); if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == DMU_COMPOUNDSTREAM || drrb->drr_type >= DMU_OST_NUMTYPES || ((flags & DRR_FLAG_CLONE) && drba->drba_origin == NULL)) return (SET_ERROR(EINVAL)); /* Verify pool version supports SA if SA_SPILL feature set */ if ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) && spa_version(dp->dp_spa) < SPA_VERSION_SA) return (SET_ERROR(ENOTSUP)); if (drba->drba_cookie->drc_resumable && !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EXTENSIBLE_DATASET)) return (SET_ERROR(ENOTSUP)); /* * The receiving code doesn't know how to translate a WRITE_EMBEDDED * record to a plain WRITE record, so the pool must have the * EMBEDDED_DATA feature enabled if the stream has WRITE_EMBEDDED * records. Same with WRITE_EMBEDDED records that use LZ4 compression. */ if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) && !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) return (SET_ERROR(ENOTSUP)); if ((featureflags & DMU_BACKUP_FEATURE_LZ4) && !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) return (SET_ERROR(ENOTSUP)); /* * The receiving code doesn't know how to translate large blocks * to smaller ones, so the pool must have the LARGE_BLOCKS * feature enabled if the stream has LARGE_BLOCKS. */ if ((featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) && !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS)) return (SET_ERROR(ENOTSUP)); /* * The receiving code doesn't know how to translate large dnodes * to smaller ones, so the pool must have the LARGE_DNODE * feature enabled if the stream has LARGE_DNODE. 
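 *
 * Each of these gates follows the same shape:
 *
 *     if ((featureflags & DMU_BACKUP_FEATURE_X) &&
 *         !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_X))
 *             return (SET_ERROR(ENOTSUP));
 *
 * (X is a placeholder), so an incapable pool refuses the stream up
 * front instead of failing partway through the receive.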
*/ if ((featureflags & DMU_BACKUP_FEATURE_LARGE_DNODE) && !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_DNODE)) return (SET_ERROR(ENOTSUP)); error = dsl_dataset_hold(dp, tofs, FTAG, &ds); if (error == 0) { /* target fs already exists; recv into temp clone */ /* Can't recv a clone into an existing fs */ if (flags & DRR_FLAG_CLONE || drba->drba_origin) { dsl_dataset_rele(ds, FTAG); return (SET_ERROR(EINVAL)); } error = recv_begin_check_existing_impl(drba, ds, fromguid); dsl_dataset_rele(ds, FTAG); } else if (error == ENOENT) { /* target fs does not exist; must be a full backup or clone */ char buf[ZFS_MAX_DATASET_NAME_LEN]; /* * If it's a non-clone incremental, we are missing the * target fs, so fail the recv. */ if (fromguid != 0 && !(flags & DRR_FLAG_CLONE || drba->drba_origin)) return (SET_ERROR(ENOENT)); /* * If we're receiving a full send as a clone, and it doesn't * contain all the necessary free records and freeobject * records, reject it. */ if (fromguid == 0 && drba->drba_origin && !(flags & DRR_FLAG_FREERECORDS)) return (SET_ERROR(EINVAL)); /* Open the parent of tofs */ ASSERT3U(strlen(tofs), <, sizeof (buf)); (void) strlcpy(buf, tofs, strrchr(tofs, '/') - tofs + 1); error = dsl_dataset_hold(dp, buf, FTAG, &ds); if (error != 0) return (error); /* * Check filesystem and snapshot limits before receiving. We'll * recheck snapshot limits again at the end (we create the * filesystems and increment those counts during begin_sync). */ error = dsl_fs_ss_limit_check(ds->ds_dir, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL, drba->drba_cred); if (error != 0) { dsl_dataset_rele(ds, FTAG); return (error); } error = dsl_fs_ss_limit_check(ds->ds_dir, 1, ZFS_PROP_SNAPSHOT_LIMIT, NULL, drba->drba_cred); if (error != 0) { dsl_dataset_rele(ds, FTAG); return (error); } if (drba->drba_origin != NULL) { dsl_dataset_t *origin; error = dsl_dataset_hold(dp, drba->drba_origin, FTAG, &origin); if (error != 0) { dsl_dataset_rele(ds, FTAG); return (error); } if (!origin->ds_is_snapshot) { dsl_dataset_rele(origin, FTAG); dsl_dataset_rele(ds, FTAG); return (SET_ERROR(EINVAL)); } if (dsl_dataset_phys(origin)->ds_guid != fromguid && fromguid != 0) { dsl_dataset_rele(origin, FTAG); dsl_dataset_rele(ds, FTAG); return (SET_ERROR(ENODEV)); } dsl_dataset_rele(origin, FTAG); } dsl_dataset_rele(ds, FTAG); error = 0; } return (error); } static void dmu_recv_begin_sync(void *arg, dmu_tx_t *tx) { dmu_recv_begin_arg_t *drba = arg; dsl_pool_t *dp = dmu_tx_pool(tx); objset_t *mos = dp->dp_meta_objset; struct drr_begin *drrb = drba->drba_cookie->drc_drrb; const char *tofs = drba->drba_cookie->drc_tofs; dsl_dataset_t *ds, *newds; uint64_t dsobj; int error; uint64_t crflags = 0; if (drrb->drr_flags & DRR_FLAG_CI_DATA) crflags |= DS_FLAG_CI_DATASET; error = dsl_dataset_hold(dp, tofs, FTAG, &ds); if (error == 0) { /* create temporary clone */ dsl_dataset_t *snap = NULL; if (drba->drba_snapobj != 0) { VERIFY0(dsl_dataset_hold_obj(dp, drba->drba_snapobj, FTAG, &snap)); } dsobj = dsl_dataset_create_sync(ds->ds_dir, recv_clone_name, snap, crflags, drba->drba_cred, tx); if (drba->drba_snapobj != 0) dsl_dataset_rele(snap, FTAG); dsl_dataset_rele(ds, FTAG); } else { dsl_dir_t *dd; const char *tail; dsl_dataset_t *origin = NULL; VERIFY0(dsl_dir_hold(dp, tofs, FTAG, &dd, &tail)); if (drba->drba_origin != NULL) { VERIFY0(dsl_dataset_hold(dp, drba->drba_origin, FTAG, &origin)); } /* Create new dataset. 
*/ dsobj = dsl_dataset_create_sync(dd, strrchr(tofs, '/') + 1, origin, crflags, drba->drba_cred, tx); if (origin != NULL) dsl_dataset_rele(origin, FTAG); dsl_dir_rele(dd, FTAG); drba->drba_cookie->drc_newfs = B_TRUE; } VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &newds)); if (drba->drba_cookie->drc_resumable) { uint64_t one = 1; uint64_t zero = 0; dsl_dataset_zapify(newds, tx); if (drrb->drr_fromguid != 0) { VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_FROMGUID, 8, 1, &drrb->drr_fromguid, tx)); } VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_TOGUID, 8, 1, &drrb->drr_toguid, tx)); VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_TONAME, 1, strlen(drrb->drr_toname) + 1, drrb->drr_toname, tx)); VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_OBJECT, 8, 1, &one, tx)); VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_OFFSET, 8, 1, &zero, tx)); VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_BYTES, 8, 1, &zero, tx)); if (DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & DMU_BACKUP_FEATURE_LARGE_BLOCKS) { VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_LARGEBLOCK, 8, 1, &one, tx)); } if (DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & DMU_BACKUP_FEATURE_EMBED_DATA) { VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_EMBEDOK, 8, 1, &one, tx)); } if (DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & DMU_BACKUP_FEATURE_COMPRESSED) { VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_COMPRESSOK, 8, 1, &one, tx)); } } dmu_buf_will_dirty(newds->ds_dbuf, tx); dsl_dataset_phys(newds)->ds_flags |= DS_FLAG_INCONSISTENT; /* * If we actually created a non-clone, we need to create the * objset in our new dataset. */ if (BP_IS_HOLE(dsl_dataset_get_blkptr(newds))) { (void) dmu_objset_create_impl(dp->dp_spa, newds, dsl_dataset_get_blkptr(newds), drrb->drr_type, tx); } drba->drba_cookie->drc_ds = newds; spa_history_log_internal_ds(newds, "receive", tx, ""); } static int dmu_recv_resume_begin_check(void *arg, dmu_tx_t *tx) { dmu_recv_begin_arg_t *drba = arg; dsl_pool_t *dp = dmu_tx_pool(tx); struct drr_begin *drrb = drba->drba_cookie->drc_drrb; int error; uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); dsl_dataset_t *ds; const char *tofs = drba->drba_cookie->drc_tofs; uint64_t val; /* 6 extra bytes for /%recv */ char recvname[ZFS_MAX_DATASET_NAME_LEN + 6]; /* already checked */ ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC); ASSERT(featureflags & DMU_BACKUP_FEATURE_RESUMING); if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == DMU_COMPOUNDSTREAM || drrb->drr_type >= DMU_OST_NUMTYPES) return (SET_ERROR(EINVAL)); /* Verify pool version supports SA if SA_SPILL feature set */ if ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) && spa_version(dp->dp_spa) < SPA_VERSION_SA) return (SET_ERROR(ENOTSUP)); /* * The receiving code doesn't know how to translate a WRITE_EMBEDDED * record to a plain WRITE record, so the pool must have the * EMBEDDED_DATA feature enabled if the stream has WRITE_EMBEDDED * records. Same with WRITE_EMBEDDED records that use LZ4 compression. 
*/ if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) && !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) return (SET_ERROR(ENOTSUP)); if ((featureflags & DMU_BACKUP_FEATURE_LZ4) && !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) return (SET_ERROR(ENOTSUP)); (void) snprintf(recvname, sizeof (recvname), "%s/%s", tofs, recv_clone_name); if (dsl_dataset_hold(dp, recvname, FTAG, &ds) != 0) { /* %recv does not exist; continue in tofs */ error = dsl_dataset_hold(dp, tofs, FTAG, &ds); if (error != 0) return (error); } /* check that ds is marked inconsistent */ if (!DS_IS_INCONSISTENT(ds)) { dsl_dataset_rele(ds, FTAG); return (SET_ERROR(EINVAL)); } /* check that there is resuming data, and that the toguid matches */ if (!dsl_dataset_is_zapified(ds)) { dsl_dataset_rele(ds, FTAG); return (SET_ERROR(EINVAL)); } error = zap_lookup(dp->dp_meta_objset, ds->ds_object, DS_FIELD_RESUME_TOGUID, sizeof (val), 1, &val); if (error != 0 || drrb->drr_toguid != val) { dsl_dataset_rele(ds, FTAG); return (SET_ERROR(EINVAL)); } /* * Check if the receive is still running. If so, it will be owned. * Note that nothing else can own the dataset (e.g. after the receive * fails) because it will be marked inconsistent. */ if (dsl_dataset_has_owner(ds)) { dsl_dataset_rele(ds, FTAG); return (SET_ERROR(EBUSY)); } /* There should not be any snapshots of this fs yet. */ if (ds->ds_prev != NULL && ds->ds_prev->ds_dir == ds->ds_dir) { dsl_dataset_rele(ds, FTAG); return (SET_ERROR(EINVAL)); } /* * Note: resume point will be checked when we process the first WRITE * record. */ /* check that the origin matches */ val = 0; (void) zap_lookup(dp->dp_meta_objset, ds->ds_object, DS_FIELD_RESUME_FROMGUID, sizeof (val), 1, &val); if (drrb->drr_fromguid != val) { dsl_dataset_rele(ds, FTAG); return (SET_ERROR(EINVAL)); } dsl_dataset_rele(ds, FTAG); return (0); } static void dmu_recv_resume_begin_sync(void *arg, dmu_tx_t *tx) { dmu_recv_begin_arg_t *drba = arg; dsl_pool_t *dp = dmu_tx_pool(tx); const char *tofs = drba->drba_cookie->drc_tofs; dsl_dataset_t *ds; uint64_t dsobj; /* 6 extra bytes for /%recv */ char recvname[ZFS_MAX_DATASET_NAME_LEN + 6]; (void) snprintf(recvname, sizeof (recvname), "%s/%s", tofs, recv_clone_name); if (dsl_dataset_hold(dp, recvname, FTAG, &ds) != 0) { /* %recv does not exist; continue in tofs */ VERIFY0(dsl_dataset_hold(dp, tofs, FTAG, &ds)); drba->drba_cookie->drc_newfs = B_TRUE; } /* clear the inconsistent flag so that we can own it */ ASSERT(DS_IS_INCONSISTENT(ds)); dmu_buf_will_dirty(ds->ds_dbuf, tx); dsl_dataset_phys(ds)->ds_flags &= ~DS_FLAG_INCONSISTENT; dsobj = ds->ds_object; dsl_dataset_rele(ds, FTAG); VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &ds)); dmu_buf_will_dirty(ds->ds_dbuf, tx); dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_INCONSISTENT; ASSERT(!BP_IS_HOLE(dsl_dataset_get_blkptr(ds))); drba->drba_cookie->drc_ds = ds; spa_history_log_internal_ds(ds, "resume receive", tx, ""); } /* * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin() * succeeds; otherwise we will leak the holds on the datasets. 
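 *
 * i.e. the expected shape of a caller is (the dmu_recv_stream()
 * argument list is abbreviated here, not its authoritative
 * prototype):
 *
 *     dmu_recv_cookie_t drc;
 *     err = dmu_recv_begin(tofs, tosnap, drr_begin, force,
 *         resumable, origin, &drc);
 *     if (err == 0)
 *             err = dmu_recv_stream(&drc, ...);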
*/ int dmu_recv_begin(char *tofs, char *tosnap, dmu_replay_record_t *drr_begin, boolean_t force, boolean_t resumable, char *origin, dmu_recv_cookie_t *drc) { dmu_recv_begin_arg_t drba = { 0 }; bzero(drc, sizeof (dmu_recv_cookie_t)); drc->drc_drr_begin = drr_begin; drc->drc_drrb = &drr_begin->drr_u.drr_begin; drc->drc_tosnap = tosnap; drc->drc_tofs = tofs; drc->drc_force = force; drc->drc_resumable = resumable; drc->drc_cred = CRED(); if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) { drc->drc_byteswap = B_TRUE; (void) fletcher_4_incremental_byteswap(drr_begin, sizeof (dmu_replay_record_t), &drc->drc_cksum); byteswap_record(drr_begin); } else if (drc->drc_drrb->drr_magic == DMU_BACKUP_MAGIC) { (void) fletcher_4_incremental_native(drr_begin, sizeof (dmu_replay_record_t), &drc->drc_cksum); } else { return (SET_ERROR(EINVAL)); } drba.drba_origin = origin; drba.drba_cookie = drc; drba.drba_cred = CRED(); if (DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo) & DMU_BACKUP_FEATURE_RESUMING) { return (dsl_sync_task(tofs, dmu_recv_resume_begin_check, dmu_recv_resume_begin_sync, &drba, 5, ZFS_SPACE_CHECK_NORMAL)); } else { return (dsl_sync_task(tofs, dmu_recv_begin_check, dmu_recv_begin_sync, &drba, 5, ZFS_SPACE_CHECK_NORMAL)); } } struct receive_record_arg { dmu_replay_record_t header; void *payload; /* Pointer to a buffer containing the payload */ /* * If the record is a write, pointer to the arc_buf_t containing the * payload. */ arc_buf_t *write_buf; int payload_size; uint64_t bytes_read; /* bytes read from stream when record created */ boolean_t eos_marker; /* Marks the end of the stream */ bqueue_node_t node; }; struct receive_writer_arg { objset_t *os; boolean_t byteswap; bqueue_t q; /* * These three args are used to signal to the main thread that we're * done. */ kmutex_t mutex; kcondvar_t cv; boolean_t done; int err; /* A map from guid to dataset to help handle dedup'd streams. */ avl_tree_t *guid_to_ds_map; boolean_t resumable; uint64_t last_object, last_offset; uint64_t bytes_read; /* bytes read when current record created */ }; struct objlist { list_t list; /* List of struct receive_objnode. */ /* * Last object looked up. Used to assert that objects are being looked * up in ascending order. */ uint64_t last_lookup; }; struct receive_objnode { list_node_t node; uint64_t object; }; struct receive_arg { objset_t *os; vnode_t *vp; /* The vnode to read the stream from */ uint64_t voff; /* The current offset in the stream */ uint64_t bytes_read; /* * A record that has had its payload read in, but hasn't yet been handed * off to the worker thread. */ struct receive_record_arg *rrd; /* A record that has had its header read in, but not its payload. */ struct receive_record_arg *next_rrd; zio_cksum_t cksum; zio_cksum_t prev_cksum; int err; boolean_t byteswap; /* Sorted list of objects not to issue prefetches for. 
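 * Entries are added from incoming DRR_OBJECT records (see
 * receive_read_record()) and consulted in ascending object order by
 * receive_read_prefetch(). */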
*/ struct objlist ignore_objlist; }; typedef struct guid_map_entry { uint64_t guid; dsl_dataset_t *gme_ds; avl_node_t avlnode; } guid_map_entry_t; static int guid_compare(const void *arg1, const void *arg2) { const guid_map_entry_t *gmep1 = (const guid_map_entry_t *)arg1; const guid_map_entry_t *gmep2 = (const guid_map_entry_t *)arg2; return (AVL_CMP(gmep1->guid, gmep2->guid)); } static void free_guid_map_onexit(void *arg) { avl_tree_t *ca = arg; void *cookie = NULL; guid_map_entry_t *gmep; while ((gmep = avl_destroy_nodes(ca, &cookie)) != NULL) { dsl_dataset_long_rele(gmep->gme_ds, gmep); dsl_dataset_rele(gmep->gme_ds, gmep); kmem_free(gmep, sizeof (guid_map_entry_t)); } avl_destroy(ca); kmem_free(ca, sizeof (avl_tree_t)); } static int receive_read(struct receive_arg *ra, int len, void *buf) { int done = 0; /* * The code doesn't rely on this (lengths being multiples of 8). See * comment in dump_bytes. */ ASSERT0(len % 8); while (done < len) { ssize_t resid; ra->err = vn_rdwr(UIO_READ, ra->vp, (char *)buf + done, len - done, ra->voff, UIO_SYSSPACE, FAPPEND, RLIM64_INFINITY, CRED(), &resid); if (resid == len - done) { /* * Note: ECKSUM indicates that the receive * was interrupted and can potentially be resumed. */ ra->err = SET_ERROR(ECKSUM); } ra->voff += len - done - resid; done = len - resid; if (ra->err != 0) return (ra->err); } ra->bytes_read += len; ASSERT3U(done, ==, len); return (0); } noinline static void byteswap_record(dmu_replay_record_t *drr) { #define DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X)) #define DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X)) drr->drr_type = BSWAP_32(drr->drr_type); drr->drr_payloadlen = BSWAP_32(drr->drr_payloadlen); switch (drr->drr_type) { case DRR_BEGIN: DO64(drr_begin.drr_magic); DO64(drr_begin.drr_versioninfo); DO64(drr_begin.drr_creation_time); DO32(drr_begin.drr_type); DO32(drr_begin.drr_flags); DO64(drr_begin.drr_toguid); DO64(drr_begin.drr_fromguid); break; case DRR_OBJECT: DO64(drr_object.drr_object); DO32(drr_object.drr_type); DO32(drr_object.drr_bonustype); DO32(drr_object.drr_blksz); DO32(drr_object.drr_bonuslen); DO64(drr_object.drr_toguid); break; case DRR_FREEOBJECTS: DO64(drr_freeobjects.drr_firstobj); DO64(drr_freeobjects.drr_numobjs); DO64(drr_freeobjects.drr_toguid); break; case DRR_WRITE: DO64(drr_write.drr_object); DO32(drr_write.drr_type); DO64(drr_write.drr_offset); DO64(drr_write.drr_logical_size); DO64(drr_write.drr_toguid); ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_write.drr_key.ddk_cksum); DO64(drr_write.drr_key.ddk_prop); DO64(drr_write.drr_compressed_size); break; case DRR_WRITE_BYREF: DO64(drr_write_byref.drr_object); DO64(drr_write_byref.drr_offset); DO64(drr_write_byref.drr_length); DO64(drr_write_byref.drr_toguid); DO64(drr_write_byref.drr_refguid); DO64(drr_write_byref.drr_refobject); DO64(drr_write_byref.drr_refoffset); ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_write_byref. 
drr_key.ddk_cksum); DO64(drr_write_byref.drr_key.ddk_prop); break; case DRR_WRITE_EMBEDDED: DO64(drr_write_embedded.drr_object); DO64(drr_write_embedded.drr_offset); DO64(drr_write_embedded.drr_length); DO64(drr_write_embedded.drr_toguid); DO32(drr_write_embedded.drr_lsize); DO32(drr_write_embedded.drr_psize); break; case DRR_FREE: DO64(drr_free.drr_object); DO64(drr_free.drr_offset); DO64(drr_free.drr_length); DO64(drr_free.drr_toguid); break; case DRR_SPILL: DO64(drr_spill.drr_object); DO64(drr_spill.drr_length); DO64(drr_spill.drr_toguid); break; case DRR_END: DO64(drr_end.drr_toguid); ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_end.drr_checksum); break; default: break; } if (drr->drr_type != DRR_BEGIN) { ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_checksum.drr_checksum); } #undef DO64 #undef DO32 } static inline uint8_t deduce_nblkptr(dmu_object_type_t bonus_type, uint64_t bonus_size) { if (bonus_type == DMU_OT_SA) { return (1); } else { return (1 + ((DN_OLD_MAX_BONUSLEN - MIN(DN_OLD_MAX_BONUSLEN, bonus_size)) >> SPA_BLKPTRSHIFT)); } } static void save_resume_state(struct receive_writer_arg *rwa, uint64_t object, uint64_t offset, dmu_tx_t *tx) { int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; if (!rwa->resumable) return; /* * We use ds_resume_bytes[] != 0 to indicate that we need to * update this on disk, so it must not be 0. */ ASSERT(rwa->bytes_read != 0); /* * We only resume from write records, which have a valid * (non-meta-dnode) object number. */ ASSERT(object != 0); /* * For resuming to work correctly, we must receive records in order, * sorted by object,offset. This is checked by the callers, but * assert it here for good measure. */ ASSERT3U(object, >=, rwa->os->os_dsl_dataset->ds_resume_object[txgoff]); ASSERT(object != rwa->os->os_dsl_dataset->ds_resume_object[txgoff] || offset >= rwa->os->os_dsl_dataset->ds_resume_offset[txgoff]); ASSERT3U(rwa->bytes_read, >=, rwa->os->os_dsl_dataset->ds_resume_bytes[txgoff]); rwa->os->os_dsl_dataset->ds_resume_object[txgoff] = object; rwa->os->os_dsl_dataset->ds_resume_offset[txgoff] = offset; rwa->os->os_dsl_dataset->ds_resume_bytes[txgoff] = rwa->bytes_read; } noinline static int receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, - void *data) + void *data) { dmu_object_info_t doi; dmu_tx_t *tx; uint64_t object; int err; if (drro->drr_type == DMU_OT_NONE || !DMU_OT_IS_VALID(drro->drr_type) || !DMU_OT_IS_VALID(drro->drr_bonustype) || drro->drr_checksumtype >= ZIO_CHECKSUM_FUNCTIONS || drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS || P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) || drro->drr_blksz < SPA_MINBLOCKSIZE || drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(rwa->os)) || drro->drr_bonuslen > DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(rwa->os)))) { return (SET_ERROR(EINVAL)); } err = dmu_object_info(rwa->os, drro->drr_object, &doi); if (err != 0 && err != ENOENT) return (SET_ERROR(EINVAL)); object = err == 0 ? drro->drr_object : DMU_NEW_OBJECT; /* * If we are losing blkptrs or changing the block size this must * be a new file instance. We must clear out the previous file * contents before we can change this type of metadata in the dnode. 
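 *
 * e.g. a stream object whose larger bonus buffer leaves room for
 * fewer block pointers than the on-disk dnode currently uses
 * (nblkptr < doi.doi_nblkptr below) first forces a
 * dmu_free_long_range() of the entire object.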
*/ if (err == 0) { int nblkptr; nblkptr = deduce_nblkptr(drro->drr_bonustype, drro->drr_bonuslen); if (drro->drr_blksz != doi.doi_data_block_size || nblkptr < doi.doi_nblkptr) { err = dmu_free_long_range(rwa->os, drro->drr_object, 0, DMU_OBJECT_END); if (err != 0) return (SET_ERROR(EINVAL)); } } tx = dmu_tx_create(rwa->os); dmu_tx_hold_bonus(tx, object); err = dmu_tx_assign(tx, TXG_WAIT); if (err != 0) { dmu_tx_abort(tx); return (err); } if (object == DMU_NEW_OBJECT) { /* currently free, want to be allocated */ err = dmu_object_claim_dnsize(rwa->os, drro->drr_object, drro->drr_type, drro->drr_blksz, drro->drr_bonustype, drro->drr_bonuslen, drro->drr_dn_slots << DNODE_SHIFT, tx); } else if (drro->drr_type != doi.doi_type || drro->drr_blksz != doi.doi_data_block_size || drro->drr_bonustype != doi.doi_bonus_type || drro->drr_bonuslen != doi.doi_bonus_size) { /* currently allocated, but with different properties */ err = dmu_object_reclaim(rwa->os, drro->drr_object, drro->drr_type, drro->drr_blksz, drro->drr_bonustype, drro->drr_bonuslen, tx); } if (err != 0) { dmu_tx_commit(tx); return (SET_ERROR(EINVAL)); } dmu_object_set_checksum(rwa->os, drro->drr_object, drro->drr_checksumtype, tx); dmu_object_set_compress(rwa->os, drro->drr_object, drro->drr_compress, tx); if (data != NULL) { dmu_buf_t *db; VERIFY0(dmu_bonus_hold(rwa->os, drro->drr_object, FTAG, &db)); dmu_buf_will_dirty(db, tx); ASSERT3U(db->db_size, >=, drro->drr_bonuslen); bcopy(data, db->db_data, drro->drr_bonuslen); if (rwa->byteswap) { dmu_object_byteswap_t byteswap = DMU_OT_BYTESWAP(drro->drr_bonustype); dmu_ot_byteswap[byteswap].ob_func(db->db_data, drro->drr_bonuslen); } dmu_buf_rele(db, FTAG); } dmu_tx_commit(tx); return (0); } /* ARGSUSED */ noinline static int receive_freeobjects(struct receive_writer_arg *rwa, struct drr_freeobjects *drrfo) { uint64_t obj; int next_err = 0; if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj) return (SET_ERROR(EINVAL)); for (obj = drrfo->drr_firstobj == 0 ? 1 : drrfo->drr_firstobj; obj < drrfo->drr_firstobj + drrfo->drr_numobjs && next_err == 0; next_err = dmu_object_next(rwa->os, &obj, FALSE, 0)) { dmu_object_info_t doi; int err; err = dmu_object_info(rwa->os, obj, &doi); if (err == ENOENT) { obj++; continue; } else if (err != 0) { return (err); } err = dmu_free_long_object(rwa->os, obj); if (err != 0) return (err); } if (next_err != ESRCH) return (next_err); return (0); } noinline static int receive_write(struct receive_writer_arg *rwa, struct drr_write *drrw, - arc_buf_t *abuf) + arc_buf_t *abuf) { dmu_tx_t *tx; dmu_buf_t *bonus; int err; if (drrw->drr_offset + drrw->drr_logical_size < drrw->drr_offset || !DMU_OT_IS_VALID(drrw->drr_type)) return (SET_ERROR(EINVAL)); /* * For resuming to work, records must be in increasing order * by (object, offset). 
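 *
 * e.g. once (object 7, offset 1M) has been received, (7, 512K) and
 * (6, anything) are rejected below, while (7, 1M) itself is accepted
 * again so a resumed stream can replay its last record.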
*/ if (drrw->drr_object < rwa->last_object || (drrw->drr_object == rwa->last_object && drrw->drr_offset < rwa->last_offset)) { return (SET_ERROR(EINVAL)); } rwa->last_object = drrw->drr_object; rwa->last_offset = drrw->drr_offset; if (dmu_object_info(rwa->os, drrw->drr_object, NULL) != 0) return (SET_ERROR(EINVAL)); tx = dmu_tx_create(rwa->os); dmu_tx_hold_write(tx, drrw->drr_object, drrw->drr_offset, drrw->drr_logical_size); err = dmu_tx_assign(tx, TXG_WAIT); if (err != 0) { dmu_tx_abort(tx); return (err); } if (rwa->byteswap) { dmu_object_byteswap_t byteswap = DMU_OT_BYTESWAP(drrw->drr_type); dmu_ot_byteswap[byteswap].ob_func(abuf->b_data, DRR_WRITE_PAYLOAD_SIZE(drrw)); } /* use the bonus buf to look up the dnode in dmu_assign_arcbuf */ if (dmu_bonus_hold(rwa->os, drrw->drr_object, FTAG, &bonus) != 0) return (SET_ERROR(EINVAL)); dmu_assign_arcbuf(bonus, drrw->drr_offset, abuf, tx); /* * Note: If the receive fails, we want the resume stream to start * with the same record that we last successfully received (as opposed * to the next record), so that we can verify that we are * resuming from the correct location. */ save_resume_state(rwa, drrw->drr_object, drrw->drr_offset, tx); dmu_tx_commit(tx); dmu_buf_rele(bonus, FTAG); return (0); } /* * Handle a DRR_WRITE_BYREF record. This record is used in dedup'ed * streams to refer to a copy of the data that is already on the * system because it came in earlier in the stream. This function * finds the earlier copy of the data, and uses that copy instead of * data from the stream to fulfill this write. */ static int receive_write_byref(struct receive_writer_arg *rwa, struct drr_write_byref *drrwbr) { dmu_tx_t *tx; int err; guid_map_entry_t gmesrch; guid_map_entry_t *gmep; avl_index_t where; objset_t *ref_os = NULL; dmu_buf_t *dbp; if (drrwbr->drr_offset + drrwbr->drr_length < drrwbr->drr_offset) return (SET_ERROR(EINVAL)); /* * If the GUID of the referenced dataset is different from the * GUID of the target dataset, find the referenced dataset. */ if (drrwbr->drr_toguid != drrwbr->drr_refguid) { gmesrch.guid = drrwbr->drr_refguid; if ((gmep = avl_find(rwa->guid_to_ds_map, &gmesrch, &where)) == NULL) { return (SET_ERROR(EINVAL)); } if (dmu_objset_from_ds(gmep->gme_ds, &ref_os)) return (SET_ERROR(EINVAL)); } else { ref_os = rwa->os; } err = dmu_buf_hold(ref_os, drrwbr->drr_refobject, drrwbr->drr_refoffset, FTAG, &dbp, DMU_READ_PREFETCH); if (err != 0) return (err); tx = dmu_tx_create(rwa->os); dmu_tx_hold_write(tx, drrwbr->drr_object, drrwbr->drr_offset, drrwbr->drr_length); err = dmu_tx_assign(tx, TXG_WAIT); if (err != 0) { dmu_tx_abort(tx); return (err); } dmu_write(rwa->os, drrwbr->drr_object, drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx); dmu_buf_rele(dbp, FTAG); /* See comment in restore_write. 
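 * (restore_write is receive_write() in this file's current naming;
 * the comment in question is the one above its save_resume_state()
 * call.)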
*/ save_resume_state(rwa, drrwbr->drr_object, drrwbr->drr_offset, tx); dmu_tx_commit(tx); return (0); } static int receive_write_embedded(struct receive_writer_arg *rwa, struct drr_write_embedded *drrwe, void *data) { dmu_tx_t *tx; int err; if (drrwe->drr_offset + drrwe->drr_length < drrwe->drr_offset) return (EINVAL); if (drrwe->drr_psize > BPE_PAYLOAD_SIZE) return (EINVAL); if (drrwe->drr_etype >= NUM_BP_EMBEDDED_TYPES) return (EINVAL); if (drrwe->drr_compression >= ZIO_COMPRESS_FUNCTIONS) return (EINVAL); tx = dmu_tx_create(rwa->os); dmu_tx_hold_write(tx, drrwe->drr_object, drrwe->drr_offset, drrwe->drr_length); err = dmu_tx_assign(tx, TXG_WAIT); if (err != 0) { dmu_tx_abort(tx); return (err); } dmu_write_embedded(rwa->os, drrwe->drr_object, drrwe->drr_offset, data, drrwe->drr_etype, drrwe->drr_compression, drrwe->drr_lsize, drrwe->drr_psize, rwa->byteswap ^ ZFS_HOST_BYTEORDER, tx); /* See comment in restore_write. */ save_resume_state(rwa, drrwe->drr_object, drrwe->drr_offset, tx); dmu_tx_commit(tx); return (0); } static int receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs, void *data) { dmu_tx_t *tx; dmu_buf_t *db, *db_spill; int err; if (drrs->drr_length < SPA_MINBLOCKSIZE || drrs->drr_length > spa_maxblocksize(dmu_objset_spa(rwa->os))) return (SET_ERROR(EINVAL)); if (dmu_object_info(rwa->os, drrs->drr_object, NULL) != 0) return (SET_ERROR(EINVAL)); VERIFY0(dmu_bonus_hold(rwa->os, drrs->drr_object, FTAG, &db)); if ((err = dmu_spill_hold_by_bonus(db, FTAG, &db_spill)) != 0) { dmu_buf_rele(db, FTAG); return (err); } tx = dmu_tx_create(rwa->os); dmu_tx_hold_spill(tx, db->db_object); err = dmu_tx_assign(tx, TXG_WAIT); if (err != 0) { dmu_buf_rele(db, FTAG); dmu_buf_rele(db_spill, FTAG); dmu_tx_abort(tx); return (err); } dmu_buf_will_dirty(db_spill, tx); if (db_spill->db_size < drrs->drr_length) VERIFY(0 == dbuf_spill_set_blksz(db_spill, drrs->drr_length, tx)); bcopy(data, db_spill->db_data, drrs->drr_length); dmu_buf_rele(db, FTAG); dmu_buf_rele(db_spill, FTAG); dmu_tx_commit(tx); return (0); } /* ARGSUSED */ noinline static int receive_free(struct receive_writer_arg *rwa, struct drr_free *drrf) { int err; if (drrf->drr_length != -1ULL && drrf->drr_offset + drrf->drr_length < drrf->drr_offset) return (SET_ERROR(EINVAL)); if (dmu_object_info(rwa->os, drrf->drr_object, NULL) != 0) return (SET_ERROR(EINVAL)); err = dmu_free_long_range(rwa->os, drrf->drr_object, drrf->drr_offset, drrf->drr_length); return (err); } /* used to destroy the drc_ds on error */ static void dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc) { if (drc->drc_resumable) { /* wait for our resume state to be written to disk */ txg_wait_synced(drc->drc_ds->ds_dir->dd_pool, 0); dsl_dataset_disown(drc->drc_ds, dmu_recv_tag); } else { char name[ZFS_MAX_DATASET_NAME_LEN]; dsl_dataset_name(drc->drc_ds, name); dsl_dataset_disown(drc->drc_ds, dmu_recv_tag); (void) dsl_destroy_head(name); } } static void receive_cksum(struct receive_arg *ra, int len, void *buf) { if (ra->byteswap) { (void) fletcher_4_incremental_byteswap(buf, len, &ra->cksum); } else { (void) fletcher_4_incremental_native(buf, len, &ra->cksum); } } /* * Read the payload into a buffer of size len, and update the current record's * payload field. * Allocate ra->next_rrd and read the next record's header into * ra->next_rrd->header. * Verify checksum of payload and next record. 
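 *
 * i.e. the reader stays one header ahead of the worker:
 *
 *     ra->rrd       record N: header + payload, ready to hand off
 *     ra->next_rrd  record N+1: header only, payload still unread
 *
 * and the checksum embedded in record N+1's header covers the whole
 * stream up to it, including record N's payload.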
*/ static int receive_read_payload_and_next_header(struct receive_arg *ra, int len, void *buf) { int err; zio_cksum_t cksum_orig; zio_cksum_t *cksump; if (len != 0) { ASSERT3U(len, <=, SPA_MAXBLOCKSIZE); err = receive_read(ra, len, buf); if (err != 0) return (err); receive_cksum(ra, len, buf); /* note: rrd is NULL when reading the begin record's payload */ if (ra->rrd != NULL) { ra->rrd->payload = buf; ra->rrd->payload_size = len; ra->rrd->bytes_read = ra->bytes_read; } } ra->prev_cksum = ra->cksum; ra->next_rrd = kmem_zalloc(sizeof (*ra->next_rrd), KM_SLEEP); err = receive_read(ra, sizeof (ra->next_rrd->header), &ra->next_rrd->header); ra->next_rrd->bytes_read = ra->bytes_read; if (err != 0) { kmem_free(ra->next_rrd, sizeof (*ra->next_rrd)); ra->next_rrd = NULL; return (err); } if (ra->next_rrd->header.drr_type == DRR_BEGIN) { kmem_free(ra->next_rrd, sizeof (*ra->next_rrd)); ra->next_rrd = NULL; return (SET_ERROR(EINVAL)); } /* * Note: checksum is of everything up to but not including the * checksum itself. */ ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t)); receive_cksum(ra, offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), &ra->next_rrd->header); cksum_orig = ra->next_rrd->header.drr_u.drr_checksum.drr_checksum; cksump = &ra->next_rrd->header.drr_u.drr_checksum.drr_checksum; if (ra->byteswap) byteswap_record(&ra->next_rrd->header); if ((!ZIO_CHECKSUM_IS_ZERO(cksump)) && !ZIO_CHECKSUM_EQUAL(ra->cksum, *cksump)) { kmem_free(ra->next_rrd, sizeof (*ra->next_rrd)); ra->next_rrd = NULL; return (SET_ERROR(ECKSUM)); } receive_cksum(ra, sizeof (cksum_orig), &cksum_orig); return (0); } static void objlist_create(struct objlist *list) { list_create(&list->list, sizeof (struct receive_objnode), offsetof(struct receive_objnode, node)); list->last_lookup = 0; } static void objlist_destroy(struct objlist *list) { struct receive_objnode *n; for (n = list_remove_head(&list->list); n != NULL; n = list_remove_head(&list->list)) { kmem_free(n, sizeof (*n)); } list_destroy(&list->list); } /* * This function looks through the objlist to see if the specified object number * is contained in the objlist. In the process, it will remove all object * numbers in the list that are smaller than the specified object number. Thus, * any lookup of an object number smaller than a previously looked up object * number will always return false; therefore, all lookups should be done in * ascending order. */ static boolean_t objlist_exists(struct objlist *list, uint64_t object) { struct receive_objnode *node = list_head(&list->list); ASSERT3U(object, >=, list->last_lookup); list->last_lookup = object; while (node != NULL && node->object < object) { VERIFY3P(node, ==, list_remove_head(&list->list)); kmem_free(node, sizeof (*node)); node = list_head(&list->list); } return (node != NULL && node->object == object); } /* * The objlist is a list of object numbers stored in ascending order. However, * the insertion of new object numbers does not seek out the correct location to * store a new object number; instead, it appends it to the list for simplicity. * Thus, any users must take care to only insert new object numbers in ascending * order. */ static void objlist_insert(struct objlist *list, uint64_t object) { struct receive_objnode *node = kmem_zalloc(sizeof (*node), KM_SLEEP); node->object = object; #ifdef ZFS_DEBUG { struct receive_objnode *last_object = list_tail(&list->list); uint64_t last_objnum = (last_object != NULL ? 
last_object->object : 0); ASSERT3U(node->object, >, last_objnum); } #endif list_insert_tail(&list->list, node); } /* * Issue the prefetch reads for any necessary indirect blocks. * * We use the object ignore list to tell us whether or not to issue prefetches * for a given object. We do this for both correctness (in case the blocksize * of an object has changed) and performance (if the object doesn't exist, don't * needlessly try to issue prefetches). We also trim the list as we go through * the stream to prevent it from growing to an unbounded size. * * The object numbers within will always be in sorted order, and any write * records we see will also be in sorted order, but they're not sorted with * respect to each other (i.e. we can get several object records before * receiving each object's write records). As a result, once we've reached a * given object number, we can safely remove any reference to lower object * numbers in the ignore list. In practice, we receive up to 32 object records * before receiving write records, so the list can have up to 32 nodes in it. */ /* ARGSUSED */ static void receive_read_prefetch(struct receive_arg *ra, uint64_t object, uint64_t offset, uint64_t length) { if (!objlist_exists(&ra->ignore_objlist, object)) { dmu_prefetch(ra->os, object, 1, offset, length, ZIO_PRIORITY_SYNC_READ); } } /* * Read records off the stream, issuing any necessary prefetches. */ static int receive_read_record(struct receive_arg *ra) { int err; switch (ra->rrd->header.drr_type) { case DRR_OBJECT: { struct drr_object *drro = &ra->rrd->header.drr_u.drr_object; uint32_t size = P2ROUNDUP(drro->drr_bonuslen, 8); void *buf = kmem_zalloc(size, KM_SLEEP); dmu_object_info_t doi; err = receive_read_payload_and_next_header(ra, size, buf); if (err != 0) { kmem_free(buf, size); return (err); } err = dmu_object_info(ra->os, drro->drr_object, &doi); /* * See receive_read_prefetch for an explanation why we're * storing this object in the ignore_obj_list. 
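 * (That is, the object either does not exist yet or its block size is
 * about to change, so a prefetch issued against the current on-disk
 * state would be wasted or wrong.)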
*/ if (err == ENOENT || (err == 0 && doi.doi_data_block_size != drro->drr_blksz)) { objlist_insert(&ra->ignore_objlist, drro->drr_object); err = 0; } return (err); } case DRR_FREEOBJECTS: { err = receive_read_payload_and_next_header(ra, 0, NULL); return (err); } case DRR_WRITE: { struct drr_write *drrw = &ra->rrd->header.drr_u.drr_write; arc_buf_t *abuf; boolean_t is_meta = DMU_OT_IS_METADATA(drrw->drr_type); if (DRR_WRITE_COMPRESSED(drrw)) { ASSERT3U(drrw->drr_compressed_size, >, 0); ASSERT3U(drrw->drr_logical_size, >=, drrw->drr_compressed_size); ASSERT(!is_meta); abuf = arc_loan_compressed_buf( dmu_objset_spa(ra->os), drrw->drr_compressed_size, drrw->drr_logical_size, drrw->drr_compressiontype); } else { abuf = arc_loan_buf(dmu_objset_spa(ra->os), is_meta, drrw->drr_logical_size); } err = receive_read_payload_and_next_header(ra, DRR_WRITE_PAYLOAD_SIZE(drrw), abuf->b_data); if (err != 0) { dmu_return_arcbuf(abuf); return (err); } ra->rrd->write_buf = abuf; receive_read_prefetch(ra, drrw->drr_object, drrw->drr_offset, drrw->drr_logical_size); return (err); } case DRR_WRITE_BYREF: { struct drr_write_byref *drrwb = &ra->rrd->header.drr_u.drr_write_byref; err = receive_read_payload_and_next_header(ra, 0, NULL); receive_read_prefetch(ra, drrwb->drr_object, drrwb->drr_offset, drrwb->drr_length); return (err); } case DRR_WRITE_EMBEDDED: { struct drr_write_embedded *drrwe = &ra->rrd->header.drr_u.drr_write_embedded; uint32_t size = P2ROUNDUP(drrwe->drr_psize, 8); void *buf = kmem_zalloc(size, KM_SLEEP); err = receive_read_payload_and_next_header(ra, size, buf); if (err != 0) { kmem_free(buf, size); return (err); } receive_read_prefetch(ra, drrwe->drr_object, drrwe->drr_offset, drrwe->drr_length); return (err); } case DRR_FREE: { /* * It might be beneficial to prefetch indirect blocks here, but * we don't really have the data to decide for sure. */ err = receive_read_payload_and_next_header(ra, 0, NULL); return (err); } case DRR_END: { struct drr_end *drre = &ra->rrd->header.drr_u.drr_end; if (!ZIO_CHECKSUM_EQUAL(ra->prev_cksum, drre->drr_checksum)) return (SET_ERROR(ECKSUM)); return (0); } case DRR_SPILL: { struct drr_spill *drrs = &ra->rrd->header.drr_u.drr_spill; void *buf = kmem_zalloc(drrs->drr_length, KM_SLEEP); err = receive_read_payload_and_next_header(ra, drrs->drr_length, buf); if (err != 0) kmem_free(buf, drrs->drr_length); return (err); } default: return (SET_ERROR(EINVAL)); } } /* * Commit the records to the pool. */ static int receive_process_record(struct receive_writer_arg *rwa, struct receive_record_arg *rrd) { int err; /* Processing in order, therefore bytes_read should be increasing. 
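 * (rrd->bytes_read was recorded by the reader thread as each record came
 * off the stream, and ra->bytes_read is seeded from DS_FIELD_RESUME_BYTES
 * when resuming a receive (see dmu_recv_stream() below), so this counter
 * must only ever move forward as records are applied.)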
*/ ASSERT3U(rrd->bytes_read, >=, rwa->bytes_read); rwa->bytes_read = rrd->bytes_read; switch (rrd->header.drr_type) { case DRR_OBJECT: { struct drr_object *drro = &rrd->header.drr_u.drr_object; err = receive_object(rwa, drro, rrd->payload); kmem_free(rrd->payload, rrd->payload_size); rrd->payload = NULL; return (err); } case DRR_FREEOBJECTS: { struct drr_freeobjects *drrfo = &rrd->header.drr_u.drr_freeobjects; return (receive_freeobjects(rwa, drrfo)); } case DRR_WRITE: { struct drr_write *drrw = &rrd->header.drr_u.drr_write; err = receive_write(rwa, drrw, rrd->write_buf); /* if receive_write() is successful, it consumes the arc_buf */ if (err != 0) dmu_return_arcbuf(rrd->write_buf); rrd->write_buf = NULL; rrd->payload = NULL; return (err); } case DRR_WRITE_BYREF: { struct drr_write_byref *drrwbr = &rrd->header.drr_u.drr_write_byref; return (receive_write_byref(rwa, drrwbr)); } case DRR_WRITE_EMBEDDED: { struct drr_write_embedded *drrwe = &rrd->header.drr_u.drr_write_embedded; err = receive_write_embedded(rwa, drrwe, rrd->payload); kmem_free(rrd->payload, rrd->payload_size); rrd->payload = NULL; return (err); } case DRR_FREE: { struct drr_free *drrf = &rrd->header.drr_u.drr_free; return (receive_free(rwa, drrf)); } case DRR_SPILL: { struct drr_spill *drrs = &rrd->header.drr_u.drr_spill; err = receive_spill(rwa, drrs, rrd->payload); kmem_free(rrd->payload, rrd->payload_size); rrd->payload = NULL; return (err); } default: return (SET_ERROR(EINVAL)); } } /* * dmu_recv_stream's worker thread; pull records off the queue, and then call * receive_process_record. When we're done, signal the main thread and exit. */ static void receive_writer_thread(void *arg) { struct receive_writer_arg *rwa = arg; struct receive_record_arg *rrd; fstrans_cookie_t cookie = spl_fstrans_mark(); for (rrd = bqueue_dequeue(&rwa->q); !rrd->eos_marker; rrd = bqueue_dequeue(&rwa->q)) { /* * If there's an error, the main thread will stop putting things * on the queue, but we need to clear everything in it before we * can exit. */ if (rwa->err == 0) { rwa->err = receive_process_record(rwa, rrd); } else if (rrd->write_buf != NULL) { dmu_return_arcbuf(rrd->write_buf); rrd->write_buf = NULL; rrd->payload = NULL; } else if (rrd->payload != NULL) { kmem_free(rrd->payload, rrd->payload_size); rrd->payload = NULL; } kmem_free(rrd, sizeof (*rrd)); } kmem_free(rrd, sizeof (*rrd)); mutex_enter(&rwa->mutex); rwa->done = B_TRUE; cv_signal(&rwa->cv); mutex_exit(&rwa->mutex); spl_fstrans_unmark(cookie); thread_exit(); } static int resume_check(struct receive_arg *ra, nvlist_t *begin_nvl) { uint64_t val; objset_t *mos = dmu_objset_pool(ra->os)->dp_meta_objset; uint64_t dsobj = dmu_objset_id(ra->os); uint64_t resume_obj, resume_off; if (nvlist_lookup_uint64(begin_nvl, "resume_object", &resume_obj) != 0 || nvlist_lookup_uint64(begin_nvl, "resume_offset", &resume_off) != 0) { return (SET_ERROR(EINVAL)); } VERIFY0(zap_lookup(mos, dsobj, DS_FIELD_RESUME_OBJECT, sizeof (val), 1, &val)); if (resume_obj != val) return (SET_ERROR(EINVAL)); VERIFY0(zap_lookup(mos, dsobj, DS_FIELD_RESUME_OFFSET, sizeof (val), 1, &val)); if (resume_off != val) return (SET_ERROR(EINVAL)); return (0); } /* * Read in the stream's records, one by one, and apply them to the pool. There * are two threads involved; the thread that calls this function will spin up a * worker thread, read the records off the stream one by one, and issue * prefetches for any necessary indirect blocks. It will then push the records * onto an internal blocking queue.
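 * The resulting pipeline is roughly (illustrative):
 *
 *	stream (vp) -> receive_read_record() + prefetches -> rwa->q
 *	rwa->q -> receive_writer_thread() -> receive_process_record()
 *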
The worker thread will pull the records off * the queue, and actually write the data into the DMU. This way, the worker * thread doesn't have to wait for reads to complete, since everything it needs * (the indirect blocks) will be prefetched. * * NB: callers *must* call dmu_recv_end() if this succeeds. */ int dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp, int cleanup_fd, uint64_t *action_handlep) { int err = 0; struct receive_arg *ra; struct receive_writer_arg *rwa; int featureflags; uint32_t payloadlen; void *payload; nvlist_t *begin_nvl = NULL; ra = kmem_zalloc(sizeof (*ra), KM_SLEEP); rwa = kmem_zalloc(sizeof (*rwa), KM_SLEEP); ra->byteswap = drc->drc_byteswap; ra->cksum = drc->drc_cksum; ra->vp = vp; ra->voff = *voffp; if (dsl_dataset_is_zapified(drc->drc_ds)) { (void) zap_lookup(drc->drc_ds->ds_dir->dd_pool->dp_meta_objset, drc->drc_ds->ds_object, DS_FIELD_RESUME_BYTES, sizeof (ra->bytes_read), 1, &ra->bytes_read); } objlist_create(&ra->ignore_objlist); /* these were verified in dmu_recv_begin */ ASSERT3U(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo), ==, DMU_SUBSTREAM); ASSERT3U(drc->drc_drrb->drr_type, <, DMU_OST_NUMTYPES); /* * Open the objset we are modifying. */ VERIFY0(dmu_objset_from_ds(drc->drc_ds, &ra->os)); ASSERT(dsl_dataset_phys(drc->drc_ds)->ds_flags & DS_FLAG_INCONSISTENT); featureflags = DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo); /* if this stream is dedup'ed, set up the avl tree for guid mapping */ if (featureflags & DMU_BACKUP_FEATURE_DEDUP) { minor_t minor; if (cleanup_fd == -1) { err = SET_ERROR(EBADF); goto out; } err = zfs_onexit_fd_hold(cleanup_fd, &minor); if (err != 0) { cleanup_fd = -1; goto out; } if (*action_handlep == 0) { rwa->guid_to_ds_map = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP); avl_create(rwa->guid_to_ds_map, guid_compare, sizeof (guid_map_entry_t), offsetof(guid_map_entry_t, avlnode)); err = zfs_onexit_add_cb(minor, free_guid_map_onexit, rwa->guid_to_ds_map, action_handlep); if (err != 0) goto out; } else { err = zfs_onexit_cb_data(minor, *action_handlep, (void **)&rwa->guid_to_ds_map); if (err != 0) goto out; } drc->drc_guid_to_ds_map = rwa->guid_to_ds_map; } payloadlen = drc->drc_drr_begin->drr_payloadlen; payload = NULL; if (payloadlen != 0) payload = kmem_alloc(payloadlen, KM_SLEEP); err = receive_read_payload_and_next_header(ra, payloadlen, payload); if (err != 0) { if (payloadlen != 0) kmem_free(payload, payloadlen); goto out; } if (payloadlen != 0) { err = nvlist_unpack(payload, payloadlen, &begin_nvl, KM_SLEEP); kmem_free(payload, payloadlen); if (err != 0) goto out; } if (featureflags & DMU_BACKUP_FEATURE_RESUMING) { err = resume_check(ra, begin_nvl); if (err != 0) goto out; } (void) bqueue_init(&rwa->q, zfs_recv_queue_length, offsetof(struct receive_record_arg, node)); cv_init(&rwa->cv, NULL, CV_DEFAULT, NULL); mutex_init(&rwa->mutex, NULL, MUTEX_DEFAULT, NULL); rwa->os = ra->os; rwa->byteswap = drc->drc_byteswap; rwa->resumable = drc->drc_resumable; (void) thread_create(NULL, 0, receive_writer_thread, rwa, 0, curproc, TS_RUN, minclsyspri); /* * We're reading rwa->err without locks, which is safe since we are the * only reader, and the worker thread is the only writer. It's ok if we * miss a write for an iteration or two of the loop, since the writer * thread will keep freeing records we send it until we send it an eos * marker. * * We can leave this loop in 3 ways: First, if rwa->err is * non-zero. In that case, the writer thread will free the rrd we just * pushed.
Second, if we're interrupted; in that case, either it's the * first loop and ra->rrd was never allocated, or it's later and ra->rrd * has been handed off to the writer thread who will free it. Finally, * if receive_read_record fails or we're at the end of the stream, then * we free ra->rrd and exit. */ while (rwa->err == 0) { if (issig(JUSTLOOKING) && issig(FORREAL)) { err = SET_ERROR(EINTR); break; } ASSERT3P(ra->rrd, ==, NULL); ra->rrd = ra->next_rrd; ra->next_rrd = NULL; /* Allocates and loads header into ra->next_rrd */ err = receive_read_record(ra); if (ra->rrd->header.drr_type == DRR_END || err != 0) { kmem_free(ra->rrd, sizeof (*ra->rrd)); ra->rrd = NULL; break; } bqueue_enqueue(&rwa->q, ra->rrd, sizeof (struct receive_record_arg) + ra->rrd->payload_size); ra->rrd = NULL; } if (ra->next_rrd == NULL) ra->next_rrd = kmem_zalloc(sizeof (*ra->next_rrd), KM_SLEEP); ra->next_rrd->eos_marker = B_TRUE; bqueue_enqueue(&rwa->q, ra->next_rrd, 1); mutex_enter(&rwa->mutex); while (!rwa->done) { cv_wait(&rwa->cv, &rwa->mutex); } mutex_exit(&rwa->mutex); cv_destroy(&rwa->cv); mutex_destroy(&rwa->mutex); bqueue_destroy(&rwa->q); if (err == 0) err = rwa->err; out: nvlist_free(begin_nvl); if ((featureflags & DMU_BACKUP_FEATURE_DEDUP) && (cleanup_fd != -1)) zfs_onexit_fd_rele(cleanup_fd); if (err != 0) { /* * Clean up references. If receive is not resumable, * destroy what we created, so we don't leave it in * the inconsistent state. */ dmu_recv_cleanup_ds(drc); } *voffp = ra->voff; objlist_destroy(&ra->ignore_objlist); kmem_free(ra, sizeof (*ra)); kmem_free(rwa, sizeof (*rwa)); return (err); } static int dmu_recv_end_check(void *arg, dmu_tx_t *tx) { dmu_recv_cookie_t *drc = arg; dsl_pool_t *dp = dmu_tx_pool(tx); int error; ASSERT3P(drc->drc_ds->ds_owner, ==, dmu_recv_tag); if (!drc->drc_newfs) { dsl_dataset_t *origin_head; error = dsl_dataset_hold(dp, drc->drc_tofs, FTAG, &origin_head); if (error != 0) return (error); if (drc->drc_force) { /* * We will destroy any snapshots in tofs (i.e. before * origin_head) that are after the origin (which is * the snap before drc_ds, because drc_ds can not * have any snaps of its own). 
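 *
 * Pictorially, with hypothetical snapshots snapA and snapB:
 *
 *	origin -> snapA -> snapB -> origin_head (live)
 *	origin -> drc_ds (the just-received clone)
 *
 * snapA and snapB are what this loop checks, and what the sync task
 * below destroys, before the clone swap can proceed.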
*/ uint64_t obj; obj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj; while (obj != dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj) { dsl_dataset_t *snap; error = dsl_dataset_hold_obj(dp, obj, FTAG, &snap); if (error != 0) break; if (snap->ds_dir != origin_head->ds_dir) error = SET_ERROR(EINVAL); if (error == 0) { error = dsl_destroy_snapshot_check_impl( snap, B_FALSE); } obj = dsl_dataset_phys(snap)->ds_prev_snap_obj; dsl_dataset_rele(snap, FTAG); if (error != 0) break; } if (error != 0) { dsl_dataset_rele(origin_head, FTAG); return (error); } } error = dsl_dataset_clone_swap_check_impl(drc->drc_ds, origin_head, drc->drc_force, drc->drc_owner, tx); if (error != 0) { dsl_dataset_rele(origin_head, FTAG); return (error); } error = dsl_dataset_snapshot_check_impl(origin_head, drc->drc_tosnap, tx, B_TRUE, 1, drc->drc_cred); dsl_dataset_rele(origin_head, FTAG); if (error != 0) return (error); error = dsl_destroy_head_check_impl(drc->drc_ds, 1); } else { error = dsl_dataset_snapshot_check_impl(drc->drc_ds, drc->drc_tosnap, tx, B_TRUE, 1, drc->drc_cred); } return (error); } static void dmu_recv_end_sync(void *arg, dmu_tx_t *tx) { dmu_recv_cookie_t *drc = arg; dsl_pool_t *dp = dmu_tx_pool(tx); spa_history_log_internal_ds(drc->drc_ds, "finish receiving", tx, "snap=%s", drc->drc_tosnap); if (!drc->drc_newfs) { dsl_dataset_t *origin_head; VERIFY0(dsl_dataset_hold(dp, drc->drc_tofs, FTAG, &origin_head)); if (drc->drc_force) { /* * Destroy any snapshots of drc_tofs (origin_head) * after the origin (the snap before drc_ds). */ uint64_t obj; obj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj; while (obj != dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj) { dsl_dataset_t *snap; VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG, &snap)); ASSERT3P(snap->ds_dir, ==, origin_head->ds_dir); obj = dsl_dataset_phys(snap)->ds_prev_snap_obj; dsl_destroy_snapshot_sync_impl(snap, B_FALSE, tx); dsl_dataset_rele(snap, FTAG); } } VERIFY3P(drc->drc_ds->ds_prev, ==, origin_head->ds_prev); dsl_dataset_clone_swap_sync_impl(drc->drc_ds, origin_head, tx); dsl_dataset_snapshot_sync_impl(origin_head, drc->drc_tosnap, tx); /* set snapshot's creation time and guid */ dmu_buf_will_dirty(origin_head->ds_prev->ds_dbuf, tx); dsl_dataset_phys(origin_head->ds_prev)->ds_creation_time = drc->drc_drrb->drr_creation_time; dsl_dataset_phys(origin_head->ds_prev)->ds_guid = drc->drc_drrb->drr_toguid; dsl_dataset_phys(origin_head->ds_prev)->ds_flags &= ~DS_FLAG_INCONSISTENT; dmu_buf_will_dirty(origin_head->ds_dbuf, tx); dsl_dataset_phys(origin_head)->ds_flags &= ~DS_FLAG_INCONSISTENT; dsl_dataset_rele(origin_head, FTAG); dsl_destroy_head_sync_impl(drc->drc_ds, tx); if (drc->drc_owner != NULL) VERIFY3P(origin_head->ds_owner, ==, drc->drc_owner); } else { dsl_dataset_t *ds = drc->drc_ds; dsl_dataset_snapshot_sync_impl(ds, drc->drc_tosnap, tx); /* set snapshot's creation time and guid */ dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); dsl_dataset_phys(ds->ds_prev)->ds_creation_time = drc->drc_drrb->drr_creation_time; dsl_dataset_phys(ds->ds_prev)->ds_guid = drc->drc_drrb->drr_toguid; dsl_dataset_phys(ds->ds_prev)->ds_flags &= ~DS_FLAG_INCONSISTENT; dmu_buf_will_dirty(ds->ds_dbuf, tx); dsl_dataset_phys(ds)->ds_flags &= ~DS_FLAG_INCONSISTENT; if (dsl_dataset_has_resume_receive_state(ds)) { (void) zap_remove(dp->dp_meta_objset, ds->ds_object, DS_FIELD_RESUME_FROMGUID, tx); (void) zap_remove(dp->dp_meta_objset, ds->ds_object, DS_FIELD_RESUME_OBJECT, tx); (void) zap_remove(dp->dp_meta_objset, ds->ds_object, DS_FIELD_RESUME_OFFSET, tx); (void) 
zap_remove(dp->dp_meta_objset, ds->ds_object, DS_FIELD_RESUME_BYTES, tx); (void) zap_remove(dp->dp_meta_objset, ds->ds_object, DS_FIELD_RESUME_TOGUID, tx); (void) zap_remove(dp->dp_meta_objset, ds->ds_object, DS_FIELD_RESUME_TONAME, tx); } } drc->drc_newsnapobj = dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj; zvol_create_minors(dp->dp_spa, drc->drc_tofs, B_TRUE); /* * Release the hold from dmu_recv_begin. This must be done before * we return to open context, so that when we free the dataset's dnode, * we can evict its bonus buffer. */ dsl_dataset_disown(drc->drc_ds, dmu_recv_tag); drc->drc_ds = NULL; } static int add_ds_to_guidmap(const char *name, avl_tree_t *guid_map, uint64_t snapobj) { dsl_pool_t *dp; dsl_dataset_t *snapds; guid_map_entry_t *gmep; int err; ASSERT(guid_map != NULL); err = dsl_pool_hold(name, FTAG, &dp); if (err != 0) return (err); gmep = kmem_alloc(sizeof (*gmep), KM_SLEEP); err = dsl_dataset_hold_obj(dp, snapobj, gmep, &snapds); if (err == 0) { gmep->guid = dsl_dataset_phys(snapds)->ds_guid; gmep->gme_ds = snapds; avl_add(guid_map, gmep); dsl_dataset_long_hold(snapds, gmep); } else { kmem_free(gmep, sizeof (*gmep)); } dsl_pool_rele(dp, FTAG); return (err); } static int dmu_recv_end_modified_blocks = 3; static int dmu_recv_existing_end(dmu_recv_cookie_t *drc) { int error; #ifdef _KERNEL /* * We will be destroying the ds; make sure its origin is unmounted if * necessary. */ char name[ZFS_MAX_DATASET_NAME_LEN]; dsl_dataset_name(drc->drc_ds, name); zfs_destroy_unmount_origin(name); #endif error = dsl_sync_task(drc->drc_tofs, dmu_recv_end_check, dmu_recv_end_sync, drc, dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL); if (error != 0) dmu_recv_cleanup_ds(drc); return (error); } static int dmu_recv_new_end(dmu_recv_cookie_t *drc) { int error; error = dsl_sync_task(drc->drc_tofs, dmu_recv_end_check, dmu_recv_end_sync, drc, dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL); if (error != 0) { dmu_recv_cleanup_ds(drc); } else if (drc->drc_guid_to_ds_map != NULL) { (void) add_ds_to_guidmap(drc->drc_tofs, drc->drc_guid_to_ds_map, drc->drc_newsnapobj); } return (error); } int dmu_recv_end(dmu_recv_cookie_t *drc, void *owner) { drc->drc_owner = owner; if (drc->drc_newfs) return (dmu_recv_new_end(drc)); else return (dmu_recv_existing_end(drc)); } /* * Return TRUE if this objset is currently being received into. */ boolean_t dmu_objset_is_receiving(objset_t *os) { return (os->os_dsl_dataset != NULL && os->os_dsl_dataset->ds_owner == dmu_recv_tag); } #if defined(_KERNEL) module_param(zfs_send_corrupt_data, int, 0644); MODULE_PARM_DESC(zfs_send_corrupt_data, "Allow sending corrupt data"); #endif diff --git a/module/zfs/policy.c b/module/zfs/policy.c index 5b1de29e4a6e..03e8f748b746 100644 --- a/module/zfs/policy.c +++ b/module/zfs/policy.c @@ -1,303 +1,303 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
* If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2013, Joyent, Inc. All rights reserved. * Copyright (C) 2016 Lawrence Livermore National Security, LLC. * * For Linux the vast majority of this enforcement is already handled via * the standard Linux VFS permission checks. However certain administrative * commands which bypass the standard mechanisms may need to make use of * this functionality. */ #include #include #include /* * The passed credentials cannot be directly verified because Linux only * provides an interface to check the *current* process credentials. In * order to handle this, the capable() test is only run when the passed * credentials match the current process credentials or the kcred. In * all other cases this function must fail and return the passed err. */ static int priv_policy(const cred_t *cr, int capability, boolean_t all, int err) { ASSERT3S(all, ==, B_FALSE); if (cr != CRED() && (cr != kcred)) return (err); if (!capable(capability)) return (err); return (0); } /* * Checks for operations that are either client-only or are used by * both clients and servers. */ int secpolicy_nfs(const cred_t *cr) { return (priv_policy(cr, CAP_SYS_ADMIN, B_FALSE, EPERM)); } /* * Catch all system configuration. */ int secpolicy_sys_config(const cred_t *cr, boolean_t checkonly) { return (priv_policy(cr, CAP_SYS_ADMIN, B_FALSE, EPERM)); } /* * Like secpolicy_vnode_access() but we get the actual wanted mode and the * current mode of the file, not the missing bits. * * Enforced in the Linux VFS. */ int secpolicy_vnode_access2(const cred_t *cr, struct inode *ip, uid_t owner, - mode_t curmode, mode_t wantmode) + mode_t curmode, mode_t wantmode) { return (0); } /* * This is a special routine for ZFS; it is used to determine whether * any of the privileges in effect allow any form of access to the * file. There's no reason to audit this or any reason to record * this. More work is needed to do the "KPLD" stuff. */ int secpolicy_vnode_any_access(const cred_t *cr, struct inode *ip, uid_t owner) { if (crgetfsuid(cr) == owner) return (0); if (zpl_inode_owner_or_capable(ip)) return (0); if (priv_policy(cr, CAP_DAC_OVERRIDE, B_FALSE, EPERM) == 0) return (0); if (priv_policy(cr, CAP_DAC_READ_SEARCH, B_FALSE, EPERM) == 0) return (0); return (EPERM); } /* * Determine if subject can chown owner of a file. */ int secpolicy_vnode_chown(const cred_t *cr, uid_t owner) { if (crgetfsuid(cr) == owner) return (0); return (priv_policy(cr, CAP_FOWNER, B_FALSE, EPERM)); } /* * Determine if subject can change group ownership of a file. */ int secpolicy_vnode_create_gid(const cred_t *cr) { return (priv_policy(cr, CAP_SETGID, B_FALSE, EPERM)); } /* * Policy determines whether we can remove an entry from a directory, * regardless of permission bits. */ int secpolicy_vnode_remove(const cred_t *cr) { return (priv_policy(cr, CAP_FOWNER, B_FALSE, EPERM)); } /* * Determine that subject can modify the mode of a file. allzone privilege * needed when modifying root owned object. */ int secpolicy_vnode_setdac(const cred_t *cr, uid_t owner) { if (crgetfsuid(cr) == owner) return (0); return (priv_policy(cr, CAP_FOWNER, B_FALSE, EPERM)); } /* * Are we allowed to retain the set-uid/set-gid bits when * changing ownership or when writing to a file?
* "issuid" should be true when set-uid; only in that case * root ownership is checked (setgid is assumed). * * Enforced in the Linux VFS. */ int secpolicy_vnode_setid_retain(const cred_t *cr, boolean_t issuidroot) { return (0); } /* * Determine that subject can set the file setgid flag. */ int secpolicy_vnode_setids_setgids(const cred_t *cr, gid_t gid) { if (crgetfsgid(cr) != gid && !groupmember(gid, cr)) return (priv_policy(cr, CAP_FSETID, B_FALSE, EPERM)); return (0); } /* * Determine if the subject can inject faults in the ZFS fault injection * framework. Requires all privileges. */ int secpolicy_zinject(const cred_t *cr) { return (priv_policy(cr, CAP_SYS_ADMIN, B_FALSE, EACCES)); } /* * Determine if the subject has permission to manipulate ZFS datasets * (not pools). Equivalent to the SYS_MOUNT privilege. */ int secpolicy_zfs(const cred_t *cr) { return (priv_policy(cr, CAP_SYS_ADMIN, B_FALSE, EACCES)); } void secpolicy_setid_clear(vattr_t *vap, cred_t *cr) { if ((vap->va_mode & (S_ISUID | S_ISGID)) != 0 && secpolicy_vnode_setid_retain(cr, (vap->va_mode & S_ISUID) != 0 && (vap->va_mask & AT_UID) != 0 && vap->va_uid == 0) != 0) { vap->va_mask |= AT_MODE; vap->va_mode &= ~(S_ISUID|S_ISGID); } } /* * Determine that subject can set the file setid flags. */ static int secpolicy_vnode_setid_modify(const cred_t *cr, uid_t owner) { if (crgetfsuid(cr) == owner) return (0); return (priv_policy(cr, CAP_FSETID, B_FALSE, EPERM)); } /* * Determine that subject can make a file a "sticky". * * Enforced in the Linux VFS. */ static int secpolicy_vnode_stky_modify(const cred_t *cr) { return (0); } int secpolicy_setid_setsticky_clear(struct inode *ip, vattr_t *vap, const vattr_t *ovap, cred_t *cr) { int error; if ((vap->va_mode & S_ISUID) != 0 && (error = secpolicy_vnode_setid_modify(cr, ovap->va_uid)) != 0) { return (error); } /* * Check privilege if attempting to set the * sticky bit on a non-directory. */ if (!S_ISDIR(ip->i_mode) && (vap->va_mode & S_ISVTX) != 0 && secpolicy_vnode_stky_modify(cr) != 0) { vap->va_mode &= ~S_ISVTX; } /* * Check for privilege if attempting to set the * group-id bit. */ if ((vap->va_mode & S_ISGID) != 0 && secpolicy_vnode_setids_setgids(cr, ovap->va_gid) != 0) { vap->va_mode &= ~S_ISGID; } return (0); } /* * Check privileges for setting xvattr attributes */ int secpolicy_xvattr(xvattr_t *xvap, uid_t owner, cred_t *cr, vtype_t vtype) { return (secpolicy_vnode_chown(cr, owner)); } /* * Check privileges for setattr attributes. * * Enforced in the Linux VFS. */ int secpolicy_vnode_setattr(cred_t *cr, struct inode *ip, struct vattr *vap, const struct vattr *ovap, int flags, int unlocked_access(void *, int, cred_t *), void *node) { return (0); } /* * Check privileges for links. * * Enforced in the Linux VFS. */ int secpolicy_basic_link(const cred_t *cr) { return (0); } diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 84c19c7ca0f0..fa9bdd7b8e10 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -1,2126 +1,2127 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. 
* * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2015 by Delphix. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_prop.h" #include /* * SPA locking * * There are three basic locks for managing spa_t structures: * * spa_namespace_lock (global mutex) * * This lock must be acquired to do any of the following: * * - Lookup a spa_t by name * - Add or remove a spa_t from the namespace * - Increase spa_refcount from non-zero * - Check if spa_refcount is zero * - Rename a spa_t * - add/remove/attach/detach devices * - Held for the duration of create/destroy/import/export * * It does not need to handle recursion. A create or destroy may * reference objects (files or zvols) in other pools, but by * definition they must have an existing reference, and will never need * to lookup a spa_t by name. * * spa_refcount (per-spa refcount_t protected by mutex) * * This reference count keeps track of any active users of the spa_t. The * spa_t cannot be destroyed or freed while this is non-zero. Internally, * the refcount is never really 'zero' - opening a pool implicitly keeps * some references in the DMU. Internally we check against spa_minref, but * present the image of a zero/non-zero value to consumers. * * spa_config_lock[] (per-spa array of rwlocks) * * This protects the spa_t from config changes, and must be held in * the following circumstances: * * - RW_READER to perform I/O to the spa * - RW_WRITER to change the vdev config * * The locking order is fairly straightforward: * * spa_namespace_lock -> spa_refcount * * The namespace lock must be acquired to increase the refcount from 0 * or to check if it is zero. * * spa_refcount -> spa_config_lock[] * * There must be at least one valid reference on the spa_t to acquire * the config lock. * * spa_namespace_lock -> spa_config_lock[] * * The namespace lock must always be taken before the config lock. * * * The spa_namespace_lock can be acquired directly and is globally visible. * * The namespace is manipulated using the following functions, all of which * require the spa_namespace_lock to be held. * * spa_lookup() Lookup a spa_t by name. * * spa_add() Create a new spa_t in the namespace. * * spa_remove() Remove a spa_t from the namespace. This also * frees up any memory associated with the spa_t. * * spa_next() Returns the next spa_t in the system, or the * first if NULL is passed. * * spa_evict_all() Shutdown and remove all spa_t structures in * the system. * * spa_guid_exists() Determine whether a pool/device guid exists. * * The spa_refcount is manipulated using the following functions: * * spa_open_ref() Adds a reference to the given spa_t. Must be * called with spa_namespace_lock held if the * refcount is currently zero.
* * spa_close() Remove a reference from the spa_t. This will * not free the spa_t or remove it from the * namespace. No locking is required. * * spa_refcount_zero() Returns true if the refcount is currently * zero. Must be called with spa_namespace_lock * held. * * The spa_config_lock[] is an array of rwlocks, ordered as follows: * SCL_CONFIG > SCL_STATE > SCL_ALLOC > SCL_ZIO > SCL_FREE > SCL_VDEV. * spa_config_lock[] is manipulated with spa_config_{enter,exit,held}(). * * To read the configuration, it suffices to hold one of these locks as reader. * To modify the configuration, you must hold all locks as writer. To modify * vdev state without altering the vdev tree's topology (e.g. online/offline), * you must hold SCL_STATE and SCL_ZIO as writer. * * We use these distinct config locks to avoid recursive lock entry. * For example, spa_sync() (which holds SCL_CONFIG as reader) induces * block allocations (SCL_ALLOC), which may require reading space maps * from disk (dmu_read() -> zio_read() -> SCL_ZIO). * * The spa config locks cannot be normal rwlocks because we need the * ability to hand off ownership. For example, SCL_ZIO is acquired * by the issuing thread and later released by an interrupt thread. * They do, however, obey the usual write-wanted semantics to prevent * writer (i.e. system administrator) starvation. * * The lock acquisition rules are as follows: * * SCL_CONFIG * Protects changes to the vdev tree topology, such as vdev * add/remove/attach/detach. Protects the dirty config list * (spa_config_dirty_list) and the set of spares and l2arc devices. * * SCL_STATE * Protects changes to pool state and vdev state, such as vdev * online/offline/fault/degrade/clear. Protects the dirty state list * (spa_state_dirty_list) and global pool state (spa_state). * * SCL_ALLOC * Protects changes to metaslab groups and classes. * Held as reader by metaslab_alloc() and metaslab_claim(). * * SCL_ZIO * Held by bp-level zios (those which have no io_vd upon entry) * to prevent changes to the vdev tree. The bp-level zio implicitly * protects all of its vdev child zios, which do not hold SCL_ZIO. * * SCL_FREE * Protects changes to metaslab groups and classes. * Held as reader by metaslab_free(). SCL_FREE is distinct from * SCL_ALLOC, and lower than SCL_ZIO, so that we can safely free * blocks in zio_done() while another i/o that holds either * SCL_ALLOC or SCL_ZIO is waiting for this i/o to complete. * * SCL_VDEV * Held as reader to prevent changes to the vdev tree during trivial * inquiries such as bp_get_dsize(). SCL_VDEV is distinct from the * other locks, and lower than all of them, to ensure that it's safe * to acquire regardless of caller context. * * In addition, the following rules apply: * * (a) spa_props_lock protects pool properties, spa_config and spa_config_list. * The lock ordering is SCL_CONFIG > spa_props_lock. * * (b) I/O operations on leaf vdevs. For any zio operation that takes * an explicit vdev_t argument -- such as zio_ioctl(), zio_read_phys(), * or zio_write_phys() -- the caller must ensure that the config cannot * change in the interim, and that the vdev cannot be reopened. * SCL_STATE as reader suffices for both. * * The vdev configuration is protected by spa_vdev_enter() / spa_vdev_exit(). * * spa_vdev_enter() Acquire the namespace lock and the config lock * for writing. * * spa_vdev_exit() Release the config lock, wait for all I/O * to complete, sync the updated configs to the * cache, and release the namespace lock.
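 *
 * A vdev reconfiguration is therefore typically bracketed like this
 * (an illustrative sketch, not any specific call site):
 *
 *	uint64_t txg = spa_vdev_enter(spa);
 *	... add, remove, attach, or detach the vdev ...
 *	return (spa_vdev_exit(spa, vd, txg, error));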
* * vdev state is protected by spa_vdev_state_enter() / spa_vdev_state_exit(). * Like spa_vdev_enter/exit, these are convenience wrappers -- the actual * locking is, always, based on spa_namespace_lock and spa_config_lock[]. * * spa_rename() is also implemented within this file since it requires * manipulation of the namespace. */ static avl_tree_t spa_namespace_avl; kmutex_t spa_namespace_lock; static kcondvar_t spa_namespace_cv; int spa_max_replication_override = SPA_DVAS_PER_BP; static kmutex_t spa_spare_lock; static avl_tree_t spa_spare_avl; static kmutex_t spa_l2cache_lock; static avl_tree_t spa_l2cache_avl; kmem_cache_t *spa_buffer_pool; int spa_mode_global; #ifdef ZFS_DEBUG /* Everything except dprintf and spa is on by default in debug builds */ int zfs_flags = ~(ZFS_DEBUG_DPRINTF | ZFS_DEBUG_SPA); #else int zfs_flags = 0; #endif /* * zfs_recover can be set to nonzero to attempt to recover from * otherwise-fatal errors, typically caused by on-disk corruption. When * set, calls to zfs_panic_recover() will turn into warning messages. * This should only be used as a last resort, as it typically results * in leaked space, or worse. */ int zfs_recover = B_FALSE; /* * If destroy encounters an EIO while reading metadata (e.g. indirect * blocks), space referenced by the missing metadata can not be freed. * Normally this causes the background destroy to become "stalled", as * it is unable to make forward progress. While in this stalled state, * all remaining space to free from the error-encountering filesystem is * "temporarily leaked". Set this flag to cause it to ignore the EIO, * permanently leak the space from indirect blocks that can not be read, * and continue to free everything else that it can. * * The default, "stalling" behavior is useful if the storage partially * fails (i.e. some but not all i/os fail), and then later recovers. In * this case, we will be able to continue pool operations while it is * partially failed, and when it recovers, we can continue to free the * space, with no leaks. However, note that this case is actually * fairly rare. * * Typically pools either (a) fail completely (but perhaps temporarily, * e.g. a top-level vdev going offline), or (b) have localized, * permanent errors (e.g. disk returns the wrong data due to bit flip or * firmware bug). In case (a), this setting does not matter because the * pool will be suspended and the sync thread will not be able to make * forward progress regardless. In case (b), because the error is * permanent, the best we can do is leak the minimum amount of space, * which is what setting this flag will do. Therefore, it is reasonable * for this flag to normally be set, but we chose the more conservative * approach of not setting it, so that there is no possibility of * leaking space in the "partial temporary" failure case. */ int zfs_free_leak_on_eio = B_FALSE; /* * Expiration time in milliseconds. This value has two meanings. First it is * used to determine when the spa_deadman() logic should fire. By default the * spa_deadman() will fire if spa_sync() has not completed in 1000 seconds. * Secondly, the value determines if an I/O is considered "hung". Any I/O that * has not completed in zfs_deadman_synctime_ms is considered "hung" resulting * in a system panic. */ unsigned long zfs_deadman_synctime_ms = 1000000ULL; /* * By default the deadman is enabled. 
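 * Both knobs are exported as module parameters on Linux builds, so the
 * deadman can typically be tuned or disabled at runtime, e.g. (assuming
 * the standard sysfs parameter path):
 *
 *	echo 0 > /sys/module/zfs/parameters/zfs_deadman_enabled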
*/ int zfs_deadman_enabled = 1; /* * The worst case is single-sector max-parity RAID-Z blocks, in which * case the space requirement is exactly (VDEV_RAIDZ_MAXPARITY + 1) * times the size; so just assume that. Add to this the fact that * we can have up to 3 DVAs per bp, and one more factor of 2 because * the block may be dittoed with up to 3 DVAs by ddt_sync(). All together, * the worst case is: * (VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2 == 24 */ int spa_asize_inflation = 24; /* * Normally, we don't allow the last 3.2% (1/(2^spa_slop_shift)) of space in * the pool to be consumed. This ensures that we don't run the pool * completely out of space, due to unaccounted changes (e.g. to the MOS). * It also limits the worst-case time to allocate space. If we have * less than this amount of free space, most ZPL operations (e.g. write, * create) will return ENOSPC. * * Certain operations (e.g. file removal, most administrative actions) can * use half the slop space. They will only return ENOSPC if less than half * the slop space is free. Typically, once the pool has less than the slop * space free, the user will use these operations to free up space in the pool. * These are the operations that call dsl_pool_adjustedsize() with the netfree * argument set to TRUE. * * A very restricted set of operations are always permitted, regardless of * the amount of free space. These are the operations that call * dsl_sync_task(ZFS_SPACE_CHECK_NONE), e.g. "zfs destroy". If these * operations result in a net increase in the amount of space used, * it is possible to run the pool completely out of space, causing it to * be permanently read-only. * * See also the comments in zfs_space_check_t. */ int spa_slop_shift = 5; /* * ========================================================================== * SPA config locking * ========================================================================== */ static void spa_config_lock_init(spa_t *spa) { int i; for (i = 0; i < SCL_LOCKS; i++) { spa_config_lock_t *scl = &spa->spa_config_lock[i]; mutex_init(&scl->scl_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&scl->scl_cv, NULL, CV_DEFAULT, NULL); refcount_create_untracked(&scl->scl_count); scl->scl_writer = NULL; scl->scl_write_wanted = 0; } } static void spa_config_lock_destroy(spa_t *spa) { int i; for (i = 0; i < SCL_LOCKS; i++) { spa_config_lock_t *scl = &spa->spa_config_lock[i]; mutex_destroy(&scl->scl_lock); cv_destroy(&scl->scl_cv); refcount_destroy(&scl->scl_count); ASSERT(scl->scl_writer == NULL); ASSERT(scl->scl_write_wanted == 0); } } int spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw) { int i; for (i = 0; i < SCL_LOCKS; i++) { spa_config_lock_t *scl = &spa->spa_config_lock[i]; if (!(locks & (1 << i))) continue; mutex_enter(&scl->scl_lock); if (rw == RW_READER) { if (scl->scl_writer || scl->scl_write_wanted) { mutex_exit(&scl->scl_lock); spa_config_exit(spa, locks & ((1 << i) - 1), tag); return (0); } } else { ASSERT(scl->scl_writer != curthread); if (!refcount_is_zero(&scl->scl_count)) { mutex_exit(&scl->scl_lock); spa_config_exit(spa, locks & ((1 << i) - 1), tag); return (0); } scl->scl_writer = curthread; } (void) refcount_add(&scl->scl_count, tag); mutex_exit(&scl->scl_lock); } return (1); } void spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw) { int wlocks_held = 0; int i; ASSERT3U(SCL_LOCKS, <, sizeof (wlocks_held) * NBBY); for (i = 0; i < SCL_LOCKS; i++) { spa_config_lock_t *scl = &spa->spa_config_lock[i]; if (scl->scl_writer == curthread) wlocks_held |= (1 << i); if (!(locks 
& (1 << i))) continue; mutex_enter(&scl->scl_lock); if (rw == RW_READER) { while (scl->scl_writer || scl->scl_write_wanted) { cv_wait(&scl->scl_cv, &scl->scl_lock); } } else { ASSERT(scl->scl_writer != curthread); while (!refcount_is_zero(&scl->scl_count)) { scl->scl_write_wanted++; cv_wait(&scl->scl_cv, &scl->scl_lock); scl->scl_write_wanted--; } scl->scl_writer = curthread; } (void) refcount_add(&scl->scl_count, tag); mutex_exit(&scl->scl_lock); } ASSERT(wlocks_held <= locks); } void spa_config_exit(spa_t *spa, int locks, void *tag) { int i; for (i = SCL_LOCKS - 1; i >= 0; i--) { spa_config_lock_t *scl = &spa->spa_config_lock[i]; if (!(locks & (1 << i))) continue; mutex_enter(&scl->scl_lock); ASSERT(!refcount_is_zero(&scl->scl_count)); if (refcount_remove(&scl->scl_count, tag) == 0) { ASSERT(scl->scl_writer == NULL || scl->scl_writer == curthread); scl->scl_writer = NULL; /* OK in either case */ cv_broadcast(&scl->scl_cv); } mutex_exit(&scl->scl_lock); } } int spa_config_held(spa_t *spa, int locks, krw_t rw) { int i, locks_held = 0; for (i = 0; i < SCL_LOCKS; i++) { spa_config_lock_t *scl = &spa->spa_config_lock[i]; if (!(locks & (1 << i))) continue; if ((rw == RW_READER && !refcount_is_zero(&scl->scl_count)) || (rw == RW_WRITER && scl->scl_writer == curthread)) locks_held |= 1 << i; } return (locks_held); } /* * ========================================================================== * SPA namespace functions * ========================================================================== */ /* * Lookup the named spa_t in the AVL tree. The spa_namespace_lock must be held. * Returns NULL if no matching spa_t is found. */ spa_t * spa_lookup(const char *name) { static spa_t search; /* spa_t is large; don't allocate on stack */ spa_t *spa; avl_index_t where; char *cp; ASSERT(MUTEX_HELD(&spa_namespace_lock)); (void) strlcpy(search.spa_name, name, sizeof (search.spa_name)); /* * If it's a full dataset name, figure out the pool name and * just use that. */ cp = strpbrk(search.spa_name, "/@#"); if (cp != NULL) *cp = '\0'; spa = avl_find(&spa_namespace_avl, &search, &where); return (spa); } /* * Fires when spa_sync has not completed within zfs_deadman_synctime_ms. * If the zfs_deadman_enabled flag is set then it inspects all vdev queues * looking for potentially hung I/Os. */ void spa_deadman(void *arg) { spa_t *spa = arg; zfs_dbgmsg("slow spa_sync: started %llu seconds ago, calls %llu", (gethrtime() - spa->spa_sync_starttime) / NANOSEC, ++spa->spa_deadman_calls); if (zfs_deadman_enabled) vdev_deadman(spa->spa_root_vdev); spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq, spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() + NSEC_TO_TICK(spa->spa_deadman_synctime)); } /* * Create an uninitialized spa_t with the given name. Requires * spa_namespace_lock. The caller must ensure that the spa_t doesn't already * exist by calling spa_lookup() first. 
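 *
 * A typical caller therefore does something like (illustrative):
 *
 *	mutex_enter(&spa_namespace_lock);
 *	if (spa_lookup(name) == NULL)
 *		spa = spa_add(name, config, altroot);
 *	mutex_exit(&spa_namespace_lock);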
*/ spa_t * spa_add(const char *name, nvlist_t *config, const char *altroot) { spa_t *spa; spa_config_dirent_t *dp; int t; int i; ASSERT(MUTEX_HELD(&spa_namespace_lock)); spa = kmem_zalloc(sizeof (spa_t), KM_SLEEP); mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_evicting_os_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_proc_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_cksum_tmpls_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_feat_stats_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_alloc_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_evicting_os_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_proc_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL); for (t = 0; t < TXG_SIZE; t++) bplist_create(&spa->spa_free_bplist[t]); (void) strlcpy(spa->spa_name, name, sizeof (spa->spa_name)); spa->spa_state = POOL_STATE_UNINITIALIZED; spa->spa_freeze_txg = UINT64_MAX; spa->spa_final_txg = UINT64_MAX; spa->spa_load_max_txg = UINT64_MAX; spa->spa_proc = &p0; spa->spa_proc_state = SPA_PROC_NONE; spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime_ms); refcount_create(&spa->spa_refcount); spa_config_lock_init(spa); spa_stats_init(spa); avl_add(&spa_namespace_avl, spa); /* * Set the alternate root, if there is one. */ if (altroot) spa->spa_root = spa_strdup(altroot); avl_create(&spa->spa_alloc_tree, zio_timestamp_compare, sizeof (zio_t), offsetof(zio_t, io_alloc_node)); /* * Every pool starts with the default cachefile */ list_create(&spa->spa_config_list, sizeof (spa_config_dirent_t), offsetof(spa_config_dirent_t, scd_link)); dp = kmem_zalloc(sizeof (spa_config_dirent_t), KM_SLEEP); dp->scd_path = altroot ? NULL : spa_strdup(spa_config_path); list_insert_head(&spa->spa_config_list, dp); VERIFY(nvlist_alloc(&spa->spa_load_info, NV_UNIQUE_NAME, KM_SLEEP) == 0); if (config != NULL) { nvlist_t *features; if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_FEATURES_FOR_READ, &features) == 0) { VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0); } VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0); } if (spa->spa_label_features == NULL) { VERIFY(nvlist_alloc(&spa->spa_label_features, NV_UNIQUE_NAME, KM_SLEEP) == 0); } spa->spa_debug = ((zfs_flags & ZFS_DEBUG_SPA) != 0); spa->spa_min_ashift = INT_MAX; spa->spa_max_ashift = 0; /* Reset cached value */ spa->spa_dedup_dspace = ~0ULL; /* * As a pool is being created, treat all features as disabled by * setting SPA_FEATURE_DISABLED for all entries in the feature * refcount cache. */ for (i = 0; i < SPA_FEATURES; i++) { spa->spa_feat_refcount_cache[i] = SPA_FEATURE_DISABLED; } return (spa); } /* * Removes a spa_t from the namespace, freeing up any memory used. Requires * spa_namespace_lock. This is called only after the spa_t has been closed and * deactivated. 
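 * Teardown is the mirror image of spa_add(): the spa_t is removed from
 * the namespace AVL tree first, so no new lookups can find it, and only
 * then are its lists, locks, and condvars destroyed.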
*/ void spa_remove(spa_t *spa) { spa_config_dirent_t *dp; int t; ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); ASSERT3U(refcount_count(&spa->spa_refcount), ==, 0); nvlist_free(spa->spa_config_splitting); avl_remove(&spa_namespace_avl, spa); cv_broadcast(&spa_namespace_cv); if (spa->spa_root) spa_strfree(spa->spa_root); while ((dp = list_head(&spa->spa_config_list)) != NULL) { list_remove(&spa->spa_config_list, dp); if (dp->scd_path != NULL) spa_strfree(dp->scd_path); kmem_free(dp, sizeof (spa_config_dirent_t)); } avl_destroy(&spa->spa_alloc_tree); list_destroy(&spa->spa_config_list); nvlist_free(spa->spa_label_features); nvlist_free(spa->spa_load_info); nvlist_free(spa->spa_feat_stats); spa_config_set(spa, NULL); refcount_destroy(&spa->spa_refcount); spa_stats_destroy(spa); spa_config_lock_destroy(spa); for (t = 0; t < TXG_SIZE; t++) bplist_destroy(&spa->spa_free_bplist[t]); zio_checksum_templates_free(spa); cv_destroy(&spa->spa_async_cv); cv_destroy(&spa->spa_evicting_os_cv); cv_destroy(&spa->spa_proc_cv); cv_destroy(&spa->spa_scrub_io_cv); cv_destroy(&spa->spa_suspend_cv); mutex_destroy(&spa->spa_alloc_lock); mutex_destroy(&spa->spa_async_lock); mutex_destroy(&spa->spa_errlist_lock); mutex_destroy(&spa->spa_errlog_lock); mutex_destroy(&spa->spa_evicting_os_lock); mutex_destroy(&spa->spa_history_lock); mutex_destroy(&spa->spa_proc_lock); mutex_destroy(&spa->spa_props_lock); mutex_destroy(&spa->spa_cksum_tmpls_lock); mutex_destroy(&spa->spa_scrub_lock); mutex_destroy(&spa->spa_suspend_lock); mutex_destroy(&spa->spa_vdev_top_lock); mutex_destroy(&spa->spa_feat_stats_lock); kmem_free(spa, sizeof (spa_t)); } /* * Given a pool, return the next pool in the namespace, or NULL if there is * none. If 'prev' is NULL, return the first pool. */ spa_t * spa_next(spa_t *prev) { ASSERT(MUTEX_HELD(&spa_namespace_lock)); if (prev) return (AVL_NEXT(&spa_namespace_avl, prev)); else return (avl_first(&spa_namespace_avl)); } /* * ========================================================================== * SPA refcount functions * ========================================================================== */ /* * Add a reference to the given spa_t. Must have at least one reference, or * have the namespace lock held. */ void spa_open_ref(spa_t *spa, void *tag) { ASSERT(refcount_count(&spa->spa_refcount) >= spa->spa_minref || MUTEX_HELD(&spa_namespace_lock)); (void) refcount_add(&spa->spa_refcount, tag); } /* * Remove a reference to the given spa_t. Must have at least one reference, or * have the namespace lock held. */ void spa_close(spa_t *spa, void *tag) { ASSERT(refcount_count(&spa->spa_refcount) > spa->spa_minref || MUTEX_HELD(&spa_namespace_lock)); (void) refcount_remove(&spa->spa_refcount, tag); } /* * Remove a reference to the given spa_t held by a dsl dir that is * being asynchronously released. Async releases occur from a taskq * performing eviction of dsl datasets and dirs. The namespace lock * isn't held and the hold by the object being evicted may contribute to * spa_minref (e.g. dataset or directory released during pool export), * so the asserts in spa_close() do not apply. */ void spa_async_close(spa_t *spa, void *tag) { (void) refcount_remove(&spa->spa_refcount, tag); } /* * Check to see if the spa refcount is zero. Must be called with * spa_namespace_lock held. 
We really compare against spa_minref, which is the * number of references acquired when opening a pool. */ boolean_t spa_refcount_zero(spa_t *spa) { ASSERT(MUTEX_HELD(&spa_namespace_lock)); return (refcount_count(&spa->spa_refcount) == spa->spa_minref); } /* * ========================================================================== * SPA spare and l2cache tracking * ========================================================================== */ /* * Hot spares and cache devices are tracked using the same code below, * for 'auxiliary' devices. */ typedef struct spa_aux { uint64_t aux_guid; uint64_t aux_pool; avl_node_t aux_avl; int aux_count; } spa_aux_t; static inline int spa_aux_compare(const void *a, const void *b) { const spa_aux_t *sa = (const spa_aux_t *)a; const spa_aux_t *sb = (const spa_aux_t *)b; return (AVL_CMP(sa->aux_guid, sb->aux_guid)); } void spa_aux_add(vdev_t *vd, avl_tree_t *avl) { avl_index_t where; spa_aux_t search; spa_aux_t *aux; search.aux_guid = vd->vdev_guid; if ((aux = avl_find(avl, &search, &where)) != NULL) { aux->aux_count++; } else { aux = kmem_zalloc(sizeof (spa_aux_t), KM_SLEEP); aux->aux_guid = vd->vdev_guid; aux->aux_count = 1; avl_insert(avl, aux, where); } } void spa_aux_remove(vdev_t *vd, avl_tree_t *avl) { spa_aux_t search; spa_aux_t *aux; avl_index_t where; search.aux_guid = vd->vdev_guid; aux = avl_find(avl, &search, &where); ASSERT(aux != NULL); if (--aux->aux_count == 0) { avl_remove(avl, aux); kmem_free(aux, sizeof (spa_aux_t)); } else if (aux->aux_pool == spa_guid(vd->vdev_spa)) { aux->aux_pool = 0ULL; } } boolean_t spa_aux_exists(uint64_t guid, uint64_t *pool, int *refcnt, avl_tree_t *avl) { spa_aux_t search, *found; search.aux_guid = guid; found = avl_find(avl, &search, NULL); if (pool) { if (found) *pool = found->aux_pool; else *pool = 0ULL; } if (refcnt) { if (found) *refcnt = found->aux_count; else *refcnt = 0; } return (found != NULL); } void spa_aux_activate(vdev_t *vd, avl_tree_t *avl) { spa_aux_t search, *found; avl_index_t where; search.aux_guid = vd->vdev_guid; found = avl_find(avl, &search, &where); ASSERT(found != NULL); ASSERT(found->aux_pool == 0ULL); found->aux_pool = spa_guid(vd->vdev_spa); } /* * Spares are tracked globally due to the following constraints: * * - A spare may be part of multiple pools. * - A spare may be added to a pool even if it's actively in use within * another pool. * - A spare in use in any pool can only be the source of a replacement if * the target is a spare in the same pool. * * We keep track of all spares on the system through the use of a reference * counted AVL tree. When a vdev is added as a spare, or used as a replacement * spare, then we bump the reference count in the AVL tree. In addition, we set * the 'vdev_isspare' member to indicate that the device is a spare (active or * inactive). When a spare is made active (used to replace a device in the * pool), we also keep track of which pool it's been made a part of. * * The 'spa_spare_lock' protects the AVL tree. These functions are normally * called under the spa_namespace lock as part of vdev reconfiguration. The * separate spare lock exists for the status query path, which does not need to * be completely consistent with respect to other vdev configuration changes.
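 *
 * The resulting lifecycle is roughly (illustrative):
 *
 *	spa_spare_add(vd)	vdev_isspare set, tree refcount bumped
 *	spa_spare_activate(vd)	spare is now replacing a device in a pool
 *	spa_spare_remove(vd)	refcount dropped, entry freed when it hits 0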
*/ static int spa_spare_compare(const void *a, const void *b) { return (spa_aux_compare(a, b)); } void spa_spare_add(vdev_t *vd) { mutex_enter(&spa_spare_lock); ASSERT(!vd->vdev_isspare); spa_aux_add(vd, &spa_spare_avl); vd->vdev_isspare = B_TRUE; mutex_exit(&spa_spare_lock); } void spa_spare_remove(vdev_t *vd) { mutex_enter(&spa_spare_lock); ASSERT(vd->vdev_isspare); spa_aux_remove(vd, &spa_spare_avl); vd->vdev_isspare = B_FALSE; mutex_exit(&spa_spare_lock); } boolean_t spa_spare_exists(uint64_t guid, uint64_t *pool, int *refcnt) { boolean_t found; mutex_enter(&spa_spare_lock); found = spa_aux_exists(guid, pool, refcnt, &spa_spare_avl); mutex_exit(&spa_spare_lock); return (found); } void spa_spare_activate(vdev_t *vd) { mutex_enter(&spa_spare_lock); ASSERT(vd->vdev_isspare); spa_aux_activate(vd, &spa_spare_avl); mutex_exit(&spa_spare_lock); } /* * Level 2 ARC devices are tracked globally for the same reasons as spares. * Cache devices currently only support one pool per cache device, and so * for these devices the aux reference count is currently unused beyond 1. */ static int spa_l2cache_compare(const void *a, const void *b) { return (spa_aux_compare(a, b)); } void spa_l2cache_add(vdev_t *vd) { mutex_enter(&spa_l2cache_lock); ASSERT(!vd->vdev_isl2cache); spa_aux_add(vd, &spa_l2cache_avl); vd->vdev_isl2cache = B_TRUE; mutex_exit(&spa_l2cache_lock); } void spa_l2cache_remove(vdev_t *vd) { mutex_enter(&spa_l2cache_lock); ASSERT(vd->vdev_isl2cache); spa_aux_remove(vd, &spa_l2cache_avl); vd->vdev_isl2cache = B_FALSE; mutex_exit(&spa_l2cache_lock); } boolean_t spa_l2cache_exists(uint64_t guid, uint64_t *pool) { boolean_t found; mutex_enter(&spa_l2cache_lock); found = spa_aux_exists(guid, pool, NULL, &spa_l2cache_avl); mutex_exit(&spa_l2cache_lock); return (found); } void spa_l2cache_activate(vdev_t *vd) { mutex_enter(&spa_l2cache_lock); ASSERT(vd->vdev_isl2cache); spa_aux_activate(vd, &spa_l2cache_avl); mutex_exit(&spa_l2cache_lock); } /* * ========================================================================== * SPA vdev locking * ========================================================================== */ /* * Lock the given spa_t for the purpose of adding or removing a vdev. * Grabs the global spa_namespace_lock plus the spa config lock for writing. * It returns the next transaction group for the spa_t. */ uint64_t spa_vdev_enter(spa_t *spa) { mutex_enter(&spa->spa_vdev_top_lock); mutex_enter(&spa_namespace_lock); return (spa_vdev_config_enter(spa)); } /* * Internal implementation for spa_vdev_enter(). Used when a vdev * operation requires multiple syncs (i.e. removing a device) while * keeping the spa_namespace_lock held. */ uint64_t spa_vdev_config_enter(spa_t *spa) { ASSERT(MUTEX_HELD(&spa_namespace_lock)); spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); return (spa_last_synced_txg(spa) + 1); } /* * Used in combination with spa_vdev_config_enter() to allow the syncing * of multiple transactions without releasing the spa_namespace_lock. */ void spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag) { int config_changed = B_FALSE; ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(txg > spa_last_synced_txg(spa)); spa->spa_pending_vdev = NULL; /* * Reassess the DTLs. */ vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE); if (error == 0 && !list_is_empty(&spa->spa_config_dirty_list)) { config_changed = B_TRUE; spa->spa_config_generation++; } /* * Verify the metaslab classes. 
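 * (Both the normal and the log class are expected to validate cleanly
 * here; the ASSERTs below encode that invariant.)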
*/ ASSERT(metaslab_class_validate(spa_normal_class(spa)) == 0); ASSERT(metaslab_class_validate(spa_log_class(spa)) == 0); spa_config_exit(spa, SCL_ALL, spa); /* * Panic the system if the specified tag requires it. This * is useful for ensuring that configurations are updated * transactionally. */ if (zio_injection_enabled) zio_handle_panic_injection(spa, tag, 0); /* * Note: this txg_wait_synced() is important because it ensures * that there won't be more than one config change per txg. * This allows us to use the txg as the generation number. */ if (error == 0) txg_wait_synced(spa->spa_dsl_pool, txg); if (vd != NULL) { ASSERT(!vd->vdev_detached || vd->vdev_dtl_sm == NULL); spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); vdev_free(vd); spa_config_exit(spa, SCL_ALL, spa); } /* * If the config changed, update the config cache. */ if (config_changed) spa_config_sync(spa, B_FALSE, B_TRUE); } /* * Unlock the spa_t after adding or removing a vdev. Besides undoing the * locking of spa_vdev_enter(), we also want to make sure the transactions have * synced to disk, and then update the global configuration cache with the new * information. */ int spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error) { spa_vdev_config_exit(spa, vd, txg, error, FTAG); mutex_exit(&spa_namespace_lock); mutex_exit(&spa->spa_vdev_top_lock); return (error); } /* * Lock the given spa_t for the purpose of changing vdev state. */ void spa_vdev_state_enter(spa_t *spa, int oplocks) { int locks = SCL_STATE_ALL | oplocks; /* * Root pools may need to read from the underlying devfs filesystem * when opening up a vdev. Unfortunately if we're holding the * SCL_ZIO lock it will result in a deadlock when we try to issue * the read from the root filesystem. Instead we "prefetch" * the associated vnodes that we need prior to opening the * underlying devices and cache them so that we can prevent * any I/O when we are doing the actual open. */ if (spa_is_root(spa)) { int low = locks & ~(SCL_ZIO - 1); int high = locks & ~low; spa_config_enter(spa, high, spa, RW_WRITER); vdev_hold(spa->spa_root_vdev); spa_config_enter(spa, low, spa, RW_WRITER); } else { spa_config_enter(spa, locks, spa, RW_WRITER); } spa->spa_vdev_locks = locks; } int spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error) { boolean_t config_changed = B_FALSE; if (vd != NULL || error == 0) vdev_dtl_reassess(vd ? vd->vdev_top : spa->spa_root_vdev, 0, 0, B_FALSE); if (vd != NULL) { vdev_state_dirty(vd->vdev_top); config_changed = B_TRUE; spa->spa_config_generation++; } if (spa_is_root(spa)) vdev_rele(spa->spa_root_vdev); ASSERT3U(spa->spa_vdev_locks, >=, SCL_STATE_ALL); spa_config_exit(spa, spa->spa_vdev_locks, spa); /* * If anything changed, wait for it to sync. This ensures that, * from the system administrator's perspective, zpool(1M) commands * are synchronous. This is important for things like zpool offline: * when the command completes, you expect no further I/O from ZFS. */ if (vd != NULL) txg_wait_synced(spa->spa_dsl_pool, 0); /* * If the config changed, update the config cache.
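 * Unlike the spa_vdev_exit() path, the namespace lock is not already
 * held here, so it is taken explicitly below around spa_config_sync(),
 * which expects it.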
*/ if (config_changed) { mutex_enter(&spa_namespace_lock); spa_config_sync(spa, B_FALSE, B_TRUE); mutex_exit(&spa_namespace_lock); } return (error); } /* * ========================================================================== * Miscellaneous functions * ========================================================================== */ void spa_activate_mos_feature(spa_t *spa, const char *feature, dmu_tx_t *tx) { if (!nvlist_exists(spa->spa_label_features, feature)) { fnvlist_add_boolean(spa->spa_label_features, feature); /* * When we are creating the pool (tx_txg==TXG_INITIAL), we can't * dirty the vdev config because lock SCL_CONFIG is not held. * Thankfully, in this case we don't need to dirty the config * because it will be written out anyway when we finish * creating the pool. */ if (tx->tx_txg != TXG_INITIAL) vdev_config_dirty(spa->spa_root_vdev); } } void spa_deactivate_mos_feature(spa_t *spa, const char *feature) { if (nvlist_remove_all(spa->spa_label_features, feature) == 0) vdev_config_dirty(spa->spa_root_vdev); } /* * Rename a spa_t. */ int spa_rename(const char *name, const char *newname) { spa_t *spa; int err; /* * Lookup the spa_t and grab the config lock for writing. We need to * actually open the pool so that we can sync out the necessary labels. * It's OK to call spa_open() with the namespace lock held because we * allow recursive calls for other reasons. */ mutex_enter(&spa_namespace_lock); if ((err = spa_open(name, &spa, FTAG)) != 0) { mutex_exit(&spa_namespace_lock); return (err); } spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); avl_remove(&spa_namespace_avl, spa); (void) strlcpy(spa->spa_name, newname, sizeof (spa->spa_name)); avl_add(&spa_namespace_avl, spa); /* * Sync all labels to disk with the new names by marking the root vdev * dirty and waiting for it to sync. It will pick up the new pool name * during the sync. */ vdev_config_dirty(spa->spa_root_vdev); spa_config_exit(spa, SCL_ALL, FTAG); txg_wait_synced(spa->spa_dsl_pool, 0); /* * Sync the updated config cache. */ spa_config_sync(spa, B_FALSE, B_TRUE); spa_close(spa, FTAG); mutex_exit(&spa_namespace_lock); return (0); } /* * Return the spa_t associated with given pool_guid, if it exists. If * device_guid is non-zero, determine whether the pool exists *and* contains * a device with the specified device_guid. */ spa_t * spa_by_guid(uint64_t pool_guid, uint64_t device_guid) { spa_t *spa; avl_tree_t *t = &spa_namespace_avl; ASSERT(MUTEX_HELD(&spa_namespace_lock)); for (spa = avl_first(t); spa != NULL; spa = AVL_NEXT(t, spa)) { if (spa->spa_state == POOL_STATE_UNINITIALIZED) continue; if (spa->spa_root_vdev == NULL) continue; if (spa_guid(spa) == pool_guid) { if (device_guid == 0) break; if (vdev_lookup_by_guid(spa->spa_root_vdev, device_guid) != NULL) break; /* * Check any devices we may be in the process of adding. */ if (spa->spa_pending_vdev) { if (vdev_lookup_by_guid(spa->spa_pending_vdev, device_guid) != NULL) break; } } } return (spa); } /* * Determine whether a pool with the given pool_guid exists. 
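 * A device_guid of 0 asks only whether the pool itself exists; a
 * nonzero device_guid additionally requires that device (including any
 * pending vdev) to be present, per spa_by_guid() above.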
*/ boolean_t spa_guid_exists(uint64_t pool_guid, uint64_t device_guid) { return (spa_by_guid(pool_guid, device_guid) != NULL); } char * spa_strdup(const char *s) { size_t len; char *new; len = strlen(s); new = kmem_alloc(len + 1, KM_SLEEP); bcopy(s, new, len); new[len] = '\0'; return (new); } void spa_strfree(char *s) { kmem_free(s, strlen(s) + 1); } uint64_t spa_get_random(uint64_t range) { uint64_t r; ASSERT(range != 0); (void) random_get_pseudo_bytes((void *)&r, sizeof (uint64_t)); return (r % range); } uint64_t spa_generate_guid(spa_t *spa) { uint64_t guid = spa_get_random(-1ULL); if (spa != NULL) { while (guid == 0 || spa_guid_exists(spa_guid(spa), guid)) guid = spa_get_random(-1ULL); } else { while (guid == 0 || spa_guid_exists(guid, 0)) guid = spa_get_random(-1ULL); } return (guid); } void snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp) { char type[256]; char *checksum = NULL; char *compress = NULL; if (bp != NULL) { if (BP_GET_TYPE(bp) & DMU_OT_NEWTYPE) { dmu_object_byteswap_t bswap = DMU_OT_BYTESWAP(BP_GET_TYPE(bp)); (void) snprintf(type, sizeof (type), "bswap %s %s", DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) ? "metadata" : "data", dmu_ot_byteswap[bswap].ob_name); } else { (void) strlcpy(type, dmu_ot[BP_GET_TYPE(bp)].ot_name, sizeof (type)); } if (!BP_IS_EMBEDDED(bp)) { checksum = zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name; } compress = zio_compress_table[BP_GET_COMPRESS(bp)].ci_name; } SNPRINTF_BLKPTR(snprintf, ' ', buf, buflen, bp, type, checksum, compress); } void spa_freeze(spa_t *spa) { uint64_t freeze_txg = 0; spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); if (spa->spa_freeze_txg == UINT64_MAX) { freeze_txg = spa_last_synced_txg(spa) + TXG_SIZE; spa->spa_freeze_txg = freeze_txg; } spa_config_exit(spa, SCL_ALL, FTAG); if (freeze_txg != 0) txg_wait_synced(spa_get_dsl(spa), freeze_txg); } void zfs_panic_recover(const char *fmt, ...) { va_list adx; va_start(adx, fmt); vcmn_err(zfs_recover ? CE_WARN : CE_PANIC, fmt, adx); va_end(adx); } /* * This is a stripped-down version of strtoull, suitable only for converting * lowercase hexadecimal numbers that don't overflow. */ uint64_t strtonum(const char *str, char **nptr) { uint64_t val = 0; char c; int digit; while ((c = *str) != '\0') { if (c >= '0' && c <= '9') digit = c - '0'; else if (c >= 'a' && c <= 'f') digit = 10 + c - 'a'; else break; val *= 16; val += digit; str++; } if (nptr) *nptr = (char *)str; return (val); } /* * ========================================================================== * Accessor functions * ========================================================================== */ boolean_t spa_shutting_down(spa_t *spa) { return (spa->spa_async_suspended); } dsl_pool_t * spa_get_dsl(spa_t *spa) { return (spa->spa_dsl_pool); } boolean_t spa_is_initializing(spa_t *spa) { return (spa->spa_is_initializing); } blkptr_t * spa_get_rootblkptr(spa_t *spa) { return (&spa->spa_ubsync.ub_rootbp); } void spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp) { spa->spa_uberblock.ub_rootbp = *bp; } void spa_altroot(spa_t *spa, char *buf, size_t buflen) { if (spa->spa_root == NULL) buf[0] = '\0'; else (void) strncpy(buf, spa->spa_root, buflen); } int spa_sync_pass(spa_t *spa) { return (spa->spa_sync_pass); } char * spa_name(spa_t *spa) { return (spa->spa_name); } uint64_t spa_guid(spa_t *spa) { dsl_pool_t *dp = spa_get_dsl(spa); uint64_t guid; /* * If we fail to parse the config during spa_load(), we can go through * the error path (which posts an ereport) and end up here with no root * vdev. 
We stash the original pool guid in 'spa_config_guid' to handle * this case. */ if (spa->spa_root_vdev == NULL) return (spa->spa_config_guid); guid = spa->spa_last_synced_guid != 0 ? spa->spa_last_synced_guid : spa->spa_root_vdev->vdev_guid; /* * Return the most recently synced out guid unless we're * in syncing context. */ if (dp && dsl_pool_sync_context(dp)) return (spa->spa_root_vdev->vdev_guid); else return (guid); } uint64_t spa_load_guid(spa_t *spa) { /* * This is a GUID that exists solely as a reference for the * purposes of the arc. It is generated at load time, and * is never written to persistent storage. */ return (spa->spa_load_guid); } uint64_t spa_last_synced_txg(spa_t *spa) { return (spa->spa_ubsync.ub_txg); } uint64_t spa_first_txg(spa_t *spa) { return (spa->spa_first_txg); } uint64_t spa_syncing_txg(spa_t *spa) { return (spa->spa_syncing_txg); } pool_state_t spa_state(spa_t *spa) { return (spa->spa_state); } spa_load_state_t spa_load_state(spa_t *spa) { return (spa->spa_load_state); } uint64_t spa_freeze_txg(spa_t *spa) { return (spa->spa_freeze_txg); } /* ARGSUSED */ uint64_t spa_get_asize(spa_t *spa, uint64_t lsize) { return (lsize * spa_asize_inflation); } /* * Return the amount of slop space in bytes. It is 1/32 of the pool (3.125%), * or at least 32MB. * * See the comment above spa_slop_shift for details. */ uint64_t -spa_get_slop_space(spa_t *spa) { +spa_get_slop_space(spa_t *spa) +{ uint64_t space = spa_get_dspace(spa); return (MAX(space >> spa_slop_shift, SPA_MINDEVSIZE >> 1)); } uint64_t spa_get_dspace(spa_t *spa) { return (spa->spa_dspace); } void spa_update_dspace(spa_t *spa) { spa->spa_dspace = metaslab_class_get_dspace(spa_normal_class(spa)) + ddt_get_dedup_dspace(spa); } /* * Return the failure mode that has been set for this pool. The default * behavior will be to block all I/Os when a complete failure occurs. */ uint8_t spa_get_failmode(spa_t *spa) { return (spa->spa_failmode); } boolean_t spa_suspended(spa_t *spa) { return (spa->spa_suspended); } uint64_t spa_version(spa_t *spa) { return (spa->spa_ubsync.ub_version); } boolean_t spa_deflate(spa_t *spa) { return (spa->spa_deflate); } metaslab_class_t * spa_normal_class(spa_t *spa) { return (spa->spa_normal_class); } metaslab_class_t * spa_log_class(spa_t *spa) { return (spa->spa_log_class); } void spa_evicting_os_register(spa_t *spa, objset_t *os) { mutex_enter(&spa->spa_evicting_os_lock); list_insert_head(&spa->spa_evicting_os_list, os); mutex_exit(&spa->spa_evicting_os_lock); } void spa_evicting_os_deregister(spa_t *spa, objset_t *os) { mutex_enter(&spa->spa_evicting_os_lock); list_remove(&spa->spa_evicting_os_list, os); cv_broadcast(&spa->spa_evicting_os_cv); mutex_exit(&spa->spa_evicting_os_lock); } void spa_evicting_os_wait(spa_t *spa) { mutex_enter(&spa->spa_evicting_os_lock); while (!list_is_empty(&spa->spa_evicting_os_list)) cv_wait(&spa->spa_evicting_os_cv, &spa->spa_evicting_os_lock); mutex_exit(&spa->spa_evicting_os_lock); dmu_buf_user_evict_wait(); } int spa_max_replication(spa_t *spa) { /* * As of SPA_VERSION == SPA_VERSION_DITTO_BLOCKS, we are able to * handle BPs with more than one DVA allocated. Set our max * replication level accordingly.
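 * In other words: pools older than SPA_VERSION_DITTO_BLOCKS are capped
 * at a single DVA per block pointer, while newer pools allow up to
 * MIN(SPA_DVAS_PER_BP, spa_max_replication_override).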
*/ if (spa_version(spa) < SPA_VERSION_DITTO_BLOCKS) return (1); return (MIN(SPA_DVAS_PER_BP, spa_max_replication_override)); } int spa_prev_software_version(spa_t *spa) { return (spa->spa_prev_software_version); } uint64_t spa_deadman_synctime(spa_t *spa) { return (spa->spa_deadman_synctime); } uint64_t dva_get_dsize_sync(spa_t *spa, const dva_t *dva) { uint64_t asize = DVA_GET_ASIZE(dva); uint64_t dsize = asize; ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); if (asize != 0 && spa->spa_deflate) { vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva)); if (vd != NULL) dsize = (asize >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio; } return (dsize); } uint64_t bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp) { uint64_t dsize = 0; int d; for (d = 0; d < BP_GET_NDVAS(bp); d++) dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]); return (dsize); } uint64_t bp_get_dsize(spa_t *spa, const blkptr_t *bp) { uint64_t dsize = 0; int d; spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); for (d = 0; d < BP_GET_NDVAS(bp); d++) dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]); spa_config_exit(spa, SCL_VDEV, FTAG); return (dsize); } /* * ========================================================================== * Initialization and Termination * ========================================================================== */ static int spa_name_compare(const void *a1, const void *a2) { const spa_t *s1 = a1; const spa_t *s2 = a2; int s; s = strcmp(s1->spa_name, s2->spa_name); return (AVL_ISIGN(s)); } void spa_boot_init(void) { spa_config_load(); } void spa_init(int mode) { mutex_init(&spa_namespace_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa_spare_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa_l2cache_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&spa_namespace_cv, NULL, CV_DEFAULT, NULL); avl_create(&spa_namespace_avl, spa_name_compare, sizeof (spa_t), offsetof(spa_t, spa_avl)); avl_create(&spa_spare_avl, spa_spare_compare, sizeof (spa_aux_t), offsetof(spa_aux_t, aux_avl)); avl_create(&spa_l2cache_avl, spa_l2cache_compare, sizeof (spa_aux_t), offsetof(spa_aux_t, aux_avl)); spa_mode_global = mode; #ifndef _KERNEL if (spa_mode_global != FREAD && dprintf_find_string("watch")) { struct sigaction sa; sa.sa_flags = SA_SIGINFO; sigemptyset(&sa.sa_mask); sa.sa_sigaction = arc_buf_sigsegv; if (sigaction(SIGSEGV, &sa, NULL) == -1) { perror("could not enable watchpoints: " "sigaction(SIGSEGV, ...) = "); } else { arc_watch = B_TRUE; } } #endif fm_init(); refcount_init(); unique_init(); range_tree_init(); metaslab_alloc_trace_init(); ddt_init(); zio_init(); dmu_init(); zil_init(); vdev_cache_stat_init(); vdev_raidz_math_init(); vdev_file_init(); zfs_prop_init(); zpool_prop_init(); zpool_feature_init(); spa_config_load(); l2arc_start(); } void spa_fini(void) { l2arc_stop(); spa_evict_all(); vdev_file_fini(); vdev_cache_stat_fini(); vdev_raidz_math_fini(); zil_fini(); dmu_fini(); zio_fini(); ddt_fini(); metaslab_alloc_trace_fini(); range_tree_fini(); unique_fini(); refcount_fini(); fm_fini(); avl_destroy(&spa_namespace_avl); avl_destroy(&spa_spare_avl); avl_destroy(&spa_l2cache_avl); cv_destroy(&spa_namespace_cv); mutex_destroy(&spa_namespace_lock); mutex_destroy(&spa_spare_lock); mutex_destroy(&spa_l2cache_lock); } /* * Return whether this pool has slogs. No locking needed. 
* It's not a problem if the wrong answer is returned as it's only for * performance and not correctness */ boolean_t spa_has_slogs(spa_t *spa) { return (spa->spa_log_class->mc_rotor != NULL); } spa_log_state_t spa_get_log_state(spa_t *spa) { return (spa->spa_log_state); } void spa_set_log_state(spa_t *spa, spa_log_state_t state) { spa->spa_log_state = state; } boolean_t spa_is_root(spa_t *spa) { return (spa->spa_is_root); } boolean_t spa_writeable(spa_t *spa) { return (!!(spa->spa_mode & FWRITE)); } /* * Returns true if there is a pending sync task in any of the current * syncing txg, the current quiescing txg, or the current open txg. */ boolean_t spa_has_pending_synctask(spa_t *spa) { return (!txg_all_lists_empty(&spa->spa_dsl_pool->dp_sync_tasks)); } int spa_mode(spa_t *spa) { return (spa->spa_mode); } uint64_t spa_bootfs(spa_t *spa) { return (spa->spa_bootfs); } uint64_t spa_delegation(spa_t *spa) { return (spa->spa_delegation); } objset_t * spa_meta_objset(spa_t *spa) { return (spa->spa_meta_objset); } enum zio_checksum spa_dedup_checksum(spa_t *spa) { return (spa->spa_dedup_checksum); } /* * Reset pool scan stat per scan pass (or reboot). */ void spa_scan_stat_init(spa_t *spa) { /* data not stored on disk */ spa->spa_scan_pass_start = gethrestime_sec(); spa->spa_scan_pass_exam = 0; vdev_scan_stat_init(spa->spa_root_vdev); } /* * Get scan stats for zpool status reports */ int spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps) { dsl_scan_t *scn = spa->spa_dsl_pool ? spa->spa_dsl_pool->dp_scan : NULL; if (scn == NULL || scn->scn_phys.scn_func == POOL_SCAN_NONE) return (SET_ERROR(ENOENT)); bzero(ps, sizeof (pool_scan_stat_t)); /* data stored on disk */ ps->pss_func = scn->scn_phys.scn_func; ps->pss_start_time = scn->scn_phys.scn_start_time; ps->pss_end_time = scn->scn_phys.scn_end_time; ps->pss_to_examine = scn->scn_phys.scn_to_examine; ps->pss_examined = scn->scn_phys.scn_examined; ps->pss_to_process = scn->scn_phys.scn_to_process; ps->pss_processed = scn->scn_phys.scn_processed; ps->pss_errors = scn->scn_phys.scn_errors; ps->pss_state = scn->scn_phys.scn_state; /* data not stored on disk */ ps->pss_pass_start = spa->spa_scan_pass_start; ps->pss_pass_exam = spa->spa_scan_pass_exam; return (0); } boolean_t spa_debug_enabled(spa_t *spa) { return (spa->spa_debug); } int spa_maxblocksize(spa_t *spa) { if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) return (SPA_MAXBLOCKSIZE); else return (SPA_OLD_MAXBLOCKSIZE); } int spa_maxdnodesize(spa_t *spa) { if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) return (DNODE_MAX_SIZE); else return (DNODE_MIN_SIZE); } #if defined(_KERNEL) && defined(HAVE_SPL) /* Namespace manipulation */ EXPORT_SYMBOL(spa_lookup); EXPORT_SYMBOL(spa_add); EXPORT_SYMBOL(spa_remove); EXPORT_SYMBOL(spa_next); /* Refcount functions */ EXPORT_SYMBOL(spa_open_ref); EXPORT_SYMBOL(spa_close); EXPORT_SYMBOL(spa_refcount_zero); /* Pool configuration lock */ EXPORT_SYMBOL(spa_config_tryenter); EXPORT_SYMBOL(spa_config_enter); EXPORT_SYMBOL(spa_config_exit); EXPORT_SYMBOL(spa_config_held); /* Pool vdev add/remove lock */ EXPORT_SYMBOL(spa_vdev_enter); EXPORT_SYMBOL(spa_vdev_exit); /* Pool vdev state change lock */ EXPORT_SYMBOL(spa_vdev_state_enter); EXPORT_SYMBOL(spa_vdev_state_exit); /* Accessor functions */ EXPORT_SYMBOL(spa_shutting_down); EXPORT_SYMBOL(spa_get_dsl); EXPORT_SYMBOL(spa_get_rootblkptr); EXPORT_SYMBOL(spa_set_rootblkptr); EXPORT_SYMBOL(spa_altroot); EXPORT_SYMBOL(spa_sync_pass); EXPORT_SYMBOL(spa_name); EXPORT_SYMBOL(spa_guid); 
EXPORT_SYMBOL(spa_last_synced_txg); EXPORT_SYMBOL(spa_first_txg); EXPORT_SYMBOL(spa_syncing_txg); EXPORT_SYMBOL(spa_version); EXPORT_SYMBOL(spa_state); EXPORT_SYMBOL(spa_load_state); EXPORT_SYMBOL(spa_freeze_txg); EXPORT_SYMBOL(spa_get_asize); EXPORT_SYMBOL(spa_get_dspace); EXPORT_SYMBOL(spa_update_dspace); EXPORT_SYMBOL(spa_deflate); EXPORT_SYMBOL(spa_normal_class); EXPORT_SYMBOL(spa_log_class); EXPORT_SYMBOL(spa_max_replication); EXPORT_SYMBOL(spa_prev_software_version); EXPORT_SYMBOL(spa_get_failmode); EXPORT_SYMBOL(spa_suspended); EXPORT_SYMBOL(spa_bootfs); EXPORT_SYMBOL(spa_delegation); EXPORT_SYMBOL(spa_meta_objset); EXPORT_SYMBOL(spa_maxblocksize); EXPORT_SYMBOL(spa_maxdnodesize); /* Miscellaneous support routines */ EXPORT_SYMBOL(spa_rename); EXPORT_SYMBOL(spa_guid_exists); EXPORT_SYMBOL(spa_strdup); EXPORT_SYMBOL(spa_strfree); EXPORT_SYMBOL(spa_get_random); EXPORT_SYMBOL(spa_generate_guid); EXPORT_SYMBOL(snprintf_blkptr); EXPORT_SYMBOL(spa_freeze); EXPORT_SYMBOL(spa_upgrade); EXPORT_SYMBOL(spa_evict_all); EXPORT_SYMBOL(spa_lookup_by_guid); EXPORT_SYMBOL(spa_has_spare); EXPORT_SYMBOL(dva_get_dsize_sync); EXPORT_SYMBOL(bp_get_dsize_sync); EXPORT_SYMBOL(bp_get_dsize); EXPORT_SYMBOL(spa_has_slogs); EXPORT_SYMBOL(spa_is_root); EXPORT_SYMBOL(spa_writeable); EXPORT_SYMBOL(spa_mode); EXPORT_SYMBOL(spa_namespace_lock); /* BEGIN CSTYLED */ module_param(zfs_flags, uint, 0644); MODULE_PARM_DESC(zfs_flags, "Set additional debugging flags"); module_param(zfs_recover, int, 0644); MODULE_PARM_DESC(zfs_recover, "Set to attempt to recover from fatal errors"); module_param(zfs_free_leak_on_eio, int, 0644); MODULE_PARM_DESC(zfs_free_leak_on_eio, "Set to ignore IO errors during free and permanently leak the space"); module_param(zfs_deadman_synctime_ms, ulong, 0644); MODULE_PARM_DESC(zfs_deadman_synctime_ms, "Expiration time in milliseconds"); module_param(zfs_deadman_enabled, int, 0644); MODULE_PARM_DESC(zfs_deadman_enabled, "Enable deadman timer"); module_param(spa_asize_inflation, int, 0644); MODULE_PARM_DESC(spa_asize_inflation, "SPA size estimate multiplication factor"); module_param(spa_slop_shift, int, 0644); MODULE_PARM_DESC(spa_slop_shift, "Reserved free space in pool"); /* END CSTYLED */ #endif diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 77bfef3a0a9b..c8f896276aa6 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -1,3684 +1,3685 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2011, 2015 by Delphix. All rights reserved. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * When a vdev is added, it will be divided into approximately (but no * more than) this number of metaslabs. */ int metaslabs_per_vdev = 200; /* * Virtual device management. */ static vdev_ops_t *vdev_ops_table[] = { &vdev_root_ops, &vdev_raidz_ops, &vdev_mirror_ops, &vdev_replacing_ops, &vdev_spare_ops, &vdev_disk_ops, &vdev_file_ops, &vdev_missing_ops, &vdev_hole_ops, NULL }; /* * Given a vdev type, return the appropriate ops vector. */ static vdev_ops_t * vdev_getops(const char *type) { vdev_ops_t *ops, **opspp; for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++) if (strcmp(ops->vdev_op_type, type) == 0) break; return (ops); } /* * Default asize function: return the MAX of psize with the asize of * all children. This is what's used by anything other than RAID-Z. */ uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize) { uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift); uint64_t csize; int c; for (c = 0; c < vd->vdev_children; c++) { csize = vdev_psize_to_asize(vd->vdev_child[c], psize); asize = MAX(asize, csize); } return (asize); } /* * Get the minimum allocatable size. We define the allocatable size as * the vdev's asize rounded to the nearest metaslab. This allows us to * replace or attach devices which don't have the same physical size but * can still satisfy the same number of allocations. */ uint64_t vdev_get_min_asize(vdev_t *vd) { vdev_t *pvd = vd->vdev_parent; /* * If our parent is NULL (inactive spare or cache) or is the root, * just return our own asize. */ if (pvd == NULL) return (vd->vdev_asize); /* * The top-level vdev just returns the allocatable size rounded * to the nearest metaslab. */ if (vd == vd->vdev_top) return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift)); /* * The allocatable space for a raidz vdev is N * sizeof(smallest child), * so each child must provide at least 1/Nth of its asize. 
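 *
 * Worked example (illustrative numbers): a 4-child raidz top-level
 * vdev whose min_asize is 4G requires each child to supply at least
 * 4G / 4 = 1G.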
*/ if (pvd->vdev_ops == &vdev_raidz_ops) return (pvd->vdev_min_asize / pvd->vdev_children); return (pvd->vdev_min_asize); } void vdev_set_min_asize(vdev_t *vd) { int c; vd->vdev_min_asize = vdev_get_min_asize(vd); for (c = 0; c < vd->vdev_children; c++) vdev_set_min_asize(vd->vdev_child[c]); } vdev_t * vdev_lookup_top(spa_t *spa, uint64_t vdev) { vdev_t *rvd = spa->spa_root_vdev; ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); if (vdev < rvd->vdev_children) { ASSERT(rvd->vdev_child[vdev] != NULL); return (rvd->vdev_child[vdev]); } return (NULL); } vdev_t * vdev_lookup_by_guid(vdev_t *vd, uint64_t guid) { vdev_t *mvd; int c; if (vd->vdev_guid == guid) return (vd); for (c = 0; c < vd->vdev_children; c++) if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) != NULL) return (mvd); return (NULL); } static int vdev_count_leaves_impl(vdev_t *vd) { int n = 0; int c; if (vd->vdev_ops->vdev_op_leaf) return (1); for (c = 0; c < vd->vdev_children; c++) n += vdev_count_leaves_impl(vd->vdev_child[c]); return (n); } int vdev_count_leaves(spa_t *spa) { return (vdev_count_leaves_impl(spa->spa_root_vdev)); } void vdev_add_child(vdev_t *pvd, vdev_t *cvd) { size_t oldsize, newsize; uint64_t id = cvd->vdev_id; vdev_t **newchild; ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); ASSERT(cvd->vdev_parent == NULL); cvd->vdev_parent = pvd; if (pvd == NULL) return; ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL); oldsize = pvd->vdev_children * sizeof (vdev_t *); pvd->vdev_children = MAX(pvd->vdev_children, id + 1); newsize = pvd->vdev_children * sizeof (vdev_t *); newchild = kmem_alloc(newsize, KM_SLEEP); if (pvd->vdev_child != NULL) { bcopy(pvd->vdev_child, newchild, oldsize); kmem_free(pvd->vdev_child, oldsize); } pvd->vdev_child = newchild; pvd->vdev_child[id] = cvd; cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd); ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL); /* * Walk up all ancestors to update guid sum. */ for (; pvd != NULL; pvd = pvd->vdev_parent) pvd->vdev_guid_sum += cvd->vdev_guid_sum; } void vdev_remove_child(vdev_t *pvd, vdev_t *cvd) { int c; uint_t id = cvd->vdev_id; ASSERT(cvd->vdev_parent == pvd); if (pvd == NULL) return; ASSERT(id < pvd->vdev_children); ASSERT(pvd->vdev_child[id] == cvd); pvd->vdev_child[id] = NULL; cvd->vdev_parent = NULL; for (c = 0; c < pvd->vdev_children; c++) if (pvd->vdev_child[c]) break; if (c == pvd->vdev_children) { kmem_free(pvd->vdev_child, c * sizeof (vdev_t *)); pvd->vdev_child = NULL; pvd->vdev_children = 0; } /* * Walk up all ancestors to update guid sum. */ for (; pvd != NULL; pvd = pvd->vdev_parent) pvd->vdev_guid_sum -= cvd->vdev_guid_sum; } /* * Remove any holes in the child array. */ void vdev_compact_children(vdev_t *pvd) { vdev_t **newchild, *cvd; int oldc = pvd->vdev_children; int newc; int c; ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); for (c = newc = 0; c < oldc; c++) if (pvd->vdev_child[c]) newc++; newchild = kmem_zalloc(newc * sizeof (vdev_t *), KM_SLEEP); for (c = newc = 0; c < oldc; c++) { if ((cvd = pvd->vdev_child[c]) != NULL) { newchild[newc] = cvd; cvd->vdev_id = newc++; } } kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *)); pvd->vdev_child = newchild; pvd->vdev_children = newc; } /* * Allocate and minimally initialize a vdev_t. 
*/ vdev_t * vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) { vdev_t *vd; int t; vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP); if (spa->spa_root_vdev == NULL) { ASSERT(ops == &vdev_root_ops); spa->spa_root_vdev = vd; spa->spa_load_guid = spa_generate_guid(NULL); } if (guid == 0 && ops != &vdev_hole_ops) { if (spa->spa_root_vdev == vd) { /* * The root vdev's guid will also be the pool guid, * which must be unique among all pools. */ guid = spa_generate_guid(NULL); } else { /* * Any other vdev's guid must be unique within the pool. */ guid = spa_generate_guid(spa); } ASSERT(!spa_guid_exists(spa_guid(spa), guid)); } vd->vdev_spa = spa; vd->vdev_id = id; vd->vdev_guid = guid; vd->vdev_guid_sum = guid; vd->vdev_ops = ops; vd->vdev_state = VDEV_STATE_CLOSED; vd->vdev_ishole = (ops == &vdev_hole_ops); /* * Initialize rate limit structs for events. We rate limit ZIO delay * and checksum events so that we don't overwhelm ZED with thousands * of events when a disk is acting up. */ zfs_ratelimit_init(&vd->vdev_delay_rl, DELAYS_PER_SECOND, 1); zfs_ratelimit_init(&vd->vdev_checksum_rl, CHECKSUMS_PER_SECOND, 1); list_link_init(&vd->vdev_config_dirty_node); list_link_init(&vd->vdev_state_dirty_node); mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_NOLOCKDEP, NULL); mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_queue_lock, NULL, MUTEX_DEFAULT, NULL); for (t = 0; t < DTL_TYPES; t++) { vd->vdev_dtl[t] = range_tree_create(NULL, NULL, &vd->vdev_dtl_lock); } txg_list_create(&vd->vdev_ms_list, offsetof(struct metaslab, ms_txg_node)); txg_list_create(&vd->vdev_dtl_list, offsetof(struct vdev, vdev_dtl_node)); vd->vdev_stat.vs_timestamp = gethrtime(); vdev_queue_init(vd); vdev_cache_init(vd); return (vd); } /* * Allocate a new vdev. The 'alloctype' is used to control whether we are * creating a new vdev or loading an existing one - the behavior is slightly * different for each case. */ int vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, int alloctype) { vdev_ops_t *ops; char *type; uint64_t guid = 0, islog, nparity; vdev_t *vd; ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0) return (SET_ERROR(EINVAL)); if ((ops = vdev_getops(type)) == NULL) return (SET_ERROR(EINVAL)); /* * If this is a load, get the vdev guid from the nvlist. * Otherwise, vdev_alloc_common() will generate one for us. */ if (alloctype == VDEV_ALLOC_LOAD) { uint64_t label_id; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) || label_id != id) return (SET_ERROR(EINVAL)); if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (SET_ERROR(EINVAL)); } else if (alloctype == VDEV_ALLOC_SPARE) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (SET_ERROR(EINVAL)); } else if (alloctype == VDEV_ALLOC_L2CACHE) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (SET_ERROR(EINVAL)); } else if (alloctype == VDEV_ALLOC_ROOTPOOL) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (SET_ERROR(EINVAL)); } /* * The first allocated vdev must be of type 'root'. */ if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL) return (SET_ERROR(EINVAL)); /* * Determine whether we're a log vdev. 
*/ islog = 0; (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog); if (islog && spa_version(spa) < SPA_VERSION_SLOGS) return (SET_ERROR(ENOTSUP)); if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES) return (SET_ERROR(ENOTSUP)); /* * Set the nparity property for RAID-Z vdevs. */ nparity = -1ULL; if (ops == &vdev_raidz_ops) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) { if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY) return (SET_ERROR(EINVAL)); /* * Previous versions could only support 1 or 2 parity * devices. */ if (nparity > 1 && spa_version(spa) < SPA_VERSION_RAIDZ2) return (SET_ERROR(ENOTSUP)); if (nparity > 2 && spa_version(spa) < SPA_VERSION_RAIDZ3) return (SET_ERROR(ENOTSUP)); } else { /* * We require the parity to be specified for SPAs that * support multiple parity levels. */ if (spa_version(spa) >= SPA_VERSION_RAIDZ2) return (SET_ERROR(EINVAL)); /* * Otherwise, we default to 1 parity device for RAID-Z. */ nparity = 1; } } else { nparity = 0; } ASSERT(nparity != -1ULL); vd = vdev_alloc_common(spa, id, guid, ops); vd->vdev_islog = islog; vd->vdev_nparity = nparity; if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0) vd->vdev_path = spa_strdup(vd->vdev_path); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0) vd->vdev_devid = spa_strdup(vd->vdev_devid); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH, &vd->vdev_physpath) == 0) vd->vdev_physpath = spa_strdup(vd->vdev_physpath); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, &vd->vdev_enc_sysfs_path) == 0) vd->vdev_enc_sysfs_path = spa_strdup(vd->vdev_enc_sysfs_path); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &vd->vdev_fru) == 0) vd->vdev_fru = spa_strdup(vd->vdev_fru); /* * Set the whole_disk property. If it's not specified, leave the value * as -1. */ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &vd->vdev_wholedisk) != 0) vd->vdev_wholedisk = -1ULL; /* * Look for the 'not present' flag. This will only be set if the device * was not present at the time of import. */ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, &vd->vdev_not_present); /* * Get the alignment requirement. */ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift); /* * Retrieve the vdev creation time. */ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, &vd->vdev_crtxg); /* * If we're a top-level vdev, try to load the allocation parameters. */ if (parent && !parent->vdev_parent && (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, &vd->vdev_ms_array); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT, &vd->vdev_ms_shift); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE, &vd->vdev_asize); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING, &vd->vdev_removing); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP, &vd->vdev_top_zap); } else { ASSERT0(vd->vdev_top_zap); } if (parent && !parent->vdev_parent && alloctype != VDEV_ALLOC_ATTACH) { ASSERT(alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_ADD || alloctype == VDEV_ALLOC_SPLIT || alloctype == VDEV_ALLOC_ROOTPOOL); vd->vdev_mg = metaslab_group_create(islog ?
spa_log_class(spa) : spa_normal_class(spa), vd); } if (vd->vdev_ops->vdev_op_leaf && (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_LEAF_ZAP, &vd->vdev_leaf_zap); } else { ASSERT0(vd->vdev_leaf_zap); } /* * If we're a leaf vdev, try to load the DTL object and other state. */ if (vd->vdev_ops->vdev_op_leaf && (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE || alloctype == VDEV_ALLOC_ROOTPOOL)) { if (alloctype == VDEV_ALLOC_LOAD) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL, &vd->vdev_dtl_object); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE, &vd->vdev_unspare); } if (alloctype == VDEV_ALLOC_ROOTPOOL) { uint64_t spare = 0; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE, &spare) == 0 && spare) spa_spare_add(vd); } (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, &vd->vdev_offline); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG, &vd->vdev_resilver_txg); /* * When importing a pool, we want to ignore the persistent fault * state, as the diagnosis made on another system may not be * valid in the current context. Local vdevs will * remain in the faulted state. */ if (spa_load_state(spa) == SPA_LOAD_OPEN) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, &vd->vdev_faulted); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED, &vd->vdev_degraded); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, &vd->vdev_removed); if (vd->vdev_faulted || vd->vdev_degraded) { char *aux; vd->vdev_label_aux = VDEV_AUX_ERR_EXCEEDED; if (nvlist_lookup_string(nv, ZPOOL_CONFIG_AUX_STATE, &aux) == 0 && strcmp(aux, "external") == 0) vd->vdev_label_aux = VDEV_AUX_EXTERNAL; } } } /* * Add ourselves to the parent's list of children. */ vdev_add_child(parent, vd); *vdp = vd; return (0); } void vdev_free(vdev_t *vd) { int c, t; spa_t *spa = vd->vdev_spa; /* * vdev_free() implies closing the vdev first. This is simpler than * trying to ensure complicated semantics for all callers. */ vdev_close(vd); ASSERT(!list_link_active(&vd->vdev_config_dirty_node)); ASSERT(!list_link_active(&vd->vdev_state_dirty_node)); /* * Free all children. */ for (c = 0; c < vd->vdev_children; c++) vdev_free(vd->vdev_child[c]); ASSERT(vd->vdev_child == NULL); ASSERT(vd->vdev_guid_sum == vd->vdev_guid); /* * Discard allocation state. */ if (vd->vdev_mg != NULL) { vdev_metaslab_fini(vd); metaslab_group_destroy(vd->vdev_mg); } ASSERT0(vd->vdev_stat.vs_space); ASSERT0(vd->vdev_stat.vs_dspace); ASSERT0(vd->vdev_stat.vs_alloc); /* * Remove this vdev from its parent's child list. */ vdev_remove_child(vd->vdev_parent, vd); ASSERT(vd->vdev_parent == NULL); /* * Clean up vdev structure. 
*/ vdev_queue_fini(vd); vdev_cache_fini(vd); if (vd->vdev_path) spa_strfree(vd->vdev_path); if (vd->vdev_devid) spa_strfree(vd->vdev_devid); if (vd->vdev_physpath) spa_strfree(vd->vdev_physpath); if (vd->vdev_enc_sysfs_path) spa_strfree(vd->vdev_enc_sysfs_path); if (vd->vdev_fru) spa_strfree(vd->vdev_fru); if (vd->vdev_isspare) spa_spare_remove(vd); if (vd->vdev_isl2cache) spa_l2cache_remove(vd); txg_list_destroy(&vd->vdev_ms_list); txg_list_destroy(&vd->vdev_dtl_list); mutex_enter(&vd->vdev_dtl_lock); space_map_close(vd->vdev_dtl_sm); for (t = 0; t < DTL_TYPES; t++) { range_tree_vacate(vd->vdev_dtl[t], NULL, NULL); range_tree_destroy(vd->vdev_dtl[t]); } mutex_exit(&vd->vdev_dtl_lock); mutex_destroy(&vd->vdev_queue_lock); mutex_destroy(&vd->vdev_dtl_lock); mutex_destroy(&vd->vdev_stat_lock); mutex_destroy(&vd->vdev_probe_lock); if (vd == spa->spa_root_vdev) spa->spa_root_vdev = NULL; kmem_free(vd, sizeof (vdev_t)); } /* * Transfer top-level vdev state from svd to tvd. */ static void vdev_top_transfer(vdev_t *svd, vdev_t *tvd) { spa_t *spa = svd->vdev_spa; metaslab_t *msp; vdev_t *vd; int t; ASSERT(tvd == tvd->vdev_top); tvd->vdev_pending_fastwrite = svd->vdev_pending_fastwrite; tvd->vdev_ms_array = svd->vdev_ms_array; tvd->vdev_ms_shift = svd->vdev_ms_shift; tvd->vdev_ms_count = svd->vdev_ms_count; tvd->vdev_top_zap = svd->vdev_top_zap; svd->vdev_ms_array = 0; svd->vdev_ms_shift = 0; svd->vdev_ms_count = 0; svd->vdev_top_zap = 0; if (tvd->vdev_mg) ASSERT3P(tvd->vdev_mg, ==, svd->vdev_mg); tvd->vdev_mg = svd->vdev_mg; tvd->vdev_ms = svd->vdev_ms; svd->vdev_mg = NULL; svd->vdev_ms = NULL; if (tvd->vdev_mg != NULL) tvd->vdev_mg->mg_vd = tvd; tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc; tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space; tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace; svd->vdev_stat.vs_alloc = 0; svd->vdev_stat.vs_space = 0; svd->vdev_stat.vs_dspace = 0; for (t = 0; t < TXG_SIZE; t++) { while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL) (void) txg_list_add(&tvd->vdev_ms_list, msp, t); while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL) (void) txg_list_add(&tvd->vdev_dtl_list, vd, t); if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t)) (void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t); } if (list_link_active(&svd->vdev_config_dirty_node)) { vdev_config_clean(svd); vdev_config_dirty(tvd); } if (list_link_active(&svd->vdev_state_dirty_node)) { vdev_state_clean(svd); vdev_state_dirty(tvd); } tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio; svd->vdev_deflate_ratio = 0; tvd->vdev_islog = svd->vdev_islog; svd->vdev_islog = 0; } static void vdev_top_update(vdev_t *tvd, vdev_t *vd) { int c; if (vd == NULL) return; vd->vdev_top = tvd; for (c = 0; c < vd->vdev_children; c++) vdev_top_update(tvd, vd->vdev_child[c]); } /* * Add a mirror/replacing vdev above an existing vdev. 
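 *
 * Sketch of the topology change (illustrative):
 *
 *	before:	pvd -> cvd
 *	after:	pvd -> mvd (mirror/replacing/spare) -> cvd
 *
 * The new mvd inherits cvd's asize, ashift and state so the tree's
 * geometry is unchanged by the insertion.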
*/ vdev_t * vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops) { spa_t *spa = cvd->vdev_spa; vdev_t *pvd = cvd->vdev_parent; vdev_t *mvd; ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops); mvd->vdev_asize = cvd->vdev_asize; mvd->vdev_min_asize = cvd->vdev_min_asize; mvd->vdev_max_asize = cvd->vdev_max_asize; mvd->vdev_ashift = cvd->vdev_ashift; mvd->vdev_state = cvd->vdev_state; mvd->vdev_crtxg = cvd->vdev_crtxg; vdev_remove_child(pvd, cvd); vdev_add_child(pvd, mvd); cvd->vdev_id = mvd->vdev_children; vdev_add_child(mvd, cvd); vdev_top_update(cvd->vdev_top, cvd->vdev_top); if (mvd == mvd->vdev_top) vdev_top_transfer(cvd, mvd); return (mvd); } /* * Remove a 1-way mirror/replacing vdev from the tree. */ void vdev_remove_parent(vdev_t *cvd) { vdev_t *mvd = cvd->vdev_parent; vdev_t *pvd = mvd->vdev_parent; ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); ASSERT(mvd->vdev_children == 1); ASSERT(mvd->vdev_ops == &vdev_mirror_ops || mvd->vdev_ops == &vdev_replacing_ops || mvd->vdev_ops == &vdev_spare_ops); cvd->vdev_ashift = mvd->vdev_ashift; vdev_remove_child(mvd, cvd); vdev_remove_child(pvd, mvd); /* * If cvd will replace mvd as a top-level vdev, preserve mvd's guid. * Otherwise, we could have detached an offline device, and when we * go to import the pool we'll think we have two top-level vdevs, * instead of a different version of the same top-level vdev. */ if (mvd->vdev_top == mvd) { uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid; cvd->vdev_orig_guid = cvd->vdev_guid; cvd->vdev_guid += guid_delta; cvd->vdev_guid_sum += guid_delta; /* * If pool not set for autoexpand, we need to also preserve * mvd's asize to prevent automatic expansion of cvd. * Otherwise if we are adjusting the mirror by attaching and * detaching children of non-uniform sizes, the mirror could * autoexpand, unexpectedly requiring larger devices to * re-establish the mirror. */ if (!cvd->vdev_spa->spa_autoexpand) cvd->vdev_asize = mvd->vdev_asize; } cvd->vdev_id = mvd->vdev_id; vdev_add_child(pvd, cvd); vdev_top_update(cvd->vdev_top, cvd->vdev_top); if (cvd == cvd->vdev_top) vdev_top_transfer(mvd, cvd); ASSERT(mvd->vdev_children == 0); vdev_free(mvd); } int vdev_metaslab_init(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; uint64_t m; uint64_t oldc = vd->vdev_ms_count; uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift; metaslab_t **mspp; int error; ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER)); /* * This vdev is not being allocated from yet or is a hole. */ if (vd->vdev_ms_shift == 0) return (0); ASSERT(!vd->vdev_ishole); /* * Compute the raidz-deflation ratio. Note, we hard-code * in 128k (1 << 17) because it is the "typical" blocksize. * Even though SPA_MAXBLOCKSIZE changed, this algorithm can not change, * otherwise it would inconsistently account for existing bp's. 
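 *
 * Worked example (illustrative numbers): on a 3-disk raidz1 a 128K
 * psize costs roughly 192K of asize, so the ratio becomes
 * 131072 / (196608 >> SPA_MINBLOCKSHIFT) = 341, and dva_get_dsize_sync()
 * later reports 384 * 341 bytes (roughly 128K) for such a block.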
*/ vd->vdev_deflate_ratio = (1 << 17) / (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT); ASSERT(oldc <= newc); mspp = vmem_zalloc(newc * sizeof (*mspp), KM_SLEEP); if (oldc != 0) { bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp)); vmem_free(vd->vdev_ms, oldc * sizeof (*mspp)); } vd->vdev_ms = mspp; vd->vdev_ms_count = newc; for (m = oldc; m < newc; m++) { uint64_t object = 0; if (txg == 0) { error = dmu_read(mos, vd->vdev_ms_array, m * sizeof (uint64_t), sizeof (uint64_t), &object, DMU_READ_PREFETCH); if (error) return (error); } error = metaslab_init(vd->vdev_mg, m, object, txg, &(vd->vdev_ms[m])); if (error) return (error); } if (txg == 0) spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER); /* * If the vdev is being removed we don't activate * the metaslabs since we want to ensure that no new * allocations are performed on this device. */ if (oldc == 0 && !vd->vdev_removing) metaslab_group_activate(vd->vdev_mg); if (txg == 0) spa_config_exit(spa, SCL_ALLOC, FTAG); return (0); } void vdev_metaslab_fini(vdev_t *vd) { uint64_t m; uint64_t count = vd->vdev_ms_count; if (vd->vdev_ms != NULL) { metaslab_group_passivate(vd->vdev_mg); for (m = 0; m < count; m++) { metaslab_t *msp = vd->vdev_ms[m]; if (msp != NULL) metaslab_fini(msp); } vmem_free(vd->vdev_ms, count * sizeof (metaslab_t *)); vd->vdev_ms = NULL; } ASSERT3U(vd->vdev_pending_fastwrite, ==, 0); } typedef struct vdev_probe_stats { boolean_t vps_readable; boolean_t vps_writeable; int vps_flags; } vdev_probe_stats_t; static void vdev_probe_done(zio_t *zio) { spa_t *spa = zio->io_spa; vdev_t *vd = zio->io_vd; vdev_probe_stats_t *vps = zio->io_private; ASSERT(vd->vdev_probe_zio != NULL); if (zio->io_type == ZIO_TYPE_READ) { if (zio->io_error == 0) vps->vps_readable = 1; if (zio->io_error == 0 && spa_writeable(spa)) { zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd, zio->io_offset, zio->io_size, zio->io_abd, ZIO_CHECKSUM_OFF, vdev_probe_done, vps, ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE)); } else { abd_free(zio->io_abd); } } else if (zio->io_type == ZIO_TYPE_WRITE) { if (zio->io_error == 0) vps->vps_writeable = 1; abd_free(zio->io_abd); } else if (zio->io_type == ZIO_TYPE_NULL) { zio_t *pio; zio_link_t *zl; vd->vdev_cant_read |= !vps->vps_readable; vd->vdev_cant_write |= !vps->vps_writeable; if (vdev_readable(vd) && (vdev_writeable(vd) || !spa_writeable(spa))) { zio->io_error = 0; } else { ASSERT(zio->io_error != 0); zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE, spa, vd, NULL, 0, 0); zio->io_error = SET_ERROR(ENXIO); } mutex_enter(&vd->vdev_probe_lock); ASSERT(vd->vdev_probe_zio == zio); vd->vdev_probe_zio = NULL; mutex_exit(&vd->vdev_probe_lock); zl = NULL; while ((pio = zio_walk_parents(zio, &zl)) != NULL) if (!vdev_accessible(vd, pio)) pio->io_error = SET_ERROR(ENXIO); kmem_free(vps, sizeof (*vps)); } } /* * Determine whether this device is accessible. * * Read and write to several known locations: the pad regions of each * vdev label but the first, which we leave alone in case it contains * a VTOC. */ zio_t * vdev_probe(vdev_t *vd, zio_t *zio) { spa_t *spa = vd->vdev_spa; vdev_probe_stats_t *vps = NULL; zio_t *pio; int l; ASSERT(vd->vdev_ops->vdev_op_leaf); /* * Don't probe the probe. */ if (zio && (zio->io_flags & ZIO_FLAG_PROBE)) return (NULL); /* * To prevent 'probe storms' when a device fails, we create * just one probe i/o at a time. All zios that want to probe * this vdev will become parents of the probe io. 
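 * (The probe touches only the vl_pad2 region of labels 1 through 3;
 * label 0 is skipped, as noted above, in case it holds a VTOC.)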
*/ mutex_enter(&vd->vdev_probe_lock); if ((pio = vd->vdev_probe_zio) == NULL) { vps = kmem_zalloc(sizeof (*vps), KM_SLEEP); vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE | ZIO_FLAG_TRYHARD; if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) { /* * vdev_cant_read and vdev_cant_write can only * transition from TRUE to FALSE when we have the * SCL_ZIO lock as writer; otherwise they can only * transition from FALSE to TRUE. This ensures that * any zio looking at these values can assume that * failures persist for the life of the I/O. That's * important because when a device has intermittent * connectivity problems, we want to ensure that * they're ascribed to the device (ENXIO) and not * the zio (EIO). * * Since we hold SCL_ZIO as writer here, clear both * values so the probe can reevaluate from first * principles. */ vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER; vd->vdev_cant_read = B_FALSE; vd->vdev_cant_write = B_FALSE; } vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd, vdev_probe_done, vps, vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE); /* * We can't change the vdev state in this context, so we * kick off an async task to do it on our behalf. */ if (zio != NULL) { vd->vdev_probe_wanted = B_TRUE; spa_async_request(spa, SPA_ASYNC_PROBE); } } if (zio != NULL) zio_add_child(zio, pio); mutex_exit(&vd->vdev_probe_lock); if (vps == NULL) { ASSERT(zio != NULL); return (NULL); } for (l = 1; l < VDEV_LABELS; l++) { zio_nowait(zio_read_phys(pio, vd, vdev_label_offset(vd->vdev_psize, l, offsetof(vdev_label_t, vl_pad2)), VDEV_PAD_SIZE, abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE), ZIO_CHECKSUM_OFF, vdev_probe_done, vps, ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE)); } if (zio == NULL) return (pio); zio_nowait(pio); return (NULL); } static void vdev_open_child(void *arg) { vdev_t *vd = arg; vd->vdev_open_thread = curthread; vd->vdev_open_error = vdev_open(vd); vd->vdev_open_thread = NULL; } static boolean_t vdev_uses_zvols(vdev_t *vd) { int c; #ifdef _KERNEL if (zvol_is_zvol(vd->vdev_path)) return (B_TRUE); #endif for (c = 0; c < vd->vdev_children; c++) if (vdev_uses_zvols(vd->vdev_child[c])) return (B_TRUE); return (B_FALSE); } void vdev_open_children(vdev_t *vd) { taskq_t *tq; int children = vd->vdev_children; int c; /* * in order to handle pools on top of zvols, do the opens * in a single thread so that the same thread holds the * spa_namespace_lock */ if (vdev_uses_zvols(vd)) { retry_sync: for (c = 0; c < children; c++) vd->vdev_child[c]->vdev_open_error = vdev_open(vd->vdev_child[c]); } else { tq = taskq_create("vdev_open", children, minclsyspri, children, children, TASKQ_PREPOPULATE); if (tq == NULL) goto retry_sync; for (c = 0; c < children; c++) VERIFY(taskq_dispatch(tq, vdev_open_child, vd->vdev_child[c], TQ_SLEEP) != TASKQID_INVALID); taskq_destroy(tq); } vd->vdev_nonrot = B_TRUE; for (c = 0; c < children; c++) vd->vdev_nonrot &= vd->vdev_child[c]->vdev_nonrot; } /* * Prepare a virtual device for access. 
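 * The sequence below: honor any persistent fault/offline state, invoke
 * the ops-specific open, recheck the fault state, align and sanity-check
 * the reported sizes, adopt or verify the ashift, and finally probe the
 * device before declaring it open for business.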
*/ int vdev_open(vdev_t *vd) { spa_t *spa = vd->vdev_spa; int error; uint64_t osize = 0; uint64_t max_osize = 0; uint64_t asize, max_asize, psize; uint64_t ashift = 0; int c; ASSERT(vd->vdev_open_thread == curthread || spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); ASSERT(vd->vdev_state == VDEV_STATE_CLOSED || vd->vdev_state == VDEV_STATE_CANT_OPEN || vd->vdev_state == VDEV_STATE_OFFLINE); vd->vdev_stat.vs_aux = VDEV_AUX_NONE; vd->vdev_cant_read = B_FALSE; vd->vdev_cant_write = B_FALSE; vd->vdev_min_asize = vdev_get_min_asize(vd); /* * If this vdev is not removed, check its fault status. If it's * faulted, bail out of the open. */ if (!vd->vdev_removed && vd->vdev_faulted) { ASSERT(vd->vdev_children == 0); ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED || vd->vdev_label_aux == VDEV_AUX_EXTERNAL); vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, vd->vdev_label_aux); return (SET_ERROR(ENXIO)); } else if (vd->vdev_offline) { ASSERT(vd->vdev_children == 0); vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE); return (SET_ERROR(ENXIO)); } error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize, &ashift); /* * Reset the vdev_reopening flag so that we actually close * the vdev on error. */ vd->vdev_reopening = B_FALSE; if (zio_injection_enabled && error == 0) error = zio_handle_device_injection(vd, NULL, ENXIO); if (error) { if (vd->vdev_removed && vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED) vd->vdev_removed = B_FALSE; vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, vd->vdev_stat.vs_aux); return (error); } vd->vdev_removed = B_FALSE; /* * Recheck the faulted flag now that we have confirmed that * the vdev is accessible. If we're faulted, bail. */ if (vd->vdev_faulted) { ASSERT(vd->vdev_children == 0); ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED || vd->vdev_label_aux == VDEV_AUX_EXTERNAL); vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, vd->vdev_label_aux); return (SET_ERROR(ENXIO)); } if (vd->vdev_degraded) { ASSERT(vd->vdev_children == 0); vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, VDEV_AUX_ERR_EXCEEDED); } else { vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0); } /* * For hole or missing vdevs we just return success. */ if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) return (0); for (c = 0; c < vd->vdev_children; c++) { if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) { vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); break; } } osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t)); max_osize = P2ALIGN(max_osize, (uint64_t)sizeof (vdev_label_t)); if (vd->vdev_children == 0) { if (osize < SPA_MINDEVSIZE) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_TOO_SMALL); return (SET_ERROR(EOVERFLOW)); } psize = osize; asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE); max_asize = max_osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE); } else { if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_TOO_SMALL); return (SET_ERROR(EOVERFLOW)); } psize = 0; asize = osize; max_asize = max_osize; } vd->vdev_psize = psize; /* * Make sure the allocatable size hasn't shrunk. */ if (asize < vd->vdev_min_asize) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_LABEL); return (SET_ERROR(EINVAL)); } if (vd->vdev_asize == 0) { /* * This is the first-ever open, so use the computed values. * For compatibility, a different ashift can be requested. 
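 * (For example, a pool created with ashift=12 keeps that value here
 * even if the device itself reported 512-byte sectors, i.e. ashift=9.)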
*/ vd->vdev_asize = asize; vd->vdev_max_asize = max_asize; if (vd->vdev_ashift == 0) vd->vdev_ashift = ashift; } else { /* * Detect if the alignment requirement has increased. * We don't want to make the pool unavailable, just * post an event instead. */ if (ashift > vd->vdev_top->vdev_ashift && vd->vdev_ops->vdev_op_leaf) { zfs_ereport_post(FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT, spa, vd, NULL, 0, 0); } vd->vdev_max_asize = max_asize; } /* * If all children are healthy and the asize has increased, * then we've experienced dynamic LUN growth. If automatic * expansion is enabled then use the additional space. */ if (vd->vdev_state == VDEV_STATE_HEALTHY && asize > vd->vdev_asize && (vd->vdev_expanding || spa->spa_autoexpand)) vd->vdev_asize = asize; vdev_set_min_asize(vd); /* * Ensure we can issue some IO before declaring the * vdev open for business. */ if (vd->vdev_ops->vdev_op_leaf && (error = zio_wait(vdev_probe(vd, NULL))) != 0) { vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, VDEV_AUX_ERR_EXCEEDED); return (error); } /* * Track the min and max ashift values for normal data devices. */ if (vd->vdev_top == vd && vd->vdev_ashift != 0 && !vd->vdev_islog && vd->vdev_aux == NULL) { if (vd->vdev_ashift > spa->spa_max_ashift) spa->spa_max_ashift = vd->vdev_ashift; if (vd->vdev_ashift < spa->spa_min_ashift) spa->spa_min_ashift = vd->vdev_ashift; } /* * If a leaf vdev has a DTL, and seems healthy, then kick off a * resilver. But don't do this if we are doing a reopen for a scrub, * since this would just restart the scrub we are already doing. */ if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen && vdev_resilver_needed(vd, NULL, NULL)) spa_async_request(spa, SPA_ASYNC_RESILVER); return (0); } /* * Called once the vdevs are all opened, this routine validates the label * contents. This needs to be done before vdev_load() so that we don't * inadvertently do repair I/Os to the wrong device. * * If 'strict' is false ignore the spa guid check. This is necessary because * if the machine crashed during a re-guid the new guid might have been written * to all of the vdev labels, but not the cached config. The strict check * will be performed when the pool is opened again using the mos config. * * This function will only return failure if one of the vdevs indicates that it * has since been destroyed or exported. This is only possible if * /etc/zfs/zpool.cache was readonly at the time. Otherwise, the vdev state * will be updated but the function will return 0. */ int vdev_validate(vdev_t *vd, boolean_t strict) { spa_t *spa = vd->vdev_spa; nvlist_t *label; uint64_t guid = 0, top_guid; uint64_t state; int c; for (c = 0; c < vd->vdev_children; c++) if (vdev_validate(vd->vdev_child[c], strict) != 0) return (SET_ERROR(EBADF)); /* * If the device has already failed, or was marked offline, don't do * any further validation. Otherwise, label I/O will fail and we will * overwrite the previous state. */ if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) { uint64_t aux_guid = 0; nvlist_t *nvl; uint64_t txg = spa_last_synced_txg(spa) != 0 ? spa_last_synced_txg(spa) : -1ULL; if ((label = vdev_label_read_config(vd, txg)) == NULL) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_LABEL); return (0); } /* * Determine if this vdev has been split off into another * pool. If so, then refuse to open it. 
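 */

/*
 * [Illustrative sketch] vdev_open() above folds each top-level data
 * vdev's ashift into the pool-wide spa_min_ashift/spa_max_ashift pair;
 * 2^ashift is the device's allocation granularity, so a mixed 512-byte
 * and 4 KiB pool ends up with min 9 and max 12.  A plain restatement of
 * that fold over hypothetical values.
 */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t ashift[] = { 9, 12, 12 };	/* hypothetical top-levels */
	uint64_t min_ashift = UINT64_MAX, max_ashift = 0;

	for (int i = 0; i < 3; i++) {
		if (ashift[i] > max_ashift)
			max_ashift = ashift[i];
		if (ashift[i] < min_ashift)
			min_ashift = ashift[i];
	}
	printf("min %llu, max %llu\n", (unsigned long long)min_ashift,
	    (unsigned long long)max_ashift);	/* min 9, max 12 */
	return (0);
}

/*
 * vdev_validate() continues: the split-guid check follows.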
*/ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID, &aux_guid) == 0 && aux_guid == spa_guid(spa)) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_SPLIT_POOL); nvlist_free(label); return (0); } if (strict && (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &guid) != 0 || guid != spa_guid(spa))) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); return (0); } if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID, &aux_guid) != 0) aux_guid = 0; /* * If this vdev just became a top-level vdev because its * sibling was detached, it will have adopted the parent's * vdev guid -- but the label may or may not be on disk yet. * Fortunately, either version of the label will have the * same top guid, so if we're a top-level vdev, we can * safely compare to that instead. * * If we split this vdev off instead, then we also check the * original pool's guid. We don't want to consider the vdev * corrupt if it is partway through a split operation. */ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 || nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID, &top_guid) != 0 || ((vd->vdev_guid != guid && vd->vdev_guid != aux_guid) && (vd->vdev_guid != top_guid || vd != vd->vdev_top))) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); return (0); } if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); return (0); } nvlist_free(label); /* * If this is a verbatim import, no need to check the * state of the pool. */ if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) && spa_load_state(spa) == SPA_LOAD_OPEN && state != POOL_STATE_ACTIVE) return (SET_ERROR(EBADF)); /* * If we were able to open and validate a vdev that was * previously marked permanently unavailable, clear that state * now. */ if (vd->vdev_not_present) vd->vdev_not_present = 0; } return (0); } /* * Close a virtual device. */ void vdev_close(vdev_t *vd) { vdev_t *pvd = vd->vdev_parent; ASSERTV(spa_t *spa = vd->vdev_spa); ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); /* * If our parent is reopening, then we are as well, unless we are * going offline. */ if (pvd != NULL && pvd->vdev_reopening) vd->vdev_reopening = (pvd->vdev_reopening && !vd->vdev_offline); vd->vdev_ops->vdev_op_close(vd); vdev_cache_purge(vd); /* * We record the previous state before we close it, so that if we are * doing a reopen(), we don't generate FMA ereports if we notice that * it's still faulted. */ vd->vdev_prevstate = vd->vdev_state; if (vd->vdev_offline) vd->vdev_state = VDEV_STATE_OFFLINE; else vd->vdev_state = VDEV_STATE_CLOSED; vd->vdev_stat.vs_aux = VDEV_AUX_NONE; } void vdev_hold(vdev_t *vd) { spa_t *spa = vd->vdev_spa; int c; ASSERT(spa_is_root(spa)); if (spa->spa_state == POOL_STATE_UNINITIALIZED) return; for (c = 0; c < vd->vdev_children; c++) vdev_hold(vd->vdev_child[c]); if (vd->vdev_ops->vdev_op_leaf) vd->vdev_ops->vdev_op_hold(vd); } void vdev_rele(vdev_t *vd) { int c; ASSERT(spa_is_root(vd->vdev_spa)); for (c = 0; c < vd->vdev_children; c++) vdev_rele(vd->vdev_child[c]); if (vd->vdev_ops->vdev_op_leaf) vd->vdev_ops->vdev_op_rele(vd); } /* * Reopen all interior vdevs and any unopened leaves. We don't actually * reopen leaf vdevs which had previously been opened as they might deadlock * on the spa_config_lock. 
Instead we only obtain the leaf's physical size. * If the leaf has never been opened then open it, as usual. */ void vdev_reopen(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); /* set the reopening flag unless we're taking the vdev offline */ vd->vdev_reopening = !vd->vdev_offline; vdev_close(vd); (void) vdev_open(vd); /* * Call vdev_validate() here to make sure we have the same device. * Otherwise, a device with an invalid label could be successfully * opened in response to vdev_reopen(). */ if (vd->vdev_aux) { (void) vdev_validate_aux(vd); if (vdev_readable(vd) && vdev_writeable(vd) && vd->vdev_aux == &spa->spa_l2cache && !l2arc_vdev_present(vd)) l2arc_add_vdev(spa, vd); } else { (void) vdev_validate(vd, B_TRUE); } /* * Reassess parent vdev's health. */ vdev_propagate_state(vd); } int vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing) { int error; /* * Normally, partial opens (e.g. of a mirror) are allowed. * For a create, however, we want to fail the request if * there are any components we can't open. */ error = vdev_open(vd); if (error || vd->vdev_state != VDEV_STATE_HEALTHY) { vdev_close(vd); return (error ? error : ENXIO); } /* * Recursively load DTLs and initialize all labels. */ if ((error = vdev_dtl_load(vd)) != 0 || (error = vdev_label_init(vd, txg, isreplacing ? VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) { vdev_close(vd); return (error); } return (0); } void vdev_metaslab_set_size(vdev_t *vd) { /* * Aim for roughly metaslabs_per_vdev (default 200) metaslabs per vdev. */ vd->vdev_ms_shift = highbit64(vd->vdev_asize / metaslabs_per_vdev); vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT); } void vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg) { ASSERT(vd == vd->vdev_top); ASSERT(!vd->vdev_ishole); ASSERT(ISP2(flags)); ASSERT(spa_writeable(vd->vdev_spa)); if (flags & VDD_METASLAB) (void) txg_list_add(&vd->vdev_ms_list, arg, txg); if (flags & VDD_DTL) (void) txg_list_add(&vd->vdev_dtl_list, arg, txg); (void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg); } void vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg) { int c; for (c = 0; c < vd->vdev_children; c++) vdev_dirty_leaves(vd->vdev_child[c], flags, txg); if (vd->vdev_ops->vdev_op_leaf) vdev_dirty(vd->vdev_top, flags, vd, txg); } /* * DTLs. * * A vdev's DTL (dirty time log) is the set of transaction groups for which * the vdev has less than perfect replication. There are four kinds of DTL: * * DTL_MISSING: txgs for which the vdev has no valid copies of the data * * DTL_PARTIAL: txgs for which data is available, but not fully replicated * * DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon * scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of * txgs that was scrubbed. * * DTL_OUTAGE: txgs which cannot currently be read, whether due to * persistent errors or just some device being offline. * Unlike the other three, the DTL_OUTAGE map is not generally * maintained; it's only computed when needed, typically to * determine whether a device can be detached. * * For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device * either has the data or it doesn't. * * For interior vdevs such as mirror and RAID-Z the picture is more complex. * A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because * if any child is less than fully replicated, then so is its parent. 
* A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs, * comprising only those txgs which appear in 'maxfaults' or more children; * those are the txgs we don't have enough replication to read. For example, * double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2); * thus, its DTL_MISSING consists of the set of txgs that appear in more than * two child DTL_MISSING maps. * * It should be clear from the above that to compute the DTLs and outage maps * for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps. * Therefore, that is all we keep on disk. When loading the pool, or after * a configuration change, we generate all other DTLs from first principles. */ void vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) { range_tree_t *rt = vd->vdev_dtl[t]; ASSERT(t < DTL_TYPES); ASSERT(vd != vd->vdev_spa->spa_root_vdev); ASSERT(spa_writeable(vd->vdev_spa)); mutex_enter(rt->rt_lock); if (!range_tree_contains(rt, txg, size)) range_tree_add(rt, txg, size); mutex_exit(rt->rt_lock); } boolean_t vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) { range_tree_t *rt = vd->vdev_dtl[t]; boolean_t dirty = B_FALSE; ASSERT(t < DTL_TYPES); ASSERT(vd != vd->vdev_spa->spa_root_vdev); mutex_enter(rt->rt_lock); if (range_tree_space(rt) != 0) dirty = range_tree_contains(rt, txg, size); mutex_exit(rt->rt_lock); return (dirty); } boolean_t vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t) { range_tree_t *rt = vd->vdev_dtl[t]; boolean_t empty; mutex_enter(rt->rt_lock); empty = (range_tree_space(rt) == 0); mutex_exit(rt->rt_lock); return (empty); } /* * Returns the lowest txg in the DTL range. */ static uint64_t vdev_dtl_min(vdev_t *vd) { range_seg_t *rs; ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock)); ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0); ASSERT0(vd->vdev_children); rs = avl_first(&vd->vdev_dtl[DTL_MISSING]->rt_root); return (rs->rs_start - 1); } /* * Returns the highest txg in the DTL. */ static uint64_t vdev_dtl_max(vdev_t *vd) { range_seg_t *rs; ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock)); ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0); ASSERT0(vd->vdev_children); rs = avl_last(&vd->vdev_dtl[DTL_MISSING]->rt_root); return (rs->rs_end); } /* * Determine if a resilvering vdev should remove any DTL entries from * its range. If the vdev was resilvering for the entire duration of the * scan then it should excise that range from its DTLs. Otherwise, this * vdev is considered partially resilvered and should leave its DTL * entries intact. The comment in vdev_dtl_reassess() describes how we * excise the DTLs. */ static boolean_t vdev_dtl_should_excise(vdev_t *vd) { spa_t *spa = vd->vdev_spa; dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; ASSERT0(scn->scn_phys.scn_errors); ASSERT0(vd->vdev_children); if (vd->vdev_resilver_txg == 0 || range_tree_space(vd->vdev_dtl[DTL_MISSING]) == 0) return (B_TRUE); /* * When a resilver is initiated the scan will assign the scn_max_txg * value to the highest txg value that exists in all DTLs. If this * device's max DTL is not part of this scan (i.e. it is not in * the range (scn_min_txg, scn_max_txg] then it is not eligible * for excision. 
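 */

/*
 * [Worked example, userland C] The DTL comment above: an interior vdev's
 * DTL_MISSING keeps only the txgs that are missing from enough children
 * to be unreadable.  Per txg, count the children whose DTL_MISSING holds
 * it and keep the txg once the count reaches minref (nparity + 1 for
 * RAID-Z, i.e. strictly more than maxfaults children), matching the
 * minref computation in vdev_dtl_reassess() below.  Plain arrays stand
 * in for the reference-counted range trees the real code uses.
 */
#include <stdio.h>

#define	NCHILDREN	4
#define	MAXTXG		16

int
main(void)
{
	/* child_missing[c][txg] != 0: txg is in child c's DTL_MISSING */
	int child_missing[NCHILDREN][MAXTXG] = { 0 };
	int minref = 2 + 1;	/* raidz2: nparity + 1 */

	child_missing[0][7] = child_missing[1][7] = child_missing[2][7] = 1;
	child_missing[0][9] = child_missing[1][9] = 1;	/* only two */

	for (int txg = 0; txg < MAXTXG; txg++) {
		int refs = 0;

		for (int c = 0; c < NCHILDREN; c++)
			refs += child_missing[c][txg];
		if (refs >= minref)	/* prints txg 7 only */
			printf("txg %d in parent DTL_MISSING\n", txg);
	}
	return (0);
}

/*
 * The excision eligibility check follows.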
*/ if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) { ASSERT3U(scn->scn_phys.scn_min_txg, <=, vdev_dtl_min(vd)); ASSERT3U(scn->scn_phys.scn_min_txg, <, vd->vdev_resilver_txg); ASSERT3U(vd->vdev_resilver_txg, <=, scn->scn_phys.scn_max_txg); return (B_TRUE); } return (B_FALSE); } /* * Reassess DTLs after a config change or scrub completion. */ void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) { spa_t *spa = vd->vdev_spa; avl_tree_t reftree; int c, t, minref; ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); for (c = 0; c < vd->vdev_children; c++) vdev_dtl_reassess(vd->vdev_child[c], txg, scrub_txg, scrub_done); if (vd == spa->spa_root_vdev || vd->vdev_ishole || vd->vdev_aux) return; if (vd->vdev_ops->vdev_op_leaf) { dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; mutex_enter(&vd->vdev_dtl_lock); /* * If we've completed a scan cleanly then determine * if this vdev should remove any DTLs. We only want to * excise regions on vdevs that were available during * the entire duration of this scan. */ if (scrub_txg != 0 && (spa->spa_scrub_started || (scn != NULL && scn->scn_phys.scn_errors == 0)) && vdev_dtl_should_excise(vd)) { /* * We completed a scrub up to scrub_txg. If we * did it without rebooting, then the scrub dtl * will be valid, so excise the old region and * fold in the scrub dtl. Otherwise, leave the * dtl as-is if there was an error. * * There's a little trick here: to excise the beginning * of the DTL_MISSING map, we put it into a reference * tree and then add a segment with refcnt -1 that * covers the range [0, scrub_txg). This means * that each txg in that range has refcnt -1 or 0. * We then add DTL_SCRUB with a refcnt of 2, so that * entries in the range [0, scrub_txg) will have a * positive refcnt -- either 1 or 2. We then convert * the reference tree into the new DTL_MISSING map. */ space_reftree_create(&reftree); space_reftree_add_map(&reftree, vd->vdev_dtl[DTL_MISSING], 1); space_reftree_add_seg(&reftree, 0, scrub_txg, -1); space_reftree_add_map(&reftree, vd->vdev_dtl[DTL_SCRUB], 2); space_reftree_generate_map(&reftree, vd->vdev_dtl[DTL_MISSING], 1); space_reftree_destroy(&reftree); } range_tree_vacate(vd->vdev_dtl[DTL_PARTIAL], NULL, NULL); range_tree_walk(vd->vdev_dtl[DTL_MISSING], range_tree_add, vd->vdev_dtl[DTL_PARTIAL]); if (scrub_done) range_tree_vacate(vd->vdev_dtl[DTL_SCRUB], NULL, NULL); range_tree_vacate(vd->vdev_dtl[DTL_OUTAGE], NULL, NULL); if (!vdev_readable(vd)) range_tree_add(vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL); else range_tree_walk(vd->vdev_dtl[DTL_MISSING], range_tree_add, vd->vdev_dtl[DTL_OUTAGE]); /* * If the vdev was resilvering and no longer has any * DTLs then reset its resilvering flag and dirty * the top level so that we persist the change. */ if (vd->vdev_resilver_txg != 0 && range_tree_space(vd->vdev_dtl[DTL_MISSING]) == 0 && range_tree_space(vd->vdev_dtl[DTL_OUTAGE]) == 0) { vd->vdev_resilver_txg = 0; vdev_config_dirty(vd->vdev_top); } mutex_exit(&vd->vdev_dtl_lock); if (txg != 0) vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg); return; } mutex_enter(&vd->vdev_dtl_lock); for (t = 0; t < DTL_TYPES; t++) { int c; /* account for child's outage in parent's missing map */ int s = (t == DTL_MISSING) ? DTL_OUTAGE: t; if (t == DTL_SCRUB) continue; /* leaf vdevs only */ if (t == DTL_PARTIAL) minref = 1; /* i.e. 
non-zero */ else if (vd->vdev_nparity != 0) minref = vd->vdev_nparity + 1; /* RAID-Z */ else minref = vd->vdev_children; /* any kind of mirror */ space_reftree_create(&reftree); for (c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; mutex_enter(&cvd->vdev_dtl_lock); space_reftree_add_map(&reftree, cvd->vdev_dtl[s], 1); mutex_exit(&cvd->vdev_dtl_lock); } space_reftree_generate_map(&reftree, vd->vdev_dtl[t], minref); space_reftree_destroy(&reftree); } mutex_exit(&vd->vdev_dtl_lock); } int vdev_dtl_load(vdev_t *vd) { spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; int error = 0; int c; if (vd->vdev_ops->vdev_op_leaf && vd->vdev_dtl_object != 0) { ASSERT(!vd->vdev_ishole); error = space_map_open(&vd->vdev_dtl_sm, mos, vd->vdev_dtl_object, 0, -1ULL, 0, &vd->vdev_dtl_lock); if (error) return (error); ASSERT(vd->vdev_dtl_sm != NULL); mutex_enter(&vd->vdev_dtl_lock); /* * Now that we've opened the space_map we need to update * the in-core DTL. */ space_map_update(vd->vdev_dtl_sm); error = space_map_load(vd->vdev_dtl_sm, vd->vdev_dtl[DTL_MISSING], SM_ALLOC); mutex_exit(&vd->vdev_dtl_lock); return (error); } for (c = 0; c < vd->vdev_children; c++) { error = vdev_dtl_load(vd->vdev_child[c]); if (error != 0) break; } return (error); } void vdev_destroy_unlink_zap(vdev_t *vd, uint64_t zapobj, dmu_tx_t *tx) { spa_t *spa = vd->vdev_spa; VERIFY0(zap_destroy(spa->spa_meta_objset, zapobj, tx)); VERIFY0(zap_remove_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, zapobj, tx)); } uint64_t vdev_create_link_zap(vdev_t *vd, dmu_tx_t *tx) { spa_t *spa = vd->vdev_spa; uint64_t zap = zap_create(spa->spa_meta_objset, DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx); ASSERT(zap != 0); VERIFY0(zap_add_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, zap, tx)); return (zap); } void vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx) { uint64_t i; if (vd->vdev_ops != &vdev_hole_ops && vd->vdev_ops != &vdev_missing_ops && vd->vdev_ops != &vdev_root_ops && !vd->vdev_top->vdev_removing) { if (vd->vdev_ops->vdev_op_leaf && vd->vdev_leaf_zap == 0) { vd->vdev_leaf_zap = vdev_create_link_zap(vd, tx); } if (vd == vd->vdev_top && vd->vdev_top_zap == 0) { vd->vdev_top_zap = vdev_create_link_zap(vd, tx); } } for (i = 0; i < vd->vdev_children; i++) { vdev_construct_zaps(vd->vdev_child[i], tx); } } void vdev_dtl_sync(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; range_tree_t *rt = vd->vdev_dtl[DTL_MISSING]; objset_t *mos = spa->spa_meta_objset; range_tree_t *rtsync; kmutex_t rtlock; dmu_tx_t *tx; uint64_t object = space_map_object(vd->vdev_dtl_sm); ASSERT(!vd->vdev_ishole); ASSERT(vd->vdev_ops->vdev_op_leaf); tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); if (vd->vdev_detached || vd->vdev_top->vdev_removing) { mutex_enter(&vd->vdev_dtl_lock); space_map_free(vd->vdev_dtl_sm, tx); space_map_close(vd->vdev_dtl_sm); vd->vdev_dtl_sm = NULL; mutex_exit(&vd->vdev_dtl_lock); /* * We only destroy the leaf ZAP for detached leaves or for * removed log devices. Removed data devices handle leaf ZAP * cleanup later, once cancellation is no longer possible. 
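 */

/*
 * [Worked example] The reference-tree trick in vdev_dtl_reassess() above,
 * on a toy timeline: DTL_MISSING contributes +1 per txg, the segment
 * [0, scrub_txg) contributes -1, DTL_SCRUB contributes +2, and the
 * regenerated DTL_MISSING keeps every txg whose refcount is >= 1.  Below
 * scrub_txg a txg therefore survives only if the scrub failed to repair
 * it; everything else in the scrubbed range is excised.
 */
#include <stdio.h>

#define	MAXTXG	12

int
main(void)
{
	int missing[MAXTXG] = { 0 }, scrub[MAXTXG] = { 0 };
	int scrub_txg = 8;

	missing[3] = missing[5] = missing[10] = 1;
	scrub[5] = 1;			/* txg 5 could not be repaired */

	for (int txg = 0; txg < MAXTXG; txg++) {
		int ref = missing[txg];

		if (txg < scrub_txg)
			ref -= 1;
		ref += 2 * scrub[txg];
		if (ref >= 1)		/* prints 5 and 10; 3 was excised */
			printf("txg %d stays missing\n", txg);
	}
	return (0);
}

/*
 * Back in vdev_dtl_sync(): the leaf ZAP cleanup follows.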
*/ if (vd->vdev_leaf_zap != 0 && (vd->vdev_detached || vd->vdev_top->vdev_islog)) { vdev_destroy_unlink_zap(vd, vd->vdev_leaf_zap, tx); vd->vdev_leaf_zap = 0; } dmu_tx_commit(tx); return; } if (vd->vdev_dtl_sm == NULL) { uint64_t new_object; new_object = space_map_alloc(mos, tx); VERIFY3U(new_object, !=, 0); VERIFY0(space_map_open(&vd->vdev_dtl_sm, mos, new_object, 0, -1ULL, 0, &vd->vdev_dtl_lock)); ASSERT(vd->vdev_dtl_sm != NULL); } mutex_init(&rtlock, NULL, MUTEX_DEFAULT, NULL); rtsync = range_tree_create(NULL, NULL, &rtlock); mutex_enter(&rtlock); mutex_enter(&vd->vdev_dtl_lock); range_tree_walk(rt, range_tree_add, rtsync); mutex_exit(&vd->vdev_dtl_lock); space_map_truncate(vd->vdev_dtl_sm, tx); space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, tx); range_tree_vacate(rtsync, NULL, NULL); range_tree_destroy(rtsync); mutex_exit(&rtlock); mutex_destroy(&rtlock); /* * If the object for the space map has changed then dirty * the top level so that we update the config. */ if (object != space_map_object(vd->vdev_dtl_sm)) { zfs_dbgmsg("txg %llu, spa %s, DTL old object %llu, " "new object %llu", txg, spa_name(spa), object, space_map_object(vd->vdev_dtl_sm)); vdev_config_dirty(vd->vdev_top); } dmu_tx_commit(tx); mutex_enter(&vd->vdev_dtl_lock); space_map_update(vd->vdev_dtl_sm); mutex_exit(&vd->vdev_dtl_lock); } /* * Determine whether the specified vdev can be offlined/detached/removed * without losing data. */ boolean_t vdev_dtl_required(vdev_t *vd) { spa_t *spa = vd->vdev_spa; vdev_t *tvd = vd->vdev_top; uint8_t cant_read = vd->vdev_cant_read; boolean_t required; ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); if (vd == spa->spa_root_vdev || vd == tvd) return (B_TRUE); /* * Temporarily mark the device as unreadable, and then determine * whether this results in any DTL outages in the top-level vdev. * If not, we can safely offline/detach/remove the device. */ vd->vdev_cant_read = B_TRUE; vdev_dtl_reassess(tvd, 0, 0, B_FALSE); required = !vdev_dtl_empty(tvd, DTL_OUTAGE); vd->vdev_cant_read = cant_read; vdev_dtl_reassess(tvd, 0, 0, B_FALSE); if (!required && zio_injection_enabled) required = !!zio_handle_device_injection(vd, NULL, ECHILD); return (required); } /* * Determine if resilver is needed, and if so the txg range. */ boolean_t vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp) { boolean_t needed = B_FALSE; uint64_t thismin = UINT64_MAX; uint64_t thismax = 0; int c; if (vd->vdev_children == 0) { mutex_enter(&vd->vdev_dtl_lock); if (range_tree_space(vd->vdev_dtl[DTL_MISSING]) != 0 && vdev_writeable(vd)) { thismin = vdev_dtl_min(vd); thismax = vdev_dtl_max(vd); needed = B_TRUE; } mutex_exit(&vd->vdev_dtl_lock); } else { for (c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; uint64_t cmin, cmax; if (vdev_resilver_needed(cvd, &cmin, &cmax)) { thismin = MIN(thismin, cmin); thismax = MAX(thismax, cmax); needed = B_TRUE; } } } if (needed && minp) { *minp = thismin; *maxp = thismax; } return (needed); } void vdev_load(vdev_t *vd) { int c; /* * Recursively load all children. */ for (c = 0; c < vd->vdev_children; c++) vdev_load(vd->vdev_child[c]); /* * If this is a top-level vdev, initialize its metaslabs. */ if (vd == vd->vdev_top && !vd->vdev_ishole && (vd->vdev_ashift == 0 || vd->vdev_asize == 0 || vdev_metaslab_init(vd, 0) != 0)) vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); /* * If this is a leaf vdev, load its DTL. 
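 */

/*
 * [Illustrative sketch] vdev_dtl_required() above is a what-if test:
 * temporarily mark the child unreadable, recompute the top-level
 * DTL_OUTAGE, and see whether anything becomes unreadable.  A toy two-way
 * mirror version; the arrays and outage_nonempty() are hypothetical
 * stand-ins for the real range trees and reassessment.
 */
#include <stdio.h>

#define	NCHILD	2
#define	MAXTXG	8

static int
outage_nonempty(int missing[NCHILD][MAXTXG], int *cant_read)
{
	for (int txg = 0; txg < MAXTXG; txg++) {
		int copies = 0;

		for (int c = 0; c < NCHILD; c++)
			if (!cant_read[c] && !missing[c][txg])
				copies++;
		if (copies == 0)
			return (1);	/* some txg would be unreadable */
	}
	return (0);
}

int
main(void)
{
	int missing[NCHILD][MAXTXG] = { 0 };
	int cant_read[NCHILD] = { 0 };

	missing[1][4] = 1;	/* child 1 never received txg 4 */

	cant_read[0] = 1;	/* what if we lose child 0? */
	printf("child 0 required: %s\n",
	    outage_nonempty(missing, cant_read) ? "yes" : "no");
	cant_read[0] = 0;	/* restore, as the real code does */
	return (0);
}

/*
 * Back in vdev_load(): the leaf DTL load follows.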
*/ if (vd->vdev_ops->vdev_op_leaf && vdev_dtl_load(vd) != 0) vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); } /* * The special vdev case is used for hot spares and l2cache devices. Its * sole purpose is to set the vdev state for the associated vdev. To do this, * we make sure that we can open the underlying device, then try to read the * label, and make sure that the label is sane and that it hasn't been * repurposed to another pool. */ int vdev_validate_aux(vdev_t *vd) { nvlist_t *label; uint64_t guid, version; uint64_t state; if (!vdev_readable(vd)) return (0); if ((label = vdev_label_read_config(vd, -1ULL)) == NULL) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); return (-1); } if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 || !SPA_VERSION_IS_SUPPORTED(version) || nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 || guid != vd->vdev_guid || nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); return (-1); } /* * We don't actually check the pool state here. If it's in fact in * use by another pool, we update this fact on the fly when requested. */ nvlist_free(label); return (0); } void vdev_remove(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; dmu_tx_t *tx; int m, i; tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); ASSERT(vd == vd->vdev_top); ASSERT3U(txg, ==, spa_syncing_txg(spa)); if (vd->vdev_ms != NULL) { metaslab_group_t *mg = vd->vdev_mg; metaslab_group_histogram_verify(mg); metaslab_class_histogram_verify(mg->mg_class); for (m = 0; m < vd->vdev_ms_count; m++) { metaslab_t *msp = vd->vdev_ms[m]; if (msp == NULL || msp->ms_sm == NULL) continue; mutex_enter(&msp->ms_lock); /* * If the metaslab was not loaded when the vdev * was removed then the histogram accounting may * not be accurate. Update the histogram information * here so that we ensure that the metaslab group * and metaslab class are up-to-date. */ metaslab_group_histogram_remove(mg, msp); VERIFY0(space_map_allocated(msp->ms_sm)); space_map_free(msp->ms_sm, tx); space_map_close(msp->ms_sm); msp->ms_sm = NULL; mutex_exit(&msp->ms_lock); } metaslab_group_histogram_verify(mg); metaslab_class_histogram_verify(mg->mg_class); for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) ASSERT0(mg->mg_histogram[i]); } if (vd->vdev_ms_array) { (void) dmu_object_free(mos, vd->vdev_ms_array, tx); vd->vdev_ms_array = 0; } if (vd->vdev_islog && vd->vdev_top_zap != 0) { vdev_destroy_unlink_zap(vd, vd->vdev_top_zap, tx); vd->vdev_top_zap = 0; } dmu_tx_commit(tx); } void vdev_sync_done(vdev_t *vd, uint64_t txg) { metaslab_t *msp; boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg)); ASSERT(!vd->vdev_ishole); while ((msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))) metaslab_sync_done(msp, txg); if (reassess) metaslab_sync_reassess(vd->vdev_mg); } void vdev_sync(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; vdev_t *lvd; metaslab_t *msp; dmu_tx_t *tx; ASSERT(!vd->vdev_ishole); if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0) { ASSERT(vd == vd->vdev_top); tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset, DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx); ASSERT(vd->vdev_ms_array != 0); vdev_config_dirty(vd); dmu_tx_commit(tx); } /* * Remove the metadata associated with this vdev once it's empty. 
*/ if (vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing) vdev_remove(vd, txg); while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) { metaslab_sync(msp, txg); (void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg)); } while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL) vdev_dtl_sync(lvd, txg); (void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg)); } uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize) { return (vd->vdev_ops->vdev_op_asize(vd, psize)); } /* * Mark the given vdev faulted. A faulted vdev behaves as if the device could * not be opened, and no I/O is attempted. */ int vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux) { vdev_t *vd, *tvd; spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, ENODEV)); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); tvd = vd->vdev_top; /* * We don't directly use the aux state here, but if we do a * vdev_reopen(), we need this value to be present to remember why we * were faulted. */ vd->vdev_label_aux = aux; /* * Faulted state takes precedence over degraded. */ vd->vdev_delayed_close = B_FALSE; vd->vdev_faulted = 1ULL; vd->vdev_degraded = 0ULL; vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux); /* * If this device has the only valid copy of the data, then * back off and simply mark the vdev as degraded instead. */ if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) { vd->vdev_degraded = 1ULL; vd->vdev_faulted = 0ULL; /* * If we reopen the device and it's not dead, only then do we * mark it degraded. */ vdev_reopen(tvd); if (vdev_readable(vd)) vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux); } return (spa_vdev_state_exit(spa, vd, 0)); } /* * Mark the given vdev degraded. A degraded vdev is purely an indication to the * user that something is wrong. The vdev continues to operate as normal as far * as I/O is concerned. */ int vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux) { vdev_t *vd; spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, ENODEV)); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); /* * If the vdev is already faulted, then don't do anything. */ if (vd->vdev_faulted || vd->vdev_degraded) return (spa_vdev_state_exit(spa, NULL, 0)); vd->vdev_degraded = 1ULL; if (!vdev_is_dead(vd)) vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux); return (spa_vdev_state_exit(spa, vd, 0)); } /* * Online the given vdev. * * If 'ZFS_ONLINE_UNSPARE' is set, it implies two things. First, any attached * spare device should be detached when the device finishes resilvering. * Second, the online should be treated like a 'test' online case, so no FMA * events are generated if the device fails to open. */ int vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) { vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev; boolean_t postevent = B_FALSE; spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, ENODEV)); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); postevent = (vd->vdev_offline == B_TRUE || vd->vdev_tmpoffline == B_TRUE) ? 
B_TRUE : B_FALSE; tvd = vd->vdev_top; vd->vdev_offline = B_FALSE; vd->vdev_tmpoffline = B_FALSE; vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE); vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT); /* XXX - L2ARC 1.0 does not support expansion */ if (!vd->vdev_aux) { for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) pvd->vdev_expanding = !!(flags & ZFS_ONLINE_EXPAND); } vdev_reopen(tvd); vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE; if (!vd->vdev_aux) { for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) pvd->vdev_expanding = B_FALSE; } if (newstate) *newstate = vd->vdev_state; if ((flags & ZFS_ONLINE_UNSPARE) && !vdev_is_dead(vd) && vd->vdev_parent && vd->vdev_parent->vdev_ops == &vdev_spare_ops && vd->vdev_parent->vdev_child[0] == vd) vd->vdev_unspare = B_TRUE; if ((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand) { /* XXX - L2ARC 1.0 does not support expansion */ if (vd->vdev_aux) return (spa_vdev_state_exit(spa, vd, ENOTSUP)); spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); } if (postevent) spa_event_notify(spa, vd, ESC_ZFS_VDEV_ONLINE); return (spa_vdev_state_exit(spa, vd, 0)); } static int vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags) { vdev_t *vd, *tvd; int error = 0; uint64_t generation; metaslab_group_t *mg; top: spa_vdev_state_enter(spa, SCL_ALLOC); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, ENODEV)); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); tvd = vd->vdev_top; mg = tvd->vdev_mg; generation = spa->spa_config_generation + 1; /* * If the device isn't already offline, try to offline it. */ if (!vd->vdev_offline) { /* * If this device has the only valid copy of some data, * don't allow it to be offlined. Log devices are always * expendable. */ if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) return (spa_vdev_state_exit(spa, NULL, EBUSY)); /* * If the top-level is a slog and it has had allocations * then proceed. We check that the vdev's metaslab group * is not NULL since it's possible that we may have just * added this vdev but not yet initialized its metaslabs. */ if (tvd->vdev_islog && mg != NULL) { /* * Prevent any future allocations. */ metaslab_group_passivate(mg); (void) spa_vdev_state_exit(spa, vd, 0); error = spa_offline_log(spa); spa_vdev_state_enter(spa, SCL_ALLOC); /* * Check to see if the config has changed. */ if (error || generation != spa->spa_config_generation) { metaslab_group_activate(mg); if (error) return (spa_vdev_state_exit(spa, vd, error)); (void) spa_vdev_state_exit(spa, vd, 0); goto top; } ASSERT0(tvd->vdev_stat.vs_alloc); } /* * Offline this device and reopen its top-level vdev. * If the top-level vdev is a log device then just offline * it. Otherwise, if this action results in the top-level * vdev becoming unusable, undo it and fail the request. */ vd->vdev_offline = B_TRUE; vdev_reopen(tvd); if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_is_dead(tvd)) { vd->vdev_offline = B_FALSE; vdev_reopen(tvd); return (spa_vdev_state_exit(spa, NULL, EBUSY)); } /* * Add the device back into the metaslab rotor so that * once we online the device it's open for business. 
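 */

/*
 * [Illustrative sketch] The retry loop in vdev_offline_locked() above:
 * record the config generation, drop the state lock to do slow work
 * (evacuating the log device), retake the lock, and start over if the
 * generation moved in the meantime.  A generic userland restatement; the
 * mutex, counter, and slow_work() are hypothetical stand-ins (the real
 * code also accounts for the generation bump its own lock exit performs).
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t state_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long config_generation;

static void
slow_work(void)
{
	/* e.g. evacuating a log device: must run without the state lock */
}

static void
offline_with_retry(void)
{
	unsigned long gen;

	for (;;) {
		pthread_mutex_lock(&state_lock);
		gen = config_generation;
		pthread_mutex_unlock(&state_lock);

		slow_work();

		pthread_mutex_lock(&state_lock);
		if (gen == config_generation) {
			/* nothing changed underneath us: commit */
			pthread_mutex_unlock(&state_lock);
			return;
		}
		/* config changed while unlocked: undo and retry */
		pthread_mutex_unlock(&state_lock);
	}
}

int
main(void)
{
	offline_with_retry();
	printf("offlined\n");
	return (0);
}

/*
 * Reactivate the metaslab group below.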
*/ if (tvd->vdev_islog && mg != NULL) metaslab_group_activate(mg); } vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY); return (spa_vdev_state_exit(spa, vd, 0)); } int vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) { int error; mutex_enter(&spa->spa_vdev_top_lock); error = vdev_offline_locked(spa, guid, flags); mutex_exit(&spa->spa_vdev_top_lock); return (error); } /* * Clear the error counts associated with this vdev. Unlike vdev_online() and * vdev_offline(), we assume the spa config is locked. We also clear all * children. If 'vd' is NULL, then the user wants to clear all vdevs. */ void vdev_clear(spa_t *spa, vdev_t *vd) { vdev_t *rvd = spa->spa_root_vdev; int c; ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); if (vd == NULL) vd = rvd; vd->vdev_stat.vs_read_errors = 0; vd->vdev_stat.vs_write_errors = 0; vd->vdev_stat.vs_checksum_errors = 0; for (c = 0; c < vd->vdev_children; c++) vdev_clear(spa, vd->vdev_child[c]); /* * If we're in the FAULTED state or have experienced failed I/O, then * clear the persistent state and attempt to reopen the device. We * also mark the vdev config dirty, so that the new faulted state is * written out to disk. */ if (vd->vdev_faulted || vd->vdev_degraded || !vdev_readable(vd) || !vdev_writeable(vd)) { /* * When reopening in response to a clear event, it may be due to * a fmadm repair request. In this case, if the device is * still broken, we want to still post the ereport again. */ vd->vdev_forcefault = B_TRUE; vd->vdev_faulted = vd->vdev_degraded = 0ULL; vd->vdev_cant_read = B_FALSE; vd->vdev_cant_write = B_FALSE; vdev_reopen(vd == rvd ? rvd : vd->vdev_top); vd->vdev_forcefault = B_FALSE; if (vd != rvd && vdev_writeable(vd->vdev_top)) vdev_state_dirty(vd->vdev_top); if (vd->vdev_aux == NULL && !vdev_is_dead(vd)) spa_async_request(spa, SPA_ASYNC_RESILVER); spa_event_notify(spa, vd, ESC_ZFS_VDEV_CLEAR); } /* * When clearing a FMA-diagnosed fault, we always want to * unspare the device, as we assume that the original spare was * done in response to the FMA fault. */ if (!vdev_is_dead(vd) && vd->vdev_parent != NULL && vd->vdev_parent->vdev_ops == &vdev_spare_ops && vd->vdev_parent->vdev_child[0] == vd) vd->vdev_unspare = B_TRUE; } boolean_t vdev_is_dead(vdev_t *vd) { /* * Holes and missing devices are always considered "dead". * This simplifies the code since we don't have to check for * these types of devices in the various code paths. * Instead we rely on the fact that we skip over dead devices * before issuing I/O to them. */ return (vd->vdev_state < VDEV_STATE_DEGRADED || vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops); } boolean_t vdev_readable(vdev_t *vd) { return (!vdev_is_dead(vd) && !vd->vdev_cant_read); } boolean_t vdev_writeable(vdev_t *vd) { return (!vdev_is_dead(vd) && !vd->vdev_cant_write); } boolean_t vdev_allocatable(vdev_t *vd) { uint64_t state = vd->vdev_state; /* * We currently allow allocations from vdevs which may be in the * process of reopening (i.e. VDEV_STATE_CLOSED). If the device * fails to reopen then we'll catch it later when we're holding * the proper locks. Note that we have to get the vdev state * in a local variable because although it changes atomically, * we're asking two separate questions about it. 
*/ return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) && !vd->vdev_cant_write && !vd->vdev_ishole && vd->vdev_mg->mg_initialized); } boolean_t vdev_accessible(vdev_t *vd, zio_t *zio) { ASSERT(zio->io_vd == vd); if (vdev_is_dead(vd) || vd->vdev_remove_wanted) return (B_FALSE); if (zio->io_type == ZIO_TYPE_READ) return (!vd->vdev_cant_read); if (zio->io_type == ZIO_TYPE_WRITE) return (!vd->vdev_cant_write); return (B_TRUE); } static void vdev_get_child_stat(vdev_t *cvd, vdev_stat_t *vs, vdev_stat_t *cvs) { int t; for (t = 0; t < ZIO_TYPES; t++) { vs->vs_ops[t] += cvs->vs_ops[t]; vs->vs_bytes[t] += cvs->vs_bytes[t]; } cvs->vs_scan_removing = cvd->vdev_removing; } /* * Get extended stats */ static void vdev_get_child_stat_ex(vdev_t *cvd, vdev_stat_ex_t *vsx, vdev_stat_ex_t *cvsx) { int t, b; for (t = 0; t < ZIO_TYPES; t++) { for (b = 0; b < ARRAY_SIZE(vsx->vsx_disk_histo[0]); b++) vsx->vsx_disk_histo[t][b] += cvsx->vsx_disk_histo[t][b]; for (b = 0; b < ARRAY_SIZE(vsx->vsx_total_histo[0]); b++) { vsx->vsx_total_histo[t][b] += cvsx->vsx_total_histo[t][b]; } } for (t = 0; t < ZIO_PRIORITY_NUM_QUEUEABLE; t++) { for (b = 0; b < ARRAY_SIZE(vsx->vsx_queue_histo[0]); b++) { vsx->vsx_queue_histo[t][b] += cvsx->vsx_queue_histo[t][b]; } vsx->vsx_active_queue[t] += cvsx->vsx_active_queue[t]; vsx->vsx_pend_queue[t] += cvsx->vsx_pend_queue[t]; for (b = 0; b < ARRAY_SIZE(vsx->vsx_ind_histo[0]); b++) vsx->vsx_ind_histo[t][b] += cvsx->vsx_ind_histo[t][b]; for (b = 0; b < ARRAY_SIZE(vsx->vsx_agg_histo[0]); b++) vsx->vsx_agg_histo[t][b] += cvsx->vsx_agg_histo[t][b]; } } /* * Get statistics for the given vdev. */ static void vdev_get_stats_ex_impl(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) { int c, t; /* * If we're getting stats on the root vdev, aggregate the I/O counts * over all top-level vdevs (i.e. the direct children of the root). */ if (!vd->vdev_ops->vdev_op_leaf) { if (vs) { memset(vs->vs_ops, 0, sizeof (vs->vs_ops)); memset(vs->vs_bytes, 0, sizeof (vs->vs_bytes)); } if (vsx) memset(vsx, 0, sizeof (*vsx)); for (c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; vdev_stat_t *cvs = &cvd->vdev_stat; vdev_stat_ex_t *cvsx = &cvd->vdev_stat_ex; vdev_get_stats_ex_impl(cvd, cvs, cvsx); if (vs) vdev_get_child_stat(cvd, vs, cvs); if (vsx) vdev_get_child_stat_ex(cvd, vsx, cvsx); } } else { /* * We're a leaf. Just copy our ZIO active queue stats in. The * other leaf stats are updated in vdev_stat_update(). 
*/ if (!vsx) return; memcpy(vsx, &vd->vdev_stat_ex, sizeof (vd->vdev_stat_ex)); for (t = 0; t < ARRAY_SIZE(vd->vdev_queue.vq_class); t++) { vsx->vsx_active_queue[t] = vd->vdev_queue.vq_class[t].vqc_active; vsx->vsx_pend_queue[t] = avl_numnodes( &vd->vdev_queue.vq_class[t].vqc_queued_tree); } } } void vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) { mutex_enter(&vd->vdev_stat_lock); if (vs) { bcopy(&vd->vdev_stat, vs, sizeof (*vs)); vs->vs_timestamp = gethrtime() - vs->vs_timestamp; vs->vs_state = vd->vdev_state; vs->vs_rsize = vdev_get_min_asize(vd); if (vd->vdev_ops->vdev_op_leaf) vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; vs->vs_esize = vd->vdev_max_asize - vd->vdev_asize; if (vd->vdev_aux == NULL && vd == vd->vdev_top && !vd->vdev_ishole) { vs->vs_fragmentation = vd->vdev_mg->mg_fragmentation; } } ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_READER) != 0); vdev_get_stats_ex_impl(vd, vs, vsx); mutex_exit(&vd->vdev_stat_lock); } void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs) { return (vdev_get_stats_ex(vd, vs, NULL)); } void vdev_clear_stats(vdev_t *vd) { mutex_enter(&vd->vdev_stat_lock); vd->vdev_stat.vs_space = 0; vd->vdev_stat.vs_dspace = 0; vd->vdev_stat.vs_alloc = 0; mutex_exit(&vd->vdev_stat_lock); } void vdev_scan_stat_init(vdev_t *vd) { vdev_stat_t *vs = &vd->vdev_stat; int c; for (c = 0; c < vd->vdev_children; c++) vdev_scan_stat_init(vd->vdev_child[c]); mutex_enter(&vd->vdev_stat_lock); vs->vs_scan_processed = 0; mutex_exit(&vd->vdev_stat_lock); } void vdev_stat_update(zio_t *zio, uint64_t psize) { spa_t *spa = zio->io_spa; vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd = zio->io_vd ? zio->io_vd : rvd; vdev_t *pvd; uint64_t txg = zio->io_txg; vdev_stat_t *vs = &vd->vdev_stat; vdev_stat_ex_t *vsx = &vd->vdev_stat_ex; zio_type_t type = zio->io_type; int flags = zio->io_flags; /* * If this i/o is a gang leader, it didn't do any actual work. */ if (zio->io_gang_tree) return; if (zio->io_error == 0) { /* * If this is a root i/o, don't count it -- we've already * counted the top-level vdevs, and vdev_get_stats() will * aggregate them when asked. This reduces contention on * the root vdev_stat_lock and implicitly handles blocks * that compress away to holes, for which there is no i/o. * (Holes never create vdev children, so all the counters * remain zero, which is what we want.) * * Note: this only applies to successful i/o (io_error == 0) * because unlike i/o counts, errors are not additive. * When reading a ditto block, for example, failure of * one top-level vdev does not imply a root-level error. */ if (vd == rvd) return; ASSERT(vd == zio->io_vd); if (flags & ZIO_FLAG_IO_BYPASS) return; mutex_enter(&vd->vdev_stat_lock); if (flags & ZIO_FLAG_IO_REPAIR) { if (flags & ZIO_FLAG_SCAN_THREAD) { dsl_scan_phys_t *scn_phys = &spa->spa_dsl_pool->dp_scan->scn_phys; uint64_t *processed = &scn_phys->scn_processed; /* XXX cleanup? */ if (vd->vdev_ops->vdev_op_leaf) atomic_add_64(processed, psize); vs->vs_scan_processed += psize; } if (flags & ZIO_FLAG_SELF_HEAL) vs->vs_self_healed += psize; } /* * The bytes/ops/histograms are recorded at the leaf level and * aggregated into the higher level vdevs in vdev_get_stats(). 
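 */

/*
 * [Illustrative sketch] The leaf recording below files each I/O into
 * power-of-two histogram buckets: request size, plus three latencies
 * (queued time = delta - delay, disk time = delay, total time = delta).
 * A userland restatement of that bucketing; bucket_of() and NBUCKETS are
 * stand-ins, not the L_HISTO/RQ_HISTO macros.
 */
#include <stdint.h>
#include <stdio.h>

#define	NBUCKETS	37

static int
bucket_of(uint64_t v)		/* floor(log2(v)), capped */
{
	int b = 0;

	while (v > 1 && b < NBUCKETS - 1) {
		v >>= 1;
		b++;
	}
	return (b);
}

int
main(void)
{
	uint64_t histo[NBUCKETS] = { 0 };
	uint64_t delay_ns = 180000;	/* time spent on the device */
	uint64_t delta_ns = 250000;	/* total time in the pipeline */

	histo[bucket_of(delta_ns - delay_ns)]++;	/* queued */
	histo[bucket_of(delay_ns)]++;			/* disk */
	histo[bucket_of(delta_ns)]++;			/* total */

	for (int b = 0; b < NBUCKETS; b++)
		if (histo[b] != 0)
			printf("bucket 2^%d ns: %llu\n", b,
			    (unsigned long long)histo[b]);
	return (0);
}

/*
 * The per-leaf recording follows.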
*/ if (vd->vdev_ops->vdev_op_leaf && (zio->io_priority < ZIO_PRIORITY_NUM_QUEUEABLE)) { vs->vs_ops[type]++; vs->vs_bytes[type] += psize; if (flags & ZIO_FLAG_DELEGATED) { vsx->vsx_agg_histo[zio->io_priority] [RQ_HISTO(zio->io_size)]++; } else { vsx->vsx_ind_histo[zio->io_priority] [RQ_HISTO(zio->io_size)]++; } if (zio->io_delta && zio->io_delay) { vsx->vsx_queue_histo[zio->io_priority] [L_HISTO(zio->io_delta - zio->io_delay)]++; vsx->vsx_disk_histo[type] [L_HISTO(zio->io_delay)]++; vsx->vsx_total_histo[type] [L_HISTO(zio->io_delta)]++; } } mutex_exit(&vd->vdev_stat_lock); return; } if (flags & ZIO_FLAG_SPECULATIVE) return; /* * If this is an I/O error that is going to be retried, then ignore the * error. Otherwise, the user may interpret B_FAILFAST I/O errors as * hard errors, when in reality they can happen for any number of * innocuous reasons (bus resets, MPxIO link failure, etc). */ if (zio->io_error == EIO && !(zio->io_flags & ZIO_FLAG_IO_RETRY)) return; /* * Intent log writes won't propagate their error to the root * I/O so don't mark these types of failures as pool-level * errors. */ if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) return; mutex_enter(&vd->vdev_stat_lock); if (type == ZIO_TYPE_READ && !vdev_is_dead(vd)) { if (zio->io_error == ECKSUM) vs->vs_checksum_errors++; else vs->vs_read_errors++; } if (type == ZIO_TYPE_WRITE && !vdev_is_dead(vd)) vs->vs_write_errors++; mutex_exit(&vd->vdev_stat_lock); if (type == ZIO_TYPE_WRITE && txg != 0 && (!(flags & ZIO_FLAG_IO_REPAIR) || (flags & ZIO_FLAG_SCAN_THREAD) || spa->spa_claiming)) { /* * This is either a normal write (not a repair), or it's * a repair induced by the scrub thread, or it's a repair * made by zil_claim() during spa_load() in the first txg. * In the normal case, we commit the DTL change in the same * txg as the block was born. In the scrub-induced repair * case, we know that scrubs run in first-pass syncing context, * so we commit the DTL change in spa_syncing_txg(spa). * In the zil_claim() case, we commit in spa_first_txg(spa). * * We currently do not make DTL entries for failed spontaneous * self-healing writes triggered by normal (non-scrubbing) * reads, because we have no transactional context in which to * do so -- and it's not clear that it'd be desirable anyway. */ if (vd->vdev_ops->vdev_op_leaf) { uint64_t commit_txg = txg; if (flags & ZIO_FLAG_SCAN_THREAD) { ASSERT(flags & ZIO_FLAG_IO_REPAIR); ASSERT(spa_sync_pass(spa) == 1); vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1); commit_txg = spa_syncing_txg(spa); } else if (spa->spa_claiming) { ASSERT(flags & ZIO_FLAG_IO_REPAIR); commit_txg = spa_first_txg(spa); } ASSERT(commit_txg >= spa_syncing_txg(spa)); if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1)) return; for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1); vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg); } if (vd != rvd) vdev_dtl_dirty(vd, DTL_MISSING, txg, 1); } } /* * Update the in-core space usage stats for this vdev, its metaslab class, * and the root vdev. */ void vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta, int64_t space_delta) { int64_t dspace_delta = space_delta; spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; metaslab_group_t *mg = vd->vdev_mg; metaslab_class_t *mc = mg ? mg->mg_class : NULL; ASSERT(vd == vd->vdev_top); /* * Apply the inverse of the psize-to-asize (i.e. RAID-Z) space-expansion * factor. 
We must calculate this here and not at the root vdev * because the root vdev's psize-to-asize is simply the max of its * children's, thus not accurate enough for us. */ ASSERT((dspace_delta & (SPA_MINBLOCKSIZE-1)) == 0); ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache); dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio; mutex_enter(&vd->vdev_stat_lock); vd->vdev_stat.vs_alloc += alloc_delta; vd->vdev_stat.vs_space += space_delta; vd->vdev_stat.vs_dspace += dspace_delta; mutex_exit(&vd->vdev_stat_lock); if (mc == spa_normal_class(spa)) { mutex_enter(&rvd->vdev_stat_lock); rvd->vdev_stat.vs_alloc += alloc_delta; rvd->vdev_stat.vs_space += space_delta; rvd->vdev_stat.vs_dspace += dspace_delta; mutex_exit(&rvd->vdev_stat_lock); } if (mc != NULL) { ASSERT(rvd == vd->vdev_parent); ASSERT(vd->vdev_ms_count != 0); metaslab_class_space_update(mc, alloc_delta, defer_delta, space_delta, dspace_delta); } } /* * Mark a top-level vdev's config as dirty, placing it on the dirty list * so that it will be written out next time the vdev configuration is synced. * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs. */ void vdev_config_dirty(vdev_t *vd) { spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; int c; ASSERT(spa_writeable(spa)); /* * If this is an aux vdev (as with l2cache and spare devices), then we * update the vdev config manually and set the sync flag. */ if (vd->vdev_aux != NULL) { spa_aux_vdev_t *sav = vd->vdev_aux; nvlist_t **aux; uint_t naux; for (c = 0; c < sav->sav_count; c++) { if (sav->sav_vdevs[c] == vd) break; } if (c == sav->sav_count) { /* * We're being removed. There's nothing more to do. */ ASSERT(sav->sav_sync == B_TRUE); return; } sav->sav_sync = B_TRUE; if (nvlist_lookup_nvlist_array(sav->sav_config, ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) { VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, ZPOOL_CONFIG_SPARES, &aux, &naux) == 0); } ASSERT(c < naux); /* * Setting the nvlist in the middle of the array is a little * sketchy, but it will work. */ nvlist_free(aux[c]); aux[c] = vdev_config_generate(spa, vd, B_TRUE, 0); return; } /* * The dirty list is protected by the SCL_CONFIG lock. The caller * must either hold SCL_CONFIG as writer, or must be the sync thread * (which holds SCL_CONFIG as reader). There's only one sync thread, * so this is sufficient to ensure mutual exclusion. */ ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) || (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_CONFIG, RW_READER))); if (vd == rvd) { for (c = 0; c < rvd->vdev_children; c++) vdev_config_dirty(rvd->vdev_child[c]); } else { ASSERT(vd == vd->vdev_top); if (!list_link_active(&vd->vdev_config_dirty_node) && !vd->vdev_ishole) list_insert_head(&spa->spa_config_dirty_list, vd); } } void vdev_config_clean(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) || (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_CONFIG, RW_READER))); ASSERT(list_link_active(&vd->vdev_config_dirty_node)); list_remove(&spa->spa_config_dirty_list, vd); } /* * Mark a top-level vdev's state as dirty, so that the next pass of * spa_sync() can convert this into vdev_config_dirty(). We distinguish * the state changes from larger config changes because they require * much less locking, and are often needed for administrative actions. 
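 */

/*
 * [Worked example] vdev_space_update() above deflates a raw allocation
 * delta: shift bytes down to 512-byte sectors (SPA_MINBLOCKSHIFT == 9),
 * then multiply by the vdev's deflate ratio, which is scaled so that 512
 * means "no expansion".  The 384 below (512 * 3/4) is an assumed value
 * for a hypothetical RAID-Z layout with a 4/3 allocation expansion, so
 * 1 MiB of raw allocation deflates to 768 KiB of accounted space.
 */
#include <stdint.h>
#include <stdio.h>

#define	MINBLOCKSHIFT	9

int
main(void)
{
	int64_t space_delta = 1048576;		/* 1 MiB raw */
	int64_t deflate_ratio = 384;		/* assumed, see above */

	printf("deflated = %lld bytes\n", (long long)
	    ((space_delta >> MINBLOCKSHIFT) * deflate_ratio)); /* 786432 */
	return (0);
}

/*
 * vdev_state_dirty() follows.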
*/ void vdev_state_dirty(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_writeable(spa)); ASSERT(vd == vd->vdev_top); /* * The state list is protected by the SCL_STATE lock. The caller * must either hold SCL_STATE as writer, or must be the sync thread * (which holds SCL_STATE as reader). There's only one sync thread, * so this is sufficient to ensure mutual exclusion. */ ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) || (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_STATE, RW_READER))); if (!list_link_active(&vd->vdev_state_dirty_node) && !vd->vdev_ishole) list_insert_head(&spa->spa_state_dirty_list, vd); } void vdev_state_clean(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) || (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_STATE, RW_READER))); ASSERT(list_link_active(&vd->vdev_state_dirty_node)); list_remove(&spa->spa_state_dirty_list, vd); } /* * Propagate vdev state up from children to parent. */ void vdev_propagate_state(vdev_t *vd) { spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; int degraded = 0, faulted = 0; int corrupted = 0; vdev_t *child; int c; if (vd->vdev_children > 0) { for (c = 0; c < vd->vdev_children; c++) { child = vd->vdev_child[c]; /* * Don't factor holes into the decision. */ if (child->vdev_ishole) continue; if (!vdev_readable(child) || (!vdev_writeable(child) && spa_writeable(spa))) { /* * Root special: if there is a top-level log * device, treat the root vdev as if it were * degraded. */ if (child->vdev_islog && vd == rvd) degraded++; else faulted++; } else if (child->vdev_state <= VDEV_STATE_DEGRADED) { degraded++; } if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA) corrupted++; } vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded); /* * Root special: if there is a top-level vdev that cannot be * opened due to corrupted metadata, then propagate the root * vdev's aux state as 'corrupt' rather than 'insufficient * replicas'. */ if (corrupted && vd == rvd && rvd->vdev_state == VDEV_STATE_CANT_OPEN) vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); } if (vd->vdev_parent) vdev_propagate_state(vd->vdev_parent); } /* * Set a vdev's state. If this is during an open, we don't update the parent * state, because we're in the process of opening children depth-first. * Otherwise, we propagate the change to the parent. * * If this routine places a device in a faulted state, an appropriate ereport is * generated. */ void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) { uint64_t save_state; spa_t *spa = vd->vdev_spa; if (state == vd->vdev_state) { /* * Since vdev_offline() code path is already in an offline * state we can miss a statechange event to OFFLINE. Check * the previous state to catch this condition. */ if (vd->vdev_ops->vdev_op_leaf && (state == VDEV_STATE_OFFLINE) && (vd->vdev_prevstate >= VDEV_STATE_FAULTED)) { /* post an offline state change */ zfs_post_state_change(spa, vd, vd->vdev_prevstate); } vd->vdev_stat.vs_aux = aux; return; } save_state = vd->vdev_state; vd->vdev_state = state; vd->vdev_stat.vs_aux = aux; /* * If we are setting the vdev state to anything but an open state, then * always close the underlying device unless the device has requested * a delayed close (i.e. we're about to remove or fault the device). * Otherwise, we keep accessible but invalid devices open forever. 
* We don't call vdev_close() itself, because that implies some extra * checks (offline, etc) that we don't want here. This is limited to * leaf devices, because otherwise closing the device will affect other * children. */ if (!vd->vdev_delayed_close && vdev_is_dead(vd) && vd->vdev_ops->vdev_op_leaf) vd->vdev_ops->vdev_op_close(vd); if (vd->vdev_removed && state == VDEV_STATE_CANT_OPEN && (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) { /* * If the previous state is set to VDEV_STATE_REMOVED, then this * device was previously marked removed and someone attempted to * reopen it. If this failed due to a nonexistent device, then * keep the device in the REMOVED state. We also let this be if * it is one of our special test online cases, which is only * attempting to online the device and shouldn't generate an FMA * fault. */ vd->vdev_state = VDEV_STATE_REMOVED; vd->vdev_stat.vs_aux = VDEV_AUX_NONE; } else if (state == VDEV_STATE_REMOVED) { vd->vdev_removed = B_TRUE; } else if (state == VDEV_STATE_CANT_OPEN) { /* * If we fail to open a vdev during an import or recovery, we * mark it as "not available", which signifies that it was * never there to begin with. Failure to open such a device * is not considered an error. */ if ((spa_load_state(spa) == SPA_LOAD_IMPORT || spa_load_state(spa) == SPA_LOAD_RECOVER) && vd->vdev_ops->vdev_op_leaf) vd->vdev_not_present = 1; /* * Post the appropriate ereport. If the 'prevstate' field is * set to something other than VDEV_STATE_UNKNOWN, it indicates * that this is part of a vdev_reopen(). In this case, we don't * want to post the ereport if the device was already in the * CANT_OPEN state beforehand. * * If the 'checkremove' flag is set, then this is an attempt to * online the device in response to an insertion event. If we * hit this case, then we have detected an insertion event for a * faulted or offline device that wasn't in the removed state. * In this scenario, we don't post an ereport because we are * about to replace the device, or attempt an online with * vdev_forcefault, which will generate the fault for us. */ if ((vd->vdev_prevstate != state || vd->vdev_forcefault) && !vd->vdev_not_present && !vd->vdev_checkremove && vd != spa->spa_root_vdev) { const char *class; switch (aux) { case VDEV_AUX_OPEN_FAILED: class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED; break; case VDEV_AUX_CORRUPT_DATA: class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA; break; case VDEV_AUX_NO_REPLICAS: class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS; break; case VDEV_AUX_BAD_GUID_SUM: class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM; break; case VDEV_AUX_TOO_SMALL: class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL; break; case VDEV_AUX_BAD_LABEL: class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL; break; default: class = FM_EREPORT_ZFS_DEVICE_UNKNOWN; } zfs_ereport_post(class, spa, vd, NULL, save_state, 0); } /* Erase any notion of persistent removed state */ vd->vdev_removed = B_FALSE; } else { vd->vdev_removed = B_FALSE; } /* * Notify ZED of any significant state-change on a leaf vdev. 
* */ if (vd->vdev_ops->vdev_op_leaf) { /* preserve original state from a vdev_reopen() */ if ((vd->vdev_prevstate != VDEV_STATE_UNKNOWN) && (vd->vdev_prevstate != vd->vdev_state) && (save_state <= VDEV_STATE_CLOSED)) save_state = vd->vdev_prevstate; /* filter out state change due to initial vdev_open */ if (save_state > VDEV_STATE_CLOSED) zfs_post_state_change(spa, vd, save_state); } if (!isopen && vd->vdev_parent) vdev_propagate_state(vd->vdev_parent); } /* * Check the vdev configuration to ensure that it's capable of supporting * a root pool. */ boolean_t vdev_is_bootable(vdev_t *vd) { #if defined(__sun__) || defined(__sun) /* * Currently, we do not support RAID-Z or partial configuration. * In addition, only a single top-level vdev is allowed and none of the * leaves can be wholedisks. */ int c; if (!vd->vdev_ops->vdev_op_leaf) { char *vdev_type = vd->vdev_ops->vdev_op_type; if (strcmp(vdev_type, VDEV_TYPE_ROOT) == 0 && vd->vdev_children > 1) { return (B_FALSE); } else if (strcmp(vdev_type, VDEV_TYPE_RAIDZ) == 0 || strcmp(vdev_type, VDEV_TYPE_MISSING) == 0) { return (B_FALSE); } } else if (vd->vdev_wholedisk == 1) { return (B_FALSE); } for (c = 0; c < vd->vdev_children; c++) { if (!vdev_is_bootable(vd->vdev_child[c])) return (B_FALSE); } #endif /* __sun__ || __sun */ return (B_TRUE); } /* * Load the state from the original vdev tree (ovd) which * we've retrieved from the MOS config object. If the original * vdev was offline or faulted then we transfer that state to the * device in the current vdev tree (nvd). */ void vdev_load_log_state(vdev_t *nvd, vdev_t *ovd) { int c; ASSERT(nvd->vdev_top->vdev_islog); ASSERT(spa_config_held(nvd->vdev_spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); ASSERT3U(nvd->vdev_guid, ==, ovd->vdev_guid); for (c = 0; c < nvd->vdev_children; c++) vdev_load_log_state(nvd->vdev_child[c], ovd->vdev_child[c]); if (nvd->vdev_ops->vdev_op_leaf) { /* * Restore the persistent vdev state */ nvd->vdev_offline = ovd->vdev_offline; nvd->vdev_faulted = ovd->vdev_faulted; nvd->vdev_degraded = ovd->vdev_degraded; nvd->vdev_removed = ovd->vdev_removed; } } /* * Determine if a log device has valid content. If the vdev was * removed or faulted in the MOS config then we know that * the content on the log device has already been written to the pool. */ boolean_t vdev_log_state_valid(vdev_t *vd) { int c; if (vd->vdev_ops->vdev_op_leaf && !vd->vdev_faulted && !vd->vdev_removed) return (B_TRUE); for (c = 0; c < vd->vdev_children; c++) if (vdev_log_state_valid(vd->vdev_child[c])) return (B_TRUE); return (B_FALSE); } /* * Expand a vdev if possible. */ void vdev_expand(vdev_t *vd, uint64_t txg) { ASSERT(vd->vdev_top == vd); ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count) { VERIFY(vdev_metaslab_init(vd, txg) == 0); vdev_config_dirty(vd); } } /* * Split a vdev. 
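 */

/*
 * [Worked example] vdev_expand() above initializes new metaslabs only
 * when the grown asize covers more metaslab-sized chunks than already
 * exist, i.e. (asize >> ms_shift) > ms_count.  Hypothetical numbers:
 * 1 GiB metaslabs (ms_shift == 30) on a vdev grown from 200 GiB to
 * 300 GiB add 100 metaslabs; growth smaller than one metaslab adds none.
 */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t ms_shift = 30;			/* 1 GiB per metaslab */
	uint64_t ms_count = 200;		/* before the grow */
	uint64_t asize = 300ULL << 30;		/* after the grow */

	if ((asize >> ms_shift) > ms_count)
		printf("add %llu metaslabs\n", (unsigned long long)
		    ((asize >> ms_shift) - ms_count));	/* 100 */
	return (0);
}

/*
 * vdev_split() follows.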
*/ void vdev_split(vdev_t *vd) { vdev_t *cvd, *pvd = vd->vdev_parent; vdev_remove_child(pvd, vd); vdev_compact_children(pvd); cvd = pvd->vdev_child[0]; if (pvd->vdev_children == 1) { vdev_remove_parent(cvd); cvd->vdev_splitting = B_TRUE; } vdev_propagate_state(cvd); } void vdev_deadman(vdev_t *vd) { int c; for (c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; vdev_deadman(cvd); } if (vd->vdev_ops->vdev_op_leaf) { vdev_queue_t *vq = &vd->vdev_queue; mutex_enter(&vq->vq_lock); if (avl_numnodes(&vq->vq_active_tree) > 0) { spa_t *spa = vd->vdev_spa; zio_t *fio; uint64_t delta; /* * Look at the head of all the pending queues, * if any I/O has been outstanding for longer than * the spa_deadman_synctime we log a zevent. */ fio = avl_first(&vq->vq_active_tree); delta = gethrtime() - fio->io_timestamp; if (delta > spa_deadman_synctime(spa)) { zfs_dbgmsg("SLOW IO: zio timestamp %lluns, " "delta %lluns, last io %lluns", fio->io_timestamp, delta, vq->vq_io_complete_ts); zfs_ereport_post(FM_EREPORT_ZFS_DELAY, spa, vd, fio, 0, 0); } } mutex_exit(&vq->vq_lock); } } #if defined(_KERNEL) && defined(HAVE_SPL) EXPORT_SYMBOL(vdev_fault); EXPORT_SYMBOL(vdev_degrade); EXPORT_SYMBOL(vdev_online); EXPORT_SYMBOL(vdev_offline); EXPORT_SYMBOL(vdev_clear); - +/* BEGIN CSTYLED */ module_param(metaslabs_per_vdev, int, 0644); MODULE_PARM_DESC(metaslabs_per_vdev, "Divide added vdev into approximately (but no more than) this number " "of metaslabs"); +/* END CSTYLED */ #endif diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c index dc2a346426ca..256431e6b334 100644 --- a/module/zfs/vdev_mirror.c +++ b/module/zfs/vdev_mirror.c @@ -1,670 +1,672 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* * Copyright (c) 2012, 2015 by Delphix. All rights reserved. */ #include #include #include #include #include #include /* * Virtual device vector for mirroring. */ typedef struct mirror_child { vdev_t *mc_vd; uint64_t mc_offset; int mc_error; int mc_load; uint8_t mc_tried; uint8_t mc_skipped; uint8_t mc_speculative; } mirror_child_t; typedef struct mirror_map { int *mm_preferred; int mm_preferred_cnt; int mm_children; boolean_t mm_replacing; boolean_t mm_root; mirror_child_t mm_child[]; } mirror_map_t; static int vdev_mirror_shift = 21; /* * The load configuration settings below are tuned by default for * the case where all devices are of the same rotational type. * * If there is a mixture of rotating and non-rotating media, setting * zfs_vdev_mirror_non_rotating_seek_inc to 0 may well provide better results * as it will direct more reads to the non-rotating vdevs which are more likely * to have a higher performance. 
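 *
 * As an illustration (hypothetical numbers, derived from the defaults
 * below and vdev_mirror_load()): with 3 I/Os already queued on a child,
 * its load is 3 + 0 for a sequential rotating I/O, 3 + 5/2 = 5 for a
 * rotating I/O within zfs_vdev_mirror_rotating_seek_offset of the last
 * one, 3 + 5 = 8 for a long rotating seek, and 3 + 0 or 3 + 1 for a
 * sequential or seeking I/O on non-rotating media.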
 */

/* Rotating media load calculation configuration. */
static int zfs_vdev_mirror_rotating_inc = 0;
static int zfs_vdev_mirror_rotating_seek_inc = 5;
static int zfs_vdev_mirror_rotating_seek_offset = 1 * 1024 * 1024;

/* Non-rotating media load calculation configuration. */
static int zfs_vdev_mirror_non_rotating_inc = 0;
static int zfs_vdev_mirror_non_rotating_seek_inc = 1;

static inline size_t
vdev_mirror_map_size(int children)
{
	return (offsetof(mirror_map_t, mm_child[children]) +
	    sizeof (int) * children);
}

static inline mirror_map_t *
vdev_mirror_map_alloc(int children, boolean_t replacing, boolean_t root)
{
	mirror_map_t *mm;

	mm = kmem_zalloc(vdev_mirror_map_size(children), KM_SLEEP);
	mm->mm_children = children;
	mm->mm_replacing = replacing;
	mm->mm_root = root;
	mm->mm_preferred = (int *)((uintptr_t)mm +
	    offsetof(mirror_map_t, mm_child[children]));

	return (mm);
}

static void
vdev_mirror_map_free(zio_t *zio)
{
	mirror_map_t *mm = zio->io_vsd;

	kmem_free(mm, vdev_mirror_map_size(mm->mm_children));
}

static const zio_vsd_ops_t vdev_mirror_vsd_ops = {
	vdev_mirror_map_free,
	zio_vsd_default_cksum_report
};

static int
vdev_mirror_load(mirror_map_t *mm, vdev_t *vd, uint64_t zio_offset)
{
	uint64_t lastoffset;
	int load;

	/* All DVAs have equal weight at the root. */
	if (mm->mm_root)
		return (INT_MAX);

	/*
	 * We don't return INT_MAX if the device is resilvering (i.e.
	 * vdev_resilver_txg != 0): when tested, overall performance was
	 * slightly worse when resilvering with that exclusion than without.
	 */

	/* Standard load based on pending queue length. */
	load = vdev_queue_length(vd);
	lastoffset = vdev_queue_lastoffset(vd);

	if (vd->vdev_nonrot) {
		/* Non-rotating media. */
		if (lastoffset == zio_offset)
			return (load + zfs_vdev_mirror_non_rotating_inc);

		/*
		 * Apply a seek penalty even for non-rotating devices as
		 * sequential I/O's can be aggregated into fewer operations on
		 * the device, thus avoiding unnecessary per-command overhead
		 * and boosting performance.
		 */
		return (load + zfs_vdev_mirror_non_rotating_seek_inc);
	}

	/* Rotating media I/O's which directly follow the last I/O. */
	if (lastoffset == zio_offset)
		return (load + zfs_vdev_mirror_rotating_inc);

	/*
	 * Apply half the seek increment to I/O's within seek offset
	 * of the last I/O queued to this vdev as they should incur less
	 * of a seek increment.
	 */
	if (ABS(lastoffset - zio_offset) < zfs_vdev_mirror_rotating_seek_offset)
		return (load + (zfs_vdev_mirror_rotating_seek_inc / 2));

	/* Apply the full seek increment to all other I/O's. */
	return (load + zfs_vdev_mirror_rotating_seek_inc);
}

/*
 * Avoid inlining the function to keep vdev_mirror_io_start(), which
 * is this function's only caller, as small as possible on the stack.
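 *
 * Note that the mirror_map_t built here is not on the stack either:
 * vdev_mirror_map_alloc() above returns a single kmem allocation laid out
 * roughly as [ mirror_map_t | mm_child[children] | mm_preferred ints ],
 * so only the returned pointer occupies the caller's frame.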
 */
noinline static mirror_map_t *
vdev_mirror_map_init(zio_t *zio)
{
	mirror_map_t *mm = NULL;
	mirror_child_t *mc;
	vdev_t *vd = zio->io_vd;
	int c;

	if (vd == NULL) {
		dva_t *dva = zio->io_bp->blk_dva;
		spa_t *spa = zio->io_spa;

		mm = vdev_mirror_map_alloc(BP_GET_NDVAS(zio->io_bp), B_FALSE,
		    B_TRUE);
		for (c = 0; c < mm->mm_children; c++) {
			mc = &mm->mm_child[c];

			mc->mc_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[c]));
			mc->mc_offset = DVA_GET_OFFSET(&dva[c]);
		}
	} else {
		mm = vdev_mirror_map_alloc(vd->vdev_children,
		    (vd->vdev_ops == &vdev_replacing_ops ||
		    vd->vdev_ops == &vdev_spare_ops), B_FALSE);
		for (c = 0; c < mm->mm_children; c++) {
			mc = &mm->mm_child[c];
			mc->mc_vd = vd->vdev_child[c];
			mc->mc_offset = zio->io_offset;
		}
	}

	zio->io_vsd = mm;
	zio->io_vsd_ops = &vdev_mirror_vsd_ops;
	return (mm);
}

static int
vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
    uint64_t *ashift)
{
	int numerrors = 0;
	int lasterror = 0;
	int c;

	if (vd->vdev_children == 0) {
		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (SET_ERROR(EINVAL));
	}

	vdev_open_children(vd);

	for (c = 0; c < vd->vdev_children; c++) {
		vdev_t *cvd = vd->vdev_child[c];

		if (cvd->vdev_open_error) {
			lasterror = cvd->vdev_open_error;
			numerrors++;
			continue;
		}

		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
		*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
		*ashift = MAX(*ashift, cvd->vdev_ashift);
	}

	if (numerrors == vd->vdev_children) {
		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
		return (lasterror);
	}

	return (0);
}

static void
vdev_mirror_close(vdev_t *vd)
{
	int c;

	for (c = 0; c < vd->vdev_children; c++)
		vdev_close(vd->vdev_child[c]);
}

static void
vdev_mirror_child_done(zio_t *zio)
{
	mirror_child_t *mc = zio->io_private;

	mc->mc_error = zio->io_error;
	mc->mc_tried = 1;
	mc->mc_skipped = 0;
}

static void
vdev_mirror_scrub_done(zio_t *zio)
{
	mirror_child_t *mc = zio->io_private;

	if (zio->io_error == 0) {
		zio_t *pio;
		zio_link_t *zl = NULL;

		mutex_enter(&zio->io_lock);
		while ((pio = zio_walk_parents(zio, &zl)) != NULL) {
			mutex_enter(&pio->io_lock);
			ASSERT3U(zio->io_size, >=, pio->io_size);
			abd_copy(pio->io_abd, zio->io_abd, pio->io_size);
			mutex_exit(&pio->io_lock);
		}
		mutex_exit(&zio->io_lock);
	}

	abd_free(zio->io_abd);

	mc->mc_error = zio->io_error;
	mc->mc_tried = 1;
	mc->mc_skipped = 0;
}

/*
 * Check the other, lower-index DVAs to see if they're on the same
 * vdev as the child we picked.  If they are, use them since they
 * are likely to have been allocated from the primary metaslab in
 * use at the time, and hence are more likely to have locality with
 * single-copy data.
 */
static int
vdev_mirror_dva_select(zio_t *zio, int p)
{
	dva_t *dva = zio->io_bp->blk_dva;
	mirror_map_t *mm = zio->io_vsd;
	int preferred;
	int c;

	preferred = mm->mm_preferred[p];
	for (p--; p >= 0; p--) {
		c = mm->mm_preferred[p];
		if (DVA_GET_VDEV(&dva[c]) == DVA_GET_VDEV(&dva[preferred]))
			preferred = c;
	}
	return (preferred);
}

static int
vdev_mirror_preferred_child_randomize(zio_t *zio)
{
	mirror_map_t *mm = zio->io_vsd;
	int p;

	if (mm->mm_root) {
		p = spa_get_random(mm->mm_preferred_cnt);
		return (vdev_mirror_dva_select(zio, p));
	}

	/*
	 * To ensure we don't always favour the first matching vdev,
	 * which could lead to wear leveling issues on SSD's, we
	 * use the I/O offset as a pseudo random seed into the vdevs
	 * which have the lowest load.
	 */
	p = (zio->io_offset >> vdev_mirror_shift) % mm->mm_preferred_cnt;
	return (mm->mm_preferred[p]);
}

/*
 * Try to find a child whose DTL doesn't contain the block we want to read,
 * preferring vdevs based on determined load.
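 *
 * The selection makes two passes, descriptively: the first computes the
 * load of every untried, readable child and collects every child tied for
 * the lowest load in mm_preferred[]; a single winner is used directly,
 * ties are broken pseudo-randomly by
 * vdev_mirror_preferred_child_randomize(), and the chosen child's queue
 * records this I/O's offset for future locality decisions.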
 *
 * If we can't, try the read on any vdev we haven't already tried.
 */
static int
vdev_mirror_child_select(zio_t *zio)
{
	mirror_map_t *mm = zio->io_vsd;
	uint64_t txg = zio->io_txg;
	int c, lowest_load;

	ASSERT(zio->io_bp == NULL || BP_PHYSICAL_BIRTH(zio->io_bp) == txg);

	lowest_load = INT_MAX;
	mm->mm_preferred_cnt = 0;
	for (c = 0; c < mm->mm_children; c++) {
		mirror_child_t *mc;

		mc = &mm->mm_child[c];
		if (mc->mc_tried || mc->mc_skipped)
			continue;

		if (mc->mc_vd == NULL || !vdev_readable(mc->mc_vd)) {
			mc->mc_error = SET_ERROR(ENXIO);
			mc->mc_tried = 1;	/* don't even try */
			mc->mc_skipped = 1;
			continue;
		}

		if (vdev_dtl_contains(mc->mc_vd, DTL_MISSING, txg, 1)) {
			mc->mc_error = SET_ERROR(ESTALE);
			mc->mc_skipped = 1;
			mc->mc_speculative = 1;
			continue;
		}

		mc->mc_load = vdev_mirror_load(mm, mc->mc_vd, mc->mc_offset);
		if (mc->mc_load > lowest_load)
			continue;

		if (mc->mc_load < lowest_load) {
			lowest_load = mc->mc_load;
			mm->mm_preferred_cnt = 0;
		}
		mm->mm_preferred[mm->mm_preferred_cnt] = c;
		mm->mm_preferred_cnt++;
	}

	if (mm->mm_preferred_cnt == 1) {
		vdev_queue_register_lastoffset(
		    mm->mm_child[mm->mm_preferred[0]].mc_vd, zio);
		return (mm->mm_preferred[0]);
	}

	if (mm->mm_preferred_cnt > 1) {
		int c = vdev_mirror_preferred_child_randomize(zio);

		vdev_queue_register_lastoffset(mm->mm_child[c].mc_vd, zio);
		return (c);
	}

	/*
	 * Every device is either missing or has this txg in its DTL.
	 * Look for any child we haven't already tried before giving up.
	 */
	for (c = 0; c < mm->mm_children; c++) {
		if (!mm->mm_child[c].mc_tried) {
			vdev_queue_register_lastoffset(mm->mm_child[c].mc_vd,
			    zio);
			return (c);
		}
	}

	/*
	 * Every child failed. There's no place left to look.
	 */
	return (-1);
}

static void
vdev_mirror_io_start(zio_t *zio)
{
	mirror_map_t *mm;
	mirror_child_t *mc;
	int c, children;

	mm = vdev_mirror_map_init(zio);

	if (zio->io_type == ZIO_TYPE_READ) {
		if ((zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_replacing) {
			/*
			 * For scrubbing reads we need to allocate a read
			 * buffer for each child and issue reads to all
			 * children. If any child succeeds, it will copy its
			 * data into zio->io_abd in vdev_mirror_scrub_done.
			 */
			for (c = 0; c < mm->mm_children; c++) {
				mc = &mm->mm_child[c];
				zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
				    mc->mc_vd, mc->mc_offset,
				    abd_alloc_sametype(zio->io_abd,
				    zio->io_size), zio->io_size,
				    zio->io_type, zio->io_priority, 0,
				    vdev_mirror_scrub_done, mc));
			}
			zio_execute(zio);
			return;
		}
		/*
		 * For normal reads just pick one child.
		 */
		c = vdev_mirror_child_select(zio);
		children = (c >= 0);
	} else {
		ASSERT(zio->io_type == ZIO_TYPE_WRITE);

		/*
		 * Writes go to all children.
		 */
		c = 0;
		children = mm->mm_children;
	}

	while (children--) {
		mc = &mm->mm_child[c];
		zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
		    mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size,
		    zio->io_type, zio->io_priority, 0,
		    vdev_mirror_child_done, mc));
		c++;
	}

	zio_execute(zio);
}

static int
vdev_mirror_worst_error(mirror_map_t *mm)
{
	int c, error[2] = { 0, 0 };

	for (c = 0; c < mm->mm_children; c++) {
		mirror_child_t *mc = &mm->mm_child[c];
		int s = mc->mc_speculative;
		error[s] = zio_worst_error(error[s], mc->mc_error);
	}

	return (error[0] ?
error[0] : error[1]); } static void vdev_mirror_io_done(zio_t *zio) { mirror_map_t *mm = zio->io_vsd; mirror_child_t *mc; int c; int good_copies = 0; int unexpected_errors = 0; for (c = 0; c < mm->mm_children; c++) { mc = &mm->mm_child[c]; if (mc->mc_error) { if (!mc->mc_skipped) unexpected_errors++; } else if (mc->mc_tried) { good_copies++; } } if (zio->io_type == ZIO_TYPE_WRITE) { /* * XXX -- for now, treat partial writes as success. * * Now that we support write reallocation, it would be better * to treat partial failure as real failure unless there are * no non-degraded top-level vdevs left, and not update DTLs * if we intend to reallocate. */ /* XXPOLICY */ if (good_copies != mm->mm_children) { /* * Always require at least one good copy. * * For ditto blocks (io_vd == NULL), require * all copies to be good. * * XXX -- for replacing vdevs, there's no great answer. * If the old device is really dead, we may not even * be able to access it -- so we only want to * require good writes to the new device. But if * the new device turns out to be flaky, we want * to be able to detach it -- which requires all * writes to the old device to have succeeded. */ if (good_copies == 0 || zio->io_vd == NULL) zio->io_error = vdev_mirror_worst_error(mm); } return; } ASSERT(zio->io_type == ZIO_TYPE_READ); /* * If we don't have a good copy yet, keep trying other children. */ /* XXPOLICY */ if (good_copies == 0 && (c = vdev_mirror_child_select(zio)) != -1) { ASSERT(c >= 0 && c < mm->mm_children); mc = &mm->mm_child[c]; zio_vdev_io_redone(zio); zio_nowait(zio_vdev_child_io(zio, zio->io_bp, mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size, ZIO_TYPE_READ, zio->io_priority, 0, vdev_mirror_child_done, mc)); return; } /* XXPOLICY */ if (good_copies == 0) { zio->io_error = vdev_mirror_worst_error(mm); ASSERT(zio->io_error != 0); } if (good_copies && spa_writeable(zio->io_spa) && (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER) || ((zio->io_flags & ZIO_FLAG_SCRUB) && mm->mm_replacing))) { /* * Use the good data we have in hand to repair damaged children. */ for (c = 0; c < mm->mm_children; c++) { /* * Don't rewrite known good children. * Not only is it unnecessary, it could * actually be harmful: if the system lost * power while rewriting the only good copy, * there would be no good copies left! */ mc = &mm->mm_child[c]; if (mc->mc_error == 0) { if (mc->mc_tried) continue; if (!(zio->io_flags & ZIO_FLAG_SCRUB) && !vdev_dtl_contains(mc->mc_vd, DTL_PARTIAL, zio->io_txg, 1)) continue; mc->mc_error = SET_ERROR(ESTALE); } zio_nowait(zio_vdev_child_io(zio, zio->io_bp, mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size, ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_IO_REPAIR | (unexpected_errors ? 
			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
		}
	}
}

static void
vdev_mirror_state_change(vdev_t *vd, int faulted, int degraded)
{
	if (faulted == vd->vdev_children)
		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_NO_REPLICAS);
	else if (degraded + faulted != 0)
		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
	else
		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
}

vdev_ops_t vdev_mirror_ops = {
	vdev_mirror_open,
	vdev_mirror_close,
	vdev_default_asize,
	vdev_mirror_io_start,
	vdev_mirror_io_done,
	vdev_mirror_state_change,
	NULL,
	NULL,
	VDEV_TYPE_MIRROR,	/* name of this vdev type */
	B_FALSE			/* not a leaf vdev */
};

vdev_ops_t vdev_replacing_ops = {
	vdev_mirror_open,
	vdev_mirror_close,
	vdev_default_asize,
	vdev_mirror_io_start,
	vdev_mirror_io_done,
	vdev_mirror_state_change,
	NULL,
	NULL,
	VDEV_TYPE_REPLACING,	/* name of this vdev type */
	B_FALSE			/* not a leaf vdev */
};

vdev_ops_t vdev_spare_ops = {
	vdev_mirror_open,
	vdev_mirror_close,
	vdev_default_asize,
	vdev_mirror_io_start,
	vdev_mirror_io_done,
	vdev_mirror_state_change,
	NULL,
	NULL,
	VDEV_TYPE_SPARE,	/* name of this vdev type */
	B_FALSE			/* not a leaf vdev */
};

#if defined(_KERNEL) && defined(HAVE_SPL)
+/* BEGIN CSTYLED */
module_param(zfs_vdev_mirror_rotating_inc, int, 0644);
MODULE_PARM_DESC(zfs_vdev_mirror_rotating_inc,
	"Rotating media load increment for non-seeking I/O's");

module_param(zfs_vdev_mirror_rotating_seek_inc, int, 0644);
MODULE_PARM_DESC(zfs_vdev_mirror_rotating_seek_inc,
	"Rotating media load increment for seeking I/O's");

module_param(zfs_vdev_mirror_rotating_seek_offset, int, 0644);
+
MODULE_PARM_DESC(zfs_vdev_mirror_rotating_seek_offset,
	"Offset in bytes from the last I/O which "
	"triggers a reduced rotating media seek increment");

module_param(zfs_vdev_mirror_non_rotating_inc, int, 0644);
MODULE_PARM_DESC(zfs_vdev_mirror_non_rotating_inc,
	"Non-rotating media load increment for non-seeking I/O's");

module_param(zfs_vdev_mirror_non_rotating_seek_inc, int, 0644);
MODULE_PARM_DESC(zfs_vdev_mirror_non_rotating_seek_inc,
	"Non-rotating media load increment for seeking I/O's");
-
+/* END CSTYLED */
#endif
diff --git a/module/zfs/vdev_raidz_math.c b/module/zfs/vdev_raidz_math.c
index a175bcf770d9..a64e3b023574 100644
--- a/module/zfs/vdev_raidz_math.c
+++ b/module/zfs/vdev_raidz_math.c
@@ -1,652 +1,652 @@
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
 */

#include
#include
#include
#include
#include
#include
#include

extern boolean_t raidz_will_scalar_work(void);

/* Opaque implementation with NULL methods to represent original methods */
static const raidz_impl_ops_t vdev_raidz_original_impl = {
	.name = "original",
	.is_supported = raidz_will_scalar_work,
};

/* RAIDZ parity op that contains the fastest methods */
static raidz_impl_ops_t vdev_raidz_fastest_impl = {
	.name = "fastest"
};

/* All compiled in implementations */
const raidz_impl_ops_t *raidz_all_maths[] = {
	&vdev_raidz_original_impl,
	&vdev_raidz_scalar_impl,
#if defined(__x86_64) && defined(HAVE_SSE2)	/* only x86_64 for now */
	&vdev_raidz_sse2_impl,
#endif
#if defined(__x86_64) && defined(HAVE_SSSE3)	/* only x86_64 for now */
	&vdev_raidz_ssse3_impl,
#endif
#if defined(__x86_64) && defined(HAVE_AVX2)	/* only x86_64 for now */
	&vdev_raidz_avx2_impl,
#endif
#if defined(__x86_64) && defined(HAVE_AVX512F)	/* only x86_64 for now */
	&vdev_raidz_avx512f_impl,
#endif
#if defined(__x86_64) && defined(HAVE_AVX512BW)	/* only x86_64 for now */
	&vdev_raidz_avx512bw_impl,
#endif
#if defined(__aarch64__)
	&vdev_raidz_aarch64_neon_impl,
	&vdev_raidz_aarch64_neonx2_impl,
#endif
};

/* Indicate that benchmark has been completed */
static boolean_t raidz_math_initialized = B_FALSE;

/* Select raidz implementation */
#define	IMPL_FASTEST	(UINT32_MAX)
#define	IMPL_CYCLE	(UINT32_MAX - 1)
#define	IMPL_ORIGINAL	(0)
#define	IMPL_SCALAR	(1)

#define	RAIDZ_IMPL_READ(i)	(*(volatile uint32_t *) &(i))

static uint32_t zfs_vdev_raidz_impl = IMPL_SCALAR;
static uint32_t user_sel_impl = IMPL_FASTEST;

/* Hold all supported implementations */
static size_t raidz_supp_impl_cnt = 0;
static raidz_impl_ops_t *raidz_supp_impl[ARRAY_SIZE(raidz_all_maths)];

/*
 * kstats values for supported implementations
 * Values represent per disk throughput of 8 disk+parity raidz vdev [B/s]
 */
static raidz_impl_kstat_t raidz_impl_kstats[ARRAY_SIZE(raidz_all_maths) + 1];

/* kstat for benchmarked implementations */
static kstat_t *raidz_math_kstat = NULL;

/*
 * Selects the raidz operation for raidz_map.
 * If rm_ops is set to NULL, the original raidz implementation will be used.
 */
raidz_impl_ops_t *
vdev_raidz_math_get_ops()
{
	raidz_impl_ops_t *ops = NULL;
	const uint32_t impl = RAIDZ_IMPL_READ(zfs_vdev_raidz_impl);

	switch (impl) {
	case IMPL_FASTEST:
		ASSERT(raidz_math_initialized);
		ops = &vdev_raidz_fastest_impl;
		break;
#if !defined(_KERNEL)
	case IMPL_CYCLE:
	{
		ASSERT(raidz_math_initialized);
		ASSERT3U(raidz_supp_impl_cnt, >, 0);
		/* Cycle through all supported implementations */
		static size_t cycle_impl_idx = 0;
		size_t idx = (++cycle_impl_idx) % raidz_supp_impl_cnt;
		ops = raidz_supp_impl[idx];
	}
	break;
#endif
	case IMPL_ORIGINAL:
		ops = (raidz_impl_ops_t *)&vdev_raidz_original_impl;
		break;
	case IMPL_SCALAR:
		ops = (raidz_impl_ops_t *)&vdev_raidz_scalar_impl;
		break;
	default:
		ASSERT3U(impl, <, raidz_supp_impl_cnt);
		ASSERT3U(raidz_supp_impl_cnt, >, 0);
		ops = raidz_supp_impl[impl];
		break;
	}

	ASSERT3P(ops, !=, NULL);

	return (ops);
}

/*
 * Select parity generation method for raidz_map
 */
int
vdev_raidz_math_generate(raidz_map_t *rm)
{
	raidz_gen_f gen_parity = NULL;

	switch (raidz_parity(rm)) {
	case 1:
		gen_parity = rm->rm_ops->gen[RAIDZ_GEN_P];
		break;
	case 2:
		gen_parity = rm->rm_ops->gen[RAIDZ_GEN_PQ];
		break;
	case 3:
		gen_parity = rm->rm_ops->gen[RAIDZ_GEN_PQR];
		break;
	default:
		gen_parity = NULL;
		cmn_err(CE_PANIC, "invalid RAID-Z configuration %d",
		    raidz_parity(rm));
		break;
	}

	/* if method is NULL execute the original implementation */
	if (gen_parity == NULL)
		return
(RAIDZ_ORIGINAL_IMPL); gen_parity(rm); return (0); } static raidz_rec_f reconstruct_fun_p_sel(raidz_map_t *rm, const int *parity_valid, - const int nbaddata) + const int nbaddata) { if (nbaddata == 1 && parity_valid[CODE_P]) { return (rm->rm_ops->rec[RAIDZ_REC_P]); } return ((raidz_rec_f) NULL); } static raidz_rec_f reconstruct_fun_pq_sel(raidz_map_t *rm, const int *parity_valid, - const int nbaddata) + const int nbaddata) { if (nbaddata == 1) { if (parity_valid[CODE_P]) { return (rm->rm_ops->rec[RAIDZ_REC_P]); } else if (parity_valid[CODE_Q]) { return (rm->rm_ops->rec[RAIDZ_REC_Q]); } } else if (nbaddata == 2 && parity_valid[CODE_P] && parity_valid[CODE_Q]) { return (rm->rm_ops->rec[RAIDZ_REC_PQ]); } return ((raidz_rec_f) NULL); } static raidz_rec_f reconstruct_fun_pqr_sel(raidz_map_t *rm, const int *parity_valid, - const int nbaddata) + const int nbaddata) { if (nbaddata == 1) { if (parity_valid[CODE_P]) { return (rm->rm_ops->rec[RAIDZ_REC_P]); } else if (parity_valid[CODE_Q]) { return (rm->rm_ops->rec[RAIDZ_REC_Q]); } else if (parity_valid[CODE_R]) { return (rm->rm_ops->rec[RAIDZ_REC_R]); } } else if (nbaddata == 2) { if (parity_valid[CODE_P] && parity_valid[CODE_Q]) { return (rm->rm_ops->rec[RAIDZ_REC_PQ]); } else if (parity_valid[CODE_P] && parity_valid[CODE_R]) { return (rm->rm_ops->rec[RAIDZ_REC_PR]); } else if (parity_valid[CODE_Q] && parity_valid[CODE_R]) { return (rm->rm_ops->rec[RAIDZ_REC_QR]); } } else if (nbaddata == 3 && parity_valid[CODE_P] && parity_valid[CODE_Q] && parity_valid[CODE_R]) { return (rm->rm_ops->rec[RAIDZ_REC_PQR]); } return ((raidz_rec_f) NULL); } /* * Select data reconstruction method for raidz_map * @parity_valid - Parity validity flag * @dt - Failed data index array * @nbaddata - Number of failed data columns */ int vdev_raidz_math_reconstruct(raidz_map_t *rm, const int *parity_valid, - const int *dt, const int nbaddata) + const int *dt, const int nbaddata) { raidz_rec_f rec_fn = NULL; switch (raidz_parity(rm)) { case PARITY_P: rec_fn = reconstruct_fun_p_sel(rm, parity_valid, nbaddata); break; case PARITY_PQ: rec_fn = reconstruct_fun_pq_sel(rm, parity_valid, nbaddata); break; case PARITY_PQR: rec_fn = reconstruct_fun_pqr_sel(rm, parity_valid, nbaddata); break; default: cmn_err(CE_PANIC, "invalid RAID-Z configuration %d", raidz_parity(rm)); break; } if (rec_fn == NULL) return (RAIDZ_ORIGINAL_IMPL); else return (rec_fn(rm, dt)); } const char *raidz_gen_name[] = { "gen_p", "gen_pq", "gen_pqr" }; const char *raidz_rec_name[] = { "rec_p", "rec_q", "rec_r", "rec_pq", "rec_pr", "rec_qr", "rec_pqr" }; #define RAIDZ_KSTAT_LINE_LEN (17 + 10*12 + 1) static int raidz_math_kstat_headers(char *buf, size_t size) { int i; ssize_t off; ASSERT3U(size, >=, RAIDZ_KSTAT_LINE_LEN); off = snprintf(buf, size, "%-17s", "implementation"); for (i = 0; i < ARRAY_SIZE(raidz_gen_name); i++) off += snprintf(buf + off, size - off, "%-16s", raidz_gen_name[i]); for (i = 0; i < ARRAY_SIZE(raidz_rec_name); i++) off += snprintf(buf + off, size - off, "%-16s", raidz_rec_name[i]); (void) snprintf(buf + off, size - off, "\n"); return (0); } static int raidz_math_kstat_data(char *buf, size_t size, void *data) { raidz_impl_kstat_t *fstat = &raidz_impl_kstats[raidz_supp_impl_cnt]; raidz_impl_kstat_t *cstat = (raidz_impl_kstat_t *)data; ssize_t off = 0; int i; ASSERT3U(size, >=, RAIDZ_KSTAT_LINE_LEN); if (cstat == fstat) { off += snprintf(buf + off, size - off, "%-17s", "fastest"); for (i = 0; i < ARRAY_SIZE(raidz_gen_name); i++) { int id = fstat->gen[i]; off += snprintf(buf + off, size - off, 
"%-16s", raidz_supp_impl[id]->name); } for (i = 0; i < ARRAY_SIZE(raidz_rec_name); i++) { int id = fstat->rec[i]; off += snprintf(buf + off, size - off, "%-16s", raidz_supp_impl[id]->name); } } else { ptrdiff_t id = cstat - raidz_impl_kstats; off += snprintf(buf + off, size - off, "%-17s", raidz_supp_impl[id]->name); for (i = 0; i < ARRAY_SIZE(raidz_gen_name); i++) off += snprintf(buf + off, size - off, "%-16llu", (u_longlong_t)cstat->gen[i]); for (i = 0; i < ARRAY_SIZE(raidz_rec_name); i++) off += snprintf(buf + off, size - off, "%-16llu", (u_longlong_t)cstat->rec[i]); } (void) snprintf(buf + off, size - off, "\n"); return (0); } static void * raidz_math_kstat_addr(kstat_t *ksp, loff_t n) { if (n <= raidz_supp_impl_cnt) ksp->ks_private = (void *) (raidz_impl_kstats + n); else ksp->ks_private = NULL; return (ksp->ks_private); } #define BENCH_D_COLS (8ULL) #define BENCH_COLS (BENCH_D_COLS + PARITY_PQR) #define BENCH_ZIO_SIZE (1ULL << SPA_OLD_MAXBLOCKSHIFT) /* 128 kiB */ #define BENCH_NS MSEC2NSEC(25) /* 25ms */ typedef void (*benchmark_fn)(raidz_map_t *rm, const int fn); static void benchmark_gen_impl(raidz_map_t *rm, const int fn) { (void) fn; vdev_raidz_generate_parity(rm); } static void benchmark_rec_impl(raidz_map_t *rm, const int fn) { static const int rec_tgt[7][3] = { {1, 2, 3}, /* rec_p: bad QR & D[0] */ {0, 2, 3}, /* rec_q: bad PR & D[0] */ {0, 1, 3}, /* rec_r: bad PQ & D[0] */ {2, 3, 4}, /* rec_pq: bad R & D[0][1] */ {1, 3, 4}, /* rec_pr: bad Q & D[0][1] */ {0, 3, 4}, /* rec_qr: bad P & D[0][1] */ {3, 4, 5} /* rec_pqr: bad & D[0][1][2] */ }; vdev_raidz_reconstruct(rm, rec_tgt[fn], 3); } /* * Benchmarking of all supported implementations (raidz_supp_impl_cnt) * is performed by setting the rm_ops pointer and calling the top level * generate/reconstruct methods of bench_rm. 
 */
static void
benchmark_raidz_impl(raidz_map_t *bench_rm, const int fn, benchmark_fn bench_fn)
{
	uint64_t run_cnt, speed, best_speed = 0;
	hrtime_t t_start, t_diff;
	raidz_impl_ops_t *curr_impl;
	raidz_impl_kstat_t *fstat = &raidz_impl_kstats[raidz_supp_impl_cnt];
	int impl, i;

	for (impl = 0; impl < raidz_supp_impl_cnt; impl++) {
		/* set an implementation to benchmark */
		curr_impl = raidz_supp_impl[impl];
		bench_rm->rm_ops = curr_impl;

		run_cnt = 0;
		t_start = gethrtime();

		do {
			for (i = 0; i < 25; i++, run_cnt++)
				bench_fn(bench_rm, fn);

			t_diff = gethrtime() - t_start;
		} while (t_diff < BENCH_NS);

		speed = run_cnt * BENCH_ZIO_SIZE * NANOSEC;
		speed /= (t_diff * BENCH_COLS);

		if (bench_fn == benchmark_gen_impl)
			raidz_impl_kstats[impl].gen[fn] = speed;
		else
			raidz_impl_kstats[impl].rec[fn] = speed;

		/* Update fastest implementation method */
		if (speed > best_speed) {
			best_speed = speed;

			if (bench_fn == benchmark_gen_impl) {
				fstat->gen[fn] = impl;
				vdev_raidz_fastest_impl.gen[fn] =
				    curr_impl->gen[fn];
			} else {
				fstat->rec[fn] = impl;
				vdev_raidz_fastest_impl.rec[fn] =
				    curr_impl->rec[fn];
			}
		}
	}
}

void
vdev_raidz_math_init(void)
{
	raidz_impl_ops_t *curr_impl;
	zio_t *bench_zio = NULL;
	raidz_map_t *bench_rm = NULL;
	uint64_t bench_parity;
	int i, c, fn;

	/* move supported impl into raidz_supp_impl */
	for (i = 0, c = 0; i < ARRAY_SIZE(raidz_all_maths); i++) {
		curr_impl = (raidz_impl_ops_t *)raidz_all_maths[i];

		/* initialize impl */
		if (curr_impl->init)
			curr_impl->init();

		if (curr_impl->is_supported())
			raidz_supp_impl[c++] = (raidz_impl_ops_t *)curr_impl;
	}
	membar_producer();		/* complete raidz_supp_impl[] init */
	raidz_supp_impl_cnt = c;	/* number of supported impl */

#if !defined(_KERNEL)
	/* Skip benchmarking and use last implementation as fastest */
	memcpy(&vdev_raidz_fastest_impl,
	    raidz_supp_impl[raidz_supp_impl_cnt-1],
	    sizeof (vdev_raidz_fastest_impl));
	strcpy(vdev_raidz_fastest_impl.name, "fastest");

	raidz_math_initialized = B_TRUE;

	/* Use 'cycle' math selection method for userspace */
	VERIFY0(vdev_raidz_impl_set("cycle"));
	return;
#endif

	/* Fake a zio and run the benchmark on a warmed up buffer */
	bench_zio = kmem_zalloc(sizeof (zio_t), KM_SLEEP);
	bench_zio->io_offset = 0;
	bench_zio->io_size = BENCH_ZIO_SIZE; /* only data columns */
	bench_zio->io_abd = abd_alloc_linear(BENCH_ZIO_SIZE, B_TRUE);
	memset(abd_to_buf(bench_zio->io_abd), 0xAA, BENCH_ZIO_SIZE);

	/* Benchmark parity generation methods */
	for (fn = 0; fn < RAIDZ_GEN_NUM; fn++) {
		bench_parity = fn + 1;
		/* New raidz_map is needed for each generate_p/q/r */
		bench_rm = vdev_raidz_map_alloc(bench_zio, SPA_MINBLOCKSHIFT,
		    BENCH_D_COLS + bench_parity, bench_parity);

		benchmark_raidz_impl(bench_rm, fn, benchmark_gen_impl);

		vdev_raidz_map_free(bench_rm);
	}

	/* Benchmark data reconstruction methods */
	bench_rm = vdev_raidz_map_alloc(bench_zio, SPA_MINBLOCKSHIFT,
	    BENCH_COLS, PARITY_PQR);

	for (fn = 0; fn < RAIDZ_REC_NUM; fn++)
		benchmark_raidz_impl(bench_rm, fn, benchmark_rec_impl);

	vdev_raidz_map_free(bench_rm);

	/* cleanup the bench zio */
	abd_free(bench_zio->io_abd);
	kmem_free(bench_zio, sizeof (zio_t));

	/* install kstats for all impl */
	raidz_math_kstat = kstat_create("zfs", 0, "vdev_raidz_bench", "misc",
	    KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);

	if (raidz_math_kstat != NULL) {
		raidz_math_kstat->ks_data = NULL;
		raidz_math_kstat->ks_ndata = UINT32_MAX;
		kstat_set_raw_ops(raidz_math_kstat,
		    raidz_math_kstat_headers,
		    raidz_math_kstat_data,
		    raidz_math_kstat_addr);
		kstat_install(raidz_math_kstat);
	}

	/* Finish initialization */
	atomic_swap_32(&zfs_vdev_raidz_impl,
	    user_sel_impl);
	raidz_math_initialized = B_TRUE;
}

void
vdev_raidz_math_fini(void)
{
	raidz_impl_ops_t const *curr_impl;
	int i;

	if (raidz_math_kstat != NULL) {
		kstat_delete(raidz_math_kstat);
		raidz_math_kstat = NULL;
	}

	/* fini impl */
	for (i = 0; i < ARRAY_SIZE(raidz_all_maths); i++) {
		curr_impl = raidz_all_maths[i];
		if (curr_impl->fini)
			curr_impl->fini();
	}
}

static const struct {
	char *name;
	uint32_t sel;
} math_impl_opts[] = {
#if !defined(_KERNEL)
	{ "cycle",	IMPL_CYCLE },
#endif
	{ "fastest",	IMPL_FASTEST },
	{ "original",	IMPL_ORIGINAL },
	{ "scalar",	IMPL_SCALAR }
};

/*
 * Sets the desired raidz implementation.
 *
 * If we are called before init(), the user preference will be saved in
 * user_sel_impl and applied in a later init() call. This occurs when the
 * module parameter is specified on module load. Otherwise, directly update
 * zfs_vdev_raidz_impl.
 *
 * @val		Name of raidz implementation to use
 */
int
vdev_raidz_impl_set(const char *val)
{
	int err = -EINVAL;
	char req_name[RAIDZ_IMPL_NAME_MAX];
	uint32_t impl = RAIDZ_IMPL_READ(user_sel_impl);
	size_t i;

	/* sanitize input */
	i = strnlen(val, RAIDZ_IMPL_NAME_MAX);
	if (i == 0 || i == RAIDZ_IMPL_NAME_MAX)
		return (err);

	strlcpy(req_name, val, RAIDZ_IMPL_NAME_MAX);
	while (i > 0 && !!isspace(req_name[i-1]))
		i--;
	req_name[i] = '\0';

	/* Check mandatory options */
	for (i = 0; i < ARRAY_SIZE(math_impl_opts); i++) {
		if (strcmp(req_name, math_impl_opts[i].name) == 0) {
			impl = math_impl_opts[i].sel;
			err = 0;
			break;
		}
	}

	/* check all supported impl if init() was already called */
	if (err != 0 && raidz_math_initialized) {
		/* check all supported implementations */
		for (i = 0; i < raidz_supp_impl_cnt; i++) {
			if (strcmp(req_name, raidz_supp_impl[i]->name) == 0) {
				impl = i;
				err = 0;
				break;
			}
		}
	}

	if (err == 0) {
		if (raidz_math_initialized)
			atomic_swap_32(&zfs_vdev_raidz_impl, impl);
		else
			atomic_swap_32(&user_sel_impl, impl);
	}

	return (err);
}

#if defined(_KERNEL) && defined(HAVE_SPL)
#include

static int
zfs_vdev_raidz_impl_set(const char *val, zfs_kernel_param_t *kp)
{
	return (vdev_raidz_impl_set(val));
}

static int
zfs_vdev_raidz_impl_get(char *buffer, zfs_kernel_param_t *kp)
{
	int i, cnt = 0;
	char *fmt;
	const uint32_t impl = RAIDZ_IMPL_READ(zfs_vdev_raidz_impl);

	ASSERT(raidz_math_initialized);

	/* list mandatory options */
	for (i = 0; i < ARRAY_SIZE(math_impl_opts) - 2; i++) {
		fmt = (impl == math_impl_opts[i].sel) ? "[%s] " : "%s ";
		cnt += sprintf(buffer + cnt, fmt, math_impl_opts[i].name);
	}

	/* list all supported implementations */
	for (i = 0; i < raidz_supp_impl_cnt; i++) {
		fmt = (i == impl) ? "[%s] " : "%s ";
		cnt += sprintf(buffer + cnt, fmt, raidz_supp_impl[i]->name);
	}

	return (cnt);
}

module_param_call(zfs_vdev_raidz_impl, zfs_vdev_raidz_impl_set,
-    zfs_vdev_raidz_impl_get, NULL, 0644);
+	zfs_vdev_raidz_impl_get, NULL, 0644);
MODULE_PARM_DESC(zfs_vdev_raidz_impl, "Select raidz implementation.");
#endif
diff --git a/module/zfs/vdev_raidz_math_aarch64_neon.c b/module/zfs/vdev_raidz_math_aarch64_neon.c
index c7b8afd38893..e3ad06776503 100644
--- a/module/zfs/vdev_raidz_math_aarch64_neon.c
+++ b/module/zfs/vdev_raidz_math_aarch64_neon.c
@@ -1,2279 +1,2279 @@
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (C) 2016 Romain Dolbeau. All rights reserved. */ #include #include #if defined(__aarch64__) #include "vdev_raidz_math_aarch64_neon_common.h" #define SYN_STRIDE 4 #define ZERO_STRIDE 4 #define ZERO_DEFINE() \ GEN_X_DEFINE_0_3() \ GEN_X_DEFINE_33_36() #define ZERO_D 0, 1, 2, 3 #define COPY_STRIDE 4 #define COPY_DEFINE() \ GEN_X_DEFINE_0_3() \ GEN_X_DEFINE_33_36() #define COPY_D 0, 1, 2, 3 #define ADD_STRIDE 4 #define ADD_DEFINE() \ GEN_X_DEFINE_0_3() \ GEN_X_DEFINE_33_36() #define ADD_D 0, 1, 2, 3 #define MUL_STRIDE 4 #define MUL_DEFINE() \ GEN_X_DEFINE_0_3() \ GEN_X_DEFINE_33_36() #define MUL_D 0, 1, 2, 3 #define GEN_P_DEFINE() \ GEN_X_DEFINE_0_3() \ GEN_X_DEFINE_33_36() #define GEN_P_STRIDE 4 #define GEN_P_P 0, 1, 2, 3 #define GEN_PQ_DEFINE() \ GEN_X_DEFINE_0_3() \ GEN_X_DEFINE_4_5() \ GEN_X_DEFINE_6_7() \ GEN_X_DEFINE_16() \ GEN_X_DEFINE_17() \ GEN_X_DEFINE_33_36() #define GEN_PQ_STRIDE 4 #define GEN_PQ_D 0, 1, 2, 3 #define GEN_PQ_C 4, 5, 6, 7 #define GEN_PQR_DEFINE() \ GEN_X_DEFINE_0_3() \ GEN_X_DEFINE_4_5() \ GEN_X_DEFINE_6_7() \ GEN_X_DEFINE_16() \ GEN_X_DEFINE_17() \ GEN_X_DEFINE_33_36() #define GEN_PQR_STRIDE 4 #define GEN_PQR_D 0, 1, 2, 3 #define GEN_PQR_C 4, 5, 6, 7 #define SYN_Q_DEFINE() \ GEN_X_DEFINE_0_3() \ GEN_X_DEFINE_4_5() \ GEN_X_DEFINE_6_7() \ GEN_X_DEFINE_16() \ GEN_X_DEFINE_17() \ GEN_X_DEFINE_33_36() #define SYN_Q_STRIDE 4 #define SYN_Q_D 0, 1, 2, 3 #define SYN_Q_X 4, 5, 6, 7 #define SYN_R_DEFINE() \ GEN_X_DEFINE_0_3() \ GEN_X_DEFINE_4_5() \ GEN_X_DEFINE_6_7() \ GEN_X_DEFINE_16() \ GEN_X_DEFINE_17() \ GEN_X_DEFINE_33_36() #define SYN_R_STRIDE 4 #define SYN_R_D 0, 1, 2, 3 #define SYN_R_X 4, 5, 6, 7 #define SYN_PQ_DEFINE() \ GEN_X_DEFINE_0_3() \ GEN_X_DEFINE_4_5() \ GEN_X_DEFINE_6_7() \ GEN_X_DEFINE_16() \ GEN_X_DEFINE_17() \ GEN_X_DEFINE_33_36() #define SYN_PQ_STRIDE 4 #define SYN_PQ_D 0, 1, 2, 3 #define SYN_PQ_X 4, 5, 6, 7 #define REC_PQ_DEFINE() \ GEN_X_DEFINE_0_3() \ GEN_X_DEFINE_4_5() \ GEN_X_DEFINE_31() \ GEN_X_DEFINE_32() \ GEN_X_DEFINE_33_36() #define REC_PQ_STRIDE 2 #define REC_PQ_X 0, 1 #define REC_PQ_Y 2, 3 #define REC_PQ_T 4, 5 #define SYN_PR_DEFINE() \ GEN_X_DEFINE_0_3() \ GEN_X_DEFINE_4_5() \ GEN_X_DEFINE_6_7() \ GEN_X_DEFINE_16() \ GEN_X_DEFINE_17() \ GEN_X_DEFINE_33_36() #define SYN_PR_STRIDE 4 #define SYN_PR_D 0, 1, 2, 3 #define SYN_PR_X 4, 5, 6, 7 #define REC_PR_DEFINE() \ GEN_X_DEFINE_0_3() \ GEN_X_DEFINE_4_5() \ GEN_X_DEFINE_31() \ GEN_X_DEFINE_32() \ GEN_X_DEFINE_33_36() #define REC_PR_STRIDE 2 #define REC_PR_X 0, 1 #define REC_PR_Y 2, 3 #define REC_PR_T 4, 5 #define SYN_QR_DEFINE() \ GEN_X_DEFINE_0_3() \ GEN_X_DEFINE_4_5() \ GEN_X_DEFINE_6_7() \ GEN_X_DEFINE_16() \ GEN_X_DEFINE_17() \ GEN_X_DEFINE_33_36() #define SYN_QR_STRIDE 4 #define SYN_QR_D 0, 1, 2, 3 #define SYN_QR_X 4, 5, 6, 7 #define REC_QR_DEFINE() \ GEN_X_DEFINE_0_3() \ GEN_X_DEFINE_4_5() \ GEN_X_DEFINE_31() \ GEN_X_DEFINE_32() \ GEN_X_DEFINE_33_36() #define REC_QR_STRIDE 2 #define REC_QR_X 0, 1 #define REC_QR_Y 2, 3 #define REC_QR_T 4, 5 #define SYN_PQR_DEFINE() \ GEN_X_DEFINE_0_3() \ GEN_X_DEFINE_4_5() \ 
GEN_X_DEFINE_6_7() \ GEN_X_DEFINE_16() \ GEN_X_DEFINE_17() \ GEN_X_DEFINE_33_36() #define SYN_PQR_STRIDE 4 #define SYN_PQR_D 0, 1, 2, 3 #define SYN_PQR_X 4, 5, 6, 7 #define REC_PQR_DEFINE() \ GEN_X_DEFINE_0_3() \ GEN_X_DEFINE_4_5() \ GEN_X_DEFINE_6_7() \ GEN_X_DEFINE_8_9() \ GEN_X_DEFINE_31() \ GEN_X_DEFINE_32() \ GEN_X_DEFINE_33_36() #define REC_PQR_STRIDE 2 #define REC_PQR_X 0, 1 #define REC_PQR_Y 2, 3 #define REC_PQR_Z 4, 5 #define REC_PQR_XS 6, 7 #define REC_PQR_YS 8, 9 #include #include "vdev_raidz_math_impl.h" DEFINE_GEN_METHODS(aarch64_neon); DEFINE_REC_METHODS(aarch64_neon); static boolean_t raidz_will_aarch64_neon_work(void) { return (B_TRUE); // __arch64__ requires NEON } const raidz_impl_ops_t vdev_raidz_aarch64_neon_impl = { .init = NULL, .fini = NULL, .gen = RAIDZ_GEN_METHODS(aarch64_neon), .rec = RAIDZ_REC_METHODS(aarch64_neon), .is_supported = &raidz_will_aarch64_neon_work, .name = "aarch64_neon" }; #endif /* defined(__aarch64__) */ #if defined(__aarch64__) - +/* BEGIN CSTYLED */ const uint8_t __attribute__((aligned(256))) gf_clmul_mod_lt[4*256][16] = { { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e, 0x10, 0x12, 0x14, 0x16, 0x18, 0x1a, 0x1c, 0x1e }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x03, 0x06, 0x05, 0x0c, 0x0f, 0x0a, 0x09, 0x18, 0x1b, 0x1e, 0x1d, 0x14, 0x17, 0x12, 0x11 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x04, 0x08, 0x0c, 0x10, 0x14, 0x18, 0x1c, 0x20, 0x24, 0x28, 0x2c, 0x30, 0x34, 0x38, 0x3c }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x05, 0x0a, 0x0f, 0x14, 0x11, 0x1e, 0x1b, 0x28, 0x2d, 0x22, 0x27, 0x3c, 0x39, 0x36, 0x33 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x27, 0x27, 0x27, 
0x27, 0x3a, 0x3a, 0x3a, 0x3a }, { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x06, 0x0c, 0x0a, 0x18, 0x1e, 0x14, 0x12, 0x30, 0x36, 0x3c, 0x3a, 0x28, 0x2e, 0x24, 0x22 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x07, 0x0e, 0x09, 0x1c, 0x1b, 0x12, 0x15, 0x38, 0x3f, 0x36, 0x31, 0x24, 0x23, 0x2a, 0x2d }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x08, 0x10, 0x18, 0x20, 0x28, 0x30, 0x38, 0x40, 0x48, 0x50, 0x58, 0x60, 0x68, 0x70, 0x78 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x09, 0x12, 0x1b, 0x24, 0x2d, 0x36, 0x3f, 0x48, 0x41, 0x5a, 0x53, 0x6c, 0x65, 0x7e, 0x77 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x0a, 0x14, 0x1e, 0x28, 0x22, 0x3c, 0x36, 0x50, 0x5a, 0x44, 0x4e, 0x78, 0x72, 0x6c, 0x66 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x0b, 0x16, 0x1d, 0x2c, 0x27, 0x3a, 0x31, 0x58, 0x53, 0x4e, 0x45, 0x74, 0x7f, 0x62, 0x69 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x0c, 0x18, 0x14, 0x30, 0x3c, 0x28, 0x24, 0x60, 0x6c, 0x78, 0x74, 0x50, 0x5c, 0x48, 0x44 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x0d, 0x1a, 0x17, 0x34, 0x39, 0x2e, 0x23, 0x68, 0x65, 0x72, 0x7f, 0x5c, 0x51, 0x46, 0x4b }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x0e, 0x1c, 0x12, 0x38, 0x36, 0x24, 0x2a, 0x70, 0x7e, 0x6c, 0x62, 0x48, 0x46, 0x54, 0x5a }, { 0x00, 0x00, 0x1d, 
0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x0f, 0x1e, 0x11, 0x3c, 0x33, 0x22, 0x2d, 0x78, 0x77, 0x66, 0x69, 0x44, 0x4b, 0x5a, 0x55 }, { 0x00, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53, 0xe8, 0xf5, 0xd2, 0xcf, 0x9c, 0x81, 0xa6, 0xbb }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, { 0x00, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53, 0xe8, 0xf5, 0xd2, 0xcf, 0x9c, 0x81, 0xa6, 0xbb }, { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff }, { 0x00, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53, 0xf5, 0xe8, 0xcf, 0xd2, 0x81, 0x9c, 0xbb, 0xa6 }, { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x12, 0x24, 0x36, 0x48, 0x5a, 0x6c, 0x7e, 0x90, 0x82, 0xb4, 0xa6, 0xd8, 0xca, 0xfc, 0xee }, { 0x00, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53, 0xf5, 0xe8, 0xcf, 0xd2, 0x81, 0x9c, 0xbb, 0xa6 }, { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x13, 0x26, 0x35, 0x4c, 0x5f, 0x6a, 0x79, 0x98, 0x8b, 0xbe, 0xad, 0xd4, 0xc7, 0xf2, 0xe1 }, { 0x00, 0x1d, 0x3a, 0x27, 0x69, 0x74, 0x53, 0x4e, 0xd2, 0xcf, 0xe8, 0xf5, 0xbb, 0xa6, 0x81, 0x9c }, { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x14, 0x28, 0x3c, 0x50, 0x44, 0x78, 0x6c, 0xa0, 0xb4, 0x88, 0x9c, 0xf0, 0xe4, 0xd8, 0xcc }, { 0x00, 0x1d, 0x3a, 0x27, 0x69, 0x74, 0x53, 0x4e, 0xd2, 0xcf, 0xe8, 0xf5, 0xbb, 0xa6, 0x81, 0x9c }, { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x15, 0x2a, 0x3f, 0x54, 0x41, 0x7e, 0x6b, 0xa8, 0xbd, 0x82, 0x97, 0xfc, 0xe9, 0xd6, 0xc3 }, { 0x00, 0x1d, 0x3a, 0x27, 0x69, 0x74, 0x53, 0x4e, 0xcf, 0xd2, 0xf5, 0xe8, 0xa6, 0xbb, 0x9c, 0x81 }, { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x16, 0x2c, 0x3a, 0x58, 0x4e, 0x74, 0x62, 0xb0, 0xa6, 0x9c, 0x8a, 0xe8, 0xfe, 0xc4, 0xd2 }, { 0x00, 0x1d, 0x3a, 0x27, 0x69, 0x74, 0x53, 0x4e, 0xcf, 0xd2, 0xf5, 0xe8, 0xa6, 0xbb, 0x9c, 0x81 }, { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x17, 0x2e, 0x39, 0x5c, 0x4b, 0x72, 0x65, 0xb8, 0xaf, 0x96, 0x81, 
0xe4, 0xf3, 0xca, 0xdd }, { 0x00, 0x1d, 0x27, 0x3a, 0x4e, 0x53, 0x69, 0x74, 0x9c, 0x81, 0xbb, 0xa6, 0xd2, 0xcf, 0xf5, 0xe8 }, { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x18, 0x30, 0x28, 0x60, 0x78, 0x50, 0x48, 0xc0, 0xd8, 0xf0, 0xe8, 0xa0, 0xb8, 0x90, 0x88 }, { 0x00, 0x1d, 0x27, 0x3a, 0x4e, 0x53, 0x69, 0x74, 0x9c, 0x81, 0xbb, 0xa6, 0xd2, 0xcf, 0xf5, 0xe8 }, { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x19, 0x32, 0x2b, 0x64, 0x7d, 0x56, 0x4f, 0xc8, 0xd1, 0xfa, 0xe3, 0xac, 0xb5, 0x9e, 0x87 }, { 0x00, 0x1d, 0x27, 0x3a, 0x4e, 0x53, 0x69, 0x74, 0x81, 0x9c, 0xa6, 0xbb, 0xcf, 0xd2, 0xe8, 0xf5 }, { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x1a, 0x34, 0x2e, 0x68, 0x72, 0x5c, 0x46, 0xd0, 0xca, 0xe4, 0xfe, 0xb8, 0xa2, 0x8c, 0x96 }, { 0x00, 0x1d, 0x27, 0x3a, 0x4e, 0x53, 0x69, 0x74, 0x81, 0x9c, 0xa6, 0xbb, 0xcf, 0xd2, 0xe8, 0xf5 }, { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x1b, 0x36, 0x2d, 0x6c, 0x77, 0x5a, 0x41, 0xd8, 0xc3, 0xee, 0xf5, 0xb4, 0xaf, 0x82, 0x99 }, { 0x00, 0x1d, 0x27, 0x3a, 0x53, 0x4e, 0x74, 0x69, 0xa6, 0xbb, 0x81, 0x9c, 0xf5, 0xe8, 0xd2, 0xcf }, { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x1c, 0x38, 0x24, 0x70, 0x6c, 0x48, 0x54, 0xe0, 0xfc, 0xd8, 0xc4, 0x90, 0x8c, 0xa8, 0xb4 }, { 0x00, 0x1d, 0x27, 0x3a, 0x53, 0x4e, 0x74, 0x69, 0xa6, 0xbb, 0x81, 0x9c, 0xf5, 0xe8, 0xd2, 0xcf }, { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53, 0xe8, 0xf5, 0xd2, 0xcf, 0x9c, 0x81, 0xa6, 0xbb }, { 0x00, 0x1d, 0x27, 0x3a, 0x53, 0x4e, 0x74, 0x69, 0xbb, 0xa6, 0x9c, 0x81, 0xe8, 0xf5, 0xcf, 0xd2 }, { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x1e, 0x3c, 0x22, 0x78, 0x66, 0x44, 0x5a, 0xf0, 0xee, 0xcc, 0xd2, 0x88, 0x96, 0xb4, 0xaa }, { 0x00, 0x1d, 0x27, 0x3a, 0x53, 0x4e, 0x74, 0x69, 0xbb, 0xa6, 0x9c, 0x81, 0xe8, 0xf5, 0xcf, 0xd2 }, { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x1f, 0x3e, 0x21, 0x7c, 0x63, 0x42, 0x5d, 0xf8, 0xe7, 0xc6, 0xd9, 0x84, 0x9b, 0xba, 0xa5 }, { 0x00, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6, 0xcd, 0xf7, 0xb9, 0x83, 0x25, 0x1f, 0x51, 0x6b }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, { 0x00, 0x20, 0x40, 0x60, 
0x80, 0xa0, 0xc0, 0xe0, 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, { 0x00, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6, 0xcd, 0xf7, 0xb9, 0x83, 0x25, 0x1f, 0x51, 0x6b }, { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, { 0x00, 0x21, 0x42, 0x63, 0x84, 0xa5, 0xc6, 0xe7, 0x08, 0x29, 0x4a, 0x6b, 0x8c, 0xad, 0xce, 0xef }, { 0x00, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6, 0xd0, 0xea, 0xa4, 0x9e, 0x38, 0x02, 0x4c, 0x76 }, { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, { 0x00, 0x22, 0x44, 0x66, 0x88, 0xaa, 0xcc, 0xee, 0x10, 0x32, 0x54, 0x76, 0x98, 0xba, 0xdc, 0xfe }, { 0x00, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6, 0xd0, 0xea, 0xa4, 0x9e, 0x38, 0x02, 0x4c, 0x76 }, { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, { 0x00, 0x23, 0x46, 0x65, 0x8c, 0xaf, 0xca, 0xe9, 0x18, 0x3b, 0x5e, 0x7d, 0x94, 0xb7, 0xd2, 0xf1 }, { 0x00, 0x3a, 0x74, 0x4e, 0xf5, 0xcf, 0x81, 0xbb, 0xf7, 0xcd, 0x83, 0xb9, 0x02, 0x38, 0x76, 0x4c }, { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, { 0x00, 0x24, 0x48, 0x6c, 0x90, 0xb4, 0xd8, 0xfc, 0x20, 0x04, 0x68, 0x4c, 0xb0, 0x94, 0xf8, 0xdc }, { 0x00, 0x3a, 0x74, 0x4e, 0xf5, 0xcf, 0x81, 0xbb, 0xf7, 0xcd, 0x83, 0xb9, 0x02, 0x38, 0x76, 0x4c }, { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, { 0x00, 0x25, 0x4a, 0x6f, 0x94, 0xb1, 0xde, 0xfb, 0x28, 0x0d, 0x62, 0x47, 0xbc, 0x99, 0xf6, 0xd3 }, { 0x00, 0x3a, 0x74, 0x4e, 0xf5, 0xcf, 0x81, 0xbb, 0xea, 0xd0, 0x9e, 0xa4, 0x1f, 0x25, 0x6b, 0x51 }, { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, { 0x00, 0x26, 0x4c, 0x6a, 0x98, 0xbe, 0xd4, 0xf2, 0x30, 0x16, 0x7c, 0x5a, 0xa8, 0x8e, 0xe4, 0xc2 }, { 0x00, 0x3a, 0x74, 0x4e, 0xf5, 0xcf, 0x81, 0xbb, 0xea, 0xd0, 0x9e, 0xa4, 0x1f, 0x25, 0x6b, 0x51 }, { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, { 0x00, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5, 0x38, 0x1f, 0x76, 0x51, 0xa4, 0x83, 0xea, 0xcd }, { 0x00, 0x3a, 0x69, 0x53, 0xd2, 0xe8, 0xbb, 0x81, 0xb9, 0x83, 0xd0, 0xea, 0x6b, 0x51, 0x02, 0x38 }, { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, { 0x00, 0x28, 0x50, 0x78, 0xa0, 0x88, 0xf0, 0xd8, 0x40, 0x68, 0x10, 0x38, 0xe0, 0xc8, 0xb0, 0x98 }, { 0x00, 0x3a, 0x69, 0x53, 0xd2, 0xe8, 0xbb, 0x81, 0xb9, 0x83, 0xd0, 0xea, 0x6b, 0x51, 0x02, 0x38 }, { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 
0x1d, 0x1d, 0x1d }, { 0x00, 0x29, 0x52, 0x7b, 0xa4, 0x8d, 0xf6, 0xdf, 0x48, 0x61, 0x1a, 0x33, 0xec, 0xc5, 0xbe, 0x97 }, { 0x00, 0x3a, 0x69, 0x53, 0xd2, 0xe8, 0xbb, 0x81, 0xa4, 0x9e, 0xcd, 0xf7, 0x76, 0x4c, 0x1f, 0x25 }, { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, { 0x00, 0x2a, 0x54, 0x7e, 0xa8, 0x82, 0xfc, 0xd6, 0x50, 0x7a, 0x04, 0x2e, 0xf8, 0xd2, 0xac, 0x86 }, { 0x00, 0x3a, 0x69, 0x53, 0xd2, 0xe8, 0xbb, 0x81, 0xa4, 0x9e, 0xcd, 0xf7, 0x76, 0x4c, 0x1f, 0x25 }, { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, { 0x00, 0x2b, 0x56, 0x7d, 0xac, 0x87, 0xfa, 0xd1, 0x58, 0x73, 0x0e, 0x25, 0xf4, 0xdf, 0xa2, 0x89 }, { 0x00, 0x3a, 0x69, 0x53, 0xcf, 0xf5, 0xa6, 0x9c, 0x83, 0xb9, 0xea, 0xd0, 0x4c, 0x76, 0x25, 0x1f }, { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, { 0x00, 0x2c, 0x58, 0x74, 0xb0, 0x9c, 0xe8, 0xc4, 0x60, 0x4c, 0x38, 0x14, 0xd0, 0xfc, 0x88, 0xa4 }, { 0x00, 0x3a, 0x69, 0x53, 0xcf, 0xf5, 0xa6, 0x9c, 0x83, 0xb9, 0xea, 0xd0, 0x4c, 0x76, 0x25, 0x1f }, { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, { 0x00, 0x2d, 0x5a, 0x77, 0xb4, 0x99, 0xee, 0xc3, 0x68, 0x45, 0x32, 0x1f, 0xdc, 0xf1, 0x86, 0xab }, { 0x00, 0x3a, 0x69, 0x53, 0xcf, 0xf5, 0xa6, 0x9c, 0x9e, 0xa4, 0xf7, 0xcd, 0x51, 0x6b, 0x38, 0x02 }, { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, { 0x00, 0x2e, 0x5c, 0x72, 0xb8, 0x96, 0xe4, 0xca, 0x70, 0x5e, 0x2c, 0x02, 0xc8, 0xe6, 0x94, 0xba }, { 0x00, 0x3a, 0x69, 0x53, 0xcf, 0xf5, 0xa6, 0x9c, 0x9e, 0xa4, 0xf7, 0xcd, 0x51, 0x6b, 0x38, 0x02 }, { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, { 0x00, 0x2f, 0x5e, 0x71, 0xbc, 0x93, 0xe2, 0xcd, 0x78, 0x57, 0x26, 0x09, 0xc4, 0xeb, 0x9a, 0xb5 }, { 0x00, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5, 0x25, 0x02, 0x6b, 0x4c, 0xb9, 0x9e, 0xf7, 0xd0 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, { 0x00, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5, 0x25, 0x02, 0x6b, 0x4c, 0xb9, 0x9e, 0xf7, 0xd0 }, { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, { 0x00, 0x31, 0x62, 0x53, 0xc4, 0xf5, 0xa6, 0x97, 0x88, 0xb9, 0xea, 0xdb, 0x4c, 0x7d, 0x2e, 0x1f }, { 0x00, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5, 0x38, 0x1f, 0x76, 0x51, 0xa4, 0x83, 0xea, 0xcd }, { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, { 0x00, 0x32, 0x64, 0x56, 0xc8, 0xfa, 0xac, 0x9e, 0x90, 0xa2, 0xf4, 0xc6, 0x58, 0x6a, 0x3c, 0x0e }, { 0x00, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5, 0x38, 0x1f, 0x76, 0x51, 0xa4, 0x83, 0xea, 0xcd }, { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, { 0x00, 0x33, 0x66, 0x55, 0xcc, 0xff, 0xaa, 0x99, 0x98, 0xab, 0xfe, 0xcd, 0x54, 0x67, 0x32, 0x01 }, { 0x00, 0x27, 0x4e, 0x69, 0x81, 0xa6, 0xcf, 0xe8, 0x1f, 0x38, 0x51, 0x76, 0x9e, 0xb9, 0xd0, 0xf7 }, { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, { 0x00, 0x34, 0x68, 0x5c, 0xd0, 0xe4, 0xb8, 0x8c, 0xa0, 0x94, 0xc8, 0xfc, 0x70, 0x44, 0x18, 0x2c }, { 0x00, 0x27, 0x4e, 0x69, 0x81, 0xa6, 0xcf, 0xe8, 0x1f, 0x38, 0x51, 0x76, 0x9e, 0xb9, 0xd0, 0xf7 }, { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, { 0x00, 0x35, 0x6a, 0x5f, 0xd4, 0xe1, 0xbe, 0x8b, 0xa8, 0x9d, 0xc2, 0xf7, 0x7c, 0x49, 0x16, 0x23 }, { 0x00, 0x27, 0x4e, 0x69, 0x81, 0xa6, 0xcf, 0xe8, 0x02, 0x25, 0x4c, 0x6b, 0x83, 0xa4, 0xcd, 0xea }, { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, { 0x00, 0x36, 0x6c, 0x5a, 0xd8, 0xee, 0xb4, 0x82, 0xb0, 0x86, 0xdc, 0xea, 0x68, 0x5e, 0x04, 0x32 }, { 0x00, 0x27, 0x4e, 0x69, 0x81, 0xa6, 0xcf, 0xe8, 0x02, 0x25, 0x4c, 0x6b, 0x83, 0xa4, 0xcd, 0xea }, { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, { 0x00, 0x37, 0x6e, 0x59, 0xdc, 0xeb, 0xb2, 0x85, 0xb8, 0x8f, 0xd6, 0xe1, 0x64, 0x53, 0x0a, 0x3d }, { 0x00, 0x27, 0x53, 0x74, 0xa6, 0x81, 0xf5, 0xd2, 0x51, 0x76, 0x02, 0x25, 0xf7, 0xd0, 0xa4, 0x83 }, { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, { 0x00, 0x38, 0x70, 0x48, 0xe0, 0xd8, 0x90, 0xa8, 0xc0, 0xf8, 0xb0, 0x88, 0x20, 0x18, 0x50, 0x68 }, { 0x00, 0x27, 0x53, 0x74, 0xa6, 0x81, 0xf5, 0xd2, 0x51, 0x76, 0x02, 0x25, 0xf7, 0xd0, 0xa4, 0x83 }, { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, { 0x00, 0x39, 0x72, 0x4b, 0xe4, 0xdd, 0x96, 0xaf, 0xc8, 0xf1, 0xba, 0x83, 0x2c, 0x15, 0x5e, 0x67 }, { 0x00, 0x27, 0x53, 0x74, 0xa6, 0x81, 0xf5, 0xd2, 0x4c, 0x6b, 0x1f, 0x38, 0xea, 0xcd, 0xb9, 0x9e }, { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, { 0x00, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6, 0xd0, 0xea, 0xa4, 0x9e, 0x38, 0x02, 0x4c, 0x76 }, { 0x00, 0x27, 0x53, 0x74, 0xa6, 0x81, 0xf5, 0xd2, 0x4c, 0x6b, 0x1f, 0x38, 0xea, 0xcd, 0xb9, 0x9e }, { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 
0x20, 0x90 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, { 0x00, 0x3b, 0x76, 0x4d, 0xec, 0xd7, 0x9a, 0xa1, 0xd8, 0xe3, 0xae, 0x95, 0x34, 0x0f, 0x42, 0x79 }, { 0x00, 0x27, 0x53, 0x74, 0xbb, 0x9c, 0xe8, 0xcf, 0x6b, 0x4c, 0x38, 0x1f, 0xd0, 0xf7, 0x83, 0xa4 }, { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, { 0x00, 0x3c, 0x78, 0x44, 0xf0, 0xcc, 0x88, 0xb4, 0xe0, 0xdc, 0x98, 0xa4, 0x10, 0x2c, 0x68, 0x54 }, { 0x00, 0x27, 0x53, 0x74, 0xbb, 0x9c, 0xe8, 0xcf, 0x6b, 0x4c, 0x38, 0x1f, 0xd0, 0xf7, 0x83, 0xa4 }, { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, { 0x00, 0x3d, 0x7a, 0x47, 0xf4, 0xc9, 0x8e, 0xb3, 0xe8, 0xd5, 0x92, 0xaf, 0x1c, 0x21, 0x66, 0x5b }, { 0x00, 0x27, 0x53, 0x74, 0xbb, 0x9c, 0xe8, 0xcf, 0x76, 0x51, 0x25, 0x02, 0xcd, 0xea, 0x9e, 0xb9 }, { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, { 0x00, 0x3e, 0x7c, 0x42, 0xf8, 0xc6, 0x84, 0xba, 0xf0, 0xce, 0x8c, 0xb2, 0x08, 0x36, 0x74, 0x4a }, { 0x00, 0x27, 0x53, 0x74, 0xbb, 0x9c, 0xe8, 0xcf, 0x76, 0x51, 0x25, 0x02, 0xcd, 0xea, 0x9e, 0xb9 }, { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, { 0x00, 0x3f, 0x7e, 0x41, 0xfc, 0xc3, 0x82, 0xbd, 0xf8, 0xc7, 0x86, 0xb9, 0x04, 0x3b, 0x7a, 0x45 }, { 0x00, 0x74, 0xe8, 0x9c, 0xcd, 0xb9, 0x25, 0x51, 0x87, 0xf3, 0x6f, 0x1b, 0x4a, 0x3e, 0xa2, 0xd6 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, { 0x00, 0x74, 0xe8, 0x9c, 0xcd, 0xb9, 0x25, 0x51, 0x87, 0xf3, 0x6f, 0x1b, 0x4a, 0x3e, 0xa2, 0xd6 }, { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, { 0x00, 0x41, 0x82, 0xc3, 0x04, 0x45, 0x86, 0xc7, 0x08, 0x49, 0x8a, 0xcb, 0x0c, 0x4d, 0x8e, 0xcf }, { 0x00, 0x74, 0xe8, 0x9c, 0xcd, 0xb9, 0x25, 0x51, 0x9a, 0xee, 0x72, 0x06, 0x57, 0x23, 0xbf, 0xcb }, { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, { 0x00, 0x42, 0x84, 0xc6, 0x08, 0x4a, 0x8c, 0xce, 0x10, 0x52, 0x94, 0xd6, 0x18, 0x5a, 0x9c, 0xde }, { 0x00, 0x74, 0xe8, 0x9c, 0xcd, 0xb9, 0x25, 0x51, 0x9a, 0xee, 0x72, 0x06, 0x57, 0x23, 0xbf, 0xcb }, { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, { 0x00, 0x43, 0x86, 0xc5, 0x0c, 0x4f, 0x8a, 0xc9, 0x18, 0x5b, 0x9e, 0xdd, 0x14, 0x57, 0x92, 0xd1 }, { 0x00, 0x74, 0xe8, 0x9c, 0xd0, 0xa4, 0x38, 0x4c, 0xbd, 0xc9, 0x55, 0x21, 0x6d, 0x19, 0x85, 0xf1 }, { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 
0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, { 0x00, 0x44, 0x88, 0xcc, 0x10, 0x54, 0x98, 0xdc, 0x20, 0x64, 0xa8, 0xec, 0x30, 0x74, 0xb8, 0xfc }, { 0x00, 0x74, 0xe8, 0x9c, 0xd0, 0xa4, 0x38, 0x4c, 0xbd, 0xc9, 0x55, 0x21, 0x6d, 0x19, 0x85, 0xf1 }, { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, { 0x00, 0x45, 0x8a, 0xcf, 0x14, 0x51, 0x9e, 0xdb, 0x28, 0x6d, 0xa2, 0xe7, 0x3c, 0x79, 0xb6, 0xf3 }, { 0x00, 0x74, 0xe8, 0x9c, 0xd0, 0xa4, 0x38, 0x4c, 0xa0, 0xd4, 0x48, 0x3c, 0x70, 0x04, 0x98, 0xec }, { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, { 0x00, 0x46, 0x8c, 0xca, 0x18, 0x5e, 0x94, 0xd2, 0x30, 0x76, 0xbc, 0xfa, 0x28, 0x6e, 0xa4, 0xe2 }, { 0x00, 0x74, 0xe8, 0x9c, 0xd0, 0xa4, 0x38, 0x4c, 0xa0, 0xd4, 0x48, 0x3c, 0x70, 0x04, 0x98, 0xec }, { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, { 0x00, 0x47, 0x8e, 0xc9, 0x1c, 0x5b, 0x92, 0xd5, 0x38, 0x7f, 0xb6, 0xf1, 0x24, 0x63, 0xaa, 0xed }, { 0x00, 0x74, 0xf5, 0x81, 0xf7, 0x83, 0x02, 0x76, 0xf3, 0x87, 0x06, 0x72, 0x04, 0x70, 0xf1, 0x85 }, { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, { 0x00, 0x48, 0x90, 0xd8, 0x20, 0x68, 0xb0, 0xf8, 0x40, 0x08, 0xd0, 0x98, 0x60, 0x28, 0xf0, 0xb8 }, { 0x00, 0x74, 0xf5, 0x81, 0xf7, 0x83, 0x02, 0x76, 0xf3, 0x87, 0x06, 0x72, 0x04, 0x70, 0xf1, 0x85 }, { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, { 0x00, 0x49, 0x92, 0xdb, 0x24, 0x6d, 0xb6, 0xff, 0x48, 0x01, 0xda, 0x93, 0x6c, 0x25, 0xfe, 0xb7 }, { 0x00, 0x74, 0xf5, 0x81, 0xf7, 0x83, 0x02, 0x76, 0xee, 0x9a, 0x1b, 0x6f, 0x19, 0x6d, 0xec, 0x98 }, { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, { 0x00, 0x4a, 0x94, 0xde, 0x28, 0x62, 0xbc, 0xf6, 0x50, 0x1a, 0xc4, 0x8e, 0x78, 0x32, 0xec, 0xa6 }, { 0x00, 0x74, 0xf5, 0x81, 0xf7, 0x83, 0x02, 0x76, 0xee, 0x9a, 0x1b, 0x6f, 0x19, 0x6d, 0xec, 0x98 }, { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, { 0x00, 0x4b, 0x96, 0xdd, 0x2c, 0x67, 0xba, 0xf1, 0x58, 0x13, 0xce, 0x85, 0x74, 0x3f, 0xe2, 0xa9 }, { 0x00, 0x74, 0xf5, 0x81, 0xea, 0x9e, 0x1f, 0x6b, 0xc9, 0xbd, 0x3c, 0x48, 0x23, 0x57, 0xd6, 0xa2 }, { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, { 0x00, 0x4c, 0x98, 0xd4, 0x30, 0x7c, 0xa8, 0xe4, 0x60, 0x2c, 0xf8, 0xb4, 0x50, 0x1c, 0xc8, 0x84 }, { 0x00, 0x74, 0xf5, 0x81, 0xea, 0x9e, 0x1f, 0x6b, 0xc9, 0xbd, 0x3c, 0x48, 0x23, 0x57, 0xd6, 
0xa2 }, { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, { 0x00, 0x4d, 0x9a, 0xd7, 0x34, 0x79, 0xae, 0xe3, 0x68, 0x25, 0xf2, 0xbf, 0x5c, 0x11, 0xc6, 0x8b }, { 0x00, 0x74, 0xf5, 0x81, 0xea, 0x9e, 0x1f, 0x6b, 0xd4, 0xa0, 0x21, 0x55, 0x3e, 0x4a, 0xcb, 0xbf }, { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, { 0x00, 0x4e, 0x9c, 0xd2, 0x38, 0x76, 0xa4, 0xea, 0x70, 0x3e, 0xec, 0xa2, 0x48, 0x06, 0xd4, 0x9a }, { 0x00, 0x74, 0xf5, 0x81, 0xea, 0x9e, 0x1f, 0x6b, 0xd4, 0xa0, 0x21, 0x55, 0x3e, 0x4a, 0xcb, 0xbf }, { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, { 0x00, 0x4f, 0x9e, 0xd1, 0x3c, 0x73, 0xa2, 0xed, 0x78, 0x37, 0xe6, 0xa9, 0x44, 0x0b, 0xda, 0x95 }, { 0x00, 0x69, 0xd2, 0xbb, 0xb9, 0xd0, 0x6b, 0x02, 0x6f, 0x06, 0xbd, 0xd4, 0xd6, 0xbf, 0x04, 0x6d }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, { 0x00, 0x69, 0xd2, 0xbb, 0xb9, 0xd0, 0x6b, 0x02, 0x6f, 0x06, 0xbd, 0xd4, 0xd6, 0xbf, 0x04, 0x6d }, { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, { 0x00, 0x51, 0xa2, 0xf3, 0x44, 0x15, 0xe6, 0xb7, 0x88, 0xd9, 0x2a, 0x7b, 0xcc, 0x9d, 0x6e, 0x3f }, { 0x00, 0x69, 0xd2, 0xbb, 0xb9, 0xd0, 0x6b, 0x02, 0x72, 0x1b, 0xa0, 0xc9, 0xcb, 0xa2, 0x19, 0x70 }, { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, { 0x00, 0x52, 0xa4, 0xf6, 0x48, 0x1a, 0xec, 0xbe, 0x90, 0xc2, 0x34, 0x66, 0xd8, 0x8a, 0x7c, 0x2e }, { 0x00, 0x69, 0xd2, 0xbb, 0xb9, 0xd0, 0x6b, 0x02, 0x72, 0x1b, 0xa0, 0xc9, 0xcb, 0xa2, 0x19, 0x70 }, { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, { 0x00, 0x53, 0xa6, 0xf5, 0x4c, 0x1f, 0xea, 0xb9, 0x98, 0xcb, 0x3e, 0x6d, 0xd4, 0x87, 0x72, 0x21 }, { 0x00, 0x69, 0xd2, 0xbb, 0xa4, 0xcd, 0x76, 0x1f, 0x55, 0x3c, 0x87, 0xee, 0xf1, 0x98, 0x23, 0x4a }, { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, { 0x00, 0x54, 0xa8, 0xfc, 0x50, 0x04, 0xf8, 0xac, 0xa0, 0xf4, 0x08, 0x5c, 0xf0, 0xa4, 0x58, 0x0c }, { 0x00, 0x69, 0xd2, 0xbb, 0xa4, 0xcd, 0x76, 0x1f, 0x55, 0x3c, 0x87, 0xee, 0xf1, 0x98, 0x23, 0x4a }, { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, { 0x00, 0x55, 0xaa, 0xff, 0x54, 0x01, 0xfe, 0xab, 0xa8, 0xfd, 0x02, 0x57, 0xfc, 0xa9, 0x56, 0x03 }, { 0x00, 0x69, 0xd2, 0xbb, 0xa4, 0xcd, 0x76, 
0x1f, 0x48, 0x21, 0x9a, 0xf3, 0xec, 0x85, 0x3e, 0x57 }, { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, { 0x00, 0x56, 0xac, 0xfa, 0x58, 0x0e, 0xf4, 0xa2, 0xb0, 0xe6, 0x1c, 0x4a, 0xe8, 0xbe, 0x44, 0x12 }, { 0x00, 0x69, 0xd2, 0xbb, 0xa4, 0xcd, 0x76, 0x1f, 0x48, 0x21, 0x9a, 0xf3, 0xec, 0x85, 0x3e, 0x57 }, { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, { 0x00, 0x57, 0xae, 0xf9, 0x5c, 0x0b, 0xf2, 0xa5, 0xb8, 0xef, 0x16, 0x41, 0xe4, 0xb3, 0x4a, 0x1d }, { 0x00, 0x69, 0xcf, 0xa6, 0x83, 0xea, 0x4c, 0x25, 0x1b, 0x72, 0xd4, 0xbd, 0x98, 0xf1, 0x57, 0x3e }, { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, { 0x00, 0x58, 0xb0, 0xe8, 0x60, 0x38, 0xd0, 0x88, 0xc0, 0x98, 0x70, 0x28, 0xa0, 0xf8, 0x10, 0x48 }, { 0x00, 0x69, 0xcf, 0xa6, 0x83, 0xea, 0x4c, 0x25, 0x1b, 0x72, 0xd4, 0xbd, 0x98, 0xf1, 0x57, 0x3e }, { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, { 0x00, 0x59, 0xb2, 0xeb, 0x64, 0x3d, 0xd6, 0x8f, 0xc8, 0x91, 0x7a, 0x23, 0xac, 0xf5, 0x1e, 0x47 }, { 0x00, 0x69, 0xcf, 0xa6, 0x83, 0xea, 0x4c, 0x25, 0x06, 0x6f, 0xc9, 0xa0, 0x85, 0xec, 0x4a, 0x23 }, { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, { 0x00, 0x5a, 0xb4, 0xee, 0x68, 0x32, 0xdc, 0x86, 0xd0, 0x8a, 0x64, 0x3e, 0xb8, 0xe2, 0x0c, 0x56 }, { 0x00, 0x69, 0xcf, 0xa6, 0x83, 0xea, 0x4c, 0x25, 0x06, 0x6f, 0xc9, 0xa0, 0x85, 0xec, 0x4a, 0x23 }, { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, { 0x00, 0x5b, 0xb6, 0xed, 0x6c, 0x37, 0xda, 0x81, 0xd8, 0x83, 0x6e, 0x35, 0xb4, 0xef, 0x02, 0x59 }, { 0x00, 0x69, 0xcf, 0xa6, 0x9e, 0xf7, 0x51, 0x38, 0x21, 0x48, 0xee, 0x87, 0xbf, 0xd6, 0x70, 0x19 }, { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, { 0x00, 0x5c, 0xb8, 0xe4, 0x70, 0x2c, 0xc8, 0x94, 0xe0, 0xbc, 0x58, 0x04, 0x90, 0xcc, 0x28, 0x74 }, { 0x00, 0x69, 0xcf, 0xa6, 0x9e, 0xf7, 0x51, 0x38, 0x21, 0x48, 0xee, 0x87, 0xbf, 0xd6, 0x70, 0x19 }, { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, { 0x00, 0x5d, 0xba, 0xe7, 0x74, 0x29, 0xce, 0x93, 0xe8, 0xb5, 0x52, 0x0f, 0x9c, 0xc1, 0x26, 0x7b }, { 0x00, 0x69, 0xcf, 0xa6, 0x9e, 0xf7, 0x51, 0x38, 0x3c, 0x55, 0xf3, 0x9a, 0xa2, 0xcb, 0x6d, 0x04 }, { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, { 0x00, 0x5e, 0xbc, 0xe2, 0x78, 0x26, 0xc4, 0x9a, 0xf0, 0xae, 0x4c, 0x12, 0x88, 0xd6, 0x34, 0x6a 
}, { 0x00, 0x69, 0xcf, 0xa6, 0x9e, 0xf7, 0x51, 0x38, 0x3c, 0x55, 0xf3, 0x9a, 0xa2, 0xcb, 0x6d, 0x04 }, { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, { 0x00, 0x5f, 0xbe, 0xe1, 0x7c, 0x23, 0xc2, 0x9d, 0xf8, 0xa7, 0x46, 0x19, 0x84, 0xdb, 0x3a, 0x65 }, { 0x00, 0x4e, 0x9c, 0xd2, 0x25, 0x6b, 0xb9, 0xf7, 0x4a, 0x04, 0xd6, 0x98, 0x6f, 0x21, 0xf3, 0xbd }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, { 0x00, 0x4e, 0x9c, 0xd2, 0x25, 0x6b, 0xb9, 0xf7, 0x4a, 0x04, 0xd6, 0x98, 0x6f, 0x21, 0xf3, 0xbd }, { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, { 0x00, 0x61, 0xc2, 0xa3, 0x84, 0xe5, 0x46, 0x27, 0x08, 0x69, 0xca, 0xab, 0x8c, 0xed, 0x4e, 0x2f }, { 0x00, 0x4e, 0x9c, 0xd2, 0x25, 0x6b, 0xb9, 0xf7, 0x57, 0x19, 0xcb, 0x85, 0x72, 0x3c, 0xee, 0xa0 }, { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, { 0x00, 0x62, 0xc4, 0xa6, 0x88, 0xea, 0x4c, 0x2e, 0x10, 0x72, 0xd4, 0xb6, 0x98, 0xfa, 0x5c, 0x3e }, { 0x00, 0x4e, 0x9c, 0xd2, 0x25, 0x6b, 0xb9, 0xf7, 0x57, 0x19, 0xcb, 0x85, 0x72, 0x3c, 0xee, 0xa0 }, { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, { 0x00, 0x63, 0xc6, 0xa5, 0x8c, 0xef, 0x4a, 0x29, 0x18, 0x7b, 0xde, 0xbd, 0x94, 0xf7, 0x52, 0x31 }, { 0x00, 0x4e, 0x9c, 0xd2, 0x38, 0x76, 0xa4, 0xea, 0x70, 0x3e, 0xec, 0xa2, 0x48, 0x06, 0xd4, 0x9a }, { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, { 0x00, 0x64, 0xc8, 0xac, 0x90, 0xf4, 0x58, 0x3c, 0x20, 0x44, 0xe8, 0x8c, 0xb0, 0xd4, 0x78, 0x1c }, { 0x00, 0x4e, 0x9c, 0xd2, 0x38, 0x76, 0xa4, 0xea, 0x70, 0x3e, 0xec, 0xa2, 0x48, 0x06, 0xd4, 0x9a }, { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, { 0x00, 0x65, 0xca, 0xaf, 0x94, 0xf1, 0x5e, 0x3b, 0x28, 0x4d, 0xe2, 0x87, 0xbc, 0xd9, 0x76, 0x13 }, { 0x00, 0x4e, 0x9c, 0xd2, 0x38, 0x76, 0xa4, 0xea, 0x6d, 0x23, 0xf1, 0xbf, 0x55, 0x1b, 0xc9, 0x87 }, { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, { 0x00, 0x66, 0xcc, 0xaa, 0x98, 0xfe, 0x54, 0x32, 0x30, 0x56, 0xfc, 0x9a, 0xa8, 0xce, 0x64, 0x02 }, { 0x00, 0x4e, 0x9c, 0xd2, 0x38, 0x76, 0xa4, 0xea, 0x6d, 0x23, 0xf1, 0xbf, 0x55, 0x1b, 0xc9, 0x87 }, { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, { 0x00, 0x67, 0xce, 0xa9, 0x9c, 0xfb, 0x52, 0x35, 
0x38, 0x5f, 0xf6, 0x91, 0xa4, 0xc3, 0x6a, 0x0d }, { 0x00, 0x4e, 0x81, 0xcf, 0x1f, 0x51, 0x9e, 0xd0, 0x3e, 0x70, 0xbf, 0xf1, 0x21, 0x6f, 0xa0, 0xee }, { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, { 0x00, 0x68, 0xd0, 0xb8, 0xa0, 0xc8, 0x70, 0x18, 0x40, 0x28, 0x90, 0xf8, 0xe0, 0x88, 0x30, 0x58 }, { 0x00, 0x4e, 0x81, 0xcf, 0x1f, 0x51, 0x9e, 0xd0, 0x3e, 0x70, 0xbf, 0xf1, 0x21, 0x6f, 0xa0, 0xee }, { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, { 0x00, 0x69, 0xd2, 0xbb, 0xa4, 0xcd, 0x76, 0x1f, 0x48, 0x21, 0x9a, 0xf3, 0xec, 0x85, 0x3e, 0x57 }, { 0x00, 0x4e, 0x81, 0xcf, 0x1f, 0x51, 0x9e, 0xd0, 0x23, 0x6d, 0xa2, 0xec, 0x3c, 0x72, 0xbd, 0xf3 }, { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, { 0x00, 0x6a, 0xd4, 0xbe, 0xa8, 0xc2, 0x7c, 0x16, 0x50, 0x3a, 0x84, 0xee, 0xf8, 0x92, 0x2c, 0x46 }, { 0x00, 0x4e, 0x81, 0xcf, 0x1f, 0x51, 0x9e, 0xd0, 0x23, 0x6d, 0xa2, 0xec, 0x3c, 0x72, 0xbd, 0xf3 }, { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, { 0x00, 0x6b, 0xd6, 0xbd, 0xac, 0xc7, 0x7a, 0x11, 0x58, 0x33, 0x8e, 0xe5, 0xf4, 0x9f, 0x22, 0x49 }, { 0x00, 0x4e, 0x81, 0xcf, 0x02, 0x4c, 0x83, 0xcd, 0x04, 0x4a, 0x85, 0xcb, 0x06, 0x48, 0x87, 0xc9 }, { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, { 0x00, 0x6c, 0xd8, 0xb4, 0xb0, 0xdc, 0x68, 0x04, 0x60, 0x0c, 0xb8, 0xd4, 0xd0, 0xbc, 0x08, 0x64 }, { 0x00, 0x4e, 0x81, 0xcf, 0x02, 0x4c, 0x83, 0xcd, 0x04, 0x4a, 0x85, 0xcb, 0x06, 0x48, 0x87, 0xc9 }, { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, { 0x00, 0x6d, 0xda, 0xb7, 0xb4, 0xd9, 0x6e, 0x03, 0x68, 0x05, 0xb2, 0xdf, 0xdc, 0xb1, 0x06, 0x6b }, { 0x00, 0x4e, 0x81, 0xcf, 0x02, 0x4c, 0x83, 0xcd, 0x19, 0x57, 0x98, 0xd6, 0x1b, 0x55, 0x9a, 0xd4 }, { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, { 0x00, 0x6e, 0xdc, 0xb2, 0xb8, 0xd6, 0x64, 0x0a, 0x70, 0x1e, 0xac, 0xc2, 0xc8, 0xa6, 0x14, 0x7a }, { 0x00, 0x4e, 0x81, 0xcf, 0x02, 0x4c, 0x83, 0xcd, 0x19, 0x57, 0x98, 0xd6, 0x1b, 0x55, 0x9a, 0xd4 }, { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, { 0x00, 0x6f, 0xde, 0xb1, 0xbc, 0xd3, 0x62, 0x0d, 0x78, 0x17, 0xa6, 0xc9, 0xc4, 0xab, 0x1a, 0x75 }, { 0x00, 0x53, 0xa6, 0xf5, 0x51, 0x02, 0xf7, 0xa4, 0xa2, 0xf1, 0x04, 0x57, 0xf3, 0xa0, 0x55, 0x06 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, { 
0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, { 0x00, 0x53, 0xa6, 0xf5, 0x51, 0x02, 0xf7, 0xa4, 0xa2, 0xf1, 0x04, 0x57, 0xf3, 0xa0, 0x55, 0x06 }, { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, { 0x00, 0x71, 0xe2, 0x93, 0xc4, 0xb5, 0x26, 0x57, 0x88, 0xf9, 0x6a, 0x1b, 0x4c, 0x3d, 0xae, 0xdf }, { 0x00, 0x53, 0xa6, 0xf5, 0x51, 0x02, 0xf7, 0xa4, 0xbf, 0xec, 0x19, 0x4a, 0xee, 0xbd, 0x48, 0x1b }, { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, { 0x00, 0x72, 0xe4, 0x96, 0xc8, 0xba, 0x2c, 0x5e, 0x90, 0xe2, 0x74, 0x06, 0x58, 0x2a, 0xbc, 0xce }, { 0x00, 0x53, 0xa6, 0xf5, 0x51, 0x02, 0xf7, 0xa4, 0xbf, 0xec, 0x19, 0x4a, 0xee, 0xbd, 0x48, 0x1b }, { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, { 0x00, 0x73, 0xe6, 0x95, 0xcc, 0xbf, 0x2a, 0x59, 0x98, 0xeb, 0x7e, 0x0d, 0x54, 0x27, 0xb2, 0xc1 }, { 0x00, 0x53, 0xa6, 0xf5, 0x4c, 0x1f, 0xea, 0xb9, 0x98, 0xcb, 0x3e, 0x6d, 0xd4, 0x87, 0x72, 0x21 }, { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, { 0x00, 0x74, 0xe8, 0x9c, 0xd0, 0xa4, 0x38, 0x4c, 0xa0, 0xd4, 0x48, 0x3c, 0x70, 0x04, 0x98, 0xec }, { 0x00, 0x53, 0xa6, 0xf5, 0x4c, 0x1f, 0xea, 0xb9, 0x98, 0xcb, 0x3e, 0x6d, 0xd4, 0x87, 0x72, 0x21 }, { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, { 0x00, 0x75, 0xea, 0x9f, 0xd4, 0xa1, 0x3e, 0x4b, 0xa8, 0xdd, 0x42, 0x37, 0x7c, 0x09, 0x96, 0xe3 }, { 0x00, 0x53, 0xa6, 0xf5, 0x4c, 0x1f, 0xea, 0xb9, 0x85, 0xd6, 0x23, 0x70, 0xc9, 0x9a, 0x6f, 0x3c }, { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, { 0x00, 0x76, 0xec, 0x9a, 0xd8, 0xae, 0x34, 0x42, 0xb0, 0xc6, 0x5c, 0x2a, 0x68, 0x1e, 0x84, 0xf2 }, { 0x00, 0x53, 0xa6, 0xf5, 0x4c, 0x1f, 0xea, 0xb9, 0x85, 0xd6, 0x23, 0x70, 0xc9, 0x9a, 0x6f, 0x3c }, { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, { 0x00, 0x77, 0xee, 0x99, 0xdc, 0xab, 0x32, 0x45, 0xb8, 0xcf, 0x56, 0x21, 0x64, 0x13, 0x8a, 0xfd }, { 0x00, 0x53, 0xbb, 0xe8, 0x6b, 0x38, 0xd0, 0x83, 0xd6, 0x85, 0x6d, 0x3e, 0xbd, 0xee, 0x06, 0x55 }, { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, { 0x00, 0x78, 0xf0, 0x88, 0xe0, 0x98, 0x10, 0x68, 0xc0, 0xb8, 0x30, 0x48, 0x20, 0x58, 0xd0, 0xa8 }, { 0x00, 0x53, 0xbb, 0xe8, 0x6b, 0x38, 0xd0, 0x83, 0xd6, 0x85, 0x6d, 0x3e, 0xbd, 0xee, 0x06, 0x55 }, { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x27, 
0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, { 0x00, 0x79, 0xf2, 0x8b, 0xe4, 0x9d, 0x16, 0x6f, 0xc8, 0xb1, 0x3a, 0x43, 0x2c, 0x55, 0xde, 0xa7 }, { 0x00, 0x53, 0xbb, 0xe8, 0x6b, 0x38, 0xd0, 0x83, 0xcb, 0x98, 0x70, 0x23, 0xa0, 0xf3, 0x1b, 0x48 }, { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, { 0x00, 0x7a, 0xf4, 0x8e, 0xe8, 0x92, 0x1c, 0x66, 0xd0, 0xaa, 0x24, 0x5e, 0x38, 0x42, 0xcc, 0xb6 }, { 0x00, 0x53, 0xbb, 0xe8, 0x6b, 0x38, 0xd0, 0x83, 0xcb, 0x98, 0x70, 0x23, 0xa0, 0xf3, 0x1b, 0x48 }, { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, { 0x00, 0x7b, 0xf6, 0x8d, 0xec, 0x97, 0x1a, 0x61, 0xd8, 0xa3, 0x2e, 0x55, 0x34, 0x4f, 0xc2, 0xb9 }, { 0x00, 0x53, 0xbb, 0xe8, 0x76, 0x25, 0xcd, 0x9e, 0xec, 0xbf, 0x57, 0x04, 0x9a, 0xc9, 0x21, 0x72 }, { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, { 0x00, 0x7c, 0xf8, 0x84, 0xf0, 0x8c, 0x08, 0x74, 0xe0, 0x9c, 0x18, 0x64, 0x10, 0x6c, 0xe8, 0x94 }, { 0x00, 0x53, 0xbb, 0xe8, 0x76, 0x25, 0xcd, 0x9e, 0xec, 0xbf, 0x57, 0x04, 0x9a, 0xc9, 0x21, 0x72 }, { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, { 0x00, 0x7d, 0xfa, 0x87, 0xf4, 0x89, 0x0e, 0x73, 0xe8, 0x95, 0x12, 0x6f, 0x1c, 0x61, 0xe6, 0x9b }, { 0x00, 0x53, 0xbb, 0xe8, 0x76, 0x25, 0xcd, 0x9e, 0xf1, 0xa2, 0x4a, 0x19, 0x87, 0xd4, 0x3c, 0x6f }, { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, { 0x00, 0x7e, 0xfc, 0x82, 0xf8, 0x86, 0x04, 0x7a, 0xf0, 0x8e, 0x0c, 0x72, 0x08, 0x76, 0xf4, 0x8a }, { 0x00, 0x53, 0xbb, 0xe8, 0x76, 0x25, 0xcd, 0x9e, 0xf1, 0xa2, 0x4a, 0x19, 0x87, 0xd4, 0x3c, 0x6f }, { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, { 0x00, 0x7f, 0xfe, 0x81, 0xfc, 0x83, 0x02, 0x7d, 0xf8, 0x87, 0x06, 0x79, 0x04, 0x7b, 0xfa, 0x85 }, { 0x00, 0xe8, 0xcd, 0x25, 0x87, 0x6f, 0x4a, 0xa2, 0x13, 0xfb, 0xde, 0x36, 0x94, 0x7c, 0x59, 0xb1 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, { 0x00, 0xe8, 0xcd, 0x25, 0x87, 0x6f, 0x4a, 0xa2, 0x13, 0xfb, 0xde, 0x36, 0x94, 0x7c, 0x59, 0xb1 }, { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, { 0x00, 0x81, 0x02, 0x83, 0x04, 0x85, 0x06, 0x87, 0x08, 0x89, 0x0a, 0x8b, 0x0c, 0x8d, 0x0e, 0x8f }, { 0x00, 0xe8, 0xcd, 0x25, 0x87, 0x6f, 0x4a, 0xa2, 0x0e, 0xe6, 0xc3, 0x2b, 0x89, 0x61, 0x44, 0xac }, { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, { 0x00, 
0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, { 0x00, 0x82, 0x04, 0x86, 0x08, 0x8a, 0x0c, 0x8e, 0x10, 0x92, 0x14, 0x96, 0x18, 0x9a, 0x1c, 0x9e }, { 0x00, 0xe8, 0xcd, 0x25, 0x87, 0x6f, 0x4a, 0xa2, 0x0e, 0xe6, 0xc3, 0x2b, 0x89, 0x61, 0x44, 0xac }, { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, { 0x00, 0x83, 0x06, 0x85, 0x0c, 0x8f, 0x0a, 0x89, 0x18, 0x9b, 0x1e, 0x9d, 0x14, 0x97, 0x12, 0x91 }, { 0x00, 0xe8, 0xcd, 0x25, 0x9a, 0x72, 0x57, 0xbf, 0x29, 0xc1, 0xe4, 0x0c, 0xb3, 0x5b, 0x7e, 0x96 }, { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, { 0x00, 0x84, 0x08, 0x8c, 0x10, 0x94, 0x18, 0x9c, 0x20, 0xa4, 0x28, 0xac, 0x30, 0xb4, 0x38, 0xbc }, { 0x00, 0xe8, 0xcd, 0x25, 0x9a, 0x72, 0x57, 0xbf, 0x29, 0xc1, 0xe4, 0x0c, 0xb3, 0x5b, 0x7e, 0x96 }, { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, { 0x00, 0x85, 0x0a, 0x8f, 0x14, 0x91, 0x1e, 0x9b, 0x28, 0xad, 0x22, 0xa7, 0x3c, 0xb9, 0x36, 0xb3 }, { 0x00, 0xe8, 0xcd, 0x25, 0x9a, 0x72, 0x57, 0xbf, 0x34, 0xdc, 0xf9, 0x11, 0xae, 0x46, 0x63, 0x8b }, { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, { 0x00, 0x86, 0x0c, 0x8a, 0x18, 0x9e, 0x14, 0x92, 0x30, 0xb6, 0x3c, 0xba, 0x28, 0xae, 0x24, 0xa2 }, { 0x00, 0xe8, 0xcd, 0x25, 0x9a, 0x72, 0x57, 0xbf, 0x34, 0xdc, 0xf9, 0x11, 0xae, 0x46, 0x63, 0x8b }, { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, { 0x00, 0x87, 0x0e, 0x89, 0x1c, 0x9b, 0x12, 0x95, 0x38, 0xbf, 0x36, 0xb1, 0x24, 0xa3, 0x2a, 0xad }, { 0x00, 0xe8, 0xd0, 0x38, 0xbd, 0x55, 0x6d, 0x85, 0x67, 0x8f, 0xb7, 0x5f, 0xda, 0x32, 0x0a, 0xe2 }, { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, { 0x00, 0x88, 0x10, 0x98, 0x20, 0xa8, 0x30, 0xb8, 0x40, 0xc8, 0x50, 0xd8, 0x60, 0xe8, 0x70, 0xf8 }, { 0x00, 0xe8, 0xd0, 0x38, 0xbd, 0x55, 0x6d, 0x85, 0x67, 0x8f, 0xb7, 0x5f, 0xda, 0x32, 0x0a, 0xe2 }, { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, { 0x00, 0x89, 0x12, 0x9b, 0x24, 0xad, 0x36, 0xbf, 0x48, 0xc1, 0x5a, 0xd3, 0x6c, 0xe5, 0x7e, 0xf7 }, { 0x00, 0xe8, 0xd0, 0x38, 0xbd, 0x55, 0x6d, 0x85, 0x7a, 0x92, 0xaa, 0x42, 0xc7, 0x2f, 0x17, 0xff }, { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, { 0x00, 0x8a, 0x14, 0x9e, 0x28, 0xa2, 0x3c, 0xb6, 0x50, 0xda, 0x44, 0xce, 0x78, 0xf2, 0x6c, 0xe6 }, { 0x00, 0xe8, 0xd0, 0x38, 0xbd, 0x55, 0x6d, 0x85, 0x7a, 0x92, 0xaa, 0x42, 0xc7, 0x2f, 0x17, 0xff }, { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, 0x80, 0x30, 
0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, { 0x00, 0x8b, 0x16, 0x9d, 0x2c, 0xa7, 0x3a, 0xb1, 0x58, 0xd3, 0x4e, 0xc5, 0x74, 0xff, 0x62, 0xe9 }, { 0x00, 0xe8, 0xd0, 0x38, 0xa0, 0x48, 0x70, 0x98, 0x5d, 0xb5, 0x8d, 0x65, 0xfd, 0x15, 0x2d, 0xc5 }, { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, { 0x00, 0x8c, 0x18, 0x94, 0x30, 0xbc, 0x28, 0xa4, 0x60, 0xec, 0x78, 0xf4, 0x50, 0xdc, 0x48, 0xc4 }, { 0x00, 0xe8, 0xd0, 0x38, 0xa0, 0x48, 0x70, 0x98, 0x5d, 0xb5, 0x8d, 0x65, 0xfd, 0x15, 0x2d, 0xc5 }, { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, { 0x00, 0x8d, 0x1a, 0x97, 0x34, 0xb9, 0x2e, 0xa3, 0x68, 0xe5, 0x72, 0xff, 0x5c, 0xd1, 0x46, 0xcb }, { 0x00, 0xe8, 0xd0, 0x38, 0xa0, 0x48, 0x70, 0x98, 0x40, 0xa8, 0x90, 0x78, 0xe0, 0x08, 0x30, 0xd8 }, { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, { 0x00, 0x8e, 0x1c, 0x92, 0x38, 0xb6, 0x24, 0xaa, 0x70, 0xfe, 0x6c, 0xe2, 0x48, 0xc6, 0x54, 0xda }, { 0x00, 0xe8, 0xd0, 0x38, 0xa0, 0x48, 0x70, 0x98, 0x40, 0xa8, 0x90, 0x78, 0xe0, 0x08, 0x30, 0xd8 }, { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, { 0x00, 0x8f, 0x1e, 0x91, 0x3c, 0xb3, 0x22, 0xad, 0x78, 0xf7, 0x66, 0xe9, 0x44, 0xcb, 0x5a, 0xd5 }, { 0x00, 0xf5, 0xf7, 0x02, 0xf3, 0x06, 0x04, 0xf1, 0xfb, 0x0e, 0x0c, 0xf9, 0x08, 0xfd, 0xff, 0x0a }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, { 0x00, 0xf5, 0xf7, 0x02, 0xf3, 0x06, 0x04, 0xf1, 0xfb, 0x0e, 0x0c, 0xf9, 0x08, 0xfd, 0xff, 0x0a }, { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, { 0x00, 0x91, 0x22, 0xb3, 0x44, 0xd5, 0x66, 0xf7, 0x88, 0x19, 0xaa, 0x3b, 0xcc, 0x5d, 0xee, 0x7f }, { 0x00, 0xf5, 0xf7, 0x02, 0xf3, 0x06, 0x04, 0xf1, 0xe6, 0x13, 0x11, 0xe4, 0x15, 0xe0, 0xe2, 0x17 }, { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, { 0x00, 0x92, 0x24, 0xb6, 0x48, 0xda, 0x6c, 0xfe, 0x90, 0x02, 0xb4, 0x26, 0xd8, 0x4a, 0xfc, 0x6e }, { 0x00, 0xf5, 0xf7, 0x02, 0xf3, 0x06, 0x04, 0xf1, 0xe6, 0x13, 0x11, 0xe4, 0x15, 0xe0, 0xe2, 0x17 }, { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, { 0x00, 0x93, 0x26, 0xb5, 0x4c, 0xdf, 0x6a, 0xf9, 0x98, 0x0b, 0xbe, 0x2d, 0xd4, 0x47, 0xf2, 0x61 }, { 0x00, 0xf5, 0xf7, 0x02, 0xee, 0x1b, 0x19, 0xec, 0xc1, 0x34, 0x36, 0xc3, 0x2f, 0xda, 0xd8, 0x2d }, { 0x00, 0x40, 
0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, { 0x00, 0x94, 0x28, 0xbc, 0x50, 0xc4, 0x78, 0xec, 0xa0, 0x34, 0x88, 0x1c, 0xf0, 0x64, 0xd8, 0x4c }, { 0x00, 0xf5, 0xf7, 0x02, 0xee, 0x1b, 0x19, 0xec, 0xc1, 0x34, 0x36, 0xc3, 0x2f, 0xda, 0xd8, 0x2d }, { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, { 0x00, 0x95, 0x2a, 0xbf, 0x54, 0xc1, 0x7e, 0xeb, 0xa8, 0x3d, 0x82, 0x17, 0xfc, 0x69, 0xd6, 0x43 }, { 0x00, 0xf5, 0xf7, 0x02, 0xee, 0x1b, 0x19, 0xec, 0xdc, 0x29, 0x2b, 0xde, 0x32, 0xc7, 0xc5, 0x30 }, { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, { 0x00, 0x96, 0x2c, 0xba, 0x58, 0xce, 0x74, 0xe2, 0xb0, 0x26, 0x9c, 0x0a, 0xe8, 0x7e, 0xc4, 0x52 }, { 0x00, 0xf5, 0xf7, 0x02, 0xee, 0x1b, 0x19, 0xec, 0xdc, 0x29, 0x2b, 0xde, 0x32, 0xc7, 0xc5, 0x30 }, { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, { 0x00, 0x97, 0x2e, 0xb9, 0x5c, 0xcb, 0x72, 0xe5, 0xb8, 0x2f, 0x96, 0x01, 0xe4, 0x73, 0xca, 0x5d }, { 0x00, 0xf5, 0xea, 0x1f, 0xc9, 0x3c, 0x23, 0xd6, 0x8f, 0x7a, 0x65, 0x90, 0x46, 0xb3, 0xac, 0x59 }, { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, { 0x00, 0x98, 0x30, 0xa8, 0x60, 0xf8, 0x50, 0xc8, 0xc0, 0x58, 0xf0, 0x68, 0xa0, 0x38, 0x90, 0x08 }, { 0x00, 0xf5, 0xea, 0x1f, 0xc9, 0x3c, 0x23, 0xd6, 0x8f, 0x7a, 0x65, 0x90, 0x46, 0xb3, 0xac, 0x59 }, { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, { 0x00, 0x99, 0x32, 0xab, 0x64, 0xfd, 0x56, 0xcf, 0xc8, 0x51, 0xfa, 0x63, 0xac, 0x35, 0x9e, 0x07 }, { 0x00, 0xf5, 0xea, 0x1f, 0xc9, 0x3c, 0x23, 0xd6, 0x92, 0x67, 0x78, 0x8d, 0x5b, 0xae, 0xb1, 0x44 }, { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, { 0x00, 0x9a, 0x34, 0xae, 0x68, 0xf2, 0x5c, 0xc6, 0xd0, 0x4a, 0xe4, 0x7e, 0xb8, 0x22, 0x8c, 0x16 }, { 0x00, 0xf5, 0xea, 0x1f, 0xc9, 0x3c, 0x23, 0xd6, 0x92, 0x67, 0x78, 0x8d, 0x5b, 0xae, 0xb1, 0x44 }, { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, { 0x00, 0x9b, 0x36, 0xad, 0x6c, 0xf7, 0x5a, 0xc1, 0xd8, 0x43, 0xee, 0x75, 0xb4, 0x2f, 0x82, 0x19 }, { 0x00, 0xf5, 0xea, 0x1f, 0xd4, 0x21, 0x3e, 0xcb, 0xb5, 0x40, 0x5f, 0xaa, 0x61, 0x94, 0x8b, 0x7e }, { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, { 0x00, 0x9c, 0x38, 0xa4, 0x70, 0xec, 0x48, 0xd4, 0xe0, 0x7c, 0xd8, 0x44, 0x90, 0x0c, 0xa8, 0x34 }, { 0x00, 0xf5, 0xea, 0x1f, 0xd4, 0x21, 0x3e, 0xcb, 0xb5, 0x40, 0x5f, 
0xaa, 0x61, 0x94, 0x8b, 0x7e }, { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, { 0x00, 0x9d, 0x3a, 0xa7, 0x74, 0xe9, 0x4e, 0xd3, 0xe8, 0x75, 0xd2, 0x4f, 0x9c, 0x01, 0xa6, 0x3b }, { 0x00, 0xf5, 0xea, 0x1f, 0xd4, 0x21, 0x3e, 0xcb, 0xa8, 0x5d, 0x42, 0xb7, 0x7c, 0x89, 0x96, 0x63 }, { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, { 0x00, 0x9e, 0x3c, 0xa2, 0x78, 0xe6, 0x44, 0xda, 0xf0, 0x6e, 0xcc, 0x52, 0x88, 0x16, 0xb4, 0x2a }, { 0x00, 0xf5, 0xea, 0x1f, 0xd4, 0x21, 0x3e, 0xcb, 0xa8, 0x5d, 0x42, 0xb7, 0x7c, 0x89, 0x96, 0x63 }, { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, { 0x00, 0x9f, 0x3e, 0xa1, 0x7c, 0xe3, 0x42, 0xdd, 0xf8, 0x67, 0xc6, 0x59, 0x84, 0x1b, 0xba, 0x25 }, { 0x00, 0xd2, 0xb9, 0x6b, 0x6f, 0xbd, 0xd6, 0x04, 0xde, 0x0c, 0x67, 0xb5, 0xb1, 0x63, 0x08, 0xda }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, { 0x00, 0xd2, 0xb9, 0x6b, 0x6f, 0xbd, 0xd6, 0x04, 0xde, 0x0c, 0x67, 0xb5, 0xb1, 0x63, 0x08, 0xda }, { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, { 0x00, 0xa1, 0x42, 0xe3, 0x84, 0x25, 0xc6, 0x67, 0x08, 0xa9, 0x4a, 0xeb, 0x8c, 0x2d, 0xce, 0x6f }, { 0x00, 0xd2, 0xb9, 0x6b, 0x6f, 0xbd, 0xd6, 0x04, 0xc3, 0x11, 0x7a, 0xa8, 0xac, 0x7e, 0x15, 0xc7 }, { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, { 0x00, 0xa2, 0x44, 0xe6, 0x88, 0x2a, 0xcc, 0x6e, 0x10, 0xb2, 0x54, 0xf6, 0x98, 0x3a, 0xdc, 0x7e }, { 0x00, 0xd2, 0xb9, 0x6b, 0x6f, 0xbd, 0xd6, 0x04, 0xc3, 0x11, 0x7a, 0xa8, 0xac, 0x7e, 0x15, 0xc7 }, { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, { 0x00, 0xa3, 0x46, 0xe5, 0x8c, 0x2f, 0xca, 0x69, 0x18, 0xbb, 0x5e, 0xfd, 0x94, 0x37, 0xd2, 0x71 }, { 0x00, 0xd2, 0xb9, 0x6b, 0x72, 0xa0, 0xcb, 0x19, 0xe4, 0x36, 0x5d, 0x8f, 0x96, 0x44, 0x2f, 0xfd }, { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, { 0x00, 0xa4, 0x48, 0xec, 0x90, 0x34, 0xd8, 0x7c, 0x20, 0x84, 0x68, 0xcc, 0xb0, 0x14, 0xf8, 0x5c }, { 0x00, 0xd2, 0xb9, 0x6b, 0x72, 0xa0, 0xcb, 0x19, 0xe4, 0x36, 0x5d, 0x8f, 0x96, 0x44, 0x2f, 0xfd }, { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, { 0x00, 0xa5, 0x4a, 0xef, 0x94, 0x31, 0xde, 0x7b, 0x28, 0x8d, 0x62, 0xc7, 0xbc, 0x19, 0xf6, 0x53 }, { 0x00, 0xd2, 0xb9, 
0x6b, 0x72, 0xa0, 0xcb, 0x19, 0xf9, 0x2b, 0x40, 0x92, 0x8b, 0x59, 0x32, 0xe0 }, { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, { 0x00, 0xa6, 0x4c, 0xea, 0x98, 0x3e, 0xd4, 0x72, 0x30, 0x96, 0x7c, 0xda, 0xa8, 0x0e, 0xe4, 0x42 }, { 0x00, 0xd2, 0xb9, 0x6b, 0x72, 0xa0, 0xcb, 0x19, 0xf9, 0x2b, 0x40, 0x92, 0x8b, 0x59, 0x32, 0xe0 }, { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, { 0x00, 0xa7, 0x4e, 0xe9, 0x9c, 0x3b, 0xd2, 0x75, 0x38, 0x9f, 0x76, 0xd1, 0xa4, 0x03, 0xea, 0x4d }, { 0x00, 0xd2, 0xa4, 0x76, 0x55, 0x87, 0xf1, 0x23, 0xaa, 0x78, 0x0e, 0xdc, 0xff, 0x2d, 0x5b, 0x89 }, { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, { 0x00, 0xa8, 0x50, 0xf8, 0xa0, 0x08, 0xf0, 0x58, 0x40, 0xe8, 0x10, 0xb8, 0xe0, 0x48, 0xb0, 0x18 }, { 0x00, 0xd2, 0xa4, 0x76, 0x55, 0x87, 0xf1, 0x23, 0xaa, 0x78, 0x0e, 0xdc, 0xff, 0x2d, 0x5b, 0x89 }, { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, { 0x00, 0xa9, 0x52, 0xfb, 0xa4, 0x0d, 0xf6, 0x5f, 0x48, 0xe1, 0x1a, 0xb3, 0xec, 0x45, 0xbe, 0x17 }, { 0x00, 0xd2, 0xa4, 0x76, 0x55, 0x87, 0xf1, 0x23, 0xb7, 0x65, 0x13, 0xc1, 0xe2, 0x30, 0x46, 0x94 }, { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, { 0x00, 0xaa, 0x54, 0xfe, 0xa8, 0x02, 0xfc, 0x56, 0x50, 0xfa, 0x04, 0xae, 0xf8, 0x52, 0xac, 0x06 }, { 0x00, 0xd2, 0xa4, 0x76, 0x55, 0x87, 0xf1, 0x23, 0xb7, 0x65, 0x13, 0xc1, 0xe2, 0x30, 0x46, 0x94 }, { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, { 0x00, 0xab, 0x56, 0xfd, 0xac, 0x07, 0xfa, 0x51, 0x58, 0xf3, 0x0e, 0xa5, 0xf4, 0x5f, 0xa2, 0x09 }, { 0x00, 0xd2, 0xa4, 0x76, 0x48, 0x9a, 0xec, 0x3e, 0x90, 0x42, 0x34, 0xe6, 0xd8, 0x0a, 0x7c, 0xae }, { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, { 0x00, 0xac, 0x58, 0xf4, 0xb0, 0x1c, 0xe8, 0x44, 0x60, 0xcc, 0x38, 0x94, 0xd0, 0x7c, 0x88, 0x24 }, { 0x00, 0xd2, 0xa4, 0x76, 0x48, 0x9a, 0xec, 0x3e, 0x90, 0x42, 0x34, 0xe6, 0xd8, 0x0a, 0x7c, 0xae }, { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, { 0x00, 0xad, 0x5a, 0xf7, 0xb4, 0x19, 0xee, 0x43, 0x68, 0xc5, 0x32, 0x9f, 0xdc, 0x71, 0x86, 0x2b }, { 0x00, 0xd2, 0xa4, 0x76, 0x48, 0x9a, 0xec, 0x3e, 0x8d, 0x5f, 0x29, 0xfb, 0xc5, 0x17, 0x61, 0xb3 }, { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, { 0x00, 0xae, 0x5c, 0xf2, 0xb8, 0x16, 0xe4, 0x4a, 0x70, 0xde, 0x2c, 0x82, 
0xc8, 0x66, 0x94, 0x3a }, { 0x00, 0xd2, 0xa4, 0x76, 0x48, 0x9a, 0xec, 0x3e, 0x8d, 0x5f, 0x29, 0xfb, 0xc5, 0x17, 0x61, 0xb3 }, { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, { 0x00, 0xaf, 0x5e, 0xf1, 0xbc, 0x13, 0xe2, 0x4d, 0x78, 0xd7, 0x26, 0x89, 0xc4, 0x6b, 0x9a, 0x35 }, { 0x00, 0xcf, 0x83, 0x4c, 0x1b, 0xd4, 0x98, 0x57, 0x36, 0xf9, 0xb5, 0x7a, 0x2d, 0xe2, 0xae, 0x61 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, { 0x00, 0xcf, 0x83, 0x4c, 0x1b, 0xd4, 0x98, 0x57, 0x36, 0xf9, 0xb5, 0x7a, 0x2d, 0xe2, 0xae, 0x61 }, { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, { 0x00, 0xb1, 0x62, 0xd3, 0xc4, 0x75, 0xa6, 0x17, 0x88, 0x39, 0xea, 0x5b, 0x4c, 0xfd, 0x2e, 0x9f }, { 0x00, 0xcf, 0x83, 0x4c, 0x1b, 0xd4, 0x98, 0x57, 0x2b, 0xe4, 0xa8, 0x67, 0x30, 0xff, 0xb3, 0x7c }, { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, { 0x00, 0xb2, 0x64, 0xd6, 0xc8, 0x7a, 0xac, 0x1e, 0x90, 0x22, 0xf4, 0x46, 0x58, 0xea, 0x3c, 0x8e }, { 0x00, 0xcf, 0x83, 0x4c, 0x1b, 0xd4, 0x98, 0x57, 0x2b, 0xe4, 0xa8, 0x67, 0x30, 0xff, 0xb3, 0x7c }, { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, { 0x00, 0xb3, 0x66, 0xd5, 0xcc, 0x7f, 0xaa, 0x19, 0x98, 0x2b, 0xfe, 0x4d, 0x54, 0xe7, 0x32, 0x81 }, { 0x00, 0xcf, 0x83, 0x4c, 0x06, 0xc9, 0x85, 0x4a, 0x0c, 0xc3, 0x8f, 0x40, 0x0a, 0xc5, 0x89, 0x46 }, { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, { 0x00, 0xb4, 0x68, 0xdc, 0xd0, 0x64, 0xb8, 0x0c, 0xa0, 0x14, 0xc8, 0x7c, 0x70, 0xc4, 0x18, 0xac }, { 0x00, 0xcf, 0x83, 0x4c, 0x06, 0xc9, 0x85, 0x4a, 0x0c, 0xc3, 0x8f, 0x40, 0x0a, 0xc5, 0x89, 0x46 }, { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, { 0x00, 0xb5, 0x6a, 0xdf, 0xd4, 0x61, 0xbe, 0x0b, 0xa8, 0x1d, 0xc2, 0x77, 0x7c, 0xc9, 0x16, 0xa3 }, { 0x00, 0xcf, 0x83, 0x4c, 0x06, 0xc9, 0x85, 0x4a, 0x11, 0xde, 0x92, 0x5d, 0x17, 0xd8, 0x94, 0x5b }, { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, { 0x00, 0xb6, 0x6c, 0xda, 0xd8, 0x6e, 0xb4, 0x02, 0xb0, 0x06, 0xdc, 0x6a, 0x68, 0xde, 0x04, 0xb2 }, { 0x00, 0xcf, 0x83, 0x4c, 0x06, 0xc9, 0x85, 0x4a, 0x11, 0xde, 0x92, 0x5d, 0x17, 0xd8, 0x94, 0x5b }, { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, { 0x00, 0xb7, 0x6e, 0xd9, 
0xdc, 0x6b, 0xb2, 0x05, 0xb8, 0x0f, 0xd6, 0x61, 0x64, 0xd3, 0x0a, 0xbd }, { 0x00, 0xcf, 0x9e, 0x51, 0x21, 0xee, 0xbf, 0x70, 0x42, 0x8d, 0xdc, 0x13, 0x63, 0xac, 0xfd, 0x32 }, { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, { 0x00, 0xb8, 0x70, 0xc8, 0xe0, 0x58, 0x90, 0x28, 0xc0, 0x78, 0xb0, 0x08, 0x20, 0x98, 0x50, 0xe8 }, { 0x00, 0xcf, 0x9e, 0x51, 0x21, 0xee, 0xbf, 0x70, 0x42, 0x8d, 0xdc, 0x13, 0x63, 0xac, 0xfd, 0x32 }, { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, { 0x00, 0xb9, 0x72, 0xcb, 0xe4, 0x5d, 0x96, 0x2f, 0xc8, 0x71, 0xba, 0x03, 0x2c, 0x95, 0x5e, 0xe7 }, { 0x00, 0xcf, 0x9e, 0x51, 0x21, 0xee, 0xbf, 0x70, 0x5f, 0x90, 0xc1, 0x0e, 0x7e, 0xb1, 0xe0, 0x2f }, { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, { 0x00, 0xba, 0x74, 0xce, 0xe8, 0x52, 0x9c, 0x26, 0xd0, 0x6a, 0xa4, 0x1e, 0x38, 0x82, 0x4c, 0xf6 }, { 0x00, 0xcf, 0x9e, 0x51, 0x21, 0xee, 0xbf, 0x70, 0x5f, 0x90, 0xc1, 0x0e, 0x7e, 0xb1, 0xe0, 0x2f }, { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, { 0x00, 0xbb, 0x76, 0xcd, 0xec, 0x57, 0x9a, 0x21, 0xd8, 0x63, 0xae, 0x15, 0x34, 0x8f, 0x42, 0xf9 }, { 0x00, 0xcf, 0x9e, 0x51, 0x3c, 0xf3, 0xa2, 0x6d, 0x78, 0xb7, 0xe6, 0x29, 0x44, 0x8b, 0xda, 0x15 }, { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, { 0x00, 0xbc, 0x78, 0xc4, 0xf0, 0x4c, 0x88, 0x34, 0xe0, 0x5c, 0x98, 0x24, 0x10, 0xac, 0x68, 0xd4 }, { 0x00, 0xcf, 0x9e, 0x51, 0x3c, 0xf3, 0xa2, 0x6d, 0x78, 0xb7, 0xe6, 0x29, 0x44, 0x8b, 0xda, 0x15 }, { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, { 0x00, 0xbd, 0x7a, 0xc7, 0xf4, 0x49, 0x8e, 0x33, 0xe8, 0x55, 0x92, 0x2f, 0x1c, 0xa1, 0x66, 0xdb }, { 0x00, 0xcf, 0x9e, 0x51, 0x3c, 0xf3, 0xa2, 0x6d, 0x65, 0xaa, 0xfb, 0x34, 0x59, 0x96, 0xc7, 0x08 }, { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, { 0x00, 0xbe, 0x7c, 0xc2, 0xf8, 0x46, 0x84, 0x3a, 0xf0, 0x4e, 0x8c, 0x32, 0x08, 0xb6, 0x74, 0xca }, { 0x00, 0xcf, 0x9e, 0x51, 0x3c, 0xf3, 0xa2, 0x6d, 0x65, 0xaa, 0xfb, 0x34, 0x59, 0x96, 0xc7, 0x08 }, { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, { 0x00, 0xbf, 0x7e, 0xc1, 0xfc, 0x43, 0x82, 0x3d, 0xf8, 0x47, 0x86, 0x39, 0x04, 0xbb, 0x7a, 0xc5 }, { 0x00, 0x9c, 0x25, 0xb9, 0x4a, 0xd6, 0x6f, 0xf3, 0x94, 0x08, 0xb1, 0x2d, 0xde, 0x42, 0xfb, 0x67 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x4e, 0x4e, 0x53, 0x53, 0x69, 
0x69, 0x74, 0x74 }, { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, { 0x00, 0x9c, 0x25, 0xb9, 0x4a, 0xd6, 0x6f, 0xf3, 0x94, 0x08, 0xb1, 0x2d, 0xde, 0x42, 0xfb, 0x67 }, { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, { 0x00, 0xc1, 0x82, 0x43, 0x04, 0xc5, 0x86, 0x47, 0x08, 0xc9, 0x8a, 0x4b, 0x0c, 0xcd, 0x8e, 0x4f }, { 0x00, 0x9c, 0x25, 0xb9, 0x4a, 0xd6, 0x6f, 0xf3, 0x89, 0x15, 0xac, 0x30, 0xc3, 0x5f, 0xe6, 0x7a }, { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, { 0x00, 0xc2, 0x84, 0x46, 0x08, 0xca, 0x8c, 0x4e, 0x10, 0xd2, 0x94, 0x56, 0x18, 0xda, 0x9c, 0x5e }, { 0x00, 0x9c, 0x25, 0xb9, 0x4a, 0xd6, 0x6f, 0xf3, 0x89, 0x15, 0xac, 0x30, 0xc3, 0x5f, 0xe6, 0x7a }, { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, { 0x00, 0xc3, 0x86, 0x45, 0x0c, 0xcf, 0x8a, 0x49, 0x18, 0xdb, 0x9e, 0x5d, 0x14, 0xd7, 0x92, 0x51 }, { 0x00, 0x9c, 0x25, 0xb9, 0x57, 0xcb, 0x72, 0xee, 0xae, 0x32, 0x8b, 0x17, 0xf9, 0x65, 0xdc, 0x40 }, { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, { 0x00, 0xc4, 0x88, 0x4c, 0x10, 0xd4, 0x98, 0x5c, 0x20, 0xe4, 0xa8, 0x6c, 0x30, 0xf4, 0xb8, 0x7c }, { 0x00, 0x9c, 0x25, 0xb9, 0x57, 0xcb, 0x72, 0xee, 0xae, 0x32, 0x8b, 0x17, 0xf9, 0x65, 0xdc, 0x40 }, { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, { 0x00, 0xc5, 0x8a, 0x4f, 0x14, 0xd1, 0x9e, 0x5b, 0x28, 0xed, 0xa2, 0x67, 0x3c, 0xf9, 0xb6, 0x73 }, { 0x00, 0x9c, 0x25, 0xb9, 0x57, 0xcb, 0x72, 0xee, 0xb3, 0x2f, 0x96, 0x0a, 0xe4, 0x78, 0xc1, 0x5d }, { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, { 0x00, 0xc6, 0x8c, 0x4a, 0x18, 0xde, 0x94, 0x52, 0x30, 0xf6, 0xbc, 0x7a, 0x28, 0xee, 0xa4, 0x62 }, { 0x00, 0x9c, 0x25, 0xb9, 0x57, 0xcb, 0x72, 0xee, 0xb3, 0x2f, 0x96, 0x0a, 0xe4, 0x78, 0xc1, 0x5d }, { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, { 0x00, 0xc7, 0x8e, 0x49, 0x1c, 0xdb, 0x92, 0x55, 0x38, 0xff, 0xb6, 0x71, 0x24, 0xe3, 0xaa, 0x6d }, { 0x00, 0x9c, 0x38, 0xa4, 0x70, 0xec, 0x48, 0xd4, 0xe0, 0x7c, 0xd8, 0x44, 0x90, 0x0c, 0xa8, 0x34 }, { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, { 0x00, 0xc8, 0x90, 0x58, 0x20, 0xe8, 0xb0, 0x78, 0x40, 0x88, 0xd0, 0x18, 0x60, 0xa8, 0xf0, 0x38 }, { 0x00, 0x9c, 0x38, 0xa4, 0x70, 0xec, 0x48, 0xd4, 0xe0, 0x7c, 0xd8, 0x44, 0x90, 0x0c, 0xa8, 0x34 }, { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 
0x27, 0x3a, 0x3a, 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, { 0x00, 0xc9, 0x92, 0x5b, 0x24, 0xed, 0xb6, 0x7f, 0x48, 0x81, 0xda, 0x13, 0x6c, 0xa5, 0xfe, 0x37 }, { 0x00, 0x9c, 0x38, 0xa4, 0x70, 0xec, 0x48, 0xd4, 0xfd, 0x61, 0xc5, 0x59, 0x8d, 0x11, 0xb5, 0x29 }, { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, { 0x00, 0xca, 0x94, 0x5e, 0x28, 0xe2, 0xbc, 0x76, 0x50, 0x9a, 0xc4, 0x0e, 0x78, 0xb2, 0xec, 0x26 }, { 0x00, 0x9c, 0x38, 0xa4, 0x70, 0xec, 0x48, 0xd4, 0xfd, 0x61, 0xc5, 0x59, 0x8d, 0x11, 0xb5, 0x29 }, { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, { 0x00, 0xcb, 0x96, 0x5d, 0x2c, 0xe7, 0xba, 0x71, 0x58, 0x93, 0xce, 0x05, 0x74, 0xbf, 0xe2, 0x29 }, { 0x00, 0x9c, 0x38, 0xa4, 0x6d, 0xf1, 0x55, 0xc9, 0xda, 0x46, 0xe2, 0x7e, 0xb7, 0x2b, 0x8f, 0x13 }, { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, { 0x00, 0xcc, 0x98, 0x54, 0x30, 0xfc, 0xa8, 0x64, 0x60, 0xac, 0xf8, 0x34, 0x50, 0x9c, 0xc8, 0x04 }, { 0x00, 0x9c, 0x38, 0xa4, 0x6d, 0xf1, 0x55, 0xc9, 0xda, 0x46, 0xe2, 0x7e, 0xb7, 0x2b, 0x8f, 0x13 }, { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, { 0x00, 0xcd, 0x9a, 0x57, 0x34, 0xf9, 0xae, 0x63, 0x68, 0xa5, 0xf2, 0x3f, 0x5c, 0x91, 0xc6, 0x0b }, { 0x00, 0x9c, 0x38, 0xa4, 0x6d, 0xf1, 0x55, 0xc9, 0xc7, 0x5b, 0xff, 0x63, 0xaa, 0x36, 0x92, 0x0e }, { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, { 0x00, 0xce, 0x9c, 0x52, 0x38, 0xf6, 0xa4, 0x6a, 0x70, 0xbe, 0xec, 0x22, 0x48, 0x86, 0xd4, 0x1a }, { 0x00, 0x9c, 0x38, 0xa4, 0x6d, 0xf1, 0x55, 0xc9, 0xc7, 0x5b, 0xff, 0x63, 0xaa, 0x36, 0x92, 0x0e }, { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, { 0x00, 0xcf, 0x9e, 0x51, 0x3c, 0xf3, 0xa2, 0x6d, 0x78, 0xb7, 0xe6, 0x29, 0x44, 0x8b, 0xda, 0x15 }, { 0x00, 0x81, 0x1f, 0x9e, 0x3e, 0xbf, 0x21, 0xa0, 0x7c, 0xfd, 0x63, 0xe2, 0x42, 0xc3, 0x5d, 0xdc }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, { 0x00, 0x81, 0x1f, 0x9e, 0x3e, 0xbf, 0x21, 0xa0, 0x7c, 0xfd, 0x63, 0xe2, 0x42, 0xc3, 0x5d, 0xdc }, { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, { 0x00, 0xd1, 0xa2, 0x73, 0x44, 0x95, 0xe6, 0x37, 0x88, 0x59, 0x2a, 0xfb, 0xcc, 0x1d, 0x6e, 0xbf }, { 0x00, 0x81, 0x1f, 0x9e, 0x3e, 0xbf, 0x21, 0xa0, 0x61, 0xe0, 0x7e, 0xff, 0x5f, 0xde, 0x40, 0xc1 }, { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 
0xc0, 0xe0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, { 0x00, 0xd2, 0xa4, 0x76, 0x48, 0x9a, 0xec, 0x3e, 0x90, 0x42, 0x34, 0xe6, 0xd8, 0x0a, 0x7c, 0xae }, { 0x00, 0x81, 0x1f, 0x9e, 0x3e, 0xbf, 0x21, 0xa0, 0x61, 0xe0, 0x7e, 0xff, 0x5f, 0xde, 0x40, 0xc1 }, { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, { 0x00, 0xd3, 0xa6, 0x75, 0x4c, 0x9f, 0xea, 0x39, 0x98, 0x4b, 0x3e, 0xed, 0xd4, 0x07, 0x72, 0xa1 }, { 0x00, 0x81, 0x1f, 0x9e, 0x23, 0xa2, 0x3c, 0xbd, 0x46, 0xc7, 0x59, 0xd8, 0x65, 0xe4, 0x7a, 0xfb }, { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, { 0x00, 0xd4, 0xa8, 0x7c, 0x50, 0x84, 0xf8, 0x2c, 0xa0, 0x74, 0x08, 0xdc, 0xf0, 0x24, 0x58, 0x8c }, { 0x00, 0x81, 0x1f, 0x9e, 0x23, 0xa2, 0x3c, 0xbd, 0x46, 0xc7, 0x59, 0xd8, 0x65, 0xe4, 0x7a, 0xfb }, { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, { 0x00, 0xd5, 0xaa, 0x7f, 0x54, 0x81, 0xfe, 0x2b, 0xa8, 0x7d, 0x02, 0xd7, 0xfc, 0x29, 0x56, 0x83 }, { 0x00, 0x81, 0x1f, 0x9e, 0x23, 0xa2, 0x3c, 0xbd, 0x5b, 0xda, 0x44, 0xc5, 0x78, 0xf9, 0x67, 0xe6 }, { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, { 0x00, 0xd6, 0xac, 0x7a, 0x58, 0x8e, 0xf4, 0x22, 0xb0, 0x66, 0x1c, 0xca, 0xe8, 0x3e, 0x44, 0x92 }, { 0x00, 0x81, 0x1f, 0x9e, 0x23, 0xa2, 0x3c, 0xbd, 0x5b, 0xda, 0x44, 0xc5, 0x78, 0xf9, 0x67, 0xe6 }, { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, { 0x00, 0xd7, 0xae, 0x79, 0x5c, 0x8b, 0xf2, 0x25, 0xb8, 0x6f, 0x16, 0xc1, 0xe4, 0x33, 0x4a, 0x9d }, { 0x00, 0x81, 0x02, 0x83, 0x04, 0x85, 0x06, 0x87, 0x08, 0x89, 0x0a, 0x8b, 0x0c, 0x8d, 0x0e, 0x8f }, { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, { 0x00, 0xd8, 0xb0, 0x68, 0x60, 0xb8, 0xd0, 0x08, 0xc0, 0x18, 0x70, 0xa8, 0xa0, 0x78, 0x10, 0xc8 }, { 0x00, 0x81, 0x02, 0x83, 0x04, 0x85, 0x06, 0x87, 0x08, 0x89, 0x0a, 0x8b, 0x0c, 0x8d, 0x0e, 0x8f }, { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, { 0x00, 0xd9, 0xb2, 0x6b, 0x64, 0xbd, 0xd6, 0x0f, 0xc8, 0x11, 0x7a, 0xa3, 0xac, 0x75, 0x1e, 0xc7 }, { 0x00, 0x81, 0x02, 0x83, 0x04, 0x85, 0x06, 0x87, 0x15, 0x94, 0x17, 0x96, 0x11, 0x90, 0x13, 0x92 }, { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, { 0x00, 0xda, 0xb4, 0x6e, 0x68, 0xb2, 0xdc, 0x06, 0xd0, 0x0a, 0x64, 0xbe, 0xb8, 0x62, 0x0c, 0xd6 }, { 0x00, 0x81, 0x02, 0x83, 0x04, 0x85, 0x06, 0x87, 0x15, 0x94, 0x17, 0x96, 0x11, 0x90, 0x13, 0x92 }, { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 
0xa0, 0x10, 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, { 0x00, 0xdb, 0xb6, 0x6d, 0x6c, 0xb7, 0xda, 0x01, 0xd8, 0x03, 0x6e, 0xb5, 0xb4, 0x6f, 0x02, 0xd9 }, { 0x00, 0x81, 0x02, 0x83, 0x19, 0x98, 0x1b, 0x9a, 0x32, 0xb3, 0x30, 0xb1, 0x2b, 0xaa, 0x29, 0xa8 }, { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, { 0x00, 0xdc, 0xb8, 0x64, 0x70, 0xac, 0xc8, 0x14, 0xe0, 0x3c, 0x58, 0x84, 0x90, 0x4c, 0x28, 0xf4 }, { 0x00, 0x81, 0x02, 0x83, 0x19, 0x98, 0x1b, 0x9a, 0x32, 0xb3, 0x30, 0xb1, 0x2b, 0xaa, 0x29, 0xa8 }, { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, { 0x00, 0xdd, 0xba, 0x67, 0x74, 0xa9, 0xce, 0x13, 0xe8, 0x35, 0x52, 0x8f, 0x9c, 0x41, 0x26, 0xfb }, { 0x00, 0x81, 0x02, 0x83, 0x19, 0x98, 0x1b, 0x9a, 0x2f, 0xae, 0x2d, 0xac, 0x36, 0xb7, 0x34, 0xb5 }, { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, { 0x00, 0xde, 0xbc, 0x62, 0x78, 0xa6, 0xc4, 0x1a, 0xf0, 0x2e, 0x4c, 0x92, 0x88, 0x56, 0x34, 0xea }, { 0x00, 0x81, 0x02, 0x83, 0x19, 0x98, 0x1b, 0x9a, 0x2f, 0xae, 0x2d, 0xac, 0x36, 0xb7, 0x34, 0xb5 }, { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, { 0x00, 0xdf, 0xbe, 0x61, 0x7c, 0xa3, 0xc2, 0x1d, 0xf8, 0x27, 0x46, 0x99, 0x84, 0x5b, 0x3a, 0xe5 }, { 0x00, 0xa6, 0x51, 0xf7, 0xa2, 0x04, 0xf3, 0x55, 0x59, 0xff, 0x08, 0xae, 0xfb, 0x5d, 0xaa, 0x0c }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, { 0x00, 0xa6, 0x51, 0xf7, 0xa2, 0x04, 0xf3, 0x55, 0x59, 0xff, 0x08, 0xae, 0xfb, 0x5d, 0xaa, 0x0c }, { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, { 0x00, 0xe1, 0xc2, 0x23, 0x84, 0x65, 0x46, 0xa7, 0x08, 0xe9, 0xca, 0x2b, 0x8c, 0x6d, 0x4e, 0xaf }, { 0x00, 0xa6, 0x51, 0xf7, 0xa2, 0x04, 0xf3, 0x55, 0x44, 0xe2, 0x15, 0xb3, 0xe6, 0x40, 0xb7, 0x11 }, { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, { 0x00, 0xe2, 0xc4, 0x26, 0x88, 0x6a, 0x4c, 0xae, 0x10, 0xf2, 0xd4, 0x36, 0x98, 0x7a, 0x5c, 0xbe }, { 0x00, 0xa6, 0x51, 0xf7, 0xa2, 0x04, 0xf3, 0x55, 0x44, 0xe2, 0x15, 0xb3, 0xe6, 0x40, 0xb7, 0x11 }, { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, { 0x00, 0xe3, 0xc6, 0x25, 0x8c, 0x6f, 0x4a, 0xa9, 0x18, 0xfb, 0xde, 0x3d, 0x94, 0x77, 0x52, 0xb1 }, { 0x00, 0xa6, 0x51, 0xf7, 0xbf, 0x19, 0xee, 0x48, 0x63, 0xc5, 0x32, 0x94, 0xdc, 0x7a, 0x8d, 
0x2b }, { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, { 0x00, 0xe4, 0xc8, 0x2c, 0x90, 0x74, 0x58, 0xbc, 0x20, 0xc4, 0xe8, 0x0c, 0xb0, 0x54, 0x78, 0x9c }, { 0x00, 0xa6, 0x51, 0xf7, 0xbf, 0x19, 0xee, 0x48, 0x63, 0xc5, 0x32, 0x94, 0xdc, 0x7a, 0x8d, 0x2b }, { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, { 0x00, 0xe5, 0xca, 0x2f, 0x94, 0x71, 0x5e, 0xbb, 0x28, 0xcd, 0xe2, 0x07, 0xbc, 0x59, 0x76, 0x93 }, { 0x00, 0xa6, 0x51, 0xf7, 0xbf, 0x19, 0xee, 0x48, 0x7e, 0xd8, 0x2f, 0x89, 0xc1, 0x67, 0x90, 0x36 }, { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, { 0x00, 0xe6, 0xcc, 0x2a, 0x98, 0x7e, 0x54, 0xb2, 0x30, 0xd6, 0xfc, 0x1a, 0xa8, 0x4e, 0x64, 0x82 }, { 0x00, 0xa6, 0x51, 0xf7, 0xbf, 0x19, 0xee, 0x48, 0x7e, 0xd8, 0x2f, 0x89, 0xc1, 0x67, 0x90, 0x36 }, { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, { 0x00, 0xe7, 0xce, 0x29, 0x9c, 0x7b, 0x52, 0xb5, 0x38, 0xdf, 0xf6, 0x11, 0xa4, 0x43, 0x6a, 0x8d }, { 0x00, 0xa6, 0x4c, 0xea, 0x98, 0x3e, 0xd4, 0x72, 0x2d, 0x8b, 0x61, 0xc7, 0xb5, 0x13, 0xf9, 0x5f }, { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, { 0x00, 0xe8, 0xd0, 0x38, 0xa0, 0x48, 0x70, 0x98, 0x40, 0xa8, 0x90, 0x78, 0xe0, 0x08, 0x30, 0xd8 }, { 0x00, 0xa6, 0x4c, 0xea, 0x98, 0x3e, 0xd4, 0x72, 0x2d, 0x8b, 0x61, 0xc7, 0xb5, 0x13, 0xf9, 0x5f }, { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, { 0x00, 0xe9, 0xd2, 0x3b, 0xa4, 0x4d, 0x76, 0x9f, 0x48, 0xa1, 0x9a, 0x73, 0xec, 0x05, 0x3e, 0xd7 }, { 0x00, 0xa6, 0x4c, 0xea, 0x98, 0x3e, 0xd4, 0x72, 0x30, 0x96, 0x7c, 0xda, 0xa8, 0x0e, 0xe4, 0x42 }, { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, { 0x00, 0xea, 0xd4, 0x3e, 0xa8, 0x42, 0x7c, 0x96, 0x50, 0xba, 0x84, 0x6e, 0xf8, 0x12, 0x2c, 0xc6 }, { 0x00, 0xa6, 0x4c, 0xea, 0x98, 0x3e, 0xd4, 0x72, 0x30, 0x96, 0x7c, 0xda, 0xa8, 0x0e, 0xe4, 0x42 }, { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, { 0x00, 0xeb, 0xd6, 0x3d, 0xac, 0x47, 0x7a, 0x91, 0x58, 0xb3, 0x8e, 0x65, 0xf4, 0x1f, 0x22, 0xc9 }, { 0x00, 0xa6, 0x4c, 0xea, 0x85, 0x23, 0xc9, 0x6f, 0x17, 0xb1, 0x5b, 0xfd, 0x92, 0x34, 0xde, 0x78 }, { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, { 0x00, 0xec, 0xd8, 0x34, 0xb0, 0x5c, 0x68, 0x84, 0x60, 0x8c, 0xb8, 0x54, 0xd0, 0x3c, 0x08, 0xe4 }, { 0x00, 0xa6, 0x4c, 0xea, 0x85, 0x23, 0xc9, 
0x6f, 0x17, 0xb1, 0x5b, 0xfd, 0x92, 0x34, 0xde, 0x78 }, { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, { 0x00, 0xed, 0xda, 0x37, 0xb4, 0x59, 0x6e, 0x83, 0x68, 0x85, 0xb2, 0x5f, 0xdc, 0x31, 0x06, 0xeb }, { 0x00, 0xa6, 0x4c, 0xea, 0x85, 0x23, 0xc9, 0x6f, 0x0a, 0xac, 0x46, 0xe0, 0x8f, 0x29, 0xc3, 0x65 }, { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, { 0x00, 0xee, 0xdc, 0x32, 0xb8, 0x56, 0x64, 0x8a, 0x70, 0x9e, 0xac, 0x42, 0xc8, 0x26, 0x14, 0xfa }, { 0x00, 0xa6, 0x4c, 0xea, 0x85, 0x23, 0xc9, 0x6f, 0x0a, 0xac, 0x46, 0xe0, 0x8f, 0x29, 0xc3, 0x65 }, { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, { 0x00, 0xef, 0xde, 0x31, 0xbc, 0x53, 0x62, 0x8d, 0x78, 0x97, 0xa6, 0x49, 0xc4, 0x2b, 0x1a, 0xf5 }, { 0x00, 0xbb, 0x6b, 0xd0, 0xd6, 0x6d, 0xbd, 0x06, 0xb1, 0x0a, 0xda, 0x61, 0x67, 0xdc, 0x0c, 0xb7 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, { 0x00, 0xbb, 0x6b, 0xd0, 0xd6, 0x6d, 0xbd, 0x06, 0xb1, 0x0a, 0xda, 0x61, 0x67, 0xdc, 0x0c, 0xb7 }, { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, { 0x00, 0xf1, 0xe2, 0x13, 0xc4, 0x35, 0x26, 0xd7, 0x88, 0x79, 0x6a, 0x9b, 0x4c, 0xbd, 0xae, 0x5f }, { 0x00, 0xbb, 0x6b, 0xd0, 0xd6, 0x6d, 0xbd, 0x06, 0xac, 0x17, 0xc7, 0x7c, 0x7a, 0xc1, 0x11, 0xaa }, { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, { 0x00, 0xf2, 0xe4, 0x16, 0xc8, 0x3a, 0x2c, 0xde, 0x90, 0x62, 0x74, 0x86, 0x58, 0xaa, 0xbc, 0x4e }, { 0x00, 0xbb, 0x6b, 0xd0, 0xd6, 0x6d, 0xbd, 0x06, 0xac, 0x17, 0xc7, 0x7c, 0x7a, 0xc1, 0x11, 0xaa }, { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, { 0x00, 0xf3, 0xe6, 0x15, 0xcc, 0x3f, 0x2a, 0xd9, 0x98, 0x6b, 0x7e, 0x8d, 0x54, 0xa7, 0xb2, 0x41 }, { 0x00, 0xbb, 0x6b, 0xd0, 0xcb, 0x70, 0xa0, 0x1b, 0x8b, 0x30, 0xe0, 0x5b, 0x40, 0xfb, 0x2b, 0x90 }, { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, { 0x00, 0xf4, 0xe8, 0x1c, 0xd0, 0x24, 0x38, 0xcc, 0xa0, 0x54, 0x48, 0xbc, 0x70, 0x84, 0x98, 0x6c }, { 0x00, 0xbb, 0x6b, 0xd0, 0xcb, 0x70, 0xa0, 0x1b, 0x8b, 0x30, 0xe0, 0x5b, 0x40, 0xfb, 0x2b, 0x90 }, { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, { 0x00, 0xf5, 0xea, 0x1f, 0xd4, 0x21, 0x3e, 0xcb, 0xa8, 0x5d, 0x42, 0xb7, 0x7c, 0x89, 0x96, 0x63 
}, { 0x00, 0xbb, 0x6b, 0xd0, 0xcb, 0x70, 0xa0, 0x1b, 0x96, 0x2d, 0xfd, 0x46, 0x5d, 0xe6, 0x36, 0x8d }, { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, { 0x00, 0xf6, 0xec, 0x1a, 0xd8, 0x2e, 0x34, 0xc2, 0xb0, 0x46, 0x5c, 0xaa, 0x68, 0x9e, 0x84, 0x72 }, { 0x00, 0xbb, 0x6b, 0xd0, 0xcb, 0x70, 0xa0, 0x1b, 0x96, 0x2d, 0xfd, 0x46, 0x5d, 0xe6, 0x36, 0x8d }, { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, { 0x00, 0xf7, 0xee, 0x19, 0xdc, 0x2b, 0x32, 0xc5, 0xb8, 0x4f, 0x56, 0xa1, 0x64, 0x93, 0x8a, 0x7d }, { 0x00, 0xbb, 0x76, 0xcd, 0xec, 0x57, 0x9a, 0x21, 0xc5, 0x7e, 0xb3, 0x08, 0x29, 0x92, 0x5f, 0xe4 }, { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, { 0x00, 0xf8, 0xf0, 0x08, 0xe0, 0x18, 0x10, 0xe8, 0xc0, 0x38, 0x30, 0xc8, 0x20, 0xd8, 0xd0, 0x28 }, { 0x00, 0xbb, 0x76, 0xcd, 0xec, 0x57, 0x9a, 0x21, 0xc5, 0x7e, 0xb3, 0x08, 0x29, 0x92, 0x5f, 0xe4 }, { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, { 0x00, 0xf9, 0xf2, 0x0b, 0xe4, 0x1d, 0x16, 0xef, 0xc8, 0x31, 0x3a, 0xc3, 0x2c, 0xd5, 0xde, 0x27 }, { 0x00, 0xbb, 0x76, 0xcd, 0xec, 0x57, 0x9a, 0x21, 0xd8, 0x63, 0xae, 0x15, 0x34, 0x8f, 0x42, 0xf9 }, { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, { 0x00, 0xfa, 0xf4, 0x0e, 0xe8, 0x12, 0x1c, 0xe6, 0xd0, 0x2a, 0x24, 0xde, 0x38, 0xc2, 0xcc, 0x36 }, { 0x00, 0xbb, 0x76, 0xcd, 0xec, 0x57, 0x9a, 0x21, 0xd8, 0x63, 0xae, 0x15, 0x34, 0x8f, 0x42, 0xf9 }, { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, { 0x00, 0xfb, 0xf6, 0x0d, 0xec, 0x17, 0x1a, 0xe1, 0xd8, 0x23, 0x2e, 0xd5, 0x34, 0xcf, 0xc2, 0x39 }, { 0x00, 0xbb, 0x76, 0xcd, 0xf1, 0x4a, 0x87, 0x3c, 0xff, 0x44, 0x89, 0x32, 0x0e, 0xb5, 0x78, 0xc3 }, { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, { 0x00, 0xfc, 0xf8, 0x04, 0xf0, 0x0c, 0x08, 0xf4, 0xe0, 0x1c, 0x18, 0xe4, 0x10, 0xec, 0xe8, 0x14 }, { 0x00, 0xbb, 0x76, 0xcd, 0xf1, 0x4a, 0x87, 0x3c, 0xff, 0x44, 0x89, 0x32, 0x0e, 0xb5, 0x78, 0xc3 }, { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, { 0x00, 0xfd, 0xfa, 0x07, 0xf4, 0x09, 0x0e, 0xf3, 0xe8, 0x15, 0x12, 0xef, 0x1c, 0xe1, 0xe6, 0x1b }, { 0x00, 0xbb, 0x76, 0xcd, 0xf1, 0x4a, 0x87, 0x3c, 0xe2, 0x59, 0x94, 0x2f, 0x13, 0xa8, 0x65, 0xde }, { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, { 0x00, 0xfe, 0xfc, 0x02, 0xf8, 0x06, 0x04, 0xfa, 
0xf0, 0x0e, 0x0c, 0xf2, 0x08, 0xf6, 0xf4, 0x0a }, { 0x00, 0xbb, 0x76, 0xcd, 0xf1, 0x4a, 0x87, 0x3c, 0xe2, 0x59, 0x94, 0x2f, 0x13, 0xa8, 0x65, 0xde }, { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, { 0x00, 0xff, 0xfe, 0x01, 0xfc, 0x03, 0x02, 0xfd, 0xf8, 0x07, 0x06, 0xf9, 0x04, 0xfb, 0xfa, 0x05 } }; - +/* END CSTYLED */ #endif /* defined(__aarch64__) */ diff --git a/module/zfs/vdev_raidz_math_impl.h b/module/zfs/vdev_raidz_math_impl.h index 0a40677b612d..ea592c0f12da 100644 --- a/module/zfs/vdev_raidz_math_impl.h +++ b/module/zfs/vdev_raidz_math_impl.h @@ -1,1477 +1,1477 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (C) 2016 Gvozden Nešković. All rights reserved. */ #ifndef _VDEV_RAIDZ_MATH_IMPL_H #define _VDEV_RAIDZ_MATH_IMPL_H #include #define raidz_inline inline __attribute__((always_inline)) #ifndef noinline #define noinline __attribute__((noinline)) #endif /* * These functions calculate the multiplication constants for data * reconstruction. The coefficients depend on the RAIDZ geometry, the indexes * of the failed child vdevs, and the parity columns used for reconstruction. * @rm RAIDZ map * @tgtidx array of missing data indexes * @coeff output array of coefficients. The array must be provided by * the caller and must hold at least MUL_CNT values.
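 *
 * Illustrative note (not part of the original comment): for a single missing
 * column x recovered from Q parity alone, the syndrome satisfies
 * Qsyn = 2^(ncols - x - 1) * Dx. Since the multiplicative group of GF(2^8)
 * has order 255, gf_exp2(255 - e) is the inverse of 2^e, which is exactly
 * the constant raidz_rec_q_coeff() below stores in coeff[MUL_Q_X]. A scalar
 * sketch, where qsyn and dx are illustrative names:
 *
 *	dx = gf_mul(qsyn, gf_exp2(255 - (ncols - x - 1)));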
*/ static noinline void raidz_rec_q_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff) { const unsigned ncols = raidz_ncols(rm); const unsigned x = tgtidx[TARGET_X]; coeff[MUL_Q_X] = gf_exp2(255 - (ncols - x - 1)); } static noinline void raidz_rec_r_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff) { const unsigned ncols = raidz_ncols(rm); const unsigned x = tgtidx[TARGET_X]; coeff[MUL_R_X] = gf_exp4(255 - (ncols - x - 1)); } static noinline void raidz_rec_pq_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff) { const unsigned ncols = raidz_ncols(rm); const unsigned x = tgtidx[TARGET_X]; const unsigned y = tgtidx[TARGET_Y]; gf_t a, b, e; a = gf_exp2(x + 255 - y); b = gf_exp2(255 - (ncols - x - 1)); e = a ^ 0x01; coeff[MUL_PQ_X] = gf_div(a, e); coeff[MUL_PQ_Y] = gf_div(b, e); } static noinline void raidz_rec_pr_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff) { const unsigned ncols = raidz_ncols(rm); const unsigned x = tgtidx[TARGET_X]; const unsigned y = tgtidx[TARGET_Y]; gf_t a, b, e; a = gf_exp4(x + 255 - y); b = gf_exp4(255 - (ncols - x - 1)); e = a ^ 0x01; coeff[MUL_PR_X] = gf_div(a, e); coeff[MUL_PR_Y] = gf_div(b, e); } static noinline void raidz_rec_qr_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff) { const unsigned ncols = raidz_ncols(rm); const unsigned x = tgtidx[TARGET_X]; const unsigned y = tgtidx[TARGET_Y]; gf_t nx, ny, nxxy, nxyy, d; nx = gf_exp2(ncols - x - 1); ny = gf_exp2(ncols - y - 1); nxxy = gf_mul(gf_mul(nx, nx), ny); nxyy = gf_mul(gf_mul(nx, ny), ny); d = nxxy ^ nxyy; coeff[MUL_QR_XQ] = ny; coeff[MUL_QR_X] = gf_div(ny, d); coeff[MUL_QR_YQ] = nx; coeff[MUL_QR_Y] = gf_div(nx, d); } static noinline void raidz_rec_pqr_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff) { const unsigned ncols = raidz_ncols(rm); const unsigned x = tgtidx[TARGET_X]; const unsigned y = tgtidx[TARGET_Y]; const unsigned z = tgtidx[TARGET_Z]; gf_t nx, ny, nz, nxx, nyy, nzz, nyyz, nyzz, xd, yd; nx = gf_exp2(ncols - x - 1); ny = gf_exp2(ncols - y - 1); nz = gf_exp2(ncols - z - 1); nxx = gf_exp4(ncols - x - 1); nyy = gf_exp4(ncols - y - 1); nzz = gf_exp4(ncols - z - 1); nyyz = gf_mul(gf_mul(ny, nz), ny); nyzz = gf_mul(nzz, ny); xd = gf_mul(nxx, ny) ^ gf_mul(nx, nyy) ^ nyyz ^ gf_mul(nxx, nz) ^ gf_mul(nzz, nx) ^ nyzz; yd = gf_inv(ny ^ nz); coeff[MUL_PQR_XP] = gf_div(nyyz ^ nyzz, xd); coeff[MUL_PQR_XQ] = gf_div(nyy ^ nzz, xd); coeff[MUL_PQR_XR] = gf_div(ny ^ nz, xd); coeff[MUL_PQR_YU] = nx; coeff[MUL_PQR_YP] = gf_mul(nz, yd); coeff[MUL_PQR_YQ] = yd; } /* * Method for zeroing a buffer (can be implemented using SIMD). * This method is used by multiple gen/rec functions. * * @dc Destination buffer * @dsize Destination buffer size * @private Unused */ static int raidz_zero_abd_cb(void *dc, size_t dsize, void *private) { v_t *dst = (v_t *)dc; size_t i; ZERO_DEFINE(); (void) private; /* unused */ ZERO(ZERO_D); for (i = 0; i < dsize / sizeof (v_t); i += (2 * ZERO_STRIDE)) { STORE(dst + i, ZERO_D); STORE(dst + i + ZERO_STRIDE, ZERO_D); } return (0); } #define raidz_zero(dabd, size) \ { \ abd_iterate_func(dabd, 0, size, raidz_zero_abd_cb, NULL); \ } /* * Method for copying one buffer to another (can be implemented using SIMD). * This method is used by multiple gen/rec functions.
* * @dc Destination buffer * @sc Source buffer * @dsize Destination buffer size * @ssize Source buffer size * @private Unused */ static int raidz_copy_abd_cb(void *dc, void *sc, size_t size, void *private) { v_t *dst = (v_t *)dc; const v_t *src = (v_t *)sc; size_t i; COPY_DEFINE(); (void) private; /* unused */ for (i = 0; i < size / sizeof (v_t); i += (2 * COPY_STRIDE)) { LOAD(src + i, COPY_D); STORE(dst + i, COPY_D); LOAD(src + i + COPY_STRIDE, COPY_D); STORE(dst + i + COPY_STRIDE, COPY_D); } return (0); } #define raidz_copy(dabd, sabd, size) \ { \ abd_iterate_func2(dabd, sabd, 0, 0, size, raidz_copy_abd_cb, NULL);\ } /* * Method for adding (XORing) two buffers. * Source and destination are XORed together and the result is stored in the * destination buffer. This method is used by multiple gen/rec functions. * * @dc Destination buffer * @sc Source buffer * @dsize Destination buffer size * @ssize Source buffer size * @private Unused */ static int raidz_add_abd_cb(void *dc, void *sc, size_t size, void *private) { v_t *dst = (v_t *)dc; const v_t *src = (v_t *)sc; size_t i; ADD_DEFINE(); (void) private; /* unused */ for (i = 0; i < size / sizeof (v_t); i += (2 * ADD_STRIDE)) { LOAD(dst + i, ADD_D); XOR_ACC(src + i, ADD_D); STORE(dst + i, ADD_D); LOAD(dst + i + ADD_STRIDE, ADD_D); XOR_ACC(src + i + ADD_STRIDE, ADD_D); STORE(dst + i + ADD_STRIDE, ADD_D); } return (0); } #define raidz_add(dabd, sabd, size) \ { \ abd_iterate_func2(dabd, sabd, 0, 0, size, raidz_add_abd_cb, NULL);\ } /* * Method for multiplying a buffer with a constant in GF(2^8). * Symbols from the buffer are multiplied by a constant and the result is * stored back in the same buffer. * * @dc In/Out data buffer. * @size Size of the buffer * @private pointer to the multiplication constant (unsigned) */ static int raidz_mul_abd_cb(void *dc, size_t size, void *private) { const unsigned mul = *((unsigned *)private); v_t *d = (v_t *)dc; size_t i; MUL_DEFINE(); for (i = 0; i < size / sizeof (v_t); i += (2 * MUL_STRIDE)) { LOAD(d + i, MUL_D); MUL(mul, MUL_D); STORE(d + i, MUL_D); LOAD(d + i + MUL_STRIDE, MUL_D); MUL(mul, MUL_D); STORE(d + i + MUL_STRIDE, MUL_D); } return (0); } /* * Syndrome generation/update macros * * Require LOAD(), XOR(), STORE(), MUL2(), and MUL4() macros */ #define P_D_SYNDROME(D, T, t) \ { \ LOAD((t), T); \ XOR(D, T); \ STORE((t), T); \ } #define Q_D_SYNDROME(D, T, t) \ { \ LOAD((t), T); \ MUL2(T); \ XOR(D, T); \ STORE((t), T); \ } #define Q_SYNDROME(T, t) \ { \ LOAD((t), T); \ MUL2(T); \ STORE((t), T); \ } #define R_D_SYNDROME(D, T, t) \ { \ LOAD((t), T); \ MUL4(T); \ XOR(D, T); \ STORE((t), T); \ } #define R_SYNDROME(T, t) \ { \ LOAD((t), T); \ MUL4(T); \ STORE((t), T); \ } /* * PARITY CALCULATION * * Macros *_SYNDROME are used for parity/syndrome calculation. * *_D_SYNDROME() macros are used to calculate the syndrome between 0 and the * length of the data column, and *_SYNDROME() macros are only for updating * the parity/syndrome if the data column is shorter. * * P parity is calculated using raidz_add_abd().
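 *
 * Scalar sketch of what the macros above compute per byte (illustrative
 * only, not part of the patch; gf_mul2() is a hypothetical helper for
 * multiplication by 2 in GF(2^8) with the 0x11d polynomial, i.e. the same
 * 0x1d reduction the SIMD MUL2() implementations use):
 *
 *	static inline uint8_t gf_mul2(uint8_t a)
 *	{
 *		return ((a << 1) ^ ((a & 0x80) ? 0x1d : 0x00));
 *	}
 *
 *	p[i] ^= d[i];				// P_D_SYNDROME
 *	q[i] = gf_mul2(q[i]) ^ d[i];		// Q_D_SYNDROME
 *	r[i] = gf_mul2(gf_mul2(r[i])) ^ d[i];	// R_D_SYNDROME (MUL4)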
*/ /* * Generate P parity (RAIDZ1) * * @rm RAIDZ map */ static raidz_inline void raidz_generate_p_impl(raidz_map_t * const rm) { size_t c; const size_t ncols = raidz_ncols(rm); const size_t psize = rm->rm_col[CODE_P].rc_size; abd_t *pabd = rm->rm_col[CODE_P].rc_abd; size_t size; abd_t *dabd; raidz_math_begin(); /* start with first data column */ raidz_copy(pabd, rm->rm_col[1].rc_abd, psize); for (c = 2; c < ncols; c++) { dabd = rm->rm_col[c].rc_abd; size = rm->rm_col[c].rc_size; /* add data column */ raidz_add(pabd, dabd, size); } raidz_math_end(); } /* * Generate PQ parity (RAIDZ2) * The function is called per data column. * * @c array of pointers to parity (code) columns * @dc pointer to data column * @csize size of parity columns * @dsize size of data column */ static void raidz_gen_pq_add(void **c, const void *dc, const size_t csize, - const size_t dsize) + const size_t dsize) { v_t *p = (v_t *)c[0]; v_t *q = (v_t *)c[1]; const v_t *d = (v_t *)dc; const v_t * const dend = d + (dsize / sizeof (v_t)); const v_t * const qend = q + (csize / sizeof (v_t)); GEN_PQ_DEFINE(); MUL2_SETUP(); for (; d < dend; d += GEN_PQ_STRIDE, p += GEN_PQ_STRIDE, q += GEN_PQ_STRIDE) { LOAD(d, GEN_PQ_D); P_D_SYNDROME(GEN_PQ_D, GEN_PQ_C, p); Q_D_SYNDROME(GEN_PQ_D, GEN_PQ_C, q); } for (; q < qend; q += GEN_PQ_STRIDE) { Q_SYNDROME(GEN_PQ_C, q); } } /* * Generate PQ parity (RAIDZ2) * * @rm RAIDZ map */ static raidz_inline void raidz_generate_pq_impl(raidz_map_t * const rm) { size_t c; const size_t ncols = raidz_ncols(rm); const size_t csize = rm->rm_col[CODE_P].rc_size; size_t dsize; abd_t *dabd; abd_t *cabds[] = { rm->rm_col[CODE_P].rc_abd, rm->rm_col[CODE_Q].rc_abd }; raidz_math_begin(); raidz_copy(cabds[CODE_P], rm->rm_col[2].rc_abd, csize); raidz_copy(cabds[CODE_Q], rm->rm_col[2].rc_abd, csize); for (c = 3; c < ncols; c++) { dabd = rm->rm_col[c].rc_abd; dsize = rm->rm_col[c].rc_size; abd_raidz_gen_iterate(cabds, dabd, csize, dsize, 2, raidz_gen_pq_add); } raidz_math_end(); } /* * Generate PQR parity (RAIDZ3) * The function is called per data column. 
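 *
 * Illustrative note (not part of the original comment): per-column
 * generation is a Horner scheme, so when a data column is shorter than the
 * parity columns (dsize < csize), the trailing loop must still advance the
 * running Q and R values for the remaining rows, equivalent to folding in
 * a zero data byte. In scalar terms, using the gf_mul2() helper sketched
 * earlier:
 *
 *	q[i] = gf_mul2(q[i]);			// Q_SYNDROME
 *	r[i] = gf_mul2(gf_mul2(r[i]));		// R_SYNDROME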
* * @c array of pointers to parity (code) columns * @dc pointer to data column * @csize size of parity columns * @dsize size of data column */ static void raidz_gen_pqr_add(void **c, const void *dc, const size_t csize, - const size_t dsize) + const size_t dsize) { v_t *p = (v_t *)c[0]; v_t *q = (v_t *)c[1]; v_t *r = (v_t *)c[CODE_R]; const v_t *d = (v_t *)dc; const v_t * const dend = d + (dsize / sizeof (v_t)); const v_t * const qend = q + (csize / sizeof (v_t)); GEN_PQR_DEFINE(); MUL2_SETUP(); for (; d < dend; d += GEN_PQR_STRIDE, p += GEN_PQR_STRIDE, q += GEN_PQR_STRIDE, r += GEN_PQR_STRIDE) { LOAD(d, GEN_PQR_D); P_D_SYNDROME(GEN_PQR_D, GEN_PQR_C, p); Q_D_SYNDROME(GEN_PQR_D, GEN_PQR_C, q); R_D_SYNDROME(GEN_PQR_D, GEN_PQR_C, r); } for (; q < qend; q += GEN_PQR_STRIDE, r += GEN_PQR_STRIDE) { Q_SYNDROME(GEN_PQR_C, q); R_SYNDROME(GEN_PQR_C, r); } } /* * Generate PQR parity (RAIDZ3) * * @rm RAIDZ map */ static raidz_inline void raidz_generate_pqr_impl(raidz_map_t * const rm) { size_t c; const size_t ncols = raidz_ncols(rm); const size_t csize = rm->rm_col[CODE_P].rc_size; size_t dsize; abd_t *dabd; abd_t *cabds[] = { rm->rm_col[CODE_P].rc_abd, rm->rm_col[CODE_Q].rc_abd, rm->rm_col[CODE_R].rc_abd }; raidz_math_begin(); raidz_copy(cabds[CODE_P], rm->rm_col[3].rc_abd, csize); raidz_copy(cabds[CODE_Q], rm->rm_col[3].rc_abd, csize); raidz_copy(cabds[CODE_R], rm->rm_col[3].rc_abd, csize); for (c = 4; c < ncols; c++) { dabd = rm->rm_col[c].rc_abd; dsize = rm->rm_col[c].rc_size; abd_raidz_gen_iterate(cabds, dabd, csize, dsize, 3, raidz_gen_pqr_add); } raidz_math_end(); } /* * DATA RECONSTRUCTION * * The data reconstruction process consists of two phases: * - Syndrome calculation * - Data reconstruction * * The syndrome is calculated by generating parity using the available data * columns and zeros in the places of erasure. The existing parity is added to * the corresponding syndrome value to obtain the [P|Q|R]syn values from the * equations: * P = Psyn + Dx + Dy + Dz * Q = Qsyn + 2^x * Dx + 2^y * Dy + 2^z * Dz * R = Rsyn + 4^x * Dx + 4^y * Dy + 4^z * Dz * * For the data reconstruction phase, the corresponding equations are solved * for the missing data (Dx, Dy, Dz). This generally involves multiplying known * symbols by a coefficient and adding them together. The multiplication * constant coefficients are calculated ahead of the operation in the * raidz_rec_[q|r|pq|pr|qr|pqr]_coeff() functions. * * IMPLEMENTATION NOTE: a RAID-Z block can have a complex geometry, with "big" * and "short" columns. * For this reason, reconstruction is performed in a minimum of * two steps. First, from offset 0 to short_size, then from short_size to * big_size. Calculation functions REC_[*]_BLOCK() are implemented to work * over both ranges. The split also enables removal of conditional expressions * from loop bodies, improving throughput of SIMD implementations. * For the best performance, all functions marked with the raidz_inline * attribute must be inlined by the compiler.
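 *
 * For example (sketch only, function names are illustrative): syndromes
 * would be accumulated in two passes,
 *
 *	calc_syndrome(0, short_size);		// all columns contribute data
 *	calc_syndrome(short_size, big_size);	// short columns: parity update only
 *
 * so neither pass needs a per-row bounds check, as shown in the diagram
 * below.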
* * parity data * columns columns * <----------> <------------------> * x y <----+ missing columns (x, y) * | | * +---+---+---+---+-v-+---+-v-+---+ ^ 0 * | | | | | | | | | | * | | | | | | | | | | * | P | Q | R | D | D | D | D | D | | * | | | | 0 | 1 | 2 | 3 | 4 | | * | | | | | | | | | v * | | | | | +---+---+---+ ^ short_size * | | | | | | | * +---+---+---+---+---+ v big_size * <------------------> <----------> * big columns short columns * */ /* * Reconstruct single data column using P parity * * @syn_method raidz_add_abd() * @rec_method not applicable * * @rm RAIDZ map * @tgtidx array of missing data indexes */ static raidz_inline int raidz_reconstruct_p_impl(raidz_map_t *rm, const int *tgtidx) { size_t c; const size_t firstdc = raidz_parity(rm); const size_t ncols = raidz_ncols(rm); const size_t x = tgtidx[TARGET_X]; const size_t xsize = rm->rm_col[x].rc_size; abd_t *xabd = rm->rm_col[x].rc_abd; size_t size; abd_t *dabd; raidz_math_begin(); /* copy P into target */ raidz_copy(xabd, rm->rm_col[CODE_P].rc_abd, xsize); /* generate p_syndrome */ for (c = firstdc; c < ncols; c++) { if (c == x) continue; dabd = rm->rm_col[c].rc_abd; size = MIN(rm->rm_col[c].rc_size, xsize); raidz_add(xabd, dabd, size); } raidz_math_end(); return (1 << CODE_P); } /* * Generate Q syndrome (Qsyn) * * @xc array of pointers to syndrome columns * @dc data column (NULL if missing) * @xsize size of syndrome columns * @dsize size of data column (0 if missing) */ static void raidz_syn_q_abd(void **xc, const void *dc, const size_t xsize, - const size_t dsize) + const size_t dsize) { v_t *x = (v_t *)xc[TARGET_X]; const v_t *d = (v_t *)dc; const v_t * const dend = d + (dsize / sizeof (v_t)); const v_t * const xend = x + (xsize / sizeof (v_t)); SYN_Q_DEFINE(); MUL2_SETUP(); for (; d < dend; d += SYN_STRIDE, x += SYN_STRIDE) { LOAD(d, SYN_Q_D); Q_D_SYNDROME(SYN_Q_D, SYN_Q_X, x); } for (; x < xend; x += SYN_STRIDE) { Q_SYNDROME(SYN_Q_X, x); } } /* * Reconstruct single data column using Q parity * * @syn_method raidz_add_abd() * @rec_method raidz_mul_abd_cb() * * @rm RAIDZ map * @tgtidx array of missing data indexes */ static raidz_inline int raidz_reconstruct_q_impl(raidz_map_t *rm, const int *tgtidx) { size_t c; size_t dsize; abd_t *dabd; const size_t firstdc = raidz_parity(rm); const size_t ncols = raidz_ncols(rm); const size_t x = tgtidx[TARGET_X]; abd_t *xabd = rm->rm_col[x].rc_abd; const size_t xsize = rm->rm_col[x].rc_size; abd_t *tabds[] = { xabd }; unsigned coeff[MUL_CNT]; raidz_rec_q_coeff(rm, tgtidx, coeff); raidz_math_begin(); /* Start with first data column if present */ if (firstdc != x) { raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize); } else { raidz_zero(xabd, xsize); } /* generate q_syndrome */ for (c = firstdc+1; c < ncols; c++) { if (c == x) { dabd = NULL; dsize = 0; } else { dabd = rm->rm_col[c].rc_abd; dsize = rm->rm_col[c].rc_size; } abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 1, raidz_syn_q_abd); } /* add Q to the syndrome */ raidz_add(xabd, rm->rm_col[CODE_Q].rc_abd, xsize); /* transform the syndrome */ abd_iterate_func(xabd, 0, xsize, raidz_mul_abd_cb, (void*) coeff); raidz_math_end(); return (1 << CODE_Q); } /* * Generate R syndrome (Rsyn) * * @xc array of pointers to syndrome columns * @dc data column (NULL if missing) * @tsize size of syndrome columns * @dsize size of data column (0 if missing) */ static void raidz_syn_r_abd(void **xc, const void *dc, const size_t tsize, - const size_t dsize) + const size_t dsize) { v_t *x = (v_t *)xc[TARGET_X]; const v_t *d = (v_t *)dc; const v_t * const 
dend = d + (dsize / sizeof (v_t)); const v_t * const xend = x + (tsize / sizeof (v_t)); SYN_R_DEFINE(); MUL2_SETUP(); for (; d < dend; d += SYN_STRIDE, x += SYN_STRIDE) { LOAD(d, SYN_R_D); R_D_SYNDROME(SYN_R_D, SYN_R_X, x); } for (; x < xend; x += SYN_STRIDE) { R_SYNDROME(SYN_R_X, x); } } /* * Reconstruct single data column using R parity * * @syn_method raidz_add_abd() * @rec_method raidz_mul_abd_cb() * * @rm RAIDZ map * @tgtidx array of missing data indexes */ static raidz_inline int raidz_reconstruct_r_impl(raidz_map_t *rm, const int *tgtidx) { size_t c; size_t dsize; abd_t *dabd; const size_t firstdc = raidz_parity(rm); const size_t ncols = raidz_ncols(rm); const size_t x = tgtidx[TARGET_X]; const size_t xsize = rm->rm_col[x].rc_size; abd_t *xabd = rm->rm_col[x].rc_abd; abd_t *tabds[] = { xabd }; unsigned coeff[MUL_CNT]; raidz_rec_r_coeff(rm, tgtidx, coeff); raidz_math_begin(); /* Start with first data column if present */ if (firstdc != x) { raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize); } else { raidz_zero(xabd, xsize); } /* generate r_syndrome */ for (c = firstdc+1; c < ncols; c++) { if (c == x) { dabd = NULL; dsize = 0; } else { dabd = rm->rm_col[c].rc_abd; dsize = rm->rm_col[c].rc_size; } abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 1, raidz_syn_r_abd); } /* add R to the syndrome */ raidz_add(xabd, rm->rm_col[CODE_R].rc_abd, xsize); /* transform the syndrome */ abd_iterate_func(xabd, 0, xsize, raidz_mul_abd_cb, (void *)coeff); raidz_math_end(); return (1 << CODE_R); } /* * Generate P and Q syndromes * * @tc array of pointers to syndrome columns * @dc data column (NULL if missing) * @tsize size of syndrome columns * @dsize size of data column (0 if missing) */ static void raidz_syn_pq_abd(void **tc, const void *dc, const size_t tsize, - const size_t dsize) + const size_t dsize) { v_t *x = (v_t *)tc[TARGET_X]; v_t *y = (v_t *)tc[TARGET_Y]; const v_t *d = (v_t *)dc; const v_t * const dend = d + (dsize / sizeof (v_t)); const v_t * const yend = y + (tsize / sizeof (v_t)); SYN_PQ_DEFINE(); MUL2_SETUP(); for (; d < dend; d += SYN_STRIDE, x += SYN_STRIDE, y += SYN_STRIDE) { LOAD(d, SYN_PQ_D); P_D_SYNDROME(SYN_PQ_D, SYN_PQ_X, x); Q_D_SYNDROME(SYN_PQ_D, SYN_PQ_X, y); } for (; y < yend; y += SYN_STRIDE) { Q_SYNDROME(SYN_PQ_X, y); } } /* * Reconstruct data using PQ parity and PQ syndromes * * @tc syndrome/result columns * @tsize size of syndrome/result columns * @c parity columns * @mul array of multiplication constants */ static void raidz_rec_pq_abd(void **tc, const size_t tsize, void **c, - const unsigned *mul) + const unsigned *mul) { v_t *x = (v_t *)tc[TARGET_X]; v_t *y = (v_t *)tc[TARGET_Y]; const v_t * const xend = x + (tsize / sizeof (v_t)); const v_t *p = (v_t *)c[CODE_P]; const v_t *q = (v_t *)c[CODE_Q]; REC_PQ_DEFINE(); for (; x < xend; x += REC_PQ_STRIDE, y += REC_PQ_STRIDE, p += REC_PQ_STRIDE, q += REC_PQ_STRIDE) { LOAD(x, REC_PQ_X); LOAD(y, REC_PQ_Y); XOR_ACC(p, REC_PQ_X); XOR_ACC(q, REC_PQ_Y); /* Save Pxy */ COPY(REC_PQ_X, REC_PQ_T); /* Calc X */ MUL(mul[MUL_PQ_X], REC_PQ_X); MUL(mul[MUL_PQ_Y], REC_PQ_Y); XOR(REC_PQ_Y, REC_PQ_X); STORE(x, REC_PQ_X); /* Calc Y */ XOR(REC_PQ_T, REC_PQ_X); STORE(y, REC_PQ_X); } } /* * Reconstruct two data columns using PQ parity * * @syn_method raidz_syn_pq_abd() * @rec_method raidz_rec_pq_abd() * * @rm RAIDZ map * @tgtidx array of missing data indexes */ static raidz_inline int raidz_reconstruct_pq_impl(raidz_map_t *rm, const int *tgtidx) { size_t c; size_t dsize; abd_t *dabd; const size_t firstdc = raidz_parity(rm); const size_t ncols
= raidz_ncols(rm); const size_t x = tgtidx[TARGET_X]; const size_t y = tgtidx[TARGET_Y]; const size_t xsize = rm->rm_col[x].rc_size; const size_t ysize = rm->rm_col[y].rc_size; abd_t *xabd = rm->rm_col[x].rc_abd; abd_t *yabd = rm->rm_col[y].rc_abd; abd_t *tabds[2] = { xabd, yabd }; abd_t *cabds[] = { rm->rm_col[CODE_P].rc_abd, rm->rm_col[CODE_Q].rc_abd }; unsigned coeff[MUL_CNT]; raidz_rec_pq_coeff(rm, tgtidx, coeff); /* * Check if one of the targets is shorter than the other. * In this case, the shorter target needs to be replaced with a * new buffer so that the syndrome can be calculated. */ if (ysize < xsize) { yabd = abd_alloc(xsize, B_FALSE); tabds[1] = yabd; } raidz_math_begin(); /* Start with first data column if present */ if (firstdc != x) { raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize); raidz_copy(yabd, rm->rm_col[firstdc].rc_abd, xsize); } else { raidz_zero(xabd, xsize); raidz_zero(yabd, xsize); } /* generate pq_syndrome */ for (c = firstdc+1; c < ncols; c++) { if (c == x || c == y) { dabd = NULL; dsize = 0; } else { dabd = rm->rm_col[c].rc_abd; dsize = rm->rm_col[c].rc_size; } abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 2, raidz_syn_pq_abd); } abd_raidz_rec_iterate(cabds, tabds, xsize, 2, raidz_rec_pq_abd, coeff); /* Copy shorter targets back to the original abd buffer */ if (ysize < xsize) raidz_copy(rm->rm_col[y].rc_abd, yabd, ysize); raidz_math_end(); if (ysize < xsize) abd_free(yabd); return ((1 << CODE_P) | (1 << CODE_Q)); } /* * Generate P and R syndromes * * @c array of pointers to syndrome columns * @dc data column (NULL if missing) * @tsize size of syndrome columns * @dsize size of data column (0 if missing) */ static void raidz_syn_pr_abd(void **c, const void *dc, const size_t tsize, - const size_t dsize) + const size_t dsize) { v_t *x = (v_t *)c[TARGET_X]; v_t *y = (v_t *)c[TARGET_Y]; const v_t *d = (v_t *)dc; const v_t * const dend = d + (dsize / sizeof (v_t)); const v_t * const yend = y + (tsize / sizeof (v_t)); SYN_PR_DEFINE(); MUL2_SETUP(); for (; d < dend; d += SYN_STRIDE, x += SYN_STRIDE, y += SYN_STRIDE) { LOAD(d, SYN_PR_D); P_D_SYNDROME(SYN_PR_D, SYN_PR_X, x); R_D_SYNDROME(SYN_PR_D, SYN_PR_X, y); } for (; y < yend; y += SYN_STRIDE) { R_SYNDROME(SYN_PR_X, y); } } /* * Reconstruct data using PR parity and PR syndromes * * @tc syndrome/result columns * @tsize size of syndrome/result columns * @c parity columns * @mul array of multiplication constants */ static void raidz_rec_pr_abd(void **t, const size_t tsize, void **c, - const unsigned *mul) + const unsigned *mul) { v_t *x = (v_t *)t[TARGET_X]; v_t *y = (v_t *)t[TARGET_Y]; const v_t * const xend = x + (tsize / sizeof (v_t)); const v_t *p = (v_t *)c[CODE_P]; const v_t *q = (v_t *)c[CODE_Q]; REC_PR_DEFINE(); for (; x < xend; x += REC_PR_STRIDE, y += REC_PR_STRIDE, p += REC_PR_STRIDE, q += REC_PR_STRIDE) { LOAD(x, REC_PR_X); LOAD(y, REC_PR_Y); XOR_ACC(p, REC_PR_X); XOR_ACC(q, REC_PR_Y); /* Save Pxy */ COPY(REC_PR_X, REC_PR_T); /* Calc X */ MUL(mul[MUL_PR_X], REC_PR_X); MUL(mul[MUL_PR_Y], REC_PR_Y); XOR(REC_PR_Y, REC_PR_X); STORE(x, REC_PR_X); /* Calc Y */ XOR(REC_PR_T, REC_PR_X); STORE(y, REC_PR_X); } } /* * Reconstruct two data columns using PR parity * * @syn_method raidz_syn_pr_abd() * @rec_method raidz_rec_pr_abd() * * @rm RAIDZ map * @tgtidx array of missing data indexes */ static raidz_inline int raidz_reconstruct_pr_impl(raidz_map_t *rm, const int *tgtidx) { size_t c; size_t dsize; abd_t *dabd; const size_t firstdc = raidz_parity(rm); const size_t ncols = raidz_ncols(rm); const size_t x = tgtidx[0]; const size_t
y = tgtidx[1]; const size_t xsize = rm->rm_col[x].rc_size; const size_t ysize = rm->rm_col[y].rc_size; abd_t *xabd = rm->rm_col[x].rc_abd; abd_t *yabd = rm->rm_col[y].rc_abd; abd_t *tabds[2] = { xabd, yabd }; abd_t *cabds[] = { rm->rm_col[CODE_P].rc_abd, rm->rm_col[CODE_R].rc_abd }; unsigned coeff[MUL_CNT]; raidz_rec_pr_coeff(rm, tgtidx, coeff); /* * Check if some of the targets are shorter than the others. * They need to be replaced with a new buffer so that the syndrome can * be calculated over the full length. */ if (ysize < xsize) { yabd = abd_alloc(xsize, B_FALSE); tabds[1] = yabd; } raidz_math_begin(); /* Start with first data column if present */ if (firstdc != x) { raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize); raidz_copy(yabd, rm->rm_col[firstdc].rc_abd, xsize); } else { raidz_zero(xabd, xsize); raidz_zero(yabd, xsize); } /* generate pr_syndrome */ for (c = firstdc+1; c < ncols; c++) { if (c == x || c == y) { dabd = NULL; dsize = 0; } else { dabd = rm->rm_col[c].rc_abd; dsize = rm->rm_col[c].rc_size; } abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 2, raidz_syn_pr_abd); } abd_raidz_rec_iterate(cabds, tabds, xsize, 2, raidz_rec_pr_abd, coeff); /* * Copy shorter targets back to the original abd buffer */ if (ysize < xsize) raidz_copy(rm->rm_col[y].rc_abd, yabd, ysize); raidz_math_end(); if (ysize < xsize) abd_free(yabd); return ((1 << CODE_P) | (1 << CODE_R)); } /* * Generate Q and R syndromes * * @c array of pointers to syndrome columns * @dc data column (NULL if missing) * @tsize size of syndrome columns * @dsize size of data column (0 if missing) */ static void raidz_syn_qr_abd(void **c, const void *dc, const size_t tsize, - const size_t dsize) + const size_t dsize) { v_t *x = (v_t *)c[TARGET_X]; v_t *y = (v_t *)c[TARGET_Y]; const v_t * const xend = x + (tsize / sizeof (v_t)); const v_t *d = (v_t *)dc; const v_t * const dend = d + (dsize / sizeof (v_t)); SYN_QR_DEFINE(); MUL2_SETUP(); for (; d < dend; d += SYN_STRIDE, x += SYN_STRIDE, y += SYN_STRIDE) { LOAD(d, SYN_QR_D); Q_D_SYNDROME(SYN_QR_D, SYN_QR_X, x); R_D_SYNDROME(SYN_QR_D, SYN_QR_X, y); } for (; x < xend; x += SYN_STRIDE, y += SYN_STRIDE) { Q_SYNDROME(SYN_QR_X, x); R_SYNDROME(SYN_QR_X, y); } } /* * Reconstruct data using QR parity and QR syndromes * * @tc syndrome/result columns * @tsize size of syndrome/result columns * @c parity columns * @mul array of multiplication constants */ static void raidz_rec_qr_abd(void **t, const size_t tsize, void **c, - const unsigned *mul) + const unsigned *mul) { v_t *x = (v_t *)t[TARGET_X]; v_t *y = (v_t *)t[TARGET_Y]; const v_t * const xend = x + (tsize / sizeof (v_t)); const v_t *p = (v_t *)c[CODE_P]; const v_t *q = (v_t *)c[CODE_Q]; REC_QR_DEFINE(); for (; x < xend; x += REC_QR_STRIDE, y += REC_QR_STRIDE, p += REC_QR_STRIDE, q += REC_QR_STRIDE) { LOAD(x, REC_QR_X); LOAD(y, REC_QR_Y); XOR_ACC(p, REC_QR_X); XOR_ACC(q, REC_QR_Y); /* Save Qxy */ COPY(REC_QR_X, REC_QR_T); /* Calc X */ MUL(mul[MUL_QR_XQ], REC_QR_X); /* X = Q * xqm */ XOR(REC_QR_Y, REC_QR_X); /* X = R ^ X */ MUL(mul[MUL_QR_X], REC_QR_X); /* X = X * xm */ STORE(x, REC_QR_X); /* Calc Y */ MUL(mul[MUL_QR_YQ], REC_QR_T); /* Y = Q * yqm */ XOR(REC_QR_Y, REC_QR_T); /* Y = R ^ Y */ MUL(mul[MUL_QR_Y], REC_QR_T); /* Y = Y * ym */ STORE(y, REC_QR_T); } } /* * Reconstruct two data columns using QR parity * * @syn_method raidz_syn_qr_abd() * @rec_method raidz_rec_qr_abd() * * @rm RAIDZ map * @tgtidx array of missing data indexes */ static raidz_inline int raidz_reconstruct_qr_impl(raidz_map_t *rm, const int *tgtidx) { size_t c; size_t dsize;
abd_t *dabd; const size_t firstdc = raidz_parity(rm); const size_t ncols = raidz_ncols(rm); const size_t x = tgtidx[TARGET_X]; const size_t y = tgtidx[TARGET_Y]; const size_t xsize = rm->rm_col[x].rc_size; const size_t ysize = rm->rm_col[y].rc_size; abd_t *xabd = rm->rm_col[x].rc_abd; abd_t *yabd = rm->rm_col[y].rc_abd; abd_t *tabds[2] = { xabd, yabd }; abd_t *cabds[] = { rm->rm_col[CODE_Q].rc_abd, rm->rm_col[CODE_R].rc_abd }; unsigned coeff[MUL_CNT]; raidz_rec_qr_coeff(rm, tgtidx, coeff); /* * Check if one of the targets is shorter than the other. * In this case, the shorter target needs to be replaced with a * new buffer so that the syndrome can be calculated. */ if (ysize < xsize) { yabd = abd_alloc(xsize, B_FALSE); tabds[1] = yabd; } raidz_math_begin(); /* Start with first data column if present */ if (firstdc != x) { raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize); raidz_copy(yabd, rm->rm_col[firstdc].rc_abd, xsize); } else { raidz_zero(xabd, xsize); raidz_zero(yabd, xsize); } /* generate qr_syndrome */ for (c = firstdc+1; c < ncols; c++) { if (c == x || c == y) { dabd = NULL; dsize = 0; } else { dabd = rm->rm_col[c].rc_abd; dsize = rm->rm_col[c].rc_size; } abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 2, raidz_syn_qr_abd); } abd_raidz_rec_iterate(cabds, tabds, xsize, 2, raidz_rec_qr_abd, coeff); /* * Copy shorter targets back to the original abd buffer */ if (ysize < xsize) raidz_copy(rm->rm_col[y].rc_abd, yabd, ysize); raidz_math_end(); if (ysize < xsize) abd_free(yabd); return ((1 << CODE_Q) | (1 << CODE_R)); } /* * Generate P, Q, and R syndromes * * @c array of pointers to syndrome columns * @dc data column (NULL if missing) * @tsize size of syndrome columns * @dsize size of data column (0 if missing) */ static void raidz_syn_pqr_abd(void **c, const void *dc, const size_t tsize, - const size_t dsize) + const size_t dsize) { v_t *x = (v_t *)c[TARGET_X]; v_t *y = (v_t *)c[TARGET_Y]; v_t *z = (v_t *)c[TARGET_Z]; const v_t * const yend = y + (tsize / sizeof (v_t)); const v_t *d = (v_t *)dc; const v_t * const dend = d + (dsize / sizeof (v_t)); SYN_PQR_DEFINE(); MUL2_SETUP(); for (; d < dend; d += SYN_STRIDE, x += SYN_STRIDE, y += SYN_STRIDE, z += SYN_STRIDE) { LOAD(d, SYN_PQR_D); P_D_SYNDROME(SYN_PQR_D, SYN_PQR_X, x); Q_D_SYNDROME(SYN_PQR_D, SYN_PQR_X, y); R_D_SYNDROME(SYN_PQR_D, SYN_PQR_X, z); } for (; y < yend; y += SYN_STRIDE, z += SYN_STRIDE) { Q_SYNDROME(SYN_PQR_X, y); R_SYNDROME(SYN_PQR_X, z); } } /* * Reconstruct data using PQR parity and PQR syndromes * * @tc syndrome/result columns * @tsize size of syndrome/result columns * @c parity columns * @mul array of multiplication constants */ static void raidz_rec_pqr_abd(void **t, const size_t tsize, void **c, - const unsigned * const mul) + const unsigned * const mul) { v_t *x = (v_t *)t[TARGET_X]; v_t *y = (v_t *)t[TARGET_Y]; v_t *z = (v_t *)t[TARGET_Z]; const v_t * const xend = x + (tsize / sizeof (v_t)); const v_t *p = (v_t *)c[CODE_P]; const v_t *q = (v_t *)c[CODE_Q]; const v_t *r = (v_t *)c[CODE_R]; REC_PQR_DEFINE(); for (; x < xend; x += REC_PQR_STRIDE, y += REC_PQR_STRIDE, z += REC_PQR_STRIDE, p += REC_PQR_STRIDE, q += REC_PQR_STRIDE, r += REC_PQR_STRIDE) { LOAD(x, REC_PQR_X); LOAD(y, REC_PQR_Y); LOAD(z, REC_PQR_Z); XOR_ACC(p, REC_PQR_X); XOR_ACC(q, REC_PQR_Y); XOR_ACC(r, REC_PQR_Z); /* Save Pxyz and Qxyz */ COPY(REC_PQR_X, REC_PQR_XS); COPY(REC_PQR_Y, REC_PQR_YS); /* Calc X */ MUL(mul[MUL_PQR_XP], REC_PQR_X); /* Xp = Pxyz * xp */ MUL(mul[MUL_PQR_XQ], REC_PQR_Y); /* Xq = Qxyz * xq */ XOR(REC_PQR_Y, REC_PQR_X); MUL(mul[MUL_PQR_XR],
REC_PQR_Z); /* Xr = Rxyz * xr */ XOR(REC_PQR_Z, REC_PQR_X); /* X = Xp + Xq + Xr */ STORE(x, REC_PQR_X); /* Calc Y */ XOR(REC_PQR_X, REC_PQR_XS); /* Pyz = Pxyz + X */ MUL(mul[MUL_PQR_YU], REC_PQR_X); /* Xq = X * upd_q */ XOR(REC_PQR_X, REC_PQR_YS); /* Qyz = Qxyz + Xq */ COPY(REC_PQR_XS, REC_PQR_X); /* restore Pyz */ MUL(mul[MUL_PQR_YP], REC_PQR_X); /* Yp = Pyz * yp */ MUL(mul[MUL_PQR_YQ], REC_PQR_YS); /* Yq = Qyz * yq */ XOR(REC_PQR_X, REC_PQR_YS); /* Y = Yp + Yq */ STORE(y, REC_PQR_YS); /* Calc Z */ XOR(REC_PQR_XS, REC_PQR_YS); /* Z = Pz = Pyz + Y */ STORE(z, REC_PQR_YS); } } /* * Reconstruct three data columns using PQR parity * * @syn_method raidz_syn_pqr_abd() * @rec_method raidz_rec_pqr_abd() * * @rm RAIDZ map * @tgtidx array of missing data indexes */ static raidz_inline int raidz_reconstruct_pqr_impl(raidz_map_t *rm, const int *tgtidx) { size_t c; size_t dsize; abd_t *dabd; const size_t firstdc = raidz_parity(rm); const size_t ncols = raidz_ncols(rm); const size_t x = tgtidx[TARGET_X]; const size_t y = tgtidx[TARGET_Y]; const size_t z = tgtidx[TARGET_Z]; const size_t xsize = rm->rm_col[x].rc_size; const size_t ysize = rm->rm_col[y].rc_size; const size_t zsize = rm->rm_col[z].rc_size; abd_t *xabd = rm->rm_col[x].rc_abd; abd_t *yabd = rm->rm_col[y].rc_abd; abd_t *zabd = rm->rm_col[z].rc_abd; abd_t *tabds[] = { xabd, yabd, zabd }; abd_t *cabds[] = { rm->rm_col[CODE_P].rc_abd, rm->rm_col[CODE_Q].rc_abd, rm->rm_col[CODE_R].rc_abd }; unsigned coeff[MUL_CNT]; raidz_rec_pqr_coeff(rm, tgtidx, coeff); /* * Check if some of the targets are shorter than the others. * In this case, the shorter targets need to be replaced with * new buffers so that the syndromes can be calculated. */ if (ysize < xsize) { yabd = abd_alloc(xsize, B_FALSE); tabds[1] = yabd; } if (zsize < xsize) { zabd = abd_alloc(xsize, B_FALSE); tabds[2] = zabd; } raidz_math_begin(); /* Start with first data column if present */ if (firstdc != x) { raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize); raidz_copy(yabd, rm->rm_col[firstdc].rc_abd, xsize); raidz_copy(zabd, rm->rm_col[firstdc].rc_abd, xsize); } else { raidz_zero(xabd, xsize); raidz_zero(yabd, xsize); raidz_zero(zabd, xsize); } /* generate pqr_syndrome */ for (c = firstdc+1; c < ncols; c++) { if (c == x || c == y || c == z) { dabd = NULL; dsize = 0; } else { dabd = rm->rm_col[c].rc_abd; dsize = rm->rm_col[c].rc_size; } abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 3, raidz_syn_pqr_abd); } abd_raidz_rec_iterate(cabds, tabds, xsize, 3, raidz_rec_pqr_abd, coeff); /* * Copy shorter targets back to the original abd buffer */ if (ysize < xsize) raidz_copy(rm->rm_col[y].rc_abd, yabd, ysize); if (zsize < xsize) raidz_copy(rm->rm_col[z].rc_abd, zabd, zsize); raidz_math_end(); if (ysize < xsize) abd_free(yabd); if (zsize < xsize) abd_free(zabd); return ((1 << CODE_P) | (1 << CODE_Q) | (1 << CODE_R)); } #endif /* _VDEV_RAIDZ_MATH_IMPL_H */ diff --git a/module/zfs/vdev_raidz_math_ssse3.c b/module/zfs/vdev_raidz_math_ssse3.c index cebb0fe2b15d..a015baab2d83 100644 --- a/module/zfs/vdev_raidz_math_ssse3.c +++ b/module/zfs/vdev_raidz_math_ssse3.c @@ -1,2474 +1,2475 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License.
* * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (C) 2016 Gvozden Nešković. All rights reserved. */ #include #if defined(__x86_64) && defined(HAVE_SSSE3) #include #include #define __asm __asm__ __volatile__ #define _REG_CNT(_0, _1, _2, _3, _4, _5, _6, _7, N, ...) N #define REG_CNT(r...) _REG_CNT(r, 8, 7, 6, 5, 4, 3, 2, 1) #define VR0_(REG, ...) "xmm"#REG #define VR1_(_1, REG, ...) "xmm"#REG #define VR2_(_1, _2, REG, ...) "xmm"#REG #define VR3_(_1, _2, _3, REG, ...) "xmm"#REG #define VR4_(_1, _2, _3, _4, REG, ...) "xmm"#REG #define VR5_(_1, _2, _3, _4, _5, REG, ...) "xmm"#REG #define VR6_(_1, _2, _3, _4, _5, _6, REG, ...) "xmm"#REG #define VR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) "xmm"#REG #define VR0(r...) VR0_(r) #define VR1(r...) VR1_(r) #define VR2(r...) VR2_(r, 1) #define VR3(r...) VR3_(r, 1, 2) #define VR4(r...) VR4_(r, 1, 2) #define VR5(r...) VR5_(r, 1, 2, 3) #define VR6(r...) VR6_(r, 1, 2, 3, 4) #define VR7(r...) VR7_(r, 1, 2, 3, 4, 5) #define R_01(REG1, REG2, ...) REG1, REG2 #define _R_23(_0, _1, REG2, REG3, ...) REG2, REG3 #define R_23(REG...) _R_23(REG, 1, 2, 3) #define ASM_BUG() ASSERT(0) const uint8_t gf_clmul_mod_lt[4*256][16]; #define ELEM_SIZE 16 typedef struct v { uint8_t b[ELEM_SIZE] __attribute__((aligned(ELEM_SIZE))); } v_t; #define XOR_ACC(src, r...) \ { \ switch (REG_CNT(r)) { \ case 4: \ __asm( \ "pxor 0x00(%[SRC]), %%" VR0(r) "\n" \ "pxor 0x10(%[SRC]), %%" VR1(r) "\n" \ "pxor 0x20(%[SRC]), %%" VR2(r) "\n" \ "pxor 0x30(%[SRC]), %%" VR3(r) "\n" \ : : [SRC] "r" (src)); \ break; \ case 2: \ __asm( \ "pxor 0x00(%[SRC]), %%" VR0(r) "\n" \ "pxor 0x10(%[SRC]), %%" VR1(r) "\n" \ : : [SRC] "r" (src)); \ break; \ default: \ ASM_BUG(); \ } \ } #define XOR(r...) \ { \ switch (REG_CNT(r)) { \ case 8: \ __asm( \ "pxor %" VR0(r) ", %" VR4(r) "\n" \ "pxor %" VR1(r) ", %" VR5(r) "\n" \ "pxor %" VR2(r) ", %" VR6(r) "\n" \ "pxor %" VR3(r) ", %" VR7(r)); \ break; \ case 4: \ __asm( \ "pxor %" VR0(r) ", %" VR2(r) "\n" \ "pxor %" VR1(r) ", %" VR3(r)); \ break; \ default: \ ASM_BUG(); \ } \ } #define ZERO(r...) XOR(r, r) #define COPY(r...) \ { \ switch (REG_CNT(r)) { \ case 8: \ __asm( \ "movdqa %" VR0(r) ", %" VR4(r) "\n" \ "movdqa %" VR1(r) ", %" VR5(r) "\n" \ "movdqa %" VR2(r) ", %" VR6(r) "\n" \ "movdqa %" VR3(r) ", %" VR7(r)); \ break; \ case 4: \ __asm( \ "movdqa %" VR0(r) ", %" VR2(r) "\n" \ "movdqa %" VR1(r) ", %" VR3(r)); \ break; \ default: \ ASM_BUG(); \ } \ } #define LOAD(src, r...) \ { \ switch (REG_CNT(r)) { \ case 4: \ __asm( \ "movdqa 0x00(%[SRC]), %%" VR0(r) "\n" \ "movdqa 0x10(%[SRC]), %%" VR1(r) "\n" \ "movdqa 0x20(%[SRC]), %%" VR2(r) "\n" \ "movdqa 0x30(%[SRC]), %%" VR3(r) "\n" \ : : [SRC] "r" (src)); \ break; \ case 2: \ __asm( \ "movdqa 0x00(%[SRC]), %%" VR0(r) "\n" \ "movdqa 0x10(%[SRC]), %%" VR1(r) "\n" \ : : [SRC] "r" (src)); \ break; \ default: \ ASM_BUG(); \ } \ } #define STORE(dst, r...)
\ { \ switch (REG_CNT(r)) { \ case 4: \ __asm( \ "movdqa %%" VR0(r)", 0x00(%[DST])\n" \ "movdqa %%" VR1(r)", 0x10(%[DST])\n" \ "movdqa %%" VR2(r)", 0x20(%[DST])\n" \ "movdqa %%" VR3(r)", 0x30(%[DST])\n" \ : : [DST] "r" (dst)); \ break; \ case 2: \ __asm( \ "movdqa %%" VR0(r)", 0x00(%[DST])\n" \ "movdqa %%" VR1(r)", 0x10(%[DST])\n" \ : : [DST] "r" (dst)); \ break; \ default: \ ASM_BUG(); \ } \ } #define MUL2_SETUP() \ { \ __asm( \ "movd %[mask], %%xmm15\n" \ "pshufd $0x0, %%xmm15, %%xmm15\n" \ : : [mask] "r" (0x1d1d1d1d)); \ } #define _MUL2_x2(r...) \ { \ switch (REG_CNT(r)) { \ case 2: \ __asm( \ "pxor %xmm14, %xmm14\n" \ "pxor %xmm13, %xmm13\n" \ "pcmpgtb %" VR0(r)", %xmm14\n" \ "pcmpgtb %" VR1(r)", %xmm13\n" \ "pand %xmm15, %xmm14\n" \ "pand %xmm15, %xmm13\n" \ "paddb %" VR0(r)", %" VR0(r) "\n" \ "paddb %" VR1(r)", %" VR1(r) "\n" \ "pxor %xmm14, %" VR0(r) "\n" \ "pxor %xmm13, %" VR1(r)); \ break; \ default: \ ASM_BUG(); \ } \ } #define MUL2(r...) \ { \ switch (REG_CNT(r)) { \ case 4: \ _MUL2_x2(R_01(r)); \ _MUL2_x2(R_23(r)); \ break; \ case 2: \ _MUL2_x2(r); \ break; \ default: \ ASM_BUG(); \ } \ } #define MUL4(r...) \ { \ MUL2(r); \ MUL2(r); \ } #define _0f "xmm15" #define _a_save "xmm14" #define _b_save "xmm13" #define _lt_mod_a "xmm12" #define _lt_clmul_a "xmm11" #define _lt_mod_b "xmm10" #define _lt_clmul_b "xmm15" #define _MULx2(c, r...) \ { \ switch (REG_CNT(r)) { \ case 2: \ __asm( \ /* lts for upper part */ \ "movd %[mask], %%" _0f "\n" \ "pshufd $0x0, %%" _0f ", %%" _0f "\n" \ "movdqa 0x00(%[lt]), %%" _lt_mod_a "\n" \ "movdqa 0x10(%[lt]), %%" _lt_clmul_a "\n" \ /* upper part */ \ "movdqa %%" VR0(r) ", %%" _a_save "\n" \ "movdqa %%" VR1(r) ", %%" _b_save "\n" \ "psraw $0x4, %%" VR0(r) "\n" \ "psraw $0x4, %%" VR1(r) "\n" \ "pand %%" _0f ", %%" _a_save "\n" \ "pand %%" _0f ", %%" _b_save "\n" \ "pand %%" _0f ", %%" VR0(r) "\n" \ "pand %%" _0f ", %%" VR1(r) "\n" \ \ "movdqa %%" _lt_mod_a ", %%" _lt_mod_b "\n" \ "movdqa %%" _lt_clmul_a ", %%" _lt_clmul_b "\n" \ \ "pshufb %%" VR0(r) ",%%" _lt_mod_a "\n" \ "pshufb %%" VR1(r) ",%%" _lt_mod_b "\n" \ "pshufb %%" VR0(r) ",%%" _lt_clmul_a "\n" \ "pshufb %%" VR1(r) ",%%" _lt_clmul_b "\n" \ \ "pxor %%" _lt_mod_a ",%%" _lt_clmul_a "\n" \ "pxor %%" _lt_mod_b ",%%" _lt_clmul_b "\n" \ "movdqa %%" _lt_clmul_a ",%%" VR0(r) "\n" \ "movdqa %%" _lt_clmul_b ",%%" VR1(r) "\n" \ /* lts for lower part */ \ "movdqa 0x20(%[lt]), %%" _lt_mod_a "\n" \ "movdqa 0x30(%[lt]), %%" _lt_clmul_a "\n" \ "movdqa %%" _lt_mod_a ", %%" _lt_mod_b "\n" \ "movdqa %%" _lt_clmul_a ", %%" _lt_clmul_b "\n" \ /* lower part */ \ "pshufb %%" _a_save ",%%" _lt_mod_a "\n" \ "pshufb %%" _b_save ",%%" _lt_mod_b "\n" \ "pshufb %%" _a_save ",%%" _lt_clmul_a "\n" \ "pshufb %%" _b_save ",%%" _lt_clmul_b "\n" \ \ "pxor %%" _lt_mod_a ",%%" VR0(r) "\n" \ "pxor %%" _lt_mod_b ",%%" VR1(r) "\n" \ "pxor %%" _lt_clmul_a ",%%" VR0(r) "\n" \ "pxor %%" _lt_clmul_b ",%%" VR1(r) "\n" \ : : [mask] "r" (0x0f0f0f0f), \ [lt] "r" (gf_clmul_mod_lt[4*(c)])); \ break; \ default: \ ASM_BUG(); \ } \ } #define MUL(c, r...) 
\ { \ switch (REG_CNT(r)) { \ case 4: \ _MULx2(c, R_23(r)); \ _MULx2(c, R_01(r)); \ break; \ case 2: \ _MULx2(c, R_01(r)); \ break; \ default: \ ASM_BUG(); \ } \ } #define raidz_math_begin() kfpu_begin() #define raidz_math_end() kfpu_end() #define SYN_STRIDE 4 #define ZERO_STRIDE 4 #define ZERO_DEFINE() {} #define ZERO_D 0, 1, 2, 3 #define COPY_STRIDE 4 #define COPY_DEFINE() {} #define COPY_D 0, 1, 2, 3 #define ADD_STRIDE 4 #define ADD_DEFINE() {} #define ADD_D 0, 1, 2, 3 #define MUL_STRIDE 4 #define MUL_DEFINE() {} #define MUL_D 0, 1, 2, 3 #define GEN_P_STRIDE 4 #define GEN_P_DEFINE() {} #define GEN_P_P 0, 1, 2, 3 #define GEN_PQ_STRIDE 4 #define GEN_PQ_DEFINE() {} #define GEN_PQ_D 0, 1, 2, 3 #define GEN_PQ_C 4, 5, 6, 7 #define GEN_PQR_STRIDE 4 #define GEN_PQR_DEFINE() {} #define GEN_PQR_D 0, 1, 2, 3 #define GEN_PQR_C 4, 5, 6, 7 #define SYN_Q_DEFINE() {} #define SYN_Q_D 0, 1, 2, 3 #define SYN_Q_X 4, 5, 6, 7 #define SYN_R_DEFINE() {} #define SYN_R_D 0, 1, 2, 3 #define SYN_R_X 4, 5, 6, 7 #define SYN_PQ_DEFINE() {} #define SYN_PQ_D 0, 1, 2, 3 #define SYN_PQ_X 4, 5, 6, 7 #define REC_PQ_STRIDE 2 #define REC_PQ_DEFINE() {} #define REC_PQ_X 0, 1 #define REC_PQ_Y 2, 3 #define REC_PQ_T 4, 5 #define SYN_PR_DEFINE() {} #define SYN_PR_D 0, 1, 2, 3 #define SYN_PR_X 4, 5, 6, 7 #define REC_PR_STRIDE 2 #define REC_PR_DEFINE() {} #define REC_PR_X 0, 1 #define REC_PR_Y 2, 3 #define REC_PR_T 4, 5 #define SYN_QR_DEFINE() {} #define SYN_QR_D 0, 1, 2, 3 #define SYN_QR_X 4, 5, 6, 7 #define REC_QR_STRIDE 2 #define REC_QR_DEFINE() {} #define REC_QR_X 0, 1 #define REC_QR_Y 2, 3 #define REC_QR_T 4, 5 #define SYN_PQR_DEFINE() {} #define SYN_PQR_D 0, 1, 2, 3 #define SYN_PQR_X 4, 5, 6, 7 #define REC_PQR_STRIDE 2 #define REC_PQR_DEFINE() {} #define REC_PQR_X 0, 1 #define REC_PQR_Y 2, 3 #define REC_PQR_Z 4, 5 #define REC_PQR_XS 6, 7 #define REC_PQR_YS 8, 9 #include #include "vdev_raidz_math_impl.h" DEFINE_GEN_METHODS(ssse3); DEFINE_REC_METHODS(ssse3); static boolean_t raidz_will_ssse3_work(void) { return (zfs_sse_available() && zfs_sse2_available() && zfs_ssse3_available()); } const raidz_impl_ops_t vdev_raidz_ssse3_impl = { .init = NULL, .fini = NULL, .gen = RAIDZ_GEN_METHODS(ssse3), .rec = RAIDZ_REC_METHODS(ssse3), .is_supported = &raidz_will_ssse3_work, .name = "ssse3" }; #endif /* defined(__x86_64) && defined(HAVE_SSSE3) */ #if defined(__x86_64) #if defined(HAVE_SSSE3) || defined(HAVE_AVX2) || defined(HAVE_AVX512BW) - +/* BEGIN CSTYLED */ const uint8_t -__attribute__((aligned(256))) gf_clmul_mod_lt[4*256][16] = { +__attribute__((aligned(256))) gf_clmul_mod_lt[4*256][16] = +{ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, 0x00, 
0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e, 0x10, 0x12, 0x14, 0x16, 0x18, 0x1a, 0x1c, 0x1e }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x03, 0x06, 0x05, 0x0c, 0x0f, 0x0a, 0x09, 0x18, 0x1b, 0x1e, 0x1d, 0x14, 0x17, 0x12, 0x11 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x04, 0x08, 0x0c, 0x10, 0x14, 0x18, 0x1c, 0x20, 0x24, 0x28, 0x2c, 0x30, 0x34, 0x38, 0x3c }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x05, 0x0a, 0x0f, 0x14, 0x11, 0x1e, 0x1b, 0x28, 0x2d, 0x22, 0x27, 0x3c, 0x39, 0x36, 0x33 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x06, 0x0c, 0x0a, 0x18, 0x1e, 0x14, 0x12, 0x30, 0x36, 0x3c, 0x3a, 0x28, 0x2e, 0x24, 0x22 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x07, 0x0e, 0x09, 0x1c, 0x1b, 0x12, 0x15, 0x38, 0x3f, 0x36, 0x31, 0x24, 0x23, 0x2a, 0x2d }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x08, 0x10, 0x18, 0x20, 0x28, 0x30, 0x38, 0x40, 0x48, 0x50, 0x58, 0x60, 0x68, 0x70, 0x78 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x09, 0x12, 0x1b, 0x24, 0x2d, 0x36, 0x3f, 0x48, 0x41, 0x5a, 0x53, 0x6c, 0x65, 0x7e, 0x77 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x0a, 0x14, 0x1e, 0x28, 0x22, 0x3c, 0x36, 0x50, 0x5a, 0x44, 0x4e, 0x78, 0x72, 0x6c, 0x66 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, { 0x00, 
0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x0b, 0x16, 0x1d, 0x2c, 0x27, 0x3a, 0x31, 0x58, 0x53, 0x4e, 0x45, 0x74, 0x7f, 0x62, 0x69 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x0c, 0x18, 0x14, 0x30, 0x3c, 0x28, 0x24, 0x60, 0x6c, 0x78, 0x74, 0x50, 0x5c, 0x48, 0x44 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x0d, 0x1a, 0x17, 0x34, 0x39, 0x2e, 0x23, 0x68, 0x65, 0x72, 0x7f, 0x5c, 0x51, 0x46, 0x4b }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x0e, 0x1c, 0x12, 0x38, 0x36, 0x24, 0x2a, 0x70, 0x7e, 0x6c, 0x62, 0x48, 0x46, 0x54, 0x5a }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x0f, 0x1e, 0x11, 0x3c, 0x33, 0x22, 0x2d, 0x78, 0x77, 0x66, 0x69, 0x44, 0x4b, 0x5a, 0x55 }, { 0x00, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53, 0xe8, 0xf5, 0xd2, 0xcf, 0x9c, 0x81, 0xa6, 0xbb }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, { 0x00, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53, 0xe8, 0xf5, 0xd2, 0xcf, 0x9c, 0x81, 0xa6, 0xbb }, { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff }, { 0x00, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53, 0xf5, 0xe8, 0xcf, 0xd2, 0x81, 0x9c, 0xbb, 0xa6 }, { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x12, 0x24, 0x36, 0x48, 0x5a, 0x6c, 0x7e, 0x90, 0x82, 0xb4, 0xa6, 0xd8, 0xca, 0xfc, 0xee }, { 0x00, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53, 0xf5, 0xe8, 0xcf, 0xd2, 0x81, 0x9c, 0xbb, 0xa6 }, { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x13, 0x26, 0x35, 0x4c, 0x5f, 0x6a, 0x79, 0x98, 0x8b, 0xbe, 0xad, 0xd4, 0xc7, 0xf2, 0xe1 }, { 0x00, 0x1d, 0x3a, 0x27, 0x69, 0x74, 0x53, 0x4e, 0xd2, 0xcf, 
0xe8, 0xf5, 0xbb, 0xa6, 0x81, 0x9c }, { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x14, 0x28, 0x3c, 0x50, 0x44, 0x78, 0x6c, 0xa0, 0xb4, 0x88, 0x9c, 0xf0, 0xe4, 0xd8, 0xcc }, { 0x00, 0x1d, 0x3a, 0x27, 0x69, 0x74, 0x53, 0x4e, 0xd2, 0xcf, 0xe8, 0xf5, 0xbb, 0xa6, 0x81, 0x9c }, { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x15, 0x2a, 0x3f, 0x54, 0x41, 0x7e, 0x6b, 0xa8, 0xbd, 0x82, 0x97, 0xfc, 0xe9, 0xd6, 0xc3 }, { 0x00, 0x1d, 0x3a, 0x27, 0x69, 0x74, 0x53, 0x4e, 0xcf, 0xd2, 0xf5, 0xe8, 0xa6, 0xbb, 0x9c, 0x81 }, { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x16, 0x2c, 0x3a, 0x58, 0x4e, 0x74, 0x62, 0xb0, 0xa6, 0x9c, 0x8a, 0xe8, 0xfe, 0xc4, 0xd2 }, { 0x00, 0x1d, 0x3a, 0x27, 0x69, 0x74, 0x53, 0x4e, 0xcf, 0xd2, 0xf5, 0xe8, 0xa6, 0xbb, 0x9c, 0x81 }, { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x17, 0x2e, 0x39, 0x5c, 0x4b, 0x72, 0x65, 0xb8, 0xaf, 0x96, 0x81, 0xe4, 0xf3, 0xca, 0xdd }, { 0x00, 0x1d, 0x27, 0x3a, 0x4e, 0x53, 0x69, 0x74, 0x9c, 0x81, 0xbb, 0xa6, 0xd2, 0xcf, 0xf5, 0xe8 }, { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x18, 0x30, 0x28, 0x60, 0x78, 0x50, 0x48, 0xc0, 0xd8, 0xf0, 0xe8, 0xa0, 0xb8, 0x90, 0x88 }, { 0x00, 0x1d, 0x27, 0x3a, 0x4e, 0x53, 0x69, 0x74, 0x9c, 0x81, 0xbb, 0xa6, 0xd2, 0xcf, 0xf5, 0xe8 }, { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x19, 0x32, 0x2b, 0x64, 0x7d, 0x56, 0x4f, 0xc8, 0xd1, 0xfa, 0xe3, 0xac, 0xb5, 0x9e, 0x87 }, { 0x00, 0x1d, 0x27, 0x3a, 0x4e, 0x53, 0x69, 0x74, 0x81, 0x9c, 0xa6, 0xbb, 0xcf, 0xd2, 0xe8, 0xf5 }, { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x1a, 0x34, 0x2e, 0x68, 0x72, 0x5c, 0x46, 0xd0, 0xca, 0xe4, 0xfe, 0xb8, 0xa2, 0x8c, 0x96 }, { 0x00, 0x1d, 0x27, 0x3a, 0x4e, 0x53, 0x69, 0x74, 0x81, 0x9c, 0xa6, 0xbb, 0xcf, 0xd2, 0xe8, 0xf5 }, { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x1b, 0x36, 0x2d, 0x6c, 0x77, 0x5a, 0x41, 0xd8, 0xc3, 0xee, 0xf5, 0xb4, 0xaf, 0x82, 0x99 }, { 0x00, 0x1d, 0x27, 0x3a, 0x53, 0x4e, 0x74, 0x69, 0xa6, 0xbb, 0x81, 0x9c, 0xf5, 0xe8, 0xd2, 0xcf }, { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x1c, 0x38, 0x24, 0x70, 0x6c, 0x48, 0x54, 0xe0, 0xfc, 0xd8, 0xc4, 0x90, 0x8c, 0xa8, 0xb4 }, { 0x00, 0x1d, 
0x27, 0x3a, 0x53, 0x4e, 0x74, 0x69, 0xa6, 0xbb, 0x81, 0x9c, 0xf5, 0xe8, 0xd2, 0xcf }, { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53, 0xe8, 0xf5, 0xd2, 0xcf, 0x9c, 0x81, 0xa6, 0xbb }, { 0x00, 0x1d, 0x27, 0x3a, 0x53, 0x4e, 0x74, 0x69, 0xbb, 0xa6, 0x9c, 0x81, 0xe8, 0xf5, 0xcf, 0xd2 }, { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x1e, 0x3c, 0x22, 0x78, 0x66, 0x44, 0x5a, 0xf0, 0xee, 0xcc, 0xd2, 0x88, 0x96, 0xb4, 0xaa }, { 0x00, 0x1d, 0x27, 0x3a, 0x53, 0x4e, 0x74, 0x69, 0xbb, 0xa6, 0x9c, 0x81, 0xe8, 0xf5, 0xcf, 0xd2 }, { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x1f, 0x3e, 0x21, 0x7c, 0x63, 0x42, 0x5d, 0xf8, 0xe7, 0xc6, 0xd9, 0x84, 0x9b, 0xba, 0xa5 }, { 0x00, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6, 0xcd, 0xf7, 0xb9, 0x83, 0x25, 0x1f, 0x51, 0x6b }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, { 0x00, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6, 0xcd, 0xf7, 0xb9, 0x83, 0x25, 0x1f, 0x51, 0x6b }, { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, { 0x00, 0x21, 0x42, 0x63, 0x84, 0xa5, 0xc6, 0xe7, 0x08, 0x29, 0x4a, 0x6b, 0x8c, 0xad, 0xce, 0xef }, { 0x00, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6, 0xd0, 0xea, 0xa4, 0x9e, 0x38, 0x02, 0x4c, 0x76 }, { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, { 0x00, 0x22, 0x44, 0x66, 0x88, 0xaa, 0xcc, 0xee, 0x10, 0x32, 0x54, 0x76, 0x98, 0xba, 0xdc, 0xfe }, { 0x00, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6, 0xd0, 0xea, 0xa4, 0x9e, 0x38, 0x02, 0x4c, 0x76 }, { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, { 0x00, 0x23, 0x46, 0x65, 0x8c, 0xaf, 0xca, 0xe9, 0x18, 0x3b, 0x5e, 0x7d, 0x94, 0xb7, 0xd2, 0xf1 }, { 0x00, 0x3a, 0x74, 0x4e, 0xf5, 0xcf, 0x81, 0xbb, 0xf7, 0xcd, 0x83, 0xb9, 0x02, 0x38, 0x76, 0x4c }, { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, { 0x00, 0x24, 0x48, 0x6c, 0x90, 0xb4, 0xd8, 0xfc, 0x20, 0x04, 0x68, 0x4c, 0xb0, 0x94, 0xf8, 0xdc }, { 0x00, 0x3a, 0x74, 0x4e, 0xf5, 0xcf, 0x81, 0xbb, 0xf7, 0xcd, 0x83, 0xb9, 0x02, 0x38, 0x76, 0x4c }, { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, { 0x00, 0x25, 0x4a, 0x6f, 0x94, 0xb1, 0xde, 0xfb, 0x28, 0x0d, 0x62, 
0x47, 0xbc, 0x99, 0xf6, 0xd3 }, { 0x00, 0x3a, 0x74, 0x4e, 0xf5, 0xcf, 0x81, 0xbb, 0xea, 0xd0, 0x9e, 0xa4, 0x1f, 0x25, 0x6b, 0x51 }, { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, { 0x00, 0x26, 0x4c, 0x6a, 0x98, 0xbe, 0xd4, 0xf2, 0x30, 0x16, 0x7c, 0x5a, 0xa8, 0x8e, 0xe4, 0xc2 }, { 0x00, 0x3a, 0x74, 0x4e, 0xf5, 0xcf, 0x81, 0xbb, 0xea, 0xd0, 0x9e, 0xa4, 0x1f, 0x25, 0x6b, 0x51 }, { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, { 0x00, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5, 0x38, 0x1f, 0x76, 0x51, 0xa4, 0x83, 0xea, 0xcd }, { 0x00, 0x3a, 0x69, 0x53, 0xd2, 0xe8, 0xbb, 0x81, 0xb9, 0x83, 0xd0, 0xea, 0x6b, 0x51, 0x02, 0x38 }, { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, { 0x00, 0x28, 0x50, 0x78, 0xa0, 0x88, 0xf0, 0xd8, 0x40, 0x68, 0x10, 0x38, 0xe0, 0xc8, 0xb0, 0x98 }, { 0x00, 0x3a, 0x69, 0x53, 0xd2, 0xe8, 0xbb, 0x81, 0xb9, 0x83, 0xd0, 0xea, 0x6b, 0x51, 0x02, 0x38 }, { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, { 0x00, 0x29, 0x52, 0x7b, 0xa4, 0x8d, 0xf6, 0xdf, 0x48, 0x61, 0x1a, 0x33, 0xec, 0xc5, 0xbe, 0x97 }, { 0x00, 0x3a, 0x69, 0x53, 0xd2, 0xe8, 0xbb, 0x81, 0xa4, 0x9e, 0xcd, 0xf7, 0x76, 0x4c, 0x1f, 0x25 }, { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, { 0x00, 0x2a, 0x54, 0x7e, 0xa8, 0x82, 0xfc, 0xd6, 0x50, 0x7a, 0x04, 0x2e, 0xf8, 0xd2, 0xac, 0x86 }, { 0x00, 0x3a, 0x69, 0x53, 0xd2, 0xe8, 0xbb, 0x81, 0xa4, 0x9e, 0xcd, 0xf7, 0x76, 0x4c, 0x1f, 0x25 }, { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, { 0x00, 0x2b, 0x56, 0x7d, 0xac, 0x87, 0xfa, 0xd1, 0x58, 0x73, 0x0e, 0x25, 0xf4, 0xdf, 0xa2, 0x89 }, { 0x00, 0x3a, 0x69, 0x53, 0xcf, 0xf5, 0xa6, 0x9c, 0x83, 0xb9, 0xea, 0xd0, 0x4c, 0x76, 0x25, 0x1f }, { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, { 0x00, 0x2c, 0x58, 0x74, 0xb0, 0x9c, 0xe8, 0xc4, 0x60, 0x4c, 0x38, 0x14, 0xd0, 0xfc, 0x88, 0xa4 }, { 0x00, 0x3a, 0x69, 0x53, 0xcf, 0xf5, 0xa6, 0x9c, 0x83, 0xb9, 0xea, 0xd0, 0x4c, 0x76, 0x25, 0x1f }, { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, { 0x00, 0x2d, 0x5a, 0x77, 0xb4, 0x99, 0xee, 0xc3, 0x68, 0x45, 0x32, 0x1f, 0xdc, 0xf1, 0x86, 0xab }, { 0x00, 0x3a, 0x69, 0x53, 0xcf, 0xf5, 0xa6, 0x9c, 0x9e, 0xa4, 0xf7, 0xcd, 0x51, 0x6b, 0x38, 0x02 }, { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, { 0x00, 0x2e, 0x5c, 
0x72, 0xb8, 0x96, 0xe4, 0xca, 0x70, 0x5e, 0x2c, 0x02, 0xc8, 0xe6, 0x94, 0xba }, { 0x00, 0x3a, 0x69, 0x53, 0xcf, 0xf5, 0xa6, 0x9c, 0x9e, 0xa4, 0xf7, 0xcd, 0x51, 0x6b, 0x38, 0x02 }, { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, { 0x00, 0x2f, 0x5e, 0x71, 0xbc, 0x93, 0xe2, 0xcd, 0x78, 0x57, 0x26, 0x09, 0xc4, 0xeb, 0x9a, 0xb5 }, { 0x00, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5, 0x25, 0x02, 0x6b, 0x4c, 0xb9, 0x9e, 0xf7, 0xd0 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, { 0x00, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5, 0x25, 0x02, 0x6b, 0x4c, 0xb9, 0x9e, 0xf7, 0xd0 }, { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, { 0x00, 0x31, 0x62, 0x53, 0xc4, 0xf5, 0xa6, 0x97, 0x88, 0xb9, 0xea, 0xdb, 0x4c, 0x7d, 0x2e, 0x1f }, { 0x00, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5, 0x38, 0x1f, 0x76, 0x51, 0xa4, 0x83, 0xea, 0xcd }, { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, { 0x00, 0x32, 0x64, 0x56, 0xc8, 0xfa, 0xac, 0x9e, 0x90, 0xa2, 0xf4, 0xc6, 0x58, 0x6a, 0x3c, 0x0e }, { 0x00, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5, 0x38, 0x1f, 0x76, 0x51, 0xa4, 0x83, 0xea, 0xcd }, { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, { 0x00, 0x33, 0x66, 0x55, 0xcc, 0xff, 0xaa, 0x99, 0x98, 0xab, 0xfe, 0xcd, 0x54, 0x67, 0x32, 0x01 }, { 0x00, 0x27, 0x4e, 0x69, 0x81, 0xa6, 0xcf, 0xe8, 0x1f, 0x38, 0x51, 0x76, 0x9e, 0xb9, 0xd0, 0xf7 }, { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, { 0x00, 0x34, 0x68, 0x5c, 0xd0, 0xe4, 0xb8, 0x8c, 0xa0, 0x94, 0xc8, 0xfc, 0x70, 0x44, 0x18, 0x2c }, { 0x00, 0x27, 0x4e, 0x69, 0x81, 0xa6, 0xcf, 0xe8, 0x1f, 0x38, 0x51, 0x76, 0x9e, 0xb9, 0xd0, 0xf7 }, { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, { 0x00, 0x35, 0x6a, 0x5f, 0xd4, 0xe1, 0xbe, 0x8b, 0xa8, 0x9d, 0xc2, 0xf7, 0x7c, 0x49, 0x16, 0x23 }, { 0x00, 0x27, 0x4e, 0x69, 0x81, 0xa6, 0xcf, 0xe8, 0x02, 0x25, 0x4c, 0x6b, 0x83, 0xa4, 0xcd, 0xea }, { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, { 0x00, 0x36, 0x6c, 0x5a, 0xd8, 0xee, 0xb4, 0x82, 0xb0, 0x86, 0xdc, 0xea, 0x68, 0x5e, 0x04, 0x32 }, { 0x00, 0x27, 0x4e, 0x69, 0x81, 0xa6, 0xcf, 0xe8, 0x02, 0x25, 0x4c, 0x6b, 0x83, 0xa4, 0xcd, 0xea }, { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 
0x1d, 0x1d, 0x1d, 0x1d }, { 0x00, 0x37, 0x6e, 0x59, 0xdc, 0xeb, 0xb2, 0x85, 0xb8, 0x8f, 0xd6, 0xe1, 0x64, 0x53, 0x0a, 0x3d }, { 0x00, 0x27, 0x53, 0x74, 0xa6, 0x81, 0xf5, 0xd2, 0x51, 0x76, 0x02, 0x25, 0xf7, 0xd0, 0xa4, 0x83 }, { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, { 0x00, 0x38, 0x70, 0x48, 0xe0, 0xd8, 0x90, 0xa8, 0xc0, 0xf8, 0xb0, 0x88, 0x20, 0x18, 0x50, 0x68 }, { 0x00, 0x27, 0x53, 0x74, 0xa6, 0x81, 0xf5, 0xd2, 0x51, 0x76, 0x02, 0x25, 0xf7, 0xd0, 0xa4, 0x83 }, { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, { 0x00, 0x39, 0x72, 0x4b, 0xe4, 0xdd, 0x96, 0xaf, 0xc8, 0xf1, 0xba, 0x83, 0x2c, 0x15, 0x5e, 0x67 }, { 0x00, 0x27, 0x53, 0x74, 0xa6, 0x81, 0xf5, 0xd2, 0x4c, 0x6b, 0x1f, 0x38, 0xea, 0xcd, 0xb9, 0x9e }, { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, { 0x00, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6, 0xd0, 0xea, 0xa4, 0x9e, 0x38, 0x02, 0x4c, 0x76 }, { 0x00, 0x27, 0x53, 0x74, 0xa6, 0x81, 0xf5, 0xd2, 0x4c, 0x6b, 0x1f, 0x38, 0xea, 0xcd, 0xb9, 0x9e }, { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, { 0x00, 0x3b, 0x76, 0x4d, 0xec, 0xd7, 0x9a, 0xa1, 0xd8, 0xe3, 0xae, 0x95, 0x34, 0x0f, 0x42, 0x79 }, { 0x00, 0x27, 0x53, 0x74, 0xbb, 0x9c, 0xe8, 0xcf, 0x6b, 0x4c, 0x38, 0x1f, 0xd0, 0xf7, 0x83, 0xa4 }, { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, { 0x00, 0x3c, 0x78, 0x44, 0xf0, 0xcc, 0x88, 0xb4, 0xe0, 0xdc, 0x98, 0xa4, 0x10, 0x2c, 0x68, 0x54 }, { 0x00, 0x27, 0x53, 0x74, 0xbb, 0x9c, 0xe8, 0xcf, 0x6b, 0x4c, 0x38, 0x1f, 0xd0, 0xf7, 0x83, 0xa4 }, { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, { 0x00, 0x3d, 0x7a, 0x47, 0xf4, 0xc9, 0x8e, 0xb3, 0xe8, 0xd5, 0x92, 0xaf, 0x1c, 0x21, 0x66, 0x5b }, { 0x00, 0x27, 0x53, 0x74, 0xbb, 0x9c, 0xe8, 0xcf, 0x76, 0x51, 0x25, 0x02, 0xcd, 0xea, 0x9e, 0xb9 }, { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, { 0x00, 0x3e, 0x7c, 0x42, 0xf8, 0xc6, 0x84, 0xba, 0xf0, 0xce, 0x8c, 0xb2, 0x08, 0x36, 0x74, 0x4a }, { 0x00, 0x27, 0x53, 0x74, 0xbb, 0x9c, 0xe8, 0xcf, 0x76, 0x51, 0x25, 0x02, 0xcd, 0xea, 0x9e, 0xb9 }, { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, { 0x00, 0x3f, 0x7e, 0x41, 0xfc, 0xc3, 0x82, 0xbd, 0xf8, 0xc7, 0x86, 0xb9, 0x04, 0x3b, 0x7a, 0x45 }, { 0x00, 0x74, 0xe8, 0x9c, 0xcd, 0xb9, 0x25, 0x51, 0x87, 0xf3, 0x6f, 0x1b, 0x4a, 0x3e, 0xa2, 0xd6 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x00, 0x00, 0x00, 
0x1d, 0x1d, 0x1d, 0x1d, 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, { 0x00, 0x74, 0xe8, 0x9c, 0xcd, 0xb9, 0x25, 0x51, 0x87, 0xf3, 0x6f, 0x1b, 0x4a, 0x3e, 0xa2, 0xd6 }, { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, { 0x00, 0x41, 0x82, 0xc3, 0x04, 0x45, 0x86, 0xc7, 0x08, 0x49, 0x8a, 0xcb, 0x0c, 0x4d, 0x8e, 0xcf }, { 0x00, 0x74, 0xe8, 0x9c, 0xcd, 0xb9, 0x25, 0x51, 0x9a, 0xee, 0x72, 0x06, 0x57, 0x23, 0xbf, 0xcb }, { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, { 0x00, 0x42, 0x84, 0xc6, 0x08, 0x4a, 0x8c, 0xce, 0x10, 0x52, 0x94, 0xd6, 0x18, 0x5a, 0x9c, 0xde }, { 0x00, 0x74, 0xe8, 0x9c, 0xcd, 0xb9, 0x25, 0x51, 0x9a, 0xee, 0x72, 0x06, 0x57, 0x23, 0xbf, 0xcb }, { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, { 0x00, 0x43, 0x86, 0xc5, 0x0c, 0x4f, 0x8a, 0xc9, 0x18, 0x5b, 0x9e, 0xdd, 0x14, 0x57, 0x92, 0xd1 }, { 0x00, 0x74, 0xe8, 0x9c, 0xd0, 0xa4, 0x38, 0x4c, 0xbd, 0xc9, 0x55, 0x21, 0x6d, 0x19, 0x85, 0xf1 }, { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, { 0x00, 0x44, 0x88, 0xcc, 0x10, 0x54, 0x98, 0xdc, 0x20, 0x64, 0xa8, 0xec, 0x30, 0x74, 0xb8, 0xfc }, { 0x00, 0x74, 0xe8, 0x9c, 0xd0, 0xa4, 0x38, 0x4c, 0xbd, 0xc9, 0x55, 0x21, 0x6d, 0x19, 0x85, 0xf1 }, { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, { 0x00, 0x45, 0x8a, 0xcf, 0x14, 0x51, 0x9e, 0xdb, 0x28, 0x6d, 0xa2, 0xe7, 0x3c, 0x79, 0xb6, 0xf3 }, { 0x00, 0x74, 0xe8, 0x9c, 0xd0, 0xa4, 0x38, 0x4c, 0xa0, 0xd4, 0x48, 0x3c, 0x70, 0x04, 0x98, 0xec }, { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, { 0x00, 0x46, 0x8c, 0xca, 0x18, 0x5e, 0x94, 0xd2, 0x30, 0x76, 0xbc, 0xfa, 0x28, 0x6e, 0xa4, 0xe2 }, { 0x00, 0x74, 0xe8, 0x9c, 0xd0, 0xa4, 0x38, 0x4c, 0xa0, 0xd4, 0x48, 0x3c, 0x70, 0x04, 0x98, 0xec }, { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, { 0x00, 0x47, 0x8e, 0xc9, 0x1c, 0x5b, 0x92, 0xd5, 0x38, 0x7f, 0xb6, 0xf1, 0x24, 0x63, 0xaa, 0xed }, { 0x00, 0x74, 0xf5, 0x81, 0xf7, 0x83, 0x02, 0x76, 0xf3, 0x87, 0x06, 0x72, 0x04, 0x70, 0xf1, 0x85 }, { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, { 0x00, 0x48, 0x90, 0xd8, 0x20, 0x68, 0xb0, 0xf8, 0x40, 0x08, 0xd0, 0x98, 0x60, 0x28, 0xf0, 0xb8 }, { 0x00, 0x74, 0xf5, 0x81, 0xf7, 0x83, 0x02, 0x76, 0xf3, 0x87, 0x06, 0x72, 0x04, 0x70, 0xf1, 0x85 }, { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, 0x80, 0x10, 0xa0, 0x30, 0xc0, 
0x50, 0xe0, 0x70 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, { 0x00, 0x49, 0x92, 0xdb, 0x24, 0x6d, 0xb6, 0xff, 0x48, 0x01, 0xda, 0x93, 0x6c, 0x25, 0xfe, 0xb7 }, { 0x00, 0x74, 0xf5, 0x81, 0xf7, 0x83, 0x02, 0x76, 0xee, 0x9a, 0x1b, 0x6f, 0x19, 0x6d, 0xec, 0x98 }, { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, { 0x00, 0x4a, 0x94, 0xde, 0x28, 0x62, 0xbc, 0xf6, 0x50, 0x1a, 0xc4, 0x8e, 0x78, 0x32, 0xec, 0xa6 }, { 0x00, 0x74, 0xf5, 0x81, 0xf7, 0x83, 0x02, 0x76, 0xee, 0x9a, 0x1b, 0x6f, 0x19, 0x6d, 0xec, 0x98 }, { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, { 0x00, 0x4b, 0x96, 0xdd, 0x2c, 0x67, 0xba, 0xf1, 0x58, 0x13, 0xce, 0x85, 0x74, 0x3f, 0xe2, 0xa9 }, { 0x00, 0x74, 0xf5, 0x81, 0xea, 0x9e, 0x1f, 0x6b, 0xc9, 0xbd, 0x3c, 0x48, 0x23, 0x57, 0xd6, 0xa2 }, { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, { 0x00, 0x4c, 0x98, 0xd4, 0x30, 0x7c, 0xa8, 0xe4, 0x60, 0x2c, 0xf8, 0xb4, 0x50, 0x1c, 0xc8, 0x84 }, { 0x00, 0x74, 0xf5, 0x81, 0xea, 0x9e, 0x1f, 0x6b, 0xc9, 0xbd, 0x3c, 0x48, 0x23, 0x57, 0xd6, 0xa2 }, { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, { 0x00, 0x4d, 0x9a, 0xd7, 0x34, 0x79, 0xae, 0xe3, 0x68, 0x25, 0xf2, 0xbf, 0x5c, 0x11, 0xc6, 0x8b }, { 0x00, 0x74, 0xf5, 0x81, 0xea, 0x9e, 0x1f, 0x6b, 0xd4, 0xa0, 0x21, 0x55, 0x3e, 0x4a, 0xcb, 0xbf }, { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, { 0x00, 0x4e, 0x9c, 0xd2, 0x38, 0x76, 0xa4, 0xea, 0x70, 0x3e, 0xec, 0xa2, 0x48, 0x06, 0xd4, 0x9a }, { 0x00, 0x74, 0xf5, 0x81, 0xea, 0x9e, 0x1f, 0x6b, 0xd4, 0xa0, 0x21, 0x55, 0x3e, 0x4a, 0xcb, 0xbf }, { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, { 0x00, 0x4f, 0x9e, 0xd1, 0x3c, 0x73, 0xa2, 0xed, 0x78, 0x37, 0xe6, 0xa9, 0x44, 0x0b, 0xda, 0x95 }, { 0x00, 0x69, 0xd2, 0xbb, 0xb9, 0xd0, 0x6b, 0x02, 0x6f, 0x06, 0xbd, 0xd4, 0xd6, 0xbf, 0x04, 0x6d }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, { 0x00, 0x69, 0xd2, 0xbb, 0xb9, 0xd0, 0x6b, 0x02, 0x6f, 0x06, 0xbd, 0xd4, 0xd6, 0xbf, 0x04, 0x6d }, { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, { 0x00, 0x51, 0xa2, 0xf3, 0x44, 0x15, 0xe6, 0xb7, 0x88, 0xd9, 0x2a, 0x7b, 0xcc, 0x9d, 0x6e, 0x3f }, { 0x00, 0x69, 0xd2, 0xbb, 0xb9, 0xd0, 0x6b, 0x02, 0x72, 0x1b, 0xa0, 0xc9, 0xcb, 0xa2, 0x19, 0x70 }, { 0x00, 0x20, 0x40, 0x60, 0x80, 
0xa0, 0xc0, 0xe0, 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, { 0x00, 0x52, 0xa4, 0xf6, 0x48, 0x1a, 0xec, 0xbe, 0x90, 0xc2, 0x34, 0x66, 0xd8, 0x8a, 0x7c, 0x2e }, { 0x00, 0x69, 0xd2, 0xbb, 0xb9, 0xd0, 0x6b, 0x02, 0x72, 0x1b, 0xa0, 0xc9, 0xcb, 0xa2, 0x19, 0x70 }, { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, { 0x00, 0x53, 0xa6, 0xf5, 0x4c, 0x1f, 0xea, 0xb9, 0x98, 0xcb, 0x3e, 0x6d, 0xd4, 0x87, 0x72, 0x21 }, { 0x00, 0x69, 0xd2, 0xbb, 0xa4, 0xcd, 0x76, 0x1f, 0x55, 0x3c, 0x87, 0xee, 0xf1, 0x98, 0x23, 0x4a }, { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, { 0x00, 0x54, 0xa8, 0xfc, 0x50, 0x04, 0xf8, 0xac, 0xa0, 0xf4, 0x08, 0x5c, 0xf0, 0xa4, 0x58, 0x0c }, { 0x00, 0x69, 0xd2, 0xbb, 0xa4, 0xcd, 0x76, 0x1f, 0x55, 0x3c, 0x87, 0xee, 0xf1, 0x98, 0x23, 0x4a }, { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, { 0x00, 0x55, 0xaa, 0xff, 0x54, 0x01, 0xfe, 0xab, 0xa8, 0xfd, 0x02, 0x57, 0xfc, 0xa9, 0x56, 0x03 }, { 0x00, 0x69, 0xd2, 0xbb, 0xa4, 0xcd, 0x76, 0x1f, 0x48, 0x21, 0x9a, 0xf3, 0xec, 0x85, 0x3e, 0x57 }, { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, { 0x00, 0x56, 0xac, 0xfa, 0x58, 0x0e, 0xf4, 0xa2, 0xb0, 0xe6, 0x1c, 0x4a, 0xe8, 0xbe, 0x44, 0x12 }, { 0x00, 0x69, 0xd2, 0xbb, 0xa4, 0xcd, 0x76, 0x1f, 0x48, 0x21, 0x9a, 0xf3, 0xec, 0x85, 0x3e, 0x57 }, { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, { 0x00, 0x57, 0xae, 0xf9, 0x5c, 0x0b, 0xf2, 0xa5, 0xb8, 0xef, 0x16, 0x41, 0xe4, 0xb3, 0x4a, 0x1d }, { 0x00, 0x69, 0xcf, 0xa6, 0x83, 0xea, 0x4c, 0x25, 0x1b, 0x72, 0xd4, 0xbd, 0x98, 0xf1, 0x57, 0x3e }, { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, { 0x00, 0x58, 0xb0, 0xe8, 0x60, 0x38, 0xd0, 0x88, 0xc0, 0x98, 0x70, 0x28, 0xa0, 0xf8, 0x10, 0x48 }, { 0x00, 0x69, 0xcf, 0xa6, 0x83, 0xea, 0x4c, 0x25, 0x1b, 0x72, 0xd4, 0xbd, 0x98, 0xf1, 0x57, 0x3e }, { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, { 0x00, 0x59, 0xb2, 0xeb, 0x64, 0x3d, 0xd6, 0x8f, 0xc8, 0x91, 0x7a, 0x23, 0xac, 0xf5, 0x1e, 0x47 }, { 0x00, 0x69, 0xcf, 0xa6, 0x83, 0xea, 0x4c, 0x25, 0x06, 0x6f, 0xc9, 0xa0, 0x85, 0xec, 0x4a, 0x23 }, { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, { 0x00, 0x5a, 0xb4, 0xee, 0x68, 0x32, 0xdc, 0x86, 0xd0, 0x8a, 0x64, 0x3e, 0xb8, 0xe2, 0x0c, 0x56 }, { 0x00, 0x69, 0xcf, 0xa6, 0x83, 0xea, 0x4c, 0x25, 0x06, 0x6f, 0xc9, 0xa0, 0x85, 0xec, 
0x4a, 0x23 }, { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, { 0x00, 0x5b, 0xb6, 0xed, 0x6c, 0x37, 0xda, 0x81, 0xd8, 0x83, 0x6e, 0x35, 0xb4, 0xef, 0x02, 0x59 }, { 0x00, 0x69, 0xcf, 0xa6, 0x9e, 0xf7, 0x51, 0x38, 0x21, 0x48, 0xee, 0x87, 0xbf, 0xd6, 0x70, 0x19 }, { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, { 0x00, 0x5c, 0xb8, 0xe4, 0x70, 0x2c, 0xc8, 0x94, 0xe0, 0xbc, 0x58, 0x04, 0x90, 0xcc, 0x28, 0x74 }, { 0x00, 0x69, 0xcf, 0xa6, 0x9e, 0xf7, 0x51, 0x38, 0x21, 0x48, 0xee, 0x87, 0xbf, 0xd6, 0x70, 0x19 }, { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, { 0x00, 0x5d, 0xba, 0xe7, 0x74, 0x29, 0xce, 0x93, 0xe8, 0xb5, 0x52, 0x0f, 0x9c, 0xc1, 0x26, 0x7b }, { 0x00, 0x69, 0xcf, 0xa6, 0x9e, 0xf7, 0x51, 0x38, 0x3c, 0x55, 0xf3, 0x9a, 0xa2, 0xcb, 0x6d, 0x04 }, { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, { 0x00, 0x5e, 0xbc, 0xe2, 0x78, 0x26, 0xc4, 0x9a, 0xf0, 0xae, 0x4c, 0x12, 0x88, 0xd6, 0x34, 0x6a }, { 0x00, 0x69, 0xcf, 0xa6, 0x9e, 0xf7, 0x51, 0x38, 0x3c, 0x55, 0xf3, 0x9a, 0xa2, 0xcb, 0x6d, 0x04 }, { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, { 0x00, 0x5f, 0xbe, 0xe1, 0x7c, 0x23, 0xc2, 0x9d, 0xf8, 0xa7, 0x46, 0x19, 0x84, 0xdb, 0x3a, 0x65 }, { 0x00, 0x4e, 0x9c, 0xd2, 0x25, 0x6b, 0xb9, 0xf7, 0x4a, 0x04, 0xd6, 0x98, 0x6f, 0x21, 0xf3, 0xbd }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, { 0x00, 0x4e, 0x9c, 0xd2, 0x25, 0x6b, 0xb9, 0xf7, 0x4a, 0x04, 0xd6, 0x98, 0x6f, 0x21, 0xf3, 0xbd }, { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, { 0x00, 0x61, 0xc2, 0xa3, 0x84, 0xe5, 0x46, 0x27, 0x08, 0x69, 0xca, 0xab, 0x8c, 0xed, 0x4e, 0x2f }, { 0x00, 0x4e, 0x9c, 0xd2, 0x25, 0x6b, 0xb9, 0xf7, 0x57, 0x19, 0xcb, 0x85, 0x72, 0x3c, 0xee, 0xa0 }, { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, { 0x00, 0x62, 0xc4, 0xa6, 0x88, 0xea, 0x4c, 0x2e, 0x10, 0x72, 0xd4, 0xb6, 0x98, 0xfa, 0x5c, 0x3e }, { 0x00, 0x4e, 0x9c, 0xd2, 0x25, 0x6b, 0xb9, 0xf7, 0x57, 0x19, 0xcb, 0x85, 0x72, 0x3c, 0xee, 0xa0 }, { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, { 0x00, 0x63, 0xc6, 0xa5, 0x8c, 0xef, 0x4a, 0x29, 0x18, 0x7b, 0xde, 0xbd, 0x94, 0xf7, 0x52, 0x31 }, { 0x00, 0x4e, 0x9c, 0xd2, 0x38, 0x76, 
0xa4, 0xea, 0x70, 0x3e, 0xec, 0xa2, 0x48, 0x06, 0xd4, 0x9a }, { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, { 0x00, 0x64, 0xc8, 0xac, 0x90, 0xf4, 0x58, 0x3c, 0x20, 0x44, 0xe8, 0x8c, 0xb0, 0xd4, 0x78, 0x1c }, { 0x00, 0x4e, 0x9c, 0xd2, 0x38, 0x76, 0xa4, 0xea, 0x70, 0x3e, 0xec, 0xa2, 0x48, 0x06, 0xd4, 0x9a }, { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, { 0x00, 0x65, 0xca, 0xaf, 0x94, 0xf1, 0x5e, 0x3b, 0x28, 0x4d, 0xe2, 0x87, 0xbc, 0xd9, 0x76, 0x13 }, { 0x00, 0x4e, 0x9c, 0xd2, 0x38, 0x76, 0xa4, 0xea, 0x6d, 0x23, 0xf1, 0xbf, 0x55, 0x1b, 0xc9, 0x87 }, { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, { 0x00, 0x66, 0xcc, 0xaa, 0x98, 0xfe, 0x54, 0x32, 0x30, 0x56, 0xfc, 0x9a, 0xa8, 0xce, 0x64, 0x02 }, { 0x00, 0x4e, 0x9c, 0xd2, 0x38, 0x76, 0xa4, 0xea, 0x6d, 0x23, 0xf1, 0xbf, 0x55, 0x1b, 0xc9, 0x87 }, { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, { 0x00, 0x67, 0xce, 0xa9, 0x9c, 0xfb, 0x52, 0x35, 0x38, 0x5f, 0xf6, 0x91, 0xa4, 0xc3, 0x6a, 0x0d }, { 0x00, 0x4e, 0x81, 0xcf, 0x1f, 0x51, 0x9e, 0xd0, 0x3e, 0x70, 0xbf, 0xf1, 0x21, 0x6f, 0xa0, 0xee }, { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, { 0x00, 0x68, 0xd0, 0xb8, 0xa0, 0xc8, 0x70, 0x18, 0x40, 0x28, 0x90, 0xf8, 0xe0, 0x88, 0x30, 0x58 }, { 0x00, 0x4e, 0x81, 0xcf, 0x1f, 0x51, 0x9e, 0xd0, 0x3e, 0x70, 0xbf, 0xf1, 0x21, 0x6f, 0xa0, 0xee }, { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, { 0x00, 0x69, 0xd2, 0xbb, 0xa4, 0xcd, 0x76, 0x1f, 0x48, 0x21, 0x9a, 0xf3, 0xec, 0x85, 0x3e, 0x57 }, { 0x00, 0x4e, 0x81, 0xcf, 0x1f, 0x51, 0x9e, 0xd0, 0x23, 0x6d, 0xa2, 0xec, 0x3c, 0x72, 0xbd, 0xf3 }, { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, { 0x00, 0x6a, 0xd4, 0xbe, 0xa8, 0xc2, 0x7c, 0x16, 0x50, 0x3a, 0x84, 0xee, 0xf8, 0x92, 0x2c, 0x46 }, { 0x00, 0x4e, 0x81, 0xcf, 0x1f, 0x51, 0x9e, 0xd0, 0x23, 0x6d, 0xa2, 0xec, 0x3c, 0x72, 0xbd, 0xf3 }, { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, { 0x00, 0x6b, 0xd6, 0xbd, 0xac, 0xc7, 0x7a, 0x11, 0x58, 0x33, 0x8e, 0xe5, 0xf4, 0x9f, 0x22, 0x49 }, { 0x00, 0x4e, 0x81, 0xcf, 0x02, 0x4c, 0x83, 0xcd, 0x04, 0x4a, 0x85, 0xcb, 0x06, 0x48, 0x87, 0xc9 }, { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, { 0x00, 0x6c, 0xd8, 0xb4, 0xb0, 0xdc, 0x68, 0x04, 0x60, 0x0c, 0xb8, 0xd4, 0xd0, 0xbc, 0x08, 
0x64 }, { 0x00, 0x4e, 0x81, 0xcf, 0x02, 0x4c, 0x83, 0xcd, 0x04, 0x4a, 0x85, 0xcb, 0x06, 0x48, 0x87, 0xc9 }, { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, { 0x00, 0x6d, 0xda, 0xb7, 0xb4, 0xd9, 0x6e, 0x03, 0x68, 0x05, 0xb2, 0xdf, 0xdc, 0xb1, 0x06, 0x6b }, { 0x00, 0x4e, 0x81, 0xcf, 0x02, 0x4c, 0x83, 0xcd, 0x19, 0x57, 0x98, 0xd6, 0x1b, 0x55, 0x9a, 0xd4 }, { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, { 0x00, 0x6e, 0xdc, 0xb2, 0xb8, 0xd6, 0x64, 0x0a, 0x70, 0x1e, 0xac, 0xc2, 0xc8, 0xa6, 0x14, 0x7a }, { 0x00, 0x4e, 0x81, 0xcf, 0x02, 0x4c, 0x83, 0xcd, 0x19, 0x57, 0x98, 0xd6, 0x1b, 0x55, 0x9a, 0xd4 }, { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, { 0x00, 0x6f, 0xde, 0xb1, 0xbc, 0xd3, 0x62, 0x0d, 0x78, 0x17, 0xa6, 0xc9, 0xc4, 0xab, 0x1a, 0x75 }, { 0x00, 0x53, 0xa6, 0xf5, 0x51, 0x02, 0xf7, 0xa4, 0xa2, 0xf1, 0x04, 0x57, 0xf3, 0xa0, 0x55, 0x06 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, { 0x00, 0x53, 0xa6, 0xf5, 0x51, 0x02, 0xf7, 0xa4, 0xa2, 0xf1, 0x04, 0x57, 0xf3, 0xa0, 0x55, 0x06 }, { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, { 0x00, 0x71, 0xe2, 0x93, 0xc4, 0xb5, 0x26, 0x57, 0x88, 0xf9, 0x6a, 0x1b, 0x4c, 0x3d, 0xae, 0xdf }, { 0x00, 0x53, 0xa6, 0xf5, 0x51, 0x02, 0xf7, 0xa4, 0xbf, 0xec, 0x19, 0x4a, 0xee, 0xbd, 0x48, 0x1b }, { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, { 0x00, 0x72, 0xe4, 0x96, 0xc8, 0xba, 0x2c, 0x5e, 0x90, 0xe2, 0x74, 0x06, 0x58, 0x2a, 0xbc, 0xce }, { 0x00, 0x53, 0xa6, 0xf5, 0x51, 0x02, 0xf7, 0xa4, 0xbf, 0xec, 0x19, 0x4a, 0xee, 0xbd, 0x48, 0x1b }, { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, { 0x00, 0x73, 0xe6, 0x95, 0xcc, 0xbf, 0x2a, 0x59, 0x98, 0xeb, 0x7e, 0x0d, 0x54, 0x27, 0xb2, 0xc1 }, { 0x00, 0x53, 0xa6, 0xf5, 0x4c, 0x1f, 0xea, 0xb9, 0x98, 0xcb, 0x3e, 0x6d, 0xd4, 0x87, 0x72, 0x21 }, { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, { 0x00, 0x74, 0xe8, 0x9c, 0xd0, 0xa4, 0x38, 0x4c, 0xa0, 0xd4, 0x48, 0x3c, 0x70, 0x04, 0x98, 0xec }, { 0x00, 0x53, 0xa6, 0xf5, 0x4c, 0x1f, 0xea, 0xb9, 0x98, 0xcb, 0x3e, 0x6d, 0xd4, 0x87, 0x72, 0x21 }, { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, { 0x00, 0x75, 0xea, 0x9f, 0xd4, 0xa1, 0x3e, 
0x4b, 0xa8, 0xdd, 0x42, 0x37, 0x7c, 0x09, 0x96, 0xe3 }, { 0x00, 0x53, 0xa6, 0xf5, 0x4c, 0x1f, 0xea, 0xb9, 0x85, 0xd6, 0x23, 0x70, 0xc9, 0x9a, 0x6f, 0x3c }, { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, { 0x00, 0x76, 0xec, 0x9a, 0xd8, 0xae, 0x34, 0x42, 0xb0, 0xc6, 0x5c, 0x2a, 0x68, 0x1e, 0x84, 0xf2 }, { 0x00, 0x53, 0xa6, 0xf5, 0x4c, 0x1f, 0xea, 0xb9, 0x85, 0xd6, 0x23, 0x70, 0xc9, 0x9a, 0x6f, 0x3c }, { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, { 0x00, 0x77, 0xee, 0x99, 0xdc, 0xab, 0x32, 0x45, 0xb8, 0xcf, 0x56, 0x21, 0x64, 0x13, 0x8a, 0xfd }, { 0x00, 0x53, 0xbb, 0xe8, 0x6b, 0x38, 0xd0, 0x83, 0xd6, 0x85, 0x6d, 0x3e, 0xbd, 0xee, 0x06, 0x55 }, { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, { 0x00, 0x78, 0xf0, 0x88, 0xe0, 0x98, 0x10, 0x68, 0xc0, 0xb8, 0x30, 0x48, 0x20, 0x58, 0xd0, 0xa8 }, { 0x00, 0x53, 0xbb, 0xe8, 0x6b, 0x38, 0xd0, 0x83, 0xd6, 0x85, 0x6d, 0x3e, 0xbd, 0xee, 0x06, 0x55 }, { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, { 0x00, 0x79, 0xf2, 0x8b, 0xe4, 0x9d, 0x16, 0x6f, 0xc8, 0xb1, 0x3a, 0x43, 0x2c, 0x55, 0xde, 0xa7 }, { 0x00, 0x53, 0xbb, 0xe8, 0x6b, 0x38, 0xd0, 0x83, 0xcb, 0x98, 0x70, 0x23, 0xa0, 0xf3, 0x1b, 0x48 }, { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, { 0x00, 0x7a, 0xf4, 0x8e, 0xe8, 0x92, 0x1c, 0x66, 0xd0, 0xaa, 0x24, 0x5e, 0x38, 0x42, 0xcc, 0xb6 }, { 0x00, 0x53, 0xbb, 0xe8, 0x6b, 0x38, 0xd0, 0x83, 0xcb, 0x98, 0x70, 0x23, 0xa0, 0xf3, 0x1b, 0x48 }, { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, { 0x00, 0x7b, 0xf6, 0x8d, 0xec, 0x97, 0x1a, 0x61, 0xd8, 0xa3, 0x2e, 0x55, 0x34, 0x4f, 0xc2, 0xb9 }, { 0x00, 0x53, 0xbb, 0xe8, 0x76, 0x25, 0xcd, 0x9e, 0xec, 0xbf, 0x57, 0x04, 0x9a, 0xc9, 0x21, 0x72 }, { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, { 0x00, 0x7c, 0xf8, 0x84, 0xf0, 0x8c, 0x08, 0x74, 0xe0, 0x9c, 0x18, 0x64, 0x10, 0x6c, 0xe8, 0x94 }, { 0x00, 0x53, 0xbb, 0xe8, 0x76, 0x25, 0xcd, 0x9e, 0xec, 0xbf, 0x57, 0x04, 0x9a, 0xc9, 0x21, 0x72 }, { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, { 0x00, 0x7d, 0xfa, 0x87, 0xf4, 0x89, 0x0e, 0x73, 0xe8, 0x95, 0x12, 0x6f, 0x1c, 0x61, 0xe6, 0x9b }, { 0x00, 0x53, 0xbb, 0xe8, 0x76, 0x25, 0xcd, 0x9e, 0xf1, 0xa2, 0x4a, 0x19, 0x87, 0xd4, 0x3c, 0x6f }, { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a 
}, { 0x00, 0x7e, 0xfc, 0x82, 0xf8, 0x86, 0x04, 0x7a, 0xf0, 0x8e, 0x0c, 0x72, 0x08, 0x76, 0xf4, 0x8a }, { 0x00, 0x53, 0xbb, 0xe8, 0x76, 0x25, 0xcd, 0x9e, 0xf1, 0xa2, 0x4a, 0x19, 0x87, 0xd4, 0x3c, 0x6f }, { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, { 0x00, 0x7f, 0xfe, 0x81, 0xfc, 0x83, 0x02, 0x7d, 0xf8, 0x87, 0x06, 0x79, 0x04, 0x7b, 0xfa, 0x85 }, { 0x00, 0xe8, 0xcd, 0x25, 0x87, 0x6f, 0x4a, 0xa2, 0x13, 0xfb, 0xde, 0x36, 0x94, 0x7c, 0x59, 0xb1 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, { 0x00, 0xe8, 0xcd, 0x25, 0x87, 0x6f, 0x4a, 0xa2, 0x13, 0xfb, 0xde, 0x36, 0x94, 0x7c, 0x59, 0xb1 }, { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, { 0x00, 0x81, 0x02, 0x83, 0x04, 0x85, 0x06, 0x87, 0x08, 0x89, 0x0a, 0x8b, 0x0c, 0x8d, 0x0e, 0x8f }, { 0x00, 0xe8, 0xcd, 0x25, 0x87, 0x6f, 0x4a, 0xa2, 0x0e, 0xe6, 0xc3, 0x2b, 0x89, 0x61, 0x44, 0xac }, { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, { 0x00, 0x82, 0x04, 0x86, 0x08, 0x8a, 0x0c, 0x8e, 0x10, 0x92, 0x14, 0x96, 0x18, 0x9a, 0x1c, 0x9e }, { 0x00, 0xe8, 0xcd, 0x25, 0x87, 0x6f, 0x4a, 0xa2, 0x0e, 0xe6, 0xc3, 0x2b, 0x89, 0x61, 0x44, 0xac }, { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, { 0x00, 0x83, 0x06, 0x85, 0x0c, 0x8f, 0x0a, 0x89, 0x18, 0x9b, 0x1e, 0x9d, 0x14, 0x97, 0x12, 0x91 }, { 0x00, 0xe8, 0xcd, 0x25, 0x9a, 0x72, 0x57, 0xbf, 0x29, 0xc1, 0xe4, 0x0c, 0xb3, 0x5b, 0x7e, 0x96 }, { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, { 0x00, 0x84, 0x08, 0x8c, 0x10, 0x94, 0x18, 0x9c, 0x20, 0xa4, 0x28, 0xac, 0x30, 0xb4, 0x38, 0xbc }, { 0x00, 0xe8, 0xcd, 0x25, 0x9a, 0x72, 0x57, 0xbf, 0x29, 0xc1, 0xe4, 0x0c, 0xb3, 0x5b, 0x7e, 0x96 }, { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, { 0x00, 0x85, 0x0a, 0x8f, 0x14, 0x91, 0x1e, 0x9b, 0x28, 0xad, 0x22, 0xa7, 0x3c, 0xb9, 0x36, 0xb3 }, { 0x00, 0xe8, 0xcd, 0x25, 0x9a, 0x72, 0x57, 0xbf, 0x34, 0xdc, 0xf9, 0x11, 0xae, 0x46, 0x63, 0x8b }, { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, { 0x00, 0x86, 0x0c, 0x8a, 0x18, 0x9e, 0x14, 0x92, 0x30, 0xb6, 0x3c, 0xba, 0x28, 0xae, 0x24, 0xa2 }, { 0x00, 0xe8, 0xcd, 0x25, 0x9a, 0x72, 0x57, 0xbf, 0x34, 0xdc, 0xf9, 0x11, 0xae, 0x46, 0x63, 0x8b }, { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 
0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, { 0x00, 0x87, 0x0e, 0x89, 0x1c, 0x9b, 0x12, 0x95, 0x38, 0xbf, 0x36, 0xb1, 0x24, 0xa3, 0x2a, 0xad }, { 0x00, 0xe8, 0xd0, 0x38, 0xbd, 0x55, 0x6d, 0x85, 0x67, 0x8f, 0xb7, 0x5f, 0xda, 0x32, 0x0a, 0xe2 }, { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, { 0x00, 0x88, 0x10, 0x98, 0x20, 0xa8, 0x30, 0xb8, 0x40, 0xc8, 0x50, 0xd8, 0x60, 0xe8, 0x70, 0xf8 }, { 0x00, 0xe8, 0xd0, 0x38, 0xbd, 0x55, 0x6d, 0x85, 0x67, 0x8f, 0xb7, 0x5f, 0xda, 0x32, 0x0a, 0xe2 }, { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, { 0x00, 0x89, 0x12, 0x9b, 0x24, 0xad, 0x36, 0xbf, 0x48, 0xc1, 0x5a, 0xd3, 0x6c, 0xe5, 0x7e, 0xf7 }, { 0x00, 0xe8, 0xd0, 0x38, 0xbd, 0x55, 0x6d, 0x85, 0x7a, 0x92, 0xaa, 0x42, 0xc7, 0x2f, 0x17, 0xff }, { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, { 0x00, 0x8a, 0x14, 0x9e, 0x28, 0xa2, 0x3c, 0xb6, 0x50, 0xda, 0x44, 0xce, 0x78, 0xf2, 0x6c, 0xe6 }, { 0x00, 0xe8, 0xd0, 0x38, 0xbd, 0x55, 0x6d, 0x85, 0x7a, 0x92, 0xaa, 0x42, 0xc7, 0x2f, 0x17, 0xff }, { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, { 0x00, 0x8b, 0x16, 0x9d, 0x2c, 0xa7, 0x3a, 0xb1, 0x58, 0xd3, 0x4e, 0xc5, 0x74, 0xff, 0x62, 0xe9 }, { 0x00, 0xe8, 0xd0, 0x38, 0xa0, 0x48, 0x70, 0x98, 0x5d, 0xb5, 0x8d, 0x65, 0xfd, 0x15, 0x2d, 0xc5 }, { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, { 0x00, 0x8c, 0x18, 0x94, 0x30, 0xbc, 0x28, 0xa4, 0x60, 0xec, 0x78, 0xf4, 0x50, 0xdc, 0x48, 0xc4 }, { 0x00, 0xe8, 0xd0, 0x38, 0xa0, 0x48, 0x70, 0x98, 0x5d, 0xb5, 0x8d, 0x65, 0xfd, 0x15, 0x2d, 0xc5 }, { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, { 0x00, 0x8d, 0x1a, 0x97, 0x34, 0xb9, 0x2e, 0xa3, 0x68, 0xe5, 0x72, 0xff, 0x5c, 0xd1, 0x46, 0xcb }, { 0x00, 0xe8, 0xd0, 0x38, 0xa0, 0x48, 0x70, 0x98, 0x40, 0xa8, 0x90, 0x78, 0xe0, 0x08, 0x30, 0xd8 }, { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, { 0x00, 0x8e, 0x1c, 0x92, 0x38, 0xb6, 0x24, 0xaa, 0x70, 0xfe, 0x6c, 0xe2, 0x48, 0xc6, 0x54, 0xda }, { 0x00, 0xe8, 0xd0, 0x38, 0xa0, 0x48, 0x70, 0x98, 0x40, 0xa8, 0x90, 0x78, 0xe0, 0x08, 0x30, 0xd8 }, { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, { 0x00, 0x8f, 0x1e, 0x91, 0x3c, 0xb3, 0x22, 0xad, 0x78, 0xf7, 0x66, 0xe9, 0x44, 0xcb, 0x5a, 0xd5 }, { 0x00, 0xf5, 0xf7, 0x02, 0xf3, 0x06, 0x04, 0xf1, 0xfb, 0x0e, 0x0c, 0xf9, 0x08, 0xfd, 0xff, 0x0a }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 
0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, { 0x00, 0xf5, 0xf7, 0x02, 0xf3, 0x06, 0x04, 0xf1, 0xfb, 0x0e, 0x0c, 0xf9, 0x08, 0xfd, 0xff, 0x0a }, { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, { 0x00, 0x91, 0x22, 0xb3, 0x44, 0xd5, 0x66, 0xf7, 0x88, 0x19, 0xaa, 0x3b, 0xcc, 0x5d, 0xee, 0x7f }, { 0x00, 0xf5, 0xf7, 0x02, 0xf3, 0x06, 0x04, 0xf1, 0xe6, 0x13, 0x11, 0xe4, 0x15, 0xe0, 0xe2, 0x17 }, { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, { 0x00, 0x92, 0x24, 0xb6, 0x48, 0xda, 0x6c, 0xfe, 0x90, 0x02, 0xb4, 0x26, 0xd8, 0x4a, 0xfc, 0x6e }, { 0x00, 0xf5, 0xf7, 0x02, 0xf3, 0x06, 0x04, 0xf1, 0xe6, 0x13, 0x11, 0xe4, 0x15, 0xe0, 0xe2, 0x17 }, { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, { 0x00, 0x93, 0x26, 0xb5, 0x4c, 0xdf, 0x6a, 0xf9, 0x98, 0x0b, 0xbe, 0x2d, 0xd4, 0x47, 0xf2, 0x61 }, { 0x00, 0xf5, 0xf7, 0x02, 0xee, 0x1b, 0x19, 0xec, 0xc1, 0x34, 0x36, 0xc3, 0x2f, 0xda, 0xd8, 0x2d }, { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, { 0x00, 0x94, 0x28, 0xbc, 0x50, 0xc4, 0x78, 0xec, 0xa0, 0x34, 0x88, 0x1c, 0xf0, 0x64, 0xd8, 0x4c }, { 0x00, 0xf5, 0xf7, 0x02, 0xee, 0x1b, 0x19, 0xec, 0xc1, 0x34, 0x36, 0xc3, 0x2f, 0xda, 0xd8, 0x2d }, { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, { 0x00, 0x95, 0x2a, 0xbf, 0x54, 0xc1, 0x7e, 0xeb, 0xa8, 0x3d, 0x82, 0x17, 0xfc, 0x69, 0xd6, 0x43 }, { 0x00, 0xf5, 0xf7, 0x02, 0xee, 0x1b, 0x19, 0xec, 0xdc, 0x29, 0x2b, 0xde, 0x32, 0xc7, 0xc5, 0x30 }, { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, { 0x00, 0x96, 0x2c, 0xba, 0x58, 0xce, 0x74, 0xe2, 0xb0, 0x26, 0x9c, 0x0a, 0xe8, 0x7e, 0xc4, 0x52 }, { 0x00, 0xf5, 0xf7, 0x02, 0xee, 0x1b, 0x19, 0xec, 0xdc, 0x29, 0x2b, 0xde, 0x32, 0xc7, 0xc5, 0x30 }, { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, { 0x00, 0x97, 0x2e, 0xb9, 0x5c, 0xcb, 0x72, 0xe5, 0xb8, 0x2f, 0x96, 0x01, 0xe4, 0x73, 0xca, 0x5d }, { 0x00, 0xf5, 0xea, 0x1f, 0xc9, 0x3c, 0x23, 0xd6, 0x8f, 0x7a, 0x65, 0x90, 0x46, 0xb3, 0xac, 0x59 }, { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, { 0x00, 0x98, 0x30, 0xa8, 0x60, 0xf8, 0x50, 0xc8, 0xc0, 0x58, 0xf0, 0x68, 0xa0, 0x38, 0x90, 0x08 }, { 0x00, 0xf5, 0xea, 0x1f, 0xc9, 0x3c, 0x23, 0xd6, 0x8f, 0x7a, 0x65, 0x90, 0x46, 0xb3, 0xac, 0x59 }, { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, 0x80, 
0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, { 0x00, 0x99, 0x32, 0xab, 0x64, 0xfd, 0x56, 0xcf, 0xc8, 0x51, 0xfa, 0x63, 0xac, 0x35, 0x9e, 0x07 }, { 0x00, 0xf5, 0xea, 0x1f, 0xc9, 0x3c, 0x23, 0xd6, 0x92, 0x67, 0x78, 0x8d, 0x5b, 0xae, 0xb1, 0x44 }, { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, { 0x00, 0x9a, 0x34, 0xae, 0x68, 0xf2, 0x5c, 0xc6, 0xd0, 0x4a, 0xe4, 0x7e, 0xb8, 0x22, 0x8c, 0x16 }, { 0x00, 0xf5, 0xea, 0x1f, 0xc9, 0x3c, 0x23, 0xd6, 0x92, 0x67, 0x78, 0x8d, 0x5b, 0xae, 0xb1, 0x44 }, { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, { 0x00, 0x9b, 0x36, 0xad, 0x6c, 0xf7, 0x5a, 0xc1, 0xd8, 0x43, 0xee, 0x75, 0xb4, 0x2f, 0x82, 0x19 }, { 0x00, 0xf5, 0xea, 0x1f, 0xd4, 0x21, 0x3e, 0xcb, 0xb5, 0x40, 0x5f, 0xaa, 0x61, 0x94, 0x8b, 0x7e }, { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, { 0x00, 0x9c, 0x38, 0xa4, 0x70, 0xec, 0x48, 0xd4, 0xe0, 0x7c, 0xd8, 0x44, 0x90, 0x0c, 0xa8, 0x34 }, { 0x00, 0xf5, 0xea, 0x1f, 0xd4, 0x21, 0x3e, 0xcb, 0xb5, 0x40, 0x5f, 0xaa, 0x61, 0x94, 0x8b, 0x7e }, { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, { 0x00, 0x9d, 0x3a, 0xa7, 0x74, 0xe9, 0x4e, 0xd3, 0xe8, 0x75, 0xd2, 0x4f, 0x9c, 0x01, 0xa6, 0x3b }, { 0x00, 0xf5, 0xea, 0x1f, 0xd4, 0x21, 0x3e, 0xcb, 0xa8, 0x5d, 0x42, 0xb7, 0x7c, 0x89, 0x96, 0x63 }, { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, { 0x00, 0x9e, 0x3c, 0xa2, 0x78, 0xe6, 0x44, 0xda, 0xf0, 0x6e, 0xcc, 0x52, 0x88, 0x16, 0xb4, 0x2a }, { 0x00, 0xf5, 0xea, 0x1f, 0xd4, 0x21, 0x3e, 0xcb, 0xa8, 0x5d, 0x42, 0xb7, 0x7c, 0x89, 0x96, 0x63 }, { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, { 0x00, 0x9f, 0x3e, 0xa1, 0x7c, 0xe3, 0x42, 0xdd, 0xf8, 0x67, 0xc6, 0x59, 0x84, 0x1b, 0xba, 0x25 }, { 0x00, 0xd2, 0xb9, 0x6b, 0x6f, 0xbd, 0xd6, 0x04, 0xde, 0x0c, 0x67, 0xb5, 0xb1, 0x63, 0x08, 0xda }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, { 0x00, 0xd2, 0xb9, 0x6b, 0x6f, 0xbd, 0xd6, 0x04, 0xde, 0x0c, 0x67, 0xb5, 0xb1, 0x63, 0x08, 0xda }, { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, { 0x00, 0xa1, 0x42, 0xe3, 0x84, 0x25, 0xc6, 0x67, 0x08, 0xa9, 0x4a, 0xeb, 0x8c, 0x2d, 0xce, 0x6f }, { 0x00, 0xd2, 0xb9, 0x6b, 0x6f, 0xbd, 0xd6, 0x04, 0xc3, 0x11, 0x7a, 0xa8, 0xac, 0x7e, 0x15, 0xc7 }, { 0x00, 
0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, { 0x00, 0xa2, 0x44, 0xe6, 0x88, 0x2a, 0xcc, 0x6e, 0x10, 0xb2, 0x54, 0xf6, 0x98, 0x3a, 0xdc, 0x7e }, { 0x00, 0xd2, 0xb9, 0x6b, 0x6f, 0xbd, 0xd6, 0x04, 0xc3, 0x11, 0x7a, 0xa8, 0xac, 0x7e, 0x15, 0xc7 }, { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, { 0x00, 0xa3, 0x46, 0xe5, 0x8c, 0x2f, 0xca, 0x69, 0x18, 0xbb, 0x5e, 0xfd, 0x94, 0x37, 0xd2, 0x71 }, { 0x00, 0xd2, 0xb9, 0x6b, 0x72, 0xa0, 0xcb, 0x19, 0xe4, 0x36, 0x5d, 0x8f, 0x96, 0x44, 0x2f, 0xfd }, { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, { 0x00, 0xa4, 0x48, 0xec, 0x90, 0x34, 0xd8, 0x7c, 0x20, 0x84, 0x68, 0xcc, 0xb0, 0x14, 0xf8, 0x5c }, { 0x00, 0xd2, 0xb9, 0x6b, 0x72, 0xa0, 0xcb, 0x19, 0xe4, 0x36, 0x5d, 0x8f, 0x96, 0x44, 0x2f, 0xfd }, { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, { 0x00, 0xa5, 0x4a, 0xef, 0x94, 0x31, 0xde, 0x7b, 0x28, 0x8d, 0x62, 0xc7, 0xbc, 0x19, 0xf6, 0x53 }, { 0x00, 0xd2, 0xb9, 0x6b, 0x72, 0xa0, 0xcb, 0x19, 0xf9, 0x2b, 0x40, 0x92, 0x8b, 0x59, 0x32, 0xe0 }, { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, { 0x00, 0xa6, 0x4c, 0xea, 0x98, 0x3e, 0xd4, 0x72, 0x30, 0x96, 0x7c, 0xda, 0xa8, 0x0e, 0xe4, 0x42 }, { 0x00, 0xd2, 0xb9, 0x6b, 0x72, 0xa0, 0xcb, 0x19, 0xf9, 0x2b, 0x40, 0x92, 0x8b, 0x59, 0x32, 0xe0 }, { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, { 0x00, 0xa7, 0x4e, 0xe9, 0x9c, 0x3b, 0xd2, 0x75, 0x38, 0x9f, 0x76, 0xd1, 0xa4, 0x03, 0xea, 0x4d }, { 0x00, 0xd2, 0xa4, 0x76, 0x55, 0x87, 0xf1, 0x23, 0xaa, 0x78, 0x0e, 0xdc, 0xff, 0x2d, 0x5b, 0x89 }, { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, { 0x00, 0xa8, 0x50, 0xf8, 0xa0, 0x08, 0xf0, 0x58, 0x40, 0xe8, 0x10, 0xb8, 0xe0, 0x48, 0xb0, 0x18 }, { 0x00, 0xd2, 0xa4, 0x76, 0x55, 0x87, 0xf1, 0x23, 0xaa, 0x78, 0x0e, 0xdc, 0xff, 0x2d, 0x5b, 0x89 }, { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, { 0x00, 0xa9, 0x52, 0xfb, 0xa4, 0x0d, 0xf6, 0x5f, 0x48, 0xe1, 0x1a, 0xb3, 0xec, 0x45, 0xbe, 0x17 }, { 0x00, 0xd2, 0xa4, 0x76, 0x55, 0x87, 0xf1, 0x23, 0xb7, 0x65, 0x13, 0xc1, 0xe2, 0x30, 0x46, 0x94 }, { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, { 0x00, 0xaa, 0x54, 0xfe, 0xa8, 0x02, 0xfc, 0x56, 0x50, 0xfa, 0x04, 0xae, 0xf8, 0x52, 0xac, 0x06 }, { 0x00, 0xd2, 0xa4, 0x76, 0x55, 0x87, 0xf1, 0x23, 0xb7, 0x65, 
0x13, 0xc1, 0xe2, 0x30, 0x46, 0x94 }, { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, { 0x00, 0xab, 0x56, 0xfd, 0xac, 0x07, 0xfa, 0x51, 0x58, 0xf3, 0x0e, 0xa5, 0xf4, 0x5f, 0xa2, 0x09 }, { 0x00, 0xd2, 0xa4, 0x76, 0x48, 0x9a, 0xec, 0x3e, 0x90, 0x42, 0x34, 0xe6, 0xd8, 0x0a, 0x7c, 0xae }, { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, { 0x00, 0xac, 0x58, 0xf4, 0xb0, 0x1c, 0xe8, 0x44, 0x60, 0xcc, 0x38, 0x94, 0xd0, 0x7c, 0x88, 0x24 }, { 0x00, 0xd2, 0xa4, 0x76, 0x48, 0x9a, 0xec, 0x3e, 0x90, 0x42, 0x34, 0xe6, 0xd8, 0x0a, 0x7c, 0xae }, { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, { 0x00, 0xad, 0x5a, 0xf7, 0xb4, 0x19, 0xee, 0x43, 0x68, 0xc5, 0x32, 0x9f, 0xdc, 0x71, 0x86, 0x2b }, { 0x00, 0xd2, 0xa4, 0x76, 0x48, 0x9a, 0xec, 0x3e, 0x8d, 0x5f, 0x29, 0xfb, 0xc5, 0x17, 0x61, 0xb3 }, { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, { 0x00, 0xae, 0x5c, 0xf2, 0xb8, 0x16, 0xe4, 0x4a, 0x70, 0xde, 0x2c, 0x82, 0xc8, 0x66, 0x94, 0x3a }, { 0x00, 0xd2, 0xa4, 0x76, 0x48, 0x9a, 0xec, 0x3e, 0x8d, 0x5f, 0x29, 0xfb, 0xc5, 0x17, 0x61, 0xb3 }, { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, { 0x00, 0xaf, 0x5e, 0xf1, 0xbc, 0x13, 0xe2, 0x4d, 0x78, 0xd7, 0x26, 0x89, 0xc4, 0x6b, 0x9a, 0x35 }, { 0x00, 0xcf, 0x83, 0x4c, 0x1b, 0xd4, 0x98, 0x57, 0x36, 0xf9, 0xb5, 0x7a, 0x2d, 0xe2, 0xae, 0x61 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, { 0x00, 0xcf, 0x83, 0x4c, 0x1b, 0xd4, 0x98, 0x57, 0x36, 0xf9, 0xb5, 0x7a, 0x2d, 0xe2, 0xae, 0x61 }, { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, { 0x00, 0xb1, 0x62, 0xd3, 0xc4, 0x75, 0xa6, 0x17, 0x88, 0x39, 0xea, 0x5b, 0x4c, 0xfd, 0x2e, 0x9f }, { 0x00, 0xcf, 0x83, 0x4c, 0x1b, 0xd4, 0x98, 0x57, 0x2b, 0xe4, 0xa8, 0x67, 0x30, 0xff, 0xb3, 0x7c }, { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, { 0x00, 0xb2, 0x64, 0xd6, 0xc8, 0x7a, 0xac, 0x1e, 0x90, 0x22, 0xf4, 0x46, 0x58, 0xea, 0x3c, 0x8e }, { 0x00, 0xcf, 0x83, 0x4c, 0x1b, 0xd4, 0x98, 0x57, 0x2b, 0xe4, 0xa8, 0x67, 0x30, 0xff, 0xb3, 0x7c }, { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, { 0x00, 0xb3, 0x66, 0xd5, 0xcc, 0x7f, 0xaa, 0x19, 0x98, 0x2b, 0xfe, 0x4d, 0x54, 0xe7, 0x32, 0x81 }, { 0x00, 0xcf, 
0x83, 0x4c, 0x06, 0xc9, 0x85, 0x4a, 0x0c, 0xc3, 0x8f, 0x40, 0x0a, 0xc5, 0x89, 0x46 }, { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, { 0x00, 0xb4, 0x68, 0xdc, 0xd0, 0x64, 0xb8, 0x0c, 0xa0, 0x14, 0xc8, 0x7c, 0x70, 0xc4, 0x18, 0xac }, { 0x00, 0xcf, 0x83, 0x4c, 0x06, 0xc9, 0x85, 0x4a, 0x0c, 0xc3, 0x8f, 0x40, 0x0a, 0xc5, 0x89, 0x46 }, { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, { 0x00, 0xb5, 0x6a, 0xdf, 0xd4, 0x61, 0xbe, 0x0b, 0xa8, 0x1d, 0xc2, 0x77, 0x7c, 0xc9, 0x16, 0xa3 }, { 0x00, 0xcf, 0x83, 0x4c, 0x06, 0xc9, 0x85, 0x4a, 0x11, 0xde, 0x92, 0x5d, 0x17, 0xd8, 0x94, 0x5b }, { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, { 0x00, 0xb6, 0x6c, 0xda, 0xd8, 0x6e, 0xb4, 0x02, 0xb0, 0x06, 0xdc, 0x6a, 0x68, 0xde, 0x04, 0xb2 }, { 0x00, 0xcf, 0x83, 0x4c, 0x06, 0xc9, 0x85, 0x4a, 0x11, 0xde, 0x92, 0x5d, 0x17, 0xd8, 0x94, 0x5b }, { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, { 0x00, 0xb7, 0x6e, 0xd9, 0xdc, 0x6b, 0xb2, 0x05, 0xb8, 0x0f, 0xd6, 0x61, 0x64, 0xd3, 0x0a, 0xbd }, { 0x00, 0xcf, 0x9e, 0x51, 0x21, 0xee, 0xbf, 0x70, 0x42, 0x8d, 0xdc, 0x13, 0x63, 0xac, 0xfd, 0x32 }, { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, { 0x00, 0xb8, 0x70, 0xc8, 0xe0, 0x58, 0x90, 0x28, 0xc0, 0x78, 0xb0, 0x08, 0x20, 0x98, 0x50, 0xe8 }, { 0x00, 0xcf, 0x9e, 0x51, 0x21, 0xee, 0xbf, 0x70, 0x42, 0x8d, 0xdc, 0x13, 0x63, 0xac, 0xfd, 0x32 }, { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, { 0x00, 0xb9, 0x72, 0xcb, 0xe4, 0x5d, 0x96, 0x2f, 0xc8, 0x71, 0xba, 0x03, 0x2c, 0x95, 0x5e, 0xe7 }, { 0x00, 0xcf, 0x9e, 0x51, 0x21, 0xee, 0xbf, 0x70, 0x5f, 0x90, 0xc1, 0x0e, 0x7e, 0xb1, 0xe0, 0x2f }, { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, { 0x00, 0xba, 0x74, 0xce, 0xe8, 0x52, 0x9c, 0x26, 0xd0, 0x6a, 0xa4, 0x1e, 0x38, 0x82, 0x4c, 0xf6 }, { 0x00, 0xcf, 0x9e, 0x51, 0x21, 0xee, 0xbf, 0x70, 0x5f, 0x90, 0xc1, 0x0e, 0x7e, 0xb1, 0xe0, 0x2f }, { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, { 0x00, 0xbb, 0x76, 0xcd, 0xec, 0x57, 0x9a, 0x21, 0xd8, 0x63, 0xae, 0x15, 0x34, 0x8f, 0x42, 0xf9 }, { 0x00, 0xcf, 0x9e, 0x51, 0x3c, 0xf3, 0xa2, 0x6d, 0x78, 0xb7, 0xe6, 0x29, 0x44, 0x8b, 0xda, 0x15 }, { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, { 0x00, 0xbc, 0x78, 0xc4, 0xf0, 0x4c, 0x88, 0x34, 0xe0, 0x5c, 0x98, 
0x24, 0x10, 0xac, 0x68, 0xd4 }, { 0x00, 0xcf, 0x9e, 0x51, 0x3c, 0xf3, 0xa2, 0x6d, 0x78, 0xb7, 0xe6, 0x29, 0x44, 0x8b, 0xda, 0x15 }, { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, { 0x00, 0xbd, 0x7a, 0xc7, 0xf4, 0x49, 0x8e, 0x33, 0xe8, 0x55, 0x92, 0x2f, 0x1c, 0xa1, 0x66, 0xdb }, { 0x00, 0xcf, 0x9e, 0x51, 0x3c, 0xf3, 0xa2, 0x6d, 0x65, 0xaa, 0xfb, 0x34, 0x59, 0x96, 0xc7, 0x08 }, { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, { 0x00, 0xbe, 0x7c, 0xc2, 0xf8, 0x46, 0x84, 0x3a, 0xf0, 0x4e, 0x8c, 0x32, 0x08, 0xb6, 0x74, 0xca }, { 0x00, 0xcf, 0x9e, 0x51, 0x3c, 0xf3, 0xa2, 0x6d, 0x65, 0xaa, 0xfb, 0x34, 0x59, 0x96, 0xc7, 0x08 }, { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, { 0x00, 0xbf, 0x7e, 0xc1, 0xfc, 0x43, 0x82, 0x3d, 0xf8, 0x47, 0x86, 0x39, 0x04, 0xbb, 0x7a, 0xc5 }, { 0x00, 0x9c, 0x25, 0xb9, 0x4a, 0xd6, 0x6f, 0xf3, 0x94, 0x08, 0xb1, 0x2d, 0xde, 0x42, 0xfb, 0x67 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, { 0x00, 0x9c, 0x25, 0xb9, 0x4a, 0xd6, 0x6f, 0xf3, 0x94, 0x08, 0xb1, 0x2d, 0xde, 0x42, 0xfb, 0x67 }, { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, { 0x00, 0xc1, 0x82, 0x43, 0x04, 0xc5, 0x86, 0x47, 0x08, 0xc9, 0x8a, 0x4b, 0x0c, 0xcd, 0x8e, 0x4f }, { 0x00, 0x9c, 0x25, 0xb9, 0x4a, 0xd6, 0x6f, 0xf3, 0x89, 0x15, 0xac, 0x30, 0xc3, 0x5f, 0xe6, 0x7a }, { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, { 0x00, 0xc2, 0x84, 0x46, 0x08, 0xca, 0x8c, 0x4e, 0x10, 0xd2, 0x94, 0x56, 0x18, 0xda, 0x9c, 0x5e }, { 0x00, 0x9c, 0x25, 0xb9, 0x4a, 0xd6, 0x6f, 0xf3, 0x89, 0x15, 0xac, 0x30, 0xc3, 0x5f, 0xe6, 0x7a }, { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, { 0x00, 0xc3, 0x86, 0x45, 0x0c, 0xcf, 0x8a, 0x49, 0x18, 0xdb, 0x9e, 0x5d, 0x14, 0xd7, 0x92, 0x51 }, { 0x00, 0x9c, 0x25, 0xb9, 0x57, 0xcb, 0x72, 0xee, 0xae, 0x32, 0x8b, 0x17, 0xf9, 0x65, 0xdc, 0x40 }, { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, { 0x00, 0xc4, 0x88, 0x4c, 0x10, 0xd4, 0x98, 0x5c, 0x20, 0xe4, 0xa8, 0x6c, 0x30, 0xf4, 0xb8, 0x7c }, { 0x00, 0x9c, 0x25, 0xb9, 0x57, 0xcb, 0x72, 0xee, 0xae, 0x32, 0x8b, 0x17, 0xf9, 0x65, 0xdc, 0x40 }, { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, { 0x00, 0xc5, 0x8a, 
0x4f, 0x14, 0xd1, 0x9e, 0x5b, 0x28, 0xed, 0xa2, 0x67, 0x3c, 0xf9, 0xb6, 0x73 }, { 0x00, 0x9c, 0x25, 0xb9, 0x57, 0xcb, 0x72, 0xee, 0xb3, 0x2f, 0x96, 0x0a, 0xe4, 0x78, 0xc1, 0x5d }, { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, { 0x00, 0xc6, 0x8c, 0x4a, 0x18, 0xde, 0x94, 0x52, 0x30, 0xf6, 0xbc, 0x7a, 0x28, 0xee, 0xa4, 0x62 }, { 0x00, 0x9c, 0x25, 0xb9, 0x57, 0xcb, 0x72, 0xee, 0xb3, 0x2f, 0x96, 0x0a, 0xe4, 0x78, 0xc1, 0x5d }, { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, { 0x00, 0xc7, 0x8e, 0x49, 0x1c, 0xdb, 0x92, 0x55, 0x38, 0xff, 0xb6, 0x71, 0x24, 0xe3, 0xaa, 0x6d }, { 0x00, 0x9c, 0x38, 0xa4, 0x70, 0xec, 0x48, 0xd4, 0xe0, 0x7c, 0xd8, 0x44, 0x90, 0x0c, 0xa8, 0x34 }, { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, { 0x00, 0xc8, 0x90, 0x58, 0x20, 0xe8, 0xb0, 0x78, 0x40, 0x88, 0xd0, 0x18, 0x60, 0xa8, 0xf0, 0x38 }, { 0x00, 0x9c, 0x38, 0xa4, 0x70, 0xec, 0x48, 0xd4, 0xe0, 0x7c, 0xd8, 0x44, 0x90, 0x0c, 0xa8, 0x34 }, { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, { 0x00, 0xc9, 0x92, 0x5b, 0x24, 0xed, 0xb6, 0x7f, 0x48, 0x81, 0xda, 0x13, 0x6c, 0xa5, 0xfe, 0x37 }, { 0x00, 0x9c, 0x38, 0xa4, 0x70, 0xec, 0x48, 0xd4, 0xfd, 0x61, 0xc5, 0x59, 0x8d, 0x11, 0xb5, 0x29 }, { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, { 0x00, 0xca, 0x94, 0x5e, 0x28, 0xe2, 0xbc, 0x76, 0x50, 0x9a, 0xc4, 0x0e, 0x78, 0xb2, 0xec, 0x26 }, { 0x00, 0x9c, 0x38, 0xa4, 0x70, 0xec, 0x48, 0xd4, 0xfd, 0x61, 0xc5, 0x59, 0x8d, 0x11, 0xb5, 0x29 }, { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, { 0x00, 0xcb, 0x96, 0x5d, 0x2c, 0xe7, 0xba, 0x71, 0x58, 0x93, 0xce, 0x05, 0x74, 0xbf, 0xe2, 0x29 }, { 0x00, 0x9c, 0x38, 0xa4, 0x6d, 0xf1, 0x55, 0xc9, 0xda, 0x46, 0xe2, 0x7e, 0xb7, 0x2b, 0x8f, 0x13 }, { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, { 0x00, 0xcc, 0x98, 0x54, 0x30, 0xfc, 0xa8, 0x64, 0x60, 0xac, 0xf8, 0x34, 0x50, 0x9c, 0xc8, 0x04 }, { 0x00, 0x9c, 0x38, 0xa4, 0x6d, 0xf1, 0x55, 0xc9, 0xda, 0x46, 0xe2, 0x7e, 0xb7, 0x2b, 0x8f, 0x13 }, { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, { 0x00, 0xcd, 0x9a, 0x57, 0x34, 0xf9, 0xae, 0x63, 0x68, 0xa5, 0xf2, 0x3f, 0x5c, 0x91, 0xc6, 0x0b }, { 0x00, 0x9c, 0x38, 0xa4, 0x6d, 0xf1, 0x55, 0xc9, 0xc7, 0x5b, 0xff, 0x63, 0xaa, 0x36, 0x92, 0x0e }, { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x4e, 0x4e, 0x53, 0x53, 
0x69, 0x69, 0x74, 0x74 }, { 0x00, 0xce, 0x9c, 0x52, 0x38, 0xf6, 0xa4, 0x6a, 0x70, 0xbe, 0xec, 0x22, 0x48, 0x86, 0xd4, 0x1a }, { 0x00, 0x9c, 0x38, 0xa4, 0x6d, 0xf1, 0x55, 0xc9, 0xc7, 0x5b, 0xff, 0x63, 0xaa, 0x36, 0x92, 0x0e }, { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, { 0x00, 0xcf, 0x9e, 0x51, 0x3c, 0xf3, 0xa2, 0x6d, 0x78, 0xb7, 0xe6, 0x29, 0x44, 0x8b, 0xda, 0x15 }, { 0x00, 0x81, 0x1f, 0x9e, 0x3e, 0xbf, 0x21, 0xa0, 0x7c, 0xfd, 0x63, 0xe2, 0x42, 0xc3, 0x5d, 0xdc }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, { 0x00, 0x81, 0x1f, 0x9e, 0x3e, 0xbf, 0x21, 0xa0, 0x7c, 0xfd, 0x63, 0xe2, 0x42, 0xc3, 0x5d, 0xdc }, { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, { 0x00, 0xd1, 0xa2, 0x73, 0x44, 0x95, 0xe6, 0x37, 0x88, 0x59, 0x2a, 0xfb, 0xcc, 0x1d, 0x6e, 0xbf }, { 0x00, 0x81, 0x1f, 0x9e, 0x3e, 0xbf, 0x21, 0xa0, 0x61, 0xe0, 0x7e, 0xff, 0x5f, 0xde, 0x40, 0xc1 }, { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, { 0x00, 0xd2, 0xa4, 0x76, 0x48, 0x9a, 0xec, 0x3e, 0x90, 0x42, 0x34, 0xe6, 0xd8, 0x0a, 0x7c, 0xae }, { 0x00, 0x81, 0x1f, 0x9e, 0x3e, 0xbf, 0x21, 0xa0, 0x61, 0xe0, 0x7e, 0xff, 0x5f, 0xde, 0x40, 0xc1 }, { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, { 0x00, 0xd3, 0xa6, 0x75, 0x4c, 0x9f, 0xea, 0x39, 0x98, 0x4b, 0x3e, 0xed, 0xd4, 0x07, 0x72, 0xa1 }, { 0x00, 0x81, 0x1f, 0x9e, 0x23, 0xa2, 0x3c, 0xbd, 0x46, 0xc7, 0x59, 0xd8, 0x65, 0xe4, 0x7a, 0xfb }, { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, { 0x00, 0xd4, 0xa8, 0x7c, 0x50, 0x84, 0xf8, 0x2c, 0xa0, 0x74, 0x08, 0xdc, 0xf0, 0x24, 0x58, 0x8c }, { 0x00, 0x81, 0x1f, 0x9e, 0x23, 0xa2, 0x3c, 0xbd, 0x46, 0xc7, 0x59, 0xd8, 0x65, 0xe4, 0x7a, 0xfb }, { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, { 0x00, 0xd5, 0xaa, 0x7f, 0x54, 0x81, 0xfe, 0x2b, 0xa8, 0x7d, 0x02, 0xd7, 0xfc, 0x29, 0x56, 0x83 }, { 0x00, 0x81, 0x1f, 0x9e, 0x23, 0xa2, 0x3c, 0xbd, 0x5b, 0xda, 0x44, 0xc5, 0x78, 0xf9, 0x67, 0xe6 }, { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, { 0x00, 0xd6, 0xac, 0x7a, 0x58, 0x8e, 0xf4, 0x22, 0xb0, 0x66, 0x1c, 0xca, 0xe8, 0x3e, 0x44, 0x92 }, { 0x00, 0x81, 0x1f, 0x9e, 0x23, 0xa2, 0x3c, 0xbd, 0x5b, 0xda, 0x44, 0xc5, 0x78, 0xf9, 0x67, 0xe6 }, { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, { 0x00, 0x00, 0x1d, 0x1d, 
0x27, 0x27, 0x3a, 0x3a, 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, { 0x00, 0xd7, 0xae, 0x79, 0x5c, 0x8b, 0xf2, 0x25, 0xb8, 0x6f, 0x16, 0xc1, 0xe4, 0x33, 0x4a, 0x9d }, { 0x00, 0x81, 0x02, 0x83, 0x04, 0x85, 0x06, 0x87, 0x08, 0x89, 0x0a, 0x8b, 0x0c, 0x8d, 0x0e, 0x8f }, { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, { 0x00, 0xd8, 0xb0, 0x68, 0x60, 0xb8, 0xd0, 0x08, 0xc0, 0x18, 0x70, 0xa8, 0xa0, 0x78, 0x10, 0xc8 }, { 0x00, 0x81, 0x02, 0x83, 0x04, 0x85, 0x06, 0x87, 0x08, 0x89, 0x0a, 0x8b, 0x0c, 0x8d, 0x0e, 0x8f }, { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, { 0x00, 0xd9, 0xb2, 0x6b, 0x64, 0xbd, 0xd6, 0x0f, 0xc8, 0x11, 0x7a, 0xa3, 0xac, 0x75, 0x1e, 0xc7 }, { 0x00, 0x81, 0x02, 0x83, 0x04, 0x85, 0x06, 0x87, 0x15, 0x94, 0x17, 0x96, 0x11, 0x90, 0x13, 0x92 }, { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, { 0x00, 0xda, 0xb4, 0x6e, 0x68, 0xb2, 0xdc, 0x06, 0xd0, 0x0a, 0x64, 0xbe, 0xb8, 0x62, 0x0c, 0xd6 }, { 0x00, 0x81, 0x02, 0x83, 0x04, 0x85, 0x06, 0x87, 0x15, 0x94, 0x17, 0x96, 0x11, 0x90, 0x13, 0x92 }, { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, { 0x00, 0xdb, 0xb6, 0x6d, 0x6c, 0xb7, 0xda, 0x01, 0xd8, 0x03, 0x6e, 0xb5, 0xb4, 0x6f, 0x02, 0xd9 }, { 0x00, 0x81, 0x02, 0x83, 0x19, 0x98, 0x1b, 0x9a, 0x32, 0xb3, 0x30, 0xb1, 0x2b, 0xaa, 0x29, 0xa8 }, { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, { 0x00, 0xdc, 0xb8, 0x64, 0x70, 0xac, 0xc8, 0x14, 0xe0, 0x3c, 0x58, 0x84, 0x90, 0x4c, 0x28, 0xf4 }, { 0x00, 0x81, 0x02, 0x83, 0x19, 0x98, 0x1b, 0x9a, 0x32, 0xb3, 0x30, 0xb1, 0x2b, 0xaa, 0x29, 0xa8 }, { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, { 0x00, 0xdd, 0xba, 0x67, 0x74, 0xa9, 0xce, 0x13, 0xe8, 0x35, 0x52, 0x8f, 0x9c, 0x41, 0x26, 0xfb }, { 0x00, 0x81, 0x02, 0x83, 0x19, 0x98, 0x1b, 0x9a, 0x2f, 0xae, 0x2d, 0xac, 0x36, 0xb7, 0x34, 0xb5 }, { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, { 0x00, 0xde, 0xbc, 0x62, 0x78, 0xa6, 0xc4, 0x1a, 0xf0, 0x2e, 0x4c, 0x92, 0x88, 0x56, 0x34, 0xea }, { 0x00, 0x81, 0x02, 0x83, 0x19, 0x98, 0x1b, 0x9a, 0x2f, 0xae, 0x2d, 0xac, 0x36, 0xb7, 0x34, 0xb5 }, { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, { 0x00, 0xdf, 0xbe, 0x61, 0x7c, 0xa3, 0xc2, 0x1d, 0xf8, 0x27, 0x46, 0x99, 0x84, 0x5b, 0x3a, 0xe5 }, { 0x00, 0xa6, 0x51, 0xf7, 0xa2, 0x04, 0xf3, 0x55, 0x59, 0xff, 0x08, 0xae, 0xfb, 0x5d, 0xaa, 0x0c }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, { 0x00, 0xa6, 0x51, 0xf7, 0xa2, 0x04, 0xf3, 0x55, 0x59, 0xff, 0x08, 0xae, 0xfb, 0x5d, 0xaa, 0x0c }, { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, { 0x00, 0xe1, 0xc2, 0x23, 0x84, 0x65, 0x46, 0xa7, 0x08, 0xe9, 0xca, 0x2b, 0x8c, 0x6d, 0x4e, 0xaf }, { 0x00, 0xa6, 0x51, 0xf7, 0xa2, 0x04, 0xf3, 0x55, 0x44, 0xe2, 0x15, 0xb3, 0xe6, 0x40, 0xb7, 0x11 }, { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, { 0x00, 0xe2, 0xc4, 0x26, 0x88, 0x6a, 0x4c, 0xae, 0x10, 0xf2, 0xd4, 0x36, 0x98, 0x7a, 0x5c, 0xbe }, { 0x00, 0xa6, 0x51, 0xf7, 0xa2, 0x04, 0xf3, 0x55, 0x44, 0xe2, 0x15, 0xb3, 0xe6, 0x40, 0xb7, 0x11 }, { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, { 0x00, 0xe3, 0xc6, 0x25, 0x8c, 0x6f, 0x4a, 0xa9, 0x18, 0xfb, 0xde, 0x3d, 0x94, 0x77, 0x52, 0xb1 }, { 0x00, 0xa6, 0x51, 0xf7, 0xbf, 0x19, 0xee, 0x48, 0x63, 0xc5, 0x32, 0x94, 0xdc, 0x7a, 0x8d, 0x2b }, { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, { 0x00, 0xe4, 0xc8, 0x2c, 0x90, 0x74, 0x58, 0xbc, 0x20, 0xc4, 0xe8, 0x0c, 0xb0, 0x54, 0x78, 0x9c }, { 0x00, 0xa6, 0x51, 0xf7, 0xbf, 0x19, 0xee, 0x48, 0x63, 0xc5, 0x32, 0x94, 0xdc, 0x7a, 0x8d, 0x2b }, { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, { 0x00, 0xe5, 0xca, 0x2f, 0x94, 0x71, 0x5e, 0xbb, 0x28, 0xcd, 0xe2, 0x07, 0xbc, 0x59, 0x76, 0x93 }, { 0x00, 0xa6, 0x51, 0xf7, 0xbf, 0x19, 0xee, 0x48, 0x7e, 0xd8, 0x2f, 0x89, 0xc1, 0x67, 0x90, 0x36 }, { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, { 0x00, 0xe6, 0xcc, 0x2a, 0x98, 0x7e, 0x54, 0xb2, 0x30, 0xd6, 0xfc, 0x1a, 0xa8, 0x4e, 0x64, 0x82 }, { 0x00, 0xa6, 0x51, 0xf7, 0xbf, 0x19, 0xee, 0x48, 0x7e, 0xd8, 0x2f, 0x89, 0xc1, 0x67, 0x90, 0x36 }, { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, { 0x00, 0xe7, 0xce, 0x29, 0x9c, 0x7b, 0x52, 0xb5, 0x38, 0xdf, 0xf6, 0x11, 0xa4, 0x43, 0x6a, 0x8d }, { 0x00, 0xa6, 0x4c, 0xea, 0x98, 0x3e, 0xd4, 0x72, 0x2d, 0x8b, 0x61, 0xc7, 0xb5, 0x13, 0xf9, 0x5f }, { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, { 0x00, 0xe8, 0xd0, 0x38, 0xa0, 0x48, 0x70, 0x98, 0x40, 0xa8, 0x90, 0x78, 0xe0, 0x08, 0x30, 0xd8 }, { 0x00, 0xa6, 0x4c, 0xea, 0x98, 0x3e, 0xd4, 0x72, 0x2d, 0x8b, 0x61, 0xc7, 0xb5, 0x13, 0xf9, 0x5f }, { 0x00, 0x90, 0x20, 0xb0, 0x40, 
0xd0, 0x60, 0xf0, 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, { 0x00, 0xe9, 0xd2, 0x3b, 0xa4, 0x4d, 0x76, 0x9f, 0x48, 0xa1, 0x9a, 0x73, 0xec, 0x05, 0x3e, 0xd7 }, { 0x00, 0xa6, 0x4c, 0xea, 0x98, 0x3e, 0xd4, 0x72, 0x30, 0x96, 0x7c, 0xda, 0xa8, 0x0e, 0xe4, 0x42 }, { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, { 0x00, 0xea, 0xd4, 0x3e, 0xa8, 0x42, 0x7c, 0x96, 0x50, 0xba, 0x84, 0x6e, 0xf8, 0x12, 0x2c, 0xc6 }, { 0x00, 0xa6, 0x4c, 0xea, 0x98, 0x3e, 0xd4, 0x72, 0x30, 0x96, 0x7c, 0xda, 0xa8, 0x0e, 0xe4, 0x42 }, { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, { 0x00, 0xeb, 0xd6, 0x3d, 0xac, 0x47, 0x7a, 0x91, 0x58, 0xb3, 0x8e, 0x65, 0xf4, 0x1f, 0x22, 0xc9 }, { 0x00, 0xa6, 0x4c, 0xea, 0x85, 0x23, 0xc9, 0x6f, 0x17, 0xb1, 0x5b, 0xfd, 0x92, 0x34, 0xde, 0x78 }, { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, { 0x00, 0xec, 0xd8, 0x34, 0xb0, 0x5c, 0x68, 0x84, 0x60, 0x8c, 0xb8, 0x54, 0xd0, 0x3c, 0x08, 0xe4 }, { 0x00, 0xa6, 0x4c, 0xea, 0x85, 0x23, 0xc9, 0x6f, 0x17, 0xb1, 0x5b, 0xfd, 0x92, 0x34, 0xde, 0x78 }, { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, { 0x00, 0xed, 0xda, 0x37, 0xb4, 0x59, 0x6e, 0x83, 0x68, 0x85, 0xb2, 0x5f, 0xdc, 0x31, 0x06, 0xeb }, { 0x00, 0xa6, 0x4c, 0xea, 0x85, 0x23, 0xc9, 0x6f, 0x0a, 0xac, 0x46, 0xe0, 0x8f, 0x29, 0xc3, 0x65 }, { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, { 0x00, 0xee, 0xdc, 0x32, 0xb8, 0x56, 0x64, 0x8a, 0x70, 0x9e, 0xac, 0x42, 0xc8, 0x26, 0x14, 0xfa }, { 0x00, 0xa6, 0x4c, 0xea, 0x85, 0x23, 0xc9, 0x6f, 0x0a, 0xac, 0x46, 0xe0, 0x8f, 0x29, 0xc3, 0x65 }, { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, { 0x00, 0xef, 0xde, 0x31, 0xbc, 0x53, 0x62, 0x8d, 0x78, 0x97, 0xa6, 0x49, 0xc4, 0x2b, 0x1a, 0xf5 }, { 0x00, 0xbb, 0x6b, 0xd0, 0xd6, 0x6d, 0xbd, 0x06, 0xb1, 0x0a, 0xda, 0x61, 0x67, 0xdc, 0x0c, 0xb7 }, { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, { 0x00, 0xbb, 0x6b, 0xd0, 0xd6, 0x6d, 0xbd, 0x06, 0xb1, 0x0a, 0xda, 0x61, 0x67, 0xdc, 0x0c, 0xb7 }, { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, { 0x00, 0xf1, 0xe2, 0x13, 0xc4, 0x35, 0x26, 0xd7, 0x88, 0x79, 0x6a, 0x9b, 0x4c, 0xbd, 0xae, 0x5f }, { 0x00, 0xbb, 0x6b, 0xd0, 0xd6, 0x6d, 0xbd, 0x06, 0xac, 0x17, 0xc7, 0x7c, 0x7a, 0xc1, 
0x11, 0xaa }, { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, { 0x00, 0xf2, 0xe4, 0x16, 0xc8, 0x3a, 0x2c, 0xde, 0x90, 0x62, 0x74, 0x86, 0x58, 0xaa, 0xbc, 0x4e }, { 0x00, 0xbb, 0x6b, 0xd0, 0xd6, 0x6d, 0xbd, 0x06, 0xac, 0x17, 0xc7, 0x7c, 0x7a, 0xc1, 0x11, 0xaa }, { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, { 0x00, 0xf3, 0xe6, 0x15, 0xcc, 0x3f, 0x2a, 0xd9, 0x98, 0x6b, 0x7e, 0x8d, 0x54, 0xa7, 0xb2, 0x41 }, { 0x00, 0xbb, 0x6b, 0xd0, 0xcb, 0x70, 0xa0, 0x1b, 0x8b, 0x30, 0xe0, 0x5b, 0x40, 0xfb, 0x2b, 0x90 }, { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, { 0x00, 0xf4, 0xe8, 0x1c, 0xd0, 0x24, 0x38, 0xcc, 0xa0, 0x54, 0x48, 0xbc, 0x70, 0x84, 0x98, 0x6c }, { 0x00, 0xbb, 0x6b, 0xd0, 0xcb, 0x70, 0xa0, 0x1b, 0x8b, 0x30, 0xe0, 0x5b, 0x40, 0xfb, 0x2b, 0x90 }, { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, { 0x00, 0xf5, 0xea, 0x1f, 0xd4, 0x21, 0x3e, 0xcb, 0xa8, 0x5d, 0x42, 0xb7, 0x7c, 0x89, 0x96, 0x63 }, { 0x00, 0xbb, 0x6b, 0xd0, 0xcb, 0x70, 0xa0, 0x1b, 0x96, 0x2d, 0xfd, 0x46, 0x5d, 0xe6, 0x36, 0x8d }, { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, { 0x00, 0xf6, 0xec, 0x1a, 0xd8, 0x2e, 0x34, 0xc2, 0xb0, 0x46, 0x5c, 0xaa, 0x68, 0x9e, 0x84, 0x72 }, { 0x00, 0xbb, 0x6b, 0xd0, 0xcb, 0x70, 0xa0, 0x1b, 0x96, 0x2d, 0xfd, 0x46, 0x5d, 0xe6, 0x36, 0x8d }, { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, { 0x00, 0xf7, 0xee, 0x19, 0xdc, 0x2b, 0x32, 0xc5, 0xb8, 0x4f, 0x56, 0xa1, 0x64, 0x93, 0x8a, 0x7d }, { 0x00, 0xbb, 0x76, 0xcd, 0xec, 0x57, 0x9a, 0x21, 0xc5, 0x7e, 0xb3, 0x08, 0x29, 0x92, 0x5f, 0xe4 }, { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, { 0x00, 0xf8, 0xf0, 0x08, 0xe0, 0x18, 0x10, 0xe8, 0xc0, 0x38, 0x30, 0xc8, 0x20, 0xd8, 0xd0, 0x28 }, { 0x00, 0xbb, 0x76, 0xcd, 0xec, 0x57, 0x9a, 0x21, 0xc5, 0x7e, 0xb3, 0x08, 0x29, 0x92, 0x5f, 0xe4 }, { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, { 0x00, 0xf9, 0xf2, 0x0b, 0xe4, 0x1d, 0x16, 0xef, 0xc8, 0x31, 0x3a, 0xc3, 0x2c, 0xd5, 0xde, 0x27 }, { 0x00, 0xbb, 0x76, 0xcd, 0xec, 0x57, 0x9a, 0x21, 0xd8, 0x63, 0xae, 0x15, 0x34, 0x8f, 0x42, 0xf9 }, { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, { 0x00, 0xfa, 0xf4, 0x0e, 0xe8, 0x12, 0x1c, 0xe6, 0xd0, 0x2a, 0x24, 0xde, 0x38, 0xc2, 0xcc, 0x36 }, { 0x00, 0xbb, 0x76, 0xcd, 0xec, 0x57, 
0x9a, 0x21, 0xd8, 0x63, 0xae, 0x15, 0x34, 0x8f, 0x42, 0xf9 }, { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, { 0x00, 0xfb, 0xf6, 0x0d, 0xec, 0x17, 0x1a, 0xe1, 0xd8, 0x23, 0x2e, 0xd5, 0x34, 0xcf, 0xc2, 0x39 }, { 0x00, 0xbb, 0x76, 0xcd, 0xf1, 0x4a, 0x87, 0x3c, 0xff, 0x44, 0x89, 0x32, 0x0e, 0xb5, 0x78, 0xc3 }, { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, { 0x00, 0xfc, 0xf8, 0x04, 0xf0, 0x0c, 0x08, 0xf4, 0xe0, 0x1c, 0x18, 0xe4, 0x10, 0xec, 0xe8, 0x14 }, { 0x00, 0xbb, 0x76, 0xcd, 0xf1, 0x4a, 0x87, 0x3c, 0xff, 0x44, 0x89, 0x32, 0x0e, 0xb5, 0x78, 0xc3 }, { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, { 0x00, 0xfd, 0xfa, 0x07, 0xf4, 0x09, 0x0e, 0xf3, 0xe8, 0x15, 0x12, 0xef, 0x1c, 0xe1, 0xe6, 0x1b }, { 0x00, 0xbb, 0x76, 0xcd, 0xf1, 0x4a, 0x87, 0x3c, 0xe2, 0x59, 0x94, 0x2f, 0x13, 0xa8, 0x65, 0xde }, { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, { 0x00, 0xfe, 0xfc, 0x02, 0xf8, 0x06, 0x04, 0xfa, 0xf0, 0x0e, 0x0c, 0xf2, 0x08, 0xf6, 0xf4, 0x0a }, { 0x00, 0xbb, 0x76, 0xcd, 0xf1, 0x4a, 0x87, 0x3c, 0xe2, 0x59, 0x94, 0x2f, 0x13, 0xa8, 0x65, 0xde }, { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, { 0x00, 0xff, 0xfe, 0x01, 0xfc, 0x03, 0x02, 0xfd, 0xf8, 0x07, 0x06, 0xf9, 0x04, 0xfb, 0xfa, 0x05 } }; - +/* END CSTYLED */ #endif /* defined(HAVE_SSSE3) || defined(HAVE_AVX2) || defined(HAVE_AVX512BW) */ #endif /* defined(__x86_64) */ diff --git a/module/zfs/zfs_ctldir.c b/module/zfs/zfs_ctldir.c index d0e2f7ee0409..e340462fb91a 100644 --- a/module/zfs/zfs_ctldir.c +++ b/module/zfs/zfs_ctldir.c @@ -1,1273 +1,1273 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (C) 2011 Lawrence Livermore National Security, LLC. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * LLNL-CODE-403049. * Rewritten for Linux by: * Rohan Puri * Brian Behlendorf * Copyright (c) 2013 by Delphix. All rights reserved. * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved. */ /* * ZFS control directory (a.k.a. 
".zfs") * * This directory provides a common location for all ZFS meta-objects. * Currently, this is only the 'snapshot' and 'shares' directory, but this may * expand in the future. The elements are built dynamically, as the hierarchy * does not actually exist on disk. * * For 'snapshot', we don't want to have all snapshots always mounted, because * this would take up a huge amount of space in /etc/mnttab. We have three * types of objects: * * ctldir ------> snapshotdir -------> snapshot * | * | * V * mounted fs * * The 'snapshot' node contains just enough information to lookup '..' and act * as a mountpoint for the snapshot. Whenever we lookup a specific snapshot, we * perform an automount of the underlying filesystem and return the * corresponding inode. * * All mounts are handled automatically by an user mode helper which invokes * the mount mount procedure. Unmounts are handled by allowing the mount * point to expire so the kernel may automatically unmount it. * * The '.zfs', '.zfs/snapshot', and all directories created under * '.zfs/snapshot' (ie: '.zfs/snapshot/') all share the same * share the same zfs_sb_t as the head filesystem (what '.zfs' lives under). * * File systems mounted on top of the '.zfs/snapshot/' paths * (ie: snapshots) are complete ZFS filesystems and have their own unique * zfs_sb_t. However, the fsid reported by these mounts will be the same * as that used by the parent zfs_sb_t to make NFS happy. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_namecheck.h" /* * Two AVL trees are maintained which contain all currently automounted * snapshots. Every automounted snapshots maps to a single zfs_snapentry_t * entry which MUST: * * - be attached to both trees, and * - be unique, no duplicate entries are allowed. * * The zfs_snapshots_by_name tree is indexed by the full dataset name * while the zfs_snapshots_by_objsetid tree is indexed by the unique * objsetid. This allows for fast lookups either by name or objsetid. */ static avl_tree_t zfs_snapshots_by_name; static avl_tree_t zfs_snapshots_by_objsetid; static krwlock_t zfs_snapshot_lock; /* * Control Directory Tunables (.zfs) */ int zfs_expire_snapshot = ZFSCTL_EXPIRE_SNAPSHOT; int zfs_admin_snapshot = 1; typedef struct { char *se_name; /* full snapshot name */ char *se_path; /* full mount path */ spa_t *se_spa; /* pool spa */ uint64_t se_objsetid; /* snapshot objset id */ struct dentry *se_root_dentry; /* snapshot root dentry */ taskqid_t se_taskqid; /* scheduled unmount taskqid */ avl_node_t se_node_name; /* zfs_snapshots_by_name link */ avl_node_t se_node_objsetid; /* zfs_snapshots_by_objsetid link */ refcount_t se_refcount; /* reference count */ } zfs_snapentry_t; static void zfsctl_snapshot_unmount_delay_impl(zfs_snapentry_t *se, int delay); /* * Allocate a new zfs_snapentry_t being careful to make a copy of the * the snapshot name and provided mount point. No reference is taken. 
*/ static zfs_snapentry_t * zfsctl_snapshot_alloc(char *full_name, char *full_path, spa_t *spa, uint64_t objsetid, struct dentry *root_dentry) { zfs_snapentry_t *se; se = kmem_zalloc(sizeof (zfs_snapentry_t), KM_SLEEP); se->se_name = strdup(full_name); se->se_path = strdup(full_path); se->se_spa = spa; se->se_objsetid = objsetid; se->se_root_dentry = root_dentry; se->se_taskqid = TASKQID_INVALID; refcount_create(&se->se_refcount); return (se); }
/* * Free a zfs_snapentry_t; the caller must ensure there are no active * references. */ static void zfsctl_snapshot_free(zfs_snapentry_t *se) { refcount_destroy(&se->se_refcount); strfree(se->se_name); strfree(se->se_path); kmem_free(se, sizeof (zfs_snapentry_t)); }
/* * Hold a reference on the zfs_snapentry_t. */ static void zfsctl_snapshot_hold(zfs_snapentry_t *se) { refcount_add(&se->se_refcount, NULL); }
/* * Release a reference on the zfs_snapentry_t. When the number of * references drops to zero the structure will be freed. */ static void zfsctl_snapshot_rele(zfs_snapentry_t *se) { if (refcount_remove(&se->se_refcount, NULL) == 0) zfsctl_snapshot_free(se); }
/* * Add a zfs_snapentry_t to both the zfs_snapshots_by_name and * zfs_snapshots_by_objsetid trees. While the zfs_snapentry_t is part * of the trees a reference is held. */ static void zfsctl_snapshot_add(zfs_snapentry_t *se) { ASSERT(RW_WRITE_HELD(&zfs_snapshot_lock)); refcount_add(&se->se_refcount, NULL); avl_add(&zfs_snapshots_by_name, se); avl_add(&zfs_snapshots_by_objsetid, se); }
/* * Remove a zfs_snapentry_t from both the zfs_snapshots_by_name and * zfs_snapshots_by_objsetid trees. Upon removal a reference is dropped; * this can result in the structure being freed if that was the last * remaining reference. */ static void zfsctl_snapshot_remove(zfs_snapentry_t *se) { ASSERT(RW_WRITE_HELD(&zfs_snapshot_lock)); avl_remove(&zfs_snapshots_by_name, se); avl_remove(&zfs_snapshots_by_objsetid, se); zfsctl_snapshot_rele(se); }
/* * Snapshot name comparison function for the zfs_snapshots_by_name tree. */ static int snapentry_compare_by_name(const void *a, const void *b) { const zfs_snapentry_t *se_a = a; const zfs_snapentry_t *se_b = b; int ret; ret = strcmp(se_a->se_name, se_b->se_name); if (ret < 0) return (-1); else if (ret > 0) return (1); else return (0); }
/* * Snapshot objsetid comparison function for the zfs_snapshots_by_objsetid * tree. */ static int snapentry_compare_by_objsetid(const void *a, const void *b) { const zfs_snapentry_t *se_a = a; const zfs_snapentry_t *se_b = b; if (se_a->se_spa != se_b->se_spa) return ((ulong_t)se_a->se_spa < (ulong_t)se_b->se_spa ? -1 : 1); if (se_a->se_objsetid < se_b->se_objsetid) return (-1); else if (se_a->se_objsetid > se_b->se_objsetid) return (1); else return (0); }
/* * Find a zfs_snapentry_t in zfs_snapshots_by_name. If the snapname * is found a pointer to the zfs_snapentry_t is returned and a reference * taken on the structure. The caller is responsible for dropping the * reference with zfsctl_snapshot_rele(). If the snapname is not found * NULL will be returned. */ static zfs_snapentry_t * zfsctl_snapshot_find_by_name(char *snapname) { zfs_snapentry_t *se, search; ASSERT(RW_LOCK_HELD(&zfs_snapshot_lock)); search.se_name = snapname; se = avl_find(&zfs_snapshots_by_name, &search, NULL); if (se) refcount_add(&se->se_refcount, NULL); return (se); }
/* * Find a zfs_snapentry_t in zfs_snapshots_by_objsetid given the objset id * rather than the snapname. In all other respects it behaves the same * as zfsctl_snapshot_find_by_name().
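 *
 * A minimal illustrative lookup (not part of this change), mirroring the
 * pattern used by snapentry_expire() below:
 *
 *	rw_enter(&zfs_snapshot_lock, RW_READER);
 *	if ((se = zfsctl_snapshot_find_by_objsetid(spa, objsetid)) != NULL) {
 *		... use se->se_name, se->se_path ...
 *		zfsctl_snapshot_rele(se);
 *	}
 *	rw_exit(&zfs_snapshot_lock);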
*/ static zfs_snapentry_t * zfsctl_snapshot_find_by_objsetid(spa_t *spa, uint64_t objsetid) { zfs_snapentry_t *se, search; ASSERT(RW_LOCK_HELD(&zfs_snapshot_lock)); search.se_spa = spa; search.se_objsetid = objsetid; se = avl_find(&zfs_snapshots_by_objsetid, &search, NULL); if (se) refcount_add(&se->se_refcount, NULL); return (se); }
/* * Rename a zfs_snapentry_t in the zfs_snapshots_by_name tree. The structure * is removed, renamed, and added back at the correct new location in the * tree. */ static int zfsctl_snapshot_rename(char *old_snapname, char *new_snapname) { zfs_snapentry_t *se; ASSERT(RW_WRITE_HELD(&zfs_snapshot_lock)); se = zfsctl_snapshot_find_by_name(old_snapname); if (se == NULL) return (ENOENT); zfsctl_snapshot_remove(se); strfree(se->se_name); se->se_name = strdup(new_snapname); zfsctl_snapshot_add(se); zfsctl_snapshot_rele(se); return (0); }
/* * Delayed task responsible for unmounting an expired automounted snapshot. */ static void snapentry_expire(void *data) { zfs_snapentry_t *se = (zfs_snapentry_t *)data; spa_t *spa = se->se_spa; uint64_t objsetid = se->se_objsetid; if (zfs_expire_snapshot <= 0) { zfsctl_snapshot_rele(se); return; } se->se_taskqid = TASKQID_INVALID; (void) zfsctl_snapshot_unmount(se->se_name, MNT_EXPIRE); zfsctl_snapshot_rele(se); /* * Reschedule the unmount if the zfs_snapentry_t wasn't removed. * This can occur when the snapshot is busy. */ rw_enter(&zfs_snapshot_lock, RW_READER); if ((se = zfsctl_snapshot_find_by_objsetid(spa, objsetid)) != NULL) { zfsctl_snapshot_unmount_delay_impl(se, zfs_expire_snapshot); zfsctl_snapshot_rele(se); } rw_exit(&zfs_snapshot_lock); }
/* * Cancel an automatic unmount of a snapname. This callback is responsible * for dropping the reference on the zfs_snapentry_t which was taken * during dispatch. */ static void zfsctl_snapshot_unmount_cancel(zfs_snapentry_t *se) { ASSERT(RW_LOCK_HELD(&zfs_snapshot_lock)); if (taskq_cancel_id(system_delay_taskq, se->se_taskqid) == 0) { se->se_taskqid = TASKQID_INVALID; zfsctl_snapshot_rele(se); } }
/* * Dispatch the unmount task for delayed handling with a hold protecting it. */ static void zfsctl_snapshot_unmount_delay_impl(zfs_snapentry_t *se, int delay) { ASSERT3S(se->se_taskqid, ==, TASKQID_INVALID); if (delay <= 0) return; zfsctl_snapshot_hold(se); se->se_taskqid = taskq_dispatch_delay(system_delay_taskq, snapentry_expire, se, TQ_SLEEP, ddi_get_lbolt() + delay * HZ); }
/* * Schedule an automatic unmount of the objset id to occur in delay seconds * from now. Any previous delayed unmount will be cancelled in favor of the * updated deadline. A reference is taken by zfsctl_snapshot_find_by_objsetid() * and held until the outstanding task is handled or cancelled. */ int zfsctl_snapshot_unmount_delay(spa_t *spa, uint64_t objsetid, int delay) { zfs_snapentry_t *se; int error = ENOENT; rw_enter(&zfs_snapshot_lock, RW_READER); if ((se = zfsctl_snapshot_find_by_objsetid(spa, objsetid)) != NULL) { zfsctl_snapshot_unmount_cancel(se); zfsctl_snapshot_unmount_delay_impl(se, delay); zfsctl_snapshot_rele(se); error = 0; } rw_exit(&zfs_snapshot_lock); return (error); }
/* * Check if snapname is currently mounted. Returns non-zero when mounted * and zero when unmounted.
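 *
 * For example (illustrative only, not part of this change), a caller that
 * must not operate on a busy snapshot could check:
 *
 *	if (zfsctl_snapshot_ismounted(snapname))
 *		return (SET_ERROR(EBUSY));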
*/ static boolean_t zfsctl_snapshot_ismounted(char *snapname) { zfs_snapentry_t *se; boolean_t ismounted = B_FALSE; rw_enter(&zfs_snapshot_lock, RW_READER); if ((se = zfsctl_snapshot_find_by_name(snapname)) != NULL) { zfsctl_snapshot_rele(se); ismounted = B_TRUE; } rw_exit(&zfs_snapshot_lock); return (ismounted); } /* * Check if the given inode is a part of the virtual .zfs directory. */ boolean_t zfsctl_is_node(struct inode *ip) { return (ITOZ(ip)->z_is_ctldir); } /* * Check if the given inode is a .zfs/snapshot/snapname directory. */ boolean_t zfsctl_is_snapdir(struct inode *ip) { return (zfsctl_is_node(ip) && (ip->i_ino <= ZFSCTL_INO_SNAPDIRS)); } /* * Allocate a new inode with the passed id and ops. */ static struct inode * zfsctl_inode_alloc(zfs_sb_t *zsb, uint64_t id, const struct file_operations *fops, const struct inode_operations *ops) { struct timespec now = current_fs_time(zsb->z_sb); struct inode *ip; znode_t *zp; ip = new_inode(zsb->z_sb); if (ip == NULL) return (NULL); zp = ITOZ(ip); ASSERT3P(zp->z_dirlocks, ==, NULL); ASSERT3P(zp->z_acl_cached, ==, NULL); ASSERT3P(zp->z_xattr_cached, ==, NULL); zp->z_id = id; zp->z_unlinked = 0; zp->z_atime_dirty = 0; zp->z_zn_prefetch = 0; zp->z_moved = 0; zp->z_sa_hdl = NULL; zp->z_blksz = 0; zp->z_seq = 0; zp->z_mapcnt = 0; zp->z_size = 0; zp->z_pflags = 0; zp->z_mode = 0; zp->z_sync_cnt = 0; zp->z_is_mapped = B_FALSE; zp->z_is_ctldir = B_TRUE; zp->z_is_sa = B_FALSE; zp->z_is_stale = B_FALSE; ip->i_generation = 0; ip->i_ino = id; ip->i_mode = (S_IFDIR | S_IRWXUGO); ip->i_uid = SUID_TO_KUID(0); ip->i_gid = SGID_TO_KGID(0); ip->i_blkbits = SPA_MINBLOCKSHIFT; ip->i_atime = now; ip->i_mtime = now; ip->i_ctime = now; ip->i_fop = fops; ip->i_op = ops; if (insert_inode_locked(ip)) { unlock_new_inode(ip); iput(ip); return (NULL); } mutex_enter(&zsb->z_znodes_lock); list_insert_tail(&zsb->z_all_znodes, zp); zsb->z_nr_znodes++; membar_producer(); mutex_exit(&zsb->z_znodes_lock); unlock_new_inode(ip); return (ip); } /* * Lookup the inode with the given id; it will be allocated if needed. */ static struct inode * zfsctl_inode_lookup(zfs_sb_t *zsb, uint64_t id, const struct file_operations *fops, const struct inode_operations *ops) { struct inode *ip = NULL; while (ip == NULL) { ip = ilookup(zsb->z_sb, (unsigned long)id); if (ip) break; /* May fail due to concurrent zfsctl_inode_alloc() */ ip = zfsctl_inode_alloc(zsb, id, fops, ops); } return (ip); } /* * Create the '.zfs' directory. This directory is cached as part of the VFS * structure. This results in a hold on the zfs_sb_t. The code in zfs_umount() * therefore checks against a vfs_count of 2 instead of 1. This reference * is removed when the ctldir is destroyed in the unmount. All other entities * under the '.zfs' directory are created dynamically as needed. * * Because the dynamically created '.zfs' directory entries assume the use * of 64-bit inode numbers this support must be disabled on 32-bit systems. */ int zfsctl_create(zfs_sb_t *zsb) { ASSERT(zsb->z_ctldir == NULL); zsb->z_ctldir = zfsctl_inode_alloc(zsb, ZFSCTL_INO_ROOT, &zpl_fops_root, &zpl_ops_root); if (zsb->z_ctldir == NULL) return (SET_ERROR(ENOENT)); return (0); } /* * Destroy the '.zfs' directory or remove a snapshot from zfs_snapshots_by_name. * Only called when the filesystem is unmounted.
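* For a mounted snapshot this cancels any pending delayed unmount and drops the entry from both AVL trees; for a regular filesystem it releases the hold on z_ctldir taken in zfsctl_create().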
*/ void zfsctl_destroy(zfs_sb_t *zsb) { if (zsb->z_issnap) { zfs_snapentry_t *se; spa_t *spa = zsb->z_os->os_spa; uint64_t objsetid = dmu_objset_id(zsb->z_os); rw_enter(&zfs_snapshot_lock, RW_WRITER); if ((se = zfsctl_snapshot_find_by_objsetid(spa, objsetid)) != NULL) { zfsctl_snapshot_unmount_cancel(se); zfsctl_snapshot_remove(se); zfsctl_snapshot_rele(se); } rw_exit(&zfs_snapshot_lock); } else if (zsb->z_ctldir) { iput(zsb->z_ctldir); zsb->z_ctldir = NULL; } } /* * Given a root znode, retrieve the associated .zfs directory. * Add a hold to the vnode and return it. */ struct inode * zfsctl_root(znode_t *zp) { ASSERT(zfs_has_ctldir(zp)); igrab(ZTOZSB(zp)->z_ctldir); return (ZTOZSB(zp)->z_ctldir); } /* * Generate a long fid which includes the root object and objset of a * snapshot but not the generation number. For the root object the * generation number is ignored when zero to avoid needing to open * the dataset when generating fids for the snapshot names. */ static int zfsctl_snapdir_fid(struct inode *ip, fid_t *fidp) { zfs_sb_t *zsb = ITOZSB(ip); zfid_short_t *zfid = (zfid_short_t *)fidp; zfid_long_t *zlfid = (zfid_long_t *)fidp; uint32_t gen = 0; uint64_t object; uint64_t objsetid; int i; object = zsb->z_root; objsetid = ZFSCTL_INO_SNAPDIRS - ip->i_ino; zfid->zf_len = LONG_FID_LEN; for (i = 0; i < sizeof (zfid->zf_object); i++) zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); for (i = 0; i < sizeof (zfid->zf_gen); i++) zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i)); for (i = 0; i < sizeof (zlfid->zf_setid); i++) zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i)); for (i = 0; i < sizeof (zlfid->zf_setgen); i++) zlfid->zf_setgen[i] = 0; return (0); } /* * Generate an appropriate fid for an entry in the .zfs directory. */ int zfsctl_fid(struct inode *ip, fid_t *fidp) { znode_t *zp = ITOZ(ip); zfs_sb_t *zsb = ITOZSB(ip); uint64_t object = zp->z_id; zfid_short_t *zfid; int i; ZFS_ENTER(zsb); if (fidp->fid_len < SHORT_FID_LEN) { fidp->fid_len = SHORT_FID_LEN; ZFS_EXIT(zsb); return (SET_ERROR(ENOSPC)); } if (zfsctl_is_snapdir(ip)) { ZFS_EXIT(zsb); return (zfsctl_snapdir_fid(ip, fidp)); } zfid = (zfid_short_t *)fidp; zfid->zf_len = SHORT_FID_LEN; for (i = 0; i < sizeof (zfid->zf_object); i++) zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); /* .zfs znodes always have a generation number of 0 */ for (i = 0; i < sizeof (zfid->zf_gen); i++) zfid->zf_gen[i] = 0; ZFS_EXIT(zsb); return (0); } /* * Construct a full dataset name in full_name: "pool/dataset@snap_name" */ static int zfsctl_snapshot_name(zfs_sb_t *zsb, const char *snap_name, int len, char *full_name) { objset_t *os = zsb->z_os; if (zfs_component_namecheck(snap_name, NULL, NULL) != 0) return (SET_ERROR(EILSEQ)); dmu_objset_name(os, full_name); if ((strlen(full_name) + 1 + strlen(snap_name)) >= len) return (SET_ERROR(ENAMETOOLONG)); (void) strcat(full_name, "@"); (void) strcat(full_name, snap_name); return (0); } /* * Returns full path in full_path: "/pool/dataset/.zfs/snapshot/snap_name/" */ static int zfsctl_snapshot_path(struct path *path, int len, char *full_path) { char *path_buffer, *path_ptr; int path_len, error = 0; path_buffer = kmem_alloc(len, KM_SLEEP); path_ptr = d_path(path, path_buffer, len); if (IS_ERR(path_ptr)) { error = -PTR_ERR(path_ptr); goto out; } path_len = path_buffer + len - 1 - path_ptr; if (path_len > len) { error = SET_ERROR(EFAULT); goto out; } memcpy(full_path, path_ptr, path_len); full_path[path_len] = '\0'; out: kmem_free(path_buffer, len); return (error); } /* * Returns full path in full_path: 
"/pool/dataset/.zfs/snapshot/snap_name/" */ static int zfsctl_snapshot_path_objset(zfs_sb_t *zsb, uint64_t objsetid, int path_len, char *full_path) { objset_t *os = zsb->z_os; fstrans_cookie_t cookie; char *snapname; boolean_t case_conflict; uint64_t id, pos = 0; int error = 0; if (zsb->z_mntopts->z_mntpoint == NULL) return (ENOENT); cookie = spl_fstrans_mark(); snapname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); while (error == 0) { dsl_pool_config_enter(dmu_objset_pool(os), FTAG); error = dmu_snapshot_list_next(zsb->z_os, ZFS_MAX_DATASET_NAME_LEN, snapname, &id, &pos, &case_conflict); dsl_pool_config_exit(dmu_objset_pool(os), FTAG); if (error) goto out; if (id == objsetid) break; } memset(full_path, 0, path_len); snprintf(full_path, path_len - 1, "%s/.zfs/snapshot/%s", zsb->z_mntopts->z_mntpoint, snapname); out: kmem_free(snapname, ZFS_MAX_DATASET_NAME_LEN); spl_fstrans_unmark(cookie); return (error); } /* * Special case the handling of "..". */ int zfsctl_root_lookup(struct inode *dip, char *name, struct inode **ipp, int flags, cred_t *cr, int *direntflags, pathname_t *realpnp) { zfs_sb_t *zsb = ITOZSB(dip); int error = 0; ZFS_ENTER(zsb); if (strcmp(name, "..") == 0) { *ipp = dip->i_sb->s_root->d_inode; } else if (strcmp(name, ZFS_SNAPDIR_NAME) == 0) { *ipp = zfsctl_inode_lookup(zsb, ZFSCTL_INO_SNAPDIR, &zpl_fops_snapdir, &zpl_ops_snapdir); } else if (strcmp(name, ZFS_SHAREDIR_NAME) == 0) { *ipp = zfsctl_inode_lookup(zsb, ZFSCTL_INO_SHARES, &zpl_fops_shares, &zpl_ops_shares); } else { *ipp = NULL; } if (*ipp == NULL) error = SET_ERROR(ENOENT); ZFS_EXIT(zsb); return (error); } /* * Lookup entry point for the 'snapshot' directory. Try to open the * snapshot if it exist, creating the pseudo filesystem inode as necessary. * Perform a mount of the associated dataset on top of the inode. */ int zfsctl_snapdir_lookup(struct inode *dip, char *name, struct inode **ipp, int flags, cred_t *cr, int *direntflags, pathname_t *realpnp) { zfs_sb_t *zsb = ITOZSB(dip); uint64_t id; int error; ZFS_ENTER(zsb); error = dmu_snapshot_lookup(zsb->z_os, name, &id); if (error) { ZFS_EXIT(zsb); return (error); } *ipp = zfsctl_inode_lookup(zsb, ZFSCTL_INO_SNAPDIRS - id, &simple_dir_operations, &simple_dir_inode_operations); if (*ipp == NULL) error = SET_ERROR(ENOENT); ZFS_EXIT(zsb); return (error); } /* * Renaming a directory under '.zfs/snapshot' will automatically trigger * a rename of the snapshot to the new given name. The rename is confined * to the '.zfs/snapshot' directory snapshots cannot be moved elsewhere. 
*/ int zfsctl_snapdir_rename(struct inode *sdip, char *snm, struct inode *tdip, char *tnm, cred_t *cr, int flags) { zfs_sb_t *zsb = ITOZSB(sdip); char *to, *from, *real, *fsname; int error; if (!zfs_admin_snapshot) return (EACCES); ZFS_ENTER(zsb); to = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); from = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); real = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); fsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); if (zsb->z_case == ZFS_CASE_INSENSITIVE) { error = dmu_snapshot_realname(zsb->z_os, snm, real, ZFS_MAX_DATASET_NAME_LEN, NULL); if (error == 0) { snm = real; } else if (error != ENOTSUP) { goto out; } } dmu_objset_name(zsb->z_os, fsname); error = zfsctl_snapshot_name(ITOZSB(sdip), snm, ZFS_MAX_DATASET_NAME_LEN, from); if (error == 0) error = zfsctl_snapshot_name(ITOZSB(tdip), tnm, ZFS_MAX_DATASET_NAME_LEN, to); if (error == 0) error = zfs_secpolicy_rename_perms(from, to, cr); if (error != 0) goto out; /* * Cannot move snapshots out of the snapdir. */ if (sdip != tdip) { error = SET_ERROR(EINVAL); goto out; } /* * No-op when names are identical. */ if (strcmp(snm, tnm) == 0) { error = 0; goto out; } rw_enter(&zfs_snapshot_lock, RW_WRITER); error = dsl_dataset_rename_snapshot(fsname, snm, tnm, B_FALSE); if (error == 0) (void) zfsctl_snapshot_rename(snm, tnm); rw_exit(&zfs_snapshot_lock); out: kmem_free(from, ZFS_MAX_DATASET_NAME_LEN); kmem_free(to, ZFS_MAX_DATASET_NAME_LEN); kmem_free(real, ZFS_MAX_DATASET_NAME_LEN); kmem_free(fsname, ZFS_MAX_DATASET_NAME_LEN); ZFS_EXIT(zsb); return (error); } /* * Removing a directory under '.zfs/snapshot' will automatically trigger * the removal of the snapshot with the given name. */ int zfsctl_snapdir_remove(struct inode *dip, char *name, cred_t *cr, int flags) { zfs_sb_t *zsb = ITOZSB(dip); char *snapname, *real; int error; if (!zfs_admin_snapshot) return (EACCES); ZFS_ENTER(zsb); snapname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); real = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); if (zsb->z_case == ZFS_CASE_INSENSITIVE) { error = dmu_snapshot_realname(zsb->z_os, name, real, ZFS_MAX_DATASET_NAME_LEN, NULL); if (error == 0) { name = real; } else if (error != ENOTSUP) { goto out; } } error = zfsctl_snapshot_name(ITOZSB(dip), name, ZFS_MAX_DATASET_NAME_LEN, snapname); if (error == 0) error = zfs_secpolicy_destroy_perms(snapname, cr); if (error != 0) goto out; error = zfsctl_snapshot_unmount(snapname, MNT_FORCE); if ((error == 0) || (error == ENOENT)) error = dsl_destroy_snapshot(snapname, B_FALSE); out: kmem_free(snapname, ZFS_MAX_DATASET_NAME_LEN); kmem_free(real, ZFS_MAX_DATASET_NAME_LEN); ZFS_EXIT(zsb); return (error); } /* * Creating a directory under '.zfs/snapshot' will automatically trigger * the creation of a new snapshot with the given name. 
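* For example (illustrative paths), 'mkdir /pool/fs/.zfs/snapshot/backup' behaves like 'zfs snapshot pool/fs@backup', provided zfs_admin_snapshot is enabled and the caller passes the zfs_secpolicy_snapshot_perms() check.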
*/ int zfsctl_snapdir_mkdir(struct inode *dip, char *dirname, vattr_t *vap, - struct inode **ipp, cred_t *cr, int flags) + struct inode **ipp, cred_t *cr, int flags) { zfs_sb_t *zsb = ITOZSB(dip); char *dsname; int error; if (!zfs_admin_snapshot) return (EACCES); dsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); if (zfs_component_namecheck(dirname, NULL, NULL) != 0) { error = SET_ERROR(EILSEQ); goto out; } dmu_objset_name(zsb->z_os, dsname); error = zfs_secpolicy_snapshot_perms(dsname, cr); if (error != 0) goto out; error = dmu_objset_snapshot_one(dsname, dirname); if (error != 0) goto out; error = zfsctl_snapdir_lookup(dip, dirname, ipp, 0, cr, NULL, NULL); out: kmem_free(dsname, ZFS_MAX_DATASET_NAME_LEN); return (error); } /* * Attempt to unmount a snapshot by making a call to user space. * There is no assurance that this can or will succeed; it is just a * best effort. In the case where it does fail, perhaps because * it's in use, the unmount will fail harmlessly. */ int zfsctl_snapshot_unmount(char *snapname, int flags) { char *argv[] = { "/usr/bin/env", "umount", "-t", "zfs", "-n", NULL, NULL }; char *envp[] = { NULL }; zfs_snapentry_t *se; int error; rw_enter(&zfs_snapshot_lock, RW_READER); if ((se = zfsctl_snapshot_find_by_name(snapname)) == NULL) { rw_exit(&zfs_snapshot_lock); return (ENOENT); } rw_exit(&zfs_snapshot_lock); if (flags & MNT_FORCE) argv[4] = "-fn"; argv[5] = se->se_path; dprintf("unmount; path=%s\n", se->se_path); error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); zfsctl_snapshot_rele(se); /* * The umount system utility will return 256 on error. We must * assume this error is because the file system is busy so it is * converted to the more sensible EBUSY. */ if (error) error = SET_ERROR(EBUSY); return (error); } #define MOUNT_BUSY 0x80 /* Mount failed due to EBUSY (from mntent.h) */ int zfsctl_snapshot_mount(struct path *path, int flags) { struct dentry *dentry = path->dentry; struct inode *ip = dentry->d_inode; zfs_sb_t *zsb; zfs_sb_t *snap_zsb; zfs_snapentry_t *se; char *full_name, *full_path; char *argv[] = { "/usr/bin/env", "mount", "-t", "zfs", "-n", NULL, NULL, NULL }; char *envp[] = { NULL }; int error; struct path spath; if (ip == NULL) return (EISDIR); zsb = ITOZSB(ip); ZFS_ENTER(zsb); full_name = kmem_zalloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); full_path = kmem_zalloc(MAXPATHLEN, KM_SLEEP); error = zfsctl_snapshot_name(zsb, dname(dentry), ZFS_MAX_DATASET_NAME_LEN, full_name); if (error) goto error; error = zfsctl_snapshot_path(path, MAXPATHLEN, full_path); if (error) goto error; /* * Multiple concurrent automounts of a snapshot are never allowed. * The snapshot may be manually mounted as many times as desired. */ if (zfsctl_snapshot_ismounted(full_name)) { error = 0; goto error; } /* * Attempt to mount the snapshot from user space. Normally this * would be done using the vfs_kern_mount() function, however that * function is marked GPL-only and cannot be used. On error we are * careful to log the real error to the console and return EISDIR * to safely abort the automount. This should be very rare. * * If the user mode helper happens to return EBUSY, a concurrent * mount is already in progress in which case the error is ignored. * Take note that if the program was executed successfully the return * value from call_usermodehelper() will be ((exitcode << 8) + signal).
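* In particular, the mount utility's MOUNT_BUSY (0x80) exit status appears here as (MOUNT_BUSY << 8), which is exactly what the error check below tests for.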
*/ dprintf("mount; name=%s path=%s\n", full_name, full_path); argv[5] = full_name; argv[6] = full_path; error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); if (error) { if (!(error & MOUNT_BUSY << 8)) { cmn_err(CE_WARN, "Unable to automount %s/%s: %d", full_path, full_name, error); error = SET_ERROR(EISDIR); } else { /* * EBUSY, this could mean a concurrent mount, or the * snapshot has already been mounted at completely * different place. We return 0 so VFS will retry. For * the latter case the VFS will retry several times * and return ELOOP, which is probably not a very good * behavior. */ error = 0; } goto error; } /* * Follow down in to the mounted snapshot and set MNT_SHRINKABLE * to identify this as an automounted filesystem. */ spath = *path; path_get(&spath); if (zpl_follow_down_one(&spath)) { snap_zsb = ITOZSB(spath.dentry->d_inode); snap_zsb->z_parent = zsb; dentry = spath.dentry; spath.mnt->mnt_flags |= MNT_SHRINKABLE; rw_enter(&zfs_snapshot_lock, RW_WRITER); se = zfsctl_snapshot_alloc(full_name, full_path, snap_zsb->z_os->os_spa, dmu_objset_id(snap_zsb->z_os), dentry); zfsctl_snapshot_add(se); zfsctl_snapshot_unmount_delay_impl(se, zfs_expire_snapshot); rw_exit(&zfs_snapshot_lock); } path_put(&spath); error: kmem_free(full_name, ZFS_MAX_DATASET_NAME_LEN); kmem_free(full_path, MAXPATHLEN); ZFS_EXIT(zsb); return (error); } /* * Given the objset id of the snapshot return its zfs_sb_t as zsbp. */ int zfsctl_lookup_objset(struct super_block *sb, uint64_t objsetid, zfs_sb_t **zsbp) { zfs_snapentry_t *se; int error; spa_t *spa = ((zfs_sb_t *)(sb->s_fs_info))->z_os->os_spa; /* * Verify that the snapshot is mounted then lookup the mounted root * rather than the covered mount point. This may fail if the * snapshot has just been unmounted by an unrelated user space * process. This race cannot occur to an expired mount point * because we hold the zfs_snapshot_lock to prevent the race. */ rw_enter(&zfs_snapshot_lock, RW_READER); if ((se = zfsctl_snapshot_find_by_objsetid(spa, objsetid)) != NULL) { zfs_sb_t *zsb; zsb = ITOZSB(se->se_root_dentry->d_inode); ASSERT3U(dmu_objset_id(zsb->z_os), ==, objsetid); if (time_after(jiffies, zsb->z_snap_defer_time + MAX(zfs_expire_snapshot * HZ / 2, HZ))) { zsb->z_snap_defer_time = jiffies; zfsctl_snapshot_unmount_cancel(se); zfsctl_snapshot_unmount_delay_impl(se, zfs_expire_snapshot); } *zsbp = zsb; zfsctl_snapshot_rele(se); error = SET_ERROR(0); } else { error = SET_ERROR(ENOENT); } rw_exit(&zfs_snapshot_lock); /* * Automount the snapshot given the objset id by constructing the * full mount point and performing a traversal. 
*/ if (error == ENOENT) { struct path path; char *mnt; mnt = kmem_alloc(MAXPATHLEN, KM_SLEEP); error = zfsctl_snapshot_path_objset(sb->s_fs_info, objsetid, MAXPATHLEN, mnt); if (error) { kmem_free(mnt, MAXPATHLEN); return (SET_ERROR(error)); } error = kern_path(mnt, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &path); if (error == 0) { *zsbp = ITOZSB(path.dentry->d_inode); path_put(&path); } kmem_free(mnt, MAXPATHLEN); } return (error); } int zfsctl_shares_lookup(struct inode *dip, char *name, struct inode **ipp, int flags, cred_t *cr, int *direntflags, pathname_t *realpnp) { zfs_sb_t *zsb = ITOZSB(dip); struct inode *ip; znode_t *dzp; int error; ZFS_ENTER(zsb); if (zsb->z_shares_dir == 0) { ZFS_EXIT(zsb); return (SET_ERROR(ENOTSUP)); } if ((error = zfs_zget(zsb, zsb->z_shares_dir, &dzp)) == 0) { error = zfs_lookup(ZTOI(dzp), name, &ip, 0, cr, NULL, NULL); iput(ZTOI(dzp)); } ZFS_EXIT(zsb); return (error); } /* * Initialize the various pieces we'll need to create and manipulate .zfs * directories. */ void zfsctl_init(void) { avl_create(&zfs_snapshots_by_name, snapentry_compare_by_name, sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t, se_node_name)); avl_create(&zfs_snapshots_by_objsetid, snapentry_compare_by_objsetid, sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t, se_node_objsetid)); rw_init(&zfs_snapshot_lock, NULL, RW_DEFAULT, NULL); } /* * Cleanup the various pieces we needed for .zfs directories. In particular * ensure the expiry timer is canceled safely. */ void zfsctl_fini(void) { avl_destroy(&zfs_snapshots_by_name); avl_destroy(&zfs_snapshots_by_objsetid); rw_destroy(&zfs_snapshot_lock); } module_param(zfs_admin_snapshot, int, 0644); MODULE_PARM_DESC(zfs_admin_snapshot, "Enable mkdir/rmdir/mv in .zfs/snapshot"); module_param(zfs_expire_snapshot, int, 0644); MODULE_PARM_DESC(zfs_expire_snapshot, "Seconds to expire .zfs/snapshot"); diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 36b90596971c..b4f9b0b50312 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -1,4199 +1,4199 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2016 by Delphix. All rights reserved. * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * ========================================================================== * I/O type descriptions * ========================================================================== */ const char *zio_type_name[ZIO_TYPES] = { /* * Note: Linux kernel thread name length is limited * so these names will differ from upstream OpenZFS. */ "z_null", "z_rd", "z_wr", "z_fr", "z_cl", "z_ioctl" }; int zio_dva_throttle_enabled = B_TRUE; /* * ========================================================================== * I/O kmem caches * ========================================================================== */ kmem_cache_t *zio_cache; kmem_cache_t *zio_link_cache; kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; #if defined(ZFS_DEBUG) && !defined(_KERNEL) uint64_t zio_buf_cache_allocs[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; uint64_t zio_buf_cache_frees[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; #endif int zio_delay_max = ZIO_DELAY_MAX; #define ZIO_PIPELINE_CONTINUE 0x100 #define ZIO_PIPELINE_STOP 0x101 #define BP_SPANB(indblkshift, level) \ (((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT))) #define COMPARE_META_LEVEL 0x80000000ul /* * The following actions directly affect the spa's sync-to-convergence logic. * The values below define the sync pass when we start performing the action. * Care should be taken when changing these values as they directly impact * spa_sync() performance. Tuning these values may introduce subtle performance * pathologies and should only be done in the context of performance analysis. * These tunables will eventually be removed and replaced with #defines once * enough analysis has been done to determine optimal values. * * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that * regular blocks are not deferred. */ int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */ int zfs_sync_pass_dont_compress = 5; /* don't compress starting in this pass */ int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */ /* * An allocating zio is one that either currently has the DVA allocate * stage set or will have it later in its lifetime. */ #define IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE) int zio_requeue_io_start_cut_in_line = 1; #ifdef ZFS_DEBUG int zio_buf_debug_limit = 16384; #else int zio_buf_debug_limit = 0; #endif static inline void __zio_execute(zio_t *zio); static void zio_taskq_dispatch(zio_t *, zio_taskq_type_t, boolean_t); void zio_init(void) { size_t c; vmem_t *data_alloc_arena = NULL; zio_cache = kmem_cache_create("zio_cache", sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0); zio_link_cache = kmem_cache_create("zio_link_cache", sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0); /* * For small buffers, we want a cache for each multiple of * SPA_MINBLOCKSIZE. For larger buffers, we want a cache * for each quarter-power of 2. */ for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { size_t size = (c + 1) << SPA_MINBLOCKSHIFT; size_t p2 = size; size_t align = 0; size_t cflags = (size > zio_buf_debug_limit) ? KMC_NODEBUG : 0; #if defined(_ILP32) && defined(_KERNEL) /* * Cache size limited to 1M on 32-bit platforms until ARC * buffers no longer require virtual address space.
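* (zfs_max_recordsize defaults to 1M, so buffer caches larger than that are simply never created on these systems.)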
*/ if (size > zfs_max_recordsize) break; #endif while (!ISP2(p2)) p2 &= p2 - 1; #ifndef _KERNEL /* * If we are using watchpoints, put each buffer on its own page, * to eliminate the performance overhead of trapping to the * kernel when modifying a non-watched buffer that shares the * page with a watched buffer. */ if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE)) continue; /* * Here's the problem - on 4K native devices in userland on * Linux using O_DIRECT, buffers must be 4K aligned or I/O * will fail with EINVAL, causing zdb (and others) to coredump. * Since userland probably doesn't need optimized buffer caches, * we just force 4K alignment on everything. */ align = 8 * SPA_MINBLOCKSIZE; #else if (size <= 4 * SPA_MINBLOCKSIZE) { align = SPA_MINBLOCKSIZE; } else if (IS_P2ALIGNED(size, p2 >> 2)) { align = MIN(p2 >> 2, PAGESIZE); } #endif if (align != 0) { char name[36]; (void) sprintf(name, "zio_buf_%lu", (ulong_t)size); zio_buf_cache[c] = kmem_cache_create(name, size, align, NULL, NULL, NULL, NULL, NULL, cflags); (void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size); zio_data_buf_cache[c] = kmem_cache_create(name, size, align, NULL, NULL, NULL, NULL, data_alloc_arena, cflags); } } while (--c != 0) { ASSERT(zio_buf_cache[c] != NULL); if (zio_buf_cache[c - 1] == NULL) zio_buf_cache[c - 1] = zio_buf_cache[c]; ASSERT(zio_data_buf_cache[c] != NULL); if (zio_data_buf_cache[c - 1] == NULL) zio_data_buf_cache[c - 1] = zio_data_buf_cache[c]; } zio_inject_init(); lz4_init(); } void zio_fini(void) { size_t c; kmem_cache_t *last_cache = NULL; kmem_cache_t *last_data_cache = NULL; for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { #ifdef _ILP32 /* * Cache size limited to 1M on 32-bit platforms until ARC * buffers no longer require virtual address space. */ if (((c + 1) << SPA_MINBLOCKSHIFT) > zfs_max_recordsize) break; #endif #if defined(ZFS_DEBUG) && !defined(_KERNEL) if (zio_buf_cache_allocs[c] != zio_buf_cache_frees[c]) (void) printf("zio_fini: [%d] %llu != %llu\n", (int)((c + 1) << SPA_MINBLOCKSHIFT), (long long unsigned)zio_buf_cache_allocs[c], (long long unsigned)zio_buf_cache_frees[c]); #endif if (zio_buf_cache[c] != last_cache) { last_cache = zio_buf_cache[c]; kmem_cache_destroy(zio_buf_cache[c]); } zio_buf_cache[c] = NULL; if (zio_data_buf_cache[c] != last_data_cache) { last_data_cache = zio_data_buf_cache[c]; kmem_cache_destroy(zio_data_buf_cache[c]); } zio_data_buf_cache[c] = NULL; } kmem_cache_destroy(zio_link_cache); kmem_cache_destroy(zio_cache); zio_inject_fini(); lz4_fini(); } /* * ========================================================================== * Allocate and free I/O buffers * ========================================================================== */ /* * Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a * crashdump if the kernel panics, so use it judiciously. Obviously, it's * useful to inspect ZFS metadata, but if possible, we should avoid keeping * excess / transient data in-core during a crashdump. */ void * zio_buf_alloc(size_t size) { size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); #if defined(ZFS_DEBUG) && !defined(_KERNEL) atomic_add_64(&zio_buf_cache_allocs[c], 1); #endif return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE)); } /* * Use zio_data_buf_alloc to allocate data. The data will not appear in a * crashdump if the kernel panics. This exists so that we will limit the amount * of ZFS data that shows up in a kernel crashdump. 
(Thus reducing the amount * of kernel heap dumped to disk when the kernel panics) */ void * zio_data_buf_alloc(size_t size) { size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE)); } void zio_buf_free(void *buf, size_t size) { size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); #if defined(ZFS_DEBUG) && !defined(_KERNEL) atomic_add_64(&zio_buf_cache_frees[c], 1); #endif kmem_cache_free(zio_buf_cache[c], buf); } void zio_data_buf_free(void *buf, size_t size) { size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); kmem_cache_free(zio_data_buf_cache[c], buf); } /* * ========================================================================== * Push and pop I/O transform buffers * ========================================================================== */ void zio_push_transform(zio_t *zio, abd_t *data, uint64_t size, uint64_t bufsize, zio_transform_func_t *transform) { zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP); /* * Ensure that anyone expecting this zio to contain a linear ABD isn't * going to get a nasty surprise when they try to access the data. */ IMPLY(abd_is_linear(zio->io_abd), abd_is_linear(data)); zt->zt_orig_abd = zio->io_abd; zt->zt_orig_size = zio->io_size; zt->zt_bufsize = bufsize; zt->zt_transform = transform; zt->zt_next = zio->io_transform_stack; zio->io_transform_stack = zt; zio->io_abd = data; zio->io_size = size; } void zio_pop_transforms(zio_t *zio) { zio_transform_t *zt; while ((zt = zio->io_transform_stack) != NULL) { if (zt->zt_transform != NULL) zt->zt_transform(zio, zt->zt_orig_abd, zt->zt_orig_size); if (zt->zt_bufsize != 0) abd_free(zio->io_abd); zio->io_abd = zt->zt_orig_abd; zio->io_size = zt->zt_orig_size; zio->io_transform_stack = zt->zt_next; kmem_free(zt, sizeof (zio_transform_t)); } } /* * ========================================================================== * I/O transform callbacks for subblocks and decompression * ========================================================================== */ static void zio_subblock(zio_t *zio, abd_t *data, uint64_t size) { ASSERT(zio->io_size > size); if (zio->io_type == ZIO_TYPE_READ) abd_copy(data, zio->io_abd, size); } static void zio_decompress(zio_t *zio, abd_t *data, uint64_t size) { if (zio->io_error == 0) { void *tmp = abd_borrow_buf(data, size); int ret = zio_decompress_data(BP_GET_COMPRESS(zio->io_bp), zio->io_abd, tmp, zio->io_size, size); abd_return_buf_copy(data, tmp, size); if (ret != 0) zio->io_error = SET_ERROR(EIO); } } /* * ========================================================================== * I/O parent/child relationships and pipeline interlocks * ========================================================================== */ zio_t * zio_walk_parents(zio_t *cio, zio_link_t **zl) { list_t *pl = &cio->io_parent_list; *zl = (*zl == NULL) ? list_head(pl) : list_next(pl, *zl); if (*zl == NULL) return (NULL); ASSERT((*zl)->zl_child == cio); return ((*zl)->zl_parent); } zio_t * zio_walk_children(zio_t *pio, zio_link_t **zl) { list_t *cl = &pio->io_child_list; *zl = (*zl == NULL) ? 
list_head(cl) : list_next(cl, *zl); if (*zl == NULL) return (NULL); ASSERT((*zl)->zl_parent == pio); return ((*zl)->zl_child); } zio_t * zio_unique_parent(zio_t *cio) { zio_link_t *zl = NULL; zio_t *pio = zio_walk_parents(cio, &zl); VERIFY3P(zio_walk_parents(cio, &zl), ==, NULL); return (pio); } void zio_add_child(zio_t *pio, zio_t *cio) { zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP); int w; /* * Logical I/Os can have logical, gang, or vdev children. * Gang I/Os can have gang or vdev children. * Vdev I/Os can only have vdev children. * The following ASSERT captures all of these constraints. */ ASSERT(cio->io_child_type <= pio->io_child_type); zl->zl_parent = pio; zl->zl_child = cio; mutex_enter(&cio->io_lock); mutex_enter(&pio->io_lock); ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0); for (w = 0; w < ZIO_WAIT_TYPES; w++) pio->io_children[cio->io_child_type][w] += !cio->io_state[w]; list_insert_head(&pio->io_child_list, zl); list_insert_head(&cio->io_parent_list, zl); pio->io_child_count++; cio->io_parent_count++; mutex_exit(&pio->io_lock); mutex_exit(&cio->io_lock); } static void zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl) { ASSERT(zl->zl_parent == pio); ASSERT(zl->zl_child == cio); mutex_enter(&cio->io_lock); mutex_enter(&pio->io_lock); list_remove(&pio->io_child_list, zl); list_remove(&cio->io_parent_list, zl); pio->io_child_count--; cio->io_parent_count--; mutex_exit(&pio->io_lock); mutex_exit(&cio->io_lock); kmem_cache_free(zio_link_cache, zl); } static boolean_t zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait) { uint64_t *countp = &zio->io_children[child][wait]; boolean_t waiting = B_FALSE; mutex_enter(&zio->io_lock); ASSERT(zio->io_stall == NULL); if (*countp != 0) { zio->io_stage >>= 1; ASSERT3U(zio->io_stage, !=, ZIO_STAGE_OPEN); zio->io_stall = countp; waiting = B_TRUE; } mutex_exit(&zio->io_lock); return (waiting); } __attribute__((always_inline)) static inline void zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait) { uint64_t *countp = &pio->io_children[zio->io_child_type][wait]; int *errorp = &pio->io_child_error[zio->io_child_type]; mutex_enter(&pio->io_lock); if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) *errorp = zio_worst_error(*errorp, zio->io_error); pio->io_reexecute |= zio->io_reexecute; ASSERT3U(*countp, >, 0); (*countp)--; if (*countp == 0 && pio->io_stall == countp) { zio_taskq_type_t type = pio->io_stage < ZIO_STAGE_VDEV_IO_START ? ZIO_TASKQ_ISSUE : ZIO_TASKQ_INTERRUPT; pio->io_stall = NULL; mutex_exit(&pio->io_lock); /* * Dispatch the parent zio in its own taskq so that * the child can continue to make progress. This also * prevents overflowing the stack when we have deeply nested * parent-child relationships. 
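* The taskq type is chosen from the parent's pipeline position: a parent still before ZIO_STAGE_VDEV_IO_START resumes on the ISSUE taskq, while anything at or past it completes on the INTERRUPT taskq.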
*/ zio_taskq_dispatch(pio, type, B_FALSE); } else { mutex_exit(&pio->io_lock); } } static void zio_inherit_child_errors(zio_t *zio, enum zio_child c) { if (zio->io_child_error[c] != 0 && zio->io_error == 0) zio->io_error = zio->io_child_error[c]; } int zio_timestamp_compare(const void *x1, const void *x2) { const zio_t *z1 = x1; const zio_t *z2 = x2; int cmp; cmp = AVL_CMP(z1->io_queued_timestamp, z2->io_queued_timestamp); if (likely(cmp)) return (cmp); cmp = AVL_CMP(z1->io_offset, z2->io_offset); if (likely(cmp)) return (cmp); return (AVL_PCMP(z1, z2)); } /* * ========================================================================== * Create the various types of I/O (read, write, free, etc) * ========================================================================== */ static zio_t * zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, abd_t *data, uint64_t lsize, uint64_t psize, zio_done_func_t *done, void *private, zio_type_t type, zio_priority_t priority, enum zio_flag flags, vdev_t *vd, uint64_t offset, const zbookmark_phys_t *zb, enum zio_stage stage, enum zio_stage pipeline) { zio_t *zio; ASSERT3U(psize, <=, SPA_MAXBLOCKSIZE); ASSERT(P2PHASE(psize, SPA_MINBLOCKSIZE) == 0); ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0); ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER)); ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER)); ASSERT(vd || stage == ZIO_STAGE_OPEN); IMPLY(lsize != psize, (flags & ZIO_FLAG_RAW) != 0); zio = kmem_cache_alloc(zio_cache, KM_SLEEP); bzero(zio, sizeof (zio_t)); mutex_init(&zio->io_lock, NULL, MUTEX_NOLOCKDEP, NULL); cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL); list_create(&zio->io_parent_list, sizeof (zio_link_t), offsetof(zio_link_t, zl_parent_node)); list_create(&zio->io_child_list, sizeof (zio_link_t), offsetof(zio_link_t, zl_child_node)); metaslab_trace_init(&zio->io_alloc_list); if (vd != NULL) zio->io_child_type = ZIO_CHILD_VDEV; else if (flags & ZIO_FLAG_GANG_CHILD) zio->io_child_type = ZIO_CHILD_GANG; else if (flags & ZIO_FLAG_DDT_CHILD) zio->io_child_type = ZIO_CHILD_DDT; else zio->io_child_type = ZIO_CHILD_LOGICAL; if (bp != NULL) { zio->io_bp = (blkptr_t *)bp; zio->io_bp_copy = *bp; zio->io_bp_orig = *bp; if (type != ZIO_TYPE_WRITE || zio->io_child_type == ZIO_CHILD_DDT) zio->io_bp = &zio->io_bp_copy; /* so caller can free */ if (zio->io_child_type == ZIO_CHILD_LOGICAL) zio->io_logical = zio; if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp)) pipeline |= ZIO_GANG_STAGES; } zio->io_spa = spa; zio->io_txg = txg; zio->io_done = done; zio->io_private = private; zio->io_type = type; zio->io_priority = priority; zio->io_vd = vd; zio->io_offset = offset; zio->io_orig_abd = zio->io_abd = data; zio->io_orig_size = zio->io_size = psize; zio->io_lsize = lsize; zio->io_orig_flags = zio->io_flags = flags; zio->io_orig_stage = zio->io_stage = stage; zio->io_orig_pipeline = zio->io_pipeline = pipeline; zio->io_pipeline_trace = ZIO_STAGE_OPEN; zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY); zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE); if (zb != NULL) zio->io_bookmark = *zb; if (pio != NULL) { if (zio->io_logical == NULL) zio->io_logical = pio->io_logical; if (zio->io_child_type == ZIO_CHILD_GANG) zio->io_gang_leader = pio->io_gang_leader; zio_add_child(pio, zio); } taskq_init_ent(&zio->io_tqent); return (zio); } static void zio_destroy(zio_t *zio) { metaslab_trace_fini(&zio->io_alloc_list); list_destroy(&zio->io_parent_list); list_destroy(&zio->io_child_list); mutex_destroy(&zio->io_lock); 
cv_destroy(&zio->io_cv); kmem_cache_free(zio_cache, zio); } zio_t * zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done, void *private, enum zio_flag flags) { zio_t *zio; zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private, ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL, ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE); return (zio); } zio_t * zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags) { return (zio_null(NULL, spa, NULL, done, private, flags)); } void zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp) { int i; if (!DMU_OT_IS_VALID(BP_GET_TYPE(bp))) { zfs_panic_recover("blkptr at %p has invalid TYPE %llu", bp, (longlong_t)BP_GET_TYPE(bp)); } if (BP_GET_CHECKSUM(bp) >= ZIO_CHECKSUM_FUNCTIONS || BP_GET_CHECKSUM(bp) <= ZIO_CHECKSUM_ON) { zfs_panic_recover("blkptr at %p has invalid CHECKSUM %llu", bp, (longlong_t)BP_GET_CHECKSUM(bp)); } if (BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_FUNCTIONS || BP_GET_COMPRESS(bp) <= ZIO_COMPRESS_ON) { zfs_panic_recover("blkptr at %p has invalid COMPRESS %llu", bp, (longlong_t)BP_GET_COMPRESS(bp)); } if (BP_GET_LSIZE(bp) > SPA_MAXBLOCKSIZE) { zfs_panic_recover("blkptr at %p has invalid LSIZE %llu", bp, (longlong_t)BP_GET_LSIZE(bp)); } if (BP_GET_PSIZE(bp) > SPA_MAXBLOCKSIZE) { zfs_panic_recover("blkptr at %p has invalid PSIZE %llu", bp, (longlong_t)BP_GET_PSIZE(bp)); } if (BP_IS_EMBEDDED(bp)) { if (BPE_GET_ETYPE(bp) > NUM_BP_EMBEDDED_TYPES) { zfs_panic_recover("blkptr at %p has invalid ETYPE %llu", bp, (longlong_t)BPE_GET_ETYPE(bp)); } } /* * Pool-specific checks. * * Note: it would be nice to verify that the blk_birth and * BP_PHYSICAL_BIRTH() are not too large. However, spa_freeze() * allows the birth time of log blocks (and dmu_sync()-ed blocks * that are in the log) to be arbitrarily large. */ for (i = 0; i < BP_GET_NDVAS(bp); i++) { uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[i]); vdev_t *vd; uint64_t offset, asize; if (vdevid >= spa->spa_root_vdev->vdev_children) { zfs_panic_recover("blkptr at %p DVA %u has invalid " "VDEV %llu", bp, i, (longlong_t)vdevid); continue; } vd = spa->spa_root_vdev->vdev_child[vdevid]; if (vd == NULL) { zfs_panic_recover("blkptr at %p DVA %u has invalid " "VDEV %llu", bp, i, (longlong_t)vdevid); continue; } if (vd->vdev_ops == &vdev_hole_ops) { zfs_panic_recover("blkptr at %p DVA %u has hole " "VDEV %llu", bp, i, (longlong_t)vdevid); continue; } if (vd->vdev_ops == &vdev_missing_ops) { /* * "missing" vdevs are valid during import, but we * don't have their detailed info (e.g. asize), so * we can't perform any more checks on them. */ continue; } offset = DVA_GET_OFFSET(&bp->blk_dva[i]); asize = DVA_GET_ASIZE(&bp->blk_dva[i]); if (BP_IS_GANG(bp)) asize = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); if (offset + asize > vd->vdev_asize) { zfs_panic_recover("blkptr at %p DVA %u has invalid " "OFFSET %llu", bp, i, (longlong_t)offset); } } } zio_t * zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, abd_t *data, uint64_t size, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb) { zio_t *zio; zfs_blkptr_verify(spa, bp); zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp, data, size, size, done, private, ZIO_TYPE_READ, priority, flags, NULL, 0, zb, ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? 
ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE); return (zio); } zio_t * zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data, uint64_t lsize, uint64_t psize, const zio_prop_t *zp, zio_done_func_t *ready, zio_done_func_t *children_ready, zio_done_func_t *physdone, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb) { zio_t *zio; ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF && zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS && zp->zp_compress >= ZIO_COMPRESS_OFF && zp->zp_compress < ZIO_COMPRESS_FUNCTIONS && DMU_OT_IS_VALID(zp->zp_type) && zp->zp_level < 32 && zp->zp_copies > 0 && zp->zp_copies <= spa_max_replication(spa)); zio = zio_create(pio, spa, txg, bp, data, lsize, psize, done, private, ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb, ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE); zio->io_ready = ready; zio->io_children_ready = children_ready; zio->io_physdone = physdone; zio->io_prop = *zp; /* * Data can be NULL if we are going to call zio_write_override() to * provide the already-allocated BP. But we may need the data to * verify a dedup hit (if requested). In this case, don't try to * dedup (just take the already-allocated BP verbatim). */ if (data == NULL && zio->io_prop.zp_dedup_verify) { zio->io_prop.zp_dedup = zio->io_prop.zp_dedup_verify = B_FALSE; } return (zio); } zio_t * zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data, uint64_t size, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb) { zio_t *zio; zio = zio_create(pio, spa, txg, bp, data, size, size, done, private, ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_IO_REWRITE, NULL, 0, zb, ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE); return (zio); } void zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite) { ASSERT(zio->io_type == ZIO_TYPE_WRITE); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(zio->io_stage == ZIO_STAGE_OPEN); ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa)); /* * We must reset the io_prop to match the values that existed * when the bp was first written by dmu_sync() keeping in mind * that nopwrite and dedup are mutually exclusive. */ zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup; zio->io_prop.zp_nopwrite = nopwrite; zio->io_prop.zp_copies = copies; zio->io_bp_override = bp; } void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp) { /* * The check for EMBEDDED is a performance optimization. We * process the free here (by ignoring it) rather than * putting it on the list and then processing it in zio_free_sync(). */ if (BP_IS_EMBEDDED(bp)) return; metaslab_check_free(spa, bp); /* * Frees that are for the currently-syncing txg, are not going to be * deferred, and which will not need to do a read (i.e. not GANG or * DEDUP), can be processed immediately. Otherwise, put them on the * in-memory list for later processing. 
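* The test below is written in the negative: GANG, DEDUP, a non-syncing txg, or a late sync pass all route the free to the per-txg bplist, and only the remaining cases are freed synchronously via zio_free_sync().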
*/ if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp) || txg != spa->spa_syncing_txg || spa_sync_pass(spa) >= zfs_sync_pass_deferred_free) { bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp); } else { VERIFY0(zio_wait(zio_free_sync(NULL, spa, txg, bp, 0))); } } zio_t * zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, enum zio_flag flags) { zio_t *zio; enum zio_stage stage = ZIO_FREE_PIPELINE; ASSERT(!BP_IS_HOLE(bp)); ASSERT(spa_syncing_txg(spa) == txg); ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free); if (BP_IS_EMBEDDED(bp)) return (zio_null(pio, spa, NULL, NULL, NULL, 0)); metaslab_check_free(spa, bp); arc_freed(spa, bp); /* * GANG and DEDUP blocks can induce a read (for the gang block header, * or the DDT), so issue them asynchronously so that this thread is * not tied up. */ if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp)) stage |= ZIO_STAGE_ISSUE_ASYNC; zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), BP_GET_PSIZE(bp), NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW, flags, NULL, 0, NULL, ZIO_STAGE_OPEN, stage); return (zio); } zio_t * zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio_done_func_t *done, void *private, enum zio_flag flags) { zio_t *zio; dprintf_bp(bp, "claiming in txg %llu", txg); if (BP_IS_EMBEDDED(bp)) return (zio_null(pio, spa, NULL, NULL, NULL, 0)); /* * A claim is an allocation of a specific block. Claims are needed * to support immediate writes in the intent log. The issue is that * immediate writes contain committed data, but in a txg that was * *not* committed. Upon opening the pool after an unclean shutdown, * the intent log claims all blocks that contain immediate write data * so that the SPA knows they're in use. * * All claims *must* be resolved in the first txg -- before the SPA * starts allocating blocks -- so that nothing is allocated twice. * If txg == 0 we just verify that the block is claimable. 
*/ ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa)); ASSERT(txg == spa_first_txg(spa) || txg == 0); ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa)); /* zdb(1M) */ zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), BP_GET_PSIZE(bp), done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags, NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE); ASSERT0(zio->io_queued_timestamp); return (zio); } zio_t * zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, zio_done_func_t *done, void *private, enum zio_flag flags) { zio_t *zio; int c; if (vd->vdev_children == 0) { zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private, ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL, ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); zio->io_cmd = cmd; } else { zio = zio_null(pio, spa, NULL, NULL, NULL, flags); for (c = 0; c < vd->vdev_children; c++) zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd, done, private, flags)); } return (zio); } zio_t * zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, abd_t *data, int checksum, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, boolean_t labels) { zio_t *zio; ASSERT(vd->vdev_children == 0); ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); ASSERT3U(offset + size, <=, vd->vdev_psize); zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, size, done, private, ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE); zio->io_prop.zp_checksum = checksum; return (zio); } zio_t * zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, abd_t *data, int checksum, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, boolean_t labels) { zio_t *zio; ASSERT(vd->vdev_children == 0); ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); ASSERT3U(offset + size, <=, vd->vdev_psize); zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, size, done, private, ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE); zio->io_prop.zp_checksum = checksum; if (zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_EMBEDDED) { /* * zec checksums are necessarily destructive -- they modify * the end of the write buffer to hold the verifier/checksum. * Therefore, we must make a local copy in case the data is * being written to multiple places in parallel. */ abd_t *wbuf = abd_alloc_sametype(data, size); abd_copy(wbuf, data, size); zio_push_transform(zio, wbuf, size, size, NULL); } return (zio); } /* * Create a child I/O to do some work for us. */ zio_t * zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, - abd_t *data, uint64_t size, int type, zio_priority_t priority, - enum zio_flag flags, zio_done_func_t *done, void *private) + abd_t *data, uint64_t size, int type, zio_priority_t priority, + enum zio_flag flags, zio_done_func_t *done, void *private) { enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE; zio_t *zio; ASSERT(vd->vdev_parent == (pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev)); if (type == ZIO_TYPE_READ && bp != NULL) { /* * If we have the bp, then the child should perform the * checksum and the parent need not. This pushes error * detection as close to the leaves as possible and * eliminates redundant checksums in the interior nodes. 
*/ pipeline |= ZIO_STAGE_CHECKSUM_VERIFY; pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; } if (vd->vdev_children == 0) offset += VDEV_LABEL_START_SIZE; flags |= ZIO_VDEV_CHILD_FLAGS(pio) | ZIO_FLAG_DONT_PROPAGATE; /* * If we've decided to do a repair, the write is not speculative -- * even if the original read was. */ if (flags & ZIO_FLAG_IO_REPAIR) flags &= ~ZIO_FLAG_SPECULATIVE; /* * If we're creating a child I/O that is not associated with a * top-level vdev, then the child zio is not an allocating I/O. * If this is a retried I/O then we ignore it since we will * have already processed the original allocating I/O. */ if (flags & ZIO_FLAG_IO_ALLOCATING && (vd != vd->vdev_top || (flags & ZIO_FLAG_IO_RETRY))) { ASSERTV(metaslab_class_t *mc = spa_normal_class(pio->io_spa)); ASSERT(mc->mc_alloc_throttle_enabled); ASSERT(type == ZIO_TYPE_WRITE); ASSERT(priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(!(flags & ZIO_FLAG_IO_REPAIR)); ASSERT(!(pio->io_flags & ZIO_FLAG_IO_REWRITE) || pio->io_child_type == ZIO_CHILD_GANG); flags &= ~ZIO_FLAG_IO_ALLOCATING; } zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, size, done, private, type, priority, flags, vd, offset, &pio->io_bookmark, ZIO_STAGE_VDEV_IO_START >> 1, pipeline); ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV); zio->io_physdone = pio->io_physdone; if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL) zio->io_logical->io_phys_children++; return (zio); } zio_t * zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size, int type, zio_priority_t priority, enum zio_flag flags, zio_done_func_t *done, void *private) { zio_t *zio; ASSERT(vd->vdev_ops->vdev_op_leaf); zio = zio_create(NULL, vd->vdev_spa, 0, NULL, data, size, size, done, private, type, priority, flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED, vd, offset, NULL, ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE); return (zio); } void zio_flush(zio_t *zio, vdev_t *vd) { zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, NULL, NULL, ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY)); } void zio_shrink(zio_t *zio, uint64_t size) { ASSERT(zio->io_executor == NULL); ASSERT(zio->io_orig_size == zio->io_size); ASSERT(size <= zio->io_size); /* * We don't shrink for raidz because of problems with the * reconstruction when reading back less than the block size. * Note, BP_IS_RAIDZ() assumes no compression. */ ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF); if (!BP_IS_RAIDZ(zio->io_bp)) { /* we are not doing a raw write */ ASSERT3U(zio->io_size, ==, zio->io_lsize); zio->io_orig_size = zio->io_size = zio->io_lsize = size; } } /* * ========================================================================== * Prepare to read and write logical blocks * ========================================================================== */ static int zio_read_bp_init(zio_t *zio) { blkptr_t *bp = zio->io_bp; if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF && zio->io_child_type == ZIO_CHILD_LOGICAL && !(zio->io_flags & ZIO_FLAG_RAW)) { uint64_t psize = BP_IS_EMBEDDED(bp) ? 
BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp); zio_push_transform(zio, abd_alloc_sametype(zio->io_abd, psize), psize, psize, zio_decompress); } if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) { int psize = BPE_GET_PSIZE(bp); void *data = abd_borrow_buf(zio->io_abd, psize); zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; decode_embedded_bp_compressed(bp, data); abd_return_buf_copy(zio->io_abd, data, psize); } else { ASSERT(!BP_IS_EMBEDDED(bp)); } if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0) zio->io_flags |= ZIO_FLAG_DONT_CACHE; if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP) zio->io_flags |= ZIO_FLAG_DONT_CACHE; if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL) zio->io_pipeline = ZIO_DDT_READ_PIPELINE; return (ZIO_PIPELINE_CONTINUE); } static int zio_write_bp_init(zio_t *zio) { if (!IO_IS_ALLOCATING(zio)) return (ZIO_PIPELINE_CONTINUE); ASSERT(zio->io_child_type != ZIO_CHILD_DDT); if (zio->io_bp_override) { blkptr_t *bp = zio->io_bp; zio_prop_t *zp = &zio->io_prop; ASSERT(bp->blk_birth != zio->io_txg); ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0); *bp = *zio->io_bp_override; zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; if (BP_IS_EMBEDDED(bp)) return (ZIO_PIPELINE_CONTINUE); /* * If we've been overridden and nopwrite is set then * set the flag accordingly to indicate that a nopwrite * has already occurred. */ if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) { ASSERT(!zp->zp_dedup); ASSERT3U(BP_GET_CHECKSUM(bp), ==, zp->zp_checksum); zio->io_flags |= ZIO_FLAG_NOPWRITE; return (ZIO_PIPELINE_CONTINUE); } ASSERT(!zp->zp_nopwrite); if (BP_IS_HOLE(bp) || !zp->zp_dedup) return (ZIO_PIPELINE_CONTINUE); ASSERT((zio_checksum_table[zp->zp_checksum].ci_flags & ZCHECKSUM_FLAG_DEDUP) || zp->zp_dedup_verify); if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) { BP_SET_DEDUP(bp, 1); zio->io_pipeline |= ZIO_STAGE_DDT_WRITE; return (ZIO_PIPELINE_CONTINUE); } /* * We were unable to handle this as an override bp, treat * it as a regular write I/O. */ zio->io_bp_override = NULL; *bp = zio->io_bp_orig; zio->io_pipeline = zio->io_orig_pipeline; } return (ZIO_PIPELINE_CONTINUE); } static int zio_write_compress(zio_t *zio) { spa_t *spa = zio->io_spa; zio_prop_t *zp = &zio->io_prop; enum zio_compress compress = zp->zp_compress; blkptr_t *bp = zio->io_bp; uint64_t lsize = zio->io_lsize; uint64_t psize = zio->io_size; int pass = 1; EQUIV(lsize != psize, (zio->io_flags & ZIO_FLAG_RAW) != 0); /* * If our children haven't all reached the ready stage, * wait for them and then repeat this pipeline stage. */ if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) || zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY)) return (ZIO_PIPELINE_STOP); if (!IO_IS_ALLOCATING(zio)) return (ZIO_PIPELINE_CONTINUE); if (zio->io_children_ready != NULL) { /* * Now that all our children are ready, run the callback * associated with this zio in case it wants to modify the * data to be written. */ ASSERT3U(zp->zp_level, >, 0); zio->io_children_ready(zio); } ASSERT(zio->io_child_type != ZIO_CHILD_DDT); ASSERT(zio->io_bp_override == NULL); if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) { /* * We're rewriting an existing block, which means we're * working on behalf of spa_sync(). For spa_sync() to * converge, it must eventually be the case that we don't * have to allocate new blocks. But compression changes * the blocksize, which forces a reallocate, and makes * convergence take longer. Therefore, after the first * few passes, stop compressing to ensure convergence. 
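* The cutoff is the zfs_sync_pass_dont_compress tunable declared above (pass 5 by default).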
*/ pass = spa_sync_pass(spa); ASSERT(zio->io_txg == spa_syncing_txg(spa)); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(!BP_GET_DEDUP(bp)); if (pass >= zfs_sync_pass_dont_compress) compress = ZIO_COMPRESS_OFF; /* Make sure someone doesn't change their mind on overwrites */ ASSERT(BP_IS_EMBEDDED(bp) || MIN(zp->zp_copies + BP_IS_GANG(bp), spa_max_replication(spa)) == BP_GET_NDVAS(bp)); } /* If it's a compressed write that is not raw, compress the buffer. */ if (compress != ZIO_COMPRESS_OFF && psize == lsize) { void *cbuf = zio_buf_alloc(lsize); psize = zio_compress_data(compress, zio->io_abd, cbuf, lsize); if (psize == 0 || psize == lsize) { compress = ZIO_COMPRESS_OFF; zio_buf_free(cbuf, lsize); } else if (!zp->zp_dedup && psize <= BPE_PAYLOAD_SIZE && zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) && spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) { encode_embedded_bp_compressed(bp, cbuf, compress, lsize, psize); BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA); BP_SET_TYPE(bp, zio->io_prop.zp_type); BP_SET_LEVEL(bp, zio->io_prop.zp_level); zio_buf_free(cbuf, lsize); bp->blk_birth = zio->io_txg; zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; ASSERT(spa_feature_is_active(spa, SPA_FEATURE_EMBEDDED_DATA)); return (ZIO_PIPELINE_CONTINUE); } else { /* * Round the compressed size up to the ashift * of the smallest-ashift device, and zero the tail. * This ensures that the compressed size of the BP * (and thus the compressratio property) is correct, * in that we charge for the padding used to fill out * the last sector. */ size_t rounded; ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT); rounded = (size_t)P2ROUNDUP(psize, 1ULL << spa->spa_min_ashift); if (rounded >= lsize) { compress = ZIO_COMPRESS_OFF; zio_buf_free(cbuf, lsize); psize = lsize; } else { abd_t *cdata = abd_get_from_buf(cbuf, lsize); abd_take_ownership_of_buf(cdata, B_TRUE); abd_zero_off(cdata, psize, rounded - psize); psize = rounded; zio_push_transform(zio, cdata, psize, lsize, NULL); } } /* * We were unable to handle this as an override bp, treat * it as a regular write I/O. */ zio->io_bp_override = NULL; *bp = zio->io_bp_orig; zio->io_pipeline = zio->io_orig_pipeline; } else { ASSERT3U(psize, !=, 0); } /* * The final pass of spa_sync() must be all rewrites, but the first * few passes offer a trade-off: allocating blocks defers convergence, * but newly allocated blocks are sequential, so they can be written * to disk faster. Therefore, we allow the first few passes of * spa_sync() to allocate new blocks, but force rewrites after that. * There should only be a handful of blocks after pass 1 in any case.
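* The rewrite cutoff is zfs_sync_pass_rewrite (pass 2 by default); the check below also requires BP_GET_PSIZE() to match so the existing allocation can be reused as-is.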
*/ if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == psize && pass >= zfs_sync_pass_rewrite) { enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES; ASSERT(psize != 0); zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages; zio->io_flags |= ZIO_FLAG_IO_REWRITE; } else { BP_ZERO(bp); zio->io_pipeline = ZIO_WRITE_PIPELINE; } if (psize == 0) { if (zio->io_bp_orig.blk_birth != 0 && spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) { BP_SET_LSIZE(bp, lsize); BP_SET_TYPE(bp, zp->zp_type); BP_SET_LEVEL(bp, zp->zp_level); BP_SET_BIRTH(bp, zio->io_txg, 0); } zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; } else { ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER); BP_SET_LSIZE(bp, lsize); BP_SET_TYPE(bp, zp->zp_type); BP_SET_LEVEL(bp, zp->zp_level); BP_SET_PSIZE(bp, psize); BP_SET_COMPRESS(bp, compress); BP_SET_CHECKSUM(bp, zp->zp_checksum); BP_SET_DEDUP(bp, zp->zp_dedup); BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); if (zp->zp_dedup) { ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE; } if (zp->zp_nopwrite) { ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); zio->io_pipeline |= ZIO_STAGE_NOP_WRITE; } } return (ZIO_PIPELINE_CONTINUE); } static int zio_free_bp_init(zio_t *zio) { blkptr_t *bp = zio->io_bp; if (zio->io_child_type == ZIO_CHILD_LOGICAL) { if (BP_GET_DEDUP(bp)) zio->io_pipeline = ZIO_DDT_FREE_PIPELINE; } return (ZIO_PIPELINE_CONTINUE); } /* * ========================================================================== * Execute the I/O pipeline * ========================================================================== */ static void zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline) { spa_t *spa = zio->io_spa; zio_type_t t = zio->io_type; int flags = (cutinline ? TQ_FRONT : 0); /* * If we're a config writer or a probe, the normal issue and * interrupt threads may all be blocked waiting for the config lock. * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL. */ if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE)) t = ZIO_TYPE_NULL; /* * A similar issue exists for the L2ARC write thread until L2ARC 2.0. */ if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux) t = ZIO_TYPE_NULL; /* * If this is a high priority I/O, then use the high priority taskq if * available. */ if (zio->io_priority == ZIO_PRIORITY_NOW && spa->spa_zio_taskq[t][q + 1].stqs_count != 0) q++; ASSERT3U(q, <, ZIO_TASKQ_TYPES); /* * NB: We are assuming that the zio can only be dispatched * to a single taskq at a time. It would be a grievous error * to dispatch the zio to another taskq at the same time. 
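 */

/*
 * Illustrative sketch, not part of this file: the taskq-selection rules
 * earlier in this function, modeled in isolation.  The enums and all
 * ex_-prefixed names are hypothetical; guarded by #if 0 so nothing here
 * is compiled.
 */
#if 0
#include <stdio.h>

enum ex_ztype { EX_TYPE_NULL, EX_TYPE_READ, EX_TYPE_WRITE };
enum ex_zq { EX_Q_ISSUE, EX_Q_ISSUE_HIGH, EX_Q_INTR, EX_Q_INTR_HIGH };

static void
ex_select_taskq(int config_writer_or_probe, int now_priority,
    int high_q_populated, enum ex_ztype *t, enum ex_zq *q)
{
	/* Config writers and probes use the otherwise-unused NULL taskqs. */
	if (config_writer_or_probe)
		*t = EX_TYPE_NULL;
	/* "Now" priority I/O is bumped to the _HIGH queue when one exists. */
	if (now_priority && high_q_populated)
		*q = (enum ex_zq)(*q + 1);
}

int
main(void)
{
	enum ex_ztype t = EX_TYPE_WRITE;
	enum ex_zq q = EX_Q_ISSUE;

	ex_select_taskq(1, 1, 1, &t, &q);
	printf("type=%d queue=%d\n", t, q);	/* type=0 queue=1 */
	return (0);
}
#endif
/*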
*/ ASSERT(taskq_empty_ent(&zio->io_tqent)); spa_taskq_dispatch_ent(spa, t, q, (task_func_t *)zio_execute, zio, flags, &zio->io_tqent); } static boolean_t zio_taskq_member(zio_t *zio, zio_taskq_type_t q) { kthread_t *executor = zio->io_executor; spa_t *spa = zio->io_spa; zio_type_t t; for (t = 0; t < ZIO_TYPES; t++) { spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; uint_t i; for (i = 0; i < tqs->stqs_count; i++) { if (taskq_member(tqs->stqs_taskq[i], executor)) return (B_TRUE); } } return (B_FALSE); } static int zio_issue_async(zio_t *zio) { zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); return (ZIO_PIPELINE_STOP); } void zio_interrupt(zio_t *zio) { zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE); } void zio_delay_interrupt(zio_t *zio) { /* * The timeout_generic() function isn't defined in userspace, so * rather than trying to implement the function, the zio delay * functionality has been disabled for userspace builds. */ #ifdef _KERNEL /* * If io_target_timestamp is zero, then no delay has been registered * for this IO, thus jump to the end of this function and "skip" the * delay; issuing it directly to the zio layer. */ if (zio->io_target_timestamp != 0) { hrtime_t now = gethrtime(); if (now >= zio->io_target_timestamp) { /* * This IO has already taken longer than the target * delay to complete, so we don't want to delay it * any longer; we "miss" the delay and issue it * directly to the zio layer. This is likely due to * the target latency being set to a value less than * the underlying hardware can satisfy (e.g. delay * set to 1ms, but the disks take 10ms to complete an * IO request). */ DTRACE_PROBE2(zio__delay__miss, zio_t *, zio, hrtime_t, now); zio_interrupt(zio); } else { taskqid_t tid; hrtime_t diff = zio->io_target_timestamp - now; clock_t expire_at_tick = ddi_get_lbolt() + NSEC_TO_TICK(diff); DTRACE_PROBE3(zio__delay__hit, zio_t *, zio, hrtime_t, now, hrtime_t, diff); if (NSEC_TO_TICK(diff) == 0) { /* Our delay is less than a jiffy - just spin */ zfs_sleep_until(zio->io_target_timestamp); } else { /* * Use taskq_dispatch_delay() in the place of * OpenZFS's timeout_generic(). */ tid = taskq_dispatch_delay(system_taskq, (task_func_t *)zio_interrupt, zio, TQ_NOSLEEP, expire_at_tick); if (tid == TASKQID_INVALID) { /* * Couldn't allocate a task. Just * finish the zio without a delay. */ zio_interrupt(zio); } } } return; } #endif DTRACE_PROBE1(zio__delay__skip, zio_t *, zio); zio_interrupt(zio); } /* * Execute the I/O pipeline until one of the following occurs: * (1) the I/O completes; (2) the pipeline stalls waiting for * dependent child I/Os; (3) the I/O issues, so we're waiting * for an I/O completion interrupt; (4) the I/O is delegated by * vdev-level caching or aggregation; (5) the I/O is deferred * due to vdev-level queueing; (6) the I/O is handed off to * another thread. In all cases, the pipeline stops whenever * there's no CPU work; it never burns a thread in cv_wait_io(). * * There's no locking on io_stage because there's no legitimate way * for multiple threads to be attempting to process the same I/O. */ static zio_pipe_stage_t *zio_pipeline[]; /* * zio_execute() is a wrapper around the static function * __zio_execute() so that we can force __zio_execute() to be * inlined. This reduces stack overhead which is important * because __zio_execute() is called recursively in several zio * code paths. zio_execute() itself cannot be inlined because * it is externally visible. 
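 */

/*
 * Illustrative sketch, not part of this file: the three-way decision
 * zio_delay_interrupt() makes above, reduced to plain integers.  The
 * 100 Hz tick and all ex_-prefixed names are hypothetical; guarded by
 * #if 0 so nothing here is compiled.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

#define EX_TICK_NSEC	10000000LL	/* pretend 100 Hz: 10 ms per tick */

enum ex_delay { EX_ISSUE_NOW, EX_SPIN, EX_SCHEDULE };

static enum ex_delay
ex_delay_decision(int64_t now, int64_t target)
{
	if (target == 0 || now >= target)
		return (EX_ISSUE_NOW);		/* no delay, or we missed it */
	if ((target - now) / EX_TICK_NSEC == 0)
		return (EX_SPIN);		/* sub-tick delay: busy-wait */
	return (EX_SCHEDULE);			/* dispatch a delayed task */
}

int
main(void)
{
	printf("%d %d %d\n",
	    ex_delay_decision(5, 0),
	    ex_delay_decision(100, 100 + 1000),
	    ex_delay_decision(100, 100 + 3 * EX_TICK_NSEC));	/* 0 1 2 */
	return (0);
}
#endif
/*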
*/ void zio_execute(zio_t *zio) { fstrans_cookie_t cookie; cookie = spl_fstrans_mark(); __zio_execute(zio); spl_fstrans_unmark(cookie); } /* * Used to determine if in the current context the stack is sized large * enough to allow zio_execute() to be called recursively. A minimum * stack size of 16K is required to avoid needing to re-dispatch the zio. */ boolean_t zio_execute_stack_check(zio_t *zio) { #if !defined(HAVE_LARGE_STACKS) dsl_pool_t *dp = spa_get_dsl(zio->io_spa); /* Executing in txg_sync_thread() context. */ if (dp && curthread == dp->dp_tx.tx_sync_thread) return (B_TRUE); /* Pool initialization outside of zio_taskq context. */ if (dp && spa_is_initializing(dp->dp_spa) && !zio_taskq_member(zio, ZIO_TASKQ_ISSUE) && !zio_taskq_member(zio, ZIO_TASKQ_ISSUE_HIGH)) return (B_TRUE); #endif /* HAVE_LARGE_STACKS */ return (B_FALSE); } __attribute__((always_inline)) static inline void __zio_execute(zio_t *zio) { zio->io_executor = curthread; ASSERT3U(zio->io_queued_timestamp, >, 0); while (zio->io_stage < ZIO_STAGE_DONE) { enum zio_stage pipeline = zio->io_pipeline; enum zio_stage stage = zio->io_stage; int rv; ASSERT(!MUTEX_HELD(&zio->io_lock)); ASSERT(ISP2(stage)); ASSERT(zio->io_stall == NULL); do { stage <<= 1; } while ((stage & pipeline) == 0); ASSERT(stage <= ZIO_STAGE_DONE); /* * If we are in interrupt context and this pipeline stage * will grab a config lock that is held across I/O, * or may wait for an I/O that needs an interrupt thread * to complete, issue async to avoid deadlock. * * For VDEV_IO_START, we cut in line so that the io will * be sent to disk promptly. */ if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL && zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) { boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ? zio_requeue_io_start_cut_in_line : B_FALSE; zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut); return; } /* * If the current context doesn't have large enough stacks * the zio must be issued asynchronously to prevent overflow. */ if (zio_execute_stack_check(zio)) { boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ? zio_requeue_io_start_cut_in_line : B_FALSE; zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut); return; } zio->io_stage = stage; zio->io_pipeline_trace |= zio->io_stage; rv = zio_pipeline[highbit64(stage) - 1](zio); if (rv == ZIO_PIPELINE_STOP) return; ASSERT(rv == ZIO_PIPELINE_CONTINUE); } } /* * ========================================================================== * Initiate I/O, either sync or async * ========================================================================== */ int zio_wait(zio_t *zio) { int error; ASSERT(zio->io_stage == ZIO_STAGE_OPEN); ASSERT(zio->io_executor == NULL); zio->io_waiter = curthread; ASSERT0(zio->io_queued_timestamp); zio->io_queued_timestamp = gethrtime(); __zio_execute(zio); mutex_enter(&zio->io_lock); while (zio->io_executor != NULL) cv_wait_io(&zio->io_cv, &zio->io_lock); mutex_exit(&zio->io_lock); error = zio->io_error; zio_destroy(zio); return (error); } void zio_nowait(zio_t *zio) { ASSERT(zio->io_executor == NULL); if (zio->io_child_type == ZIO_CHILD_LOGICAL && zio_unique_parent(zio) == NULL) { zio_t *pio; /* * This is a logical async I/O with no parent to wait for it. * We add it to the spa_async_root_zio "Godfather" I/O which * will ensure they complete prior to unloading the pool. 
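 */

/*
 * Illustrative sketch, not part of this file: the stage-advancing loop
 * at the heart of __zio_execute() above, run over a toy pipeline mask.
 * The EX_STAGE_* bits and ex_highbit() are hypothetical stand-ins for
 * the zio_stage bits and highbit64(); guarded by #if 0.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

#define EX_STAGE_OPEN	(1u << 0)
#define EX_STAGE_CSUM	(1u << 2)
#define EX_STAGE_VDEV	(1u << 4)
#define EX_STAGE_DONE	(1u << 6)

static int
ex_highbit(uint32_t v)	/* 1-based index of the highest set bit */
{
	int h = 0;

	while (v != 0) {
		h++;
		v >>= 1;
	}
	return (h);
}

int
main(void)
{
	uint32_t pipeline = EX_STAGE_OPEN | EX_STAGE_CSUM |
	    EX_STAGE_VDEV | EX_STAGE_DONE;
	uint32_t stage = EX_STAGE_OPEN;

	while (stage < EX_STAGE_DONE) {
		/* Advance to the next stage bit present in the pipeline. */
		do {
			stage <<= 1;
		} while ((stage & pipeline) == 0);
		printf("run stage index %d\n", ex_highbit(stage) - 1);
	}
	return (0);	/* prints stage indexes 2, 4, 6 */
}
#endif
/*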
*/ spa_t *spa = zio->io_spa; kpreempt_disable(); pio = spa->spa_async_zio_root[CPU_SEQID]; kpreempt_enable(); zio_add_child(pio, zio); } ASSERT0(zio->io_queued_timestamp); zio->io_queued_timestamp = gethrtime(); __zio_execute(zio); } /* * ========================================================================== * Reexecute or suspend/resume failed I/O * ========================================================================== */ static void zio_reexecute(zio_t *pio) { zio_t *cio, *cio_next; int c, w; zio_link_t *zl = NULL; ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN); ASSERT(pio->io_gang_leader == NULL); ASSERT(pio->io_gang_tree == NULL); pio->io_flags = pio->io_orig_flags; pio->io_stage = pio->io_orig_stage; pio->io_pipeline = pio->io_orig_pipeline; pio->io_reexecute = 0; pio->io_flags |= ZIO_FLAG_REEXECUTED; pio->io_pipeline_trace = 0; pio->io_error = 0; for (w = 0; w < ZIO_WAIT_TYPES; w++) pio->io_state[w] = 0; for (c = 0; c < ZIO_CHILD_TYPES; c++) pio->io_child_error[c] = 0; if (IO_IS_ALLOCATING(pio)) BP_ZERO(pio->io_bp); /* * As we reexecute pio's children, new children could be created. * New children go to the head of pio's io_child_list, however, * so we will (correctly) not reexecute them. The key is that * the remainder of pio's io_child_list, from 'cio_next' onward, * cannot be affected by any side effects of reexecuting 'cio'. */ for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) { cio_next = zio_walk_children(pio, &zl); mutex_enter(&pio->io_lock); for (w = 0; w < ZIO_WAIT_TYPES; w++) pio->io_children[cio->io_child_type][w]++; mutex_exit(&pio->io_lock); zio_reexecute(cio); } /* * Now that all children have been reexecuted, execute the parent. * We don't reexecute "The Godfather" I/O here as it's the * responsibility of the caller to wait on him. */ if (!(pio->io_flags & ZIO_FLAG_GODFATHER)) { pio->io_queued_timestamp = gethrtime(); __zio_execute(pio); } } void zio_suspend(spa_t *spa, zio_t *zio) { if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC) fm_panic("Pool '%s' has encountered an uncorrectable I/O " "failure and the failure mode property for this pool " "is set to panic.", spa_name(spa)); cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable I/O " "failure and has been suspended.\n", spa_name(spa)); zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0); mutex_enter(&spa->spa_suspend_lock); if (spa->spa_suspend_zio_root == NULL) spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); spa->spa_suspended = B_TRUE; if (zio != NULL) { ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); ASSERT(zio != spa->spa_suspend_zio_root); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(zio_unique_parent(zio) == NULL); ASSERT(zio->io_stage == ZIO_STAGE_DONE); zio_add_child(spa->spa_suspend_zio_root, zio); } mutex_exit(&spa->spa_suspend_lock); } int zio_resume(spa_t *spa) { zio_t *pio; /* * Reexecute all previously suspended i/o. 
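 */

/*
 * Illustrative sketch, not part of this file: the list-walk invariant
 * zio_reexecute() relies on above -- grab the next pointer before
 * visiting a node, so nodes prepended at the head during the visit are
 * never revisited.  All ex_-prefixed names are hypothetical; #if 0.
 */
#if 0
#include <stdio.h>
#include <stdlib.h>

typedef struct ex_node {
	int id;
	struct ex_node *next;
} ex_node_t;

static ex_node_t *ex_head;

static void
ex_prepend(int id)
{
	ex_node_t *n = malloc(sizeof (*n));

	n->id = id;
	n->next = ex_head;
	ex_head = n;
}

int
main(void)
{
	ex_node_t *c, *c_next;

	ex_prepend(1);
	ex_prepend(2);
	for (c = ex_head; c != NULL; c = c_next) {
		c_next = c->next;
		ex_prepend(c->id + 100);	/* side effect of the visit */
		printf("visited %d\n", c->id);	/* visits 2, then 1 only */
	}
	return (0);
}
#endif
/*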
*/ mutex_enter(&spa->spa_suspend_lock); spa->spa_suspended = B_FALSE; cv_broadcast(&spa->spa_suspend_cv); pio = spa->spa_suspend_zio_root; spa->spa_suspend_zio_root = NULL; mutex_exit(&spa->spa_suspend_lock); if (pio == NULL) return (0); zio_reexecute(pio); return (zio_wait(pio)); } void zio_resume_wait(spa_t *spa) { mutex_enter(&spa->spa_suspend_lock); while (spa_suspended(spa)) cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock); mutex_exit(&spa->spa_suspend_lock); } /* * ========================================================================== * Gang blocks. * * A gang block is a collection of small blocks that looks to the DMU * like one large block. When zio_dva_allocate() cannot find a block * of the requested size, due to either severe fragmentation or the pool * being nearly full, it calls zio_write_gang_block() to construct the * block from smaller fragments. * * A gang block consists of a gang header (zio_gbh_phys_t) and up to * three (SPA_GBH_NBLKPTRS) gang members. The gang header is just like * an indirect block: it's an array of block pointers. It consumes * only one sector and hence is allocatable regardless of fragmentation. * The gang header's bps point to its gang members, which hold the data. * * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg> * as the verifier to ensure uniqueness of the SHA256 checksum. * Critically, the gang block bp's blk_cksum is the checksum of the data, * not the gang header. This ensures that data block signatures (needed for * deduplication) are independent of how the block is physically stored. * * Gang blocks can be nested: a gang member may itself be a gang block. * Thus every gang block is a tree in which root and all interior nodes are * gang headers, and the leaves are normal blocks that contain user data. * The root of the gang tree is called the gang leader. * * To perform any operation (read, rewrite, free, claim) on a gang block, * zio_gang_assemble() first assembles the gang tree (minus data leaves) * in the io_gang_tree field of the original logical i/o by recursively * reading the gang leader and all gang headers below it. This yields * an in-core tree containing the contents of every gang header and the * bps for every constituent of the gang block. * * With the gang tree now assembled, zio_gang_issue() just walks the gang tree * and invokes a callback on each bp. To free a gang block, zio_gang_issue() * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp. * zio_claim_gang() provides a similarly trivial wrapper for zio_claim(). * zio_read_gang() is a wrapper around zio_read() that omits reading gang * headers, since we already have those in io_gang_tree. zio_rewrite_gang() * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite() * of the gang header plus zio_checksum_compute() of the data to update the * gang header's blk_cksum as described above. * * The two-phase assemble/issue model solves the problem of partial failure -- * what if you'd freed part of a gang block but then couldn't read the * gang header for another part? Assembling the entire gang tree first * ensures that all the necessary gang header I/O has succeeded before * starting the actual work of free, claim, or write. Once the gang tree * is assembled, free and claim are in-memory operations that cannot fail. * * In the event that a gang write fails, zio_dva_unallocate() walks the * gang tree to immediately free (i.e. insert back into the space map) * everything we've allocated.
This ensures that we don't get ENOSPC * errors during repeated suspend/resume cycles due to a flaky device. * * Gang rewrites only happen during sync-to-convergence. If we can't assemble * the gang tree, we won't modify the block, so we can safely defer the free * (knowing that the block is still intact). If we *can* assemble the gang * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free * each constituent bp and we can allocate a new block on the next sync pass. * * In all cases, the gang tree allows complete recovery from partial failure. * ========================================================================== */ static void zio_gang_issue_func_done(zio_t *zio) { abd_put(zio->io_abd); } static zio_t * zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, uint64_t offset) { if (gn != NULL) return (pio); return (zio_read(pio, pio->io_spa, bp, abd_get_offset(data, offset), BP_GET_PSIZE(bp), zio_gang_issue_func_done, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark)); } static zio_t * zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, uint64_t offset) { zio_t *zio; if (gn != NULL) { abd_t *gbh_abd = abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE); zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, gbh_abd, SPA_GANGBLOCKSIZE, zio_gang_issue_func_done, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); /* * As we rewrite each gang header, the pipeline will compute * a new gang block header checksum for it; but no one will * compute a new data checksum, so we do that here. The one * exception is the gang leader: the pipeline already computed * its data checksum because that stage precedes gang assembly. * (Presently, nothing actually uses interior data checksums; * this is just good hygiene.) */ if (gn != pio->io_gang_leader->io_gang_tree) { abd_t *buf = abd_get_offset(data, offset); zio_checksum_compute(zio, BP_GET_CHECKSUM(bp), buf, BP_GET_PSIZE(bp)); abd_put(buf); } /* * If we are here to damage data for testing purposes, * leave the GBH alone so that we can detect the damage. 
*/ if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE) zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; } else { zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, abd_get_offset(data, offset), BP_GET_PSIZE(bp), zio_gang_issue_func_done, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); } return (zio); } /* ARGSUSED */ static zio_t * zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, uint64_t offset) { return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp, ZIO_GANG_CHILD_FLAGS(pio))); } /* ARGSUSED */ static zio_t * zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, uint64_t offset) { return (zio_claim(pio, pio->io_spa, pio->io_txg, bp, NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio))); } static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = { NULL, zio_read_gang, zio_rewrite_gang, zio_free_gang, zio_claim_gang, NULL }; static void zio_gang_tree_assemble_done(zio_t *zio); static zio_gang_node_t * zio_gang_node_alloc(zio_gang_node_t **gnpp) { zio_gang_node_t *gn; ASSERT(*gnpp == NULL); gn = kmem_zalloc(sizeof (*gn), KM_SLEEP); gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE); *gnpp = gn; return (gn); } static void zio_gang_node_free(zio_gang_node_t **gnpp) { zio_gang_node_t *gn = *gnpp; int g; for (g = 0; g < SPA_GBH_NBLKPTRS; g++) ASSERT(gn->gn_child[g] == NULL); zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE); kmem_free(gn, sizeof (*gn)); *gnpp = NULL; } static void zio_gang_tree_free(zio_gang_node_t **gnpp) { zio_gang_node_t *gn = *gnpp; int g; if (gn == NULL) return; for (g = 0; g < SPA_GBH_NBLKPTRS; g++) zio_gang_tree_free(&gn->gn_child[g]); zio_gang_node_free(gnpp); } static void zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp) { zio_gang_node_t *gn = zio_gang_node_alloc(gnpp); abd_t *gbh_abd = abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE); ASSERT(gio->io_gang_leader == gio); ASSERT(BP_IS_GANG(bp)); zio_nowait(zio_read(gio, gio->io_spa, bp, gbh_abd, SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn, gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark)); } static void zio_gang_tree_assemble_done(zio_t *zio) { zio_t *gio = zio->io_gang_leader; zio_gang_node_t *gn = zio->io_private; blkptr_t *bp = zio->io_bp; int g; ASSERT(gio == zio_unique_parent(zio)); ASSERT(zio->io_child_count == 0); if (zio->io_error) return; /* this ABD was created from a linear buf in zio_gang_tree_assemble */ if (BP_SHOULD_BYTESWAP(bp)) byteswap_uint64_array(abd_to_buf(zio->io_abd), zio->io_size); ASSERT3P(abd_to_buf(zio->io_abd), ==, gn->gn_gbh); ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); abd_put(zio->io_abd); for (g = 0; g < SPA_GBH_NBLKPTRS; g++) { blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; if (!BP_IS_GANG(gbp)) continue; zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]); } } static void zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, abd_t *data, uint64_t offset) { zio_t *gio = pio->io_gang_leader; zio_t *zio; int g; ASSERT(BP_IS_GANG(bp) == !!gn); ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp)); ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree); /* * If you're a gang header, your data is in gn->gn_gbh. * If you're a gang member, your data is in 'data' and gn == NULL. 
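 */

/*
 * Illustrative sketch, not part of this file: the recursive gang-tree
 * walk below, reduced to a toy tree.  Headers contribute no payload of
 * their own; each leaf advances the running offset by its psize, so a
 * full walk from offset 0 reproduces the logical block's size.  All
 * ex_-prefixed names are hypothetical; guarded by #if 0.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

#define EX_NBLKPTRS	3	/* like SPA_GBH_NBLKPTRS */

typedef struct ex_gang {
	uint64_t psize;				/* leaf payload size */
	struct ex_gang *child[EX_NBLKPTRS];	/* used when a header */
	int is_header;
} ex_gang_t;

static uint64_t
ex_issue(ex_gang_t *g, uint64_t offset)
{
	int i;

	if (!g->is_header)
		return (offset + g->psize);
	for (i = 0; i < EX_NBLKPTRS; i++) {
		if (g->child[i] != NULL)
			offset = ex_issue(g->child[i], offset);
	}
	return (offset);
}

int
main(void)
{
	ex_gang_t a = { 4096, { NULL }, 0 };
	ex_gang_t b = { 8192, { NULL }, 0 };
	ex_gang_t root = { 0, { &a, &b, NULL }, 1 };

	printf("total = %llu\n",
	    (unsigned long long)ex_issue(&root, 0));	/* total = 12288 */
	return (0);
}
#endif
/*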
*/ zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data, offset); if (gn != NULL) { ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); for (g = 0; g < SPA_GBH_NBLKPTRS; g++) { blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; if (BP_IS_HOLE(gbp)) continue; zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data, offset); offset += BP_GET_PSIZE(gbp); } } if (gn == gio->io_gang_tree) ASSERT3U(gio->io_size, ==, offset); if (zio != pio) zio_nowait(zio); } static int zio_gang_assemble(zio_t *zio) { blkptr_t *bp = zio->io_bp; ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL); ASSERT(zio->io_child_type > ZIO_CHILD_GANG); zio->io_gang_leader = zio; zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree); return (ZIO_PIPELINE_CONTINUE); } static int zio_gang_issue(zio_t *zio) { blkptr_t *bp = zio->io_bp; if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE)) return (ZIO_PIPELINE_STOP); ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio); ASSERT(zio->io_child_type > ZIO_CHILD_GANG); if (zio->io_child_error[ZIO_CHILD_GANG] == 0) zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_abd, 0); else zio_gang_tree_free(&zio->io_gang_tree); zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; return (ZIO_PIPELINE_CONTINUE); } static void zio_write_gang_member_ready(zio_t *zio) { zio_t *pio = zio_unique_parent(zio); dva_t *cdva = zio->io_bp->blk_dva; dva_t *pdva = pio->io_bp->blk_dva; uint64_t asize; int d; ASSERTV(zio_t *gio = zio->io_gang_leader); if (BP_IS_HOLE(zio->io_bp)) return; ASSERT(BP_IS_HOLE(&zio->io_bp_orig)); ASSERT(zio->io_child_type == ZIO_CHILD_GANG); ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies); ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp)); ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp)); ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp)); mutex_enter(&pio->io_lock); for (d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) { ASSERT(DVA_GET_GANG(&pdva[d])); asize = DVA_GET_ASIZE(&pdva[d]); asize += DVA_GET_ASIZE(&cdva[d]); DVA_SET_ASIZE(&pdva[d], asize); } mutex_exit(&pio->io_lock); } static void zio_write_gang_done(zio_t *zio) { abd_put(zio->io_abd); } static int zio_write_gang_block(zio_t *pio) { spa_t *spa = pio->io_spa; metaslab_class_t *mc = spa_normal_class(spa); blkptr_t *bp = pio->io_bp; zio_t *gio = pio->io_gang_leader; zio_t *zio; zio_gang_node_t *gn, **gnpp; zio_gbh_phys_t *gbh; abd_t *gbh_abd; uint64_t txg = pio->io_txg; uint64_t resid = pio->io_size; uint64_t lsize; int copies = gio->io_prop.zp_copies; int gbh_copies = MIN(copies + 1, spa_max_replication(spa)); zio_prop_t zp; int g, error; int flags = METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER; if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA)); flags |= METASLAB_ASYNC_ALLOC; VERIFY(refcount_held(&mc->mc_alloc_slots, pio)); /* * The logical zio has already placed a reservation for * 'copies' allocation slots but gang blocks may require * additional copies. These additional copies * (i.e. gbh_copies - copies) are guaranteed to succeed * since metaslab_class_throttle_reserve() always allows * additional reservations for gang blocks. */ VERIFY(metaslab_class_throttle_reserve(mc, gbh_copies - copies, pio, flags)); } error = metaslab_alloc(spa, mc, SPA_GANGBLOCKSIZE, bp, gbh_copies, txg, pio == gio ? 
NULL : gio->io_bp, flags, &pio->io_alloc_list, pio); if (error) { if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA)); /* * If we failed to allocate the gang block header then * we remove any additional allocation reservations that * we placed here. The original reservation will * be removed when the logical I/O goes to the ready * stage. */ metaslab_class_throttle_unreserve(mc, gbh_copies - copies, pio); } pio->io_error = error; return (ZIO_PIPELINE_CONTINUE); } if (pio == gio) { gnpp = &gio->io_gang_tree; } else { gnpp = pio->io_private; ASSERT(pio->io_ready == zio_write_gang_member_ready); } gn = zio_gang_node_alloc(gnpp); gbh = gn->gn_gbh; bzero(gbh, SPA_GANGBLOCKSIZE); gbh_abd = abd_get_from_buf(gbh, SPA_GANGBLOCKSIZE); /* * Create the gang header. */ zio = zio_rewrite(pio, spa, txg, bp, gbh_abd, SPA_GANGBLOCKSIZE, zio_write_gang_done, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); /* * Create and nowait the gang children. */ for (g = 0; resid != 0; resid -= lsize, g++) { zio_t *cio; lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g), SPA_MINBLOCKSIZE); ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid); zp.zp_checksum = gio->io_prop.zp_checksum; zp.zp_compress = ZIO_COMPRESS_OFF; zp.zp_type = DMU_OT_NONE; zp.zp_level = 0; zp.zp_copies = gio->io_prop.zp_copies; zp.zp_dedup = B_FALSE; zp.zp_dedup_verify = B_FALSE; zp.zp_nopwrite = B_FALSE; cio = zio_write(zio, spa, txg, &gbh->zg_blkptr[g], abd_get_offset(pio->io_abd, pio->io_size - resid), lsize, lsize, &zp, zio_write_gang_member_ready, NULL, NULL, zio_write_gang_done, &gn->gn_child[g], pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA)); /* * Gang children won't throttle but we should * account for their work, so reserve an allocation * slot for them here. */ VERIFY(metaslab_class_throttle_reserve(mc, zp.zp_copies, cio, flags)); } zio_nowait(cio); } /* * Set pio's pipeline to just wait for zio to finish. */ pio->io_pipeline = ZIO_INTERLOCK_PIPELINE; /* * We didn't allocate this bp, so make sure it doesn't get unmarked. */ pio->io_flags &= ~ZIO_FLAG_FASTWRITE; zio_nowait(zio); return (ZIO_PIPELINE_CONTINUE); } /* * The zio_nop_write stage in the pipeline determines if allocating a * new bp is necessary. The nopwrite feature can handle writes in * either syncing or open context (i.e. zil writes) and as a result is * mutually exclusive with dedup. * * By leveraging a cryptographically secure checksum, such as SHA256, we * can compare the checksums of the new data and the old to determine if * allocating a new block is required. Note that our requirements for * cryptographic strength are fairly weak: there can't be any accidental * hash collisions, but we don't need to be secure against intentional * (malicious) collisions. To trigger a nopwrite, you have to be able * to write the file to begin with, and triggering an incorrect (hash * collision) nopwrite is no worse than simply writing to the file. * That said, there are no known attacks against the checksum algorithms * used for nopwrite, assuming that the salt and the checksums * themselves remain secret. 
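 */

/*
 * Illustrative sketch, not part of this file: the shape of the nopwrite
 * test implemented below -- skip the allocation only when the old block
 * exists, the checksum is strong enough to stand in for the data, the
 * block properties match, and the digests agree.  The ex_-prefixed
 * names are hypothetical; guarded by #if 0.
 */
#if 0
#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef struct ex_cksum {
	uint64_t word[4];	/* stands in for a 256-bit digest */
} ex_cksum_t;

static int
ex_nopwrite_ok(int old_is_hole, int cksum_is_strong, int props_match,
    const ex_cksum_t *oldc, const ex_cksum_t *newc)
{
	if (old_is_hole || !cksum_is_strong || !props_match)
		return (0);
	return (memcmp(oldc, newc, sizeof (*oldc)) == 0);
}

int
main(void)
{
	ex_cksum_t a = { { 1, 2, 3, 4 } };
	ex_cksum_t b = a;

	printf("%d\n", ex_nopwrite_ok(0, 1, 1, &a, &b));	/* 1: skip */
	printf("%d\n", ex_nopwrite_ok(0, 0, 1, &a, &b));	/* 0: write */
	return (0);
}
#endif
/*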
*/ static int zio_nop_write(zio_t *zio) { blkptr_t *bp = zio->io_bp; blkptr_t *bp_orig = &zio->io_bp_orig; zio_prop_t *zp = &zio->io_prop; ASSERT(BP_GET_LEVEL(bp) == 0); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); ASSERT(zp->zp_nopwrite); ASSERT(!zp->zp_dedup); ASSERT(zio->io_bp_override == NULL); ASSERT(IO_IS_ALLOCATING(zio)); /* * Check to see if the original bp and the new bp have matching * characteristics (i.e. same checksum, compression algorithms, etc). * If they don't then just continue with the pipeline which will * allocate a new bp. */ if (BP_IS_HOLE(bp_orig) || !(zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_flags & ZCHECKSUM_FLAG_NOPWRITE) || BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) || BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) || BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) || zp->zp_copies != BP_GET_NDVAS(bp_orig)) return (ZIO_PIPELINE_CONTINUE); /* * If the checksums match then reset the pipeline so that we * avoid allocating a new bp and issuing any I/O. */ if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) { ASSERT(zio_checksum_table[zp->zp_checksum].ci_flags & ZCHECKSUM_FLAG_NOPWRITE); ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig)); ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig)); ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF); ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop, sizeof (uint64_t)) == 0); *bp = *bp_orig; zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; zio->io_flags |= ZIO_FLAG_NOPWRITE; } return (ZIO_PIPELINE_CONTINUE); } /* * ========================================================================== * Dedup * ========================================================================== */ static void zio_ddt_child_read_done(zio_t *zio) { blkptr_t *bp = zio->io_bp; ddt_entry_t *dde = zio->io_private; ddt_phys_t *ddp; zio_t *pio = zio_unique_parent(zio); mutex_enter(&pio->io_lock); ddp = ddt_phys_select(dde, bp); if (zio->io_error == 0) ddt_phys_clear(ddp); /* this ddp doesn't need repair */ if (zio->io_error == 0 && dde->dde_repair_abd == NULL) dde->dde_repair_abd = zio->io_abd; else abd_free(zio->io_abd); mutex_exit(&pio->io_lock); } static int zio_ddt_read_start(zio_t *zio) { blkptr_t *bp = zio->io_bp; int p; ASSERT(BP_GET_DEDUP(bp)); ASSERT(BP_GET_PSIZE(bp) == zio->io_size); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); if (zio->io_child_error[ZIO_CHILD_DDT]) { ddt_t *ddt = ddt_select(zio->io_spa, bp); ddt_entry_t *dde = ddt_repair_start(ddt, bp); ddt_phys_t *ddp = dde->dde_phys; ddt_phys_t *ddp_self = ddt_phys_select(dde, bp); blkptr_t blk; ASSERT(zio->io_vsd == NULL); zio->io_vsd = dde; if (ddp_self == NULL) return (ZIO_PIPELINE_CONTINUE); for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { if (ddp->ddp_phys_birth == 0 || ddp == ddp_self) continue; ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp, &blk); zio_nowait(zio_read(zio, zio->io_spa, &blk, abd_alloc_for_io(zio->io_size, B_TRUE), zio->io_size, zio_ddt_child_read_done, dde, zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE, &zio->io_bookmark)); } return (ZIO_PIPELINE_CONTINUE); } zio_nowait(zio_read(zio, zio->io_spa, bp, zio->io_abd, zio->io_size, NULL, NULL, zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark)); return (ZIO_PIPELINE_CONTINUE); } static int zio_ddt_read_done(zio_t *zio) { blkptr_t *bp = zio->io_bp; if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE)) return (ZIO_PIPELINE_STOP); ASSERT(BP_GET_DEDUP(bp)); ASSERT(BP_GET_PSIZE(bp) == zio->io_size); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); if 
(zio->io_child_error[ZIO_CHILD_DDT]) { ddt_t *ddt = ddt_select(zio->io_spa, bp); ddt_entry_t *dde = zio->io_vsd; if (ddt == NULL) { ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE); return (ZIO_PIPELINE_CONTINUE); } if (dde == NULL) { zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1; zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); return (ZIO_PIPELINE_STOP); } if (dde->dde_repair_abd != NULL) { abd_copy(zio->io_abd, dde->dde_repair_abd, zio->io_size); zio->io_child_error[ZIO_CHILD_DDT] = 0; } ddt_repair_done(ddt, dde); zio->io_vsd = NULL; } ASSERT(zio->io_vsd == NULL); return (ZIO_PIPELINE_CONTINUE); } static boolean_t zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) { spa_t *spa = zio->io_spa; int p; boolean_t do_raw = !!(zio->io_flags & ZIO_FLAG_RAW); ASSERT(!(zio->io_bp_override && do_raw)); /* * Note: we compare the original data, not the transformed data, * because when zio->io_bp is an override bp, we will not have * pushed the I/O transforms. That's an important optimization * because otherwise we'd compress/encrypt all dmu_sync() data twice. * However, we should never get a raw, override zio so in these * cases we can compare the io_data directly. This is useful because * it allows us to do dedup verification even if we don't have access * to the original data (for instance, if the encryption keys aren't * loaded). */ for (p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { zio_t *lio = dde->dde_lead_zio[p]; if (lio != NULL && do_raw) { return (lio->io_size != zio->io_size || abd_cmp(zio->io_abd, lio->io_abd) != 0); } else if (lio != NULL) { return (lio->io_orig_size != zio->io_orig_size || abd_cmp(zio->io_orig_abd, lio->io_orig_abd) != 0); } } for (p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { ddt_phys_t *ddp = &dde->dde_phys[p]; if (ddp->ddp_phys_birth != 0 && do_raw) { blkptr_t blk = *zio->io_bp; uint64_t psize; abd_t *tmpabd; int error; ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth); psize = BP_GET_PSIZE(&blk); if (psize != zio->io_size) return (B_TRUE); ddt_exit(ddt); tmpabd = abd_alloc_for_io(psize, B_TRUE); error = zio_wait(zio_read(NULL, spa, &blk, tmpabd, psize, NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_RAW, &zio->io_bookmark)); if (error == 0) { if (abd_cmp(tmpabd, zio->io_abd) != 0) error = SET_ERROR(ENOENT); } abd_free(tmpabd); ddt_enter(ddt); return (error != 0); } else if (ddp->ddp_phys_birth != 0) { arc_buf_t *abuf = NULL; arc_flags_t aflags = ARC_FLAG_WAIT; blkptr_t blk = *zio->io_bp; int error; ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth); if (BP_GET_LSIZE(&blk) != zio->io_orig_size) return (B_TRUE); ddt_exit(ddt); error = arc_read(NULL, spa, &blk, arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &aflags, &zio->io_bookmark); if (error == 0) { if (abd_cmp_buf(zio->io_orig_abd, abuf->b_data, zio->io_orig_size) != 0) error = SET_ERROR(ENOENT); arc_buf_destroy(abuf, &abuf); } ddt_enter(ddt); return (error != 0); } } return (B_FALSE); } static void zio_ddt_child_write_ready(zio_t *zio) { int p = zio->io_prop.zp_copies; ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); ddt_entry_t *dde = zio->io_private; ddt_phys_t *ddp = &dde->dde_phys[p]; zio_t *pio; zio_link_t *zl; if (zio->io_error) return; ddt_enter(ddt); ASSERT(dde->dde_lead_zio[p] == zio); ddt_phys_fill(ddp, zio->io_bp); zl = NULL; while ((pio = zio_walk_parents(zio, &zl)) != NULL) ddt_bp_fill(ddp, pio->io_bp, zio->io_txg); ddt_exit(ddt); } static void zio_ddt_child_write_done(zio_t *zio) { int p = 
zio->io_prop.zp_copies; ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); ddt_entry_t *dde = zio->io_private; ddt_phys_t *ddp = &dde->dde_phys[p]; ddt_enter(ddt); ASSERT(ddp->ddp_refcnt == 0); ASSERT(dde->dde_lead_zio[p] == zio); dde->dde_lead_zio[p] = NULL; if (zio->io_error == 0) { zio_link_t *zl = NULL; while (zio_walk_parents(zio, &zl) != NULL) ddt_phys_addref(ddp); } else { ddt_phys_clear(ddp); } ddt_exit(ddt); } static void zio_ddt_ditto_write_done(zio_t *zio) { int p = DDT_PHYS_DITTO; blkptr_t *bp = zio->io_bp; ddt_t *ddt = ddt_select(zio->io_spa, bp); ddt_entry_t *dde = zio->io_private; ddt_phys_t *ddp = &dde->dde_phys[p]; ddt_key_t *ddk = &dde->dde_key; ASSERTV(zio_prop_t *zp = &zio->io_prop); ddt_enter(ddt); ASSERT(ddp->ddp_refcnt == 0); ASSERT(dde->dde_lead_zio[p] == zio); dde->dde_lead_zio[p] = NULL; if (zio->io_error == 0) { ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum)); ASSERT(zp->zp_copies < SPA_DVAS_PER_BP); ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp)); if (ddp->ddp_phys_birth != 0) ddt_phys_free(ddt, ddk, ddp, zio->io_txg); ddt_phys_fill(ddp, bp); } ddt_exit(ddt); } static int zio_ddt_write(zio_t *zio) { spa_t *spa = zio->io_spa; blkptr_t *bp = zio->io_bp; uint64_t txg = zio->io_txg; zio_prop_t *zp = &zio->io_prop; int p = zp->zp_copies; int ditto_copies; zio_t *cio = NULL; zio_t *dio = NULL; ddt_t *ddt = ddt_select(spa, bp); ddt_entry_t *dde; ddt_phys_t *ddp; ASSERT(BP_GET_DEDUP(bp)); ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum); ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override); ASSERT(!(zio->io_bp_override && (zio->io_flags & ZIO_FLAG_RAW))); ddt_enter(ddt); dde = ddt_lookup(ddt, bp, B_TRUE); ddp = &dde->dde_phys[p]; if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) { /* * If we're using a weak checksum, upgrade to a strong checksum * and try again. If we're already using a strong checksum, * we can't resolve it, so just convert to an ordinary write. * (And automatically e-mail a paper to Nature?) */ if (!(zio_checksum_table[zp->zp_checksum].ci_flags & ZCHECKSUM_FLAG_DEDUP)) { zp->zp_checksum = spa_dedup_checksum(spa); zio_pop_transforms(zio); zio->io_stage = ZIO_STAGE_OPEN; BP_ZERO(bp); } else { zp->zp_dedup = B_FALSE; } zio->io_pipeline = ZIO_WRITE_PIPELINE; ddt_exit(ddt); return (ZIO_PIPELINE_CONTINUE); } ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp); ASSERT(ditto_copies < SPA_DVAS_PER_BP); if (ditto_copies > ddt_ditto_copies_present(dde) && dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) { zio_prop_t czp = *zp; czp.zp_copies = ditto_copies; /* * If we arrived here with an override bp, we won't have run * the transform stack, so we won't have the data we need to * generate a child i/o. So, toss the override bp and restart. * This is safe, because using the override bp is just an * optimization; and it's rare, so the cost doesn't matter. 
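 */

/*
 * Illustrative sketch, not part of this file: the essence of the dedup
 * write path below, as a toy digest table.  An existing entry just
 * gains a reference; a new digest claims a slot and triggers the one
 * real write.  (A real DDT resolves collisions and can verify; this
 * hypothetical ex_ table does neither.)  Guarded by #if 0.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

#define EX_TABLE_SIZE	16

typedef struct ex_dde {
	uint64_t digest;	/* stands in for the block checksum */
	uint64_t refcnt;	/* 0 => slot free */
} ex_dde_t;

static ex_dde_t ex_ddt[EX_TABLE_SIZE];

static int	/* returns 1 when a physical write is needed */
ex_ddt_write(uint64_t digest)
{
	ex_dde_t *dde = &ex_ddt[digest % EX_TABLE_SIZE];

	if (dde->refcnt != 0 && dde->digest == digest) {
		dde->refcnt++;		/* dedup hit: no new allocation */
		return (0);
	}
	dde->digest = digest;
	dde->refcnt = 1;
	return (1);
}

int
main(void)
{
	int a = ex_ddt_write(42);
	int b = ex_ddt_write(42);
	int c = ex_ddt_write(7);

	printf("%d %d %d\n", a, b, c);	/* 1 0 1 */
	return (0);
}
#endif
/*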
*/ if (zio->io_bp_override) { zio_pop_transforms(zio); zio->io_stage = ZIO_STAGE_OPEN; zio->io_pipeline = ZIO_WRITE_PIPELINE; zio->io_bp_override = NULL; BP_ZERO(bp); ddt_exit(ddt); return (ZIO_PIPELINE_CONTINUE); } dio = zio_write(zio, spa, txg, bp, zio->io_orig_abd, zio->io_orig_size, zio->io_orig_size, &czp, NULL, NULL, NULL, zio_ddt_ditto_write_done, dde, zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); zio_push_transform(dio, zio->io_abd, zio->io_size, 0, NULL); dde->dde_lead_zio[DDT_PHYS_DITTO] = dio; } if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) { if (ddp->ddp_phys_birth != 0) ddt_bp_fill(ddp, bp, txg); if (dde->dde_lead_zio[p] != NULL) zio_add_child(zio, dde->dde_lead_zio[p]); else ddt_phys_addref(ddp); } else if (zio->io_bp_override) { ASSERT(bp->blk_birth == txg); ASSERT(BP_EQUAL(bp, zio->io_bp_override)); ddt_phys_fill(ddp, bp); ddt_phys_addref(ddp); } else { cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd, zio->io_orig_size, zio->io_orig_size, zp, zio_ddt_child_write_ready, NULL, NULL, zio_ddt_child_write_done, dde, zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); zio_push_transform(cio, zio->io_abd, zio->io_size, 0, NULL); dde->dde_lead_zio[p] = cio; } ddt_exit(ddt); if (cio) zio_nowait(cio); if (dio) zio_nowait(dio); return (ZIO_PIPELINE_CONTINUE); } ddt_entry_t *freedde; /* for debugging */ static int zio_ddt_free(zio_t *zio) { spa_t *spa = zio->io_spa; blkptr_t *bp = zio->io_bp; ddt_t *ddt = ddt_select(spa, bp); ddt_entry_t *dde; ddt_phys_t *ddp; ASSERT(BP_GET_DEDUP(bp)); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ddt_enter(ddt); freedde = dde = ddt_lookup(ddt, bp, B_TRUE); if (dde) { ddp = ddt_phys_select(dde, bp); if (ddp) ddt_phys_decref(ddp); } ddt_exit(ddt); return (ZIO_PIPELINE_CONTINUE); } /* * ========================================================================== * Allocate and free blocks * ========================================================================== */ static zio_t * zio_io_to_allocate(spa_t *spa) { zio_t *zio; ASSERT(MUTEX_HELD(&spa->spa_alloc_lock)); zio = avl_first(&spa->spa_alloc_tree); if (zio == NULL) return (NULL); ASSERT(IO_IS_ALLOCATING(zio)); /* * Try to place a reservation for this zio. If we're unable to * reserve then we throttle. */ if (!metaslab_class_throttle_reserve(spa_normal_class(spa), zio->io_prop.zp_copies, zio, 0)) { return (NULL); } avl_remove(&spa->spa_alloc_tree, zio); ASSERT3U(zio->io_stage, <, ZIO_STAGE_DVA_ALLOCATE); return (zio); } static int zio_dva_throttle(zio_t *zio) { spa_t *spa = zio->io_spa; zio_t *nio; if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE || !spa_normal_class(zio->io_spa)->mc_alloc_throttle_enabled || zio->io_child_type == ZIO_CHILD_GANG || zio->io_flags & ZIO_FLAG_NODATA) { return (ZIO_PIPELINE_CONTINUE); } ASSERT(zio->io_child_type > ZIO_CHILD_GANG); ASSERT3U(zio->io_queued_timestamp, >, 0); ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE); mutex_enter(&spa->spa_alloc_lock); ASSERT(zio->io_type == ZIO_TYPE_WRITE); avl_add(&spa->spa_alloc_tree, zio); nio = zio_io_to_allocate(zio->io_spa); mutex_exit(&spa->spa_alloc_lock); if (nio == zio) return (ZIO_PIPELINE_CONTINUE); if (nio != NULL) { ASSERT3U(nio->io_queued_timestamp, <=, zio->io_queued_timestamp); ASSERT(nio->io_stage == ZIO_STAGE_DVA_THROTTLE); /* * We are passing control to a new zio so make sure that * it is processed by a different thread. We do this to * avoid stack overflows that can occur when parents are * throttled and children are making progress. 
We allow * it to go to the head of the taskq since it's already * been waiting. */ zio_taskq_dispatch(nio, ZIO_TASKQ_ISSUE, B_TRUE); } return (ZIO_PIPELINE_STOP); } void zio_allocate_dispatch(spa_t *spa) { zio_t *zio; mutex_enter(&spa->spa_alloc_lock); zio = zio_io_to_allocate(spa); mutex_exit(&spa->spa_alloc_lock); if (zio == NULL) return; ASSERT3U(zio->io_stage, ==, ZIO_STAGE_DVA_THROTTLE); ASSERT0(zio->io_error); zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_TRUE); } static int zio_dva_allocate(zio_t *zio) { spa_t *spa = zio->io_spa; metaslab_class_t *mc = spa_normal_class(spa); blkptr_t *bp = zio->io_bp; int error; int flags = 0; if (zio->io_gang_leader == NULL) { ASSERT(zio->io_child_type > ZIO_CHILD_GANG); zio->io_gang_leader = zio; } ASSERT(BP_IS_HOLE(bp)); ASSERT0(BP_GET_NDVAS(bp)); ASSERT3U(zio->io_prop.zp_copies, >, 0); ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa)); ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); flags |= (zio->io_flags & ZIO_FLAG_FASTWRITE) ? METASLAB_FASTWRITE : 0; if (zio->io_flags & ZIO_FLAG_NODATA) flags |= METASLAB_DONT_THROTTLE; if (zio->io_flags & ZIO_FLAG_GANG_CHILD) flags |= METASLAB_GANG_CHILD; if (zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE) flags |= METASLAB_ASYNC_ALLOC; error = metaslab_alloc(spa, mc, zio->io_size, bp, zio->io_prop.zp_copies, zio->io_txg, NULL, flags, &zio->io_alloc_list, zio); if (error != 0) { spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, " "size %llu, error %d", spa_name(spa), zio, zio->io_size, error); if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) return (zio_write_gang_block(zio)); zio->io_error = error; } return (ZIO_PIPELINE_CONTINUE); } static int zio_dva_free(zio_t *zio) { metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE); return (ZIO_PIPELINE_CONTINUE); } static int zio_dva_claim(zio_t *zio) { int error; error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg); if (error) zio->io_error = error; return (ZIO_PIPELINE_CONTINUE); } /* * Undo an allocation. This is used by zio_done() when an I/O fails * and we want to give back the block we just allocated. * This handles both normal blocks and gang blocks. */ static void zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp) { int g; ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp)); ASSERT(zio->io_bp_override == NULL); if (!BP_IS_HOLE(bp)) metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE); if (gn != NULL) { for (g = 0; g < SPA_GBH_NBLKPTRS; g++) { zio_dva_unallocate(zio, gn->gn_child[g], &gn->gn_gbh->zg_blkptr[g]); } } } /* * Try to allocate an intent log block. Return 0 on success, errno on failure. */ int zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, uint64_t size, boolean_t use_slog) { int error = 1; zio_alloc_list_t io_alloc_list; ASSERT(txg > spa_syncing_txg(spa)); metaslab_trace_init(&io_alloc_list); if (use_slog) { error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1, txg, NULL, METASLAB_FASTWRITE, &io_alloc_list, NULL); } if (error) { error = metaslab_alloc(spa, spa_normal_class(spa), size, new_bp, 1, txg, NULL, METASLAB_FASTWRITE, &io_alloc_list, NULL); } metaslab_trace_fini(&io_alloc_list); if (error == 0) { BP_SET_LSIZE(new_bp, size); BP_SET_PSIZE(new_bp, size); BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF); BP_SET_CHECKSUM(new_bp, spa_version(spa) >= SPA_VERSION_SLIM_ZIL ? 
ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG); BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); BP_SET_LEVEL(new_bp, 0); BP_SET_DEDUP(new_bp, 0); BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER); } return (error); } /* * Free an intent log block. */ void zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp) { ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG); ASSERT(!BP_IS_GANG(bp)); zio_free(spa, txg, bp); } /* * ========================================================================== * Read and write to physical devices * ========================================================================== */ /* * Issue an I/O to the underlying vdev. Typically the issue pipeline * stops after this stage and will resume upon I/O completion. * However, there are instances where the vdev layer may need to * continue the pipeline when an I/O was not issued. Since the I/O * that was sent to the vdev layer might be different than the one * currently active in the pipeline (see vdev_queue_io()), we explicitly * force the underlying vdev layers to call either zio_execute() or * zio_interrupt() to ensure that the pipeline continues with the correct I/O. */ static int zio_vdev_io_start(zio_t *zio) { vdev_t *vd = zio->io_vd; uint64_t align; spa_t *spa = zio->io_spa; zio->io_delay = 0; ASSERT(zio->io_error == 0); ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0); if (vd == NULL) { if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) spa_config_enter(spa, SCL_ZIO, zio, RW_READER); /* * The mirror_ops handle multiple DVAs in a single BP. */ vdev_mirror_ops.vdev_op_io_start(zio); return (ZIO_PIPELINE_STOP); } ASSERT3P(zio->io_logical, !=, zio); /* * We keep track of time-sensitive I/Os so that the scan thread * can quickly react to certain workloads. In particular, we care * about non-scrubbing, top-level reads and writes with the following * characteristics: * - synchronous writes of user data to non-slog devices * - any reads of user data * When these conditions are met, adjust the timestamp of spa_last_io * which allows the scan thread to adjust its workload accordingly. */ if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL && vd == vd->vdev_top && !vd->vdev_islog && zio->io_bookmark.zb_objset != DMU_META_OBJSET && zio->io_txg != spa_syncing_txg(spa)) { uint64_t old = spa->spa_last_io; uint64_t new = ddi_get_lbolt64(); if (old != new) (void) atomic_cas_64(&spa->spa_last_io, old, new); } align = 1ULL << vd->vdev_top->vdev_ashift; if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && P2PHASE(zio->io_size, align) != 0) { /* Transform logical writes to be a full physical block size. */ uint64_t asize = P2ROUNDUP(zio->io_size, align); abd_t *abuf = abd_alloc_sametype(zio->io_abd, asize); ASSERT(vd == vd->vdev_top); if (zio->io_type == ZIO_TYPE_WRITE) { abd_copy(abuf, zio->io_abd, zio->io_size); abd_zero_off(abuf, zio->io_size, asize - zio->io_size); } zio_push_transform(zio, abuf, asize, asize, zio_subblock); } /* * If this is not a physical io, make sure that it is properly aligned * before proceeding. */ if (!(zio->io_flags & ZIO_FLAG_PHYSICAL)) { ASSERT0(P2PHASE(zio->io_offset, align)); ASSERT0(P2PHASE(zio->io_size, align)); } else { /* * For physical writes, we allow 512b aligned writes and assume * the device will perform a read-modify-write as necessary. 
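 */

/*
 * Illustrative sketch, not part of this file: the sub-block write
 * transform applied above -- pad the logical buffer out to the vdev's
 * allocation size and zero-fill the tail.  ex_pad_write() is a
 * hypothetical userland stand-in; guarded by #if 0.
 */
#if 0
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static void *
ex_pad_write(const void *buf, uint64_t size, uint64_t ashift,
    uint64_t *asizep)
{
	uint64_t align = 1ULL << ashift;
	uint64_t asize = (size + align - 1) & ~(align - 1);
	char *abuf = malloc(asize);

	memcpy(abuf, buf, size);
	memset(abuf + size, 0, asize - size);	/* zero the tail */
	*asizep = asize;
	return (abuf);
}

int
main(void)
{
	char data[3000] = { 1 };
	uint64_t asize;
	void *abuf = ex_pad_write(data, sizeof (data), 12, &asize);

	printf("padded %zu -> %llu\n", sizeof (data),
	    (unsigned long long)asize);	/* padded 3000 -> 4096 */
	free(abuf);
	return (0);
}
#endif
/*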
*/ ASSERT0(P2PHASE(zio->io_offset, SPA_MINBLOCKSIZE)); ASSERT0(P2PHASE(zio->io_size, SPA_MINBLOCKSIZE)); } VERIFY(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa)); /* * If this is a repair I/O, and there's no self-healing involved -- * that is, we're just resilvering what we expect to resilver -- * then don't do the I/O unless zio's txg is actually in vd's DTL. * This prevents spurious resilvering with nested replication. * For example, given a mirror of mirrors, (A+B)+(C+D), if only * A is out of date, we'll read from C+D, then use the data to * resilver A+B -- but we don't actually want to resilver B, just A. * The top-level mirror has no way to know this, so instead we just * discard unnecessary repairs as we work our way down the vdev tree. * The same logic applies to any form of nested replication: * ditto + mirror, RAID-Z + replacing, etc. This covers them all. */ if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) && !(zio->io_flags & ZIO_FLAG_SELF_HEAL) && zio->io_txg != 0 && /* not a delegated i/o */ !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) { ASSERT(zio->io_type == ZIO_TYPE_WRITE); zio_vdev_io_bypass(zio); return (ZIO_PIPELINE_CONTINUE); } if (vd->vdev_ops->vdev_op_leaf && (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) { if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio)) return (ZIO_PIPELINE_CONTINUE); if ((zio = vdev_queue_io(zio)) == NULL) return (ZIO_PIPELINE_STOP); if (!vdev_accessible(vd, zio)) { zio->io_error = SET_ERROR(ENXIO); zio_interrupt(zio); return (ZIO_PIPELINE_STOP); } } zio->io_delay = gethrtime(); vd->vdev_ops->vdev_op_io_start(zio); return (ZIO_PIPELINE_STOP); } static int zio_vdev_io_done(zio_t *zio) { vdev_t *vd = zio->io_vd; vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops; boolean_t unexpected_error = B_FALSE; if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) return (ZIO_PIPELINE_STOP); ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); if (zio->io_delay) zio->io_delay = gethrtime() - zio->io_delay; if (vd != NULL && vd->vdev_ops->vdev_op_leaf) { vdev_queue_io_done(zio); if (zio->io_type == ZIO_TYPE_WRITE) vdev_cache_write(zio); if (zio_injection_enabled && zio->io_error == 0) zio->io_error = zio_handle_device_injection(vd, zio, EIO); if (zio_injection_enabled && zio->io_error == 0) zio->io_error = zio_handle_label_injection(zio, EIO); if (zio->io_error) { if (!vdev_accessible(vd, zio)) { zio->io_error = SET_ERROR(ENXIO); } else { unexpected_error = B_TRUE; } } } ops->vdev_op_io_done(zio); if (unexpected_error) VERIFY(vdev_probe(vd, zio) == NULL); return (ZIO_PIPELINE_CONTINUE); } /* * For non-raidz ZIOs, we can just copy aside the bad data read from the * disk, and use that to finish the checksum ereport later. 
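 */

/*
 * Illustrative sketch, not part of this file: the DTL containment test
 * behind the repair-bypass logic in zio_vdev_io_start() above.  A
 * resilver write is only worth issuing when the target txg falls inside
 * one of the vdev's missing ranges.  The ex_ names and the ranges are
 * hypothetical; guarded by #if 0.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

typedef struct ex_range {
	uint64_t start, end;	/* [start, end) */
} ex_range_t;

/* A toy DTL: txgs this vdev is known to be missing. */
static const ex_range_t ex_dtl[] = { { 100, 120 }, { 300, 310 } };

static int
ex_dtl_contains(uint64_t txg)
{
	size_t i;

	for (i = 0; i < sizeof (ex_dtl) / sizeof (ex_dtl[0]); i++) {
		if (txg >= ex_dtl[i].start && txg < ex_dtl[i].end)
			return (1);
	}
	return (0);
}

int
main(void)
{
	/* 1 0: the second repair write would be bypassed. */
	printf("%d %d\n", ex_dtl_contains(105), ex_dtl_contains(200));
	return (0);
}
#endif
/*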
*/ static void zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr, const void *good_buf) { /* no processing needed */ zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE); } /*ARGSUSED*/ void zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored) { void *buf = zio_buf_alloc(zio->io_size); abd_copy_to_buf(buf, zio->io_abd, zio->io_size); zcr->zcr_cbinfo = zio->io_size; zcr->zcr_cbdata = buf; zcr->zcr_finish = zio_vsd_default_cksum_finish; zcr->zcr_free = zio_buf_free; } static int zio_vdev_io_assess(zio_t *zio) { vdev_t *vd = zio->io_vd; if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) return (ZIO_PIPELINE_STOP); if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) spa_config_exit(zio->io_spa, SCL_ZIO, zio); if (zio->io_vsd != NULL) { zio->io_vsd_ops->vsd_free(zio); zio->io_vsd = NULL; } if (zio_injection_enabled && zio->io_error == 0) zio->io_error = zio_handle_fault_injection(zio, EIO); /* * If the I/O failed, determine whether we should attempt to retry it. * * On retry, we cut in line in the issue queue, since we don't want * compression/checksumming/etc. work to prevent our (cheap) IO reissue. */ if (zio->io_error && vd == NULL && !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) { ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */ ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf */ zio->io_error = 0; zio->io_flags |= ZIO_FLAG_IO_RETRY | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE; zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1; zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, zio_requeue_io_start_cut_in_line); return (ZIO_PIPELINE_STOP); } /* * If we got an error on a leaf device, convert it to ENXIO * if the device is not accessible at all. */ if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf && !vdev_accessible(vd, zio)) zio->io_error = SET_ERROR(ENXIO); /* * If we can't write to an interior vdev (mirror or RAID-Z), * set vdev_cant_write so that we stop trying to allocate from it. */ if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE && vd != NULL && !vd->vdev_ops->vdev_op_leaf) { vd->vdev_cant_write = B_TRUE; } if (zio->io_error) zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; if (vd != NULL && vd->vdev_ops->vdev_op_leaf && zio->io_physdone != NULL) { ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED)); ASSERT(zio->io_child_type == ZIO_CHILD_VDEV); zio->io_physdone(zio->io_logical); } return (ZIO_PIPELINE_CONTINUE); } void zio_vdev_io_reissue(zio_t *zio) { ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); ASSERT(zio->io_error == 0); zio->io_stage >>= 1; } void zio_vdev_io_redone(zio_t *zio) { ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE); zio->io_stage >>= 1; } void zio_vdev_io_bypass(zio_t *zio) { ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); ASSERT(zio->io_error == 0); zio->io_flags |= ZIO_FLAG_IO_BYPASS; zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1; } /* * ========================================================================== * Generate and verify checksums * ========================================================================== */ static int zio_checksum_generate(zio_t *zio) { blkptr_t *bp = zio->io_bp; enum zio_checksum checksum; if (bp == NULL) { /* * This is zio_write_phys(). * We're either generating a label checksum, or none at all. 
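 */

/*
 * Illustrative sketch, not part of this file: a minimal fletcher-4 in
 * the style of one of the pool's checksum algorithms, the kind of
 * digest the generate/verify stages here compute over io_abd.  This
 * ex_ version handles only word-aligned buffers and no byteswapping;
 * guarded by #if 0.
 */
#if 0
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

static void
ex_fletcher4(const uint32_t *w, size_t nwords, uint64_t sum[4])
{
	uint64_t a = 0, b = 0, c = 0, d = 0;
	size_t i;

	for (i = 0; i < nwords; i++) {
		a += w[i];	/* four running sums over 32-bit words */
		b += a;
		c += b;
		d += c;
	}
	sum[0] = a; sum[1] = b; sum[2] = c; sum[3] = d;
}

int
main(void)
{
	uint32_t data[4] = { 1, 2, 3, 4 };
	uint64_t sum[4];

	ex_fletcher4(data, 4, sum);
	printf("%llu %llu %llu %llu\n",
	    (unsigned long long)sum[0], (unsigned long long)sum[1],
	    (unsigned long long)sum[2], (unsigned long long)sum[3]);
	return (0);	/* 10 20 35 56 */
}
#endif
/*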
*/ checksum = zio->io_prop.zp_checksum; if (checksum == ZIO_CHECKSUM_OFF) return (ZIO_PIPELINE_CONTINUE); ASSERT(checksum == ZIO_CHECKSUM_LABEL); } else { if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) { ASSERT(!IO_IS_ALLOCATING(zio)); checksum = ZIO_CHECKSUM_GANG_HEADER; } else { checksum = BP_GET_CHECKSUM(bp); } } zio_checksum_compute(zio, checksum, zio->io_abd, zio->io_size); return (ZIO_PIPELINE_CONTINUE); } static int zio_checksum_verify(zio_t *zio) { zio_bad_cksum_t info; blkptr_t *bp = zio->io_bp; int error; ASSERT(zio->io_vd != NULL); if (bp == NULL) { /* * This is zio_read_phys(). * We're either verifying a label checksum, or nothing at all. */ if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF) return (ZIO_PIPELINE_CONTINUE); ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL); } if ((error = zio_checksum_error(zio, &info)) != 0) { zio->io_error = error; if (error == ECKSUM && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { zfs_ereport_start_checksum(zio->io_spa, zio->io_vd, zio, zio->io_offset, zio->io_size, NULL, &info); } } return (ZIO_PIPELINE_CONTINUE); } /* * Called by RAID-Z to ensure we don't compute the checksum twice. */ void zio_checksum_verified(zio_t *zio) { zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; } /* * ========================================================================== * Error rank. Errors are ranked in the order 0, ENXIO, ECKSUM, EIO, other. * An error of 0 indicates success. ENXIO indicates whole-device failure, * which may be transient (e.g. unplugged) or permanent. ECKSUM and EIO * indicate errors that are specific to one I/O, and most likely permanent. * Any other error is presumed to be worse because we weren't expecting it. * ========================================================================== */ int zio_worst_error(int e1, int e2) { static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO }; int r1, r2; for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++) if (e1 == zio_error_rank[r1]) break; for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++) if (e2 == zio_error_rank[r2]) break; return (r1 > r2 ? e1 : e2); } /* * ========================================================================== * I/O completion * ========================================================================== */ static int zio_ready(zio_t *zio) { blkptr_t *bp = zio->io_bp; zio_t *pio, *pio_next; zio_link_t *zl = NULL; if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) || zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY)) return (ZIO_PIPELINE_STOP); if (zio->io_ready) { ASSERT(IO_IS_ALLOCATING(zio)); ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) || (zio->io_flags & ZIO_FLAG_NOPWRITE)); ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0); zio->io_ready(zio); } if (bp != NULL && bp != &zio->io_bp_copy) zio->io_bp_copy = *bp; if (zio->io_error != 0) { zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) { ASSERT(IO_IS_ALLOCATING(zio)); ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); /* * We were unable to allocate anything, unreserve and * issue the next I/O to allocate. */ metaslab_class_throttle_unreserve( spa_normal_class(zio->io_spa), zio->io_prop.zp_copies, zio); zio_allocate_dispatch(zio->io_spa); } } mutex_enter(&zio->io_lock); zio->io_state[ZIO_WAIT_READY] = 1; pio = zio_walk_parents(zio, &zl); mutex_exit(&zio->io_lock); /* * As we notify zio's parents, new parents could be added.
* New parents go to the head of zio's io_parent_list, however, * so we will (correctly) not notify them. The remainder of zio's * io_parent_list, from 'pio_next' onward, cannot change because * all parents must wait for us to be done before they can be done. */ for (; pio != NULL; pio = pio_next) { pio_next = zio_walk_parents(zio, &zl); zio_notify_parent(pio, zio, ZIO_WAIT_READY); } if (zio->io_flags & ZIO_FLAG_NODATA) { if (BP_IS_GANG(bp)) { zio->io_flags &= ~ZIO_FLAG_NODATA; } else { ASSERT((uintptr_t)zio->io_abd < SPA_MAXBLOCKSIZE); zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; } } if (zio_injection_enabled && zio->io_spa->spa_syncing_txg == zio->io_txg) zio_handle_ignored_writes(zio); return (ZIO_PIPELINE_CONTINUE); } /* * Update the allocation throttle accounting. */ static void zio_dva_throttle_done(zio_t *zio) { zio_t *pio = zio_unique_parent(zio); vdev_t *vd = zio->io_vd; int flags = METASLAB_ASYNC_ALLOC; ASSERTV(zio_t *lio = zio->io_logical); ASSERT3P(zio->io_bp, !=, NULL); ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); ASSERT3U(zio->io_priority, ==, ZIO_PRIORITY_ASYNC_WRITE); ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV); ASSERT(vd != NULL); ASSERT3P(vd, ==, vd->vdev_top); ASSERT(!(zio->io_flags & (ZIO_FLAG_IO_REPAIR | ZIO_FLAG_IO_RETRY))); ASSERT(zio->io_flags & ZIO_FLAG_IO_ALLOCATING); ASSERT(!(lio->io_flags & ZIO_FLAG_IO_REWRITE)); ASSERT(!(lio->io_orig_flags & ZIO_FLAG_NODATA)); /* * Parents of gang children can have two flavors -- ones that * allocated the gang header (will have ZIO_FLAG_IO_REWRITE set) * and ones that allocated the constituent blocks. The allocation * throttle needs to know the allocating parent zio so we must find * it here. */ if (pio->io_child_type == ZIO_CHILD_GANG) { /* * If our parent is a rewrite gang child then our grandparent * would have been the one that performed the allocation. */ if (pio->io_flags & ZIO_FLAG_IO_REWRITE) pio = zio_unique_parent(pio); flags |= METASLAB_GANG_CHILD; } ASSERT(IO_IS_ALLOCATING(pio)); ASSERT3P(zio, !=, zio->io_logical); ASSERT(zio->io_logical != NULL); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR)); ASSERT0(zio->io_flags & ZIO_FLAG_NOPWRITE); mutex_enter(&pio->io_lock); metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, pio, flags); mutex_exit(&pio->io_lock); metaslab_class_throttle_unreserve(spa_normal_class(zio->io_spa), 1, pio); /* * Call into the pipeline to see if there is more work that * needs to be done. If there is work to be done it will be * dispatched to another taskq thread. */ zio_allocate_dispatch(zio->io_spa); } static int zio_done(zio_t *zio) { /* * Always attempt to keep stack usage minimal here since * we can be called recursively up to 19 levels deep. */ uint64_t psize = zio->io_size; zio_t *pio, *pio_next; int c, w; zio_link_t *zl = NULL; /* * If our children haven't all completed, * wait for them and then repeat this pipeline stage. */ if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) || zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) || zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) || zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE)) return (ZIO_PIPELINE_STOP); /* * If the allocation throttle is enabled, then update the accounting. * We only track child I/Os that are part of an allocating async * write. We must do this since the allocation is performed * by the logical I/O but the actual write is done by child I/Os.
*/ if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING && zio->io_child_type == ZIO_CHILD_VDEV) { ASSERT(spa_normal_class( zio->io_spa)->mc_alloc_throttle_enabled); zio_dva_throttle_done(zio); } /* * If the allocation throttle is enabled, verify that * we have decremented the refcounts for every I/O that was throttled. */ if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) { ASSERT(zio->io_type == ZIO_TYPE_WRITE); ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(zio->io_bp != NULL); metaslab_group_alloc_verify(zio->io_spa, zio->io_bp, zio); VERIFY(refcount_not_held( &(spa_normal_class(zio->io_spa)->mc_alloc_slots), zio)); } for (c = 0; c < ZIO_CHILD_TYPES; c++) for (w = 0; w < ZIO_WAIT_TYPES; w++) ASSERT(zio->io_children[c][w] == 0); if (zio->io_bp != NULL && !BP_IS_EMBEDDED(zio->io_bp)) { ASSERT(zio->io_bp->blk_pad[0] == 0); ASSERT(zio->io_bp->blk_pad[1] == 0); ASSERT(bcmp(zio->io_bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 || (zio->io_bp == zio_unique_parent(zio)->io_bp)); if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(zio->io_bp) && zio->io_bp_override == NULL && !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { ASSERT(!BP_SHOULD_BYTESWAP(zio->io_bp)); ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp)); ASSERT(BP_COUNT_GANG(zio->io_bp) == 0 || (BP_COUNT_GANG(zio->io_bp) == BP_GET_NDVAS(zio->io_bp))); } if (zio->io_flags & ZIO_FLAG_NOPWRITE) VERIFY(BP_EQUAL(zio->io_bp, &zio->io_bp_orig)); } /* * If there were child vdev/gang/ddt errors, they apply to us now. */ zio_inherit_child_errors(zio, ZIO_CHILD_VDEV); zio_inherit_child_errors(zio, ZIO_CHILD_GANG); zio_inherit_child_errors(zio, ZIO_CHILD_DDT); /* * If the I/O on the transformed data was successful, generate any * checksum reports now while we still have the transformed data. */ if (zio->io_error == 0) { while (zio->io_cksum_report != NULL) { zio_cksum_report_t *zcr = zio->io_cksum_report; uint64_t align = zcr->zcr_align; uint64_t asize = P2ROUNDUP(psize, align); char *abuf = NULL; abd_t *adata = zio->io_abd; if (asize != psize) { adata = abd_alloc_linear(asize, B_TRUE); abd_copy(adata, zio->io_abd, psize); abd_zero_off(adata, psize, asize - psize); } if (adata != NULL) abuf = abd_borrow_buf_copy(adata, asize); zio->io_cksum_report = zcr->zcr_next; zcr->zcr_next = NULL; zcr->zcr_finish(zcr, abuf); zfs_ereport_free_checksum(zcr); if (adata != NULL) abd_return_buf(adata, abuf, asize); if (asize != psize) abd_free(adata); } } zio_pop_transforms(zio); /* note: may set zio->io_error */ vdev_stat_update(zio, psize); /* * If this I/O is attached to a particular vdev and is slow, exceeding * 30 seconds to complete, post an error describing the I/O delay. * We ignore these errors if the device is currently unavailable. */ if (zio->io_delay >= MSEC2NSEC(zio_delay_max)) { if (zio->io_vd != NULL && !vdev_is_dead(zio->io_vd)) zfs_ereport_post(FM_EREPORT_ZFS_DELAY, zio->io_spa, zio->io_vd, zio, 0, 0); } if (zio->io_error) { /* * If this I/O is attached to a particular vdev, * generate an error message describing the I/O failure * at the block level. We ignore these errors if the * device is currently unavailable. */ if (zio->io_error != ECKSUM && zio->io_vd != NULL && !vdev_is_dead(zio->io_vd)) zfs_ereport_post(FM_EREPORT_ZFS_IO, zio->io_spa, zio->io_vd, zio, 0, 0); if ((zio->io_error == EIO || !(zio->io_flags & (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) && zio == zio->io_logical) { /* * For logical I/O requests, tell the SPA to log the * error and generate a logical data ereport.
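*/

/*
 * The delay check above compares io_delay, which is in nanoseconds, against
 * zio_delay_max, which is in milliseconds (30 seconds by default, per the
 * comment above and the module parameter at the end of this file). A small
 * sketch of just the unit conversion; the MSEC2NSEC() definition here is an
 * assumption intended to match the kernel macro.
 */
#include <stdio.h>

#define MSEC2NSEC(m)	((unsigned long long)(m) * 1000000ULL)

int
main(void)
{
	unsigned long long zio_delay_max = 30000;		/* ms */
	unsigned long long io_delay = 31ULL * 1000000000ULL;	/* 31 s in ns */

	if (io_delay >= MSEC2NSEC(zio_delay_max))
		printf("would post FM_EREPORT_ZFS_DELAY\n");
	return (0);
}

/*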
*/ spa_log_error(zio->io_spa, zio); zfs_ereport_post(FM_EREPORT_ZFS_DATA, zio->io_spa, NULL, zio, 0, 0); } } if (zio->io_error && zio == zio->io_logical) { /* * Determine whether zio should be reexecuted. This will * propagate all the way to the root via zio_notify_parent(). */ ASSERT(zio->io_vd == NULL && zio->io_bp != NULL); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); if (IO_IS_ALLOCATING(zio) && !(zio->io_flags & ZIO_FLAG_CANFAIL)) { if (zio->io_error != ENOSPC) zio->io_reexecute |= ZIO_REEXECUTE_NOW; else zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; } if ((zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_FREE) && !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_error == ENXIO && spa_load_state(zio->io_spa) == SPA_LOAD_NONE && spa_get_failmode(zio->io_spa) != ZIO_FAILURE_MODE_CONTINUE) zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute) zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; /* * Here is a possibly good place to attempt to do * either combinatorial reconstruction or error correction * based on checksums. It also might be a good place * to send out preliminary ereports before we suspend * processing. */ } /* * If there were logical child errors, they apply to us now. * We defer this until now to avoid conflating logical child * errors with errors that happened to the zio itself when * updating vdev stats and reporting FMA events above. */ zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL); if ((zio->io_error || zio->io_reexecute) && IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio && !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE))) zio_dva_unallocate(zio, zio->io_gang_tree, zio->io_bp); zio_gang_tree_free(&zio->io_gang_tree); /* * Godfather I/Os should never suspend. */ if ((zio->io_flags & ZIO_FLAG_GODFATHER) && (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) zio->io_reexecute = 0; if (zio->io_reexecute) { /* * This is a logical I/O that wants to reexecute. * * Reexecute is top-down. When an i/o fails, if it's not * the root, it simply notifies its parent and sticks around. * The parent, seeing that it still has children in zio_done(), * does the same. This percolates all the way up to the root. * The root i/o will reexecute or suspend the entire tree. * * This approach ensures that zio_reexecute() honors * all the original i/o dependency relationships, e.g. * parents not executing until children are ready. */ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); zio->io_gang_leader = NULL; mutex_enter(&zio->io_lock); zio->io_state[ZIO_WAIT_DONE] = 1; mutex_exit(&zio->io_lock); /* * "The Godfather" I/O monitors its children but is * not a true parent to them. It will track them through * the pipeline but severs its ties whenever they get into * trouble (e.g. suspended). This allows "The Godfather" * I/O to return status without blocking. */ zl = NULL; for (pio = zio_walk_parents(zio, &zl); pio != NULL; pio = pio_next) { zio_link_t *remove_zl = zl; pio_next = zio_walk_parents(zio, &zl); if ((pio->io_flags & ZIO_FLAG_GODFATHER) && (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) { zio_remove_child(pio, zio, remove_zl); zio_notify_parent(pio, zio, ZIO_WAIT_DONE); } } if ((pio = zio_unique_parent(zio)) != NULL) { /* * We're not a root i/o, so there's nothing to do * but notify our parent. Don't propagate errors * upward since we haven't permanently failed yet. 
*/ ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE; zio_notify_parent(pio, zio, ZIO_WAIT_DONE); } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) { /* * We'd fail again if we reexecuted now, so suspend * until conditions improve (e.g. device comes online). */ zio_suspend(zio->io_spa, zio); } else { /* * Reexecution is potentially a huge amount of work. * Hand it off to the otherwise-unused claim taskq. */ ASSERT(taskq_empty_ent(&zio->io_tqent)); spa_taskq_dispatch_ent(zio->io_spa, ZIO_TYPE_CLAIM, ZIO_TASKQ_ISSUE, (task_func_t *)zio_reexecute, zio, 0, &zio->io_tqent); } return (ZIO_PIPELINE_STOP); } ASSERT(zio->io_child_count == 0); ASSERT(zio->io_reexecute == 0); ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL)); /* * Report any checksum errors, since the I/O is complete. */ while (zio->io_cksum_report != NULL) { zio_cksum_report_t *zcr = zio->io_cksum_report; zio->io_cksum_report = zcr->zcr_next; zcr->zcr_next = NULL; zcr->zcr_finish(zcr, NULL); zfs_ereport_free_checksum(zcr); } if (zio->io_flags & ZIO_FLAG_FASTWRITE && zio->io_bp && !BP_IS_HOLE(zio->io_bp) && !BP_IS_EMBEDDED(zio->io_bp) && !(zio->io_flags & ZIO_FLAG_NOPWRITE)) { metaslab_fastwrite_unmark(zio->io_spa, zio->io_bp); } /* * It is the responsibility of the done callback to ensure that this * particular zio is no longer discoverable for adoption, and as * such, cannot acquire any new parents. */ if (zio->io_done) zio->io_done(zio); mutex_enter(&zio->io_lock); zio->io_state[ZIO_WAIT_DONE] = 1; mutex_exit(&zio->io_lock); zl = NULL; for (pio = zio_walk_parents(zio, &zl); pio != NULL; pio = pio_next) { zio_link_t *remove_zl = zl; pio_next = zio_walk_parents(zio, &zl); zio_remove_child(pio, zio, remove_zl); zio_notify_parent(pio, zio, ZIO_WAIT_DONE); } if (zio->io_waiter != NULL) { mutex_enter(&zio->io_lock); zio->io_executor = NULL; cv_broadcast(&zio->io_cv); mutex_exit(&zio->io_lock); } else { zio_destroy(zio); } return (ZIO_PIPELINE_STOP); } /* * ========================================================================== * I/O pipeline definition * ========================================================================== */ static zio_pipe_stage_t *zio_pipeline[] = { NULL, zio_read_bp_init, zio_write_bp_init, zio_free_bp_init, zio_issue_async, zio_write_compress, zio_checksum_generate, zio_nop_write, zio_ddt_read_start, zio_ddt_read_done, zio_ddt_write, zio_ddt_free, zio_gang_assemble, zio_gang_issue, zio_dva_throttle, zio_dva_allocate, zio_dva_free, zio_dva_claim, zio_ready, zio_vdev_io_start, zio_vdev_io_done, zio_vdev_io_assess, zio_checksum_verify, zio_done }; /* * Compare two zbookmark_phys_t's to see which we would reach first in a * pre-order traversal of the object tree. * * This is simple in every case aside from the meta-dnode object. For all other * objects, we traverse them in order (object 1 before object 2, and so on). * However, all of these objects are traversed while traversing object 0, since * the data it points to is the list of objects. Thus, we need to convert to a * canonical representation so we can compare meta-dnode bookmarks to * non-meta-dnode bookmarks. * * We do this by calculating "equivalents" for each field of the zbookmark. * zbookmarks outside of the meta-dnode use their own object and level, and * calculate the level 0 equivalent (the first L0 blkid that is contained in the * blocks this bookmark refers to) by multiplying their blkid by their span * (the number of L0 blocks contained within one block at their level). 
* zbookmarks inside the meta-dnode calculate their object equivalent * (which is L0equiv * dnodes per data block), use 0 for their L0equiv, and use * level + 1<<31 (any value larger than a level could ever be) for their level. * This causes them to always compare before a bookmark in their object * equivalent, compare appropriately to bookmarks in other objects, and to * compare appropriately to other bookmarks in the meta-dnode. */ int zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, uint8_t ibs2, const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2) { /* * These variables represent the "equivalent" values for the zbookmark, * after converting zbookmarks inside the meta dnode to their * normal-object equivalents. */ uint64_t zb1obj, zb2obj; uint64_t zb1L0, zb2L0; uint64_t zb1level, zb2level; if (zb1->zb_object == zb2->zb_object && zb1->zb_level == zb2->zb_level && zb1->zb_blkid == zb2->zb_blkid) return (0); /* * BP_SPANB calculates the span in blocks. */ zb1L0 = (zb1->zb_blkid) * BP_SPANB(ibs1, zb1->zb_level); zb2L0 = (zb2->zb_blkid) * BP_SPANB(ibs2, zb2->zb_level); if (zb1->zb_object == DMU_META_DNODE_OBJECT) { zb1obj = zb1L0 * (dbss1 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT)); zb1L0 = 0; zb1level = zb1->zb_level + COMPARE_META_LEVEL; } else { zb1obj = zb1->zb_object; zb1level = zb1->zb_level; } if (zb2->zb_object == DMU_META_DNODE_OBJECT) { zb2obj = zb2L0 * (dbss2 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT)); zb2L0 = 0; zb2level = zb2->zb_level + COMPARE_META_LEVEL; } else { zb2obj = zb2->zb_object; zb2level = zb2->zb_level; } /* Now that we have a canonical representation, do the comparison. */ if (zb1obj != zb2obj) return (zb1obj < zb2obj ? -1 : 1); else if (zb1L0 != zb2L0) return (zb1L0 < zb2L0 ? -1 : 1); else if (zb1level != zb2level) return (zb1level > zb2level ? -1 : 1); /* * This can (theoretically) happen if the bookmarks have the same object * and level, but different blkids, if the block sizes are not the same. * There is presently no way to change the indirect block sizes */ return (0); } /* * This function checks the following: given that last_block is the place that * our traversal stopped last time, does that guarantee that we've visited * every node under subtree_root? Therefore, we can't just use the raw output * of zbookmark_compare. We have to pass in a modified version of * subtree_root; by incrementing the block id, and then checking whether * last_block is before or equal to that, we can tell whether or not having * visited last_block implies that all of subtree_root's children have been * visited. */ boolean_t zbookmark_subtree_completed(const dnode_phys_t *dnp, const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block) { zbookmark_phys_t mod_zb = *subtree_root; mod_zb.zb_blkid++; ASSERT(last_block->zb_level == 0); /* The objset_phys_t isn't before anything. */ if (dnp == NULL) return (B_FALSE); /* * We pass in 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT) for the * data block size in sectors, because that variable is only used if * the bookmark refers to a block in the meta-dnode. Since we don't * know without examining it what object it refers to, and there's no * harm in passing in this value in other cases, we always pass it in. * * We pass in 0 for the indirect block size shift because zb2 must be * level 0. The indirect block size is only used to calculate the span * of the bookmark, but since the bookmark must be level 0, the span is * always 1, so the math works out. 
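*/

/*
 * A worked instance of the canonicalization described above, using the same
 * formulas as zbookmark_compare(). The constants (9-bit minimum block,
 * 512-byte dnodes, 128-byte block pointers, and the 1<<31 meta-level bias)
 * are stated here as assumptions for this standalone sketch.
 */
#include <stdio.h>
#include <stdint.h>

#define SPA_MINBLOCKSHIFT	9
#define DNODE_SHIFT		9
#define SPA_BLKPTRSHIFT		7
#define COMPARE_META_LEVEL	(1ULL << 31)

/* L0 blocks spanned by one block at 'level' (the role of BP_SPANB). */
static uint64_t
spanb(uint8_t indblkshift, int64_t level)
{
	return (1ULL << (level * (indblkshift - SPA_BLKPTRSHIFT)));
}

int
main(void)
{
	/* A level-1 meta-dnode bookmark; 16K data blocks => dbss = 32. */
	uint64_t blkid = 3, dbss = 32;
	uint8_t ibs = 17;	/* 128K indirect blocks */
	int64_t level = 1;

	uint64_t l0 = blkid * spanb(ibs, level);	/* first L0 blkid */
	uint64_t obj = l0 * (dbss << (SPA_MINBLOCKSHIFT - DNODE_SHIFT));

	printf("canonical: obj=%llu L0=0 level=%llu\n",
	    (unsigned long long)obj,
	    (unsigned long long)(level + COMPARE_META_LEVEL));
	return (0);
}

/*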
* * If you make changes to how the zbookmark_compare code works, be sure * that this code still works afterwards. */ return (zbookmark_compare(dnp->dn_datablkszsec, dnp->dn_indblkshift, 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT), 0, &mod_zb, last_block) <= 0); } #if defined(_KERNEL) && defined(HAVE_SPL) EXPORT_SYMBOL(zio_type_name); EXPORT_SYMBOL(zio_buf_alloc); EXPORT_SYMBOL(zio_data_buf_alloc); EXPORT_SYMBOL(zio_buf_free); EXPORT_SYMBOL(zio_data_buf_free); module_param(zio_delay_max, int, 0644); MODULE_PARM_DESC(zio_delay_max, "Max zio millisec delay before posting event"); module_param(zio_requeue_io_start_cut_in_line, int, 0644); MODULE_PARM_DESC(zio_requeue_io_start_cut_in_line, "Prioritize requeued I/O"); module_param(zfs_sync_pass_deferred_free, int, 0644); MODULE_PARM_DESC(zfs_sync_pass_deferred_free, "Defer frees starting in this pass"); module_param(zfs_sync_pass_dont_compress, int, 0644); MODULE_PARM_DESC(zfs_sync_pass_dont_compress, "Don't compress starting in this pass"); module_param(zfs_sync_pass_rewrite, int, 0644); MODULE_PARM_DESC(zfs_sync_pass_rewrite, "Rewrite new bps starting in this pass"); module_param(zio_dva_throttle_enabled, int, 0644); MODULE_PARM_DESC(zio_dva_throttle_enabled, "Throttle block allocations in the ZIO pipeline"); #endif diff --git a/module/zfs/zio_checksum.c b/module/zfs/zio_checksum.c index 37116f049748..53658daca8e9 100644 --- a/module/zfs/zio_checksum.c +++ b/module/zfs/zio_checksum.c @@ -1,472 +1,472 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. * Copyright (c) 2013, 2016 by Delphix. All rights reserved. */ #include #include #include #include #include #include #include #include /* * Checksum vectors. * * In the SPA, everything is checksummed. We support checksum vectors * for three distinct reasons: * * 1. Different kinds of data need different levels of protection. * For SPA metadata, we always want a very strong checksum. * For user data, we let users make the trade-off between speed * and checksum strength. * * 2. Cryptographic hash and MAC algorithms are an area of active research. * It is likely that in the future, hash functions will be at least as strong * as current best-of-breed, and may be substantially faster as well. * We want the ability to take advantage of these new hashes as soon as * they become available. * * 3. If someone develops hardware that can compute a strong hash quickly, * we want the ability to take advantage of that hardware. * * Of course, we don't want a checksum upgrade to invalidate existing * data, so we store the checksum *function* in eight bits of the bp. * This gives us room for up to 256 different checksum functions.
* * When writing a block, we always checksum it with the latest-and-greatest * checksum function of the appropriate strength. When reading a block, * we compare the expected checksum against the actual checksum, which we * compute via the checksum function specified by BP_GET_CHECKSUM(bp). * * SALTED CHECKSUMS * * To enable the use of less secure hash algorithms with dedup, we * introduce the notion of salted checksums (MACs, really). A salted * checksum is fed both a random 256-bit value (the salt) and the data * to be checksummed. This salt is kept secret (stored on the pool, but * never shown to the user). Thus even if an attacker knew of collision * weaknesses in the hash algorithm, they won't be able to mount a known * plaintext attack on the DDT, since the actual hash value cannot be * known ahead of time. How the salt is used is algorithm-specific * (some might simply prefix it to the data block, others might need to * utilize a full-blown HMAC). On disk the salt is stored in a ZAP * object in the MOS (DMU_POOL_CHECKSUM_SALT). * * CONTEXT TEMPLATES * * Some hashing algorithms need to perform a substantial amount of * initialization work (e.g. salted checksums above may need to pre-hash * the salt) before being able to process data. Performing this * redundant work for each block would be wasteful, so we instead allow * a checksum algorithm to do the work once (the first time it's used) * and then keep this pre-initialized context as a template inside the * spa_t (spa_cksum_tmpls). If the zio_checksum_info_t contains * non-NULL ci_tmpl_init and ci_tmpl_free callbacks, they are used to * construct and destruct the pre-initialized checksum context. The * pre-initialized context is then reused during each checksum * invocation and passed to the checksum function. 
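*/

/*
 * A userspace analogue of the context-template caching just described:
 * check the slot without the lock, then re-check under the lock so the
 * template is constructed exactly once. This mirrors the shape of
 * zio_checksum_template_init() further down; the pthread mutex and the
 * malloc() stand-in for ci_tmpl_init() are sketch-only assumptions.
 */
#include <pthread.h>
#include <stdlib.h>

static void *cksum_tmpl;	/* one slot; the spa keeps one per algorithm */
static pthread_mutex_t tmpl_lock = PTHREAD_MUTEX_INITIALIZER;

static void *
get_template(void)
{
	if (cksum_tmpl != NULL)		/* fast path: already built */
		return (cksum_tmpl);
	pthread_mutex_lock(&tmpl_lock);
	if (cksum_tmpl == NULL)		/* re-check: another thread may have won */
		cksum_tmpl = malloc(64);
	pthread_mutex_unlock(&tmpl_lock);
	return (cksum_tmpl);
}

int
main(void)
{
	return (get_template() != NULL ? 0 : 1);
}

/*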
*/ /*ARGSUSED*/ static void abd_checksum_off(abd_t *abd, uint64_t size, - const void *ctx_template, zio_cksum_t *zcp) + const void *ctx_template, zio_cksum_t *zcp) { ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0); } /*ARGSUSED*/ void abd_fletcher_2_native(abd_t *abd, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { fletcher_init(zcp); (void) abd_iterate_func(abd, 0, size, fletcher_2_incremental_native, zcp); } /*ARGSUSED*/ void abd_fletcher_2_byteswap(abd_t *abd, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { fletcher_init(zcp); (void) abd_iterate_func(abd, 0, size, fletcher_2_incremental_byteswap, zcp); } /*ARGSUSED*/ void abd_fletcher_4_native(abd_t *abd, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { fletcher_init(zcp); (void) abd_iterate_func(abd, 0, size, fletcher_4_incremental_native, zcp); } /*ARGSUSED*/ void abd_fletcher_4_byteswap(abd_t *abd, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { fletcher_init(zcp); (void) abd_iterate_func(abd, 0, size, fletcher_4_incremental_byteswap, zcp); } zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = { {{NULL, NULL}, NULL, NULL, 0, "inherit"}, {{NULL, NULL}, NULL, NULL, 0, "on"}, {{abd_checksum_off, abd_checksum_off}, NULL, NULL, 0, "off"}, {{abd_checksum_SHA256, abd_checksum_SHA256}, NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "label"}, {{abd_checksum_SHA256, abd_checksum_SHA256}, NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "gang_header"}, {{abd_fletcher_2_native, abd_fletcher_2_byteswap}, NULL, NULL, ZCHECKSUM_FLAG_EMBEDDED, "zilog"}, {{abd_fletcher_2_native, abd_fletcher_2_byteswap}, NULL, NULL, 0, "fletcher2"}, {{abd_fletcher_4_native, abd_fletcher_4_byteswap}, NULL, NULL, ZCHECKSUM_FLAG_METADATA, "fletcher4"}, {{abd_checksum_SHA256, abd_checksum_SHA256}, NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | ZCHECKSUM_FLAG_NOPWRITE, "sha256"}, {{abd_fletcher_4_native, abd_fletcher_4_byteswap}, NULL, NULL, ZCHECKSUM_FLAG_EMBEDDED, "zilog2"}, {{abd_checksum_off, abd_checksum_off}, NULL, NULL, 0, "noparity"}, {{abd_checksum_SHA512_native, abd_checksum_SHA512_byteswap}, NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | ZCHECKSUM_FLAG_NOPWRITE, "sha512"}, {{abd_checksum_skein_native, abd_checksum_skein_byteswap}, abd_checksum_skein_tmpl_init, abd_checksum_skein_tmpl_free, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"}, {{abd_checksum_edonr_native, abd_checksum_edonr_byteswap}, abd_checksum_edonr_tmpl_init, abd_checksum_edonr_tmpl_free, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "edonr"}, }; /* * The flag corresponding to the "verify" in dedup=[checksum,]verify * must be cleared first, so callers should use ZIO_CHECKSUM_MASK. 
*/ spa_feature_t zio_checksum_to_feature(enum zio_checksum cksum) { VERIFY((cksum & ~ZIO_CHECKSUM_MASK) == 0); switch (cksum) { case ZIO_CHECKSUM_SHA512: return (SPA_FEATURE_SHA512); case ZIO_CHECKSUM_SKEIN: return (SPA_FEATURE_SKEIN); case ZIO_CHECKSUM_EDONR: return (SPA_FEATURE_EDONR); default: return (SPA_FEATURE_NONE); } } enum zio_checksum zio_checksum_select(enum zio_checksum child, enum zio_checksum parent) { ASSERT(child < ZIO_CHECKSUM_FUNCTIONS); ASSERT(parent < ZIO_CHECKSUM_FUNCTIONS); ASSERT(parent != ZIO_CHECKSUM_INHERIT && parent != ZIO_CHECKSUM_ON); if (child == ZIO_CHECKSUM_INHERIT) return (parent); if (child == ZIO_CHECKSUM_ON) return (ZIO_CHECKSUM_ON_VALUE); return (child); } enum zio_checksum zio_checksum_dedup_select(spa_t *spa, enum zio_checksum child, enum zio_checksum parent) { ASSERT((child & ZIO_CHECKSUM_MASK) < ZIO_CHECKSUM_FUNCTIONS); ASSERT((parent & ZIO_CHECKSUM_MASK) < ZIO_CHECKSUM_FUNCTIONS); ASSERT(parent != ZIO_CHECKSUM_INHERIT && parent != ZIO_CHECKSUM_ON); if (child == ZIO_CHECKSUM_INHERIT) return (parent); if (child == ZIO_CHECKSUM_ON) return (spa_dedup_checksum(spa)); if (child == (ZIO_CHECKSUM_ON | ZIO_CHECKSUM_VERIFY)) return (spa_dedup_checksum(spa) | ZIO_CHECKSUM_VERIFY); ASSERT((zio_checksum_table[child & ZIO_CHECKSUM_MASK].ci_flags & ZCHECKSUM_FLAG_DEDUP) || (child & ZIO_CHECKSUM_VERIFY) || child == ZIO_CHECKSUM_OFF); return (child); } /* * Set the external verifier for a gang block based on <vdev, offset, txg>, * a tuple which is guaranteed to be unique for the life of the pool. */ static void zio_checksum_gang_verifier(zio_cksum_t *zcp, blkptr_t *bp) { const dva_t *dva = BP_IDENTITY(bp); uint64_t txg = BP_PHYSICAL_BIRTH(bp); ASSERT(BP_IS_GANG(bp)); ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0); } /* * Set the external verifier for a label block based on its offset. * The vdev is implicit, and the txg is unknowable at pool open time -- * hence the logic in vdev_uberblock_load() to find the most recent copy. */ static void zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset) { ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0); } /* * Calls the template init function of a checksum which supports context * templates and installs the template into the spa_t. */ static void zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa) { zio_checksum_info_t *ci = &zio_checksum_table[checksum]; if (ci->ci_tmpl_init == NULL) return; if (spa->spa_cksum_tmpls[checksum] != NULL) return; VERIFY(ci->ci_tmpl_free != NULL); mutex_enter(&spa->spa_cksum_tmpls_lock); if (spa->spa_cksum_tmpls[checksum] == NULL) { spa->spa_cksum_tmpls[checksum] = ci->ci_tmpl_init(&spa->spa_cksum_salt); VERIFY(spa->spa_cksum_tmpls[checksum] != NULL); } mutex_exit(&spa->spa_cksum_tmpls_lock); } /* * Generate the checksum.
*/ void zio_checksum_compute(zio_t *zio, enum zio_checksum checksum, abd_t *abd, uint64_t size) { blkptr_t *bp = zio->io_bp; uint64_t offset = zio->io_offset; zio_checksum_info_t *ci = &zio_checksum_table[checksum]; zio_cksum_t cksum; spa_t *spa = zio->io_spa; ASSERT((uint_t)checksum < ZIO_CHECKSUM_FUNCTIONS); ASSERT(ci->ci_func[0] != NULL); zio_checksum_template_init(checksum, spa); if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) { zio_eck_t *eck; void *data = abd_to_buf(abd); if (checksum == ZIO_CHECKSUM_ZILOG2) { zil_chain_t *zilc = data; size = P2ROUNDUP_TYPED(zilc->zc_nused, ZIL_MIN_BLKSZ, uint64_t); eck = &zilc->zc_eck; } else { eck = (zio_eck_t *)((char *)data + size) - 1; } if (checksum == ZIO_CHECKSUM_GANG_HEADER) zio_checksum_gang_verifier(&eck->zec_cksum, bp); else if (checksum == ZIO_CHECKSUM_LABEL) zio_checksum_label_verifier(&eck->zec_cksum, offset); else bp->blk_cksum = eck->zec_cksum; eck->zec_magic = ZEC_MAGIC; ci->ci_func[0](abd, size, spa->spa_cksum_tmpls[checksum], &cksum); eck->zec_cksum = cksum; } else { ci->ci_func[0](abd, size, spa->spa_cksum_tmpls[checksum], &bp->blk_cksum); } } int zio_checksum_error_impl(spa_t *spa, blkptr_t *bp, enum zio_checksum checksum, abd_t *abd, uint64_t size, uint64_t offset, zio_bad_cksum_t *info) { zio_checksum_info_t *ci = &zio_checksum_table[checksum]; int byteswap; zio_cksum_t actual_cksum, expected_cksum; if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL) return (SET_ERROR(EINVAL)); zio_checksum_template_init(checksum, spa); if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) { zio_eck_t *eck; zio_cksum_t verifier; size_t eck_offset; uint64_t data_size = size; void *data = abd_borrow_buf_copy(abd, data_size); if (checksum == ZIO_CHECKSUM_ZILOG2) { zil_chain_t *zilc = data; uint64_t nused; eck = &zilc->zc_eck; if (eck->zec_magic == ZEC_MAGIC) { nused = zilc->zc_nused; } else if (eck->zec_magic == BSWAP_64(ZEC_MAGIC)) { nused = BSWAP_64(zilc->zc_nused); } else { abd_return_buf(abd, data, data_size); return (SET_ERROR(ECKSUM)); } if (nused > data_size) { abd_return_buf(abd, data, data_size); return (SET_ERROR(ECKSUM)); } size = P2ROUNDUP_TYPED(nused, ZIL_MIN_BLKSZ, uint64_t); } else { eck = (zio_eck_t *)((char *)data + data_size) - 1; } if (checksum == ZIO_CHECKSUM_GANG_HEADER) zio_checksum_gang_verifier(&verifier, bp); else if (checksum == ZIO_CHECKSUM_LABEL) zio_checksum_label_verifier(&verifier, offset); else verifier = bp->blk_cksum; byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC)); if (byteswap) byteswap_uint64_array(&verifier, sizeof (zio_cksum_t)); eck_offset = (size_t)(&eck->zec_cksum) - (size_t)data; expected_cksum = eck->zec_cksum; eck->zec_cksum = verifier; abd_return_buf_copy(abd, data, data_size); ci->ci_func[byteswap](abd, size, spa->spa_cksum_tmpls[checksum], &actual_cksum); abd_copy_from_buf_off(abd, &expected_cksum, eck_offset, sizeof (zio_cksum_t)); if (byteswap) { byteswap_uint64_array(&expected_cksum, sizeof (zio_cksum_t)); } } else { byteswap = BP_SHOULD_BYTESWAP(bp); expected_cksum = bp->blk_cksum; ci->ci_func[byteswap](abd, size, spa->spa_cksum_tmpls[checksum], &actual_cksum); } if (info != NULL) { info->zbc_expected = expected_cksum; info->zbc_actual = actual_cksum; info->zbc_checksum_name = ci->ci_name; info->zbc_byteswapped = byteswap; info->zbc_injected = 0; info->zbc_has_cksum = 1; } if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) return (SET_ERROR(ECKSUM)); return (0); } int zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info) { blkptr_t *bp = zio->io_bp; uint_t checksum = (bp == NULL ? 
zio->io_prop.zp_checksum : (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp))); int error; uint64_t size = (bp == NULL ? zio->io_size : (BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp))); uint64_t offset = zio->io_offset; abd_t *data = zio->io_abd; spa_t *spa = zio->io_spa; error = zio_checksum_error_impl(spa, bp, checksum, data, size, offset, info); if (error != 0 && zio_injection_enabled && !zio->io_error && (error = zio_handle_fault_injection(zio, ECKSUM)) != 0) { info->zbc_injected = 1; return (error); } return (error); } /* * Called by a spa_t that's about to be deallocated. This steps through * all of the checksum context templates and deallocates any that were * initialized using the algorithm-specific template init function. */ void zio_checksum_templates_free(spa_t *spa) { enum zio_checksum checksum; for (checksum = 0; checksum < ZIO_CHECKSUM_FUNCTIONS; checksum++) { if (spa->spa_cksum_tmpls[checksum] != NULL) { zio_checksum_info_t *ci = &zio_checksum_table[checksum]; VERIFY(ci->ci_tmpl_free != NULL); ci->ci_tmpl_free(spa->spa_cksum_tmpls[checksum]); spa->spa_cksum_tmpls[checksum] = NULL; } } } diff --git a/module/zfs/zpl_file.c b/module/zfs/zpl_file.c index 332fb992e7a2..356fa78f83af 100644 --- a/module/zfs/zpl_file.c +++ b/module/zfs/zpl_file.c @@ -1,876 +1,876 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2011, Lawrence Livermore National Security, LLC. * Copyright (c) 2015 by Chunwei Chen. All rights reserved. 
*/ #ifdef CONFIG_COMPAT #include #endif #include #include #include #include #include static int zpl_open(struct inode *ip, struct file *filp) { cred_t *cr = CRED(); int error; fstrans_cookie_t cookie; error = generic_file_open(ip, filp); if (error) return (error); crhold(cr); cookie = spl_fstrans_mark(); error = -zfs_open(ip, filp->f_mode, filp->f_flags, cr); spl_fstrans_unmark(cookie); crfree(cr); ASSERT3S(error, <=, 0); return (error); } static int zpl_release(struct inode *ip, struct file *filp) { cred_t *cr = CRED(); int error; fstrans_cookie_t cookie; cookie = spl_fstrans_mark(); if (ITOZ(ip)->z_atime_dirty) zfs_mark_inode_dirty(ip); crhold(cr); error = -zfs_close(ip, filp->f_flags, cr); spl_fstrans_unmark(cookie); crfree(cr); ASSERT3S(error, <=, 0); return (error); } static int zpl_iterate(struct file *filp, struct dir_context *ctx) { cred_t *cr = CRED(); int error; fstrans_cookie_t cookie; crhold(cr); cookie = spl_fstrans_mark(); error = -zfs_readdir(file_inode(filp), ctx, cr); spl_fstrans_unmark(cookie); crfree(cr); ASSERT3S(error, <=, 0); return (error); } #if !defined(HAVE_VFS_ITERATE) && !defined(HAVE_VFS_ITERATE_SHARED) static int zpl_readdir(struct file *filp, void *dirent, filldir_t filldir) { struct dir_context ctx = DIR_CONTEXT_INIT(dirent, filldir, filp->f_pos); int error; error = zpl_iterate(filp, &ctx); filp->f_pos = ctx.pos; return (error); } #endif /* HAVE_VFS_ITERATE */ #if defined(HAVE_FSYNC_WITH_DENTRY) /* * Linux 2.6.x - 2.6.34 API, * Through 2.6.34 the nfsd kernel server would pass a NULL 'file struct *' * to the fops->fsync() hook. For this reason, we must be careful not to * use filp unconditionally. */ static int zpl_fsync(struct file *filp, struct dentry *dentry, int datasync) { cred_t *cr = CRED(); int error; fstrans_cookie_t cookie; crhold(cr); cookie = spl_fstrans_mark(); error = -zfs_fsync(dentry->d_inode, datasync, cr); spl_fstrans_unmark(cookie); crfree(cr); ASSERT3S(error, <=, 0); return (error); } #ifdef HAVE_FILE_AIO_FSYNC static int zpl_aio_fsync(struct kiocb *kiocb, int datasync) { struct file *filp = kiocb->ki_filp; return (zpl_fsync(filp, file_dentry(filp), datasync)); } #endif #elif defined(HAVE_FSYNC_WITHOUT_DENTRY) /* * Linux 2.6.35 - 3.0 API, * As of 2.6.35 the dentry argument to the fops->fsync() hook was deemed * redundant. The dentry is still accessible via filp->f_path.dentry, * and we are guaranteed that filp will never be NULL. */ static int zpl_fsync(struct file *filp, int datasync) { struct inode *inode = filp->f_mapping->host; cred_t *cr = CRED(); int error; fstrans_cookie_t cookie; crhold(cr); cookie = spl_fstrans_mark(); error = -zfs_fsync(inode, datasync, cr); spl_fstrans_unmark(cookie); crfree(cr); ASSERT3S(error, <=, 0); return (error); } #ifdef HAVE_FILE_AIO_FSYNC static int zpl_aio_fsync(struct kiocb *kiocb, int datasync) { return (zpl_fsync(kiocb->ki_filp, datasync)); } #endif #elif defined(HAVE_FSYNC_RANGE) /* * Linux 3.1 - 3.x API, * As of 3.1 the responsibility to call filemap_write_and_wait_range() has * been pushed down into the .fsync() vfs hook. Additionally, the i_mutex * lock is no longer held by the caller; for zfs we don't require the lock * to be held, so we don't acquire it.
*/ static int zpl_fsync(struct file *filp, loff_t start, loff_t end, int datasync) { struct inode *inode = filp->f_mapping->host; cred_t *cr = CRED(); int error; fstrans_cookie_t cookie; error = filemap_write_and_wait_range(inode->i_mapping, start, end); if (error) return (error); crhold(cr); cookie = spl_fstrans_mark(); error = -zfs_fsync(inode, datasync, cr); spl_fstrans_unmark(cookie); crfree(cr); ASSERT3S(error, <=, 0); return (error); } #ifdef HAVE_FILE_AIO_FSYNC static int zpl_aio_fsync(struct kiocb *kiocb, int datasync) { return (zpl_fsync(kiocb->ki_filp, kiocb->ki_pos, -1, datasync)); } #endif #else #error "Unsupported fops->fsync() implementation" #endif static ssize_t zpl_read_common_iovec(struct inode *ip, const struct iovec *iovp, size_t count, unsigned long nr_segs, loff_t *ppos, uio_seg_t segment, int flags, cred_t *cr, size_t skip) { ssize_t read; uio_t uio; int error; fstrans_cookie_t cookie; uio.uio_iov = iovp; uio.uio_skip = skip; uio.uio_resid = count; uio.uio_iovcnt = nr_segs; uio.uio_loffset = *ppos; uio.uio_limit = MAXOFFSET_T; uio.uio_segflg = segment; cookie = spl_fstrans_mark(); error = -zfs_read(ip, &uio, flags, cr); spl_fstrans_unmark(cookie); if (error < 0) return (error); read = count - uio.uio_resid; *ppos += read; task_io_account_read(read); return (read); } inline ssize_t zpl_read_common(struct inode *ip, const char *buf, size_t len, loff_t *ppos, uio_seg_t segment, int flags, cred_t *cr) { struct iovec iov; iov.iov_base = (void *)buf; iov.iov_len = len; return (zpl_read_common_iovec(ip, &iov, len, 1, ppos, segment, flags, cr, 0)); } static ssize_t zpl_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) { cred_t *cr = CRED(); ssize_t read; crhold(cr); read = zpl_read_common(filp->f_mapping->host, buf, len, ppos, UIO_USERSPACE, filp->f_flags, cr); crfree(cr); file_accessed(filp); return (read); } static ssize_t zpl_iter_read_common(struct kiocb *kiocb, const struct iovec *iovp, unsigned long nr_segs, size_t count, uio_seg_t seg, size_t skip) { cred_t *cr = CRED(); struct file *filp = kiocb->ki_filp; ssize_t read; crhold(cr); read = zpl_read_common_iovec(filp->f_mapping->host, iovp, count, nr_segs, &kiocb->ki_pos, seg, filp->f_flags, cr, skip); crfree(cr); file_accessed(filp); return (read); } #if defined(HAVE_VFS_RW_ITERATE) static ssize_t zpl_iter_read(struct kiocb *kiocb, struct iov_iter *to) { ssize_t ret; uio_seg_t seg = UIO_USERSPACE; if (to->type & ITER_KVEC) seg = UIO_SYSSPACE; if (to->type & ITER_BVEC) seg = UIO_BVEC; ret = zpl_iter_read_common(kiocb, to->iov, to->nr_segs, iov_iter_count(to), seg, to->iov_offset); if (ret > 0) iov_iter_advance(to, ret); return (ret); } #else static ssize_t zpl_aio_read(struct kiocb *kiocb, const struct iovec *iovp, unsigned long nr_segs, loff_t pos) { return (zpl_iter_read_common(kiocb, iovp, nr_segs, kiocb->ki_nbytes, UIO_USERSPACE, 0)); } #endif /* HAVE_VFS_RW_ITERATE */ static ssize_t zpl_write_common_iovec(struct inode *ip, const struct iovec *iovp, size_t count, unsigned long nr_segs, loff_t *ppos, uio_seg_t segment, int flags, cred_t *cr, size_t skip) { ssize_t wrote; uio_t uio; int error; fstrans_cookie_t cookie; if (flags & O_APPEND) *ppos = i_size_read(ip); uio.uio_iov = iovp; uio.uio_skip = skip; uio.uio_resid = count; uio.uio_iovcnt = nr_segs; uio.uio_loffset = *ppos; uio.uio_limit = MAXOFFSET_T; uio.uio_segflg = segment; cookie = spl_fstrans_mark(); error = -zfs_write(ip, &uio, flags, cr); spl_fstrans_unmark(cookie); if (error < 0) return (error); wrote = count - uio.uio_resid; *ppos += 
wrote; task_io_account_write(wrote); return (wrote); } inline ssize_t zpl_write_common(struct inode *ip, const char *buf, size_t len, loff_t *ppos, uio_seg_t segment, int flags, cred_t *cr) { struct iovec iov; iov.iov_base = (void *)buf; iov.iov_len = len; return (zpl_write_common_iovec(ip, &iov, len, 1, ppos, segment, flags, cr, 0)); } static ssize_t zpl_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos) { cred_t *cr = CRED(); ssize_t wrote; crhold(cr); wrote = zpl_write_common(filp->f_mapping->host, buf, len, ppos, UIO_USERSPACE, filp->f_flags, cr); crfree(cr); return (wrote); } static ssize_t zpl_iter_write_common(struct kiocb *kiocb, const struct iovec *iovp, unsigned long nr_segs, size_t count, uio_seg_t seg, size_t skip) { cred_t *cr = CRED(); struct file *filp = kiocb->ki_filp; ssize_t wrote; crhold(cr); wrote = zpl_write_common_iovec(filp->f_mapping->host, iovp, count, nr_segs, &kiocb->ki_pos, seg, filp->f_flags, cr, skip); crfree(cr); return (wrote); } #if defined(HAVE_VFS_RW_ITERATE) static ssize_t zpl_iter_write(struct kiocb *kiocb, struct iov_iter *from) { ssize_t ret; uio_seg_t seg = UIO_USERSPACE; if (from->type & ITER_KVEC) seg = UIO_SYSSPACE; if (from->type & ITER_BVEC) seg = UIO_BVEC; ret = zpl_iter_write_common(kiocb, from->iov, from->nr_segs, iov_iter_count(from), seg, from->iov_offset); if (ret > 0) iov_iter_advance(from, ret); return (ret); } #else static ssize_t zpl_aio_write(struct kiocb *kiocb, const struct iovec *iovp, unsigned long nr_segs, loff_t pos) { return (zpl_iter_write_common(kiocb, iovp, nr_segs, kiocb->ki_nbytes, UIO_USERSPACE, 0)); } #endif /* HAVE_VFS_RW_ITERATE */ static loff_t zpl_llseek(struct file *filp, loff_t offset, int whence) { #if defined(SEEK_HOLE) && defined(SEEK_DATA) fstrans_cookie_t cookie; if (whence == SEEK_DATA || whence == SEEK_HOLE) { struct inode *ip = filp->f_mapping->host; loff_t maxbytes = ip->i_sb->s_maxbytes; loff_t error; spl_inode_lock_shared(ip); cookie = spl_fstrans_mark(); error = -zfs_holey(ip, whence, &offset); spl_fstrans_unmark(cookie); if (error == 0) error = lseek_execute(filp, ip, offset, maxbytes); spl_inode_unlock_shared(ip); return (error); } #endif /* SEEK_HOLE && SEEK_DATA */ return (generic_file_llseek(filp, offset, whence)); } /* * It's worth taking a moment to describe how mmap is implemented * for zfs because it differs considerably from other Linux filesystems. * However, this issue is handled the same way under OpenSolaris. * * The issue is that by design zfs bypasses the Linux page cache and * leaves all caching up to the ARC. This has been shown to work * well for the common read(2)/write(2) case. However, mmap(2) * is a problem because it relies on being tightly integrated with the * page cache. To handle this we cache mmap'ed files twice, once in * the ARC and a second time in the page cache. The code is careful * to keep both copies synchronized. * * When a file with an mmap'ed region is written to using write(2) * both the data in the ARC and existing pages in the page cache * are updated. For a read(2) data will be read first from the page * cache then the ARC if needed. Neither a write(2) nor a read(2) will * ever result in new pages being added to the page cache. * * New pages are added to the page cache only via .readpage() which * is called when the vfs needs to read a page off disk to back the * virtual memory region. These pages may be modified without * notifying the ARC and will be written out periodically via * .writepage().
This will occur due to either a sync or the usual * page aging behavior. Note that because a read(2) of an mmap'ed file * will always check the page cache first, correct data will still be * returned even when the ARC is out of date. * * While this implementation ensures correct behavior it does have * some drawbacks. The most obvious is that it increases the required * memory footprint when accessing mmap'ed files. It also adds additional * complexity to the code keeping * both caches synchronized. * * Longer term it may be possible to cleanly resolve this wart by * mapping page cache pages directly onto the ARC buffers. The * Linux address space operations are flexible enough to allow * selection of which pages back a particular index. The trick * would be working out the details of which subsystem is in * charge, the ARC, the page cache, or both. It may also prove * helpful to move the ARC buffers to scatter-gather lists * rather than a vmalloc'ed region. */ static int zpl_mmap(struct file *filp, struct vm_area_struct *vma) { struct inode *ip = filp->f_mapping->host; znode_t *zp = ITOZ(ip); int error; fstrans_cookie_t cookie; cookie = spl_fstrans_mark(); error = -zfs_map(ip, vma->vm_pgoff, (caddr_t *)vma->vm_start, (size_t)(vma->vm_end - vma->vm_start), vma->vm_flags); spl_fstrans_unmark(cookie); if (error) return (error); error = generic_file_mmap(filp, vma); if (error) return (error); mutex_enter(&zp->z_lock); zp->z_is_mapped = 1; mutex_exit(&zp->z_lock); return (error); } /* * Populate a page with data for the Linux page cache. This function is * only used to support mmap(2). There will be an identical copy of the * data in the ARC which is kept up to date via .write() and .writepage(). * * Currently this function relies on zpl_read_common() and the O_DIRECT * flag to read in a page. This works but the more correct way is to * update zfs_fillpage() to be Linux friendly and use that interface. */ static int zpl_readpage(struct file *filp, struct page *pp) { struct inode *ip; struct page *pl[1]; int error = 0; fstrans_cookie_t cookie; ASSERT(PageLocked(pp)); ip = pp->mapping->host; pl[0] = pp; cookie = spl_fstrans_mark(); error = -zfs_getpage(ip, pl, 1); spl_fstrans_unmark(cookie); if (error) { SetPageError(pp); ClearPageUptodate(pp); } else { ClearPageError(pp); SetPageUptodate(pp); flush_dcache_page(pp); } unlock_page(pp); return (error); } /* * Populate a set of pages with data for the Linux page cache. This * function will only be called for read ahead and never for demand * paging. For simplicity, the code relies on read_cache_pages() to * correctly lock each page for IO and call zpl_readpage().
*/ static int zpl_readpages(struct file *filp, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) + struct list_head *pages, unsigned nr_pages) { return (read_cache_pages(mapping, pages, (filler_t *)zpl_readpage, filp)); } int zpl_putpage(struct page *pp, struct writeback_control *wbc, void *data) { struct address_space *mapping = data; fstrans_cookie_t cookie; ASSERT(PageLocked(pp)); ASSERT(!PageWriteback(pp)); cookie = spl_fstrans_mark(); (void) zfs_putpage(mapping->host, pp, wbc); spl_fstrans_unmark(cookie); return (0); } static int zpl_writepages(struct address_space *mapping, struct writeback_control *wbc) { znode_t *zp = ITOZ(mapping->host); zfs_sb_t *zsb = ITOZSB(mapping->host); enum writeback_sync_modes sync_mode; int result; ZFS_ENTER(zsb); if (zsb->z_os->os_sync == ZFS_SYNC_ALWAYS) wbc->sync_mode = WB_SYNC_ALL; ZFS_EXIT(zsb); sync_mode = wbc->sync_mode; /* * We don't want to run write_cache_pages() in SYNC mode here, because * that would make putpage() wait for a single page to be committed to * disk every single time, resulting in atrocious performance. Instead * we run it once in non-SYNC mode so that the ZIL gets all the data, * and then we commit it all in one go. */ wbc->sync_mode = WB_SYNC_NONE; result = write_cache_pages(mapping, wbc, zpl_putpage, mapping); if (sync_mode != wbc->sync_mode) { ZFS_ENTER(zsb); ZFS_VERIFY_ZP(zp); if (zsb->z_log != NULL) zil_commit(zsb->z_log, zp->z_id); ZFS_EXIT(zsb); /* * We need to call write_cache_pages() again (we can't just * return after the commit) because the previous call in * non-SYNC mode does not guarantee that we got all the dirty * pages (see the implementation of write_cache_pages() for * details). That being said, this is a no-op in most cases. */ wbc->sync_mode = sync_mode; result = write_cache_pages(mapping, wbc, zpl_putpage, mapping); } return (result); } /* * Write out dirty pages to the ARC; this function is only required to * support mmap(2). Mapped pages may be dirtied by memory operations * which never call .write(). These dirty pages are kept in sync with * the ARC buffers via this hook. */ static int zpl_writepage(struct page *pp, struct writeback_control *wbc) { if (ITOZSB(pp->mapping->host)->z_os->os_sync == ZFS_SYNC_ALWAYS) wbc->sync_mode = WB_SYNC_ALL; return (zpl_putpage(pp, wbc, pp->mapping)); } /* * The only flag combination which matches the behavior of zfs_space() * is FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE. The FALLOC_FL_PUNCH_HOLE * flag was introduced in the 2.6.38 kernel.
*/ #if defined(HAVE_FILE_FALLOCATE) || defined(HAVE_INODE_FALLOCATE) long zpl_fallocate_common(struct inode *ip, int mode, loff_t offset, loff_t len) { int error = -EOPNOTSUPP; #if defined(FALLOC_FL_PUNCH_HOLE) && defined(FALLOC_FL_KEEP_SIZE) cred_t *cr = CRED(); flock64_t bf; loff_t olen; fstrans_cookie_t cookie; if (mode != (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) return (error); if (offset < 0 || len <= 0) return (-EINVAL); spl_inode_lock(ip); olen = i_size_read(ip); if (offset > olen) { spl_inode_unlock(ip); return (0); } if (offset + len > olen) len = olen - offset; bf.l_type = F_WRLCK; bf.l_whence = 0; bf.l_start = offset; bf.l_len = len; bf.l_pid = 0; crhold(cr); cookie = spl_fstrans_mark(); error = -zfs_space(ip, F_FREESP, &bf, FWRITE, offset, cr); spl_fstrans_unmark(cookie); spl_inode_unlock(ip); crfree(cr); #endif /* defined(FALLOC_FL_PUNCH_HOLE) && defined(FALLOC_FL_KEEP_SIZE) */ ASSERT3S(error, <=, 0); return (error); } #endif /* defined(HAVE_FILE_FALLOCATE) || defined(HAVE_INODE_FALLOCATE) */ #ifdef HAVE_FILE_FALLOCATE static long zpl_fallocate(struct file *filp, int mode, loff_t offset, loff_t len) { return zpl_fallocate_common(file_inode(filp), mode, offset, len); } #endif /* HAVE_FILE_FALLOCATE */ /* * Map zfs file z_pflags (xvattr_t) to linux file attributes. Only file * attributes common to both Linux and Solaris are mapped. */ static int zpl_ioctl_getflags(struct file *filp, void __user *arg) { struct inode *ip = file_inode(filp); unsigned int ioctl_flags = 0; uint64_t zfs_flags = ITOZ(ip)->z_pflags; int error; if (zfs_flags & ZFS_IMMUTABLE) ioctl_flags |= FS_IMMUTABLE_FL; if (zfs_flags & ZFS_APPENDONLY) ioctl_flags |= FS_APPEND_FL; if (zfs_flags & ZFS_NODUMP) ioctl_flags |= FS_NODUMP_FL; ioctl_flags &= FS_FL_USER_VISIBLE; error = copy_to_user(arg, &ioctl_flags, sizeof (ioctl_flags)); return (error); } /* * fchange() is a helper macro to detect if we have been asked to change a * flag. This is ugly, but the requirement that we do this is a consequence of * how the Linux file attribute interface was designed. Another consequence is * that concurrent modification of files suffers from a TOCTOU race. Neither * are things we can fix without modifying the kernel-userland interface, which * is outside of our jurisdiction. 
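*/

/*
 * The fchange() macro defined just after this comment reduces to "did the
 * boolean presence of a bit differ between the two flag words", pairing f0
 * with b0 and f1 with b1. A quick standalone check of that behavior; the
 * macro body is copied verbatim, the flag values are toy stand-ins for the
 * FS_*_FL / ZFS_* bits.
 */
#include <stdio.h>

#define fchange(f0, f1, b0, b1) (!((f0) & (b0)) != !((f1) & (b1)))

#define REQ_IMMUTABLE	0x1	/* illustrative, not the real FS_IMMUTABLE_FL */
#define CUR_IMMUTABLE	0x10	/* illustrative, not the real ZFS_IMMUTABLE */

int
main(void)
{
	unsigned requested = REQ_IMMUTABLE;	/* caller asks for immutable */
	unsigned current = 0;			/* file doesn't have it yet */

	/* 1: the flag is being toggled, so extra privilege is required. */
	printf("%d\n", fchange(requested, current, REQ_IMMUTABLE,
	    CUR_IMMUTABLE));
	/* 0: asking for the state the file is already in. */
	printf("%d\n", fchange(0, 0, REQ_IMMUTABLE, CUR_IMMUTABLE));
	return (0);
}

/*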
*/ #define fchange(f0, f1, b0, b1) (!((f0) & (b0)) != !((f1) & (b1))) static int zpl_ioctl_setflags(struct file *filp, void __user *arg) { struct inode *ip = file_inode(filp); uint64_t zfs_flags = ITOZ(ip)->z_pflags; unsigned int ioctl_flags; cred_t *cr = CRED(); xvattr_t xva; xoptattr_t *xoap; int error; fstrans_cookie_t cookie; if (copy_from_user(&ioctl_flags, arg, sizeof (ioctl_flags))) return (-EFAULT); if ((ioctl_flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NODUMP_FL))) return (-EOPNOTSUPP); if ((ioctl_flags & ~(FS_FL_USER_MODIFIABLE))) return (-EACCES); if ((fchange(ioctl_flags, zfs_flags, FS_IMMUTABLE_FL, ZFS_IMMUTABLE) || fchange(ioctl_flags, zfs_flags, FS_APPEND_FL, ZFS_APPENDONLY)) && !capable(CAP_LINUX_IMMUTABLE)) return (-EACCES); if (!zpl_inode_owner_or_capable(ip)) return (-EACCES); xva_init(&xva); xoap = xva_getxoptattr(&xva); XVA_SET_REQ(&xva, XAT_IMMUTABLE); if (ioctl_flags & FS_IMMUTABLE_FL) xoap->xoa_immutable = B_TRUE; XVA_SET_REQ(&xva, XAT_APPENDONLY); if (ioctl_flags & FS_APPEND_FL) xoap->xoa_appendonly = B_TRUE; XVA_SET_REQ(&xva, XAT_NODUMP); if (ioctl_flags & FS_NODUMP_FL) xoap->xoa_nodump = B_TRUE; crhold(cr); cookie = spl_fstrans_mark(); error = -zfs_setattr(ip, (vattr_t *)&xva, 0, cr); spl_fstrans_unmark(cookie); crfree(cr); return (error); } static long zpl_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { switch (cmd) { case FS_IOC_GETFLAGS: return (zpl_ioctl_getflags(filp, (void *)arg)); case FS_IOC_SETFLAGS: return (zpl_ioctl_setflags(filp, (void *)arg)); default: return (-ENOTTY); } } #ifdef CONFIG_COMPAT static long zpl_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { switch (cmd) { case FS_IOC32_GETFLAGS: cmd = FS_IOC_GETFLAGS; break; case FS_IOC32_SETFLAGS: cmd = FS_IOC_SETFLAGS; break; default: return (-ENOTTY); } return (zpl_ioctl(filp, cmd, (unsigned long)compat_ptr(arg))); } #endif /* CONFIG_COMPAT */ const struct address_space_operations zpl_address_space_operations = { .readpages = zpl_readpages, .readpage = zpl_readpage, .writepage = zpl_writepage, .writepages = zpl_writepages, }; const struct file_operations zpl_file_operations = { .open = zpl_open, .release = zpl_release, .llseek = zpl_llseek, .read = zpl_read, .write = zpl_write, #ifdef HAVE_VFS_RW_ITERATE .read_iter = zpl_iter_read, .write_iter = zpl_iter_write, #else .aio_read = zpl_aio_read, .aio_write = zpl_aio_write, #endif .mmap = zpl_mmap, .fsync = zpl_fsync, #ifdef HAVE_FILE_AIO_FSYNC .aio_fsync = zpl_aio_fsync, #endif #ifdef HAVE_FILE_FALLOCATE .fallocate = zpl_fallocate, #endif /* HAVE_FILE_FALLOCATE */ .unlocked_ioctl = zpl_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = zpl_compat_ioctl, #endif }; const struct file_operations zpl_dir_file_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, #ifdef HAVE_VFS_ITERATE_SHARED .iterate_shared = zpl_iterate, #elif defined(HAVE_VFS_ITERATE) .iterate = zpl_iterate, #else .readdir = zpl_readdir, #endif .fsync = zpl_fsync, .unlocked_ioctl = zpl_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = zpl_compat_ioctl, #endif }; diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c index aad110b1bc06..e5cd47afd8cb 100644 --- a/module/zfs/zvol.c +++ b/module/zfs/zvol.c @@ -1,2220 +1,2221 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. 
* * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Rewritten for Linux by Brian Behlendorf . * LLNL-CODE-403049. * * ZFS volume emulation driver. * * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes. * Volumes are accessed through the symbolic links named: * * /dev/<pool_name>/<dataset_name> * * Volumes are persistent through reboot and module load. No user command * needs to be run before opening and using a device. * * Copyright 2014 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2016 Actifio, Inc. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include unsigned int zvol_inhibit_dev = 0; unsigned int zvol_major = ZVOL_MAJOR; unsigned int zvol_prefetch_bytes = (128 * 1024); unsigned long zvol_max_discard_blocks = 16384; static kmutex_t zvol_state_lock; static list_t zvol_state_list; #define ZVOL_HT_SIZE 1024 static struct hlist_head *zvol_htable; #define ZVOL_HT_HEAD(hash) (&zvol_htable[(hash) & (ZVOL_HT_SIZE-1)]) static DEFINE_IDA(zvol_ida); /* * The in-core state of each volume. */ struct zvol_state { char zv_name[MAXNAMELEN]; /* name */ uint64_t zv_volsize; /* advertised space */ uint64_t zv_volblocksize; /* volume block size */ objset_t *zv_objset; /* objset handle */ uint32_t zv_flags; /* ZVOL_* flags */ uint32_t zv_open_count; /* open counts */ uint32_t zv_changed; /* disk changed */ zilog_t *zv_zilog; /* ZIL handle */ zfs_rlock_t zv_range_lock; /* range lock */ dmu_buf_t *zv_dbuf; /* bonus handle */ dev_t zv_dev; /* device id */ struct gendisk *zv_disk; /* generic disk */ struct request_queue *zv_queue; /* request queue */ list_node_t zv_next; /* next zvol_state_t linkage */ uint64_t zv_hash; /* name hash */ struct hlist_node zv_hlink; /* hash link */ atomic_t zv_suspend_ref; /* refcount for suspend */ krwlock_t zv_suspend_lock; /* suspend lock */ }; typedef enum { ZVOL_ASYNC_CREATE_MINORS, ZVOL_ASYNC_REMOVE_MINORS, ZVOL_ASYNC_RENAME_MINORS, ZVOL_ASYNC_SET_SNAPDEV, ZVOL_ASYNC_MAX } zvol_async_op_t; typedef struct { zvol_async_op_t op; char pool[MAXNAMELEN]; char name1[MAXNAMELEN]; char name2[MAXNAMELEN]; zprop_source_t source; uint64_t snapdev; } zvol_task_t; #define ZVOL_RDONLY 0x1 static uint64_t zvol_name_hash(const char *name) { int i; uint64_t crc = -1ULL; uint8_t *p = (uint8_t *)name; ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); for (i = 0; i < MAXNAMELEN - 1 && *p; i++, p++) { crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (*p)) & 0xFF]; } return (crc); } /* * Find a zvol_state_t given the full major+minor dev_t.
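*/

/*
 * zvol_name_hash() above is a table-driven CRC-64 accumulated over the
 * volume name. A standalone sketch: the table generation below (reflected
 * ECMA-182 polynomial) is an assumption about how zfs_crc64_table is built,
 * but it satisfies the same zfs_crc64_table[128] == ZFS_CRC64_POLY sanity
 * check, and the bucket selection matches ZVOL_HT_HEAD().
 */
#include <stdio.h>
#include <stdint.h>
#include <assert.h>

#define ZFS_CRC64_POLY	0xC96C5795D7870F42ULL
#define ZVOL_HT_SIZE	1024

static uint64_t crc64_table[256];

static void
crc64_init(void)
{
	int i, j;

	for (i = 0; i < 256; i++) {
		uint64_t c = i;
		for (j = 0; j < 8; j++)
			c = (c >> 1) ^ ((c & 1) ? ZFS_CRC64_POLY : 0);
		crc64_table[i] = c;
	}
	assert(crc64_table[128] == ZFS_CRC64_POLY);
}

/*
 * Same accumulation as zvol_name_hash(): start at -1, no final xor
 * (the MAXNAMELEN cap is omitted here for brevity).
 */
static uint64_t
name_hash(const char *name)
{
	const uint8_t *p = (const uint8_t *)name;
	uint64_t crc = -1ULL;

	for (; *p != '\0'; p++)
		crc = (crc >> 8) ^ crc64_table[(crc ^ *p) & 0xFF];
	return (crc);
}

int
main(void)
{
	uint64_t h;

	crc64_init();
	h = name_hash("tank/vol1");
	printf("hash %016llx -> bucket %llu\n", (unsigned long long)h,
	    (unsigned long long)(h & (ZVOL_HT_SIZE - 1)));
	return (0);
}

/*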
*/ static zvol_state_t * zvol_find_by_dev(dev_t dev) { zvol_state_t *zv; ASSERT(MUTEX_HELD(&zvol_state_lock)); for (zv = list_head(&zvol_state_list); zv != NULL; zv = list_next(&zvol_state_list, zv)) { if (zv->zv_dev == dev) return (zv); } return (NULL); } /* * Find a zvol_state_t given the name and hash generated by zvol_name_hash. */ static zvol_state_t * zvol_find_by_name_hash(const char *name, uint64_t hash) { zvol_state_t *zv; struct hlist_node *p; ASSERT(MUTEX_HELD(&zvol_state_lock)); hlist_for_each(p, ZVOL_HT_HEAD(hash)) { zv = hlist_entry(p, zvol_state_t, zv_hlink); if (zv->zv_hash == hash && strncmp(zv->zv_name, name, MAXNAMELEN) == 0) return (zv); } return (NULL); } /* * Find a zvol_state_t given the name provided at zvol_alloc() time. */ static zvol_state_t * zvol_find_by_name(const char *name) { return (zvol_find_by_name_hash(name, zvol_name_hash(name))); } /* * Given a path, return TRUE if path is a ZVOL. */ boolean_t zvol_is_zvol(const char *device) { struct block_device *bdev; unsigned int major; bdev = vdev_lookup_bdev(device); if (IS_ERR(bdev)) return (B_FALSE); major = MAJOR(bdev->bd_dev); bdput(bdev); if (major == zvol_major) return (B_TRUE); return (B_FALSE); } /* * ZFS_IOC_CREATE callback handles dmu zvol and zap object creation. */ void zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) { zfs_creat_t *zct = arg; nvlist_t *nvprops = zct->zct_props; int error; uint64_t volblocksize, volsize; VERIFY(nvlist_lookup_uint64(nvprops, zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) == 0); if (nvlist_lookup_uint64(nvprops, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0) volblocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE); /* * These properties must be removed from the list so the generic * property setting step won't apply to them. */ VERIFY(nvlist_remove_all(nvprops, zfs_prop_to_name(ZFS_PROP_VOLSIZE)) == 0); (void) nvlist_remove_all(nvprops, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE)); error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, volblocksize, DMU_OT_NONE, 0, tx); ASSERT(error == 0); error = zap_create_claim(os, ZVOL_ZAP_OBJ, DMU_OT_ZVOL_PROP, DMU_OT_NONE, 0, tx); ASSERT(error == 0); error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx); ASSERT(error == 0); } /* * ZFS_IOC_OBJSET_STATS entry point. */ int zvol_get_stats(objset_t *os, nvlist_t *nv) { int error; dmu_object_info_t *doi; uint64_t val; error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &val); if (error) return (SET_ERROR(error)); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLSIZE, val); doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP); error = dmu_object_info(os, ZVOL_OBJ, doi); if (error == 0) { dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLBLOCKSIZE, doi->doi_data_block_size); } kmem_free(doi, sizeof (dmu_object_info_t)); return (SET_ERROR(error)); } static void zvol_size_changed(zvol_state_t *zv, uint64_t volsize) { struct block_device *bdev; bdev = bdget_disk(zv->zv_disk, 0); if (bdev == NULL) return; set_capacity(zv->zv_disk, volsize >> 9); zv->zv_volsize = volsize; check_disk_size_change(zv->zv_disk, bdev); bdput(bdev); } /* * Sanity check volume size. */ int zvol_check_volsize(uint64_t volsize, uint64_t blocksize) { if (volsize == 0) return (SET_ERROR(EINVAL)); if (volsize % blocksize != 0) return (SET_ERROR(EINVAL)); #ifdef _ILP32 if (volsize - 1 > SPEC_MAXOFFSET_T) return (SET_ERROR(EOVERFLOW)); #endif return (0); } /* * Ensure the zap is flushed then inform the VFS of the capacity change. 
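 * The new size is written to the "size" entry of ZVOL_ZAP_OBJ and the * containing txg is synced before dmu_free_long_range() trims any blocks * beyond the new end of the volume.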
*/ static int zvol_update_volsize(uint64_t volsize, objset_t *os) { dmu_tx_t *tx; int error; uint64_t txg; ASSERT(MUTEX_HELD(&zvol_state_lock)); tx = dmu_tx_create(os); dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL); dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); return (SET_ERROR(error)); } txg = dmu_tx_get_txg(tx); error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx); dmu_tx_commit(tx); txg_wait_synced(dmu_objset_pool(os), txg); if (error == 0) error = dmu_free_long_range(os, ZVOL_OBJ, volsize, DMU_OBJECT_END); return (error); } static int zvol_update_live_volsize(zvol_state_t *zv, uint64_t volsize) { zvol_size_changed(zv, volsize); /* * We should post an event here describing the expansion. However, * the zfs_ereport_post() interface doesn't nicely support posting * events for zvols; it assumes events relate to vdevs or zios. */ return (0); } /* * ZFS_PROP_VOLSIZE set entry point. */ int zvol_set_volsize(const char *name, uint64_t volsize) { zvol_state_t *zv = NULL; objset_t *os = NULL; int error; dmu_object_info_t *doi; uint64_t readonly; boolean_t owned = B_FALSE; error = dsl_prop_get_integer(name, zfs_prop_to_name(ZFS_PROP_READONLY), &readonly, NULL); if (error != 0) return (SET_ERROR(error)); if (readonly) return (SET_ERROR(EROFS)); mutex_enter(&zvol_state_lock); zv = zvol_find_by_name(name); if (zv == NULL || zv->zv_objset == NULL) { if ((error = dmu_objset_own(name, DMU_OST_ZVOL, B_FALSE, FTAG, &os)) != 0) { mutex_exit(&zvol_state_lock); return (SET_ERROR(error)); } owned = B_TRUE; if (zv != NULL) zv->zv_objset = os; } else { rw_enter(&zv->zv_suspend_lock, RW_READER); os = zv->zv_objset; } doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP); if ((error = dmu_object_info(os, ZVOL_OBJ, doi)) || (error = zvol_check_volsize(volsize, doi->doi_data_block_size))) goto out; error = zvol_update_volsize(volsize, os); kmem_free(doi, sizeof (dmu_object_info_t)); if (error == 0 && zv != NULL) error = zvol_update_live_volsize(zv, volsize); out: if (owned) { dmu_objset_disown(os, FTAG); if (zv != NULL) zv->zv_objset = NULL; } else { rw_exit(&zv->zv_suspend_lock); } mutex_exit(&zvol_state_lock); return (error); } /* * Sanity check volume block size. */ int zvol_check_volblocksize(const char *name, uint64_t volblocksize) { /* Record sizes above 128k need the feature to be enabled */ if (volblocksize > SPA_OLD_MAXBLOCKSIZE) { spa_t *spa; int error; if ((error = spa_open(name, &spa, FTAG)) != 0) return (error); if (!spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } /* * We don't allow setting the property above 1MB, * unless the tunable has been changed. */ if (volblocksize > zfs_max_recordsize) return (SET_ERROR(EDOM)); spa_close(spa, FTAG); } if (volblocksize < SPA_MINBLOCKSIZE || volblocksize > SPA_MAXBLOCKSIZE || !ISP2(volblocksize)) return (SET_ERROR(EDOM)); return (0); } /* * ZFS_PROP_VOLBLOCKSIZE set entry point.
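 * dmu_object_set_blocksize() typically fails with ENOTSUP once data has * been written to the volume; the code below remaps this to EBUSY so the * caller sees the more descriptive "volume busy" error.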
*/ int zvol_set_volblocksize(const char *name, uint64_t volblocksize) { zvol_state_t *zv; dmu_tx_t *tx; int error; mutex_enter(&zvol_state_lock); zv = zvol_find_by_name(name); if (zv == NULL) { error = SET_ERROR(ENXIO); goto out; } if (zv->zv_flags & ZVOL_RDONLY) { error = SET_ERROR(EROFS); goto out; } rw_enter(&zv->zv_suspend_lock, RW_READER); tx = dmu_tx_create(zv->zv_objset); dmu_tx_hold_bonus(tx, ZVOL_OBJ); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); } else { error = dmu_object_set_blocksize(zv->zv_objset, ZVOL_OBJ, volblocksize, 0, tx); if (error == ENOTSUP) error = SET_ERROR(EBUSY); dmu_tx_commit(tx); if (error == 0) zv->zv_volblocksize = volblocksize; } rw_exit(&zv->zv_suspend_lock); out: mutex_exit(&zvol_state_lock); return (SET_ERROR(error)); } /* * Replay a TX_TRUNCATE ZIL transaction if asked. TX_TRUNCATE is how we * implement DKIOCFREE/free-long-range. */ static int zvol_replay_truncate(zvol_state_t *zv, lr_truncate_t *lr, boolean_t byteswap) { uint64_t offset, length; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); offset = lr->lr_offset; length = lr->lr_length; return (dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, offset, length)); } /* * Replay a TX_WRITE ZIL transaction that didn't get committed * after a system failure */ static int zvol_replay_write(zvol_state_t *zv, lr_write_t *lr, boolean_t byteswap) { objset_t *os = zv->zv_objset; char *data = (char *)(lr + 1); /* data follows lr_write_t */ uint64_t off = lr->lr_offset; uint64_t len = lr->lr_length; dmu_tx_t *tx; int error; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); tx = dmu_tx_create(os); dmu_tx_hold_write(tx, ZVOL_OBJ, off, len); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); } else { dmu_write(os, ZVOL_OBJ, off, len, data, tx); dmu_tx_commit(tx); } return (SET_ERROR(error)); } static int zvol_replay_err(zvol_state_t *zv, lr_t *lr, boolean_t byteswap) { return (SET_ERROR(ENOTSUP)); } /* * Callback vectors for replaying records. * Only TX_WRITE and TX_TRUNCATE are needed for zvol. */ zil_replay_func_t zvol_replay_vector[TX_MAX_TYPE] = { (zil_replay_func_t)zvol_replay_err, /* no such transaction type */ (zil_replay_func_t)zvol_replay_err, /* TX_CREATE */ (zil_replay_func_t)zvol_replay_err, /* TX_MKDIR */ (zil_replay_func_t)zvol_replay_err, /* TX_MKXATTR */ (zil_replay_func_t)zvol_replay_err, /* TX_SYMLINK */ (zil_replay_func_t)zvol_replay_err, /* TX_REMOVE */ (zil_replay_func_t)zvol_replay_err, /* TX_RMDIR */ (zil_replay_func_t)zvol_replay_err, /* TX_LINK */ (zil_replay_func_t)zvol_replay_err, /* TX_RENAME */ (zil_replay_func_t)zvol_replay_write, /* TX_WRITE */ (zil_replay_func_t)zvol_replay_truncate, /* TX_TRUNCATE */ (zil_replay_func_t)zvol_replay_err, /* TX_SETATTR */ (zil_replay_func_t)zvol_replay_err, /* TX_ACL */ }; /* * zvol_log_write() handles synchronous writes using TX_WRITE ZIL transactions. * * We store data in the log buffers if it's small enough. * Otherwise we will later flush the data out via dmu_sync(). */ ssize_t zvol_immediate_write_sz = 32768; static void zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset, uint64_t size, int sync) { uint32_t blocksize = zv->zv_volblocksize; zilog_t *zilog = zv->zv_zilog; boolean_t slogging; ssize_t immediate_write_sz; if (zil_replaying(zilog, tx)) return; immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT) ? 
0 : zvol_immediate_write_sz; slogging = spa_has_slogs(zilog->zl_spa) && (zilog->zl_logbias == ZFS_LOGBIAS_LATENCY); while (size) { itx_t *itx; lr_write_t *lr; ssize_t len; itx_wr_state_t write_state; /* * Unlike zfs_log_write() we can be called with * up to DMU_MAX_ACCESS/2 (5MB) writes. */ if (blocksize > immediate_write_sz && !slogging && size >= blocksize && offset % blocksize == 0) { write_state = WR_INDIRECT; /* uses dmu_sync */ len = blocksize; } else if (sync) { write_state = WR_COPIED; len = MIN(ZIL_MAX_LOG_DATA, size); } else { write_state = WR_NEED_COPY; len = MIN(ZIL_MAX_LOG_DATA, size); } itx = zil_itx_create(TX_WRITE, sizeof (*lr) + (write_state == WR_COPIED ? len : 0)); lr = (lr_write_t *)&itx->itx_lr; if (write_state == WR_COPIED && dmu_read(zv->zv_objset, ZVOL_OBJ, offset, len, lr+1, DMU_READ_NO_PREFETCH) != 0) { zil_itx_destroy(itx); itx = zil_itx_create(TX_WRITE, sizeof (*lr)); lr = (lr_write_t *)&itx->itx_lr; write_state = WR_NEED_COPY; } itx->itx_wr_state = write_state; if (write_state == WR_NEED_COPY) itx->itx_sod += len; lr->lr_foid = ZVOL_OBJ; lr->lr_offset = offset; lr->lr_length = len; lr->lr_blkoff = 0; BP_ZERO(&lr->lr_blkptr); itx->itx_private = zv; itx->itx_sync = sync; (void) zil_itx_assign(zilog, itx, tx); offset += len; size -= len; } } static int zvol_write(zvol_state_t *zv, uio_t *uio, boolean_t sync) { uint64_t volsize = zv->zv_volsize; rl_t *rl; int error = 0; ASSERT(zv && zv->zv_open_count > 0); rl = zfs_range_lock(&zv->zv_range_lock, uio->uio_loffset, uio->uio_resid, RL_WRITER); while (uio->uio_resid > 0 && uio->uio_loffset < volsize) { uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1); uint64_t off = uio->uio_loffset; dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); if (bytes > volsize - off) /* don't write past the end */ bytes = volsize - off; dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes); /* This will only fail for ENOSPC */ error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); break; } error = dmu_write_uio_dbuf(zv->zv_dbuf, uio, bytes, tx); if (error == 0) zvol_log_write(zv, tx, off, bytes, sync); dmu_tx_commit(tx); if (error) break; } zfs_range_unlock(rl); if (sync) zil_commit(zv->zv_zilog, ZVOL_OBJ); return (error); } /* * Log a DKIOCFREE/free-long-range to the ZIL with TX_TRUNCATE. */ static void zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off, uint64_t len, boolean_t sync) { itx_t *itx; lr_truncate_t *lr; zilog_t *zilog = zv->zv_zilog; if (zil_replaying(zilog, tx)) return; itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr)); lr = (lr_truncate_t *)&itx->itx_lr; lr->lr_foid = ZVOL_OBJ; lr->lr_offset = off; lr->lr_length = len; itx->itx_sync = sync; zil_itx_assign(zilog, itx, tx); } static int zvol_discard(struct bio *bio) { zvol_state_t *zv = bio->bi_bdev->bd_disk->private_data; uint64_t start = BIO_BI_SECTOR(bio) << 9; uint64_t size = BIO_BI_SIZE(bio); uint64_t end = start + size; int error; rl_t *rl; dmu_tx_t *tx; ASSERT(zv && zv->zv_open_count > 0); if (end > zv->zv_volsize) return (SET_ERROR(EIO)); /* * Align the request to volume block boundaries when a secure erase is * not required. This will prevent dnode_free_range() from zeroing out * the unaligned parts which is slow (read-modify-write) and useless * since we are not freeing any space by doing so. 
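 * For example, with an 8K volblocksize a discard of bytes 4096-20479 is * trimmed to the single fully covered block at 8192-16383; the partially * covered blocks on either side are left untouched.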
*/ if (!bio_is_secure_erase(bio)) { start = P2ROUNDUP(start, zv->zv_volblocksize); end = P2ALIGN(end, zv->zv_volblocksize); size = end - start; } if (start >= end) return (0); rl = zfs_range_lock(&zv->zv_range_lock, start, size, RL_WRITER); tx = dmu_tx_create(zv->zv_objset); dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, TXG_WAIT); if (error != 0) { dmu_tx_abort(tx); } else { zvol_log_truncate(zv, tx, start, size, B_TRUE); dmu_tx_commit(tx); error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, start, size); } zfs_range_unlock(rl); return (error); } static int zvol_read(zvol_state_t *zv, uio_t *uio) { uint64_t volsize = zv->zv_volsize; rl_t *rl; int error = 0; ASSERT(zv && zv->zv_open_count > 0); rl = zfs_range_lock(&zv->zv_range_lock, uio->uio_loffset, uio->uio_resid, RL_READER); while (uio->uio_resid > 0 && uio->uio_loffset < volsize) { uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1); /* don't read past the end */ if (bytes > volsize - uio->uio_loffset) bytes = volsize - uio->uio_loffset; error = dmu_read_uio_dbuf(zv->zv_dbuf, uio, bytes); if (error) { /* convert checksum errors into IO errors */ if (error == ECKSUM) error = SET_ERROR(EIO); break; } } zfs_range_unlock(rl); return (error); } static MAKE_REQUEST_FN_RET zvol_request(struct request_queue *q, struct bio *bio) { uio_t uio; zvol_state_t *zv = q->queuedata; fstrans_cookie_t cookie = spl_fstrans_mark(); int rw = bio_data_dir(bio); #ifdef HAVE_GENERIC_IO_ACCT unsigned long start = jiffies; #endif int error = 0; rw_enter(&zv->zv_suspend_lock, RW_READER); uio.uio_bvec = &bio->bi_io_vec[BIO_BI_IDX(bio)]; uio.uio_skip = BIO_BI_SKIP(bio); uio.uio_resid = BIO_BI_SIZE(bio); uio.uio_iovcnt = bio->bi_vcnt - BIO_BI_IDX(bio); uio.uio_loffset = BIO_BI_SECTOR(bio) << 9; uio.uio_limit = MAXOFFSET_T; uio.uio_segflg = UIO_BVEC; if (bio_has_data(bio) && uio.uio_loffset + uio.uio_resid > zv->zv_volsize) { printk(KERN_INFO "%s: bad access: offset=%llu, size=%lu\n", zv->zv_disk->disk_name, (long long unsigned)uio.uio_loffset, (long unsigned)uio.uio_resid); error = SET_ERROR(EIO); goto out1; } generic_start_io_acct(rw, bio_sectors(bio), &zv->zv_disk->part0); if (rw == WRITE) { if (unlikely(zv->zv_flags & ZVOL_RDONLY)) { error = SET_ERROR(EROFS); goto out2; } if (bio_is_discard(bio) || bio_is_secure_erase(bio)) { error = zvol_discard(bio); goto out2; } /* * Some requests are just for flush and nothing else. */ if (uio.uio_resid == 0) { if (bio_is_flush(bio)) zil_commit(zv->zv_zilog, ZVOL_OBJ); goto out2; } error = zvol_write(zv, &uio, bio_is_flush(bio) || bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS); } else error = zvol_read(zv, &uio); out2: generic_end_io_acct(rw, &zv->zv_disk->part0, start); out1: BIO_END_IO(bio, -error); rw_exit(&zv->zv_suspend_lock); spl_fstrans_unmark(cookie); #ifdef HAVE_MAKE_REQUEST_FN_RET_INT return (0); #elif defined(HAVE_MAKE_REQUEST_FN_RET_QC) return (BLK_QC_T_NONE); #endif } static void zvol_get_done(zgd_t *zgd, int error) { if (zgd->zgd_db) dmu_buf_rele(zgd->zgd_db, zgd); zfs_range_unlock(zgd->zgd_rl); if (error == 0 && zgd->zgd_bp) zil_add_block(zgd->zgd_zilog, zgd->zgd_bp); kmem_free(zgd, sizeof (zgd_t)); } /* * Get data to generate a TX_WRITE intent log record. 
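 * This is the callback registered with zil_open(); the ZIL invokes it at * commit time when a TX_WRITE record needs its data, either copying the * data into the supplied buffer (immediate) or syncing it out via * dmu_sync() (indirect) as described below.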
*/ static int zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) { zvol_state_t *zv = arg; objset_t *os = zv->zv_objset; uint64_t object = ZVOL_OBJ; uint64_t offset = lr->lr_offset; uint64_t size = lr->lr_length; blkptr_t *bp = &lr->lr_blkptr; dmu_buf_t *db; zgd_t *zgd; int error; ASSERT(zio != NULL); ASSERT(size != 0); zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP); zgd->zgd_zilog = zv->zv_zilog; zgd->zgd_rl = zfs_range_lock(&zv->zv_range_lock, offset, size, RL_READER); /* * Write records come in two flavors: immediate and indirect. * For small writes it's cheaper to store the data with the * log record (immediate); for large writes it's cheaper to * sync the data and get a pointer to it (indirect) so that * we don't have to write the data twice. */ if (buf != NULL) { /* immediate write */ error = dmu_read(os, object, offset, size, buf, DMU_READ_NO_PREFETCH); } else { size = zv->zv_volblocksize; offset = P2ALIGN_TYPED(offset, size, uint64_t); error = dmu_buf_hold(os, object, offset, zgd, &db, DMU_READ_NO_PREFETCH); if (error == 0) { blkptr_t *obp = dmu_buf_get_blkptr(db); if (obp) { ASSERT(BP_IS_HOLE(bp)); *bp = *obp; } zgd->zgd_db = db; zgd->zgd_bp = &lr->lr_blkptr; ASSERT(db != NULL); ASSERT(db->db_offset == offset); ASSERT(db->db_size == size); error = dmu_sync(zio, lr->lr_common.lrc_txg, zvol_get_done, zgd); if (error == 0) return (0); } } zvol_get_done(zgd, error); return (SET_ERROR(error)); } /* * The zvol_state_t's are inserted into zvol_state_list and zvol_htable. */ static void zvol_insert(zvol_state_t *zv) { ASSERT(MUTEX_HELD(&zvol_state_lock)); ASSERT3U(MINOR(zv->zv_dev) & ZVOL_MINOR_MASK, ==, 0); list_insert_head(&zvol_state_list, zv); hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash)); } /* * Simply remove the zvol from the list of zvols. */ static void zvol_remove(zvol_state_t *zv) { ASSERT(MUTEX_HELD(&zvol_state_lock)); list_remove(&zvol_state_list, zv); hlist_del(&zv->zv_hlink); } /* * Set up zv after we have just taken ownership of zv->objset. */ static int zvol_setup_zv(zvol_state_t *zv) { uint64_t volsize; int error; uint64_t ro; objset_t *os = zv->zv_objset; error = dsl_prop_get_integer(zv->zv_name, "readonly", &ro, NULL); if (error) return (SET_ERROR(error)); error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); if (error) return (SET_ERROR(error)); error = dmu_bonus_hold(os, ZVOL_OBJ, zv, &zv->zv_dbuf); if (error) return (SET_ERROR(error)); set_capacity(zv->zv_disk, volsize >> 9); zv->zv_volsize = volsize; zv->zv_zilog = zil_open(os, zvol_get_data); if (ro || dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os))) { set_disk_ro(zv->zv_disk, 1); zv->zv_flags |= ZVOL_RDONLY; } else { set_disk_ro(zv->zv_disk, 0); zv->zv_flags &= ~ZVOL_RDONLY; } return (0); } /* * Shut down everything related to zv_objset except zv_objset itself. * This is the reverse of zvol_setup_zv. */ static void zvol_shutdown_zv(zvol_state_t *zv) { zil_close(zv->zv_zilog); zv->zv_zilog = NULL; dmu_buf_rele(zv->zv_dbuf, zv); zv->zv_dbuf = NULL; /* * Evict cached data */ if (dsl_dataset_is_dirty(dmu_objset_ds(zv->zv_objset)) && !(zv->zv_flags & ZVOL_RDONLY)) txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0); (void) dmu_objset_evict_dbufs(zv->zv_objset); } /* * Return the proper tag for rollback and recv. */ void * zvol_tag(zvol_state_t *zv) { ASSERT(RW_WRITE_HELD(&zv->zv_suspend_lock)); return (zv->zv_open_count > 0 ? zv : NULL); } /* * Suspend the zvol for recv and rollback.
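 * The zv_suspend_lock is taken as writer here and intentionally left held; * it is dropped in zvol_resume() once the receive or rollback has * completed, unblocking any I/O queued in the meantime.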
*/ zvol_state_t * zvol_suspend(const char *name) { zvol_state_t *zv; mutex_enter(&zvol_state_lock); zv = zvol_find_by_name(name); if (zv == NULL) goto out; /* block all I/O, release in zvol_resume. */ rw_enter(&zv->zv_suspend_lock, RW_WRITER); atomic_inc(&zv->zv_suspend_ref); if (zv->zv_open_count > 0) zvol_shutdown_zv(zv); out: mutex_exit(&zvol_state_lock); return (zv); } int zvol_resume(zvol_state_t *zv) { int error = 0; ASSERT(RW_WRITE_HELD(&zv->zv_suspend_lock)); if (zv->zv_open_count > 0) { VERIFY0(dmu_objset_hold(zv->zv_name, zv, &zv->zv_objset)); VERIFY3P(zv->zv_objset->os_dsl_dataset->ds_owner, ==, zv); VERIFY(dsl_dataset_long_held(zv->zv_objset->os_dsl_dataset)); dmu_objset_rele(zv->zv_objset, zv); error = zvol_setup_zv(zv); } rw_exit(&zv->zv_suspend_lock); /* * We need this because we don't hold zvol_state_lock while releasing * zv_suspend_lock. zvol_remove_minors_impl thus cannot check * zv_suspend_lock to determine it is safe to free because a rwlock is * not inherently atomic. */ atomic_dec(&zv->zv_suspend_ref); return (SET_ERROR(error)); } static int zvol_first_open(zvol_state_t *zv) { objset_t *os; int error; /* lie and say we're read-only */ error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, 1, zv, &os); if (error) return (SET_ERROR(-error)); zv->zv_objset = os; error = zvol_setup_zv(zv); if (error) { dmu_objset_disown(os, zv); zv->zv_objset = NULL; } return (SET_ERROR(-error)); } static void zvol_last_close(zvol_state_t *zv) { zvol_shutdown_zv(zv); dmu_objset_disown(zv->zv_objset, zv); zv->zv_objset = NULL; } static int zvol_open(struct block_device *bdev, fmode_t flag) { zvol_state_t *zv; int error = 0, drop_mutex = 0, drop_suspend = 0; /* * If the caller is already holding the mutex do not take it * again; this will happen as part of zvol_create_minor_impl(). * Once add_disk() is called the device is live and the kernel * will attempt to open it to read the partition information. */ if (!mutex_owned(&zvol_state_lock)) { mutex_enter(&zvol_state_lock); drop_mutex = 1; } /* * Obtain a copy of private_data under the lock to make sure * that either the result of zvol_free() setting * bdev->bd_disk->private_data to NULL is observed, or zvol_free() * is not called on this zv because of the positive zv_open_count.
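 * zvol_remove_minors_impl() clears private_data under this same lock * before dispatching the free, which is what makes the NULL check below a * reliable indication that the device is going away.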
zv = bdev->bd_disk->private_data; if (zv == NULL) { error = -ENXIO; goto out_mutex; } if (zv->zv_open_count == 0) { /* make sure zvol is not suspended when first open */ rw_enter(&zv->zv_suspend_lock, RW_READER); drop_suspend = 1; error = zvol_first_open(zv); if (error) goto out_mutex; } if ((flag & FMODE_WRITE) && (zv->zv_flags & ZVOL_RDONLY)) { error = -EROFS; goto out_open_count; } zv->zv_open_count++; check_disk_change(bdev); out_open_count: if (zv->zv_open_count == 0) zvol_last_close(zv); out_mutex: if (drop_suspend) rw_exit(&zv->zv_suspend_lock); if (drop_mutex) mutex_exit(&zvol_state_lock); return (SET_ERROR(error)); } #ifdef HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID static void #else static int #endif zvol_release(struct gendisk *disk, fmode_t mode) { zvol_state_t *zv = disk->private_data; int drop_mutex = 0; ASSERT(zv && zv->zv_open_count > 0); if (!mutex_owned(&zvol_state_lock)) { mutex_enter(&zvol_state_lock); drop_mutex = 1; } /* make sure zvol is not suspended when last close */ if (zv->zv_open_count == 1) rw_enter(&zv->zv_suspend_lock, RW_READER); zv->zv_open_count--; if (zv->zv_open_count == 0) { zvol_last_close(zv); rw_exit(&zv->zv_suspend_lock); } if (drop_mutex) mutex_exit(&zvol_state_lock); #ifndef HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID return (0); #endif } static int zvol_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { zvol_state_t *zv = bdev->bd_disk->private_data; int error = 0; ASSERT(zv && zv->zv_open_count > 0); rw_enter(&zv->zv_suspend_lock, RW_READER); switch (cmd) { case BLKFLSBUF: zil_commit(zv->zv_zilog, ZVOL_OBJ); break; case BLKZNAME: error = copy_to_user((void *)arg, zv->zv_name, MAXNAMELEN); break; default: error = -ENOTTY; break; } rw_exit(&zv->zv_suspend_lock); return (SET_ERROR(error)); } #ifdef CONFIG_COMPAT static int zvol_compat_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, unsigned long arg) { return (zvol_ioctl(bdev, mode, cmd, arg)); } #else #define zvol_compat_ioctl NULL #endif static int zvol_media_changed(struct gendisk *disk) { zvol_state_t *zv = disk->private_data; ASSERT(zv && zv->zv_open_count > 0); return (zv->zv_changed); } static int zvol_revalidate_disk(struct gendisk *disk) { zvol_state_t *zv = disk->private_data; ASSERT(zv && zv->zv_open_count > 0); zv->zv_changed = 0; set_capacity(zv->zv_disk, zv->zv_volsize >> 9); return (0); } /* * Provide a simple virtual geometry for legacy compatibility. For devices * smaller than 1 MiB a small head and sector count is used to allow very * tiny devices. For devices over 1 MiB a standard head and sector count * is used to keep the cylinder count reasonable. */ static int zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo) { zvol_state_t *zv = bdev->bd_disk->private_data; sector_t sectors; ASSERT(zv && zv->zv_open_count > 0); sectors = get_capacity(zv->zv_disk); if (sectors > 2048) { geo->heads = 16; geo->sectors = 63; } else { geo->heads = 2; geo->sectors = 4; } geo->start = 0; geo->cylinders = sectors / (geo->heads * geo->sectors); return (0); } static struct kobject * zvol_probe(dev_t dev, int *part, void *arg) { zvol_state_t *zv; struct kobject *kobj; mutex_enter(&zvol_state_lock); zv = zvol_find_by_dev(dev); kobj = zv ?
get_disk(zv->zv_disk) : NULL; mutex_exit(&zvol_state_lock); return (kobj); } #ifdef HAVE_BDEV_BLOCK_DEVICE_OPERATIONS static struct block_device_operations zvol_ops = { .open = zvol_open, .release = zvol_release, .ioctl = zvol_ioctl, .compat_ioctl = zvol_compat_ioctl, .media_changed = zvol_media_changed, .revalidate_disk = zvol_revalidate_disk, .getgeo = zvol_getgeo, .owner = THIS_MODULE, }; #else /* HAVE_BDEV_BLOCK_DEVICE_OPERATIONS */ static int zvol_open_by_inode(struct inode *inode, struct file *file) { return (zvol_open(inode->i_bdev, file->f_mode)); } static int zvol_release_by_inode(struct inode *inode, struct file *file) { return (zvol_release(inode->i_bdev->bd_disk, file->f_mode)); } static int zvol_ioctl_by_inode(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg) { if (file == NULL || inode == NULL) return (SET_ERROR(-EINVAL)); return (zvol_ioctl(inode->i_bdev, file->f_mode, cmd, arg)); } #ifdef CONFIG_COMPAT static long zvol_compat_ioctl_by_inode(struct file *file, unsigned int cmd, unsigned long arg) { if (file == NULL) return (SET_ERROR(-EINVAL)); return (zvol_compat_ioctl(file->f_dentry->d_inode->i_bdev, file->f_mode, cmd, arg)); } #else #define zvol_compat_ioctl_by_inode NULL #endif static struct block_device_operations zvol_ops = { .open = zvol_open_by_inode, .release = zvol_release_by_inode, .ioctl = zvol_ioctl_by_inode, .compat_ioctl = zvol_compat_ioctl_by_inode, .media_changed = zvol_media_changed, .revalidate_disk = zvol_revalidate_disk, .getgeo = zvol_getgeo, .owner = THIS_MODULE, }; #endif /* HAVE_BDEV_BLOCK_DEVICE_OPERATIONS */ /* * Allocate memory for a new zvol_state_t and set up the required * request queue and generic disk structures for the block device. */ static zvol_state_t * zvol_alloc(dev_t dev, const char *name) { zvol_state_t *zv; zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP); list_link_init(&zv->zv_next); zv->zv_queue = blk_alloc_queue(GFP_ATOMIC); if (zv->zv_queue == NULL) goto out_kmem; blk_queue_make_request(zv->zv_queue, zvol_request); blk_queue_set_write_cache(zv->zv_queue, B_TRUE, B_TRUE); zv->zv_disk = alloc_disk(ZVOL_MINORS); if (zv->zv_disk == NULL) goto out_queue; zv->zv_queue->queuedata = zv; zv->zv_dev = dev; zv->zv_open_count = 0; strlcpy(zv->zv_name, name, MAXNAMELEN); zfs_rlock_init(&zv->zv_range_lock); rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL); zv->zv_disk->major = zvol_major; zv->zv_disk->first_minor = (dev & MINORMASK); zv->zv_disk->fops = &zvol_ops; zv->zv_disk->private_data = zv; zv->zv_disk->queue = zv->zv_queue; snprintf(zv->zv_disk->disk_name, DISK_NAME_LEN, "%s%d", ZVOL_DEV_NAME, (dev & MINORMASK)); return (zv); out_queue: blk_cleanup_queue(zv->zv_queue); out_kmem: kmem_free(zv, sizeof (zvol_state_t)); return (NULL); } /* * Used from the taskq; if called outside zvol_state_lock, you need to clear * zv_disk->private_data under the lock first. */ static void zvol_free_impl(void *arg) { zvol_state_t *zv = arg; ASSERT(zv->zv_open_count == 0); rw_destroy(&zv->zv_suspend_lock); zfs_rlock_destroy(&zv->zv_range_lock); zv->zv_disk->private_data = NULL; del_gendisk(zv->zv_disk); blk_cleanup_queue(zv->zv_queue); put_disk(zv->zv_disk); ida_simple_remove(&zvol_ida, MINOR(zv->zv_dev) >> ZVOL_MINOR_BITS); kmem_free(zv, sizeof (zvol_state_t)); } /* * Cleanup then free a zvol_state_t which was created by zvol_alloc().
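 * This wrapper only asserts that zvol_state_lock is held; the actual * teardown of the gendisk, request queue, and zvol_state_t is done by * zvol_free_impl(), which may also run asynchronously from the taskq.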
*/ static void zvol_free(zvol_state_t *zv) { ASSERT(MUTEX_HELD(&zvol_state_lock)); zvol_free_impl(zv); } /* * Create a block device minor node and set up the linkage between it * and the specified volume. Once this function returns the block * device is live and ready for use. */ static int zvol_create_minor_impl(const char *name) { zvol_state_t *zv; objset_t *os; dmu_object_info_t *doi; uint64_t volsize; uint64_t len; unsigned minor = 0; int error = 0; int idx; uint64_t hash = zvol_name_hash(name); idx = ida_simple_get(&zvol_ida, 0, 0, kmem_flags_convert(KM_SLEEP)); if (idx < 0) return (SET_ERROR(-idx)); minor = idx << ZVOL_MINOR_BITS; mutex_enter(&zvol_state_lock); zv = zvol_find_by_name_hash(name, hash); if (zv) { error = SET_ERROR(EEXIST); goto out; } doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP); error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, FTAG, &os); if (error) goto out_doi; error = dmu_object_info(os, ZVOL_OBJ, doi); if (error) goto out_dmu_objset_disown; error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); if (error) goto out_dmu_objset_disown; zv = zvol_alloc(MKDEV(zvol_major, minor), name); if (zv == NULL) { error = SET_ERROR(EAGAIN); goto out_dmu_objset_disown; } zv->zv_hash = hash; if (dmu_objset_is_snapshot(os)) zv->zv_flags |= ZVOL_RDONLY; zv->zv_volblocksize = doi->doi_data_block_size; zv->zv_volsize = volsize; zv->zv_objset = os; set_capacity(zv->zv_disk, zv->zv_volsize >> 9); blk_queue_max_hw_sectors(zv->zv_queue, (DMU_MAX_ACCESS / 4) >> 9); blk_queue_max_segments(zv->zv_queue, UINT16_MAX); blk_queue_max_segment_size(zv->zv_queue, UINT_MAX); blk_queue_physical_block_size(zv->zv_queue, zv->zv_volblocksize); blk_queue_io_opt(zv->zv_queue, zv->zv_volblocksize); blk_queue_max_discard_sectors(zv->zv_queue, (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9); blk_queue_discard_granularity(zv->zv_queue, zv->zv_volblocksize); queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, zv->zv_queue); #ifdef QUEUE_FLAG_NONROT queue_flag_set_unlocked(QUEUE_FLAG_NONROT, zv->zv_queue); #endif #ifdef QUEUE_FLAG_ADD_RANDOM queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, zv->zv_queue); #endif if (spa_writeable(dmu_objset_spa(os))) { if (zil_replay_disable) zil_destroy(dmu_objset_zil(os), B_FALSE); else zil_replay(os, zv, zvol_replay_vector); } /* * When udev detects the addition of the device it will immediately * invoke blkid(8) to determine the type of content on the device. * Prefetching the blocks commonly scanned by blkid(8) will speed * up this process. */ len = MIN(MAX(zvol_prefetch_bytes, 0), SPA_MAXBLOCKSIZE); if (len > 0) { dmu_prefetch(os, ZVOL_OBJ, 0, 0, len, ZIO_PRIORITY_SYNC_READ); dmu_prefetch(os, ZVOL_OBJ, 0, volsize - len, len, ZIO_PRIORITY_SYNC_READ); } zv->zv_objset = NULL; out_dmu_objset_disown: dmu_objset_disown(os, FTAG); out_doi: kmem_free(doi, sizeof (dmu_object_info_t)); out: if (error == 0) { zvol_insert(zv); /* * Drop the lock to prevent deadlock with sys_open() -> * zvol_open(), which first takes bd_disk->bd_mutex and then * takes zvol_state_lock, whereas this code path first takes * zvol_state_lock, and then takes bd_disk->bd_mutex. */ mutex_exit(&zvol_state_lock); add_disk(zv->zv_disk); } else { mutex_exit(&zvol_state_lock); ida_simple_remove(&zvol_ida, idx); } return (SET_ERROR(error)); } /* * Rename a block device minor node for the specified volume.
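 * Only in-core state is updated: the name, the hash table linkage, and * the read-only flip described below, which forces udev to re-create the * /dev/zvol symlinks under the new name.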
*/ static void zvol_rename_minor(zvol_state_t *zv, const char *newname) { int readonly = get_disk_ro(zv->zv_disk); ASSERT(MUTEX_HELD(&zvol_state_lock)); rw_enter(&zv->zv_suspend_lock, RW_READER); strlcpy(zv->zv_name, newname, sizeof (zv->zv_name)); rw_exit(&zv->zv_suspend_lock); /* move to new hashtable entry */ zv->zv_hash = zvol_name_hash(zv->zv_name); hlist_del(&zv->zv_hlink); hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash)); /* * The block device's read-only state is briefly changed causing * a KOBJ_CHANGE uevent to be issued. This ensures udev detects * the name change and fixes the symlinks. This does not change * ZVOL_RDONLY in zv->zv_flags so the actual read-only state never * changes. This would normally be done using kobject_uevent() but * that is a GPL-only symbol which is why we need this workaround. */ set_disk_ro(zv->zv_disk, !readonly); set_disk_ro(zv->zv_disk, readonly); } typedef struct minors_job { list_t *list; list_node_t link; /* input */ char *name; /* output */ int error; } minors_job_t; /* * Prefetch zvol dnodes for the minors_job */ static void zvol_prefetch_minors_impl(void *arg) { minors_job_t *job = arg; char *dsname = job->name; objset_t *os = NULL; job->error = dmu_objset_own(dsname, DMU_OST_ZVOL, B_TRUE, FTAG, &os); if (job->error == 0) { dmu_prefetch(os, ZVOL_OBJ, 0, 0, 0, ZIO_PRIORITY_SYNC_READ); dmu_objset_disown(os, FTAG); } } /* * Mask errors to continue dmu_objset_find() traversal */ static int zvol_create_snap_minor_cb(const char *dsname, void *arg) { minors_job_t *j = arg; list_t *minors_list = j->list; const char *name = j->name; ASSERT0(MUTEX_HELD(&spa_namespace_lock)); /* skip the designated dataset */ if (name && strcmp(dsname, name) == 0) return (0); /* at this point, the dsname should name a snapshot */ if (strchr(dsname, '@') == 0) { dprintf("zvol_create_snap_minor_cb(): " "%s is not a snapshot name\n", dsname); } else { minors_job_t *job; char *n = strdup(dsname); if (n == NULL) return (0); job = kmem_alloc(sizeof (minors_job_t), KM_SLEEP); job->name = n; job->list = minors_list; job->error = 0; list_insert_tail(minors_list, job); /* don't care if dispatch fails, because job->error is 0 */ taskq_dispatch(system_taskq, zvol_prefetch_minors_impl, job, TQ_SLEEP); } return (0); } /* * Mask errors to continue dmu_objset_find() traversal */ static int zvol_create_minors_cb(const char *dsname, void *arg) { uint64_t snapdev; int error; list_t *minors_list = arg; ASSERT0(MUTEX_HELD(&spa_namespace_lock)); error = dsl_prop_get_integer(dsname, "snapdev", &snapdev, NULL); if (error) return (0); /* * Given the name and the 'snapdev' property, create device minor nodes * with the linkages to zvols/snapshots as needed. * If the name represents a zvol, create a minor node for the zvol, then * check if its snapshots are 'visible', and if so, iterate over the * snapshots and create device minor nodes for those.
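 * A name containing '@' is not handled here; snapshots are only reached * via zvol_create_snap_minor_cb() when the parent zvol's 'snapdev' * property is 'visible'.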
*/ if (strchr(dsname, '@') == 0) { minors_job_t *job; char *n = strdup(dsname); if (n == NULL) return (0); job = kmem_alloc(sizeof (minors_job_t), KM_SLEEP); job->name = n; job->list = minors_list; job->error = 0; list_insert_tail(minors_list, job); /* don't care if dispatch fails, because job->error is 0 */ taskq_dispatch(system_taskq, zvol_prefetch_minors_impl, job, TQ_SLEEP); if (snapdev == ZFS_SNAPDEV_VISIBLE) { /* * traverse snapshots only, do not traverse children, * and skip the 'dsname' */ error = dmu_objset_find((char *)dsname, zvol_create_snap_minor_cb, (void *)job, DS_FIND_SNAPSHOTS); } } else { dprintf("zvol_create_minors_cb(): %s is not a zvol name\n", dsname); } return (0); } /* * Create minors for the specified dataset, including children and snapshots. * Pay attention to the 'snapdev' property and iterate over the snapshots * only if they are 'visible'. This approach ensures that the snapshot * metadata is read from disk only when it is needed. * * The name can represent a dataset to be recursively scanned for zvols and * their snapshots, or a single zvol snapshot. If the name represents a * dataset, the scan is performed in two nested stages: * - scan the dataset for zvols, and * - for each zvol, create a minor node, then check if the zvol's snapshots * are 'visible', and only then iterate over the snapshots if needed * * If the name represents a snapshot, a check is performed if the snapshot is * 'visible' (which also verifies that the parent is a zvol), and if so, * a minor node for that snapshot is created. */ static int zvol_create_minors_impl(const char *name) { int error = 0; fstrans_cookie_t cookie; char *atp, *parent; list_t minors_list; minors_job_t *job; if (zvol_inhibit_dev) return (0); /* * This is the list for prefetch jobs. Whenever we find a match * during dmu_objset_find(), we insert a minors_job into the list and * dispatch a taskq job to prefetch the zvol dnodes in parallel. Note * that we don't need any locking because all list operations are done * on the current thread. * * We will use this list to call zvol_create_minor_impl() after the * prefetch completes, so we don't have to traverse using * dmu_objset_find() again. */ list_create(&minors_list, sizeof (minors_job_t), offsetof(minors_job_t, link)); parent = kmem_alloc(MAXPATHLEN, KM_SLEEP); (void) strlcpy(parent, name, MAXPATHLEN); if ((atp = strrchr(parent, '@')) != NULL) { uint64_t snapdev; *atp = '\0'; error = dsl_prop_get_integer(parent, "snapdev", &snapdev, NULL); if (error == 0 && snapdev == ZFS_SNAPDEV_VISIBLE) error = zvol_create_minor_impl(name); } else { cookie = spl_fstrans_mark(); error = dmu_objset_find(parent, zvol_create_minors_cb, &minors_list, DS_FIND_CHILDREN); spl_fstrans_unmark(cookie); } kmem_free(parent, MAXPATHLEN); taskq_wait_outstanding(system_taskq, 0); /* * Prefetch is complete; we can now call zvol_create_minor_impl() * sequentially. */ while ((job = list_head(&minors_list)) != NULL) { list_remove(&minors_list, job); if (!job->error) zvol_create_minor_impl(job->name); strfree(job->name); kmem_free(job, sizeof (minors_job_t)); } list_destroy(&minors_list); return (SET_ERROR(error)); } /* * Remove minors for specified dataset including children and snapshots. */ static void zvol_remove_minors_impl(const char *name) { zvol_state_t *zv, *zv_next; int namelen = ((name) ?
strlen(name) : 0); taskqid_t t, tid = TASKQID_INVALID; if (zvol_inhibit_dev) return; mutex_enter(&zvol_state_lock); for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) { zv_next = list_next(&zvol_state_list, zv); if (name == NULL || strcmp(zv->zv_name, name) == 0 || (strncmp(zv->zv_name, name, namelen) == 0 && (zv->zv_name[namelen] == '/' || zv->zv_name[namelen] == '@'))) { /* If in use, leave alone */ if (zv->zv_open_count > 0 || atomic_read(&zv->zv_suspend_ref)) continue; zvol_remove(zv); /* clear this so zvol_open won't open it */ zv->zv_disk->private_data = NULL; /* try parallel zv_free; if dispatch fails, do it in place */ t = taskq_dispatch(system_taskq, zvol_free_impl, zv, TQ_SLEEP); if (t == TASKQID_INVALID) zvol_free(zv); else tid = t; } } mutex_exit(&zvol_state_lock); if (tid != TASKQID_INVALID) taskq_wait_outstanding(system_taskq, tid); } /* Remove minor for this specific snapshot only */ static void zvol_remove_minor_impl(const char *name) { zvol_state_t *zv, *zv_next; if (zvol_inhibit_dev) return; if (strchr(name, '@') == NULL) return; mutex_enter(&zvol_state_lock); for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) { zv_next = list_next(&zvol_state_list, zv); if (strcmp(zv->zv_name, name) == 0) { /* If in use, leave alone */ if (zv->zv_open_count > 0 || atomic_read(&zv->zv_suspend_ref)) continue; zvol_remove(zv); zvol_free(zv); break; } } mutex_exit(&zvol_state_lock); } /* * Rename minors for specified dataset including children and snapshots. */ static void zvol_rename_minors_impl(const char *oldname, const char *newname) { zvol_state_t *zv, *zv_next; int oldnamelen, newnamelen; char *name; if (zvol_inhibit_dev) return; oldnamelen = strlen(oldname); newnamelen = strlen(newname); name = kmem_alloc(MAXNAMELEN, KM_SLEEP); mutex_enter(&zvol_state_lock); for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) { zv_next = list_next(&zvol_state_list, zv); /* If in use, leave alone */ if (zv->zv_open_count > 0) continue; if (strcmp(zv->zv_name, oldname) == 0) { zvol_rename_minor(zv, newname); } else if (strncmp(zv->zv_name, oldname, oldnamelen) == 0 && (zv->zv_name[oldnamelen] == '/' || zv->zv_name[oldnamelen] == '@')) { snprintf(name, MAXNAMELEN, "%s%c%s", newname, zv->zv_name[oldnamelen], zv->zv_name + oldnamelen + 1); zvol_rename_minor(zv, name); } } mutex_exit(&zvol_state_lock); kmem_free(name, MAXNAMELEN); } typedef struct zvol_snapdev_cb_arg { uint64_t snapdev; } zvol_snapdev_cb_arg_t; static int -zvol_set_snapdev_cb(const char *dsname, void *param) { +zvol_set_snapdev_cb(const char *dsname, void *param) +{ zvol_snapdev_cb_arg_t *arg = param; if (strchr(dsname, '@') == NULL) return (0); switch (arg->snapdev) { case ZFS_SNAPDEV_VISIBLE: (void) zvol_create_minor_impl(dsname); break; case ZFS_SNAPDEV_HIDDEN: (void) zvol_remove_minor_impl(dsname); break; } return (0); } static void zvol_set_snapdev_impl(char *name, uint64_t snapdev) { zvol_snapdev_cb_arg_t arg = {snapdev}; fstrans_cookie_t cookie = spl_fstrans_mark(); /* * zvol_set_snapdev_sync() sets snapdev appropriately * in the dataset hierarchy. Here, we only scan snapshots. */ dmu_objset_find(name, zvol_set_snapdev_cb, &arg, DS_FIND_SNAPSHOTS); spl_fstrans_unmark(cookie); } static zvol_task_t * zvol_task_alloc(zvol_async_op_t op, const char *name1, const char *name2, uint64_t snapdev) { zvol_task_t *task; char *delim; /* Never allow tasks on hidden names.
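 * Hidden names begin with '$' (for example the internal $ORIGIN dataset) * and exist only inside the DSL; no minors are ever created for them.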
*/ if (name1[0] == '$') return (NULL); task = kmem_zalloc(sizeof (zvol_task_t), KM_SLEEP); task->op = op; task->snapdev = snapdev; delim = strchr(name1, '/'); strlcpy(task->pool, name1, delim ? (delim - name1 + 1) : MAXNAMELEN); strlcpy(task->name1, name1, MAXNAMELEN); if (name2 != NULL) strlcpy(task->name2, name2, MAXNAMELEN); return (task); } static void zvol_task_free(zvol_task_t *task) { kmem_free(task, sizeof (zvol_task_t)); } /* * The worker function performed asynchronously by the spa_zvol_taskq. */ static void zvol_task_cb(void *param) { zvol_task_t *task = (zvol_task_t *)param; switch (task->op) { case ZVOL_ASYNC_CREATE_MINORS: (void) zvol_create_minors_impl(task->name1); break; case ZVOL_ASYNC_REMOVE_MINORS: zvol_remove_minors_impl(task->name1); break; case ZVOL_ASYNC_RENAME_MINORS: zvol_rename_minors_impl(task->name1, task->name2); break; case ZVOL_ASYNC_SET_SNAPDEV: zvol_set_snapdev_impl(task->name1, task->snapdev); break; default: VERIFY(0); break; } zvol_task_free(task); } typedef struct zvol_set_snapdev_arg { const char *zsda_name; uint64_t zsda_value; zprop_source_t zsda_source; dmu_tx_t *zsda_tx; } zvol_set_snapdev_arg_t; /* * Sanity check the dataset for safe use by the sync task. No additional * conditions are imposed. */ static int zvol_set_snapdev_check(void *arg, dmu_tx_t *tx) { zvol_set_snapdev_arg_t *zsda = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dir_t *dd; int error; error = dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL); if (error != 0) return (error); dsl_dir_rele(dd, FTAG); return (error); } static int zvol_set_snapdev_sync_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) { zvol_set_snapdev_arg_t *zsda = arg; char dsname[MAXNAMELEN]; zvol_task_t *task; dsl_dataset_name(ds, dsname); dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_SNAPDEV), zsda->zsda_source, sizeof (zsda->zsda_value), 1, &zsda->zsda_value, zsda->zsda_tx); task = zvol_task_alloc(ZVOL_ASYNC_SET_SNAPDEV, dsname, NULL, zsda->zsda_value); if (task == NULL) return (0); (void) taskq_dispatch(dp->dp_spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP); return (0); } /* * Traverse all child snapshot datasets and apply snapdev appropriately.
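 * The property value itself is set synchronously in the sync task via * dsl_prop_set_sync_impl(); the minor creation and removal it implies is * handed off to spa_zvol_taskq as ZVOL_ASYNC_SET_SNAPDEV tasks from * zvol_set_snapdev_sync_cb().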
*/ static void zvol_set_snapdev_sync(void *arg, dmu_tx_t *tx) { zvol_set_snapdev_arg_t *zsda = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dir_t *dd; VERIFY0(dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL)); zsda->zsda_tx = tx; dmu_objset_find_dp(dp, dd->dd_object, zvol_set_snapdev_sync_cb, zsda, DS_FIND_CHILDREN); dsl_dir_rele(dd, FTAG); } int zvol_set_snapdev(const char *ddname, zprop_source_t source, uint64_t snapdev) { zvol_set_snapdev_arg_t zsda; zsda.zsda_name = ddname; zsda.zsda_source = source; zsda.zsda_value = snapdev; return (dsl_sync_task(ddname, zvol_set_snapdev_check, zvol_set_snapdev_sync, &zsda, 0, ZFS_SPACE_CHECK_NONE)); } void zvol_create_minors(spa_t *spa, const char *name, boolean_t async) { zvol_task_t *task; taskqid_t id; task = zvol_task_alloc(ZVOL_ASYNC_CREATE_MINORS, name, NULL, ~0ULL); if (task == NULL) return; id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP); if ((async == B_FALSE) && (id != TASKQID_INVALID)) taskq_wait_id(spa->spa_zvol_taskq, id); } void zvol_remove_minors(spa_t *spa, const char *name, boolean_t async) { zvol_task_t *task; taskqid_t id; task = zvol_task_alloc(ZVOL_ASYNC_REMOVE_MINORS, name, NULL, ~0ULL); if (task == NULL) return; id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP); if ((async == B_FALSE) && (id != TASKQID_INVALID)) taskq_wait_id(spa->spa_zvol_taskq, id); } void zvol_rename_minors(spa_t *spa, const char *name1, const char *name2, boolean_t async) { zvol_task_t *task; taskqid_t id; task = zvol_task_alloc(ZVOL_ASYNC_RENAME_MINORS, name1, name2, ~0ULL); if (task == NULL) return; id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP); if ((async == B_FALSE) && (id != TASKQID_INVALID)) taskq_wait_id(spa->spa_zvol_taskq, id); } int zvol_init(void) { int i, error; list_create(&zvol_state_list, sizeof (zvol_state_t), offsetof(zvol_state_t, zv_next)); mutex_init(&zvol_state_lock, NULL, MUTEX_DEFAULT, NULL); zvol_htable = kmem_alloc(ZVOL_HT_SIZE * sizeof (struct hlist_head), KM_SLEEP); if (!zvol_htable) { error = ENOMEM; goto out; } for (i = 0; i < ZVOL_HT_SIZE; i++) INIT_HLIST_HEAD(&zvol_htable[i]); error = register_blkdev(zvol_major, ZVOL_DRIVER); if (error) { printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error); goto out_free; } blk_register_region(MKDEV(zvol_major, 0), 1UL << MINORBITS, THIS_MODULE, zvol_probe, NULL, NULL); return (0); out_free: kmem_free(zvol_htable, ZVOL_HT_SIZE * sizeof (struct hlist_head)); out: mutex_destroy(&zvol_state_lock); list_destroy(&zvol_state_list); return (SET_ERROR(error)); } void zvol_fini(void) { zvol_remove_minors_impl(NULL); blk_unregister_region(MKDEV(zvol_major, 0), 1UL << MINORBITS); unregister_blkdev(zvol_major, ZVOL_DRIVER); kmem_free(zvol_htable, ZVOL_HT_SIZE * sizeof (struct hlist_head)); list_destroy(&zvol_state_list); mutex_destroy(&zvol_state_lock); ida_destroy(&zvol_ida); } /* BEGIN CSTYLED */ module_param(zvol_inhibit_dev, uint, 0644); MODULE_PARM_DESC(zvol_inhibit_dev, "Do not create zvol device nodes"); module_param(zvol_major, uint, 0444); MODULE_PARM_DESC(zvol_major, "Major number for zvol device"); module_param(zvol_max_discard_blocks, ulong, 0444); MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard"); module_param(zvol_prefetch_bytes, uint, 0644); MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end"); /* END CSTYLED */ diff --git a/module/zpios/pios.c b/module/zpios/pios.c index 297d35bba949..c70c0d6f1c9e 100644 --- a/module/zpios/pios.c +++ b/module/zpios/pios.c @@ 
-1,1298 +1,1298 @@ /* * ZPIOS is a heavily modified version of the original PIOS test code. * It is designed to have the test code running in the Linux kernel * against ZFS while still being flexibly controlled from user space. * * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Brian Behlendorf . * LLNL-CODE-403049 * * Original PIOS Test Code * Copyright (C) 2004 Cluster File Systems, Inc. * Written by Peter Braam * Atul Vidwansa * Milind Dumbare * * This file is part of ZFS on Linux. * For details, see . * * ZPIOS is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation; either version 2 of the License, or (at your * option) any later version. * * ZPIOS is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * for more details. * * You should have received a copy of the GNU General Public License along * with ZPIOS. If not, see . * * Copyright (c) 2015, Intel Corporation. */ #include #include #include #include #include #include #include "zpios-internal.h" static char *zpios_tag = "zpios_tag"; static int zpios_upcall(char *path, char *phase, run_args_t *run_args, int rc) { /* * This is stack heavy but it should be OK since we are only * making the upcall between tests when the stack is shallow. */ char id[16], chunk_size[16], region_size[16], thread_count[16]; char region_count[16], offset[16], region_noise[16], chunk_noise[16]; char thread_delay[16], flags[16], result[8]; char *argv[16], *envp[4]; if ((path == NULL) || (strlen(path) == 0)) return (-ENOENT); snprintf(id, 15, "%d", run_args->id); snprintf(chunk_size, 15, "%lu", (long unsigned)run_args->chunk_size); snprintf(region_size, 15, "%lu", (long unsigned) run_args->region_size); snprintf(thread_count, 15, "%u", run_args->thread_count); snprintf(region_count, 15, "%u", run_args->region_count); snprintf(offset, 15, "%lu", (long unsigned)run_args->offset); snprintf(region_noise, 15, "%u", run_args->region_noise); snprintf(chunk_noise, 15, "%u", run_args->chunk_noise); snprintf(thread_delay, 15, "%u", run_args->thread_delay); snprintf(flags, 15, "0x%x", run_args->flags); snprintf(result, 7, "%d", rc); /* Passing 15 args to registered pre/post upcall */ argv[0] = path; argv[1] = phase; argv[2] = strlen(run_args->log) ? run_args->log : ""; argv[3] = id; argv[4] = run_args->pool; argv[5] = chunk_size; argv[6] = region_size; argv[7] = thread_count; argv[8] = region_count; argv[9] = offset; argv[10] = region_noise; argv[11] = chunk_noise; argv[12] = thread_delay; argv[13] = flags; argv[14] = result; argv[15] = NULL; /* Passing environment for user space upcall */ envp[0] = "HOME=/"; envp[1] = "TERM=linux"; envp[2] = "PATH=/sbin:/usr/sbin:/bin:/usr/bin"; envp[3] = NULL; return (call_usermodehelper(path, argv, envp, UMH_WAIT_PROC)); } static int zpios_print(struct file *file, const char *format, ...) 
{ zpios_info_t *info = (zpios_info_t *)file->private_data; va_list adx; int rc; ASSERT(info); ASSERT(info->info_buffer); va_start(adx, format); spin_lock(&info->info_lock); /* Don't allow the kernel to start a write in the red zone */ if ((int)(info->info_head - info->info_buffer) > (info->info_size - ZPIOS_INFO_BUFFER_REDZONE)) { rc = -EOVERFLOW; } else { rc = vsprintf(info->info_head, format, adx); if (rc >= 0) info->info_head += rc; } spin_unlock(&info->info_lock); va_end(adx); return (rc); } static uint64_t zpios_dmu_object_create(run_args_t *run_args, objset_t *os) { struct dmu_tx *tx; uint64_t obj = 0ULL; uint64_t blksize = run_args->block_size; int rc; if (blksize < SPA_MINBLOCKSIZE || blksize > spa_maxblocksize(dmu_objset_spa(os)) || !ISP2(blksize)) { zpios_print(run_args->file, "invalid block size for pool: %d\n", (int)blksize); return (obj); } tx = dmu_tx_create(os); dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, OBJ_SIZE); rc = dmu_tx_assign(tx, TXG_WAIT); if (rc) { zpios_print(run_args->file, "dmu_tx_assign() failed: %d\n", rc); dmu_tx_abort(tx); return (obj); } obj = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0, DMU_OT_NONE, 0, tx); rc = dmu_object_set_blocksize(os, obj, blksize, 0, tx); if (rc) { zpios_print(run_args->file, "dmu_object_set_blocksize to %d failed: %d\n", (int)blksize, rc); dmu_tx_abort(tx); return (obj); } dmu_tx_commit(tx); return (obj); } static int zpios_dmu_object_free(run_args_t *run_args, objset_t *os, uint64_t obj) { struct dmu_tx *tx; int rc; tx = dmu_tx_create(os); dmu_tx_hold_free(tx, obj, 0, DMU_OBJECT_END); rc = dmu_tx_assign(tx, TXG_WAIT); if (rc) { zpios_print(run_args->file, "dmu_tx_assign() failed: %d\n", rc); dmu_tx_abort(tx); return (rc); } rc = dmu_object_free(os, obj, tx); if (rc) { zpios_print(run_args->file, "dmu_object_free() failed: %d\n", rc); dmu_tx_abort(tx); return (rc); } dmu_tx_commit(tx); return (0); } static int zpios_dmu_setup(run_args_t *run_args) { zpios_time_t *t = &(run_args->stats.cr_time); objset_t *os; char name[32]; uint64_t obj = 0ULL; int i, rc = 0, rc2; (void) zpios_upcall(run_args->pre, PHASE_PRE_CREATE, run_args, 0); t->start = zpios_timespec_now(); (void) snprintf(name, 32, "%s/id_%d", run_args->pool, run_args->id); rc = dmu_objset_create(name, DMU_OST_OTHER, 0, NULL, NULL); if (rc) { zpios_print(run_args->file, "Error dmu_objset_create(%s, ...) " "failed: %d\n", name, rc); goto out; } rc = dmu_objset_own(name, DMU_OST_OTHER, 0, zpios_tag, &os); if (rc) { zpios_print(run_args->file, "Error dmu_objset_own(%s, ...) 
" "failed: %d\n", name, rc); goto out_destroy; } if (!(run_args->flags & DMU_FPP)) { obj = zpios_dmu_object_create(run_args, os); if (obj == 0) { rc = -EBADF; zpios_print(run_args->file, "Error zpios_dmu_" "object_create() failed, %d\n", rc); goto out_destroy; } } for (i = 0; i < run_args->region_count; i++) { zpios_region_t *region; region = &run_args->regions[i]; mutex_init(®ion->lock, NULL, MUTEX_DEFAULT, NULL); if (run_args->flags & DMU_FPP) { /* File per process */ region->obj.os = os; region->obj.obj = zpios_dmu_object_create(run_args, os); ASSERT(region->obj.obj > 0); /* XXX - Handle this */ region->wr_offset = run_args->offset; region->rd_offset = run_args->offset; region->init_offset = run_args->offset; region->max_offset = run_args->offset + run_args->region_size; } else { /* Single shared file */ region->obj.os = os; region->obj.obj = obj; region->wr_offset = run_args->offset * i; region->rd_offset = run_args->offset * i; region->init_offset = run_args->offset * i; region->max_offset = run_args->offset * i + run_args->region_size; } } run_args->os = os; out_destroy: if (rc) { rc2 = dsl_destroy_head(name); if (rc2) zpios_print(run_args->file, "Error dsl_destroy_head" "(%s, ...) failed: %d\n", name, rc2); } out: t->stop = zpios_timespec_now(); t->delta = zpios_timespec_sub(t->stop, t->start); (void) zpios_upcall(run_args->post, PHASE_POST_CREATE, run_args, rc); return (rc); } static int zpios_setup_run(run_args_t **run_args, zpios_cmd_t *kcmd, struct file *file) { run_args_t *ra; int rc, size; size = sizeof (*ra) + kcmd->cmd_region_count * sizeof (zpios_region_t); ra = vmem_zalloc(size, KM_SLEEP); *run_args = ra; strncpy(ra->pool, kcmd->cmd_pool, ZPIOS_NAME_SIZE - 1); strncpy(ra->pre, kcmd->cmd_pre, ZPIOS_PATH_SIZE - 1); strncpy(ra->post, kcmd->cmd_post, ZPIOS_PATH_SIZE - 1); strncpy(ra->log, kcmd->cmd_log, ZPIOS_PATH_SIZE - 1); ra->id = kcmd->cmd_id; ra->chunk_size = kcmd->cmd_chunk_size; ra->thread_count = kcmd->cmd_thread_count; ra->region_count = kcmd->cmd_region_count; ra->region_size = kcmd->cmd_region_size; ra->offset = kcmd->cmd_offset; ra->region_noise = kcmd->cmd_region_noise; ra->chunk_noise = kcmd->cmd_chunk_noise; ra->thread_delay = kcmd->cmd_thread_delay; ra->flags = kcmd->cmd_flags; ra->block_size = kcmd->cmd_block_size; ra->stats.wr_data = 0; ra->stats.wr_chunks = 0; ra->stats.rd_data = 0; ra->stats.rd_chunks = 0; ra->region_next = 0; ra->file = file; mutex_init(&ra->lock_work, NULL, MUTEX_DEFAULT, NULL); mutex_init(&ra->lock_ctl, NULL, MUTEX_DEFAULT, NULL); (void) zpios_upcall(ra->pre, PHASE_PRE_RUN, ra, 0); rc = zpios_dmu_setup(ra); if (rc) { mutex_destroy(&ra->lock_ctl); mutex_destroy(&ra->lock_work); vmem_free(ra, size); *run_args = NULL; } return (rc); } static int zpios_get_work_item(run_args_t *run_args, dmu_obj_t *obj, __u64 *offset, - __u32 *chunk_size, zpios_region_t **region, __u32 flags) + __u32 *chunk_size, zpios_region_t **region, __u32 flags) { int i, j, count = 0; unsigned int random_int; get_random_bytes(&random_int, sizeof (unsigned int)); mutex_enter(&run_args->lock_work); i = run_args->region_next; /* * XXX: I don't much care for this chunk selection mechansim * there's the potential to burn a lot of time here doing nothing * useful while holding the global lock. This could give some * misleading performance results. I'll fix it latter. 
*/ while (count < run_args->region_count) { __u64 *rw_offset; zpios_time_t *rw_time; j = i % run_args->region_count; *region = &(run_args->regions[j]); if (flags & DMU_WRITE) { rw_offset = &((*region)->wr_offset); rw_time = &((*region)->stats.wr_time); } else { rw_offset = &((*region)->rd_offset); rw_time = &((*region)->stats.rd_time); } /* test if region is fully written */ if (*rw_offset + *chunk_size > (*region)->max_offset) { i++; count++; if (unlikely(rw_time->stop.ts_sec == 0) && unlikely(rw_time->stop.ts_nsec == 0)) rw_time->stop = zpios_timespec_now(); continue; } *offset = *rw_offset; *obj = (*region)->obj; *rw_offset += *chunk_size; /* update ctl structure */ if (run_args->region_noise) { get_random_bytes(&random_int, sizeof (unsigned int)); run_args->region_next += random_int % run_args->region_noise; } else { run_args->region_next++; } mutex_exit(&run_args->lock_work); return (1); } /* nothing left to do */ mutex_exit(&run_args->lock_work); return (0); } static void zpios_remove_objset(run_args_t *run_args) { zpios_time_t *t = &(run_args->stats.rm_time); zpios_region_t *region; char name[32]; int rc = 0, i; (void) zpios_upcall(run_args->pre, PHASE_PRE_REMOVE, run_args, 0); t->start = zpios_timespec_now(); (void) snprintf(name, 32, "%s/id_%d", run_args->pool, run_args->id); if (run_args->flags & DMU_REMOVE) { if (run_args->flags & DMU_FPP) { for (i = 0; i < run_args->region_count; i++) { region = &run_args->regions[i]; rc = zpios_dmu_object_free(run_args, region->obj.os, region->obj.obj); if (rc) zpios_print(run_args->file, "Error removing object %d, %d\n", (int)region->obj.obj, rc); } } else { region = &run_args->regions[0]; rc = zpios_dmu_object_free(run_args, region->obj.os, region->obj.obj); if (rc) zpios_print(run_args->file, "Error removing object %d, %d\n", (int)region->obj.obj, rc); } } dmu_objset_disown(run_args->os, zpios_tag); if (run_args->flags & DMU_REMOVE) { rc = dsl_destroy_head(name); if (rc) zpios_print(run_args->file, "Error dsl_destroy_head" "(%s, ...) 
static void
zpios_remove_objset(run_args_t *run_args)
{
	zpios_time_t *t = &(run_args->stats.rm_time);
	zpios_region_t *region;
	char name[32];
	int rc = 0, i;

	(void) zpios_upcall(run_args->pre, PHASE_PRE_REMOVE, run_args, 0);
	t->start = zpios_timespec_now();

	(void) snprintf(name, 32, "%s/id_%d", run_args->pool, run_args->id);

	if (run_args->flags & DMU_REMOVE) {
		if (run_args->flags & DMU_FPP) {
			for (i = 0; i < run_args->region_count; i++) {
				region = &run_args->regions[i];
				rc = zpios_dmu_object_free(run_args,
				    region->obj.os, region->obj.obj);
				if (rc)
					zpios_print(run_args->file,
					    "Error removing object %d, %d\n",
					    (int)region->obj.obj, rc);
			}
		} else {
			region = &run_args->regions[0];
			rc = zpios_dmu_object_free(run_args,
			    region->obj.os, region->obj.obj);
			if (rc)
				zpios_print(run_args->file,
				    "Error removing object %d, %d\n",
				    (int)region->obj.obj, rc);
		}
	}

	dmu_objset_disown(run_args->os, zpios_tag);

	if (run_args->flags & DMU_REMOVE) {
		rc = dsl_destroy_head(name);
		if (rc)
			zpios_print(run_args->file, "Error dsl_destroy_head"
			    "(%s, ...) failed: %d\n", name, rc);
	}

	t->stop = zpios_timespec_now();
	t->delta = zpios_timespec_sub(t->stop, t->start);
	(void) zpios_upcall(run_args->post, PHASE_POST_REMOVE, run_args, rc);
}

static void
zpios_cleanup_run(run_args_t *run_args)
{
	int i, size = 0;

	if (run_args == NULL)
		return;

	if (run_args->threads != NULL) {
		for (i = 0; i < run_args->thread_count; i++) {
			if (run_args->threads[i]) {
				mutex_destroy(&run_args->threads[i]->lock);
				kmem_free(run_args->threads[i],
				    sizeof (thread_data_t));
			}
		}

		kmem_free(run_args->threads,
		    sizeof (thread_data_t *) * run_args->thread_count);
	}

	for (i = 0; i < run_args->region_count; i++)
		mutex_destroy(&run_args->regions[i].lock);

	mutex_destroy(&run_args->lock_work);
	mutex_destroy(&run_args->lock_ctl);
	size = run_args->region_count * sizeof (zpios_region_t);

	vmem_free(run_args, sizeof (*run_args) + size);
}

static int
zpios_dmu_write(run_args_t *run_args, objset_t *os, uint64_t object,
-	uint64_t offset, uint64_t size, const void *buf)
+    uint64_t offset, uint64_t size, const void *buf)
{
	struct dmu_tx *tx;
	int rc, how = TXG_WAIT;
//	int flags = 0;

	if (run_args->flags & DMU_WRITE_NOWAIT)
		how = TXG_NOWAIT;

	while (1) {
		tx = dmu_tx_create(os);
		dmu_tx_hold_write(tx, object, offset, size);
		rc = dmu_tx_assign(tx, how);

		if (rc) {
			if (rc == ERESTART && how == TXG_NOWAIT) {
				dmu_tx_wait(tx);
				dmu_tx_abort(tx);
				continue;
			}
			zpios_print(run_args->file,
			    "Error in dmu_tx_assign(), %d", rc);
			dmu_tx_abort(tx);
			return (rc);
		}
		break;
	}

//	if (run_args->flags & DMU_WRITE_ZC)
//		flags |= DMU_WRITE_ZEROCOPY;

	dmu_write(os, object, offset, size, buf, tx);
	dmu_tx_commit(tx);

	return (0);
}

static int
zpios_dmu_read(run_args_t *run_args, objset_t *os, uint64_t object,
    uint64_t offset, uint64_t size, void *buf)
{
	int flags = 0;

//	if (run_args->flags & DMU_READ_ZC)
//		flags |= DMU_READ_ZEROCOPY;

	if (run_args->flags & DMU_READ_NOPF)
		flags |= DMU_READ_NO_PREFETCH;

	return (dmu_read(os, object, offset, size, buf, flags));
}

static int
zpios_thread_main(void *data)
{
	thread_data_t *thr = (thread_data_t *)data;
	run_args_t *run_args = thr->run_args;
	zpios_time_t t;
	dmu_obj_t obj;
	__u64 offset;
	__u32 chunk_size;
	zpios_region_t *region;
	char *buf;
	unsigned int random_int;
	int chunk_noise = run_args->chunk_noise;
	int chunk_noise_tmp = 0;
	int thread_delay = run_args->thread_delay;
	int thread_delay_tmp = 0;
	int i, rc = 0;

	if (chunk_noise) {
		get_random_bytes(&random_int, sizeof (unsigned int));
		chunk_noise_tmp = (random_int % (chunk_noise * 2)) -
		    chunk_noise;
	}

	/*
	 * It's OK to vmem_alloc() this memory because it will be copied
	 * into the slab and pointers to the slab copy will be set up in
	 * the bio when the IO is submitted.  This of course is not ideal
	 * since we want a zero-copy IO path if possible.  It would be
	 * nice to have direct access to those slab entries.
	 */
	chunk_size = run_args->chunk_size + chunk_noise_tmp;
	buf = (char *)vmem_alloc(chunk_size, KM_SLEEP);
	ASSERT(buf);
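	/*
	 * Worker thread flow: run the write phase below, then park on the
	 * scheduler until zpios_threads_run() wakes this thread again for
	 * the read phase (or sets thr->rc first to request an early exit).
	 */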
	/* Trivial data verification pattern for now. */
	if (run_args->flags & DMU_VERIFY)
		memset(buf, 'z', chunk_size);

	/* Write phase */
	mutex_enter(&thr->lock);
	thr->stats.wr_time.start = zpios_timespec_now();
	mutex_exit(&thr->lock);

	while (zpios_get_work_item(run_args, &obj, &offset,
	    &chunk_size, &region, DMU_WRITE)) {
		if (thread_delay) {
			get_random_bytes(&random_int, sizeof (unsigned int));
			thread_delay_tmp = random_int % thread_delay;
			set_current_state(TASK_UNINTERRUPTIBLE);
			schedule_timeout(thread_delay_tmp); /* In jiffies */
		}

		t.start = zpios_timespec_now();
		rc = zpios_dmu_write(run_args, obj.os, obj.obj,
		    offset, chunk_size, buf);
		t.stop = zpios_timespec_now();
		t.delta = zpios_timespec_sub(t.stop, t.start);

		if (rc) {
			zpios_print(run_args->file, "IO error while doing "
			    "dmu_write(): %d\n", rc);
			break;
		}

		mutex_enter(&thr->lock);
		thr->stats.wr_data += chunk_size;
		thr->stats.wr_chunks++;
		thr->stats.wr_time.delta = zpios_timespec_add(
		    thr->stats.wr_time.delta, t.delta);
		mutex_exit(&thr->lock);

		mutex_enter(&region->lock);
		region->stats.wr_data += chunk_size;
		region->stats.wr_chunks++;
		region->stats.wr_time.delta = zpios_timespec_add(
		    region->stats.wr_time.delta, t.delta);

		/* First time region was accessed */
		if (region->init_offset == offset)
			region->stats.wr_time.start = t.start;
		mutex_exit(&region->lock);
	}

	mutex_enter(&run_args->lock_ctl);
	run_args->threads_done++;
	mutex_exit(&run_args->lock_ctl);

	mutex_enter(&thr->lock);
	thr->rc = rc;
	thr->stats.wr_time.stop = zpios_timespec_now();
	mutex_exit(&thr->lock);
	wake_up(&run_args->waitq);
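	/*
	 * Park until the control thread wakes us for the read phase; if
	 * the run is being aborted thr->rc is set non-zero before the
	 * wake up and we exit instead.
	 */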
	set_current_state(TASK_UNINTERRUPTIBLE);
	schedule();

	/* Check if we should exit */
	mutex_enter(&thr->lock);
	rc = thr->rc;
	mutex_exit(&thr->lock);
	if (rc)
		goto out;

	/* Read phase */
	mutex_enter(&thr->lock);
	thr->stats.rd_time.start = zpios_timespec_now();
	mutex_exit(&thr->lock);

	while (zpios_get_work_item(run_args, &obj, &offset,
	    &chunk_size, &region, DMU_READ)) {
		if (thread_delay) {
			get_random_bytes(&random_int, sizeof (unsigned int));
			thread_delay_tmp = random_int % thread_delay;
			set_current_state(TASK_UNINTERRUPTIBLE);
			schedule_timeout(thread_delay_tmp); /* In jiffies */
		}

		if (run_args->flags & DMU_VERIFY)
			memset(buf, 0, chunk_size);

		t.start = zpios_timespec_now();
		rc = zpios_dmu_read(run_args, obj.os, obj.obj,
		    offset, chunk_size, buf);
		t.stop = zpios_timespec_now();
		t.delta = zpios_timespec_sub(t.stop, t.start);

		if (rc) {
			zpios_print(run_args->file, "IO error while doing "
			    "dmu_read(): %d\n", rc);
			break;
		}

		/* Trivial data verification, expensive! */
		if (run_args->flags & DMU_VERIFY) {
			for (i = 0; i < chunk_size; i++) {
				if (buf[i] != 'z') {
					zpios_print(run_args->file,
					    "IO verify error: %d/%d/%d\n",
					    (int)obj.obj, (int)offset,
					    (int)chunk_size);
					break;
				}
			}
		}

		mutex_enter(&thr->lock);
		thr->stats.rd_data += chunk_size;
		thr->stats.rd_chunks++;
		thr->stats.rd_time.delta = zpios_timespec_add(
		    thr->stats.rd_time.delta, t.delta);
		mutex_exit(&thr->lock);

		mutex_enter(&region->lock);
		region->stats.rd_data += chunk_size;
		region->stats.rd_chunks++;
		region->stats.rd_time.delta = zpios_timespec_add(
		    region->stats.rd_time.delta, t.delta);

		/* First time region was accessed */
		if (region->init_offset == offset)
			region->stats.rd_time.start = t.start;
		mutex_exit(&region->lock);
	}

	mutex_enter(&run_args->lock_ctl);
	run_args->threads_done++;
	mutex_exit(&run_args->lock_ctl);

	mutex_enter(&thr->lock);
	thr->rc = rc;
	thr->stats.rd_time.stop = zpios_timespec_now();
	mutex_exit(&thr->lock);
	wake_up(&run_args->waitq);

out:
	vmem_free(buf, chunk_size);
	do_exit(0);

	return (rc); /* Unreachable, due to do_exit() */
}

static int
zpios_thread_done(run_args_t *run_args)
{
	ASSERT(run_args->threads_done <= run_args->thread_count);
	return (run_args->threads_done == run_args->thread_count);
}

static int
zpios_threads_run(run_args_t *run_args)
{
	struct task_struct *tsk, **tsks;
	thread_data_t *thr = NULL;
	zpios_time_t *tt = &(run_args->stats.total_time);
	zpios_time_t *tw = &(run_args->stats.wr_time);
	zpios_time_t *tr = &(run_args->stats.rd_time);
	int i, rc = 0, tc = run_args->thread_count;

	tsks = kmem_zalloc(sizeof (struct task_struct *) * tc, KM_SLEEP);
	run_args->threads = kmem_zalloc(sizeof (thread_data_t *) * tc,
	    KM_SLEEP);

	init_waitqueue_head(&run_args->waitq);
	run_args->threads_done = 0;

	/* Create all the needed threads which will sleep until awoken */
	for (i = 0; i < tc; i++) {
		thr = kmem_zalloc(sizeof (thread_data_t), KM_SLEEP);
		thr->thread_no = i;
		thr->run_args = run_args;
		thr->rc = 0;
		mutex_init(&thr->lock, NULL, MUTEX_DEFAULT, NULL);
		run_args->threads[i] = thr;

		tsk = kthread_create(zpios_thread_main, (void *)thr,
		    "%s/%d", "zpios_io", i);
		if (IS_ERR(tsk)) {
			rc = -EINVAL;
			goto taskerr;
		}

		tsks[i] = tsk;
	}

	tt->start = zpios_timespec_now();

	/* Wake up all threads for write phase */
	(void) zpios_upcall(run_args->pre, PHASE_PRE_WRITE, run_args, 0);
	for (i = 0; i < tc; i++)
		wake_up_process(tsks[i]);

	/* Wait for write phase to complete */
	tw->start = zpios_timespec_now();
	wait_event(run_args->waitq, zpios_thread_done(run_args));
	tw->stop = zpios_timespec_now();
	(void) zpios_upcall(run_args->post, PHASE_POST_WRITE, run_args, rc);

	for (i = 0; i < tc; i++) {
		thr = run_args->threads[i];
		mutex_enter(&thr->lock);

		if (!rc && thr->rc)
			rc = thr->rc;

		run_args->stats.wr_data += thr->stats.wr_data;
		run_args->stats.wr_chunks += thr->stats.wr_chunks;
		mutex_exit(&thr->lock);
	}

	if (rc) {
		/* Wake up all threads and tell them to exit */
		for (i = 0; i < tc; i++) {
			/* Flag each thread, not just the last one created */
			thr = run_args->threads[i];
			mutex_enter(&thr->lock);
			thr->rc = rc;
			mutex_exit(&thr->lock);

			wake_up_process(tsks[i]);
		}
		goto out;
	}

	mutex_enter(&run_args->lock_ctl);
	ASSERT(run_args->threads_done == run_args->thread_count);
	run_args->threads_done = 0;
	mutex_exit(&run_args->lock_ctl);

	/* Wake up all threads for read phase */
	(void) zpios_upcall(run_args->pre, PHASE_PRE_READ, run_args, 0);
	for (i = 0; i < tc; i++)
		wake_up_process(tsks[i]);

	/* Wait for read phase to complete */
	tr->start = zpios_timespec_now();
	wait_event(run_args->waitq, zpios_thread_done(run_args));
	tr->stop = zpios_timespec_now();
	(void) zpios_upcall(run_args->post, PHASE_POST_READ, run_args, rc);
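	/* Fold each thread's read statistics into the aggregate run stats */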
	for (i = 0; i < tc; i++) {
		thr = run_args->threads[i];
		mutex_enter(&thr->lock);

		if (!rc && thr->rc)
			rc = thr->rc;

		run_args->stats.rd_data += thr->stats.rd_data;
		run_args->stats.rd_chunks += thr->stats.rd_chunks;
		mutex_exit(&thr->lock);
	}

out:
	tt->stop = zpios_timespec_now();
	tt->delta = zpios_timespec_sub(tt->stop, tt->start);
	tw->delta = zpios_timespec_sub(tw->stop, tw->start);
	tr->delta = zpios_timespec_sub(tr->stop, tr->start);

cleanup:
	kmem_free(tsks, sizeof (struct task_struct *) * tc);

	return (rc);

taskerr:
	/* Destroy all threads that were created successfully */
	for (i = 0; i < tc; i++)
		if (tsks[i] != NULL)
			(void) kthread_stop(tsks[i]);

	goto cleanup;
}

static int
zpios_do_one_run(struct file *file, zpios_cmd_t *kcmd,
    int data_size, void *data)
{
	run_args_t *run_args = { 0 };
	zpios_stats_t *stats = (zpios_stats_t *)data;
	int i, n, m, size, rc;

	if ((!kcmd->cmd_chunk_size) || (!kcmd->cmd_region_size) ||
	    (!kcmd->cmd_thread_count) || (!kcmd->cmd_region_count)) {
		zpios_print(file, "Invalid chunk_size, region_size, "
		    "thread_count, or region_count, %d\n", -EINVAL);
		return (-EINVAL);
	}

	if (!(kcmd->cmd_flags & DMU_WRITE) ||
	    !(kcmd->cmd_flags & DMU_READ)) {
		zpios_print(file, "Invalid flags, minimally DMU_WRITE "
		    "and DMU_READ must be set, %d\n", -EINVAL);
		return (-EINVAL);
	}

	if ((kcmd->cmd_flags & (DMU_WRITE_ZC | DMU_READ_ZC)) &&
	    (kcmd->cmd_flags & DMU_VERIFY)) {
		zpios_print(file, "Invalid flags, DMU_*_ZC incompatible "
		    "with DMU_VERIFY, used for performance analysis "
		    "only, %d\n", -EINVAL);
		return (-EINVAL);
	}

	/*
	 * Opaque data on return contains structs of the following form:
	 *
	 * zpios_stats_t stats[];
	 * stats[0] = run_args->stats;
	 * stats[1..N] = threads[0..N-1]->stats;
	 * stats[N+1..N+M] = regions[0..M-1]->stats;
	 *
	 * Where N is the number of threads, and M is the number of regions.
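	 *
	 * For example, a run with 4 threads and 8 regions returns
	 * 1 + 4 + 8 = 13 zpios_stats_t entries: index 0 for the run as
	 * a whole, indices 1..4 for the threads, and 5..12 for the
	 * regions.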
	 */
	size = (sizeof (zpios_stats_t) +
	    (kcmd->cmd_thread_count * sizeof (zpios_stats_t)) +
	    (kcmd->cmd_region_count * sizeof (zpios_stats_t)));

	if (data_size < size) {
		zpios_print(file, "Invalid size, command data buffer "
		    "size too small, (%d < %d)\n", data_size, size);
		return (-ENOSPC);
	}

	rc = zpios_setup_run(&run_args, kcmd, file);
	if (rc)
		return (rc);

	rc = zpios_threads_run(run_args);
	zpios_remove_objset(run_args);
	if (rc)
		goto cleanup;

	if (stats) {
		n = 1;
		m = 1 + kcmd->cmd_thread_count;
		stats[0] = run_args->stats;

		for (i = 0; i < kcmd->cmd_thread_count; i++)
			stats[n+i] = run_args->threads[i]->stats;

		for (i = 0; i < kcmd->cmd_region_count; i++)
			stats[m+i] = run_args->regions[i].stats;
	}

cleanup:
	zpios_cleanup_run(run_args);

	(void) zpios_upcall(kcmd->cmd_post, PHASE_POST_RUN, run_args, 0);

	return (rc);
}

static int
zpios_open(struct inode *inode, struct file *file)
{
	zpios_info_t *info;

	info = (zpios_info_t *)kmem_alloc(sizeof (*info), KM_SLEEP);

	spin_lock_init(&info->info_lock);
	info->info_size = ZPIOS_INFO_BUFFER_SIZE;
	info->info_buffer =
	    (char *)vmem_alloc(ZPIOS_INFO_BUFFER_SIZE, KM_SLEEP);
	info->info_head = info->info_buffer;
	file->private_data = (void *)info;

	return (0);
}

static int
zpios_release(struct inode *inode, struct file *file)
{
	zpios_info_t *info = (zpios_info_t *)file->private_data;

	ASSERT(info);
	ASSERT(info->info_buffer);

	/* Free the current size; the buffer may have been resized */
	vmem_free(info->info_buffer, info->info_size);
	kmem_free(info, sizeof (*info));

	return (0);
}

static int
zpios_buffer_clear(struct file *file, zpios_cfg_t *kcfg, unsigned long arg)
{
	zpios_info_t *info = (zpios_info_t *)file->private_data;

	ASSERT(info);
	ASSERT(info->info_buffer);

	spin_lock(&info->info_lock);
	memset(info->info_buffer, 0, info->info_size);
	info->info_head = info->info_buffer;
	spin_unlock(&info->info_lock);

	return (0);
}
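/*
 * A minimal user-space sketch (kept under #if 0, never compiled into
 * the module) of driving the ZPIOS_CFG_BUFFER_SIZE command handled
 * below.  The device path is an assumption based on ZPIOS_NAME; the
 * zpios_cfg_t fields and ZPIOS_* constants are those used in this file.
 */
#if 0
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>

static int
zpios_query_log_size(const char *dev)	/* e.g. "/dev/zpios" */
{
	zpios_cfg_t cfg;
	int fd, rc;

	fd = open(dev, O_RDWR);
	if (fd < 0)
		return (-1);

	memset(&cfg, 0, sizeof (cfg));
	cfg.cfg_magic = ZPIOS_CFG_MAGIC;
	cfg.cfg_cmd = ZPIOS_CFG_BUFFER_SIZE;
	cfg.cfg_arg1 = 0;		/* 0 queries the size, >0 resizes */

	rc = ioctl(fd, ZPIOS_CFG, &cfg);
	(void) close(fd);

	return (rc ? -1 : cfg.cfg_rc1);	/* current buffer size */
}
#endif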
static int
zpios_buffer_size(struct file *file, zpios_cfg_t *kcfg, unsigned long arg)
{
	zpios_info_t *info = (zpios_info_t *)file->private_data;
	char *buf;
	int min, size, rc = 0;

	ASSERT(info);
	ASSERT(info->info_buffer);

	spin_lock(&info->info_lock);
	if (kcfg->cfg_arg1 > 0) {

		size = kcfg->cfg_arg1;
		buf = (char *)vmem_alloc(size, KM_SLEEP);

		/* Zero fill and truncate contents when copying buffer */
		min = ((size < info->info_size) ? size : info->info_size);
		memset(buf, 0, size);
		memcpy(buf, info->info_buffer, min);
		vmem_free(info->info_buffer, info->info_size);
		info->info_size = size;
		info->info_buffer = buf;
		info->info_head = info->info_buffer;
	}

	kcfg->cfg_rc1 = info->info_size;

	if (copy_to_user((struct zpios_cfg_t __user *)arg,
	    kcfg, sizeof (*kcfg)))
		rc = -EFAULT;

	spin_unlock(&info->info_lock);

	return (rc);
}

static int
zpios_ioctl_cfg(struct file *file, unsigned long arg)
{
	zpios_cfg_t kcfg;
	int rc = 0;

	if (copy_from_user(&kcfg, (zpios_cfg_t *)arg, sizeof (kcfg)))
		return (-EFAULT);

	if (kcfg.cfg_magic != ZPIOS_CFG_MAGIC) {
		zpios_print(file, "Bad config magic 0x%x != 0x%x\n",
		    kcfg.cfg_magic, ZPIOS_CFG_MAGIC);
		return (-EINVAL);
	}

	switch (kcfg.cfg_cmd) {
	case ZPIOS_CFG_BUFFER_CLEAR:
		/*
		 * cfg_arg1 - Unused
		 * cfg_rc1  - Unused
		 */
		rc = zpios_buffer_clear(file, &kcfg, arg);
		break;
	case ZPIOS_CFG_BUFFER_SIZE:
		/*
		 * cfg_arg1 - 0 - query size; >0 resize
		 * cfg_rc1  - Set to current buffer size
		 */
		rc = zpios_buffer_size(file, &kcfg, arg);
		break;
	default:
		zpios_print(file, "Bad config command %d\n", kcfg.cfg_cmd);
		rc = -EINVAL;
		break;
	}

	return (rc);
}

static int
zpios_ioctl_cmd(struct file *file, unsigned long arg)
{
	zpios_cmd_t *kcmd;
	void *data = NULL;
	int rc = -EINVAL;

	kcmd = kmem_alloc(sizeof (zpios_cmd_t), KM_SLEEP);

	rc = copy_from_user(kcmd, (zpios_cmd_t *)arg, sizeof (zpios_cmd_t));
	if (rc) {
		zpios_print(file, "Unable to copy command structure "
		    "from user to kernel memory, %d\n", rc);
		goto out_cmd;
	}

	if (kcmd->cmd_magic != ZPIOS_CMD_MAGIC) {
		zpios_print(file, "Bad command magic 0x%x != 0x%x\n",
		    kcmd->cmd_magic, ZPIOS_CMD_MAGIC);
		rc = (-EINVAL);
		goto out_cmd;
	}

	/* Allocate memory for any opaque data the caller needed to pass on */
	if (kcmd->cmd_data_size > 0) {
		data = (void *)vmem_alloc(kcmd->cmd_data_size, KM_SLEEP);

		rc = copy_from_user(data, (void *)(arg +
		    offsetof(zpios_cmd_t, cmd_data_str)),
		    kcmd->cmd_data_size);
		if (rc) {
			zpios_print(file, "Unable to copy data buffer "
			    "from user to kernel memory, %d\n", rc);
			goto out_data;
		}
	}

	rc = zpios_do_one_run(file, kcmd, kcmd->cmd_data_size, data);

	if (data != NULL) {
		/* If the test failed do not print out the stats */
		if (rc)
			goto out_data;

		rc = copy_to_user((void *)(arg +
		    offsetof(zpios_cmd_t, cmd_data_str)),
		    data, kcmd->cmd_data_size);
		if (rc) {
			zpios_print(file, "Unable to copy data buffer "
			    "from kernel to user memory, %d\n", rc);
			rc = -EFAULT;
		}

out_data:
		vmem_free(data, kcmd->cmd_data_size);
	}
out_cmd:
	kmem_free(kcmd, sizeof (zpios_cmd_t));

	return (rc);
}

static long
zpios_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
	int rc = 0;

	/* Ignore tty ioctls */
	if ((cmd & 0xffffff00) == ((int)'T') << 8)
		return (-ENOTTY);

	switch (cmd) {
	case ZPIOS_CFG:
		rc = zpios_ioctl_cfg(file, arg);
		break;
	case ZPIOS_CMD:
		rc = zpios_ioctl_cmd(file, arg);
		break;
	default:
		zpios_print(file, "Bad ioctl command %d\n", cmd);
		rc = -EINVAL;
		break;
	}

	return (rc);
}

#ifdef CONFIG_COMPAT
/* Compatibility handler for ioctls from 32-bit ELF binaries */
static long
zpios_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
	return (zpios_unlocked_ioctl(file, cmd, arg));
}
#endif /* CONFIG_COMPAT */

/*
 * I'm not sure why you would want to write into this buffer from
 * user space since its principal use is to pass test status info
 * back to the user space, but I don't see any reason to prevent it.
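 *
 * Like zpios_read(), the write path below honors *ppos and is bounded
 * by the current info_size, so the log buffer behaves like a
 * fixed-size file.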
 */
static ssize_t
zpios_write(struct file *file, const char __user *buf,
    size_t count, loff_t *ppos)
{
	zpios_info_t *info = (zpios_info_t *)file->private_data;
	int rc = 0;

	ASSERT(info);
	ASSERT(info->info_buffer);

	spin_lock(&info->info_lock);

	/* Write beyond EOF */
	if (*ppos >= info->info_size) {
		rc = -EFBIG;
		goto out;
	}

	/* Resize count if beyond EOF */
	if (*ppos + count > info->info_size)
		count = info->info_size - *ppos;

	/* Copy relative to the current file position, not the start */
	if (copy_from_user(info->info_buffer + *ppos, buf, count)) {
		rc = -EFAULT;
		goto out;
	}

	*ppos += count;
	rc = count;
out:
	spin_unlock(&info->info_lock);
	return (rc);
}

static ssize_t
zpios_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
	zpios_info_t *info = (zpios_info_t *)file->private_data;
	int rc = 0;

	ASSERT(info);
	ASSERT(info->info_buffer);

	spin_lock(&info->info_lock);

	/* Read beyond EOF */
	if (*ppos >= info->info_size)
		goto out;

	/* Resize count if beyond EOF */
	if (*ppos + count > info->info_size)
		count = info->info_size - *ppos;

	if (copy_to_user(buf, info->info_buffer + *ppos, count)) {
		rc = -EFAULT;
		goto out;
	}

	*ppos += count;
	rc = count;
out:
	spin_unlock(&info->info_lock);
	return (rc);
}

static loff_t
zpios_seek(struct file *file, loff_t offset, int origin)
{
	zpios_info_t *info = (zpios_info_t *)file->private_data;
	int rc = -EINVAL;

	ASSERT(info);
	ASSERT(info->info_buffer);

	spin_lock(&info->info_lock);

	switch (origin) {
	case 0: /* SEEK_SET - No-op just do it */
		break;
	case 1: /* SEEK_CUR - Seek from current */
		offset = file->f_pos + offset;
		break;
	case 2: /* SEEK_END - Seek from end */
		offset = info->info_size + offset;
		break;
	}

	if (offset >= 0) {
		file->f_pos = offset;
		file->f_version = 0;
		rc = offset;
	}

	spin_unlock(&info->info_lock);

	return (rc);
}

static struct file_operations zpios_fops = {
	.owner		= THIS_MODULE,
	.open		= zpios_open,
	.release	= zpios_release,
	.unlocked_ioctl	= zpios_unlocked_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= zpios_compat_ioctl,
#endif
	.read		= zpios_read,
	.write		= zpios_write,
	.llseek		= zpios_seek,
};

static struct miscdevice zpios_misc = {
	.minor		= MISC_DYNAMIC_MINOR,
	.name		= ZPIOS_NAME,
	.fops		= &zpios_fops,
};

#ifdef DEBUG
#define	ZFS_DEBUG_STR	" (DEBUG mode)"
#else
#define	ZFS_DEBUG_STR	""
#endif

static int __init
zpios_init(void)
{
	int error;

	error = misc_register(&zpios_misc);
	if (error) {
		printk(KERN_INFO "ZPIOS: misc_register() failed %d\n", error);
	} else {
		printk(KERN_INFO "ZPIOS: Loaded module v%s-%s%s\n",
		    ZFS_META_VERSION, ZFS_META_RELEASE, ZFS_DEBUG_STR);
	}

	return (error);
}

static void __exit
zpios_fini(void)
{
	misc_deregister(&zpios_misc);

	printk(KERN_INFO "ZPIOS: Unloaded module v%s-%s%s\n",
	    ZFS_META_VERSION, ZFS_META_RELEASE, ZFS_DEBUG_STR);
}

module_init(zpios_init);
module_exit(zpios_fini);

MODULE_AUTHOR("LLNL / Sun");
MODULE_DESCRIPTION("Kernel PIOS implementation");
MODULE_LICENSE("GPL");
MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE);
diff --git a/tests/zfs-tests/cmd/xattrtest/xattrtest.c b/tests/zfs-tests/cmd/xattrtest/xattrtest.c
index c8921fb69d77..a93cce7f18cf 100644
--- a/tests/zfs-tests/cmd/xattrtest/xattrtest.c
+++ b/tests/zfs-tests/cmd/xattrtest/xattrtest.c
@@ -1,702 +1,703 @@
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2016 Lawrence Livermore National Security, LLC.
 */

/*
 * An extended attribute (xattr) correctness test.  This program creates
 * N files and sets M attrs on them of size S.  Optionally it will verify
 * a pattern stored in the xattr.
 */
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

extern char *program_invocation_short_name;

#define	ERROR(fmt, ...)						\
	fprintf(stderr, "%s: %s:%d: %s: " fmt "\n",		\
	    program_invocation_short_name, __FILE__, __LINE__,	\
	    __func__, ## __VA_ARGS__);

static const char shortopts[] = "hvycdn:f:x:s:p:t:e:rRko:";
static const struct option longopts[] = {
	{ "help",		no_argument,		0,	'h' },
	{ "verbose",		no_argument,		0,	'v' },
	{ "verify",		no_argument,		0,	'y' },
	{ "nth",		required_argument,	0,	'n' },
	{ "files",		required_argument,	0,	'f' },
	{ "xattrs",		required_argument,	0,	'x' },
	{ "size",		required_argument,	0,	's' },
	{ "path",		required_argument,	0,	'p' },
	{ "synccaches",		no_argument,		0,	'c' },
	{ "dropcaches",		no_argument,		0,	'd' },
	{ "script",		required_argument,	0,	't' },
	{ "seed",		required_argument,	0,	'e' },
	{ "random",		no_argument,		0,	'r' },
	{ "randomvalue",	no_argument,		0,	'R' },
	{ "keep",		no_argument,		0,	'k' },
	{ "only",		required_argument,	0,	'o' },
	{ 0, 0, 0, 0 }
};

enum phases {
	PHASE_ALL = 0,
	PHASE_CREATE,
	PHASE_SETXATTR,
	PHASE_GETXATTR,
	PHASE_UNLINK,
	PHASE_INVAL
};

static int verbose = 0;
static int verify = 0;
static int synccaches = 0;
static int dropcaches = 0;
static int nth = 0;
static int files = 1000;
static int xattrs = 1;
static int size = 6;
static int size_is_random = 0;
static int value_is_random = 0;
static int keep_files = 0;
static int phase = PHASE_ALL;
static char path[PATH_MAX] = "/tmp/xattrtest";
static char script[PATH_MAX] = "/bin/true";
static char xattrbytes[XATTR_SIZE_MAX];

static int
-usage(int argc, char **argv) {
+usage(int argc, char **argv)
+{
	fprintf(stderr,
-	    "usage: %s [-hvycdrRk] [-n ] [-f ] [-x ]\n"
-	    " [-s ] [-p ] [-t