Index: head/cddl/contrib/opensolaris/cmd/zinject/zinject.c =================================================================== --- head/cddl/contrib/opensolaris/cmd/zinject/zinject.c (revision 296509) +++ head/cddl/contrib/opensolaris/cmd/zinject/zinject.c (revision 296510) @@ -1,988 +1,1093 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. */ /* * ZFS Fault Injector * * This userland component takes a set of options and uses libzpool to translate * from a user-visible object type and name to an internal representation. * There are two basic types of faults: device faults and data faults. * * * DEVICE FAULTS * * Errors can be injected into a particular vdev using the '-d' option. This * option takes a path or vdev GUID to uniquely identify the device within a * pool. There are two types of errors that can be injected, EIO and ENXIO, * that can be controlled through the '-e' option. The default is ENXIO. For * EIO failures, any attempt to read data from the device will return EIO, but * subsequent attempt to reopen the device will succeed. For ENXIO failures, * any attempt to read from the device will return EIO, but any attempt to * reopen the device will also return ENXIO. * For label faults, the -L option must be specified. This allows faults * to be injected into either the nvlist, uberblock, pad1, or pad2 region * of all the labels for the specified device. * * This form of the command looks like: * * zinject -d device [-e errno] [-L ] pool * * * DATA FAULTS * * We begin with a tuple of the form: * * * * type A string describing the type of data to target. Each type * implicitly describes how to interpret 'object'. Currently, * the following values are supported: * * data User data for a file * dnode Dnode for a file or directory * * The following MOS objects are special. Instead of injecting * errors on a particular object or blkid, we inject errors across * all objects of the given type. * * mos Any data in the MOS * mosdir object directory * config pool configuration * bpobj blkptr list * spacemap spacemap * metaslab metaslab * errlog persistent error log * * level Object level. Defaults to '0', not applicable to all types. If * a range is given, this corresponds to the indirect block * corresponding to the specific range. * * range A numerical range [start,end) within the object. Defaults to * the full size of the file. * * object A string describing the logical location of the object. For * files and directories (currently the only supported types), * this is the path of the object on disk. * * This is translated, via libzpool, into the following internal representation: * * * * These types should be self-explanatory. This tuple is then passed to the * kernel via a special ioctl() to initiate fault injection for the given * object. Note that 'type' is not strictly necessary for fault injection, but * is used when translating existing faults into a human-readable string. * * * The command itself takes one of the forms: * * zinject * zinject <-a | -u pool> * zinject -c * zinject [-q] <-t type> [-f freq] [-u] [-a] [-m] [-e errno] [-l level] * [-r range] * zinject [-f freq] [-a] [-m] [-u] -b objset:object:level:start:end pool * * With no arguments, the command prints all currently registered injection * handlers, with their numeric identifiers. * * The '-c' option will clear the given handler, or all handlers if 'all' is * specified. * * The '-e' option takes a string describing the errno to simulate. This must * be either 'io' or 'checksum'. In most cases this will result in the same * behavior, but RAID-Z will produce a different set of ereports for this * situation. * * The '-a', '-u', and '-m' flags toggle internal flush behavior. If '-a' is * specified, then the ARC cache is flushed appropriately. If '-u' is * specified, then the underlying SPA is unloaded. Either of these flags can be * specified independently of any other handlers. The '-m' flag automatically * does an unmount and remount of the underlying dataset to aid in flushing the * cache. * * The '-f' flag controls the frequency of errors injected, expressed as a * integer percentage between 1 and 100. The default is 100. * * The this form is responsible for actually injecting the handler into the * framework. It takes the arguments described above, translates them to the * internal tuple using libzpool, and then issues an ioctl() to register the * handler. * * The final form can target a specific bookmark, regardless of whether a * human-readable interface has been designed. It allows developers to specify * a particular block by number. */ #include #include #include #include #include #include #include #include #include #include #include #undef verify /* both libzfs.h and zfs_context.h want to define this */ #include "zinject.h" libzfs_handle_t *g_zfs; int zfs_fd; #ifndef ECKSUM #define ECKSUM EBADE #endif static const char *errtable[TYPE_INVAL] = { "data", "dnode", "mos", "mosdir", "metaslab", "config", "bpobj", "spacemap", "errlog", "uber", "nvlist", "pad1", "pad2" }; static err_type_t name_to_type(const char *arg) { int i; for (i = 0; i < TYPE_INVAL; i++) if (strcmp(errtable[i], arg) == 0) return (i); return (TYPE_INVAL); } static const char * type_to_name(uint64_t type) { switch (type) { case DMU_OT_OBJECT_DIRECTORY: return ("mosdir"); case DMU_OT_OBJECT_ARRAY: return ("metaslab"); case DMU_OT_PACKED_NVLIST: return ("config"); case DMU_OT_BPOBJ: return ("bpobj"); case DMU_OT_SPACE_MAP: return ("spacemap"); case DMU_OT_ERROR_LOG: return ("errlog"); default: return ("-"); } } /* * Print usage message. */ void usage(void) { (void) printf( "usage:\n" "\n" "\tzinject\n" "\n" "\t\tList all active injection records.\n" "\n" "\tzinject -c \n" "\n" "\t\tClear the particular record (if given a numeric ID), or\n" "\t\tall records if 'all' is specificed.\n" "\n" "\tzinject -p pool\n" + "\n" "\t\tInject a panic fault at the specified function. Only \n" "\t\tfunctions which call spa_vdev_config_exit(), or \n" "\t\tspa_vdev_exit() will trigger a panic.\n" "\n" "\tzinject -d device [-e errno] [-L ] [-F]\n" "\t [-T pool\n" + "\n" "\t\tInject a fault into a particular device or the device's\n" "\t\tlabel. Label injection can either be 'nvlist', 'uber',\n " "\t\t'pad1', or 'pad2'.\n" "\t\t'errno' can be 'nxio' (the default), 'io', or 'dtl'.\n" "\n" "\tzinject -d device -A pool\n" + "\n" "\t\tPerform a specific action on a particular device\n" "\n" + "\tzinject -d device -D latency:lanes pool\n" + "\n" + "\t\tAdd an artificial delay to IO requests on a particular\n" + "\t\tdevice, such that the requests take a minimum of 'latency'\n" + "\t\tmilliseconds to complete. Each delay has an associated\n" + "\t\tnumber of 'lanes' which defines the number of concurrent\n" + "\t\tIO requests that can be processed.\n" + "\n" + "\t\tFor example, with a single lane delay of 10 ms (-D 10:1),\n" + "\t\tthe device will only be able to service a single IO request\n" + "\t\tat a time with each request taking 10 ms to complete. So,\n" + "\t\tif only a single request is submitted every 10 ms, the\n" + "\t\taverage latency will be 10 ms; but if more than one request\n" + "\t\tis submitted every 10 ms, the average latency will be more\n" + "\t\tthan 10 ms.\n" + "\n" + "\t\tSimilarly, if a delay of 10 ms is specified to have two\n" + "\t\tlanes (-D 10:2), then the device will be able to service\n" + "\t\ttwo requests at a time, each with a minimum latency of\n" + "\t\t10 ms. So, if two requests are submitted every 10 ms, then\n" + "\t\tthe average latency will be 10 ms; but if more than two\n" + "\t\trequests are submitted every 10 ms, the average latency\n" + "\t\twill be more than 10 ms.\n" + "\n" + "\t\tAlso note, these delays are additive. So two invocations\n" + "\t\tof '-D 10:1', is roughly equivalent to a single invocation\n" + "\t\tof '-D 10:2'. This also means, one can specify multiple\n" + "\t\tlanes with differing target latencies. For example, an\n" + "\t\tinvocation of '-D 10:1' followed by '-D 25:2' will\n" + "\t\tcreate 3 lanes on the device; one lane with a latency\n" + "\t\tof 10 ms and two lanes with a 25 ms latency.\n" + "\n" "\tzinject -I [-s | -g ] pool\n" + "\n" "\t\tCause the pool to stop writing blocks yet not\n" "\t\treport errors for a duration. Simulates buggy hardware\n" "\t\tthat fails to honor cache flush requests.\n" "\t\tDefault duration is 30 seconds. The machine is panicked\n" "\t\tat the end of the duration.\n" "\n" "\tzinject -b objset:object:level:blkid pool\n" "\n" "\t\tInject an error into pool 'pool' with the numeric bookmark\n" "\t\tspecified by the remaining tuple. Each number is in\n" "\t\thexidecimal, and only one block can be specified.\n" "\n" "\tzinject [-q] <-t type> [-e errno] [-l level] [-r range]\n" "\t [-a] [-m] [-u] [-f freq] \n" "\n" "\t\tInject an error into the object specified by the '-t' option\n" "\t\tand the object descriptor. The 'object' parameter is\n" "\t\tinterperted depending on the '-t' option.\n" "\n" "\t\t-q\tQuiet mode. Only print out the handler number added.\n" "\t\t-e\tInject a specific error. Must be either 'io' or\n" "\t\t\t'checksum'. Default is 'io'.\n" "\t\t-l\tInject error at a particular block level. Default is " "0.\n" "\t\t-m\tAutomatically remount underlying filesystem.\n" "\t\t-r\tInject error over a particular logical range of an\n" "\t\t\tobject. Will be translated to the appropriate blkid\n" "\t\t\trange according to the object's properties.\n" "\t\t-a\tFlush the ARC cache. Can be specified without any\n" "\t\t\tassociated object.\n" "\t\t-u\tUnload the associated pool. Can be specified with only\n" "\t\t\ta pool object.\n" "\t\t-f\tOnly inject errors a fraction of the time. Expressed as\n" "\t\t\ta percentage between 1 and 100.\n" "\n" "\t-t data\t\tInject an error into the plain file contents of a\n" "\t\t\tfile. The object must be specified as a complete path\n" "\t\t\tto a file on a ZFS filesystem.\n" "\n" "\t-t dnode\tInject an error into the metadnode in the block\n" "\t\t\tcorresponding to the dnode for a file or directory. The\n" "\t\t\t'-r' option is incompatible with this mode. The object\n" "\t\t\tis specified as a complete path to a file or directory\n" "\t\t\ton a ZFS filesystem.\n" "\n" "\t-t \tInject errors into the MOS for objects of the given\n" "\t\t\ttype. Valid types are: mos, mosdir, config, bpobj,\n" "\t\t\tspacemap, metaslab, errlog. The only valid is\n" "\t\t\tthe poolname.\n"); } static int iter_handlers(int (*func)(int, const char *, zinject_record_t *, void *), void *data) { zfs_cmd_t zc = { 0 }; int ret; while (ioctl(zfs_fd, ZFS_IOC_INJECT_LIST_NEXT, &zc) == 0) if ((ret = func((int)zc.zc_guid, zc.zc_name, &zc.zc_inject_record, data)) != 0) return (ret); if (errno != ENOENT) { (void) fprintf(stderr, "Unable to list handlers: %s\n", strerror(errno)); return (-1); } return (0); } static int print_data_handler(int id, const char *pool, zinject_record_t *record, void *data) { int *count = data; if (record->zi_guid != 0 || record->zi_func[0] != '\0') return (0); if (*count == 0) { (void) printf("%3s %-15s %-6s %-6s %-8s %3s %-15s\n", "ID", "POOL", "OBJSET", "OBJECT", "TYPE", "LVL", "RANGE"); (void) printf("--- --------------- ------ " "------ -------- --- ---------------\n"); } *count += 1; (void) printf("%3d %-15s %-6llu %-6llu %-8s %3d ", id, pool, (u_longlong_t)record->zi_objset, (u_longlong_t)record->zi_object, type_to_name(record->zi_type), record->zi_level); if (record->zi_start == 0 && record->zi_end == -1ULL) (void) printf("all\n"); else (void) printf("[%llu, %llu]\n", (u_longlong_t)record->zi_start, (u_longlong_t)record->zi_end); return (0); } static int print_device_handler(int id, const char *pool, zinject_record_t *record, void *data) { int *count = data; if (record->zi_guid == 0 || record->zi_func[0] != '\0') return (0); + if (record->zi_cmd == ZINJECT_DELAY_IO) + return (0); + if (*count == 0) { (void) printf("%3s %-15s %s\n", "ID", "POOL", "GUID"); (void) printf("--- --------------- ----------------\n"); } *count += 1; (void) printf("%3d %-15s %llx\n", id, pool, (u_longlong_t)record->zi_guid); return (0); } static int +print_delay_handler(int id, const char *pool, zinject_record_t *record, + void *data) +{ + int *count = data; + + if (record->zi_guid == 0 || record->zi_func[0] != '\0') + return (0); + + if (record->zi_cmd != ZINJECT_DELAY_IO) + return (0); + + if (*count == 0) { + (void) printf("%3s %-15s %-15s %-15s %s\n", + "ID", "POOL", "DELAY (ms)", "LANES", "GUID"); + (void) printf("--- --------------- --------------- " + "--------------- ----------------\n"); + } + + *count += 1; + + (void) printf("%3d %-15s %-15llu %-15llu %llx\n", id, pool, + (u_longlong_t)NSEC2MSEC(record->zi_timer), + (u_longlong_t)record->zi_nlanes, + (u_longlong_t)record->zi_guid); + + return (0); +} + +static int print_panic_handler(int id, const char *pool, zinject_record_t *record, void *data) { int *count = data; if (record->zi_func[0] == '\0') return (0); if (*count == 0) { (void) printf("%3s %-15s %s\n", "ID", "POOL", "FUNCTION"); (void) printf("--- --------------- ----------------\n"); } *count += 1; (void) printf("%3d %-15s %s\n", id, pool, record->zi_func); return (0); } /* * Print all registered error handlers. Returns the number of handlers * registered. */ static int print_all_handlers(void) { int count = 0, total = 0; (void) iter_handlers(print_device_handler, &count); if (count > 0) { total += count; (void) printf("\n"); count = 0; } + (void) iter_handlers(print_delay_handler, &count); + if (count > 0) { + total += count; + (void) printf("\n"); + count = 0; + } + (void) iter_handlers(print_data_handler, &count); if (count > 0) { total += count; (void) printf("\n"); count = 0; } (void) iter_handlers(print_panic_handler, &count); return (count + total); } /* ARGSUSED */ static int cancel_one_handler(int id, const char *pool, zinject_record_t *record, void *data) { zfs_cmd_t zc = { 0 }; zc.zc_guid = (uint64_t)id; if (ioctl(zfs_fd, ZFS_IOC_CLEAR_FAULT, &zc) != 0) { (void) fprintf(stderr, "failed to remove handler %d: %s\n", id, strerror(errno)); return (1); } return (0); } /* * Remove all fault injection handlers. */ static int cancel_all_handlers(void) { int ret = iter_handlers(cancel_one_handler, NULL); if (ret == 0) (void) printf("removed all registered handlers\n"); return (ret); } /* * Remove a specific fault injection handler. */ static int cancel_handler(int id) { zfs_cmd_t zc = { 0 }; zc.zc_guid = (uint64_t)id; if (ioctl(zfs_fd, ZFS_IOC_CLEAR_FAULT, &zc) != 0) { (void) fprintf(stderr, "failed to remove handler %d: %s\n", id, strerror(errno)); return (1); } (void) printf("removed handler %d\n", id); return (0); } /* * Register a new fault injection handler. */ static int register_handler(const char *pool, int flags, zinject_record_t *record, int quiet) { zfs_cmd_t zc = { 0 }; (void) strcpy(zc.zc_name, pool); zc.zc_inject_record = *record; zc.zc_guid = flags; if (ioctl(zfs_fd, ZFS_IOC_INJECT_FAULT, &zc) != 0) { (void) fprintf(stderr, "failed to add handler: %s\n", strerror(errno)); return (1); } if (flags & ZINJECT_NULL) return (0); if (quiet) { (void) printf("%llu\n", (u_longlong_t)zc.zc_guid); } else { (void) printf("Added handler %llu with the following " "properties:\n", (u_longlong_t)zc.zc_guid); (void) printf(" pool: %s\n", pool); if (record->zi_guid) { (void) printf(" vdev: %llx\n", (u_longlong_t)record->zi_guid); } else if (record->zi_func[0] != '\0') { (void) printf(" panic function: %s\n", record->zi_func); } else if (record->zi_duration > 0) { (void) printf(" time: %lld seconds\n", (u_longlong_t)record->zi_duration); } else if (record->zi_duration < 0) { (void) printf(" txgs: %lld \n", (u_longlong_t)-record->zi_duration); } else { (void) printf("objset: %llu\n", (u_longlong_t)record->zi_objset); (void) printf("object: %llu\n", (u_longlong_t)record->zi_object); (void) printf(" type: %llu\n", (u_longlong_t)record->zi_type); (void) printf(" level: %d\n", record->zi_level); if (record->zi_start == 0 && record->zi_end == -1ULL) (void) printf(" range: all\n"); else (void) printf(" range: [%llu, %llu)\n", (u_longlong_t)record->zi_start, (u_longlong_t)record->zi_end); } } return (0); } int perform_action(const char *pool, zinject_record_t *record, int cmd) { zfs_cmd_t zc = { 0 }; ASSERT(cmd == VDEV_STATE_DEGRADED || cmd == VDEV_STATE_FAULTED); (void) strlcpy(zc.zc_name, pool, sizeof (zc.zc_name)); zc.zc_guid = record->zi_guid; zc.zc_cookie = cmd; if (ioctl(zfs_fd, ZFS_IOC_VDEV_SET_STATE, &zc) == 0) return (0); return (1); } +static int +parse_delay(char *str, uint64_t *delay, uint64_t *nlanes) +{ + unsigned long scan_delay; + unsigned long scan_nlanes; + + if (sscanf(str, "%lu:%lu", &scan_delay, &scan_nlanes) != 2) + return (1); + + /* + * We explicitly disallow a delay of zero here, because we key + * off this value being non-zero in translate_device(), to + * determine if the fault is a ZINJECT_DELAY_IO fault or not. + */ + if (scan_delay == 0) + return (1); + + /* + * The units for the CLI delay parameter is milliseconds, but + * the data passed to the kernel is interpreted as nanoseconds. + * Thus we scale the milliseconds to nanoseconds here, and this + * nanosecond value is used to pass the delay to the kernel. + */ + *delay = MSEC2NSEC(scan_delay); + *nlanes = scan_nlanes; + + return (0); +} + int main(int argc, char **argv) { int c; char *range = NULL; char *cancel = NULL; char *end; char *raw = NULL; char *device = NULL; int level = 0; int quiet = 0; int error = 0; int domount = 0; int io_type = ZIO_TYPES; int action = VDEV_STATE_UNKNOWN; err_type_t type = TYPE_INVAL; err_type_t label = TYPE_INVAL; zinject_record_t record = { 0 }; char pool[MAXNAMELEN]; char dataset[MAXNAMELEN]; zfs_handle_t *zhp; int nowrites = 0; int dur_txg = 0; int dur_secs = 0; int ret; int flags = 0; if ((g_zfs = libzfs_init()) == NULL) { (void) fprintf(stderr, "internal error: failed to " "initialize ZFS library\n"); return (1); } libzfs_print_on_error(g_zfs, B_TRUE); if ((zfs_fd = open(ZFS_DEV, O_RDWR)) < 0) { (void) fprintf(stderr, "failed to open ZFS device\n"); return (1); } if (argc == 1) { /* * No arguments. Print the available handlers. If there are no * available handlers, direct the user to '-h' for help * information. */ if (print_all_handlers() == 0) { (void) printf("No handlers registered.\n"); (void) printf("Run 'zinject -h' for usage " "information.\n"); } return (0); } while ((c = getopt(argc, argv, ":aA:b:d:D:f:Fg:qhIc:t:T:l:mr:s:e:uL:p:")) != -1) { switch (c) { case 'a': flags |= ZINJECT_FLUSH_ARC; break; case 'A': if (strcasecmp(optarg, "degrade") == 0) { action = VDEV_STATE_DEGRADED; } else if (strcasecmp(optarg, "fault") == 0) { action = VDEV_STATE_FAULTED; } else { (void) fprintf(stderr, "invalid action '%s': " "must be 'degrade' or 'fault'\n", optarg); usage(); return (1); } break; case 'b': raw = optarg; break; case 'c': cancel = optarg; break; case 'd': device = optarg; break; case 'D': - record.zi_timer = strtoull(optarg, &end, 10); - if (errno != 0 || *end != '\0') { + ret = parse_delay(optarg, &record.zi_timer, + &record.zi_nlanes); + if (ret != 0) { (void) fprintf(stderr, "invalid i/o delay " "value: '%s'\n", optarg); usage(); return (1); } break; case 'e': if (strcasecmp(optarg, "io") == 0) { error = EIO; } else if (strcasecmp(optarg, "checksum") == 0) { error = ECKSUM; } else if (strcasecmp(optarg, "nxio") == 0) { error = ENXIO; } else if (strcasecmp(optarg, "dtl") == 0) { error = ECHILD; } else { (void) fprintf(stderr, "invalid error type " "'%s': must be 'io', 'checksum' or " "'nxio'\n", optarg); usage(); return (1); } break; case 'f': record.zi_freq = atoi(optarg); if (record.zi_freq < 1 || record.zi_freq > 100) { (void) fprintf(stderr, "frequency range must " "be in the range (0, 100]\n"); return (1); } break; case 'F': record.zi_failfast = B_TRUE; break; case 'g': dur_txg = 1; record.zi_duration = (int)strtol(optarg, &end, 10); if (record.zi_duration <= 0 || *end != '\0') { (void) fprintf(stderr, "invalid duration '%s': " "must be a positive integer\n", optarg); usage(); return (1); } /* store duration of txgs as its negative */ record.zi_duration *= -1; break; case 'h': usage(); return (0); case 'I': /* default duration, if one hasn't yet been defined */ nowrites = 1; if (dur_secs == 0 && dur_txg == 0) record.zi_duration = 30; break; case 'l': level = (int)strtol(optarg, &end, 10); if (*end != '\0') { (void) fprintf(stderr, "invalid level '%s': " "must be an integer\n", optarg); usage(); return (1); } break; case 'm': domount = 1; break; case 'p': (void) strlcpy(record.zi_func, optarg, sizeof (record.zi_func)); record.zi_cmd = ZINJECT_PANIC; break; case 'q': quiet = 1; break; case 'r': range = optarg; break; case 's': dur_secs = 1; record.zi_duration = (int)strtol(optarg, &end, 10); if (record.zi_duration <= 0 || *end != '\0') { (void) fprintf(stderr, "invalid duration '%s': " "must be a positive integer\n", optarg); usage(); return (1); } break; case 'T': if (strcasecmp(optarg, "read") == 0) { io_type = ZIO_TYPE_READ; } else if (strcasecmp(optarg, "write") == 0) { io_type = ZIO_TYPE_WRITE; } else if (strcasecmp(optarg, "free") == 0) { io_type = ZIO_TYPE_FREE; } else if (strcasecmp(optarg, "claim") == 0) { io_type = ZIO_TYPE_CLAIM; } else if (strcasecmp(optarg, "all") == 0) { io_type = ZIO_TYPES; } else { (void) fprintf(stderr, "invalid I/O type " "'%s': must be 'read', 'write', 'free', " "'claim' or 'all'\n", optarg); usage(); return (1); } break; case 't': if ((type = name_to_type(optarg)) == TYPE_INVAL && !MOS_TYPE(type)) { (void) fprintf(stderr, "invalid type '%s'\n", optarg); usage(); return (1); } break; case 'u': flags |= ZINJECT_UNLOAD_SPA; break; case 'L': if ((label = name_to_type(optarg)) == TYPE_INVAL && !LABEL_TYPE(type)) { (void) fprintf(stderr, "invalid label type " "'%s'\n", optarg); usage(); return (1); } break; case ':': (void) fprintf(stderr, "option -%c requires an " "operand\n", optopt); usage(); return (1); case '?': (void) fprintf(stderr, "invalid option '%c'\n", optopt); usage(); return (2); } } argc -= optind; argv += optind; if (record.zi_duration != 0) record.zi_cmd = ZINJECT_IGNORED_WRITES; if (cancel != NULL) { /* * '-c' is invalid with any other options. */ if (raw != NULL || range != NULL || type != TYPE_INVAL || level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED) { (void) fprintf(stderr, "cancel (-c) incompatible with " "any other options\n"); usage(); return (2); } if (argc != 0) { (void) fprintf(stderr, "extraneous argument to '-c'\n"); usage(); return (2); } if (strcmp(cancel, "all") == 0) { return (cancel_all_handlers()); } else { int id = (int)strtol(cancel, &end, 10); if (*end != '\0') { (void) fprintf(stderr, "invalid handle id '%s':" " must be an integer or 'all'\n", cancel); usage(); return (1); } return (cancel_handler(id)); } } if (device != NULL) { /* * Device (-d) injection uses a completely different mechanism * for doing injection, so handle it separately here. */ if (raw != NULL || range != NULL || type != TYPE_INVAL || level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED) { (void) fprintf(stderr, "device (-d) incompatible with " "data error injection\n"); usage(); return (2); } if (argc != 1) { (void) fprintf(stderr, "device (-d) injection requires " "a single pool name\n"); usage(); return (2); } (void) strcpy(pool, argv[0]); dataset[0] = '\0'; if (error == ECKSUM) { (void) fprintf(stderr, "device error type must be " "'io' or 'nxio'\n"); return (1); } record.zi_iotype = io_type; if (translate_device(pool, device, label, &record) != 0) return (1); if (!error) error = ENXIO; if (action != VDEV_STATE_UNKNOWN) return (perform_action(pool, &record, action)); } else if (raw != NULL) { if (range != NULL || type != TYPE_INVAL || level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED) { (void) fprintf(stderr, "raw (-b) format with " "any other options\n"); usage(); return (2); } if (argc != 1) { (void) fprintf(stderr, "raw (-b) format expects a " "single pool name\n"); usage(); return (2); } (void) strcpy(pool, argv[0]); dataset[0] = '\0'; if (error == ENXIO) { (void) fprintf(stderr, "data error type must be " "'checksum' or 'io'\n"); return (1); } record.zi_cmd = ZINJECT_DATA_FAULT; if (translate_raw(raw, &record) != 0) return (1); if (!error) error = EIO; } else if (record.zi_cmd == ZINJECT_PANIC) { if (raw != NULL || range != NULL || type != TYPE_INVAL || level != 0 || device != NULL) { (void) fprintf(stderr, "panic (-p) incompatible with " "other options\n"); usage(); return (2); } if (argc < 1 || argc > 2) { (void) fprintf(stderr, "panic (-p) injection requires " "a single pool name and an optional id\n"); usage(); return (2); } (void) strcpy(pool, argv[0]); if (argv[1] != NULL) record.zi_type = atoi(argv[1]); dataset[0] = '\0'; } else if (record.zi_cmd == ZINJECT_IGNORED_WRITES) { if (nowrites == 0) { (void) fprintf(stderr, "-s or -g meaningless " "without -I (ignore writes)\n"); usage(); return (2); } else if (dur_secs && dur_txg) { (void) fprintf(stderr, "choose a duration either " "in seconds (-s) or a number of txgs (-g) " "but not both\n"); usage(); return (2); } else if (argc != 1) { (void) fprintf(stderr, "ignore writes (-I) " "injection requires a single pool name\n"); usage(); return (2); } (void) strcpy(pool, argv[0]); dataset[0] = '\0'; } else if (type == TYPE_INVAL) { if (flags == 0) { (void) fprintf(stderr, "at least one of '-b', '-d', " "'-t', '-a', '-p', '-I' or '-u' " "must be specified\n"); usage(); return (2); } if (argc == 1 && (flags & ZINJECT_UNLOAD_SPA)) { (void) strcpy(pool, argv[0]); dataset[0] = '\0'; } else if (argc != 0) { (void) fprintf(stderr, "extraneous argument for " "'-f'\n"); usage(); return (2); } flags |= ZINJECT_NULL; } else { if (argc != 1) { (void) fprintf(stderr, "missing object\n"); usage(); return (2); } if (error == ENXIO) { (void) fprintf(stderr, "data error type must be " "'checksum' or 'io'\n"); return (1); } record.zi_cmd = ZINJECT_DATA_FAULT; if (translate_record(type, argv[0], range, level, &record, pool, dataset) != 0) return (1); if (!error) error = EIO; } /* * If this is pool-wide metadata, unmount everything. The ioctl() will * unload the pool, so that we trigger spa-wide reopen of metadata next * time we access the pool. */ if (dataset[0] != '\0' && domount) { if ((zhp = zfs_open(g_zfs, dataset, ZFS_TYPE_DATASET)) == NULL) return (1); if (zfs_unmount(zhp, NULL, 0) != 0) return (1); } record.zi_error = error; ret = register_handler(pool, flags, &record, quiet); if (dataset[0] != '\0' && domount) ret = (zfs_mount(zhp, NULL, 0) != 0); libzfs_fini(g_zfs); return (ret); } Index: head/cddl/contrib/opensolaris =================================================================== --- head/cddl/contrib/opensolaris (revision 296509) +++ head/cddl/contrib/opensolaris (revision 296510) Property changes on: head/cddl/contrib/opensolaris ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /vendor/illumos/dist:r296505 Index: head/sys/cddl/compat/opensolaris/sys/callo.h =================================================================== --- head/sys/cddl/compat/opensolaris/sys/callo.h (nonexistent) +++ head/sys/cddl/compat/opensolaris/sys/callo.h (revision 296510) @@ -0,0 +1,37 @@ +/*- + * Copyright (c) 2016 Alexander Motin + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _OPENSOLARIS_SYS_CALLO_H_ +#define _OPENSOLARIS_SYS_CALLO_H_ + +#include_next + +#define CALLOUT_REALTIME 0 /* realtime callout type */ +#define CALLOUT_NORMAL 1 /* normal callout type */ + +#endif /* !_OPENSOLARIS_SYS_CALLO_H_ */ Property changes on: head/sys/cddl/compat/opensolaris/sys/callo.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: head/sys/cddl/compat/opensolaris/sys/systm.h =================================================================== --- head/sys/cddl/compat/opensolaris/sys/systm.h (revision 296509) +++ head/sys/cddl/compat/opensolaris/sys/systm.h (revision 296510) @@ -1,47 +1,50 @@ /*- * Copyright (c) 2007 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _OPENSOLARIS_SYS_SYSTM_H_ #define _OPENSOLARIS_SYS_SYSTM_H_ #ifdef _KERNEL #include #include_next #include #define PAGESIZE PAGE_SIZE #define PAGEOFFSET (PAGESIZE - 1) #define PAGEMASK (~PAGEOFFSET) #define delay(x) pause("soldelay", (x)) +#define timeout_generic(type, fn, arg, t, r, f) \ + timeout(fn, arg, t / (NANOSEC/hz) + 1) + #endif /* _KERNEL */ #endif /* _OPENSOLARIS_SYS_SYSTM_H_ */ Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h (revision 296509) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h (revision 296510) @@ -1,149 +1,147 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2013 by Delphix. All rights reserved. */ #ifndef _SYS_ZFS_CONTEXT_H #define _SYS_ZFS_CONTEXT_H #ifdef __cplusplus extern "C" { #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef illumos #include -#include -#else /* FreeBSD */ -#include #endif +#include #include #include #include #include #include #include #include /* There is clash. vm_map.h defines the two below and vdev_cache.c use them. */ #ifdef min_offset #undef min_offset #endif #ifdef max_offset #undef max_offset #endif #include #include #define CPU_SEQID (curcpu) #define tsd_create(keyp, destructor) do { \ *(keyp) = osd_thread_register((destructor)); \ KASSERT(*(keyp) > 0, ("cannot register OSD")); \ } while (0) #define tsd_destroy(keyp) osd_thread_deregister(*(keyp)) #define tsd_get(key) osd_thread_get(curthread, (key)) #define tsd_set(key, value) osd_thread_set(curthread, (key), (value)) #ifdef __cplusplus } #endif extern int zfs_debug_level; extern struct mtx zfs_debug_mtx; #define ZFS_LOG(lvl, ...) do { \ if (((lvl) & 0xff) <= zfs_debug_level) { \ mtx_lock(&zfs_debug_mtx); \ printf("%s:%u[%d]: ", __func__, __LINE__, (lvl)); \ printf(__VA_ARGS__); \ printf("\n"); \ if ((lvl) & 0x100) \ kdb_backtrace(); \ mtx_unlock(&zfs_debug_mtx); \ } \ } while (0) #define sys_shutdown rebooting #endif /* _SYS_ZFS_CONTEXT_H */ Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h (revision 296509) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h (revision 296510) @@ -1,445 +1,446 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2015 by Delphix. All rights reserved. */ #ifndef _SYS_ZFS_IOCTL_H #define _SYS_ZFS_IOCTL_H #include #include #include #include #include #include #ifdef _KERNEL #include #endif /* _KERNEL */ #ifdef __cplusplus extern "C" { #endif /* * The structures in this file are passed between userland and the * kernel. Userland may be running a 32-bit process, while the kernel * is 64-bit. Therefore, these structures need to compile the same in * 32-bit and 64-bit. This means not using type "long", and adding * explicit padding so that the 32-bit structure will not be packed more * tightly than the 64-bit structure (which requires 64-bit alignment). */ /* * Property values for snapdir */ #define ZFS_SNAPDIR_HIDDEN 0 #define ZFS_SNAPDIR_VISIBLE 1 /* * Field manipulation macros for the drr_versioninfo field of the * send stream header. */ /* * Header types for zfs send streams. */ typedef enum drr_headertype { DMU_SUBSTREAM = 0x1, DMU_COMPOUNDSTREAM = 0x2 } drr_headertype_t; #define DMU_GET_STREAM_HDRTYPE(vi) BF64_GET((vi), 0, 2) #define DMU_SET_STREAM_HDRTYPE(vi, x) BF64_SET((vi), 0, 2, x) #define DMU_GET_FEATUREFLAGS(vi) BF64_GET((vi), 2, 30) #define DMU_SET_FEATUREFLAGS(vi, x) BF64_SET((vi), 2, 30, x) /* * Feature flags for zfs send streams (flags in drr_versioninfo) */ #define DMU_BACKUP_FEATURE_DEDUP (1 << 0) #define DMU_BACKUP_FEATURE_DEDUPPROPS (1 << 1) #define DMU_BACKUP_FEATURE_SA_SPILL (1 << 2) /* flags #3 - #15 are reserved for incompatible closed-source implementations */ #define DMU_BACKUP_FEATURE_EMBED_DATA (1 << 16) #define DMU_BACKUP_FEATURE_EMBED_DATA_LZ4 (1 << 17) /* flag #18 is reserved for a Delphix feature */ #define DMU_BACKUP_FEATURE_LARGE_BLOCKS (1 << 19) #define DMU_BACKUP_FEATURE_RESUMING (1 << 20) /* * Mask of all supported backup features */ #define DMU_BACKUP_FEATURE_MASK (DMU_BACKUP_FEATURE_DEDUP | \ DMU_BACKUP_FEATURE_DEDUPPROPS | DMU_BACKUP_FEATURE_SA_SPILL | \ DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_EMBED_DATA_LZ4 | \ DMU_BACKUP_FEATURE_RESUMING | \ DMU_BACKUP_FEATURE_LARGE_BLOCKS) /* Are all features in the given flag word currently supported? */ #define DMU_STREAM_SUPPORTED(x) (!((x) & ~DMU_BACKUP_FEATURE_MASK)) typedef enum dmu_send_resume_token_version { ZFS_SEND_RESUME_TOKEN_VERSION = 1 } dmu_send_resume_token_version_t; /* * The drr_versioninfo field of the dmu_replay_record has the * following layout: * * 64 56 48 40 32 24 16 8 0 * +-------+-------+-------+-------+-------+-------+-------+-------+ * | reserved | feature-flags |C|S| * +-------+-------+-------+-------+-------+-------+-------+-------+ * * The low order two bits indicate the header type: SUBSTREAM (0x1) * or COMPOUNDSTREAM (0x2). Using two bits for this is historical: * this field used to be a version number, where the two version types * were 1 and 2. Using two bits for this allows earlier versions of * the code to be able to recognize send streams that don't use any * of the features indicated by feature flags. */ #define DMU_BACKUP_MAGIC 0x2F5bacbacULL #define DRR_FLAG_CLONE (1<<0) #define DRR_FLAG_CI_DATA (1<<1) /* * This send stream, if it is a full send, includes the FREE and FREEOBJECT * records that are created by the sending process. This means that the send * stream can be received as a clone, even though it is not an incremental. * This is not implemented as a feature flag, because the receiving side does * not need to have implemented it to receive this stream; it is fully backwards * compatible. We need a flag, though, because full send streams without it * cannot necessarily be received as a clone correctly. */ #define DRR_FLAG_FREERECORDS (1<<2) /* * flags in the drr_checksumflags field in the DRR_WRITE and * DRR_WRITE_BYREF blocks */ #define DRR_CHECKSUM_DEDUP (1<<0) #define DRR_IS_DEDUP_CAPABLE(flags) ((flags) & DRR_CHECKSUM_DEDUP) /* * zfs ioctl command structure */ struct drr_begin { uint64_t drr_magic; uint64_t drr_versioninfo; /* was drr_version */ uint64_t drr_creation_time; dmu_objset_type_t drr_type; uint32_t drr_flags; uint64_t drr_toguid; uint64_t drr_fromguid; char drr_toname[MAXNAMELEN]; }; struct drr_end { zio_cksum_t drr_checksum; uint64_t drr_toguid; }; struct drr_object { uint64_t drr_object; dmu_object_type_t drr_type; dmu_object_type_t drr_bonustype; uint32_t drr_blksz; uint32_t drr_bonuslen; uint8_t drr_checksumtype; uint8_t drr_compress; uint8_t drr_pad[6]; uint64_t drr_toguid; /* bonus content follows */ }; struct drr_freeobjects { uint64_t drr_firstobj; uint64_t drr_numobjs; uint64_t drr_toguid; }; struct drr_write { uint64_t drr_object; dmu_object_type_t drr_type; uint32_t drr_pad; uint64_t drr_offset; uint64_t drr_length; uint64_t drr_toguid; uint8_t drr_checksumtype; uint8_t drr_checksumflags; uint8_t drr_pad2[6]; ddt_key_t drr_key; /* deduplication key */ /* content follows */ }; struct drr_free { uint64_t drr_object; uint64_t drr_offset; uint64_t drr_length; uint64_t drr_toguid; }; struct drr_write_byref { /* where to put the data */ uint64_t drr_object; uint64_t drr_offset; uint64_t drr_length; uint64_t drr_toguid; /* where to find the prior copy of the data */ uint64_t drr_refguid; uint64_t drr_refobject; uint64_t drr_refoffset; /* properties of the data */ uint8_t drr_checksumtype; uint8_t drr_checksumflags; uint8_t drr_pad2[6]; ddt_key_t drr_key; /* deduplication key */ }; struct drr_spill { uint64_t drr_object; uint64_t drr_length; uint64_t drr_toguid; uint64_t drr_pad[4]; /* needed for crypto */ /* spill data follows */ }; typedef struct dmu_replay_record { enum { DRR_BEGIN, DRR_OBJECT, DRR_FREEOBJECTS, DRR_WRITE, DRR_FREE, DRR_END, DRR_WRITE_BYREF, DRR_SPILL, DRR_WRITE_EMBEDDED, DRR_NUMTYPES } drr_type; uint32_t drr_payloadlen; union { struct drr_begin drr_begin; struct drr_end drr_end; struct drr_object drr_object; struct drr_freeobjects drr_freeobjects; struct drr_write drr_write; struct drr_free drr_free; struct drr_write_byref drr_write_byref; struct drr_spill drr_spill; struct drr_write_embedded { uint64_t drr_object; uint64_t drr_offset; /* logical length, should equal blocksize */ uint64_t drr_length; uint64_t drr_toguid; uint8_t drr_compression; uint8_t drr_etype; uint8_t drr_pad[6]; uint32_t drr_lsize; /* uncompressed size of payload */ uint32_t drr_psize; /* compr. (real) size of payload */ /* (possibly compressed) content follows */ } drr_write_embedded; /* * Nore: drr_checksum is overlaid with all record types * except DRR_BEGIN. Therefore its (non-pad) members * must not overlap with members from the other structs. * We accomplish this by putting its members at the very * end of the struct. */ struct drr_checksum { uint64_t drr_pad[34]; /* * fletcher-4 checksum of everything preceding the * checksum. */ zio_cksum_t drr_checksum; } drr_checksum; } drr_u; } dmu_replay_record_t; /* diff record range types */ typedef enum diff_type { DDR_NONE = 0x1, DDR_INUSE = 0x2, DDR_FREE = 0x4 } diff_type_t; /* * The diff reports back ranges of free or in-use objects. */ typedef struct dmu_diff_record { uint64_t ddr_type; uint64_t ddr_first; uint64_t ddr_last; } dmu_diff_record_t; typedef struct zinject_record { uint64_t zi_objset; uint64_t zi_object; uint64_t zi_start; uint64_t zi_end; uint64_t zi_guid; uint32_t zi_level; uint32_t zi_error; uint64_t zi_type; uint32_t zi_freq; uint32_t zi_failfast; char zi_func[MAXNAMELEN]; uint32_t zi_iotype; int32_t zi_duration; uint64_t zi_timer; + uint64_t zi_nlanes; uint32_t zi_cmd; uint32_t zi_pad; } zinject_record_t; #define ZINJECT_NULL 0x1 #define ZINJECT_FLUSH_ARC 0x2 #define ZINJECT_UNLOAD_SPA 0x4 typedef enum zinject_type { ZINJECT_UNINITIALIZED, ZINJECT_DATA_FAULT, ZINJECT_DEVICE_FAULT, ZINJECT_LABEL_FAULT, ZINJECT_IGNORED_WRITES, ZINJECT_PANIC, ZINJECT_DELAY_IO, } zinject_type_t; typedef struct zfs_share { uint64_t z_exportdata; uint64_t z_sharedata; uint64_t z_sharetype; /* 0 = share, 1 = unshare */ uint64_t z_sharemax; /* max length of share string */ } zfs_share_t; /* * ZFS file systems may behave the usual, POSIX-compliant way, where * name lookups are case-sensitive. They may also be set up so that * all the name lookups are case-insensitive, or so that only some * lookups, the ones that set an FIGNORECASE flag, are case-insensitive. */ typedef enum zfs_case { ZFS_CASE_SENSITIVE, ZFS_CASE_INSENSITIVE, ZFS_CASE_MIXED } zfs_case_t; typedef struct zfs_cmd { char zc_name[MAXPATHLEN]; /* name of pool or dataset */ uint64_t zc_nvlist_src; /* really (char *) */ uint64_t zc_nvlist_src_size; uint64_t zc_nvlist_dst; /* really (char *) */ uint64_t zc_nvlist_dst_size; boolean_t zc_nvlist_dst_filled; /* put an nvlist in dst? */ int zc_pad2; /* * The following members are for legacy ioctls which haven't been * converted to the new method. */ uint64_t zc_history; /* really (char *) */ char zc_value[MAXPATHLEN * 2]; char zc_string[MAXNAMELEN]; uint64_t zc_guid; uint64_t zc_nvlist_conf; /* really (char *) */ uint64_t zc_nvlist_conf_size; uint64_t zc_cookie; uint64_t zc_objset_type; uint64_t zc_perm_action; uint64_t zc_history_len; uint64_t zc_history_offset; uint64_t zc_obj; uint64_t zc_iflags; /* internal to zfs(7fs) */ zfs_share_t zc_share; uint64_t zc_jailid; dmu_objset_stats_t zc_objset_stats; dmu_replay_record_t zc_begin_record; zinject_record_t zc_inject_record; uint32_t zc_defer_destroy; uint32_t zc_flags; uint64_t zc_action_handle; int zc_cleanup_fd; uint8_t zc_simple; boolean_t zc_resumable; uint64_t zc_sendobj; uint64_t zc_fromobj; uint64_t zc_createtxg; zfs_stat_t zc_stat; } zfs_cmd_t; typedef struct zfs_useracct { char zu_domain[256]; uid_t zu_rid; uint32_t zu_pad; uint64_t zu_space; } zfs_useracct_t; #define ZFSDEV_MAX_MINOR (1 << 16) #define ZFS_MIN_MINOR (ZFSDEV_MAX_MINOR + 1) #define ZPOOL_EXPORT_AFTER_SPLIT 0x1 #ifdef _KERNEL typedef struct zfs_creat { nvlist_t *zct_zplprops; nvlist_t *zct_props; } zfs_creat_t; extern int zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr); extern int zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr); extern int zfs_secpolicy_destroy_perms(const char *name, cred_t *cr); extern int zfs_busy(void); extern int zfs_unmount_snap(const char *); extern void zfs_destroy_unmount_origin(const char *); /* * ZFS minor numbers can refer to either a control device instance or * a zvol. Depending on the value of zss_type, zss_data points to either * a zvol_state_t or a zfs_onexit_t. */ enum zfs_soft_state_type { ZSST_ZVOL, ZSST_CTLDEV }; typedef struct zfs_soft_state { enum zfs_soft_state_type zss_type; void *zss_data; } zfs_soft_state_t; extern void *zfsdev_get_soft_state(minor_t minor, enum zfs_soft_state_type which); extern minor_t zfsdev_minor_alloc(void); extern void *zfsdev_state; #endif /* _KERNEL */ #ifdef __cplusplus } #endif #endif /* _SYS_ZFS_IOCTL_H */ Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h (revision 296509) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h (revision 296510) @@ -1,643 +1,646 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012, 2014 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. */ #ifndef _ZIO_H #define _ZIO_H #include #include #include #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif /* * Embedded checksum */ #define ZEC_MAGIC 0x210da7ab10c7a11ULL typedef struct zio_eck { uint64_t zec_magic; /* for validation, endianness */ zio_cksum_t zec_cksum; /* 256-bit checksum */ } zio_eck_t; /* * Gang block headers are self-checksumming and contain an array * of block pointers. */ #define SPA_GANGBLOCKSIZE SPA_MINBLOCKSIZE #define SPA_GBH_NBLKPTRS ((SPA_GANGBLOCKSIZE - \ sizeof (zio_eck_t)) / sizeof (blkptr_t)) #define SPA_GBH_FILLER ((SPA_GANGBLOCKSIZE - \ sizeof (zio_eck_t) - \ (SPA_GBH_NBLKPTRS * sizeof (blkptr_t))) /\ sizeof (uint64_t)) typedef struct zio_gbh { blkptr_t zg_blkptr[SPA_GBH_NBLKPTRS]; uint64_t zg_filler[SPA_GBH_FILLER]; zio_eck_t zg_tail; } zio_gbh_phys_t; enum zio_checksum { ZIO_CHECKSUM_INHERIT = 0, ZIO_CHECKSUM_ON, ZIO_CHECKSUM_OFF, ZIO_CHECKSUM_LABEL, ZIO_CHECKSUM_GANG_HEADER, ZIO_CHECKSUM_ZILOG, ZIO_CHECKSUM_FLETCHER_2, ZIO_CHECKSUM_FLETCHER_4, ZIO_CHECKSUM_SHA256, ZIO_CHECKSUM_ZILOG2, ZIO_CHECKSUM_NOPARITY, #ifdef illumos ZIO_CHECKSUM_SHA512, ZIO_CHECKSUM_SKEIN, ZIO_CHECKSUM_EDONR, #endif ZIO_CHECKSUM_FUNCTIONS }; /* * The number of "legacy" compression functions which can be set on individual * objects. */ #define ZIO_CHECKSUM_LEGACY_FUNCTIONS ZIO_CHECKSUM_ZILOG2 #define ZIO_CHECKSUM_ON_VALUE ZIO_CHECKSUM_FLETCHER_4 #define ZIO_CHECKSUM_DEFAULT ZIO_CHECKSUM_ON #define ZIO_CHECKSUM_MASK 0xffULL #define ZIO_CHECKSUM_VERIFY (1 << 8) #define ZIO_DEDUPCHECKSUM ZIO_CHECKSUM_SHA256 #define ZIO_DEDUPDITTO_MIN 100 enum zio_compress { ZIO_COMPRESS_INHERIT = 0, ZIO_COMPRESS_ON, ZIO_COMPRESS_OFF, ZIO_COMPRESS_LZJB, ZIO_COMPRESS_EMPTY, ZIO_COMPRESS_GZIP_1, ZIO_COMPRESS_GZIP_2, ZIO_COMPRESS_GZIP_3, ZIO_COMPRESS_GZIP_4, ZIO_COMPRESS_GZIP_5, ZIO_COMPRESS_GZIP_6, ZIO_COMPRESS_GZIP_7, ZIO_COMPRESS_GZIP_8, ZIO_COMPRESS_GZIP_9, ZIO_COMPRESS_ZLE, ZIO_COMPRESS_LZ4, ZIO_COMPRESS_FUNCTIONS }; /* * The number of "legacy" compression functions which can be set on individual * objects. */ #define ZIO_COMPRESS_LEGACY_FUNCTIONS ZIO_COMPRESS_LZ4 /* * The meaning of "compress = on" selected by the compression features enabled * on a given pool. */ #define ZIO_COMPRESS_LEGACY_ON_VALUE ZIO_COMPRESS_LZJB #define ZIO_COMPRESS_LZ4_ON_VALUE ZIO_COMPRESS_LZ4 #define ZIO_COMPRESS_DEFAULT ZIO_COMPRESS_OFF #define BOOTFS_COMPRESS_VALID(compress) \ ((compress) == ZIO_COMPRESS_LZJB || \ (compress) == ZIO_COMPRESS_LZ4 || \ (compress) == ZIO_COMPRESS_ON || \ (compress) == ZIO_COMPRESS_OFF) #define ZIO_FAILURE_MODE_WAIT 0 #define ZIO_FAILURE_MODE_CONTINUE 1 #define ZIO_FAILURE_MODE_PANIC 2 enum zio_flag { /* * Flags inherited by gang, ddt, and vdev children, * and that must be equal for two zios to aggregate */ ZIO_FLAG_DONT_AGGREGATE = 1 << 0, ZIO_FLAG_IO_REPAIR = 1 << 1, ZIO_FLAG_SELF_HEAL = 1 << 2, ZIO_FLAG_RESILVER = 1 << 3, ZIO_FLAG_SCRUB = 1 << 4, ZIO_FLAG_SCAN_THREAD = 1 << 5, ZIO_FLAG_PHYSICAL = 1 << 6, #define ZIO_FLAG_AGG_INHERIT (ZIO_FLAG_CANFAIL - 1) /* * Flags inherited by ddt, gang, and vdev children. */ ZIO_FLAG_CANFAIL = 1 << 7, /* must be first for INHERIT */ ZIO_FLAG_SPECULATIVE = 1 << 8, ZIO_FLAG_CONFIG_WRITER = 1 << 9, ZIO_FLAG_DONT_RETRY = 1 << 10, ZIO_FLAG_DONT_CACHE = 1 << 11, ZIO_FLAG_NODATA = 1 << 12, ZIO_FLAG_INDUCE_DAMAGE = 1 << 13, #define ZIO_FLAG_DDT_INHERIT (ZIO_FLAG_IO_RETRY - 1) #define ZIO_FLAG_GANG_INHERIT (ZIO_FLAG_IO_RETRY - 1) /* * Flags inherited by vdev children. */ ZIO_FLAG_IO_RETRY = 1 << 14, /* must be first for INHERIT */ ZIO_FLAG_PROBE = 1 << 15, ZIO_FLAG_TRYHARD = 1 << 16, ZIO_FLAG_OPTIONAL = 1 << 17, #define ZIO_FLAG_VDEV_INHERIT (ZIO_FLAG_DONT_QUEUE - 1) /* * Flags not inherited by any children. */ ZIO_FLAG_DONT_QUEUE = 1 << 18, /* must be first for INHERIT */ ZIO_FLAG_DONT_PROPAGATE = 1 << 19, ZIO_FLAG_IO_BYPASS = 1 << 20, ZIO_FLAG_IO_REWRITE = 1 << 21, ZIO_FLAG_RAW = 1 << 22, ZIO_FLAG_GANG_CHILD = 1 << 23, ZIO_FLAG_DDT_CHILD = 1 << 24, ZIO_FLAG_GODFATHER = 1 << 25, ZIO_FLAG_NOPWRITE = 1 << 26, ZIO_FLAG_REEXECUTED = 1 << 27, ZIO_FLAG_DELEGATED = 1 << 28, }; #define ZIO_FLAG_MUSTSUCCEED 0 #define ZIO_DDT_CHILD_FLAGS(zio) \ (((zio)->io_flags & ZIO_FLAG_DDT_INHERIT) | \ ZIO_FLAG_DDT_CHILD | ZIO_FLAG_CANFAIL) #define ZIO_GANG_CHILD_FLAGS(zio) \ (((zio)->io_flags & ZIO_FLAG_GANG_INHERIT) | \ ZIO_FLAG_GANG_CHILD | ZIO_FLAG_CANFAIL) #define ZIO_VDEV_CHILD_FLAGS(zio) \ (((zio)->io_flags & ZIO_FLAG_VDEV_INHERIT) | \ ZIO_FLAG_CANFAIL) enum zio_child { ZIO_CHILD_VDEV = 0, ZIO_CHILD_GANG, ZIO_CHILD_DDT, ZIO_CHILD_LOGICAL, ZIO_CHILD_TYPES }; enum zio_wait_type { ZIO_WAIT_READY = 0, ZIO_WAIT_DONE, ZIO_WAIT_TYPES }; /* * We'll take the number 122 and 123 to indicate checksum errors and * fragmentation. Those doesn't collide with any errno values as they * are greater than ELAST. */ #define ECKSUM 122 #define EFRAGS 123 typedef void zio_done_func_t(zio_t *zio); extern const char *zio_type_name[ZIO_TYPES]; /* * A bookmark is a four-tuple that uniquely * identifies any block in the pool. By convention, the meta-objset (MOS) * is objset 0, and the meta-dnode is object 0. This covers all blocks * except root blocks and ZIL blocks, which are defined as follows: * * Root blocks (objset_phys_t) are object 0, level -1: . * ZIL blocks are bookmarked . * dmu_sync()ed ZIL data blocks are bookmarked . * dnode visit bookmarks are . * * Note: this structure is called a bookmark because its original purpose * was to remember where to resume a pool-wide traverse. * * Note: this structure is passed between userland and the kernel, and is * stored on disk (by virtue of being incorporated into other on-disk * structures, e.g. dsl_scan_phys_t). */ typedef struct zbookmark_phys { uint64_t zb_objset; uint64_t zb_object; int64_t zb_level; uint64_t zb_blkid; } zbookmark_phys_t; #define SET_BOOKMARK(zb, objset, object, level, blkid) \ { \ (zb)->zb_objset = objset; \ (zb)->zb_object = object; \ (zb)->zb_level = level; \ (zb)->zb_blkid = blkid; \ } #define ZB_DESTROYED_OBJSET (-1ULL) #define ZB_ROOT_OBJECT (0ULL) #define ZB_ROOT_LEVEL (-1LL) #define ZB_ROOT_BLKID (0ULL) #define ZB_ZIL_OBJECT (0ULL) #define ZB_ZIL_LEVEL (-2LL) #define ZB_DNODE_LEVEL (-3LL) #define ZB_DNODE_BLKID (0ULL) #define ZB_IS_ZERO(zb) \ ((zb)->zb_objset == 0 && (zb)->zb_object == 0 && \ (zb)->zb_level == 0 && (zb)->zb_blkid == 0) #define ZB_IS_ROOT(zb) \ ((zb)->zb_object == ZB_ROOT_OBJECT && \ (zb)->zb_level == ZB_ROOT_LEVEL && \ (zb)->zb_blkid == ZB_ROOT_BLKID) typedef struct zio_prop { enum zio_checksum zp_checksum; enum zio_compress zp_compress; dmu_object_type_t zp_type; uint8_t zp_level; uint8_t zp_copies; boolean_t zp_dedup; boolean_t zp_dedup_verify; boolean_t zp_nopwrite; } zio_prop_t; typedef struct zio_cksum_report zio_cksum_report_t; typedef void zio_cksum_finish_f(zio_cksum_report_t *rep, const void *good_data); typedef void zio_cksum_free_f(void *cbdata, size_t size); struct zio_bad_cksum; /* defined in zio_checksum.h */ struct dnode_phys; struct zio_cksum_report { struct zio_cksum_report *zcr_next; nvlist_t *zcr_ereport; nvlist_t *zcr_detector; void *zcr_cbdata; size_t zcr_cbinfo; /* passed to zcr_free() */ uint64_t zcr_align; uint64_t zcr_length; zio_cksum_finish_f *zcr_finish; zio_cksum_free_f *zcr_free; /* internal use only */ struct zio_bad_cksum *zcr_ckinfo; /* information from failure */ }; typedef void zio_vsd_cksum_report_f(zio_t *zio, zio_cksum_report_t *zcr, void *arg); zio_vsd_cksum_report_f zio_vsd_default_cksum_report; typedef struct zio_vsd_ops { zio_done_func_t *vsd_free; zio_vsd_cksum_report_f *vsd_cksum_report; } zio_vsd_ops_t; typedef struct zio_gang_node { zio_gbh_phys_t *gn_gbh; struct zio_gang_node *gn_child[SPA_GBH_NBLKPTRS]; } zio_gang_node_t; typedef zio_t *zio_gang_issue_func_t(zio_t *zio, blkptr_t *bp, zio_gang_node_t *gn, void *data); typedef void zio_transform_func_t(zio_t *zio, void *data, uint64_t size); typedef struct zio_transform { void *zt_orig_data; uint64_t zt_orig_size; uint64_t zt_bufsize; zio_transform_func_t *zt_transform; struct zio_transform *zt_next; } zio_transform_t; typedef int zio_pipe_stage_t(zio_t *zio); /* * The io_reexecute flags are distinct from io_flags because the child must * be able to propagate them to the parent. The normal io_flags are local * to the zio, not protected by any lock, and not modifiable by children; * the reexecute flags are protected by io_lock, modifiable by children, * and always propagated -- even when ZIO_FLAG_DONT_PROPAGATE is set. */ #define ZIO_REEXECUTE_NOW 0x01 #define ZIO_REEXECUTE_SUSPEND 0x02 typedef struct zio_link { zio_t *zl_parent; zio_t *zl_child; list_node_t zl_parent_node; list_node_t zl_child_node; } zio_link_t; /* * Used for TRIM kstat. */ typedef struct zio_trim_stats { /* * Number of bytes successfully TRIMmed. */ kstat_named_t bytes; /* * Number of successful TRIM requests. */ kstat_named_t success; /* * Number of TRIM requests that failed because TRIM is not * supported. */ kstat_named_t unsupported; /* * Number of TRIM requests that failed for other reasons. */ kstat_named_t failed; } zio_trim_stats_t; extern zio_trim_stats_t zio_trim_stats; #define ZIO_TRIM_STAT_INCR(stat, val) \ atomic_add_64(&zio_trim_stats.stat.value.ui64, (val)); #define ZIO_TRIM_STAT_BUMP(stat) \ ZIO_TRIM_STAT_INCR(stat, 1); struct zio { /* Core information about this I/O */ zbookmark_phys_t io_bookmark; zio_prop_t io_prop; zio_type_t io_type; enum zio_child io_child_type; int io_cmd; zio_priority_t io_priority; uint8_t io_reexecute; uint8_t io_state[ZIO_WAIT_TYPES]; uint64_t io_txg; spa_t *io_spa; blkptr_t *io_bp; blkptr_t *io_bp_override; blkptr_t io_bp_copy; list_t io_parent_list; list_t io_child_list; zio_link_t *io_walk_link; zio_t *io_logical; zio_transform_t *io_transform_stack; /* Callback info */ zio_done_func_t *io_ready; zio_done_func_t *io_physdone; zio_done_func_t *io_done; void *io_private; int64_t io_prev_space_delta; /* DMU private */ blkptr_t io_bp_orig; /* Data represented by this I/O */ void *io_data; void *io_orig_data; uint64_t io_size; uint64_t io_orig_size; /* Stuff for the vdev stack */ vdev_t *io_vd; void *io_vsd; const zio_vsd_ops_t *io_vsd_ops; uint64_t io_offset; hrtime_t io_timestamp; + hrtime_t io_target_timestamp; avl_node_t io_queue_node; avl_node_t io_offset_node; /* Internal pipeline state */ enum zio_flag io_flags; enum zio_stage io_stage; enum zio_stage io_pipeline; enum zio_flag io_orig_flags; enum zio_stage io_orig_stage; enum zio_stage io_orig_pipeline; int io_error; int io_child_error[ZIO_CHILD_TYPES]; uint64_t io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES]; uint64_t io_child_count; uint64_t io_phys_children; uint64_t io_parent_count; uint64_t *io_stall; zio_t *io_gang_leader; zio_gang_node_t *io_gang_tree; void *io_executor; void *io_waiter; kmutex_t io_lock; kcondvar_t io_cv; /* FMA state */ zio_cksum_report_t *io_cksum_report; uint64_t io_ena; /* Taskq dispatching state */ taskq_ent_t io_tqent; avl_node_t io_trim_node; list_node_t io_trim_link; }; extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done, void *priv, enum zio_flag flags); extern zio_t *zio_root(spa_t *spa, zio_done_func_t *done, void *priv, enum zio_flag flags); extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, void *data, uint64_t size, zio_done_func_t *done, void *priv, zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb); extern zio_t *zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data, uint64_t size, const zio_prop_t *zp, zio_done_func_t *ready, zio_done_func_t *physdone, zio_done_func_t *done, void *priv, zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb); extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data, uint64_t size, zio_done_func_t *done, void *priv, zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb); extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite); extern void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp); extern zio_t *zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio_done_func_t *done, void *priv, enum zio_flag flags); extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, uint64_t offset, uint64_t size, zio_done_func_t *done, void *priv, zio_priority_t priority, enum zio_flag flags); extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, void *data, int checksum, zio_done_func_t *done, void *priv, zio_priority_t priority, enum zio_flag flags, boolean_t labels); extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, void *data, int checksum, zio_done_func_t *done, void *priv, zio_priority_t priority, enum zio_flag flags, boolean_t labels); extern zio_t *zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, uint64_t size, enum zio_flag flags); extern int zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp, uint64_t size, boolean_t use_slog); extern void zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp); extern void zio_flush(zio_t *zio, vdev_t *vd); extern zio_t *zio_trim(zio_t *zio, spa_t *spa, vdev_t *vd, uint64_t offset, uint64_t size); extern void zio_shrink(zio_t *zio, uint64_t size); extern int zio_wait(zio_t *zio); extern void zio_nowait(zio_t *zio); extern void zio_execute(zio_t *zio); extern void zio_interrupt(zio_t *zio); +extern void zio_delay_init(zio_t *zio); +extern void zio_delay_interrupt(zio_t *zio); extern zio_t *zio_walk_parents(zio_t *cio); extern zio_t *zio_walk_children(zio_t *pio); extern zio_t *zio_unique_parent(zio_t *cio); extern void zio_add_child(zio_t *pio, zio_t *cio); extern void *zio_buf_alloc(size_t size); extern void zio_buf_free(void *buf, size_t size); extern void *zio_data_buf_alloc(size_t size); extern void zio_data_buf_free(void *buf, size_t size); extern void zio_resubmit_stage_async(void *); extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, uint64_t offset, void *data, uint64_t size, int type, zio_priority_t priority, enum zio_flag flags, zio_done_func_t *done, void *priv); extern zio_t *zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size, int type, zio_priority_t priority, enum zio_flag flags, zio_done_func_t *done, void *priv); extern void zio_vdev_io_bypass(zio_t *zio); extern void zio_vdev_io_reissue(zio_t *zio); extern void zio_vdev_io_redone(zio_t *zio); extern void zio_checksum_verified(zio_t *zio); extern int zio_worst_error(int e1, int e2); extern enum zio_checksum zio_checksum_select(enum zio_checksum child, enum zio_checksum parent); extern enum zio_checksum zio_checksum_dedup_select(spa_t *spa, enum zio_checksum child, enum zio_checksum parent); extern enum zio_compress zio_compress_select(spa_t *spa, enum zio_compress child, enum zio_compress parent); extern void zio_suspend(spa_t *spa, zio_t *zio); extern int zio_resume(spa_t *spa); extern void zio_resume_wait(spa_t *spa); /* * Initial setup and teardown. */ extern void zio_init(void); extern void zio_fini(void); /* * Fault injection */ struct zinject_record; extern uint32_t zio_injection_enabled; extern int zio_inject_fault(char *name, int flags, int *id, struct zinject_record *record); extern int zio_inject_list_next(int *id, char *name, size_t buflen, struct zinject_record *record); extern int zio_clear_fault(int id); extern void zio_handle_panic_injection(spa_t *spa, char *tag, uint64_t type); extern int zio_handle_fault_injection(zio_t *zio, int error); extern int zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error); extern int zio_handle_label_injection(zio_t *zio, int error); extern void zio_handle_ignored_writes(zio_t *zio); -extern uint64_t zio_handle_io_delay(zio_t *zio); +extern hrtime_t zio_handle_io_delay(zio_t *zio); /* * Checksum ereport functions */ extern void zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, struct zio *zio, uint64_t offset, uint64_t length, void *arg, struct zio_bad_cksum *info); extern void zfs_ereport_finish_checksum(zio_cksum_report_t *report, const void *good_data, const void *bad_data, boolean_t drop_if_identical); extern void zfs_ereport_send_interim_checksum(zio_cksum_report_t *report); extern void zfs_ereport_free_checksum(zio_cksum_report_t *report); /* If we have the good data in hand, this function can be used */ extern void zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, struct zio *zio, uint64_t offset, uint64_t length, const void *good_data, const void *bad_data, struct zio_bad_cksum *info); /* Called from spa_sync(), but primarily an injection handler */ extern void spa_handle_ignored_writes(spa_t *spa); /* zbookmark_phys functions */ boolean_t zbookmark_subtree_completed(const struct dnode_phys *dnp, const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block); int zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, uint8_t ibs2, const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2); #ifdef __cplusplus } #endif #endif /* _ZIO_H */ Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c (revision 296509) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c (revision 296510) @@ -1,943 +1,944 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. * Copyright 2013 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2013 Joyent, Inc. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include /* * Virtual device vector for disks. */ extern ldi_ident_t zfs_li; static void vdev_disk_close(vdev_t *); typedef struct vdev_disk_ldi_cb { list_node_t lcb_next; ldi_callback_id_t lcb_id; } vdev_disk_ldi_cb_t; static void vdev_disk_alloc(vdev_t *vd) { vdev_disk_t *dvd; dvd = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP); /* * Create the LDI event callback list. */ list_create(&dvd->vd_ldi_cbs, sizeof (vdev_disk_ldi_cb_t), offsetof(vdev_disk_ldi_cb_t, lcb_next)); } static void vdev_disk_free(vdev_t *vd) { vdev_disk_t *dvd = vd->vdev_tsd; vdev_disk_ldi_cb_t *lcb; if (dvd == NULL) return; /* * We have already closed the LDI handle. Clean up the LDI event * callbacks and free vd->vdev_tsd. */ while ((lcb = list_head(&dvd->vd_ldi_cbs)) != NULL) { list_remove(&dvd->vd_ldi_cbs, lcb); (void) ldi_ev_remove_callbacks(lcb->lcb_id); kmem_free(lcb, sizeof (vdev_disk_ldi_cb_t)); } list_destroy(&dvd->vd_ldi_cbs); kmem_free(dvd, sizeof (vdev_disk_t)); vd->vdev_tsd = NULL; } /* ARGSUSED */ static int vdev_disk_off_notify(ldi_handle_t lh, ldi_ev_cookie_t ecookie, void *arg, void *ev_data) { vdev_t *vd = (vdev_t *)arg; vdev_disk_t *dvd = vd->vdev_tsd; /* * Ignore events other than offline. */ if (strcmp(ldi_ev_get_type(ecookie), LDI_EV_OFFLINE) != 0) return (LDI_EV_SUCCESS); /* * All LDI handles must be closed for the state change to succeed, so * call on vdev_disk_close() to do this. * * We inform vdev_disk_close that it is being called from offline * notify context so it will defer cleanup of LDI event callbacks and * freeing of vd->vdev_tsd to the offline finalize or a reopen. */ dvd->vd_ldi_offline = B_TRUE; vdev_disk_close(vd); /* * Now that the device is closed, request that the spa_async_thread * mark the device as REMOVED and notify FMA of the removal. */ zfs_post_remove(vd->vdev_spa, vd); vd->vdev_remove_wanted = B_TRUE; spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE); return (LDI_EV_SUCCESS); } /* ARGSUSED */ static void vdev_disk_off_finalize(ldi_handle_t lh, ldi_ev_cookie_t ecookie, int ldi_result, void *arg, void *ev_data) { vdev_t *vd = (vdev_t *)arg; /* * Ignore events other than offline. */ if (strcmp(ldi_ev_get_type(ecookie), LDI_EV_OFFLINE) != 0) return; /* * We have already closed the LDI handle in notify. * Clean up the LDI event callbacks and free vd->vdev_tsd. */ vdev_disk_free(vd); /* * Request that the vdev be reopened if the offline state change was * unsuccessful. */ if (ldi_result != LDI_EV_SUCCESS) { vd->vdev_probe_wanted = B_TRUE; spa_async_request(vd->vdev_spa, SPA_ASYNC_PROBE); } } static ldi_ev_callback_t vdev_disk_off_callb = { .cb_vers = LDI_EV_CB_VERS, .cb_notify = vdev_disk_off_notify, .cb_finalize = vdev_disk_off_finalize }; /* ARGSUSED */ static void vdev_disk_dgrd_finalize(ldi_handle_t lh, ldi_ev_cookie_t ecookie, int ldi_result, void *arg, void *ev_data) { vdev_t *vd = (vdev_t *)arg; /* * Ignore events other than degrade. */ if (strcmp(ldi_ev_get_type(ecookie), LDI_EV_DEGRADE) != 0) return; /* * Degrade events always succeed. Mark the vdev as degraded. * This status is purely informative for the user. */ (void) vdev_degrade(vd->vdev_spa, vd->vdev_guid, 0); } static ldi_ev_callback_t vdev_disk_dgrd_callb = { .cb_vers = LDI_EV_CB_VERS, .cb_notify = NULL, .cb_finalize = vdev_disk_dgrd_finalize }; static void vdev_disk_hold(vdev_t *vd) { ddi_devid_t devid; char *minor; ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER)); /* * We must have a pathname, and it must be absolute. */ if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') return; /* * Only prefetch path and devid info if the device has * never been opened. */ if (vd->vdev_tsd != NULL) return; if (vd->vdev_wholedisk == -1ULL) { size_t len = strlen(vd->vdev_path) + 3; char *buf = kmem_alloc(len, KM_SLEEP); (void) snprintf(buf, len, "%ss0", vd->vdev_path); (void) ldi_vp_from_name(buf, &vd->vdev_name_vp); kmem_free(buf, len); } if (vd->vdev_name_vp == NULL) (void) ldi_vp_from_name(vd->vdev_path, &vd->vdev_name_vp); if (vd->vdev_devid != NULL && ddi_devid_str_decode(vd->vdev_devid, &devid, &minor) == 0) { (void) ldi_vp_from_devid(devid, minor, &vd->vdev_devid_vp); ddi_devid_str_free(minor); ddi_devid_free(devid); } } static void vdev_disk_rele(vdev_t *vd) { ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER)); if (vd->vdev_name_vp) { VN_RELE_ASYNC(vd->vdev_name_vp, dsl_pool_vnrele_taskq(vd->vdev_spa->spa_dsl_pool)); vd->vdev_name_vp = NULL; } if (vd->vdev_devid_vp) { VN_RELE_ASYNC(vd->vdev_devid_vp, dsl_pool_vnrele_taskq(vd->vdev_spa->spa_dsl_pool)); vd->vdev_devid_vp = NULL; } } static uint64_t vdev_disk_get_space(vdev_t *vd, uint64_t capacity, uint_t blksz) { ASSERT(vd->vdev_wholedisk); vdev_disk_t *dvd = vd->vdev_tsd; dk_efi_t dk_ioc; efi_gpt_t *efi; uint64_t avail_space = 0; int efisize = EFI_LABEL_SIZE * 2; dk_ioc.dki_data = kmem_alloc(efisize, KM_SLEEP); dk_ioc.dki_lba = 1; dk_ioc.dki_length = efisize; dk_ioc.dki_data_64 = (uint64_t)(uintptr_t)dk_ioc.dki_data; efi = dk_ioc.dki_data; if (ldi_ioctl(dvd->vd_lh, DKIOCGETEFI, (intptr_t)&dk_ioc, FKIOCTL, kcred, NULL) == 0) { uint64_t efi_altern_lba = LE_64(efi->efi_gpt_AlternateLBA); if (capacity > efi_altern_lba) avail_space = (capacity - efi_altern_lba) * blksz; } kmem_free(dk_ioc.dki_data, efisize); return (avail_space); } /* * We want to be loud in DEBUG kernels when DKIOCGMEDIAINFOEXT fails, or when * even a fallback to DKIOCGMEDIAINFO fails. */ #ifdef DEBUG #define VDEV_DEBUG(...) cmn_err(CE_NOTE, __VA_ARGS__) #else #define VDEV_DEBUG(...) /* Nothing... */ #endif static int vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, uint64_t *ashift) { spa_t *spa = vd->vdev_spa; vdev_disk_t *dvd = vd->vdev_tsd; ldi_ev_cookie_t ecookie; vdev_disk_ldi_cb_t *lcb; union { struct dk_minfo_ext ude; struct dk_minfo ud; } dks; struct dk_minfo_ext *dkmext = &dks.ude; struct dk_minfo *dkm = &dks.ud; int error; dev_t dev; int otyp; boolean_t validate_devid = B_FALSE; ddi_devid_t devid; uint64_t capacity = 0, blksz = 0, pbsize; /* * We must have a pathname, and it must be absolute. */ if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') { vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; return (SET_ERROR(EINVAL)); } /* * Reopen the device if it's not currently open. Otherwise, * just update the physical size of the device. */ if (dvd != NULL) { if (dvd->vd_ldi_offline && dvd->vd_lh == NULL) { /* * If we are opening a device in its offline notify * context, the LDI handle was just closed. Clean * up the LDI event callbacks and free vd->vdev_tsd. */ vdev_disk_free(vd); } else { ASSERT(vd->vdev_reopening); goto skip_open; } } /* * Create vd->vdev_tsd. */ vdev_disk_alloc(vd); dvd = vd->vdev_tsd; /* * When opening a disk device, we want to preserve the user's original * intent. We always want to open the device by the path the user gave * us, even if it is one of multiple paths to the save device. But we * also want to be able to survive disks being removed/recabled. * Therefore the sequence of opening devices is: * * 1. Try opening the device by path. For legacy pools without the * 'whole_disk' property, attempt to fix the path by appending 's0'. * * 2. If the devid of the device matches the stored value, return * success. * * 3. Otherwise, the device may have moved. Try opening the device * by the devid instead. */ if (vd->vdev_devid != NULL) { if (ddi_devid_str_decode(vd->vdev_devid, &dvd->vd_devid, &dvd->vd_minor) != 0) { vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; return (SET_ERROR(EINVAL)); } } error = EINVAL; /* presume failure */ if (vd->vdev_path != NULL) { if (vd->vdev_wholedisk == -1ULL) { size_t len = strlen(vd->vdev_path) + 3; char *buf = kmem_alloc(len, KM_SLEEP); (void) snprintf(buf, len, "%ss0", vd->vdev_path); error = ldi_open_by_name(buf, spa_mode(spa), kcred, &dvd->vd_lh, zfs_li); if (error == 0) { spa_strfree(vd->vdev_path); vd->vdev_path = buf; vd->vdev_wholedisk = 1ULL; } else { kmem_free(buf, len); } } /* * If we have not yet opened the device, try to open it by the * specified path. */ if (error != 0) { error = ldi_open_by_name(vd->vdev_path, spa_mode(spa), kcred, &dvd->vd_lh, zfs_li); } /* * Compare the devid to the stored value. */ if (error == 0 && vd->vdev_devid != NULL && ldi_get_devid(dvd->vd_lh, &devid) == 0) { if (ddi_devid_compare(devid, dvd->vd_devid) != 0) { error = SET_ERROR(EINVAL); (void) ldi_close(dvd->vd_lh, spa_mode(spa), kcred); dvd->vd_lh = NULL; } ddi_devid_free(devid); } /* * If we succeeded in opening the device, but 'vdev_wholedisk' * is not yet set, then this must be a slice. */ if (error == 0 && vd->vdev_wholedisk == -1ULL) vd->vdev_wholedisk = 0; } /* * If we were unable to open by path, or the devid check fails, open by * devid instead. */ if (error != 0 && vd->vdev_devid != NULL) { error = ldi_open_by_devid(dvd->vd_devid, dvd->vd_minor, spa_mode(spa), kcred, &dvd->vd_lh, zfs_li); } /* * If all else fails, then try opening by physical path (if available) * or the logical path (if we failed due to the devid check). While not * as reliable as the devid, this will give us something, and the higher * level vdev validation will prevent us from opening the wrong device. */ if (error) { if (vd->vdev_devid != NULL) validate_devid = B_TRUE; if (vd->vdev_physpath != NULL && (dev = ddi_pathname_to_dev_t(vd->vdev_physpath)) != NODEV) error = ldi_open_by_dev(&dev, OTYP_BLK, spa_mode(spa), kcred, &dvd->vd_lh, zfs_li); /* * Note that we don't support the legacy auto-wholedisk support * as above. This hasn't been used in a very long time and we * don't need to propagate its oddities to this edge condition. */ if (error && vd->vdev_path != NULL) error = ldi_open_by_name(vd->vdev_path, spa_mode(spa), kcred, &dvd->vd_lh, zfs_li); } if (error) { vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; return (error); } /* * Now that the device has been successfully opened, update the devid * if necessary. */ if (validate_devid && spa_writeable(spa) && ldi_get_devid(dvd->vd_lh, &devid) == 0) { if (ddi_devid_compare(devid, dvd->vd_devid) != 0) { char *vd_devid; vd_devid = ddi_devid_str_encode(devid, dvd->vd_minor); zfs_dbgmsg("vdev %s: update devid from %s, " "to %s", vd->vdev_path, vd->vdev_devid, vd_devid); spa_strfree(vd->vdev_devid); vd->vdev_devid = spa_strdup(vd_devid); ddi_devid_str_free(vd_devid); } ddi_devid_free(devid); } /* * Once a device is opened, verify that the physical device path (if * available) is up to date. */ if (ldi_get_dev(dvd->vd_lh, &dev) == 0 && ldi_get_otyp(dvd->vd_lh, &otyp) == 0) { char *physpath, *minorname; physpath = kmem_alloc(MAXPATHLEN, KM_SLEEP); minorname = NULL; if (ddi_dev_pathname(dev, otyp, physpath) == 0 && ldi_get_minor_name(dvd->vd_lh, &minorname) == 0 && (vd->vdev_physpath == NULL || strcmp(vd->vdev_physpath, physpath) != 0)) { if (vd->vdev_physpath) spa_strfree(vd->vdev_physpath); (void) strlcat(physpath, ":", MAXPATHLEN); (void) strlcat(physpath, minorname, MAXPATHLEN); vd->vdev_physpath = spa_strdup(physpath); } if (minorname) kmem_free(minorname, strlen(minorname) + 1); kmem_free(physpath, MAXPATHLEN); } /* * Register callbacks for the LDI offline event. */ if (ldi_ev_get_cookie(dvd->vd_lh, LDI_EV_OFFLINE, &ecookie) == LDI_EV_SUCCESS) { lcb = kmem_zalloc(sizeof (vdev_disk_ldi_cb_t), KM_SLEEP); list_insert_tail(&dvd->vd_ldi_cbs, lcb); (void) ldi_ev_register_callbacks(dvd->vd_lh, ecookie, &vdev_disk_off_callb, (void *) vd, &lcb->lcb_id); } /* * Register callbacks for the LDI degrade event. */ if (ldi_ev_get_cookie(dvd->vd_lh, LDI_EV_DEGRADE, &ecookie) == LDI_EV_SUCCESS) { lcb = kmem_zalloc(sizeof (vdev_disk_ldi_cb_t), KM_SLEEP); list_insert_tail(&dvd->vd_ldi_cbs, lcb); (void) ldi_ev_register_callbacks(dvd->vd_lh, ecookie, &vdev_disk_dgrd_callb, (void *) vd, &lcb->lcb_id); } skip_open: /* * Determine the actual size of the device. */ if (ldi_get_size(dvd->vd_lh, psize) != 0) { vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; return (SET_ERROR(EINVAL)); } *max_psize = *psize; /* * Determine the device's minimum transfer size. * If the ioctl isn't supported, assume DEV_BSIZE. */ if ((error = ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFOEXT, (intptr_t)dkmext, FKIOCTL, kcred, NULL)) == 0) { capacity = dkmext->dki_capacity - 1; blksz = dkmext->dki_lbsize; pbsize = dkmext->dki_pbsize; } else if ((error = ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFO, (intptr_t)dkm, FKIOCTL, kcred, NULL)) == 0) { VDEV_DEBUG( "vdev_disk_open(\"%s\"): fallback to DKIOCGMEDIAINFO\n", vd->vdev_path); capacity = dkm->dki_capacity - 1; blksz = dkm->dki_lbsize; pbsize = blksz; } else { VDEV_DEBUG("vdev_disk_open(\"%s\"): " "both DKIOCGMEDIAINFO{,EXT} calls failed, %d\n", vd->vdev_path, error); pbsize = DEV_BSIZE; } *ashift = highbit64(MAX(pbsize, SPA_MINBLOCKSIZE)) - 1; if (vd->vdev_wholedisk == 1) { int wce = 1; if (error == 0) { /* * If we have the capability to expand, we'd have * found out via success from DKIOCGMEDIAINFO{,EXT}. * Adjust max_psize upward accordingly since we know * we own the whole disk now. */ *max_psize += vdev_disk_get_space(vd, capacity, blksz); zfs_dbgmsg("capacity change: vdev %s, psize %llu, " "max_psize %llu", vd->vdev_path, *psize, *max_psize); } /* * Since we own the whole disk, try to enable disk write * caching. We ignore errors because it's OK if we can't do it. */ (void) ldi_ioctl(dvd->vd_lh, DKIOCSETWCE, (intptr_t)&wce, FKIOCTL, kcred, NULL); } /* * Clear the nowritecache bit, so that on a vdev_reopen() we will * try again. */ vd->vdev_nowritecache = B_FALSE; return (0); } static void vdev_disk_close(vdev_t *vd) { vdev_disk_t *dvd = vd->vdev_tsd; if (vd->vdev_reopening || dvd == NULL) return; if (dvd->vd_minor != NULL) { ddi_devid_str_free(dvd->vd_minor); dvd->vd_minor = NULL; } if (dvd->vd_devid != NULL) { ddi_devid_free(dvd->vd_devid); dvd->vd_devid = NULL; } if (dvd->vd_lh != NULL) { (void) ldi_close(dvd->vd_lh, spa_mode(vd->vdev_spa), kcred); dvd->vd_lh = NULL; } vd->vdev_delayed_close = B_FALSE; /* * If we closed the LDI handle due to an offline notify from LDI, * don't free vd->vdev_tsd or unregister the callbacks here; * the offline finalize callback or a reopen will take care of it. */ if (dvd->vd_ldi_offline) return; vdev_disk_free(vd); } int vdev_disk_physio(vdev_t *vd, caddr_t data, size_t size, uint64_t offset, int flags, boolean_t isdump) { vdev_disk_t *dvd = vd->vdev_tsd; /* * If the vdev is closed, it's likely in the REMOVED or FAULTED state. * Nothing to be done here but return failure. */ if (dvd == NULL || (dvd->vd_ldi_offline && dvd->vd_lh == NULL)) return (EIO); ASSERT(vd->vdev_ops == &vdev_disk_ops); /* * If in the context of an active crash dump, use the ldi_dump(9F) * call instead of ldi_strategy(9F) as usual. */ if (isdump) { ASSERT3P(dvd, !=, NULL); return (ldi_dump(dvd->vd_lh, data, lbtodb(offset), lbtodb(size))); } return (vdev_disk_ldi_physio(dvd->vd_lh, data, size, offset, flags)); } int vdev_disk_ldi_physio(ldi_handle_t vd_lh, caddr_t data, size_t size, uint64_t offset, int flags) { buf_t *bp; int error = 0; if (vd_lh == NULL) return (SET_ERROR(EINVAL)); ASSERT(flags & B_READ || flags & B_WRITE); bp = getrbuf(KM_SLEEP); bp->b_flags = flags | B_BUSY | B_NOCACHE | B_FAILFAST; bp->b_bcount = size; bp->b_un.b_addr = (void *)data; bp->b_lblkno = lbtodb(offset); bp->b_bufsize = size; error = ldi_strategy(vd_lh, bp); ASSERT(error == 0); if ((error = biowait(bp)) == 0 && bp->b_resid != 0) error = SET_ERROR(EIO); freerbuf(bp); return (error); } static void vdev_disk_io_intr(buf_t *bp) { vdev_buf_t *vb = (vdev_buf_t *)bp; zio_t *zio = vb->vb_io; /* * The rest of the zio stack only deals with EIO, ECKSUM, and ENXIO. * Rather than teach the rest of the stack about other error * possibilities (EFAULT, etc), we normalize the error value here. */ zio->io_error = (geterror(bp) != 0 ? SET_ERROR(EIO) : 0); if (zio->io_error == 0 && bp->b_resid != 0) zio->io_error = SET_ERROR(EIO); kmem_free(vb, sizeof (vdev_buf_t)); - zio_interrupt(zio); + zio_delay_interrupt(zio); } static void vdev_disk_ioctl_free(zio_t *zio) { kmem_free(zio->io_vsd, sizeof (struct dk_callback)); } static const zio_vsd_ops_t vdev_disk_vsd_ops = { vdev_disk_ioctl_free, zio_vsd_default_cksum_report }; static void vdev_disk_ioctl_done(void *zio_arg, int error) { zio_t *zio = zio_arg; zio->io_error = error; zio_interrupt(zio); } static void vdev_disk_io_start(zio_t *zio) { vdev_t *vd = zio->io_vd; vdev_disk_t *dvd = vd->vdev_tsd; vdev_buf_t *vb; struct dk_callback *dkc; buf_t *bp; int error; /* * If the vdev is closed, it's likely in the REMOVED or FAULTED state. * Nothing to be done here but return failure. */ if (dvd == NULL || (dvd->vd_ldi_offline && dvd->vd_lh == NULL)) { zio->io_error = SET_ERROR(ENXIO); zio_interrupt(zio); return; } if (zio->io_type == ZIO_TYPE_IOCTL) { /* XXPOLICY */ if (!vdev_readable(vd)) { zio->io_error = SET_ERROR(ENXIO); zio_interrupt(zio); return; } switch (zio->io_cmd) { case DKIOCFLUSHWRITECACHE: if (zfs_nocacheflush) break; if (vd->vdev_nowritecache) { zio->io_error = SET_ERROR(ENOTSUP); break; } zio->io_vsd = dkc = kmem_alloc(sizeof (*dkc), KM_SLEEP); zio->io_vsd_ops = &vdev_disk_vsd_ops; dkc->dkc_callback = vdev_disk_ioctl_done; dkc->dkc_flag = FLUSH_VOLATILE; dkc->dkc_cookie = zio; error = ldi_ioctl(dvd->vd_lh, zio->io_cmd, (uintptr_t)dkc, FKIOCTL, kcred, NULL); if (error == 0) { /* * The ioctl will be done asychronously, * and will call vdev_disk_ioctl_done() * upon completion. */ return; } if (error == ENOTSUP || error == ENOTTY) { /* * If we get ENOTSUP or ENOTTY, we know that * no future attempts will ever succeed. * In this case we set a persistent bit so * that we don't bother with the ioctl in the * future. */ vd->vdev_nowritecache = B_TRUE; } zio->io_error = error; break; default: zio->io_error = SET_ERROR(ENOTSUP); } zio_execute(zio); return; } ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); + zio->io_target_timestamp = zio_handle_io_delay(zio); vb = kmem_alloc(sizeof (vdev_buf_t), KM_SLEEP); vb->vb_io = zio; bp = &vb->vb_buf; bioinit(bp); bp->b_flags = B_BUSY | B_NOCACHE | (zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE); if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD))) bp->b_flags |= B_FAILFAST; bp->b_bcount = zio->io_size; bp->b_un.b_addr = zio->io_data; bp->b_lblkno = lbtodb(zio->io_offset); bp->b_bufsize = zio->io_size; bp->b_iodone = (int (*)())vdev_disk_io_intr; /* ldi_strategy() will return non-zero only on programming errors */ VERIFY(ldi_strategy(dvd->vd_lh, bp) == 0); } static void vdev_disk_io_done(zio_t *zio) { vdev_t *vd = zio->io_vd; /* * If the device returned EIO, then attempt a DKIOCSTATE ioctl to see if * the device has been removed. If this is the case, then we trigger an * asynchronous removal of the device. Otherwise, probe the device and * make sure it's still accessible. */ if (zio->io_error == EIO && !vd->vdev_remove_wanted) { vdev_disk_t *dvd = vd->vdev_tsd; int state = DKIO_NONE; if (ldi_ioctl(dvd->vd_lh, DKIOCSTATE, (intptr_t)&state, FKIOCTL, kcred, NULL) == 0 && state != DKIO_INSERTED) { /* * We post the resource as soon as possible, instead of * when the async removal actually happens, because the * DE is using this information to discard previous I/O * errors. */ zfs_post_remove(zio->io_spa, vd); vd->vdev_remove_wanted = B_TRUE; spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE); } else if (!vd->vdev_delayed_close) { vd->vdev_delayed_close = B_TRUE; } } } vdev_ops_t vdev_disk_ops = { vdev_disk_open, vdev_disk_close, vdev_default_asize, vdev_disk_io_start, vdev_disk_io_done, NULL, vdev_disk_hold, vdev_disk_rele, VDEV_TYPE_DISK, /* name of this vdev type */ B_TRUE /* leaf vdev */ }; /* * Given the root disk device devid or pathname, read the label from * the device, and construct a configuration nvlist. */ int vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config) { ldi_handle_t vd_lh; vdev_label_t *label; uint64_t s, size; int l; ddi_devid_t tmpdevid; int error = -1; char *minor_name; /* * Read the device label and build the nvlist. */ if (devid != NULL && ddi_devid_str_decode(devid, &tmpdevid, &minor_name) == 0) { error = ldi_open_by_devid(tmpdevid, minor_name, FREAD, kcred, &vd_lh, zfs_li); ddi_devid_free(tmpdevid); ddi_devid_str_free(minor_name); } if (error && (error = ldi_open_by_name(devpath, FREAD, kcred, &vd_lh, zfs_li))) return (error); if (ldi_get_size(vd_lh, &s)) { (void) ldi_close(vd_lh, FREAD, kcred); return (SET_ERROR(EIO)); } size = P2ALIGN_TYPED(s, sizeof (vdev_label_t), uint64_t); label = kmem_alloc(sizeof (vdev_label_t), KM_SLEEP); *config = NULL; for (l = 0; l < VDEV_LABELS; l++) { uint64_t offset, state, txg = 0; /* read vdev label */ offset = vdev_label_offset(size, l, 0); if (vdev_disk_ldi_physio(vd_lh, (caddr_t)label, VDEV_SKIP_SIZE + VDEV_PHYS_SIZE, offset, B_READ) != 0) continue; if (nvlist_unpack(label->vl_vdev_phys.vp_nvlist, sizeof (label->vl_vdev_phys.vp_nvlist), config, 0) != 0) { *config = NULL; continue; } if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE, &state) != 0 || state >= POOL_STATE_DESTROYED) { nvlist_free(*config); *config = NULL; continue; } if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG, &txg) != 0 || txg == 0) { nvlist_free(*config); *config = NULL; continue; } break; } kmem_free(label, sizeof (vdev_label_t)); (void) ldi_close(vd_lh, FREAD, kcred); if (*config == NULL) error = SET_ERROR(EIDRM); return (error); } Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c (revision 296509) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c (revision 296510) @@ -1,244 +1,245 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2014 by Delphix. All rights reserved. + * Copyright (c) 2011, 2015 by Delphix. All rights reserved. */ #include #include #include #include #include #include #include /* * Virtual device vector for files. */ static void vdev_file_hold(vdev_t *vd) { ASSERT(vd->vdev_path != NULL); } static void vdev_file_rele(vdev_t *vd) { ASSERT(vd->vdev_path != NULL); } static int vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, uint64_t *logical_ashift, uint64_t *physical_ashift) { vdev_file_t *vf; vnode_t *vp; vattr_t vattr; int error; /* * We must have a pathname, and it must be absolute. */ if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') { vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; return (SET_ERROR(EINVAL)); } /* * Reopen the device if it's not currently open. Otherwise, * just update the physical size of the device. */ if (vd->vdev_tsd != NULL) { ASSERT(vd->vdev_reopening); vf = vd->vdev_tsd; vp = vf->vf_vnode; goto skip_open; } vf = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_file_t), KM_SLEEP); /* * We always open the files from the root of the global zone, even if * we're in a local zone. If the user has gotten to this point, the * administrator has already decided that the pool should be available * to local zone users, so the underlying devices should be as well. */ ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/'); error = vn_openat(vd->vdev_path + 1, UIO_SYSSPACE, spa_mode(vd->vdev_spa) | FOFFMAX, 0, &vp, 0, 0, rootdir, -1); if (error) { vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; kmem_free(vd->vdev_tsd, sizeof (vdev_file_t)); vd->vdev_tsd = NULL; return (error); } vf->vf_vnode = vp; #ifdef _KERNEL /* * Make sure it's a regular file. */ if (vp->v_type != VREG) { #ifdef __FreeBSD__ (void) VOP_CLOSE(vp, spa_mode(vd->vdev_spa), 1, 0, kcred, NULL); #endif vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; #ifdef __FreeBSD__ kmem_free(vd->vdev_tsd, sizeof (vdev_file_t)); vd->vdev_tsd = NULL; #endif return (SET_ERROR(ENODEV)); } #endif /* _KERNEL */ skip_open: /* * Determine the physical size of the file. */ vattr.va_mask = AT_SIZE; vn_lock(vp, LK_SHARED | LK_RETRY); error = VOP_GETATTR(vp, &vattr, kcred); VOP_UNLOCK(vp, 0); if (error) { (void) VOP_CLOSE(vp, spa_mode(vd->vdev_spa), 1, 0, kcred, NULL); vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; kmem_free(vd->vdev_tsd, sizeof (vdev_file_t)); vd->vdev_tsd = NULL; return (error); } vd->vdev_notrim = B_TRUE; *max_psize = *psize = vattr.va_size; *logical_ashift = SPA_MINBLOCKSHIFT; *physical_ashift = SPA_MINBLOCKSHIFT; return (0); } static void vdev_file_close(vdev_t *vd) { vdev_file_t *vf = vd->vdev_tsd; if (vd->vdev_reopening || vf == NULL) return; if (vf->vf_vnode != NULL) { (void) VOP_CLOSE(vf->vf_vnode, spa_mode(vd->vdev_spa), 1, 0, kcred, NULL); } vd->vdev_delayed_close = B_FALSE; kmem_free(vf, sizeof (vdev_file_t)); vd->vdev_tsd = NULL; } static void vdev_file_io_start(zio_t *zio) { vdev_t *vd = zio->io_vd; vdev_file_t *vf; vnode_t *vp; ssize_t resid; if (!vdev_readable(vd)) { zio->io_error = SET_ERROR(ENXIO); zio_interrupt(zio); return; } vf = vd->vdev_tsd; vp = vf->vf_vnode; if (zio->io_type == ZIO_TYPE_IOCTL) { switch (zio->io_cmd) { case DKIOCFLUSHWRITECACHE: zio->io_error = VOP_FSYNC(vp, FSYNC | FDSYNC, kcred, NULL); break; default: zio->io_error = SET_ERROR(ENOTSUP); } zio_execute(zio); return; } ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); + zio->io_target_timestamp = zio_handle_io_delay(zio); zio->io_error = vn_rdwr(zio->io_type == ZIO_TYPE_READ ? UIO_READ : UIO_WRITE, vp, zio->io_data, zio->io_size, zio->io_offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid); if (resid != 0 && zio->io_error == 0) zio->io_error = ENOSPC; - zio_interrupt(zio); + zio_delay_interrupt(zio); #ifdef illumos VERIFY3U(taskq_dispatch(system_taskq, vdev_file_io_strategy, bp, TQ_SLEEP), !=, 0); #endif } /* ARGSUSED */ static void vdev_file_io_done(zio_t *zio) { } vdev_ops_t vdev_file_ops = { vdev_file_open, vdev_file_close, vdev_default_asize, vdev_file_io_start, vdev_file_io_done, NULL, vdev_file_hold, vdev_file_rele, VDEV_TYPE_FILE, /* name of this vdev type */ B_TRUE /* leaf vdev */ }; /* * From userland we access disks just like files. */ #ifndef _KERNEL vdev_ops_t vdev_disk_ops = { vdev_file_open, vdev_file_close, vdev_default_asize, vdev_file_io_start, vdev_file_io_done, NULL, vdev_file_hold, vdev_file_rele, VDEV_TYPE_DISK, /* name of this vdev type */ B_TRUE /* leaf vdev */ }; #endif Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c (revision 296509) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c (revision 296510) @@ -1,1001 +1,1002 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2006 Pawel Jakub Dawidek * All rights reserved. * * Portions Copyright (c) 2012 Martin Matuska */ #include #include #include #include #include #include #include #include #include #include #include #include /* * Virtual device vector for GEOM. */ static g_attrchanged_t vdev_geom_attrchanged; struct g_class zfs_vdev_class = { .name = "ZFS::VDEV", .version = G_VERSION, .attrchanged = vdev_geom_attrchanged, }; DECLARE_GEOM_CLASS(zfs_vdev_class, zfs_vdev); SYSCTL_DECL(_vfs_zfs_vdev); /* Don't send BIO_FLUSH. */ static int vdev_geom_bio_flush_disable; SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_flush_disable, CTLFLAG_RWTUN, &vdev_geom_bio_flush_disable, 0, "Disable BIO_FLUSH"); /* Don't send BIO_DELETE. */ static int vdev_geom_bio_delete_disable; SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_delete_disable, CTLFLAG_RWTUN, &vdev_geom_bio_delete_disable, 0, "Disable BIO_DELETE"); /* * Thread local storage used to indicate when a thread is probing geoms * for their guids. If NULL, this thread is not tasting geoms. If non NULL, * it is looking for a replacement for the vdev_t* that is its value. */ uint_t zfs_geom_probe_vdev_key; static void vdev_geom_set_rotation_rate(vdev_t *vd, struct g_consumer *cp) { int error; uint16_t rate; error = g_getattr("GEOM::rotation_rate", cp, &rate); if (error == 0) vd->vdev_rotation_rate = rate; else vd->vdev_rotation_rate = VDEV_RATE_UNKNOWN; } static void vdev_geom_attrchanged(struct g_consumer *cp, const char *attr) { vdev_t *vd; spa_t *spa; char *physpath; int error, physpath_len; vd = cp->private; if (vd == NULL) return; if (strcmp(attr, "GEOM::rotation_rate") == 0) { vdev_geom_set_rotation_rate(vd, cp); return; } if (strcmp(attr, "GEOM::physpath") != 0) return; if (g_access(cp, 1, 0, 0) != 0) return; /* * Record/Update physical path information for this device. */ spa = vd->vdev_spa; physpath_len = MAXPATHLEN; physpath = g_malloc(physpath_len, M_WAITOK|M_ZERO); error = g_io_getattr("GEOM::physpath", cp, &physpath_len, physpath); g_access(cp, -1, 0, 0); if (error == 0) { char *old_physpath; old_physpath = vd->vdev_physpath; vd->vdev_physpath = spa_strdup(physpath); spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); if (old_physpath != NULL) { int held_lock; held_lock = spa_config_held(spa, SCL_STATE, RW_WRITER); if (held_lock == 0) { g_topology_unlock(); spa_config_enter(spa, SCL_STATE, FTAG, RW_WRITER); } spa_strfree(old_physpath); if (held_lock == 0) { spa_config_exit(spa, SCL_STATE, FTAG); g_topology_lock(); } } } g_free(physpath); } static void vdev_geom_orphan(struct g_consumer *cp) { vdev_t *vd; g_topology_assert(); vd = cp->private; if (vd == NULL) { /* Vdev close in progress. Ignore the event. */ return; } /* * Orphan callbacks occur from the GEOM event thread. * Concurrent with this call, new I/O requests may be * working their way through GEOM about to find out * (only once executed by the g_down thread) that we've * been orphaned from our disk provider. These I/Os * must be retired before we can detach our consumer. * This is most easily achieved by acquiring the * SPA ZIO configuration lock as a writer, but doing * so with the GEOM topology lock held would cause * a lock order reversal. Instead, rely on the SPA's * async removal support to invoke a close on this * vdev once it is safe to do so. */ vd->vdev_remove_wanted = B_TRUE; spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE); } static struct g_consumer * vdev_geom_attach(struct g_provider *pp, vdev_t *vd) { struct g_geom *gp; struct g_consumer *cp; g_topology_assert(); ZFS_LOG(1, "Attaching to %s.", pp->name); /* Do we have geom already? No? Create one. */ LIST_FOREACH(gp, &zfs_vdev_class.geom, geom) { if (gp->flags & G_GEOM_WITHER) continue; if (strcmp(gp->name, "zfs::vdev") != 0) continue; break; } if (gp == NULL) { gp = g_new_geomf(&zfs_vdev_class, "zfs::vdev"); gp->orphan = vdev_geom_orphan; gp->attrchanged = vdev_geom_attrchanged; cp = g_new_consumer(gp); if (g_attach(cp, pp) != 0) { g_wither_geom(gp, ENXIO); return (NULL); } if (g_access(cp, 1, 0, 1) != 0) { g_wither_geom(gp, ENXIO); return (NULL); } ZFS_LOG(1, "Created geom and consumer for %s.", pp->name); } else { /* Check if we are already connected to this provider. */ LIST_FOREACH(cp, &gp->consumer, consumer) { if (cp->provider == pp) { ZFS_LOG(1, "Found consumer for %s.", pp->name); break; } } if (cp == NULL) { cp = g_new_consumer(gp); if (g_attach(cp, pp) != 0) { g_destroy_consumer(cp); return (NULL); } if (g_access(cp, 1, 0, 1) != 0) { g_detach(cp); g_destroy_consumer(cp); return (NULL); } ZFS_LOG(1, "Created consumer for %s.", pp->name); } else { if (g_access(cp, 1, 0, 1) != 0) return (NULL); ZFS_LOG(1, "Used existing consumer for %s.", pp->name); } } /* * BUG: cp may already belong to a vdev. This could happen if: * 1) That vdev is a shared spare, or * 2) We are trying to reopen a missing vdev and we are scanning by * guid. In that case, we'll ultimately fail to open this consumer, * but not until after setting the private field. * The solution is to: * 1) Don't set the private field until after the open succeeds, and * 2) Set it to a linked list of vdevs, not just a single vdev */ cp->private = vd; vd->vdev_tsd = cp; /* Fetch initial physical path information for this device. */ vdev_geom_attrchanged(cp, "GEOM::physpath"); cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; return (cp); } static void vdev_geom_close_locked(vdev_t *vd) { struct g_geom *gp; struct g_consumer *cp; g_topology_assert(); cp = vd->vdev_tsd; if (cp == NULL) return; ZFS_LOG(1, "Closing access to %s.", cp->provider->name); KASSERT(vd->vdev_tsd == cp, ("%s: vdev_tsd is not cp", __func__)); vd->vdev_tsd = NULL; vd->vdev_delayed_close = B_FALSE; cp->private = NULL; gp = cp->geom; g_access(cp, -1, 0, -1); /* Destroy consumer on last close. */ if (cp->acr == 0 && cp->ace == 0) { if (cp->acw > 0) g_access(cp, 0, -cp->acw, 0); if (cp->provider != NULL) { ZFS_LOG(1, "Destroyed consumer to %s.", cp->provider->name); g_detach(cp); } g_destroy_consumer(cp); } /* Destroy geom if there are no consumers left. */ if (LIST_EMPTY(&gp->consumer)) { ZFS_LOG(1, "Destroyed geom %s.", gp->name); g_wither_geom(gp, ENXIO); } } static void nvlist_get_guids(nvlist_t *list, uint64_t *pguid, uint64_t *vguid) { (void) nvlist_lookup_uint64(list, ZPOOL_CONFIG_GUID, vguid); (void) nvlist_lookup_uint64(list, ZPOOL_CONFIG_POOL_GUID, pguid); } static int vdev_geom_io(struct g_consumer *cp, int cmd, void *data, off_t offset, off_t size) { struct bio *bp; u_char *p; off_t off, maxio; int error; ASSERT((offset % cp->provider->sectorsize) == 0); ASSERT((size % cp->provider->sectorsize) == 0); bp = g_alloc_bio(); off = offset; offset += size; p = data; maxio = MAXPHYS - (MAXPHYS % cp->provider->sectorsize); error = 0; for (; off < offset; off += maxio, p += maxio, size -= maxio) { g_reset_bio(bp); bp->bio_cmd = cmd; bp->bio_done = NULL; bp->bio_offset = off; bp->bio_length = MIN(size, maxio); bp->bio_data = p; g_io_request(bp, cp); error = biowait(bp, "vdev_geom_io"); if (error != 0) break; } g_destroy_bio(bp); return (error); } static void vdev_geom_taste_orphan(struct g_consumer *cp) { ZFS_LOG(0, "WARNING: Orphan %s while tasting its VDev GUID.", cp->provider->name); } static int vdev_geom_read_config(struct g_consumer *cp, nvlist_t **config) { struct g_provider *pp; vdev_label_t *label; char *p, *buf; size_t buflen; uint64_t psize; off_t offset, size; uint64_t state, txg; int error, l, len; g_topology_assert_not(); pp = cp->provider; ZFS_LOG(1, "Reading config from %s...", pp->name); psize = pp->mediasize; psize = P2ALIGN(psize, (uint64_t)sizeof(vdev_label_t)); size = sizeof(*label) + pp->sectorsize - ((sizeof(*label) - 1) % pp->sectorsize) - 1; label = kmem_alloc(size, KM_SLEEP); buflen = sizeof(label->vl_vdev_phys.vp_nvlist); *config = NULL; for (l = 0; l < VDEV_LABELS; l++) { offset = vdev_label_offset(psize, l, 0); if ((offset % pp->sectorsize) != 0) continue; if (vdev_geom_io(cp, BIO_READ, label, offset, size) != 0) continue; buf = label->vl_vdev_phys.vp_nvlist; if (nvlist_unpack(buf, buflen, config, 0) != 0) continue; if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE, &state) != 0 || state > POOL_STATE_L2CACHE) { nvlist_free(*config); *config = NULL; continue; } if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE && (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG, &txg) != 0 || txg == 0)) { nvlist_free(*config); *config = NULL; continue; } break; } kmem_free(label, size); return (*config == NULL ? ENOENT : 0); } static void resize_configs(nvlist_t ***configs, uint64_t *count, uint64_t id) { nvlist_t **new_configs; uint64_t i; if (id < *count) return; new_configs = kmem_zalloc((id + 1) * sizeof(nvlist_t *), KM_SLEEP); for (i = 0; i < *count; i++) new_configs[i] = (*configs)[i]; if (*configs != NULL) kmem_free(*configs, *count * sizeof(void *)); *configs = new_configs; *count = id + 1; } static void process_vdev_config(nvlist_t ***configs, uint64_t *count, nvlist_t *cfg, const char *name, uint64_t* known_pool_guid) { nvlist_t *vdev_tree; uint64_t pool_guid; uint64_t vdev_guid, known_guid; uint64_t id, txg, known_txg; char *pname; int i; if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &pname) != 0 || strcmp(pname, name) != 0) goto ignore; if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0) goto ignore; if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_TOP_GUID, &vdev_guid) != 0) goto ignore; if (nvlist_lookup_nvlist(cfg, ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0) goto ignore; if (nvlist_lookup_uint64(vdev_tree, ZPOOL_CONFIG_ID, &id) != 0) goto ignore; VERIFY(nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); if (*known_pool_guid != 0) { if (pool_guid != *known_pool_guid) goto ignore; } else *known_pool_guid = pool_guid; resize_configs(configs, count, id); if ((*configs)[id] != NULL) { VERIFY(nvlist_lookup_uint64((*configs)[id], ZPOOL_CONFIG_POOL_TXG, &known_txg) == 0); if (txg <= known_txg) goto ignore; nvlist_free((*configs)[id]); } (*configs)[id] = cfg; return; ignore: nvlist_free(cfg); } static int vdev_geom_attach_taster(struct g_consumer *cp, struct g_provider *pp) { int error; if (pp->flags & G_PF_WITHER) return (EINVAL); g_attach(cp, pp); error = g_access(cp, 1, 0, 0); if (error == 0) { if (pp->sectorsize > VDEV_PAD_SIZE || !ISP2(pp->sectorsize)) error = EINVAL; else if (pp->mediasize < SPA_MINDEVSIZE) error = EINVAL; if (error != 0) g_access(cp, -1, 0, 0); } if (error != 0) g_detach(cp); return (error); } static void vdev_geom_detach_taster(struct g_consumer *cp) { g_access(cp, -1, 0, 0); g_detach(cp); } int vdev_geom_read_pool_label(const char *name, nvlist_t ***configs, uint64_t *count) { struct g_class *mp; struct g_geom *gp, *zgp; struct g_provider *pp; struct g_consumer *zcp; nvlist_t *vdev_cfg; uint64_t pool_guid; int error; DROP_GIANT(); g_topology_lock(); zgp = g_new_geomf(&zfs_vdev_class, "zfs::vdev::taste"); /* This orphan function should be never called. */ zgp->orphan = vdev_geom_taste_orphan; zcp = g_new_consumer(zgp); *configs = NULL; *count = 0; pool_guid = 0; LIST_FOREACH(mp, &g_classes, class) { if (mp == &zfs_vdev_class) continue; LIST_FOREACH(gp, &mp->geom, geom) { if (gp->flags & G_GEOM_WITHER) continue; LIST_FOREACH(pp, &gp->provider, provider) { if (pp->flags & G_PF_WITHER) continue; if (vdev_geom_attach_taster(zcp, pp) != 0) continue; g_topology_unlock(); error = vdev_geom_read_config(zcp, &vdev_cfg); g_topology_lock(); vdev_geom_detach_taster(zcp); if (error) continue; ZFS_LOG(1, "successfully read vdev config"); process_vdev_config(configs, count, vdev_cfg, name, &pool_guid); } } } g_destroy_consumer(zcp); g_destroy_geom(zgp); g_topology_unlock(); PICKUP_GIANT(); return (*count > 0 ? 0 : ENOENT); } static void vdev_geom_read_guids(struct g_consumer *cp, uint64_t *pguid, uint64_t *vguid) { nvlist_t *config; g_topology_assert_not(); *pguid = 0; *vguid = 0; if (vdev_geom_read_config(cp, &config) == 0) { nvlist_get_guids(config, pguid, vguid); nvlist_free(config); } } static struct g_consumer * vdev_geom_attach_by_guids(vdev_t *vd) { struct g_class *mp; struct g_geom *gp, *zgp; struct g_provider *pp; struct g_consumer *cp, *zcp; uint64_t pguid, vguid; g_topology_assert(); zgp = g_new_geomf(&zfs_vdev_class, "zfs::vdev::taste"); zgp->orphan = vdev_geom_taste_orphan; zcp = g_new_consumer(zgp); cp = NULL; LIST_FOREACH(mp, &g_classes, class) { if (mp == &zfs_vdev_class) continue; LIST_FOREACH(gp, &mp->geom, geom) { if (gp->flags & G_GEOM_WITHER) continue; LIST_FOREACH(pp, &gp->provider, provider) { if (vdev_geom_attach_taster(zcp, pp) != 0) continue; g_topology_unlock(); vdev_geom_read_guids(zcp, &pguid, &vguid); g_topology_lock(); vdev_geom_detach_taster(zcp); /* * Check that the label's vdev guid matches the * desired guid. If the label has a pool guid, * check that it matches too. (Inactive spares * and L2ARCs do not have any pool guid in the * label.) */ if ((pguid != 0 && pguid != spa_guid(vd->vdev_spa)) || vguid != vd->vdev_guid) continue; cp = vdev_geom_attach(pp, vd); if (cp == NULL) { printf("ZFS WARNING: Unable to " "attach to %s.\n", pp->name); continue; } break; } if (cp != NULL) break; } if (cp != NULL) break; } end: g_destroy_consumer(zcp); g_destroy_geom(zgp); return (cp); } static struct g_consumer * vdev_geom_open_by_guids(vdev_t *vd) { struct g_consumer *cp; char *buf; size_t len; g_topology_assert(); ZFS_LOG(1, "Searching by guid [%ju].", (uintmax_t)vd->vdev_guid); cp = vdev_geom_attach_by_guids(vd); if (cp != NULL) { len = strlen(cp->provider->name) + strlen("/dev/") + 1; buf = kmem_alloc(len, KM_SLEEP); snprintf(buf, len, "/dev/%s", cp->provider->name); spa_strfree(vd->vdev_path); vd->vdev_path = buf; ZFS_LOG(1, "Attach by guid [%ju:%ju] succeeded, provider %s.", (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)vd->vdev_guid, vd->vdev_path); } else { ZFS_LOG(1, "Search by guid [%ju:%ju] failed.", (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)vd->vdev_guid); } return (cp); } static struct g_consumer * vdev_geom_open_by_path(vdev_t *vd, int check_guid) { struct g_provider *pp; struct g_consumer *cp; uint64_t pguid, vguid; g_topology_assert(); cp = NULL; pp = g_provider_by_name(vd->vdev_path + sizeof("/dev/") - 1); if (pp != NULL) { ZFS_LOG(1, "Found provider by name %s.", vd->vdev_path); cp = vdev_geom_attach(pp, vd); if (cp != NULL && check_guid && ISP2(pp->sectorsize) && pp->sectorsize <= VDEV_PAD_SIZE) { g_topology_unlock(); vdev_geom_read_guids(cp, &pguid, &vguid); g_topology_lock(); if (pguid != spa_guid(vd->vdev_spa) || vguid != vd->vdev_guid) { vdev_geom_close_locked(vd); cp = NULL; ZFS_LOG(1, "guid mismatch for provider %s: " "%ju:%ju != %ju:%ju.", vd->vdev_path, (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)vd->vdev_guid, (uintmax_t)pguid, (uintmax_t)vguid); } else { ZFS_LOG(1, "guid match for provider %s.", vd->vdev_path); } } } return (cp); } static int vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, uint64_t *logical_ashift, uint64_t *physical_ashift) { struct g_provider *pp; struct g_consumer *cp; size_t bufsize; int error; /* Set the TLS to indicate downstack that we should not access zvols*/ VERIFY(tsd_set(zfs_geom_probe_vdev_key, vd) == 0); /* * We must have a pathname, and it must be absolute. */ if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') { vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; return (EINVAL); } vd->vdev_tsd = NULL; DROP_GIANT(); g_topology_lock(); error = 0; if (vd->vdev_spa->spa_splitting_newspa || (vd->vdev_prevstate == VDEV_STATE_UNKNOWN && vd->vdev_spa->spa_load_state == SPA_LOAD_NONE)) { /* * We are dealing with a vdev that hasn't been previously * opened (since boot), and we are not loading an * existing pool configuration. This looks like a * vdev add operation to a new or existing pool. * Assume the user knows what he/she is doing and find * GEOM provider by its name, ignoring GUID mismatches. * * XXPOLICY: It would be safer to only allow a device * that is unlabeled or labeled but missing * GUID information to be opened in this fashion, * unless we are doing a split, in which case we * should allow any guid. */ cp = vdev_geom_open_by_path(vd, 0); } else { /* * Try using the recorded path for this device, but only * accept it if its label data contains the expected GUIDs. */ cp = vdev_geom_open_by_path(vd, 1); if (cp == NULL) { /* * The device at vd->vdev_path doesn't have the * expected GUIDs. The disks might have merely * moved around so try all other GEOM providers * to find one with the right GUIDs. */ cp = vdev_geom_open_by_guids(vd); } } /* Clear the TLS now that tasting is done */ VERIFY(tsd_set(zfs_geom_probe_vdev_key, NULL) == 0); if (cp == NULL) { ZFS_LOG(1, "Provider %s not found.", vd->vdev_path); error = ENOENT; } else if (cp->provider->sectorsize > VDEV_PAD_SIZE || !ISP2(cp->provider->sectorsize)) { ZFS_LOG(1, "Provider %s has unsupported sectorsize.", vd->vdev_path); vdev_geom_close_locked(vd); error = EINVAL; cp = NULL; } else if (cp->acw == 0 && (spa_mode(vd->vdev_spa) & FWRITE) != 0) { int i; for (i = 0; i < 5; i++) { error = g_access(cp, 0, 1, 0); if (error == 0) break; g_topology_unlock(); tsleep(vd, 0, "vdev", hz / 2); g_topology_lock(); } if (error != 0) { printf("ZFS WARNING: Unable to open %s for writing (error=%d).\n", vd->vdev_path, error); vdev_geom_close_locked(vd); cp = NULL; } } g_topology_unlock(); PICKUP_GIANT(); if (cp == NULL) { vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; return (error); } pp = cp->provider; /* * Determine the actual size of the device. */ *max_psize = *psize = pp->mediasize; /* * Determine the device's minimum transfer size and preferred * transfer size. */ *logical_ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1; *physical_ashift = 0; if (pp->stripesize) *physical_ashift = highbit(pp->stripesize) - 1; /* * Clear the nowritecache settings, so that on a vdev_reopen() * we will try again. */ vd->vdev_nowritecache = B_FALSE; /* * Determine the device's rotation rate. */ vdev_geom_set_rotation_rate(vd, cp); return (0); } static void vdev_geom_close(vdev_t *vd) { DROP_GIANT(); g_topology_lock(); vdev_geom_close_locked(vd); g_topology_unlock(); PICKUP_GIANT(); } static void vdev_geom_io_intr(struct bio *bp) { vdev_t *vd; zio_t *zio; zio = bp->bio_caller1; vd = zio->io_vd; zio->io_error = bp->bio_error; if (zio->io_error == 0 && bp->bio_resid != 0) zio->io_error = SET_ERROR(EIO); switch(zio->io_error) { case ENOTSUP: /* * If we get ENOTSUP for BIO_FLUSH or BIO_DELETE we know * that future attempts will never succeed. In this case * we set a persistent flag so that we don't bother with * requests in the future. */ switch(bp->bio_cmd) { case BIO_FLUSH: vd->vdev_nowritecache = B_TRUE; break; case BIO_DELETE: vd->vdev_notrim = B_TRUE; break; } break; case ENXIO: if (!vd->vdev_remove_wanted) { /* * If provider's error is set we assume it is being * removed. */ if (bp->bio_to->error != 0) { vd->vdev_remove_wanted = B_TRUE; spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE); } else if (!vd->vdev_delayed_close) { vd->vdev_delayed_close = B_TRUE; } } break; } g_destroy_bio(bp); - zio_interrupt(zio); + zio_delay_interrupt(zio); } static void vdev_geom_io_start(zio_t *zio) { vdev_t *vd; struct g_consumer *cp; struct bio *bp; int error; vd = zio->io_vd; switch (zio->io_type) { case ZIO_TYPE_IOCTL: /* XXPOLICY */ if (!vdev_readable(vd)) { zio->io_error = SET_ERROR(ENXIO); zio_interrupt(zio); return; } else { switch (zio->io_cmd) { case DKIOCFLUSHWRITECACHE: if (zfs_nocacheflush || vdev_geom_bio_flush_disable) break; if (vd->vdev_nowritecache) { zio->io_error = SET_ERROR(ENOTSUP); break; } goto sendreq; default: zio->io_error = SET_ERROR(ENOTSUP); } } zio_execute(zio); return; case ZIO_TYPE_FREE: if (vd->vdev_notrim) { zio->io_error = SET_ERROR(ENOTSUP); } else if (!vdev_geom_bio_delete_disable) { goto sendreq; } zio_execute(zio); return; } sendreq: ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_FREE || zio->io_type == ZIO_TYPE_IOCTL); cp = vd->vdev_tsd; if (cp == NULL) { zio->io_error = SET_ERROR(ENXIO); zio_interrupt(zio); return; } bp = g_alloc_bio(); bp->bio_caller1 = zio; switch (zio->io_type) { case ZIO_TYPE_READ: case ZIO_TYPE_WRITE: + zio->io_target_timestamp = zio_handle_io_delay(zio); bp->bio_cmd = zio->io_type == ZIO_TYPE_READ ? BIO_READ : BIO_WRITE; bp->bio_data = zio->io_data; bp->bio_offset = zio->io_offset; bp->bio_length = zio->io_size; break; case ZIO_TYPE_FREE: bp->bio_cmd = BIO_DELETE; bp->bio_data = NULL; bp->bio_offset = zio->io_offset; bp->bio_length = zio->io_size; break; case ZIO_TYPE_IOCTL: bp->bio_cmd = BIO_FLUSH; bp->bio_flags |= BIO_ORDERED; bp->bio_data = NULL; bp->bio_offset = cp->provider->mediasize; bp->bio_length = 0; break; } bp->bio_done = vdev_geom_io_intr; g_io_request(bp, cp); } static void vdev_geom_io_done(zio_t *zio) { } static void vdev_geom_hold(vdev_t *vd) { } static void vdev_geom_rele(vdev_t *vd) { } vdev_ops_t vdev_geom_ops = { vdev_geom_open, vdev_geom_close, vdev_default_asize, vdev_geom_io_start, vdev_geom_io_done, NULL, vdev_geom_hold, vdev_geom_rele, VDEV_TYPE_DISK, /* name of this vdev type */ B_TRUE /* leaf vdev */ }; Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c (revision 296509) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c (revision 296510) @@ -1,918 +1,915 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* * Copyright (c) 2012, 2014 by Delphix. All rights reserved. */ #include #include #include #include #include #include /* * ZFS I/O Scheduler * --------------- * * ZFS issues I/O operations to leaf vdevs to satisfy and complete zios. The * I/O scheduler determines when and in what order those operations are * issued. The I/O scheduler divides operations into six I/O classes * prioritized in the following order: sync read, sync write, async read, * async write, scrub/resilver and trim. Each queue defines the minimum and * maximum number of concurrent operations that may be issued to the device. * In addition, the device has an aggregate maximum. Note that the sum of the * per-queue minimums must not exceed the aggregate maximum, and if the * aggregate maximum is equal to or greater than the sum of the per-queue * maximums, the per-queue minimum has no effect. * * For many physical devices, throughput increases with the number of * concurrent operations, but latency typically suffers. Further, physical * devices typically have a limit at which more concurrent operations have no * effect on throughput or can actually cause it to decrease. * * The scheduler selects the next operation to issue by first looking for an * I/O class whose minimum has not been satisfied. Once all are satisfied and * the aggregate maximum has not been hit, the scheduler looks for classes * whose maximum has not been satisfied. Iteration through the I/O classes is * done in the order specified above. No further operations are issued if the * aggregate maximum number of concurrent operations has been hit or if there * are no operations queued for an I/O class that has not hit its maximum. * Every time an I/O is queued or an operation completes, the I/O scheduler * looks for new operations to issue. * * All I/O classes have a fixed maximum number of outstanding operations * except for the async write class. Asynchronous writes represent the data * that is committed to stable storage during the syncing stage for * transaction groups (see txg.c). Transaction groups enter the syncing state * periodically so the number of queued async writes will quickly burst up and * then bleed down to zero. Rather than servicing them as quickly as possible, * the I/O scheduler changes the maximum number of active async write I/Os * according to the amount of dirty data in the pool (see dsl_pool.c). Since * both throughput and latency typically increase with the number of * concurrent operations issued to physical devices, reducing the burstiness * in the number of concurrent operations also stabilizes the response time of * operations from other -- and in particular synchronous -- queues. In broad * strokes, the I/O scheduler will issue more concurrent operations from the * async write queue as there's more dirty data in the pool. * * Async Writes * * The number of concurrent operations issued for the async write I/O class * follows a piece-wise linear function defined by a few adjustable points. * * | o---------| <-- zfs_vdev_async_write_max_active * ^ | /^ | * | | / | | * active | / | | * I/O | / | | * count | / | | * | / | | * |------------o | | <-- zfs_vdev_async_write_min_active * 0|____________^______|_________| * 0% | | 100% of zfs_dirty_data_max * | | * | `-- zfs_vdev_async_write_active_max_dirty_percent * `--------- zfs_vdev_async_write_active_min_dirty_percent * * Until the amount of dirty data exceeds a minimum percentage of the dirty * data allowed in the pool, the I/O scheduler will limit the number of * concurrent operations to the minimum. As that threshold is crossed, the * number of concurrent operations issued increases linearly to the maximum at * the specified maximum percentage of the dirty data allowed in the pool. * * Ideally, the amount of dirty data on a busy pool will stay in the sloped * part of the function between zfs_vdev_async_write_active_min_dirty_percent * and zfs_vdev_async_write_active_max_dirty_percent. If it exceeds the * maximum percentage, this indicates that the rate of incoming data is * greater than the rate that the backend storage can handle. In this case, we * must further throttle incoming writes (see dmu_tx_delay() for details). */ /* * The maximum number of I/Os active to each device. Ideally, this will be >= * the sum of each queue's max_active. It must be at least the sum of each * queue's min_active. */ uint32_t zfs_vdev_max_active = 1000; /* * Per-queue limits on the number of I/Os active to each device. If the * sum of the queue's max_active is < zfs_vdev_max_active, then the * min_active comes into play. We will send min_active from each queue, * and then select from queues in the order defined by zio_priority_t. * * In general, smaller max_active's will lead to lower latency of synchronous * operations. Larger max_active's may lead to higher overall throughput, * depending on underlying storage. * * The ratio of the queues' max_actives determines the balance of performance * between reads, writes, and scrubs. E.g., increasing * zfs_vdev_scrub_max_active will cause the scrub or resilver to complete * more quickly, but reads and writes to have higher latency and lower * throughput. */ uint32_t zfs_vdev_sync_read_min_active = 10; uint32_t zfs_vdev_sync_read_max_active = 10; uint32_t zfs_vdev_sync_write_min_active = 10; uint32_t zfs_vdev_sync_write_max_active = 10; uint32_t zfs_vdev_async_read_min_active = 1; uint32_t zfs_vdev_async_read_max_active = 3; uint32_t zfs_vdev_async_write_min_active = 1; uint32_t zfs_vdev_async_write_max_active = 10; uint32_t zfs_vdev_scrub_min_active = 1; uint32_t zfs_vdev_scrub_max_active = 2; uint32_t zfs_vdev_trim_min_active = 1; /* * TRIM max active is large in comparison to the other values due to the fact * that TRIM IOs are coalesced at the device layer. This value is set such * that a typical SSD can process the queued IOs in a single request. */ uint32_t zfs_vdev_trim_max_active = 64; /* * When the pool has less than zfs_vdev_async_write_active_min_dirty_percent * dirty data, use zfs_vdev_async_write_min_active. When it has more than * zfs_vdev_async_write_active_max_dirty_percent, use * zfs_vdev_async_write_max_active. The value is linearly interpolated * between min and max. */ int zfs_vdev_async_write_active_min_dirty_percent = 30; int zfs_vdev_async_write_active_max_dirty_percent = 60; /* * To reduce IOPs, we aggregate small adjacent I/Os into one large I/O. * For read I/Os, we also aggregate across small adjacency gaps; for writes * we include spans of optional I/Os to aid aggregation at the disk even when * they aren't able to help us aggregate at this level. */ int zfs_vdev_aggregation_limit = SPA_OLD_MAXBLOCKSIZE; int zfs_vdev_read_gap_limit = 32 << 10; int zfs_vdev_write_gap_limit = 4 << 10; #ifdef __FreeBSD__ SYSCTL_DECL(_vfs_zfs_vdev); static int sysctl_zfs_async_write_active_min_dirty_percent(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_vfs_zfs_vdev, OID_AUTO, async_write_active_min_dirty_percent, CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, 0, sizeof(int), sysctl_zfs_async_write_active_min_dirty_percent, "I", "Percentage of async write dirty data below which " "async_write_min_active is used."); static int sysctl_zfs_async_write_active_max_dirty_percent(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_vfs_zfs_vdev, OID_AUTO, async_write_active_max_dirty_percent, CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, 0, sizeof(int), sysctl_zfs_async_write_active_max_dirty_percent, "I", "Percentage of async write dirty data above which " "async_write_max_active is used."); SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, max_active, CTLFLAG_RWTUN, &zfs_vdev_max_active, 0, "The maximum number of I/Os of all types active for each device."); #define ZFS_VDEV_QUEUE_KNOB_MIN(name) \ SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, name ## _min_active, CTLFLAG_RWTUN,\ &zfs_vdev_ ## name ## _min_active, 0, \ "Initial number of I/O requests of type " #name \ " active for each device"); #define ZFS_VDEV_QUEUE_KNOB_MAX(name) \ SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, name ## _max_active, CTLFLAG_RWTUN,\ &zfs_vdev_ ## name ## _max_active, 0, \ "Maximum number of I/O requests of type " #name \ " active for each device"); ZFS_VDEV_QUEUE_KNOB_MIN(sync_read); ZFS_VDEV_QUEUE_KNOB_MAX(sync_read); ZFS_VDEV_QUEUE_KNOB_MIN(sync_write); ZFS_VDEV_QUEUE_KNOB_MAX(sync_write); ZFS_VDEV_QUEUE_KNOB_MIN(async_read); ZFS_VDEV_QUEUE_KNOB_MAX(async_read); ZFS_VDEV_QUEUE_KNOB_MIN(async_write); ZFS_VDEV_QUEUE_KNOB_MAX(async_write); ZFS_VDEV_QUEUE_KNOB_MIN(scrub); ZFS_VDEV_QUEUE_KNOB_MAX(scrub); ZFS_VDEV_QUEUE_KNOB_MIN(trim); ZFS_VDEV_QUEUE_KNOB_MAX(trim); #undef ZFS_VDEV_QUEUE_KNOB SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, aggregation_limit, CTLFLAG_RWTUN, &zfs_vdev_aggregation_limit, 0, "I/O requests are aggregated up to this size"); SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, read_gap_limit, CTLFLAG_RWTUN, &zfs_vdev_read_gap_limit, 0, "Acceptable gap between two reads being aggregated"); SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, write_gap_limit, CTLFLAG_RWTUN, &zfs_vdev_write_gap_limit, 0, "Acceptable gap between two writes being aggregated"); static int sysctl_zfs_async_write_active_min_dirty_percent(SYSCTL_HANDLER_ARGS) { int val, err; val = zfs_vdev_async_write_active_min_dirty_percent; err = sysctl_handle_int(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (err); if (val < 0 || val > 100 || val >= zfs_vdev_async_write_active_max_dirty_percent) return (EINVAL); zfs_vdev_async_write_active_min_dirty_percent = val; return (0); } static int sysctl_zfs_async_write_active_max_dirty_percent(SYSCTL_HANDLER_ARGS) { int val, err; val = zfs_vdev_async_write_active_max_dirty_percent; err = sysctl_handle_int(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (err); if (val < 0 || val > 100 || val <= zfs_vdev_async_write_active_min_dirty_percent) return (EINVAL); zfs_vdev_async_write_active_max_dirty_percent = val; return (0); } #endif int vdev_queue_offset_compare(const void *x1, const void *x2) { const zio_t *z1 = x1; const zio_t *z2 = x2; if (z1->io_offset < z2->io_offset) return (-1); if (z1->io_offset > z2->io_offset) return (1); if (z1 < z2) return (-1); if (z1 > z2) return (1); return (0); } static inline avl_tree_t * vdev_queue_class_tree(vdev_queue_t *vq, zio_priority_t p) { return (&vq->vq_class[p].vqc_queued_tree); } static inline avl_tree_t * vdev_queue_type_tree(vdev_queue_t *vq, zio_type_t t) { if (t == ZIO_TYPE_READ) return (&vq->vq_read_offset_tree); else if (t == ZIO_TYPE_WRITE) return (&vq->vq_write_offset_tree); else return (NULL); } int vdev_queue_timestamp_compare(const void *x1, const void *x2) { const zio_t *z1 = x1; const zio_t *z2 = x2; if (z1->io_timestamp < z2->io_timestamp) return (-1); if (z1->io_timestamp > z2->io_timestamp) return (1); if (z1->io_offset < z2->io_offset) return (-1); if (z1->io_offset > z2->io_offset) return (1); if (z1 < z2) return (-1); if (z1 > z2) return (1); return (0); } void vdev_queue_init(vdev_t *vd) { vdev_queue_t *vq = &vd->vdev_queue; mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL); vq->vq_vdev = vd; avl_create(&vq->vq_active_tree, vdev_queue_offset_compare, sizeof (zio_t), offsetof(struct zio, io_queue_node)); avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_READ), vdev_queue_offset_compare, sizeof (zio_t), offsetof(struct zio, io_offset_node)); avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE), vdev_queue_offset_compare, sizeof (zio_t), offsetof(struct zio, io_offset_node)); for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { int (*compfn) (const void *, const void *); /* * The synchronous i/o queues are dispatched in FIFO rather * than LBA order. This provides more consistent latency for * these i/os. */ if (p == ZIO_PRIORITY_SYNC_READ || p == ZIO_PRIORITY_SYNC_WRITE) compfn = vdev_queue_timestamp_compare; else compfn = vdev_queue_offset_compare; avl_create(vdev_queue_class_tree(vq, p), compfn, sizeof (zio_t), offsetof(struct zio, io_queue_node)); } vq->vq_lastoffset = 0; } void vdev_queue_fini(vdev_t *vd) { vdev_queue_t *vq = &vd->vdev_queue; for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) avl_destroy(vdev_queue_class_tree(vq, p)); avl_destroy(&vq->vq_active_tree); avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_READ)); avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE)); mutex_destroy(&vq->vq_lock); } static void vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio) { spa_t *spa = zio->io_spa; avl_tree_t *qtt; ASSERT(MUTEX_HELD(&vq->vq_lock)); ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio); qtt = vdev_queue_type_tree(vq, zio->io_type); if (qtt) avl_add(qtt, zio); #ifdef illumos mutex_enter(&spa->spa_iokstat_lock); spa->spa_queue_stats[zio->io_priority].spa_queued++; if (spa->spa_iokstat != NULL) kstat_waitq_enter(spa->spa_iokstat->ks_data); mutex_exit(&spa->spa_iokstat_lock); #endif } static void vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio) { spa_t *spa = zio->io_spa; avl_tree_t *qtt; ASSERT(MUTEX_HELD(&vq->vq_lock)); ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio); qtt = vdev_queue_type_tree(vq, zio->io_type); if (qtt) avl_remove(qtt, zio); #ifdef illumos mutex_enter(&spa->spa_iokstat_lock); ASSERT3U(spa->spa_queue_stats[zio->io_priority].spa_queued, >, 0); spa->spa_queue_stats[zio->io_priority].spa_queued--; if (spa->spa_iokstat != NULL) kstat_waitq_exit(spa->spa_iokstat->ks_data); mutex_exit(&spa->spa_iokstat_lock); #endif } static void vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio) { spa_t *spa = zio->io_spa; ASSERT(MUTEX_HELD(&vq->vq_lock)); ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); vq->vq_class[zio->io_priority].vqc_active++; avl_add(&vq->vq_active_tree, zio); #ifdef illumos mutex_enter(&spa->spa_iokstat_lock); spa->spa_queue_stats[zio->io_priority].spa_active++; if (spa->spa_iokstat != NULL) kstat_runq_enter(spa->spa_iokstat->ks_data); mutex_exit(&spa->spa_iokstat_lock); #endif } static void vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio) { spa_t *spa = zio->io_spa; ASSERT(MUTEX_HELD(&vq->vq_lock)); ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); vq->vq_class[zio->io_priority].vqc_active--; avl_remove(&vq->vq_active_tree, zio); #ifdef illumos mutex_enter(&spa->spa_iokstat_lock); ASSERT3U(spa->spa_queue_stats[zio->io_priority].spa_active, >, 0); spa->spa_queue_stats[zio->io_priority].spa_active--; if (spa->spa_iokstat != NULL) { kstat_io_t *ksio = spa->spa_iokstat->ks_data; kstat_runq_exit(spa->spa_iokstat->ks_data); if (zio->io_type == ZIO_TYPE_READ) { ksio->reads++; ksio->nread += zio->io_size; } else if (zio->io_type == ZIO_TYPE_WRITE) { ksio->writes++; ksio->nwritten += zio->io_size; } } mutex_exit(&spa->spa_iokstat_lock); #endif } static void vdev_queue_agg_io_done(zio_t *aio) { if (aio->io_type == ZIO_TYPE_READ) { zio_t *pio; while ((pio = zio_walk_parents(aio)) != NULL) { bcopy((char *)aio->io_data + (pio->io_offset - aio->io_offset), pio->io_data, pio->io_size); } } zio_buf_free(aio->io_data, aio->io_size); } static int vdev_queue_class_min_active(zio_priority_t p) { switch (p) { case ZIO_PRIORITY_SYNC_READ: return (zfs_vdev_sync_read_min_active); case ZIO_PRIORITY_SYNC_WRITE: return (zfs_vdev_sync_write_min_active); case ZIO_PRIORITY_ASYNC_READ: return (zfs_vdev_async_read_min_active); case ZIO_PRIORITY_ASYNC_WRITE: return (zfs_vdev_async_write_min_active); case ZIO_PRIORITY_SCRUB: return (zfs_vdev_scrub_min_active); case ZIO_PRIORITY_TRIM: return (zfs_vdev_trim_min_active); default: panic("invalid priority %u", p); return (0); } } static __noinline int vdev_queue_max_async_writes(spa_t *spa) { int writes; uint64_t dirty = spa->spa_dsl_pool->dp_dirty_total; uint64_t min_bytes = zfs_dirty_data_max * zfs_vdev_async_write_active_min_dirty_percent / 100; uint64_t max_bytes = zfs_dirty_data_max * zfs_vdev_async_write_active_max_dirty_percent / 100; /* * Sync tasks correspond to interactive user actions. To reduce the * execution time of those actions we push data out as fast as possible. */ if (spa_has_pending_synctask(spa)) { return (zfs_vdev_async_write_max_active); } if (dirty < min_bytes) return (zfs_vdev_async_write_min_active); if (dirty > max_bytes) return (zfs_vdev_async_write_max_active); /* * linear interpolation: * slope = (max_writes - min_writes) / (max_bytes - min_bytes) * move right by min_bytes * move up by min_writes */ writes = (dirty - min_bytes) * (zfs_vdev_async_write_max_active - zfs_vdev_async_write_min_active) / (max_bytes - min_bytes) + zfs_vdev_async_write_min_active; ASSERT3U(writes, >=, zfs_vdev_async_write_min_active); ASSERT3U(writes, <=, zfs_vdev_async_write_max_active); return (writes); } static int vdev_queue_class_max_active(spa_t *spa, zio_priority_t p) { switch (p) { case ZIO_PRIORITY_SYNC_READ: return (zfs_vdev_sync_read_max_active); case ZIO_PRIORITY_SYNC_WRITE: return (zfs_vdev_sync_write_max_active); case ZIO_PRIORITY_ASYNC_READ: return (zfs_vdev_async_read_max_active); case ZIO_PRIORITY_ASYNC_WRITE: return (vdev_queue_max_async_writes(spa)); case ZIO_PRIORITY_SCRUB: return (zfs_vdev_scrub_max_active); case ZIO_PRIORITY_TRIM: return (zfs_vdev_trim_max_active); default: panic("invalid priority %u", p); return (0); } } /* * Return the i/o class to issue from, or ZIO_PRIORITY_MAX_QUEUEABLE if * there is no eligible class. */ static zio_priority_t vdev_queue_class_to_issue(vdev_queue_t *vq) { spa_t *spa = vq->vq_vdev->vdev_spa; zio_priority_t p; ASSERT(MUTEX_HELD(&vq->vq_lock)); if (avl_numnodes(&vq->vq_active_tree) >= zfs_vdev_max_active) return (ZIO_PRIORITY_NUM_QUEUEABLE); /* find a queue that has not reached its minimum # outstanding i/os */ for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 && vq->vq_class[p].vqc_active < vdev_queue_class_min_active(p)) return (p); } /* * If we haven't found a queue, look for one that hasn't reached its * maximum # outstanding i/os. */ for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 && vq->vq_class[p].vqc_active < vdev_queue_class_max_active(spa, p)) return (p); } /* No eligible queued i/os */ return (ZIO_PRIORITY_NUM_QUEUEABLE); } /* * Compute the range spanned by two i/os, which is the endpoint of the last * (lio->io_offset + lio->io_size) minus start of the first (fio->io_offset). * Conveniently, the gap between fio and lio is given by -IO_SPAN(lio, fio); * thus fio and lio are adjacent if and only if IO_SPAN(lio, fio) == 0. */ #define IO_SPAN(fio, lio) ((lio)->io_offset + (lio)->io_size - (fio)->io_offset) #define IO_GAP(fio, lio) (-IO_SPAN(lio, fio)) static zio_t * vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) { zio_t *first, *last, *aio, *dio, *mandatory, *nio; uint64_t maxgap = 0; uint64_t size; boolean_t stretch; avl_tree_t *t; enum zio_flag flags; ASSERT(MUTEX_HELD(&vq->vq_lock)); if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE) return (NULL); first = last = zio; if (zio->io_type == ZIO_TYPE_READ) maxgap = zfs_vdev_read_gap_limit; /* * We can aggregate I/Os that are sufficiently adjacent and of * the same flavor, as expressed by the AGG_INHERIT flags. * The latter requirement is necessary so that certain * attributes of the I/O, such as whether it's a normal I/O * or a scrub/resilver, can be preserved in the aggregate. * We can include optional I/Os, but don't allow them * to begin a range as they add no benefit in that situation. */ /* * We keep track of the last non-optional I/O. */ mandatory = (first->io_flags & ZIO_FLAG_OPTIONAL) ? NULL : first; /* * Walk backwards through sufficiently contiguous I/Os * recording the last non-option I/O. */ flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT; t = vdev_queue_type_tree(vq, zio->io_type); while (t != NULL && (dio = AVL_PREV(t, first)) != NULL && (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && IO_SPAN(dio, last) <= zfs_vdev_aggregation_limit && IO_GAP(dio, first) <= maxgap) { first = dio; if (mandatory == NULL && !(first->io_flags & ZIO_FLAG_OPTIONAL)) mandatory = first; } /* * Skip any initial optional I/Os. */ while ((first->io_flags & ZIO_FLAG_OPTIONAL) && first != last) { first = AVL_NEXT(t, first); ASSERT(first != NULL); } /* * Walk forward through sufficiently contiguous I/Os. */ while ((dio = AVL_NEXT(t, last)) != NULL && (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && IO_SPAN(first, dio) <= zfs_vdev_aggregation_limit && IO_GAP(last, dio) <= maxgap) { last = dio; if (!(last->io_flags & ZIO_FLAG_OPTIONAL)) mandatory = last; } /* * Now that we've established the range of the I/O aggregation * we must decide what to do with trailing optional I/Os. * For reads, there's nothing to do. While we are unable to * aggregate further, it's possible that a trailing optional * I/O would allow the underlying device to aggregate with * subsequent I/Os. We must therefore determine if the next * non-optional I/O is close enough to make aggregation * worthwhile. */ stretch = B_FALSE; if (zio->io_type == ZIO_TYPE_WRITE && mandatory != NULL) { zio_t *nio = last; while ((dio = AVL_NEXT(t, nio)) != NULL && IO_GAP(nio, dio) == 0 && IO_GAP(mandatory, dio) <= zfs_vdev_write_gap_limit) { nio = dio; if (!(nio->io_flags & ZIO_FLAG_OPTIONAL)) { stretch = B_TRUE; break; } } } if (stretch) { /* This may be a no-op. */ dio = AVL_NEXT(t, last); dio->io_flags &= ~ZIO_FLAG_OPTIONAL; } else { while (last != mandatory && last != first) { ASSERT(last->io_flags & ZIO_FLAG_OPTIONAL); last = AVL_PREV(t, last); ASSERT(last != NULL); } } if (first == last) return (NULL); size = IO_SPAN(first, last); ASSERT3U(size, <=, zfs_vdev_aggregation_limit); aio = zio_vdev_delegated_io(first->io_vd, first->io_offset, zio_buf_alloc(size), size, first->io_type, zio->io_priority, flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE, vdev_queue_agg_io_done, NULL); aio->io_timestamp = first->io_timestamp; nio = first; do { dio = nio; nio = AVL_NEXT(t, dio); ASSERT3U(dio->io_type, ==, aio->io_type); if (dio->io_flags & ZIO_FLAG_NODATA) { ASSERT3U(dio->io_type, ==, ZIO_TYPE_WRITE); bzero((char *)aio->io_data + (dio->io_offset - aio->io_offset), dio->io_size); } else if (dio->io_type == ZIO_TYPE_WRITE) { bcopy(dio->io_data, (char *)aio->io_data + (dio->io_offset - aio->io_offset), dio->io_size); } zio_add_child(dio, aio); vdev_queue_io_remove(vq, dio); zio_vdev_io_bypass(dio); zio_execute(dio); } while (dio != last); return (aio); } static zio_t * vdev_queue_io_to_issue(vdev_queue_t *vq) { zio_t *zio, *aio; zio_priority_t p; avl_index_t idx; avl_tree_t *tree; zio_t search; again: ASSERT(MUTEX_HELD(&vq->vq_lock)); p = vdev_queue_class_to_issue(vq); if (p == ZIO_PRIORITY_NUM_QUEUEABLE) { /* No eligible queued i/os */ return (NULL); } /* * For LBA-ordered queues (async / scrub), issue the i/o which follows * the most recently issued i/o in LBA (offset) order. * * For FIFO queues (sync), issue the i/o with the lowest timestamp. */ tree = vdev_queue_class_tree(vq, p); search.io_timestamp = 0; search.io_offset = vq->vq_last_offset + 1; VERIFY3P(avl_find(tree, &search, &idx), ==, NULL); zio = avl_nearest(tree, idx, AVL_AFTER); if (zio == NULL) zio = avl_first(tree); ASSERT3U(zio->io_priority, ==, p); aio = vdev_queue_aggregate(vq, zio); if (aio != NULL) zio = aio; else vdev_queue_io_remove(vq, zio); /* * If the I/O is or was optional and therefore has no data, we need to * simply discard it. We need to drop the vdev queue's lock to avoid a * deadlock that we could encounter since this I/O will complete * immediately. */ if (zio->io_flags & ZIO_FLAG_NODATA) { mutex_exit(&vq->vq_lock); zio_vdev_io_bypass(zio); zio_execute(zio); mutex_enter(&vq->vq_lock); goto again; } vdev_queue_pending_add(vq, zio); vq->vq_last_offset = zio->io_offset; return (zio); } zio_t * vdev_queue_io(zio_t *zio) { vdev_queue_t *vq = &zio->io_vd->vdev_queue; zio_t *nio; if (zio->io_flags & ZIO_FLAG_DONT_QUEUE) return (zio); /* * Children i/os inherent their parent's priority, which might * not match the child's i/o type. Fix it up here. */ if (zio->io_type == ZIO_TYPE_READ) { if (zio->io_priority != ZIO_PRIORITY_SYNC_READ && zio->io_priority != ZIO_PRIORITY_ASYNC_READ && zio->io_priority != ZIO_PRIORITY_SCRUB) zio->io_priority = ZIO_PRIORITY_ASYNC_READ; } else if (zio->io_type == ZIO_TYPE_WRITE) { if (zio->io_priority != ZIO_PRIORITY_SYNC_WRITE && zio->io_priority != ZIO_PRIORITY_ASYNC_WRITE) zio->io_priority = ZIO_PRIORITY_ASYNC_WRITE; } else { ASSERT(zio->io_type == ZIO_TYPE_FREE); zio->io_priority = ZIO_PRIORITY_TRIM; } zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE; mutex_enter(&vq->vq_lock); zio->io_timestamp = gethrtime(); vdev_queue_io_add(vq, zio); nio = vdev_queue_io_to_issue(vq); mutex_exit(&vq->vq_lock); if (nio == NULL) return (NULL); if (nio->io_done == vdev_queue_agg_io_done) { zio_nowait(nio); return (NULL); } return (nio); } void vdev_queue_io_done(zio_t *zio) { vdev_queue_t *vq = &zio->io_vd->vdev_queue; zio_t *nio; - if (zio_injection_enabled) - delay(SEC_TO_TICK(zio_handle_io_delay(zio))); - mutex_enter(&vq->vq_lock); vdev_queue_pending_remove(vq, zio); vq->vq_io_complete_ts = gethrtime(); while ((nio = vdev_queue_io_to_issue(vq)) != NULL) { mutex_exit(&vq->vq_lock); if (nio->io_done == vdev_queue_agg_io_done) { zio_nowait(nio); } else { zio_vdev_io_reissue(nio); zio_execute(nio); } mutex_enter(&vq->vq_lock); } mutex_exit(&vq->vq_lock); } /* * As these three methods are only used for load calculations we're not concerned * if we get an incorrect value on 32bit platforms due to lack of vq_lock mutex * use here, instead we prefer to keep it lock free for performance. */ int vdev_queue_length(vdev_t *vd) { return (avl_numnodes(&vd->vdev_queue.vq_active_tree)); } uint64_t vdev_queue_lastoffset(vdev_t *vd) { return (vd->vdev_queue.vq_lastoffset); } void vdev_queue_register_lastoffset(vdev_t *vd, zio_t *zio) { vd->vdev_queue.vq_lastoffset = zio->io_offset + zio->io_size; } Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c (revision 296509) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c (revision 296510) @@ -1,3605 +1,3657 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2015 by Delphix. All rights reserved. * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include SYSCTL_DECL(_vfs_zfs); SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO"); #if defined(__amd64__) static int zio_use_uma = 1; #else static int zio_use_uma = 0; #endif SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, use_uma, CTLFLAG_RDTUN, &zio_use_uma, 0, "Use uma(9) for ZIO allocations"); static int zio_exclude_metadata = 0; SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, exclude_metadata, CTLFLAG_RDTUN, &zio_exclude_metadata, 0, "Exclude metadata buffers from dumps as well"); zio_trim_stats_t zio_trim_stats = { { "bytes", KSTAT_DATA_UINT64, "Number of bytes successfully TRIMmed" }, { "success", KSTAT_DATA_UINT64, "Number of successful TRIM requests" }, { "unsupported", KSTAT_DATA_UINT64, "Number of TRIM requests that failed because TRIM is not supported" }, { "failed", KSTAT_DATA_UINT64, "Number of TRIM requests that failed for reasons other than not supported" }, }; static kstat_t *zio_trim_ksp; /* * ========================================================================== * I/O type descriptions * ========================================================================== */ const char *zio_type_name[ZIO_TYPES] = { "zio_null", "zio_read", "zio_write", "zio_free", "zio_claim", "zio_ioctl" }; /* * ========================================================================== * I/O kmem caches * ========================================================================== */ kmem_cache_t *zio_cache; kmem_cache_t *zio_link_cache; kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; #ifdef _KERNEL extern vmem_t *zio_alloc_arena; #endif #define ZIO_PIPELINE_CONTINUE 0x100 #define ZIO_PIPELINE_STOP 0x101 #define BP_SPANB(indblkshift, level) \ (((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT))) #define COMPARE_META_LEVEL 0x80000000ul /* * The following actions directly effect the spa's sync-to-convergence logic. * The values below define the sync pass when we start performing the action. * Care should be taken when changing these values as they directly impact * spa_sync() performance. Tuning these values may introduce subtle performance * pathologies and should only be done in the context of performance analysis. * These tunables will eventually be removed and replaced with #defines once * enough analysis has been done to determine optimal values. * * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that * regular blocks are not deferred. */ int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */ SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_deferred_free, CTLFLAG_RDTUN, &zfs_sync_pass_deferred_free, 0, "defer frees starting in this pass"); int zfs_sync_pass_dont_compress = 5; /* don't compress starting in this pass */ SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_dont_compress, CTLFLAG_RDTUN, &zfs_sync_pass_dont_compress, 0, "don't compress starting in this pass"); int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */ SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_rewrite, CTLFLAG_RDTUN, &zfs_sync_pass_rewrite, 0, "rewrite new bps starting in this pass"); /* * An allocating zio is one that either currently has the DVA allocate * stage set or will have it later in its lifetime. */ #define IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE) boolean_t zio_requeue_io_start_cut_in_line = B_TRUE; #ifdef ZFS_DEBUG int zio_buf_debug_limit = 16384; #else int zio_buf_debug_limit = 0; #endif void zio_init(void) { size_t c; zio_cache = kmem_cache_create("zio_cache", sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0); zio_link_cache = kmem_cache_create("zio_link_cache", sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0); if (!zio_use_uma) goto out; /* * For small buffers, we want a cache for each multiple of * SPA_MINBLOCKSIZE. For larger buffers, we want a cache * for each quarter-power of 2. */ for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { size_t size = (c + 1) << SPA_MINBLOCKSHIFT; size_t p2 = size; size_t align = 0; size_t cflags = (size > zio_buf_debug_limit) ? KMC_NODEBUG : 0; while (!ISP2(p2)) p2 &= p2 - 1; #ifdef illumos #ifndef _KERNEL /* * If we are using watchpoints, put each buffer on its own page, * to eliminate the performance overhead of trapping to the * kernel when modifying a non-watched buffer that shares the * page with a watched buffer. */ if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE)) continue; #endif #endif /* illumos */ if (size <= 4 * SPA_MINBLOCKSIZE) { align = SPA_MINBLOCKSIZE; } else if (IS_P2ALIGNED(size, p2 >> 2)) { align = MIN(p2 >> 2, PAGESIZE); } if (align != 0) { char name[36]; (void) sprintf(name, "zio_buf_%lu", (ulong_t)size); zio_buf_cache[c] = kmem_cache_create(name, size, align, NULL, NULL, NULL, NULL, NULL, cflags); /* * Since zio_data bufs do not appear in crash dumps, we * pass KMC_NOTOUCH so that no allocator metadata is * stored with the buffers. */ (void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size); zio_data_buf_cache[c] = kmem_cache_create(name, size, align, NULL, NULL, NULL, NULL, NULL, cflags | KMC_NOTOUCH | KMC_NODEBUG); } } while (--c != 0) { ASSERT(zio_buf_cache[c] != NULL); if (zio_buf_cache[c - 1] == NULL) zio_buf_cache[c - 1] = zio_buf_cache[c]; ASSERT(zio_data_buf_cache[c] != NULL); if (zio_data_buf_cache[c - 1] == NULL) zio_data_buf_cache[c - 1] = zio_data_buf_cache[c]; } out: zio_inject_init(); zio_trim_ksp = kstat_create("zfs", 0, "zio_trim", "misc", KSTAT_TYPE_NAMED, sizeof(zio_trim_stats) / sizeof(kstat_named_t), KSTAT_FLAG_VIRTUAL); if (zio_trim_ksp != NULL) { zio_trim_ksp->ks_data = &zio_trim_stats; kstat_install(zio_trim_ksp); } } void zio_fini(void) { size_t c; kmem_cache_t *last_cache = NULL; kmem_cache_t *last_data_cache = NULL; for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { if (zio_buf_cache[c] != last_cache) { last_cache = zio_buf_cache[c]; kmem_cache_destroy(zio_buf_cache[c]); } zio_buf_cache[c] = NULL; if (zio_data_buf_cache[c] != last_data_cache) { last_data_cache = zio_data_buf_cache[c]; kmem_cache_destroy(zio_data_buf_cache[c]); } zio_data_buf_cache[c] = NULL; } kmem_cache_destroy(zio_link_cache); kmem_cache_destroy(zio_cache); zio_inject_fini(); if (zio_trim_ksp != NULL) { kstat_delete(zio_trim_ksp); zio_trim_ksp = NULL; } } /* * ========================================================================== * Allocate and free I/O buffers * ========================================================================== */ /* * Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a * crashdump if the kernel panics, so use it judiciously. Obviously, it's * useful to inspect ZFS metadata, but if possible, we should avoid keeping * excess / transient data in-core during a crashdump. */ void * zio_buf_alloc(size_t size) { size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; int flags = zio_exclude_metadata ? KM_NODEBUG : 0; VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); if (zio_use_uma) return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE)); else return (kmem_alloc(size, KM_SLEEP|flags)); } /* * Use zio_data_buf_alloc to allocate data. The data will not appear in a * crashdump if the kernel panics. This exists so that we will limit the amount * of ZFS data that shows up in a kernel crashdump. (Thus reducing the amount * of kernel heap dumped to disk when the kernel panics) */ void * zio_data_buf_alloc(size_t size) { size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); if (zio_use_uma) return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE)); else return (kmem_alloc(size, KM_SLEEP | KM_NODEBUG)); } void zio_buf_free(void *buf, size_t size) { size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); if (zio_use_uma) kmem_cache_free(zio_buf_cache[c], buf); else kmem_free(buf, size); } void zio_data_buf_free(void *buf, size_t size) { size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); if (zio_use_uma) kmem_cache_free(zio_data_buf_cache[c], buf); else kmem_free(buf, size); } /* * ========================================================================== * Push and pop I/O transform buffers * ========================================================================== */ static void zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize, zio_transform_func_t *transform) { zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP); zt->zt_orig_data = zio->io_data; zt->zt_orig_size = zio->io_size; zt->zt_bufsize = bufsize; zt->zt_transform = transform; zt->zt_next = zio->io_transform_stack; zio->io_transform_stack = zt; zio->io_data = data; zio->io_size = size; } static void zio_pop_transforms(zio_t *zio) { zio_transform_t *zt; while ((zt = zio->io_transform_stack) != NULL) { if (zt->zt_transform != NULL) zt->zt_transform(zio, zt->zt_orig_data, zt->zt_orig_size); if (zt->zt_bufsize != 0) zio_buf_free(zio->io_data, zt->zt_bufsize); zio->io_data = zt->zt_orig_data; zio->io_size = zt->zt_orig_size; zio->io_transform_stack = zt->zt_next; kmem_free(zt, sizeof (zio_transform_t)); } } /* * ========================================================================== * I/O transform callbacks for subblocks and decompression * ========================================================================== */ static void zio_subblock(zio_t *zio, void *data, uint64_t size) { ASSERT(zio->io_size > size); if (zio->io_type == ZIO_TYPE_READ) bcopy(zio->io_data, data, size); } static void zio_decompress(zio_t *zio, void *data, uint64_t size) { if (zio->io_error == 0 && zio_decompress_data(BP_GET_COMPRESS(zio->io_bp), zio->io_data, data, zio->io_size, size) != 0) zio->io_error = SET_ERROR(EIO); } /* * ========================================================================== * I/O parent/child relationships and pipeline interlocks * ========================================================================== */ /* * NOTE - Callers to zio_walk_parents() and zio_walk_children must * continue calling these functions until they return NULL. * Otherwise, the next caller will pick up the list walk in * some indeterminate state. (Otherwise every caller would * have to pass in a cookie to keep the state represented by * io_walk_link, which gets annoying.) */ zio_t * zio_walk_parents(zio_t *cio) { zio_link_t *zl = cio->io_walk_link; list_t *pl = &cio->io_parent_list; zl = (zl == NULL) ? list_head(pl) : list_next(pl, zl); cio->io_walk_link = zl; if (zl == NULL) return (NULL); ASSERT(zl->zl_child == cio); return (zl->zl_parent); } zio_t * zio_walk_children(zio_t *pio) { zio_link_t *zl = pio->io_walk_link; list_t *cl = &pio->io_child_list; zl = (zl == NULL) ? list_head(cl) : list_next(cl, zl); pio->io_walk_link = zl; if (zl == NULL) return (NULL); ASSERT(zl->zl_parent == pio); return (zl->zl_child); } zio_t * zio_unique_parent(zio_t *cio) { zio_t *pio = zio_walk_parents(cio); VERIFY(zio_walk_parents(cio) == NULL); return (pio); } void zio_add_child(zio_t *pio, zio_t *cio) { zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP); /* * Logical I/Os can have logical, gang, or vdev children. * Gang I/Os can have gang or vdev children. * Vdev I/Os can only have vdev children. * The following ASSERT captures all of these constraints. */ ASSERT(cio->io_child_type <= pio->io_child_type); zl->zl_parent = pio; zl->zl_child = cio; mutex_enter(&cio->io_lock); mutex_enter(&pio->io_lock); ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0); for (int w = 0; w < ZIO_WAIT_TYPES; w++) pio->io_children[cio->io_child_type][w] += !cio->io_state[w]; list_insert_head(&pio->io_child_list, zl); list_insert_head(&cio->io_parent_list, zl); pio->io_child_count++; cio->io_parent_count++; mutex_exit(&pio->io_lock); mutex_exit(&cio->io_lock); } static void zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl) { ASSERT(zl->zl_parent == pio); ASSERT(zl->zl_child == cio); mutex_enter(&cio->io_lock); mutex_enter(&pio->io_lock); list_remove(&pio->io_child_list, zl); list_remove(&cio->io_parent_list, zl); pio->io_child_count--; cio->io_parent_count--; mutex_exit(&pio->io_lock); mutex_exit(&cio->io_lock); kmem_cache_free(zio_link_cache, zl); } static boolean_t zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait) { uint64_t *countp = &zio->io_children[child][wait]; boolean_t waiting = B_FALSE; mutex_enter(&zio->io_lock); ASSERT(zio->io_stall == NULL); if (*countp != 0) { zio->io_stage >>= 1; zio->io_stall = countp; waiting = B_TRUE; } mutex_exit(&zio->io_lock); return (waiting); } static void zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait) { uint64_t *countp = &pio->io_children[zio->io_child_type][wait]; int *errorp = &pio->io_child_error[zio->io_child_type]; mutex_enter(&pio->io_lock); if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) *errorp = zio_worst_error(*errorp, zio->io_error); pio->io_reexecute |= zio->io_reexecute; ASSERT3U(*countp, >, 0); (*countp)--; if (*countp == 0 && pio->io_stall == countp) { pio->io_stall = NULL; mutex_exit(&pio->io_lock); zio_execute(pio); } else { mutex_exit(&pio->io_lock); } } static void zio_inherit_child_errors(zio_t *zio, enum zio_child c) { if (zio->io_child_error[c] != 0 && zio->io_error == 0) zio->io_error = zio->io_child_error[c]; } /* * ========================================================================== * Create the various types of I/O (read, write, free, etc) * ========================================================================== */ static zio_t * zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, void *data, uint64_t size, zio_done_func_t *done, void *private, zio_type_t type, zio_priority_t priority, enum zio_flag flags, vdev_t *vd, uint64_t offset, const zbookmark_phys_t *zb, enum zio_stage stage, enum zio_stage pipeline) { zio_t *zio; ASSERT3U(type == ZIO_TYPE_FREE || size, <=, SPA_MAXBLOCKSIZE); ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0); ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0); ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER)); ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER)); ASSERT(vd || stage == ZIO_STAGE_OPEN); zio = kmem_cache_alloc(zio_cache, KM_SLEEP); bzero(zio, sizeof (zio_t)); mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL); list_create(&zio->io_parent_list, sizeof (zio_link_t), offsetof(zio_link_t, zl_parent_node)); list_create(&zio->io_child_list, sizeof (zio_link_t), offsetof(zio_link_t, zl_child_node)); if (vd != NULL) zio->io_child_type = ZIO_CHILD_VDEV; else if (flags & ZIO_FLAG_GANG_CHILD) zio->io_child_type = ZIO_CHILD_GANG; else if (flags & ZIO_FLAG_DDT_CHILD) zio->io_child_type = ZIO_CHILD_DDT; else zio->io_child_type = ZIO_CHILD_LOGICAL; if (bp != NULL) { zio->io_bp = (blkptr_t *)bp; zio->io_bp_copy = *bp; zio->io_bp_orig = *bp; if (type != ZIO_TYPE_WRITE || zio->io_child_type == ZIO_CHILD_DDT) zio->io_bp = &zio->io_bp_copy; /* so caller can free */ if (zio->io_child_type == ZIO_CHILD_LOGICAL) zio->io_logical = zio; if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp)) pipeline |= ZIO_GANG_STAGES; } zio->io_spa = spa; zio->io_txg = txg; zio->io_done = done; zio->io_private = private; zio->io_type = type; zio->io_priority = priority; zio->io_vd = vd; zio->io_offset = offset; zio->io_orig_data = zio->io_data = data; zio->io_orig_size = zio->io_size = size; zio->io_orig_flags = zio->io_flags = flags; zio->io_orig_stage = zio->io_stage = stage; zio->io_orig_pipeline = zio->io_pipeline = pipeline; zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY); zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE); if (zb != NULL) zio->io_bookmark = *zb; if (pio != NULL) { if (zio->io_logical == NULL) zio->io_logical = pio->io_logical; if (zio->io_child_type == ZIO_CHILD_GANG) zio->io_gang_leader = pio->io_gang_leader; zio_add_child(pio, zio); } return (zio); } static void zio_destroy(zio_t *zio) { list_destroy(&zio->io_parent_list); list_destroy(&zio->io_child_list); mutex_destroy(&zio->io_lock); cv_destroy(&zio->io_cv); kmem_cache_free(zio_cache, zio); } zio_t * zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done, void *private, enum zio_flag flags) { zio_t *zio; zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL, ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE); return (zio); } zio_t * zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags) { return (zio_null(NULL, spa, NULL, done, private, flags)); } void zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp) { if (!DMU_OT_IS_VALID(BP_GET_TYPE(bp))) { zfs_panic_recover("blkptr at %p has invalid TYPE %llu", bp, (longlong_t)BP_GET_TYPE(bp)); } if (BP_GET_CHECKSUM(bp) >= ZIO_CHECKSUM_FUNCTIONS || BP_GET_CHECKSUM(bp) <= ZIO_CHECKSUM_ON) { zfs_panic_recover("blkptr at %p has invalid CHECKSUM %llu", bp, (longlong_t)BP_GET_CHECKSUM(bp)); } if (BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_FUNCTIONS || BP_GET_COMPRESS(bp) <= ZIO_COMPRESS_ON) { zfs_panic_recover("blkptr at %p has invalid COMPRESS %llu", bp, (longlong_t)BP_GET_COMPRESS(bp)); } if (BP_GET_LSIZE(bp) > SPA_MAXBLOCKSIZE) { zfs_panic_recover("blkptr at %p has invalid LSIZE %llu", bp, (longlong_t)BP_GET_LSIZE(bp)); } if (BP_GET_PSIZE(bp) > SPA_MAXBLOCKSIZE) { zfs_panic_recover("blkptr at %p has invalid PSIZE %llu", bp, (longlong_t)BP_GET_PSIZE(bp)); } if (BP_IS_EMBEDDED(bp)) { if (BPE_GET_ETYPE(bp) > NUM_BP_EMBEDDED_TYPES) { zfs_panic_recover("blkptr at %p has invalid ETYPE %llu", bp, (longlong_t)BPE_GET_ETYPE(bp)); } } /* * Pool-specific checks. * * Note: it would be nice to verify that the blk_birth and * BP_PHYSICAL_BIRTH() are not too large. However, spa_freeze() * allows the birth time of log blocks (and dmu_sync()-ed blocks * that are in the log) to be arbitrarily large. */ for (int i = 0; i < BP_GET_NDVAS(bp); i++) { uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[i]); if (vdevid >= spa->spa_root_vdev->vdev_children) { zfs_panic_recover("blkptr at %p DVA %u has invalid " "VDEV %llu", bp, i, (longlong_t)vdevid); continue; } vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid]; if (vd == NULL) { zfs_panic_recover("blkptr at %p DVA %u has invalid " "VDEV %llu", bp, i, (longlong_t)vdevid); continue; } if (vd->vdev_ops == &vdev_hole_ops) { zfs_panic_recover("blkptr at %p DVA %u has hole " "VDEV %llu", bp, i, (longlong_t)vdevid); continue; } if (vd->vdev_ops == &vdev_missing_ops) { /* * "missing" vdevs are valid during import, but we * don't have their detailed info (e.g. asize), so * we can't perform any more checks on them. */ continue; } uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]); uint64_t asize = DVA_GET_ASIZE(&bp->blk_dva[i]); if (BP_IS_GANG(bp)) asize = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); if (offset + asize > vd->vdev_asize) { zfs_panic_recover("blkptr at %p DVA %u has invalid " "OFFSET %llu", bp, i, (longlong_t)offset); } } } zio_t * zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, void *data, uint64_t size, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb) { zio_t *zio; zfs_blkptr_verify(spa, bp); zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp, data, size, done, private, ZIO_TYPE_READ, priority, flags, NULL, 0, zb, ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE); return (zio); } zio_t * zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data, uint64_t size, const zio_prop_t *zp, zio_done_func_t *ready, zio_done_func_t *physdone, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb) { zio_t *zio; ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF && zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS && zp->zp_compress >= ZIO_COMPRESS_OFF && zp->zp_compress < ZIO_COMPRESS_FUNCTIONS && DMU_OT_IS_VALID(zp->zp_type) && zp->zp_level < 32 && zp->zp_copies > 0 && zp->zp_copies <= spa_max_replication(spa)); zio = zio_create(pio, spa, txg, bp, data, size, done, private, ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb, ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE); zio->io_ready = ready; zio->io_physdone = physdone; zio->io_prop = *zp; /* * Data can be NULL if we are going to call zio_write_override() to * provide the already-allocated BP. But we may need the data to * verify a dedup hit (if requested). In this case, don't try to * dedup (just take the already-allocated BP verbatim). */ if (data == NULL && zio->io_prop.zp_dedup_verify) { zio->io_prop.zp_dedup = zio->io_prop.zp_dedup_verify = B_FALSE; } return (zio); } zio_t * zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data, uint64_t size, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb) { zio_t *zio; zio = zio_create(pio, spa, txg, bp, data, size, done, private, ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb, ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE); return (zio); } void zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite) { ASSERT(zio->io_type == ZIO_TYPE_WRITE); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(zio->io_stage == ZIO_STAGE_OPEN); ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa)); /* * We must reset the io_prop to match the values that existed * when the bp was first written by dmu_sync() keeping in mind * that nopwrite and dedup are mutually exclusive. */ zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup; zio->io_prop.zp_nopwrite = nopwrite; zio->io_prop.zp_copies = copies; zio->io_bp_override = bp; } void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp) { /* * The check for EMBEDDED is a performance optimization. We * process the free here (by ignoring it) rather than * putting it on the list and then processing it in zio_free_sync(). */ if (BP_IS_EMBEDDED(bp)) return; metaslab_check_free(spa, bp); /* * Frees that are for the currently-syncing txg, are not going to be * deferred, and which will not need to do a read (i.e. not GANG or * DEDUP), can be processed immediately. Otherwise, put them on the * in-memory list for later processing. */ if (zfs_trim_enabled || BP_IS_GANG(bp) || BP_GET_DEDUP(bp) || txg != spa->spa_syncing_txg || spa_sync_pass(spa) >= zfs_sync_pass_deferred_free) { bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp); } else { VERIFY0(zio_wait(zio_free_sync(NULL, spa, txg, bp, BP_GET_PSIZE(bp), 0))); } } zio_t * zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, uint64_t size, enum zio_flag flags) { zio_t *zio; enum zio_stage stage = ZIO_FREE_PIPELINE; ASSERT(!BP_IS_HOLE(bp)); ASSERT(spa_syncing_txg(spa) == txg); ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free); if (BP_IS_EMBEDDED(bp)) return (zio_null(pio, spa, NULL, NULL, NULL, 0)); metaslab_check_free(spa, bp); arc_freed(spa, bp); if (zfs_trim_enabled) stage |= ZIO_STAGE_ISSUE_ASYNC | ZIO_STAGE_VDEV_IO_START | ZIO_STAGE_VDEV_IO_ASSESS; /* * GANG and DEDUP blocks can induce a read (for the gang block header, * or the DDT), so issue them asynchronously so that this thread is * not tied up. */ else if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp)) stage |= ZIO_STAGE_ISSUE_ASYNC; flags |= ZIO_FLAG_DONT_QUEUE; zio = zio_create(pio, spa, txg, bp, NULL, size, NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW, flags, NULL, 0, NULL, ZIO_STAGE_OPEN, stage); return (zio); } zio_t * zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio_done_func_t *done, void *private, enum zio_flag flags) { zio_t *zio; dprintf_bp(bp, "claiming in txg %llu", txg); if (BP_IS_EMBEDDED(bp)) return (zio_null(pio, spa, NULL, NULL, NULL, 0)); /* * A claim is an allocation of a specific block. Claims are needed * to support immediate writes in the intent log. The issue is that * immediate writes contain committed data, but in a txg that was * *not* committed. Upon opening the pool after an unclean shutdown, * the intent log claims all blocks that contain immediate write data * so that the SPA knows they're in use. * * All claims *must* be resolved in the first txg -- before the SPA * starts allocating blocks -- so that nothing is allocated twice. * If txg == 0 we just verify that the block is claimable. */ ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa)); ASSERT(txg == spa_first_txg(spa) || txg == 0); ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa)); /* zdb(1M) */ zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags, NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE); return (zio); } zio_t * zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, uint64_t offset, uint64_t size, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags) { zio_t *zio; int c; if (vd->vdev_children == 0) { zio = zio_create(pio, spa, 0, NULL, NULL, size, done, private, ZIO_TYPE_IOCTL, priority, flags, vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); zio->io_cmd = cmd; } else { zio = zio_null(pio, spa, NULL, NULL, NULL, flags); for (c = 0; c < vd->vdev_children; c++) zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd, offset, size, done, private, priority, flags)); } return (zio); } zio_t * zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, void *data, int checksum, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, boolean_t labels) { zio_t *zio; ASSERT(vd->vdev_children == 0); ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); ASSERT3U(offset + size, <=, vd->vdev_psize); zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private, ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE); zio->io_prop.zp_checksum = checksum; return (zio); } zio_t * zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, void *data, int checksum, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, boolean_t labels) { zio_t *zio; ASSERT(vd->vdev_children == 0); ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); ASSERT3U(offset + size, <=, vd->vdev_psize); zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private, ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE); zio->io_prop.zp_checksum = checksum; if (zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_EMBEDDED) { /* * zec checksums are necessarily destructive -- they modify * the end of the write buffer to hold the verifier/checksum. * Therefore, we must make a local copy in case the data is * being written to multiple places in parallel. */ void *wbuf = zio_buf_alloc(size); bcopy(data, wbuf, size); zio_push_transform(zio, wbuf, size, size, NULL); } return (zio); } /* * Create a child I/O to do some work for us. */ zio_t * zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, void *data, uint64_t size, int type, zio_priority_t priority, enum zio_flag flags, zio_done_func_t *done, void *private) { enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE; zio_t *zio; ASSERT(vd->vdev_parent == (pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev)); if (type == ZIO_TYPE_READ && bp != NULL) { /* * If we have the bp, then the child should perform the * checksum and the parent need not. This pushes error * detection as close to the leaves as possible and * eliminates redundant checksums in the interior nodes. */ pipeline |= ZIO_STAGE_CHECKSUM_VERIFY; pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; } /* Not all IO types require vdev io done stage e.g. free */ if (!(pio->io_pipeline & ZIO_STAGE_VDEV_IO_DONE)) pipeline &= ~ZIO_STAGE_VDEV_IO_DONE; if (vd->vdev_children == 0) offset += VDEV_LABEL_START_SIZE; flags |= ZIO_VDEV_CHILD_FLAGS(pio) | ZIO_FLAG_DONT_PROPAGATE; /* * If we've decided to do a repair, the write is not speculative -- * even if the original read was. */ if (flags & ZIO_FLAG_IO_REPAIR) flags &= ~ZIO_FLAG_SPECULATIVE; zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, done, private, type, priority, flags, vd, offset, &pio->io_bookmark, ZIO_STAGE_VDEV_IO_START >> 1, pipeline); zio->io_physdone = pio->io_physdone; if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL) zio->io_logical->io_phys_children++; return (zio); } zio_t * zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size, int type, zio_priority_t priority, enum zio_flag flags, zio_done_func_t *done, void *private) { zio_t *zio; ASSERT(vd->vdev_ops->vdev_op_leaf); zio = zio_create(NULL, vd->vdev_spa, 0, NULL, data, size, done, private, type, priority, flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED, vd, offset, NULL, ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE); return (zio); } void zio_flush(zio_t *zio, vdev_t *vd) { zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, 0, 0, NULL, NULL, ZIO_PRIORITY_NOW, ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY)); } zio_t * zio_trim(zio_t *zio, spa_t *spa, vdev_t *vd, uint64_t offset, uint64_t size) { ASSERT(vd->vdev_ops->vdev_op_leaf); return (zio_create(zio, spa, 0, NULL, NULL, size, NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_TRIM, ZIO_FLAG_DONT_AGGREGATE | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PHYS_PIPELINE)); } void zio_shrink(zio_t *zio, uint64_t size) { ASSERT(zio->io_executor == NULL); ASSERT(zio->io_orig_size == zio->io_size); ASSERT(size <= zio->io_size); /* * We don't shrink for raidz because of problems with the * reconstruction when reading back less than the block size. * Note, BP_IS_RAIDZ() assumes no compression. */ ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF); if (!BP_IS_RAIDZ(zio->io_bp)) zio->io_orig_size = zio->io_size = size; } /* * ========================================================================== * Prepare to read and write logical blocks * ========================================================================== */ static int zio_read_bp_init(zio_t *zio) { blkptr_t *bp = zio->io_bp; if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF && zio->io_child_type == ZIO_CHILD_LOGICAL && !(zio->io_flags & ZIO_FLAG_RAW)) { uint64_t psize = BP_IS_EMBEDDED(bp) ? BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp); void *cbuf = zio_buf_alloc(psize); zio_push_transform(zio, cbuf, psize, psize, zio_decompress); } if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) { zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; decode_embedded_bp_compressed(bp, zio->io_data); } else { ASSERT(!BP_IS_EMBEDDED(bp)); } if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0) zio->io_flags |= ZIO_FLAG_DONT_CACHE; if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP) zio->io_flags |= ZIO_FLAG_DONT_CACHE; if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL) zio->io_pipeline = ZIO_DDT_READ_PIPELINE; return (ZIO_PIPELINE_CONTINUE); } static int zio_write_bp_init(zio_t *zio) { spa_t *spa = zio->io_spa; zio_prop_t *zp = &zio->io_prop; enum zio_compress compress = zp->zp_compress; blkptr_t *bp = zio->io_bp; uint64_t lsize = zio->io_size; uint64_t psize = lsize; int pass = 1; /* * If our children haven't all reached the ready stage, * wait for them and then repeat this pipeline stage. */ if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) || zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY)) return (ZIO_PIPELINE_STOP); if (!IO_IS_ALLOCATING(zio)) return (ZIO_PIPELINE_CONTINUE); ASSERT(zio->io_child_type != ZIO_CHILD_DDT); if (zio->io_bp_override) { ASSERT(bp->blk_birth != zio->io_txg); ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0); *bp = *zio->io_bp_override; zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; if (BP_IS_EMBEDDED(bp)) return (ZIO_PIPELINE_CONTINUE); /* * If we've been overridden and nopwrite is set then * set the flag accordingly to indicate that a nopwrite * has already occurred. */ if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) { ASSERT(!zp->zp_dedup); zio->io_flags |= ZIO_FLAG_NOPWRITE; return (ZIO_PIPELINE_CONTINUE); } ASSERT(!zp->zp_nopwrite); if (BP_IS_HOLE(bp) || !zp->zp_dedup) return (ZIO_PIPELINE_CONTINUE); ASSERT((zio_checksum_table[zp->zp_checksum].ci_flags & ZCHECKSUM_FLAG_DEDUP) || zp->zp_dedup_verify); if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) { BP_SET_DEDUP(bp, 1); zio->io_pipeline |= ZIO_STAGE_DDT_WRITE; return (ZIO_PIPELINE_CONTINUE); } zio->io_bp_override = NULL; BP_ZERO(bp); } if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) { /* * We're rewriting an existing block, which means we're * working on behalf of spa_sync(). For spa_sync() to * converge, it must eventually be the case that we don't * have to allocate new blocks. But compression changes * the blocksize, which forces a reallocate, and makes * convergence take longer. Therefore, after the first * few passes, stop compressing to ensure convergence. */ pass = spa_sync_pass(spa); ASSERT(zio->io_txg == spa_syncing_txg(spa)); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(!BP_GET_DEDUP(bp)); if (pass >= zfs_sync_pass_dont_compress) compress = ZIO_COMPRESS_OFF; /* Make sure someone doesn't change their mind on overwrites */ ASSERT(BP_IS_EMBEDDED(bp) || MIN(zp->zp_copies + BP_IS_GANG(bp), spa_max_replication(spa)) == BP_GET_NDVAS(bp)); } if (compress != ZIO_COMPRESS_OFF) { void *cbuf = zio_buf_alloc(lsize); psize = zio_compress_data(compress, zio->io_data, cbuf, lsize); if (psize == 0 || psize == lsize) { compress = ZIO_COMPRESS_OFF; zio_buf_free(cbuf, lsize); } else if (!zp->zp_dedup && psize <= BPE_PAYLOAD_SIZE && zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) && spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) { encode_embedded_bp_compressed(bp, cbuf, compress, lsize, psize); BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA); BP_SET_TYPE(bp, zio->io_prop.zp_type); BP_SET_LEVEL(bp, zio->io_prop.zp_level); zio_buf_free(cbuf, lsize); bp->blk_birth = zio->io_txg; zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; ASSERT(spa_feature_is_active(spa, SPA_FEATURE_EMBEDDED_DATA)); return (ZIO_PIPELINE_CONTINUE); } else { /* * Round up compressed size up to the ashift * of the smallest-ashift device, and zero the tail. * This ensures that the compressed size of the BP * (and thus compressratio property) are correct, * in that we charge for the padding used to fill out * the last sector. */ ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT); size_t rounded = (size_t)P2ROUNDUP(psize, 1ULL << spa->spa_min_ashift); if (rounded >= lsize) { compress = ZIO_COMPRESS_OFF; zio_buf_free(cbuf, lsize); psize = lsize; } else { bzero((char *)cbuf + psize, rounded - psize); psize = rounded; zio_push_transform(zio, cbuf, psize, lsize, NULL); } } } /* * The final pass of spa_sync() must be all rewrites, but the first * few passes offer a trade-off: allocating blocks defers convergence, * but newly allocated blocks are sequential, so they can be written * to disk faster. Therefore, we allow the first few passes of * spa_sync() to allocate new blocks, but force rewrites after that. * There should only be a handful of blocks after pass 1 in any case. */ if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == psize && pass >= zfs_sync_pass_rewrite) { ASSERT(psize != 0); enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES; zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages; zio->io_flags |= ZIO_FLAG_IO_REWRITE; } else { BP_ZERO(bp); zio->io_pipeline = ZIO_WRITE_PIPELINE; } if (psize == 0) { if (zio->io_bp_orig.blk_birth != 0 && spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) { BP_SET_LSIZE(bp, lsize); BP_SET_TYPE(bp, zp->zp_type); BP_SET_LEVEL(bp, zp->zp_level); BP_SET_BIRTH(bp, zio->io_txg, 0); } zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; } else { ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER); BP_SET_LSIZE(bp, lsize); BP_SET_TYPE(bp, zp->zp_type); BP_SET_LEVEL(bp, zp->zp_level); BP_SET_PSIZE(bp, psize); BP_SET_COMPRESS(bp, compress); BP_SET_CHECKSUM(bp, zp->zp_checksum); BP_SET_DEDUP(bp, zp->zp_dedup); BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); if (zp->zp_dedup) { ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE; } if (zp->zp_nopwrite) { ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); zio->io_pipeline |= ZIO_STAGE_NOP_WRITE; } } return (ZIO_PIPELINE_CONTINUE); } static int zio_free_bp_init(zio_t *zio) { blkptr_t *bp = zio->io_bp; if (zio->io_child_type == ZIO_CHILD_LOGICAL) { if (BP_GET_DEDUP(bp)) zio->io_pipeline = ZIO_DDT_FREE_PIPELINE; } return (ZIO_PIPELINE_CONTINUE); } /* * ========================================================================== * Execute the I/O pipeline * ========================================================================== */ static void zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline) { spa_t *spa = zio->io_spa; zio_type_t t = zio->io_type; int flags = (cutinline ? TQ_FRONT : 0); ASSERT(q == ZIO_TASKQ_ISSUE || q == ZIO_TASKQ_INTERRUPT); /* * If we're a config writer or a probe, the normal issue and * interrupt threads may all be blocked waiting for the config lock. * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL. */ if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE)) t = ZIO_TYPE_NULL; /* * A similar issue exists for the L2ARC write thread until L2ARC 2.0. */ if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux) t = ZIO_TYPE_NULL; /* * If this is a high priority I/O, then use the high priority taskq if * available. */ if (zio->io_priority == ZIO_PRIORITY_NOW && spa->spa_zio_taskq[t][q + 1].stqs_count != 0) q++; ASSERT3U(q, <, ZIO_TASKQ_TYPES); /* * NB: We are assuming that the zio can only be dispatched * to a single taskq at a time. It would be a grievous error * to dispatch the zio to another taskq at the same time. */ #if defined(illumos) || !defined(_KERNEL) ASSERT(zio->io_tqent.tqent_next == NULL); #else ASSERT(zio->io_tqent.tqent_task.ta_pending == 0); #endif spa_taskq_dispatch_ent(spa, t, q, (task_func_t *)zio_execute, zio, flags, &zio->io_tqent); } static boolean_t zio_taskq_member(zio_t *zio, zio_taskq_type_t q) { kthread_t *executor = zio->io_executor; spa_t *spa = zio->io_spa; for (zio_type_t t = 0; t < ZIO_TYPES; t++) { spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; uint_t i; for (i = 0; i < tqs->stqs_count; i++) { if (taskq_member(tqs->stqs_taskq[i], executor)) return (B_TRUE); } } return (B_FALSE); } static int zio_issue_async(zio_t *zio) { zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); return (ZIO_PIPELINE_STOP); } void zio_interrupt(zio_t *zio) { zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE); } +void +zio_delay_interrupt(zio_t *zio) +{ + /* + * The timeout_generic() function isn't defined in userspace, so + * rather than trying to implement the function, the zio delay + * functionality has been disabled for userspace builds. + */ + +#ifdef _KERNEL + /* + * If io_target_timestamp is zero, then no delay has been registered + * for this IO, thus jump to the end of this function and "skip" the + * delay; issuing it directly to the zio layer. + */ + if (zio->io_target_timestamp != 0) { + hrtime_t now = gethrtime(); + + if (now >= zio->io_target_timestamp) { + /* + * This IO has already taken longer than the target + * delay to complete, so we don't want to delay it + * any longer; we "miss" the delay and issue it + * directly to the zio layer. This is likely due to + * the target latency being set to a value less than + * the underlying hardware can satisfy (e.g. delay + * set to 1ms, but the disks take 10ms to complete an + * IO request). + */ + + DTRACE_PROBE2(zio__delay__miss, zio_t *, zio, + hrtime_t, now); + + zio_interrupt(zio); + } else { + hrtime_t diff = zio->io_target_timestamp - now; + + DTRACE_PROBE3(zio__delay__hit, zio_t *, zio, + hrtime_t, now, hrtime_t, diff); + + (void) timeout_generic(CALLOUT_NORMAL, + (void (*)(void *))zio_interrupt, zio, diff, 1, 0); + } + + return; + } +#endif + + DTRACE_PROBE1(zio__delay__skip, zio_t *, zio); + zio_interrupt(zio); +} + /* * Execute the I/O pipeline until one of the following occurs: * * (1) the I/O completes * (2) the pipeline stalls waiting for dependent child I/Os * (3) the I/O issues, so we're waiting for an I/O completion interrupt * (4) the I/O is delegated by vdev-level caching or aggregation * (5) the I/O is deferred due to vdev-level queueing * (6) the I/O is handed off to another thread. * * In all cases, the pipeline stops whenever there's no CPU work; it never * burns a thread in cv_wait(). * * There's no locking on io_stage because there's no legitimate way * for multiple threads to be attempting to process the same I/O. */ static zio_pipe_stage_t *zio_pipeline[]; void zio_execute(zio_t *zio) { zio->io_executor = curthread; while (zio->io_stage < ZIO_STAGE_DONE) { enum zio_stage pipeline = zio->io_pipeline; enum zio_stage stage = zio->io_stage; int rv; ASSERT(!MUTEX_HELD(&zio->io_lock)); ASSERT(ISP2(stage)); ASSERT(zio->io_stall == NULL); do { stage <<= 1; } while ((stage & pipeline) == 0); ASSERT(stage <= ZIO_STAGE_DONE); /* * If we are in interrupt context and this pipeline stage * will grab a config lock that is held across I/O, * or may wait for an I/O that needs an interrupt thread * to complete, issue async to avoid deadlock. * * For VDEV_IO_START, we cut in line so that the io will * be sent to disk promptly. */ if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL && zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) { boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ? zio_requeue_io_start_cut_in_line : B_FALSE; zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut); return; } zio->io_stage = stage; rv = zio_pipeline[highbit64(stage) - 1](zio); if (rv == ZIO_PIPELINE_STOP) return; ASSERT(rv == ZIO_PIPELINE_CONTINUE); } } /* * ========================================================================== * Initiate I/O, either sync or async * ========================================================================== */ int zio_wait(zio_t *zio) { int error; ASSERT(zio->io_stage == ZIO_STAGE_OPEN); ASSERT(zio->io_executor == NULL); zio->io_waiter = curthread; zio_execute(zio); mutex_enter(&zio->io_lock); while (zio->io_executor != NULL) cv_wait(&zio->io_cv, &zio->io_lock); mutex_exit(&zio->io_lock); error = zio->io_error; zio_destroy(zio); return (error); } void zio_nowait(zio_t *zio) { ASSERT(zio->io_executor == NULL); if (zio->io_child_type == ZIO_CHILD_LOGICAL && zio_unique_parent(zio) == NULL) { /* * This is a logical async I/O with no parent to wait for it. * We add it to the spa_async_root_zio "Godfather" I/O which * will ensure they complete prior to unloading the pool. */ spa_t *spa = zio->io_spa; zio_add_child(spa->spa_async_zio_root[CPU_SEQID], zio); } zio_execute(zio); } /* * ========================================================================== * Reexecute or suspend/resume failed I/O * ========================================================================== */ static void zio_reexecute(zio_t *pio) { zio_t *cio, *cio_next; ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN); ASSERT(pio->io_gang_leader == NULL); ASSERT(pio->io_gang_tree == NULL); pio->io_flags = pio->io_orig_flags; pio->io_stage = pio->io_orig_stage; pio->io_pipeline = pio->io_orig_pipeline; pio->io_reexecute = 0; pio->io_flags |= ZIO_FLAG_REEXECUTED; pio->io_error = 0; for (int w = 0; w < ZIO_WAIT_TYPES; w++) pio->io_state[w] = 0; for (int c = 0; c < ZIO_CHILD_TYPES; c++) pio->io_child_error[c] = 0; if (IO_IS_ALLOCATING(pio)) BP_ZERO(pio->io_bp); /* * As we reexecute pio's children, new children could be created. * New children go to the head of pio's io_child_list, however, * so we will (correctly) not reexecute them. The key is that * the remainder of pio's io_child_list, from 'cio_next' onward, * cannot be affected by any side effects of reexecuting 'cio'. */ for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) { cio_next = zio_walk_children(pio); mutex_enter(&pio->io_lock); for (int w = 0; w < ZIO_WAIT_TYPES; w++) pio->io_children[cio->io_child_type][w]++; mutex_exit(&pio->io_lock); zio_reexecute(cio); } /* * Now that all children have been reexecuted, execute the parent. * We don't reexecute "The Godfather" I/O here as it's the * responsibility of the caller to wait on him. */ if (!(pio->io_flags & ZIO_FLAG_GODFATHER)) zio_execute(pio); } void zio_suspend(spa_t *spa, zio_t *zio) { if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC) fm_panic("Pool '%s' has encountered an uncorrectable I/O " "failure and the failure mode property for this pool " "is set to panic.", spa_name(spa)); zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0); mutex_enter(&spa->spa_suspend_lock); if (spa->spa_suspend_zio_root == NULL) spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); spa->spa_suspended = B_TRUE; if (zio != NULL) { ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); ASSERT(zio != spa->spa_suspend_zio_root); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(zio_unique_parent(zio) == NULL); ASSERT(zio->io_stage == ZIO_STAGE_DONE); zio_add_child(spa->spa_suspend_zio_root, zio); } mutex_exit(&spa->spa_suspend_lock); } int zio_resume(spa_t *spa) { zio_t *pio; /* * Reexecute all previously suspended i/o. */ mutex_enter(&spa->spa_suspend_lock); spa->spa_suspended = B_FALSE; cv_broadcast(&spa->spa_suspend_cv); pio = spa->spa_suspend_zio_root; spa->spa_suspend_zio_root = NULL; mutex_exit(&spa->spa_suspend_lock); if (pio == NULL) return (0); zio_reexecute(pio); return (zio_wait(pio)); } void zio_resume_wait(spa_t *spa) { mutex_enter(&spa->spa_suspend_lock); while (spa_suspended(spa)) cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock); mutex_exit(&spa->spa_suspend_lock); } /* * ========================================================================== * Gang blocks. * * A gang block is a collection of small blocks that looks to the DMU * like one large block. When zio_dva_allocate() cannot find a block * of the requested size, due to either severe fragmentation or the pool * being nearly full, it calls zio_write_gang_block() to construct the * block from smaller fragments. * * A gang block consists of a gang header (zio_gbh_phys_t) and up to * three (SPA_GBH_NBLKPTRS) gang members. The gang header is just like * an indirect block: it's an array of block pointers. It consumes * only one sector and hence is allocatable regardless of fragmentation. * The gang header's bps point to its gang members, which hold the data. * * Gang blocks are self-checksumming, using the bp's * as the verifier to ensure uniqueness of the SHA256 checksum. * Critically, the gang block bp's blk_cksum is the checksum of the data, * not the gang header. This ensures that data block signatures (needed for * deduplication) are independent of how the block is physically stored. * * Gang blocks can be nested: a gang member may itself be a gang block. * Thus every gang block is a tree in which root and all interior nodes are * gang headers, and the leaves are normal blocks that contain user data. * The root of the gang tree is called the gang leader. * * To perform any operation (read, rewrite, free, claim) on a gang block, * zio_gang_assemble() first assembles the gang tree (minus data leaves) * in the io_gang_tree field of the original logical i/o by recursively * reading the gang leader and all gang headers below it. This yields * an in-core tree containing the contents of every gang header and the * bps for every constituent of the gang block. * * With the gang tree now assembled, zio_gang_issue() just walks the gang tree * and invokes a callback on each bp. To free a gang block, zio_gang_issue() * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp. * zio_claim_gang() provides a similarly trivial wrapper for zio_claim(). * zio_read_gang() is a wrapper around zio_read() that omits reading gang * headers, since we already have those in io_gang_tree. zio_rewrite_gang() * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite() * of the gang header plus zio_checksum_compute() of the data to update the * gang header's blk_cksum as described above. * * The two-phase assemble/issue model solves the problem of partial failure -- * what if you'd freed part of a gang block but then couldn't read the * gang header for another part? Assembling the entire gang tree first * ensures that all the necessary gang header I/O has succeeded before * starting the actual work of free, claim, or write. Once the gang tree * is assembled, free and claim are in-memory operations that cannot fail. * * In the event that a gang write fails, zio_dva_unallocate() walks the * gang tree to immediately free (i.e. insert back into the space map) * everything we've allocated. This ensures that we don't get ENOSPC * errors during repeated suspend/resume cycles due to a flaky device. * * Gang rewrites only happen during sync-to-convergence. If we can't assemble * the gang tree, we won't modify the block, so we can safely defer the free * (knowing that the block is still intact). If we *can* assemble the gang * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free * each constituent bp and we can allocate a new block on the next sync pass. * * In all cases, the gang tree allows complete recovery from partial failure. * ========================================================================== */ static zio_t * zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) { if (gn != NULL) return (pio); return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark)); } zio_t * zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) { zio_t *zio; if (gn != NULL) { zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); /* * As we rewrite each gang header, the pipeline will compute * a new gang block header checksum for it; but no one will * compute a new data checksum, so we do that here. The one * exception is the gang leader: the pipeline already computed * its data checksum because that stage precedes gang assembly. * (Presently, nothing actually uses interior data checksums; * this is just good hygiene.) */ if (gn != pio->io_gang_leader->io_gang_tree) { zio_checksum_compute(zio, BP_GET_CHECKSUM(bp), data, BP_GET_PSIZE(bp)); } /* * If we are here to damage data for testing purposes, * leave the GBH alone so that we can detect the damage. */ if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE) zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; } else { zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); } return (zio); } /* ARGSUSED */ zio_t * zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) { return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp, BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp), ZIO_GANG_CHILD_FLAGS(pio))); } /* ARGSUSED */ zio_t * zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) { return (zio_claim(pio, pio->io_spa, pio->io_txg, bp, NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio))); } static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = { NULL, zio_read_gang, zio_rewrite_gang, zio_free_gang, zio_claim_gang, NULL }; static void zio_gang_tree_assemble_done(zio_t *zio); static zio_gang_node_t * zio_gang_node_alloc(zio_gang_node_t **gnpp) { zio_gang_node_t *gn; ASSERT(*gnpp == NULL); gn = kmem_zalloc(sizeof (*gn), KM_SLEEP); gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE); *gnpp = gn; return (gn); } static void zio_gang_node_free(zio_gang_node_t **gnpp) { zio_gang_node_t *gn = *gnpp; for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) ASSERT(gn->gn_child[g] == NULL); zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE); kmem_free(gn, sizeof (*gn)); *gnpp = NULL; } static void zio_gang_tree_free(zio_gang_node_t **gnpp) { zio_gang_node_t *gn = *gnpp; if (gn == NULL) return; for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) zio_gang_tree_free(&gn->gn_child[g]); zio_gang_node_free(gnpp); } static void zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp) { zio_gang_node_t *gn = zio_gang_node_alloc(gnpp); ASSERT(gio->io_gang_leader == gio); ASSERT(BP_IS_GANG(bp)); zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh, SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn, gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark)); } static void zio_gang_tree_assemble_done(zio_t *zio) { zio_t *gio = zio->io_gang_leader; zio_gang_node_t *gn = zio->io_private; blkptr_t *bp = zio->io_bp; ASSERT(gio == zio_unique_parent(zio)); ASSERT(zio->io_child_count == 0); if (zio->io_error) return; if (BP_SHOULD_BYTESWAP(bp)) byteswap_uint64_array(zio->io_data, zio->io_size); ASSERT(zio->io_data == gn->gn_gbh); ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; if (!BP_IS_GANG(gbp)) continue; zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]); } } static void zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data) { zio_t *gio = pio->io_gang_leader; zio_t *zio; ASSERT(BP_IS_GANG(bp) == !!gn); ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp)); ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree); /* * If you're a gang header, your data is in gn->gn_gbh. * If you're a gang member, your data is in 'data' and gn == NULL. */ zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data); if (gn != NULL) { ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; if (BP_IS_HOLE(gbp)) continue; zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data); data = (char *)data + BP_GET_PSIZE(gbp); } } if (gn == gio->io_gang_tree && gio->io_data != NULL) ASSERT3P((char *)gio->io_data + gio->io_size, ==, data); if (zio != pio) zio_nowait(zio); } static int zio_gang_assemble(zio_t *zio) { blkptr_t *bp = zio->io_bp; ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL); ASSERT(zio->io_child_type > ZIO_CHILD_GANG); zio->io_gang_leader = zio; zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree); return (ZIO_PIPELINE_CONTINUE); } static int zio_gang_issue(zio_t *zio) { blkptr_t *bp = zio->io_bp; if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE)) return (ZIO_PIPELINE_STOP); ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio); ASSERT(zio->io_child_type > ZIO_CHILD_GANG); if (zio->io_child_error[ZIO_CHILD_GANG] == 0) zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data); else zio_gang_tree_free(&zio->io_gang_tree); zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; return (ZIO_PIPELINE_CONTINUE); } static void zio_write_gang_member_ready(zio_t *zio) { zio_t *pio = zio_unique_parent(zio); zio_t *gio = zio->io_gang_leader; dva_t *cdva = zio->io_bp->blk_dva; dva_t *pdva = pio->io_bp->blk_dva; uint64_t asize; if (BP_IS_HOLE(zio->io_bp)) return; ASSERT(BP_IS_HOLE(&zio->io_bp_orig)); ASSERT(zio->io_child_type == ZIO_CHILD_GANG); ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies); ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp)); ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp)); ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp)); mutex_enter(&pio->io_lock); for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) { ASSERT(DVA_GET_GANG(&pdva[d])); asize = DVA_GET_ASIZE(&pdva[d]); asize += DVA_GET_ASIZE(&cdva[d]); DVA_SET_ASIZE(&pdva[d], asize); } mutex_exit(&pio->io_lock); } static int zio_write_gang_block(zio_t *pio) { spa_t *spa = pio->io_spa; blkptr_t *bp = pio->io_bp; zio_t *gio = pio->io_gang_leader; zio_t *zio; zio_gang_node_t *gn, **gnpp; zio_gbh_phys_t *gbh; uint64_t txg = pio->io_txg; uint64_t resid = pio->io_size; uint64_t lsize; int copies = gio->io_prop.zp_copies; int gbh_copies = MIN(copies + 1, spa_max_replication(spa)); zio_prop_t zp; int error; error = metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE, bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER); if (error) { pio->io_error = error; return (ZIO_PIPELINE_CONTINUE); } if (pio == gio) { gnpp = &gio->io_gang_tree; } else { gnpp = pio->io_private; ASSERT(pio->io_ready == zio_write_gang_member_ready); } gn = zio_gang_node_alloc(gnpp); gbh = gn->gn_gbh; bzero(gbh, SPA_GANGBLOCKSIZE); /* * Create the gang header. */ zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); /* * Create and nowait the gang children. */ for (int g = 0; resid != 0; resid -= lsize, g++) { lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g), SPA_MINBLOCKSIZE); ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid); zp.zp_checksum = gio->io_prop.zp_checksum; zp.zp_compress = ZIO_COMPRESS_OFF; zp.zp_type = DMU_OT_NONE; zp.zp_level = 0; zp.zp_copies = gio->io_prop.zp_copies; zp.zp_dedup = B_FALSE; zp.zp_dedup_verify = B_FALSE; zp.zp_nopwrite = B_FALSE; zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g], (char *)pio->io_data + (pio->io_size - resid), lsize, &zp, zio_write_gang_member_ready, NULL, NULL, &gn->gn_child[g], pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark)); } /* * Set pio's pipeline to just wait for zio to finish. */ pio->io_pipeline = ZIO_INTERLOCK_PIPELINE; zio_nowait(zio); return (ZIO_PIPELINE_CONTINUE); } /* * The zio_nop_write stage in the pipeline determines if allocating a * new bp is necessary. The nopwrite feature can handle writes in * either syncing or open context (i.e. zil writes) and as a result is * mutually exclusive with dedup. * * By leveraging a cryptographically secure checksum, such as SHA256, we * can compare the checksums of the new data and the old to determine if * allocating a new block is required. Note that our requirements for * cryptographic strength are fairly weak: there can't be any accidental * hash collisions, but we don't need to be secure against intentional * (malicious) collisions. To trigger a nopwrite, you have to be able * to write the file to begin with, and triggering an incorrect (hash * collision) nopwrite is no worse than simply writing to the file. * That said, there are no known attacks against the checksum algorithms * used for nopwrite, assuming that the salt and the checksums * themselves remain secret. */ static int zio_nop_write(zio_t *zio) { blkptr_t *bp = zio->io_bp; blkptr_t *bp_orig = &zio->io_bp_orig; zio_prop_t *zp = &zio->io_prop; ASSERT(BP_GET_LEVEL(bp) == 0); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); ASSERT(zp->zp_nopwrite); ASSERT(!zp->zp_dedup); ASSERT(zio->io_bp_override == NULL); ASSERT(IO_IS_ALLOCATING(zio)); /* * Check to see if the original bp and the new bp have matching * characteristics (i.e. same checksum, compression algorithms, etc). * If they don't then just continue with the pipeline which will * allocate a new bp. */ if (BP_IS_HOLE(bp_orig) || !(zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_flags & ZCHECKSUM_FLAG_NOPWRITE) || BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) || BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) || BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) || zp->zp_copies != BP_GET_NDVAS(bp_orig)) return (ZIO_PIPELINE_CONTINUE); /* * If the checksums match then reset the pipeline so that we * avoid allocating a new bp and issuing any I/O. */ if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) { ASSERT(zio_checksum_table[zp->zp_checksum].ci_flags & ZCHECKSUM_FLAG_NOPWRITE); ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig)); ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig)); ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF); ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop, sizeof (uint64_t)) == 0); *bp = *bp_orig; zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; zio->io_flags |= ZIO_FLAG_NOPWRITE; } return (ZIO_PIPELINE_CONTINUE); } /* * ========================================================================== * Dedup * ========================================================================== */ static void zio_ddt_child_read_done(zio_t *zio) { blkptr_t *bp = zio->io_bp; ddt_entry_t *dde = zio->io_private; ddt_phys_t *ddp; zio_t *pio = zio_unique_parent(zio); mutex_enter(&pio->io_lock); ddp = ddt_phys_select(dde, bp); if (zio->io_error == 0) ddt_phys_clear(ddp); /* this ddp doesn't need repair */ if (zio->io_error == 0 && dde->dde_repair_data == NULL) dde->dde_repair_data = zio->io_data; else zio_buf_free(zio->io_data, zio->io_size); mutex_exit(&pio->io_lock); } static int zio_ddt_read_start(zio_t *zio) { blkptr_t *bp = zio->io_bp; ASSERT(BP_GET_DEDUP(bp)); ASSERT(BP_GET_PSIZE(bp) == zio->io_size); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); if (zio->io_child_error[ZIO_CHILD_DDT]) { ddt_t *ddt = ddt_select(zio->io_spa, bp); ddt_entry_t *dde = ddt_repair_start(ddt, bp); ddt_phys_t *ddp = dde->dde_phys; ddt_phys_t *ddp_self = ddt_phys_select(dde, bp); blkptr_t blk; ASSERT(zio->io_vsd == NULL); zio->io_vsd = dde; if (ddp_self == NULL) return (ZIO_PIPELINE_CONTINUE); for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { if (ddp->ddp_phys_birth == 0 || ddp == ddp_self) continue; ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp, &blk); zio_nowait(zio_read(zio, zio->io_spa, &blk, zio_buf_alloc(zio->io_size), zio->io_size, zio_ddt_child_read_done, dde, zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE, &zio->io_bookmark)); } return (ZIO_PIPELINE_CONTINUE); } zio_nowait(zio_read(zio, zio->io_spa, bp, zio->io_data, zio->io_size, NULL, NULL, zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark)); return (ZIO_PIPELINE_CONTINUE); } static int zio_ddt_read_done(zio_t *zio) { blkptr_t *bp = zio->io_bp; if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE)) return (ZIO_PIPELINE_STOP); ASSERT(BP_GET_DEDUP(bp)); ASSERT(BP_GET_PSIZE(bp) == zio->io_size); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); if (zio->io_child_error[ZIO_CHILD_DDT]) { ddt_t *ddt = ddt_select(zio->io_spa, bp); ddt_entry_t *dde = zio->io_vsd; if (ddt == NULL) { ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE); return (ZIO_PIPELINE_CONTINUE); } if (dde == NULL) { zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1; zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); return (ZIO_PIPELINE_STOP); } if (dde->dde_repair_data != NULL) { bcopy(dde->dde_repair_data, zio->io_data, zio->io_size); zio->io_child_error[ZIO_CHILD_DDT] = 0; } ddt_repair_done(ddt, dde); zio->io_vsd = NULL; } ASSERT(zio->io_vsd == NULL); return (ZIO_PIPELINE_CONTINUE); } static boolean_t zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) { spa_t *spa = zio->io_spa; /* * Note: we compare the original data, not the transformed data, * because when zio->io_bp is an override bp, we will not have * pushed the I/O transforms. That's an important optimization * because otherwise we'd compress/encrypt all dmu_sync() data twice. */ for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { zio_t *lio = dde->dde_lead_zio[p]; if (lio != NULL) { return (lio->io_orig_size != zio->io_orig_size || bcmp(zio->io_orig_data, lio->io_orig_data, zio->io_orig_size) != 0); } } for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { ddt_phys_t *ddp = &dde->dde_phys[p]; if (ddp->ddp_phys_birth != 0) { arc_buf_t *abuf = NULL; arc_flags_t aflags = ARC_FLAG_WAIT; blkptr_t blk = *zio->io_bp; int error; ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth); ddt_exit(ddt); error = arc_read(NULL, spa, &blk, arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &aflags, &zio->io_bookmark); if (error == 0) { if (arc_buf_size(abuf) != zio->io_orig_size || bcmp(abuf->b_data, zio->io_orig_data, zio->io_orig_size) != 0) error = SET_ERROR(EEXIST); VERIFY(arc_buf_remove_ref(abuf, &abuf)); } ddt_enter(ddt); return (error != 0); } } return (B_FALSE); } static void zio_ddt_child_write_ready(zio_t *zio) { int p = zio->io_prop.zp_copies; ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); ddt_entry_t *dde = zio->io_private; ddt_phys_t *ddp = &dde->dde_phys[p]; zio_t *pio; if (zio->io_error) return; ddt_enter(ddt); ASSERT(dde->dde_lead_zio[p] == zio); ddt_phys_fill(ddp, zio->io_bp); while ((pio = zio_walk_parents(zio)) != NULL) ddt_bp_fill(ddp, pio->io_bp, zio->io_txg); ddt_exit(ddt); } static void zio_ddt_child_write_done(zio_t *zio) { int p = zio->io_prop.zp_copies; ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); ddt_entry_t *dde = zio->io_private; ddt_phys_t *ddp = &dde->dde_phys[p]; ddt_enter(ddt); ASSERT(ddp->ddp_refcnt == 0); ASSERT(dde->dde_lead_zio[p] == zio); dde->dde_lead_zio[p] = NULL; if (zio->io_error == 0) { while (zio_walk_parents(zio) != NULL) ddt_phys_addref(ddp); } else { ddt_phys_clear(ddp); } ddt_exit(ddt); } static void zio_ddt_ditto_write_done(zio_t *zio) { int p = DDT_PHYS_DITTO; zio_prop_t *zp = &zio->io_prop; blkptr_t *bp = zio->io_bp; ddt_t *ddt = ddt_select(zio->io_spa, bp); ddt_entry_t *dde = zio->io_private; ddt_phys_t *ddp = &dde->dde_phys[p]; ddt_key_t *ddk = &dde->dde_key; ddt_enter(ddt); ASSERT(ddp->ddp_refcnt == 0); ASSERT(dde->dde_lead_zio[p] == zio); dde->dde_lead_zio[p] = NULL; if (zio->io_error == 0) { ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum)); ASSERT(zp->zp_copies < SPA_DVAS_PER_BP); ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp)); if (ddp->ddp_phys_birth != 0) ddt_phys_free(ddt, ddk, ddp, zio->io_txg); ddt_phys_fill(ddp, bp); } ddt_exit(ddt); } static int zio_ddt_write(zio_t *zio) { spa_t *spa = zio->io_spa; blkptr_t *bp = zio->io_bp; uint64_t txg = zio->io_txg; zio_prop_t *zp = &zio->io_prop; int p = zp->zp_copies; int ditto_copies; zio_t *cio = NULL; zio_t *dio = NULL; ddt_t *ddt = ddt_select(spa, bp); ddt_entry_t *dde; ddt_phys_t *ddp; ASSERT(BP_GET_DEDUP(bp)); ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum); ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override); ddt_enter(ddt); dde = ddt_lookup(ddt, bp, B_TRUE); ddp = &dde->dde_phys[p]; if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) { /* * If we're using a weak checksum, upgrade to a strong checksum * and try again. If we're already using a strong checksum, * we can't resolve it, so just convert to an ordinary write. * (And automatically e-mail a paper to Nature?) */ if (!(zio_checksum_table[zp->zp_checksum].ci_flags & ZCHECKSUM_FLAG_DEDUP)) { zp->zp_checksum = spa_dedup_checksum(spa); zio_pop_transforms(zio); zio->io_stage = ZIO_STAGE_OPEN; BP_ZERO(bp); } else { zp->zp_dedup = B_FALSE; } zio->io_pipeline = ZIO_WRITE_PIPELINE; ddt_exit(ddt); return (ZIO_PIPELINE_CONTINUE); } ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp); ASSERT(ditto_copies < SPA_DVAS_PER_BP); if (ditto_copies > ddt_ditto_copies_present(dde) && dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) { zio_prop_t czp = *zp; czp.zp_copies = ditto_copies; /* * If we arrived here with an override bp, we won't have run * the transform stack, so we won't have the data we need to * generate a child i/o. So, toss the override bp and restart. * This is safe, because using the override bp is just an * optimization; and it's rare, so the cost doesn't matter. */ if (zio->io_bp_override) { zio_pop_transforms(zio); zio->io_stage = ZIO_STAGE_OPEN; zio->io_pipeline = ZIO_WRITE_PIPELINE; zio->io_bp_override = NULL; BP_ZERO(bp); ddt_exit(ddt); return (ZIO_PIPELINE_CONTINUE); } dio = zio_write(zio, spa, txg, bp, zio->io_orig_data, zio->io_orig_size, &czp, NULL, NULL, zio_ddt_ditto_write_done, dde, zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL); dde->dde_lead_zio[DDT_PHYS_DITTO] = dio; } if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) { if (ddp->ddp_phys_birth != 0) ddt_bp_fill(ddp, bp, txg); if (dde->dde_lead_zio[p] != NULL) zio_add_child(zio, dde->dde_lead_zio[p]); else ddt_phys_addref(ddp); } else if (zio->io_bp_override) { ASSERT(bp->blk_birth == txg); ASSERT(BP_EQUAL(bp, zio->io_bp_override)); ddt_phys_fill(ddp, bp); ddt_phys_addref(ddp); } else { cio = zio_write(zio, spa, txg, bp, zio->io_orig_data, zio->io_orig_size, zp, zio_ddt_child_write_ready, NULL, zio_ddt_child_write_done, dde, zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL); dde->dde_lead_zio[p] = cio; } ddt_exit(ddt); if (cio) zio_nowait(cio); if (dio) zio_nowait(dio); return (ZIO_PIPELINE_CONTINUE); } ddt_entry_t *freedde; /* for debugging */ static int zio_ddt_free(zio_t *zio) { spa_t *spa = zio->io_spa; blkptr_t *bp = zio->io_bp; ddt_t *ddt = ddt_select(spa, bp); ddt_entry_t *dde; ddt_phys_t *ddp; ASSERT(BP_GET_DEDUP(bp)); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ddt_enter(ddt); freedde = dde = ddt_lookup(ddt, bp, B_TRUE); ddp = ddt_phys_select(dde, bp); ddt_phys_decref(ddp); ddt_exit(ddt); return (ZIO_PIPELINE_CONTINUE); } /* * ========================================================================== * Allocate and free blocks * ========================================================================== */ static int zio_dva_allocate(zio_t *zio) { spa_t *spa = zio->io_spa; metaslab_class_t *mc = spa_normal_class(spa); blkptr_t *bp = zio->io_bp; int error; int flags = 0; if (zio->io_gang_leader == NULL) { ASSERT(zio->io_child_type > ZIO_CHILD_GANG); zio->io_gang_leader = zio; } ASSERT(BP_IS_HOLE(bp)); ASSERT0(BP_GET_NDVAS(bp)); ASSERT3U(zio->io_prop.zp_copies, >, 0); ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa)); ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); /* * The dump device does not support gang blocks so allocation on * behalf of the dump device (i.e. ZIO_FLAG_NODATA) must avoid * the "fast" gang feature. */ flags |= (zio->io_flags & ZIO_FLAG_NODATA) ? METASLAB_GANG_AVOID : 0; flags |= (zio->io_flags & ZIO_FLAG_GANG_CHILD) ? METASLAB_GANG_CHILD : 0; error = metaslab_alloc(spa, mc, zio->io_size, bp, zio->io_prop.zp_copies, zio->io_txg, NULL, flags); if (error) { spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, " "size %llu, error %d", spa_name(spa), zio, zio->io_size, error); if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) return (zio_write_gang_block(zio)); zio->io_error = error; } return (ZIO_PIPELINE_CONTINUE); } static int zio_dva_free(zio_t *zio) { metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE); return (ZIO_PIPELINE_CONTINUE); } static int zio_dva_claim(zio_t *zio) { int error; error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg); if (error) zio->io_error = error; return (ZIO_PIPELINE_CONTINUE); } /* * Undo an allocation. This is used by zio_done() when an I/O fails * and we want to give back the block we just allocated. * This handles both normal blocks and gang blocks. */ static void zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp) { ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp)); ASSERT(zio->io_bp_override == NULL); if (!BP_IS_HOLE(bp)) metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE); if (gn != NULL) { for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { zio_dva_unallocate(zio, gn->gn_child[g], &gn->gn_gbh->zg_blkptr[g]); } } } /* * Try to allocate an intent log block. Return 0 on success, errno on failure. */ int zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp, uint64_t size, boolean_t use_slog) { int error = 1; ASSERT(txg > spa_syncing_txg(spa)); /* * ZIL blocks are always contiguous (i.e. not gang blocks) so we * set the METASLAB_GANG_AVOID flag so that they don't "fast gang" * when allocating them. */ if (use_slog) { error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID | METASLAB_GANG_AVOID); } if (error) { error = metaslab_alloc(spa, spa_normal_class(spa), size, new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID); } if (error == 0) { BP_SET_LSIZE(new_bp, size); BP_SET_PSIZE(new_bp, size); BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF); BP_SET_CHECKSUM(new_bp, spa_version(spa) >= SPA_VERSION_SLIM_ZIL ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG); BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); BP_SET_LEVEL(new_bp, 0); BP_SET_DEDUP(new_bp, 0); BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER); } return (error); } /* * Free an intent log block. */ void zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp) { ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG); ASSERT(!BP_IS_GANG(bp)); zio_free(spa, txg, bp); } /* * ========================================================================== * Read, write and delete to physical devices * ========================================================================== */ /* * Issue an I/O to the underlying vdev. Typically the issue pipeline * stops after this stage and will resume upon I/O completion. * However, there are instances where the vdev layer may need to * continue the pipeline when an I/O was not issued. Since the I/O * that was sent to the vdev layer might be different than the one * currently active in the pipeline (see vdev_queue_io()), we explicitly * force the underlying vdev layers to call either zio_execute() or * zio_interrupt() to ensure that the pipeline continues with the correct I/O. */ static int zio_vdev_io_start(zio_t *zio) { vdev_t *vd = zio->io_vd; uint64_t align; spa_t *spa = zio->io_spa; int ret; ASSERT(zio->io_error == 0); ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0); if (vd == NULL) { if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) spa_config_enter(spa, SCL_ZIO, zio, RW_READER); /* * The mirror_ops handle multiple DVAs in a single BP. */ vdev_mirror_ops.vdev_op_io_start(zio); return (ZIO_PIPELINE_STOP); } if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_FREE && zio->io_priority == ZIO_PRIORITY_NOW) { trim_map_free(vd, zio->io_offset, zio->io_size, zio->io_txg); return (ZIO_PIPELINE_CONTINUE); } /* * We keep track of time-sensitive I/Os so that the scan thread * can quickly react to certain workloads. In particular, we care * about non-scrubbing, top-level reads and writes with the following * characteristics: * - synchronous writes of user data to non-slog devices * - any reads of user data * When these conditions are met, adjust the timestamp of spa_last_io * which allows the scan thread to adjust its workload accordingly. */ if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL && vd == vd->vdev_top && !vd->vdev_islog && zio->io_bookmark.zb_objset != DMU_META_OBJSET && zio->io_txg != spa_syncing_txg(spa)) { uint64_t old = spa->spa_last_io; uint64_t new = ddi_get_lbolt64(); if (old != new) (void) atomic_cas_64(&spa->spa_last_io, old, new); } align = 1ULL << vd->vdev_top->vdev_ashift; if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && P2PHASE(zio->io_size, align) != 0) { /* Transform logical writes to be a full physical block size. */ uint64_t asize = P2ROUNDUP(zio->io_size, align); char *abuf = NULL; if (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE) abuf = zio_buf_alloc(asize); ASSERT(vd == vd->vdev_top); if (zio->io_type == ZIO_TYPE_WRITE) { bcopy(zio->io_data, abuf, zio->io_size); bzero(abuf + zio->io_size, asize - zio->io_size); } zio_push_transform(zio, abuf, asize, abuf ? asize : 0, zio_subblock); } /* * If this is not a physical io, make sure that it is properly aligned * before proceeding. */ if (!(zio->io_flags & ZIO_FLAG_PHYSICAL)) { ASSERT0(P2PHASE(zio->io_offset, align)); ASSERT0(P2PHASE(zio->io_size, align)); } else { /* * For physical writes, we allow 512b aligned writes and assume * the device will perform a read-modify-write as necessary. */ ASSERT0(P2PHASE(zio->io_offset, SPA_MINBLOCKSIZE)); ASSERT0(P2PHASE(zio->io_size, SPA_MINBLOCKSIZE)); } VERIFY(zio->io_type == ZIO_TYPE_READ || spa_writeable(spa)); /* * If this is a repair I/O, and there's no self-healing involved -- * that is, we're just resilvering what we expect to resilver -- * then don't do the I/O unless zio's txg is actually in vd's DTL. * This prevents spurious resilvering with nested replication. * For example, given a mirror of mirrors, (A+B)+(C+D), if only * A is out of date, we'll read from C+D, then use the data to * resilver A+B -- but we don't actually want to resilver B, just A. * The top-level mirror has no way to know this, so instead we just * discard unnecessary repairs as we work our way down the vdev tree. * The same logic applies to any form of nested replication: * ditto + mirror, RAID-Z + replacing, etc. This covers them all. */ if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) && !(zio->io_flags & ZIO_FLAG_SELF_HEAL) && zio->io_txg != 0 && /* not a delegated i/o */ !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) { ASSERT(zio->io_type == ZIO_TYPE_WRITE); zio_vdev_io_bypass(zio); return (ZIO_PIPELINE_CONTINUE); } if (vd->vdev_ops->vdev_op_leaf) { switch (zio->io_type) { case ZIO_TYPE_READ: if (vdev_cache_read(zio)) return (ZIO_PIPELINE_CONTINUE); /* FALLTHROUGH */ case ZIO_TYPE_WRITE: case ZIO_TYPE_FREE: if ((zio = vdev_queue_io(zio)) == NULL) return (ZIO_PIPELINE_STOP); if (!vdev_accessible(vd, zio)) { zio->io_error = SET_ERROR(ENXIO); zio_interrupt(zio); return (ZIO_PIPELINE_STOP); } break; } /* * Note that we ignore repair writes for TRIM because they can * conflict with normal writes. This isn't an issue because, by * definition, we only repair blocks that aren't freed. */ if (zio->io_type == ZIO_TYPE_WRITE && !(zio->io_flags & ZIO_FLAG_IO_REPAIR) && !trim_map_write_start(zio)) return (ZIO_PIPELINE_STOP); } vd->vdev_ops->vdev_op_io_start(zio); return (ZIO_PIPELINE_STOP); } static int zio_vdev_io_done(zio_t *zio) { vdev_t *vd = zio->io_vd; vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops; boolean_t unexpected_error = B_FALSE; if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) return (ZIO_PIPELINE_STOP); ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_FREE); if (vd != NULL && vd->vdev_ops->vdev_op_leaf && (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_FREE)) { if (zio->io_type == ZIO_TYPE_WRITE && !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) trim_map_write_done(zio); vdev_queue_io_done(zio); if (zio->io_type == ZIO_TYPE_WRITE) vdev_cache_write(zio); if (zio_injection_enabled && zio->io_error == 0) zio->io_error = zio_handle_device_injection(vd, zio, EIO); if (zio_injection_enabled && zio->io_error == 0) zio->io_error = zio_handle_label_injection(zio, EIO); if (zio->io_error) { if (zio->io_error == ENOTSUP && zio->io_type == ZIO_TYPE_FREE) { /* Not all devices support TRIM. */ } else if (!vdev_accessible(vd, zio)) { zio->io_error = SET_ERROR(ENXIO); } else { unexpected_error = B_TRUE; } } } ops->vdev_op_io_done(zio); if (unexpected_error) VERIFY(vdev_probe(vd, zio) == NULL); return (ZIO_PIPELINE_CONTINUE); } /* * For non-raidz ZIOs, we can just copy aside the bad data read from the * disk, and use that to finish the checksum ereport later. */ static void zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr, const void *good_buf) { /* no processing needed */ zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE); } /*ARGSUSED*/ void zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored) { void *buf = zio_buf_alloc(zio->io_size); bcopy(zio->io_data, buf, zio->io_size); zcr->zcr_cbinfo = zio->io_size; zcr->zcr_cbdata = buf; zcr->zcr_finish = zio_vsd_default_cksum_finish; zcr->zcr_free = zio_buf_free; } static int zio_vdev_io_assess(zio_t *zio) { vdev_t *vd = zio->io_vd; if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) return (ZIO_PIPELINE_STOP); if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) spa_config_exit(zio->io_spa, SCL_ZIO, zio); if (zio->io_vsd != NULL) { zio->io_vsd_ops->vsd_free(zio); zio->io_vsd = NULL; } if (zio_injection_enabled && zio->io_error == 0) zio->io_error = zio_handle_fault_injection(zio, EIO); if (zio->io_type == ZIO_TYPE_FREE && zio->io_priority != ZIO_PRIORITY_NOW) { switch (zio->io_error) { case 0: ZIO_TRIM_STAT_INCR(bytes, zio->io_size); ZIO_TRIM_STAT_BUMP(success); break; case EOPNOTSUPP: ZIO_TRIM_STAT_BUMP(unsupported); break; default: ZIO_TRIM_STAT_BUMP(failed); break; } } /* * If the I/O failed, determine whether we should attempt to retry it. * * On retry, we cut in line in the issue queue, since we don't want * compression/checksumming/etc. work to prevent our (cheap) IO reissue. */ if (zio->io_error && vd == NULL && !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) { ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */ ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf */ zio->io_error = 0; zio->io_flags |= ZIO_FLAG_IO_RETRY | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE; zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1; zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, zio_requeue_io_start_cut_in_line); return (ZIO_PIPELINE_STOP); } /* * If we got an error on a leaf device, convert it to ENXIO * if the device is not accessible at all. */ if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf && !vdev_accessible(vd, zio)) zio->io_error = SET_ERROR(ENXIO); /* * If we can't write to an interior vdev (mirror or RAID-Z), * set vdev_cant_write so that we stop trying to allocate from it. */ if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE && vd != NULL && !vd->vdev_ops->vdev_op_leaf) { vd->vdev_cant_write = B_TRUE; } if (zio->io_error) zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; if (vd != NULL && vd->vdev_ops->vdev_op_leaf && zio->io_physdone != NULL) { ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED)); ASSERT(zio->io_child_type == ZIO_CHILD_VDEV); zio->io_physdone(zio->io_logical); } return (ZIO_PIPELINE_CONTINUE); } void zio_vdev_io_reissue(zio_t *zio) { ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); ASSERT(zio->io_error == 0); zio->io_stage >>= 1; } void zio_vdev_io_redone(zio_t *zio) { ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE); zio->io_stage >>= 1; } void zio_vdev_io_bypass(zio_t *zio) { ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); ASSERT(zio->io_error == 0); zio->io_flags |= ZIO_FLAG_IO_BYPASS; zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1; } /* * ========================================================================== * Generate and verify checksums * ========================================================================== */ static int zio_checksum_generate(zio_t *zio) { blkptr_t *bp = zio->io_bp; enum zio_checksum checksum; if (bp == NULL) { /* * This is zio_write_phys(). * We're either generating a label checksum, or none at all. */ checksum = zio->io_prop.zp_checksum; if (checksum == ZIO_CHECKSUM_OFF) return (ZIO_PIPELINE_CONTINUE); ASSERT(checksum == ZIO_CHECKSUM_LABEL); } else { if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) { ASSERT(!IO_IS_ALLOCATING(zio)); checksum = ZIO_CHECKSUM_GANG_HEADER; } else { checksum = BP_GET_CHECKSUM(bp); } } zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size); return (ZIO_PIPELINE_CONTINUE); } static int zio_checksum_verify(zio_t *zio) { zio_bad_cksum_t info; blkptr_t *bp = zio->io_bp; int error; ASSERT(zio->io_vd != NULL); if (bp == NULL) { /* * This is zio_read_phys(). * We're either verifying a label checksum, or nothing at all. */ if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF) return (ZIO_PIPELINE_CONTINUE); ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL); } if ((error = zio_checksum_error(zio, &info)) != 0) { zio->io_error = error; if (error == ECKSUM && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { zfs_ereport_start_checksum(zio->io_spa, zio->io_vd, zio, zio->io_offset, zio->io_size, NULL, &info); } } return (ZIO_PIPELINE_CONTINUE); } /* * Called by RAID-Z to ensure we don't compute the checksum twice. */ void zio_checksum_verified(zio_t *zio) { zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; } /* * ========================================================================== * Error rank. Error are ranked in the order 0, ENXIO, ECKSUM, EIO, other. * An error of 0 indicates success. ENXIO indicates whole-device failure, * which may be transient (e.g. unplugged) or permament. ECKSUM and EIO * indicate errors that are specific to one I/O, and most likely permanent. * Any other error is presumed to be worse because we weren't expecting it. * ========================================================================== */ int zio_worst_error(int e1, int e2) { static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO }; int r1, r2; for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++) if (e1 == zio_error_rank[r1]) break; for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++) if (e2 == zio_error_rank[r2]) break; return (r1 > r2 ? e1 : e2); } /* * ========================================================================== * I/O completion * ========================================================================== */ static int zio_ready(zio_t *zio) { blkptr_t *bp = zio->io_bp; zio_t *pio, *pio_next; if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) || zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY)) return (ZIO_PIPELINE_STOP); if (zio->io_ready) { ASSERT(IO_IS_ALLOCATING(zio)); ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) || (zio->io_flags & ZIO_FLAG_NOPWRITE)); ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0); zio->io_ready(zio); } if (bp != NULL && bp != &zio->io_bp_copy) zio->io_bp_copy = *bp; if (zio->io_error) zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; mutex_enter(&zio->io_lock); zio->io_state[ZIO_WAIT_READY] = 1; pio = zio_walk_parents(zio); mutex_exit(&zio->io_lock); /* * As we notify zio's parents, new parents could be added. * New parents go to the head of zio's io_parent_list, however, * so we will (correctly) not notify them. The remainder of zio's * io_parent_list, from 'pio_next' onward, cannot change because * all parents must wait for us to be done before they can be done. */ for (; pio != NULL; pio = pio_next) { pio_next = zio_walk_parents(zio); zio_notify_parent(pio, zio, ZIO_WAIT_READY); } if (zio->io_flags & ZIO_FLAG_NODATA) { if (BP_IS_GANG(bp)) { zio->io_flags &= ~ZIO_FLAG_NODATA; } else { ASSERT((uintptr_t)zio->io_data < SPA_MAXBLOCKSIZE); zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; } } if (zio_injection_enabled && zio->io_spa->spa_syncing_txg == zio->io_txg) zio_handle_ignored_writes(zio); return (ZIO_PIPELINE_CONTINUE); } static int zio_done(zio_t *zio) { spa_t *spa = zio->io_spa; zio_t *lio = zio->io_logical; blkptr_t *bp = zio->io_bp; vdev_t *vd = zio->io_vd; uint64_t psize = zio->io_size; zio_t *pio, *pio_next; /* * If our children haven't all completed, * wait for them and then repeat this pipeline stage. */ if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) || zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) || zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) || zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE)) return (ZIO_PIPELINE_STOP); for (int c = 0; c < ZIO_CHILD_TYPES; c++) for (int w = 0; w < ZIO_WAIT_TYPES; w++) ASSERT(zio->io_children[c][w] == 0); if (bp != NULL && !BP_IS_EMBEDDED(bp)) { ASSERT(bp->blk_pad[0] == 0); ASSERT(bp->blk_pad[1] == 0); ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 || (bp == zio_unique_parent(zio)->io_bp)); if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) && zio->io_bp_override == NULL && !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { ASSERT(!BP_SHOULD_BYTESWAP(bp)); ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(bp)); ASSERT(BP_COUNT_GANG(bp) == 0 || (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp))); } if (zio->io_flags & ZIO_FLAG_NOPWRITE) VERIFY(BP_EQUAL(bp, &zio->io_bp_orig)); } /* * If there were child vdev/gang/ddt errors, they apply to us now. */ zio_inherit_child_errors(zio, ZIO_CHILD_VDEV); zio_inherit_child_errors(zio, ZIO_CHILD_GANG); zio_inherit_child_errors(zio, ZIO_CHILD_DDT); /* * If the I/O on the transformed data was successful, generate any * checksum reports now while we still have the transformed data. */ if (zio->io_error == 0) { while (zio->io_cksum_report != NULL) { zio_cksum_report_t *zcr = zio->io_cksum_report; uint64_t align = zcr->zcr_align; uint64_t asize = P2ROUNDUP(psize, align); char *abuf = zio->io_data; if (asize != psize) { abuf = zio_buf_alloc(asize); bcopy(zio->io_data, abuf, psize); bzero(abuf + psize, asize - psize); } zio->io_cksum_report = zcr->zcr_next; zcr->zcr_next = NULL; zcr->zcr_finish(zcr, abuf); zfs_ereport_free_checksum(zcr); if (asize != psize) zio_buf_free(abuf, asize); } } zio_pop_transforms(zio); /* note: may set zio->io_error */ vdev_stat_update(zio, psize); if (zio->io_error) { /* * If this I/O is attached to a particular vdev, * generate an error message describing the I/O failure * at the block level. We ignore these errors if the * device is currently unavailable. */ if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd)) zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0); if ((zio->io_error == EIO || !(zio->io_flags & (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) && zio == lio) { /* * For logical I/O requests, tell the SPA to log the * error and generate a logical data ereport. */ spa_log_error(spa, zio); zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio, 0, 0); } } if (zio->io_error && zio == lio) { /* * Determine whether zio should be reexecuted. This will * propagate all the way to the root via zio_notify_parent(). */ ASSERT(vd == NULL && bp != NULL); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); if (IO_IS_ALLOCATING(zio) && !(zio->io_flags & ZIO_FLAG_CANFAIL)) { if (zio->io_error != ENOSPC) zio->io_reexecute |= ZIO_REEXECUTE_NOW; else zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; } if ((zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_FREE) && !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_error == ENXIO && spa_load_state(spa) == SPA_LOAD_NONE && spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE) zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute) zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; /* * Here is a possibly good place to attempt to do * either combinatorial reconstruction or error correction * based on checksums. It also might be a good place * to send out preliminary ereports before we suspend * processing. */ } /* * If there were logical child errors, they apply to us now. * We defer this until now to avoid conflating logical child * errors with errors that happened to the zio itself when * updating vdev stats and reporting FMA events above. */ zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL); if ((zio->io_error || zio->io_reexecute) && IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio && !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE))) zio_dva_unallocate(zio, zio->io_gang_tree, bp); zio_gang_tree_free(&zio->io_gang_tree); /* * Godfather I/Os should never suspend. */ if ((zio->io_flags & ZIO_FLAG_GODFATHER) && (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) zio->io_reexecute = 0; if (zio->io_reexecute) { /* * This is a logical I/O that wants to reexecute. * * Reexecute is top-down. When an i/o fails, if it's not * the root, it simply notifies its parent and sticks around. * The parent, seeing that it still has children in zio_done(), * does the same. This percolates all the way up to the root. * The root i/o will reexecute or suspend the entire tree. * * This approach ensures that zio_reexecute() honors * all the original i/o dependency relationships, e.g. * parents not executing until children are ready. */ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); zio->io_gang_leader = NULL; mutex_enter(&zio->io_lock); zio->io_state[ZIO_WAIT_DONE] = 1; mutex_exit(&zio->io_lock); /* * "The Godfather" I/O monitors its children but is * not a true parent to them. It will track them through * the pipeline but severs its ties whenever they get into * trouble (e.g. suspended). This allows "The Godfather" * I/O to return status without blocking. */ for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) { zio_link_t *zl = zio->io_walk_link; pio_next = zio_walk_parents(zio); if ((pio->io_flags & ZIO_FLAG_GODFATHER) && (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) { zio_remove_child(pio, zio, zl); zio_notify_parent(pio, zio, ZIO_WAIT_DONE); } } if ((pio = zio_unique_parent(zio)) != NULL) { /* * We're not a root i/o, so there's nothing to do * but notify our parent. Don't propagate errors * upward since we haven't permanently failed yet. */ ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE; zio_notify_parent(pio, zio, ZIO_WAIT_DONE); } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) { /* * We'd fail again if we reexecuted now, so suspend * until conditions improve (e.g. device comes online). */ zio_suspend(spa, zio); } else { /* * Reexecution is potentially a huge amount of work. * Hand it off to the otherwise-unused claim taskq. */ #if defined(illumos) || !defined(_KERNEL) ASSERT(zio->io_tqent.tqent_next == NULL); #else ASSERT(zio->io_tqent.tqent_task.ta_pending == 0); #endif spa_taskq_dispatch_ent(spa, ZIO_TYPE_CLAIM, ZIO_TASKQ_ISSUE, (task_func_t *)zio_reexecute, zio, 0, &zio->io_tqent); } return (ZIO_PIPELINE_STOP); } ASSERT(zio->io_child_count == 0); ASSERT(zio->io_reexecute == 0); ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL)); /* * Report any checksum errors, since the I/O is complete. */ while (zio->io_cksum_report != NULL) { zio_cksum_report_t *zcr = zio->io_cksum_report; zio->io_cksum_report = zcr->zcr_next; zcr->zcr_next = NULL; zcr->zcr_finish(zcr, NULL); zfs_ereport_free_checksum(zcr); } /* * It is the responsibility of the done callback to ensure that this * particular zio is no longer discoverable for adoption, and as * such, cannot acquire any new parents. */ if (zio->io_done) zio->io_done(zio); mutex_enter(&zio->io_lock); zio->io_state[ZIO_WAIT_DONE] = 1; mutex_exit(&zio->io_lock); for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) { zio_link_t *zl = zio->io_walk_link; pio_next = zio_walk_parents(zio); zio_remove_child(pio, zio, zl); zio_notify_parent(pio, zio, ZIO_WAIT_DONE); } if (zio->io_waiter != NULL) { mutex_enter(&zio->io_lock); zio->io_executor = NULL; cv_broadcast(&zio->io_cv); mutex_exit(&zio->io_lock); } else { zio_destroy(zio); } return (ZIO_PIPELINE_STOP); } /* * ========================================================================== * I/O pipeline definition * ========================================================================== */ static zio_pipe_stage_t *zio_pipeline[] = { NULL, zio_read_bp_init, zio_free_bp_init, zio_issue_async, zio_write_bp_init, zio_checksum_generate, zio_nop_write, zio_ddt_read_start, zio_ddt_read_done, zio_ddt_write, zio_ddt_free, zio_gang_assemble, zio_gang_issue, zio_dva_allocate, zio_dva_free, zio_dva_claim, zio_ready, zio_vdev_io_start, zio_vdev_io_done, zio_vdev_io_assess, zio_checksum_verify, zio_done }; /* * Compare two zbookmark_phys_t's to see which we would reach first in a * pre-order traversal of the object tree. * * This is simple in every case aside from the meta-dnode object. For all other * objects, we traverse them in order (object 1 before object 2, and so on). * However, all of these objects are traversed while traversing object 0, since * the data it points to is the list of objects. Thus, we need to convert to a * canonical representation so we can compare meta-dnode bookmarks to * non-meta-dnode bookmarks. * * We do this by calculating "equivalents" for each field of the zbookmark. * zbookmarks outside of the meta-dnode use their own object and level, and * calculate the level 0 equivalent (the first L0 blkid that is contained in the * blocks this bookmark refers to) by multiplying their blkid by their span * (the number of L0 blocks contained within one block at their level). * zbookmarks inside the meta-dnode calculate their object equivalent * (which is L0equiv * dnodes per data block), use 0 for their L0equiv, and use * level + 1<<31 (any value larger than a level could ever be) for their level. * This causes them to always compare before a bookmark in their object * equivalent, compare appropriately to bookmarks in other objects, and to * compare appropriately to other bookmarks in the meta-dnode. */ int zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, uint8_t ibs2, const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2) { /* * These variables represent the "equivalent" values for the zbookmark, * after converting zbookmarks inside the meta dnode to their * normal-object equivalents. */ uint64_t zb1obj, zb2obj; uint64_t zb1L0, zb2L0; uint64_t zb1level, zb2level; if (zb1->zb_object == zb2->zb_object && zb1->zb_level == zb2->zb_level && zb1->zb_blkid == zb2->zb_blkid) return (0); /* * BP_SPANB calculates the span in blocks. */ zb1L0 = (zb1->zb_blkid) * BP_SPANB(ibs1, zb1->zb_level); zb2L0 = (zb2->zb_blkid) * BP_SPANB(ibs2, zb2->zb_level); if (zb1->zb_object == DMU_META_DNODE_OBJECT) { zb1obj = zb1L0 * (dbss1 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT)); zb1L0 = 0; zb1level = zb1->zb_level + COMPARE_META_LEVEL; } else { zb1obj = zb1->zb_object; zb1level = zb1->zb_level; } if (zb2->zb_object == DMU_META_DNODE_OBJECT) { zb2obj = zb2L0 * (dbss2 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT)); zb2L0 = 0; zb2level = zb2->zb_level + COMPARE_META_LEVEL; } else { zb2obj = zb2->zb_object; zb2level = zb2->zb_level; } /* Now that we have a canonical representation, do the comparison. */ if (zb1obj != zb2obj) return (zb1obj < zb2obj ? -1 : 1); else if (zb1L0 != zb2L0) return (zb1L0 < zb2L0 ? -1 : 1); else if (zb1level != zb2level) return (zb1level > zb2level ? -1 : 1); /* * This can (theoretically) happen if the bookmarks have the same object * and level, but different blkids, if the block sizes are not the same. * There is presently no way to change the indirect block sizes */ return (0); } /* * This function checks the following: given that last_block is the place that * our traversal stopped last time, does that guarantee that we've visited * every node under subtree_root? Therefore, we can't just use the raw output * of zbookmark_compare. We have to pass in a modified version of * subtree_root; by incrementing the block id, and then checking whether * last_block is before or equal to that, we can tell whether or not having * visited last_block implies that all of subtree_root's children have been * visited. */ boolean_t zbookmark_subtree_completed(const dnode_phys_t *dnp, const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block) { zbookmark_phys_t mod_zb = *subtree_root; mod_zb.zb_blkid++; ASSERT(last_block->zb_level == 0); /* The objset_phys_t isn't before anything. */ if (dnp == NULL) return (B_FALSE); /* * We pass in 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT) for the * data block size in sectors, because that variable is only used if * the bookmark refers to a block in the meta-dnode. Since we don't * know without examining it what object it refers to, and there's no * harm in passing in this value in other cases, we always pass it in. * * We pass in 0 for the indirect block size shift because zb2 must be * level 0. The indirect block size is only used to calculate the span * of the bookmark, but since the bookmark must be level 0, the span is * always 1, so the math works out. * * If you make changes to how the zbookmark_compare code works, be sure * to make sure that this code still works afterwards. */ return (zbookmark_compare(dnp->dn_datablkszsec, dnp->dn_indblkshift, 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT), 0, &mod_zb, last_block) <= 0); } Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c (revision 296509) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c (revision 296510) @@ -1,528 +1,755 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. */ /* * ZFS fault injection * * To handle fault injection, we keep track of a series of zinject_record_t * structures which describe which logical block(s) should be injected with a * fault. These are kept in a global list. Each record corresponds to a given * spa_t and maintains a special hold on the spa_t so that it cannot be deleted * or exported while the injection record exists. * * Device level injection is done using the 'zi_guid' field. If this is set, it * means that the error is destined for a particular device, not a piece of * data. * * This is a rather poor data structure and algorithm, but we don't expect more * than a few faults at any one time, so it should be sufficient for our needs. */ #include #include #include #include #include #include uint32_t zio_injection_enabled; +/* + * Data describing each zinject handler registered on the system, and + * contains the list node linking the handler in the global zinject + * handler list. + */ typedef struct inject_handler { int zi_id; spa_t *zi_spa; zinject_record_t zi_record; + uint64_t *zi_lanes; + int zi_next_lane; list_node_t zi_link; } inject_handler_t; +/* + * List of all zinject handlers registered on the system, protected by + * the inject_lock defined below. + */ static list_t inject_handlers; + +/* + * This protects insertion into, and traversal of, the inject handler + * list defined above; as well as the inject_delay_count. Any time a + * handler is inserted or removed from the list, this lock should be + * taken as a RW_WRITER; and any time traversal is done over the list + * (without modification to it) this lock should be taken as a RW_READER. + */ static krwlock_t inject_lock; + +/* + * This holds the number of zinject delay handlers that have been + * registered on the system. It is protected by the inject_lock defined + * above. Thus modifications to this count must be a RW_WRITER of the + * inject_lock, and reads of this count must be (at least) a RW_READER + * of the lock. + */ +static int inject_delay_count = 0; + +/* + * This lock is used only in zio_handle_io_delay(), refer to the comment + * in that function for more details. + */ +static kmutex_t inject_delay_mtx; + +/* + * Used to assign unique identifying numbers to each new zinject handler. + */ static int inject_next_id = 1; /* * Returns true if the given record matches the I/O in progress. */ static boolean_t zio_match_handler(zbookmark_phys_t *zb, uint64_t type, zinject_record_t *record, int error) { /* * Check for a match against the MOS, which is based on type */ if (zb->zb_objset == DMU_META_OBJSET && record->zi_objset == DMU_META_OBJSET && record->zi_object == DMU_META_DNODE_OBJECT) { if (record->zi_type == DMU_OT_NONE || type == record->zi_type) return (record->zi_freq == 0 || spa_get_random(100) < record->zi_freq); else return (B_FALSE); } /* * Check for an exact match. */ if (zb->zb_objset == record->zi_objset && zb->zb_object == record->zi_object && zb->zb_level == record->zi_level && zb->zb_blkid >= record->zi_start && zb->zb_blkid <= record->zi_end && error == record->zi_error) return (record->zi_freq == 0 || spa_get_random(100) < record->zi_freq); return (B_FALSE); } /* * Panic the system when a config change happens in the function * specified by tag. */ void zio_handle_panic_injection(spa_t *spa, char *tag, uint64_t type) { inject_handler_t *handler; rw_enter(&inject_lock, RW_READER); for (handler = list_head(&inject_handlers); handler != NULL; handler = list_next(&inject_handlers, handler)) { if (spa != handler->zi_spa) continue; if (handler->zi_record.zi_type == type && strcmp(tag, handler->zi_record.zi_func) == 0) panic("Panic requested in function %s\n", tag); } rw_exit(&inject_lock); } /* * Determine if the I/O in question should return failure. Returns the errno * to be returned to the caller. */ int zio_handle_fault_injection(zio_t *zio, int error) { int ret = 0; inject_handler_t *handler; /* * Ignore I/O not associated with any logical data. */ if (zio->io_logical == NULL) return (0); /* * Currently, we only support fault injection on reads. */ if (zio->io_type != ZIO_TYPE_READ) return (0); rw_enter(&inject_lock, RW_READER); for (handler = list_head(&inject_handlers); handler != NULL; handler = list_next(&inject_handlers, handler)) { if (zio->io_spa != handler->zi_spa || handler->zi_record.zi_cmd != ZINJECT_DATA_FAULT) continue; /* If this handler matches, return EIO */ if (zio_match_handler(&zio->io_logical->io_bookmark, zio->io_bp ? BP_GET_TYPE(zio->io_bp) : DMU_OT_NONE, &handler->zi_record, error)) { ret = error; break; } } rw_exit(&inject_lock); return (ret); } /* * Determine if the zio is part of a label update and has an injection * handler associated with that portion of the label. Currently, we * allow error injection in either the nvlist or the uberblock region of * of the vdev label. */ int zio_handle_label_injection(zio_t *zio, int error) { inject_handler_t *handler; vdev_t *vd = zio->io_vd; uint64_t offset = zio->io_offset; int label; int ret = 0; if (offset >= VDEV_LABEL_START_SIZE && offset < vd->vdev_psize - VDEV_LABEL_END_SIZE) return (0); rw_enter(&inject_lock, RW_READER); for (handler = list_head(&inject_handlers); handler != NULL; handler = list_next(&inject_handlers, handler)) { uint64_t start = handler->zi_record.zi_start; uint64_t end = handler->zi_record.zi_end; if (handler->zi_record.zi_cmd != ZINJECT_LABEL_FAULT) continue; /* * The injection region is the relative offsets within a * vdev label. We must determine the label which is being * updated and adjust our region accordingly. */ label = vdev_label_number(vd->vdev_psize, offset); start = vdev_label_offset(vd->vdev_psize, label, start); end = vdev_label_offset(vd->vdev_psize, label, end); if (zio->io_vd->vdev_guid == handler->zi_record.zi_guid && (offset >= start && offset <= end)) { ret = error; break; } } rw_exit(&inject_lock); return (ret); } int zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error) { inject_handler_t *handler; int ret = 0; /* * We skip over faults in the labels unless it's during * device open (i.e. zio == NULL). */ if (zio != NULL) { uint64_t offset = zio->io_offset; if (offset < VDEV_LABEL_START_SIZE || offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE) return (0); } rw_enter(&inject_lock, RW_READER); for (handler = list_head(&inject_handlers); handler != NULL; handler = list_next(&inject_handlers, handler)) { if (handler->zi_record.zi_cmd != ZINJECT_DEVICE_FAULT) continue; if (vd->vdev_guid == handler->zi_record.zi_guid) { if (handler->zi_record.zi_failfast && (zio == NULL || (zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))) { continue; } /* Handle type specific I/O failures */ if (zio != NULL && handler->zi_record.zi_iotype != ZIO_TYPES && handler->zi_record.zi_iotype != zio->io_type) continue; if (handler->zi_record.zi_error == error) { /* * For a failed open, pretend like the device * has gone away. */ if (error == ENXIO) vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; /* * Treat these errors as if they had been * retried so that all the appropriate stats * and FMA events are generated. */ if (!handler->zi_record.zi_failfast && zio != NULL) zio->io_flags |= ZIO_FLAG_IO_RETRY; ret = error; break; } if (handler->zi_record.zi_error == ENXIO) { ret = SET_ERROR(EIO); break; } } } rw_exit(&inject_lock); return (ret); } /* * Simulate hardware that ignores cache flushes. For requested number * of seconds nix the actual writing to disk. */ void zio_handle_ignored_writes(zio_t *zio) { inject_handler_t *handler; rw_enter(&inject_lock, RW_READER); for (handler = list_head(&inject_handlers); handler != NULL; handler = list_next(&inject_handlers, handler)) { /* Ignore errors not destined for this pool */ if (zio->io_spa != handler->zi_spa || handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES) continue; /* * Positive duration implies # of seconds, negative * a number of txgs */ if (handler->zi_record.zi_timer == 0) { if (handler->zi_record.zi_duration > 0) handler->zi_record.zi_timer = ddi_get_lbolt64(); else handler->zi_record.zi_timer = zio->io_txg; } /* Have a "problem" writing 60% of the time */ if (spa_get_random(100) < 60) zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; break; } rw_exit(&inject_lock); } void spa_handle_ignored_writes(spa_t *spa) { inject_handler_t *handler; if (zio_injection_enabled == 0) return; rw_enter(&inject_lock, RW_READER); for (handler = list_head(&inject_handlers); handler != NULL; handler = list_next(&inject_handlers, handler)) { if (spa != handler->zi_spa || handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES) continue; if (handler->zi_record.zi_duration > 0) { VERIFY(handler->zi_record.zi_timer == 0 || handler->zi_record.zi_timer + handler->zi_record.zi_duration * hz > ddi_get_lbolt64()); } else { /* duration is negative so the subtraction here adds */ VERIFY(handler->zi_record.zi_timer == 0 || handler->zi_record.zi_timer - handler->zi_record.zi_duration >= spa_syncing_txg(spa)); } } rw_exit(&inject_lock); } -uint64_t +hrtime_t zio_handle_io_delay(zio_t *zio) { vdev_t *vd = zio->io_vd; - inject_handler_t *handler; - uint64_t seconds = 0; + inject_handler_t *min_handler = NULL; + hrtime_t min_target = 0; - if (zio_injection_enabled == 0) - return (0); - rw_enter(&inject_lock, RW_READER); - for (handler = list_head(&inject_handlers); handler != NULL; - handler = list_next(&inject_handlers, handler)) { + /* + * inject_delay_count is a subset of zio_injection_enabled that + * is only incremented for delay handlers. These checks are + * mainly added to remind the reader why we're not explicitly + * checking zio_injection_enabled like the other functions. + */ + IMPLY(inject_delay_count > 0, zio_injection_enabled > 0); + IMPLY(zio_injection_enabled == 0, inject_delay_count == 0); + /* + * If there aren't any inject delay handlers registered, then we + * can short circuit and simply return 0 here. A value of zero + * informs zio_delay_interrupt() that this request should not be + * delayed. This short circuit keeps us from acquiring the + * inject_delay_mutex unnecessarily. + */ + if (inject_delay_count == 0) { + rw_exit(&inject_lock); + return (0); + } + + /* + * Each inject handler has a number of "lanes" associated with + * it. Each lane is able to handle requests independently of one + * another, and at a latency defined by the inject handler + * record's zi_timer field. Thus if a handler in configured with + * a single lane with a 10ms latency, it will delay requests + * such that only a single request is completed every 10ms. So, + * if more than one request is attempted per each 10ms interval, + * the average latency of the requests will be greater than + * 10ms; but if only a single request is submitted each 10ms + * interval the average latency will be 10ms. + * + * We need to acquire this mutex to prevent multiple concurrent + * threads being assigned to the same lane of a given inject + * handler. The mutex allows us to perform the following two + * operations atomically: + * + * 1. determine the minimum handler and minimum target + * value of all the possible handlers + * 2. update that minimum handler's lane array + * + * Without atomicity, two (or more) threads could pick the same + * lane in step (1), and then conflict with each other in step + * (2). This could allow a single lane handler to process + * multiple requests simultaneously, which shouldn't be possible. + */ + mutex_enter(&inject_delay_mtx); + + for (inject_handler_t *handler = list_head(&inject_handlers); + handler != NULL; handler = list_next(&inject_handlers, handler)) { if (handler->zi_record.zi_cmd != ZINJECT_DELAY_IO) continue; - if (vd->vdev_guid == handler->zi_record.zi_guid) { - seconds = handler->zi_record.zi_timer; - break; + if (vd->vdev_guid != handler->zi_record.zi_guid) + continue; + + /* + * Defensive; should never happen as the array allocation + * occurs prior to inserting this handler on the list. + */ + ASSERT3P(handler->zi_lanes, !=, NULL); + + /* + * This should never happen, the zinject command should + * prevent a user from setting an IO delay with zero lanes. + */ + ASSERT3U(handler->zi_record.zi_nlanes, !=, 0); + + ASSERT3U(handler->zi_record.zi_nlanes, >, + handler->zi_next_lane); + + /* + * We want to issue this IO to the lane that will become + * idle the soonest, so we compare the soonest this + * specific handler can complete the IO with all other + * handlers, to find the lowest value of all possible + * lanes. We then use this lane to submit the request. + * + * Since each handler has a constant value for its + * delay, we can just use the "next" lane for that + * handler; as it will always be the lane with the + * lowest value for that particular handler (i.e. the + * lane that will become idle the soonest). This saves a + * scan of each handler's lanes array. + * + * There's two cases to consider when determining when + * this specific IO request should complete. If this + * lane is idle, we want to "submit" the request now so + * it will complete after zi_timer milliseconds. Thus, + * we set the target to now + zi_timer. + * + * If the lane is busy, we want this request to complete + * zi_timer milliseconds after the lane becomes idle. + * Since the 'zi_lanes' array holds the time at which + * each lane will become idle, we use that value to + * determine when this request should complete. + */ + hrtime_t idle = handler->zi_record.zi_timer + gethrtime(); + hrtime_t busy = handler->zi_record.zi_timer + + handler->zi_lanes[handler->zi_next_lane]; + hrtime_t target = MAX(idle, busy); + + if (min_handler == NULL) { + min_handler = handler; + min_target = target; + continue; } + ASSERT3P(min_handler, !=, NULL); + ASSERT3U(min_target, !=, 0); + + /* + * We don't yet increment the "next lane" variable since + * we still might find a lower value lane in another + * handler during any remaining iterations. Once we're + * sure we've selected the absolute minimum, we'll claim + * the lane and increment the handler's "next lane" + * field below. + */ + + if (target < min_target) { + min_handler = handler; + min_target = target; + } } + + /* + * 'min_handler' will be NULL if no IO delays are registered for + * this vdev, otherwise it will point to the handler containing + * the lane that will become idle the soonest. + */ + if (min_handler != NULL) { + ASSERT3U(min_target, !=, 0); + min_handler->zi_lanes[min_handler->zi_next_lane] = min_target; + + /* + * If we've used all possible lanes for this handler, + * loop back and start using the first lane again; + * otherwise, just increment the lane index. + */ + min_handler->zi_next_lane = (min_handler->zi_next_lane + 1) % + min_handler->zi_record.zi_nlanes; + } + + mutex_exit(&inject_delay_mtx); rw_exit(&inject_lock); - return (seconds); + + return (min_target); } /* * Create a new handler for the given record. We add it to the list, adding * a reference to the spa_t in the process. We increment zio_injection_enabled, * which is the switch to trigger all fault injection. */ int zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record) { inject_handler_t *handler; int error; spa_t *spa; /* * If this is pool-wide metadata, make sure we unload the corresponding * spa_t, so that the next attempt to load it will trigger the fault. * We call spa_reset() to unload the pool appropriately. */ if (flags & ZINJECT_UNLOAD_SPA) if ((error = spa_reset(name)) != 0) return (error); + if (record->zi_cmd == ZINJECT_DELAY_IO) { + /* + * A value of zero for the number of lanes or for the + * delay time doesn't make sense. + */ + if (record->zi_timer == 0 || record->zi_nlanes == 0) + return (SET_ERROR(EINVAL)); + + /* + * The number of lanes is directly mapped to the size of + * an array used by the handler. Thus, to ensure the + * user doesn't trigger an allocation that's "too large" + * we cap the number of lanes here. + */ + if (record->zi_nlanes >= UINT16_MAX) + return (SET_ERROR(EINVAL)); + } + if (!(flags & ZINJECT_NULL)) { /* * spa_inject_ref() will add an injection reference, which will * prevent the pool from being removed from the namespace while * still allowing it to be unloaded. */ if ((spa = spa_inject_addref(name)) == NULL) return (SET_ERROR(ENOENT)); handler = kmem_alloc(sizeof (inject_handler_t), KM_SLEEP); + handler->zi_spa = spa; + handler->zi_record = *record; + + if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) { + handler->zi_lanes = kmem_zalloc( + sizeof (*handler->zi_lanes) * + handler->zi_record.zi_nlanes, KM_SLEEP); + handler->zi_next_lane = 0; + } else { + handler->zi_lanes = NULL; + handler->zi_next_lane = 0; + } + rw_enter(&inject_lock, RW_WRITER); + /* + * We can't move this increment into the conditional + * above because we need to hold the RW_WRITER lock of + * inject_lock, and we don't want to hold that while + * allocating the handler's zi_lanes array. + */ + if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) { + ASSERT3S(inject_delay_count, >=, 0); + inject_delay_count++; + ASSERT3S(inject_delay_count, >, 0); + } + *id = handler->zi_id = inject_next_id++; - handler->zi_spa = spa; - handler->zi_record = *record; list_insert_tail(&inject_handlers, handler); atomic_inc_32(&zio_injection_enabled); rw_exit(&inject_lock); } /* * Flush the ARC, so that any attempts to read this data will end up * going to the ZIO layer. Note that this is a little overkill, but * we don't have the necessary ARC interfaces to do anything else, and * fault injection isn't a performance critical path. */ if (flags & ZINJECT_FLUSH_ARC) /* * We must use FALSE to ensure arc_flush returns, since * we're not preventing concurrent ARC insertions. */ arc_flush(NULL, FALSE); return (0); } /* * Returns the next record with an ID greater than that supplied to the * function. Used to iterate over all handlers in the system. */ int zio_inject_list_next(int *id, char *name, size_t buflen, zinject_record_t *record) { inject_handler_t *handler; int ret; mutex_enter(&spa_namespace_lock); rw_enter(&inject_lock, RW_READER); for (handler = list_head(&inject_handlers); handler != NULL; handler = list_next(&inject_handlers, handler)) if (handler->zi_id > *id) break; if (handler) { *record = handler->zi_record; *id = handler->zi_id; (void) strncpy(name, spa_name(handler->zi_spa), buflen); ret = 0; } else { ret = SET_ERROR(ENOENT); } rw_exit(&inject_lock); mutex_exit(&spa_namespace_lock); return (ret); } /* * Clear the fault handler with the given identifier, or return ENOENT if none * exists. */ int zio_clear_fault(int id) { inject_handler_t *handler; rw_enter(&inject_lock, RW_WRITER); for (handler = list_head(&inject_handlers); handler != NULL; handler = list_next(&inject_handlers, handler)) if (handler->zi_id == id) break; if (handler == NULL) { rw_exit(&inject_lock); return (SET_ERROR(ENOENT)); } + if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) { + ASSERT3S(inject_delay_count, >, 0); + inject_delay_count--; + ASSERT3S(inject_delay_count, >=, 0); + } + list_remove(&inject_handlers, handler); rw_exit(&inject_lock); + if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) { + ASSERT3P(handler->zi_lanes, !=, NULL); + kmem_free(handler->zi_lanes, sizeof (*handler->zi_lanes) * + handler->zi_record.zi_nlanes); + } else { + ASSERT3P(handler->zi_lanes, ==, NULL); + } + spa_inject_delref(handler->zi_spa); kmem_free(handler, sizeof (inject_handler_t)); atomic_dec_32(&zio_injection_enabled); return (0); } void zio_inject_init(void) { rw_init(&inject_lock, NULL, RW_DEFAULT, NULL); + mutex_init(&inject_delay_mtx, NULL, MUTEX_DEFAULT, NULL); list_create(&inject_handlers, sizeof (inject_handler_t), offsetof(inject_handler_t, zi_link)); } void zio_inject_fini(void) { list_destroy(&inject_handlers); + mutex_destroy(&inject_delay_mtx); rw_destroy(&inject_lock); } Index: head/sys/cddl/contrib/opensolaris =================================================================== --- head/sys/cddl/contrib/opensolaris (revision 296509) +++ head/sys/cddl/contrib/opensolaris (revision 296510) Property changes on: head/sys/cddl/contrib/opensolaris ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /vendor-sys/illumos/dist:r296505