Index: stable/10/cddl/contrib/opensolaris/cmd/zinject/zinject.c =================================================================== --- stable/10/cddl/contrib/opensolaris/cmd/zinject/zinject.c (revision 297107) +++ stable/10/cddl/contrib/opensolaris/cmd/zinject/zinject.c (revision 297108) @@ -1,988 +1,1093 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. */ /* * ZFS Fault Injector * * This userland component takes a set of options and uses libzpool to translate * from a user-visible object type and name to an internal representation. * There are two basic types of faults: device faults and data faults. * * * DEVICE FAULTS * * Errors can be injected into a particular vdev using the '-d' option. This * option takes a path or vdev GUID to uniquely identify the device within a * pool. There are two types of errors that can be injected, EIO and ENXIO, * that can be controlled through the '-e' option. The default is ENXIO. For * EIO failures, any attempt to read data from the device will return EIO, but * subsequent attempt to reopen the device will succeed. For ENXIO failures, * any attempt to read from the device will return EIO, but any attempt to * reopen the device will also return ENXIO. * For label faults, the -L option must be specified. This allows faults * to be injected into either the nvlist, uberblock, pad1, or pad2 region * of all the labels for the specified device. * * This form of the command looks like: * * zinject -d device [-e errno] [-L ] pool * * * DATA FAULTS * * We begin with a tuple of the form: * * * * type A string describing the type of data to target. Each type * implicitly describes how to interpret 'object'. Currently, * the following values are supported: * * data User data for a file * dnode Dnode for a file or directory * * The following MOS objects are special. Instead of injecting * errors on a particular object or blkid, we inject errors across * all objects of the given type. * * mos Any data in the MOS * mosdir object directory * config pool configuration * bpobj blkptr list * spacemap spacemap * metaslab metaslab * errlog persistent error log * * level Object level. Defaults to '0', not applicable to all types. If * a range is given, this corresponds to the indirect block * corresponding to the specific range. * * range A numerical range [start,end) within the object. Defaults to * the full size of the file. * * object A string describing the logical location of the object. For * files and directories (currently the only supported types), * this is the path of the object on disk. * * This is translated, via libzpool, into the following internal representation: * * * * These types should be self-explanatory. This tuple is then passed to the * kernel via a special ioctl() to initiate fault injection for the given * object. Note that 'type' is not strictly necessary for fault injection, but * is used when translating existing faults into a human-readable string. * * * The command itself takes one of the forms: * * zinject * zinject <-a | -u pool> * zinject -c * zinject [-q] <-t type> [-f freq] [-u] [-a] [-m] [-e errno] [-l level] * [-r range] * zinject [-f freq] [-a] [-m] [-u] -b objset:object:level:start:end pool * * With no arguments, the command prints all currently registered injection * handlers, with their numeric identifiers. * * The '-c' option will clear the given handler, or all handlers if 'all' is * specified. * * The '-e' option takes a string describing the errno to simulate. This must * be either 'io' or 'checksum'. In most cases this will result in the same * behavior, but RAID-Z will produce a different set of ereports for this * situation. * * The '-a', '-u', and '-m' flags toggle internal flush behavior. If '-a' is * specified, then the ARC cache is flushed appropriately. If '-u' is * specified, then the underlying SPA is unloaded. Either of these flags can be * specified independently of any other handlers. The '-m' flag automatically * does an unmount and remount of the underlying dataset to aid in flushing the * cache. * * The '-f' flag controls the frequency of errors injected, expressed as a * integer percentage between 1 and 100. The default is 100. * * The this form is responsible for actually injecting the handler into the * framework. It takes the arguments described above, translates them to the * internal tuple using libzpool, and then issues an ioctl() to register the * handler. * * The final form can target a specific bookmark, regardless of whether a * human-readable interface has been designed. It allows developers to specify * a particular block by number. */ #include #include #include #include #include #include #include #include #include #include #include #undef verify /* both libzfs.h and zfs_context.h want to define this */ #include "zinject.h" libzfs_handle_t *g_zfs; int zfs_fd; #ifndef ECKSUM #define ECKSUM EBADE #endif static const char *errtable[TYPE_INVAL] = { "data", "dnode", "mos", "mosdir", "metaslab", "config", "bpobj", "spacemap", "errlog", "uber", "nvlist", "pad1", "pad2" }; static err_type_t name_to_type(const char *arg) { int i; for (i = 0; i < TYPE_INVAL; i++) if (strcmp(errtable[i], arg) == 0) return (i); return (TYPE_INVAL); } static const char * type_to_name(uint64_t type) { switch (type) { case DMU_OT_OBJECT_DIRECTORY: return ("mosdir"); case DMU_OT_OBJECT_ARRAY: return ("metaslab"); case DMU_OT_PACKED_NVLIST: return ("config"); case DMU_OT_BPOBJ: return ("bpobj"); case DMU_OT_SPACE_MAP: return ("spacemap"); case DMU_OT_ERROR_LOG: return ("errlog"); default: return ("-"); } } /* * Print usage message. */ void usage(void) { (void) printf( "usage:\n" "\n" "\tzinject\n" "\n" "\t\tList all active injection records.\n" "\n" "\tzinject -c \n" "\n" "\t\tClear the particular record (if given a numeric ID), or\n" "\t\tall records if 'all' is specificed.\n" "\n" "\tzinject -p pool\n" + "\n" "\t\tInject a panic fault at the specified function. Only \n" "\t\tfunctions which call spa_vdev_config_exit(), or \n" "\t\tspa_vdev_exit() will trigger a panic.\n" "\n" "\tzinject -d device [-e errno] [-L ] [-F]\n" "\t [-T pool\n" + "\n" "\t\tInject a fault into a particular device or the device's\n" "\t\tlabel. Label injection can either be 'nvlist', 'uber',\n " "\t\t'pad1', or 'pad2'.\n" "\t\t'errno' can be 'nxio' (the default), 'io', or 'dtl'.\n" "\n" "\tzinject -d device -A pool\n" + "\n" "\t\tPerform a specific action on a particular device\n" "\n" + "\tzinject -d device -D latency:lanes pool\n" + "\n" + "\t\tAdd an artificial delay to IO requests on a particular\n" + "\t\tdevice, such that the requests take a minimum of 'latency'\n" + "\t\tmilliseconds to complete. Each delay has an associated\n" + "\t\tnumber of 'lanes' which defines the number of concurrent\n" + "\t\tIO requests that can be processed.\n" + "\n" + "\t\tFor example, with a single lane delay of 10 ms (-D 10:1),\n" + "\t\tthe device will only be able to service a single IO request\n" + "\t\tat a time with each request taking 10 ms to complete. So,\n" + "\t\tif only a single request is submitted every 10 ms, the\n" + "\t\taverage latency will be 10 ms; but if more than one request\n" + "\t\tis submitted every 10 ms, the average latency will be more\n" + "\t\tthan 10 ms.\n" + "\n" + "\t\tSimilarly, if a delay of 10 ms is specified to have two\n" + "\t\tlanes (-D 10:2), then the device will be able to service\n" + "\t\ttwo requests at a time, each with a minimum latency of\n" + "\t\t10 ms. So, if two requests are submitted every 10 ms, then\n" + "\t\tthe average latency will be 10 ms; but if more than two\n" + "\t\trequests are submitted every 10 ms, the average latency\n" + "\t\twill be more than 10 ms.\n" + "\n" + "\t\tAlso note, these delays are additive. So two invocations\n" + "\t\tof '-D 10:1', is roughly equivalent to a single invocation\n" + "\t\tof '-D 10:2'. This also means, one can specify multiple\n" + "\t\tlanes with differing target latencies. For example, an\n" + "\t\tinvocation of '-D 10:1' followed by '-D 25:2' will\n" + "\t\tcreate 3 lanes on the device; one lane with a latency\n" + "\t\tof 10 ms and two lanes with a 25 ms latency.\n" + "\n" "\tzinject -I [-s | -g ] pool\n" + "\n" "\t\tCause the pool to stop writing blocks yet not\n" "\t\treport errors for a duration. Simulates buggy hardware\n" "\t\tthat fails to honor cache flush requests.\n" "\t\tDefault duration is 30 seconds. The machine is panicked\n" "\t\tat the end of the duration.\n" "\n" "\tzinject -b objset:object:level:blkid pool\n" "\n" "\t\tInject an error into pool 'pool' with the numeric bookmark\n" "\t\tspecified by the remaining tuple. Each number is in\n" "\t\thexidecimal, and only one block can be specified.\n" "\n" "\tzinject [-q] <-t type> [-e errno] [-l level] [-r range]\n" "\t [-a] [-m] [-u] [-f freq] \n" "\n" "\t\tInject an error into the object specified by the '-t' option\n" "\t\tand the object descriptor. The 'object' parameter is\n" "\t\tinterperted depending on the '-t' option.\n" "\n" "\t\t-q\tQuiet mode. Only print out the handler number added.\n" "\t\t-e\tInject a specific error. Must be either 'io' or\n" "\t\t\t'checksum'. Default is 'io'.\n" "\t\t-l\tInject error at a particular block level. Default is " "0.\n" "\t\t-m\tAutomatically remount underlying filesystem.\n" "\t\t-r\tInject error over a particular logical range of an\n" "\t\t\tobject. Will be translated to the appropriate blkid\n" "\t\t\trange according to the object's properties.\n" "\t\t-a\tFlush the ARC cache. Can be specified without any\n" "\t\t\tassociated object.\n" "\t\t-u\tUnload the associated pool. Can be specified with only\n" "\t\t\ta pool object.\n" "\t\t-f\tOnly inject errors a fraction of the time. Expressed as\n" "\t\t\ta percentage between 1 and 100.\n" "\n" "\t-t data\t\tInject an error into the plain file contents of a\n" "\t\t\tfile. The object must be specified as a complete path\n" "\t\t\tto a file on a ZFS filesystem.\n" "\n" "\t-t dnode\tInject an error into the metadnode in the block\n" "\t\t\tcorresponding to the dnode for a file or directory. The\n" "\t\t\t'-r' option is incompatible with this mode. The object\n" "\t\t\tis specified as a complete path to a file or directory\n" "\t\t\ton a ZFS filesystem.\n" "\n" "\t-t \tInject errors into the MOS for objects of the given\n" "\t\t\ttype. Valid types are: mos, mosdir, config, bpobj,\n" "\t\t\tspacemap, metaslab, errlog. The only valid is\n" "\t\t\tthe poolname.\n"); } static int iter_handlers(int (*func)(int, const char *, zinject_record_t *, void *), void *data) { zfs_cmd_t zc = { 0 }; int ret; while (ioctl(zfs_fd, ZFS_IOC_INJECT_LIST_NEXT, &zc) == 0) if ((ret = func((int)zc.zc_guid, zc.zc_name, &zc.zc_inject_record, data)) != 0) return (ret); if (errno != ENOENT) { (void) fprintf(stderr, "Unable to list handlers: %s\n", strerror(errno)); return (-1); } return (0); } static int print_data_handler(int id, const char *pool, zinject_record_t *record, void *data) { int *count = data; if (record->zi_guid != 0 || record->zi_func[0] != '\0') return (0); if (*count == 0) { (void) printf("%3s %-15s %-6s %-6s %-8s %3s %-15s\n", "ID", "POOL", "OBJSET", "OBJECT", "TYPE", "LVL", "RANGE"); (void) printf("--- --------------- ------ " "------ -------- --- ---------------\n"); } *count += 1; (void) printf("%3d %-15s %-6llu %-6llu %-8s %3d ", id, pool, (u_longlong_t)record->zi_objset, (u_longlong_t)record->zi_object, type_to_name(record->zi_type), record->zi_level); if (record->zi_start == 0 && record->zi_end == -1ULL) (void) printf("all\n"); else (void) printf("[%llu, %llu]\n", (u_longlong_t)record->zi_start, (u_longlong_t)record->zi_end); return (0); } static int print_device_handler(int id, const char *pool, zinject_record_t *record, void *data) { int *count = data; if (record->zi_guid == 0 || record->zi_func[0] != '\0') return (0); + if (record->zi_cmd == ZINJECT_DELAY_IO) + return (0); + if (*count == 0) { (void) printf("%3s %-15s %s\n", "ID", "POOL", "GUID"); (void) printf("--- --------------- ----------------\n"); } *count += 1; (void) printf("%3d %-15s %llx\n", id, pool, (u_longlong_t)record->zi_guid); return (0); } static int +print_delay_handler(int id, const char *pool, zinject_record_t *record, + void *data) +{ + int *count = data; + + if (record->zi_guid == 0 || record->zi_func[0] != '\0') + return (0); + + if (record->zi_cmd != ZINJECT_DELAY_IO) + return (0); + + if (*count == 0) { + (void) printf("%3s %-15s %-15s %-15s %s\n", + "ID", "POOL", "DELAY (ms)", "LANES", "GUID"); + (void) printf("--- --------------- --------------- " + "--------------- ----------------\n"); + } + + *count += 1; + + (void) printf("%3d %-15s %-15llu %-15llu %llx\n", id, pool, + (u_longlong_t)NSEC2MSEC(record->zi_timer), + (u_longlong_t)record->zi_nlanes, + (u_longlong_t)record->zi_guid); + + return (0); +} + +static int print_panic_handler(int id, const char *pool, zinject_record_t *record, void *data) { int *count = data; if (record->zi_func[0] == '\0') return (0); if (*count == 0) { (void) printf("%3s %-15s %s\n", "ID", "POOL", "FUNCTION"); (void) printf("--- --------------- ----------------\n"); } *count += 1; (void) printf("%3d %-15s %s\n", id, pool, record->zi_func); return (0); } /* * Print all registered error handlers. Returns the number of handlers * registered. */ static int print_all_handlers(void) { int count = 0, total = 0; (void) iter_handlers(print_device_handler, &count); if (count > 0) { total += count; (void) printf("\n"); count = 0; } + (void) iter_handlers(print_delay_handler, &count); + if (count > 0) { + total += count; + (void) printf("\n"); + count = 0; + } + (void) iter_handlers(print_data_handler, &count); if (count > 0) { total += count; (void) printf("\n"); count = 0; } (void) iter_handlers(print_panic_handler, &count); return (count + total); } /* ARGSUSED */ static int cancel_one_handler(int id, const char *pool, zinject_record_t *record, void *data) { zfs_cmd_t zc = { 0 }; zc.zc_guid = (uint64_t)id; if (ioctl(zfs_fd, ZFS_IOC_CLEAR_FAULT, &zc) != 0) { (void) fprintf(stderr, "failed to remove handler %d: %s\n", id, strerror(errno)); return (1); } return (0); } /* * Remove all fault injection handlers. */ static int cancel_all_handlers(void) { int ret = iter_handlers(cancel_one_handler, NULL); if (ret == 0) (void) printf("removed all registered handlers\n"); return (ret); } /* * Remove a specific fault injection handler. */ static int cancel_handler(int id) { zfs_cmd_t zc = { 0 }; zc.zc_guid = (uint64_t)id; if (ioctl(zfs_fd, ZFS_IOC_CLEAR_FAULT, &zc) != 0) { (void) fprintf(stderr, "failed to remove handler %d: %s\n", id, strerror(errno)); return (1); } (void) printf("removed handler %d\n", id); return (0); } /* * Register a new fault injection handler. */ static int register_handler(const char *pool, int flags, zinject_record_t *record, int quiet) { zfs_cmd_t zc = { 0 }; (void) strcpy(zc.zc_name, pool); zc.zc_inject_record = *record; zc.zc_guid = flags; if (ioctl(zfs_fd, ZFS_IOC_INJECT_FAULT, &zc) != 0) { (void) fprintf(stderr, "failed to add handler: %s\n", strerror(errno)); return (1); } if (flags & ZINJECT_NULL) return (0); if (quiet) { (void) printf("%llu\n", (u_longlong_t)zc.zc_guid); } else { (void) printf("Added handler %llu with the following " "properties:\n", (u_longlong_t)zc.zc_guid); (void) printf(" pool: %s\n", pool); if (record->zi_guid) { (void) printf(" vdev: %llx\n", (u_longlong_t)record->zi_guid); } else if (record->zi_func[0] != '\0') { (void) printf(" panic function: %s\n", record->zi_func); } else if (record->zi_duration > 0) { (void) printf(" time: %lld seconds\n", (u_longlong_t)record->zi_duration); } else if (record->zi_duration < 0) { (void) printf(" txgs: %lld \n", (u_longlong_t)-record->zi_duration); } else { (void) printf("objset: %llu\n", (u_longlong_t)record->zi_objset); (void) printf("object: %llu\n", (u_longlong_t)record->zi_object); (void) printf(" type: %llu\n", (u_longlong_t)record->zi_type); (void) printf(" level: %d\n", record->zi_level); if (record->zi_start == 0 && record->zi_end == -1ULL) (void) printf(" range: all\n"); else (void) printf(" range: [%llu, %llu)\n", (u_longlong_t)record->zi_start, (u_longlong_t)record->zi_end); } } return (0); } int perform_action(const char *pool, zinject_record_t *record, int cmd) { zfs_cmd_t zc = { 0 }; ASSERT(cmd == VDEV_STATE_DEGRADED || cmd == VDEV_STATE_FAULTED); (void) strlcpy(zc.zc_name, pool, sizeof (zc.zc_name)); zc.zc_guid = record->zi_guid; zc.zc_cookie = cmd; if (ioctl(zfs_fd, ZFS_IOC_VDEV_SET_STATE, &zc) == 0) return (0); return (1); } +static int +parse_delay(char *str, uint64_t *delay, uint64_t *nlanes) +{ + unsigned long scan_delay; + unsigned long scan_nlanes; + + if (sscanf(str, "%lu:%lu", &scan_delay, &scan_nlanes) != 2) + return (1); + + /* + * We explicitly disallow a delay of zero here, because we key + * off this value being non-zero in translate_device(), to + * determine if the fault is a ZINJECT_DELAY_IO fault or not. + */ + if (scan_delay == 0) + return (1); + + /* + * The units for the CLI delay parameter is milliseconds, but + * the data passed to the kernel is interpreted as nanoseconds. + * Thus we scale the milliseconds to nanoseconds here, and this + * nanosecond value is used to pass the delay to the kernel. + */ + *delay = MSEC2NSEC(scan_delay); + *nlanes = scan_nlanes; + + return (0); +} + int main(int argc, char **argv) { int c; char *range = NULL; char *cancel = NULL; char *end; char *raw = NULL; char *device = NULL; int level = 0; int quiet = 0; int error = 0; int domount = 0; int io_type = ZIO_TYPES; int action = VDEV_STATE_UNKNOWN; err_type_t type = TYPE_INVAL; err_type_t label = TYPE_INVAL; zinject_record_t record = { 0 }; char pool[MAXNAMELEN]; char dataset[MAXNAMELEN]; zfs_handle_t *zhp; int nowrites = 0; int dur_txg = 0; int dur_secs = 0; int ret; int flags = 0; if ((g_zfs = libzfs_init()) == NULL) { (void) fprintf(stderr, "internal error: failed to " "initialize ZFS library\n"); return (1); } libzfs_print_on_error(g_zfs, B_TRUE); if ((zfs_fd = open(ZFS_DEV, O_RDWR)) < 0) { (void) fprintf(stderr, "failed to open ZFS device\n"); return (1); } if (argc == 1) { /* * No arguments. Print the available handlers. If there are no * available handlers, direct the user to '-h' for help * information. */ if (print_all_handlers() == 0) { (void) printf("No handlers registered.\n"); (void) printf("Run 'zinject -h' for usage " "information.\n"); } return (0); } while ((c = getopt(argc, argv, ":aA:b:d:D:f:Fg:qhIc:t:T:l:mr:s:e:uL:p:")) != -1) { switch (c) { case 'a': flags |= ZINJECT_FLUSH_ARC; break; case 'A': if (strcasecmp(optarg, "degrade") == 0) { action = VDEV_STATE_DEGRADED; } else if (strcasecmp(optarg, "fault") == 0) { action = VDEV_STATE_FAULTED; } else { (void) fprintf(stderr, "invalid action '%s': " "must be 'degrade' or 'fault'\n", optarg); usage(); return (1); } break; case 'b': raw = optarg; break; case 'c': cancel = optarg; break; case 'd': device = optarg; break; case 'D': - record.zi_timer = strtoull(optarg, &end, 10); - if (errno != 0 || *end != '\0') { + ret = parse_delay(optarg, &record.zi_timer, + &record.zi_nlanes); + if (ret != 0) { (void) fprintf(stderr, "invalid i/o delay " "value: '%s'\n", optarg); usage(); return (1); } break; case 'e': if (strcasecmp(optarg, "io") == 0) { error = EIO; } else if (strcasecmp(optarg, "checksum") == 0) { error = ECKSUM; } else if (strcasecmp(optarg, "nxio") == 0) { error = ENXIO; } else if (strcasecmp(optarg, "dtl") == 0) { error = ECHILD; } else { (void) fprintf(stderr, "invalid error type " "'%s': must be 'io', 'checksum' or " "'nxio'\n", optarg); usage(); return (1); } break; case 'f': record.zi_freq = atoi(optarg); if (record.zi_freq < 1 || record.zi_freq > 100) { (void) fprintf(stderr, "frequency range must " "be in the range (0, 100]\n"); return (1); } break; case 'F': record.zi_failfast = B_TRUE; break; case 'g': dur_txg = 1; record.zi_duration = (int)strtol(optarg, &end, 10); if (record.zi_duration <= 0 || *end != '\0') { (void) fprintf(stderr, "invalid duration '%s': " "must be a positive integer\n", optarg); usage(); return (1); } /* store duration of txgs as its negative */ record.zi_duration *= -1; break; case 'h': usage(); return (0); case 'I': /* default duration, if one hasn't yet been defined */ nowrites = 1; if (dur_secs == 0 && dur_txg == 0) record.zi_duration = 30; break; case 'l': level = (int)strtol(optarg, &end, 10); if (*end != '\0') { (void) fprintf(stderr, "invalid level '%s': " "must be an integer\n", optarg); usage(); return (1); } break; case 'm': domount = 1; break; case 'p': (void) strlcpy(record.zi_func, optarg, sizeof (record.zi_func)); record.zi_cmd = ZINJECT_PANIC; break; case 'q': quiet = 1; break; case 'r': range = optarg; break; case 's': dur_secs = 1; record.zi_duration = (int)strtol(optarg, &end, 10); if (record.zi_duration <= 0 || *end != '\0') { (void) fprintf(stderr, "invalid duration '%s': " "must be a positive integer\n", optarg); usage(); return (1); } break; case 'T': if (strcasecmp(optarg, "read") == 0) { io_type = ZIO_TYPE_READ; } else if (strcasecmp(optarg, "write") == 0) { io_type = ZIO_TYPE_WRITE; } else if (strcasecmp(optarg, "free") == 0) { io_type = ZIO_TYPE_FREE; } else if (strcasecmp(optarg, "claim") == 0) { io_type = ZIO_TYPE_CLAIM; } else if (strcasecmp(optarg, "all") == 0) { io_type = ZIO_TYPES; } else { (void) fprintf(stderr, "invalid I/O type " "'%s': must be 'read', 'write', 'free', " "'claim' or 'all'\n", optarg); usage(); return (1); } break; case 't': if ((type = name_to_type(optarg)) == TYPE_INVAL && !MOS_TYPE(type)) { (void) fprintf(stderr, "invalid type '%s'\n", optarg); usage(); return (1); } break; case 'u': flags |= ZINJECT_UNLOAD_SPA; break; case 'L': if ((label = name_to_type(optarg)) == TYPE_INVAL && !LABEL_TYPE(type)) { (void) fprintf(stderr, "invalid label type " "'%s'\n", optarg); usage(); return (1); } break; case ':': (void) fprintf(stderr, "option -%c requires an " "operand\n", optopt); usage(); return (1); case '?': (void) fprintf(stderr, "invalid option '%c'\n", optopt); usage(); return (2); } } argc -= optind; argv += optind; if (record.zi_duration != 0) record.zi_cmd = ZINJECT_IGNORED_WRITES; if (cancel != NULL) { /* * '-c' is invalid with any other options. */ if (raw != NULL || range != NULL || type != TYPE_INVAL || level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED) { (void) fprintf(stderr, "cancel (-c) incompatible with " "any other options\n"); usage(); return (2); } if (argc != 0) { (void) fprintf(stderr, "extraneous argument to '-c'\n"); usage(); return (2); } if (strcmp(cancel, "all") == 0) { return (cancel_all_handlers()); } else { int id = (int)strtol(cancel, &end, 10); if (*end != '\0') { (void) fprintf(stderr, "invalid handle id '%s':" " must be an integer or 'all'\n", cancel); usage(); return (1); } return (cancel_handler(id)); } } if (device != NULL) { /* * Device (-d) injection uses a completely different mechanism * for doing injection, so handle it separately here. */ if (raw != NULL || range != NULL || type != TYPE_INVAL || level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED) { (void) fprintf(stderr, "device (-d) incompatible with " "data error injection\n"); usage(); return (2); } if (argc != 1) { (void) fprintf(stderr, "device (-d) injection requires " "a single pool name\n"); usage(); return (2); } (void) strcpy(pool, argv[0]); dataset[0] = '\0'; if (error == ECKSUM) { (void) fprintf(stderr, "device error type must be " "'io' or 'nxio'\n"); return (1); } record.zi_iotype = io_type; if (translate_device(pool, device, label, &record) != 0) return (1); if (!error) error = ENXIO; if (action != VDEV_STATE_UNKNOWN) return (perform_action(pool, &record, action)); } else if (raw != NULL) { if (range != NULL || type != TYPE_INVAL || level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED) { (void) fprintf(stderr, "raw (-b) format with " "any other options\n"); usage(); return (2); } if (argc != 1) { (void) fprintf(stderr, "raw (-b) format expects a " "single pool name\n"); usage(); return (2); } (void) strcpy(pool, argv[0]); dataset[0] = '\0'; if (error == ENXIO) { (void) fprintf(stderr, "data error type must be " "'checksum' or 'io'\n"); return (1); } record.zi_cmd = ZINJECT_DATA_FAULT; if (translate_raw(raw, &record) != 0) return (1); if (!error) error = EIO; } else if (record.zi_cmd == ZINJECT_PANIC) { if (raw != NULL || range != NULL || type != TYPE_INVAL || level != 0 || device != NULL) { (void) fprintf(stderr, "panic (-p) incompatible with " "other options\n"); usage(); return (2); } if (argc < 1 || argc > 2) { (void) fprintf(stderr, "panic (-p) injection requires " "a single pool name and an optional id\n"); usage(); return (2); } (void) strcpy(pool, argv[0]); if (argv[1] != NULL) record.zi_type = atoi(argv[1]); dataset[0] = '\0'; } else if (record.zi_cmd == ZINJECT_IGNORED_WRITES) { if (nowrites == 0) { (void) fprintf(stderr, "-s or -g meaningless " "without -I (ignore writes)\n"); usage(); return (2); } else if (dur_secs && dur_txg) { (void) fprintf(stderr, "choose a duration either " "in seconds (-s) or a number of txgs (-g) " "but not both\n"); usage(); return (2); } else if (argc != 1) { (void) fprintf(stderr, "ignore writes (-I) " "injection requires a single pool name\n"); usage(); return (2); } (void) strcpy(pool, argv[0]); dataset[0] = '\0'; } else if (type == TYPE_INVAL) { if (flags == 0) { (void) fprintf(stderr, "at least one of '-b', '-d', " "'-t', '-a', '-p', '-I' or '-u' " "must be specified\n"); usage(); return (2); } if (argc == 1 && (flags & ZINJECT_UNLOAD_SPA)) { (void) strcpy(pool, argv[0]); dataset[0] = '\0'; } else if (argc != 0) { (void) fprintf(stderr, "extraneous argument for " "'-f'\n"); usage(); return (2); } flags |= ZINJECT_NULL; } else { if (argc != 1) { (void) fprintf(stderr, "missing object\n"); usage(); return (2); } if (error == ENXIO) { (void) fprintf(stderr, "data error type must be " "'checksum' or 'io'\n"); return (1); } record.zi_cmd = ZINJECT_DATA_FAULT; if (translate_record(type, argv[0], range, level, &record, pool, dataset) != 0) return (1); if (!error) error = EIO; } /* * If this is pool-wide metadata, unmount everything. The ioctl() will * unload the pool, so that we trigger spa-wide reopen of metadata next * time we access the pool. */ if (dataset[0] != '\0' && domount) { if ((zhp = zfs_open(g_zfs, dataset, ZFS_TYPE_DATASET)) == NULL) return (1); if (zfs_unmount(zhp, NULL, 0) != 0) return (1); } record.zi_error = error; ret = register_handler(pool, flags, &record, quiet); if (dataset[0] != '\0' && domount) ret = (zfs_mount(zhp, NULL, 0) != 0); libzfs_fini(g_zfs); return (ret); } Index: stable/10/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_compat.c =================================================================== --- stable/10/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_compat.c (revision 297107) +++ stable/10/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_compat.c (revision 297108) @@ -1,115 +1,118 @@ /* * CDDL HEADER SART * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2013 Martin Matuska . All rights reserved. */ #include "libzfs_compat.h" int zfs_ioctl_version = ZFS_IOCVER_UNDEF; static int zfs_spa_version = -1; /* * Get zfs_ioctl_version */ int get_zfs_ioctl_version(void) { size_t ver_size; int ver = ZFS_IOCVER_NONE; ver_size = sizeof(ver); sysctlbyname("vfs.zfs.version.ioctl", &ver, &ver_size, NULL, 0); return (ver); } /* * Get the SPA version */ static int get_zfs_spa_version(void) { size_t ver_size; int ver = 0; ver_size = sizeof(ver); sysctlbyname("vfs.zfs.version.spa", &ver, &ver_size, NULL, 0); return (ver); } /* * This is FreeBSD version of ioctl, because Solaris' ioctl() updates * zc_nvlist_dst_size even if an error is returned, on FreeBSD if an * error is returned zc_nvlist_dst_size won't be updated. */ int zcmd_ioctl(int fd, int request, zfs_cmd_t *zc) { size_t oldsize; int ret, cflag = ZFS_CMD_COMPAT_NONE; if (zfs_ioctl_version == ZFS_IOCVER_UNDEF) zfs_ioctl_version = get_zfs_ioctl_version(); if (zfs_ioctl_version >= ZFS_IOCVER_DEADMAN) { switch (zfs_ioctl_version) { + case ZFS_IOCVER_RESUME: + cflag = ZFS_CMD_COMPAT_RESUME; + break; case ZFS_IOCVER_EDBP: cflag = ZFS_CMD_COMPAT_EDBP; break; case ZFS_IOCVER_ZCMD: cflag = ZFS_CMD_COMPAT_ZCMD; break; case ZFS_IOCVER_LZC: cflag = ZFS_CMD_COMPAT_LZC; break; case ZFS_IOCVER_DEADMAN: cflag = ZFS_CMD_COMPAT_DEADMAN; break; } } else { /* * If vfs.zfs.version.ioctl is not defined, assume we have v28 * compatible binaries and use vfs.zfs.version.spa to test for v15 */ cflag = ZFS_CMD_COMPAT_V28; if (zfs_spa_version < 0) zfs_spa_version = get_zfs_spa_version(); if (zfs_spa_version == SPA_VERSION_15 || zfs_spa_version == SPA_VERSION_14 || zfs_spa_version == SPA_VERSION_13) cflag = ZFS_CMD_COMPAT_V15; } oldsize = zc->zc_nvlist_dst_size; ret = zcmd_ioctl_compat(fd, request, zc, cflag); if (ret == 0 && oldsize < zc->zc_nvlist_dst_size) { ret = -1; errno = ENOMEM; } return (ret); } Index: stable/10/sys/cddl/compat/opensolaris/sys/callo.h =================================================================== --- stable/10/sys/cddl/compat/opensolaris/sys/callo.h (nonexistent) +++ stable/10/sys/cddl/compat/opensolaris/sys/callo.h (revision 297108) @@ -0,0 +1,37 @@ +/*- + * Copyright (c) 2016 Alexander Motin + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _OPENSOLARIS_SYS_CALLO_H_ +#define _OPENSOLARIS_SYS_CALLO_H_ + +#include_next + +#define CALLOUT_REALTIME 0 /* realtime callout type */ +#define CALLOUT_NORMAL 1 /* normal callout type */ + +#endif /* !_OPENSOLARIS_SYS_CALLO_H_ */ Property changes on: stable/10/sys/cddl/compat/opensolaris/sys/callo.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: stable/10/sys/cddl/compat/opensolaris/sys/systm.h =================================================================== --- stable/10/sys/cddl/compat/opensolaris/sys/systm.h (revision 297107) +++ stable/10/sys/cddl/compat/opensolaris/sys/systm.h (revision 297108) @@ -1,47 +1,50 @@ /*- * Copyright (c) 2007 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _OPENSOLARIS_SYS_SYSTM_H_ #define _OPENSOLARIS_SYS_SYSTM_H_ #ifdef _KERNEL #include #include_next #include #define PAGESIZE PAGE_SIZE #define PAGEOFFSET (PAGESIZE - 1) #define PAGEMASK (~PAGEOFFSET) #define delay(x) pause("soldelay", (x)) +#define timeout_generic(type, fn, arg, t, r, f) \ + timeout(fn, arg, t / (NANOSEC/hz) + 1) + #endif /* _KERNEL */ #endif /* _OPENSOLARIS_SYS_SYSTM_H_ */ Index: stable/10/sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.c =================================================================== --- stable/10/sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.c (revision 297107) +++ stable/10/sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.c (revision 297108) @@ -1,1059 +1,1288 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2013 Xin Li . All rights reserved. * Copyright 2013 Martin Matuska . All rights reserved. * Portions Copyright 2005, 2010, Oracle and/or its affiliates. * All rights reserved. * Use is subject to license terms. */ #include #include #include #include #include #include #include #include #include "zfs_namecheck.h" #include "zfs_ioctl_compat.h" static int zfs_version_ioctl = ZFS_IOCVER_CURRENT; SYSCTL_DECL(_vfs_zfs_version); SYSCTL_INT(_vfs_zfs_version, OID_AUTO, ioctl, CTLFLAG_RD, &zfs_version_ioctl, 0, "ZFS_IOCTL_VERSION"); /* * FreeBSD zfs_cmd compatibility with older binaries * appropriately remap/extend the zfs_cmd_t structure */ void zfs_cmd_compat_get(zfs_cmd_t *zc, caddr_t addr, const int cflag) { zfs_cmd_v15_t *zc_c; zfs_cmd_v28_t *zc28_c; zfs_cmd_deadman_t *zcdm_c; zfs_cmd_zcmd_t *zcmd_c; zfs_cmd_edbp_t *edbp_c; + zfs_cmd_resume_t *resume_c; switch (cflag) { + case ZFS_CMD_COMPAT_RESUME: + resume_c = (void *)addr; + /* zc */ + strlcpy(zc->zc_name, resume_c->zc_name, MAXPATHLEN); + strlcpy(zc->zc_value, resume_c->zc_value, MAXPATHLEN * 2); + strlcpy(zc->zc_string, resume_c->zc_string, MAXPATHLEN); + +#define FIELD_COPY(field) zc->field = resume_c->field + FIELD_COPY(zc_nvlist_src); + FIELD_COPY(zc_nvlist_src_size); + FIELD_COPY(zc_nvlist_dst); + FIELD_COPY(zc_nvlist_dst_size); + FIELD_COPY(zc_nvlist_dst_filled); + FIELD_COPY(zc_pad2); + FIELD_COPY(zc_history); + FIELD_COPY(zc_guid); + FIELD_COPY(zc_nvlist_conf); + FIELD_COPY(zc_nvlist_conf_size); + FIELD_COPY(zc_cookie); + FIELD_COPY(zc_objset_type); + FIELD_COPY(zc_perm_action); + FIELD_COPY(zc_history_len); + FIELD_COPY(zc_history_offset); + FIELD_COPY(zc_obj); + FIELD_COPY(zc_iflags); + FIELD_COPY(zc_share); + FIELD_COPY(zc_jailid); + FIELD_COPY(zc_objset_stats); + FIELD_COPY(zc_begin_record); + FIELD_COPY(zc_inject_record.zi_objset); + FIELD_COPY(zc_inject_record.zi_object); + FIELD_COPY(zc_inject_record.zi_start); + FIELD_COPY(zc_inject_record.zi_end); + FIELD_COPY(zc_inject_record.zi_guid); + FIELD_COPY(zc_inject_record.zi_level); + FIELD_COPY(zc_inject_record.zi_error); + FIELD_COPY(zc_inject_record.zi_type); + FIELD_COPY(zc_inject_record.zi_freq); + FIELD_COPY(zc_inject_record.zi_failfast); + strlcpy(zc->zc_inject_record.zi_func, + resume_c->zc_inject_record.zi_func, MAXNAMELEN); + FIELD_COPY(zc_inject_record.zi_iotype); + FIELD_COPY(zc_inject_record.zi_duration); + FIELD_COPY(zc_inject_record.zi_timer); + zc->zc_inject_record.zi_nlanes = 1; + FIELD_COPY(zc_inject_record.zi_cmd); + FIELD_COPY(zc_inject_record.zi_pad); + FIELD_COPY(zc_defer_destroy); + FIELD_COPY(zc_flags); + FIELD_COPY(zc_action_handle); + FIELD_COPY(zc_cleanup_fd); + FIELD_COPY(zc_simple); + FIELD_COPY(zc_resumable); + FIELD_COPY(zc_sendobj); + FIELD_COPY(zc_fromobj); + FIELD_COPY(zc_createtxg); + FIELD_COPY(zc_stat); +#undef FIELD_COPY + break; + case ZFS_CMD_COMPAT_EDBP: edbp_c = (void *)addr; /* zc */ strlcpy(zc->zc_name, edbp_c->zc_name, MAXPATHLEN); strlcpy(zc->zc_value, edbp_c->zc_value, MAXPATHLEN * 2); strlcpy(zc->zc_string, edbp_c->zc_string, MAXPATHLEN); -#define ZCMD_COPY(field) zc->field = edbp_c->field - ZCMD_COPY(zc_nvlist_src); - ZCMD_COPY(zc_nvlist_src_size); - ZCMD_COPY(zc_nvlist_dst); - ZCMD_COPY(zc_nvlist_dst_size); - ZCMD_COPY(zc_nvlist_dst_filled); - ZCMD_COPY(zc_pad2); - ZCMD_COPY(zc_history); - ZCMD_COPY(zc_guid); - ZCMD_COPY(zc_nvlist_conf); - ZCMD_COPY(zc_nvlist_conf_size); - ZCMD_COPY(zc_cookie); - ZCMD_COPY(zc_objset_type); - ZCMD_COPY(zc_perm_action); - ZCMD_COPY(zc_history_len); - ZCMD_COPY(zc_history_offset); - ZCMD_COPY(zc_obj); - ZCMD_COPY(zc_iflags); - ZCMD_COPY(zc_share); - ZCMD_COPY(zc_jailid); - ZCMD_COPY(zc_objset_stats); +#define FIELD_COPY(field) zc->field = edbp_c->field + FIELD_COPY(zc_nvlist_src); + FIELD_COPY(zc_nvlist_src_size); + FIELD_COPY(zc_nvlist_dst); + FIELD_COPY(zc_nvlist_dst_size); + FIELD_COPY(zc_nvlist_dst_filled); + FIELD_COPY(zc_pad2); + FIELD_COPY(zc_history); + FIELD_COPY(zc_guid); + FIELD_COPY(zc_nvlist_conf); + FIELD_COPY(zc_nvlist_conf_size); + FIELD_COPY(zc_cookie); + FIELD_COPY(zc_objset_type); + FIELD_COPY(zc_perm_action); + FIELD_COPY(zc_history_len); + FIELD_COPY(zc_history_offset); + FIELD_COPY(zc_obj); + FIELD_COPY(zc_iflags); + FIELD_COPY(zc_share); + FIELD_COPY(zc_jailid); + FIELD_COPY(zc_objset_stats); zc->zc_begin_record.drr_u.drr_begin = edbp_c->zc_begin_record; - ZCMD_COPY(zc_inject_record); - ZCMD_COPY(zc_defer_destroy); - ZCMD_COPY(zc_flags); - ZCMD_COPY(zc_action_handle); - ZCMD_COPY(zc_cleanup_fd); - ZCMD_COPY(zc_simple); + FIELD_COPY(zc_inject_record.zi_objset); + FIELD_COPY(zc_inject_record.zi_object); + FIELD_COPY(zc_inject_record.zi_start); + FIELD_COPY(zc_inject_record.zi_end); + FIELD_COPY(zc_inject_record.zi_guid); + FIELD_COPY(zc_inject_record.zi_level); + FIELD_COPY(zc_inject_record.zi_error); + FIELD_COPY(zc_inject_record.zi_type); + FIELD_COPY(zc_inject_record.zi_freq); + FIELD_COPY(zc_inject_record.zi_failfast); + strlcpy(zc->zc_inject_record.zi_func, + edbp_c->zc_inject_record.zi_func, MAXNAMELEN); + FIELD_COPY(zc_inject_record.zi_iotype); + FIELD_COPY(zc_inject_record.zi_duration); + FIELD_COPY(zc_inject_record.zi_timer); + zc->zc_inject_record.zi_nlanes = 1; + FIELD_COPY(zc_inject_record.zi_cmd); + FIELD_COPY(zc_inject_record.zi_pad); + FIELD_COPY(zc_defer_destroy); + FIELD_COPY(zc_flags); + FIELD_COPY(zc_action_handle); + FIELD_COPY(zc_cleanup_fd); + FIELD_COPY(zc_simple); zc->zc_resumable = B_FALSE; - ZCMD_COPY(zc_sendobj); - ZCMD_COPY(zc_fromobj); - ZCMD_COPY(zc_createtxg); - ZCMD_COPY(zc_stat); -#undef ZCMD_COPY + FIELD_COPY(zc_sendobj); + FIELD_COPY(zc_fromobj); + FIELD_COPY(zc_createtxg); + FIELD_COPY(zc_stat); +#undef FIELD_COPY break; case ZFS_CMD_COMPAT_ZCMD: zcmd_c = (void *)addr; /* zc */ strlcpy(zc->zc_name, zcmd_c->zc_name, MAXPATHLEN); strlcpy(zc->zc_value, zcmd_c->zc_value, MAXPATHLEN * 2); strlcpy(zc->zc_string, zcmd_c->zc_string, MAXPATHLEN); -#define ZCMD_COPY(field) zc->field = zcmd_c->field - ZCMD_COPY(zc_nvlist_src); - ZCMD_COPY(zc_nvlist_src_size); - ZCMD_COPY(zc_nvlist_dst); - ZCMD_COPY(zc_nvlist_dst_size); - ZCMD_COPY(zc_nvlist_dst_filled); - ZCMD_COPY(zc_pad2); - ZCMD_COPY(zc_history); - ZCMD_COPY(zc_guid); - ZCMD_COPY(zc_nvlist_conf); - ZCMD_COPY(zc_nvlist_conf_size); - ZCMD_COPY(zc_cookie); - ZCMD_COPY(zc_objset_type); - ZCMD_COPY(zc_perm_action); - ZCMD_COPY(zc_history_len); - ZCMD_COPY(zc_history_offset); - ZCMD_COPY(zc_obj); - ZCMD_COPY(zc_iflags); - ZCMD_COPY(zc_share); - ZCMD_COPY(zc_jailid); - ZCMD_COPY(zc_objset_stats); +#define FIELD_COPY(field) zc->field = zcmd_c->field + FIELD_COPY(zc_nvlist_src); + FIELD_COPY(zc_nvlist_src_size); + FIELD_COPY(zc_nvlist_dst); + FIELD_COPY(zc_nvlist_dst_size); + FIELD_COPY(zc_nvlist_dst_filled); + FIELD_COPY(zc_pad2); + FIELD_COPY(zc_history); + FIELD_COPY(zc_guid); + FIELD_COPY(zc_nvlist_conf); + FIELD_COPY(zc_nvlist_conf_size); + FIELD_COPY(zc_cookie); + FIELD_COPY(zc_objset_type); + FIELD_COPY(zc_perm_action); + FIELD_COPY(zc_history_len); + FIELD_COPY(zc_history_offset); + FIELD_COPY(zc_obj); + FIELD_COPY(zc_iflags); + FIELD_COPY(zc_share); + FIELD_COPY(zc_jailid); + FIELD_COPY(zc_objset_stats); zc->zc_begin_record.drr_u.drr_begin = zcmd_c->zc_begin_record; - ZCMD_COPY(zc_inject_record); + FIELD_COPY(zc_inject_record.zi_objset); + FIELD_COPY(zc_inject_record.zi_object); + FIELD_COPY(zc_inject_record.zi_start); + FIELD_COPY(zc_inject_record.zi_end); + FIELD_COPY(zc_inject_record.zi_guid); + FIELD_COPY(zc_inject_record.zi_level); + FIELD_COPY(zc_inject_record.zi_error); + FIELD_COPY(zc_inject_record.zi_type); + FIELD_COPY(zc_inject_record.zi_freq); + FIELD_COPY(zc_inject_record.zi_failfast); + strlcpy(zc->zc_inject_record.zi_func, + zcmd_c->zc_inject_record.zi_func, MAXNAMELEN); + FIELD_COPY(zc_inject_record.zi_iotype); + FIELD_COPY(zc_inject_record.zi_duration); + FIELD_COPY(zc_inject_record.zi_timer); + zc->zc_inject_record.zi_nlanes = 1; + FIELD_COPY(zc_inject_record.zi_cmd); + FIELD_COPY(zc_inject_record.zi_pad); /* boolean_t -> uint32_t */ zc->zc_defer_destroy = (uint32_t)(zcmd_c->zc_defer_destroy); zc->zc_flags = 0; - ZCMD_COPY(zc_action_handle); - ZCMD_COPY(zc_cleanup_fd); - ZCMD_COPY(zc_simple); + FIELD_COPY(zc_action_handle); + FIELD_COPY(zc_cleanup_fd); + FIELD_COPY(zc_simple); zc->zc_resumable = B_FALSE; - ZCMD_COPY(zc_sendobj); - ZCMD_COPY(zc_fromobj); - ZCMD_COPY(zc_createtxg); - ZCMD_COPY(zc_stat); -#undef ZCMD_COPY + FIELD_COPY(zc_sendobj); + FIELD_COPY(zc_fromobj); + FIELD_COPY(zc_createtxg); + FIELD_COPY(zc_stat); +#undef FIELD_COPY break; case ZFS_CMD_COMPAT_DEADMAN: zcdm_c = (void *)addr; /* zc */ strlcpy(zc->zc_name, zcdm_c->zc_name, MAXPATHLEN); strlcpy(zc->zc_value, zcdm_c->zc_value, MAXPATHLEN * 2); strlcpy(zc->zc_string, zcdm_c->zc_string, MAXPATHLEN); + +#define FIELD_COPY(field) zc->field = zcdm_c->field zc->zc_guid = zcdm_c->zc_guid; zc->zc_nvlist_conf = zcdm_c->zc_nvlist_conf; zc->zc_nvlist_conf_size = zcdm_c->zc_nvlist_conf_size; zc->zc_nvlist_src = zcdm_c->zc_nvlist_src; zc->zc_nvlist_src_size = zcdm_c->zc_nvlist_src_size; zc->zc_nvlist_dst = zcdm_c->zc_nvlist_dst; zc->zc_nvlist_dst_size = zcdm_c->zc_nvlist_dst_size; zc->zc_cookie = zcdm_c->zc_cookie; zc->zc_objset_type = zcdm_c->zc_objset_type; zc->zc_perm_action = zcdm_c->zc_perm_action; zc->zc_history = zcdm_c->zc_history; zc->zc_history_len = zcdm_c->zc_history_len; zc->zc_history_offset = zcdm_c->zc_history_offset; zc->zc_obj = zcdm_c->zc_obj; zc->zc_iflags = zcdm_c->zc_iflags; zc->zc_share = zcdm_c->zc_share; zc->zc_jailid = zcdm_c->zc_jailid; zc->zc_objset_stats = zcdm_c->zc_objset_stats; zc->zc_begin_record.drr_u.drr_begin = zcdm_c->zc_begin_record; zc->zc_defer_destroy = zcdm_c->zc_defer_destroy; (void)zcdm_c->zc_temphold; zc->zc_action_handle = zcdm_c->zc_action_handle; zc->zc_cleanup_fd = zcdm_c->zc_cleanup_fd; zc->zc_simple = zcdm_c->zc_simple; zc->zc_resumable = B_FALSE; zc->zc_sendobj = zcdm_c->zc_sendobj; zc->zc_fromobj = zcdm_c->zc_fromobj; zc->zc_createtxg = zcdm_c->zc_createtxg; zc->zc_stat = zcdm_c->zc_stat; + FIELD_COPY(zc_inject_record.zi_objset); + FIELD_COPY(zc_inject_record.zi_object); + FIELD_COPY(zc_inject_record.zi_start); + FIELD_COPY(zc_inject_record.zi_end); + FIELD_COPY(zc_inject_record.zi_guid); + FIELD_COPY(zc_inject_record.zi_level); + FIELD_COPY(zc_inject_record.zi_error); + FIELD_COPY(zc_inject_record.zi_type); + FIELD_COPY(zc_inject_record.zi_freq); + FIELD_COPY(zc_inject_record.zi_failfast); + strlcpy(zc->zc_inject_record.zi_func, + resume_c->zc_inject_record.zi_func, MAXNAMELEN); + FIELD_COPY(zc_inject_record.zi_iotype); + FIELD_COPY(zc_inject_record.zi_duration); + FIELD_COPY(zc_inject_record.zi_timer); + zc->zc_inject_record.zi_nlanes = 1; + FIELD_COPY(zc_inject_record.zi_cmd); + FIELD_COPY(zc_inject_record.zi_pad); - /* zc_inject_record doesn't change in libzfs_core */ - zcdm_c->zc_inject_record = zc->zc_inject_record; - /* we always assume zc_nvlist_dst_filled is true */ zc->zc_nvlist_dst_filled = B_TRUE; +#undef FIELD_COPY break; case ZFS_CMD_COMPAT_V28: zc28_c = (void *)addr; /* zc */ strlcpy(zc->zc_name, zc28_c->zc_name, MAXPATHLEN); strlcpy(zc->zc_value, zc28_c->zc_value, MAXPATHLEN * 2); strlcpy(zc->zc_string, zc28_c->zc_string, MAXPATHLEN); zc->zc_guid = zc28_c->zc_guid; zc->zc_nvlist_conf = zc28_c->zc_nvlist_conf; zc->zc_nvlist_conf_size = zc28_c->zc_nvlist_conf_size; zc->zc_nvlist_src = zc28_c->zc_nvlist_src; zc->zc_nvlist_src_size = zc28_c->zc_nvlist_src_size; zc->zc_nvlist_dst = zc28_c->zc_nvlist_dst; zc->zc_nvlist_dst_size = zc28_c->zc_nvlist_dst_size; zc->zc_cookie = zc28_c->zc_cookie; zc->zc_objset_type = zc28_c->zc_objset_type; zc->zc_perm_action = zc28_c->zc_perm_action; zc->zc_history = zc28_c->zc_history; zc->zc_history_len = zc28_c->zc_history_len; zc->zc_history_offset = zc28_c->zc_history_offset; zc->zc_obj = zc28_c->zc_obj; zc->zc_iflags = zc28_c->zc_iflags; zc->zc_share = zc28_c->zc_share; zc->zc_jailid = zc28_c->zc_jailid; zc->zc_objset_stats = zc28_c->zc_objset_stats; zc->zc_begin_record.drr_u.drr_begin = zc28_c->zc_begin_record; zc->zc_defer_destroy = zc28_c->zc_defer_destroy; (void)zc28_c->zc_temphold; zc->zc_action_handle = zc28_c->zc_action_handle; zc->zc_cleanup_fd = zc28_c->zc_cleanup_fd; zc->zc_simple = zc28_c->zc_simple; zc->zc_resumable = B_FALSE; zc->zc_sendobj = zc28_c->zc_sendobj; zc->zc_fromobj = zc28_c->zc_fromobj; zc->zc_createtxg = zc28_c->zc_createtxg; zc->zc_stat = zc28_c->zc_stat; /* zc->zc_inject_record */ zc->zc_inject_record.zi_objset = zc28_c->zc_inject_record.zi_objset; zc->zc_inject_record.zi_object = zc28_c->zc_inject_record.zi_object; zc->zc_inject_record.zi_start = zc28_c->zc_inject_record.zi_start; zc->zc_inject_record.zi_end = zc28_c->zc_inject_record.zi_end; zc->zc_inject_record.zi_guid = zc28_c->zc_inject_record.zi_guid; zc->zc_inject_record.zi_level = zc28_c->zc_inject_record.zi_level; zc->zc_inject_record.zi_error = zc28_c->zc_inject_record.zi_error; zc->zc_inject_record.zi_type = zc28_c->zc_inject_record.zi_type; zc->zc_inject_record.zi_freq = zc28_c->zc_inject_record.zi_freq; zc->zc_inject_record.zi_failfast = zc28_c->zc_inject_record.zi_failfast; strlcpy(zc->zc_inject_record.zi_func, zc28_c->zc_inject_record.zi_func, MAXNAMELEN); zc->zc_inject_record.zi_iotype = zc28_c->zc_inject_record.zi_iotype; zc->zc_inject_record.zi_duration = zc28_c->zc_inject_record.zi_duration; zc->zc_inject_record.zi_timer = zc28_c->zc_inject_record.zi_timer; + zc->zc_inject_record.zi_nlanes = 1; zc->zc_inject_record.zi_cmd = ZINJECT_UNINITIALIZED; zc->zc_inject_record.zi_pad = 0; break; case ZFS_CMD_COMPAT_V15: zc_c = (void *)addr; /* zc */ strlcpy(zc->zc_name, zc_c->zc_name, MAXPATHLEN); strlcpy(zc->zc_value, zc_c->zc_value, MAXPATHLEN); strlcpy(zc->zc_string, zc_c->zc_string, MAXPATHLEN); zc->zc_guid = zc_c->zc_guid; zc->zc_nvlist_conf = zc_c->zc_nvlist_conf; zc->zc_nvlist_conf_size = zc_c->zc_nvlist_conf_size; zc->zc_nvlist_src = zc_c->zc_nvlist_src; zc->zc_nvlist_src_size = zc_c->zc_nvlist_src_size; zc->zc_nvlist_dst = zc_c->zc_nvlist_dst; zc->zc_nvlist_dst_size = zc_c->zc_nvlist_dst_size; zc->zc_cookie = zc_c->zc_cookie; zc->zc_objset_type = zc_c->zc_objset_type; zc->zc_perm_action = zc_c->zc_perm_action; zc->zc_history = zc_c->zc_history; zc->zc_history_len = zc_c->zc_history_len; zc->zc_history_offset = zc_c->zc_history_offset; zc->zc_obj = zc_c->zc_obj; zc->zc_share = zc_c->zc_share; zc->zc_jailid = zc_c->zc_jailid; zc->zc_objset_stats = zc_c->zc_objset_stats; zc->zc_begin_record.drr_u.drr_begin = zc_c->zc_begin_record; /* zc->zc_inject_record */ zc->zc_inject_record.zi_objset = zc_c->zc_inject_record.zi_objset; zc->zc_inject_record.zi_object = zc_c->zc_inject_record.zi_object; zc->zc_inject_record.zi_start = zc_c->zc_inject_record.zi_start; zc->zc_inject_record.zi_end = zc_c->zc_inject_record.zi_end; zc->zc_inject_record.zi_guid = zc_c->zc_inject_record.zi_guid; zc->zc_inject_record.zi_level = zc_c->zc_inject_record.zi_level; zc->zc_inject_record.zi_error = zc_c->zc_inject_record.zi_error; zc->zc_inject_record.zi_type = zc_c->zc_inject_record.zi_type; zc->zc_inject_record.zi_freq = zc_c->zc_inject_record.zi_freq; zc->zc_inject_record.zi_failfast = zc_c->zc_inject_record.zi_failfast; break; } } void zfs_cmd_compat_put(zfs_cmd_t *zc, caddr_t addr, const int request, const int cflag) { zfs_cmd_v15_t *zc_c; zfs_cmd_v28_t *zc28_c; zfs_cmd_deadman_t *zcdm_c; zfs_cmd_zcmd_t *zcmd_c; zfs_cmd_edbp_t *edbp_c; + zfs_cmd_resume_t *resume_c; switch (cflag) { + case ZFS_CMD_COMPAT_RESUME: + resume_c = (void *)addr; + strlcpy(resume_c->zc_name, zc->zc_name, MAXPATHLEN); + strlcpy(resume_c->zc_value, zc->zc_value, MAXPATHLEN * 2); + strlcpy(resume_c->zc_string, zc->zc_string, MAXPATHLEN); + +#define FIELD_COPY(field) resume_c->field = zc->field + FIELD_COPY(zc_nvlist_src); + FIELD_COPY(zc_nvlist_src_size); + FIELD_COPY(zc_nvlist_dst); + FIELD_COPY(zc_nvlist_dst_size); + FIELD_COPY(zc_nvlist_dst_filled); + FIELD_COPY(zc_pad2); + FIELD_COPY(zc_history); + FIELD_COPY(zc_guid); + FIELD_COPY(zc_nvlist_conf); + FIELD_COPY(zc_nvlist_conf_size); + FIELD_COPY(zc_cookie); + FIELD_COPY(zc_objset_type); + FIELD_COPY(zc_perm_action); + FIELD_COPY(zc_history_len); + FIELD_COPY(zc_history_offset); + FIELD_COPY(zc_obj); + FIELD_COPY(zc_iflags); + FIELD_COPY(zc_share); + FIELD_COPY(zc_jailid); + FIELD_COPY(zc_objset_stats); + FIELD_COPY(zc_begin_record); + FIELD_COPY(zc_inject_record.zi_objset); + FIELD_COPY(zc_inject_record.zi_object); + FIELD_COPY(zc_inject_record.zi_start); + FIELD_COPY(zc_inject_record.zi_end); + FIELD_COPY(zc_inject_record.zi_guid); + FIELD_COPY(zc_inject_record.zi_level); + FIELD_COPY(zc_inject_record.zi_error); + FIELD_COPY(zc_inject_record.zi_type); + FIELD_COPY(zc_inject_record.zi_freq); + FIELD_COPY(zc_inject_record.zi_failfast); + strlcpy(resume_c->zc_inject_record.zi_func, + zc->zc_inject_record.zi_func, MAXNAMELEN); + FIELD_COPY(zc_inject_record.zi_iotype); + FIELD_COPY(zc_inject_record.zi_duration); + FIELD_COPY(zc_inject_record.zi_timer); + FIELD_COPY(zc_inject_record.zi_cmd); + FIELD_COPY(zc_inject_record.zi_pad); + FIELD_COPY(zc_defer_destroy); + FIELD_COPY(zc_flags); + FIELD_COPY(zc_action_handle); + FIELD_COPY(zc_cleanup_fd); + FIELD_COPY(zc_simple); + FIELD_COPY(zc_sendobj); + FIELD_COPY(zc_fromobj); + FIELD_COPY(zc_createtxg); + FIELD_COPY(zc_stat); +#undef FIELD_COPY + break; + case ZFS_CMD_COMPAT_EDBP: edbp_c = (void *)addr; strlcpy(edbp_c->zc_name, zc->zc_name, MAXPATHLEN); strlcpy(edbp_c->zc_value, zc->zc_value, MAXPATHLEN * 2); strlcpy(edbp_c->zc_string, zc->zc_string, MAXPATHLEN); -#define ZCMD_COPY(field) edbp_c->field = zc->field - ZCMD_COPY(zc_nvlist_src); - ZCMD_COPY(zc_nvlist_src_size); - ZCMD_COPY(zc_nvlist_dst); - ZCMD_COPY(zc_nvlist_dst_size); - ZCMD_COPY(zc_nvlist_dst_filled); - ZCMD_COPY(zc_pad2); - ZCMD_COPY(zc_history); - ZCMD_COPY(zc_guid); - ZCMD_COPY(zc_nvlist_conf); - ZCMD_COPY(zc_nvlist_conf_size); - ZCMD_COPY(zc_cookie); - ZCMD_COPY(zc_objset_type); - ZCMD_COPY(zc_perm_action); - ZCMD_COPY(zc_history_len); - ZCMD_COPY(zc_history_offset); - ZCMD_COPY(zc_obj); - ZCMD_COPY(zc_iflags); - ZCMD_COPY(zc_share); - ZCMD_COPY(zc_jailid); - ZCMD_COPY(zc_objset_stats); +#define FIELD_COPY(field) edbp_c->field = zc->field + FIELD_COPY(zc_nvlist_src); + FIELD_COPY(zc_nvlist_src_size); + FIELD_COPY(zc_nvlist_dst); + FIELD_COPY(zc_nvlist_dst_size); + FIELD_COPY(zc_nvlist_dst_filled); + FIELD_COPY(zc_pad2); + FIELD_COPY(zc_history); + FIELD_COPY(zc_guid); + FIELD_COPY(zc_nvlist_conf); + FIELD_COPY(zc_nvlist_conf_size); + FIELD_COPY(zc_cookie); + FIELD_COPY(zc_objset_type); + FIELD_COPY(zc_perm_action); + FIELD_COPY(zc_history_len); + FIELD_COPY(zc_history_offset); + FIELD_COPY(zc_obj); + FIELD_COPY(zc_iflags); + FIELD_COPY(zc_share); + FIELD_COPY(zc_jailid); + FIELD_COPY(zc_objset_stats); edbp_c->zc_begin_record = zc->zc_begin_record.drr_u.drr_begin; - ZCMD_COPY(zc_inject_record); - ZCMD_COPY(zc_defer_destroy); - ZCMD_COPY(zc_flags); - ZCMD_COPY(zc_action_handle); - ZCMD_COPY(zc_cleanup_fd); - ZCMD_COPY(zc_simple); - ZCMD_COPY(zc_sendobj); - ZCMD_COPY(zc_fromobj); - ZCMD_COPY(zc_createtxg); - ZCMD_COPY(zc_stat); -#undef ZCMD_COPY + FIELD_COPY(zc_inject_record.zi_objset); + FIELD_COPY(zc_inject_record.zi_object); + FIELD_COPY(zc_inject_record.zi_start); + FIELD_COPY(zc_inject_record.zi_end); + FIELD_COPY(zc_inject_record.zi_guid); + FIELD_COPY(zc_inject_record.zi_level); + FIELD_COPY(zc_inject_record.zi_error); + FIELD_COPY(zc_inject_record.zi_type); + FIELD_COPY(zc_inject_record.zi_freq); + FIELD_COPY(zc_inject_record.zi_failfast); + strlcpy(resume_c->zc_inject_record.zi_func, + zc->zc_inject_record.zi_func, MAXNAMELEN); + FIELD_COPY(zc_inject_record.zi_iotype); + FIELD_COPY(zc_inject_record.zi_duration); + FIELD_COPY(zc_inject_record.zi_timer); + FIELD_COPY(zc_inject_record.zi_cmd); + FIELD_COPY(zc_inject_record.zi_pad); + FIELD_COPY(zc_defer_destroy); + FIELD_COPY(zc_flags); + FIELD_COPY(zc_action_handle); + FIELD_COPY(zc_cleanup_fd); + FIELD_COPY(zc_simple); + FIELD_COPY(zc_sendobj); + FIELD_COPY(zc_fromobj); + FIELD_COPY(zc_createtxg); + FIELD_COPY(zc_stat); +#undef FIELD_COPY break; case ZFS_CMD_COMPAT_ZCMD: zcmd_c = (void *)addr; /* zc */ strlcpy(zcmd_c->zc_name, zc->zc_name, MAXPATHLEN); strlcpy(zcmd_c->zc_value, zc->zc_value, MAXPATHLEN * 2); strlcpy(zcmd_c->zc_string, zc->zc_string, MAXPATHLEN); -#define ZCMD_COPY(field) zcmd_c->field = zc->field - ZCMD_COPY(zc_nvlist_src); - ZCMD_COPY(zc_nvlist_src_size); - ZCMD_COPY(zc_nvlist_dst); - ZCMD_COPY(zc_nvlist_dst_size); - ZCMD_COPY(zc_nvlist_dst_filled); - ZCMD_COPY(zc_pad2); - ZCMD_COPY(zc_history); - ZCMD_COPY(zc_guid); - ZCMD_COPY(zc_nvlist_conf); - ZCMD_COPY(zc_nvlist_conf_size); - ZCMD_COPY(zc_cookie); - ZCMD_COPY(zc_objset_type); - ZCMD_COPY(zc_perm_action); - ZCMD_COPY(zc_history_len); - ZCMD_COPY(zc_history_offset); - ZCMD_COPY(zc_obj); - ZCMD_COPY(zc_iflags); - ZCMD_COPY(zc_share); - ZCMD_COPY(zc_jailid); - ZCMD_COPY(zc_objset_stats); +#define FIELD_COPY(field) zcmd_c->field = zc->field + FIELD_COPY(zc_nvlist_src); + FIELD_COPY(zc_nvlist_src_size); + FIELD_COPY(zc_nvlist_dst); + FIELD_COPY(zc_nvlist_dst_size); + FIELD_COPY(zc_nvlist_dst_filled); + FIELD_COPY(zc_pad2); + FIELD_COPY(zc_history); + FIELD_COPY(zc_guid); + FIELD_COPY(zc_nvlist_conf); + FIELD_COPY(zc_nvlist_conf_size); + FIELD_COPY(zc_cookie); + FIELD_COPY(zc_objset_type); + FIELD_COPY(zc_perm_action); + FIELD_COPY(zc_history_len); + FIELD_COPY(zc_history_offset); + FIELD_COPY(zc_obj); + FIELD_COPY(zc_iflags); + FIELD_COPY(zc_share); + FIELD_COPY(zc_jailid); + FIELD_COPY(zc_objset_stats); zcmd_c->zc_begin_record = zc->zc_begin_record.drr_u.drr_begin; - ZCMD_COPY(zc_inject_record); + FIELD_COPY(zc_inject_record.zi_objset); + FIELD_COPY(zc_inject_record.zi_object); + FIELD_COPY(zc_inject_record.zi_start); + FIELD_COPY(zc_inject_record.zi_end); + FIELD_COPY(zc_inject_record.zi_guid); + FIELD_COPY(zc_inject_record.zi_level); + FIELD_COPY(zc_inject_record.zi_error); + FIELD_COPY(zc_inject_record.zi_type); + FIELD_COPY(zc_inject_record.zi_freq); + FIELD_COPY(zc_inject_record.zi_failfast); + strlcpy(resume_c->zc_inject_record.zi_func, + zc->zc_inject_record.zi_func, MAXNAMELEN); + FIELD_COPY(zc_inject_record.zi_iotype); + FIELD_COPY(zc_inject_record.zi_duration); + FIELD_COPY(zc_inject_record.zi_timer); + FIELD_COPY(zc_inject_record.zi_cmd); + FIELD_COPY(zc_inject_record.zi_pad); /* boolean_t -> uint32_t */ zcmd_c->zc_defer_destroy = (uint32_t)(zc->zc_defer_destroy); zcmd_c->zc_temphold = 0; - ZCMD_COPY(zc_action_handle); - ZCMD_COPY(zc_cleanup_fd); - ZCMD_COPY(zc_simple); - ZCMD_COPY(zc_sendobj); - ZCMD_COPY(zc_fromobj); - ZCMD_COPY(zc_createtxg); - ZCMD_COPY(zc_stat); -#undef ZCMD_COPY + FIELD_COPY(zc_action_handle); + FIELD_COPY(zc_cleanup_fd); + FIELD_COPY(zc_simple); + FIELD_COPY(zc_sendobj); + FIELD_COPY(zc_fromobj); + FIELD_COPY(zc_createtxg); + FIELD_COPY(zc_stat); +#undef FIELD_COPY break; case ZFS_CMD_COMPAT_DEADMAN: zcdm_c = (void *)addr; strlcpy(zcdm_c->zc_name, zc->zc_name, MAXPATHLEN); strlcpy(zcdm_c->zc_value, zc->zc_value, MAXPATHLEN * 2); strlcpy(zcdm_c->zc_string, zc->zc_string, MAXPATHLEN); + +#define FIELD_COPY(field) zcdm_c->field = zc->field zcdm_c->zc_guid = zc->zc_guid; zcdm_c->zc_nvlist_conf = zc->zc_nvlist_conf; zcdm_c->zc_nvlist_conf_size = zc->zc_nvlist_conf_size; zcdm_c->zc_nvlist_src = zc->zc_nvlist_src; zcdm_c->zc_nvlist_src_size = zc->zc_nvlist_src_size; zcdm_c->zc_nvlist_dst = zc->zc_nvlist_dst; zcdm_c->zc_nvlist_dst_size = zc->zc_nvlist_dst_size; zcdm_c->zc_cookie = zc->zc_cookie; zcdm_c->zc_objset_type = zc->zc_objset_type; zcdm_c->zc_perm_action = zc->zc_perm_action; zcdm_c->zc_history = zc->zc_history; zcdm_c->zc_history_len = zc->zc_history_len; zcdm_c->zc_history_offset = zc->zc_history_offset; zcdm_c->zc_obj = zc->zc_obj; zcdm_c->zc_iflags = zc->zc_iflags; zcdm_c->zc_share = zc->zc_share; zcdm_c->zc_jailid = zc->zc_jailid; zcdm_c->zc_objset_stats = zc->zc_objset_stats; zcdm_c->zc_begin_record = zc->zc_begin_record.drr_u.drr_begin; zcdm_c->zc_defer_destroy = zc->zc_defer_destroy; zcdm_c->zc_temphold = 0; zcdm_c->zc_action_handle = zc->zc_action_handle; zcdm_c->zc_cleanup_fd = zc->zc_cleanup_fd; zcdm_c->zc_simple = zc->zc_simple; zcdm_c->zc_sendobj = zc->zc_sendobj; zcdm_c->zc_fromobj = zc->zc_fromobj; zcdm_c->zc_createtxg = zc->zc_createtxg; zcdm_c->zc_stat = zc->zc_stat; - - /* zc_inject_record doesn't change in libzfs_core */ - zc->zc_inject_record = zcdm_c->zc_inject_record; + FIELD_COPY(zc_inject_record.zi_objset); + FIELD_COPY(zc_inject_record.zi_object); + FIELD_COPY(zc_inject_record.zi_start); + FIELD_COPY(zc_inject_record.zi_end); + FIELD_COPY(zc_inject_record.zi_guid); + FIELD_COPY(zc_inject_record.zi_level); + FIELD_COPY(zc_inject_record.zi_error); + FIELD_COPY(zc_inject_record.zi_type); + FIELD_COPY(zc_inject_record.zi_freq); + FIELD_COPY(zc_inject_record.zi_failfast); + strlcpy(resume_c->zc_inject_record.zi_func, + zc->zc_inject_record.zi_func, MAXNAMELEN); + FIELD_COPY(zc_inject_record.zi_iotype); + FIELD_COPY(zc_inject_record.zi_duration); + FIELD_COPY(zc_inject_record.zi_timer); + FIELD_COPY(zc_inject_record.zi_cmd); + FIELD_COPY(zc_inject_record.zi_pad); +#undef FIELD_COPY #ifndef _KERNEL if (request == ZFS_IOC_RECV) strlcpy(zcdm_c->zc_top_ds, zc->zc_value + strlen(zc->zc_value) + 1, (MAXPATHLEN * 2) - strlen(zc->zc_value) - 1); #endif break; case ZFS_CMD_COMPAT_V28: zc28_c = (void *)addr; strlcpy(zc28_c->zc_name, zc->zc_name, MAXPATHLEN); strlcpy(zc28_c->zc_value, zc->zc_value, MAXPATHLEN * 2); strlcpy(zc28_c->zc_string, zc->zc_string, MAXPATHLEN); zc28_c->zc_guid = zc->zc_guid; zc28_c->zc_nvlist_conf = zc->zc_nvlist_conf; zc28_c->zc_nvlist_conf_size = zc->zc_nvlist_conf_size; zc28_c->zc_nvlist_src = zc->zc_nvlist_src; zc28_c->zc_nvlist_src_size = zc->zc_nvlist_src_size; zc28_c->zc_nvlist_dst = zc->zc_nvlist_dst; zc28_c->zc_nvlist_dst_size = zc->zc_nvlist_dst_size; zc28_c->zc_cookie = zc->zc_cookie; zc28_c->zc_objset_type = zc->zc_objset_type; zc28_c->zc_perm_action = zc->zc_perm_action; zc28_c->zc_history = zc->zc_history; zc28_c->zc_history_len = zc->zc_history_len; zc28_c->zc_history_offset = zc->zc_history_offset; zc28_c->zc_obj = zc->zc_obj; zc28_c->zc_iflags = zc->zc_iflags; zc28_c->zc_share = zc->zc_share; zc28_c->zc_jailid = zc->zc_jailid; zc28_c->zc_objset_stats = zc->zc_objset_stats; zc28_c->zc_begin_record = zc->zc_begin_record.drr_u.drr_begin; zc28_c->zc_defer_destroy = zc->zc_defer_destroy; zc28_c->zc_temphold = 0; zc28_c->zc_action_handle = zc->zc_action_handle; zc28_c->zc_cleanup_fd = zc->zc_cleanup_fd; zc28_c->zc_simple = zc->zc_simple; zc28_c->zc_sendobj = zc->zc_sendobj; zc28_c->zc_fromobj = zc->zc_fromobj; zc28_c->zc_createtxg = zc->zc_createtxg; zc28_c->zc_stat = zc->zc_stat; #ifndef _KERNEL if (request == ZFS_IOC_RECV) strlcpy(zc28_c->zc_top_ds, zc->zc_value + strlen(zc->zc_value) + 1, MAXPATHLEN * 2 - strlen(zc->zc_value) - 1); #endif /* zc_inject_record */ zc28_c->zc_inject_record.zi_objset = zc->zc_inject_record.zi_objset; zc28_c->zc_inject_record.zi_object = zc->zc_inject_record.zi_object; zc28_c->zc_inject_record.zi_start = zc->zc_inject_record.zi_start; zc28_c->zc_inject_record.zi_end = zc->zc_inject_record.zi_end; zc28_c->zc_inject_record.zi_guid = zc->zc_inject_record.zi_guid; zc28_c->zc_inject_record.zi_level = zc->zc_inject_record.zi_level; zc28_c->zc_inject_record.zi_error = zc->zc_inject_record.zi_error; zc28_c->zc_inject_record.zi_type = zc->zc_inject_record.zi_type; zc28_c->zc_inject_record.zi_freq = zc->zc_inject_record.zi_freq; zc28_c->zc_inject_record.zi_failfast = zc->zc_inject_record.zi_failfast; strlcpy(zc28_c->zc_inject_record.zi_func, zc->zc_inject_record.zi_func, MAXNAMELEN); zc28_c->zc_inject_record.zi_iotype = zc->zc_inject_record.zi_iotype; zc28_c->zc_inject_record.zi_duration = zc->zc_inject_record.zi_duration; zc28_c->zc_inject_record.zi_timer = zc->zc_inject_record.zi_timer; break; case ZFS_CMD_COMPAT_V15: zc_c = (void *)addr; /* zc */ strlcpy(zc_c->zc_name, zc->zc_name, MAXPATHLEN); strlcpy(zc_c->zc_value, zc->zc_value, MAXPATHLEN); strlcpy(zc_c->zc_string, zc->zc_string, MAXPATHLEN); zc_c->zc_guid = zc->zc_guid; zc_c->zc_nvlist_conf = zc->zc_nvlist_conf; zc_c->zc_nvlist_conf_size = zc->zc_nvlist_conf_size; zc_c->zc_nvlist_src = zc->zc_nvlist_src; zc_c->zc_nvlist_src_size = zc->zc_nvlist_src_size; zc_c->zc_nvlist_dst = zc->zc_nvlist_dst; zc_c->zc_nvlist_dst_size = zc->zc_nvlist_dst_size; zc_c->zc_cookie = zc->zc_cookie; zc_c->zc_objset_type = zc->zc_objset_type; zc_c->zc_perm_action = zc->zc_perm_action; zc_c->zc_history = zc->zc_history; zc_c->zc_history_len = zc->zc_history_len; zc_c->zc_history_offset = zc->zc_history_offset; zc_c->zc_obj = zc->zc_obj; zc_c->zc_share = zc->zc_share; zc_c->zc_jailid = zc->zc_jailid; zc_c->zc_objset_stats = zc->zc_objset_stats; zc_c->zc_begin_record = zc->zc_begin_record.drr_u.drr_begin; /* zc_inject_record */ zc_c->zc_inject_record.zi_objset = zc->zc_inject_record.zi_objset; zc_c->zc_inject_record.zi_object = zc->zc_inject_record.zi_object; zc_c->zc_inject_record.zi_start = zc->zc_inject_record.zi_start; zc_c->zc_inject_record.zi_end = zc->zc_inject_record.zi_end; zc_c->zc_inject_record.zi_guid = zc->zc_inject_record.zi_guid; zc_c->zc_inject_record.zi_level = zc->zc_inject_record.zi_level; zc_c->zc_inject_record.zi_error = zc->zc_inject_record.zi_error; zc_c->zc_inject_record.zi_type = zc->zc_inject_record.zi_type; zc_c->zc_inject_record.zi_freq = zc->zc_inject_record.zi_freq; zc_c->zc_inject_record.zi_failfast = zc->zc_inject_record.zi_failfast; break; } } static int zfs_ioctl_compat_get_nvlist(uint64_t nvl, size_t size, int iflag, nvlist_t **nvp) { char *packed; int error; nvlist_t *list = NULL; /* * Read in and unpack the user-supplied nvlist. */ if (size == 0) return (EINVAL); #ifdef _KERNEL packed = kmem_alloc(size, KM_SLEEP); if ((error = ddi_copyin((void *)(uintptr_t)nvl, packed, size, iflag)) != 0) { kmem_free(packed, size); return (error); } #else packed = (void *)(uintptr_t)nvl; #endif error = nvlist_unpack(packed, size, &list, 0); #ifdef _KERNEL kmem_free(packed, size); #endif if (error != 0) return (error); *nvp = list; return (0); } static int zfs_ioctl_compat_put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl) { char *packed = NULL; int error = 0; size_t size; VERIFY(nvlist_size(nvl, &size, NV_ENCODE_NATIVE) == 0); #ifdef _KERNEL packed = kmem_alloc(size, KM_SLEEP); VERIFY(nvlist_pack(nvl, &packed, &size, NV_ENCODE_NATIVE, KM_SLEEP) == 0); if (ddi_copyout(packed, (void *)(uintptr_t)zc->zc_nvlist_dst, size, zc->zc_iflags) != 0) error = EFAULT; kmem_free(packed, size); #else packed = (void *)(uintptr_t)zc->zc_nvlist_dst; VERIFY(nvlist_pack(nvl, &packed, &size, NV_ENCODE_NATIVE, 0) == 0); #endif zc->zc_nvlist_dst_size = size; return (error); } static void zfs_ioctl_compat_fix_stats_nvlist(nvlist_t *nvl) { nvlist_t **child; nvlist_t *nvroot = NULL; vdev_stat_t *vs; uint_t c, children, nelem; if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child, &children) == 0) { for (c = 0; c < children; c++) { zfs_ioctl_compat_fix_stats_nvlist(child[c]); } } if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0) zfs_ioctl_compat_fix_stats_nvlist(nvroot); #ifdef _KERNEL if ((nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_VDEV_STATS, #else if ((nvlist_lookup_uint64_array(nvl, "stats", #endif (uint64_t **)&vs, &nelem) == 0)) { nvlist_add_uint64_array(nvl, #ifdef _KERNEL "stats", #else ZPOOL_CONFIG_VDEV_STATS, #endif (uint64_t *)vs, nelem); #ifdef _KERNEL nvlist_remove(nvl, ZPOOL_CONFIG_VDEV_STATS, #else nvlist_remove(nvl, "stats", #endif DATA_TYPE_UINT64_ARRAY); } } static int zfs_ioctl_compat_fix_stats(zfs_cmd_t *zc, const int nc) { nvlist_t *nv, *nvp = NULL; nvpair_t *elem; int error; if ((error = zfs_ioctl_compat_get_nvlist(zc->zc_nvlist_dst, zc->zc_nvlist_dst_size, zc->zc_iflags, &nv)) != 0) return (error); if (nc == 5) { /* ZFS_IOC_POOL_STATS */ elem = NULL; while ((elem = nvlist_next_nvpair(nv, elem)) != NULL) { if (nvpair_value_nvlist(elem, &nvp) == 0) zfs_ioctl_compat_fix_stats_nvlist(nvp); } elem = NULL; } else zfs_ioctl_compat_fix_stats_nvlist(nv); error = zfs_ioctl_compat_put_nvlist(zc, nv); nvlist_free(nv); return (error); } static int zfs_ioctl_compat_pool_get_props(zfs_cmd_t *zc) { nvlist_t *nv, *nva = NULL; int error; if ((error = zfs_ioctl_compat_get_nvlist(zc->zc_nvlist_dst, zc->zc_nvlist_dst_size, zc->zc_iflags, &nv)) != 0) return (error); #ifdef _KERNEL if (nvlist_lookup_nvlist(nv, "allocated", &nva) == 0) { nvlist_add_nvlist(nv, "used", nva); nvlist_remove(nv, "allocated", DATA_TYPE_NVLIST); } if (nvlist_lookup_nvlist(nv, "free", &nva) == 0) { nvlist_add_nvlist(nv, "available", nva); nvlist_remove(nv, "free", DATA_TYPE_NVLIST); } #else if (nvlist_lookup_nvlist(nv, "used", &nva) == 0) { nvlist_add_nvlist(nv, "allocated", nva); nvlist_remove(nv, "used", DATA_TYPE_NVLIST); } if (nvlist_lookup_nvlist(nv, "available", &nva) == 0) { nvlist_add_nvlist(nv, "free", nva); nvlist_remove(nv, "available", DATA_TYPE_NVLIST); } #endif error = zfs_ioctl_compat_put_nvlist(zc, nv); nvlist_free(nv); return (error); } #ifndef _KERNEL int zcmd_ioctl_compat(int fd, int request, zfs_cmd_t *zc, const int cflag) { int nc, ret; void *zc_c; unsigned long ncmd; zfs_iocparm_t zp; switch (cflag) { case ZFS_CMD_COMPAT_NONE: ncmd = _IOWR('Z', request, struct zfs_iocparm); zp.zfs_cmd = (uint64_t)zc; zp.zfs_cmd_size = sizeof(zfs_cmd_t); zp.zfs_ioctl_version = ZFS_IOCVER_CURRENT; return (ioctl(fd, ncmd, &zp)); + case ZFS_CMD_COMPAT_RESUME: + ncmd = _IOWR('Z', request, struct zfs_iocparm); + zp.zfs_cmd = (uint64_t)zc; + zp.zfs_cmd_size = sizeof(zfs_cmd_resume_t); + zp.zfs_ioctl_version = ZFS_IOCVER_RESUME; + return (ioctl(fd, ncmd, &zp)); case ZFS_CMD_COMPAT_EDBP: ncmd = _IOWR('Z', request, struct zfs_iocparm); zp.zfs_cmd = (uint64_t)zc; zp.zfs_cmd_size = sizeof(zfs_cmd_edbp_t); zp.zfs_ioctl_version = ZFS_IOCVER_EDBP; return (ioctl(fd, ncmd, &zp)); case ZFS_CMD_COMPAT_ZCMD: ncmd = _IOWR('Z', request, struct zfs_iocparm); zp.zfs_cmd = (uint64_t)zc; zp.zfs_cmd_size = sizeof(zfs_cmd_zcmd_t); zp.zfs_ioctl_version = ZFS_IOCVER_ZCMD; return (ioctl(fd, ncmd, &zp)); case ZFS_CMD_COMPAT_LZC: ncmd = _IOWR('Z', request, struct zfs_cmd); return (ioctl(fd, ncmd, zc)); case ZFS_CMD_COMPAT_DEADMAN: zc_c = malloc(sizeof(zfs_cmd_deadman_t)); ncmd = _IOWR('Z', request, struct zfs_cmd_deadman); break; case ZFS_CMD_COMPAT_V28: zc_c = malloc(sizeof(zfs_cmd_v28_t)); ncmd = _IOWR('Z', request, struct zfs_cmd_v28); break; case ZFS_CMD_COMPAT_V15: nc = zfs_ioctl_v28_to_v15[request]; zc_c = malloc(sizeof(zfs_cmd_v15_t)); ncmd = _IOWR('Z', nc, struct zfs_cmd_v15); break; default: return (EINVAL); } if (ZFS_IOCREQ(ncmd) == ZFS_IOC_COMPAT_FAIL) return (ENOTSUP); zfs_cmd_compat_put(zc, (caddr_t)zc_c, request, cflag); ret = ioctl(fd, ncmd, zc_c); if (cflag == ZFS_CMD_COMPAT_V15 && nc == ZFS_IOC_POOL_IMPORT) ret = ioctl(fd, _IOWR('Z', ZFS_IOC_POOL_CONFIGS, struct zfs_cmd_v15), zc_c); zfs_cmd_compat_get(zc, (caddr_t)zc_c, cflag); free(zc_c); if (cflag == ZFS_CMD_COMPAT_V15) { switch (nc) { case ZFS_IOC_POOL_IMPORT: case ZFS_IOC_POOL_CONFIGS: case ZFS_IOC_POOL_STATS: case ZFS_IOC_POOL_TRYIMPORT: zfs_ioctl_compat_fix_stats(zc, nc); break; case 41: /* ZFS_IOC_POOL_GET_PROPS (v15) */ zfs_ioctl_compat_pool_get_props(zc); break; } } return (ret); } #else /* _KERNEL */ int zfs_ioctl_compat_pre(zfs_cmd_t *zc, int *vec, const int cflag) { int error = 0; /* are we creating a clone? */ if (*vec == ZFS_IOC_CREATE && zc->zc_value[0] != '\0') *vec = ZFS_IOC_CLONE; if (cflag == ZFS_CMD_COMPAT_V15) { switch (*vec) { case 7: /* ZFS_IOC_POOL_SCRUB (v15) */ zc->zc_cookie = POOL_SCAN_SCRUB; break; } } return (error); } void zfs_ioctl_compat_post(zfs_cmd_t *zc, int vec, const int cflag) { if (cflag == ZFS_CMD_COMPAT_V15) { switch (vec) { case ZFS_IOC_POOL_CONFIGS: case ZFS_IOC_POOL_STATS: case ZFS_IOC_POOL_TRYIMPORT: zfs_ioctl_compat_fix_stats(zc, vec); break; case 41: /* ZFS_IOC_POOL_GET_PROPS (v15) */ zfs_ioctl_compat_pool_get_props(zc); break; } } } nvlist_t * zfs_ioctl_compat_innvl(zfs_cmd_t *zc, nvlist_t * innvl, const int vec, const int cflag) { nvlist_t *nvl, *tmpnvl, *hnvl; nvpair_t *elem; char *poolname, *snapname; int err; if (cflag == ZFS_CMD_COMPAT_NONE || cflag == ZFS_CMD_COMPAT_LZC || - cflag == ZFS_CMD_COMPAT_ZCMD || cflag == ZFS_CMD_COMPAT_EDBP) + cflag == ZFS_CMD_COMPAT_ZCMD || cflag == ZFS_CMD_COMPAT_EDBP || + cflag == ZFS_CMD_COMPAT_RESUME) goto out; switch (vec) { case ZFS_IOC_CREATE: nvl = fnvlist_alloc(); fnvlist_add_int32(nvl, "type", zc->zc_objset_type); if (innvl != NULL) { fnvlist_add_nvlist(nvl, "props", innvl); nvlist_free(innvl); } return (nvl); break; case ZFS_IOC_CLONE: nvl = fnvlist_alloc(); fnvlist_add_string(nvl, "origin", zc->zc_value); if (innvl != NULL) { fnvlist_add_nvlist(nvl, "props", innvl); nvlist_free(innvl); } return (nvl); break; case ZFS_IOC_SNAPSHOT: if (innvl == NULL) goto out; nvl = fnvlist_alloc(); fnvlist_add_nvlist(nvl, "props", innvl); tmpnvl = fnvlist_alloc(); snapname = kmem_asprintf("%s@%s", zc->zc_name, zc->zc_value); fnvlist_add_boolean(tmpnvl, snapname); kmem_free(snapname, strlen(snapname + 1)); /* check if we are doing a recursive snapshot */ if (zc->zc_cookie) dmu_get_recursive_snaps_nvl(zc->zc_name, zc->zc_value, tmpnvl); fnvlist_add_nvlist(nvl, "snaps", tmpnvl); fnvlist_free(tmpnvl); nvlist_free(innvl); /* strip dataset part from zc->zc_name */ zc->zc_name[strcspn(zc->zc_name, "/@")] = '\0'; return (nvl); break; case ZFS_IOC_SPACE_SNAPS: nvl = fnvlist_alloc(); fnvlist_add_string(nvl, "firstsnap", zc->zc_value); if (innvl != NULL) nvlist_free(innvl); return (nvl); break; case ZFS_IOC_DESTROY_SNAPS: if (innvl == NULL && cflag == ZFS_CMD_COMPAT_DEADMAN) goto out; nvl = fnvlist_alloc(); if (innvl != NULL) { fnvlist_add_nvlist(nvl, "snaps", innvl); } else { /* * We are probably called by even older binaries, * allocate and populate nvlist with recursive * snapshots */ if (zfs_component_namecheck(zc->zc_value, NULL, NULL) == 0) { tmpnvl = fnvlist_alloc(); if (dmu_get_recursive_snaps_nvl(zc->zc_name, zc->zc_value, tmpnvl) == 0) fnvlist_add_nvlist(nvl, "snaps", tmpnvl); nvlist_free(tmpnvl); } } if (innvl != NULL) nvlist_free(innvl); /* strip dataset part from zc->zc_name */ zc->zc_name[strcspn(zc->zc_name, "/@")] = '\0'; return (nvl); break; case ZFS_IOC_HOLD: nvl = fnvlist_alloc(); tmpnvl = fnvlist_alloc(); if (zc->zc_cleanup_fd != -1) fnvlist_add_int32(nvl, "cleanup_fd", (int32_t)zc->zc_cleanup_fd); if (zc->zc_cookie) { hnvl = fnvlist_alloc(); if (dmu_get_recursive_snaps_nvl(zc->zc_name, zc->zc_value, hnvl) == 0) { elem = NULL; while ((elem = nvlist_next_nvpair(hnvl, elem)) != NULL) { nvlist_add_string(tmpnvl, nvpair_name(elem), zc->zc_string); } } nvlist_free(hnvl); } else { snapname = kmem_asprintf("%s@%s", zc->zc_name, zc->zc_value); nvlist_add_string(tmpnvl, snapname, zc->zc_string); kmem_free(snapname, strlen(snapname + 1)); } fnvlist_add_nvlist(nvl, "holds", tmpnvl); nvlist_free(tmpnvl); if (innvl != NULL) nvlist_free(innvl); /* strip dataset part from zc->zc_name */ zc->zc_name[strcspn(zc->zc_name, "/@")] = '\0'; return (nvl); break; case ZFS_IOC_RELEASE: nvl = fnvlist_alloc(); tmpnvl = fnvlist_alloc(); if (zc->zc_cookie) { hnvl = fnvlist_alloc(); if (dmu_get_recursive_snaps_nvl(zc->zc_name, zc->zc_value, hnvl) == 0) { elem = NULL; while ((elem = nvlist_next_nvpair(hnvl, elem)) != NULL) { fnvlist_add_boolean(tmpnvl, zc->zc_string); fnvlist_add_nvlist(nvl, nvpair_name(elem), tmpnvl); } } nvlist_free(hnvl); } else { snapname = kmem_asprintf("%s@%s", zc->zc_name, zc->zc_value); fnvlist_add_boolean(tmpnvl, zc->zc_string); fnvlist_add_nvlist(nvl, snapname, tmpnvl); kmem_free(snapname, strlen(snapname + 1)); } nvlist_free(tmpnvl); if (innvl != NULL) nvlist_free(innvl); /* strip dataset part from zc->zc_name */ zc->zc_name[strcspn(zc->zc_name, "/@")] = '\0'; return (nvl); break; } out: return (innvl); } nvlist_t * zfs_ioctl_compat_outnvl(zfs_cmd_t *zc, nvlist_t * outnvl, const int vec, const int cflag) { nvlist_t *tmpnvl; if (cflag == ZFS_CMD_COMPAT_NONE || cflag == ZFS_CMD_COMPAT_LZC || - cflag == ZFS_CMD_COMPAT_ZCMD || cflag == ZFS_CMD_COMPAT_EDBP) + cflag == ZFS_CMD_COMPAT_ZCMD || cflag == ZFS_CMD_COMPAT_EDBP || + cflag == ZFS_CMD_COMPAT_RESUME) return (outnvl); switch (vec) { case ZFS_IOC_SPACE_SNAPS: (void) nvlist_lookup_uint64(outnvl, "used", &zc->zc_cookie); (void) nvlist_lookup_uint64(outnvl, "compressed", &zc->zc_objset_type); (void) nvlist_lookup_uint64(outnvl, "uncompressed", &zc->zc_perm_action); nvlist_free(outnvl); /* return empty outnvl */ tmpnvl = fnvlist_alloc(); return (tmpnvl); break; case ZFS_IOC_CREATE: case ZFS_IOC_CLONE: case ZFS_IOC_HOLD: case ZFS_IOC_RELEASE: nvlist_free(outnvl); /* return empty outnvl */ tmpnvl = fnvlist_alloc(); return (tmpnvl); break; } return (outnvl); } #endif /* KERNEL */ Index: stable/10/sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.h =================================================================== --- stable/10/sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.h (revision 297107) +++ stable/10/sys/cddl/contrib/opensolaris/common/zfs/zfs_ioctl_compat.h (revision 297108) @@ -1,434 +1,498 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2014 Xin Li . All rights reserved. * Copyright 2013 Martin Matuska . All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_ZFS_IOCTL_COMPAT_H #define _SYS_ZFS_IOCTL_COMPAT_H #include #include #include #include #include #ifdef _KERNEL #include #endif /* _KERNEL */ #ifdef __cplusplus extern "C" { #endif /* * Backwards ioctl compatibility */ /* ioctl versions for vfs.zfs.version.ioctl */ #define ZFS_IOCVER_UNDEF -1 #define ZFS_IOCVER_NONE 0 #define ZFS_IOCVER_DEADMAN 1 #define ZFS_IOCVER_LZC 2 #define ZFS_IOCVER_ZCMD 3 #define ZFS_IOCVER_EDBP 4 #define ZFS_IOCVER_RESUME 5 -#define ZFS_IOCVER_CURRENT ZFS_IOCVER_RESUME +#define ZFS_IOCVER_INLANES 6 +#define ZFS_IOCVER_CURRENT ZFS_IOCVER_INLANES /* compatibility conversion flag */ #define ZFS_CMD_COMPAT_NONE 0 #define ZFS_CMD_COMPAT_V15 1 #define ZFS_CMD_COMPAT_V28 2 #define ZFS_CMD_COMPAT_DEADMAN 3 #define ZFS_CMD_COMPAT_LZC 4 #define ZFS_CMD_COMPAT_ZCMD 5 #define ZFS_CMD_COMPAT_EDBP 6 +#define ZFS_CMD_COMPAT_RESUME 7 #define ZFS_IOC_COMPAT_PASS 254 #define ZFS_IOC_COMPAT_FAIL 255 #define ZFS_IOCREQ(ioreq) ((ioreq) & 0xff) typedef struct zfs_iocparm { uint32_t zfs_ioctl_version; uint64_t zfs_cmd; uint64_t zfs_cmd_size; } zfs_iocparm_t; typedef struct zinject_record_v15 { uint64_t zi_objset; uint64_t zi_object; uint64_t zi_start; uint64_t zi_end; uint64_t zi_guid; uint32_t zi_level; uint32_t zi_error; uint64_t zi_type; uint32_t zi_freq; uint32_t zi_failfast; } zinject_record_v15_t; typedef struct zfs_cmd_v15 { char zc_name[MAXPATHLEN]; char zc_value[MAXPATHLEN]; char zc_string[MAXNAMELEN]; uint64_t zc_guid; uint64_t zc_nvlist_conf; /* really (char *) */ uint64_t zc_nvlist_conf_size; uint64_t zc_nvlist_src; /* really (char *) */ uint64_t zc_nvlist_src_size; uint64_t zc_nvlist_dst; /* really (char *) */ uint64_t zc_nvlist_dst_size; uint64_t zc_cookie; uint64_t zc_objset_type; uint64_t zc_perm_action; uint64_t zc_history; /* really (char *) */ uint64_t zc_history_len; uint64_t zc_history_offset; uint64_t zc_obj; zfs_share_t zc_share; uint64_t zc_jailid; dmu_objset_stats_t zc_objset_stats; struct drr_begin zc_begin_record; zinject_record_v15_t zc_inject_record; } zfs_cmd_v15_t; typedef struct zinject_record_v28 { uint64_t zi_objset; uint64_t zi_object; uint64_t zi_start; uint64_t zi_end; uint64_t zi_guid; uint32_t zi_level; uint32_t zi_error; uint64_t zi_type; uint32_t zi_freq; uint32_t zi_failfast; char zi_func[MAXNAMELEN]; uint32_t zi_iotype; int32_t zi_duration; uint64_t zi_timer; } zinject_record_v28_t; typedef struct zfs_cmd_v28 { char zc_name[MAXPATHLEN]; char zc_value[MAXPATHLEN * 2]; char zc_string[MAXNAMELEN]; char zc_top_ds[MAXPATHLEN]; uint64_t zc_guid; uint64_t zc_nvlist_conf; /* really (char *) */ uint64_t zc_nvlist_conf_size; uint64_t zc_nvlist_src; /* really (char *) */ uint64_t zc_nvlist_src_size; uint64_t zc_nvlist_dst; /* really (char *) */ uint64_t zc_nvlist_dst_size; uint64_t zc_cookie; uint64_t zc_objset_type; uint64_t zc_perm_action; uint64_t zc_history; /* really (char *) */ uint64_t zc_history_len; uint64_t zc_history_offset; uint64_t zc_obj; uint64_t zc_iflags; /* internal to zfs(7fs) */ zfs_share_t zc_share; uint64_t zc_jailid; dmu_objset_stats_t zc_objset_stats; struct drr_begin zc_begin_record; zinject_record_v28_t zc_inject_record; boolean_t zc_defer_destroy; boolean_t zc_temphold; uint64_t zc_action_handle; int zc_cleanup_fd; uint8_t zc_simple; uint8_t zc_pad[3]; /* alignment */ uint64_t zc_sendobj; uint64_t zc_fromobj; uint64_t zc_createtxg; zfs_stat_t zc_stat; } zfs_cmd_v28_t; +typedef struct zinject_record_deadman { + uint64_t zi_objset; + uint64_t zi_object; + uint64_t zi_start; + uint64_t zi_end; + uint64_t zi_guid; + uint32_t zi_level; + uint32_t zi_error; + uint64_t zi_type; + uint32_t zi_freq; + uint32_t zi_failfast; + char zi_func[MAXNAMELEN]; + uint32_t zi_iotype; + int32_t zi_duration; + uint64_t zi_timer; + uint32_t zi_cmd; + uint32_t zi_pad; +} zinject_record_deadman_t; + typedef struct zfs_cmd_deadman { char zc_name[MAXPATHLEN]; char zc_value[MAXPATHLEN * 2]; char zc_string[MAXNAMELEN]; char zc_top_ds[MAXPATHLEN]; uint64_t zc_guid; uint64_t zc_nvlist_conf; /* really (char *) */ uint64_t zc_nvlist_conf_size; uint64_t zc_nvlist_src; /* really (char *) */ uint64_t zc_nvlist_src_size; uint64_t zc_nvlist_dst; /* really (char *) */ uint64_t zc_nvlist_dst_size; uint64_t zc_cookie; uint64_t zc_objset_type; uint64_t zc_perm_action; uint64_t zc_history; /* really (char *) */ uint64_t zc_history_len; uint64_t zc_history_offset; uint64_t zc_obj; uint64_t zc_iflags; /* internal to zfs(7fs) */ zfs_share_t zc_share; uint64_t zc_jailid; dmu_objset_stats_t zc_objset_stats; struct drr_begin zc_begin_record; /* zc_inject_record doesn't change in libzfs_core */ - zinject_record_t zc_inject_record; + zinject_record_deadman_t zc_inject_record; boolean_t zc_defer_destroy; boolean_t zc_temphold; uint64_t zc_action_handle; int zc_cleanup_fd; uint8_t zc_simple; uint8_t zc_pad[3]; /* alignment */ uint64_t zc_sendobj; uint64_t zc_fromobj; uint64_t zc_createtxg; zfs_stat_t zc_stat; } zfs_cmd_deadman_t; typedef struct zfs_cmd_zcmd { char zc_name[MAXPATHLEN]; /* name of pool or dataset */ uint64_t zc_nvlist_src; /* really (char *) */ uint64_t zc_nvlist_src_size; uint64_t zc_nvlist_dst; /* really (char *) */ uint64_t zc_nvlist_dst_size; boolean_t zc_nvlist_dst_filled; /* put an nvlist in dst? */ int zc_pad2; /* * The following members are for legacy ioctls which haven't been * converted to the new method. */ uint64_t zc_history; /* really (char *) */ char zc_value[MAXPATHLEN * 2]; char zc_string[MAXNAMELEN]; uint64_t zc_guid; uint64_t zc_nvlist_conf; /* really (char *) */ uint64_t zc_nvlist_conf_size; uint64_t zc_cookie; uint64_t zc_objset_type; uint64_t zc_perm_action; uint64_t zc_history_len; uint64_t zc_history_offset; uint64_t zc_obj; uint64_t zc_iflags; /* internal to zfs(7fs) */ zfs_share_t zc_share; uint64_t zc_jailid; dmu_objset_stats_t zc_objset_stats; struct drr_begin zc_begin_record; - zinject_record_t zc_inject_record; + zinject_record_deadman_t zc_inject_record; boolean_t zc_defer_destroy; boolean_t zc_temphold; uint64_t zc_action_handle; int zc_cleanup_fd; uint8_t zc_simple; uint8_t zc_pad[3]; /* alignment */ uint64_t zc_sendobj; uint64_t zc_fromobj; uint64_t zc_createtxg; zfs_stat_t zc_stat; } zfs_cmd_zcmd_t; typedef struct zfs_cmd_edbp { char zc_name[MAXPATHLEN]; /* name of pool or dataset */ uint64_t zc_nvlist_src; /* really (char *) */ uint64_t zc_nvlist_src_size; uint64_t zc_nvlist_dst; /* really (char *) */ uint64_t zc_nvlist_dst_size; boolean_t zc_nvlist_dst_filled; /* put an nvlist in dst? */ int zc_pad2; /* * The following members are for legacy ioctls which haven't been * converted to the new method. */ uint64_t zc_history; /* really (char *) */ char zc_value[MAXPATHLEN * 2]; char zc_string[MAXNAMELEN]; uint64_t zc_guid; uint64_t zc_nvlist_conf; /* really (char *) */ uint64_t zc_nvlist_conf_size; uint64_t zc_cookie; uint64_t zc_objset_type; uint64_t zc_perm_action; uint64_t zc_history_len; uint64_t zc_history_offset; uint64_t zc_obj; uint64_t zc_iflags; /* internal to zfs(7fs) */ zfs_share_t zc_share; uint64_t zc_jailid; dmu_objset_stats_t zc_objset_stats; struct drr_begin zc_begin_record; - zinject_record_t zc_inject_record; + zinject_record_deadman_t zc_inject_record; uint32_t zc_defer_destroy; uint32_t zc_flags; uint64_t zc_action_handle; int zc_cleanup_fd; uint8_t zc_simple; uint8_t zc_pad[3]; /* alignment */ uint64_t zc_sendobj; uint64_t zc_fromobj; uint64_t zc_createtxg; zfs_stat_t zc_stat; } zfs_cmd_edbp_t; + +typedef struct zfs_cmd_resume { + char zc_name[MAXPATHLEN]; /* name of pool or dataset */ + uint64_t zc_nvlist_src; /* really (char *) */ + uint64_t zc_nvlist_src_size; + uint64_t zc_nvlist_dst; /* really (char *) */ + uint64_t zc_nvlist_dst_size; + boolean_t zc_nvlist_dst_filled; /* put an nvlist in dst? */ + int zc_pad2; + + /* + * The following members are for legacy ioctls which haven't been + * converted to the new method. + */ + uint64_t zc_history; /* really (char *) */ + char zc_value[MAXPATHLEN * 2]; + char zc_string[MAXNAMELEN]; + uint64_t zc_guid; + uint64_t zc_nvlist_conf; /* really (char *) */ + uint64_t zc_nvlist_conf_size; + uint64_t zc_cookie; + uint64_t zc_objset_type; + uint64_t zc_perm_action; + uint64_t zc_history_len; + uint64_t zc_history_offset; + uint64_t zc_obj; + uint64_t zc_iflags; /* internal to zfs(7fs) */ + zfs_share_t zc_share; + uint64_t zc_jailid; + dmu_objset_stats_t zc_objset_stats; + dmu_replay_record_t zc_begin_record; + zinject_record_deadman_t zc_inject_record; + uint32_t zc_defer_destroy; + uint32_t zc_flags; + uint64_t zc_action_handle; + int zc_cleanup_fd; + uint8_t zc_simple; + boolean_t zc_resumable; + uint64_t zc_sendobj; + uint64_t zc_fromobj; + uint64_t zc_createtxg; + zfs_stat_t zc_stat; +} zfs_cmd_resume_t; #ifdef _KERNEL unsigned static long zfs_ioctl_v15_to_v28[] = { 0, /* 0 ZFS_IOC_POOL_CREATE */ 1, /* 1 ZFS_IOC_POOL_DESTROY */ 2, /* 2 ZFS_IOC_POOL_IMPORT */ 3, /* 3 ZFS_IOC_POOL_EXPORT */ 4, /* 4 ZFS_IOC_POOL_CONFIGS */ 5, /* 5 ZFS_IOC_POOL_STATS */ 6, /* 6 ZFS_IOC_POOL_TRYIMPORT */ 7, /* 7 ZFS_IOC_POOL_SCRUB */ 8, /* 8 ZFS_IOC_POOL_FREEZE */ 9, /* 9 ZFS_IOC_POOL_UPGRADE */ 10, /* 10 ZFS_IOC_POOL_GET_HISTORY */ 11, /* 11 ZFS_IOC_VDEV_ADD */ 12, /* 12 ZFS_IOC_VDEV_REMOVE */ 13, /* 13 ZFS_IOC_VDEV_SET_STATE */ 14, /* 14 ZFS_IOC_VDEV_ATTACH */ 15, /* 15 ZFS_IOC_VDEV_DETACH */ 16, /* 16 ZFS_IOC_VDEV_SETPATH */ 18, /* 17 ZFS_IOC_OBJSET_STATS */ 19, /* 18 ZFS_IOC_OBJSET_ZPLPROPS */ 20, /* 19 ZFS_IOC_DATASET_LIST_NEXT */ 21, /* 20 ZFS_IOC_SNAPSHOT_LIST_NEXT */ 22, /* 21 ZFS_IOC_SET_PROP */ ZFS_IOC_COMPAT_PASS, /* 22 ZFS_IOC_CREATE_MINOR */ ZFS_IOC_COMPAT_PASS, /* 23 ZFS_IOC_REMOVE_MINOR */ 23, /* 24 ZFS_IOC_CREATE */ 24, /* 25 ZFS_IOC_DESTROY */ 25, /* 26 ZFS_IOC_ROLLBACK */ 26, /* 27 ZFS_IOC_RENAME */ 27, /* 28 ZFS_IOC_RECV */ 28, /* 29 ZFS_IOC_SEND */ 29, /* 30 ZFS_IOC_INJECT_FAULT */ 30, /* 31 ZFS_IOC_CLEAR_FAULT */ 31, /* 32 ZFS_IOC_INJECT_LIST_NEXT */ 32, /* 33 ZFS_IOC_ERROR_LOG */ 33, /* 34 ZFS_IOC_CLEAR */ 34, /* 35 ZFS_IOC_PROMOTE */ 35, /* 36 ZFS_IOC_DESTROY_SNAPS */ 36, /* 37 ZFS_IOC_SNAPSHOT */ 37, /* 38 ZFS_IOC_DSOBJ_TO_DSNAME */ 38, /* 39 ZFS_IOC_OBJ_TO_PATH */ 39, /* 40 ZFS_IOC_POOL_SET_PROPS */ 40, /* 41 ZFS_IOC_POOL_GET_PROPS */ 41, /* 42 ZFS_IOC_SET_FSACL */ 42, /* 43 ZFS_IOC_GET_FSACL */ ZFS_IOC_COMPAT_PASS, /* 44 ZFS_IOC_ISCSI_PERM_CHECK */ 43, /* 45 ZFS_IOC_SHARE */ 44, /* 46 ZFS_IOC_IHNERIT_PROP */ 58, /* 47 ZFS_IOC_JAIL */ 59, /* 48 ZFS_IOC_UNJAIL */ 45, /* 49 ZFS_IOC_SMB_ACL */ 46, /* 50 ZFS_IOC_USERSPACE_ONE */ 47, /* 51 ZFS_IOC_USERSPACE_MANY */ 48, /* 52 ZFS_IOC_USERSPACE_UPGRADE */ 17, /* 53 ZFS_IOC_SETFRU */ }; #else /* KERNEL */ unsigned static long zfs_ioctl_v28_to_v15[] = { 0, /* 0 ZFS_IOC_POOL_CREATE */ 1, /* 1 ZFS_IOC_POOL_DESTROY */ 2, /* 2 ZFS_IOC_POOL_IMPORT */ 3, /* 3 ZFS_IOC_POOL_EXPORT */ 4, /* 4 ZFS_IOC_POOL_CONFIGS */ 5, /* 5 ZFS_IOC_POOL_STATS */ 6, /* 6 ZFS_IOC_POOL_TRYIMPORT */ 7, /* 7 ZFS_IOC_POOL_SCAN */ 8, /* 8 ZFS_IOC_POOL_FREEZE */ 9, /* 9 ZFS_IOC_POOL_UPGRADE */ 10, /* 10 ZFS_IOC_POOL_GET_HISTORY */ 11, /* 11 ZFS_IOC_VDEV_ADD */ 12, /* 12 ZFS_IOC_VDEV_REMOVE */ 13, /* 13 ZFS_IOC_VDEV_SET_STATE */ 14, /* 14 ZFS_IOC_VDEV_ATTACH */ 15, /* 15 ZFS_IOC_VDEV_DETACH */ 16, /* 16 ZFS_IOC_VDEV_SETPATH */ 53, /* 17 ZFS_IOC_VDEV_SETFRU */ 17, /* 18 ZFS_IOC_OBJSET_STATS */ 18, /* 19 ZFS_IOC_OBJSET_ZPLPROPS */ 19, /* 20 ZFS_IOC_DATASET_LIST_NEXT */ 20, /* 21 ZFS_IOC_SNAPSHOT_LIST_NEXT */ 21, /* 22 ZFS_IOC_SET_PROP */ 24, /* 23 ZFS_IOC_CREATE */ 25, /* 24 ZFS_IOC_DESTROY */ 26, /* 25 ZFS_IOC_ROLLBACK */ 27, /* 26 ZFS_IOC_RENAME */ 28, /* 27 ZFS_IOC_RECV */ 29, /* 28 ZFS_IOC_SEND */ 30, /* 39 ZFS_IOC_INJECT_FAULT */ 31, /* 30 ZFS_IOC_CLEAR_FAULT */ 32, /* 31 ZFS_IOC_INJECT_LIST_NEXT */ 33, /* 32 ZFS_IOC_ERROR_LOG */ 34, /* 33 ZFS_IOC_CLEAR */ 35, /* 34 ZFS_IOC_PROMOTE */ 36, /* 35 ZFS_IOC_DESTROY_SNAPS */ 37, /* 36 ZFS_IOC_SNAPSHOT */ 38, /* 37 ZFS_IOC_DSOBJ_TO_DSNAME */ 39, /* 38 ZFS_IOC_OBJ_TO_PATH */ 40, /* 39 ZFS_IOC_POOL_SET_PROPS */ 41, /* 40 ZFS_IOC_POOL_GET_PROPS */ 42, /* 41 ZFS_IOC_SET_FSACL */ 43, /* 42 ZFS_IOC_GET_FSACL */ 45, /* 43 ZFS_IOC_SHARE */ 46, /* 44 ZFS_IOC_IHNERIT_PROP */ 49, /* 45 ZFS_IOC_SMB_ACL */ 50, /* 46 ZFS_IOC_USERSPACE_ONE */ 51, /* 47 ZFS_IOC_USERSPACE_MANY */ 52, /* 48 ZFS_IOC_USERSPACE_UPGRADE */ ZFS_IOC_COMPAT_FAIL, /* 49 ZFS_IOC_HOLD */ ZFS_IOC_COMPAT_FAIL, /* 50 ZFS_IOC_RELEASE */ ZFS_IOC_COMPAT_FAIL, /* 51 ZFS_IOC_GET_HOLDS */ ZFS_IOC_COMPAT_FAIL, /* 52 ZFS_IOC_OBJSET_RECVD_PROPS */ ZFS_IOC_COMPAT_FAIL, /* 53 ZFS_IOC_VDEV_SPLIT */ ZFS_IOC_COMPAT_FAIL, /* 54 ZFS_IOC_NEXT_OBJ */ ZFS_IOC_COMPAT_FAIL, /* 55 ZFS_IOC_DIFF */ ZFS_IOC_COMPAT_FAIL, /* 56 ZFS_IOC_TMP_SNAPSHOT */ ZFS_IOC_COMPAT_FAIL, /* 57 ZFS_IOC_OBJ_TO_STATS */ 47, /* 58 ZFS_IOC_JAIL */ 48, /* 59 ZFS_IOC_UNJAIL */ }; #endif /* ! _KERNEL */ #ifdef _KERNEL int zfs_ioctl_compat_pre(zfs_cmd_t *, int *, const int); void zfs_ioctl_compat_post(zfs_cmd_t *, const int, const int); nvlist_t *zfs_ioctl_compat_innvl(zfs_cmd_t *, nvlist_t *, const int, const int); nvlist_t *zfs_ioctl_compat_outnvl(zfs_cmd_t *, nvlist_t *, const int, const int); #else int zcmd_ioctl_compat(int, int, zfs_cmd_t *, const int); #endif /* _KERNEL */ void zfs_cmd_compat_get(zfs_cmd_t *, caddr_t, const int); void zfs_cmd_compat_put(zfs_cmd_t *, caddr_t, const int, const int); #ifdef __cplusplus } #endif #endif /* _SYS_ZFS_IOCTL_COMPAT_H */ Index: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h =================================================================== --- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h (revision 297107) +++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h (revision 297108) @@ -1,149 +1,147 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2013 by Delphix. All rights reserved. */ #ifndef _SYS_ZFS_CONTEXT_H #define _SYS_ZFS_CONTEXT_H #ifdef __cplusplus extern "C" { #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef illumos #include -#include -#else /* FreeBSD */ -#include #endif +#include #include #include #include #include #include #include #include /* There is clash. vm_map.h defines the two below and vdev_cache.c use them. */ #ifdef min_offset #undef min_offset #endif #ifdef max_offset #undef max_offset #endif #include #include #define CPU_SEQID (curcpu) #define tsd_create(keyp, destructor) do { \ *(keyp) = osd_thread_register((destructor)); \ KASSERT(*(keyp) > 0, ("cannot register OSD")); \ } while (0) #define tsd_destroy(keyp) osd_thread_deregister(*(keyp)) #define tsd_get(key) osd_thread_get(curthread, (key)) #define tsd_set(key, value) osd_thread_set(curthread, (key), (value)) #ifdef __cplusplus } #endif extern int zfs_debug_level; extern struct mtx zfs_debug_mtx; #define ZFS_LOG(lvl, ...) do { \ if (((lvl) & 0xff) <= zfs_debug_level) { \ mtx_lock(&zfs_debug_mtx); \ printf("%s:%u[%d]: ", __func__, __LINE__, (lvl)); \ printf(__VA_ARGS__); \ printf("\n"); \ if ((lvl) & 0x100) \ kdb_backtrace(); \ mtx_unlock(&zfs_debug_mtx); \ } \ } while (0) #define sys_shutdown rebooting #endif /* _SYS_ZFS_CONTEXT_H */ Index: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h =================================================================== --- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h (revision 297107) +++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h (revision 297108) @@ -1,445 +1,446 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2015 by Delphix. All rights reserved. */ #ifndef _SYS_ZFS_IOCTL_H #define _SYS_ZFS_IOCTL_H #include #include #include #include #include #include #ifdef _KERNEL #include #endif /* _KERNEL */ #ifdef __cplusplus extern "C" { #endif /* * The structures in this file are passed between userland and the * kernel. Userland may be running a 32-bit process, while the kernel * is 64-bit. Therefore, these structures need to compile the same in * 32-bit and 64-bit. This means not using type "long", and adding * explicit padding so that the 32-bit structure will not be packed more * tightly than the 64-bit structure (which requires 64-bit alignment). */ /* * Property values for snapdir */ #define ZFS_SNAPDIR_HIDDEN 0 #define ZFS_SNAPDIR_VISIBLE 1 /* * Field manipulation macros for the drr_versioninfo field of the * send stream header. */ /* * Header types for zfs send streams. */ typedef enum drr_headertype { DMU_SUBSTREAM = 0x1, DMU_COMPOUNDSTREAM = 0x2 } drr_headertype_t; #define DMU_GET_STREAM_HDRTYPE(vi) BF64_GET((vi), 0, 2) #define DMU_SET_STREAM_HDRTYPE(vi, x) BF64_SET((vi), 0, 2, x) #define DMU_GET_FEATUREFLAGS(vi) BF64_GET((vi), 2, 30) #define DMU_SET_FEATUREFLAGS(vi, x) BF64_SET((vi), 2, 30, x) /* * Feature flags for zfs send streams (flags in drr_versioninfo) */ #define DMU_BACKUP_FEATURE_DEDUP (1 << 0) #define DMU_BACKUP_FEATURE_DEDUPPROPS (1 << 1) #define DMU_BACKUP_FEATURE_SA_SPILL (1 << 2) /* flags #3 - #15 are reserved for incompatible closed-source implementations */ #define DMU_BACKUP_FEATURE_EMBED_DATA (1 << 16) #define DMU_BACKUP_FEATURE_EMBED_DATA_LZ4 (1 << 17) /* flag #18 is reserved for a Delphix feature */ #define DMU_BACKUP_FEATURE_LARGE_BLOCKS (1 << 19) #define DMU_BACKUP_FEATURE_RESUMING (1 << 20) /* * Mask of all supported backup features */ #define DMU_BACKUP_FEATURE_MASK (DMU_BACKUP_FEATURE_DEDUP | \ DMU_BACKUP_FEATURE_DEDUPPROPS | DMU_BACKUP_FEATURE_SA_SPILL | \ DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_EMBED_DATA_LZ4 | \ DMU_BACKUP_FEATURE_RESUMING | \ DMU_BACKUP_FEATURE_LARGE_BLOCKS) /* Are all features in the given flag word currently supported? */ #define DMU_STREAM_SUPPORTED(x) (!((x) & ~DMU_BACKUP_FEATURE_MASK)) typedef enum dmu_send_resume_token_version { ZFS_SEND_RESUME_TOKEN_VERSION = 1 } dmu_send_resume_token_version_t; /* * The drr_versioninfo field of the dmu_replay_record has the * following layout: * * 64 56 48 40 32 24 16 8 0 * +-------+-------+-------+-------+-------+-------+-------+-------+ * | reserved | feature-flags |C|S| * +-------+-------+-------+-------+-------+-------+-------+-------+ * * The low order two bits indicate the header type: SUBSTREAM (0x1) * or COMPOUNDSTREAM (0x2). Using two bits for this is historical: * this field used to be a version number, where the two version types * were 1 and 2. Using two bits for this allows earlier versions of * the code to be able to recognize send streams that don't use any * of the features indicated by feature flags. */ #define DMU_BACKUP_MAGIC 0x2F5bacbacULL #define DRR_FLAG_CLONE (1<<0) #define DRR_FLAG_CI_DATA (1<<1) /* * This send stream, if it is a full send, includes the FREE and FREEOBJECT * records that are created by the sending process. This means that the send * stream can be received as a clone, even though it is not an incremental. * This is not implemented as a feature flag, because the receiving side does * not need to have implemented it to receive this stream; it is fully backwards * compatible. We need a flag, though, because full send streams without it * cannot necessarily be received as a clone correctly. */ #define DRR_FLAG_FREERECORDS (1<<2) /* * flags in the drr_checksumflags field in the DRR_WRITE and * DRR_WRITE_BYREF blocks */ #define DRR_CHECKSUM_DEDUP (1<<0) #define DRR_IS_DEDUP_CAPABLE(flags) ((flags) & DRR_CHECKSUM_DEDUP) /* * zfs ioctl command structure */ struct drr_begin { uint64_t drr_magic; uint64_t drr_versioninfo; /* was drr_version */ uint64_t drr_creation_time; dmu_objset_type_t drr_type; uint32_t drr_flags; uint64_t drr_toguid; uint64_t drr_fromguid; char drr_toname[MAXNAMELEN]; }; struct drr_end { zio_cksum_t drr_checksum; uint64_t drr_toguid; }; struct drr_object { uint64_t drr_object; dmu_object_type_t drr_type; dmu_object_type_t drr_bonustype; uint32_t drr_blksz; uint32_t drr_bonuslen; uint8_t drr_checksumtype; uint8_t drr_compress; uint8_t drr_pad[6]; uint64_t drr_toguid; /* bonus content follows */ }; struct drr_freeobjects { uint64_t drr_firstobj; uint64_t drr_numobjs; uint64_t drr_toguid; }; struct drr_write { uint64_t drr_object; dmu_object_type_t drr_type; uint32_t drr_pad; uint64_t drr_offset; uint64_t drr_length; uint64_t drr_toguid; uint8_t drr_checksumtype; uint8_t drr_checksumflags; uint8_t drr_pad2[6]; ddt_key_t drr_key; /* deduplication key */ /* content follows */ }; struct drr_free { uint64_t drr_object; uint64_t drr_offset; uint64_t drr_length; uint64_t drr_toguid; }; struct drr_write_byref { /* where to put the data */ uint64_t drr_object; uint64_t drr_offset; uint64_t drr_length; uint64_t drr_toguid; /* where to find the prior copy of the data */ uint64_t drr_refguid; uint64_t drr_refobject; uint64_t drr_refoffset; /* properties of the data */ uint8_t drr_checksumtype; uint8_t drr_checksumflags; uint8_t drr_pad2[6]; ddt_key_t drr_key; /* deduplication key */ }; struct drr_spill { uint64_t drr_object; uint64_t drr_length; uint64_t drr_toguid; uint64_t drr_pad[4]; /* needed for crypto */ /* spill data follows */ }; typedef struct dmu_replay_record { enum { DRR_BEGIN, DRR_OBJECT, DRR_FREEOBJECTS, DRR_WRITE, DRR_FREE, DRR_END, DRR_WRITE_BYREF, DRR_SPILL, DRR_WRITE_EMBEDDED, DRR_NUMTYPES } drr_type; uint32_t drr_payloadlen; union { struct drr_begin drr_begin; struct drr_end drr_end; struct drr_object drr_object; struct drr_freeobjects drr_freeobjects; struct drr_write drr_write; struct drr_free drr_free; struct drr_write_byref drr_write_byref; struct drr_spill drr_spill; struct drr_write_embedded { uint64_t drr_object; uint64_t drr_offset; /* logical length, should equal blocksize */ uint64_t drr_length; uint64_t drr_toguid; uint8_t drr_compression; uint8_t drr_etype; uint8_t drr_pad[6]; uint32_t drr_lsize; /* uncompressed size of payload */ uint32_t drr_psize; /* compr. (real) size of payload */ /* (possibly compressed) content follows */ } drr_write_embedded; /* * Nore: drr_checksum is overlaid with all record types * except DRR_BEGIN. Therefore its (non-pad) members * must not overlap with members from the other structs. * We accomplish this by putting its members at the very * end of the struct. */ struct drr_checksum { uint64_t drr_pad[34]; /* * fletcher-4 checksum of everything preceding the * checksum. */ zio_cksum_t drr_checksum; } drr_checksum; } drr_u; } dmu_replay_record_t; /* diff record range types */ typedef enum diff_type { DDR_NONE = 0x1, DDR_INUSE = 0x2, DDR_FREE = 0x4 } diff_type_t; /* * The diff reports back ranges of free or in-use objects. */ typedef struct dmu_diff_record { uint64_t ddr_type; uint64_t ddr_first; uint64_t ddr_last; } dmu_diff_record_t; typedef struct zinject_record { uint64_t zi_objset; uint64_t zi_object; uint64_t zi_start; uint64_t zi_end; uint64_t zi_guid; uint32_t zi_level; uint32_t zi_error; uint64_t zi_type; uint32_t zi_freq; uint32_t zi_failfast; char zi_func[MAXNAMELEN]; uint32_t zi_iotype; int32_t zi_duration; uint64_t zi_timer; + uint64_t zi_nlanes; uint32_t zi_cmd; uint32_t zi_pad; } zinject_record_t; #define ZINJECT_NULL 0x1 #define ZINJECT_FLUSH_ARC 0x2 #define ZINJECT_UNLOAD_SPA 0x4 typedef enum zinject_type { ZINJECT_UNINITIALIZED, ZINJECT_DATA_FAULT, ZINJECT_DEVICE_FAULT, ZINJECT_LABEL_FAULT, ZINJECT_IGNORED_WRITES, ZINJECT_PANIC, ZINJECT_DELAY_IO, } zinject_type_t; typedef struct zfs_share { uint64_t z_exportdata; uint64_t z_sharedata; uint64_t z_sharetype; /* 0 = share, 1 = unshare */ uint64_t z_sharemax; /* max length of share string */ } zfs_share_t; /* * ZFS file systems may behave the usual, POSIX-compliant way, where * name lookups are case-sensitive. They may also be set up so that * all the name lookups are case-insensitive, or so that only some * lookups, the ones that set an FIGNORECASE flag, are case-insensitive. */ typedef enum zfs_case { ZFS_CASE_SENSITIVE, ZFS_CASE_INSENSITIVE, ZFS_CASE_MIXED } zfs_case_t; typedef struct zfs_cmd { char zc_name[MAXPATHLEN]; /* name of pool or dataset */ uint64_t zc_nvlist_src; /* really (char *) */ uint64_t zc_nvlist_src_size; uint64_t zc_nvlist_dst; /* really (char *) */ uint64_t zc_nvlist_dst_size; boolean_t zc_nvlist_dst_filled; /* put an nvlist in dst? */ int zc_pad2; /* * The following members are for legacy ioctls which haven't been * converted to the new method. */ uint64_t zc_history; /* really (char *) */ char zc_value[MAXPATHLEN * 2]; char zc_string[MAXNAMELEN]; uint64_t zc_guid; uint64_t zc_nvlist_conf; /* really (char *) */ uint64_t zc_nvlist_conf_size; uint64_t zc_cookie; uint64_t zc_objset_type; uint64_t zc_perm_action; uint64_t zc_history_len; uint64_t zc_history_offset; uint64_t zc_obj; uint64_t zc_iflags; /* internal to zfs(7fs) */ zfs_share_t zc_share; uint64_t zc_jailid; dmu_objset_stats_t zc_objset_stats; dmu_replay_record_t zc_begin_record; zinject_record_t zc_inject_record; uint32_t zc_defer_destroy; uint32_t zc_flags; uint64_t zc_action_handle; int zc_cleanup_fd; uint8_t zc_simple; boolean_t zc_resumable; uint64_t zc_sendobj; uint64_t zc_fromobj; uint64_t zc_createtxg; zfs_stat_t zc_stat; } zfs_cmd_t; typedef struct zfs_useracct { char zu_domain[256]; uid_t zu_rid; uint32_t zu_pad; uint64_t zu_space; } zfs_useracct_t; #define ZFSDEV_MAX_MINOR (1 << 16) #define ZFS_MIN_MINOR (ZFSDEV_MAX_MINOR + 1) #define ZPOOL_EXPORT_AFTER_SPLIT 0x1 #ifdef _KERNEL typedef struct zfs_creat { nvlist_t *zct_zplprops; nvlist_t *zct_props; } zfs_creat_t; extern int zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr); extern int zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr); extern int zfs_secpolicy_destroy_perms(const char *name, cred_t *cr); extern int zfs_busy(void); extern int zfs_unmount_snap(const char *); extern void zfs_destroy_unmount_origin(const char *); /* * ZFS minor numbers can refer to either a control device instance or * a zvol. Depending on the value of zss_type, zss_data points to either * a zvol_state_t or a zfs_onexit_t. */ enum zfs_soft_state_type { ZSST_ZVOL, ZSST_CTLDEV }; typedef struct zfs_soft_state { enum zfs_soft_state_type zss_type; void *zss_data; } zfs_soft_state_t; extern void *zfsdev_get_soft_state(minor_t minor, enum zfs_soft_state_type which); extern minor_t zfsdev_minor_alloc(void); extern void *zfsdev_state; #endif /* _KERNEL */ #ifdef __cplusplus } #endif #endif /* _SYS_ZFS_IOCTL_H */ Index: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h =================================================================== --- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h (revision 297107) +++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h (revision 297108) @@ -1,643 +1,646 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012, 2014 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. */ #ifndef _ZIO_H #define _ZIO_H #include #include #include #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif /* * Embedded checksum */ #define ZEC_MAGIC 0x210da7ab10c7a11ULL typedef struct zio_eck { uint64_t zec_magic; /* for validation, endianness */ zio_cksum_t zec_cksum; /* 256-bit checksum */ } zio_eck_t; /* * Gang block headers are self-checksumming and contain an array * of block pointers. */ #define SPA_GANGBLOCKSIZE SPA_MINBLOCKSIZE #define SPA_GBH_NBLKPTRS ((SPA_GANGBLOCKSIZE - \ sizeof (zio_eck_t)) / sizeof (blkptr_t)) #define SPA_GBH_FILLER ((SPA_GANGBLOCKSIZE - \ sizeof (zio_eck_t) - \ (SPA_GBH_NBLKPTRS * sizeof (blkptr_t))) /\ sizeof (uint64_t)) typedef struct zio_gbh { blkptr_t zg_blkptr[SPA_GBH_NBLKPTRS]; uint64_t zg_filler[SPA_GBH_FILLER]; zio_eck_t zg_tail; } zio_gbh_phys_t; enum zio_checksum { ZIO_CHECKSUM_INHERIT = 0, ZIO_CHECKSUM_ON, ZIO_CHECKSUM_OFF, ZIO_CHECKSUM_LABEL, ZIO_CHECKSUM_GANG_HEADER, ZIO_CHECKSUM_ZILOG, ZIO_CHECKSUM_FLETCHER_2, ZIO_CHECKSUM_FLETCHER_4, ZIO_CHECKSUM_SHA256, ZIO_CHECKSUM_ZILOG2, ZIO_CHECKSUM_NOPARITY, #ifdef illumos ZIO_CHECKSUM_SHA512, ZIO_CHECKSUM_SKEIN, ZIO_CHECKSUM_EDONR, #endif ZIO_CHECKSUM_FUNCTIONS }; /* * The number of "legacy" compression functions which can be set on individual * objects. */ #define ZIO_CHECKSUM_LEGACY_FUNCTIONS ZIO_CHECKSUM_ZILOG2 #define ZIO_CHECKSUM_ON_VALUE ZIO_CHECKSUM_FLETCHER_4 #define ZIO_CHECKSUM_DEFAULT ZIO_CHECKSUM_ON #define ZIO_CHECKSUM_MASK 0xffULL #define ZIO_CHECKSUM_VERIFY (1 << 8) #define ZIO_DEDUPCHECKSUM ZIO_CHECKSUM_SHA256 #define ZIO_DEDUPDITTO_MIN 100 enum zio_compress { ZIO_COMPRESS_INHERIT = 0, ZIO_COMPRESS_ON, ZIO_COMPRESS_OFF, ZIO_COMPRESS_LZJB, ZIO_COMPRESS_EMPTY, ZIO_COMPRESS_GZIP_1, ZIO_COMPRESS_GZIP_2, ZIO_COMPRESS_GZIP_3, ZIO_COMPRESS_GZIP_4, ZIO_COMPRESS_GZIP_5, ZIO_COMPRESS_GZIP_6, ZIO_COMPRESS_GZIP_7, ZIO_COMPRESS_GZIP_8, ZIO_COMPRESS_GZIP_9, ZIO_COMPRESS_ZLE, ZIO_COMPRESS_LZ4, ZIO_COMPRESS_FUNCTIONS }; /* * The number of "legacy" compression functions which can be set on individual * objects. */ #define ZIO_COMPRESS_LEGACY_FUNCTIONS ZIO_COMPRESS_LZ4 /* * The meaning of "compress = on" selected by the compression features enabled * on a given pool. */ #define ZIO_COMPRESS_LEGACY_ON_VALUE ZIO_COMPRESS_LZJB #define ZIO_COMPRESS_LZ4_ON_VALUE ZIO_COMPRESS_LZ4 #define ZIO_COMPRESS_DEFAULT ZIO_COMPRESS_OFF #define BOOTFS_COMPRESS_VALID(compress) \ ((compress) == ZIO_COMPRESS_LZJB || \ (compress) == ZIO_COMPRESS_LZ4 || \ (compress) == ZIO_COMPRESS_ON || \ (compress) == ZIO_COMPRESS_OFF) #define ZIO_FAILURE_MODE_WAIT 0 #define ZIO_FAILURE_MODE_CONTINUE 1 #define ZIO_FAILURE_MODE_PANIC 2 enum zio_flag { /* * Flags inherited by gang, ddt, and vdev children, * and that must be equal for two zios to aggregate */ ZIO_FLAG_DONT_AGGREGATE = 1 << 0, ZIO_FLAG_IO_REPAIR = 1 << 1, ZIO_FLAG_SELF_HEAL = 1 << 2, ZIO_FLAG_RESILVER = 1 << 3, ZIO_FLAG_SCRUB = 1 << 4, ZIO_FLAG_SCAN_THREAD = 1 << 5, ZIO_FLAG_PHYSICAL = 1 << 6, #define ZIO_FLAG_AGG_INHERIT (ZIO_FLAG_CANFAIL - 1) /* * Flags inherited by ddt, gang, and vdev children. */ ZIO_FLAG_CANFAIL = 1 << 7, /* must be first for INHERIT */ ZIO_FLAG_SPECULATIVE = 1 << 8, ZIO_FLAG_CONFIG_WRITER = 1 << 9, ZIO_FLAG_DONT_RETRY = 1 << 10, ZIO_FLAG_DONT_CACHE = 1 << 11, ZIO_FLAG_NODATA = 1 << 12, ZIO_FLAG_INDUCE_DAMAGE = 1 << 13, #define ZIO_FLAG_DDT_INHERIT (ZIO_FLAG_IO_RETRY - 1) #define ZIO_FLAG_GANG_INHERIT (ZIO_FLAG_IO_RETRY - 1) /* * Flags inherited by vdev children. */ ZIO_FLAG_IO_RETRY = 1 << 14, /* must be first for INHERIT */ ZIO_FLAG_PROBE = 1 << 15, ZIO_FLAG_TRYHARD = 1 << 16, ZIO_FLAG_OPTIONAL = 1 << 17, #define ZIO_FLAG_VDEV_INHERIT (ZIO_FLAG_DONT_QUEUE - 1) /* * Flags not inherited by any children. */ ZIO_FLAG_DONT_QUEUE = 1 << 18, /* must be first for INHERIT */ ZIO_FLAG_DONT_PROPAGATE = 1 << 19, ZIO_FLAG_IO_BYPASS = 1 << 20, ZIO_FLAG_IO_REWRITE = 1 << 21, ZIO_FLAG_RAW = 1 << 22, ZIO_FLAG_GANG_CHILD = 1 << 23, ZIO_FLAG_DDT_CHILD = 1 << 24, ZIO_FLAG_GODFATHER = 1 << 25, ZIO_FLAG_NOPWRITE = 1 << 26, ZIO_FLAG_REEXECUTED = 1 << 27, ZIO_FLAG_DELEGATED = 1 << 28, }; #define ZIO_FLAG_MUSTSUCCEED 0 #define ZIO_DDT_CHILD_FLAGS(zio) \ (((zio)->io_flags & ZIO_FLAG_DDT_INHERIT) | \ ZIO_FLAG_DDT_CHILD | ZIO_FLAG_CANFAIL) #define ZIO_GANG_CHILD_FLAGS(zio) \ (((zio)->io_flags & ZIO_FLAG_GANG_INHERIT) | \ ZIO_FLAG_GANG_CHILD | ZIO_FLAG_CANFAIL) #define ZIO_VDEV_CHILD_FLAGS(zio) \ (((zio)->io_flags & ZIO_FLAG_VDEV_INHERIT) | \ ZIO_FLAG_CANFAIL) enum zio_child { ZIO_CHILD_VDEV = 0, ZIO_CHILD_GANG, ZIO_CHILD_DDT, ZIO_CHILD_LOGICAL, ZIO_CHILD_TYPES }; enum zio_wait_type { ZIO_WAIT_READY = 0, ZIO_WAIT_DONE, ZIO_WAIT_TYPES }; /* * We'll take the number 122 and 123 to indicate checksum errors and * fragmentation. Those doesn't collide with any errno values as they * are greater than ELAST. */ #define ECKSUM 122 #define EFRAGS 123 typedef void zio_done_func_t(zio_t *zio); extern const char *zio_type_name[ZIO_TYPES]; /* * A bookmark is a four-tuple that uniquely * identifies any block in the pool. By convention, the meta-objset (MOS) * is objset 0, and the meta-dnode is object 0. This covers all blocks * except root blocks and ZIL blocks, which are defined as follows: * * Root blocks (objset_phys_t) are object 0, level -1: . * ZIL blocks are bookmarked . * dmu_sync()ed ZIL data blocks are bookmarked . * dnode visit bookmarks are . * * Note: this structure is called a bookmark because its original purpose * was to remember where to resume a pool-wide traverse. * * Note: this structure is passed between userland and the kernel, and is * stored on disk (by virtue of being incorporated into other on-disk * structures, e.g. dsl_scan_phys_t). */ typedef struct zbookmark_phys { uint64_t zb_objset; uint64_t zb_object; int64_t zb_level; uint64_t zb_blkid; } zbookmark_phys_t; #define SET_BOOKMARK(zb, objset, object, level, blkid) \ { \ (zb)->zb_objset = objset; \ (zb)->zb_object = object; \ (zb)->zb_level = level; \ (zb)->zb_blkid = blkid; \ } #define ZB_DESTROYED_OBJSET (-1ULL) #define ZB_ROOT_OBJECT (0ULL) #define ZB_ROOT_LEVEL (-1LL) #define ZB_ROOT_BLKID (0ULL) #define ZB_ZIL_OBJECT (0ULL) #define ZB_ZIL_LEVEL (-2LL) #define ZB_DNODE_LEVEL (-3LL) #define ZB_DNODE_BLKID (0ULL) #define ZB_IS_ZERO(zb) \ ((zb)->zb_objset == 0 && (zb)->zb_object == 0 && \ (zb)->zb_level == 0 && (zb)->zb_blkid == 0) #define ZB_IS_ROOT(zb) \ ((zb)->zb_object == ZB_ROOT_OBJECT && \ (zb)->zb_level == ZB_ROOT_LEVEL && \ (zb)->zb_blkid == ZB_ROOT_BLKID) typedef struct zio_prop { enum zio_checksum zp_checksum; enum zio_compress zp_compress; dmu_object_type_t zp_type; uint8_t zp_level; uint8_t zp_copies; boolean_t zp_dedup; boolean_t zp_dedup_verify; boolean_t zp_nopwrite; } zio_prop_t; typedef struct zio_cksum_report zio_cksum_report_t; typedef void zio_cksum_finish_f(zio_cksum_report_t *rep, const void *good_data); typedef void zio_cksum_free_f(void *cbdata, size_t size); struct zio_bad_cksum; /* defined in zio_checksum.h */ struct dnode_phys; struct zio_cksum_report { struct zio_cksum_report *zcr_next; nvlist_t *zcr_ereport; nvlist_t *zcr_detector; void *zcr_cbdata; size_t zcr_cbinfo; /* passed to zcr_free() */ uint64_t zcr_align; uint64_t zcr_length; zio_cksum_finish_f *zcr_finish; zio_cksum_free_f *zcr_free; /* internal use only */ struct zio_bad_cksum *zcr_ckinfo; /* information from failure */ }; typedef void zio_vsd_cksum_report_f(zio_t *zio, zio_cksum_report_t *zcr, void *arg); zio_vsd_cksum_report_f zio_vsd_default_cksum_report; typedef struct zio_vsd_ops { zio_done_func_t *vsd_free; zio_vsd_cksum_report_f *vsd_cksum_report; } zio_vsd_ops_t; typedef struct zio_gang_node { zio_gbh_phys_t *gn_gbh; struct zio_gang_node *gn_child[SPA_GBH_NBLKPTRS]; } zio_gang_node_t; typedef zio_t *zio_gang_issue_func_t(zio_t *zio, blkptr_t *bp, zio_gang_node_t *gn, void *data); typedef void zio_transform_func_t(zio_t *zio, void *data, uint64_t size); typedef struct zio_transform { void *zt_orig_data; uint64_t zt_orig_size; uint64_t zt_bufsize; zio_transform_func_t *zt_transform; struct zio_transform *zt_next; } zio_transform_t; typedef int zio_pipe_stage_t(zio_t *zio); /* * The io_reexecute flags are distinct from io_flags because the child must * be able to propagate them to the parent. The normal io_flags are local * to the zio, not protected by any lock, and not modifiable by children; * the reexecute flags are protected by io_lock, modifiable by children, * and always propagated -- even when ZIO_FLAG_DONT_PROPAGATE is set. */ #define ZIO_REEXECUTE_NOW 0x01 #define ZIO_REEXECUTE_SUSPEND 0x02 typedef struct zio_link { zio_t *zl_parent; zio_t *zl_child; list_node_t zl_parent_node; list_node_t zl_child_node; } zio_link_t; /* * Used for TRIM kstat. */ typedef struct zio_trim_stats { /* * Number of bytes successfully TRIMmed. */ kstat_named_t bytes; /* * Number of successful TRIM requests. */ kstat_named_t success; /* * Number of TRIM requests that failed because TRIM is not * supported. */ kstat_named_t unsupported; /* * Number of TRIM requests that failed for other reasons. */ kstat_named_t failed; } zio_trim_stats_t; extern zio_trim_stats_t zio_trim_stats; #define ZIO_TRIM_STAT_INCR(stat, val) \ atomic_add_64(&zio_trim_stats.stat.value.ui64, (val)); #define ZIO_TRIM_STAT_BUMP(stat) \ ZIO_TRIM_STAT_INCR(stat, 1); struct zio { /* Core information about this I/O */ zbookmark_phys_t io_bookmark; zio_prop_t io_prop; zio_type_t io_type; enum zio_child io_child_type; int io_cmd; zio_priority_t io_priority; uint8_t io_reexecute; uint8_t io_state[ZIO_WAIT_TYPES]; uint64_t io_txg; spa_t *io_spa; blkptr_t *io_bp; blkptr_t *io_bp_override; blkptr_t io_bp_copy; list_t io_parent_list; list_t io_child_list; zio_link_t *io_walk_link; zio_t *io_logical; zio_transform_t *io_transform_stack; /* Callback info */ zio_done_func_t *io_ready; zio_done_func_t *io_physdone; zio_done_func_t *io_done; void *io_private; int64_t io_prev_space_delta; /* DMU private */ blkptr_t io_bp_orig; /* Data represented by this I/O */ void *io_data; void *io_orig_data; uint64_t io_size; uint64_t io_orig_size; /* Stuff for the vdev stack */ vdev_t *io_vd; void *io_vsd; const zio_vsd_ops_t *io_vsd_ops; uint64_t io_offset; hrtime_t io_timestamp; + hrtime_t io_target_timestamp; avl_node_t io_queue_node; avl_node_t io_offset_node; /* Internal pipeline state */ enum zio_flag io_flags; enum zio_stage io_stage; enum zio_stage io_pipeline; enum zio_flag io_orig_flags; enum zio_stage io_orig_stage; enum zio_stage io_orig_pipeline; int io_error; int io_child_error[ZIO_CHILD_TYPES]; uint64_t io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES]; uint64_t io_child_count; uint64_t io_phys_children; uint64_t io_parent_count; uint64_t *io_stall; zio_t *io_gang_leader; zio_gang_node_t *io_gang_tree; void *io_executor; void *io_waiter; kmutex_t io_lock; kcondvar_t io_cv; /* FMA state */ zio_cksum_report_t *io_cksum_report; uint64_t io_ena; /* Taskq dispatching state */ taskq_ent_t io_tqent; avl_node_t io_trim_node; list_node_t io_trim_link; }; extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done, void *priv, enum zio_flag flags); extern zio_t *zio_root(spa_t *spa, zio_done_func_t *done, void *priv, enum zio_flag flags); extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, void *data, uint64_t size, zio_done_func_t *done, void *priv, zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb); extern zio_t *zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data, uint64_t size, const zio_prop_t *zp, zio_done_func_t *ready, zio_done_func_t *physdone, zio_done_func_t *done, void *priv, zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb); extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data, uint64_t size, zio_done_func_t *done, void *priv, zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb); extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite); extern void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp); extern zio_t *zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio_done_func_t *done, void *priv, enum zio_flag flags); extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, uint64_t offset, uint64_t size, zio_done_func_t *done, void *priv, zio_priority_t priority, enum zio_flag flags); extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, void *data, int checksum, zio_done_func_t *done, void *priv, zio_priority_t priority, enum zio_flag flags, boolean_t labels); extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, void *data, int checksum, zio_done_func_t *done, void *priv, zio_priority_t priority, enum zio_flag flags, boolean_t labels); extern zio_t *zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, uint64_t size, enum zio_flag flags); extern int zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp, uint64_t size, boolean_t use_slog); extern void zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp); extern void zio_flush(zio_t *zio, vdev_t *vd); extern zio_t *zio_trim(zio_t *zio, spa_t *spa, vdev_t *vd, uint64_t offset, uint64_t size); extern void zio_shrink(zio_t *zio, uint64_t size); extern int zio_wait(zio_t *zio); extern void zio_nowait(zio_t *zio); extern void zio_execute(zio_t *zio); extern void zio_interrupt(zio_t *zio); +extern void zio_delay_init(zio_t *zio); +extern void zio_delay_interrupt(zio_t *zio); extern zio_t *zio_walk_parents(zio_t *cio); extern zio_t *zio_walk_children(zio_t *pio); extern zio_t *zio_unique_parent(zio_t *cio); extern void zio_add_child(zio_t *pio, zio_t *cio); extern void *zio_buf_alloc(size_t size); extern void zio_buf_free(void *buf, size_t size); extern void *zio_data_buf_alloc(size_t size); extern void zio_data_buf_free(void *buf, size_t size); extern void zio_resubmit_stage_async(void *); extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, uint64_t offset, void *data, uint64_t size, int type, zio_priority_t priority, enum zio_flag flags, zio_done_func_t *done, void *priv); extern zio_t *zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size, int type, zio_priority_t priority, enum zio_flag flags, zio_done_func_t *done, void *priv); extern void zio_vdev_io_bypass(zio_t *zio); extern void zio_vdev_io_reissue(zio_t *zio); extern void zio_vdev_io_redone(zio_t *zio); extern void zio_checksum_verified(zio_t *zio); extern int zio_worst_error(int e1, int e2); extern enum zio_checksum zio_checksum_select(enum zio_checksum child, enum zio_checksum parent); extern enum zio_checksum zio_checksum_dedup_select(spa_t *spa, enum zio_checksum child, enum zio_checksum parent); extern enum zio_compress zio_compress_select(spa_t *spa, enum zio_compress child, enum zio_compress parent); extern void zio_suspend(spa_t *spa, zio_t *zio); extern int zio_resume(spa_t *spa); extern void zio_resume_wait(spa_t *spa); /* * Initial setup and teardown. */ extern void zio_init(void); extern void zio_fini(void); /* * Fault injection */ struct zinject_record; extern uint32_t zio_injection_enabled; extern int zio_inject_fault(char *name, int flags, int *id, struct zinject_record *record); extern int zio_inject_list_next(int *id, char *name, size_t buflen, struct zinject_record *record); extern int zio_clear_fault(int id); extern void zio_handle_panic_injection(spa_t *spa, char *tag, uint64_t type); extern int zio_handle_fault_injection(zio_t *zio, int error); extern int zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error); extern int zio_handle_label_injection(zio_t *zio, int error); extern void zio_handle_ignored_writes(zio_t *zio); -extern uint64_t zio_handle_io_delay(zio_t *zio); +extern hrtime_t zio_handle_io_delay(zio_t *zio); /* * Checksum ereport functions */ extern void zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, struct zio *zio, uint64_t offset, uint64_t length, void *arg, struct zio_bad_cksum *info); extern void zfs_ereport_finish_checksum(zio_cksum_report_t *report, const void *good_data, const void *bad_data, boolean_t drop_if_identical); extern void zfs_ereport_send_interim_checksum(zio_cksum_report_t *report); extern void zfs_ereport_free_checksum(zio_cksum_report_t *report); /* If we have the good data in hand, this function can be used */ extern void zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, struct zio *zio, uint64_t offset, uint64_t length, const void *good_data, const void *bad_data, struct zio_bad_cksum *info); /* Called from spa_sync(), but primarily an injection handler */ extern void spa_handle_ignored_writes(spa_t *spa); /* zbookmark_phys functions */ boolean_t zbookmark_subtree_completed(const struct dnode_phys *dnp, const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block); int zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, uint8_t ibs2, const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2); #ifdef __cplusplus } #endif #endif /* _ZIO_H */ Index: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c =================================================================== --- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c (revision 297107) +++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c (revision 297108) @@ -1,943 +1,944 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. * Copyright 2013 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2013 Joyent, Inc. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include /* * Virtual device vector for disks. */ extern ldi_ident_t zfs_li; static void vdev_disk_close(vdev_t *); typedef struct vdev_disk_ldi_cb { list_node_t lcb_next; ldi_callback_id_t lcb_id; } vdev_disk_ldi_cb_t; static void vdev_disk_alloc(vdev_t *vd) { vdev_disk_t *dvd; dvd = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP); /* * Create the LDI event callback list. */ list_create(&dvd->vd_ldi_cbs, sizeof (vdev_disk_ldi_cb_t), offsetof(vdev_disk_ldi_cb_t, lcb_next)); } static void vdev_disk_free(vdev_t *vd) { vdev_disk_t *dvd = vd->vdev_tsd; vdev_disk_ldi_cb_t *lcb; if (dvd == NULL) return; /* * We have already closed the LDI handle. Clean up the LDI event * callbacks and free vd->vdev_tsd. */ while ((lcb = list_head(&dvd->vd_ldi_cbs)) != NULL) { list_remove(&dvd->vd_ldi_cbs, lcb); (void) ldi_ev_remove_callbacks(lcb->lcb_id); kmem_free(lcb, sizeof (vdev_disk_ldi_cb_t)); } list_destroy(&dvd->vd_ldi_cbs); kmem_free(dvd, sizeof (vdev_disk_t)); vd->vdev_tsd = NULL; } /* ARGSUSED */ static int vdev_disk_off_notify(ldi_handle_t lh, ldi_ev_cookie_t ecookie, void *arg, void *ev_data) { vdev_t *vd = (vdev_t *)arg; vdev_disk_t *dvd = vd->vdev_tsd; /* * Ignore events other than offline. */ if (strcmp(ldi_ev_get_type(ecookie), LDI_EV_OFFLINE) != 0) return (LDI_EV_SUCCESS); /* * All LDI handles must be closed for the state change to succeed, so * call on vdev_disk_close() to do this. * * We inform vdev_disk_close that it is being called from offline * notify context so it will defer cleanup of LDI event callbacks and * freeing of vd->vdev_tsd to the offline finalize or a reopen. */ dvd->vd_ldi_offline = B_TRUE; vdev_disk_close(vd); /* * Now that the device is closed, request that the spa_async_thread * mark the device as REMOVED and notify FMA of the removal. */ zfs_post_remove(vd->vdev_spa, vd); vd->vdev_remove_wanted = B_TRUE; spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE); return (LDI_EV_SUCCESS); } /* ARGSUSED */ static void vdev_disk_off_finalize(ldi_handle_t lh, ldi_ev_cookie_t ecookie, int ldi_result, void *arg, void *ev_data) { vdev_t *vd = (vdev_t *)arg; /* * Ignore events other than offline. */ if (strcmp(ldi_ev_get_type(ecookie), LDI_EV_OFFLINE) != 0) return; /* * We have already closed the LDI handle in notify. * Clean up the LDI event callbacks and free vd->vdev_tsd. */ vdev_disk_free(vd); /* * Request that the vdev be reopened if the offline state change was * unsuccessful. */ if (ldi_result != LDI_EV_SUCCESS) { vd->vdev_probe_wanted = B_TRUE; spa_async_request(vd->vdev_spa, SPA_ASYNC_PROBE); } } static ldi_ev_callback_t vdev_disk_off_callb = { .cb_vers = LDI_EV_CB_VERS, .cb_notify = vdev_disk_off_notify, .cb_finalize = vdev_disk_off_finalize }; /* ARGSUSED */ static void vdev_disk_dgrd_finalize(ldi_handle_t lh, ldi_ev_cookie_t ecookie, int ldi_result, void *arg, void *ev_data) { vdev_t *vd = (vdev_t *)arg; /* * Ignore events other than degrade. */ if (strcmp(ldi_ev_get_type(ecookie), LDI_EV_DEGRADE) != 0) return; /* * Degrade events always succeed. Mark the vdev as degraded. * This status is purely informative for the user. */ (void) vdev_degrade(vd->vdev_spa, vd->vdev_guid, 0); } static ldi_ev_callback_t vdev_disk_dgrd_callb = { .cb_vers = LDI_EV_CB_VERS, .cb_notify = NULL, .cb_finalize = vdev_disk_dgrd_finalize }; static void vdev_disk_hold(vdev_t *vd) { ddi_devid_t devid; char *minor; ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER)); /* * We must have a pathname, and it must be absolute. */ if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') return; /* * Only prefetch path and devid info if the device has * never been opened. */ if (vd->vdev_tsd != NULL) return; if (vd->vdev_wholedisk == -1ULL) { size_t len = strlen(vd->vdev_path) + 3; char *buf = kmem_alloc(len, KM_SLEEP); (void) snprintf(buf, len, "%ss0", vd->vdev_path); (void) ldi_vp_from_name(buf, &vd->vdev_name_vp); kmem_free(buf, len); } if (vd->vdev_name_vp == NULL) (void) ldi_vp_from_name(vd->vdev_path, &vd->vdev_name_vp); if (vd->vdev_devid != NULL && ddi_devid_str_decode(vd->vdev_devid, &devid, &minor) == 0) { (void) ldi_vp_from_devid(devid, minor, &vd->vdev_devid_vp); ddi_devid_str_free(minor); ddi_devid_free(devid); } } static void vdev_disk_rele(vdev_t *vd) { ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER)); if (vd->vdev_name_vp) { VN_RELE_ASYNC(vd->vdev_name_vp, dsl_pool_vnrele_taskq(vd->vdev_spa->spa_dsl_pool)); vd->vdev_name_vp = NULL; } if (vd->vdev_devid_vp) { VN_RELE_ASYNC(vd->vdev_devid_vp, dsl_pool_vnrele_taskq(vd->vdev_spa->spa_dsl_pool)); vd->vdev_devid_vp = NULL; } } static uint64_t vdev_disk_get_space(vdev_t *vd, uint64_t capacity, uint_t blksz) { ASSERT(vd->vdev_wholedisk); vdev_disk_t *dvd = vd->vdev_tsd; dk_efi_t dk_ioc; efi_gpt_t *efi; uint64_t avail_space = 0; int efisize = EFI_LABEL_SIZE * 2; dk_ioc.dki_data = kmem_alloc(efisize, KM_SLEEP); dk_ioc.dki_lba = 1; dk_ioc.dki_length = efisize; dk_ioc.dki_data_64 = (uint64_t)(uintptr_t)dk_ioc.dki_data; efi = dk_ioc.dki_data; if (ldi_ioctl(dvd->vd_lh, DKIOCGETEFI, (intptr_t)&dk_ioc, FKIOCTL, kcred, NULL) == 0) { uint64_t efi_altern_lba = LE_64(efi->efi_gpt_AlternateLBA); if (capacity > efi_altern_lba) avail_space = (capacity - efi_altern_lba) * blksz; } kmem_free(dk_ioc.dki_data, efisize); return (avail_space); } /* * We want to be loud in DEBUG kernels when DKIOCGMEDIAINFOEXT fails, or when * even a fallback to DKIOCGMEDIAINFO fails. */ #ifdef DEBUG #define VDEV_DEBUG(...) cmn_err(CE_NOTE, __VA_ARGS__) #else #define VDEV_DEBUG(...) /* Nothing... */ #endif static int vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, uint64_t *ashift) { spa_t *spa = vd->vdev_spa; vdev_disk_t *dvd = vd->vdev_tsd; ldi_ev_cookie_t ecookie; vdev_disk_ldi_cb_t *lcb; union { struct dk_minfo_ext ude; struct dk_minfo ud; } dks; struct dk_minfo_ext *dkmext = &dks.ude; struct dk_minfo *dkm = &dks.ud; int error; dev_t dev; int otyp; boolean_t validate_devid = B_FALSE; ddi_devid_t devid; uint64_t capacity = 0, blksz = 0, pbsize; /* * We must have a pathname, and it must be absolute. */ if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') { vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; return (SET_ERROR(EINVAL)); } /* * Reopen the device if it's not currently open. Otherwise, * just update the physical size of the device. */ if (dvd != NULL) { if (dvd->vd_ldi_offline && dvd->vd_lh == NULL) { /* * If we are opening a device in its offline notify * context, the LDI handle was just closed. Clean * up the LDI event callbacks and free vd->vdev_tsd. */ vdev_disk_free(vd); } else { ASSERT(vd->vdev_reopening); goto skip_open; } } /* * Create vd->vdev_tsd. */ vdev_disk_alloc(vd); dvd = vd->vdev_tsd; /* * When opening a disk device, we want to preserve the user's original * intent. We always want to open the device by the path the user gave * us, even if it is one of multiple paths to the save device. But we * also want to be able to survive disks being removed/recabled. * Therefore the sequence of opening devices is: * * 1. Try opening the device by path. For legacy pools without the * 'whole_disk' property, attempt to fix the path by appending 's0'. * * 2. If the devid of the device matches the stored value, return * success. * * 3. Otherwise, the device may have moved. Try opening the device * by the devid instead. */ if (vd->vdev_devid != NULL) { if (ddi_devid_str_decode(vd->vdev_devid, &dvd->vd_devid, &dvd->vd_minor) != 0) { vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; return (SET_ERROR(EINVAL)); } } error = EINVAL; /* presume failure */ if (vd->vdev_path != NULL) { if (vd->vdev_wholedisk == -1ULL) { size_t len = strlen(vd->vdev_path) + 3; char *buf = kmem_alloc(len, KM_SLEEP); (void) snprintf(buf, len, "%ss0", vd->vdev_path); error = ldi_open_by_name(buf, spa_mode(spa), kcred, &dvd->vd_lh, zfs_li); if (error == 0) { spa_strfree(vd->vdev_path); vd->vdev_path = buf; vd->vdev_wholedisk = 1ULL; } else { kmem_free(buf, len); } } /* * If we have not yet opened the device, try to open it by the * specified path. */ if (error != 0) { error = ldi_open_by_name(vd->vdev_path, spa_mode(spa), kcred, &dvd->vd_lh, zfs_li); } /* * Compare the devid to the stored value. */ if (error == 0 && vd->vdev_devid != NULL && ldi_get_devid(dvd->vd_lh, &devid) == 0) { if (ddi_devid_compare(devid, dvd->vd_devid) != 0) { error = SET_ERROR(EINVAL); (void) ldi_close(dvd->vd_lh, spa_mode(spa), kcred); dvd->vd_lh = NULL; } ddi_devid_free(devid); } /* * If we succeeded in opening the device, but 'vdev_wholedisk' * is not yet set, then this must be a slice. */ if (error == 0 && vd->vdev_wholedisk == -1ULL) vd->vdev_wholedisk = 0; } /* * If we were unable to open by path, or the devid check fails, open by * devid instead. */ if (error != 0 && vd->vdev_devid != NULL) { error = ldi_open_by_devid(dvd->vd_devid, dvd->vd_minor, spa_mode(spa), kcred, &dvd->vd_lh, zfs_li); } /* * If all else fails, then try opening by physical path (if available) * or the logical path (if we failed due to the devid check). While not * as reliable as the devid, this will give us something, and the higher * level vdev validation will prevent us from opening the wrong device. */ if (error) { if (vd->vdev_devid != NULL) validate_devid = B_TRUE; if (vd->vdev_physpath != NULL && (dev = ddi_pathname_to_dev_t(vd->vdev_physpath)) != NODEV) error = ldi_open_by_dev(&dev, OTYP_BLK, spa_mode(spa), kcred, &dvd->vd_lh, zfs_li); /* * Note that we don't support the legacy auto-wholedisk support * as above. This hasn't been used in a very long time and we * don't need to propagate its oddities to this edge condition. */ if (error && vd->vdev_path != NULL) error = ldi_open_by_name(vd->vdev_path, spa_mode(spa), kcred, &dvd->vd_lh, zfs_li); } if (error) { vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; return (error); } /* * Now that the device has been successfully opened, update the devid * if necessary. */ if (validate_devid && spa_writeable(spa) && ldi_get_devid(dvd->vd_lh, &devid) == 0) { if (ddi_devid_compare(devid, dvd->vd_devid) != 0) { char *vd_devid; vd_devid = ddi_devid_str_encode(devid, dvd->vd_minor); zfs_dbgmsg("vdev %s: update devid from %s, " "to %s", vd->vdev_path, vd->vdev_devid, vd_devid); spa_strfree(vd->vdev_devid); vd->vdev_devid = spa_strdup(vd_devid); ddi_devid_str_free(vd_devid); } ddi_devid_free(devid); } /* * Once a device is opened, verify that the physical device path (if * available) is up to date. */ if (ldi_get_dev(dvd->vd_lh, &dev) == 0 && ldi_get_otyp(dvd->vd_lh, &otyp) == 0) { char *physpath, *minorname; physpath = kmem_alloc(MAXPATHLEN, KM_SLEEP); minorname = NULL; if (ddi_dev_pathname(dev, otyp, physpath) == 0 && ldi_get_minor_name(dvd->vd_lh, &minorname) == 0 && (vd->vdev_physpath == NULL || strcmp(vd->vdev_physpath, physpath) != 0)) { if (vd->vdev_physpath) spa_strfree(vd->vdev_physpath); (void) strlcat(physpath, ":", MAXPATHLEN); (void) strlcat(physpath, minorname, MAXPATHLEN); vd->vdev_physpath = spa_strdup(physpath); } if (minorname) kmem_free(minorname, strlen(minorname) + 1); kmem_free(physpath, MAXPATHLEN); } /* * Register callbacks for the LDI offline event. */ if (ldi_ev_get_cookie(dvd->vd_lh, LDI_EV_OFFLINE, &ecookie) == LDI_EV_SUCCESS) { lcb = kmem_zalloc(sizeof (vdev_disk_ldi_cb_t), KM_SLEEP); list_insert_tail(&dvd->vd_ldi_cbs, lcb); (void) ldi_ev_register_callbacks(dvd->vd_lh, ecookie, &vdev_disk_off_callb, (void *) vd, &lcb->lcb_id); } /* * Register callbacks for the LDI degrade event. */ if (ldi_ev_get_cookie(dvd->vd_lh, LDI_EV_DEGRADE, &ecookie) == LDI_EV_SUCCESS) { lcb = kmem_zalloc(sizeof (vdev_disk_ldi_cb_t), KM_SLEEP); list_insert_tail(&dvd->vd_ldi_cbs, lcb); (void) ldi_ev_register_callbacks(dvd->vd_lh, ecookie, &vdev_disk_dgrd_callb, (void *) vd, &lcb->lcb_id); } skip_open: /* * Determine the actual size of the device. */ if (ldi_get_size(dvd->vd_lh, psize) != 0) { vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; return (SET_ERROR(EINVAL)); } *max_psize = *psize; /* * Determine the device's minimum transfer size. * If the ioctl isn't supported, assume DEV_BSIZE. */ if ((error = ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFOEXT, (intptr_t)dkmext, FKIOCTL, kcred, NULL)) == 0) { capacity = dkmext->dki_capacity - 1; blksz = dkmext->dki_lbsize; pbsize = dkmext->dki_pbsize; } else if ((error = ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFO, (intptr_t)dkm, FKIOCTL, kcred, NULL)) == 0) { VDEV_DEBUG( "vdev_disk_open(\"%s\"): fallback to DKIOCGMEDIAINFO\n", vd->vdev_path); capacity = dkm->dki_capacity - 1; blksz = dkm->dki_lbsize; pbsize = blksz; } else { VDEV_DEBUG("vdev_disk_open(\"%s\"): " "both DKIOCGMEDIAINFO{,EXT} calls failed, %d\n", vd->vdev_path, error); pbsize = DEV_BSIZE; } *ashift = highbit64(MAX(pbsize, SPA_MINBLOCKSIZE)) - 1; if (vd->vdev_wholedisk == 1) { int wce = 1; if (error == 0) { /* * If we have the capability to expand, we'd have * found out via success from DKIOCGMEDIAINFO{,EXT}. * Adjust max_psize upward accordingly since we know * we own the whole disk now. */ *max_psize += vdev_disk_get_space(vd, capacity, blksz); zfs_dbgmsg("capacity change: vdev %s, psize %llu, " "max_psize %llu", vd->vdev_path, *psize, *max_psize); } /* * Since we own the whole disk, try to enable disk write * caching. We ignore errors because it's OK if we can't do it. */ (void) ldi_ioctl(dvd->vd_lh, DKIOCSETWCE, (intptr_t)&wce, FKIOCTL, kcred, NULL); } /* * Clear the nowritecache bit, so that on a vdev_reopen() we will * try again. */ vd->vdev_nowritecache = B_FALSE; return (0); } static void vdev_disk_close(vdev_t *vd) { vdev_disk_t *dvd = vd->vdev_tsd; if (vd->vdev_reopening || dvd == NULL) return; if (dvd->vd_minor != NULL) { ddi_devid_str_free(dvd->vd_minor); dvd->vd_minor = NULL; } if (dvd->vd_devid != NULL) { ddi_devid_free(dvd->vd_devid); dvd->vd_devid = NULL; } if (dvd->vd_lh != NULL) { (void) ldi_close(dvd->vd_lh, spa_mode(vd->vdev_spa), kcred); dvd->vd_lh = NULL; } vd->vdev_delayed_close = B_FALSE; /* * If we closed the LDI handle due to an offline notify from LDI, * don't free vd->vdev_tsd or unregister the callbacks here; * the offline finalize callback or a reopen will take care of it. */ if (dvd->vd_ldi_offline) return; vdev_disk_free(vd); } int vdev_disk_physio(vdev_t *vd, caddr_t data, size_t size, uint64_t offset, int flags, boolean_t isdump) { vdev_disk_t *dvd = vd->vdev_tsd; /* * If the vdev is closed, it's likely in the REMOVED or FAULTED state. * Nothing to be done here but return failure. */ if (dvd == NULL || (dvd->vd_ldi_offline && dvd->vd_lh == NULL)) return (EIO); ASSERT(vd->vdev_ops == &vdev_disk_ops); /* * If in the context of an active crash dump, use the ldi_dump(9F) * call instead of ldi_strategy(9F) as usual. */ if (isdump) { ASSERT3P(dvd, !=, NULL); return (ldi_dump(dvd->vd_lh, data, lbtodb(offset), lbtodb(size))); } return (vdev_disk_ldi_physio(dvd->vd_lh, data, size, offset, flags)); } int vdev_disk_ldi_physio(ldi_handle_t vd_lh, caddr_t data, size_t size, uint64_t offset, int flags) { buf_t *bp; int error = 0; if (vd_lh == NULL) return (SET_ERROR(EINVAL)); ASSERT(flags & B_READ || flags & B_WRITE); bp = getrbuf(KM_SLEEP); bp->b_flags = flags | B_BUSY | B_NOCACHE | B_FAILFAST; bp->b_bcount = size; bp->b_un.b_addr = (void *)data; bp->b_lblkno = lbtodb(offset); bp->b_bufsize = size; error = ldi_strategy(vd_lh, bp); ASSERT(error == 0); if ((error = biowait(bp)) == 0 && bp->b_resid != 0) error = SET_ERROR(EIO); freerbuf(bp); return (error); } static void vdev_disk_io_intr(buf_t *bp) { vdev_buf_t *vb = (vdev_buf_t *)bp; zio_t *zio = vb->vb_io; /* * The rest of the zio stack only deals with EIO, ECKSUM, and ENXIO. * Rather than teach the rest of the stack about other error * possibilities (EFAULT, etc), we normalize the error value here. */ zio->io_error = (geterror(bp) != 0 ? SET_ERROR(EIO) : 0); if (zio->io_error == 0 && bp->b_resid != 0) zio->io_error = SET_ERROR(EIO); kmem_free(vb, sizeof (vdev_buf_t)); - zio_interrupt(zio); + zio_delay_interrupt(zio); } static void vdev_disk_ioctl_free(zio_t *zio) { kmem_free(zio->io_vsd, sizeof (struct dk_callback)); } static const zio_vsd_ops_t vdev_disk_vsd_ops = { vdev_disk_ioctl_free, zio_vsd_default_cksum_report }; static void vdev_disk_ioctl_done(void *zio_arg, int error) { zio_t *zio = zio_arg; zio->io_error = error; zio_interrupt(zio); } static void vdev_disk_io_start(zio_t *zio) { vdev_t *vd = zio->io_vd; vdev_disk_t *dvd = vd->vdev_tsd; vdev_buf_t *vb; struct dk_callback *dkc; buf_t *bp; int error; /* * If the vdev is closed, it's likely in the REMOVED or FAULTED state. * Nothing to be done here but return failure. */ if (dvd == NULL || (dvd->vd_ldi_offline && dvd->vd_lh == NULL)) { zio->io_error = SET_ERROR(ENXIO); zio_interrupt(zio); return; } if (zio->io_type == ZIO_TYPE_IOCTL) { /* XXPOLICY */ if (!vdev_readable(vd)) { zio->io_error = SET_ERROR(ENXIO); zio_interrupt(zio); return; } switch (zio->io_cmd) { case DKIOCFLUSHWRITECACHE: if (zfs_nocacheflush) break; if (vd->vdev_nowritecache) { zio->io_error = SET_ERROR(ENOTSUP); break; } zio->io_vsd = dkc = kmem_alloc(sizeof (*dkc), KM_SLEEP); zio->io_vsd_ops = &vdev_disk_vsd_ops; dkc->dkc_callback = vdev_disk_ioctl_done; dkc->dkc_flag = FLUSH_VOLATILE; dkc->dkc_cookie = zio; error = ldi_ioctl(dvd->vd_lh, zio->io_cmd, (uintptr_t)dkc, FKIOCTL, kcred, NULL); if (error == 0) { /* * The ioctl will be done asychronously, * and will call vdev_disk_ioctl_done() * upon completion. */ return; } if (error == ENOTSUP || error == ENOTTY) { /* * If we get ENOTSUP or ENOTTY, we know that * no future attempts will ever succeed. * In this case we set a persistent bit so * that we don't bother with the ioctl in the * future. */ vd->vdev_nowritecache = B_TRUE; } zio->io_error = error; break; default: zio->io_error = SET_ERROR(ENOTSUP); } zio_execute(zio); return; } ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); + zio->io_target_timestamp = zio_handle_io_delay(zio); vb = kmem_alloc(sizeof (vdev_buf_t), KM_SLEEP); vb->vb_io = zio; bp = &vb->vb_buf; bioinit(bp); bp->b_flags = B_BUSY | B_NOCACHE | (zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE); if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD))) bp->b_flags |= B_FAILFAST; bp->b_bcount = zio->io_size; bp->b_un.b_addr = zio->io_data; bp->b_lblkno = lbtodb(zio->io_offset); bp->b_bufsize = zio->io_size; bp->b_iodone = (int (*)())vdev_disk_io_intr; /* ldi_strategy() will return non-zero only on programming errors */ VERIFY(ldi_strategy(dvd->vd_lh, bp) == 0); } static void vdev_disk_io_done(zio_t *zio) { vdev_t *vd = zio->io_vd; /* * If the device returned EIO, then attempt a DKIOCSTATE ioctl to see if * the device has been removed. If this is the case, then we trigger an * asynchronous removal of the device. Otherwise, probe the device and * make sure it's still accessible. */ if (zio->io_error == EIO && !vd->vdev_remove_wanted) { vdev_disk_t *dvd = vd->vdev_tsd; int state = DKIO_NONE; if (ldi_ioctl(dvd->vd_lh, DKIOCSTATE, (intptr_t)&state, FKIOCTL, kcred, NULL) == 0 && state != DKIO_INSERTED) { /* * We post the resource as soon as possible, instead of * when the async removal actually happens, because the * DE is using this information to discard previous I/O * errors. */ zfs_post_remove(zio->io_spa, vd); vd->vdev_remove_wanted = B_TRUE; spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE); } else if (!vd->vdev_delayed_close) { vd->vdev_delayed_close = B_TRUE; } } } vdev_ops_t vdev_disk_ops = { vdev_disk_open, vdev_disk_close, vdev_default_asize, vdev_disk_io_start, vdev_disk_io_done, NULL, vdev_disk_hold, vdev_disk_rele, VDEV_TYPE_DISK, /* name of this vdev type */ B_TRUE /* leaf vdev */ }; /* * Given the root disk device devid or pathname, read the label from * the device, and construct a configuration nvlist. */ int vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config) { ldi_handle_t vd_lh; vdev_label_t *label; uint64_t s, size; int l; ddi_devid_t tmpdevid; int error = -1; char *minor_name; /* * Read the device label and build the nvlist. */ if (devid != NULL && ddi_devid_str_decode(devid, &tmpdevid, &minor_name) == 0) { error = ldi_open_by_devid(tmpdevid, minor_name, FREAD, kcred, &vd_lh, zfs_li); ddi_devid_free(tmpdevid); ddi_devid_str_free(minor_name); } if (error && (error = ldi_open_by_name(devpath, FREAD, kcred, &vd_lh, zfs_li))) return (error); if (ldi_get_size(vd_lh, &s)) { (void) ldi_close(vd_lh, FREAD, kcred); return (SET_ERROR(EIO)); } size = P2ALIGN_TYPED(s, sizeof (vdev_label_t), uint64_t); label = kmem_alloc(sizeof (vdev_label_t), KM_SLEEP); *config = NULL; for (l = 0; l < VDEV_LABELS; l++) { uint64_t offset, state, txg = 0; /* read vdev label */ offset = vdev_label_offset(size, l, 0); if (vdev_disk_ldi_physio(vd_lh, (caddr_t)label, VDEV_SKIP_SIZE + VDEV_PHYS_SIZE, offset, B_READ) != 0) continue; if (nvlist_unpack(label->vl_vdev_phys.vp_nvlist, sizeof (label->vl_vdev_phys.vp_nvlist), config, 0) != 0) { *config = NULL; continue; } if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE, &state) != 0 || state >= POOL_STATE_DESTROYED) { nvlist_free(*config); *config = NULL; continue; } if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG, &txg) != 0 || txg == 0) { nvlist_free(*config); *config = NULL; continue; } break; } kmem_free(label, sizeof (vdev_label_t)); (void) ldi_close(vd_lh, FREAD, kcred); if (*config == NULL) error = SET_ERROR(EIDRM); return (error); } Index: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c =================================================================== --- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c (revision 297107) +++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c (revision 297108) @@ -1,244 +1,245 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2014 by Delphix. All rights reserved. + * Copyright (c) 2011, 2015 by Delphix. All rights reserved. */ #include #include #include #include #include #include #include /* * Virtual device vector for files. */ static void vdev_file_hold(vdev_t *vd) { ASSERT(vd->vdev_path != NULL); } static void vdev_file_rele(vdev_t *vd) { ASSERT(vd->vdev_path != NULL); } static int vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, uint64_t *logical_ashift, uint64_t *physical_ashift) { vdev_file_t *vf; vnode_t *vp; vattr_t vattr; int error; /* * We must have a pathname, and it must be absolute. */ if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') { vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; return (SET_ERROR(EINVAL)); } /* * Reopen the device if it's not currently open. Otherwise, * just update the physical size of the device. */ if (vd->vdev_tsd != NULL) { ASSERT(vd->vdev_reopening); vf = vd->vdev_tsd; vp = vf->vf_vnode; goto skip_open; } vf = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_file_t), KM_SLEEP); /* * We always open the files from the root of the global zone, even if * we're in a local zone. If the user has gotten to this point, the * administrator has already decided that the pool should be available * to local zone users, so the underlying devices should be as well. */ ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/'); error = vn_openat(vd->vdev_path + 1, UIO_SYSSPACE, spa_mode(vd->vdev_spa) | FOFFMAX, 0, &vp, 0, 0, rootdir, -1); if (error) { vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; kmem_free(vd->vdev_tsd, sizeof (vdev_file_t)); vd->vdev_tsd = NULL; return (error); } vf->vf_vnode = vp; #ifdef _KERNEL /* * Make sure it's a regular file. */ if (vp->v_type != VREG) { #ifdef __FreeBSD__ (void) VOP_CLOSE(vp, spa_mode(vd->vdev_spa), 1, 0, kcred, NULL); #endif vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; #ifdef __FreeBSD__ kmem_free(vd->vdev_tsd, sizeof (vdev_file_t)); vd->vdev_tsd = NULL; #endif return (SET_ERROR(ENODEV)); } #endif /* _KERNEL */ skip_open: /* * Determine the physical size of the file. */ vattr.va_mask = AT_SIZE; vn_lock(vp, LK_SHARED | LK_RETRY); error = VOP_GETATTR(vp, &vattr, kcred); VOP_UNLOCK(vp, 0); if (error) { (void) VOP_CLOSE(vp, spa_mode(vd->vdev_spa), 1, 0, kcred, NULL); vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; kmem_free(vd->vdev_tsd, sizeof (vdev_file_t)); vd->vdev_tsd = NULL; return (error); } vd->vdev_notrim = B_TRUE; *max_psize = *psize = vattr.va_size; *logical_ashift = SPA_MINBLOCKSHIFT; *physical_ashift = SPA_MINBLOCKSHIFT; return (0); } static void vdev_file_close(vdev_t *vd) { vdev_file_t *vf = vd->vdev_tsd; if (vd->vdev_reopening || vf == NULL) return; if (vf->vf_vnode != NULL) { (void) VOP_CLOSE(vf->vf_vnode, spa_mode(vd->vdev_spa), 1, 0, kcred, NULL); } vd->vdev_delayed_close = B_FALSE; kmem_free(vf, sizeof (vdev_file_t)); vd->vdev_tsd = NULL; } static void vdev_file_io_start(zio_t *zio) { vdev_t *vd = zio->io_vd; vdev_file_t *vf; vnode_t *vp; ssize_t resid; if (!vdev_readable(vd)) { zio->io_error = SET_ERROR(ENXIO); zio_interrupt(zio); return; } vf = vd->vdev_tsd; vp = vf->vf_vnode; if (zio->io_type == ZIO_TYPE_IOCTL) { switch (zio->io_cmd) { case DKIOCFLUSHWRITECACHE: zio->io_error = VOP_FSYNC(vp, FSYNC | FDSYNC, kcred, NULL); break; default: zio->io_error = SET_ERROR(ENOTSUP); } zio_execute(zio); return; } ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); + zio->io_target_timestamp = zio_handle_io_delay(zio); zio->io_error = vn_rdwr(zio->io_type == ZIO_TYPE_READ ? UIO_READ : UIO_WRITE, vp, zio->io_data, zio->io_size, zio->io_offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid); if (resid != 0 && zio->io_error == 0) zio->io_error = ENOSPC; - zio_interrupt(zio); + zio_delay_interrupt(zio); #ifdef illumos VERIFY3U(taskq_dispatch(system_taskq, vdev_file_io_strategy, bp, TQ_SLEEP), !=, 0); #endif } /* ARGSUSED */ static void vdev_file_io_done(zio_t *zio) { } vdev_ops_t vdev_file_ops = { vdev_file_open, vdev_file_close, vdev_default_asize, vdev_file_io_start, vdev_file_io_done, NULL, vdev_file_hold, vdev_file_rele, VDEV_TYPE_FILE, /* name of this vdev type */ B_TRUE /* leaf vdev */ }; /* * From userland we access disks just like files. */ #ifndef _KERNEL vdev_ops_t vdev_disk_ops = { vdev_file_open, vdev_file_close, vdev_default_asize, vdev_file_io_start, vdev_file_io_done, NULL, vdev_file_hold, vdev_file_rele, VDEV_TYPE_DISK, /* name of this vdev type */ B_TRUE /* leaf vdev */ }; #endif Index: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c =================================================================== --- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c (revision 297107) +++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c (revision 297108) @@ -1,992 +1,993 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2006 Pawel Jakub Dawidek * All rights reserved. * * Portions Copyright (c) 2012 Martin Matuska */ #include #include #include #include #include #include #include #include #include #include #include #include /* * Virtual device vector for GEOM. */ static g_attrchanged_t vdev_geom_attrchanged; struct g_class zfs_vdev_class = { .name = "ZFS::VDEV", .version = G_VERSION, .attrchanged = vdev_geom_attrchanged, }; DECLARE_GEOM_CLASS(zfs_vdev_class, zfs_vdev); SYSCTL_DECL(_vfs_zfs_vdev); /* Don't send BIO_FLUSH. */ static int vdev_geom_bio_flush_disable = 0; TUNABLE_INT("vfs.zfs.vdev.bio_flush_disable", &vdev_geom_bio_flush_disable); SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_flush_disable, CTLFLAG_RW, &vdev_geom_bio_flush_disable, 0, "Disable BIO_FLUSH"); /* Don't send BIO_DELETE. */ static int vdev_geom_bio_delete_disable = 0; TUNABLE_INT("vfs.zfs.vdev.bio_delete_disable", &vdev_geom_bio_delete_disable); SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_delete_disable, CTLFLAG_RW, &vdev_geom_bio_delete_disable, 0, "Disable BIO_DELETE"); static void vdev_geom_set_rotation_rate(vdev_t *vd, struct g_consumer *cp) { int error; uint16_t rate; error = g_getattr("GEOM::rotation_rate", cp, &rate); if (error == 0) vd->vdev_rotation_rate = rate; else vd->vdev_rotation_rate = VDEV_RATE_UNKNOWN; } static void vdev_geom_attrchanged(struct g_consumer *cp, const char *attr) { vdev_t *vd; spa_t *spa; char *physpath; int error, physpath_len; vd = cp->private; if (vd == NULL) return; if (strcmp(attr, "GEOM::rotation_rate") == 0) { vdev_geom_set_rotation_rate(vd, cp); return; } if (strcmp(attr, "GEOM::physpath") != 0) return; if (g_access(cp, 1, 0, 0) != 0) return; /* * Record/Update physical path information for this device. */ spa = vd->vdev_spa; physpath_len = MAXPATHLEN; physpath = g_malloc(physpath_len, M_WAITOK|M_ZERO); error = g_io_getattr("GEOM::physpath", cp, &physpath_len, physpath); g_access(cp, -1, 0, 0); if (error == 0) { char *old_physpath; old_physpath = vd->vdev_physpath; vd->vdev_physpath = spa_strdup(physpath); spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); if (old_physpath != NULL) { int held_lock; held_lock = spa_config_held(spa, SCL_STATE, RW_WRITER); if (held_lock == 0) { g_topology_unlock(); spa_config_enter(spa, SCL_STATE, FTAG, RW_WRITER); } spa_strfree(old_physpath); if (held_lock == 0) { spa_config_exit(spa, SCL_STATE, FTAG); g_topology_lock(); } } } g_free(physpath); } static void vdev_geom_orphan(struct g_consumer *cp) { vdev_t *vd; g_topology_assert(); vd = cp->private; if (vd == NULL) { /* Vdev close in progress. Ignore the event. */ return; } /* * Orphan callbacks occur from the GEOM event thread. * Concurrent with this call, new I/O requests may be * working their way through GEOM about to find out * (only once executed by the g_down thread) that we've * been orphaned from our disk provider. These I/Os * must be retired before we can detach our consumer. * This is most easily achieved by acquiring the * SPA ZIO configuration lock as a writer, but doing * so with the GEOM topology lock held would cause * a lock order reversal. Instead, rely on the SPA's * async removal support to invoke a close on this * vdev once it is safe to do so. */ vd->vdev_remove_wanted = B_TRUE; spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE); } static struct g_consumer * vdev_geom_attach(struct g_provider *pp, vdev_t *vd) { struct g_geom *gp; struct g_consumer *cp; g_topology_assert(); ZFS_LOG(1, "Attaching to %s.", pp->name); /* Do we have geom already? No? Create one. */ LIST_FOREACH(gp, &zfs_vdev_class.geom, geom) { if (gp->flags & G_GEOM_WITHER) continue; if (strcmp(gp->name, "zfs::vdev") != 0) continue; break; } if (gp == NULL) { gp = g_new_geomf(&zfs_vdev_class, "zfs::vdev"); gp->orphan = vdev_geom_orphan; gp->attrchanged = vdev_geom_attrchanged; cp = g_new_consumer(gp); if (g_attach(cp, pp) != 0) { g_wither_geom(gp, ENXIO); return (NULL); } if (g_access(cp, 1, 0, 1) != 0) { g_wither_geom(gp, ENXIO); return (NULL); } ZFS_LOG(1, "Created geom and consumer for %s.", pp->name); } else { /* Check if we are already connected to this provider. */ LIST_FOREACH(cp, &gp->consumer, consumer) { if (cp->provider == pp) { ZFS_LOG(1, "Found consumer for %s.", pp->name); break; } } if (cp == NULL) { cp = g_new_consumer(gp); if (g_attach(cp, pp) != 0) { g_destroy_consumer(cp); return (NULL); } if (g_access(cp, 1, 0, 1) != 0) { g_detach(cp); g_destroy_consumer(cp); return (NULL); } ZFS_LOG(1, "Created consumer for %s.", pp->name); } else { if (g_access(cp, 1, 0, 1) != 0) return (NULL); ZFS_LOG(1, "Used existing consumer for %s.", pp->name); } } /* * BUG: cp may already belong to a vdev. This could happen if: * 1) That vdev is a shared spare, or * 2) We are trying to reopen a missing vdev and we are scanning by * guid. In that case, we'll ultimately fail to open this consumer, * but not until after setting the private field. * The solution is to: * 1) Don't set the private field until after the open succeeds, and * 2) Set it to a linked list of vdevs, not just a single vdev */ cp->private = vd; vd->vdev_tsd = cp; /* Fetch initial physical path information for this device. */ vdev_geom_attrchanged(cp, "GEOM::physpath"); cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; return (cp); } static void vdev_geom_close_locked(vdev_t *vd) { struct g_geom *gp; struct g_consumer *cp; g_topology_assert(); cp = vd->vdev_tsd; if (cp == NULL) return; ZFS_LOG(1, "Closing access to %s.", cp->provider->name); KASSERT(vd->vdev_tsd == cp, ("%s: vdev_tsd is not cp", __func__)); vd->vdev_tsd = NULL; vd->vdev_delayed_close = B_FALSE; cp->private = NULL; gp = cp->geom; g_access(cp, -1, 0, -1); /* Destroy consumer on last close. */ if (cp->acr == 0 && cp->ace == 0) { if (cp->acw > 0) g_access(cp, 0, -cp->acw, 0); if (cp->provider != NULL) { ZFS_LOG(1, "Destroyed consumer to %s.", cp->provider->name); g_detach(cp); } g_destroy_consumer(cp); } /* Destroy geom if there are no consumers left. */ if (LIST_EMPTY(&gp->consumer)) { ZFS_LOG(1, "Destroyed geom %s.", gp->name); g_wither_geom(gp, ENXIO); } } static void nvlist_get_guids(nvlist_t *list, uint64_t *pguid, uint64_t *vguid) { (void) nvlist_lookup_uint64(list, ZPOOL_CONFIG_GUID, vguid); (void) nvlist_lookup_uint64(list, ZPOOL_CONFIG_POOL_GUID, pguid); } static int vdev_geom_io(struct g_consumer *cp, int cmd, void *data, off_t offset, off_t size) { struct bio *bp; u_char *p; off_t off, maxio; int error; ASSERT((offset % cp->provider->sectorsize) == 0); ASSERT((size % cp->provider->sectorsize) == 0); bp = g_alloc_bio(); off = offset; offset += size; p = data; maxio = MAXPHYS - (MAXPHYS % cp->provider->sectorsize); error = 0; for (; off < offset; off += maxio, p += maxio, size -= maxio) { bzero(bp, sizeof(*bp)); bp->bio_cmd = cmd; bp->bio_done = NULL; bp->bio_offset = off; bp->bio_length = MIN(size, maxio); bp->bio_data = p; g_io_request(bp, cp); error = biowait(bp, "vdev_geom_io"); if (error != 0) break; } g_destroy_bio(bp); return (error); } static void vdev_geom_taste_orphan(struct g_consumer *cp) { KASSERT(1 == 0, ("%s called while tasting %s.", __func__, cp->provider->name)); } static int vdev_geom_read_config(struct g_consumer *cp, nvlist_t **config) { struct g_provider *pp; vdev_label_t *label; char *p, *buf; size_t buflen; uint64_t psize; off_t offset, size; uint64_t state, txg; int error, l, len; g_topology_assert_not(); pp = cp->provider; ZFS_LOG(1, "Reading config from %s...", pp->name); psize = pp->mediasize; psize = P2ALIGN(psize, (uint64_t)sizeof(vdev_label_t)); size = sizeof(*label) + pp->sectorsize - ((sizeof(*label) - 1) % pp->sectorsize) - 1; label = kmem_alloc(size, KM_SLEEP); buflen = sizeof(label->vl_vdev_phys.vp_nvlist); *config = NULL; for (l = 0; l < VDEV_LABELS; l++) { offset = vdev_label_offset(psize, l, 0); if ((offset % pp->sectorsize) != 0) continue; if (vdev_geom_io(cp, BIO_READ, label, offset, size) != 0) continue; buf = label->vl_vdev_phys.vp_nvlist; if (nvlist_unpack(buf, buflen, config, 0) != 0) continue; if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE, &state) != 0 || state > POOL_STATE_L2CACHE) { nvlist_free(*config); *config = NULL; continue; } if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE && (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG, &txg) != 0 || txg == 0)) { nvlist_free(*config); *config = NULL; continue; } break; } kmem_free(label, size); return (*config == NULL ? ENOENT : 0); } static void resize_configs(nvlist_t ***configs, uint64_t *count, uint64_t id) { nvlist_t **new_configs; uint64_t i; if (id < *count) return; new_configs = kmem_zalloc((id + 1) * sizeof(nvlist_t *), KM_SLEEP); for (i = 0; i < *count; i++) new_configs[i] = (*configs)[i]; if (*configs != NULL) kmem_free(*configs, *count * sizeof(void *)); *configs = new_configs; *count = id + 1; } static void process_vdev_config(nvlist_t ***configs, uint64_t *count, nvlist_t *cfg, const char *name, uint64_t* known_pool_guid) { nvlist_t *vdev_tree; uint64_t pool_guid; uint64_t vdev_guid, known_guid; uint64_t id, txg, known_txg; char *pname; int i; if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &pname) != 0 || strcmp(pname, name) != 0) goto ignore; if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0) goto ignore; if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_TOP_GUID, &vdev_guid) != 0) goto ignore; if (nvlist_lookup_nvlist(cfg, ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0) goto ignore; if (nvlist_lookup_uint64(vdev_tree, ZPOOL_CONFIG_ID, &id) != 0) goto ignore; VERIFY(nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); if (*known_pool_guid != 0) { if (pool_guid != *known_pool_guid) goto ignore; } else *known_pool_guid = pool_guid; resize_configs(configs, count, id); if ((*configs)[id] != NULL) { VERIFY(nvlist_lookup_uint64((*configs)[id], ZPOOL_CONFIG_POOL_TXG, &known_txg) == 0); if (txg <= known_txg) goto ignore; nvlist_free((*configs)[id]); } (*configs)[id] = cfg; return; ignore: nvlist_free(cfg); } static int vdev_geom_attach_taster(struct g_consumer *cp, struct g_provider *pp) { int error; if (pp->flags & G_PF_WITHER) return (EINVAL); g_attach(cp, pp); error = g_access(cp, 1, 0, 0); if (error == 0) { if (pp->sectorsize > VDEV_PAD_SIZE || !ISP2(pp->sectorsize)) error = EINVAL; else if (pp->mediasize < SPA_MINDEVSIZE) error = EINVAL; if (error != 0) g_access(cp, -1, 0, 0); } if (error != 0) g_detach(cp); return (error); } static void vdev_geom_detach_taster(struct g_consumer *cp) { g_access(cp, -1, 0, 0); g_detach(cp); } int vdev_geom_read_pool_label(const char *name, nvlist_t ***configs, uint64_t *count) { struct g_class *mp; struct g_geom *gp, *zgp; struct g_provider *pp; struct g_consumer *zcp; nvlist_t *vdev_cfg; uint64_t pool_guid; int error; DROP_GIANT(); g_topology_lock(); zgp = g_new_geomf(&zfs_vdev_class, "zfs::vdev::taste"); /* This orphan function should be never called. */ zgp->orphan = vdev_geom_taste_orphan; zcp = g_new_consumer(zgp); *configs = NULL; *count = 0; pool_guid = 0; LIST_FOREACH(mp, &g_classes, class) { if (mp == &zfs_vdev_class) continue; LIST_FOREACH(gp, &mp->geom, geom) { if (gp->flags & G_GEOM_WITHER) continue; LIST_FOREACH(pp, &gp->provider, provider) { if (pp->flags & G_PF_WITHER) continue; if (vdev_geom_attach_taster(zcp, pp) != 0) continue; g_topology_unlock(); error = vdev_geom_read_config(zcp, &vdev_cfg); g_topology_lock(); vdev_geom_detach_taster(zcp); if (error) continue; ZFS_LOG(1, "successfully read vdev config"); process_vdev_config(configs, count, vdev_cfg, name, &pool_guid); } } } g_destroy_consumer(zcp); g_destroy_geom(zgp); g_topology_unlock(); PICKUP_GIANT(); return (*count > 0 ? 0 : ENOENT); } static void vdev_geom_read_guids(struct g_consumer *cp, uint64_t *pguid, uint64_t *vguid) { nvlist_t *config; g_topology_assert_not(); *pguid = 0; *vguid = 0; if (vdev_geom_read_config(cp, &config) == 0) { nvlist_get_guids(config, pguid, vguid); nvlist_free(config); } } static struct g_consumer * vdev_geom_attach_by_guids(vdev_t *vd) { struct g_class *mp; struct g_geom *gp, *zgp; struct g_provider *pp; struct g_consumer *cp, *zcp; uint64_t pguid, vguid; g_topology_assert(); zgp = g_new_geomf(&zfs_vdev_class, "zfs::vdev::taste"); /* This orphan function should be never called. */ zgp->orphan = vdev_geom_taste_orphan; zcp = g_new_consumer(zgp); cp = NULL; LIST_FOREACH(mp, &g_classes, class) { if (mp == &zfs_vdev_class) continue; LIST_FOREACH(gp, &mp->geom, geom) { if (gp->flags & G_GEOM_WITHER) continue; LIST_FOREACH(pp, &gp->provider, provider) { if (vdev_geom_attach_taster(zcp, pp) != 0) continue; g_topology_unlock(); vdev_geom_read_guids(zcp, &pguid, &vguid); g_topology_lock(); vdev_geom_detach_taster(zcp); /* * Check that the label's vdev guid matches the * desired guid. If the label has a pool guid, * check that it matches too. (Inactive spares * and L2ARCs do not have any pool guid in the * label.) */ if ((pguid != 0 && pguid != spa_guid(vd->vdev_spa)) || vguid != vd->vdev_guid) continue; cp = vdev_geom_attach(pp, vd); if (cp == NULL) { printf("ZFS WARNING: Unable to " "attach to %s.\n", pp->name); continue; } break; } if (cp != NULL) break; } if (cp != NULL) break; } end: g_destroy_consumer(zcp); g_destroy_geom(zgp); return (cp); } static struct g_consumer * vdev_geom_open_by_guids(vdev_t *vd) { struct g_consumer *cp; char *buf; size_t len; g_topology_assert(); ZFS_LOG(1, "Searching by guid [%ju].", (uintmax_t)vd->vdev_guid); cp = vdev_geom_attach_by_guids(vd); if (cp != NULL) { len = strlen(cp->provider->name) + strlen("/dev/") + 1; buf = kmem_alloc(len, KM_SLEEP); snprintf(buf, len, "/dev/%s", cp->provider->name); spa_strfree(vd->vdev_path); vd->vdev_path = buf; ZFS_LOG(1, "Attach by guid [%ju:%ju] succeeded, provider %s.", (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)vd->vdev_guid, vd->vdev_path); } else { ZFS_LOG(1, "Search by guid [%ju:%ju] failed.", (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)vd->vdev_guid); } return (cp); } static struct g_consumer * vdev_geom_open_by_path(vdev_t *vd, int check_guid) { struct g_provider *pp; struct g_consumer *cp; uint64_t pguid, vguid; g_topology_assert(); cp = NULL; pp = g_provider_by_name(vd->vdev_path + sizeof("/dev/") - 1); if (pp != NULL) { ZFS_LOG(1, "Found provider by name %s.", vd->vdev_path); cp = vdev_geom_attach(pp, vd); if (cp != NULL && check_guid && ISP2(pp->sectorsize) && pp->sectorsize <= VDEV_PAD_SIZE) { g_topology_unlock(); vdev_geom_read_guids(cp, &pguid, &vguid); g_topology_lock(); if (pguid != spa_guid(vd->vdev_spa) || vguid != vd->vdev_guid) { vdev_geom_close_locked(vd); cp = NULL; ZFS_LOG(1, "guid mismatch for provider %s: " "%ju:%ju != %ju:%ju.", vd->vdev_path, (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)vd->vdev_guid, (uintmax_t)pguid, (uintmax_t)vguid); } else { ZFS_LOG(1, "guid match for provider %s.", vd->vdev_path); } } } return (cp); } static int vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, uint64_t *logical_ashift, uint64_t *physical_ashift) { struct g_provider *pp; struct g_consumer *cp; size_t bufsize; int error; /* * We must have a pathname, and it must be absolute. */ if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') { vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; return (EINVAL); } vd->vdev_tsd = NULL; DROP_GIANT(); g_topology_lock(); error = 0; if (vd->vdev_spa->spa_splitting_newspa || (vd->vdev_prevstate == VDEV_STATE_UNKNOWN && vd->vdev_spa->spa_load_state == SPA_LOAD_NONE)) { /* * We are dealing with a vdev that hasn't been previously * opened (since boot), and we are not loading an * existing pool configuration. This looks like a * vdev add operation to a new or existing pool. * Assume the user knows what he/she is doing and find * GEOM provider by its name, ignoring GUID mismatches. * * XXPOLICY: It would be safer to only allow a device * that is unlabeled or labeled but missing * GUID information to be opened in this fashion, * unless we are doing a split, in which case we * should allow any guid. */ cp = vdev_geom_open_by_path(vd, 0); } else { /* * Try using the recorded path for this device, but only * accept it if its label data contains the expected GUIDs. */ cp = vdev_geom_open_by_path(vd, 1); if (cp == NULL) { /* * The device at vd->vdev_path doesn't have the * expected GUIDs. The disks might have merely * moved around so try all other GEOM providers * to find one with the right GUIDs. */ cp = vdev_geom_open_by_guids(vd); } } if (cp == NULL) { ZFS_LOG(1, "Provider %s not found.", vd->vdev_path); error = ENOENT; } else if (cp->provider->sectorsize > VDEV_PAD_SIZE || !ISP2(cp->provider->sectorsize)) { ZFS_LOG(1, "Provider %s has unsupported sectorsize.", vd->vdev_path); vdev_geom_close_locked(vd); error = EINVAL; cp = NULL; } else if (cp->acw == 0 && (spa_mode(vd->vdev_spa) & FWRITE) != 0) { int i; for (i = 0; i < 5; i++) { error = g_access(cp, 0, 1, 0); if (error == 0) break; g_topology_unlock(); tsleep(vd, 0, "vdev", hz / 2); g_topology_lock(); } if (error != 0) { printf("ZFS WARNING: Unable to open %s for writing (error=%d).\n", vd->vdev_path, error); vdev_geom_close_locked(vd); cp = NULL; } } g_topology_unlock(); PICKUP_GIANT(); if (cp == NULL) { vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; return (error); } pp = cp->provider; /* * Determine the actual size of the device. */ *max_psize = *psize = pp->mediasize; /* * Determine the device's minimum transfer size and preferred * transfer size. */ *logical_ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1; *physical_ashift = 0; if (pp->stripesize) *physical_ashift = highbit(pp->stripesize) - 1; /* * Clear the nowritecache settings, so that on a vdev_reopen() * we will try again. */ vd->vdev_nowritecache = B_FALSE; /* * Determine the device's rotation rate. */ vdev_geom_set_rotation_rate(vd, cp); return (0); } static void vdev_geom_close(vdev_t *vd) { DROP_GIANT(); g_topology_lock(); vdev_geom_close_locked(vd); g_topology_unlock(); PICKUP_GIANT(); } static void vdev_geom_io_intr(struct bio *bp) { vdev_t *vd; zio_t *zio; zio = bp->bio_caller1; vd = zio->io_vd; zio->io_error = bp->bio_error; if (zio->io_error == 0 && bp->bio_resid != 0) zio->io_error = SET_ERROR(EIO); switch(zio->io_error) { case ENOTSUP: /* * If we get ENOTSUP for BIO_FLUSH or BIO_DELETE we know * that future attempts will never succeed. In this case * we set a persistent flag so that we don't bother with * requests in the future. */ switch(bp->bio_cmd) { case BIO_FLUSH: vd->vdev_nowritecache = B_TRUE; break; case BIO_DELETE: vd->vdev_notrim = B_TRUE; break; } break; case ENXIO: if (!vd->vdev_remove_wanted) { /* * If provider's error is set we assume it is being * removed. */ if (bp->bio_to->error != 0) { vd->vdev_remove_wanted = B_TRUE; spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE); } else if (!vd->vdev_delayed_close) { vd->vdev_delayed_close = B_TRUE; } } break; } g_destroy_bio(bp); - zio_interrupt(zio); + zio_delay_interrupt(zio); } static void vdev_geom_io_start(zio_t *zio) { vdev_t *vd; struct g_consumer *cp; struct bio *bp; int error; vd = zio->io_vd; switch (zio->io_type) { case ZIO_TYPE_IOCTL: /* XXPOLICY */ if (!vdev_readable(vd)) { zio->io_error = SET_ERROR(ENXIO); zio_interrupt(zio); return; } else { switch (zio->io_cmd) { case DKIOCFLUSHWRITECACHE: if (zfs_nocacheflush || vdev_geom_bio_flush_disable) break; if (vd->vdev_nowritecache) { zio->io_error = SET_ERROR(ENOTSUP); break; } goto sendreq; default: zio->io_error = SET_ERROR(ENOTSUP); } } zio_execute(zio); return; case ZIO_TYPE_FREE: if (vd->vdev_notrim) { zio->io_error = SET_ERROR(ENOTSUP); } else if (!vdev_geom_bio_delete_disable) { goto sendreq; } zio_execute(zio); return; } sendreq: ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_FREE || zio->io_type == ZIO_TYPE_IOCTL); cp = vd->vdev_tsd; if (cp == NULL) { zio->io_error = SET_ERROR(ENXIO); zio_interrupt(zio); return; } bp = g_alloc_bio(); bp->bio_caller1 = zio; switch (zio->io_type) { case ZIO_TYPE_READ: case ZIO_TYPE_WRITE: + zio->io_target_timestamp = zio_handle_io_delay(zio); bp->bio_cmd = zio->io_type == ZIO_TYPE_READ ? BIO_READ : BIO_WRITE; bp->bio_data = zio->io_data; bp->bio_offset = zio->io_offset; bp->bio_length = zio->io_size; break; case ZIO_TYPE_FREE: bp->bio_cmd = BIO_DELETE; bp->bio_data = NULL; bp->bio_offset = zio->io_offset; bp->bio_length = zio->io_size; break; case ZIO_TYPE_IOCTL: bp->bio_cmd = BIO_FLUSH; bp->bio_flags |= BIO_ORDERED; bp->bio_data = NULL; bp->bio_offset = cp->provider->mediasize; bp->bio_length = 0; break; } bp->bio_done = vdev_geom_io_intr; g_io_request(bp, cp); } static void vdev_geom_io_done(zio_t *zio) { } static void vdev_geom_hold(vdev_t *vd) { } static void vdev_geom_rele(vdev_t *vd) { } vdev_ops_t vdev_geom_ops = { vdev_geom_open, vdev_geom_close, vdev_default_asize, vdev_geom_io_start, vdev_geom_io_done, NULL, vdev_geom_hold, vdev_geom_rele, VDEV_TYPE_DISK, /* name of this vdev type */ B_TRUE /* leaf vdev */ }; Index: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c =================================================================== --- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c (revision 297107) +++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c (revision 297108) @@ -1,930 +1,927 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* * Copyright (c) 2012, 2014 by Delphix. All rights reserved. */ #include #include #include #include #include #include /* * ZFS I/O Scheduler * --------------- * * ZFS issues I/O operations to leaf vdevs to satisfy and complete zios. The * I/O scheduler determines when and in what order those operations are * issued. The I/O scheduler divides operations into six I/O classes * prioritized in the following order: sync read, sync write, async read, * async write, scrub/resilver and trim. Each queue defines the minimum and * maximum number of concurrent operations that may be issued to the device. * In addition, the device has an aggregate maximum. Note that the sum of the * per-queue minimums must not exceed the aggregate maximum, and if the * aggregate maximum is equal to or greater than the sum of the per-queue * maximums, the per-queue minimum has no effect. * * For many physical devices, throughput increases with the number of * concurrent operations, but latency typically suffers. Further, physical * devices typically have a limit at which more concurrent operations have no * effect on throughput or can actually cause it to decrease. * * The scheduler selects the next operation to issue by first looking for an * I/O class whose minimum has not been satisfied. Once all are satisfied and * the aggregate maximum has not been hit, the scheduler looks for classes * whose maximum has not been satisfied. Iteration through the I/O classes is * done in the order specified above. No further operations are issued if the * aggregate maximum number of concurrent operations has been hit or if there * are no operations queued for an I/O class that has not hit its maximum. * Every time an I/O is queued or an operation completes, the I/O scheduler * looks for new operations to issue. * * All I/O classes have a fixed maximum number of outstanding operations * except for the async write class. Asynchronous writes represent the data * that is committed to stable storage during the syncing stage for * transaction groups (see txg.c). Transaction groups enter the syncing state * periodically so the number of queued async writes will quickly burst up and * then bleed down to zero. Rather than servicing them as quickly as possible, * the I/O scheduler changes the maximum number of active async write I/Os * according to the amount of dirty data in the pool (see dsl_pool.c). Since * both throughput and latency typically increase with the number of * concurrent operations issued to physical devices, reducing the burstiness * in the number of concurrent operations also stabilizes the response time of * operations from other -- and in particular synchronous -- queues. In broad * strokes, the I/O scheduler will issue more concurrent operations from the * async write queue as there's more dirty data in the pool. * * Async Writes * * The number of concurrent operations issued for the async write I/O class * follows a piece-wise linear function defined by a few adjustable points. * * | o---------| <-- zfs_vdev_async_write_max_active * ^ | /^ | * | | / | | * active | / | | * I/O | / | | * count | / | | * | / | | * |------------o | | <-- zfs_vdev_async_write_min_active * 0|____________^______|_________| * 0% | | 100% of zfs_dirty_data_max * | | * | `-- zfs_vdev_async_write_active_max_dirty_percent * `--------- zfs_vdev_async_write_active_min_dirty_percent * * Until the amount of dirty data exceeds a minimum percentage of the dirty * data allowed in the pool, the I/O scheduler will limit the number of * concurrent operations to the minimum. As that threshold is crossed, the * number of concurrent operations issued increases linearly to the maximum at * the specified maximum percentage of the dirty data allowed in the pool. * * Ideally, the amount of dirty data on a busy pool will stay in the sloped * part of the function between zfs_vdev_async_write_active_min_dirty_percent * and zfs_vdev_async_write_active_max_dirty_percent. If it exceeds the * maximum percentage, this indicates that the rate of incoming data is * greater than the rate that the backend storage can handle. In this case, we * must further throttle incoming writes (see dmu_tx_delay() for details). */ /* * The maximum number of I/Os active to each device. Ideally, this will be >= * the sum of each queue's max_active. It must be at least the sum of each * queue's min_active. */ uint32_t zfs_vdev_max_active = 1000; /* * Per-queue limits on the number of I/Os active to each device. If the * sum of the queue's max_active is < zfs_vdev_max_active, then the * min_active comes into play. We will send min_active from each queue, * and then select from queues in the order defined by zio_priority_t. * * In general, smaller max_active's will lead to lower latency of synchronous * operations. Larger max_active's may lead to higher overall throughput, * depending on underlying storage. * * The ratio of the queues' max_actives determines the balance of performance * between reads, writes, and scrubs. E.g., increasing * zfs_vdev_scrub_max_active will cause the scrub or resilver to complete * more quickly, but reads and writes to have higher latency and lower * throughput. */ uint32_t zfs_vdev_sync_read_min_active = 10; uint32_t zfs_vdev_sync_read_max_active = 10; uint32_t zfs_vdev_sync_write_min_active = 10; uint32_t zfs_vdev_sync_write_max_active = 10; uint32_t zfs_vdev_async_read_min_active = 1; uint32_t zfs_vdev_async_read_max_active = 3; uint32_t zfs_vdev_async_write_min_active = 1; uint32_t zfs_vdev_async_write_max_active = 10; uint32_t zfs_vdev_scrub_min_active = 1; uint32_t zfs_vdev_scrub_max_active = 2; uint32_t zfs_vdev_trim_min_active = 1; /* * TRIM max active is large in comparison to the other values due to the fact * that TRIM IOs are coalesced at the device layer. This value is set such * that a typical SSD can process the queued IOs in a single request. */ uint32_t zfs_vdev_trim_max_active = 64; /* * When the pool has less than zfs_vdev_async_write_active_min_dirty_percent * dirty data, use zfs_vdev_async_write_min_active. When it has more than * zfs_vdev_async_write_active_max_dirty_percent, use * zfs_vdev_async_write_max_active. The value is linearly interpolated * between min and max. */ int zfs_vdev_async_write_active_min_dirty_percent = 30; int zfs_vdev_async_write_active_max_dirty_percent = 60; /* * To reduce IOPs, we aggregate small adjacent I/Os into one large I/O. * For read I/Os, we also aggregate across small adjacency gaps; for writes * we include spans of optional I/Os to aid aggregation at the disk even when * they aren't able to help us aggregate at this level. */ int zfs_vdev_aggregation_limit = SPA_OLD_MAXBLOCKSIZE; int zfs_vdev_read_gap_limit = 32 << 10; int zfs_vdev_write_gap_limit = 4 << 10; #ifdef __FreeBSD__ SYSCTL_DECL(_vfs_zfs_vdev); TUNABLE_INT("vfs.zfs.vdev.async_write_active_min_dirty_percent", &zfs_vdev_async_write_active_min_dirty_percent); static int sysctl_zfs_async_write_active_min_dirty_percent(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_vfs_zfs_vdev, OID_AUTO, async_write_active_min_dirty_percent, CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, 0, sizeof(int), sysctl_zfs_async_write_active_min_dirty_percent, "I", "Percentage of async write dirty data below which " "async_write_min_active is used."); TUNABLE_INT("vfs.zfs.vdev.async_write_active_max_dirty_percent", &zfs_vdev_async_write_active_max_dirty_percent); static int sysctl_zfs_async_write_active_max_dirty_percent(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_vfs_zfs_vdev, OID_AUTO, async_write_active_max_dirty_percent, CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, 0, sizeof(int), sysctl_zfs_async_write_active_max_dirty_percent, "I", "Percentage of async write dirty data above which " "async_write_max_active is used."); TUNABLE_INT("vfs.zfs.vdev.max_active", &zfs_vdev_max_active); SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, max_active, CTLFLAG_RWTUN, &zfs_vdev_max_active, 0, "The maximum number of I/Os of all types active for each device."); #define ZFS_VDEV_QUEUE_KNOB_MIN(name) \ TUNABLE_INT("vfs.zfs.vdev." #name "_min_active", \ &zfs_vdev_ ## name ## _min_active); \ SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, name ## _min_active, \ CTLFLAG_RWTUN, &zfs_vdev_ ## name ## _min_active, 0, \ "Initial number of I/O requests of type " #name \ " active for each device"); #define ZFS_VDEV_QUEUE_KNOB_MAX(name) \ TUNABLE_INT("vfs.zfs.vdev." #name "_max_active", \ &zfs_vdev_ ## name ## _max_active); \ SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, name ## _max_active, \ CTLFLAG_RWTUN, &zfs_vdev_ ## name ## _max_active, 0, \ "Maximum number of I/O requests of type " #name \ " active for each device"); ZFS_VDEV_QUEUE_KNOB_MIN(sync_read); ZFS_VDEV_QUEUE_KNOB_MAX(sync_read); ZFS_VDEV_QUEUE_KNOB_MIN(sync_write); ZFS_VDEV_QUEUE_KNOB_MAX(sync_write); ZFS_VDEV_QUEUE_KNOB_MIN(async_read); ZFS_VDEV_QUEUE_KNOB_MAX(async_read); ZFS_VDEV_QUEUE_KNOB_MIN(async_write); ZFS_VDEV_QUEUE_KNOB_MAX(async_write); ZFS_VDEV_QUEUE_KNOB_MIN(scrub); ZFS_VDEV_QUEUE_KNOB_MAX(scrub); ZFS_VDEV_QUEUE_KNOB_MIN(trim); ZFS_VDEV_QUEUE_KNOB_MAX(trim); #undef ZFS_VDEV_QUEUE_KNOB TUNABLE_INT("vfs.zfs.vdev.aggregation_limit", &zfs_vdev_aggregation_limit); SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, aggregation_limit, CTLFLAG_RWTUN, &zfs_vdev_aggregation_limit, 0, "I/O requests are aggregated up to this size"); TUNABLE_INT("vfs.zfs.vdev.read_gap_limit", &zfs_vdev_read_gap_limit); SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, read_gap_limit, CTLFLAG_RWTUN, &zfs_vdev_read_gap_limit, 0, "Acceptable gap between two reads being aggregated"); TUNABLE_INT("vfs.zfs.vdev.write_gap_limit", &zfs_vdev_write_gap_limit); SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, write_gap_limit, CTLFLAG_RWTUN, &zfs_vdev_write_gap_limit, 0, "Acceptable gap between two writes being aggregated"); static int sysctl_zfs_async_write_active_min_dirty_percent(SYSCTL_HANDLER_ARGS) { int val, err; val = zfs_vdev_async_write_active_min_dirty_percent; err = sysctl_handle_int(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (err); if (val < 0 || val > 100 || val >= zfs_vdev_async_write_active_max_dirty_percent) return (EINVAL); zfs_vdev_async_write_active_min_dirty_percent = val; return (0); } static int sysctl_zfs_async_write_active_max_dirty_percent(SYSCTL_HANDLER_ARGS) { int val, err; val = zfs_vdev_async_write_active_max_dirty_percent; err = sysctl_handle_int(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (err); if (val < 0 || val > 100 || val <= zfs_vdev_async_write_active_min_dirty_percent) return (EINVAL); zfs_vdev_async_write_active_max_dirty_percent = val; return (0); } #endif int vdev_queue_offset_compare(const void *x1, const void *x2) { const zio_t *z1 = x1; const zio_t *z2 = x2; if (z1->io_offset < z2->io_offset) return (-1); if (z1->io_offset > z2->io_offset) return (1); if (z1 < z2) return (-1); if (z1 > z2) return (1); return (0); } static inline avl_tree_t * vdev_queue_class_tree(vdev_queue_t *vq, zio_priority_t p) { return (&vq->vq_class[p].vqc_queued_tree); } static inline avl_tree_t * vdev_queue_type_tree(vdev_queue_t *vq, zio_type_t t) { if (t == ZIO_TYPE_READ) return (&vq->vq_read_offset_tree); else if (t == ZIO_TYPE_WRITE) return (&vq->vq_write_offset_tree); else return (NULL); } int vdev_queue_timestamp_compare(const void *x1, const void *x2) { const zio_t *z1 = x1; const zio_t *z2 = x2; if (z1->io_timestamp < z2->io_timestamp) return (-1); if (z1->io_timestamp > z2->io_timestamp) return (1); if (z1->io_offset < z2->io_offset) return (-1); if (z1->io_offset > z2->io_offset) return (1); if (z1 < z2) return (-1); if (z1 > z2) return (1); return (0); } void vdev_queue_init(vdev_t *vd) { vdev_queue_t *vq = &vd->vdev_queue; mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL); vq->vq_vdev = vd; avl_create(&vq->vq_active_tree, vdev_queue_offset_compare, sizeof (zio_t), offsetof(struct zio, io_queue_node)); avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_READ), vdev_queue_offset_compare, sizeof (zio_t), offsetof(struct zio, io_offset_node)); avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE), vdev_queue_offset_compare, sizeof (zio_t), offsetof(struct zio, io_offset_node)); for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { int (*compfn) (const void *, const void *); /* * The synchronous i/o queues are dispatched in FIFO rather * than LBA order. This provides more consistent latency for * these i/os. */ if (p == ZIO_PRIORITY_SYNC_READ || p == ZIO_PRIORITY_SYNC_WRITE) compfn = vdev_queue_timestamp_compare; else compfn = vdev_queue_offset_compare; avl_create(vdev_queue_class_tree(vq, p), compfn, sizeof (zio_t), offsetof(struct zio, io_queue_node)); } vq->vq_lastoffset = 0; } void vdev_queue_fini(vdev_t *vd) { vdev_queue_t *vq = &vd->vdev_queue; for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) avl_destroy(vdev_queue_class_tree(vq, p)); avl_destroy(&vq->vq_active_tree); avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_READ)); avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE)); mutex_destroy(&vq->vq_lock); } static void vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio) { spa_t *spa = zio->io_spa; avl_tree_t *qtt; ASSERT(MUTEX_HELD(&vq->vq_lock)); ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio); qtt = vdev_queue_type_tree(vq, zio->io_type); if (qtt) avl_add(qtt, zio); #ifdef illumos mutex_enter(&spa->spa_iokstat_lock); spa->spa_queue_stats[zio->io_priority].spa_queued++; if (spa->spa_iokstat != NULL) kstat_waitq_enter(spa->spa_iokstat->ks_data); mutex_exit(&spa->spa_iokstat_lock); #endif } static void vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio) { spa_t *spa = zio->io_spa; avl_tree_t *qtt; ASSERT(MUTEX_HELD(&vq->vq_lock)); ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio); qtt = vdev_queue_type_tree(vq, zio->io_type); if (qtt) avl_remove(qtt, zio); #ifdef illumos mutex_enter(&spa->spa_iokstat_lock); ASSERT3U(spa->spa_queue_stats[zio->io_priority].spa_queued, >, 0); spa->spa_queue_stats[zio->io_priority].spa_queued--; if (spa->spa_iokstat != NULL) kstat_waitq_exit(spa->spa_iokstat->ks_data); mutex_exit(&spa->spa_iokstat_lock); #endif } static void vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio) { spa_t *spa = zio->io_spa; ASSERT(MUTEX_HELD(&vq->vq_lock)); ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); vq->vq_class[zio->io_priority].vqc_active++; avl_add(&vq->vq_active_tree, zio); #ifdef illumos mutex_enter(&spa->spa_iokstat_lock); spa->spa_queue_stats[zio->io_priority].spa_active++; if (spa->spa_iokstat != NULL) kstat_runq_enter(spa->spa_iokstat->ks_data); mutex_exit(&spa->spa_iokstat_lock); #endif } static void vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio) { spa_t *spa = zio->io_spa; ASSERT(MUTEX_HELD(&vq->vq_lock)); ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); vq->vq_class[zio->io_priority].vqc_active--; avl_remove(&vq->vq_active_tree, zio); #ifdef illumos mutex_enter(&spa->spa_iokstat_lock); ASSERT3U(spa->spa_queue_stats[zio->io_priority].spa_active, >, 0); spa->spa_queue_stats[zio->io_priority].spa_active--; if (spa->spa_iokstat != NULL) { kstat_io_t *ksio = spa->spa_iokstat->ks_data; kstat_runq_exit(spa->spa_iokstat->ks_data); if (zio->io_type == ZIO_TYPE_READ) { ksio->reads++; ksio->nread += zio->io_size; } else if (zio->io_type == ZIO_TYPE_WRITE) { ksio->writes++; ksio->nwritten += zio->io_size; } } mutex_exit(&spa->spa_iokstat_lock); #endif } static void vdev_queue_agg_io_done(zio_t *aio) { if (aio->io_type == ZIO_TYPE_READ) { zio_t *pio; while ((pio = zio_walk_parents(aio)) != NULL) { bcopy((char *)aio->io_data + (pio->io_offset - aio->io_offset), pio->io_data, pio->io_size); } } zio_buf_free(aio->io_data, aio->io_size); } static int vdev_queue_class_min_active(zio_priority_t p) { switch (p) { case ZIO_PRIORITY_SYNC_READ: return (zfs_vdev_sync_read_min_active); case ZIO_PRIORITY_SYNC_WRITE: return (zfs_vdev_sync_write_min_active); case ZIO_PRIORITY_ASYNC_READ: return (zfs_vdev_async_read_min_active); case ZIO_PRIORITY_ASYNC_WRITE: return (zfs_vdev_async_write_min_active); case ZIO_PRIORITY_SCRUB: return (zfs_vdev_scrub_min_active); case ZIO_PRIORITY_TRIM: return (zfs_vdev_trim_min_active); default: panic("invalid priority %u", p); return (0); } } static __noinline int vdev_queue_max_async_writes(spa_t *spa) { int writes; uint64_t dirty = spa->spa_dsl_pool->dp_dirty_total; uint64_t min_bytes = zfs_dirty_data_max * zfs_vdev_async_write_active_min_dirty_percent / 100; uint64_t max_bytes = zfs_dirty_data_max * zfs_vdev_async_write_active_max_dirty_percent / 100; /* * Sync tasks correspond to interactive user actions. To reduce the * execution time of those actions we push data out as fast as possible. */ if (spa_has_pending_synctask(spa)) { return (zfs_vdev_async_write_max_active); } if (dirty < min_bytes) return (zfs_vdev_async_write_min_active); if (dirty > max_bytes) return (zfs_vdev_async_write_max_active); /* * linear interpolation: * slope = (max_writes - min_writes) / (max_bytes - min_bytes) * move right by min_bytes * move up by min_writes */ writes = (dirty - min_bytes) * (zfs_vdev_async_write_max_active - zfs_vdev_async_write_min_active) / (max_bytes - min_bytes) + zfs_vdev_async_write_min_active; ASSERT3U(writes, >=, zfs_vdev_async_write_min_active); ASSERT3U(writes, <=, zfs_vdev_async_write_max_active); return (writes); } static int vdev_queue_class_max_active(spa_t *spa, zio_priority_t p) { switch (p) { case ZIO_PRIORITY_SYNC_READ: return (zfs_vdev_sync_read_max_active); case ZIO_PRIORITY_SYNC_WRITE: return (zfs_vdev_sync_write_max_active); case ZIO_PRIORITY_ASYNC_READ: return (zfs_vdev_async_read_max_active); case ZIO_PRIORITY_ASYNC_WRITE: return (vdev_queue_max_async_writes(spa)); case ZIO_PRIORITY_SCRUB: return (zfs_vdev_scrub_max_active); case ZIO_PRIORITY_TRIM: return (zfs_vdev_trim_max_active); default: panic("invalid priority %u", p); return (0); } } /* * Return the i/o class to issue from, or ZIO_PRIORITY_MAX_QUEUEABLE if * there is no eligible class. */ static zio_priority_t vdev_queue_class_to_issue(vdev_queue_t *vq) { spa_t *spa = vq->vq_vdev->vdev_spa; zio_priority_t p; ASSERT(MUTEX_HELD(&vq->vq_lock)); if (avl_numnodes(&vq->vq_active_tree) >= zfs_vdev_max_active) return (ZIO_PRIORITY_NUM_QUEUEABLE); /* find a queue that has not reached its minimum # outstanding i/os */ for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 && vq->vq_class[p].vqc_active < vdev_queue_class_min_active(p)) return (p); } /* * If we haven't found a queue, look for one that hasn't reached its * maximum # outstanding i/os. */ for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 && vq->vq_class[p].vqc_active < vdev_queue_class_max_active(spa, p)) return (p); } /* No eligible queued i/os */ return (ZIO_PRIORITY_NUM_QUEUEABLE); } /* * Compute the range spanned by two i/os, which is the endpoint of the last * (lio->io_offset + lio->io_size) minus start of the first (fio->io_offset). * Conveniently, the gap between fio and lio is given by -IO_SPAN(lio, fio); * thus fio and lio are adjacent if and only if IO_SPAN(lio, fio) == 0. */ #define IO_SPAN(fio, lio) ((lio)->io_offset + (lio)->io_size - (fio)->io_offset) #define IO_GAP(fio, lio) (-IO_SPAN(lio, fio)) static zio_t * vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) { zio_t *first, *last, *aio, *dio, *mandatory, *nio; uint64_t maxgap = 0; uint64_t size; boolean_t stretch; avl_tree_t *t; enum zio_flag flags; ASSERT(MUTEX_HELD(&vq->vq_lock)); if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE) return (NULL); first = last = zio; if (zio->io_type == ZIO_TYPE_READ) maxgap = zfs_vdev_read_gap_limit; /* * We can aggregate I/Os that are sufficiently adjacent and of * the same flavor, as expressed by the AGG_INHERIT flags. * The latter requirement is necessary so that certain * attributes of the I/O, such as whether it's a normal I/O * or a scrub/resilver, can be preserved in the aggregate. * We can include optional I/Os, but don't allow them * to begin a range as they add no benefit in that situation. */ /* * We keep track of the last non-optional I/O. */ mandatory = (first->io_flags & ZIO_FLAG_OPTIONAL) ? NULL : first; /* * Walk backwards through sufficiently contiguous I/Os * recording the last non-option I/O. */ flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT; t = vdev_queue_type_tree(vq, zio->io_type); while (t != NULL && (dio = AVL_PREV(t, first)) != NULL && (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && IO_SPAN(dio, last) <= zfs_vdev_aggregation_limit && IO_GAP(dio, first) <= maxgap) { first = dio; if (mandatory == NULL && !(first->io_flags & ZIO_FLAG_OPTIONAL)) mandatory = first; } /* * Skip any initial optional I/Os. */ while ((first->io_flags & ZIO_FLAG_OPTIONAL) && first != last) { first = AVL_NEXT(t, first); ASSERT(first != NULL); } /* * Walk forward through sufficiently contiguous I/Os. */ while ((dio = AVL_NEXT(t, last)) != NULL && (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && IO_SPAN(first, dio) <= zfs_vdev_aggregation_limit && IO_GAP(last, dio) <= maxgap) { last = dio; if (!(last->io_flags & ZIO_FLAG_OPTIONAL)) mandatory = last; } /* * Now that we've established the range of the I/O aggregation * we must decide what to do with trailing optional I/Os. * For reads, there's nothing to do. While we are unable to * aggregate further, it's possible that a trailing optional * I/O would allow the underlying device to aggregate with * subsequent I/Os. We must therefore determine if the next * non-optional I/O is close enough to make aggregation * worthwhile. */ stretch = B_FALSE; if (zio->io_type == ZIO_TYPE_WRITE && mandatory != NULL) { zio_t *nio = last; while ((dio = AVL_NEXT(t, nio)) != NULL && IO_GAP(nio, dio) == 0 && IO_GAP(mandatory, dio) <= zfs_vdev_write_gap_limit) { nio = dio; if (!(nio->io_flags & ZIO_FLAG_OPTIONAL)) { stretch = B_TRUE; break; } } } if (stretch) { /* This may be a no-op. */ dio = AVL_NEXT(t, last); dio->io_flags &= ~ZIO_FLAG_OPTIONAL; } else { while (last != mandatory && last != first) { ASSERT(last->io_flags & ZIO_FLAG_OPTIONAL); last = AVL_PREV(t, last); ASSERT(last != NULL); } } if (first == last) return (NULL); size = IO_SPAN(first, last); ASSERT3U(size, <=, zfs_vdev_aggregation_limit); aio = zio_vdev_delegated_io(first->io_vd, first->io_offset, zio_buf_alloc(size), size, first->io_type, zio->io_priority, flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE, vdev_queue_agg_io_done, NULL); aio->io_timestamp = first->io_timestamp; nio = first; do { dio = nio; nio = AVL_NEXT(t, dio); ASSERT3U(dio->io_type, ==, aio->io_type); if (dio->io_flags & ZIO_FLAG_NODATA) { ASSERT3U(dio->io_type, ==, ZIO_TYPE_WRITE); bzero((char *)aio->io_data + (dio->io_offset - aio->io_offset), dio->io_size); } else if (dio->io_type == ZIO_TYPE_WRITE) { bcopy(dio->io_data, (char *)aio->io_data + (dio->io_offset - aio->io_offset), dio->io_size); } zio_add_child(dio, aio); vdev_queue_io_remove(vq, dio); zio_vdev_io_bypass(dio); zio_execute(dio); } while (dio != last); return (aio); } static zio_t * vdev_queue_io_to_issue(vdev_queue_t *vq) { zio_t *zio, *aio; zio_priority_t p; avl_index_t idx; avl_tree_t *tree; zio_t search; again: ASSERT(MUTEX_HELD(&vq->vq_lock)); p = vdev_queue_class_to_issue(vq); if (p == ZIO_PRIORITY_NUM_QUEUEABLE) { /* No eligible queued i/os */ return (NULL); } /* * For LBA-ordered queues (async / scrub), issue the i/o which follows * the most recently issued i/o in LBA (offset) order. * * For FIFO queues (sync), issue the i/o with the lowest timestamp. */ tree = vdev_queue_class_tree(vq, p); search.io_timestamp = 0; search.io_offset = vq->vq_last_offset + 1; VERIFY3P(avl_find(tree, &search, &idx), ==, NULL); zio = avl_nearest(tree, idx, AVL_AFTER); if (zio == NULL) zio = avl_first(tree); ASSERT3U(zio->io_priority, ==, p); aio = vdev_queue_aggregate(vq, zio); if (aio != NULL) zio = aio; else vdev_queue_io_remove(vq, zio); /* * If the I/O is or was optional and therefore has no data, we need to * simply discard it. We need to drop the vdev queue's lock to avoid a * deadlock that we could encounter since this I/O will complete * immediately. */ if (zio->io_flags & ZIO_FLAG_NODATA) { mutex_exit(&vq->vq_lock); zio_vdev_io_bypass(zio); zio_execute(zio); mutex_enter(&vq->vq_lock); goto again; } vdev_queue_pending_add(vq, zio); vq->vq_last_offset = zio->io_offset; return (zio); } zio_t * vdev_queue_io(zio_t *zio) { vdev_queue_t *vq = &zio->io_vd->vdev_queue; zio_t *nio; if (zio->io_flags & ZIO_FLAG_DONT_QUEUE) return (zio); /* * Children i/os inherent their parent's priority, which might * not match the child's i/o type. Fix it up here. */ if (zio->io_type == ZIO_TYPE_READ) { if (zio->io_priority != ZIO_PRIORITY_SYNC_READ && zio->io_priority != ZIO_PRIORITY_ASYNC_READ && zio->io_priority != ZIO_PRIORITY_SCRUB) zio->io_priority = ZIO_PRIORITY_ASYNC_READ; } else if (zio->io_type == ZIO_TYPE_WRITE) { if (zio->io_priority != ZIO_PRIORITY_SYNC_WRITE && zio->io_priority != ZIO_PRIORITY_ASYNC_WRITE) zio->io_priority = ZIO_PRIORITY_ASYNC_WRITE; } else { ASSERT(zio->io_type == ZIO_TYPE_FREE); zio->io_priority = ZIO_PRIORITY_TRIM; } zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE; mutex_enter(&vq->vq_lock); zio->io_timestamp = gethrtime(); vdev_queue_io_add(vq, zio); nio = vdev_queue_io_to_issue(vq); mutex_exit(&vq->vq_lock); if (nio == NULL) return (NULL); if (nio->io_done == vdev_queue_agg_io_done) { zio_nowait(nio); return (NULL); } return (nio); } void vdev_queue_io_done(zio_t *zio) { vdev_queue_t *vq = &zio->io_vd->vdev_queue; zio_t *nio; - if (zio_injection_enabled) - delay(SEC_TO_TICK(zio_handle_io_delay(zio))); - mutex_enter(&vq->vq_lock); vdev_queue_pending_remove(vq, zio); vq->vq_io_complete_ts = gethrtime(); while ((nio = vdev_queue_io_to_issue(vq)) != NULL) { mutex_exit(&vq->vq_lock); if (nio->io_done == vdev_queue_agg_io_done) { zio_nowait(nio); } else { zio_vdev_io_reissue(nio); zio_execute(nio); } mutex_enter(&vq->vq_lock); } mutex_exit(&vq->vq_lock); } /* * As these three methods are only used for load calculations we're not concerned * if we get an incorrect value on 32bit platforms due to lack of vq_lock mutex * use here, instead we prefer to keep it lock free for performance. */ int vdev_queue_length(vdev_t *vd) { return (avl_numnodes(&vd->vdev_queue.vq_active_tree)); } uint64_t vdev_queue_lastoffset(vdev_t *vd) { return (vd->vdev_queue.vq_lastoffset); } void vdev_queue_register_lastoffset(vdev_t *vd, zio_t *zio) { vd->vdev_queue.vq_lastoffset = zio->io_offset + zio->io_size; } Index: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c =================================================================== --- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c (revision 297107) +++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c (revision 297108) @@ -1,6761 +1,6769 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011-2012 Pawel Jakub Dawidek . * All rights reserved. * Copyright 2013 Martin Matuska . All rights reserved. * Copyright 2014 Xin Li . All rights reserved. * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014, Joyent, Inc. All rights reserved. * Copyright (c) 2011, 2015 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. */ /* * ZFS ioctls. * * This file handles the ioctls to /dev/zfs, used for configuring ZFS storage * pools and filesystems, e.g. with /sbin/zfs and /sbin/zpool. * * There are two ways that we handle ioctls: the legacy way where almost * all of the logic is in the ioctl callback, and the new way where most * of the marshalling is handled in the common entry point, zfsdev_ioctl(). * * Non-legacy ioctls should be registered by calling * zfs_ioctl_register() from zfs_ioctl_init(). The ioctl is invoked * from userland by lzc_ioctl(). * * The registration arguments are as follows: * * const char *name * The name of the ioctl. This is used for history logging. If the * ioctl returns successfully (the callback returns 0), and allow_log * is true, then a history log entry will be recorded with the input & * output nvlists. The log entry can be printed with "zpool history -i". * * zfs_ioc_t ioc * The ioctl request number, which userland will pass to ioctl(2). * The ioctl numbers can change from release to release, because * the caller (libzfs) must be matched to the kernel. * * zfs_secpolicy_func_t *secpolicy * This function will be called before the zfs_ioc_func_t, to * determine if this operation is permitted. It should return EPERM * on failure, and 0 on success. Checks include determining if the * dataset is visible in this zone, and if the user has either all * zfs privileges in the zone (SYS_MOUNT), or has been granted permission * to do this operation on this dataset with "zfs allow". * * zfs_ioc_namecheck_t namecheck * This specifies what to expect in the zfs_cmd_t:zc_name -- a pool * name, a dataset name, or nothing. If the name is not well-formed, * the ioctl will fail and the callback will not be called. * Therefore, the callback can assume that the name is well-formed * (e.g. is null-terminated, doesn't have more than one '@' character, * doesn't have invalid characters). * * zfs_ioc_poolcheck_t pool_check * This specifies requirements on the pool state. If the pool does * not meet them (is suspended or is readonly), the ioctl will fail * and the callback will not be called. If any checks are specified * (i.e. it is not POOL_CHECK_NONE), namecheck must not be NO_NAME. * Multiple checks can be or-ed together (e.g. POOL_CHECK_SUSPENDED | * POOL_CHECK_READONLY). * * boolean_t smush_outnvlist * If smush_outnvlist is true, then the output is presumed to be a * list of errors, and it will be "smushed" down to fit into the * caller's buffer, by removing some entries and replacing them with a * single "N_MORE_ERRORS" entry indicating how many were removed. See * nvlist_smush() for details. If smush_outnvlist is false, and the * outnvlist does not fit into the userland-provided buffer, then the * ioctl will fail with ENOMEM. * * zfs_ioc_func_t *func * The callback function that will perform the operation. * * The callback should return 0 on success, or an error number on * failure. If the function fails, the userland ioctl will return -1, * and errno will be set to the callback's return value. The callback * will be called with the following arguments: * * const char *name * The name of the pool or dataset to operate on, from * zfs_cmd_t:zc_name. The 'namecheck' argument specifies the * expected type (pool, dataset, or none). * * nvlist_t *innvl * The input nvlist, deserialized from zfs_cmd_t:zc_nvlist_src. Or * NULL if no input nvlist was provided. Changes to this nvlist are * ignored. If the input nvlist could not be deserialized, the * ioctl will fail and the callback will not be called. * * nvlist_t *outnvl * The output nvlist, initially empty. The callback can fill it in, * and it will be returned to userland by serializing it into * zfs_cmd_t:zc_nvlist_dst. If it is non-empty, and serialization * fails (e.g. because the caller didn't supply a large enough * buffer), then the overall ioctl will fail. See the * 'smush_nvlist' argument above for additional behaviors. * * There are two typical uses of the output nvlist: * - To return state, e.g. property values. In this case, * smush_outnvlist should be false. If the buffer was not large * enough, the caller will reallocate a larger buffer and try * the ioctl again. * * - To return multiple errors from an ioctl which makes on-disk * changes. In this case, smush_outnvlist should be true. * Ioctls which make on-disk modifications should generally not * use the outnvl if they succeed, because the caller can not * distinguish between the operation failing, and * deserialization failing. */ #ifdef __FreeBSD__ #include "opt_kstack_pages.h" #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_namecheck.h" #include "zfs_prop.h" #include "zfs_deleg.h" #include "zfs_comutil.h" #include "zfs_ioctl_compat.h" CTASSERT(sizeof(zfs_cmd_t) < IOCPARM_MAX); static struct cdev *zfsdev; extern void zfs_init(void); extern void zfs_fini(void); uint_t zfs_fsyncer_key; extern uint_t rrw_tsd_key; static uint_t zfs_allow_log_key; typedef int zfs_ioc_legacy_func_t(zfs_cmd_t *); typedef int zfs_ioc_func_t(const char *, nvlist_t *, nvlist_t *); typedef int zfs_secpolicy_func_t(zfs_cmd_t *, nvlist_t *, cred_t *); typedef enum { NO_NAME, POOL_NAME, DATASET_NAME } zfs_ioc_namecheck_t; typedef enum { POOL_CHECK_NONE = 1 << 0, POOL_CHECK_SUSPENDED = 1 << 1, POOL_CHECK_READONLY = 1 << 2, } zfs_ioc_poolcheck_t; typedef struct zfs_ioc_vec { zfs_ioc_legacy_func_t *zvec_legacy_func; zfs_ioc_func_t *zvec_func; zfs_secpolicy_func_t *zvec_secpolicy; zfs_ioc_namecheck_t zvec_namecheck; boolean_t zvec_allow_log; zfs_ioc_poolcheck_t zvec_pool_check; boolean_t zvec_smush_outnvlist; const char *zvec_name; } zfs_ioc_vec_t; /* This array is indexed by zfs_userquota_prop_t */ static const char *userquota_perms[] = { ZFS_DELEG_PERM_USERUSED, ZFS_DELEG_PERM_USERQUOTA, ZFS_DELEG_PERM_GROUPUSED, ZFS_DELEG_PERM_GROUPQUOTA, }; static int zfs_ioc_userspace_upgrade(zfs_cmd_t *zc); static int zfs_check_settable(const char *name, nvpair_t *property, cred_t *cr); static int zfs_check_clearable(char *dataset, nvlist_t *props, nvlist_t **errors); static int zfs_fill_zplprops_root(uint64_t, nvlist_t *, nvlist_t *, boolean_t *); int zfs_set_prop_nvlist(const char *, zprop_source_t, nvlist_t *, nvlist_t *); static int get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp); static void zfsdev_close(void *data); static int zfs_prop_activate_feature(spa_t *spa, spa_feature_t feature); /* _NOTE(PRINTFLIKE(4)) - this is printf-like, but lint is too whiney */ void __dprintf(const char *file, const char *func, int line, const char *fmt, ...) { const char *newfile; char buf[512]; va_list adx; /* * Get rid of annoying "../common/" prefix to filename. */ newfile = strrchr(file, '/'); if (newfile != NULL) { newfile = newfile + 1; /* Get rid of leading / */ } else { newfile = file; } va_start(adx, fmt); (void) vsnprintf(buf, sizeof (buf), fmt, adx); va_end(adx); /* * To get this data, use the zfs-dprintf probe as so: * dtrace -q -n 'zfs-dprintf \ * /stringof(arg0) == "dbuf.c"/ \ * {printf("%s: %s", stringof(arg1), stringof(arg3))}' * arg0 = file name * arg1 = function name * arg2 = line number * arg3 = message */ DTRACE_PROBE4(zfs__dprintf, char *, newfile, char *, func, int, line, char *, buf); } static void history_str_free(char *buf) { kmem_free(buf, HIS_MAX_RECORD_LEN); } static char * history_str_get(zfs_cmd_t *zc) { char *buf; if (zc->zc_history == 0) return (NULL); buf = kmem_alloc(HIS_MAX_RECORD_LEN, KM_SLEEP); if (copyinstr((void *)(uintptr_t)zc->zc_history, buf, HIS_MAX_RECORD_LEN, NULL) != 0) { history_str_free(buf); return (NULL); } buf[HIS_MAX_RECORD_LEN -1] = '\0'; return (buf); } /* * Check to see if the named dataset is currently defined as bootable */ static boolean_t zfs_is_bootfs(const char *name) { objset_t *os; if (dmu_objset_hold(name, FTAG, &os) == 0) { boolean_t ret; ret = (dmu_objset_id(os) == spa_bootfs(dmu_objset_spa(os))); dmu_objset_rele(os, FTAG); return (ret); } return (B_FALSE); } /* * Return non-zero if the spa version is less than requested version. */ static int zfs_earlier_version(const char *name, int version) { spa_t *spa; if (spa_open(name, &spa, FTAG) == 0) { if (spa_version(spa) < version) { spa_close(spa, FTAG); return (1); } spa_close(spa, FTAG); } return (0); } /* * Return TRUE if the ZPL version is less than requested version. */ static boolean_t zpl_earlier_version(const char *name, int version) { objset_t *os; boolean_t rc = B_TRUE; if (dmu_objset_hold(name, FTAG, &os) == 0) { uint64_t zplversion; if (dmu_objset_type(os) != DMU_OST_ZFS) { dmu_objset_rele(os, FTAG); return (B_TRUE); } /* XXX reading from non-owned objset */ if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &zplversion) == 0) rc = zplversion < version; dmu_objset_rele(os, FTAG); } return (rc); } static void zfs_log_history(zfs_cmd_t *zc) { spa_t *spa; char *buf; if ((buf = history_str_get(zc)) == NULL) return; if (spa_open(zc->zc_name, &spa, FTAG) == 0) { if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY) (void) spa_history_log(spa, buf); spa_close(spa, FTAG); } history_str_free(buf); } /* * Policy for top-level read operations (list pools). Requires no privileges, * and can be used in the local zone, as there is no associated dataset. */ /* ARGSUSED */ static int zfs_secpolicy_none(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (0); } /* * Policy for dataset read operations (list children, get statistics). Requires * no privileges, but must be visible in the local zone. */ /* ARGSUSED */ static int zfs_secpolicy_read(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { if (INGLOBALZONE(curthread) || zone_dataset_visible(zc->zc_name, NULL)) return (0); return (SET_ERROR(ENOENT)); } static int zfs_dozonecheck_impl(const char *dataset, uint64_t zoned, cred_t *cr) { int writable = 1; /* * The dataset must be visible by this zone -- check this first * so they don't see EPERM on something they shouldn't know about. */ if (!INGLOBALZONE(curthread) && !zone_dataset_visible(dataset, &writable)) return (SET_ERROR(ENOENT)); if (INGLOBALZONE(curthread)) { /* * If the fs is zoned, only root can access it from the * global zone. */ if (secpolicy_zfs(cr) && zoned) return (SET_ERROR(EPERM)); } else { /* * If we are in a local zone, the 'zoned' property must be set. */ if (!zoned) return (SET_ERROR(EPERM)); /* must be writable by this zone */ if (!writable) return (SET_ERROR(EPERM)); } return (0); } static int zfs_dozonecheck(const char *dataset, cred_t *cr) { uint64_t zoned; if (dsl_prop_get_integer(dataset, "jailed", &zoned, NULL)) return (SET_ERROR(ENOENT)); return (zfs_dozonecheck_impl(dataset, zoned, cr)); } static int zfs_dozonecheck_ds(const char *dataset, dsl_dataset_t *ds, cred_t *cr) { uint64_t zoned; if (dsl_prop_get_int_ds(ds, "jailed", &zoned)) return (SET_ERROR(ENOENT)); return (zfs_dozonecheck_impl(dataset, zoned, cr)); } static int zfs_secpolicy_write_perms_ds(const char *name, dsl_dataset_t *ds, const char *perm, cred_t *cr) { int error; error = zfs_dozonecheck_ds(name, ds, cr); if (error == 0) { error = secpolicy_zfs(cr); if (error != 0) error = dsl_deleg_access_impl(ds, perm, cr); } return (error); } static int zfs_secpolicy_write_perms(const char *name, const char *perm, cred_t *cr) { int error; dsl_dataset_t *ds; dsl_pool_t *dp; error = dsl_pool_hold(name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, name, FTAG, &ds); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } error = zfs_secpolicy_write_perms_ds(name, ds, perm, cr); dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); return (error); } #ifdef SECLABEL /* * Policy for setting the security label property. * * Returns 0 for success, non-zero for access and other errors. */ static int zfs_set_slabel_policy(const char *name, char *strval, cred_t *cr) { char ds_hexsl[MAXNAMELEN]; bslabel_t ds_sl, new_sl; boolean_t new_default = FALSE; uint64_t zoned; int needed_priv = -1; int error; /* First get the existing dataset label. */ error = dsl_prop_get(name, zfs_prop_to_name(ZFS_PROP_MLSLABEL), 1, sizeof (ds_hexsl), &ds_hexsl, NULL); if (error != 0) return (SET_ERROR(EPERM)); if (strcasecmp(strval, ZFS_MLSLABEL_DEFAULT) == 0) new_default = TRUE; /* The label must be translatable */ if (!new_default && (hexstr_to_label(strval, &new_sl) != 0)) return (SET_ERROR(EINVAL)); /* * In a non-global zone, disallow attempts to set a label that * doesn't match that of the zone; otherwise no other checks * are needed. */ if (!INGLOBALZONE(curproc)) { if (new_default || !blequal(&new_sl, CR_SL(CRED()))) return (SET_ERROR(EPERM)); return (0); } /* * For global-zone datasets (i.e., those whose zoned property is * "off", verify that the specified new label is valid for the * global zone. */ if (dsl_prop_get_integer(name, zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL)) return (SET_ERROR(EPERM)); if (!zoned) { if (zfs_check_global_label(name, strval) != 0) return (SET_ERROR(EPERM)); } /* * If the existing dataset label is nondefault, check if the * dataset is mounted (label cannot be changed while mounted). * Get the zfsvfs; if there isn't one, then the dataset isn't * mounted (or isn't a dataset, doesn't exist, ...). */ if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) != 0) { objset_t *os; static char *setsl_tag = "setsl_tag"; /* * Try to own the dataset; abort if there is any error, * (e.g., already mounted, in use, or other error). */ error = dmu_objset_own(name, DMU_OST_ZFS, B_TRUE, setsl_tag, &os); if (error != 0) return (SET_ERROR(EPERM)); dmu_objset_disown(os, setsl_tag); if (new_default) { needed_priv = PRIV_FILE_DOWNGRADE_SL; goto out_check; } if (hexstr_to_label(strval, &new_sl) != 0) return (SET_ERROR(EPERM)); if (blstrictdom(&ds_sl, &new_sl)) needed_priv = PRIV_FILE_DOWNGRADE_SL; else if (blstrictdom(&new_sl, &ds_sl)) needed_priv = PRIV_FILE_UPGRADE_SL; } else { /* dataset currently has a default label */ if (!new_default) needed_priv = PRIV_FILE_UPGRADE_SL; } out_check: if (needed_priv != -1) return (PRIV_POLICY(cr, needed_priv, B_FALSE, EPERM, NULL)); return (0); } #endif /* SECLABEL */ static int zfs_secpolicy_setprop(const char *dsname, zfs_prop_t prop, nvpair_t *propval, cred_t *cr) { char *strval; /* * Check permissions for special properties. */ switch (prop) { case ZFS_PROP_ZONED: /* * Disallow setting of 'zoned' from within a local zone. */ if (!INGLOBALZONE(curthread)) return (SET_ERROR(EPERM)); break; case ZFS_PROP_QUOTA: case ZFS_PROP_FILESYSTEM_LIMIT: case ZFS_PROP_SNAPSHOT_LIMIT: if (!INGLOBALZONE(curthread)) { uint64_t zoned; char setpoint[MAXNAMELEN]; /* * Unprivileged users are allowed to modify the * limit on things *under* (ie. contained by) * the thing they own. */ if (dsl_prop_get_integer(dsname, "jailed", &zoned, setpoint)) return (SET_ERROR(EPERM)); if (!zoned || strlen(dsname) <= strlen(setpoint)) return (SET_ERROR(EPERM)); } break; case ZFS_PROP_MLSLABEL: #ifdef SECLABEL if (!is_system_labeled()) return (SET_ERROR(EPERM)); if (nvpair_value_string(propval, &strval) == 0) { int err; err = zfs_set_slabel_policy(dsname, strval, CRED()); if (err != 0) return (err); } #else return (EOPNOTSUPP); #endif break; } return (zfs_secpolicy_write_perms(dsname, zfs_prop_to_name(prop), cr)); } /* ARGSUSED */ static int zfs_secpolicy_set_fsacl(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { int error; error = zfs_dozonecheck(zc->zc_name, cr); if (error != 0) return (error); /* * permission to set permissions will be evaluated later in * dsl_deleg_can_allow() */ return (0); } /* ARGSUSED */ static int zfs_secpolicy_rollback(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_ROLLBACK, cr)); } /* ARGSUSED */ static int zfs_secpolicy_send(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { dsl_pool_t *dp; dsl_dataset_t *ds; char *cp; int error; /* * Generate the current snapshot name from the given objsetid, then * use that name for the secpolicy/zone checks. */ cp = strchr(zc->zc_name, '@'); if (cp == NULL) return (SET_ERROR(EINVAL)); error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &ds); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } dsl_dataset_name(ds, zc->zc_name); error = zfs_secpolicy_write_perms_ds(zc->zc_name, ds, ZFS_DELEG_PERM_SEND, cr); dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); return (error); } /* ARGSUSED */ static int zfs_secpolicy_send_new(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_SEND, cr)); } /* ARGSUSED */ static int zfs_secpolicy_deleg_share(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { vnode_t *vp; int error; if ((error = lookupname(zc->zc_value, UIO_SYSSPACE, NO_FOLLOW, NULL, &vp)) != 0) return (error); /* Now make sure mntpnt and dataset are ZFS */ if (strcmp(vp->v_vfsp->mnt_stat.f_fstypename, "zfs") != 0 || (strcmp((char *)refstr_value(vp->v_vfsp->vfs_resource), zc->zc_name) != 0)) { VN_RELE(vp); return (SET_ERROR(EPERM)); } VN_RELE(vp); return (dsl_deleg_access(zc->zc_name, ZFS_DELEG_PERM_SHARE, cr)); } int zfs_secpolicy_share(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { if (!INGLOBALZONE(curthread)) return (SET_ERROR(EPERM)); if (secpolicy_nfs(cr) == 0) { return (0); } else { return (zfs_secpolicy_deleg_share(zc, innvl, cr)); } } int zfs_secpolicy_smb_acl(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { if (!INGLOBALZONE(curthread)) return (SET_ERROR(EPERM)); if (secpolicy_smb(cr) == 0) { return (0); } else { return (zfs_secpolicy_deleg_share(zc, innvl, cr)); } } static int zfs_get_parent(const char *datasetname, char *parent, int parentsize) { char *cp; /* * Remove the @bla or /bla from the end of the name to get the parent. */ (void) strncpy(parent, datasetname, parentsize); cp = strrchr(parent, '@'); if (cp != NULL) { cp[0] = '\0'; } else { cp = strrchr(parent, '/'); if (cp == NULL) return (SET_ERROR(ENOENT)); cp[0] = '\0'; } return (0); } int zfs_secpolicy_destroy_perms(const char *name, cred_t *cr) { int error; if ((error = zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_MOUNT, cr)) != 0) return (error); return (zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_DESTROY, cr)); } /* ARGSUSED */ static int zfs_secpolicy_destroy(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (zfs_secpolicy_destroy_perms(zc->zc_name, cr)); } /* * Destroying snapshots with delegated permissions requires * descendant mount and destroy permissions. */ /* ARGSUSED */ static int zfs_secpolicy_destroy_snaps(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { nvlist_t *snaps; nvpair_t *pair, *nextpair; int error = 0; if (nvlist_lookup_nvlist(innvl, "snaps", &snaps) != 0) return (SET_ERROR(EINVAL)); for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nextpair) { nextpair = nvlist_next_nvpair(snaps, pair); error = zfs_secpolicy_destroy_perms(nvpair_name(pair), cr); if (error == ENOENT) { /* * Ignore any snapshots that don't exist (we consider * them "already destroyed"). Remove the name from the * nvl here in case the snapshot is created between * now and when we try to destroy it (in which case * we don't want to destroy it since we haven't * checked for permission). */ fnvlist_remove_nvpair(snaps, pair); error = 0; } if (error != 0) break; } return (error); } int zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr) { char parentname[MAXNAMELEN]; int error; if ((error = zfs_secpolicy_write_perms(from, ZFS_DELEG_PERM_RENAME, cr)) != 0) return (error); if ((error = zfs_secpolicy_write_perms(from, ZFS_DELEG_PERM_MOUNT, cr)) != 0) return (error); if ((error = zfs_get_parent(to, parentname, sizeof (parentname))) != 0) return (error); if ((error = zfs_secpolicy_write_perms(parentname, ZFS_DELEG_PERM_CREATE, cr)) != 0) return (error); if ((error = zfs_secpolicy_write_perms(parentname, ZFS_DELEG_PERM_MOUNT, cr)) != 0) return (error); return (error); } /* ARGSUSED */ static int zfs_secpolicy_rename(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { char *at = NULL; int error; if ((zc->zc_cookie & 1) != 0) { /* * This is recursive rename, so the starting snapshot might * not exist. Check file system or volume permission instead. */ at = strchr(zc->zc_name, '@'); if (at == NULL) return (EINVAL); *at = '\0'; } error = zfs_secpolicy_rename_perms(zc->zc_name, zc->zc_value, cr); if (at != NULL) *at = '@'; return (error); } /* ARGSUSED */ static int zfs_secpolicy_promote(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { dsl_pool_t *dp; dsl_dataset_t *clone; int error; error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_PROMOTE, cr); if (error != 0) return (error); error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &clone); if (error == 0) { char parentname[MAXNAMELEN]; dsl_dataset_t *origin = NULL; dsl_dir_t *dd; dd = clone->ds_dir; error = dsl_dataset_hold_obj(dd->dd_pool, dsl_dir_phys(dd)->dd_origin_obj, FTAG, &origin); if (error != 0) { dsl_dataset_rele(clone, FTAG); dsl_pool_rele(dp, FTAG); return (error); } error = zfs_secpolicy_write_perms_ds(zc->zc_name, clone, ZFS_DELEG_PERM_MOUNT, cr); dsl_dataset_name(origin, parentname); if (error == 0) { error = zfs_secpolicy_write_perms_ds(parentname, origin, ZFS_DELEG_PERM_PROMOTE, cr); } dsl_dataset_rele(clone, FTAG); dsl_dataset_rele(origin, FTAG); } dsl_pool_rele(dp, FTAG); return (error); } /* ARGSUSED */ static int zfs_secpolicy_recv(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { int error; if ((error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_RECEIVE, cr)) != 0) return (error); if ((error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_MOUNT, cr)) != 0) return (error); return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_CREATE, cr)); } int zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr) { return (zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_SNAPSHOT, cr)); } /* * Check for permission to create each snapshot in the nvlist. */ /* ARGSUSED */ static int zfs_secpolicy_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { nvlist_t *snaps; int error; nvpair_t *pair; if (nvlist_lookup_nvlist(innvl, "snaps", &snaps) != 0) return (SET_ERROR(EINVAL)); for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(snaps, pair)) { char *name = nvpair_name(pair); char *atp = strchr(name, '@'); if (atp == NULL) { error = SET_ERROR(EINVAL); break; } *atp = '\0'; error = zfs_secpolicy_snapshot_perms(name, cr); *atp = '@'; if (error != 0) break; } return (error); } /* * Check for permission to create each snapshot in the nvlist. */ /* ARGSUSED */ static int zfs_secpolicy_bookmark(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { int error = 0; for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) { char *name = nvpair_name(pair); char *hashp = strchr(name, '#'); if (hashp == NULL) { error = SET_ERROR(EINVAL); break; } *hashp = '\0'; error = zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_BOOKMARK, cr); *hashp = '#'; if (error != 0) break; } return (error); } /* ARGSUSED */ static int zfs_secpolicy_destroy_bookmarks(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { nvpair_t *pair, *nextpair; int error = 0; for (pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; pair = nextpair) { char *name = nvpair_name(pair); char *hashp = strchr(name, '#'); nextpair = nvlist_next_nvpair(innvl, pair); if (hashp == NULL) { error = SET_ERROR(EINVAL); break; } *hashp = '\0'; error = zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_DESTROY, cr); *hashp = '#'; if (error == ENOENT) { /* * Ignore any filesystems that don't exist (we consider * their bookmarks "already destroyed"). Remove * the name from the nvl here in case the filesystem * is created between now and when we try to destroy * the bookmark (in which case we don't want to * destroy it since we haven't checked for permission). */ fnvlist_remove_nvpair(innvl, pair); error = 0; } if (error != 0) break; } return (error); } /* ARGSUSED */ static int zfs_secpolicy_log_history(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { /* * Even root must have a proper TSD so that we know what pool * to log to. */ if (tsd_get(zfs_allow_log_key) == NULL) return (SET_ERROR(EPERM)); return (0); } static int zfs_secpolicy_create_clone(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { char parentname[MAXNAMELEN]; int error; char *origin; if ((error = zfs_get_parent(zc->zc_name, parentname, sizeof (parentname))) != 0) return (error); if (nvlist_lookup_string(innvl, "origin", &origin) == 0 && (error = zfs_secpolicy_write_perms(origin, ZFS_DELEG_PERM_CLONE, cr)) != 0) return (error); if ((error = zfs_secpolicy_write_perms(parentname, ZFS_DELEG_PERM_CREATE, cr)) != 0) return (error); return (zfs_secpolicy_write_perms(parentname, ZFS_DELEG_PERM_MOUNT, cr)); } /* * Policy for pool operations - create/destroy pools, add vdevs, etc. Requires * SYS_CONFIG privilege, which is not available in a local zone. */ /* ARGSUSED */ static int zfs_secpolicy_config(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { if (secpolicy_sys_config(cr, B_FALSE) != 0) return (SET_ERROR(EPERM)); return (0); } /* * Policy for object to name lookups. */ /* ARGSUSED */ static int zfs_secpolicy_diff(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { int error; if ((error = secpolicy_sys_config(cr, B_FALSE)) == 0) return (0); error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_DIFF, cr); return (error); } /* * Policy for fault injection. Requires all privileges. */ /* ARGSUSED */ static int zfs_secpolicy_inject(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (secpolicy_zinject(cr)); } /* ARGSUSED */ static int zfs_secpolicy_inherit_prop(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { zfs_prop_t prop = zfs_name_to_prop(zc->zc_value); if (prop == ZPROP_INVAL) { if (!zfs_prop_user(zc->zc_value)) return (SET_ERROR(EINVAL)); return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_USERPROP, cr)); } else { return (zfs_secpolicy_setprop(zc->zc_name, prop, NULL, cr)); } } static int zfs_secpolicy_userspace_one(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { int err = zfs_secpolicy_read(zc, innvl, cr); if (err) return (err); if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS) return (SET_ERROR(EINVAL)); if (zc->zc_value[0] == 0) { /* * They are asking about a posix uid/gid. If it's * themself, allow it. */ if (zc->zc_objset_type == ZFS_PROP_USERUSED || zc->zc_objset_type == ZFS_PROP_USERQUOTA) { if (zc->zc_guid == crgetuid(cr)) return (0); } else { if (groupmember(zc->zc_guid, cr)) return (0); } } return (zfs_secpolicy_write_perms(zc->zc_name, userquota_perms[zc->zc_objset_type], cr)); } static int zfs_secpolicy_userspace_many(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { int err = zfs_secpolicy_read(zc, innvl, cr); if (err) return (err); if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS) return (SET_ERROR(EINVAL)); return (zfs_secpolicy_write_perms(zc->zc_name, userquota_perms[zc->zc_objset_type], cr)); } /* ARGSUSED */ static int zfs_secpolicy_userspace_upgrade(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (zfs_secpolicy_setprop(zc->zc_name, ZFS_PROP_VERSION, NULL, cr)); } /* ARGSUSED */ static int zfs_secpolicy_hold(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { nvpair_t *pair; nvlist_t *holds; int error; error = nvlist_lookup_nvlist(innvl, "holds", &holds); if (error != 0) return (SET_ERROR(EINVAL)); for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL; pair = nvlist_next_nvpair(holds, pair)) { char fsname[MAXNAMELEN]; error = dmu_fsname(nvpair_name(pair), fsname); if (error != 0) return (error); error = zfs_secpolicy_write_perms(fsname, ZFS_DELEG_PERM_HOLD, cr); if (error != 0) return (error); } return (0); } /* ARGSUSED */ static int zfs_secpolicy_release(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { nvpair_t *pair; int error; for (pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) { char fsname[MAXNAMELEN]; error = dmu_fsname(nvpair_name(pair), fsname); if (error != 0) return (error); error = zfs_secpolicy_write_perms(fsname, ZFS_DELEG_PERM_RELEASE, cr); if (error != 0) return (error); } return (0); } /* * Policy for allowing temporary snapshots to be taken or released */ static int zfs_secpolicy_tmp_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { /* * A temporary snapshot is the same as a snapshot, * hold, destroy and release all rolled into one. * Delegated diff alone is sufficient that we allow this. */ int error; if ((error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_DIFF, cr)) == 0) return (0); error = zfs_secpolicy_snapshot_perms(zc->zc_name, cr); if (error == 0) error = zfs_secpolicy_hold(zc, innvl, cr); if (error == 0) error = zfs_secpolicy_release(zc, innvl, cr); if (error == 0) error = zfs_secpolicy_destroy(zc, innvl, cr); return (error); } /* * Returns the nvlist as specified by the user in the zfs_cmd_t. */ static int get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp) { char *packed; int error; nvlist_t *list = NULL; /* * Read in and unpack the user-supplied nvlist. */ if (size == 0) return (SET_ERROR(EINVAL)); packed = kmem_alloc(size, KM_SLEEP); if ((error = ddi_copyin((void *)(uintptr_t)nvl, packed, size, iflag)) != 0) { kmem_free(packed, size); return (SET_ERROR(EFAULT)); } if ((error = nvlist_unpack(packed, size, &list, 0)) != 0) { kmem_free(packed, size); return (error); } kmem_free(packed, size); *nvp = list; return (0); } /* * Reduce the size of this nvlist until it can be serialized in 'max' bytes. * Entries will be removed from the end of the nvlist, and one int32 entry * named "N_MORE_ERRORS" will be added indicating how many entries were * removed. */ static int nvlist_smush(nvlist_t *errors, size_t max) { size_t size; size = fnvlist_size(errors); if (size > max) { nvpair_t *more_errors; int n = 0; if (max < 1024) return (SET_ERROR(ENOMEM)); fnvlist_add_int32(errors, ZPROP_N_MORE_ERRORS, 0); more_errors = nvlist_prev_nvpair(errors, NULL); do { nvpair_t *pair = nvlist_prev_nvpair(errors, more_errors); fnvlist_remove_nvpair(errors, pair); n++; size = fnvlist_size(errors); } while (size > max); fnvlist_remove_nvpair(errors, more_errors); fnvlist_add_int32(errors, ZPROP_N_MORE_ERRORS, n); ASSERT3U(fnvlist_size(errors), <=, max); } return (0); } static int put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl) { char *packed = NULL; int error = 0; size_t size; size = fnvlist_size(nvl); if (size > zc->zc_nvlist_dst_size) { /* * Solaris returns ENOMEM here, because even if an error is * returned from an ioctl(2), new zc_nvlist_dst_size will be * passed to the userland. This is not the case for FreeBSD. * We need to return 0, so the kernel will copy the * zc_nvlist_dst_size back and the userland can discover that a * bigger buffer is needed. */ error = 0; } else { packed = fnvlist_pack(nvl, &size); if (ddi_copyout(packed, (void *)(uintptr_t)zc->zc_nvlist_dst, size, zc->zc_iflags) != 0) error = SET_ERROR(EFAULT); fnvlist_pack_free(packed, size); } zc->zc_nvlist_dst_size = size; zc->zc_nvlist_dst_filled = B_TRUE; return (error); } static int getzfsvfs(const char *dsname, zfsvfs_t **zfvp) { objset_t *os; int error; error = dmu_objset_hold(dsname, FTAG, &os); if (error != 0) return (error); if (dmu_objset_type(os) != DMU_OST_ZFS) { dmu_objset_rele(os, FTAG); return (SET_ERROR(EINVAL)); } mutex_enter(&os->os_user_ptr_lock); *zfvp = dmu_objset_get_user(os); if (*zfvp) { VFS_HOLD((*zfvp)->z_vfs); } else { error = SET_ERROR(ESRCH); } mutex_exit(&os->os_user_ptr_lock); dmu_objset_rele(os, FTAG); return (error); } /* * Find a zfsvfs_t for a mounted filesystem, or create our own, in which * case its z_vfs will be NULL, and it will be opened as the owner. * If 'writer' is set, the z_teardown_lock will be held for RW_WRITER, * which prevents all vnode ops from running. */ static int zfsvfs_hold(const char *name, void *tag, zfsvfs_t **zfvp, boolean_t writer) { int error = 0; if (getzfsvfs(name, zfvp) != 0) error = zfsvfs_create(name, zfvp); if (error == 0) { rrm_enter(&(*zfvp)->z_teardown_lock, (writer) ? RW_WRITER : RW_READER, tag); if ((*zfvp)->z_unmounted) { /* * XXX we could probably try again, since the unmounting * thread should be just about to disassociate the * objset from the zfsvfs. */ rrm_exit(&(*zfvp)->z_teardown_lock, tag); return (SET_ERROR(EBUSY)); } } return (error); } static void zfsvfs_rele(zfsvfs_t *zfsvfs, void *tag) { rrm_exit(&zfsvfs->z_teardown_lock, tag); if (zfsvfs->z_vfs) { VFS_RELE(zfsvfs->z_vfs); } else { dmu_objset_disown(zfsvfs->z_os, zfsvfs); zfsvfs_free(zfsvfs); } } static int zfs_ioc_pool_create(zfs_cmd_t *zc) { int error; nvlist_t *config, *props = NULL; nvlist_t *rootprops = NULL; nvlist_t *zplprops = NULL; if (error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &config)) return (error); if (zc->zc_nvlist_src_size != 0 && (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &props))) { nvlist_free(config); return (error); } if (props) { nvlist_t *nvl = NULL; uint64_t version = SPA_VERSION; (void) nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION), &version); if (!SPA_VERSION_IS_SUPPORTED(version)) { error = SET_ERROR(EINVAL); goto pool_props_bad; } (void) nvlist_lookup_nvlist(props, ZPOOL_ROOTFS_PROPS, &nvl); if (nvl) { error = nvlist_dup(nvl, &rootprops, KM_SLEEP); if (error != 0) { nvlist_free(config); nvlist_free(props); return (error); } (void) nvlist_remove_all(props, ZPOOL_ROOTFS_PROPS); } VERIFY(nvlist_alloc(&zplprops, NV_UNIQUE_NAME, KM_SLEEP) == 0); error = zfs_fill_zplprops_root(version, rootprops, zplprops, NULL); if (error != 0) goto pool_props_bad; } error = spa_create(zc->zc_name, config, props, zplprops); /* * Set the remaining root properties */ if (!error && (error = zfs_set_prop_nvlist(zc->zc_name, ZPROP_SRC_LOCAL, rootprops, NULL)) != 0) (void) spa_destroy(zc->zc_name); pool_props_bad: nvlist_free(rootprops); nvlist_free(zplprops); nvlist_free(config); nvlist_free(props); return (error); } static int zfs_ioc_pool_destroy(zfs_cmd_t *zc) { int error; zfs_log_history(zc); error = spa_destroy(zc->zc_name); if (error == 0) zvol_remove_minors(zc->zc_name); return (error); } static int zfs_ioc_pool_import(zfs_cmd_t *zc) { nvlist_t *config, *props = NULL; uint64_t guid; int error; if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &config)) != 0) return (error); if (zc->zc_nvlist_src_size != 0 && (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &props))) { nvlist_free(config); return (error); } if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) != 0 || guid != zc->zc_guid) error = SET_ERROR(EINVAL); else error = spa_import(zc->zc_name, config, props, zc->zc_cookie); if (zc->zc_nvlist_dst != 0) { int err; if ((err = put_nvlist(zc, config)) != 0) error = err; } nvlist_free(config); if (props) nvlist_free(props); return (error); } static int zfs_ioc_pool_export(zfs_cmd_t *zc) { int error; boolean_t force = (boolean_t)zc->zc_cookie; boolean_t hardforce = (boolean_t)zc->zc_guid; zfs_log_history(zc); error = spa_export(zc->zc_name, NULL, force, hardforce); if (error == 0) zvol_remove_minors(zc->zc_name); return (error); } static int zfs_ioc_pool_configs(zfs_cmd_t *zc) { nvlist_t *configs; int error; if ((configs = spa_all_configs(&zc->zc_cookie)) == NULL) return (SET_ERROR(EEXIST)); error = put_nvlist(zc, configs); nvlist_free(configs); return (error); } /* * inputs: * zc_name name of the pool * * outputs: * zc_cookie real errno * zc_nvlist_dst config nvlist * zc_nvlist_dst_size size of config nvlist */ static int zfs_ioc_pool_stats(zfs_cmd_t *zc) { nvlist_t *config; int error; int ret = 0; error = spa_get_stats(zc->zc_name, &config, zc->zc_value, sizeof (zc->zc_value)); if (config != NULL) { ret = put_nvlist(zc, config); nvlist_free(config); /* * The config may be present even if 'error' is non-zero. * In this case we return success, and preserve the real errno * in 'zc_cookie'. */ zc->zc_cookie = error; } else { ret = error; } return (ret); } /* * Try to import the given pool, returning pool stats as appropriate so that * user land knows which devices are available and overall pool health. */ static int zfs_ioc_pool_tryimport(zfs_cmd_t *zc) { nvlist_t *tryconfig, *config; int error; if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &tryconfig)) != 0) return (error); config = spa_tryimport(tryconfig); nvlist_free(tryconfig); if (config == NULL) return (SET_ERROR(EINVAL)); error = put_nvlist(zc, config); nvlist_free(config); return (error); } /* * inputs: * zc_name name of the pool * zc_cookie scan func (pool_scan_func_t) */ static int zfs_ioc_pool_scan(zfs_cmd_t *zc) { spa_t *spa; int error; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); if (zc->zc_cookie == POOL_SCAN_NONE) error = spa_scan_stop(spa); else error = spa_scan(spa, zc->zc_cookie); spa_close(spa, FTAG); return (error); } static int zfs_ioc_pool_freeze(zfs_cmd_t *zc) { spa_t *spa; int error; error = spa_open(zc->zc_name, &spa, FTAG); if (error == 0) { spa_freeze(spa); spa_close(spa, FTAG); } return (error); } static int zfs_ioc_pool_upgrade(zfs_cmd_t *zc) { spa_t *spa; int error; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); if (zc->zc_cookie < spa_version(spa) || !SPA_VERSION_IS_SUPPORTED(zc->zc_cookie)) { spa_close(spa, FTAG); return (SET_ERROR(EINVAL)); } spa_upgrade(spa, zc->zc_cookie); spa_close(spa, FTAG); return (error); } static int zfs_ioc_pool_get_history(zfs_cmd_t *zc) { spa_t *spa; char *hist_buf; uint64_t size; int error; if ((size = zc->zc_history_len) == 0) return (SET_ERROR(EINVAL)); if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } hist_buf = kmem_alloc(size, KM_SLEEP); if ((error = spa_history_get(spa, &zc->zc_history_offset, &zc->zc_history_len, hist_buf)) == 0) { error = ddi_copyout(hist_buf, (void *)(uintptr_t)zc->zc_history, zc->zc_history_len, zc->zc_iflags); } spa_close(spa, FTAG); kmem_free(hist_buf, size); return (error); } static int zfs_ioc_pool_reguid(zfs_cmd_t *zc) { spa_t *spa; int error; error = spa_open(zc->zc_name, &spa, FTAG); if (error == 0) { error = spa_change_guid(spa); spa_close(spa, FTAG); } return (error); } static int zfs_ioc_dsobj_to_dsname(zfs_cmd_t *zc) { return (dsl_dsobj_to_dsname(zc->zc_name, zc->zc_obj, zc->zc_value)); } /* * inputs: * zc_name name of filesystem * zc_obj object to find * * outputs: * zc_value name of object */ static int zfs_ioc_obj_to_path(zfs_cmd_t *zc) { objset_t *os; int error; /* XXX reading from objset not owned */ if ((error = dmu_objset_hold(zc->zc_name, FTAG, &os)) != 0) return (error); if (dmu_objset_type(os) != DMU_OST_ZFS) { dmu_objset_rele(os, FTAG); return (SET_ERROR(EINVAL)); } error = zfs_obj_to_path(os, zc->zc_obj, zc->zc_value, sizeof (zc->zc_value)); dmu_objset_rele(os, FTAG); return (error); } /* * inputs: * zc_name name of filesystem * zc_obj object to find * * outputs: * zc_stat stats on object * zc_value path to object */ static int zfs_ioc_obj_to_stats(zfs_cmd_t *zc) { objset_t *os; int error; /* XXX reading from objset not owned */ if ((error = dmu_objset_hold(zc->zc_name, FTAG, &os)) != 0) return (error); if (dmu_objset_type(os) != DMU_OST_ZFS) { dmu_objset_rele(os, FTAG); return (SET_ERROR(EINVAL)); } error = zfs_obj_to_stats(os, zc->zc_obj, &zc->zc_stat, zc->zc_value, sizeof (zc->zc_value)); dmu_objset_rele(os, FTAG); return (error); } static int zfs_ioc_vdev_add(zfs_cmd_t *zc) { spa_t *spa; int error; nvlist_t *config, **l2cache, **spares; uint_t nl2cache = 0, nspares = 0; error = spa_open(zc->zc_name, &spa, FTAG); if (error != 0) return (error); error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &config); (void) nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache); (void) nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_SPARES, &spares, &nspares); #ifdef illumos /* * A root pool with concatenated devices is not supported. * Thus, can not add a device to a root pool. * * Intent log device can not be added to a rootpool because * during mountroot, zil is replayed, a seperated log device * can not be accessed during the mountroot time. * * l2cache and spare devices are ok to be added to a rootpool. */ if (spa_bootfs(spa) != 0 && nl2cache == 0 && nspares == 0) { nvlist_free(config); spa_close(spa, FTAG); return (SET_ERROR(EDOM)); } #endif /* illumos */ if (error == 0) { error = spa_vdev_add(spa, config); nvlist_free(config); } spa_close(spa, FTAG); return (error); } /* * inputs: * zc_name name of the pool * zc_nvlist_conf nvlist of devices to remove * zc_cookie to stop the remove? */ static int zfs_ioc_vdev_remove(zfs_cmd_t *zc) { spa_t *spa; int error; error = spa_open(zc->zc_name, &spa, FTAG); if (error != 0) return (error); error = spa_vdev_remove(spa, zc->zc_guid, B_FALSE); spa_close(spa, FTAG); return (error); } static int zfs_ioc_vdev_set_state(zfs_cmd_t *zc) { spa_t *spa; int error; vdev_state_t newstate = VDEV_STATE_UNKNOWN; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); switch (zc->zc_cookie) { case VDEV_STATE_ONLINE: error = vdev_online(spa, zc->zc_guid, zc->zc_obj, &newstate); break; case VDEV_STATE_OFFLINE: error = vdev_offline(spa, zc->zc_guid, zc->zc_obj); break; case VDEV_STATE_FAULTED: if (zc->zc_obj != VDEV_AUX_ERR_EXCEEDED && zc->zc_obj != VDEV_AUX_EXTERNAL) zc->zc_obj = VDEV_AUX_ERR_EXCEEDED; error = vdev_fault(spa, zc->zc_guid, zc->zc_obj); break; case VDEV_STATE_DEGRADED: if (zc->zc_obj != VDEV_AUX_ERR_EXCEEDED && zc->zc_obj != VDEV_AUX_EXTERNAL) zc->zc_obj = VDEV_AUX_ERR_EXCEEDED; error = vdev_degrade(spa, zc->zc_guid, zc->zc_obj); break; default: error = SET_ERROR(EINVAL); } zc->zc_cookie = newstate; spa_close(spa, FTAG); return (error); } static int zfs_ioc_vdev_attach(zfs_cmd_t *zc) { spa_t *spa; int replacing = zc->zc_cookie; nvlist_t *config; int error; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &config)) == 0) { error = spa_vdev_attach(spa, zc->zc_guid, config, replacing); nvlist_free(config); } spa_close(spa, FTAG); return (error); } static int zfs_ioc_vdev_detach(zfs_cmd_t *zc) { spa_t *spa; int error; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); error = spa_vdev_detach(spa, zc->zc_guid, 0, B_FALSE); spa_close(spa, FTAG); return (error); } static int zfs_ioc_vdev_split(zfs_cmd_t *zc) { spa_t *spa; nvlist_t *config, *props = NULL; int error; boolean_t exp = !!(zc->zc_cookie & ZPOOL_EXPORT_AFTER_SPLIT); if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); if (error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &config)) { spa_close(spa, FTAG); return (error); } if (zc->zc_nvlist_src_size != 0 && (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &props))) { spa_close(spa, FTAG); nvlist_free(config); return (error); } error = spa_vdev_split_mirror(spa, zc->zc_string, config, props, exp); spa_close(spa, FTAG); nvlist_free(config); nvlist_free(props); return (error); } static int zfs_ioc_vdev_setpath(zfs_cmd_t *zc) { spa_t *spa; char *path = zc->zc_value; uint64_t guid = zc->zc_guid; int error; error = spa_open(zc->zc_name, &spa, FTAG); if (error != 0) return (error); error = spa_vdev_setpath(spa, guid, path); spa_close(spa, FTAG); return (error); } static int zfs_ioc_vdev_setfru(zfs_cmd_t *zc) { spa_t *spa; char *fru = zc->zc_value; uint64_t guid = zc->zc_guid; int error; error = spa_open(zc->zc_name, &spa, FTAG); if (error != 0) return (error); error = spa_vdev_setfru(spa, guid, fru); spa_close(spa, FTAG); return (error); } static int zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os) { int error = 0; nvlist_t *nv; dmu_objset_fast_stat(os, &zc->zc_objset_stats); if (zc->zc_nvlist_dst != 0 && (error = dsl_prop_get_all(os, &nv)) == 0) { dmu_objset_stats(os, nv); /* * NB: zvol_get_stats() will read the objset contents, * which we aren't supposed to do with a * DS_MODE_USER hold, because it could be * inconsistent. So this is a bit of a workaround... * XXX reading with out owning */ if (!zc->zc_objset_stats.dds_inconsistent && dmu_objset_type(os) == DMU_OST_ZVOL) { error = zvol_get_stats(os, nv); if (error == EIO) return (error); VERIFY0(error); } error = put_nvlist(zc, nv); nvlist_free(nv); } return (error); } /* * inputs: * zc_name name of filesystem * zc_nvlist_dst_size size of buffer for property nvlist * * outputs: * zc_objset_stats stats * zc_nvlist_dst property nvlist * zc_nvlist_dst_size size of property nvlist */ static int zfs_ioc_objset_stats(zfs_cmd_t *zc) { objset_t *os; int error; error = dmu_objset_hold(zc->zc_name, FTAG, &os); if (error == 0) { error = zfs_ioc_objset_stats_impl(zc, os); dmu_objset_rele(os, FTAG); } if (error == ENOMEM) error = 0; return (error); } /* * inputs: * zc_name name of filesystem * zc_nvlist_dst_size size of buffer for property nvlist * * outputs: * zc_nvlist_dst received property nvlist * zc_nvlist_dst_size size of received property nvlist * * Gets received properties (distinct from local properties on or after * SPA_VERSION_RECVD_PROPS) for callers who want to differentiate received from * local property values. */ static int zfs_ioc_objset_recvd_props(zfs_cmd_t *zc) { int error = 0; nvlist_t *nv; /* * Without this check, we would return local property values if the * caller has not already received properties on or after * SPA_VERSION_RECVD_PROPS. */ if (!dsl_prop_get_hasrecvd(zc->zc_name)) return (SET_ERROR(ENOTSUP)); if (zc->zc_nvlist_dst != 0 && (error = dsl_prop_get_received(zc->zc_name, &nv)) == 0) { error = put_nvlist(zc, nv); nvlist_free(nv); } return (error); } static int nvl_add_zplprop(objset_t *os, nvlist_t *props, zfs_prop_t prop) { uint64_t value; int error; /* * zfs_get_zplprop() will either find a value or give us * the default value (if there is one). */ if ((error = zfs_get_zplprop(os, prop, &value)) != 0) return (error); VERIFY(nvlist_add_uint64(props, zfs_prop_to_name(prop), value) == 0); return (0); } /* * inputs: * zc_name name of filesystem * zc_nvlist_dst_size size of buffer for zpl property nvlist * * outputs: * zc_nvlist_dst zpl property nvlist * zc_nvlist_dst_size size of zpl property nvlist */ static int zfs_ioc_objset_zplprops(zfs_cmd_t *zc) { objset_t *os; int err; /* XXX reading without owning */ if (err = dmu_objset_hold(zc->zc_name, FTAG, &os)) return (err); dmu_objset_fast_stat(os, &zc->zc_objset_stats); /* * NB: nvl_add_zplprop() will read the objset contents, * which we aren't supposed to do with a DS_MODE_USER * hold, because it could be inconsistent. */ if (zc->zc_nvlist_dst != 0 && !zc->zc_objset_stats.dds_inconsistent && dmu_objset_type(os) == DMU_OST_ZFS) { nvlist_t *nv; VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); if ((err = nvl_add_zplprop(os, nv, ZFS_PROP_VERSION)) == 0 && (err = nvl_add_zplprop(os, nv, ZFS_PROP_NORMALIZE)) == 0 && (err = nvl_add_zplprop(os, nv, ZFS_PROP_UTF8ONLY)) == 0 && (err = nvl_add_zplprop(os, nv, ZFS_PROP_CASE)) == 0) err = put_nvlist(zc, nv); nvlist_free(nv); } else { err = SET_ERROR(ENOENT); } dmu_objset_rele(os, FTAG); return (err); } boolean_t dataset_name_hidden(const char *name) { /* * Skip over datasets that are not visible in this zone, * internal datasets (which have a $ in their name), and * temporary datasets (which have a % in their name). */ if (strchr(name, '$') != NULL) return (B_TRUE); if (strchr(name, '%') != NULL) return (B_TRUE); if (!INGLOBALZONE(curthread) && !zone_dataset_visible(name, NULL)) return (B_TRUE); return (B_FALSE); } /* * inputs: * zc_name name of filesystem * zc_cookie zap cursor * zc_nvlist_dst_size size of buffer for property nvlist * * outputs: * zc_name name of next filesystem * zc_cookie zap cursor * zc_objset_stats stats * zc_nvlist_dst property nvlist * zc_nvlist_dst_size size of property nvlist */ static int zfs_ioc_dataset_list_next(zfs_cmd_t *zc) { objset_t *os; int error; char *p; size_t orig_len = strlen(zc->zc_name); top: if (error = dmu_objset_hold(zc->zc_name, FTAG, &os)) { if (error == ENOENT) error = SET_ERROR(ESRCH); return (error); } p = strrchr(zc->zc_name, '/'); if (p == NULL || p[1] != '\0') (void) strlcat(zc->zc_name, "/", sizeof (zc->zc_name)); p = zc->zc_name + strlen(zc->zc_name); do { error = dmu_dir_list_next(os, sizeof (zc->zc_name) - (p - zc->zc_name), p, NULL, &zc->zc_cookie); if (error == ENOENT) error = SET_ERROR(ESRCH); } while (error == 0 && dataset_name_hidden(zc->zc_name)); dmu_objset_rele(os, FTAG); /* * If it's an internal dataset (ie. with a '$' in its name), * don't try to get stats for it, otherwise we'll return ENOENT. */ if (error == 0 && strchr(zc->zc_name, '$') == NULL) { error = zfs_ioc_objset_stats(zc); /* fill in the stats */ if (error == ENOENT) { /* We lost a race with destroy, get the next one. */ zc->zc_name[orig_len] = '\0'; goto top; } } return (error); } /* * inputs: * zc_name name of filesystem * zc_cookie zap cursor * zc_nvlist_dst_size size of buffer for property nvlist * zc_simple when set, only name is requested * * outputs: * zc_name name of next snapshot * zc_objset_stats stats * zc_nvlist_dst property nvlist * zc_nvlist_dst_size size of property nvlist */ static int zfs_ioc_snapshot_list_next(zfs_cmd_t *zc) { objset_t *os; int error; error = dmu_objset_hold(zc->zc_name, FTAG, &os); if (error != 0) { return (error == ENOENT ? ESRCH : error); } /* * A dataset name of maximum length cannot have any snapshots, * so exit immediately. */ if (strlcat(zc->zc_name, "@", sizeof (zc->zc_name)) >= MAXNAMELEN) { dmu_objset_rele(os, FTAG); return (SET_ERROR(ESRCH)); } error = dmu_snapshot_list_next(os, sizeof (zc->zc_name) - strlen(zc->zc_name), zc->zc_name + strlen(zc->zc_name), &zc->zc_obj, &zc->zc_cookie, NULL); if (error == 0 && !zc->zc_simple) { dsl_dataset_t *ds; dsl_pool_t *dp = os->os_dsl_dataset->ds_dir->dd_pool; error = dsl_dataset_hold_obj(dp, zc->zc_obj, FTAG, &ds); if (error == 0) { objset_t *ossnap; error = dmu_objset_from_ds(ds, &ossnap); if (error == 0) error = zfs_ioc_objset_stats_impl(zc, ossnap); dsl_dataset_rele(ds, FTAG); } } else if (error == ENOENT) { error = SET_ERROR(ESRCH); } dmu_objset_rele(os, FTAG); /* if we failed, undo the @ that we tacked on to zc_name */ if (error != 0) *strchr(zc->zc_name, '@') = '\0'; return (error); } static int zfs_prop_set_userquota(const char *dsname, nvpair_t *pair) { const char *propname = nvpair_name(pair); uint64_t *valary; unsigned int vallen; const char *domain; char *dash; zfs_userquota_prop_t type; uint64_t rid; uint64_t quota; zfsvfs_t *zfsvfs; int err; if (nvpair_type(pair) == DATA_TYPE_NVLIST) { nvlist_t *attrs; VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); if (nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &pair) != 0) return (SET_ERROR(EINVAL)); } /* * A correctly constructed propname is encoded as * userquota@-. */ if ((dash = strchr(propname, '-')) == NULL || nvpair_value_uint64_array(pair, &valary, &vallen) != 0 || vallen != 3) return (SET_ERROR(EINVAL)); domain = dash + 1; type = valary[0]; rid = valary[1]; quota = valary[2]; err = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_FALSE); if (err == 0) { err = zfs_set_userquota(zfsvfs, type, domain, rid, quota); zfsvfs_rele(zfsvfs, FTAG); } return (err); } /* * If the named property is one that has a special function to set its value, * return 0 on success and a positive error code on failure; otherwise if it is * not one of the special properties handled by this function, return -1. * * XXX: It would be better for callers of the property interface if we handled * these special cases in dsl_prop.c (in the dsl layer). */ static int zfs_prop_set_special(const char *dsname, zprop_source_t source, nvpair_t *pair) { const char *propname = nvpair_name(pair); zfs_prop_t prop = zfs_name_to_prop(propname); uint64_t intval; int err = -1; if (prop == ZPROP_INVAL) { if (zfs_prop_userquota(propname)) return (zfs_prop_set_userquota(dsname, pair)); return (-1); } if (nvpair_type(pair) == DATA_TYPE_NVLIST) { nvlist_t *attrs; VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &pair) == 0); } if (zfs_prop_get_type(prop) == PROP_TYPE_STRING) return (-1); VERIFY(0 == nvpair_value_uint64(pair, &intval)); switch (prop) { case ZFS_PROP_QUOTA: err = dsl_dir_set_quota(dsname, source, intval); break; case ZFS_PROP_REFQUOTA: err = dsl_dataset_set_refquota(dsname, source, intval); break; case ZFS_PROP_FILESYSTEM_LIMIT: case ZFS_PROP_SNAPSHOT_LIMIT: if (intval == UINT64_MAX) { /* clearing the limit, just do it */ err = 0; } else { err = dsl_dir_activate_fs_ss_limit(dsname); } /* * Set err to -1 to force the zfs_set_prop_nvlist code down the * default path to set the value in the nvlist. */ if (err == 0) err = -1; break; case ZFS_PROP_RESERVATION: err = dsl_dir_set_reservation(dsname, source, intval); break; case ZFS_PROP_REFRESERVATION: err = dsl_dataset_set_refreservation(dsname, source, intval); break; case ZFS_PROP_VOLSIZE: err = zvol_set_volsize(dsname, intval); break; case ZFS_PROP_VERSION: { zfsvfs_t *zfsvfs; if ((err = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_TRUE)) != 0) break; err = zfs_set_version(zfsvfs, intval); zfsvfs_rele(zfsvfs, FTAG); if (err == 0 && intval >= ZPL_VERSION_USERSPACE) { zfs_cmd_t *zc; zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP); (void) strcpy(zc->zc_name, dsname); (void) zfs_ioc_userspace_upgrade(zc); kmem_free(zc, sizeof (zfs_cmd_t)); } break; } default: err = -1; } return (err); } /* * This function is best effort. If it fails to set any of the given properties, * it continues to set as many as it can and returns the last error * encountered. If the caller provides a non-NULL errlist, it will be filled in * with the list of names of all the properties that failed along with the * corresponding error numbers. * * If every property is set successfully, zero is returned and errlist is not * modified. */ int zfs_set_prop_nvlist(const char *dsname, zprop_source_t source, nvlist_t *nvl, nvlist_t *errlist) { nvpair_t *pair; nvpair_t *propval; int rv = 0; uint64_t intval; char *strval; nvlist_t *genericnvl = fnvlist_alloc(); nvlist_t *retrynvl = fnvlist_alloc(); retry: pair = NULL; while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) { const char *propname = nvpair_name(pair); zfs_prop_t prop = zfs_name_to_prop(propname); int err = 0; /* decode the property value */ propval = pair; if (nvpair_type(pair) == DATA_TYPE_NVLIST) { nvlist_t *attrs; attrs = fnvpair_value_nvlist(pair); if (nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &propval) != 0) err = SET_ERROR(EINVAL); } /* Validate value type */ if (err == 0 && prop == ZPROP_INVAL) { if (zfs_prop_user(propname)) { if (nvpair_type(propval) != DATA_TYPE_STRING) err = SET_ERROR(EINVAL); } else if (zfs_prop_userquota(propname)) { if (nvpair_type(propval) != DATA_TYPE_UINT64_ARRAY) err = SET_ERROR(EINVAL); } else { err = SET_ERROR(EINVAL); } } else if (err == 0) { if (nvpair_type(propval) == DATA_TYPE_STRING) { if (zfs_prop_get_type(prop) != PROP_TYPE_STRING) err = SET_ERROR(EINVAL); } else if (nvpair_type(propval) == DATA_TYPE_UINT64) { const char *unused; intval = fnvpair_value_uint64(propval); switch (zfs_prop_get_type(prop)) { case PROP_TYPE_NUMBER: break; case PROP_TYPE_STRING: err = SET_ERROR(EINVAL); break; case PROP_TYPE_INDEX: if (zfs_prop_index_to_string(prop, intval, &unused) != 0) err = SET_ERROR(EINVAL); break; default: cmn_err(CE_PANIC, "unknown property type"); } } else { err = SET_ERROR(EINVAL); } } /* Validate permissions */ if (err == 0) err = zfs_check_settable(dsname, pair, CRED()); if (err == 0) { err = zfs_prop_set_special(dsname, source, pair); if (err == -1) { /* * For better performance we build up a list of * properties to set in a single transaction. */ err = nvlist_add_nvpair(genericnvl, pair); } else if (err != 0 && nvl != retrynvl) { /* * This may be a spurious error caused by * receiving quota and reservation out of order. * Try again in a second pass. */ err = nvlist_add_nvpair(retrynvl, pair); } } if (err != 0) { if (errlist != NULL) fnvlist_add_int32(errlist, propname, err); rv = err; } } if (nvl != retrynvl && !nvlist_empty(retrynvl)) { nvl = retrynvl; goto retry; } if (!nvlist_empty(genericnvl) && dsl_props_set(dsname, source, genericnvl) != 0) { /* * If this fails, we still want to set as many properties as we * can, so try setting them individually. */ pair = NULL; while ((pair = nvlist_next_nvpair(genericnvl, pair)) != NULL) { const char *propname = nvpair_name(pair); int err = 0; propval = pair; if (nvpair_type(pair) == DATA_TYPE_NVLIST) { nvlist_t *attrs; attrs = fnvpair_value_nvlist(pair); propval = fnvlist_lookup_nvpair(attrs, ZPROP_VALUE); } if (nvpair_type(propval) == DATA_TYPE_STRING) { strval = fnvpair_value_string(propval); err = dsl_prop_set_string(dsname, propname, source, strval); } else { intval = fnvpair_value_uint64(propval); err = dsl_prop_set_int(dsname, propname, source, intval); } if (err != 0) { if (errlist != NULL) { fnvlist_add_int32(errlist, propname, err); } rv = err; } } } nvlist_free(genericnvl); nvlist_free(retrynvl); return (rv); } /* * Check that all the properties are valid user properties. */ static int zfs_check_userprops(const char *fsname, nvlist_t *nvl) { nvpair_t *pair = NULL; int error = 0; while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) { const char *propname = nvpair_name(pair); if (!zfs_prop_user(propname) || nvpair_type(pair) != DATA_TYPE_STRING) return (SET_ERROR(EINVAL)); if (error = zfs_secpolicy_write_perms(fsname, ZFS_DELEG_PERM_USERPROP, CRED())) return (error); if (strlen(propname) >= ZAP_MAXNAMELEN) return (SET_ERROR(ENAMETOOLONG)); if (strlen(fnvpair_value_string(pair)) >= ZAP_MAXVALUELEN) return (E2BIG); } return (0); } static void props_skip(nvlist_t *props, nvlist_t *skipped, nvlist_t **newprops) { nvpair_t *pair; VERIFY(nvlist_alloc(newprops, NV_UNIQUE_NAME, KM_SLEEP) == 0); pair = NULL; while ((pair = nvlist_next_nvpair(props, pair)) != NULL) { if (nvlist_exists(skipped, nvpair_name(pair))) continue; VERIFY(nvlist_add_nvpair(*newprops, pair) == 0); } } static int clear_received_props(const char *dsname, nvlist_t *props, nvlist_t *skipped) { int err = 0; nvlist_t *cleared_props = NULL; props_skip(props, skipped, &cleared_props); if (!nvlist_empty(cleared_props)) { /* * Acts on local properties until the dataset has received * properties at least once on or after SPA_VERSION_RECVD_PROPS. */ zprop_source_t flags = (ZPROP_SRC_NONE | (dsl_prop_get_hasrecvd(dsname) ? ZPROP_SRC_RECEIVED : 0)); err = zfs_set_prop_nvlist(dsname, flags, cleared_props, NULL); } nvlist_free(cleared_props); return (err); } /* * inputs: * zc_name name of filesystem * zc_value name of property to set * zc_nvlist_src{_size} nvlist of properties to apply * zc_cookie received properties flag * * outputs: * zc_nvlist_dst{_size} error for each unapplied received property */ static int zfs_ioc_set_prop(zfs_cmd_t *zc) { nvlist_t *nvl; boolean_t received = zc->zc_cookie; zprop_source_t source = (received ? ZPROP_SRC_RECEIVED : ZPROP_SRC_LOCAL); nvlist_t *errors; int error; if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &nvl)) != 0) return (error); if (received) { nvlist_t *origprops; if (dsl_prop_get_received(zc->zc_name, &origprops) == 0) { (void) clear_received_props(zc->zc_name, origprops, nvl); nvlist_free(origprops); } error = dsl_prop_set_hasrecvd(zc->zc_name); } errors = fnvlist_alloc(); if (error == 0) error = zfs_set_prop_nvlist(zc->zc_name, source, nvl, errors); if (zc->zc_nvlist_dst != 0 && errors != NULL) { (void) put_nvlist(zc, errors); } nvlist_free(errors); nvlist_free(nvl); return (error); } /* * inputs: * zc_name name of filesystem * zc_value name of property to inherit * zc_cookie revert to received value if TRUE * * outputs: none */ static int zfs_ioc_inherit_prop(zfs_cmd_t *zc) { const char *propname = zc->zc_value; zfs_prop_t prop = zfs_name_to_prop(propname); boolean_t received = zc->zc_cookie; zprop_source_t source = (received ? ZPROP_SRC_NONE /* revert to received value, if any */ : ZPROP_SRC_INHERITED); /* explicitly inherit */ if (received) { nvlist_t *dummy; nvpair_t *pair; zprop_type_t type; int err; /* * zfs_prop_set_special() expects properties in the form of an * nvpair with type info. */ if (prop == ZPROP_INVAL) { if (!zfs_prop_user(propname)) return (SET_ERROR(EINVAL)); type = PROP_TYPE_STRING; } else if (prop == ZFS_PROP_VOLSIZE || prop == ZFS_PROP_VERSION) { return (SET_ERROR(EINVAL)); } else { type = zfs_prop_get_type(prop); } VERIFY(nvlist_alloc(&dummy, NV_UNIQUE_NAME, KM_SLEEP) == 0); switch (type) { case PROP_TYPE_STRING: VERIFY(0 == nvlist_add_string(dummy, propname, "")); break; case PROP_TYPE_NUMBER: case PROP_TYPE_INDEX: VERIFY(0 == nvlist_add_uint64(dummy, propname, 0)); break; default: nvlist_free(dummy); return (SET_ERROR(EINVAL)); } pair = nvlist_next_nvpair(dummy, NULL); err = zfs_prop_set_special(zc->zc_name, source, pair); nvlist_free(dummy); if (err != -1) return (err); /* special property already handled */ } else { /* * Only check this in the non-received case. We want to allow * 'inherit -S' to revert non-inheritable properties like quota * and reservation to the received or default values even though * they are not considered inheritable. */ if (prop != ZPROP_INVAL && !zfs_prop_inheritable(prop)) return (SET_ERROR(EINVAL)); } /* property name has been validated by zfs_secpolicy_inherit_prop() */ return (dsl_prop_inherit(zc->zc_name, zc->zc_value, source)); } static int zfs_ioc_pool_set_props(zfs_cmd_t *zc) { nvlist_t *props; spa_t *spa; int error; nvpair_t *pair; if (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &props)) return (error); /* * If the only property is the configfile, then just do a spa_lookup() * to handle the faulted case. */ pair = nvlist_next_nvpair(props, NULL); if (pair != NULL && strcmp(nvpair_name(pair), zpool_prop_to_name(ZPOOL_PROP_CACHEFILE)) == 0 && nvlist_next_nvpair(props, pair) == NULL) { mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(zc->zc_name)) != NULL) { spa_configfile_set(spa, props, B_FALSE); spa_config_sync(spa, B_FALSE, B_TRUE); } mutex_exit(&spa_namespace_lock); if (spa != NULL) { nvlist_free(props); return (0); } } if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) { nvlist_free(props); return (error); } error = spa_prop_set(spa, props); nvlist_free(props); spa_close(spa, FTAG); return (error); } static int zfs_ioc_pool_get_props(zfs_cmd_t *zc) { spa_t *spa; int error; nvlist_t *nvp = NULL; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) { /* * If the pool is faulted, there may be properties we can still * get (such as altroot and cachefile), so attempt to get them * anyway. */ mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(zc->zc_name)) != NULL) error = spa_prop_get(spa, &nvp); mutex_exit(&spa_namespace_lock); } else { error = spa_prop_get(spa, &nvp); spa_close(spa, FTAG); } if (error == 0 && zc->zc_nvlist_dst != 0) error = put_nvlist(zc, nvp); else error = SET_ERROR(EFAULT); nvlist_free(nvp); return (error); } /* * inputs: * zc_name name of filesystem * zc_nvlist_src{_size} nvlist of delegated permissions * zc_perm_action allow/unallow flag * * outputs: none */ static int zfs_ioc_set_fsacl(zfs_cmd_t *zc) { int error; nvlist_t *fsaclnv = NULL; if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &fsaclnv)) != 0) return (error); /* * Verify nvlist is constructed correctly */ if ((error = zfs_deleg_verify_nvlist(fsaclnv)) != 0) { nvlist_free(fsaclnv); return (SET_ERROR(EINVAL)); } /* * If we don't have PRIV_SYS_MOUNT, then validate * that user is allowed to hand out each permission in * the nvlist(s) */ error = secpolicy_zfs(CRED()); if (error != 0) { if (zc->zc_perm_action == B_FALSE) { error = dsl_deleg_can_allow(zc->zc_name, fsaclnv, CRED()); } else { error = dsl_deleg_can_unallow(zc->zc_name, fsaclnv, CRED()); } } if (error == 0) error = dsl_deleg_set(zc->zc_name, fsaclnv, zc->zc_perm_action); nvlist_free(fsaclnv); return (error); } /* * inputs: * zc_name name of filesystem * * outputs: * zc_nvlist_src{_size} nvlist of delegated permissions */ static int zfs_ioc_get_fsacl(zfs_cmd_t *zc) { nvlist_t *nvp; int error; if ((error = dsl_deleg_get(zc->zc_name, &nvp)) == 0) { error = put_nvlist(zc, nvp); nvlist_free(nvp); } return (error); } /* * Search the vfs list for a specified resource. Returns a pointer to it * or NULL if no suitable entry is found. The caller of this routine * is responsible for releasing the returned vfs pointer. */ static vfs_t * zfs_get_vfs(const char *resource) { vfs_t *vfsp; mtx_lock(&mountlist_mtx); TAILQ_FOREACH(vfsp, &mountlist, mnt_list) { if (strcmp(refstr_value(vfsp->vfs_resource), resource) == 0) { VFS_HOLD(vfsp); break; } } mtx_unlock(&mountlist_mtx); return (vfsp); } /* ARGSUSED */ static void zfs_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) { zfs_creat_t *zct = arg; zfs_create_fs(os, cr, zct->zct_zplprops, tx); } #define ZFS_PROP_UNDEFINED ((uint64_t)-1) /* * inputs: * os parent objset pointer (NULL if root fs) * fuids_ok fuids allowed in this version of the spa? * sa_ok SAs allowed in this version of the spa? * createprops list of properties requested by creator * * outputs: * zplprops values for the zplprops we attach to the master node object * is_ci true if requested file system will be purely case-insensitive * * Determine the settings for utf8only, normalization and * casesensitivity. Specific values may have been requested by the * creator and/or we can inherit values from the parent dataset. If * the file system is of too early a vintage, a creator can not * request settings for these properties, even if the requested * setting is the default value. We don't actually want to create dsl * properties for these, so remove them from the source nvlist after * processing. */ static int zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver, boolean_t fuids_ok, boolean_t sa_ok, nvlist_t *createprops, nvlist_t *zplprops, boolean_t *is_ci) { uint64_t sense = ZFS_PROP_UNDEFINED; uint64_t norm = ZFS_PROP_UNDEFINED; uint64_t u8 = ZFS_PROP_UNDEFINED; ASSERT(zplprops != NULL); /* * Pull out creator prop choices, if any. */ if (createprops) { (void) nvlist_lookup_uint64(createprops, zfs_prop_to_name(ZFS_PROP_VERSION), &zplver); (void) nvlist_lookup_uint64(createprops, zfs_prop_to_name(ZFS_PROP_NORMALIZE), &norm); (void) nvlist_remove_all(createprops, zfs_prop_to_name(ZFS_PROP_NORMALIZE)); (void) nvlist_lookup_uint64(createprops, zfs_prop_to_name(ZFS_PROP_UTF8ONLY), &u8); (void) nvlist_remove_all(createprops, zfs_prop_to_name(ZFS_PROP_UTF8ONLY)); (void) nvlist_lookup_uint64(createprops, zfs_prop_to_name(ZFS_PROP_CASE), &sense); (void) nvlist_remove_all(createprops, zfs_prop_to_name(ZFS_PROP_CASE)); } /* * If the zpl version requested is whacky or the file system * or pool is version is too "young" to support normalization * and the creator tried to set a value for one of the props, * error out. */ if ((zplver < ZPL_VERSION_INITIAL || zplver > ZPL_VERSION) || (zplver >= ZPL_VERSION_FUID && !fuids_ok) || (zplver >= ZPL_VERSION_SA && !sa_ok) || (zplver < ZPL_VERSION_NORMALIZATION && (norm != ZFS_PROP_UNDEFINED || u8 != ZFS_PROP_UNDEFINED || sense != ZFS_PROP_UNDEFINED))) return (SET_ERROR(ENOTSUP)); /* * Put the version in the zplprops */ VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_VERSION), zplver) == 0); if (norm == ZFS_PROP_UNDEFINED) VERIFY(zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &norm) == 0); VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_NORMALIZE), norm) == 0); /* * If we're normalizing, names must always be valid UTF-8 strings. */ if (norm) u8 = 1; if (u8 == ZFS_PROP_UNDEFINED) VERIFY(zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &u8) == 0); VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_UTF8ONLY), u8) == 0); if (sense == ZFS_PROP_UNDEFINED) VERIFY(zfs_get_zplprop(os, ZFS_PROP_CASE, &sense) == 0); VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_CASE), sense) == 0); if (is_ci) *is_ci = (sense == ZFS_CASE_INSENSITIVE); return (0); } static int zfs_fill_zplprops(const char *dataset, nvlist_t *createprops, nvlist_t *zplprops, boolean_t *is_ci) { boolean_t fuids_ok, sa_ok; uint64_t zplver = ZPL_VERSION; objset_t *os = NULL; char parentname[MAXNAMELEN]; char *cp; spa_t *spa; uint64_t spa_vers; int error; (void) strlcpy(parentname, dataset, sizeof (parentname)); cp = strrchr(parentname, '/'); ASSERT(cp != NULL); cp[0] = '\0'; if ((error = spa_open(dataset, &spa, FTAG)) != 0) return (error); spa_vers = spa_version(spa); spa_close(spa, FTAG); zplver = zfs_zpl_version_map(spa_vers); fuids_ok = (zplver >= ZPL_VERSION_FUID); sa_ok = (zplver >= ZPL_VERSION_SA); /* * Open parent object set so we can inherit zplprop values. */ if ((error = dmu_objset_hold(parentname, FTAG, &os)) != 0) return (error); error = zfs_fill_zplprops_impl(os, zplver, fuids_ok, sa_ok, createprops, zplprops, is_ci); dmu_objset_rele(os, FTAG); return (error); } static int zfs_fill_zplprops_root(uint64_t spa_vers, nvlist_t *createprops, nvlist_t *zplprops, boolean_t *is_ci) { boolean_t fuids_ok; boolean_t sa_ok; uint64_t zplver = ZPL_VERSION; int error; zplver = zfs_zpl_version_map(spa_vers); fuids_ok = (zplver >= ZPL_VERSION_FUID); sa_ok = (zplver >= ZPL_VERSION_SA); error = zfs_fill_zplprops_impl(NULL, zplver, fuids_ok, sa_ok, createprops, zplprops, is_ci); return (error); } /* * innvl: { * "type" -> dmu_objset_type_t (int32) * (optional) "props" -> { prop -> value } * } * * outnvl: propname -> error code (int32) */ static int zfs_ioc_create(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { int error = 0; zfs_creat_t zct = { 0 }; nvlist_t *nvprops = NULL; void (*cbfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); int32_t type32; dmu_objset_type_t type; boolean_t is_insensitive = B_FALSE; if (nvlist_lookup_int32(innvl, "type", &type32) != 0) return (SET_ERROR(EINVAL)); type = type32; (void) nvlist_lookup_nvlist(innvl, "props", &nvprops); switch (type) { case DMU_OST_ZFS: cbfunc = zfs_create_cb; break; case DMU_OST_ZVOL: cbfunc = zvol_create_cb; break; default: cbfunc = NULL; break; } if (strchr(fsname, '@') || strchr(fsname, '%')) return (SET_ERROR(EINVAL)); zct.zct_props = nvprops; if (cbfunc == NULL) return (SET_ERROR(EINVAL)); if (type == DMU_OST_ZVOL) { uint64_t volsize, volblocksize; if (nvprops == NULL) return (SET_ERROR(EINVAL)); if (nvlist_lookup_uint64(nvprops, zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) != 0) return (SET_ERROR(EINVAL)); if ((error = nvlist_lookup_uint64(nvprops, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize)) != 0 && error != ENOENT) return (SET_ERROR(EINVAL)); if (error != 0) volblocksize = zfs_prop_default_numeric( ZFS_PROP_VOLBLOCKSIZE); if ((error = zvol_check_volblocksize( volblocksize)) != 0 || (error = zvol_check_volsize(volsize, volblocksize)) != 0) return (error); } else if (type == DMU_OST_ZFS) { int error; /* * We have to have normalization and * case-folding flags correct when we do the * file system creation, so go figure them out * now. */ VERIFY(nvlist_alloc(&zct.zct_zplprops, NV_UNIQUE_NAME, KM_SLEEP) == 0); error = zfs_fill_zplprops(fsname, nvprops, zct.zct_zplprops, &is_insensitive); if (error != 0) { nvlist_free(zct.zct_zplprops); return (error); } } error = dmu_objset_create(fsname, type, is_insensitive ? DS_FLAG_CI_DATASET : 0, cbfunc, &zct); nvlist_free(zct.zct_zplprops); /* * It would be nice to do this atomically. */ if (error == 0) { error = zfs_set_prop_nvlist(fsname, ZPROP_SRC_LOCAL, nvprops, outnvl); if (error != 0) (void) dsl_destroy_head(fsname); } #ifdef __FreeBSD__ if (error == 0 && type == DMU_OST_ZVOL) zvol_create_minors(fsname); #endif return (error); } /* * innvl: { * "origin" -> name of origin snapshot * (optional) "props" -> { prop -> value } * } * * outnvl: propname -> error code (int32) */ static int zfs_ioc_clone(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { int error = 0; nvlist_t *nvprops = NULL; char *origin_name; if (nvlist_lookup_string(innvl, "origin", &origin_name) != 0) return (SET_ERROR(EINVAL)); (void) nvlist_lookup_nvlist(innvl, "props", &nvprops); if (strchr(fsname, '@') || strchr(fsname, '%')) return (SET_ERROR(EINVAL)); if (dataset_namecheck(origin_name, NULL, NULL) != 0) return (SET_ERROR(EINVAL)); error = dmu_objset_clone(fsname, origin_name); if (error != 0) return (error); /* * It would be nice to do this atomically. */ if (error == 0) { error = zfs_set_prop_nvlist(fsname, ZPROP_SRC_LOCAL, nvprops, outnvl); if (error != 0) (void) dsl_destroy_head(fsname); } #ifdef __FreeBSD__ if (error == 0) zvol_create_minors(fsname); #endif return (error); } /* * innvl: { * "snaps" -> { snapshot1, snapshot2 } * (optional) "props" -> { prop -> value (string) } * } * * outnvl: snapshot -> error code (int32) */ static int zfs_ioc_snapshot(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { nvlist_t *snaps; nvlist_t *props = NULL; int error, poollen; nvpair_t *pair; (void) nvlist_lookup_nvlist(innvl, "props", &props); if ((error = zfs_check_userprops(poolname, props)) != 0) return (error); if (!nvlist_empty(props) && zfs_earlier_version(poolname, SPA_VERSION_SNAP_PROPS)) return (SET_ERROR(ENOTSUP)); if (nvlist_lookup_nvlist(innvl, "snaps", &snaps) != 0) return (SET_ERROR(EINVAL)); poollen = strlen(poolname); for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(snaps, pair)) { const char *name = nvpair_name(pair); const char *cp = strchr(name, '@'); /* * The snap name must contain an @, and the part after it must * contain only valid characters. */ if (cp == NULL || zfs_component_namecheck(cp + 1, NULL, NULL) != 0) return (SET_ERROR(EINVAL)); /* * The snap must be in the specified pool. */ if (strncmp(name, poolname, poollen) != 0 || (name[poollen] != '/' && name[poollen] != '@')) return (SET_ERROR(EXDEV)); /* This must be the only snap of this fs. */ for (nvpair_t *pair2 = nvlist_next_nvpair(snaps, pair); pair2 != NULL; pair2 = nvlist_next_nvpair(snaps, pair2)) { if (strncmp(name, nvpair_name(pair2), cp - name + 1) == 0) { return (SET_ERROR(EXDEV)); } } } error = dsl_dataset_snapshot(snaps, props, outnvl); return (error); } /* * innvl: "message" -> string */ /* ARGSUSED */ static int zfs_ioc_log_history(const char *unused, nvlist_t *innvl, nvlist_t *outnvl) { char *message; spa_t *spa; int error; char *poolname; /* * The poolname in the ioctl is not set, we get it from the TSD, * which was set at the end of the last successful ioctl that allows * logging. The secpolicy func already checked that it is set. * Only one log ioctl is allowed after each successful ioctl, so * we clear the TSD here. */ poolname = tsd_get(zfs_allow_log_key); (void) tsd_set(zfs_allow_log_key, NULL); error = spa_open(poolname, &spa, FTAG); strfree(poolname); if (error != 0) return (error); if (nvlist_lookup_string(innvl, "message", &message) != 0) { spa_close(spa, FTAG); return (SET_ERROR(EINVAL)); } if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } error = spa_history_log(spa, message); spa_close(spa, FTAG); return (error); } /* * The dp_config_rwlock must not be held when calling this, because the * unmount may need to write out data. * * This function is best-effort. Callers must deal gracefully if it * remains mounted (or is remounted after this call). * * Returns 0 if the argument is not a snapshot, or it is not currently a * filesystem, or we were able to unmount it. Returns error code otherwise. */ int zfs_unmount_snap(const char *snapname) { vfs_t *vfsp; zfsvfs_t *zfsvfs; int err; if (strchr(snapname, '@') == NULL) return (0); vfsp = zfs_get_vfs(snapname); if (vfsp == NULL) return (0); zfsvfs = vfsp->vfs_data; ASSERT(!dsl_pool_config_held(dmu_objset_pool(zfsvfs->z_os))); err = vn_vfswlock(vfsp->vfs_vnodecovered); VFS_RELE(vfsp); if (err != 0) return (SET_ERROR(err)); /* * Always force the unmount for snapshots. */ #ifdef illumos (void) dounmount(vfsp, MS_FORCE, kcred); #else vfs_ref(vfsp); (void) dounmount(vfsp, MS_FORCE, curthread); #endif return (0); } /* ARGSUSED */ static int zfs_unmount_snap_cb(const char *snapname, void *arg) { return (zfs_unmount_snap(snapname)); } /* * When a clone is destroyed, its origin may also need to be destroyed, * in which case it must be unmounted. This routine will do that unmount * if necessary. */ void zfs_destroy_unmount_origin(const char *fsname) { int error; objset_t *os; dsl_dataset_t *ds; error = dmu_objset_hold(fsname, FTAG, &os); if (error != 0) return; ds = dmu_objset_ds(os); if (dsl_dir_is_clone(ds->ds_dir) && DS_IS_DEFER_DESTROY(ds->ds_prev)) { char originname[MAXNAMELEN]; dsl_dataset_name(ds->ds_prev, originname); dmu_objset_rele(os, FTAG); (void) zfs_unmount_snap(originname); } else { dmu_objset_rele(os, FTAG); } } /* * innvl: { * "snaps" -> { snapshot1, snapshot2 } * (optional boolean) "defer" * } * * outnvl: snapshot -> error code (int32) * */ /* ARGSUSED */ static int zfs_ioc_destroy_snaps(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { int error, poollen; nvlist_t *snaps; nvpair_t *pair; boolean_t defer; if (nvlist_lookup_nvlist(innvl, "snaps", &snaps) != 0) return (SET_ERROR(EINVAL)); defer = nvlist_exists(innvl, "defer"); poollen = strlen(poolname); for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(snaps, pair)) { const char *name = nvpair_name(pair); /* * The snap must be in the specified pool to prevent the * invalid removal of zvol minors below. */ if (strncmp(name, poolname, poollen) != 0 || (name[poollen] != '/' && name[poollen] != '@')) return (SET_ERROR(EXDEV)); error = zfs_unmount_snap(name); if (error != 0) return (error); #if defined(__FreeBSD__) zvol_remove_minors(name); #endif } return (dsl_destroy_snapshots_nvl(snaps, defer, outnvl)); } /* * Create bookmarks. Bookmark names are of the form #. * All bookmarks must be in the same pool. * * innvl: { * bookmark1 -> snapshot1, bookmark2 -> snapshot2 * } * * outnvl: bookmark -> error code (int32) * */ /* ARGSUSED */ static int zfs_ioc_bookmark(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) { char *snap_name; /* * Verify the snapshot argument. */ if (nvpair_value_string(pair, &snap_name) != 0) return (SET_ERROR(EINVAL)); /* Verify that the keys (bookmarks) are unique */ for (nvpair_t *pair2 = nvlist_next_nvpair(innvl, pair); pair2 != NULL; pair2 = nvlist_next_nvpair(innvl, pair2)) { if (strcmp(nvpair_name(pair), nvpair_name(pair2)) == 0) return (SET_ERROR(EINVAL)); } } return (dsl_bookmark_create(innvl, outnvl)); } /* * innvl: { * property 1, property 2, ... * } * * outnvl: { * bookmark name 1 -> { property 1, property 2, ... }, * bookmark name 2 -> { property 1, property 2, ... } * } * */ static int zfs_ioc_get_bookmarks(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { return (dsl_get_bookmarks(fsname, innvl, outnvl)); } /* * innvl: { * bookmark name 1, bookmark name 2 * } * * outnvl: bookmark -> error code (int32) * */ static int zfs_ioc_destroy_bookmarks(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { int error, poollen; poollen = strlen(poolname); for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) { const char *name = nvpair_name(pair); const char *cp = strchr(name, '#'); /* * The bookmark name must contain an #, and the part after it * must contain only valid characters. */ if (cp == NULL || zfs_component_namecheck(cp + 1, NULL, NULL) != 0) return (SET_ERROR(EINVAL)); /* * The bookmark must be in the specified pool. */ if (strncmp(name, poolname, poollen) != 0 || (name[poollen] != '/' && name[poollen] != '#')) return (SET_ERROR(EXDEV)); } error = dsl_bookmark_destroy(innvl, outnvl); return (error); } /* * inputs: * zc_name name of dataset to destroy * zc_objset_type type of objset * zc_defer_destroy mark for deferred destroy * * outputs: none */ static int zfs_ioc_destroy(zfs_cmd_t *zc) { int err; if (zc->zc_objset_type == DMU_OST_ZFS) { err = zfs_unmount_snap(zc->zc_name); if (err != 0) return (err); } if (strchr(zc->zc_name, '@')) err = dsl_destroy_snapshot(zc->zc_name, zc->zc_defer_destroy); else err = dsl_destroy_head(zc->zc_name); if (zc->zc_objset_type == DMU_OST_ZVOL && err == 0) #ifdef __FreeBSD__ zvol_remove_minors(zc->zc_name); #else (void) zvol_remove_minor(zc->zc_name); #endif return (err); } /* * fsname is name of dataset to rollback (to most recent snapshot) * * innvl is not used. * * outnvl: "target" -> name of most recent snapshot * } */ /* ARGSUSED */ static int zfs_ioc_rollback(const char *fsname, nvlist_t *args, nvlist_t *outnvl) { zfsvfs_t *zfsvfs; int error; if (getzfsvfs(fsname, &zfsvfs) == 0) { error = zfs_suspend_fs(zfsvfs); if (error == 0) { int resume_err; error = dsl_dataset_rollback(fsname, zfsvfs, outnvl); resume_err = zfs_resume_fs(zfsvfs, fsname); error = error ? error : resume_err; } VFS_RELE(zfsvfs->z_vfs); } else { error = dsl_dataset_rollback(fsname, NULL, outnvl); } return (error); } static int recursive_unmount(const char *fsname, void *arg) { const char *snapname = arg; char fullname[MAXNAMELEN]; (void) snprintf(fullname, sizeof (fullname), "%s@%s", fsname, snapname); return (zfs_unmount_snap(fullname)); } /* * inputs: * zc_name old name of dataset * zc_value new name of dataset * zc_cookie recursive flag (only valid for snapshots) * * outputs: none */ static int zfs_ioc_rename(zfs_cmd_t *zc) { boolean_t recursive = zc->zc_cookie & 1; char *at; boolean_t allow_mounted = B_TRUE; #ifdef __FreeBSD__ allow_mounted = (zc->zc_cookie & 2) != 0; #endif zc->zc_value[sizeof (zc->zc_value) - 1] = '\0'; if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 || strchr(zc->zc_value, '%')) return (SET_ERROR(EINVAL)); at = strchr(zc->zc_name, '@'); if (at != NULL) { /* snaps must be in same fs */ int error; if (strncmp(zc->zc_name, zc->zc_value, at - zc->zc_name + 1)) return (SET_ERROR(EXDEV)); *at = '\0'; if (zc->zc_objset_type == DMU_OST_ZFS && allow_mounted) { error = dmu_objset_find(zc->zc_name, recursive_unmount, at + 1, recursive ? DS_FIND_CHILDREN : 0); if (error != 0) { *at = '@'; return (error); } } error = dsl_dataset_rename_snapshot(zc->zc_name, at + 1, strchr(zc->zc_value, '@') + 1, recursive); *at = '@'; return (error); } else { #ifdef illumos if (zc->zc_objset_type == DMU_OST_ZVOL) (void) zvol_remove_minor(zc->zc_name); #endif return (dsl_dir_rename(zc->zc_name, zc->zc_value)); } } static int zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr) { const char *propname = nvpair_name(pair); boolean_t issnap = (strchr(dsname, '@') != NULL); zfs_prop_t prop = zfs_name_to_prop(propname); uint64_t intval; int err; if (prop == ZPROP_INVAL) { if (zfs_prop_user(propname)) { if (err = zfs_secpolicy_write_perms(dsname, ZFS_DELEG_PERM_USERPROP, cr)) return (err); return (0); } if (!issnap && zfs_prop_userquota(propname)) { const char *perm = NULL; const char *uq_prefix = zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA]; const char *gq_prefix = zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA]; if (strncmp(propname, uq_prefix, strlen(uq_prefix)) == 0) { perm = ZFS_DELEG_PERM_USERQUOTA; } else if (strncmp(propname, gq_prefix, strlen(gq_prefix)) == 0) { perm = ZFS_DELEG_PERM_GROUPQUOTA; } else { /* USERUSED and GROUPUSED are read-only */ return (SET_ERROR(EINVAL)); } if (err = zfs_secpolicy_write_perms(dsname, perm, cr)) return (err); return (0); } return (SET_ERROR(EINVAL)); } if (issnap) return (SET_ERROR(EINVAL)); if (nvpair_type(pair) == DATA_TYPE_NVLIST) { /* * dsl_prop_get_all_impl() returns properties in this * format. */ nvlist_t *attrs; VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &pair) == 0); } /* * Check that this value is valid for this pool version */ switch (prop) { case ZFS_PROP_COMPRESSION: /* * If the user specified gzip compression, make sure * the SPA supports it. We ignore any errors here since * we'll catch them later. */ if (nvpair_value_uint64(pair, &intval) == 0) { if (intval >= ZIO_COMPRESS_GZIP_1 && intval <= ZIO_COMPRESS_GZIP_9 && zfs_earlier_version(dsname, SPA_VERSION_GZIP_COMPRESSION)) { return (SET_ERROR(ENOTSUP)); } if (intval == ZIO_COMPRESS_ZLE && zfs_earlier_version(dsname, SPA_VERSION_ZLE_COMPRESSION)) return (SET_ERROR(ENOTSUP)); if (intval == ZIO_COMPRESS_LZ4) { spa_t *spa; if ((err = spa_open(dsname, &spa, FTAG)) != 0) return (err); if (!spa_feature_is_enabled(spa, SPA_FEATURE_LZ4_COMPRESS)) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } spa_close(spa, FTAG); } /* * If this is a bootable dataset then * verify that the compression algorithm * is supported for booting. We must return * something other than ENOTSUP since it * implies a downrev pool version. */ if (zfs_is_bootfs(dsname) && !BOOTFS_COMPRESS_VALID(intval)) { return (SET_ERROR(ERANGE)); } } break; case ZFS_PROP_COPIES: if (zfs_earlier_version(dsname, SPA_VERSION_DITTO_BLOCKS)) return (SET_ERROR(ENOTSUP)); break; case ZFS_PROP_RECORDSIZE: /* Record sizes above 128k need the feature to be enabled */ if (nvpair_value_uint64(pair, &intval) == 0 && intval > SPA_OLD_MAXBLOCKSIZE) { spa_t *spa; /* * If this is a bootable dataset then * the we don't allow large (>128K) blocks, * because GRUB doesn't support them. */ if (zfs_is_bootfs(dsname) && intval > SPA_OLD_MAXBLOCKSIZE) { return (SET_ERROR(ERANGE)); } /* * We don't allow setting the property above 1MB, * unless the tunable has been changed. */ if (intval > zfs_max_recordsize || intval > SPA_MAXBLOCKSIZE) return (SET_ERROR(ERANGE)); if ((err = spa_open(dsname, &spa, FTAG)) != 0) return (err); if (!spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } spa_close(spa, FTAG); } break; case ZFS_PROP_SHARESMB: if (zpl_earlier_version(dsname, ZPL_VERSION_FUID)) return (SET_ERROR(ENOTSUP)); break; case ZFS_PROP_ACLINHERIT: if (nvpair_type(pair) == DATA_TYPE_UINT64 && nvpair_value_uint64(pair, &intval) == 0) { if (intval == ZFS_ACL_PASSTHROUGH_X && zfs_earlier_version(dsname, SPA_VERSION_PASSTHROUGH_X)) return (SET_ERROR(ENOTSUP)); } break; case ZFS_PROP_CHECKSUM: case ZFS_PROP_DEDUP: { spa_feature_t feature; spa_t *spa; /* dedup feature version checks */ if (prop == ZFS_PROP_DEDUP && zfs_earlier_version(dsname, SPA_VERSION_DEDUP)) return (SET_ERROR(ENOTSUP)); if (nvpair_value_uint64(pair, &intval) != 0) return (SET_ERROR(EINVAL)); /* check prop value is enabled in features */ feature = zio_checksum_to_feature(intval); if (feature == SPA_FEATURE_NONE) break; if ((err = spa_open(dsname, &spa, FTAG)) != 0) return (err); /* * Salted checksums are not supported on root pools. */ if (spa_bootfs(spa) != 0 && intval < ZIO_CHECKSUM_FUNCTIONS && (zio_checksum_table[intval].ci_flags & ZCHECKSUM_FLAG_SALTED)) { spa_close(spa, FTAG); return (SET_ERROR(ERANGE)); } if (!spa_feature_is_enabled(spa, feature)) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } spa_close(spa, FTAG); break; } } return (zfs_secpolicy_setprop(dsname, prop, pair, CRED())); } /* * Checks for a race condition to make sure we don't increment a feature flag * multiple times. */ static int zfs_prop_activate_feature_check(void *arg, dmu_tx_t *tx) { spa_t *spa = dmu_tx_pool(tx)->dp_spa; spa_feature_t *featurep = arg; if (!spa_feature_is_active(spa, *featurep)) return (0); else return (SET_ERROR(EBUSY)); } /* * The callback invoked on feature activation in the sync task caused by * zfs_prop_activate_feature. */ static void zfs_prop_activate_feature_sync(void *arg, dmu_tx_t *tx) { spa_t *spa = dmu_tx_pool(tx)->dp_spa; spa_feature_t *featurep = arg; spa_feature_incr(spa, *featurep, tx); } /* * Activates a feature on a pool in response to a property setting. This * creates a new sync task which modifies the pool to reflect the feature * as being active. */ static int zfs_prop_activate_feature(spa_t *spa, spa_feature_t feature) { int err; /* EBUSY here indicates that the feature is already active */ err = dsl_sync_task(spa_name(spa), zfs_prop_activate_feature_check, zfs_prop_activate_feature_sync, &feature, 2, ZFS_SPACE_CHECK_RESERVED); if (err != 0 && err != EBUSY) return (err); else return (0); } /* * Removes properties from the given props list that fail permission checks * needed to clear them and to restore them in case of a receive error. For each * property, make sure we have both set and inherit permissions. * * Returns the first error encountered if any permission checks fail. If the * caller provides a non-NULL errlist, it also gives the complete list of names * of all the properties that failed a permission check along with the * corresponding error numbers. The caller is responsible for freeing the * returned errlist. * * If every property checks out successfully, zero is returned and the list * pointed at by errlist is NULL. */ static int zfs_check_clearable(char *dataset, nvlist_t *props, nvlist_t **errlist) { zfs_cmd_t *zc; nvpair_t *pair, *next_pair; nvlist_t *errors; int err, rv = 0; if (props == NULL) return (0); VERIFY(nvlist_alloc(&errors, NV_UNIQUE_NAME, KM_SLEEP) == 0); zc = kmem_alloc(sizeof (zfs_cmd_t), KM_SLEEP); (void) strcpy(zc->zc_name, dataset); pair = nvlist_next_nvpair(props, NULL); while (pair != NULL) { next_pair = nvlist_next_nvpair(props, pair); (void) strcpy(zc->zc_value, nvpair_name(pair)); if ((err = zfs_check_settable(dataset, pair, CRED())) != 0 || (err = zfs_secpolicy_inherit_prop(zc, NULL, CRED())) != 0) { VERIFY(nvlist_remove_nvpair(props, pair) == 0); VERIFY(nvlist_add_int32(errors, zc->zc_value, err) == 0); } pair = next_pair; } kmem_free(zc, sizeof (zfs_cmd_t)); if ((pair = nvlist_next_nvpair(errors, NULL)) == NULL) { nvlist_free(errors); errors = NULL; } else { VERIFY(nvpair_value_int32(pair, &rv) == 0); } if (errlist == NULL) nvlist_free(errors); else *errlist = errors; return (rv); } static boolean_t propval_equals(nvpair_t *p1, nvpair_t *p2) { if (nvpair_type(p1) == DATA_TYPE_NVLIST) { /* dsl_prop_get_all_impl() format */ nvlist_t *attrs; VERIFY(nvpair_value_nvlist(p1, &attrs) == 0); VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &p1) == 0); } if (nvpair_type(p2) == DATA_TYPE_NVLIST) { nvlist_t *attrs; VERIFY(nvpair_value_nvlist(p2, &attrs) == 0); VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &p2) == 0); } if (nvpair_type(p1) != nvpair_type(p2)) return (B_FALSE); if (nvpair_type(p1) == DATA_TYPE_STRING) { char *valstr1, *valstr2; VERIFY(nvpair_value_string(p1, (char **)&valstr1) == 0); VERIFY(nvpair_value_string(p2, (char **)&valstr2) == 0); return (strcmp(valstr1, valstr2) == 0); } else { uint64_t intval1, intval2; VERIFY(nvpair_value_uint64(p1, &intval1) == 0); VERIFY(nvpair_value_uint64(p2, &intval2) == 0); return (intval1 == intval2); } } /* * Remove properties from props if they are not going to change (as determined * by comparison with origprops). Remove them from origprops as well, since we * do not need to clear or restore properties that won't change. */ static void props_reduce(nvlist_t *props, nvlist_t *origprops) { nvpair_t *pair, *next_pair; if (origprops == NULL) return; /* all props need to be received */ pair = nvlist_next_nvpair(props, NULL); while (pair != NULL) { const char *propname = nvpair_name(pair); nvpair_t *match; next_pair = nvlist_next_nvpair(props, pair); if ((nvlist_lookup_nvpair(origprops, propname, &match) != 0) || !propval_equals(pair, match)) goto next; /* need to set received value */ /* don't clear the existing received value */ (void) nvlist_remove_nvpair(origprops, match); /* don't bother receiving the property */ (void) nvlist_remove_nvpair(props, pair); next: pair = next_pair; } } /* * Extract properties that cannot be set PRIOR to the receipt of a dataset. * For example, refquota cannot be set until after the receipt of a dataset, * because in replication streams, an older/earlier snapshot may exceed the * refquota. We want to receive the older/earlier snapshot, but setting * refquota pre-receipt will set the dsl's ACTUAL quota, which will prevent * the older/earlier snapshot from being received (with EDQUOT). * * The ZFS test "zfs_receive_011_pos" demonstrates such a scenario. * * libzfs will need to be judicious handling errors encountered by props * extracted by this function. */ static nvlist_t * extract_delay_props(nvlist_t *props) { nvlist_t *delayprops; nvpair_t *nvp, *tmp; static const zfs_prop_t delayable[] = { ZFS_PROP_REFQUOTA, 0 }; int i; VERIFY(nvlist_alloc(&delayprops, NV_UNIQUE_NAME, KM_SLEEP) == 0); for (nvp = nvlist_next_nvpair(props, NULL); nvp != NULL; nvp = nvlist_next_nvpair(props, nvp)) { /* * strcmp() is safe because zfs_prop_to_name() always returns * a bounded string. */ for (i = 0; delayable[i] != 0; i++) { if (strcmp(zfs_prop_to_name(delayable[i]), nvpair_name(nvp)) == 0) { break; } } if (delayable[i] != 0) { tmp = nvlist_prev_nvpair(props, nvp); VERIFY(nvlist_add_nvpair(delayprops, nvp) == 0); VERIFY(nvlist_remove_nvpair(props, nvp) == 0); nvp = tmp; } } if (nvlist_empty(delayprops)) { nvlist_free(delayprops); delayprops = NULL; } return (delayprops); } #ifdef DEBUG static boolean_t zfs_ioc_recv_inject_err; #endif /* * inputs: * zc_name name of containing filesystem * zc_nvlist_src{_size} nvlist of properties to apply * zc_value name of snapshot to create * zc_string name of clone origin (if DRR_FLAG_CLONE) * zc_cookie file descriptor to recv from * zc_begin_record the BEGIN record of the stream (not byteswapped) * zc_guid force flag * zc_cleanup_fd cleanup-on-exit file descriptor * zc_action_handle handle for this guid/ds mapping (or zero on first call) * zc_resumable if data is incomplete assume sender will resume * * outputs: * zc_cookie number of bytes read * zc_nvlist_dst{_size} error for each unapplied received property * zc_obj zprop_errflags_t * zc_action_handle handle for this guid/ds mapping */ static int zfs_ioc_recv(zfs_cmd_t *zc) { file_t *fp; dmu_recv_cookie_t drc; boolean_t force = (boolean_t)zc->zc_guid; int fd; int error = 0; int props_error = 0; nvlist_t *errors; offset_t off; nvlist_t *props = NULL; /* sent properties */ nvlist_t *origprops = NULL; /* existing properties */ nvlist_t *delayprops = NULL; /* sent properties applied post-receive */ char *origin = NULL; char *tosnap; char tofs[ZFS_MAXNAMELEN]; cap_rights_t rights; boolean_t first_recvd_props = B_FALSE; if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 || strchr(zc->zc_value, '@') == NULL || strchr(zc->zc_value, '%')) return (SET_ERROR(EINVAL)); (void) strcpy(tofs, zc->zc_value); tosnap = strchr(tofs, '@'); *tosnap++ = '\0'; if (zc->zc_nvlist_src != 0 && (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &props)) != 0) return (error); fd = zc->zc_cookie; #ifdef illumos fp = getf(fd); #else fget_read(curthread, fd, cap_rights_init(&rights, CAP_PREAD), &fp); #endif if (fp == NULL) { nvlist_free(props); return (SET_ERROR(EBADF)); } errors = fnvlist_alloc(); if (zc->zc_string[0]) origin = zc->zc_string; error = dmu_recv_begin(tofs, tosnap, &zc->zc_begin_record, force, zc->zc_resumable, origin, &drc); if (error != 0) goto out; /* * Set properties before we receive the stream so that they are applied * to the new data. Note that we must call dmu_recv_stream() if * dmu_recv_begin() succeeds. */ if (props != NULL && !drc.drc_newfs) { if (spa_version(dsl_dataset_get_spa(drc.drc_ds)) >= SPA_VERSION_RECVD_PROPS && !dsl_prop_get_hasrecvd(tofs)) first_recvd_props = B_TRUE; /* * If new received properties are supplied, they are to * completely replace the existing received properties, so stash * away the existing ones. */ if (dsl_prop_get_received(tofs, &origprops) == 0) { nvlist_t *errlist = NULL; /* * Don't bother writing a property if its value won't * change (and avoid the unnecessary security checks). * * The first receive after SPA_VERSION_RECVD_PROPS is a * special case where we blow away all local properties * regardless. */ if (!first_recvd_props) props_reduce(props, origprops); if (zfs_check_clearable(tofs, origprops, &errlist) != 0) (void) nvlist_merge(errors, errlist, 0); nvlist_free(errlist); if (clear_received_props(tofs, origprops, first_recvd_props ? NULL : props) != 0) zc->zc_obj |= ZPROP_ERR_NOCLEAR; } else { zc->zc_obj |= ZPROP_ERR_NOCLEAR; } } if (props != NULL) { props_error = dsl_prop_set_hasrecvd(tofs); if (props_error == 0) { delayprops = extract_delay_props(props); (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED, props, errors); } } off = fp->f_offset; error = dmu_recv_stream(&drc, fp, &off, zc->zc_cleanup_fd, &zc->zc_action_handle); if (error == 0) { zfsvfs_t *zfsvfs = NULL; if (getzfsvfs(tofs, &zfsvfs) == 0) { /* online recv */ int end_err; error = zfs_suspend_fs(zfsvfs); /* * If the suspend fails, then the recv_end will * likely also fail, and clean up after itself. */ end_err = dmu_recv_end(&drc, zfsvfs); if (error == 0) error = zfs_resume_fs(zfsvfs, tofs); error = error ? error : end_err; VFS_RELE(zfsvfs->z_vfs); } else { error = dmu_recv_end(&drc, NULL); } /* Set delayed properties now, after we're done receiving. */ if (delayprops != NULL && error == 0) { (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED, delayprops, errors); } } if (delayprops != NULL) { /* * Merge delayed props back in with initial props, in case * we're DEBUG and zfs_ioc_recv_inject_err is set (which means * we have to make sure clear_received_props() includes * the delayed properties). * * Since zfs_ioc_recv_inject_err is only in DEBUG kernels, * using ASSERT() will be just like a VERIFY. */ ASSERT(nvlist_merge(props, delayprops, 0) == 0); nvlist_free(delayprops); } /* * Now that all props, initial and delayed, are set, report the prop * errors to the caller. */ if (zc->zc_nvlist_dst_size != 0 && (nvlist_smush(errors, zc->zc_nvlist_dst_size) != 0 || put_nvlist(zc, errors) != 0)) { /* * Caller made zc->zc_nvlist_dst less than the minimum expected * size or supplied an invalid address. */ props_error = SET_ERROR(EINVAL); } zc->zc_cookie = off - fp->f_offset; if (off >= 0 && off <= MAXOFFSET_T) fp->f_offset = off; #ifdef DEBUG if (zfs_ioc_recv_inject_err) { zfs_ioc_recv_inject_err = B_FALSE; error = 1; } #endif #ifdef __FreeBSD__ if (error == 0) zvol_create_minors(tofs); #endif /* * On error, restore the original props. */ if (error != 0 && props != NULL && !drc.drc_newfs) { if (clear_received_props(tofs, props, NULL) != 0) { /* * We failed to clear the received properties. * Since we may have left a $recvd value on the * system, we can't clear the $hasrecvd flag. */ zc->zc_obj |= ZPROP_ERR_NORESTORE; } else if (first_recvd_props) { dsl_prop_unset_hasrecvd(tofs); } if (origprops == NULL && !drc.drc_newfs) { /* We failed to stash the original properties. */ zc->zc_obj |= ZPROP_ERR_NORESTORE; } /* * dsl_props_set() will not convert RECEIVED to LOCAL on or * after SPA_VERSION_RECVD_PROPS, so we need to specify LOCAL * explictly if we're restoring local properties cleared in the * first new-style receive. */ if (origprops != NULL && zfs_set_prop_nvlist(tofs, (first_recvd_props ? ZPROP_SRC_LOCAL : ZPROP_SRC_RECEIVED), origprops, NULL) != 0) { /* * We stashed the original properties but failed to * restore them. */ zc->zc_obj |= ZPROP_ERR_NORESTORE; } } out: nvlist_free(props); nvlist_free(origprops); nvlist_free(errors); releasef(fd); if (error == 0) error = props_error; return (error); } /* * inputs: * zc_name name of snapshot to send * zc_cookie file descriptor to send stream to * zc_obj fromorigin flag (mutually exclusive with zc_fromobj) * zc_sendobj objsetid of snapshot to send * zc_fromobj objsetid of incremental fromsnap (may be zero) * zc_guid if set, estimate size of stream only. zc_cookie is ignored. * output size in zc_objset_type. * zc_flags lzc_send_flags * * outputs: * zc_objset_type estimated size, if zc_guid is set */ static int zfs_ioc_send(zfs_cmd_t *zc) { int error; offset_t off; boolean_t estimate = (zc->zc_guid != 0); boolean_t embedok = (zc->zc_flags & 0x1); boolean_t large_block_ok = (zc->zc_flags & 0x2); if (zc->zc_obj != 0) { dsl_pool_t *dp; dsl_dataset_t *tosnap; error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &tosnap); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } if (dsl_dir_is_clone(tosnap->ds_dir)) zc->zc_fromobj = dsl_dir_phys(tosnap->ds_dir)->dd_origin_obj; dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); } if (estimate) { dsl_pool_t *dp; dsl_dataset_t *tosnap; dsl_dataset_t *fromsnap = NULL; error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &tosnap); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } if (zc->zc_fromobj != 0) { error = dsl_dataset_hold_obj(dp, zc->zc_fromobj, FTAG, &fromsnap); if (error != 0) { dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); return (error); } } error = dmu_send_estimate(tosnap, fromsnap, &zc->zc_objset_type); if (fromsnap != NULL) dsl_dataset_rele(fromsnap, FTAG); dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); } else { file_t *fp; cap_rights_t rights; #ifdef illumos fp = getf(zc->zc_cookie); #else fget_write(curthread, zc->zc_cookie, cap_rights_init(&rights, CAP_WRITE), &fp); #endif if (fp == NULL) return (SET_ERROR(EBADF)); off = fp->f_offset; error = dmu_send_obj(zc->zc_name, zc->zc_sendobj, zc->zc_fromobj, embedok, large_block_ok, #ifdef illumos zc->zc_cookie, fp->f_vnode, &off); #else zc->zc_cookie, fp, &off); #endif if (off >= 0 && off <= MAXOFFSET_T) fp->f_offset = off; releasef(zc->zc_cookie); } return (error); } /* * inputs: * zc_name name of snapshot on which to report progress * zc_cookie file descriptor of send stream * * outputs: * zc_cookie number of bytes written in send stream thus far */ static int zfs_ioc_send_progress(zfs_cmd_t *zc) { dsl_pool_t *dp; dsl_dataset_t *ds; dmu_sendarg_t *dsp = NULL; int error; error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &ds); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } mutex_enter(&ds->ds_sendstream_lock); /* * Iterate over all the send streams currently active on this dataset. * If there's one which matches the specified file descriptor _and_ the * stream was started by the current process, return the progress of * that stream. */ for (dsp = list_head(&ds->ds_sendstreams); dsp != NULL; dsp = list_next(&ds->ds_sendstreams, dsp)) { if (dsp->dsa_outfd == zc->zc_cookie && dsp->dsa_proc == curproc) break; } if (dsp != NULL) zc->zc_cookie = *(dsp->dsa_off); else error = SET_ERROR(ENOENT); mutex_exit(&ds->ds_sendstream_lock); dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); return (error); } static int zfs_ioc_inject_fault(zfs_cmd_t *zc) { int id, error; error = zio_inject_fault(zc->zc_name, (int)zc->zc_guid, &id, &zc->zc_inject_record); if (error == 0) zc->zc_guid = (uint64_t)id; return (error); } static int zfs_ioc_clear_fault(zfs_cmd_t *zc) { return (zio_clear_fault((int)zc->zc_guid)); } static int zfs_ioc_inject_list_next(zfs_cmd_t *zc) { int id = (int)zc->zc_guid; int error; error = zio_inject_list_next(&id, zc->zc_name, sizeof (zc->zc_name), &zc->zc_inject_record); zc->zc_guid = id; return (error); } static int zfs_ioc_error_log(zfs_cmd_t *zc) { spa_t *spa; int error; size_t count = (size_t)zc->zc_nvlist_dst_size; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); error = spa_get_errlog(spa, (void *)(uintptr_t)zc->zc_nvlist_dst, &count); if (error == 0) zc->zc_nvlist_dst_size = count; else zc->zc_nvlist_dst_size = spa_get_errlog_size(spa); spa_close(spa, FTAG); return (error); } static int zfs_ioc_clear(zfs_cmd_t *zc) { spa_t *spa; vdev_t *vd; int error; /* * On zpool clear we also fix up missing slogs */ mutex_enter(&spa_namespace_lock); spa = spa_lookup(zc->zc_name); if (spa == NULL) { mutex_exit(&spa_namespace_lock); return (SET_ERROR(EIO)); } if (spa_get_log_state(spa) == SPA_LOG_MISSING) { /* we need to let spa_open/spa_load clear the chains */ spa_set_log_state(spa, SPA_LOG_CLEAR); } spa->spa_last_open_failed = 0; mutex_exit(&spa_namespace_lock); if (zc->zc_cookie & ZPOOL_NO_REWIND) { error = spa_open(zc->zc_name, &spa, FTAG); } else { nvlist_t *policy; nvlist_t *config = NULL; if (zc->zc_nvlist_src == 0) return (SET_ERROR(EINVAL)); if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &policy)) == 0) { error = spa_open_rewind(zc->zc_name, &spa, FTAG, policy, &config); if (config != NULL) { int err; if ((err = put_nvlist(zc, config)) != 0) error = err; nvlist_free(config); } nvlist_free(policy); } } if (error != 0) return (error); spa_vdev_state_enter(spa, SCL_NONE); if (zc->zc_guid == 0) { vd = NULL; } else { vd = spa_lookup_by_guid(spa, zc->zc_guid, B_TRUE); if (vd == NULL) { (void) spa_vdev_state_exit(spa, NULL, ENODEV); spa_close(spa, FTAG); return (SET_ERROR(ENODEV)); } } vdev_clear(spa, vd); (void) spa_vdev_state_exit(spa, NULL, 0); /* * Resume any suspended I/Os. */ if (zio_resume(spa) != 0) error = SET_ERROR(EIO); spa_close(spa, FTAG); return (error); } static int zfs_ioc_pool_reopen(zfs_cmd_t *zc) { spa_t *spa; int error; error = spa_open(zc->zc_name, &spa, FTAG); if (error != 0) return (error); spa_vdev_state_enter(spa, SCL_NONE); /* * If a resilver is already in progress then set the * spa_scrub_reopen flag to B_TRUE so that we don't restart * the scan as a side effect of the reopen. Otherwise, let * vdev_open() decided if a resilver is required. */ spa->spa_scrub_reopen = dsl_scan_resilvering(spa->spa_dsl_pool); vdev_reopen(spa->spa_root_vdev); spa->spa_scrub_reopen = B_FALSE; (void) spa_vdev_state_exit(spa, NULL, 0); spa_close(spa, FTAG); return (0); } /* * inputs: * zc_name name of filesystem * zc_value name of origin snapshot * * outputs: * zc_string name of conflicting snapshot, if there is one */ static int zfs_ioc_promote(zfs_cmd_t *zc) { char *cp; /* * We don't need to unmount *all* the origin fs's snapshots, but * it's easier. */ cp = strchr(zc->zc_value, '@'); if (cp) *cp = '\0'; (void) dmu_objset_find(zc->zc_value, zfs_unmount_snap_cb, NULL, DS_FIND_SNAPSHOTS); return (dsl_dataset_promote(zc->zc_name, zc->zc_string)); } /* * Retrieve a single {user|group}{used|quota}@... property. * * inputs: * zc_name name of filesystem * zc_objset_type zfs_userquota_prop_t * zc_value domain name (eg. "S-1-234-567-89") * zc_guid RID/UID/GID * * outputs: * zc_cookie property value */ static int zfs_ioc_userspace_one(zfs_cmd_t *zc) { zfsvfs_t *zfsvfs; int error; if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS) return (SET_ERROR(EINVAL)); error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs, B_FALSE); if (error != 0) return (error); error = zfs_userspace_one(zfsvfs, zc->zc_objset_type, zc->zc_value, zc->zc_guid, &zc->zc_cookie); zfsvfs_rele(zfsvfs, FTAG); return (error); } /* * inputs: * zc_name name of filesystem * zc_cookie zap cursor * zc_objset_type zfs_userquota_prop_t * zc_nvlist_dst[_size] buffer to fill (not really an nvlist) * * outputs: * zc_nvlist_dst[_size] data buffer (array of zfs_useracct_t) * zc_cookie zap cursor */ static int zfs_ioc_userspace_many(zfs_cmd_t *zc) { zfsvfs_t *zfsvfs; int bufsize = zc->zc_nvlist_dst_size; if (bufsize <= 0) return (SET_ERROR(ENOMEM)); int error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs, B_FALSE); if (error != 0) return (error); void *buf = kmem_alloc(bufsize, KM_SLEEP); error = zfs_userspace_many(zfsvfs, zc->zc_objset_type, &zc->zc_cookie, buf, &zc->zc_nvlist_dst_size); if (error == 0) { error = ddi_copyout(buf, (void *)(uintptr_t)zc->zc_nvlist_dst, zc->zc_nvlist_dst_size, zc->zc_iflags); } kmem_free(buf, bufsize); zfsvfs_rele(zfsvfs, FTAG); return (error); } /* * inputs: * zc_name name of filesystem * * outputs: * none */ static int zfs_ioc_userspace_upgrade(zfs_cmd_t *zc) { objset_t *os; int error = 0; zfsvfs_t *zfsvfs; if (getzfsvfs(zc->zc_name, &zfsvfs) == 0) { if (!dmu_objset_userused_enabled(zfsvfs->z_os)) { /* * If userused is not enabled, it may be because the * objset needs to be closed & reopened (to grow the * objset_phys_t). Suspend/resume the fs will do that. */ error = zfs_suspend_fs(zfsvfs); if (error == 0) { dmu_objset_refresh_ownership(zfsvfs->z_os, zfsvfs); error = zfs_resume_fs(zfsvfs, zc->zc_name); } } if (error == 0) error = dmu_objset_userspace_upgrade(zfsvfs->z_os); VFS_RELE(zfsvfs->z_vfs); } else { /* XXX kind of reading contents without owning */ error = dmu_objset_hold(zc->zc_name, FTAG, &os); if (error != 0) return (error); error = dmu_objset_userspace_upgrade(os); dmu_objset_rele(os, FTAG); } return (error); } #ifdef illumos /* * We don't want to have a hard dependency * against some special symbols in sharefs * nfs, and smbsrv. Determine them if needed when * the first file system is shared. * Neither sharefs, nfs or smbsrv are unloadable modules. */ int (*znfsexport_fs)(void *arg); int (*zshare_fs)(enum sharefs_sys_op, share_t *, uint32_t); int (*zsmbexport_fs)(void *arg, boolean_t add_share); int zfs_nfsshare_inited; int zfs_smbshare_inited; ddi_modhandle_t nfs_mod; ddi_modhandle_t sharefs_mod; ddi_modhandle_t smbsrv_mod; #endif /* illumos */ kmutex_t zfs_share_lock; #ifdef illumos static int zfs_init_sharefs() { int error; ASSERT(MUTEX_HELD(&zfs_share_lock)); /* Both NFS and SMB shares also require sharetab support. */ if (sharefs_mod == NULL && ((sharefs_mod = ddi_modopen("fs/sharefs", KRTLD_MODE_FIRST, &error)) == NULL)) { return (SET_ERROR(ENOSYS)); } if (zshare_fs == NULL && ((zshare_fs = (int (*)(enum sharefs_sys_op, share_t *, uint32_t)) ddi_modsym(sharefs_mod, "sharefs_impl", &error)) == NULL)) { return (SET_ERROR(ENOSYS)); } return (0); } #endif /* illumos */ static int zfs_ioc_share(zfs_cmd_t *zc) { #ifdef illumos int error; int opcode; switch (zc->zc_share.z_sharetype) { case ZFS_SHARE_NFS: case ZFS_UNSHARE_NFS: if (zfs_nfsshare_inited == 0) { mutex_enter(&zfs_share_lock); if (nfs_mod == NULL && ((nfs_mod = ddi_modopen("fs/nfs", KRTLD_MODE_FIRST, &error)) == NULL)) { mutex_exit(&zfs_share_lock); return (SET_ERROR(ENOSYS)); } if (znfsexport_fs == NULL && ((znfsexport_fs = (int (*)(void *)) ddi_modsym(nfs_mod, "nfs_export", &error)) == NULL)) { mutex_exit(&zfs_share_lock); return (SET_ERROR(ENOSYS)); } error = zfs_init_sharefs(); if (error != 0) { mutex_exit(&zfs_share_lock); return (SET_ERROR(ENOSYS)); } zfs_nfsshare_inited = 1; mutex_exit(&zfs_share_lock); } break; case ZFS_SHARE_SMB: case ZFS_UNSHARE_SMB: if (zfs_smbshare_inited == 0) { mutex_enter(&zfs_share_lock); if (smbsrv_mod == NULL && ((smbsrv_mod = ddi_modopen("drv/smbsrv", KRTLD_MODE_FIRST, &error)) == NULL)) { mutex_exit(&zfs_share_lock); return (SET_ERROR(ENOSYS)); } if (zsmbexport_fs == NULL && ((zsmbexport_fs = (int (*)(void *, boolean_t))ddi_modsym(smbsrv_mod, "smb_server_share", &error)) == NULL)) { mutex_exit(&zfs_share_lock); return (SET_ERROR(ENOSYS)); } error = zfs_init_sharefs(); if (error != 0) { mutex_exit(&zfs_share_lock); return (SET_ERROR(ENOSYS)); } zfs_smbshare_inited = 1; mutex_exit(&zfs_share_lock); } break; default: return (SET_ERROR(EINVAL)); } switch (zc->zc_share.z_sharetype) { case ZFS_SHARE_NFS: case ZFS_UNSHARE_NFS: if (error = znfsexport_fs((void *) (uintptr_t)zc->zc_share.z_exportdata)) return (error); break; case ZFS_SHARE_SMB: case ZFS_UNSHARE_SMB: if (error = zsmbexport_fs((void *) (uintptr_t)zc->zc_share.z_exportdata, zc->zc_share.z_sharetype == ZFS_SHARE_SMB ? B_TRUE: B_FALSE)) { return (error); } break; } opcode = (zc->zc_share.z_sharetype == ZFS_SHARE_NFS || zc->zc_share.z_sharetype == ZFS_SHARE_SMB) ? SHAREFS_ADD : SHAREFS_REMOVE; /* * Add or remove share from sharetab */ error = zshare_fs(opcode, (void *)(uintptr_t)zc->zc_share.z_sharedata, zc->zc_share.z_sharemax); return (error); #else /* !illumos */ return (ENOSYS); #endif /* illumos */ } ace_t full_access[] = { {(uid_t)-1, ACE_ALL_PERMS, ACE_EVERYONE, 0} }; /* * inputs: * zc_name name of containing filesystem * zc_obj object # beyond which we want next in-use object # * * outputs: * zc_obj next in-use object # */ static int zfs_ioc_next_obj(zfs_cmd_t *zc) { objset_t *os = NULL; int error; error = dmu_objset_hold(zc->zc_name, FTAG, &os); if (error != 0) return (error); error = dmu_object_next(os, &zc->zc_obj, B_FALSE, dsl_dataset_phys(os->os_dsl_dataset)->ds_prev_snap_txg); dmu_objset_rele(os, FTAG); return (error); } /* * inputs: * zc_name name of filesystem * zc_value prefix name for snapshot * zc_cleanup_fd cleanup-on-exit file descriptor for calling process * * outputs: * zc_value short name of new snapshot */ static int zfs_ioc_tmp_snapshot(zfs_cmd_t *zc) { char *snap_name; char *hold_name; int error; minor_t minor; error = zfs_onexit_fd_hold(zc->zc_cleanup_fd, &minor); if (error != 0) return (error); snap_name = kmem_asprintf("%s-%016llx", zc->zc_value, (u_longlong_t)ddi_get_lbolt64()); hold_name = kmem_asprintf("%%%s", zc->zc_value); error = dsl_dataset_snapshot_tmp(zc->zc_name, snap_name, minor, hold_name); if (error == 0) (void) strcpy(zc->zc_value, snap_name); strfree(snap_name); strfree(hold_name); zfs_onexit_fd_rele(zc->zc_cleanup_fd); return (error); } /* * inputs: * zc_name name of "to" snapshot * zc_value name of "from" snapshot * zc_cookie file descriptor to write diff data on * * outputs: * dmu_diff_record_t's to the file descriptor */ static int zfs_ioc_diff(zfs_cmd_t *zc) { file_t *fp; cap_rights_t rights; offset_t off; int error; #ifdef illumos fp = getf(zc->zc_cookie); #else fget_write(curthread, zc->zc_cookie, cap_rights_init(&rights, CAP_WRITE), &fp); #endif if (fp == NULL) return (SET_ERROR(EBADF)); off = fp->f_offset; #ifdef illumos error = dmu_diff(zc->zc_name, zc->zc_value, fp->f_vnode, &off); #else error = dmu_diff(zc->zc_name, zc->zc_value, fp, &off); #endif if (off >= 0 && off <= MAXOFFSET_T) fp->f_offset = off; releasef(zc->zc_cookie); return (error); } #ifdef illumos /* * Remove all ACL files in shares dir */ static int zfs_smb_acl_purge(znode_t *dzp) { zap_cursor_t zc; zap_attribute_t zap; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; int error; for (zap_cursor_init(&zc, zfsvfs->z_os, dzp->z_id); (error = zap_cursor_retrieve(&zc, &zap)) == 0; zap_cursor_advance(&zc)) { if ((error = VOP_REMOVE(ZTOV(dzp), zap.za_name, kcred, NULL, 0)) != 0) break; } zap_cursor_fini(&zc); return (error); } #endif /* illumos */ static int zfs_ioc_smb_acl(zfs_cmd_t *zc) { #ifdef illumos vnode_t *vp; znode_t *dzp; vnode_t *resourcevp = NULL; znode_t *sharedir; zfsvfs_t *zfsvfs; nvlist_t *nvlist; char *src, *target; vattr_t vattr; vsecattr_t vsec; int error = 0; if ((error = lookupname(zc->zc_value, UIO_SYSSPACE, NO_FOLLOW, NULL, &vp)) != 0) return (error); /* Now make sure mntpnt and dataset are ZFS */ if (strcmp(vp->v_vfsp->mnt_stat.f_fstypename, "zfs") != 0 || (strcmp((char *)refstr_value(vp->v_vfsp->vfs_resource), zc->zc_name) != 0)) { VN_RELE(vp); return (SET_ERROR(EINVAL)); } dzp = VTOZ(vp); zfsvfs = dzp->z_zfsvfs; ZFS_ENTER(zfsvfs); /* * Create share dir if its missing. */ mutex_enter(&zfsvfs->z_lock); if (zfsvfs->z_shares_dir == 0) { dmu_tx_t *tx; tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, TRUE, ZFS_SHARES_DIR); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); error = dmu_tx_assign(tx, TXG_WAIT); if (error != 0) { dmu_tx_abort(tx); } else { error = zfs_create_share_dir(zfsvfs, tx); dmu_tx_commit(tx); } if (error != 0) { mutex_exit(&zfsvfs->z_lock); VN_RELE(vp); ZFS_EXIT(zfsvfs); return (error); } } mutex_exit(&zfsvfs->z_lock); ASSERT(zfsvfs->z_shares_dir); if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &sharedir)) != 0) { VN_RELE(vp); ZFS_EXIT(zfsvfs); return (error); } switch (zc->zc_cookie) { case ZFS_SMB_ACL_ADD: vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE; vattr.va_type = VREG; vattr.va_mode = S_IFREG|0777; vattr.va_uid = 0; vattr.va_gid = 0; vsec.vsa_mask = VSA_ACE; vsec.vsa_aclentp = &full_access; vsec.vsa_aclentsz = sizeof (full_access); vsec.vsa_aclcnt = 1; error = VOP_CREATE(ZTOV(sharedir), zc->zc_string, &vattr, EXCL, 0, &resourcevp, kcred, 0, NULL, &vsec); if (resourcevp) VN_RELE(resourcevp); break; case ZFS_SMB_ACL_REMOVE: error = VOP_REMOVE(ZTOV(sharedir), zc->zc_string, kcred, NULL, 0); break; case ZFS_SMB_ACL_RENAME: if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &nvlist)) != 0) { VN_RELE(vp); VN_RELE(ZTOV(sharedir)); ZFS_EXIT(zfsvfs); return (error); } if (nvlist_lookup_string(nvlist, ZFS_SMB_ACL_SRC, &src) || nvlist_lookup_string(nvlist, ZFS_SMB_ACL_TARGET, &target)) { VN_RELE(vp); VN_RELE(ZTOV(sharedir)); ZFS_EXIT(zfsvfs); nvlist_free(nvlist); return (error); } error = VOP_RENAME(ZTOV(sharedir), src, ZTOV(sharedir), target, kcred, NULL, 0); nvlist_free(nvlist); break; case ZFS_SMB_ACL_PURGE: error = zfs_smb_acl_purge(sharedir); break; default: error = SET_ERROR(EINVAL); break; } VN_RELE(vp); VN_RELE(ZTOV(sharedir)); ZFS_EXIT(zfsvfs); return (error); #else /* !illumos */ return (EOPNOTSUPP); #endif /* illumos */ } /* * innvl: { * "holds" -> { snapname -> holdname (string), ... } * (optional) "cleanup_fd" -> fd (int32) * } * * outnvl: { * snapname -> error value (int32) * ... * } */ /* ARGSUSED */ static int zfs_ioc_hold(const char *pool, nvlist_t *args, nvlist_t *errlist) { nvpair_t *pair; nvlist_t *holds; int cleanup_fd = -1; int error; minor_t minor = 0; error = nvlist_lookup_nvlist(args, "holds", &holds); if (error != 0) return (SET_ERROR(EINVAL)); /* make sure the user didn't pass us any invalid (empty) tags */ for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL; pair = nvlist_next_nvpair(holds, pair)) { char *htag; error = nvpair_value_string(pair, &htag); if (error != 0) return (SET_ERROR(error)); if (strlen(htag) == 0) return (SET_ERROR(EINVAL)); } if (nvlist_lookup_int32(args, "cleanup_fd", &cleanup_fd) == 0) { error = zfs_onexit_fd_hold(cleanup_fd, &minor); if (error != 0) return (error); } error = dsl_dataset_user_hold(holds, minor, errlist); if (minor != 0) zfs_onexit_fd_rele(cleanup_fd); return (error); } /* * innvl is not used. * * outnvl: { * holdname -> time added (uint64 seconds since epoch) * ... * } */ /* ARGSUSED */ static int zfs_ioc_get_holds(const char *snapname, nvlist_t *args, nvlist_t *outnvl) { return (dsl_dataset_get_holds(snapname, outnvl)); } /* * innvl: { * snapname -> { holdname, ... } * ... * } * * outnvl: { * snapname -> error value (int32) * ... * } */ /* ARGSUSED */ static int zfs_ioc_release(const char *pool, nvlist_t *holds, nvlist_t *errlist) { return (dsl_dataset_user_release(holds, errlist)); } /* * inputs: * zc_name name of new filesystem or snapshot * zc_value full name of old snapshot * * outputs: * zc_cookie space in bytes * zc_objset_type compressed space in bytes * zc_perm_action uncompressed space in bytes */ static int zfs_ioc_space_written(zfs_cmd_t *zc) { int error; dsl_pool_t *dp; dsl_dataset_t *new, *old; error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &new); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } error = dsl_dataset_hold(dp, zc->zc_value, FTAG, &old); if (error != 0) { dsl_dataset_rele(new, FTAG); dsl_pool_rele(dp, FTAG); return (error); } error = dsl_dataset_space_written(old, new, &zc->zc_cookie, &zc->zc_objset_type, &zc->zc_perm_action); dsl_dataset_rele(old, FTAG); dsl_dataset_rele(new, FTAG); dsl_pool_rele(dp, FTAG); return (error); } /* * innvl: { * "firstsnap" -> snapshot name * } * * outnvl: { * "used" -> space in bytes * "compressed" -> compressed space in bytes * "uncompressed" -> uncompressed space in bytes * } */ static int zfs_ioc_space_snaps(const char *lastsnap, nvlist_t *innvl, nvlist_t *outnvl) { int error; dsl_pool_t *dp; dsl_dataset_t *new, *old; char *firstsnap; uint64_t used, comp, uncomp; if (nvlist_lookup_string(innvl, "firstsnap", &firstsnap) != 0) return (SET_ERROR(EINVAL)); error = dsl_pool_hold(lastsnap, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, lastsnap, FTAG, &new); if (error == 0 && !new->ds_is_snapshot) { dsl_dataset_rele(new, FTAG); error = SET_ERROR(EINVAL); } if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } error = dsl_dataset_hold(dp, firstsnap, FTAG, &old); if (error == 0 && !old->ds_is_snapshot) { dsl_dataset_rele(old, FTAG); error = SET_ERROR(EINVAL); } if (error != 0) { dsl_dataset_rele(new, FTAG); dsl_pool_rele(dp, FTAG); return (error); } error = dsl_dataset_space_wouldfree(old, new, &used, &comp, &uncomp); dsl_dataset_rele(old, FTAG); dsl_dataset_rele(new, FTAG); dsl_pool_rele(dp, FTAG); fnvlist_add_uint64(outnvl, "used", used); fnvlist_add_uint64(outnvl, "compressed", comp); fnvlist_add_uint64(outnvl, "uncompressed", uncomp); return (error); } static int zfs_ioc_jail(zfs_cmd_t *zc) { return (zone_dataset_attach(curthread->td_ucred, zc->zc_name, (int)zc->zc_jailid)); } static int zfs_ioc_unjail(zfs_cmd_t *zc) { return (zone_dataset_detach(curthread->td_ucred, zc->zc_name, (int)zc->zc_jailid)); } /* * innvl: { * "fd" -> file descriptor to write stream to (int32) * (optional) "fromsnap" -> full snap name to send an incremental from * (optional) "largeblockok" -> (value ignored) * indicates that blocks > 128KB are permitted * (optional) "embedok" -> (value ignored) * presence indicates DRR_WRITE_EMBEDDED records are permitted * (optional) "resume_object" and "resume_offset" -> (uint64) * if present, resume send stream from specified object and offset. * } * * outnvl is unused */ /* ARGSUSED */ static int zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) { cap_rights_t rights; file_t *fp; int error; offset_t off; char *fromname = NULL; int fd; boolean_t largeblockok; boolean_t embedok; uint64_t resumeobj = 0; uint64_t resumeoff = 0; error = nvlist_lookup_int32(innvl, "fd", &fd); if (error != 0) return (SET_ERROR(EINVAL)); (void) nvlist_lookup_string(innvl, "fromsnap", &fromname); largeblockok = nvlist_exists(innvl, "largeblockok"); embedok = nvlist_exists(innvl, "embedok"); (void) nvlist_lookup_uint64(innvl, "resume_object", &resumeobj); (void) nvlist_lookup_uint64(innvl, "resume_offset", &resumeoff); #ifdef illumos file_t *fp = getf(fd); #else fget_write(curthread, fd, cap_rights_init(&rights, CAP_WRITE), &fp); #endif if (fp == NULL) return (SET_ERROR(EBADF)); off = fp->f_offset; error = dmu_send(snapname, fromname, embedok, largeblockok, fd, #ifdef illumos resumeobj, resumeoff, fp->f_vnode, &off); #else resumeobj, resumeoff, fp, &off); #endif #ifdef illumos if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0) fp->f_offset = off; #else fp->f_offset = off; #endif releasef(fd); return (error); } /* * Determine approximately how large a zfs send stream will be -- the number * of bytes that will be written to the fd supplied to zfs_ioc_send_new(). * * innvl: { * (optional) "from" -> full snap or bookmark name to send an incremental * from * } * * outnvl: { * "space" -> bytes of space (uint64) * } */ static int zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) { dsl_pool_t *dp; dsl_dataset_t *tosnap; int error; char *fromname; uint64_t space; error = dsl_pool_hold(snapname, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, snapname, FTAG, &tosnap); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } error = nvlist_lookup_string(innvl, "from", &fromname); if (error == 0) { if (strchr(fromname, '@') != NULL) { /* * If from is a snapshot, hold it and use the more * efficient dmu_send_estimate to estimate send space * size using deadlists. */ dsl_dataset_t *fromsnap; error = dsl_dataset_hold(dp, fromname, FTAG, &fromsnap); if (error != 0) goto out; error = dmu_send_estimate(tosnap, fromsnap, &space); dsl_dataset_rele(fromsnap, FTAG); } else if (strchr(fromname, '#') != NULL) { /* * If from is a bookmark, fetch the creation TXG of the * snapshot it was created from and use that to find * blocks that were born after it. */ zfs_bookmark_phys_t frombm; error = dsl_bookmark_lookup(dp, fromname, tosnap, &frombm); if (error != 0) goto out; error = dmu_send_estimate_from_txg(tosnap, frombm.zbm_creation_txg, &space); } else { /* * from is not properly formatted as a snapshot or * bookmark */ error = SET_ERROR(EINVAL); goto out; } } else { // If estimating the size of a full send, use dmu_send_estimate error = dmu_send_estimate(tosnap, NULL, &space); } fnvlist_add_uint64(outnvl, "space", space); out: dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); return (error); } static zfs_ioc_vec_t zfs_ioc_vec[ZFS_IOC_LAST - ZFS_IOC_FIRST]; static void zfs_ioctl_register_legacy(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy, zfs_ioc_namecheck_t namecheck, boolean_t log_history, zfs_ioc_poolcheck_t pool_check) { zfs_ioc_vec_t *vec = &zfs_ioc_vec[ioc - ZFS_IOC_FIRST]; ASSERT3U(ioc, >=, ZFS_IOC_FIRST); ASSERT3U(ioc, <, ZFS_IOC_LAST); ASSERT3P(vec->zvec_legacy_func, ==, NULL); ASSERT3P(vec->zvec_func, ==, NULL); vec->zvec_legacy_func = func; vec->zvec_secpolicy = secpolicy; vec->zvec_namecheck = namecheck; vec->zvec_allow_log = log_history; vec->zvec_pool_check = pool_check; } /* * See the block comment at the beginning of this file for details on * each argument to this function. */ static void zfs_ioctl_register(const char *name, zfs_ioc_t ioc, zfs_ioc_func_t *func, zfs_secpolicy_func_t *secpolicy, zfs_ioc_namecheck_t namecheck, zfs_ioc_poolcheck_t pool_check, boolean_t smush_outnvlist, boolean_t allow_log) { zfs_ioc_vec_t *vec = &zfs_ioc_vec[ioc - ZFS_IOC_FIRST]; ASSERT3U(ioc, >=, ZFS_IOC_FIRST); ASSERT3U(ioc, <, ZFS_IOC_LAST); ASSERT3P(vec->zvec_legacy_func, ==, NULL); ASSERT3P(vec->zvec_func, ==, NULL); /* if we are logging, the name must be valid */ ASSERT(!allow_log || namecheck != NO_NAME); vec->zvec_name = name; vec->zvec_func = func; vec->zvec_secpolicy = secpolicy; vec->zvec_namecheck = namecheck; vec->zvec_pool_check = pool_check; vec->zvec_smush_outnvlist = smush_outnvlist; vec->zvec_allow_log = allow_log; } static void zfs_ioctl_register_pool(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy, boolean_t log_history, zfs_ioc_poolcheck_t pool_check) { zfs_ioctl_register_legacy(ioc, func, secpolicy, POOL_NAME, log_history, pool_check); } static void zfs_ioctl_register_dataset_nolog(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy, zfs_ioc_poolcheck_t pool_check) { zfs_ioctl_register_legacy(ioc, func, secpolicy, DATASET_NAME, B_FALSE, pool_check); } static void zfs_ioctl_register_pool_modify(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func) { zfs_ioctl_register_legacy(ioc, func, zfs_secpolicy_config, POOL_NAME, B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); } static void zfs_ioctl_register_pool_meta(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy) { zfs_ioctl_register_legacy(ioc, func, secpolicy, NO_NAME, B_FALSE, POOL_CHECK_NONE); } static void zfs_ioctl_register_dataset_read_secpolicy(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy) { zfs_ioctl_register_legacy(ioc, func, secpolicy, DATASET_NAME, B_FALSE, POOL_CHECK_SUSPENDED); } static void zfs_ioctl_register_dataset_read(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func) { zfs_ioctl_register_dataset_read_secpolicy(ioc, func, zfs_secpolicy_read); } static void zfs_ioctl_register_dataset_modify(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy) { zfs_ioctl_register_legacy(ioc, func, secpolicy, DATASET_NAME, B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); } static void zfs_ioctl_init(void) { zfs_ioctl_register("snapshot", ZFS_IOC_SNAPSHOT, zfs_ioc_snapshot, zfs_secpolicy_snapshot, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); zfs_ioctl_register("log_history", ZFS_IOC_LOG_HISTORY, zfs_ioc_log_history, zfs_secpolicy_log_history, NO_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE); zfs_ioctl_register("space_snaps", ZFS_IOC_SPACE_SNAPS, zfs_ioc_space_snaps, zfs_secpolicy_read, DATASET_NAME, POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE); zfs_ioctl_register("send", ZFS_IOC_SEND_NEW, zfs_ioc_send_new, zfs_secpolicy_send_new, DATASET_NAME, POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE); zfs_ioctl_register("send_space", ZFS_IOC_SEND_SPACE, zfs_ioc_send_space, zfs_secpolicy_read, DATASET_NAME, POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE); zfs_ioctl_register("create", ZFS_IOC_CREATE, zfs_ioc_create, zfs_secpolicy_create_clone, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); zfs_ioctl_register("clone", ZFS_IOC_CLONE, zfs_ioc_clone, zfs_secpolicy_create_clone, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); zfs_ioctl_register("destroy_snaps", ZFS_IOC_DESTROY_SNAPS, zfs_ioc_destroy_snaps, zfs_secpolicy_destroy_snaps, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); zfs_ioctl_register("hold", ZFS_IOC_HOLD, zfs_ioc_hold, zfs_secpolicy_hold, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); zfs_ioctl_register("release", ZFS_IOC_RELEASE, zfs_ioc_release, zfs_secpolicy_release, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); zfs_ioctl_register("get_holds", ZFS_IOC_GET_HOLDS, zfs_ioc_get_holds, zfs_secpolicy_read, DATASET_NAME, POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE); zfs_ioctl_register("rollback", ZFS_IOC_ROLLBACK, zfs_ioc_rollback, zfs_secpolicy_rollback, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE); zfs_ioctl_register("bookmark", ZFS_IOC_BOOKMARK, zfs_ioc_bookmark, zfs_secpolicy_bookmark, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); zfs_ioctl_register("get_bookmarks", ZFS_IOC_GET_BOOKMARKS, zfs_ioc_get_bookmarks, zfs_secpolicy_read, DATASET_NAME, POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE); zfs_ioctl_register("destroy_bookmarks", ZFS_IOC_DESTROY_BOOKMARKS, zfs_ioc_destroy_bookmarks, zfs_secpolicy_destroy_bookmarks, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); /* IOCTLS that use the legacy function signature */ zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, zfs_ioc_pool_freeze, zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_READONLY); zfs_ioctl_register_pool(ZFS_IOC_POOL_CREATE, zfs_ioc_pool_create, zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE); zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_SCAN, zfs_ioc_pool_scan); zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_UPGRADE, zfs_ioc_pool_upgrade); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_ADD, zfs_ioc_vdev_add); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_REMOVE, zfs_ioc_vdev_remove); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SET_STATE, zfs_ioc_vdev_set_state); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_ATTACH, zfs_ioc_vdev_attach); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_DETACH, zfs_ioc_vdev_detach); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SETPATH, zfs_ioc_vdev_setpath); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SETFRU, zfs_ioc_vdev_setfru); zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_SET_PROPS, zfs_ioc_pool_set_props); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SPLIT, zfs_ioc_vdev_split); zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_REGUID, zfs_ioc_pool_reguid); zfs_ioctl_register_pool_meta(ZFS_IOC_POOL_CONFIGS, zfs_ioc_pool_configs, zfs_secpolicy_none); zfs_ioctl_register_pool_meta(ZFS_IOC_POOL_TRYIMPORT, zfs_ioc_pool_tryimport, zfs_secpolicy_config); zfs_ioctl_register_pool_meta(ZFS_IOC_INJECT_FAULT, zfs_ioc_inject_fault, zfs_secpolicy_inject); zfs_ioctl_register_pool_meta(ZFS_IOC_CLEAR_FAULT, zfs_ioc_clear_fault, zfs_secpolicy_inject); zfs_ioctl_register_pool_meta(ZFS_IOC_INJECT_LIST_NEXT, zfs_ioc_inject_list_next, zfs_secpolicy_inject); /* * pool destroy, and export don't log the history as part of * zfsdev_ioctl, but rather zfs_ioc_pool_export * does the logging of those commands. */ zfs_ioctl_register_pool(ZFS_IOC_POOL_DESTROY, zfs_ioc_pool_destroy, zfs_secpolicy_config, B_FALSE, POOL_CHECK_NONE); zfs_ioctl_register_pool(ZFS_IOC_POOL_EXPORT, zfs_ioc_pool_export, zfs_secpolicy_config, B_FALSE, POOL_CHECK_NONE); zfs_ioctl_register_pool(ZFS_IOC_POOL_STATS, zfs_ioc_pool_stats, zfs_secpolicy_read, B_FALSE, POOL_CHECK_NONE); zfs_ioctl_register_pool(ZFS_IOC_POOL_GET_PROPS, zfs_ioc_pool_get_props, zfs_secpolicy_read, B_FALSE, POOL_CHECK_NONE); zfs_ioctl_register_pool(ZFS_IOC_ERROR_LOG, zfs_ioc_error_log, zfs_secpolicy_inject, B_FALSE, POOL_CHECK_NONE); zfs_ioctl_register_pool(ZFS_IOC_DSOBJ_TO_DSNAME, zfs_ioc_dsobj_to_dsname, zfs_secpolicy_diff, B_FALSE, POOL_CHECK_NONE); zfs_ioctl_register_pool(ZFS_IOC_POOL_GET_HISTORY, zfs_ioc_pool_get_history, zfs_secpolicy_config, B_FALSE, POOL_CHECK_SUSPENDED); zfs_ioctl_register_pool(ZFS_IOC_POOL_IMPORT, zfs_ioc_pool_import, zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE); zfs_ioctl_register_pool(ZFS_IOC_CLEAR, zfs_ioc_clear, zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE); zfs_ioctl_register_pool(ZFS_IOC_POOL_REOPEN, zfs_ioc_pool_reopen, zfs_secpolicy_config, B_TRUE, POOL_CHECK_SUSPENDED); zfs_ioctl_register_dataset_read(ZFS_IOC_SPACE_WRITTEN, zfs_ioc_space_written); zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_RECVD_PROPS, zfs_ioc_objset_recvd_props); zfs_ioctl_register_dataset_read(ZFS_IOC_NEXT_OBJ, zfs_ioc_next_obj); zfs_ioctl_register_dataset_read(ZFS_IOC_GET_FSACL, zfs_ioc_get_fsacl); zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_STATS, zfs_ioc_objset_stats); zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_ZPLPROPS, zfs_ioc_objset_zplprops); zfs_ioctl_register_dataset_read(ZFS_IOC_DATASET_LIST_NEXT, zfs_ioc_dataset_list_next); zfs_ioctl_register_dataset_read(ZFS_IOC_SNAPSHOT_LIST_NEXT, zfs_ioc_snapshot_list_next); zfs_ioctl_register_dataset_read(ZFS_IOC_SEND_PROGRESS, zfs_ioc_send_progress); zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_DIFF, zfs_ioc_diff, zfs_secpolicy_diff); zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_OBJ_TO_STATS, zfs_ioc_obj_to_stats, zfs_secpolicy_diff); zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_OBJ_TO_PATH, zfs_ioc_obj_to_path, zfs_secpolicy_diff); zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_USERSPACE_ONE, zfs_ioc_userspace_one, zfs_secpolicy_userspace_one); zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_USERSPACE_MANY, zfs_ioc_userspace_many, zfs_secpolicy_userspace_many); zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_SEND, zfs_ioc_send, zfs_secpolicy_send); zfs_ioctl_register_dataset_modify(ZFS_IOC_SET_PROP, zfs_ioc_set_prop, zfs_secpolicy_none); zfs_ioctl_register_dataset_modify(ZFS_IOC_DESTROY, zfs_ioc_destroy, zfs_secpolicy_destroy); zfs_ioctl_register_dataset_modify(ZFS_IOC_RENAME, zfs_ioc_rename, zfs_secpolicy_rename); zfs_ioctl_register_dataset_modify(ZFS_IOC_RECV, zfs_ioc_recv, zfs_secpolicy_recv); zfs_ioctl_register_dataset_modify(ZFS_IOC_PROMOTE, zfs_ioc_promote, zfs_secpolicy_promote); zfs_ioctl_register_dataset_modify(ZFS_IOC_INHERIT_PROP, zfs_ioc_inherit_prop, zfs_secpolicy_inherit_prop); zfs_ioctl_register_dataset_modify(ZFS_IOC_SET_FSACL, zfs_ioc_set_fsacl, zfs_secpolicy_set_fsacl); zfs_ioctl_register_dataset_nolog(ZFS_IOC_SHARE, zfs_ioc_share, zfs_secpolicy_share, POOL_CHECK_NONE); zfs_ioctl_register_dataset_nolog(ZFS_IOC_SMB_ACL, zfs_ioc_smb_acl, zfs_secpolicy_smb_acl, POOL_CHECK_NONE); zfs_ioctl_register_dataset_nolog(ZFS_IOC_USERSPACE_UPGRADE, zfs_ioc_userspace_upgrade, zfs_secpolicy_userspace_upgrade, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); zfs_ioctl_register_dataset_nolog(ZFS_IOC_TMP_SNAPSHOT, zfs_ioc_tmp_snapshot, zfs_secpolicy_tmp_snapshot, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); #ifdef __FreeBSD__ zfs_ioctl_register_dataset_nolog(ZFS_IOC_JAIL, zfs_ioc_jail, zfs_secpolicy_config, POOL_CHECK_NONE); zfs_ioctl_register_dataset_nolog(ZFS_IOC_UNJAIL, zfs_ioc_unjail, zfs_secpolicy_config, POOL_CHECK_NONE); #endif } int pool_status_check(const char *name, zfs_ioc_namecheck_t type, zfs_ioc_poolcheck_t check) { spa_t *spa; int error; ASSERT(type == POOL_NAME || type == DATASET_NAME); if (check & POOL_CHECK_NONE) return (0); error = spa_open(name, &spa, FTAG); if (error == 0) { if ((check & POOL_CHECK_SUSPENDED) && spa_suspended(spa)) error = SET_ERROR(EAGAIN); else if ((check & POOL_CHECK_READONLY) && !spa_writeable(spa)) error = SET_ERROR(EROFS); spa_close(spa, FTAG); } return (error); } /* * Find a free minor number. */ minor_t zfsdev_minor_alloc(void) { static minor_t last_minor; minor_t m; ASSERT(MUTEX_HELD(&spa_namespace_lock)); for (m = last_minor + 1; m != last_minor; m++) { if (m > ZFSDEV_MAX_MINOR) m = 1; if (ddi_get_soft_state(zfsdev_state, m) == NULL) { last_minor = m; return (m); } } return (0); } static int zfs_ctldev_init(struct cdev *devp) { minor_t minor; zfs_soft_state_t *zs; ASSERT(MUTEX_HELD(&spa_namespace_lock)); minor = zfsdev_minor_alloc(); if (minor == 0) return (SET_ERROR(ENXIO)); if (ddi_soft_state_zalloc(zfsdev_state, minor) != DDI_SUCCESS) return (SET_ERROR(EAGAIN)); devfs_set_cdevpriv((void *)(uintptr_t)minor, zfsdev_close); zs = ddi_get_soft_state(zfsdev_state, minor); zs->zss_type = ZSST_CTLDEV; zfs_onexit_init((zfs_onexit_t **)&zs->zss_data); return (0); } static void zfs_ctldev_destroy(zfs_onexit_t *zo, minor_t minor) { ASSERT(MUTEX_HELD(&spa_namespace_lock)); zfs_onexit_destroy(zo); ddi_soft_state_free(zfsdev_state, minor); } void * zfsdev_get_soft_state(minor_t minor, enum zfs_soft_state_type which) { zfs_soft_state_t *zp; zp = ddi_get_soft_state(zfsdev_state, minor); if (zp == NULL || zp->zss_type != which) return (NULL); return (zp->zss_data); } static int zfsdev_open(struct cdev *devp, int flag, int mode, struct thread *td) { int error = 0; #ifdef illumos if (getminor(*devp) != 0) return (zvol_open(devp, flag, otyp, cr)); #endif /* This is the control device. Allocate a new minor if requested. */ if (flag & FEXCL) { mutex_enter(&spa_namespace_lock); error = zfs_ctldev_init(devp); mutex_exit(&spa_namespace_lock); } return (error); } static void zfsdev_close(void *data) { zfs_onexit_t *zo; minor_t minor = (minor_t)(uintptr_t)data; if (minor == 0) return; mutex_enter(&spa_namespace_lock); zo = zfsdev_get_soft_state(minor, ZSST_CTLDEV); if (zo == NULL) { mutex_exit(&spa_namespace_lock); return; } zfs_ctldev_destroy(zo, minor); mutex_exit(&spa_namespace_lock); } static int zfsdev_ioctl(struct cdev *dev, u_long zcmd, caddr_t arg, int flag, struct thread *td) { zfs_cmd_t *zc; uint_t vecnum; int error, rc, len; #ifdef illumos minor_t minor = getminor(dev); #else zfs_iocparm_t *zc_iocparm; int cflag, cmd, oldvecnum; boolean_t newioc, compat; void *compat_zc = NULL; cred_t *cr = td->td_ucred; #endif const zfs_ioc_vec_t *vec; char *saved_poolname = NULL; nvlist_t *innvl = NULL; cflag = ZFS_CMD_COMPAT_NONE; compat = B_FALSE; newioc = B_TRUE; /* "new" style (zfs_iocparm_t) ioctl */ len = IOCPARM_LEN(zcmd); vecnum = cmd = zcmd & 0xff; /* * Check if we are talking to supported older binaries * and translate zfs_cmd if necessary */ if (len != sizeof(zfs_iocparm_t)) { newioc = B_FALSE; compat = B_TRUE; vecnum = cmd; switch (len) { case sizeof(zfs_cmd_zcmd_t): cflag = ZFS_CMD_COMPAT_LZC; break; case sizeof(zfs_cmd_deadman_t): cflag = ZFS_CMD_COMPAT_DEADMAN; break; case sizeof(zfs_cmd_v28_t): cflag = ZFS_CMD_COMPAT_V28; break; case sizeof(zfs_cmd_v15_t): cflag = ZFS_CMD_COMPAT_V15; vecnum = zfs_ioctl_v15_to_v28[cmd]; /* * Return without further handling * if the command is blacklisted. */ if (vecnum == ZFS_IOC_COMPAT_PASS) return (0); else if (vecnum == ZFS_IOC_COMPAT_FAIL) return (ENOTSUP); break; default: return (EINVAL); } } #ifdef illumos vecnum = cmd - ZFS_IOC_FIRST; ASSERT3U(getmajor(dev), ==, ddi_driver_major(zfs_dip)); #endif if (vecnum >= sizeof (zfs_ioc_vec) / sizeof (zfs_ioc_vec[0])) return (SET_ERROR(EINVAL)); vec = &zfs_ioc_vec[vecnum]; zc = kmem_zalloc(sizeof(zfs_cmd_t), KM_SLEEP); #ifdef illumos error = ddi_copyin((void *)arg, zc, sizeof (zfs_cmd_t), flag); if (error != 0) { error = SET_ERROR(EFAULT); goto out; } #else /* !illumos */ bzero(zc, sizeof(zfs_cmd_t)); if (newioc) { zc_iocparm = (void *)arg; switch (zc_iocparm->zfs_ioctl_version) { case ZFS_IOCVER_CURRENT: if (zc_iocparm->zfs_cmd_size != sizeof(zfs_cmd_t)) { error = SET_ERROR(EINVAL); goto out; } break; + case ZFS_IOCVER_RESUME: + if (zc_iocparm->zfs_cmd_size != sizeof(zfs_cmd_resume_t)) { + error = SET_ERROR(EFAULT); + goto out; + } + compat = B_TRUE; + cflag = ZFS_CMD_COMPAT_RESUME; + break; case ZFS_IOCVER_EDBP: if (zc_iocparm->zfs_cmd_size != sizeof(zfs_cmd_edbp_t)) { error = SET_ERROR(EFAULT); goto out; } compat = B_TRUE; cflag = ZFS_CMD_COMPAT_EDBP; break; case ZFS_IOCVER_ZCMD: if (zc_iocparm->zfs_cmd_size > sizeof(zfs_cmd_t) || zc_iocparm->zfs_cmd_size < sizeof(zfs_cmd_zcmd_t)) { error = SET_ERROR(EFAULT); goto out; } compat = B_TRUE; cflag = ZFS_CMD_COMPAT_ZCMD; break; default: error = SET_ERROR(EINVAL); goto out; /* NOTREACHED */ } if (compat) { ASSERT(sizeof(zfs_cmd_t) >= zc_iocparm->zfs_cmd_size); compat_zc = kmem_zalloc(sizeof(zfs_cmd_t), KM_SLEEP); bzero(compat_zc, sizeof(zfs_cmd_t)); error = ddi_copyin((void *)(uintptr_t)zc_iocparm->zfs_cmd, compat_zc, zc_iocparm->zfs_cmd_size, flag); if (error != 0) { error = SET_ERROR(EFAULT); goto out; } } else { error = ddi_copyin((void *)(uintptr_t)zc_iocparm->zfs_cmd, zc, zc_iocparm->zfs_cmd_size, flag); if (error != 0) { error = SET_ERROR(EFAULT); goto out; } } } if (compat) { if (newioc) { ASSERT(compat_zc != NULL); zfs_cmd_compat_get(zc, compat_zc, cflag); } else { ASSERT(compat_zc == NULL); zfs_cmd_compat_get(zc, arg, cflag); } oldvecnum = vecnum; error = zfs_ioctl_compat_pre(zc, &vecnum, cflag); if (error != 0) goto out; if (oldvecnum != vecnum) vec = &zfs_ioc_vec[vecnum]; } #endif /* !illumos */ zc->zc_iflags = flag & FKIOCTL; if (zc->zc_nvlist_src_size != 0) { error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &innvl); if (error != 0) goto out; } /* rewrite innvl for backwards compatibility */ if (compat) innvl = zfs_ioctl_compat_innvl(zc, innvl, vecnum, cflag); /* * Ensure that all pool/dataset names are valid before we pass down to * the lower layers. */ zc->zc_name[sizeof (zc->zc_name) - 1] = '\0'; switch (vec->zvec_namecheck) { case POOL_NAME: if (pool_namecheck(zc->zc_name, NULL, NULL) != 0) error = SET_ERROR(EINVAL); else error = pool_status_check(zc->zc_name, vec->zvec_namecheck, vec->zvec_pool_check); break; case DATASET_NAME: if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0) error = SET_ERROR(EINVAL); else error = pool_status_check(zc->zc_name, vec->zvec_namecheck, vec->zvec_pool_check); break; case NO_NAME: break; } if (error == 0 && !(flag & FKIOCTL)) error = vec->zvec_secpolicy(zc, innvl, cr); if (error != 0) goto out; /* legacy ioctls can modify zc_name */ len = strcspn(zc->zc_name, "/@#") + 1; saved_poolname = kmem_alloc(len, KM_SLEEP); (void) strlcpy(saved_poolname, zc->zc_name, len); if (vec->zvec_func != NULL) { nvlist_t *outnvl; int puterror = 0; spa_t *spa; nvlist_t *lognv = NULL; ASSERT(vec->zvec_legacy_func == NULL); /* * Add the innvl to the lognv before calling the func, * in case the func changes the innvl. */ if (vec->zvec_allow_log) { lognv = fnvlist_alloc(); fnvlist_add_string(lognv, ZPOOL_HIST_IOCTL, vec->zvec_name); if (!nvlist_empty(innvl)) { fnvlist_add_nvlist(lognv, ZPOOL_HIST_INPUT_NVL, innvl); } } outnvl = fnvlist_alloc(); error = vec->zvec_func(zc->zc_name, innvl, outnvl); if (error == 0 && vec->zvec_allow_log && spa_open(zc->zc_name, &spa, FTAG) == 0) { if (!nvlist_empty(outnvl)) { fnvlist_add_nvlist(lognv, ZPOOL_HIST_OUTPUT_NVL, outnvl); } (void) spa_history_log_nvl(spa, lognv); spa_close(spa, FTAG); } fnvlist_free(lognv); /* rewrite outnvl for backwards compatibility */ if (compat) outnvl = zfs_ioctl_compat_outnvl(zc, outnvl, vecnum, cflag); if (!nvlist_empty(outnvl) || zc->zc_nvlist_dst_size != 0) { int smusherror = 0; if (vec->zvec_smush_outnvlist) { smusherror = nvlist_smush(outnvl, zc->zc_nvlist_dst_size); } if (smusherror == 0) puterror = put_nvlist(zc, outnvl); } if (puterror != 0) error = puterror; nvlist_free(outnvl); } else { error = vec->zvec_legacy_func(zc); } out: nvlist_free(innvl); #ifdef illumos rc = ddi_copyout(zc, (void *)arg, sizeof (zfs_cmd_t), flag); if (error == 0 && rc != 0) error = SET_ERROR(EFAULT); #else if (compat) { zfs_ioctl_compat_post(zc, cmd, cflag); if (newioc) { ASSERT(compat_zc != NULL); ASSERT(sizeof(zfs_cmd_t) >= zc_iocparm->zfs_cmd_size); zfs_cmd_compat_put(zc, compat_zc, vecnum, cflag); rc = ddi_copyout(compat_zc, (void *)(uintptr_t)zc_iocparm->zfs_cmd, zc_iocparm->zfs_cmd_size, flag); if (error == 0 && rc != 0) error = SET_ERROR(EFAULT); kmem_free(compat_zc, sizeof (zfs_cmd_t)); } else { zfs_cmd_compat_put(zc, arg, vecnum, cflag); } } else { ASSERT(newioc); rc = ddi_copyout(zc, (void *)(uintptr_t)zc_iocparm->zfs_cmd, sizeof (zfs_cmd_t), flag); if (error == 0 && rc != 0) error = SET_ERROR(EFAULT); } #endif if (error == 0 && vec->zvec_allow_log) { char *s = tsd_get(zfs_allow_log_key); if (s != NULL) strfree(s); (void) tsd_set(zfs_allow_log_key, saved_poolname); } else { if (saved_poolname != NULL) strfree(saved_poolname); } kmem_free(zc, sizeof (zfs_cmd_t)); return (error); } #ifdef illumos static int zfs_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) { if (cmd != DDI_ATTACH) return (DDI_FAILURE); if (ddi_create_minor_node(dip, "zfs", S_IFCHR, 0, DDI_PSEUDO, 0) == DDI_FAILURE) return (DDI_FAILURE); zfs_dip = dip; ddi_report_dev(dip); return (DDI_SUCCESS); } static int zfs_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) { if (spa_busy() || zfs_busy() || zvol_busy()) return (DDI_FAILURE); if (cmd != DDI_DETACH) return (DDI_FAILURE); zfs_dip = NULL; ddi_prop_remove_all(dip); ddi_remove_minor_node(dip, NULL); return (DDI_SUCCESS); } /*ARGSUSED*/ static int zfs_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) { switch (infocmd) { case DDI_INFO_DEVT2DEVINFO: *result = zfs_dip; return (DDI_SUCCESS); case DDI_INFO_DEVT2INSTANCE: *result = (void *)0; return (DDI_SUCCESS); } return (DDI_FAILURE); } #endif /* illumos */ /* * OK, so this is a little weird. * * /dev/zfs is the control node, i.e. minor 0. * /dev/zvol/[r]dsk/pool/dataset are the zvols, minor > 0. * * /dev/zfs has basically nothing to do except serve up ioctls, * so most of the standard driver entry points are in zvol.c. */ #ifdef illumos static struct cb_ops zfs_cb_ops = { zfsdev_open, /* open */ zfsdev_close, /* close */ zvol_strategy, /* strategy */ nodev, /* print */ zvol_dump, /* dump */ zvol_read, /* read */ zvol_write, /* write */ zfsdev_ioctl, /* ioctl */ nodev, /* devmap */ nodev, /* mmap */ nodev, /* segmap */ nochpoll, /* poll */ ddi_prop_op, /* prop_op */ NULL, /* streamtab */ D_NEW | D_MP | D_64BIT, /* Driver compatibility flag */ CB_REV, /* version */ nodev, /* async read */ nodev, /* async write */ }; static struct dev_ops zfs_dev_ops = { DEVO_REV, /* version */ 0, /* refcnt */ zfs_info, /* info */ nulldev, /* identify */ nulldev, /* probe */ zfs_attach, /* attach */ zfs_detach, /* detach */ nodev, /* reset */ &zfs_cb_ops, /* driver operations */ NULL, /* no bus operations */ NULL, /* power */ ddi_quiesce_not_needed, /* quiesce */ }; static struct modldrv zfs_modldrv = { &mod_driverops, "ZFS storage pool", &zfs_dev_ops }; static struct modlinkage modlinkage = { MODREV_1, (void *)&zfs_modlfs, (void *)&zfs_modldrv, NULL }; #endif /* illumos */ static struct cdevsw zfs_cdevsw = { .d_version = D_VERSION, .d_open = zfsdev_open, .d_ioctl = zfsdev_ioctl, .d_name = ZFS_DEV_NAME }; static void zfs_allow_log_destroy(void *arg) { char *poolname = arg; strfree(poolname); } static void zfsdev_init(void) { zfsdev = make_dev(&zfs_cdevsw, 0x0, UID_ROOT, GID_OPERATOR, 0666, ZFS_DEV_NAME); } static void zfsdev_fini(void) { if (zfsdev != NULL) destroy_dev(zfsdev); } static struct root_hold_token *zfs_root_token; struct proc *zfsproc; #ifdef illumos int _init(void) { int error; spa_init(FREAD | FWRITE); zfs_init(); zvol_init(); zfs_ioctl_init(); if ((error = mod_install(&modlinkage)) != 0) { zvol_fini(); zfs_fini(); spa_fini(); return (error); } tsd_create(&zfs_fsyncer_key, NULL); tsd_create(&rrw_tsd_key, rrw_tsd_destroy); tsd_create(&zfs_allow_log_key, zfs_allow_log_destroy); error = ldi_ident_from_mod(&modlinkage, &zfs_li); ASSERT(error == 0); mutex_init(&zfs_share_lock, NULL, MUTEX_DEFAULT, NULL); return (0); } int _fini(void) { int error; if (spa_busy() || zfs_busy() || zvol_busy() || zio_injection_enabled) return (SET_ERROR(EBUSY)); if ((error = mod_remove(&modlinkage)) != 0) return (error); zvol_fini(); zfs_fini(); spa_fini(); if (zfs_nfsshare_inited) (void) ddi_modclose(nfs_mod); if (zfs_smbshare_inited) (void) ddi_modclose(smbsrv_mod); if (zfs_nfsshare_inited || zfs_smbshare_inited) (void) ddi_modclose(sharefs_mod); tsd_destroy(&zfs_fsyncer_key); ldi_ident_release(zfs_li); zfs_li = NULL; mutex_destroy(&zfs_share_lock); return (error); } int _info(struct modinfo *modinfop) { return (mod_info(&modlinkage, modinfop)); } #endif /* illumos */ static int zfs__init(void); static int zfs__fini(void); static void zfs_shutdown(void *, int); static eventhandler_tag zfs_shutdown_event_tag; #ifdef __FreeBSD__ #define ZFS_MIN_KSTACK_PAGES 4 #endif int zfs__init(void) { #ifdef __FreeBSD__ #if KSTACK_PAGES < ZFS_MIN_KSTACK_PAGES printf("ZFS NOTICE: KSTACK_PAGES is %d which could result in stack " "overflow panic!\nPlease consider adding " "'options KSTACK_PAGES=%d' to your kernel config\n", KSTACK_PAGES, ZFS_MIN_KSTACK_PAGES); #endif #endif zfs_root_token = root_mount_hold("ZFS"); mutex_init(&zfs_share_lock, NULL, MUTEX_DEFAULT, NULL); spa_init(FREAD | FWRITE); zfs_init(); zvol_init(); zfs_ioctl_init(); tsd_create(&zfs_fsyncer_key, NULL); tsd_create(&rrw_tsd_key, rrw_tsd_destroy); tsd_create(&zfs_allow_log_key, zfs_allow_log_destroy); printf("ZFS storage pool version: features support (" SPA_VERSION_STRING ")\n"); root_mount_rel(zfs_root_token); zfsdev_init(); return (0); } int zfs__fini(void) { if (spa_busy() || zfs_busy() || zvol_busy() || zio_injection_enabled) { return (EBUSY); } zfsdev_fini(); zvol_fini(); zfs_fini(); spa_fini(); tsd_destroy(&zfs_fsyncer_key); tsd_destroy(&rrw_tsd_key); tsd_destroy(&zfs_allow_log_key); mutex_destroy(&zfs_share_lock); return (0); } static void zfs_shutdown(void *arg __unused, int howto __unused) { /* * ZFS fini routines can not properly work in a panic-ed system. */ if (panicstr == NULL) (void)zfs__fini(); } static int zfs_modevent(module_t mod, int type, void *unused __unused) { int err; switch (type) { case MOD_LOAD: err = zfs__init(); if (err == 0) zfs_shutdown_event_tag = EVENTHANDLER_REGISTER( shutdown_post_sync, zfs_shutdown, NULL, SHUTDOWN_PRI_FIRST); return (err); case MOD_UNLOAD: err = zfs__fini(); if (err == 0 && zfs_shutdown_event_tag != NULL) EVENTHANDLER_DEREGISTER(shutdown_post_sync, zfs_shutdown_event_tag); return (err); case MOD_SHUTDOWN: return (0); default: break; } return (EOPNOTSUPP); } static moduledata_t zfs_mod = { "zfsctrl", zfs_modevent, 0 }; DECLARE_MODULE(zfsctrl, zfs_mod, SI_SUB_VFS, SI_ORDER_ANY); MODULE_VERSION(zfsctrl, 1); MODULE_DEPEND(zfsctrl, opensolaris, 1, 1, 1); MODULE_DEPEND(zfsctrl, krpc, 1, 1, 1); MODULE_DEPEND(zfsctrl, acl_nfs4, 1, 1, 1); Index: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c =================================================================== --- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c (revision 297107) +++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c (revision 297108) @@ -1,3610 +1,3662 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2015 by Delphix. All rights reserved. * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include SYSCTL_DECL(_vfs_zfs); SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO"); #if defined(__amd64__) static int zio_use_uma = 1; #else static int zio_use_uma = 0; #endif TUNABLE_INT("vfs.zfs.zio.use_uma", &zio_use_uma); SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, use_uma, CTLFLAG_RDTUN, &zio_use_uma, 0, "Use uma(9) for ZIO allocations"); static int zio_exclude_metadata = 0; TUNABLE_INT("vfs.zfs.zio.exclude_metadata", &zio_exclude_metadata); SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, exclude_metadata, CTLFLAG_RDTUN, &zio_exclude_metadata, 0, "Exclude metadata buffers from dumps as well"); zio_trim_stats_t zio_trim_stats = { { "bytes", KSTAT_DATA_UINT64, "Number of bytes successfully TRIMmed" }, { "success", KSTAT_DATA_UINT64, "Number of successful TRIM requests" }, { "unsupported", KSTAT_DATA_UINT64, "Number of TRIM requests that failed because TRIM is not supported" }, { "failed", KSTAT_DATA_UINT64, "Number of TRIM requests that failed for reasons other than not supported" }, }; static kstat_t *zio_trim_ksp; /* * ========================================================================== * I/O type descriptions * ========================================================================== */ const char *zio_type_name[ZIO_TYPES] = { "zio_null", "zio_read", "zio_write", "zio_free", "zio_claim", "zio_ioctl" }; /* * ========================================================================== * I/O kmem caches * ========================================================================== */ kmem_cache_t *zio_cache; kmem_cache_t *zio_link_cache; kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; #ifdef _KERNEL extern vmem_t *zio_alloc_arena; #endif #define ZIO_PIPELINE_CONTINUE 0x100 #define ZIO_PIPELINE_STOP 0x101 #define BP_SPANB(indblkshift, level) \ (((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT))) #define COMPARE_META_LEVEL 0x80000000ul /* * The following actions directly effect the spa's sync-to-convergence logic. * The values below define the sync pass when we start performing the action. * Care should be taken when changing these values as they directly impact * spa_sync() performance. Tuning these values may introduce subtle performance * pathologies and should only be done in the context of performance analysis. * These tunables will eventually be removed and replaced with #defines once * enough analysis has been done to determine optimal values. * * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that * regular blocks are not deferred. */ int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */ TUNABLE_INT("vfs.zfs.sync_pass_deferred_free", &zfs_sync_pass_deferred_free); SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_deferred_free, CTLFLAG_RDTUN, &zfs_sync_pass_deferred_free, 0, "defer frees starting in this pass"); int zfs_sync_pass_dont_compress = 5; /* don't compress starting in this pass */ TUNABLE_INT("vfs.zfs.sync_pass_dont_compress", &zfs_sync_pass_dont_compress); SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_dont_compress, CTLFLAG_RDTUN, &zfs_sync_pass_dont_compress, 0, "don't compress starting in this pass"); int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */ TUNABLE_INT("vfs.zfs.sync_pass_rewrite", &zfs_sync_pass_rewrite); SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_rewrite, CTLFLAG_RDTUN, &zfs_sync_pass_rewrite, 0, "rewrite new bps starting in this pass"); /* * An allocating zio is one that either currently has the DVA allocate * stage set or will have it later in its lifetime. */ #define IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE) boolean_t zio_requeue_io_start_cut_in_line = B_TRUE; #ifdef ZFS_DEBUG int zio_buf_debug_limit = 16384; #else int zio_buf_debug_limit = 0; #endif void zio_init(void) { size_t c; zio_cache = kmem_cache_create("zio_cache", sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0); zio_link_cache = kmem_cache_create("zio_link_cache", sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0); if (!zio_use_uma) goto out; /* * For small buffers, we want a cache for each multiple of * SPA_MINBLOCKSIZE. For larger buffers, we want a cache * for each quarter-power of 2. */ for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { size_t size = (c + 1) << SPA_MINBLOCKSHIFT; size_t p2 = size; size_t align = 0; size_t cflags = (size > zio_buf_debug_limit) ? KMC_NODEBUG : 0; while (!ISP2(p2)) p2 &= p2 - 1; #ifdef illumos #ifndef _KERNEL /* * If we are using watchpoints, put each buffer on its own page, * to eliminate the performance overhead of trapping to the * kernel when modifying a non-watched buffer that shares the * page with a watched buffer. */ if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE)) continue; #endif #endif /* illumos */ if (size <= 4 * SPA_MINBLOCKSIZE) { align = SPA_MINBLOCKSIZE; } else if (IS_P2ALIGNED(size, p2 >> 2)) { align = MIN(p2 >> 2, PAGESIZE); } if (align != 0) { char name[36]; (void) sprintf(name, "zio_buf_%lu", (ulong_t)size); zio_buf_cache[c] = kmem_cache_create(name, size, align, NULL, NULL, NULL, NULL, NULL, cflags); /* * Since zio_data bufs do not appear in crash dumps, we * pass KMC_NOTOUCH so that no allocator metadata is * stored with the buffers. */ (void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size); zio_data_buf_cache[c] = kmem_cache_create(name, size, align, NULL, NULL, NULL, NULL, NULL, cflags | KMC_NOTOUCH | KMC_NODEBUG); } } while (--c != 0) { ASSERT(zio_buf_cache[c] != NULL); if (zio_buf_cache[c - 1] == NULL) zio_buf_cache[c - 1] = zio_buf_cache[c]; ASSERT(zio_data_buf_cache[c] != NULL); if (zio_data_buf_cache[c - 1] == NULL) zio_data_buf_cache[c - 1] = zio_data_buf_cache[c]; } out: zio_inject_init(); zio_trim_ksp = kstat_create("zfs", 0, "zio_trim", "misc", KSTAT_TYPE_NAMED, sizeof(zio_trim_stats) / sizeof(kstat_named_t), KSTAT_FLAG_VIRTUAL); if (zio_trim_ksp != NULL) { zio_trim_ksp->ks_data = &zio_trim_stats; kstat_install(zio_trim_ksp); } } void zio_fini(void) { size_t c; kmem_cache_t *last_cache = NULL; kmem_cache_t *last_data_cache = NULL; for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { if (zio_buf_cache[c] != last_cache) { last_cache = zio_buf_cache[c]; kmem_cache_destroy(zio_buf_cache[c]); } zio_buf_cache[c] = NULL; if (zio_data_buf_cache[c] != last_data_cache) { last_data_cache = zio_data_buf_cache[c]; kmem_cache_destroy(zio_data_buf_cache[c]); } zio_data_buf_cache[c] = NULL; } kmem_cache_destroy(zio_link_cache); kmem_cache_destroy(zio_cache); zio_inject_fini(); if (zio_trim_ksp != NULL) { kstat_delete(zio_trim_ksp); zio_trim_ksp = NULL; } } /* * ========================================================================== * Allocate and free I/O buffers * ========================================================================== */ /* * Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a * crashdump if the kernel panics, so use it judiciously. Obviously, it's * useful to inspect ZFS metadata, but if possible, we should avoid keeping * excess / transient data in-core during a crashdump. */ void * zio_buf_alloc(size_t size) { size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; int flags = zio_exclude_metadata ? KM_NODEBUG : 0; VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); if (zio_use_uma) return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE)); else return (kmem_alloc(size, KM_SLEEP|flags)); } /* * Use zio_data_buf_alloc to allocate data. The data will not appear in a * crashdump if the kernel panics. This exists so that we will limit the amount * of ZFS data that shows up in a kernel crashdump. (Thus reducing the amount * of kernel heap dumped to disk when the kernel panics) */ void * zio_data_buf_alloc(size_t size) { size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); if (zio_use_uma) return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE)); else return (kmem_alloc(size, KM_SLEEP | KM_NODEBUG)); } void zio_buf_free(void *buf, size_t size) { size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); if (zio_use_uma) kmem_cache_free(zio_buf_cache[c], buf); else kmem_free(buf, size); } void zio_data_buf_free(void *buf, size_t size) { size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); if (zio_use_uma) kmem_cache_free(zio_data_buf_cache[c], buf); else kmem_free(buf, size); } /* * ========================================================================== * Push and pop I/O transform buffers * ========================================================================== */ static void zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize, zio_transform_func_t *transform) { zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP); zt->zt_orig_data = zio->io_data; zt->zt_orig_size = zio->io_size; zt->zt_bufsize = bufsize; zt->zt_transform = transform; zt->zt_next = zio->io_transform_stack; zio->io_transform_stack = zt; zio->io_data = data; zio->io_size = size; } static void zio_pop_transforms(zio_t *zio) { zio_transform_t *zt; while ((zt = zio->io_transform_stack) != NULL) { if (zt->zt_transform != NULL) zt->zt_transform(zio, zt->zt_orig_data, zt->zt_orig_size); if (zt->zt_bufsize != 0) zio_buf_free(zio->io_data, zt->zt_bufsize); zio->io_data = zt->zt_orig_data; zio->io_size = zt->zt_orig_size; zio->io_transform_stack = zt->zt_next; kmem_free(zt, sizeof (zio_transform_t)); } } /* * ========================================================================== * I/O transform callbacks for subblocks and decompression * ========================================================================== */ static void zio_subblock(zio_t *zio, void *data, uint64_t size) { ASSERT(zio->io_size > size); if (zio->io_type == ZIO_TYPE_READ) bcopy(zio->io_data, data, size); } static void zio_decompress(zio_t *zio, void *data, uint64_t size) { if (zio->io_error == 0 && zio_decompress_data(BP_GET_COMPRESS(zio->io_bp), zio->io_data, data, zio->io_size, size) != 0) zio->io_error = SET_ERROR(EIO); } /* * ========================================================================== * I/O parent/child relationships and pipeline interlocks * ========================================================================== */ /* * NOTE - Callers to zio_walk_parents() and zio_walk_children must * continue calling these functions until they return NULL. * Otherwise, the next caller will pick up the list walk in * some indeterminate state. (Otherwise every caller would * have to pass in a cookie to keep the state represented by * io_walk_link, which gets annoying.) */ zio_t * zio_walk_parents(zio_t *cio) { zio_link_t *zl = cio->io_walk_link; list_t *pl = &cio->io_parent_list; zl = (zl == NULL) ? list_head(pl) : list_next(pl, zl); cio->io_walk_link = zl; if (zl == NULL) return (NULL); ASSERT(zl->zl_child == cio); return (zl->zl_parent); } zio_t * zio_walk_children(zio_t *pio) { zio_link_t *zl = pio->io_walk_link; list_t *cl = &pio->io_child_list; zl = (zl == NULL) ? list_head(cl) : list_next(cl, zl); pio->io_walk_link = zl; if (zl == NULL) return (NULL); ASSERT(zl->zl_parent == pio); return (zl->zl_child); } zio_t * zio_unique_parent(zio_t *cio) { zio_t *pio = zio_walk_parents(cio); VERIFY(zio_walk_parents(cio) == NULL); return (pio); } void zio_add_child(zio_t *pio, zio_t *cio) { zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP); /* * Logical I/Os can have logical, gang, or vdev children. * Gang I/Os can have gang or vdev children. * Vdev I/Os can only have vdev children. * The following ASSERT captures all of these constraints. */ ASSERT(cio->io_child_type <= pio->io_child_type); zl->zl_parent = pio; zl->zl_child = cio; mutex_enter(&cio->io_lock); mutex_enter(&pio->io_lock); ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0); for (int w = 0; w < ZIO_WAIT_TYPES; w++) pio->io_children[cio->io_child_type][w] += !cio->io_state[w]; list_insert_head(&pio->io_child_list, zl); list_insert_head(&cio->io_parent_list, zl); pio->io_child_count++; cio->io_parent_count++; mutex_exit(&pio->io_lock); mutex_exit(&cio->io_lock); } static void zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl) { ASSERT(zl->zl_parent == pio); ASSERT(zl->zl_child == cio); mutex_enter(&cio->io_lock); mutex_enter(&pio->io_lock); list_remove(&pio->io_child_list, zl); list_remove(&cio->io_parent_list, zl); pio->io_child_count--; cio->io_parent_count--; mutex_exit(&pio->io_lock); mutex_exit(&cio->io_lock); kmem_cache_free(zio_link_cache, zl); } static boolean_t zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait) { uint64_t *countp = &zio->io_children[child][wait]; boolean_t waiting = B_FALSE; mutex_enter(&zio->io_lock); ASSERT(zio->io_stall == NULL); if (*countp != 0) { zio->io_stage >>= 1; zio->io_stall = countp; waiting = B_TRUE; } mutex_exit(&zio->io_lock); return (waiting); } static void zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait) { uint64_t *countp = &pio->io_children[zio->io_child_type][wait]; int *errorp = &pio->io_child_error[zio->io_child_type]; mutex_enter(&pio->io_lock); if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) *errorp = zio_worst_error(*errorp, zio->io_error); pio->io_reexecute |= zio->io_reexecute; ASSERT3U(*countp, >, 0); (*countp)--; if (*countp == 0 && pio->io_stall == countp) { pio->io_stall = NULL; mutex_exit(&pio->io_lock); zio_execute(pio); } else { mutex_exit(&pio->io_lock); } } static void zio_inherit_child_errors(zio_t *zio, enum zio_child c) { if (zio->io_child_error[c] != 0 && zio->io_error == 0) zio->io_error = zio->io_child_error[c]; } /* * ========================================================================== * Create the various types of I/O (read, write, free, etc) * ========================================================================== */ static zio_t * zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, void *data, uint64_t size, zio_done_func_t *done, void *private, zio_type_t type, zio_priority_t priority, enum zio_flag flags, vdev_t *vd, uint64_t offset, const zbookmark_phys_t *zb, enum zio_stage stage, enum zio_stage pipeline) { zio_t *zio; ASSERT3U(type == ZIO_TYPE_FREE || size, <=, SPA_MAXBLOCKSIZE); ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0); ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0); ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER)); ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER)); ASSERT(vd || stage == ZIO_STAGE_OPEN); zio = kmem_cache_alloc(zio_cache, KM_SLEEP); bzero(zio, sizeof (zio_t)); mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL); list_create(&zio->io_parent_list, sizeof (zio_link_t), offsetof(zio_link_t, zl_parent_node)); list_create(&zio->io_child_list, sizeof (zio_link_t), offsetof(zio_link_t, zl_child_node)); if (vd != NULL) zio->io_child_type = ZIO_CHILD_VDEV; else if (flags & ZIO_FLAG_GANG_CHILD) zio->io_child_type = ZIO_CHILD_GANG; else if (flags & ZIO_FLAG_DDT_CHILD) zio->io_child_type = ZIO_CHILD_DDT; else zio->io_child_type = ZIO_CHILD_LOGICAL; if (bp != NULL) { zio->io_bp = (blkptr_t *)bp; zio->io_bp_copy = *bp; zio->io_bp_orig = *bp; if (type != ZIO_TYPE_WRITE || zio->io_child_type == ZIO_CHILD_DDT) zio->io_bp = &zio->io_bp_copy; /* so caller can free */ if (zio->io_child_type == ZIO_CHILD_LOGICAL) zio->io_logical = zio; if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp)) pipeline |= ZIO_GANG_STAGES; } zio->io_spa = spa; zio->io_txg = txg; zio->io_done = done; zio->io_private = private; zio->io_type = type; zio->io_priority = priority; zio->io_vd = vd; zio->io_offset = offset; zio->io_orig_data = zio->io_data = data; zio->io_orig_size = zio->io_size = size; zio->io_orig_flags = zio->io_flags = flags; zio->io_orig_stage = zio->io_stage = stage; zio->io_orig_pipeline = zio->io_pipeline = pipeline; zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY); zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE); if (zb != NULL) zio->io_bookmark = *zb; if (pio != NULL) { if (zio->io_logical == NULL) zio->io_logical = pio->io_logical; if (zio->io_child_type == ZIO_CHILD_GANG) zio->io_gang_leader = pio->io_gang_leader; zio_add_child(pio, zio); } return (zio); } static void zio_destroy(zio_t *zio) { list_destroy(&zio->io_parent_list); list_destroy(&zio->io_child_list); mutex_destroy(&zio->io_lock); cv_destroy(&zio->io_cv); kmem_cache_free(zio_cache, zio); } zio_t * zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done, void *private, enum zio_flag flags) { zio_t *zio; zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL, ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE); return (zio); } zio_t * zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags) { return (zio_null(NULL, spa, NULL, done, private, flags)); } void zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp) { if (!DMU_OT_IS_VALID(BP_GET_TYPE(bp))) { zfs_panic_recover("blkptr at %p has invalid TYPE %llu", bp, (longlong_t)BP_GET_TYPE(bp)); } if (BP_GET_CHECKSUM(bp) >= ZIO_CHECKSUM_FUNCTIONS || BP_GET_CHECKSUM(bp) <= ZIO_CHECKSUM_ON) { zfs_panic_recover("blkptr at %p has invalid CHECKSUM %llu", bp, (longlong_t)BP_GET_CHECKSUM(bp)); } if (BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_FUNCTIONS || BP_GET_COMPRESS(bp) <= ZIO_COMPRESS_ON) { zfs_panic_recover("blkptr at %p has invalid COMPRESS %llu", bp, (longlong_t)BP_GET_COMPRESS(bp)); } if (BP_GET_LSIZE(bp) > SPA_MAXBLOCKSIZE) { zfs_panic_recover("blkptr at %p has invalid LSIZE %llu", bp, (longlong_t)BP_GET_LSIZE(bp)); } if (BP_GET_PSIZE(bp) > SPA_MAXBLOCKSIZE) { zfs_panic_recover("blkptr at %p has invalid PSIZE %llu", bp, (longlong_t)BP_GET_PSIZE(bp)); } if (BP_IS_EMBEDDED(bp)) { if (BPE_GET_ETYPE(bp) > NUM_BP_EMBEDDED_TYPES) { zfs_panic_recover("blkptr at %p has invalid ETYPE %llu", bp, (longlong_t)BPE_GET_ETYPE(bp)); } } /* * Pool-specific checks. * * Note: it would be nice to verify that the blk_birth and * BP_PHYSICAL_BIRTH() are not too large. However, spa_freeze() * allows the birth time of log blocks (and dmu_sync()-ed blocks * that are in the log) to be arbitrarily large. */ for (int i = 0; i < BP_GET_NDVAS(bp); i++) { uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[i]); if (vdevid >= spa->spa_root_vdev->vdev_children) { zfs_panic_recover("blkptr at %p DVA %u has invalid " "VDEV %llu", bp, i, (longlong_t)vdevid); continue; } vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid]; if (vd == NULL) { zfs_panic_recover("blkptr at %p DVA %u has invalid " "VDEV %llu", bp, i, (longlong_t)vdevid); continue; } if (vd->vdev_ops == &vdev_hole_ops) { zfs_panic_recover("blkptr at %p DVA %u has hole " "VDEV %llu", bp, i, (longlong_t)vdevid); continue; } if (vd->vdev_ops == &vdev_missing_ops) { /* * "missing" vdevs are valid during import, but we * don't have their detailed info (e.g. asize), so * we can't perform any more checks on them. */ continue; } uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]); uint64_t asize = DVA_GET_ASIZE(&bp->blk_dva[i]); if (BP_IS_GANG(bp)) asize = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); if (offset + asize > vd->vdev_asize) { zfs_panic_recover("blkptr at %p DVA %u has invalid " "OFFSET %llu", bp, i, (longlong_t)offset); } } } zio_t * zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, void *data, uint64_t size, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb) { zio_t *zio; zfs_blkptr_verify(spa, bp); zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp, data, size, done, private, ZIO_TYPE_READ, priority, flags, NULL, 0, zb, ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE); return (zio); } zio_t * zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data, uint64_t size, const zio_prop_t *zp, zio_done_func_t *ready, zio_done_func_t *physdone, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb) { zio_t *zio; ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF && zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS && zp->zp_compress >= ZIO_COMPRESS_OFF && zp->zp_compress < ZIO_COMPRESS_FUNCTIONS && DMU_OT_IS_VALID(zp->zp_type) && zp->zp_level < 32 && zp->zp_copies > 0 && zp->zp_copies <= spa_max_replication(spa)); zio = zio_create(pio, spa, txg, bp, data, size, done, private, ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb, ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE); zio->io_ready = ready; zio->io_physdone = physdone; zio->io_prop = *zp; /* * Data can be NULL if we are going to call zio_write_override() to * provide the already-allocated BP. But we may need the data to * verify a dedup hit (if requested). In this case, don't try to * dedup (just take the already-allocated BP verbatim). */ if (data == NULL && zio->io_prop.zp_dedup_verify) { zio->io_prop.zp_dedup = zio->io_prop.zp_dedup_verify = B_FALSE; } return (zio); } zio_t * zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data, uint64_t size, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb) { zio_t *zio; zio = zio_create(pio, spa, txg, bp, data, size, done, private, ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb, ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE); return (zio); } void zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite) { ASSERT(zio->io_type == ZIO_TYPE_WRITE); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(zio->io_stage == ZIO_STAGE_OPEN); ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa)); /* * We must reset the io_prop to match the values that existed * when the bp was first written by dmu_sync() keeping in mind * that nopwrite and dedup are mutually exclusive. */ zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup; zio->io_prop.zp_nopwrite = nopwrite; zio->io_prop.zp_copies = copies; zio->io_bp_override = bp; } void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp) { /* * The check for EMBEDDED is a performance optimization. We * process the free here (by ignoring it) rather than * putting it on the list and then processing it in zio_free_sync(). */ if (BP_IS_EMBEDDED(bp)) return; metaslab_check_free(spa, bp); /* * Frees that are for the currently-syncing txg, are not going to be * deferred, and which will not need to do a read (i.e. not GANG or * DEDUP), can be processed immediately. Otherwise, put them on the * in-memory list for later processing. */ if (zfs_trim_enabled || BP_IS_GANG(bp) || BP_GET_DEDUP(bp) || txg != spa->spa_syncing_txg || spa_sync_pass(spa) >= zfs_sync_pass_deferred_free) { bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp); } else { VERIFY0(zio_wait(zio_free_sync(NULL, spa, txg, bp, BP_GET_PSIZE(bp), 0))); } } zio_t * zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, uint64_t size, enum zio_flag flags) { zio_t *zio; enum zio_stage stage = ZIO_FREE_PIPELINE; ASSERT(!BP_IS_HOLE(bp)); ASSERT(spa_syncing_txg(spa) == txg); ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free); if (BP_IS_EMBEDDED(bp)) return (zio_null(pio, spa, NULL, NULL, NULL, 0)); metaslab_check_free(spa, bp); arc_freed(spa, bp); if (zfs_trim_enabled) stage |= ZIO_STAGE_ISSUE_ASYNC | ZIO_STAGE_VDEV_IO_START | ZIO_STAGE_VDEV_IO_ASSESS; /* * GANG and DEDUP blocks can induce a read (for the gang block header, * or the DDT), so issue them asynchronously so that this thread is * not tied up. */ else if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp)) stage |= ZIO_STAGE_ISSUE_ASYNC; flags |= ZIO_FLAG_DONT_QUEUE; zio = zio_create(pio, spa, txg, bp, NULL, size, NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW, flags, NULL, 0, NULL, ZIO_STAGE_OPEN, stage); return (zio); } zio_t * zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio_done_func_t *done, void *private, enum zio_flag flags) { zio_t *zio; dprintf_bp(bp, "claiming in txg %llu", txg); if (BP_IS_EMBEDDED(bp)) return (zio_null(pio, spa, NULL, NULL, NULL, 0)); /* * A claim is an allocation of a specific block. Claims are needed * to support immediate writes in the intent log. The issue is that * immediate writes contain committed data, but in a txg that was * *not* committed. Upon opening the pool after an unclean shutdown, * the intent log claims all blocks that contain immediate write data * so that the SPA knows they're in use. * * All claims *must* be resolved in the first txg -- before the SPA * starts allocating blocks -- so that nothing is allocated twice. * If txg == 0 we just verify that the block is claimable. */ ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa)); ASSERT(txg == spa_first_txg(spa) || txg == 0); ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa)); /* zdb(1M) */ zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags, NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE); return (zio); } zio_t * zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, uint64_t offset, uint64_t size, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags) { zio_t *zio; int c; if (vd->vdev_children == 0) { zio = zio_create(pio, spa, 0, NULL, NULL, size, done, private, ZIO_TYPE_IOCTL, priority, flags, vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); zio->io_cmd = cmd; } else { zio = zio_null(pio, spa, NULL, NULL, NULL, flags); for (c = 0; c < vd->vdev_children; c++) zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd, offset, size, done, private, priority, flags)); } return (zio); } zio_t * zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, void *data, int checksum, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, boolean_t labels) { zio_t *zio; ASSERT(vd->vdev_children == 0); ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); ASSERT3U(offset + size, <=, vd->vdev_psize); zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private, ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE); zio->io_prop.zp_checksum = checksum; return (zio); } zio_t * zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, void *data, int checksum, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, boolean_t labels) { zio_t *zio; ASSERT(vd->vdev_children == 0); ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); ASSERT3U(offset + size, <=, vd->vdev_psize); zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private, ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE); zio->io_prop.zp_checksum = checksum; if (zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_EMBEDDED) { /* * zec checksums are necessarily destructive -- they modify * the end of the write buffer to hold the verifier/checksum. * Therefore, we must make a local copy in case the data is * being written to multiple places in parallel. */ void *wbuf = zio_buf_alloc(size); bcopy(data, wbuf, size); zio_push_transform(zio, wbuf, size, size, NULL); } return (zio); } /* * Create a child I/O to do some work for us. */ zio_t * zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, void *data, uint64_t size, int type, zio_priority_t priority, enum zio_flag flags, zio_done_func_t *done, void *private) { enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE; zio_t *zio; ASSERT(vd->vdev_parent == (pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev)); if (type == ZIO_TYPE_READ && bp != NULL) { /* * If we have the bp, then the child should perform the * checksum and the parent need not. This pushes error * detection as close to the leaves as possible and * eliminates redundant checksums in the interior nodes. */ pipeline |= ZIO_STAGE_CHECKSUM_VERIFY; pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; } /* Not all IO types require vdev io done stage e.g. free */ if (!(pio->io_pipeline & ZIO_STAGE_VDEV_IO_DONE)) pipeline &= ~ZIO_STAGE_VDEV_IO_DONE; if (vd->vdev_children == 0) offset += VDEV_LABEL_START_SIZE; flags |= ZIO_VDEV_CHILD_FLAGS(pio) | ZIO_FLAG_DONT_PROPAGATE; /* * If we've decided to do a repair, the write is not speculative -- * even if the original read was. */ if (flags & ZIO_FLAG_IO_REPAIR) flags &= ~ZIO_FLAG_SPECULATIVE; zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, done, private, type, priority, flags, vd, offset, &pio->io_bookmark, ZIO_STAGE_VDEV_IO_START >> 1, pipeline); zio->io_physdone = pio->io_physdone; if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL) zio->io_logical->io_phys_children++; return (zio); } zio_t * zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size, int type, zio_priority_t priority, enum zio_flag flags, zio_done_func_t *done, void *private) { zio_t *zio; ASSERT(vd->vdev_ops->vdev_op_leaf); zio = zio_create(NULL, vd->vdev_spa, 0, NULL, data, size, done, private, type, priority, flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED, vd, offset, NULL, ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE); return (zio); } void zio_flush(zio_t *zio, vdev_t *vd) { zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, 0, 0, NULL, NULL, ZIO_PRIORITY_NOW, ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY)); } zio_t * zio_trim(zio_t *zio, spa_t *spa, vdev_t *vd, uint64_t offset, uint64_t size) { ASSERT(vd->vdev_ops->vdev_op_leaf); return (zio_create(zio, spa, 0, NULL, NULL, size, NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_TRIM, ZIO_FLAG_DONT_AGGREGATE | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PHYS_PIPELINE)); } void zio_shrink(zio_t *zio, uint64_t size) { ASSERT(zio->io_executor == NULL); ASSERT(zio->io_orig_size == zio->io_size); ASSERT(size <= zio->io_size); /* * We don't shrink for raidz because of problems with the * reconstruction when reading back less than the block size. * Note, BP_IS_RAIDZ() assumes no compression. */ ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF); if (!BP_IS_RAIDZ(zio->io_bp)) zio->io_orig_size = zio->io_size = size; } /* * ========================================================================== * Prepare to read and write logical blocks * ========================================================================== */ static int zio_read_bp_init(zio_t *zio) { blkptr_t *bp = zio->io_bp; if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF && zio->io_child_type == ZIO_CHILD_LOGICAL && !(zio->io_flags & ZIO_FLAG_RAW)) { uint64_t psize = BP_IS_EMBEDDED(bp) ? BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp); void *cbuf = zio_buf_alloc(psize); zio_push_transform(zio, cbuf, psize, psize, zio_decompress); } if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) { zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; decode_embedded_bp_compressed(bp, zio->io_data); } else { ASSERT(!BP_IS_EMBEDDED(bp)); } if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0) zio->io_flags |= ZIO_FLAG_DONT_CACHE; if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP) zio->io_flags |= ZIO_FLAG_DONT_CACHE; if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL) zio->io_pipeline = ZIO_DDT_READ_PIPELINE; return (ZIO_PIPELINE_CONTINUE); } static int zio_write_bp_init(zio_t *zio) { spa_t *spa = zio->io_spa; zio_prop_t *zp = &zio->io_prop; enum zio_compress compress = zp->zp_compress; blkptr_t *bp = zio->io_bp; uint64_t lsize = zio->io_size; uint64_t psize = lsize; int pass = 1; /* * If our children haven't all reached the ready stage, * wait for them and then repeat this pipeline stage. */ if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) || zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY)) return (ZIO_PIPELINE_STOP); if (!IO_IS_ALLOCATING(zio)) return (ZIO_PIPELINE_CONTINUE); ASSERT(zio->io_child_type != ZIO_CHILD_DDT); if (zio->io_bp_override) { ASSERT(bp->blk_birth != zio->io_txg); ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0); *bp = *zio->io_bp_override; zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; if (BP_IS_EMBEDDED(bp)) return (ZIO_PIPELINE_CONTINUE); /* * If we've been overridden and nopwrite is set then * set the flag accordingly to indicate that a nopwrite * has already occurred. */ if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) { ASSERT(!zp->zp_dedup); zio->io_flags |= ZIO_FLAG_NOPWRITE; return (ZIO_PIPELINE_CONTINUE); } ASSERT(!zp->zp_nopwrite); if (BP_IS_HOLE(bp) || !zp->zp_dedup) return (ZIO_PIPELINE_CONTINUE); ASSERT((zio_checksum_table[zp->zp_checksum].ci_flags & ZCHECKSUM_FLAG_DEDUP) || zp->zp_dedup_verify); if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) { BP_SET_DEDUP(bp, 1); zio->io_pipeline |= ZIO_STAGE_DDT_WRITE; return (ZIO_PIPELINE_CONTINUE); } zio->io_bp_override = NULL; BP_ZERO(bp); } if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) { /* * We're rewriting an existing block, which means we're * working on behalf of spa_sync(). For spa_sync() to * converge, it must eventually be the case that we don't * have to allocate new blocks. But compression changes * the blocksize, which forces a reallocate, and makes * convergence take longer. Therefore, after the first * few passes, stop compressing to ensure convergence. */ pass = spa_sync_pass(spa); ASSERT(zio->io_txg == spa_syncing_txg(spa)); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(!BP_GET_DEDUP(bp)); if (pass >= zfs_sync_pass_dont_compress) compress = ZIO_COMPRESS_OFF; /* Make sure someone doesn't change their mind on overwrites */ ASSERT(BP_IS_EMBEDDED(bp) || MIN(zp->zp_copies + BP_IS_GANG(bp), spa_max_replication(spa)) == BP_GET_NDVAS(bp)); } if (compress != ZIO_COMPRESS_OFF) { void *cbuf = zio_buf_alloc(lsize); psize = zio_compress_data(compress, zio->io_data, cbuf, lsize); if (psize == 0 || psize == lsize) { compress = ZIO_COMPRESS_OFF; zio_buf_free(cbuf, lsize); } else if (!zp->zp_dedup && psize <= BPE_PAYLOAD_SIZE && zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) && spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) { encode_embedded_bp_compressed(bp, cbuf, compress, lsize, psize); BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA); BP_SET_TYPE(bp, zio->io_prop.zp_type); BP_SET_LEVEL(bp, zio->io_prop.zp_level); zio_buf_free(cbuf, lsize); bp->blk_birth = zio->io_txg; zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; ASSERT(spa_feature_is_active(spa, SPA_FEATURE_EMBEDDED_DATA)); return (ZIO_PIPELINE_CONTINUE); } else { /* * Round up compressed size up to the ashift * of the smallest-ashift device, and zero the tail. * This ensures that the compressed size of the BP * (and thus compressratio property) are correct, * in that we charge for the padding used to fill out * the last sector. */ ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT); size_t rounded = (size_t)P2ROUNDUP(psize, 1ULL << spa->spa_min_ashift); if (rounded >= lsize) { compress = ZIO_COMPRESS_OFF; zio_buf_free(cbuf, lsize); psize = lsize; } else { bzero((char *)cbuf + psize, rounded - psize); psize = rounded; zio_push_transform(zio, cbuf, psize, lsize, NULL); } } } /* * The final pass of spa_sync() must be all rewrites, but the first * few passes offer a trade-off: allocating blocks defers convergence, * but newly allocated blocks are sequential, so they can be written * to disk faster. Therefore, we allow the first few passes of * spa_sync() to allocate new blocks, but force rewrites after that. * There should only be a handful of blocks after pass 1 in any case. */ if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == psize && pass >= zfs_sync_pass_rewrite) { ASSERT(psize != 0); enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES; zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages; zio->io_flags |= ZIO_FLAG_IO_REWRITE; } else { BP_ZERO(bp); zio->io_pipeline = ZIO_WRITE_PIPELINE; } if (psize == 0) { if (zio->io_bp_orig.blk_birth != 0 && spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) { BP_SET_LSIZE(bp, lsize); BP_SET_TYPE(bp, zp->zp_type); BP_SET_LEVEL(bp, zp->zp_level); BP_SET_BIRTH(bp, zio->io_txg, 0); } zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; } else { ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER); BP_SET_LSIZE(bp, lsize); BP_SET_TYPE(bp, zp->zp_type); BP_SET_LEVEL(bp, zp->zp_level); BP_SET_PSIZE(bp, psize); BP_SET_COMPRESS(bp, compress); BP_SET_CHECKSUM(bp, zp->zp_checksum); BP_SET_DEDUP(bp, zp->zp_dedup); BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); if (zp->zp_dedup) { ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE; } if (zp->zp_nopwrite) { ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); zio->io_pipeline |= ZIO_STAGE_NOP_WRITE; } } return (ZIO_PIPELINE_CONTINUE); } static int zio_free_bp_init(zio_t *zio) { blkptr_t *bp = zio->io_bp; if (zio->io_child_type == ZIO_CHILD_LOGICAL) { if (BP_GET_DEDUP(bp)) zio->io_pipeline = ZIO_DDT_FREE_PIPELINE; } return (ZIO_PIPELINE_CONTINUE); } /* * ========================================================================== * Execute the I/O pipeline * ========================================================================== */ static void zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline) { spa_t *spa = zio->io_spa; zio_type_t t = zio->io_type; int flags = (cutinline ? TQ_FRONT : 0); ASSERT(q == ZIO_TASKQ_ISSUE || q == ZIO_TASKQ_INTERRUPT); /* * If we're a config writer or a probe, the normal issue and * interrupt threads may all be blocked waiting for the config lock. * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL. */ if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE)) t = ZIO_TYPE_NULL; /* * A similar issue exists for the L2ARC write thread until L2ARC 2.0. */ if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux) t = ZIO_TYPE_NULL; /* * If this is a high priority I/O, then use the high priority taskq if * available. */ if (zio->io_priority == ZIO_PRIORITY_NOW && spa->spa_zio_taskq[t][q + 1].stqs_count != 0) q++; ASSERT3U(q, <, ZIO_TASKQ_TYPES); /* * NB: We are assuming that the zio can only be dispatched * to a single taskq at a time. It would be a grievous error * to dispatch the zio to another taskq at the same time. */ #if defined(illumos) || !defined(_KERNEL) ASSERT(zio->io_tqent.tqent_next == NULL); #else ASSERT(zio->io_tqent.tqent_task.ta_pending == 0); #endif spa_taskq_dispatch_ent(spa, t, q, (task_func_t *)zio_execute, zio, flags, &zio->io_tqent); } static boolean_t zio_taskq_member(zio_t *zio, zio_taskq_type_t q) { kthread_t *executor = zio->io_executor; spa_t *spa = zio->io_spa; for (zio_type_t t = 0; t < ZIO_TYPES; t++) { spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; uint_t i; for (i = 0; i < tqs->stqs_count; i++) { if (taskq_member(tqs->stqs_taskq[i], executor)) return (B_TRUE); } } return (B_FALSE); } static int zio_issue_async(zio_t *zio) { zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); return (ZIO_PIPELINE_STOP); } void zio_interrupt(zio_t *zio) { zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE); } +void +zio_delay_interrupt(zio_t *zio) +{ + /* + * The timeout_generic() function isn't defined in userspace, so + * rather than trying to implement the function, the zio delay + * functionality has been disabled for userspace builds. + */ + +#ifdef _KERNEL + /* + * If io_target_timestamp is zero, then no delay has been registered + * for this IO, thus jump to the end of this function and "skip" the + * delay; issuing it directly to the zio layer. + */ + if (zio->io_target_timestamp != 0) { + hrtime_t now = gethrtime(); + + if (now >= zio->io_target_timestamp) { + /* + * This IO has already taken longer than the target + * delay to complete, so we don't want to delay it + * any longer; we "miss" the delay and issue it + * directly to the zio layer. This is likely due to + * the target latency being set to a value less than + * the underlying hardware can satisfy (e.g. delay + * set to 1ms, but the disks take 10ms to complete an + * IO request). + */ + + DTRACE_PROBE2(zio__delay__miss, zio_t *, zio, + hrtime_t, now); + + zio_interrupt(zio); + } else { + hrtime_t diff = zio->io_target_timestamp - now; + + DTRACE_PROBE3(zio__delay__hit, zio_t *, zio, + hrtime_t, now, hrtime_t, diff); + + (void) timeout_generic(CALLOUT_NORMAL, + (void (*)(void *))zio_interrupt, zio, diff, 1, 0); + } + + return; + } +#endif + + DTRACE_PROBE1(zio__delay__skip, zio_t *, zio); + zio_interrupt(zio); +} + /* * Execute the I/O pipeline until one of the following occurs: * * (1) the I/O completes * (2) the pipeline stalls waiting for dependent child I/Os * (3) the I/O issues, so we're waiting for an I/O completion interrupt * (4) the I/O is delegated by vdev-level caching or aggregation * (5) the I/O is deferred due to vdev-level queueing * (6) the I/O is handed off to another thread. * * In all cases, the pipeline stops whenever there's no CPU work; it never * burns a thread in cv_wait(). * * There's no locking on io_stage because there's no legitimate way * for multiple threads to be attempting to process the same I/O. */ static zio_pipe_stage_t *zio_pipeline[]; void zio_execute(zio_t *zio) { zio->io_executor = curthread; while (zio->io_stage < ZIO_STAGE_DONE) { enum zio_stage pipeline = zio->io_pipeline; enum zio_stage stage = zio->io_stage; int rv; ASSERT(!MUTEX_HELD(&zio->io_lock)); ASSERT(ISP2(stage)); ASSERT(zio->io_stall == NULL); do { stage <<= 1; } while ((stage & pipeline) == 0); ASSERT(stage <= ZIO_STAGE_DONE); /* * If we are in interrupt context and this pipeline stage * will grab a config lock that is held across I/O, * or may wait for an I/O that needs an interrupt thread * to complete, issue async to avoid deadlock. * * For VDEV_IO_START, we cut in line so that the io will * be sent to disk promptly. */ if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL && zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) { boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ? zio_requeue_io_start_cut_in_line : B_FALSE; zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut); return; } zio->io_stage = stage; rv = zio_pipeline[highbit64(stage) - 1](zio); if (rv == ZIO_PIPELINE_STOP) return; ASSERT(rv == ZIO_PIPELINE_CONTINUE); } } /* * ========================================================================== * Initiate I/O, either sync or async * ========================================================================== */ int zio_wait(zio_t *zio) { int error; ASSERT(zio->io_stage == ZIO_STAGE_OPEN); ASSERT(zio->io_executor == NULL); zio->io_waiter = curthread; zio_execute(zio); mutex_enter(&zio->io_lock); while (zio->io_executor != NULL) cv_wait(&zio->io_cv, &zio->io_lock); mutex_exit(&zio->io_lock); error = zio->io_error; zio_destroy(zio); return (error); } void zio_nowait(zio_t *zio) { ASSERT(zio->io_executor == NULL); if (zio->io_child_type == ZIO_CHILD_LOGICAL && zio_unique_parent(zio) == NULL) { /* * This is a logical async I/O with no parent to wait for it. * We add it to the spa_async_root_zio "Godfather" I/O which * will ensure they complete prior to unloading the pool. */ spa_t *spa = zio->io_spa; zio_add_child(spa->spa_async_zio_root[CPU_SEQID], zio); } zio_execute(zio); } /* * ========================================================================== * Reexecute or suspend/resume failed I/O * ========================================================================== */ static void zio_reexecute(zio_t *pio) { zio_t *cio, *cio_next; ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN); ASSERT(pio->io_gang_leader == NULL); ASSERT(pio->io_gang_tree == NULL); pio->io_flags = pio->io_orig_flags; pio->io_stage = pio->io_orig_stage; pio->io_pipeline = pio->io_orig_pipeline; pio->io_reexecute = 0; pio->io_flags |= ZIO_FLAG_REEXECUTED; pio->io_error = 0; for (int w = 0; w < ZIO_WAIT_TYPES; w++) pio->io_state[w] = 0; for (int c = 0; c < ZIO_CHILD_TYPES; c++) pio->io_child_error[c] = 0; if (IO_IS_ALLOCATING(pio)) BP_ZERO(pio->io_bp); /* * As we reexecute pio's children, new children could be created. * New children go to the head of pio's io_child_list, however, * so we will (correctly) not reexecute them. The key is that * the remainder of pio's io_child_list, from 'cio_next' onward, * cannot be affected by any side effects of reexecuting 'cio'. */ for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) { cio_next = zio_walk_children(pio); mutex_enter(&pio->io_lock); for (int w = 0; w < ZIO_WAIT_TYPES; w++) pio->io_children[cio->io_child_type][w]++; mutex_exit(&pio->io_lock); zio_reexecute(cio); } /* * Now that all children have been reexecuted, execute the parent. * We don't reexecute "The Godfather" I/O here as it's the * responsibility of the caller to wait on him. */ if (!(pio->io_flags & ZIO_FLAG_GODFATHER)) zio_execute(pio); } void zio_suspend(spa_t *spa, zio_t *zio) { if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC) fm_panic("Pool '%s' has encountered an uncorrectable I/O " "failure and the failure mode property for this pool " "is set to panic.", spa_name(spa)); zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0); mutex_enter(&spa->spa_suspend_lock); if (spa->spa_suspend_zio_root == NULL) spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); spa->spa_suspended = B_TRUE; if (zio != NULL) { ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); ASSERT(zio != spa->spa_suspend_zio_root); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(zio_unique_parent(zio) == NULL); ASSERT(zio->io_stage == ZIO_STAGE_DONE); zio_add_child(spa->spa_suspend_zio_root, zio); } mutex_exit(&spa->spa_suspend_lock); } int zio_resume(spa_t *spa) { zio_t *pio; /* * Reexecute all previously suspended i/o. */ mutex_enter(&spa->spa_suspend_lock); spa->spa_suspended = B_FALSE; cv_broadcast(&spa->spa_suspend_cv); pio = spa->spa_suspend_zio_root; spa->spa_suspend_zio_root = NULL; mutex_exit(&spa->spa_suspend_lock); if (pio == NULL) return (0); zio_reexecute(pio); return (zio_wait(pio)); } void zio_resume_wait(spa_t *spa) { mutex_enter(&spa->spa_suspend_lock); while (spa_suspended(spa)) cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock); mutex_exit(&spa->spa_suspend_lock); } /* * ========================================================================== * Gang blocks. * * A gang block is a collection of small blocks that looks to the DMU * like one large block. When zio_dva_allocate() cannot find a block * of the requested size, due to either severe fragmentation or the pool * being nearly full, it calls zio_write_gang_block() to construct the * block from smaller fragments. * * A gang block consists of a gang header (zio_gbh_phys_t) and up to * three (SPA_GBH_NBLKPTRS) gang members. The gang header is just like * an indirect block: it's an array of block pointers. It consumes * only one sector and hence is allocatable regardless of fragmentation. * The gang header's bps point to its gang members, which hold the data. * * Gang blocks are self-checksumming, using the bp's * as the verifier to ensure uniqueness of the SHA256 checksum. * Critically, the gang block bp's blk_cksum is the checksum of the data, * not the gang header. This ensures that data block signatures (needed for * deduplication) are independent of how the block is physically stored. * * Gang blocks can be nested: a gang member may itself be a gang block. * Thus every gang block is a tree in which root and all interior nodes are * gang headers, and the leaves are normal blocks that contain user data. * The root of the gang tree is called the gang leader. * * To perform any operation (read, rewrite, free, claim) on a gang block, * zio_gang_assemble() first assembles the gang tree (minus data leaves) * in the io_gang_tree field of the original logical i/o by recursively * reading the gang leader and all gang headers below it. This yields * an in-core tree containing the contents of every gang header and the * bps for every constituent of the gang block. * * With the gang tree now assembled, zio_gang_issue() just walks the gang tree * and invokes a callback on each bp. To free a gang block, zio_gang_issue() * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp. * zio_claim_gang() provides a similarly trivial wrapper for zio_claim(). * zio_read_gang() is a wrapper around zio_read() that omits reading gang * headers, since we already have those in io_gang_tree. zio_rewrite_gang() * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite() * of the gang header plus zio_checksum_compute() of the data to update the * gang header's blk_cksum as described above. * * The two-phase assemble/issue model solves the problem of partial failure -- * what if you'd freed part of a gang block but then couldn't read the * gang header for another part? Assembling the entire gang tree first * ensures that all the necessary gang header I/O has succeeded before * starting the actual work of free, claim, or write. Once the gang tree * is assembled, free and claim are in-memory operations that cannot fail. * * In the event that a gang write fails, zio_dva_unallocate() walks the * gang tree to immediately free (i.e. insert back into the space map) * everything we've allocated. This ensures that we don't get ENOSPC * errors during repeated suspend/resume cycles due to a flaky device. * * Gang rewrites only happen during sync-to-convergence. If we can't assemble * the gang tree, we won't modify the block, so we can safely defer the free * (knowing that the block is still intact). If we *can* assemble the gang * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free * each constituent bp and we can allocate a new block on the next sync pass. * * In all cases, the gang tree allows complete recovery from partial failure. * ========================================================================== */ static zio_t * zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) { if (gn != NULL) return (pio); return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark)); } zio_t * zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) { zio_t *zio; if (gn != NULL) { zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); /* * As we rewrite each gang header, the pipeline will compute * a new gang block header checksum for it; but no one will * compute a new data checksum, so we do that here. The one * exception is the gang leader: the pipeline already computed * its data checksum because that stage precedes gang assembly. * (Presently, nothing actually uses interior data checksums; * this is just good hygiene.) */ if (gn != pio->io_gang_leader->io_gang_tree) { zio_checksum_compute(zio, BP_GET_CHECKSUM(bp), data, BP_GET_PSIZE(bp)); } /* * If we are here to damage data for testing purposes, * leave the GBH alone so that we can detect the damage. */ if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE) zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; } else { zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); } return (zio); } /* ARGSUSED */ zio_t * zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) { return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp, BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp), ZIO_GANG_CHILD_FLAGS(pio))); } /* ARGSUSED */ zio_t * zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) { return (zio_claim(pio, pio->io_spa, pio->io_txg, bp, NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio))); } static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = { NULL, zio_read_gang, zio_rewrite_gang, zio_free_gang, zio_claim_gang, NULL }; static void zio_gang_tree_assemble_done(zio_t *zio); static zio_gang_node_t * zio_gang_node_alloc(zio_gang_node_t **gnpp) { zio_gang_node_t *gn; ASSERT(*gnpp == NULL); gn = kmem_zalloc(sizeof (*gn), KM_SLEEP); gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE); *gnpp = gn; return (gn); } static void zio_gang_node_free(zio_gang_node_t **gnpp) { zio_gang_node_t *gn = *gnpp; for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) ASSERT(gn->gn_child[g] == NULL); zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE); kmem_free(gn, sizeof (*gn)); *gnpp = NULL; } static void zio_gang_tree_free(zio_gang_node_t **gnpp) { zio_gang_node_t *gn = *gnpp; if (gn == NULL) return; for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) zio_gang_tree_free(&gn->gn_child[g]); zio_gang_node_free(gnpp); } static void zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp) { zio_gang_node_t *gn = zio_gang_node_alloc(gnpp); ASSERT(gio->io_gang_leader == gio); ASSERT(BP_IS_GANG(bp)); zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh, SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn, gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark)); } static void zio_gang_tree_assemble_done(zio_t *zio) { zio_t *gio = zio->io_gang_leader; zio_gang_node_t *gn = zio->io_private; blkptr_t *bp = zio->io_bp; ASSERT(gio == zio_unique_parent(zio)); ASSERT(zio->io_child_count == 0); if (zio->io_error) return; if (BP_SHOULD_BYTESWAP(bp)) byteswap_uint64_array(zio->io_data, zio->io_size); ASSERT(zio->io_data == gn->gn_gbh); ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; if (!BP_IS_GANG(gbp)) continue; zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]); } } static void zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data) { zio_t *gio = pio->io_gang_leader; zio_t *zio; ASSERT(BP_IS_GANG(bp) == !!gn); ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp)); ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree); /* * If you're a gang header, your data is in gn->gn_gbh. * If you're a gang member, your data is in 'data' and gn == NULL. */ zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data); if (gn != NULL) { ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; if (BP_IS_HOLE(gbp)) continue; zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data); data = (char *)data + BP_GET_PSIZE(gbp); } } if (gn == gio->io_gang_tree && gio->io_data != NULL) ASSERT3P((char *)gio->io_data + gio->io_size, ==, data); if (zio != pio) zio_nowait(zio); } static int zio_gang_assemble(zio_t *zio) { blkptr_t *bp = zio->io_bp; ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL); ASSERT(zio->io_child_type > ZIO_CHILD_GANG); zio->io_gang_leader = zio; zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree); return (ZIO_PIPELINE_CONTINUE); } static int zio_gang_issue(zio_t *zio) { blkptr_t *bp = zio->io_bp; if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE)) return (ZIO_PIPELINE_STOP); ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio); ASSERT(zio->io_child_type > ZIO_CHILD_GANG); if (zio->io_child_error[ZIO_CHILD_GANG] == 0) zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data); else zio_gang_tree_free(&zio->io_gang_tree); zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; return (ZIO_PIPELINE_CONTINUE); } static void zio_write_gang_member_ready(zio_t *zio) { zio_t *pio = zio_unique_parent(zio); zio_t *gio = zio->io_gang_leader; dva_t *cdva = zio->io_bp->blk_dva; dva_t *pdva = pio->io_bp->blk_dva; uint64_t asize; if (BP_IS_HOLE(zio->io_bp)) return; ASSERT(BP_IS_HOLE(&zio->io_bp_orig)); ASSERT(zio->io_child_type == ZIO_CHILD_GANG); ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies); ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp)); ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp)); ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp)); mutex_enter(&pio->io_lock); for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) { ASSERT(DVA_GET_GANG(&pdva[d])); asize = DVA_GET_ASIZE(&pdva[d]); asize += DVA_GET_ASIZE(&cdva[d]); DVA_SET_ASIZE(&pdva[d], asize); } mutex_exit(&pio->io_lock); } static int zio_write_gang_block(zio_t *pio) { spa_t *spa = pio->io_spa; blkptr_t *bp = pio->io_bp; zio_t *gio = pio->io_gang_leader; zio_t *zio; zio_gang_node_t *gn, **gnpp; zio_gbh_phys_t *gbh; uint64_t txg = pio->io_txg; uint64_t resid = pio->io_size; uint64_t lsize; int copies = gio->io_prop.zp_copies; int gbh_copies = MIN(copies + 1, spa_max_replication(spa)); zio_prop_t zp; int error; error = metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE, bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER); if (error) { pio->io_error = error; return (ZIO_PIPELINE_CONTINUE); } if (pio == gio) { gnpp = &gio->io_gang_tree; } else { gnpp = pio->io_private; ASSERT(pio->io_ready == zio_write_gang_member_ready); } gn = zio_gang_node_alloc(gnpp); gbh = gn->gn_gbh; bzero(gbh, SPA_GANGBLOCKSIZE); /* * Create the gang header. */ zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); /* * Create and nowait the gang children. */ for (int g = 0; resid != 0; resid -= lsize, g++) { lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g), SPA_MINBLOCKSIZE); ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid); zp.zp_checksum = gio->io_prop.zp_checksum; zp.zp_compress = ZIO_COMPRESS_OFF; zp.zp_type = DMU_OT_NONE; zp.zp_level = 0; zp.zp_copies = gio->io_prop.zp_copies; zp.zp_dedup = B_FALSE; zp.zp_dedup_verify = B_FALSE; zp.zp_nopwrite = B_FALSE; zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g], (char *)pio->io_data + (pio->io_size - resid), lsize, &zp, zio_write_gang_member_ready, NULL, NULL, &gn->gn_child[g], pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark)); } /* * Set pio's pipeline to just wait for zio to finish. */ pio->io_pipeline = ZIO_INTERLOCK_PIPELINE; zio_nowait(zio); return (ZIO_PIPELINE_CONTINUE); } /* * The zio_nop_write stage in the pipeline determines if allocating a * new bp is necessary. The nopwrite feature can handle writes in * either syncing or open context (i.e. zil writes) and as a result is * mutually exclusive with dedup. * * By leveraging a cryptographically secure checksum, such as SHA256, we * can compare the checksums of the new data and the old to determine if * allocating a new block is required. Note that our requirements for * cryptographic strength are fairly weak: there can't be any accidental * hash collisions, but we don't need to be secure against intentional * (malicious) collisions. To trigger a nopwrite, you have to be able * to write the file to begin with, and triggering an incorrect (hash * collision) nopwrite is no worse than simply writing to the file. * That said, there are no known attacks against the checksum algorithms * used for nopwrite, assuming that the salt and the checksums * themselves remain secret. */ static int zio_nop_write(zio_t *zio) { blkptr_t *bp = zio->io_bp; blkptr_t *bp_orig = &zio->io_bp_orig; zio_prop_t *zp = &zio->io_prop; ASSERT(BP_GET_LEVEL(bp) == 0); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); ASSERT(zp->zp_nopwrite); ASSERT(!zp->zp_dedup); ASSERT(zio->io_bp_override == NULL); ASSERT(IO_IS_ALLOCATING(zio)); /* * Check to see if the original bp and the new bp have matching * characteristics (i.e. same checksum, compression algorithms, etc). * If they don't then just continue with the pipeline which will * allocate a new bp. */ if (BP_IS_HOLE(bp_orig) || !(zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_flags & ZCHECKSUM_FLAG_NOPWRITE) || BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) || BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) || BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) || zp->zp_copies != BP_GET_NDVAS(bp_orig)) return (ZIO_PIPELINE_CONTINUE); /* * If the checksums match then reset the pipeline so that we * avoid allocating a new bp and issuing any I/O. */ if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) { ASSERT(zio_checksum_table[zp->zp_checksum].ci_flags & ZCHECKSUM_FLAG_NOPWRITE); ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig)); ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig)); ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF); ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop, sizeof (uint64_t)) == 0); *bp = *bp_orig; zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; zio->io_flags |= ZIO_FLAG_NOPWRITE; } return (ZIO_PIPELINE_CONTINUE); } /* * ========================================================================== * Dedup * ========================================================================== */ static void zio_ddt_child_read_done(zio_t *zio) { blkptr_t *bp = zio->io_bp; ddt_entry_t *dde = zio->io_private; ddt_phys_t *ddp; zio_t *pio = zio_unique_parent(zio); mutex_enter(&pio->io_lock); ddp = ddt_phys_select(dde, bp); if (zio->io_error == 0) ddt_phys_clear(ddp); /* this ddp doesn't need repair */ if (zio->io_error == 0 && dde->dde_repair_data == NULL) dde->dde_repair_data = zio->io_data; else zio_buf_free(zio->io_data, zio->io_size); mutex_exit(&pio->io_lock); } static int zio_ddt_read_start(zio_t *zio) { blkptr_t *bp = zio->io_bp; ASSERT(BP_GET_DEDUP(bp)); ASSERT(BP_GET_PSIZE(bp) == zio->io_size); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); if (zio->io_child_error[ZIO_CHILD_DDT]) { ddt_t *ddt = ddt_select(zio->io_spa, bp); ddt_entry_t *dde = ddt_repair_start(ddt, bp); ddt_phys_t *ddp = dde->dde_phys; ddt_phys_t *ddp_self = ddt_phys_select(dde, bp); blkptr_t blk; ASSERT(zio->io_vsd == NULL); zio->io_vsd = dde; if (ddp_self == NULL) return (ZIO_PIPELINE_CONTINUE); for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { if (ddp->ddp_phys_birth == 0 || ddp == ddp_self) continue; ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp, &blk); zio_nowait(zio_read(zio, zio->io_spa, &blk, zio_buf_alloc(zio->io_size), zio->io_size, zio_ddt_child_read_done, dde, zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE, &zio->io_bookmark)); } return (ZIO_PIPELINE_CONTINUE); } zio_nowait(zio_read(zio, zio->io_spa, bp, zio->io_data, zio->io_size, NULL, NULL, zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark)); return (ZIO_PIPELINE_CONTINUE); } static int zio_ddt_read_done(zio_t *zio) { blkptr_t *bp = zio->io_bp; if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE)) return (ZIO_PIPELINE_STOP); ASSERT(BP_GET_DEDUP(bp)); ASSERT(BP_GET_PSIZE(bp) == zio->io_size); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); if (zio->io_child_error[ZIO_CHILD_DDT]) { ddt_t *ddt = ddt_select(zio->io_spa, bp); ddt_entry_t *dde = zio->io_vsd; if (ddt == NULL) { ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE); return (ZIO_PIPELINE_CONTINUE); } if (dde == NULL) { zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1; zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); return (ZIO_PIPELINE_STOP); } if (dde->dde_repair_data != NULL) { bcopy(dde->dde_repair_data, zio->io_data, zio->io_size); zio->io_child_error[ZIO_CHILD_DDT] = 0; } ddt_repair_done(ddt, dde); zio->io_vsd = NULL; } ASSERT(zio->io_vsd == NULL); return (ZIO_PIPELINE_CONTINUE); } static boolean_t zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) { spa_t *spa = zio->io_spa; /* * Note: we compare the original data, not the transformed data, * because when zio->io_bp is an override bp, we will not have * pushed the I/O transforms. That's an important optimization * because otherwise we'd compress/encrypt all dmu_sync() data twice. */ for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { zio_t *lio = dde->dde_lead_zio[p]; if (lio != NULL) { return (lio->io_orig_size != zio->io_orig_size || bcmp(zio->io_orig_data, lio->io_orig_data, zio->io_orig_size) != 0); } } for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { ddt_phys_t *ddp = &dde->dde_phys[p]; if (ddp->ddp_phys_birth != 0) { arc_buf_t *abuf = NULL; arc_flags_t aflags = ARC_FLAG_WAIT; blkptr_t blk = *zio->io_bp; int error; ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth); ddt_exit(ddt); error = arc_read(NULL, spa, &blk, arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &aflags, &zio->io_bookmark); if (error == 0) { if (arc_buf_size(abuf) != zio->io_orig_size || bcmp(abuf->b_data, zio->io_orig_data, zio->io_orig_size) != 0) error = SET_ERROR(EEXIST); VERIFY(arc_buf_remove_ref(abuf, &abuf)); } ddt_enter(ddt); return (error != 0); } } return (B_FALSE); } static void zio_ddt_child_write_ready(zio_t *zio) { int p = zio->io_prop.zp_copies; ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); ddt_entry_t *dde = zio->io_private; ddt_phys_t *ddp = &dde->dde_phys[p]; zio_t *pio; if (zio->io_error) return; ddt_enter(ddt); ASSERT(dde->dde_lead_zio[p] == zio); ddt_phys_fill(ddp, zio->io_bp); while ((pio = zio_walk_parents(zio)) != NULL) ddt_bp_fill(ddp, pio->io_bp, zio->io_txg); ddt_exit(ddt); } static void zio_ddt_child_write_done(zio_t *zio) { int p = zio->io_prop.zp_copies; ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); ddt_entry_t *dde = zio->io_private; ddt_phys_t *ddp = &dde->dde_phys[p]; ddt_enter(ddt); ASSERT(ddp->ddp_refcnt == 0); ASSERT(dde->dde_lead_zio[p] == zio); dde->dde_lead_zio[p] = NULL; if (zio->io_error == 0) { while (zio_walk_parents(zio) != NULL) ddt_phys_addref(ddp); } else { ddt_phys_clear(ddp); } ddt_exit(ddt); } static void zio_ddt_ditto_write_done(zio_t *zio) { int p = DDT_PHYS_DITTO; zio_prop_t *zp = &zio->io_prop; blkptr_t *bp = zio->io_bp; ddt_t *ddt = ddt_select(zio->io_spa, bp); ddt_entry_t *dde = zio->io_private; ddt_phys_t *ddp = &dde->dde_phys[p]; ddt_key_t *ddk = &dde->dde_key; ddt_enter(ddt); ASSERT(ddp->ddp_refcnt == 0); ASSERT(dde->dde_lead_zio[p] == zio); dde->dde_lead_zio[p] = NULL; if (zio->io_error == 0) { ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum)); ASSERT(zp->zp_copies < SPA_DVAS_PER_BP); ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp)); if (ddp->ddp_phys_birth != 0) ddt_phys_free(ddt, ddk, ddp, zio->io_txg); ddt_phys_fill(ddp, bp); } ddt_exit(ddt); } static int zio_ddt_write(zio_t *zio) { spa_t *spa = zio->io_spa; blkptr_t *bp = zio->io_bp; uint64_t txg = zio->io_txg; zio_prop_t *zp = &zio->io_prop; int p = zp->zp_copies; int ditto_copies; zio_t *cio = NULL; zio_t *dio = NULL; ddt_t *ddt = ddt_select(spa, bp); ddt_entry_t *dde; ddt_phys_t *ddp; ASSERT(BP_GET_DEDUP(bp)); ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum); ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override); ddt_enter(ddt); dde = ddt_lookup(ddt, bp, B_TRUE); ddp = &dde->dde_phys[p]; if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) { /* * If we're using a weak checksum, upgrade to a strong checksum * and try again. If we're already using a strong checksum, * we can't resolve it, so just convert to an ordinary write. * (And automatically e-mail a paper to Nature?) */ if (!(zio_checksum_table[zp->zp_checksum].ci_flags & ZCHECKSUM_FLAG_DEDUP)) { zp->zp_checksum = spa_dedup_checksum(spa); zio_pop_transforms(zio); zio->io_stage = ZIO_STAGE_OPEN; BP_ZERO(bp); } else { zp->zp_dedup = B_FALSE; } zio->io_pipeline = ZIO_WRITE_PIPELINE; ddt_exit(ddt); return (ZIO_PIPELINE_CONTINUE); } ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp); ASSERT(ditto_copies < SPA_DVAS_PER_BP); if (ditto_copies > ddt_ditto_copies_present(dde) && dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) { zio_prop_t czp = *zp; czp.zp_copies = ditto_copies; /* * If we arrived here with an override bp, we won't have run * the transform stack, so we won't have the data we need to * generate a child i/o. So, toss the override bp and restart. * This is safe, because using the override bp is just an * optimization; and it's rare, so the cost doesn't matter. */ if (zio->io_bp_override) { zio_pop_transforms(zio); zio->io_stage = ZIO_STAGE_OPEN; zio->io_pipeline = ZIO_WRITE_PIPELINE; zio->io_bp_override = NULL; BP_ZERO(bp); ddt_exit(ddt); return (ZIO_PIPELINE_CONTINUE); } dio = zio_write(zio, spa, txg, bp, zio->io_orig_data, zio->io_orig_size, &czp, NULL, NULL, zio_ddt_ditto_write_done, dde, zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL); dde->dde_lead_zio[DDT_PHYS_DITTO] = dio; } if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) { if (ddp->ddp_phys_birth != 0) ddt_bp_fill(ddp, bp, txg); if (dde->dde_lead_zio[p] != NULL) zio_add_child(zio, dde->dde_lead_zio[p]); else ddt_phys_addref(ddp); } else if (zio->io_bp_override) { ASSERT(bp->blk_birth == txg); ASSERT(BP_EQUAL(bp, zio->io_bp_override)); ddt_phys_fill(ddp, bp); ddt_phys_addref(ddp); } else { cio = zio_write(zio, spa, txg, bp, zio->io_orig_data, zio->io_orig_size, zp, zio_ddt_child_write_ready, NULL, zio_ddt_child_write_done, dde, zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL); dde->dde_lead_zio[p] = cio; } ddt_exit(ddt); if (cio) zio_nowait(cio); if (dio) zio_nowait(dio); return (ZIO_PIPELINE_CONTINUE); } ddt_entry_t *freedde; /* for debugging */ static int zio_ddt_free(zio_t *zio) { spa_t *spa = zio->io_spa; blkptr_t *bp = zio->io_bp; ddt_t *ddt = ddt_select(spa, bp); ddt_entry_t *dde; ddt_phys_t *ddp; ASSERT(BP_GET_DEDUP(bp)); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ddt_enter(ddt); freedde = dde = ddt_lookup(ddt, bp, B_TRUE); ddp = ddt_phys_select(dde, bp); ddt_phys_decref(ddp); ddt_exit(ddt); return (ZIO_PIPELINE_CONTINUE); } /* * ========================================================================== * Allocate and free blocks * ========================================================================== */ static int zio_dva_allocate(zio_t *zio) { spa_t *spa = zio->io_spa; metaslab_class_t *mc = spa_normal_class(spa); blkptr_t *bp = zio->io_bp; int error; int flags = 0; if (zio->io_gang_leader == NULL) { ASSERT(zio->io_child_type > ZIO_CHILD_GANG); zio->io_gang_leader = zio; } ASSERT(BP_IS_HOLE(bp)); ASSERT0(BP_GET_NDVAS(bp)); ASSERT3U(zio->io_prop.zp_copies, >, 0); ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa)); ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); /* * The dump device does not support gang blocks so allocation on * behalf of the dump device (i.e. ZIO_FLAG_NODATA) must avoid * the "fast" gang feature. */ flags |= (zio->io_flags & ZIO_FLAG_NODATA) ? METASLAB_GANG_AVOID : 0; flags |= (zio->io_flags & ZIO_FLAG_GANG_CHILD) ? METASLAB_GANG_CHILD : 0; error = metaslab_alloc(spa, mc, zio->io_size, bp, zio->io_prop.zp_copies, zio->io_txg, NULL, flags); if (error) { spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, " "size %llu, error %d", spa_name(spa), zio, zio->io_size, error); if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) return (zio_write_gang_block(zio)); zio->io_error = error; } return (ZIO_PIPELINE_CONTINUE); } static int zio_dva_free(zio_t *zio) { metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE); return (ZIO_PIPELINE_CONTINUE); } static int zio_dva_claim(zio_t *zio) { int error; error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg); if (error) zio->io_error = error; return (ZIO_PIPELINE_CONTINUE); } /* * Undo an allocation. This is used by zio_done() when an I/O fails * and we want to give back the block we just allocated. * This handles both normal blocks and gang blocks. */ static void zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp) { ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp)); ASSERT(zio->io_bp_override == NULL); if (!BP_IS_HOLE(bp)) metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE); if (gn != NULL) { for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { zio_dva_unallocate(zio, gn->gn_child[g], &gn->gn_gbh->zg_blkptr[g]); } } } /* * Try to allocate an intent log block. Return 0 on success, errno on failure. */ int zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp, uint64_t size, boolean_t use_slog) { int error = 1; ASSERT(txg > spa_syncing_txg(spa)); /* * ZIL blocks are always contiguous (i.e. not gang blocks) so we * set the METASLAB_GANG_AVOID flag so that they don't "fast gang" * when allocating them. */ if (use_slog) { error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID | METASLAB_GANG_AVOID); } if (error) { error = metaslab_alloc(spa, spa_normal_class(spa), size, new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID); } if (error == 0) { BP_SET_LSIZE(new_bp, size); BP_SET_PSIZE(new_bp, size); BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF); BP_SET_CHECKSUM(new_bp, spa_version(spa) >= SPA_VERSION_SLIM_ZIL ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG); BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); BP_SET_LEVEL(new_bp, 0); BP_SET_DEDUP(new_bp, 0); BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER); } return (error); } /* * Free an intent log block. */ void zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp) { ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG); ASSERT(!BP_IS_GANG(bp)); zio_free(spa, txg, bp); } /* * ========================================================================== * Read, write and delete to physical devices * ========================================================================== */ /* * Issue an I/O to the underlying vdev. Typically the issue pipeline * stops after this stage and will resume upon I/O completion. * However, there are instances where the vdev layer may need to * continue the pipeline when an I/O was not issued. Since the I/O * that was sent to the vdev layer might be different than the one * currently active in the pipeline (see vdev_queue_io()), we explicitly * force the underlying vdev layers to call either zio_execute() or * zio_interrupt() to ensure that the pipeline continues with the correct I/O. */ static int zio_vdev_io_start(zio_t *zio) { vdev_t *vd = zio->io_vd; uint64_t align; spa_t *spa = zio->io_spa; int ret; ASSERT(zio->io_error == 0); ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0); if (vd == NULL) { if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) spa_config_enter(spa, SCL_ZIO, zio, RW_READER); /* * The mirror_ops handle multiple DVAs in a single BP. */ vdev_mirror_ops.vdev_op_io_start(zio); return (ZIO_PIPELINE_STOP); } if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_FREE && zio->io_priority == ZIO_PRIORITY_NOW) { trim_map_free(vd, zio->io_offset, zio->io_size, zio->io_txg); return (ZIO_PIPELINE_CONTINUE); } /* * We keep track of time-sensitive I/Os so that the scan thread * can quickly react to certain workloads. In particular, we care * about non-scrubbing, top-level reads and writes with the following * characteristics: * - synchronous writes of user data to non-slog devices * - any reads of user data * When these conditions are met, adjust the timestamp of spa_last_io * which allows the scan thread to adjust its workload accordingly. */ if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL && vd == vd->vdev_top && !vd->vdev_islog && zio->io_bookmark.zb_objset != DMU_META_OBJSET && zio->io_txg != spa_syncing_txg(spa)) { uint64_t old = spa->spa_last_io; uint64_t new = ddi_get_lbolt64(); if (old != new) (void) atomic_cas_64(&spa->spa_last_io, old, new); } align = 1ULL << vd->vdev_top->vdev_ashift; if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && P2PHASE(zio->io_size, align) != 0) { /* Transform logical writes to be a full physical block size. */ uint64_t asize = P2ROUNDUP(zio->io_size, align); char *abuf = NULL; if (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE) abuf = zio_buf_alloc(asize); ASSERT(vd == vd->vdev_top); if (zio->io_type == ZIO_TYPE_WRITE) { bcopy(zio->io_data, abuf, zio->io_size); bzero(abuf + zio->io_size, asize - zio->io_size); } zio_push_transform(zio, abuf, asize, abuf ? asize : 0, zio_subblock); } /* * If this is not a physical io, make sure that it is properly aligned * before proceeding. */ if (!(zio->io_flags & ZIO_FLAG_PHYSICAL)) { ASSERT0(P2PHASE(zio->io_offset, align)); ASSERT0(P2PHASE(zio->io_size, align)); } else { /* * For physical writes, we allow 512b aligned writes and assume * the device will perform a read-modify-write as necessary. */ ASSERT0(P2PHASE(zio->io_offset, SPA_MINBLOCKSIZE)); ASSERT0(P2PHASE(zio->io_size, SPA_MINBLOCKSIZE)); } VERIFY(zio->io_type == ZIO_TYPE_READ || spa_writeable(spa)); /* * If this is a repair I/O, and there's no self-healing involved -- * that is, we're just resilvering what we expect to resilver -- * then don't do the I/O unless zio's txg is actually in vd's DTL. * This prevents spurious resilvering with nested replication. * For example, given a mirror of mirrors, (A+B)+(C+D), if only * A is out of date, we'll read from C+D, then use the data to * resilver A+B -- but we don't actually want to resilver B, just A. * The top-level mirror has no way to know this, so instead we just * discard unnecessary repairs as we work our way down the vdev tree. * The same logic applies to any form of nested replication: * ditto + mirror, RAID-Z + replacing, etc. This covers them all. */ if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) && !(zio->io_flags & ZIO_FLAG_SELF_HEAL) && zio->io_txg != 0 && /* not a delegated i/o */ !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) { ASSERT(zio->io_type == ZIO_TYPE_WRITE); zio_vdev_io_bypass(zio); return (ZIO_PIPELINE_CONTINUE); } if (vd->vdev_ops->vdev_op_leaf) { switch (zio->io_type) { case ZIO_TYPE_READ: if (vdev_cache_read(zio)) return (ZIO_PIPELINE_CONTINUE); /* FALLTHROUGH */ case ZIO_TYPE_WRITE: case ZIO_TYPE_FREE: if ((zio = vdev_queue_io(zio)) == NULL) return (ZIO_PIPELINE_STOP); if (!vdev_accessible(vd, zio)) { zio->io_error = SET_ERROR(ENXIO); zio_interrupt(zio); return (ZIO_PIPELINE_STOP); } break; } /* * Note that we ignore repair writes for TRIM because they can * conflict with normal writes. This isn't an issue because, by * definition, we only repair blocks that aren't freed. */ if (zio->io_type == ZIO_TYPE_WRITE && !(zio->io_flags & ZIO_FLAG_IO_REPAIR) && !trim_map_write_start(zio)) return (ZIO_PIPELINE_STOP); } vd->vdev_ops->vdev_op_io_start(zio); return (ZIO_PIPELINE_STOP); } static int zio_vdev_io_done(zio_t *zio) { vdev_t *vd = zio->io_vd; vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops; boolean_t unexpected_error = B_FALSE; if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) return (ZIO_PIPELINE_STOP); ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_FREE); if (vd != NULL && vd->vdev_ops->vdev_op_leaf && (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_FREE)) { if (zio->io_type == ZIO_TYPE_WRITE && !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) trim_map_write_done(zio); vdev_queue_io_done(zio); if (zio->io_type == ZIO_TYPE_WRITE) vdev_cache_write(zio); if (zio_injection_enabled && zio->io_error == 0) zio->io_error = zio_handle_device_injection(vd, zio, EIO); if (zio_injection_enabled && zio->io_error == 0) zio->io_error = zio_handle_label_injection(zio, EIO); if (zio->io_error) { if (zio->io_error == ENOTSUP && zio->io_type == ZIO_TYPE_FREE) { /* Not all devices support TRIM. */ } else if (!vdev_accessible(vd, zio)) { zio->io_error = SET_ERROR(ENXIO); } else { unexpected_error = B_TRUE; } } } ops->vdev_op_io_done(zio); if (unexpected_error) VERIFY(vdev_probe(vd, zio) == NULL); return (ZIO_PIPELINE_CONTINUE); } /* * For non-raidz ZIOs, we can just copy aside the bad data read from the * disk, and use that to finish the checksum ereport later. */ static void zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr, const void *good_buf) { /* no processing needed */ zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE); } /*ARGSUSED*/ void zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored) { void *buf = zio_buf_alloc(zio->io_size); bcopy(zio->io_data, buf, zio->io_size); zcr->zcr_cbinfo = zio->io_size; zcr->zcr_cbdata = buf; zcr->zcr_finish = zio_vsd_default_cksum_finish; zcr->zcr_free = zio_buf_free; } static int zio_vdev_io_assess(zio_t *zio) { vdev_t *vd = zio->io_vd; if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) return (ZIO_PIPELINE_STOP); if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) spa_config_exit(zio->io_spa, SCL_ZIO, zio); if (zio->io_vsd != NULL) { zio->io_vsd_ops->vsd_free(zio); zio->io_vsd = NULL; } if (zio_injection_enabled && zio->io_error == 0) zio->io_error = zio_handle_fault_injection(zio, EIO); if (zio->io_type == ZIO_TYPE_FREE && zio->io_priority != ZIO_PRIORITY_NOW) { switch (zio->io_error) { case 0: ZIO_TRIM_STAT_INCR(bytes, zio->io_size); ZIO_TRIM_STAT_BUMP(success); break; case EOPNOTSUPP: ZIO_TRIM_STAT_BUMP(unsupported); break; default: ZIO_TRIM_STAT_BUMP(failed); break; } } /* * If the I/O failed, determine whether we should attempt to retry it. * * On retry, we cut in line in the issue queue, since we don't want * compression/checksumming/etc. work to prevent our (cheap) IO reissue. */ if (zio->io_error && vd == NULL && !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) { ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */ ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf */ zio->io_error = 0; zio->io_flags |= ZIO_FLAG_IO_RETRY | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE; zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1; zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, zio_requeue_io_start_cut_in_line); return (ZIO_PIPELINE_STOP); } /* * If we got an error on a leaf device, convert it to ENXIO * if the device is not accessible at all. */ if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf && !vdev_accessible(vd, zio)) zio->io_error = SET_ERROR(ENXIO); /* * If we can't write to an interior vdev (mirror or RAID-Z), * set vdev_cant_write so that we stop trying to allocate from it. */ if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE && vd != NULL && !vd->vdev_ops->vdev_op_leaf) { vd->vdev_cant_write = B_TRUE; } if (zio->io_error) zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; if (vd != NULL && vd->vdev_ops->vdev_op_leaf && zio->io_physdone != NULL) { ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED)); ASSERT(zio->io_child_type == ZIO_CHILD_VDEV); zio->io_physdone(zio->io_logical); } return (ZIO_PIPELINE_CONTINUE); } void zio_vdev_io_reissue(zio_t *zio) { ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); ASSERT(zio->io_error == 0); zio->io_stage >>= 1; } void zio_vdev_io_redone(zio_t *zio) { ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE); zio->io_stage >>= 1; } void zio_vdev_io_bypass(zio_t *zio) { ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); ASSERT(zio->io_error == 0); zio->io_flags |= ZIO_FLAG_IO_BYPASS; zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1; } /* * ========================================================================== * Generate and verify checksums * ========================================================================== */ static int zio_checksum_generate(zio_t *zio) { blkptr_t *bp = zio->io_bp; enum zio_checksum checksum; if (bp == NULL) { /* * This is zio_write_phys(). * We're either generating a label checksum, or none at all. */ checksum = zio->io_prop.zp_checksum; if (checksum == ZIO_CHECKSUM_OFF) return (ZIO_PIPELINE_CONTINUE); ASSERT(checksum == ZIO_CHECKSUM_LABEL); } else { if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) { ASSERT(!IO_IS_ALLOCATING(zio)); checksum = ZIO_CHECKSUM_GANG_HEADER; } else { checksum = BP_GET_CHECKSUM(bp); } } zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size); return (ZIO_PIPELINE_CONTINUE); } static int zio_checksum_verify(zio_t *zio) { zio_bad_cksum_t info; blkptr_t *bp = zio->io_bp; int error; ASSERT(zio->io_vd != NULL); if (bp == NULL) { /* * This is zio_read_phys(). * We're either verifying a label checksum, or nothing at all. */ if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF) return (ZIO_PIPELINE_CONTINUE); ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL); } if ((error = zio_checksum_error(zio, &info)) != 0) { zio->io_error = error; if (error == ECKSUM && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { zfs_ereport_start_checksum(zio->io_spa, zio->io_vd, zio, zio->io_offset, zio->io_size, NULL, &info); } } return (ZIO_PIPELINE_CONTINUE); } /* * Called by RAID-Z to ensure we don't compute the checksum twice. */ void zio_checksum_verified(zio_t *zio) { zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; } /* * ========================================================================== * Error rank. Error are ranked in the order 0, ENXIO, ECKSUM, EIO, other. * An error of 0 indicates success. ENXIO indicates whole-device failure, * which may be transient (e.g. unplugged) or permament. ECKSUM and EIO * indicate errors that are specific to one I/O, and most likely permanent. * Any other error is presumed to be worse because we weren't expecting it. * ========================================================================== */ int zio_worst_error(int e1, int e2) { static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO }; int r1, r2; for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++) if (e1 == zio_error_rank[r1]) break; for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++) if (e2 == zio_error_rank[r2]) break; return (r1 > r2 ? e1 : e2); } /* * ========================================================================== * I/O completion * ========================================================================== */ static int zio_ready(zio_t *zio) { blkptr_t *bp = zio->io_bp; zio_t *pio, *pio_next; if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) || zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY)) return (ZIO_PIPELINE_STOP); if (zio->io_ready) { ASSERT(IO_IS_ALLOCATING(zio)); ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) || (zio->io_flags & ZIO_FLAG_NOPWRITE)); ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0); zio->io_ready(zio); } if (bp != NULL && bp != &zio->io_bp_copy) zio->io_bp_copy = *bp; if (zio->io_error) zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; mutex_enter(&zio->io_lock); zio->io_state[ZIO_WAIT_READY] = 1; pio = zio_walk_parents(zio); mutex_exit(&zio->io_lock); /* * As we notify zio's parents, new parents could be added. * New parents go to the head of zio's io_parent_list, however, * so we will (correctly) not notify them. The remainder of zio's * io_parent_list, from 'pio_next' onward, cannot change because * all parents must wait for us to be done before they can be done. */ for (; pio != NULL; pio = pio_next) { pio_next = zio_walk_parents(zio); zio_notify_parent(pio, zio, ZIO_WAIT_READY); } if (zio->io_flags & ZIO_FLAG_NODATA) { if (BP_IS_GANG(bp)) { zio->io_flags &= ~ZIO_FLAG_NODATA; } else { ASSERT((uintptr_t)zio->io_data < SPA_MAXBLOCKSIZE); zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; } } if (zio_injection_enabled && zio->io_spa->spa_syncing_txg == zio->io_txg) zio_handle_ignored_writes(zio); return (ZIO_PIPELINE_CONTINUE); } static int zio_done(zio_t *zio) { spa_t *spa = zio->io_spa; zio_t *lio = zio->io_logical; blkptr_t *bp = zio->io_bp; vdev_t *vd = zio->io_vd; uint64_t psize = zio->io_size; zio_t *pio, *pio_next; /* * If our children haven't all completed, * wait for them and then repeat this pipeline stage. */ if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) || zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) || zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) || zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE)) return (ZIO_PIPELINE_STOP); for (int c = 0; c < ZIO_CHILD_TYPES; c++) for (int w = 0; w < ZIO_WAIT_TYPES; w++) ASSERT(zio->io_children[c][w] == 0); if (bp != NULL && !BP_IS_EMBEDDED(bp)) { ASSERT(bp->blk_pad[0] == 0); ASSERT(bp->blk_pad[1] == 0); ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 || (bp == zio_unique_parent(zio)->io_bp)); if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) && zio->io_bp_override == NULL && !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { ASSERT(!BP_SHOULD_BYTESWAP(bp)); ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(bp)); ASSERT(BP_COUNT_GANG(bp) == 0 || (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp))); } if (zio->io_flags & ZIO_FLAG_NOPWRITE) VERIFY(BP_EQUAL(bp, &zio->io_bp_orig)); } /* * If there were child vdev/gang/ddt errors, they apply to us now. */ zio_inherit_child_errors(zio, ZIO_CHILD_VDEV); zio_inherit_child_errors(zio, ZIO_CHILD_GANG); zio_inherit_child_errors(zio, ZIO_CHILD_DDT); /* * If the I/O on the transformed data was successful, generate any * checksum reports now while we still have the transformed data. */ if (zio->io_error == 0) { while (zio->io_cksum_report != NULL) { zio_cksum_report_t *zcr = zio->io_cksum_report; uint64_t align = zcr->zcr_align; uint64_t asize = P2ROUNDUP(psize, align); char *abuf = zio->io_data; if (asize != psize) { abuf = zio_buf_alloc(asize); bcopy(zio->io_data, abuf, psize); bzero(abuf + psize, asize - psize); } zio->io_cksum_report = zcr->zcr_next; zcr->zcr_next = NULL; zcr->zcr_finish(zcr, abuf); zfs_ereport_free_checksum(zcr); if (asize != psize) zio_buf_free(abuf, asize); } } zio_pop_transforms(zio); /* note: may set zio->io_error */ vdev_stat_update(zio, psize); if (zio->io_error) { /* * If this I/O is attached to a particular vdev, * generate an error message describing the I/O failure * at the block level. We ignore these errors if the * device is currently unavailable. */ if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd)) zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0); if ((zio->io_error == EIO || !(zio->io_flags & (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) && zio == lio) { /* * For logical I/O requests, tell the SPA to log the * error and generate a logical data ereport. */ spa_log_error(spa, zio); zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio, 0, 0); } } if (zio->io_error && zio == lio) { /* * Determine whether zio should be reexecuted. This will * propagate all the way to the root via zio_notify_parent(). */ ASSERT(vd == NULL && bp != NULL); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); if (IO_IS_ALLOCATING(zio) && !(zio->io_flags & ZIO_FLAG_CANFAIL)) { if (zio->io_error != ENOSPC) zio->io_reexecute |= ZIO_REEXECUTE_NOW; else zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; } if ((zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_FREE) && !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_error == ENXIO && spa_load_state(spa) == SPA_LOAD_NONE && spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE) zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute) zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; /* * Here is a possibly good place to attempt to do * either combinatorial reconstruction or error correction * based on checksums. It also might be a good place * to send out preliminary ereports before we suspend * processing. */ } /* * If there were logical child errors, they apply to us now. * We defer this until now to avoid conflating logical child * errors with errors that happened to the zio itself when * updating vdev stats and reporting FMA events above. */ zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL); if ((zio->io_error || zio->io_reexecute) && IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio && !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE))) zio_dva_unallocate(zio, zio->io_gang_tree, bp); zio_gang_tree_free(&zio->io_gang_tree); /* * Godfather I/Os should never suspend. */ if ((zio->io_flags & ZIO_FLAG_GODFATHER) && (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) zio->io_reexecute = 0; if (zio->io_reexecute) { /* * This is a logical I/O that wants to reexecute. * * Reexecute is top-down. When an i/o fails, if it's not * the root, it simply notifies its parent and sticks around. * The parent, seeing that it still has children in zio_done(), * does the same. This percolates all the way up to the root. * The root i/o will reexecute or suspend the entire tree. * * This approach ensures that zio_reexecute() honors * all the original i/o dependency relationships, e.g. * parents not executing until children are ready. */ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); zio->io_gang_leader = NULL; mutex_enter(&zio->io_lock); zio->io_state[ZIO_WAIT_DONE] = 1; mutex_exit(&zio->io_lock); /* * "The Godfather" I/O monitors its children but is * not a true parent to them. It will track them through * the pipeline but severs its ties whenever they get into * trouble (e.g. suspended). This allows "The Godfather" * I/O to return status without blocking. */ for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) { zio_link_t *zl = zio->io_walk_link; pio_next = zio_walk_parents(zio); if ((pio->io_flags & ZIO_FLAG_GODFATHER) && (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) { zio_remove_child(pio, zio, zl); zio_notify_parent(pio, zio, ZIO_WAIT_DONE); } } if ((pio = zio_unique_parent(zio)) != NULL) { /* * We're not a root i/o, so there's nothing to do * but notify our parent. Don't propagate errors * upward since we haven't permanently failed yet. */ ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE; zio_notify_parent(pio, zio, ZIO_WAIT_DONE); } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) { /* * We'd fail again if we reexecuted now, so suspend * until conditions improve (e.g. device comes online). */ zio_suspend(spa, zio); } else { /* * Reexecution is potentially a huge amount of work. * Hand it off to the otherwise-unused claim taskq. */ #if defined(illumos) || !defined(_KERNEL) ASSERT(zio->io_tqent.tqent_next == NULL); #else ASSERT(zio->io_tqent.tqent_task.ta_pending == 0); #endif spa_taskq_dispatch_ent(spa, ZIO_TYPE_CLAIM, ZIO_TASKQ_ISSUE, (task_func_t *)zio_reexecute, zio, 0, &zio->io_tqent); } return (ZIO_PIPELINE_STOP); } ASSERT(zio->io_child_count == 0); ASSERT(zio->io_reexecute == 0); ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL)); /* * Report any checksum errors, since the I/O is complete. */ while (zio->io_cksum_report != NULL) { zio_cksum_report_t *zcr = zio->io_cksum_report; zio->io_cksum_report = zcr->zcr_next; zcr->zcr_next = NULL; zcr->zcr_finish(zcr, NULL); zfs_ereport_free_checksum(zcr); } /* * It is the responsibility of the done callback to ensure that this * particular zio is no longer discoverable for adoption, and as * such, cannot acquire any new parents. */ if (zio->io_done) zio->io_done(zio); mutex_enter(&zio->io_lock); zio->io_state[ZIO_WAIT_DONE] = 1; mutex_exit(&zio->io_lock); for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) { zio_link_t *zl = zio->io_walk_link; pio_next = zio_walk_parents(zio); zio_remove_child(pio, zio, zl); zio_notify_parent(pio, zio, ZIO_WAIT_DONE); } if (zio->io_waiter != NULL) { mutex_enter(&zio->io_lock); zio->io_executor = NULL; cv_broadcast(&zio->io_cv); mutex_exit(&zio->io_lock); } else { zio_destroy(zio); } return (ZIO_PIPELINE_STOP); } /* * ========================================================================== * I/O pipeline definition * ========================================================================== */ static zio_pipe_stage_t *zio_pipeline[] = { NULL, zio_read_bp_init, zio_free_bp_init, zio_issue_async, zio_write_bp_init, zio_checksum_generate, zio_nop_write, zio_ddt_read_start, zio_ddt_read_done, zio_ddt_write, zio_ddt_free, zio_gang_assemble, zio_gang_issue, zio_dva_allocate, zio_dva_free, zio_dva_claim, zio_ready, zio_vdev_io_start, zio_vdev_io_done, zio_vdev_io_assess, zio_checksum_verify, zio_done }; /* * Compare two zbookmark_phys_t's to see which we would reach first in a * pre-order traversal of the object tree. * * This is simple in every case aside from the meta-dnode object. For all other * objects, we traverse them in order (object 1 before object 2, and so on). * However, all of these objects are traversed while traversing object 0, since * the data it points to is the list of objects. Thus, we need to convert to a * canonical representation so we can compare meta-dnode bookmarks to * non-meta-dnode bookmarks. * * We do this by calculating "equivalents" for each field of the zbookmark. * zbookmarks outside of the meta-dnode use their own object and level, and * calculate the level 0 equivalent (the first L0 blkid that is contained in the * blocks this bookmark refers to) by multiplying their blkid by their span * (the number of L0 blocks contained within one block at their level). * zbookmarks inside the meta-dnode calculate their object equivalent * (which is L0equiv * dnodes per data block), use 0 for their L0equiv, and use * level + 1<<31 (any value larger than a level could ever be) for their level. * This causes them to always compare before a bookmark in their object * equivalent, compare appropriately to bookmarks in other objects, and to * compare appropriately to other bookmarks in the meta-dnode. */ int zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, uint8_t ibs2, const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2) { /* * These variables represent the "equivalent" values for the zbookmark, * after converting zbookmarks inside the meta dnode to their * normal-object equivalents. */ uint64_t zb1obj, zb2obj; uint64_t zb1L0, zb2L0; uint64_t zb1level, zb2level; if (zb1->zb_object == zb2->zb_object && zb1->zb_level == zb2->zb_level && zb1->zb_blkid == zb2->zb_blkid) return (0); /* * BP_SPANB calculates the span in blocks. */ zb1L0 = (zb1->zb_blkid) * BP_SPANB(ibs1, zb1->zb_level); zb2L0 = (zb2->zb_blkid) * BP_SPANB(ibs2, zb2->zb_level); if (zb1->zb_object == DMU_META_DNODE_OBJECT) { zb1obj = zb1L0 * (dbss1 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT)); zb1L0 = 0; zb1level = zb1->zb_level + COMPARE_META_LEVEL; } else { zb1obj = zb1->zb_object; zb1level = zb1->zb_level; } if (zb2->zb_object == DMU_META_DNODE_OBJECT) { zb2obj = zb2L0 * (dbss2 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT)); zb2L0 = 0; zb2level = zb2->zb_level + COMPARE_META_LEVEL; } else { zb2obj = zb2->zb_object; zb2level = zb2->zb_level; } /* Now that we have a canonical representation, do the comparison. */ if (zb1obj != zb2obj) return (zb1obj < zb2obj ? -1 : 1); else if (zb1L0 != zb2L0) return (zb1L0 < zb2L0 ? -1 : 1); else if (zb1level != zb2level) return (zb1level > zb2level ? -1 : 1); /* * This can (theoretically) happen if the bookmarks have the same object * and level, but different blkids, if the block sizes are not the same. * There is presently no way to change the indirect block sizes */ return (0); } /* * This function checks the following: given that last_block is the place that * our traversal stopped last time, does that guarantee that we've visited * every node under subtree_root? Therefore, we can't just use the raw output * of zbookmark_compare. We have to pass in a modified version of * subtree_root; by incrementing the block id, and then checking whether * last_block is before or equal to that, we can tell whether or not having * visited last_block implies that all of subtree_root's children have been * visited. */ boolean_t zbookmark_subtree_completed(const dnode_phys_t *dnp, const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block) { zbookmark_phys_t mod_zb = *subtree_root; mod_zb.zb_blkid++; ASSERT(last_block->zb_level == 0); /* The objset_phys_t isn't before anything. */ if (dnp == NULL) return (B_FALSE); /* * We pass in 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT) for the * data block size in sectors, because that variable is only used if * the bookmark refers to a block in the meta-dnode. Since we don't * know without examining it what object it refers to, and there's no * harm in passing in this value in other cases, we always pass it in. * * We pass in 0 for the indirect block size shift because zb2 must be * level 0. The indirect block size is only used to calculate the span * of the bookmark, but since the bookmark must be level 0, the span is * always 1, so the math works out. * * If you make changes to how the zbookmark_compare code works, be sure * to make sure that this code still works afterwards. */ return (zbookmark_compare(dnp->dn_datablkszsec, dnp->dn_indblkshift, 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT), 0, &mod_zb, last_block) <= 0); } Index: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c =================================================================== --- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c (revision 297107) +++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c (revision 297108) @@ -1,528 +1,755 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. */ /* * ZFS fault injection * * To handle fault injection, we keep track of a series of zinject_record_t * structures which describe which logical block(s) should be injected with a * fault. These are kept in a global list. Each record corresponds to a given * spa_t and maintains a special hold on the spa_t so that it cannot be deleted * or exported while the injection record exists. * * Device level injection is done using the 'zi_guid' field. If this is set, it * means that the error is destined for a particular device, not a piece of * data. * * This is a rather poor data structure and algorithm, but we don't expect more * than a few faults at any one time, so it should be sufficient for our needs. */ #include #include #include #include #include #include uint32_t zio_injection_enabled; +/* + * Data describing each zinject handler registered on the system, and + * contains the list node linking the handler in the global zinject + * handler list. + */ typedef struct inject_handler { int zi_id; spa_t *zi_spa; zinject_record_t zi_record; + uint64_t *zi_lanes; + int zi_next_lane; list_node_t zi_link; } inject_handler_t; +/* + * List of all zinject handlers registered on the system, protected by + * the inject_lock defined below. + */ static list_t inject_handlers; + +/* + * This protects insertion into, and traversal of, the inject handler + * list defined above; as well as the inject_delay_count. Any time a + * handler is inserted or removed from the list, this lock should be + * taken as a RW_WRITER; and any time traversal is done over the list + * (without modification to it) this lock should be taken as a RW_READER. + */ static krwlock_t inject_lock; + +/* + * This holds the number of zinject delay handlers that have been + * registered on the system. It is protected by the inject_lock defined + * above. Thus modifications to this count must be a RW_WRITER of the + * inject_lock, and reads of this count must be (at least) a RW_READER + * of the lock. + */ +static int inject_delay_count = 0; + +/* + * This lock is used only in zio_handle_io_delay(), refer to the comment + * in that function for more details. + */ +static kmutex_t inject_delay_mtx; + +/* + * Used to assign unique identifying numbers to each new zinject handler. + */ static int inject_next_id = 1; /* * Returns true if the given record matches the I/O in progress. */ static boolean_t zio_match_handler(zbookmark_phys_t *zb, uint64_t type, zinject_record_t *record, int error) { /* * Check for a match against the MOS, which is based on type */ if (zb->zb_objset == DMU_META_OBJSET && record->zi_objset == DMU_META_OBJSET && record->zi_object == DMU_META_DNODE_OBJECT) { if (record->zi_type == DMU_OT_NONE || type == record->zi_type) return (record->zi_freq == 0 || spa_get_random(100) < record->zi_freq); else return (B_FALSE); } /* * Check for an exact match. */ if (zb->zb_objset == record->zi_objset && zb->zb_object == record->zi_object && zb->zb_level == record->zi_level && zb->zb_blkid >= record->zi_start && zb->zb_blkid <= record->zi_end && error == record->zi_error) return (record->zi_freq == 0 || spa_get_random(100) < record->zi_freq); return (B_FALSE); } /* * Panic the system when a config change happens in the function * specified by tag. */ void zio_handle_panic_injection(spa_t *spa, char *tag, uint64_t type) { inject_handler_t *handler; rw_enter(&inject_lock, RW_READER); for (handler = list_head(&inject_handlers); handler != NULL; handler = list_next(&inject_handlers, handler)) { if (spa != handler->zi_spa) continue; if (handler->zi_record.zi_type == type && strcmp(tag, handler->zi_record.zi_func) == 0) panic("Panic requested in function %s\n", tag); } rw_exit(&inject_lock); } /* * Determine if the I/O in question should return failure. Returns the errno * to be returned to the caller. */ int zio_handle_fault_injection(zio_t *zio, int error) { int ret = 0; inject_handler_t *handler; /* * Ignore I/O not associated with any logical data. */ if (zio->io_logical == NULL) return (0); /* * Currently, we only support fault injection on reads. */ if (zio->io_type != ZIO_TYPE_READ) return (0); rw_enter(&inject_lock, RW_READER); for (handler = list_head(&inject_handlers); handler != NULL; handler = list_next(&inject_handlers, handler)) { if (zio->io_spa != handler->zi_spa || handler->zi_record.zi_cmd != ZINJECT_DATA_FAULT) continue; /* If this handler matches, return EIO */ if (zio_match_handler(&zio->io_logical->io_bookmark, zio->io_bp ? BP_GET_TYPE(zio->io_bp) : DMU_OT_NONE, &handler->zi_record, error)) { ret = error; break; } } rw_exit(&inject_lock); return (ret); } /* * Determine if the zio is part of a label update and has an injection * handler associated with that portion of the label. Currently, we * allow error injection in either the nvlist or the uberblock region of * of the vdev label. */ int zio_handle_label_injection(zio_t *zio, int error) { inject_handler_t *handler; vdev_t *vd = zio->io_vd; uint64_t offset = zio->io_offset; int label; int ret = 0; if (offset >= VDEV_LABEL_START_SIZE && offset < vd->vdev_psize - VDEV_LABEL_END_SIZE) return (0); rw_enter(&inject_lock, RW_READER); for (handler = list_head(&inject_handlers); handler != NULL; handler = list_next(&inject_handlers, handler)) { uint64_t start = handler->zi_record.zi_start; uint64_t end = handler->zi_record.zi_end; if (handler->zi_record.zi_cmd != ZINJECT_LABEL_FAULT) continue; /* * The injection region is the relative offsets within a * vdev label. We must determine the label which is being * updated and adjust our region accordingly. */ label = vdev_label_number(vd->vdev_psize, offset); start = vdev_label_offset(vd->vdev_psize, label, start); end = vdev_label_offset(vd->vdev_psize, label, end); if (zio->io_vd->vdev_guid == handler->zi_record.zi_guid && (offset >= start && offset <= end)) { ret = error; break; } } rw_exit(&inject_lock); return (ret); } int zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error) { inject_handler_t *handler; int ret = 0; /* * We skip over faults in the labels unless it's during * device open (i.e. zio == NULL). */ if (zio != NULL) { uint64_t offset = zio->io_offset; if (offset < VDEV_LABEL_START_SIZE || offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE) return (0); } rw_enter(&inject_lock, RW_READER); for (handler = list_head(&inject_handlers); handler != NULL; handler = list_next(&inject_handlers, handler)) { if (handler->zi_record.zi_cmd != ZINJECT_DEVICE_FAULT) continue; if (vd->vdev_guid == handler->zi_record.zi_guid) { if (handler->zi_record.zi_failfast && (zio == NULL || (zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))) { continue; } /* Handle type specific I/O failures */ if (zio != NULL && handler->zi_record.zi_iotype != ZIO_TYPES && handler->zi_record.zi_iotype != zio->io_type) continue; if (handler->zi_record.zi_error == error) { /* * For a failed open, pretend like the device * has gone away. */ if (error == ENXIO) vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; /* * Treat these errors as if they had been * retried so that all the appropriate stats * and FMA events are generated. */ if (!handler->zi_record.zi_failfast && zio != NULL) zio->io_flags |= ZIO_FLAG_IO_RETRY; ret = error; break; } if (handler->zi_record.zi_error == ENXIO) { ret = SET_ERROR(EIO); break; } } } rw_exit(&inject_lock); return (ret); } /* * Simulate hardware that ignores cache flushes. For requested number * of seconds nix the actual writing to disk. */ void zio_handle_ignored_writes(zio_t *zio) { inject_handler_t *handler; rw_enter(&inject_lock, RW_READER); for (handler = list_head(&inject_handlers); handler != NULL; handler = list_next(&inject_handlers, handler)) { /* Ignore errors not destined for this pool */ if (zio->io_spa != handler->zi_spa || handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES) continue; /* * Positive duration implies # of seconds, negative * a number of txgs */ if (handler->zi_record.zi_timer == 0) { if (handler->zi_record.zi_duration > 0) handler->zi_record.zi_timer = ddi_get_lbolt64(); else handler->zi_record.zi_timer = zio->io_txg; } /* Have a "problem" writing 60% of the time */ if (spa_get_random(100) < 60) zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; break; } rw_exit(&inject_lock); } void spa_handle_ignored_writes(spa_t *spa) { inject_handler_t *handler; if (zio_injection_enabled == 0) return; rw_enter(&inject_lock, RW_READER); for (handler = list_head(&inject_handlers); handler != NULL; handler = list_next(&inject_handlers, handler)) { if (spa != handler->zi_spa || handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES) continue; if (handler->zi_record.zi_duration > 0) { VERIFY(handler->zi_record.zi_timer == 0 || handler->zi_record.zi_timer + handler->zi_record.zi_duration * hz > ddi_get_lbolt64()); } else { /* duration is negative so the subtraction here adds */ VERIFY(handler->zi_record.zi_timer == 0 || handler->zi_record.zi_timer - handler->zi_record.zi_duration >= spa_syncing_txg(spa)); } } rw_exit(&inject_lock); } -uint64_t +hrtime_t zio_handle_io_delay(zio_t *zio) { vdev_t *vd = zio->io_vd; - inject_handler_t *handler; - uint64_t seconds = 0; + inject_handler_t *min_handler = NULL; + hrtime_t min_target = 0; - if (zio_injection_enabled == 0) - return (0); - rw_enter(&inject_lock, RW_READER); - for (handler = list_head(&inject_handlers); handler != NULL; - handler = list_next(&inject_handlers, handler)) { + /* + * inject_delay_count is a subset of zio_injection_enabled that + * is only incremented for delay handlers. These checks are + * mainly added to remind the reader why we're not explicitly + * checking zio_injection_enabled like the other functions. + */ + IMPLY(inject_delay_count > 0, zio_injection_enabled > 0); + IMPLY(zio_injection_enabled == 0, inject_delay_count == 0); + /* + * If there aren't any inject delay handlers registered, then we + * can short circuit and simply return 0 here. A value of zero + * informs zio_delay_interrupt() that this request should not be + * delayed. This short circuit keeps us from acquiring the + * inject_delay_mutex unnecessarily. + */ + if (inject_delay_count == 0) { + rw_exit(&inject_lock); + return (0); + } + + /* + * Each inject handler has a number of "lanes" associated with + * it. Each lane is able to handle requests independently of one + * another, and at a latency defined by the inject handler + * record's zi_timer field. Thus if a handler in configured with + * a single lane with a 10ms latency, it will delay requests + * such that only a single request is completed every 10ms. So, + * if more than one request is attempted per each 10ms interval, + * the average latency of the requests will be greater than + * 10ms; but if only a single request is submitted each 10ms + * interval the average latency will be 10ms. + * + * We need to acquire this mutex to prevent multiple concurrent + * threads being assigned to the same lane of a given inject + * handler. The mutex allows us to perform the following two + * operations atomically: + * + * 1. determine the minimum handler and minimum target + * value of all the possible handlers + * 2. update that minimum handler's lane array + * + * Without atomicity, two (or more) threads could pick the same + * lane in step (1), and then conflict with each other in step + * (2). This could allow a single lane handler to process + * multiple requests simultaneously, which shouldn't be possible. + */ + mutex_enter(&inject_delay_mtx); + + for (inject_handler_t *handler = list_head(&inject_handlers); + handler != NULL; handler = list_next(&inject_handlers, handler)) { if (handler->zi_record.zi_cmd != ZINJECT_DELAY_IO) continue; - if (vd->vdev_guid == handler->zi_record.zi_guid) { - seconds = handler->zi_record.zi_timer; - break; + if (vd->vdev_guid != handler->zi_record.zi_guid) + continue; + + /* + * Defensive; should never happen as the array allocation + * occurs prior to inserting this handler on the list. + */ + ASSERT3P(handler->zi_lanes, !=, NULL); + + /* + * This should never happen, the zinject command should + * prevent a user from setting an IO delay with zero lanes. + */ + ASSERT3U(handler->zi_record.zi_nlanes, !=, 0); + + ASSERT3U(handler->zi_record.zi_nlanes, >, + handler->zi_next_lane); + + /* + * We want to issue this IO to the lane that will become + * idle the soonest, so we compare the soonest this + * specific handler can complete the IO with all other + * handlers, to find the lowest value of all possible + * lanes. We then use this lane to submit the request. + * + * Since each handler has a constant value for its + * delay, we can just use the "next" lane for that + * handler; as it will always be the lane with the + * lowest value for that particular handler (i.e. the + * lane that will become idle the soonest). This saves a + * scan of each handler's lanes array. + * + * There's two cases to consider when determining when + * this specific IO request should complete. If this + * lane is idle, we want to "submit" the request now so + * it will complete after zi_timer milliseconds. Thus, + * we set the target to now + zi_timer. + * + * If the lane is busy, we want this request to complete + * zi_timer milliseconds after the lane becomes idle. + * Since the 'zi_lanes' array holds the time at which + * each lane will become idle, we use that value to + * determine when this request should complete. + */ + hrtime_t idle = handler->zi_record.zi_timer + gethrtime(); + hrtime_t busy = handler->zi_record.zi_timer + + handler->zi_lanes[handler->zi_next_lane]; + hrtime_t target = MAX(idle, busy); + + if (min_handler == NULL) { + min_handler = handler; + min_target = target; + continue; } + ASSERT3P(min_handler, !=, NULL); + ASSERT3U(min_target, !=, 0); + + /* + * We don't yet increment the "next lane" variable since + * we still might find a lower value lane in another + * handler during any remaining iterations. Once we're + * sure we've selected the absolute minimum, we'll claim + * the lane and increment the handler's "next lane" + * field below. + */ + + if (target < min_target) { + min_handler = handler; + min_target = target; + } } + + /* + * 'min_handler' will be NULL if no IO delays are registered for + * this vdev, otherwise it will point to the handler containing + * the lane that will become idle the soonest. + */ + if (min_handler != NULL) { + ASSERT3U(min_target, !=, 0); + min_handler->zi_lanes[min_handler->zi_next_lane] = min_target; + + /* + * If we've used all possible lanes for this handler, + * loop back and start using the first lane again; + * otherwise, just increment the lane index. + */ + min_handler->zi_next_lane = (min_handler->zi_next_lane + 1) % + min_handler->zi_record.zi_nlanes; + } + + mutex_exit(&inject_delay_mtx); rw_exit(&inject_lock); - return (seconds); + + return (min_target); } /* * Create a new handler for the given record. We add it to the list, adding * a reference to the spa_t in the process. We increment zio_injection_enabled, * which is the switch to trigger all fault injection. */ int zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record) { inject_handler_t *handler; int error; spa_t *spa; /* * If this is pool-wide metadata, make sure we unload the corresponding * spa_t, so that the next attempt to load it will trigger the fault. * We call spa_reset() to unload the pool appropriately. */ if (flags & ZINJECT_UNLOAD_SPA) if ((error = spa_reset(name)) != 0) return (error); + if (record->zi_cmd == ZINJECT_DELAY_IO) { + /* + * A value of zero for the number of lanes or for the + * delay time doesn't make sense. + */ + if (record->zi_timer == 0 || record->zi_nlanes == 0) + return (SET_ERROR(EINVAL)); + + /* + * The number of lanes is directly mapped to the size of + * an array used by the handler. Thus, to ensure the + * user doesn't trigger an allocation that's "too large" + * we cap the number of lanes here. + */ + if (record->zi_nlanes >= UINT16_MAX) + return (SET_ERROR(EINVAL)); + } + if (!(flags & ZINJECT_NULL)) { /* * spa_inject_ref() will add an injection reference, which will * prevent the pool from being removed from the namespace while * still allowing it to be unloaded. */ if ((spa = spa_inject_addref(name)) == NULL) return (SET_ERROR(ENOENT)); handler = kmem_alloc(sizeof (inject_handler_t), KM_SLEEP); + handler->zi_spa = spa; + handler->zi_record = *record; + + if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) { + handler->zi_lanes = kmem_zalloc( + sizeof (*handler->zi_lanes) * + handler->zi_record.zi_nlanes, KM_SLEEP); + handler->zi_next_lane = 0; + } else { + handler->zi_lanes = NULL; + handler->zi_next_lane = 0; + } + rw_enter(&inject_lock, RW_WRITER); + /* + * We can't move this increment into the conditional + * above because we need to hold the RW_WRITER lock of + * inject_lock, and we don't want to hold that while + * allocating the handler's zi_lanes array. + */ + if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) { + ASSERT3S(inject_delay_count, >=, 0); + inject_delay_count++; + ASSERT3S(inject_delay_count, >, 0); + } + *id = handler->zi_id = inject_next_id++; - handler->zi_spa = spa; - handler->zi_record = *record; list_insert_tail(&inject_handlers, handler); atomic_inc_32(&zio_injection_enabled); rw_exit(&inject_lock); } /* * Flush the ARC, so that any attempts to read this data will end up * going to the ZIO layer. Note that this is a little overkill, but * we don't have the necessary ARC interfaces to do anything else, and * fault injection isn't a performance critical path. */ if (flags & ZINJECT_FLUSH_ARC) /* * We must use FALSE to ensure arc_flush returns, since * we're not preventing concurrent ARC insertions. */ arc_flush(NULL, FALSE); return (0); } /* * Returns the next record with an ID greater than that supplied to the * function. Used to iterate over all handlers in the system. */ int zio_inject_list_next(int *id, char *name, size_t buflen, zinject_record_t *record) { inject_handler_t *handler; int ret; mutex_enter(&spa_namespace_lock); rw_enter(&inject_lock, RW_READER); for (handler = list_head(&inject_handlers); handler != NULL; handler = list_next(&inject_handlers, handler)) if (handler->zi_id > *id) break; if (handler) { *record = handler->zi_record; *id = handler->zi_id; (void) strncpy(name, spa_name(handler->zi_spa), buflen); ret = 0; } else { ret = SET_ERROR(ENOENT); } rw_exit(&inject_lock); mutex_exit(&spa_namespace_lock); return (ret); } /* * Clear the fault handler with the given identifier, or return ENOENT if none * exists. */ int zio_clear_fault(int id) { inject_handler_t *handler; rw_enter(&inject_lock, RW_WRITER); for (handler = list_head(&inject_handlers); handler != NULL; handler = list_next(&inject_handlers, handler)) if (handler->zi_id == id) break; if (handler == NULL) { rw_exit(&inject_lock); return (SET_ERROR(ENOENT)); } + if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) { + ASSERT3S(inject_delay_count, >, 0); + inject_delay_count--; + ASSERT3S(inject_delay_count, >=, 0); + } + list_remove(&inject_handlers, handler); rw_exit(&inject_lock); + if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) { + ASSERT3P(handler->zi_lanes, !=, NULL); + kmem_free(handler->zi_lanes, sizeof (*handler->zi_lanes) * + handler->zi_record.zi_nlanes); + } else { + ASSERT3P(handler->zi_lanes, ==, NULL); + } + spa_inject_delref(handler->zi_spa); kmem_free(handler, sizeof (inject_handler_t)); atomic_dec_32(&zio_injection_enabled); return (0); } void zio_inject_init(void) { rw_init(&inject_lock, NULL, RW_DEFAULT, NULL); + mutex_init(&inject_delay_mtx, NULL, MUTEX_DEFAULT, NULL); list_create(&inject_handlers, sizeof (inject_handler_t), offsetof(inject_handler_t, zi_link)); } void zio_inject_fini(void) { list_destroy(&inject_handlers); + mutex_destroy(&inject_delay_mtx); rw_destroy(&inject_lock); } Index: stable/10 =================================================================== --- stable/10 (revision 297107) +++ stable/10 (revision 297108) Property changes on: stable/10 ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head:r296510,296563,296567