diff --git a/cmd/zpool/zpool_vdev.c b/cmd/zpool/zpool_vdev.c index 8db1aef76518..eb81dfe07068 100644 --- a/cmd/zpool/zpool_vdev.c +++ b/cmd/zpool/zpool_vdev.c @@ -1,1885 +1,1882 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013, 2018 by Delphix. All rights reserved. * Copyright (c) 2016, 2017 Intel Corporation. * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>. */ /* * Functions to convert between a list of vdevs and an nvlist representing the * configuration. Each entry in the list can be one of: * * Device vdevs * disk=(path=..., devid=...) * file=(path=...) * * Group vdevs * raidz[1|2]=(...) * mirror=(...) * * Hot spares * * While the underlying implementation supports it, group vdevs cannot contain * other group vdevs. All userland verification of devices is contained within * this file. If successful, the nvlist returned can be passed directly to the * kernel; we've done as much verification as possible in userland. * * Hot spares are a special case, and passed down as an array of disk vdevs, at * the same level as the root of the vdev tree. * * The only function exported by this file is 'make_root_vdev'. The * function performs several passes: * * 1. Construct the vdev specification. Performs syntax validation and * makes sure each device is valid. * 2. Check for devices in use. Using libblkid to make sure that no * devices are also in use. Some can be overridden using the 'force' * flag, others cannot. * 3. Check for replication errors if the 'force' flag is not specified. * Validates that the replication level is consistent across the * entire pool. * 4. Call libzfs to label any whole disks with an EFI label. */ #include <assert.h> #include <ctype.h> #include <errno.h> #include <fcntl.h> #include <libintl.h> #include <libnvpair.h> #include <libzutil.h> #include <limits.h> #include <sys/spa.h> #include <stdio.h> #include <string.h> #include <unistd.h> #include "zpool_util.h" #include <sys/zfs_context.h> #include <sys/stat.h> /* * For any given vdev specification, we can have multiple errors. The * vdev_error() function keeps track of whether we have seen an error yet, and * prints out a header if it's the first error we've seen. */ boolean_t error_seen; boolean_t is_force; void vdev_error(const char *fmt, ...) { va_list ap; if (!error_seen) { (void) fprintf(stderr, gettext("invalid vdev specification\n")); if (!is_force) (void) fprintf(stderr, gettext("use '-f' to override " "the following errors:\n")); else (void) fprintf(stderr, gettext("the following errors " "must be manually repaired:\n")); error_seen = B_TRUE; } va_start(ap, fmt); (void) vfprintf(stderr, fmt, ap); va_end(ap); } /* * Check that a file is valid. All we can do in this case is check that it's * not in use by another pool, and not in use by swap.
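* Hot spares are the exception: a device that is already labeled as a hot * spare may be shared between pools, which the POOL_STATE_SPARE case below * allows when 'isspare' is set.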
*/ int check_file_generic(const char *file, boolean_t force, boolean_t isspare) { char *name; int fd; int ret = 0; pool_state_t state; boolean_t inuse; if ((fd = open(file, O_RDONLY)) < 0) return (0); if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) == 0 && inuse) { const char *desc; switch (state) { case POOL_STATE_ACTIVE: desc = gettext("active"); break; case POOL_STATE_EXPORTED: desc = gettext("exported"); break; case POOL_STATE_POTENTIALLY_ACTIVE: desc = gettext("potentially active"); break; default: desc = gettext("unknown"); break; } /* * Allow hot spares to be shared between pools. */ if (state == POOL_STATE_SPARE && isspare) { free(name); (void) close(fd); return (0); } if (state == POOL_STATE_ACTIVE || state == POOL_STATE_SPARE || !force) { switch (state) { case POOL_STATE_SPARE: vdev_error(gettext("%s is reserved as a hot " "spare for pool %s\n"), file, name); break; default: vdev_error(gettext("%s is part of %s pool " "'%s'\n"), file, desc, name); break; } ret = -1; } free(name); } (void) close(fd); return (ret); } /* * This may be a shorthand device path or it could be total gibberish. * Check to see if it is a known device available in zfs_vdev_paths. * As part of this check, see if we've been given an entire disk * (minus the slice number). */ static int is_shorthand_path(const char *arg, char *path, size_t path_size, struct stat64 *statbuf, boolean_t *wholedisk) { int error; error = zfs_resolve_shortname(arg, path, path_size); if (error == 0) { *wholedisk = zfs_dev_is_whole_disk(path); if (*wholedisk || (stat64(path, statbuf) == 0)) return (0); } strlcpy(path, arg, path_size); memset(statbuf, 0, sizeof (*statbuf)); *wholedisk = B_FALSE; return (error); } /* * Determine if the given path is a hot spare within the given configuration. * If no configuration is given we rely solely on the label. */ static boolean_t is_spare(nvlist_t *config, const char *path) { int fd; pool_state_t state; char *name = NULL; nvlist_t *label; uint64_t guid, spareguid; nvlist_t *nvroot; nvlist_t **spares; uint_t i, nspares; boolean_t inuse; if (zpool_is_draid_spare(path)) return (B_TRUE); if ((fd = open(path, O_RDONLY|O_DIRECT)) < 0) return (B_FALSE); if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) != 0 || !inuse || state != POOL_STATE_SPARE || zpool_read_label(fd, &label, NULL) != 0) { free(name); (void) close(fd); return (B_FALSE); } free(name); (void) close(fd); if (config == NULL) { nvlist_free(label); return (B_TRUE); } verify(nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) == 0); nvlist_free(label); verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) { for (i = 0; i < nspares; i++) { verify(nvlist_lookup_uint64(spares[i], ZPOOL_CONFIG_GUID, &spareguid) == 0); if (spareguid == guid) return (B_TRUE); } } return (B_FALSE); } /* * Create a leaf vdev. Determine if this is a file or a device. If it's a * device, fill in the device id to make a complete nvlist. Valid forms for a * leaf vdev are: * * /dev/xxx Complete disk path * /xxx Full path to file * xxx Shorthand for /xxx * draid* Virtual dRAID spare */ static nvlist_t * make_leaf_vdev(nvlist_t *props, const char *arg, boolean_t is_primary) { char path[MAXPATHLEN]; struct stat64 statbuf; nvlist_t *vdev = NULL; const char *type = NULL; boolean_t wholedisk = B_FALSE; uint64_t ashift = 0; int err; /* * Determine what type of vdev this is, and put the full path into * 'path'. 
We detect whether this is a device or file afterwards by * checking the st_mode of the file. */ if (arg[0] == '/') { /* * Complete device or file path. Exact type is determined by * examining the file descriptor afterwards. Symbolic links * are resolved to their real paths to determine whole disk * and S_ISBLK/S_ISREG type checks. However, we are careful * to store the given path as ZPOOL_CONFIG_PATH to ensure we * can leverage udev's persistent device labels. */ if (realpath(arg, path) == NULL) { (void) fprintf(stderr, gettext("cannot resolve path '%s'\n"), arg); return (NULL); } wholedisk = zfs_dev_is_whole_disk(path); if (!wholedisk && (stat64(path, &statbuf) != 0)) { (void) fprintf(stderr, gettext("cannot open '%s': %s\n"), path, strerror(errno)); return (NULL); } /* After whole disk check restore original passed path */ strlcpy(path, arg, sizeof (path)); } else if (zpool_is_draid_spare(arg)) { if (!is_primary) { (void) fprintf(stderr, gettext("cannot open '%s': dRAID spares can only " "be used to replace primary vdevs\n"), arg); return (NULL); } wholedisk = B_TRUE; strlcpy(path, arg, sizeof (path)); type = VDEV_TYPE_DRAID_SPARE; } else { err = is_shorthand_path(arg, path, sizeof (path), &statbuf, &wholedisk); if (err != 0) { /* * If we got ENOENT, then the user gave us * gibberish, so try to direct them with a * reasonable error message. Otherwise, * regurgitate strerror() since it's the best we * can do. */ if (err == ENOENT) { (void) fprintf(stderr, gettext("cannot open '%s': no such " "device in %s\n"), arg, DISK_ROOT); (void) fprintf(stderr, gettext("must be a full path or " "shorthand device name\n")); return (NULL); } else { (void) fprintf(stderr, gettext("cannot open '%s': %s\n"), path, strerror(errno)); return (NULL); } } } if (type == NULL) { /* * Determine whether this is a device or a file. */ if (wholedisk || S_ISBLK(statbuf.st_mode)) { type = VDEV_TYPE_DISK; } else if (S_ISREG(statbuf.st_mode)) { type = VDEV_TYPE_FILE; } else { fprintf(stderr, gettext("cannot use '%s': must " "be a block device or regular file\n"), path); return (NULL); } } /* * Finally, we have the complete device or file, and we know that it is * acceptable to use. Construct the nvlist to describe this vdev. All * vdevs have a 'path' element, and devices also have a 'devid' element. */ verify(nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) == 0); verify(nvlist_add_string(vdev, ZPOOL_CONFIG_PATH, path) == 0); verify(nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, type) == 0); if (strcmp(type, VDEV_TYPE_DISK) == 0) verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, (uint64_t)wholedisk) == 0); /* * Override defaults if custom properties are provided. */ if (props != NULL) { char *value = NULL; if (nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_ASHIFT), &value) == 0) { if (zfs_nicestrtonum(NULL, value, &ashift) != 0) { (void) fprintf(stderr, gettext("ashift must be a number.\n")); return (NULL); } if (ashift != 0 && (ashift < ASHIFT_MIN || ashift > ASHIFT_MAX)) { (void) fprintf(stderr, gettext("invalid 'ashift=%" PRIu64 "' " "property: only values between %" PRId32 " " "and %" PRId32 " are allowed.\n"), ashift, ASHIFT_MIN, ASHIFT_MAX); return (NULL); } } } /* * If the device is known to incorrectly report its physical sector * size, explicitly provide the known correct value.
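* For example, a drive with 4096-byte physical sectors that misreports * 512-byte sectors gets ashift = highbit64(4096) - 1 = 12 from the database * lookup below.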
*/ if (ashift == 0) { int sector_size; if (check_sector_size_database(path, &sector_size) == B_TRUE) ashift = highbit64(sector_size) - 1; } if (ashift > 0) (void) nvlist_add_uint64(vdev, ZPOOL_CONFIG_ASHIFT, ashift); return (vdev); } /* * Go through and verify the replication level of the pool is consistent. * Performs the following checks: * * For the new spec, verifies that devices in mirrors and raidz are the * same size. * * If the current configuration already has inconsistent replication * levels, ignore any other potential problems in the new spec. * * Otherwise, make sure that the current spec (if there is one) and the new * spec have consistent replication levels. * * If there is no current spec (create), make sure the new spec has at least * one general purpose vdev. */ typedef struct replication_level { char *zprl_type; uint64_t zprl_children; uint64_t zprl_parity; } replication_level_t; #define ZPOOL_FUZZ (16 * 1024 * 1024) /* * N.B. For the purposes of comparing replication levels dRAID can be * considered functionally equivalent to raidz. */ static boolean_t is_raidz_mirror(replication_level_t *a, replication_level_t *b, replication_level_t **raidz, replication_level_t **mirror) { if ((strcmp(a->zprl_type, "raidz") == 0 || strcmp(a->zprl_type, "draid") == 0) && strcmp(b->zprl_type, "mirror") == 0) { *raidz = a; *mirror = b; return (B_TRUE); } return (B_FALSE); } /* * Comparison for determining if dRAID and raidz were passed in either order. */ static boolean_t is_raidz_draid(replication_level_t *a, replication_level_t *b) { if ((strcmp(a->zprl_type, "raidz") == 0 || strcmp(a->zprl_type, "draid") == 0) && (strcmp(b->zprl_type, "raidz") == 0 || strcmp(b->zprl_type, "draid") == 0)) { return (B_TRUE); } return (B_FALSE); } /* * Given a list of toplevel vdevs, return the current replication level. If * the config is inconsistent, then NULL is returned. If 'fatal' is set, then * an error message will be displayed for each self-inconsistent vdev. */ static replication_level_t * get_replication(nvlist_t *nvroot, boolean_t fatal) { nvlist_t **top; uint_t t, toplevels; nvlist_t **child; uint_t c, children; nvlist_t *nv; char *type; replication_level_t lastrep = {0}; replication_level_t rep; replication_level_t *ret; replication_level_t *raidz, *mirror; boolean_t dontreport; ret = safe_malloc(sizeof (replication_level_t)); verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &top, &toplevels) == 0); for (t = 0; t < toplevels; t++) { uint64_t is_log = B_FALSE; nv = top[t]; /* * For separate logs we ignore the top level vdev replication * constraints. */ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &is_log); if (is_log) continue; /* * Ignore holes introduced by removing aux devices, along * with indirect vdevs introduced by previously removed * vdevs. */ verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); if (strcmp(type, VDEV_TYPE_HOLE) == 0 || strcmp(type, VDEV_TYPE_INDIRECT) == 0) continue; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) { /* * This is a 'file' or 'disk' vdev. */ rep.zprl_type = type; rep.zprl_children = 1; rep.zprl_parity = 0; } else { int64_t vdev_size; /* * This is a mirror or RAID-Z vdev. Go through and make * sure the contents are all the same (files vs. disks), * keeping track of the number of elements in the * process. * * We also check that the size of each vdev (if it can * be determined) is the same.
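* For example, 'mirror sda sdb' yields {type = mirror, children = 2, * parity = 0} and 'raidz1 sda sdb sdc' yields {type = raidz, children = 3, * parity = 1}.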
*/ rep.zprl_type = type; rep.zprl_children = 0; if (strcmp(type, VDEV_TYPE_RAIDZ) == 0 || strcmp(type, VDEV_TYPE_DRAID) == 0) { verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &rep.zprl_parity) == 0); assert(rep.zprl_parity != 0); } else { rep.zprl_parity = 0; } /* * The 'dontreport' variable indicates that we've * already reported an error for this spec, so don't * bother doing it again. */ type = NULL; dontreport = 0; vdev_size = -1LL; for (c = 0; c < children; c++) { nvlist_t *cnv = child[c]; char *path; struct stat64 statbuf; int64_t size = -1LL; char *childtype; int fd, err; rep.zprl_children++; verify(nvlist_lookup_string(cnv, ZPOOL_CONFIG_TYPE, &childtype) == 0); /* * If this is a replacing or spare vdev, then * get the real first child of the vdev: do this * in a loop because replacing and spare vdevs * can be nested. */ while (strcmp(childtype, VDEV_TYPE_REPLACING) == 0 || strcmp(childtype, VDEV_TYPE_SPARE) == 0) { nvlist_t **rchild; uint_t rchildren; verify(nvlist_lookup_nvlist_array(cnv, ZPOOL_CONFIG_CHILDREN, &rchild, &rchildren) == 0); assert(rchildren == 2); cnv = rchild[0]; verify(nvlist_lookup_string(cnv, ZPOOL_CONFIG_TYPE, &childtype) == 0); } verify(nvlist_lookup_string(cnv, ZPOOL_CONFIG_PATH, &path) == 0); /* * If we have a raidz/mirror that combines disks * with files, report it as an error. */ if (!dontreport && type != NULL && strcmp(type, childtype) != 0) { if (ret != NULL) free(ret); ret = NULL; if (fatal) vdev_error(gettext( "mismatched replication " "level: %s contains both " "files and devices\n"), rep.zprl_type); else return (NULL); dontreport = B_TRUE; } /* * According to stat(2), the value of 'st_size' * is undefined for block devices and character * devices. But there is no effective way to * determine the real size in userland. * * Instead, we'll take advantage of an * implementation detail of spec_size(). If the * device is currently open, then we (should) * return a valid size. * * If we still don't get a valid size (indicated * by a size of 0 or MAXOFFSET_T), then ignore * this device altogether. */ if ((fd = open(path, O_RDONLY)) >= 0) { err = fstat64_blk(fd, &statbuf); (void) close(fd); } else { err = stat64(path, &statbuf); } if (err != 0 || statbuf.st_size == 0 || statbuf.st_size == MAXOFFSET_T) continue; size = statbuf.st_size; /* * Also make sure that devices and * slices have a consistent size. If * they differ by a significant amount * (~16MB) then report an error. */ if (!dontreport && (vdev_size != -1LL && (llabs(size - vdev_size) > ZPOOL_FUZZ))) { if (ret != NULL) free(ret); ret = NULL; if (fatal) vdev_error(gettext( "%s contains devices of " "different sizes\n"), rep.zprl_type); else return (NULL); dontreport = B_TRUE; } type = childtype; vdev_size = size; } } /* * At this point, we have the replication of the last toplevel * vdev in 'rep'. Compare it to 'lastrep' to see if it is * different. */ if (lastrep.zprl_type != NULL) { if (is_raidz_mirror(&lastrep, &rep, &raidz, &mirror) || is_raidz_mirror(&rep, &lastrep, &raidz, &mirror)) { /* * Accepted raidz and mirror when they can * handle the same number of disk failures. */ if (raidz->zprl_parity != mirror->zprl_children - 1) { if (ret != NULL) free(ret); ret = NULL; if (fatal) vdev_error(gettext( "mismatched replication " "level: " "%s and %s vdevs with " "different redundancy, " "%llu vs. 
%llu (%llu-way) " "are present\n"), raidz->zprl_type, mirror->zprl_type, (u_longlong_t) raidz->zprl_parity, (u_longlong_t) mirror->zprl_children - 1, (u_longlong_t) mirror->zprl_children); else return (NULL); } } else if (is_raidz_draid(&lastrep, &rep)) { /* * Accepted raidz and draid when they can * handle the same number of disk failures. */ if (lastrep.zprl_parity != rep.zprl_parity) { if (ret != NULL) free(ret); ret = NULL; if (fatal) vdev_error(gettext( "mismatched replication " "level: %s and %s vdevs " "with different " "redundancy, %llu vs. " "%llu are present\n"), lastrep.zprl_type, rep.zprl_type, (u_longlong_t) lastrep.zprl_parity, (u_longlong_t) rep.zprl_parity); else return (NULL); } } else if (strcmp(lastrep.zprl_type, rep.zprl_type) != 0) { if (ret != NULL) free(ret); ret = NULL; if (fatal) vdev_error(gettext( "mismatched replication level: " "both %s and %s vdevs are " "present\n"), lastrep.zprl_type, rep.zprl_type); else return (NULL); } else if (lastrep.zprl_parity != rep.zprl_parity) { if (ret) free(ret); ret = NULL; if (fatal) vdev_error(gettext( "mismatched replication level: " "both %llu and %llu device parity " "%s vdevs are present\n"), (u_longlong_t) lastrep.zprl_parity, (u_longlong_t)rep.zprl_parity, rep.zprl_type); else return (NULL); } else if (lastrep.zprl_children != rep.zprl_children) { if (ret) free(ret); ret = NULL; if (fatal) vdev_error(gettext( "mismatched replication level: " "both %llu-way and %llu-way %s " "vdevs are present\n"), (u_longlong_t) lastrep.zprl_children, (u_longlong_t) rep.zprl_children, rep.zprl_type); else return (NULL); } } lastrep = rep; } if (ret != NULL) *ret = rep; return (ret); } /* * Check the replication level of the vdev spec against the current pool. Calls * get_replication() to make sure the new spec is self-consistent. If the pool * has a consistent replication level, then we ignore any errors. Otherwise, * report any difference between the two. */ static int check_replication(nvlist_t *config, nvlist_t *newroot) { nvlist_t **child; uint_t children; replication_level_t *current = NULL, *new; replication_level_t *raidz, *mirror; int ret; /* * If we have a current pool configuration, check to see if it's * self-consistent. If not, simply return success. */ if (config != NULL) { nvlist_t *nvroot; verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); if ((current = get_replication(nvroot, B_FALSE)) == NULL) return (0); } /* * for spares there may be no children, and therefore no * replication level to check */ if ((nvlist_lookup_nvlist_array(newroot, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) || (children == 0)) { free(current); return (0); } /* * If all we have is logs then there's no replication level to check. */ if (num_logs(newroot) == children) { free(current); return (0); } /* * Get the replication level of the new vdev spec, reporting any * inconsistencies found. */ if ((new = get_replication(newroot, B_TRUE)) == NULL) { free(current); return (-1); } /* * Check to see if the new vdev spec matches the replication level of * the current pool. */ ret = 0; if (current != NULL) { if (is_raidz_mirror(current, new, &raidz, &mirror) || is_raidz_mirror(new, current, &raidz, &mirror)) { if (raidz->zprl_parity != mirror->zprl_children - 1) { vdev_error(gettext( "mismatched replication level: pool and " "new vdev with different redundancy, %s " "and %s vdevs, %llu vs. 
%llu (%llu-way)\n"), raidz->zprl_type, mirror->zprl_type, (u_longlong_t)raidz->zprl_parity, (u_longlong_t)mirror->zprl_children - 1, (u_longlong_t)mirror->zprl_children); ret = -1; } } else if (strcmp(current->zprl_type, new->zprl_type) != 0) { vdev_error(gettext( "mismatched replication level: pool uses %s " "and new vdev is %s\n"), current->zprl_type, new->zprl_type); ret = -1; } else if (current->zprl_parity != new->zprl_parity) { vdev_error(gettext( "mismatched replication level: pool uses %llu " "device parity and new vdev uses %llu\n"), (u_longlong_t)current->zprl_parity, (u_longlong_t)new->zprl_parity); ret = -1; } else if (current->zprl_children != new->zprl_children) { vdev_error(gettext( "mismatched replication level: pool uses %llu-way " "%s and new vdev uses %llu-way %s\n"), (u_longlong_t)current->zprl_children, current->zprl_type, (u_longlong_t)new->zprl_children, new->zprl_type); ret = -1; } } free(new); if (current != NULL) free(current); return (ret); } static int zero_label(char *path) { const int size = 4096; char buf[size]; int err, fd; if ((fd = open(path, O_WRONLY|O_EXCL)) < 0) { (void) fprintf(stderr, gettext("cannot open '%s': %s\n"), path, strerror(errno)); return (-1); } memset(buf, 0, size); err = write(fd, buf, size); (void) fdatasync(fd); (void) close(fd); if (err == -1) { (void) fprintf(stderr, gettext("cannot zero first %d bytes " "of '%s': %s\n"), size, path, strerror(errno)); return (-1); } if (err != size) { (void) fprintf(stderr, gettext("could only zero %d/%d bytes " "of '%s'\n"), err, size, path); return (-1); } return (0); } /* * Go through and find any whole disks in the vdev specification, labelling them * as appropriate. When constructing the vdev spec, we were unable to open this * device in order to provide a devid. Now that we have labelled the disk and * know that slice 0 is valid, we can construct the devid now. * * If the disk was already labeled with an EFI label, we will have gotten the * devid already (because we were able to open the whole disk). Otherwise, we * need to get the devid after we label the disk. */ static int make_disks(zpool_handle_t *zhp, nvlist_t *nv) { nvlist_t **child; uint_t c, children; char *type, *path; char devpath[MAXPATHLEN]; char udevpath[MAXPATHLEN]; uint64_t wholedisk; struct stat64 statbuf; int is_exclusive = 0; int fd; int ret; verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) { if (strcmp(type, VDEV_TYPE_DISK) != 0) return (0); /* * We have a disk device. If this is a whole disk write * out the efi partition table, otherwise write zero's to * the first 4k of the partition. This is to ensure that * libblkid will not misidentify the partition due to a * magic value left by the previous filesystem. */ verify(!nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path)); verify(!nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk)); if (!wholedisk) { /* * Update device id string for mpath nodes (Linux only) */ if (is_mpath_whole_disk(path)) update_vdev_config_dev_strs(nv); if (!is_spare(NULL, path)) (void) zero_label(path); return (0); } if (realpath(path, devpath) == NULL) { ret = errno; (void) fprintf(stderr, gettext("cannot resolve path '%s'\n"), path); return (ret); } /* * Remove any previously existing symlink from a udev path to * the device before labeling the disk. This ensures that * only newly created links are used. 
Otherwise there is a * window between when udev deletes and recreates the link * during which access attempts will fail with ENOENT. */ strlcpy(udevpath, path, MAXPATHLEN); (void) zfs_append_partition(udevpath, MAXPATHLEN); fd = open(devpath, O_RDWR|O_EXCL); if (fd == -1) { if (errno == EBUSY) is_exclusive = 1; #ifdef __FreeBSD__ if (errno == EPERM) is_exclusive = 1; #endif } else { (void) close(fd); } /* * If the partition exists, contains a valid spare label, * and is opened exclusively there is no need to partition * it. Hot spares have already been partitioned and are * held open exclusively by the kernel as a safety measure. * * If the provided path is for a /dev/disk/ device its * symbolic link will be removed, partition table created, * and then block until udev creates the new link. */ if (!is_exclusive && !is_spare(NULL, udevpath)) { char *devnode = strrchr(devpath, '/') + 1; ret = strncmp(udevpath, UDISK_ROOT, strlen(UDISK_ROOT)); if (ret == 0) { ret = lstat64(udevpath, &statbuf); if (ret == 0 && S_ISLNK(statbuf.st_mode)) (void) unlink(udevpath); } /* * When labeling a pool the raw device node name * is provided as it appears under /dev/. */ if (zpool_label_disk(g_zfs, zhp, devnode) == -1) return (-1); /* * Wait for udev to signal the device is available * by the provided path. */ ret = zpool_label_disk_wait(udevpath, DISK_LABEL_WAIT); if (ret) { (void) fprintf(stderr, gettext("missing link: %s was " "partitioned but %s is missing\n"), devnode, udevpath); return (ret); } ret = zero_label(udevpath); if (ret) return (ret); } /* * Update the path to refer to the partition. The presence of * the 'whole_disk' field indicates to the CLI that we should * chop off the partition number when displaying the device in * future output. */ verify(nvlist_add_string(nv, ZPOOL_CONFIG_PATH, udevpath) == 0); /* * Update device id strings for whole disks (Linux only) */ update_vdev_config_dev_strs(nv); return (0); } for (c = 0; c < children; c++) if ((ret = make_disks(zhp, child[c])) != 0) return (ret); if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, &child, &children) == 0) for (c = 0; c < children; c++) if ((ret = make_disks(zhp, child[c])) != 0) return (ret); if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, &child, &children) == 0) for (c = 0; c < children; c++) if ((ret = make_disks(zhp, child[c])) != 0) return (ret); return (0); } /* * Go through and find any devices that are in use. We rely on libdiskmgt for * the majority of this task. */ static boolean_t is_device_in_use(nvlist_t *config, nvlist_t *nv, boolean_t force, boolean_t replacing, boolean_t isspare) { nvlist_t **child; uint_t c, children; char *type, *path; int ret = 0; char buf[MAXPATHLEN]; uint64_t wholedisk = B_FALSE; boolean_t anyinuse = B_FALSE; verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) { verify(!nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path)); if (strcmp(type, VDEV_TYPE_DISK) == 0) verify(!nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk)); /* * As a generic check, we look to see if this is a replace of a * hot spare within the same pool. If so, we allow it * regardless of what libblkid or zpool_in_use() says. 
*/ if (replacing) { (void) strlcpy(buf, path, sizeof (buf)); if (wholedisk) { ret = zfs_append_partition(buf, sizeof (buf)); if (ret == -1) return (-1); } if (is_spare(config, buf)) return (B_FALSE); } if (strcmp(type, VDEV_TYPE_DISK) == 0) ret = check_device(path, force, isspare, wholedisk); else if (strcmp(type, VDEV_TYPE_FILE) == 0) ret = check_file(path, force, isspare); return (ret != 0); } for (c = 0; c < children; c++) if (is_device_in_use(config, child[c], force, replacing, B_FALSE)) anyinuse = B_TRUE; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, &child, &children) == 0) for (c = 0; c < children; c++) if (is_device_in_use(config, child[c], force, replacing, B_TRUE)) anyinuse = B_TRUE; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, &child, &children) == 0) for (c = 0; c < children; c++) if (is_device_in_use(config, child[c], force, replacing, B_FALSE)) anyinuse = B_TRUE; return (anyinuse); } /* * Returns the parity level extracted from a raidz or draid type. * If the parity cannot be determined zero is returned. */ static int get_parity(const char *type) { long parity = 0; const char *p; if (strncmp(type, VDEV_TYPE_RAIDZ, strlen(VDEV_TYPE_RAIDZ)) == 0) { p = type + strlen(VDEV_TYPE_RAIDZ); if (*p == '\0') { /* when unspecified default to single parity */ return (1); } else if (*p == '0') { /* no zero prefixes allowed */ return (0); } else { /* 0-3, no suffixes allowed */ char *end; errno = 0; parity = strtol(p, &end, 10); if (errno != 0 || *end != '\0' || parity < 1 || parity > VDEV_RAIDZ_MAXPARITY) { return (0); } } } else if (strncmp(type, VDEV_TYPE_DRAID, strlen(VDEV_TYPE_DRAID)) == 0) { p = type + strlen(VDEV_TYPE_DRAID); if (*p == '\0' || *p == ':') { /* when unspecified default to single parity */ return (1); } else if (*p == '0') { /* no zero prefixes allowed */ return (0); } else { /* 0-3, allowed suffixes: '\0' or ':' */ char *end; errno = 0; parity = strtol(p, &end, 10); if (errno != 0 || parity < 1 || parity > VDEV_DRAID_MAXPARITY || (*end != '\0' && *end != ':')) { return (0); } } } return ((int)parity); } /* * Assign the minimum and maximum number of devices allowed for * the specified type. On error NULL is returned, otherwise the * type prefix is returned (raidz, mirror, etc). */ static const char * is_grouping(const char *type, int *mindev, int *maxdev) { int nparity; if (strncmp(type, VDEV_TYPE_RAIDZ, strlen(VDEV_TYPE_RAIDZ)) == 0 || strncmp(type, VDEV_TYPE_DRAID, strlen(VDEV_TYPE_DRAID)) == 0) { nparity = get_parity(type); if (nparity == 0) return (NULL); if (mindev != NULL) *mindev = nparity + 1; if (maxdev != NULL) *maxdev = 255; if (strncmp(type, VDEV_TYPE_RAIDZ, strlen(VDEV_TYPE_RAIDZ)) == 0) { return (VDEV_TYPE_RAIDZ); } else { return (VDEV_TYPE_DRAID); } } if (maxdev != NULL) *maxdev = INT_MAX; if (strcmp(type, "mirror") == 0) { if (mindev != NULL) *mindev = 2; return (VDEV_TYPE_MIRROR); } if (strcmp(type, "spare") == 0) { if (mindev != NULL) *mindev = 1; return (VDEV_TYPE_SPARE); } if (strcmp(type, "log") == 0) { if (mindev != NULL) *mindev = 1; return (VDEV_TYPE_LOG); } if (strcmp(type, VDEV_ALLOC_BIAS_SPECIAL) == 0 || strcmp(type, VDEV_ALLOC_BIAS_DEDUP) == 0) { if (mindev != NULL) *mindev = 1; return (type); } if (strcmp(type, "cache") == 0) { if (mindev != NULL) *mindev = 1; return (VDEV_TYPE_L2CACHE); } return (NULL); } /* * Extract the configuration parameters encoded in the dRAID type and * use them to generate a dRAID configuration. 
The expected format is: * * draid[<parity>][:<data>d][:<children>c][:<spares>s] * * The intent is to be able to generate a good configuration when no * additional information is provided. The only mandatory component * of the 'type' is the 'draid' prefix. If a value is not provided * then reasonable defaults are used. The optional components may * appear in any order but the d/s/c suffix is required. * * Valid inputs: * - data: number of data devices per group (1-255) * - parity: number of parity blocks per group (1-3) * - spares: number of distributed spares (0-100) * - children: total number of devices (1-255) * * Examples: * - zpool create tank draid * - zpool create tank draid2:8d:51c:2s */ static int draid_config_by_type(nvlist_t *nv, const char *type, uint64_t children) { uint64_t nparity = 1; uint64_t nspares = 0; uint64_t ndata = UINT64_MAX; uint64_t ngroups = 1; long value; if (strncmp(type, VDEV_TYPE_DRAID, strlen(VDEV_TYPE_DRAID)) != 0) return (EINVAL); nparity = (uint64_t)get_parity(type); - if (nparity == 0) + if (nparity == 0 || nparity > VDEV_DRAID_MAXPARITY) { + fprintf(stderr, + gettext("invalid dRAID parity level %llu; must be " + "between 1 and %d\n"), (u_longlong_t)nparity, + VDEV_DRAID_MAXPARITY); return (EINVAL); + } char *p = (char *)type; while ((p = strchr(p, ':')) != NULL) { char *end; p = p + 1; errno = 0; if (!isdigit(p[0])) { (void) fprintf(stderr, gettext("invalid dRAID " "syntax; expected [:<number><c|d|s>] not '%s'\n"), type); return (EINVAL); } /* Expected non-zero value with c/d/s suffix */ value = strtol(p, &end, 10); char suffix = tolower(*end); if (errno != 0 || (suffix != 'c' && suffix != 'd' && suffix != 's')) { (void) fprintf(stderr, gettext("invalid dRAID " "syntax; expected [:<number><c|d|s>] not '%s'\n"), type); return (EINVAL); } if (suffix == 'c') { if ((uint64_t)value != children) { fprintf(stderr, gettext("invalid number of dRAID children; " "%llu required but %llu provided\n"), (u_longlong_t)value, (u_longlong_t)children); return (EINVAL); } } else if (suffix == 'd') { ndata = (uint64_t)value; } else if (suffix == 's') { nspares = (uint64_t)value; } else { verify(0); /* Unreachable */ } } /* * When a specific number of data disks is not provided limit a * redundancy group to 8 data disks. This value was selected to * provide a reasonable tradeoff between capacity and performance. */ if (ndata == UINT64_MAX) { if (children > nspares + nparity) { ndata = MIN(children - nspares - nparity, 8); } else { fprintf(stderr, gettext("requested number of " "distributed spares %llu and parity level %llu\n" "leaves no disks available for data\n"), (u_longlong_t)nspares, (u_longlong_t)nparity); return (EINVAL); } } /* Verify the maximum allowed group size is never exceeded. */ if (ndata == 0 || (ndata + nparity > children - nspares)) { fprintf(stderr, gettext("requested number of dRAID data " "disks per group %llu is too high,\nat most %llu disks " "are available for data\n"), (u_longlong_t)ndata, (u_longlong_t)(children - nspares - nparity)); return (EINVAL); } - if (nparity == 0 || nparity > VDEV_DRAID_MAXPARITY) { - fprintf(stderr, - gettext("invalid dRAID parity level %llu; must be " - "between 1 and %d\n"), (u_longlong_t)nparity, - VDEV_DRAID_MAXPARITY); - return (EINVAL); - } - /* * Verify the requested number of spares can be satisfied. * An arbitrary limit of 100 distributed spares is applied.
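* For example, 'draid2:2s' on 12 children is satisfiable: ndata defaults to * MIN(12 - 2 - 2, 8) = 8, leaving 12 - (8 + 2) = 2 disks for the requested * spares.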
*/ if (nspares > 100 || nspares > (children - (ndata + nparity))) { fprintf(stderr, gettext("invalid number of dRAID spares %llu; additional " "disks would be required\n"), (u_longlong_t)nspares); return (EINVAL); } /* Verify the requested number children is sufficient. */ if (children < (ndata + nparity + nspares)) { fprintf(stderr, gettext("%llu disks were provided, but at " "least %llu disks are required for this config\n"), (u_longlong_t)children, (u_longlong_t)(ndata + nparity + nspares)); } if (children > VDEV_DRAID_MAX_CHILDREN) { fprintf(stderr, gettext("%llu disks were provided, but " "dRAID only supports up to %u disks"), (u_longlong_t)children, VDEV_DRAID_MAX_CHILDREN); } /* * Calculate the minimum number of groups required to fill a slice. * This is the LCM of the stripe width (ndata + nparity) and the * number of data drives (children - nspares). */ while (ngroups * (ndata + nparity) % (children - nspares) != 0) ngroups++; /* Store the basic dRAID configuration. */ fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, nparity); fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NDATA, ndata); fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NSPARES, nspares); fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NGROUPS, ngroups); return (0); } /* * Construct a syntactically valid vdev specification, * and ensure that all devices and files exist and can be opened. * Note: we don't bother freeing anything in the error paths * because the program is just going to exit anyway. */ static nvlist_t * construct_spec(nvlist_t *props, int argc, char **argv) { nvlist_t *nvroot, *nv, **top, **spares, **l2cache; int t, toplevels, mindev, maxdev, nspares, nlogs, nl2cache; const char *type, *fulltype; boolean_t is_log, is_special, is_dedup, is_spare; boolean_t seen_logs; top = NULL; toplevels = 0; spares = NULL; l2cache = NULL; nspares = 0; nlogs = 0; nl2cache = 0; is_log = is_special = is_dedup = is_spare = B_FALSE; seen_logs = B_FALSE; nvroot = NULL; while (argc > 0) { fulltype = argv[0]; nv = NULL; /* * If it's a mirror, raidz, or draid the subsequent arguments * are its leaves -- until we encounter the next mirror, * raidz or draid. */ if ((type = is_grouping(fulltype, &mindev, &maxdev)) != NULL) { nvlist_t **child = NULL; int c, children = 0; if (strcmp(type, VDEV_TYPE_SPARE) == 0) { if (spares != NULL) { (void) fprintf(stderr, gettext("invalid vdev " "specification: 'spare' can be " "specified only once\n")); goto spec_out; } is_spare = B_TRUE; is_log = is_special = is_dedup = B_FALSE; } if (strcmp(type, VDEV_TYPE_LOG) == 0) { if (seen_logs) { (void) fprintf(stderr, gettext("invalid vdev " "specification: 'log' can be " "specified only once\n")); goto spec_out; } seen_logs = B_TRUE; is_log = B_TRUE; is_special = is_dedup = is_spare = B_FALSE; argc--; argv++; /* * A log is not a real grouping device. * We just set is_log and continue. 
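* For example, in 'zpool create tank sda log mirror sdb sdc' the 'log' * keyword only marks the mirror that follows it; 'sda' remains a normal * top-level vdev.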
*/ continue; } if (strcmp(type, VDEV_ALLOC_BIAS_SPECIAL) == 0) { is_special = B_TRUE; is_log = is_dedup = is_spare = B_FALSE; argc--; argv++; continue; } if (strcmp(type, VDEV_ALLOC_BIAS_DEDUP) == 0) { is_dedup = B_TRUE; is_log = is_special = is_spare = B_FALSE; argc--; argv++; continue; } if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) { if (l2cache != NULL) { (void) fprintf(stderr, gettext("invalid vdev " "specification: 'cache' can be " "specified only once\n")); goto spec_out; } is_log = is_special = B_FALSE; is_dedup = is_spare = B_FALSE; } if (is_log || is_special || is_dedup) { if (strcmp(type, VDEV_TYPE_MIRROR) != 0) { (void) fprintf(stderr, gettext("invalid vdev " "specification: unsupported '%s' " "device: %s\n"), is_log ? "log" : "special", type); goto spec_out; } nlogs++; } for (c = 1; c < argc; c++) { if (is_grouping(argv[c], NULL, NULL) != NULL) break; children++; child = realloc(child, children * sizeof (nvlist_t *)); if (child == NULL) zpool_no_memory(); if ((nv = make_leaf_vdev(props, argv[c], !(is_log || is_special || is_dedup || is_spare))) == NULL) { for (c = 0; c < children - 1; c++) nvlist_free(child[c]); free(child); goto spec_out; } child[children - 1] = nv; } if (children < mindev) { (void) fprintf(stderr, gettext("invalid vdev " "specification: %s requires at least %d " "devices\n"), argv[0], mindev); for (c = 0; c < children; c++) nvlist_free(child[c]); free(child); goto spec_out; } if (children > maxdev) { (void) fprintf(stderr, gettext("invalid vdev " "specification: %s supports no more than " "%d devices\n"), argv[0], maxdev); for (c = 0; c < children; c++) nvlist_free(child[c]); free(child); goto spec_out; } argc -= c; argv += c; if (strcmp(type, VDEV_TYPE_SPARE) == 0) { spares = child; nspares = children; continue; } else if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) { l2cache = child; nl2cache = children; continue; } else { /* create a top-level vdev with children */ verify(nvlist_alloc(&nv, NV_UNIQUE_NAME, 0) == 0); verify(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE, type) == 0); verify(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG, is_log) == 0); if (is_log) { verify(nvlist_add_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, VDEV_ALLOC_BIAS_LOG) == 0); } if (is_special) { verify(nvlist_add_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, VDEV_ALLOC_BIAS_SPECIAL) == 0); } if (is_dedup) { verify(nvlist_add_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, VDEV_ALLOC_BIAS_DEDUP) == 0); } if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) { verify(nvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, mindev - 1) == 0); } if (strcmp(type, VDEV_TYPE_DRAID) == 0) { if (draid_config_by_type(nv, fulltype, children) != 0) { for (c = 0; c < children; c++) nvlist_free(child[c]); free(child); goto spec_out; } } verify(nvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, (const nvlist_t **)child, children) == 0); for (c = 0; c < children; c++) nvlist_free(child[c]); free(child); } } else { /* * We have a device. Pass off to make_leaf_vdev() to * construct the appropriate nvlist describing the vdev. 
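* A bare argument such as 'sda' or '/tmp/vdev-file' that is not preceded by * a grouping keyword becomes its own top-level vdev here.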
*/ if ((nv = make_leaf_vdev(props, argv[0], !(is_log || is_special || is_dedup || is_spare))) == NULL) goto spec_out; verify(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG, is_log) == 0); if (is_log) { verify(nvlist_add_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, VDEV_ALLOC_BIAS_LOG) == 0); nlogs++; } if (is_special) { verify(nvlist_add_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, VDEV_ALLOC_BIAS_SPECIAL) == 0); } if (is_dedup) { verify(nvlist_add_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, VDEV_ALLOC_BIAS_DEDUP) == 0); } argc--; argv++; } toplevels++; top = realloc(top, toplevels * sizeof (nvlist_t *)); if (top == NULL) zpool_no_memory(); top[toplevels - 1] = nv; } if (toplevels == 0 && nspares == 0 && nl2cache == 0) { (void) fprintf(stderr, gettext("invalid vdev " "specification: at least one toplevel vdev must be " "specified\n")); goto spec_out; } if (seen_logs && nlogs == 0) { (void) fprintf(stderr, gettext("invalid vdev specification: " "log requires at least 1 device\n")); goto spec_out; } /* * Finally, create nvroot and add all top-level vdevs to it. */ verify(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) == 0); verify(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) == 0); verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, (const nvlist_t **)top, toplevels) == 0); if (nspares != 0) verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, (const nvlist_t **)spares, nspares) == 0); if (nl2cache != 0) verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, (const nvlist_t **)l2cache, nl2cache) == 0); spec_out: for (t = 0; t < toplevels; t++) nvlist_free(top[t]); for (t = 0; t < nspares; t++) nvlist_free(spares[t]); for (t = 0; t < nl2cache; t++) nvlist_free(l2cache[t]); free(spares); free(l2cache); free(top); return (nvroot); } nvlist_t * split_mirror_vdev(zpool_handle_t *zhp, char *newname, nvlist_t *props, splitflags_t flags, int argc, char **argv) { nvlist_t *newroot = NULL, **child; uint_t c, children; if (argc > 0) { if ((newroot = construct_spec(props, argc, argv)) == NULL) { (void) fprintf(stderr, gettext("Unable to build a " "pool from the specified devices\n")); return (NULL); } if (!flags.dryrun && make_disks(zhp, newroot) != 0) { nvlist_free(newroot); return (NULL); } /* avoid any tricks in the spec */ verify(nvlist_lookup_nvlist_array(newroot, ZPOOL_CONFIG_CHILDREN, &child, &children) == 0); for (c = 0; c < children; c++) { char *path; const char *type; int min, max; verify(nvlist_lookup_string(child[c], ZPOOL_CONFIG_PATH, &path) == 0); if ((type = is_grouping(path, &min, &max)) != NULL) { (void) fprintf(stderr, gettext("Cannot use " "'%s' as a device for splitting\n"), type); nvlist_free(newroot); return (NULL); } } } if (zpool_vdev_split(zhp, newname, &newroot, props, flags) != 0) { nvlist_free(newroot); return (NULL); } return (newroot); } static int num_normal_vdevs(nvlist_t *nvroot) { nvlist_t **top; uint_t t, toplevels, normal = 0; verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &top, &toplevels) == 0); for (t = 0; t < toplevels; t++) { uint64_t log = B_FALSE; (void) nvlist_lookup_uint64(top[t], ZPOOL_CONFIG_IS_LOG, &log); if (log) continue; if (nvlist_exists(top[t], ZPOOL_CONFIG_ALLOCATION_BIAS)) continue; normal++; } return (normal); } /* * Get and validate the contents of the given vdev specification. This ensures * that the nvlist returned is well-formed, that all the devices exist, and that * they are not currently in use by any other known consumer. 
The 'poolconfig' * parameter is the current configuration of the pool when adding devices to an * existing pool, and is used to perform additional checks, such as changing the * replication level of the pool. It can be 'NULL' to indicate that this is a * new pool. The 'force' flag controls whether devices should be forcefully * added, even if they appear in use. */ nvlist_t * make_root_vdev(zpool_handle_t *zhp, nvlist_t *props, int force, int check_rep, boolean_t replacing, boolean_t dryrun, int argc, char **argv) { nvlist_t *newroot; nvlist_t *poolconfig = NULL; is_force = force; /* * Construct the vdev specification. If this is successful, we know * that we have a valid specification, and that all devices can be * opened. */ if ((newroot = construct_spec(props, argc, argv)) == NULL) return (NULL); if (zhp && ((poolconfig = zpool_get_config(zhp, NULL)) == NULL)) { nvlist_free(newroot); return (NULL); } /* * Validate each device to make sure that it's not shared with another * subsystem. We do this even if 'force' is set, because there are some * uses (such as a dedicated dump device) that even '-f' cannot * override. */ if (is_device_in_use(poolconfig, newroot, force, replacing, B_FALSE)) { nvlist_free(newroot); return (NULL); } /* * Check the replication level of the given vdevs and report any errors * found. We include the existing pool spec, if any, as we need to * catch changes against the existing replication level. */ if (check_rep && check_replication(poolconfig, newroot) != 0) { nvlist_free(newroot); return (NULL); } /* * On pool create the new vdev spec must have one normal vdev. */ if (poolconfig == NULL && num_normal_vdevs(newroot) == 0) { vdev_error(gettext("at least one general top-level vdev must " "be specified\n")); nvlist_free(newroot); return (NULL); } /* * Run through the vdev specification and label any whole disks found. */ if (!dryrun && make_disks(zhp, newroot) != 0) { nvlist_free(newroot); return (NULL); } return (newroot); } diff --git a/module/zfs/dmu_redact.c b/module/zfs/dmu_redact.c index 7afcc1231348..6bd35713ff18 100644 --- a/module/zfs/dmu_redact.c +++ b/module/zfs/dmu_redact.c @@ -1,1202 +1,1200 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2017, 2018 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> #include <sys/txg.h> #include <sys/dmu_objset.h> #include <sys/dmu_traverse.h> #include <sys/dmu_redact.h> #include <sys/bqueue.h> #include <sys/objlist.h> #include <sys/dmu_tx.h> #ifdef _KERNEL #include <sys/zfs_vfsops.h> #include <sys/zap.h> #include <sys/zfs_znode.h> #endif /* * This controls the number of entries in the buffer the redaction_list_update * synctask uses to buffer writes to the redaction list. */ static const int redact_sync_bufsize = 1024; /* * Controls how often to update the redaction list when creating a redaction * list.
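* (Updating less often presumably means a resumed redaction must revisit * more of the dataset; updating more often costs more synctasks.)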
*/ static const uint64_t redaction_list_update_interval_ns = 1000 * 1000 * 1000ULL; /* 1s */ /* * This tunable controls the length of the queues that zfs redact worker threads * use to communicate. If the dmu_redact_snap thread is blocking on these * queues, this variable may need to be increased. If there is a significant * slowdown at the start of a redact operation as these threads consume all the * available IO resources, or the queues are consuming too much memory, this * variable may need to be decreased. */ static const int zfs_redact_queue_length = 1024 * 1024; /* * These tunables control the fill fraction of the queues by zfs redact. The * fill fraction controls the frequency with which threads have to be * cv_signaled. If a lot of cpu time is being spent on cv_signal, then these * should be tuned down. If the queues empty before the signalled thread can * catch up, then these should be tuned up. */ static const uint64_t zfs_redact_queue_ff = 20; struct redact_record { bqueue_node_t ln; boolean_t eos_marker; /* Marks the end of the stream */ uint64_t start_object; uint64_t start_blkid; uint64_t end_object; uint64_t end_blkid; uint8_t indblkshift; uint32_t datablksz; }; struct redact_thread_arg { bqueue_t q; objset_t *os; /* Objset to traverse */ dsl_dataset_t *ds; /* Dataset to traverse */ struct redact_record *current_record; int error_code; boolean_t cancel; zbookmark_phys_t resume; objlist_t *deleted_objs; uint64_t *num_blocks_visited; uint64_t ignore_object; /* ignore further callbacks on this */ uint64_t txg; /* txg to traverse since */ }; /* * The redaction node is a wrapper around the redaction record that is used * by the redaction merging thread to sort the records and determine overlaps. * * It contains two nodes; one sorts the records by their start_zb, and the other * sorts the records by their end_zb. */ struct redact_node { avl_node_t avl_node_start; avl_node_t avl_node_end; struct redact_record *record; struct redact_thread_arg *rt_arg; uint32_t thread_num; }; struct merge_data { list_t md_redact_block_pending; redact_block_phys_t md_coalesce_block; uint64_t md_last_time; redact_block_phys_t md_furthest[TXG_SIZE]; /* Lists of struct redact_block_list_node. */ list_t md_blocks[TXG_SIZE]; boolean_t md_synctask_txg[TXG_SIZE]; uint64_t md_latest_synctask_txg; redaction_list_t *md_redaction_list; }; /* * A wrapper around struct redact_block so it can be stored in a list_t. */ struct redact_block_list_node { redact_block_phys_t block; list_node_t node; }; /* * We've found a new redaction candidate. In order to improve performance, we * coalesce these blocks when they're adjacent to each other. This function * handles that. If the new candidate block range is immediately after the * range we're building, coalesce it into the range we're building. Otherwise, * put the record we're building on the queue, and update the build pointer to * point to the new record. 
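* For example, (object 5, blkids 10-20) followed by (object 5, blkids 21-30) * coalesces into (object 5, blkids 10-30); a record ending at blkid * UINT64_MAX also merges with one starting at blkid 0 of the next object.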
*/ static void record_merge_enqueue(bqueue_t *q, struct redact_record **build, struct redact_record *new) { if (new->eos_marker) { if (*build != NULL) bqueue_enqueue(q, *build, sizeof (**build)); bqueue_enqueue_flush(q, new, sizeof (*new)); return; } if (*build == NULL) { *build = new; return; } struct redact_record *curbuild = *build; if ((curbuild->end_object == new->start_object && curbuild->end_blkid + 1 == new->start_blkid && curbuild->end_blkid != UINT64_MAX) || (curbuild->end_object + 1 == new->start_object && curbuild->end_blkid == UINT64_MAX && new->start_blkid == 0)) { curbuild->end_object = new->end_object; curbuild->end_blkid = new->end_blkid; kmem_free(new, sizeof (*new)); } else { bqueue_enqueue(q, curbuild, sizeof (*curbuild)); *build = new; } } #ifdef _KERNEL struct objnode { avl_node_t node; uint64_t obj; }; static int objnode_compare(const void *o1, const void *o2) { const struct objnode *obj1 = o1; const struct objnode *obj2 = o2; if (obj1->obj < obj2->obj) return (-1); if (obj1->obj > obj2->obj) return (1); return (0); } static objlist_t * zfs_get_deleteq(objset_t *os) { objlist_t *deleteq_objlist = objlist_create(); uint64_t deleteq_obj; zap_cursor_t zc; zap_attribute_t za; dmu_object_info_t doi; ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS); VERIFY0(dmu_object_info(os, MASTER_NODE_OBJ, &doi)); ASSERT3U(doi.doi_type, ==, DMU_OT_MASTER_NODE); VERIFY0(zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, sizeof (uint64_t), 1, &deleteq_obj)); /* * In order to insert objects into the objlist, they must be in sorted * order. We don't know what order we'll get them out of the ZAP in, so * we insert them into and remove them from an avl_tree_t to sort them. */ avl_tree_t at; avl_create(&at, objnode_compare, sizeof (struct objnode), offsetof(struct objnode, node)); for (zap_cursor_init(&zc, os, deleteq_obj); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { struct objnode *obj = kmem_zalloc(sizeof (*obj), KM_SLEEP); obj->obj = za.za_first_integer; avl_add(&at, obj); } zap_cursor_fini(&zc); struct objnode *next, *found = avl_first(&at); while (found != NULL) { next = AVL_NEXT(&at, found); objlist_insert(deleteq_objlist, found->obj); found = next; } void *cookie = NULL; while ((found = avl_destroy_nodes(&at, &cookie)) != NULL) kmem_free(found, sizeof (*found)); avl_destroy(&at); return (deleteq_objlist); } #endif /* * This is the callback function to traverse_dataset for the redaction threads * for dmu_redact_snap. This thread is responsible for creating redaction * records for all the data that is modified by the snapshots we're redacting * with respect to. Redaction records represent ranges of data that have been * modified by one of the redaction snapshots, and are stored in the * redact_record struct. We need to create redaction records for three * cases: * * First, if there's a normal write, we need to create a redaction record for * that block. * * Second, if there's a hole, we need to create a redaction record that covers * the whole range of the hole. If the hole is in the meta-dnode, it must cover * every block in all of the objects in the hole. * * Third, if there is a deleted object, we need to create a redaction record for * all of the blocks in that object. 
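* A deleted object is represented compactly as a single record covering * start_blkid 0 through end_blkid UINT64_MAX.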
*/ static int redact_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_phys_t *zb, const struct dnode_phys *dnp, void *arg) { (void) spa, (void) zilog; struct redact_thread_arg *rta = arg; struct redact_record *record; ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT || zb->zb_object >= rta->resume.zb_object); if (rta->cancel) return (SET_ERROR(EINTR)); if (rta->ignore_object == zb->zb_object) return (0); /* * If we're visiting a dnode, we need to handle the case where the * object has been deleted. */ if (zb->zb_level == ZB_DNODE_LEVEL) { ASSERT3U(zb->zb_level, ==, ZB_DNODE_LEVEL); if (zb->zb_object == 0) return (0); /* * If the object has been deleted, redact all of the blocks in * it. */ if (dnp->dn_type == DMU_OT_NONE || objlist_exists(rta->deleted_objs, zb->zb_object)) { rta->ignore_object = zb->zb_object; record = kmem_zalloc(sizeof (struct redact_record), KM_SLEEP); record->eos_marker = B_FALSE; record->start_object = record->end_object = zb->zb_object; record->start_blkid = 0; record->end_blkid = UINT64_MAX; record_merge_enqueue(&rta->q, &rta->current_record, record); } return (0); } else if (zb->zb_level < 0) { return (0); } else if (zb->zb_level > 0 && !BP_IS_HOLE(bp)) { /* * If this is an indirect block, but not a hole, it doesn't * provide any useful information for redaction, so ignore it. */ return (0); } /* * At this point, there are two options left for the type of block we're * looking at. Either this is a hole (which could be in the dnode or * the meta-dnode), or it's a level 0 block of some sort. If it's a * hole, we create a redaction record that covers the whole range. If * the hole is in a dnode, we need to redact all the blocks in that * hole. If the hole is in the meta-dnode, we instead need to redact * all blocks in every object covered by that hole. If it's a level 0 * block, we only need to redact that single block. 
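* For a hole in the meta-dnode the block range is converted to an object * range below: each meta-dnode block of dn_datablkszsec * SPA_MINBLOCKSIZE * bytes covers (block size / sizeof (dnode_phys_t)) objects, e.g. 32 objects * for a 16K block (16384 / 512).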
*/ record = kmem_zalloc(sizeof (struct redact_record), KM_SLEEP); record->eos_marker = B_FALSE; record->start_object = record->end_object = zb->zb_object; if (BP_IS_HOLE(bp)) { record->start_blkid = zb->zb_blkid * bp_span_in_blocks(dnp->dn_indblkshift, zb->zb_level); record->end_blkid = ((zb->zb_blkid + 1) * bp_span_in_blocks(dnp->dn_indblkshift, zb->zb_level)) - 1; if (zb->zb_object == DMU_META_DNODE_OBJECT) { record->start_object = record->start_blkid * ((SPA_MINBLOCKSIZE * dnp->dn_datablkszsec) / sizeof (dnode_phys_t)); record->start_blkid = 0; record->end_object = ((record->end_blkid + 1) * ((SPA_MINBLOCKSIZE * dnp->dn_datablkszsec) / sizeof (dnode_phys_t))) - 1; record->end_blkid = UINT64_MAX; } } else if (zb->zb_level != 0 || zb->zb_object == DMU_META_DNODE_OBJECT) { kmem_free(record, sizeof (*record)); return (0); } else { record->start_blkid = record->end_blkid = zb->zb_blkid; } record->indblkshift = dnp->dn_indblkshift; record->datablksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT; record_merge_enqueue(&rta->q, &rta->current_record, record); return (0); } static __attribute__((noreturn)) void redact_traverse_thread(void *arg) { struct redact_thread_arg *rt_arg = arg; int err; struct redact_record *data; #ifdef _KERNEL if (rt_arg->os->os_phys->os_type == DMU_OST_ZFS) rt_arg->deleted_objs = zfs_get_deleteq(rt_arg->os); else rt_arg->deleted_objs = objlist_create(); #else rt_arg->deleted_objs = objlist_create(); #endif err = traverse_dataset_resume(rt_arg->ds, rt_arg->txg, &rt_arg->resume, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, redact_cb, rt_arg); if (err != EINTR) rt_arg->error_code = err; objlist_destroy(rt_arg->deleted_objs); data = kmem_zalloc(sizeof (*data), KM_SLEEP); data->eos_marker = B_TRUE; record_merge_enqueue(&rt_arg->q, &rt_arg->current_record, data); thread_exit(); } static inline void create_zbookmark_from_obj_off(zbookmark_phys_t *zb, uint64_t object, uint64_t blkid) { zb->zb_object = object; zb->zb_level = 0; zb->zb_blkid = blkid; } /* * This is a utility function that can do the comparison for the start or ends * of the ranges in a redact_record. */ static int redact_range_compare(uint64_t obj1, uint64_t off1, uint32_t dbss1, uint64_t obj2, uint64_t off2, uint32_t dbss2) { zbookmark_phys_t z1, z2; create_zbookmark_from_obj_off(&z1, obj1, off1); create_zbookmark_from_obj_off(&z2, obj2, off2); return (zbookmark_compare(dbss1 >> SPA_MINBLOCKSHIFT, 0, dbss2 >> SPA_MINBLOCKSHIFT, 0, &z1, &z2)); } /* * Compare two redaction records by their range's start location. Also makes * eos records always compare last. We use the thread number in the redact_node * to ensure that records do not compare equal (which is not allowed in our avl * trees). */ static int redact_node_compare_start(const void *arg1, const void *arg2) { const struct redact_node *rn1 = arg1; const struct redact_node *rn2 = arg2; const struct redact_record *rr1 = rn1->record; const struct redact_record *rr2 = rn2->record; if (rr1->eos_marker) return (1); if (rr2->eos_marker) return (-1); int cmp = redact_range_compare(rr1->start_object, rr1->start_blkid, rr1->datablksz, rr2->start_object, rr2->start_blkid, rr2->datablksz); if (cmp == 0) cmp = (rn1->thread_num < rn2->thread_num ? -1 : 1); return (cmp); } /* * Compare two redaction records by their range's end location. Also makes * eos records always compare last. We use the thread number in the redact_node * to ensure that records do not compare equal (which is not allowed in our avl * trees). 
*/ static int redact_node_compare_end(const void *arg1, const void *arg2) { const struct redact_node *rn1 = arg1; const struct redact_node *rn2 = arg2; const struct redact_record *srr1 = rn1->record; const struct redact_record *srr2 = rn2->record; if (srr1->eos_marker) return (1); if (srr2->eos_marker) return (-1); int cmp = redact_range_compare(srr1->end_object, srr1->end_blkid, srr1->datablksz, srr2->end_object, srr2->end_blkid, srr2->datablksz); if (cmp == 0) cmp = (rn1->thread_num < rn2->thread_num ? -1 : 1); return (cmp); } /* * Utility function that compares two redaction records to determine if any part * of the "from" record is before any part of the "to" record. Also causes End * of Stream redaction records to compare after all others, so that the * redaction merging logic can stay simple. */ static boolean_t redact_record_before(const struct redact_record *from, const struct redact_record *to) { if (from->eos_marker == B_TRUE) return (B_FALSE); else if (to->eos_marker == B_TRUE) return (B_TRUE); return (redact_range_compare(from->start_object, from->start_blkid, from->datablksz, to->end_object, to->end_blkid, to->datablksz) <= 0); } /* * Pop a new redaction record off the queue, check that the records are in the * right order, and free the old data. */ static struct redact_record * get_next_redact_record(bqueue_t *bq, struct redact_record *prev) { struct redact_record *next = bqueue_dequeue(bq); ASSERT(redact_record_before(prev, next)); kmem_free(prev, sizeof (*prev)); return (next); } /* * Remove the given redaction node from both trees, pull a new redaction record * off the queue, free the old redaction record, update the redaction node, and * reinsert the node into the trees. */ static int update_avl_trees(avl_tree_t *start_tree, avl_tree_t *end_tree, struct redact_node *redact_node) { avl_remove(start_tree, redact_node); avl_remove(end_tree, redact_node); redact_node->record = get_next_redact_record(&redact_node->rt_arg->q, redact_node->record); avl_add(end_tree, redact_node); avl_add(start_tree, redact_node); return (redact_node->rt_arg->error_code); } /* * Synctask for updating redaction lists. We first take this txg's list of * redacted blocks and append those to the redaction list. We then update the * redaction list's bonus buffer. We store the furthest blocks we visited and * the list of snapshots that we're redacting with respect to. We need these so * that redacted sends and receives can be correctly resumed. 
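 *
 * Illustrative sketch (not part of this patch): the buffered-append
 * pattern the synctask uses, with hypothetical ex_* helpers standing in
 * for the list and dmu_write() machinery:
 */
#if 0	/* exposition only; all ex_* names are made up */
#define	EX_BUFSZ	64
static void
ex_append_all(ex_list_t *src, ex_store_t *store)
{
    uint64_t buf[EX_BUFSZ];
    int n = 0;
    uint64_t val;

    while (ex_list_pop(src, &val)) {
        buf[n++] = val;
        if (n == EX_BUFSZ) {    /* flush a full chunk */
            ex_store_append(store, buf, n);
            n = 0;
        }
    }
    if (n > 0)                  /* flush the partial tail */
        ex_store_append(store, buf, n);
}
#endif
/*
 * The synctask itself: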
 */
static void
redaction_list_update_sync(void *arg, dmu_tx_t *tx)
{
    struct merge_data *md = arg;
    uint64_t txg = dmu_tx_get_txg(tx);
    list_t *list = &md->md_blocks[txg & TXG_MASK];
    redact_block_phys_t *furthest_visited =
        &md->md_furthest[txg & TXG_MASK];
    objset_t *mos = tx->tx_pool->dp_meta_objset;
    redaction_list_t *rl = md->md_redaction_list;
    int bufsize = redact_sync_bufsize;
    redact_block_phys_t *buf = kmem_alloc(bufsize * sizeof (*buf),
        KM_SLEEP);
    int index = 0;

    dmu_buf_will_dirty(rl->rl_dbuf, tx);

    for (struct redact_block_list_node *rbln = list_remove_head(list);
        rbln != NULL; rbln = list_remove_head(list)) {
        ASSERT3U(rbln->block.rbp_object, <=,
            furthest_visited->rbp_object);
        ASSERT(rbln->block.rbp_object < furthest_visited->rbp_object ||
            rbln->block.rbp_blkid <= furthest_visited->rbp_blkid);
        buf[index] = rbln->block;
        index++;
        if (index == bufsize) {
            dmu_write(mos, rl->rl_object,
                rl->rl_phys->rlp_num_entries * sizeof (*buf),
                bufsize * sizeof (*buf), buf, tx);
            rl->rl_phys->rlp_num_entries += bufsize;
            index = 0;
        }
        kmem_free(rbln, sizeof (*rbln));
    }
    if (index > 0) {
        dmu_write(mos, rl->rl_object,
            rl->rl_phys->rlp_num_entries * sizeof (*buf),
            index * sizeof (*buf), buf, tx);
        rl->rl_phys->rlp_num_entries += index;
    }
    kmem_free(buf, bufsize * sizeof (*buf));

    md->md_synctask_txg[txg & TXG_MASK] = B_FALSE;
    rl->rl_phys->rlp_last_object = furthest_visited->rbp_object;
    rl->rl_phys->rlp_last_blkid = furthest_visited->rbp_blkid;
}

static void
commit_rl_updates(objset_t *os, struct merge_data *md, uint64_t object,
    uint64_t blkid)
{
    dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(os->os_spa)->dp_mos_dir);
    dmu_tx_hold_space(tx, sizeof (struct redact_block_list_node));
    VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
    uint64_t txg = dmu_tx_get_txg(tx);
    if (!md->md_synctask_txg[txg & TXG_MASK]) {
        dsl_sync_task_nowait(dmu_tx_pool(tx),
            redaction_list_update_sync, md, tx);
        md->md_synctask_txg[txg & TXG_MASK] = B_TRUE;
        md->md_latest_synctask_txg = txg;
    }
    md->md_furthest[txg & TXG_MASK].rbp_object = object;
    md->md_furthest[txg & TXG_MASK].rbp_blkid = blkid;
    list_move_tail(&md->md_blocks[txg & TXG_MASK],
        &md->md_redact_block_pending);
    dmu_tx_commit(tx);
    md->md_last_time = gethrtime();
}

/*
 * We want to store the list of blocks that we're redacting in the bookmark's
 * redaction list.  However, this list is stored in the MOS, which means it
 * can only be written to in syncing context.  To get around this, we create
 * a synctask that will write to the MOS for us.  We tell it what to write by
 * keeping a linked list for each transaction group: every time we decide to
 * redact a block, we append it to the list for the transaction group that is
 * currently in open context.  We also update some progress information that
 * the synctask will store to enable resumable redacted sends.
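 *
 * Illustrative sketch (not part of this patch): the run-coalescing that
 * update_redaction_list() performs, reduced to a hypothetical helper.
 * A run of adjacent blkids in one object is extended in place; anything
 * else forces the current run to be emitted and a new one started.
 */
#if 0	/* exposition only; ex_run and ex_extend are made-up names */
struct ex_run {
    uint64_t obj;
    uint64_t blkid;
    uint64_t count;     /* 0 means "no run yet" */
};
static boolean_t
ex_extend(struct ex_run *run, uint64_t obj, uint64_t blkid, uint64_t max)
{
    if (run->count != 0 && run->obj == obj &&
        run->blkid + run->count == blkid && run->count < max) {
        run->count++;
        return (B_TRUE);    /* absorbed into current run */
    }
    return (B_FALSE);       /* caller emits *run, then starts anew */
}
#endif
/*
 * The real thing, operating on redact_block_phys_t: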
 */
static void
update_redaction_list(struct merge_data *md, objset_t *os, uint64_t object,
    uint64_t blkid, uint64_t endblkid, uint32_t blksz)
{
    boolean_t enqueue = B_FALSE;
    redact_block_phys_t cur = {0};
    uint64_t count = endblkid - blkid + 1;
    while (count > REDACT_BLOCK_MAX_COUNT) {
        update_redaction_list(md, os, object, blkid,
            blkid + REDACT_BLOCK_MAX_COUNT - 1, blksz);
        blkid += REDACT_BLOCK_MAX_COUNT;
        count -= REDACT_BLOCK_MAX_COUNT;
    }
    redact_block_phys_t *coalesce = &md->md_coalesce_block;
    boolean_t new;
    if (coalesce->rbp_size_count == 0) {
        new = B_TRUE;
        enqueue = B_FALSE;
    } else {
        uint64_t old_count = redact_block_get_count(coalesce);
        if (coalesce->rbp_object == object &&
            coalesce->rbp_blkid + old_count == blkid &&
            old_count + count <= REDACT_BLOCK_MAX_COUNT) {
            ASSERT3U(redact_block_get_size(coalesce), ==, blksz);
            redact_block_set_count(coalesce, old_count + count);
            new = B_FALSE;
            enqueue = B_FALSE;
        } else {
            new = B_TRUE;
            enqueue = B_TRUE;
        }
    }

    if (new) {
        cur = *coalesce;
        coalesce->rbp_blkid = blkid;
        coalesce->rbp_object = object;
        redact_block_set_count(coalesce, count);
        redact_block_set_size(coalesce, blksz);
    }

    if (enqueue && redact_block_get_size(&cur) != 0) {
        struct redact_block_list_node *rbln =
            kmem_alloc(sizeof (struct redact_block_list_node),
            KM_SLEEP);
        rbln->block = cur;
        list_insert_tail(&md->md_redact_block_pending, rbln);
    }

    if (gethrtime() >
        md->md_last_time + redaction_list_update_interval_ns) {
        commit_rl_updates(os, md, object, blkid);
    }
}

/*
 * This thread merges all the redaction records provided by the worker
 * threads, and determines which blocks are redacted by all the snapshots.
 * The algorithm for doing so is similar to performing a merge in mergesort
 * with n sub-lists instead of 2, with some added complexity because the
 * entries are ranges, not just single blocks.  This algorithm relies on the
 * queues being sorted, which is ensured by the fact that traverse_dataset
 * traverses the dataset in a consistent order.  We pull one entry off the
 * front of the queue of each dataset traversal thread.  Then we repeat the
 * following: each record represents a range of blocks modified by one of the
 * redaction snapshots, and each block in that range may need to be redacted
 * in the send stream.  Find the record with the latest start of its range,
 * and the record with the earliest end of its range.  If the last start is
 * before the first end, then we know that the blocks in the range
 * [last_start, first_end] are covered by all of the ranges at the front of
 * the queues, which means every thread redacts that whole range.  For
 * example, let's say the ranges on each queue look like this:
 *
 * Block Id    1  2  3  4  5  6  7  8  9 10 11
 * Thread 1 |    [====================]
 * Thread 2 |       [========]
 * Thread 3 |             [=================]
 *
 * Thread 3 has the last start (5), and Thread 2 has the first end (6).  All
 * three threads modified the range [5,6], so that data should not be sent
 * over the wire.  After we've determined whether or not to redact anything,
 * we take the record with the first end.  We discard that record, and pull a
 * new one off the front of the queue it came from.  In the above example, we
 * would discard Thread 2's record, and pull a new one.  Let's say the next
 * record we pulled from Thread 2 covered range [10,11].
 * The new layout would look like this:
 *
 * Block Id    1  2  3  4  5  6  7  8  9 10 11
 * Thread 1 |    [====================]
 * Thread 2 |                            [==]
 * Thread 3 |             [=================]
 *
 * When we compare the last start (10, from Thread 2) and the first end (9,
 * from Thread 1), we see that the last start is greater than the first end.
 * Therefore, we do not redact anything from these records.  We'll iterate by
 * replacing the record from Thread 1.
 *
 * We iterate by replacing the record with the lowest end because we know
 * that the record with the lowest end has helped us as much as it can.  All
 * the ranges before it that we will ever redact have been redacted.  In
 * addition, by replacing the one with the lowest end, we guarantee we catch
 * all ranges that need to be redacted.  For example, if in the case above we
 * had replaced the record from Thread 1 instead, we might have ended up with
 * the following:
 *
 * Block Id    1  2  3  4  5  6  7  8  9 10 11 12
 * Thread 1 |                                [==]
 * Thread 2 |       [========]
 * Thread 3 |             [=================]
 *
 * If the next record from Thread 2 had been [8,10], for example, we should
 * have redacted part of that range, but because we updated Thread 1's
 * record, we missed it.
 *
 * We implement this algorithm by using two trees.  The first sorts the
 * redaction records by their start_zb, and the second sorts them by their
 * end_zb.  We use these to find the record with the last start and the
 * record with the first end.  We create a record with that start and end,
 * and send it on.  The overall runtime of this implementation is
 * O(n log m), where n is the total number of redaction records from all the
 * different redaction snapshots, and m is the number of redaction snapshots.
 *
 * If we redact with respect to zero snapshots, we create a single redaction
 * record with the start object and blkid set to 0 and the end object and
 * blkid set to UINT64_MAX; this results in us redacting every block.  (The
 * one exception is object 0, the meta-dnode, which can never be redacted,
 * so the record actually starts at object 1.)
 */
static int
perform_thread_merge(bqueue_t *q, uint32_t num_threads,
    struct redact_thread_arg *thread_args, boolean_t *cancel)
{
    struct redact_node *redact_nodes = NULL;
    avl_tree_t start_tree, end_tree;
    struct redact_record *record;
    struct redact_record *current_record = NULL;
    int err = 0;
    struct merge_data md = { {0} };
    list_create(&md.md_redact_block_pending,
        sizeof (struct redact_block_list_node),
        offsetof(struct redact_block_list_node, node));

    /*
     * If we're redacting with respect to zero snapshots, then no data is
     * permitted to be sent.  We enqueue a record that redacts all blocks,
     * and an eos marker.
     */
    if (num_threads == 0) {
        record = kmem_zalloc(sizeof (struct redact_record),
            KM_SLEEP);
        /* We can't redact object 0, so don't try. */
record->start_object = 1; record->start_blkid = 0; record->end_object = record->end_blkid = UINT64_MAX; bqueue_enqueue(q, record, sizeof (*record)); return (0); } - if (num_threads > 0) { - redact_nodes = kmem_zalloc(num_threads * - sizeof (*redact_nodes), KM_SLEEP); - } + redact_nodes = kmem_zalloc(num_threads * + sizeof (*redact_nodes), KM_SLEEP); avl_create(&start_tree, redact_node_compare_start, sizeof (struct redact_node), offsetof(struct redact_node, avl_node_start)); avl_create(&end_tree, redact_node_compare_end, sizeof (struct redact_node), offsetof(struct redact_node, avl_node_end)); for (int i = 0; i < num_threads; i++) { struct redact_node *node = &redact_nodes[i]; struct redact_thread_arg *targ = &thread_args[i]; node->record = bqueue_dequeue(&targ->q); node->rt_arg = targ; node->thread_num = i; avl_add(&start_tree, node); avl_add(&end_tree, node); } /* * Once the first record in the end tree has returned EOS, every record * must be an EOS record, so we should stop. */ while (err == 0 && !((struct redact_node *)avl_first(&end_tree))-> record->eos_marker) { if (*cancel) { err = EINTR; break; } struct redact_node *last_start = avl_last(&start_tree); struct redact_node *first_end = avl_first(&end_tree); /* * If the last start record is before the first end record, * then we have blocks that are redacted by all threads. * Therefore, we should redact them. Copy the record, and send * it to the main thread. */ if (redact_record_before(last_start->record, first_end->record)) { record = kmem_zalloc(sizeof (struct redact_record), KM_SLEEP); *record = *first_end->record; record->start_object = last_start->record->start_object; record->start_blkid = last_start->record->start_blkid; record_merge_enqueue(q, ¤t_record, record); } err = update_avl_trees(&start_tree, &end_tree, first_end); } /* * We're done; if we were cancelled, we need to cancel our workers and * clear out their queues. Either way, we need to remove every thread's * redact_node struct from the avl trees. */ for (int i = 0; i < num_threads; i++) { if (err != 0) { thread_args[i].cancel = B_TRUE; while (!redact_nodes[i].record->eos_marker) { (void) update_avl_trees(&start_tree, &end_tree, &redact_nodes[i]); } } avl_remove(&start_tree, &redact_nodes[i]); avl_remove(&end_tree, &redact_nodes[i]); kmem_free(redact_nodes[i].record, sizeof (struct redact_record)); bqueue_destroy(&thread_args[i].q); } avl_destroy(&start_tree); avl_destroy(&end_tree); kmem_free(redact_nodes, num_threads * sizeof (*redact_nodes)); if (current_record != NULL) bqueue_enqueue(q, current_record, sizeof (*current_record)); return (err); } struct redact_merge_thread_arg { bqueue_t q; spa_t *spa; int numsnaps; struct redact_thread_arg *thr_args; boolean_t cancel; int error_code; }; static __attribute__((noreturn)) void redact_merge_thread(void *arg) { struct redact_merge_thread_arg *rmta = arg; rmta->error_code = perform_thread_merge(&rmta->q, rmta->numsnaps, rmta->thr_args, &rmta->cancel); struct redact_record *rec = kmem_zalloc(sizeof (*rec), KM_SLEEP); rec->eos_marker = B_TRUE; bqueue_enqueue_flush(&rmta->q, rec, 1); thread_exit(); } /* * Find the next object in or after the redaction range passed in, and hold * its dnode with the provided tag. Also update *object to contain the new * object number. 
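 *
 * Illustrative sketch (not part of this patch): the skip-scan shape of
 * the loop below, for a hypothetical object iterator.  We advance until
 * we find an object that is both at or past the range start and not
 * metadata:
 */
#if 0	/* exposition only; the ex_* names are made up */
static int
ex_next_in_range(ex_iter_t *it, uint64_t start, uint64_t *obj)
{
    int err;

    if (*obj < start)
        *obj = start - 1;   /* ex_iter_next() returns > *obj */
    while ((err = ex_iter_next(it, obj)) == 0) {
        if (*obj >= start && !ex_is_metadata(it, *obj))
            return (0);     /* caller holds and uses *obj */
    }
    return (err);           /* e.g. ESRCH: no more objects */
}
#endif
/*
 * The real helper also re-holds the dnode with the provided tag: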
*/ static int hold_next_object(objset_t *os, struct redact_record *rec, const void *tag, uint64_t *object, dnode_t **dn) { int err = 0; if (*dn != NULL) dnode_rele(*dn, tag); *dn = NULL; if (*object < rec->start_object) { *object = rec->start_object - 1; } err = dmu_object_next(os, object, B_FALSE, 0); if (err != 0) return (err); err = dnode_hold(os, *object, tag, dn); while (err == 0 && (*object < rec->start_object || DMU_OT_IS_METADATA((*dn)->dn_type))) { dnode_rele(*dn, tag); *dn = NULL; err = dmu_object_next(os, object, B_FALSE, 0); if (err != 0) break; err = dnode_hold(os, *object, tag, dn); } return (err); } static int perform_redaction(objset_t *os, redaction_list_t *rl, struct redact_merge_thread_arg *rmta) { int err = 0; bqueue_t *q = &rmta->q; struct redact_record *rec = NULL; struct merge_data md = { {0} }; list_create(&md.md_redact_block_pending, sizeof (struct redact_block_list_node), offsetof(struct redact_block_list_node, node)); md.md_redaction_list = rl; for (int i = 0; i < TXG_SIZE; i++) { list_create(&md.md_blocks[i], sizeof (struct redact_block_list_node), offsetof(struct redact_block_list_node, node)); } dnode_t *dn = NULL; uint64_t prev_obj = 0; for (rec = bqueue_dequeue(q); !rec->eos_marker && err == 0; rec = get_next_redact_record(q, rec)) { ASSERT3U(rec->start_object, !=, 0); uint64_t object; if (prev_obj != rec->start_object) { object = rec->start_object - 1; err = hold_next_object(os, rec, FTAG, &object, &dn); } else { object = prev_obj; } while (err == 0 && object <= rec->end_object) { if (issig(JUSTLOOKING) && issig(FORREAL)) { err = EINTR; break; } /* * Part of the current object is contained somewhere in * the range covered by rec. */ uint64_t startblkid; uint64_t endblkid; uint64_t maxblkid = dn->dn_phys->dn_maxblkid; if (rec->start_object < object) startblkid = 0; else if (rec->start_blkid > maxblkid) break; else startblkid = rec->start_blkid; if (rec->end_object > object || rec->end_blkid > maxblkid) { endblkid = maxblkid; } else { endblkid = rec->end_blkid; } update_redaction_list(&md, os, object, startblkid, endblkid, dn->dn_datablksz); if (object == rec->end_object) break; err = hold_next_object(os, rec, FTAG, &object, &dn); } if (err == ESRCH) err = 0; if (dn != NULL) prev_obj = object; } if (err == 0 && dn != NULL) dnode_rele(dn, FTAG); if (err == ESRCH) err = 0; rmta->cancel = B_TRUE; while (!rec->eos_marker) rec = get_next_redact_record(q, rec); kmem_free(rec, sizeof (*rec)); /* * There may be a block that's being coalesced, sync that out before we * return. */ if (err == 0 && md.md_coalesce_block.rbp_size_count != 0) { struct redact_block_list_node *rbln = kmem_alloc(sizeof (struct redact_block_list_node), KM_SLEEP); rbln->block = md.md_coalesce_block; list_insert_tail(&md.md_redact_block_pending, rbln); } commit_rl_updates(os, &md, UINT64_MAX, UINT64_MAX); /* * Wait for all the redaction info to sync out before we return, so that * anyone who attempts to resume this redaction will have all the data * they need. 
*/ dsl_pool_t *dp = spa_get_dsl(os->os_spa); if (md.md_latest_synctask_txg != 0) txg_wait_synced(dp, md.md_latest_synctask_txg); for (int i = 0; i < TXG_SIZE; i++) list_destroy(&md.md_blocks[i]); return (err); } static boolean_t redact_snaps_contains(uint64_t *snaps, uint64_t num_snaps, uint64_t guid) { for (int i = 0; i < num_snaps; i++) { if (snaps[i] == guid) return (B_TRUE); } return (B_FALSE); } int dmu_redact_snap(const char *snapname, nvlist_t *redactnvl, const char *redactbook) { int err = 0; dsl_pool_t *dp = NULL; dsl_dataset_t *ds = NULL; int numsnaps = 0; objset_t *os; struct redact_thread_arg *args = NULL; redaction_list_t *new_rl = NULL; char *newredactbook; if ((err = dsl_pool_hold(snapname, FTAG, &dp)) != 0) return (err); newredactbook = kmem_zalloc(sizeof (char) * ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); if ((err = dsl_dataset_hold_flags(dp, snapname, DS_HOLD_FLAG_DECRYPT, FTAG, &ds)) != 0) { goto out; } dsl_dataset_long_hold(ds, FTAG); if (!ds->ds_is_snapshot || dmu_objset_from_ds(ds, &os) != 0) { err = EINVAL; goto out; } if (dsl_dataset_feature_is_active(ds, SPA_FEATURE_REDACTED_DATASETS)) { err = EALREADY; goto out; } numsnaps = fnvlist_num_pairs(redactnvl); if (numsnaps > 0) args = kmem_zalloc(numsnaps * sizeof (*args), KM_SLEEP); nvpair_t *pair = NULL; for (int i = 0; i < numsnaps; i++) { pair = nvlist_next_nvpair(redactnvl, pair); const char *name = nvpair_name(pair); struct redact_thread_arg *rta = &args[i]; err = dsl_dataset_hold_flags(dp, name, DS_HOLD_FLAG_DECRYPT, FTAG, &rta->ds); if (err != 0) break; /* * We want to do the long hold before we can get any other * errors, because the cleanup code will release the long * hold if rta->ds is filled in. */ dsl_dataset_long_hold(rta->ds, FTAG); err = dmu_objset_from_ds(rta->ds, &rta->os); if (err != 0) break; if (!dsl_dataset_is_before(rta->ds, ds, 0)) { err = EINVAL; break; } if (dsl_dataset_feature_is_active(rta->ds, SPA_FEATURE_REDACTED_DATASETS)) { err = EALREADY; break; } } if (err != 0) goto out; VERIFY3P(nvlist_next_nvpair(redactnvl, pair), ==, NULL); boolean_t resuming = B_FALSE; zfs_bookmark_phys_t bookmark; (void) strlcpy(newredactbook, snapname, ZFS_MAX_DATASET_NAME_LEN); char *c = strchr(newredactbook, '@'); ASSERT3P(c, !=, NULL); int n = snprintf(c, ZFS_MAX_DATASET_NAME_LEN - (c - newredactbook), "#%s", redactbook); if (n >= ZFS_MAX_DATASET_NAME_LEN - (c - newredactbook)) { dsl_pool_rele(dp, FTAG); kmem_free(newredactbook, sizeof (char) * ZFS_MAX_DATASET_NAME_LEN); if (args != NULL) kmem_free(args, numsnaps * sizeof (*args)); return (SET_ERROR(ENAMETOOLONG)); } err = dsl_bookmark_lookup(dp, newredactbook, NULL, &bookmark); if (err == 0) { resuming = B_TRUE; if (bookmark.zbm_redaction_obj == 0) { err = EEXIST; goto out; } err = dsl_redaction_list_hold_obj(dp, bookmark.zbm_redaction_obj, FTAG, &new_rl); if (err != 0) { err = EIO; goto out; } dsl_redaction_list_long_hold(dp, new_rl, FTAG); if (new_rl->rl_phys->rlp_num_snaps != numsnaps) { err = ESRCH; goto out; } for (int i = 0; i < numsnaps; i++) { struct redact_thread_arg *rta = &args[i]; if (!redact_snaps_contains(new_rl->rl_phys->rlp_snaps, new_rl->rl_phys->rlp_num_snaps, dsl_dataset_phys(rta->ds)->ds_guid)) { err = ESRCH; goto out; } } if (new_rl->rl_phys->rlp_last_blkid == UINT64_MAX && new_rl->rl_phys->rlp_last_object == UINT64_MAX) { err = EEXIST; goto out; } dsl_pool_rele(dp, FTAG); dp = NULL; } else { uint64_t *guids = NULL; if (numsnaps > 0) { guids = kmem_zalloc(numsnaps * sizeof (uint64_t), KM_SLEEP); } for (int i = 0; i < numsnaps; i++) { struct 
redact_thread_arg *rta = &args[i]; guids[i] = dsl_dataset_phys(rta->ds)->ds_guid; } dsl_pool_rele(dp, FTAG); dp = NULL; err = dsl_bookmark_create_redacted(newredactbook, snapname, numsnaps, guids, FTAG, &new_rl); kmem_free(guids, numsnaps * sizeof (uint64_t)); if (err != 0) { goto out; } } for (int i = 0; i < numsnaps; i++) { struct redact_thread_arg *rta = &args[i]; (void) bqueue_init(&rta->q, zfs_redact_queue_ff, zfs_redact_queue_length, offsetof(struct redact_record, ln)); if (resuming) { rta->resume.zb_blkid = new_rl->rl_phys->rlp_last_blkid; rta->resume.zb_object = new_rl->rl_phys->rlp_last_object; } rta->txg = dsl_dataset_phys(ds)->ds_creation_txg; (void) thread_create(NULL, 0, redact_traverse_thread, rta, 0, curproc, TS_RUN, minclsyspri); } struct redact_merge_thread_arg *rmta; rmta = kmem_zalloc(sizeof (struct redact_merge_thread_arg), KM_SLEEP); (void) bqueue_init(&rmta->q, zfs_redact_queue_ff, zfs_redact_queue_length, offsetof(struct redact_record, ln)); rmta->numsnaps = numsnaps; rmta->spa = os->os_spa; rmta->thr_args = args; (void) thread_create(NULL, 0, redact_merge_thread, rmta, 0, curproc, TS_RUN, minclsyspri); err = perform_redaction(os, new_rl, rmta); bqueue_destroy(&rmta->q); kmem_free(rmta, sizeof (struct redact_merge_thread_arg)); out: kmem_free(newredactbook, sizeof (char) * ZFS_MAX_DATASET_NAME_LEN); if (new_rl != NULL) { dsl_redaction_list_long_rele(new_rl, FTAG); dsl_redaction_list_rele(new_rl, FTAG); } for (int i = 0; i < numsnaps; i++) { struct redact_thread_arg *rta = &args[i]; /* * rta->ds may be NULL if we got an error while filling * it in. */ if (rta->ds != NULL) { dsl_dataset_long_rele(rta->ds, FTAG); dsl_dataset_rele_flags(rta->ds, DS_HOLD_FLAG_DECRYPT, FTAG); } } if (args != NULL) kmem_free(args, numsnaps * sizeof (*args)); if (dp != NULL) dsl_pool_rele(dp, FTAG); if (ds != NULL) { dsl_dataset_long_rele(ds, FTAG); dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); } return (SET_ERROR(err)); } diff --git a/module/zfs/sa.c b/module/zfs/sa.c index 763b0c920f11..f9daaabbed3e 100644 --- a/module/zfs/sa.c +++ b/module/zfs/sa.c @@ -1,2258 +1,2258 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013, 2017 by Delphix. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef _KERNEL #include #endif /* * ZFS System attributes: * * A generic mechanism to allow for arbitrary attributes * to be stored in a dnode. 
 * The data will be stored in the bonus buffer of the dnode and, if
 * necessary, a special "spill" block will be used to handle overflow
 * situations.  The spill block will be sized to fit the data, from 512B to
 * 128K.  When a spill block is used the BP (blkptr_t) for the spill block is
 * stored at the end of the current bonus buffer.  Any attributes that would
 * be in the way of the blkptr_t will be relocated into the spill block.
 *
 * Attribute registration:
 *
 * Stored persistently on a per-dataset basis is a mapping between attribute
 * "string" names and their actual attribute numeric values, lengths, and
 * byteswap functions.  The names are only used during registration.  All
 * attributes are known by their unique attribute id value.  If an attribute
 * can have a variable size then the value 0 will be used to indicate this.
 *
 * Attribute Layout:
 *
 * Attribute layouts are a way to compactly store multiple attributes, but
 * without taking the overhead associated with managing each attribute
 * individually.  Since you will typically have the same set of attributes
 * stored in the same order, a single table will be used to represent that
 * layout.  The ZPL, for example, will usually have only about 10 different
 * layouts (regular files, device files, symlinks, regular files +
 * scanstamp, files/dirs with extended attributes, and then you have the
 * possibility of all of those minus ACL, because it would be kicked out
 * into the spill block).
 *
 * Layouts are simply an array of the attributes and their
 * ordering, e.g. [0, 1, 4, 5, 2].
 *
 * Each distinct layout is given a unique layout number and that is what's
 * stored in the header at the beginning of the SA data buffer.
 *
 * A layout only covers a single dbuf (bonus or spill).  If a set of
 * attributes is split up between the bonus buffer and a spill buffer then
 * two different layouts will be used.  This allows us to byteswap the spill
 * without looking at the bonus buffer and keeps the on-disk format of the
 * bonus and spill buffer the same.
 *
 * Adding a single attribute will cause the entire set of attributes to be
 * rewritten and could result in a new layout number being constructed as
 * part of the rewrite if no such layout exists for the new set of
 * attributes.  The new attribute will be appended to the end of the already
 * existing attributes.
 *
 * Both the attribute registration and attribute layout information are
 * stored in normal ZAP attributes.  There should be a small number of known
 * layouts, and the set of attributes is assumed to typically be quite
 * small.
 *
 * The registered attributes and layout "table" information is maintained in
 * core and a special "sa_os_t" is attached to the objset_t.
 *
 * A special interface is provided to allow for quickly applying a large set
 * of attributes at once.  sa_replace_all_by_template() is used to set an
 * array of attributes.  This is used by the ZPL when creating a brand new
 * file.  The template that is passed into the function specifies the
 * attribute, size for variable length attributes, location of data, and a
 * special "data locator" function if the data isn't in a contiguous
 * location.
 *
 * Byteswap implications:
 *
 * Since the SA attributes are not entirely self describing we can't do the
 * normal byteswap processing.  The special ZAP layout attribute and
 * attribute registration attributes define the byteswap function and the
 * size of the attributes, unless they are variable sized.
 * The normal ZFS byteswapping infrastructure assumes you don't need to read
 * any objects in order to do the necessary byteswapping.
Whereas * SA attributes can only be properly byteswapped if the dataset is opened * and the layout/attribute ZAP attributes are available. Because of this * the SA attributes will be byteswapped when they are first accessed by * the SA code that will read the SA data. */ typedef void (sa_iterfunc_t)(void *hdr, void *addr, sa_attr_type_t, uint16_t length, int length_idx, boolean_t, void *userp); static int sa_build_index(sa_handle_t *hdl, sa_buf_type_t buftype); static void sa_idx_tab_hold(objset_t *os, sa_idx_tab_t *idx_tab); static sa_idx_tab_t *sa_find_idx_tab(objset_t *os, dmu_object_type_t bonustype, sa_hdr_phys_t *hdr); static void sa_idx_tab_rele(objset_t *os, void *arg); static void sa_copy_data(sa_data_locator_t *func, void *start, void *target, int buflen); static int sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr, sa_data_op_t action, sa_data_locator_t *locator, void *datastart, uint16_t buflen, dmu_tx_t *tx); static arc_byteswap_func_t sa_bswap_table[] = { byteswap_uint64_array, byteswap_uint32_array, byteswap_uint16_array, byteswap_uint8_array, zfs_acl_byteswap, }; #ifdef HAVE_EFFICIENT_UNALIGNED_ACCESS #define SA_COPY_DATA(f, s, t, l) \ do { \ if (f == NULL) { \ if (l == 8) { \ *(uint64_t *)t = *(uint64_t *)s; \ } else if (l == 16) { \ *(uint64_t *)t = *(uint64_t *)s; \ *(uint64_t *)((uintptr_t)t + 8) = \ *(uint64_t *)((uintptr_t)s + 8); \ } else { \ memcpy(t, s, l); \ } \ } else { \ sa_copy_data(f, s, t, l); \ } \ } while (0) #else #define SA_COPY_DATA(f, s, t, l) sa_copy_data(f, s, t, l) #endif /* * This table is fixed and cannot be changed. Its purpose is to * allow the SA code to work with both old/new ZPL file systems. * It contains the list of legacy attributes. These attributes aren't * stored in the "attribute" registry zap objects, since older ZPL file systems * won't have the registry. Only objsets of type ZFS_TYPE_FILESYSTEM will * use this static table. */ static const sa_attr_reg_t sa_legacy_attrs[] = { {"ZPL_ATIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 0}, {"ZPL_MTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 1}, {"ZPL_CTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 2}, {"ZPL_CRTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 3}, {"ZPL_GEN", sizeof (uint64_t), SA_UINT64_ARRAY, 4}, {"ZPL_MODE", sizeof (uint64_t), SA_UINT64_ARRAY, 5}, {"ZPL_SIZE", sizeof (uint64_t), SA_UINT64_ARRAY, 6}, {"ZPL_PARENT", sizeof (uint64_t), SA_UINT64_ARRAY, 7}, {"ZPL_LINKS", sizeof (uint64_t), SA_UINT64_ARRAY, 8}, {"ZPL_XATTR", sizeof (uint64_t), SA_UINT64_ARRAY, 9}, {"ZPL_RDEV", sizeof (uint64_t), SA_UINT64_ARRAY, 10}, {"ZPL_FLAGS", sizeof (uint64_t), SA_UINT64_ARRAY, 11}, {"ZPL_UID", sizeof (uint64_t), SA_UINT64_ARRAY, 12}, {"ZPL_GID", sizeof (uint64_t), SA_UINT64_ARRAY, 13}, {"ZPL_PAD", sizeof (uint64_t) * 4, SA_UINT64_ARRAY, 14}, {"ZPL_ZNODE_ACL", 88, SA_UINT8_ARRAY, 15}, }; /* * This is only used for objects of type DMU_OT_ZNODE */ static const sa_attr_type_t sa_legacy_zpl_layout[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }; /* * Special dummy layout used for buffers with no attributes. 
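 *
 * Illustrative sketch (not part of this patch): how a layout's ordering
 * becomes byte offsets.  Offsets are the 8-byte-aligned running sum of
 * the attribute lengths, so under the legacy layout above (ATIME = 16,
 * MTIME = 16, CTIME = 16, CRTIME = 16, GEN = 8, MODE = 8, ...) ATIME is
 * at offset 0, MTIME at 16, CTIME at 32, CRTIME at 48, GEN at 64, MODE
 * at 72, and so on.
 */
#if 0	/* exposition only; ex_layout_offsets is a made-up name */
static void
ex_layout_offsets(const uint16_t *lengths, int count, uint32_t *offsets)
{
    uint32_t off = 0;

    for (int i = 0; i < count; i++) {
        offsets[i] = off;
        off = P2ROUNDUP(off + lengths[i], 8);
    }
}
#endif
/*
 * The dummy layout itself: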
*/ static const sa_attr_type_t sa_dummy_zpl_layout[] = { 0 }; static const size_t sa_legacy_attr_count = ARRAY_SIZE(sa_legacy_attrs); static kmem_cache_t *sa_cache = NULL; static int sa_cache_constructor(void *buf, void *unused, int kmflag) { (void) unused, (void) kmflag; sa_handle_t *hdl = buf; mutex_init(&hdl->sa_lock, NULL, MUTEX_DEFAULT, NULL); return (0); } static void sa_cache_destructor(void *buf, void *unused) { (void) unused; sa_handle_t *hdl = buf; mutex_destroy(&hdl->sa_lock); } void sa_cache_init(void) { sa_cache = kmem_cache_create("sa_cache", sizeof (sa_handle_t), 0, sa_cache_constructor, sa_cache_destructor, NULL, NULL, NULL, 0); } void sa_cache_fini(void) { if (sa_cache) kmem_cache_destroy(sa_cache); } static int layout_num_compare(const void *arg1, const void *arg2) { const sa_lot_t *node1 = (const sa_lot_t *)arg1; const sa_lot_t *node2 = (const sa_lot_t *)arg2; return (TREE_CMP(node1->lot_num, node2->lot_num)); } static int layout_hash_compare(const void *arg1, const void *arg2) { const sa_lot_t *node1 = (const sa_lot_t *)arg1; const sa_lot_t *node2 = (const sa_lot_t *)arg2; int cmp = TREE_CMP(node1->lot_hash, node2->lot_hash); if (likely(cmp)) return (cmp); return (TREE_CMP(node1->lot_instance, node2->lot_instance)); } static boolean_t sa_layout_equal(sa_lot_t *tbf, sa_attr_type_t *attrs, int count) { int i; if (count != tbf->lot_attr_count) return (1); for (i = 0; i != count; i++) { if (attrs[i] != tbf->lot_attrs[i]) return (1); } return (0); } #define SA_ATTR_HASH(attr) (zfs_crc64_table[(-1ULL ^ attr) & 0xFF]) static uint64_t sa_layout_info_hash(const sa_attr_type_t *attrs, int attr_count) { uint64_t crc = -1ULL; for (int i = 0; i != attr_count; i++) crc ^= SA_ATTR_HASH(attrs[i]); return (crc); } static int sa_get_spill(sa_handle_t *hdl) { int rc; if (hdl->sa_spill == NULL) { if ((rc = dmu_spill_hold_existing(hdl->sa_bonus, NULL, &hdl->sa_spill)) == 0) VERIFY(0 == sa_build_index(hdl, SA_SPILL)); } else { rc = 0; } return (rc); } /* * Main attribute lookup/update function * returns 0 for success or non zero for failures * * Operates on bulk array, first failure will abort further processing */ static int sa_attr_op(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count, sa_data_op_t data_op, dmu_tx_t *tx) { sa_os_t *sa = hdl->sa_os->os_sa; int i; int error = 0; sa_buf_type_t buftypes; buftypes = 0; ASSERT(count > 0); for (i = 0; i != count; i++) { ASSERT(bulk[i].sa_attr <= hdl->sa_os->os_sa->sa_num_attrs); bulk[i].sa_addr = NULL; /* First check the bonus buffer */ if (hdl->sa_bonus_tab && TOC_ATTR_PRESENT( hdl->sa_bonus_tab->sa_idx_tab[bulk[i].sa_attr])) { SA_ATTR_INFO(sa, hdl->sa_bonus_tab, SA_GET_HDR(hdl, SA_BONUS), bulk[i].sa_attr, bulk[i], SA_BONUS, hdl); if (tx && !(buftypes & SA_BONUS)) { dmu_buf_will_dirty(hdl->sa_bonus, tx); buftypes |= SA_BONUS; } } if (bulk[i].sa_addr == NULL && ((error = sa_get_spill(hdl)) == 0)) { if (TOC_ATTR_PRESENT( hdl->sa_spill_tab->sa_idx_tab[bulk[i].sa_attr])) { SA_ATTR_INFO(sa, hdl->sa_spill_tab, SA_GET_HDR(hdl, SA_SPILL), bulk[i].sa_attr, bulk[i], SA_SPILL, hdl); if (tx && !(buftypes & SA_SPILL) && bulk[i].sa_size == bulk[i].sa_length) { dmu_buf_will_dirty(hdl->sa_spill, tx); buftypes |= SA_SPILL; } } } if (error && error != ENOENT) { return ((error == ECKSUM) ? 
EIO : error); } switch (data_op) { case SA_LOOKUP: if (bulk[i].sa_addr == NULL) return (SET_ERROR(ENOENT)); if (bulk[i].sa_data) { SA_COPY_DATA(bulk[i].sa_data_func, bulk[i].sa_addr, bulk[i].sa_data, bulk[i].sa_size); } continue; case SA_UPDATE: /* existing rewrite of attr */ if (bulk[i].sa_addr && bulk[i].sa_size == bulk[i].sa_length) { SA_COPY_DATA(bulk[i].sa_data_func, bulk[i].sa_data, bulk[i].sa_addr, bulk[i].sa_length); continue; } else if (bulk[i].sa_addr) { /* attr size change */ error = sa_modify_attrs(hdl, bulk[i].sa_attr, SA_REPLACE, bulk[i].sa_data_func, bulk[i].sa_data, bulk[i].sa_length, tx); } else { /* adding new attribute */ error = sa_modify_attrs(hdl, bulk[i].sa_attr, SA_ADD, bulk[i].sa_data_func, bulk[i].sa_data, bulk[i].sa_length, tx); } if (error) return (error); break; default: break; } } return (error); } static sa_lot_t * sa_add_layout_entry(objset_t *os, const sa_attr_type_t *attrs, int attr_count, uint64_t lot_num, uint64_t hash, boolean_t zapadd, dmu_tx_t *tx) { sa_os_t *sa = os->os_sa; sa_lot_t *tb, *findtb; int i; avl_index_t loc; ASSERT(MUTEX_HELD(&sa->sa_lock)); tb = kmem_zalloc(sizeof (sa_lot_t), KM_SLEEP); tb->lot_attr_count = attr_count; tb->lot_attrs = kmem_alloc(sizeof (sa_attr_type_t) * attr_count, KM_SLEEP); memcpy(tb->lot_attrs, attrs, sizeof (sa_attr_type_t) * attr_count); tb->lot_num = lot_num; tb->lot_hash = hash; tb->lot_instance = 0; if (zapadd) { char attr_name[8]; if (sa->sa_layout_attr_obj == 0) { sa->sa_layout_attr_obj = zap_create_link(os, DMU_OT_SA_ATTR_LAYOUTS, sa->sa_master_obj, SA_LAYOUTS, tx); } (void) snprintf(attr_name, sizeof (attr_name), "%d", (int)lot_num); VERIFY(0 == zap_update(os, os->os_sa->sa_layout_attr_obj, attr_name, 2, attr_count, attrs, tx)); } list_create(&tb->lot_idx_tab, sizeof (sa_idx_tab_t), offsetof(sa_idx_tab_t, sa_next)); for (i = 0; i != attr_count; i++) { if (sa->sa_attr_table[tb->lot_attrs[i]].sa_length == 0) tb->lot_var_sizes++; } avl_add(&sa->sa_layout_num_tree, tb); /* verify we don't have a hash collision */ if ((findtb = avl_find(&sa->sa_layout_hash_tree, tb, &loc)) != NULL) { for (; findtb && findtb->lot_hash == hash; findtb = AVL_NEXT(&sa->sa_layout_hash_tree, findtb)) { if (findtb->lot_instance != tb->lot_instance) break; tb->lot_instance++; } } avl_add(&sa->sa_layout_hash_tree, tb); return (tb); } static void sa_find_layout(objset_t *os, uint64_t hash, sa_attr_type_t *attrs, int count, dmu_tx_t *tx, sa_lot_t **lot) { sa_lot_t *tb, tbsearch; avl_index_t loc; sa_os_t *sa = os->os_sa; boolean_t found = B_FALSE; mutex_enter(&sa->sa_lock); tbsearch.lot_hash = hash; tbsearch.lot_instance = 0; tb = avl_find(&sa->sa_layout_hash_tree, &tbsearch, &loc); if (tb) { for (; tb && tb->lot_hash == hash; tb = AVL_NEXT(&sa->sa_layout_hash_tree, tb)) { if (sa_layout_equal(tb, attrs, count) == 0) { found = B_TRUE; break; } } } if (!found) { tb = sa_add_layout_entry(os, attrs, count, avl_numnodes(&sa->sa_layout_num_tree), hash, B_TRUE, tx); } mutex_exit(&sa->sa_lock); *lot = tb; } static int sa_resize_spill(sa_handle_t *hdl, uint32_t size, dmu_tx_t *tx) { int error; uint32_t blocksize; if (size == 0) { blocksize = SPA_MINBLOCKSIZE; } else if (size > SPA_OLD_MAXBLOCKSIZE) { ASSERT(0); return (SET_ERROR(EFBIG)); } else { blocksize = P2ROUNDUP_TYPED(size, SPA_MINBLOCKSIZE, uint32_t); } error = dbuf_spill_set_blksz(hdl->sa_spill, blocksize, tx); ASSERT(error == 0); return (error); } static void sa_copy_data(sa_data_locator_t *func, void *datastart, void *target, int buflen) { if (func == NULL) { memcpy(target, datastart, 
buflen); } else { boolean_t start; int bytes; void *dataptr; void *saptr = target; uint32_t length; start = B_TRUE; bytes = 0; while (bytes < buflen) { func(&dataptr, &length, buflen, start, datastart); memcpy(saptr, dataptr, length); saptr = (void *)((caddr_t)saptr + length); bytes += length; start = B_FALSE; } } } /* * Determine several different values pertaining to system attribute * buffers. * * Return the size of the sa_hdr_phys_t header for the buffer. Each * variable length attribute except the first contributes two bytes to * the header size, which is then rounded up to an 8-byte boundary. * * The following output parameters are also computed. * * index - The index of the first attribute in attr_desc that will * spill over. Only valid if will_spill is set. * * total - The total number of bytes of all system attributes described * in attr_desc. * * will_spill - Set when spilling is necessary. It is only set when * the buftype is SA_BONUS. */ static int sa_find_sizes(sa_os_t *sa, sa_bulk_attr_t *attr_desc, int attr_count, dmu_buf_t *db, sa_buf_type_t buftype, int full_space, int *index, int *total, boolean_t *will_spill) { int var_size_count = 0; int i; int hdrsize; int extra_hdrsize; if (buftype == SA_BONUS && sa->sa_force_spill) { *total = 0; *index = 0; *will_spill = B_TRUE; return (0); } *index = -1; *total = 0; *will_spill = B_FALSE; extra_hdrsize = 0; hdrsize = (SA_BONUSTYPE_FROM_DB(db) == DMU_OT_ZNODE) ? 0 : sizeof (sa_hdr_phys_t); ASSERT(IS_P2ALIGNED(full_space, 8)); for (i = 0; i != attr_count; i++) { boolean_t is_var_sz, might_spill_here; int tmp_hdrsize; *total = P2ROUNDUP(*total, 8); *total += attr_desc[i].sa_length; if (*will_spill) continue; is_var_sz = (SA_REGISTERED_LEN(sa, attr_desc[i].sa_attr) == 0); if (is_var_sz) var_size_count++; /* * Calculate what the SA header size would be if this * attribute doesn't spill. */ tmp_hdrsize = hdrsize + ((is_var_sz && var_size_count > 1) ? sizeof (uint16_t) : 0); /* * Check whether this attribute spans into the space * that would be used by the spill block pointer should * a spill block be needed. */ might_spill_here = buftype == SA_BONUS && *index == -1 && (*total + P2ROUNDUP(tmp_hdrsize, 8)) > (full_space - sizeof (blkptr_t)); if (is_var_sz && var_size_count > 1) { if (buftype == SA_SPILL || tmp_hdrsize + *total < full_space) { /* * Record the extra header size in case this * increase needs to be reversed due to * spill-over. */ hdrsize = tmp_hdrsize; if (*index != -1 || might_spill_here) extra_hdrsize += sizeof (uint16_t); } else { ASSERT(buftype == SA_BONUS); if (*index == -1) *index = i; *will_spill = B_TRUE; continue; } } /* * Store index of where spill *could* occur. Then * continue to count the remaining attribute sizes. The * sum is used later for sizing bonus and spill buffer. */ if (might_spill_here) *index = i; if ((*total + P2ROUNDUP(hdrsize, 8)) > full_space && buftype == SA_BONUS) *will_spill = B_TRUE; } if (*will_spill) hdrsize -= extra_hdrsize; hdrsize = P2ROUNDUP(hdrsize, 8); return (hdrsize); } #define BUF_SPACE_NEEDED(total, header) (total + header) /* * Find layout that corresponds to ordering of attributes * If not found a new layout number is created and added to * persistent layout tables. 
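 *
 * Illustrative sketch (not part of this patch): the header sizing rule
 * sa_find_sizes() computes above.  The base sa_hdr_phys_t is 8 bytes and
 * already includes one length slot, so each variable-length attribute
 * after the first adds two bytes, and the result is rounded up to an
 * 8-byte boundary (0 or 1 variable-length attrs -> 8 bytes, 2 through 5
 * -> 16 bytes, and so on):
 */
#if 0	/* exposition only; ex_sa_hdrsize is a made-up name */
static int
ex_sa_hdrsize(int var_count)
{
    int hdr = 8;    /* sizeof (sa_hdr_phys_t), one length included */

    if (var_count > 1)
        hdr += 2 * (var_count - 1);
    return (P2ROUNDUP(hdr, 8));
}
#endif
/*
 * sa_build_layouts lays the attributes down using those sizes: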
*/ static int sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count, dmu_tx_t *tx) { sa_os_t *sa = hdl->sa_os->os_sa; uint64_t hash; sa_buf_type_t buftype; sa_hdr_phys_t *sahdr; void *data_start; sa_attr_type_t *attrs, *attrs_start; int i, lot_count; int dnodesize; int spill_idx; int hdrsize; int spillhdrsize = 0; int used; dmu_object_type_t bonustype; sa_lot_t *lot; int len_idx; int spill_used; int bonuslen; boolean_t spilling; dmu_buf_will_dirty(hdl->sa_bonus, tx); bonustype = SA_BONUSTYPE_FROM_DB(hdl->sa_bonus); dmu_object_dnsize_from_db(hdl->sa_bonus, &dnodesize); bonuslen = DN_BONUS_SIZE(dnodesize); /* first determine bonus header size and sum of all attributes */ hdrsize = sa_find_sizes(sa, attr_desc, attr_count, hdl->sa_bonus, SA_BONUS, bonuslen, &spill_idx, &used, &spilling); if (used > SPA_OLD_MAXBLOCKSIZE) return (SET_ERROR(EFBIG)); VERIFY0(dmu_set_bonus(hdl->sa_bonus, spilling ? MIN(bonuslen - sizeof (blkptr_t), used + hdrsize) : used + hdrsize, tx)); ASSERT((bonustype == DMU_OT_ZNODE && spilling == 0) || bonustype == DMU_OT_SA); /* setup and size spill buffer when needed */ if (spilling) { boolean_t dummy; if (hdl->sa_spill == NULL) { VERIFY(dmu_spill_hold_by_bonus(hdl->sa_bonus, 0, NULL, &hdl->sa_spill) == 0); } dmu_buf_will_dirty(hdl->sa_spill, tx); spillhdrsize = sa_find_sizes(sa, &attr_desc[spill_idx], attr_count - spill_idx, hdl->sa_spill, SA_SPILL, hdl->sa_spill->db_size, &i, &spill_used, &dummy); if (spill_used > SPA_OLD_MAXBLOCKSIZE) return (SET_ERROR(EFBIG)); if (BUF_SPACE_NEEDED(spill_used, spillhdrsize) > hdl->sa_spill->db_size) VERIFY(0 == sa_resize_spill(hdl, BUF_SPACE_NEEDED(spill_used, spillhdrsize), tx)); } /* setup starting pointers to lay down data */ data_start = (void *)((uintptr_t)hdl->sa_bonus->db_data + hdrsize); sahdr = (sa_hdr_phys_t *)hdl->sa_bonus->db_data; buftype = SA_BONUS; attrs_start = attrs = kmem_alloc(sizeof (sa_attr_type_t) * attr_count, KM_SLEEP); lot_count = 0; for (i = 0, len_idx = 0, hash = -1ULL; i != attr_count; i++) { uint16_t length; ASSERT(IS_P2ALIGNED(data_start, 8)); attrs[i] = attr_desc[i].sa_attr; length = SA_REGISTERED_LEN(sa, attrs[i]); if (length == 0) length = attr_desc[i].sa_length; if (spilling && i == spill_idx) { /* switch to spill buffer */ VERIFY(bonustype == DMU_OT_SA); if (buftype == SA_BONUS && !sa->sa_force_spill) { sa_find_layout(hdl->sa_os, hash, attrs_start, lot_count, tx, &lot); SA_SET_HDR(sahdr, lot->lot_num, hdrsize); } buftype = SA_SPILL; hash = -1ULL; len_idx = 0; sahdr = (sa_hdr_phys_t *)hdl->sa_spill->db_data; sahdr->sa_magic = SA_MAGIC; data_start = (void *)((uintptr_t)sahdr + spillhdrsize); attrs_start = &attrs[i]; lot_count = 0; } hash ^= SA_ATTR_HASH(attrs[i]); attr_desc[i].sa_addr = data_start; attr_desc[i].sa_size = length; SA_COPY_DATA(attr_desc[i].sa_data_func, attr_desc[i].sa_data, data_start, length); if (sa->sa_attr_table[attrs[i]].sa_length == 0) { sahdr->sa_lengths[len_idx++] = length; } data_start = (void *)P2ROUNDUP(((uintptr_t)data_start + length), 8); lot_count++; } sa_find_layout(hdl->sa_os, hash, attrs_start, lot_count, tx, &lot); /* * Verify that old znodes always have layout number 0. * Must be DMU_OT_SA for arbitrary layouts */ VERIFY((bonustype == DMU_OT_ZNODE && lot->lot_num == 0) || (bonustype == DMU_OT_SA && lot->lot_num > 1)); if (bonustype == DMU_OT_SA) { SA_SET_HDR(sahdr, lot->lot_num, buftype == SA_BONUS ? 
hdrsize : spillhdrsize); } kmem_free(attrs, sizeof (sa_attr_type_t) * attr_count); if (hdl->sa_bonus_tab) { sa_idx_tab_rele(hdl->sa_os, hdl->sa_bonus_tab); hdl->sa_bonus_tab = NULL; } if (!sa->sa_force_spill) VERIFY(0 == sa_build_index(hdl, SA_BONUS)); if (hdl->sa_spill) { sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab); if (!spilling) { /* * remove spill block that is no longer needed. */ dmu_buf_rele(hdl->sa_spill, NULL); hdl->sa_spill = NULL; hdl->sa_spill_tab = NULL; VERIFY(0 == dmu_rm_spill(hdl->sa_os, sa_handle_object(hdl), tx)); } else { VERIFY(0 == sa_build_index(hdl, SA_SPILL)); } } return (0); } static void sa_free_attr_table(sa_os_t *sa) { int i; if (sa->sa_attr_table == NULL) return; for (i = 0; i != sa->sa_num_attrs; i++) { if (sa->sa_attr_table[i].sa_name) kmem_free(sa->sa_attr_table[i].sa_name, strlen(sa->sa_attr_table[i].sa_name) + 1); } kmem_free(sa->sa_attr_table, sizeof (sa_attr_table_t) * sa->sa_num_attrs); sa->sa_attr_table = NULL; } static int sa_attr_table_setup(objset_t *os, const sa_attr_reg_t *reg_attrs, int count) { sa_os_t *sa = os->os_sa; uint64_t sa_attr_count = 0; uint64_t sa_reg_count = 0; int error = 0; uint64_t attr_value; sa_attr_table_t *tb; zap_cursor_t zc; zap_attribute_t za; int registered_count = 0; int i; dmu_objset_type_t ostype = dmu_objset_type(os); sa->sa_user_table = kmem_zalloc(count * sizeof (sa_attr_type_t), KM_SLEEP); sa->sa_user_table_sz = count * sizeof (sa_attr_type_t); if (sa->sa_reg_attr_obj != 0) { error = zap_count(os, sa->sa_reg_attr_obj, &sa_attr_count); /* * Make sure we retrieved a count and that it isn't zero */ if (error || (error == 0 && sa_attr_count == 0)) { if (error == 0) error = SET_ERROR(EINVAL); goto bail; } sa_reg_count = sa_attr_count; } if (ostype == DMU_OST_ZFS && sa_attr_count == 0) sa_attr_count += sa_legacy_attr_count; /* Allocate attribute numbers for attributes that aren't registered */ for (i = 0; i != count; i++) { boolean_t found = B_FALSE; int j; if (ostype == DMU_OST_ZFS) { for (j = 0; j != sa_legacy_attr_count; j++) { if (strcmp(reg_attrs[i].sa_name, sa_legacy_attrs[j].sa_name) == 0) { sa->sa_user_table[i] = sa_legacy_attrs[j].sa_attr; found = B_TRUE; } } } if (found) continue; if (sa->sa_reg_attr_obj) error = zap_lookup(os, sa->sa_reg_attr_obj, reg_attrs[i].sa_name, 8, 1, &attr_value); else error = SET_ERROR(ENOENT); switch (error) { case ENOENT: sa->sa_user_table[i] = (sa_attr_type_t)sa_attr_count; sa_attr_count++; break; case 0: sa->sa_user_table[i] = ATTR_NUM(attr_value); break; default: goto bail; } } sa->sa_num_attrs = sa_attr_count; tb = sa->sa_attr_table = kmem_zalloc(sizeof (sa_attr_table_t) * sa_attr_count, KM_SLEEP); /* * Attribute table is constructed from requested attribute list, * previously foreign registered attributes, and also the legacy * ZPL set of attributes. 
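 *
 * Each value in the registry ZAP packs an attribute's number, length,
 * and byteswap-function index into a single integer; the ATTR_NUM(),
 * ATTR_LENGTH() and ATTR_BSWAP() macros used below unpack it.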
*/ if (sa->sa_reg_attr_obj) { for (zap_cursor_init(&zc, os, sa->sa_reg_attr_obj); (error = zap_cursor_retrieve(&zc, &za)) == 0; zap_cursor_advance(&zc)) { uint64_t value; value = za.za_first_integer; registered_count++; tb[ATTR_NUM(value)].sa_attr = ATTR_NUM(value); tb[ATTR_NUM(value)].sa_length = ATTR_LENGTH(value); tb[ATTR_NUM(value)].sa_byteswap = ATTR_BSWAP(value); tb[ATTR_NUM(value)].sa_registered = B_TRUE; if (tb[ATTR_NUM(value)].sa_name) { continue; } tb[ATTR_NUM(value)].sa_name = kmem_zalloc(strlen(za.za_name) +1, KM_SLEEP); (void) strlcpy(tb[ATTR_NUM(value)].sa_name, za.za_name, strlen(za.za_name) +1); } zap_cursor_fini(&zc); /* * Make sure we processed the correct number of registered * attributes */ if (registered_count != sa_reg_count) { ASSERT(error != 0); goto bail; } } if (ostype == DMU_OST_ZFS) { for (i = 0; i != sa_legacy_attr_count; i++) { if (tb[i].sa_name) continue; tb[i].sa_attr = sa_legacy_attrs[i].sa_attr; tb[i].sa_length = sa_legacy_attrs[i].sa_length; tb[i].sa_byteswap = sa_legacy_attrs[i].sa_byteswap; tb[i].sa_registered = B_FALSE; tb[i].sa_name = kmem_zalloc(strlen(sa_legacy_attrs[i].sa_name) +1, KM_SLEEP); (void) strlcpy(tb[i].sa_name, sa_legacy_attrs[i].sa_name, strlen(sa_legacy_attrs[i].sa_name) + 1); } } for (i = 0; i != count; i++) { sa_attr_type_t attr_id; attr_id = sa->sa_user_table[i]; if (tb[attr_id].sa_name) continue; tb[attr_id].sa_length = reg_attrs[i].sa_length; tb[attr_id].sa_byteswap = reg_attrs[i].sa_byteswap; tb[attr_id].sa_attr = attr_id; tb[attr_id].sa_name = kmem_zalloc(strlen(reg_attrs[i].sa_name) + 1, KM_SLEEP); (void) strlcpy(tb[attr_id].sa_name, reg_attrs[i].sa_name, strlen(reg_attrs[i].sa_name) + 1); } sa->sa_need_attr_registration = (sa_attr_count != registered_count); return (0); bail: kmem_free(sa->sa_user_table, count * sizeof (sa_attr_type_t)); sa->sa_user_table = NULL; sa_free_attr_table(sa); ASSERT(error != 0); return (error); } int sa_setup(objset_t *os, uint64_t sa_obj, const sa_attr_reg_t *reg_attrs, int count, sa_attr_type_t **user_table) { zap_cursor_t zc; zap_attribute_t za; sa_os_t *sa; dmu_objset_type_t ostype = dmu_objset_type(os); sa_attr_type_t *tb; int error; mutex_enter(&os->os_user_ptr_lock); if (os->os_sa) { mutex_enter(&os->os_sa->sa_lock); mutex_exit(&os->os_user_ptr_lock); tb = os->os_sa->sa_user_table; mutex_exit(&os->os_sa->sa_lock); *user_table = tb; return (0); } sa = kmem_zalloc(sizeof (sa_os_t), KM_SLEEP); mutex_init(&sa->sa_lock, NULL, MUTEX_NOLOCKDEP, NULL); sa->sa_master_obj = sa_obj; os->os_sa = sa; mutex_enter(&sa->sa_lock); mutex_exit(&os->os_user_ptr_lock); avl_create(&sa->sa_layout_num_tree, layout_num_compare, sizeof (sa_lot_t), offsetof(sa_lot_t, lot_num_node)); avl_create(&sa->sa_layout_hash_tree, layout_hash_compare, sizeof (sa_lot_t), offsetof(sa_lot_t, lot_hash_node)); if (sa_obj) { error = zap_lookup(os, sa_obj, SA_LAYOUTS, 8, 1, &sa->sa_layout_attr_obj); if (error != 0 && error != ENOENT) goto fail; error = zap_lookup(os, sa_obj, SA_REGISTRY, 8, 1, &sa->sa_reg_attr_obj); if (error != 0 && error != ENOENT) goto fail; } if ((error = sa_attr_table_setup(os, reg_attrs, count)) != 0) goto fail; if (sa->sa_layout_attr_obj != 0) { uint64_t layout_count; error = zap_count(os, sa->sa_layout_attr_obj, &layout_count); /* * Layout number count should be > 0 */ if (error || (error == 0 && layout_count == 0)) { if (error == 0) error = SET_ERROR(EINVAL); goto fail; } for (zap_cursor_init(&zc, os, sa->sa_layout_attr_obj); (error = zap_cursor_retrieve(&zc, &za)) == 0; zap_cursor_advance(&zc)) { sa_attr_type_t 
*lot_attrs; uint64_t lot_num; lot_attrs = kmem_zalloc(sizeof (sa_attr_type_t) * za.za_num_integers, KM_SLEEP); if ((error = (zap_lookup(os, sa->sa_layout_attr_obj, za.za_name, 2, za.za_num_integers, lot_attrs))) != 0) { kmem_free(lot_attrs, sizeof (sa_attr_type_t) * za.za_num_integers); break; } VERIFY0(ddi_strtoull(za.za_name, NULL, 10, (unsigned long long *)&lot_num)); (void) sa_add_layout_entry(os, lot_attrs, za.za_num_integers, lot_num, sa_layout_info_hash(lot_attrs, za.za_num_integers), B_FALSE, NULL); kmem_free(lot_attrs, sizeof (sa_attr_type_t) * za.za_num_integers); } zap_cursor_fini(&zc); /* * Make sure layout count matches number of entries added * to AVL tree */ if (avl_numnodes(&sa->sa_layout_num_tree) != layout_count) { ASSERT(error != 0); goto fail; } } /* Add special layout number for old ZNODES */ if (ostype == DMU_OST_ZFS) { (void) sa_add_layout_entry(os, sa_legacy_zpl_layout, sa_legacy_attr_count, 0, sa_layout_info_hash(sa_legacy_zpl_layout, sa_legacy_attr_count), B_FALSE, NULL); (void) sa_add_layout_entry(os, sa_dummy_zpl_layout, 0, 1, 0, B_FALSE, NULL); } *user_table = os->os_sa->sa_user_table; mutex_exit(&sa->sa_lock); return (0); fail: os->os_sa = NULL; sa_free_attr_table(sa); if (sa->sa_user_table) kmem_free(sa->sa_user_table, sa->sa_user_table_sz); mutex_exit(&sa->sa_lock); avl_destroy(&sa->sa_layout_hash_tree); avl_destroy(&sa->sa_layout_num_tree); mutex_destroy(&sa->sa_lock); kmem_free(sa, sizeof (sa_os_t)); return ((error == ECKSUM) ? EIO : error); } void sa_tear_down(objset_t *os) { sa_os_t *sa = os->os_sa; sa_lot_t *layout; void *cookie; kmem_free(sa->sa_user_table, sa->sa_user_table_sz); /* Free up attr table */ sa_free_attr_table(sa); cookie = NULL; while ((layout = avl_destroy_nodes(&sa->sa_layout_hash_tree, &cookie))) { sa_idx_tab_t *tab; while ((tab = list_head(&layout->lot_idx_tab))) { ASSERT(zfs_refcount_count(&tab->sa_refcount)); sa_idx_tab_rele(os, tab); } } cookie = NULL; while ((layout = avl_destroy_nodes(&sa->sa_layout_num_tree, &cookie))) { kmem_free(layout->lot_attrs, sizeof (sa_attr_type_t) * layout->lot_attr_count); kmem_free(layout, sizeof (sa_lot_t)); } avl_destroy(&sa->sa_layout_hash_tree); avl_destroy(&sa->sa_layout_num_tree); mutex_destroy(&sa->sa_lock); kmem_free(sa, sizeof (sa_os_t)); os->os_sa = NULL; } static void sa_build_idx_tab(void *hdr, void *attr_addr, sa_attr_type_t attr, uint16_t length, int length_idx, boolean_t var_length, void *userp) { sa_idx_tab_t *idx_tab = userp; if (var_length) { ASSERT(idx_tab->sa_variable_lengths); idx_tab->sa_variable_lengths[length_idx] = length; } TOC_ATTR_ENCODE(idx_tab->sa_idx_tab[attr], length_idx, (uint32_t)((uintptr_t)attr_addr - (uintptr_t)hdr)); } static void sa_attr_iter(objset_t *os, sa_hdr_phys_t *hdr, dmu_object_type_t type, sa_iterfunc_t func, sa_lot_t *tab, void *userp) { void *data_start; sa_lot_t *tb = tab; sa_lot_t search; avl_index_t loc; sa_os_t *sa = os->os_sa; int i; uint16_t *length_start = NULL; uint8_t length_idx = 0; if (tab == NULL) { search.lot_num = SA_LAYOUT_NUM(hdr, type); tb = avl_find(&sa->sa_layout_num_tree, &search, &loc); ASSERT(tb); } if (IS_SA_BONUSTYPE(type)) { data_start = (void *)P2ROUNDUP(((uintptr_t)hdr + offsetof(sa_hdr_phys_t, sa_lengths) + (sizeof (uint16_t) * tb->lot_var_sizes)), 8); length_start = hdr->sa_lengths; } else { data_start = hdr; } for (i = 0; i != tb->lot_attr_count; i++) { int attr_length, reg_length; uint8_t idx_len; reg_length = sa->sa_attr_table[tb->lot_attrs[i]].sa_length; IMPLY(reg_length == 0, IS_SA_BONUSTYPE(type)); if (reg_length) { 
attr_length = reg_length; idx_len = 0; } else { attr_length = length_start[length_idx]; idx_len = length_idx++; } func(hdr, data_start, tb->lot_attrs[i], attr_length, idx_len, reg_length == 0 ? B_TRUE : B_FALSE, userp); data_start = (void *)P2ROUNDUP(((uintptr_t)data_start + attr_length), 8); } } static void sa_byteswap_cb(void *hdr, void *attr_addr, sa_attr_type_t attr, uint16_t length, int length_idx, boolean_t variable_length, void *userp) { (void) hdr, (void) length_idx, (void) variable_length; sa_handle_t *hdl = userp; sa_os_t *sa = hdl->sa_os->os_sa; sa_bswap_table[sa->sa_attr_table[attr].sa_byteswap](attr_addr, length); } static void sa_byteswap(sa_handle_t *hdl, sa_buf_type_t buftype) { sa_hdr_phys_t *sa_hdr_phys = SA_GET_HDR(hdl, buftype); dmu_buf_impl_t *db; int num_lengths = 1; int i; sa_os_t *sa __maybe_unused = hdl->sa_os->os_sa; ASSERT(MUTEX_HELD(&sa->sa_lock)); if (sa_hdr_phys->sa_magic == SA_MAGIC) return; db = SA_GET_DB(hdl, buftype); if (buftype == SA_SPILL) { arc_release(db->db_buf, NULL); arc_buf_thaw(db->db_buf); } sa_hdr_phys->sa_magic = BSWAP_32(sa_hdr_phys->sa_magic); sa_hdr_phys->sa_layout_info = BSWAP_16(sa_hdr_phys->sa_layout_info); /* * Determine number of variable lengths in header * The standard 8 byte header has one for free and a * 16 byte header would have 4 + 1; */ if (SA_HDR_SIZE(sa_hdr_phys) > 8) num_lengths += (SA_HDR_SIZE(sa_hdr_phys) - 8) >> 1; for (i = 0; i != num_lengths; i++) sa_hdr_phys->sa_lengths[i] = BSWAP_16(sa_hdr_phys->sa_lengths[i]); sa_attr_iter(hdl->sa_os, sa_hdr_phys, DMU_OT_SA, sa_byteswap_cb, NULL, hdl); if (buftype == SA_SPILL) arc_buf_freeze(((dmu_buf_impl_t *)hdl->sa_spill)->db_buf); } static int sa_build_index(sa_handle_t *hdl, sa_buf_type_t buftype) { sa_hdr_phys_t *sa_hdr_phys; dmu_buf_impl_t *db = SA_GET_DB(hdl, buftype); dmu_object_type_t bonustype = SA_BONUSTYPE_FROM_DB(db); sa_os_t *sa = hdl->sa_os->os_sa; sa_idx_tab_t *idx_tab; sa_hdr_phys = SA_GET_HDR(hdl, buftype); mutex_enter(&sa->sa_lock); /* Do we need to byteswap? 
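 * A native header reads back SA_MAGIC; one written by a system of the
 * opposite endianness reads back BSWAP_32(SA_MAGIC); and an old-style
 * znode has no SA header at all, so its magic field reads as 0.  Any
 * other value means the buffer is damaged and we return EIO below.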
*/ /* only check if not old znode */ if (IS_SA_BONUSTYPE(bonustype) && sa_hdr_phys->sa_magic != SA_MAGIC && sa_hdr_phys->sa_magic != 0) { if (BSWAP_32(sa_hdr_phys->sa_magic) != SA_MAGIC) { mutex_exit(&sa->sa_lock); zfs_dbgmsg("Buffer Header: %x != SA_MAGIC:%x " "object=%#llx\n", sa_hdr_phys->sa_magic, SA_MAGIC, (u_longlong_t)db->db.db_object); return (SET_ERROR(EIO)); } sa_byteswap(hdl, buftype); } idx_tab = sa_find_idx_tab(hdl->sa_os, bonustype, sa_hdr_phys); if (buftype == SA_BONUS) hdl->sa_bonus_tab = idx_tab; else hdl->sa_spill_tab = idx_tab; mutex_exit(&sa->sa_lock); return (0); } static void sa_evict_sync(void *dbu) { (void) dbu; panic("evicting sa dbuf\n"); } static void sa_idx_tab_rele(objset_t *os, void *arg) { sa_os_t *sa = os->os_sa; sa_idx_tab_t *idx_tab = arg; if (idx_tab == NULL) return; mutex_enter(&sa->sa_lock); if (zfs_refcount_remove(&idx_tab->sa_refcount, NULL) == 0) { list_remove(&idx_tab->sa_layout->lot_idx_tab, idx_tab); if (idx_tab->sa_variable_lengths) kmem_free(idx_tab->sa_variable_lengths, sizeof (uint16_t) * idx_tab->sa_layout->lot_var_sizes); zfs_refcount_destroy(&idx_tab->sa_refcount); kmem_free(idx_tab->sa_idx_tab, sizeof (uint32_t) * sa->sa_num_attrs); kmem_free(idx_tab, sizeof (sa_idx_tab_t)); } mutex_exit(&sa->sa_lock); } static void sa_idx_tab_hold(objset_t *os, sa_idx_tab_t *idx_tab) { sa_os_t *sa __maybe_unused = os->os_sa; ASSERT(MUTEX_HELD(&sa->sa_lock)); (void) zfs_refcount_add(&idx_tab->sa_refcount, NULL); } void sa_spill_rele(sa_handle_t *hdl) { mutex_enter(&hdl->sa_lock); if (hdl->sa_spill) { sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab); dmu_buf_rele(hdl->sa_spill, NULL); hdl->sa_spill = NULL; hdl->sa_spill_tab = NULL; } mutex_exit(&hdl->sa_lock); } void sa_handle_destroy(sa_handle_t *hdl) { dmu_buf_t *db = hdl->sa_bonus; mutex_enter(&hdl->sa_lock); (void) dmu_buf_remove_user(db, &hdl->sa_dbu); if (hdl->sa_bonus_tab) sa_idx_tab_rele(hdl->sa_os, hdl->sa_bonus_tab); if (hdl->sa_spill_tab) sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab); dmu_buf_rele(hdl->sa_bonus, NULL); if (hdl->sa_spill) dmu_buf_rele(hdl->sa_spill, NULL); mutex_exit(&hdl->sa_lock); kmem_cache_free(sa_cache, hdl); } int sa_handle_get_from_db(objset_t *os, dmu_buf_t *db, void *userp, sa_handle_type_t hdl_type, sa_handle_t **handlepp) { int error = 0; sa_handle_t *handle = NULL; #ifdef ZFS_DEBUG dmu_object_info_t doi; dmu_object_info_from_db(db, &doi); ASSERT(doi.doi_bonus_type == DMU_OT_SA || doi.doi_bonus_type == DMU_OT_ZNODE); #endif /* find handle, if it exists */ /* if one doesn't exist then create a new one, and initialize it */ if (hdl_type == SA_HDL_SHARED) handle = dmu_buf_get_user(db); if (handle == NULL) { sa_handle_t *winner = NULL; handle = kmem_cache_alloc(sa_cache, KM_SLEEP); handle->sa_dbu.dbu_evict_func_sync = NULL; handle->sa_dbu.dbu_evict_func_async = NULL; handle->sa_userp = userp; handle->sa_bonus = db; handle->sa_os = os; handle->sa_spill = NULL; handle->sa_bonus_tab = NULL; handle->sa_spill_tab = NULL; error = sa_build_index(handle, SA_BONUS); if (hdl_type == SA_HDL_SHARED) { dmu_buf_init_user(&handle->sa_dbu, sa_evict_sync, NULL, NULL); winner = dmu_buf_set_user_ie(db, &handle->sa_dbu); } if (winner != NULL) { kmem_cache_free(sa_cache, handle); handle = winner; } } *handlepp = handle; return (error); } int sa_handle_get(objset_t *objset, uint64_t objid, void *userp, sa_handle_type_t hdl_type, sa_handle_t **handlepp) { dmu_buf_t *db; int error; if ((error = dmu_bonus_hold(objset, objid, NULL, &db))) return (error); return (sa_handle_get_from_db(objset, db, 
userp, hdl_type, handlepp)); } int sa_buf_hold(objset_t *objset, uint64_t obj_num, const void *tag, dmu_buf_t **db) { return (dmu_bonus_hold(objset, obj_num, tag, db)); } void sa_buf_rele(dmu_buf_t *db, const void *tag) { dmu_buf_rele(db, tag); } static int sa_lookup_impl(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count) { ASSERT(hdl); ASSERT(MUTEX_HELD(&hdl->sa_lock)); return (sa_attr_op(hdl, bulk, count, SA_LOOKUP, NULL)); } static int sa_lookup_locked(sa_handle_t *hdl, sa_attr_type_t attr, void *buf, uint32_t buflen) { int error; sa_bulk_attr_t bulk; VERIFY3U(buflen, <=, SA_ATTR_MAX_LEN); bulk.sa_attr = attr; bulk.sa_data = buf; bulk.sa_length = buflen; bulk.sa_data_func = NULL; ASSERT(hdl); error = sa_lookup_impl(hdl, &bulk, 1); return (error); } int sa_lookup(sa_handle_t *hdl, sa_attr_type_t attr, void *buf, uint32_t buflen) { int error; mutex_enter(&hdl->sa_lock); error = sa_lookup_locked(hdl, attr, buf, buflen); mutex_exit(&hdl->sa_lock); return (error); } #ifdef _KERNEL int sa_lookup_uio(sa_handle_t *hdl, sa_attr_type_t attr, zfs_uio_t *uio) { int error; sa_bulk_attr_t bulk; bulk.sa_data = NULL; bulk.sa_attr = attr; bulk.sa_data_func = NULL; ASSERT(hdl); mutex_enter(&hdl->sa_lock); if ((error = sa_attr_op(hdl, &bulk, 1, SA_LOOKUP, NULL)) == 0) { error = zfs_uiomove((void *)bulk.sa_addr, MIN(bulk.sa_size, zfs_uio_resid(uio)), UIO_READ, uio); } mutex_exit(&hdl->sa_lock); return (error); } /* * For an existing object upgraded from an old system, the on-disk layout * has no slot for the project ID attribute. But the quota accounting logic * needs to access related slots by offset directly, so we adjust these old * objects' layouts to move the project ID to a unified, fixed offset. */ int sa_add_projid(sa_handle_t *hdl, dmu_tx_t *tx, uint64_t projid) { znode_t *zp = sa_get_userdata(hdl); dmu_buf_t *db = sa_get_db(hdl); zfsvfs_t *zfsvfs = ZTOZSB(zp); int count = 0, err = 0; sa_bulk_attr_t *bulk, *attrs; zfs_acl_locator_cb_t locate = { 0 }; uint64_t uid, gid, mode, rdev, xattr = 0, parent, gen, links; uint64_t crtime[2], mtime[2], ctime[2], atime[2]; zfs_acl_phys_t znode_acl = { 0 }; char scanstamp[AV_SCANSTAMP_SZ]; if (zp->z_acl_cached == NULL) { zfs_acl_t *aclp; mutex_enter(&zp->z_acl_lock); err = zfs_acl_node_read(zp, B_FALSE, &aclp, B_FALSE); mutex_exit(&zp->z_acl_lock); if (err != 0 && err != ENOENT) return (err); } bulk = kmem_zalloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP); attrs = kmem_zalloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP); mutex_enter(&hdl->sa_lock); mutex_enter(&zp->z_lock); err = sa_lookup_locked(hdl, SA_ZPL_PROJID(zfsvfs), &projid, sizeof (uint64_t)); if (unlikely(err == 0)) /* Someone else added the project ID attribute in a race. 
*/ err = EEXIST; if (err != ENOENT) goto out; /* First do a bulk query of the attributes that aren't cached */ if (zp->z_is_sa) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &gen, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &uid, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &gid, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16); if (Z_ISBLK(ZTOTYPE(zp)) || Z_ISCHR(ZTOTYPE(zp))) SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL, &rdev, 8); } else { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &gen, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_XATTR(zfsvfs), NULL, &xattr, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL, &rdev, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &uid, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &gid, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ZNODE_ACL(zfsvfs), NULL, &znode_acl, 88); } err = sa_bulk_lookup_locked(hdl, bulk, count); if (err != 0) goto out; err = sa_lookup_locked(hdl, SA_ZPL_XATTR(zfsvfs), &xattr, 8); if (err != 0 && err != ENOENT) goto out; zp->z_projid = projid; zp->z_pflags |= ZFS_PROJID; links = ZTONLNK(zp); count = 0; err = 0; SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_SIZE(zfsvfs), NULL, &zp->z_size, 8); SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_GEN(zfsvfs), NULL, &gen, 8); SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_UID(zfsvfs), NULL, &uid, 8); SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_GID(zfsvfs), NULL, &gid, 8); SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8); SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16); SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8); SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_PROJID(zfsvfs), NULL, &projid, 8); if (Z_ISBLK(ZTOTYPE(zp)) || Z_ISCHR(ZTOTYPE(zp))) SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_RDEV(zfsvfs), NULL, &rdev, 8); if (zp->z_acl_cached != NULL) { SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_DACL_COUNT(zfsvfs), NULL, &zp->z_acl_cached->z_acl_count, 8); if (zp->z_acl_cached->z_version < ZFS_ACL_VERSION_FUID) zfs_acl_xform(zp, zp->z_acl_cached, CRED()); locate.cb_aclp = zp->z_acl_cached; SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_DACL_ACES(zfsvfs), zfs_acl_data_locator, &locate, zp->z_acl_cached->z_acl_bytes); } if (xattr) SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_XATTR(zfsvfs), NULL, &xattr, 8); if (zp->z_pflags & ZFS_BONUS_SCANSTAMP) { 
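/* * The old-style znode kept the antivirus scanstamp in the bonus buffer * just past znode_phys_t; migrate it to a real SA attribute and clear * the flag so the migration only happens once. */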
memcpy(scanstamp, (caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE, AV_SCANSTAMP_SZ); SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_SCANSTAMP(zfsvfs), NULL, scanstamp, AV_SCANSTAMP_SZ); zp->z_pflags &= ~ZFS_BONUS_SCANSTAMP; } VERIFY(dmu_set_bonustype(db, DMU_OT_SA, tx) == 0); VERIFY(sa_replace_all_by_template_locked(hdl, attrs, count, tx) == 0); if (znode_acl.z_acl_extern_obj) { VERIFY(0 == dmu_object_free(zfsvfs->z_os, znode_acl.z_acl_extern_obj, tx)); } zp->z_is_sa = B_TRUE; out: mutex_exit(&zp->z_lock); mutex_exit(&hdl->sa_lock); kmem_free(attrs, sizeof (sa_bulk_attr_t) * ZPL_END); kmem_free(bulk, sizeof (sa_bulk_attr_t) * ZPL_END); return (err); } #endif static sa_idx_tab_t * sa_find_idx_tab(objset_t *os, dmu_object_type_t bonustype, sa_hdr_phys_t *hdr) { sa_idx_tab_t *idx_tab; sa_os_t *sa = os->os_sa; sa_lot_t *tb, search; avl_index_t loc; /* * Determine layout number. If SA node and header == 0 then * force the index table to the dummy "1" empty layout. * * The layout number would only be zero for a newly created file * that has not added any attributes yet, or one with crypto enabled, which * doesn't write any attributes to the bonus buffer. */ search.lot_num = SA_LAYOUT_NUM(hdr, bonustype); tb = avl_find(&sa->sa_layout_num_tree, &search, &loc); /* Verify header size is consistent with layout information */ ASSERT(tb); ASSERT((IS_SA_BONUSTYPE(bonustype) && SA_HDR_SIZE_MATCH_LAYOUT(hdr, tb)) || !IS_SA_BONUSTYPE(bonustype) || (IS_SA_BONUSTYPE(bonustype) && hdr->sa_layout_info == 0)); /* * See if any of the already existing TOC entries can be reused. */ for (idx_tab = list_head(&tb->lot_idx_tab); idx_tab; idx_tab = list_next(&tb->lot_idx_tab, idx_tab)) { boolean_t valid_idx = B_TRUE; int i; if (tb->lot_var_sizes != 0 && idx_tab->sa_variable_lengths != NULL) { for (i = 0; i != tb->lot_var_sizes; i++) { if (hdr->sa_lengths[i] != idx_tab->sa_variable_lengths[i]) { valid_idx = B_FALSE; break; } } } if (valid_idx) { sa_idx_tab_hold(os, idx_tab); return (idx_tab); } } /* No such luck, create a new entry */ idx_tab = kmem_zalloc(sizeof (sa_idx_tab_t), KM_SLEEP); idx_tab->sa_idx_tab = kmem_zalloc(sizeof (uint32_t) * sa->sa_num_attrs, KM_SLEEP); idx_tab->sa_layout = tb; zfs_refcount_create(&idx_tab->sa_refcount); if (tb->lot_var_sizes) idx_tab->sa_variable_lengths = kmem_alloc(sizeof (uint16_t) * tb->lot_var_sizes, KM_SLEEP); sa_attr_iter(os, hdr, bonustype, sa_build_idx_tab, tb, idx_tab); sa_idx_tab_hold(os, idx_tab); /* one hold for consumer */ sa_idx_tab_hold(os, idx_tab); /* one for layout */ list_insert_tail(&tb->lot_idx_tab, idx_tab); return (idx_tab); } void sa_default_locator(void **dataptr, uint32_t *len, uint32_t total_len, boolean_t start, void *userdata) { ASSERT(start); *dataptr = userdata; *len = total_len; } static void sa_attr_register_sync(sa_handle_t *hdl, dmu_tx_t *tx) { uint64_t attr_value = 0; sa_os_t *sa = hdl->sa_os->os_sa; sa_attr_table_t *tb = sa->sa_attr_table; int i; mutex_enter(&sa->sa_lock); if (!sa->sa_need_attr_registration || sa->sa_master_obj == 0) { mutex_exit(&sa->sa_lock); return; } if (sa->sa_reg_attr_obj == 0) { sa->sa_reg_attr_obj = zap_create_link(hdl->sa_os, DMU_OT_SA_ATTR_REGISTRATION, sa->sa_master_obj, SA_REGISTRY, tx); } for (i = 0; i != sa->sa_num_attrs; i++) { if (sa->sa_attr_table[i].sa_registered) continue; ATTR_ENCODE(attr_value, tb[i].sa_attr, tb[i].sa_length, tb[i].sa_byteswap); VERIFY(0 == zap_update(hdl->sa_os, sa->sa_reg_attr_obj, tb[i].sa_name, 8, 1, &attr_value, tx)); tb[i].sa_registered = B_TRUE; } sa->sa_need_attr_registration = B_FALSE;
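/* * Every entry in the attribute table is now persisted in the registry * ZAP, so later syncs are no-ops until a new attribute is registered. */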
mutex_exit(&sa->sa_lock); } /* * Replace all attributes with attributes specified in template. * If dnode had a spill buffer then those attributes will * also be replaced, possibly with just an empty spill block. * * This interface is intended to only be used for bulk adding of * attributes for a new file. It will also be used by the ZPL * when converting an old-format znode to native SA support. */ int sa_replace_all_by_template_locked(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count, dmu_tx_t *tx) { sa_os_t *sa = hdl->sa_os->os_sa; if (sa->sa_need_attr_registration) sa_attr_register_sync(hdl, tx); return (sa_build_layouts(hdl, attr_desc, attr_count, tx)); } int sa_replace_all_by_template(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count, dmu_tx_t *tx) { int error; mutex_enter(&hdl->sa_lock); error = sa_replace_all_by_template_locked(hdl, attr_desc, attr_count, tx); mutex_exit(&hdl->sa_lock); return (error); } /* * Add/remove a single attribute or replace a variable-sized attribute value * with a value of a different size, and then rewrite the entire set * of attributes. * Same-length attribute value replacement (including fixed-length attributes) * is handled more efficiently by the upper layers. */ static int sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr, sa_data_op_t action, sa_data_locator_t *locator, void *datastart, uint16_t buflen, dmu_tx_t *tx) { sa_os_t *sa = hdl->sa_os->os_sa; dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus; dnode_t *dn; sa_bulk_attr_t *attr_desc; void *old_data[2]; int bonus_attr_count = 0; int bonus_data_size = 0; int spill_data_size = 0; int spill_attr_count = 0; int error; uint16_t length, reg_length; int i, j, k, length_idx; sa_hdr_phys_t *hdr; sa_idx_tab_t *idx_tab; int attr_count; int count; ASSERT(MUTEX_HELD(&hdl->sa_lock)); /* First make a copy of the old data */ DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (dn->dn_bonuslen != 0) { bonus_data_size = hdl->sa_bonus->db_size; old_data[0] = kmem_alloc(bonus_data_size, KM_SLEEP); memcpy(old_data[0], hdl->sa_bonus->db_data, hdl->sa_bonus->db_size); bonus_attr_count = hdl->sa_bonus_tab->sa_layout->lot_attr_count; } else { old_data[0] = NULL; } DB_DNODE_EXIT(db); /* Bring spill buffer online if it isn't currently */ if ((error = sa_get_spill(hdl)) == 0) { spill_data_size = hdl->sa_spill->db_size; old_data[1] = vmem_alloc(spill_data_size, KM_SLEEP); memcpy(old_data[1], hdl->sa_spill->db_data, hdl->sa_spill->db_size); spill_attr_count = hdl->sa_spill_tab->sa_layout->lot_attr_count; } else if (error && error != ENOENT) { if (old_data[0]) kmem_free(old_data[0], bonus_data_size); return (error); } else { old_data[1] = NULL; } /* build descriptor of all attributes */ attr_count = bonus_attr_count + spill_attr_count; if (action == SA_ADD) attr_count++; else if (action == SA_REMOVE) attr_count--; attr_desc = kmem_zalloc(sizeof (sa_bulk_attr_t) * attr_count, KM_SLEEP); /* * Loop through the bonus buffer, and the spill buffer if one exists, * and build up a new attr descriptor to reset the attributes. */ k = j = 0; count = bonus_attr_count; hdr = SA_GET_HDR(hdl, SA_BONUS); idx_tab = SA_IDX_TAB_GET(hdl, SA_BONUS); - for (; k != 2; k++) { + for (; ; k++) { /* * Iterate over each attribute in layout. Fetch the * size of variable-length attributes needing rewrite * from sa_lengths[]. 
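Pass k == 0 walks the bonus * buffer; if a spill buffer exists, the bottom of the loop retargets * hdr, idx_tab and count at it for pass k == 1, after which the loop * breaks, so no explicit bound on k is needed. 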
*/ for (i = 0, length_idx = 0; i != count; i++) { sa_attr_type_t attr; attr = idx_tab->sa_layout->lot_attrs[i]; reg_length = SA_REGISTERED_LEN(sa, attr); if (reg_length == 0) { length = hdr->sa_lengths[length_idx]; length_idx++; } else { length = reg_length; } if (attr == newattr) { /* * There is nothing to do for SA_REMOVE, * so it is just skipped. */ if (action == SA_REMOVE) continue; /* * Duplicate attributes are not allowed, so the * action can not be SA_ADD here. */ ASSERT3S(action, ==, SA_REPLACE); /* * Only a variable-sized attribute can be * replaced here, and its size must be changing. */ ASSERT3U(reg_length, ==, 0); ASSERT3U(length, !=, buflen); SA_ADD_BULK_ATTR(attr_desc, j, attr, locator, datastart, buflen); } else { SA_ADD_BULK_ATTR(attr_desc, j, attr, NULL, (void *) (TOC_OFF(idx_tab->sa_idx_tab[attr]) + (uintptr_t)old_data[k]), length); } } if (k == 0 && hdl->sa_spill) { hdr = SA_GET_HDR(hdl, SA_SPILL); idx_tab = SA_IDX_TAB_GET(hdl, SA_SPILL); count = spill_attr_count; } else { break; } } if (action == SA_ADD) { reg_length = SA_REGISTERED_LEN(sa, newattr); IMPLY(reg_length != 0, reg_length == buflen); SA_ADD_BULK_ATTR(attr_desc, j, newattr, locator, datastart, buflen); } ASSERT3U(j, ==, attr_count); error = sa_build_layouts(hdl, attr_desc, attr_count, tx); if (old_data[0]) kmem_free(old_data[0], bonus_data_size); if (old_data[1]) vmem_free(old_data[1], spill_data_size); kmem_free(attr_desc, sizeof (sa_bulk_attr_t) * attr_count); return (error); } static int sa_bulk_update_impl(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count, dmu_tx_t *tx) { int error; sa_os_t *sa = hdl->sa_os->os_sa; dmu_object_type_t bonustype; dmu_buf_t *saved_spill; ASSERT(hdl); ASSERT(MUTEX_HELD(&hdl->sa_lock)); bonustype = SA_BONUSTYPE_FROM_DB(SA_GET_DB(hdl, SA_BONUS)); saved_spill = hdl->sa_spill; /* sync out registration table if necessary */ if (sa->sa_need_attr_registration) sa_attr_register_sync(hdl, tx); error = sa_attr_op(hdl, bulk, count, SA_UPDATE, tx); if (error == 0 && !IS_SA_BONUSTYPE(bonustype) && sa->sa_update_cb) sa->sa_update_cb(hdl, tx); /* * If saved_spill is NULL and current sa_spill is not NULL that * means we increased the refcount of the spill buffer through * sa_get_spill() or dmu_spill_hold_by_dnode(). Therefore we * must release the hold before calling dmu_tx_commit() to avoid * making a copy of this buffer in dbuf_sync_leaf() due to the * reference count now being greater than 1. 
*/ if (!saved_spill && hdl->sa_spill) { if (hdl->sa_spill_tab) { sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab); hdl->sa_spill_tab = NULL; } dmu_buf_rele(hdl->sa_spill, NULL); hdl->sa_spill = NULL; } return (error); } /* * update or add new attribute */ int sa_update(sa_handle_t *hdl, sa_attr_type_t type, void *buf, uint32_t buflen, dmu_tx_t *tx) { int error; sa_bulk_attr_t bulk; VERIFY3U(buflen, <=, SA_ATTR_MAX_LEN); bulk.sa_attr = type; bulk.sa_data_func = NULL; bulk.sa_length = buflen; bulk.sa_data = buf; mutex_enter(&hdl->sa_lock); error = sa_bulk_update_impl(hdl, &bulk, 1, tx); mutex_exit(&hdl->sa_lock); return (error); } /* * Return size of an attribute */ int sa_size(sa_handle_t *hdl, sa_attr_type_t attr, int *size) { sa_bulk_attr_t bulk; int error; bulk.sa_data = NULL; bulk.sa_attr = attr; bulk.sa_data_func = NULL; ASSERT(hdl); mutex_enter(&hdl->sa_lock); if ((error = sa_attr_op(hdl, &bulk, 1, SA_LOOKUP, NULL)) != 0) { mutex_exit(&hdl->sa_lock); return (error); } *size = bulk.sa_size; mutex_exit(&hdl->sa_lock); return (0); } int sa_bulk_lookup_locked(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count) { ASSERT(hdl); ASSERT(MUTEX_HELD(&hdl->sa_lock)); return (sa_lookup_impl(hdl, attrs, count)); } int sa_bulk_lookup(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count) { int error; ASSERT(hdl); mutex_enter(&hdl->sa_lock); error = sa_bulk_lookup_locked(hdl, attrs, count); mutex_exit(&hdl->sa_lock); return (error); } int sa_bulk_update(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count, dmu_tx_t *tx) { int error; ASSERT(hdl); mutex_enter(&hdl->sa_lock); error = sa_bulk_update_impl(hdl, attrs, count, tx); mutex_exit(&hdl->sa_lock); return (error); } int sa_remove(sa_handle_t *hdl, sa_attr_type_t attr, dmu_tx_t *tx) { int error; mutex_enter(&hdl->sa_lock); error = sa_modify_attrs(hdl, attr, SA_REMOVE, NULL, NULL, 0, tx); mutex_exit(&hdl->sa_lock); return (error); } void sa_object_info(sa_handle_t *hdl, dmu_object_info_t *doi) { dmu_object_info_from_db(hdl->sa_bonus, doi); } void sa_object_size(sa_handle_t *hdl, uint32_t *blksize, u_longlong_t *nblocks) { dmu_object_size_from_db(hdl->sa_bonus, blksize, nblocks); } void sa_set_userp(sa_handle_t *hdl, void *ptr) { hdl->sa_userp = ptr; } dmu_buf_t * sa_get_db(sa_handle_t *hdl) { return (hdl->sa_bonus); } void * sa_get_userdata(sa_handle_t *hdl) { return (hdl->sa_userp); } void sa_register_update_callback_locked(objset_t *os, sa_update_cb_t *func) { ASSERT(MUTEX_HELD(&os->os_sa->sa_lock)); os->os_sa->sa_update_cb = func; } void sa_register_update_callback(objset_t *os, sa_update_cb_t *func) { mutex_enter(&os->os_sa->sa_lock); sa_register_update_callback_locked(os, func); mutex_exit(&os->os_sa->sa_lock); } uint64_t sa_handle_object(sa_handle_t *hdl) { return (hdl->sa_bonus->db_object); } boolean_t sa_enabled(objset_t *os) { return (os->os_sa == NULL); } int sa_set_sa_object(objset_t *os, uint64_t sa_object) { sa_os_t *sa = os->os_sa; if (sa->sa_master_obj) return (1); sa->sa_master_obj = sa_object; return (0); } int sa_hdrsize(void *arg) { sa_hdr_phys_t *hdr = arg; return (SA_HDR_SIZE(hdr)); } void sa_handle_lock(sa_handle_t *hdl) { ASSERT(hdl); mutex_enter(&hdl->sa_lock); } void sa_handle_unlock(sa_handle_t *hdl) { ASSERT(hdl); mutex_exit(&hdl->sa_lock); } #ifdef _KERNEL EXPORT_SYMBOL(sa_handle_get); EXPORT_SYMBOL(sa_handle_get_from_db); EXPORT_SYMBOL(sa_handle_destroy); EXPORT_SYMBOL(sa_buf_hold); EXPORT_SYMBOL(sa_buf_rele); EXPORT_SYMBOL(sa_spill_rele); EXPORT_SYMBOL(sa_lookup); EXPORT_SYMBOL(sa_update); EXPORT_SYMBOL(sa_remove); 
EXPORT_SYMBOL(sa_bulk_lookup); EXPORT_SYMBOL(sa_bulk_lookup_locked); EXPORT_SYMBOL(sa_bulk_update); EXPORT_SYMBOL(sa_size); EXPORT_SYMBOL(sa_object_info); EXPORT_SYMBOL(sa_object_size); EXPORT_SYMBOL(sa_get_userdata); EXPORT_SYMBOL(sa_set_userp); EXPORT_SYMBOL(sa_get_db); EXPORT_SYMBOL(sa_handle_object); EXPORT_SYMBOL(sa_register_update_callback); EXPORT_SYMBOL(sa_setup); EXPORT_SYMBOL(sa_replace_all_by_template); EXPORT_SYMBOL(sa_replace_all_by_template_locked); EXPORT_SYMBOL(sa_enabled); EXPORT_SYMBOL(sa_cache_init); EXPORT_SYMBOL(sa_cache_fini); EXPORT_SYMBOL(sa_set_sa_object); EXPORT_SYMBOL(sa_hdrsize); EXPORT_SYMBOL(sa_handle_lock); EXPORT_SYMBOL(sa_handle_unlock); EXPORT_SYMBOL(sa_lookup_uio); EXPORT_SYMBOL(sa_add_projid); #endif /* _KERNEL */ diff --git a/module/zfs/zap_leaf.c b/module/zfs/zap_leaf.c index 2e8489c7dfcf..e6afb1c58c95 100644 --- a/module/zfs/zap_leaf.c +++ b/module/zfs/zap_leaf.c @@ -1,848 +1,848 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013, 2016 by Delphix. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. */ /* * The 512-byte leaf is broken into 32 16-byte chunks. * chunk number n means l_chunk[n], even though the header precedes it. * the names are stored null-terminated. 
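* Each chunk is one of: a zap_leaf_entry (an entry header), a piece of a * name or value byte stream (zap_leaf_array, chained through la_next), or * a free-list element (zap_leaf_free); chunk chains end in CHAIN_END. 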
*/ #include #include #include #include #include #include #include #include #include static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry); #define CHAIN_END 0xffff /* end of the chunk chain */ #define LEAF_HASH(l, h) \ ((ZAP_LEAF_HASH_NUMENTRIES(l)-1) & \ ((h) >> \ (64 - ZAP_LEAF_HASH_SHIFT(l) - zap_leaf_phys(l)->l_hdr.lh_prefix_len))) #define LEAF_HASH_ENTPTR(l, h) (&zap_leaf_phys(l)->l_hash[LEAF_HASH(l, h)]) static void zap_memset(void *a, int c, size_t n) { char *cp = a; char *cpend = cp + n; while (cp < cpend) *cp++ = c; } static void stv(int len, void *addr, uint64_t value) { switch (len) { case 1: *(uint8_t *)addr = value; return; case 2: *(uint16_t *)addr = value; return; case 4: *(uint32_t *)addr = value; return; case 8: *(uint64_t *)addr = value; return; default: cmn_err(CE_PANIC, "bad int len %d", len); } } static uint64_t ldv(int len, const void *addr) { switch (len) { case 1: return (*(uint8_t *)addr); case 2: return (*(uint16_t *)addr); case 4: return (*(uint32_t *)addr); case 8: return (*(uint64_t *)addr); default: cmn_err(CE_PANIC, "bad int len %d", len); } return (0xFEEDFACEDEADBEEFULL); } void zap_leaf_byteswap(zap_leaf_phys_t *buf, int size) { zap_leaf_t l; dmu_buf_t l_dbuf; l_dbuf.db_data = buf; l.l_bs = highbit64(size) - 1; l.l_dbuf = &l_dbuf; buf->l_hdr.lh_block_type = BSWAP_64(buf->l_hdr.lh_block_type); buf->l_hdr.lh_prefix = BSWAP_64(buf->l_hdr.lh_prefix); buf->l_hdr.lh_magic = BSWAP_32(buf->l_hdr.lh_magic); buf->l_hdr.lh_nfree = BSWAP_16(buf->l_hdr.lh_nfree); buf->l_hdr.lh_nentries = BSWAP_16(buf->l_hdr.lh_nentries); buf->l_hdr.lh_prefix_len = BSWAP_16(buf->l_hdr.lh_prefix_len); buf->l_hdr.lh_freelist = BSWAP_16(buf->l_hdr.lh_freelist); for (int i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(&l); i++) buf->l_hash[i] = BSWAP_16(buf->l_hash[i]); for (int i = 0; i < ZAP_LEAF_NUMCHUNKS(&l); i++) { zap_leaf_chunk_t *lc = &ZAP_LEAF_CHUNK(&l, i); struct zap_leaf_entry *le; switch (lc->l_free.lf_type) { case ZAP_CHUNK_ENTRY: le = &lc->l_entry; le->le_type = BSWAP_8(le->le_type); le->le_value_intlen = BSWAP_8(le->le_value_intlen); le->le_next = BSWAP_16(le->le_next); le->le_name_chunk = BSWAP_16(le->le_name_chunk); le->le_name_numints = BSWAP_16(le->le_name_numints); le->le_value_chunk = BSWAP_16(le->le_value_chunk); le->le_value_numints = BSWAP_16(le->le_value_numints); le->le_cd = BSWAP_32(le->le_cd); le->le_hash = BSWAP_64(le->le_hash); break; case ZAP_CHUNK_FREE: lc->l_free.lf_type = BSWAP_8(lc->l_free.lf_type); lc->l_free.lf_next = BSWAP_16(lc->l_free.lf_next); break; case ZAP_CHUNK_ARRAY: lc->l_array.la_type = BSWAP_8(lc->l_array.la_type); lc->l_array.la_next = BSWAP_16(lc->l_array.la_next); /* la_array doesn't need swapping */ break; default: cmn_err(CE_PANIC, "bad leaf type %d", lc->l_free.lf_type); } } } void zap_leaf_init(zap_leaf_t *l, boolean_t sort) { l->l_bs = highbit64(l->l_dbuf->db_size) - 1; zap_memset(&zap_leaf_phys(l)->l_hdr, 0, sizeof (struct zap_leaf_header)); zap_memset(zap_leaf_phys(l)->l_hash, CHAIN_END, 2*ZAP_LEAF_HASH_NUMENTRIES(l)); for (int i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) { ZAP_LEAF_CHUNK(l, i).l_free.lf_type = ZAP_CHUNK_FREE; ZAP_LEAF_CHUNK(l, i).l_free.lf_next = i+1; } ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)-1).l_free.lf_next = CHAIN_END; zap_leaf_phys(l)->l_hdr.lh_block_type = ZBT_LEAF; zap_leaf_phys(l)->l_hdr.lh_magic = ZAP_LEAF_MAGIC; zap_leaf_phys(l)->l_hdr.lh_nfree = ZAP_LEAF_NUMCHUNKS(l); if (sort) zap_leaf_phys(l)->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED; } /* * Routines which manipulate leaf chunks (l_chunk[]). 
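Free chunks form a singly-linked list * threaded through lf_next, rooted at lh_freelist and terminated by * CHAIN_END: allocation pops the head, freeing pushes the chunk back on. * An illustrative round trip (a sketch, not part of this change), assuming * 'l' is a held, dirtied leaf: * *	uint16_t c = zap_leaf_chunk_alloc(l); *	ZAP_LEAF_CHUNK(l, c).l_array.la_type = ZAP_CHUNK_ARRAY; *	... fill ZAP_LEAF_CHUNK(l, c).l_array.la_array ... *	zap_leaf_chunk_free(l, c); 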
*/ static uint16_t zap_leaf_chunk_alloc(zap_leaf_t *l) { ASSERT(zap_leaf_phys(l)->l_hdr.lh_nfree > 0); int chunk = zap_leaf_phys(l)->l_hdr.lh_freelist; ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_free.lf_type, ==, ZAP_CHUNK_FREE); zap_leaf_phys(l)->l_hdr.lh_freelist = ZAP_LEAF_CHUNK(l, chunk).l_free.lf_next; zap_leaf_phys(l)->l_hdr.lh_nfree--; return (chunk); } static void zap_leaf_chunk_free(zap_leaf_t *l, uint16_t chunk) { struct zap_leaf_free *zlf = &ZAP_LEAF_CHUNK(l, chunk).l_free; ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_nfree, <, ZAP_LEAF_NUMCHUNKS(l)); ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); ASSERT(zlf->lf_type != ZAP_CHUNK_FREE); zlf->lf_type = ZAP_CHUNK_FREE; zlf->lf_next = zap_leaf_phys(l)->l_hdr.lh_freelist; memset(zlf->lf_pad, 0, sizeof (zlf->lf_pad)); /* help it to compress */ zap_leaf_phys(l)->l_hdr.lh_freelist = chunk; zap_leaf_phys(l)->l_hdr.lh_nfree++; } /* * Routines which manipulate leaf arrays (zap_leaf_array type chunks). */ static uint16_t zap_leaf_array_create(zap_leaf_t *l, const char *buf, int integer_size, int num_integers) { uint16_t chunk_head; uint16_t *chunkp = &chunk_head; int byten = 0; uint64_t value = 0; int shift = (integer_size - 1) * 8; int len = num_integers; ASSERT3U(num_integers * integer_size, <=, ZAP_MAXVALUELEN); while (len > 0) { uint16_t chunk = zap_leaf_chunk_alloc(l); struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; la->la_type = ZAP_CHUNK_ARRAY; for (int i = 0; i < ZAP_LEAF_ARRAY_BYTES; i++) { if (byten == 0) value = ldv(integer_size, buf); la->la_array[i] = value >> shift; value <<= 8; if (++byten == integer_size) { byten = 0; buf += integer_size; if (--len == 0) break; } } *chunkp = chunk; chunkp = &la->la_next; } *chunkp = CHAIN_END; return (chunk_head); } static void zap_leaf_array_free(zap_leaf_t *l, uint16_t *chunkp) { uint16_t chunk = *chunkp; *chunkp = CHAIN_END; while (chunk != CHAIN_END) { int nextchunk = ZAP_LEAF_CHUNK(l, chunk).l_array.la_next; ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_array.la_type, ==, ZAP_CHUNK_ARRAY); zap_leaf_chunk_free(l, chunk); chunk = nextchunk; } } /* array_len and buf_len are in integers, not bytes */ static void zap_leaf_array_read(zap_leaf_t *l, uint16_t chunk, int array_int_len, int array_len, int buf_int_len, uint64_t buf_len, void *buf) { int len = MIN(array_len, buf_len); int byten = 0; uint64_t value = 0; char *p = buf; ASSERT3U(array_int_len, <=, buf_int_len); /* Fast path for one 8-byte integer */ if (array_int_len == 8 && buf_int_len == 8 && len == 1) { struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; uint8_t *ip = la->la_array; uint64_t *buf64 = buf; *buf64 = (uint64_t)ip[0] << 56 | (uint64_t)ip[1] << 48 | (uint64_t)ip[2] << 40 | (uint64_t)ip[3] << 32 | (uint64_t)ip[4] << 24 | (uint64_t)ip[5] << 16 | (uint64_t)ip[6] << 8 | (uint64_t)ip[7]; return; } /* Fast path for an array of 1-byte integers (eg. 
the entry name) */ if (array_int_len == 1 && buf_int_len == 1 && buf_len > array_len + ZAP_LEAF_ARRAY_BYTES) { while (chunk != CHAIN_END) { struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; memcpy(p, la->la_array, ZAP_LEAF_ARRAY_BYTES); p += ZAP_LEAF_ARRAY_BYTES; chunk = la->la_next; } return; } while (len > 0) { struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); - for (int i = 0; i < ZAP_LEAF_ARRAY_BYTES && len > 0; i++) { + for (int i = 0; i < ZAP_LEAF_ARRAY_BYTES; i++) { value = (value << 8) | la->la_array[i]; byten++; if (byten == array_int_len) { stv(buf_int_len, p, value); byten = 0; len--; if (len == 0) return; p += buf_int_len; } } chunk = la->la_next; } } static boolean_t zap_leaf_array_match(zap_leaf_t *l, zap_name_t *zn, int chunk, int array_numints) { int bseen = 0; if (zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY) { uint64_t *thiskey = kmem_alloc(array_numints * sizeof (*thiskey), KM_SLEEP); ASSERT(zn->zn_key_intlen == sizeof (*thiskey)); zap_leaf_array_read(l, chunk, sizeof (*thiskey), array_numints, sizeof (*thiskey), array_numints, thiskey); boolean_t match = memcmp(thiskey, zn->zn_key_orig, array_numints * sizeof (*thiskey)) == 0; kmem_free(thiskey, array_numints * sizeof (*thiskey)); return (match); } ASSERT(zn->zn_key_intlen == 1); if (zn->zn_matchtype & MT_NORMALIZE) { char *thisname = kmem_alloc(array_numints, KM_SLEEP); zap_leaf_array_read(l, chunk, sizeof (char), array_numints, sizeof (char), array_numints, thisname); boolean_t match = zap_match(zn, thisname); kmem_free(thisname, array_numints); return (match); } /* * Fast path for exact matching. * First check that the lengths match, so that we don't read * past the end of the zn_key_orig array. */ if (array_numints != zn->zn_key_orig_numints) return (B_FALSE); while (bseen < array_numints) { struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; int toread = MIN(array_numints - bseen, ZAP_LEAF_ARRAY_BYTES); ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); if (memcmp(la->la_array, (char *)zn->zn_key_orig + bseen, toread)) break; chunk = la->la_next; bseen += toread; } return (bseen == array_numints); } /* * Routines which manipulate leaf entries. */ int zap_leaf_lookup(zap_leaf_t *l, zap_name_t *zn, zap_entry_handle_t *zeh) { struct zap_leaf_entry *le; ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC); for (uint16_t *chunkp = LEAF_HASH_ENTPTR(l, zn->zn_hash); *chunkp != CHAIN_END; chunkp = &le->le_next) { uint16_t chunk = *chunkp; le = ZAP_LEAF_ENTRY(l, chunk); ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); if (le->le_hash != zn->zn_hash) continue; /* * NB: the entry chain is always sorted by cd on * normalized zap objects, so this will find the * lowest-cd match for MT_NORMALIZE. */ ASSERT((zn->zn_matchtype == 0) || (zap_leaf_phys(l)->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED)); if (zap_leaf_array_match(l, zn, le->le_name_chunk, le->le_name_numints)) { zeh->zeh_num_integers = le->le_value_numints; zeh->zeh_integer_size = le->le_value_intlen; zeh->zeh_cd = le->le_cd; zeh->zeh_hash = le->le_hash; zeh->zeh_chunkp = chunkp; zeh->zeh_leaf = l; return (0); } } return (SET_ERROR(ENOENT)); } /* Return (h1,cd1 >= h2,cd2) */ #define HCD_GTEQ(h1, cd1, h2, cd2) \ ((h1 > h2) ? TRUE : ((h1 == h2 && cd1 >= cd2) ? 
TRUE : FALSE)) int zap_leaf_lookup_closest(zap_leaf_t *l, uint64_t h, uint32_t cd, zap_entry_handle_t *zeh) { uint64_t besth = -1ULL; uint32_t bestcd = -1U; uint16_t bestlh = ZAP_LEAF_HASH_NUMENTRIES(l)-1; struct zap_leaf_entry *le; ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC); for (uint16_t lh = LEAF_HASH(l, h); lh <= bestlh; lh++) { for (uint16_t chunk = zap_leaf_phys(l)->l_hash[lh]; chunk != CHAIN_END; chunk = le->le_next) { le = ZAP_LEAF_ENTRY(l, chunk); ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); if (HCD_GTEQ(le->le_hash, le->le_cd, h, cd) && HCD_GTEQ(besth, bestcd, le->le_hash, le->le_cd)) { ASSERT3U(bestlh, >=, lh); bestlh = lh; besth = le->le_hash; bestcd = le->le_cd; zeh->zeh_num_integers = le->le_value_numints; zeh->zeh_integer_size = le->le_value_intlen; zeh->zeh_cd = le->le_cd; zeh->zeh_hash = le->le_hash; zeh->zeh_fakechunk = chunk; zeh->zeh_chunkp = &zeh->zeh_fakechunk; zeh->zeh_leaf = l; } } } return (bestcd == -1U ? SET_ERROR(ENOENT) : 0); } int zap_entry_read(const zap_entry_handle_t *zeh, uint8_t integer_size, uint64_t num_integers, void *buf) { struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(zeh->zeh_leaf, *zeh->zeh_chunkp); ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); if (le->le_value_intlen > integer_size) return (SET_ERROR(EINVAL)); zap_leaf_array_read(zeh->zeh_leaf, le->le_value_chunk, le->le_value_intlen, le->le_value_numints, integer_size, num_integers, buf); if (zeh->zeh_num_integers > num_integers) return (SET_ERROR(EOVERFLOW)); return (0); } int zap_entry_read_name(zap_t *zap, const zap_entry_handle_t *zeh, uint16_t buflen, char *buf) { struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(zeh->zeh_leaf, *zeh->zeh_chunkp); ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) { zap_leaf_array_read(zeh->zeh_leaf, le->le_name_chunk, 8, le->le_name_numints, 8, buflen / 8, buf); } else { zap_leaf_array_read(zeh->zeh_leaf, le->le_name_chunk, 1, le->le_name_numints, 1, buflen, buf); } if (le->le_name_numints > buflen) return (SET_ERROR(EOVERFLOW)); return (0); } int zap_entry_update(zap_entry_handle_t *zeh, uint8_t integer_size, uint64_t num_integers, const void *buf) { zap_leaf_t *l = zeh->zeh_leaf; struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, *zeh->zeh_chunkp); int delta_chunks = ZAP_LEAF_ARRAY_NCHUNKS(num_integers * integer_size) - ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_numints * le->le_value_intlen); if ((int)zap_leaf_phys(l)->l_hdr.lh_nfree < delta_chunks) return (SET_ERROR(EAGAIN)); zap_leaf_array_free(l, &le->le_value_chunk); le->le_value_chunk = zap_leaf_array_create(l, buf, integer_size, num_integers); le->le_value_numints = num_integers; le->le_value_intlen = integer_size; return (0); } void zap_entry_remove(zap_entry_handle_t *zeh) { zap_leaf_t *l = zeh->zeh_leaf; ASSERT3P(zeh->zeh_chunkp, !=, &zeh->zeh_fakechunk); uint16_t entry_chunk = *zeh->zeh_chunkp; struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry_chunk); ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); zap_leaf_array_free(l, &le->le_name_chunk); zap_leaf_array_free(l, &le->le_value_chunk); *zeh->zeh_chunkp = le->le_next; zap_leaf_chunk_free(l, entry_chunk); zap_leaf_phys(l)->l_hdr.lh_nentries--; } int zap_entry_create(zap_leaf_t *l, zap_name_t *zn, uint32_t cd, uint8_t integer_size, uint64_t num_integers, const void *buf, zap_entry_handle_t *zeh) { uint16_t chunk; struct zap_leaf_entry *le; uint64_t h = zn->zn_hash; uint64_t valuelen = integer_size * num_integers; int numchunks = 1 + 
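/* one chunk for the entry itself, plus its name and value arrays */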
ZAP_LEAF_ARRAY_NCHUNKS(zn->zn_key_orig_numints * zn->zn_key_intlen) + ZAP_LEAF_ARRAY_NCHUNKS(valuelen); if (numchunks > ZAP_LEAF_NUMCHUNKS(l)) return (SET_ERROR(E2BIG)); if (cd == ZAP_NEED_CD) { /* find the lowest unused cd */ if (zap_leaf_phys(l)->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED) { cd = 0; for (chunk = *LEAF_HASH_ENTPTR(l, h); chunk != CHAIN_END; chunk = le->le_next) { le = ZAP_LEAF_ENTRY(l, chunk); if (le->le_cd > cd) break; if (le->le_hash == h) { ASSERT3U(cd, ==, le->le_cd); cd++; } } } else { /* old unsorted format; do it the O(n^2) way */ for (cd = 0; ; cd++) { for (chunk = *LEAF_HASH_ENTPTR(l, h); chunk != CHAIN_END; chunk = le->le_next) { le = ZAP_LEAF_ENTRY(l, chunk); if (le->le_hash == h && le->le_cd == cd) { break; } } /* If this cd is not in use, we are good. */ if (chunk == CHAIN_END) break; } } /* * We would run out of space in a block before we could * store enough entries to run out of CD values. */ ASSERT3U(cd, <, zap_maxcd(zn->zn_zap)); } if (zap_leaf_phys(l)->l_hdr.lh_nfree < numchunks) return (SET_ERROR(EAGAIN)); /* make the entry */ chunk = zap_leaf_chunk_alloc(l); le = ZAP_LEAF_ENTRY(l, chunk); le->le_type = ZAP_CHUNK_ENTRY; le->le_name_chunk = zap_leaf_array_create(l, zn->zn_key_orig, zn->zn_key_intlen, zn->zn_key_orig_numints); le->le_name_numints = zn->zn_key_orig_numints; le->le_value_chunk = zap_leaf_array_create(l, buf, integer_size, num_integers); le->le_value_numints = num_integers; le->le_value_intlen = integer_size; le->le_hash = h; le->le_cd = cd; /* link it into the hash chain */ /* XXX if we did the search above, we could just use that */ uint16_t *chunkp = zap_leaf_rehash_entry(l, chunk); zap_leaf_phys(l)->l_hdr.lh_nentries++; zeh->zeh_leaf = l; zeh->zeh_num_integers = num_integers; zeh->zeh_integer_size = le->le_value_intlen; zeh->zeh_cd = le->le_cd; zeh->zeh_hash = le->le_hash; zeh->zeh_chunkp = chunkp; return (0); } /* * Determine if there is another entry with the same normalized form. * For performance purposes, either zn or name must be provided (the * other can be NULL). Note, there usually won't be any hash * conflicts, in which case we don't need the concatenated/normalized * form of the name. But all callers have one of these on hand anyway, * so might as well take advantage. A cleaner but slower interface * would accept neither argument, and compute the normalized name as * needed (using zap_name_alloc_str(zap_entry_read_name(zeh))). */ boolean_t zap_entry_normalization_conflict(zap_entry_handle_t *zeh, zap_name_t *zn, const char *name, zap_t *zap) { struct zap_leaf_entry *le; boolean_t allocdzn = B_FALSE; if (zap->zap_normflags == 0) return (B_FALSE); for (uint16_t chunk = *LEAF_HASH_ENTPTR(zeh->zeh_leaf, zeh->zeh_hash); chunk != CHAIN_END; chunk = le->le_next) { le = ZAP_LEAF_ENTRY(zeh->zeh_leaf, chunk); if (le->le_hash != zeh->zeh_hash) continue; if (le->le_cd == zeh->zeh_cd) continue; if (zn == NULL) { zn = zap_name_alloc_str(zap, name, MT_NORMALIZE); allocdzn = B_TRUE; } if (zap_leaf_array_match(zeh->zeh_leaf, zn, le->le_name_chunk, le->le_name_numints)) { if (allocdzn) zap_name_free(zn); return (B_TRUE); } } if (allocdzn) zap_name_free(zn); return (B_FALSE); } /* * Routines for transferring entries between leafs. */ static uint16_t * zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry) { struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry); struct zap_leaf_entry *le2; uint16_t *chunkp; /* * keep the entry chain sorted by cd * NB: this will not cause problems for unsorted leafs, though * it is unnecessary there. 
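Insertion walks the bucket's chain and * splices the entry in front of the first element with a larger cd. 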
*/ for (chunkp = LEAF_HASH_ENTPTR(l, le->le_hash); *chunkp != CHAIN_END; chunkp = &le2->le_next) { le2 = ZAP_LEAF_ENTRY(l, *chunkp); if (le2->le_cd > le->le_cd) break; } le->le_next = *chunkp; *chunkp = entry; return (chunkp); } static uint16_t zap_leaf_transfer_array(zap_leaf_t *l, uint16_t chunk, zap_leaf_t *nl) { uint16_t new_chunk; uint16_t *nchunkp = &new_chunk; while (chunk != CHAIN_END) { uint16_t nchunk = zap_leaf_chunk_alloc(nl); struct zap_leaf_array *nla = &ZAP_LEAF_CHUNK(nl, nchunk).l_array; struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; int nextchunk = la->la_next; ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); ASSERT3U(nchunk, <, ZAP_LEAF_NUMCHUNKS(l)); *nla = *la; /* structure assignment */ zap_leaf_chunk_free(l, chunk); chunk = nextchunk; *nchunkp = nchunk; nchunkp = &nla->la_next; } *nchunkp = CHAIN_END; return (new_chunk); } static void zap_leaf_transfer_entry(zap_leaf_t *l, int entry, zap_leaf_t *nl) { struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry); ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); uint16_t chunk = zap_leaf_chunk_alloc(nl); struct zap_leaf_entry *nle = ZAP_LEAF_ENTRY(nl, chunk); *nle = *le; /* structure assignment */ (void) zap_leaf_rehash_entry(nl, chunk); nle->le_name_chunk = zap_leaf_transfer_array(l, le->le_name_chunk, nl); nle->le_value_chunk = zap_leaf_transfer_array(l, le->le_value_chunk, nl); zap_leaf_chunk_free(l, entry); zap_leaf_phys(l)->l_hdr.lh_nentries--; zap_leaf_phys(nl)->l_hdr.lh_nentries++; } /* * Transfer the entries whose hash prefix ends in 1 to the new leaf. */ void zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort) { int bit = 64 - 1 - zap_leaf_phys(l)->l_hdr.lh_prefix_len; /* set new prefix and prefix_len */ zap_leaf_phys(l)->l_hdr.lh_prefix <<= 1; zap_leaf_phys(l)->l_hdr.lh_prefix_len++; zap_leaf_phys(nl)->l_hdr.lh_prefix = zap_leaf_phys(l)->l_hdr.lh_prefix | 1; zap_leaf_phys(nl)->l_hdr.lh_prefix_len = zap_leaf_phys(l)->l_hdr.lh_prefix_len; /* break existing hash chains */ zap_memset(zap_leaf_phys(l)->l_hash, CHAIN_END, 2*ZAP_LEAF_HASH_NUMENTRIES(l)); if (sort) zap_leaf_phys(l)->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED; /* * Transfer entries whose hash bit 'bit' is set to nl; rehash * the remaining entries * * NB: We could find entries via the hashtable instead. That * would be O(hashents+numents) rather than O(numblks+numents), * but this accesses memory more sequentially, and when we're * called, the block is usually pretty full. 
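'bit' is the hash bit that now * distinguishes the two leaves: entries with it set belong in nl, the rest * stay here and are rehashed into the freshly cleared hash table. 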
*/ for (int i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) { struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, i); if (le->le_type != ZAP_CHUNK_ENTRY) continue; if (le->le_hash & (1ULL << bit)) zap_leaf_transfer_entry(l, i, nl); else (void) zap_leaf_rehash_entry(l, i); } } void zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs) { int n = zap_f_phys(zap)->zap_ptrtbl.zt_shift - zap_leaf_phys(l)->l_hdr.lh_prefix_len; n = MIN(n, ZAP_HISTOGRAM_SIZE-1); zs->zs_leafs_with_2n_pointers[n]++; n = zap_leaf_phys(l)->l_hdr.lh_nentries/5; n = MIN(n, ZAP_HISTOGRAM_SIZE-1); zs->zs_blocks_with_n5_entries[n]++; n = ((1<<FZAP_BLOCK_SHIFT(zap)) - zap_leaf_phys(l)->l_hdr.lh_nfree * (ZAP_LEAF_ARRAY_BYTES+1))*10 / (1<<FZAP_BLOCK_SHIFT(zap)); n = MIN(n, ZAP_HISTOGRAM_SIZE-1); zs->zs_blocks_n_tenths_full[n]++; for (int i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(l); i++) { int nentries = 0; int chunk = zap_leaf_phys(l)->l_hash[i]; while (chunk != CHAIN_END) { struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, chunk); n = 1 + ZAP_LEAF_ARRAY_NCHUNKS(le->le_name_numints) + ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_numints * le->le_value_intlen); n = MIN(n, ZAP_HISTOGRAM_SIZE-1); zs->zs_entries_using_n_chunks[n]++; chunk = le->le_next; nentries++; } n = nentries; n = MIN(n, ZAP_HISTOGRAM_SIZE-1); zs->zs_buckets_with_n_entries[n]++; } }