Index: vendor/illumos/dist/lib/libzfs/common/libzfs_pool.c =================================================================== --- vendor/illumos/dist/lib/libzfs/common/libzfs_pool.c (revision 289525) +++ vendor/illumos/dist/lib/libzfs/common/libzfs_pool.c (revision 289526) @@ -1,4117 +1,4042 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2014 by Delphix. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_namecheck.h" #include "zfs_prop.h" #include "libzfs_impl.h" #include "zfs_comutil.h" #include "zfeature_common.h" static int read_efi_label(nvlist_t *config, diskaddr_t *sb); #define DISK_ROOT "/dev/dsk" #define RDISK_ROOT "/dev/rdsk" #define BACKUP_SLICE "s2" typedef struct prop_flags { int create:1; /* Validate property on creation */ int import:1; /* Validate property on import */ } prop_flags_t; /* * ==================================================================== * zpool property functions * ==================================================================== */ static int zpool_get_all_props(zpool_handle_t *zhp) { zfs_cmd_t zc = { 0 }; libzfs_handle_t *hdl = zhp->zpool_hdl; (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); if (zcmd_alloc_dst_nvlist(hdl, &zc, 0) != 0) return (-1); while (ioctl(hdl->libzfs_fd, ZFS_IOC_POOL_GET_PROPS, &zc) != 0) { if (errno == ENOMEM) { if (zcmd_expand_dst_nvlist(hdl, &zc) != 0) { zcmd_free_nvlists(&zc); return (-1); } } else { zcmd_free_nvlists(&zc); return (-1); } } if (zcmd_read_dst_nvlist(hdl, &zc, &zhp->zpool_props) != 0) { zcmd_free_nvlists(&zc); return (-1); } zcmd_free_nvlists(&zc); return (0); } static int zpool_props_refresh(zpool_handle_t *zhp) { nvlist_t *old_props; old_props = zhp->zpool_props; if (zpool_get_all_props(zhp) != 0) return (-1); nvlist_free(old_props); return (0); } static char * zpool_get_prop_string(zpool_handle_t *zhp, zpool_prop_t prop, zprop_source_t *src) { nvlist_t *nv, *nvl; uint64_t ival; char *value; zprop_source_t source; nvl = zhp->zpool_props; if (nvlist_lookup_nvlist(nvl, zpool_prop_to_name(prop), &nv) == 0) { verify(nvlist_lookup_uint64(nv, ZPROP_SOURCE, &ival) == 0); source = ival; verify(nvlist_lookup_string(nv, ZPROP_VALUE, &value) == 0); } else { source = ZPROP_SRC_DEFAULT; if ((value = (char *)zpool_prop_default_string(prop)) == NULL) value = "-"; } if (src) *src = source; return (value); } uint64_t zpool_get_prop_int(zpool_handle_t *zhp, zpool_prop_t prop, zprop_source_t *src) { nvlist_t *nv, *nvl; uint64_t value; zprop_source_t source; if (zhp->zpool_props == NULL && zpool_get_all_props(zhp)) { /* * zpool_get_all_props() has most likely failed because * the pool is faulted, but if all we need is the top level * vdev's guid then get it from the zhp config nvlist. */ if ((prop == ZPOOL_PROP_GUID) && (nvlist_lookup_nvlist(zhp->zpool_config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0) && (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &value) == 0)) { return (value); } return (zpool_prop_default_numeric(prop)); } nvl = zhp->zpool_props; if (nvlist_lookup_nvlist(nvl, zpool_prop_to_name(prop), &nv) == 0) { verify(nvlist_lookup_uint64(nv, ZPROP_SOURCE, &value) == 0); source = value; verify(nvlist_lookup_uint64(nv, ZPROP_VALUE, &value) == 0); } else { source = ZPROP_SRC_DEFAULT; value = zpool_prop_default_numeric(prop); } if (src) *src = source; return (value); } /* * Map VDEV STATE to printed strings. */ char * zpool_state_to_name(vdev_state_t state, vdev_aux_t aux) { switch (state) { case VDEV_STATE_CLOSED: case VDEV_STATE_OFFLINE: return (gettext("OFFLINE")); case VDEV_STATE_REMOVED: return (gettext("REMOVED")); case VDEV_STATE_CANT_OPEN: if (aux == VDEV_AUX_CORRUPT_DATA || aux == VDEV_AUX_BAD_LOG) return (gettext("FAULTED")); else if (aux == VDEV_AUX_SPLIT_POOL) return (gettext("SPLIT")); else return (gettext("UNAVAIL")); case VDEV_STATE_FAULTED: return (gettext("FAULTED")); case VDEV_STATE_DEGRADED: return (gettext("DEGRADED")); case VDEV_STATE_HEALTHY: return (gettext("ONLINE")); } return (gettext("UNKNOWN")); } /* * Get a zpool property value for 'prop' and return the value in * a pre-allocated buffer. */ int zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf, size_t len, zprop_source_t *srctype, boolean_t literal) { uint64_t intval; const char *strval; zprop_source_t src = ZPROP_SRC_NONE; nvlist_t *nvroot; vdev_stat_t *vs; uint_t vsc; if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) { switch (prop) { case ZPOOL_PROP_NAME: (void) strlcpy(buf, zpool_get_name(zhp), len); break; case ZPOOL_PROP_HEALTH: (void) strlcpy(buf, "FAULTED", len); break; case ZPOOL_PROP_GUID: intval = zpool_get_prop_int(zhp, prop, &src); (void) snprintf(buf, len, "%llu", intval); break; case ZPOOL_PROP_ALTROOT: case ZPOOL_PROP_CACHEFILE: case ZPOOL_PROP_COMMENT: if (zhp->zpool_props != NULL || zpool_get_all_props(zhp) == 0) { (void) strlcpy(buf, zpool_get_prop_string(zhp, prop, &src), len); break; } /* FALLTHROUGH */ default: (void) strlcpy(buf, "-", len); break; } if (srctype != NULL) *srctype = src; return (0); } if (zhp->zpool_props == NULL && zpool_get_all_props(zhp) && prop != ZPOOL_PROP_NAME) return (-1); switch (zpool_prop_get_type(prop)) { case PROP_TYPE_STRING: (void) strlcpy(buf, zpool_get_prop_string(zhp, prop, &src), len); break; case PROP_TYPE_NUMBER: intval = zpool_get_prop_int(zhp, prop, &src); switch (prop) { case ZPOOL_PROP_SIZE: case ZPOOL_PROP_ALLOCATED: case ZPOOL_PROP_FREE: case ZPOOL_PROP_FREEING: case ZPOOL_PROP_LEAKED: if (literal) { (void) snprintf(buf, len, "%llu", (u_longlong_t)intval); } else { (void) zfs_nicenum(intval, buf, len); } break; case ZPOOL_PROP_EXPANDSZ: if (intval == 0) { (void) strlcpy(buf, "-", len); } else if (literal) { (void) snprintf(buf, len, "%llu", (u_longlong_t)intval); } else { (void) zfs_nicenum(intval, buf, len); } break; case ZPOOL_PROP_CAPACITY: if (literal) { (void) snprintf(buf, len, "%llu", (u_longlong_t)intval); } else { (void) snprintf(buf, len, "%llu%%", (u_longlong_t)intval); } break; case ZPOOL_PROP_FRAGMENTATION: if (intval == UINT64_MAX) { (void) strlcpy(buf, "-", len); } else { (void) snprintf(buf, len, "%llu%%", (u_longlong_t)intval); } break; case ZPOOL_PROP_DEDUPRATIO: (void) snprintf(buf, len, "%llu.%02llux", (u_longlong_t)(intval / 100), (u_longlong_t)(intval % 100)); break; case ZPOOL_PROP_HEALTH: verify(nvlist_lookup_nvlist(zpool_get_config(zhp, NULL), ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) == 0); (void) strlcpy(buf, zpool_state_to_name(intval, vs->vs_aux), len); break; case ZPOOL_PROP_VERSION: if (intval >= SPA_VERSION_FEATURES) { (void) snprintf(buf, len, "-"); break; } /* FALLTHROUGH */ default: (void) snprintf(buf, len, "%llu", intval); } break; case PROP_TYPE_INDEX: intval = zpool_get_prop_int(zhp, prop, &src); if (zpool_prop_index_to_string(prop, intval, &strval) != 0) return (-1); (void) strlcpy(buf, strval, len); break; default: abort(); } if (srctype) *srctype = src; return (0); } /* * Check if the bootfs name has the same pool name as it is set to. * Assuming bootfs is a valid dataset name. */ static boolean_t bootfs_name_valid(const char *pool, char *bootfs) { int len = strlen(pool); if (!zfs_name_valid(bootfs, ZFS_TYPE_FILESYSTEM|ZFS_TYPE_SNAPSHOT)) return (B_FALSE); if (strncmp(pool, bootfs, len) == 0 && (bootfs[len] == '/' || bootfs[len] == '\0')) return (B_TRUE); return (B_FALSE); } -/* - * Inspect the configuration to determine if any of the devices contain - * an EFI label. - */ -static boolean_t -pool_uses_efi(nvlist_t *config) -{ - nvlist_t **child; - uint_t c, children; - - if (nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_CHILDREN, - &child, &children) != 0) - return (read_efi_label(config, NULL) >= 0); - - for (c = 0; c < children; c++) { - if (pool_uses_efi(child[c])) - return (B_TRUE); - } - return (B_FALSE); -} - boolean_t zpool_is_bootable(zpool_handle_t *zhp) { char bootfs[ZPOOL_MAXNAMELEN]; return (zpool_get_prop(zhp, ZPOOL_PROP_BOOTFS, bootfs, sizeof (bootfs), NULL, B_FALSE) == 0 && strncmp(bootfs, "-", sizeof (bootfs)) != 0); } /* * Given an nvlist of zpool properties to be set, validate that they are * correct, and parse any numeric properties (index, boolean, etc) if they are * specified as strings. */ static nvlist_t * zpool_valid_proplist(libzfs_handle_t *hdl, const char *poolname, nvlist_t *props, uint64_t version, prop_flags_t flags, char *errbuf) { nvpair_t *elem; nvlist_t *retprops; zpool_prop_t prop; char *strval; uint64_t intval; char *slash, *check; struct stat64 statbuf; zpool_handle_t *zhp; - nvlist_t *nvroot; if (nvlist_alloc(&retprops, NV_UNIQUE_NAME, 0) != 0) { (void) no_memory(hdl); return (NULL); } elem = NULL; while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { const char *propname = nvpair_name(elem); prop = zpool_name_to_prop(propname); if (prop == ZPROP_INVAL && zpool_prop_feature(propname)) { int err; char *fname = strchr(propname, '@') + 1; err = zfeature_lookup_name(fname, NULL); if (err != 0) { ASSERT3U(err, ==, ENOENT); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid feature '%s'"), fname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } if (nvpair_type(elem) != DATA_TYPE_STRING) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' must be a string"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } (void) nvpair_value_string(elem, &strval); if (strcmp(strval, ZFS_FEATURE_ENABLED) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property '%s' can only be set to " "'enabled'"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } if (nvlist_add_uint64(retprops, propname, 0) != 0) { (void) no_memory(hdl); goto error; } continue; } /* * Make sure this property is valid and applies to this type. */ if (prop == ZPROP_INVAL) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid property '%s'"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } if (zpool_prop_readonly(prop)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' " "is readonly"), propname); (void) zfs_error(hdl, EZFS_PROPREADONLY, errbuf); goto error; } if (zprop_parse_value(hdl, elem, prop, ZFS_TYPE_POOL, retprops, &strval, &intval, errbuf) != 0) goto error; /* * Perform additional checking for specific properties. */ switch (prop) { case ZPOOL_PROP_VERSION: if (intval < version || !SPA_VERSION_IS_SUPPORTED(intval)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property '%s' number %d is invalid."), propname, intval); (void) zfs_error(hdl, EZFS_BADVERSION, errbuf); goto error; } break; case ZPOOL_PROP_BOOTFS: if (flags.create || flags.import) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property '%s' cannot be set at creation " "or import time"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } if (version < SPA_VERSION_BOOTFS) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be upgraded to support " "'%s' property"), propname); (void) zfs_error(hdl, EZFS_BADVERSION, errbuf); goto error; } /* * bootfs property value has to be a dataset name and * the dataset has to be in the same pool as it sets to. */ if (strval[0] != '\0' && !bootfs_name_valid(poolname, strval)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' " "is an invalid name"), strval); (void) zfs_error(hdl, EZFS_INVALIDNAME, errbuf); goto error; } if ((zhp = zpool_open_canfail(hdl, poolname)) == NULL) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "could not open pool '%s'"), poolname); (void) zfs_error(hdl, EZFS_OPENFAILED, errbuf); goto error; } - verify(nvlist_lookup_nvlist(zpool_get_config(zhp, NULL), - ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); - - /* - * bootfs property cannot be set on a disk which has - * been EFI labeled. - */ - if (pool_uses_efi(nvroot)) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "property '%s' not supported on " - "EFI labeled devices"), propname); - (void) zfs_error(hdl, EZFS_POOL_NOTSUP, errbuf); - zpool_close(zhp); - goto error; - } zpool_close(zhp); break; case ZPOOL_PROP_ALTROOT: if (!flags.create && !flags.import) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property '%s' can only be set during pool " "creation or import"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } if (strval[0] != '/') { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "bad alternate root '%s'"), strval); (void) zfs_error(hdl, EZFS_BADPATH, errbuf); goto error; } break; case ZPOOL_PROP_CACHEFILE: if (strval[0] == '\0') break; if (strcmp(strval, "none") == 0) break; if (strval[0] != '/') { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property '%s' must be empty, an " "absolute path, or 'none'"), propname); (void) zfs_error(hdl, EZFS_BADPATH, errbuf); goto error; } slash = strrchr(strval, '/'); if (slash[1] == '\0' || strcmp(slash, "/.") == 0 || strcmp(slash, "/..") == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' is not a valid file"), strval); (void) zfs_error(hdl, EZFS_BADPATH, errbuf); goto error; } *slash = '\0'; if (strval[0] != '\0' && (stat64(strval, &statbuf) != 0 || !S_ISDIR(statbuf.st_mode))) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' is not a valid directory"), strval); (void) zfs_error(hdl, EZFS_BADPATH, errbuf); goto error; } *slash = '/'; break; case ZPOOL_PROP_COMMENT: for (check = strval; *check != '\0'; check++) { if (!isprint(*check)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "comment may only have printable " "characters")); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } } if (strlen(strval) > ZPROP_MAX_COMMENT) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "comment must not exceed %d characters"), ZPROP_MAX_COMMENT); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } break; case ZPOOL_PROP_READONLY: if (!flags.import) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property '%s' can only be set at " "import time"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } break; } } return (retprops); error: nvlist_free(retprops); return (NULL); } /* * Set zpool property : propname=propval. */ int zpool_set_prop(zpool_handle_t *zhp, const char *propname, const char *propval) { zfs_cmd_t zc = { 0 }; int ret = -1; char errbuf[1024]; nvlist_t *nvl = NULL; nvlist_t *realprops; uint64_t version; prop_flags_t flags = { 0 }; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot set property for '%s'"), zhp->zpool_name); if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) return (no_memory(zhp->zpool_hdl)); if (nvlist_add_string(nvl, propname, propval) != 0) { nvlist_free(nvl); return (no_memory(zhp->zpool_hdl)); } version = zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL); if ((realprops = zpool_valid_proplist(zhp->zpool_hdl, zhp->zpool_name, nvl, version, flags, errbuf)) == NULL) { nvlist_free(nvl); return (-1); } nvlist_free(nvl); nvl = realprops; /* * Execute the corresponding ioctl() to set this property. */ (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); if (zcmd_write_src_nvlist(zhp->zpool_hdl, &zc, nvl) != 0) { nvlist_free(nvl); return (-1); } ret = zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_POOL_SET_PROPS, &zc); zcmd_free_nvlists(&zc); nvlist_free(nvl); if (ret) (void) zpool_standard_error(zhp->zpool_hdl, errno, errbuf); else (void) zpool_props_refresh(zhp); return (ret); } int zpool_expand_proplist(zpool_handle_t *zhp, zprop_list_t **plp) { libzfs_handle_t *hdl = zhp->zpool_hdl; zprop_list_t *entry; char buf[ZFS_MAXPROPLEN]; nvlist_t *features = NULL; zprop_list_t **last; boolean_t firstexpand = (NULL == *plp); if (zprop_expand_list(hdl, plp, ZFS_TYPE_POOL) != 0) return (-1); last = plp; while (*last != NULL) last = &(*last)->pl_next; if ((*plp)->pl_all) features = zpool_get_features(zhp); if ((*plp)->pl_all && firstexpand) { for (int i = 0; i < SPA_FEATURES; i++) { zprop_list_t *entry = zfs_alloc(hdl, sizeof (zprop_list_t)); entry->pl_prop = ZPROP_INVAL; entry->pl_user_prop = zfs_asprintf(hdl, "feature@%s", spa_feature_table[i].fi_uname); entry->pl_width = strlen(entry->pl_user_prop); entry->pl_all = B_TRUE; *last = entry; last = &entry->pl_next; } } /* add any unsupported features */ for (nvpair_t *nvp = nvlist_next_nvpair(features, NULL); nvp != NULL; nvp = nvlist_next_nvpair(features, nvp)) { char *propname; boolean_t found; zprop_list_t *entry; if (zfeature_is_supported(nvpair_name(nvp))) continue; propname = zfs_asprintf(hdl, "unsupported@%s", nvpair_name(nvp)); /* * Before adding the property to the list make sure that no * other pool already added the same property. */ found = B_FALSE; entry = *plp; while (entry != NULL) { if (entry->pl_user_prop != NULL && strcmp(propname, entry->pl_user_prop) == 0) { found = B_TRUE; break; } entry = entry->pl_next; } if (found) { free(propname); continue; } entry = zfs_alloc(hdl, sizeof (zprop_list_t)); entry->pl_prop = ZPROP_INVAL; entry->pl_user_prop = propname; entry->pl_width = strlen(entry->pl_user_prop); entry->pl_all = B_TRUE; *last = entry; last = &entry->pl_next; } for (entry = *plp; entry != NULL; entry = entry->pl_next) { if (entry->pl_fixed) continue; if (entry->pl_prop != ZPROP_INVAL && zpool_get_prop(zhp, entry->pl_prop, buf, sizeof (buf), NULL, B_FALSE) == 0) { if (strlen(buf) > entry->pl_width) entry->pl_width = strlen(buf); } } return (0); } /* * Get the state for the given feature on the given ZFS pool. */ int zpool_prop_get_feature(zpool_handle_t *zhp, const char *propname, char *buf, size_t len) { uint64_t refcount; boolean_t found = B_FALSE; nvlist_t *features = zpool_get_features(zhp); boolean_t supported; const char *feature = strchr(propname, '@') + 1; supported = zpool_prop_feature(propname); ASSERT(supported || zfs_prop_unsupported(propname)); /* * Convert from feature name to feature guid. This conversion is * unecessary for unsupported@... properties because they already * use guids. */ if (supported) { int ret; spa_feature_t fid; ret = zfeature_lookup_name(feature, &fid); if (ret != 0) { (void) strlcpy(buf, "-", len); return (ENOTSUP); } feature = spa_feature_table[fid].fi_guid; } if (nvlist_lookup_uint64(features, feature, &refcount) == 0) found = B_TRUE; if (supported) { if (!found) { (void) strlcpy(buf, ZFS_FEATURE_DISABLED, len); } else { if (refcount == 0) (void) strlcpy(buf, ZFS_FEATURE_ENABLED, len); else (void) strlcpy(buf, ZFS_FEATURE_ACTIVE, len); } } else { if (found) { if (refcount == 0) { (void) strcpy(buf, ZFS_UNSUPPORTED_INACTIVE); } else { (void) strcpy(buf, ZFS_UNSUPPORTED_READONLY); } } else { (void) strlcpy(buf, "-", len); return (ENOTSUP); } } return (0); } /* * Don't start the slice at the default block of 34; many storage * devices will use a stripe width of 128k, so start there instead. */ #define NEW_START_BLOCK 256 /* * Validate the given pool name, optionally putting an extended error message in * 'buf'. */ boolean_t zpool_name_valid(libzfs_handle_t *hdl, boolean_t isopen, const char *pool) { namecheck_err_t why; char what; int ret; ret = pool_namecheck(pool, &why, &what); /* * The rules for reserved pool names were extended at a later point. * But we need to support users with existing pools that may now be * invalid. So we only check for this expanded set of names during a * create (or import), and only in userland. */ if (ret == 0 && !isopen && (strncmp(pool, "mirror", 6) == 0 || strncmp(pool, "raidz", 5) == 0 || strncmp(pool, "spare", 5) == 0 || strcmp(pool, "log") == 0)) { if (hdl != NULL) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "name is reserved")); return (B_FALSE); } if (ret != 0) { if (hdl != NULL) { switch (why) { case NAME_ERR_TOOLONG: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "name is too long")); break; case NAME_ERR_INVALCHAR: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid character " "'%c' in pool name"), what); break; case NAME_ERR_NOLETTER: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "name must begin with a letter")); break; case NAME_ERR_RESERVED: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "name is reserved")); break; case NAME_ERR_DISKLIKE: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool name is reserved")); break; case NAME_ERR_LEADING_SLASH: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "leading slash in name")); break; case NAME_ERR_EMPTY_COMPONENT: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "empty component in name")); break; case NAME_ERR_TRAILING_SLASH: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "trailing slash in name")); break; case NAME_ERR_MULTIPLE_AT: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "multiple '@' delimiters in name")); break; } } return (B_FALSE); } return (B_TRUE); } /* * Open a handle to the given pool, even if the pool is currently in the FAULTED * state. */ zpool_handle_t * zpool_open_canfail(libzfs_handle_t *hdl, const char *pool) { zpool_handle_t *zhp; boolean_t missing; /* * Make sure the pool name is valid. */ if (!zpool_name_valid(hdl, B_TRUE, pool)) { (void) zfs_error_fmt(hdl, EZFS_INVALIDNAME, dgettext(TEXT_DOMAIN, "cannot open '%s'"), pool); return (NULL); } if ((zhp = zfs_alloc(hdl, sizeof (zpool_handle_t))) == NULL) return (NULL); zhp->zpool_hdl = hdl; (void) strlcpy(zhp->zpool_name, pool, sizeof (zhp->zpool_name)); if (zpool_refresh_stats(zhp, &missing) != 0) { zpool_close(zhp); return (NULL); } if (missing) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "no such pool")); (void) zfs_error_fmt(hdl, EZFS_NOENT, dgettext(TEXT_DOMAIN, "cannot open '%s'"), pool); zpool_close(zhp); return (NULL); } return (zhp); } /* * Like the above, but silent on error. Used when iterating over pools (because * the configuration cache may be out of date). */ int zpool_open_silent(libzfs_handle_t *hdl, const char *pool, zpool_handle_t **ret) { zpool_handle_t *zhp; boolean_t missing; if ((zhp = zfs_alloc(hdl, sizeof (zpool_handle_t))) == NULL) return (-1); zhp->zpool_hdl = hdl; (void) strlcpy(zhp->zpool_name, pool, sizeof (zhp->zpool_name)); if (zpool_refresh_stats(zhp, &missing) != 0) { zpool_close(zhp); return (-1); } if (missing) { zpool_close(zhp); *ret = NULL; return (0); } *ret = zhp; return (0); } /* * Similar to zpool_open_canfail(), but refuses to open pools in the faulted * state. */ zpool_handle_t * zpool_open(libzfs_handle_t *hdl, const char *pool) { zpool_handle_t *zhp; if ((zhp = zpool_open_canfail(hdl, pool)) == NULL) return (NULL); if (zhp->zpool_state == POOL_STATE_UNAVAIL) { (void) zfs_error_fmt(hdl, EZFS_POOLUNAVAIL, dgettext(TEXT_DOMAIN, "cannot open '%s'"), zhp->zpool_name); zpool_close(zhp); return (NULL); } return (zhp); } /* * Close the handle. Simply frees the memory associated with the handle. */ void zpool_close(zpool_handle_t *zhp) { if (zhp->zpool_config) nvlist_free(zhp->zpool_config); if (zhp->zpool_old_config) nvlist_free(zhp->zpool_old_config); if (zhp->zpool_props) nvlist_free(zhp->zpool_props); free(zhp); } /* * Return the name of the pool. */ const char * zpool_get_name(zpool_handle_t *zhp) { return (zhp->zpool_name); } /* * Return the state of the pool (ACTIVE or UNAVAILABLE) */ int zpool_get_state(zpool_handle_t *zhp) { return (zhp->zpool_state); } /* * Create the named pool, using the provided vdev list. It is assumed * that the consumer has already validated the contents of the nvlist, so we * don't have to worry about error semantics. */ int zpool_create(libzfs_handle_t *hdl, const char *pool, nvlist_t *nvroot, nvlist_t *props, nvlist_t *fsprops) { zfs_cmd_t zc = { 0 }; nvlist_t *zc_fsprops = NULL; nvlist_t *zc_props = NULL; char msg[1024]; int ret = -1; (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, "cannot create '%s'"), pool); if (!zpool_name_valid(hdl, B_FALSE, pool)) return (zfs_error(hdl, EZFS_INVALIDNAME, msg)); if (zcmd_write_conf_nvlist(hdl, &zc, nvroot) != 0) return (-1); if (props) { prop_flags_t flags = { .create = B_TRUE, .import = B_FALSE }; if ((zc_props = zpool_valid_proplist(hdl, pool, props, SPA_VERSION_1, flags, msg)) == NULL) { goto create_failed; } } if (fsprops) { uint64_t zoned; char *zonestr; zoned = ((nvlist_lookup_string(fsprops, zfs_prop_to_name(ZFS_PROP_ZONED), &zonestr) == 0) && strcmp(zonestr, "on") == 0); if ((zc_fsprops = zfs_valid_proplist(hdl, ZFS_TYPE_FILESYSTEM, fsprops, zoned, NULL, NULL, msg)) == NULL) { goto create_failed; } if (!zc_props && (nvlist_alloc(&zc_props, NV_UNIQUE_NAME, 0) != 0)) { goto create_failed; } if (nvlist_add_nvlist(zc_props, ZPOOL_ROOTFS_PROPS, zc_fsprops) != 0) { goto create_failed; } } if (zc_props && zcmd_write_src_nvlist(hdl, &zc, zc_props) != 0) goto create_failed; (void) strlcpy(zc.zc_name, pool, sizeof (zc.zc_name)); if ((ret = zfs_ioctl(hdl, ZFS_IOC_POOL_CREATE, &zc)) != 0) { zcmd_free_nvlists(&zc); nvlist_free(zc_props); nvlist_free(zc_fsprops); switch (errno) { case EBUSY: /* * This can happen if the user has specified the same * device multiple times. We can't reliably detect this * until we try to add it and see we already have a * label. */ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "one or more vdevs refer to the same device")); return (zfs_error(hdl, EZFS_BADDEV, msg)); case ERANGE: /* * This happens if the record size is smaller or larger * than the allowed size range, or not a power of 2. * * NOTE: although zfs_valid_proplist is called earlier, * this case may have slipped through since the * pool does not exist yet and it is therefore * impossible to read properties e.g. max blocksize * from the pool. */ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "record size invalid")); return (zfs_error(hdl, EZFS_BADPROP, msg)); case EOVERFLOW: /* * This occurs when one of the devices is below * SPA_MINDEVSIZE. Unfortunately, we can't detect which * device was the problem device since there's no * reliable way to determine device size from userland. */ { char buf[64]; zfs_nicenum(SPA_MINDEVSIZE, buf, sizeof (buf)); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "one or more devices is less than the " "minimum size (%s)"), buf); } return (zfs_error(hdl, EZFS_BADDEV, msg)); case ENOSPC: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "one or more devices is out of space")); return (zfs_error(hdl, EZFS_BADDEV, msg)); case ENOTBLK: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cache device must be a disk or disk slice")); return (zfs_error(hdl, EZFS_BADDEV, msg)); default: return (zpool_standard_error(hdl, errno, msg)); } } create_failed: zcmd_free_nvlists(&zc); nvlist_free(zc_props); nvlist_free(zc_fsprops); return (ret); } /* * Destroy the given pool. It is up to the caller to ensure that there are no * datasets left in the pool. */ int zpool_destroy(zpool_handle_t *zhp, const char *log_str) { zfs_cmd_t zc = { 0 }; zfs_handle_t *zfp = NULL; libzfs_handle_t *hdl = zhp->zpool_hdl; char msg[1024]; if (zhp->zpool_state == POOL_STATE_ACTIVE && (zfp = zfs_open(hdl, zhp->zpool_name, ZFS_TYPE_FILESYSTEM)) == NULL) return (-1); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); zc.zc_history = (uint64_t)(uintptr_t)log_str; if (zfs_ioctl(hdl, ZFS_IOC_POOL_DESTROY, &zc) != 0) { (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, "cannot destroy '%s'"), zhp->zpool_name); if (errno == EROFS) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "one or more devices is read only")); (void) zfs_error(hdl, EZFS_BADDEV, msg); } else { (void) zpool_standard_error(hdl, errno, msg); } if (zfp) zfs_close(zfp); return (-1); } if (zfp) { remove_mountpoint(zfp); zfs_close(zfp); } return (0); } /* * Add the given vdevs to the pool. The caller must have already performed the * necessary verification to ensure that the vdev specification is well-formed. */ int zpool_add(zpool_handle_t *zhp, nvlist_t *nvroot) { zfs_cmd_t zc = { 0 }; int ret; libzfs_handle_t *hdl = zhp->zpool_hdl; char msg[1024]; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, "cannot add to '%s'"), zhp->zpool_name); if (zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL) < SPA_VERSION_SPARES && nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be " "upgraded to add hot spares")); return (zfs_error(hdl, EZFS_BADVERSION, msg)); } - if (zpool_is_bootable(zhp) && nvlist_lookup_nvlist_array(nvroot, - ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) { - uint64_t s; - - for (s = 0; s < nspares; s++) { - char *path; - - if (nvlist_lookup_string(spares[s], ZPOOL_CONFIG_PATH, - &path) == 0 && pool_uses_efi(spares[s])) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "device '%s' contains an EFI label and " - "cannot be used on root pools."), - zpool_vdev_name(hdl, NULL, spares[s], - B_FALSE)); - return (zfs_error(hdl, EZFS_POOL_NOTSUP, msg)); - } - } - } - if (zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL) < SPA_VERSION_L2CACHE && nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be " "upgraded to add cache devices")); return (zfs_error(hdl, EZFS_BADVERSION, msg)); } if (zcmd_write_conf_nvlist(hdl, &zc, nvroot) != 0) return (-1); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); if (zfs_ioctl(hdl, ZFS_IOC_VDEV_ADD, &zc) != 0) { switch (errno) { case EBUSY: /* * This can happen if the user has specified the same * device multiple times. We can't reliably detect this * until we try to add it and see we already have a * label. */ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "one or more vdevs refer to the same device")); (void) zfs_error(hdl, EZFS_BADDEV, msg); break; case EOVERFLOW: /* * This occurrs when one of the devices is below * SPA_MINDEVSIZE. Unfortunately, we can't detect which * device was the problem device since there's no * reliable way to determine device size from userland. */ { char buf[64]; zfs_nicenum(SPA_MINDEVSIZE, buf, sizeof (buf)); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "device is less than the minimum " "size (%s)"), buf); } (void) zfs_error(hdl, EZFS_BADDEV, msg); break; case ENOTSUP: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be upgraded to add these vdevs")); (void) zfs_error(hdl, EZFS_BADVERSION, msg); break; case EDOM: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "root pool can not have multiple vdevs" " or separate logs")); (void) zfs_error(hdl, EZFS_POOL_NOTSUP, msg); break; case ENOTBLK: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cache device must be a disk or disk slice")); (void) zfs_error(hdl, EZFS_BADDEV, msg); break; default: (void) zpool_standard_error(hdl, errno, msg); } ret = -1; } else { ret = 0; } zcmd_free_nvlists(&zc); return (ret); } /* * Exports the pool from the system. The caller must ensure that there are no * mounted datasets in the pool. */ static int zpool_export_common(zpool_handle_t *zhp, boolean_t force, boolean_t hardforce, const char *log_str) { zfs_cmd_t zc = { 0 }; char msg[1024]; (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, "cannot export '%s'"), zhp->zpool_name); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); zc.zc_cookie = force; zc.zc_guid = hardforce; zc.zc_history = (uint64_t)(uintptr_t)log_str; if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_POOL_EXPORT, &zc) != 0) { switch (errno) { case EXDEV: zfs_error_aux(zhp->zpool_hdl, dgettext(TEXT_DOMAIN, "use '-f' to override the following errors:\n" "'%s' has an active shared spare which could be" " used by other pools once '%s' is exported."), zhp->zpool_name, zhp->zpool_name); return (zfs_error(zhp->zpool_hdl, EZFS_ACTIVE_SPARE, msg)); default: return (zpool_standard_error_fmt(zhp->zpool_hdl, errno, msg)); } } return (0); } int zpool_export(zpool_handle_t *zhp, boolean_t force, const char *log_str) { return (zpool_export_common(zhp, force, B_FALSE, log_str)); } int zpool_export_force(zpool_handle_t *zhp, const char *log_str) { return (zpool_export_common(zhp, B_TRUE, B_TRUE, log_str)); } static void zpool_rewind_exclaim(libzfs_handle_t *hdl, const char *name, boolean_t dryrun, nvlist_t *config) { nvlist_t *nv = NULL; uint64_t rewindto; int64_t loss = -1; struct tm t; char timestr[128]; if (!hdl->libzfs_printerr || config == NULL) return; if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, &nv) != 0 || nvlist_lookup_nvlist(nv, ZPOOL_CONFIG_REWIND_INFO, &nv) != 0) { return; } if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_LOAD_TIME, &rewindto) != 0) return; (void) nvlist_lookup_int64(nv, ZPOOL_CONFIG_REWIND_TIME, &loss); if (localtime_r((time_t *)&rewindto, &t) != NULL && strftime(timestr, 128, 0, &t) != 0) { if (dryrun) { (void) printf(dgettext(TEXT_DOMAIN, "Would be able to return %s " "to its state as of %s.\n"), name, timestr); } else { (void) printf(dgettext(TEXT_DOMAIN, "Pool %s returned to its state as of %s.\n"), name, timestr); } if (loss > 120) { (void) printf(dgettext(TEXT_DOMAIN, "%s approximately %lld "), dryrun ? "Would discard" : "Discarded", (loss + 30) / 60); (void) printf(dgettext(TEXT_DOMAIN, "minutes of transactions.\n")); } else if (loss > 0) { (void) printf(dgettext(TEXT_DOMAIN, "%s approximately %lld "), dryrun ? "Would discard" : "Discarded", loss); (void) printf(dgettext(TEXT_DOMAIN, "seconds of transactions.\n")); } } } void zpool_explain_recover(libzfs_handle_t *hdl, const char *name, int reason, nvlist_t *config) { nvlist_t *nv = NULL; int64_t loss = -1; uint64_t edata = UINT64_MAX; uint64_t rewindto; struct tm t; char timestr[128]; if (!hdl->libzfs_printerr) return; if (reason >= 0) (void) printf(dgettext(TEXT_DOMAIN, "action: ")); else (void) printf(dgettext(TEXT_DOMAIN, "\t")); /* All attempted rewinds failed if ZPOOL_CONFIG_LOAD_TIME missing */ if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, &nv) != 0 || nvlist_lookup_nvlist(nv, ZPOOL_CONFIG_REWIND_INFO, &nv) != 0 || nvlist_lookup_uint64(nv, ZPOOL_CONFIG_LOAD_TIME, &rewindto) != 0) goto no_info; (void) nvlist_lookup_int64(nv, ZPOOL_CONFIG_REWIND_TIME, &loss); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_LOAD_DATA_ERRORS, &edata); (void) printf(dgettext(TEXT_DOMAIN, "Recovery is possible, but will result in some data loss.\n")); if (localtime_r((time_t *)&rewindto, &t) != NULL && strftime(timestr, 128, 0, &t) != 0) { (void) printf(dgettext(TEXT_DOMAIN, "\tReturning the pool to its state as of %s\n" "\tshould correct the problem. "), timestr); } else { (void) printf(dgettext(TEXT_DOMAIN, "\tReverting the pool to an earlier state " "should correct the problem.\n\t")); } if (loss > 120) { (void) printf(dgettext(TEXT_DOMAIN, "Approximately %lld minutes of data\n" "\tmust be discarded, irreversibly. "), (loss + 30) / 60); } else if (loss > 0) { (void) printf(dgettext(TEXT_DOMAIN, "Approximately %lld seconds of data\n" "\tmust be discarded, irreversibly. "), loss); } if (edata != 0 && edata != UINT64_MAX) { if (edata == 1) { (void) printf(dgettext(TEXT_DOMAIN, "After rewind, at least\n" "\tone persistent user-data error will remain. ")); } else { (void) printf(dgettext(TEXT_DOMAIN, "After rewind, several\n" "\tpersistent user-data errors will remain. ")); } } (void) printf(dgettext(TEXT_DOMAIN, "Recovery can be attempted\n\tby executing 'zpool %s -F %s'. "), reason >= 0 ? "clear" : "import", name); (void) printf(dgettext(TEXT_DOMAIN, "A scrub of the pool\n" "\tis strongly recommended after recovery.\n")); return; no_info: (void) printf(dgettext(TEXT_DOMAIN, "Destroy and re-create the pool from\n\ta backup source.\n")); } /* * zpool_import() is a contracted interface. Should be kept the same * if possible. * * Applications should use zpool_import_props() to import a pool with * new properties value to be set. */ int zpool_import(libzfs_handle_t *hdl, nvlist_t *config, const char *newname, char *altroot) { nvlist_t *props = NULL; int ret; if (altroot != NULL) { if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) { return (zfs_error_fmt(hdl, EZFS_NOMEM, dgettext(TEXT_DOMAIN, "cannot import '%s'"), newname)); } if (nvlist_add_string(props, zpool_prop_to_name(ZPOOL_PROP_ALTROOT), altroot) != 0 || nvlist_add_string(props, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), "none") != 0) { nvlist_free(props); return (zfs_error_fmt(hdl, EZFS_NOMEM, dgettext(TEXT_DOMAIN, "cannot import '%s'"), newname)); } } ret = zpool_import_props(hdl, config, newname, props, ZFS_IMPORT_NORMAL); if (props) nvlist_free(props); return (ret); } static void print_vdev_tree(libzfs_handle_t *hdl, const char *name, nvlist_t *nv, int indent) { nvlist_t **child; uint_t c, children; char *vname; uint64_t is_log = 0; (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &is_log); if (name != NULL) (void) printf("\t%*s%s%s\n", indent, "", name, is_log ? " [log]" : ""); if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) return; for (c = 0; c < children; c++) { vname = zpool_vdev_name(hdl, NULL, child[c], B_TRUE); print_vdev_tree(hdl, vname, child[c], indent + 2); free(vname); } } void zpool_print_unsup_feat(nvlist_t *config) { nvlist_t *nvinfo, *unsup_feat; verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, &nvinfo) == 0); verify(nvlist_lookup_nvlist(nvinfo, ZPOOL_CONFIG_UNSUP_FEAT, &unsup_feat) == 0); for (nvpair_t *nvp = nvlist_next_nvpair(unsup_feat, NULL); nvp != NULL; nvp = nvlist_next_nvpair(unsup_feat, nvp)) { char *desc; verify(nvpair_type(nvp) == DATA_TYPE_STRING); verify(nvpair_value_string(nvp, &desc) == 0); if (strlen(desc) > 0) (void) printf("\t%s (%s)\n", nvpair_name(nvp), desc); else (void) printf("\t%s\n", nvpair_name(nvp)); } } /* * Import the given pool using the known configuration and a list of * properties to be set. The configuration should have come from * zpool_find_import(). The 'newname' parameters control whether the pool * is imported with a different name. */ int zpool_import_props(libzfs_handle_t *hdl, nvlist_t *config, const char *newname, nvlist_t *props, int flags) { zfs_cmd_t zc = { 0 }; zpool_rewind_policy_t policy; nvlist_t *nv = NULL; nvlist_t *nvinfo = NULL; nvlist_t *missing = NULL; char *thename; char *origname; int ret; int error = 0; char errbuf[1024]; verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, &origname) == 0); (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot import pool '%s'"), origname); if (newname != NULL) { if (!zpool_name_valid(hdl, B_FALSE, newname)) return (zfs_error_fmt(hdl, EZFS_INVALIDNAME, dgettext(TEXT_DOMAIN, "cannot import '%s'"), newname)); thename = (char *)newname; } else { thename = origname; } if (props != NULL) { uint64_t version; prop_flags_t flags = { .create = B_FALSE, .import = B_TRUE }; verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) == 0); if ((props = zpool_valid_proplist(hdl, origname, props, version, flags, errbuf)) == NULL) return (-1); if (zcmd_write_src_nvlist(hdl, &zc, props) != 0) { nvlist_free(props); return (-1); } nvlist_free(props); } (void) strlcpy(zc.zc_name, thename, sizeof (zc.zc_name)); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &zc.zc_guid) == 0); if (zcmd_write_conf_nvlist(hdl, &zc, config) != 0) { zcmd_free_nvlists(&zc); return (-1); } if (zcmd_alloc_dst_nvlist(hdl, &zc, zc.zc_nvlist_conf_size * 2) != 0) { zcmd_free_nvlists(&zc); return (-1); } zc.zc_cookie = flags; while ((ret = zfs_ioctl(hdl, ZFS_IOC_POOL_IMPORT, &zc)) != 0 && errno == ENOMEM) { if (zcmd_expand_dst_nvlist(hdl, &zc) != 0) { zcmd_free_nvlists(&zc); return (-1); } } if (ret != 0) error = errno; (void) zcmd_read_dst_nvlist(hdl, &zc, &nv); zcmd_free_nvlists(&zc); zpool_get_rewind_policy(config, &policy); if (error) { char desc[1024]; /* * Dry-run failed, but we print out what success * looks like if we found a best txg */ if (policy.zrp_request & ZPOOL_TRY_REWIND) { zpool_rewind_exclaim(hdl, newname ? origname : thename, B_TRUE, nv); nvlist_free(nv); return (-1); } if (newname == NULL) (void) snprintf(desc, sizeof (desc), dgettext(TEXT_DOMAIN, "cannot import '%s'"), thename); else (void) snprintf(desc, sizeof (desc), dgettext(TEXT_DOMAIN, "cannot import '%s' as '%s'"), origname, thename); switch (error) { case ENOTSUP: if (nv != NULL && nvlist_lookup_nvlist(nv, ZPOOL_CONFIG_LOAD_INFO, &nvinfo) == 0 && nvlist_exists(nvinfo, ZPOOL_CONFIG_UNSUP_FEAT)) { (void) printf(dgettext(TEXT_DOMAIN, "This " "pool uses the following feature(s) not " "supported by this system:\n")); zpool_print_unsup_feat(nv); if (nvlist_exists(nvinfo, ZPOOL_CONFIG_CAN_RDONLY)) { (void) printf(dgettext(TEXT_DOMAIN, "All unsupported features are only " "required for writing to the pool." "\nThe pool can be imported using " "'-o readonly=on'.\n")); } } /* * Unsupported version. */ (void) zfs_error(hdl, EZFS_BADVERSION, desc); break; case EINVAL: (void) zfs_error(hdl, EZFS_INVALCONFIG, desc); break; case EROFS: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "one or more devices is read only")); (void) zfs_error(hdl, EZFS_BADDEV, desc); break; case ENXIO: if (nv && nvlist_lookup_nvlist(nv, ZPOOL_CONFIG_LOAD_INFO, &nvinfo) == 0 && nvlist_lookup_nvlist(nvinfo, ZPOOL_CONFIG_MISSING_DEVICES, &missing) == 0) { (void) printf(dgettext(TEXT_DOMAIN, "The devices below are missing, use " "'-m' to import the pool anyway:\n")); print_vdev_tree(hdl, NULL, missing, 2); (void) printf("\n"); } (void) zpool_standard_error(hdl, error, desc); break; case EEXIST: (void) zpool_standard_error(hdl, error, desc); break; default: (void) zpool_standard_error(hdl, error, desc); zpool_explain_recover(hdl, newname ? origname : thename, -error, nv); break; } nvlist_free(nv); ret = -1; } else { zpool_handle_t *zhp; /* * This should never fail, but play it safe anyway. */ if (zpool_open_silent(hdl, thename, &zhp) != 0) ret = -1; else if (zhp != NULL) zpool_close(zhp); if (policy.zrp_request & (ZPOOL_DO_REWIND | ZPOOL_TRY_REWIND)) { zpool_rewind_exclaim(hdl, newname ? origname : thename, ((policy.zrp_request & ZPOOL_TRY_REWIND) != 0), nv); } nvlist_free(nv); return (0); } return (ret); } /* * Scan the pool. */ int zpool_scan(zpool_handle_t *zhp, pool_scan_func_t func) { zfs_cmd_t zc = { 0 }; char msg[1024]; libzfs_handle_t *hdl = zhp->zpool_hdl; (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); zc.zc_cookie = func; if (zfs_ioctl(hdl, ZFS_IOC_POOL_SCAN, &zc) == 0 || (errno == ENOENT && func != POOL_SCAN_NONE)) return (0); if (func == POOL_SCAN_SCRUB) { (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, "cannot scrub %s"), zc.zc_name); } else if (func == POOL_SCAN_NONE) { (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, "cannot cancel scrubbing %s"), zc.zc_name); } else { assert(!"unexpected result"); } if (errno == EBUSY) { nvlist_t *nvroot; pool_scan_stat_t *ps = NULL; uint_t psc; verify(nvlist_lookup_nvlist(zhp->zpool_config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &psc); if (ps && ps->pss_func == POOL_SCAN_SCRUB) return (zfs_error(hdl, EZFS_SCRUBBING, msg)); else return (zfs_error(hdl, EZFS_RESILVERING, msg)); } else if (errno == ENOENT) { return (zfs_error(hdl, EZFS_NO_SCRUB, msg)); } else { return (zpool_standard_error(hdl, errno, msg)); } } /* * This provides a very minimal check whether a given string is likely a * c#t#d# style string. Users of this are expected to do their own * verification of the s# part. */ #define CTD_CHECK(str) (str && str[0] == 'c' && isdigit(str[1])) /* * More elaborate version for ones which may start with "/dev/dsk/" * and the like. */ static int ctd_check_path(char *str) { /* * If it starts with a slash, check the last component. */ if (str && str[0] == '/') { char *tmp = strrchr(str, '/'); /* * If it ends in "/old", check the second-to-last * component of the string instead. */ if (tmp != str && strcmp(tmp, "/old") == 0) { for (tmp--; *tmp != '/'; tmp--) ; } str = tmp + 1; } return (CTD_CHECK(str)); } /* * Find a vdev that matches the search criteria specified. We use the * the nvpair name to determine how we should look for the device. * 'avail_spare' is set to TRUE if the provided guid refers to an AVAIL * spare; but FALSE if its an INUSE spare. */ static nvlist_t * vdev_to_nvlist_iter(nvlist_t *nv, nvlist_t *search, boolean_t *avail_spare, boolean_t *l2cache, boolean_t *log) { uint_t c, children; nvlist_t **child; nvlist_t *ret; uint64_t is_log; char *srchkey; nvpair_t *pair = nvlist_next_nvpair(search, NULL); /* Nothing to look for */ if (search == NULL || pair == NULL) return (NULL); /* Obtain the key we will use to search */ srchkey = nvpair_name(pair); switch (nvpair_type(pair)) { case DATA_TYPE_UINT64: if (strcmp(srchkey, ZPOOL_CONFIG_GUID) == 0) { uint64_t srchval, theguid; verify(nvpair_value_uint64(pair, &srchval) == 0); verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &theguid) == 0); if (theguid == srchval) return (nv); } break; case DATA_TYPE_STRING: { char *srchval, *val; verify(nvpair_value_string(pair, &srchval) == 0); if (nvlist_lookup_string(nv, srchkey, &val) != 0) break; /* * Search for the requested value. Special cases: * * - ZPOOL_CONFIG_PATH for whole disk entries. These end in * "s0" or "s0/old". The "s0" part is hidden from the user, * but included in the string, so this matches around it. * - looking for a top-level vdev name (i.e. ZPOOL_CONFIG_TYPE). * * Otherwise, all other searches are simple string compares. */ if (strcmp(srchkey, ZPOOL_CONFIG_PATH) == 0 && ctd_check_path(val)) { uint64_t wholedisk = 0; (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk); if (wholedisk) { int slen = strlen(srchval); int vlen = strlen(val); if (slen != vlen - 2) break; /* * make_leaf_vdev() should only set * wholedisk for ZPOOL_CONFIG_PATHs which * will include "/dev/dsk/", giving plenty of * room for the indices used next. */ ASSERT(vlen >= 6); /* * strings identical except trailing "s0" */ if (strcmp(&val[vlen - 2], "s0") == 0 && strncmp(srchval, val, slen) == 0) return (nv); /* * strings identical except trailing "s0/old" */ if (strcmp(&val[vlen - 6], "s0/old") == 0 && strcmp(&srchval[slen - 4], "/old") == 0 && strncmp(srchval, val, slen - 4) == 0) return (nv); break; } } else if (strcmp(srchkey, ZPOOL_CONFIG_TYPE) == 0 && val) { char *type, *idx, *end, *p; uint64_t id, vdev_id; /* * Determine our vdev type, keeping in mind * that the srchval is composed of a type and * vdev id pair (i.e. mirror-4). */ if ((type = strdup(srchval)) == NULL) return (NULL); if ((p = strrchr(type, '-')) == NULL) { free(type); break; } idx = p + 1; *p = '\0'; /* * If the types don't match then keep looking. */ if (strncmp(val, type, strlen(val)) != 0) { free(type); break; } verify(strncmp(type, VDEV_TYPE_RAIDZ, strlen(VDEV_TYPE_RAIDZ)) == 0 || strncmp(type, VDEV_TYPE_MIRROR, strlen(VDEV_TYPE_MIRROR)) == 0); verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &id) == 0); errno = 0; vdev_id = strtoull(idx, &end, 10); free(type); if (errno != 0) return (NULL); /* * Now verify that we have the correct vdev id. */ if (vdev_id == id) return (nv); } /* * Common case */ if (strcmp(srchval, val) == 0) return (nv); break; } default: break; } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) return (NULL); for (c = 0; c < children; c++) { if ((ret = vdev_to_nvlist_iter(child[c], search, avail_spare, l2cache, NULL)) != NULL) { /* * The 'is_log' value is only set for the toplevel * vdev, not the leaf vdevs. So we always lookup the * log device from the root of the vdev tree (where * 'log' is non-NULL). */ if (log != NULL && nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &is_log) == 0 && is_log) { *log = B_TRUE; } return (ret); } } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, &child, &children) == 0) { for (c = 0; c < children; c++) { if ((ret = vdev_to_nvlist_iter(child[c], search, avail_spare, l2cache, NULL)) != NULL) { *avail_spare = B_TRUE; return (ret); } } } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, &child, &children) == 0) { for (c = 0; c < children; c++) { if ((ret = vdev_to_nvlist_iter(child[c], search, avail_spare, l2cache, NULL)) != NULL) { *l2cache = B_TRUE; return (ret); } } } return (NULL); } /* * Given a physical path (minus the "/devices" prefix), find the * associated vdev. */ nvlist_t * zpool_find_vdev_by_physpath(zpool_handle_t *zhp, const char *ppath, boolean_t *avail_spare, boolean_t *l2cache, boolean_t *log) { nvlist_t *search, *nvroot, *ret; verify(nvlist_alloc(&search, NV_UNIQUE_NAME, KM_SLEEP) == 0); verify(nvlist_add_string(search, ZPOOL_CONFIG_PHYS_PATH, ppath) == 0); verify(nvlist_lookup_nvlist(zhp->zpool_config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); *avail_spare = B_FALSE; *l2cache = B_FALSE; if (log != NULL) *log = B_FALSE; ret = vdev_to_nvlist_iter(nvroot, search, avail_spare, l2cache, log); nvlist_free(search); return (ret); } /* * Determine if we have an "interior" top-level vdev (i.e mirror/raidz). */ boolean_t zpool_vdev_is_interior(const char *name) { if (strncmp(name, VDEV_TYPE_RAIDZ, strlen(VDEV_TYPE_RAIDZ)) == 0 || strncmp(name, VDEV_TYPE_MIRROR, strlen(VDEV_TYPE_MIRROR)) == 0) return (B_TRUE); return (B_FALSE); } nvlist_t * zpool_find_vdev(zpool_handle_t *zhp, const char *path, boolean_t *avail_spare, boolean_t *l2cache, boolean_t *log) { char buf[MAXPATHLEN]; char *end; nvlist_t *nvroot, *search, *ret; uint64_t guid; verify(nvlist_alloc(&search, NV_UNIQUE_NAME, KM_SLEEP) == 0); guid = strtoull(path, &end, 10); if (guid != 0 && *end == '\0') { verify(nvlist_add_uint64(search, ZPOOL_CONFIG_GUID, guid) == 0); } else if (zpool_vdev_is_interior(path)) { verify(nvlist_add_string(search, ZPOOL_CONFIG_TYPE, path) == 0); } else if (path[0] != '/') { (void) snprintf(buf, sizeof (buf), "%s%s", "/dev/dsk/", path); verify(nvlist_add_string(search, ZPOOL_CONFIG_PATH, buf) == 0); } else { verify(nvlist_add_string(search, ZPOOL_CONFIG_PATH, path) == 0); } verify(nvlist_lookup_nvlist(zhp->zpool_config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); *avail_spare = B_FALSE; *l2cache = B_FALSE; if (log != NULL) *log = B_FALSE; ret = vdev_to_nvlist_iter(nvroot, search, avail_spare, l2cache, log); nvlist_free(search); return (ret); } static int vdev_online(nvlist_t *nv) { uint64_t ival; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, &ival) == 0 || nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, &ival) == 0 || nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, &ival) == 0) return (0); return (1); } /* * Helper function for zpool_get_physpaths(). */ static int vdev_get_one_physpath(nvlist_t *config, char *physpath, size_t physpath_size, size_t *bytes_written) { size_t bytes_left, pos, rsz; char *tmppath; const char *format; if (nvlist_lookup_string(config, ZPOOL_CONFIG_PHYS_PATH, &tmppath) != 0) return (EZFS_NODEVICE); pos = *bytes_written; bytes_left = physpath_size - pos; format = (pos == 0) ? "%s" : " %s"; rsz = snprintf(physpath + pos, bytes_left, format, tmppath); *bytes_written += rsz; if (rsz >= bytes_left) { /* if physpath was not copied properly, clear it */ if (bytes_left != 0) { physpath[pos] = 0; } return (EZFS_NOSPC); } return (0); } static int vdev_get_physpaths(nvlist_t *nv, char *physpath, size_t phypath_size, size_t *rsz, boolean_t is_spare) { char *type; int ret; if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0) return (EZFS_INVALCONFIG); if (strcmp(type, VDEV_TYPE_DISK) == 0) { /* * An active spare device has ZPOOL_CONFIG_IS_SPARE set. * For a spare vdev, we only want to boot from the active * spare device. */ if (is_spare) { uint64_t spare = 0; (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE, &spare); if (!spare) return (EZFS_INVALCONFIG); } if (vdev_online(nv)) { if ((ret = vdev_get_one_physpath(nv, physpath, phypath_size, rsz)) != 0) return (ret); } } else if (strcmp(type, VDEV_TYPE_MIRROR) == 0 || strcmp(type, VDEV_TYPE_REPLACING) == 0 || (is_spare = (strcmp(type, VDEV_TYPE_SPARE) == 0))) { nvlist_t **child; uint_t count; int i, ret; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &count) != 0) return (EZFS_INVALCONFIG); for (i = 0; i < count; i++) { ret = vdev_get_physpaths(child[i], physpath, phypath_size, rsz, is_spare); if (ret == EZFS_NOSPC) return (ret); } } return (EZFS_POOL_INVALARG); } /* * Get phys_path for a root pool config. * Return 0 on success; non-zero on failure. */ static int zpool_get_config_physpath(nvlist_t *config, char *physpath, size_t phypath_size) { size_t rsz; nvlist_t *vdev_root; nvlist_t **child; uint_t count; char *type; rsz = 0; if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &vdev_root) != 0) return (EZFS_INVALCONFIG); if (nvlist_lookup_string(vdev_root, ZPOOL_CONFIG_TYPE, &type) != 0 || nvlist_lookup_nvlist_array(vdev_root, ZPOOL_CONFIG_CHILDREN, &child, &count) != 0) return (EZFS_INVALCONFIG); /* - * root pool can not have EFI labeled disks and can only have - * a single top-level vdev. + * root pool can only have a single top-level vdev. */ - if (strcmp(type, VDEV_TYPE_ROOT) != 0 || count != 1 || - pool_uses_efi(vdev_root)) + if (strcmp(type, VDEV_TYPE_ROOT) != 0 || count != 1) return (EZFS_POOL_INVALARG); (void) vdev_get_physpaths(child[0], physpath, phypath_size, &rsz, B_FALSE); /* No online devices */ if (rsz == 0) return (EZFS_NODEVICE); return (0); } /* * Get phys_path for a root pool * Return 0 on success; non-zero on failure. */ int zpool_get_physpath(zpool_handle_t *zhp, char *physpath, size_t phypath_size) { return (zpool_get_config_physpath(zhp->zpool_config, physpath, phypath_size)); } /* * If the device has being dynamically expanded then we need to relabel * the disk to use the new unallocated space. */ static int zpool_relabel_disk(libzfs_handle_t *hdl, const char *name) { char path[MAXPATHLEN]; char errbuf[1024]; int fd, error; int (*_efi_use_whole_disk)(int); if ((_efi_use_whole_disk = (int (*)(int))dlsym(RTLD_DEFAULT, "efi_use_whole_disk")) == NULL) return (-1); (void) snprintf(path, sizeof (path), "%s/%s", RDISK_ROOT, name); if ((fd = open(path, O_RDWR | O_NDELAY)) < 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot " "relabel '%s': unable to open device"), name); return (zfs_error(hdl, EZFS_OPENFAILED, errbuf)); } /* * It's possible that we might encounter an error if the device * does not have any unallocated space left. If so, we simply * ignore that error and continue on. */ error = _efi_use_whole_disk(fd); (void) close(fd); if (error && error != VT_ENOSPC) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot " "relabel '%s': unable to read disk capacity"), name); return (zfs_error(hdl, EZFS_NOCAP, errbuf)); } return (0); } /* * Bring the specified vdev online. The 'flags' parameter is a set of the * ZFS_ONLINE_* flags. */ int zpool_vdev_online(zpool_handle_t *zhp, const char *path, int flags, vdev_state_t *newstate) { zfs_cmd_t zc = { 0 }; char msg[1024]; nvlist_t *tgt; boolean_t avail_spare, l2cache, islog; libzfs_handle_t *hdl = zhp->zpool_hdl; if (flags & ZFS_ONLINE_EXPAND) { (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, "cannot expand %s"), path); } else { (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, "cannot online %s"), path); } (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache, &islog)) == NULL) return (zfs_error(hdl, EZFS_NODEVICE, msg)); verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0); if (avail_spare) return (zfs_error(hdl, EZFS_ISSPARE, msg)); if (flags & ZFS_ONLINE_EXPAND || zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOEXPAND, NULL)) { char *pathname = NULL; uint64_t wholedisk = 0; (void) nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk); verify(nvlist_lookup_string(tgt, ZPOOL_CONFIG_PATH, &pathname) == 0); /* * XXX - L2ARC 1.0 devices can't support expansion. */ if (l2cache) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot expand cache devices")); return (zfs_error(hdl, EZFS_VDEVNOTSUP, msg)); } if (wholedisk) { pathname += strlen(DISK_ROOT) + 1; (void) zpool_relabel_disk(hdl, pathname); } } zc.zc_cookie = VDEV_STATE_ONLINE; zc.zc_obj = flags; if (zfs_ioctl(hdl, ZFS_IOC_VDEV_SET_STATE, &zc) != 0) { if (errno == EINVAL) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "was split " "from this pool into a new one. Use '%s' " "instead"), "zpool detach"); return (zfs_error(hdl, EZFS_POSTSPLIT_ONLINE, msg)); } return (zpool_standard_error(hdl, errno, msg)); } *newstate = zc.zc_cookie; return (0); } /* * Take the specified vdev offline */ int zpool_vdev_offline(zpool_handle_t *zhp, const char *path, boolean_t istmp) { zfs_cmd_t zc = { 0 }; char msg[1024]; nvlist_t *tgt; boolean_t avail_spare, l2cache; libzfs_handle_t *hdl = zhp->zpool_hdl; (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, "cannot offline %s"), path); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache, NULL)) == NULL) return (zfs_error(hdl, EZFS_NODEVICE, msg)); verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0); if (avail_spare) return (zfs_error(hdl, EZFS_ISSPARE, msg)); zc.zc_cookie = VDEV_STATE_OFFLINE; zc.zc_obj = istmp ? ZFS_OFFLINE_TEMPORARY : 0; if (zfs_ioctl(hdl, ZFS_IOC_VDEV_SET_STATE, &zc) == 0) return (0); switch (errno) { case EBUSY: /* * There are no other replicas of this device. */ return (zfs_error(hdl, EZFS_NOREPLICAS, msg)); case EEXIST: /* * The log device has unplayed logs */ return (zfs_error(hdl, EZFS_UNPLAYED_LOGS, msg)); default: return (zpool_standard_error(hdl, errno, msg)); } } /* * Mark the given vdev faulted. */ int zpool_vdev_fault(zpool_handle_t *zhp, uint64_t guid, vdev_aux_t aux) { zfs_cmd_t zc = { 0 }; char msg[1024]; libzfs_handle_t *hdl = zhp->zpool_hdl; (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, "cannot fault %llu"), guid); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); zc.zc_guid = guid; zc.zc_cookie = VDEV_STATE_FAULTED; zc.zc_obj = aux; if (ioctl(hdl->libzfs_fd, ZFS_IOC_VDEV_SET_STATE, &zc) == 0) return (0); switch (errno) { case EBUSY: /* * There are no other replicas of this device. */ return (zfs_error(hdl, EZFS_NOREPLICAS, msg)); default: return (zpool_standard_error(hdl, errno, msg)); } } /* * Mark the given vdev degraded. */ int zpool_vdev_degrade(zpool_handle_t *zhp, uint64_t guid, vdev_aux_t aux) { zfs_cmd_t zc = { 0 }; char msg[1024]; libzfs_handle_t *hdl = zhp->zpool_hdl; (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, "cannot degrade %llu"), guid); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); zc.zc_guid = guid; zc.zc_cookie = VDEV_STATE_DEGRADED; zc.zc_obj = aux; if (ioctl(hdl->libzfs_fd, ZFS_IOC_VDEV_SET_STATE, &zc) == 0) return (0); return (zpool_standard_error(hdl, errno, msg)); } /* * Returns TRUE if the given nvlist is a vdev that was originally swapped in as * a hot spare. */ static boolean_t is_replacing_spare(nvlist_t *search, nvlist_t *tgt, int which) { nvlist_t **child; uint_t c, children; char *type; if (nvlist_lookup_nvlist_array(search, ZPOOL_CONFIG_CHILDREN, &child, &children) == 0) { verify(nvlist_lookup_string(search, ZPOOL_CONFIG_TYPE, &type) == 0); if (strcmp(type, VDEV_TYPE_SPARE) == 0 && children == 2 && child[which] == tgt) return (B_TRUE); for (c = 0; c < children; c++) if (is_replacing_spare(child[c], tgt, which)) return (B_TRUE); } return (B_FALSE); } /* * Attach new_disk (fully described by nvroot) to old_disk. * If 'replacing' is specified, the new disk will replace the old one. */ int zpool_vdev_attach(zpool_handle_t *zhp, const char *old_disk, const char *new_disk, nvlist_t *nvroot, int replacing) { zfs_cmd_t zc = { 0 }; char msg[1024]; int ret; nvlist_t *tgt; boolean_t avail_spare, l2cache, islog; uint64_t val; char *newname; nvlist_t **child; uint_t children; nvlist_t *config_root; libzfs_handle_t *hdl = zhp->zpool_hdl; boolean_t rootpool = zpool_is_bootable(zhp); if (replacing) (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, "cannot replace %s with %s"), old_disk, new_disk); else (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, "cannot attach %s to %s"), new_disk, old_disk); - /* - * If this is a root pool, make sure that we're not attaching an - * EFI labeled device. - */ - if (rootpool && pool_uses_efi(nvroot)) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "EFI labeled devices are not supported on root pools.")); - return (zfs_error(hdl, EZFS_POOL_NOTSUP, msg)); - } - (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); if ((tgt = zpool_find_vdev(zhp, old_disk, &avail_spare, &l2cache, &islog)) == 0) return (zfs_error(hdl, EZFS_NODEVICE, msg)); if (avail_spare) return (zfs_error(hdl, EZFS_ISSPARE, msg)); if (l2cache) return (zfs_error(hdl, EZFS_ISL2CACHE, msg)); verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0); zc.zc_cookie = replacing; if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0 || children != 1) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "new device must be a single disk")); return (zfs_error(hdl, EZFS_INVALCONFIG, msg)); } verify(nvlist_lookup_nvlist(zpool_get_config(zhp, NULL), ZPOOL_CONFIG_VDEV_TREE, &config_root) == 0); if ((newname = zpool_vdev_name(NULL, NULL, child[0], B_FALSE)) == NULL) return (-1); /* * If the target is a hot spare that has been swapped in, we can only * replace it with another hot spare. */ if (replacing && nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_IS_SPARE, &val) == 0 && (zpool_find_vdev(zhp, newname, &avail_spare, &l2cache, NULL) == NULL || !avail_spare) && is_replacing_spare(config_root, tgt, 1)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "can only be replaced by another hot spare")); free(newname); return (zfs_error(hdl, EZFS_BADTARGET, msg)); } free(newname); if (zcmd_write_conf_nvlist(hdl, &zc, nvroot) != 0) return (-1); ret = zfs_ioctl(hdl, ZFS_IOC_VDEV_ATTACH, &zc); zcmd_free_nvlists(&zc); if (ret == 0) { if (rootpool) { /* * XXX need a better way to prevent user from * booting up a half-baked vdev. */ (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "Make " "sure to wait until resilver is done " "before rebooting.\n")); } return (0); } switch (errno) { case ENOTSUP: /* * Can't attach to or replace this type of vdev. */ if (replacing) { uint64_t version = zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL); if (islog) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot replace a log with a spare")); else if (version >= SPA_VERSION_MULTI_REPLACE) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "already in replacing/spare config; wait " "for completion or use 'zpool detach'")); else zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot replace a replacing device")); } else { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "can only attach to mirrors and top-level " "disks")); } (void) zfs_error(hdl, EZFS_BADTARGET, msg); break; case EINVAL: /* * The new device must be a single disk. */ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "new device must be a single disk")); (void) zfs_error(hdl, EZFS_INVALCONFIG, msg); break; case EBUSY: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "%s is busy"), new_disk); (void) zfs_error(hdl, EZFS_BADDEV, msg); break; case EOVERFLOW: /* * The new device is too small. */ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "device is too small")); (void) zfs_error(hdl, EZFS_BADDEV, msg); break; case EDOM: /* * The new device has a different alignment requirement. */ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "devices have different sector alignment")); (void) zfs_error(hdl, EZFS_BADDEV, msg); break; case ENAMETOOLONG: /* * The resulting top-level vdev spec won't fit in the label. */ (void) zfs_error(hdl, EZFS_DEVOVERFLOW, msg); break; default: (void) zpool_standard_error(hdl, errno, msg); } return (-1); } /* * Detach the specified device. */ int zpool_vdev_detach(zpool_handle_t *zhp, const char *path) { zfs_cmd_t zc = { 0 }; char msg[1024]; nvlist_t *tgt; boolean_t avail_spare, l2cache; libzfs_handle_t *hdl = zhp->zpool_hdl; (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, "cannot detach %s"), path); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache, NULL)) == 0) return (zfs_error(hdl, EZFS_NODEVICE, msg)); if (avail_spare) return (zfs_error(hdl, EZFS_ISSPARE, msg)); if (l2cache) return (zfs_error(hdl, EZFS_ISL2CACHE, msg)); verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0); if (zfs_ioctl(hdl, ZFS_IOC_VDEV_DETACH, &zc) == 0) return (0); switch (errno) { case ENOTSUP: /* * Can't detach from this type of vdev. */ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "only " "applicable to mirror and replacing vdevs")); (void) zfs_error(hdl, EZFS_BADTARGET, msg); break; case EBUSY: /* * There are no other replicas of this device. */ (void) zfs_error(hdl, EZFS_NOREPLICAS, msg); break; default: (void) zpool_standard_error(hdl, errno, msg); } return (-1); } /* * Find a mirror vdev in the source nvlist. * * The mchild array contains a list of disks in one of the top-level mirrors * of the source pool. The schild array contains a list of disks that the * user specified on the command line. We loop over the mchild array to * see if any entry in the schild array matches. * * If a disk in the mchild array is found in the schild array, we return * the index of that entry. Otherwise we return -1. */ static int find_vdev_entry(zpool_handle_t *zhp, nvlist_t **mchild, uint_t mchildren, nvlist_t **schild, uint_t schildren) { uint_t mc; for (mc = 0; mc < mchildren; mc++) { uint_t sc; char *mpath = zpool_vdev_name(zhp->zpool_hdl, zhp, mchild[mc], B_FALSE); for (sc = 0; sc < schildren; sc++) { char *spath = zpool_vdev_name(zhp->zpool_hdl, zhp, schild[sc], B_FALSE); boolean_t result = (strcmp(mpath, spath) == 0); free(spath); if (result) { free(mpath); return (mc); } } free(mpath); } return (-1); } /* * Split a mirror pool. If newroot points to null, then a new nvlist * is generated and it is the responsibility of the caller to free it. */ int zpool_vdev_split(zpool_handle_t *zhp, char *newname, nvlist_t **newroot, nvlist_t *props, splitflags_t flags) { zfs_cmd_t zc = { 0 }; char msg[1024]; nvlist_t *tree, *config, **child, **newchild, *newconfig = NULL; nvlist_t **varray = NULL, *zc_props = NULL; uint_t c, children, newchildren, lastlog = 0, vcount, found = 0; libzfs_handle_t *hdl = zhp->zpool_hdl; uint64_t vers; boolean_t freelist = B_FALSE, memory_err = B_TRUE; int retval = 0; (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, "Unable to split %s"), zhp->zpool_name); if (!zpool_name_valid(hdl, B_FALSE, newname)) return (zfs_error(hdl, EZFS_INVALIDNAME, msg)); if ((config = zpool_get_config(zhp, NULL)) == NULL) { (void) fprintf(stderr, gettext("Internal error: unable to " "retrieve pool configuration\n")); return (-1); } verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &tree) == 0); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &vers) == 0); if (props) { prop_flags_t flags = { .create = B_FALSE, .import = B_TRUE }; if ((zc_props = zpool_valid_proplist(hdl, zhp->zpool_name, props, vers, flags, msg)) == NULL) return (-1); } if (nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Source pool is missing vdev tree")); if (zc_props) nvlist_free(zc_props); return (-1); } varray = zfs_alloc(hdl, children * sizeof (nvlist_t *)); vcount = 0; if (*newroot == NULL || nvlist_lookup_nvlist_array(*newroot, ZPOOL_CONFIG_CHILDREN, &newchild, &newchildren) != 0) newchildren = 0; for (c = 0; c < children; c++) { uint64_t is_log = B_FALSE, is_hole = B_FALSE; char *type; nvlist_t **mchild, *vdev; uint_t mchildren; int entry; /* * Unlike cache & spares, slogs are stored in the * ZPOOL_CONFIG_CHILDREN array. We filter them out here. */ (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &is_log); (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, &is_hole); if (is_log || is_hole) { /* * Create a hole vdev and put it in the config. */ if (nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) != 0) goto out; if (nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, VDEV_TYPE_HOLE) != 0) goto out; if (nvlist_add_uint64(vdev, ZPOOL_CONFIG_IS_HOLE, 1) != 0) goto out; if (lastlog == 0) lastlog = vcount; varray[vcount++] = vdev; continue; } lastlog = 0; verify(nvlist_lookup_string(child[c], ZPOOL_CONFIG_TYPE, &type) == 0); if (strcmp(type, VDEV_TYPE_MIRROR) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Source pool must be composed only of mirrors\n")); retval = zfs_error(hdl, EZFS_INVALCONFIG, msg); goto out; } verify(nvlist_lookup_nvlist_array(child[c], ZPOOL_CONFIG_CHILDREN, &mchild, &mchildren) == 0); /* find or add an entry for this top-level vdev */ if (newchildren > 0 && (entry = find_vdev_entry(zhp, mchild, mchildren, newchild, newchildren)) >= 0) { /* We found a disk that the user specified. */ vdev = mchild[entry]; ++found; } else { /* User didn't specify a disk for this vdev. */ vdev = mchild[mchildren - 1]; } if (nvlist_dup(vdev, &varray[vcount++], 0) != 0) goto out; } /* did we find every disk the user specified? */ if (found != newchildren) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Device list must " "include at most one disk from each mirror")); retval = zfs_error(hdl, EZFS_INVALCONFIG, msg); goto out; } /* Prepare the nvlist for populating. */ if (*newroot == NULL) { if (nvlist_alloc(newroot, NV_UNIQUE_NAME, 0) != 0) goto out; freelist = B_TRUE; if (nvlist_add_string(*newroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) != 0) goto out; } else { verify(nvlist_remove_all(*newroot, ZPOOL_CONFIG_CHILDREN) == 0); } /* Add all the children we found */ if (nvlist_add_nvlist_array(*newroot, ZPOOL_CONFIG_CHILDREN, varray, lastlog == 0 ? vcount : lastlog) != 0) goto out; /* * If we're just doing a dry run, exit now with success. */ if (flags.dryrun) { memory_err = B_FALSE; freelist = B_FALSE; goto out; } /* now build up the config list & call the ioctl */ if (nvlist_alloc(&newconfig, NV_UNIQUE_NAME, 0) != 0) goto out; if (nvlist_add_nvlist(newconfig, ZPOOL_CONFIG_VDEV_TREE, *newroot) != 0 || nvlist_add_string(newconfig, ZPOOL_CONFIG_POOL_NAME, newname) != 0 || nvlist_add_uint64(newconfig, ZPOOL_CONFIG_VERSION, vers) != 0) goto out; /* * The new pool is automatically part of the namespace unless we * explicitly export it. */ if (!flags.import) zc.zc_cookie = ZPOOL_EXPORT_AFTER_SPLIT; (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); (void) strlcpy(zc.zc_string, newname, sizeof (zc.zc_string)); if (zcmd_write_conf_nvlist(hdl, &zc, newconfig) != 0) goto out; if (zc_props != NULL && zcmd_write_src_nvlist(hdl, &zc, zc_props) != 0) goto out; if (zfs_ioctl(hdl, ZFS_IOC_VDEV_SPLIT, &zc) != 0) { retval = zpool_standard_error(hdl, errno, msg); goto out; } freelist = B_FALSE; memory_err = B_FALSE; out: if (varray != NULL) { int v; for (v = 0; v < vcount; v++) nvlist_free(varray[v]); free(varray); } zcmd_free_nvlists(&zc); if (zc_props) nvlist_free(zc_props); if (newconfig) nvlist_free(newconfig); if (freelist) { nvlist_free(*newroot); *newroot = NULL; } if (retval != 0) return (retval); if (memory_err) return (no_memory(hdl)); return (0); } /* * Remove the given device. Currently, this is supported only for hot spares * and level 2 cache devices. */ int zpool_vdev_remove(zpool_handle_t *zhp, const char *path) { zfs_cmd_t zc = { 0 }; char msg[1024]; nvlist_t *tgt; boolean_t avail_spare, l2cache, islog; libzfs_handle_t *hdl = zhp->zpool_hdl; uint64_t version; (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, "cannot remove %s"), path); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache, &islog)) == 0) return (zfs_error(hdl, EZFS_NODEVICE, msg)); /* * XXX - this should just go away. */ if (!avail_spare && !l2cache && !islog) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "only inactive hot spares, cache, top-level, " "or log devices can be removed")); return (zfs_error(hdl, EZFS_NODEVICE, msg)); } version = zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL); if (islog && version < SPA_VERSION_HOLES) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be upgrade to support log removal")); return (zfs_error(hdl, EZFS_BADVERSION, msg)); } verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0); if (zfs_ioctl(hdl, ZFS_IOC_VDEV_REMOVE, &zc) == 0) return (0); return (zpool_standard_error(hdl, errno, msg)); } /* * Clear the errors for the pool, or the particular device if specified. */ int zpool_clear(zpool_handle_t *zhp, const char *path, nvlist_t *rewindnvl) { zfs_cmd_t zc = { 0 }; char msg[1024]; nvlist_t *tgt; zpool_rewind_policy_t policy; boolean_t avail_spare, l2cache; libzfs_handle_t *hdl = zhp->zpool_hdl; nvlist_t *nvi = NULL; int error; if (path) (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, "cannot clear errors for %s"), path); else (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, "cannot clear errors for %s"), zhp->zpool_name); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); if (path) { if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache, NULL)) == 0) return (zfs_error(hdl, EZFS_NODEVICE, msg)); /* * Don't allow error clearing for hot spares. Do allow * error clearing for l2cache devices. */ if (avail_spare) return (zfs_error(hdl, EZFS_ISSPARE, msg)); verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0); } zpool_get_rewind_policy(rewindnvl, &policy); zc.zc_cookie = policy.zrp_request; if (zcmd_alloc_dst_nvlist(hdl, &zc, zhp->zpool_config_size * 2) != 0) return (-1); if (zcmd_write_src_nvlist(hdl, &zc, rewindnvl) != 0) return (-1); while ((error = zfs_ioctl(hdl, ZFS_IOC_CLEAR, &zc)) != 0 && errno == ENOMEM) { if (zcmd_expand_dst_nvlist(hdl, &zc) != 0) { zcmd_free_nvlists(&zc); return (-1); } } if (!error || ((policy.zrp_request & ZPOOL_TRY_REWIND) && errno != EPERM && errno != EACCES)) { if (policy.zrp_request & (ZPOOL_DO_REWIND | ZPOOL_TRY_REWIND)) { (void) zcmd_read_dst_nvlist(hdl, &zc, &nvi); zpool_rewind_exclaim(hdl, zc.zc_name, ((policy.zrp_request & ZPOOL_TRY_REWIND) != 0), nvi); nvlist_free(nvi); } zcmd_free_nvlists(&zc); return (0); } zcmd_free_nvlists(&zc); return (zpool_standard_error(hdl, errno, msg)); } /* * Similar to zpool_clear(), but takes a GUID (used by fmd). */ int zpool_vdev_clear(zpool_handle_t *zhp, uint64_t guid) { zfs_cmd_t zc = { 0 }; char msg[1024]; libzfs_handle_t *hdl = zhp->zpool_hdl; (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, "cannot clear errors for %llx"), guid); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); zc.zc_guid = guid; zc.zc_cookie = ZPOOL_NO_REWIND; if (ioctl(hdl->libzfs_fd, ZFS_IOC_CLEAR, &zc) == 0) return (0); return (zpool_standard_error(hdl, errno, msg)); } /* * Change the GUID for a pool. */ int zpool_reguid(zpool_handle_t *zhp) { char msg[1024]; libzfs_handle_t *hdl = zhp->zpool_hdl; zfs_cmd_t zc = { 0 }; (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, "cannot reguid '%s'"), zhp->zpool_name); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); if (zfs_ioctl(hdl, ZFS_IOC_POOL_REGUID, &zc) == 0) return (0); return (zpool_standard_error(hdl, errno, msg)); } /* * Reopen the pool. */ int zpool_reopen(zpool_handle_t *zhp) { zfs_cmd_t zc = { 0 }; char msg[1024]; libzfs_handle_t *hdl = zhp->zpool_hdl; (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, "cannot reopen '%s'"), zhp->zpool_name); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); if (zfs_ioctl(hdl, ZFS_IOC_POOL_REOPEN, &zc) == 0) return (0); return (zpool_standard_error(hdl, errno, msg)); } /* * Convert from a devid string to a path. */ static char * devid_to_path(char *devid_str) { ddi_devid_t devid; char *minor; char *path; devid_nmlist_t *list = NULL; int ret; if (devid_str_decode(devid_str, &devid, &minor) != 0) return (NULL); ret = devid_deviceid_to_nmlist("/dev", devid, minor, &list); devid_str_free(minor); devid_free(devid); if (ret != 0) return (NULL); /* * In a case the strdup() fails, we will just return NULL below. */ path = strdup(list[0].devname); devid_free_nmlist(list); return (path); } /* * Convert from a path to a devid string. */ static char * path_to_devid(const char *path) { int fd; ddi_devid_t devid; char *minor, *ret; if ((fd = open(path, O_RDONLY)) < 0) return (NULL); minor = NULL; ret = NULL; if (devid_get(fd, &devid) == 0) { if (devid_get_minor_name(fd, &minor) == 0) ret = devid_str_encode(devid, minor); if (minor != NULL) devid_str_free(minor); devid_free(devid); } (void) close(fd); return (ret); } /* * Issue the necessary ioctl() to update the stored path value for the vdev. We * ignore any failure here, since a common case is for an unprivileged user to * type 'zpool status', and we'll display the correct information anyway. */ static void set_path(zpool_handle_t *zhp, nvlist_t *nv, const char *path) { zfs_cmd_t zc = { 0 }; (void) strncpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); (void) strncpy(zc.zc_value, path, sizeof (zc.zc_value)); verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0); (void) ioctl(zhp->zpool_hdl->libzfs_fd, ZFS_IOC_VDEV_SETPATH, &zc); } /* * Given a vdev, return the name to display in iostat. If the vdev has a path, * we use that, stripping off any leading "/dev/dsk/"; if not, we use the type. * We also check if this is a whole disk, in which case we strip off the * trailing 's0' slice name. * * This routine is also responsible for identifying when disks have been * reconfigured in a new location. The kernel will have opened the device by * devid, but the path will still refer to the old location. To catch this, we * first do a path -> devid translation (which is fast for the common case). If * the devid matches, we're done. If not, we do a reverse devid -> path * translation and issue the appropriate ioctl() to update the path of the vdev. * If 'zhp' is NULL, then this is an exported pool, and we don't need to do any * of these checks. */ char * zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv, boolean_t verbose) { char *path, *devid; uint64_t value; char buf[64]; vdev_stat_t *vs; uint_t vsc; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, &value) == 0) { verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &value) == 0); (void) snprintf(buf, sizeof (buf), "%llu", (u_longlong_t)value); path = buf; } else if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0) { /* * If the device is dead (faulted, offline, etc) then don't * bother opening it. Otherwise we may be forcing the user to * open a misbehaving device, which can have undesirable * effects. */ if ((nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) != 0 || vs->vs_state >= VDEV_STATE_DEGRADED) && zhp != NULL && nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &devid) == 0) { /* * Determine if the current path is correct. */ char *newdevid = path_to_devid(path); if (newdevid == NULL || strcmp(devid, newdevid) != 0) { char *newpath; if ((newpath = devid_to_path(devid)) != NULL) { /* * Update the path appropriately. */ set_path(zhp, nv, newpath); if (nvlist_add_string(nv, ZPOOL_CONFIG_PATH, newpath) == 0) verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0); free(newpath); } } if (newdevid) devid_str_free(newdevid); } if (strncmp(path, "/dev/dsk/", 9) == 0) path += 9; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &value) == 0 && value) { int pathlen = strlen(path); char *tmp = zfs_strdup(hdl, path); /* * If it starts with c#, and ends with "s0", chop * the "s0" off, or if it ends with "s0/old", remove * the "s0" from the middle. */ if (CTD_CHECK(tmp)) { if (strcmp(&tmp[pathlen - 2], "s0") == 0) { tmp[pathlen - 2] = '\0'; } else if (pathlen > 6 && strcmp(&tmp[pathlen - 6], "s0/old") == 0) { (void) strcpy(&tmp[pathlen - 6], "/old"); } } return (tmp); } } else { verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &path) == 0); /* * If it's a raidz device, we need to stick in the parity level. */ if (strcmp(path, VDEV_TYPE_RAIDZ) == 0) { verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &value) == 0); (void) snprintf(buf, sizeof (buf), "%s%llu", path, (u_longlong_t)value); path = buf; } /* * We identify each top-level vdev by using a * naming convention. */ if (verbose) { uint64_t id; verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &id) == 0); (void) snprintf(buf, sizeof (buf), "%s-%llu", path, (u_longlong_t)id); path = buf; } } return (zfs_strdup(hdl, path)); } static int zbookmark_mem_compare(const void *a, const void *b) { return (memcmp(a, b, sizeof (zbookmark_phys_t))); } /* * Retrieve the persistent error log, uniquify the members, and return to the * caller. */ int zpool_get_errlog(zpool_handle_t *zhp, nvlist_t **nverrlistp) { zfs_cmd_t zc = { 0 }; uint64_t count; zbookmark_phys_t *zb = NULL; int i; /* * Retrieve the raw error list from the kernel. If the number of errors * has increased, allocate more space and continue until we get the * entire list. */ verify(nvlist_lookup_uint64(zhp->zpool_config, ZPOOL_CONFIG_ERRCOUNT, &count) == 0); if (count == 0) return (0); if ((zc.zc_nvlist_dst = (uintptr_t)zfs_alloc(zhp->zpool_hdl, count * sizeof (zbookmark_phys_t))) == (uintptr_t)NULL) return (-1); zc.zc_nvlist_dst_size = count; (void) strcpy(zc.zc_name, zhp->zpool_name); for (;;) { if (ioctl(zhp->zpool_hdl->libzfs_fd, ZFS_IOC_ERROR_LOG, &zc) != 0) { free((void *)(uintptr_t)zc.zc_nvlist_dst); if (errno == ENOMEM) { void *dst; count = zc.zc_nvlist_dst_size; dst = zfs_alloc(zhp->zpool_hdl, count * sizeof (zbookmark_phys_t)); if (dst == NULL) return (-1); zc.zc_nvlist_dst = (uintptr_t)dst; } else { return (-1); } } else { break; } } /* * Sort the resulting bookmarks. This is a little confusing due to the * implementation of ZFS_IOC_ERROR_LOG. The bookmarks are copied last * to first, and 'zc_nvlist_dst_size' indicates the number of boomarks * _not_ copied as part of the process. So we point the start of our * array appropriate and decrement the total number of elements. */ zb = ((zbookmark_phys_t *)(uintptr_t)zc.zc_nvlist_dst) + zc.zc_nvlist_dst_size; count -= zc.zc_nvlist_dst_size; qsort(zb, count, sizeof (zbookmark_phys_t), zbookmark_mem_compare); verify(nvlist_alloc(nverrlistp, 0, KM_SLEEP) == 0); /* * Fill in the nverrlistp with nvlist's of dataset and object numbers. */ for (i = 0; i < count; i++) { nvlist_t *nv; /* ignoring zb_blkid and zb_level for now */ if (i > 0 && zb[i-1].zb_objset == zb[i].zb_objset && zb[i-1].zb_object == zb[i].zb_object) continue; if (nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) != 0) goto nomem; if (nvlist_add_uint64(nv, ZPOOL_ERR_DATASET, zb[i].zb_objset) != 0) { nvlist_free(nv); goto nomem; } if (nvlist_add_uint64(nv, ZPOOL_ERR_OBJECT, zb[i].zb_object) != 0) { nvlist_free(nv); goto nomem; } if (nvlist_add_nvlist(*nverrlistp, "ejk", nv) != 0) { nvlist_free(nv); goto nomem; } nvlist_free(nv); } free((void *)(uintptr_t)zc.zc_nvlist_dst); return (0); nomem: free((void *)(uintptr_t)zc.zc_nvlist_dst); return (no_memory(zhp->zpool_hdl)); } /* * Upgrade a ZFS pool to the latest on-disk version. */ int zpool_upgrade(zpool_handle_t *zhp, uint64_t new_version) { zfs_cmd_t zc = { 0 }; libzfs_handle_t *hdl = zhp->zpool_hdl; (void) strcpy(zc.zc_name, zhp->zpool_name); zc.zc_cookie = new_version; if (zfs_ioctl(hdl, ZFS_IOC_POOL_UPGRADE, &zc) != 0) return (zpool_standard_error_fmt(hdl, errno, dgettext(TEXT_DOMAIN, "cannot upgrade '%s'"), zhp->zpool_name)); return (0); } void zfs_save_arguments(int argc, char **argv, char *string, int len) { (void) strlcpy(string, basename(argv[0]), len); for (int i = 1; i < argc; i++) { (void) strlcat(string, " ", len); (void) strlcat(string, argv[i], len); } } int zpool_log_history(libzfs_handle_t *hdl, const char *message) { zfs_cmd_t zc = { 0 }; nvlist_t *args; int err; args = fnvlist_alloc(); fnvlist_add_string(args, "message", message); err = zcmd_write_src_nvlist(hdl, &zc, args); if (err == 0) err = ioctl(hdl->libzfs_fd, ZFS_IOC_LOG_HISTORY, &zc); nvlist_free(args); zcmd_free_nvlists(&zc); return (err); } /* * Perform ioctl to get some command history of a pool. * * 'buf' is the buffer to fill up to 'len' bytes. 'off' is the * logical offset of the history buffer to start reading from. * * Upon return, 'off' is the next logical offset to read from and * 'len' is the actual amount of bytes read into 'buf'. */ static int get_history(zpool_handle_t *zhp, char *buf, uint64_t *off, uint64_t *len) { zfs_cmd_t zc = { 0 }; libzfs_handle_t *hdl = zhp->zpool_hdl; (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); zc.zc_history = (uint64_t)(uintptr_t)buf; zc.zc_history_len = *len; zc.zc_history_offset = *off; if (ioctl(hdl->libzfs_fd, ZFS_IOC_POOL_GET_HISTORY, &zc) != 0) { switch (errno) { case EPERM: return (zfs_error_fmt(hdl, EZFS_PERM, dgettext(TEXT_DOMAIN, "cannot show history for pool '%s'"), zhp->zpool_name)); case ENOENT: return (zfs_error_fmt(hdl, EZFS_NOHISTORY, dgettext(TEXT_DOMAIN, "cannot get history for pool " "'%s'"), zhp->zpool_name)); case ENOTSUP: return (zfs_error_fmt(hdl, EZFS_BADVERSION, dgettext(TEXT_DOMAIN, "cannot get history for pool " "'%s', pool must be upgraded"), zhp->zpool_name)); default: return (zpool_standard_error_fmt(hdl, errno, dgettext(TEXT_DOMAIN, "cannot get history for '%s'"), zhp->zpool_name)); } } *len = zc.zc_history_len; *off = zc.zc_history_offset; return (0); } /* * Process the buffer of nvlists, unpacking and storing each nvlist record * into 'records'. 'leftover' is set to the number of bytes that weren't * processed as there wasn't a complete record. */ int zpool_history_unpack(char *buf, uint64_t bytes_read, uint64_t *leftover, nvlist_t ***records, uint_t *numrecords) { uint64_t reclen; nvlist_t *nv; int i; while (bytes_read > sizeof (reclen)) { /* get length of packed record (stored as little endian) */ for (i = 0, reclen = 0; i < sizeof (reclen); i++) reclen += (uint64_t)(((uchar_t *)buf)[i]) << (8*i); if (bytes_read < sizeof (reclen) + reclen) break; /* unpack record */ if (nvlist_unpack(buf + sizeof (reclen), reclen, &nv, 0) != 0) return (ENOMEM); bytes_read -= sizeof (reclen) + reclen; buf += sizeof (reclen) + reclen; /* add record to nvlist array */ (*numrecords)++; if (ISP2(*numrecords + 1)) { *records = realloc(*records, *numrecords * 2 * sizeof (nvlist_t *)); } (*records)[*numrecords - 1] = nv; } *leftover = bytes_read; return (0); } /* * Retrieve the command history of a pool. */ int zpool_get_history(zpool_handle_t *zhp, nvlist_t **nvhisp) { char *buf; int buflen = 128 * 1024; uint64_t off = 0; nvlist_t **records = NULL; uint_t numrecords = 0; int err, i; buf = malloc(buflen); if (buf == NULL) return (ENOMEM); do { uint64_t bytes_read = buflen; uint64_t leftover; if ((err = get_history(zhp, buf, &off, &bytes_read)) != 0) break; /* if nothing else was read in, we're at EOF, just return */ if (!bytes_read) break; if ((err = zpool_history_unpack(buf, bytes_read, &leftover, &records, &numrecords)) != 0) break; off -= leftover; if (leftover == bytes_read) { /* * no progress made, because buffer is not big enough * to hold this record; resize and retry. */ buflen *= 2; free(buf); buf = malloc(buflen); if (buf == NULL) return (ENOMEM); } /* CONSTCOND */ } while (1); free(buf); if (!err) { verify(nvlist_alloc(nvhisp, NV_UNIQUE_NAME, 0) == 0); verify(nvlist_add_nvlist_array(*nvhisp, ZPOOL_HIST_RECORD, records, numrecords) == 0); } for (i = 0; i < numrecords; i++) nvlist_free(records[i]); free(records); return (err); } void zpool_obj_to_path(zpool_handle_t *zhp, uint64_t dsobj, uint64_t obj, char *pathname, size_t len) { zfs_cmd_t zc = { 0 }; boolean_t mounted = B_FALSE; char *mntpnt = NULL; char dsname[MAXNAMELEN]; if (dsobj == 0) { /* special case for the MOS */ (void) snprintf(pathname, len, ":<0x%llx>", obj); return; } /* get the dataset's name */ (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); zc.zc_obj = dsobj; if (ioctl(zhp->zpool_hdl->libzfs_fd, ZFS_IOC_DSOBJ_TO_DSNAME, &zc) != 0) { /* just write out a path of two object numbers */ (void) snprintf(pathname, len, "<0x%llx>:<0x%llx>", dsobj, obj); return; } (void) strlcpy(dsname, zc.zc_value, sizeof (dsname)); /* find out if the dataset is mounted */ mounted = is_mounted(zhp->zpool_hdl, dsname, &mntpnt); /* get the corrupted object's path */ (void) strlcpy(zc.zc_name, dsname, sizeof (zc.zc_name)); zc.zc_obj = obj; if (ioctl(zhp->zpool_hdl->libzfs_fd, ZFS_IOC_OBJ_TO_PATH, &zc) == 0) { if (mounted) { (void) snprintf(pathname, len, "%s%s", mntpnt, zc.zc_value); } else { (void) snprintf(pathname, len, "%s:%s", dsname, zc.zc_value); } } else { (void) snprintf(pathname, len, "%s:<0x%llx>", dsname, obj); } free(mntpnt); } /* * Read the EFI label from the config, if a label does not exist then * pass back the error to the caller. If the caller has passed a non-NULL * diskaddr argument then we set it to the starting address of the EFI * partition. */ static int read_efi_label(nvlist_t *config, diskaddr_t *sb) { char *path; int fd; char diskname[MAXPATHLEN]; int err = -1; if (nvlist_lookup_string(config, ZPOOL_CONFIG_PATH, &path) != 0) return (err); (void) snprintf(diskname, sizeof (diskname), "%s%s", RDISK_ROOT, strrchr(path, '/')); if ((fd = open(diskname, O_RDONLY|O_NDELAY)) >= 0) { struct dk_gpt *vtoc; if ((err = efi_alloc_and_read(fd, &vtoc)) >= 0) { if (sb != NULL) *sb = vtoc->efi_parts[0].p_start; efi_free(vtoc); } (void) close(fd); } return (err); } /* * determine where a partition starts on a disk in the current * configuration */ static diskaddr_t find_start_block(nvlist_t *config) { nvlist_t **child; uint_t c, children; diskaddr_t sb = MAXOFFSET_T; uint64_t wholedisk; if (nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) { if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk) != 0 || !wholedisk) { return (MAXOFFSET_T); } if (read_efi_label(config, &sb) < 0) sb = MAXOFFSET_T; return (sb); } for (c = 0; c < children; c++) { sb = find_start_block(child[c]); if (sb != MAXOFFSET_T) { return (sb); } } return (MAXOFFSET_T); } /* * Label an individual disk. The name provided is the short name, * stripped of any leading /dev path. */ int zpool_label_disk(libzfs_handle_t *hdl, zpool_handle_t *zhp, char *name) { char path[MAXPATHLEN]; struct dk_gpt *vtoc; int fd; size_t resv = EFI_MIN_RESV_SIZE; uint64_t slice_size; diskaddr_t start_block; char errbuf[1024]; /* prepare an error message just in case */ (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot label '%s'"), name); if (zhp) { nvlist_t *nvroot; - - if (zpool_is_bootable(zhp)) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "EFI labeled devices are not supported on root " - "pools.")); - return (zfs_error(hdl, EZFS_POOL_NOTSUP, errbuf)); - } verify(nvlist_lookup_nvlist(zhp->zpool_config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); if (zhp->zpool_start_block == 0) start_block = find_start_block(nvroot); else start_block = zhp->zpool_start_block; zhp->zpool_start_block = start_block; } else { /* new pool */ start_block = NEW_START_BLOCK; } (void) snprintf(path, sizeof (path), "%s/%s%s", RDISK_ROOT, name, BACKUP_SLICE); if ((fd = open(path, O_RDWR | O_NDELAY)) < 0) { /* * This shouldn't happen. We've long since verified that this * is a valid device. */ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "unable to open device")); return (zfs_error(hdl, EZFS_OPENFAILED, errbuf)); } if (efi_alloc_and_init(fd, EFI_NUMPAR, &vtoc) != 0) { /* * The only way this can fail is if we run out of memory, or we * were unable to read the disk's capacity */ if (errno == ENOMEM) (void) no_memory(hdl); (void) close(fd); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "unable to read disk capacity"), name); return (zfs_error(hdl, EZFS_NOCAP, errbuf)); } slice_size = vtoc->efi_last_u_lba + 1; slice_size -= EFI_MIN_RESV_SIZE; if (start_block == MAXOFFSET_T) start_block = NEW_START_BLOCK; slice_size -= start_block; vtoc->efi_parts[0].p_start = start_block; vtoc->efi_parts[0].p_size = slice_size; /* * Why we use V_USR: V_BACKUP confuses users, and is considered * disposable by some EFI utilities (since EFI doesn't have a backup * slice). V_UNASSIGNED is supposed to be used only for zero size * partitions, and efi_write() will fail if we use it. V_ROOT, V_BOOT, * etc. were all pretty specific. V_USR is as close to reality as we * can get, in the absence of V_OTHER. */ vtoc->efi_parts[0].p_tag = V_USR; (void) strcpy(vtoc->efi_parts[0].p_name, "zfs"); vtoc->efi_parts[8].p_start = slice_size + start_block; vtoc->efi_parts[8].p_size = resv; vtoc->efi_parts[8].p_tag = V_RESERVED; if (efi_write(fd, vtoc) != 0) { /* * Some block drivers (like pcata) may not support EFI * GPT labels. Print out a helpful error message dir- * ecting the user to manually label the disk and give * a specific slice. */ (void) close(fd); efi_free(vtoc); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "try using fdisk(1M) and then provide a specific slice")); return (zfs_error(hdl, EZFS_LABELFAILED, errbuf)); } (void) close(fd); efi_free(vtoc); return (0); } static boolean_t supported_dump_vdev_type(libzfs_handle_t *hdl, nvlist_t *config, char *errbuf) { char *type; nvlist_t **child; uint_t children, c; verify(nvlist_lookup_string(config, ZPOOL_CONFIG_TYPE, &type) == 0); if (strcmp(type, VDEV_TYPE_FILE) == 0 || strcmp(type, VDEV_TYPE_HOLE) == 0 || strcmp(type, VDEV_TYPE_MISSING) == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "vdev type '%s' is not supported"), type); (void) zfs_error(hdl, EZFS_VDEVNOTSUP, errbuf); return (B_FALSE); } if (nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_CHILDREN, &child, &children) == 0) { for (c = 0; c < children; c++) { if (!supported_dump_vdev_type(hdl, child[c], errbuf)) return (B_FALSE); } } return (B_TRUE); } /* * Check if this zvol is allowable for use as a dump device; zero if * it is, > 0 if it isn't, < 0 if it isn't a zvol. * * Allowable storage configurations include mirrors, all raidz variants, and * pools with log, cache, and spare devices. Pools which are backed by files or * have missing/hole vdevs are not suitable. */ int zvol_check_dump_config(char *arg) { zpool_handle_t *zhp = NULL; nvlist_t *config, *nvroot; char *p, *volname; nvlist_t **top; uint_t toplevels; libzfs_handle_t *hdl; char errbuf[1024]; char poolname[ZPOOL_MAXNAMELEN]; int pathlen = strlen(ZVOL_FULL_DEV_DIR); int ret = 1; if (strncmp(arg, ZVOL_FULL_DEV_DIR, pathlen)) { return (-1); } (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "dump is not supported on device '%s'"), arg); if ((hdl = libzfs_init()) == NULL) return (1); libzfs_print_on_error(hdl, B_TRUE); volname = arg + pathlen; /* check the configuration of the pool */ if ((p = strchr(volname, '/')) == NULL) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "malformed dataset name")); (void) zfs_error(hdl, EZFS_INVALIDNAME, errbuf); return (1); } else if (p - volname >= ZFS_MAXNAMELEN) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "dataset name is too long")); (void) zfs_error(hdl, EZFS_NAMETOOLONG, errbuf); return (1); } else { (void) strncpy(poolname, volname, p - volname); poolname[p - volname] = '\0'; } if ((zhp = zpool_open(hdl, poolname)) == NULL) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "could not open pool '%s'"), poolname); (void) zfs_error(hdl, EZFS_OPENFAILED, errbuf); goto out; } config = zpool_get_config(zhp, NULL); if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "could not obtain vdev configuration for '%s'"), poolname); (void) zfs_error(hdl, EZFS_INVALCONFIG, errbuf); goto out; } verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &top, &toplevels) == 0); if (!supported_dump_vdev_type(hdl, top[0], errbuf)) { goto out; } ret = 0; out: if (zhp) zpool_close(zhp); libzfs_fini(hdl); return (ret); } Index: vendor-sys/illumos/dist/uts/common/fs/zfs/vdev.c =================================================================== --- vendor-sys/illumos/dist/uts/common/fs/zfs/vdev.c (revision 289525) +++ vendor-sys/illumos/dist/uts/common/fs/zfs/vdev.c (revision 289526) @@ -1,3377 +1,3375 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2015 by Delphix. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Virtual device management. */ static vdev_ops_t *vdev_ops_table[] = { &vdev_root_ops, &vdev_raidz_ops, &vdev_mirror_ops, &vdev_replacing_ops, &vdev_spare_ops, &vdev_disk_ops, &vdev_file_ops, &vdev_missing_ops, &vdev_hole_ops, NULL }; /* maximum scrub/resilver I/O queue per leaf vdev */ int zfs_scrub_limit = 10; /* * When a vdev is added, it will be divided into approximately (but no * more than) this number of metaslabs. */ int metaslabs_per_vdev = 200; /* * Given a vdev type, return the appropriate ops vector. */ static vdev_ops_t * vdev_getops(const char *type) { vdev_ops_t *ops, **opspp; for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++) if (strcmp(ops->vdev_op_type, type) == 0) break; return (ops); } /* * Default asize function: return the MAX of psize with the asize of * all children. This is what's used by anything other than RAID-Z. */ uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize) { uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift); uint64_t csize; for (int c = 0; c < vd->vdev_children; c++) { csize = vdev_psize_to_asize(vd->vdev_child[c], psize); asize = MAX(asize, csize); } return (asize); } /* * Get the minimum allocatable size. We define the allocatable size as * the vdev's asize rounded to the nearest metaslab. This allows us to * replace or attach devices which don't have the same physical size but * can still satisfy the same number of allocations. */ uint64_t vdev_get_min_asize(vdev_t *vd) { vdev_t *pvd = vd->vdev_parent; /* * If our parent is NULL (inactive spare or cache) or is the root, * just return our own asize. */ if (pvd == NULL) return (vd->vdev_asize); /* * The top-level vdev just returns the allocatable size rounded * to the nearest metaslab. */ if (vd == vd->vdev_top) return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift)); /* * The allocatable space for a raidz vdev is N * sizeof(smallest child), * so each child must provide at least 1/Nth of its asize. */ if (pvd->vdev_ops == &vdev_raidz_ops) return (pvd->vdev_min_asize / pvd->vdev_children); return (pvd->vdev_min_asize); } void vdev_set_min_asize(vdev_t *vd) { vd->vdev_min_asize = vdev_get_min_asize(vd); for (int c = 0; c < vd->vdev_children; c++) vdev_set_min_asize(vd->vdev_child[c]); } vdev_t * vdev_lookup_top(spa_t *spa, uint64_t vdev) { vdev_t *rvd = spa->spa_root_vdev; ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); if (vdev < rvd->vdev_children) { ASSERT(rvd->vdev_child[vdev] != NULL); return (rvd->vdev_child[vdev]); } return (NULL); } vdev_t * vdev_lookup_by_guid(vdev_t *vd, uint64_t guid) { vdev_t *mvd; if (vd->vdev_guid == guid) return (vd); for (int c = 0; c < vd->vdev_children; c++) if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) != NULL) return (mvd); return (NULL); } static int vdev_count_leaves_impl(vdev_t *vd) { int n = 0; if (vd->vdev_ops->vdev_op_leaf) return (1); for (int c = 0; c < vd->vdev_children; c++) n += vdev_count_leaves_impl(vd->vdev_child[c]); return (n); } int vdev_count_leaves(spa_t *spa) { return (vdev_count_leaves_impl(spa->spa_root_vdev)); } void vdev_add_child(vdev_t *pvd, vdev_t *cvd) { size_t oldsize, newsize; uint64_t id = cvd->vdev_id; vdev_t **newchild; spa_t *spa = cvd->vdev_spa; ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); ASSERT(cvd->vdev_parent == NULL); cvd->vdev_parent = pvd; if (pvd == NULL) return; ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL); oldsize = pvd->vdev_children * sizeof (vdev_t *); pvd->vdev_children = MAX(pvd->vdev_children, id + 1); newsize = pvd->vdev_children * sizeof (vdev_t *); newchild = kmem_zalloc(newsize, KM_SLEEP); if (pvd->vdev_child != NULL) { bcopy(pvd->vdev_child, newchild, oldsize); kmem_free(pvd->vdev_child, oldsize); } pvd->vdev_child = newchild; pvd->vdev_child[id] = cvd; cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd); ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL); /* * Walk up all ancestors to update guid sum. */ for (; pvd != NULL; pvd = pvd->vdev_parent) pvd->vdev_guid_sum += cvd->vdev_guid_sum; } void vdev_remove_child(vdev_t *pvd, vdev_t *cvd) { int c; uint_t id = cvd->vdev_id; ASSERT(cvd->vdev_parent == pvd); if (pvd == NULL) return; ASSERT(id < pvd->vdev_children); ASSERT(pvd->vdev_child[id] == cvd); pvd->vdev_child[id] = NULL; cvd->vdev_parent = NULL; for (c = 0; c < pvd->vdev_children; c++) if (pvd->vdev_child[c]) break; if (c == pvd->vdev_children) { kmem_free(pvd->vdev_child, c * sizeof (vdev_t *)); pvd->vdev_child = NULL; pvd->vdev_children = 0; } /* * Walk up all ancestors to update guid sum. */ for (; pvd != NULL; pvd = pvd->vdev_parent) pvd->vdev_guid_sum -= cvd->vdev_guid_sum; } /* * Remove any holes in the child array. */ void vdev_compact_children(vdev_t *pvd) { vdev_t **newchild, *cvd; int oldc = pvd->vdev_children; int newc; ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); for (int c = newc = 0; c < oldc; c++) if (pvd->vdev_child[c]) newc++; newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP); for (int c = newc = 0; c < oldc; c++) { if ((cvd = pvd->vdev_child[c]) != NULL) { newchild[newc] = cvd; cvd->vdev_id = newc++; } } kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *)); pvd->vdev_child = newchild; pvd->vdev_children = newc; } /* * Allocate and minimally initialize a vdev_t. */ vdev_t * vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) { vdev_t *vd; vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP); if (spa->spa_root_vdev == NULL) { ASSERT(ops == &vdev_root_ops); spa->spa_root_vdev = vd; spa->spa_load_guid = spa_generate_guid(NULL); } if (guid == 0 && ops != &vdev_hole_ops) { if (spa->spa_root_vdev == vd) { /* * The root vdev's guid will also be the pool guid, * which must be unique among all pools. */ guid = spa_generate_guid(NULL); } else { /* * Any other vdev's guid must be unique within the pool. */ guid = spa_generate_guid(spa); } ASSERT(!spa_guid_exists(spa_guid(spa), guid)); } vd->vdev_spa = spa; vd->vdev_id = id; vd->vdev_guid = guid; vd->vdev_guid_sum = guid; vd->vdev_ops = ops; vd->vdev_state = VDEV_STATE_CLOSED; vd->vdev_ishole = (ops == &vdev_hole_ops); mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL); for (int t = 0; t < DTL_TYPES; t++) { vd->vdev_dtl[t] = range_tree_create(NULL, NULL, &vd->vdev_dtl_lock); } txg_list_create(&vd->vdev_ms_list, offsetof(struct metaslab, ms_txg_node)); txg_list_create(&vd->vdev_dtl_list, offsetof(struct vdev, vdev_dtl_node)); vd->vdev_stat.vs_timestamp = gethrtime(); vdev_queue_init(vd); vdev_cache_init(vd); return (vd); } /* * Allocate a new vdev. The 'alloctype' is used to control whether we are * creating a new vdev or loading an existing one - the behavior is slightly * different for each case. */ int vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, int alloctype) { vdev_ops_t *ops; char *type; uint64_t guid = 0, islog, nparity; vdev_t *vd; ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0) return (SET_ERROR(EINVAL)); if ((ops = vdev_getops(type)) == NULL) return (SET_ERROR(EINVAL)); /* * If this is a load, get the vdev guid from the nvlist. * Otherwise, vdev_alloc_common() will generate one for us. */ if (alloctype == VDEV_ALLOC_LOAD) { uint64_t label_id; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) || label_id != id) return (SET_ERROR(EINVAL)); if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (SET_ERROR(EINVAL)); } else if (alloctype == VDEV_ALLOC_SPARE) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (SET_ERROR(EINVAL)); } else if (alloctype == VDEV_ALLOC_L2CACHE) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (SET_ERROR(EINVAL)); } else if (alloctype == VDEV_ALLOC_ROOTPOOL) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (SET_ERROR(EINVAL)); } /* * The first allocated vdev must be of type 'root'. */ if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL) return (SET_ERROR(EINVAL)); /* * Determine whether we're a log vdev. */ islog = 0; (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog); if (islog && spa_version(spa) < SPA_VERSION_SLOGS) return (SET_ERROR(ENOTSUP)); if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES) return (SET_ERROR(ENOTSUP)); /* * Set the nparity property for RAID-Z vdevs. */ nparity = -1ULL; if (ops == &vdev_raidz_ops) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) { if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY) return (SET_ERROR(EINVAL)); /* * Previous versions could only support 1 or 2 parity * device. */ if (nparity > 1 && spa_version(spa) < SPA_VERSION_RAIDZ2) return (SET_ERROR(ENOTSUP)); if (nparity > 2 && spa_version(spa) < SPA_VERSION_RAIDZ3) return (SET_ERROR(ENOTSUP)); } else { /* * We require the parity to be specified for SPAs that * support multiple parity levels. */ if (spa_version(spa) >= SPA_VERSION_RAIDZ2) return (SET_ERROR(EINVAL)); /* * Otherwise, we default to 1 parity device for RAID-Z. */ nparity = 1; } } else { nparity = 0; } ASSERT(nparity != -1ULL); vd = vdev_alloc_common(spa, id, guid, ops); vd->vdev_islog = islog; vd->vdev_nparity = nparity; if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0) vd->vdev_path = spa_strdup(vd->vdev_path); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0) vd->vdev_devid = spa_strdup(vd->vdev_devid); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH, &vd->vdev_physpath) == 0) vd->vdev_physpath = spa_strdup(vd->vdev_physpath); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &vd->vdev_fru) == 0) vd->vdev_fru = spa_strdup(vd->vdev_fru); /* * Set the whole_disk property. If it's not specified, leave the value * as -1. */ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &vd->vdev_wholedisk) != 0) vd->vdev_wholedisk = -1ULL; /* * Look for the 'not present' flag. This will only be set if the device * was not present at the time of import. */ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, &vd->vdev_not_present); /* * Get the alignment requirement. */ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift); /* * Retrieve the vdev creation time. */ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, &vd->vdev_crtxg); /* * If we're a top-level vdev, try to load the allocation parameters. */ if (parent && !parent->vdev_parent && (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, &vd->vdev_ms_array); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT, &vd->vdev_ms_shift); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE, &vd->vdev_asize); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING, &vd->vdev_removing); } if (parent && !parent->vdev_parent && alloctype != VDEV_ALLOC_ATTACH) { ASSERT(alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_ADD || alloctype == VDEV_ALLOC_SPLIT || alloctype == VDEV_ALLOC_ROOTPOOL); vd->vdev_mg = metaslab_group_create(islog ? spa_log_class(spa) : spa_normal_class(spa), vd); } /* * If we're a leaf vdev, try to load the DTL object and other state. */ if (vd->vdev_ops->vdev_op_leaf && (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE || alloctype == VDEV_ALLOC_ROOTPOOL)) { if (alloctype == VDEV_ALLOC_LOAD) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL, &vd->vdev_dtl_object); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE, &vd->vdev_unspare); } if (alloctype == VDEV_ALLOC_ROOTPOOL) { uint64_t spare = 0; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE, &spare) == 0 && spare) spa_spare_add(vd); } (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, &vd->vdev_offline); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG, &vd->vdev_resilver_txg); /* * When importing a pool, we want to ignore the persistent fault * state, as the diagnosis made on another system may not be * valid in the current context. Local vdevs will * remain in the faulted state. */ if (spa_load_state(spa) == SPA_LOAD_OPEN) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, &vd->vdev_faulted); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED, &vd->vdev_degraded); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, &vd->vdev_removed); if (vd->vdev_faulted || vd->vdev_degraded) { char *aux; vd->vdev_label_aux = VDEV_AUX_ERR_EXCEEDED; if (nvlist_lookup_string(nv, ZPOOL_CONFIG_AUX_STATE, &aux) == 0 && strcmp(aux, "external") == 0) vd->vdev_label_aux = VDEV_AUX_EXTERNAL; } } } /* * Add ourselves to the parent's list of children. */ vdev_add_child(parent, vd); *vdp = vd; return (0); } void vdev_free(vdev_t *vd) { spa_t *spa = vd->vdev_spa; /* * vdev_free() implies closing the vdev first. This is simpler than * trying to ensure complicated semantics for all callers. */ vdev_close(vd); ASSERT(!list_link_active(&vd->vdev_config_dirty_node)); ASSERT(!list_link_active(&vd->vdev_state_dirty_node)); /* * Free all children. */ for (int c = 0; c < vd->vdev_children; c++) vdev_free(vd->vdev_child[c]); ASSERT(vd->vdev_child == NULL); ASSERT(vd->vdev_guid_sum == vd->vdev_guid); /* * Discard allocation state. */ if (vd->vdev_mg != NULL) { vdev_metaslab_fini(vd); metaslab_group_destroy(vd->vdev_mg); } ASSERT0(vd->vdev_stat.vs_space); ASSERT0(vd->vdev_stat.vs_dspace); ASSERT0(vd->vdev_stat.vs_alloc); /* * Remove this vdev from its parent's child list. */ vdev_remove_child(vd->vdev_parent, vd); ASSERT(vd->vdev_parent == NULL); /* * Clean up vdev structure. */ vdev_queue_fini(vd); vdev_cache_fini(vd); if (vd->vdev_path) spa_strfree(vd->vdev_path); if (vd->vdev_devid) spa_strfree(vd->vdev_devid); if (vd->vdev_physpath) spa_strfree(vd->vdev_physpath); if (vd->vdev_fru) spa_strfree(vd->vdev_fru); if (vd->vdev_isspare) spa_spare_remove(vd); if (vd->vdev_isl2cache) spa_l2cache_remove(vd); txg_list_destroy(&vd->vdev_ms_list); txg_list_destroy(&vd->vdev_dtl_list); mutex_enter(&vd->vdev_dtl_lock); space_map_close(vd->vdev_dtl_sm); for (int t = 0; t < DTL_TYPES; t++) { range_tree_vacate(vd->vdev_dtl[t], NULL, NULL); range_tree_destroy(vd->vdev_dtl[t]); } mutex_exit(&vd->vdev_dtl_lock); mutex_destroy(&vd->vdev_dtl_lock); mutex_destroy(&vd->vdev_stat_lock); mutex_destroy(&vd->vdev_probe_lock); if (vd == spa->spa_root_vdev) spa->spa_root_vdev = NULL; kmem_free(vd, sizeof (vdev_t)); } /* * Transfer top-level vdev state from svd to tvd. */ static void vdev_top_transfer(vdev_t *svd, vdev_t *tvd) { spa_t *spa = svd->vdev_spa; metaslab_t *msp; vdev_t *vd; int t; ASSERT(tvd == tvd->vdev_top); tvd->vdev_ms_array = svd->vdev_ms_array; tvd->vdev_ms_shift = svd->vdev_ms_shift; tvd->vdev_ms_count = svd->vdev_ms_count; svd->vdev_ms_array = 0; svd->vdev_ms_shift = 0; svd->vdev_ms_count = 0; if (tvd->vdev_mg) ASSERT3P(tvd->vdev_mg, ==, svd->vdev_mg); tvd->vdev_mg = svd->vdev_mg; tvd->vdev_ms = svd->vdev_ms; svd->vdev_mg = NULL; svd->vdev_ms = NULL; if (tvd->vdev_mg != NULL) tvd->vdev_mg->mg_vd = tvd; tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc; tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space; tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace; svd->vdev_stat.vs_alloc = 0; svd->vdev_stat.vs_space = 0; svd->vdev_stat.vs_dspace = 0; for (t = 0; t < TXG_SIZE; t++) { while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL) (void) txg_list_add(&tvd->vdev_ms_list, msp, t); while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL) (void) txg_list_add(&tvd->vdev_dtl_list, vd, t); if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t)) (void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t); } if (list_link_active(&svd->vdev_config_dirty_node)) { vdev_config_clean(svd); vdev_config_dirty(tvd); } if (list_link_active(&svd->vdev_state_dirty_node)) { vdev_state_clean(svd); vdev_state_dirty(tvd); } tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio; svd->vdev_deflate_ratio = 0; tvd->vdev_islog = svd->vdev_islog; svd->vdev_islog = 0; } static void vdev_top_update(vdev_t *tvd, vdev_t *vd) { if (vd == NULL) return; vd->vdev_top = tvd; for (int c = 0; c < vd->vdev_children; c++) vdev_top_update(tvd, vd->vdev_child[c]); } /* * Add a mirror/replacing vdev above an existing vdev. */ vdev_t * vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops) { spa_t *spa = cvd->vdev_spa; vdev_t *pvd = cvd->vdev_parent; vdev_t *mvd; ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops); mvd->vdev_asize = cvd->vdev_asize; mvd->vdev_min_asize = cvd->vdev_min_asize; mvd->vdev_max_asize = cvd->vdev_max_asize; mvd->vdev_ashift = cvd->vdev_ashift; mvd->vdev_state = cvd->vdev_state; mvd->vdev_crtxg = cvd->vdev_crtxg; vdev_remove_child(pvd, cvd); vdev_add_child(pvd, mvd); cvd->vdev_id = mvd->vdev_children; vdev_add_child(mvd, cvd); vdev_top_update(cvd->vdev_top, cvd->vdev_top); if (mvd == mvd->vdev_top) vdev_top_transfer(cvd, mvd); return (mvd); } /* * Remove a 1-way mirror/replacing vdev from the tree. */ void vdev_remove_parent(vdev_t *cvd) { vdev_t *mvd = cvd->vdev_parent; vdev_t *pvd = mvd->vdev_parent; ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); ASSERT(mvd->vdev_children == 1); ASSERT(mvd->vdev_ops == &vdev_mirror_ops || mvd->vdev_ops == &vdev_replacing_ops || mvd->vdev_ops == &vdev_spare_ops); cvd->vdev_ashift = mvd->vdev_ashift; vdev_remove_child(mvd, cvd); vdev_remove_child(pvd, mvd); /* * If cvd will replace mvd as a top-level vdev, preserve mvd's guid. * Otherwise, we could have detached an offline device, and when we * go to import the pool we'll think we have two top-level vdevs, * instead of a different version of the same top-level vdev. */ if (mvd->vdev_top == mvd) { uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid; cvd->vdev_orig_guid = cvd->vdev_guid; cvd->vdev_guid += guid_delta; cvd->vdev_guid_sum += guid_delta; } cvd->vdev_id = mvd->vdev_id; vdev_add_child(pvd, cvd); vdev_top_update(cvd->vdev_top, cvd->vdev_top); if (cvd == cvd->vdev_top) vdev_top_transfer(mvd, cvd); ASSERT(mvd->vdev_children == 0); vdev_free(mvd); } int vdev_metaslab_init(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; uint64_t m; uint64_t oldc = vd->vdev_ms_count; uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift; metaslab_t **mspp; int error; ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER)); /* * This vdev is not being allocated from yet or is a hole. */ if (vd->vdev_ms_shift == 0) return (0); ASSERT(!vd->vdev_ishole); /* * Compute the raidz-deflation ratio. Note, we hard-code * in 128k (1 << 17) because it is the "typical" blocksize. * Even though SPA_MAXBLOCKSIZE changed, this algorithm can not change, * otherwise it would inconsistently account for existing bp's. */ vd->vdev_deflate_ratio = (1 << 17) / (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT); ASSERT(oldc <= newc); mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP); if (oldc != 0) { bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp)); kmem_free(vd->vdev_ms, oldc * sizeof (*mspp)); } vd->vdev_ms = mspp; vd->vdev_ms_count = newc; for (m = oldc; m < newc; m++) { uint64_t object = 0; if (txg == 0) { error = dmu_read(mos, vd->vdev_ms_array, m * sizeof (uint64_t), sizeof (uint64_t), &object, DMU_READ_PREFETCH); if (error) return (error); } error = metaslab_init(vd->vdev_mg, m, object, txg, &(vd->vdev_ms[m])); if (error) return (error); } if (txg == 0) spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER); /* * If the vdev is being removed we don't activate * the metaslabs since we want to ensure that no new * allocations are performed on this device. */ if (oldc == 0 && !vd->vdev_removing) metaslab_group_activate(vd->vdev_mg); if (txg == 0) spa_config_exit(spa, SCL_ALLOC, FTAG); return (0); } void vdev_metaslab_fini(vdev_t *vd) { uint64_t m; uint64_t count = vd->vdev_ms_count; if (vd->vdev_ms != NULL) { metaslab_group_passivate(vd->vdev_mg); for (m = 0; m < count; m++) { metaslab_t *msp = vd->vdev_ms[m]; if (msp != NULL) metaslab_fini(msp); } kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *)); vd->vdev_ms = NULL; } } typedef struct vdev_probe_stats { boolean_t vps_readable; boolean_t vps_writeable; int vps_flags; } vdev_probe_stats_t; static void vdev_probe_done(zio_t *zio) { spa_t *spa = zio->io_spa; vdev_t *vd = zio->io_vd; vdev_probe_stats_t *vps = zio->io_private; ASSERT(vd->vdev_probe_zio != NULL); if (zio->io_type == ZIO_TYPE_READ) { if (zio->io_error == 0) vps->vps_readable = 1; if (zio->io_error == 0 && spa_writeable(spa)) { zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd, zio->io_offset, zio->io_size, zio->io_data, ZIO_CHECKSUM_OFF, vdev_probe_done, vps, ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE)); } else { zio_buf_free(zio->io_data, zio->io_size); } } else if (zio->io_type == ZIO_TYPE_WRITE) { if (zio->io_error == 0) vps->vps_writeable = 1; zio_buf_free(zio->io_data, zio->io_size); } else if (zio->io_type == ZIO_TYPE_NULL) { zio_t *pio; vd->vdev_cant_read |= !vps->vps_readable; vd->vdev_cant_write |= !vps->vps_writeable; if (vdev_readable(vd) && (vdev_writeable(vd) || !spa_writeable(spa))) { zio->io_error = 0; } else { ASSERT(zio->io_error != 0); zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE, spa, vd, NULL, 0, 0); zio->io_error = SET_ERROR(ENXIO); } mutex_enter(&vd->vdev_probe_lock); ASSERT(vd->vdev_probe_zio == zio); vd->vdev_probe_zio = NULL; mutex_exit(&vd->vdev_probe_lock); while ((pio = zio_walk_parents(zio)) != NULL) if (!vdev_accessible(vd, pio)) pio->io_error = SET_ERROR(ENXIO); kmem_free(vps, sizeof (*vps)); } } /* * Determine whether this device is accessible. * * Read and write to several known locations: the pad regions of each * vdev label but the first, which we leave alone in case it contains * a VTOC. */ zio_t * vdev_probe(vdev_t *vd, zio_t *zio) { spa_t *spa = vd->vdev_spa; vdev_probe_stats_t *vps = NULL; zio_t *pio; ASSERT(vd->vdev_ops->vdev_op_leaf); /* * Don't probe the probe. */ if (zio && (zio->io_flags & ZIO_FLAG_PROBE)) return (NULL); /* * To prevent 'probe storms' when a device fails, we create * just one probe i/o at a time. All zios that want to probe * this vdev will become parents of the probe io. */ mutex_enter(&vd->vdev_probe_lock); if ((pio = vd->vdev_probe_zio) == NULL) { vps = kmem_zalloc(sizeof (*vps), KM_SLEEP); vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE | ZIO_FLAG_TRYHARD; if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) { /* * vdev_cant_read and vdev_cant_write can only * transition from TRUE to FALSE when we have the * SCL_ZIO lock as writer; otherwise they can only * transition from FALSE to TRUE. This ensures that * any zio looking at these values can assume that * failures persist for the life of the I/O. That's * important because when a device has intermittent * connectivity problems, we want to ensure that * they're ascribed to the device (ENXIO) and not * the zio (EIO). * * Since we hold SCL_ZIO as writer here, clear both * values so the probe can reevaluate from first * principles. */ vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER; vd->vdev_cant_read = B_FALSE; vd->vdev_cant_write = B_FALSE; } vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd, vdev_probe_done, vps, vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE); /* * We can't change the vdev state in this context, so we * kick off an async task to do it on our behalf. */ if (zio != NULL) { vd->vdev_probe_wanted = B_TRUE; spa_async_request(spa, SPA_ASYNC_PROBE); } } if (zio != NULL) zio_add_child(zio, pio); mutex_exit(&vd->vdev_probe_lock); if (vps == NULL) { ASSERT(zio != NULL); return (NULL); } for (int l = 1; l < VDEV_LABELS; l++) { zio_nowait(zio_read_phys(pio, vd, vdev_label_offset(vd->vdev_psize, l, offsetof(vdev_label_t, vl_pad2)), VDEV_PAD_SIZE, zio_buf_alloc(VDEV_PAD_SIZE), ZIO_CHECKSUM_OFF, vdev_probe_done, vps, ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE)); } if (zio == NULL) return (pio); zio_nowait(pio); return (NULL); } static void vdev_open_child(void *arg) { vdev_t *vd = arg; vd->vdev_open_thread = curthread; vd->vdev_open_error = vdev_open(vd); vd->vdev_open_thread = NULL; } boolean_t vdev_uses_zvols(vdev_t *vd) { if (vd->vdev_path && strncmp(vd->vdev_path, ZVOL_DIR, strlen(ZVOL_DIR)) == 0) return (B_TRUE); for (int c = 0; c < vd->vdev_children; c++) if (vdev_uses_zvols(vd->vdev_child[c])) return (B_TRUE); return (B_FALSE); } void vdev_open_children(vdev_t *vd) { taskq_t *tq; int children = vd->vdev_children; /* * in order to handle pools on top of zvols, do the opens * in a single thread so that the same thread holds the * spa_namespace_lock */ if (vdev_uses_zvols(vd)) { for (int c = 0; c < children; c++) vd->vdev_child[c]->vdev_open_error = vdev_open(vd->vdev_child[c]); return; } tq = taskq_create("vdev_open", children, minclsyspri, children, children, TASKQ_PREPOPULATE); for (int c = 0; c < children; c++) VERIFY(taskq_dispatch(tq, vdev_open_child, vd->vdev_child[c], TQ_SLEEP) != NULL); taskq_destroy(tq); } /* * Prepare a virtual device for access. */ int vdev_open(vdev_t *vd) { spa_t *spa = vd->vdev_spa; int error; uint64_t osize = 0; uint64_t max_osize = 0; uint64_t asize, max_asize, psize; uint64_t ashift = 0; ASSERT(vd->vdev_open_thread == curthread || spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); ASSERT(vd->vdev_state == VDEV_STATE_CLOSED || vd->vdev_state == VDEV_STATE_CANT_OPEN || vd->vdev_state == VDEV_STATE_OFFLINE); vd->vdev_stat.vs_aux = VDEV_AUX_NONE; vd->vdev_cant_read = B_FALSE; vd->vdev_cant_write = B_FALSE; vd->vdev_min_asize = vdev_get_min_asize(vd); /* * If this vdev is not removed, check its fault status. If it's * faulted, bail out of the open. */ if (!vd->vdev_removed && vd->vdev_faulted) { ASSERT(vd->vdev_children == 0); ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED || vd->vdev_label_aux == VDEV_AUX_EXTERNAL); vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, vd->vdev_label_aux); return (SET_ERROR(ENXIO)); } else if (vd->vdev_offline) { ASSERT(vd->vdev_children == 0); vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE); return (SET_ERROR(ENXIO)); } error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize, &ashift); /* * Reset the vdev_reopening flag so that we actually close * the vdev on error. */ vd->vdev_reopening = B_FALSE; if (zio_injection_enabled && error == 0) error = zio_handle_device_injection(vd, NULL, ENXIO); if (error) { if (vd->vdev_removed && vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED) vd->vdev_removed = B_FALSE; vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, vd->vdev_stat.vs_aux); return (error); } vd->vdev_removed = B_FALSE; /* * Recheck the faulted flag now that we have confirmed that * the vdev is accessible. If we're faulted, bail. */ if (vd->vdev_faulted) { ASSERT(vd->vdev_children == 0); ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED || vd->vdev_label_aux == VDEV_AUX_EXTERNAL); vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, vd->vdev_label_aux); return (SET_ERROR(ENXIO)); } if (vd->vdev_degraded) { ASSERT(vd->vdev_children == 0); vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, VDEV_AUX_ERR_EXCEEDED); } else { vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0); } /* * For hole or missing vdevs we just return success. */ if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) return (0); for (int c = 0; c < vd->vdev_children; c++) { if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) { vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); break; } } osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t)); max_osize = P2ALIGN(max_osize, (uint64_t)sizeof (vdev_label_t)); if (vd->vdev_children == 0) { if (osize < SPA_MINDEVSIZE) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_TOO_SMALL); return (SET_ERROR(EOVERFLOW)); } psize = osize; asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE); max_asize = max_osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE); } else { if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_TOO_SMALL); return (SET_ERROR(EOVERFLOW)); } psize = 0; asize = osize; max_asize = max_osize; } vd->vdev_psize = psize; /* * Make sure the allocatable size hasn't shrunk. */ if (asize < vd->vdev_min_asize) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_LABEL); return (SET_ERROR(EINVAL)); } if (vd->vdev_asize == 0) { /* * This is the first-ever open, so use the computed values. * For testing purposes, a higher ashift can be requested. */ vd->vdev_asize = asize; vd->vdev_max_asize = max_asize; vd->vdev_ashift = MAX(ashift, vd->vdev_ashift); } else { /* * Detect if the alignment requirement has increased. * We don't want to make the pool unavailable, just * issue a warning instead. */ if (ashift > vd->vdev_top->vdev_ashift && vd->vdev_ops->vdev_op_leaf) { cmn_err(CE_WARN, "Disk, '%s', has a block alignment that is " "larger than the pool's alignment\n", vd->vdev_path); } vd->vdev_max_asize = max_asize; } /* * If all children are healthy and the asize has increased, * then we've experienced dynamic LUN growth. If automatic * expansion is enabled then use the additional space. */ if (vd->vdev_state == VDEV_STATE_HEALTHY && asize > vd->vdev_asize && (vd->vdev_expanding || spa->spa_autoexpand)) vd->vdev_asize = asize; vdev_set_min_asize(vd); /* * Ensure we can issue some IO before declaring the * vdev open for business. */ if (vd->vdev_ops->vdev_op_leaf && (error = zio_wait(vdev_probe(vd, NULL))) != 0) { vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, VDEV_AUX_ERR_EXCEEDED); return (error); } /* * Track the min and max ashift values for normal data devices. */ if (vd->vdev_top == vd && vd->vdev_ashift != 0 && !vd->vdev_islog && vd->vdev_aux == NULL) { if (vd->vdev_ashift > spa->spa_max_ashift) spa->spa_max_ashift = vd->vdev_ashift; if (vd->vdev_ashift < spa->spa_min_ashift) spa->spa_min_ashift = vd->vdev_ashift; } /* * If a leaf vdev has a DTL, and seems healthy, then kick off a * resilver. But don't do this if we are doing a reopen for a scrub, * since this would just restart the scrub we are already doing. */ if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen && vdev_resilver_needed(vd, NULL, NULL)) spa_async_request(spa, SPA_ASYNC_RESILVER); return (0); } /* * Called once the vdevs are all opened, this routine validates the label * contents. This needs to be done before vdev_load() so that we don't * inadvertently do repair I/Os to the wrong device. * * If 'strict' is false ignore the spa guid check. This is necessary because * if the machine crashed during a re-guid the new guid might have been written * to all of the vdev labels, but not the cached config. The strict check * will be performed when the pool is opened again using the mos config. * * This function will only return failure if one of the vdevs indicates that it * has since been destroyed or exported. This is only possible if * /etc/zfs/zpool.cache was readonly at the time. Otherwise, the vdev state * will be updated but the function will return 0. */ int vdev_validate(vdev_t *vd, boolean_t strict) { spa_t *spa = vd->vdev_spa; nvlist_t *label; uint64_t guid = 0, top_guid; uint64_t state; for (int c = 0; c < vd->vdev_children; c++) if (vdev_validate(vd->vdev_child[c], strict) != 0) return (SET_ERROR(EBADF)); /* * If the device has already failed, or was marked offline, don't do * any further validation. Otherwise, label I/O will fail and we will * overwrite the previous state. */ if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) { uint64_t aux_guid = 0; nvlist_t *nvl; uint64_t txg = spa_last_synced_txg(spa) != 0 ? spa_last_synced_txg(spa) : -1ULL; if ((label = vdev_label_read_config(vd, txg)) == NULL) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_LABEL); return (0); } /* * Determine if this vdev has been split off into another * pool. If so, then refuse to open it. */ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID, &aux_guid) == 0 && aux_guid == spa_guid(spa)) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_SPLIT_POOL); nvlist_free(label); return (0); } if (strict && (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &guid) != 0 || guid != spa_guid(spa))) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); return (0); } if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID, &aux_guid) != 0) aux_guid = 0; /* * If this vdev just became a top-level vdev because its * sibling was detached, it will have adopted the parent's * vdev guid -- but the label may or may not be on disk yet. * Fortunately, either version of the label will have the * same top guid, so if we're a top-level vdev, we can * safely compare to that instead. * * If we split this vdev off instead, then we also check the * original pool's guid. We don't want to consider the vdev * corrupt if it is partway through a split operation. */ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 || nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID, &top_guid) != 0 || ((vd->vdev_guid != guid && vd->vdev_guid != aux_guid) && (vd->vdev_guid != top_guid || vd != vd->vdev_top))) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); return (0); } if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); return (0); } nvlist_free(label); /* * If this is a verbatim import, no need to check the * state of the pool. */ if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) && spa_load_state(spa) == SPA_LOAD_OPEN && state != POOL_STATE_ACTIVE) return (SET_ERROR(EBADF)); /* * If we were able to open and validate a vdev that was * previously marked permanently unavailable, clear that state * now. */ if (vd->vdev_not_present) vd->vdev_not_present = 0; } return (0); } /* * Close a virtual device. */ void vdev_close(vdev_t *vd) { spa_t *spa = vd->vdev_spa; vdev_t *pvd = vd->vdev_parent; ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); /* * If our parent is reopening, then we are as well, unless we are * going offline. */ if (pvd != NULL && pvd->vdev_reopening) vd->vdev_reopening = (pvd->vdev_reopening && !vd->vdev_offline); vd->vdev_ops->vdev_op_close(vd); vdev_cache_purge(vd); /* * We record the previous state before we close it, so that if we are * doing a reopen(), we don't generate FMA ereports if we notice that * it's still faulted. */ vd->vdev_prevstate = vd->vdev_state; if (vd->vdev_offline) vd->vdev_state = VDEV_STATE_OFFLINE; else vd->vdev_state = VDEV_STATE_CLOSED; vd->vdev_stat.vs_aux = VDEV_AUX_NONE; } void vdev_hold(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_is_root(spa)); if (spa->spa_state == POOL_STATE_UNINITIALIZED) return; for (int c = 0; c < vd->vdev_children; c++) vdev_hold(vd->vdev_child[c]); if (vd->vdev_ops->vdev_op_leaf) vd->vdev_ops->vdev_op_hold(vd); } void vdev_rele(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_is_root(spa)); for (int c = 0; c < vd->vdev_children; c++) vdev_rele(vd->vdev_child[c]); if (vd->vdev_ops->vdev_op_leaf) vd->vdev_ops->vdev_op_rele(vd); } /* * Reopen all interior vdevs and any unopened leaves. We don't actually * reopen leaf vdevs which had previously been opened as they might deadlock * on the spa_config_lock. Instead we only obtain the leaf's physical size. * If the leaf has never been opened then open it, as usual. */ void vdev_reopen(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); /* set the reopening flag unless we're taking the vdev offline */ vd->vdev_reopening = !vd->vdev_offline; vdev_close(vd); (void) vdev_open(vd); /* * Call vdev_validate() here to make sure we have the same device. * Otherwise, a device with an invalid label could be successfully * opened in response to vdev_reopen(). */ if (vd->vdev_aux) { (void) vdev_validate_aux(vd); if (vdev_readable(vd) && vdev_writeable(vd) && vd->vdev_aux == &spa->spa_l2cache && !l2arc_vdev_present(vd)) l2arc_add_vdev(spa, vd); } else { (void) vdev_validate(vd, B_TRUE); } /* * Reassess parent vdev's health. */ vdev_propagate_state(vd); } int vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing) { int error; /* * Normally, partial opens (e.g. of a mirror) are allowed. * For a create, however, we want to fail the request if * there are any components we can't open. */ error = vdev_open(vd); if (error || vd->vdev_state != VDEV_STATE_HEALTHY) { vdev_close(vd); return (error ? error : ENXIO); } /* * Recursively load DTLs and initialize all labels. */ if ((error = vdev_dtl_load(vd)) != 0 || (error = vdev_label_init(vd, txg, isreplacing ? VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) { vdev_close(vd); return (error); } return (0); } void vdev_metaslab_set_size(vdev_t *vd) { /* * Aim for roughly metaslabs_per_vdev (default 200) metaslabs per vdev. */ vd->vdev_ms_shift = highbit64(vd->vdev_asize / metaslabs_per_vdev); vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT); } void vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg) { ASSERT(vd == vd->vdev_top); ASSERT(!vd->vdev_ishole); ASSERT(ISP2(flags)); ASSERT(spa_writeable(vd->vdev_spa)); if (flags & VDD_METASLAB) (void) txg_list_add(&vd->vdev_ms_list, arg, txg); if (flags & VDD_DTL) (void) txg_list_add(&vd->vdev_dtl_list, arg, txg); (void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg); } void vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg) { for (int c = 0; c < vd->vdev_children; c++) vdev_dirty_leaves(vd->vdev_child[c], flags, txg); if (vd->vdev_ops->vdev_op_leaf) vdev_dirty(vd->vdev_top, flags, vd, txg); } /* * DTLs. * * A vdev's DTL (dirty time log) is the set of transaction groups for which * the vdev has less than perfect replication. There are four kinds of DTL: * * DTL_MISSING: txgs for which the vdev has no valid copies of the data * * DTL_PARTIAL: txgs for which data is available, but not fully replicated * * DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon * scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of * txgs that was scrubbed. * * DTL_OUTAGE: txgs which cannot currently be read, whether due to * persistent errors or just some device being offline. * Unlike the other three, the DTL_OUTAGE map is not generally * maintained; it's only computed when needed, typically to * determine whether a device can be detached. * * For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device * either has the data or it doesn't. * * For interior vdevs such as mirror and RAID-Z the picture is more complex. * A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because * if any child is less than fully replicated, then so is its parent. * A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs, * comprising only those txgs which appear in 'maxfaults' or more children; * those are the txgs we don't have enough replication to read. For example, * double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2); * thus, its DTL_MISSING consists of the set of txgs that appear in more than * two child DTL_MISSING maps. * * It should be clear from the above that to compute the DTLs and outage maps * for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps. * Therefore, that is all we keep on disk. When loading the pool, or after * a configuration change, we generate all other DTLs from first principles. */ void vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) { range_tree_t *rt = vd->vdev_dtl[t]; ASSERT(t < DTL_TYPES); ASSERT(vd != vd->vdev_spa->spa_root_vdev); ASSERT(spa_writeable(vd->vdev_spa)); mutex_enter(rt->rt_lock); if (!range_tree_contains(rt, txg, size)) range_tree_add(rt, txg, size); mutex_exit(rt->rt_lock); } boolean_t vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) { range_tree_t *rt = vd->vdev_dtl[t]; boolean_t dirty = B_FALSE; ASSERT(t < DTL_TYPES); ASSERT(vd != vd->vdev_spa->spa_root_vdev); mutex_enter(rt->rt_lock); if (range_tree_space(rt) != 0) dirty = range_tree_contains(rt, txg, size); mutex_exit(rt->rt_lock); return (dirty); } boolean_t vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t) { range_tree_t *rt = vd->vdev_dtl[t]; boolean_t empty; mutex_enter(rt->rt_lock); empty = (range_tree_space(rt) == 0); mutex_exit(rt->rt_lock); return (empty); } /* * Returns the lowest txg in the DTL range. */ static uint64_t vdev_dtl_min(vdev_t *vd) { range_seg_t *rs; ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock)); ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0); ASSERT0(vd->vdev_children); rs = avl_first(&vd->vdev_dtl[DTL_MISSING]->rt_root); return (rs->rs_start - 1); } /* * Returns the highest txg in the DTL. */ static uint64_t vdev_dtl_max(vdev_t *vd) { range_seg_t *rs; ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock)); ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0); ASSERT0(vd->vdev_children); rs = avl_last(&vd->vdev_dtl[DTL_MISSING]->rt_root); return (rs->rs_end); } /* * Determine if a resilvering vdev should remove any DTL entries from * its range. If the vdev was resilvering for the entire duration of the * scan then it should excise that range from its DTLs. Otherwise, this * vdev is considered partially resilvered and should leave its DTL * entries intact. The comment in vdev_dtl_reassess() describes how we * excise the DTLs. */ static boolean_t vdev_dtl_should_excise(vdev_t *vd) { spa_t *spa = vd->vdev_spa; dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; ASSERT0(scn->scn_phys.scn_errors); ASSERT0(vd->vdev_children); if (vd->vdev_resilver_txg == 0 || range_tree_space(vd->vdev_dtl[DTL_MISSING]) == 0) return (B_TRUE); /* * When a resilver is initiated the scan will assign the scn_max_txg * value to the highest txg value that exists in all DTLs. If this * device's max DTL is not part of this scan (i.e. it is not in * the range (scn_min_txg, scn_max_txg] then it is not eligible * for excision. */ if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) { ASSERT3U(scn->scn_phys.scn_min_txg, <=, vdev_dtl_min(vd)); ASSERT3U(scn->scn_phys.scn_min_txg, <, vd->vdev_resilver_txg); ASSERT3U(vd->vdev_resilver_txg, <=, scn->scn_phys.scn_max_txg); return (B_TRUE); } return (B_FALSE); } /* * Reassess DTLs after a config change or scrub completion. */ void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) { spa_t *spa = vd->vdev_spa; avl_tree_t reftree; int minref; ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); for (int c = 0; c < vd->vdev_children; c++) vdev_dtl_reassess(vd->vdev_child[c], txg, scrub_txg, scrub_done); if (vd == spa->spa_root_vdev || vd->vdev_ishole || vd->vdev_aux) return; if (vd->vdev_ops->vdev_op_leaf) { dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; mutex_enter(&vd->vdev_dtl_lock); /* * If we've completed a scan cleanly then determine * if this vdev should remove any DTLs. We only want to * excise regions on vdevs that were available during * the entire duration of this scan. */ if (scrub_txg != 0 && (spa->spa_scrub_started || (scn != NULL && scn->scn_phys.scn_errors == 0)) && vdev_dtl_should_excise(vd)) { /* * We completed a scrub up to scrub_txg. If we * did it without rebooting, then the scrub dtl * will be valid, so excise the old region and * fold in the scrub dtl. Otherwise, leave the * dtl as-is if there was an error. * * There's little trick here: to excise the beginning * of the DTL_MISSING map, we put it into a reference * tree and then add a segment with refcnt -1 that * covers the range [0, scrub_txg). This means * that each txg in that range has refcnt -1 or 0. * We then add DTL_SCRUB with a refcnt of 2, so that * entries in the range [0, scrub_txg) will have a * positive refcnt -- either 1 or 2. We then convert * the reference tree into the new DTL_MISSING map. */ space_reftree_create(&reftree); space_reftree_add_map(&reftree, vd->vdev_dtl[DTL_MISSING], 1); space_reftree_add_seg(&reftree, 0, scrub_txg, -1); space_reftree_add_map(&reftree, vd->vdev_dtl[DTL_SCRUB], 2); space_reftree_generate_map(&reftree, vd->vdev_dtl[DTL_MISSING], 1); space_reftree_destroy(&reftree); } range_tree_vacate(vd->vdev_dtl[DTL_PARTIAL], NULL, NULL); range_tree_walk(vd->vdev_dtl[DTL_MISSING], range_tree_add, vd->vdev_dtl[DTL_PARTIAL]); if (scrub_done) range_tree_vacate(vd->vdev_dtl[DTL_SCRUB], NULL, NULL); range_tree_vacate(vd->vdev_dtl[DTL_OUTAGE], NULL, NULL); if (!vdev_readable(vd)) range_tree_add(vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL); else range_tree_walk(vd->vdev_dtl[DTL_MISSING], range_tree_add, vd->vdev_dtl[DTL_OUTAGE]); /* * If the vdev was resilvering and no longer has any * DTLs then reset its resilvering flag. */ if (vd->vdev_resilver_txg != 0 && range_tree_space(vd->vdev_dtl[DTL_MISSING]) == 0 && range_tree_space(vd->vdev_dtl[DTL_OUTAGE]) == 0) vd->vdev_resilver_txg = 0; mutex_exit(&vd->vdev_dtl_lock); if (txg != 0) vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg); return; } mutex_enter(&vd->vdev_dtl_lock); for (int t = 0; t < DTL_TYPES; t++) { /* account for child's outage in parent's missing map */ int s = (t == DTL_MISSING) ? DTL_OUTAGE: t; if (t == DTL_SCRUB) continue; /* leaf vdevs only */ if (t == DTL_PARTIAL) minref = 1; /* i.e. non-zero */ else if (vd->vdev_nparity != 0) minref = vd->vdev_nparity + 1; /* RAID-Z */ else minref = vd->vdev_children; /* any kind of mirror */ space_reftree_create(&reftree); for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; mutex_enter(&cvd->vdev_dtl_lock); space_reftree_add_map(&reftree, cvd->vdev_dtl[s], 1); mutex_exit(&cvd->vdev_dtl_lock); } space_reftree_generate_map(&reftree, vd->vdev_dtl[t], minref); space_reftree_destroy(&reftree); } mutex_exit(&vd->vdev_dtl_lock); } int vdev_dtl_load(vdev_t *vd) { spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; int error = 0; if (vd->vdev_ops->vdev_op_leaf && vd->vdev_dtl_object != 0) { ASSERT(!vd->vdev_ishole); error = space_map_open(&vd->vdev_dtl_sm, mos, vd->vdev_dtl_object, 0, -1ULL, 0, &vd->vdev_dtl_lock); if (error) return (error); ASSERT(vd->vdev_dtl_sm != NULL); mutex_enter(&vd->vdev_dtl_lock); /* * Now that we've opened the space_map we need to update * the in-core DTL. */ space_map_update(vd->vdev_dtl_sm); error = space_map_load(vd->vdev_dtl_sm, vd->vdev_dtl[DTL_MISSING], SM_ALLOC); mutex_exit(&vd->vdev_dtl_lock); return (error); } for (int c = 0; c < vd->vdev_children; c++) { error = vdev_dtl_load(vd->vdev_child[c]); if (error != 0) break; } return (error); } void vdev_dtl_sync(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; range_tree_t *rt = vd->vdev_dtl[DTL_MISSING]; objset_t *mos = spa->spa_meta_objset; range_tree_t *rtsync; kmutex_t rtlock; dmu_tx_t *tx; uint64_t object = space_map_object(vd->vdev_dtl_sm); ASSERT(!vd->vdev_ishole); ASSERT(vd->vdev_ops->vdev_op_leaf); tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); if (vd->vdev_detached || vd->vdev_top->vdev_removing) { mutex_enter(&vd->vdev_dtl_lock); space_map_free(vd->vdev_dtl_sm, tx); space_map_close(vd->vdev_dtl_sm); vd->vdev_dtl_sm = NULL; mutex_exit(&vd->vdev_dtl_lock); dmu_tx_commit(tx); return; } if (vd->vdev_dtl_sm == NULL) { uint64_t new_object; new_object = space_map_alloc(mos, tx); VERIFY3U(new_object, !=, 0); VERIFY0(space_map_open(&vd->vdev_dtl_sm, mos, new_object, 0, -1ULL, 0, &vd->vdev_dtl_lock)); ASSERT(vd->vdev_dtl_sm != NULL); } mutex_init(&rtlock, NULL, MUTEX_DEFAULT, NULL); rtsync = range_tree_create(NULL, NULL, &rtlock); mutex_enter(&rtlock); mutex_enter(&vd->vdev_dtl_lock); range_tree_walk(rt, range_tree_add, rtsync); mutex_exit(&vd->vdev_dtl_lock); space_map_truncate(vd->vdev_dtl_sm, tx); space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, tx); range_tree_vacate(rtsync, NULL, NULL); range_tree_destroy(rtsync); mutex_exit(&rtlock); mutex_destroy(&rtlock); /* * If the object for the space map has changed then dirty * the top level so that we update the config. */ if (object != space_map_object(vd->vdev_dtl_sm)) { zfs_dbgmsg("txg %llu, spa %s, DTL old object %llu, " "new object %llu", txg, spa_name(spa), object, space_map_object(vd->vdev_dtl_sm)); vdev_config_dirty(vd->vdev_top); } dmu_tx_commit(tx); mutex_enter(&vd->vdev_dtl_lock); space_map_update(vd->vdev_dtl_sm); mutex_exit(&vd->vdev_dtl_lock); } /* * Determine whether the specified vdev can be offlined/detached/removed * without losing data. */ boolean_t vdev_dtl_required(vdev_t *vd) { spa_t *spa = vd->vdev_spa; vdev_t *tvd = vd->vdev_top; uint8_t cant_read = vd->vdev_cant_read; boolean_t required; ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); if (vd == spa->spa_root_vdev || vd == tvd) return (B_TRUE); /* * Temporarily mark the device as unreadable, and then determine * whether this results in any DTL outages in the top-level vdev. * If not, we can safely offline/detach/remove the device. */ vd->vdev_cant_read = B_TRUE; vdev_dtl_reassess(tvd, 0, 0, B_FALSE); required = !vdev_dtl_empty(tvd, DTL_OUTAGE); vd->vdev_cant_read = cant_read; vdev_dtl_reassess(tvd, 0, 0, B_FALSE); if (!required && zio_injection_enabled) required = !!zio_handle_device_injection(vd, NULL, ECHILD); return (required); } /* * Determine if resilver is needed, and if so the txg range. */ boolean_t vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp) { boolean_t needed = B_FALSE; uint64_t thismin = UINT64_MAX; uint64_t thismax = 0; if (vd->vdev_children == 0) { mutex_enter(&vd->vdev_dtl_lock); if (range_tree_space(vd->vdev_dtl[DTL_MISSING]) != 0 && vdev_writeable(vd)) { thismin = vdev_dtl_min(vd); thismax = vdev_dtl_max(vd); needed = B_TRUE; } mutex_exit(&vd->vdev_dtl_lock); } else { for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; uint64_t cmin, cmax; if (vdev_resilver_needed(cvd, &cmin, &cmax)) { thismin = MIN(thismin, cmin); thismax = MAX(thismax, cmax); needed = B_TRUE; } } } if (needed && minp) { *minp = thismin; *maxp = thismax; } return (needed); } void vdev_load(vdev_t *vd) { /* * Recursively load all children. */ for (int c = 0; c < vd->vdev_children; c++) vdev_load(vd->vdev_child[c]); /* * If this is a top-level vdev, initialize its metaslabs. */ if (vd == vd->vdev_top && !vd->vdev_ishole && (vd->vdev_ashift == 0 || vd->vdev_asize == 0 || vdev_metaslab_init(vd, 0) != 0)) vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); /* * If this is a leaf vdev, load its DTL. */ if (vd->vdev_ops->vdev_op_leaf && vdev_dtl_load(vd) != 0) vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); } /* * The special vdev case is used for hot spares and l2cache devices. Its * sole purpose it to set the vdev state for the associated vdev. To do this, * we make sure that we can open the underlying device, then try to read the * label, and make sure that the label is sane and that it hasn't been * repurposed to another pool. */ int vdev_validate_aux(vdev_t *vd) { nvlist_t *label; uint64_t guid, version; uint64_t state; if (!vdev_readable(vd)) return (0); if ((label = vdev_label_read_config(vd, -1ULL)) == NULL) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); return (-1); } if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 || !SPA_VERSION_IS_SUPPORTED(version) || nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 || guid != vd->vdev_guid || nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); return (-1); } /* * We don't actually check the pool state here. If it's in fact in * use by another pool, we update this fact on the fly when requested. */ nvlist_free(label); return (0); } void vdev_remove(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; dmu_tx_t *tx; tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); if (vd->vdev_ms != NULL) { metaslab_group_t *mg = vd->vdev_mg; metaslab_group_histogram_verify(mg); metaslab_class_histogram_verify(mg->mg_class); for (int m = 0; m < vd->vdev_ms_count; m++) { metaslab_t *msp = vd->vdev_ms[m]; if (msp == NULL || msp->ms_sm == NULL) continue; mutex_enter(&msp->ms_lock); /* * If the metaslab was not loaded when the vdev * was removed then the histogram accounting may * not be accurate. Update the histogram information * here so that we ensure that the metaslab group * and metaslab class are up-to-date. */ metaslab_group_histogram_remove(mg, msp); VERIFY0(space_map_allocated(msp->ms_sm)); space_map_free(msp->ms_sm, tx); space_map_close(msp->ms_sm); msp->ms_sm = NULL; mutex_exit(&msp->ms_lock); } metaslab_group_histogram_verify(mg); metaslab_class_histogram_verify(mg->mg_class); for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) ASSERT0(mg->mg_histogram[i]); } if (vd->vdev_ms_array) { (void) dmu_object_free(mos, vd->vdev_ms_array, tx); vd->vdev_ms_array = 0; } dmu_tx_commit(tx); } void vdev_sync_done(vdev_t *vd, uint64_t txg) { metaslab_t *msp; boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg)); ASSERT(!vd->vdev_ishole); while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg))) metaslab_sync_done(msp, txg); if (reassess) metaslab_sync_reassess(vd->vdev_mg); } void vdev_sync(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; vdev_t *lvd; metaslab_t *msp; dmu_tx_t *tx; ASSERT(!vd->vdev_ishole); if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0) { ASSERT(vd == vd->vdev_top); tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset, DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx); ASSERT(vd->vdev_ms_array != 0); vdev_config_dirty(vd); dmu_tx_commit(tx); } /* * Remove the metadata associated with this vdev once it's empty. */ if (vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing) vdev_remove(vd, txg); while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) { metaslab_sync(msp, txg); (void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg)); } while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL) vdev_dtl_sync(lvd, txg); (void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg)); } uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize) { return (vd->vdev_ops->vdev_op_asize(vd, psize)); } /* * Mark the given vdev faulted. A faulted vdev behaves as if the device could * not be opened, and no I/O is attempted. */ int vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux) { vdev_t *vd, *tvd; spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, ENODEV)); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); tvd = vd->vdev_top; /* * We don't directly use the aux state here, but if we do a * vdev_reopen(), we need this value to be present to remember why we * were faulted. */ vd->vdev_label_aux = aux; /* * Faulted state takes precedence over degraded. */ vd->vdev_delayed_close = B_FALSE; vd->vdev_faulted = 1ULL; vd->vdev_degraded = 0ULL; vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux); /* * If this device has the only valid copy of the data, then * back off and simply mark the vdev as degraded instead. */ if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) { vd->vdev_degraded = 1ULL; vd->vdev_faulted = 0ULL; /* * If we reopen the device and it's not dead, only then do we * mark it degraded. */ vdev_reopen(tvd); if (vdev_readable(vd)) vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux); } return (spa_vdev_state_exit(spa, vd, 0)); } /* * Mark the given vdev degraded. A degraded vdev is purely an indication to the * user that something is wrong. The vdev continues to operate as normal as far * as I/O is concerned. */ int vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux) { vdev_t *vd; spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, ENODEV)); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); /* * If the vdev is already faulted, then don't do anything. */ if (vd->vdev_faulted || vd->vdev_degraded) return (spa_vdev_state_exit(spa, NULL, 0)); vd->vdev_degraded = 1ULL; if (!vdev_is_dead(vd)) vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux); return (spa_vdev_state_exit(spa, vd, 0)); } /* * Online the given vdev. * * If 'ZFS_ONLINE_UNSPARE' is set, it implies two things. First, any attached * spare device should be detached when the device finishes resilvering. * Second, the online should be treated like a 'test' online case, so no FMA * events are generated if the device fails to open. */ int vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) { vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev; boolean_t postevent = B_FALSE; spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, ENODEV)); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); postevent = (vd->vdev_offline == B_TRUE || vd->vdev_tmpoffline == B_TRUE) ? B_TRUE : B_FALSE; tvd = vd->vdev_top; vd->vdev_offline = B_FALSE; vd->vdev_tmpoffline = B_FALSE; vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE); vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT); /* XXX - L2ARC 1.0 does not support expansion */ if (!vd->vdev_aux) { for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) pvd->vdev_expanding = !!(flags & ZFS_ONLINE_EXPAND); } vdev_reopen(tvd); vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE; if (!vd->vdev_aux) { for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) pvd->vdev_expanding = B_FALSE; } if (newstate) *newstate = vd->vdev_state; if ((flags & ZFS_ONLINE_UNSPARE) && !vdev_is_dead(vd) && vd->vdev_parent && vd->vdev_parent->vdev_ops == &vdev_spare_ops && vd->vdev_parent->vdev_child[0] == vd) vd->vdev_unspare = B_TRUE; if ((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand) { /* XXX - L2ARC 1.0 does not support expansion */ if (vd->vdev_aux) return (spa_vdev_state_exit(spa, vd, ENOTSUP)); spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); } if (postevent) spa_event_notify(spa, vd, ESC_ZFS_VDEV_ONLINE); return (spa_vdev_state_exit(spa, vd, 0)); } static int vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags) { vdev_t *vd, *tvd; int error = 0; uint64_t generation; metaslab_group_t *mg; top: spa_vdev_state_enter(spa, SCL_ALLOC); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, ENODEV)); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); tvd = vd->vdev_top; mg = tvd->vdev_mg; generation = spa->spa_config_generation + 1; /* * If the device isn't already offline, try to offline it. */ if (!vd->vdev_offline) { /* * If this device has the only valid copy of some data, * don't allow it to be offlined. Log devices are always * expendable. */ if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) return (spa_vdev_state_exit(spa, NULL, EBUSY)); /* * If the top-level is a slog and it has had allocations * then proceed. We check that the vdev's metaslab group * is not NULL since it's possible that we may have just * added this vdev but not yet initialized its metaslabs. */ if (tvd->vdev_islog && mg != NULL) { /* * Prevent any future allocations. */ metaslab_group_passivate(mg); (void) spa_vdev_state_exit(spa, vd, 0); error = spa_offline_log(spa); spa_vdev_state_enter(spa, SCL_ALLOC); /* * Check to see if the config has changed. */ if (error || generation != spa->spa_config_generation) { metaslab_group_activate(mg); if (error) return (spa_vdev_state_exit(spa, vd, error)); (void) spa_vdev_state_exit(spa, vd, 0); goto top; } ASSERT0(tvd->vdev_stat.vs_alloc); } /* * Offline this device and reopen its top-level vdev. * If the top-level vdev is a log device then just offline * it. Otherwise, if this action results in the top-level * vdev becoming unusable, undo it and fail the request. */ vd->vdev_offline = B_TRUE; vdev_reopen(tvd); if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_is_dead(tvd)) { vd->vdev_offline = B_FALSE; vdev_reopen(tvd); return (spa_vdev_state_exit(spa, NULL, EBUSY)); } /* * Add the device back into the metaslab rotor so that * once we online the device it's open for business. */ if (tvd->vdev_islog && mg != NULL) metaslab_group_activate(mg); } vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY); return (spa_vdev_state_exit(spa, vd, 0)); } int vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) { int error; mutex_enter(&spa->spa_vdev_top_lock); error = vdev_offline_locked(spa, guid, flags); mutex_exit(&spa->spa_vdev_top_lock); return (error); } /* * Clear the error counts associated with this vdev. Unlike vdev_online() and * vdev_offline(), we assume the spa config is locked. We also clear all * children. If 'vd' is NULL, then the user wants to clear all vdevs. */ void vdev_clear(spa_t *spa, vdev_t *vd) { vdev_t *rvd = spa->spa_root_vdev; ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); if (vd == NULL) vd = rvd; vd->vdev_stat.vs_read_errors = 0; vd->vdev_stat.vs_write_errors = 0; vd->vdev_stat.vs_checksum_errors = 0; for (int c = 0; c < vd->vdev_children; c++) vdev_clear(spa, vd->vdev_child[c]); /* * If we're in the FAULTED state or have experienced failed I/O, then * clear the persistent state and attempt to reopen the device. We * also mark the vdev config dirty, so that the new faulted state is * written out to disk. */ if (vd->vdev_faulted || vd->vdev_degraded || !vdev_readable(vd) || !vdev_writeable(vd)) { /* * When reopening in reponse to a clear event, it may be due to * a fmadm repair request. In this case, if the device is * still broken, we want to still post the ereport again. */ vd->vdev_forcefault = B_TRUE; vd->vdev_faulted = vd->vdev_degraded = 0ULL; vd->vdev_cant_read = B_FALSE; vd->vdev_cant_write = B_FALSE; vdev_reopen(vd == rvd ? rvd : vd->vdev_top); vd->vdev_forcefault = B_FALSE; if (vd != rvd && vdev_writeable(vd->vdev_top)) vdev_state_dirty(vd->vdev_top); if (vd->vdev_aux == NULL && !vdev_is_dead(vd)) spa_async_request(spa, SPA_ASYNC_RESILVER); spa_event_notify(spa, vd, ESC_ZFS_VDEV_CLEAR); } /* * When clearing a FMA-diagnosed fault, we always want to * unspare the device, as we assume that the original spare was * done in response to the FMA fault. */ if (!vdev_is_dead(vd) && vd->vdev_parent != NULL && vd->vdev_parent->vdev_ops == &vdev_spare_ops && vd->vdev_parent->vdev_child[0] == vd) vd->vdev_unspare = B_TRUE; } boolean_t vdev_is_dead(vdev_t *vd) { /* * Holes and missing devices are always considered "dead". * This simplifies the code since we don't have to check for * these types of devices in the various code paths. * Instead we rely on the fact that we skip over dead devices * before issuing I/O to them. */ return (vd->vdev_state < VDEV_STATE_DEGRADED || vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops); } boolean_t vdev_readable(vdev_t *vd) { return (!vdev_is_dead(vd) && !vd->vdev_cant_read); } boolean_t vdev_writeable(vdev_t *vd) { return (!vdev_is_dead(vd) && !vd->vdev_cant_write); } boolean_t vdev_allocatable(vdev_t *vd) { uint64_t state = vd->vdev_state; /* * We currently allow allocations from vdevs which may be in the * process of reopening (i.e. VDEV_STATE_CLOSED). If the device * fails to reopen then we'll catch it later when we're holding * the proper locks. Note that we have to get the vdev state * in a local variable because although it changes atomically, * we're asking two separate questions about it. */ return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) && !vd->vdev_cant_write && !vd->vdev_ishole); } boolean_t vdev_accessible(vdev_t *vd, zio_t *zio) { ASSERT(zio->io_vd == vd); if (vdev_is_dead(vd) || vd->vdev_remove_wanted) return (B_FALSE); if (zio->io_type == ZIO_TYPE_READ) return (!vd->vdev_cant_read); if (zio->io_type == ZIO_TYPE_WRITE) return (!vd->vdev_cant_write); return (B_TRUE); } /* * Get statistics for the given vdev. */ void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs) { spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); mutex_enter(&vd->vdev_stat_lock); bcopy(&vd->vdev_stat, vs, sizeof (*vs)); vs->vs_timestamp = gethrtime() - vs->vs_timestamp; vs->vs_state = vd->vdev_state; vs->vs_rsize = vdev_get_min_asize(vd); if (vd->vdev_ops->vdev_op_leaf) vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; vs->vs_esize = vd->vdev_max_asize - vd->vdev_asize; if (vd->vdev_aux == NULL && vd == vd->vdev_top && !vd->vdev_ishole) { vs->vs_fragmentation = vd->vdev_mg->mg_fragmentation; } /* * If we're getting stats on the root vdev, aggregate the I/O counts * over all top-level vdevs (i.e. the direct children of the root). */ if (vd == rvd) { for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *cvd = rvd->vdev_child[c]; vdev_stat_t *cvs = &cvd->vdev_stat; for (int t = 0; t < ZIO_TYPES; t++) { vs->vs_ops[t] += cvs->vs_ops[t]; vs->vs_bytes[t] += cvs->vs_bytes[t]; } cvs->vs_scan_removing = cvd->vdev_removing; } } mutex_exit(&vd->vdev_stat_lock); } void vdev_clear_stats(vdev_t *vd) { mutex_enter(&vd->vdev_stat_lock); vd->vdev_stat.vs_space = 0; vd->vdev_stat.vs_dspace = 0; vd->vdev_stat.vs_alloc = 0; mutex_exit(&vd->vdev_stat_lock); } void vdev_scan_stat_init(vdev_t *vd) { vdev_stat_t *vs = &vd->vdev_stat; for (int c = 0; c < vd->vdev_children; c++) vdev_scan_stat_init(vd->vdev_child[c]); mutex_enter(&vd->vdev_stat_lock); vs->vs_scan_processed = 0; mutex_exit(&vd->vdev_stat_lock); } void vdev_stat_update(zio_t *zio, uint64_t psize) { spa_t *spa = zio->io_spa; vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd = zio->io_vd ? zio->io_vd : rvd; vdev_t *pvd; uint64_t txg = zio->io_txg; vdev_stat_t *vs = &vd->vdev_stat; zio_type_t type = zio->io_type; int flags = zio->io_flags; /* * If this i/o is a gang leader, it didn't do any actual work. */ if (zio->io_gang_tree) return; if (zio->io_error == 0) { /* * If this is a root i/o, don't count it -- we've already * counted the top-level vdevs, and vdev_get_stats() will * aggregate them when asked. This reduces contention on * the root vdev_stat_lock and implicitly handles blocks * that compress away to holes, for which there is no i/o. * (Holes never create vdev children, so all the counters * remain zero, which is what we want.) * * Note: this only applies to successful i/o (io_error == 0) * because unlike i/o counts, errors are not additive. * When reading a ditto block, for example, failure of * one top-level vdev does not imply a root-level error. */ if (vd == rvd) return; ASSERT(vd == zio->io_vd); if (flags & ZIO_FLAG_IO_BYPASS) return; mutex_enter(&vd->vdev_stat_lock); if (flags & ZIO_FLAG_IO_REPAIR) { if (flags & ZIO_FLAG_SCAN_THREAD) { dsl_scan_phys_t *scn_phys = &spa->spa_dsl_pool->dp_scan->scn_phys; uint64_t *processed = &scn_phys->scn_processed; /* XXX cleanup? */ if (vd->vdev_ops->vdev_op_leaf) atomic_add_64(processed, psize); vs->vs_scan_processed += psize; } if (flags & ZIO_FLAG_SELF_HEAL) vs->vs_self_healed += psize; } vs->vs_ops[type]++; vs->vs_bytes[type] += psize; mutex_exit(&vd->vdev_stat_lock); return; } if (flags & ZIO_FLAG_SPECULATIVE) return; /* * If this is an I/O error that is going to be retried, then ignore the * error. Otherwise, the user may interpret B_FAILFAST I/O errors as * hard errors, when in reality they can happen for any number of * innocuous reasons (bus resets, MPxIO link failure, etc). */ if (zio->io_error == EIO && !(zio->io_flags & ZIO_FLAG_IO_RETRY)) return; /* * Intent logs writes won't propagate their error to the root * I/O so don't mark these types of failures as pool-level * errors. */ if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) return; mutex_enter(&vd->vdev_stat_lock); if (type == ZIO_TYPE_READ && !vdev_is_dead(vd)) { if (zio->io_error == ECKSUM) vs->vs_checksum_errors++; else vs->vs_read_errors++; } if (type == ZIO_TYPE_WRITE && !vdev_is_dead(vd)) vs->vs_write_errors++; mutex_exit(&vd->vdev_stat_lock); if (type == ZIO_TYPE_WRITE && txg != 0 && (!(flags & ZIO_FLAG_IO_REPAIR) || (flags & ZIO_FLAG_SCAN_THREAD) || spa->spa_claiming)) { /* * This is either a normal write (not a repair), or it's * a repair induced by the scrub thread, or it's a repair * made by zil_claim() during spa_load() in the first txg. * In the normal case, we commit the DTL change in the same * txg as the block was born. In the scrub-induced repair * case, we know that scrubs run in first-pass syncing context, * so we commit the DTL change in spa_syncing_txg(spa). * In the zil_claim() case, we commit in spa_first_txg(spa). * * We currently do not make DTL entries for failed spontaneous * self-healing writes triggered by normal (non-scrubbing) * reads, because we have no transactional context in which to * do so -- and it's not clear that it'd be desirable anyway. */ if (vd->vdev_ops->vdev_op_leaf) { uint64_t commit_txg = txg; if (flags & ZIO_FLAG_SCAN_THREAD) { ASSERT(flags & ZIO_FLAG_IO_REPAIR); ASSERT(spa_sync_pass(spa) == 1); vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1); commit_txg = spa_syncing_txg(spa); } else if (spa->spa_claiming) { ASSERT(flags & ZIO_FLAG_IO_REPAIR); commit_txg = spa_first_txg(spa); } ASSERT(commit_txg >= spa_syncing_txg(spa)); if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1)) return; for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1); vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg); } if (vd != rvd) vdev_dtl_dirty(vd, DTL_MISSING, txg, 1); } } /* * Update the in-core space usage stats for this vdev, its metaslab class, * and the root vdev. */ void vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta, int64_t space_delta) { int64_t dspace_delta = space_delta; spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; metaslab_group_t *mg = vd->vdev_mg; metaslab_class_t *mc = mg ? mg->mg_class : NULL; ASSERT(vd == vd->vdev_top); /* * Apply the inverse of the psize-to-asize (ie. RAID-Z) space-expansion * factor. We must calculate this here and not at the root vdev * because the root vdev's psize-to-asize is simply the max of its * childrens', thus not accurate enough for us. */ ASSERT((dspace_delta & (SPA_MINBLOCKSIZE-1)) == 0); ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache); dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio; mutex_enter(&vd->vdev_stat_lock); vd->vdev_stat.vs_alloc += alloc_delta; vd->vdev_stat.vs_space += space_delta; vd->vdev_stat.vs_dspace += dspace_delta; mutex_exit(&vd->vdev_stat_lock); if (mc == spa_normal_class(spa)) { mutex_enter(&rvd->vdev_stat_lock); rvd->vdev_stat.vs_alloc += alloc_delta; rvd->vdev_stat.vs_space += space_delta; rvd->vdev_stat.vs_dspace += dspace_delta; mutex_exit(&rvd->vdev_stat_lock); } if (mc != NULL) { ASSERT(rvd == vd->vdev_parent); ASSERT(vd->vdev_ms_count != 0); metaslab_class_space_update(mc, alloc_delta, defer_delta, space_delta, dspace_delta); } } /* * Mark a top-level vdev's config as dirty, placing it on the dirty list * so that it will be written out next time the vdev configuration is synced. * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs. */ void vdev_config_dirty(vdev_t *vd) { spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; int c; ASSERT(spa_writeable(spa)); /* * If this is an aux vdev (as with l2cache and spare devices), then we * update the vdev config manually and set the sync flag. */ if (vd->vdev_aux != NULL) { spa_aux_vdev_t *sav = vd->vdev_aux; nvlist_t **aux; uint_t naux; for (c = 0; c < sav->sav_count; c++) { if (sav->sav_vdevs[c] == vd) break; } if (c == sav->sav_count) { /* * We're being removed. There's nothing more to do. */ ASSERT(sav->sav_sync == B_TRUE); return; } sav->sav_sync = B_TRUE; if (nvlist_lookup_nvlist_array(sav->sav_config, ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) { VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, ZPOOL_CONFIG_SPARES, &aux, &naux) == 0); } ASSERT(c < naux); /* * Setting the nvlist in the middle if the array is a little * sketchy, but it will work. */ nvlist_free(aux[c]); aux[c] = vdev_config_generate(spa, vd, B_TRUE, 0); return; } /* * The dirty list is protected by the SCL_CONFIG lock. The caller * must either hold SCL_CONFIG as writer, or must be the sync thread * (which holds SCL_CONFIG as reader). There's only one sync thread, * so this is sufficient to ensure mutual exclusion. */ ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) || (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_CONFIG, RW_READER))); if (vd == rvd) { for (c = 0; c < rvd->vdev_children; c++) vdev_config_dirty(rvd->vdev_child[c]); } else { ASSERT(vd == vd->vdev_top); if (!list_link_active(&vd->vdev_config_dirty_node) && !vd->vdev_ishole) list_insert_head(&spa->spa_config_dirty_list, vd); } } void vdev_config_clean(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) || (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_CONFIG, RW_READER))); ASSERT(list_link_active(&vd->vdev_config_dirty_node)); list_remove(&spa->spa_config_dirty_list, vd); } /* * Mark a top-level vdev's state as dirty, so that the next pass of * spa_sync() can convert this into vdev_config_dirty(). We distinguish * the state changes from larger config changes because they require * much less locking, and are often needed for administrative actions. */ void vdev_state_dirty(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_writeable(spa)); ASSERT(vd == vd->vdev_top); /* * The state list is protected by the SCL_STATE lock. The caller * must either hold SCL_STATE as writer, or must be the sync thread * (which holds SCL_STATE as reader). There's only one sync thread, * so this is sufficient to ensure mutual exclusion. */ ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) || (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_STATE, RW_READER))); if (!list_link_active(&vd->vdev_state_dirty_node) && !vd->vdev_ishole) list_insert_head(&spa->spa_state_dirty_list, vd); } void vdev_state_clean(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) || (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_STATE, RW_READER))); ASSERT(list_link_active(&vd->vdev_state_dirty_node)); list_remove(&spa->spa_state_dirty_list, vd); } /* * Propagate vdev state up from children to parent. */ void vdev_propagate_state(vdev_t *vd) { spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; int degraded = 0, faulted = 0; int corrupted = 0; vdev_t *child; if (vd->vdev_children > 0) { for (int c = 0; c < vd->vdev_children; c++) { child = vd->vdev_child[c]; /* * Don't factor holes into the decision. */ if (child->vdev_ishole) continue; if (!vdev_readable(child) || (!vdev_writeable(child) && spa_writeable(spa))) { /* * Root special: if there is a top-level log * device, treat the root vdev as if it were * degraded. */ if (child->vdev_islog && vd == rvd) degraded++; else faulted++; } else if (child->vdev_state <= VDEV_STATE_DEGRADED) { degraded++; } if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA) corrupted++; } vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded); /* * Root special: if there is a top-level vdev that cannot be * opened due to corrupted metadata, then propagate the root * vdev's aux state as 'corrupt' rather than 'insufficient * replicas'. */ if (corrupted && vd == rvd && rvd->vdev_state == VDEV_STATE_CANT_OPEN) vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); } if (vd->vdev_parent) vdev_propagate_state(vd->vdev_parent); } /* * Set a vdev's state. If this is during an open, we don't update the parent * state, because we're in the process of opening children depth-first. * Otherwise, we propagate the change to the parent. * * If this routine places a device in a faulted state, an appropriate ereport is * generated. */ void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) { uint64_t save_state; spa_t *spa = vd->vdev_spa; if (state == vd->vdev_state) { vd->vdev_stat.vs_aux = aux; return; } save_state = vd->vdev_state; vd->vdev_state = state; vd->vdev_stat.vs_aux = aux; /* * If we are setting the vdev state to anything but an open state, then * always close the underlying device unless the device has requested * a delayed close (i.e. we're about to remove or fault the device). * Otherwise, we keep accessible but invalid devices open forever. * We don't call vdev_close() itself, because that implies some extra * checks (offline, etc) that we don't want here. This is limited to * leaf devices, because otherwise closing the device will affect other * children. */ if (!vd->vdev_delayed_close && vdev_is_dead(vd) && vd->vdev_ops->vdev_op_leaf) vd->vdev_ops->vdev_op_close(vd); /* * If we have brought this vdev back into service, we need * to notify fmd so that it can gracefully repair any outstanding * cases due to a missing device. We do this in all cases, even those * that probably don't correlate to a repaired fault. This is sure to * catch all cases, and we let the zfs-retire agent sort it out. If * this is a transient state it's OK, as the retire agent will * double-check the state of the vdev before repairing it. */ if (state == VDEV_STATE_HEALTHY && vd->vdev_ops->vdev_op_leaf && vd->vdev_prevstate != state) zfs_post_state_change(spa, vd); if (vd->vdev_removed && state == VDEV_STATE_CANT_OPEN && (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) { /* * If the previous state is set to VDEV_STATE_REMOVED, then this * device was previously marked removed and someone attempted to * reopen it. If this failed due to a nonexistent device, then * keep the device in the REMOVED state. We also let this be if * it is one of our special test online cases, which is only * attempting to online the device and shouldn't generate an FMA * fault. */ vd->vdev_state = VDEV_STATE_REMOVED; vd->vdev_stat.vs_aux = VDEV_AUX_NONE; } else if (state == VDEV_STATE_REMOVED) { vd->vdev_removed = B_TRUE; } else if (state == VDEV_STATE_CANT_OPEN) { /* * If we fail to open a vdev during an import or recovery, we * mark it as "not available", which signifies that it was * never there to begin with. Failure to open such a device * is not considered an error. */ if ((spa_load_state(spa) == SPA_LOAD_IMPORT || spa_load_state(spa) == SPA_LOAD_RECOVER) && vd->vdev_ops->vdev_op_leaf) vd->vdev_not_present = 1; /* * Post the appropriate ereport. If the 'prevstate' field is * set to something other than VDEV_STATE_UNKNOWN, it indicates * that this is part of a vdev_reopen(). In this case, we don't * want to post the ereport if the device was already in the * CANT_OPEN state beforehand. * * If the 'checkremove' flag is set, then this is an attempt to * online the device in response to an insertion event. If we * hit this case, then we have detected an insertion event for a * faulted or offline device that wasn't in the removed state. * In this scenario, we don't post an ereport because we are * about to replace the device, or attempt an online with * vdev_forcefault, which will generate the fault for us. */ if ((vd->vdev_prevstate != state || vd->vdev_forcefault) && !vd->vdev_not_present && !vd->vdev_checkremove && vd != spa->spa_root_vdev) { const char *class; switch (aux) { case VDEV_AUX_OPEN_FAILED: class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED; break; case VDEV_AUX_CORRUPT_DATA: class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA; break; case VDEV_AUX_NO_REPLICAS: class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS; break; case VDEV_AUX_BAD_GUID_SUM: class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM; break; case VDEV_AUX_TOO_SMALL: class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL; break; case VDEV_AUX_BAD_LABEL: class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL; break; default: class = FM_EREPORT_ZFS_DEVICE_UNKNOWN; } zfs_ereport_post(class, spa, vd, NULL, save_state, 0); } /* Erase any notion of persistent removed state */ vd->vdev_removed = B_FALSE; } else { vd->vdev_removed = B_FALSE; } if (!isopen && vd->vdev_parent) vdev_propagate_state(vd->vdev_parent); } /* * Check the vdev configuration to ensure that it's capable of supporting * a root pool. Currently, we do not support RAID-Z or partial configuration. * In addition, only a single top-level vdev is allowed and none of the leaves * can be wholedisks. */ boolean_t vdev_is_bootable(vdev_t *vd) { if (!vd->vdev_ops->vdev_op_leaf) { char *vdev_type = vd->vdev_ops->vdev_op_type; if (strcmp(vdev_type, VDEV_TYPE_ROOT) == 0 && vd->vdev_children > 1) { return (B_FALSE); } else if (strcmp(vdev_type, VDEV_TYPE_RAIDZ) == 0 || strcmp(vdev_type, VDEV_TYPE_MISSING) == 0) { return (B_FALSE); } - } else if (vd->vdev_wholedisk == 1) { - return (B_FALSE); } for (int c = 0; c < vd->vdev_children; c++) { if (!vdev_is_bootable(vd->vdev_child[c])) return (B_FALSE); } return (B_TRUE); } /* * Load the state from the original vdev tree (ovd) which * we've retrieved from the MOS config object. If the original * vdev was offline or faulted then we transfer that state to the * device in the current vdev tree (nvd). */ void vdev_load_log_state(vdev_t *nvd, vdev_t *ovd) { spa_t *spa = nvd->vdev_spa; ASSERT(nvd->vdev_top->vdev_islog); ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); ASSERT3U(nvd->vdev_guid, ==, ovd->vdev_guid); for (int c = 0; c < nvd->vdev_children; c++) vdev_load_log_state(nvd->vdev_child[c], ovd->vdev_child[c]); if (nvd->vdev_ops->vdev_op_leaf) { /* * Restore the persistent vdev state */ nvd->vdev_offline = ovd->vdev_offline; nvd->vdev_faulted = ovd->vdev_faulted; nvd->vdev_degraded = ovd->vdev_degraded; nvd->vdev_removed = ovd->vdev_removed; } } /* * Determine if a log device has valid content. If the vdev was * removed or faulted in the MOS config then we know that * the content on the log device has already been written to the pool. */ boolean_t vdev_log_state_valid(vdev_t *vd) { if (vd->vdev_ops->vdev_op_leaf && !vd->vdev_faulted && !vd->vdev_removed) return (B_TRUE); for (int c = 0; c < vd->vdev_children; c++) if (vdev_log_state_valid(vd->vdev_child[c])) return (B_TRUE); return (B_FALSE); } /* * Expand a vdev if possible. */ void vdev_expand(vdev_t *vd, uint64_t txg) { ASSERT(vd->vdev_top == vd); ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count) { VERIFY(vdev_metaslab_init(vd, txg) == 0); vdev_config_dirty(vd); } } /* * Split a vdev. */ void vdev_split(vdev_t *vd) { vdev_t *cvd, *pvd = vd->vdev_parent; vdev_remove_child(pvd, vd); vdev_compact_children(pvd); cvd = pvd->vdev_child[0]; if (pvd->vdev_children == 1) { vdev_remove_parent(cvd); cvd->vdev_splitting = B_TRUE; } vdev_propagate_state(cvd); } void vdev_deadman(vdev_t *vd) { for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; vdev_deadman(cvd); } if (vd->vdev_ops->vdev_op_leaf) { vdev_queue_t *vq = &vd->vdev_queue; mutex_enter(&vq->vq_lock); if (avl_numnodes(&vq->vq_active_tree) > 0) { spa_t *spa = vd->vdev_spa; zio_t *fio; uint64_t delta; /* * Look at the head of all the pending queues, * if any I/O has been outstanding for longer than * the spa_deadman_synctime we panic the system. */ fio = avl_first(&vq->vq_active_tree); delta = gethrtime() - fio->io_timestamp; if (delta > spa_deadman_synctime(spa)) { zfs_dbgmsg("SLOW IO: zio timestamp %lluns, " "delta %lluns, last io %lluns", fio->io_timestamp, delta, vq->vq_io_complete_ts); fm_panic("I/O to pool '%s' appears to be " "hung.", spa_name(spa)); } } mutex_exit(&vq->vq_lock); } }