diff --git a/cmd/zed/agents/fmd_api.c b/cmd/zed/agents/fmd_api.c index 4a6cfbf8c05c..fe43e2ab971e 100644 --- a/cmd/zed/agents/fmd_api.c +++ b/cmd/zed/agents/fmd_api.c @@ -1,781 +1,776 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. * * Copyright (c) 2016, Intel Corporation. + * Copyright (c) 2023, Klara Inc. */ /* * This file implements the minimal FMD module API required to support the * fault logic modules in ZED. This support includes module registration, * memory allocation, module property accessors, basic case management, * one-shot timers and SERD engines. * * In the ZED runtime, the modules are called from a single thread so no * locking is required in this emulated FMD environment. */ #include #include #include #include #include #include #include "fmd_api.h" #include "fmd_serd.h" #include "zfs_agents.h" #include "../zed_log.h" typedef struct fmd_modstat { fmd_stat_t ms_accepted; /* total events accepted by module */ fmd_stat_t ms_caseopen; /* cases currently open */ fmd_stat_t ms_casesolved; /* total cases solved by module */ fmd_stat_t ms_caseclosed; /* total cases closed by module */ } fmd_modstat_t; typedef struct fmd_module { const char *mod_name; /* basename of module (ro) */ const fmd_hdl_info_t *mod_info; /* module info registered with handle */ void *mod_spec; /* fmd_hdl_get/setspecific data value */ fmd_stat_t *mod_ustat; /* module specific custom stats */ uint_t mod_ustat_cnt; /* count of ustat stats */ fmd_modstat_t mod_stats; /* fmd built-in per-module statistics */ fmd_serd_hash_t mod_serds; /* hash of serd engs owned by module */ char *mod_vers; /* a copy of module version string */ } fmd_module_t; /* * ZED has two FMD hardwired module instances */ fmd_module_t zfs_retire_module; fmd_module_t zfs_diagnosis_module; /* * Enable a reasonable set of defaults for libumem debugging on DEBUG builds. */ #ifdef DEBUG const char * _umem_debug_init(void) { return ("default,verbose"); /* $UMEM_DEBUG setting */ } const char * _umem_logging_init(void) { return ("fail,contents"); /* $UMEM_LOGGING setting */ } #endif /* * Register a module with fmd and finish module initialization. * Returns an integer indicating whether it succeeded (zero) or * failed (non-zero). */ int fmd_hdl_register(fmd_hdl_t *hdl, int version, const fmd_hdl_info_t *mip) { (void) version; fmd_module_t *mp = (fmd_module_t *)hdl; mp->mod_info = mip; mp->mod_name = mip->fmdi_desc + 4; /* drop 'ZFS ' prefix */ mp->mod_spec = NULL; /* bare minimum module stats */ (void) strcpy(mp->mod_stats.ms_accepted.fmds_name, "fmd.accepted"); (void) strcpy(mp->mod_stats.ms_caseopen.fmds_name, "fmd.caseopen"); (void) strcpy(mp->mod_stats.ms_casesolved.fmds_name, "fmd.casesolved"); (void) strcpy(mp->mod_stats.ms_caseclosed.fmds_name, "fmd.caseclosed"); fmd_serd_hash_create(&mp->mod_serds); fmd_hdl_debug(hdl, "register module"); return (0); } void fmd_hdl_unregister(fmd_hdl_t *hdl) { fmd_module_t *mp = (fmd_module_t *)hdl; fmd_modstat_t *msp = &mp->mod_stats; const fmd_hdl_ops_t *ops = mp->mod_info->fmdi_ops; /* dump generic module stats */ fmd_hdl_debug(hdl, "%s: %llu", msp->ms_accepted.fmds_name, msp->ms_accepted.fmds_value.ui64); if (ops->fmdo_close != NULL) { fmd_hdl_debug(hdl, "%s: %llu", msp->ms_caseopen.fmds_name, msp->ms_caseopen.fmds_value.ui64); fmd_hdl_debug(hdl, "%s: %llu", msp->ms_casesolved.fmds_name, msp->ms_casesolved.fmds_value.ui64); fmd_hdl_debug(hdl, "%s: %llu", msp->ms_caseclosed.fmds_name, msp->ms_caseclosed.fmds_value.ui64); } /* dump module specific stats */ if (mp->mod_ustat != NULL) { int i; for (i = 0; i < mp->mod_ustat_cnt; i++) { fmd_hdl_debug(hdl, "%s: %llu", mp->mod_ustat[i].fmds_name, mp->mod_ustat[i].fmds_value.ui64); } } fmd_serd_hash_destroy(&mp->mod_serds); fmd_hdl_debug(hdl, "unregister module"); } /* * fmd_hdl_setspecific() is used to associate a data pointer with * the specified handle for the duration of the module's lifetime. * This pointer can be retrieved using fmd_hdl_getspecific(). */ void fmd_hdl_setspecific(fmd_hdl_t *hdl, void *spec) { fmd_module_t *mp = (fmd_module_t *)hdl; mp->mod_spec = spec; } /* * Return the module-specific data pointer previously associated * with the handle using fmd_hdl_setspecific(). */ void * fmd_hdl_getspecific(fmd_hdl_t *hdl) { fmd_module_t *mp = (fmd_module_t *)hdl; return (mp->mod_spec); } void * fmd_hdl_alloc(fmd_hdl_t *hdl, size_t size, int flags) { (void) hdl; return (umem_alloc(size, flags)); } void * fmd_hdl_zalloc(fmd_hdl_t *hdl, size_t size, int flags) { (void) hdl; return (umem_zalloc(size, flags)); } void fmd_hdl_free(fmd_hdl_t *hdl, void *data, size_t size) { (void) hdl; umem_free(data, size); } /* * Record a module debug message using the specified format. */ void fmd_hdl_debug(fmd_hdl_t *hdl, const char *format, ...) { char message[256]; va_list vargs; fmd_module_t *mp = (fmd_module_t *)hdl; va_start(vargs, format); (void) vsnprintf(message, sizeof (message), format, vargs); va_end(vargs); /* prefix message with module name */ zed_log_msg(LOG_INFO, "%s: %s", mp->mod_name, message); } /* Property Retrieval */ int32_t fmd_prop_get_int32(fmd_hdl_t *hdl, const char *name) { (void) hdl; /* * These can be looked up in mp->modinfo->fmdi_props * For now we just hard code for phase 2. In the * future, there can be a ZED based override. */ if (strcmp(name, "spare_on_remove") == 0) return (1); - if (strcmp(name, "io_N") == 0 || strcmp(name, "checksum_N") == 0) - return (10); /* N = 10 events */ - - return (0); -} - -int64_t -fmd_prop_get_int64(fmd_hdl_t *hdl, const char *name) -{ - (void) hdl; - - /* - * These can be looked up in mp->modinfo->fmdi_props - * For now we just hard code for phase 2. In the - * future, there can be a ZED based override. - */ - if (strcmp(name, "remove_timeout") == 0) - return (15ULL * 1000ULL * 1000ULL * 1000ULL); /* 15 sec */ - - if (strcmp(name, "io_T") == 0 || strcmp(name, "checksum_T") == 0) - return (1000ULL * 1000ULL * 1000ULL * 600ULL); /* 10 min */ - return (0); } /* FMD Statistics */ fmd_stat_t * fmd_stat_create(fmd_hdl_t *hdl, uint_t flags, uint_t nstats, fmd_stat_t *statv) { fmd_module_t *mp = (fmd_module_t *)hdl; if (flags == FMD_STAT_NOALLOC) { mp->mod_ustat = statv; mp->mod_ustat_cnt = nstats; } return (statv); } /* Case Management */ fmd_case_t * fmd_case_open(fmd_hdl_t *hdl, void *data) { fmd_module_t *mp = (fmd_module_t *)hdl; uuid_t uuid; fmd_case_t *cp; cp = fmd_hdl_zalloc(hdl, sizeof (fmd_case_t), FMD_SLEEP); cp->ci_mod = hdl; cp->ci_state = FMD_CASE_UNSOLVED; cp->ci_flags = FMD_CF_DIRTY; cp->ci_data = data; cp->ci_bufptr = NULL; cp->ci_bufsiz = 0; uuid_generate(uuid); uuid_unparse(uuid, cp->ci_uuid); fmd_hdl_debug(hdl, "case opened (%s)", cp->ci_uuid); mp->mod_stats.ms_caseopen.fmds_value.ui64++; return (cp); } void fmd_case_solve(fmd_hdl_t *hdl, fmd_case_t *cp) { fmd_module_t *mp = (fmd_module_t *)hdl; /* * For ZED, the event was already sent from fmd_case_add_suspect() */ if (cp->ci_state >= FMD_CASE_SOLVED) fmd_hdl_debug(hdl, "case is already solved or closed"); cp->ci_state = FMD_CASE_SOLVED; fmd_hdl_debug(hdl, "case solved (%s)", cp->ci_uuid); mp->mod_stats.ms_casesolved.fmds_value.ui64++; } void fmd_case_close(fmd_hdl_t *hdl, fmd_case_t *cp) { fmd_module_t *mp = (fmd_module_t *)hdl; const fmd_hdl_ops_t *ops = mp->mod_info->fmdi_ops; fmd_hdl_debug(hdl, "case closed (%s)", cp->ci_uuid); if (ops->fmdo_close != NULL) ops->fmdo_close(hdl, cp); mp->mod_stats.ms_caseopen.fmds_value.ui64--; mp->mod_stats.ms_caseclosed.fmds_value.ui64++; if (cp->ci_bufptr != NULL && cp->ci_bufsiz > 0) fmd_hdl_free(hdl, cp->ci_bufptr, cp->ci_bufsiz); fmd_hdl_free(hdl, cp, sizeof (fmd_case_t)); } void fmd_case_uuresolved(fmd_hdl_t *hdl, const char *uuid) { fmd_hdl_debug(hdl, "case resolved by uuid (%s)", uuid); } boolean_t fmd_case_solved(fmd_hdl_t *hdl, fmd_case_t *cp) { (void) hdl; return (cp->ci_state >= FMD_CASE_SOLVED); } void fmd_case_add_ereport(fmd_hdl_t *hdl, fmd_case_t *cp, fmd_event_t *ep) { (void) hdl, (void) cp, (void) ep; } static void zed_log_fault(nvlist_t *nvl, const char *uuid, const char *code) { nvlist_t *rsrc; const char *strval; uint64_t guid; uint8_t byte; zed_log_msg(LOG_INFO, "\nzed_fault_event:"); if (uuid != NULL) zed_log_msg(LOG_INFO, "\t%s: %s", FM_SUSPECT_UUID, uuid); if (nvlist_lookup_string(nvl, FM_CLASS, &strval) == 0) zed_log_msg(LOG_INFO, "\t%s: %s", FM_CLASS, strval); if (code != NULL) zed_log_msg(LOG_INFO, "\t%s: %s", FM_SUSPECT_DIAG_CODE, code); if (nvlist_lookup_uint8(nvl, FM_FAULT_CERTAINTY, &byte) == 0) zed_log_msg(LOG_INFO, "\t%s: %hhu", FM_FAULT_CERTAINTY, byte); if (nvlist_lookup_nvlist(nvl, FM_FAULT_RESOURCE, &rsrc) == 0) { if (nvlist_lookup_string(rsrc, FM_FMRI_SCHEME, &strval) == 0) zed_log_msg(LOG_INFO, "\t%s: %s", FM_FMRI_SCHEME, strval); if (nvlist_lookup_uint64(rsrc, FM_FMRI_ZFS_POOL, &guid) == 0) zed_log_msg(LOG_INFO, "\t%s: %llu", FM_FMRI_ZFS_POOL, guid); if (nvlist_lookup_uint64(rsrc, FM_FMRI_ZFS_VDEV, &guid) == 0) zed_log_msg(LOG_INFO, "\t%s: %llu \n", FM_FMRI_ZFS_VDEV, guid); } } static const char * fmd_fault_mkcode(nvlist_t *fault) { const char *class; const char *code = "-"; /* * Note: message codes come from: openzfs/usr/src/cmd/fm/dicts/ZFS.po */ if (nvlist_lookup_string(fault, FM_CLASS, &class) == 0) { if (strcmp(class, "fault.fs.zfs.vdev.io") == 0) code = "ZFS-8000-FD"; else if (strcmp(class, "fault.fs.zfs.vdev.checksum") == 0) code = "ZFS-8000-GH"; else if (strcmp(class, "fault.fs.zfs.io_failure_wait") == 0) code = "ZFS-8000-HC"; else if (strcmp(class, "fault.fs.zfs.io_failure_continue") == 0) code = "ZFS-8000-JQ"; else if (strcmp(class, "fault.fs.zfs.log_replay") == 0) code = "ZFS-8000-K4"; else if (strcmp(class, "fault.fs.zfs.pool") == 0) code = "ZFS-8000-CS"; else if (strcmp(class, "fault.fs.zfs.device") == 0) code = "ZFS-8000-D3"; } return (code); } void fmd_case_add_suspect(fmd_hdl_t *hdl, fmd_case_t *cp, nvlist_t *fault) { nvlist_t *nvl; const char *code = fmd_fault_mkcode(fault); int64_t tod[2]; int err = 0; /* * payload derived from fmd_protocol_list() */ (void) gettimeofday(&cp->ci_tv, NULL); tod[0] = cp->ci_tv.tv_sec; tod[1] = cp->ci_tv.tv_usec; nvl = fmd_nvl_alloc(hdl, FMD_SLEEP); err |= nvlist_add_uint8(nvl, FM_VERSION, FM_SUSPECT_VERSION); err |= nvlist_add_string(nvl, FM_CLASS, FM_LIST_SUSPECT_CLASS); err |= nvlist_add_string(nvl, FM_SUSPECT_UUID, cp->ci_uuid); err |= nvlist_add_string(nvl, FM_SUSPECT_DIAG_CODE, code); err |= nvlist_add_int64_array(nvl, FM_SUSPECT_DIAG_TIME, tod, 2); err |= nvlist_add_uint32(nvl, FM_SUSPECT_FAULT_SZ, 1); err |= nvlist_add_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST, (const nvlist_t **)&fault, 1); if (err) zed_log_die("failed to populate nvlist"); zed_log_fault(fault, cp->ci_uuid, code); zfs_agent_post_event(FM_LIST_SUSPECT_CLASS, NULL, nvl); nvlist_free(nvl); nvlist_free(fault); } void fmd_case_setspecific(fmd_hdl_t *hdl, fmd_case_t *cp, void *data) { (void) hdl; cp->ci_data = data; } void * fmd_case_getspecific(fmd_hdl_t *hdl, fmd_case_t *cp) { (void) hdl; return (cp->ci_data); } void fmd_buf_create(fmd_hdl_t *hdl, fmd_case_t *cp, const char *name, size_t size) { assert(strcmp(name, "data") == 0), (void) name; assert(cp->ci_bufptr == NULL); assert(size < (1024 * 1024)); cp->ci_bufptr = fmd_hdl_alloc(hdl, size, FMD_SLEEP); cp->ci_bufsiz = size; } void fmd_buf_read(fmd_hdl_t *hdl, fmd_case_t *cp, const char *name, void *buf, size_t size) { (void) hdl; assert(strcmp(name, "data") == 0), (void) name; assert(cp->ci_bufptr != NULL); assert(size <= cp->ci_bufsiz); memcpy(buf, cp->ci_bufptr, size); } void fmd_buf_write(fmd_hdl_t *hdl, fmd_case_t *cp, const char *name, const void *buf, size_t size) { (void) hdl; assert(strcmp(name, "data") == 0), (void) name; assert(cp->ci_bufptr != NULL); assert(cp->ci_bufsiz >= size); memcpy(cp->ci_bufptr, buf, size); } /* SERD Engines */ void fmd_serd_create(fmd_hdl_t *hdl, const char *name, uint_t n, hrtime_t t) { fmd_module_t *mp = (fmd_module_t *)hdl; if (fmd_serd_eng_lookup(&mp->mod_serds, name) != NULL) { zed_log_msg(LOG_ERR, "failed to create SERD engine '%s': " " name already exists", name); return; } (void) fmd_serd_eng_insert(&mp->mod_serds, name, n, t); } void fmd_serd_destroy(fmd_hdl_t *hdl, const char *name) { fmd_module_t *mp = (fmd_module_t *)hdl; fmd_serd_eng_delete(&mp->mod_serds, name); fmd_hdl_debug(hdl, "serd_destroy %s", name); } int fmd_serd_exists(fmd_hdl_t *hdl, const char *name) { fmd_module_t *mp = (fmd_module_t *)hdl; return (fmd_serd_eng_lookup(&mp->mod_serds, name) != NULL); } -void -fmd_serd_reset(fmd_hdl_t *hdl, const char *name) +int +fmd_serd_active(fmd_hdl_t *hdl, const char *name) { fmd_module_t *mp = (fmd_module_t *)hdl; fmd_serd_eng_t *sgp; if ((sgp = fmd_serd_eng_lookup(&mp->mod_serds, name)) == NULL) { zed_log_msg(LOG_ERR, "serd engine '%s' does not exist", name); - return; + return (0); } + return (fmd_serd_eng_fired(sgp) || !fmd_serd_eng_empty(sgp)); +} - fmd_serd_eng_reset(sgp); +void +fmd_serd_reset(fmd_hdl_t *hdl, const char *name) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + fmd_serd_eng_t *sgp; - fmd_hdl_debug(hdl, "serd_reset %s", name); + if ((sgp = fmd_serd_eng_lookup(&mp->mod_serds, name)) == NULL) { + zed_log_msg(LOG_ERR, "serd engine '%s' does not exist", name); + } else { + fmd_serd_eng_reset(sgp); + fmd_hdl_debug(hdl, "serd_reset %s", name); + } } int fmd_serd_record(fmd_hdl_t *hdl, const char *name, fmd_event_t *ep) { fmd_module_t *mp = (fmd_module_t *)hdl; fmd_serd_eng_t *sgp; - int err; if ((sgp = fmd_serd_eng_lookup(&mp->mod_serds, name)) == NULL) { zed_log_msg(LOG_ERR, "failed to add record to SERD engine '%s'", name); return (0); } - err = fmd_serd_eng_record(sgp, ep->ev_hrt); + return (fmd_serd_eng_record(sgp, ep->ev_hrt)); +} + +void +fmd_serd_gc(fmd_hdl_t *hdl) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; - return (err); + fmd_serd_hash_apply(&mp->mod_serds, fmd_serd_eng_gc, NULL); } /* FMD Timers */ static void _timer_notify(union sigval sv) { fmd_timer_t *ftp = sv.sival_ptr; fmd_hdl_t *hdl = ftp->ft_hdl; fmd_module_t *mp = (fmd_module_t *)hdl; const fmd_hdl_ops_t *ops = mp->mod_info->fmdi_ops; struct itimerspec its; - fmd_hdl_debug(hdl, "timer fired (%p)", ftp->ft_tid); + fmd_hdl_debug(hdl, "%s timer fired (%p)", mp->mod_name, ftp->ft_tid); /* disarm the timer */ memset(&its, 0, sizeof (struct itimerspec)); timer_settime(ftp->ft_tid, 0, &its, NULL); /* Note that the fmdo_timeout can remove this timer */ if (ops->fmdo_timeout != NULL) ops->fmdo_timeout(hdl, ftp, ftp->ft_arg); } /* * Install a new timer which will fire at least delta nanoseconds after the * current time. After the timeout has expired, the module's fmdo_timeout * entry point is called. */ fmd_timer_t * fmd_timer_install(fmd_hdl_t *hdl, void *arg, fmd_event_t *ep, hrtime_t delta) { (void) ep; struct sigevent sev; struct itimerspec its; fmd_timer_t *ftp; ftp = fmd_hdl_alloc(hdl, sizeof (fmd_timer_t), FMD_SLEEP); ftp->ft_arg = arg; ftp->ft_hdl = hdl; its.it_value.tv_sec = delta / 1000000000; its.it_value.tv_nsec = delta % 1000000000; its.it_interval.tv_sec = its.it_value.tv_sec; its.it_interval.tv_nsec = its.it_value.tv_nsec; sev.sigev_notify = SIGEV_THREAD; sev.sigev_notify_function = _timer_notify; sev.sigev_notify_attributes = NULL; sev.sigev_value.sival_ptr = ftp; sev.sigev_signo = 0; timer_create(CLOCK_REALTIME, &sev, &ftp->ft_tid); timer_settime(ftp->ft_tid, 0, &its, NULL); fmd_hdl_debug(hdl, "installing timer for %d secs (%p)", (int)its.it_value.tv_sec, ftp->ft_tid); return (ftp); } void fmd_timer_remove(fmd_hdl_t *hdl, fmd_timer_t *ftp) { fmd_hdl_debug(hdl, "removing timer (%p)", ftp->ft_tid); timer_delete(ftp->ft_tid); fmd_hdl_free(hdl, ftp, sizeof (fmd_timer_t)); } /* Name-Value Pair Lists */ nvlist_t * fmd_nvl_create_fault(fmd_hdl_t *hdl, const char *class, uint8_t certainty, nvlist_t *asru, nvlist_t *fru, nvlist_t *resource) { (void) hdl; nvlist_t *nvl; int err = 0; if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) zed_log_die("failed to xalloc fault nvlist"); err |= nvlist_add_uint8(nvl, FM_VERSION, FM_FAULT_VERSION); err |= nvlist_add_string(nvl, FM_CLASS, class); err |= nvlist_add_uint8(nvl, FM_FAULT_CERTAINTY, certainty); if (asru != NULL) err |= nvlist_add_nvlist(nvl, FM_FAULT_ASRU, asru); if (fru != NULL) err |= nvlist_add_nvlist(nvl, FM_FAULT_FRU, fru); if (resource != NULL) err |= nvlist_add_nvlist(nvl, FM_FAULT_RESOURCE, resource); if (err) zed_log_die("failed to populate nvlist: %s\n", strerror(err)); return (nvl); } /* * sourced from fmd_string.c */ static int fmd_strmatch(const char *s, const char *p) { char c; if (p == NULL) return (0); if (s == NULL) s = ""; /* treat NULL string as the empty string */ do { if ((c = *p++) == '\0') return (*s == '\0'); if (c == '*') { while (*p == '*') p++; /* consecutive *'s can be collapsed */ if (*p == '\0') return (1); while (*s != '\0') { if (fmd_strmatch(s++, p) != 0) return (1); } return (0); } } while (c == *s++); return (0); } int fmd_nvl_class_match(fmd_hdl_t *hdl, nvlist_t *nvl, const char *pattern) { (void) hdl; const char *class; return (nvl != NULL && nvlist_lookup_string(nvl, FM_CLASS, &class) == 0 && fmd_strmatch(class, pattern)); } nvlist_t * fmd_nvl_alloc(fmd_hdl_t *hdl, int flags) { (void) hdl, (void) flags; nvlist_t *nvl = NULL; if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) return (NULL); return (nvl); } /* * ZED Agent specific APIs */ fmd_hdl_t * fmd_module_hdl(const char *name) { if (strcmp(name, "zfs-retire") == 0) return ((fmd_hdl_t *)&zfs_retire_module); if (strcmp(name, "zfs-diagnosis") == 0) return ((fmd_hdl_t *)&zfs_diagnosis_module); return (NULL); } boolean_t fmd_module_initialized(fmd_hdl_t *hdl) { fmd_module_t *mp = (fmd_module_t *)hdl; return (mp->mod_info != NULL); } /* * fmd_module_recv is called for each event that is received by * the fault manager that has a class that matches one of the * module's subscriptions. */ void fmd_module_recv(fmd_hdl_t *hdl, nvlist_t *nvl, const char *class) { fmd_module_t *mp = (fmd_module_t *)hdl; const fmd_hdl_ops_t *ops = mp->mod_info->fmdi_ops; fmd_event_t faux_event = {0}; int64_t *tv; uint_t n; /* * Will need to normalized this if we persistently store the case data */ if (nvlist_lookup_int64_array(nvl, FM_EREPORT_TIME, &tv, &n) == 0) faux_event.ev_hrt = tv[0] * NANOSEC + tv[1]; else faux_event.ev_hrt = 0; ops->fmdo_recv(hdl, &faux_event, nvl, class); mp->mod_stats.ms_accepted.fmds_value.ui64++; /* TBD - should we initiate fm_module_gc() periodically? */ } diff --git a/cmd/zed/agents/fmd_api.h b/cmd/zed/agents/fmd_api.h index b940d0d395ec..8471feecf33f 100644 --- a/cmd/zed/agents/fmd_api.h +++ b/cmd/zed/agents/fmd_api.h @@ -1,241 +1,242 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. * * Copyright (c) 2016, Intel Corporation. */ #ifndef _FMD_API_H #define _FMD_API_H #include #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif /* * Fault Management Daemon Client Interfaces */ #define FMD_API_VERSION 5 typedef struct fmd_hdl fmd_hdl_t; typedef struct fmd_timer { timer_t ft_tid; void *ft_arg; fmd_hdl_t *ft_hdl; } fmd_timer_t; #define id_t fmd_timer_t * typedef struct fmd_event { hrtime_t ev_hrt; /* event time used by SERD engines */ } fmd_event_t; typedef struct fmd_case { char ci_uuid[48]; /* uuid string for this case */ fmd_hdl_t *ci_mod; /* module that owns this case */ void *ci_data; /* data from fmd_case_setspecific() */ ushort_t ci_state; /* case state (see below) */ ushort_t ci_flags; /* case flags (see below) */ struct timeval ci_tv; /* time of original diagnosis */ void *ci_bufptr; /* case data serialization buffer */ size_t ci_bufsiz; } fmd_case_t; #define FMD_CASE_UNSOLVED 0 /* case is not yet solved (waiting) */ #define FMD_CASE_SOLVED 1 /* case is solved (suspects added) */ #define FMD_CASE_CLOSE_WAIT 2 /* case is executing fmdo_close() */ #define FMD_CASE_CLOSED 3 /* case is closed (reconfig done) */ #define FMD_CASE_REPAIRED 4 /* case is repaired */ #define FMD_CASE_RESOLVED 5 /* case is resolved (can be freed) */ #define FMD_CF_DIRTY 0x01 /* case is in need of checkpoint */ #define FMD_CF_SOLVED 0x02 /* case has been solved */ #define FMD_CF_ISOLATED 0x04 /* case has been isolated */ #define FMD_CF_REPAIRED 0x08 /* case has been repaired */ #define FMD_CF_RESOLVED 0x10 /* case has been resolved */ #define FMD_TYPE_BOOL 0 /* int */ #define FMD_TYPE_INT32 1 /* int32_t */ #define FMD_TYPE_UINT32 2 /* uint32_t */ #define FMD_TYPE_INT64 3 /* int64_t */ #define FMD_TYPE_UINT64 4 /* uint64_t */ #define FMD_TYPE_TIME 5 /* uint64_t */ #define FMD_TYPE_SIZE 6 /* uint64_t */ typedef struct fmd_prop { const char *fmdp_name; /* property name */ uint_t fmdp_type; /* property type (see above) */ const char *fmdp_defv; /* default value */ } fmd_prop_t; typedef struct fmd_stat { char fmds_name[32]; /* statistic name */ uint_t fmds_type; /* statistic type (see above) */ char fmds_desc[64]; /* statistic description */ union { int bool; /* FMD_TYPE_BOOL */ int32_t i32; /* FMD_TYPE_INT32 */ uint32_t ui32; /* FMD_TYPE_UINT32 */ int64_t i64; /* FMD_TYPE_INT64 */ uint64_t ui64; /* FMD_TYPE_UINT64 */ } fmds_value; } fmd_stat_t; typedef struct fmd_hdl_ops { void (*fmdo_recv)(fmd_hdl_t *, fmd_event_t *, nvlist_t *, const char *); void (*fmdo_timeout)(fmd_hdl_t *, id_t, void *); void (*fmdo_close)(fmd_hdl_t *, fmd_case_t *); void (*fmdo_stats)(fmd_hdl_t *); void (*fmdo_gc)(fmd_hdl_t *); } fmd_hdl_ops_t; #define FMD_SEND_SUCCESS 0 /* fmdo_send queued event */ #define FMD_SEND_FAILED 1 /* fmdo_send unrecoverable error */ #define FMD_SEND_RETRY 2 /* fmdo_send requests retry */ typedef struct fmd_hdl_info { const char *fmdi_desc; /* fmd client description string */ const char *fmdi_vers; /* fmd client version string */ const fmd_hdl_ops_t *fmdi_ops; /* ops vector for client */ const fmd_prop_t *fmdi_props; /* array of configuration props */ } fmd_hdl_info_t; extern int fmd_hdl_register(fmd_hdl_t *, int, const fmd_hdl_info_t *); extern void fmd_hdl_unregister(fmd_hdl_t *); extern void fmd_hdl_setspecific(fmd_hdl_t *, void *); extern void *fmd_hdl_getspecific(fmd_hdl_t *); #define FMD_SLEEP UMEM_NOFAIL extern void *fmd_hdl_alloc(fmd_hdl_t *, size_t, int); extern void *fmd_hdl_zalloc(fmd_hdl_t *, size_t, int); extern void fmd_hdl_free(fmd_hdl_t *, void *, size_t); extern char *fmd_hdl_strdup(fmd_hdl_t *, const char *, int); extern void fmd_hdl_strfree(fmd_hdl_t *, char *); extern void fmd_hdl_vdebug(fmd_hdl_t *, const char *, va_list); extern void fmd_hdl_debug(fmd_hdl_t *, const char *, ...); extern int32_t fmd_prop_get_int32(fmd_hdl_t *, const char *); -extern int64_t fmd_prop_get_int64(fmd_hdl_t *, const char *); #define FMD_STAT_NOALLOC 0x0 /* fmd should use caller's memory */ #define FMD_STAT_ALLOC 0x1 /* fmd should allocate stats memory */ extern fmd_stat_t *fmd_stat_create(fmd_hdl_t *, uint_t, uint_t, fmd_stat_t *); extern void fmd_stat_destroy(fmd_hdl_t *, uint_t, fmd_stat_t *); extern void fmd_stat_setstr(fmd_hdl_t *, fmd_stat_t *, const char *); extern fmd_case_t *fmd_case_open(fmd_hdl_t *, void *); extern void fmd_case_reset(fmd_hdl_t *, fmd_case_t *); extern void fmd_case_solve(fmd_hdl_t *, fmd_case_t *); extern void fmd_case_close(fmd_hdl_t *, fmd_case_t *); extern const char *fmd_case_uuid(fmd_hdl_t *, fmd_case_t *); extern fmd_case_t *fmd_case_uulookup(fmd_hdl_t *, const char *); extern void fmd_case_uuclose(fmd_hdl_t *, const char *); extern int fmd_case_uuclosed(fmd_hdl_t *, const char *); extern int fmd_case_uuisresolved(fmd_hdl_t *, const char *); extern void fmd_case_uuresolved(fmd_hdl_t *, const char *); extern boolean_t fmd_case_solved(fmd_hdl_t *, fmd_case_t *); extern void fmd_case_add_ereport(fmd_hdl_t *, fmd_case_t *, fmd_event_t *); extern void fmd_case_add_serd(fmd_hdl_t *, fmd_case_t *, const char *); extern void fmd_case_add_suspect(fmd_hdl_t *, fmd_case_t *, nvlist_t *); extern void fmd_case_setspecific(fmd_hdl_t *, fmd_case_t *, void *); extern void *fmd_case_getspecific(fmd_hdl_t *, fmd_case_t *); extern fmd_case_t *fmd_case_next(fmd_hdl_t *, fmd_case_t *); extern fmd_case_t *fmd_case_prev(fmd_hdl_t *, fmd_case_t *); extern void fmd_buf_create(fmd_hdl_t *, fmd_case_t *, const char *, size_t); extern void fmd_buf_destroy(fmd_hdl_t *, fmd_case_t *, const char *); extern void fmd_buf_read(fmd_hdl_t *, fmd_case_t *, const char *, void *, size_t); extern void fmd_buf_write(fmd_hdl_t *, fmd_case_t *, const char *, const void *, size_t); extern size_t fmd_buf_size(fmd_hdl_t *, fmd_case_t *, const char *); extern void fmd_serd_create(fmd_hdl_t *, const char *, uint_t, hrtime_t); extern void fmd_serd_destroy(fmd_hdl_t *, const char *); extern int fmd_serd_exists(fmd_hdl_t *, const char *); +extern int fmd_serd_active(fmd_hdl_t *, const char *); extern void fmd_serd_reset(fmd_hdl_t *, const char *); extern int fmd_serd_record(fmd_hdl_t *, const char *, fmd_event_t *); extern int fmd_serd_fired(fmd_hdl_t *, const char *); extern int fmd_serd_empty(fmd_hdl_t *, const char *); +extern void fmd_serd_gc(fmd_hdl_t *); extern id_t fmd_timer_install(fmd_hdl_t *, void *, fmd_event_t *, hrtime_t); extern void fmd_timer_remove(fmd_hdl_t *, id_t); extern nvlist_t *fmd_nvl_create_fault(fmd_hdl_t *, const char *, uint8_t, nvlist_t *, nvlist_t *, nvlist_t *); extern int fmd_nvl_class_match(fmd_hdl_t *, nvlist_t *, const char *); #define FMD_HAS_FAULT_FRU 0 #define FMD_HAS_FAULT_ASRU 1 #define FMD_HAS_FAULT_RESOURCE 2 extern void fmd_repair_fru(fmd_hdl_t *, const char *); extern int fmd_repair_asru(fmd_hdl_t *, const char *); extern nvlist_t *fmd_nvl_alloc(fmd_hdl_t *, int); extern nvlist_t *fmd_nvl_dup(fmd_hdl_t *, nvlist_t *, int); /* * ZED Specific Interfaces */ extern fmd_hdl_t *fmd_module_hdl(const char *); extern boolean_t fmd_module_initialized(fmd_hdl_t *); extern void fmd_module_recv(fmd_hdl_t *, nvlist_t *, const char *); /* ZFS FMA Retire Agent */ extern void _zfs_retire_init(fmd_hdl_t *); extern void _zfs_retire_fini(fmd_hdl_t *); /* ZFS FMA Diagnosis Engine */ extern void _zfs_diagnosis_init(fmd_hdl_t *); extern void _zfs_diagnosis_fini(fmd_hdl_t *); #ifdef __cplusplus } #endif #endif /* _FMD_API_H */ diff --git a/cmd/zed/agents/fmd_serd.c b/cmd/zed/agents/fmd_serd.c index 0bb2c535f094..f942e62b3f48 100644 --- a/cmd/zed/agents/fmd_serd.c +++ b/cmd/zed/agents/fmd_serd.c @@ -1,335 +1,336 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License, Version 1.0 only * (the "License"). You may not use this file except in compliance * with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2004 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * * Copyright (c) 2016, Intel Corporation. */ #include #include #include #include #include #include #include "fmd_api.h" #include "fmd_serd.h" #include "../zed_log.h" #define FMD_STR_BUCKETS 211 #ifdef SERD_ENG_DEBUG #define serd_log_msg(fmt, ...) \ zed_log_msg(LOG_INFO, fmt, __VA_ARGS__) #else #define serd_log_msg(fmt, ...) #endif /* * SERD Engine Backend */ /* * Compute the delta between events in nanoseconds. To account for very old * events which are replayed, we must handle the case where time is negative. * We convert the hrtime_t's to unsigned 64-bit integers and then handle the * case where 'old' is greater than 'new' (i.e. high-res time has wrapped). */ static hrtime_t fmd_event_delta(hrtime_t t1, hrtime_t t2) { uint64_t old = t1; uint64_t new = t2; return (new >= old ? new - old : (UINT64_MAX - old) + new + 1); } static fmd_serd_eng_t * fmd_serd_eng_alloc(const char *name, uint64_t n, hrtime_t t) { fmd_serd_eng_t *sgp; sgp = malloc(sizeof (fmd_serd_eng_t)); if (sgp == NULL) { perror("malloc"); exit(EXIT_FAILURE); } memset(sgp, 0, sizeof (fmd_serd_eng_t)); sgp->sg_name = strdup(name); if (sgp->sg_name == NULL) { perror("strdup"); exit(EXIT_FAILURE); } sgp->sg_flags = FMD_SERD_DIRTY; sgp->sg_n = n; sgp->sg_t = t; list_create(&sgp->sg_list, sizeof (fmd_serd_elem_t), offsetof(fmd_serd_elem_t, se_list)); return (sgp); } static void fmd_serd_eng_free(fmd_serd_eng_t *sgp) { fmd_serd_eng_reset(sgp); free(sgp->sg_name); list_destroy(&sgp->sg_list); free(sgp); } /* * sourced from fmd_string.c */ static ulong_t fmd_strhash(const char *key) { ulong_t g, h = 0; const char *p; for (p = key; *p != '\0'; p++) { h = (h << 4) + *p; if ((g = (h & 0xf0000000)) != 0) { h ^= (g >> 24); h ^= g; } } return (h); } void fmd_serd_hash_create(fmd_serd_hash_t *shp) { shp->sh_hashlen = FMD_STR_BUCKETS; shp->sh_hash = calloc(shp->sh_hashlen, sizeof (void *)); shp->sh_count = 0; if (shp->sh_hash == NULL) { perror("calloc"); exit(EXIT_FAILURE); } } void fmd_serd_hash_destroy(fmd_serd_hash_t *shp) { fmd_serd_eng_t *sgp, *ngp; uint_t i; for (i = 0; i < shp->sh_hashlen; i++) { for (sgp = shp->sh_hash[i]; sgp != NULL; sgp = ngp) { ngp = sgp->sg_next; fmd_serd_eng_free(sgp); } } free(shp->sh_hash); memset(shp, 0, sizeof (fmd_serd_hash_t)); } void fmd_serd_hash_apply(fmd_serd_hash_t *shp, fmd_serd_eng_f *func, void *arg) { fmd_serd_eng_t *sgp; uint_t i; for (i = 0; i < shp->sh_hashlen; i++) { for (sgp = shp->sh_hash[i]; sgp != NULL; sgp = sgp->sg_next) func(sgp, arg); } } fmd_serd_eng_t * fmd_serd_eng_insert(fmd_serd_hash_t *shp, const char *name, uint_t n, hrtime_t t) { uint_t h = fmd_strhash(name) % shp->sh_hashlen; fmd_serd_eng_t *sgp = fmd_serd_eng_alloc(name, n, t); serd_log_msg(" SERD Engine: inserting %s N %d T %llu", name, (int)n, (long long unsigned)t); sgp->sg_next = shp->sh_hash[h]; shp->sh_hash[h] = sgp; shp->sh_count++; return (sgp); } fmd_serd_eng_t * fmd_serd_eng_lookup(fmd_serd_hash_t *shp, const char *name) { uint_t h = fmd_strhash(name) % shp->sh_hashlen; fmd_serd_eng_t *sgp; for (sgp = shp->sh_hash[h]; sgp != NULL; sgp = sgp->sg_next) { if (strcmp(name, sgp->sg_name) == 0) return (sgp); } return (NULL); } void fmd_serd_eng_delete(fmd_serd_hash_t *shp, const char *name) { uint_t h = fmd_strhash(name) % shp->sh_hashlen; fmd_serd_eng_t *sgp, **pp = &shp->sh_hash[h]; serd_log_msg(" SERD Engine: deleting %s", name); for (sgp = *pp; sgp != NULL; sgp = sgp->sg_next) { if (strcmp(sgp->sg_name, name) != 0) pp = &sgp->sg_next; else break; } if (sgp != NULL) { *pp = sgp->sg_next; fmd_serd_eng_free(sgp); assert(shp->sh_count != 0); shp->sh_count--; } } static void fmd_serd_eng_discard(fmd_serd_eng_t *sgp, fmd_serd_elem_t *sep) { list_remove(&sgp->sg_list, sep); sgp->sg_count--; serd_log_msg(" SERD Engine: discarding %s, %d remaining", sgp->sg_name, (int)sgp->sg_count); free(sep); } int fmd_serd_eng_record(fmd_serd_eng_t *sgp, hrtime_t hrt) { fmd_serd_elem_t *sep, *oep; /* * If the fired flag is already set, return false and discard the * event. This means that the caller will only see the engine "fire" * once until fmd_serd_eng_reset() is called. The fmd_serd_eng_fired() * function can also be used in combination with fmd_serd_eng_record(). */ if (sgp->sg_flags & FMD_SERD_FIRED) { serd_log_msg(" SERD Engine: record %s already fired!", sgp->sg_name); return (B_FALSE); } while (sgp->sg_count >= sgp->sg_n) fmd_serd_eng_discard(sgp, list_tail(&sgp->sg_list)); sep = malloc(sizeof (fmd_serd_elem_t)); if (sep == NULL) { perror("malloc"); exit(EXIT_FAILURE); } sep->se_hrt = hrt; list_insert_head(&sgp->sg_list, sep); sgp->sg_count++; serd_log_msg(" SERD Engine: recording %s of %d (%llu)", sgp->sg_name, (int)sgp->sg_count, (long long unsigned)hrt); /* * Pick up the oldest element pointer for comparison to 'sep'. We must * do this after adding 'sep' because 'oep' and 'sep' can be the same. */ oep = list_tail(&sgp->sg_list); if (sgp->sg_count >= sgp->sg_n && fmd_event_delta(oep->se_hrt, sep->se_hrt) <= sgp->sg_t) { sgp->sg_flags |= FMD_SERD_FIRED | FMD_SERD_DIRTY; serd_log_msg(" SERD Engine: fired %s", sgp->sg_name); return (B_TRUE); } sgp->sg_flags |= FMD_SERD_DIRTY; return (B_FALSE); } int fmd_serd_eng_fired(fmd_serd_eng_t *sgp) { return (sgp->sg_flags & FMD_SERD_FIRED); } int fmd_serd_eng_empty(fmd_serd_eng_t *sgp) { return (sgp->sg_count == 0); } void fmd_serd_eng_reset(fmd_serd_eng_t *sgp) { serd_log_msg(" SERD Engine: resetting %s", sgp->sg_name); while (sgp->sg_count != 0) fmd_serd_eng_discard(sgp, list_head(&sgp->sg_list)); sgp->sg_flags &= ~FMD_SERD_FIRED; sgp->sg_flags |= FMD_SERD_DIRTY; } void -fmd_serd_eng_gc(fmd_serd_eng_t *sgp) +fmd_serd_eng_gc(fmd_serd_eng_t *sgp, void *arg) { + (void) arg; fmd_serd_elem_t *sep, *nep; hrtime_t hrt; if (sgp->sg_count == 0 || (sgp->sg_flags & FMD_SERD_FIRED)) return; /* no garbage collection needed if empty or fired */ sep = list_head(&sgp->sg_list); if (sep == NULL) return; hrt = sep->se_hrt - sgp->sg_t; for (sep = list_head(&sgp->sg_list); sep != NULL; sep = nep) { if (sep->se_hrt >= hrt) break; /* sep and subsequent events are all within T */ nep = list_next(&sgp->sg_list, sep); fmd_serd_eng_discard(sgp, sep); sgp->sg_flags |= FMD_SERD_DIRTY; } } diff --git a/cmd/zed/agents/fmd_serd.h b/cmd/zed/agents/fmd_serd.h index 25b6888e61f2..80ff9a3b25b8 100644 --- a/cmd/zed/agents/fmd_serd.h +++ b/cmd/zed/agents/fmd_serd.h @@ -1,86 +1,86 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License, Version 1.0 only * (the "License"). You may not use this file except in compliance * with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2004 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * * Copyright (c) 2016, Intel Corporation. */ #ifndef _FMD_SERD_H #define _FMD_SERD_H #ifdef __cplusplus extern "C" { #endif #include #include typedef struct fmd_serd_elem { list_node_t se_list; /* linked list forward/back pointers */ hrtime_t se_hrt; /* upper bound on event hrtime */ } fmd_serd_elem_t; typedef struct fmd_serd_eng { char *sg_name; /* string name for this engine */ struct fmd_serd_eng *sg_next; /* next engine on hash chain */ list_t sg_list; /* list of fmd_serd_elem_t's */ uint_t sg_count; /* count of events in sg_list */ uint_t sg_flags; /* engine flags (see below) */ uint_t sg_n; /* engine N parameter (event count) */ hrtime_t sg_t; /* engine T parameter (nanoseconds) */ } fmd_serd_eng_t; #define FMD_SERD_FIRED 0x1 /* error rate has exceeded threshold */ #define FMD_SERD_DIRTY 0x2 /* engine needs to be checkpointed */ typedef void fmd_serd_eng_f(fmd_serd_eng_t *, void *); typedef struct fmd_serd_hash { fmd_serd_eng_t **sh_hash; /* hash bucket array for buffers */ uint_t sh_hashlen; /* length of hash bucket array */ uint_t sh_count; /* count of engines in hash */ } fmd_serd_hash_t; extern void fmd_serd_hash_create(fmd_serd_hash_t *); extern void fmd_serd_hash_destroy(fmd_serd_hash_t *); extern void fmd_serd_hash_apply(fmd_serd_hash_t *, fmd_serd_eng_f *, void *); extern fmd_serd_eng_t *fmd_serd_eng_insert(fmd_serd_hash_t *, const char *, uint32_t, hrtime_t); extern fmd_serd_eng_t *fmd_serd_eng_lookup(fmd_serd_hash_t *, const char *); extern void fmd_serd_eng_delete(fmd_serd_hash_t *, const char *); extern int fmd_serd_eng_record(fmd_serd_eng_t *, hrtime_t); extern int fmd_serd_eng_fired(fmd_serd_eng_t *); extern int fmd_serd_eng_empty(fmd_serd_eng_t *); extern void fmd_serd_eng_reset(fmd_serd_eng_t *); -extern void fmd_serd_eng_gc(fmd_serd_eng_t *); +extern void fmd_serd_eng_gc(fmd_serd_eng_t *, void *); #ifdef __cplusplus } #endif #endif /* _FMD_SERD_H */ diff --git a/cmd/zed/agents/zfs_diagnosis.c b/cmd/zed/agents/zfs_diagnosis.c index f6ba334a3ba3..e0ad00800add 100644 --- a/cmd/zed/agents/zfs_diagnosis.c +++ b/cmd/zed/agents/zfs_diagnosis.c @@ -1,1028 +1,1119 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2016, Intel Corporation. + * Copyright (c) 2023, Klara Inc. */ #include #include #include #include #include #include #include #include #include #include #include "zfs_agents.h" #include "fmd_api.h" /* * Default values for the serd engine when processing checksum or io errors. The * semantics are N in T . */ #define DEFAULT_CHECKSUM_N 10 /* events */ #define DEFAULT_CHECKSUM_T 600 /* seconds */ #define DEFAULT_IO_N 10 /* events */ #define DEFAULT_IO_T 600 /* seconds */ +#define DEFAULT_SLOW_IO_N 10 /* events */ +#define DEFAULT_SLOW_IO_T 30 /* seconds */ + +#define CASE_GC_TIMEOUT_SECS 43200 /* 12 hours */ /* - * Our serd engines are named 'zfs___{checksum,io}'. This - * #define reserves enough space for two 64-bit hex values plus the length of - * the longest string. + * Our serd engines are named in the following format: + * 'zfs___{checksum,io,slow_io}' + * This #define reserves enough space for two 64-bit hex values plus the + * length of the longest string. */ #define MAX_SERDLEN (16 * 2 + sizeof ("zfs___checksum")) /* * On-disk case structure. This must maintain backwards compatibility with * previous versions of the DE. By default, any members appended to the end * will be filled with zeros if they don't exist in a previous version. */ typedef struct zfs_case_data { uint64_t zc_version; uint64_t zc_ena; uint64_t zc_pool_guid; uint64_t zc_vdev_guid; int zc_pool_state; char zc_serd_checksum[MAX_SERDLEN]; char zc_serd_io[MAX_SERDLEN]; + char zc_serd_slow_io[MAX_SERDLEN]; int zc_has_remove_timer; } zfs_case_data_t; /* * Time-of-day */ typedef struct er_timeval { uint64_t ertv_sec; uint64_t ertv_nsec; } er_timeval_t; /* * In-core case structure. */ typedef struct zfs_case { boolean_t zc_present; uint32_t zc_version; zfs_case_data_t zc_data; fmd_case_t *zc_case; uu_list_node_t zc_node; id_t zc_remove_timer; char *zc_fru; er_timeval_t zc_when; } zfs_case_t; #define CASE_DATA "data" #define CASE_FRU "fru" #define CASE_DATA_VERSION_INITIAL 1 #define CASE_DATA_VERSION_SERD 2 typedef struct zfs_de_stats { fmd_stat_t old_drops; fmd_stat_t dev_drops; fmd_stat_t vdev_drops; fmd_stat_t import_drops; fmd_stat_t resource_drops; } zfs_de_stats_t; zfs_de_stats_t zfs_stats = { { "old_drops", FMD_TYPE_UINT64, "ereports dropped (from before load)" }, { "dev_drops", FMD_TYPE_UINT64, "ereports dropped (dev during open)"}, { "vdev_drops", FMD_TYPE_UINT64, "ereports dropped (weird vdev types)"}, { "import_drops", FMD_TYPE_UINT64, "ereports dropped (during import)" }, { "resource_drops", FMD_TYPE_UINT64, "resource related ereports" } }; -static hrtime_t zfs_remove_timeout; +/* wait 15 seconds after a removal */ +static hrtime_t zfs_remove_timeout = SEC2NSEC(15); uu_list_pool_t *zfs_case_pool; uu_list_t *zfs_cases; #define ZFS_MAKE_RSRC(type) \ FM_RSRC_CLASS "." ZFS_ERROR_CLASS "." type #define ZFS_MAKE_EREPORT(type) \ FM_EREPORT_CLASS "." ZFS_ERROR_CLASS "." type +static void zfs_purge_cases(fmd_hdl_t *hdl); + /* * Write out the persistent representation of an active case. */ static void zfs_case_serialize(zfs_case_t *zcp) { zcp->zc_data.zc_version = CASE_DATA_VERSION_SERD; } /* * Read back the persistent representation of an active case. */ static zfs_case_t * zfs_case_unserialize(fmd_hdl_t *hdl, fmd_case_t *cp) { zfs_case_t *zcp; zcp = fmd_hdl_zalloc(hdl, sizeof (zfs_case_t), FMD_SLEEP); zcp->zc_case = cp; fmd_buf_read(hdl, cp, CASE_DATA, &zcp->zc_data, sizeof (zcp->zc_data)); if (zcp->zc_data.zc_version > CASE_DATA_VERSION_SERD) { fmd_hdl_free(hdl, zcp, sizeof (zfs_case_t)); return (NULL); } /* * fmd_buf_read() will have already zeroed out the remainder of the * buffer, so we don't have to do anything special if the version * doesn't include the SERD engine name. */ if (zcp->zc_data.zc_has_remove_timer) zcp->zc_remove_timer = fmd_timer_install(hdl, zcp, NULL, zfs_remove_timeout); uu_list_node_init(zcp, &zcp->zc_node, zfs_case_pool); (void) uu_list_insert_before(zfs_cases, NULL, zcp); fmd_case_setspecific(hdl, cp, zcp); return (zcp); } +/* + * count other unique slow-io cases in a pool + */ +static uint_t +zfs_other_slow_cases(fmd_hdl_t *hdl, const zfs_case_data_t *zfs_case) +{ + zfs_case_t *zcp; + uint_t cases = 0; + static hrtime_t next_check = 0; + + /* + * Note that plumbing in some external GC would require adding locking, + * since most of this module code is not thread safe and assumes there + * is only one thread running against the module. So we perform GC here + * inline periodically so that future delay induced faults will be + * possible once the issue causing multiple vdev delays is resolved. + */ + if (gethrestime_sec() > next_check) { + /* Periodically purge old SERD entries and stale cases */ + fmd_serd_gc(hdl); + zfs_purge_cases(hdl); + next_check = gethrestime_sec() + CASE_GC_TIMEOUT_SECS; + } + + for (zcp = uu_list_first(zfs_cases); zcp != NULL; + zcp = uu_list_next(zfs_cases, zcp)) { + if (zcp->zc_data.zc_pool_guid == zfs_case->zc_pool_guid && + zcp->zc_data.zc_vdev_guid != zfs_case->zc_vdev_guid && + zcp->zc_data.zc_serd_slow_io[0] != '\0' && + fmd_serd_active(hdl, zcp->zc_data.zc_serd_slow_io)) { + cases++; + } + } + return (cases); +} + /* * Iterate over any active cases. If any cases are associated with a pool or * vdev which is no longer present on the system, close the associated case. */ static void zfs_mark_vdev(uint64_t pool_guid, nvlist_t *vd, er_timeval_t *loaded) { uint64_t vdev_guid = 0; uint_t c, children; nvlist_t **child; zfs_case_t *zcp; (void) nvlist_lookup_uint64(vd, ZPOOL_CONFIG_GUID, &vdev_guid); /* * Mark any cases associated with this (pool, vdev) pair. */ for (zcp = uu_list_first(zfs_cases); zcp != NULL; zcp = uu_list_next(zfs_cases, zcp)) { if (zcp->zc_data.zc_pool_guid == pool_guid && zcp->zc_data.zc_vdev_guid == vdev_guid) { zcp->zc_present = B_TRUE; zcp->zc_when = *loaded; } } /* * Iterate over all children. */ if (nvlist_lookup_nvlist_array(vd, ZPOOL_CONFIG_CHILDREN, &child, &children) == 0) { for (c = 0; c < children; c++) zfs_mark_vdev(pool_guid, child[c], loaded); } if (nvlist_lookup_nvlist_array(vd, ZPOOL_CONFIG_L2CACHE, &child, &children) == 0) { for (c = 0; c < children; c++) zfs_mark_vdev(pool_guid, child[c], loaded); } if (nvlist_lookup_nvlist_array(vd, ZPOOL_CONFIG_SPARES, &child, &children) == 0) { for (c = 0; c < children; c++) zfs_mark_vdev(pool_guid, child[c], loaded); } } static int zfs_mark_pool(zpool_handle_t *zhp, void *unused) { (void) unused; zfs_case_t *zcp; uint64_t pool_guid; uint64_t *tod; er_timeval_t loaded = { 0 }; nvlist_t *config, *vd; uint_t nelem = 0; int ret; pool_guid = zpool_get_prop_int(zhp, ZPOOL_PROP_GUID, NULL); /* * Mark any cases associated with just this pool. */ for (zcp = uu_list_first(zfs_cases); zcp != NULL; zcp = uu_list_next(zfs_cases, zcp)) { if (zcp->zc_data.zc_pool_guid == pool_guid && zcp->zc_data.zc_vdev_guid == 0) zcp->zc_present = B_TRUE; } if ((config = zpool_get_config(zhp, NULL)) == NULL) { zpool_close(zhp); return (-1); } (void) nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_LOADED_TIME, &tod, &nelem); if (nelem == 2) { loaded.ertv_sec = tod[0]; loaded.ertv_nsec = tod[1]; for (zcp = uu_list_first(zfs_cases); zcp != NULL; zcp = uu_list_next(zfs_cases, zcp)) { if (zcp->zc_data.zc_pool_guid == pool_guid && zcp->zc_data.zc_vdev_guid == 0) { zcp->zc_when = loaded; } } } ret = nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &vd); if (ret) { zpool_close(zhp); return (-1); } zfs_mark_vdev(pool_guid, vd, &loaded); zpool_close(zhp); return (0); } struct load_time_arg { uint64_t lt_guid; er_timeval_t *lt_time; boolean_t lt_found; }; static int zpool_find_load_time(zpool_handle_t *zhp, void *arg) { struct load_time_arg *lta = arg; uint64_t pool_guid; uint64_t *tod; nvlist_t *config; uint_t nelem; if (lta->lt_found) { zpool_close(zhp); return (0); } pool_guid = zpool_get_prop_int(zhp, ZPOOL_PROP_GUID, NULL); if (pool_guid != lta->lt_guid) { zpool_close(zhp); return (0); } if ((config = zpool_get_config(zhp, NULL)) == NULL) { zpool_close(zhp); return (-1); } if (nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_LOADED_TIME, &tod, &nelem) == 0 && nelem == 2) { lta->lt_found = B_TRUE; lta->lt_time->ertv_sec = tod[0]; lta->lt_time->ertv_nsec = tod[1]; } zpool_close(zhp); return (0); } static void zfs_purge_cases(fmd_hdl_t *hdl) { zfs_case_t *zcp; uu_list_walk_t *walk; libzfs_handle_t *zhdl = fmd_hdl_getspecific(hdl); /* * There is no way to open a pool by GUID, or lookup a vdev by GUID. No * matter what we do, we're going to have to stomach an O(vdevs * cases) * algorithm. In reality, both quantities are likely so small that * neither will matter. Given that iterating over pools is more * expensive than iterating over the in-memory case list, we opt for a * 'present' flag in each case that starts off cleared. We then iterate * over all pools, marking those that are still present, and removing * those that aren't found. * * Note that we could also construct an FMRI and rely on * fmd_nvl_fmri_present(), but this would end up doing the same search. */ /* * Mark the cases as not present. */ for (zcp = uu_list_first(zfs_cases); zcp != NULL; zcp = uu_list_next(zfs_cases, zcp)) zcp->zc_present = B_FALSE; /* * Iterate over all pools and mark the pools and vdevs found. If this * fails (most probably because we're out of memory), then don't close * any of the cases and we cannot be sure they are accurate. */ if (zpool_iter(zhdl, zfs_mark_pool, NULL) != 0) return; /* * Remove those cases which were not found. */ walk = uu_list_walk_start(zfs_cases, UU_WALK_ROBUST); while ((zcp = uu_list_walk_next(walk)) != NULL) { if (!zcp->zc_present) fmd_case_close(hdl, zcp->zc_case); } uu_list_walk_end(walk); } /* * Construct the name of a serd engine given the pool/vdev GUID and type (io or * checksum). */ static void zfs_serd_name(char *buf, uint64_t pool_guid, uint64_t vdev_guid, const char *type) { (void) snprintf(buf, MAX_SERDLEN, "zfs_%llx_%llx_%s", (long long unsigned int)pool_guid, (long long unsigned int)vdev_guid, type); } +static void +zfs_case_retire(fmd_hdl_t *hdl, zfs_case_t *zcp) +{ + fmd_hdl_debug(hdl, "retiring case"); + + fmd_case_close(hdl, zcp->zc_case); +} + /* * Solve a given ZFS case. This first checks to make sure the diagnosis is * still valid, as well as cleaning up any pending timer associated with the * case. */ static void zfs_case_solve(fmd_hdl_t *hdl, zfs_case_t *zcp, const char *faultname) { nvlist_t *detector, *fault; boolean_t serialize; nvlist_t *fru = NULL; fmd_hdl_debug(hdl, "solving fault '%s'", faultname); /* * Construct the detector from the case data. The detector is in the * ZFS scheme, and is either the pool or the vdev, depending on whether * this is a vdev or pool fault. */ detector = fmd_nvl_alloc(hdl, FMD_SLEEP); (void) nvlist_add_uint8(detector, FM_VERSION, ZFS_SCHEME_VERSION0); (void) nvlist_add_string(detector, FM_FMRI_SCHEME, FM_FMRI_SCHEME_ZFS); (void) nvlist_add_uint64(detector, FM_FMRI_ZFS_POOL, zcp->zc_data.zc_pool_guid); if (zcp->zc_data.zc_vdev_guid != 0) { (void) nvlist_add_uint64(detector, FM_FMRI_ZFS_VDEV, zcp->zc_data.zc_vdev_guid); } fault = fmd_nvl_create_fault(hdl, faultname, 100, detector, fru, detector); fmd_case_add_suspect(hdl, zcp->zc_case, fault); nvlist_free(fru); fmd_case_solve(hdl, zcp->zc_case); serialize = B_FALSE; if (zcp->zc_data.zc_has_remove_timer) { fmd_timer_remove(hdl, zcp->zc_remove_timer); zcp->zc_data.zc_has_remove_timer = 0; serialize = B_TRUE; } if (serialize) zfs_case_serialize(zcp); nvlist_free(detector); } static boolean_t timeval_earlier(er_timeval_t *a, er_timeval_t *b) { return (a->ertv_sec < b->ertv_sec || (a->ertv_sec == b->ertv_sec && a->ertv_nsec < b->ertv_nsec)); } static void zfs_ereport_when(fmd_hdl_t *hdl, nvlist_t *nvl, er_timeval_t *when) { (void) hdl; int64_t *tod; uint_t nelem; if (nvlist_lookup_int64_array(nvl, FM_EREPORT_TIME, &tod, &nelem) == 0 && nelem == 2) { when->ertv_sec = tod[0]; when->ertv_nsec = tod[1]; } else { when->ertv_sec = when->ertv_nsec = UINT64_MAX; } } /* * Main fmd entry point. */ static void zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class) { zfs_case_t *zcp, *dcp; int32_t pool_state; uint64_t ena, pool_guid, vdev_guid; uint64_t checksum_n, checksum_t; uint64_t io_n, io_t; er_timeval_t pool_load; er_timeval_t er_when; nvlist_t *detector; boolean_t pool_found = B_FALSE; boolean_t isresource; const char *type; /* * We subscribe to notifications for vdev or pool removal. In these * cases, there may be cases that no longer apply. Purge any cases * that no longer apply. */ if (fmd_nvl_class_match(hdl, nvl, "sysevent.fs.zfs.*")) { fmd_hdl_debug(hdl, "purging orphaned cases from %s", strrchr(class, '.') + 1); zfs_purge_cases(hdl); zfs_stats.resource_drops.fmds_value.ui64++; return; } isresource = fmd_nvl_class_match(hdl, nvl, "resource.fs.zfs.*"); if (isresource) { /* * For resources, we don't have a normal payload. */ if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, &vdev_guid) != 0) pool_state = SPA_LOAD_OPEN; else pool_state = SPA_LOAD_NONE; detector = NULL; } else { (void) nvlist_lookup_nvlist(nvl, FM_EREPORT_DETECTOR, &detector); (void) nvlist_lookup_int32(nvl, FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, &pool_state); } /* * We also ignore all ereports generated during an import of a pool, * since the only possible fault (.pool) would result in import failure, * and hence no persistent fault. Some day we may want to do something * with these ereports, so we continue generating them internally. */ if (pool_state == SPA_LOAD_IMPORT) { zfs_stats.import_drops.fmds_value.ui64++; fmd_hdl_debug(hdl, "ignoring '%s' during import", class); return; } /* * Device I/O errors are ignored during pool open. */ if (pool_state == SPA_LOAD_OPEN && (fmd_nvl_class_match(hdl, nvl, ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM)) || fmd_nvl_class_match(hdl, nvl, ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO)) || fmd_nvl_class_match(hdl, nvl, ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE)))) { fmd_hdl_debug(hdl, "ignoring '%s' during pool open", class); zfs_stats.dev_drops.fmds_value.ui64++; return; } /* * We ignore ereports for anything except disks and files. */ if (nvlist_lookup_string(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE, &type) == 0) { if (strcmp(type, VDEV_TYPE_DISK) != 0 && strcmp(type, VDEV_TYPE_FILE) != 0) { zfs_stats.vdev_drops.fmds_value.ui64++; return; } } /* * Determine if this ereport corresponds to an open case. * Each vdev or pool can have a single case. */ (void) nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, &pool_guid); if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, &vdev_guid) != 0) vdev_guid = 0; if (nvlist_lookup_uint64(nvl, FM_EREPORT_ENA, &ena) != 0) ena = 0; zfs_ereport_when(hdl, nvl, &er_when); for (zcp = uu_list_first(zfs_cases); zcp != NULL; zcp = uu_list_next(zfs_cases, zcp)) { if (zcp->zc_data.zc_pool_guid == pool_guid) { pool_found = B_TRUE; pool_load = zcp->zc_when; } if (zcp->zc_data.zc_vdev_guid == vdev_guid) break; } /* * Avoid falsely accusing a pool of being faulty. Do so by * not replaying ereports that were generated prior to the * current import. If the failure that generated them was * transient because the device was actually removed but we * didn't receive the normal asynchronous notification, we * don't want to mark it as faulted and potentially panic. If * there is still a problem we'd expect not to be able to * import the pool, or that new ereports will be generated * once the pool is used. */ if (pool_found && timeval_earlier(&er_when, &pool_load)) { fmd_hdl_debug(hdl, "ignoring pool %llx, " "ereport time %lld.%lld, pool load time = %lld.%lld", pool_guid, er_when.ertv_sec, er_when.ertv_nsec, pool_load.ertv_sec, pool_load.ertv_nsec); zfs_stats.old_drops.fmds_value.ui64++; return; } if (!pool_found) { /* * Haven't yet seen this pool, but same situation * may apply. */ libzfs_handle_t *zhdl = fmd_hdl_getspecific(hdl); struct load_time_arg la; la.lt_guid = pool_guid; la.lt_time = &pool_load; la.lt_found = B_FALSE; if (zhdl != NULL && zpool_iter(zhdl, zpool_find_load_time, &la) == 0 && la.lt_found == B_TRUE) { pool_found = B_TRUE; if (timeval_earlier(&er_when, &pool_load)) { fmd_hdl_debug(hdl, "ignoring pool %llx, " "ereport time %lld.%lld, " "pool load time = %lld.%lld", pool_guid, er_when.ertv_sec, er_when.ertv_nsec, pool_load.ertv_sec, pool_load.ertv_nsec); zfs_stats.old_drops.fmds_value.ui64++; return; } } } if (zcp == NULL) { fmd_case_t *cs; zfs_case_data_t data = { 0 }; /* * If this is one of our 'fake' resource ereports, and there is * no case open, simply discard it. */ if (isresource) { zfs_stats.resource_drops.fmds_value.ui64++; fmd_hdl_debug(hdl, "discarding '%s for vdev %llu", class, vdev_guid); return; } /* * Skip tracking some ereports */ if (strcmp(class, ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DATA)) == 0 || strcmp(class, - ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CONFIG_CACHE_WRITE)) == 0 || - strcmp(class, - ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DELAY)) == 0) { + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CONFIG_CACHE_WRITE)) == 0) { zfs_stats.resource_drops.fmds_value.ui64++; return; } /* * Open a new case. */ cs = fmd_case_open(hdl, NULL); fmd_hdl_debug(hdl, "opening case for vdev %llu due to '%s'", vdev_guid, class); /* * Initialize the case buffer. To commonize code, we actually * create the buffer with existing data, and then call * zfs_case_unserialize() to instantiate the in-core structure. */ fmd_buf_create(hdl, cs, CASE_DATA, sizeof (zfs_case_data_t)); data.zc_version = CASE_DATA_VERSION_SERD; data.zc_ena = ena; data.zc_pool_guid = pool_guid; data.zc_vdev_guid = vdev_guid; data.zc_pool_state = (int)pool_state; fmd_buf_write(hdl, cs, CASE_DATA, &data, sizeof (data)); zcp = zfs_case_unserialize(hdl, cs); assert(zcp != NULL); if (pool_found) zcp->zc_when = pool_load; } if (isresource) { fmd_hdl_debug(hdl, "resource event '%s'", class); if (fmd_nvl_class_match(hdl, nvl, ZFS_MAKE_RSRC(FM_RESOURCE_AUTOREPLACE))) { /* * The 'resource.fs.zfs.autoreplace' event indicates * that the pool was loaded with the 'autoreplace' * property set. In this case, any pending device * failures should be ignored, as the asynchronous * autoreplace handling will take care of them. */ fmd_case_close(hdl, zcp->zc_case); } else if (fmd_nvl_class_match(hdl, nvl, ZFS_MAKE_RSRC(FM_RESOURCE_REMOVED))) { /* * The 'resource.fs.zfs.removed' event indicates that * device removal was detected, and the device was * closed asynchronously. If this is the case, we * assume that any recent I/O errors were due to the * device removal, not any fault of the device itself. * We reset the SERD engine, and cancel any pending * timers. */ if (zcp->zc_data.zc_has_remove_timer) { fmd_timer_remove(hdl, zcp->zc_remove_timer); zcp->zc_data.zc_has_remove_timer = 0; zfs_case_serialize(zcp); } if (zcp->zc_data.zc_serd_io[0] != '\0') fmd_serd_reset(hdl, zcp->zc_data.zc_serd_io); if (zcp->zc_data.zc_serd_checksum[0] != '\0') fmd_serd_reset(hdl, zcp->zc_data.zc_serd_checksum); + if (zcp->zc_data.zc_serd_slow_io[0] != '\0') + fmd_serd_reset(hdl, + zcp->zc_data.zc_serd_slow_io); } else if (fmd_nvl_class_match(hdl, nvl, ZFS_MAKE_RSRC(FM_RESOURCE_STATECHANGE))) { uint64_t state = 0; if (zcp != NULL && nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE, &state) == 0 && state == VDEV_STATE_HEALTHY) { fmd_hdl_debug(hdl, "closing case after a " "device statechange to healthy"); fmd_case_close(hdl, zcp->zc_case); } } zfs_stats.resource_drops.fmds_value.ui64++; return; } /* * Associate the ereport with this case. */ fmd_case_add_ereport(hdl, zcp->zc_case, ep); /* * Don't do anything else if this case is already solved. */ if (fmd_case_solved(hdl, zcp->zc_case)) return; - fmd_hdl_debug(hdl, "error event '%s'", class); + if (vdev_guid) + fmd_hdl_debug(hdl, "error event '%s', vdev %llu", class, + vdev_guid); + else + fmd_hdl_debug(hdl, "error event '%s'", class); /* * Determine if we should solve the case and generate a fault. We solve * a case if: * * a. A pool failed to open (ereport.fs.zfs.pool) * b. A device failed to open (ereport.fs.zfs.pool) while a pool * was up and running. * * We may see a series of ereports associated with a pool open, all * chained together by the same ENA. If the pool open succeeds, then * we'll see no further ereports. To detect when a pool open has * succeeded, we associate a timer with the event. When it expires, we * close the case. */ if (fmd_nvl_class_match(hdl, nvl, ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_POOL))) { /* * Pool level fault. Before solving the case, go through and * close any open device cases that may be pending. */ for (dcp = uu_list_first(zfs_cases); dcp != NULL; dcp = uu_list_next(zfs_cases, dcp)) { if (dcp->zc_data.zc_pool_guid == zcp->zc_data.zc_pool_guid && dcp->zc_data.zc_vdev_guid != 0) fmd_case_close(hdl, dcp->zc_case); } zfs_case_solve(hdl, zcp, "fault.fs.zfs.pool"); } else if (fmd_nvl_class_match(hdl, nvl, ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_LOG_REPLAY))) { /* * Pool level fault for reading the intent logs. */ zfs_case_solve(hdl, zcp, "fault.fs.zfs.log_replay"); } else if (fmd_nvl_class_match(hdl, nvl, "ereport.fs.zfs.vdev.*")) { /* * Device fault. */ zfs_case_solve(hdl, zcp, "fault.fs.zfs.device"); } else if (fmd_nvl_class_match(hdl, nvl, ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO)) || fmd_nvl_class_match(hdl, nvl, ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM)) || fmd_nvl_class_match(hdl, nvl, ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO_FAILURE)) || fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DELAY)) || + fmd_nvl_class_match(hdl, nvl, ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE))) { const char *failmode = NULL; boolean_t checkremove = B_FALSE; uint32_t pri = 0; int32_t flags = 0; /* * If this is a checksum or I/O error, then toss it into the * appropriate SERD engine and check to see if it has fired. * Ideally, we want to do something more sophisticated, * (persistent errors for a single data block, etc). For now, * a single SERD engine is sufficient. */ if (fmd_nvl_class_match(hdl, nvl, ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO))) { if (zcp->zc_data.zc_serd_io[0] == '\0') { if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_IO_N, &io_n) != 0) { io_n = DEFAULT_IO_N; } if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_IO_T, &io_t) != 0) { io_t = DEFAULT_IO_T; } zfs_serd_name(zcp->zc_data.zc_serd_io, pool_guid, vdev_guid, "io"); fmd_serd_create(hdl, zcp->zc_data.zc_serd_io, io_n, SEC2NSEC(io_t)); zfs_case_serialize(zcp); } if (fmd_serd_record(hdl, zcp->zc_data.zc_serd_io, ep)) checkremove = B_TRUE; + } else if (fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DELAY))) { + uint64_t slow_io_n, slow_io_t; + + /* + * Create a slow io SERD engine when the VDEV has the + * 'vdev_slow_io_n' and 'vdev_slow_io_n' properties. + */ + if (zcp->zc_data.zc_serd_slow_io[0] == '\0' && + nvlist_lookup_uint64(nvl, + FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_N, + &slow_io_n) == 0 && + nvlist_lookup_uint64(nvl, + FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_T, + &slow_io_t) == 0) { + zfs_serd_name(zcp->zc_data.zc_serd_slow_io, + pool_guid, vdev_guid, "slow_io"); + fmd_serd_create(hdl, + zcp->zc_data.zc_serd_slow_io, + slow_io_n, + SEC2NSEC(slow_io_t)); + zfs_case_serialize(zcp); + } + /* Pass event to SERD engine and see if this triggers */ + if (zcp->zc_data.zc_serd_slow_io[0] != '\0' && + fmd_serd_record(hdl, zcp->zc_data.zc_serd_slow_io, + ep)) { + /* + * Ignore a slow io diagnosis when other + * VDEVs in the pool show signs of being slow. + */ + if (zfs_other_slow_cases(hdl, &zcp->zc_data)) { + zfs_case_retire(hdl, zcp); + fmd_hdl_debug(hdl, "pool %llu has " + "multiple slow io cases -- skip " + "degrading vdev %llu", + (u_longlong_t) + zcp->zc_data.zc_pool_guid, + (u_longlong_t) + zcp->zc_data.zc_vdev_guid); + } else { + zfs_case_solve(hdl, zcp, + "fault.fs.zfs.vdev.slow_io"); + } + } } else if (fmd_nvl_class_match(hdl, nvl, ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM))) { /* * We ignore ereports for checksum errors generated by * scrub/resilver I/O to avoid potentially further * degrading the pool while it's being repaired. */ if (((nvlist_lookup_uint32(nvl, FM_EREPORT_PAYLOAD_ZFS_ZIO_PRIORITY, &pri) == 0) && (pri == ZIO_PRIORITY_SCRUB || pri == ZIO_PRIORITY_REBUILD)) || ((nvlist_lookup_int32(nvl, FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS, &flags) == 0) && (flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)))) { fmd_hdl_debug(hdl, "ignoring '%s' for " "scrub/resilver I/O", class); return; } if (zcp->zc_data.zc_serd_checksum[0] == '\0') { if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_N, &checksum_n) != 0) { checksum_n = DEFAULT_CHECKSUM_N; } if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_T, &checksum_t) != 0) { checksum_t = DEFAULT_CHECKSUM_T; } zfs_serd_name(zcp->zc_data.zc_serd_checksum, pool_guid, vdev_guid, "checksum"); fmd_serd_create(hdl, zcp->zc_data.zc_serd_checksum, checksum_n, SEC2NSEC(checksum_t)); zfs_case_serialize(zcp); } if (fmd_serd_record(hdl, zcp->zc_data.zc_serd_checksum, ep)) { zfs_case_solve(hdl, zcp, "fault.fs.zfs.vdev.checksum"); } } else if (fmd_nvl_class_match(hdl, nvl, ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO_FAILURE)) && (nvlist_lookup_string(nvl, FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE, &failmode) == 0) && failmode != NULL) { if (strncmp(failmode, FM_EREPORT_FAILMODE_CONTINUE, strlen(FM_EREPORT_FAILMODE_CONTINUE)) == 0) { zfs_case_solve(hdl, zcp, "fault.fs.zfs.io_failure_continue"); } else if (strncmp(failmode, FM_EREPORT_FAILMODE_WAIT, strlen(FM_EREPORT_FAILMODE_WAIT)) == 0) { zfs_case_solve(hdl, zcp, "fault.fs.zfs.io_failure_wait"); } } else if (fmd_nvl_class_match(hdl, nvl, ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE))) { #ifndef __linux__ /* This causes an unexpected fault diagnosis on linux */ checkremove = B_TRUE; #endif } /* * Because I/O errors may be due to device removal, we postpone * any diagnosis until we're sure that we aren't about to * receive a 'resource.fs.zfs.removed' event. */ if (checkremove) { if (zcp->zc_data.zc_has_remove_timer) fmd_timer_remove(hdl, zcp->zc_remove_timer); zcp->zc_remove_timer = fmd_timer_install(hdl, zcp, NULL, zfs_remove_timeout); if (!zcp->zc_data.zc_has_remove_timer) { zcp->zc_data.zc_has_remove_timer = 1; zfs_case_serialize(zcp); } } } } /* * The timeout is fired when we diagnosed an I/O error, and it was not due to * device removal (which would cause the timeout to be cancelled). */ static void zfs_fm_timeout(fmd_hdl_t *hdl, id_t id, void *data) { zfs_case_t *zcp = data; if (id == zcp->zc_remove_timer) zfs_case_solve(hdl, zcp, "fault.fs.zfs.vdev.io"); } /* * The specified case has been closed and any case-specific * data structures should be deallocated. */ static void zfs_fm_close(fmd_hdl_t *hdl, fmd_case_t *cs) { zfs_case_t *zcp = fmd_case_getspecific(hdl, cs); if (zcp->zc_data.zc_serd_checksum[0] != '\0') fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_checksum); if (zcp->zc_data.zc_serd_io[0] != '\0') fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_io); + if (zcp->zc_data.zc_serd_slow_io[0] != '\0') + fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_slow_io); if (zcp->zc_data.zc_has_remove_timer) fmd_timer_remove(hdl, zcp->zc_remove_timer); uu_list_remove(zfs_cases, zcp); uu_list_node_fini(zcp, &zcp->zc_node, zfs_case_pool); fmd_hdl_free(hdl, zcp, sizeof (zfs_case_t)); } -/* - * We use the fmd gc entry point to look for old cases that no longer apply. - * This allows us to keep our set of case data small in a long running system. - */ -static void -zfs_fm_gc(fmd_hdl_t *hdl) -{ - zfs_purge_cases(hdl); -} - static const fmd_hdl_ops_t fmd_ops = { zfs_fm_recv, /* fmdo_recv */ zfs_fm_timeout, /* fmdo_timeout */ zfs_fm_close, /* fmdo_close */ NULL, /* fmdo_stats */ - zfs_fm_gc, /* fmdo_gc */ + NULL, /* fmdo_gc */ }; static const fmd_prop_t fmd_props[] = { - { "checksum_N", FMD_TYPE_UINT32, "10" }, - { "checksum_T", FMD_TYPE_TIME, "10min" }, - { "io_N", FMD_TYPE_UINT32, "10" }, - { "io_T", FMD_TYPE_TIME, "10min" }, - { "remove_timeout", FMD_TYPE_TIME, "15sec" }, { NULL, 0, NULL } }; static const fmd_hdl_info_t fmd_info = { "ZFS Diagnosis Engine", "1.0", &fmd_ops, fmd_props }; void _zfs_diagnosis_init(fmd_hdl_t *hdl) { libzfs_handle_t *zhdl; if ((zhdl = libzfs_init()) == NULL) return; if ((zfs_case_pool = uu_list_pool_create("zfs_case_pool", sizeof (zfs_case_t), offsetof(zfs_case_t, zc_node), NULL, UU_LIST_POOL_DEBUG)) == NULL) { libzfs_fini(zhdl); return; } if ((zfs_cases = uu_list_create(zfs_case_pool, NULL, UU_LIST_DEBUG)) == NULL) { uu_list_pool_destroy(zfs_case_pool); libzfs_fini(zhdl); return; } if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0) { uu_list_destroy(zfs_cases); uu_list_pool_destroy(zfs_case_pool); libzfs_fini(zhdl); return; } fmd_hdl_setspecific(hdl, zhdl); (void) fmd_stat_create(hdl, FMD_STAT_NOALLOC, sizeof (zfs_stats) / sizeof (fmd_stat_t), (fmd_stat_t *)&zfs_stats); - - zfs_remove_timeout = fmd_prop_get_int64(hdl, "remove_timeout"); } void _zfs_diagnosis_fini(fmd_hdl_t *hdl) { zfs_case_t *zcp; uu_list_walk_t *walk; libzfs_handle_t *zhdl; /* * Remove all active cases. */ walk = uu_list_walk_start(zfs_cases, UU_WALK_ROBUST); while ((zcp = uu_list_walk_next(walk)) != NULL) { fmd_hdl_debug(hdl, "removing case ena %llu", (long long unsigned)zcp->zc_data.zc_ena); uu_list_remove(zfs_cases, zcp); uu_list_node_fini(zcp, &zcp->zc_node, zfs_case_pool); fmd_hdl_free(hdl, zcp, sizeof (zfs_case_t)); } uu_list_walk_end(walk); uu_list_destroy(zfs_cases); uu_list_pool_destroy(zfs_case_pool); zhdl = fmd_hdl_getspecific(hdl); libzfs_fini(zhdl); } diff --git a/cmd/zed/agents/zfs_retire.c b/cmd/zed/agents/zfs_retire.c index a0e377a4a0c8..1ef5c631a438 100644 --- a/cmd/zed/agents/zfs_retire.c +++ b/cmd/zed/agents/zfs_retire.c @@ -1,668 +1,671 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. * * Copyright (c) 2016, Intel Corporation. * Copyright (c) 2018, loli10K */ /* * The ZFS retire agent is responsible for managing hot spares across all pools. * When we see a device fault or a device removal, we try to open the associated * pool and look for any hot spares. We iterate over any available hot spares * and attempt a 'zpool replace' for each one. * * For vdevs diagnosed as faulty, the agent is also responsible for proactively * marking the vdev FAULTY (for I/O errors) or DEGRADED (for checksum errors). */ #include #include #include #include #include #include #include #include "zfs_agents.h" #include "fmd_api.h" typedef struct zfs_retire_repaired { struct zfs_retire_repaired *zrr_next; uint64_t zrr_pool; uint64_t zrr_vdev; } zfs_retire_repaired_t; typedef struct zfs_retire_data { libzfs_handle_t *zrd_hdl; zfs_retire_repaired_t *zrd_repaired; } zfs_retire_data_t; static void zfs_retire_clear_data(fmd_hdl_t *hdl, zfs_retire_data_t *zdp) { zfs_retire_repaired_t *zrp; while ((zrp = zdp->zrd_repaired) != NULL) { zdp->zrd_repaired = zrp->zrr_next; fmd_hdl_free(hdl, zrp, sizeof (zfs_retire_repaired_t)); } } /* * Find a pool with a matching GUID. */ typedef struct find_cbdata { uint64_t cb_guid; zpool_handle_t *cb_zhp; nvlist_t *cb_vdev; uint64_t cb_vdev_guid; uint64_t cb_num_spares; } find_cbdata_t; static int find_pool(zpool_handle_t *zhp, void *data) { find_cbdata_t *cbp = data; if (cbp->cb_guid == zpool_get_prop_int(zhp, ZPOOL_PROP_GUID, NULL)) { cbp->cb_zhp = zhp; return (1); } zpool_close(zhp); return (0); } /* * Find a vdev within a tree with a matching GUID. */ static nvlist_t * find_vdev(libzfs_handle_t *zhdl, nvlist_t *nv, uint64_t search_guid) { uint64_t guid; nvlist_t **child; uint_t c, children; nvlist_t *ret; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) == 0 && guid == search_guid) { fmd_hdl_debug(fmd_module_hdl("zfs-retire"), "matched vdev %llu", guid); return (nv); } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) return (NULL); for (c = 0; c < children; c++) { if ((ret = find_vdev(zhdl, child[c], search_guid)) != NULL) return (ret); } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, &child, &children) != 0) return (NULL); for (c = 0; c < children; c++) { if ((ret = find_vdev(zhdl, child[c], search_guid)) != NULL) return (ret); } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, &child, &children) != 0) return (NULL); for (c = 0; c < children; c++) { if ((ret = find_vdev(zhdl, child[c], search_guid)) != NULL) return (ret); } return (NULL); } static int remove_spares(zpool_handle_t *zhp, void *data) { nvlist_t *config, *nvroot; nvlist_t **spares; uint_t nspares; char *devname; find_cbdata_t *cbp = data; uint64_t spareguid = 0; vdev_stat_t *vs; unsigned int c; config = zpool_get_config(zhp, NULL); if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) != 0) { zpool_close(zhp); return (0); } if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) != 0) { zpool_close(zhp); return (0); } for (int i = 0; i < nspares; i++) { if (nvlist_lookup_uint64(spares[i], ZPOOL_CONFIG_GUID, &spareguid) == 0 && spareguid == cbp->cb_vdev_guid) { devname = zpool_vdev_name(NULL, zhp, spares[i], B_FALSE); nvlist_lookup_uint64_array(spares[i], ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &c); if (vs->vs_state != VDEV_STATE_REMOVED && zpool_vdev_remove_wanted(zhp, devname) == 0) cbp->cb_num_spares++; break; } } zpool_close(zhp); return (0); } /* * Given a vdev guid, find and remove all spares associated with it. */ static int find_and_remove_spares(libzfs_handle_t *zhdl, uint64_t vdev_guid) { find_cbdata_t cb; cb.cb_num_spares = 0; cb.cb_vdev_guid = vdev_guid; zpool_iter(zhdl, remove_spares, &cb); return (cb.cb_num_spares); } /* * Given a (pool, vdev) GUID pair, find the matching pool and vdev. */ static zpool_handle_t * find_by_guid(libzfs_handle_t *zhdl, uint64_t pool_guid, uint64_t vdev_guid, nvlist_t **vdevp) { find_cbdata_t cb; zpool_handle_t *zhp; nvlist_t *config, *nvroot; /* * Find the corresponding pool and make sure the vdev still exists. */ cb.cb_guid = pool_guid; if (zpool_iter(zhdl, find_pool, &cb) != 1) return (NULL); zhp = cb.cb_zhp; config = zpool_get_config(zhp, NULL); if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) != 0) { zpool_close(zhp); return (NULL); } if (vdev_guid != 0) { if ((*vdevp = find_vdev(zhdl, nvroot, vdev_guid)) == NULL) { zpool_close(zhp); return (NULL); } } return (zhp); } /* * Given a vdev, attempt to replace it with every known spare until one * succeeds or we run out of devices to try. * Return whether we were successful or not in replacing the device. */ static boolean_t replace_with_spare(fmd_hdl_t *hdl, zpool_handle_t *zhp, nvlist_t *vdev) { nvlist_t *config, *nvroot, *replacement; nvlist_t **spares; uint_t s, nspares; char *dev_name; zprop_source_t source; int ashift; config = zpool_get_config(zhp, NULL); if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) != 0) return (B_FALSE); /* * Find out if there are any hot spares available in the pool. */ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) != 0) return (B_FALSE); /* * lookup "ashift" pool property, we may need it for the replacement */ ashift = zpool_get_prop_int(zhp, ZPOOL_PROP_ASHIFT, &source); replacement = fmd_nvl_alloc(hdl, FMD_SLEEP); (void) nvlist_add_string(replacement, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT); dev_name = zpool_vdev_name(NULL, zhp, vdev, B_FALSE); /* * Try to replace each spare, ending when we successfully * replace it. */ for (s = 0; s < nspares; s++) { boolean_t rebuild = B_FALSE; const char *spare_name, *type; if (nvlist_lookup_string(spares[s], ZPOOL_CONFIG_PATH, &spare_name) != 0) continue; /* prefer sequential resilvering for distributed spares */ if ((nvlist_lookup_string(spares[s], ZPOOL_CONFIG_TYPE, &type) == 0) && strcmp(type, VDEV_TYPE_DRAID_SPARE) == 0) rebuild = B_TRUE; /* if set, add the "ashift" pool property to the spare nvlist */ if (source != ZPROP_SRC_DEFAULT) (void) nvlist_add_uint64(spares[s], ZPOOL_CONFIG_ASHIFT, ashift); (void) nvlist_add_nvlist_array(replacement, ZPOOL_CONFIG_CHILDREN, (const nvlist_t **)&spares[s], 1); fmd_hdl_debug(hdl, "zpool_vdev_replace '%s' with spare '%s'", dev_name, zfs_basename(spare_name)); if (zpool_vdev_attach(zhp, dev_name, spare_name, replacement, B_TRUE, rebuild) == 0) { free(dev_name); nvlist_free(replacement); return (B_TRUE); } } free(dev_name); nvlist_free(replacement); return (B_FALSE); } /* * Repair this vdev if we had diagnosed a 'fault.fs.zfs.device' and * ASRU is now usable. ZFS has found the device to be present and * functioning. */ static void zfs_vdev_repair(fmd_hdl_t *hdl, nvlist_t *nvl) { zfs_retire_data_t *zdp = fmd_hdl_getspecific(hdl); zfs_retire_repaired_t *zrp; uint64_t pool_guid, vdev_guid; if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, &pool_guid) != 0 || nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, &vdev_guid) != 0) return; /* * Before checking the state of the ASRU, go through and see if we've * already made an attempt to repair this ASRU. This list is cleared * whenever we receive any kind of list event, and is designed to * prevent us from generating a feedback loop when we attempt repairs * against a faulted pool. The problem is that checking the unusable * state of the ASRU can involve opening the pool, which can post * statechange events but otherwise leave the pool in the faulted * state. This list allows us to detect when a statechange event is * due to our own request. */ for (zrp = zdp->zrd_repaired; zrp != NULL; zrp = zrp->zrr_next) { if (zrp->zrr_pool == pool_guid && zrp->zrr_vdev == vdev_guid) return; } zrp = fmd_hdl_alloc(hdl, sizeof (zfs_retire_repaired_t), FMD_SLEEP); zrp->zrr_next = zdp->zrd_repaired; zrp->zrr_pool = pool_guid; zrp->zrr_vdev = vdev_guid; zdp->zrd_repaired = zrp; fmd_hdl_debug(hdl, "marking repaired vdev %llu on pool %llu", vdev_guid, pool_guid); } static void zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class) { (void) ep; uint64_t pool_guid, vdev_guid; zpool_handle_t *zhp; nvlist_t *resource, *fault; nvlist_t **faults; uint_t f, nfaults; zfs_retire_data_t *zdp = fmd_hdl_getspecific(hdl); libzfs_handle_t *zhdl = zdp->zrd_hdl; boolean_t fault_device, degrade_device; boolean_t is_repair; boolean_t l2arc = B_FALSE; boolean_t spare = B_FALSE; const char *scheme; nvlist_t *vdev = NULL; const char *uuid; int repair_done = 0; boolean_t retire; boolean_t is_disk; vdev_aux_t aux; uint64_t state = 0; vdev_stat_t *vs; unsigned int c; fmd_hdl_debug(hdl, "zfs_retire_recv: '%s'", class); (void) nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE, &state); /* * If this is a resource notifying us of device removal then simply * check for an available spare and continue unless the device is a * l2arc vdev, in which case we just offline it. */ if (strcmp(class, "resource.fs.zfs.removed") == 0 || (strcmp(class, "resource.fs.zfs.statechange") == 0 && (state == VDEV_STATE_REMOVED || state == VDEV_STATE_FAULTED))) { const char *devtype; char *devname; if (nvlist_lookup_string(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE, &devtype) == 0) { if (strcmp(devtype, VDEV_TYPE_SPARE) == 0) spare = B_TRUE; else if (strcmp(devtype, VDEV_TYPE_L2CACHE) == 0) l2arc = B_TRUE; } if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, &vdev_guid) != 0) return; if (vdev_guid == 0) { fmd_hdl_debug(hdl, "Got a zero GUID"); return; } if (spare) { int nspares = find_and_remove_spares(zhdl, vdev_guid); fmd_hdl_debug(hdl, "%d spares removed", nspares); return; } if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, &pool_guid) != 0) return; if ((zhp = find_by_guid(zhdl, pool_guid, vdev_guid, &vdev)) == NULL) return; devname = zpool_vdev_name(NULL, zhp, vdev, B_FALSE); nvlist_lookup_uint64_array(vdev, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &c); /* * If state removed is requested for already removed vdev, * its a loopback event from spa_async_remove(). Just * ignore it. */ if (vs->vs_state == VDEV_STATE_REMOVED && state == VDEV_STATE_REMOVED) return; /* Remove the vdev since device is unplugged */ int remove_status = 0; if (l2arc || (strcmp(class, "resource.fs.zfs.removed") == 0)) { remove_status = zpool_vdev_remove_wanted(zhp, devname); fmd_hdl_debug(hdl, "zpool_vdev_remove_wanted '%s'" ", err:%d", devname, libzfs_errno(zhdl)); } /* Replace the vdev with a spare if its not a l2arc */ if (!l2arc && !remove_status && (!fmd_prop_get_int32(hdl, "spare_on_remove") || replace_with_spare(hdl, zhp, vdev) == B_FALSE)) { /* Could not handle with spare */ fmd_hdl_debug(hdl, "no spare for '%s'", devname); } free(devname); zpool_close(zhp); return; } if (strcmp(class, FM_LIST_RESOLVED_CLASS) == 0) return; /* * Note: on Linux statechange events are more than just * healthy ones so we need to confirm the actual state value. */ if (strcmp(class, "resource.fs.zfs.statechange") == 0 && state == VDEV_STATE_HEALTHY) { zfs_vdev_repair(hdl, nvl); return; } if (strcmp(class, "sysevent.fs.zfs.vdev_remove") == 0) { zfs_vdev_repair(hdl, nvl); return; } zfs_retire_clear_data(hdl, zdp); if (strcmp(class, FM_LIST_REPAIRED_CLASS) == 0) is_repair = B_TRUE; else is_repair = B_FALSE; /* * We subscribe to zfs faults as well as all repair events. */ if (nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST, &faults, &nfaults) != 0) return; for (f = 0; f < nfaults; f++) { fault = faults[f]; fault_device = B_FALSE; degrade_device = B_FALSE; is_disk = B_FALSE; if (nvlist_lookup_boolean_value(fault, FM_SUSPECT_RETIRE, &retire) == 0 && retire == 0) continue; /* * While we subscribe to fault.fs.zfs.*, we only take action * for faults targeting a specific vdev (open failure or SERD * failure). We also subscribe to fault.io.* events, so that * faulty disks will be faulted in the ZFS configuration. */ if (fmd_nvl_class_match(hdl, fault, "fault.fs.zfs.vdev.io")) { fault_device = B_TRUE; } else if (fmd_nvl_class_match(hdl, fault, "fault.fs.zfs.vdev.checksum")) { degrade_device = B_TRUE; + } else if (fmd_nvl_class_match(hdl, fault, + "fault.fs.zfs.vdev.slow_io")) { + degrade_device = B_TRUE; } else if (fmd_nvl_class_match(hdl, fault, "fault.fs.zfs.device")) { fault_device = B_FALSE; } else if (fmd_nvl_class_match(hdl, fault, "fault.io.*")) { is_disk = B_TRUE; fault_device = B_TRUE; } else { continue; } if (is_disk) { continue; } else { /* * This is a ZFS fault. Lookup the resource, and * attempt to find the matching vdev. */ if (nvlist_lookup_nvlist(fault, FM_FAULT_RESOURCE, &resource) != 0 || nvlist_lookup_string(resource, FM_FMRI_SCHEME, &scheme) != 0) continue; if (strcmp(scheme, FM_FMRI_SCHEME_ZFS) != 0) continue; if (nvlist_lookup_uint64(resource, FM_FMRI_ZFS_POOL, &pool_guid) != 0) continue; if (nvlist_lookup_uint64(resource, FM_FMRI_ZFS_VDEV, &vdev_guid) != 0) { if (is_repair) vdev_guid = 0; else continue; } if ((zhp = find_by_guid(zhdl, pool_guid, vdev_guid, &vdev)) == NULL) continue; aux = VDEV_AUX_ERR_EXCEEDED; } if (vdev_guid == 0) { /* * For pool-level repair events, clear the entire pool. */ fmd_hdl_debug(hdl, "zpool_clear of pool '%s'", zpool_get_name(zhp)); (void) zpool_clear(zhp, NULL, NULL); zpool_close(zhp); continue; } /* * If this is a repair event, then mark the vdev as repaired and * continue. */ if (is_repair) { repair_done = 1; fmd_hdl_debug(hdl, "zpool_clear of pool '%s' vdev %llu", zpool_get_name(zhp), vdev_guid); (void) zpool_vdev_clear(zhp, vdev_guid); zpool_close(zhp); continue; } /* * Actively fault the device if needed. */ if (fault_device) (void) zpool_vdev_fault(zhp, vdev_guid, aux); if (degrade_device) (void) zpool_vdev_degrade(zhp, vdev_guid, aux); if (fault_device || degrade_device) fmd_hdl_debug(hdl, "zpool_vdev_%s: vdev %llu on '%s'", fault_device ? "fault" : "degrade", vdev_guid, zpool_get_name(zhp)); /* * Attempt to substitute a hot spare. */ (void) replace_with_spare(hdl, zhp, vdev); zpool_close(zhp); } if (strcmp(class, FM_LIST_REPAIRED_CLASS) == 0 && repair_done && nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid) == 0) fmd_case_uuresolved(hdl, uuid); } static const fmd_hdl_ops_t fmd_ops = { zfs_retire_recv, /* fmdo_recv */ NULL, /* fmdo_timeout */ NULL, /* fmdo_close */ NULL, /* fmdo_stats */ NULL, /* fmdo_gc */ }; static const fmd_prop_t fmd_props[] = { { "spare_on_remove", FMD_TYPE_BOOL, "true" }, { NULL, 0, NULL } }; static const fmd_hdl_info_t fmd_info = { "ZFS Retire Agent", "1.0", &fmd_ops, fmd_props }; void _zfs_retire_init(fmd_hdl_t *hdl) { zfs_retire_data_t *zdp; libzfs_handle_t *zhdl; if ((zhdl = libzfs_init()) == NULL) return; if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0) { libzfs_fini(zhdl); return; } zdp = fmd_hdl_zalloc(hdl, sizeof (zfs_retire_data_t), FMD_SLEEP); zdp->zrd_hdl = zhdl; fmd_hdl_setspecific(hdl, zdp); } void _zfs_retire_fini(fmd_hdl_t *hdl) { zfs_retire_data_t *zdp = fmd_hdl_getspecific(hdl); if (zdp != NULL) { zfs_retire_clear_data(hdl, zdp); libzfs_fini(zdp->zrd_hdl); fmd_hdl_free(hdl, zdp, sizeof (zfs_retire_data_t)); } } diff --git a/cmd/zinject/zinject.c b/cmd/zinject/zinject.c index f1262ed772de..a11b6d0b7fac 100644 --- a/cmd/zinject/zinject.c +++ b/cmd/zinject/zinject.c @@ -1,1288 +1,1304 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2015 by Delphix. All rights reserved. * Copyright (c) 2017, Intel Corporation. */ /* * ZFS Fault Injector * * This userland component takes a set of options and uses libzpool to translate * from a user-visible object type and name to an internal representation. * There are two basic types of faults: device faults and data faults. * * * DEVICE FAULTS * * Errors can be injected into a particular vdev using the '-d' option. This * option takes a path or vdev GUID to uniquely identify the device within a * pool. There are four types of errors that can be injected, IO, ENXIO, * ECHILD, and EILSEQ. These can be controlled through the '-e' option and the * default is ENXIO. For EIO failures, any attempt to read data from the device * will return EIO, but a subsequent attempt to reopen the device will succeed. * For ENXIO failures, any attempt to read from the device will return EIO, but * any attempt to reopen the device will also return ENXIO. The EILSEQ failures * only apply to read operations (-T read) and will flip a bit after the device * has read the original data. * * For label faults, the -L option must be specified. This allows faults * to be injected into either the nvlist, uberblock, pad1, or pad2 region * of all the labels for the specified device. * * This form of the command looks like: * * zinject -d device [-e errno] [-L ] pool * * * DATA FAULTS * * We begin with a tuple of the form: * * * * type A string describing the type of data to target. Each type * implicitly describes how to interpret 'object'. Currently, * the following values are supported: * * data User data for a file * dnode Dnode for a file or directory * * The following MOS objects are special. Instead of injecting * errors on a particular object or blkid, we inject errors across * all objects of the given type. * * mos Any data in the MOS * mosdir object directory * config pool configuration * bpobj blkptr list * spacemap spacemap * metaslab metaslab * errlog persistent error log * * level Object level. Defaults to '0', not applicable to all types. If * a range is given, this corresponds to the indirect block * corresponding to the specific range. * * range A numerical range [start,end) within the object. Defaults to * the full size of the file. * * object A string describing the logical location of the object. For * files and directories (currently the only supported types), * this is the path of the object on disk. * * This is translated, via libzpool, into the following internal representation: * * * * These types should be self-explanatory. This tuple is then passed to the * kernel via a special ioctl() to initiate fault injection for the given * object. Note that 'type' is not strictly necessary for fault injection, but * is used when translating existing faults into a human-readable string. * * * The command itself takes one of the forms: * * zinject * zinject <-a | -u pool> * zinject -c * zinject [-q] <-t type> [-f freq] [-u] [-a] [-m] [-e errno] [-l level] * [-r range] * zinject [-f freq] [-a] [-m] [-u] -b objset:object:level:start:end pool * * With no arguments, the command prints all currently registered injection * handlers, with their numeric identifiers. * * The '-c' option will clear the given handler, or all handlers if 'all' is * specified. * * The '-e' option takes a string describing the errno to simulate. This must * be one of 'io', 'checksum', 'decompress', or 'decrypt'. In most cases this * will result in the same behavior, but RAID-Z will produce a different set of * ereports for this situation. * * The '-a', '-u', and '-m' flags toggle internal flush behavior. If '-a' is * specified, then the ARC cache is flushed appropriately. If '-u' is * specified, then the underlying SPA is unloaded. Either of these flags can be * specified independently of any other handlers. The '-m' flag automatically * does an unmount and remount of the underlying dataset to aid in flushing the * cache. * * The '-f' flag controls the frequency of errors injected, expressed as a * real number percentage between 0.0001 and 100. The default is 100. * * The this form is responsible for actually injecting the handler into the * framework. It takes the arguments described above, translates them to the * internal tuple using libzpool, and then issues an ioctl() to register the * handler. * * The final form can target a specific bookmark, regardless of whether a * human-readable interface has been designed. It allows developers to specify * a particular block by number. */ #include #include #include #include #include #include #include #include #include #include #undef verify /* both libzfs.h and zfs_context.h want to define this */ #include "zinject.h" libzfs_handle_t *g_zfs; int zfs_fd; static const char *const errtable[TYPE_INVAL] = { "data", "dnode", "mos", "mosdir", "metaslab", "config", "bpobj", "spacemap", "errlog", "uber", "nvlist", "pad1", "pad2" }; static err_type_t name_to_type(const char *arg) { int i; for (i = 0; i < TYPE_INVAL; i++) if (strcmp(errtable[i], arg) == 0) return (i); return (TYPE_INVAL); } static const char * type_to_name(uint64_t type) { switch (type) { case DMU_OT_OBJECT_DIRECTORY: return ("mosdir"); case DMU_OT_OBJECT_ARRAY: return ("metaslab"); case DMU_OT_PACKED_NVLIST: return ("config"); case DMU_OT_BPOBJ: return ("bpobj"); case DMU_OT_SPACE_MAP: return ("spacemap"); case DMU_OT_ERROR_LOG: return ("errlog"); default: return ("-"); } } /* * Print usage message. */ void usage(void) { (void) printf( "usage:\n" "\n" "\tzinject\n" "\n" "\t\tList all active injection records.\n" "\n" "\tzinject -c \n" "\n" "\t\tClear the particular record (if given a numeric ID), or\n" "\t\tall records if 'all' is specified.\n" "\n" "\tzinject -p pool\n" "\t\tInject a panic fault at the specified function. Only \n" "\t\tfunctions which call spa_vdev_config_exit(), or \n" "\t\tspa_vdev_exit() will trigger a panic.\n" "\n" "\tzinject -d device [-e errno] [-L ] [-F]\n" "\t\t[-T ] [-f frequency] pool\n\n" "\t\tInject a fault into a particular device or the device's\n" "\t\tlabel. Label injection can either be 'nvlist', 'uber',\n " "\t\t'pad1', or 'pad2'.\n" "\t\t'errno' can be 'nxio' (the default), 'io', 'dtl', or\n" "\t\t'corrupt' (bit flip).\n" "\t\t'frequency' is a value between 0.0001 and 100.0 that limits\n" "\t\tdevice error injection to a percentage of the IOs.\n" "\n" "\tzinject -d device -A -D pool\n" "\t\tPerform a specific action on a particular device.\n" "\n" "\tzinject -d device -D latency:lanes pool\n" "\n" "\t\tAdd an artificial delay to IO requests on a particular\n" "\t\tdevice, such that the requests take a minimum of 'latency'\n" "\t\tmilliseconds to complete. Each delay has an associated\n" "\t\tnumber of 'lanes' which defines the number of concurrent\n" "\t\tIO requests that can be processed.\n" "\n" "\t\tFor example, with a single lane delay of 10 ms (-D 10:1),\n" "\t\tthe device will only be able to service a single IO request\n" "\t\tat a time with each request taking 10 ms to complete. So,\n" "\t\tif only a single request is submitted every 10 ms, the\n" "\t\taverage latency will be 10 ms; but if more than one request\n" "\t\tis submitted every 10 ms, the average latency will be more\n" "\t\tthan 10 ms.\n" "\n" "\t\tSimilarly, if a delay of 10 ms is specified to have two\n" "\t\tlanes (-D 10:2), then the device will be able to service\n" "\t\ttwo requests at a time, each with a minimum latency of\n" "\t\t10 ms. So, if two requests are submitted every 10 ms, then\n" "\t\tthe average latency will be 10 ms; but if more than two\n" "\t\trequests are submitted every 10 ms, the average latency\n" "\t\twill be more than 10 ms.\n" "\n" "\t\tAlso note, these delays are additive. So two invocations\n" "\t\tof '-D 10:1', is roughly equivalent to a single invocation\n" "\t\tof '-D 10:2'. This also means, one can specify multiple\n" "\t\tlanes with differing target latencies. For example, an\n" "\t\tinvocation of '-D 10:1' followed by '-D 25:2' will\n" "\t\tcreate 3 lanes on the device; one lane with a latency\n" "\t\tof 10 ms and two lanes with a 25 ms latency.\n" "\n" "\tzinject -I [-s | -g ] pool\n" "\t\tCause the pool to stop writing blocks yet not\n" "\t\treport errors for a duration. Simulates buggy hardware\n" "\t\tthat fails to honor cache flush requests.\n" "\t\tDefault duration is 30 seconds. The machine is panicked\n" "\t\tat the end of the duration.\n" "\n" "\tzinject -b objset:object:level:blkid pool\n" "\n" "\t\tInject an error into pool 'pool' with the numeric bookmark\n" "\t\tspecified by the remaining tuple. Each number is in\n" "\t\thexadecimal, and only one block can be specified.\n" "\n" "\tzinject [-q] <-t type> [-C dvas] [-e errno] [-l level]\n" "\t\t[-r range] [-a] [-m] [-u] [-f freq] \n" "\n" "\t\tInject an error into the object specified by the '-t' option\n" "\t\tand the object descriptor. The 'object' parameter is\n" "\t\tinterpreted depending on the '-t' option.\n" "\n" "\t\t-q\tQuiet mode. Only print out the handler number added.\n" "\t\t-e\tInject a specific error. Must be one of 'io',\n" "\t\t\t'checksum', 'decompress', or 'decrypt'. Default is 'io'.\n" "\t\t-C\tInject the given error only into specific DVAs. The\n" "\t\t\tDVAs should be specified as a list of 0-indexed DVAs\n" "\t\t\tseparated by commas (ex. '0,2').\n" "\t\t-l\tInject error at a particular block level. Default is " "0.\n" "\t\t-m\tAutomatically remount underlying filesystem.\n" "\t\t-r\tInject error over a particular logical range of an\n" "\t\t\tobject. Will be translated to the appropriate blkid\n" "\t\t\trange according to the object's properties.\n" "\t\t-a\tFlush the ARC cache. Can be specified without any\n" "\t\t\tassociated object.\n" "\t\t-u\tUnload the associated pool. Can be specified with only\n" "\t\t\ta pool object.\n" "\t\t-f\tOnly inject errors a fraction of the time. Expressed as\n" "\t\t\ta percentage between 0.0001 and 100.\n" "\n" "\t-t data\t\tInject an error into the plain file contents of a\n" "\t\t\tfile. The object must be specified as a complete path\n" "\t\t\tto a file on a ZFS filesystem.\n" "\n" "\t-t dnode\tInject an error into the metadnode in the block\n" "\t\t\tcorresponding to the dnode for a file or directory. The\n" "\t\t\t'-r' option is incompatible with this mode. The object\n" "\t\t\tis specified as a complete path to a file or directory\n" "\t\t\ton a ZFS filesystem.\n" "\n" "\t-t \tInject errors into the MOS for objects of the given\n" "\t\t\ttype. Valid types are: mos, mosdir, config, bpobj,\n" "\t\t\tspacemap, metaslab, errlog. The only valid is\n" "\t\t\tthe poolname.\n"); } static int iter_handlers(int (*func)(int, const char *, zinject_record_t *, void *), void *data) { zfs_cmd_t zc = {"\0"}; int ret; while (zfs_ioctl(g_zfs, ZFS_IOC_INJECT_LIST_NEXT, &zc) == 0) if ((ret = func((int)zc.zc_guid, zc.zc_name, &zc.zc_inject_record, data)) != 0) return (ret); if (errno != ENOENT) { (void) fprintf(stderr, "Unable to list handlers: %s\n", strerror(errno)); return (-1); } return (0); } static int print_data_handler(int id, const char *pool, zinject_record_t *record, void *data) { int *count = data; if (record->zi_guid != 0 || record->zi_func[0] != '\0') return (0); if (*count == 0) { (void) printf("%3s %-15s %-6s %-6s %-8s %3s %-4s " "%-15s\n", "ID", "POOL", "OBJSET", "OBJECT", "TYPE", "LVL", "DVAs", "RANGE"); (void) printf("--- --------------- ------ " "------ -------- --- ---- ---------------\n"); } *count += 1; (void) printf("%3d %-15s %-6llu %-6llu %-8s %-3d 0x%02x ", id, pool, (u_longlong_t)record->zi_objset, (u_longlong_t)record->zi_object, type_to_name(record->zi_type), record->zi_level, record->zi_dvas); if (record->zi_start == 0 && record->zi_end == -1ULL) (void) printf("all\n"); else (void) printf("[%llu, %llu]\n", (u_longlong_t)record->zi_start, (u_longlong_t)record->zi_end); return (0); } static int print_device_handler(int id, const char *pool, zinject_record_t *record, void *data) { int *count = data; if (record->zi_guid == 0 || record->zi_func[0] != '\0') return (0); if (record->zi_cmd == ZINJECT_DELAY_IO) return (0); if (*count == 0) { (void) printf("%3s %-15s %s\n", "ID", "POOL", "GUID"); (void) printf("--- --------------- ----------------\n"); } *count += 1; (void) printf("%3d %-15s %llx\n", id, pool, (u_longlong_t)record->zi_guid); return (0); } static int print_delay_handler(int id, const char *pool, zinject_record_t *record, void *data) { int *count = data; if (record->zi_guid == 0 || record->zi_func[0] != '\0') return (0); if (record->zi_cmd != ZINJECT_DELAY_IO) return (0); if (*count == 0) { (void) printf("%3s %-15s %-15s %-15s %s\n", "ID", "POOL", "DELAY (ms)", "LANES", "GUID"); (void) printf("--- --------------- --------------- " "--------------- ----------------\n"); } *count += 1; (void) printf("%3d %-15s %-15llu %-15llu %llx\n", id, pool, (u_longlong_t)NSEC2MSEC(record->zi_timer), (u_longlong_t)record->zi_nlanes, (u_longlong_t)record->zi_guid); return (0); } static int print_panic_handler(int id, const char *pool, zinject_record_t *record, void *data) { int *count = data; if (record->zi_func[0] == '\0') return (0); if (*count == 0) { (void) printf("%3s %-15s %s\n", "ID", "POOL", "FUNCTION"); (void) printf("--- --------------- ----------------\n"); } *count += 1; (void) printf("%3d %-15s %s\n", id, pool, record->zi_func); return (0); } /* * Print all registered error handlers. Returns the number of handlers * registered. */ static int print_all_handlers(void) { int count = 0, total = 0; (void) iter_handlers(print_device_handler, &count); if (count > 0) { total += count; (void) printf("\n"); count = 0; } (void) iter_handlers(print_delay_handler, &count); if (count > 0) { total += count; (void) printf("\n"); count = 0; } (void) iter_handlers(print_data_handler, &count); if (count > 0) { total += count; (void) printf("\n"); count = 0; } (void) iter_handlers(print_panic_handler, &count); return (count + total); } static int cancel_one_handler(int id, const char *pool, zinject_record_t *record, void *data) { (void) pool, (void) record, (void) data; zfs_cmd_t zc = {"\0"}; zc.zc_guid = (uint64_t)id; if (zfs_ioctl(g_zfs, ZFS_IOC_CLEAR_FAULT, &zc) != 0) { (void) fprintf(stderr, "failed to remove handler %d: %s\n", id, strerror(errno)); return (1); } return (0); } /* * Remove all fault injection handlers. */ static int cancel_all_handlers(void) { int ret = iter_handlers(cancel_one_handler, NULL); if (ret == 0) (void) printf("removed all registered handlers\n"); return (ret); } /* * Remove a specific fault injection handler. */ static int cancel_handler(int id) { zfs_cmd_t zc = {"\0"}; zc.zc_guid = (uint64_t)id; if (zfs_ioctl(g_zfs, ZFS_IOC_CLEAR_FAULT, &zc) != 0) { (void) fprintf(stderr, "failed to remove handler %d: %s\n", id, strerror(errno)); return (1); } (void) printf("removed handler %d\n", id); return (0); } /* * Register a new fault injection handler. */ static int register_handler(const char *pool, int flags, zinject_record_t *record, int quiet) { zfs_cmd_t zc = {"\0"}; (void) strlcpy(zc.zc_name, pool, sizeof (zc.zc_name)); zc.zc_inject_record = *record; zc.zc_guid = flags; if (zfs_ioctl(g_zfs, ZFS_IOC_INJECT_FAULT, &zc) != 0) { (void) fprintf(stderr, "failed to add handler: %s\n", errno == EDOM ? "block level exceeds max level of object" : strerror(errno)); return (1); } if (flags & ZINJECT_NULL) return (0); if (quiet) { (void) printf("%llu\n", (u_longlong_t)zc.zc_guid); } else { (void) printf("Added handler %llu with the following " "properties:\n", (u_longlong_t)zc.zc_guid); (void) printf(" pool: %s\n", pool); if (record->zi_guid) { (void) printf(" vdev: %llx\n", (u_longlong_t)record->zi_guid); } else if (record->zi_func[0] != '\0') { (void) printf(" panic function: %s\n", record->zi_func); } else if (record->zi_duration > 0) { (void) printf(" time: %lld seconds\n", (u_longlong_t)record->zi_duration); } else if (record->zi_duration < 0) { (void) printf(" txgs: %lld \n", (u_longlong_t)-record->zi_duration); } else { (void) printf("objset: %llu\n", (u_longlong_t)record->zi_objset); (void) printf("object: %llu\n", (u_longlong_t)record->zi_object); (void) printf(" type: %llu\n", (u_longlong_t)record->zi_type); (void) printf(" level: %d\n", record->zi_level); if (record->zi_start == 0 && record->zi_end == -1ULL) (void) printf(" range: all\n"); else (void) printf(" range: [%llu, %llu)\n", (u_longlong_t)record->zi_start, (u_longlong_t)record->zi_end); (void) printf(" dvas: 0x%x\n", record->zi_dvas); } } return (0); } static int perform_action(const char *pool, zinject_record_t *record, int cmd) { zfs_cmd_t zc = {"\0"}; ASSERT(cmd == VDEV_STATE_DEGRADED || cmd == VDEV_STATE_FAULTED); (void) strlcpy(zc.zc_name, pool, sizeof (zc.zc_name)); zc.zc_guid = record->zi_guid; zc.zc_cookie = cmd; if (zfs_ioctl(g_zfs, ZFS_IOC_VDEV_SET_STATE, &zc) == 0) return (0); return (1); } static int parse_delay(char *str, uint64_t *delay, uint64_t *nlanes) { unsigned long scan_delay; unsigned long scan_nlanes; if (sscanf(str, "%lu:%lu", &scan_delay, &scan_nlanes) != 2) return (1); /* * We explicitly disallow a delay of zero here, because we key * off this value being non-zero in translate_device(), to * determine if the fault is a ZINJECT_DELAY_IO fault or not. */ if (scan_delay == 0) return (1); /* * The units for the CLI delay parameter is milliseconds, but * the data passed to the kernel is interpreted as nanoseconds. * Thus we scale the milliseconds to nanoseconds here, and this * nanosecond value is used to pass the delay to the kernel. */ *delay = MSEC2NSEC(scan_delay); *nlanes = scan_nlanes; return (0); } static int parse_frequency(const char *str, uint32_t *percent) { double val; char *post; val = strtod(str, &post); if (post == NULL || *post != '\0') return (EINVAL); /* valid range is [0.0001, 100.0] */ val /= 100.0f; if (val < 0.000001f || val > 1.0f) return (ERANGE); /* convert to an integer for use by kernel */ *percent = ((uint32_t)(val * ZI_PERCENTAGE_MAX)); return (0); } /* * This function converts a string specifier for DVAs into a bit mask. * The dva's provided by the user should be 0 indexed and separated by * a comma. For example: * "1" -> 0b0010 (0x2) * "0,1" -> 0b0011 (0x3) * "0,1,2" -> 0b0111 (0x7) */ static int parse_dvas(const char *str, uint32_t *dvas_out) { const char *c = str; uint32_t mask = 0; boolean_t need_delim = B_FALSE; /* max string length is 5 ("0,1,2") */ if (strlen(str) > 5 || strlen(str) == 0) return (EINVAL); while (*c != '\0') { switch (*c) { case '0': case '1': case '2': /* check for pipe between DVAs */ if (need_delim) return (EINVAL); /* check if this DVA has been set already */ if (mask & (1 << ((*c) - '0'))) return (EINVAL); mask |= (1 << ((*c) - '0')); need_delim = B_TRUE; break; case ',': need_delim = B_FALSE; break; default: /* check for invalid character */ return (EINVAL); } c++; } /* check for dangling delimiter */ if (!need_delim) return (EINVAL); *dvas_out = mask; return (0); } int main(int argc, char **argv) { int c; char *range = NULL; char *cancel = NULL; char *end; char *raw = NULL; char *device = NULL; int level = 0; int quiet = 0; int error = 0; int domount = 0; int io_type = ZIO_TYPES; int action = VDEV_STATE_UNKNOWN; err_type_t type = TYPE_INVAL; err_type_t label = TYPE_INVAL; zinject_record_t record = { 0 }; char pool[MAXNAMELEN] = ""; char dataset[MAXNAMELEN] = ""; zfs_handle_t *zhp = NULL; int nowrites = 0; int dur_txg = 0; int dur_secs = 0; int ret; int flags = 0; uint32_t dvas = 0; if ((g_zfs = libzfs_init()) == NULL) { (void) fprintf(stderr, "%s\n", libzfs_error_init(errno)); return (1); } libzfs_print_on_error(g_zfs, B_TRUE); if ((zfs_fd = open(ZFS_DEV, O_RDWR)) < 0) { (void) fprintf(stderr, "failed to open ZFS device\n"); libzfs_fini(g_zfs); return (1); } if (argc == 1) { /* * No arguments. Print the available handlers. If there are no * available handlers, direct the user to '-h' for help * information. */ if (print_all_handlers() == 0) { (void) printf("No handlers registered.\n"); (void) printf("Run 'zinject -h' for usage " "information.\n"); } libzfs_fini(g_zfs); return (0); } while ((c = getopt(argc, argv, ":aA:b:C:d:D:f:Fg:qhIc:t:T:l:mr:s:e:uL:p:")) != -1) { switch (c) { case 'a': flags |= ZINJECT_FLUSH_ARC; break; case 'A': if (strcasecmp(optarg, "degrade") == 0) { action = VDEV_STATE_DEGRADED; } else if (strcasecmp(optarg, "fault") == 0) { action = VDEV_STATE_FAULTED; } else { (void) fprintf(stderr, "invalid action '%s': " "must be 'degrade' or 'fault'\n", optarg); usage(); libzfs_fini(g_zfs); return (1); } break; case 'b': raw = optarg; break; case 'c': cancel = optarg; break; case 'C': ret = parse_dvas(optarg, &dvas); if (ret != 0) { (void) fprintf(stderr, "invalid DVA list '%s': " "DVAs should be 0 indexed and separated by " "commas.\n", optarg); usage(); libzfs_fini(g_zfs); return (1); } break; case 'd': device = optarg; break; case 'D': errno = 0; ret = parse_delay(optarg, &record.zi_timer, &record.zi_nlanes); if (ret != 0) { (void) fprintf(stderr, "invalid i/o delay " "value: '%s'\n", optarg); usage(); libzfs_fini(g_zfs); return (1); } break; case 'e': if (strcasecmp(optarg, "io") == 0) { error = EIO; } else if (strcasecmp(optarg, "checksum") == 0) { error = ECKSUM; } else if (strcasecmp(optarg, "decompress") == 0) { error = EINVAL; } else if (strcasecmp(optarg, "decrypt") == 0) { error = EACCES; } else if (strcasecmp(optarg, "nxio") == 0) { error = ENXIO; } else if (strcasecmp(optarg, "dtl") == 0) { error = ECHILD; } else if (strcasecmp(optarg, "corrupt") == 0) { error = EILSEQ; } else { (void) fprintf(stderr, "invalid error type " "'%s': must be 'io', 'checksum' or " "'nxio'\n", optarg); usage(); libzfs_fini(g_zfs); return (1); } break; case 'f': ret = parse_frequency(optarg, &record.zi_freq); if (ret != 0) { (void) fprintf(stderr, "%sfrequency value must " "be in the range [0.0001, 100.0]\n", ret == EINVAL ? "invalid value: " : ret == ERANGE ? "out of range: " : ""); libzfs_fini(g_zfs); return (1); } break; case 'F': record.zi_failfast = B_TRUE; break; case 'g': dur_txg = 1; record.zi_duration = (int)strtol(optarg, &end, 10); if (record.zi_duration <= 0 || *end != '\0') { (void) fprintf(stderr, "invalid duration '%s': " "must be a positive integer\n", optarg); usage(); libzfs_fini(g_zfs); return (1); } /* store duration of txgs as its negative */ record.zi_duration *= -1; break; case 'h': usage(); libzfs_fini(g_zfs); return (0); case 'I': /* default duration, if one hasn't yet been defined */ nowrites = 1; if (dur_secs == 0 && dur_txg == 0) record.zi_duration = 30; break; case 'l': level = (int)strtol(optarg, &end, 10); if (*end != '\0') { (void) fprintf(stderr, "invalid level '%s': " "must be an integer\n", optarg); usage(); libzfs_fini(g_zfs); return (1); } break; case 'm': domount = 1; break; case 'p': (void) strlcpy(record.zi_func, optarg, sizeof (record.zi_func)); record.zi_cmd = ZINJECT_PANIC; break; case 'q': quiet = 1; break; case 'r': range = optarg; flags |= ZINJECT_CALC_RANGE; break; case 's': dur_secs = 1; record.zi_duration = (int)strtol(optarg, &end, 10); if (record.zi_duration <= 0 || *end != '\0') { (void) fprintf(stderr, "invalid duration '%s': " "must be a positive integer\n", optarg); usage(); libzfs_fini(g_zfs); return (1); } break; case 'T': if (strcasecmp(optarg, "read") == 0) { io_type = ZIO_TYPE_READ; } else if (strcasecmp(optarg, "write") == 0) { io_type = ZIO_TYPE_WRITE; } else if (strcasecmp(optarg, "free") == 0) { io_type = ZIO_TYPE_FREE; } else if (strcasecmp(optarg, "claim") == 0) { io_type = ZIO_TYPE_CLAIM; } else if (strcasecmp(optarg, "all") == 0) { io_type = ZIO_TYPES; } else { (void) fprintf(stderr, "invalid I/O type " "'%s': must be 'read', 'write', 'free', " "'claim' or 'all'\n", optarg); usage(); libzfs_fini(g_zfs); return (1); } break; case 't': if ((type = name_to_type(optarg)) == TYPE_INVAL && !MOS_TYPE(type)) { (void) fprintf(stderr, "invalid type '%s'\n", optarg); usage(); libzfs_fini(g_zfs); return (1); } break; case 'u': flags |= ZINJECT_UNLOAD_SPA; break; case 'L': if ((label = name_to_type(optarg)) == TYPE_INVAL && !LABEL_TYPE(type)) { (void) fprintf(stderr, "invalid label type " "'%s'\n", optarg); usage(); libzfs_fini(g_zfs); return (1); } break; case ':': (void) fprintf(stderr, "option -%c requires an " "operand\n", optopt); usage(); libzfs_fini(g_zfs); return (1); case '?': (void) fprintf(stderr, "invalid option '%c'\n", optopt); usage(); libzfs_fini(g_zfs); return (2); } } argc -= optind; argv += optind; if (record.zi_duration != 0) record.zi_cmd = ZINJECT_IGNORED_WRITES; if (cancel != NULL) { /* * '-c' is invalid with any other options. */ if (raw != NULL || range != NULL || type != TYPE_INVAL || level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED || record.zi_freq > 0 || dvas != 0) { (void) fprintf(stderr, "cancel (-c) incompatible with " "any other options\n"); usage(); libzfs_fini(g_zfs); return (2); } if (argc != 0) { (void) fprintf(stderr, "extraneous argument to '-c'\n"); usage(); libzfs_fini(g_zfs); return (2); } if (strcmp(cancel, "all") == 0) { return (cancel_all_handlers()); } else { int id = (int)strtol(cancel, &end, 10); if (*end != '\0') { (void) fprintf(stderr, "invalid handle id '%s':" " must be an integer or 'all'\n", cancel); usage(); libzfs_fini(g_zfs); return (1); } return (cancel_handler(id)); } } if (device != NULL) { /* * Device (-d) injection uses a completely different mechanism * for doing injection, so handle it separately here. */ if (raw != NULL || range != NULL || type != TYPE_INVAL || level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED || dvas != 0) { (void) fprintf(stderr, "device (-d) incompatible with " "data error injection\n"); usage(); libzfs_fini(g_zfs); return (2); } if (argc != 1) { (void) fprintf(stderr, "device (-d) injection requires " "a single pool name\n"); usage(); libzfs_fini(g_zfs); return (2); } (void) strlcpy(pool, argv[0], sizeof (pool)); dataset[0] = '\0'; if (error == ECKSUM) { (void) fprintf(stderr, "device error type must be " "'io', 'nxio' or 'corrupt'\n"); libzfs_fini(g_zfs); return (1); } if (error == EILSEQ && (record.zi_freq == 0 || io_type != ZIO_TYPE_READ)) { (void) fprintf(stderr, "device corrupt errors require " "io type read and a frequency value\n"); libzfs_fini(g_zfs); return (1); } record.zi_iotype = io_type; if (translate_device(pool, device, label, &record) != 0) { libzfs_fini(g_zfs); return (1); } + + if (record.zi_nlanes) { + switch (io_type) { + case ZIO_TYPE_READ: + case ZIO_TYPE_WRITE: + case ZIO_TYPES: + break; + default: + (void) fprintf(stderr, "I/O type for a delay " + "must be 'read' or 'write'\n"); + usage(); + libzfs_fini(g_zfs); + return (1); + } + } + if (!error) error = ENXIO; if (action != VDEV_STATE_UNKNOWN) return (perform_action(pool, &record, action)); } else if (raw != NULL) { if (range != NULL || type != TYPE_INVAL || level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED || record.zi_freq > 0 || dvas != 0) { (void) fprintf(stderr, "raw (-b) format with " "any other options\n"); usage(); libzfs_fini(g_zfs); return (2); } if (argc != 1) { (void) fprintf(stderr, "raw (-b) format expects a " "single pool name\n"); usage(); libzfs_fini(g_zfs); return (2); } (void) strlcpy(pool, argv[0], sizeof (pool)); dataset[0] = '\0'; if (error == ENXIO) { (void) fprintf(stderr, "data error type must be " "'checksum' or 'io'\n"); libzfs_fini(g_zfs); return (1); } record.zi_cmd = ZINJECT_DATA_FAULT; if (translate_raw(raw, &record) != 0) { libzfs_fini(g_zfs); return (1); } if (!error) error = EIO; } else if (record.zi_cmd == ZINJECT_PANIC) { if (raw != NULL || range != NULL || type != TYPE_INVAL || level != 0 || device != NULL || record.zi_freq > 0 || dvas != 0) { (void) fprintf(stderr, "panic (-p) incompatible with " "other options\n"); usage(); libzfs_fini(g_zfs); return (2); } if (argc < 1 || argc > 2) { (void) fprintf(stderr, "panic (-p) injection requires " "a single pool name and an optional id\n"); usage(); libzfs_fini(g_zfs); return (2); } (void) strlcpy(pool, argv[0], sizeof (pool)); if (argv[1] != NULL) record.zi_type = atoi(argv[1]); dataset[0] = '\0'; } else if (record.zi_cmd == ZINJECT_IGNORED_WRITES) { if (raw != NULL || range != NULL || type != TYPE_INVAL || level != 0 || record.zi_freq > 0 || dvas != 0) { (void) fprintf(stderr, "hardware failure (-I) " "incompatible with other options\n"); usage(); libzfs_fini(g_zfs); return (2); } if (nowrites == 0) { (void) fprintf(stderr, "-s or -g meaningless " "without -I (ignore writes)\n"); usage(); libzfs_fini(g_zfs); return (2); } else if (dur_secs && dur_txg) { (void) fprintf(stderr, "choose a duration either " "in seconds (-s) or a number of txgs (-g) " "but not both\n"); usage(); libzfs_fini(g_zfs); return (2); } else if (argc != 1) { (void) fprintf(stderr, "ignore writes (-I) " "injection requires a single pool name\n"); usage(); libzfs_fini(g_zfs); return (2); } (void) strlcpy(pool, argv[0], sizeof (pool)); dataset[0] = '\0'; } else if (type == TYPE_INVAL) { if (flags == 0) { (void) fprintf(stderr, "at least one of '-b', '-d', " "'-t', '-a', '-p', '-I' or '-u' " "must be specified\n"); usage(); libzfs_fini(g_zfs); return (2); } if (argc == 1 && (flags & ZINJECT_UNLOAD_SPA)) { (void) strlcpy(pool, argv[0], sizeof (pool)); dataset[0] = '\0'; } else if (argc != 0) { (void) fprintf(stderr, "extraneous argument for " "'-f'\n"); usage(); libzfs_fini(g_zfs); return (2); } flags |= ZINJECT_NULL; } else { if (argc != 1) { (void) fprintf(stderr, "missing object\n"); usage(); libzfs_fini(g_zfs); return (2); } if (error == ENXIO || error == EILSEQ) { (void) fprintf(stderr, "data error type must be " "'checksum' or 'io'\n"); libzfs_fini(g_zfs); return (1); } if (dvas != 0) { if (error == EACCES || error == EINVAL) { (void) fprintf(stderr, "the '-C' option may " "not be used with logical data errors " "'decrypt' and 'decompress'\n"); libzfs_fini(g_zfs); return (1); } record.zi_dvas = dvas; } if (error == EACCES) { if (type != TYPE_DATA) { (void) fprintf(stderr, "decryption errors " "may only be injected for 'data' types\n"); libzfs_fini(g_zfs); return (1); } record.zi_cmd = ZINJECT_DECRYPT_FAULT; /* * Internally, ZFS actually uses ECKSUM for decryption * errors since EACCES is used to indicate the key was * not found. */ error = ECKSUM; } else { record.zi_cmd = ZINJECT_DATA_FAULT; } if (translate_record(type, argv[0], range, level, &record, pool, dataset) != 0) { libzfs_fini(g_zfs); return (1); } if (!error) error = EIO; } /* * If this is pool-wide metadata, unmount everything. The ioctl() will * unload the pool, so that we trigger spa-wide reopen of metadata next * time we access the pool. */ if (dataset[0] != '\0' && domount) { if ((zhp = zfs_open(g_zfs, dataset, ZFS_TYPE_DATASET)) == NULL) { libzfs_fini(g_zfs); return (1); } if (zfs_unmount(zhp, NULL, 0) != 0) { libzfs_fini(g_zfs); return (1); } } record.zi_error = error; ret = register_handler(pool, flags, &record, quiet); if (dataset[0] != '\0' && domount) ret = (zfs_mount(zhp, NULL, 0) != 0); libzfs_fini(g_zfs); return (ret); } diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 8753d7263914..0783271f4734 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -1,11586 +1,11592 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2012 by Frederik Wessels. All rights reserved. * Copyright (c) 2012 by Cyril Plisko. All rights reserved. * Copyright (c) 2013 by Prasad Joshi (sTec). All rights reserved. * Copyright 2016 Igor Kozhukhov . * Copyright (c) 2017 Datto Inc. * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019, loli10K * Copyright (c) 2021, Colm Buckley * Copyright (c) 2021, Klara Inc. * Copyright [2021] Hewlett Packard Enterprise Development LP */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zpool_util.h" #include "zfs_comutil.h" #include "zfeature_common.h" #include "statcommon.h" libzfs_handle_t *g_zfs; static int zpool_do_create(int, char **); static int zpool_do_destroy(int, char **); static int zpool_do_add(int, char **); static int zpool_do_remove(int, char **); static int zpool_do_labelclear(int, char **); static int zpool_do_checkpoint(int, char **); static int zpool_do_list(int, char **); static int zpool_do_iostat(int, char **); static int zpool_do_status(int, char **); static int zpool_do_online(int, char **); static int zpool_do_offline(int, char **); static int zpool_do_clear(int, char **); static int zpool_do_reopen(int, char **); static int zpool_do_reguid(int, char **); static int zpool_do_attach(int, char **); static int zpool_do_detach(int, char **); static int zpool_do_replace(int, char **); static int zpool_do_split(int, char **); static int zpool_do_initialize(int, char **); static int zpool_do_scrub(int, char **); static int zpool_do_resilver(int, char **); static int zpool_do_trim(int, char **); static int zpool_do_import(int, char **); static int zpool_do_export(int, char **); static int zpool_do_upgrade(int, char **); static int zpool_do_history(int, char **); static int zpool_do_events(int, char **); static int zpool_do_get(int, char **); static int zpool_do_set(int, char **); static int zpool_do_sync(int, char **); static int zpool_do_version(int, char **); static int zpool_do_wait(int, char **); static int zpool_do_help(int argc, char **argv); static zpool_compat_status_t zpool_do_load_compat( const char *, boolean_t *); /* * These libumem hooks provide a reasonable set of defaults for the allocator's * debugging facilities. */ #ifdef DEBUG const char * _umem_debug_init(void) { return ("default,verbose"); /* $UMEM_DEBUG setting */ } const char * _umem_logging_init(void) { return ("fail,contents"); /* $UMEM_LOGGING setting */ } #endif typedef enum { HELP_ADD, HELP_ATTACH, HELP_CLEAR, HELP_CREATE, HELP_CHECKPOINT, HELP_DESTROY, HELP_DETACH, HELP_EXPORT, HELP_HISTORY, HELP_IMPORT, HELP_IOSTAT, HELP_LABELCLEAR, HELP_LIST, HELP_OFFLINE, HELP_ONLINE, HELP_REPLACE, HELP_REMOVE, HELP_INITIALIZE, HELP_SCRUB, HELP_RESILVER, HELP_TRIM, HELP_STATUS, HELP_UPGRADE, HELP_EVENTS, HELP_GET, HELP_SET, HELP_SPLIT, HELP_SYNC, HELP_REGUID, HELP_REOPEN, HELP_VERSION, HELP_WAIT } zpool_help_t; /* * Flags for stats to display with "zpool iostats" */ enum iostat_type { IOS_DEFAULT = 0, IOS_LATENCY = 1, IOS_QUEUES = 2, IOS_L_HISTO = 3, IOS_RQ_HISTO = 4, IOS_COUNT, /* always last element */ }; /* iostat_type entries as bitmasks */ #define IOS_DEFAULT_M (1ULL << IOS_DEFAULT) #define IOS_LATENCY_M (1ULL << IOS_LATENCY) #define IOS_QUEUES_M (1ULL << IOS_QUEUES) #define IOS_L_HISTO_M (1ULL << IOS_L_HISTO) #define IOS_RQ_HISTO_M (1ULL << IOS_RQ_HISTO) /* Mask of all the histo bits */ #define IOS_ANYHISTO_M (IOS_L_HISTO_M | IOS_RQ_HISTO_M) /* * Lookup table for iostat flags to nvlist names. Basically a list * of all the nvlists a flag requires. Also specifies the order in * which data gets printed in zpool iostat. */ static const char *vsx_type_to_nvlist[IOS_COUNT][15] = { [IOS_L_HISTO] = { ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_SYNC_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_SYNC_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO, ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO, ZPOOL_CONFIG_VDEV_REBUILD_LAT_HISTO, NULL}, [IOS_LATENCY] = { ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO, ZPOOL_CONFIG_VDEV_REBUILD_LAT_HISTO, NULL}, [IOS_QUEUES] = { ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_SYNC_W_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_TRIM_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_REBUILD_ACTIVE_QUEUE, NULL}, [IOS_RQ_HISTO] = { ZPOOL_CONFIG_VDEV_SYNC_IND_R_HISTO, ZPOOL_CONFIG_VDEV_SYNC_AGG_R_HISTO, ZPOOL_CONFIG_VDEV_SYNC_IND_W_HISTO, ZPOOL_CONFIG_VDEV_SYNC_AGG_W_HISTO, ZPOOL_CONFIG_VDEV_ASYNC_IND_R_HISTO, ZPOOL_CONFIG_VDEV_ASYNC_AGG_R_HISTO, ZPOOL_CONFIG_VDEV_ASYNC_IND_W_HISTO, ZPOOL_CONFIG_VDEV_ASYNC_AGG_W_HISTO, ZPOOL_CONFIG_VDEV_IND_SCRUB_HISTO, ZPOOL_CONFIG_VDEV_AGG_SCRUB_HISTO, ZPOOL_CONFIG_VDEV_IND_TRIM_HISTO, ZPOOL_CONFIG_VDEV_AGG_TRIM_HISTO, ZPOOL_CONFIG_VDEV_IND_REBUILD_HISTO, ZPOOL_CONFIG_VDEV_AGG_REBUILD_HISTO, NULL}, }; /* * Given a cb->cb_flags with a histogram bit set, return the iostat_type. * Right now, only one histo bit is ever set at one time, so we can * just do a highbit64(a) */ #define IOS_HISTO_IDX(a) (highbit64(a & IOS_ANYHISTO_M) - 1) typedef struct zpool_command { const char *name; int (*func)(int, char **); zpool_help_t usage; } zpool_command_t; /* * Master command table. Each ZFS command has a name, associated function, and * usage message. The usage messages need to be internationalized, so we have * to have a function to return the usage message based on a command index. * * These commands are organized according to how they are displayed in the usage * message. An empty command (one with a NULL name) indicates an empty line in * the generic usage message. */ static zpool_command_t command_table[] = { { "version", zpool_do_version, HELP_VERSION }, { NULL }, { "create", zpool_do_create, HELP_CREATE }, { "destroy", zpool_do_destroy, HELP_DESTROY }, { NULL }, { "add", zpool_do_add, HELP_ADD }, { "remove", zpool_do_remove, HELP_REMOVE }, { NULL }, { "labelclear", zpool_do_labelclear, HELP_LABELCLEAR }, { NULL }, { "checkpoint", zpool_do_checkpoint, HELP_CHECKPOINT }, { NULL }, { "list", zpool_do_list, HELP_LIST }, { "iostat", zpool_do_iostat, HELP_IOSTAT }, { "status", zpool_do_status, HELP_STATUS }, { NULL }, { "online", zpool_do_online, HELP_ONLINE }, { "offline", zpool_do_offline, HELP_OFFLINE }, { "clear", zpool_do_clear, HELP_CLEAR }, { "reopen", zpool_do_reopen, HELP_REOPEN }, { NULL }, { "attach", zpool_do_attach, HELP_ATTACH }, { "detach", zpool_do_detach, HELP_DETACH }, { "replace", zpool_do_replace, HELP_REPLACE }, { "split", zpool_do_split, HELP_SPLIT }, { NULL }, { "initialize", zpool_do_initialize, HELP_INITIALIZE }, { "resilver", zpool_do_resilver, HELP_RESILVER }, { "scrub", zpool_do_scrub, HELP_SCRUB }, { "trim", zpool_do_trim, HELP_TRIM }, { NULL }, { "import", zpool_do_import, HELP_IMPORT }, { "export", zpool_do_export, HELP_EXPORT }, { "upgrade", zpool_do_upgrade, HELP_UPGRADE }, { "reguid", zpool_do_reguid, HELP_REGUID }, { NULL }, { "history", zpool_do_history, HELP_HISTORY }, { "events", zpool_do_events, HELP_EVENTS }, { NULL }, { "get", zpool_do_get, HELP_GET }, { "set", zpool_do_set, HELP_SET }, { "sync", zpool_do_sync, HELP_SYNC }, { NULL }, { "wait", zpool_do_wait, HELP_WAIT }, }; #define NCOMMAND (ARRAY_SIZE(command_table)) #define VDEV_ALLOC_CLASS_LOGS "logs" static zpool_command_t *current_command; static zfs_type_t current_prop_type = (ZFS_TYPE_POOL | ZFS_TYPE_VDEV); static char history_str[HIS_MAX_RECORD_LEN]; static boolean_t log_history = B_TRUE; static uint_t timestamp_fmt = NODATE; static const char * get_usage(zpool_help_t idx) { switch (idx) { case HELP_ADD: return (gettext("\tadd [-fgLnP] [-o property=value] " " ...\n")); case HELP_ATTACH: return (gettext("\tattach [-fsw] [-o property=value] " " \n")); case HELP_CLEAR: return (gettext("\tclear [[--power]|[-nF]] [device]\n")); case HELP_CREATE: return (gettext("\tcreate [-fnd] [-o property=value] ... \n" "\t [-O file-system-property=value] ... \n" "\t [-m mountpoint] [-R root] ...\n")); case HELP_CHECKPOINT: return (gettext("\tcheckpoint [-d [-w]] ...\n")); case HELP_DESTROY: return (gettext("\tdestroy [-f] \n")); case HELP_DETACH: return (gettext("\tdetach \n")); case HELP_EXPORT: return (gettext("\texport [-af] ...\n")); case HELP_HISTORY: return (gettext("\thistory [-il] [] ...\n")); case HELP_IMPORT: return (gettext("\timport [-d dir] [-D]\n" "\timport [-o mntopts] [-o property=value] ... \n" "\t [-d dir | -c cachefile] [-D] [-l] [-f] [-m] [-N] " "[-R root] [-F [-n]] -a\n" "\timport [-o mntopts] [-o property=value] ... \n" "\t [-d dir | -c cachefile] [-D] [-l] [-f] [-m] [-N] " "[-R root] [-F [-n]]\n" "\t [--rewind-to-checkpoint] [newpool]\n")); case HELP_IOSTAT: return (gettext("\tiostat [[[-c [script1,script2,...]" "[-lq]]|[-rw]] [-T d | u] [-ghHLpPvy]\n" "\t [[pool ...]|[pool vdev ...]|[vdev ...]]" " [[-n] interval [count]]\n")); case HELP_LABELCLEAR: return (gettext("\tlabelclear [-f] \n")); case HELP_LIST: return (gettext("\tlist [-gHLpPv] [-o property[,...]] " "[-T d|u] [pool] ... \n" "\t [interval [count]]\n")); case HELP_OFFLINE: return (gettext("\toffline [--power]|[[-f][-t]] " " ...\n")); case HELP_ONLINE: return (gettext("\tonline [--power][-e] " "...\n")); case HELP_REPLACE: return (gettext("\treplace [-fsw] [-o property=value] " " [new-device]\n")); case HELP_REMOVE: return (gettext("\tremove [-npsw] ...\n")); case HELP_REOPEN: return (gettext("\treopen [-n] \n")); case HELP_INITIALIZE: return (gettext("\tinitialize [-c | -s | -u] [-w] " "[ ...]\n")); case HELP_SCRUB: return (gettext("\tscrub [-s | -p] [-w] [-e] ...\n")); case HELP_RESILVER: return (gettext("\tresilver ...\n")); case HELP_TRIM: return (gettext("\ttrim [-dw] [-r ] [-c | -s] " "[ ...]\n")); case HELP_STATUS: return (gettext("\tstatus [--power] [-c [script1,script2,...]] " "[-igLpPstvxD] [-T d|u] [pool] ... \n" "\t [interval [count]]\n")); case HELP_UPGRADE: return (gettext("\tupgrade\n" "\tupgrade -v\n" "\tupgrade [-V version] <-a | pool ...>\n")); case HELP_EVENTS: return (gettext("\tevents [-vHf [pool] | -c]\n")); case HELP_GET: return (gettext("\tget [-Hp] [-o \"all\" | field[,...]] " "<\"all\" | property[,...]> ...\n")); case HELP_SET: return (gettext("\tset \n" "\tset \n")); case HELP_SPLIT: return (gettext("\tsplit [-gLnPl] [-R altroot] [-o mntopts]\n" "\t [-o property=value] " "[ ...]\n")); case HELP_REGUID: return (gettext("\treguid \n")); case HELP_SYNC: return (gettext("\tsync [pool] ...\n")); case HELP_VERSION: return (gettext("\tversion\n")); case HELP_WAIT: return (gettext("\twait [-Hp] [-T d|u] [-t [,...]] " " [interval]\n")); default: __builtin_unreachable(); } } static void zpool_collect_leaves(zpool_handle_t *zhp, nvlist_t *nvroot, nvlist_t *res) { uint_t children = 0; nvlist_t **child; uint_t i; (void) nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &child, &children); if (children == 0) { char *path = zpool_vdev_name(g_zfs, zhp, nvroot, VDEV_NAME_PATH); if (strcmp(path, VDEV_TYPE_INDIRECT) != 0 && strcmp(path, VDEV_TYPE_HOLE) != 0) fnvlist_add_boolean(res, path); free(path); return; } for (i = 0; i < children; i++) { zpool_collect_leaves(zhp, child[i], res); } } /* * Callback routine that will print out a pool property value. */ static int print_pool_prop_cb(int prop, void *cb) { FILE *fp = cb; (void) fprintf(fp, "\t%-19s ", zpool_prop_to_name(prop)); if (zpool_prop_readonly(prop)) (void) fprintf(fp, " NO "); else (void) fprintf(fp, " YES "); if (zpool_prop_values(prop) == NULL) (void) fprintf(fp, "-\n"); else (void) fprintf(fp, "%s\n", zpool_prop_values(prop)); return (ZPROP_CONT); } /* * Callback routine that will print out a vdev property value. */ static int print_vdev_prop_cb(int prop, void *cb) { FILE *fp = cb; (void) fprintf(fp, "\t%-19s ", vdev_prop_to_name(prop)); if (vdev_prop_readonly(prop)) (void) fprintf(fp, " NO "); else (void) fprintf(fp, " YES "); if (vdev_prop_values(prop) == NULL) (void) fprintf(fp, "-\n"); else (void) fprintf(fp, "%s\n", vdev_prop_values(prop)); return (ZPROP_CONT); } /* * Given a leaf vdev name like 'L5' return its VDEV_CONFIG_PATH like * '/dev/disk/by-vdev/L5'. */ static const char * vdev_name_to_path(zpool_handle_t *zhp, char *vdev) { nvlist_t *vdev_nv = zpool_find_vdev(zhp, vdev, NULL, NULL, NULL); if (vdev_nv == NULL) { return (NULL); } return (fnvlist_lookup_string(vdev_nv, ZPOOL_CONFIG_PATH)); } static int zpool_power_on(zpool_handle_t *zhp, char *vdev) { return (zpool_power(zhp, vdev, B_TRUE)); } static int zpool_power_on_and_disk_wait(zpool_handle_t *zhp, char *vdev) { int rc; rc = zpool_power_on(zhp, vdev); if (rc != 0) return (rc); zpool_disk_wait(vdev_name_to_path(zhp, vdev)); return (0); } static int zpool_power_on_pool_and_wait_for_devices(zpool_handle_t *zhp) { nvlist_t *nv; const char *path = NULL; int rc; /* Power up all the devices first */ FOR_EACH_REAL_LEAF_VDEV(zhp, nv) { path = fnvlist_lookup_string(nv, ZPOOL_CONFIG_PATH); if (path != NULL) { rc = zpool_power_on(zhp, (char *)path); if (rc != 0) { return (rc); } } } /* * Wait for their devices to show up. Since we powered them on * at roughly the same time, they should all come online around * the same time. */ FOR_EACH_REAL_LEAF_VDEV(zhp, nv) { path = fnvlist_lookup_string(nv, ZPOOL_CONFIG_PATH); zpool_disk_wait(path); } return (0); } static int zpool_power_off(zpool_handle_t *zhp, char *vdev) { return (zpool_power(zhp, vdev, B_FALSE)); } /* * Display usage message. If we're inside a command, display only the usage for * that command. Otherwise, iterate over the entire command table and display * a complete usage message. */ static __attribute__((noreturn)) void usage(boolean_t requested) { FILE *fp = requested ? stdout : stderr; if (current_command == NULL) { int i; (void) fprintf(fp, gettext("usage: zpool command args ...\n")); (void) fprintf(fp, gettext("where 'command' is one of the following:\n\n")); for (i = 0; i < NCOMMAND; i++) { if (command_table[i].name == NULL) (void) fprintf(fp, "\n"); else (void) fprintf(fp, "%s", get_usage(command_table[i].usage)); } (void) fprintf(fp, gettext("\nFor further help on a command or topic, " "run: %s\n"), "zpool help []"); } else { (void) fprintf(fp, gettext("usage:\n")); (void) fprintf(fp, "%s", get_usage(current_command->usage)); } if (current_command != NULL && current_prop_type != (ZFS_TYPE_POOL | ZFS_TYPE_VDEV) && ((strcmp(current_command->name, "set") == 0) || (strcmp(current_command->name, "get") == 0) || (strcmp(current_command->name, "list") == 0))) { (void) fprintf(fp, "%s", gettext("\nthe following properties are supported:\n")); (void) fprintf(fp, "\n\t%-19s %s %s\n\n", "PROPERTY", "EDIT", "VALUES"); /* Iterate over all properties */ if (current_prop_type == ZFS_TYPE_POOL) { (void) zprop_iter(print_pool_prop_cb, fp, B_FALSE, B_TRUE, current_prop_type); (void) fprintf(fp, "\t%-19s ", "feature@..."); (void) fprintf(fp, "YES " "disabled | enabled | active\n"); (void) fprintf(fp, gettext("\nThe feature@ properties " "must be appended with a feature name.\n" "See zpool-features(7).\n")); } else if (current_prop_type == ZFS_TYPE_VDEV) { (void) zprop_iter(print_vdev_prop_cb, fp, B_FALSE, B_TRUE, current_prop_type); } } /* * See comments at end of main(). */ if (getenv("ZFS_ABORT") != NULL) { (void) printf("dumping core by request\n"); abort(); } exit(requested ? 0 : 2); } /* * zpool initialize [-c | -s | -u] [-w] [ ...] * Initialize all unused blocks in the specified vdevs, or all vdevs in the pool * if none specified. * * -c Cancel. Ends active initializing. * -s Suspend. Initializing can then be restarted with no flags. * -u Uninitialize. Clears initialization state. * -w Wait. Blocks until initializing has completed. */ int zpool_do_initialize(int argc, char **argv) { int c; char *poolname; zpool_handle_t *zhp; nvlist_t *vdevs; int err = 0; boolean_t wait = B_FALSE; struct option long_options[] = { {"cancel", no_argument, NULL, 'c'}, {"suspend", no_argument, NULL, 's'}, {"uninit", no_argument, NULL, 'u'}, {"wait", no_argument, NULL, 'w'}, {0, 0, 0, 0} }; pool_initialize_func_t cmd_type = POOL_INITIALIZE_START; while ((c = getopt_long(argc, argv, "csuw", long_options, NULL)) != -1) { switch (c) { case 'c': if (cmd_type != POOL_INITIALIZE_START && cmd_type != POOL_INITIALIZE_CANCEL) { (void) fprintf(stderr, gettext("-c cannot be " "combined with other options\n")); usage(B_FALSE); } cmd_type = POOL_INITIALIZE_CANCEL; break; case 's': if (cmd_type != POOL_INITIALIZE_START && cmd_type != POOL_INITIALIZE_SUSPEND) { (void) fprintf(stderr, gettext("-s cannot be " "combined with other options\n")); usage(B_FALSE); } cmd_type = POOL_INITIALIZE_SUSPEND; break; case 'u': if (cmd_type != POOL_INITIALIZE_START && cmd_type != POOL_INITIALIZE_UNINIT) { (void) fprintf(stderr, gettext("-u cannot be " "combined with other options\n")); usage(B_FALSE); } cmd_type = POOL_INITIALIZE_UNINIT; break; case 'w': wait = B_TRUE; break; case '?': if (optopt != 0) { (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); } else { (void) fprintf(stderr, gettext("invalid option '%s'\n"), argv[optind - 1]); } usage(B_FALSE); } } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); usage(B_FALSE); return (-1); } if (wait && (cmd_type != POOL_INITIALIZE_START)) { (void) fprintf(stderr, gettext("-w cannot be used with -c, -s" "or -u\n")); usage(B_FALSE); } poolname = argv[0]; zhp = zpool_open(g_zfs, poolname); if (zhp == NULL) return (-1); vdevs = fnvlist_alloc(); if (argc == 1) { /* no individual leaf vdevs specified, so add them all */ nvlist_t *config = zpool_get_config(zhp, NULL); nvlist_t *nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); zpool_collect_leaves(zhp, nvroot, vdevs); } else { for (int i = 1; i < argc; i++) { fnvlist_add_boolean(vdevs, argv[i]); } } if (wait) err = zpool_initialize_wait(zhp, cmd_type, vdevs); else err = zpool_initialize(zhp, cmd_type, vdevs); fnvlist_free(vdevs); zpool_close(zhp); return (err); } /* * print a pool vdev config for dry runs */ static void print_vdev_tree(zpool_handle_t *zhp, const char *name, nvlist_t *nv, int indent, const char *match, int name_flags) { nvlist_t **child; uint_t c, children; char *vname; boolean_t printed = B_FALSE; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) { if (name != NULL) (void) printf("\t%*s%s\n", indent, "", name); return; } for (c = 0; c < children; c++) { uint64_t is_log = B_FALSE, is_hole = B_FALSE; const char *class = ""; (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, &is_hole); if (is_hole == B_TRUE) { continue; } (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &is_log); if (is_log) class = VDEV_ALLOC_BIAS_LOG; (void) nvlist_lookup_string(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS, &class); if (strcmp(match, class) != 0) continue; if (!printed && name != NULL) { (void) printf("\t%*s%s\n", indent, "", name); printed = B_TRUE; } vname = zpool_vdev_name(g_zfs, zhp, child[c], name_flags); print_vdev_tree(zhp, vname, child[c], indent + 2, "", name_flags); free(vname); } } /* * Print the list of l2cache devices for dry runs. */ static void print_cache_list(nvlist_t *nv, int indent) { nvlist_t **child; uint_t c, children; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, &child, &children) == 0 && children > 0) { (void) printf("\t%*s%s\n", indent, "", "cache"); } else { return; } for (c = 0; c < children; c++) { char *vname; vname = zpool_vdev_name(g_zfs, NULL, child[c], 0); (void) printf("\t%*s%s\n", indent + 2, "", vname); free(vname); } } /* * Print the list of spares for dry runs. */ static void print_spare_list(nvlist_t *nv, int indent) { nvlist_t **child; uint_t c, children; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, &child, &children) == 0 && children > 0) { (void) printf("\t%*s%s\n", indent, "", "spares"); } else { return; } for (c = 0; c < children; c++) { char *vname; vname = zpool_vdev_name(g_zfs, NULL, child[c], 0); (void) printf("\t%*s%s\n", indent + 2, "", vname); free(vname); } } static boolean_t prop_list_contains_feature(nvlist_t *proplist) { nvpair_t *nvp; for (nvp = nvlist_next_nvpair(proplist, NULL); NULL != nvp; nvp = nvlist_next_nvpair(proplist, nvp)) { if (zpool_prop_feature(nvpair_name(nvp))) return (B_TRUE); } return (B_FALSE); } /* * Add a property pair (name, string-value) into a property nvlist. */ static int add_prop_list(const char *propname, const char *propval, nvlist_t **props, boolean_t poolprop) { zpool_prop_t prop = ZPOOL_PROP_INVAL; nvlist_t *proplist; const char *normnm; const char *strval; if (*props == NULL && nvlist_alloc(props, NV_UNIQUE_NAME, 0) != 0) { (void) fprintf(stderr, gettext("internal error: out of memory\n")); return (1); } proplist = *props; if (poolprop) { const char *vname = zpool_prop_to_name(ZPOOL_PROP_VERSION); const char *cname = zpool_prop_to_name(ZPOOL_PROP_COMPATIBILITY); if ((prop = zpool_name_to_prop(propname)) == ZPOOL_PROP_INVAL && (!zpool_prop_feature(propname) && !zpool_prop_vdev(propname))) { (void) fprintf(stderr, gettext("property '%s' is " "not a valid pool or vdev property\n"), propname); return (2); } /* * feature@ properties and version should not be specified * at the same time. */ if ((prop == ZPOOL_PROP_INVAL && zpool_prop_feature(propname) && nvlist_exists(proplist, vname)) || (prop == ZPOOL_PROP_VERSION && prop_list_contains_feature(proplist))) { (void) fprintf(stderr, gettext("'feature@' and " "'version' properties cannot be specified " "together\n")); return (2); } /* * if version is specified, only "legacy" compatibility * may be requested */ if ((prop == ZPOOL_PROP_COMPATIBILITY && strcmp(propval, ZPOOL_COMPAT_LEGACY) != 0 && nvlist_exists(proplist, vname)) || (prop == ZPOOL_PROP_VERSION && nvlist_exists(proplist, cname) && strcmp(fnvlist_lookup_string(proplist, cname), ZPOOL_COMPAT_LEGACY) != 0)) { (void) fprintf(stderr, gettext("when 'version' is " "specified, the 'compatibility' feature may only " "be set to '" ZPOOL_COMPAT_LEGACY "'\n")); return (2); } if (zpool_prop_feature(propname) || zpool_prop_vdev(propname)) normnm = propname; else normnm = zpool_prop_to_name(prop); } else { zfs_prop_t fsprop = zfs_name_to_prop(propname); if (zfs_prop_valid_for_type(fsprop, ZFS_TYPE_FILESYSTEM, B_FALSE)) { normnm = zfs_prop_to_name(fsprop); } else if (zfs_prop_user(propname) || zfs_prop_userquota(propname)) { normnm = propname; } else { (void) fprintf(stderr, gettext("property '%s' is " "not a valid filesystem property\n"), propname); return (2); } } if (nvlist_lookup_string(proplist, normnm, &strval) == 0 && prop != ZPOOL_PROP_CACHEFILE) { (void) fprintf(stderr, gettext("property '%s' " "specified multiple times\n"), propname); return (2); } if (nvlist_add_string(proplist, normnm, propval) != 0) { (void) fprintf(stderr, gettext("internal " "error: out of memory\n")); return (1); } return (0); } /* * Set a default property pair (name, string-value) in a property nvlist */ static int add_prop_list_default(const char *propname, const char *propval, nvlist_t **props) { const char *pval; if (nvlist_lookup_string(*props, propname, &pval) == 0) return (0); return (add_prop_list(propname, propval, props, B_TRUE)); } /* * zpool add [-fgLnP] [-o property=value] ... * * -f Force addition of devices, even if they appear in use * -g Display guid for individual vdev name. * -L Follow links when resolving vdev path name. * -n Do not add the devices, but display the resulting layout if * they were to be added. * -o Set property=value. * -P Display full path for vdev name. * * Adds the given vdevs to 'pool'. As with create, the bulk of this work is * handled by make_root_vdev(), which constructs the nvlist needed to pass to * libzfs. */ int zpool_do_add(int argc, char **argv) { boolean_t force = B_FALSE; boolean_t dryrun = B_FALSE; int name_flags = 0; int c; nvlist_t *nvroot; char *poolname; int ret; zpool_handle_t *zhp; nvlist_t *config; nvlist_t *props = NULL; char *propval; /* check options */ while ((c = getopt(argc, argv, "fgLno:P")) != -1) { switch (c) { case 'f': force = B_TRUE; break; case 'g': name_flags |= VDEV_NAME_GUID; break; case 'L': name_flags |= VDEV_NAME_FOLLOW_LINKS; break; case 'n': dryrun = B_TRUE; break; case 'o': if ((propval = strchr(optarg, '=')) == NULL) { (void) fprintf(stderr, gettext("missing " "'=' for -o option\n")); usage(B_FALSE); } *propval = '\0'; propval++; if ((strcmp(optarg, ZPOOL_CONFIG_ASHIFT) != 0) || (add_prop_list(optarg, propval, &props, B_TRUE))) usage(B_FALSE); break; case 'P': name_flags |= VDEV_NAME_PATH; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); usage(B_FALSE); } if (argc < 2) { (void) fprintf(stderr, gettext("missing vdev specification\n")); usage(B_FALSE); } poolname = argv[0]; argc--; argv++; if ((zhp = zpool_open(g_zfs, poolname)) == NULL) return (1); if ((config = zpool_get_config(zhp, NULL)) == NULL) { (void) fprintf(stderr, gettext("pool '%s' is unavailable\n"), poolname); zpool_close(zhp); return (1); } /* unless manually specified use "ashift" pool property (if set) */ if (!nvlist_exists(props, ZPOOL_CONFIG_ASHIFT)) { int intval; zprop_source_t src; char strval[ZPOOL_MAXPROPLEN]; intval = zpool_get_prop_int(zhp, ZPOOL_PROP_ASHIFT, &src); if (src != ZPROP_SRC_DEFAULT) { (void) sprintf(strval, "%" PRId32, intval); verify(add_prop_list(ZPOOL_CONFIG_ASHIFT, strval, &props, B_TRUE) == 0); } } /* pass off to make_root_vdev for processing */ nvroot = make_root_vdev(zhp, props, force, !force, B_FALSE, dryrun, argc, argv); if (nvroot == NULL) { zpool_close(zhp); return (1); } if (dryrun) { nvlist_t *poolnvroot; nvlist_t **l2child, **sparechild; uint_t l2children, sparechildren, c; char *vname; boolean_t hadcache = B_FALSE, hadspare = B_FALSE; verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &poolnvroot) == 0); (void) printf(gettext("would update '%s' to the following " "configuration:\n\n"), zpool_get_name(zhp)); /* print original main pool and new tree */ print_vdev_tree(zhp, poolname, poolnvroot, 0, "", name_flags | VDEV_NAME_TYPE_ID); print_vdev_tree(zhp, NULL, nvroot, 0, "", name_flags); /* print other classes: 'dedup', 'special', and 'log' */ if (zfs_special_devs(poolnvroot, VDEV_ALLOC_BIAS_DEDUP)) { print_vdev_tree(zhp, "dedup", poolnvroot, 0, VDEV_ALLOC_BIAS_DEDUP, name_flags); print_vdev_tree(zhp, NULL, nvroot, 0, VDEV_ALLOC_BIAS_DEDUP, name_flags); } else if (zfs_special_devs(nvroot, VDEV_ALLOC_BIAS_DEDUP)) { print_vdev_tree(zhp, "dedup", nvroot, 0, VDEV_ALLOC_BIAS_DEDUP, name_flags); } if (zfs_special_devs(poolnvroot, VDEV_ALLOC_BIAS_SPECIAL)) { print_vdev_tree(zhp, "special", poolnvroot, 0, VDEV_ALLOC_BIAS_SPECIAL, name_flags); print_vdev_tree(zhp, NULL, nvroot, 0, VDEV_ALLOC_BIAS_SPECIAL, name_flags); } else if (zfs_special_devs(nvroot, VDEV_ALLOC_BIAS_SPECIAL)) { print_vdev_tree(zhp, "special", nvroot, 0, VDEV_ALLOC_BIAS_SPECIAL, name_flags); } if (num_logs(poolnvroot) > 0) { print_vdev_tree(zhp, "logs", poolnvroot, 0, VDEV_ALLOC_BIAS_LOG, name_flags); print_vdev_tree(zhp, NULL, nvroot, 0, VDEV_ALLOC_BIAS_LOG, name_flags); } else if (num_logs(nvroot) > 0) { print_vdev_tree(zhp, "logs", nvroot, 0, VDEV_ALLOC_BIAS_LOG, name_flags); } /* Do the same for the caches */ if (nvlist_lookup_nvlist_array(poolnvroot, ZPOOL_CONFIG_L2CACHE, &l2child, &l2children) == 0 && l2children) { hadcache = B_TRUE; (void) printf(gettext("\tcache\n")); for (c = 0; c < l2children; c++) { vname = zpool_vdev_name(g_zfs, NULL, l2child[c], name_flags); (void) printf("\t %s\n", vname); free(vname); } } if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2child, &l2children) == 0 && l2children) { if (!hadcache) (void) printf(gettext("\tcache\n")); for (c = 0; c < l2children; c++) { vname = zpool_vdev_name(g_zfs, NULL, l2child[c], name_flags); (void) printf("\t %s\n", vname); free(vname); } } /* And finally the spares */ if (nvlist_lookup_nvlist_array(poolnvroot, ZPOOL_CONFIG_SPARES, &sparechild, &sparechildren) == 0 && sparechildren > 0) { hadspare = B_TRUE; (void) printf(gettext("\tspares\n")); for (c = 0; c < sparechildren; c++) { vname = zpool_vdev_name(g_zfs, NULL, sparechild[c], name_flags); (void) printf("\t %s\n", vname); free(vname); } } if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &sparechild, &sparechildren) == 0 && sparechildren > 0) { if (!hadspare) (void) printf(gettext("\tspares\n")); for (c = 0; c < sparechildren; c++) { vname = zpool_vdev_name(g_zfs, NULL, sparechild[c], name_flags); (void) printf("\t %s\n", vname); free(vname); } } ret = 0; } else { ret = (zpool_add(zhp, nvroot) != 0); } nvlist_free(props); nvlist_free(nvroot); zpool_close(zhp); return (ret); } /* * zpool remove [-npsw] ... * * Removes the given vdev from the pool. */ int zpool_do_remove(int argc, char **argv) { char *poolname; int i, ret = 0; zpool_handle_t *zhp = NULL; boolean_t stop = B_FALSE; int c; boolean_t noop = B_FALSE; boolean_t parsable = B_FALSE; boolean_t wait = B_FALSE; /* check options */ while ((c = getopt(argc, argv, "npsw")) != -1) { switch (c) { case 'n': noop = B_TRUE; break; case 'p': parsable = B_TRUE; break; case 's': stop = B_TRUE; break; case 'w': wait = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); usage(B_FALSE); } poolname = argv[0]; if ((zhp = zpool_open(g_zfs, poolname)) == NULL) return (1); if (stop && noop) { zpool_close(zhp); (void) fprintf(stderr, gettext("stop request ignored\n")); return (0); } if (stop) { if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } if (zpool_vdev_remove_cancel(zhp) != 0) ret = 1; if (wait) { (void) fprintf(stderr, gettext("invalid option " "combination: -w cannot be used with -s\n")); usage(B_FALSE); } } else { if (argc < 2) { (void) fprintf(stderr, gettext("missing device\n")); usage(B_FALSE); } for (i = 1; i < argc; i++) { if (noop) { uint64_t size; if (zpool_vdev_indirect_size(zhp, argv[i], &size) != 0) { ret = 1; break; } if (parsable) { (void) printf("%s %llu\n", argv[i], (unsigned long long)size); } else { char valstr[32]; zfs_nicenum(size, valstr, sizeof (valstr)); (void) printf("Memory that will be " "used after removing %s: %s\n", argv[i], valstr); } } else { if (zpool_vdev_remove(zhp, argv[i]) != 0) ret = 1; } } if (ret == 0 && wait) ret = zpool_wait(zhp, ZPOOL_WAIT_REMOVE); } zpool_close(zhp); return (ret); } /* * Return 1 if a vdev is active (being used in a pool) * Return 0 if a vdev is inactive (offlined or faulted, or not in active pool) * * This is useful for checking if a disk in an active pool is offlined or * faulted. */ static int vdev_is_active(char *vdev_path) { int fd; fd = open(vdev_path, O_EXCL); if (fd < 0) { return (1); /* cant open O_EXCL - disk is active */ } close(fd); return (0); /* disk is inactive in the pool */ } /* * zpool labelclear [-f] * * -f Force clearing the label for the vdevs which are members of * the exported or foreign pools. * * Verifies that the vdev is not active and zeros out the label information * on the device. */ int zpool_do_labelclear(int argc, char **argv) { char vdev[MAXPATHLEN]; char *name = NULL; int c, fd = -1, ret = 0; nvlist_t *config; pool_state_t state; boolean_t inuse = B_FALSE; boolean_t force = B_FALSE; /* check options */ while ((c = getopt(argc, argv, "f")) != -1) { switch (c) { case 'f': force = B_TRUE; break; default: (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* get vdev name */ if (argc < 1) { (void) fprintf(stderr, gettext("missing vdev name\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } (void) strlcpy(vdev, argv[0], sizeof (vdev)); /* * If we cannot open an absolute path, we quit. * Otherwise if the provided vdev name doesn't point to a file, * try prepending expected disk paths and partition numbers. */ if ((fd = open(vdev, O_RDWR)) < 0) { int error; if (vdev[0] == '/') { (void) fprintf(stderr, gettext("failed to open " "%s: %s\n"), vdev, strerror(errno)); return (1); } error = zfs_resolve_shortname(argv[0], vdev, MAXPATHLEN); if (error == 0 && zfs_dev_is_whole_disk(vdev)) { if (zfs_append_partition(vdev, MAXPATHLEN) == -1) error = ENOENT; } if (error || ((fd = open(vdev, O_RDWR)) < 0)) { if (errno == ENOENT) { (void) fprintf(stderr, gettext( "failed to find device %s, try " "specifying absolute path instead\n"), argv[0]); return (1); } (void) fprintf(stderr, gettext("failed to open %s:" " %s\n"), vdev, strerror(errno)); return (1); } } /* * Flush all dirty pages for the block device. This should not be * fatal when the device does not support BLKFLSBUF as would be the * case for a file vdev. */ if ((zfs_dev_flush(fd) != 0) && (errno != ENOTTY)) (void) fprintf(stderr, gettext("failed to invalidate " "cache for %s: %s\n"), vdev, strerror(errno)); if (zpool_read_label(fd, &config, NULL) != 0) { (void) fprintf(stderr, gettext("failed to read label from %s\n"), vdev); ret = 1; goto errout; } nvlist_free(config); ret = zpool_in_use(g_zfs, fd, &state, &name, &inuse); if (ret != 0) { (void) fprintf(stderr, gettext("failed to check state for %s\n"), vdev); ret = 1; goto errout; } if (!inuse) goto wipe_label; switch (state) { default: case POOL_STATE_ACTIVE: case POOL_STATE_SPARE: case POOL_STATE_L2CACHE: /* * We allow the user to call 'zpool offline -f' * on an offlined disk in an active pool. We can check if * the disk is online by calling vdev_is_active(). */ if (force && !vdev_is_active(vdev)) break; (void) fprintf(stderr, gettext( "%s is a member (%s) of pool \"%s\""), vdev, zpool_pool_state_to_name(state), name); if (force) { (void) fprintf(stderr, gettext( ". Offline the disk first to clear its label.")); } printf("\n"); ret = 1; goto errout; case POOL_STATE_EXPORTED: if (force) break; (void) fprintf(stderr, gettext( "use '-f' to override the following error:\n" "%s is a member of exported pool \"%s\"\n"), vdev, name); ret = 1; goto errout; case POOL_STATE_POTENTIALLY_ACTIVE: if (force) break; (void) fprintf(stderr, gettext( "use '-f' to override the following error:\n" "%s is a member of potentially active pool \"%s\"\n"), vdev, name); ret = 1; goto errout; case POOL_STATE_DESTROYED: /* inuse should never be set for a destroyed pool */ assert(0); break; } wipe_label: ret = zpool_clear_label(fd); if (ret != 0) { (void) fprintf(stderr, gettext("failed to clear label for %s\n"), vdev); } errout: free(name); (void) close(fd); return (ret); } /* * zpool create [-fnd] [-o property=value] ... * [-O file-system-property=value] ... * [-R root] [-m mountpoint] ... * * -f Force creation, even if devices appear in use * -n Do not create the pool, but display the resulting layout if it * were to be created. * -R Create a pool under an alternate root * -m Set default mountpoint for the root dataset. By default it's * '/' * -o Set property=value. * -o Set feature@feature=enabled|disabled. * -d Don't automatically enable all supported pool features * (individual features can be enabled with -o). * -O Set fsproperty=value in the pool's root file system * * Creates the named pool according to the given vdev specification. The * bulk of the vdev processing is done in make_root_vdev() in zpool_vdev.c. * Once we get the nvlist back from make_root_vdev(), we either print out the * contents (if '-n' was specified), or pass it to libzfs to do the creation. */ int zpool_do_create(int argc, char **argv) { boolean_t force = B_FALSE; boolean_t dryrun = B_FALSE; boolean_t enable_pool_features = B_TRUE; int c; nvlist_t *nvroot = NULL; char *poolname; char *tname = NULL; int ret = 1; char *altroot = NULL; char *compat = NULL; char *mountpoint = NULL; nvlist_t *fsprops = NULL; nvlist_t *props = NULL; char *propval; /* check options */ while ((c = getopt(argc, argv, ":fndR:m:o:O:t:")) != -1) { switch (c) { case 'f': force = B_TRUE; break; case 'n': dryrun = B_TRUE; break; case 'd': enable_pool_features = B_FALSE; break; case 'R': altroot = optarg; if (add_prop_list(zpool_prop_to_name( ZPOOL_PROP_ALTROOT), optarg, &props, B_TRUE)) goto errout; if (add_prop_list_default(zpool_prop_to_name( ZPOOL_PROP_CACHEFILE), "none", &props)) goto errout; break; case 'm': /* Equivalent to -O mountpoint=optarg */ mountpoint = optarg; break; case 'o': if ((propval = strchr(optarg, '=')) == NULL) { (void) fprintf(stderr, gettext("missing " "'=' for -o option\n")); goto errout; } *propval = '\0'; propval++; if (add_prop_list(optarg, propval, &props, B_TRUE)) goto errout; /* * If the user is creating a pool that doesn't support * feature flags, don't enable any features. */ if (zpool_name_to_prop(optarg) == ZPOOL_PROP_VERSION) { char *end; u_longlong_t ver; ver = strtoull(propval, &end, 10); if (*end == '\0' && ver < SPA_VERSION_FEATURES) { enable_pool_features = B_FALSE; } } if (zpool_name_to_prop(optarg) == ZPOOL_PROP_ALTROOT) altroot = propval; if (zpool_name_to_prop(optarg) == ZPOOL_PROP_COMPATIBILITY) compat = propval; break; case 'O': if ((propval = strchr(optarg, '=')) == NULL) { (void) fprintf(stderr, gettext("missing " "'=' for -O option\n")); goto errout; } *propval = '\0'; propval++; /* * Mountpoints are checked and then added later. * Uniquely among properties, they can be specified * more than once, to avoid conflict with -m. */ if (0 == strcmp(optarg, zfs_prop_to_name(ZFS_PROP_MOUNTPOINT))) { mountpoint = propval; } else if (add_prop_list(optarg, propval, &fsprops, B_FALSE)) { goto errout; } break; case 't': /* * Sanity check temporary pool name. */ if (strchr(optarg, '/') != NULL) { (void) fprintf(stderr, gettext("cannot create " "'%s': invalid character '/' in temporary " "name\n"), optarg); (void) fprintf(stderr, gettext("use 'zfs " "create' to create a dataset\n")); goto errout; } if (add_prop_list(zpool_prop_to_name( ZPOOL_PROP_TNAME), optarg, &props, B_TRUE)) goto errout; if (add_prop_list_default(zpool_prop_to_name( ZPOOL_PROP_CACHEFILE), "none", &props)) goto errout; tname = optarg; break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); goto badusage; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); goto badusage; } } argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); goto badusage; } if (argc < 2) { (void) fprintf(stderr, gettext("missing vdev specification\n")); goto badusage; } poolname = argv[0]; /* * As a special case, check for use of '/' in the name, and direct the * user to use 'zfs create' instead. */ if (strchr(poolname, '/') != NULL) { (void) fprintf(stderr, gettext("cannot create '%s': invalid " "character '/' in pool name\n"), poolname); (void) fprintf(stderr, gettext("use 'zfs create' to " "create a dataset\n")); goto errout; } /* pass off to make_root_vdev for bulk processing */ nvroot = make_root_vdev(NULL, props, force, !force, B_FALSE, dryrun, argc - 1, argv + 1); if (nvroot == NULL) goto errout; /* make_root_vdev() allows 0 toplevel children if there are spares */ if (!zfs_allocatable_devs(nvroot)) { (void) fprintf(stderr, gettext("invalid vdev " "specification: at least one toplevel vdev must be " "specified\n")); goto errout; } if (altroot != NULL && altroot[0] != '/') { (void) fprintf(stderr, gettext("invalid alternate root '%s': " "must be an absolute path\n"), altroot); goto errout; } /* * Check the validity of the mountpoint and direct the user to use the * '-m' mountpoint option if it looks like its in use. */ if (mountpoint == NULL || (strcmp(mountpoint, ZFS_MOUNTPOINT_LEGACY) != 0 && strcmp(mountpoint, ZFS_MOUNTPOINT_NONE) != 0)) { char buf[MAXPATHLEN]; DIR *dirp; if (mountpoint && mountpoint[0] != '/') { (void) fprintf(stderr, gettext("invalid mountpoint " "'%s': must be an absolute path, 'legacy', or " "'none'\n"), mountpoint); goto errout; } if (mountpoint == NULL) { if (altroot != NULL) (void) snprintf(buf, sizeof (buf), "%s/%s", altroot, poolname); else (void) snprintf(buf, sizeof (buf), "/%s", poolname); } else { if (altroot != NULL) (void) snprintf(buf, sizeof (buf), "%s%s", altroot, mountpoint); else (void) snprintf(buf, sizeof (buf), "%s", mountpoint); } if ((dirp = opendir(buf)) == NULL && errno != ENOENT) { (void) fprintf(stderr, gettext("mountpoint '%s' : " "%s\n"), buf, strerror(errno)); (void) fprintf(stderr, gettext("use '-m' " "option to provide a different default\n")); goto errout; } else if (dirp) { int count = 0; while (count < 3 && readdir(dirp) != NULL) count++; (void) closedir(dirp); if (count > 2) { (void) fprintf(stderr, gettext("mountpoint " "'%s' exists and is not empty\n"), buf); (void) fprintf(stderr, gettext("use '-m' " "option to provide a " "different default\n")); goto errout; } } } /* * Now that the mountpoint's validity has been checked, ensure that * the property is set appropriately prior to creating the pool. */ if (mountpoint != NULL) { ret = add_prop_list(zfs_prop_to_name(ZFS_PROP_MOUNTPOINT), mountpoint, &fsprops, B_FALSE); if (ret != 0) goto errout; } ret = 1; if (dryrun) { /* * For a dry run invocation, print out a basic message and run * through all the vdevs in the list and print out in an * appropriate hierarchy. */ (void) printf(gettext("would create '%s' with the " "following layout:\n\n"), poolname); print_vdev_tree(NULL, poolname, nvroot, 0, "", 0); print_vdev_tree(NULL, "dedup", nvroot, 0, VDEV_ALLOC_BIAS_DEDUP, 0); print_vdev_tree(NULL, "special", nvroot, 0, VDEV_ALLOC_BIAS_SPECIAL, 0); print_vdev_tree(NULL, "logs", nvroot, 0, VDEV_ALLOC_BIAS_LOG, 0); print_cache_list(nvroot, 0); print_spare_list(nvroot, 0); ret = 0; } else { /* * Load in feature set. * Note: if compatibility property not given, we'll have * NULL, which means 'all features'. */ boolean_t requested_features[SPA_FEATURES]; if (zpool_do_load_compat(compat, requested_features) != ZPOOL_COMPATIBILITY_OK) goto errout; /* * props contains list of features to enable. * For each feature: * - remove it if feature@name=disabled * - leave it there if feature@name=enabled * - add it if: * - enable_pool_features (ie: no '-d' or '-o version') * - it's supported by the kernel module * - it's in the requested feature set * - warn if it's enabled but not in compat */ for (spa_feature_t i = 0; i < SPA_FEATURES; i++) { char propname[MAXPATHLEN]; const char *propval; zfeature_info_t *feat = &spa_feature_table[i]; (void) snprintf(propname, sizeof (propname), "feature@%s", feat->fi_uname); if (!nvlist_lookup_string(props, propname, &propval)) { if (strcmp(propval, ZFS_FEATURE_DISABLED) == 0) { (void) nvlist_remove_all(props, propname); } else if (strcmp(propval, ZFS_FEATURE_ENABLED) == 0 && !requested_features[i]) { (void) fprintf(stderr, gettext( "Warning: feature \"%s\" enabled " "but is not in specified " "'compatibility' feature set.\n"), feat->fi_uname); } } else if ( enable_pool_features && feat->fi_zfs_mod_supported && requested_features[i]) { ret = add_prop_list(propname, ZFS_FEATURE_ENABLED, &props, B_TRUE); if (ret != 0) goto errout; } } ret = 1; if (zpool_create(g_zfs, poolname, nvroot, props, fsprops) == 0) { zfs_handle_t *pool = zfs_open(g_zfs, tname ? tname : poolname, ZFS_TYPE_FILESYSTEM); if (pool != NULL) { if (zfs_mount(pool, NULL, 0) == 0) { ret = zfs_share(pool, NULL); zfs_commit_shares(NULL); } zfs_close(pool); } } else if (libzfs_errno(g_zfs) == EZFS_INVALIDNAME) { (void) fprintf(stderr, gettext("pool name may have " "been omitted\n")); } } errout: nvlist_free(nvroot); nvlist_free(fsprops); nvlist_free(props); return (ret); badusage: nvlist_free(fsprops); nvlist_free(props); usage(B_FALSE); return (2); } /* * zpool destroy * * -f Forcefully unmount any datasets * * Destroy the given pool. Automatically unmounts any datasets in the pool. */ int zpool_do_destroy(int argc, char **argv) { boolean_t force = B_FALSE; int c; char *pool; zpool_handle_t *zhp; int ret; /* check options */ while ((c = getopt(argc, argv, "f")) != -1) { switch (c) { case 'f': force = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* check arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool argument\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } pool = argv[0]; if ((zhp = zpool_open_canfail(g_zfs, pool)) == NULL) { /* * As a special case, check for use of '/' in the name, and * direct the user to use 'zfs destroy' instead. */ if (strchr(pool, '/') != NULL) (void) fprintf(stderr, gettext("use 'zfs destroy' to " "destroy a dataset\n")); return (1); } if (zpool_disable_datasets(zhp, force) != 0) { (void) fprintf(stderr, gettext("could not destroy '%s': " "could not unmount datasets\n"), zpool_get_name(zhp)); zpool_close(zhp); return (1); } /* The history must be logged as part of the export */ log_history = B_FALSE; ret = (zpool_destroy(zhp, history_str) != 0); zpool_close(zhp); return (ret); } typedef struct export_cbdata { boolean_t force; boolean_t hardforce; } export_cbdata_t; /* * Export one pool */ static int zpool_export_one(zpool_handle_t *zhp, void *data) { export_cbdata_t *cb = data; if (zpool_disable_datasets(zhp, cb->force) != 0) return (1); /* The history must be logged as part of the export */ log_history = B_FALSE; if (cb->hardforce) { if (zpool_export_force(zhp, history_str) != 0) return (1); } else if (zpool_export(zhp, cb->force, history_str) != 0) { return (1); } return (0); } /* * zpool export [-f] ... * * -a Export all pools * -f Forcefully unmount datasets * * Export the given pools. By default, the command will attempt to cleanly * unmount any active datasets within the pool. If the '-f' flag is specified, * then the datasets will be forcefully unmounted. */ int zpool_do_export(int argc, char **argv) { export_cbdata_t cb; boolean_t do_all = B_FALSE; boolean_t force = B_FALSE; boolean_t hardforce = B_FALSE; int c, ret; /* check options */ while ((c = getopt(argc, argv, "afF")) != -1) { switch (c) { case 'a': do_all = B_TRUE; break; case 'f': force = B_TRUE; break; case 'F': hardforce = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } cb.force = force; cb.hardforce = hardforce; argc -= optind; argv += optind; if (do_all) { if (argc != 0) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } return (for_each_pool(argc, argv, B_TRUE, NULL, ZFS_TYPE_POOL, B_FALSE, zpool_export_one, &cb)); } /* check arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool argument\n")); usage(B_FALSE); } ret = for_each_pool(argc, argv, B_TRUE, NULL, ZFS_TYPE_POOL, B_FALSE, zpool_export_one, &cb); return (ret); } /* * Given a vdev configuration, determine the maximum width needed for the device * name column. */ static int max_width(zpool_handle_t *zhp, nvlist_t *nv, int depth, int max, int name_flags) { static const char *const subtypes[] = {ZPOOL_CONFIG_SPARES, ZPOOL_CONFIG_L2CACHE, ZPOOL_CONFIG_CHILDREN}; char *name = zpool_vdev_name(g_zfs, zhp, nv, name_flags); max = MAX(strlen(name) + depth, max); free(name); nvlist_t **child; uint_t children; for (size_t i = 0; i < ARRAY_SIZE(subtypes); ++i) if (nvlist_lookup_nvlist_array(nv, subtypes[i], &child, &children) == 0) for (uint_t c = 0; c < children; ++c) max = MAX(max_width(zhp, child[c], depth + 2, max, name_flags), max); return (max); } typedef struct spare_cbdata { uint64_t cb_guid; zpool_handle_t *cb_zhp; } spare_cbdata_t; static boolean_t find_vdev(nvlist_t *nv, uint64_t search) { uint64_t guid; nvlist_t **child; uint_t c, children; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) == 0 && search == guid) return (B_TRUE); if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) == 0) { for (c = 0; c < children; c++) if (find_vdev(child[c], search)) return (B_TRUE); } return (B_FALSE); } static int find_spare(zpool_handle_t *zhp, void *data) { spare_cbdata_t *cbp = data; nvlist_t *config, *nvroot; config = zpool_get_config(zhp, NULL); verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); if (find_vdev(nvroot, cbp->cb_guid)) { cbp->cb_zhp = zhp; return (1); } zpool_close(zhp); return (0); } typedef struct status_cbdata { int cb_count; int cb_name_flags; int cb_namewidth; boolean_t cb_allpools; boolean_t cb_verbose; boolean_t cb_literal; boolean_t cb_explain; boolean_t cb_first; boolean_t cb_dedup_stats; boolean_t cb_print_unhealthy; boolean_t cb_print_status; boolean_t cb_print_slow_ios; boolean_t cb_print_vdev_init; boolean_t cb_print_vdev_trim; vdev_cmd_data_list_t *vcdl; boolean_t cb_print_power; } status_cbdata_t; /* Return 1 if string is NULL, empty, or whitespace; return 0 otherwise. */ static boolean_t is_blank_str(const char *str) { for (; str != NULL && *str != '\0'; ++str) if (!isblank(*str)) return (B_FALSE); return (B_TRUE); } /* Print command output lines for specific vdev in a specific pool */ static void zpool_print_cmd(vdev_cmd_data_list_t *vcdl, const char *pool, const char *path) { vdev_cmd_data_t *data; int i, j; const char *val; for (i = 0; i < vcdl->count; i++) { if ((strcmp(vcdl->data[i].path, path) != 0) || (strcmp(vcdl->data[i].pool, pool) != 0)) { /* Not the vdev we're looking for */ continue; } data = &vcdl->data[i]; /* Print out all the output values for this vdev */ for (j = 0; j < vcdl->uniq_cols_cnt; j++) { val = NULL; /* Does this vdev have values for this column? */ for (int k = 0; k < data->cols_cnt; k++) { if (strcmp(data->cols[k], vcdl->uniq_cols[j]) == 0) { /* yes it does, record the value */ val = data->lines[k]; break; } } /* * Mark empty values with dashes to make output * awk-able. */ if (val == NULL || is_blank_str(val)) val = "-"; printf("%*s", vcdl->uniq_cols_width[j], val); if (j < vcdl->uniq_cols_cnt - 1) fputs(" ", stdout); } /* Print out any values that aren't in a column at the end */ for (j = data->cols_cnt; j < data->lines_cnt; j++) { /* Did we have any columns? If so print a spacer. */ if (vcdl->uniq_cols_cnt > 0) fputs(" ", stdout); val = data->lines[j]; fputs(val ?: "", stdout); } break; } } /* * Print vdev initialization status for leaves */ static void print_status_initialize(vdev_stat_t *vs, boolean_t verbose) { if (verbose) { if ((vs->vs_initialize_state == VDEV_INITIALIZE_ACTIVE || vs->vs_initialize_state == VDEV_INITIALIZE_SUSPENDED || vs->vs_initialize_state == VDEV_INITIALIZE_COMPLETE) && !vs->vs_scan_removing) { char zbuf[1024]; char tbuf[256]; struct tm zaction_ts; time_t t = vs->vs_initialize_action_time; int initialize_pct = 100; if (vs->vs_initialize_state != VDEV_INITIALIZE_COMPLETE) { initialize_pct = (vs->vs_initialize_bytes_done * 100 / (vs->vs_initialize_bytes_est + 1)); } (void) localtime_r(&t, &zaction_ts); (void) strftime(tbuf, sizeof (tbuf), "%c", &zaction_ts); switch (vs->vs_initialize_state) { case VDEV_INITIALIZE_SUSPENDED: (void) snprintf(zbuf, sizeof (zbuf), ", %s %s", gettext("suspended, started at"), tbuf); break; case VDEV_INITIALIZE_ACTIVE: (void) snprintf(zbuf, sizeof (zbuf), ", %s %s", gettext("started at"), tbuf); break; case VDEV_INITIALIZE_COMPLETE: (void) snprintf(zbuf, sizeof (zbuf), ", %s %s", gettext("completed at"), tbuf); break; } (void) printf(gettext(" (%d%% initialized%s)"), initialize_pct, zbuf); } else { (void) printf(gettext(" (uninitialized)")); } } else if (vs->vs_initialize_state == VDEV_INITIALIZE_ACTIVE) { (void) printf(gettext(" (initializing)")); } } /* * Print vdev TRIM status for leaves */ static void print_status_trim(vdev_stat_t *vs, boolean_t verbose) { if (verbose) { if ((vs->vs_trim_state == VDEV_TRIM_ACTIVE || vs->vs_trim_state == VDEV_TRIM_SUSPENDED || vs->vs_trim_state == VDEV_TRIM_COMPLETE) && !vs->vs_scan_removing) { char zbuf[1024]; char tbuf[256]; struct tm zaction_ts; time_t t = vs->vs_trim_action_time; int trim_pct = 100; if (vs->vs_trim_state != VDEV_TRIM_COMPLETE) { trim_pct = (vs->vs_trim_bytes_done * 100 / (vs->vs_trim_bytes_est + 1)); } (void) localtime_r(&t, &zaction_ts); (void) strftime(tbuf, sizeof (tbuf), "%c", &zaction_ts); switch (vs->vs_trim_state) { case VDEV_TRIM_SUSPENDED: (void) snprintf(zbuf, sizeof (zbuf), ", %s %s", gettext("suspended, started at"), tbuf); break; case VDEV_TRIM_ACTIVE: (void) snprintf(zbuf, sizeof (zbuf), ", %s %s", gettext("started at"), tbuf); break; case VDEV_TRIM_COMPLETE: (void) snprintf(zbuf, sizeof (zbuf), ", %s %s", gettext("completed at"), tbuf); break; } (void) printf(gettext(" (%d%% trimmed%s)"), trim_pct, zbuf); } else if (vs->vs_trim_notsup) { (void) printf(gettext(" (trim unsupported)")); } else { (void) printf(gettext(" (untrimmed)")); } } else if (vs->vs_trim_state == VDEV_TRIM_ACTIVE) { (void) printf(gettext(" (trimming)")); } } /* * Return the color associated with a health string. This includes returning * NULL for no color change. */ static const char * health_str_to_color(const char *health) { if (strcmp(health, gettext("FAULTED")) == 0 || strcmp(health, gettext("SUSPENDED")) == 0 || strcmp(health, gettext("UNAVAIL")) == 0) { return (ANSI_RED); } if (strcmp(health, gettext("OFFLINE")) == 0 || strcmp(health, gettext("DEGRADED")) == 0 || strcmp(health, gettext("REMOVED")) == 0) { return (ANSI_YELLOW); } return (NULL); } /* * Called for each leaf vdev. Returns 0 if the vdev is healthy. * A vdev is unhealthy if any of the following are true: * 1) there are read, write, or checksum errors, * 2) its state is not ONLINE, or * 3) slow IO reporting was requested (-s) and there are slow IOs. */ static int vdev_health_check_cb(void *hdl_data, nvlist_t *nv, void *data) { status_cbdata_t *cb = data; vdev_stat_t *vs; uint_t vsc; (void) hdl_data; if (nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) != 0) return (1); if (vs->vs_checksum_errors || vs->vs_read_errors || vs->vs_write_errors || vs->vs_state != VDEV_STATE_HEALTHY) return (1); if (cb->cb_print_slow_ios && vs->vs_slow_ios) return (1); return (0); } /* * Print out configuration state as requested by status_callback. */ static void print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, nvlist_t *nv, int depth, boolean_t isspare, vdev_rebuild_stat_t *vrs) { nvlist_t **child, *root; uint_t c, i, vsc, children; pool_scan_stat_t *ps = NULL; vdev_stat_t *vs; char rbuf[6], wbuf[6], cbuf[6]; char *vname; uint64_t notpresent; spare_cbdata_t spare_cb; const char *state; const char *type; const char *path = NULL; const char *rcolor = NULL, *wcolor = NULL, *ccolor = NULL, *scolor = NULL; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) children = 0; verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) == 0); verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); if (strcmp(type, VDEV_TYPE_INDIRECT) == 0) return; state = zpool_state_to_name(vs->vs_state, vs->vs_aux); if (isspare) { /* * For hot spares, we use the terms 'INUSE' and 'AVAILABLE' for * online drives. */ if (vs->vs_aux == VDEV_AUX_SPARED) state = gettext("INUSE"); else if (vs->vs_state == VDEV_STATE_HEALTHY) state = gettext("AVAIL"); } /* * If '-e' is specified then top-level vdevs and their children * can be pruned if all of their leaves are healthy. */ if (cb->cb_print_unhealthy && depth > 0 && for_each_vdev_in_nvlist(nv, vdev_health_check_cb, cb) == 0) { return; } printf_color(health_str_to_color(state), "\t%*s%-*s %-8s", depth, "", cb->cb_namewidth - depth, name, state); if (!isspare) { if (vs->vs_read_errors) rcolor = ANSI_RED; if (vs->vs_write_errors) wcolor = ANSI_RED; if (vs->vs_checksum_errors) ccolor = ANSI_RED; if (vs->vs_slow_ios) scolor = ANSI_BLUE; if (cb->cb_literal) { fputc(' ', stdout); printf_color(rcolor, "%5llu", (u_longlong_t)vs->vs_read_errors); fputc(' ', stdout); printf_color(wcolor, "%5llu", (u_longlong_t)vs->vs_write_errors); fputc(' ', stdout); printf_color(ccolor, "%5llu", (u_longlong_t)vs->vs_checksum_errors); } else { zfs_nicenum(vs->vs_read_errors, rbuf, sizeof (rbuf)); zfs_nicenum(vs->vs_write_errors, wbuf, sizeof (wbuf)); zfs_nicenum(vs->vs_checksum_errors, cbuf, sizeof (cbuf)); fputc(' ', stdout); printf_color(rcolor, "%5s", rbuf); fputc(' ', stdout); printf_color(wcolor, "%5s", wbuf); fputc(' ', stdout); printf_color(ccolor, "%5s", cbuf); } if (cb->cb_print_slow_ios) { if (children == 0) { /* Only leafs vdevs have slow IOs */ zfs_nicenum(vs->vs_slow_ios, rbuf, sizeof (rbuf)); } else { snprintf(rbuf, sizeof (rbuf), "-"); } if (cb->cb_literal) printf_color(scolor, " %5llu", (u_longlong_t)vs->vs_slow_ios); else printf_color(scolor, " %5s", rbuf); } if (cb->cb_print_power) { if (children == 0) { /* Only leaf vdevs have physical slots */ switch (zpool_power_current_state(zhp, (char *) fnvlist_lookup_string(nv, ZPOOL_CONFIG_PATH))) { case 0: printf_color(ANSI_RED, " %5s", gettext("off")); break; case 1: printf(" %5s", gettext("on")); break; default: printf(" %5s", "-"); } } else { printf(" %5s", "-"); } } } if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, ¬present) == 0) { verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0); (void) printf(" %s %s", gettext("was"), path); } else if (vs->vs_aux != 0) { (void) printf(" "); color_start(ANSI_RED); switch (vs->vs_aux) { case VDEV_AUX_OPEN_FAILED: (void) printf(gettext("cannot open")); break; case VDEV_AUX_BAD_GUID_SUM: (void) printf(gettext("missing device")); break; case VDEV_AUX_NO_REPLICAS: (void) printf(gettext("insufficient replicas")); break; case VDEV_AUX_VERSION_NEWER: (void) printf(gettext("newer version")); break; case VDEV_AUX_UNSUP_FEAT: (void) printf(gettext("unsupported feature(s)")); break; case VDEV_AUX_ASHIFT_TOO_BIG: (void) printf(gettext("unsupported minimum blocksize")); break; case VDEV_AUX_SPARED: verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &spare_cb.cb_guid) == 0); if (zpool_iter(g_zfs, find_spare, &spare_cb) == 1) { if (strcmp(zpool_get_name(spare_cb.cb_zhp), zpool_get_name(zhp)) == 0) (void) printf(gettext("currently in " "use")); else (void) printf(gettext("in use by " "pool '%s'"), zpool_get_name(spare_cb.cb_zhp)); zpool_close(spare_cb.cb_zhp); } else { (void) printf(gettext("currently in use")); } break; case VDEV_AUX_ERR_EXCEEDED: - (void) printf(gettext("too many errors")); + if (vs->vs_read_errors + vs->vs_write_errors + + vs->vs_checksum_errors == 0 && children == 0 && + vs->vs_slow_ios > 0) { + (void) printf(gettext("too many slow I/Os")); + } else { + (void) printf(gettext("too many errors")); + } break; case VDEV_AUX_IO_FAILURE: (void) printf(gettext("experienced I/O failures")); break; case VDEV_AUX_BAD_LOG: (void) printf(gettext("bad intent log")); break; case VDEV_AUX_EXTERNAL: (void) printf(gettext("external device fault")); break; case VDEV_AUX_SPLIT_POOL: (void) printf(gettext("split into new pool")); break; case VDEV_AUX_ACTIVE: (void) printf(gettext("currently in use")); break; case VDEV_AUX_CHILDREN_OFFLINE: (void) printf(gettext("all children offline")); break; case VDEV_AUX_BAD_LABEL: (void) printf(gettext("invalid label")); break; default: (void) printf(gettext("corrupted data")); break; } color_end(); } else if (children == 0 && !isspare && getenv("ZPOOL_STATUS_NON_NATIVE_ASHIFT_IGNORE") == NULL && VDEV_STAT_VALID(vs_physical_ashift, vsc) && vs->vs_configured_ashift < vs->vs_physical_ashift) { (void) printf( gettext(" block size: %dB configured, %dB native"), 1 << vs->vs_configured_ashift, 1 << vs->vs_physical_ashift); } if (vs->vs_scan_removing != 0) { (void) printf(gettext(" (removing)")); } else if (VDEV_STAT_VALID(vs_noalloc, vsc) && vs->vs_noalloc != 0) { (void) printf(gettext(" (non-allocating)")); } /* The root vdev has the scrub/resilver stats */ root = fnvlist_lookup_nvlist(zpool_get_config(zhp, NULL), ZPOOL_CONFIG_VDEV_TREE); (void) nvlist_lookup_uint64_array(root, ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &c); /* * If you force fault a drive that's resilvering, its scan stats can * get frozen in time, giving the false impression that it's * being resilvered. That's why we check the state to see if the vdev * is healthy before reporting "resilvering" or "repairing". */ if (ps != NULL && ps->pss_state == DSS_SCANNING && children == 0 && vs->vs_state == VDEV_STATE_HEALTHY) { if (vs->vs_scan_processed != 0) { (void) printf(gettext(" (%s)"), (ps->pss_func == POOL_SCAN_RESILVER) ? "resilvering" : "repairing"); } else if (vs->vs_resilver_deferred) { (void) printf(gettext(" (awaiting resilver)")); } } /* The top-level vdevs have the rebuild stats */ if (vrs != NULL && vrs->vrs_state == VDEV_REBUILD_ACTIVE && children == 0 && vs->vs_state == VDEV_STATE_HEALTHY) { if (vs->vs_rebuild_processed != 0) { (void) printf(gettext(" (resilvering)")); } } if (cb->vcdl != NULL) { if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0) { printf(" "); zpool_print_cmd(cb->vcdl, zpool_get_name(zhp), path); } } /* Display vdev initialization and trim status for leaves. */ if (children == 0) { print_status_initialize(vs, cb->cb_print_vdev_init); print_status_trim(vs, cb->cb_print_vdev_trim); } (void) printf("\n"); for (c = 0; c < children; c++) { uint64_t islog = B_FALSE, ishole = B_FALSE; /* Don't print logs or holes here */ (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &islog); (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, &ishole); if (islog || ishole) continue; /* Only print normal classes here */ if (nvlist_exists(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS)) continue; /* Provide vdev_rebuild_stats to children if available */ if (vrs == NULL) { (void) nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_REBUILD_STATS, (uint64_t **)&vrs, &i); } vname = zpool_vdev_name(g_zfs, zhp, child[c], cb->cb_name_flags | VDEV_NAME_TYPE_ID); print_status_config(zhp, cb, vname, child[c], depth + 2, isspare, vrs); free(vname); } } /* * Print the configuration of an exported pool. Iterate over all vdevs in the * pool, printing out the name and status for each one. */ static void print_import_config(status_cbdata_t *cb, const char *name, nvlist_t *nv, int depth) { nvlist_t **child; uint_t c, children; vdev_stat_t *vs; const char *type; char *vname; verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); if (strcmp(type, VDEV_TYPE_MISSING) == 0 || strcmp(type, VDEV_TYPE_HOLE) == 0) return; verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &c) == 0); (void) printf("\t%*s%-*s", depth, "", cb->cb_namewidth - depth, name); (void) printf(" %s", zpool_state_to_name(vs->vs_state, vs->vs_aux)); if (vs->vs_aux != 0) { (void) printf(" "); switch (vs->vs_aux) { case VDEV_AUX_OPEN_FAILED: (void) printf(gettext("cannot open")); break; case VDEV_AUX_BAD_GUID_SUM: (void) printf(gettext("missing device")); break; case VDEV_AUX_NO_REPLICAS: (void) printf(gettext("insufficient replicas")); break; case VDEV_AUX_VERSION_NEWER: (void) printf(gettext("newer version")); break; case VDEV_AUX_UNSUP_FEAT: (void) printf(gettext("unsupported feature(s)")); break; case VDEV_AUX_ERR_EXCEEDED: (void) printf(gettext("too many errors")); break; case VDEV_AUX_ACTIVE: (void) printf(gettext("currently in use")); break; case VDEV_AUX_CHILDREN_OFFLINE: (void) printf(gettext("all children offline")); break; case VDEV_AUX_BAD_LABEL: (void) printf(gettext("invalid label")); break; default: (void) printf(gettext("corrupted data")); break; } } (void) printf("\n"); if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) return; for (c = 0; c < children; c++) { uint64_t is_log = B_FALSE; (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &is_log); if (is_log) continue; if (nvlist_exists(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS)) continue; vname = zpool_vdev_name(g_zfs, NULL, child[c], cb->cb_name_flags | VDEV_NAME_TYPE_ID); print_import_config(cb, vname, child[c], depth + 2); free(vname); } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, &child, &children) == 0) { (void) printf(gettext("\tcache\n")); for (c = 0; c < children; c++) { vname = zpool_vdev_name(g_zfs, NULL, child[c], cb->cb_name_flags); (void) printf("\t %s\n", vname); free(vname); } } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, &child, &children) == 0) { (void) printf(gettext("\tspares\n")); for (c = 0; c < children; c++) { vname = zpool_vdev_name(g_zfs, NULL, child[c], cb->cb_name_flags); (void) printf("\t %s\n", vname); free(vname); } } } /* * Print specialized class vdevs. * * These are recorded as top level vdevs in the main pool child array * but with "is_log" set to 1 or an "alloc_bias" string. We use either * print_status_config() or print_import_config() to print the top level * class vdevs then any of their children (eg mirrored slogs) are printed * recursively - which works because only the top level vdev is marked. */ static void print_class_vdevs(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t *nv, const char *class) { uint_t c, children; nvlist_t **child; boolean_t printed = B_FALSE; assert(zhp != NULL || !cb->cb_verbose); if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) return; for (c = 0; c < children; c++) { uint64_t is_log = B_FALSE; const char *bias = NULL; const char *type = NULL; (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &is_log); if (is_log) { bias = (char *)VDEV_ALLOC_CLASS_LOGS; } else { (void) nvlist_lookup_string(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS, &bias); (void) nvlist_lookup_string(child[c], ZPOOL_CONFIG_TYPE, &type); } if (bias == NULL || strcmp(bias, class) != 0) continue; if (!is_log && strcmp(type, VDEV_TYPE_INDIRECT) == 0) continue; if (!printed) { (void) printf("\t%s\t\n", gettext(class)); printed = B_TRUE; } char *name = zpool_vdev_name(g_zfs, zhp, child[c], cb->cb_name_flags | VDEV_NAME_TYPE_ID); if (cb->cb_print_status) print_status_config(zhp, cb, name, child[c], 2, B_FALSE, NULL); else print_import_config(cb, name, child[c], 2); free(name); } } /* * Display the status for the given pool. */ static int show_import(nvlist_t *config, boolean_t report_error) { uint64_t pool_state; vdev_stat_t *vs; const char *name; uint64_t guid; uint64_t hostid = 0; const char *msgid; const char *hostname = "unknown"; nvlist_t *nvroot, *nvinfo; zpool_status_t reason; zpool_errata_t errata; const char *health; uint_t vsc; const char *comment; status_cbdata_t cb = { 0 }; verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, &name) == 0); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) == 0); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &pool_state) == 0); verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) == 0); health = zpool_state_to_name(vs->vs_state, vs->vs_aux); reason = zpool_import_status(config, &msgid, &errata); /* * If we're importing using a cachefile, then we won't report any * errors unless we are in the scan phase of the import. */ if (reason != ZPOOL_STATUS_OK && !report_error) return (reason); (void) printf(gettext(" pool: %s\n"), name); (void) printf(gettext(" id: %llu\n"), (u_longlong_t)guid); (void) printf(gettext(" state: %s"), health); if (pool_state == POOL_STATE_DESTROYED) (void) printf(gettext(" (DESTROYED)")); (void) printf("\n"); switch (reason) { case ZPOOL_STATUS_MISSING_DEV_R: case ZPOOL_STATUS_MISSING_DEV_NR: case ZPOOL_STATUS_BAD_GUID_SUM: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices are " "missing from the system.\n")); break; case ZPOOL_STATUS_CORRUPT_LABEL_R: case ZPOOL_STATUS_CORRUPT_LABEL_NR: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices contains" " corrupted data.\n")); break; case ZPOOL_STATUS_CORRUPT_DATA: (void) printf( gettext(" status: The pool data is corrupted.\n")); break; case ZPOOL_STATUS_OFFLINE_DEV: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices " "are offlined.\n")); break; case ZPOOL_STATUS_CORRUPT_POOL: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool metadata is " "corrupted.\n")); break; case ZPOOL_STATUS_VERSION_OLDER: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool is formatted using " "a legacy on-disk version.\n")); break; case ZPOOL_STATUS_VERSION_NEWER: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool is formatted using " "an incompatible version.\n")); break; case ZPOOL_STATUS_FEAT_DISABLED: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("Some supported " "features are not enabled on the pool.\n\t" "(Note that they may be intentionally disabled " "if the\n\t'compatibility' property is set.)\n")); break; case ZPOOL_STATUS_COMPATIBILITY_ERR: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("Error reading or parsing " "the file(s) indicated by the 'compatibility'\n" "property.\n")); break; case ZPOOL_STATUS_INCOMPATIBLE_FEAT: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more features " "are enabled on the pool despite not being\n" "requested by the 'compatibility' property.\n")); break; case ZPOOL_STATUS_UNSUP_FEAT_READ: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool uses the following " "feature(s) not supported on this system:\n")); color_start(ANSI_YELLOW); zpool_print_unsup_feat(config); color_end(); break; case ZPOOL_STATUS_UNSUP_FEAT_WRITE: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool can only be " "accessed in read-only mode on this system. It\n\tcannot be" " accessed in read-write mode because it uses the " "following\n\tfeature(s) not supported on this system:\n")); color_start(ANSI_YELLOW); zpool_print_unsup_feat(config); color_end(); break; case ZPOOL_STATUS_HOSTID_ACTIVE: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool is currently " "imported by another system.\n")); break; case ZPOOL_STATUS_HOSTID_REQUIRED: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool has the " "multihost property on. It cannot\n\tbe safely imported " "when the system hostid is not set.\n")); break; case ZPOOL_STATUS_HOSTID_MISMATCH: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool was last accessed " "by another system.\n")); break; case ZPOOL_STATUS_FAULTED_DEV_R: case ZPOOL_STATUS_FAULTED_DEV_NR: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices are " "faulted.\n")); break; case ZPOOL_STATUS_BAD_LOG: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("An intent log record cannot " "be read.\n")); break; case ZPOOL_STATUS_RESILVERING: case ZPOOL_STATUS_REBUILDING: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices were " "being resilvered.\n")); break; case ZPOOL_STATUS_ERRATA: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("Errata #%d detected.\n"), errata); break; case ZPOOL_STATUS_NON_NATIVE_ASHIFT: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices are " "configured to use a non-native block size.\n" "\tExpect reduced performance.\n")); break; default: /* * No other status can be seen when importing pools. */ assert(reason == ZPOOL_STATUS_OK); } /* * Print out an action according to the overall state of the pool. */ if (vs->vs_state == VDEV_STATE_HEALTHY) { if (reason == ZPOOL_STATUS_VERSION_OLDER || reason == ZPOOL_STATUS_FEAT_DISABLED) { (void) printf(gettext(" action: The pool can be " "imported using its name or numeric identifier, " "though\n\tsome features will not be available " "without an explicit 'zpool upgrade'.\n")); } else if (reason == ZPOOL_STATUS_COMPATIBILITY_ERR) { (void) printf(gettext(" action: The pool can be " "imported using its name or numeric\n\tidentifier, " "though the file(s) indicated by its " "'compatibility'\n\tproperty cannot be parsed at " "this time.\n")); } else if (reason == ZPOOL_STATUS_HOSTID_MISMATCH) { (void) printf(gettext(" action: The pool can be " "imported using its name or numeric " "identifier and\n\tthe '-f' flag.\n")); } else if (reason == ZPOOL_STATUS_ERRATA) { switch (errata) { case ZPOOL_ERRATA_NONE: break; case ZPOOL_ERRATA_ZOL_2094_SCRUB: (void) printf(gettext(" action: The pool can " "be imported using its name or numeric " "identifier,\n\thowever there is a compat" "ibility issue which should be corrected" "\n\tby running 'zpool scrub'\n")); break; case ZPOOL_ERRATA_ZOL_2094_ASYNC_DESTROY: (void) printf(gettext(" action: The pool can" "not be imported with this version of ZFS " "due to\n\tan active asynchronous destroy. " "Revert to an earlier version\n\tand " "allow the destroy to complete before " "updating.\n")); break; case ZPOOL_ERRATA_ZOL_6845_ENCRYPTION: (void) printf(gettext(" action: Existing " "encrypted datasets contain an on-disk " "incompatibility, which\n\tneeds to be " "corrected. Backup these datasets to new " "encrypted datasets\n\tand destroy the " "old ones.\n")); break; case ZPOOL_ERRATA_ZOL_8308_ENCRYPTION: (void) printf(gettext(" action: Existing " "encrypted snapshots and bookmarks contain " "an on-disk\n\tincompatibility. This may " "cause on-disk corruption if they are used" "\n\twith 'zfs recv'. To correct the " "issue, enable the bookmark_v2 feature.\n\t" "No additional action is needed if there " "are no encrypted snapshots or\n\t" "bookmarks. If preserving the encrypted " "snapshots and bookmarks is\n\trequired, " "use a non-raw send to backup and restore " "them. Alternately,\n\tthey may be removed" " to resolve the incompatibility.\n")); break; default: /* * All errata must contain an action message. */ assert(0); } } else { (void) printf(gettext(" action: The pool can be " "imported using its name or numeric " "identifier.\n")); } } else if (vs->vs_state == VDEV_STATE_DEGRADED) { (void) printf(gettext(" action: The pool can be imported " "despite missing or damaged devices. The\n\tfault " "tolerance of the pool may be compromised if imported.\n")); } else { switch (reason) { case ZPOOL_STATUS_VERSION_NEWER: (void) printf(gettext(" action: The pool cannot be " "imported. Access the pool on a system running " "newer\n\tsoftware, or recreate the pool from " "backup.\n")); break; case ZPOOL_STATUS_UNSUP_FEAT_READ: printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("The pool cannot be " "imported. Access the pool on a system that " "supports\n\tthe required feature(s), or recreate " "the pool from backup.\n")); break; case ZPOOL_STATUS_UNSUP_FEAT_WRITE: printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("The pool cannot be " "imported in read-write mode. Import the pool " "with\n" "\t\"-o readonly=on\", access the pool on a system " "that supports the\n\trequired feature(s), or " "recreate the pool from backup.\n")); break; case ZPOOL_STATUS_MISSING_DEV_R: case ZPOOL_STATUS_MISSING_DEV_NR: case ZPOOL_STATUS_BAD_GUID_SUM: (void) printf(gettext(" action: The pool cannot be " "imported. Attach the missing\n\tdevices and try " "again.\n")); break; case ZPOOL_STATUS_HOSTID_ACTIVE: VERIFY0(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, &nvinfo)); if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_HOSTNAME)) hostname = fnvlist_lookup_string(nvinfo, ZPOOL_CONFIG_MMP_HOSTNAME); if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_HOSTID)) hostid = fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_HOSTID); (void) printf(gettext(" action: The pool must be " "exported from %s (hostid=%"PRIx64")\n\tbefore it " "can be safely imported.\n"), hostname, hostid); break; case ZPOOL_STATUS_HOSTID_REQUIRED: (void) printf(gettext(" action: Set a unique system " "hostid with the zgenhostid(8) command.\n")); break; default: (void) printf(gettext(" action: The pool cannot be " "imported due to damaged devices or data.\n")); } } /* Print the comment attached to the pool. */ if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) (void) printf(gettext("comment: %s\n"), comment); /* * If the state is "closed" or "can't open", and the aux state * is "corrupt data": */ if (((vs->vs_state == VDEV_STATE_CLOSED) || (vs->vs_state == VDEV_STATE_CANT_OPEN)) && (vs->vs_aux == VDEV_AUX_CORRUPT_DATA)) { if (pool_state == POOL_STATE_DESTROYED) (void) printf(gettext("\tThe pool was destroyed, " "but can be imported using the '-Df' flags.\n")); else if (pool_state != POOL_STATE_EXPORTED) (void) printf(gettext("\tThe pool may be active on " "another system, but can be imported using\n\t" "the '-f' flag.\n")); } if (msgid != NULL) { (void) printf(gettext( " see: https://openzfs.github.io/openzfs-docs/msg/%s\n"), msgid); } (void) printf(gettext(" config:\n\n")); cb.cb_namewidth = max_width(NULL, nvroot, 0, strlen(name), VDEV_NAME_TYPE_ID); if (cb.cb_namewidth < 10) cb.cb_namewidth = 10; print_import_config(&cb, name, nvroot, 0); print_class_vdevs(NULL, &cb, nvroot, VDEV_ALLOC_BIAS_DEDUP); print_class_vdevs(NULL, &cb, nvroot, VDEV_ALLOC_BIAS_SPECIAL); print_class_vdevs(NULL, &cb, nvroot, VDEV_ALLOC_CLASS_LOGS); if (reason == ZPOOL_STATUS_BAD_GUID_SUM) { (void) printf(gettext("\n\tAdditional devices are known to " "be part of this pool, though their\n\texact " "configuration cannot be determined.\n")); } return (0); } static boolean_t zfs_force_import_required(nvlist_t *config) { uint64_t state; uint64_t hostid = 0; nvlist_t *nvinfo; state = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE); nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO); /* * The hostid on LOAD_INFO comes from the MOS label via * spa_tryimport(). If its not there then we're likely talking to an * older kernel, so use the top one, which will be from the label * discovered in zpool_find_import(), or if a cachefile is in use, the * local hostid. */ if (nvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_HOSTID, &hostid) != 0) (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_HOSTID, &hostid); if (state != POOL_STATE_EXPORTED && hostid != get_system_hostid()) return (B_TRUE); if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_STATE)) { mmp_state_t mmp_state = fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_STATE); if (mmp_state != MMP_STATE_INACTIVE) return (B_TRUE); } return (B_FALSE); } /* * Perform the import for the given configuration. This passes the heavy * lifting off to zpool_import_props(), and then mounts the datasets contained * within the pool. */ static int do_import(nvlist_t *config, const char *newname, const char *mntopts, nvlist_t *props, int flags) { int ret = 0; int ms_status = 0; zpool_handle_t *zhp; const char *name; uint64_t version; name = fnvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME); version = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION); if (!SPA_VERSION_IS_SUPPORTED(version)) { (void) fprintf(stderr, gettext("cannot import '%s': pool " "is formatted using an unsupported ZFS version\n"), name); return (1); } else if (zfs_force_import_required(config) && !(flags & ZFS_IMPORT_ANY_HOST)) { mmp_state_t mmp_state = MMP_STATE_INACTIVE; nvlist_t *nvinfo; nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO); if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_STATE)) mmp_state = fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_STATE); if (mmp_state == MMP_STATE_ACTIVE) { const char *hostname = ""; uint64_t hostid = 0; if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_HOSTNAME)) hostname = fnvlist_lookup_string(nvinfo, ZPOOL_CONFIG_MMP_HOSTNAME); if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_HOSTID)) hostid = fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_HOSTID); (void) fprintf(stderr, gettext("cannot import '%s': " "pool is imported on %s (hostid: " "0x%"PRIx64")\nExport the pool on the other " "system, then run 'zpool import'.\n"), name, hostname, hostid); } else if (mmp_state == MMP_STATE_NO_HOSTID) { (void) fprintf(stderr, gettext("Cannot import '%s': " "pool has the multihost property on and the\n" "system's hostid is not set. Set a unique hostid " "with the zgenhostid(8) command.\n"), name); } else { const char *hostname = ""; time_t timestamp = 0; uint64_t hostid = 0; if (nvlist_exists(nvinfo, ZPOOL_CONFIG_HOSTNAME)) hostname = fnvlist_lookup_string(nvinfo, ZPOOL_CONFIG_HOSTNAME); else if (nvlist_exists(config, ZPOOL_CONFIG_HOSTNAME)) hostname = fnvlist_lookup_string(config, ZPOOL_CONFIG_HOSTNAME); if (nvlist_exists(config, ZPOOL_CONFIG_TIMESTAMP)) timestamp = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_TIMESTAMP); if (nvlist_exists(nvinfo, ZPOOL_CONFIG_HOSTID)) hostid = fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_HOSTID); else if (nvlist_exists(config, ZPOOL_CONFIG_HOSTID)) hostid = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_HOSTID); (void) fprintf(stderr, gettext("cannot import '%s': " "pool was previously in use from another system.\n" "Last accessed by %s (hostid=%"PRIx64") at %s" "The pool can be imported, use 'zpool import -f' " "to import the pool.\n"), name, hostname, hostid, ctime(×tamp)); } return (1); } if (zpool_import_props(g_zfs, config, newname, props, flags) != 0) return (1); if (newname != NULL) name = newname; if ((zhp = zpool_open_canfail(g_zfs, name)) == NULL) return (1); /* * Loading keys is best effort. We don't want to return immediately * if it fails but we do want to give the error to the caller. */ if (flags & ZFS_IMPORT_LOAD_KEYS && zfs_crypto_attempt_load_keys(g_zfs, name) != 0) ret = 1; if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL && !(flags & ZFS_IMPORT_ONLY)) { ms_status = zpool_enable_datasets(zhp, mntopts, 0); if (ms_status == EZFS_SHAREFAILED) { (void) fprintf(stderr, gettext("Import was " "successful, but unable to share some datasets")); } else if (ms_status == EZFS_MOUNTFAILED) { (void) fprintf(stderr, gettext("Import was " "successful, but unable to mount some datasets")); } } zpool_close(zhp); return (ret); } static int import_pools(nvlist_t *pools, nvlist_t *props, char *mntopts, int flags, char *orig_name, char *new_name, boolean_t do_destroyed, boolean_t pool_specified, boolean_t do_all, importargs_t *import) { nvlist_t *config = NULL; nvlist_t *found_config = NULL; uint64_t pool_state; /* * At this point we have a list of import candidate configs. Even if * we were searching by pool name or guid, we still need to * post-process the list to deal with pool state and possible * duplicate names. */ int err = 0; nvpair_t *elem = NULL; boolean_t first = B_TRUE; while ((elem = nvlist_next_nvpair(pools, elem)) != NULL) { verify(nvpair_value_nvlist(elem, &config) == 0); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &pool_state) == 0); if (!do_destroyed && pool_state == POOL_STATE_DESTROYED) continue; if (do_destroyed && pool_state != POOL_STATE_DESTROYED) continue; verify(nvlist_add_nvlist(config, ZPOOL_LOAD_POLICY, import->policy) == 0); if (!pool_specified) { if (first) first = B_FALSE; else if (!do_all) (void) fputc('\n', stdout); if (do_all) { err |= do_import(config, NULL, mntopts, props, flags); } else { /* * If we're importing from cachefile, then * we don't want to report errors until we * are in the scan phase of the import. If * we get an error, then we return that error * to invoke the scan phase. */ if (import->cachefile && !import->scan) err = show_import(config, B_FALSE); else (void) show_import(config, B_TRUE); } } else if (import->poolname != NULL) { const char *name; /* * We are searching for a pool based on name. */ verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, &name) == 0); if (strcmp(name, import->poolname) == 0) { if (found_config != NULL) { (void) fprintf(stderr, gettext( "cannot import '%s': more than " "one matching pool\n"), import->poolname); (void) fprintf(stderr, gettext( "import by numeric ID instead\n")); err = B_TRUE; } found_config = config; } } else { uint64_t guid; /* * Search for a pool by guid. */ verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) == 0); if (guid == import->guid) found_config = config; } } /* * If we were searching for a specific pool, verify that we found a * pool, and then do the import. */ if (pool_specified && err == 0) { if (found_config == NULL) { (void) fprintf(stderr, gettext("cannot import '%s': " "no such pool available\n"), orig_name); err = B_TRUE; } else { err |= do_import(found_config, new_name, mntopts, props, flags); } } /* * If we were just looking for pools, report an error if none were * found. */ if (!pool_specified && first) (void) fprintf(stderr, gettext("no pools available to import\n")); return (err); } typedef struct target_exists_args { const char *poolname; uint64_t poolguid; } target_exists_args_t; static int name_or_guid_exists(zpool_handle_t *zhp, void *data) { target_exists_args_t *args = data; nvlist_t *config = zpool_get_config(zhp, NULL); int found = 0; if (config == NULL) return (0); if (args->poolname != NULL) { const char *pool_name; verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, &pool_name) == 0); if (strcmp(pool_name, args->poolname) == 0) found = 1; } else { uint64_t pool_guid; verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid) == 0); if (pool_guid == args->poolguid) found = 1; } zpool_close(zhp); return (found); } /* * zpool checkpoint * checkpoint --discard * * -d Discard the checkpoint from a checkpointed * --discard pool. * * -w Wait for discarding a checkpoint to complete. * --wait * * Checkpoints the specified pool, by taking a "snapshot" of its * current state. A pool can only have one checkpoint at a time. */ int zpool_do_checkpoint(int argc, char **argv) { boolean_t discard, wait; char *pool; zpool_handle_t *zhp; int c, err; struct option long_options[] = { {"discard", no_argument, NULL, 'd'}, {"wait", no_argument, NULL, 'w'}, {0, 0, 0, 0} }; discard = B_FALSE; wait = B_FALSE; while ((c = getopt_long(argc, argv, ":dw", long_options, NULL)) != -1) { switch (c) { case 'd': discard = B_TRUE; break; case 'w': wait = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } if (wait && !discard) { (void) fprintf(stderr, gettext("--wait only valid when " "--discard also specified\n")); usage(B_FALSE); } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing pool argument\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } pool = argv[0]; if ((zhp = zpool_open(g_zfs, pool)) == NULL) { /* As a special case, check for use of '/' in the name */ if (strchr(pool, '/') != NULL) (void) fprintf(stderr, gettext("'zpool checkpoint' " "doesn't work on datasets. To save the state " "of a dataset from a specific point in time " "please use 'zfs snapshot'\n")); return (1); } if (discard) { err = (zpool_discard_checkpoint(zhp) != 0); if (err == 0 && wait) err = zpool_wait(zhp, ZPOOL_WAIT_CKPT_DISCARD); } else { err = (zpool_checkpoint(zhp) != 0); } zpool_close(zhp); return (err); } #define CHECKPOINT_OPT 1024 /* * zpool import [-d dir] [-D] * import [-o mntopts] [-o prop=value] ... [-R root] [-D] [-l] * [-d dir | -c cachefile | -s] [-f] -a * import [-o mntopts] [-o prop=value] ... [-R root] [-D] [-l] * [-d dir | -c cachefile | -s] [-f] [-n] [-F] * [newpool] * * -c Read pool information from a cachefile instead of searching * devices. If importing from a cachefile config fails, then * fallback to searching for devices only in the directories that * exist in the cachefile. * * -d Scan in a specific directory, other than /dev/. More than * one directory can be specified using multiple '-d' options. * * -D Scan for previously destroyed pools or import all or only * specified destroyed pools. * * -R Temporarily import the pool, with all mountpoints relative to * the given root. The pool will remain exported when the machine * is rebooted. * * -V Import even in the presence of faulted vdevs. This is an * intentionally undocumented option for testing purposes, and * treats the pool configuration as complete, leaving any bad * vdevs in the FAULTED state. In other words, it does verbatim * import. * * -f Force import, even if it appears that the pool is active. * * -F Attempt rewind if necessary. * * -n See if rewind would work, but don't actually rewind. * * -N Import the pool but don't mount datasets. * * -T Specify a starting txg to use for import. This option is * intentionally undocumented option for testing purposes. * * -a Import all pools found. * * -l Load encryption keys while importing. * * -o Set property=value and/or temporary mount options (without '='). * * -s Scan using the default search path, the libblkid cache will * not be consulted. * * --rewind-to-checkpoint * Import the pool and revert back to the checkpoint. * * The import command scans for pools to import, and import pools based on pool * name and GUID. The pool can also be renamed as part of the import process. */ int zpool_do_import(int argc, char **argv) { char **searchdirs = NULL; char *env, *envdup = NULL; int nsearch = 0; int c; int err = 0; nvlist_t *pools = NULL; boolean_t do_all = B_FALSE; boolean_t do_destroyed = B_FALSE; char *mntopts = NULL; uint64_t searchguid = 0; char *searchname = NULL; char *propval; nvlist_t *policy = NULL; nvlist_t *props = NULL; int flags = ZFS_IMPORT_NORMAL; uint32_t rewind_policy = ZPOOL_NO_REWIND; boolean_t dryrun = B_FALSE; boolean_t do_rewind = B_FALSE; boolean_t xtreme_rewind = B_FALSE; boolean_t do_scan = B_FALSE; boolean_t pool_exists = B_FALSE; boolean_t pool_specified = B_FALSE; uint64_t txg = -1ULL; char *cachefile = NULL; importargs_t idata = { 0 }; char *endptr; struct option long_options[] = { {"rewind-to-checkpoint", no_argument, NULL, CHECKPOINT_OPT}, {0, 0, 0, 0} }; /* check options */ while ((c = getopt_long(argc, argv, ":aCc:d:DEfFlmnNo:R:stT:VX", long_options, NULL)) != -1) { switch (c) { case 'a': do_all = B_TRUE; break; case 'c': cachefile = optarg; break; case 'd': searchdirs = safe_realloc(searchdirs, (nsearch + 1) * sizeof (char *)); searchdirs[nsearch++] = optarg; break; case 'D': do_destroyed = B_TRUE; break; case 'f': flags |= ZFS_IMPORT_ANY_HOST; break; case 'F': do_rewind = B_TRUE; break; case 'l': flags |= ZFS_IMPORT_LOAD_KEYS; break; case 'm': flags |= ZFS_IMPORT_MISSING_LOG; break; case 'n': dryrun = B_TRUE; break; case 'N': flags |= ZFS_IMPORT_ONLY; break; case 'o': if ((propval = strchr(optarg, '=')) != NULL) { *propval = '\0'; propval++; if (add_prop_list(optarg, propval, &props, B_TRUE)) goto error; } else { mntopts = optarg; } break; case 'R': if (add_prop_list(zpool_prop_to_name( ZPOOL_PROP_ALTROOT), optarg, &props, B_TRUE)) goto error; if (add_prop_list_default(zpool_prop_to_name( ZPOOL_PROP_CACHEFILE), "none", &props)) goto error; break; case 's': do_scan = B_TRUE; break; case 't': flags |= ZFS_IMPORT_TEMP_NAME; if (add_prop_list_default(zpool_prop_to_name( ZPOOL_PROP_CACHEFILE), "none", &props)) goto error; break; case 'T': errno = 0; txg = strtoull(optarg, &endptr, 0); if (errno != 0 || *endptr != '\0') { (void) fprintf(stderr, gettext("invalid txg value\n")); usage(B_FALSE); } rewind_policy = ZPOOL_DO_REWIND | ZPOOL_EXTREME_REWIND; break; case 'V': flags |= ZFS_IMPORT_VERBATIM; break; case 'X': xtreme_rewind = B_TRUE; break; case CHECKPOINT_OPT: flags |= ZFS_IMPORT_CHECKPOINT; break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (cachefile && nsearch != 0) { (void) fprintf(stderr, gettext("-c is incompatible with -d\n")); usage(B_FALSE); } if (cachefile && do_scan) { (void) fprintf(stderr, gettext("-c is incompatible with -s\n")); usage(B_FALSE); } if ((flags & ZFS_IMPORT_LOAD_KEYS) && (flags & ZFS_IMPORT_ONLY)) { (void) fprintf(stderr, gettext("-l is incompatible with -N\n")); usage(B_FALSE); } if ((flags & ZFS_IMPORT_LOAD_KEYS) && !do_all && argc == 0) { (void) fprintf(stderr, gettext("-l is only meaningful during " "an import\n")); usage(B_FALSE); } if ((dryrun || xtreme_rewind) && !do_rewind) { (void) fprintf(stderr, gettext("-n or -X only meaningful with -F\n")); usage(B_FALSE); } if (dryrun) rewind_policy = ZPOOL_TRY_REWIND; else if (do_rewind) rewind_policy = ZPOOL_DO_REWIND; if (xtreme_rewind) rewind_policy |= ZPOOL_EXTREME_REWIND; /* In the future, we can capture further policy and include it here */ if (nvlist_alloc(&policy, NV_UNIQUE_NAME, 0) != 0 || nvlist_add_uint64(policy, ZPOOL_LOAD_REQUEST_TXG, txg) != 0 || nvlist_add_uint32(policy, ZPOOL_LOAD_REWIND_POLICY, rewind_policy) != 0) goto error; /* check argument count */ if (do_all) { if (argc != 0) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } } else { if (argc > 2) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } } /* * Check for the effective uid. We do this explicitly here because * otherwise any attempt to discover pools will silently fail. */ if (argc == 0 && geteuid() != 0) { (void) fprintf(stderr, gettext("cannot " "discover pools: permission denied\n")); free(searchdirs); nvlist_free(props); nvlist_free(policy); return (1); } /* * Depending on the arguments given, we do one of the following: * * Iterate through all pools and display information about * each one. * * -a Iterate through all pools and try to import each one. * * Find the pool that corresponds to the given GUID/pool * name and import that one. * * -D Above options applies only to destroyed pools. */ if (argc != 0) { char *endptr; errno = 0; searchguid = strtoull(argv[0], &endptr, 10); if (errno != 0 || *endptr != '\0') { searchname = argv[0]; searchguid = 0; } pool_specified = B_TRUE; /* * User specified a name or guid. Ensure it's unique. */ target_exists_args_t search = {searchname, searchguid}; pool_exists = zpool_iter(g_zfs, name_or_guid_exists, &search); } /* * Check the environment for the preferred search path. */ if ((searchdirs == NULL) && (env = getenv("ZPOOL_IMPORT_PATH"))) { char *dir, *tmp = NULL; envdup = strdup(env); for (dir = strtok_r(envdup, ":", &tmp); dir != NULL; dir = strtok_r(NULL, ":", &tmp)) { searchdirs = safe_realloc(searchdirs, (nsearch + 1) * sizeof (char *)); searchdirs[nsearch++] = dir; } } idata.path = searchdirs; idata.paths = nsearch; idata.poolname = searchname; idata.guid = searchguid; idata.cachefile = cachefile; idata.scan = do_scan; idata.policy = policy; libpc_handle_t lpch = { .lpc_lib_handle = g_zfs, .lpc_ops = &libzfs_config_ops, .lpc_printerr = B_TRUE }; pools = zpool_search_import(&lpch, &idata); if (pools != NULL && pool_exists && (argc == 1 || strcmp(argv[0], argv[1]) == 0)) { (void) fprintf(stderr, gettext("cannot import '%s': " "a pool with that name already exists\n"), argv[0]); (void) fprintf(stderr, gettext("use the form '%s " " ' to give it a new name\n"), "zpool import"); err = 1; } else if (pools == NULL && pool_exists) { (void) fprintf(stderr, gettext("cannot import '%s': " "a pool with that name is already created/imported,\n"), argv[0]); (void) fprintf(stderr, gettext("and no additional pools " "with that name were found\n")); err = 1; } else if (pools == NULL) { if (argc != 0) { (void) fprintf(stderr, gettext("cannot import '%s': " "no such pool available\n"), argv[0]); } err = 1; } if (err == 1) { free(searchdirs); free(envdup); nvlist_free(policy); nvlist_free(pools); nvlist_free(props); return (1); } err = import_pools(pools, props, mntopts, flags, argc >= 1 ? argv[0] : NULL, argc >= 2 ? argv[1] : NULL, do_destroyed, pool_specified, do_all, &idata); /* * If we're using the cachefile and we failed to import, then * fallback to scanning the directory for pools that match * those in the cachefile. */ if (err != 0 && cachefile != NULL) { (void) printf(gettext("cachefile import failed, retrying\n")); /* * We use the scan flag to gather the directories that exist * in the cachefile. If we need to fallback to searching for * the pool config, we will only search devices in these * directories. */ idata.scan = B_TRUE; nvlist_free(pools); pools = zpool_search_import(&lpch, &idata); err = import_pools(pools, props, mntopts, flags, argc >= 1 ? argv[0] : NULL, argc >= 2 ? argv[1] : NULL, do_destroyed, pool_specified, do_all, &idata); } error: nvlist_free(props); nvlist_free(pools); nvlist_free(policy); free(searchdirs); free(envdup); return (err ? 1 : 0); } /* * zpool sync [-f] [pool] ... * * -f (undocumented) force uberblock (and config including zpool cache file) * update. * * Sync the specified pool(s). * Without arguments "zpool sync" will sync all pools. * This command initiates TXG sync(s) and will return after the TXG(s) commit. * */ static int zpool_do_sync(int argc, char **argv) { int ret; boolean_t force = B_FALSE; /* check options */ while ((ret = getopt(argc, argv, "f")) != -1) { switch (ret) { case 'f': force = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* if argc == 0 we will execute zpool_sync_one on all pools */ ret = for_each_pool(argc, argv, B_FALSE, NULL, ZFS_TYPE_POOL, B_FALSE, zpool_sync_one, &force); return (ret); } typedef struct iostat_cbdata { uint64_t cb_flags; int cb_namewidth; int cb_iteration; boolean_t cb_verbose; boolean_t cb_literal; boolean_t cb_scripted; zpool_list_t *cb_list; vdev_cmd_data_list_t *vcdl; vdev_cbdata_t cb_vdevs; } iostat_cbdata_t; /* iostat labels */ typedef struct name_and_columns { const char *name; /* Column name */ unsigned int columns; /* Center name to this number of columns */ } name_and_columns_t; #define IOSTAT_MAX_LABELS 15 /* Max number of labels on one line */ static const name_and_columns_t iostat_top_labels[][IOSTAT_MAX_LABELS] = { [IOS_DEFAULT] = {{"capacity", 2}, {"operations", 2}, {"bandwidth", 2}, {NULL}}, [IOS_LATENCY] = {{"total_wait", 2}, {"disk_wait", 2}, {"syncq_wait", 2}, {"asyncq_wait", 2}, {"scrub", 1}, {"trim", 1}, {"rebuild", 1}, {NULL}}, [IOS_QUEUES] = {{"syncq_read", 2}, {"syncq_write", 2}, {"asyncq_read", 2}, {"asyncq_write", 2}, {"scrubq_read", 2}, {"trimq_write", 2}, {"rebuildq_write", 2}, {NULL}}, [IOS_L_HISTO] = {{"total_wait", 2}, {"disk_wait", 2}, {"syncq_wait", 2}, {"asyncq_wait", 2}, {NULL}}, [IOS_RQ_HISTO] = {{"sync_read", 2}, {"sync_write", 2}, {"async_read", 2}, {"async_write", 2}, {"scrub", 2}, {"trim", 2}, {"rebuild", 2}, {NULL}}, }; /* Shorthand - if "columns" field not set, default to 1 column */ static const name_and_columns_t iostat_bottom_labels[][IOSTAT_MAX_LABELS] = { [IOS_DEFAULT] = {{"alloc"}, {"free"}, {"read"}, {"write"}, {"read"}, {"write"}, {NULL}}, [IOS_LATENCY] = {{"read"}, {"write"}, {"read"}, {"write"}, {"read"}, {"write"}, {"read"}, {"write"}, {"wait"}, {"wait"}, {"wait"}, {NULL}}, [IOS_QUEUES] = {{"pend"}, {"activ"}, {"pend"}, {"activ"}, {"pend"}, {"activ"}, {"pend"}, {"activ"}, {"pend"}, {"activ"}, {"pend"}, {"activ"}, {"pend"}, {"activ"}, {NULL}}, [IOS_L_HISTO] = {{"read"}, {"write"}, {"read"}, {"write"}, {"read"}, {"write"}, {"read"}, {"write"}, {"scrub"}, {"trim"}, {"rebuild"}, {NULL}}, [IOS_RQ_HISTO] = {{"ind"}, {"agg"}, {"ind"}, {"agg"}, {"ind"}, {"agg"}, {"ind"}, {"agg"}, {"ind"}, {"agg"}, {"ind"}, {"agg"}, {"ind"}, {"agg"}, {NULL}}, }; static const char *histo_to_title[] = { [IOS_L_HISTO] = "latency", [IOS_RQ_HISTO] = "req_size", }; /* * Return the number of labels in a null-terminated name_and_columns_t * array. * */ static unsigned int label_array_len(const name_and_columns_t *labels) { int i = 0; while (labels[i].name) i++; return (i); } /* * Return the number of strings in a null-terminated string array. * For example: * * const char foo[] = {"bar", "baz", NULL} * * returns 2 */ static uint64_t str_array_len(const char *array[]) { uint64_t i = 0; while (array[i]) i++; return (i); } /* * Return a default column width for default/latency/queue columns. This does * not include histograms, which have their columns autosized. */ static unsigned int default_column_width(iostat_cbdata_t *cb, enum iostat_type type) { unsigned long column_width = 5; /* Normal niceprint */ static unsigned long widths[] = { /* * Choose some sane default column sizes for printing the * raw numbers. */ [IOS_DEFAULT] = 15, /* 1PB capacity */ [IOS_LATENCY] = 10, /* 1B ns = 10sec */ [IOS_QUEUES] = 6, /* 1M queue entries */ [IOS_L_HISTO] = 10, /* 1B ns = 10sec */ [IOS_RQ_HISTO] = 6, /* 1M queue entries */ }; if (cb->cb_literal) column_width = widths[type]; return (column_width); } /* * Print the column labels, i.e: * * capacity operations bandwidth * alloc free read write read write ... * * If force_column_width is set, use it for the column width. If not set, use * the default column width. */ static void print_iostat_labels(iostat_cbdata_t *cb, unsigned int force_column_width, const name_and_columns_t labels[][IOSTAT_MAX_LABELS]) { int i, idx, s; int text_start, rw_column_width, spaces_to_end; uint64_t flags = cb->cb_flags; uint64_t f; unsigned int column_width = force_column_width; /* For each bit set in flags */ for (f = flags; f; f &= ~(1ULL << idx)) { idx = lowbit64(f) - 1; if (!force_column_width) column_width = default_column_width(cb, idx); /* Print our top labels centered over "read write" label. */ for (i = 0; i < label_array_len(labels[idx]); i++) { const char *name = labels[idx][i].name; /* * We treat labels[][].columns == 0 as shorthand * for one column. It makes writing out the label * tables more concise. */ unsigned int columns = MAX(1, labels[idx][i].columns); unsigned int slen = strlen(name); rw_column_width = (column_width * columns) + (2 * (columns - 1)); text_start = (int)((rw_column_width) / columns - slen / columns); if (text_start < 0) text_start = 0; printf(" "); /* Two spaces between columns */ /* Space from beginning of column to label */ for (s = 0; s < text_start; s++) printf(" "); printf("%s", name); /* Print space after label to end of column */ spaces_to_end = rw_column_width - text_start - slen; if (spaces_to_end < 0) spaces_to_end = 0; for (s = 0; s < spaces_to_end; s++) printf(" "); } } } /* * print_cmd_columns - Print custom column titles from -c * * If the user specified the "zpool status|iostat -c" then print their custom * column titles in the header. For example, print_cmd_columns() would print * the " col1 col2" part of this: * * $ zpool iostat -vc 'echo col1=val1; echo col2=val2' * ... * capacity operations bandwidth * pool alloc free read write read write col1 col2 * ---------- ----- ----- ----- ----- ----- ----- ---- ---- * mypool 269K 1008M 0 0 107 946 * mirror 269K 1008M 0 0 107 946 * sdb - - 0 0 102 473 val1 val2 * sdc - - 0 0 5 473 val1 val2 * ---------- ----- ----- ----- ----- ----- ----- ---- ---- */ static void print_cmd_columns(vdev_cmd_data_list_t *vcdl, int use_dashes) { int i, j; vdev_cmd_data_t *data = &vcdl->data[0]; if (vcdl->count == 0 || data == NULL) return; /* * Each vdev cmd should have the same column names unless the user did * something weird with their cmd. Just take the column names from the * first vdev and assume it works for all of them. */ for (i = 0; i < vcdl->uniq_cols_cnt; i++) { printf(" "); if (use_dashes) { for (j = 0; j < vcdl->uniq_cols_width[i]; j++) printf("-"); } else { printf_color(ANSI_BOLD, "%*s", vcdl->uniq_cols_width[i], vcdl->uniq_cols[i]); } } } /* * Utility function to print out a line of dashes like: * * -------------------------------- ----- ----- ----- ----- ----- * * ...or a dashed named-row line like: * * logs - - - - - * * @cb: iostat data * * @force_column_width If non-zero, use the value as the column width. * Otherwise use the default column widths. * * @name: Print a dashed named-row line starting * with @name. Otherwise, print a regular * dashed line. */ static void print_iostat_dashes(iostat_cbdata_t *cb, unsigned int force_column_width, const char *name) { int i; unsigned int namewidth; uint64_t flags = cb->cb_flags; uint64_t f; int idx; const name_and_columns_t *labels; const char *title; if (cb->cb_flags & IOS_ANYHISTO_M) { title = histo_to_title[IOS_HISTO_IDX(cb->cb_flags)]; } else if (cb->cb_vdevs.cb_names_count) { title = "vdev"; } else { title = "pool"; } namewidth = MAX(MAX(strlen(title), cb->cb_namewidth), name ? strlen(name) : 0); if (name) { printf("%-*s", namewidth, name); } else { for (i = 0; i < namewidth; i++) (void) printf("-"); } /* For each bit in flags */ for (f = flags; f; f &= ~(1ULL << idx)) { unsigned int column_width; idx = lowbit64(f) - 1; if (force_column_width) column_width = force_column_width; else column_width = default_column_width(cb, idx); labels = iostat_bottom_labels[idx]; for (i = 0; i < label_array_len(labels); i++) { if (name) printf(" %*s-", column_width - 1, " "); else printf(" %.*s", column_width, "--------------------"); } } } static void print_iostat_separator_impl(iostat_cbdata_t *cb, unsigned int force_column_width) { print_iostat_dashes(cb, force_column_width, NULL); } static void print_iostat_separator(iostat_cbdata_t *cb) { print_iostat_separator_impl(cb, 0); } static void print_iostat_header_impl(iostat_cbdata_t *cb, unsigned int force_column_width, const char *histo_vdev_name) { unsigned int namewidth; const char *title; color_start(ANSI_BOLD); if (cb->cb_flags & IOS_ANYHISTO_M) { title = histo_to_title[IOS_HISTO_IDX(cb->cb_flags)]; } else if (cb->cb_vdevs.cb_names_count) { title = "vdev"; } else { title = "pool"; } namewidth = MAX(MAX(strlen(title), cb->cb_namewidth), histo_vdev_name ? strlen(histo_vdev_name) : 0); if (histo_vdev_name) printf("%-*s", namewidth, histo_vdev_name); else printf("%*s", namewidth, ""); print_iostat_labels(cb, force_column_width, iostat_top_labels); printf("\n"); printf("%-*s", namewidth, title); print_iostat_labels(cb, force_column_width, iostat_bottom_labels); if (cb->vcdl != NULL) print_cmd_columns(cb->vcdl, 0); printf("\n"); print_iostat_separator_impl(cb, force_column_width); if (cb->vcdl != NULL) print_cmd_columns(cb->vcdl, 1); color_end(); printf("\n"); } static void print_iostat_header(iostat_cbdata_t *cb) { print_iostat_header_impl(cb, 0, NULL); } /* * Prints a size string (i.e. 120M) with the suffix ("M") colored * by order of magnitude. Uses column_size to add padding. */ static void print_stat_color(const char *statbuf, unsigned int column_size) { fputs(" ", stdout); size_t len = strlen(statbuf); while (len < column_size) { fputc(' ', stdout); column_size--; } if (*statbuf == '0') { color_start(ANSI_GRAY); fputc('0', stdout); } else { for (; *statbuf; statbuf++) { if (*statbuf == 'K') color_start(ANSI_GREEN); else if (*statbuf == 'M') color_start(ANSI_YELLOW); else if (*statbuf == 'G') color_start(ANSI_RED); else if (*statbuf == 'T') color_start(ANSI_BOLD_BLUE); else if (*statbuf == 'P') color_start(ANSI_MAGENTA); else if (*statbuf == 'E') color_start(ANSI_CYAN); fputc(*statbuf, stdout); if (--column_size <= 0) break; } } color_end(); } /* * Display a single statistic. */ static void print_one_stat(uint64_t value, enum zfs_nicenum_format format, unsigned int column_size, boolean_t scripted) { char buf[64]; zfs_nicenum_format(value, buf, sizeof (buf), format); if (scripted) printf("\t%s", buf); else print_stat_color(buf, column_size); } /* * Calculate the default vdev stats * * Subtract oldvs from newvs, apply a scaling factor, and save the resulting * stats into calcvs. */ static void calc_default_iostats(vdev_stat_t *oldvs, vdev_stat_t *newvs, vdev_stat_t *calcvs) { int i; memcpy(calcvs, newvs, sizeof (*calcvs)); for (i = 0; i < ARRAY_SIZE(calcvs->vs_ops); i++) calcvs->vs_ops[i] = (newvs->vs_ops[i] - oldvs->vs_ops[i]); for (i = 0; i < ARRAY_SIZE(calcvs->vs_bytes); i++) calcvs->vs_bytes[i] = (newvs->vs_bytes[i] - oldvs->vs_bytes[i]); } /* * Internal representation of the extended iostats data. * * The extended iostat stats are exported in nvlists as either uint64_t arrays * or single uint64_t's. We make both look like arrays to make them easier * to process. In order to make single uint64_t's look like arrays, we set * __data to the stat data, and then set *data = &__data with count = 1. Then, * we can just use *data and count. */ struct stat_array { uint64_t *data; uint_t count; /* Number of entries in data[] */ uint64_t __data; /* Only used when data is a single uint64_t */ }; static uint64_t stat_histo_max(struct stat_array *nva, unsigned int len) { uint64_t max = 0; int i; for (i = 0; i < len; i++) max = MAX(max, array64_max(nva[i].data, nva[i].count)); return (max); } /* * Helper function to lookup a uint64_t array or uint64_t value and store its * data as a stat_array. If the nvpair is a single uint64_t value, then we make * it look like a one element array to make it easier to process. */ static int nvpair64_to_stat_array(nvlist_t *nvl, const char *name, struct stat_array *nva) { nvpair_t *tmp; int ret; verify(nvlist_lookup_nvpair(nvl, name, &tmp) == 0); switch (nvpair_type(tmp)) { case DATA_TYPE_UINT64_ARRAY: ret = nvpair_value_uint64_array(tmp, &nva->data, &nva->count); break; case DATA_TYPE_UINT64: ret = nvpair_value_uint64(tmp, &nva->__data); nva->data = &nva->__data; nva->count = 1; break; default: /* Not a uint64_t */ ret = EINVAL; break; } return (ret); } /* * Given a list of nvlist names, look up the extended stats in newnv and oldnv, * subtract them, and return the results in a newly allocated stat_array. * You must free the returned array after you are done with it with * free_calc_stats(). * * Additionally, you can set "oldnv" to NULL if you simply want the newnv * values. */ static struct stat_array * calc_and_alloc_stats_ex(const char **names, unsigned int len, nvlist_t *oldnv, nvlist_t *newnv) { nvlist_t *oldnvx = NULL, *newnvx; struct stat_array *oldnva, *newnva, *calcnva; int i, j; unsigned int alloc_size = (sizeof (struct stat_array)) * len; /* Extract our extended stats nvlist from the main list */ verify(nvlist_lookup_nvlist(newnv, ZPOOL_CONFIG_VDEV_STATS_EX, &newnvx) == 0); if (oldnv) { verify(nvlist_lookup_nvlist(oldnv, ZPOOL_CONFIG_VDEV_STATS_EX, &oldnvx) == 0); } newnva = safe_malloc(alloc_size); oldnva = safe_malloc(alloc_size); calcnva = safe_malloc(alloc_size); for (j = 0; j < len; j++) { verify(nvpair64_to_stat_array(newnvx, names[j], &newnva[j]) == 0); calcnva[j].count = newnva[j].count; alloc_size = calcnva[j].count * sizeof (calcnva[j].data[0]); calcnva[j].data = safe_malloc(alloc_size); memcpy(calcnva[j].data, newnva[j].data, alloc_size); if (oldnvx) { verify(nvpair64_to_stat_array(oldnvx, names[j], &oldnva[j]) == 0); for (i = 0; i < oldnva[j].count; i++) calcnva[j].data[i] -= oldnva[j].data[i]; } } free(newnva); free(oldnva); return (calcnva); } static void free_calc_stats(struct stat_array *nva, unsigned int len) { int i; for (i = 0; i < len; i++) free(nva[i].data); free(nva); } static void print_iostat_histo(struct stat_array *nva, unsigned int len, iostat_cbdata_t *cb, unsigned int column_width, unsigned int namewidth, double scale) { int i, j; char buf[6]; uint64_t val; enum zfs_nicenum_format format; unsigned int buckets; unsigned int start_bucket; if (cb->cb_literal) format = ZFS_NICENUM_RAW; else format = ZFS_NICENUM_1024; /* All these histos are the same size, so just use nva[0].count */ buckets = nva[0].count; if (cb->cb_flags & IOS_RQ_HISTO_M) { /* Start at 512 - req size should never be lower than this */ start_bucket = 9; } else { start_bucket = 0; } for (j = start_bucket; j < buckets; j++) { /* Print histogram bucket label */ if (cb->cb_flags & IOS_L_HISTO_M) { /* Ending range of this bucket */ val = (1UL << (j + 1)) - 1; zfs_nicetime(val, buf, sizeof (buf)); } else { /* Request size (starting range of bucket) */ val = (1UL << j); zfs_nicenum(val, buf, sizeof (buf)); } if (cb->cb_scripted) printf("%llu", (u_longlong_t)val); else printf("%-*s", namewidth, buf); /* Print the values on the line */ for (i = 0; i < len; i++) { print_one_stat(nva[i].data[j] * scale, format, column_width, cb->cb_scripted); } printf("\n"); } } static void print_solid_separator(unsigned int length) { while (length--) printf("-"); printf("\n"); } static void print_iostat_histos(iostat_cbdata_t *cb, nvlist_t *oldnv, nvlist_t *newnv, double scale, const char *name) { unsigned int column_width; unsigned int namewidth; unsigned int entire_width; enum iostat_type type; struct stat_array *nva; const char **names; unsigned int names_len; /* What type of histo are we? */ type = IOS_HISTO_IDX(cb->cb_flags); /* Get NULL-terminated array of nvlist names for our histo */ names = vsx_type_to_nvlist[type]; names_len = str_array_len(names); /* num of names */ nva = calc_and_alloc_stats_ex(names, names_len, oldnv, newnv); if (cb->cb_literal) { column_width = MAX(5, (unsigned int) log10(stat_histo_max(nva, names_len)) + 1); } else { column_width = 5; } namewidth = MAX(cb->cb_namewidth, strlen(histo_to_title[IOS_HISTO_IDX(cb->cb_flags)])); /* * Calculate the entire line width of what we're printing. The * +2 is for the two spaces between columns: */ /* read write */ /* ----- ----- */ /* |___| <---------- column_width */ /* */ /* |__________| <--- entire_width */ /* */ entire_width = namewidth + (column_width + 2) * label_array_len(iostat_bottom_labels[type]); if (cb->cb_scripted) printf("%s\n", name); else print_iostat_header_impl(cb, column_width, name); print_iostat_histo(nva, names_len, cb, column_width, namewidth, scale); free_calc_stats(nva, names_len); if (!cb->cb_scripted) print_solid_separator(entire_width); } /* * Calculate the average latency of a power-of-two latency histogram */ static uint64_t single_histo_average(uint64_t *histo, unsigned int buckets) { int i; uint64_t count = 0, total = 0; for (i = 0; i < buckets; i++) { /* * Our buckets are power-of-two latency ranges. Use the * midpoint latency of each bucket to calculate the average. * For example: * * Bucket Midpoint * 8ns-15ns: 12ns * 16ns-31ns: 24ns * ... */ if (histo[i] != 0) { total += histo[i] * (((1UL << i) + ((1UL << i)/2))); count += histo[i]; } } /* Prevent divide by zero */ return (count == 0 ? 0 : total / count); } static void print_iostat_queues(iostat_cbdata_t *cb, nvlist_t *newnv) { const char *names[] = { ZPOOL_CONFIG_VDEV_SYNC_R_PEND_QUEUE, ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_SYNC_W_PEND_QUEUE, ZPOOL_CONFIG_VDEV_SYNC_W_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_ASYNC_R_PEND_QUEUE, ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_ASYNC_W_PEND_QUEUE, ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_SCRUB_PEND_QUEUE, ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_TRIM_PEND_QUEUE, ZPOOL_CONFIG_VDEV_TRIM_ACTIVE_QUEUE, ZPOOL_CONFIG_VDEV_REBUILD_PEND_QUEUE, ZPOOL_CONFIG_VDEV_REBUILD_ACTIVE_QUEUE, }; struct stat_array *nva; unsigned int column_width = default_column_width(cb, IOS_QUEUES); enum zfs_nicenum_format format; nva = calc_and_alloc_stats_ex(names, ARRAY_SIZE(names), NULL, newnv); if (cb->cb_literal) format = ZFS_NICENUM_RAW; else format = ZFS_NICENUM_1024; for (int i = 0; i < ARRAY_SIZE(names); i++) { uint64_t val = nva[i].data[0]; print_one_stat(val, format, column_width, cb->cb_scripted); } free_calc_stats(nva, ARRAY_SIZE(names)); } static void print_iostat_latency(iostat_cbdata_t *cb, nvlist_t *oldnv, nvlist_t *newnv) { int i; uint64_t val; const char *names[] = { ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_SYNC_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_SYNC_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO, ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO, ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO, ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO, ZPOOL_CONFIG_VDEV_REBUILD_LAT_HISTO, }; struct stat_array *nva; unsigned int column_width = default_column_width(cb, IOS_LATENCY); enum zfs_nicenum_format format; nva = calc_and_alloc_stats_ex(names, ARRAY_SIZE(names), oldnv, newnv); if (cb->cb_literal) format = ZFS_NICENUM_RAWTIME; else format = ZFS_NICENUM_TIME; /* Print our avg latencies on the line */ for (i = 0; i < ARRAY_SIZE(names); i++) { /* Compute average latency for a latency histo */ val = single_histo_average(nva[i].data, nva[i].count); print_one_stat(val, format, column_width, cb->cb_scripted); } free_calc_stats(nva, ARRAY_SIZE(names)); } /* * Print default statistics (capacity/operations/bandwidth) */ static void print_iostat_default(vdev_stat_t *vs, iostat_cbdata_t *cb, double scale) { unsigned int column_width = default_column_width(cb, IOS_DEFAULT); enum zfs_nicenum_format format; char na; /* char to print for "not applicable" values */ if (cb->cb_literal) { format = ZFS_NICENUM_RAW; na = '0'; } else { format = ZFS_NICENUM_1024; na = '-'; } /* only toplevel vdevs have capacity stats */ if (vs->vs_space == 0) { if (cb->cb_scripted) printf("\t%c\t%c", na, na); else printf(" %*c %*c", column_width, na, column_width, na); } else { print_one_stat(vs->vs_alloc, format, column_width, cb->cb_scripted); print_one_stat(vs->vs_space - vs->vs_alloc, format, column_width, cb->cb_scripted); } print_one_stat((uint64_t)(vs->vs_ops[ZIO_TYPE_READ] * scale), format, column_width, cb->cb_scripted); print_one_stat((uint64_t)(vs->vs_ops[ZIO_TYPE_WRITE] * scale), format, column_width, cb->cb_scripted); print_one_stat((uint64_t)(vs->vs_bytes[ZIO_TYPE_READ] * scale), format, column_width, cb->cb_scripted); print_one_stat((uint64_t)(vs->vs_bytes[ZIO_TYPE_WRITE] * scale), format, column_width, cb->cb_scripted); } static const char *const class_name[] = { VDEV_ALLOC_BIAS_DEDUP, VDEV_ALLOC_BIAS_SPECIAL, VDEV_ALLOC_CLASS_LOGS }; /* * Print out all the statistics for the given vdev. This can either be the * toplevel configuration, or called recursively. If 'name' is NULL, then this * is a verbose output, and we don't want to display the toplevel pool stats. * * Returns the number of stat lines printed. */ static unsigned int print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv, nvlist_t *newnv, iostat_cbdata_t *cb, int depth) { nvlist_t **oldchild, **newchild; uint_t c, children, oldchildren; vdev_stat_t *oldvs, *newvs, *calcvs; vdev_stat_t zerovs = { 0 }; char *vname; int i; int ret = 0; uint64_t tdelta; double scale; if (strcmp(name, VDEV_TYPE_INDIRECT) == 0) return (ret); calcvs = safe_malloc(sizeof (*calcvs)); if (oldnv != NULL) { verify(nvlist_lookup_uint64_array(oldnv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&oldvs, &c) == 0); } else { oldvs = &zerovs; } /* Do we only want to see a specific vdev? */ for (i = 0; i < cb->cb_vdevs.cb_names_count; i++) { /* Yes we do. Is this the vdev? */ if (strcmp(name, cb->cb_vdevs.cb_names[i]) == 0) { /* * This is our vdev. Since it is the only vdev we * will be displaying, make depth = 0 so that it * doesn't get indented. */ depth = 0; break; } } if (cb->cb_vdevs.cb_names_count && (i == cb->cb_vdevs.cb_names_count)) { /* Couldn't match the name */ goto children; } verify(nvlist_lookup_uint64_array(newnv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&newvs, &c) == 0); /* * Print the vdev name unless it's is a histogram. Histograms * display the vdev name in the header itself. */ if (!(cb->cb_flags & IOS_ANYHISTO_M)) { if (cb->cb_scripted) { printf("%s", name); } else { if (strlen(name) + depth > cb->cb_namewidth) (void) printf("%*s%s", depth, "", name); else (void) printf("%*s%s%*s", depth, "", name, (int)(cb->cb_namewidth - strlen(name) - depth), ""); } } /* Calculate our scaling factor */ tdelta = newvs->vs_timestamp - oldvs->vs_timestamp; if ((oldvs->vs_timestamp == 0) && (cb->cb_flags & IOS_ANYHISTO_M)) { /* * If we specify printing histograms with no time interval, then * print the histogram numbers over the entire lifetime of the * vdev. */ scale = 1; } else { if (tdelta == 0) scale = 1.0; else scale = (double)NANOSEC / tdelta; } if (cb->cb_flags & IOS_DEFAULT_M) { calc_default_iostats(oldvs, newvs, calcvs); print_iostat_default(calcvs, cb, scale); } if (cb->cb_flags & IOS_LATENCY_M) print_iostat_latency(cb, oldnv, newnv); if (cb->cb_flags & IOS_QUEUES_M) print_iostat_queues(cb, newnv); if (cb->cb_flags & IOS_ANYHISTO_M) { printf("\n"); print_iostat_histos(cb, oldnv, newnv, scale, name); } if (cb->vcdl != NULL) { const char *path; if (nvlist_lookup_string(newnv, ZPOOL_CONFIG_PATH, &path) == 0) { printf(" "); zpool_print_cmd(cb->vcdl, zpool_get_name(zhp), path); } } if (!(cb->cb_flags & IOS_ANYHISTO_M)) printf("\n"); ret++; children: free(calcvs); if (!cb->cb_verbose) return (ret); if (nvlist_lookup_nvlist_array(newnv, ZPOOL_CONFIG_CHILDREN, &newchild, &children) != 0) return (ret); if (oldnv) { if (nvlist_lookup_nvlist_array(oldnv, ZPOOL_CONFIG_CHILDREN, &oldchild, &oldchildren) != 0) return (ret); children = MIN(oldchildren, children); } /* * print normal top-level devices */ for (c = 0; c < children; c++) { uint64_t ishole = B_FALSE, islog = B_FALSE; (void) nvlist_lookup_uint64(newchild[c], ZPOOL_CONFIG_IS_HOLE, &ishole); (void) nvlist_lookup_uint64(newchild[c], ZPOOL_CONFIG_IS_LOG, &islog); if (ishole || islog) continue; if (nvlist_exists(newchild[c], ZPOOL_CONFIG_ALLOCATION_BIAS)) continue; vname = zpool_vdev_name(g_zfs, zhp, newchild[c], cb->cb_vdevs.cb_name_flags | VDEV_NAME_TYPE_ID); ret += print_vdev_stats(zhp, vname, oldnv ? oldchild[c] : NULL, newchild[c], cb, depth + 2); free(vname); } /* * print all other top-level devices */ for (uint_t n = 0; n < ARRAY_SIZE(class_name); n++) { boolean_t printed = B_FALSE; for (c = 0; c < children; c++) { uint64_t islog = B_FALSE; const char *bias = NULL; const char *type = NULL; (void) nvlist_lookup_uint64(newchild[c], ZPOOL_CONFIG_IS_LOG, &islog); if (islog) { bias = VDEV_ALLOC_CLASS_LOGS; } else { (void) nvlist_lookup_string(newchild[c], ZPOOL_CONFIG_ALLOCATION_BIAS, &bias); (void) nvlist_lookup_string(newchild[c], ZPOOL_CONFIG_TYPE, &type); } if (bias == NULL || strcmp(bias, class_name[n]) != 0) continue; if (!islog && strcmp(type, VDEV_TYPE_INDIRECT) == 0) continue; if (!printed) { if ((!(cb->cb_flags & IOS_ANYHISTO_M)) && !cb->cb_scripted && !cb->cb_vdevs.cb_names) { print_iostat_dashes(cb, 0, class_name[n]); } printf("\n"); printed = B_TRUE; } vname = zpool_vdev_name(g_zfs, zhp, newchild[c], cb->cb_vdevs.cb_name_flags | VDEV_NAME_TYPE_ID); ret += print_vdev_stats(zhp, vname, oldnv ? oldchild[c] : NULL, newchild[c], cb, depth + 2); free(vname); } } /* * Include level 2 ARC devices in iostat output */ if (nvlist_lookup_nvlist_array(newnv, ZPOOL_CONFIG_L2CACHE, &newchild, &children) != 0) return (ret); if (oldnv) { if (nvlist_lookup_nvlist_array(oldnv, ZPOOL_CONFIG_L2CACHE, &oldchild, &oldchildren) != 0) return (ret); children = MIN(oldchildren, children); } if (children > 0) { if ((!(cb->cb_flags & IOS_ANYHISTO_M)) && !cb->cb_scripted && !cb->cb_vdevs.cb_names) { print_iostat_dashes(cb, 0, "cache"); } printf("\n"); for (c = 0; c < children; c++) { vname = zpool_vdev_name(g_zfs, zhp, newchild[c], cb->cb_vdevs.cb_name_flags); ret += print_vdev_stats(zhp, vname, oldnv ? oldchild[c] : NULL, newchild[c], cb, depth + 2); free(vname); } } return (ret); } static int refresh_iostat(zpool_handle_t *zhp, void *data) { iostat_cbdata_t *cb = data; boolean_t missing; /* * If the pool has disappeared, remove it from the list and continue. */ if (zpool_refresh_stats(zhp, &missing) != 0) return (-1); if (missing) pool_list_remove(cb->cb_list, zhp); return (0); } /* * Callback to print out the iostats for the given pool. */ static int print_iostat(zpool_handle_t *zhp, void *data) { iostat_cbdata_t *cb = data; nvlist_t *oldconfig, *newconfig; nvlist_t *oldnvroot, *newnvroot; int ret; newconfig = zpool_get_config(zhp, &oldconfig); if (cb->cb_iteration == 1) oldconfig = NULL; verify(nvlist_lookup_nvlist(newconfig, ZPOOL_CONFIG_VDEV_TREE, &newnvroot) == 0); if (oldconfig == NULL) oldnvroot = NULL; else verify(nvlist_lookup_nvlist(oldconfig, ZPOOL_CONFIG_VDEV_TREE, &oldnvroot) == 0); ret = print_vdev_stats(zhp, zpool_get_name(zhp), oldnvroot, newnvroot, cb, 0); if ((ret != 0) && !(cb->cb_flags & IOS_ANYHISTO_M) && !cb->cb_scripted && cb->cb_verbose && !cb->cb_vdevs.cb_names_count) { print_iostat_separator(cb); if (cb->vcdl != NULL) { print_cmd_columns(cb->vcdl, 1); } printf("\n"); } return (ret); } static int get_columns(void) { struct winsize ws; int columns = 80; int error; if (isatty(STDOUT_FILENO)) { error = ioctl(STDOUT_FILENO, TIOCGWINSZ, &ws); if (error == 0) columns = ws.ws_col; } else { columns = 999; } return (columns); } /* * Return the required length of the pool/vdev name column. The minimum * allowed width and output formatting flags must be provided. */ static int get_namewidth(zpool_handle_t *zhp, int min_width, int flags, boolean_t verbose) { nvlist_t *config, *nvroot; int width = min_width; if ((config = zpool_get_config(zhp, NULL)) != NULL) { verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); size_t poolname_len = strlen(zpool_get_name(zhp)); if (verbose == B_FALSE) { width = MAX(poolname_len, min_width); } else { width = MAX(poolname_len, max_width(zhp, nvroot, 0, min_width, flags)); } } return (width); } /* * Parse the input string, get the 'interval' and 'count' value if there is one. */ static void get_interval_count(int *argcp, char **argv, float *iv, unsigned long *cnt) { float interval = 0; unsigned long count = 0; int argc = *argcp; /* * Determine if the last argument is an integer or a pool name */ if (argc > 0 && zfs_isnumber(argv[argc - 1])) { char *end; errno = 0; interval = strtof(argv[argc - 1], &end); if (*end == '\0' && errno == 0) { if (interval == 0) { (void) fprintf(stderr, gettext( "interval cannot be zero\n")); usage(B_FALSE); } /* * Ignore the last parameter */ argc--; } else { /* * If this is not a valid number, just plow on. The * user will get a more informative error message later * on. */ interval = 0; } } /* * If the last argument is also an integer, then we have both a count * and an interval. */ if (argc > 0 && zfs_isnumber(argv[argc - 1])) { char *end; errno = 0; count = interval; interval = strtof(argv[argc - 1], &end); if (*end == '\0' && errno == 0) { if (interval == 0) { (void) fprintf(stderr, gettext( "interval cannot be zero\n")); usage(B_FALSE); } /* * Ignore the last parameter */ argc--; } else { interval = 0; } } *iv = interval; *cnt = count; *argcp = argc; } static void get_timestamp_arg(char c) { if (c == 'u') timestamp_fmt = UDATE; else if (c == 'd') timestamp_fmt = DDATE; else usage(B_FALSE); } /* * Return stat flags that are supported by all pools by both the module and * zpool iostat. "*data" should be initialized to all 0xFFs before running. * It will get ANDed down until only the flags that are supported on all pools * remain. */ static int get_stat_flags_cb(zpool_handle_t *zhp, void *data) { uint64_t *mask = data; nvlist_t *config, *nvroot, *nvx; uint64_t flags = 0; int i, j; config = zpool_get_config(zhp, NULL); verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); /* Default stats are always supported, but for completeness.. */ if (nvlist_exists(nvroot, ZPOOL_CONFIG_VDEV_STATS)) flags |= IOS_DEFAULT_M; /* Get our extended stats nvlist from the main list */ if (nvlist_lookup_nvlist(nvroot, ZPOOL_CONFIG_VDEV_STATS_EX, &nvx) != 0) { /* * No extended stats; they're probably running an older * module. No big deal, we support that too. */ goto end; } /* For each extended stat, make sure all its nvpairs are supported */ for (j = 0; j < ARRAY_SIZE(vsx_type_to_nvlist); j++) { if (!vsx_type_to_nvlist[j][0]) continue; /* Start off by assuming the flag is supported, then check */ flags |= (1ULL << j); for (i = 0; vsx_type_to_nvlist[j][i]; i++) { if (!nvlist_exists(nvx, vsx_type_to_nvlist[j][i])) { /* flag isn't supported */ flags = flags & ~(1ULL << j); break; } } } end: *mask = *mask & flags; return (0); } /* * Return a bitmask of stats that are supported on all pools by both the module * and zpool iostat. */ static uint64_t get_stat_flags(zpool_list_t *list) { uint64_t mask = -1; /* * get_stat_flags_cb() will lop off bits from "mask" until only the * flags that are supported on all pools remain. */ pool_list_iter(list, B_FALSE, get_stat_flags_cb, &mask); return (mask); } /* * Return 1 if cb_data->cb_names[0] is this vdev's name, 0 otherwise. */ static int is_vdev_cb(void *zhp_data, nvlist_t *nv, void *cb_data) { uint64_t guid; vdev_cbdata_t *cb = cb_data; zpool_handle_t *zhp = zhp_data; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (0); return (guid == zpool_vdev_path_to_guid(zhp, cb->cb_names[0])); } /* * Returns 1 if cb_data->cb_names[0] is a vdev name, 0 otherwise. */ static int is_vdev(zpool_handle_t *zhp, void *cb_data) { return (for_each_vdev(zhp, is_vdev_cb, cb_data)); } /* * Check if vdevs are in a pool * * Return 1 if all argv[] strings are vdev names in pool "pool_name". Otherwise * return 0. If pool_name is NULL, then search all pools. */ static int are_vdevs_in_pool(int argc, char **argv, char *pool_name, vdev_cbdata_t *cb) { char **tmp_name; int ret = 0; int i; int pool_count = 0; if ((argc == 0) || !*argv) return (0); if (pool_name) pool_count = 1; /* Temporarily hijack cb_names for a second... */ tmp_name = cb->cb_names; /* Go though our list of prospective vdev names */ for (i = 0; i < argc; i++) { cb->cb_names = argv + i; /* Is this name a vdev in our pools? */ ret = for_each_pool(pool_count, &pool_name, B_TRUE, NULL, ZFS_TYPE_POOL, B_FALSE, is_vdev, cb); if (!ret) { /* No match */ break; } } cb->cb_names = tmp_name; return (ret); } static int is_pool_cb(zpool_handle_t *zhp, void *data) { char *name = data; if (strcmp(name, zpool_get_name(zhp)) == 0) return (1); return (0); } /* * Do we have a pool named *name? If so, return 1, otherwise 0. */ static int is_pool(char *name) { return (for_each_pool(0, NULL, B_TRUE, NULL, ZFS_TYPE_POOL, B_FALSE, is_pool_cb, name)); } /* Are all our argv[] strings pool names? If so return 1, 0 otherwise. */ static int are_all_pools(int argc, char **argv) { if ((argc == 0) || !*argv) return (0); while (--argc >= 0) if (!is_pool(argv[argc])) return (0); return (1); } /* * Helper function to print out vdev/pool names we can't resolve. Used for an * error message. */ static void error_list_unresolved_vdevs(int argc, char **argv, char *pool_name, vdev_cbdata_t *cb) { int i; char *name; char *str; for (i = 0; i < argc; i++) { name = argv[i]; if (is_pool(name)) str = gettext("pool"); else if (are_vdevs_in_pool(1, &name, pool_name, cb)) str = gettext("vdev in this pool"); else if (are_vdevs_in_pool(1, &name, NULL, cb)) str = gettext("vdev in another pool"); else str = gettext("unknown"); fprintf(stderr, "\t%s (%s)\n", name, str); } } /* * Same as get_interval_count(), but with additional checks to not misinterpret * guids as interval/count values. Assumes VDEV_NAME_GUID is set in * cb.cb_vdevs.cb_name_flags. */ static void get_interval_count_filter_guids(int *argc, char **argv, float *interval, unsigned long *count, iostat_cbdata_t *cb) { char **tmpargv = argv; int argc_for_interval = 0; /* Is the last arg an interval value? Or a guid? */ if (*argc >= 1 && !are_vdevs_in_pool(1, &argv[*argc - 1], NULL, &cb->cb_vdevs)) { /* * The last arg is not a guid, so it's probably an * interval value. */ argc_for_interval++; if (*argc >= 2 && !are_vdevs_in_pool(1, &argv[*argc - 2], NULL, &cb->cb_vdevs)) { /* * The 2nd to last arg is not a guid, so it's probably * an interval value. */ argc_for_interval++; } } /* Point to our list of possible intervals */ tmpargv = &argv[*argc - argc_for_interval]; *argc = *argc - argc_for_interval; get_interval_count(&argc_for_interval, tmpargv, interval, count); } /* * Terminal height, in rows. Returns -1 if stdout is not connected to a TTY or * if we were unable to determine its size. */ static int terminal_height(void) { struct winsize win; if (isatty(STDOUT_FILENO) == 0) return (-1); if (ioctl(STDOUT_FILENO, TIOCGWINSZ, &win) != -1 && win.ws_row > 0) return (win.ws_row); return (-1); } /* * Run one of the zpool status/iostat -c scripts with the help (-h) option and * print the result. * * name: Short name of the script ('iostat'). * path: Full path to the script ('/usr/local/etc/zfs/zpool.d/iostat'); */ static void print_zpool_script_help(char *name, char *path) { char *argv[] = {path, (char *)"-h", NULL}; char **lines = NULL; int lines_cnt = 0; int rc; rc = libzfs_run_process_get_stdout_nopath(path, argv, NULL, &lines, &lines_cnt); if (rc != 0 || lines == NULL || lines_cnt <= 0) { if (lines != NULL) libzfs_free_str_array(lines, lines_cnt); return; } for (int i = 0; i < lines_cnt; i++) if (!is_blank_str(lines[i])) printf(" %-14s %s\n", name, lines[i]); libzfs_free_str_array(lines, lines_cnt); } /* * Go though the zpool status/iostat -c scripts in the user's path, run their * help option (-h), and print out the results. */ static void print_zpool_dir_scripts(char *dirpath) { DIR *dir; struct dirent *ent; char fullpath[MAXPATHLEN]; struct stat dir_stat; if ((dir = opendir(dirpath)) != NULL) { /* print all the files and directories within directory */ while ((ent = readdir(dir)) != NULL) { if (snprintf(fullpath, sizeof (fullpath), "%s/%s", dirpath, ent->d_name) >= sizeof (fullpath)) { (void) fprintf(stderr, gettext("internal error: " "ZPOOL_SCRIPTS_PATH too large.\n")); exit(1); } /* Print the scripts */ if (stat(fullpath, &dir_stat) == 0) if (dir_stat.st_mode & S_IXUSR && S_ISREG(dir_stat.st_mode)) print_zpool_script_help(ent->d_name, fullpath); } closedir(dir); } } /* * Print out help text for all zpool status/iostat -c scripts. */ static void print_zpool_script_list(const char *subcommand) { char *dir, *sp, *tmp; printf(gettext("Available 'zpool %s -c' commands:\n"), subcommand); sp = zpool_get_cmd_search_path(); if (sp == NULL) return; for (dir = strtok_r(sp, ":", &tmp); dir != NULL; dir = strtok_r(NULL, ":", &tmp)) print_zpool_dir_scripts(dir); free(sp); } /* * Set the minimum pool/vdev name column width. The width must be at least 10, * but may be as large as the column width - 42 so it still fits on one line. * NOTE: 42 is the width of the default capacity/operations/bandwidth output */ static int get_namewidth_iostat(zpool_handle_t *zhp, void *data) { iostat_cbdata_t *cb = data; int width, available_width; /* * get_namewidth() returns the maximum width of any name in that column * for any pool/vdev/device line that will be output. */ width = get_namewidth(zhp, cb->cb_namewidth, cb->cb_vdevs.cb_name_flags | VDEV_NAME_TYPE_ID, cb->cb_verbose); /* * The width we are calculating is the width of the header and also the * padding width for names that are less than maximum width. The stats * take up 42 characters, so the width available for names is: */ available_width = get_columns() - 42; /* * If the maximum width fits on a screen, then great! Make everything * line up by justifying all lines to the same width. If that max * width is larger than what's available, the name plus stats won't fit * on one line, and justifying to that width would cause every line to * wrap on the screen. We only want lines with long names to wrap. * Limit the padding to what won't wrap. */ if (width > available_width) width = available_width; /* * And regardless of whatever the screen width is (get_columns can * return 0 if the width is not known or less than 42 for a narrow * terminal) have the width be a minimum of 10. */ if (width < 10) width = 10; /* Save the calculated width */ cb->cb_namewidth = width; return (0); } /* * zpool iostat [[-c [script1,script2,...]] [-lq]|[-rw]] [-ghHLpPvy] [-n name] * [-T d|u] [[ pool ...]|[pool vdev ...]|[vdev ...]] * [interval [count]] * * -c CMD For each vdev, run command CMD * -g Display guid for individual vdev name. * -L Follow links when resolving vdev path name. * -P Display full path for vdev name. * -v Display statistics for individual vdevs * -h Display help * -p Display values in parsable (exact) format. * -H Scripted mode. Don't display headers, and separate properties * by a single tab. * -l Display average latency * -q Display queue depths * -w Display latency histograms * -r Display request size histogram * -T Display a timestamp in date(1) or Unix format * -n Only print headers once * * This command can be tricky because we want to be able to deal with pool * creation/destruction as well as vdev configuration changes. The bulk of this * processing is handled by the pool_list_* routines in zpool_iter.c. We rely * on pool_list_update() to detect the addition of new pools. Configuration * changes are all handled within libzfs. */ int zpool_do_iostat(int argc, char **argv) { int c; int ret; int npools; float interval = 0; unsigned long count = 0; int winheight = 24; zpool_list_t *list; boolean_t verbose = B_FALSE; boolean_t latency = B_FALSE, l_histo = B_FALSE, rq_histo = B_FALSE; boolean_t queues = B_FALSE, parsable = B_FALSE, scripted = B_FALSE; boolean_t omit_since_boot = B_FALSE; boolean_t guid = B_FALSE; boolean_t follow_links = B_FALSE; boolean_t full_name = B_FALSE; boolean_t headers_once = B_FALSE; iostat_cbdata_t cb = { 0 }; char *cmd = NULL; /* Used for printing error message */ const char flag_to_arg[] = {[IOS_LATENCY] = 'l', [IOS_QUEUES] = 'q', [IOS_L_HISTO] = 'w', [IOS_RQ_HISTO] = 'r'}; uint64_t unsupported_flags; /* check options */ while ((c = getopt(argc, argv, "c:gLPT:vyhplqrwnH")) != -1) { switch (c) { case 'c': if (cmd != NULL) { fprintf(stderr, gettext("Can't set -c flag twice\n")); exit(1); } if (getenv("ZPOOL_SCRIPTS_ENABLED") != NULL && !libzfs_envvar_is_set("ZPOOL_SCRIPTS_ENABLED")) { fprintf(stderr, gettext( "Can't run -c, disabled by " "ZPOOL_SCRIPTS_ENABLED.\n")); exit(1); } if ((getuid() <= 0 || geteuid() <= 0) && !libzfs_envvar_is_set("ZPOOL_SCRIPTS_AS_ROOT")) { fprintf(stderr, gettext( "Can't run -c with root privileges " "unless ZPOOL_SCRIPTS_AS_ROOT is set.\n")); exit(1); } cmd = optarg; verbose = B_TRUE; break; case 'g': guid = B_TRUE; break; case 'L': follow_links = B_TRUE; break; case 'P': full_name = B_TRUE; break; case 'T': get_timestamp_arg(*optarg); break; case 'v': verbose = B_TRUE; break; case 'p': parsable = B_TRUE; break; case 'l': latency = B_TRUE; break; case 'q': queues = B_TRUE; break; case 'H': scripted = B_TRUE; break; case 'w': l_histo = B_TRUE; break; case 'r': rq_histo = B_TRUE; break; case 'y': omit_since_boot = B_TRUE; break; case 'n': headers_once = B_TRUE; break; case 'h': usage(B_FALSE); break; case '?': if (optopt == 'c') { print_zpool_script_list("iostat"); exit(0); } else { fprintf(stderr, gettext("invalid option '%c'\n"), optopt); } usage(B_FALSE); } } argc -= optind; argv += optind; cb.cb_literal = parsable; cb.cb_scripted = scripted; if (guid) cb.cb_vdevs.cb_name_flags |= VDEV_NAME_GUID; if (follow_links) cb.cb_vdevs.cb_name_flags |= VDEV_NAME_FOLLOW_LINKS; if (full_name) cb.cb_vdevs.cb_name_flags |= VDEV_NAME_PATH; cb.cb_iteration = 0; cb.cb_namewidth = 0; cb.cb_verbose = verbose; /* Get our interval and count values (if any) */ if (guid) { get_interval_count_filter_guids(&argc, argv, &interval, &count, &cb); } else { get_interval_count(&argc, argv, &interval, &count); } if (argc == 0) { /* No args, so just print the defaults. */ } else if (are_all_pools(argc, argv)) { /* All the args are pool names */ } else if (are_vdevs_in_pool(argc, argv, NULL, &cb.cb_vdevs)) { /* All the args are vdevs */ cb.cb_vdevs.cb_names = argv; cb.cb_vdevs.cb_names_count = argc; argc = 0; /* No pools to process */ } else if (are_all_pools(1, argv)) { /* The first arg is a pool name */ if (are_vdevs_in_pool(argc - 1, argv + 1, argv[0], &cb.cb_vdevs)) { /* ...and the rest are vdev names */ cb.cb_vdevs.cb_names = argv + 1; cb.cb_vdevs.cb_names_count = argc - 1; argc = 1; /* One pool to process */ } else { fprintf(stderr, gettext("Expected either a list of ")); fprintf(stderr, gettext("pools, or list of vdevs in")); fprintf(stderr, " \"%s\", ", argv[0]); fprintf(stderr, gettext("but got:\n")); error_list_unresolved_vdevs(argc - 1, argv + 1, argv[0], &cb.cb_vdevs); fprintf(stderr, "\n"); usage(B_FALSE); return (1); } } else { /* * The args don't make sense. The first arg isn't a pool name, * nor are all the args vdevs. */ fprintf(stderr, gettext("Unable to parse pools/vdevs list.\n")); fprintf(stderr, "\n"); return (1); } if (cb.cb_vdevs.cb_names_count != 0) { /* * If user specified vdevs, it implies verbose. */ cb.cb_verbose = B_TRUE; } /* * Construct the list of all interesting pools. */ ret = 0; if ((list = pool_list_get(argc, argv, NULL, ZFS_TYPE_POOL, parsable, &ret)) == NULL) return (1); if (pool_list_count(list) == 0 && argc != 0) { pool_list_free(list); return (1); } if (pool_list_count(list) == 0 && interval == 0) { pool_list_free(list); (void) fprintf(stderr, gettext("no pools available\n")); return (1); } if ((l_histo || rq_histo) && (cmd != NULL || latency || queues)) { pool_list_free(list); (void) fprintf(stderr, gettext("[-r|-w] isn't allowed with [-c|-l|-q]\n")); usage(B_FALSE); return (1); } if (l_histo && rq_histo) { pool_list_free(list); (void) fprintf(stderr, gettext("Only one of [-r|-w] can be passed at a time\n")); usage(B_FALSE); return (1); } /* * Enter the main iostat loop. */ cb.cb_list = list; if (l_histo) { /* * Histograms tables look out of place when you try to display * them with the other stats, so make a rule that you can only * print histograms by themselves. */ cb.cb_flags = IOS_L_HISTO_M; } else if (rq_histo) { cb.cb_flags = IOS_RQ_HISTO_M; } else { cb.cb_flags = IOS_DEFAULT_M; if (latency) cb.cb_flags |= IOS_LATENCY_M; if (queues) cb.cb_flags |= IOS_QUEUES_M; } /* * See if the module supports all the stats we want to display. */ unsupported_flags = cb.cb_flags & ~get_stat_flags(list); if (unsupported_flags) { uint64_t f; int idx; fprintf(stderr, gettext("The loaded zfs module doesn't support:")); /* for each bit set in unsupported_flags */ for (f = unsupported_flags; f; f &= ~(1ULL << idx)) { idx = lowbit64(f) - 1; fprintf(stderr, " -%c", flag_to_arg[idx]); } fprintf(stderr, ". Try running a newer module.\n"); pool_list_free(list); return (1); } for (;;) { if ((npools = pool_list_count(list)) == 0) (void) fprintf(stderr, gettext("no pools available\n")); else { /* * If this is the first iteration and -y was supplied * we skip any printing. */ boolean_t skip = (omit_since_boot && cb.cb_iteration == 0); /* * Refresh all statistics. This is done as an * explicit step before calculating the maximum name * width, so that any * configuration changes are * properly accounted for. */ (void) pool_list_iter(list, B_FALSE, refresh_iostat, &cb); /* * Iterate over all pools to determine the maximum width * for the pool / device name column across all pools. */ cb.cb_namewidth = 0; (void) pool_list_iter(list, B_FALSE, get_namewidth_iostat, &cb); if (timestamp_fmt != NODATE) print_timestamp(timestamp_fmt); if (cmd != NULL && cb.cb_verbose && !(cb.cb_flags & IOS_ANYHISTO_M)) { cb.vcdl = all_pools_for_each_vdev_run(argc, argv, cmd, g_zfs, cb.cb_vdevs.cb_names, cb.cb_vdevs.cb_names_count, cb.cb_vdevs.cb_name_flags); } else { cb.vcdl = NULL; } /* * Check terminal size so we can print headers * even when terminal window has its height * changed. */ winheight = terminal_height(); /* * Are we connected to TTY? If not, headers_once * should be true, to avoid breaking scripts. */ if (winheight < 0) headers_once = B_TRUE; /* * If it's the first time and we're not skipping it, * or either skip or verbose mode, print the header. * * The histogram code explicitly prints its header on * every vdev, so skip this for histograms. */ if (((++cb.cb_iteration == 1 && !skip) || (skip != verbose) || (!headers_once && (cb.cb_iteration % winheight) == 0)) && (!(cb.cb_flags & IOS_ANYHISTO_M)) && !cb.cb_scripted) print_iostat_header(&cb); if (skip) { (void) fflush(stdout); (void) fsleep(interval); continue; } pool_list_iter(list, B_FALSE, print_iostat, &cb); /* * If there's more than one pool, and we're not in * verbose mode (which prints a separator for us), * then print a separator. * * In addition, if we're printing specific vdevs then * we also want an ending separator. */ if (((npools > 1 && !verbose && !(cb.cb_flags & IOS_ANYHISTO_M)) || (!(cb.cb_flags & IOS_ANYHISTO_M) && cb.cb_vdevs.cb_names_count)) && !cb.cb_scripted) { print_iostat_separator(&cb); if (cb.vcdl != NULL) print_cmd_columns(cb.vcdl, 1); printf("\n"); } if (cb.vcdl != NULL) free_vdev_cmd_data_list(cb.vcdl); } if (interval == 0) break; if (count != 0 && --count == 0) break; (void) fflush(stdout); (void) fsleep(interval); } pool_list_free(list); return (ret); } typedef struct list_cbdata { boolean_t cb_verbose; int cb_name_flags; int cb_namewidth; boolean_t cb_scripted; zprop_list_t *cb_proplist; boolean_t cb_literal; } list_cbdata_t; /* * Given a list of columns to display, output appropriate headers for each one. */ static void print_header(list_cbdata_t *cb) { zprop_list_t *pl = cb->cb_proplist; char headerbuf[ZPOOL_MAXPROPLEN]; const char *header; boolean_t first = B_TRUE; boolean_t right_justify; size_t width = 0; for (; pl != NULL; pl = pl->pl_next) { width = pl->pl_width; if (first && cb->cb_verbose) { /* * Reset the width to accommodate the verbose listing * of devices. */ width = cb->cb_namewidth; } if (!first) (void) fputs(" ", stdout); else first = B_FALSE; right_justify = B_FALSE; if (pl->pl_prop != ZPROP_USERPROP) { header = zpool_prop_column_name(pl->pl_prop); right_justify = zpool_prop_align_right(pl->pl_prop); } else { int i; for (i = 0; pl->pl_user_prop[i] != '\0'; i++) headerbuf[i] = toupper(pl->pl_user_prop[i]); headerbuf[i] = '\0'; header = headerbuf; } if (pl->pl_next == NULL && !right_justify) (void) fputs(header, stdout); else if (right_justify) (void) printf("%*s", (int)width, header); else (void) printf("%-*s", (int)width, header); } (void) fputc('\n', stdout); } /* * Given a pool and a list of properties, print out all the properties according * to the described layout. Used by zpool_do_list(). */ static void print_pool(zpool_handle_t *zhp, list_cbdata_t *cb) { zprop_list_t *pl = cb->cb_proplist; boolean_t first = B_TRUE; char property[ZPOOL_MAXPROPLEN]; const char *propstr; boolean_t right_justify; size_t width; for (; pl != NULL; pl = pl->pl_next) { width = pl->pl_width; if (first && cb->cb_verbose) { /* * Reset the width to accommodate the verbose listing * of devices. */ width = cb->cb_namewidth; } if (!first) { if (cb->cb_scripted) (void) fputc('\t', stdout); else (void) fputs(" ", stdout); } else { first = B_FALSE; } right_justify = B_FALSE; if (pl->pl_prop != ZPROP_USERPROP) { if (zpool_get_prop(zhp, pl->pl_prop, property, sizeof (property), NULL, cb->cb_literal) != 0) propstr = "-"; else propstr = property; right_justify = zpool_prop_align_right(pl->pl_prop); } else if ((zpool_prop_feature(pl->pl_user_prop) || zpool_prop_unsupported(pl->pl_user_prop)) && zpool_prop_get_feature(zhp, pl->pl_user_prop, property, sizeof (property)) == 0) { propstr = property; } else if (zfs_prop_user(pl->pl_user_prop) && zpool_get_userprop(zhp, pl->pl_user_prop, property, sizeof (property), NULL) == 0) { propstr = property; } else { propstr = "-"; } /* * If this is being called in scripted mode, or if this is the * last column and it is left-justified, don't include a width * format specifier. */ if (cb->cb_scripted || (pl->pl_next == NULL && !right_justify)) (void) fputs(propstr, stdout); else if (right_justify) (void) printf("%*s", (int)width, propstr); else (void) printf("%-*s", (int)width, propstr); } (void) fputc('\n', stdout); } static void print_one_column(zpool_prop_t prop, uint64_t value, const char *str, boolean_t scripted, boolean_t valid, enum zfs_nicenum_format format) { char propval[64]; boolean_t fixed; size_t width = zprop_width(prop, &fixed, ZFS_TYPE_POOL); switch (prop) { case ZPOOL_PROP_SIZE: case ZPOOL_PROP_EXPANDSZ: case ZPOOL_PROP_CHECKPOINT: case ZPOOL_PROP_DEDUPRATIO: if (value == 0) (void) strlcpy(propval, "-", sizeof (propval)); else zfs_nicenum_format(value, propval, sizeof (propval), format); break; case ZPOOL_PROP_FRAGMENTATION: if (value == ZFS_FRAG_INVALID) { (void) strlcpy(propval, "-", sizeof (propval)); } else if (format == ZFS_NICENUM_RAW) { (void) snprintf(propval, sizeof (propval), "%llu", (unsigned long long)value); } else { (void) snprintf(propval, sizeof (propval), "%llu%%", (unsigned long long)value); } break; case ZPOOL_PROP_CAPACITY: /* capacity value is in parts-per-10,000 (aka permyriad) */ if (format == ZFS_NICENUM_RAW) (void) snprintf(propval, sizeof (propval), "%llu", (unsigned long long)value / 100); else (void) snprintf(propval, sizeof (propval), value < 1000 ? "%1.2f%%" : value < 10000 ? "%2.1f%%" : "%3.0f%%", value / 100.0); break; case ZPOOL_PROP_HEALTH: width = 8; (void) strlcpy(propval, str, sizeof (propval)); break; default: zfs_nicenum_format(value, propval, sizeof (propval), format); } if (!valid) (void) strlcpy(propval, "-", sizeof (propval)); if (scripted) (void) printf("\t%s", propval); else (void) printf(" %*s", (int)width, propval); } /* * print static default line per vdev * not compatible with '-o' option */ static void print_list_stats(zpool_handle_t *zhp, const char *name, nvlist_t *nv, list_cbdata_t *cb, int depth, boolean_t isspare) { nvlist_t **child; vdev_stat_t *vs; uint_t c, children; char *vname; boolean_t scripted = cb->cb_scripted; uint64_t islog = B_FALSE; const char *dashes = "%-*s - - - - " "- - - - -\n"; verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &c) == 0); if (name != NULL) { boolean_t toplevel = (vs->vs_space != 0); uint64_t cap; enum zfs_nicenum_format format; const char *state; if (cb->cb_literal) format = ZFS_NICENUM_RAW; else format = ZFS_NICENUM_1024; if (strcmp(name, VDEV_TYPE_INDIRECT) == 0) return; if (scripted) (void) printf("\t%s", name); else if (strlen(name) + depth > cb->cb_namewidth) (void) printf("%*s%s", depth, "", name); else (void) printf("%*s%s%*s", depth, "", name, (int)(cb->cb_namewidth - strlen(name) - depth), ""); /* * Print the properties for the individual vdevs. Some * properties are only applicable to toplevel vdevs. The * 'toplevel' boolean value is passed to the print_one_column() * to indicate that the value is valid. */ if (VDEV_STAT_VALID(vs_pspace, c) && vs->vs_pspace) print_one_column(ZPOOL_PROP_SIZE, vs->vs_pspace, NULL, scripted, B_TRUE, format); else print_one_column(ZPOOL_PROP_SIZE, vs->vs_space, NULL, scripted, toplevel, format); print_one_column(ZPOOL_PROP_ALLOCATED, vs->vs_alloc, NULL, scripted, toplevel, format); print_one_column(ZPOOL_PROP_FREE, vs->vs_space - vs->vs_alloc, NULL, scripted, toplevel, format); print_one_column(ZPOOL_PROP_CHECKPOINT, vs->vs_checkpoint_space, NULL, scripted, toplevel, format); print_one_column(ZPOOL_PROP_EXPANDSZ, vs->vs_esize, NULL, scripted, B_TRUE, format); print_one_column(ZPOOL_PROP_FRAGMENTATION, vs->vs_fragmentation, NULL, scripted, (vs->vs_fragmentation != ZFS_FRAG_INVALID && toplevel), format); cap = (vs->vs_space == 0) ? 0 : (vs->vs_alloc * 10000 / vs->vs_space); print_one_column(ZPOOL_PROP_CAPACITY, cap, NULL, scripted, toplevel, format); print_one_column(ZPOOL_PROP_DEDUPRATIO, 0, NULL, scripted, toplevel, format); state = zpool_state_to_name(vs->vs_state, vs->vs_aux); if (isspare) { if (vs->vs_aux == VDEV_AUX_SPARED) state = "INUSE"; else if (vs->vs_state == VDEV_STATE_HEALTHY) state = "AVAIL"; } print_one_column(ZPOOL_PROP_HEALTH, 0, state, scripted, B_TRUE, format); (void) fputc('\n', stdout); } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) return; /* list the normal vdevs first */ for (c = 0; c < children; c++) { uint64_t ishole = B_FALSE; if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, &ishole) == 0 && ishole) continue; if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &islog) == 0 && islog) continue; if (nvlist_exists(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS)) continue; vname = zpool_vdev_name(g_zfs, zhp, child[c], cb->cb_name_flags | VDEV_NAME_TYPE_ID); print_list_stats(zhp, vname, child[c], cb, depth + 2, B_FALSE); free(vname); } /* list the classes: 'logs', 'dedup', and 'special' */ for (uint_t n = 0; n < ARRAY_SIZE(class_name); n++) { boolean_t printed = B_FALSE; for (c = 0; c < children; c++) { const char *bias = NULL; const char *type = NULL; if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &islog) == 0 && islog) { bias = VDEV_ALLOC_CLASS_LOGS; } else { (void) nvlist_lookup_string(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS, &bias); (void) nvlist_lookup_string(child[c], ZPOOL_CONFIG_TYPE, &type); } if (bias == NULL || strcmp(bias, class_name[n]) != 0) continue; if (!islog && strcmp(type, VDEV_TYPE_INDIRECT) == 0) continue; if (!printed) { /* LINTED E_SEC_PRINTF_VAR_FMT */ (void) printf(dashes, cb->cb_namewidth, class_name[n]); printed = B_TRUE; } vname = zpool_vdev_name(g_zfs, zhp, child[c], cb->cb_name_flags | VDEV_NAME_TYPE_ID); print_list_stats(zhp, vname, child[c], cb, depth + 2, B_FALSE); free(vname); } } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, &child, &children) == 0 && children > 0) { /* LINTED E_SEC_PRINTF_VAR_FMT */ (void) printf(dashes, cb->cb_namewidth, "cache"); for (c = 0; c < children; c++) { vname = zpool_vdev_name(g_zfs, zhp, child[c], cb->cb_name_flags); print_list_stats(zhp, vname, child[c], cb, depth + 2, B_FALSE); free(vname); } } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, &child, &children) == 0 && children > 0) { /* LINTED E_SEC_PRINTF_VAR_FMT */ (void) printf(dashes, cb->cb_namewidth, "spare"); for (c = 0; c < children; c++) { vname = zpool_vdev_name(g_zfs, zhp, child[c], cb->cb_name_flags); print_list_stats(zhp, vname, child[c], cb, depth + 2, B_TRUE); free(vname); } } } /* * Generic callback function to list a pool. */ static int list_callback(zpool_handle_t *zhp, void *data) { list_cbdata_t *cbp = data; print_pool(zhp, cbp); if (cbp->cb_verbose) { nvlist_t *config, *nvroot; config = zpool_get_config(zhp, NULL); verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); print_list_stats(zhp, NULL, nvroot, cbp, 0, B_FALSE); } return (0); } /* * Set the minimum pool/vdev name column width. The width must be at least 9, * but may be as large as needed. */ static int get_namewidth_list(zpool_handle_t *zhp, void *data) { list_cbdata_t *cb = data; int width; width = get_namewidth(zhp, cb->cb_namewidth, cb->cb_name_flags | VDEV_NAME_TYPE_ID, cb->cb_verbose); if (width < 9) width = 9; cb->cb_namewidth = width; return (0); } /* * zpool list [-gHLpP] [-o prop[,prop]*] [-T d|u] [pool] ... [interval [count]] * * -g Display guid for individual vdev name. * -H Scripted mode. Don't display headers, and separate properties * by a single tab. * -L Follow links when resolving vdev path name. * -o List of properties to display. Defaults to * "name,size,allocated,free,expandsize,fragmentation,capacity," * "dedupratio,health,altroot" * -p Display values in parsable (exact) format. * -P Display full path for vdev name. * -T Display a timestamp in date(1) or Unix format * * List all pools in the system, whether or not they're healthy. Output space * statistics for each one, as well as health status summary. */ int zpool_do_list(int argc, char **argv) { int c; int ret = 0; list_cbdata_t cb = { 0 }; static char default_props[] = "name,size,allocated,free,checkpoint,expandsize,fragmentation," "capacity,dedupratio,health,altroot"; char *props = default_props; float interval = 0; unsigned long count = 0; zpool_list_t *list; boolean_t first = B_TRUE; current_prop_type = ZFS_TYPE_POOL; /* check options */ while ((c = getopt(argc, argv, ":gHLo:pPT:v")) != -1) { switch (c) { case 'g': cb.cb_name_flags |= VDEV_NAME_GUID; break; case 'H': cb.cb_scripted = B_TRUE; break; case 'L': cb.cb_name_flags |= VDEV_NAME_FOLLOW_LINKS; break; case 'o': props = optarg; break; case 'P': cb.cb_name_flags |= VDEV_NAME_PATH; break; case 'p': cb.cb_literal = B_TRUE; break; case 'T': get_timestamp_arg(*optarg); break; case 'v': cb.cb_verbose = B_TRUE; cb.cb_namewidth = 8; /* 8 until precalc is avail */ break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; get_interval_count(&argc, argv, &interval, &count); if (zprop_get_list(g_zfs, props, &cb.cb_proplist, ZFS_TYPE_POOL) != 0) usage(B_FALSE); for (;;) { if ((list = pool_list_get(argc, argv, &cb.cb_proplist, ZFS_TYPE_POOL, cb.cb_literal, &ret)) == NULL) return (1); if (pool_list_count(list) == 0) break; cb.cb_namewidth = 0; (void) pool_list_iter(list, B_FALSE, get_namewidth_list, &cb); if (timestamp_fmt != NODATE) print_timestamp(timestamp_fmt); if (!cb.cb_scripted && (first || cb.cb_verbose)) { print_header(&cb); first = B_FALSE; } ret = pool_list_iter(list, B_TRUE, list_callback, &cb); if (interval == 0) break; if (count != 0 && --count == 0) break; pool_list_free(list); (void) fflush(stdout); (void) fsleep(interval); } if (argc == 0 && !cb.cb_scripted && pool_list_count(list) == 0) { (void) printf(gettext("no pools available\n")); ret = 0; } pool_list_free(list); zprop_free_list(cb.cb_proplist); return (ret); } static int zpool_do_attach_or_replace(int argc, char **argv, int replacing) { boolean_t force = B_FALSE; boolean_t rebuild = B_FALSE; boolean_t wait = B_FALSE; int c; nvlist_t *nvroot; char *poolname, *old_disk, *new_disk; zpool_handle_t *zhp; nvlist_t *props = NULL; char *propval; int ret; /* check options */ while ((c = getopt(argc, argv, "fo:sw")) != -1) { switch (c) { case 'f': force = B_TRUE; break; case 'o': if ((propval = strchr(optarg, '=')) == NULL) { (void) fprintf(stderr, gettext("missing " "'=' for -o option\n")); usage(B_FALSE); } *propval = '\0'; propval++; if ((strcmp(optarg, ZPOOL_CONFIG_ASHIFT) != 0) || (add_prop_list(optarg, propval, &props, B_TRUE))) usage(B_FALSE); break; case 's': rebuild = B_TRUE; break; case 'w': wait = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); usage(B_FALSE); } poolname = argv[0]; if (argc < 2) { (void) fprintf(stderr, gettext("missing specification\n")); usage(B_FALSE); } old_disk = argv[1]; if (argc < 3) { if (!replacing) { (void) fprintf(stderr, gettext("missing specification\n")); usage(B_FALSE); } new_disk = old_disk; argc -= 1; argv += 1; } else { new_disk = argv[2]; argc -= 2; argv += 2; } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } if ((zhp = zpool_open(g_zfs, poolname)) == NULL) { nvlist_free(props); return (1); } if (zpool_get_config(zhp, NULL) == NULL) { (void) fprintf(stderr, gettext("pool '%s' is unavailable\n"), poolname); zpool_close(zhp); nvlist_free(props); return (1); } /* unless manually specified use "ashift" pool property (if set) */ if (!nvlist_exists(props, ZPOOL_CONFIG_ASHIFT)) { int intval; zprop_source_t src; char strval[ZPOOL_MAXPROPLEN]; intval = zpool_get_prop_int(zhp, ZPOOL_PROP_ASHIFT, &src); if (src != ZPROP_SRC_DEFAULT) { (void) sprintf(strval, "%" PRId32, intval); verify(add_prop_list(ZPOOL_CONFIG_ASHIFT, strval, &props, B_TRUE) == 0); } } nvroot = make_root_vdev(zhp, props, force, B_FALSE, replacing, B_FALSE, argc, argv); if (nvroot == NULL) { zpool_close(zhp); nvlist_free(props); return (1); } ret = zpool_vdev_attach(zhp, old_disk, new_disk, nvroot, replacing, rebuild); if (ret == 0 && wait) { zpool_wait_activity_t activity = ZPOOL_WAIT_RESILVER; char raidz_prefix[] = "raidz"; if (replacing) { activity = ZPOOL_WAIT_REPLACE; } else if (strncmp(old_disk, raidz_prefix, strlen(raidz_prefix)) == 0) { activity = ZPOOL_WAIT_RAIDZ_EXPAND; } ret = zpool_wait(zhp, activity); } nvlist_free(props); nvlist_free(nvroot); zpool_close(zhp); return (ret); } /* * zpool replace [-fsw] [-o property=value] * * -f Force attach, even if appears to be in use. * -s Use sequential instead of healing reconstruction for resilver. * -o Set property=value. * -w Wait for replacing to complete before returning * * Replace with . */ int zpool_do_replace(int argc, char **argv) { return (zpool_do_attach_or_replace(argc, argv, B_TRUE)); } /* * zpool attach [-fsw] [-o property=value] | * * -f Force attach, even if appears to be in use. * -s Use sequential instead of healing reconstruction for resilver. * -o Set property=value. * -w Wait for resilvering (mirror) or expansion (raidz) to complete * before returning. * * Attach to a or , where the vdev can be of type * mirror or raidz. If is not part of a mirror, then will * be transformed into a mirror of and . When a mirror * is involved, will begin life with a DTL of [0, now], and will * immediately begin to resilver itself. For the raidz case, a expansion will * commence and reflow the raidz data across all the disks including the * . */ int zpool_do_attach(int argc, char **argv) { return (zpool_do_attach_or_replace(argc, argv, B_FALSE)); } /* * zpool detach [-f] * * -f Force detach of , even if DTLs argue against it * (not supported yet) * * Detach a device from a mirror. The operation will be refused if * is the last device in the mirror, or if the DTLs indicate that this device * has the only valid copy of some data. */ int zpool_do_detach(int argc, char **argv) { int c; char *poolname, *path; zpool_handle_t *zhp; int ret; /* check options */ while ((c = getopt(argc, argv, "")) != -1) { switch (c) { case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); usage(B_FALSE); } if (argc < 2) { (void) fprintf(stderr, gettext("missing specification\n")); usage(B_FALSE); } poolname = argv[0]; path = argv[1]; if ((zhp = zpool_open(g_zfs, poolname)) == NULL) return (1); ret = zpool_vdev_detach(zhp, path); zpool_close(zhp); return (ret); } /* * zpool split [-gLnP] [-o prop=val] ... * [-o mntopt] ... * [-R altroot] [ ...] * * -g Display guid for individual vdev name. * -L Follow links when resolving vdev path name. * -n Do not split the pool, but display the resulting layout if * it were to be split. * -o Set property=value, or set mount options. * -P Display full path for vdev name. * -R Mount the split-off pool under an alternate root. * -l Load encryption keys while importing. * * Splits the named pool and gives it the new pool name. Devices to be split * off may be listed, provided that no more than one device is specified * per top-level vdev mirror. The newly split pool is left in an exported * state unless -R is specified. * * Restrictions: the top-level of the pool pool must only be made up of * mirrors; all devices in the pool must be healthy; no device may be * undergoing a resilvering operation. */ int zpool_do_split(int argc, char **argv) { char *srcpool, *newpool, *propval; char *mntopts = NULL; splitflags_t flags; int c, ret = 0; int ms_status = 0; boolean_t loadkeys = B_FALSE; zpool_handle_t *zhp; nvlist_t *config, *props = NULL; flags.dryrun = B_FALSE; flags.import = B_FALSE; flags.name_flags = 0; /* check options */ while ((c = getopt(argc, argv, ":gLR:lno:P")) != -1) { switch (c) { case 'g': flags.name_flags |= VDEV_NAME_GUID; break; case 'L': flags.name_flags |= VDEV_NAME_FOLLOW_LINKS; break; case 'R': flags.import = B_TRUE; if (add_prop_list( zpool_prop_to_name(ZPOOL_PROP_ALTROOT), optarg, &props, B_TRUE) != 0) { nvlist_free(props); usage(B_FALSE); } break; case 'l': loadkeys = B_TRUE; break; case 'n': flags.dryrun = B_TRUE; break; case 'o': if ((propval = strchr(optarg, '=')) != NULL) { *propval = '\0'; propval++; if (add_prop_list(optarg, propval, &props, B_TRUE) != 0) { nvlist_free(props); usage(B_FALSE); } } else { mntopts = optarg; } break; case 'P': flags.name_flags |= VDEV_NAME_PATH; break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); break; } } if (!flags.import && mntopts != NULL) { (void) fprintf(stderr, gettext("setting mntopts is only " "valid when importing the pool\n")); usage(B_FALSE); } if (!flags.import && loadkeys) { (void) fprintf(stderr, gettext("loading keys is only " "valid when importing the pool\n")); usage(B_FALSE); } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("Missing pool name\n")); usage(B_FALSE); } if (argc < 2) { (void) fprintf(stderr, gettext("Missing new pool name\n")); usage(B_FALSE); } srcpool = argv[0]; newpool = argv[1]; argc -= 2; argv += 2; if ((zhp = zpool_open(g_zfs, srcpool)) == NULL) { nvlist_free(props); return (1); } config = split_mirror_vdev(zhp, newpool, props, flags, argc, argv); if (config == NULL) { ret = 1; } else { if (flags.dryrun) { (void) printf(gettext("would create '%s' with the " "following layout:\n\n"), newpool); print_vdev_tree(NULL, newpool, config, 0, "", flags.name_flags); print_vdev_tree(NULL, "dedup", config, 0, VDEV_ALLOC_BIAS_DEDUP, 0); print_vdev_tree(NULL, "special", config, 0, VDEV_ALLOC_BIAS_SPECIAL, 0); } } zpool_close(zhp); if (ret != 0 || flags.dryrun || !flags.import) { nvlist_free(config); nvlist_free(props); return (ret); } /* * The split was successful. Now we need to open the new * pool and import it. */ if ((zhp = zpool_open_canfail(g_zfs, newpool)) == NULL) { nvlist_free(config); nvlist_free(props); return (1); } if (loadkeys) { ret = zfs_crypto_attempt_load_keys(g_zfs, newpool); if (ret != 0) ret = 1; } if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL) { ms_status = zpool_enable_datasets(zhp, mntopts, 0); if (ms_status == EZFS_SHAREFAILED) { (void) fprintf(stderr, gettext("Split was successful, " "datasets are mounted but sharing of some datasets " "has failed\n")); } else if (ms_status == EZFS_MOUNTFAILED) { (void) fprintf(stderr, gettext("Split was successful" ", but some datasets could not be mounted\n")); (void) fprintf(stderr, gettext("Try doing '%s' with a " "different altroot\n"), "zpool import"); } } zpool_close(zhp); nvlist_free(config); nvlist_free(props); return (ret); } #define POWER_OPT 1024 /* * zpool online [--power] ... * * --power: Power on the enclosure slot to the drive (if possible) */ int zpool_do_online(int argc, char **argv) { int c, i; char *poolname; zpool_handle_t *zhp; int ret = 0; vdev_state_t newstate; int flags = 0; boolean_t is_power_on = B_FALSE; struct option long_options[] = { {"power", no_argument, NULL, POWER_OPT}, {0, 0, 0, 0} }; /* check options */ while ((c = getopt_long(argc, argv, "e", long_options, NULL)) != -1) { switch (c) { case 'e': flags |= ZFS_ONLINE_EXPAND; break; case POWER_OPT: is_power_on = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } if (libzfs_envvar_is_set("ZPOOL_AUTO_POWER_ON_SLOT")) is_power_on = B_TRUE; argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name\n")); usage(B_FALSE); } if (argc < 2) { (void) fprintf(stderr, gettext("missing device name\n")); usage(B_FALSE); } poolname = argv[0]; if ((zhp = zpool_open(g_zfs, poolname)) == NULL) return (1); for (i = 1; i < argc; i++) { vdev_state_t oldstate; boolean_t avail_spare, l2cache; int rc; if (is_power_on) { rc = zpool_power_on_and_disk_wait(zhp, argv[i]); if (rc == ENOTSUP) { (void) fprintf(stderr, gettext("Power control not supported\n")); } if (rc != 0) return (rc); } nvlist_t *tgt = zpool_find_vdev(zhp, argv[i], &avail_spare, &l2cache, NULL); if (tgt == NULL) { ret = 1; continue; } uint_t vsc; oldstate = ((vdev_stat_t *)fnvlist_lookup_uint64_array(tgt, ZPOOL_CONFIG_VDEV_STATS, &vsc))->vs_state; if (zpool_vdev_online(zhp, argv[i], flags, &newstate) == 0) { if (newstate != VDEV_STATE_HEALTHY) { (void) printf(gettext("warning: device '%s' " "onlined, but remains in faulted state\n"), argv[i]); if (newstate == VDEV_STATE_FAULTED) (void) printf(gettext("use 'zpool " "clear' to restore a faulted " "device\n")); else (void) printf(gettext("use 'zpool " "replace' to replace devices " "that are no longer present\n")); if ((flags & ZFS_ONLINE_EXPAND)) { (void) printf(gettext("%s: failed " "to expand usable space on " "unhealthy device '%s'\n"), (oldstate >= VDEV_STATE_DEGRADED ? "error" : "warning"), argv[i]); if (oldstate >= VDEV_STATE_DEGRADED) { ret = 1; break; } } } } else { ret = 1; } } zpool_close(zhp); return (ret); } /* * zpool offline [-ft]|[--power] ... * * * -f Force the device into a faulted state. * * -t Only take the device off-line temporarily. The offline/faulted * state will not be persistent across reboots. * * --power Power off the enclosure slot to the drive (if possible) */ int zpool_do_offline(int argc, char **argv) { int c, i; char *poolname; zpool_handle_t *zhp; int ret = 0; boolean_t istmp = B_FALSE; boolean_t fault = B_FALSE; boolean_t is_power_off = B_FALSE; struct option long_options[] = { {"power", no_argument, NULL, POWER_OPT}, {0, 0, 0, 0} }; /* check options */ while ((c = getopt_long(argc, argv, "ft", long_options, NULL)) != -1) { switch (c) { case 'f': fault = B_TRUE; break; case 't': istmp = B_TRUE; break; case POWER_OPT: is_power_off = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } if (is_power_off && fault) { (void) fprintf(stderr, gettext("-0 and -f cannot be used together\n")); usage(B_FALSE); return (1); } if (is_power_off && istmp) { (void) fprintf(stderr, gettext("-0 and -t cannot be used together\n")); usage(B_FALSE); return (1); } argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name\n")); usage(B_FALSE); } if (argc < 2) { (void) fprintf(stderr, gettext("missing device name\n")); usage(B_FALSE); } poolname = argv[0]; if ((zhp = zpool_open(g_zfs, poolname)) == NULL) return (1); for (i = 1; i < argc; i++) { uint64_t guid = zpool_vdev_path_to_guid(zhp, argv[i]); if (is_power_off) { /* * Note: we have to power off first, then set REMOVED, * or else zpool_vdev_set_removed_state() returns * EAGAIN. */ ret = zpool_power_off(zhp, argv[i]); if (ret != 0) { (void) fprintf(stderr, "%s %s %d\n", gettext("unable to power off slot for"), argv[i], ret); } zpool_vdev_set_removed_state(zhp, guid, VDEV_AUX_NONE); } else if (fault) { vdev_aux_t aux; if (istmp == B_FALSE) { /* Force the fault to persist across imports */ aux = VDEV_AUX_EXTERNAL_PERSIST; } else { aux = VDEV_AUX_EXTERNAL; } if (guid == 0 || zpool_vdev_fault(zhp, guid, aux) != 0) ret = 1; } else { if (zpool_vdev_offline(zhp, argv[i], istmp) != 0) ret = 1; } } zpool_close(zhp); return (ret); } /* * zpool clear [-nF]|[--power] [device] * * Clear all errors associated with a pool or a particular device. */ int zpool_do_clear(int argc, char **argv) { int c; int ret = 0; boolean_t dryrun = B_FALSE; boolean_t do_rewind = B_FALSE; boolean_t xtreme_rewind = B_FALSE; boolean_t is_power_on = B_FALSE; uint32_t rewind_policy = ZPOOL_NO_REWIND; nvlist_t *policy = NULL; zpool_handle_t *zhp; char *pool, *device; struct option long_options[] = { {"power", no_argument, NULL, POWER_OPT}, {0, 0, 0, 0} }; /* check options */ while ((c = getopt_long(argc, argv, "FnX", long_options, NULL)) != -1) { switch (c) { case 'F': do_rewind = B_TRUE; break; case 'n': dryrun = B_TRUE; break; case 'X': xtreme_rewind = B_TRUE; break; case POWER_OPT: is_power_on = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } if (libzfs_envvar_is_set("ZPOOL_AUTO_POWER_ON_SLOT")) is_power_on = B_TRUE; argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name\n")); usage(B_FALSE); } if (argc > 2) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } if ((dryrun || xtreme_rewind) && !do_rewind) { (void) fprintf(stderr, gettext("-n or -X only meaningful with -F\n")); usage(B_FALSE); } if (dryrun) rewind_policy = ZPOOL_TRY_REWIND; else if (do_rewind) rewind_policy = ZPOOL_DO_REWIND; if (xtreme_rewind) rewind_policy |= ZPOOL_EXTREME_REWIND; /* In future, further rewind policy choices can be passed along here */ if (nvlist_alloc(&policy, NV_UNIQUE_NAME, 0) != 0 || nvlist_add_uint32(policy, ZPOOL_LOAD_REWIND_POLICY, rewind_policy) != 0) { return (1); } pool = argv[0]; device = argc == 2 ? argv[1] : NULL; if ((zhp = zpool_open_canfail(g_zfs, pool)) == NULL) { nvlist_free(policy); return (1); } if (is_power_on) { if (device == NULL) { zpool_power_on_pool_and_wait_for_devices(zhp); } else { zpool_power_on_and_disk_wait(zhp, device); } } if (zpool_clear(zhp, device, policy) != 0) ret = 1; zpool_close(zhp); nvlist_free(policy); return (ret); } /* * zpool reguid */ int zpool_do_reguid(int argc, char **argv) { int c; char *poolname; zpool_handle_t *zhp; int ret = 0; /* check options */ while ((c = getopt(argc, argv, "")) != -1) { switch (c) { case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* get pool name and check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } poolname = argv[0]; if ((zhp = zpool_open(g_zfs, poolname)) == NULL) return (1); ret = zpool_reguid(zhp); zpool_close(zhp); return (ret); } /* * zpool reopen * * Reopen the pool so that the kernel can update the sizes of all vdevs. */ int zpool_do_reopen(int argc, char **argv) { int c; int ret = 0; boolean_t scrub_restart = B_TRUE; /* check options */ while ((c = getopt(argc, argv, "n")) != -1) { switch (c) { case 'n': scrub_restart = B_FALSE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* if argc == 0 we will execute zpool_reopen_one on all pools */ ret = for_each_pool(argc, argv, B_TRUE, NULL, ZFS_TYPE_POOL, B_FALSE, zpool_reopen_one, &scrub_restart); return (ret); } typedef struct scrub_cbdata { int cb_type; pool_scrub_cmd_t cb_scrub_cmd; } scrub_cbdata_t; static boolean_t zpool_has_checkpoint(zpool_handle_t *zhp) { nvlist_t *config, *nvroot; config = zpool_get_config(zhp, NULL); if (config != NULL) { pool_checkpoint_stat_t *pcs = NULL; uint_t c; nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c); if (pcs == NULL || pcs->pcs_state == CS_NONE) return (B_FALSE); assert(pcs->pcs_state == CS_CHECKPOINT_EXISTS || pcs->pcs_state == CS_CHECKPOINT_DISCARDING); return (B_TRUE); } return (B_FALSE); } static int scrub_callback(zpool_handle_t *zhp, void *data) { scrub_cbdata_t *cb = data; int err; /* * Ignore faulted pools. */ if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) { (void) fprintf(stderr, gettext("cannot scan '%s': pool is " "currently unavailable\n"), zpool_get_name(zhp)); return (1); } err = zpool_scan(zhp, cb->cb_type, cb->cb_scrub_cmd); if (err == 0 && zpool_has_checkpoint(zhp) && cb->cb_type == POOL_SCAN_SCRUB) { (void) printf(gettext("warning: will not scrub state that " "belongs to the checkpoint of pool '%s'\n"), zpool_get_name(zhp)); } return (err != 0); } static int wait_callback(zpool_handle_t *zhp, void *data) { zpool_wait_activity_t *act = data; return (zpool_wait(zhp, *act)); } /* * zpool scrub [-s | -p] [-w] [-e] ... * * -e Only scrub blocks in the error log. * -s Stop. Stops any in-progress scrub. * -p Pause. Pause in-progress scrub. * -w Wait. Blocks until scrub has completed. */ int zpool_do_scrub(int argc, char **argv) { int c; scrub_cbdata_t cb; boolean_t wait = B_FALSE; int error; cb.cb_type = POOL_SCAN_SCRUB; cb.cb_scrub_cmd = POOL_SCRUB_NORMAL; boolean_t is_error_scrub = B_FALSE; boolean_t is_pause = B_FALSE; boolean_t is_stop = B_FALSE; /* check options */ while ((c = getopt(argc, argv, "spwe")) != -1) { switch (c) { case 'e': is_error_scrub = B_TRUE; break; case 's': is_stop = B_TRUE; break; case 'p': is_pause = B_TRUE; break; case 'w': wait = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } if (is_pause && is_stop) { (void) fprintf(stderr, gettext("invalid option " "combination :-s and -p are mutually exclusive\n")); usage(B_FALSE); } else { if (is_error_scrub) cb.cb_type = POOL_SCAN_ERRORSCRUB; if (is_pause) { cb.cb_scrub_cmd = POOL_SCRUB_PAUSE; } else if (is_stop) { cb.cb_type = POOL_SCAN_NONE; } else { cb.cb_scrub_cmd = POOL_SCRUB_NORMAL; } } if (wait && (cb.cb_type == POOL_SCAN_NONE || cb.cb_scrub_cmd == POOL_SCRUB_PAUSE)) { (void) fprintf(stderr, gettext("invalid option combination: " "-w cannot be used with -p or -s\n")); usage(B_FALSE); } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); usage(B_FALSE); } error = for_each_pool(argc, argv, B_TRUE, NULL, ZFS_TYPE_POOL, B_FALSE, scrub_callback, &cb); if (wait && !error) { zpool_wait_activity_t act = ZPOOL_WAIT_SCRUB; error = for_each_pool(argc, argv, B_TRUE, NULL, ZFS_TYPE_POOL, B_FALSE, wait_callback, &act); } return (error); } /* * zpool resilver ... * * Restarts any in-progress resilver */ int zpool_do_resilver(int argc, char **argv) { int c; scrub_cbdata_t cb; cb.cb_type = POOL_SCAN_RESILVER; cb.cb_scrub_cmd = POOL_SCRUB_NORMAL; /* check options */ while ((c = getopt(argc, argv, "")) != -1) { switch (c) { case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); usage(B_FALSE); } return (for_each_pool(argc, argv, B_TRUE, NULL, ZFS_TYPE_POOL, B_FALSE, scrub_callback, &cb)); } /* * zpool trim [-d] [-r ] [-c | -s] [ ...] * * -c Cancel. Ends any in-progress trim. * -d Secure trim. Requires kernel and device support. * -r Sets the TRIM rate in bytes (per second). Supports * adding a multiplier suffix such as 'k' or 'm'. * -s Suspend. TRIM can then be restarted with no flags. * -w Wait. Blocks until trimming has completed. */ int zpool_do_trim(int argc, char **argv) { struct option long_options[] = { {"cancel", no_argument, NULL, 'c'}, {"secure", no_argument, NULL, 'd'}, {"rate", required_argument, NULL, 'r'}, {"suspend", no_argument, NULL, 's'}, {"wait", no_argument, NULL, 'w'}, {0, 0, 0, 0} }; pool_trim_func_t cmd_type = POOL_TRIM_START; uint64_t rate = 0; boolean_t secure = B_FALSE; boolean_t wait = B_FALSE; int c; while ((c = getopt_long(argc, argv, "cdr:sw", long_options, NULL)) != -1) { switch (c) { case 'c': if (cmd_type != POOL_TRIM_START && cmd_type != POOL_TRIM_CANCEL) { (void) fprintf(stderr, gettext("-c cannot be " "combined with other options\n")); usage(B_FALSE); } cmd_type = POOL_TRIM_CANCEL; break; case 'd': if (cmd_type != POOL_TRIM_START) { (void) fprintf(stderr, gettext("-d cannot be " "combined with the -c or -s options\n")); usage(B_FALSE); } secure = B_TRUE; break; case 'r': if (cmd_type != POOL_TRIM_START) { (void) fprintf(stderr, gettext("-r cannot be " "combined with the -c or -s options\n")); usage(B_FALSE); } if (zfs_nicestrtonum(g_zfs, optarg, &rate) == -1) { (void) fprintf(stderr, "%s: %s\n", gettext("invalid value for rate"), libzfs_error_description(g_zfs)); usage(B_FALSE); } break; case 's': if (cmd_type != POOL_TRIM_START && cmd_type != POOL_TRIM_SUSPEND) { (void) fprintf(stderr, gettext("-s cannot be " "combined with other options\n")); usage(B_FALSE); } cmd_type = POOL_TRIM_SUSPEND; break; case 'w': wait = B_TRUE; break; case '?': if (optopt != 0) { (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); } else { (void) fprintf(stderr, gettext("invalid option '%s'\n"), argv[optind - 1]); } usage(B_FALSE); } } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name argument\n")); usage(B_FALSE); return (-1); } if (wait && (cmd_type != POOL_TRIM_START)) { (void) fprintf(stderr, gettext("-w cannot be used with -c or " "-s\n")); usage(B_FALSE); } char *poolname = argv[0]; zpool_handle_t *zhp = zpool_open(g_zfs, poolname); if (zhp == NULL) return (-1); trimflags_t trim_flags = { .secure = secure, .rate = rate, .wait = wait, }; nvlist_t *vdevs = fnvlist_alloc(); if (argc == 1) { /* no individual leaf vdevs specified, so add them all */ nvlist_t *config = zpool_get_config(zhp, NULL); nvlist_t *nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); zpool_collect_leaves(zhp, nvroot, vdevs); trim_flags.fullpool = B_TRUE; } else { trim_flags.fullpool = B_FALSE; for (int i = 1; i < argc; i++) { fnvlist_add_boolean(vdevs, argv[i]); } } int error = zpool_trim(zhp, cmd_type, vdevs, &trim_flags); fnvlist_free(vdevs); zpool_close(zhp); return (error); } /* * Converts a total number of seconds to a human readable string broken * down in to days/hours/minutes/seconds. */ static void secs_to_dhms(uint64_t total, char *buf) { uint64_t days = total / 60 / 60 / 24; uint64_t hours = (total / 60 / 60) % 24; uint64_t mins = (total / 60) % 60; uint64_t secs = (total % 60); if (days > 0) { (void) sprintf(buf, "%llu days %02llu:%02llu:%02llu", (u_longlong_t)days, (u_longlong_t)hours, (u_longlong_t)mins, (u_longlong_t)secs); } else { (void) sprintf(buf, "%02llu:%02llu:%02llu", (u_longlong_t)hours, (u_longlong_t)mins, (u_longlong_t)secs); } } /* * Print out detailed error scrub status. */ static void print_err_scrub_status(pool_scan_stat_t *ps) { time_t start, end, pause; uint64_t total_secs_left; uint64_t secs_left, mins_left, hours_left, days_left; uint64_t examined, to_be_examined; if (ps == NULL || ps->pss_error_scrub_func != POOL_SCAN_ERRORSCRUB) { return; } (void) printf(gettext(" scrub: ")); start = ps->pss_error_scrub_start; end = ps->pss_error_scrub_end; pause = ps->pss_pass_error_scrub_pause; examined = ps->pss_error_scrub_examined; to_be_examined = ps->pss_error_scrub_to_be_examined; assert(ps->pss_error_scrub_func == POOL_SCAN_ERRORSCRUB); if (ps->pss_error_scrub_state == DSS_FINISHED) { total_secs_left = end - start; days_left = total_secs_left / 60 / 60 / 24; hours_left = (total_secs_left / 60 / 60) % 24; mins_left = (total_secs_left / 60) % 60; secs_left = (total_secs_left % 60); (void) printf(gettext("scrubbed %llu error blocks in %llu days " "%02llu:%02llu:%02llu on %s"), (u_longlong_t)examined, (u_longlong_t)days_left, (u_longlong_t)hours_left, (u_longlong_t)mins_left, (u_longlong_t)secs_left, ctime(&end)); return; } else if (ps->pss_error_scrub_state == DSS_CANCELED) { (void) printf(gettext("error scrub canceled on %s"), ctime(&end)); return; } assert(ps->pss_error_scrub_state == DSS_ERRORSCRUBBING); /* Error scrub is in progress. */ if (pause == 0) { (void) printf(gettext("error scrub in progress since %s"), ctime(&start)); } else { (void) printf(gettext("error scrub paused since %s"), ctime(&pause)); (void) printf(gettext("\terror scrub started on %s"), ctime(&start)); } double fraction_done = (double)examined / (to_be_examined + examined); (void) printf(gettext("\t%.2f%% done, issued I/O for %llu error" " blocks"), 100 * fraction_done, (u_longlong_t)examined); (void) printf("\n"); } /* * Print out detailed scrub status. */ static void print_scan_scrub_resilver_status(pool_scan_stat_t *ps) { time_t start, end, pause; uint64_t pass_scanned, scanned, pass_issued, issued, total_s, total_i; uint64_t elapsed, scan_rate, issue_rate; double fraction_done; char processed_buf[7], scanned_buf[7], issued_buf[7], total_s_buf[7]; char total_i_buf[7], srate_buf[7], irate_buf[7], time_buf[32]; printf(" "); printf_color(ANSI_BOLD, gettext("scan:")); printf(" "); /* If there's never been a scan, there's not much to say. */ if (ps == NULL || ps->pss_func == POOL_SCAN_NONE || ps->pss_func >= POOL_SCAN_FUNCS) { (void) printf(gettext("none requested\n")); return; } start = ps->pss_start_time; end = ps->pss_end_time; pause = ps->pss_pass_scrub_pause; zfs_nicebytes(ps->pss_processed, processed_buf, sizeof (processed_buf)); int is_resilver = ps->pss_func == POOL_SCAN_RESILVER; int is_scrub = ps->pss_func == POOL_SCAN_SCRUB; assert(is_resilver || is_scrub); /* Scan is finished or canceled. */ if (ps->pss_state == DSS_FINISHED) { secs_to_dhms(end - start, time_buf); if (is_scrub) { (void) printf(gettext("scrub repaired %s " "in %s with %llu errors on %s"), processed_buf, time_buf, (u_longlong_t)ps->pss_errors, ctime(&end)); } else if (is_resilver) { (void) printf(gettext("resilvered %s " "in %s with %llu errors on %s"), processed_buf, time_buf, (u_longlong_t)ps->pss_errors, ctime(&end)); } return; } else if (ps->pss_state == DSS_CANCELED) { if (is_scrub) { (void) printf(gettext("scrub canceled on %s"), ctime(&end)); } else if (is_resilver) { (void) printf(gettext("resilver canceled on %s"), ctime(&end)); } return; } assert(ps->pss_state == DSS_SCANNING); /* Scan is in progress. Resilvers can't be paused. */ if (is_scrub) { if (pause == 0) { (void) printf(gettext("scrub in progress since %s"), ctime(&start)); } else { (void) printf(gettext("scrub paused since %s"), ctime(&pause)); (void) printf(gettext("\tscrub started on %s"), ctime(&start)); } } else if (is_resilver) { (void) printf(gettext("resilver in progress since %s"), ctime(&start)); } scanned = ps->pss_examined; pass_scanned = ps->pss_pass_exam; issued = ps->pss_issued; pass_issued = ps->pss_pass_issued; total_s = ps->pss_to_examine; total_i = ps->pss_to_examine - ps->pss_skipped; /* we are only done with a block once we have issued the IO for it */ fraction_done = (double)issued / total_i; /* elapsed time for this pass, rounding up to 1 if it's 0 */ elapsed = time(NULL) - ps->pss_pass_start; elapsed -= ps->pss_pass_scrub_spent_paused; elapsed = (elapsed != 0) ? elapsed : 1; scan_rate = pass_scanned / elapsed; issue_rate = pass_issued / elapsed; /* format all of the numbers we will be reporting */ zfs_nicebytes(scanned, scanned_buf, sizeof (scanned_buf)); zfs_nicebytes(issued, issued_buf, sizeof (issued_buf)); zfs_nicebytes(total_s, total_s_buf, sizeof (total_s_buf)); zfs_nicebytes(total_i, total_i_buf, sizeof (total_i_buf)); /* do not print estimated time if we have a paused scrub */ (void) printf(gettext("\t%s / %s scanned"), scanned_buf, total_s_buf); if (pause == 0 && scan_rate > 0) { zfs_nicebytes(scan_rate, srate_buf, sizeof (srate_buf)); (void) printf(gettext(" at %s/s"), srate_buf); } (void) printf(gettext(", %s / %s issued"), issued_buf, total_i_buf); if (pause == 0 && issue_rate > 0) { zfs_nicebytes(issue_rate, irate_buf, sizeof (irate_buf)); (void) printf(gettext(" at %s/s"), irate_buf); } (void) printf(gettext("\n")); if (is_resilver) { (void) printf(gettext("\t%s resilvered, %.2f%% done"), processed_buf, 100 * fraction_done); } else if (is_scrub) { (void) printf(gettext("\t%s repaired, %.2f%% done"), processed_buf, 100 * fraction_done); } if (pause == 0) { /* * Only provide an estimate iff: * 1) we haven't yet issued all we expected, and * 2) the issue rate exceeds 10 MB/s, and * 3) it's either: * a) a resilver which has started repairs, or * b) a scrub which has entered the issue phase. */ if (total_i >= issued && issue_rate >= 10 * 1024 * 1024 && ((is_resilver && ps->pss_processed > 0) || (is_scrub && issued > 0))) { secs_to_dhms((total_i - issued) / issue_rate, time_buf); (void) printf(gettext(", %s to go\n"), time_buf); } else { (void) printf(gettext(", no estimated " "completion time\n")); } } else { (void) printf(gettext("\n")); } } static void print_rebuild_status_impl(vdev_rebuild_stat_t *vrs, uint_t c, char *vdev_name) { if (vrs == NULL || vrs->vrs_state == VDEV_REBUILD_NONE) return; printf(" "); printf_color(ANSI_BOLD, gettext("scan:")); printf(" "); uint64_t bytes_scanned = vrs->vrs_bytes_scanned; uint64_t bytes_issued = vrs->vrs_bytes_issued; uint64_t bytes_rebuilt = vrs->vrs_bytes_rebuilt; uint64_t bytes_est_s = vrs->vrs_bytes_est; uint64_t bytes_est_i = vrs->vrs_bytes_est; if (c > offsetof(vdev_rebuild_stat_t, vrs_pass_bytes_skipped) / 8) bytes_est_i -= vrs->vrs_pass_bytes_skipped; uint64_t scan_rate = (vrs->vrs_pass_bytes_scanned / (vrs->vrs_pass_time_ms + 1)) * 1000; uint64_t issue_rate = (vrs->vrs_pass_bytes_issued / (vrs->vrs_pass_time_ms + 1)) * 1000; double scan_pct = MIN((double)bytes_scanned * 100 / (bytes_est_s + 1), 100); /* Format all of the numbers we will be reporting */ char bytes_scanned_buf[7], bytes_issued_buf[7]; char bytes_rebuilt_buf[7], bytes_est_s_buf[7], bytes_est_i_buf[7]; char scan_rate_buf[7], issue_rate_buf[7], time_buf[32]; zfs_nicebytes(bytes_scanned, bytes_scanned_buf, sizeof (bytes_scanned_buf)); zfs_nicebytes(bytes_issued, bytes_issued_buf, sizeof (bytes_issued_buf)); zfs_nicebytes(bytes_rebuilt, bytes_rebuilt_buf, sizeof (bytes_rebuilt_buf)); zfs_nicebytes(bytes_est_s, bytes_est_s_buf, sizeof (bytes_est_s_buf)); zfs_nicebytes(bytes_est_i, bytes_est_i_buf, sizeof (bytes_est_i_buf)); time_t start = vrs->vrs_start_time; time_t end = vrs->vrs_end_time; /* Rebuild is finished or canceled. */ if (vrs->vrs_state == VDEV_REBUILD_COMPLETE) { secs_to_dhms(vrs->vrs_scan_time_ms / 1000, time_buf); (void) printf(gettext("resilvered (%s) %s in %s " "with %llu errors on %s"), vdev_name, bytes_rebuilt_buf, time_buf, (u_longlong_t)vrs->vrs_errors, ctime(&end)); return; } else if (vrs->vrs_state == VDEV_REBUILD_CANCELED) { (void) printf(gettext("resilver (%s) canceled on %s"), vdev_name, ctime(&end)); return; } else if (vrs->vrs_state == VDEV_REBUILD_ACTIVE) { (void) printf(gettext("resilver (%s) in progress since %s"), vdev_name, ctime(&start)); } assert(vrs->vrs_state == VDEV_REBUILD_ACTIVE); (void) printf(gettext("\t%s / %s scanned"), bytes_scanned_buf, bytes_est_s_buf); if (scan_rate > 0) { zfs_nicebytes(scan_rate, scan_rate_buf, sizeof (scan_rate_buf)); (void) printf(gettext(" at %s/s"), scan_rate_buf); } (void) printf(gettext(", %s / %s issued"), bytes_issued_buf, bytes_est_i_buf); if (issue_rate > 0) { zfs_nicebytes(issue_rate, issue_rate_buf, sizeof (issue_rate_buf)); (void) printf(gettext(" at %s/s"), issue_rate_buf); } (void) printf(gettext("\n")); (void) printf(gettext("\t%s resilvered, %.2f%% done"), bytes_rebuilt_buf, scan_pct); if (vrs->vrs_state == VDEV_REBUILD_ACTIVE) { if (bytes_est_s >= bytes_scanned && scan_rate >= 10 * 1024 * 1024) { secs_to_dhms((bytes_est_s - bytes_scanned) / scan_rate, time_buf); (void) printf(gettext(", %s to go\n"), time_buf); } else { (void) printf(gettext(", no estimated " "completion time\n")); } } else { (void) printf(gettext("\n")); } } /* * Print rebuild status for top-level vdevs. */ static void print_rebuild_status(zpool_handle_t *zhp, nvlist_t *nvroot) { nvlist_t **child; uint_t children; if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) children = 0; for (uint_t c = 0; c < children; c++) { vdev_rebuild_stat_t *vrs; uint_t i; if (nvlist_lookup_uint64_array(child[c], ZPOOL_CONFIG_REBUILD_STATS, (uint64_t **)&vrs, &i) == 0) { char *name = zpool_vdev_name(g_zfs, zhp, child[c], VDEV_NAME_TYPE_ID); print_rebuild_status_impl(vrs, i, name); free(name); } } } /* * As we don't scrub checkpointed blocks, we want to warn the user that we * skipped scanning some blocks if a checkpoint exists or existed at any * time during the scan. If a sequential instead of healing reconstruction * was performed then the blocks were reconstructed. However, their checksums * have not been verified so we still print the warning. */ static void print_checkpoint_scan_warning(pool_scan_stat_t *ps, pool_checkpoint_stat_t *pcs) { if (ps == NULL || pcs == NULL) return; if (pcs->pcs_state == CS_NONE || pcs->pcs_state == CS_CHECKPOINT_DISCARDING) return; assert(pcs->pcs_state == CS_CHECKPOINT_EXISTS); if (ps->pss_state == DSS_NONE) return; if ((ps->pss_state == DSS_FINISHED || ps->pss_state == DSS_CANCELED) && ps->pss_end_time < pcs->pcs_start_time) return; if (ps->pss_state == DSS_FINISHED || ps->pss_state == DSS_CANCELED) { (void) printf(gettext(" scan warning: skipped blocks " "that are only referenced by the checkpoint.\n")); } else { assert(ps->pss_state == DSS_SCANNING); (void) printf(gettext(" scan warning: skipping blocks " "that are only referenced by the checkpoint.\n")); } } /* * Returns B_TRUE if there is an active rebuild in progress. Otherwise, * B_FALSE is returned and 'rebuild_end_time' is set to the end time for * the last completed (or cancelled) rebuild. */ static boolean_t check_rebuilding(nvlist_t *nvroot, uint64_t *rebuild_end_time) { nvlist_t **child; uint_t children; boolean_t rebuilding = B_FALSE; uint64_t end_time = 0; if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) children = 0; for (uint_t c = 0; c < children; c++) { vdev_rebuild_stat_t *vrs; uint_t i; if (nvlist_lookup_uint64_array(child[c], ZPOOL_CONFIG_REBUILD_STATS, (uint64_t **)&vrs, &i) == 0) { if (vrs->vrs_end_time > end_time) end_time = vrs->vrs_end_time; if (vrs->vrs_state == VDEV_REBUILD_ACTIVE) { rebuilding = B_TRUE; end_time = 0; break; } } } if (rebuild_end_time != NULL) *rebuild_end_time = end_time; return (rebuilding); } /* * Print the scan status. */ static void print_scan_status(zpool_handle_t *zhp, nvlist_t *nvroot) { uint64_t rebuild_end_time = 0, resilver_end_time = 0; boolean_t have_resilver = B_FALSE, have_scrub = B_FALSE; boolean_t have_errorscrub = B_FALSE; boolean_t active_resilver = B_FALSE; pool_checkpoint_stat_t *pcs = NULL; pool_scan_stat_t *ps = NULL; uint_t c; time_t scrub_start = 0, errorscrub_start = 0; if (nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &c) == 0) { if (ps->pss_func == POOL_SCAN_RESILVER) { resilver_end_time = ps->pss_end_time; active_resilver = (ps->pss_state == DSS_SCANNING); } have_resilver = (ps->pss_func == POOL_SCAN_RESILVER); have_scrub = (ps->pss_func == POOL_SCAN_SCRUB); scrub_start = ps->pss_start_time; if (c > offsetof(pool_scan_stat_t, pss_pass_error_scrub_pause) / 8) { have_errorscrub = (ps->pss_error_scrub_func == POOL_SCAN_ERRORSCRUB); errorscrub_start = ps->pss_error_scrub_start; } } boolean_t active_rebuild = check_rebuilding(nvroot, &rebuild_end_time); boolean_t have_rebuild = (active_rebuild || (rebuild_end_time > 0)); /* Always print the scrub status when available. */ if (have_scrub && scrub_start > errorscrub_start) print_scan_scrub_resilver_status(ps); else if (have_errorscrub && errorscrub_start >= scrub_start) print_err_scrub_status(ps); /* * When there is an active resilver or rebuild print its status. * Otherwise print the status of the last resilver or rebuild. */ if (active_resilver || (!active_rebuild && have_resilver && resilver_end_time && resilver_end_time > rebuild_end_time)) { print_scan_scrub_resilver_status(ps); } else if (active_rebuild || (!active_resilver && have_rebuild && rebuild_end_time && rebuild_end_time > resilver_end_time)) { print_rebuild_status(zhp, nvroot); } (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c); print_checkpoint_scan_warning(ps, pcs); } /* * Print out detailed removal status. */ static void print_removal_status(zpool_handle_t *zhp, pool_removal_stat_t *prs) { char copied_buf[7], examined_buf[7], total_buf[7], rate_buf[7]; time_t start, end; nvlist_t *config, *nvroot; nvlist_t **child; uint_t children; char *vdev_name; if (prs == NULL || prs->prs_state == DSS_NONE) return; /* * Determine name of vdev. */ config = zpool_get_config(zhp, NULL); nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &child, &children) == 0); assert(prs->prs_removing_vdev < children); vdev_name = zpool_vdev_name(g_zfs, zhp, child[prs->prs_removing_vdev], B_TRUE); printf_color(ANSI_BOLD, gettext("remove: ")); start = prs->prs_start_time; end = prs->prs_end_time; zfs_nicenum(prs->prs_copied, copied_buf, sizeof (copied_buf)); /* * Removal is finished or canceled. */ if (prs->prs_state == DSS_FINISHED) { uint64_t minutes_taken = (end - start) / 60; (void) printf(gettext("Removal of vdev %llu copied %s " "in %lluh%um, completed on %s"), (longlong_t)prs->prs_removing_vdev, copied_buf, (u_longlong_t)(minutes_taken / 60), (uint_t)(minutes_taken % 60), ctime((time_t *)&end)); } else if (prs->prs_state == DSS_CANCELED) { (void) printf(gettext("Removal of %s canceled on %s"), vdev_name, ctime(&end)); } else { uint64_t copied, total, elapsed, mins_left, hours_left; double fraction_done; uint_t rate; assert(prs->prs_state == DSS_SCANNING); /* * Removal is in progress. */ (void) printf(gettext( "Evacuation of %s in progress since %s"), vdev_name, ctime(&start)); copied = prs->prs_copied > 0 ? prs->prs_copied : 1; total = prs->prs_to_copy; fraction_done = (double)copied / total; /* elapsed time for this pass */ elapsed = time(NULL) - prs->prs_start_time; elapsed = elapsed > 0 ? elapsed : 1; rate = copied / elapsed; rate = rate > 0 ? rate : 1; mins_left = ((total - copied) / rate) / 60; hours_left = mins_left / 60; zfs_nicenum(copied, examined_buf, sizeof (examined_buf)); zfs_nicenum(total, total_buf, sizeof (total_buf)); zfs_nicenum(rate, rate_buf, sizeof (rate_buf)); /* * do not print estimated time if hours_left is more than * 30 days */ (void) printf(gettext( "\t%s copied out of %s at %s/s, %.2f%% done"), examined_buf, total_buf, rate_buf, 100 * fraction_done); if (hours_left < (30 * 24)) { (void) printf(gettext(", %lluh%um to go\n"), (u_longlong_t)hours_left, (uint_t)(mins_left % 60)); } else { (void) printf(gettext( ", (copy is slow, no estimated time)\n")); } } free(vdev_name); if (prs->prs_mapping_memory > 0) { char mem_buf[7]; zfs_nicenum(prs->prs_mapping_memory, mem_buf, sizeof (mem_buf)); (void) printf(gettext( "\t%s memory used for removed device mappings\n"), mem_buf); } } /* * Print out detailed raidz expansion status. */ static void print_raidz_expand_status(zpool_handle_t *zhp, pool_raidz_expand_stat_t *pres) { char copied_buf[7]; if (pres == NULL || pres->pres_state == DSS_NONE) return; /* * Determine name of vdev. */ nvlist_t *config = zpool_get_config(zhp, NULL); nvlist_t *nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); nvlist_t **child; uint_t children; verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &child, &children) == 0); assert(pres->pres_expanding_vdev < children); printf_color(ANSI_BOLD, gettext("expand: ")); time_t start = pres->pres_start_time; time_t end = pres->pres_end_time; char *vname = zpool_vdev_name(g_zfs, zhp, child[pres->pres_expanding_vdev], 0); zfs_nicenum(pres->pres_reflowed, copied_buf, sizeof (copied_buf)); /* * Expansion is finished or canceled. */ if (pres->pres_state == DSS_FINISHED) { char time_buf[32]; secs_to_dhms(end - start, time_buf); (void) printf(gettext("expanded %s-%u copied %s in %s, " "on %s"), vname, (int)pres->pres_expanding_vdev, copied_buf, time_buf, ctime((time_t *)&end)); } else { char examined_buf[7], total_buf[7], rate_buf[7]; uint64_t copied, total, elapsed, secs_left; double fraction_done; uint_t rate; assert(pres->pres_state == DSS_SCANNING); /* * Expansion is in progress. */ (void) printf(gettext( "expansion of %s-%u in progress since %s"), vname, (int)pres->pres_expanding_vdev, ctime(&start)); copied = pres->pres_reflowed > 0 ? pres->pres_reflowed : 1; total = pres->pres_to_reflow; fraction_done = (double)copied / total; /* elapsed time for this pass */ elapsed = time(NULL) - pres->pres_start_time; elapsed = elapsed > 0 ? elapsed : 1; rate = copied / elapsed; rate = rate > 0 ? rate : 1; secs_left = (total - copied) / rate; zfs_nicenum(copied, examined_buf, sizeof (examined_buf)); zfs_nicenum(total, total_buf, sizeof (total_buf)); zfs_nicenum(rate, rate_buf, sizeof (rate_buf)); /* * do not print estimated time if hours_left is more than * 30 days */ (void) printf(gettext("\t%s / %s copied at %s/s, %.2f%% done"), examined_buf, total_buf, rate_buf, 100 * fraction_done); if (pres->pres_waiting_for_resilver) { (void) printf(gettext(", paused for resilver or " "clear\n")); } else if (secs_left < (30 * 24 * 3600)) { char time_buf[32]; secs_to_dhms(secs_left, time_buf); (void) printf(gettext(", %s to go\n"), time_buf); } else { (void) printf(gettext( ", (copy is slow, no estimated time)\n")); } } free(vname); } static void print_checkpoint_status(pool_checkpoint_stat_t *pcs) { time_t start; char space_buf[7]; if (pcs == NULL || pcs->pcs_state == CS_NONE) return; (void) printf(gettext("checkpoint: ")); start = pcs->pcs_start_time; zfs_nicenum(pcs->pcs_space, space_buf, sizeof (space_buf)); if (pcs->pcs_state == CS_CHECKPOINT_EXISTS) { char *date = ctime(&start); /* * ctime() adds a newline at the end of the generated * string, thus the weird format specifier and the * strlen() call used to chop it off from the output. */ (void) printf(gettext("created %.*s, consumes %s\n"), (int)(strlen(date) - 1), date, space_buf); return; } assert(pcs->pcs_state == CS_CHECKPOINT_DISCARDING); (void) printf(gettext("discarding, %s remaining.\n"), space_buf); } static void print_error_log(zpool_handle_t *zhp) { nvlist_t *nverrlist = NULL; nvpair_t *elem; char *pathname; size_t len = MAXPATHLEN * 2; if (zpool_get_errlog(zhp, &nverrlist) != 0) return; (void) printf("errors: Permanent errors have been " "detected in the following files:\n\n"); pathname = safe_malloc(len); elem = NULL; while ((elem = nvlist_next_nvpair(nverrlist, elem)) != NULL) { nvlist_t *nv; uint64_t dsobj, obj; verify(nvpair_value_nvlist(elem, &nv) == 0); verify(nvlist_lookup_uint64(nv, ZPOOL_ERR_DATASET, &dsobj) == 0); verify(nvlist_lookup_uint64(nv, ZPOOL_ERR_OBJECT, &obj) == 0); zpool_obj_to_path(zhp, dsobj, obj, pathname, len); (void) printf("%7s %s\n", "", pathname); } free(pathname); nvlist_free(nverrlist); } static void print_spares(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t **spares, uint_t nspares) { uint_t i; char *name; if (nspares == 0) return; (void) printf(gettext("\tspares\n")); for (i = 0; i < nspares; i++) { name = zpool_vdev_name(g_zfs, zhp, spares[i], cb->cb_name_flags); print_status_config(zhp, cb, name, spares[i], 2, B_TRUE, NULL); free(name); } } static void print_l2cache(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t **l2cache, uint_t nl2cache) { uint_t i; char *name; if (nl2cache == 0) return; (void) printf(gettext("\tcache\n")); for (i = 0; i < nl2cache; i++) { name = zpool_vdev_name(g_zfs, zhp, l2cache[i], cb->cb_name_flags); print_status_config(zhp, cb, name, l2cache[i], 2, B_FALSE, NULL); free(name); } } static void print_dedup_stats(nvlist_t *config) { ddt_histogram_t *ddh; ddt_stat_t *dds; ddt_object_t *ddo; uint_t c; char dspace[6], mspace[6]; /* * If the pool was faulted then we may not have been able to * obtain the config. Otherwise, if we have anything in the dedup * table continue processing the stats. */ if (nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_OBJ_STATS, (uint64_t **)&ddo, &c) != 0) return; (void) printf("\n"); (void) printf(gettext(" dedup: ")); if (ddo->ddo_count == 0) { (void) printf(gettext("no DDT entries\n")); return; } zfs_nicebytes(ddo->ddo_dspace, dspace, sizeof (dspace)); zfs_nicebytes(ddo->ddo_mspace, mspace, sizeof (mspace)); (void) printf("DDT entries %llu, size %s on disk, %s in core\n", (u_longlong_t)ddo->ddo_count, dspace, mspace); verify(nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_STATS, (uint64_t **)&dds, &c) == 0); verify(nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_HISTOGRAM, (uint64_t **)&ddh, &c) == 0); zpool_dump_ddt(dds, ddh); } /* * Display a summary of pool status. Displays a summary such as: * * pool: tank * status: DEGRADED * reason: One or more devices ... * see: https://openzfs.github.io/openzfs-docs/msg/ZFS-xxxx-01 * config: * mirror DEGRADED * c1t0d0 OK * c2t0d0 UNAVAIL * * When given the '-v' option, we print out the complete config. If the '-e' * option is specified, then we print out error rate information as well. */ static int status_callback(zpool_handle_t *zhp, void *data) { status_cbdata_t *cbp = data; nvlist_t *config, *nvroot; const char *msgid; zpool_status_t reason; zpool_errata_t errata; const char *health; uint_t c; vdev_stat_t *vs; config = zpool_get_config(zhp, NULL); reason = zpool_get_status(zhp, &msgid, &errata); cbp->cb_count++; /* * If we were given 'zpool status -x', only report those pools with * problems. */ if (cbp->cb_explain && (reason == ZPOOL_STATUS_OK || reason == ZPOOL_STATUS_VERSION_OLDER || reason == ZPOOL_STATUS_FEAT_DISABLED || reason == ZPOOL_STATUS_COMPATIBILITY_ERR || reason == ZPOOL_STATUS_INCOMPATIBLE_FEAT)) { if (!cbp->cb_allpools) { (void) printf(gettext("pool '%s' is healthy\n"), zpool_get_name(zhp)); if (cbp->cb_first) cbp->cb_first = B_FALSE; } return (0); } if (cbp->cb_first) cbp->cb_first = B_FALSE; else (void) printf("\n"); nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &c) == 0); health = zpool_get_state_str(zhp); printf(" "); printf_color(ANSI_BOLD, gettext("pool:")); printf(" %s\n", zpool_get_name(zhp)); fputc(' ', stdout); printf_color(ANSI_BOLD, gettext("state: ")); printf_color(health_str_to_color(health), "%s", health); fputc('\n', stdout); switch (reason) { case ZPOOL_STATUS_MISSING_DEV_R: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices could " "not be opened. Sufficient replicas exist for\n\tthe pool " "to continue functioning in a degraded state.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Attach the missing device " "and online it using 'zpool online'.\n")); break; case ZPOOL_STATUS_MISSING_DEV_NR: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices could " "not be opened. There are insufficient\n\treplicas for the" " pool to continue functioning.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Attach the missing device " "and online it using 'zpool online'.\n")); break; case ZPOOL_STATUS_CORRUPT_LABEL_R: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices could " "not be used because the label is missing or\n\tinvalid. " "Sufficient replicas exist for the pool to continue\n\t" "functioning in a degraded state.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Replace the device using " "'zpool replace'.\n")); break; case ZPOOL_STATUS_CORRUPT_LABEL_NR: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices could " "not be used because the label is missing \n\tor invalid. " "There are insufficient replicas for the pool to " "continue\n\tfunctioning.\n")); zpool_explain_recover(zpool_get_handle(zhp), zpool_get_name(zhp), reason, config); break; case ZPOOL_STATUS_FAILING_DEV: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices has " "experienced an unrecoverable error. An\n\tattempt was " "made to correct the error. Applications are " "unaffected.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Determine if the " "device needs to be replaced, and clear the errors\n\tusing" " 'zpool clear' or replace the device with 'zpool " "replace'.\n")); break; case ZPOOL_STATUS_OFFLINE_DEV: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices has " "been taken offline by the administrator.\n\tSufficient " "replicas exist for the pool to continue functioning in " "a\n\tdegraded state.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Online the device " "using 'zpool online' or replace the device with\n\t'zpool " "replace'.\n")); break; case ZPOOL_STATUS_REMOVED_DEV: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices has " "been removed by the administrator.\n\tSufficient " "replicas exist for the pool to continue functioning in " "a\n\tdegraded state.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Online the device " "using zpool online' or replace the device with\n\t'zpool " "replace'.\n")); break; case ZPOOL_STATUS_RESILVERING: case ZPOOL_STATUS_REBUILDING: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices is " "currently being resilvered. The pool will\n\tcontinue " "to function, possibly in a degraded state.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Wait for the resilver to " "complete.\n")); break; case ZPOOL_STATUS_REBUILD_SCRUB: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices have " "been sequentially resilvered, scrubbing\n\tthe pool " "is recommended.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Use 'zpool scrub' to " "verify all data checksums.\n")); break; case ZPOOL_STATUS_CORRUPT_DATA: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices has " "experienced an error resulting in data\n\tcorruption. " "Applications may be affected.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Restore the file in question" " if possible. Otherwise restore the\n\tentire pool from " "backup.\n")); break; case ZPOOL_STATUS_CORRUPT_POOL: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool metadata is " "corrupted and the pool cannot be opened.\n")); zpool_explain_recover(zpool_get_handle(zhp), zpool_get_name(zhp), reason, config); break; case ZPOOL_STATUS_VERSION_OLDER: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool is formatted using " "a legacy on-disk format. The pool can\n\tstill be used, " "but some features are unavailable.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Upgrade the pool using " "'zpool upgrade'. Once this is done, the\n\tpool will no " "longer be accessible on software that does not support\n\t" "feature flags.\n")); break; case ZPOOL_STATUS_VERSION_NEWER: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool has been upgraded " "to a newer, incompatible on-disk version.\n\tThe pool " "cannot be accessed on this system.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Access the pool from a " "system running more recent software, or\n\trestore the " "pool from backup.\n")); break; case ZPOOL_STATUS_FEAT_DISABLED: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("Some supported and " "requested features are not enabled on the pool.\n\t" "The pool can still be used, but some features are " "unavailable.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Enable all features using " "'zpool upgrade'. Once this is done,\n\tthe pool may no " "longer be accessible by software that does not support\n\t" "the features. See zpool-features(7) for details.\n")); break; case ZPOOL_STATUS_COMPATIBILITY_ERR: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("This pool has a " "compatibility list specified, but it could not be\n\t" "read/parsed at this time. The pool can still be used, " "but this\n\tshould be investigated.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Check the value of the " "'compatibility' property against the\n\t" "appropriate file in " ZPOOL_SYSCONF_COMPAT_D " or " ZPOOL_DATA_COMPAT_D ".\n")); break; case ZPOOL_STATUS_INCOMPATIBLE_FEAT: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more features " "are enabled on the pool despite not being\n\t" "requested by the 'compatibility' property.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Consider setting " "'compatibility' to an appropriate value, or\n\t" "adding needed features to the relevant file in\n\t" ZPOOL_SYSCONF_COMPAT_D " or " ZPOOL_DATA_COMPAT_D ".\n")); break; case ZPOOL_STATUS_UNSUP_FEAT_READ: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool cannot be accessed " "on this system because it uses the\n\tfollowing feature(s)" " not supported on this system:\n")); zpool_print_unsup_feat(config); (void) printf("\n"); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Access the pool from a " "system that supports the required feature(s),\n\tor " "restore the pool from backup.\n")); break; case ZPOOL_STATUS_UNSUP_FEAT_WRITE: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool can only be " "accessed in read-only mode on this system. It\n\tcannot be" " accessed in read-write mode because it uses the " "following\n\tfeature(s) not supported on this system:\n")); zpool_print_unsup_feat(config); (void) printf("\n"); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("The pool cannot be accessed " "in read-write mode. Import the pool with\n" "\t\"-o readonly=on\", access the pool from a system that " "supports the\n\trequired feature(s), or restore the " "pool from backup.\n")); break; case ZPOOL_STATUS_FAULTED_DEV_R: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices are " "faulted in response to persistent errors.\n\tSufficient " "replicas exist for the pool to continue functioning " "in a\n\tdegraded state.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Replace the faulted device, " "or use 'zpool clear' to mark the device\n\trepaired.\n")); break; case ZPOOL_STATUS_FAULTED_DEV_NR: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices are " "faulted in response to persistent errors. There are " "insufficient replicas for the pool to\n\tcontinue " "functioning.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Destroy and re-create the " "pool from a backup source. Manually marking the device\n" "\trepaired using 'zpool clear' may allow some data " "to be recovered.\n")); break; case ZPOOL_STATUS_IO_FAILURE_MMP: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("The pool is suspended " "because multihost writes failed or were delayed;\n\t" "another system could import the pool undetected.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Make sure the pool's devices" " are connected, then reboot your system and\n\timport the " "pool.\n")); break; case ZPOOL_STATUS_IO_FAILURE_WAIT: case ZPOOL_STATUS_IO_FAILURE_CONTINUE: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("One or more devices are " "faulted in response to IO failures.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Make sure the affected " "devices are connected, then run 'zpool clear'.\n")); break; case ZPOOL_STATUS_BAD_LOG: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("An intent log record " "could not be read.\n" "\tWaiting for administrator intervention to fix the " "faulted pool.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Either restore the affected " "device(s) and run 'zpool online',\n" "\tor ignore the intent log records by running " "'zpool clear'.\n")); break; case ZPOOL_STATUS_NON_NATIVE_ASHIFT: (void) printf(gettext("status: One or more devices are " "configured to use a non-native block size.\n" "\tExpect reduced performance.\n")); (void) printf(gettext("action: Replace affected devices with " "devices that support the\n\tconfigured block size, or " "migrate data to a properly configured\n\tpool.\n")); break; case ZPOOL_STATUS_HOSTID_MISMATCH: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("Mismatch between pool hostid" " and system hostid on imported pool.\n\tThis pool was " "previously imported into a system with a different " "hostid,\n\tand then was verbatim imported into this " "system.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Export this pool on all " "systems on which it is imported.\n" "\tThen import it to correct the mismatch.\n")); break; case ZPOOL_STATUS_ERRATA: printf_color(ANSI_BOLD, gettext("status: ")); printf_color(ANSI_YELLOW, gettext("Errata #%d detected.\n"), errata); switch (errata) { case ZPOOL_ERRATA_NONE: break; case ZPOOL_ERRATA_ZOL_2094_SCRUB: printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("To correct the issue" " run 'zpool scrub'.\n")); break; case ZPOOL_ERRATA_ZOL_6845_ENCRYPTION: (void) printf(gettext("\tExisting encrypted datasets " "contain an on-disk incompatibility\n\twhich " "needs to be corrected.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("To correct the issue" " backup existing encrypted datasets to new\n\t" "encrypted datasets and destroy the old ones. " "'zfs mount -o ro' can\n\tbe used to temporarily " "mount existing encrypted datasets readonly.\n")); break; case ZPOOL_ERRATA_ZOL_8308_ENCRYPTION: (void) printf(gettext("\tExisting encrypted snapshots " "and bookmarks contain an on-disk\n\tincompat" "ibility. This may cause on-disk corruption if " "they are used\n\twith 'zfs recv'.\n")); printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("To correct the" "issue, enable the bookmark_v2 feature. No " "additional\n\taction is needed if there are no " "encrypted snapshots or bookmarks.\n\tIf preserving" "the encrypted snapshots and bookmarks is required," " use\n\ta non-raw send to backup and restore them." " Alternately, they may be\n\tremoved to resolve " "the incompatibility.\n")); break; default: /* * All errata which allow the pool to be imported * must contain an action message. */ assert(0); } break; default: /* * The remaining errors can't actually be generated, yet. */ assert(reason == ZPOOL_STATUS_OK); } if (msgid != NULL) { printf(" "); printf_color(ANSI_BOLD, gettext("see:")); printf(gettext( " https://openzfs.github.io/openzfs-docs/msg/%s\n"), msgid); } if (config != NULL) { uint64_t nerr; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; print_scan_status(zhp, nvroot); pool_removal_stat_t *prs = NULL; (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t **)&prs, &c); print_removal_status(zhp, prs); pool_checkpoint_stat_t *pcs = NULL; (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c); print_checkpoint_status(pcs); pool_raidz_expand_stat_t *pres = NULL; (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_RAIDZ_EXPAND_STATS, (uint64_t **)&pres, &c); print_raidz_expand_status(zhp, pres); cbp->cb_namewidth = max_width(zhp, nvroot, 0, 0, cbp->cb_name_flags | VDEV_NAME_TYPE_ID); if (cbp->cb_namewidth < 10) cbp->cb_namewidth = 10; color_start(ANSI_BOLD); (void) printf(gettext("config:\n\n")); (void) printf(gettext("\t%-*s %-8s %5s %5s %5s"), cbp->cb_namewidth, "NAME", "STATE", "READ", "WRITE", "CKSUM"); color_end(); if (cbp->cb_print_slow_ios) { printf_color(ANSI_BOLD, " %5s", gettext("SLOW")); } if (cbp->cb_print_power) { printf_color(ANSI_BOLD, " %5s", gettext("POWER")); } if (cbp->vcdl != NULL) print_cmd_columns(cbp->vcdl, 0); printf("\n"); print_status_config(zhp, cbp, zpool_get_name(zhp), nvroot, 0, B_FALSE, NULL); print_class_vdevs(zhp, cbp, nvroot, VDEV_ALLOC_BIAS_DEDUP); print_class_vdevs(zhp, cbp, nvroot, VDEV_ALLOC_BIAS_SPECIAL); print_class_vdevs(zhp, cbp, nvroot, VDEV_ALLOC_CLASS_LOGS); if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0) print_l2cache(zhp, cbp, l2cache, nl2cache); if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) print_spares(zhp, cbp, spares, nspares); if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_ERRCOUNT, &nerr) == 0) { (void) printf("\n"); if (nerr == 0) { (void) printf(gettext( "errors: No known data errors\n")); } else if (!cbp->cb_verbose) { color_start(ANSI_RED); (void) printf(gettext("errors: %llu data " "errors, use '-v' for a list\n"), (u_longlong_t)nerr); color_end(); } else { print_error_log(zhp); } } if (cbp->cb_dedup_stats) print_dedup_stats(config); } else { (void) printf(gettext("config: The configuration cannot be " "determined.\n")); } return (0); } /* * zpool status [-c [script1,script2,...]] [-igLpPstvx] [--power] [-T d|u] ... * [pool] [interval [count]] * * -c CMD For each vdev, run command CMD * -e Display only unhealthy vdevs * -i Display vdev initialization status. * -g Display guid for individual vdev name. * -L Follow links when resolving vdev path name. * -p Display values in parsable (exact) format. * -P Display full path for vdev name. * -s Display slow IOs column. * -v Display complete error logs * -x Display only pools with potential problems * -D Display dedup status (undocumented) * -t Display vdev TRIM status. * -T Display a timestamp in date(1) or Unix format * --power Display vdev enclosure slot power status * * Describes the health status of all pools or some subset. */ int zpool_do_status(int argc, char **argv) { int c; int ret; float interval = 0; unsigned long count = 0; status_cbdata_t cb = { 0 }; char *cmd = NULL; struct option long_options[] = { {"power", no_argument, NULL, POWER_OPT}, {0, 0, 0, 0} }; /* check options */ while ((c = getopt_long(argc, argv, "c:eigLpPsvxDtT:", long_options, NULL)) != -1) { switch (c) { case 'c': if (cmd != NULL) { fprintf(stderr, gettext("Can't set -c flag twice\n")); exit(1); } if (getenv("ZPOOL_SCRIPTS_ENABLED") != NULL && !libzfs_envvar_is_set("ZPOOL_SCRIPTS_ENABLED")) { fprintf(stderr, gettext( "Can't run -c, disabled by " "ZPOOL_SCRIPTS_ENABLED.\n")); exit(1); } if ((getuid() <= 0 || geteuid() <= 0) && !libzfs_envvar_is_set("ZPOOL_SCRIPTS_AS_ROOT")) { fprintf(stderr, gettext( "Can't run -c with root privileges " "unless ZPOOL_SCRIPTS_AS_ROOT is set.\n")); exit(1); } cmd = optarg; break; case 'e': cb.cb_print_unhealthy = B_TRUE; break; case 'i': cb.cb_print_vdev_init = B_TRUE; break; case 'g': cb.cb_name_flags |= VDEV_NAME_GUID; break; case 'L': cb.cb_name_flags |= VDEV_NAME_FOLLOW_LINKS; break; case 'p': cb.cb_literal = B_TRUE; break; case 'P': cb.cb_name_flags |= VDEV_NAME_PATH; break; case 's': cb.cb_print_slow_ios = B_TRUE; break; case 'v': cb.cb_verbose = B_TRUE; break; case 'x': cb.cb_explain = B_TRUE; break; case 'D': cb.cb_dedup_stats = B_TRUE; break; case 't': cb.cb_print_vdev_trim = B_TRUE; break; case 'T': get_timestamp_arg(*optarg); break; case POWER_OPT: cb.cb_print_power = B_TRUE; break; case '?': if (optopt == 'c') { print_zpool_script_list("status"); exit(0); } else { fprintf(stderr, gettext("invalid option '%c'\n"), optopt); } usage(B_FALSE); } } argc -= optind; argv += optind; get_interval_count(&argc, argv, &interval, &count); if (argc == 0) cb.cb_allpools = B_TRUE; cb.cb_first = B_TRUE; cb.cb_print_status = B_TRUE; for (;;) { if (timestamp_fmt != NODATE) print_timestamp(timestamp_fmt); if (cmd != NULL) cb.vcdl = all_pools_for_each_vdev_run(argc, argv, cmd, NULL, NULL, 0, 0); ret = for_each_pool(argc, argv, B_TRUE, NULL, ZFS_TYPE_POOL, cb.cb_literal, status_callback, &cb); if (cb.vcdl != NULL) free_vdev_cmd_data_list(cb.vcdl); if (argc == 0 && cb.cb_count == 0) (void) fprintf(stderr, gettext("no pools available\n")); else if (cb.cb_explain && cb.cb_first && cb.cb_allpools) (void) printf(gettext("all pools are healthy\n")); if (ret != 0) return (ret); if (interval == 0) break; if (count != 0 && --count == 0) break; (void) fflush(stdout); (void) fsleep(interval); } return (0); } typedef struct upgrade_cbdata { int cb_first; int cb_argc; uint64_t cb_version; char **cb_argv; } upgrade_cbdata_t; static int check_unsupp_fs(zfs_handle_t *zhp, void *unsupp_fs) { int zfs_version = (int)zfs_prop_get_int(zhp, ZFS_PROP_VERSION); int *count = (int *)unsupp_fs; if (zfs_version > ZPL_VERSION) { (void) printf(gettext("%s (v%d) is not supported by this " "implementation of ZFS.\n"), zfs_get_name(zhp), zfs_version); (*count)++; } zfs_iter_filesystems_v2(zhp, 0, check_unsupp_fs, unsupp_fs); zfs_close(zhp); return (0); } static int upgrade_version(zpool_handle_t *zhp, uint64_t version) { int ret; nvlist_t *config; uint64_t oldversion; int unsupp_fs = 0; config = zpool_get_config(zhp, NULL); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &oldversion) == 0); char compat[ZFS_MAXPROPLEN]; if (zpool_get_prop(zhp, ZPOOL_PROP_COMPATIBILITY, compat, ZFS_MAXPROPLEN, NULL, B_FALSE) != 0) compat[0] = '\0'; assert(SPA_VERSION_IS_SUPPORTED(oldversion)); assert(oldversion < version); ret = zfs_iter_root(zpool_get_handle(zhp), check_unsupp_fs, &unsupp_fs); if (ret != 0) return (ret); if (unsupp_fs) { (void) fprintf(stderr, gettext("Upgrade not performed due " "to %d unsupported filesystems (max v%d).\n"), unsupp_fs, (int)ZPL_VERSION); return (1); } if (strcmp(compat, ZPOOL_COMPAT_LEGACY) == 0) { (void) fprintf(stderr, gettext("Upgrade not performed because " "'compatibility' property set to '" ZPOOL_COMPAT_LEGACY "'.\n")); return (1); } ret = zpool_upgrade(zhp, version); if (ret != 0) return (ret); if (version >= SPA_VERSION_FEATURES) { (void) printf(gettext("Successfully upgraded " "'%s' from version %llu to feature flags.\n"), zpool_get_name(zhp), (u_longlong_t)oldversion); } else { (void) printf(gettext("Successfully upgraded " "'%s' from version %llu to version %llu.\n"), zpool_get_name(zhp), (u_longlong_t)oldversion, (u_longlong_t)version); } return (0); } static int upgrade_enable_all(zpool_handle_t *zhp, int *countp) { int i, ret, count; boolean_t firstff = B_TRUE; nvlist_t *enabled = zpool_get_features(zhp); char compat[ZFS_MAXPROPLEN]; if (zpool_get_prop(zhp, ZPOOL_PROP_COMPATIBILITY, compat, ZFS_MAXPROPLEN, NULL, B_FALSE) != 0) compat[0] = '\0'; boolean_t requested_features[SPA_FEATURES]; if (zpool_do_load_compat(compat, requested_features) != ZPOOL_COMPATIBILITY_OK) return (-1); count = 0; for (i = 0; i < SPA_FEATURES; i++) { const char *fname = spa_feature_table[i].fi_uname; const char *fguid = spa_feature_table[i].fi_guid; if (!spa_feature_table[i].fi_zfs_mod_supported) continue; if (!nvlist_exists(enabled, fguid) && requested_features[i]) { char *propname; verify(-1 != asprintf(&propname, "feature@%s", fname)); ret = zpool_set_prop(zhp, propname, ZFS_FEATURE_ENABLED); if (ret != 0) { free(propname); return (ret); } count++; if (firstff) { (void) printf(gettext("Enabled the " "following features on '%s':\n"), zpool_get_name(zhp)); firstff = B_FALSE; } (void) printf(gettext(" %s\n"), fname); free(propname); } } if (countp != NULL) *countp = count; return (0); } static int upgrade_cb(zpool_handle_t *zhp, void *arg) { upgrade_cbdata_t *cbp = arg; nvlist_t *config; uint64_t version; boolean_t modified_pool = B_FALSE; int ret; config = zpool_get_config(zhp, NULL); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) == 0); assert(SPA_VERSION_IS_SUPPORTED(version)); if (version < cbp->cb_version) { cbp->cb_first = B_FALSE; ret = upgrade_version(zhp, cbp->cb_version); if (ret != 0) return (ret); modified_pool = B_TRUE; /* * If they did "zpool upgrade -a", then we could * be doing ioctls to different pools. We need * to log this history once to each pool, and bypass * the normal history logging that happens in main(). */ (void) zpool_log_history(g_zfs, history_str); log_history = B_FALSE; } if (cbp->cb_version >= SPA_VERSION_FEATURES) { int count; ret = upgrade_enable_all(zhp, &count); if (ret != 0) return (ret); if (count > 0) { cbp->cb_first = B_FALSE; modified_pool = B_TRUE; } } if (modified_pool) { (void) printf("\n"); (void) after_zpool_upgrade(zhp); } return (0); } static int upgrade_list_older_cb(zpool_handle_t *zhp, void *arg) { upgrade_cbdata_t *cbp = arg; nvlist_t *config; uint64_t version; config = zpool_get_config(zhp, NULL); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) == 0); assert(SPA_VERSION_IS_SUPPORTED(version)); if (version < SPA_VERSION_FEATURES) { if (cbp->cb_first) { (void) printf(gettext("The following pools are " "formatted with legacy version numbers and can\n" "be upgraded to use feature flags. After " "being upgraded, these pools\nwill no " "longer be accessible by software that does not " "support feature\nflags.\n\n" "Note that setting a pool's 'compatibility' " "feature to '" ZPOOL_COMPAT_LEGACY "' will\n" "inhibit upgrades.\n\n")); (void) printf(gettext("VER POOL\n")); (void) printf(gettext("--- ------------\n")); cbp->cb_first = B_FALSE; } (void) printf("%2llu %s\n", (u_longlong_t)version, zpool_get_name(zhp)); } return (0); } static int upgrade_list_disabled_cb(zpool_handle_t *zhp, void *arg) { upgrade_cbdata_t *cbp = arg; nvlist_t *config; uint64_t version; config = zpool_get_config(zhp, NULL); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) == 0); if (version >= SPA_VERSION_FEATURES) { int i; boolean_t poolfirst = B_TRUE; nvlist_t *enabled = zpool_get_features(zhp); for (i = 0; i < SPA_FEATURES; i++) { const char *fguid = spa_feature_table[i].fi_guid; const char *fname = spa_feature_table[i].fi_uname; if (!spa_feature_table[i].fi_zfs_mod_supported) continue; if (!nvlist_exists(enabled, fguid)) { if (cbp->cb_first) { (void) printf(gettext("\nSome " "supported features are not " "enabled on the following pools. " "Once a\nfeature is enabled the " "pool may become incompatible with " "software\nthat does not support " "the feature. See " "zpool-features(7) for " "details.\n\n" "Note that the pool " "'compatibility' feature can be " "used to inhibit\nfeature " "upgrades.\n\n")); (void) printf(gettext("POOL " "FEATURE\n")); (void) printf(gettext("------" "---------\n")); cbp->cb_first = B_FALSE; } if (poolfirst) { (void) printf(gettext("%s\n"), zpool_get_name(zhp)); poolfirst = B_FALSE; } (void) printf(gettext(" %s\n"), fname); } /* * If they did "zpool upgrade -a", then we could * be doing ioctls to different pools. We need * to log this history once to each pool, and bypass * the normal history logging that happens in main(). */ (void) zpool_log_history(g_zfs, history_str); log_history = B_FALSE; } } return (0); } static int upgrade_one(zpool_handle_t *zhp, void *data) { boolean_t modified_pool = B_FALSE; upgrade_cbdata_t *cbp = data; uint64_t cur_version; int ret; if (strcmp("log", zpool_get_name(zhp)) == 0) { (void) fprintf(stderr, gettext("'log' is now a reserved word\n" "Pool 'log' must be renamed using export and import" " to upgrade.\n")); return (1); } cur_version = zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL); if (cur_version > cbp->cb_version) { (void) printf(gettext("Pool '%s' is already formatted " "using more current version '%llu'.\n\n"), zpool_get_name(zhp), (u_longlong_t)cur_version); return (0); } if (cbp->cb_version != SPA_VERSION && cur_version == cbp->cb_version) { (void) printf(gettext("Pool '%s' is already formatted " "using version %llu.\n\n"), zpool_get_name(zhp), (u_longlong_t)cbp->cb_version); return (0); } if (cur_version != cbp->cb_version) { modified_pool = B_TRUE; ret = upgrade_version(zhp, cbp->cb_version); if (ret != 0) return (ret); } if (cbp->cb_version >= SPA_VERSION_FEATURES) { int count = 0; ret = upgrade_enable_all(zhp, &count); if (ret != 0) return (ret); if (count != 0) { modified_pool = B_TRUE; } else if (cur_version == SPA_VERSION) { (void) printf(gettext("Pool '%s' already has all " "supported and requested features enabled.\n"), zpool_get_name(zhp)); } } if (modified_pool) { (void) printf("\n"); (void) after_zpool_upgrade(zhp); } return (0); } /* * zpool upgrade * zpool upgrade -v * zpool upgrade [-V version] <-a | pool ...> * * With no arguments, display downrev'd ZFS pool available for upgrade. * Individual pools can be upgraded by specifying the pool, and '-a' will * upgrade all pools. */ int zpool_do_upgrade(int argc, char **argv) { int c; upgrade_cbdata_t cb = { 0 }; int ret = 0; boolean_t showversions = B_FALSE; boolean_t upgradeall = B_FALSE; char *end; /* check options */ while ((c = getopt(argc, argv, ":avV:")) != -1) { switch (c) { case 'a': upgradeall = B_TRUE; break; case 'v': showversions = B_TRUE; break; case 'V': cb.cb_version = strtoll(optarg, &end, 10); if (*end != '\0' || !SPA_VERSION_IS_SUPPORTED(cb.cb_version)) { (void) fprintf(stderr, gettext("invalid version '%s'\n"), optarg); usage(B_FALSE); } break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } cb.cb_argc = argc; cb.cb_argv = argv; argc -= optind; argv += optind; if (cb.cb_version == 0) { cb.cb_version = SPA_VERSION; } else if (!upgradeall && argc == 0) { (void) fprintf(stderr, gettext("-V option is " "incompatible with other arguments\n")); usage(B_FALSE); } if (showversions) { if (upgradeall || argc != 0) { (void) fprintf(stderr, gettext("-v option is " "incompatible with other arguments\n")); usage(B_FALSE); } } else if (upgradeall) { if (argc != 0) { (void) fprintf(stderr, gettext("-a option should not " "be used along with a pool name\n")); usage(B_FALSE); } } (void) printf("%s", gettext("This system supports ZFS pool feature " "flags.\n\n")); if (showversions) { int i; (void) printf(gettext("The following features are " "supported:\n\n")); (void) printf(gettext("FEAT DESCRIPTION\n")); (void) printf("----------------------------------------------" "---------------\n"); for (i = 0; i < SPA_FEATURES; i++) { zfeature_info_t *fi = &spa_feature_table[i]; if (!fi->fi_zfs_mod_supported) continue; const char *ro = (fi->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ? " (read-only compatible)" : ""; (void) printf("%-37s%s\n", fi->fi_uname, ro); (void) printf(" %s\n", fi->fi_desc); } (void) printf("\n"); (void) printf(gettext("The following legacy versions are also " "supported:\n\n")); (void) printf(gettext("VER DESCRIPTION\n")); (void) printf("--- -----------------------------------------" "---------------\n"); (void) printf(gettext(" 1 Initial ZFS version\n")); (void) printf(gettext(" 2 Ditto blocks " "(replicated metadata)\n")); (void) printf(gettext(" 3 Hot spares and double parity " "RAID-Z\n")); (void) printf(gettext(" 4 zpool history\n")); (void) printf(gettext(" 5 Compression using the gzip " "algorithm\n")); (void) printf(gettext(" 6 bootfs pool property\n")); (void) printf(gettext(" 7 Separate intent log devices\n")); (void) printf(gettext(" 8 Delegated administration\n")); (void) printf(gettext(" 9 refquota and refreservation " "properties\n")); (void) printf(gettext(" 10 Cache devices\n")); (void) printf(gettext(" 11 Improved scrub performance\n")); (void) printf(gettext(" 12 Snapshot properties\n")); (void) printf(gettext(" 13 snapused property\n")); (void) printf(gettext(" 14 passthrough-x aclinherit\n")); (void) printf(gettext(" 15 user/group space accounting\n")); (void) printf(gettext(" 16 stmf property support\n")); (void) printf(gettext(" 17 Triple-parity RAID-Z\n")); (void) printf(gettext(" 18 Snapshot user holds\n")); (void) printf(gettext(" 19 Log device removal\n")); (void) printf(gettext(" 20 Compression using zle " "(zero-length encoding)\n")); (void) printf(gettext(" 21 Deduplication\n")); (void) printf(gettext(" 22 Received properties\n")); (void) printf(gettext(" 23 Slim ZIL\n")); (void) printf(gettext(" 24 System attributes\n")); (void) printf(gettext(" 25 Improved scrub stats\n")); (void) printf(gettext(" 26 Improved snapshot deletion " "performance\n")); (void) printf(gettext(" 27 Improved snapshot creation " "performance\n")); (void) printf(gettext(" 28 Multiple vdev replacements\n")); (void) printf(gettext("\nFor more information on a particular " "version, including supported releases,\n")); (void) printf(gettext("see the ZFS Administration Guide.\n\n")); } else if (argc == 0 && upgradeall) { cb.cb_first = B_TRUE; ret = zpool_iter(g_zfs, upgrade_cb, &cb); if (ret == 0 && cb.cb_first) { if (cb.cb_version == SPA_VERSION) { (void) printf(gettext("All pools are already " "formatted using feature flags.\n\n")); (void) printf(gettext("Every feature flags " "pool already has all supported and " "requested features enabled.\n")); } else { (void) printf(gettext("All pools are already " "formatted with version %llu or higher.\n"), (u_longlong_t)cb.cb_version); } } } else if (argc == 0) { cb.cb_first = B_TRUE; ret = zpool_iter(g_zfs, upgrade_list_older_cb, &cb); assert(ret == 0); if (cb.cb_first) { (void) printf(gettext("All pools are formatted " "using feature flags.\n\n")); } else { (void) printf(gettext("\nUse 'zpool upgrade -v' " "for a list of available legacy versions.\n")); } cb.cb_first = B_TRUE; ret = zpool_iter(g_zfs, upgrade_list_disabled_cb, &cb); assert(ret == 0); if (cb.cb_first) { (void) printf(gettext("Every feature flags pool has " "all supported and requested features enabled.\n")); } else { (void) printf(gettext("\n")); } } else { ret = for_each_pool(argc, argv, B_FALSE, NULL, ZFS_TYPE_POOL, B_FALSE, upgrade_one, &cb); } return (ret); } typedef struct hist_cbdata { boolean_t first; boolean_t longfmt; boolean_t internal; } hist_cbdata_t; static void print_history_records(nvlist_t *nvhis, hist_cbdata_t *cb) { nvlist_t **records; uint_t numrecords; int i; verify(nvlist_lookup_nvlist_array(nvhis, ZPOOL_HIST_RECORD, &records, &numrecords) == 0); for (i = 0; i < numrecords; i++) { nvlist_t *rec = records[i]; char tbuf[64] = ""; if (nvlist_exists(rec, ZPOOL_HIST_TIME)) { time_t tsec; struct tm t; tsec = fnvlist_lookup_uint64(records[i], ZPOOL_HIST_TIME); (void) localtime_r(&tsec, &t); (void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t); } if (nvlist_exists(rec, ZPOOL_HIST_ELAPSED_NS)) { uint64_t elapsed_ns = fnvlist_lookup_int64(records[i], ZPOOL_HIST_ELAPSED_NS); (void) snprintf(tbuf + strlen(tbuf), sizeof (tbuf) - strlen(tbuf), " (%lldms)", (long long)elapsed_ns / 1000 / 1000); } if (nvlist_exists(rec, ZPOOL_HIST_CMD)) { (void) printf("%s %s", tbuf, fnvlist_lookup_string(rec, ZPOOL_HIST_CMD)); } else if (nvlist_exists(rec, ZPOOL_HIST_INT_EVENT)) { int ievent = fnvlist_lookup_uint64(rec, ZPOOL_HIST_INT_EVENT); if (!cb->internal) continue; if (ievent >= ZFS_NUM_LEGACY_HISTORY_EVENTS) { (void) printf("%s unrecognized record:\n", tbuf); dump_nvlist(rec, 4); continue; } (void) printf("%s [internal %s txg:%lld] %s", tbuf, zfs_history_event_names[ievent], (longlong_t)fnvlist_lookup_uint64( rec, ZPOOL_HIST_TXG), fnvlist_lookup_string(rec, ZPOOL_HIST_INT_STR)); } else if (nvlist_exists(rec, ZPOOL_HIST_INT_NAME)) { if (!cb->internal) continue; (void) printf("%s [txg:%lld] %s", tbuf, (longlong_t)fnvlist_lookup_uint64( rec, ZPOOL_HIST_TXG), fnvlist_lookup_string(rec, ZPOOL_HIST_INT_NAME)); if (nvlist_exists(rec, ZPOOL_HIST_DSNAME)) { (void) printf(" %s (%llu)", fnvlist_lookup_string(rec, ZPOOL_HIST_DSNAME), (u_longlong_t)fnvlist_lookup_uint64(rec, ZPOOL_HIST_DSID)); } (void) printf(" %s", fnvlist_lookup_string(rec, ZPOOL_HIST_INT_STR)); } else if (nvlist_exists(rec, ZPOOL_HIST_IOCTL)) { if (!cb->internal) continue; (void) printf("%s ioctl %s\n", tbuf, fnvlist_lookup_string(rec, ZPOOL_HIST_IOCTL)); if (nvlist_exists(rec, ZPOOL_HIST_INPUT_NVL)) { (void) printf(" input:\n"); dump_nvlist(fnvlist_lookup_nvlist(rec, ZPOOL_HIST_INPUT_NVL), 8); } if (nvlist_exists(rec, ZPOOL_HIST_OUTPUT_NVL)) { (void) printf(" output:\n"); dump_nvlist(fnvlist_lookup_nvlist(rec, ZPOOL_HIST_OUTPUT_NVL), 8); } if (nvlist_exists(rec, ZPOOL_HIST_OUTPUT_SIZE)) { (void) printf(" output nvlist omitted; " "original size: %lldKB\n", (longlong_t)fnvlist_lookup_int64(rec, ZPOOL_HIST_OUTPUT_SIZE) / 1024); } if (nvlist_exists(rec, ZPOOL_HIST_ERRNO)) { (void) printf(" errno: %lld\n", (longlong_t)fnvlist_lookup_int64(rec, ZPOOL_HIST_ERRNO)); } } else { if (!cb->internal) continue; (void) printf("%s unrecognized record:\n", tbuf); dump_nvlist(rec, 4); } if (!cb->longfmt) { (void) printf("\n"); continue; } (void) printf(" ["); if (nvlist_exists(rec, ZPOOL_HIST_WHO)) { uid_t who = fnvlist_lookup_uint64(rec, ZPOOL_HIST_WHO); struct passwd *pwd = getpwuid(who); (void) printf("user %d ", (int)who); if (pwd != NULL) (void) printf("(%s) ", pwd->pw_name); } if (nvlist_exists(rec, ZPOOL_HIST_HOST)) { (void) printf("on %s", fnvlist_lookup_string(rec, ZPOOL_HIST_HOST)); } if (nvlist_exists(rec, ZPOOL_HIST_ZONE)) { (void) printf(":%s", fnvlist_lookup_string(rec, ZPOOL_HIST_ZONE)); } (void) printf("]"); (void) printf("\n"); } } /* * Print out the command history for a specific pool. */ static int get_history_one(zpool_handle_t *zhp, void *data) { nvlist_t *nvhis; int ret; hist_cbdata_t *cb = (hist_cbdata_t *)data; uint64_t off = 0; boolean_t eof = B_FALSE; cb->first = B_FALSE; (void) printf(gettext("History for '%s':\n"), zpool_get_name(zhp)); while (!eof) { if ((ret = zpool_get_history(zhp, &nvhis, &off, &eof)) != 0) return (ret); print_history_records(nvhis, cb); nvlist_free(nvhis); } (void) printf("\n"); return (ret); } /* * zpool history * * Displays the history of commands that modified pools. */ int zpool_do_history(int argc, char **argv) { hist_cbdata_t cbdata = { 0 }; int ret; int c; cbdata.first = B_TRUE; /* check options */ while ((c = getopt(argc, argv, "li")) != -1) { switch (c) { case 'l': cbdata.longfmt = B_TRUE; break; case 'i': cbdata.internal = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; ret = for_each_pool(argc, argv, B_FALSE, NULL, ZFS_TYPE_POOL, B_FALSE, get_history_one, &cbdata); if (argc == 0 && cbdata.first == B_TRUE) { (void) fprintf(stderr, gettext("no pools available\n")); return (0); } return (ret); } typedef struct ev_opts { int verbose; int scripted; int follow; int clear; char poolname[ZFS_MAX_DATASET_NAME_LEN]; } ev_opts_t; static void zpool_do_events_short(nvlist_t *nvl, ev_opts_t *opts) { char ctime_str[26], str[32]; const char *ptr; int64_t *tv; uint_t n; verify(nvlist_lookup_int64_array(nvl, FM_EREPORT_TIME, &tv, &n) == 0); memset(str, ' ', 32); (void) ctime_r((const time_t *)&tv[0], ctime_str); (void) memcpy(str, ctime_str+4, 6); /* 'Jun 30' */ (void) memcpy(str+7, ctime_str+20, 4); /* '1993' */ (void) memcpy(str+12, ctime_str+11, 8); /* '21:49:08' */ (void) sprintf(str+20, ".%09lld", (longlong_t)tv[1]); /* '.123456789' */ if (opts->scripted) (void) printf(gettext("%s\t"), str); else (void) printf(gettext("%s "), str); verify(nvlist_lookup_string(nvl, FM_CLASS, &ptr) == 0); (void) printf(gettext("%s\n"), ptr); } static void zpool_do_events_nvprint(nvlist_t *nvl, int depth) { nvpair_t *nvp; for (nvp = nvlist_next_nvpair(nvl, NULL); nvp != NULL; nvp = nvlist_next_nvpair(nvl, nvp)) { data_type_t type = nvpair_type(nvp); const char *name = nvpair_name(nvp); boolean_t b; uint8_t i8; uint16_t i16; uint32_t i32; uint64_t i64; const char *str; nvlist_t *cnv; printf(gettext("%*s%s = "), depth, "", name); switch (type) { case DATA_TYPE_BOOLEAN: printf(gettext("%s"), "1"); break; case DATA_TYPE_BOOLEAN_VALUE: (void) nvpair_value_boolean_value(nvp, &b); printf(gettext("%s"), b ? "1" : "0"); break; case DATA_TYPE_BYTE: (void) nvpair_value_byte(nvp, &i8); printf(gettext("0x%x"), i8); break; case DATA_TYPE_INT8: (void) nvpair_value_int8(nvp, (void *)&i8); printf(gettext("0x%x"), i8); break; case DATA_TYPE_UINT8: (void) nvpair_value_uint8(nvp, &i8); printf(gettext("0x%x"), i8); break; case DATA_TYPE_INT16: (void) nvpair_value_int16(nvp, (void *)&i16); printf(gettext("0x%x"), i16); break; case DATA_TYPE_UINT16: (void) nvpair_value_uint16(nvp, &i16); printf(gettext("0x%x"), i16); break; case DATA_TYPE_INT32: (void) nvpair_value_int32(nvp, (void *)&i32); printf(gettext("0x%x"), i32); break; case DATA_TYPE_UINT32: (void) nvpair_value_uint32(nvp, &i32); printf(gettext("0x%x"), i32); break; case DATA_TYPE_INT64: (void) nvpair_value_int64(nvp, (void *)&i64); printf(gettext("0x%llx"), (u_longlong_t)i64); break; case DATA_TYPE_UINT64: (void) nvpair_value_uint64(nvp, &i64); /* * translate vdev state values to readable * strings to aide zpool events consumers */ if (strcmp(name, FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE) == 0 || strcmp(name, FM_EREPORT_PAYLOAD_ZFS_VDEV_LASTSTATE) == 0) { printf(gettext("\"%s\" (0x%llx)"), zpool_state_to_name(i64, VDEV_AUX_NONE), (u_longlong_t)i64); } else { printf(gettext("0x%llx"), (u_longlong_t)i64); } break; case DATA_TYPE_HRTIME: (void) nvpair_value_hrtime(nvp, (void *)&i64); printf(gettext("0x%llx"), (u_longlong_t)i64); break; case DATA_TYPE_STRING: (void) nvpair_value_string(nvp, &str); printf(gettext("\"%s\""), str ? str : ""); break; case DATA_TYPE_NVLIST: printf(gettext("(embedded nvlist)\n")); (void) nvpair_value_nvlist(nvp, &cnv); zpool_do_events_nvprint(cnv, depth + 8); printf(gettext("%*s(end %s)"), depth, "", name); break; case DATA_TYPE_NVLIST_ARRAY: { nvlist_t **val; uint_t i, nelem; (void) nvpair_value_nvlist_array(nvp, &val, &nelem); printf(gettext("(%d embedded nvlists)\n"), nelem); for (i = 0; i < nelem; i++) { printf(gettext("%*s%s[%d] = %s\n"), depth, "", name, i, "(embedded nvlist)"); zpool_do_events_nvprint(val[i], depth + 8); printf(gettext("%*s(end %s[%i])\n"), depth, "", name, i); } printf(gettext("%*s(end %s)\n"), depth, "", name); } break; case DATA_TYPE_INT8_ARRAY: { int8_t *val; uint_t i, nelem; (void) nvpair_value_int8_array(nvp, &val, &nelem); for (i = 0; i < nelem; i++) printf(gettext("0x%x "), val[i]); break; } case DATA_TYPE_UINT8_ARRAY: { uint8_t *val; uint_t i, nelem; (void) nvpair_value_uint8_array(nvp, &val, &nelem); for (i = 0; i < nelem; i++) printf(gettext("0x%x "), val[i]); break; } case DATA_TYPE_INT16_ARRAY: { int16_t *val; uint_t i, nelem; (void) nvpair_value_int16_array(nvp, &val, &nelem); for (i = 0; i < nelem; i++) printf(gettext("0x%x "), val[i]); break; } case DATA_TYPE_UINT16_ARRAY: { uint16_t *val; uint_t i, nelem; (void) nvpair_value_uint16_array(nvp, &val, &nelem); for (i = 0; i < nelem; i++) printf(gettext("0x%x "), val[i]); break; } case DATA_TYPE_INT32_ARRAY: { int32_t *val; uint_t i, nelem; (void) nvpair_value_int32_array(nvp, &val, &nelem); for (i = 0; i < nelem; i++) printf(gettext("0x%x "), val[i]); break; } case DATA_TYPE_UINT32_ARRAY: { uint32_t *val; uint_t i, nelem; (void) nvpair_value_uint32_array(nvp, &val, &nelem); for (i = 0; i < nelem; i++) printf(gettext("0x%x "), val[i]); break; } case DATA_TYPE_INT64_ARRAY: { int64_t *val; uint_t i, nelem; (void) nvpair_value_int64_array(nvp, &val, &nelem); for (i = 0; i < nelem; i++) printf(gettext("0x%llx "), (u_longlong_t)val[i]); break; } case DATA_TYPE_UINT64_ARRAY: { uint64_t *val; uint_t i, nelem; (void) nvpair_value_uint64_array(nvp, &val, &nelem); for (i = 0; i < nelem; i++) printf(gettext("0x%llx "), (u_longlong_t)val[i]); break; } case DATA_TYPE_STRING_ARRAY: { const char **str; uint_t i, nelem; (void) nvpair_value_string_array(nvp, &str, &nelem); for (i = 0; i < nelem; i++) printf(gettext("\"%s\" "), str[i] ? str[i] : ""); break; } case DATA_TYPE_BOOLEAN_ARRAY: case DATA_TYPE_BYTE_ARRAY: case DATA_TYPE_DOUBLE: case DATA_TYPE_DONTCARE: case DATA_TYPE_UNKNOWN: printf(gettext("")); break; } printf(gettext("\n")); } } static int zpool_do_events_next(ev_opts_t *opts) { nvlist_t *nvl; int zevent_fd, ret, dropped; const char *pool; zevent_fd = open(ZFS_DEV, O_RDWR); VERIFY(zevent_fd >= 0); if (!opts->scripted) (void) printf(gettext("%-30s %s\n"), "TIME", "CLASS"); while (1) { ret = zpool_events_next(g_zfs, &nvl, &dropped, (opts->follow ? ZEVENT_NONE : ZEVENT_NONBLOCK), zevent_fd); if (ret || nvl == NULL) break; if (dropped > 0) (void) printf(gettext("dropped %d events\n"), dropped); if (strlen(opts->poolname) > 0 && nvlist_lookup_string(nvl, FM_FMRI_ZFS_POOL, &pool) == 0 && strcmp(opts->poolname, pool) != 0) continue; zpool_do_events_short(nvl, opts); if (opts->verbose) { zpool_do_events_nvprint(nvl, 8); printf(gettext("\n")); } (void) fflush(stdout); nvlist_free(nvl); } VERIFY(0 == close(zevent_fd)); return (ret); } static int zpool_do_events_clear(void) { int count, ret; ret = zpool_events_clear(g_zfs, &count); if (!ret) (void) printf(gettext("cleared %d events\n"), count); return (ret); } /* * zpool events [-vHf [pool] | -c] * * Displays events logs by ZFS. */ int zpool_do_events(int argc, char **argv) { ev_opts_t opts = { 0 }; int ret; int c; /* check options */ while ((c = getopt(argc, argv, "vHfc")) != -1) { switch (c) { case 'v': opts.verbose = 1; break; case 'H': opts.scripted = 1; break; case 'f': opts.follow = 1; break; case 'c': opts.clear = 1; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } else if (argc == 1) { (void) strlcpy(opts.poolname, argv[0], sizeof (opts.poolname)); if (!zfs_name_valid(opts.poolname, ZFS_TYPE_POOL)) { (void) fprintf(stderr, gettext("invalid pool name '%s'\n"), opts.poolname); usage(B_FALSE); } } if ((argc == 1 || opts.verbose || opts.scripted || opts.follow) && opts.clear) { (void) fprintf(stderr, gettext("invalid options combined with -c\n")); usage(B_FALSE); } if (opts.clear) ret = zpool_do_events_clear(); else ret = zpool_do_events_next(&opts); return (ret); } static int get_callback_vdev(zpool_handle_t *zhp, char *vdevname, void *data) { zprop_get_cbdata_t *cbp = (zprop_get_cbdata_t *)data; char value[ZFS_MAXPROPLEN]; zprop_source_t srctype; for (zprop_list_t *pl = cbp->cb_proplist; pl != NULL; pl = pl->pl_next) { char *prop_name; /* * If the first property is pool name, it is a special * placeholder that we can skip. This will also skip * over the name property when 'all' is specified. */ if (pl->pl_prop == ZPOOL_PROP_NAME && pl == cbp->cb_proplist) continue; if (pl->pl_prop == ZPROP_INVAL) { prop_name = pl->pl_user_prop; } else { prop_name = (char *)vdev_prop_to_name(pl->pl_prop); } if (zpool_get_vdev_prop(zhp, vdevname, pl->pl_prop, prop_name, value, sizeof (value), &srctype, cbp->cb_literal) == 0) { zprop_print_one_property(vdevname, cbp, prop_name, value, srctype, NULL, NULL); } } return (0); } static int get_callback_vdev_cb(void *zhp_data, nvlist_t *nv, void *data) { zpool_handle_t *zhp = zhp_data; zprop_get_cbdata_t *cbp = (zprop_get_cbdata_t *)data; char *vdevname; const char *type; int ret; /* * zpool_vdev_name() transforms the root vdev name (i.e., root-0) to the * pool name for display purposes, which is not desired. Fallback to * zpool_vdev_name() when not dealing with the root vdev. */ type = fnvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE); if (zhp != NULL && strcmp(type, "root") == 0) vdevname = strdup("root-0"); else vdevname = zpool_vdev_name(g_zfs, zhp, nv, cbp->cb_vdevs.cb_name_flags); (void) vdev_expand_proplist(zhp, vdevname, &cbp->cb_proplist); ret = get_callback_vdev(zhp, vdevname, data); free(vdevname); return (ret); } static int get_callback(zpool_handle_t *zhp, void *data) { zprop_get_cbdata_t *cbp = (zprop_get_cbdata_t *)data; char value[ZFS_MAXPROPLEN]; zprop_source_t srctype; zprop_list_t *pl; int vid; if (cbp->cb_type == ZFS_TYPE_VDEV) { if (strcmp(cbp->cb_vdevs.cb_names[0], "all-vdevs") == 0) { for_each_vdev(zhp, get_callback_vdev_cb, data); } else { /* Adjust column widths for vdev properties */ for (vid = 0; vid < cbp->cb_vdevs.cb_names_count; vid++) { vdev_expand_proplist(zhp, cbp->cb_vdevs.cb_names[vid], &cbp->cb_proplist); } /* Display the properties */ for (vid = 0; vid < cbp->cb_vdevs.cb_names_count; vid++) { get_callback_vdev(zhp, cbp->cb_vdevs.cb_names[vid], data); } } } else { assert(cbp->cb_type == ZFS_TYPE_POOL); for (pl = cbp->cb_proplist; pl != NULL; pl = pl->pl_next) { /* * Skip the special fake placeholder. This will also * skip over the name property when 'all' is specified. */ if (pl->pl_prop == ZPOOL_PROP_NAME && pl == cbp->cb_proplist) continue; if (pl->pl_prop == ZPROP_INVAL && zfs_prop_user(pl->pl_user_prop)) { srctype = ZPROP_SRC_LOCAL; if (zpool_get_userprop(zhp, pl->pl_user_prop, value, sizeof (value), &srctype) != 0) continue; zprop_print_one_property(zpool_get_name(zhp), cbp, pl->pl_user_prop, value, srctype, NULL, NULL); } else if (pl->pl_prop == ZPROP_INVAL && (zpool_prop_feature(pl->pl_user_prop) || zpool_prop_unsupported(pl->pl_user_prop))) { srctype = ZPROP_SRC_LOCAL; if (zpool_prop_get_feature(zhp, pl->pl_user_prop, value, sizeof (value)) == 0) { zprop_print_one_property( zpool_get_name(zhp), cbp, pl->pl_user_prop, value, srctype, NULL, NULL); } } else { if (zpool_get_prop(zhp, pl->pl_prop, value, sizeof (value), &srctype, cbp->cb_literal) != 0) continue; zprop_print_one_property(zpool_get_name(zhp), cbp, zpool_prop_to_name(pl->pl_prop), value, srctype, NULL, NULL); } } } return (0); } /* * zpool get [-Hp] [-o "all" | field[,...]] <"all" | property[,...]> ... * * -H Scripted mode. Don't display headers, and separate properties * by a single tab. * -o List of columns to display. Defaults to * "name,property,value,source". * -p Display values in parsable (exact) format. * * Get properties of pools in the system. Output space statistics * for each one as well as other attributes. */ int zpool_do_get(int argc, char **argv) { zprop_get_cbdata_t cb = { 0 }; zprop_list_t fake_name = { 0 }; int ret; int c, i; char *propstr = NULL; char *vdev = NULL; cb.cb_first = B_TRUE; /* * Set up default columns and sources. */ cb.cb_sources = ZPROP_SRC_ALL; cb.cb_columns[0] = GET_COL_NAME; cb.cb_columns[1] = GET_COL_PROPERTY; cb.cb_columns[2] = GET_COL_VALUE; cb.cb_columns[3] = GET_COL_SOURCE; cb.cb_type = ZFS_TYPE_POOL; cb.cb_vdevs.cb_name_flags |= VDEV_NAME_TYPE_ID; current_prop_type = cb.cb_type; /* check options */ while ((c = getopt(argc, argv, ":Hpo:")) != -1) { switch (c) { case 'p': cb.cb_literal = B_TRUE; break; case 'H': cb.cb_scripted = B_TRUE; break; case 'o': memset(&cb.cb_columns, 0, sizeof (cb.cb_columns)); i = 0; for (char *tok; (tok = strsep(&optarg, ",")); ) { static const char *const col_opts[] = { "name", "property", "value", "source", "all" }; static const zfs_get_column_t col_cols[] = { GET_COL_NAME, GET_COL_PROPERTY, GET_COL_VALUE, GET_COL_SOURCE }; if (i == ZFS_GET_NCOLS - 1) { (void) fprintf(stderr, gettext("too " "many fields given to -o " "option\n")); usage(B_FALSE); } for (c = 0; c < ARRAY_SIZE(col_opts); ++c) if (strcmp(tok, col_opts[c]) == 0) goto found; (void) fprintf(stderr, gettext("invalid column name '%s'\n"), tok); usage(B_FALSE); found: if (c >= 4) { if (i > 0) { (void) fprintf(stderr, gettext("\"all\" conflicts " "with specific fields " "given to -o option\n")); usage(B_FALSE); } memcpy(cb.cb_columns, col_cols, sizeof (col_cols)); i = ZFS_GET_NCOLS - 1; } else cb.cb_columns[i++] = col_cols[c]; } break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing property " "argument\n")); usage(B_FALSE); } /* Properties list is needed later by zprop_get_list() */ propstr = argv[0]; argc--; argv++; if (argc == 0) { /* No args, so just print the defaults. */ } else if (are_all_pools(argc, argv)) { /* All the args are pool names */ } else if (are_all_pools(1, argv)) { /* The first arg is a pool name */ if ((argc == 2 && strcmp(argv[1], "all-vdevs") == 0) || (argc == 2 && strcmp(argv[1], "root") == 0) || are_vdevs_in_pool(argc - 1, argv + 1, argv[0], &cb.cb_vdevs)) { if (strcmp(argv[1], "root") == 0) vdev = strdup("root-0"); else vdev = strdup(argv[1]); /* ... and the rest are vdev names */ cb.cb_vdevs.cb_names = &vdev; cb.cb_vdevs.cb_names_count = argc - 1; cb.cb_type = ZFS_TYPE_VDEV; argc = 1; /* One pool to process */ } else { fprintf(stderr, gettext("Expected a list of vdevs in" " \"%s\", but got:\n"), argv[0]); error_list_unresolved_vdevs(argc - 1, argv + 1, argv[0], &cb.cb_vdevs); fprintf(stderr, "\n"); usage(B_FALSE); return (1); } } else { /* * The first arg isn't a pool name, */ fprintf(stderr, gettext("missing pool name.\n")); fprintf(stderr, "\n"); usage(B_FALSE); return (1); } if (zprop_get_list(g_zfs, propstr, &cb.cb_proplist, cb.cb_type) != 0) { /* Use correct list of valid properties (pool or vdev) */ current_prop_type = cb.cb_type; usage(B_FALSE); } if (cb.cb_proplist != NULL) { fake_name.pl_prop = ZPOOL_PROP_NAME; fake_name.pl_width = strlen(gettext("NAME")); fake_name.pl_next = cb.cb_proplist; cb.cb_proplist = &fake_name; } ret = for_each_pool(argc, argv, B_TRUE, &cb.cb_proplist, cb.cb_type, cb.cb_literal, get_callback, &cb); if (cb.cb_proplist == &fake_name) zprop_free_list(fake_name.pl_next); else zprop_free_list(cb.cb_proplist); if (vdev != NULL) free(vdev); return (ret); } typedef struct set_cbdata { char *cb_propname; char *cb_value; zfs_type_t cb_type; vdev_cbdata_t cb_vdevs; boolean_t cb_any_successful; } set_cbdata_t; static int set_pool_callback(zpool_handle_t *zhp, set_cbdata_t *cb) { int error; /* Check if we have out-of-bounds features */ if (strcmp(cb->cb_propname, ZPOOL_CONFIG_COMPATIBILITY) == 0) { boolean_t features[SPA_FEATURES]; if (zpool_do_load_compat(cb->cb_value, features) != ZPOOL_COMPATIBILITY_OK) return (-1); nvlist_t *enabled = zpool_get_features(zhp); spa_feature_t i; for (i = 0; i < SPA_FEATURES; i++) { const char *fguid = spa_feature_table[i].fi_guid; if (nvlist_exists(enabled, fguid) && !features[i]) break; } if (i < SPA_FEATURES) (void) fprintf(stderr, gettext("Warning: one or " "more features already enabled on pool '%s'\n" "are not present in this compatibility set.\n"), zpool_get_name(zhp)); } /* if we're setting a feature, check it's in compatibility set */ if (zpool_prop_feature(cb->cb_propname) && strcmp(cb->cb_value, ZFS_FEATURE_ENABLED) == 0) { char *fname = strchr(cb->cb_propname, '@') + 1; spa_feature_t f; if (zfeature_lookup_name(fname, &f) == 0) { char compat[ZFS_MAXPROPLEN]; if (zpool_get_prop(zhp, ZPOOL_PROP_COMPATIBILITY, compat, ZFS_MAXPROPLEN, NULL, B_FALSE) != 0) compat[0] = '\0'; boolean_t features[SPA_FEATURES]; if (zpool_do_load_compat(compat, features) != ZPOOL_COMPATIBILITY_OK) { (void) fprintf(stderr, gettext("Error: " "cannot enable feature '%s' on pool '%s'\n" "because the pool's 'compatibility' " "property cannot be parsed.\n"), fname, zpool_get_name(zhp)); return (-1); } if (!features[f]) { (void) fprintf(stderr, gettext("Error: " "cannot enable feature '%s' on pool '%s'\n" "as it is not specified in this pool's " "current compatibility set.\n" "Consider setting 'compatibility' to a " "less restrictive set, or to 'off'.\n"), fname, zpool_get_name(zhp)); return (-1); } } } error = zpool_set_prop(zhp, cb->cb_propname, cb->cb_value); return (error); } static int set_callback(zpool_handle_t *zhp, void *data) { int error; set_cbdata_t *cb = (set_cbdata_t *)data; if (cb->cb_type == ZFS_TYPE_VDEV) { error = zpool_set_vdev_prop(zhp, *cb->cb_vdevs.cb_names, cb->cb_propname, cb->cb_value); } else { assert(cb->cb_type == ZFS_TYPE_POOL); error = set_pool_callback(zhp, cb); } cb->cb_any_successful = !error; return (error); } int zpool_do_set(int argc, char **argv) { set_cbdata_t cb = { 0 }; int error; char *vdev = NULL; current_prop_type = ZFS_TYPE_POOL; if (argc > 1 && argv[1][0] == '-') { (void) fprintf(stderr, gettext("invalid option '%c'\n"), argv[1][1]); usage(B_FALSE); } if (argc < 2) { (void) fprintf(stderr, gettext("missing property=value " "argument\n")); usage(B_FALSE); } if (argc < 3) { (void) fprintf(stderr, gettext("missing pool name\n")); usage(B_FALSE); } if (argc > 4) { (void) fprintf(stderr, gettext("too many pool names\n")); usage(B_FALSE); } cb.cb_propname = argv[1]; cb.cb_type = ZFS_TYPE_POOL; cb.cb_vdevs.cb_name_flags |= VDEV_NAME_TYPE_ID; cb.cb_value = strchr(cb.cb_propname, '='); if (cb.cb_value == NULL) { (void) fprintf(stderr, gettext("missing value in " "property=value argument\n")); usage(B_FALSE); } *(cb.cb_value) = '\0'; cb.cb_value++; argc -= 2; argv += 2; /* argv[0] is pool name */ if (!is_pool(argv[0])) { (void) fprintf(stderr, gettext("cannot open '%s': is not a pool\n"), argv[0]); return (EINVAL); } /* argv[1], when supplied, is vdev name */ if (argc == 2) { if (strcmp(argv[1], "root") == 0) vdev = strdup("root-0"); else vdev = strdup(argv[1]); if (!are_vdevs_in_pool(1, &vdev, argv[0], &cb.cb_vdevs)) { (void) fprintf(stderr, gettext( "cannot find '%s' in '%s': device not in pool\n"), vdev, argv[0]); free(vdev); return (EINVAL); } cb.cb_vdevs.cb_names = &vdev; cb.cb_vdevs.cb_names_count = 1; cb.cb_type = ZFS_TYPE_VDEV; } error = for_each_pool(1, argv, B_TRUE, NULL, ZFS_TYPE_POOL, B_FALSE, set_callback, &cb); if (vdev != NULL) free(vdev); return (error); } /* Add up the total number of bytes left to initialize/trim across all vdevs */ static uint64_t vdev_activity_remaining(nvlist_t *nv, zpool_wait_activity_t activity) { uint64_t bytes_remaining; nvlist_t **child; uint_t c, children; vdev_stat_t *vs; assert(activity == ZPOOL_WAIT_INITIALIZE || activity == ZPOOL_WAIT_TRIM); verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &c) == 0); if (activity == ZPOOL_WAIT_INITIALIZE && vs->vs_initialize_state == VDEV_INITIALIZE_ACTIVE) bytes_remaining = vs->vs_initialize_bytes_est - vs->vs_initialize_bytes_done; else if (activity == ZPOOL_WAIT_TRIM && vs->vs_trim_state == VDEV_TRIM_ACTIVE) bytes_remaining = vs->vs_trim_bytes_est - vs->vs_trim_bytes_done; else bytes_remaining = 0; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) children = 0; for (c = 0; c < children; c++) bytes_remaining += vdev_activity_remaining(child[c], activity); return (bytes_remaining); } /* Add up the total number of bytes left to rebuild across top-level vdevs */ static uint64_t vdev_activity_top_remaining(nvlist_t *nv) { uint64_t bytes_remaining = 0; nvlist_t **child; uint_t children; int error; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) children = 0; for (uint_t c = 0; c < children; c++) { vdev_rebuild_stat_t *vrs; uint_t i; error = nvlist_lookup_uint64_array(child[c], ZPOOL_CONFIG_REBUILD_STATS, (uint64_t **)&vrs, &i); if (error == 0) { if (vrs->vrs_state == VDEV_REBUILD_ACTIVE) { bytes_remaining += (vrs->vrs_bytes_est - vrs->vrs_bytes_rebuilt); } } } return (bytes_remaining); } /* Whether any vdevs are 'spare' or 'replacing' vdevs */ static boolean_t vdev_any_spare_replacing(nvlist_t *nv) { nvlist_t **child; uint_t c, children; const char *vdev_type; (void) nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &vdev_type); if (strcmp(vdev_type, VDEV_TYPE_REPLACING) == 0 || strcmp(vdev_type, VDEV_TYPE_SPARE) == 0 || strcmp(vdev_type, VDEV_TYPE_DRAID_SPARE) == 0) { return (B_TRUE); } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) children = 0; for (c = 0; c < children; c++) { if (vdev_any_spare_replacing(child[c])) return (B_TRUE); } return (B_FALSE); } typedef struct wait_data { char *wd_poolname; boolean_t wd_scripted; boolean_t wd_exact; boolean_t wd_headers_once; boolean_t wd_should_exit; /* Which activities to wait for */ boolean_t wd_enabled[ZPOOL_WAIT_NUM_ACTIVITIES]; float wd_interval; pthread_cond_t wd_cv; pthread_mutex_t wd_mutex; } wait_data_t; /* * Print to stdout a single line, containing one column for each activity that * we are waiting for specifying how many bytes of work are left for that * activity. */ static void print_wait_status_row(wait_data_t *wd, zpool_handle_t *zhp, int row) { nvlist_t *config, *nvroot; uint_t c; int i; pool_checkpoint_stat_t *pcs = NULL; pool_scan_stat_t *pss = NULL; pool_removal_stat_t *prs = NULL; pool_raidz_expand_stat_t *pres = NULL; const char *const headers[] = {"DISCARD", "FREE", "INITIALIZE", "REPLACE", "REMOVE", "RESILVER", "SCRUB", "TRIM", "RAIDZ_EXPAND"}; int col_widths[ZPOOL_WAIT_NUM_ACTIVITIES]; /* Calculate the width of each column */ for (i = 0; i < ZPOOL_WAIT_NUM_ACTIVITIES; i++) { /* * Make sure we have enough space in the col for pretty-printed * numbers and for the column header, and then leave a couple * spaces between cols for readability. */ col_widths[i] = MAX(strlen(headers[i]), 6) + 2; } if (timestamp_fmt != NODATE) print_timestamp(timestamp_fmt); /* Print header if appropriate */ int term_height = terminal_height(); boolean_t reprint_header = (!wd->wd_headers_once && term_height > 0 && row % (term_height-1) == 0); if (!wd->wd_scripted && (row == 0 || reprint_header)) { for (i = 0; i < ZPOOL_WAIT_NUM_ACTIVITIES; i++) { if (wd->wd_enabled[i]) (void) printf("%*s", col_widths[i], headers[i]); } (void) fputc('\n', stdout); } /* Bytes of work remaining in each activity */ int64_t bytes_rem[ZPOOL_WAIT_NUM_ACTIVITIES] = {0}; bytes_rem[ZPOOL_WAIT_FREE] = zpool_get_prop_int(zhp, ZPOOL_PROP_FREEING, NULL); config = zpool_get_config(zhp, NULL); nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c); if (pcs != NULL && pcs->pcs_state == CS_CHECKPOINT_DISCARDING) bytes_rem[ZPOOL_WAIT_CKPT_DISCARD] = pcs->pcs_space; (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t **)&prs, &c); if (prs != NULL && prs->prs_state == DSS_SCANNING) bytes_rem[ZPOOL_WAIT_REMOVE] = prs->prs_to_copy - prs->prs_copied; (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&pss, &c); if (pss != NULL && pss->pss_state == DSS_SCANNING && pss->pss_pass_scrub_pause == 0) { int64_t rem = pss->pss_to_examine - pss->pss_issued; if (pss->pss_func == POOL_SCAN_SCRUB) bytes_rem[ZPOOL_WAIT_SCRUB] = rem; else bytes_rem[ZPOOL_WAIT_RESILVER] = rem; } else if (check_rebuilding(nvroot, NULL)) { bytes_rem[ZPOOL_WAIT_RESILVER] = vdev_activity_top_remaining(nvroot); } (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_RAIDZ_EXPAND_STATS, (uint64_t **)&pres, &c); if (pres != NULL && pres->pres_state == DSS_SCANNING) { int64_t rem = pres->pres_to_reflow - pres->pres_reflowed; bytes_rem[ZPOOL_WAIT_RAIDZ_EXPAND] = rem; } bytes_rem[ZPOOL_WAIT_INITIALIZE] = vdev_activity_remaining(nvroot, ZPOOL_WAIT_INITIALIZE); bytes_rem[ZPOOL_WAIT_TRIM] = vdev_activity_remaining(nvroot, ZPOOL_WAIT_TRIM); /* * A replace finishes after resilvering finishes, so the amount of work * left for a replace is the same as for resilvering. * * It isn't quite correct to say that if we have any 'spare' or * 'replacing' vdevs and a resilver is happening, then a replace is in * progress, like we do here. When a hot spare is used, the faulted vdev * is not removed after the hot spare is resilvered, so parent 'spare' * vdev is not removed either. So we could have a 'spare' vdev, but be * resilvering for a different reason. However, we use it as a heuristic * because we don't have access to the DTLs, which could tell us whether * or not we have really finished resilvering a hot spare. */ if (vdev_any_spare_replacing(nvroot)) bytes_rem[ZPOOL_WAIT_REPLACE] = bytes_rem[ZPOOL_WAIT_RESILVER]; for (i = 0; i < ZPOOL_WAIT_NUM_ACTIVITIES; i++) { char buf[64]; if (!wd->wd_enabled[i]) continue; if (wd->wd_exact) { (void) snprintf(buf, sizeof (buf), "%" PRIi64, bytes_rem[i]); } else { zfs_nicenum(bytes_rem[i], buf, sizeof (buf)); } if (wd->wd_scripted) (void) printf(i == 0 ? "%s" : "\t%s", buf); else (void) printf(" %*s", col_widths[i] - 1, buf); } (void) printf("\n"); (void) fflush(stdout); } static void * wait_status_thread(void *arg) { wait_data_t *wd = (wait_data_t *)arg; zpool_handle_t *zhp; if ((zhp = zpool_open(g_zfs, wd->wd_poolname)) == NULL) return (void *)(1); for (int row = 0; ; row++) { boolean_t missing; struct timespec timeout; int ret = 0; (void) clock_gettime(CLOCK_REALTIME, &timeout); if (zpool_refresh_stats(zhp, &missing) != 0 || missing || zpool_props_refresh(zhp) != 0) { zpool_close(zhp); return (void *)(uintptr_t)(missing ? 0 : 1); } print_wait_status_row(wd, zhp, row); timeout.tv_sec += floor(wd->wd_interval); long nanos = timeout.tv_nsec + (wd->wd_interval - floor(wd->wd_interval)) * NANOSEC; if (nanos >= NANOSEC) { timeout.tv_sec++; timeout.tv_nsec = nanos - NANOSEC; } else { timeout.tv_nsec = nanos; } pthread_mutex_lock(&wd->wd_mutex); if (!wd->wd_should_exit) ret = pthread_cond_timedwait(&wd->wd_cv, &wd->wd_mutex, &timeout); pthread_mutex_unlock(&wd->wd_mutex); if (ret == 0) { break; /* signaled by main thread */ } else if (ret != ETIMEDOUT) { (void) fprintf(stderr, gettext("pthread_cond_timedwait " "failed: %s\n"), strerror(ret)); zpool_close(zhp); return (void *)(uintptr_t)(1); } } zpool_close(zhp); return (void *)(0); } int zpool_do_wait(int argc, char **argv) { boolean_t verbose = B_FALSE; int c, i; unsigned long count; pthread_t status_thr; int error = 0; zpool_handle_t *zhp; wait_data_t wd; wd.wd_scripted = B_FALSE; wd.wd_exact = B_FALSE; wd.wd_headers_once = B_FALSE; wd.wd_should_exit = B_FALSE; pthread_mutex_init(&wd.wd_mutex, NULL); pthread_cond_init(&wd.wd_cv, NULL); /* By default, wait for all types of activity. */ for (i = 0; i < ZPOOL_WAIT_NUM_ACTIVITIES; i++) wd.wd_enabled[i] = B_TRUE; while ((c = getopt(argc, argv, "HpT:t:")) != -1) { switch (c) { case 'H': wd.wd_scripted = B_TRUE; break; case 'n': wd.wd_headers_once = B_TRUE; break; case 'p': wd.wd_exact = B_TRUE; break; case 'T': get_timestamp_arg(*optarg); break; case 't': /* Reset activities array */ memset(&wd.wd_enabled, 0, sizeof (wd.wd_enabled)); for (char *tok; (tok = strsep(&optarg, ",")); ) { static const char *const col_opts[] = { "discard", "free", "initialize", "replace", "remove", "resilver", "scrub", "trim", "raidz_expand" }; for (i = 0; i < ARRAY_SIZE(col_opts); ++i) if (strcmp(tok, col_opts[i]) == 0) { wd.wd_enabled[i] = B_TRUE; goto found; } (void) fprintf(stderr, gettext("invalid activity '%s'\n"), tok); usage(B_FALSE); found:; } break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; get_interval_count(&argc, argv, &wd.wd_interval, &count); if (count != 0) { /* This subcmd only accepts an interval, not a count */ (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } if (wd.wd_interval != 0) verbose = B_TRUE; if (argc < 1) { (void) fprintf(stderr, gettext("missing 'pool' argument\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } wd.wd_poolname = argv[0]; if ((zhp = zpool_open(g_zfs, wd.wd_poolname)) == NULL) return (1); if (verbose) { /* * We use a separate thread for printing status updates because * the main thread will call lzc_wait(), which blocks as long * as an activity is in progress, which can be a long time. */ if (pthread_create(&status_thr, NULL, wait_status_thread, &wd) != 0) { (void) fprintf(stderr, gettext("failed to create status" "thread: %s\n"), strerror(errno)); zpool_close(zhp); return (1); } } /* * Loop over all activities that we are supposed to wait for until none * of them are in progress. Note that this means we can end up waiting * for more activities to complete than just those that were in progress * when we began waiting; if an activity we are interested in begins * while we are waiting for another activity, we will wait for both to * complete before exiting. */ for (;;) { boolean_t missing = B_FALSE; boolean_t any_waited = B_FALSE; for (i = 0; i < ZPOOL_WAIT_NUM_ACTIVITIES; i++) { boolean_t waited; if (!wd.wd_enabled[i]) continue; error = zpool_wait_status(zhp, i, &missing, &waited); if (error != 0 || missing) break; any_waited = (any_waited || waited); } if (error != 0 || missing || !any_waited) break; } zpool_close(zhp); if (verbose) { uintptr_t status; pthread_mutex_lock(&wd.wd_mutex); wd.wd_should_exit = B_TRUE; pthread_cond_signal(&wd.wd_cv); pthread_mutex_unlock(&wd.wd_mutex); (void) pthread_join(status_thr, (void *)&status); if (status != 0) error = status; } pthread_mutex_destroy(&wd.wd_mutex); pthread_cond_destroy(&wd.wd_cv); return (error); } static int find_command_idx(const char *command, int *idx) { for (int i = 0; i < NCOMMAND; ++i) { if (command_table[i].name == NULL) continue; if (strcmp(command, command_table[i].name) == 0) { *idx = i; return (0); } } return (1); } /* * Display version message */ static int zpool_do_version(int argc, char **argv) { (void) argc, (void) argv; return (zfs_version_print() != 0); } /* Display documentation */ static int zpool_do_help(int argc, char **argv) { char page[MAXNAMELEN]; if (argc < 3 || strcmp(argv[2], "zpool") == 0) strcpy(page, "zpool"); else if (strcmp(argv[2], "concepts") == 0 || strcmp(argv[2], "props") == 0) snprintf(page, sizeof (page), "zpool%s", argv[2]); else snprintf(page, sizeof (page), "zpool-%s", argv[2]); execlp("man", "man", page, NULL); fprintf(stderr, "couldn't run man program: %s", strerror(errno)); return (-1); } /* * Do zpool_load_compat() and print error message on failure */ static zpool_compat_status_t zpool_do_load_compat(const char *compat, boolean_t *list) { char report[1024]; zpool_compat_status_t ret; ret = zpool_load_compat(compat, list, report, 1024); switch (ret) { case ZPOOL_COMPATIBILITY_OK: break; case ZPOOL_COMPATIBILITY_NOFILES: case ZPOOL_COMPATIBILITY_BADFILE: case ZPOOL_COMPATIBILITY_BADTOKEN: (void) fprintf(stderr, "Error: %s\n", report); break; case ZPOOL_COMPATIBILITY_WARNTOKEN: (void) fprintf(stderr, "Warning: %s\n", report); ret = ZPOOL_COMPATIBILITY_OK; break; } return (ret); } int main(int argc, char **argv) { int ret = 0; int i = 0; char *cmdname; char **newargv; (void) setlocale(LC_ALL, ""); (void) setlocale(LC_NUMERIC, "C"); (void) textdomain(TEXT_DOMAIN); srand(time(NULL)); opterr = 0; /* * Make sure the user has specified some command. */ if (argc < 2) { (void) fprintf(stderr, gettext("missing command\n")); usage(B_FALSE); } cmdname = argv[1]; /* * Special case '-?' */ if ((strcmp(cmdname, "-?") == 0) || strcmp(cmdname, "--help") == 0) usage(B_TRUE); /* * Special case '-V|--version' */ if ((strcmp(cmdname, "-V") == 0) || (strcmp(cmdname, "--version") == 0)) return (zpool_do_version(argc, argv)); /* * Special case 'help' */ if (strcmp(cmdname, "help") == 0) return (zpool_do_help(argc, argv)); if ((g_zfs = libzfs_init()) == NULL) { (void) fprintf(stderr, "%s\n", libzfs_error_init(errno)); return (1); } libzfs_print_on_error(g_zfs, B_TRUE); zfs_save_arguments(argc, argv, history_str, sizeof (history_str)); /* * Many commands modify input strings for string parsing reasons. * We create a copy to protect the original argv. */ newargv = safe_malloc((argc + 1) * sizeof (newargv[0])); for (i = 0; i < argc; i++) newargv[i] = strdup(argv[i]); newargv[argc] = NULL; /* * Run the appropriate command. */ if (find_command_idx(cmdname, &i) == 0) { current_command = &command_table[i]; ret = command_table[i].func(argc - 1, newargv + 1); } else if (strchr(cmdname, '=')) { verify(find_command_idx("set", &i) == 0); current_command = &command_table[i]; ret = command_table[i].func(argc, newargv); } else if (strcmp(cmdname, "freeze") == 0 && argc == 3) { /* * 'freeze' is a vile debugging abomination, so we treat * it as such. */ zfs_cmd_t zc = {"\0"}; (void) strlcpy(zc.zc_name, argv[2], sizeof (zc.zc_name)); ret = zfs_ioctl(g_zfs, ZFS_IOC_POOL_FREEZE, &zc); if (ret != 0) { (void) fprintf(stderr, gettext("failed to freeze pool: %d\n"), errno); ret = 1; } log_history = 0; } else { (void) fprintf(stderr, gettext("unrecognized " "command '%s'\n"), cmdname); usage(B_FALSE); ret = 1; } for (i = 0; i < argc; i++) free(newargv[i]); free(newargv); if (ret == 0 && log_history) (void) zpool_log_history(g_zfs, history_str); libzfs_fini(g_zfs); /* * The 'ZFS_ABORT' environment variable causes us to dump core on exit * for the purposes of running ::findleaks. */ if (getenv("ZFS_ABORT") != NULL) { (void) printf("dumping core by request\n"); abort(); } return (ret); } diff --git a/include/sys/fm/fs/zfs.h b/include/sys/fm/fs/zfs.h index fb9e8649221e..c746600cd2d5 100644 --- a/include/sys/fm/fs/zfs.h +++ b/include/sys/fm/fs/zfs.h @@ -1,135 +1,137 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* * Copyright (c) 2020 by Delphix. All rights reserved. */ #ifndef _SYS_FM_FS_ZFS_H #define _SYS_FM_FS_ZFS_H #ifdef __cplusplus extern "C" { #endif #define ZFS_ERROR_CLASS "fs.zfs" #define FM_EREPORT_ZFS_CHECKSUM "checksum" #define FM_EREPORT_ZFS_AUTHENTICATION "authentication" #define FM_EREPORT_ZFS_IO "io" #define FM_EREPORT_ZFS_DATA "data" #define FM_EREPORT_ZFS_DELAY "delay" #define FM_EREPORT_ZFS_DEADMAN "deadman" #define FM_EREPORT_ZFS_POOL "zpool" #define FM_EREPORT_ZFS_DEVICE_UNKNOWN "vdev.unknown" #define FM_EREPORT_ZFS_DEVICE_OPEN_FAILED "vdev.open_failed" #define FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA "vdev.corrupt_data" #define FM_EREPORT_ZFS_DEVICE_NO_REPLICAS "vdev.no_replicas" #define FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM "vdev.bad_guid_sum" #define FM_EREPORT_ZFS_DEVICE_TOO_SMALL "vdev.too_small" #define FM_EREPORT_ZFS_DEVICE_BAD_LABEL "vdev.bad_label" #define FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT "vdev.bad_ashift" #define FM_EREPORT_ZFS_IO_FAILURE "io_failure" #define FM_EREPORT_ZFS_PROBE_FAILURE "probe_failure" #define FM_EREPORT_ZFS_LOG_REPLAY "log_replay" #define FM_EREPORT_ZFS_CONFIG_CACHE_WRITE "config_cache_write" #define FM_EREPORT_PAYLOAD_ZFS_POOL "pool" #define FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE "pool_failmode" #define FM_EREPORT_PAYLOAD_ZFS_POOL_GUID "pool_guid" #define FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT "pool_context" #define FM_EREPORT_PAYLOAD_ZFS_POOL_STATE "pool_state" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID "vdev_guid" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE "vdev_type" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH "vdev_path" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_PHYSPATH "vdev_physpath" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_ENC_SYSFS_PATH "vdev_enc_sysfs_path" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID "vdev_devid" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU "vdev_fru" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE "vdev_state" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_LASTSTATE "vdev_laststate" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_ASHIFT "vdev_ashift" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_COMP_TS "vdev_complete_ts" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_DELTA_TS "vdev_delta_ts" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_SPARE_PATHS "vdev_spare_paths" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_SPARE_GUIDS "vdev_spare_guids" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_READ_ERRORS "vdev_read_errors" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_WRITE_ERRORS "vdev_write_errors" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_ERRORS "vdev_cksum_errors" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_N "vdev_cksum_n" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_T "vdev_cksum_t" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_IO_N "vdev_io_n" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_IO_T "vdev_io_t" +#define FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_N "vdev_slow_io_n" +#define FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_T "vdev_slow_io_t" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_DELAYS "vdev_delays" #define FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID "parent_guid" #define FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE "parent_type" #define FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH "parent_path" #define FM_EREPORT_PAYLOAD_ZFS_PARENT_DEVID "parent_devid" #define FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET "zio_objset" #define FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT "zio_object" #define FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL "zio_level" #define FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID "zio_blkid" #define FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR "zio_err" #define FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET "zio_offset" #define FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE "zio_size" #define FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS "zio_flags" #define FM_EREPORT_PAYLOAD_ZFS_ZIO_STAGE "zio_stage" #define FM_EREPORT_PAYLOAD_ZFS_ZIO_PRIORITY "zio_priority" #define FM_EREPORT_PAYLOAD_ZFS_ZIO_PIPELINE "zio_pipeline" #define FM_EREPORT_PAYLOAD_ZFS_ZIO_DELAY "zio_delay" #define FM_EREPORT_PAYLOAD_ZFS_ZIO_TIMESTAMP "zio_timestamp" #define FM_EREPORT_PAYLOAD_ZFS_ZIO_DELTA "zio_delta" #define FM_EREPORT_PAYLOAD_ZFS_PREV_STATE "prev_state" #define FM_EREPORT_PAYLOAD_ZFS_CKSUM_ALGO "cksum_algorithm" #define FM_EREPORT_PAYLOAD_ZFS_CKSUM_BYTESWAP "cksum_byteswap" #define FM_EREPORT_PAYLOAD_ZFS_BAD_OFFSET_RANGES "bad_ranges" #define FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_MIN_GAP "bad_ranges_min_gap" #define FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_SETS "bad_range_sets" #define FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_CLEARS "bad_range_clears" #define FM_EREPORT_PAYLOAD_ZFS_BAD_SET_BITS "bad_set_bits" #define FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_BITS "bad_cleared_bits" #define FM_EREPORT_PAYLOAD_ZFS_SNAPSHOT_NAME "snapshot_name" #define FM_EREPORT_PAYLOAD_ZFS_DEVICE_NAME "device_name" #define FM_EREPORT_PAYLOAD_ZFS_RAW_DEVICE_NAME "raw_name" #define FM_EREPORT_PAYLOAD_ZFS_VOLUME "volume" #define FM_EREPORT_FAILMODE_WAIT "wait" #define FM_EREPORT_FAILMODE_CONTINUE "continue" #define FM_EREPORT_FAILMODE_PANIC "panic" #define FM_RESOURCE_REMOVED "removed" #define FM_RESOURCE_AUTOREPLACE "autoreplace" #define FM_RESOURCE_STATECHANGE "statechange" #define FM_RESOURCE_ZFS_SNAPSHOT_MOUNT "snapshot_mount" #define FM_RESOURCE_ZFS_SNAPSHOT_UNMOUNT "snapshot_unmount" #define FM_RESOURCE_ZVOL_CREATE_SYMLINK "zvol_create" #define FM_RESOURCE_ZVOL_REMOVE_SYMLINK "zvol_remove" #ifdef __cplusplus } #endif #endif /* _SYS_FM_FS_ZFS_H */ diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index c6f7dcca78b3..025567e2183f 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -1,1893 +1,1895 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2013, 2017 Joyent, Inc. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019 Datto Inc. * Portions Copyright 2010 Robert Milkowski * Copyright (c) 2021, Colm Buckley * Copyright (c) 2022 Hewlett Packard Enterprise Development LP. */ #ifndef _SYS_FS_ZFS_H #define _SYS_FS_ZFS_H extern __attribute__((visibility("default"))) #include #include #ifdef __cplusplus extern "C" { #endif /* * Types and constants shared between userland and the kernel. */ /* * Each dataset can be one of the following types. These constants can be * combined into masks that can be passed to various functions. */ typedef enum { ZFS_TYPE_INVALID = 0, ZFS_TYPE_FILESYSTEM = (1 << 0), ZFS_TYPE_SNAPSHOT = (1 << 1), ZFS_TYPE_VOLUME = (1 << 2), ZFS_TYPE_POOL = (1 << 3), ZFS_TYPE_BOOKMARK = (1 << 4), ZFS_TYPE_VDEV = (1 << 5), } zfs_type_t; /* * NB: lzc_dataset_type should be updated whenever a new objset type is added, * if it represents a real type of a dataset that can be created from userland. */ typedef enum dmu_objset_type { DMU_OST_NONE, DMU_OST_META, DMU_OST_ZFS, DMU_OST_ZVOL, DMU_OST_OTHER, /* For testing only! */ DMU_OST_ANY, /* Be careful! */ DMU_OST_NUMTYPES } dmu_objset_type_t; #define ZFS_TYPE_DATASET \ (ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME | ZFS_TYPE_SNAPSHOT) /* * All of these include the terminating NUL byte. */ #define ZAP_MAXNAMELEN 256 #define ZAP_MAXVALUELEN (1024 * 8) #define ZAP_OLDMAXVALUELEN 1024 #define ZFS_MAX_DATASET_NAME_LEN 256 /* * Dataset properties are identified by these constants and must be added to * the end of this list to ensure that external consumers are not affected * by the change. If you make any changes to this list, be sure to update * the property table in module/zcommon/zfs_prop.c. */ typedef enum { ZPROP_CONT = -2, ZPROP_INVAL = -1, ZPROP_USERPROP = ZPROP_INVAL, ZFS_PROP_TYPE = 0, ZFS_PROP_CREATION, ZFS_PROP_USED, ZFS_PROP_AVAILABLE, ZFS_PROP_REFERENCED, ZFS_PROP_COMPRESSRATIO, ZFS_PROP_MOUNTED, ZFS_PROP_ORIGIN, ZFS_PROP_QUOTA, ZFS_PROP_RESERVATION, ZFS_PROP_VOLSIZE, ZFS_PROP_VOLBLOCKSIZE, ZFS_PROP_RECORDSIZE, ZFS_PROP_MOUNTPOINT, ZFS_PROP_SHARENFS, ZFS_PROP_CHECKSUM, ZFS_PROP_COMPRESSION, ZFS_PROP_ATIME, ZFS_PROP_DEVICES, ZFS_PROP_EXEC, ZFS_PROP_SETUID, ZFS_PROP_READONLY, ZFS_PROP_ZONED, ZFS_PROP_SNAPDIR, ZFS_PROP_ACLMODE, ZFS_PROP_ACLINHERIT, ZFS_PROP_CREATETXG, ZFS_PROP_NAME, /* not exposed to the user */ ZFS_PROP_CANMOUNT, ZFS_PROP_ISCSIOPTIONS, /* not exposed to the user */ ZFS_PROP_XATTR, ZFS_PROP_NUMCLONES, /* not exposed to the user */ ZFS_PROP_COPIES, ZFS_PROP_VERSION, ZFS_PROP_UTF8ONLY, ZFS_PROP_NORMALIZE, ZFS_PROP_CASE, ZFS_PROP_VSCAN, ZFS_PROP_NBMAND, ZFS_PROP_SHARESMB, ZFS_PROP_REFQUOTA, ZFS_PROP_REFRESERVATION, ZFS_PROP_GUID, ZFS_PROP_PRIMARYCACHE, ZFS_PROP_SECONDARYCACHE, ZFS_PROP_USEDSNAP, ZFS_PROP_USEDDS, ZFS_PROP_USEDCHILD, ZFS_PROP_USEDREFRESERV, ZFS_PROP_USERACCOUNTING, /* not exposed to the user */ ZFS_PROP_STMF_SHAREINFO, /* not exposed to the user */ ZFS_PROP_DEFER_DESTROY, ZFS_PROP_USERREFS, ZFS_PROP_LOGBIAS, ZFS_PROP_UNIQUE, /* not exposed to the user */ ZFS_PROP_OBJSETID, ZFS_PROP_DEDUP, ZFS_PROP_MLSLABEL, ZFS_PROP_SYNC, ZFS_PROP_DNODESIZE, ZFS_PROP_REFRATIO, ZFS_PROP_WRITTEN, ZFS_PROP_CLONES, ZFS_PROP_LOGICALUSED, ZFS_PROP_LOGICALREFERENCED, ZFS_PROP_INCONSISTENT, /* not exposed to the user */ ZFS_PROP_VOLMODE, ZFS_PROP_FILESYSTEM_LIMIT, ZFS_PROP_SNAPSHOT_LIMIT, ZFS_PROP_FILESYSTEM_COUNT, ZFS_PROP_SNAPSHOT_COUNT, ZFS_PROP_SNAPDEV, ZFS_PROP_ACLTYPE, ZFS_PROP_SELINUX_CONTEXT, ZFS_PROP_SELINUX_FSCONTEXT, ZFS_PROP_SELINUX_DEFCONTEXT, ZFS_PROP_SELINUX_ROOTCONTEXT, ZFS_PROP_RELATIME, ZFS_PROP_REDUNDANT_METADATA, ZFS_PROP_OVERLAY, ZFS_PROP_PREV_SNAP, ZFS_PROP_RECEIVE_RESUME_TOKEN, ZFS_PROP_ENCRYPTION, ZFS_PROP_KEYLOCATION, ZFS_PROP_KEYFORMAT, ZFS_PROP_PBKDF2_SALT, ZFS_PROP_PBKDF2_ITERS, ZFS_PROP_ENCRYPTION_ROOT, ZFS_PROP_KEY_GUID, ZFS_PROP_KEYSTATUS, ZFS_PROP_REMAPTXG, /* obsolete - no longer used */ ZFS_PROP_SPECIAL_SMALL_BLOCKS, ZFS_PROP_IVSET_GUID, /* not exposed to the user */ ZFS_PROP_REDACTED, ZFS_PROP_REDACT_SNAPS, ZFS_PROP_SNAPSHOTS_CHANGED, ZFS_PROP_PREFETCH, ZFS_PROP_VOLTHREADING, ZFS_NUM_PROPS } zfs_prop_t; typedef enum { ZFS_PROP_USERUSED, ZFS_PROP_USERQUOTA, ZFS_PROP_GROUPUSED, ZFS_PROP_GROUPQUOTA, ZFS_PROP_USEROBJUSED, ZFS_PROP_USEROBJQUOTA, ZFS_PROP_GROUPOBJUSED, ZFS_PROP_GROUPOBJQUOTA, ZFS_PROP_PROJECTUSED, ZFS_PROP_PROJECTQUOTA, ZFS_PROP_PROJECTOBJUSED, ZFS_PROP_PROJECTOBJQUOTA, ZFS_NUM_USERQUOTA_PROPS } zfs_userquota_prop_t; _SYS_FS_ZFS_H const char *const zfs_userquota_prop_prefixes[ ZFS_NUM_USERQUOTA_PROPS]; /* * Pool properties are identified by these constants and must be added to the * end of this list to ensure that external consumers are not affected * by the change. Properties must be registered in zfs_prop_init(). */ typedef enum { ZPOOL_PROP_INVAL = -1, ZPOOL_PROP_NAME, ZPOOL_PROP_SIZE, ZPOOL_PROP_CAPACITY, ZPOOL_PROP_ALTROOT, ZPOOL_PROP_HEALTH, ZPOOL_PROP_GUID, ZPOOL_PROP_VERSION, ZPOOL_PROP_BOOTFS, ZPOOL_PROP_DELEGATION, ZPOOL_PROP_AUTOREPLACE, ZPOOL_PROP_CACHEFILE, ZPOOL_PROP_FAILUREMODE, ZPOOL_PROP_LISTSNAPS, ZPOOL_PROP_AUTOEXPAND, ZPOOL_PROP_DEDUPDITTO, ZPOOL_PROP_DEDUPRATIO, ZPOOL_PROP_FREE, ZPOOL_PROP_ALLOCATED, ZPOOL_PROP_READONLY, ZPOOL_PROP_ASHIFT, ZPOOL_PROP_COMMENT, ZPOOL_PROP_EXPANDSZ, ZPOOL_PROP_FREEING, ZPOOL_PROP_FRAGMENTATION, ZPOOL_PROP_LEAKED, ZPOOL_PROP_MAXBLOCKSIZE, ZPOOL_PROP_TNAME, ZPOOL_PROP_MAXDNODESIZE, ZPOOL_PROP_MULTIHOST, ZPOOL_PROP_CHECKPOINT, ZPOOL_PROP_LOAD_GUID, ZPOOL_PROP_AUTOTRIM, ZPOOL_PROP_COMPATIBILITY, ZPOOL_PROP_BCLONEUSED, ZPOOL_PROP_BCLONESAVED, ZPOOL_PROP_BCLONERATIO, ZPOOL_NUM_PROPS } zpool_prop_t; /* Small enough to not hog a whole line of printout in zpool(8). */ #define ZPROP_MAX_COMMENT 32 #define ZPROP_BOOLEAN_NA 2 #define ZPROP_VALUE "value" #define ZPROP_SOURCE "source" typedef enum { ZPROP_SRC_NONE = 0x1, ZPROP_SRC_DEFAULT = 0x2, ZPROP_SRC_TEMPORARY = 0x4, ZPROP_SRC_LOCAL = 0x8, ZPROP_SRC_INHERITED = 0x10, ZPROP_SRC_RECEIVED = 0x20 } zprop_source_t; #define ZPROP_SRC_ALL 0x3f #define ZPROP_SOURCE_VAL_RECVD "$recvd" #define ZPROP_N_MORE_ERRORS "N_MORE_ERRORS" /* * Dataset flag implemented as a special entry in the props zap object * indicating that the dataset has received properties on or after * SPA_VERSION_RECVD_PROPS. The first such receive blows away local properties * just as it did in earlier versions, and thereafter, local properties are * preserved. */ #define ZPROP_HAS_RECVD "$hasrecvd" typedef enum { ZPROP_ERR_NOCLEAR = 0x1, /* failure to clear existing props */ ZPROP_ERR_NORESTORE = 0x2 /* failure to restore props on error */ } zprop_errflags_t; typedef int (*zprop_func)(int, void *); /* * Properties to be set on the root file system of a new pool * are stuffed into their own nvlist, which is then included in * the properties nvlist with the pool properties. */ #define ZPOOL_ROOTFS_PROPS "root-props-nvl" /* * Length of 'written@' and 'written#' */ #define ZFS_WRITTEN_PROP_PREFIX_LEN 8 /* * VDEV properties are identified by these constants and must be added to the * end of this list to ensure that external consumers are not affected * by the change. If you make any changes to this list, be sure to update * the property table in usr/src/common/zfs/zpool_prop.c. */ typedef enum { VDEV_PROP_INVAL = -1, VDEV_PROP_USERPROP = VDEV_PROP_INVAL, VDEV_PROP_NAME, VDEV_PROP_CAPACITY, VDEV_PROP_STATE, VDEV_PROP_GUID, VDEV_PROP_ASIZE, VDEV_PROP_PSIZE, VDEV_PROP_ASHIFT, VDEV_PROP_SIZE, VDEV_PROP_FREE, VDEV_PROP_ALLOCATED, VDEV_PROP_COMMENT, VDEV_PROP_EXPANDSZ, VDEV_PROP_FRAGMENTATION, VDEV_PROP_BOOTSIZE, VDEV_PROP_PARITY, VDEV_PROP_PATH, VDEV_PROP_DEVID, VDEV_PROP_PHYS_PATH, VDEV_PROP_ENC_PATH, VDEV_PROP_FRU, VDEV_PROP_PARENT, VDEV_PROP_CHILDREN, VDEV_PROP_NUMCHILDREN, VDEV_PROP_READ_ERRORS, VDEV_PROP_WRITE_ERRORS, VDEV_PROP_CHECKSUM_ERRORS, VDEV_PROP_INITIALIZE_ERRORS, VDEV_PROP_OPS_NULL, VDEV_PROP_OPS_READ, VDEV_PROP_OPS_WRITE, VDEV_PROP_OPS_FREE, VDEV_PROP_OPS_CLAIM, VDEV_PROP_OPS_TRIM, VDEV_PROP_BYTES_NULL, VDEV_PROP_BYTES_READ, VDEV_PROP_BYTES_WRITE, VDEV_PROP_BYTES_FREE, VDEV_PROP_BYTES_CLAIM, VDEV_PROP_BYTES_TRIM, VDEV_PROP_REMOVING, VDEV_PROP_ALLOCATING, VDEV_PROP_FAILFAST, VDEV_PROP_CHECKSUM_N, VDEV_PROP_CHECKSUM_T, VDEV_PROP_IO_N, VDEV_PROP_IO_T, VDEV_PROP_RAIDZ_EXPANDING, + VDEV_PROP_SLOW_IO_N, + VDEV_PROP_SLOW_IO_T, VDEV_NUM_PROPS } vdev_prop_t; /* * Dataset property functions shared between libzfs and kernel. */ _SYS_FS_ZFS_H const char *zfs_prop_default_string(zfs_prop_t); _SYS_FS_ZFS_H uint64_t zfs_prop_default_numeric(zfs_prop_t); _SYS_FS_ZFS_H boolean_t zfs_prop_readonly(zfs_prop_t); _SYS_FS_ZFS_H boolean_t zfs_prop_visible(zfs_prop_t prop); _SYS_FS_ZFS_H boolean_t zfs_prop_inheritable(zfs_prop_t); _SYS_FS_ZFS_H boolean_t zfs_prop_setonce(zfs_prop_t); _SYS_FS_ZFS_H boolean_t zfs_prop_encryption_key_param(zfs_prop_t); _SYS_FS_ZFS_H boolean_t zfs_prop_valid_keylocation(const char *, boolean_t); _SYS_FS_ZFS_H const char *zfs_prop_to_name(zfs_prop_t); _SYS_FS_ZFS_H zfs_prop_t zfs_name_to_prop(const char *); _SYS_FS_ZFS_H boolean_t zfs_prop_user(const char *); _SYS_FS_ZFS_H boolean_t zfs_prop_userquota(const char *); _SYS_FS_ZFS_H boolean_t zfs_prop_written(const char *); _SYS_FS_ZFS_H int zfs_prop_index_to_string(zfs_prop_t, uint64_t, const char **); _SYS_FS_ZFS_H int zfs_prop_string_to_index(zfs_prop_t, const char *, uint64_t *); _SYS_FS_ZFS_H uint64_t zfs_prop_random_value(zfs_prop_t, uint64_t seed); _SYS_FS_ZFS_H boolean_t zfs_prop_valid_for_type(int, zfs_type_t, boolean_t); /* * Pool property functions shared between libzfs and kernel. */ _SYS_FS_ZFS_H zpool_prop_t zpool_name_to_prop(const char *); _SYS_FS_ZFS_H const char *zpool_prop_to_name(zpool_prop_t); _SYS_FS_ZFS_H const char *zpool_prop_default_string(zpool_prop_t); _SYS_FS_ZFS_H uint64_t zpool_prop_default_numeric(zpool_prop_t); _SYS_FS_ZFS_H boolean_t zpool_prop_readonly(zpool_prop_t); _SYS_FS_ZFS_H boolean_t zpool_prop_setonce(zpool_prop_t); _SYS_FS_ZFS_H boolean_t zpool_prop_feature(const char *); _SYS_FS_ZFS_H boolean_t zpool_prop_unsupported(const char *); _SYS_FS_ZFS_H int zpool_prop_index_to_string(zpool_prop_t, uint64_t, const char **); _SYS_FS_ZFS_H int zpool_prop_string_to_index(zpool_prop_t, const char *, uint64_t *); _SYS_FS_ZFS_H uint64_t zpool_prop_random_value(zpool_prop_t, uint64_t seed); /* * VDEV property functions shared between libzfs and kernel. */ _SYS_FS_ZFS_H vdev_prop_t vdev_name_to_prop(const char *); _SYS_FS_ZFS_H boolean_t vdev_prop_user(const char *name); _SYS_FS_ZFS_H const char *vdev_prop_to_name(vdev_prop_t); _SYS_FS_ZFS_H const char *vdev_prop_default_string(vdev_prop_t); _SYS_FS_ZFS_H uint64_t vdev_prop_default_numeric(vdev_prop_t); _SYS_FS_ZFS_H boolean_t vdev_prop_readonly(vdev_prop_t prop); _SYS_FS_ZFS_H int vdev_prop_index_to_string(vdev_prop_t, uint64_t, const char **); _SYS_FS_ZFS_H int vdev_prop_string_to_index(vdev_prop_t, const char *, uint64_t *); _SYS_FS_ZFS_H boolean_t zpool_prop_vdev(const char *name); _SYS_FS_ZFS_H uint64_t vdev_prop_random_value(vdev_prop_t prop, uint64_t seed); /* * Definitions for the Delegation. */ typedef enum { ZFS_DELEG_WHO_UNKNOWN = 0, ZFS_DELEG_USER = 'u', ZFS_DELEG_USER_SETS = 'U', ZFS_DELEG_GROUP = 'g', ZFS_DELEG_GROUP_SETS = 'G', ZFS_DELEG_EVERYONE = 'e', ZFS_DELEG_EVERYONE_SETS = 'E', ZFS_DELEG_CREATE = 'c', ZFS_DELEG_CREATE_SETS = 'C', ZFS_DELEG_NAMED_SET = 's', ZFS_DELEG_NAMED_SET_SETS = 'S' } zfs_deleg_who_type_t; typedef enum { ZFS_DELEG_NONE = 0, ZFS_DELEG_PERM_LOCAL = 1, ZFS_DELEG_PERM_DESCENDENT = 2, ZFS_DELEG_PERM_LOCALDESCENDENT = 3, ZFS_DELEG_PERM_CREATE = 4 } zfs_deleg_inherit_t; #define ZFS_DELEG_PERM_UID "uid" #define ZFS_DELEG_PERM_GID "gid" #define ZFS_DELEG_PERM_GROUPS "groups" #define ZFS_MLSLABEL_DEFAULT "none" #define ZFS_SMB_ACL_SRC "src" #define ZFS_SMB_ACL_TARGET "target" typedef enum { ZFS_CANMOUNT_OFF = 0, ZFS_CANMOUNT_ON = 1, ZFS_CANMOUNT_NOAUTO = 2 } zfs_canmount_type_t; typedef enum { ZFS_LOGBIAS_LATENCY = 0, ZFS_LOGBIAS_THROUGHPUT = 1 } zfs_logbias_op_t; typedef enum zfs_share_op { ZFS_SHARE_NFS = 0, ZFS_UNSHARE_NFS = 1, ZFS_SHARE_SMB = 2, ZFS_UNSHARE_SMB = 3 } zfs_share_op_t; typedef enum zfs_smb_acl_op { ZFS_SMB_ACL_ADD, ZFS_SMB_ACL_REMOVE, ZFS_SMB_ACL_RENAME, ZFS_SMB_ACL_PURGE } zfs_smb_acl_op_t; typedef enum zfs_cache_type { ZFS_CACHE_NONE = 0, ZFS_CACHE_METADATA = 1, ZFS_CACHE_ALL = 2 } zfs_cache_type_t; typedef enum { ZFS_SYNC_STANDARD = 0, ZFS_SYNC_ALWAYS = 1, ZFS_SYNC_DISABLED = 2 } zfs_sync_type_t; typedef enum { ZFS_XATTR_OFF = 0, ZFS_XATTR_DIR = 1, ZFS_XATTR_SA = 2 } zfs_xattr_type_t; typedef enum { ZFS_DNSIZE_LEGACY = 0, ZFS_DNSIZE_AUTO = 1, ZFS_DNSIZE_1K = 1024, ZFS_DNSIZE_2K = 2048, ZFS_DNSIZE_4K = 4096, ZFS_DNSIZE_8K = 8192, ZFS_DNSIZE_16K = 16384 } zfs_dnsize_type_t; typedef enum { ZFS_REDUNDANT_METADATA_ALL, ZFS_REDUNDANT_METADATA_MOST, ZFS_REDUNDANT_METADATA_SOME, ZFS_REDUNDANT_METADATA_NONE } zfs_redundant_metadata_type_t; typedef enum { ZFS_VOLMODE_DEFAULT = 0, ZFS_VOLMODE_GEOM = 1, ZFS_VOLMODE_DEV = 2, ZFS_VOLMODE_NONE = 3 } zfs_volmode_t; typedef enum zfs_keystatus { ZFS_KEYSTATUS_NONE = 0, ZFS_KEYSTATUS_UNAVAILABLE, ZFS_KEYSTATUS_AVAILABLE, } zfs_keystatus_t; typedef enum zfs_keyformat { ZFS_KEYFORMAT_NONE = 0, ZFS_KEYFORMAT_RAW, ZFS_KEYFORMAT_HEX, ZFS_KEYFORMAT_PASSPHRASE, ZFS_KEYFORMAT_FORMATS } zfs_keyformat_t; typedef enum zfs_key_location { ZFS_KEYLOCATION_NONE = 0, ZFS_KEYLOCATION_PROMPT, ZFS_KEYLOCATION_URI, ZFS_KEYLOCATION_LOCATIONS } zfs_keylocation_t; typedef enum { ZFS_PREFETCH_NONE = 0, ZFS_PREFETCH_METADATA = 1, ZFS_PREFETCH_ALL = 2 } zfs_prefetch_type_t; #define DEFAULT_PBKDF2_ITERATIONS 350000 #define MIN_PBKDF2_ITERATIONS 100000 /* * On-disk version number. */ #define SPA_VERSION_1 1ULL #define SPA_VERSION_2 2ULL #define SPA_VERSION_3 3ULL #define SPA_VERSION_4 4ULL #define SPA_VERSION_5 5ULL #define SPA_VERSION_6 6ULL #define SPA_VERSION_7 7ULL #define SPA_VERSION_8 8ULL #define SPA_VERSION_9 9ULL #define SPA_VERSION_10 10ULL #define SPA_VERSION_11 11ULL #define SPA_VERSION_12 12ULL #define SPA_VERSION_13 13ULL #define SPA_VERSION_14 14ULL #define SPA_VERSION_15 15ULL #define SPA_VERSION_16 16ULL #define SPA_VERSION_17 17ULL #define SPA_VERSION_18 18ULL #define SPA_VERSION_19 19ULL #define SPA_VERSION_20 20ULL #define SPA_VERSION_21 21ULL #define SPA_VERSION_22 22ULL #define SPA_VERSION_23 23ULL #define SPA_VERSION_24 24ULL #define SPA_VERSION_25 25ULL #define SPA_VERSION_26 26ULL #define SPA_VERSION_27 27ULL #define SPA_VERSION_28 28ULL #define SPA_VERSION_5000 5000ULL /* * The incrementing pool version number has been replaced by pool feature * flags. For more details, see zfeature.c. */ #define SPA_VERSION SPA_VERSION_5000 #define SPA_VERSION_STRING "5000" /* * Symbolic names for the changes that caused a SPA_VERSION switch. * Used in the code when checking for presence or absence of a feature. * Feel free to define multiple symbolic names for each version if there * were multiple changes to on-disk structures during that version. * * NOTE: When checking the current SPA_VERSION in your code, be sure * to use spa_version() since it reports the version of the * last synced uberblock. Checking the in-flight version can * be dangerous in some cases. */ #define SPA_VERSION_INITIAL SPA_VERSION_1 #define SPA_VERSION_DITTO_BLOCKS SPA_VERSION_2 #define SPA_VERSION_SPARES SPA_VERSION_3 #define SPA_VERSION_RAIDZ2 SPA_VERSION_3 #define SPA_VERSION_BPOBJ_ACCOUNT SPA_VERSION_3 #define SPA_VERSION_RAIDZ_DEFLATE SPA_VERSION_3 #define SPA_VERSION_DNODE_BYTES SPA_VERSION_3 #define SPA_VERSION_ZPOOL_HISTORY SPA_VERSION_4 #define SPA_VERSION_GZIP_COMPRESSION SPA_VERSION_5 #define SPA_VERSION_BOOTFS SPA_VERSION_6 #define SPA_VERSION_SLOGS SPA_VERSION_7 #define SPA_VERSION_DELEGATED_PERMS SPA_VERSION_8 #define SPA_VERSION_FUID SPA_VERSION_9 #define SPA_VERSION_REFRESERVATION SPA_VERSION_9 #define SPA_VERSION_REFQUOTA SPA_VERSION_9 #define SPA_VERSION_UNIQUE_ACCURATE SPA_VERSION_9 #define SPA_VERSION_L2CACHE SPA_VERSION_10 #define SPA_VERSION_NEXT_CLONES SPA_VERSION_11 #define SPA_VERSION_ORIGIN SPA_VERSION_11 #define SPA_VERSION_DSL_SCRUB SPA_VERSION_11 #define SPA_VERSION_SNAP_PROPS SPA_VERSION_12 #define SPA_VERSION_USED_BREAKDOWN SPA_VERSION_13 #define SPA_VERSION_PASSTHROUGH_X SPA_VERSION_14 #define SPA_VERSION_USERSPACE SPA_VERSION_15 #define SPA_VERSION_STMF_PROP SPA_VERSION_16 #define SPA_VERSION_RAIDZ3 SPA_VERSION_17 #define SPA_VERSION_USERREFS SPA_VERSION_18 #define SPA_VERSION_HOLES SPA_VERSION_19 #define SPA_VERSION_ZLE_COMPRESSION SPA_VERSION_20 #define SPA_VERSION_DEDUP SPA_VERSION_21 #define SPA_VERSION_RECVD_PROPS SPA_VERSION_22 #define SPA_VERSION_SLIM_ZIL SPA_VERSION_23 #define SPA_VERSION_SA SPA_VERSION_24 #define SPA_VERSION_SCAN SPA_VERSION_25 #define SPA_VERSION_DIR_CLONES SPA_VERSION_26 #define SPA_VERSION_DEADLISTS SPA_VERSION_26 #define SPA_VERSION_FAST_SNAP SPA_VERSION_27 #define SPA_VERSION_MULTI_REPLACE SPA_VERSION_28 #define SPA_VERSION_BEFORE_FEATURES SPA_VERSION_28 #define SPA_VERSION_FEATURES SPA_VERSION_5000 #define SPA_VERSION_IS_SUPPORTED(v) \ (((v) >= SPA_VERSION_INITIAL && (v) <= SPA_VERSION_BEFORE_FEATURES) || \ ((v) >= SPA_VERSION_FEATURES && (v) <= SPA_VERSION)) /* * ZPL version - rev'd whenever an incompatible on-disk format change * occurs. This is independent of SPA/DMU/ZAP versioning. You must * also update the version_table[] and help message in zfs_prop.c. */ #define ZPL_VERSION_1 1ULL #define ZPL_VERSION_2 2ULL #define ZPL_VERSION_3 3ULL #define ZPL_VERSION_4 4ULL #define ZPL_VERSION_5 5ULL #define ZPL_VERSION ZPL_VERSION_5 #define ZPL_VERSION_STRING "5" #define ZPL_VERSION_INITIAL ZPL_VERSION_1 #define ZPL_VERSION_DIRENT_TYPE ZPL_VERSION_2 #define ZPL_VERSION_FUID ZPL_VERSION_3 #define ZPL_VERSION_NORMALIZATION ZPL_VERSION_3 #define ZPL_VERSION_SYSATTR ZPL_VERSION_3 #define ZPL_VERSION_USERSPACE ZPL_VERSION_4 #define ZPL_VERSION_SA ZPL_VERSION_5 /* Persistent L2ARC version */ #define L2ARC_PERSISTENT_VERSION_1 1ULL #define L2ARC_PERSISTENT_VERSION L2ARC_PERSISTENT_VERSION_1 #define L2ARC_PERSISTENT_VERSION_STRING "1" /* Rewind policy information */ #define ZPOOL_NO_REWIND 1 /* No policy - default behavior */ #define ZPOOL_NEVER_REWIND 2 /* Do not search for best txg or rewind */ #define ZPOOL_TRY_REWIND 4 /* Search for best txg, but do not rewind */ #define ZPOOL_DO_REWIND 8 /* Rewind to best txg w/in deferred frees */ #define ZPOOL_EXTREME_REWIND 16 /* Allow extreme measures to find best txg */ #define ZPOOL_REWIND_MASK 28 /* All the possible rewind bits */ #define ZPOOL_REWIND_POLICIES 31 /* All the possible policy bits */ typedef struct zpool_load_policy { uint32_t zlp_rewind; /* rewind policy requested */ uint64_t zlp_maxmeta; /* max acceptable meta-data errors */ uint64_t zlp_maxdata; /* max acceptable data errors */ uint64_t zlp_txg; /* specific txg to load */ } zpool_load_policy_t; /* * The following are configuration names used in the nvlist describing a pool's * configuration. New on-disk names should be prefixed with ":" * (e.g. "org.openzfs:") to avoid conflicting names being developed * independently. */ #define ZPOOL_CONFIG_VERSION "version" #define ZPOOL_CONFIG_POOL_NAME "name" #define ZPOOL_CONFIG_POOL_STATE "state" #define ZPOOL_CONFIG_POOL_TXG "txg" #define ZPOOL_CONFIG_POOL_GUID "pool_guid" #define ZPOOL_CONFIG_CREATE_TXG "create_txg" #define ZPOOL_CONFIG_TOP_GUID "top_guid" #define ZPOOL_CONFIG_VDEV_TREE "vdev_tree" #define ZPOOL_CONFIG_TYPE "type" #define ZPOOL_CONFIG_CHILDREN "children" #define ZPOOL_CONFIG_ID "id" #define ZPOOL_CONFIG_GUID "guid" #define ZPOOL_CONFIG_INDIRECT_OBJECT "com.delphix:indirect_object" #define ZPOOL_CONFIG_INDIRECT_BIRTHS "com.delphix:indirect_births" #define ZPOOL_CONFIG_PREV_INDIRECT_VDEV "com.delphix:prev_indirect_vdev" #define ZPOOL_CONFIG_PATH "path" #define ZPOOL_CONFIG_DEVID "devid" #define ZPOOL_CONFIG_SPARE_ID "spareid" #define ZPOOL_CONFIG_METASLAB_ARRAY "metaslab_array" #define ZPOOL_CONFIG_METASLAB_SHIFT "metaslab_shift" #define ZPOOL_CONFIG_ASHIFT "ashift" #define ZPOOL_CONFIG_ASIZE "asize" #define ZPOOL_CONFIG_DTL "DTL" #define ZPOOL_CONFIG_SCAN_STATS "scan_stats" /* not stored on disk */ #define ZPOOL_CONFIG_REMOVAL_STATS "removal_stats" /* not stored on disk */ #define ZPOOL_CONFIG_CHECKPOINT_STATS "checkpoint_stats" /* not on disk */ #define ZPOOL_CONFIG_RAIDZ_EXPAND_STATS "raidz_expand_stats" /* not on disk */ #define ZPOOL_CONFIG_VDEV_STATS "vdev_stats" /* not stored on disk */ #define ZPOOL_CONFIG_INDIRECT_SIZE "indirect_size" /* not stored on disk */ /* container nvlist of extended stats */ #define ZPOOL_CONFIG_VDEV_STATS_EX "vdev_stats_ex" /* Active queue read/write stats */ #define ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE "vdev_sync_r_active_queue" #define ZPOOL_CONFIG_VDEV_SYNC_W_ACTIVE_QUEUE "vdev_sync_w_active_queue" #define ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE "vdev_async_r_active_queue" #define ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE "vdev_async_w_active_queue" #define ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE "vdev_async_scrub_active_queue" #define ZPOOL_CONFIG_VDEV_TRIM_ACTIVE_QUEUE "vdev_async_trim_active_queue" #define ZPOOL_CONFIG_VDEV_REBUILD_ACTIVE_QUEUE "vdev_rebuild_active_queue" /* Queue sizes */ #define ZPOOL_CONFIG_VDEV_SYNC_R_PEND_QUEUE "vdev_sync_r_pend_queue" #define ZPOOL_CONFIG_VDEV_SYNC_W_PEND_QUEUE "vdev_sync_w_pend_queue" #define ZPOOL_CONFIG_VDEV_ASYNC_R_PEND_QUEUE "vdev_async_r_pend_queue" #define ZPOOL_CONFIG_VDEV_ASYNC_W_PEND_QUEUE "vdev_async_w_pend_queue" #define ZPOOL_CONFIG_VDEV_SCRUB_PEND_QUEUE "vdev_async_scrub_pend_queue" #define ZPOOL_CONFIG_VDEV_TRIM_PEND_QUEUE "vdev_async_trim_pend_queue" #define ZPOOL_CONFIG_VDEV_REBUILD_PEND_QUEUE "vdev_rebuild_pend_queue" /* Latency read/write histogram stats */ #define ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO "vdev_tot_r_lat_histo" #define ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO "vdev_tot_w_lat_histo" #define ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO "vdev_disk_r_lat_histo" #define ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO "vdev_disk_w_lat_histo" #define ZPOOL_CONFIG_VDEV_SYNC_R_LAT_HISTO "vdev_sync_r_lat_histo" #define ZPOOL_CONFIG_VDEV_SYNC_W_LAT_HISTO "vdev_sync_w_lat_histo" #define ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO "vdev_async_r_lat_histo" #define ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO "vdev_async_w_lat_histo" #define ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO "vdev_scrub_histo" #define ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO "vdev_trim_histo" #define ZPOOL_CONFIG_VDEV_REBUILD_LAT_HISTO "vdev_rebuild_histo" /* Request size histograms */ #define ZPOOL_CONFIG_VDEV_SYNC_IND_R_HISTO "vdev_sync_ind_r_histo" #define ZPOOL_CONFIG_VDEV_SYNC_IND_W_HISTO "vdev_sync_ind_w_histo" #define ZPOOL_CONFIG_VDEV_ASYNC_IND_R_HISTO "vdev_async_ind_r_histo" #define ZPOOL_CONFIG_VDEV_ASYNC_IND_W_HISTO "vdev_async_ind_w_histo" #define ZPOOL_CONFIG_VDEV_IND_SCRUB_HISTO "vdev_ind_scrub_histo" #define ZPOOL_CONFIG_VDEV_IND_TRIM_HISTO "vdev_ind_trim_histo" #define ZPOOL_CONFIG_VDEV_IND_REBUILD_HISTO "vdev_ind_rebuild_histo" #define ZPOOL_CONFIG_VDEV_SYNC_AGG_R_HISTO "vdev_sync_agg_r_histo" #define ZPOOL_CONFIG_VDEV_SYNC_AGG_W_HISTO "vdev_sync_agg_w_histo" #define ZPOOL_CONFIG_VDEV_ASYNC_AGG_R_HISTO "vdev_async_agg_r_histo" #define ZPOOL_CONFIG_VDEV_ASYNC_AGG_W_HISTO "vdev_async_agg_w_histo" #define ZPOOL_CONFIG_VDEV_AGG_SCRUB_HISTO "vdev_agg_scrub_histo" #define ZPOOL_CONFIG_VDEV_AGG_TRIM_HISTO "vdev_agg_trim_histo" #define ZPOOL_CONFIG_VDEV_AGG_REBUILD_HISTO "vdev_agg_rebuild_histo" /* Number of slow IOs */ #define ZPOOL_CONFIG_VDEV_SLOW_IOS "vdev_slow_ios" /* vdev enclosure sysfs path */ #define ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH "vdev_enc_sysfs_path" #define ZPOOL_CONFIG_WHOLE_DISK "whole_disk" #define ZPOOL_CONFIG_ERRCOUNT "error_count" #define ZPOOL_CONFIG_NOT_PRESENT "not_present" #define ZPOOL_CONFIG_SPARES "spares" #define ZPOOL_CONFIG_IS_SPARE "is_spare" #define ZPOOL_CONFIG_NPARITY "nparity" #define ZPOOL_CONFIG_RAIDZ_EXPANDING "raidz_expanding" #define ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS "raidz_expand_txgs" #define ZPOOL_CONFIG_HOSTID "hostid" #define ZPOOL_CONFIG_HOSTNAME "hostname" #define ZPOOL_CONFIG_LOADED_TIME "initial_load_time" #define ZPOOL_CONFIG_UNSPARE "unspare" #define ZPOOL_CONFIG_PHYS_PATH "phys_path" #define ZPOOL_CONFIG_IS_LOG "is_log" #define ZPOOL_CONFIG_L2CACHE "l2cache" #define ZPOOL_CONFIG_HOLE_ARRAY "hole_array" #define ZPOOL_CONFIG_VDEV_CHILDREN "vdev_children" #define ZPOOL_CONFIG_IS_HOLE "is_hole" #define ZPOOL_CONFIG_DDT_HISTOGRAM "ddt_histogram" #define ZPOOL_CONFIG_DDT_OBJ_STATS "ddt_object_stats" #define ZPOOL_CONFIG_DDT_STATS "ddt_stats" #define ZPOOL_CONFIG_SPLIT "splitcfg" #define ZPOOL_CONFIG_ORIG_GUID "orig_guid" #define ZPOOL_CONFIG_SPLIT_GUID "split_guid" #define ZPOOL_CONFIG_SPLIT_LIST "guid_list" #define ZPOOL_CONFIG_NONALLOCATING "non_allocating" #define ZPOOL_CONFIG_REMOVING "removing" #define ZPOOL_CONFIG_RESILVER_TXG "resilver_txg" #define ZPOOL_CONFIG_REBUILD_TXG "rebuild_txg" #define ZPOOL_CONFIG_COMMENT "comment" #define ZPOOL_CONFIG_SUSPENDED "suspended" /* not stored on disk */ #define ZPOOL_CONFIG_SUSPENDED_REASON "suspended_reason" /* not stored */ #define ZPOOL_CONFIG_TIMESTAMP "timestamp" /* not stored on disk */ #define ZPOOL_CONFIG_BOOTFS "bootfs" /* not stored on disk */ #define ZPOOL_CONFIG_MISSING_DEVICES "missing_vdevs" /* not stored on disk */ #define ZPOOL_CONFIG_LOAD_INFO "load_info" /* not stored on disk */ #define ZPOOL_CONFIG_REWIND_INFO "rewind_info" /* not stored on disk */ #define ZPOOL_CONFIG_UNSUP_FEAT "unsup_feat" /* not stored on disk */ #define ZPOOL_CONFIG_ENABLED_FEAT "enabled_feat" /* not stored on disk */ #define ZPOOL_CONFIG_CAN_RDONLY "can_rdonly" /* not stored on disk */ #define ZPOOL_CONFIG_FEATURES_FOR_READ "features_for_read" #define ZPOOL_CONFIG_FEATURE_STATS "feature_stats" /* not stored on disk */ #define ZPOOL_CONFIG_ERRATA "errata" /* not stored on disk */ #define ZPOOL_CONFIG_VDEV_ROOT_ZAP "com.klarasystems:vdev_zap_root" #define ZPOOL_CONFIG_VDEV_TOP_ZAP "com.delphix:vdev_zap_top" #define ZPOOL_CONFIG_VDEV_LEAF_ZAP "com.delphix:vdev_zap_leaf" #define ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS "com.delphix:has_per_vdev_zaps" #define ZPOOL_CONFIG_RESILVER_DEFER "com.datto:resilver_defer" #define ZPOOL_CONFIG_CACHEFILE "cachefile" /* not stored on disk */ #define ZPOOL_CONFIG_MMP_STATE "mmp_state" /* not stored on disk */ #define ZPOOL_CONFIG_MMP_TXG "mmp_txg" /* not stored on disk */ #define ZPOOL_CONFIG_MMP_SEQ "mmp_seq" /* not stored on disk */ #define ZPOOL_CONFIG_MMP_HOSTNAME "mmp_hostname" /* not stored on disk */ #define ZPOOL_CONFIG_MMP_HOSTID "mmp_hostid" /* not stored on disk */ #define ZPOOL_CONFIG_ALLOCATION_BIAS "alloc_bias" /* not stored on disk */ #define ZPOOL_CONFIG_EXPANSION_TIME "expansion_time" /* not stored */ #define ZPOOL_CONFIG_REBUILD_STATS "org.openzfs:rebuild_stats" #define ZPOOL_CONFIG_COMPATIBILITY "compatibility" /* * The persistent vdev state is stored as separate values rather than a single * 'vdev_state' entry. This is because a device can be in multiple states, such * as offline and degraded. */ #define ZPOOL_CONFIG_OFFLINE "offline" #define ZPOOL_CONFIG_FAULTED "faulted" #define ZPOOL_CONFIG_DEGRADED "degraded" #define ZPOOL_CONFIG_REMOVED "removed" #define ZPOOL_CONFIG_FRU "fru" #define ZPOOL_CONFIG_AUX_STATE "aux_state" /* Pool load policy parameters */ #define ZPOOL_LOAD_POLICY "load-policy" #define ZPOOL_LOAD_REWIND_POLICY "load-rewind-policy" #define ZPOOL_LOAD_REQUEST_TXG "load-request-txg" #define ZPOOL_LOAD_META_THRESH "load-meta-thresh" #define ZPOOL_LOAD_DATA_THRESH "load-data-thresh" /* Rewind data discovered */ #define ZPOOL_CONFIG_LOAD_TIME "rewind_txg_ts" #define ZPOOL_CONFIG_LOAD_META_ERRORS "verify_meta_errors" #define ZPOOL_CONFIG_LOAD_DATA_ERRORS "verify_data_errors" #define ZPOOL_CONFIG_REWIND_TIME "seconds_of_rewind" /* dRAID configuration */ #define ZPOOL_CONFIG_DRAID_NDATA "draid_ndata" #define ZPOOL_CONFIG_DRAID_NSPARES "draid_nspares" #define ZPOOL_CONFIG_DRAID_NGROUPS "draid_ngroups" #define VDEV_TYPE_ROOT "root" #define VDEV_TYPE_MIRROR "mirror" #define VDEV_TYPE_REPLACING "replacing" #define VDEV_TYPE_RAIDZ "raidz" #define VDEV_TYPE_DRAID "draid" #define VDEV_TYPE_DRAID_SPARE "dspare" #define VDEV_TYPE_DISK "disk" #define VDEV_TYPE_FILE "file" #define VDEV_TYPE_MISSING "missing" #define VDEV_TYPE_HOLE "hole" #define VDEV_TYPE_SPARE "spare" #define VDEV_TYPE_LOG "log" #define VDEV_TYPE_L2CACHE "l2cache" #define VDEV_TYPE_INDIRECT "indirect" #define VDEV_RAIDZ_MAXPARITY 3 #define VDEV_DRAID_MAXPARITY 3 #define VDEV_DRAID_MIN_CHILDREN 2 #define VDEV_DRAID_MAX_CHILDREN UINT8_MAX /* VDEV_TOP_ZAP_* are used in top-level vdev ZAP objects. */ #define VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM \ "com.delphix:indirect_obsolete_sm" #define VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE \ "com.delphix:obsolete_counts_are_precise" #define VDEV_TOP_ZAP_POOL_CHECKPOINT_SM \ "com.delphix:pool_checkpoint_sm" #define VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS \ "com.delphix:ms_unflushed_phys_txgs" #define VDEV_TOP_ZAP_VDEV_REBUILD_PHYS \ "org.openzfs:vdev_rebuild" #define VDEV_TOP_ZAP_ALLOCATION_BIAS \ "org.zfsonlinux:allocation_bias" #define VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE \ "org.openzfs:raidz_expand_state" #define VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME \ "org.openzfs:raidz_expand_start_time" #define VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME \ "org.openzfs:raidz_expand_end_time" #define VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED \ "org.openzfs:raidz_expand_bytes_copied" /* vdev metaslab allocation bias */ #define VDEV_ALLOC_BIAS_LOG "log" #define VDEV_ALLOC_BIAS_SPECIAL "special" #define VDEV_ALLOC_BIAS_DEDUP "dedup" /* vdev initialize state */ #define VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET \ "com.delphix:next_offset_to_initialize" #define VDEV_LEAF_ZAP_INITIALIZE_STATE \ "com.delphix:vdev_initialize_state" #define VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME \ "com.delphix:vdev_initialize_action_time" /* vdev TRIM state */ #define VDEV_LEAF_ZAP_TRIM_LAST_OFFSET \ "org.zfsonlinux:next_offset_to_trim" #define VDEV_LEAF_ZAP_TRIM_STATE \ "org.zfsonlinux:vdev_trim_state" #define VDEV_LEAF_ZAP_TRIM_ACTION_TIME \ "org.zfsonlinux:vdev_trim_action_time" #define VDEV_LEAF_ZAP_TRIM_RATE \ "org.zfsonlinux:vdev_trim_rate" #define VDEV_LEAF_ZAP_TRIM_PARTIAL \ "org.zfsonlinux:vdev_trim_partial" #define VDEV_LEAF_ZAP_TRIM_SECURE \ "org.zfsonlinux:vdev_trim_secure" /* * This is needed in userland to report the minimum necessary device size. */ #define SPA_MINDEVSIZE (64ULL << 20) /* * Set if the fragmentation has not yet been calculated. This can happen * because the space maps have not been upgraded or the histogram feature * is not enabled. */ #define ZFS_FRAG_INVALID UINT64_MAX /* * The location of the pool configuration repository, shared between kernel and * userland. */ #define ZPOOL_CACHE_BOOT "/boot/zfs/zpool.cache" #define ZPOOL_CACHE "/etc/zfs/zpool.cache" /* * Settings for zpool compatibility features files */ #define ZPOOL_SYSCONF_COMPAT_D SYSCONFDIR "/zfs/compatibility.d" #define ZPOOL_DATA_COMPAT_D PKGDATADIR "/compatibility.d" #define ZPOOL_COMPAT_MAXSIZE 16384 /* * Hard-wired compatibility settings */ #define ZPOOL_COMPAT_LEGACY "legacy" #define ZPOOL_COMPAT_OFF "off" /* * vdev states are ordered from least to most healthy. * A vdev that's CANT_OPEN or below is considered unusable. */ typedef enum vdev_state { VDEV_STATE_UNKNOWN = 0, /* Uninitialized vdev */ VDEV_STATE_CLOSED, /* Not currently open */ VDEV_STATE_OFFLINE, /* Not allowed to open */ VDEV_STATE_REMOVED, /* Explicitly removed from system */ VDEV_STATE_CANT_OPEN, /* Tried to open, but failed */ VDEV_STATE_FAULTED, /* External request to fault device */ VDEV_STATE_DEGRADED, /* Replicated vdev with unhealthy kids */ VDEV_STATE_HEALTHY /* Presumed good */ } vdev_state_t; #define VDEV_STATE_ONLINE VDEV_STATE_HEALTHY /* * vdev aux states. When a vdev is in the CANT_OPEN state, the aux field * of the vdev stats structure uses these constants to distinguish why. */ typedef enum vdev_aux { VDEV_AUX_NONE, /* no error */ VDEV_AUX_OPEN_FAILED, /* ldi_open_*() or vn_open() failed */ VDEV_AUX_CORRUPT_DATA, /* bad label or disk contents */ VDEV_AUX_NO_REPLICAS, /* insufficient number of replicas */ VDEV_AUX_BAD_GUID_SUM, /* vdev guid sum doesn't match */ VDEV_AUX_TOO_SMALL, /* vdev size is too small */ VDEV_AUX_BAD_LABEL, /* the label is OK but invalid */ VDEV_AUX_VERSION_NEWER, /* on-disk version is too new */ VDEV_AUX_VERSION_OLDER, /* on-disk version is too old */ VDEV_AUX_UNSUP_FEAT, /* unsupported features */ VDEV_AUX_SPARED, /* hot spare used in another pool */ VDEV_AUX_ERR_EXCEEDED, /* too many errors */ VDEV_AUX_IO_FAILURE, /* experienced I/O failure */ VDEV_AUX_BAD_LOG, /* cannot read log chain(s) */ VDEV_AUX_EXTERNAL, /* external diagnosis or forced fault */ VDEV_AUX_SPLIT_POOL, /* vdev was split off into another pool */ VDEV_AUX_BAD_ASHIFT, /* vdev ashift is invalid */ VDEV_AUX_EXTERNAL_PERSIST, /* persistent forced fault */ VDEV_AUX_ACTIVE, /* vdev active on a different host */ VDEV_AUX_CHILDREN_OFFLINE, /* all children are offline */ VDEV_AUX_ASHIFT_TOO_BIG, /* vdev's min block size is too large */ } vdev_aux_t; /* * pool state. The following states are written to disk as part of the normal * SPA lifecycle: ACTIVE, EXPORTED, DESTROYED, SPARE, L2CACHE. The remaining * states are software abstractions used at various levels to communicate * pool state. */ typedef enum pool_state { POOL_STATE_ACTIVE = 0, /* In active use */ POOL_STATE_EXPORTED, /* Explicitly exported */ POOL_STATE_DESTROYED, /* Explicitly destroyed */ POOL_STATE_SPARE, /* Reserved for hot spare use */ POOL_STATE_L2CACHE, /* Level 2 ARC device */ POOL_STATE_UNINITIALIZED, /* Internal spa_t state */ POOL_STATE_UNAVAIL, /* Internal libzfs state */ POOL_STATE_POTENTIALLY_ACTIVE /* Internal libzfs state */ } pool_state_t; /* * mmp state. The following states provide additional detail describing * why a pool couldn't be safely imported. */ typedef enum mmp_state { MMP_STATE_ACTIVE = 0, /* In active use */ MMP_STATE_INACTIVE, /* Inactive and safe to import */ MMP_STATE_NO_HOSTID /* System hostid is not set */ } mmp_state_t; /* * Scan Functions. */ typedef enum pool_scan_func { POOL_SCAN_NONE, POOL_SCAN_SCRUB, POOL_SCAN_RESILVER, POOL_SCAN_ERRORSCRUB, POOL_SCAN_FUNCS } pool_scan_func_t; /* * Used to control scrub pause and resume. */ typedef enum pool_scrub_cmd { POOL_SCRUB_NORMAL = 0, POOL_SCRUB_PAUSE, POOL_SCRUB_FLAGS_END } pool_scrub_cmd_t; typedef enum { CS_NONE, CS_CHECKPOINT_EXISTS, CS_CHECKPOINT_DISCARDING, CS_NUM_STATES } checkpoint_state_t; typedef struct pool_checkpoint_stat { uint64_t pcs_state; /* checkpoint_state_t */ uint64_t pcs_start_time; /* time checkpoint/discard started */ uint64_t pcs_space; /* checkpointed space */ } pool_checkpoint_stat_t; /* * ZIO types. Needed to interpret vdev statistics below. */ typedef enum zio_type { ZIO_TYPE_NULL = 0, ZIO_TYPE_READ, ZIO_TYPE_WRITE, ZIO_TYPE_FREE, ZIO_TYPE_CLAIM, ZIO_TYPE_IOCTL, ZIO_TYPE_TRIM, ZIO_TYPES } zio_type_t; /* * Pool statistics. Note: all fields should be 64-bit because this * is passed between kernel and userland as an nvlist uint64 array. */ typedef struct pool_scan_stat { /* values stored on disk */ uint64_t pss_func; /* pool_scan_func_t */ uint64_t pss_state; /* dsl_scan_state_t */ uint64_t pss_start_time; /* scan start time */ uint64_t pss_end_time; /* scan end time */ uint64_t pss_to_examine; /* total bytes to scan */ uint64_t pss_examined; /* total bytes located by scanner */ uint64_t pss_skipped; /* total bytes skipped by scanner */ uint64_t pss_processed; /* total processed bytes */ uint64_t pss_errors; /* scan errors */ /* values not stored on disk */ uint64_t pss_pass_exam; /* examined bytes per scan pass */ uint64_t pss_pass_start; /* start time of a scan pass */ uint64_t pss_pass_scrub_pause; /* pause time of a scrub pass */ /* cumulative time scrub spent paused, needed for rate calculation */ uint64_t pss_pass_scrub_spent_paused; uint64_t pss_pass_issued; /* issued bytes per scan pass */ uint64_t pss_issued; /* total bytes checked by scanner */ /* error scrub values stored on disk */ uint64_t pss_error_scrub_func; /* pool_scan_func_t */ uint64_t pss_error_scrub_state; /* dsl_scan_state_t */ uint64_t pss_error_scrub_start; /* error scrub start time */ uint64_t pss_error_scrub_end; /* error scrub end time */ uint64_t pss_error_scrub_examined; /* error blocks issued I/O */ /* error blocks to be issued I/O */ uint64_t pss_error_scrub_to_be_examined; /* error scrub values not stored on disk */ /* error scrub pause time in milliseconds */ uint64_t pss_pass_error_scrub_pause; } pool_scan_stat_t; typedef struct pool_removal_stat { uint64_t prs_state; /* dsl_scan_state_t */ uint64_t prs_removing_vdev; uint64_t prs_start_time; uint64_t prs_end_time; uint64_t prs_to_copy; /* bytes that need to be copied */ uint64_t prs_copied; /* bytes copied so far */ /* * bytes of memory used for indirect mappings. * This includes all removed vdevs. */ uint64_t prs_mapping_memory; } pool_removal_stat_t; typedef struct pool_raidz_expand_stat { uint64_t pres_state; /* dsl_scan_state_t */ uint64_t pres_expanding_vdev; uint64_t pres_start_time; uint64_t pres_end_time; uint64_t pres_to_reflow; /* bytes that need to be moved */ uint64_t pres_reflowed; /* bytes moved so far */ uint64_t pres_waiting_for_resilver; } pool_raidz_expand_stat_t; typedef enum dsl_scan_state { DSS_NONE, DSS_SCANNING, DSS_FINISHED, DSS_CANCELED, DSS_ERRORSCRUBBING, DSS_NUM_STATES } dsl_scan_state_t; typedef struct vdev_rebuild_stat { uint64_t vrs_state; /* vdev_rebuild_state_t */ uint64_t vrs_start_time; /* time_t */ uint64_t vrs_end_time; /* time_t */ uint64_t vrs_scan_time_ms; /* total run time (millisecs) */ uint64_t vrs_bytes_scanned; /* allocated bytes scanned */ uint64_t vrs_bytes_issued; /* read bytes issued */ uint64_t vrs_bytes_rebuilt; /* rebuilt bytes */ uint64_t vrs_bytes_est; /* total bytes to scan */ uint64_t vrs_errors; /* scanning errors */ uint64_t vrs_pass_time_ms; /* pass run time (millisecs) */ uint64_t vrs_pass_bytes_scanned; /* bytes scanned since start/resume */ uint64_t vrs_pass_bytes_issued; /* bytes rebuilt since start/resume */ uint64_t vrs_pass_bytes_skipped; /* bytes skipped since start/resume */ } vdev_rebuild_stat_t; /* * Errata described by https://openzfs.github.io/openzfs-docs/msg/ZFS-8000-ER. * The ordering of this enum must be maintained to ensure the errata identifiers * map to the correct documentation. New errata may only be appended to the * list and must contain corresponding documentation at the above link. */ typedef enum zpool_errata { ZPOOL_ERRATA_NONE, ZPOOL_ERRATA_ZOL_2094_SCRUB, ZPOOL_ERRATA_ZOL_2094_ASYNC_DESTROY, ZPOOL_ERRATA_ZOL_6845_ENCRYPTION, ZPOOL_ERRATA_ZOL_8308_ENCRYPTION, } zpool_errata_t; /* * Vdev statistics. Note: all fields should be 64-bit because this * is passed between kernel and user land as an nvlist uint64 array. * * The vs_ops[] and vs_bytes[] arrays must always be an array size of 6 in * order to keep subsequent members at their known fixed offsets. When * adding a new field it must be added to the end the structure. */ #define VS_ZIO_TYPES 6 typedef struct vdev_stat { hrtime_t vs_timestamp; /* time since vdev load */ uint64_t vs_state; /* vdev state */ uint64_t vs_aux; /* see vdev_aux_t */ uint64_t vs_alloc; /* space allocated */ uint64_t vs_space; /* total capacity */ uint64_t vs_dspace; /* deflated capacity */ uint64_t vs_rsize; /* replaceable dev size */ uint64_t vs_esize; /* expandable dev size */ uint64_t vs_ops[VS_ZIO_TYPES]; /* operation count */ uint64_t vs_bytes[VS_ZIO_TYPES]; /* bytes read/written */ uint64_t vs_read_errors; /* read errors */ uint64_t vs_write_errors; /* write errors */ uint64_t vs_checksum_errors; /* checksum errors */ uint64_t vs_initialize_errors; /* initializing errors */ uint64_t vs_self_healed; /* self-healed bytes */ uint64_t vs_scan_removing; /* removing? */ uint64_t vs_scan_processed; /* scan processed bytes */ uint64_t vs_fragmentation; /* device fragmentation */ uint64_t vs_initialize_bytes_done; /* bytes initialized */ uint64_t vs_initialize_bytes_est; /* total bytes to initialize */ uint64_t vs_initialize_state; /* vdev_initializing_state_t */ uint64_t vs_initialize_action_time; /* time_t */ uint64_t vs_checkpoint_space; /* checkpoint-consumed space */ uint64_t vs_resilver_deferred; /* resilver deferred */ uint64_t vs_slow_ios; /* slow IOs */ uint64_t vs_trim_errors; /* trimming errors */ uint64_t vs_trim_notsup; /* supported by device */ uint64_t vs_trim_bytes_done; /* bytes trimmed */ uint64_t vs_trim_bytes_est; /* total bytes to trim */ uint64_t vs_trim_state; /* vdev_trim_state_t */ uint64_t vs_trim_action_time; /* time_t */ uint64_t vs_rebuild_processed; /* bytes rebuilt */ uint64_t vs_configured_ashift; /* TLV vdev_ashift */ uint64_t vs_logical_ashift; /* vdev_logical_ashift */ uint64_t vs_physical_ashift; /* vdev_physical_ashift */ uint64_t vs_noalloc; /* allocations halted? */ uint64_t vs_pspace; /* physical capacity */ } vdev_stat_t; #define VDEV_STAT_VALID(field, uint64_t_field_count) \ ((uint64_t_field_count * sizeof (uint64_t)) >= \ (offsetof(vdev_stat_t, field) + sizeof (((vdev_stat_t *)NULL)->field))) /* * Extended stats * * These are stats which aren't included in the original iostat output. For * convenience, they are grouped together in vdev_stat_ex, although each stat * is individually exported as an nvlist. */ typedef struct vdev_stat_ex { /* Number of ZIOs issued to disk and waiting to finish */ uint64_t vsx_active_queue[ZIO_PRIORITY_NUM_QUEUEABLE]; /* Number of ZIOs pending to be issued to disk */ uint64_t vsx_pend_queue[ZIO_PRIORITY_NUM_QUEUEABLE]; /* * Below are the histograms for various latencies. Buckets are in * units of nanoseconds. */ /* * 2^37 nanoseconds = 134s. Timeouts will probably start kicking in * before this. */ #define VDEV_L_HISTO_BUCKETS 37 /* Latency histo buckets */ #define VDEV_RQ_HISTO_BUCKETS 25 /* Request size histo buckets */ /* Amount of time in ZIO queue (ns) */ uint64_t vsx_queue_histo[ZIO_PRIORITY_NUM_QUEUEABLE] [VDEV_L_HISTO_BUCKETS]; /* Total ZIO latency (ns). Includes queuing and disk access time */ uint64_t vsx_total_histo[ZIO_TYPES][VDEV_L_HISTO_BUCKETS]; /* Amount of time to read/write the disk (ns) */ uint64_t vsx_disk_histo[ZIO_TYPES][VDEV_L_HISTO_BUCKETS]; /* "lookup the bucket for a value" histogram macros */ #define HISTO(val, buckets) (val != 0 ? MIN(highbit64(val) - 1, \ buckets - 1) : 0) #define L_HISTO(a) HISTO(a, VDEV_L_HISTO_BUCKETS) #define RQ_HISTO(a) HISTO(a, VDEV_RQ_HISTO_BUCKETS) /* Physical IO histogram */ uint64_t vsx_ind_histo[ZIO_PRIORITY_NUM_QUEUEABLE] [VDEV_RQ_HISTO_BUCKETS]; /* Delegated (aggregated) physical IO histogram */ uint64_t vsx_agg_histo[ZIO_PRIORITY_NUM_QUEUEABLE] [VDEV_RQ_HISTO_BUCKETS]; } vdev_stat_ex_t; /* * Initialize functions. */ typedef enum pool_initialize_func { POOL_INITIALIZE_START, POOL_INITIALIZE_CANCEL, POOL_INITIALIZE_SUSPEND, POOL_INITIALIZE_UNINIT, POOL_INITIALIZE_FUNCS } pool_initialize_func_t; /* * TRIM functions. */ typedef enum pool_trim_func { POOL_TRIM_START, POOL_TRIM_CANCEL, POOL_TRIM_SUSPEND, POOL_TRIM_FUNCS } pool_trim_func_t; /* * DDT statistics. Note: all fields should be 64-bit because this * is passed between kernel and userland as an nvlist uint64 array. */ typedef struct ddt_object { uint64_t ddo_count; /* number of elements in ddt */ uint64_t ddo_dspace; /* size of ddt on disk */ uint64_t ddo_mspace; /* size of ddt in-core */ } ddt_object_t; typedef struct ddt_stat { uint64_t dds_blocks; /* blocks */ uint64_t dds_lsize; /* logical size */ uint64_t dds_psize; /* physical size */ uint64_t dds_dsize; /* deflated allocated size */ uint64_t dds_ref_blocks; /* referenced blocks */ uint64_t dds_ref_lsize; /* referenced lsize * refcnt */ uint64_t dds_ref_psize; /* referenced psize * refcnt */ uint64_t dds_ref_dsize; /* referenced dsize * refcnt */ } ddt_stat_t; typedef struct ddt_histogram { ddt_stat_t ddh_stat[64]; /* power-of-two histogram buckets */ } ddt_histogram_t; #define ZVOL_DRIVER "zvol" #define ZFS_DRIVER "zfs" #define ZFS_DEV "/dev/zfs" #define ZFS_DEVDIR "/dev" #define ZFS_SUPER_MAGIC 0x2fc12fc1 /* general zvol path */ #define ZVOL_DIR "/dev/zvol/" #define ZVOL_MAJOR 230 #define ZVOL_MINOR_BITS 4 #define ZVOL_MINOR_MASK ((1U << ZVOL_MINOR_BITS) - 1) #define ZVOL_MINORS (1 << 4) #define ZVOL_DEV_NAME "zd" #define ZVOL_PROP_NAME "name" #define ZVOL_DEFAULT_BLOCKSIZE 16384 typedef enum { VDEV_INITIALIZE_NONE, VDEV_INITIALIZE_ACTIVE, VDEV_INITIALIZE_CANCELED, VDEV_INITIALIZE_SUSPENDED, VDEV_INITIALIZE_COMPLETE } vdev_initializing_state_t; typedef enum { VDEV_TRIM_NONE, VDEV_TRIM_ACTIVE, VDEV_TRIM_CANCELED, VDEV_TRIM_SUSPENDED, VDEV_TRIM_COMPLETE, } vdev_trim_state_t; typedef enum { VDEV_REBUILD_NONE, VDEV_REBUILD_ACTIVE, VDEV_REBUILD_CANCELED, VDEV_REBUILD_COMPLETE, } vdev_rebuild_state_t; /* * nvlist name constants. Facilitate restricting snapshot iteration range for * the "list next snapshot" ioctl */ #define SNAP_ITER_MIN_TXG "snap_iter_min_txg" #define SNAP_ITER_MAX_TXG "snap_iter_max_txg" /* * /dev/zfs ioctl numbers. * * These numbers cannot change over time. New ioctl numbers must be appended. */ typedef enum zfs_ioc { /* * Core features - 88/128 numbers reserved. */ #ifdef __FreeBSD__ ZFS_IOC_FIRST = 0, #else ZFS_IOC_FIRST = ('Z' << 8), #endif ZFS_IOC = ZFS_IOC_FIRST, ZFS_IOC_POOL_CREATE = ZFS_IOC_FIRST, /* 0x5a00 */ ZFS_IOC_POOL_DESTROY, /* 0x5a01 */ ZFS_IOC_POOL_IMPORT, /* 0x5a02 */ ZFS_IOC_POOL_EXPORT, /* 0x5a03 */ ZFS_IOC_POOL_CONFIGS, /* 0x5a04 */ ZFS_IOC_POOL_STATS, /* 0x5a05 */ ZFS_IOC_POOL_TRYIMPORT, /* 0x5a06 */ ZFS_IOC_POOL_SCAN, /* 0x5a07 */ ZFS_IOC_POOL_FREEZE, /* 0x5a08 */ ZFS_IOC_POOL_UPGRADE, /* 0x5a09 */ ZFS_IOC_POOL_GET_HISTORY, /* 0x5a0a */ ZFS_IOC_VDEV_ADD, /* 0x5a0b */ ZFS_IOC_VDEV_REMOVE, /* 0x5a0c */ ZFS_IOC_VDEV_SET_STATE, /* 0x5a0d */ ZFS_IOC_VDEV_ATTACH, /* 0x5a0e */ ZFS_IOC_VDEV_DETACH, /* 0x5a0f */ ZFS_IOC_VDEV_SETPATH, /* 0x5a10 */ ZFS_IOC_VDEV_SETFRU, /* 0x5a11 */ ZFS_IOC_OBJSET_STATS, /* 0x5a12 */ ZFS_IOC_OBJSET_ZPLPROPS, /* 0x5a13 */ ZFS_IOC_DATASET_LIST_NEXT, /* 0x5a14 */ ZFS_IOC_SNAPSHOT_LIST_NEXT, /* 0x5a15 */ ZFS_IOC_SET_PROP, /* 0x5a16 */ ZFS_IOC_CREATE, /* 0x5a17 */ ZFS_IOC_DESTROY, /* 0x5a18 */ ZFS_IOC_ROLLBACK, /* 0x5a19 */ ZFS_IOC_RENAME, /* 0x5a1a */ ZFS_IOC_RECV, /* 0x5a1b */ ZFS_IOC_SEND, /* 0x5a1c */ ZFS_IOC_INJECT_FAULT, /* 0x5a1d */ ZFS_IOC_CLEAR_FAULT, /* 0x5a1e */ ZFS_IOC_INJECT_LIST_NEXT, /* 0x5a1f */ ZFS_IOC_ERROR_LOG, /* 0x5a20 */ ZFS_IOC_CLEAR, /* 0x5a21 */ ZFS_IOC_PROMOTE, /* 0x5a22 */ ZFS_IOC_SNAPSHOT, /* 0x5a23 */ ZFS_IOC_DSOBJ_TO_DSNAME, /* 0x5a24 */ ZFS_IOC_OBJ_TO_PATH, /* 0x5a25 */ ZFS_IOC_POOL_SET_PROPS, /* 0x5a26 */ ZFS_IOC_POOL_GET_PROPS, /* 0x5a27 */ ZFS_IOC_SET_FSACL, /* 0x5a28 */ ZFS_IOC_GET_FSACL, /* 0x5a29 */ ZFS_IOC_SHARE, /* 0x5a2a */ ZFS_IOC_INHERIT_PROP, /* 0x5a2b */ ZFS_IOC_SMB_ACL, /* 0x5a2c */ ZFS_IOC_USERSPACE_ONE, /* 0x5a2d */ ZFS_IOC_USERSPACE_MANY, /* 0x5a2e */ ZFS_IOC_USERSPACE_UPGRADE, /* 0x5a2f */ ZFS_IOC_HOLD, /* 0x5a30 */ ZFS_IOC_RELEASE, /* 0x5a31 */ ZFS_IOC_GET_HOLDS, /* 0x5a32 */ ZFS_IOC_OBJSET_RECVD_PROPS, /* 0x5a33 */ ZFS_IOC_VDEV_SPLIT, /* 0x5a34 */ ZFS_IOC_NEXT_OBJ, /* 0x5a35 */ ZFS_IOC_DIFF, /* 0x5a36 */ ZFS_IOC_TMP_SNAPSHOT, /* 0x5a37 */ ZFS_IOC_OBJ_TO_STATS, /* 0x5a38 */ ZFS_IOC_SPACE_WRITTEN, /* 0x5a39 */ ZFS_IOC_SPACE_SNAPS, /* 0x5a3a */ ZFS_IOC_DESTROY_SNAPS, /* 0x5a3b */ ZFS_IOC_POOL_REGUID, /* 0x5a3c */ ZFS_IOC_POOL_REOPEN, /* 0x5a3d */ ZFS_IOC_SEND_PROGRESS, /* 0x5a3e */ ZFS_IOC_LOG_HISTORY, /* 0x5a3f */ ZFS_IOC_SEND_NEW, /* 0x5a40 */ ZFS_IOC_SEND_SPACE, /* 0x5a41 */ ZFS_IOC_CLONE, /* 0x5a42 */ ZFS_IOC_BOOKMARK, /* 0x5a43 */ ZFS_IOC_GET_BOOKMARKS, /* 0x5a44 */ ZFS_IOC_DESTROY_BOOKMARKS, /* 0x5a45 */ ZFS_IOC_RECV_NEW, /* 0x5a46 */ ZFS_IOC_POOL_SYNC, /* 0x5a47 */ ZFS_IOC_CHANNEL_PROGRAM, /* 0x5a48 */ ZFS_IOC_LOAD_KEY, /* 0x5a49 */ ZFS_IOC_UNLOAD_KEY, /* 0x5a4a */ ZFS_IOC_CHANGE_KEY, /* 0x5a4b */ ZFS_IOC_REMAP, /* 0x5a4c */ ZFS_IOC_POOL_CHECKPOINT, /* 0x5a4d */ ZFS_IOC_POOL_DISCARD_CHECKPOINT, /* 0x5a4e */ ZFS_IOC_POOL_INITIALIZE, /* 0x5a4f */ ZFS_IOC_POOL_TRIM, /* 0x5a50 */ ZFS_IOC_REDACT, /* 0x5a51 */ ZFS_IOC_GET_BOOKMARK_PROPS, /* 0x5a52 */ ZFS_IOC_WAIT, /* 0x5a53 */ ZFS_IOC_WAIT_FS, /* 0x5a54 */ ZFS_IOC_VDEV_GET_PROPS, /* 0x5a55 */ ZFS_IOC_VDEV_SET_PROPS, /* 0x5a56 */ ZFS_IOC_POOL_SCRUB, /* 0x5a57 */ /* * Per-platform (Optional) - 8/128 numbers reserved. */ ZFS_IOC_PLATFORM = ZFS_IOC_FIRST + 0x80, ZFS_IOC_EVENTS_NEXT, /* 0x81 (Linux) */ ZFS_IOC_EVENTS_CLEAR, /* 0x82 (Linux) */ ZFS_IOC_EVENTS_SEEK, /* 0x83 (Linux) */ ZFS_IOC_NEXTBOOT, /* 0x84 (FreeBSD) */ ZFS_IOC_JAIL, /* 0x85 (FreeBSD) */ ZFS_IOC_USERNS_ATTACH = ZFS_IOC_JAIL, /* 0x85 (Linux) */ ZFS_IOC_UNJAIL, /* 0x86 (FreeBSD) */ ZFS_IOC_USERNS_DETACH = ZFS_IOC_UNJAIL, /* 0x86 (Linux) */ ZFS_IOC_SET_BOOTENV, /* 0x87 */ ZFS_IOC_GET_BOOTENV, /* 0x88 */ ZFS_IOC_LAST } zfs_ioc_t; /* * zvol ioctl to get dataset name */ #define BLKZNAME _IOR(0x12, 125, char[ZFS_MAX_DATASET_NAME_LEN]) #ifdef __linux__ /* * IOCTLs to update and retrieve additional file level attributes on * Linux. */ #define ZFS_IOC_GETDOSFLAGS _IOR(0x83, 1, uint64_t) #define ZFS_IOC_SETDOSFLAGS _IOW(0x83, 2, uint64_t) /* * Additional file level attributes, that are stored * in the upper half of z_pflags */ #define ZFS_READONLY 0x0000000100000000ull #define ZFS_HIDDEN 0x0000000200000000ull #define ZFS_SYSTEM 0x0000000400000000ull #define ZFS_ARCHIVE 0x0000000800000000ull #define ZFS_IMMUTABLE 0x0000001000000000ull #define ZFS_NOUNLINK 0x0000002000000000ull #define ZFS_APPENDONLY 0x0000004000000000ull #define ZFS_NODUMP 0x0000008000000000ull #define ZFS_OPAQUE 0x0000010000000000ull #define ZFS_AV_QUARANTINED 0x0000020000000000ull #define ZFS_AV_MODIFIED 0x0000040000000000ull #define ZFS_REPARSE 0x0000080000000000ull #define ZFS_OFFLINE 0x0000100000000000ull #define ZFS_SPARSE 0x0000200000000000ull #define ZFS_DOS_FL_USER_VISIBLE (ZFS_IMMUTABLE | ZFS_APPENDONLY | \ ZFS_NOUNLINK | ZFS_ARCHIVE | ZFS_NODUMP | ZFS_SYSTEM | \ ZFS_HIDDEN | ZFS_READONLY | ZFS_REPARSE | ZFS_OFFLINE | \ ZFS_SPARSE) #endif /* * ZFS-specific error codes used for returning descriptive errors * to the userland through zfs ioctls. * * The enum implicitly includes all the error codes from errno.h. * New code should use and extend this enum for errors that are * not described precisely by generic errno codes. * * These numbers should not change over time. New entries should be appended. * * (Keep in sync with contrib/pyzfs/libzfs_core/_constants.py) */ typedef enum { ZFS_ERR_CHECKPOINT_EXISTS = 1024, ZFS_ERR_DISCARDING_CHECKPOINT, ZFS_ERR_NO_CHECKPOINT, ZFS_ERR_DEVRM_IN_PROGRESS, ZFS_ERR_VDEV_TOO_BIG, ZFS_ERR_IOC_CMD_UNAVAIL, ZFS_ERR_IOC_ARG_UNAVAIL, ZFS_ERR_IOC_ARG_REQUIRED, ZFS_ERR_IOC_ARG_BADTYPE, ZFS_ERR_WRONG_PARENT, ZFS_ERR_FROM_IVSET_GUID_MISSING, ZFS_ERR_FROM_IVSET_GUID_MISMATCH, ZFS_ERR_SPILL_BLOCK_FLAG_MISSING, ZFS_ERR_UNKNOWN_SEND_STREAM_FEATURE, ZFS_ERR_EXPORT_IN_PROGRESS, ZFS_ERR_BOOKMARK_SOURCE_NOT_ANCESTOR, ZFS_ERR_STREAM_TRUNCATED, ZFS_ERR_STREAM_LARGE_BLOCK_MISMATCH, ZFS_ERR_RESILVER_IN_PROGRESS, ZFS_ERR_REBUILD_IN_PROGRESS, ZFS_ERR_BADPROP, ZFS_ERR_VDEV_NOTSUP, ZFS_ERR_NOT_USER_NAMESPACE, ZFS_ERR_RESUME_EXISTS, ZFS_ERR_CRYPTO_NOTSUP, ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS, } zfs_errno_t; /* * Internal SPA load state. Used by FMA diagnosis engine. */ typedef enum { SPA_LOAD_NONE, /* no load in progress */ SPA_LOAD_OPEN, /* normal open */ SPA_LOAD_IMPORT, /* import in progress */ SPA_LOAD_TRYIMPORT, /* tryimport in progress */ SPA_LOAD_RECOVER, /* recovery requested */ SPA_LOAD_ERROR, /* load failed */ SPA_LOAD_CREATE /* creation in progress */ } spa_load_state_t; typedef enum { ZPOOL_WAIT_CKPT_DISCARD, ZPOOL_WAIT_FREE, ZPOOL_WAIT_INITIALIZE, ZPOOL_WAIT_REPLACE, ZPOOL_WAIT_REMOVE, ZPOOL_WAIT_RESILVER, ZPOOL_WAIT_SCRUB, ZPOOL_WAIT_TRIM, ZPOOL_WAIT_RAIDZ_EXPAND, ZPOOL_WAIT_NUM_ACTIVITIES } zpool_wait_activity_t; typedef enum { ZFS_WAIT_DELETEQ, ZFS_WAIT_NUM_ACTIVITIES } zfs_wait_activity_t; /* * Bookmark name values. */ #define ZPOOL_ERR_LIST "error list" #define ZPOOL_ERR_DATASET "dataset" #define ZPOOL_ERR_OBJECT "object" #define HIS_MAX_RECORD_LEN (MAXPATHLEN + MAXPATHLEN + 1) /* * The following are names used in the nvlist describing * the pool's history log. */ #define ZPOOL_HIST_RECORD "history record" #define ZPOOL_HIST_TIME "history time" #define ZPOOL_HIST_CMD "history command" #define ZPOOL_HIST_WHO "history who" #define ZPOOL_HIST_ZONE "history zone" #define ZPOOL_HIST_HOST "history hostname" #define ZPOOL_HIST_TXG "history txg" #define ZPOOL_HIST_INT_EVENT "history internal event" #define ZPOOL_HIST_INT_STR "history internal str" #define ZPOOL_HIST_INT_NAME "internal_name" #define ZPOOL_HIST_IOCTL "ioctl" #define ZPOOL_HIST_INPUT_NVL "in_nvl" #define ZPOOL_HIST_OUTPUT_NVL "out_nvl" #define ZPOOL_HIST_OUTPUT_SIZE "out_size" #define ZPOOL_HIST_DSNAME "dsname" #define ZPOOL_HIST_DSID "dsid" #define ZPOOL_HIST_ERRNO "errno" #define ZPOOL_HIST_ELAPSED_NS "elapsed_ns" /* * Special nvlist name that will not have its args recorded in the pool's * history log. */ #define ZPOOL_HIDDEN_ARGS "hidden_args" /* * The following are names used when invoking ZFS_IOC_POOL_INITIALIZE. */ #define ZPOOL_INITIALIZE_COMMAND "initialize_command" #define ZPOOL_INITIALIZE_VDEVS "initialize_vdevs" /* * The following are names used when invoking ZFS_IOC_POOL_TRIM. */ #define ZPOOL_TRIM_COMMAND "trim_command" #define ZPOOL_TRIM_VDEVS "trim_vdevs" #define ZPOOL_TRIM_RATE "trim_rate" #define ZPOOL_TRIM_SECURE "trim_secure" /* * The following are names used when invoking ZFS_IOC_POOL_WAIT. */ #define ZPOOL_WAIT_ACTIVITY "wait_activity" #define ZPOOL_WAIT_TAG "wait_tag" #define ZPOOL_WAIT_WAITED "wait_waited" /* * The following are names used when invoking ZFS_IOC_VDEV_GET_PROP. */ #define ZPOOL_VDEV_PROPS_GET_VDEV "vdevprops_get_vdev" #define ZPOOL_VDEV_PROPS_GET_PROPS "vdevprops_get_props" /* * The following are names used when invoking ZFS_IOC_VDEV_SET_PROP. */ #define ZPOOL_VDEV_PROPS_SET_VDEV "vdevprops_set_vdev" #define ZPOOL_VDEV_PROPS_SET_PROPS "vdevprops_set_props" /* * The following are names used when invoking ZFS_IOC_WAIT_FS. */ #define ZFS_WAIT_ACTIVITY "wait_activity" #define ZFS_WAIT_WAITED "wait_waited" /* * Flags for ZFS_IOC_VDEV_SET_STATE */ #define ZFS_ONLINE_CHECKREMOVE 0x1 #define ZFS_ONLINE_UNSPARE 0x2 #define ZFS_ONLINE_FORCEFAULT 0x4 #define ZFS_ONLINE_EXPAND 0x8 #define ZFS_ONLINE_SPARE 0x10 #define ZFS_OFFLINE_TEMPORARY 0x1 /* * Flags for ZFS_IOC_POOL_IMPORT */ #define ZFS_IMPORT_NORMAL 0x0 #define ZFS_IMPORT_VERBATIM 0x1 #define ZFS_IMPORT_ANY_HOST 0x2 #define ZFS_IMPORT_MISSING_LOG 0x4 #define ZFS_IMPORT_ONLY 0x8 #define ZFS_IMPORT_TEMP_NAME 0x10 #define ZFS_IMPORT_SKIP_MMP 0x20 #define ZFS_IMPORT_LOAD_KEYS 0x40 #define ZFS_IMPORT_CHECKPOINT 0x80 /* * Channel program argument/return nvlist keys and defaults. */ #define ZCP_ARG_PROGRAM "program" #define ZCP_ARG_ARGLIST "arg" #define ZCP_ARG_SYNC "sync" #define ZCP_ARG_INSTRLIMIT "instrlimit" #define ZCP_ARG_MEMLIMIT "memlimit" #define ZCP_ARG_CLIARGV "argv" #define ZCP_RET_ERROR "error" #define ZCP_RET_RETURN "return" #define ZCP_DEFAULT_INSTRLIMIT (10 * 1000 * 1000) #define ZCP_MAX_INSTRLIMIT (10 * ZCP_DEFAULT_INSTRLIMIT) #define ZCP_DEFAULT_MEMLIMIT (10 * 1024 * 1024) #define ZCP_MAX_MEMLIMIT (10 * ZCP_DEFAULT_MEMLIMIT) /* * Sysevent payload members. ZFS will generate the following sysevents with the * given payloads: * * ESC_ZFS_RESILVER_START * ESC_ZFS_RESILVER_FINISH * * ZFS_EV_POOL_NAME DATA_TYPE_STRING * ZFS_EV_POOL_GUID DATA_TYPE_UINT64 * ZFS_EV_RESILVER_TYPE DATA_TYPE_STRING * * ESC_ZFS_POOL_DESTROY * ESC_ZFS_POOL_REGUID * * ZFS_EV_POOL_NAME DATA_TYPE_STRING * ZFS_EV_POOL_GUID DATA_TYPE_UINT64 * * ESC_ZFS_VDEV_REMOVE * ESC_ZFS_VDEV_CLEAR * ESC_ZFS_VDEV_CHECK * * ZFS_EV_POOL_NAME DATA_TYPE_STRING * ZFS_EV_POOL_GUID DATA_TYPE_UINT64 * ZFS_EV_VDEV_PATH DATA_TYPE_STRING (optional) * ZFS_EV_VDEV_GUID DATA_TYPE_UINT64 * * ESC_ZFS_HISTORY_EVENT * * ZFS_EV_POOL_NAME DATA_TYPE_STRING * ZFS_EV_POOL_GUID DATA_TYPE_UINT64 * ZFS_EV_HIST_TIME DATA_TYPE_UINT64 (optional) * ZFS_EV_HIST_CMD DATA_TYPE_STRING (optional) * ZFS_EV_HIST_WHO DATA_TYPE_UINT64 (optional) * ZFS_EV_HIST_ZONE DATA_TYPE_STRING (optional) * ZFS_EV_HIST_HOST DATA_TYPE_STRING (optional) * ZFS_EV_HIST_TXG DATA_TYPE_UINT64 (optional) * ZFS_EV_HIST_INT_EVENT DATA_TYPE_UINT64 (optional) * ZFS_EV_HIST_INT_STR DATA_TYPE_STRING (optional) * ZFS_EV_HIST_INT_NAME DATA_TYPE_STRING (optional) * ZFS_EV_HIST_IOCTL DATA_TYPE_STRING (optional) * ZFS_EV_HIST_DSNAME DATA_TYPE_STRING (optional) * ZFS_EV_HIST_DSID DATA_TYPE_UINT64 (optional) * * The ZFS_EV_HIST_* members will correspond to the ZPOOL_HIST_* members in the * history log nvlist. The keynames will be free of any spaces or other * characters that could be potentially unexpected to consumers of the * sysevents. */ #define ZFS_EV_POOL_NAME "pool_name" #define ZFS_EV_POOL_GUID "pool_guid" #define ZFS_EV_VDEV_PATH "vdev_path" #define ZFS_EV_VDEV_GUID "vdev_guid" #define ZFS_EV_HIST_TIME "history_time" #define ZFS_EV_HIST_CMD "history_command" #define ZFS_EV_HIST_WHO "history_who" #define ZFS_EV_HIST_ZONE "history_zone" #define ZFS_EV_HIST_HOST "history_hostname" #define ZFS_EV_HIST_TXG "history_txg" #define ZFS_EV_HIST_INT_EVENT "history_internal_event" #define ZFS_EV_HIST_INT_STR "history_internal_str" #define ZFS_EV_HIST_INT_NAME "history_internal_name" #define ZFS_EV_HIST_IOCTL "history_ioctl" #define ZFS_EV_HIST_DSNAME "history_dsname" #define ZFS_EV_HIST_DSID "history_dsid" #define ZFS_EV_RESILVER_TYPE "resilver_type" /* * We currently support block sizes from 512 bytes to 16MB. * The benefits of larger blocks, and thus larger IO, need to be weighed * against the cost of COWing a giant block to modify one byte, and the * large latency of reading or writing a large block. * * The recordsize property can not be set larger than zfs_max_recordsize * (default 16MB on 64-bit and 1MB on 32-bit). See the comment near * zfs_max_recordsize in dsl_dataset.c for details. * * Note that although the LSIZE field of the blkptr_t can store sizes up * to 32MB, the dnode's dn_datablkszsec can only store sizes up to * 32MB - 512 bytes. Therefore, we limit SPA_MAXBLOCKSIZE to 16MB. */ #define SPA_MINBLOCKSHIFT 9 #define SPA_OLD_MAXBLOCKSHIFT 17 #define SPA_MAXBLOCKSHIFT 24 #define SPA_MINBLOCKSIZE (1ULL << SPA_MINBLOCKSHIFT) #define SPA_OLD_MAXBLOCKSIZE (1ULL << SPA_OLD_MAXBLOCKSHIFT) #define SPA_MAXBLOCKSIZE (1ULL << SPA_MAXBLOCKSHIFT) /* supported encryption algorithms */ enum zio_encrypt { ZIO_CRYPT_INHERIT = 0, ZIO_CRYPT_ON, ZIO_CRYPT_OFF, ZIO_CRYPT_AES_128_CCM, ZIO_CRYPT_AES_192_CCM, ZIO_CRYPT_AES_256_CCM, ZIO_CRYPT_AES_128_GCM, ZIO_CRYPT_AES_192_GCM, ZIO_CRYPT_AES_256_GCM, ZIO_CRYPT_FUNCTIONS }; #define ZIO_CRYPT_ON_VALUE ZIO_CRYPT_AES_256_GCM #define ZIO_CRYPT_DEFAULT ZIO_CRYPT_OFF /* * xattr namespace prefixes. These are forbidden in xattr names. * * For cross-platform compatibility, xattrs in the user namespace should not be * prefixed with the namespace name, but for backwards compatibility with older * ZFS on Linux versions we do prefix the namespace. */ #define ZFS_XA_NS_FREEBSD_PREFIX "freebsd:" #define ZFS_XA_NS_FREEBSD_PREFIX_LEN strlen("freebsd:") #define ZFS_XA_NS_LINUX_SECURITY_PREFIX "security." #define ZFS_XA_NS_LINUX_SECURITY_PREFIX_LEN strlen("security.") #define ZFS_XA_NS_LINUX_SYSTEM_PREFIX "system." #define ZFS_XA_NS_LINUX_SYSTEM_PREFIX_LEN strlen("system.") #define ZFS_XA_NS_LINUX_TRUSTED_PREFIX "trusted." #define ZFS_XA_NS_LINUX_TRUSTED_PREFIX_LEN strlen("trusted.") #define ZFS_XA_NS_LINUX_USER_PREFIX "user." #define ZFS_XA_NS_LINUX_USER_PREFIX_LEN strlen("user.") #define ZFS_XA_NS_PREFIX_MATCH(ns, name) \ (strncmp(name, ZFS_XA_NS_##ns##_PREFIX, \ ZFS_XA_NS_##ns##_PREFIX_LEN) == 0) #define ZFS_XA_NS_PREFIX_FORBIDDEN(name) \ (ZFS_XA_NS_PREFIX_MATCH(FREEBSD, name) || \ ZFS_XA_NS_PREFIX_MATCH(LINUX_SECURITY, name) || \ ZFS_XA_NS_PREFIX_MATCH(LINUX_SYSTEM, name) || \ ZFS_XA_NS_PREFIX_MATCH(LINUX_TRUSTED, name) || \ ZFS_XA_NS_PREFIX_MATCH(LINUX_USER, name)) #ifdef __cplusplus } #endif #endif /* _SYS_FS_ZFS_H */ diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index dafab66c70f9..f39ebf031cea 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -1,654 +1,657 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2017, Intel Corporation. + * Copyright (c) 2023, Klara Inc. */ #ifndef _SYS_VDEV_IMPL_H #define _SYS_VDEV_IMPL_H #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif /* * Virtual device descriptors. * * All storage pool operations go through the virtual device framework, * which provides data replication and I/O scheduling. */ /* * Forward declarations that lots of things need. */ typedef struct vdev_queue vdev_queue_t; struct abd; extern uint_t zfs_vdev_queue_depth_pct; extern uint_t zfs_vdev_def_queue_depth; extern uint_t zfs_vdev_async_write_max_active; /* * Virtual device operations */ typedef int vdev_init_func_t(spa_t *spa, nvlist_t *nv, void **tsd); typedef void vdev_kobj_post_evt_func_t(vdev_t *vd); typedef void vdev_fini_func_t(vdev_t *vd); typedef int vdev_open_func_t(vdev_t *vd, uint64_t *size, uint64_t *max_size, uint64_t *ashift, uint64_t *pshift); typedef void vdev_close_func_t(vdev_t *vd); typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize, uint64_t txg); typedef uint64_t vdev_min_asize_func_t(vdev_t *vd); typedef uint64_t vdev_min_alloc_func_t(vdev_t *vd); typedef void vdev_io_start_func_t(zio_t *zio); typedef void vdev_io_done_func_t(zio_t *zio); typedef void vdev_state_change_func_t(vdev_t *vd, int, int); typedef boolean_t vdev_need_resilver_func_t(vdev_t *vd, const dva_t *dva, size_t psize, uint64_t phys_birth); typedef void vdev_hold_func_t(vdev_t *vd); typedef void vdev_rele_func_t(vdev_t *vd); typedef void vdev_remap_cb_t(uint64_t inner_offset, vdev_t *vd, uint64_t offset, uint64_t size, void *arg); typedef void vdev_remap_func_t(vdev_t *vd, uint64_t offset, uint64_t size, vdev_remap_cb_t callback, void *arg); /* * Given a target vdev, translates the logical range "in" to the physical * range "res" */ typedef void vdev_xlation_func_t(vdev_t *cvd, const range_seg64_t *logical, range_seg64_t *physical, range_seg64_t *remain); typedef uint64_t vdev_rebuild_asize_func_t(vdev_t *vd, uint64_t start, uint64_t size, uint64_t max_segment); typedef void vdev_metaslab_init_func_t(vdev_t *vd, uint64_t *startp, uint64_t *sizep); typedef void vdev_config_generate_func_t(vdev_t *vd, nvlist_t *nv); typedef uint64_t vdev_nparity_func_t(vdev_t *vd); typedef uint64_t vdev_ndisks_func_t(vdev_t *vd); typedef const struct vdev_ops { vdev_init_func_t *vdev_op_init; vdev_fini_func_t *vdev_op_fini; vdev_open_func_t *vdev_op_open; vdev_close_func_t *vdev_op_close; vdev_asize_func_t *vdev_op_asize; vdev_min_asize_func_t *vdev_op_min_asize; vdev_min_alloc_func_t *vdev_op_min_alloc; vdev_io_start_func_t *vdev_op_io_start; vdev_io_done_func_t *vdev_op_io_done; vdev_state_change_func_t *vdev_op_state_change; vdev_need_resilver_func_t *vdev_op_need_resilver; vdev_hold_func_t *vdev_op_hold; vdev_rele_func_t *vdev_op_rele; vdev_remap_func_t *vdev_op_remap; vdev_xlation_func_t *vdev_op_xlate; vdev_rebuild_asize_func_t *vdev_op_rebuild_asize; vdev_metaslab_init_func_t *vdev_op_metaslab_init; vdev_config_generate_func_t *vdev_op_config_generate; vdev_nparity_func_t *vdev_op_nparity; vdev_ndisks_func_t *vdev_op_ndisks; vdev_kobj_post_evt_func_t *vdev_op_kobj_evt_post; char vdev_op_type[16]; boolean_t vdev_op_leaf; } vdev_ops_t; /* * Virtual device properties */ typedef union vdev_queue_class { struct { ulong_t vqc_list_numnodes; list_t vqc_list; }; avl_tree_t vqc_tree; } vdev_queue_class_t; struct vdev_queue { vdev_t *vq_vdev; vdev_queue_class_t vq_class[ZIO_PRIORITY_NUM_QUEUEABLE]; avl_tree_t vq_read_offset_tree; avl_tree_t vq_write_offset_tree; uint64_t vq_last_offset; zio_priority_t vq_last_prio; /* Last sent I/O priority. */ uint32_t vq_cqueued; /* Classes with queued I/Os. */ uint32_t vq_cactive[ZIO_PRIORITY_NUM_QUEUEABLE]; uint32_t vq_active; /* Number of active I/Os. */ uint32_t vq_ia_active; /* Active interactive I/Os. */ uint32_t vq_nia_credit; /* Non-interactive I/Os credit. */ list_t vq_active_list; /* List of active I/Os. */ hrtime_t vq_io_complete_ts; /* time last i/o completed */ hrtime_t vq_io_delta_ts; zio_t vq_io_search; /* used as local for stack reduction */ kmutex_t vq_lock; }; typedef enum vdev_alloc_bias { VDEV_BIAS_NONE, VDEV_BIAS_LOG, /* dedicated to ZIL data (SLOG) */ VDEV_BIAS_SPECIAL, /* dedicated to ddt, metadata, and small blks */ VDEV_BIAS_DEDUP /* dedicated to dedup metadata */ } vdev_alloc_bias_t; /* * On-disk indirect vdev state. * * An indirect vdev is described exclusively in the MOS config of a pool. * The config for an indirect vdev includes several fields, which are * accessed in memory by a vdev_indirect_config_t. */ typedef struct vdev_indirect_config { /* * Object (in MOS) which contains the indirect mapping. This object * contains an array of vdev_indirect_mapping_entry_phys_t ordered by * vimep_src. The bonus buffer for this object is a * vdev_indirect_mapping_phys_t. This object is allocated when a vdev * removal is initiated. * * Note that this object can be empty if none of the data on the vdev * has been copied yet. */ uint64_t vic_mapping_object; /* * Object (in MOS) which contains the birth times for the mapping * entries. This object contains an array of * vdev_indirect_birth_entry_phys_t sorted by vibe_offset. The bonus * buffer for this object is a vdev_indirect_birth_phys_t. This object * is allocated when a vdev removal is initiated. * * Note that this object can be empty if none of the vdev has yet been * copied. */ uint64_t vic_births_object; /* * This is the vdev ID which was removed previous to this vdev, or * UINT64_MAX if there are no previously removed vdevs. */ uint64_t vic_prev_indirect_vdev; } vdev_indirect_config_t; /* * Virtual device descriptor */ struct vdev { /* * Common to all vdev types. */ uint64_t vdev_id; /* child number in vdev parent */ uint64_t vdev_guid; /* unique ID for this vdev */ uint64_t vdev_guid_sum; /* self guid + all child guids */ uint64_t vdev_orig_guid; /* orig. guid prior to remove */ uint64_t vdev_asize; /* allocatable device capacity */ uint64_t vdev_min_asize; /* min acceptable asize */ uint64_t vdev_max_asize; /* max acceptable asize */ uint64_t vdev_ashift; /* block alignment shift */ /* * Logical block alignment shift * * The smallest sized/aligned I/O supported by the device. */ uint64_t vdev_logical_ashift; /* * Physical block alignment shift * * The device supports logical I/Os with vdev_logical_ashift * size/alignment, but optimum performance will be achieved by * aligning/sizing requests to vdev_physical_ashift. Smaller * requests may be inflated or incur device level read-modify-write * operations. * * May be 0 to indicate no preference (i.e. use vdev_logical_ashift). */ uint64_t vdev_physical_ashift; uint64_t vdev_state; /* see VDEV_STATE_* #defines */ uint64_t vdev_prevstate; /* used when reopening a vdev */ vdev_ops_t *vdev_ops; /* vdev operations */ spa_t *vdev_spa; /* spa for this vdev */ void *vdev_tsd; /* type-specific data */ vdev_t *vdev_top; /* top-level vdev */ vdev_t *vdev_parent; /* parent vdev */ vdev_t **vdev_child; /* array of children */ uint64_t vdev_children; /* number of children */ vdev_stat_t vdev_stat; /* virtual device statistics */ vdev_stat_ex_t vdev_stat_ex; /* extended statistics */ boolean_t vdev_expanding; /* expand the vdev? */ boolean_t vdev_reopening; /* reopen in progress? */ boolean_t vdev_nonrot; /* true if solid state */ int vdev_load_error; /* error on last load */ int vdev_open_error; /* error on last open */ int vdev_validate_error; /* error on last validate */ kthread_t *vdev_open_thread; /* thread opening children */ kthread_t *vdev_validate_thread; /* thread validating children */ uint64_t vdev_crtxg; /* txg when top-level was added */ uint64_t vdev_root_zap; /* * Top-level vdev state. */ uint64_t vdev_ms_array; /* metaslab array object */ uint64_t vdev_ms_shift; /* metaslab size shift */ uint64_t vdev_ms_count; /* number of metaslabs */ metaslab_group_t *vdev_mg; /* metaslab group */ metaslab_group_t *vdev_log_mg; /* embedded slog metaslab group */ metaslab_t **vdev_ms; /* metaslab array */ txg_list_t vdev_ms_list; /* per-txg dirty metaslab lists */ txg_list_t vdev_dtl_list; /* per-txg dirty DTL lists */ txg_node_t vdev_txg_node; /* per-txg dirty vdev linkage */ boolean_t vdev_remove_wanted; /* async remove wanted? */ boolean_t vdev_probe_wanted; /* async probe wanted? */ list_node_t vdev_config_dirty_node; /* config dirty list */ list_node_t vdev_state_dirty_node; /* state dirty list */ uint64_t vdev_deflate_ratio; /* deflation ratio (x512) */ uint64_t vdev_islog; /* is an intent log device */ uint64_t vdev_noalloc; /* device is passivated? */ uint64_t vdev_removing; /* device is being removed? */ uint64_t vdev_failfast; /* device failfast setting */ boolean_t vdev_rz_expanding; /* raidz is being expanded? */ boolean_t vdev_ishole; /* is a hole in the namespace */ uint64_t vdev_top_zap; vdev_alloc_bias_t vdev_alloc_bias; /* metaslab allocation bias */ /* pool checkpoint related */ space_map_t *vdev_checkpoint_sm; /* contains reserved blocks */ /* Initialize related */ boolean_t vdev_initialize_exit_wanted; vdev_initializing_state_t vdev_initialize_state; list_node_t vdev_initialize_node; kthread_t *vdev_initialize_thread; /* Protects vdev_initialize_thread and vdev_initialize_state. */ kmutex_t vdev_initialize_lock; kcondvar_t vdev_initialize_cv; uint64_t vdev_initialize_offset[TXG_SIZE]; uint64_t vdev_initialize_last_offset; range_tree_t *vdev_initialize_tree; /* valid while initializing */ uint64_t vdev_initialize_bytes_est; uint64_t vdev_initialize_bytes_done; uint64_t vdev_initialize_action_time; /* start and end time */ /* TRIM related */ boolean_t vdev_trim_exit_wanted; boolean_t vdev_autotrim_exit_wanted; vdev_trim_state_t vdev_trim_state; list_node_t vdev_trim_node; kmutex_t vdev_autotrim_lock; kcondvar_t vdev_autotrim_cv; kcondvar_t vdev_autotrim_kick_cv; kthread_t *vdev_autotrim_thread; /* Protects vdev_trim_thread and vdev_trim_state. */ kmutex_t vdev_trim_lock; kcondvar_t vdev_trim_cv; kthread_t *vdev_trim_thread; uint64_t vdev_trim_offset[TXG_SIZE]; uint64_t vdev_trim_last_offset; uint64_t vdev_trim_bytes_est; uint64_t vdev_trim_bytes_done; uint64_t vdev_trim_rate; /* requested rate (bytes/sec) */ uint64_t vdev_trim_partial; /* requested partial TRIM */ uint64_t vdev_trim_secure; /* requested secure TRIM */ uint64_t vdev_trim_action_time; /* start and end time */ /* Rebuild related */ boolean_t vdev_rebuilding; boolean_t vdev_rebuild_exit_wanted; boolean_t vdev_rebuild_cancel_wanted; boolean_t vdev_rebuild_reset_wanted; kmutex_t vdev_rebuild_lock; kcondvar_t vdev_rebuild_cv; kthread_t *vdev_rebuild_thread; vdev_rebuild_t vdev_rebuild_config; /* For limiting outstanding I/Os (initialize, TRIM) */ kmutex_t vdev_initialize_io_lock; kcondvar_t vdev_initialize_io_cv; uint64_t vdev_initialize_inflight; kmutex_t vdev_trim_io_lock; kcondvar_t vdev_trim_io_cv; uint64_t vdev_trim_inflight[3]; /* * Values stored in the config for an indirect or removing vdev. */ vdev_indirect_config_t vdev_indirect_config; /* * The vdev_indirect_rwlock protects the vdev_indirect_mapping * pointer from changing on indirect vdevs (when it is condensed). * Note that removing (not yet indirect) vdevs have different * access patterns (the mapping is not accessed from open context, * e.g. from zio_read) and locking strategy (e.g. svr_lock). */ krwlock_t vdev_indirect_rwlock; vdev_indirect_mapping_t *vdev_indirect_mapping; vdev_indirect_births_t *vdev_indirect_births; /* * In memory data structures used to manage the obsolete sm, for * indirect or removing vdevs. * * The vdev_obsolete_segments is the in-core record of the segments * that are no longer referenced anywhere in the pool (due to * being freed or remapped and not referenced by any snapshots). * During a sync, segments are added to vdev_obsolete_segments * via vdev_indirect_mark_obsolete(); at the end of each sync * pass, this is appended to vdev_obsolete_sm via * vdev_indirect_sync_obsolete(). The vdev_obsolete_lock * protects against concurrent modifications of vdev_obsolete_segments * from multiple zio threads. */ kmutex_t vdev_obsolete_lock; range_tree_t *vdev_obsolete_segments; space_map_t *vdev_obsolete_sm; /* * Protects the vdev_scan_io_queue field itself as well as the * structure's contents (when present). */ kmutex_t vdev_scan_io_queue_lock; struct dsl_scan_io_queue *vdev_scan_io_queue; /* * Leaf vdev state. */ range_tree_t *vdev_dtl[DTL_TYPES]; /* dirty time logs */ space_map_t *vdev_dtl_sm; /* dirty time log space map */ txg_node_t vdev_dtl_node; /* per-txg dirty DTL linkage */ uint64_t vdev_dtl_object; /* DTL object */ uint64_t vdev_psize; /* physical device capacity */ uint64_t vdev_wholedisk; /* true if this is a whole disk */ uint64_t vdev_offline; /* persistent offline state */ uint64_t vdev_faulted; /* persistent faulted state */ uint64_t vdev_degraded; /* persistent degraded state */ uint64_t vdev_removed; /* persistent removed state */ uint64_t vdev_resilver_txg; /* persistent resilvering state */ uint64_t vdev_rebuild_txg; /* persistent rebuilding state */ char *vdev_path; /* vdev path (if any) */ char *vdev_devid; /* vdev devid (if any) */ char *vdev_physpath; /* vdev device path (if any) */ char *vdev_enc_sysfs_path; /* enclosure sysfs path */ char *vdev_fru; /* physical FRU location */ uint64_t vdev_not_present; /* not present during import */ uint64_t vdev_unspare; /* unspare when resilvering done */ boolean_t vdev_nowritecache; /* true if flushwritecache failed */ boolean_t vdev_has_trim; /* TRIM is supported */ boolean_t vdev_has_securetrim; /* secure TRIM is supported */ boolean_t vdev_checkremove; /* temporary online test */ boolean_t vdev_forcefault; /* force online fault */ boolean_t vdev_splitting; /* split or repair in progress */ boolean_t vdev_delayed_close; /* delayed device close? */ boolean_t vdev_tmpoffline; /* device taken offline temporarily? */ boolean_t vdev_detached; /* device detached? */ boolean_t vdev_cant_read; /* vdev is failing all reads */ boolean_t vdev_cant_write; /* vdev is failing all writes */ boolean_t vdev_isspare; /* was a hot spare */ boolean_t vdev_isl2cache; /* was a l2cache device */ boolean_t vdev_copy_uberblocks; /* post expand copy uberblocks */ boolean_t vdev_resilver_deferred; /* resilver deferred */ boolean_t vdev_kobj_flag; /* kobj event record */ boolean_t vdev_attaching; /* vdev attach ashift handling */ vdev_queue_t vdev_queue; /* I/O deadline schedule queue */ spa_aux_vdev_t *vdev_aux; /* for l2cache and spares vdevs */ zio_t *vdev_probe_zio; /* root of current probe */ vdev_aux_t vdev_label_aux; /* on-disk aux state */ uint64_t vdev_leaf_zap; hrtime_t vdev_mmp_pending; /* 0 if write finished */ uint64_t vdev_mmp_kstat_id; /* to find kstat entry */ uint64_t vdev_expansion_time; /* vdev's last expansion time */ list_node_t vdev_leaf_node; /* leaf vdev list */ /* * For DTrace to work in userland (libzpool) context, these fields must * remain at the end of the structure. DTrace will use the kernel's * CTF definition for 'struct vdev', and since the size of a kmutex_t is * larger in userland, the offsets for the rest of the fields would be * incorrect. */ kmutex_t vdev_dtl_lock; /* vdev_dtl_{map,resilver} */ kmutex_t vdev_stat_lock; /* vdev_stat */ kmutex_t vdev_probe_lock; /* protects vdev_probe_zio */ /* * We rate limit ZIO delay, deadman, and checksum events, since they * can flood ZED with tons of events when a drive is acting up. */ zfs_ratelimit_t vdev_delay_rl; zfs_ratelimit_t vdev_deadman_rl; zfs_ratelimit_t vdev_checksum_rl; /* - * Checksum and IO thresholds for tuning ZED + * Vdev properties for tuning ZED */ uint64_t vdev_checksum_n; uint64_t vdev_checksum_t; uint64_t vdev_io_n; uint64_t vdev_io_t; + uint64_t vdev_slow_io_n; + uint64_t vdev_slow_io_t; }; #define VDEV_PAD_SIZE (8 << 10) /* 2 padding areas (vl_pad1 and vl_be) to skip */ #define VDEV_SKIP_SIZE VDEV_PAD_SIZE * 2 #define VDEV_PHYS_SIZE (112 << 10) #define VDEV_UBERBLOCK_RING (128 << 10) /* * MMP blocks occupy the last MMP_BLOCKS_PER_LABEL slots in the uberblock * ring when MMP is enabled. */ #define MMP_BLOCKS_PER_LABEL 1 /* The largest uberblock we support is 8k. */ #define MAX_UBERBLOCK_SHIFT (13) #define VDEV_UBERBLOCK_SHIFT(vd) \ MIN(MAX((vd)->vdev_top->vdev_ashift, UBERBLOCK_SHIFT), \ MAX_UBERBLOCK_SHIFT) #define VDEV_UBERBLOCK_COUNT(vd) \ (VDEV_UBERBLOCK_RING >> VDEV_UBERBLOCK_SHIFT(vd)) #define VDEV_UBERBLOCK_OFFSET(vd, n) \ offsetof(vdev_label_t, vl_uberblock[(n) << VDEV_UBERBLOCK_SHIFT(vd)]) #define VDEV_UBERBLOCK_SIZE(vd) (1ULL << VDEV_UBERBLOCK_SHIFT(vd)) typedef struct vdev_phys { char vp_nvlist[VDEV_PHYS_SIZE - sizeof (zio_eck_t)]; zio_eck_t vp_zbt; } vdev_phys_t; typedef enum vbe_vers { /* * The bootenv file is stored as ascii text in the envblock. * It is used by the GRUB bootloader used on Linux to store the * contents of the grubenv file. The file is stored as raw ASCII, * and is protected by an embedded checksum. By default, GRUB will * check if the boot filesystem supports storing the environment data * in a special location, and if so, will invoke filesystem specific * logic to retrieve it. This can be overridden by a variable, should * the user so desire. */ VB_RAW = 0, /* * The bootenv file is converted to an nvlist and then packed into the * envblock. */ VB_NVLIST = 1 } vbe_vers_t; typedef struct vdev_boot_envblock { uint64_t vbe_version; char vbe_bootenv[VDEV_PAD_SIZE - sizeof (uint64_t) - sizeof (zio_eck_t)]; zio_eck_t vbe_zbt; } vdev_boot_envblock_t; _Static_assert(sizeof (vdev_boot_envblock_t) == VDEV_PAD_SIZE, "vdev_boot_envblock_t wrong size"); typedef struct vdev_label { char vl_pad1[VDEV_PAD_SIZE]; /* 8K */ vdev_boot_envblock_t vl_be; /* 8K */ vdev_phys_t vl_vdev_phys; /* 112K */ char vl_uberblock[VDEV_UBERBLOCK_RING]; /* 128K */ } vdev_label_t; /* 256K total */ /* * vdev_dirty() flags */ #define VDD_METASLAB 0x01 #define VDD_DTL 0x02 /* Offset of embedded boot loader region on each label */ #define VDEV_BOOT_OFFSET (2 * sizeof (vdev_label_t)) /* * Size of embedded boot loader region on each label. * The total size of the first two labels plus the boot area is 4MB. * On RAIDZ, this space is overwritten during RAIDZ expansion. */ #define VDEV_BOOT_SIZE (7ULL << 19) /* 3.5M */ /* * Size of label regions at the start and end of each leaf device. */ #define VDEV_LABEL_START_SIZE (2 * sizeof (vdev_label_t) + VDEV_BOOT_SIZE) #define VDEV_LABEL_END_SIZE (2 * sizeof (vdev_label_t)) #define VDEV_LABELS 4 #define VDEV_BEST_LABEL VDEV_LABELS #define VDEV_OFFSET_IS_LABEL(vd, off) \ (((off) < VDEV_LABEL_START_SIZE) || \ ((off) >= ((vd)->vdev_psize - VDEV_LABEL_END_SIZE))) #define VDEV_ALLOC_LOAD 0 #define VDEV_ALLOC_ADD 1 #define VDEV_ALLOC_SPARE 2 #define VDEV_ALLOC_L2CACHE 3 #define VDEV_ALLOC_ROOTPOOL 4 #define VDEV_ALLOC_SPLIT 5 #define VDEV_ALLOC_ATTACH 6 /* * Allocate or free a vdev */ extern vdev_t *vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops); extern int vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *config, vdev_t *parent, uint_t id, int alloctype); extern void vdev_free(vdev_t *vd); /* * Add or remove children and parents */ extern void vdev_add_child(vdev_t *pvd, vdev_t *cvd); extern void vdev_remove_child(vdev_t *pvd, vdev_t *cvd); extern void vdev_compact_children(vdev_t *pvd); extern vdev_t *vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops); extern void vdev_remove_parent(vdev_t *cvd); /* * vdev sync load and sync */ extern boolean_t vdev_log_state_valid(vdev_t *vd); extern int vdev_load(vdev_t *vd); extern int vdev_dtl_load(vdev_t *vd); extern void vdev_sync(vdev_t *vd, uint64_t txg); extern void vdev_sync_done(vdev_t *vd, uint64_t txg); extern void vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg); extern void vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg); /* * Available vdev types. */ extern vdev_ops_t vdev_root_ops; extern vdev_ops_t vdev_mirror_ops; extern vdev_ops_t vdev_replacing_ops; extern vdev_ops_t vdev_raidz_ops; extern vdev_ops_t vdev_draid_ops; extern vdev_ops_t vdev_draid_spare_ops; extern vdev_ops_t vdev_disk_ops; extern vdev_ops_t vdev_file_ops; extern vdev_ops_t vdev_missing_ops; extern vdev_ops_t vdev_hole_ops; extern vdev_ops_t vdev_spare_ops; extern vdev_ops_t vdev_indirect_ops; /* * Common size functions */ extern void vdev_default_xlate(vdev_t *vd, const range_seg64_t *logical_rs, range_seg64_t *physical_rs, range_seg64_t *remain_rs); extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize, uint64_t txg); extern uint64_t vdev_default_min_asize(vdev_t *vd); extern uint64_t vdev_get_min_asize(vdev_t *vd); extern void vdev_set_min_asize(vdev_t *vd); extern uint64_t vdev_get_min_alloc(vdev_t *vd); extern uint64_t vdev_get_nparity(vdev_t *vd); extern uint64_t vdev_get_ndisks(vdev_t *vd); /* * Global variables */ extern int zfs_vdev_standard_sm_blksz; /* * Functions from vdev_indirect.c */ extern void vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx); extern boolean_t vdev_indirect_should_condense(vdev_t *vd); extern void spa_condense_indirect_start_sync(vdev_t *vd, dmu_tx_t *tx); extern int vdev_obsolete_sm_object(vdev_t *vd, uint64_t *sm_obj); extern int vdev_obsolete_counts_are_precise(vdev_t *vd, boolean_t *are_precise); /* * Other miscellaneous functions */ int vdev_checkpoint_sm_object(vdev_t *vd, uint64_t *sm_obj); void vdev_metaslab_group_create(vdev_t *vd); uint64_t vdev_best_ashift(uint64_t logical, uint64_t a, uint64_t b); /* * Vdev ashift optimization tunables */ extern uint_t zfs_vdev_min_auto_ashift; extern uint_t zfs_vdev_max_auto_ashift; int param_set_min_auto_ashift(ZFS_MODULE_PARAM_ARGS); int param_set_max_auto_ashift(ZFS_MODULE_PARAM_ARGS); #ifdef __cplusplus } #endif #endif /* _SYS_VDEV_IMPL_H */ diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index 7c39b134d1ef..cdd2f04c2629 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -1,9647 +1,9649 @@ - + + + diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index c7b8617ef35e..402c14a6baee 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -1,5480 +1,5482 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright 2016 Igor Kozhukhov * Copyright (c) 2018 Datto Inc. * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2018, loli10K * Copyright (c) 2021, Colm Buckley * Copyright (c) 2021, 2023, Klara Inc. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_namecheck.h" #include "zfs_prop.h" #include "libzfs_impl.h" #include "zfs_comutil.h" #include "zfeature_common.h" static boolean_t zpool_vdev_is_interior(const char *name); typedef struct prop_flags { unsigned int create:1; /* Validate property on creation */ unsigned int import:1; /* Validate property on import */ unsigned int vdevprop:1; /* Validate property as a VDEV property */ } prop_flags_t; /* * ==================================================================== * zpool property functions * ==================================================================== */ static int zpool_get_all_props(zpool_handle_t *zhp) { zfs_cmd_t zc = {"\0"}; libzfs_handle_t *hdl = zhp->zpool_hdl; (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); zcmd_alloc_dst_nvlist(hdl, &zc, 0); while (zfs_ioctl(hdl, ZFS_IOC_POOL_GET_PROPS, &zc) != 0) { if (errno == ENOMEM) zcmd_expand_dst_nvlist(hdl, &zc); else { zcmd_free_nvlists(&zc); return (-1); } } if (zcmd_read_dst_nvlist(hdl, &zc, &zhp->zpool_props) != 0) { zcmd_free_nvlists(&zc); return (-1); } zcmd_free_nvlists(&zc); return (0); } int zpool_props_refresh(zpool_handle_t *zhp) { nvlist_t *old_props; old_props = zhp->zpool_props; if (zpool_get_all_props(zhp) != 0) return (-1); nvlist_free(old_props); return (0); } static const char * zpool_get_prop_string(zpool_handle_t *zhp, zpool_prop_t prop, zprop_source_t *src) { nvlist_t *nv, *nvl; const char *value; zprop_source_t source; nvl = zhp->zpool_props; if (nvlist_lookup_nvlist(nvl, zpool_prop_to_name(prop), &nv) == 0) { source = fnvlist_lookup_uint64(nv, ZPROP_SOURCE); value = fnvlist_lookup_string(nv, ZPROP_VALUE); } else { source = ZPROP_SRC_DEFAULT; if ((value = zpool_prop_default_string(prop)) == NULL) value = "-"; } if (src) *src = source; return (value); } uint64_t zpool_get_prop_int(zpool_handle_t *zhp, zpool_prop_t prop, zprop_source_t *src) { nvlist_t *nv, *nvl; uint64_t value; zprop_source_t source; if (zhp->zpool_props == NULL && zpool_get_all_props(zhp)) { /* * zpool_get_all_props() has most likely failed because * the pool is faulted, but if all we need is the top level * vdev's guid then get it from the zhp config nvlist. */ if ((prop == ZPOOL_PROP_GUID) && (nvlist_lookup_nvlist(zhp->zpool_config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0) && (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &value) == 0)) { return (value); } return (zpool_prop_default_numeric(prop)); } nvl = zhp->zpool_props; if (nvlist_lookup_nvlist(nvl, zpool_prop_to_name(prop), &nv) == 0) { source = fnvlist_lookup_uint64(nv, ZPROP_SOURCE); value = fnvlist_lookup_uint64(nv, ZPROP_VALUE); } else { source = ZPROP_SRC_DEFAULT; value = zpool_prop_default_numeric(prop); } if (src) *src = source; return (value); } /* * Map VDEV STATE to printed strings. */ const char * zpool_state_to_name(vdev_state_t state, vdev_aux_t aux) { switch (state) { case VDEV_STATE_CLOSED: case VDEV_STATE_OFFLINE: return (gettext("OFFLINE")); case VDEV_STATE_REMOVED: return (gettext("REMOVED")); case VDEV_STATE_CANT_OPEN: if (aux == VDEV_AUX_CORRUPT_DATA || aux == VDEV_AUX_BAD_LOG) return (gettext("FAULTED")); else if (aux == VDEV_AUX_SPLIT_POOL) return (gettext("SPLIT")); else return (gettext("UNAVAIL")); case VDEV_STATE_FAULTED: return (gettext("FAULTED")); case VDEV_STATE_DEGRADED: return (gettext("DEGRADED")); case VDEV_STATE_HEALTHY: return (gettext("ONLINE")); default: break; } return (gettext("UNKNOWN")); } /* * Map POOL STATE to printed strings. */ const char * zpool_pool_state_to_name(pool_state_t state) { switch (state) { default: break; case POOL_STATE_ACTIVE: return (gettext("ACTIVE")); case POOL_STATE_EXPORTED: return (gettext("EXPORTED")); case POOL_STATE_DESTROYED: return (gettext("DESTROYED")); case POOL_STATE_SPARE: return (gettext("SPARE")); case POOL_STATE_L2CACHE: return (gettext("L2CACHE")); case POOL_STATE_UNINITIALIZED: return (gettext("UNINITIALIZED")); case POOL_STATE_UNAVAIL: return (gettext("UNAVAIL")); case POOL_STATE_POTENTIALLY_ACTIVE: return (gettext("POTENTIALLY_ACTIVE")); } return (gettext("UNKNOWN")); } /* * Given a pool handle, return the pool health string ("ONLINE", "DEGRADED", * "SUSPENDED", etc). */ const char * zpool_get_state_str(zpool_handle_t *zhp) { zpool_errata_t errata; zpool_status_t status; const char *str; status = zpool_get_status(zhp, NULL, &errata); if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) { str = gettext("FAULTED"); } else if (status == ZPOOL_STATUS_IO_FAILURE_WAIT || status == ZPOOL_STATUS_IO_FAILURE_CONTINUE || status == ZPOOL_STATUS_IO_FAILURE_MMP) { str = gettext("SUSPENDED"); } else { nvlist_t *nvroot = fnvlist_lookup_nvlist( zpool_get_config(zhp, NULL), ZPOOL_CONFIG_VDEV_TREE); uint_t vsc; vdev_stat_t *vs = (vdev_stat_t *)fnvlist_lookup_uint64_array( nvroot, ZPOOL_CONFIG_VDEV_STATS, &vsc); str = zpool_state_to_name(vs->vs_state, vs->vs_aux); } return (str); } /* * Get a zpool property value for 'prop' and return the value in * a pre-allocated buffer. */ int zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf, size_t len, zprop_source_t *srctype, boolean_t literal) { uint64_t intval; const char *strval; zprop_source_t src = ZPROP_SRC_NONE; if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) { switch (prop) { case ZPOOL_PROP_NAME: (void) strlcpy(buf, zpool_get_name(zhp), len); break; case ZPOOL_PROP_HEALTH: (void) strlcpy(buf, zpool_get_state_str(zhp), len); break; case ZPOOL_PROP_GUID: intval = zpool_get_prop_int(zhp, prop, &src); (void) snprintf(buf, len, "%llu", (u_longlong_t)intval); break; case ZPOOL_PROP_ALTROOT: case ZPOOL_PROP_CACHEFILE: case ZPOOL_PROP_COMMENT: case ZPOOL_PROP_COMPATIBILITY: if (zhp->zpool_props != NULL || zpool_get_all_props(zhp) == 0) { (void) strlcpy(buf, zpool_get_prop_string(zhp, prop, &src), len); break; } zfs_fallthrough; default: (void) strlcpy(buf, "-", len); break; } if (srctype != NULL) *srctype = src; return (0); } if (zhp->zpool_props == NULL && zpool_get_all_props(zhp) && prop != ZPOOL_PROP_NAME) return (-1); switch (zpool_prop_get_type(prop)) { case PROP_TYPE_STRING: (void) strlcpy(buf, zpool_get_prop_string(zhp, prop, &src), len); break; case PROP_TYPE_NUMBER: intval = zpool_get_prop_int(zhp, prop, &src); switch (prop) { case ZPOOL_PROP_SIZE: case ZPOOL_PROP_ALLOCATED: case ZPOOL_PROP_FREE: case ZPOOL_PROP_FREEING: case ZPOOL_PROP_LEAKED: case ZPOOL_PROP_ASHIFT: case ZPOOL_PROP_MAXBLOCKSIZE: case ZPOOL_PROP_MAXDNODESIZE: case ZPOOL_PROP_BCLONESAVED: case ZPOOL_PROP_BCLONEUSED: if (literal) (void) snprintf(buf, len, "%llu", (u_longlong_t)intval); else (void) zfs_nicenum(intval, buf, len); break; case ZPOOL_PROP_EXPANDSZ: case ZPOOL_PROP_CHECKPOINT: if (intval == 0) { (void) strlcpy(buf, "-", len); } else if (literal) { (void) snprintf(buf, len, "%llu", (u_longlong_t)intval); } else { (void) zfs_nicebytes(intval, buf, len); } break; case ZPOOL_PROP_CAPACITY: if (literal) { (void) snprintf(buf, len, "%llu", (u_longlong_t)intval); } else { (void) snprintf(buf, len, "%llu%%", (u_longlong_t)intval); } break; case ZPOOL_PROP_FRAGMENTATION: if (intval == UINT64_MAX) { (void) strlcpy(buf, "-", len); } else if (literal) { (void) snprintf(buf, len, "%llu", (u_longlong_t)intval); } else { (void) snprintf(buf, len, "%llu%%", (u_longlong_t)intval); } break; case ZPOOL_PROP_BCLONERATIO: case ZPOOL_PROP_DEDUPRATIO: if (literal) (void) snprintf(buf, len, "%llu.%02llu", (u_longlong_t)(intval / 100), (u_longlong_t)(intval % 100)); else (void) snprintf(buf, len, "%llu.%02llux", (u_longlong_t)(intval / 100), (u_longlong_t)(intval % 100)); break; case ZPOOL_PROP_HEALTH: (void) strlcpy(buf, zpool_get_state_str(zhp), len); break; case ZPOOL_PROP_VERSION: if (intval >= SPA_VERSION_FEATURES) { (void) snprintf(buf, len, "-"); break; } zfs_fallthrough; default: (void) snprintf(buf, len, "%llu", (u_longlong_t)intval); } break; case PROP_TYPE_INDEX: intval = zpool_get_prop_int(zhp, prop, &src); if (zpool_prop_index_to_string(prop, intval, &strval) != 0) return (-1); (void) strlcpy(buf, strval, len); break; default: abort(); } if (srctype) *srctype = src; return (0); } /* * Get a zpool property value for 'propname' and return the value in * a pre-allocated buffer. */ int zpool_get_userprop(zpool_handle_t *zhp, const char *propname, char *buf, size_t len, zprop_source_t *srctype) { nvlist_t *nv, *nvl; uint64_t ival; const char *value; zprop_source_t source = ZPROP_SRC_LOCAL; nvl = zhp->zpool_props; if (nvlist_lookup_nvlist(nvl, propname, &nv) == 0) { if (nvlist_lookup_uint64(nv, ZPROP_SOURCE, &ival) == 0) source = ival; verify(nvlist_lookup_string(nv, ZPROP_VALUE, &value) == 0); } else { source = ZPROP_SRC_DEFAULT; value = "-"; } if (srctype) *srctype = source; (void) strlcpy(buf, value, len); return (0); } /* * Check if the bootfs name has the same pool name as it is set to. * Assuming bootfs is a valid dataset name. */ static boolean_t bootfs_name_valid(const char *pool, const char *bootfs) { int len = strlen(pool); if (bootfs[0] == '\0') return (B_TRUE); if (!zfs_name_valid(bootfs, ZFS_TYPE_FILESYSTEM|ZFS_TYPE_SNAPSHOT)) return (B_FALSE); if (strncmp(pool, bootfs, len) == 0 && (bootfs[len] == '/' || bootfs[len] == '\0')) return (B_TRUE); return (B_FALSE); } /* * Given an nvlist of zpool properties to be set, validate that they are * correct, and parse any numeric properties (index, boolean, etc) if they are * specified as strings. */ static nvlist_t * zpool_valid_proplist(libzfs_handle_t *hdl, const char *poolname, nvlist_t *props, uint64_t version, prop_flags_t flags, char *errbuf) { nvpair_t *elem; nvlist_t *retprops; zpool_prop_t prop; const char *strval; uint64_t intval; const char *slash, *check; struct stat64 statbuf; zpool_handle_t *zhp; char report[1024]; if (nvlist_alloc(&retprops, NV_UNIQUE_NAME, 0) != 0) { (void) no_memory(hdl); return (NULL); } elem = NULL; while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { const char *propname = nvpair_name(elem); if (flags.vdevprop && zpool_prop_vdev(propname)) { vdev_prop_t vprop = vdev_name_to_prop(propname); if (vdev_prop_readonly(vprop)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' " "is readonly"), propname); (void) zfs_error(hdl, EZFS_PROPREADONLY, errbuf); goto error; } if (zprop_parse_value(hdl, elem, vprop, ZFS_TYPE_VDEV, retprops, &strval, &intval, errbuf) != 0) goto error; continue; } else if (flags.vdevprop && vdev_prop_user(propname)) { if (nvlist_add_nvpair(retprops, elem) != 0) { (void) no_memory(hdl); goto error; } continue; } else if (flags.vdevprop) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid property: '%s'"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } prop = zpool_name_to_prop(propname); if (prop == ZPOOL_PROP_INVAL && zpool_prop_feature(propname)) { int err; char *fname = strchr(propname, '@') + 1; err = zfeature_lookup_name(fname, NULL); if (err != 0) { ASSERT3U(err, ==, ENOENT); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "feature '%s' unsupported by kernel"), fname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } if (nvpair_type(elem) != DATA_TYPE_STRING) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' must be a string"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } (void) nvpair_value_string(elem, &strval); if (strcmp(strval, ZFS_FEATURE_ENABLED) != 0 && strcmp(strval, ZFS_FEATURE_DISABLED) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property '%s' can only be set to " "'enabled' or 'disabled'"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } if (!flags.create && strcmp(strval, ZFS_FEATURE_DISABLED) == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property '%s' can only be set to " "'disabled' at creation time"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } if (nvlist_add_uint64(retprops, propname, 0) != 0) { (void) no_memory(hdl); goto error; } continue; } else if (prop == ZPOOL_PROP_INVAL && zfs_prop_user(propname)) { /* * This is a user property: make sure it's a * string, and that it's less than ZAP_MAXNAMELEN. */ if (nvpair_type(elem) != DATA_TYPE_STRING) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' must be a string"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } if (strlen(nvpair_name(elem)) >= ZAP_MAXNAMELEN) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property name '%s' is too long"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } (void) nvpair_value_string(elem, &strval); if (strlen(strval) >= ZFS_MAXPROPLEN) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property value '%s' is too long"), strval); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } if (nvlist_add_string(retprops, propname, strval) != 0) { (void) no_memory(hdl); goto error; } continue; } /* * Make sure this property is valid and applies to this type. */ if (prop == ZPOOL_PROP_INVAL) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid property '%s'"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } if (zpool_prop_readonly(prop)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' " "is readonly"), propname); (void) zfs_error(hdl, EZFS_PROPREADONLY, errbuf); goto error; } if (!flags.create && zpool_prop_setonce(prop)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property '%s' can only be set at " "creation time"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } if (zprop_parse_value(hdl, elem, prop, ZFS_TYPE_POOL, retprops, &strval, &intval, errbuf) != 0) goto error; /* * Perform additional checking for specific properties. */ switch (prop) { case ZPOOL_PROP_VERSION: if (intval < version || !SPA_VERSION_IS_SUPPORTED(intval)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property '%s' number %llu is invalid."), propname, (unsigned long long)intval); (void) zfs_error(hdl, EZFS_BADVERSION, errbuf); goto error; } break; case ZPOOL_PROP_ASHIFT: if (intval != 0 && (intval < ASHIFT_MIN || intval > ASHIFT_MAX)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property '%s' number %llu is invalid, " "only values between %" PRId32 " and %" PRId32 " are allowed."), propname, (unsigned long long)intval, ASHIFT_MIN, ASHIFT_MAX); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } break; case ZPOOL_PROP_BOOTFS: if (flags.create || flags.import) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property '%s' cannot be set at creation " "or import time"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } if (version < SPA_VERSION_BOOTFS) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be upgraded to support " "'%s' property"), propname); (void) zfs_error(hdl, EZFS_BADVERSION, errbuf); goto error; } /* * bootfs property value has to be a dataset name and * the dataset has to be in the same pool as it sets to. */ if (!bootfs_name_valid(poolname, strval)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' " "is an invalid name"), strval); (void) zfs_error(hdl, EZFS_INVALIDNAME, errbuf); goto error; } if ((zhp = zpool_open_canfail(hdl, poolname)) == NULL) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "could not open pool '%s'"), poolname); (void) zfs_error(hdl, EZFS_OPENFAILED, errbuf); goto error; } zpool_close(zhp); break; case ZPOOL_PROP_ALTROOT: if (!flags.create && !flags.import) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property '%s' can only be set during pool " "creation or import"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } if (strval[0] != '/') { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "bad alternate root '%s'"), strval); (void) zfs_error(hdl, EZFS_BADPATH, errbuf); goto error; } break; case ZPOOL_PROP_CACHEFILE: if (strval[0] == '\0') break; if (strcmp(strval, "none") == 0) break; if (strval[0] != '/') { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property '%s' must be empty, an " "absolute path, or 'none'"), propname); (void) zfs_error(hdl, EZFS_BADPATH, errbuf); goto error; } slash = strrchr(strval, '/'); if (slash[1] == '\0' || strcmp(slash, "/.") == 0 || strcmp(slash, "/..") == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' is not a valid file"), strval); (void) zfs_error(hdl, EZFS_BADPATH, errbuf); goto error; } *(char *)slash = '\0'; if (strval[0] != '\0' && (stat64(strval, &statbuf) != 0 || !S_ISDIR(statbuf.st_mode))) { *(char *)slash = '/'; zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' is not a valid directory"), strval); (void) zfs_error(hdl, EZFS_BADPATH, errbuf); goto error; } *(char *)slash = '/'; break; case ZPOOL_PROP_COMPATIBILITY: switch (zpool_load_compat(strval, NULL, report, 1024)) { case ZPOOL_COMPATIBILITY_OK: case ZPOOL_COMPATIBILITY_WARNTOKEN: break; case ZPOOL_COMPATIBILITY_BADFILE: case ZPOOL_COMPATIBILITY_BADTOKEN: case ZPOOL_COMPATIBILITY_NOFILES: zfs_error_aux(hdl, "%s", report); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } break; case ZPOOL_PROP_COMMENT: for (check = strval; *check != '\0'; check++) { if (!isprint(*check)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "comment may only have printable " "characters")); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } } if (strlen(strval) > ZPROP_MAX_COMMENT) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "comment must not exceed %d characters"), ZPROP_MAX_COMMENT); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } break; case ZPOOL_PROP_READONLY: if (!flags.import) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property '%s' can only be set at " "import time"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } break; case ZPOOL_PROP_MULTIHOST: if (get_system_hostid() == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "requires a non-zero system hostid")); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } break; case ZPOOL_PROP_DEDUPDITTO: printf("Note: property '%s' no longer has " "any effect\n", propname); break; default: break; } } return (retprops); error: nvlist_free(retprops); return (NULL); } /* * Set zpool property : propname=propval. */ int zpool_set_prop(zpool_handle_t *zhp, const char *propname, const char *propval) { zfs_cmd_t zc = {"\0"}; int ret = -1; char errbuf[ERRBUFLEN]; nvlist_t *nvl = NULL; nvlist_t *realprops; uint64_t version; prop_flags_t flags = { 0 }; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot set property for '%s'"), zhp->zpool_name); if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) return (no_memory(zhp->zpool_hdl)); if (nvlist_add_string(nvl, propname, propval) != 0) { nvlist_free(nvl); return (no_memory(zhp->zpool_hdl)); } version = zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL); if ((realprops = zpool_valid_proplist(zhp->zpool_hdl, zhp->zpool_name, nvl, version, flags, errbuf)) == NULL) { nvlist_free(nvl); return (-1); } nvlist_free(nvl); nvl = realprops; /* * Execute the corresponding ioctl() to set this property. */ (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); zcmd_write_src_nvlist(zhp->zpool_hdl, &zc, nvl); ret = zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_POOL_SET_PROPS, &zc); zcmd_free_nvlists(&zc); nvlist_free(nvl); if (ret) (void) zpool_standard_error(zhp->zpool_hdl, errno, errbuf); else (void) zpool_props_refresh(zhp); return (ret); } int zpool_expand_proplist(zpool_handle_t *zhp, zprop_list_t **plp, zfs_type_t type, boolean_t literal) { libzfs_handle_t *hdl = zhp->zpool_hdl; zprop_list_t *entry; char buf[ZFS_MAXPROPLEN]; nvlist_t *features = NULL; nvpair_t *nvp; zprop_list_t **last; boolean_t firstexpand = (NULL == *plp); int i; if (zprop_expand_list(hdl, plp, type) != 0) return (-1); if (type == ZFS_TYPE_VDEV) return (0); last = plp; while (*last != NULL) last = &(*last)->pl_next; if ((*plp)->pl_all) features = zpool_get_features(zhp); if ((*plp)->pl_all && firstexpand) { /* Handle userprops in the all properties case */ if (zhp->zpool_props == NULL && zpool_props_refresh(zhp)) return (-1); nvp = NULL; while ((nvp = nvlist_next_nvpair(zhp->zpool_props, nvp)) != NULL) { const char *propname = nvpair_name(nvp); if (!zfs_prop_user(propname)) continue; entry = zfs_alloc(hdl, sizeof (zprop_list_t)); entry->pl_prop = ZPROP_USERPROP; entry->pl_user_prop = zfs_strdup(hdl, propname); entry->pl_width = strlen(entry->pl_user_prop); entry->pl_all = B_TRUE; *last = entry; last = &entry->pl_next; } for (i = 0; i < SPA_FEATURES; i++) { entry = zfs_alloc(hdl, sizeof (zprop_list_t)); entry->pl_prop = ZPROP_USERPROP; entry->pl_user_prop = zfs_asprintf(hdl, "feature@%s", spa_feature_table[i].fi_uname); entry->pl_width = strlen(entry->pl_user_prop); entry->pl_all = B_TRUE; *last = entry; last = &entry->pl_next; } } /* add any unsupported features */ for (nvp = nvlist_next_nvpair(features, NULL); nvp != NULL; nvp = nvlist_next_nvpair(features, nvp)) { char *propname; boolean_t found; if (zfeature_is_supported(nvpair_name(nvp))) continue; propname = zfs_asprintf(hdl, "unsupported@%s", nvpair_name(nvp)); /* * Before adding the property to the list make sure that no * other pool already added the same property. */ found = B_FALSE; entry = *plp; while (entry != NULL) { if (entry->pl_user_prop != NULL && strcmp(propname, entry->pl_user_prop) == 0) { found = B_TRUE; break; } entry = entry->pl_next; } if (found) { free(propname); continue; } entry = zfs_alloc(hdl, sizeof (zprop_list_t)); entry->pl_prop = ZPROP_USERPROP; entry->pl_user_prop = propname; entry->pl_width = strlen(entry->pl_user_prop); entry->pl_all = B_TRUE; *last = entry; last = &entry->pl_next; } for (entry = *plp; entry != NULL; entry = entry->pl_next) { if (entry->pl_fixed && !literal) continue; if (entry->pl_prop != ZPROP_USERPROP && zpool_get_prop(zhp, entry->pl_prop, buf, sizeof (buf), NULL, literal) == 0) { if (strlen(buf) > entry->pl_width) entry->pl_width = strlen(buf); } else if (entry->pl_prop == ZPROP_INVAL && zfs_prop_user(entry->pl_user_prop) && zpool_get_userprop(zhp, entry->pl_user_prop, buf, sizeof (buf), NULL) == 0) { if (strlen(buf) > entry->pl_width) entry->pl_width = strlen(buf); } } return (0); } int vdev_expand_proplist(zpool_handle_t *zhp, const char *vdevname, zprop_list_t **plp) { zprop_list_t *entry; char buf[ZFS_MAXPROPLEN]; const char *strval = NULL; int err = 0; nvpair_t *elem = NULL; nvlist_t *vprops = NULL; nvlist_t *propval = NULL; const char *propname; vdev_prop_t prop; zprop_list_t **last; for (entry = *plp; entry != NULL; entry = entry->pl_next) { if (entry->pl_fixed) continue; if (zpool_get_vdev_prop(zhp, vdevname, entry->pl_prop, entry->pl_user_prop, buf, sizeof (buf), NULL, B_FALSE) == 0) { if (strlen(buf) > entry->pl_width) entry->pl_width = strlen(buf); } if (entry->pl_prop == VDEV_PROP_NAME && strlen(vdevname) > entry->pl_width) entry->pl_width = strlen(vdevname); } /* Handle the all properties case */ last = plp; if (*last != NULL && (*last)->pl_all == B_TRUE) { while (*last != NULL) last = &(*last)->pl_next; err = zpool_get_all_vdev_props(zhp, vdevname, &vprops); if (err != 0) return (err); while ((elem = nvlist_next_nvpair(vprops, elem)) != NULL) { propname = nvpair_name(elem); /* Skip properties that are not user defined */ if ((prop = vdev_name_to_prop(propname)) != VDEV_PROP_USERPROP) continue; if (nvpair_value_nvlist(elem, &propval) != 0) continue; strval = fnvlist_lookup_string(propval, ZPROP_VALUE); entry = zfs_alloc(zhp->zpool_hdl, sizeof (zprop_list_t)); entry->pl_prop = prop; entry->pl_user_prop = zfs_strdup(zhp->zpool_hdl, propname); entry->pl_width = strlen(strval); entry->pl_all = B_TRUE; *last = entry; last = &entry->pl_next; } } return (0); } /* * Get the state for the given feature on the given ZFS pool. */ int zpool_prop_get_feature(zpool_handle_t *zhp, const char *propname, char *buf, size_t len) { uint64_t refcount; boolean_t found = B_FALSE; nvlist_t *features = zpool_get_features(zhp); boolean_t supported; const char *feature = strchr(propname, '@') + 1; supported = zpool_prop_feature(propname); ASSERT(supported || zpool_prop_unsupported(propname)); /* * Convert from feature name to feature guid. This conversion is * unnecessary for unsupported@... properties because they already * use guids. */ if (supported) { int ret; spa_feature_t fid; ret = zfeature_lookup_name(feature, &fid); if (ret != 0) { (void) strlcpy(buf, "-", len); return (ENOTSUP); } feature = spa_feature_table[fid].fi_guid; } if (nvlist_lookup_uint64(features, feature, &refcount) == 0) found = B_TRUE; if (supported) { if (!found) { (void) strlcpy(buf, ZFS_FEATURE_DISABLED, len); } else { if (refcount == 0) (void) strlcpy(buf, ZFS_FEATURE_ENABLED, len); else (void) strlcpy(buf, ZFS_FEATURE_ACTIVE, len); } } else { if (found) { if (refcount == 0) { (void) strcpy(buf, ZFS_UNSUPPORTED_INACTIVE); } else { (void) strcpy(buf, ZFS_UNSUPPORTED_READONLY); } } else { (void) strlcpy(buf, "-", len); return (ENOTSUP); } } return (0); } /* * Validate the given pool name, optionally putting an extended error message in * 'buf'. */ boolean_t zpool_name_valid(libzfs_handle_t *hdl, boolean_t isopen, const char *pool) { namecheck_err_t why; char what; int ret; ret = pool_namecheck(pool, &why, &what); /* * The rules for reserved pool names were extended at a later point. * But we need to support users with existing pools that may now be * invalid. So we only check for this expanded set of names during a * create (or import), and only in userland. */ if (ret == 0 && !isopen && (strncmp(pool, "mirror", 6) == 0 || strncmp(pool, "raidz", 5) == 0 || strncmp(pool, "draid", 5) == 0 || strncmp(pool, "spare", 5) == 0 || strcmp(pool, "log") == 0)) { if (hdl != NULL) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "name is reserved")); return (B_FALSE); } if (ret != 0) { if (hdl != NULL) { switch (why) { case NAME_ERR_TOOLONG: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "name is too long")); break; case NAME_ERR_INVALCHAR: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid character " "'%c' in pool name"), what); break; case NAME_ERR_NOLETTER: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "name must begin with a letter")); break; case NAME_ERR_RESERVED: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "name is reserved")); break; case NAME_ERR_DISKLIKE: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool name is reserved")); break; case NAME_ERR_LEADING_SLASH: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "leading slash in name")); break; case NAME_ERR_EMPTY_COMPONENT: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "empty component in name")); break; case NAME_ERR_TRAILING_SLASH: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "trailing slash in name")); break; case NAME_ERR_MULTIPLE_DELIMITERS: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "multiple '@' and/or '#' delimiters in " "name")); break; case NAME_ERR_NO_AT: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "permission set is missing '@'")); break; default: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "(%d) not defined"), why); break; } } return (B_FALSE); } return (B_TRUE); } /* * Open a handle to the given pool, even if the pool is currently in the FAULTED * state. */ zpool_handle_t * zpool_open_canfail(libzfs_handle_t *hdl, const char *pool) { zpool_handle_t *zhp; boolean_t missing; /* * Make sure the pool name is valid. */ if (!zpool_name_valid(hdl, B_TRUE, pool)) { (void) zfs_error_fmt(hdl, EZFS_INVALIDNAME, dgettext(TEXT_DOMAIN, "cannot open '%s'"), pool); return (NULL); } zhp = zfs_alloc(hdl, sizeof (zpool_handle_t)); zhp->zpool_hdl = hdl; (void) strlcpy(zhp->zpool_name, pool, sizeof (zhp->zpool_name)); if (zpool_refresh_stats(zhp, &missing) != 0) { zpool_close(zhp); return (NULL); } if (missing) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "no such pool")); (void) zfs_error_fmt(hdl, EZFS_NOENT, dgettext(TEXT_DOMAIN, "cannot open '%s'"), pool); zpool_close(zhp); return (NULL); } return (zhp); } /* * Like the above, but silent on error. Used when iterating over pools (because * the configuration cache may be out of date). */ int zpool_open_silent(libzfs_handle_t *hdl, const char *pool, zpool_handle_t **ret) { zpool_handle_t *zhp; boolean_t missing; zhp = zfs_alloc(hdl, sizeof (zpool_handle_t)); zhp->zpool_hdl = hdl; (void) strlcpy(zhp->zpool_name, pool, sizeof (zhp->zpool_name)); if (zpool_refresh_stats(zhp, &missing) != 0) { zpool_close(zhp); return (-1); } if (missing) { zpool_close(zhp); *ret = NULL; return (0); } *ret = zhp; return (0); } /* * Similar to zpool_open_canfail(), but refuses to open pools in the faulted * state. */ zpool_handle_t * zpool_open(libzfs_handle_t *hdl, const char *pool) { zpool_handle_t *zhp; if ((zhp = zpool_open_canfail(hdl, pool)) == NULL) return (NULL); if (zhp->zpool_state == POOL_STATE_UNAVAIL) { (void) zfs_error_fmt(hdl, EZFS_POOLUNAVAIL, dgettext(TEXT_DOMAIN, "cannot open '%s'"), zhp->zpool_name); zpool_close(zhp); return (NULL); } return (zhp); } /* * Close the handle. Simply frees the memory associated with the handle. */ void zpool_close(zpool_handle_t *zhp) { nvlist_free(zhp->zpool_config); nvlist_free(zhp->zpool_old_config); nvlist_free(zhp->zpool_props); free(zhp); } /* * Return the name of the pool. */ const char * zpool_get_name(zpool_handle_t *zhp) { return (zhp->zpool_name); } /* * Return the state of the pool (ACTIVE or UNAVAILABLE) */ int zpool_get_state(zpool_handle_t *zhp) { return (zhp->zpool_state); } /* * Check if vdev list contains a special vdev */ static boolean_t zpool_has_special_vdev(nvlist_t *nvroot) { nvlist_t **child; uint_t children; if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &child, &children) == 0) { for (uint_t c = 0; c < children; c++) { const char *bias; if (nvlist_lookup_string(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS, &bias) == 0 && strcmp(bias, VDEV_ALLOC_BIAS_SPECIAL) == 0) { return (B_TRUE); } } } return (B_FALSE); } /* * Check if vdev list contains a dRAID vdev */ static boolean_t zpool_has_draid_vdev(nvlist_t *nvroot) { nvlist_t **child; uint_t children; if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &child, &children) == 0) { for (uint_t c = 0; c < children; c++) { const char *type; if (nvlist_lookup_string(child[c], ZPOOL_CONFIG_TYPE, &type) == 0 && strcmp(type, VDEV_TYPE_DRAID) == 0) { return (B_TRUE); } } } return (B_FALSE); } /* * Output a dRAID top-level vdev name in to the provided buffer. */ static char * zpool_draid_name(char *name, int len, uint64_t data, uint64_t parity, uint64_t spares, uint64_t children) { snprintf(name, len, "%s%llu:%llud:%lluc:%llus", VDEV_TYPE_DRAID, (u_longlong_t)parity, (u_longlong_t)data, (u_longlong_t)children, (u_longlong_t)spares); return (name); } /* * Return B_TRUE if the provided name is a dRAID spare name. */ boolean_t zpool_is_draid_spare(const char *name) { uint64_t spare_id, parity, vdev_id; if (sscanf(name, VDEV_TYPE_DRAID "%llu-%llu-%llu", (u_longlong_t *)&parity, (u_longlong_t *)&vdev_id, (u_longlong_t *)&spare_id) == 3) { return (B_TRUE); } return (B_FALSE); } /* * Create the named pool, using the provided vdev list. It is assumed * that the consumer has already validated the contents of the nvlist, so we * don't have to worry about error semantics. */ int zpool_create(libzfs_handle_t *hdl, const char *pool, nvlist_t *nvroot, nvlist_t *props, nvlist_t *fsprops) { zfs_cmd_t zc = {"\0"}; nvlist_t *zc_fsprops = NULL; nvlist_t *zc_props = NULL; nvlist_t *hidden_args = NULL; uint8_t *wkeydata = NULL; uint_t wkeylen = 0; char errbuf[ERRBUFLEN]; int ret = -1; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot create '%s'"), pool); if (!zpool_name_valid(hdl, B_FALSE, pool)) return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); zcmd_write_conf_nvlist(hdl, &zc, nvroot); if (props) { prop_flags_t flags = { .create = B_TRUE, .import = B_FALSE }; if ((zc_props = zpool_valid_proplist(hdl, pool, props, SPA_VERSION_1, flags, errbuf)) == NULL) { goto create_failed; } } if (fsprops) { uint64_t zoned; const char *zonestr; zoned = ((nvlist_lookup_string(fsprops, zfs_prop_to_name(ZFS_PROP_ZONED), &zonestr) == 0) && strcmp(zonestr, "on") == 0); if ((zc_fsprops = zfs_valid_proplist(hdl, ZFS_TYPE_FILESYSTEM, fsprops, zoned, NULL, NULL, B_TRUE, errbuf)) == NULL) { goto create_failed; } if (nvlist_exists(zc_fsprops, zfs_prop_to_name(ZFS_PROP_SPECIAL_SMALL_BLOCKS)) && !zpool_has_special_vdev(nvroot)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "%s property requires a special vdev"), zfs_prop_to_name(ZFS_PROP_SPECIAL_SMALL_BLOCKS)); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto create_failed; } if (!zc_props && (nvlist_alloc(&zc_props, NV_UNIQUE_NAME, 0) != 0)) { goto create_failed; } if (zfs_crypto_create(hdl, NULL, zc_fsprops, props, B_TRUE, &wkeydata, &wkeylen) != 0) { zfs_error(hdl, EZFS_CRYPTOFAILED, errbuf); goto create_failed; } if (nvlist_add_nvlist(zc_props, ZPOOL_ROOTFS_PROPS, zc_fsprops) != 0) { goto create_failed; } if (wkeydata != NULL) { if (nvlist_alloc(&hidden_args, NV_UNIQUE_NAME, 0) != 0) goto create_failed; if (nvlist_add_uint8_array(hidden_args, "wkeydata", wkeydata, wkeylen) != 0) goto create_failed; if (nvlist_add_nvlist(zc_props, ZPOOL_HIDDEN_ARGS, hidden_args) != 0) goto create_failed; } } if (zc_props) zcmd_write_src_nvlist(hdl, &zc, zc_props); (void) strlcpy(zc.zc_name, pool, sizeof (zc.zc_name)); if ((ret = zfs_ioctl(hdl, ZFS_IOC_POOL_CREATE, &zc)) != 0) { zcmd_free_nvlists(&zc); nvlist_free(zc_props); nvlist_free(zc_fsprops); nvlist_free(hidden_args); if (wkeydata != NULL) free(wkeydata); switch (errno) { case EBUSY: /* * This can happen if the user has specified the same * device multiple times. We can't reliably detect this * until we try to add it and see we already have a * label. This can also happen under if the device is * part of an active md or lvm device. */ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "one or more vdevs refer to the same device, or " "one of\nthe devices is part of an active md or " "lvm device")); return (zfs_error(hdl, EZFS_BADDEV, errbuf)); case ERANGE: /* * This happens if the record size is smaller or larger * than the allowed size range, or not a power of 2. * * NOTE: although zfs_valid_proplist is called earlier, * this case may have slipped through since the * pool does not exist yet and it is therefore * impossible to read properties e.g. max blocksize * from the pool. */ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "record size invalid")); return (zfs_error(hdl, EZFS_BADPROP, errbuf)); case EOVERFLOW: /* * This occurs when one of the devices is below * SPA_MINDEVSIZE. Unfortunately, we can't detect which * device was the problem device since there's no * reliable way to determine device size from userland. */ { char buf[64]; zfs_nicebytes(SPA_MINDEVSIZE, buf, sizeof (buf)); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "one or more devices is less than the " "minimum size (%s)"), buf); } return (zfs_error(hdl, EZFS_BADDEV, errbuf)); case ENOSPC: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "one or more devices is out of space")); return (zfs_error(hdl, EZFS_BADDEV, errbuf)); case EINVAL: if (zpool_has_draid_vdev(nvroot) && zfeature_lookup_name("draid", NULL) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "dRAID vdevs are unsupported by the " "kernel")); return (zfs_error(hdl, EZFS_BADDEV, errbuf)); } else { return (zpool_standard_error(hdl, errno, errbuf)); } default: return (zpool_standard_error(hdl, errno, errbuf)); } } create_failed: zcmd_free_nvlists(&zc); nvlist_free(zc_props); nvlist_free(zc_fsprops); nvlist_free(hidden_args); if (wkeydata != NULL) free(wkeydata); return (ret); } /* * Destroy the given pool. It is up to the caller to ensure that there are no * datasets left in the pool. */ int zpool_destroy(zpool_handle_t *zhp, const char *log_str) { zfs_cmd_t zc = {"\0"}; zfs_handle_t *zfp = NULL; libzfs_handle_t *hdl = zhp->zpool_hdl; char errbuf[ERRBUFLEN]; if (zhp->zpool_state == POOL_STATE_ACTIVE && (zfp = zfs_open(hdl, zhp->zpool_name, ZFS_TYPE_FILESYSTEM)) == NULL) return (-1); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); zc.zc_history = (uint64_t)(uintptr_t)log_str; if (zfs_ioctl(hdl, ZFS_IOC_POOL_DESTROY, &zc) != 0) { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot destroy '%s'"), zhp->zpool_name); if (errno == EROFS) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "one or more devices is read only")); (void) zfs_error(hdl, EZFS_BADDEV, errbuf); } else { (void) zpool_standard_error(hdl, errno, errbuf); } if (zfp) zfs_close(zfp); return (-1); } if (zfp) { remove_mountpoint(zfp); zfs_close(zfp); } return (0); } /* * Create a checkpoint in the given pool. */ int zpool_checkpoint(zpool_handle_t *zhp) { libzfs_handle_t *hdl = zhp->zpool_hdl; char errbuf[ERRBUFLEN]; int error; error = lzc_pool_checkpoint(zhp->zpool_name); if (error != 0) { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot checkpoint '%s'"), zhp->zpool_name); (void) zpool_standard_error(hdl, error, errbuf); return (-1); } return (0); } /* * Discard the checkpoint from the given pool. */ int zpool_discard_checkpoint(zpool_handle_t *zhp) { libzfs_handle_t *hdl = zhp->zpool_hdl; char errbuf[ERRBUFLEN]; int error; error = lzc_pool_checkpoint_discard(zhp->zpool_name); if (error != 0) { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot discard checkpoint in '%s'"), zhp->zpool_name); (void) zpool_standard_error(hdl, error, errbuf); return (-1); } return (0); } /* * Add the given vdevs to the pool. The caller must have already performed the * necessary verification to ensure that the vdev specification is well-formed. */ int zpool_add(zpool_handle_t *zhp, nvlist_t *nvroot) { zfs_cmd_t zc = {"\0"}; int ret; libzfs_handle_t *hdl = zhp->zpool_hdl; char errbuf[ERRBUFLEN]; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot add to '%s'"), zhp->zpool_name); if (zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL) < SPA_VERSION_SPARES && nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be " "upgraded to add hot spares")); return (zfs_error(hdl, EZFS_BADVERSION, errbuf)); } if (zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL) < SPA_VERSION_L2CACHE && nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be " "upgraded to add cache devices")); return (zfs_error(hdl, EZFS_BADVERSION, errbuf)); } zcmd_write_conf_nvlist(hdl, &zc, nvroot); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); if (zfs_ioctl(hdl, ZFS_IOC_VDEV_ADD, &zc) != 0) { switch (errno) { case EBUSY: /* * This can happen if the user has specified the same * device multiple times. We can't reliably detect this * until we try to add it and see we already have a * label. */ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "one or more vdevs refer to the same device")); (void) zfs_error(hdl, EZFS_BADDEV, errbuf); break; case EINVAL: if (zpool_has_draid_vdev(nvroot) && zfeature_lookup_name("draid", NULL) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "dRAID vdevs are unsupported by the " "kernel")); } else { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid config; a pool with removing/" "removed vdevs does not support adding " "raidz or dRAID vdevs")); } (void) zfs_error(hdl, EZFS_BADDEV, errbuf); break; case EOVERFLOW: /* * This occurs when one of the devices is below * SPA_MINDEVSIZE. Unfortunately, we can't detect which * device was the problem device since there's no * reliable way to determine device size from userland. */ { char buf[64]; zfs_nicebytes(SPA_MINDEVSIZE, buf, sizeof (buf)); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "device is less than the minimum " "size (%s)"), buf); } (void) zfs_error(hdl, EZFS_BADDEV, errbuf); break; case ENOTSUP: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be upgraded to add these vdevs")); (void) zfs_error(hdl, EZFS_BADVERSION, errbuf); break; default: (void) zpool_standard_error(hdl, errno, errbuf); } ret = -1; } else { ret = 0; } zcmd_free_nvlists(&zc); return (ret); } /* * Exports the pool from the system. The caller must ensure that there are no * mounted datasets in the pool. */ static int zpool_export_common(zpool_handle_t *zhp, boolean_t force, boolean_t hardforce, const char *log_str) { zfs_cmd_t zc = {"\0"}; (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); zc.zc_cookie = force; zc.zc_guid = hardforce; zc.zc_history = (uint64_t)(uintptr_t)log_str; if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_POOL_EXPORT, &zc) != 0) { switch (errno) { case EXDEV: zfs_error_aux(zhp->zpool_hdl, dgettext(TEXT_DOMAIN, "use '-f' to override the following errors:\n" "'%s' has an active shared spare which could be" " used by other pools once '%s' is exported."), zhp->zpool_name, zhp->zpool_name); return (zfs_error_fmt(zhp->zpool_hdl, EZFS_ACTIVE_SPARE, dgettext(TEXT_DOMAIN, "cannot export '%s'"), zhp->zpool_name)); default: return (zpool_standard_error_fmt(zhp->zpool_hdl, errno, dgettext(TEXT_DOMAIN, "cannot export '%s'"), zhp->zpool_name)); } } return (0); } int zpool_export(zpool_handle_t *zhp, boolean_t force, const char *log_str) { return (zpool_export_common(zhp, force, B_FALSE, log_str)); } int zpool_export_force(zpool_handle_t *zhp, const char *log_str) { return (zpool_export_common(zhp, B_TRUE, B_TRUE, log_str)); } static void zpool_rewind_exclaim(libzfs_handle_t *hdl, const char *name, boolean_t dryrun, nvlist_t *config) { nvlist_t *nv = NULL; uint64_t rewindto; int64_t loss = -1; struct tm t; char timestr[128]; if (!hdl->libzfs_printerr || config == NULL) return; if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, &nv) != 0 || nvlist_lookup_nvlist(nv, ZPOOL_CONFIG_REWIND_INFO, &nv) != 0) { return; } if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_LOAD_TIME, &rewindto) != 0) return; (void) nvlist_lookup_int64(nv, ZPOOL_CONFIG_REWIND_TIME, &loss); if (localtime_r((time_t *)&rewindto, &t) != NULL && strftime(timestr, 128, "%c", &t) != 0) { if (dryrun) { (void) printf(dgettext(TEXT_DOMAIN, "Would be able to return %s " "to its state as of %s.\n"), name, timestr); } else { (void) printf(dgettext(TEXT_DOMAIN, "Pool %s returned to its state as of %s.\n"), name, timestr); } if (loss > 120) { (void) printf(dgettext(TEXT_DOMAIN, "%s approximately %lld "), dryrun ? "Would discard" : "Discarded", ((longlong_t)loss + 30) / 60); (void) printf(dgettext(TEXT_DOMAIN, "minutes of transactions.\n")); } else if (loss > 0) { (void) printf(dgettext(TEXT_DOMAIN, "%s approximately %lld "), dryrun ? "Would discard" : "Discarded", (longlong_t)loss); (void) printf(dgettext(TEXT_DOMAIN, "seconds of transactions.\n")); } } } void zpool_explain_recover(libzfs_handle_t *hdl, const char *name, int reason, nvlist_t *config) { nvlist_t *nv = NULL; int64_t loss = -1; uint64_t edata = UINT64_MAX; uint64_t rewindto; struct tm t; char timestr[128]; if (!hdl->libzfs_printerr) return; if (reason >= 0) (void) printf(dgettext(TEXT_DOMAIN, "action: ")); else (void) printf(dgettext(TEXT_DOMAIN, "\t")); /* All attempted rewinds failed if ZPOOL_CONFIG_LOAD_TIME missing */ if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, &nv) != 0 || nvlist_lookup_nvlist(nv, ZPOOL_CONFIG_REWIND_INFO, &nv) != 0 || nvlist_lookup_uint64(nv, ZPOOL_CONFIG_LOAD_TIME, &rewindto) != 0) goto no_info; (void) nvlist_lookup_int64(nv, ZPOOL_CONFIG_REWIND_TIME, &loss); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_LOAD_DATA_ERRORS, &edata); (void) printf(dgettext(TEXT_DOMAIN, "Recovery is possible, but will result in some data loss.\n")); if (localtime_r((time_t *)&rewindto, &t) != NULL && strftime(timestr, 128, "%c", &t) != 0) { (void) printf(dgettext(TEXT_DOMAIN, "\tReturning the pool to its state as of %s\n" "\tshould correct the problem. "), timestr); } else { (void) printf(dgettext(TEXT_DOMAIN, "\tReverting the pool to an earlier state " "should correct the problem.\n\t")); } if (loss > 120) { (void) printf(dgettext(TEXT_DOMAIN, "Approximately %lld minutes of data\n" "\tmust be discarded, irreversibly. "), ((longlong_t)loss + 30) / 60); } else if (loss > 0) { (void) printf(dgettext(TEXT_DOMAIN, "Approximately %lld seconds of data\n" "\tmust be discarded, irreversibly. "), (longlong_t)loss); } if (edata != 0 && edata != UINT64_MAX) { if (edata == 1) { (void) printf(dgettext(TEXT_DOMAIN, "After rewind, at least\n" "\tone persistent user-data error will remain. ")); } else { (void) printf(dgettext(TEXT_DOMAIN, "After rewind, several\n" "\tpersistent user-data errors will remain. ")); } } (void) printf(dgettext(TEXT_DOMAIN, "Recovery can be attempted\n\tby executing 'zpool %s -F %s'. "), reason >= 0 ? "clear" : "import", name); (void) printf(dgettext(TEXT_DOMAIN, "A scrub of the pool\n" "\tis strongly recommended after recovery.\n")); return; no_info: (void) printf(dgettext(TEXT_DOMAIN, "Destroy and re-create the pool from\n\ta backup source.\n")); } /* * zpool_import() is a contracted interface. Should be kept the same * if possible. * * Applications should use zpool_import_props() to import a pool with * new properties value to be set. */ int zpool_import(libzfs_handle_t *hdl, nvlist_t *config, const char *newname, char *altroot) { nvlist_t *props = NULL; int ret; if (altroot != NULL) { if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) { return (zfs_error_fmt(hdl, EZFS_NOMEM, dgettext(TEXT_DOMAIN, "cannot import '%s'"), newname)); } if (nvlist_add_string(props, zpool_prop_to_name(ZPOOL_PROP_ALTROOT), altroot) != 0 || nvlist_add_string(props, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), "none") != 0) { nvlist_free(props); return (zfs_error_fmt(hdl, EZFS_NOMEM, dgettext(TEXT_DOMAIN, "cannot import '%s'"), newname)); } } ret = zpool_import_props(hdl, config, newname, props, ZFS_IMPORT_NORMAL); nvlist_free(props); return (ret); } static void print_vdev_tree(libzfs_handle_t *hdl, const char *name, nvlist_t *nv, int indent) { nvlist_t **child; uint_t c, children; char *vname; uint64_t is_log = 0; (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &is_log); if (name != NULL) (void) printf("\t%*s%s%s\n", indent, "", name, is_log ? " [log]" : ""); if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) return; for (c = 0; c < children; c++) { vname = zpool_vdev_name(hdl, NULL, child[c], VDEV_NAME_TYPE_ID); print_vdev_tree(hdl, vname, child[c], indent + 2); free(vname); } } void zpool_print_unsup_feat(nvlist_t *config) { nvlist_t *nvinfo, *unsup_feat; nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO); unsup_feat = fnvlist_lookup_nvlist(nvinfo, ZPOOL_CONFIG_UNSUP_FEAT); for (nvpair_t *nvp = nvlist_next_nvpair(unsup_feat, NULL); nvp != NULL; nvp = nvlist_next_nvpair(unsup_feat, nvp)) { const char *desc = fnvpair_value_string(nvp); if (strlen(desc) > 0) (void) printf("\t%s (%s)\n", nvpair_name(nvp), desc); else (void) printf("\t%s\n", nvpair_name(nvp)); } } /* * Import the given pool using the known configuration and a list of * properties to be set. The configuration should have come from * zpool_find_import(). The 'newname' parameters control whether the pool * is imported with a different name. */ int zpool_import_props(libzfs_handle_t *hdl, nvlist_t *config, const char *newname, nvlist_t *props, int flags) { zfs_cmd_t zc = {"\0"}; zpool_load_policy_t policy; nvlist_t *nv = NULL; nvlist_t *nvinfo = NULL; nvlist_t *missing = NULL; const char *thename; const char *origname; int ret; int error = 0; char errbuf[ERRBUFLEN]; origname = fnvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME); (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot import pool '%s'"), origname); if (newname != NULL) { if (!zpool_name_valid(hdl, B_FALSE, newname)) return (zfs_error_fmt(hdl, EZFS_INVALIDNAME, dgettext(TEXT_DOMAIN, "cannot import '%s'"), newname)); thename = newname; } else { thename = origname; } if (props != NULL) { uint64_t version; prop_flags_t flags = { .create = B_FALSE, .import = B_TRUE }; version = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION); if ((props = zpool_valid_proplist(hdl, origname, props, version, flags, errbuf)) == NULL) return (-1); zcmd_write_src_nvlist(hdl, &zc, props); nvlist_free(props); } (void) strlcpy(zc.zc_name, thename, sizeof (zc.zc_name)); zc.zc_guid = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID); zcmd_write_conf_nvlist(hdl, &zc, config); zcmd_alloc_dst_nvlist(hdl, &zc, zc.zc_nvlist_conf_size * 2); zc.zc_cookie = flags; while ((ret = zfs_ioctl(hdl, ZFS_IOC_POOL_IMPORT, &zc)) != 0 && errno == ENOMEM) zcmd_expand_dst_nvlist(hdl, &zc); if (ret != 0) error = errno; (void) zcmd_read_dst_nvlist(hdl, &zc, &nv); zcmd_free_nvlists(&zc); zpool_get_load_policy(config, &policy); if (error) { char desc[1024]; char aux[256]; /* * Dry-run failed, but we print out what success * looks like if we found a best txg */ if (policy.zlp_rewind & ZPOOL_TRY_REWIND) { zpool_rewind_exclaim(hdl, newname ? origname : thename, B_TRUE, nv); nvlist_free(nv); return (-1); } if (newname == NULL) (void) snprintf(desc, sizeof (desc), dgettext(TEXT_DOMAIN, "cannot import '%s'"), thename); else (void) snprintf(desc, sizeof (desc), dgettext(TEXT_DOMAIN, "cannot import '%s' as '%s'"), origname, thename); switch (error) { case ENOTSUP: if (nv != NULL && nvlist_lookup_nvlist(nv, ZPOOL_CONFIG_LOAD_INFO, &nvinfo) == 0 && nvlist_exists(nvinfo, ZPOOL_CONFIG_UNSUP_FEAT)) { (void) printf(dgettext(TEXT_DOMAIN, "This " "pool uses the following feature(s) not " "supported by this system:\n")); zpool_print_unsup_feat(nv); if (nvlist_exists(nvinfo, ZPOOL_CONFIG_CAN_RDONLY)) { (void) printf(dgettext(TEXT_DOMAIN, "All unsupported features are only " "required for writing to the pool." "\nThe pool can be imported using " "'-o readonly=on'.\n")); } } /* * Unsupported version. */ (void) zfs_error(hdl, EZFS_BADVERSION, desc); break; case EREMOTEIO: if (nv != NULL && nvlist_lookup_nvlist(nv, ZPOOL_CONFIG_LOAD_INFO, &nvinfo) == 0) { const char *hostname = ""; uint64_t hostid = 0; mmp_state_t mmp_state; mmp_state = fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_STATE); if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_HOSTNAME)) hostname = fnvlist_lookup_string(nvinfo, ZPOOL_CONFIG_MMP_HOSTNAME); if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_HOSTID)) hostid = fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_HOSTID); if (mmp_state == MMP_STATE_ACTIVE) { (void) snprintf(aux, sizeof (aux), dgettext(TEXT_DOMAIN, "pool is imp" "orted on host '%s' (hostid=%lx).\n" "Export the pool on the other " "system, then run 'zpool import'."), hostname, (unsigned long) hostid); } else if (mmp_state == MMP_STATE_NO_HOSTID) { (void) snprintf(aux, sizeof (aux), dgettext(TEXT_DOMAIN, "pool has " "the multihost property on and " "the\nsystem's hostid is not set. " "Set a unique system hostid with " "the zgenhostid(8) command.\n")); } (void) zfs_error_aux(hdl, "%s", aux); } (void) zfs_error(hdl, EZFS_ACTIVE_POOL, desc); break; case EINVAL: (void) zfs_error(hdl, EZFS_INVALCONFIG, desc); break; case EROFS: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "one or more devices is read only")); (void) zfs_error(hdl, EZFS_BADDEV, desc); break; case ENXIO: if (nv && nvlist_lookup_nvlist(nv, ZPOOL_CONFIG_LOAD_INFO, &nvinfo) == 0 && nvlist_lookup_nvlist(nvinfo, ZPOOL_CONFIG_MISSING_DEVICES, &missing) == 0) { (void) printf(dgettext(TEXT_DOMAIN, "The devices below are missing or " "corrupted, use '-m' to import the pool " "anyway:\n")); print_vdev_tree(hdl, NULL, missing, 2); (void) printf("\n"); } (void) zpool_standard_error(hdl, error, desc); break; case EEXIST: (void) zpool_standard_error(hdl, error, desc); break; case EBUSY: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "one or more devices are already in use\n")); (void) zfs_error(hdl, EZFS_BADDEV, desc); break; case ENAMETOOLONG: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "new name of at least one dataset is longer than " "the maximum allowable length")); (void) zfs_error(hdl, EZFS_NAMETOOLONG, desc); break; default: (void) zpool_standard_error(hdl, error, desc); zpool_explain_recover(hdl, newname ? origname : thename, -error, nv); break; } nvlist_free(nv); ret = -1; } else { zpool_handle_t *zhp; /* * This should never fail, but play it safe anyway. */ if (zpool_open_silent(hdl, thename, &zhp) != 0) ret = -1; else if (zhp != NULL) zpool_close(zhp); if (policy.zlp_rewind & (ZPOOL_DO_REWIND | ZPOOL_TRY_REWIND)) { zpool_rewind_exclaim(hdl, newname ? origname : thename, ((policy.zlp_rewind & ZPOOL_TRY_REWIND) != 0), nv); } nvlist_free(nv); } return (ret); } /* * Translate vdev names to guids. If a vdev_path is determined to be * unsuitable then a vd_errlist is allocated and the vdev path and errno * are added to it. */ static int zpool_translate_vdev_guids(zpool_handle_t *zhp, nvlist_t *vds, nvlist_t *vdev_guids, nvlist_t *guids_to_paths, nvlist_t **vd_errlist) { nvlist_t *errlist = NULL; int error = 0; for (nvpair_t *elem = nvlist_next_nvpair(vds, NULL); elem != NULL; elem = nvlist_next_nvpair(vds, elem)) { boolean_t spare, cache; const char *vd_path = nvpair_name(elem); nvlist_t *tgt = zpool_find_vdev(zhp, vd_path, &spare, &cache, NULL); if ((tgt == NULL) || cache || spare) { if (errlist == NULL) { errlist = fnvlist_alloc(); error = EINVAL; } uint64_t err = (tgt == NULL) ? EZFS_NODEVICE : (spare ? EZFS_ISSPARE : EZFS_ISL2CACHE); fnvlist_add_int64(errlist, vd_path, err); continue; } uint64_t guid = fnvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID); fnvlist_add_uint64(vdev_guids, vd_path, guid); char msg[MAXNAMELEN]; (void) snprintf(msg, sizeof (msg), "%llu", (u_longlong_t)guid); fnvlist_add_string(guids_to_paths, msg, vd_path); } if (error != 0) { verify(errlist != NULL); if (vd_errlist != NULL) *vd_errlist = errlist; else fnvlist_free(errlist); } return (error); } static int xlate_init_err(int err) { switch (err) { case ENODEV: return (EZFS_NODEVICE); case EINVAL: case EROFS: return (EZFS_BADDEV); case EBUSY: return (EZFS_INITIALIZING); case ESRCH: return (EZFS_NO_INITIALIZE); } return (err); } /* * Begin, suspend, cancel, or uninit (clear) the initialization (initializing * of all free blocks) for the given vdevs in the given pool. */ static int zpool_initialize_impl(zpool_handle_t *zhp, pool_initialize_func_t cmd_type, nvlist_t *vds, boolean_t wait) { int err; nvlist_t *vdev_guids = fnvlist_alloc(); nvlist_t *guids_to_paths = fnvlist_alloc(); nvlist_t *vd_errlist = NULL; nvlist_t *errlist; nvpair_t *elem; err = zpool_translate_vdev_guids(zhp, vds, vdev_guids, guids_to_paths, &vd_errlist); if (err != 0) { verify(vd_errlist != NULL); goto list_errors; } err = lzc_initialize(zhp->zpool_name, cmd_type, vdev_guids, &errlist); if (err != 0) { if (errlist != NULL && nvlist_lookup_nvlist(errlist, ZPOOL_INITIALIZE_VDEVS, &vd_errlist) == 0) { goto list_errors; } if (err == EINVAL && cmd_type == POOL_INITIALIZE_UNINIT) { zfs_error_aux(zhp->zpool_hdl, dgettext(TEXT_DOMAIN, "uninitialize is not supported by kernel")); } (void) zpool_standard_error(zhp->zpool_hdl, err, dgettext(TEXT_DOMAIN, "operation failed")); goto out; } if (wait) { for (elem = nvlist_next_nvpair(vdev_guids, NULL); elem != NULL; elem = nvlist_next_nvpair(vdev_guids, elem)) { uint64_t guid = fnvpair_value_uint64(elem); err = lzc_wait_tag(zhp->zpool_name, ZPOOL_WAIT_INITIALIZE, guid, NULL); if (err != 0) { (void) zpool_standard_error_fmt(zhp->zpool_hdl, err, dgettext(TEXT_DOMAIN, "error " "waiting for '%s' to initialize"), nvpair_name(elem)); goto out; } } } goto out; list_errors: for (elem = nvlist_next_nvpair(vd_errlist, NULL); elem != NULL; elem = nvlist_next_nvpair(vd_errlist, elem)) { int64_t vd_error = xlate_init_err(fnvpair_value_int64(elem)); const char *path; if (nvlist_lookup_string(guids_to_paths, nvpair_name(elem), &path) != 0) path = nvpair_name(elem); (void) zfs_error_fmt(zhp->zpool_hdl, vd_error, "cannot initialize '%s'", path); } out: fnvlist_free(vdev_guids); fnvlist_free(guids_to_paths); if (vd_errlist != NULL) fnvlist_free(vd_errlist); return (err == 0 ? 0 : -1); } int zpool_initialize(zpool_handle_t *zhp, pool_initialize_func_t cmd_type, nvlist_t *vds) { return (zpool_initialize_impl(zhp, cmd_type, vds, B_FALSE)); } int zpool_initialize_wait(zpool_handle_t *zhp, pool_initialize_func_t cmd_type, nvlist_t *vds) { return (zpool_initialize_impl(zhp, cmd_type, vds, B_TRUE)); } static int xlate_trim_err(int err) { switch (err) { case ENODEV: return (EZFS_NODEVICE); case EINVAL: case EROFS: return (EZFS_BADDEV); case EBUSY: return (EZFS_TRIMMING); case ESRCH: return (EZFS_NO_TRIM); case EOPNOTSUPP: return (EZFS_TRIM_NOTSUP); } return (err); } static int zpool_trim_wait(zpool_handle_t *zhp, nvlist_t *vdev_guids) { int err; nvpair_t *elem; for (elem = nvlist_next_nvpair(vdev_guids, NULL); elem != NULL; elem = nvlist_next_nvpair(vdev_guids, elem)) { uint64_t guid = fnvpair_value_uint64(elem); err = lzc_wait_tag(zhp->zpool_name, ZPOOL_WAIT_TRIM, guid, NULL); if (err != 0) { (void) zpool_standard_error_fmt(zhp->zpool_hdl, err, dgettext(TEXT_DOMAIN, "error " "waiting to trim '%s'"), nvpair_name(elem)); return (err); } } return (0); } /* * Check errlist and report any errors, omitting ones which should be * suppressed. Returns B_TRUE if any errors were reported. */ static boolean_t check_trim_errs(zpool_handle_t *zhp, trimflags_t *trim_flags, nvlist_t *guids_to_paths, nvlist_t *vds, nvlist_t *errlist) { nvpair_t *elem; boolean_t reported_errs = B_FALSE; int num_vds = 0; int num_suppressed_errs = 0; for (elem = nvlist_next_nvpair(vds, NULL); elem != NULL; elem = nvlist_next_nvpair(vds, elem)) { num_vds++; } for (elem = nvlist_next_nvpair(errlist, NULL); elem != NULL; elem = nvlist_next_nvpair(errlist, elem)) { int64_t vd_error = xlate_trim_err(fnvpair_value_int64(elem)); const char *path; /* * If only the pool was specified, and it was not a secure * trim then suppress warnings for individual vdevs which * do not support trimming. */ if (vd_error == EZFS_TRIM_NOTSUP && trim_flags->fullpool && !trim_flags->secure) { num_suppressed_errs++; continue; } reported_errs = B_TRUE; if (nvlist_lookup_string(guids_to_paths, nvpair_name(elem), &path) != 0) path = nvpair_name(elem); (void) zfs_error_fmt(zhp->zpool_hdl, vd_error, "cannot trim '%s'", path); } if (num_suppressed_errs == num_vds) { (void) zfs_error_aux(zhp->zpool_hdl, dgettext(TEXT_DOMAIN, "no devices in pool support trim operations")); (void) (zfs_error(zhp->zpool_hdl, EZFS_TRIM_NOTSUP, dgettext(TEXT_DOMAIN, "cannot trim"))); reported_errs = B_TRUE; } return (reported_errs); } /* * Begin, suspend, or cancel the TRIM (discarding of all free blocks) for * the given vdevs in the given pool. */ int zpool_trim(zpool_handle_t *zhp, pool_trim_func_t cmd_type, nvlist_t *vds, trimflags_t *trim_flags) { int err; int retval = 0; nvlist_t *vdev_guids = fnvlist_alloc(); nvlist_t *guids_to_paths = fnvlist_alloc(); nvlist_t *errlist = NULL; err = zpool_translate_vdev_guids(zhp, vds, vdev_guids, guids_to_paths, &errlist); if (err != 0) { check_trim_errs(zhp, trim_flags, guids_to_paths, vds, errlist); retval = -1; goto out; } err = lzc_trim(zhp->zpool_name, cmd_type, trim_flags->rate, trim_flags->secure, vdev_guids, &errlist); if (err != 0) { nvlist_t *vd_errlist; if (errlist != NULL && nvlist_lookup_nvlist(errlist, ZPOOL_TRIM_VDEVS, &vd_errlist) == 0) { if (check_trim_errs(zhp, trim_flags, guids_to_paths, vds, vd_errlist)) { retval = -1; goto out; } } else { char errbuf[ERRBUFLEN]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "operation failed")); zpool_standard_error(zhp->zpool_hdl, err, errbuf); retval = -1; goto out; } } if (trim_flags->wait) retval = zpool_trim_wait(zhp, vdev_guids); out: if (errlist != NULL) fnvlist_free(errlist); fnvlist_free(vdev_guids); fnvlist_free(guids_to_paths); return (retval); } /* * Scan the pool. */ int zpool_scan(zpool_handle_t *zhp, pool_scan_func_t func, pool_scrub_cmd_t cmd) { char errbuf[ERRBUFLEN]; int err; libzfs_handle_t *hdl = zhp->zpool_hdl; nvlist_t *args = fnvlist_alloc(); fnvlist_add_uint64(args, "scan_type", (uint64_t)func); fnvlist_add_uint64(args, "scan_command", (uint64_t)cmd); err = lzc_scrub(ZFS_IOC_POOL_SCRUB, zhp->zpool_name, args, NULL); fnvlist_free(args); if (err == 0) { return (0); } else if (err == ZFS_ERR_IOC_CMD_UNAVAIL) { zfs_cmd_t zc = {"\0"}; (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); zc.zc_cookie = func; zc.zc_flags = cmd; if (zfs_ioctl(hdl, ZFS_IOC_POOL_SCAN, &zc) == 0) return (0); } /* * An ECANCELED on a scrub means one of the following: * 1. we resumed a paused scrub. * 2. we resumed a paused error scrub. * 3. Error scrub is not run because of no error log. */ if (err == ECANCELED && (func == POOL_SCAN_SCRUB || func == POOL_SCAN_ERRORSCRUB) && cmd == POOL_SCRUB_NORMAL) return (0); /* * The following cases have been handled here: * 1. Paused a scrub/error scrub if there is none in progress. */ if (err == ENOENT && func != POOL_SCAN_NONE && cmd == POOL_SCRUB_PAUSE) { return (0); } ASSERT3U(func, >=, POOL_SCAN_NONE); ASSERT3U(func, <, POOL_SCAN_FUNCS); if (func == POOL_SCAN_SCRUB || func == POOL_SCAN_ERRORSCRUB) { if (cmd == POOL_SCRUB_PAUSE) { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot pause scrubbing %s"), zhp->zpool_name); } else { assert(cmd == POOL_SCRUB_NORMAL); (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot scrub %s"), zhp->zpool_name); } } else if (func == POOL_SCAN_RESILVER) { assert(cmd == POOL_SCRUB_NORMAL); (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot restart resilver on %s"), zhp->zpool_name); } else if (func == POOL_SCAN_NONE) { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot cancel scrubbing %s"), zhp->zpool_name); } else { assert(!"unexpected result"); } /* * With EBUSY, five cases are possible: * * Current state Requested * 1. Normal Scrub Running Normal Scrub or Error Scrub * 2. Normal Scrub Paused Error Scrub * 3. Normal Scrub Paused Pause Normal Scrub * 4. Error Scrub Running Normal Scrub or Error Scrub * 5. Error Scrub Paused Pause Error Scrub * 6. Resilvering Anything else */ if (err == EBUSY) { nvlist_t *nvroot; pool_scan_stat_t *ps = NULL; uint_t psc; nvroot = fnvlist_lookup_nvlist(zhp->zpool_config, ZPOOL_CONFIG_VDEV_TREE); (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &psc); if (ps && ps->pss_func == POOL_SCAN_SCRUB && ps->pss_state == DSS_SCANNING) { if (ps->pss_pass_scrub_pause == 0) { /* handles case 1 */ assert(cmd == POOL_SCRUB_NORMAL); return (zfs_error(hdl, EZFS_SCRUBBING, errbuf)); } else { if (func == POOL_SCAN_ERRORSCRUB) { /* handles case 2 */ ASSERT3U(cmd, ==, POOL_SCRUB_NORMAL); return (zfs_error(hdl, EZFS_SCRUB_PAUSED_TO_CANCEL, errbuf)); } else { /* handles case 3 */ ASSERT3U(func, ==, POOL_SCAN_SCRUB); ASSERT3U(cmd, ==, POOL_SCRUB_PAUSE); return (zfs_error(hdl, EZFS_SCRUB_PAUSED, errbuf)); } } } else if (ps && ps->pss_error_scrub_func == POOL_SCAN_ERRORSCRUB && ps->pss_error_scrub_state == DSS_ERRORSCRUBBING) { if (ps->pss_pass_error_scrub_pause == 0) { /* handles case 4 */ ASSERT3U(cmd, ==, POOL_SCRUB_NORMAL); return (zfs_error(hdl, EZFS_ERRORSCRUBBING, errbuf)); } else { /* handles case 5 */ ASSERT3U(func, ==, POOL_SCAN_ERRORSCRUB); ASSERT3U(cmd, ==, POOL_SCRUB_PAUSE); return (zfs_error(hdl, EZFS_ERRORSCRUB_PAUSED, errbuf)); } } else { /* handles case 6 */ return (zfs_error(hdl, EZFS_RESILVERING, errbuf)); } } else if (err == ENOENT) { return (zfs_error(hdl, EZFS_NO_SCRUB, errbuf)); } else if (err == ENOTSUP && func == POOL_SCAN_RESILVER) { return (zfs_error(hdl, EZFS_NO_RESILVER_DEFER, errbuf)); } else { return (zpool_standard_error(hdl, err, errbuf)); } } /* * Find a vdev that matches the search criteria specified. We use the * the nvpair name to determine how we should look for the device. * 'avail_spare' is set to TRUE if the provided guid refers to an AVAIL * spare; but FALSE if its an INUSE spare. */ static nvlist_t * vdev_to_nvlist_iter(nvlist_t *nv, nvlist_t *search, boolean_t *avail_spare, boolean_t *l2cache, boolean_t *log) { uint_t c, children; nvlist_t **child; nvlist_t *ret; uint64_t is_log; const char *srchkey; nvpair_t *pair = nvlist_next_nvpair(search, NULL); /* Nothing to look for */ if (search == NULL || pair == NULL) return (NULL); /* Obtain the key we will use to search */ srchkey = nvpair_name(pair); switch (nvpair_type(pair)) { case DATA_TYPE_UINT64: if (strcmp(srchkey, ZPOOL_CONFIG_GUID) == 0) { uint64_t srchval = fnvpair_value_uint64(pair); uint64_t theguid = fnvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID); if (theguid == srchval) return (nv); } break; case DATA_TYPE_STRING: { const char *srchval, *val; srchval = fnvpair_value_string(pair); if (nvlist_lookup_string(nv, srchkey, &val) != 0) break; /* * Search for the requested value. Special cases: * * - ZPOOL_CONFIG_PATH for whole disk entries. These end in * "-part1", or "p1". The suffix is hidden from the user, * but included in the string, so this matches around it. * - ZPOOL_CONFIG_PATH for short names zfs_strcmp_shortname() * is used to check all possible expanded paths. * - looking for a top-level vdev name (i.e. ZPOOL_CONFIG_TYPE). * * Otherwise, all other searches are simple string compares. */ if (strcmp(srchkey, ZPOOL_CONFIG_PATH) == 0) { uint64_t wholedisk = 0; (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk); if (zfs_strcmp_pathname(srchval, val, wholedisk) == 0) return (nv); } else if (strcmp(srchkey, ZPOOL_CONFIG_TYPE) == 0) { char *type, *idx, *end, *p; uint64_t id, vdev_id; /* * Determine our vdev type, keeping in mind * that the srchval is composed of a type and * vdev id pair (i.e. mirror-4). */ if ((type = strdup(srchval)) == NULL) return (NULL); if ((p = strrchr(type, '-')) == NULL) { free(type); break; } idx = p + 1; *p = '\0'; /* * If the types don't match then keep looking. */ if (strncmp(val, type, strlen(val)) != 0) { free(type); break; } verify(zpool_vdev_is_interior(type)); id = fnvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID); errno = 0; vdev_id = strtoull(idx, &end, 10); /* * If we are looking for a raidz and a parity is * specified, make sure it matches. */ int rzlen = strlen(VDEV_TYPE_RAIDZ); assert(rzlen == strlen(VDEV_TYPE_DRAID)); int typlen = strlen(type); if ((strncmp(type, VDEV_TYPE_RAIDZ, rzlen) == 0 || strncmp(type, VDEV_TYPE_DRAID, rzlen) == 0) && typlen != rzlen) { uint64_t vdev_parity; int parity = *(type + rzlen) - '0'; if (parity <= 0 || parity > 3 || (typlen - rzlen) != 1) { /* * Nonsense parity specified, can * never match */ free(type); return (NULL); } vdev_parity = fnvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY); if ((int)vdev_parity != parity) { free(type); break; } } free(type); if (errno != 0) return (NULL); /* * Now verify that we have the correct vdev id. */ if (vdev_id == id) return (nv); } /* * Common case */ if (strcmp(srchval, val) == 0) return (nv); break; } default: break; } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) return (NULL); for (c = 0; c < children; c++) { if ((ret = vdev_to_nvlist_iter(child[c], search, avail_spare, l2cache, NULL)) != NULL) { /* * The 'is_log' value is only set for the toplevel * vdev, not the leaf vdevs. So we always lookup the * log device from the root of the vdev tree (where * 'log' is non-NULL). */ if (log != NULL && nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &is_log) == 0 && is_log) { *log = B_TRUE; } return (ret); } } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, &child, &children) == 0) { for (c = 0; c < children; c++) { if ((ret = vdev_to_nvlist_iter(child[c], search, avail_spare, l2cache, NULL)) != NULL) { *avail_spare = B_TRUE; return (ret); } } } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, &child, &children) == 0) { for (c = 0; c < children; c++) { if ((ret = vdev_to_nvlist_iter(child[c], search, avail_spare, l2cache, NULL)) != NULL) { *l2cache = B_TRUE; return (ret); } } } return (NULL); } /* * Given a physical path or guid, find the associated vdev. */ nvlist_t * zpool_find_vdev_by_physpath(zpool_handle_t *zhp, const char *ppath, boolean_t *avail_spare, boolean_t *l2cache, boolean_t *log) { nvlist_t *search, *nvroot, *ret; uint64_t guid; char *end; search = fnvlist_alloc(); guid = strtoull(ppath, &end, 0); if (guid != 0 && *end == '\0') { fnvlist_add_uint64(search, ZPOOL_CONFIG_GUID, guid); } else { fnvlist_add_string(search, ZPOOL_CONFIG_PHYS_PATH, ppath); } nvroot = fnvlist_lookup_nvlist(zhp->zpool_config, ZPOOL_CONFIG_VDEV_TREE); *avail_spare = B_FALSE; *l2cache = B_FALSE; if (log != NULL) *log = B_FALSE; ret = vdev_to_nvlist_iter(nvroot, search, avail_spare, l2cache, log); fnvlist_free(search); return (ret); } /* * Determine if we have an "interior" top-level vdev (i.e mirror/raidz). */ static boolean_t zpool_vdev_is_interior(const char *name) { if (strncmp(name, VDEV_TYPE_RAIDZ, strlen(VDEV_TYPE_RAIDZ)) == 0 || strncmp(name, VDEV_TYPE_SPARE, strlen(VDEV_TYPE_SPARE)) == 0 || strncmp(name, VDEV_TYPE_REPLACING, strlen(VDEV_TYPE_REPLACING)) == 0 || strncmp(name, VDEV_TYPE_ROOT, strlen(VDEV_TYPE_ROOT)) == 0 || strncmp(name, VDEV_TYPE_MIRROR, strlen(VDEV_TYPE_MIRROR)) == 0) return (B_TRUE); if (strncmp(name, VDEV_TYPE_DRAID, strlen(VDEV_TYPE_DRAID)) == 0 && !zpool_is_draid_spare(name)) return (B_TRUE); return (B_FALSE); } /* * Lookup the nvlist for a given vdev. */ nvlist_t * zpool_find_vdev(zpool_handle_t *zhp, const char *path, boolean_t *avail_spare, boolean_t *l2cache, boolean_t *log) { char *end; nvlist_t *nvroot, *search, *ret; uint64_t guid; boolean_t __avail_spare, __l2cache, __log; search = fnvlist_alloc(); guid = strtoull(path, &end, 0); if (guid != 0 && *end == '\0') { fnvlist_add_uint64(search, ZPOOL_CONFIG_GUID, guid); } else if (zpool_vdev_is_interior(path)) { fnvlist_add_string(search, ZPOOL_CONFIG_TYPE, path); } else { fnvlist_add_string(search, ZPOOL_CONFIG_PATH, path); } nvroot = fnvlist_lookup_nvlist(zhp->zpool_config, ZPOOL_CONFIG_VDEV_TREE); /* * User can pass NULL for avail_spare, l2cache, and log, but * we still need to provide variables to vdev_to_nvlist_iter(), so * just point them to junk variables here. */ if (!avail_spare) avail_spare = &__avail_spare; if (!l2cache) l2cache = &__l2cache; if (!log) log = &__log; *avail_spare = B_FALSE; *l2cache = B_FALSE; if (log != NULL) *log = B_FALSE; ret = vdev_to_nvlist_iter(nvroot, search, avail_spare, l2cache, log); fnvlist_free(search); return (ret); } /* * Convert a vdev path to a GUID. Returns GUID or 0 on error. * * If is_spare, is_l2cache, or is_log is non-NULL, then store within it * if the VDEV is a spare, l2cache, or log device. If they're NULL then * ignore them. */ static uint64_t zpool_vdev_path_to_guid_impl(zpool_handle_t *zhp, const char *path, boolean_t *is_spare, boolean_t *is_l2cache, boolean_t *is_log) { boolean_t spare = B_FALSE, l2cache = B_FALSE, log = B_FALSE; nvlist_t *tgt; if ((tgt = zpool_find_vdev(zhp, path, &spare, &l2cache, &log)) == NULL) return (0); if (is_spare != NULL) *is_spare = spare; if (is_l2cache != NULL) *is_l2cache = l2cache; if (is_log != NULL) *is_log = log; return (fnvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID)); } /* Convert a vdev path to a GUID. Returns GUID or 0 on error. */ uint64_t zpool_vdev_path_to_guid(zpool_handle_t *zhp, const char *path) { return (zpool_vdev_path_to_guid_impl(zhp, path, NULL, NULL, NULL)); } /* * Bring the specified vdev online. The 'flags' parameter is a set of the * ZFS_ONLINE_* flags. */ int zpool_vdev_online(zpool_handle_t *zhp, const char *path, int flags, vdev_state_t *newstate) { zfs_cmd_t zc = {"\0"}; char errbuf[ERRBUFLEN]; nvlist_t *tgt; boolean_t avail_spare, l2cache, islog; libzfs_handle_t *hdl = zhp->zpool_hdl; if (flags & ZFS_ONLINE_EXPAND) { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot expand %s"), path); } else { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot online %s"), path); } (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache, &islog)) == NULL) return (zfs_error(hdl, EZFS_NODEVICE, errbuf)); zc.zc_guid = fnvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID); if (!(flags & ZFS_ONLINE_SPARE) && avail_spare) return (zfs_error(hdl, EZFS_ISSPARE, errbuf)); #ifndef __FreeBSD__ const char *pathname; if ((flags & ZFS_ONLINE_EXPAND || zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOEXPAND, NULL)) && nvlist_lookup_string(tgt, ZPOOL_CONFIG_PATH, &pathname) == 0) { uint64_t wholedisk = 0; (void) nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk); /* * XXX - L2ARC 1.0 devices can't support expansion. */ if (l2cache) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot expand cache devices")); return (zfs_error(hdl, EZFS_VDEVNOTSUP, errbuf)); } if (wholedisk) { const char *fullpath = path; char buf[MAXPATHLEN]; int error; if (path[0] != '/') { error = zfs_resolve_shortname(path, buf, sizeof (buf)); if (error != 0) return (zfs_error(hdl, EZFS_NODEVICE, errbuf)); fullpath = buf; } error = zpool_relabel_disk(hdl, fullpath, errbuf); if (error != 0) return (error); } } #endif zc.zc_cookie = VDEV_STATE_ONLINE; zc.zc_obj = flags; if (zfs_ioctl(hdl, ZFS_IOC_VDEV_SET_STATE, &zc) != 0) { if (errno == EINVAL) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "was split " "from this pool into a new one. Use '%s' " "instead"), "zpool detach"); return (zfs_error(hdl, EZFS_POSTSPLIT_ONLINE, errbuf)); } return (zpool_standard_error(hdl, errno, errbuf)); } *newstate = zc.zc_cookie; return (0); } /* * Take the specified vdev offline */ int zpool_vdev_offline(zpool_handle_t *zhp, const char *path, boolean_t istmp) { zfs_cmd_t zc = {"\0"}; char errbuf[ERRBUFLEN]; nvlist_t *tgt; boolean_t avail_spare, l2cache; libzfs_handle_t *hdl = zhp->zpool_hdl; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot offline %s"), path); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache, NULL)) == NULL) return (zfs_error(hdl, EZFS_NODEVICE, errbuf)); zc.zc_guid = fnvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID); if (avail_spare) return (zfs_error(hdl, EZFS_ISSPARE, errbuf)); zc.zc_cookie = VDEV_STATE_OFFLINE; zc.zc_obj = istmp ? ZFS_OFFLINE_TEMPORARY : 0; if (zfs_ioctl(hdl, ZFS_IOC_VDEV_SET_STATE, &zc) == 0) return (0); switch (errno) { case EBUSY: /* * There are no other replicas of this device. */ return (zfs_error(hdl, EZFS_NOREPLICAS, errbuf)); case EEXIST: /* * The log device has unplayed logs */ return (zfs_error(hdl, EZFS_UNPLAYED_LOGS, errbuf)); default: return (zpool_standard_error(hdl, errno, errbuf)); } } /* * Remove the specified vdev asynchronously from the configuration, so * that it may come ONLINE if reinserted. This is called from zed on * Udev remove event. * Note: We also have a similar function zpool_vdev_remove() that * removes the vdev from the pool. */ int zpool_vdev_remove_wanted(zpool_handle_t *zhp, const char *path) { zfs_cmd_t zc = {"\0"}; char errbuf[ERRBUFLEN]; nvlist_t *tgt; boolean_t avail_spare, l2cache; libzfs_handle_t *hdl = zhp->zpool_hdl; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot remove %s"), path); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache, NULL)) == NULL) return (zfs_error(hdl, EZFS_NODEVICE, errbuf)); zc.zc_guid = fnvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID); zc.zc_cookie = VDEV_STATE_REMOVED; if (zfs_ioctl(hdl, ZFS_IOC_VDEV_SET_STATE, &zc) == 0) return (0); return (zpool_standard_error(hdl, errno, errbuf)); } /* * Mark the given vdev faulted. */ int zpool_vdev_fault(zpool_handle_t *zhp, uint64_t guid, vdev_aux_t aux) { zfs_cmd_t zc = {"\0"}; char errbuf[ERRBUFLEN]; libzfs_handle_t *hdl = zhp->zpool_hdl; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot fault %llu"), (u_longlong_t)guid); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); zc.zc_guid = guid; zc.zc_cookie = VDEV_STATE_FAULTED; zc.zc_obj = aux; if (zfs_ioctl(hdl, ZFS_IOC_VDEV_SET_STATE, &zc) == 0) return (0); switch (errno) { case EBUSY: /* * There are no other replicas of this device. */ return (zfs_error(hdl, EZFS_NOREPLICAS, errbuf)); default: return (zpool_standard_error(hdl, errno, errbuf)); } } /* * Generic set vdev state function */ static int zpool_vdev_set_state(zpool_handle_t *zhp, uint64_t guid, vdev_aux_t aux, vdev_state_t state) { zfs_cmd_t zc = {"\0"}; char errbuf[ERRBUFLEN]; libzfs_handle_t *hdl = zhp->zpool_hdl; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot set %s %llu"), zpool_state_to_name(state, aux), (u_longlong_t)guid); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); zc.zc_guid = guid; zc.zc_cookie = state; zc.zc_obj = aux; if (zfs_ioctl(hdl, ZFS_IOC_VDEV_SET_STATE, &zc) == 0) return (0); return (zpool_standard_error(hdl, errno, errbuf)); } /* * Mark the given vdev degraded. */ int zpool_vdev_degrade(zpool_handle_t *zhp, uint64_t guid, vdev_aux_t aux) { return (zpool_vdev_set_state(zhp, guid, aux, VDEV_STATE_DEGRADED)); } /* * Mark the given vdev as in a removed state (as if the device does not exist). * * This is different than zpool_vdev_remove() which does a removal of a device * from the pool (but the device does exist). */ int zpool_vdev_set_removed_state(zpool_handle_t *zhp, uint64_t guid, vdev_aux_t aux) { return (zpool_vdev_set_state(zhp, guid, aux, VDEV_STATE_REMOVED)); } /* * Returns TRUE if the given nvlist is a vdev that was originally swapped in as * a hot spare. */ static boolean_t is_replacing_spare(nvlist_t *search, nvlist_t *tgt, int which) { nvlist_t **child; uint_t c, children; if (nvlist_lookup_nvlist_array(search, ZPOOL_CONFIG_CHILDREN, &child, &children) == 0) { const char *type = fnvlist_lookup_string(search, ZPOOL_CONFIG_TYPE); if ((strcmp(type, VDEV_TYPE_SPARE) == 0 || strcmp(type, VDEV_TYPE_DRAID_SPARE) == 0) && children == 2 && child[which] == tgt) return (B_TRUE); for (c = 0; c < children; c++) if (is_replacing_spare(child[c], tgt, which)) return (B_TRUE); } return (B_FALSE); } /* * Attach new_disk (fully described by nvroot) to old_disk. * If 'replacing' is specified, the new disk will replace the old one. */ int zpool_vdev_attach(zpool_handle_t *zhp, const char *old_disk, const char *new_disk, nvlist_t *nvroot, int replacing, boolean_t rebuild) { zfs_cmd_t zc = {"\0"}; char errbuf[ERRBUFLEN]; int ret; nvlist_t *tgt; boolean_t avail_spare, l2cache, islog; uint64_t val; char *newname; const char *type; nvlist_t **child; uint_t children; nvlist_t *config_root; libzfs_handle_t *hdl = zhp->zpool_hdl; if (replacing) (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot replace %s with %s"), old_disk, new_disk); else (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot attach %s to %s"), new_disk, old_disk); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); if ((tgt = zpool_find_vdev(zhp, old_disk, &avail_spare, &l2cache, &islog)) == NULL) return (zfs_error(hdl, EZFS_NODEVICE, errbuf)); if (avail_spare) return (zfs_error(hdl, EZFS_ISSPARE, errbuf)); if (l2cache) return (zfs_error(hdl, EZFS_ISL2CACHE, errbuf)); zc.zc_guid = fnvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID); zc.zc_cookie = replacing; zc.zc_simple = rebuild; if (rebuild && zfeature_lookup_guid("org.openzfs:device_rebuild", NULL) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "the loaded zfs module doesn't support device rebuilds")); return (zfs_error(hdl, EZFS_POOL_NOTSUP, errbuf)); } type = fnvlist_lookup_string(tgt, ZPOOL_CONFIG_TYPE); if (strcmp(type, VDEV_TYPE_RAIDZ) == 0 && zfeature_lookup_guid("org.openzfs:raidz_expansion", NULL) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "the loaded zfs module doesn't support raidz expansion")); return (zfs_error(hdl, EZFS_POOL_NOTSUP, errbuf)); } if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0 || children != 1) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "new device must be a single disk")); return (zfs_error(hdl, EZFS_INVALCONFIG, errbuf)); } config_root = fnvlist_lookup_nvlist(zpool_get_config(zhp, NULL), ZPOOL_CONFIG_VDEV_TREE); if ((newname = zpool_vdev_name(NULL, NULL, child[0], 0)) == NULL) return (-1); /* * If the target is a hot spare that has been swapped in, we can only * replace it with another hot spare. */ if (replacing && nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_IS_SPARE, &val) == 0 && (zpool_find_vdev(zhp, newname, &avail_spare, &l2cache, NULL) == NULL || !avail_spare) && is_replacing_spare(config_root, tgt, 1)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "can only be replaced by another hot spare")); free(newname); return (zfs_error(hdl, EZFS_BADTARGET, errbuf)); } free(newname); zcmd_write_conf_nvlist(hdl, &zc, nvroot); ret = zfs_ioctl(hdl, ZFS_IOC_VDEV_ATTACH, &zc); zcmd_free_nvlists(&zc); if (ret == 0) return (0); switch (errno) { case ENOTSUP: /* * Can't attach to or replace this type of vdev. */ if (replacing) { uint64_t version = zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL); if (islog) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot replace a log with a spare")); } else if (rebuild) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "only mirror and dRAID vdevs support " "sequential reconstruction")); } else if (zpool_is_draid_spare(new_disk)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "dRAID spares can only replace child " "devices in their parent's dRAID vdev")); } else if (version >= SPA_VERSION_MULTI_REPLACE) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "already in replacing/spare config; wait " "for completion or use 'zpool detach'")); } else { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot replace a replacing device")); } } else if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "raidz_expansion feature must be enabled " "in order to attach a device to raidz")); } else { char status[64] = {0}; zpool_prop_get_feature(zhp, "feature@device_rebuild", status, 63); if (rebuild && strncmp(status, ZFS_FEATURE_DISABLED, 64) == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "device_rebuild feature must be enabled " "in order to use sequential " "reconstruction")); } else { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "can only attach to mirrors and top-level " "disks")); } } (void) zfs_error(hdl, EZFS_BADTARGET, errbuf); break; case EINVAL: /* * The new device must be a single disk. */ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "new device must be a single disk")); (void) zfs_error(hdl, EZFS_INVALCONFIG, errbuf); break; case EBUSY: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "%s is busy"), new_disk); (void) zfs_error(hdl, EZFS_BADDEV, errbuf); break; case EOVERFLOW: /* * The new device is too small. */ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "device is too small")); (void) zfs_error(hdl, EZFS_BADDEV, errbuf); break; case EDOM: /* * The new device has a different optimal sector size. */ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "new device has a different optimal sector size; use the " "option '-o ashift=N' to override the optimal size")); (void) zfs_error(hdl, EZFS_BADDEV, errbuf); break; case ENAMETOOLONG: /* * The resulting top-level vdev spec won't fit in the label. */ (void) zfs_error(hdl, EZFS_DEVOVERFLOW, errbuf); break; case ENXIO: /* * The existing raidz vdev has offline children */ if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "raidz vdev has devices that are are offline or " "being replaced")); (void) zfs_error(hdl, EZFS_BADDEV, errbuf); break; } else { (void) zpool_standard_error(hdl, errno, errbuf); } break; case EADDRINUSE: /* * The boot reserved area is already being used (FreeBSD) */ if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "the reserved boot area needed for the expansion " "is already being used by a boot loader")); (void) zfs_error(hdl, EZFS_BADDEV, errbuf); } else { (void) zpool_standard_error(hdl, errno, errbuf); } break; default: (void) zpool_standard_error(hdl, errno, errbuf); } return (-1); } /* * Detach the specified device. */ int zpool_vdev_detach(zpool_handle_t *zhp, const char *path) { zfs_cmd_t zc = {"\0"}; char errbuf[ERRBUFLEN]; nvlist_t *tgt; boolean_t avail_spare, l2cache; libzfs_handle_t *hdl = zhp->zpool_hdl; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot detach %s"), path); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache, NULL)) == NULL) return (zfs_error(hdl, EZFS_NODEVICE, errbuf)); if (avail_spare) return (zfs_error(hdl, EZFS_ISSPARE, errbuf)); if (l2cache) return (zfs_error(hdl, EZFS_ISL2CACHE, errbuf)); zc.zc_guid = fnvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID); if (zfs_ioctl(hdl, ZFS_IOC_VDEV_DETACH, &zc) == 0) return (0); switch (errno) { case ENOTSUP: /* * Can't detach from this type of vdev. */ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "only " "applicable to mirror and replacing vdevs")); (void) zfs_error(hdl, EZFS_BADTARGET, errbuf); break; case EBUSY: /* * There are no other replicas of this device. */ (void) zfs_error(hdl, EZFS_NOREPLICAS, errbuf); break; default: (void) zpool_standard_error(hdl, errno, errbuf); } return (-1); } /* * Find a mirror vdev in the source nvlist. * * The mchild array contains a list of disks in one of the top-level mirrors * of the source pool. The schild array contains a list of disks that the * user specified on the command line. We loop over the mchild array to * see if any entry in the schild array matches. * * If a disk in the mchild array is found in the schild array, we return * the index of that entry. Otherwise we return -1. */ static int find_vdev_entry(zpool_handle_t *zhp, nvlist_t **mchild, uint_t mchildren, nvlist_t **schild, uint_t schildren) { uint_t mc; for (mc = 0; mc < mchildren; mc++) { uint_t sc; char *mpath = zpool_vdev_name(zhp->zpool_hdl, zhp, mchild[mc], 0); for (sc = 0; sc < schildren; sc++) { char *spath = zpool_vdev_name(zhp->zpool_hdl, zhp, schild[sc], 0); boolean_t result = (strcmp(mpath, spath) == 0); free(spath); if (result) { free(mpath); return (mc); } } free(mpath); } return (-1); } /* * Split a mirror pool. If newroot points to null, then a new nvlist * is generated and it is the responsibility of the caller to free it. */ int zpool_vdev_split(zpool_handle_t *zhp, char *newname, nvlist_t **newroot, nvlist_t *props, splitflags_t flags) { zfs_cmd_t zc = {"\0"}; char errbuf[ERRBUFLEN]; const char *bias; nvlist_t *tree, *config, **child, **newchild, *newconfig = NULL; nvlist_t **varray = NULL, *zc_props = NULL; uint_t c, children, newchildren, lastlog = 0, vcount, found = 0; libzfs_handle_t *hdl = zhp->zpool_hdl; uint64_t vers, readonly = B_FALSE; boolean_t freelist = B_FALSE, memory_err = B_TRUE; int retval = 0; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "Unable to split %s"), zhp->zpool_name); if (!zpool_name_valid(hdl, B_FALSE, newname)) return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); if ((config = zpool_get_config(zhp, NULL)) == NULL) { (void) fprintf(stderr, gettext("Internal error: unable to " "retrieve pool configuration\n")); return (-1); } tree = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); vers = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION); if (props) { prop_flags_t flags = { .create = B_FALSE, .import = B_TRUE }; if ((zc_props = zpool_valid_proplist(hdl, zhp->zpool_name, props, vers, flags, errbuf)) == NULL) return (-1); (void) nvlist_lookup_uint64(zc_props, zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly); if (readonly) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property %s can only be set at import time"), zpool_prop_to_name(ZPOOL_PROP_READONLY)); return (-1); } } if (nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Source pool is missing vdev tree")); nvlist_free(zc_props); return (-1); } varray = zfs_alloc(hdl, children * sizeof (nvlist_t *)); vcount = 0; if (*newroot == NULL || nvlist_lookup_nvlist_array(*newroot, ZPOOL_CONFIG_CHILDREN, &newchild, &newchildren) != 0) newchildren = 0; for (c = 0; c < children; c++) { uint64_t is_log = B_FALSE, is_hole = B_FALSE; boolean_t is_special = B_FALSE, is_dedup = B_FALSE; const char *type; nvlist_t **mchild, *vdev; uint_t mchildren; int entry; /* * Unlike cache & spares, slogs are stored in the * ZPOOL_CONFIG_CHILDREN array. We filter them out here. */ (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &is_log); (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, &is_hole); if (is_log || is_hole) { /* * Create a hole vdev and put it in the config. */ if (nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) != 0) goto out; if (nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, VDEV_TYPE_HOLE) != 0) goto out; if (nvlist_add_uint64(vdev, ZPOOL_CONFIG_IS_HOLE, 1) != 0) goto out; if (lastlog == 0) lastlog = vcount; varray[vcount++] = vdev; continue; } lastlog = 0; type = fnvlist_lookup_string(child[c], ZPOOL_CONFIG_TYPE); if (strcmp(type, VDEV_TYPE_INDIRECT) == 0) { vdev = child[c]; if (nvlist_dup(vdev, &varray[vcount++], 0) != 0) goto out; continue; } else if (strcmp(type, VDEV_TYPE_MIRROR) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Source pool must be composed only of mirrors\n")); retval = zfs_error(hdl, EZFS_INVALCONFIG, errbuf); goto out; } if (nvlist_lookup_string(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS, &bias) == 0) { if (strcmp(bias, VDEV_ALLOC_BIAS_SPECIAL) == 0) is_special = B_TRUE; else if (strcmp(bias, VDEV_ALLOC_BIAS_DEDUP) == 0) is_dedup = B_TRUE; } verify(nvlist_lookup_nvlist_array(child[c], ZPOOL_CONFIG_CHILDREN, &mchild, &mchildren) == 0); /* find or add an entry for this top-level vdev */ if (newchildren > 0 && (entry = find_vdev_entry(zhp, mchild, mchildren, newchild, newchildren)) >= 0) { /* We found a disk that the user specified. */ vdev = mchild[entry]; ++found; } else { /* User didn't specify a disk for this vdev. */ vdev = mchild[mchildren - 1]; } if (nvlist_dup(vdev, &varray[vcount++], 0) != 0) goto out; if (flags.dryrun != 0) { if (is_dedup == B_TRUE) { if (nvlist_add_string(varray[vcount - 1], ZPOOL_CONFIG_ALLOCATION_BIAS, VDEV_ALLOC_BIAS_DEDUP) != 0) goto out; } else if (is_special == B_TRUE) { if (nvlist_add_string(varray[vcount - 1], ZPOOL_CONFIG_ALLOCATION_BIAS, VDEV_ALLOC_BIAS_SPECIAL) != 0) goto out; } } } /* did we find every disk the user specified? */ if (found != newchildren) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Device list must " "include at most one disk from each mirror")); retval = zfs_error(hdl, EZFS_INVALCONFIG, errbuf); goto out; } /* Prepare the nvlist for populating. */ if (*newroot == NULL) { if (nvlist_alloc(newroot, NV_UNIQUE_NAME, 0) != 0) goto out; freelist = B_TRUE; if (nvlist_add_string(*newroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) != 0) goto out; } else { verify(nvlist_remove_all(*newroot, ZPOOL_CONFIG_CHILDREN) == 0); } /* Add all the children we found */ if (nvlist_add_nvlist_array(*newroot, ZPOOL_CONFIG_CHILDREN, (const nvlist_t **)varray, lastlog == 0 ? vcount : lastlog) != 0) goto out; /* * If we're just doing a dry run, exit now with success. */ if (flags.dryrun) { memory_err = B_FALSE; freelist = B_FALSE; goto out; } /* now build up the config list & call the ioctl */ if (nvlist_alloc(&newconfig, NV_UNIQUE_NAME, 0) != 0) goto out; if (nvlist_add_nvlist(newconfig, ZPOOL_CONFIG_VDEV_TREE, *newroot) != 0 || nvlist_add_string(newconfig, ZPOOL_CONFIG_POOL_NAME, newname) != 0 || nvlist_add_uint64(newconfig, ZPOOL_CONFIG_VERSION, vers) != 0) goto out; /* * The new pool is automatically part of the namespace unless we * explicitly export it. */ if (!flags.import) zc.zc_cookie = ZPOOL_EXPORT_AFTER_SPLIT; (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); (void) strlcpy(zc.zc_string, newname, sizeof (zc.zc_string)); zcmd_write_conf_nvlist(hdl, &zc, newconfig); if (zc_props != NULL) zcmd_write_src_nvlist(hdl, &zc, zc_props); if (zfs_ioctl(hdl, ZFS_IOC_VDEV_SPLIT, &zc) != 0) { retval = zpool_standard_error(hdl, errno, errbuf); goto out; } freelist = B_FALSE; memory_err = B_FALSE; out: if (varray != NULL) { int v; for (v = 0; v < vcount; v++) nvlist_free(varray[v]); free(varray); } zcmd_free_nvlists(&zc); nvlist_free(zc_props); nvlist_free(newconfig); if (freelist) { nvlist_free(*newroot); *newroot = NULL; } if (retval != 0) return (retval); if (memory_err) return (no_memory(hdl)); return (0); } /* * Remove the given device. */ int zpool_vdev_remove(zpool_handle_t *zhp, const char *path) { zfs_cmd_t zc = {"\0"}; char errbuf[ERRBUFLEN]; nvlist_t *tgt; boolean_t avail_spare, l2cache, islog; libzfs_handle_t *hdl = zhp->zpool_hdl; uint64_t version; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot remove %s"), path); if (zpool_is_draid_spare(path)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "dRAID spares cannot be removed")); return (zfs_error(hdl, EZFS_NODEVICE, errbuf)); } (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache, &islog)) == NULL) return (zfs_error(hdl, EZFS_NODEVICE, errbuf)); version = zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL); if (islog && version < SPA_VERSION_HOLES) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be upgraded to support log removal")); return (zfs_error(hdl, EZFS_BADVERSION, errbuf)); } zc.zc_guid = fnvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID); if (zfs_ioctl(hdl, ZFS_IOC_VDEV_REMOVE, &zc) == 0) return (0); switch (errno) { case EALREADY: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "removal for this vdev is already in progress.")); (void) zfs_error(hdl, EZFS_BUSY, errbuf); break; case EINVAL: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid config; all top-level vdevs must " "have the same sector size and not be raidz.")); (void) zfs_error(hdl, EZFS_INVALCONFIG, errbuf); break; case EBUSY: if (islog) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Mount encrypted datasets to replay logs.")); } else { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Pool busy; removal may already be in progress")); } (void) zfs_error(hdl, EZFS_BUSY, errbuf); break; case EACCES: if (islog) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Mount encrypted datasets to replay logs.")); (void) zfs_error(hdl, EZFS_BUSY, errbuf); } else { (void) zpool_standard_error(hdl, errno, errbuf); } break; default: (void) zpool_standard_error(hdl, errno, errbuf); } return (-1); } int zpool_vdev_remove_cancel(zpool_handle_t *zhp) { zfs_cmd_t zc = {{0}}; char errbuf[ERRBUFLEN]; libzfs_handle_t *hdl = zhp->zpool_hdl; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot cancel removal")); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); zc.zc_cookie = 1; if (zfs_ioctl(hdl, ZFS_IOC_VDEV_REMOVE, &zc) == 0) return (0); return (zpool_standard_error(hdl, errno, errbuf)); } int zpool_vdev_indirect_size(zpool_handle_t *zhp, const char *path, uint64_t *sizep) { char errbuf[ERRBUFLEN]; nvlist_t *tgt; boolean_t avail_spare, l2cache, islog; libzfs_handle_t *hdl = zhp->zpool_hdl; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot determine indirect size of %s"), path); if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache, &islog)) == NULL) return (zfs_error(hdl, EZFS_NODEVICE, errbuf)); if (avail_spare || l2cache || islog) { *sizep = 0; return (0); } if (nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_INDIRECT_SIZE, sizep) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "indirect size not available")); return (zfs_error(hdl, EINVAL, errbuf)); } return (0); } /* * Clear the errors for the pool, or the particular device if specified. */ int zpool_clear(zpool_handle_t *zhp, const char *path, nvlist_t *rewindnvl) { zfs_cmd_t zc = {"\0"}; char errbuf[ERRBUFLEN]; nvlist_t *tgt; zpool_load_policy_t policy; boolean_t avail_spare, l2cache; libzfs_handle_t *hdl = zhp->zpool_hdl; nvlist_t *nvi = NULL; int error; if (path) (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot clear errors for %s"), path); else (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot clear errors for %s"), zhp->zpool_name); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); if (path) { if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache, NULL)) == NULL) return (zfs_error(hdl, EZFS_NODEVICE, errbuf)); /* * Don't allow error clearing for hot spares. Do allow * error clearing for l2cache devices. */ if (avail_spare) return (zfs_error(hdl, EZFS_ISSPARE, errbuf)); zc.zc_guid = fnvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID); } zpool_get_load_policy(rewindnvl, &policy); zc.zc_cookie = policy.zlp_rewind; zcmd_alloc_dst_nvlist(hdl, &zc, zhp->zpool_config_size * 2); zcmd_write_src_nvlist(hdl, &zc, rewindnvl); while ((error = zfs_ioctl(hdl, ZFS_IOC_CLEAR, &zc)) != 0 && errno == ENOMEM) zcmd_expand_dst_nvlist(hdl, &zc); if (!error || ((policy.zlp_rewind & ZPOOL_TRY_REWIND) && errno != EPERM && errno != EACCES)) { if (policy.zlp_rewind & (ZPOOL_DO_REWIND | ZPOOL_TRY_REWIND)) { (void) zcmd_read_dst_nvlist(hdl, &zc, &nvi); zpool_rewind_exclaim(hdl, zc.zc_name, ((policy.zlp_rewind & ZPOOL_TRY_REWIND) != 0), nvi); nvlist_free(nvi); } zcmd_free_nvlists(&zc); return (0); } zcmd_free_nvlists(&zc); return (zpool_standard_error(hdl, errno, errbuf)); } /* * Similar to zpool_clear(), but takes a GUID (used by fmd). */ int zpool_vdev_clear(zpool_handle_t *zhp, uint64_t guid) { zfs_cmd_t zc = {"\0"}; char errbuf[ERRBUFLEN]; libzfs_handle_t *hdl = zhp->zpool_hdl; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot clear errors for %llx"), (u_longlong_t)guid); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); zc.zc_guid = guid; zc.zc_cookie = ZPOOL_NO_REWIND; if (zfs_ioctl(hdl, ZFS_IOC_CLEAR, &zc) == 0) return (0); return (zpool_standard_error(hdl, errno, errbuf)); } /* * Change the GUID for a pool. */ int zpool_reguid(zpool_handle_t *zhp) { char errbuf[ERRBUFLEN]; libzfs_handle_t *hdl = zhp->zpool_hdl; zfs_cmd_t zc = {"\0"}; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot reguid '%s'"), zhp->zpool_name); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); if (zfs_ioctl(hdl, ZFS_IOC_POOL_REGUID, &zc) == 0) return (0); return (zpool_standard_error(hdl, errno, errbuf)); } /* * Reopen the pool. */ int zpool_reopen_one(zpool_handle_t *zhp, void *data) { libzfs_handle_t *hdl = zpool_get_handle(zhp); const char *pool_name = zpool_get_name(zhp); boolean_t *scrub_restart = data; int error; error = lzc_reopen(pool_name, *scrub_restart); if (error) { return (zpool_standard_error_fmt(hdl, error, dgettext(TEXT_DOMAIN, "cannot reopen '%s'"), pool_name)); } return (0); } /* call into libzfs_core to execute the sync IOCTL per pool */ int zpool_sync_one(zpool_handle_t *zhp, void *data) { int ret; libzfs_handle_t *hdl = zpool_get_handle(zhp); const char *pool_name = zpool_get_name(zhp); boolean_t *force = data; nvlist_t *innvl = fnvlist_alloc(); fnvlist_add_boolean_value(innvl, "force", *force); if ((ret = lzc_sync(pool_name, innvl, NULL)) != 0) { nvlist_free(innvl); return (zpool_standard_error_fmt(hdl, ret, dgettext(TEXT_DOMAIN, "sync '%s' failed"), pool_name)); } nvlist_free(innvl); return (0); } #define PATH_BUF_LEN 64 /* * Given a vdev, return the name to display in iostat. If the vdev has a path, * we use that, stripping off any leading "/dev/dsk/"; if not, we use the type. * We also check if this is a whole disk, in which case we strip off the * trailing 's0' slice name. * * This routine is also responsible for identifying when disks have been * reconfigured in a new location. The kernel will have opened the device by * devid, but the path will still refer to the old location. To catch this, we * first do a path -> devid translation (which is fast for the common case). If * the devid matches, we're done. If not, we do a reverse devid -> path * translation and issue the appropriate ioctl() to update the path of the vdev. * If 'zhp' is NULL, then this is an exported pool, and we don't need to do any * of these checks. */ char * zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv, int name_flags) { const char *type, *tpath; const char *path; uint64_t value; char buf[PATH_BUF_LEN]; char tmpbuf[PATH_BUF_LEN * 2]; /* * vdev_name will be "root"/"root-0" for the root vdev, but it is the * zpool name that will be displayed to the user. */ type = fnvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE); if (zhp != NULL && strcmp(type, "root") == 0) return (zfs_strdup(hdl, zpool_get_name(zhp))); if (libzfs_envvar_is_set("ZPOOL_VDEV_NAME_PATH")) name_flags |= VDEV_NAME_PATH; if (libzfs_envvar_is_set("ZPOOL_VDEV_NAME_GUID")) name_flags |= VDEV_NAME_GUID; if (libzfs_envvar_is_set("ZPOOL_VDEV_NAME_FOLLOW_LINKS")) name_flags |= VDEV_NAME_FOLLOW_LINKS; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, &value) == 0 || name_flags & VDEV_NAME_GUID) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &value); (void) snprintf(buf, sizeof (buf), "%llu", (u_longlong_t)value); path = buf; } else if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &tpath) == 0) { path = tpath; if (name_flags & VDEV_NAME_FOLLOW_LINKS) { char *rp = realpath(path, NULL); if (rp) { strlcpy(buf, rp, sizeof (buf)); path = buf; free(rp); } } /* * For a block device only use the name. */ if ((strcmp(type, VDEV_TYPE_DISK) == 0) && !(name_flags & VDEV_NAME_PATH)) { path = zfs_strip_path(path); } /* * Remove the partition from the path if this is a whole disk. */ if (strcmp(type, VDEV_TYPE_DRAID_SPARE) != 0 && nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &value) == 0 && value && !(name_flags & VDEV_NAME_PATH)) { return (zfs_strip_partition(path)); } } else { path = type; /* * If it's a raidz device, we need to stick in the parity level. */ if (strcmp(path, VDEV_TYPE_RAIDZ) == 0) { value = fnvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY); (void) snprintf(buf, sizeof (buf), "%s%llu", path, (u_longlong_t)value); path = buf; } /* * If it's a dRAID device, we add parity, groups, and spares. */ if (strcmp(path, VDEV_TYPE_DRAID) == 0) { uint64_t ndata, nparity, nspares; nvlist_t **child; uint_t children; verify(nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) == 0); nparity = fnvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY); ndata = fnvlist_lookup_uint64(nv, ZPOOL_CONFIG_DRAID_NDATA); nspares = fnvlist_lookup_uint64(nv, ZPOOL_CONFIG_DRAID_NSPARES); path = zpool_draid_name(buf, sizeof (buf), ndata, nparity, nspares, children); } /* * We identify each top-level vdev by using a * naming convention. */ if (name_flags & VDEV_NAME_TYPE_ID) { uint64_t id = fnvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID); (void) snprintf(tmpbuf, sizeof (tmpbuf), "%s-%llu", path, (u_longlong_t)id); path = tmpbuf; } } return (zfs_strdup(hdl, path)); } static int zbookmark_mem_compare(const void *a, const void *b) { return (memcmp(a, b, sizeof (zbookmark_phys_t))); } /* * Retrieve the persistent error log, uniquify the members, and return to the * caller. */ int zpool_get_errlog(zpool_handle_t *zhp, nvlist_t **nverrlistp) { zfs_cmd_t zc = {"\0"}; libzfs_handle_t *hdl = zhp->zpool_hdl; zbookmark_phys_t *buf; uint64_t buflen = 10000; /* approx. 1MB of RAM */ if (fnvlist_lookup_uint64(zhp->zpool_config, ZPOOL_CONFIG_ERRCOUNT) == 0) return (0); /* * Retrieve the raw error list from the kernel. If it doesn't fit, * allocate a larger buffer and retry. */ (void) strcpy(zc.zc_name, zhp->zpool_name); for (;;) { buf = zfs_alloc(zhp->zpool_hdl, buflen * sizeof (zbookmark_phys_t)); zc.zc_nvlist_dst = (uintptr_t)buf; zc.zc_nvlist_dst_size = buflen; if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_ERROR_LOG, &zc) != 0) { free(buf); if (errno == ENOMEM) { buflen *= 2; } else { return (zpool_standard_error_fmt(hdl, errno, dgettext(TEXT_DOMAIN, "errors: List of " "errors unavailable"))); } } else { break; } } /* * Sort the resulting bookmarks. This is a little confusing due to the * implementation of ZFS_IOC_ERROR_LOG. The bookmarks are copied last * to first, and 'zc_nvlist_dst_size' indicates the number of bookmarks * _not_ copied as part of the process. So we point the start of our * array appropriate and decrement the total number of elements. */ zbookmark_phys_t *zb = buf + zc.zc_nvlist_dst_size; uint64_t zblen = buflen - zc.zc_nvlist_dst_size; qsort(zb, zblen, sizeof (zbookmark_phys_t), zbookmark_mem_compare); verify(nvlist_alloc(nverrlistp, 0, KM_SLEEP) == 0); /* * Fill in the nverrlistp with nvlist's of dataset and object numbers. */ for (uint64_t i = 0; i < zblen; i++) { nvlist_t *nv; /* ignoring zb_blkid and zb_level for now */ if (i > 0 && zb[i-1].zb_objset == zb[i].zb_objset && zb[i-1].zb_object == zb[i].zb_object) continue; if (nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) != 0) goto nomem; if (nvlist_add_uint64(nv, ZPOOL_ERR_DATASET, zb[i].zb_objset) != 0) { nvlist_free(nv); goto nomem; } if (nvlist_add_uint64(nv, ZPOOL_ERR_OBJECT, zb[i].zb_object) != 0) { nvlist_free(nv); goto nomem; } if (nvlist_add_nvlist(*nverrlistp, "ejk", nv) != 0) { nvlist_free(nv); goto nomem; } nvlist_free(nv); } free(buf); return (0); nomem: free(buf); return (no_memory(zhp->zpool_hdl)); } /* * Upgrade a ZFS pool to the latest on-disk version. */ int zpool_upgrade(zpool_handle_t *zhp, uint64_t new_version) { zfs_cmd_t zc = {"\0"}; libzfs_handle_t *hdl = zhp->zpool_hdl; (void) strcpy(zc.zc_name, zhp->zpool_name); zc.zc_cookie = new_version; if (zfs_ioctl(hdl, ZFS_IOC_POOL_UPGRADE, &zc) != 0) return (zpool_standard_error_fmt(hdl, errno, dgettext(TEXT_DOMAIN, "cannot upgrade '%s'"), zhp->zpool_name)); return (0); } void zfs_save_arguments(int argc, char **argv, char *string, int len) { int i; (void) strlcpy(string, zfs_basename(argv[0]), len); for (i = 1; i < argc; i++) { (void) strlcat(string, " ", len); (void) strlcat(string, argv[i], len); } } int zpool_log_history(libzfs_handle_t *hdl, const char *message) { zfs_cmd_t zc = {"\0"}; nvlist_t *args; args = fnvlist_alloc(); fnvlist_add_string(args, "message", message); zcmd_write_src_nvlist(hdl, &zc, args); int err = zfs_ioctl(hdl, ZFS_IOC_LOG_HISTORY, &zc); nvlist_free(args); zcmd_free_nvlists(&zc); return (err); } /* * Perform ioctl to get some command history of a pool. * * 'buf' is the buffer to fill up to 'len' bytes. 'off' is the * logical offset of the history buffer to start reading from. * * Upon return, 'off' is the next logical offset to read from and * 'len' is the actual amount of bytes read into 'buf'. */ static int get_history(zpool_handle_t *zhp, char *buf, uint64_t *off, uint64_t *len) { zfs_cmd_t zc = {"\0"}; libzfs_handle_t *hdl = zhp->zpool_hdl; (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); zc.zc_history = (uint64_t)(uintptr_t)buf; zc.zc_history_len = *len; zc.zc_history_offset = *off; if (zfs_ioctl(hdl, ZFS_IOC_POOL_GET_HISTORY, &zc) != 0) { switch (errno) { case EPERM: return (zfs_error_fmt(hdl, EZFS_PERM, dgettext(TEXT_DOMAIN, "cannot show history for pool '%s'"), zhp->zpool_name)); case ENOENT: return (zfs_error_fmt(hdl, EZFS_NOHISTORY, dgettext(TEXT_DOMAIN, "cannot get history for pool " "'%s'"), zhp->zpool_name)); case ENOTSUP: return (zfs_error_fmt(hdl, EZFS_BADVERSION, dgettext(TEXT_DOMAIN, "cannot get history for pool " "'%s', pool must be upgraded"), zhp->zpool_name)); default: return (zpool_standard_error_fmt(hdl, errno, dgettext(TEXT_DOMAIN, "cannot get history for '%s'"), zhp->zpool_name)); } } *len = zc.zc_history_len; *off = zc.zc_history_offset; return (0); } /* * Retrieve the command history of a pool. */ int zpool_get_history(zpool_handle_t *zhp, nvlist_t **nvhisp, uint64_t *off, boolean_t *eof) { libzfs_handle_t *hdl = zhp->zpool_hdl; char *buf; int buflen = 128 * 1024; nvlist_t **records = NULL; uint_t numrecords = 0; int err = 0, i; uint64_t start = *off; buf = zfs_alloc(hdl, buflen); /* process about 1MiB a time */ while (*off - start < 1024 * 1024) { uint64_t bytes_read = buflen; uint64_t leftover; if ((err = get_history(zhp, buf, off, &bytes_read)) != 0) break; /* if nothing else was read in, we're at EOF, just return */ if (!bytes_read) { *eof = B_TRUE; break; } if ((err = zpool_history_unpack(buf, bytes_read, &leftover, &records, &numrecords)) != 0) { zpool_standard_error_fmt(hdl, err, dgettext(TEXT_DOMAIN, "cannot get history for '%s'"), zhp->zpool_name); break; } *off -= leftover; if (leftover == bytes_read) { /* * no progress made, because buffer is not big enough * to hold this record; resize and retry. */ buflen *= 2; free(buf); buf = zfs_alloc(hdl, buflen); } } free(buf); if (!err) { *nvhisp = fnvlist_alloc(); fnvlist_add_nvlist_array(*nvhisp, ZPOOL_HIST_RECORD, (const nvlist_t **)records, numrecords); } for (i = 0; i < numrecords; i++) nvlist_free(records[i]); free(records); return (err); } /* * Retrieve the next event given the passed 'zevent_fd' file descriptor. * If there is a new event available 'nvp' will contain a newly allocated * nvlist and 'dropped' will be set to the number of missed events since * the last call to this function. When 'nvp' is set to NULL it indicates * no new events are available. In either case the function returns 0 and * it is up to the caller to free 'nvp'. In the case of a fatal error the * function will return a non-zero value. When the function is called in * blocking mode (the default, unless the ZEVENT_NONBLOCK flag is passed), * it will not return until a new event is available. */ int zpool_events_next(libzfs_handle_t *hdl, nvlist_t **nvp, int *dropped, unsigned flags, int zevent_fd) { zfs_cmd_t zc = {"\0"}; int error = 0; *nvp = NULL; *dropped = 0; zc.zc_cleanup_fd = zevent_fd; if (flags & ZEVENT_NONBLOCK) zc.zc_guid = ZEVENT_NONBLOCK; zcmd_alloc_dst_nvlist(hdl, &zc, ZEVENT_SIZE); retry: if (zfs_ioctl(hdl, ZFS_IOC_EVENTS_NEXT, &zc) != 0) { switch (errno) { case ESHUTDOWN: error = zfs_error_fmt(hdl, EZFS_POOLUNAVAIL, dgettext(TEXT_DOMAIN, "zfs shutdown")); goto out; case ENOENT: /* Blocking error case should not occur */ if (!(flags & ZEVENT_NONBLOCK)) error = zpool_standard_error_fmt(hdl, errno, dgettext(TEXT_DOMAIN, "cannot get event")); goto out; case ENOMEM: zcmd_expand_dst_nvlist(hdl, &zc); goto retry; default: error = zpool_standard_error_fmt(hdl, errno, dgettext(TEXT_DOMAIN, "cannot get event")); goto out; } } error = zcmd_read_dst_nvlist(hdl, &zc, nvp); if (error != 0) goto out; *dropped = (int)zc.zc_cookie; out: zcmd_free_nvlists(&zc); return (error); } /* * Clear all events. */ int zpool_events_clear(libzfs_handle_t *hdl, int *count) { zfs_cmd_t zc = {"\0"}; if (zfs_ioctl(hdl, ZFS_IOC_EVENTS_CLEAR, &zc) != 0) return (zpool_standard_error(hdl, errno, dgettext(TEXT_DOMAIN, "cannot clear events"))); if (count != NULL) *count = (int)zc.zc_cookie; /* # of events cleared */ return (0); } /* * Seek to a specific EID, ZEVENT_SEEK_START, or ZEVENT_SEEK_END for * the passed zevent_fd file handle. On success zero is returned, * otherwise -1 is returned and hdl->libzfs_error is set to the errno. */ int zpool_events_seek(libzfs_handle_t *hdl, uint64_t eid, int zevent_fd) { zfs_cmd_t zc = {"\0"}; int error = 0; zc.zc_guid = eid; zc.zc_cleanup_fd = zevent_fd; if (zfs_ioctl(hdl, ZFS_IOC_EVENTS_SEEK, &zc) != 0) { switch (errno) { case ENOENT: error = zfs_error_fmt(hdl, EZFS_NOENT, dgettext(TEXT_DOMAIN, "cannot get event")); break; case ENOMEM: error = zfs_error_fmt(hdl, EZFS_NOMEM, dgettext(TEXT_DOMAIN, "cannot get event")); break; default: error = zpool_standard_error_fmt(hdl, errno, dgettext(TEXT_DOMAIN, "cannot get event")); break; } } return (error); } static void zpool_obj_to_path_impl(zpool_handle_t *zhp, uint64_t dsobj, uint64_t obj, char *pathname, size_t len, boolean_t always_unmounted) { zfs_cmd_t zc = {"\0"}; boolean_t mounted = B_FALSE; char *mntpnt = NULL; char dsname[ZFS_MAX_DATASET_NAME_LEN]; if (dsobj == 0) { /* special case for the MOS */ (void) snprintf(pathname, len, ":<0x%llx>", (longlong_t)obj); return; } /* get the dataset's name */ (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); zc.zc_obj = dsobj; if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_DSOBJ_TO_DSNAME, &zc) != 0) { /* just write out a path of two object numbers */ (void) snprintf(pathname, len, "<0x%llx>:<0x%llx>", (longlong_t)dsobj, (longlong_t)obj); return; } (void) strlcpy(dsname, zc.zc_value, sizeof (dsname)); /* find out if the dataset is mounted */ mounted = !always_unmounted && is_mounted(zhp->zpool_hdl, dsname, &mntpnt); /* get the corrupted object's path */ (void) strlcpy(zc.zc_name, dsname, sizeof (zc.zc_name)); zc.zc_obj = obj; if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_OBJ_TO_PATH, &zc) == 0) { if (mounted) { (void) snprintf(pathname, len, "%s%s", mntpnt, zc.zc_value); } else { (void) snprintf(pathname, len, "%s:%s", dsname, zc.zc_value); } } else { (void) snprintf(pathname, len, "%s:<0x%llx>", dsname, (longlong_t)obj); } free(mntpnt); } void zpool_obj_to_path(zpool_handle_t *zhp, uint64_t dsobj, uint64_t obj, char *pathname, size_t len) { zpool_obj_to_path_impl(zhp, dsobj, obj, pathname, len, B_FALSE); } void zpool_obj_to_path_ds(zpool_handle_t *zhp, uint64_t dsobj, uint64_t obj, char *pathname, size_t len) { zpool_obj_to_path_impl(zhp, dsobj, obj, pathname, len, B_TRUE); } /* * Wait while the specified activity is in progress in the pool. */ int zpool_wait(zpool_handle_t *zhp, zpool_wait_activity_t activity) { boolean_t missing; int error = zpool_wait_status(zhp, activity, &missing, NULL); if (missing) { (void) zpool_standard_error_fmt(zhp->zpool_hdl, ENOENT, dgettext(TEXT_DOMAIN, "error waiting in pool '%s'"), zhp->zpool_name); return (ENOENT); } else { return (error); } } /* * Wait for the given activity and return the status of the wait (whether or not * any waiting was done) in the 'waited' parameter. Non-existent pools are * reported via the 'missing' parameter, rather than by printing an error * message. This is convenient when this function is called in a loop over a * long period of time (as it is, for example, by zpool's wait cmd). In that * scenario, a pool being exported or destroyed should be considered a normal * event, so we don't want to print an error when we find that the pool doesn't * exist. */ int zpool_wait_status(zpool_handle_t *zhp, zpool_wait_activity_t activity, boolean_t *missing, boolean_t *waited) { int error = lzc_wait(zhp->zpool_name, activity, waited); *missing = (error == ENOENT); if (*missing) return (0); if (error != 0) { (void) zpool_standard_error_fmt(zhp->zpool_hdl, error, dgettext(TEXT_DOMAIN, "error waiting in pool '%s'"), zhp->zpool_name); } return (error); } int zpool_set_bootenv(zpool_handle_t *zhp, const nvlist_t *envmap) { int error = lzc_set_bootenv(zhp->zpool_name, envmap); if (error != 0) { (void) zpool_standard_error_fmt(zhp->zpool_hdl, error, dgettext(TEXT_DOMAIN, "error setting bootenv in pool '%s'"), zhp->zpool_name); } return (error); } int zpool_get_bootenv(zpool_handle_t *zhp, nvlist_t **nvlp) { nvlist_t *nvl; int error; nvl = NULL; error = lzc_get_bootenv(zhp->zpool_name, &nvl); if (error != 0) { (void) zpool_standard_error_fmt(zhp->zpool_hdl, error, dgettext(TEXT_DOMAIN, "error getting bootenv in pool '%s'"), zhp->zpool_name); } else { *nvlp = nvl; } return (error); } /* * Attempt to read and parse feature file(s) (from "compatibility" property). * Files contain zpool feature names, comma or whitespace-separated. * Comments (# character to next newline) are discarded. * * Arguments: * compatibility : string containing feature filenames * features : either NULL or pointer to array of boolean * report : either NULL or pointer to string buffer * rlen : length of "report" buffer * * compatibility is NULL (unset), "", "off", "legacy", or list of * comma-separated filenames. filenames should either be absolute, * or relative to: * 1) ZPOOL_SYSCONF_COMPAT_D (eg: /etc/zfs/compatibility.d) or * 2) ZPOOL_DATA_COMPAT_D (eg: /usr/share/zfs/compatibility.d). * (Unset), "" or "off" => enable all features * "legacy" => disable all features * * Any feature names read from files which match unames in spa_feature_table * will have the corresponding boolean set in the features array (if non-NULL). * If more than one feature set specified, only features present in *all* of * them will be set. * * "report" if not NULL will be populated with a suitable status message. * * Return values: * ZPOOL_COMPATIBILITY_OK : files read and parsed ok * ZPOOL_COMPATIBILITY_BADFILE : file too big or not a text file * ZPOOL_COMPATIBILITY_BADTOKEN : SYSCONF file contains invalid feature name * ZPOOL_COMPATIBILITY_WARNTOKEN : DATA file contains invalid feature name * ZPOOL_COMPATIBILITY_NOFILES : no feature files found */ zpool_compat_status_t zpool_load_compat(const char *compat, boolean_t *features, char *report, size_t rlen) { int sdirfd, ddirfd, featfd; struct stat fs; char *fc; char *ps, *ls, *ws; char *file, *line, *word; char l_compat[ZFS_MAXPROPLEN]; boolean_t ret_nofiles = B_TRUE; boolean_t ret_badfile = B_FALSE; boolean_t ret_badtoken = B_FALSE; boolean_t ret_warntoken = B_FALSE; /* special cases (unset), "" and "off" => enable all features */ if (compat == NULL || compat[0] == '\0' || strcmp(compat, ZPOOL_COMPAT_OFF) == 0) { if (features != NULL) for (uint_t i = 0; i < SPA_FEATURES; i++) features[i] = B_TRUE; if (report != NULL) strlcpy(report, gettext("all features enabled"), rlen); return (ZPOOL_COMPATIBILITY_OK); } /* Final special case "legacy" => disable all features */ if (strcmp(compat, ZPOOL_COMPAT_LEGACY) == 0) { if (features != NULL) for (uint_t i = 0; i < SPA_FEATURES; i++) features[i] = B_FALSE; if (report != NULL) strlcpy(report, gettext("all features disabled"), rlen); return (ZPOOL_COMPATIBILITY_OK); } /* * Start with all true; will be ANDed with results from each file */ if (features != NULL) for (uint_t i = 0; i < SPA_FEATURES; i++) features[i] = B_TRUE; char err_badfile[ZFS_MAXPROPLEN] = ""; char err_badtoken[ZFS_MAXPROPLEN] = ""; /* * We ignore errors from the directory open() * as they're only needed if the filename is relative * which will be checked during the openat(). */ /* O_PATH safer than O_RDONLY if system allows it */ #if defined(O_PATH) #define ZC_DIR_FLAGS (O_DIRECTORY | O_CLOEXEC | O_PATH) #else #define ZC_DIR_FLAGS (O_DIRECTORY | O_CLOEXEC | O_RDONLY) #endif sdirfd = open(ZPOOL_SYSCONF_COMPAT_D, ZC_DIR_FLAGS); ddirfd = open(ZPOOL_DATA_COMPAT_D, ZC_DIR_FLAGS); (void) strlcpy(l_compat, compat, ZFS_MAXPROPLEN); for (file = strtok_r(l_compat, ",", &ps); file != NULL; file = strtok_r(NULL, ",", &ps)) { boolean_t l_features[SPA_FEATURES]; enum { Z_SYSCONF, Z_DATA } source; /* try sysconfdir first, then datadir */ source = Z_SYSCONF; if ((featfd = openat(sdirfd, file, O_RDONLY | O_CLOEXEC)) < 0) { featfd = openat(ddirfd, file, O_RDONLY | O_CLOEXEC); source = Z_DATA; } /* File readable and correct size? */ if (featfd < 0 || fstat(featfd, &fs) < 0 || fs.st_size < 1 || fs.st_size > ZPOOL_COMPAT_MAXSIZE) { (void) close(featfd); strlcat(err_badfile, file, ZFS_MAXPROPLEN); strlcat(err_badfile, " ", ZFS_MAXPROPLEN); ret_badfile = B_TRUE; continue; } /* Prefault the file if system allows */ #if defined(MAP_POPULATE) #define ZC_MMAP_FLAGS (MAP_PRIVATE | MAP_POPULATE) #elif defined(MAP_PREFAULT_READ) #define ZC_MMAP_FLAGS (MAP_PRIVATE | MAP_PREFAULT_READ) #else #define ZC_MMAP_FLAGS (MAP_PRIVATE) #endif /* private mmap() so we can strtok safely */ fc = (char *)mmap(NULL, fs.st_size, PROT_READ | PROT_WRITE, ZC_MMAP_FLAGS, featfd, 0); (void) close(featfd); /* map ok, and last character == newline? */ if (fc == MAP_FAILED || fc[fs.st_size - 1] != '\n') { (void) munmap((void *) fc, fs.st_size); strlcat(err_badfile, file, ZFS_MAXPROPLEN); strlcat(err_badfile, " ", ZFS_MAXPROPLEN); ret_badfile = B_TRUE; continue; } ret_nofiles = B_FALSE; for (uint_t i = 0; i < SPA_FEATURES; i++) l_features[i] = B_FALSE; /* replace final newline with NULL to ensure string ends */ fc[fs.st_size - 1] = '\0'; for (line = strtok_r(fc, "\n", &ls); line != NULL; line = strtok_r(NULL, "\n", &ls)) { /* discard comments */ char *r = strchr(line, '#'); if (r != NULL) *r = '\0'; for (word = strtok_r(line, ", \t", &ws); word != NULL; word = strtok_r(NULL, ", \t", &ws)) { /* Find matching feature name */ uint_t f; for (f = 0; f < SPA_FEATURES; f++) { zfeature_info_t *fi = &spa_feature_table[f]; if (strcmp(word, fi->fi_uname) == 0) { l_features[f] = B_TRUE; break; } } if (f < SPA_FEATURES) continue; /* found an unrecognized word */ /* lightly sanitize it */ if (strlen(word) > 32) word[32] = '\0'; for (char *c = word; *c != '\0'; c++) if (!isprint(*c)) *c = '?'; strlcat(err_badtoken, word, ZFS_MAXPROPLEN); strlcat(err_badtoken, " ", ZFS_MAXPROPLEN); if (source == Z_SYSCONF) ret_badtoken = B_TRUE; else ret_warntoken = B_TRUE; } } (void) munmap((void *) fc, fs.st_size); if (features != NULL) for (uint_t i = 0; i < SPA_FEATURES; i++) features[i] &= l_features[i]; } (void) close(sdirfd); (void) close(ddirfd); /* Return the most serious error */ if (ret_badfile) { if (report != NULL) snprintf(report, rlen, gettext("could not read/" "parse feature file(s): %s"), err_badfile); return (ZPOOL_COMPATIBILITY_BADFILE); } if (ret_nofiles) { if (report != NULL) strlcpy(report, gettext("no valid compatibility files specified"), rlen); return (ZPOOL_COMPATIBILITY_NOFILES); } if (ret_badtoken) { if (report != NULL) snprintf(report, rlen, gettext("invalid feature " "name(s) in local compatibility files: %s"), err_badtoken); return (ZPOOL_COMPATIBILITY_BADTOKEN); } if (ret_warntoken) { if (report != NULL) snprintf(report, rlen, gettext("unrecognized feature " "name(s) in distribution compatibility files: %s"), err_badtoken); return (ZPOOL_COMPATIBILITY_WARNTOKEN); } if (report != NULL) strlcpy(report, gettext("compatibility set ok"), rlen); return (ZPOOL_COMPATIBILITY_OK); } static int zpool_vdev_guid(zpool_handle_t *zhp, const char *vdevname, uint64_t *vdev_guid) { nvlist_t *tgt; boolean_t avail_spare, l2cache; verify(zhp != NULL); if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) { char errbuf[ERRBUFLEN]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "pool is in an unavailable state")); return (zfs_error(zhp->zpool_hdl, EZFS_POOLUNAVAIL, errbuf)); } if ((tgt = zpool_find_vdev(zhp, vdevname, &avail_spare, &l2cache, NULL)) == NULL) { char errbuf[ERRBUFLEN]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "can not find %s in %s"), vdevname, zhp->zpool_name); return (zfs_error(zhp->zpool_hdl, EZFS_NODEVICE, errbuf)); } *vdev_guid = fnvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID); return (0); } /* * Get a vdev property value for 'prop' and return the value in * a pre-allocated buffer. */ int zpool_get_vdev_prop_value(nvlist_t *nvprop, vdev_prop_t prop, char *prop_name, char *buf, size_t len, zprop_source_t *srctype, boolean_t literal) { nvlist_t *nv; const char *strval; uint64_t intval; zprop_source_t src = ZPROP_SRC_NONE; if (prop == VDEV_PROP_USERPROP) { /* user property, prop_name must contain the property name */ assert(prop_name != NULL); if (nvlist_lookup_nvlist(nvprop, prop_name, &nv) == 0) { src = fnvlist_lookup_uint64(nv, ZPROP_SOURCE); strval = fnvlist_lookup_string(nv, ZPROP_VALUE); } else { /* user prop not found */ return (-1); } (void) strlcpy(buf, strval, len); if (srctype) *srctype = src; return (0); } if (prop_name == NULL) prop_name = (char *)vdev_prop_to_name(prop); switch (vdev_prop_get_type(prop)) { case PROP_TYPE_STRING: if (nvlist_lookup_nvlist(nvprop, prop_name, &nv) == 0) { src = fnvlist_lookup_uint64(nv, ZPROP_SOURCE); strval = fnvlist_lookup_string(nv, ZPROP_VALUE); } else { src = ZPROP_SRC_DEFAULT; if ((strval = vdev_prop_default_string(prop)) == NULL) strval = "-"; } (void) strlcpy(buf, strval, len); break; case PROP_TYPE_NUMBER: if (nvlist_lookup_nvlist(nvprop, prop_name, &nv) == 0) { src = fnvlist_lookup_uint64(nv, ZPROP_SOURCE); intval = fnvlist_lookup_uint64(nv, ZPROP_VALUE); } else { src = ZPROP_SRC_DEFAULT; intval = vdev_prop_default_numeric(prop); } switch (prop) { case VDEV_PROP_ASIZE: case VDEV_PROP_PSIZE: case VDEV_PROP_SIZE: case VDEV_PROP_BOOTSIZE: case VDEV_PROP_ALLOCATED: case VDEV_PROP_FREE: case VDEV_PROP_READ_ERRORS: case VDEV_PROP_WRITE_ERRORS: case VDEV_PROP_CHECKSUM_ERRORS: case VDEV_PROP_INITIALIZE_ERRORS: case VDEV_PROP_OPS_NULL: case VDEV_PROP_OPS_READ: case VDEV_PROP_OPS_WRITE: case VDEV_PROP_OPS_FREE: case VDEV_PROP_OPS_CLAIM: case VDEV_PROP_OPS_TRIM: case VDEV_PROP_BYTES_NULL: case VDEV_PROP_BYTES_READ: case VDEV_PROP_BYTES_WRITE: case VDEV_PROP_BYTES_FREE: case VDEV_PROP_BYTES_CLAIM: case VDEV_PROP_BYTES_TRIM: if (literal) { (void) snprintf(buf, len, "%llu", (u_longlong_t)intval); } else { (void) zfs_nicenum(intval, buf, len); } break; case VDEV_PROP_EXPANDSZ: if (intval == 0) { (void) strlcpy(buf, "-", len); } else if (literal) { (void) snprintf(buf, len, "%llu", (u_longlong_t)intval); } else { (void) zfs_nicenum(intval, buf, len); } break; case VDEV_PROP_CAPACITY: if (literal) { (void) snprintf(buf, len, "%llu", (u_longlong_t)intval); } else { (void) snprintf(buf, len, "%llu%%", (u_longlong_t)intval); } break; case VDEV_PROP_CHECKSUM_N: case VDEV_PROP_CHECKSUM_T: case VDEV_PROP_IO_N: case VDEV_PROP_IO_T: + case VDEV_PROP_SLOW_IO_N: + case VDEV_PROP_SLOW_IO_T: if (intval == UINT64_MAX) { (void) strlcpy(buf, "-", len); } else { (void) snprintf(buf, len, "%llu", (u_longlong_t)intval); } break; case VDEV_PROP_FRAGMENTATION: if (intval == UINT64_MAX) { (void) strlcpy(buf, "-", len); } else { (void) snprintf(buf, len, "%llu%%", (u_longlong_t)intval); } break; case VDEV_PROP_STATE: if (literal) { (void) snprintf(buf, len, "%llu", (u_longlong_t)intval); } else { (void) strlcpy(buf, zpool_state_to_name(intval, VDEV_AUX_NONE), len); } break; default: (void) snprintf(buf, len, "%llu", (u_longlong_t)intval); } break; case PROP_TYPE_INDEX: if (nvlist_lookup_nvlist(nvprop, prop_name, &nv) == 0) { src = fnvlist_lookup_uint64(nv, ZPROP_SOURCE); intval = fnvlist_lookup_uint64(nv, ZPROP_VALUE); } else { src = ZPROP_SRC_DEFAULT; intval = vdev_prop_default_numeric(prop); /* Only use if provided by the RAIDZ VDEV above */ if (prop == VDEV_PROP_RAIDZ_EXPANDING) return (ENOENT); } if (vdev_prop_index_to_string(prop, intval, (const char **)&strval) != 0) return (-1); (void) strlcpy(buf, strval, len); break; default: abort(); } if (srctype) *srctype = src; return (0); } /* * Get a vdev property value for 'prop_name' and return the value in * a pre-allocated buffer. */ int zpool_get_vdev_prop(zpool_handle_t *zhp, const char *vdevname, vdev_prop_t prop, char *prop_name, char *buf, size_t len, zprop_source_t *srctype, boolean_t literal) { nvlist_t *reqnvl, *reqprops; nvlist_t *retprops = NULL; uint64_t vdev_guid = 0; int ret; if ((ret = zpool_vdev_guid(zhp, vdevname, &vdev_guid)) != 0) return (ret); if (nvlist_alloc(&reqnvl, NV_UNIQUE_NAME, 0) != 0) return (no_memory(zhp->zpool_hdl)); if (nvlist_alloc(&reqprops, NV_UNIQUE_NAME, 0) != 0) return (no_memory(zhp->zpool_hdl)); fnvlist_add_uint64(reqnvl, ZPOOL_VDEV_PROPS_GET_VDEV, vdev_guid); if (prop != VDEV_PROP_USERPROP) { /* prop_name overrides prop value */ if (prop_name != NULL) prop = vdev_name_to_prop(prop_name); else prop_name = (char *)vdev_prop_to_name(prop); assert(prop < VDEV_NUM_PROPS); } assert(prop_name != NULL); if (nvlist_add_uint64(reqprops, prop_name, prop) != 0) { nvlist_free(reqnvl); nvlist_free(reqprops); return (no_memory(zhp->zpool_hdl)); } fnvlist_add_nvlist(reqnvl, ZPOOL_VDEV_PROPS_GET_PROPS, reqprops); ret = lzc_get_vdev_prop(zhp->zpool_name, reqnvl, &retprops); if (ret == 0) { ret = zpool_get_vdev_prop_value(retprops, prop, prop_name, buf, len, srctype, literal); } else { char errbuf[ERRBUFLEN]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot get vdev property %s from" " %s in %s"), prop_name, vdevname, zhp->zpool_name); (void) zpool_standard_error(zhp->zpool_hdl, ret, errbuf); } nvlist_free(reqnvl); nvlist_free(reqprops); nvlist_free(retprops); return (ret); } /* * Get all vdev properties */ int zpool_get_all_vdev_props(zpool_handle_t *zhp, const char *vdevname, nvlist_t **outnvl) { nvlist_t *nvl = NULL; uint64_t vdev_guid = 0; int ret; if ((ret = zpool_vdev_guid(zhp, vdevname, &vdev_guid)) != 0) return (ret); if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) return (no_memory(zhp->zpool_hdl)); fnvlist_add_uint64(nvl, ZPOOL_VDEV_PROPS_GET_VDEV, vdev_guid); ret = lzc_get_vdev_prop(zhp->zpool_name, nvl, outnvl); nvlist_free(nvl); if (ret) { char errbuf[ERRBUFLEN]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot get vdev properties for" " %s in %s"), vdevname, zhp->zpool_name); (void) zpool_standard_error(zhp->zpool_hdl, errno, errbuf); } return (ret); } /* * Set vdev property */ int zpool_set_vdev_prop(zpool_handle_t *zhp, const char *vdevname, const char *propname, const char *propval) { int ret; nvlist_t *nvl = NULL; nvlist_t *outnvl = NULL; nvlist_t *props; nvlist_t *realprops; prop_flags_t flags = { 0 }; uint64_t version; uint64_t vdev_guid; if ((ret = zpool_vdev_guid(zhp, vdevname, &vdev_guid)) != 0) return (ret); if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) return (no_memory(zhp->zpool_hdl)); if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) return (no_memory(zhp->zpool_hdl)); fnvlist_add_uint64(nvl, ZPOOL_VDEV_PROPS_SET_VDEV, vdev_guid); if (nvlist_add_string(props, propname, propval) != 0) { nvlist_free(props); return (no_memory(zhp->zpool_hdl)); } char errbuf[ERRBUFLEN]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot set property %s for %s on %s"), propname, vdevname, zhp->zpool_name); flags.vdevprop = 1; version = zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL); if ((realprops = zpool_valid_proplist(zhp->zpool_hdl, zhp->zpool_name, props, version, flags, errbuf)) == NULL) { nvlist_free(props); nvlist_free(nvl); return (-1); } nvlist_free(props); props = realprops; fnvlist_add_nvlist(nvl, ZPOOL_VDEV_PROPS_SET_PROPS, props); ret = lzc_set_vdev_prop(zhp->zpool_name, nvl, &outnvl); nvlist_free(props); nvlist_free(nvl); nvlist_free(outnvl); if (ret) (void) zpool_standard_error(zhp->zpool_hdl, errno, errbuf); return (ret); } diff --git a/lib/libzfs/libzfs_util.c b/lib/libzfs/libzfs_util.c index d0a63a4daa8c..8e70af2e5830 100644 --- a/lib/libzfs/libzfs_util.c +++ b/lib/libzfs/libzfs_util.c @@ -1,2271 +1,2273 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2020 Joyent, Inc. All rights reserved. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright 2016 Igor Kozhukhov * Copyright (c) 2017 Datto Inc. * Copyright (c) 2020 The FreeBSD Foundation * * Portions of this software were developed by Allan Jude * under sponsorship from the FreeBSD Foundation. */ /* * Internal utility routines for the ZFS library. */ #include #include #include #include #include #include #include #include #include #if LIBFETCH_DYNAMIC #include #endif #include #include #include #include #include #include #include #include "libzfs_impl.h" #include "zfs_prop.h" #include "zfeature_common.h" #include #include /* * We only care about the scheme in order to match the scheme * with the handler. Each handler should validate the full URI * as necessary. */ #define URI_REGEX "^\\([A-Za-z][A-Za-z0-9+.\\-]*\\):" int libzfs_errno(libzfs_handle_t *hdl) { return (hdl->libzfs_error); } const char * libzfs_error_action(libzfs_handle_t *hdl) { return (hdl->libzfs_action); } const char * libzfs_error_description(libzfs_handle_t *hdl) { if (hdl->libzfs_desc[0] != '\0') return (hdl->libzfs_desc); switch (hdl->libzfs_error) { case EZFS_NOMEM: return (dgettext(TEXT_DOMAIN, "out of memory")); case EZFS_BADPROP: return (dgettext(TEXT_DOMAIN, "invalid property value")); case EZFS_PROPREADONLY: return (dgettext(TEXT_DOMAIN, "read-only property")); case EZFS_PROPTYPE: return (dgettext(TEXT_DOMAIN, "property doesn't apply to " "datasets of this type")); case EZFS_PROPNONINHERIT: return (dgettext(TEXT_DOMAIN, "property cannot be inherited")); case EZFS_PROPSPACE: return (dgettext(TEXT_DOMAIN, "invalid quota or reservation")); case EZFS_BADTYPE: return (dgettext(TEXT_DOMAIN, "operation not applicable to " "datasets of this type")); case EZFS_BUSY: return (dgettext(TEXT_DOMAIN, "pool or dataset is busy")); case EZFS_EXISTS: return (dgettext(TEXT_DOMAIN, "pool or dataset exists")); case EZFS_NOENT: return (dgettext(TEXT_DOMAIN, "no such pool or dataset")); case EZFS_BADSTREAM: return (dgettext(TEXT_DOMAIN, "invalid backup stream")); case EZFS_DSREADONLY: return (dgettext(TEXT_DOMAIN, "dataset is read-only")); case EZFS_VOLTOOBIG: return (dgettext(TEXT_DOMAIN, "volume size exceeds limit for " "this system")); case EZFS_INVALIDNAME: return (dgettext(TEXT_DOMAIN, "invalid name")); case EZFS_BADRESTORE: return (dgettext(TEXT_DOMAIN, "unable to restore to " "destination")); case EZFS_BADBACKUP: return (dgettext(TEXT_DOMAIN, "backup failed")); case EZFS_BADTARGET: return (dgettext(TEXT_DOMAIN, "invalid target vdev")); case EZFS_NODEVICE: return (dgettext(TEXT_DOMAIN, "no such device in pool")); case EZFS_BADDEV: return (dgettext(TEXT_DOMAIN, "invalid device")); case EZFS_NOREPLICAS: return (dgettext(TEXT_DOMAIN, "no valid replicas")); case EZFS_RESILVERING: return (dgettext(TEXT_DOMAIN, "currently resilvering")); case EZFS_BADVERSION: return (dgettext(TEXT_DOMAIN, "unsupported version or " "feature")); case EZFS_POOLUNAVAIL: return (dgettext(TEXT_DOMAIN, "pool is unavailable")); case EZFS_DEVOVERFLOW: return (dgettext(TEXT_DOMAIN, "too many devices in one vdev")); case EZFS_BADPATH: return (dgettext(TEXT_DOMAIN, "must be an absolute path")); case EZFS_CROSSTARGET: return (dgettext(TEXT_DOMAIN, "operation crosses datasets or " "pools")); case EZFS_ZONED: return (dgettext(TEXT_DOMAIN, "dataset in use by local zone")); case EZFS_MOUNTFAILED: return (dgettext(TEXT_DOMAIN, "mount failed")); case EZFS_UMOUNTFAILED: return (dgettext(TEXT_DOMAIN, "unmount failed")); case EZFS_UNSHARENFSFAILED: return (dgettext(TEXT_DOMAIN, "NFS share removal failed")); case EZFS_SHARENFSFAILED: return (dgettext(TEXT_DOMAIN, "NFS share creation failed")); case EZFS_UNSHARESMBFAILED: return (dgettext(TEXT_DOMAIN, "SMB share removal failed")); case EZFS_SHARESMBFAILED: return (dgettext(TEXT_DOMAIN, "SMB share creation failed")); case EZFS_PERM: return (dgettext(TEXT_DOMAIN, "permission denied")); case EZFS_NOSPC: return (dgettext(TEXT_DOMAIN, "out of space")); case EZFS_FAULT: return (dgettext(TEXT_DOMAIN, "bad address")); case EZFS_IO: return (dgettext(TEXT_DOMAIN, "I/O error")); case EZFS_INTR: return (dgettext(TEXT_DOMAIN, "signal received")); case EZFS_CKSUM: return (dgettext(TEXT_DOMAIN, "insufficient replicas")); case EZFS_ISSPARE: return (dgettext(TEXT_DOMAIN, "device is reserved as a hot " "spare")); case EZFS_INVALCONFIG: return (dgettext(TEXT_DOMAIN, "invalid vdev configuration")); case EZFS_RECURSIVE: return (dgettext(TEXT_DOMAIN, "recursive dataset dependency")); case EZFS_NOHISTORY: return (dgettext(TEXT_DOMAIN, "no history available")); case EZFS_POOLPROPS: return (dgettext(TEXT_DOMAIN, "failed to retrieve " "pool properties")); case EZFS_POOL_NOTSUP: return (dgettext(TEXT_DOMAIN, "operation not supported " "on this type of pool")); case EZFS_POOL_INVALARG: return (dgettext(TEXT_DOMAIN, "invalid argument for " "this pool operation")); case EZFS_NAMETOOLONG: return (dgettext(TEXT_DOMAIN, "dataset name is too long")); case EZFS_OPENFAILED: return (dgettext(TEXT_DOMAIN, "open failed")); case EZFS_NOCAP: return (dgettext(TEXT_DOMAIN, "disk capacity information could not be retrieved")); case EZFS_LABELFAILED: return (dgettext(TEXT_DOMAIN, "write of label failed")); case EZFS_BADWHO: return (dgettext(TEXT_DOMAIN, "invalid user/group")); case EZFS_BADPERM: return (dgettext(TEXT_DOMAIN, "invalid permission")); case EZFS_BADPERMSET: return (dgettext(TEXT_DOMAIN, "invalid permission set name")); case EZFS_NODELEGATION: return (dgettext(TEXT_DOMAIN, "delegated administration is " "disabled on pool")); case EZFS_BADCACHE: return (dgettext(TEXT_DOMAIN, "invalid or missing cache file")); case EZFS_ISL2CACHE: return (dgettext(TEXT_DOMAIN, "device is in use as a cache")); case EZFS_VDEVNOTSUP: return (dgettext(TEXT_DOMAIN, "vdev specification is not " "supported")); case EZFS_NOTSUP: return (dgettext(TEXT_DOMAIN, "operation not supported " "on this dataset")); case EZFS_IOC_NOTSUPPORTED: return (dgettext(TEXT_DOMAIN, "operation not supported by " "zfs kernel module")); case EZFS_ACTIVE_SPARE: return (dgettext(TEXT_DOMAIN, "pool has active shared spare " "device")); case EZFS_UNPLAYED_LOGS: return (dgettext(TEXT_DOMAIN, "log device has unplayed intent " "logs")); case EZFS_REFTAG_RELE: return (dgettext(TEXT_DOMAIN, "no such tag on this dataset")); case EZFS_REFTAG_HOLD: return (dgettext(TEXT_DOMAIN, "tag already exists on this " "dataset")); case EZFS_TAGTOOLONG: return (dgettext(TEXT_DOMAIN, "tag too long")); case EZFS_PIPEFAILED: return (dgettext(TEXT_DOMAIN, "pipe create failed")); case EZFS_THREADCREATEFAILED: return (dgettext(TEXT_DOMAIN, "thread create failed")); case EZFS_POSTSPLIT_ONLINE: return (dgettext(TEXT_DOMAIN, "disk was split from this pool " "into a new one")); case EZFS_SCRUB_PAUSED: return (dgettext(TEXT_DOMAIN, "scrub is paused; " "use 'zpool scrub' to resume scrub")); case EZFS_SCRUB_PAUSED_TO_CANCEL: return (dgettext(TEXT_DOMAIN, "scrub is paused; " "use 'zpool scrub' to resume or 'zpool scrub -s' to " "cancel scrub")); case EZFS_SCRUBBING: return (dgettext(TEXT_DOMAIN, "currently scrubbing; " "use 'zpool scrub -s' to cancel scrub")); case EZFS_ERRORSCRUBBING: return (dgettext(TEXT_DOMAIN, "currently error scrubbing; " "use 'zpool scrub -s' to cancel error scrub")); case EZFS_ERRORSCRUB_PAUSED: return (dgettext(TEXT_DOMAIN, "error scrub is paused; " "use 'zpool scrub -e' to resume error scrub")); case EZFS_NO_SCRUB: return (dgettext(TEXT_DOMAIN, "there is no active scrub")); case EZFS_DIFF: return (dgettext(TEXT_DOMAIN, "unable to generate diffs")); case EZFS_DIFFDATA: return (dgettext(TEXT_DOMAIN, "invalid diff data")); case EZFS_POOLREADONLY: return (dgettext(TEXT_DOMAIN, "pool is read-only")); case EZFS_NO_PENDING: return (dgettext(TEXT_DOMAIN, "operation is not " "in progress")); case EZFS_CHECKPOINT_EXISTS: return (dgettext(TEXT_DOMAIN, "checkpoint exists")); case EZFS_DISCARDING_CHECKPOINT: return (dgettext(TEXT_DOMAIN, "currently discarding " "checkpoint")); case EZFS_NO_CHECKPOINT: return (dgettext(TEXT_DOMAIN, "checkpoint does not exist")); case EZFS_DEVRM_IN_PROGRESS: return (dgettext(TEXT_DOMAIN, "device removal in progress")); case EZFS_VDEV_TOO_BIG: return (dgettext(TEXT_DOMAIN, "device exceeds supported size")); case EZFS_ACTIVE_POOL: return (dgettext(TEXT_DOMAIN, "pool is imported on a " "different host")); case EZFS_CRYPTOFAILED: return (dgettext(TEXT_DOMAIN, "encryption failure")); case EZFS_TOOMANY: return (dgettext(TEXT_DOMAIN, "argument list too long")); case EZFS_INITIALIZING: return (dgettext(TEXT_DOMAIN, "currently initializing")); case EZFS_NO_INITIALIZE: return (dgettext(TEXT_DOMAIN, "there is no active " "initialization")); case EZFS_WRONG_PARENT: return (dgettext(TEXT_DOMAIN, "invalid parent dataset")); case EZFS_TRIMMING: return (dgettext(TEXT_DOMAIN, "currently trimming")); case EZFS_NO_TRIM: return (dgettext(TEXT_DOMAIN, "there is no active trim")); case EZFS_TRIM_NOTSUP: return (dgettext(TEXT_DOMAIN, "trim operations are not " "supported by this device")); case EZFS_NO_RESILVER_DEFER: return (dgettext(TEXT_DOMAIN, "this action requires the " "resilver_defer feature")); case EZFS_EXPORT_IN_PROGRESS: return (dgettext(TEXT_DOMAIN, "pool export in progress")); case EZFS_REBUILDING: return (dgettext(TEXT_DOMAIN, "currently sequentially " "resilvering")); case EZFS_VDEV_NOTSUP: return (dgettext(TEXT_DOMAIN, "operation not supported " "on this type of vdev")); case EZFS_NOT_USER_NAMESPACE: return (dgettext(TEXT_DOMAIN, "the provided file " "was not a user namespace file")); case EZFS_RESUME_EXISTS: return (dgettext(TEXT_DOMAIN, "Resuming recv on existing " "dataset without force")); case EZFS_RAIDZ_EXPAND_IN_PROGRESS: return (dgettext(TEXT_DOMAIN, "raidz expansion in progress")); case EZFS_UNKNOWN: return (dgettext(TEXT_DOMAIN, "unknown error")); default: assert(hdl->libzfs_error == 0); return (dgettext(TEXT_DOMAIN, "no error")); } } void zfs_error_aux(libzfs_handle_t *hdl, const char *fmt, ...) { va_list ap; va_start(ap, fmt); (void) vsnprintf(hdl->libzfs_desc, sizeof (hdl->libzfs_desc), fmt, ap); hdl->libzfs_desc_active = 1; va_end(ap); } static void zfs_verror(libzfs_handle_t *hdl, int error, const char *fmt, va_list ap) { (void) vsnprintf(hdl->libzfs_action, sizeof (hdl->libzfs_action), fmt, ap); hdl->libzfs_error = error; if (hdl->libzfs_desc_active) hdl->libzfs_desc_active = 0; else hdl->libzfs_desc[0] = '\0'; if (hdl->libzfs_printerr) { if (error == EZFS_UNKNOWN) { (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "internal " "error: %s: %s\n"), hdl->libzfs_action, libzfs_error_description(hdl)); abort(); } (void) fprintf(stderr, "%s: %s\n", hdl->libzfs_action, libzfs_error_description(hdl)); if (error == EZFS_NOMEM) exit(1); } } int zfs_error(libzfs_handle_t *hdl, int error, const char *msg) { return (zfs_error_fmt(hdl, error, "%s", msg)); } int zfs_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...) { va_list ap; va_start(ap, fmt); zfs_verror(hdl, error, fmt, ap); va_end(ap); return (-1); } static int zfs_common_error(libzfs_handle_t *hdl, int error, const char *fmt, va_list ap) { switch (error) { case EPERM: case EACCES: zfs_verror(hdl, EZFS_PERM, fmt, ap); return (-1); case ECANCELED: zfs_verror(hdl, EZFS_NODELEGATION, fmt, ap); return (-1); case EIO: zfs_verror(hdl, EZFS_IO, fmt, ap); return (-1); case EFAULT: zfs_verror(hdl, EZFS_FAULT, fmt, ap); return (-1); case EINTR: zfs_verror(hdl, EZFS_INTR, fmt, ap); return (-1); case ECKSUM: zfs_verror(hdl, EZFS_CKSUM, fmt, ap); return (-1); } return (0); } int zfs_standard_error(libzfs_handle_t *hdl, int error, const char *msg) { return (zfs_standard_error_fmt(hdl, error, "%s", msg)); } int zfs_standard_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...) { va_list ap; va_start(ap, fmt); if (zfs_common_error(hdl, error, fmt, ap) != 0) { va_end(ap); return (-1); } switch (error) { case ENXIO: case ENODEV: case EPIPE: zfs_verror(hdl, EZFS_IO, fmt, ap); break; case ENOENT: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "dataset does not exist")); zfs_verror(hdl, EZFS_NOENT, fmt, ap); break; case ENOSPC: case EDQUOT: zfs_verror(hdl, EZFS_NOSPC, fmt, ap); break; case EEXIST: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "dataset already exists")); zfs_verror(hdl, EZFS_EXISTS, fmt, ap); break; case EBUSY: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "dataset is busy")); zfs_verror(hdl, EZFS_BUSY, fmt, ap); break; case EROFS: zfs_verror(hdl, EZFS_POOLREADONLY, fmt, ap); break; case ENAMETOOLONG: zfs_verror(hdl, EZFS_NAMETOOLONG, fmt, ap); break; case ENOTSUP: zfs_verror(hdl, EZFS_BADVERSION, fmt, ap); break; case EAGAIN: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool I/O is currently suspended")); zfs_verror(hdl, EZFS_POOLUNAVAIL, fmt, ap); break; case EREMOTEIO: zfs_verror(hdl, EZFS_ACTIVE_POOL, fmt, ap); break; case ZFS_ERR_UNKNOWN_SEND_STREAM_FEATURE: case ZFS_ERR_IOC_CMD_UNAVAIL: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "the loaded zfs " "module does not support this operation. A reboot may " "be required to enable this operation.")); zfs_verror(hdl, EZFS_IOC_NOTSUPPORTED, fmt, ap); break; case ZFS_ERR_IOC_ARG_UNAVAIL: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "the loaded zfs " "module does not support an option for this operation. " "A reboot may be required to enable this option.")); zfs_verror(hdl, EZFS_IOC_NOTSUPPORTED, fmt, ap); break; case ZFS_ERR_IOC_ARG_REQUIRED: case ZFS_ERR_IOC_ARG_BADTYPE: zfs_verror(hdl, EZFS_IOC_NOTSUPPORTED, fmt, ap); break; case ZFS_ERR_WRONG_PARENT: zfs_verror(hdl, EZFS_WRONG_PARENT, fmt, ap); break; case ZFS_ERR_BADPROP: zfs_verror(hdl, EZFS_BADPROP, fmt, ap); break; case ZFS_ERR_NOT_USER_NAMESPACE: zfs_verror(hdl, EZFS_NOT_USER_NAMESPACE, fmt, ap); break; default: zfs_error_aux(hdl, "%s", zfs_strerror(error)); zfs_verror(hdl, EZFS_UNKNOWN, fmt, ap); break; } va_end(ap); return (-1); } void zfs_setprop_error(libzfs_handle_t *hdl, zfs_prop_t prop, int err, char *errbuf) { switch (err) { case ENOSPC: /* * For quotas and reservations, ENOSPC indicates * something different; setting a quota or reservation * doesn't use any disk space. */ switch (prop) { case ZFS_PROP_QUOTA: case ZFS_PROP_REFQUOTA: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "size is less than current used or " "reserved space")); (void) zfs_error(hdl, EZFS_PROPSPACE, errbuf); break; case ZFS_PROP_RESERVATION: case ZFS_PROP_REFRESERVATION: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "size is greater than available space")); (void) zfs_error(hdl, EZFS_PROPSPACE, errbuf); break; default: (void) zfs_standard_error(hdl, err, errbuf); break; } break; case EBUSY: (void) zfs_standard_error(hdl, EBUSY, errbuf); break; case EROFS: (void) zfs_error(hdl, EZFS_DSREADONLY, errbuf); break; case E2BIG: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property value too long")); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); break; case ENOTSUP: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool and or dataset must be upgraded to set this " "property or value")); (void) zfs_error(hdl, EZFS_BADVERSION, errbuf); break; case ERANGE: if (prop == ZFS_PROP_COMPRESSION || prop == ZFS_PROP_DNODESIZE || prop == ZFS_PROP_RECORDSIZE) { (void) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property setting is not allowed on " "bootable datasets")); (void) zfs_error(hdl, EZFS_NOTSUP, errbuf); } else if (prop == ZFS_PROP_CHECKSUM || prop == ZFS_PROP_DEDUP) { (void) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property setting is not allowed on " "root pools")); (void) zfs_error(hdl, EZFS_NOTSUP, errbuf); } else { (void) zfs_standard_error(hdl, err, errbuf); } break; case EINVAL: if (prop == ZPROP_INVAL) { (void) zfs_error(hdl, EZFS_BADPROP, errbuf); } else { (void) zfs_standard_error(hdl, err, errbuf); } break; case ZFS_ERR_BADPROP: (void) zfs_error(hdl, EZFS_BADPROP, errbuf); break; case EACCES: if (prop == ZFS_PROP_KEYLOCATION) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "keylocation may only be set on encryption roots")); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); } else { (void) zfs_standard_error(hdl, err, errbuf); } break; case EOVERFLOW: /* * This platform can't address a volume this big. */ #ifdef _ILP32 if (prop == ZFS_PROP_VOLSIZE) { (void) zfs_error(hdl, EZFS_VOLTOOBIG, errbuf); break; } zfs_fallthrough; #endif default: (void) zfs_standard_error(hdl, err, errbuf); } } int zpool_standard_error(libzfs_handle_t *hdl, int error, const char *msg) { return (zpool_standard_error_fmt(hdl, error, "%s", msg)); } int zpool_standard_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...) { va_list ap; va_start(ap, fmt); if (zfs_common_error(hdl, error, fmt, ap) != 0) { va_end(ap); return (-1); } switch (error) { case ENODEV: zfs_verror(hdl, EZFS_NODEVICE, fmt, ap); break; case ENOENT: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "no such pool or dataset")); zfs_verror(hdl, EZFS_NOENT, fmt, ap); break; case EEXIST: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool already exists")); zfs_verror(hdl, EZFS_EXISTS, fmt, ap); break; case EBUSY: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool is busy")); zfs_verror(hdl, EZFS_BUSY, fmt, ap); break; /* There is no pending operation to cancel */ case ENOTACTIVE: zfs_verror(hdl, EZFS_NO_PENDING, fmt, ap); break; case ENXIO: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "one or more devices is currently unavailable")); zfs_verror(hdl, EZFS_BADDEV, fmt, ap); break; case ENAMETOOLONG: zfs_verror(hdl, EZFS_DEVOVERFLOW, fmt, ap); break; case ENOTSUP: zfs_verror(hdl, EZFS_POOL_NOTSUP, fmt, ap); break; case EINVAL: zfs_verror(hdl, EZFS_POOL_INVALARG, fmt, ap); break; case ENOSPC: case EDQUOT: zfs_verror(hdl, EZFS_NOSPC, fmt, ap); break; case EAGAIN: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool I/O is currently suspended")); zfs_verror(hdl, EZFS_POOLUNAVAIL, fmt, ap); break; case EROFS: zfs_verror(hdl, EZFS_POOLREADONLY, fmt, ap); break; case EDOM: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "block size out of range or does not match")); zfs_verror(hdl, EZFS_BADPROP, fmt, ap); break; case EREMOTEIO: zfs_verror(hdl, EZFS_ACTIVE_POOL, fmt, ap); break; case ZFS_ERR_CHECKPOINT_EXISTS: zfs_verror(hdl, EZFS_CHECKPOINT_EXISTS, fmt, ap); break; case ZFS_ERR_DISCARDING_CHECKPOINT: zfs_verror(hdl, EZFS_DISCARDING_CHECKPOINT, fmt, ap); break; case ZFS_ERR_NO_CHECKPOINT: zfs_verror(hdl, EZFS_NO_CHECKPOINT, fmt, ap); break; case ZFS_ERR_DEVRM_IN_PROGRESS: zfs_verror(hdl, EZFS_DEVRM_IN_PROGRESS, fmt, ap); break; case ZFS_ERR_VDEV_TOO_BIG: zfs_verror(hdl, EZFS_VDEV_TOO_BIG, fmt, ap); break; case ZFS_ERR_EXPORT_IN_PROGRESS: zfs_verror(hdl, EZFS_EXPORT_IN_PROGRESS, fmt, ap); break; case ZFS_ERR_RESILVER_IN_PROGRESS: zfs_verror(hdl, EZFS_RESILVERING, fmt, ap); break; case ZFS_ERR_REBUILD_IN_PROGRESS: zfs_verror(hdl, EZFS_REBUILDING, fmt, ap); break; case ZFS_ERR_BADPROP: zfs_verror(hdl, EZFS_BADPROP, fmt, ap); break; case ZFS_ERR_VDEV_NOTSUP: zfs_verror(hdl, EZFS_VDEV_NOTSUP, fmt, ap); break; case ZFS_ERR_IOC_CMD_UNAVAIL: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "the loaded zfs " "module does not support this operation. A reboot may " "be required to enable this operation.")); zfs_verror(hdl, EZFS_IOC_NOTSUPPORTED, fmt, ap); break; case ZFS_ERR_IOC_ARG_UNAVAIL: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "the loaded zfs " "module does not support an option for this operation. " "A reboot may be required to enable this option.")); zfs_verror(hdl, EZFS_IOC_NOTSUPPORTED, fmt, ap); break; case ZFS_ERR_IOC_ARG_REQUIRED: case ZFS_ERR_IOC_ARG_BADTYPE: zfs_verror(hdl, EZFS_IOC_NOTSUPPORTED, fmt, ap); break; case ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS: zfs_verror(hdl, EZFS_RAIDZ_EXPAND_IN_PROGRESS, fmt, ap); break; default: zfs_error_aux(hdl, "%s", zfs_strerror(error)); zfs_verror(hdl, EZFS_UNKNOWN, fmt, ap); } va_end(ap); return (-1); } /* * Display an out of memory error message and abort the current program. */ int no_memory(libzfs_handle_t *hdl) { return (zfs_error(hdl, EZFS_NOMEM, "internal error")); } /* * A safe form of malloc() which will die if the allocation fails. */ void * zfs_alloc(libzfs_handle_t *hdl, size_t size) { void *data; if ((data = calloc(1, size)) == NULL) (void) no_memory(hdl); return (data); } /* * A safe form of asprintf() which will die if the allocation fails. */ char * zfs_asprintf(libzfs_handle_t *hdl, const char *fmt, ...) { va_list ap; char *ret; int err; va_start(ap, fmt); err = vasprintf(&ret, fmt, ap); va_end(ap); if (err < 0) { (void) no_memory(hdl); ret = NULL; } return (ret); } /* * A safe form of realloc(), which also zeroes newly allocated space. */ void * zfs_realloc(libzfs_handle_t *hdl, void *ptr, size_t oldsize, size_t newsize) { void *ret; if ((ret = realloc(ptr, newsize)) == NULL) { (void) no_memory(hdl); return (NULL); } memset((char *)ret + oldsize, 0, newsize - oldsize); return (ret); } /* * A safe form of strdup() which will die if the allocation fails. */ char * zfs_strdup(libzfs_handle_t *hdl, const char *str) { char *ret; if ((ret = strdup(str)) == NULL) (void) no_memory(hdl); return (ret); } void libzfs_print_on_error(libzfs_handle_t *hdl, boolean_t printerr) { hdl->libzfs_printerr = printerr; } /* * Read lines from an open file descriptor and store them in an array of * strings until EOF. lines[] will be allocated and populated with all the * lines read. All newlines are replaced with NULL terminators for * convenience. lines[] must be freed after use with libzfs_free_str_array(). * * Returns the number of lines read. */ static int libzfs_read_stdout_from_fd(int fd, char **lines[]) { FILE *fp; int lines_cnt = 0; size_t len = 0; char *line = NULL; char **tmp_lines = NULL, **tmp; fp = fdopen(fd, "r"); if (fp == NULL) { close(fd); return (0); } while (getline(&line, &len, fp) != -1) { tmp = realloc(tmp_lines, sizeof (*tmp_lines) * (lines_cnt + 1)); if (tmp == NULL) { /* Return the lines we were able to process */ break; } tmp_lines = tmp; /* Remove newline if not EOF */ if (line[strlen(line) - 1] == '\n') line[strlen(line) - 1] = '\0'; tmp_lines[lines_cnt] = strdup(line); if (tmp_lines[lines_cnt] == NULL) break; ++lines_cnt; } free(line); fclose(fp); *lines = tmp_lines; return (lines_cnt); } static int libzfs_run_process_impl(const char *path, char *argv[], char *env[], int flags, char **lines[], int *lines_cnt) { pid_t pid; int error, devnull_fd; int link[2]; /* * Setup a pipe between our child and parent process if we're * reading stdout. */ if (lines != NULL && pipe2(link, O_NONBLOCK | O_CLOEXEC) == -1) return (-EPIPE); pid = fork(); if (pid == 0) { /* Child process */ devnull_fd = open("/dev/null", O_WRONLY | O_CLOEXEC); if (devnull_fd < 0) _exit(-1); if (!(flags & STDOUT_VERBOSE) && (lines == NULL)) (void) dup2(devnull_fd, STDOUT_FILENO); else if (lines != NULL) { /* Save the output to lines[] */ dup2(link[1], STDOUT_FILENO); } if (!(flags & STDERR_VERBOSE)) (void) dup2(devnull_fd, STDERR_FILENO); if (flags & NO_DEFAULT_PATH) { if (env == NULL) execv(path, argv); else execve(path, argv, env); } else { if (env == NULL) execvp(path, argv); else execvpe(path, argv, env); } _exit(-1); } else if (pid > 0) { /* Parent process */ int status; while ((error = waitpid(pid, &status, 0)) == -1 && errno == EINTR) ; if (error < 0 || !WIFEXITED(status)) return (-1); if (lines != NULL) { close(link[1]); *lines_cnt = libzfs_read_stdout_from_fd(link[0], lines); } return (WEXITSTATUS(status)); } return (-1); } int libzfs_run_process(const char *path, char *argv[], int flags) { return (libzfs_run_process_impl(path, argv, NULL, flags, NULL, NULL)); } /* * Run a command and store its stdout lines in an array of strings (lines[]). * lines[] is allocated and populated for you, and the number of lines is set in * lines_cnt. lines[] must be freed after use with libzfs_free_str_array(). * All newlines (\n) in lines[] are terminated for convenience. */ int libzfs_run_process_get_stdout(const char *path, char *argv[], char *env[], char **lines[], int *lines_cnt) { return (libzfs_run_process_impl(path, argv, env, 0, lines, lines_cnt)); } /* * Same as libzfs_run_process_get_stdout(), but run without $PATH set. This * means that *path needs to be the full path to the executable. */ int libzfs_run_process_get_stdout_nopath(const char *path, char *argv[], char *env[], char **lines[], int *lines_cnt) { return (libzfs_run_process_impl(path, argv, env, NO_DEFAULT_PATH, lines, lines_cnt)); } /* * Free an array of strings. Free both the strings contained in the array and * the array itself. */ void libzfs_free_str_array(char **strs, int count) { while (--count >= 0) free(strs[count]); free(strs); } /* * Returns 1 if environment variable is set to "YES", "yes", "ON", "on", or * a non-zero number. * * Returns 0 otherwise. */ boolean_t libzfs_envvar_is_set(const char *envvar) { char *env = getenv(envvar); return (env && (strtoul(env, NULL, 0) > 0 || (!strncasecmp(env, "YES", 3) && strnlen(env, 4) == 3) || (!strncasecmp(env, "ON", 2) && strnlen(env, 3) == 2))); } libzfs_handle_t * libzfs_init(void) { libzfs_handle_t *hdl; int error; char *env; if ((error = libzfs_load_module()) != 0) { errno = error; return (NULL); } if ((hdl = calloc(1, sizeof (libzfs_handle_t))) == NULL) { return (NULL); } if (regcomp(&hdl->libzfs_urire, URI_REGEX, 0) != 0) { free(hdl); return (NULL); } if ((hdl->libzfs_fd = open(ZFS_DEV, O_RDWR|O_EXCL|O_CLOEXEC)) < 0) { free(hdl); return (NULL); } if (libzfs_core_init() != 0) { (void) close(hdl->libzfs_fd); free(hdl); return (NULL); } zfs_prop_init(); zpool_prop_init(); zpool_feature_init(); vdev_prop_init(); libzfs_mnttab_init(hdl); fletcher_4_init(); if (getenv("ZFS_PROP_DEBUG") != NULL) { hdl->libzfs_prop_debug = B_TRUE; } if ((env = getenv("ZFS_SENDRECV_MAX_NVLIST")) != NULL) { if ((error = zfs_nicestrtonum(hdl, env, &hdl->libzfs_max_nvlist))) { errno = error; (void) close(hdl->libzfs_fd); free(hdl); return (NULL); } } else { hdl->libzfs_max_nvlist = (SPA_MAXBLOCKSIZE * 4); } /* * For testing, remove some settable properties and features */ if (libzfs_envvar_is_set("ZFS_SYSFS_PROP_SUPPORT_TEST")) { zprop_desc_t *proptbl; proptbl = zpool_prop_get_table(); proptbl[ZPOOL_PROP_COMMENT].pd_zfs_mod_supported = B_FALSE; proptbl = zfs_prop_get_table(); proptbl[ZFS_PROP_DNODESIZE].pd_zfs_mod_supported = B_FALSE; zfeature_info_t *ftbl = spa_feature_table; ftbl[SPA_FEATURE_LARGE_BLOCKS].fi_zfs_mod_supported = B_FALSE; } return (hdl); } void libzfs_fini(libzfs_handle_t *hdl) { (void) close(hdl->libzfs_fd); zpool_free_handles(hdl); namespace_clear(hdl); libzfs_mnttab_fini(hdl); libzfs_core_fini(); regfree(&hdl->libzfs_urire); fletcher_4_fini(); #if LIBFETCH_DYNAMIC if (hdl->libfetch != (void *)-1 && hdl->libfetch != NULL) (void) dlclose(hdl->libfetch); free(hdl->libfetch_load_error); #endif free(hdl); } libzfs_handle_t * zpool_get_handle(zpool_handle_t *zhp) { return (zhp->zpool_hdl); } libzfs_handle_t * zfs_get_handle(zfs_handle_t *zhp) { return (zhp->zfs_hdl); } zpool_handle_t * zfs_get_pool_handle(const zfs_handle_t *zhp) { return (zhp->zpool_hdl); } /* * Given a name, determine whether or not it's a valid path * (starts with '/' or "./"). If so, walk the mnttab trying * to match the device number. If not, treat the path as an * fs/vol/snap/bkmark name. */ zfs_handle_t * zfs_path_to_zhandle(libzfs_handle_t *hdl, const char *path, zfs_type_t argtype) { struct stat64 statbuf; struct extmnttab entry; if (path[0] != '/' && strncmp(path, "./", strlen("./")) != 0) { /* * It's not a valid path, assume it's a name of type 'argtype'. */ return (zfs_open(hdl, path, argtype)); } if (getextmntent(path, &entry, &statbuf) != 0) return (NULL); if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0) { (void) fprintf(stderr, gettext("'%s': not a ZFS filesystem\n"), path); return (NULL); } return (zfs_open(hdl, entry.mnt_special, ZFS_TYPE_FILESYSTEM)); } /* * Initialize the zc_nvlist_dst member to prepare for receiving an nvlist from * an ioctl(). */ void zcmd_alloc_dst_nvlist(libzfs_handle_t *hdl, zfs_cmd_t *zc, size_t len) { if (len == 0) len = 256 * 1024; zc->zc_nvlist_dst_size = len; zc->zc_nvlist_dst = (uint64_t)(uintptr_t)zfs_alloc(hdl, zc->zc_nvlist_dst_size); } /* * Called when an ioctl() which returns an nvlist fails with ENOMEM. This will * expand the nvlist to the size specified in 'zc_nvlist_dst_size', which was * filled in by the kernel to indicate the actual required size. */ void zcmd_expand_dst_nvlist(libzfs_handle_t *hdl, zfs_cmd_t *zc) { free((void *)(uintptr_t)zc->zc_nvlist_dst); zc->zc_nvlist_dst = (uint64_t)(uintptr_t)zfs_alloc(hdl, zc->zc_nvlist_dst_size); } /* * Called to free the src and dst nvlists stored in the command structure. */ void zcmd_free_nvlists(zfs_cmd_t *zc) { free((void *)(uintptr_t)zc->zc_nvlist_conf); free((void *)(uintptr_t)zc->zc_nvlist_src); free((void *)(uintptr_t)zc->zc_nvlist_dst); zc->zc_nvlist_conf = 0; zc->zc_nvlist_src = 0; zc->zc_nvlist_dst = 0; } static void zcmd_write_nvlist_com(libzfs_handle_t *hdl, uint64_t *outnv, uint64_t *outlen, nvlist_t *nvl) { char *packed; size_t len = fnvlist_size(nvl); packed = zfs_alloc(hdl, len); verify(nvlist_pack(nvl, &packed, &len, NV_ENCODE_NATIVE, 0) == 0); *outnv = (uint64_t)(uintptr_t)packed; *outlen = len; } void zcmd_write_conf_nvlist(libzfs_handle_t *hdl, zfs_cmd_t *zc, nvlist_t *nvl) { zcmd_write_nvlist_com(hdl, &zc->zc_nvlist_conf, &zc->zc_nvlist_conf_size, nvl); } void zcmd_write_src_nvlist(libzfs_handle_t *hdl, zfs_cmd_t *zc, nvlist_t *nvl) { zcmd_write_nvlist_com(hdl, &zc->zc_nvlist_src, &zc->zc_nvlist_src_size, nvl); } /* * Unpacks an nvlist from the ZFS ioctl command structure. */ int zcmd_read_dst_nvlist(libzfs_handle_t *hdl, zfs_cmd_t *zc, nvlist_t **nvlp) { if (nvlist_unpack((void *)(uintptr_t)zc->zc_nvlist_dst, zc->zc_nvlist_dst_size, nvlp, 0) != 0) return (no_memory(hdl)); return (0); } /* * ================================================================ * API shared by zfs and zpool property management * ================================================================ */ static void zprop_print_headers(zprop_get_cbdata_t *cbp, zfs_type_t type) { zprop_list_t *pl; int i; char *title; size_t len; cbp->cb_first = B_FALSE; if (cbp->cb_scripted) return; /* * Start with the length of the column headers. */ cbp->cb_colwidths[GET_COL_NAME] = strlen(dgettext(TEXT_DOMAIN, "NAME")); cbp->cb_colwidths[GET_COL_PROPERTY] = strlen(dgettext(TEXT_DOMAIN, "PROPERTY")); cbp->cb_colwidths[GET_COL_VALUE] = strlen(dgettext(TEXT_DOMAIN, "VALUE")); cbp->cb_colwidths[GET_COL_RECVD] = strlen(dgettext(TEXT_DOMAIN, "RECEIVED")); cbp->cb_colwidths[GET_COL_SOURCE] = strlen(dgettext(TEXT_DOMAIN, "SOURCE")); /* first property is always NAME */ assert(cbp->cb_proplist->pl_prop == ((type == ZFS_TYPE_POOL) ? ZPOOL_PROP_NAME : ((type == ZFS_TYPE_VDEV) ? VDEV_PROP_NAME : ZFS_PROP_NAME))); /* * Go through and calculate the widths for each column. For the * 'source' column, we kludge it up by taking the worst-case scenario of * inheriting from the longest name. This is acceptable because in the * majority of cases 'SOURCE' is the last column displayed, and we don't * use the width anyway. Note that the 'VALUE' column can be oversized, * if the name of the property is much longer than any values we find. */ for (pl = cbp->cb_proplist; pl != NULL; pl = pl->pl_next) { /* * 'PROPERTY' column */ if (pl->pl_prop != ZPROP_USERPROP) { const char *propname = (type == ZFS_TYPE_POOL) ? zpool_prop_to_name(pl->pl_prop) : ((type == ZFS_TYPE_VDEV) ? vdev_prop_to_name(pl->pl_prop) : zfs_prop_to_name(pl->pl_prop)); assert(propname != NULL); len = strlen(propname); if (len > cbp->cb_colwidths[GET_COL_PROPERTY]) cbp->cb_colwidths[GET_COL_PROPERTY] = len; } else { assert(pl->pl_user_prop != NULL); len = strlen(pl->pl_user_prop); if (len > cbp->cb_colwidths[GET_COL_PROPERTY]) cbp->cb_colwidths[GET_COL_PROPERTY] = len; } /* * 'VALUE' column. The first property is always the 'name' * property that was tacked on either by /sbin/zfs's * zfs_do_get() or when calling zprop_expand_list(), so we * ignore its width. If the user specified the name property * to display, then it will be later in the list in any case. */ if (pl != cbp->cb_proplist && pl->pl_width > cbp->cb_colwidths[GET_COL_VALUE]) cbp->cb_colwidths[GET_COL_VALUE] = pl->pl_width; /* 'RECEIVED' column. */ if (pl != cbp->cb_proplist && pl->pl_recvd_width > cbp->cb_colwidths[GET_COL_RECVD]) cbp->cb_colwidths[GET_COL_RECVD] = pl->pl_recvd_width; /* * 'NAME' and 'SOURCE' columns */ if (pl->pl_prop == ((type == ZFS_TYPE_POOL) ? ZPOOL_PROP_NAME : ((type == ZFS_TYPE_VDEV) ? VDEV_PROP_NAME : ZFS_PROP_NAME)) && pl->pl_width > cbp->cb_colwidths[GET_COL_NAME]) { cbp->cb_colwidths[GET_COL_NAME] = pl->pl_width; cbp->cb_colwidths[GET_COL_SOURCE] = pl->pl_width + strlen(dgettext(TEXT_DOMAIN, "inherited from")); } } /* * Now go through and print the headers. */ for (i = 0; i < ZFS_GET_NCOLS; i++) { switch (cbp->cb_columns[i]) { case GET_COL_NAME: title = dgettext(TEXT_DOMAIN, "NAME"); break; case GET_COL_PROPERTY: title = dgettext(TEXT_DOMAIN, "PROPERTY"); break; case GET_COL_VALUE: title = dgettext(TEXT_DOMAIN, "VALUE"); break; case GET_COL_RECVD: title = dgettext(TEXT_DOMAIN, "RECEIVED"); break; case GET_COL_SOURCE: title = dgettext(TEXT_DOMAIN, "SOURCE"); break; default: title = NULL; } if (title != NULL) { if (i == (ZFS_GET_NCOLS - 1) || cbp->cb_columns[i + 1] == GET_COL_NONE) (void) printf("%s", title); else (void) printf("%-*s ", cbp->cb_colwidths[cbp->cb_columns[i]], title); } } (void) printf("\n"); } /* * Display a single line of output, according to the settings in the callback * structure. */ void zprop_print_one_property(const char *name, zprop_get_cbdata_t *cbp, const char *propname, const char *value, zprop_source_t sourcetype, const char *source, const char *recvd_value) { int i; const char *str = NULL; char buf[128]; /* * Ignore those source types that the user has chosen to ignore. */ if ((sourcetype & cbp->cb_sources) == 0) return; if (cbp->cb_first) zprop_print_headers(cbp, cbp->cb_type); for (i = 0; i < ZFS_GET_NCOLS; i++) { switch (cbp->cb_columns[i]) { case GET_COL_NAME: str = name; break; case GET_COL_PROPERTY: str = propname; break; case GET_COL_VALUE: str = value; break; case GET_COL_SOURCE: switch (sourcetype) { case ZPROP_SRC_NONE: str = "-"; break; case ZPROP_SRC_DEFAULT: str = "default"; break; case ZPROP_SRC_LOCAL: str = "local"; break; case ZPROP_SRC_TEMPORARY: str = "temporary"; break; case ZPROP_SRC_INHERITED: (void) snprintf(buf, sizeof (buf), "inherited from %s", source); str = buf; break; case ZPROP_SRC_RECEIVED: str = "received"; break; default: str = NULL; assert(!"unhandled zprop_source_t"); } break; case GET_COL_RECVD: str = (recvd_value == NULL ? "-" : recvd_value); break; default: continue; } if (i == (ZFS_GET_NCOLS - 1) || cbp->cb_columns[i + 1] == GET_COL_NONE) (void) printf("%s", str); else if (cbp->cb_scripted) (void) printf("%s\t", str); else (void) printf("%-*s ", cbp->cb_colwidths[cbp->cb_columns[i]], str); } (void) printf("\n"); } /* * Given a numeric suffix, convert the value into a number of bits that the * resulting value must be shifted. */ static int str2shift(libzfs_handle_t *hdl, const char *buf) { const char *ends = "BKMGTPEZ"; int i; if (buf[0] == '\0') return (0); for (i = 0; i < strlen(ends); i++) { if (toupper(buf[0]) == ends[i]) break; } if (i == strlen(ends)) { if (hdl) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid numeric suffix '%s'"), buf); return (-1); } /* * Allow 'G' = 'GB' = 'GiB', case-insensitively. * However, 'BB' and 'BiB' are disallowed. */ if (buf[1] == '\0' || (toupper(buf[0]) != 'B' && ((toupper(buf[1]) == 'B' && buf[2] == '\0') || (toupper(buf[1]) == 'I' && toupper(buf[2]) == 'B' && buf[3] == '\0')))) return (10 * i); if (hdl) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid numeric suffix '%s'"), buf); return (-1); } /* * Convert a string of the form '100G' into a real number. Used when setting * properties or creating a volume. 'buf' is used to place an extended error * message for the caller to use. */ int zfs_nicestrtonum(libzfs_handle_t *hdl, const char *value, uint64_t *num) { char *end; int shift; *num = 0; /* Check to see if this looks like a number. */ if ((value[0] < '0' || value[0] > '9') && value[0] != '.') { if (hdl) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "bad numeric value '%s'"), value); return (-1); } /* Rely on strtoull() to process the numeric portion. */ errno = 0; *num = strtoull(value, &end, 10); /* * Check for ERANGE, which indicates that the value is too large to fit * in a 64-bit value. */ if (errno == ERANGE) { if (hdl) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "numeric value is too large")); return (-1); } /* * If we have a decimal value, then do the computation with floating * point arithmetic. Otherwise, use standard arithmetic. */ if (*end == '.') { double fval = strtod(value, &end); if ((shift = str2shift(hdl, end)) == -1) return (-1); fval *= pow(2, shift); /* * UINT64_MAX is not exactly representable as a double. * The closest representation is UINT64_MAX + 1, so we * use a >= comparison instead of > for the bounds check. */ if (fval >= (double)UINT64_MAX) { if (hdl) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "numeric value is too large")); return (-1); } *num = (uint64_t)fval; } else { if ((shift = str2shift(hdl, end)) == -1) return (-1); /* Check for overflow */ if (shift >= 64 || (*num << shift) >> shift != *num) { if (hdl) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "numeric value is too large")); return (-1); } *num <<= shift; } return (0); } /* * Given a propname=value nvpair to set, parse any numeric properties * (index, boolean, etc) if they are specified as strings and add the * resulting nvpair to the returned nvlist. * * At the DSL layer, all properties are either 64-bit numbers or strings. * We want the user to be able to ignore this fact and specify properties * as native values (numbers, for example) or as strings (to simplify * command line utilities). This also handles converting index types * (compression, checksum, etc) from strings to their on-disk index. */ int zprop_parse_value(libzfs_handle_t *hdl, nvpair_t *elem, int prop, zfs_type_t type, nvlist_t *ret, const char **svalp, uint64_t *ivalp, const char *errbuf) { data_type_t datatype = nvpair_type(elem); zprop_type_t proptype; const char *propname; const char *value; boolean_t isnone = B_FALSE; boolean_t isauto = B_FALSE; int err = 0; if (type == ZFS_TYPE_POOL) { proptype = zpool_prop_get_type(prop); propname = zpool_prop_to_name(prop); } else if (type == ZFS_TYPE_VDEV) { proptype = vdev_prop_get_type(prop); propname = vdev_prop_to_name(prop); } else { proptype = zfs_prop_get_type(prop); propname = zfs_prop_to_name(prop); } /* * Convert any properties to the internal DSL value types. */ *svalp = NULL; *ivalp = 0; switch (proptype) { case PROP_TYPE_STRING: if (datatype != DATA_TYPE_STRING) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' must be a string"), nvpair_name(elem)); goto error; } err = nvpair_value_string(elem, svalp); if (err != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' is invalid"), nvpair_name(elem)); goto error; } if (strlen(*svalp) >= ZFS_MAXPROPLEN) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' is too long"), nvpair_name(elem)); goto error; } break; case PROP_TYPE_NUMBER: if (datatype == DATA_TYPE_STRING) { (void) nvpair_value_string(elem, &value); if (strcmp(value, "none") == 0) { isnone = B_TRUE; } else if (strcmp(value, "auto") == 0) { isauto = B_TRUE; } else if (zfs_nicestrtonum(hdl, value, ivalp) != 0) { goto error; } } else if (datatype == DATA_TYPE_UINT64) { (void) nvpair_value_uint64(elem, ivalp); } else { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' must be a number"), nvpair_name(elem)); goto error; } /* * Quota special: force 'none' and don't allow 0. */ if ((type & ZFS_TYPE_DATASET) && *ivalp == 0 && !isnone && (prop == ZFS_PROP_QUOTA || prop == ZFS_PROP_REFQUOTA)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "use 'none' to disable quota/refquota")); goto error; } /* * Special handling for "*_limit=none". In this case it's not * 0 but UINT64_MAX. */ if ((type & ZFS_TYPE_DATASET) && isnone && (prop == ZFS_PROP_FILESYSTEM_LIMIT || prop == ZFS_PROP_SNAPSHOT_LIMIT)) { *ivalp = UINT64_MAX; } /* * Special handling for "checksum_*=none". In this case it's not * 0 but UINT64_MAX. */ if ((type & ZFS_TYPE_VDEV) && isnone && (prop == VDEV_PROP_CHECKSUM_N || prop == VDEV_PROP_CHECKSUM_T || prop == VDEV_PROP_IO_N || - prop == VDEV_PROP_IO_T)) { + prop == VDEV_PROP_IO_T || + prop == VDEV_PROP_SLOW_IO_N || + prop == VDEV_PROP_SLOW_IO_T)) { *ivalp = UINT64_MAX; } /* * Special handling for setting 'refreservation' to 'auto'. Use * UINT64_MAX to tell the caller to use zfs_fix_auto_resv(). * 'auto' is only allowed on volumes. */ if (isauto) { switch (prop) { case ZFS_PROP_REFRESERVATION: if ((type & ZFS_TYPE_VOLUME) == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s=auto' only allowed on " "volumes"), nvpair_name(elem)); goto error; } *ivalp = UINT64_MAX; break; default: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'auto' is invalid value for '%s'"), nvpair_name(elem)); goto error; } } break; case PROP_TYPE_INDEX: if (datatype != DATA_TYPE_STRING) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' must be a string"), nvpair_name(elem)); goto error; } (void) nvpair_value_string(elem, &value); if (zprop_string_to_index(prop, value, ivalp, type) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' must be one of '%s'"), propname, zprop_values(prop, type)); goto error; } break; default: abort(); } /* * Add the result to our return set of properties. */ if (*svalp != NULL) { if (nvlist_add_string(ret, propname, *svalp) != 0) { (void) no_memory(hdl); return (-1); } } else { if (nvlist_add_uint64(ret, propname, *ivalp) != 0) { (void) no_memory(hdl); return (-1); } } return (0); error: (void) zfs_error(hdl, EZFS_BADPROP, errbuf); return (-1); } static int addlist(libzfs_handle_t *hdl, const char *propname, zprop_list_t **listp, zfs_type_t type) { int prop = zprop_name_to_prop(propname, type); if (prop != ZPROP_INVAL && !zprop_valid_for_type(prop, type, B_FALSE)) prop = ZPROP_INVAL; /* * Return failure if no property table entry was found and this isn't * a user-defined property. */ if (prop == ZPROP_USERPROP && ((type == ZFS_TYPE_POOL && !zfs_prop_user(propname) && !zpool_prop_feature(propname) && !zpool_prop_unsupported(propname)) || ((type == ZFS_TYPE_DATASET) && !zfs_prop_user(propname) && !zfs_prop_userquota(propname) && !zfs_prop_written(propname)) || ((type == ZFS_TYPE_VDEV) && !vdev_prop_user(propname)))) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid property '%s'"), propname); return (zfs_error(hdl, EZFS_BADPROP, dgettext(TEXT_DOMAIN, "bad property list"))); } zprop_list_t *entry = zfs_alloc(hdl, sizeof (*entry)); entry->pl_prop = prop; if (prop == ZPROP_USERPROP) { entry->pl_user_prop = zfs_strdup(hdl, propname); entry->pl_width = strlen(propname); } else { entry->pl_width = zprop_width(prop, &entry->pl_fixed, type); } *listp = entry; return (0); } /* * Given a comma-separated list of properties, construct a property list * containing both user-defined and native properties. This function will * return a NULL list if 'all' is specified, which can later be expanded * by zprop_expand_list(). */ int zprop_get_list(libzfs_handle_t *hdl, char *props, zprop_list_t **listp, zfs_type_t type) { *listp = NULL; /* * If 'all' is specified, return a NULL list. */ if (strcmp(props, "all") == 0) return (0); /* * If no props were specified, return an error. */ if (props[0] == '\0') { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "no properties specified")); return (zfs_error(hdl, EZFS_BADPROP, dgettext(TEXT_DOMAIN, "bad property list"))); } for (char *p; (p = strsep(&props, ",")); ) if (strcmp(p, "space") == 0) { static const char *const spaceprops[] = { "name", "avail", "used", "usedbysnapshots", "usedbydataset", "usedbyrefreservation", "usedbychildren" }; for (int i = 0; i < ARRAY_SIZE(spaceprops); i++) { if (addlist(hdl, spaceprops[i], listp, type)) return (-1); listp = &(*listp)->pl_next; } } else { if (addlist(hdl, p, listp, type)) return (-1); listp = &(*listp)->pl_next; } return (0); } void zprop_free_list(zprop_list_t *pl) { zprop_list_t *next; while (pl != NULL) { next = pl->pl_next; free(pl->pl_user_prop); free(pl); pl = next; } } typedef struct expand_data { zprop_list_t **last; libzfs_handle_t *hdl; zfs_type_t type; } expand_data_t; static int zprop_expand_list_cb(int prop, void *cb) { zprop_list_t *entry; expand_data_t *edp = cb; entry = zfs_alloc(edp->hdl, sizeof (zprop_list_t)); entry->pl_prop = prop; entry->pl_width = zprop_width(prop, &entry->pl_fixed, edp->type); entry->pl_all = B_TRUE; *(edp->last) = entry; edp->last = &entry->pl_next; return (ZPROP_CONT); } int zprop_expand_list(libzfs_handle_t *hdl, zprop_list_t **plp, zfs_type_t type) { zprop_list_t *entry; zprop_list_t **last; expand_data_t exp; if (*plp == NULL) { /* * If this is the very first time we've been called for an 'all' * specification, expand the list to include all native * properties. */ last = plp; exp.last = last; exp.hdl = hdl; exp.type = type; if (zprop_iter_common(zprop_expand_list_cb, &exp, B_FALSE, B_FALSE, type) == ZPROP_INVAL) return (-1); /* * Add 'name' to the beginning of the list, which is handled * specially. */ entry = zfs_alloc(hdl, sizeof (zprop_list_t)); entry->pl_prop = ((type == ZFS_TYPE_POOL) ? ZPOOL_PROP_NAME : ((type == ZFS_TYPE_VDEV) ? VDEV_PROP_NAME : ZFS_PROP_NAME)); entry->pl_width = zprop_width(entry->pl_prop, &entry->pl_fixed, type); entry->pl_all = B_TRUE; entry->pl_next = *plp; *plp = entry; } return (0); } int zprop_iter(zprop_func func, void *cb, boolean_t show_all, boolean_t ordered, zfs_type_t type) { return (zprop_iter_common(func, cb, show_all, ordered, type)); } const char * zfs_version_userland(void) { return (ZFS_META_ALIAS); } /* * Prints both zfs userland and kernel versions * Returns 0 on success, and -1 on error */ int zfs_version_print(void) { (void) puts(ZFS_META_ALIAS); char *kver = zfs_version_kernel(); if (kver == NULL) { fprintf(stderr, "zfs_version_kernel() failed: %s\n", zfs_strerror(errno)); return (-1); } (void) printf("zfs-kmod-%s\n", kver); free(kver); return (0); } /* * Return 1 if the user requested ANSI color output, and our terminal supports * it. Return 0 for no color. */ int use_color(void) { static int use_color = -1; char *term; /* * Optimization: * * For each zpool invocation, we do a single check to see if we should * be using color or not, and cache that value for the lifetime of the * the zpool command. That makes it cheap to call use_color() when * we're printing with color. We assume that the settings are not going * to change during the invocation of a zpool command (the user isn't * going to change the ZFS_COLOR value while zpool is running, for * example). */ if (use_color != -1) { /* * We've already figured out if we should be using color or * not. Return the cached value. */ return (use_color); } term = getenv("TERM"); /* * The user sets the ZFS_COLOR env var set to enable zpool ANSI color * output. However if NO_COLOR is set (https://no-color.org/) then * don't use it. Also, don't use color if terminal doesn't support * it. */ if (libzfs_envvar_is_set("ZFS_COLOR") && !libzfs_envvar_is_set("NO_COLOR") && isatty(STDOUT_FILENO) && term && strcmp("dumb", term) != 0 && strcmp("unknown", term) != 0) { /* Color supported */ use_color = 1; } else { use_color = 0; } return (use_color); } /* * The functions color_start() and color_end() are used for when you want * to colorize a block of text. * * For example: * color_start(ANSI_RED) * printf("hello"); * printf("world"); * color_end(); */ void color_start(const char *color) { if (color && use_color()) { fputs(color, stdout); fflush(stdout); } } void color_end(void) { if (use_color()) { fputs(ANSI_RESET, stdout); fflush(stdout); } } /* * printf() with a color. If color is NULL, then do a normal printf. */ int printf_color(const char *color, const char *format, ...) { va_list aptr; int rc; if (color) color_start(color); va_start(aptr, format); rc = vprintf(format, aptr); va_end(aptr); if (color) color_end(); return (rc); } /* PATH + 5 env vars + a NULL entry = 7 */ #define ZPOOL_VDEV_SCRIPT_ENV_COUNT 7 /* * There's a few places where ZFS will call external scripts (like the script * in zpool.d/ and `zfs_prepare_disk`). These scripts are called with a * reduced $PATH, and some vdev specific environment vars set. This function * will allocate an populate the environment variable array that is passed to * these scripts. The user must free the arrays with zpool_vdev_free_env() when * they are done. * * The following env vars will be set (but value could be blank): * * POOL_NAME * VDEV_PATH * VDEV_UPATH * VDEV_ENC_SYSFS_PATH * * In addition, you can set an optional environment variable named 'opt_key' * to 'opt_val' if you want. * * Returns allocated env[] array on success, NULL otherwise. */ char ** zpool_vdev_script_alloc_env(const char *pool_name, const char *vdev_path, const char *vdev_upath, const char *vdev_enc_sysfs_path, const char *opt_key, const char *opt_val) { char **env = NULL; int rc; env = calloc(ZPOOL_VDEV_SCRIPT_ENV_COUNT, sizeof (*env)); if (!env) return (NULL); env[0] = strdup("PATH=/bin:/sbin:/usr/bin:/usr/sbin"); if (!env[0]) goto error; /* Setup our custom environment variables */ rc = asprintf(&env[1], "POOL_NAME=%s", pool_name ? pool_name : ""); if (rc == -1) { env[1] = NULL; goto error; } rc = asprintf(&env[2], "VDEV_PATH=%s", vdev_path ? vdev_path : ""); if (rc == -1) { env[2] = NULL; goto error; } rc = asprintf(&env[3], "VDEV_UPATH=%s", vdev_upath ? vdev_upath : ""); if (rc == -1) { env[3] = NULL; goto error; } rc = asprintf(&env[4], "VDEV_ENC_SYSFS_PATH=%s", vdev_enc_sysfs_path ? vdev_enc_sysfs_path : ""); if (rc == -1) { env[4] = NULL; goto error; } if (opt_key != NULL) { rc = asprintf(&env[5], "%s=%s", opt_key, opt_val ? opt_val : ""); if (rc == -1) { env[5] = NULL; goto error; } } return (env); error: for (int i = 0; i < ZPOOL_VDEV_SCRIPT_ENV_COUNT; i++) free(env[i]); free(env); return (NULL); } /* * Free the env[] array that was allocated by zpool_vdev_script_alloc_env(). */ void zpool_vdev_script_free_env(char **env) { for (int i = 0; i < ZPOOL_VDEV_SCRIPT_ENV_COUNT; i++) free(env[i]); free(env); } /* * Prepare a disk by (optionally) running a program before labeling the disk. * This can be useful for installing disk firmware or doing some pre-flight * checks on the disk before it becomes part of the pool. The program run is * located at ZFSEXECDIR/zfs_prepare_disk * (E.x: /usr/local/libexec/zfs/zfs_prepare_disk). * * Return 0 on success, non-zero on failure. */ int zpool_prepare_disk(zpool_handle_t *zhp, nvlist_t *vdev_nv, const char *prepare_str, char **lines[], int *lines_cnt) { const char *script_path = ZFSEXECDIR "/zfs_prepare_disk"; const char *pool_name; int rc = 0; /* Path to script and a NULL entry */ char *argv[2] = {(char *)script_path}; char **env = NULL; const char *path = NULL, *enc_sysfs_path = NULL; char *upath; *lines_cnt = 0; if (access(script_path, X_OK) != 0) { /* No script, nothing to do */ return (0); } (void) nvlist_lookup_string(vdev_nv, ZPOOL_CONFIG_PATH, &path); (void) nvlist_lookup_string(vdev_nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, &enc_sysfs_path); upath = zfs_get_underlying_path(path); pool_name = zhp ? zpool_get_name(zhp) : NULL; env = zpool_vdev_script_alloc_env(pool_name, path, upath, enc_sysfs_path, "VDEV_PREPARE", prepare_str); free(upath); if (env == NULL) { return (ENOMEM); } rc = libzfs_run_process_get_stdout(script_path, argv, env, lines, lines_cnt); zpool_vdev_script_free_env(env); return (rc); } /* * Optionally run a script and then label a disk. The script can be used to * prepare a disk for inclusion into the pool. For example, it might update * the disk's firmware or check its health. * * The 'name' provided is the short name, stripped of any leading * /dev path, and is passed to zpool_label_disk. vdev_nv is the nvlist for * the vdev. prepare_str is a string that gets passed as the VDEV_PREPARE * env variable to the script. * * The following env vars are passed to the script: * * POOL_NAME: The pool name (blank during zpool create) * VDEV_PREPARE: Reason why the disk is being prepared for inclusion: * "create", "add", "replace", or "autoreplace" * VDEV_PATH: Path to the disk * VDEV_UPATH: One of the 'underlying paths' to the disk. This is * useful for DM devices. * VDEV_ENC_SYSFS_PATH: Path to the disk's enclosure sysfs path, if available. * * Note, some of these values can be blank. * * Return 0 on success, non-zero otherwise. */ int zpool_prepare_and_label_disk(libzfs_handle_t *hdl, zpool_handle_t *zhp, const char *name, nvlist_t *vdev_nv, const char *prepare_str, char **lines[], int *lines_cnt) { int rc; char vdev_path[MAXPATHLEN]; (void) snprintf(vdev_path, sizeof (vdev_path), "%s/%s", DISK_ROOT, name); /* zhp will be NULL when creating a pool */ rc = zpool_prepare_disk(zhp, vdev_nv, prepare_str, lines, lines_cnt); if (rc != 0) return (rc); rc = zpool_label_disk(hdl, zhp, name); return (rc); } diff --git a/man/man7/vdevprops.7 b/man/man7/vdevprops.7 index 6eebfa0060de..3d3ebc072915 100644 --- a/man/man7/vdevprops.7 +++ b/man/man7/vdevprops.7 @@ -1,186 +1,186 @@ .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or https://opensource.org/licenses/CDDL-1.0. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" Copyright (c) 2021 Klara, Inc. .\" .Dd October 30, 2022 .Dt VDEVPROPS 7 .Os . .Sh NAME .Nm vdevprops .Nd native and user-defined properties of ZFS vdevs . .Sh DESCRIPTION Properties are divided into two types, native properties and user-defined .Pq or Qq user properties. Native properties either export internal statistics or control ZFS behavior. In addition, native properties are either editable or read-only. User properties have no effect on ZFS behavior, but you can use them to annotate vdevs in a way that is meaningful in your environment. For more information about user properties, see the .Sx User Properties section, below. . .Ss Native Properties Every vdev has a set of properties that export statistics about the vdev as well as control various behaviors. Properties are not inherited from top-level vdevs, with the exception of -checksum_n, checksum_t, io_n, and io_t. +checksum_n, checksum_t, io_n, io_t, slow_io_n, and slow_io_t. .Pp The values of numeric properties can be specified using human-readable suffixes .Po for example, .Sy k , KB , M , Gb , and so forth, up to .Sy Z for zettabyte .Pc . The following are all valid .Pq and equal specifications: .Li 1536M , 1.5g , 1.50GB . .Pp The values of non-numeric properties are case sensitive and must be lowercase. .Pp The following native properties consist of read-only statistics about the vdev. These properties can not be changed. .Bl -tag -width "fragmentation" .It Sy capacity Percentage of vdev space used .It Sy state state of this vdev such as online, faulted, or offline .It Sy guid globally unique id of this vdev .It Sy asize The allocable size of this vdev .It Sy psize The physical size of this vdev .It Sy ashift The physical sector size of this vdev expressed as the power of two .It Sy size The total size of this vdev .It Sy free The amount of remaining free space on this vdev .It Sy allocated The amount of allocated space on this vdev .It Sy expandsize How much this vdev can expand by .It Sy fragmentation Percent of fragmentation in this vdev .It Sy parity The level of parity for this vdev .It Sy devid The device id for this vdev .It Sy physpath The physical path to the device .It Sy encpath The enclosure path to the device .It Sy fru Field Replacable Unit, usually a model number .It Sy parent Parent of this vdev .It Sy children Comma separated list of children of this vdev .It Sy numchildren The number of children belonging to this vdev .It Sy read_errors , write_errors , checksum_errors , initialize_errors The number of errors of each type encountered by this vdev .It Sy null_ops , read_ops , write_ops , free_ops , claim_ops , trim_ops The number of I/O operations of each type performed by this vdev .It Xo .Sy null_bytes , read_bytes , write_bytes , free_bytes , claim_bytes , .Sy trim_bytes .Xc The cumulative size of all operations of each type performed by this vdev .It Sy removing If this device is currently being removed from the pool .El .Pp The following native properties can be used to change the behavior of a vdev. .Bl -tag -width "allocating" -.It Sy checksum_n , checksum_t , io_n , io_t +.It Sy checksum_n , checksum_t , io_n , io_t , slow_io_n , slow_io_t Tune the fault management daemon by specifying checksum/io thresholds of errors in seconds, respectively. These properties can be set on leaf and top-level vdevs. When the property is set on the leaf and top-level vdev, the value of the leaf vdev will be used. If the property is only set on the top-level vdev, this value will be used. The value of these properties do not persist across vdev replacement. For this reason, it is advisable to set the property on the top-level vdev - not on the leaf vdev itself. The default values are 10 errors in 600 seconds. .It Sy comment A text comment up to 8192 characters long .It Sy bootsize The amount of space to reserve for the EFI system partition .It Sy failfast If this device should propage BIO errors back to ZFS, used to disable failfast. .It Sy path The path to the device for this vdev .It Sy allocating If this device should perform new allocations, used to disable a device when it is scheduled for later removal. See .Xr zpool-remove 8 . .El .Ss User Properties In addition to the standard native properties, ZFS supports arbitrary user properties. User properties have no effect on ZFS behavior, but applications or administrators can use them to annotate vdevs. .Pp User property names must contain a colon .Pq Qq Sy \&: character to distinguish them from native properties. They may contain lowercase letters, numbers, and the following punctuation characters: colon .Pq Qq Sy \&: , dash .Pq Qq Sy - , period .Pq Qq Sy \&. , and underscore .Pq Qq Sy _ . The expected convention is that the property name is divided into two portions such as .Ar module : Ns Ar property , but this namespace is not enforced by ZFS. User property names can be at most 256 characters, and cannot begin with a dash .Pq Qq Sy - . .Pp When making programmatic use of user properties, it is strongly suggested to use a reversed DNS domain name for the .Ar module component of property names to reduce the chance that two independently-developed packages use the same property name for different purposes. .Pp The values of user properties are arbitrary strings and are never validated. Use the .Nm zpool Cm set command with a blank value to clear a user property. Property values are limited to 8192 bytes. .Sh SEE ALSO .Xr zpoolprops 7 , .Xr zpool-set 8 diff --git a/man/man7/zpoolconcepts.7 b/man/man7/zpoolconcepts.7 index 98f3ee7cd660..18dfca6dc8ac 100644 --- a/man/man7/zpoolconcepts.7 +++ b/man/man7/zpoolconcepts.7 @@ -1,512 +1,512 @@ .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or https://opensource.org/licenses/CDDL-1.0. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. .\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. .\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. .\" Copyright (c) 2017 Datto Inc. .\" Copyright (c) 2018 George Melikov. All Rights Reserved. .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" .Dd April 7, 2023 .Dt ZPOOLCONCEPTS 7 .Os . .Sh NAME .Nm zpoolconcepts .Nd overview of ZFS storage pools . .Sh DESCRIPTION .Ss Virtual Devices (vdevs) A "virtual device" describes a single device or a collection of devices, organized according to certain performance and fault characteristics. The following virtual devices are supported: .Bl -tag -width "special" .It Sy disk A block device, typically located under .Pa /dev . ZFS can use individual slices or partitions, though the recommended mode of operation is to use whole disks. A disk can be specified by a full path, or it can be a shorthand name .Po the relative portion of the path under .Pa /dev .Pc . A whole disk can be specified by omitting the slice or partition designation. For example, .Pa sda is equivalent to .Pa /dev/sda . When given a whole disk, ZFS automatically labels the disk, if necessary. .It Sy file A regular file. The use of files as a backing store is strongly discouraged. It is designed primarily for experimental purposes, as the fault tolerance of a file is only as good as the file system on which it resides. A file must be specified by a full path. .It Sy mirror A mirror of two or more devices. Data is replicated in an identical fashion across all components of a mirror. A mirror with .Em N No disks of size Em X No can hold Em X No bytes and can withstand Em N-1 devices failing, without losing data. .It Sy raidz , raidz1 , raidz2 , raidz3 A distributed-parity layout, similar to RAID-5/6, with improved distribution of parity, and which does not suffer from the RAID-5/6 .Qq write hole , .Pq in which data and parity become inconsistent after a power loss . Data and parity is striped across all disks within a raidz group, though not necessarily in a consistent stripe width. .Pp A raidz group can have single, double, or triple parity, meaning that the raidz group can sustain one, two, or three failures, respectively, without losing any data. The .Sy raidz1 vdev type specifies a single-parity raidz group; the .Sy raidz2 vdev type specifies a double-parity raidz group; and the .Sy raidz3 vdev type specifies a triple-parity raidz group. The .Sy raidz vdev type is an alias for .Sy raidz1 . .Pp A raidz group with .Em N No disks of size Em X No with Em P No parity disks can hold approximately .Em (N-P)*X No bytes and can withstand Em P No devices failing without losing data . The minimum number of devices in a raidz group is one more than the number of parity disks. The recommended number is between 3 and 9 to help increase performance. .It Sy draid , draid1 , draid2 , draid3 A variant of raidz that provides integrated distributed hot spares, allowing for faster resilvering, while retaining the benefits of raidz. A dRAID vdev is constructed from multiple internal raidz groups, each with .Em D No data devices and Em P No parity devices . These groups are distributed over all of the children in order to fully utilize the available disk performance. .Pp Unlike raidz, dRAID uses a fixed stripe width (padding as necessary with zeros) to allow fully sequential resilvering. This fixed stripe width significantly affects both usable capacity and IOPS. For example, with the default .Em D=8 No and Em 4 KiB No disk sectors the minimum allocation size is Em 32 KiB . If using compression, this relatively large allocation size can reduce the effective compression ratio. When using ZFS volumes (zvols) and dRAID, the default of the .Sy volblocksize property is increased to account for the allocation size. If a dRAID pool will hold a significant amount of small blocks, it is recommended to also add a mirrored .Sy special vdev to store those blocks. .Pp In regards to I/O, performance is similar to raidz since, for any read, all .Em D No data disks must be accessed . Delivered random IOPS can be reasonably approximated as .Sy floor((N-S)/(D+P))*single_drive_IOPS . .Pp Like raidz, a dRAID can have single-, double-, or triple-parity. The .Sy draid1 , .Sy draid2 , and .Sy draid3 types can be used to specify the parity level. The .Sy draid vdev type is an alias for .Sy draid1 . .Pp A dRAID with .Em N No disks of size Em X , D No data disks per redundancy group , Em P .No parity level, and Em S No distributed hot spares can hold approximately .Em (N-S)*(D/(D+P))*X No bytes and can withstand Em P devices failing without losing data. .It Sy draid Ns Oo Ar parity Oc Ns Oo Sy \&: Ns Ar data Ns Sy d Oc Ns Oo Sy \&: Ns Ar children Ns Sy c Oc Ns Oo Sy \&: Ns Ar spares Ns Sy s Oc A non-default dRAID configuration can be specified by appending one or more of the following optional arguments to the .Sy draid keyword: .Bl -tag -compact -width "children" .It Ar parity The parity level (1-3). .It Ar data The number of data devices per redundancy group. In general, a smaller value of .Em D No will increase IOPS, improve the compression ratio , and speed up resilvering at the expense of total usable capacity. Defaults to .Em 8 , No unless Em N-P-S No is less than Em 8 . .It Ar children The expected number of children. Useful as a cross-check when listing a large number of devices. An error is returned when the provided number of children differs. .It Ar spares The number of distributed hot spares. Defaults to zero. .El .It Sy spare A pseudo-vdev which keeps track of available hot spares for a pool. For more information, see the .Sx Hot Spares section. .It Sy log A separate intent log device. If more than one log device is specified, then writes are load-balanced between devices. Log devices can be mirrored. However, raidz vdev types are not supported for the intent log. For more information, see the .Sx Intent Log section. .It Sy dedup A device solely dedicated for deduplication tables. The redundancy of this device should match the redundancy of the other normal devices in the pool. If more than one dedup device is specified, then allocations are load-balanced between those devices. .It Sy special A device dedicated solely for allocating various kinds of internal metadata, and optionally small file blocks. The redundancy of this device should match the redundancy of the other normal devices in the pool. If more than one special device is specified, then allocations are load-balanced between those devices. .Pp For more information on special allocations, see the .Sx Special Allocation Class section. .It Sy cache A device used to cache storage pool data. A cache device cannot be configured as a mirror or raidz group. For more information, see the .Sx Cache Devices section. .El .Pp Virtual devices cannot be nested arbitrarily. A mirror, raidz or draid virtual device can only be created with files or disks. Mirrors of mirrors or other such combinations are not allowed. .Pp A pool can have any number of virtual devices at the top of the configuration .Po known as .Qq root vdevs .Pc . Data is dynamically distributed across all top-level devices to balance data among devices. As new virtual devices are added, ZFS automatically places data on the newly available devices. .Pp Virtual devices are specified one at a time on the command line, separated by whitespace. Keywords like .Sy mirror No and Sy raidz are used to distinguish where a group ends and another begins. For example, the following creates a pool with two root vdevs, each a mirror of two disks: .Dl # Nm zpool Cm create Ar mypool Sy mirror Ar sda sdb Sy mirror Ar sdc sdd . .Ss Device Failure and Recovery ZFS supports a rich set of mechanisms for handling device failure and data corruption. All metadata and data is checksummed, and ZFS automatically repairs bad data from a good copy, when corruption is detected. .Pp In order to take advantage of these features, a pool must make use of some form of redundancy, using either mirrored or raidz groups. While ZFS supports running in a non-redundant configuration, where each root vdev is simply a disk or file, this is strongly discouraged. A single case of bit corruption can render some or all of your data unavailable. .Pp A pool's health status is described by one of three states: .Sy online , degraded , No or Sy faulted . An online pool has all devices operating normally. A degraded pool is one in which one or more devices have failed, but the data is still available due to a redundant configuration. A faulted pool has corrupted metadata, or one or more faulted devices, and insufficient replicas to continue functioning. .Pp The health of the top-level vdev, such as a mirror or raidz device, is potentially impacted by the state of its associated vdevs or component devices. A top-level vdev or component device is in one of the following states: .Bl -tag -width "DEGRADED" .It Sy DEGRADED One or more top-level vdevs is in the degraded state because one or more component devices are offline. Sufficient replicas exist to continue functioning. .Pp One or more component devices is in the degraded or faulted state, but sufficient replicas exist to continue functioning. The underlying conditions are as follows: .Bl -bullet -compact .It -The number of checksum errors exceeds acceptable levels and the device is -degraded as an indication that something may be wrong. +The number of checksum errors or slow I/Os exceeds acceptable levels and the +device is degraded as an indication that something may be wrong. ZFS continues to use the device as necessary. .It The number of I/O errors exceeds acceptable levels. The device could not be marked as faulted because there are insufficient replicas to continue functioning. .El .It Sy FAULTED One or more top-level vdevs is in the faulted state because one or more component devices are offline. Insufficient replicas exist to continue functioning. .Pp One or more component devices is in the faulted state, and insufficient replicas exist to continue functioning. The underlying conditions are as follows: .Bl -bullet -compact .It The device could be opened, but the contents did not match expected values. .It The number of I/O errors exceeds acceptable levels and the device is faulted to prevent further use of the device. .El .It Sy OFFLINE The device was explicitly taken offline by the .Nm zpool Cm offline command. .It Sy ONLINE The device is online and functioning. .It Sy REMOVED The device was physically removed while the system was running. Device removal detection is hardware-dependent and may not be supported on all platforms. .It Sy UNAVAIL The device could not be opened. If a pool is imported when a device was unavailable, then the device will be identified by a unique identifier instead of its path since the path was never correct in the first place. .El .Pp Checksum errors represent events where a disk returned data that was expected to be correct, but was not. In other words, these are instances of silent data corruption. The checksum errors are reported in .Nm zpool Cm status and .Nm zpool Cm events . When a block is stored redundantly, a damaged block may be reconstructed (e.g. from raidz parity or a mirrored copy). In this case, ZFS reports the checksum error against the disks that contained damaged data. If a block is unable to be reconstructed (e.g. due to 3 disks being damaged in a raidz2 group), it is not possible to determine which disks were silently corrupted. In this case, checksum errors are reported for all disks on which the block is stored. .Pp If a device is removed and later re-attached to the system, ZFS attempts to bring the device online automatically. Device attachment detection is hardware-dependent and might not be supported on all platforms. . .Ss Hot Spares ZFS allows devices to be associated with pools as .Qq hot spares . These devices are not actively used in the pool. But, when an active device fails, it is automatically replaced by a hot spare. To create a pool with hot spares, specify a .Sy spare vdev with any number of devices. For example, .Dl # Nm zpool Cm create Ar pool Sy mirror Ar sda sdb Sy spare Ar sdc sdd .Pp Spares can be shared across multiple pools, and can be added with the .Nm zpool Cm add command and removed with the .Nm zpool Cm remove command. Once a spare replacement is initiated, a new .Sy spare vdev is created within the configuration that will remain there until the original device is replaced. At this point, the hot spare becomes available again, if another device fails. .Pp If a pool has a shared spare that is currently being used, the pool cannot be exported, since other pools may use this shared spare, which may lead to potential data corruption. .Pp Shared spares add some risk. If the pools are imported on different hosts, and both pools suffer a device failure at the same time, both could attempt to use the spare at the same time. This may not be detected, resulting in data corruption. .Pp An in-progress spare replacement can be cancelled by detaching the hot spare. If the original faulted device is detached, then the hot spare assumes its place in the configuration, and is removed from the spare list of all active pools. .Pp The .Sy draid vdev type provides distributed hot spares. These hot spares are named after the dRAID vdev they're a part of .Po Sy draid1 Ns - Ns Ar 2 Ns - Ns Ar 3 No specifies spare Ar 3 No of vdev Ar 2 , .No which is a single parity dRAID Pc and may only be used by that dRAID vdev. Otherwise, they behave the same as normal hot spares. .Pp Spares cannot replace log devices. . .Ss Intent Log The ZFS Intent Log (ZIL) satisfies POSIX requirements for synchronous transactions. For instance, databases often require their transactions to be on stable storage devices when returning from a system call. NFS and other applications can also use .Xr fsync 2 to ensure data stability. By default, the intent log is allocated from blocks within the main pool. However, it might be possible to get better performance using separate intent log devices such as NVRAM or a dedicated disk. For example: .Dl # Nm zpool Cm create Ar pool sda sdb Sy log Ar sdc .Pp Multiple log devices can also be specified, and they can be mirrored. See the .Sx EXAMPLES section for an example of mirroring multiple log devices. .Pp Log devices can be added, replaced, attached, detached, and removed. In addition, log devices are imported and exported as part of the pool that contains them. Mirrored devices can be removed by specifying the top-level mirror vdev. . .Ss Cache Devices Devices can be added to a storage pool as .Qq cache devices . These devices provide an additional layer of caching between main memory and disk. For read-heavy workloads, where the working set size is much larger than what can be cached in main memory, using cache devices allows much more of this working set to be served from low latency media. Using cache devices provides the greatest performance improvement for random read-workloads of mostly static content. .Pp To create a pool with cache devices, specify a .Sy cache vdev with any number of devices. For example: .Dl # Nm zpool Cm create Ar pool sda sdb Sy cache Ar sdc sdd .Pp Cache devices cannot be mirrored or part of a raidz configuration. If a read error is encountered on a cache device, that read I/O is reissued to the original storage pool device, which might be part of a mirrored or raidz configuration. .Pp The content of the cache devices is persistent across reboots and restored asynchronously when importing the pool in L2ARC (persistent L2ARC). This can be disabled by setting .Sy l2arc_rebuild_enabled Ns = Ns Sy 0 . For cache devices smaller than .Em 1 GiB , ZFS does not write the metadata structures required for rebuilding the L2ARC, to conserve space. This can be changed with .Sy l2arc_rebuild_blocks_min_l2size . The cache device header .Pq Em 512 B is updated even if no metadata structures are written. Setting .Sy l2arc_headroom Ns = Ns Sy 0 will result in scanning the full-length ARC lists for cacheable content to be written in L2ARC (persistent ARC). If a cache device is added with .Nm zpool Cm add , its label and header will be overwritten and its contents will not be restored in L2ARC, even if the device was previously part of the pool. If a cache device is onlined with .Nm zpool Cm online , its contents will be restored in L2ARC. This is useful in case of memory pressure, where the contents of the cache device are not fully restored in L2ARC. The user can off- and online the cache device when there is less memory pressure, to fully restore its contents to L2ARC. . .Ss Pool checkpoint Before starting critical procedures that include destructive actions .Pq like Nm zfs Cm destroy , an administrator can checkpoint the pool's state and, in the case of a mistake or failure, rewind the entire pool back to the checkpoint. Otherwise, the checkpoint can be discarded when the procedure has completed successfully. .Pp A pool checkpoint can be thought of as a pool-wide snapshot and should be used with care as it contains every part of the pool's state, from properties to vdev configuration. Thus, certain operations are not allowed while a pool has a checkpoint. Specifically, vdev removal/attach/detach, mirror splitting, and changing the pool's GUID. Adding a new vdev is supported, but in the case of a rewind it will have to be added again. Finally, users of this feature should keep in mind that scrubs in a pool that has a checkpoint do not repair checkpointed data. .Pp To create a checkpoint for a pool: .Dl # Nm zpool Cm checkpoint Ar pool .Pp To later rewind to its checkpointed state, you need to first export it and then rewind it during import: .Dl # Nm zpool Cm export Ar pool .Dl # Nm zpool Cm import Fl -rewind-to-checkpoint Ar pool .Pp To discard the checkpoint from a pool: .Dl # Nm zpool Cm checkpoint Fl d Ar pool .Pp Dataset reservations (controlled by the .Sy reservation No and Sy refreservation properties) may be unenforceable while a checkpoint exists, because the checkpoint is allowed to consume the dataset's reservation. Finally, data that is part of the checkpoint but has been freed in the current state of the pool won't be scanned during a scrub. . .Ss Special Allocation Class Allocations in the special class are dedicated to specific block types. By default, this includes all metadata, the indirect blocks of user data, and any deduplication tables. The class can also be provisioned to accept small file blocks. .Pp A pool must always have at least one normal .Pq non- Ns Sy dedup Ns /- Ns Sy special vdev before other devices can be assigned to the special class. If the .Sy special class becomes full, then allocations intended for it will spill back into the normal class. .Pp Deduplication tables can be excluded from the special class by unsetting the .Sy zfs_ddt_data_is_special ZFS module parameter. .Pp Inclusion of small file blocks in the special class is opt-in. Each dataset can control the size of small file blocks allowed in the special class by setting the .Sy special_small_blocks property to nonzero. See .Xr zfsprops 7 for more info on this property. diff --git a/man/man8/zinject.8 b/man/man8/zinject.8 index 4f0bbae81212..b692f12130a8 100644 --- a/man/man8/zinject.8 +++ b/man/man8/zinject.8 @@ -1,296 +1,297 @@ .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or https://opensource.org/licenses/CDDL-1.0. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" Copyright 2013 Darik Horn . All rights reserved. .\" .\" lint-ok: WARNING: sections out of conventional order: Sh SYNOPSIS .\" .Dd May 26, 2021 .Dt ZINJECT 8 .Os . .Sh NAME .Nm zinject .Nd ZFS Fault Injector .Sh DESCRIPTION .Nm creates artificial problems in a ZFS pool by simulating data corruption or device failures. This program is dangerous. . .Sh SYNOPSIS .Bl -tag -width Ds .It Xo .Nm zinject .Xc List injection records. . .It Xo .Nm zinject .Fl b Ar objset : Ns Ar object : Ns Ar level : Ns Ar start : Ns Ar end .Op Fl f Ar frequency .Fl amu .Op pool .Xc Force an error into the pool at a bookmark. . .It Xo .Nm zinject .Fl c Ar id Ns | Ns Sy all .Xc Cancel injection records. . .It Xo .Nm zinject .Fl d Ar vdev .Fl A Sy degrade Ns | Ns Sy fault .Ar pool .Xc Force a vdev into the DEGRADED or FAULTED state. . .It Xo .Nm zinject .Fl d Ar vdev .Fl D Ar latency : Ns Ar lanes +.Op Fl T Ar read|write .Ar pool .Xc Add an artificial delay to I/O requests on a particular device, such that the requests take a minimum of .Ar latency milliseconds to complete. Each delay has an associated number of .Ar lanes which defines the number of concurrent I/O requests that can be processed. .Pp For example, with a single lane delay of 10 ms .No (\& Ns Fl D Ar 10 : Ns Ar 1 ) , the device will only be able to service a single I/O request at a time with each request taking 10 ms to complete. So, if only a single request is submitted every 10 ms, the average latency will be 10 ms; but if more than one request is submitted every 10 ms, the average latency will be more than 10 ms. .Pp Similarly, if a delay of 10 ms is specified to have two lanes .No (\& Ns Fl D Ar 10 : Ns Ar 2 ) , then the device will be able to service two requests at a time, each with a minimum latency of 10 ms. So, if two requests are submitted every 10 ms, then the average latency will be 10 ms; but if more than two requests are submitted every 10 ms, the average latency will be more than 10 ms. .Pp Also note, these delays are additive. So two invocations of .Fl D Ar 10 : Ns Ar 1 are roughly equivalent to a single invocation of .Fl D Ar 10 : Ns Ar 2 . This also means, that one can specify multiple lanes with differing target latencies. For example, an invocation of .Fl D Ar 10 : Ns Ar 1 followed by .Fl D Ar 25 : Ns Ar 2 will create 3 lanes on the device: one lane with a latency of 10 ms and two lanes with a 25 ms latency. . .It Xo .Nm zinject .Fl d Ar vdev .Op Fl e Ar device_error .Op Fl L Ar label_error .Op Fl T Ar failure .Op Fl f Ar frequency .Op Fl F .Ar pool .Xc Force a vdev error. . .It Xo .Nm zinject .Fl I .Op Fl s Ar seconds Ns | Ns Fl g Ar txgs .Ar pool .Xc Simulate a hardware failure that fails to honor a cache flush. . .It Xo .Nm zinject .Fl p Ar function .Ar pool .Xc Panic inside the specified function. . .It Xo .Nm zinject .Fl t Sy data .Fl C Ar dvas .Op Fl e Ar device_error .Op Fl f Ar frequency .Op Fl l Ar level .Op Fl r Ar range .Op Fl amq .Ar path .Xc Force an error into the contents of a file. . .It Xo .Nm zinject .Fl t Sy dnode .Fl C Ar dvas .Op Fl e Ar device_error .Op Fl f Ar frequency .Op Fl l Ar level .Op Fl amq .Ar path .Xc Force an error into the metadnode for a file or directory. . .It Xo .Nm zinject .Fl t Ar mos_type .Fl C Ar dvas .Op Fl e Ar device_error .Op Fl f Ar frequency .Op Fl l Ar level .Op Fl r Ar range .Op Fl amqu .Ar pool .Xc Force an error into the MOS of a pool. .El .Sh OPTIONS .Bl -tag -width "-C dvas" .It Fl a Flush the ARC before injection. .It Fl b Ar objset : Ns Ar object : Ns Ar level : Ns Ar start : Ns Ar end Force an error into the pool at this bookmark tuple. Each number is in hexadecimal, and only one block can be specified. .It Fl C Ar dvas Inject the given error only into specific DVAs. The mask should be specified as a list of 0-indexed DVAs separated by commas .No (e.g . Ar 0,2 Ns No ). This option is not applicable to logical data errors such as .Sy decompress and .Sy decrypt . .It Fl d Ar vdev A vdev specified by path or GUID. .It Fl e Ar device_error Specify .Bl -tag -compact -width "decompress" .It Sy checksum for an ECKSUM error, .It Sy decompress for a data decompression error, .It Sy decrypt for a data decryption error, .It Sy corrupt to flip a bit in the data after a read, .It Sy dtl for an ECHILD error, .It Sy io for an EIO error where reopening the device will succeed, or .It Sy nxio for an ENXIO error where reopening the device will fail. .El .Pp For EIO and ENXIO, the "failed" reads or writes still occur. The probe simply sets the error value reported by the I/O pipeline so it appears the read or write failed. Decryption errors only currently work with file data. .It Fl f Ar frequency Only inject errors a fraction of the time. Expressed as a real number percentage between .Sy 0.0001 and .Sy 100 . .It Fl F Fail faster. Do fewer checks. .It Fl f Ar txgs Run for this many transaction groups before reporting failure. .It Fl h Print the usage message. .It Fl l Ar level Inject an error at a particular block level. The default is .Sy 0 . .It Fl L Ar label_error Set the label error region to one of .Sy nvlist , .Sy pad1 , .Sy pad2 , or .Sy uber . .It Fl m Automatically remount the underlying filesystem. .It Fl q Quiet mode. Only print the handler number added. .It Fl r Ar range Inject an error over a particular logical range of an object, which will be translated to the appropriate blkid range according to the object's properties. .It Fl s Ar seconds Run for this many seconds before reporting failure. .It Fl T Ar failure Set the failure type to one of .Sy all , .Sy claim , .Sy free , .Sy read , or .Sy write . .It Fl t Ar mos_type Set this to .Bl -tag -compact -width "spacemap" .It Sy mos for any data in the MOS, .It Sy mosdir for an object directory, .It Sy config for the pool configuration, .It Sy bpobj for the block pointer list, .It Sy spacemap for the space map, .It Sy metaslab for the metaslab, or .It Sy errlog for the persistent error log. .El .It Fl u Unload the pool after injection. .El . .Sh ENVIRONMENT VARIABLES .Bl -tag -width "ZF" .It Ev ZFS_HOSTID Run .Nm in debug mode. .El . .Sh SEE ALSO .Xr zfs 8 , .Xr zpool 8 diff --git a/module/zcommon/zpool_prop.c b/module/zcommon/zpool_prop.c index e98063e8bc57..e2e3bf5be69e 100644 --- a/module/zcommon/zpool_prop.c +++ b/module/zcommon/zpool_prop.c @@ -1,603 +1,609 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2021, Colm Buckley * Copyright (c) 2021, Klara Inc. */ #include #include #include #include #include #include "zfs_prop.h" #if !defined(_KERNEL) #include #include #include #endif static zprop_desc_t zpool_prop_table[ZPOOL_NUM_PROPS]; static zprop_desc_t vdev_prop_table[VDEV_NUM_PROPS]; zprop_desc_t * zpool_prop_get_table(void) { return (zpool_prop_table); } void zpool_prop_init(void) { static const zprop_index_t boolean_table[] = { { "off", 0}, { "on", 1}, { NULL } }; static const zprop_index_t failuremode_table[] = { { "wait", ZIO_FAILURE_MODE_WAIT }, { "continue", ZIO_FAILURE_MODE_CONTINUE }, { "panic", ZIO_FAILURE_MODE_PANIC }, { NULL } }; struct zfs_mod_supported_features *sfeatures = zfs_mod_list_supported(ZFS_SYSFS_POOL_PROPERTIES); /* string properties */ zprop_register_string(ZPOOL_PROP_ALTROOT, "altroot", NULL, PROP_DEFAULT, ZFS_TYPE_POOL, "", "ALTROOT", sfeatures); zprop_register_string(ZPOOL_PROP_BOOTFS, "bootfs", NULL, PROP_DEFAULT, ZFS_TYPE_POOL, "", "BOOTFS", sfeatures); zprop_register_string(ZPOOL_PROP_CACHEFILE, "cachefile", NULL, PROP_DEFAULT, ZFS_TYPE_POOL, " | none", "CACHEFILE", sfeatures); zprop_register_string(ZPOOL_PROP_COMMENT, "comment", NULL, PROP_DEFAULT, ZFS_TYPE_POOL, "", "COMMENT", sfeatures); zprop_register_string(ZPOOL_PROP_COMPATIBILITY, "compatibility", "off", PROP_DEFAULT, ZFS_TYPE_POOL, " | off | legacy", "COMPATIBILITY", sfeatures); /* readonly number properties */ zprop_register_number(ZPOOL_PROP_SIZE, "size", 0, PROP_READONLY, ZFS_TYPE_POOL, "", "SIZE", B_FALSE, sfeatures); zprop_register_number(ZPOOL_PROP_FREE, "free", 0, PROP_READONLY, ZFS_TYPE_POOL, "", "FREE", B_FALSE, sfeatures); zprop_register_number(ZPOOL_PROP_FREEING, "freeing", 0, PROP_READONLY, ZFS_TYPE_POOL, "", "FREEING", B_FALSE, sfeatures); zprop_register_number(ZPOOL_PROP_CHECKPOINT, "checkpoint", 0, PROP_READONLY, ZFS_TYPE_POOL, "", "CKPOINT", B_FALSE, sfeatures); zprop_register_number(ZPOOL_PROP_LEAKED, "leaked", 0, PROP_READONLY, ZFS_TYPE_POOL, "", "LEAKED", B_FALSE, sfeatures); zprop_register_number(ZPOOL_PROP_ALLOCATED, "allocated", 0, PROP_READONLY, ZFS_TYPE_POOL, "", "ALLOC", B_FALSE, sfeatures); zprop_register_number(ZPOOL_PROP_EXPANDSZ, "expandsize", 0, PROP_READONLY, ZFS_TYPE_POOL, "", "EXPANDSZ", B_FALSE, sfeatures); zprop_register_number(ZPOOL_PROP_FRAGMENTATION, "fragmentation", 0, PROP_READONLY, ZFS_TYPE_POOL, "", "FRAG", B_FALSE, sfeatures); zprop_register_number(ZPOOL_PROP_CAPACITY, "capacity", 0, PROP_READONLY, ZFS_TYPE_POOL, "", "CAP", B_FALSE, sfeatures); zprop_register_number(ZPOOL_PROP_GUID, "guid", 0, PROP_READONLY, ZFS_TYPE_POOL, "", "GUID", B_TRUE, sfeatures); zprop_register_number(ZPOOL_PROP_LOAD_GUID, "load_guid", 0, PROP_READONLY, ZFS_TYPE_POOL, "", "LOAD_GUID", B_TRUE, sfeatures); zprop_register_number(ZPOOL_PROP_HEALTH, "health", 0, PROP_READONLY, ZFS_TYPE_POOL, "", "HEALTH", B_FALSE, sfeatures); zprop_register_number(ZPOOL_PROP_DEDUPRATIO, "dedupratio", 0, PROP_READONLY, ZFS_TYPE_POOL, "<1.00x or higher if deduped>", "DEDUP", B_FALSE, sfeatures); zprop_register_number(ZPOOL_PROP_BCLONEUSED, "bcloneused", 0, PROP_READONLY, ZFS_TYPE_POOL, "", "BCLONE_USED", B_FALSE, sfeatures); zprop_register_number(ZPOOL_PROP_BCLONESAVED, "bclonesaved", 0, PROP_READONLY, ZFS_TYPE_POOL, "", "BCLONE_SAVED", B_FALSE, sfeatures); zprop_register_number(ZPOOL_PROP_BCLONERATIO, "bcloneratio", 0, PROP_READONLY, ZFS_TYPE_POOL, "<1.00x or higher if cloned>", "BCLONE_RATIO", B_FALSE, sfeatures); /* default number properties */ zprop_register_number(ZPOOL_PROP_VERSION, "version", SPA_VERSION, PROP_DEFAULT, ZFS_TYPE_POOL, "", "VERSION", B_FALSE, sfeatures); zprop_register_number(ZPOOL_PROP_ASHIFT, "ashift", 0, PROP_DEFAULT, ZFS_TYPE_POOL, "", "ASHIFT", B_FALSE, sfeatures); /* default index (boolean) properties */ zprop_register_index(ZPOOL_PROP_DELEGATION, "delegation", 1, PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "DELEGATION", boolean_table, sfeatures); zprop_register_index(ZPOOL_PROP_AUTOREPLACE, "autoreplace", 0, PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "REPLACE", boolean_table, sfeatures); zprop_register_index(ZPOOL_PROP_LISTSNAPS, "listsnapshots", 0, PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "LISTSNAPS", boolean_table, sfeatures); zprop_register_index(ZPOOL_PROP_AUTOEXPAND, "autoexpand", 0, PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "EXPAND", boolean_table, sfeatures); zprop_register_index(ZPOOL_PROP_READONLY, "readonly", 0, PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "RDONLY", boolean_table, sfeatures); zprop_register_index(ZPOOL_PROP_MULTIHOST, "multihost", 0, PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "MULTIHOST", boolean_table, sfeatures); /* default index properties */ zprop_register_index(ZPOOL_PROP_FAILUREMODE, "failmode", ZIO_FAILURE_MODE_WAIT, PROP_DEFAULT, ZFS_TYPE_POOL, "wait | continue | panic", "FAILMODE", failuremode_table, sfeatures); zprop_register_index(ZPOOL_PROP_AUTOTRIM, "autotrim", SPA_AUTOTRIM_OFF, PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "AUTOTRIM", boolean_table, sfeatures); /* hidden properties */ zprop_register_hidden(ZPOOL_PROP_NAME, "name", PROP_TYPE_STRING, PROP_READONLY, ZFS_TYPE_POOL, "NAME", B_TRUE, sfeatures); zprop_register_hidden(ZPOOL_PROP_MAXBLOCKSIZE, "maxblocksize", PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_POOL, "MAXBLOCKSIZE", B_FALSE, sfeatures); zprop_register_hidden(ZPOOL_PROP_TNAME, "tname", PROP_TYPE_STRING, PROP_ONETIME, ZFS_TYPE_POOL, "TNAME", B_TRUE, sfeatures); zprop_register_hidden(ZPOOL_PROP_MAXDNODESIZE, "maxdnodesize", PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_POOL, "MAXDNODESIZE", B_FALSE, sfeatures); zprop_register_hidden(ZPOOL_PROP_DEDUPDITTO, "dedupditto", PROP_TYPE_NUMBER, PROP_DEFAULT, ZFS_TYPE_POOL, "DEDUPDITTO", B_FALSE, sfeatures); zfs_mod_list_supported_free(sfeatures); } /* * Given a property name and its type, returns the corresponding property ID. */ zpool_prop_t zpool_name_to_prop(const char *propname) { return (zprop_name_to_prop(propname, ZFS_TYPE_POOL)); } /* * Given a pool property ID, returns the corresponding name. * Assuming the pool property ID is valid. */ const char * zpool_prop_to_name(zpool_prop_t prop) { return (zpool_prop_table[prop].pd_name); } zprop_type_t zpool_prop_get_type(zpool_prop_t prop) { return (zpool_prop_table[prop].pd_proptype); } boolean_t zpool_prop_readonly(zpool_prop_t prop) { return (zpool_prop_table[prop].pd_attr == PROP_READONLY); } boolean_t zpool_prop_setonce(zpool_prop_t prop) { return (zpool_prop_table[prop].pd_attr == PROP_ONETIME); } const char * zpool_prop_default_string(zpool_prop_t prop) { return (zpool_prop_table[prop].pd_strdefault); } uint64_t zpool_prop_default_numeric(zpool_prop_t prop) { return (zpool_prop_table[prop].pd_numdefault); } /* * Returns true if this is a valid feature@ property. */ boolean_t zpool_prop_feature(const char *name) { static const char *prefix = "feature@"; return (strncmp(name, prefix, strlen(prefix)) == 0); } /* * Returns true if this is a valid unsupported@ property. */ boolean_t zpool_prop_unsupported(const char *name) { static const char *prefix = "unsupported@"; return (strncmp(name, prefix, strlen(prefix)) == 0); } int zpool_prop_string_to_index(zpool_prop_t prop, const char *string, uint64_t *index) { return (zprop_string_to_index(prop, string, index, ZFS_TYPE_POOL)); } int zpool_prop_index_to_string(zpool_prop_t prop, uint64_t index, const char **string) { return (zprop_index_to_string(prop, index, string, ZFS_TYPE_POOL)); } uint64_t zpool_prop_random_value(zpool_prop_t prop, uint64_t seed) { return (zprop_random_value(prop, seed, ZFS_TYPE_POOL)); } #ifndef _KERNEL #include const char * zpool_prop_values(zpool_prop_t prop) { return (zpool_prop_table[prop].pd_values); } const char * zpool_prop_column_name(zpool_prop_t prop) { return (zpool_prop_table[prop].pd_colname); } boolean_t zpool_prop_align_right(zpool_prop_t prop) { return (zpool_prop_table[prop].pd_rightalign); } #endif zprop_desc_t * vdev_prop_get_table(void) { return (vdev_prop_table); } void vdev_prop_init(void) { static const zprop_index_t boolean_table[] = { { "off", 0}, { "on", 1}, { NULL } }; static const zprop_index_t boolean_na_table[] = { { "off", 0}, { "on", 1}, { "-", 2}, /* ZPROP_BOOLEAN_NA */ { NULL } }; struct zfs_mod_supported_features *sfeatures = zfs_mod_list_supported(ZFS_SYSFS_VDEV_PROPERTIES); /* string properties */ zprop_register_string(VDEV_PROP_COMMENT, "comment", NULL, PROP_DEFAULT, ZFS_TYPE_VDEV, "", "COMMENT", sfeatures); zprop_register_string(VDEV_PROP_PATH, "path", NULL, PROP_DEFAULT, ZFS_TYPE_VDEV, "", "PATH", sfeatures); zprop_register_string(VDEV_PROP_DEVID, "devid", NULL, PROP_READONLY, ZFS_TYPE_VDEV, "", "DEVID", sfeatures); zprop_register_string(VDEV_PROP_PHYS_PATH, "physpath", NULL, PROP_READONLY, ZFS_TYPE_VDEV, "", "PHYSPATH", sfeatures); zprop_register_string(VDEV_PROP_ENC_PATH, "encpath", NULL, PROP_READONLY, ZFS_TYPE_VDEV, "", "ENCPATH", sfeatures); zprop_register_string(VDEV_PROP_FRU, "fru", NULL, PROP_READONLY, ZFS_TYPE_VDEV, "", "FRU", sfeatures); zprop_register_string(VDEV_PROP_PARENT, "parent", NULL, PROP_READONLY, ZFS_TYPE_VDEV, "", "PARENT", sfeatures); zprop_register_string(VDEV_PROP_CHILDREN, "children", NULL, PROP_READONLY, ZFS_TYPE_VDEV, "", "CHILDREN", sfeatures); /* readonly number properties */ zprop_register_number(VDEV_PROP_SIZE, "size", 0, PROP_READONLY, ZFS_TYPE_VDEV, "", "SIZE", B_FALSE, sfeatures); zprop_register_number(VDEV_PROP_FREE, "free", 0, PROP_READONLY, ZFS_TYPE_VDEV, "", "FREE", B_FALSE, sfeatures); zprop_register_number(VDEV_PROP_ALLOCATED, "allocated", 0, PROP_READONLY, ZFS_TYPE_VDEV, "", "ALLOC", B_FALSE, sfeatures); zprop_register_number(VDEV_PROP_EXPANDSZ, "expandsize", 0, PROP_READONLY, ZFS_TYPE_VDEV, "", "EXPANDSZ", B_FALSE, sfeatures); zprop_register_number(VDEV_PROP_FRAGMENTATION, "fragmentation", 0, PROP_READONLY, ZFS_TYPE_VDEV, "", "FRAG", B_FALSE, sfeatures); zprop_register_number(VDEV_PROP_CAPACITY, "capacity", 0, PROP_READONLY, ZFS_TYPE_VDEV, "", "CAP", B_FALSE, sfeatures); zprop_register_number(VDEV_PROP_GUID, "guid", 0, PROP_READONLY, ZFS_TYPE_VDEV, "", "GUID", B_TRUE, sfeatures); zprop_register_number(VDEV_PROP_STATE, "state", 0, PROP_READONLY, ZFS_TYPE_VDEV, "", "STATE", B_FALSE, sfeatures); zprop_register_number(VDEV_PROP_BOOTSIZE, "bootsize", 0, PROP_READONLY, ZFS_TYPE_VDEV, "", "BOOTSIZE", B_FALSE, sfeatures); zprop_register_number(VDEV_PROP_ASIZE, "asize", 0, PROP_READONLY, ZFS_TYPE_VDEV, "", "ASIZE", B_FALSE, sfeatures); zprop_register_number(VDEV_PROP_PSIZE, "psize", 0, PROP_READONLY, ZFS_TYPE_VDEV, "", "PSIZE", B_FALSE, sfeatures); zprop_register_number(VDEV_PROP_ASHIFT, "ashift", 0, PROP_READONLY, ZFS_TYPE_VDEV, "", "ASHIFT", B_FALSE, sfeatures); zprop_register_number(VDEV_PROP_PARITY, "parity", 0, PROP_READONLY, ZFS_TYPE_VDEV, "", "PARITY", B_FALSE, sfeatures); zprop_register_number(VDEV_PROP_NUMCHILDREN, "numchildren", 0, PROP_READONLY, ZFS_TYPE_VDEV, "", "NUMCHILD", B_FALSE, sfeatures); zprop_register_number(VDEV_PROP_READ_ERRORS, "read_errors", 0, PROP_READONLY, ZFS_TYPE_VDEV, "", "RDERR", B_FALSE, sfeatures); zprop_register_number(VDEV_PROP_WRITE_ERRORS, "write_errors", 0, PROP_READONLY, ZFS_TYPE_VDEV, "", "WRERR", B_FALSE, sfeatures); zprop_register_number(VDEV_PROP_CHECKSUM_ERRORS, "checksum_errors", 0, PROP_READONLY, ZFS_TYPE_VDEV, "", "CKERR", B_FALSE, sfeatures); zprop_register_number(VDEV_PROP_INITIALIZE_ERRORS, "initialize_errors", 0, PROP_READONLY, ZFS_TYPE_VDEV, "", "INITERR", B_FALSE, sfeatures); zprop_register_number(VDEV_PROP_OPS_NULL, "null_ops", 0, PROP_READONLY, ZFS_TYPE_VDEV, "", "NULLOP", B_FALSE, sfeatures); zprop_register_number(VDEV_PROP_OPS_READ, "read_ops", 0, PROP_READONLY, ZFS_TYPE_VDEV, "", "READOP", B_FALSE, sfeatures); zprop_register_number(VDEV_PROP_OPS_WRITE, "write_ops", 0, PROP_READONLY, ZFS_TYPE_VDEV, "", "WRITEOP", B_FALSE, sfeatures); zprop_register_number(VDEV_PROP_OPS_FREE, "free_ops", 0, PROP_READONLY, ZFS_TYPE_VDEV, "", "FREEOP", B_FALSE, sfeatures); zprop_register_number(VDEV_PROP_OPS_CLAIM, "claim_ops", 0, PROP_READONLY, ZFS_TYPE_VDEV, "", "CLAIMOP", B_FALSE, sfeatures); zprop_register_number(VDEV_PROP_OPS_TRIM, "trim_ops", 0, PROP_READONLY, ZFS_TYPE_VDEV, "", "TRIMOP", B_FALSE, sfeatures); zprop_register_number(VDEV_PROP_BYTES_NULL, "null_bytes", 0, PROP_READONLY, ZFS_TYPE_VDEV, "", "NULLBYTE", B_FALSE, sfeatures); zprop_register_number(VDEV_PROP_BYTES_READ, "read_bytes", 0, PROP_READONLY, ZFS_TYPE_VDEV, "", "READBYTE", B_FALSE, sfeatures); zprop_register_number(VDEV_PROP_BYTES_WRITE, "write_bytes", 0, PROP_READONLY, ZFS_TYPE_VDEV, "", "WRITEBYTE", B_FALSE, sfeatures); zprop_register_number(VDEV_PROP_BYTES_FREE, "free_bytes", 0, PROP_READONLY, ZFS_TYPE_VDEV, "", "FREEBYTE", B_FALSE, sfeatures); zprop_register_number(VDEV_PROP_BYTES_CLAIM, "claim_bytes", 0, PROP_READONLY, ZFS_TYPE_VDEV, "", "CLAIMBYTE", B_FALSE, sfeatures); zprop_register_number(VDEV_PROP_BYTES_TRIM, "trim_bytes", 0, PROP_READONLY, ZFS_TYPE_VDEV, "", "TRIMBYTE", B_FALSE, sfeatures); /* default numeric properties */ zprop_register_number(VDEV_PROP_CHECKSUM_N, "checksum_n", UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_VDEV, "", "CKSUM_N", B_FALSE, sfeatures); zprop_register_number(VDEV_PROP_CHECKSUM_T, "checksum_t", UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_VDEV, "", "CKSUM_T", B_FALSE, sfeatures); zprop_register_number(VDEV_PROP_IO_N, "io_n", UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_VDEV, "", "IO_N", B_FALSE, sfeatures); zprop_register_number(VDEV_PROP_IO_T, "io_t", UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_VDEV, "", "IO_T", B_FALSE, sfeatures); + zprop_register_number(VDEV_PROP_SLOW_IO_N, "slow_io_n", UINT64_MAX, + PROP_DEFAULT, ZFS_TYPE_VDEV, "", "SLOW_IO_N", B_FALSE, + sfeatures); + zprop_register_number(VDEV_PROP_SLOW_IO_T, "slow_io_t", UINT64_MAX, + PROP_DEFAULT, ZFS_TYPE_VDEV, "", "SLOW_IO_T", B_FALSE, + sfeatures); /* default index (boolean) properties */ zprop_register_index(VDEV_PROP_REMOVING, "removing", 0, PROP_READONLY, ZFS_TYPE_VDEV, "on | off", "REMOVING", boolean_table, sfeatures); zprop_register_index(VDEV_PROP_ALLOCATING, "allocating", 1, PROP_DEFAULT, ZFS_TYPE_VDEV, "on | off", "ALLOCATING", boolean_na_table, sfeatures); zprop_register_index(VDEV_PROP_RAIDZ_EXPANDING, "raidz_expanding", 0, PROP_READONLY, ZFS_TYPE_VDEV, "on | off", "RAIDZ_EXPANDING", boolean_table, sfeatures); /* default index properties */ zprop_register_index(VDEV_PROP_FAILFAST, "failfast", B_TRUE, PROP_DEFAULT, ZFS_TYPE_VDEV, "on | off", "FAILFAST", boolean_table, sfeatures); /* hidden properties */ zprop_register_hidden(VDEV_PROP_NAME, "name", PROP_TYPE_STRING, PROP_READONLY, ZFS_TYPE_VDEV, "NAME", B_TRUE, sfeatures); zfs_mod_list_supported_free(sfeatures); } /* * Given a property name and its type, returns the corresponding property ID. */ vdev_prop_t vdev_name_to_prop(const char *propname) { return (zprop_name_to_prop(propname, ZFS_TYPE_VDEV)); } /* * Returns true if this is a valid user-defined property (one with a ':'). */ boolean_t vdev_prop_user(const char *name) { int i; char c; boolean_t foundsep = B_FALSE; for (i = 0; i < strlen(name); i++) { c = name[i]; if (!zprop_valid_char(c)) return (B_FALSE); if (c == ':') foundsep = B_TRUE; } return (foundsep); } /* * Given a pool property ID, returns the corresponding name. * Assuming the pool property ID is valid. */ const char * vdev_prop_to_name(vdev_prop_t prop) { return (vdev_prop_table[prop].pd_name); } zprop_type_t vdev_prop_get_type(vdev_prop_t prop) { return (vdev_prop_table[prop].pd_proptype); } boolean_t vdev_prop_readonly(vdev_prop_t prop) { return (vdev_prop_table[prop].pd_attr == PROP_READONLY); } const char * vdev_prop_default_string(vdev_prop_t prop) { return (vdev_prop_table[prop].pd_strdefault); } uint64_t vdev_prop_default_numeric(vdev_prop_t prop) { return (vdev_prop_table[prop].pd_numdefault); } int vdev_prop_string_to_index(vdev_prop_t prop, const char *string, uint64_t *index) { return (zprop_string_to_index(prop, string, index, ZFS_TYPE_VDEV)); } int vdev_prop_index_to_string(vdev_prop_t prop, uint64_t index, const char **string) { return (zprop_index_to_string(prop, index, string, ZFS_TYPE_VDEV)); } /* * Returns true if this is a valid vdev property. */ boolean_t zpool_prop_vdev(const char *name) { return (vdev_name_to_prop(name) != VDEV_PROP_INVAL); } uint64_t vdev_prop_random_value(vdev_prop_t prop, uint64_t seed) { return (zprop_random_value(prop, seed, ZFS_TYPE_VDEV)); } #ifndef _KERNEL const char * vdev_prop_values(vdev_prop_t prop) { return (vdev_prop_table[prop].pd_values); } const char * vdev_prop_column_name(vdev_prop_t prop) { return (vdev_prop_table[prop].pd_colname); } boolean_t vdev_prop_align_right(vdev_prop_t prop) { return (vdev_prop_table[prop].pd_rightalign); } #endif #if defined(_KERNEL) /* zpool property functions */ EXPORT_SYMBOL(zpool_prop_init); EXPORT_SYMBOL(zpool_prop_get_type); EXPORT_SYMBOL(zpool_prop_get_table); /* vdev property functions */ EXPORT_SYMBOL(vdev_prop_init); EXPORT_SYMBOL(vdev_prop_get_type); EXPORT_SYMBOL(vdev_prop_get_table); /* Pool property functions shared between libzfs and kernel. */ EXPORT_SYMBOL(zpool_name_to_prop); EXPORT_SYMBOL(zpool_prop_to_name); EXPORT_SYMBOL(zpool_prop_default_string); EXPORT_SYMBOL(zpool_prop_default_numeric); EXPORT_SYMBOL(zpool_prop_readonly); EXPORT_SYMBOL(zpool_prop_feature); EXPORT_SYMBOL(zpool_prop_unsupported); EXPORT_SYMBOL(zpool_prop_index_to_string); EXPORT_SYMBOL(zpool_prop_string_to_index); EXPORT_SYMBOL(zpool_prop_vdev); /* vdev property functions shared between libzfs and kernel. */ EXPORT_SYMBOL(vdev_name_to_prop); EXPORT_SYMBOL(vdev_prop_user); EXPORT_SYMBOL(vdev_prop_to_name); EXPORT_SYMBOL(vdev_prop_default_string); EXPORT_SYMBOL(vdev_prop_default_numeric); EXPORT_SYMBOL(vdev_prop_readonly); EXPORT_SYMBOL(vdev_prop_index_to_string); EXPORT_SYMBOL(vdev_prop_string_to_index); #endif diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index d6286dc5920b..ebba453e2b14 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -1,6471 +1,6501 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2021 by Delphix. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Toomas Soome * Copyright 2017 Joyent, Inc. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019, Datto Inc. All rights reserved. * Copyright (c) 2021, Klara Inc. * Copyright (c) 2021, 2023 Hewlett Packard Enterprise Development LP. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_prop.h" /* * One metaslab from each (normal-class) vdev is used by the ZIL. These are * called "embedded slog metaslabs", are referenced by vdev_log_mg, and are * part of the spa_embedded_log_class. The metaslab with the most free space * in each vdev is selected for this purpose when the pool is opened (or a * vdev is added). See vdev_metaslab_init(). * * Log blocks can be allocated from the following locations. Each one is tried * in order until the allocation succeeds: * 1. dedicated log vdevs, aka "slog" (spa_log_class) * 2. embedded slog metaslabs (spa_embedded_log_class) * 3. other metaslabs in normal vdevs (spa_normal_class) * * zfs_embedded_slog_min_ms disables the embedded slog if there are fewer * than this number of metaslabs in the vdev. This ensures that we don't set * aside an unreasonable amount of space for the ZIL. If set to less than * 1 << (spa_slop_shift + 1), on small pools the usable space may be reduced * (by more than 1<vdev_path != NULL) { zfs_dbgmsg("%s vdev '%s': %s", vd->vdev_ops->vdev_op_type, vd->vdev_path, buf); } else { zfs_dbgmsg("%s-%llu vdev (guid %llu): %s", vd->vdev_ops->vdev_op_type, (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid, buf); } } void vdev_dbgmsg_print_tree(vdev_t *vd, int indent) { char state[20]; if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) { zfs_dbgmsg("%*svdev %llu: %s", indent, "", (u_longlong_t)vd->vdev_id, vd->vdev_ops->vdev_op_type); return; } switch (vd->vdev_state) { case VDEV_STATE_UNKNOWN: (void) snprintf(state, sizeof (state), "unknown"); break; case VDEV_STATE_CLOSED: (void) snprintf(state, sizeof (state), "closed"); break; case VDEV_STATE_OFFLINE: (void) snprintf(state, sizeof (state), "offline"); break; case VDEV_STATE_REMOVED: (void) snprintf(state, sizeof (state), "removed"); break; case VDEV_STATE_CANT_OPEN: (void) snprintf(state, sizeof (state), "can't open"); break; case VDEV_STATE_FAULTED: (void) snprintf(state, sizeof (state), "faulted"); break; case VDEV_STATE_DEGRADED: (void) snprintf(state, sizeof (state), "degraded"); break; case VDEV_STATE_HEALTHY: (void) snprintf(state, sizeof (state), "healthy"); break; default: (void) snprintf(state, sizeof (state), "", (uint_t)vd->vdev_state); } zfs_dbgmsg("%*svdev %u: %s%s, guid: %llu, path: %s, %s", indent, "", (int)vd->vdev_id, vd->vdev_ops->vdev_op_type, vd->vdev_islog ? " (log)" : "", (u_longlong_t)vd->vdev_guid, vd->vdev_path ? vd->vdev_path : "N/A", state); for (uint64_t i = 0; i < vd->vdev_children; i++) vdev_dbgmsg_print_tree(vd->vdev_child[i], indent + 2); } /* * Virtual device management. */ static vdev_ops_t *const vdev_ops_table[] = { &vdev_root_ops, &vdev_raidz_ops, &vdev_draid_ops, &vdev_draid_spare_ops, &vdev_mirror_ops, &vdev_replacing_ops, &vdev_spare_ops, &vdev_disk_ops, &vdev_file_ops, &vdev_missing_ops, &vdev_hole_ops, &vdev_indirect_ops, NULL }; /* * Given a vdev type, return the appropriate ops vector. */ static vdev_ops_t * vdev_getops(const char *type) { vdev_ops_t *ops, *const *opspp; for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++) if (strcmp(ops->vdev_op_type, type) == 0) break; return (ops); } /* * Given a vdev and a metaslab class, find which metaslab group we're * interested in. All vdevs may belong to two different metaslab classes. * Dedicated slog devices use only the primary metaslab group, rather than a * separate log group. For embedded slogs, the vdev_log_mg will be non-NULL. */ metaslab_group_t * vdev_get_mg(vdev_t *vd, metaslab_class_t *mc) { if (mc == spa_embedded_log_class(vd->vdev_spa) && vd->vdev_log_mg != NULL) return (vd->vdev_log_mg); else return (vd->vdev_mg); } void vdev_default_xlate(vdev_t *vd, const range_seg64_t *logical_rs, range_seg64_t *physical_rs, range_seg64_t *remain_rs) { (void) vd, (void) remain_rs; physical_rs->rs_start = logical_rs->rs_start; physical_rs->rs_end = logical_rs->rs_end; } /* * Derive the enumerated allocation bias from string input. * String origin is either the per-vdev zap or zpool(8). */ static vdev_alloc_bias_t vdev_derive_alloc_bias(const char *bias) { vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE; if (strcmp(bias, VDEV_ALLOC_BIAS_LOG) == 0) alloc_bias = VDEV_BIAS_LOG; else if (strcmp(bias, VDEV_ALLOC_BIAS_SPECIAL) == 0) alloc_bias = VDEV_BIAS_SPECIAL; else if (strcmp(bias, VDEV_ALLOC_BIAS_DEDUP) == 0) alloc_bias = VDEV_BIAS_DEDUP; return (alloc_bias); } /* * Default asize function: return the MAX of psize with the asize of * all children. This is what's used by anything other than RAID-Z. */ uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize, uint64_t txg) { uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift); uint64_t csize; for (int c = 0; c < vd->vdev_children; c++) { csize = vdev_psize_to_asize_txg(vd->vdev_child[c], psize, txg); asize = MAX(asize, csize); } return (asize); } uint64_t vdev_default_min_asize(vdev_t *vd) { return (vd->vdev_min_asize); } /* * Get the minimum allocatable size. We define the allocatable size as * the vdev's asize rounded to the nearest metaslab. This allows us to * replace or attach devices which don't have the same physical size but * can still satisfy the same number of allocations. */ uint64_t vdev_get_min_asize(vdev_t *vd) { vdev_t *pvd = vd->vdev_parent; /* * If our parent is NULL (inactive spare or cache) or is the root, * just return our own asize. */ if (pvd == NULL) return (vd->vdev_asize); /* * The top-level vdev just returns the allocatable size rounded * to the nearest metaslab. */ if (vd == vd->vdev_top) return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift)); return (pvd->vdev_ops->vdev_op_min_asize(pvd)); } void vdev_set_min_asize(vdev_t *vd) { vd->vdev_min_asize = vdev_get_min_asize(vd); for (int c = 0; c < vd->vdev_children; c++) vdev_set_min_asize(vd->vdev_child[c]); } /* * Get the minimal allocation size for the top-level vdev. */ uint64_t vdev_get_min_alloc(vdev_t *vd) { uint64_t min_alloc = 1ULL << vd->vdev_ashift; if (vd->vdev_ops->vdev_op_min_alloc != NULL) min_alloc = vd->vdev_ops->vdev_op_min_alloc(vd); return (min_alloc); } /* * Get the parity level for a top-level vdev. */ uint64_t vdev_get_nparity(vdev_t *vd) { uint64_t nparity = 0; if (vd->vdev_ops->vdev_op_nparity != NULL) nparity = vd->vdev_ops->vdev_op_nparity(vd); return (nparity); } static int vdev_prop_get_int(vdev_t *vd, vdev_prop_t prop, uint64_t *value) { spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; uint64_t objid; int err; if (vd->vdev_root_zap != 0) { objid = vd->vdev_root_zap; } else if (vd->vdev_top_zap != 0) { objid = vd->vdev_top_zap; } else if (vd->vdev_leaf_zap != 0) { objid = vd->vdev_leaf_zap; } else { return (EINVAL); } err = zap_lookup(mos, objid, vdev_prop_to_name(prop), sizeof (uint64_t), 1, value); if (err == ENOENT) *value = vdev_prop_default_numeric(prop); return (err); } /* * Get the number of data disks for a top-level vdev. */ uint64_t vdev_get_ndisks(vdev_t *vd) { uint64_t ndisks = 1; if (vd->vdev_ops->vdev_op_ndisks != NULL) ndisks = vd->vdev_ops->vdev_op_ndisks(vd); return (ndisks); } vdev_t * vdev_lookup_top(spa_t *spa, uint64_t vdev) { vdev_t *rvd = spa->spa_root_vdev; ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); if (vdev < rvd->vdev_children) { ASSERT(rvd->vdev_child[vdev] != NULL); return (rvd->vdev_child[vdev]); } return (NULL); } vdev_t * vdev_lookup_by_guid(vdev_t *vd, uint64_t guid) { vdev_t *mvd; if (vd->vdev_guid == guid) return (vd); for (int c = 0; c < vd->vdev_children; c++) if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) != NULL) return (mvd); return (NULL); } static int vdev_count_leaves_impl(vdev_t *vd) { int n = 0; if (vd->vdev_ops->vdev_op_leaf) return (1); for (int c = 0; c < vd->vdev_children; c++) n += vdev_count_leaves_impl(vd->vdev_child[c]); return (n); } int vdev_count_leaves(spa_t *spa) { int rc; spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); rc = vdev_count_leaves_impl(spa->spa_root_vdev); spa_config_exit(spa, SCL_VDEV, FTAG); return (rc); } void vdev_add_child(vdev_t *pvd, vdev_t *cvd) { size_t oldsize, newsize; uint64_t id = cvd->vdev_id; vdev_t **newchild; ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); ASSERT(cvd->vdev_parent == NULL); cvd->vdev_parent = pvd; if (pvd == NULL) return; ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL); oldsize = pvd->vdev_children * sizeof (vdev_t *); pvd->vdev_children = MAX(pvd->vdev_children, id + 1); newsize = pvd->vdev_children * sizeof (vdev_t *); newchild = kmem_alloc(newsize, KM_SLEEP); if (pvd->vdev_child != NULL) { memcpy(newchild, pvd->vdev_child, oldsize); kmem_free(pvd->vdev_child, oldsize); } pvd->vdev_child = newchild; pvd->vdev_child[id] = cvd; cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd); ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL); /* * Walk up all ancestors to update guid sum. */ for (; pvd != NULL; pvd = pvd->vdev_parent) pvd->vdev_guid_sum += cvd->vdev_guid_sum; if (cvd->vdev_ops->vdev_op_leaf) { list_insert_head(&cvd->vdev_spa->spa_leaf_list, cvd); cvd->vdev_spa->spa_leaf_list_gen++; } } void vdev_remove_child(vdev_t *pvd, vdev_t *cvd) { int c; uint_t id = cvd->vdev_id; ASSERT(cvd->vdev_parent == pvd); if (pvd == NULL) return; ASSERT(id < pvd->vdev_children); ASSERT(pvd->vdev_child[id] == cvd); pvd->vdev_child[id] = NULL; cvd->vdev_parent = NULL; for (c = 0; c < pvd->vdev_children; c++) if (pvd->vdev_child[c]) break; if (c == pvd->vdev_children) { kmem_free(pvd->vdev_child, c * sizeof (vdev_t *)); pvd->vdev_child = NULL; pvd->vdev_children = 0; } if (cvd->vdev_ops->vdev_op_leaf) { spa_t *spa = cvd->vdev_spa; list_remove(&spa->spa_leaf_list, cvd); spa->spa_leaf_list_gen++; } /* * Walk up all ancestors to update guid sum. */ for (; pvd != NULL; pvd = pvd->vdev_parent) pvd->vdev_guid_sum -= cvd->vdev_guid_sum; } /* * Remove any holes in the child array. */ void vdev_compact_children(vdev_t *pvd) { vdev_t **newchild, *cvd; int oldc = pvd->vdev_children; int newc; ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); if (oldc == 0) return; for (int c = newc = 0; c < oldc; c++) if (pvd->vdev_child[c]) newc++; if (newc > 0) { newchild = kmem_zalloc(newc * sizeof (vdev_t *), KM_SLEEP); for (int c = newc = 0; c < oldc; c++) { if ((cvd = pvd->vdev_child[c]) != NULL) { newchild[newc] = cvd; cvd->vdev_id = newc++; } } } else { newchild = NULL; } kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *)); pvd->vdev_child = newchild; pvd->vdev_children = newc; } /* * Allocate and minimally initialize a vdev_t. */ vdev_t * vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) { vdev_t *vd; vdev_indirect_config_t *vic; vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP); vic = &vd->vdev_indirect_config; if (spa->spa_root_vdev == NULL) { ASSERT(ops == &vdev_root_ops); spa->spa_root_vdev = vd; spa->spa_load_guid = spa_generate_guid(NULL); } if (guid == 0 && ops != &vdev_hole_ops) { if (spa->spa_root_vdev == vd) { /* * The root vdev's guid will also be the pool guid, * which must be unique among all pools. */ guid = spa_generate_guid(NULL); } else { /* * Any other vdev's guid must be unique within the pool. */ guid = spa_generate_guid(spa); } ASSERT(!spa_guid_exists(spa_guid(spa), guid)); } vd->vdev_spa = spa; vd->vdev_id = id; vd->vdev_guid = guid; vd->vdev_guid_sum = guid; vd->vdev_ops = ops; vd->vdev_state = VDEV_STATE_CLOSED; vd->vdev_ishole = (ops == &vdev_hole_ops); vic->vic_prev_indirect_vdev = UINT64_MAX; rw_init(&vd->vdev_indirect_rwlock, NULL, RW_DEFAULT, NULL); mutex_init(&vd->vdev_obsolete_lock, NULL, MUTEX_DEFAULT, NULL); vd->vdev_obsolete_segments = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); /* * Initialize rate limit structs for events. We rate limit ZIO delay * and checksum events so that we don't overwhelm ZED with thousands * of events when a disk is acting up. */ zfs_ratelimit_init(&vd->vdev_delay_rl, &zfs_slow_io_events_per_second, 1); zfs_ratelimit_init(&vd->vdev_deadman_rl, &zfs_slow_io_events_per_second, 1); zfs_ratelimit_init(&vd->vdev_checksum_rl, &zfs_checksum_events_per_second, 1); /* * Default Thresholds for tuning ZED */ vd->vdev_checksum_n = vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_N); vd->vdev_checksum_t = vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_T); vd->vdev_io_n = vdev_prop_default_numeric(VDEV_PROP_IO_N); vd->vdev_io_t = vdev_prop_default_numeric(VDEV_PROP_IO_T); + vd->vdev_slow_io_n = vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_N); + vd->vdev_slow_io_t = vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_T); list_link_init(&vd->vdev_config_dirty_node); list_link_init(&vd->vdev_state_dirty_node); list_link_init(&vd->vdev_initialize_node); list_link_init(&vd->vdev_leaf_node); list_link_init(&vd->vdev_trim_node); mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_NOLOCKDEP, NULL); mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_scan_io_queue_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_initialize_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_initialize_io_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&vd->vdev_initialize_cv, NULL, CV_DEFAULT, NULL); cv_init(&vd->vdev_initialize_io_cv, NULL, CV_DEFAULT, NULL); mutex_init(&vd->vdev_trim_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_autotrim_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_trim_io_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&vd->vdev_trim_cv, NULL, CV_DEFAULT, NULL); cv_init(&vd->vdev_autotrim_cv, NULL, CV_DEFAULT, NULL); cv_init(&vd->vdev_autotrim_kick_cv, NULL, CV_DEFAULT, NULL); cv_init(&vd->vdev_trim_io_cv, NULL, CV_DEFAULT, NULL); mutex_init(&vd->vdev_rebuild_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&vd->vdev_rebuild_cv, NULL, CV_DEFAULT, NULL); for (int t = 0; t < DTL_TYPES; t++) { vd->vdev_dtl[t] = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); } txg_list_create(&vd->vdev_ms_list, spa, offsetof(struct metaslab, ms_txg_node)); txg_list_create(&vd->vdev_dtl_list, spa, offsetof(struct vdev, vdev_dtl_node)); vd->vdev_stat.vs_timestamp = gethrtime(); vdev_queue_init(vd); return (vd); } /* * Allocate a new vdev. The 'alloctype' is used to control whether we are * creating a new vdev or loading an existing one - the behavior is slightly * different for each case. */ int vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, int alloctype) { vdev_ops_t *ops; const char *type; uint64_t guid = 0, islog; vdev_t *vd; vdev_indirect_config_t *vic; const char *tmp = NULL; int rc; vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE; boolean_t top_level = (parent && !parent->vdev_parent); ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0) return (SET_ERROR(EINVAL)); if ((ops = vdev_getops(type)) == NULL) return (SET_ERROR(EINVAL)); /* * If this is a load, get the vdev guid from the nvlist. * Otherwise, vdev_alloc_common() will generate one for us. */ if (alloctype == VDEV_ALLOC_LOAD) { uint64_t label_id; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) || label_id != id) return (SET_ERROR(EINVAL)); if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (SET_ERROR(EINVAL)); } else if (alloctype == VDEV_ALLOC_SPARE) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (SET_ERROR(EINVAL)); } else if (alloctype == VDEV_ALLOC_L2CACHE) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (SET_ERROR(EINVAL)); } else if (alloctype == VDEV_ALLOC_ROOTPOOL) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (SET_ERROR(EINVAL)); } /* * The first allocated vdev must be of type 'root'. */ if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL) return (SET_ERROR(EINVAL)); /* * Determine whether we're a log vdev. */ islog = 0; (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog); if (islog && spa_version(spa) < SPA_VERSION_SLOGS) return (SET_ERROR(ENOTSUP)); if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES) return (SET_ERROR(ENOTSUP)); if (top_level && alloctype == VDEV_ALLOC_ADD) { const char *bias; /* * If creating a top-level vdev, check for allocation * classes input. */ if (nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, &bias) == 0) { alloc_bias = vdev_derive_alloc_bias(bias); /* spa_vdev_add() expects feature to be enabled */ if (spa->spa_load_state != SPA_LOAD_CREATE && !spa_feature_is_enabled(spa, SPA_FEATURE_ALLOCATION_CLASSES)) { return (SET_ERROR(ENOTSUP)); } } /* spa_vdev_add() expects feature to be enabled */ if (ops == &vdev_draid_ops && spa->spa_load_state != SPA_LOAD_CREATE && !spa_feature_is_enabled(spa, SPA_FEATURE_DRAID)) { return (SET_ERROR(ENOTSUP)); } } /* * Initialize the vdev specific data. This is done before calling * vdev_alloc_common() since it may fail and this simplifies the * error reporting and cleanup code paths. */ void *tsd = NULL; if (ops->vdev_op_init != NULL) { rc = ops->vdev_op_init(spa, nv, &tsd); if (rc != 0) { return (rc); } } vd = vdev_alloc_common(spa, id, guid, ops); vd->vdev_tsd = tsd; vd->vdev_islog = islog; if (top_level && alloc_bias != VDEV_BIAS_NONE) vd->vdev_alloc_bias = alloc_bias; if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &tmp) == 0) vd->vdev_path = spa_strdup(tmp); /* * ZPOOL_CONFIG_AUX_STATE = "external" means we previously forced a * fault on a vdev and want it to persist across imports (like with * zpool offline -f). */ rc = nvlist_lookup_string(nv, ZPOOL_CONFIG_AUX_STATE, &tmp); if (rc == 0 && tmp != NULL && strcmp(tmp, "external") == 0) { vd->vdev_stat.vs_aux = VDEV_AUX_EXTERNAL; vd->vdev_faulted = 1; vd->vdev_label_aux = VDEV_AUX_EXTERNAL; } if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &tmp) == 0) vd->vdev_devid = spa_strdup(tmp); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH, &tmp) == 0) vd->vdev_physpath = spa_strdup(tmp); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, &tmp) == 0) vd->vdev_enc_sysfs_path = spa_strdup(tmp); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &tmp) == 0) vd->vdev_fru = spa_strdup(tmp); /* * Set the whole_disk property. If it's not specified, leave the value * as -1. */ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &vd->vdev_wholedisk) != 0) vd->vdev_wholedisk = -1ULL; vic = &vd->vdev_indirect_config; ASSERT0(vic->vic_mapping_object); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT, &vic->vic_mapping_object); ASSERT0(vic->vic_births_object); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_BIRTHS, &vic->vic_births_object); ASSERT3U(vic->vic_prev_indirect_vdev, ==, UINT64_MAX); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_PREV_INDIRECT_VDEV, &vic->vic_prev_indirect_vdev); /* * Look for the 'not present' flag. This will only be set if the device * was not present at the time of import. */ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, &vd->vdev_not_present); /* * Get the alignment requirement. Ignore pool ashift for vdev * attach case. */ if (alloctype != VDEV_ALLOC_ATTACH) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift); } else { vd->vdev_attaching = B_TRUE; } /* * Retrieve the vdev creation time. */ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, &vd->vdev_crtxg); if (vd->vdev_ops == &vdev_root_ops && (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT || alloctype == VDEV_ALLOC_ROOTPOOL)) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_ROOT_ZAP, &vd->vdev_root_zap); } /* * If we're a top-level vdev, try to load the allocation parameters. */ if (top_level && (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, &vd->vdev_ms_array); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT, &vd->vdev_ms_shift); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE, &vd->vdev_asize); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NONALLOCATING, &vd->vdev_noalloc); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING, &vd->vdev_removing); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP, &vd->vdev_top_zap); vd->vdev_rz_expanding = nvlist_exists(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING); } else { ASSERT0(vd->vdev_top_zap); } if (top_level && alloctype != VDEV_ALLOC_ATTACH) { ASSERT(alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_ADD || alloctype == VDEV_ALLOC_SPLIT || alloctype == VDEV_ALLOC_ROOTPOOL); /* Note: metaslab_group_create() is now deferred */ } if (vd->vdev_ops->vdev_op_leaf && (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_LEAF_ZAP, &vd->vdev_leaf_zap); } else { ASSERT0(vd->vdev_leaf_zap); } /* * If we're a leaf vdev, try to load the DTL object and other state. */ if (vd->vdev_ops->vdev_op_leaf && (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE || alloctype == VDEV_ALLOC_ROOTPOOL)) { if (alloctype == VDEV_ALLOC_LOAD) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL, &vd->vdev_dtl_object); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE, &vd->vdev_unspare); } if (alloctype == VDEV_ALLOC_ROOTPOOL) { uint64_t spare = 0; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE, &spare) == 0 && spare) spa_spare_add(vd); } (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, &vd->vdev_offline); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG, &vd->vdev_resilver_txg); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REBUILD_TXG, &vd->vdev_rebuild_txg); if (nvlist_exists(nv, ZPOOL_CONFIG_RESILVER_DEFER)) vdev_defer_resilver(vd); /* * In general, when importing a pool we want to ignore the * persistent fault state, as the diagnosis made on another * system may not be valid in the current context. The only * exception is if we forced a vdev to a persistently faulted * state with 'zpool offline -f'. The persistent fault will * remain across imports until cleared. * * Local vdevs will remain in the faulted state. */ if (spa_load_state(spa) == SPA_LOAD_OPEN || spa_load_state(spa) == SPA_LOAD_IMPORT) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, &vd->vdev_faulted); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED, &vd->vdev_degraded); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, &vd->vdev_removed); if (vd->vdev_faulted || vd->vdev_degraded) { const char *aux; vd->vdev_label_aux = VDEV_AUX_ERR_EXCEEDED; if (nvlist_lookup_string(nv, ZPOOL_CONFIG_AUX_STATE, &aux) == 0 && strcmp(aux, "external") == 0) vd->vdev_label_aux = VDEV_AUX_EXTERNAL; else vd->vdev_faulted = 0ULL; } } } /* * Add ourselves to the parent's list of children. */ vdev_add_child(parent, vd); *vdp = vd; return (0); } void vdev_free(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT3P(vd->vdev_initialize_thread, ==, NULL); ASSERT3P(vd->vdev_trim_thread, ==, NULL); ASSERT3P(vd->vdev_autotrim_thread, ==, NULL); ASSERT3P(vd->vdev_rebuild_thread, ==, NULL); /* * Scan queues are normally destroyed at the end of a scan. If the * queue exists here, that implies the vdev is being removed while * the scan is still running. */ if (vd->vdev_scan_io_queue != NULL) { mutex_enter(&vd->vdev_scan_io_queue_lock); dsl_scan_io_queue_destroy(vd->vdev_scan_io_queue); vd->vdev_scan_io_queue = NULL; mutex_exit(&vd->vdev_scan_io_queue_lock); } /* * vdev_free() implies closing the vdev first. This is simpler than * trying to ensure complicated semantics for all callers. */ vdev_close(vd); ASSERT(!list_link_active(&vd->vdev_config_dirty_node)); ASSERT(!list_link_active(&vd->vdev_state_dirty_node)); /* * Free all children. */ for (int c = 0; c < vd->vdev_children; c++) vdev_free(vd->vdev_child[c]); ASSERT(vd->vdev_child == NULL); ASSERT(vd->vdev_guid_sum == vd->vdev_guid); if (vd->vdev_ops->vdev_op_fini != NULL) vd->vdev_ops->vdev_op_fini(vd); /* * Discard allocation state. */ if (vd->vdev_mg != NULL) { vdev_metaslab_fini(vd); metaslab_group_destroy(vd->vdev_mg); vd->vdev_mg = NULL; } if (vd->vdev_log_mg != NULL) { ASSERT0(vd->vdev_ms_count); metaslab_group_destroy(vd->vdev_log_mg); vd->vdev_log_mg = NULL; } ASSERT0(vd->vdev_stat.vs_space); ASSERT0(vd->vdev_stat.vs_dspace); ASSERT0(vd->vdev_stat.vs_alloc); /* * Remove this vdev from its parent's child list. */ vdev_remove_child(vd->vdev_parent, vd); ASSERT(vd->vdev_parent == NULL); ASSERT(!list_link_active(&vd->vdev_leaf_node)); /* * Clean up vdev structure. */ vdev_queue_fini(vd); if (vd->vdev_path) spa_strfree(vd->vdev_path); if (vd->vdev_devid) spa_strfree(vd->vdev_devid); if (vd->vdev_physpath) spa_strfree(vd->vdev_physpath); if (vd->vdev_enc_sysfs_path) spa_strfree(vd->vdev_enc_sysfs_path); if (vd->vdev_fru) spa_strfree(vd->vdev_fru); if (vd->vdev_isspare) spa_spare_remove(vd); if (vd->vdev_isl2cache) spa_l2cache_remove(vd); txg_list_destroy(&vd->vdev_ms_list); txg_list_destroy(&vd->vdev_dtl_list); mutex_enter(&vd->vdev_dtl_lock); space_map_close(vd->vdev_dtl_sm); for (int t = 0; t < DTL_TYPES; t++) { range_tree_vacate(vd->vdev_dtl[t], NULL, NULL); range_tree_destroy(vd->vdev_dtl[t]); } mutex_exit(&vd->vdev_dtl_lock); EQUIV(vd->vdev_indirect_births != NULL, vd->vdev_indirect_mapping != NULL); if (vd->vdev_indirect_births != NULL) { vdev_indirect_mapping_close(vd->vdev_indirect_mapping); vdev_indirect_births_close(vd->vdev_indirect_births); } if (vd->vdev_obsolete_sm != NULL) { ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops); space_map_close(vd->vdev_obsolete_sm); vd->vdev_obsolete_sm = NULL; } range_tree_destroy(vd->vdev_obsolete_segments); rw_destroy(&vd->vdev_indirect_rwlock); mutex_destroy(&vd->vdev_obsolete_lock); mutex_destroy(&vd->vdev_dtl_lock); mutex_destroy(&vd->vdev_stat_lock); mutex_destroy(&vd->vdev_probe_lock); mutex_destroy(&vd->vdev_scan_io_queue_lock); mutex_destroy(&vd->vdev_initialize_lock); mutex_destroy(&vd->vdev_initialize_io_lock); cv_destroy(&vd->vdev_initialize_io_cv); cv_destroy(&vd->vdev_initialize_cv); mutex_destroy(&vd->vdev_trim_lock); mutex_destroy(&vd->vdev_autotrim_lock); mutex_destroy(&vd->vdev_trim_io_lock); cv_destroy(&vd->vdev_trim_cv); cv_destroy(&vd->vdev_autotrim_cv); cv_destroy(&vd->vdev_autotrim_kick_cv); cv_destroy(&vd->vdev_trim_io_cv); mutex_destroy(&vd->vdev_rebuild_lock); cv_destroy(&vd->vdev_rebuild_cv); zfs_ratelimit_fini(&vd->vdev_delay_rl); zfs_ratelimit_fini(&vd->vdev_deadman_rl); zfs_ratelimit_fini(&vd->vdev_checksum_rl); if (vd == spa->spa_root_vdev) spa->spa_root_vdev = NULL; kmem_free(vd, sizeof (vdev_t)); } /* * Transfer top-level vdev state from svd to tvd. */ static void vdev_top_transfer(vdev_t *svd, vdev_t *tvd) { spa_t *spa = svd->vdev_spa; metaslab_t *msp; vdev_t *vd; int t; ASSERT(tvd == tvd->vdev_top); tvd->vdev_ms_array = svd->vdev_ms_array; tvd->vdev_ms_shift = svd->vdev_ms_shift; tvd->vdev_ms_count = svd->vdev_ms_count; tvd->vdev_top_zap = svd->vdev_top_zap; svd->vdev_ms_array = 0; svd->vdev_ms_shift = 0; svd->vdev_ms_count = 0; svd->vdev_top_zap = 0; if (tvd->vdev_mg) ASSERT3P(tvd->vdev_mg, ==, svd->vdev_mg); if (tvd->vdev_log_mg) ASSERT3P(tvd->vdev_log_mg, ==, svd->vdev_log_mg); tvd->vdev_mg = svd->vdev_mg; tvd->vdev_log_mg = svd->vdev_log_mg; tvd->vdev_ms = svd->vdev_ms; svd->vdev_mg = NULL; svd->vdev_log_mg = NULL; svd->vdev_ms = NULL; if (tvd->vdev_mg != NULL) tvd->vdev_mg->mg_vd = tvd; if (tvd->vdev_log_mg != NULL) tvd->vdev_log_mg->mg_vd = tvd; tvd->vdev_checkpoint_sm = svd->vdev_checkpoint_sm; svd->vdev_checkpoint_sm = NULL; tvd->vdev_alloc_bias = svd->vdev_alloc_bias; svd->vdev_alloc_bias = VDEV_BIAS_NONE; tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc; tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space; tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace; svd->vdev_stat.vs_alloc = 0; svd->vdev_stat.vs_space = 0; svd->vdev_stat.vs_dspace = 0; /* * State which may be set on a top-level vdev that's in the * process of being removed. */ ASSERT0(tvd->vdev_indirect_config.vic_births_object); ASSERT0(tvd->vdev_indirect_config.vic_mapping_object); ASSERT3U(tvd->vdev_indirect_config.vic_prev_indirect_vdev, ==, -1ULL); ASSERT3P(tvd->vdev_indirect_mapping, ==, NULL); ASSERT3P(tvd->vdev_indirect_births, ==, NULL); ASSERT3P(tvd->vdev_obsolete_sm, ==, NULL); ASSERT0(tvd->vdev_noalloc); ASSERT0(tvd->vdev_removing); ASSERT0(tvd->vdev_rebuilding); tvd->vdev_noalloc = svd->vdev_noalloc; tvd->vdev_removing = svd->vdev_removing; tvd->vdev_rebuilding = svd->vdev_rebuilding; tvd->vdev_rebuild_config = svd->vdev_rebuild_config; tvd->vdev_indirect_config = svd->vdev_indirect_config; tvd->vdev_indirect_mapping = svd->vdev_indirect_mapping; tvd->vdev_indirect_births = svd->vdev_indirect_births; range_tree_swap(&svd->vdev_obsolete_segments, &tvd->vdev_obsolete_segments); tvd->vdev_obsolete_sm = svd->vdev_obsolete_sm; svd->vdev_indirect_config.vic_mapping_object = 0; svd->vdev_indirect_config.vic_births_object = 0; svd->vdev_indirect_config.vic_prev_indirect_vdev = -1ULL; svd->vdev_indirect_mapping = NULL; svd->vdev_indirect_births = NULL; svd->vdev_obsolete_sm = NULL; svd->vdev_noalloc = 0; svd->vdev_removing = 0; svd->vdev_rebuilding = 0; for (t = 0; t < TXG_SIZE; t++) { while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL) (void) txg_list_add(&tvd->vdev_ms_list, msp, t); while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL) (void) txg_list_add(&tvd->vdev_dtl_list, vd, t); if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t)) (void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t); } if (list_link_active(&svd->vdev_config_dirty_node)) { vdev_config_clean(svd); vdev_config_dirty(tvd); } if (list_link_active(&svd->vdev_state_dirty_node)) { vdev_state_clean(svd); vdev_state_dirty(tvd); } tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio; svd->vdev_deflate_ratio = 0; tvd->vdev_islog = svd->vdev_islog; svd->vdev_islog = 0; dsl_scan_io_queue_vdev_xfer(svd, tvd); } static void vdev_top_update(vdev_t *tvd, vdev_t *vd) { if (vd == NULL) return; vd->vdev_top = tvd; for (int c = 0; c < vd->vdev_children; c++) vdev_top_update(tvd, vd->vdev_child[c]); } /* * Add a mirror/replacing vdev above an existing vdev. There is no need to * call .vdev_op_init() since mirror/replacing vdevs do not have private state. */ vdev_t * vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops) { spa_t *spa = cvd->vdev_spa; vdev_t *pvd = cvd->vdev_parent; vdev_t *mvd; ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops); mvd->vdev_asize = cvd->vdev_asize; mvd->vdev_min_asize = cvd->vdev_min_asize; mvd->vdev_max_asize = cvd->vdev_max_asize; mvd->vdev_psize = cvd->vdev_psize; mvd->vdev_ashift = cvd->vdev_ashift; mvd->vdev_logical_ashift = cvd->vdev_logical_ashift; mvd->vdev_physical_ashift = cvd->vdev_physical_ashift; mvd->vdev_state = cvd->vdev_state; mvd->vdev_crtxg = cvd->vdev_crtxg; vdev_remove_child(pvd, cvd); vdev_add_child(pvd, mvd); cvd->vdev_id = mvd->vdev_children; vdev_add_child(mvd, cvd); vdev_top_update(cvd->vdev_top, cvd->vdev_top); if (mvd == mvd->vdev_top) vdev_top_transfer(cvd, mvd); return (mvd); } /* * Remove a 1-way mirror/replacing vdev from the tree. */ void vdev_remove_parent(vdev_t *cvd) { vdev_t *mvd = cvd->vdev_parent; vdev_t *pvd = mvd->vdev_parent; ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); ASSERT(mvd->vdev_children == 1); ASSERT(mvd->vdev_ops == &vdev_mirror_ops || mvd->vdev_ops == &vdev_replacing_ops || mvd->vdev_ops == &vdev_spare_ops); cvd->vdev_ashift = mvd->vdev_ashift; cvd->vdev_logical_ashift = mvd->vdev_logical_ashift; cvd->vdev_physical_ashift = mvd->vdev_physical_ashift; vdev_remove_child(mvd, cvd); vdev_remove_child(pvd, mvd); /* * If cvd will replace mvd as a top-level vdev, preserve mvd's guid. * Otherwise, we could have detached an offline device, and when we * go to import the pool we'll think we have two top-level vdevs, * instead of a different version of the same top-level vdev. */ if (mvd->vdev_top == mvd) { uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid; cvd->vdev_orig_guid = cvd->vdev_guid; cvd->vdev_guid += guid_delta; cvd->vdev_guid_sum += guid_delta; /* * If pool not set for autoexpand, we need to also preserve * mvd's asize to prevent automatic expansion of cvd. * Otherwise if we are adjusting the mirror by attaching and * detaching children of non-uniform sizes, the mirror could * autoexpand, unexpectedly requiring larger devices to * re-establish the mirror. */ if (!cvd->vdev_spa->spa_autoexpand) cvd->vdev_asize = mvd->vdev_asize; } cvd->vdev_id = mvd->vdev_id; vdev_add_child(pvd, cvd); vdev_top_update(cvd->vdev_top, cvd->vdev_top); if (cvd == cvd->vdev_top) vdev_top_transfer(mvd, cvd); ASSERT(mvd->vdev_children == 0); vdev_free(mvd); } /* * Choose GCD for spa_gcd_alloc. */ static uint64_t vdev_gcd(uint64_t a, uint64_t b) { while (b != 0) { uint64_t t = b; b = a % b; a = t; } return (a); } /* * Set spa_min_alloc and spa_gcd_alloc. */ static void vdev_spa_set_alloc(spa_t *spa, uint64_t min_alloc) { if (min_alloc < spa->spa_min_alloc) spa->spa_min_alloc = min_alloc; if (spa->spa_gcd_alloc == INT_MAX) { spa->spa_gcd_alloc = min_alloc; } else { spa->spa_gcd_alloc = vdev_gcd(min_alloc, spa->spa_gcd_alloc); } } void vdev_metaslab_group_create(vdev_t *vd) { spa_t *spa = vd->vdev_spa; /* * metaslab_group_create was delayed until allocation bias was available */ if (vd->vdev_mg == NULL) { metaslab_class_t *mc; if (vd->vdev_islog && vd->vdev_alloc_bias == VDEV_BIAS_NONE) vd->vdev_alloc_bias = VDEV_BIAS_LOG; ASSERT3U(vd->vdev_islog, ==, (vd->vdev_alloc_bias == VDEV_BIAS_LOG)); switch (vd->vdev_alloc_bias) { case VDEV_BIAS_LOG: mc = spa_log_class(spa); break; case VDEV_BIAS_SPECIAL: mc = spa_special_class(spa); break; case VDEV_BIAS_DEDUP: mc = spa_dedup_class(spa); break; default: mc = spa_normal_class(spa); } vd->vdev_mg = metaslab_group_create(mc, vd, spa->spa_alloc_count); if (!vd->vdev_islog) { vd->vdev_log_mg = metaslab_group_create( spa_embedded_log_class(spa), vd, 1); } /* * The spa ashift min/max only apply for the normal metaslab * class. Class destination is late binding so ashift boundary * setting had to wait until now. */ if (vd->vdev_top == vd && vd->vdev_ashift != 0 && mc == spa_normal_class(spa) && vd->vdev_aux == NULL) { if (vd->vdev_ashift > spa->spa_max_ashift) spa->spa_max_ashift = vd->vdev_ashift; if (vd->vdev_ashift < spa->spa_min_ashift) spa->spa_min_ashift = vd->vdev_ashift; uint64_t min_alloc = vdev_get_min_alloc(vd); vdev_spa_set_alloc(spa, min_alloc); } } } int vdev_metaslab_init(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; uint64_t oldc = vd->vdev_ms_count; uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift; metaslab_t **mspp; int error; boolean_t expanding = (oldc != 0); ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER)); /* * This vdev is not being allocated from yet or is a hole. */ if (vd->vdev_ms_shift == 0) return (0); ASSERT(!vd->vdev_ishole); ASSERT(oldc <= newc); mspp = vmem_zalloc(newc * sizeof (*mspp), KM_SLEEP); if (expanding) { memcpy(mspp, vd->vdev_ms, oldc * sizeof (*mspp)); vmem_free(vd->vdev_ms, oldc * sizeof (*mspp)); } vd->vdev_ms = mspp; vd->vdev_ms_count = newc; for (uint64_t m = oldc; m < newc; m++) { uint64_t object = 0; /* * vdev_ms_array may be 0 if we are creating the "fake" * metaslabs for an indirect vdev for zdb's leak detection. * See zdb_leak_init(). */ if (txg == 0 && vd->vdev_ms_array != 0) { error = dmu_read(spa->spa_meta_objset, vd->vdev_ms_array, m * sizeof (uint64_t), sizeof (uint64_t), &object, DMU_READ_PREFETCH); if (error != 0) { vdev_dbgmsg(vd, "unable to read the metaslab " "array [error=%d]", error); return (error); } } error = metaslab_init(vd->vdev_mg, m, object, txg, &(vd->vdev_ms[m])); if (error != 0) { vdev_dbgmsg(vd, "metaslab_init failed [error=%d]", error); return (error); } } /* * Find the emptiest metaslab on the vdev and mark it for use for * embedded slog by moving it from the regular to the log metaslab * group. */ if (vd->vdev_mg->mg_class == spa_normal_class(spa) && vd->vdev_ms_count > zfs_embedded_slog_min_ms && avl_is_empty(&vd->vdev_log_mg->mg_metaslab_tree)) { uint64_t slog_msid = 0; uint64_t smallest = UINT64_MAX; /* * Note, we only search the new metaslabs, because the old * (pre-existing) ones may be active (e.g. have non-empty * range_tree's), and we don't move them to the new * metaslab_t. */ for (uint64_t m = oldc; m < newc; m++) { uint64_t alloc = space_map_allocated(vd->vdev_ms[m]->ms_sm); if (alloc < smallest) { slog_msid = m; smallest = alloc; } } metaslab_t *slog_ms = vd->vdev_ms[slog_msid]; /* * The metaslab was marked as dirty at the end of * metaslab_init(). Remove it from the dirty list so that we * can uninitialize and reinitialize it to the new class. */ if (txg != 0) { (void) txg_list_remove_this(&vd->vdev_ms_list, slog_ms, txg); } uint64_t sm_obj = space_map_object(slog_ms->ms_sm); metaslab_fini(slog_ms); VERIFY0(metaslab_init(vd->vdev_log_mg, slog_msid, sm_obj, txg, &vd->vdev_ms[slog_msid])); } if (txg == 0) spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER); /* * If the vdev is marked as non-allocating then don't * activate the metaslabs since we want to ensure that * no allocations are performed on this device. */ if (vd->vdev_noalloc) { /* track non-allocating vdev space */ spa->spa_nonallocating_dspace += spa_deflate(spa) ? vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space; } else if (!expanding) { metaslab_group_activate(vd->vdev_mg); if (vd->vdev_log_mg != NULL) metaslab_group_activate(vd->vdev_log_mg); } if (txg == 0) spa_config_exit(spa, SCL_ALLOC, FTAG); return (0); } void vdev_metaslab_fini(vdev_t *vd) { if (vd->vdev_checkpoint_sm != NULL) { ASSERT(spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_POOL_CHECKPOINT)); space_map_close(vd->vdev_checkpoint_sm); /* * Even though we close the space map, we need to set its * pointer to NULL. The reason is that vdev_metaslab_fini() * may be called multiple times for certain operations * (i.e. when destroying a pool) so we need to ensure that * this clause never executes twice. This logic is similar * to the one used for the vdev_ms clause below. */ vd->vdev_checkpoint_sm = NULL; } if (vd->vdev_ms != NULL) { metaslab_group_t *mg = vd->vdev_mg; metaslab_group_passivate(mg); if (vd->vdev_log_mg != NULL) { ASSERT(!vd->vdev_islog); metaslab_group_passivate(vd->vdev_log_mg); } uint64_t count = vd->vdev_ms_count; for (uint64_t m = 0; m < count; m++) { metaslab_t *msp = vd->vdev_ms[m]; if (msp != NULL) metaslab_fini(msp); } vmem_free(vd->vdev_ms, count * sizeof (metaslab_t *)); vd->vdev_ms = NULL; vd->vdev_ms_count = 0; for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) { ASSERT0(mg->mg_histogram[i]); if (vd->vdev_log_mg != NULL) ASSERT0(vd->vdev_log_mg->mg_histogram[i]); } } ASSERT0(vd->vdev_ms_count); } typedef struct vdev_probe_stats { boolean_t vps_readable; boolean_t vps_writeable; int vps_flags; } vdev_probe_stats_t; static void vdev_probe_done(zio_t *zio) { spa_t *spa = zio->io_spa; vdev_t *vd = zio->io_vd; vdev_probe_stats_t *vps = zio->io_private; ASSERT(vd->vdev_probe_zio != NULL); if (zio->io_type == ZIO_TYPE_READ) { if (zio->io_error == 0) vps->vps_readable = 1; if (zio->io_error == 0 && spa_writeable(spa)) { zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd, zio->io_offset, zio->io_size, zio->io_abd, ZIO_CHECKSUM_OFF, vdev_probe_done, vps, ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE)); } else { abd_free(zio->io_abd); } } else if (zio->io_type == ZIO_TYPE_WRITE) { if (zio->io_error == 0) vps->vps_writeable = 1; abd_free(zio->io_abd); } else if (zio->io_type == ZIO_TYPE_NULL) { zio_t *pio; zio_link_t *zl; vd->vdev_cant_read |= !vps->vps_readable; vd->vdev_cant_write |= !vps->vps_writeable; vdev_dbgmsg(vd, "probe done, cant_read=%u cant_write=%u", vd->vdev_cant_read, vd->vdev_cant_write); if (vdev_readable(vd) && (vdev_writeable(vd) || !spa_writeable(spa))) { zio->io_error = 0; } else { ASSERT(zio->io_error != 0); vdev_dbgmsg(vd, "failed probe"); (void) zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE, spa, vd, NULL, NULL, 0); zio->io_error = SET_ERROR(ENXIO); } mutex_enter(&vd->vdev_probe_lock); ASSERT(vd->vdev_probe_zio == zio); vd->vdev_probe_zio = NULL; mutex_exit(&vd->vdev_probe_lock); zl = NULL; while ((pio = zio_walk_parents(zio, &zl)) != NULL) if (!vdev_accessible(vd, pio)) pio->io_error = SET_ERROR(ENXIO); kmem_free(vps, sizeof (*vps)); } } /* * Determine whether this device is accessible. * * Read and write to several known locations: the pad regions of each * vdev label but the first, which we leave alone in case it contains * a VTOC. */ zio_t * vdev_probe(vdev_t *vd, zio_t *zio) { spa_t *spa = vd->vdev_spa; vdev_probe_stats_t *vps = NULL; zio_t *pio; ASSERT(vd->vdev_ops->vdev_op_leaf); /* * Don't probe the probe. */ if (zio && (zio->io_flags & ZIO_FLAG_PROBE)) return (NULL); /* * To prevent 'probe storms' when a device fails, we create * just one probe i/o at a time. All zios that want to probe * this vdev will become parents of the probe io. */ mutex_enter(&vd->vdev_probe_lock); if ((pio = vd->vdev_probe_zio) == NULL) { vps = kmem_zalloc(sizeof (*vps), KM_SLEEP); vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE | ZIO_FLAG_DONT_AGGREGATE | ZIO_FLAG_TRYHARD; if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) { /* * vdev_cant_read and vdev_cant_write can only * transition from TRUE to FALSE when we have the * SCL_ZIO lock as writer; otherwise they can only * transition from FALSE to TRUE. This ensures that * any zio looking at these values can assume that * failures persist for the life of the I/O. That's * important because when a device has intermittent * connectivity problems, we want to ensure that * they're ascribed to the device (ENXIO) and not * the zio (EIO). * * Since we hold SCL_ZIO as writer here, clear both * values so the probe can reevaluate from first * principles. */ vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER; vd->vdev_cant_read = B_FALSE; vd->vdev_cant_write = B_FALSE; } vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd, vdev_probe_done, vps, vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE); /* * We can't change the vdev state in this context, so we * kick off an async task to do it on our behalf. */ if (zio != NULL) { vd->vdev_probe_wanted = B_TRUE; spa_async_request(spa, SPA_ASYNC_PROBE); } } if (zio != NULL) zio_add_child(zio, pio); mutex_exit(&vd->vdev_probe_lock); if (vps == NULL) { ASSERT(zio != NULL); return (NULL); } for (int l = 1; l < VDEV_LABELS; l++) { zio_nowait(zio_read_phys(pio, vd, vdev_label_offset(vd->vdev_psize, l, offsetof(vdev_label_t, vl_be)), VDEV_PAD_SIZE, abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE), ZIO_CHECKSUM_OFF, vdev_probe_done, vps, ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE)); } if (zio == NULL) return (pio); zio_nowait(pio); return (NULL); } static void vdev_load_child(void *arg) { vdev_t *vd = arg; vd->vdev_load_error = vdev_load(vd); } static void vdev_open_child(void *arg) { vdev_t *vd = arg; vd->vdev_open_thread = curthread; vd->vdev_open_error = vdev_open(vd); vd->vdev_open_thread = NULL; } static boolean_t vdev_uses_zvols(vdev_t *vd) { #ifdef _KERNEL if (zvol_is_zvol(vd->vdev_path)) return (B_TRUE); #endif for (int c = 0; c < vd->vdev_children; c++) if (vdev_uses_zvols(vd->vdev_child[c])) return (B_TRUE); return (B_FALSE); } /* * Returns B_TRUE if the passed child should be opened. */ static boolean_t vdev_default_open_children_func(vdev_t *vd) { (void) vd; return (B_TRUE); } /* * Open the requested child vdevs. If any of the leaf vdevs are using * a ZFS volume then do the opens in a single thread. This avoids a * deadlock when the current thread is holding the spa_namespace_lock. */ static void vdev_open_children_impl(vdev_t *vd, vdev_open_children_func_t *open_func) { int children = vd->vdev_children; taskq_t *tq = taskq_create("vdev_open", children, minclsyspri, children, children, TASKQ_PREPOPULATE); vd->vdev_nonrot = B_TRUE; for (int c = 0; c < children; c++) { vdev_t *cvd = vd->vdev_child[c]; if (open_func(cvd) == B_FALSE) continue; if (tq == NULL || vdev_uses_zvols(vd)) { cvd->vdev_open_error = vdev_open(cvd); } else { VERIFY(taskq_dispatch(tq, vdev_open_child, cvd, TQ_SLEEP) != TASKQID_INVALID); } vd->vdev_nonrot &= cvd->vdev_nonrot; } if (tq != NULL) { taskq_wait(tq); taskq_destroy(tq); } } /* * Open all child vdevs. */ void vdev_open_children(vdev_t *vd) { vdev_open_children_impl(vd, vdev_default_open_children_func); } /* * Conditionally open a subset of child vdevs. */ void vdev_open_children_subset(vdev_t *vd, vdev_open_children_func_t *open_func) { vdev_open_children_impl(vd, open_func); } /* * Compute the raidz-deflation ratio. Note, we hard-code 128k (1 << 17) * because it is the "typical" blocksize. Even though SPA_MAXBLOCKSIZE * changed, this algorithm can not change, otherwise it would inconsistently * account for existing bp's. We also hard-code txg 0 for the same reason * since expanded RAIDZ vdevs can use a different asize for different birth * txg's. */ static void vdev_set_deflate_ratio(vdev_t *vd) { if (vd == vd->vdev_top && !vd->vdev_ishole && vd->vdev_ashift != 0) { vd->vdev_deflate_ratio = (1 << 17) / (vdev_psize_to_asize_txg(vd, 1 << 17, 0) >> SPA_MINBLOCKSHIFT); } } /* * Choose the best of two ashifts, preferring one between logical ashift * (absolute minimum) and administrator defined maximum, otherwise take * the biggest of the two. */ uint64_t vdev_best_ashift(uint64_t logical, uint64_t a, uint64_t b) { if (a > logical && a <= zfs_vdev_max_auto_ashift) { if (b <= logical || b > zfs_vdev_max_auto_ashift) return (a); else return (MAX(a, b)); } else if (b <= logical || b > zfs_vdev_max_auto_ashift) return (MAX(a, b)); return (b); } /* * Maximize performance by inflating the configured ashift for top level * vdevs to be as close to the physical ashift as possible while maintaining * administrator defined limits and ensuring it doesn't go below the * logical ashift. */ static void vdev_ashift_optimize(vdev_t *vd) { ASSERT(vd == vd->vdev_top); if (vd->vdev_ashift < vd->vdev_physical_ashift && vd->vdev_physical_ashift <= zfs_vdev_max_auto_ashift) { vd->vdev_ashift = MIN( MAX(zfs_vdev_max_auto_ashift, vd->vdev_ashift), MAX(zfs_vdev_min_auto_ashift, vd->vdev_physical_ashift)); } else { /* * If the logical and physical ashifts are the same, then * we ensure that the top-level vdev's ashift is not smaller * than our minimum ashift value. For the unusual case * where logical ashift > physical ashift, we can't cap * the calculated ashift based on max ashift as that * would cause failures. * We still check if we need to increase it to match * the min ashift. */ vd->vdev_ashift = MAX(zfs_vdev_min_auto_ashift, vd->vdev_ashift); } } /* * Prepare a virtual device for access. */ int vdev_open(vdev_t *vd) { spa_t *spa = vd->vdev_spa; int error; uint64_t osize = 0; uint64_t max_osize = 0; uint64_t asize, max_asize, psize; uint64_t logical_ashift = 0; uint64_t physical_ashift = 0; ASSERT(vd->vdev_open_thread == curthread || spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); ASSERT(vd->vdev_state == VDEV_STATE_CLOSED || vd->vdev_state == VDEV_STATE_CANT_OPEN || vd->vdev_state == VDEV_STATE_OFFLINE); vd->vdev_stat.vs_aux = VDEV_AUX_NONE; vd->vdev_cant_read = B_FALSE; vd->vdev_cant_write = B_FALSE; vd->vdev_min_asize = vdev_get_min_asize(vd); /* * If this vdev is not removed, check its fault status. If it's * faulted, bail out of the open. */ if (!vd->vdev_removed && vd->vdev_faulted) { ASSERT(vd->vdev_children == 0); ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED || vd->vdev_label_aux == VDEV_AUX_EXTERNAL); vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, vd->vdev_label_aux); return (SET_ERROR(ENXIO)); } else if (vd->vdev_offline) { ASSERT(vd->vdev_children == 0); vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE); return (SET_ERROR(ENXIO)); } error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize, &logical_ashift, &physical_ashift); /* Keep the device in removed state if unplugged */ if (error == ENOENT && vd->vdev_removed) { vdev_set_state(vd, B_TRUE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); return (error); } /* * Physical volume size should never be larger than its max size, unless * the disk has shrunk while we were reading it or the device is buggy * or damaged: either way it's not safe for use, bail out of the open. */ if (osize > max_osize) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_OPEN_FAILED); return (SET_ERROR(ENXIO)); } /* * Reset the vdev_reopening flag so that we actually close * the vdev on error. */ vd->vdev_reopening = B_FALSE; if (zio_injection_enabled && error == 0) error = zio_handle_device_injection(vd, NULL, SET_ERROR(ENXIO)); if (error) { if (vd->vdev_removed && vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED) vd->vdev_removed = B_FALSE; if (vd->vdev_stat.vs_aux == VDEV_AUX_CHILDREN_OFFLINE) { vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, vd->vdev_stat.vs_aux); } else { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, vd->vdev_stat.vs_aux); } return (error); } vd->vdev_removed = B_FALSE; /* * Recheck the faulted flag now that we have confirmed that * the vdev is accessible. If we're faulted, bail. */ if (vd->vdev_faulted) { ASSERT(vd->vdev_children == 0); ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED || vd->vdev_label_aux == VDEV_AUX_EXTERNAL); vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, vd->vdev_label_aux); return (SET_ERROR(ENXIO)); } if (vd->vdev_degraded) { ASSERT(vd->vdev_children == 0); vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, VDEV_AUX_ERR_EXCEEDED); } else { vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0); } /* * For hole or missing vdevs we just return success. */ if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) return (0); for (int c = 0; c < vd->vdev_children; c++) { if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) { vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); break; } } osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t)); max_osize = P2ALIGN(max_osize, (uint64_t)sizeof (vdev_label_t)); if (vd->vdev_children == 0) { if (osize < SPA_MINDEVSIZE) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_TOO_SMALL); return (SET_ERROR(EOVERFLOW)); } psize = osize; asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE); max_asize = max_osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE); } else { if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_TOO_SMALL); return (SET_ERROR(EOVERFLOW)); } psize = 0; asize = osize; max_asize = max_osize; } /* * If the vdev was expanded, record this so that we can re-create the * uberblock rings in labels {2,3}, during the next sync. */ if ((psize > vd->vdev_psize) && (vd->vdev_psize != 0)) vd->vdev_copy_uberblocks = B_TRUE; vd->vdev_psize = psize; /* * Make sure the allocatable size hasn't shrunk too much. */ if (asize < vd->vdev_min_asize) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_LABEL); return (SET_ERROR(EINVAL)); } /* * We can always set the logical/physical ashift members since * their values are only used to calculate the vdev_ashift when * the device is first added to the config. These values should * not be used for anything else since they may change whenever * the device is reopened and we don't store them in the label. */ vd->vdev_physical_ashift = MAX(physical_ashift, vd->vdev_physical_ashift); vd->vdev_logical_ashift = MAX(logical_ashift, vd->vdev_logical_ashift); if (vd->vdev_asize == 0) { /* * This is the first-ever open, so use the computed values. * For compatibility, a different ashift can be requested. */ vd->vdev_asize = asize; vd->vdev_max_asize = max_asize; /* * If the vdev_ashift was not overridden at creation time, * then set it the logical ashift and optimize the ashift. */ if (vd->vdev_ashift == 0) { vd->vdev_ashift = vd->vdev_logical_ashift; if (vd->vdev_logical_ashift > ASHIFT_MAX) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_ASHIFT_TOO_BIG); return (SET_ERROR(EDOM)); } if (vd->vdev_top == vd && vd->vdev_attaching == B_FALSE) vdev_ashift_optimize(vd); vd->vdev_attaching = B_FALSE; } if (vd->vdev_ashift != 0 && (vd->vdev_ashift < ASHIFT_MIN || vd->vdev_ashift > ASHIFT_MAX)) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_ASHIFT); return (SET_ERROR(EDOM)); } } else { /* * Make sure the alignment required hasn't increased. */ if (vd->vdev_ashift > vd->vdev_top->vdev_ashift && vd->vdev_ops->vdev_op_leaf) { (void) zfs_ereport_post( FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT, spa, vd, NULL, NULL, 0); vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_LABEL); return (SET_ERROR(EDOM)); } vd->vdev_max_asize = max_asize; } /* * If all children are healthy we update asize if either: * The asize has increased, due to a device expansion caused by dynamic * LUN growth or vdev replacement, and automatic expansion is enabled; * making the additional space available. * * The asize has decreased, due to a device shrink usually caused by a * vdev replace with a smaller device. This ensures that calculations * based of max_asize and asize e.g. esize are always valid. It's safe * to do this as we've already validated that asize is greater than * vdev_min_asize. */ if (vd->vdev_state == VDEV_STATE_HEALTHY && ((asize > vd->vdev_asize && (vd->vdev_expanding || spa->spa_autoexpand)) || (asize < vd->vdev_asize))) vd->vdev_asize = asize; vdev_set_min_asize(vd); /* * Ensure we can issue some IO before declaring the * vdev open for business. */ if (vd->vdev_ops->vdev_op_leaf && (error = zio_wait(vdev_probe(vd, NULL))) != 0) { vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, VDEV_AUX_ERR_EXCEEDED); return (error); } /* * Track the minimum allocation size. */ if (vd->vdev_top == vd && vd->vdev_ashift != 0 && vd->vdev_islog == 0 && vd->vdev_aux == NULL) { uint64_t min_alloc = vdev_get_min_alloc(vd); vdev_spa_set_alloc(spa, min_alloc); } /* * If this is a leaf vdev, assess whether a resilver is needed. * But don't do this if we are doing a reopen for a scrub, since * this would just restart the scrub we are already doing. */ if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen) dsl_scan_assess_vdev(spa->spa_dsl_pool, vd); return (0); } static void vdev_validate_child(void *arg) { vdev_t *vd = arg; vd->vdev_validate_thread = curthread; vd->vdev_validate_error = vdev_validate(vd); vd->vdev_validate_thread = NULL; } /* * Called once the vdevs are all opened, this routine validates the label * contents. This needs to be done before vdev_load() so that we don't * inadvertently do repair I/Os to the wrong device. * * This function will only return failure if one of the vdevs indicates that it * has since been destroyed or exported. This is only possible if * /etc/zfs/zpool.cache was readonly at the time. Otherwise, the vdev state * will be updated but the function will return 0. */ int vdev_validate(vdev_t *vd) { spa_t *spa = vd->vdev_spa; taskq_t *tq = NULL; nvlist_t *label; uint64_t guid = 0, aux_guid = 0, top_guid; uint64_t state; nvlist_t *nvl; uint64_t txg; int children = vd->vdev_children; if (vdev_validate_skip) return (0); if (children > 0) { tq = taskq_create("vdev_validate", children, minclsyspri, children, children, TASKQ_PREPOPULATE); } for (uint64_t c = 0; c < children; c++) { vdev_t *cvd = vd->vdev_child[c]; if (tq == NULL || vdev_uses_zvols(cvd)) { vdev_validate_child(cvd); } else { VERIFY(taskq_dispatch(tq, vdev_validate_child, cvd, TQ_SLEEP) != TASKQID_INVALID); } } if (tq != NULL) { taskq_wait(tq); taskq_destroy(tq); } for (int c = 0; c < children; c++) { int error = vd->vdev_child[c]->vdev_validate_error; if (error != 0) return (SET_ERROR(EBADF)); } /* * If the device has already failed, or was marked offline, don't do * any further validation. Otherwise, label I/O will fail and we will * overwrite the previous state. */ if (!vd->vdev_ops->vdev_op_leaf || !vdev_readable(vd)) return (0); /* * If we are performing an extreme rewind, we allow for a label that * was modified at a point after the current txg. * If config lock is not held do not check for the txg. spa_sync could * be updating the vdev's label before updating spa_last_synced_txg. */ if (spa->spa_extreme_rewind || spa_last_synced_txg(spa) == 0 || spa_config_held(spa, SCL_CONFIG, RW_WRITER) != SCL_CONFIG) txg = UINT64_MAX; else txg = spa_last_synced_txg(spa); if ((label = vdev_label_read_config(vd, txg)) == NULL) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_LABEL); vdev_dbgmsg(vd, "vdev_validate: failed reading config for " "txg %llu", (u_longlong_t)txg); return (0); } /* * Determine if this vdev has been split off into another * pool. If so, then refuse to open it. */ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID, &aux_guid) == 0 && aux_guid == spa_guid(spa)) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_SPLIT_POOL); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: vdev split into other pool"); return (0); } if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &guid) != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", ZPOOL_CONFIG_POOL_GUID); return (0); } /* * If config is not trusted then ignore the spa guid check. This is * necessary because if the machine crashed during a re-guid the new * guid might have been written to all of the vdev labels, but not the * cached config. The check will be performed again once we have the * trusted config from the MOS. */ if (spa->spa_trust_config && guid != spa_guid(spa)) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: vdev label pool_guid doesn't " "match config (%llu != %llu)", (u_longlong_t)guid, (u_longlong_t)spa_guid(spa)); return (0); } if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID, &aux_guid) != 0) aux_guid = 0; if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", ZPOOL_CONFIG_GUID); return (0); } if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID, &top_guid) != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", ZPOOL_CONFIG_TOP_GUID); return (0); } /* * If this vdev just became a top-level vdev because its sibling was * detached, it will have adopted the parent's vdev guid -- but the * label may or may not be on disk yet. Fortunately, either version * of the label will have the same top guid, so if we're a top-level * vdev, we can safely compare to that instead. * However, if the config comes from a cachefile that failed to update * after the detach, a top-level vdev will appear as a non top-level * vdev in the config. Also relax the constraints if we perform an * extreme rewind. * * If we split this vdev off instead, then we also check the * original pool's guid. We don't want to consider the vdev * corrupt if it is partway through a split operation. */ if (vd->vdev_guid != guid && vd->vdev_guid != aux_guid) { boolean_t mismatch = B_FALSE; if (spa->spa_trust_config && !spa->spa_extreme_rewind) { if (vd != vd->vdev_top || vd->vdev_guid != top_guid) mismatch = B_TRUE; } else { if (vd->vdev_guid != top_guid && vd->vdev_top->vdev_guid != guid) mismatch = B_TRUE; } if (mismatch) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: config guid " "doesn't match label guid"); vdev_dbgmsg(vd, "CONFIG: guid %llu, top_guid %llu", (u_longlong_t)vd->vdev_guid, (u_longlong_t)vd->vdev_top->vdev_guid); vdev_dbgmsg(vd, "LABEL: guid %llu, top_guid %llu, " "aux_guid %llu", (u_longlong_t)guid, (u_longlong_t)top_guid, (u_longlong_t)aux_guid); return (0); } } if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", ZPOOL_CONFIG_POOL_STATE); return (0); } nvlist_free(label); /* * If this is a verbatim import, no need to check the * state of the pool. */ if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) && spa_load_state(spa) == SPA_LOAD_OPEN && state != POOL_STATE_ACTIVE) { vdev_dbgmsg(vd, "vdev_validate: invalid pool state (%llu) " "for spa %s", (u_longlong_t)state, spa->spa_name); return (SET_ERROR(EBADF)); } /* * If we were able to open and validate a vdev that was * previously marked permanently unavailable, clear that state * now. */ if (vd->vdev_not_present) vd->vdev_not_present = 0; return (0); } static void vdev_update_path(const char *prefix, char *svd, char **dvd, uint64_t guid) { if (svd != NULL && *dvd != NULL) { if (strcmp(svd, *dvd) != 0) { zfs_dbgmsg("vdev_copy_path: vdev %llu: %s changed " "from '%s' to '%s'", (u_longlong_t)guid, prefix, *dvd, svd); spa_strfree(*dvd); *dvd = spa_strdup(svd); } } else if (svd != NULL) { *dvd = spa_strdup(svd); zfs_dbgmsg("vdev_copy_path: vdev %llu: path set to '%s'", (u_longlong_t)guid, *dvd); } } static void vdev_copy_path_impl(vdev_t *svd, vdev_t *dvd) { char *old, *new; vdev_update_path("vdev_path", svd->vdev_path, &dvd->vdev_path, dvd->vdev_guid); vdev_update_path("vdev_devid", svd->vdev_devid, &dvd->vdev_devid, dvd->vdev_guid); vdev_update_path("vdev_physpath", svd->vdev_physpath, &dvd->vdev_physpath, dvd->vdev_guid); /* * Our enclosure sysfs path may have changed between imports */ old = dvd->vdev_enc_sysfs_path; new = svd->vdev_enc_sysfs_path; if ((old != NULL && new == NULL) || (old == NULL && new != NULL) || ((old != NULL && new != NULL) && strcmp(new, old) != 0)) { zfs_dbgmsg("vdev_copy_path: vdev %llu: vdev_enc_sysfs_path " "changed from '%s' to '%s'", (u_longlong_t)dvd->vdev_guid, old, new); if (dvd->vdev_enc_sysfs_path) spa_strfree(dvd->vdev_enc_sysfs_path); if (svd->vdev_enc_sysfs_path) { dvd->vdev_enc_sysfs_path = spa_strdup( svd->vdev_enc_sysfs_path); } else { dvd->vdev_enc_sysfs_path = NULL; } } } /* * Recursively copy vdev paths from one vdev to another. Source and destination * vdev trees must have same geometry otherwise return error. Intended to copy * paths from userland config into MOS config. */ int vdev_copy_path_strict(vdev_t *svd, vdev_t *dvd) { if ((svd->vdev_ops == &vdev_missing_ops) || (svd->vdev_ishole && dvd->vdev_ishole) || (dvd->vdev_ops == &vdev_indirect_ops)) return (0); if (svd->vdev_ops != dvd->vdev_ops) { vdev_dbgmsg(svd, "vdev_copy_path: vdev type mismatch: %s != %s", svd->vdev_ops->vdev_op_type, dvd->vdev_ops->vdev_op_type); return (SET_ERROR(EINVAL)); } if (svd->vdev_guid != dvd->vdev_guid) { vdev_dbgmsg(svd, "vdev_copy_path: guids mismatch (%llu != " "%llu)", (u_longlong_t)svd->vdev_guid, (u_longlong_t)dvd->vdev_guid); return (SET_ERROR(EINVAL)); } if (svd->vdev_children != dvd->vdev_children) { vdev_dbgmsg(svd, "vdev_copy_path: children count mismatch: " "%llu != %llu", (u_longlong_t)svd->vdev_children, (u_longlong_t)dvd->vdev_children); return (SET_ERROR(EINVAL)); } for (uint64_t i = 0; i < svd->vdev_children; i++) { int error = vdev_copy_path_strict(svd->vdev_child[i], dvd->vdev_child[i]); if (error != 0) return (error); } if (svd->vdev_ops->vdev_op_leaf) vdev_copy_path_impl(svd, dvd); return (0); } static void vdev_copy_path_search(vdev_t *stvd, vdev_t *dvd) { ASSERT(stvd->vdev_top == stvd); ASSERT3U(stvd->vdev_id, ==, dvd->vdev_top->vdev_id); for (uint64_t i = 0; i < dvd->vdev_children; i++) { vdev_copy_path_search(stvd, dvd->vdev_child[i]); } if (!dvd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(dvd)) return; /* * The idea here is that while a vdev can shift positions within * a top vdev (when replacing, attaching mirror, etc.) it cannot * step outside of it. */ vdev_t *vd = vdev_lookup_by_guid(stvd, dvd->vdev_guid); if (vd == NULL || vd->vdev_ops != dvd->vdev_ops) return; ASSERT(vd->vdev_ops->vdev_op_leaf); vdev_copy_path_impl(vd, dvd); } /* * Recursively copy vdev paths from one root vdev to another. Source and * destination vdev trees may differ in geometry. For each destination leaf * vdev, search a vdev with the same guid and top vdev id in the source. * Intended to copy paths from userland config into MOS config. */ void vdev_copy_path_relaxed(vdev_t *srvd, vdev_t *drvd) { uint64_t children = MIN(srvd->vdev_children, drvd->vdev_children); ASSERT(srvd->vdev_ops == &vdev_root_ops); ASSERT(drvd->vdev_ops == &vdev_root_ops); for (uint64_t i = 0; i < children; i++) { vdev_copy_path_search(srvd->vdev_child[i], drvd->vdev_child[i]); } } /* * Close a virtual device. */ void vdev_close(vdev_t *vd) { vdev_t *pvd = vd->vdev_parent; spa_t *spa __maybe_unused = vd->vdev_spa; ASSERT(vd != NULL); ASSERT(vd->vdev_open_thread == curthread || spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); /* * If our parent is reopening, then we are as well, unless we are * going offline. */ if (pvd != NULL && pvd->vdev_reopening) vd->vdev_reopening = (pvd->vdev_reopening && !vd->vdev_offline); vd->vdev_ops->vdev_op_close(vd); /* * We record the previous state before we close it, so that if we are * doing a reopen(), we don't generate FMA ereports if we notice that * it's still faulted. */ vd->vdev_prevstate = vd->vdev_state; if (vd->vdev_offline) vd->vdev_state = VDEV_STATE_OFFLINE; else vd->vdev_state = VDEV_STATE_CLOSED; vd->vdev_stat.vs_aux = VDEV_AUX_NONE; } void vdev_hold(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_is_root(spa)); if (spa->spa_state == POOL_STATE_UNINITIALIZED) return; for (int c = 0; c < vd->vdev_children; c++) vdev_hold(vd->vdev_child[c]); if (vd->vdev_ops->vdev_op_leaf && vd->vdev_ops->vdev_op_hold != NULL) vd->vdev_ops->vdev_op_hold(vd); } void vdev_rele(vdev_t *vd) { ASSERT(spa_is_root(vd->vdev_spa)); for (int c = 0; c < vd->vdev_children; c++) vdev_rele(vd->vdev_child[c]); if (vd->vdev_ops->vdev_op_leaf && vd->vdev_ops->vdev_op_rele != NULL) vd->vdev_ops->vdev_op_rele(vd); } /* * Reopen all interior vdevs and any unopened leaves. We don't actually * reopen leaf vdevs which had previously been opened as they might deadlock * on the spa_config_lock. Instead we only obtain the leaf's physical size. * If the leaf has never been opened then open it, as usual. */ void vdev_reopen(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); /* set the reopening flag unless we're taking the vdev offline */ vd->vdev_reopening = !vd->vdev_offline; vdev_close(vd); (void) vdev_open(vd); /* * Call vdev_validate() here to make sure we have the same device. * Otherwise, a device with an invalid label could be successfully * opened in response to vdev_reopen(). */ if (vd->vdev_aux) { (void) vdev_validate_aux(vd); if (vdev_readable(vd) && vdev_writeable(vd) && vd->vdev_aux == &spa->spa_l2cache) { /* * In case the vdev is present we should evict all ARC * buffers and pointers to log blocks and reclaim their * space before restoring its contents to L2ARC. */ if (l2arc_vdev_present(vd)) { l2arc_rebuild_vdev(vd, B_TRUE); } else { l2arc_add_vdev(spa, vd); } spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD); spa_async_request(spa, SPA_ASYNC_L2CACHE_TRIM); } } else { (void) vdev_validate(vd); } /* * Recheck if resilver is still needed and cancel any * scheduled resilver if resilver is unneeded. */ if (!vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL) && spa->spa_async_tasks & SPA_ASYNC_RESILVER) { mutex_enter(&spa->spa_async_lock); spa->spa_async_tasks &= ~SPA_ASYNC_RESILVER; mutex_exit(&spa->spa_async_lock); } /* * Reassess parent vdev's health. */ vdev_propagate_state(vd); } int vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing) { int error; /* * Normally, partial opens (e.g. of a mirror) are allowed. * For a create, however, we want to fail the request if * there are any components we can't open. */ error = vdev_open(vd); if (error || vd->vdev_state != VDEV_STATE_HEALTHY) { vdev_close(vd); return (error ? error : SET_ERROR(ENXIO)); } /* * Recursively load DTLs and initialize all labels. */ if ((error = vdev_dtl_load(vd)) != 0 || (error = vdev_label_init(vd, txg, isreplacing ? VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) { vdev_close(vd); return (error); } return (0); } void vdev_metaslab_set_size(vdev_t *vd) { uint64_t asize = vd->vdev_asize; uint64_t ms_count = asize >> zfs_vdev_default_ms_shift; uint64_t ms_shift; /* * There are two dimensions to the metaslab sizing calculation: * the size of the metaslab and the count of metaslabs per vdev. * * The default values used below are a good balance between memory * usage (larger metaslab size means more memory needed for loaded * metaslabs; more metaslabs means more memory needed for the * metaslab_t structs), metaslab load time (larger metaslabs take * longer to load), and metaslab sync time (more metaslabs means * more time spent syncing all of them). * * In general, we aim for zfs_vdev_default_ms_count (200) metaslabs. * The range of the dimensions are as follows: * * 2^29 <= ms_size <= 2^34 * 16 <= ms_count <= 131,072 * * On the lower end of vdev sizes, we aim for metaslabs sizes of * at least 512MB (2^29) to minimize fragmentation effects when * testing with smaller devices. However, the count constraint * of at least 16 metaslabs will override this minimum size goal. * * On the upper end of vdev sizes, we aim for a maximum metaslab * size of 16GB. However, we will cap the total count to 2^17 * metaslabs to keep our memory footprint in check and let the * metaslab size grow from there if that limit is hit. * * The net effect of applying above constrains is summarized below. * * vdev size metaslab count * --------------|----------------- * < 8GB ~16 * 8GB - 100GB one per 512MB * 100GB - 3TB ~200 * 3TB - 2PB one per 16GB * > 2PB ~131,072 * -------------------------------- * * Finally, note that all of the above calculate the initial * number of metaslabs. Expanding a top-level vdev will result * in additional metaslabs being allocated making it possible * to exceed the zfs_vdev_ms_count_limit. */ if (ms_count < zfs_vdev_min_ms_count) ms_shift = highbit64(asize / zfs_vdev_min_ms_count); else if (ms_count > zfs_vdev_default_ms_count) ms_shift = highbit64(asize / zfs_vdev_default_ms_count); else ms_shift = zfs_vdev_default_ms_shift; if (ms_shift < SPA_MAXBLOCKSHIFT) { ms_shift = SPA_MAXBLOCKSHIFT; } else if (ms_shift > zfs_vdev_max_ms_shift) { ms_shift = zfs_vdev_max_ms_shift; /* cap the total count to constrain memory footprint */ if ((asize >> ms_shift) > zfs_vdev_ms_count_limit) ms_shift = highbit64(asize / zfs_vdev_ms_count_limit); } vd->vdev_ms_shift = ms_shift; ASSERT3U(vd->vdev_ms_shift, >=, SPA_MAXBLOCKSHIFT); } void vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg) { ASSERT(vd == vd->vdev_top); /* indirect vdevs don't have metaslabs or dtls */ ASSERT(vdev_is_concrete(vd) || flags == 0); ASSERT(ISP2(flags)); ASSERT(spa_writeable(vd->vdev_spa)); if (flags & VDD_METASLAB) (void) txg_list_add(&vd->vdev_ms_list, arg, txg); if (flags & VDD_DTL) (void) txg_list_add(&vd->vdev_dtl_list, arg, txg); (void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg); } void vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg) { for (int c = 0; c < vd->vdev_children; c++) vdev_dirty_leaves(vd->vdev_child[c], flags, txg); if (vd->vdev_ops->vdev_op_leaf) vdev_dirty(vd->vdev_top, flags, vd, txg); } /* * DTLs. * * A vdev's DTL (dirty time log) is the set of transaction groups for which * the vdev has less than perfect replication. There are four kinds of DTL: * * DTL_MISSING: txgs for which the vdev has no valid copies of the data * * DTL_PARTIAL: txgs for which data is available, but not fully replicated * * DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon * scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of * txgs that was scrubbed. * * DTL_OUTAGE: txgs which cannot currently be read, whether due to * persistent errors or just some device being offline. * Unlike the other three, the DTL_OUTAGE map is not generally * maintained; it's only computed when needed, typically to * determine whether a device can be detached. * * For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device * either has the data or it doesn't. * * For interior vdevs such as mirror and RAID-Z the picture is more complex. * A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because * if any child is less than fully replicated, then so is its parent. * A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs, * comprising only those txgs which appear in 'maxfaults' or more children; * those are the txgs we don't have enough replication to read. For example, * double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2); * thus, its DTL_MISSING consists of the set of txgs that appear in more than * two child DTL_MISSING maps. * * It should be clear from the above that to compute the DTLs and outage maps * for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps. * Therefore, that is all we keep on disk. When loading the pool, or after * a configuration change, we generate all other DTLs from first principles. */ void vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) { range_tree_t *rt = vd->vdev_dtl[t]; ASSERT(t < DTL_TYPES); ASSERT(vd != vd->vdev_spa->spa_root_vdev); ASSERT(spa_writeable(vd->vdev_spa)); mutex_enter(&vd->vdev_dtl_lock); if (!range_tree_contains(rt, txg, size)) range_tree_add(rt, txg, size); mutex_exit(&vd->vdev_dtl_lock); } boolean_t vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) { range_tree_t *rt = vd->vdev_dtl[t]; boolean_t dirty = B_FALSE; ASSERT(t < DTL_TYPES); ASSERT(vd != vd->vdev_spa->spa_root_vdev); /* * While we are loading the pool, the DTLs have not been loaded yet. * This isn't a problem but it can result in devices being tried * which are known to not have the data. In which case, the import * is relying on the checksum to ensure that we get the right data. * Note that while importing we are only reading the MOS, which is * always checksummed. */ mutex_enter(&vd->vdev_dtl_lock); if (!range_tree_is_empty(rt)) dirty = range_tree_contains(rt, txg, size); mutex_exit(&vd->vdev_dtl_lock); return (dirty); } boolean_t vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t) { range_tree_t *rt = vd->vdev_dtl[t]; boolean_t empty; mutex_enter(&vd->vdev_dtl_lock); empty = range_tree_is_empty(rt); mutex_exit(&vd->vdev_dtl_lock); return (empty); } /* * Check if the txg falls within the range which must be * resilvered. DVAs outside this range can always be skipped. */ boolean_t vdev_default_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, uint64_t phys_birth) { (void) dva, (void) psize; /* Set by sequential resilver. */ if (phys_birth == TXG_UNKNOWN) return (B_TRUE); return (vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1)); } /* * Returns B_TRUE if the vdev determines the DVA needs to be resilvered. */ boolean_t vdev_dtl_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, uint64_t phys_birth) { ASSERT(vd != vd->vdev_spa->spa_root_vdev); if (vd->vdev_ops->vdev_op_need_resilver == NULL || vd->vdev_ops->vdev_op_leaf) return (B_TRUE); return (vd->vdev_ops->vdev_op_need_resilver(vd, dva, psize, phys_birth)); } /* * Returns the lowest txg in the DTL range. */ static uint64_t vdev_dtl_min(vdev_t *vd) { ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock)); ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0); ASSERT0(vd->vdev_children); return (range_tree_min(vd->vdev_dtl[DTL_MISSING]) - 1); } /* * Returns the highest txg in the DTL. */ static uint64_t vdev_dtl_max(vdev_t *vd) { ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock)); ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0); ASSERT0(vd->vdev_children); return (range_tree_max(vd->vdev_dtl[DTL_MISSING])); } /* * Determine if a resilvering vdev should remove any DTL entries from * its range. If the vdev was resilvering for the entire duration of the * scan then it should excise that range from its DTLs. Otherwise, this * vdev is considered partially resilvered and should leave its DTL * entries intact. The comment in vdev_dtl_reassess() describes how we * excise the DTLs. */ static boolean_t vdev_dtl_should_excise(vdev_t *vd, boolean_t rebuild_done) { ASSERT0(vd->vdev_children); if (vd->vdev_state < VDEV_STATE_DEGRADED) return (B_FALSE); if (vd->vdev_resilver_deferred) return (B_FALSE); if (range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) return (B_TRUE); if (rebuild_done) { vdev_rebuild_t *vr = &vd->vdev_top->vdev_rebuild_config; vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; /* Rebuild not initiated by attach */ if (vd->vdev_rebuild_txg == 0) return (B_TRUE); /* * When a rebuild completes without error then all missing data * up to the rebuild max txg has been reconstructed and the DTL * is eligible for excision. */ if (vrp->vrp_rebuild_state == VDEV_REBUILD_COMPLETE && vdev_dtl_max(vd) <= vrp->vrp_max_txg) { ASSERT3U(vrp->vrp_min_txg, <=, vdev_dtl_min(vd)); ASSERT3U(vrp->vrp_min_txg, <, vd->vdev_rebuild_txg); ASSERT3U(vd->vdev_rebuild_txg, <=, vrp->vrp_max_txg); return (B_TRUE); } } else { dsl_scan_t *scn = vd->vdev_spa->spa_dsl_pool->dp_scan; dsl_scan_phys_t *scnp __maybe_unused = &scn->scn_phys; /* Resilver not initiated by attach */ if (vd->vdev_resilver_txg == 0) return (B_TRUE); /* * When a resilver is initiated the scan will assign the * scn_max_txg value to the highest txg value that exists * in all DTLs. If this device's max DTL is not part of this * scan (i.e. it is not in the range (scn_min_txg, scn_max_txg] * then it is not eligible for excision. */ if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) { ASSERT3U(scnp->scn_min_txg, <=, vdev_dtl_min(vd)); ASSERT3U(scnp->scn_min_txg, <, vd->vdev_resilver_txg); ASSERT3U(vd->vdev_resilver_txg, <=, scnp->scn_max_txg); return (B_TRUE); } } return (B_FALSE); } /* * Reassess DTLs after a config change or scrub completion. If txg == 0 no * write operations will be issued to the pool. */ void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, boolean_t scrub_done, boolean_t rebuild_done) { spa_t *spa = vd->vdev_spa; avl_tree_t reftree; int minref; ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); for (int c = 0; c < vd->vdev_children; c++) vdev_dtl_reassess(vd->vdev_child[c], txg, scrub_txg, scrub_done, rebuild_done); if (vd == spa->spa_root_vdev || !vdev_is_concrete(vd) || vd->vdev_aux) return; if (vd->vdev_ops->vdev_op_leaf) { dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; vdev_rebuild_t *vr = &vd->vdev_top->vdev_rebuild_config; boolean_t check_excise = B_FALSE; boolean_t wasempty = B_TRUE; mutex_enter(&vd->vdev_dtl_lock); /* * If requested, pretend the scan or rebuild completed cleanly. */ if (zfs_scan_ignore_errors) { if (scn != NULL) scn->scn_phys.scn_errors = 0; if (vr != NULL) vr->vr_rebuild_phys.vrp_errors = 0; } if (scrub_txg != 0 && !range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) { wasempty = B_FALSE; zfs_dbgmsg("guid:%llu txg:%llu scrub:%llu started:%d " "dtl:%llu/%llu errors:%llu", (u_longlong_t)vd->vdev_guid, (u_longlong_t)txg, (u_longlong_t)scrub_txg, spa->spa_scrub_started, (u_longlong_t)vdev_dtl_min(vd), (u_longlong_t)vdev_dtl_max(vd), (u_longlong_t)(scn ? scn->scn_phys.scn_errors : 0)); } /* * If we've completed a scrub/resilver or a rebuild cleanly * then determine if this vdev should remove any DTLs. We * only want to excise regions on vdevs that were available * during the entire duration of this scan. */ if (rebuild_done && vr != NULL && vr->vr_rebuild_phys.vrp_errors == 0) { check_excise = B_TRUE; } else { if (spa->spa_scrub_started || (scn != NULL && scn->scn_phys.scn_errors == 0)) { check_excise = B_TRUE; } } if (scrub_txg && check_excise && vdev_dtl_should_excise(vd, rebuild_done)) { /* * We completed a scrub, resilver or rebuild up to * scrub_txg. If we did it without rebooting, then * the scrub dtl will be valid, so excise the old * region and fold in the scrub dtl. Otherwise, * leave the dtl as-is if there was an error. * * There's little trick here: to excise the beginning * of the DTL_MISSING map, we put it into a reference * tree and then add a segment with refcnt -1 that * covers the range [0, scrub_txg). This means * that each txg in that range has refcnt -1 or 0. * We then add DTL_SCRUB with a refcnt of 2, so that * entries in the range [0, scrub_txg) will have a * positive refcnt -- either 1 or 2. We then convert * the reference tree into the new DTL_MISSING map. */ space_reftree_create(&reftree); space_reftree_add_map(&reftree, vd->vdev_dtl[DTL_MISSING], 1); space_reftree_add_seg(&reftree, 0, scrub_txg, -1); space_reftree_add_map(&reftree, vd->vdev_dtl[DTL_SCRUB], 2); space_reftree_generate_map(&reftree, vd->vdev_dtl[DTL_MISSING], 1); space_reftree_destroy(&reftree); if (!range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) { zfs_dbgmsg("update DTL_MISSING:%llu/%llu", (u_longlong_t)vdev_dtl_min(vd), (u_longlong_t)vdev_dtl_max(vd)); } else if (!wasempty) { zfs_dbgmsg("DTL_MISSING is now empty"); } } range_tree_vacate(vd->vdev_dtl[DTL_PARTIAL], NULL, NULL); range_tree_walk(vd->vdev_dtl[DTL_MISSING], range_tree_add, vd->vdev_dtl[DTL_PARTIAL]); if (scrub_done) range_tree_vacate(vd->vdev_dtl[DTL_SCRUB], NULL, NULL); range_tree_vacate(vd->vdev_dtl[DTL_OUTAGE], NULL, NULL); if (!vdev_readable(vd)) range_tree_add(vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL); else range_tree_walk(vd->vdev_dtl[DTL_MISSING], range_tree_add, vd->vdev_dtl[DTL_OUTAGE]); /* * If the vdev was resilvering or rebuilding and no longer * has any DTLs then reset the appropriate flag and dirty * the top level so that we persist the change. */ if (txg != 0 && range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) && range_tree_is_empty(vd->vdev_dtl[DTL_OUTAGE])) { if (vd->vdev_rebuild_txg != 0) { vd->vdev_rebuild_txg = 0; vdev_config_dirty(vd->vdev_top); } else if (vd->vdev_resilver_txg != 0) { vd->vdev_resilver_txg = 0; vdev_config_dirty(vd->vdev_top); } } mutex_exit(&vd->vdev_dtl_lock); if (txg != 0) vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg); } else { mutex_enter(&vd->vdev_dtl_lock); for (int t = 0; t < DTL_TYPES; t++) { /* account for child's outage in parent's missing map */ int s = (t == DTL_MISSING) ? DTL_OUTAGE: t; if (t == DTL_SCRUB) { /* leaf vdevs only */ continue; } if (t == DTL_PARTIAL) { /* i.e. non-zero */ minref = 1; } else if (vdev_get_nparity(vd) != 0) { /* RAIDZ, DRAID */ minref = vdev_get_nparity(vd) + 1; } else { /* any kind of mirror */ minref = vd->vdev_children; } space_reftree_create(&reftree); for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; mutex_enter(&cvd->vdev_dtl_lock); space_reftree_add_map(&reftree, cvd->vdev_dtl[s], 1); mutex_exit(&cvd->vdev_dtl_lock); } space_reftree_generate_map(&reftree, vd->vdev_dtl[t], minref); space_reftree_destroy(&reftree); } mutex_exit(&vd->vdev_dtl_lock); } if (vd->vdev_top->vdev_ops == &vdev_raidz_ops) { raidz_dtl_reassessed(vd); } } /* * Iterate over all the vdevs except spare, and post kobj events */ void vdev_post_kobj_evt(vdev_t *vd) { if (vd->vdev_ops->vdev_op_kobj_evt_post && vd->vdev_kobj_flag == B_FALSE) { vd->vdev_kobj_flag = B_TRUE; vd->vdev_ops->vdev_op_kobj_evt_post(vd); } for (int c = 0; c < vd->vdev_children; c++) vdev_post_kobj_evt(vd->vdev_child[c]); } /* * Iterate over all the vdevs except spare, and clear kobj events */ void vdev_clear_kobj_evt(vdev_t *vd) { vd->vdev_kobj_flag = B_FALSE; for (int c = 0; c < vd->vdev_children; c++) vdev_clear_kobj_evt(vd->vdev_child[c]); } int vdev_dtl_load(vdev_t *vd) { spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; range_tree_t *rt; int error = 0; if (vd->vdev_ops->vdev_op_leaf && vd->vdev_dtl_object != 0) { ASSERT(vdev_is_concrete(vd)); /* * If the dtl cannot be sync'd there is no need to open it. */ if (spa->spa_mode == SPA_MODE_READ && !spa->spa_read_spacemaps) return (0); error = space_map_open(&vd->vdev_dtl_sm, mos, vd->vdev_dtl_object, 0, -1ULL, 0); if (error) return (error); ASSERT(vd->vdev_dtl_sm != NULL); rt = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); error = space_map_load(vd->vdev_dtl_sm, rt, SM_ALLOC); if (error == 0) { mutex_enter(&vd->vdev_dtl_lock); range_tree_walk(rt, range_tree_add, vd->vdev_dtl[DTL_MISSING]); mutex_exit(&vd->vdev_dtl_lock); } range_tree_vacate(rt, NULL, NULL); range_tree_destroy(rt); return (error); } for (int c = 0; c < vd->vdev_children; c++) { error = vdev_dtl_load(vd->vdev_child[c]); if (error != 0) break; } return (error); } static void vdev_zap_allocation_data(vdev_t *vd, dmu_tx_t *tx) { spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; vdev_alloc_bias_t alloc_bias = vd->vdev_alloc_bias; const char *string; ASSERT(alloc_bias != VDEV_BIAS_NONE); string = (alloc_bias == VDEV_BIAS_LOG) ? VDEV_ALLOC_BIAS_LOG : (alloc_bias == VDEV_BIAS_SPECIAL) ? VDEV_ALLOC_BIAS_SPECIAL : (alloc_bias == VDEV_BIAS_DEDUP) ? VDEV_ALLOC_BIAS_DEDUP : NULL; ASSERT(string != NULL); VERIFY0(zap_add(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_ALLOCATION_BIAS, 1, strlen(string) + 1, string, tx)); if (alloc_bias == VDEV_BIAS_SPECIAL || alloc_bias == VDEV_BIAS_DEDUP) { spa_activate_allocation_classes(spa, tx); } } void vdev_destroy_unlink_zap(vdev_t *vd, uint64_t zapobj, dmu_tx_t *tx) { spa_t *spa = vd->vdev_spa; VERIFY0(zap_destroy(spa->spa_meta_objset, zapobj, tx)); VERIFY0(zap_remove_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, zapobj, tx)); } uint64_t vdev_create_link_zap(vdev_t *vd, dmu_tx_t *tx) { spa_t *spa = vd->vdev_spa; uint64_t zap = zap_create(spa->spa_meta_objset, DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx); ASSERT(zap != 0); VERIFY0(zap_add_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, zap, tx)); return (zap); } void vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx) { if (vd->vdev_ops != &vdev_hole_ops && vd->vdev_ops != &vdev_missing_ops && vd->vdev_ops != &vdev_root_ops && !vd->vdev_top->vdev_removing) { if (vd->vdev_ops->vdev_op_leaf && vd->vdev_leaf_zap == 0) { vd->vdev_leaf_zap = vdev_create_link_zap(vd, tx); } if (vd == vd->vdev_top && vd->vdev_top_zap == 0) { vd->vdev_top_zap = vdev_create_link_zap(vd, tx); if (vd->vdev_alloc_bias != VDEV_BIAS_NONE) vdev_zap_allocation_data(vd, tx); } } if (vd->vdev_ops == &vdev_root_ops && vd->vdev_root_zap == 0 && spa_feature_is_enabled(vd->vdev_spa, SPA_FEATURE_AVZ_V2)) { if (!spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_AVZ_V2)) spa_feature_incr(vd->vdev_spa, SPA_FEATURE_AVZ_V2, tx); vd->vdev_root_zap = vdev_create_link_zap(vd, tx); } for (uint64_t i = 0; i < vd->vdev_children; i++) { vdev_construct_zaps(vd->vdev_child[i], tx); } } static void vdev_dtl_sync(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; range_tree_t *rt = vd->vdev_dtl[DTL_MISSING]; objset_t *mos = spa->spa_meta_objset; range_tree_t *rtsync; dmu_tx_t *tx; uint64_t object = space_map_object(vd->vdev_dtl_sm); ASSERT(vdev_is_concrete(vd)); ASSERT(vd->vdev_ops->vdev_op_leaf); tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); if (vd->vdev_detached || vd->vdev_top->vdev_removing) { mutex_enter(&vd->vdev_dtl_lock); space_map_free(vd->vdev_dtl_sm, tx); space_map_close(vd->vdev_dtl_sm); vd->vdev_dtl_sm = NULL; mutex_exit(&vd->vdev_dtl_lock); /* * We only destroy the leaf ZAP for detached leaves or for * removed log devices. Removed data devices handle leaf ZAP * cleanup later, once cancellation is no longer possible. */ if (vd->vdev_leaf_zap != 0 && (vd->vdev_detached || vd->vdev_top->vdev_islog)) { vdev_destroy_unlink_zap(vd, vd->vdev_leaf_zap, tx); vd->vdev_leaf_zap = 0; } dmu_tx_commit(tx); return; } if (vd->vdev_dtl_sm == NULL) { uint64_t new_object; new_object = space_map_alloc(mos, zfs_vdev_dtl_sm_blksz, tx); VERIFY3U(new_object, !=, 0); VERIFY0(space_map_open(&vd->vdev_dtl_sm, mos, new_object, 0, -1ULL, 0)); ASSERT(vd->vdev_dtl_sm != NULL); } rtsync = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); mutex_enter(&vd->vdev_dtl_lock); range_tree_walk(rt, range_tree_add, rtsync); mutex_exit(&vd->vdev_dtl_lock); space_map_truncate(vd->vdev_dtl_sm, zfs_vdev_dtl_sm_blksz, tx); space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, SM_NO_VDEVID, tx); range_tree_vacate(rtsync, NULL, NULL); range_tree_destroy(rtsync); /* * If the object for the space map has changed then dirty * the top level so that we update the config. */ if (object != space_map_object(vd->vdev_dtl_sm)) { vdev_dbgmsg(vd, "txg %llu, spa %s, DTL old object %llu, " "new object %llu", (u_longlong_t)txg, spa_name(spa), (u_longlong_t)object, (u_longlong_t)space_map_object(vd->vdev_dtl_sm)); vdev_config_dirty(vd->vdev_top); } dmu_tx_commit(tx); } /* * Determine whether the specified vdev can be offlined/detached/removed * without losing data. */ boolean_t vdev_dtl_required(vdev_t *vd) { spa_t *spa = vd->vdev_spa; vdev_t *tvd = vd->vdev_top; uint8_t cant_read = vd->vdev_cant_read; boolean_t required; ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); if (vd == spa->spa_root_vdev || vd == tvd) return (B_TRUE); /* * Temporarily mark the device as unreadable, and then determine * whether this results in any DTL outages in the top-level vdev. * If not, we can safely offline/detach/remove the device. */ vd->vdev_cant_read = B_TRUE; vdev_dtl_reassess(tvd, 0, 0, B_FALSE, B_FALSE); required = !vdev_dtl_empty(tvd, DTL_OUTAGE); vd->vdev_cant_read = cant_read; vdev_dtl_reassess(tvd, 0, 0, B_FALSE, B_FALSE); if (!required && zio_injection_enabled) { required = !!zio_handle_device_injection(vd, NULL, SET_ERROR(ECHILD)); } return (required); } /* * Determine if resilver is needed, and if so the txg range. */ boolean_t vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp) { boolean_t needed = B_FALSE; uint64_t thismin = UINT64_MAX; uint64_t thismax = 0; if (vd->vdev_children == 0) { mutex_enter(&vd->vdev_dtl_lock); if (!range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) && vdev_writeable(vd)) { thismin = vdev_dtl_min(vd); thismax = vdev_dtl_max(vd); needed = B_TRUE; } mutex_exit(&vd->vdev_dtl_lock); } else { for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; uint64_t cmin, cmax; if (vdev_resilver_needed(cvd, &cmin, &cmax)) { thismin = MIN(thismin, cmin); thismax = MAX(thismax, cmax); needed = B_TRUE; } } } if (needed && minp) { *minp = thismin; *maxp = thismax; } return (needed); } /* * Gets the checkpoint space map object from the vdev's ZAP. On success sm_obj * will contain either the checkpoint spacemap object or zero if none exists. * All other errors are returned to the caller. */ int vdev_checkpoint_sm_object(vdev_t *vd, uint64_t *sm_obj) { ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER)); if (vd->vdev_top_zap == 0) { *sm_obj = 0; return (0); } int error = zap_lookup(spa_meta_objset(vd->vdev_spa), vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1, sm_obj); if (error == ENOENT) { *sm_obj = 0; error = 0; } return (error); } int vdev_load(vdev_t *vd) { int children = vd->vdev_children; int error = 0; taskq_t *tq = NULL; /* * It's only worthwhile to use the taskq for the root vdev, because the * slow part is metaslab_init, and that only happens for top-level * vdevs. */ if (vd->vdev_ops == &vdev_root_ops && vd->vdev_children > 0) { tq = taskq_create("vdev_load", children, minclsyspri, children, children, TASKQ_PREPOPULATE); } /* * Recursively load all children. */ for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; if (tq == NULL || vdev_uses_zvols(cvd)) { cvd->vdev_load_error = vdev_load(cvd); } else { VERIFY(taskq_dispatch(tq, vdev_load_child, cvd, TQ_SLEEP) != TASKQID_INVALID); } } if (tq != NULL) { taskq_wait(tq); taskq_destroy(tq); } for (int c = 0; c < vd->vdev_children; c++) { int error = vd->vdev_child[c]->vdev_load_error; if (error != 0) return (error); } vdev_set_deflate_ratio(vd); if (vd->vdev_ops == &vdev_raidz_ops) { error = vdev_raidz_load(vd); if (error != 0) return (error); } /* * On spa_load path, grab the allocation bias from our zap */ if (vd == vd->vdev_top && vd->vdev_top_zap != 0) { spa_t *spa = vd->vdev_spa; char bias_str[64]; error = zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_ALLOCATION_BIAS, 1, sizeof (bias_str), bias_str); if (error == 0) { ASSERT(vd->vdev_alloc_bias == VDEV_BIAS_NONE); vd->vdev_alloc_bias = vdev_derive_alloc_bias(bias_str); } else if (error != ENOENT) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); vdev_dbgmsg(vd, "vdev_load: zap_lookup(top_zap=%llu) " "failed [error=%d]", (u_longlong_t)vd->vdev_top_zap, error); return (error); } } if (vd == vd->vdev_top && vd->vdev_top_zap != 0) { spa_t *spa = vd->vdev_spa; uint64_t failfast; error = zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap, vdev_prop_to_name(VDEV_PROP_FAILFAST), sizeof (failfast), 1, &failfast); if (error == 0) { vd->vdev_failfast = failfast & 1; } else if (error == ENOENT) { vd->vdev_failfast = vdev_prop_default_numeric( VDEV_PROP_FAILFAST); } else { vdev_dbgmsg(vd, "vdev_load: zap_lookup(top_zap=%llu) " "failed [error=%d]", (u_longlong_t)vd->vdev_top_zap, error); } } /* * Load any rebuild state from the top-level vdev zap. */ if (vd == vd->vdev_top && vd->vdev_top_zap != 0) { error = vdev_rebuild_load(vd); if (error && error != ENOTSUP) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); vdev_dbgmsg(vd, "vdev_load: vdev_rebuild_load " "failed [error=%d]", error); return (error); } } if (vd->vdev_top_zap != 0 || vd->vdev_leaf_zap != 0) { uint64_t zapobj; if (vd->vdev_top_zap != 0) zapobj = vd->vdev_top_zap; else zapobj = vd->vdev_leaf_zap; error = vdev_prop_get_int(vd, VDEV_PROP_CHECKSUM_N, &vd->vdev_checksum_n); if (error && error != ENOENT) vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) " "failed [error=%d]", (u_longlong_t)zapobj, error); error = vdev_prop_get_int(vd, VDEV_PROP_CHECKSUM_T, &vd->vdev_checksum_t); if (error && error != ENOENT) vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) " "failed [error=%d]", (u_longlong_t)zapobj, error); error = vdev_prop_get_int(vd, VDEV_PROP_IO_N, &vd->vdev_io_n); if (error && error != ENOENT) vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) " "failed [error=%d]", (u_longlong_t)zapobj, error); error = vdev_prop_get_int(vd, VDEV_PROP_IO_T, &vd->vdev_io_t); if (error && error != ENOENT) vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) " "failed [error=%d]", (u_longlong_t)zapobj, error); + + error = vdev_prop_get_int(vd, VDEV_PROP_SLOW_IO_N, + &vd->vdev_slow_io_n); + if (error && error != ENOENT) + vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) " + "failed [error=%d]", (u_longlong_t)zapobj, error); + + error = vdev_prop_get_int(vd, VDEV_PROP_SLOW_IO_T, + &vd->vdev_slow_io_t); + if (error && error != ENOENT) + vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) " + "failed [error=%d]", (u_longlong_t)zapobj, error); } /* * If this is a top-level vdev, initialize its metaslabs. */ if (vd == vd->vdev_top && vdev_is_concrete(vd)) { vdev_metaslab_group_create(vd); if (vd->vdev_ashift == 0 || vd->vdev_asize == 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); vdev_dbgmsg(vd, "vdev_load: invalid size. ashift=%llu, " "asize=%llu", (u_longlong_t)vd->vdev_ashift, (u_longlong_t)vd->vdev_asize); return (SET_ERROR(ENXIO)); } error = vdev_metaslab_init(vd, 0); if (error != 0) { vdev_dbgmsg(vd, "vdev_load: metaslab_init failed " "[error=%d]", error); vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); return (error); } uint64_t checkpoint_sm_obj; error = vdev_checkpoint_sm_object(vd, &checkpoint_sm_obj); if (error == 0 && checkpoint_sm_obj != 0) { objset_t *mos = spa_meta_objset(vd->vdev_spa); ASSERT(vd->vdev_asize != 0); ASSERT3P(vd->vdev_checkpoint_sm, ==, NULL); error = space_map_open(&vd->vdev_checkpoint_sm, mos, checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift); if (error != 0) { vdev_dbgmsg(vd, "vdev_load: space_map_open " "failed for checkpoint spacemap (obj %llu) " "[error=%d]", (u_longlong_t)checkpoint_sm_obj, error); return (error); } ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); /* * Since the checkpoint_sm contains free entries * exclusively we can use space_map_allocated() to * indicate the cumulative checkpointed space that * has been freed. */ vd->vdev_stat.vs_checkpoint_space = -space_map_allocated(vd->vdev_checkpoint_sm); vd->vdev_spa->spa_checkpoint_info.sci_dspace += vd->vdev_stat.vs_checkpoint_space; } else if (error != 0) { vdev_dbgmsg(vd, "vdev_load: failed to retrieve " "checkpoint space map object from vdev ZAP " "[error=%d]", error); return (error); } } /* * If this is a leaf vdev, load its DTL. */ if (vd->vdev_ops->vdev_op_leaf && (error = vdev_dtl_load(vd)) != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); vdev_dbgmsg(vd, "vdev_load: vdev_dtl_load failed " "[error=%d]", error); return (error); } uint64_t obsolete_sm_object; error = vdev_obsolete_sm_object(vd, &obsolete_sm_object); if (error == 0 && obsolete_sm_object != 0) { objset_t *mos = vd->vdev_spa->spa_meta_objset; ASSERT(vd->vdev_asize != 0); ASSERT3P(vd->vdev_obsolete_sm, ==, NULL); if ((error = space_map_open(&vd->vdev_obsolete_sm, mos, obsolete_sm_object, 0, vd->vdev_asize, 0))) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); vdev_dbgmsg(vd, "vdev_load: space_map_open failed for " "obsolete spacemap (obj %llu) [error=%d]", (u_longlong_t)obsolete_sm_object, error); return (error); } } else if (error != 0) { vdev_dbgmsg(vd, "vdev_load: failed to retrieve obsolete " "space map object from vdev ZAP [error=%d]", error); return (error); } return (0); } /* * The special vdev case is used for hot spares and l2cache devices. Its * sole purpose it to set the vdev state for the associated vdev. To do this, * we make sure that we can open the underlying device, then try to read the * label, and make sure that the label is sane and that it hasn't been * repurposed to another pool. */ int vdev_validate_aux(vdev_t *vd) { nvlist_t *label; uint64_t guid, version; uint64_t state; if (!vdev_readable(vd)) return (0); if ((label = vdev_label_read_config(vd, -1ULL)) == NULL) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); return (-1); } if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 || !SPA_VERSION_IS_SUPPORTED(version) || nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 || guid != vd->vdev_guid || nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); return (-1); } /* * We don't actually check the pool state here. If it's in fact in * use by another pool, we update this fact on the fly when requested. */ nvlist_free(label); return (0); } static void vdev_destroy_ms_flush_data(vdev_t *vd, dmu_tx_t *tx) { objset_t *mos = spa_meta_objset(vd->vdev_spa); if (vd->vdev_top_zap == 0) return; uint64_t object = 0; int err = zap_lookup(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1, &object); if (err == ENOENT) return; VERIFY0(err); VERIFY0(dmu_object_free(mos, object, tx)); VERIFY0(zap_remove(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, tx)); } /* * Free the objects used to store this vdev's spacemaps, and the array * that points to them. */ void vdev_destroy_spacemaps(vdev_t *vd, dmu_tx_t *tx) { if (vd->vdev_ms_array == 0) return; objset_t *mos = vd->vdev_spa->spa_meta_objset; uint64_t array_count = vd->vdev_asize >> vd->vdev_ms_shift; size_t array_bytes = array_count * sizeof (uint64_t); uint64_t *smobj_array = kmem_alloc(array_bytes, KM_SLEEP); VERIFY0(dmu_read(mos, vd->vdev_ms_array, 0, array_bytes, smobj_array, 0)); for (uint64_t i = 0; i < array_count; i++) { uint64_t smobj = smobj_array[i]; if (smobj == 0) continue; space_map_free_obj(mos, smobj, tx); } kmem_free(smobj_array, array_bytes); VERIFY0(dmu_object_free(mos, vd->vdev_ms_array, tx)); vdev_destroy_ms_flush_data(vd, tx); vd->vdev_ms_array = 0; } static void vdev_remove_empty_log(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; ASSERT(vd->vdev_islog); ASSERT(vd == vd->vdev_top); ASSERT3U(txg, ==, spa_syncing_txg(spa)); dmu_tx_t *tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); vdev_destroy_spacemaps(vd, tx); if (vd->vdev_top_zap != 0) { vdev_destroy_unlink_zap(vd, vd->vdev_top_zap, tx); vd->vdev_top_zap = 0; } dmu_tx_commit(tx); } void vdev_sync_done(vdev_t *vd, uint64_t txg) { metaslab_t *msp; boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg)); ASSERT(vdev_is_concrete(vd)); while ((msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg))) != NULL) metaslab_sync_done(msp, txg); if (reassess) { metaslab_sync_reassess(vd->vdev_mg); if (vd->vdev_log_mg != NULL) metaslab_sync_reassess(vd->vdev_log_mg); } } void vdev_sync(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; vdev_t *lvd; metaslab_t *msp; ASSERT3U(txg, ==, spa->spa_syncing_txg); dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); if (range_tree_space(vd->vdev_obsolete_segments) > 0) { ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops); vdev_indirect_sync_obsolete(vd, tx); /* * If the vdev is indirect, it can't have dirty * metaslabs or DTLs. */ if (vd->vdev_ops == &vdev_indirect_ops) { ASSERT(txg_list_empty(&vd->vdev_ms_list, txg)); ASSERT(txg_list_empty(&vd->vdev_dtl_list, txg)); dmu_tx_commit(tx); return; } } ASSERT(vdev_is_concrete(vd)); if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0 && !vd->vdev_removing) { ASSERT(vd == vd->vdev_top); ASSERT0(vd->vdev_indirect_config.vic_mapping_object); vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset, DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx); ASSERT(vd->vdev_ms_array != 0); vdev_config_dirty(vd); } while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) { metaslab_sync(msp, txg); (void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg)); } while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL) vdev_dtl_sync(lvd, txg); /* * If this is an empty log device being removed, destroy the * metadata associated with it. */ if (vd->vdev_islog && vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing) vdev_remove_empty_log(vd, txg); (void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg)); dmu_tx_commit(tx); } /* * Return the amount of space that should be (or was) allocated for the given * psize (compressed block size) in the given TXG. Note that for expanded * RAIDZ vdevs, the size allocated for older BP's may be larger. See * vdev_raidz_asize(). */ uint64_t vdev_psize_to_asize_txg(vdev_t *vd, uint64_t psize, uint64_t txg) { return (vd->vdev_ops->vdev_op_asize(vd, psize, txg)); } uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize) { return (vdev_psize_to_asize_txg(vd, psize, 0)); } /* * Mark the given vdev faulted. A faulted vdev behaves as if the device could * not be opened, and no I/O is attempted. */ int vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux) { vdev_t *vd, *tvd; spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP))); tvd = vd->vdev_top; /* * If user did a 'zpool offline -f' then make the fault persist across * reboots. */ if (aux == VDEV_AUX_EXTERNAL_PERSIST) { /* * There are two kinds of forced faults: temporary and * persistent. Temporary faults go away at pool import, while * persistent faults stay set. Both types of faults can be * cleared with a zpool clear. * * We tell if a vdev is persistently faulted by looking at the * ZPOOL_CONFIG_AUX_STATE nvpair. If it's set to "external" at * import then it's a persistent fault. Otherwise, it's * temporary. We get ZPOOL_CONFIG_AUX_STATE set to "external" * by setting vd.vdev_stat.vs_aux to VDEV_AUX_EXTERNAL. This * tells vdev_config_generate() (which gets run later) to set * ZPOOL_CONFIG_AUX_STATE to "external" in the nvlist. */ vd->vdev_stat.vs_aux = VDEV_AUX_EXTERNAL; vd->vdev_tmpoffline = B_FALSE; aux = VDEV_AUX_EXTERNAL; } else { vd->vdev_tmpoffline = B_TRUE; } /* * We don't directly use the aux state here, but if we do a * vdev_reopen(), we need this value to be present to remember why we * were faulted. */ vd->vdev_label_aux = aux; /* * Faulted state takes precedence over degraded. */ vd->vdev_delayed_close = B_FALSE; vd->vdev_faulted = 1ULL; vd->vdev_degraded = 0ULL; vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux); /* * If this device has the only valid copy of the data, then * back off and simply mark the vdev as degraded instead. */ if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) { vd->vdev_degraded = 1ULL; vd->vdev_faulted = 0ULL; /* * If we reopen the device and it's not dead, only then do we * mark it degraded. */ vdev_reopen(tvd); if (vdev_readable(vd)) vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux); } return (spa_vdev_state_exit(spa, vd, 0)); } /* * Mark the given vdev degraded. A degraded vdev is purely an indication to the * user that something is wrong. The vdev continues to operate as normal as far * as I/O is concerned. */ int vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux) { vdev_t *vd; spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP))); /* * If the vdev is already faulted, then don't do anything. */ if (vd->vdev_faulted || vd->vdev_degraded) return (spa_vdev_state_exit(spa, NULL, 0)); vd->vdev_degraded = 1ULL; if (!vdev_is_dead(vd)) vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux); return (spa_vdev_state_exit(spa, vd, 0)); } int vdev_remove_wanted(spa_t *spa, uint64_t guid) { vdev_t *vd; spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); /* * If the vdev is already removed, or expanding which can trigger * repartition add/remove events, then don't do anything. */ if (vd->vdev_removed || vd->vdev_expanding) return (spa_vdev_state_exit(spa, NULL, 0)); /* * Confirm the vdev has been removed, otherwise don't do anything. */ if (vd->vdev_ops->vdev_op_leaf && !zio_wait(vdev_probe(vd, NULL))) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(EEXIST))); vd->vdev_remove_wanted = B_TRUE; spa_async_request(spa, SPA_ASYNC_REMOVE); return (spa_vdev_state_exit(spa, vd, 0)); } /* * Online the given vdev. * * If 'ZFS_ONLINE_UNSPARE' is set, it implies two things. First, any attached * spare device should be detached when the device finishes resilvering. * Second, the online should be treated like a 'test' online case, so no FMA * events are generated if the device fails to open. */ int vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) { vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev; boolean_t wasoffline; vdev_state_t oldstate; spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); wasoffline = (vd->vdev_offline || vd->vdev_tmpoffline); oldstate = vd->vdev_state; tvd = vd->vdev_top; vd->vdev_offline = B_FALSE; vd->vdev_tmpoffline = B_FALSE; vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE); vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT); /* XXX - L2ARC 1.0 does not support expansion */ if (!vd->vdev_aux) { for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) pvd->vdev_expanding = !!((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand); vd->vdev_expansion_time = gethrestime_sec(); } vdev_reopen(tvd); vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE; if (!vd->vdev_aux) { for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) pvd->vdev_expanding = B_FALSE; } if (newstate) *newstate = vd->vdev_state; if ((flags & ZFS_ONLINE_UNSPARE) && !vdev_is_dead(vd) && vd->vdev_parent && vd->vdev_parent->vdev_ops == &vdev_spare_ops && vd->vdev_parent->vdev_child[0] == vd) vd->vdev_unspare = B_TRUE; if ((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand) { /* XXX - L2ARC 1.0 does not support expansion */ if (vd->vdev_aux) return (spa_vdev_state_exit(spa, vd, ENOTSUP)); spa->spa_ccw_fail_time = 0; spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); } /* Restart initializing if necessary */ mutex_enter(&vd->vdev_initialize_lock); if (vdev_writeable(vd) && vd->vdev_initialize_thread == NULL && vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) { (void) vdev_initialize(vd); } mutex_exit(&vd->vdev_initialize_lock); /* * Restart trimming if necessary. We do not restart trimming for cache * devices here. This is triggered by l2arc_rebuild_vdev() * asynchronously for the whole device or in l2arc_evict() as it evicts * space for upcoming writes. */ mutex_enter(&vd->vdev_trim_lock); if (vdev_writeable(vd) && !vd->vdev_isl2cache && vd->vdev_trim_thread == NULL && vd->vdev_trim_state == VDEV_TRIM_ACTIVE) { (void) vdev_trim(vd, vd->vdev_trim_rate, vd->vdev_trim_partial, vd->vdev_trim_secure); } mutex_exit(&vd->vdev_trim_lock); if (wasoffline || (oldstate < VDEV_STATE_DEGRADED && vd->vdev_state >= VDEV_STATE_DEGRADED)) { spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_ONLINE); /* * Asynchronously detach spare vdev if resilver or * rebuild is not required */ if (vd->vdev_unspare && !dsl_scan_resilvering(spa->spa_dsl_pool) && !dsl_scan_resilver_scheduled(spa->spa_dsl_pool) && !vdev_rebuild_active(tvd)) spa_async_request(spa, SPA_ASYNC_DETACH_SPARE); } return (spa_vdev_state_exit(spa, vd, 0)); } static int vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags) { vdev_t *vd, *tvd; int error = 0; uint64_t generation; metaslab_group_t *mg; top: spa_vdev_state_enter(spa, SCL_ALLOC); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP))); if (vd->vdev_ops == &vdev_draid_spare_ops) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); tvd = vd->vdev_top; mg = tvd->vdev_mg; generation = spa->spa_config_generation + 1; /* * If the device isn't already offline, try to offline it. */ if (!vd->vdev_offline) { /* * If this device has the only valid copy of some data, * don't allow it to be offlined. Log devices are always * expendable. */ if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(EBUSY))); /* * If the top-level is a slog and it has had allocations * then proceed. We check that the vdev's metaslab group * is not NULL since it's possible that we may have just * added this vdev but not yet initialized its metaslabs. */ if (tvd->vdev_islog && mg != NULL) { /* * Prevent any future allocations. */ ASSERT3P(tvd->vdev_log_mg, ==, NULL); metaslab_group_passivate(mg); (void) spa_vdev_state_exit(spa, vd, 0); error = spa_reset_logs(spa); /* * If the log device was successfully reset but has * checkpointed data, do not offline it. */ if (error == 0 && tvd->vdev_checkpoint_sm != NULL) { ASSERT3U(space_map_allocated( tvd->vdev_checkpoint_sm), !=, 0); error = ZFS_ERR_CHECKPOINT_EXISTS; } spa_vdev_state_enter(spa, SCL_ALLOC); /* * Check to see if the config has changed. */ if (error || generation != spa->spa_config_generation) { metaslab_group_activate(mg); if (error) return (spa_vdev_state_exit(spa, vd, error)); (void) spa_vdev_state_exit(spa, vd, 0); goto top; } ASSERT0(tvd->vdev_stat.vs_alloc); } /* * Offline this device and reopen its top-level vdev. * If the top-level vdev is a log device then just offline * it. Otherwise, if this action results in the top-level * vdev becoming unusable, undo it and fail the request. */ vd->vdev_offline = B_TRUE; vdev_reopen(tvd); if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_is_dead(tvd)) { vd->vdev_offline = B_FALSE; vdev_reopen(tvd); return (spa_vdev_state_exit(spa, NULL, SET_ERROR(EBUSY))); } /* * Add the device back into the metaslab rotor so that * once we online the device it's open for business. */ if (tvd->vdev_islog && mg != NULL) metaslab_group_activate(mg); } vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY); return (spa_vdev_state_exit(spa, vd, 0)); } int vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) { int error; mutex_enter(&spa->spa_vdev_top_lock); error = vdev_offline_locked(spa, guid, flags); mutex_exit(&spa->spa_vdev_top_lock); return (error); } /* * Clear the error counts associated with this vdev. Unlike vdev_online() and * vdev_offline(), we assume the spa config is locked. We also clear all * children. If 'vd' is NULL, then the user wants to clear all vdevs. */ void vdev_clear(spa_t *spa, vdev_t *vd) { vdev_t *rvd = spa->spa_root_vdev; ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); if (vd == NULL) vd = rvd; vd->vdev_stat.vs_read_errors = 0; vd->vdev_stat.vs_write_errors = 0; vd->vdev_stat.vs_checksum_errors = 0; vd->vdev_stat.vs_slow_ios = 0; for (int c = 0; c < vd->vdev_children; c++) vdev_clear(spa, vd->vdev_child[c]); /* * It makes no sense to "clear" an indirect or removed vdev. */ if (!vdev_is_concrete(vd) || vd->vdev_removed) return; /* * If we're in the FAULTED state or have experienced failed I/O, then * clear the persistent state and attempt to reopen the device. We * also mark the vdev config dirty, so that the new faulted state is * written out to disk. */ if (vd->vdev_faulted || vd->vdev_degraded || !vdev_readable(vd) || !vdev_writeable(vd)) { /* * When reopening in response to a clear event, it may be due to * a fmadm repair request. In this case, if the device is * still broken, we want to still post the ereport again. */ vd->vdev_forcefault = B_TRUE; vd->vdev_faulted = vd->vdev_degraded = 0ULL; vd->vdev_cant_read = B_FALSE; vd->vdev_cant_write = B_FALSE; vd->vdev_stat.vs_aux = 0; vdev_reopen(vd == rvd ? rvd : vd->vdev_top); vd->vdev_forcefault = B_FALSE; if (vd != rvd && vdev_writeable(vd->vdev_top)) vdev_state_dirty(vd->vdev_top); /* If a resilver isn't required, check if vdevs can be culled */ if (vd->vdev_aux == NULL && !vdev_is_dead(vd) && !dsl_scan_resilvering(spa->spa_dsl_pool) && !dsl_scan_resilver_scheduled(spa->spa_dsl_pool)) spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_CLEAR); } /* * When clearing a FMA-diagnosed fault, we always want to * unspare the device, as we assume that the original spare was * done in response to the FMA fault. */ if (!vdev_is_dead(vd) && vd->vdev_parent != NULL && vd->vdev_parent->vdev_ops == &vdev_spare_ops && vd->vdev_parent->vdev_child[0] == vd) vd->vdev_unspare = B_TRUE; /* Clear recent error events cache (i.e. duplicate events tracking) */ zfs_ereport_clear(spa, vd); } boolean_t vdev_is_dead(vdev_t *vd) { /* * Holes and missing devices are always considered "dead". * This simplifies the code since we don't have to check for * these types of devices in the various code paths. * Instead we rely on the fact that we skip over dead devices * before issuing I/O to them. */ return (vd->vdev_state < VDEV_STATE_DEGRADED || vd->vdev_ops == &vdev_hole_ops || vd->vdev_ops == &vdev_missing_ops); } boolean_t vdev_readable(vdev_t *vd) { return (!vdev_is_dead(vd) && !vd->vdev_cant_read); } boolean_t vdev_writeable(vdev_t *vd) { return (!vdev_is_dead(vd) && !vd->vdev_cant_write && vdev_is_concrete(vd)); } boolean_t vdev_allocatable(vdev_t *vd) { uint64_t state = vd->vdev_state; /* * We currently allow allocations from vdevs which may be in the * process of reopening (i.e. VDEV_STATE_CLOSED). If the device * fails to reopen then we'll catch it later when we're holding * the proper locks. Note that we have to get the vdev state * in a local variable because although it changes atomically, * we're asking two separate questions about it. */ return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) && !vd->vdev_cant_write && vdev_is_concrete(vd) && vd->vdev_mg->mg_initialized); } boolean_t vdev_accessible(vdev_t *vd, zio_t *zio) { ASSERT(zio->io_vd == vd); if (vdev_is_dead(vd) || vd->vdev_remove_wanted) return (B_FALSE); if (zio->io_type == ZIO_TYPE_READ) return (!vd->vdev_cant_read); if (zio->io_type == ZIO_TYPE_WRITE) return (!vd->vdev_cant_write); return (B_TRUE); } static void vdev_get_child_stat(vdev_t *cvd, vdev_stat_t *vs, vdev_stat_t *cvs) { /* * Exclude the dRAID spare when aggregating to avoid double counting * the ops and bytes. These IOs are counted by the physical leaves. */ if (cvd->vdev_ops == &vdev_draid_spare_ops) return; for (int t = 0; t < VS_ZIO_TYPES; t++) { vs->vs_ops[t] += cvs->vs_ops[t]; vs->vs_bytes[t] += cvs->vs_bytes[t]; } cvs->vs_scan_removing = cvd->vdev_removing; } /* * Get extended stats */ static void vdev_get_child_stat_ex(vdev_t *cvd, vdev_stat_ex_t *vsx, vdev_stat_ex_t *cvsx) { (void) cvd; int t, b; for (t = 0; t < ZIO_TYPES; t++) { for (b = 0; b < ARRAY_SIZE(vsx->vsx_disk_histo[0]); b++) vsx->vsx_disk_histo[t][b] += cvsx->vsx_disk_histo[t][b]; for (b = 0; b < ARRAY_SIZE(vsx->vsx_total_histo[0]); b++) { vsx->vsx_total_histo[t][b] += cvsx->vsx_total_histo[t][b]; } } for (t = 0; t < ZIO_PRIORITY_NUM_QUEUEABLE; t++) { for (b = 0; b < ARRAY_SIZE(vsx->vsx_queue_histo[0]); b++) { vsx->vsx_queue_histo[t][b] += cvsx->vsx_queue_histo[t][b]; } vsx->vsx_active_queue[t] += cvsx->vsx_active_queue[t]; vsx->vsx_pend_queue[t] += cvsx->vsx_pend_queue[t]; for (b = 0; b < ARRAY_SIZE(vsx->vsx_ind_histo[0]); b++) vsx->vsx_ind_histo[t][b] += cvsx->vsx_ind_histo[t][b]; for (b = 0; b < ARRAY_SIZE(vsx->vsx_agg_histo[0]); b++) vsx->vsx_agg_histo[t][b] += cvsx->vsx_agg_histo[t][b]; } } boolean_t vdev_is_spacemap_addressable(vdev_t *vd) { if (spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_SPACEMAP_V2)) return (B_TRUE); /* * If double-word space map entries are not enabled we assume * 47 bits of the space map entry are dedicated to the entry's * offset (see SM_OFFSET_BITS in space_map.h). We then use that * to calculate the maximum address that can be described by a * space map entry for the given device. */ uint64_t shift = vd->vdev_ashift + SM_OFFSET_BITS; if (shift >= 63) /* detect potential overflow */ return (B_TRUE); return (vd->vdev_asize < (1ULL << shift)); } /* * Get statistics for the given vdev. */ static void vdev_get_stats_ex_impl(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) { int t; /* * If we're getting stats on the root vdev, aggregate the I/O counts * over all top-level vdevs (i.e. the direct children of the root). */ if (!vd->vdev_ops->vdev_op_leaf) { if (vs) { memset(vs->vs_ops, 0, sizeof (vs->vs_ops)); memset(vs->vs_bytes, 0, sizeof (vs->vs_bytes)); } if (vsx) memset(vsx, 0, sizeof (*vsx)); for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; vdev_stat_t *cvs = &cvd->vdev_stat; vdev_stat_ex_t *cvsx = &cvd->vdev_stat_ex; vdev_get_stats_ex_impl(cvd, cvs, cvsx); if (vs) vdev_get_child_stat(cvd, vs, cvs); if (vsx) vdev_get_child_stat_ex(cvd, vsx, cvsx); } } else { /* * We're a leaf. Just copy our ZIO active queue stats in. The * other leaf stats are updated in vdev_stat_update(). */ if (!vsx) return; memcpy(vsx, &vd->vdev_stat_ex, sizeof (vd->vdev_stat_ex)); for (t = 0; t < ZIO_PRIORITY_NUM_QUEUEABLE; t++) { vsx->vsx_active_queue[t] = vd->vdev_queue.vq_cactive[t]; vsx->vsx_pend_queue[t] = vdev_queue_class_length(vd, t); } } } void vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) { vdev_t *tvd = vd->vdev_top; mutex_enter(&vd->vdev_stat_lock); if (vs) { memcpy(vs, &vd->vdev_stat, sizeof (*vs)); vs->vs_timestamp = gethrtime() - vs->vs_timestamp; vs->vs_state = vd->vdev_state; vs->vs_rsize = vdev_get_min_asize(vd); if (vd->vdev_ops->vdev_op_leaf) { vs->vs_pspace = vd->vdev_psize; vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; /* * Report initializing progress. Since we don't * have the initializing locks held, this is only * an estimate (although a fairly accurate one). */ vs->vs_initialize_bytes_done = vd->vdev_initialize_bytes_done; vs->vs_initialize_bytes_est = vd->vdev_initialize_bytes_est; vs->vs_initialize_state = vd->vdev_initialize_state; vs->vs_initialize_action_time = vd->vdev_initialize_action_time; /* * Report manual TRIM progress. Since we don't have * the manual TRIM locks held, this is only an * estimate (although fairly accurate one). */ vs->vs_trim_notsup = !vd->vdev_has_trim; vs->vs_trim_bytes_done = vd->vdev_trim_bytes_done; vs->vs_trim_bytes_est = vd->vdev_trim_bytes_est; vs->vs_trim_state = vd->vdev_trim_state; vs->vs_trim_action_time = vd->vdev_trim_action_time; /* Set when there is a deferred resilver. */ vs->vs_resilver_deferred = vd->vdev_resilver_deferred; } /* * Report expandable space on top-level, non-auxiliary devices * only. The expandable space is reported in terms of metaslab * sized units since that determines how much space the pool * can expand. */ if (vd->vdev_aux == NULL && tvd != NULL) { vs->vs_esize = P2ALIGN( vd->vdev_max_asize - vd->vdev_asize, 1ULL << tvd->vdev_ms_shift); } vs->vs_configured_ashift = vd->vdev_top != NULL ? vd->vdev_top->vdev_ashift : vd->vdev_ashift; vs->vs_logical_ashift = vd->vdev_logical_ashift; if (vd->vdev_physical_ashift <= ASHIFT_MAX) vs->vs_physical_ashift = vd->vdev_physical_ashift; else vs->vs_physical_ashift = 0; /* * Report fragmentation and rebuild progress for top-level, * non-auxiliary, concrete devices. */ if (vd->vdev_aux == NULL && vd == vd->vdev_top && vdev_is_concrete(vd)) { /* * The vdev fragmentation rating doesn't take into * account the embedded slog metaslab (vdev_log_mg). * Since it's only one metaslab, it would have a tiny * impact on the overall fragmentation. */ vs->vs_fragmentation = (vd->vdev_mg != NULL) ? vd->vdev_mg->mg_fragmentation : 0; } vs->vs_noalloc = MAX(vd->vdev_noalloc, tvd ? tvd->vdev_noalloc : 0); } vdev_get_stats_ex_impl(vd, vs, vsx); mutex_exit(&vd->vdev_stat_lock); } void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs) { return (vdev_get_stats_ex(vd, vs, NULL)); } void vdev_clear_stats(vdev_t *vd) { mutex_enter(&vd->vdev_stat_lock); vd->vdev_stat.vs_space = 0; vd->vdev_stat.vs_dspace = 0; vd->vdev_stat.vs_alloc = 0; mutex_exit(&vd->vdev_stat_lock); } void vdev_scan_stat_init(vdev_t *vd) { vdev_stat_t *vs = &vd->vdev_stat; for (int c = 0; c < vd->vdev_children; c++) vdev_scan_stat_init(vd->vdev_child[c]); mutex_enter(&vd->vdev_stat_lock); vs->vs_scan_processed = 0; mutex_exit(&vd->vdev_stat_lock); } void vdev_stat_update(zio_t *zio, uint64_t psize) { spa_t *spa = zio->io_spa; vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd = zio->io_vd ? zio->io_vd : rvd; vdev_t *pvd; uint64_t txg = zio->io_txg; /* Suppress ASAN false positive */ #ifdef __SANITIZE_ADDRESS__ vdev_stat_t *vs = vd ? &vd->vdev_stat : NULL; vdev_stat_ex_t *vsx = vd ? &vd->vdev_stat_ex : NULL; #else vdev_stat_t *vs = &vd->vdev_stat; vdev_stat_ex_t *vsx = &vd->vdev_stat_ex; #endif zio_type_t type = zio->io_type; int flags = zio->io_flags; /* * If this i/o is a gang leader, it didn't do any actual work. */ if (zio->io_gang_tree) return; if (zio->io_error == 0) { /* * If this is a root i/o, don't count it -- we've already * counted the top-level vdevs, and vdev_get_stats() will * aggregate them when asked. This reduces contention on * the root vdev_stat_lock and implicitly handles blocks * that compress away to holes, for which there is no i/o. * (Holes never create vdev children, so all the counters * remain zero, which is what we want.) * * Note: this only applies to successful i/o (io_error == 0) * because unlike i/o counts, errors are not additive. * When reading a ditto block, for example, failure of * one top-level vdev does not imply a root-level error. */ if (vd == rvd) return; ASSERT(vd == zio->io_vd); if (flags & ZIO_FLAG_IO_BYPASS) return; mutex_enter(&vd->vdev_stat_lock); if (flags & ZIO_FLAG_IO_REPAIR) { /* * Repair is the result of a resilver issued by the * scan thread (spa_sync). */ if (flags & ZIO_FLAG_SCAN_THREAD) { dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; dsl_scan_phys_t *scn_phys = &scn->scn_phys; uint64_t *processed = &scn_phys->scn_processed; if (vd->vdev_ops->vdev_op_leaf) atomic_add_64(processed, psize); vs->vs_scan_processed += psize; } /* * Repair is the result of a rebuild issued by the * rebuild thread (vdev_rebuild_thread). To avoid * double counting repaired bytes the virtual dRAID * spare vdev is excluded from the processed bytes. */ if (zio->io_priority == ZIO_PRIORITY_REBUILD) { vdev_t *tvd = vd->vdev_top; vdev_rebuild_t *vr = &tvd->vdev_rebuild_config; vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; uint64_t *rebuilt = &vrp->vrp_bytes_rebuilt; if (vd->vdev_ops->vdev_op_leaf && vd->vdev_ops != &vdev_draid_spare_ops) { atomic_add_64(rebuilt, psize); } vs->vs_rebuild_processed += psize; } if (flags & ZIO_FLAG_SELF_HEAL) vs->vs_self_healed += psize; } /* * The bytes/ops/histograms are recorded at the leaf level and * aggregated into the higher level vdevs in vdev_get_stats(). */ if (vd->vdev_ops->vdev_op_leaf && (zio->io_priority < ZIO_PRIORITY_NUM_QUEUEABLE)) { zio_type_t vs_type = type; zio_priority_t priority = zio->io_priority; /* * TRIM ops and bytes are reported to user space as * ZIO_TYPE_IOCTL. This is done to preserve the * vdev_stat_t structure layout for user space. */ if (type == ZIO_TYPE_TRIM) vs_type = ZIO_TYPE_IOCTL; /* * Solely for the purposes of 'zpool iostat -lqrw' * reporting use the priority to categorize the IO. * Only the following are reported to user space: * * ZIO_PRIORITY_SYNC_READ, * ZIO_PRIORITY_SYNC_WRITE, * ZIO_PRIORITY_ASYNC_READ, * ZIO_PRIORITY_ASYNC_WRITE, * ZIO_PRIORITY_SCRUB, * ZIO_PRIORITY_TRIM, * ZIO_PRIORITY_REBUILD. */ if (priority == ZIO_PRIORITY_INITIALIZING) { ASSERT3U(type, ==, ZIO_TYPE_WRITE); priority = ZIO_PRIORITY_ASYNC_WRITE; } else if (priority == ZIO_PRIORITY_REMOVAL) { priority = ((type == ZIO_TYPE_WRITE) ? ZIO_PRIORITY_ASYNC_WRITE : ZIO_PRIORITY_ASYNC_READ); } vs->vs_ops[vs_type]++; vs->vs_bytes[vs_type] += psize; if (flags & ZIO_FLAG_DELEGATED) { vsx->vsx_agg_histo[priority] [RQ_HISTO(zio->io_size)]++; } else { vsx->vsx_ind_histo[priority] [RQ_HISTO(zio->io_size)]++; } if (zio->io_delta && zio->io_delay) { vsx->vsx_queue_histo[priority] [L_HISTO(zio->io_delta - zio->io_delay)]++; vsx->vsx_disk_histo[type] [L_HISTO(zio->io_delay)]++; vsx->vsx_total_histo[type] [L_HISTO(zio->io_delta)]++; } } mutex_exit(&vd->vdev_stat_lock); return; } if (flags & ZIO_FLAG_SPECULATIVE) return; /* * If this is an I/O error that is going to be retried, then ignore the * error. Otherwise, the user may interpret B_FAILFAST I/O errors as * hard errors, when in reality they can happen for any number of * innocuous reasons (bus resets, MPxIO link failure, etc). */ if (zio->io_error == EIO && !(zio->io_flags & ZIO_FLAG_IO_RETRY)) return; /* * Intent logs writes won't propagate their error to the root * I/O so don't mark these types of failures as pool-level * errors. */ if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) return; if (type == ZIO_TYPE_WRITE && txg != 0 && (!(flags & ZIO_FLAG_IO_REPAIR) || (flags & ZIO_FLAG_SCAN_THREAD) || spa->spa_claiming)) { /* * This is either a normal write (not a repair), or it's * a repair induced by the scrub thread, or it's a repair * made by zil_claim() during spa_load() in the first txg. * In the normal case, we commit the DTL change in the same * txg as the block was born. In the scrub-induced repair * case, we know that scrubs run in first-pass syncing context, * so we commit the DTL change in spa_syncing_txg(spa). * In the zil_claim() case, we commit in spa_first_txg(spa). * * We currently do not make DTL entries for failed spontaneous * self-healing writes triggered by normal (non-scrubbing) * reads, because we have no transactional context in which to * do so -- and it's not clear that it'd be desirable anyway. */ if (vd->vdev_ops->vdev_op_leaf) { uint64_t commit_txg = txg; if (flags & ZIO_FLAG_SCAN_THREAD) { ASSERT(flags & ZIO_FLAG_IO_REPAIR); ASSERT(spa_sync_pass(spa) == 1); vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1); commit_txg = spa_syncing_txg(spa); } else if (spa->spa_claiming) { ASSERT(flags & ZIO_FLAG_IO_REPAIR); commit_txg = spa_first_txg(spa); } ASSERT(commit_txg >= spa_syncing_txg(spa)); if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1)) return; for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1); vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg); } if (vd != rvd) vdev_dtl_dirty(vd, DTL_MISSING, txg, 1); } } int64_t vdev_deflated_space(vdev_t *vd, int64_t space) { ASSERT((space & (SPA_MINBLOCKSIZE-1)) == 0); ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache); return ((space >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio); } /* * Update the in-core space usage stats for this vdev, its metaslab class, * and the root vdev. */ void vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta, int64_t space_delta) { (void) defer_delta; int64_t dspace_delta; spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; ASSERT(vd == vd->vdev_top); /* * Apply the inverse of the psize-to-asize (ie. RAID-Z) space-expansion * factor. We must calculate this here and not at the root vdev * because the root vdev's psize-to-asize is simply the max of its * children's, thus not accurate enough for us. */ dspace_delta = vdev_deflated_space(vd, space_delta); mutex_enter(&vd->vdev_stat_lock); /* ensure we won't underflow */ if (alloc_delta < 0) { ASSERT3U(vd->vdev_stat.vs_alloc, >=, -alloc_delta); } vd->vdev_stat.vs_alloc += alloc_delta; vd->vdev_stat.vs_space += space_delta; vd->vdev_stat.vs_dspace += dspace_delta; mutex_exit(&vd->vdev_stat_lock); /* every class but log contributes to root space stats */ if (vd->vdev_mg != NULL && !vd->vdev_islog) { ASSERT(!vd->vdev_isl2cache); mutex_enter(&rvd->vdev_stat_lock); rvd->vdev_stat.vs_alloc += alloc_delta; rvd->vdev_stat.vs_space += space_delta; rvd->vdev_stat.vs_dspace += dspace_delta; mutex_exit(&rvd->vdev_stat_lock); } /* Note: metaslab_class_space_update moved to metaslab_space_update */ } /* * Mark a top-level vdev's config as dirty, placing it on the dirty list * so that it will be written out next time the vdev configuration is synced. * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs. */ void vdev_config_dirty(vdev_t *vd) { spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; int c; ASSERT(spa_writeable(spa)); /* * If this is an aux vdev (as with l2cache and spare devices), then we * update the vdev config manually and set the sync flag. */ if (vd->vdev_aux != NULL) { spa_aux_vdev_t *sav = vd->vdev_aux; nvlist_t **aux; uint_t naux; for (c = 0; c < sav->sav_count; c++) { if (sav->sav_vdevs[c] == vd) break; } if (c == sav->sav_count) { /* * We're being removed. There's nothing more to do. */ ASSERT(sav->sav_sync == B_TRUE); return; } sav->sav_sync = B_TRUE; if (nvlist_lookup_nvlist_array(sav->sav_config, ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) { VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, ZPOOL_CONFIG_SPARES, &aux, &naux) == 0); } ASSERT(c < naux); /* * Setting the nvlist in the middle if the array is a little * sketchy, but it will work. */ nvlist_free(aux[c]); aux[c] = vdev_config_generate(spa, vd, B_TRUE, 0); return; } /* * The dirty list is protected by the SCL_CONFIG lock. The caller * must either hold SCL_CONFIG as writer, or must be the sync thread * (which holds SCL_CONFIG as reader). There's only one sync thread, * so this is sufficient to ensure mutual exclusion. */ ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) || (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_CONFIG, RW_READER))); if (vd == rvd) { for (c = 0; c < rvd->vdev_children; c++) vdev_config_dirty(rvd->vdev_child[c]); } else { ASSERT(vd == vd->vdev_top); if (!list_link_active(&vd->vdev_config_dirty_node) && vdev_is_concrete(vd)) { list_insert_head(&spa->spa_config_dirty_list, vd); } } } void vdev_config_clean(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) || (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_CONFIG, RW_READER))); ASSERT(list_link_active(&vd->vdev_config_dirty_node)); list_remove(&spa->spa_config_dirty_list, vd); } /* * Mark a top-level vdev's state as dirty, so that the next pass of * spa_sync() can convert this into vdev_config_dirty(). We distinguish * the state changes from larger config changes because they require * much less locking, and are often needed for administrative actions. */ void vdev_state_dirty(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_writeable(spa)); ASSERT(vd == vd->vdev_top); /* * The state list is protected by the SCL_STATE lock. The caller * must either hold SCL_STATE as writer, or must be the sync thread * (which holds SCL_STATE as reader). There's only one sync thread, * so this is sufficient to ensure mutual exclusion. */ ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) || (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_STATE, RW_READER))); if (!list_link_active(&vd->vdev_state_dirty_node) && vdev_is_concrete(vd)) list_insert_head(&spa->spa_state_dirty_list, vd); } void vdev_state_clean(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) || (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_STATE, RW_READER))); ASSERT(list_link_active(&vd->vdev_state_dirty_node)); list_remove(&spa->spa_state_dirty_list, vd); } /* * Propagate vdev state up from children to parent. */ void vdev_propagate_state(vdev_t *vd) { spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; int degraded = 0, faulted = 0; int corrupted = 0; vdev_t *child; if (vd->vdev_children > 0) { for (int c = 0; c < vd->vdev_children; c++) { child = vd->vdev_child[c]; /* * Don't factor holes or indirect vdevs into the * decision. */ if (!vdev_is_concrete(child)) continue; if (!vdev_readable(child) || (!vdev_writeable(child) && spa_writeable(spa))) { /* * Root special: if there is a top-level log * device, treat the root vdev as if it were * degraded. */ if (child->vdev_islog && vd == rvd) degraded++; else faulted++; } else if (child->vdev_state <= VDEV_STATE_DEGRADED) { degraded++; } if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA) corrupted++; } vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded); /* * Root special: if there is a top-level vdev that cannot be * opened due to corrupted metadata, then propagate the root * vdev's aux state as 'corrupt' rather than 'insufficient * replicas'. */ if (corrupted && vd == rvd && rvd->vdev_state == VDEV_STATE_CANT_OPEN) vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); } if (vd->vdev_parent) vdev_propagate_state(vd->vdev_parent); } /* * Set a vdev's state. If this is during an open, we don't update the parent * state, because we're in the process of opening children depth-first. * Otherwise, we propagate the change to the parent. * * If this routine places a device in a faulted state, an appropriate ereport is * generated. */ void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) { uint64_t save_state; spa_t *spa = vd->vdev_spa; if (state == vd->vdev_state) { /* * Since vdev_offline() code path is already in an offline * state we can miss a statechange event to OFFLINE. Check * the previous state to catch this condition. */ if (vd->vdev_ops->vdev_op_leaf && (state == VDEV_STATE_OFFLINE) && (vd->vdev_prevstate >= VDEV_STATE_FAULTED)) { /* post an offline state change */ zfs_post_state_change(spa, vd, vd->vdev_prevstate); } vd->vdev_stat.vs_aux = aux; return; } save_state = vd->vdev_state; vd->vdev_state = state; vd->vdev_stat.vs_aux = aux; /* * If we are setting the vdev state to anything but an open state, then * always close the underlying device unless the device has requested * a delayed close (i.e. we're about to remove or fault the device). * Otherwise, we keep accessible but invalid devices open forever. * We don't call vdev_close() itself, because that implies some extra * checks (offline, etc) that we don't want here. This is limited to * leaf devices, because otherwise closing the device will affect other * children. */ if (!vd->vdev_delayed_close && vdev_is_dead(vd) && vd->vdev_ops->vdev_op_leaf) vd->vdev_ops->vdev_op_close(vd); if (vd->vdev_removed && state == VDEV_STATE_CANT_OPEN && (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) { /* * If the previous state is set to VDEV_STATE_REMOVED, then this * device was previously marked removed and someone attempted to * reopen it. If this failed due to a nonexistent device, then * keep the device in the REMOVED state. We also let this be if * it is one of our special test online cases, which is only * attempting to online the device and shouldn't generate an FMA * fault. */ vd->vdev_state = VDEV_STATE_REMOVED; vd->vdev_stat.vs_aux = VDEV_AUX_NONE; } else if (state == VDEV_STATE_REMOVED) { vd->vdev_removed = B_TRUE; } else if (state == VDEV_STATE_CANT_OPEN) { /* * If we fail to open a vdev during an import or recovery, we * mark it as "not available", which signifies that it was * never there to begin with. Failure to open such a device * is not considered an error. */ if ((spa_load_state(spa) == SPA_LOAD_IMPORT || spa_load_state(spa) == SPA_LOAD_RECOVER) && vd->vdev_ops->vdev_op_leaf) vd->vdev_not_present = 1; /* * Post the appropriate ereport. If the 'prevstate' field is * set to something other than VDEV_STATE_UNKNOWN, it indicates * that this is part of a vdev_reopen(). In this case, we don't * want to post the ereport if the device was already in the * CANT_OPEN state beforehand. * * If the 'checkremove' flag is set, then this is an attempt to * online the device in response to an insertion event. If we * hit this case, then we have detected an insertion event for a * faulted or offline device that wasn't in the removed state. * In this scenario, we don't post an ereport because we are * about to replace the device, or attempt an online with * vdev_forcefault, which will generate the fault for us. */ if ((vd->vdev_prevstate != state || vd->vdev_forcefault) && !vd->vdev_not_present && !vd->vdev_checkremove && vd != spa->spa_root_vdev) { const char *class; switch (aux) { case VDEV_AUX_OPEN_FAILED: class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED; break; case VDEV_AUX_CORRUPT_DATA: class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA; break; case VDEV_AUX_NO_REPLICAS: class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS; break; case VDEV_AUX_BAD_GUID_SUM: class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM; break; case VDEV_AUX_TOO_SMALL: class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL; break; case VDEV_AUX_BAD_LABEL: class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL; break; case VDEV_AUX_BAD_ASHIFT: class = FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT; break; default: class = FM_EREPORT_ZFS_DEVICE_UNKNOWN; } (void) zfs_ereport_post(class, spa, vd, NULL, NULL, save_state); } /* Erase any notion of persistent removed state */ vd->vdev_removed = B_FALSE; } else { vd->vdev_removed = B_FALSE; } /* * Notify ZED of any significant state-change on a leaf vdev. * */ if (vd->vdev_ops->vdev_op_leaf) { /* preserve original state from a vdev_reopen() */ if ((vd->vdev_prevstate != VDEV_STATE_UNKNOWN) && (vd->vdev_prevstate != vd->vdev_state) && (save_state <= VDEV_STATE_CLOSED)) save_state = vd->vdev_prevstate; /* filter out state change due to initial vdev_open */ if (save_state > VDEV_STATE_CLOSED) zfs_post_state_change(spa, vd, save_state); } if (!isopen && vd->vdev_parent) vdev_propagate_state(vd->vdev_parent); } boolean_t vdev_children_are_offline(vdev_t *vd) { ASSERT(!vd->vdev_ops->vdev_op_leaf); for (uint64_t i = 0; i < vd->vdev_children; i++) { if (vd->vdev_child[i]->vdev_state != VDEV_STATE_OFFLINE) return (B_FALSE); } return (B_TRUE); } /* * Check the vdev configuration to ensure that it's capable of supporting * a root pool. We do not support partial configuration. */ boolean_t vdev_is_bootable(vdev_t *vd) { if (!vd->vdev_ops->vdev_op_leaf) { const char *vdev_type = vd->vdev_ops->vdev_op_type; if (strcmp(vdev_type, VDEV_TYPE_MISSING) == 0) return (B_FALSE); } for (int c = 0; c < vd->vdev_children; c++) { if (!vdev_is_bootable(vd->vdev_child[c])) return (B_FALSE); } return (B_TRUE); } boolean_t vdev_is_concrete(vdev_t *vd) { vdev_ops_t *ops = vd->vdev_ops; if (ops == &vdev_indirect_ops || ops == &vdev_hole_ops || ops == &vdev_missing_ops || ops == &vdev_root_ops) { return (B_FALSE); } else { return (B_TRUE); } } /* * Determine if a log device has valid content. If the vdev was * removed or faulted in the MOS config then we know that * the content on the log device has already been written to the pool. */ boolean_t vdev_log_state_valid(vdev_t *vd) { if (vd->vdev_ops->vdev_op_leaf && !vd->vdev_faulted && !vd->vdev_removed) return (B_TRUE); for (int c = 0; c < vd->vdev_children; c++) if (vdev_log_state_valid(vd->vdev_child[c])) return (B_TRUE); return (B_FALSE); } /* * Expand a vdev if possible. */ void vdev_expand(vdev_t *vd, uint64_t txg) { ASSERT(vd->vdev_top == vd); ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); ASSERT(vdev_is_concrete(vd)); vdev_set_deflate_ratio(vd); if ((vd->vdev_spa->spa_raidz_expand == NULL || vd->vdev_spa->spa_raidz_expand->vre_vdev_id != vd->vdev_id) && (vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count && vdev_is_concrete(vd)) { vdev_metaslab_group_create(vd); VERIFY(vdev_metaslab_init(vd, txg) == 0); vdev_config_dirty(vd); } } /* * Split a vdev. */ void vdev_split(vdev_t *vd) { vdev_t *cvd, *pvd = vd->vdev_parent; VERIFY3U(pvd->vdev_children, >, 1); vdev_remove_child(pvd, vd); vdev_compact_children(pvd); ASSERT3P(pvd->vdev_child, !=, NULL); cvd = pvd->vdev_child[0]; if (pvd->vdev_children == 1) { vdev_remove_parent(cvd); cvd->vdev_splitting = B_TRUE; } vdev_propagate_state(cvd); } void vdev_deadman(vdev_t *vd, const char *tag) { for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; vdev_deadman(cvd, tag); } if (vd->vdev_ops->vdev_op_leaf) { vdev_queue_t *vq = &vd->vdev_queue; mutex_enter(&vq->vq_lock); if (vq->vq_active > 0) { spa_t *spa = vd->vdev_spa; zio_t *fio; uint64_t delta; zfs_dbgmsg("slow vdev: %s has %u active IOs", vd->vdev_path, vq->vq_active); /* * Look at the head of all the pending queues, * if any I/O has been outstanding for longer than * the spa_deadman_synctime invoke the deadman logic. */ fio = list_head(&vq->vq_active_list); delta = gethrtime() - fio->io_timestamp; if (delta > spa_deadman_synctime(spa)) zio_deadman(fio, tag); } mutex_exit(&vq->vq_lock); } } void vdev_defer_resilver(vdev_t *vd) { ASSERT(vd->vdev_ops->vdev_op_leaf); vd->vdev_resilver_deferred = B_TRUE; vd->vdev_spa->spa_resilver_deferred = B_TRUE; } /* * Clears the resilver deferred flag on all leaf devs under vd. Returns * B_TRUE if we have devices that need to be resilvered and are available to * accept resilver I/Os. */ boolean_t vdev_clear_resilver_deferred(vdev_t *vd, dmu_tx_t *tx) { boolean_t resilver_needed = B_FALSE; spa_t *spa = vd->vdev_spa; for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; resilver_needed |= vdev_clear_resilver_deferred(cvd, tx); } if (vd == spa->spa_root_vdev && spa_feature_is_active(spa, SPA_FEATURE_RESILVER_DEFER)) { spa_feature_decr(spa, SPA_FEATURE_RESILVER_DEFER, tx); vdev_config_dirty(vd); spa->spa_resilver_deferred = B_FALSE; return (resilver_needed); } if (!vdev_is_concrete(vd) || vd->vdev_aux || !vd->vdev_ops->vdev_op_leaf) return (resilver_needed); vd->vdev_resilver_deferred = B_FALSE; return (!vdev_is_dead(vd) && !vd->vdev_offline && vdev_resilver_needed(vd, NULL, NULL)); } boolean_t vdev_xlate_is_empty(range_seg64_t *rs) { return (rs->rs_start == rs->rs_end); } /* * Translate a logical range to the first contiguous physical range for the * specified vdev_t. This function is initially called with a leaf vdev and * will walk each parent vdev until it reaches a top-level vdev. Once the * top-level is reached the physical range is initialized and the recursive * function begins to unwind. As it unwinds it calls the parent's vdev * specific translation function to do the real conversion. */ void vdev_xlate(vdev_t *vd, const range_seg64_t *logical_rs, range_seg64_t *physical_rs, range_seg64_t *remain_rs) { /* * Walk up the vdev tree */ if (vd != vd->vdev_top) { vdev_xlate(vd->vdev_parent, logical_rs, physical_rs, remain_rs); } else { /* * We've reached the top-level vdev, initialize the physical * range to the logical range and set an empty remaining * range then start to unwind. */ physical_rs->rs_start = logical_rs->rs_start; physical_rs->rs_end = logical_rs->rs_end; remain_rs->rs_start = logical_rs->rs_start; remain_rs->rs_end = logical_rs->rs_start; return; } vdev_t *pvd = vd->vdev_parent; ASSERT3P(pvd, !=, NULL); ASSERT3P(pvd->vdev_ops->vdev_op_xlate, !=, NULL); /* * As this recursive function unwinds, translate the logical * range into its physical and any remaining components by calling * the vdev specific translate function. */ range_seg64_t intermediate = { 0 }; pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate, remain_rs); physical_rs->rs_start = intermediate.rs_start; physical_rs->rs_end = intermediate.rs_end; } void vdev_xlate_walk(vdev_t *vd, const range_seg64_t *logical_rs, vdev_xlate_func_t *func, void *arg) { range_seg64_t iter_rs = *logical_rs; range_seg64_t physical_rs; range_seg64_t remain_rs; while (!vdev_xlate_is_empty(&iter_rs)) { vdev_xlate(vd, &iter_rs, &physical_rs, &remain_rs); /* * With raidz and dRAID, it's possible that the logical range * does not live on this leaf vdev. Only when there is a non- * zero physical size call the provided function. */ if (!vdev_xlate_is_empty(&physical_rs)) func(arg, &physical_rs); iter_rs = remain_rs; } } static char * vdev_name(vdev_t *vd, char *buf, int buflen) { if (vd->vdev_path == NULL) { if (strcmp(vd->vdev_ops->vdev_op_type, "root") == 0) { strlcpy(buf, vd->vdev_spa->spa_name, buflen); } else if (!vd->vdev_ops->vdev_op_leaf) { snprintf(buf, buflen, "%s-%llu", vd->vdev_ops->vdev_op_type, (u_longlong_t)vd->vdev_id); } } else { strlcpy(buf, vd->vdev_path, buflen); } return (buf); } /* * Look at the vdev tree and determine whether any devices are currently being * replaced. */ boolean_t vdev_replace_in_progress(vdev_t *vdev) { ASSERT(spa_config_held(vdev->vdev_spa, SCL_ALL, RW_READER) != 0); if (vdev->vdev_ops == &vdev_replacing_ops) return (B_TRUE); /* * A 'spare' vdev indicates that we have a replace in progress, unless * it has exactly two children, and the second, the hot spare, has * finished being resilvered. */ if (vdev->vdev_ops == &vdev_spare_ops && (vdev->vdev_children > 2 || !vdev_dtl_empty(vdev->vdev_child[1], DTL_MISSING))) return (B_TRUE); for (int i = 0; i < vdev->vdev_children; i++) { if (vdev_replace_in_progress(vdev->vdev_child[i])) return (B_TRUE); } return (B_FALSE); } /* * Add a (source=src, propname=propval) list to an nvlist. */ static void vdev_prop_add_list(nvlist_t *nvl, const char *propname, const char *strval, uint64_t intval, zprop_source_t src) { nvlist_t *propval; propval = fnvlist_alloc(); fnvlist_add_uint64(propval, ZPROP_SOURCE, src); if (strval != NULL) fnvlist_add_string(propval, ZPROP_VALUE, strval); else fnvlist_add_uint64(propval, ZPROP_VALUE, intval); fnvlist_add_nvlist(nvl, propname, propval); nvlist_free(propval); } static void vdev_props_set_sync(void *arg, dmu_tx_t *tx) { vdev_t *vd; nvlist_t *nvp = arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; objset_t *mos = spa->spa_meta_objset; nvpair_t *elem = NULL; uint64_t vdev_guid; uint64_t objid; nvlist_t *nvprops; vdev_guid = fnvlist_lookup_uint64(nvp, ZPOOL_VDEV_PROPS_SET_VDEV); nvprops = fnvlist_lookup_nvlist(nvp, ZPOOL_VDEV_PROPS_SET_PROPS); vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE); /* this vdev could get removed while waiting for this sync task */ if (vd == NULL) return; /* * Set vdev property values in the vdev props mos object. */ if (vd->vdev_root_zap != 0) { objid = vd->vdev_root_zap; } else if (vd->vdev_top_zap != 0) { objid = vd->vdev_top_zap; } else if (vd->vdev_leaf_zap != 0) { objid = vd->vdev_leaf_zap; } else { panic("unexpected vdev type"); } mutex_enter(&spa->spa_props_lock); while ((elem = nvlist_next_nvpair(nvprops, elem)) != NULL) { uint64_t intval; const char *strval; vdev_prop_t prop; const char *propname = nvpair_name(elem); zprop_type_t proptype; switch (prop = vdev_name_to_prop(propname)) { case VDEV_PROP_USERPROP: if (vdev_prop_user(propname)) { strval = fnvpair_value_string(elem); if (strlen(strval) == 0) { /* remove the property if value == "" */ (void) zap_remove(mos, objid, propname, tx); } else { VERIFY0(zap_update(mos, objid, propname, 1, strlen(strval) + 1, strval, tx)); } spa_history_log_internal(spa, "vdev set", tx, "vdev_guid=%llu: %s=%s", (u_longlong_t)vdev_guid, nvpair_name(elem), strval); } break; default: /* normalize the property name */ propname = vdev_prop_to_name(prop); proptype = vdev_prop_get_type(prop); if (nvpair_type(elem) == DATA_TYPE_STRING) { ASSERT(proptype == PROP_TYPE_STRING); strval = fnvpair_value_string(elem); VERIFY0(zap_update(mos, objid, propname, 1, strlen(strval) + 1, strval, tx)); spa_history_log_internal(spa, "vdev set", tx, "vdev_guid=%llu: %s=%s", (u_longlong_t)vdev_guid, nvpair_name(elem), strval); } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { intval = fnvpair_value_uint64(elem); if (proptype == PROP_TYPE_INDEX) { const char *unused; VERIFY0(vdev_prop_index_to_string( prop, intval, &unused)); } VERIFY0(zap_update(mos, objid, propname, sizeof (uint64_t), 1, &intval, tx)); spa_history_log_internal(spa, "vdev set", tx, "vdev_guid=%llu: %s=%lld", (u_longlong_t)vdev_guid, nvpair_name(elem), (longlong_t)intval); } else { panic("invalid vdev property type %u", nvpair_type(elem)); } } } mutex_exit(&spa->spa_props_lock); } int vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) { spa_t *spa = vd->vdev_spa; nvpair_t *elem = NULL; uint64_t vdev_guid; nvlist_t *nvprops; int error = 0; ASSERT(vd != NULL); /* Check that vdev has a zap we can use */ if (vd->vdev_root_zap == 0 && vd->vdev_top_zap == 0 && vd->vdev_leaf_zap == 0) return (SET_ERROR(EINVAL)); if (nvlist_lookup_uint64(innvl, ZPOOL_VDEV_PROPS_SET_VDEV, &vdev_guid) != 0) return (SET_ERROR(EINVAL)); if (nvlist_lookup_nvlist(innvl, ZPOOL_VDEV_PROPS_SET_PROPS, &nvprops) != 0) return (SET_ERROR(EINVAL)); if ((vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE)) == NULL) return (SET_ERROR(EINVAL)); while ((elem = nvlist_next_nvpair(nvprops, elem)) != NULL) { const char *propname = nvpair_name(elem); vdev_prop_t prop = vdev_name_to_prop(propname); uint64_t intval = 0; const char *strval = NULL; if (prop == VDEV_PROP_USERPROP && !vdev_prop_user(propname)) { error = EINVAL; goto end; } if (vdev_prop_readonly(prop)) { error = EROFS; goto end; } /* Special Processing */ switch (prop) { case VDEV_PROP_PATH: if (vd->vdev_path == NULL) { error = EROFS; break; } if (nvpair_value_string(elem, &strval) != 0) { error = EINVAL; break; } /* New path must start with /dev/ */ if (strncmp(strval, "/dev/", 5)) { error = EINVAL; break; } error = spa_vdev_setpath(spa, vdev_guid, strval); break; case VDEV_PROP_ALLOCATING: if (nvpair_value_uint64(elem, &intval) != 0) { error = EINVAL; break; } if (intval != vd->vdev_noalloc) break; if (intval == 0) error = spa_vdev_noalloc(spa, vdev_guid); else error = spa_vdev_alloc(spa, vdev_guid); break; case VDEV_PROP_FAILFAST: if (nvpair_value_uint64(elem, &intval) != 0) { error = EINVAL; break; } vd->vdev_failfast = intval & 1; break; case VDEV_PROP_CHECKSUM_N: if (nvpair_value_uint64(elem, &intval) != 0) { error = EINVAL; break; } vd->vdev_checksum_n = intval; break; case VDEV_PROP_CHECKSUM_T: if (nvpair_value_uint64(elem, &intval) != 0) { error = EINVAL; break; } vd->vdev_checksum_t = intval; break; case VDEV_PROP_IO_N: if (nvpair_value_uint64(elem, &intval) != 0) { error = EINVAL; break; } vd->vdev_io_n = intval; break; case VDEV_PROP_IO_T: if (nvpair_value_uint64(elem, &intval) != 0) { error = EINVAL; break; } vd->vdev_io_t = intval; break; + case VDEV_PROP_SLOW_IO_N: + if (nvpair_value_uint64(elem, &intval) != 0) { + error = EINVAL; + break; + } + vd->vdev_slow_io_n = intval; + break; + case VDEV_PROP_SLOW_IO_T: + if (nvpair_value_uint64(elem, &intval) != 0) { + error = EINVAL; + break; + } + vd->vdev_slow_io_t = intval; + break; default: /* Most processing is done in vdev_props_set_sync */ break; } end: if (error != 0) { intval = error; vdev_prop_add_list(outnvl, propname, strval, intval, 0); return (error); } } return (dsl_sync_task(spa->spa_name, NULL, vdev_props_set_sync, innvl, 6, ZFS_SPACE_CHECK_EXTRA_RESERVED)); } int vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) { spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; int err = 0; uint64_t objid; uint64_t vdev_guid; nvpair_t *elem = NULL; nvlist_t *nvprops = NULL; uint64_t intval = 0; char *strval = NULL; const char *propname = NULL; vdev_prop_t prop; ASSERT(vd != NULL); ASSERT(mos != NULL); if (nvlist_lookup_uint64(innvl, ZPOOL_VDEV_PROPS_GET_VDEV, &vdev_guid) != 0) return (SET_ERROR(EINVAL)); nvlist_lookup_nvlist(innvl, ZPOOL_VDEV_PROPS_GET_PROPS, &nvprops); if (vd->vdev_root_zap != 0) { objid = vd->vdev_root_zap; } else if (vd->vdev_top_zap != 0) { objid = vd->vdev_top_zap; } else if (vd->vdev_leaf_zap != 0) { objid = vd->vdev_leaf_zap; } else { return (SET_ERROR(EINVAL)); } ASSERT(objid != 0); mutex_enter(&spa->spa_props_lock); if (nvprops != NULL) { char namebuf[64] = { 0 }; while ((elem = nvlist_next_nvpair(nvprops, elem)) != NULL) { intval = 0; strval = NULL; propname = nvpair_name(elem); prop = vdev_name_to_prop(propname); zprop_source_t src = ZPROP_SRC_DEFAULT; uint64_t integer_size, num_integers; switch (prop) { /* Special Read-only Properties */ case VDEV_PROP_NAME: strval = vdev_name(vd, namebuf, sizeof (namebuf)); if (strval == NULL) continue; vdev_prop_add_list(outnvl, propname, strval, 0, ZPROP_SRC_NONE); continue; case VDEV_PROP_CAPACITY: /* percent used */ intval = (vd->vdev_stat.vs_dspace == 0) ? 0 : (vd->vdev_stat.vs_alloc * 100 / vd->vdev_stat.vs_dspace); vdev_prop_add_list(outnvl, propname, NULL, intval, ZPROP_SRC_NONE); continue; case VDEV_PROP_STATE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_state, ZPROP_SRC_NONE); continue; case VDEV_PROP_GUID: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_guid, ZPROP_SRC_NONE); continue; case VDEV_PROP_ASIZE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_asize, ZPROP_SRC_NONE); continue; case VDEV_PROP_PSIZE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_psize, ZPROP_SRC_NONE); continue; case VDEV_PROP_ASHIFT: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_ashift, ZPROP_SRC_NONE); continue; case VDEV_PROP_SIZE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_dspace, ZPROP_SRC_NONE); continue; case VDEV_PROP_FREE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_dspace - vd->vdev_stat.vs_alloc, ZPROP_SRC_NONE); continue; case VDEV_PROP_ALLOCATED: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_alloc, ZPROP_SRC_NONE); continue; case VDEV_PROP_EXPANDSZ: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_esize, ZPROP_SRC_NONE); continue; case VDEV_PROP_FRAGMENTATION: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_fragmentation, ZPROP_SRC_NONE); continue; case VDEV_PROP_PARITY: vdev_prop_add_list(outnvl, propname, NULL, vdev_get_nparity(vd), ZPROP_SRC_NONE); continue; case VDEV_PROP_PATH: if (vd->vdev_path == NULL) continue; vdev_prop_add_list(outnvl, propname, vd->vdev_path, 0, ZPROP_SRC_NONE); continue; case VDEV_PROP_DEVID: if (vd->vdev_devid == NULL) continue; vdev_prop_add_list(outnvl, propname, vd->vdev_devid, 0, ZPROP_SRC_NONE); continue; case VDEV_PROP_PHYS_PATH: if (vd->vdev_physpath == NULL) continue; vdev_prop_add_list(outnvl, propname, vd->vdev_physpath, 0, ZPROP_SRC_NONE); continue; case VDEV_PROP_ENC_PATH: if (vd->vdev_enc_sysfs_path == NULL) continue; vdev_prop_add_list(outnvl, propname, vd->vdev_enc_sysfs_path, 0, ZPROP_SRC_NONE); continue; case VDEV_PROP_FRU: if (vd->vdev_fru == NULL) continue; vdev_prop_add_list(outnvl, propname, vd->vdev_fru, 0, ZPROP_SRC_NONE); continue; case VDEV_PROP_PARENT: if (vd->vdev_parent != NULL) { strval = vdev_name(vd->vdev_parent, namebuf, sizeof (namebuf)); vdev_prop_add_list(outnvl, propname, strval, 0, ZPROP_SRC_NONE); } continue; case VDEV_PROP_CHILDREN: if (vd->vdev_children > 0) strval = kmem_zalloc(ZAP_MAXVALUELEN, KM_SLEEP); for (uint64_t i = 0; i < vd->vdev_children; i++) { const char *vname; vname = vdev_name(vd->vdev_child[i], namebuf, sizeof (namebuf)); if (vname == NULL) vname = "(unknown)"; if (strlen(strval) > 0) strlcat(strval, ",", ZAP_MAXVALUELEN); strlcat(strval, vname, ZAP_MAXVALUELEN); } if (strval != NULL) { vdev_prop_add_list(outnvl, propname, strval, 0, ZPROP_SRC_NONE); kmem_free(strval, ZAP_MAXVALUELEN); } continue; case VDEV_PROP_NUMCHILDREN: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_children, ZPROP_SRC_NONE); continue; case VDEV_PROP_READ_ERRORS: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_read_errors, ZPROP_SRC_NONE); continue; case VDEV_PROP_WRITE_ERRORS: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_write_errors, ZPROP_SRC_NONE); continue; case VDEV_PROP_CHECKSUM_ERRORS: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_checksum_errors, ZPROP_SRC_NONE); continue; case VDEV_PROP_INITIALIZE_ERRORS: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_initialize_errors, ZPROP_SRC_NONE); continue; case VDEV_PROP_OPS_NULL: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_ops[ZIO_TYPE_NULL], ZPROP_SRC_NONE); continue; case VDEV_PROP_OPS_READ: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_ops[ZIO_TYPE_READ], ZPROP_SRC_NONE); continue; case VDEV_PROP_OPS_WRITE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_ops[ZIO_TYPE_WRITE], ZPROP_SRC_NONE); continue; case VDEV_PROP_OPS_FREE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_ops[ZIO_TYPE_FREE], ZPROP_SRC_NONE); continue; case VDEV_PROP_OPS_CLAIM: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_ops[ZIO_TYPE_CLAIM], ZPROP_SRC_NONE); continue; case VDEV_PROP_OPS_TRIM: /* * TRIM ops and bytes are reported to user * space as ZIO_TYPE_IOCTL. This is done to * preserve the vdev_stat_t structure layout * for user space. */ vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_ops[ZIO_TYPE_IOCTL], ZPROP_SRC_NONE); continue; case VDEV_PROP_BYTES_NULL: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_bytes[ZIO_TYPE_NULL], ZPROP_SRC_NONE); continue; case VDEV_PROP_BYTES_READ: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_bytes[ZIO_TYPE_READ], ZPROP_SRC_NONE); continue; case VDEV_PROP_BYTES_WRITE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_bytes[ZIO_TYPE_WRITE], ZPROP_SRC_NONE); continue; case VDEV_PROP_BYTES_FREE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_bytes[ZIO_TYPE_FREE], ZPROP_SRC_NONE); continue; case VDEV_PROP_BYTES_CLAIM: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_bytes[ZIO_TYPE_CLAIM], ZPROP_SRC_NONE); continue; case VDEV_PROP_BYTES_TRIM: /* * TRIM ops and bytes are reported to user * space as ZIO_TYPE_IOCTL. This is done to * preserve the vdev_stat_t structure layout * for user space. */ vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_bytes[ZIO_TYPE_IOCTL], ZPROP_SRC_NONE); continue; case VDEV_PROP_REMOVING: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_removing, ZPROP_SRC_NONE); continue; case VDEV_PROP_RAIDZ_EXPANDING: /* Only expose this for raidz */ if (vd->vdev_ops == &vdev_raidz_ops) { vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_rz_expanding, ZPROP_SRC_NONE); } continue; /* Numeric Properites */ case VDEV_PROP_ALLOCATING: /* Leaf vdevs cannot have this property */ if (vd->vdev_mg == NULL && vd->vdev_top != NULL) { src = ZPROP_SRC_NONE; intval = ZPROP_BOOLEAN_NA; } else { err = vdev_prop_get_int(vd, prop, &intval); if (err && err != ENOENT) break; if (intval == vdev_prop_default_numeric(prop)) src = ZPROP_SRC_DEFAULT; else src = ZPROP_SRC_LOCAL; } vdev_prop_add_list(outnvl, propname, NULL, intval, src); break; case VDEV_PROP_FAILFAST: src = ZPROP_SRC_LOCAL; strval = NULL; err = zap_lookup(mos, objid, nvpair_name(elem), sizeof (uint64_t), 1, &intval); if (err == ENOENT) { intval = vdev_prop_default_numeric( prop); err = 0; } else if (err) { break; } if (intval == vdev_prop_default_numeric(prop)) src = ZPROP_SRC_DEFAULT; vdev_prop_add_list(outnvl, propname, strval, intval, src); break; case VDEV_PROP_CHECKSUM_N: case VDEV_PROP_CHECKSUM_T: case VDEV_PROP_IO_N: case VDEV_PROP_IO_T: + case VDEV_PROP_SLOW_IO_N: + case VDEV_PROP_SLOW_IO_T: err = vdev_prop_get_int(vd, prop, &intval); if (err && err != ENOENT) break; if (intval == vdev_prop_default_numeric(prop)) src = ZPROP_SRC_DEFAULT; else src = ZPROP_SRC_LOCAL; vdev_prop_add_list(outnvl, propname, NULL, intval, src); break; /* Text Properties */ case VDEV_PROP_COMMENT: /* Exists in the ZAP below */ /* FALLTHRU */ case VDEV_PROP_USERPROP: /* User Properites */ src = ZPROP_SRC_LOCAL; err = zap_length(mos, objid, nvpair_name(elem), &integer_size, &num_integers); if (err) break; switch (integer_size) { case 8: /* User properties cannot be integers */ err = EINVAL; break; case 1: /* string property */ strval = kmem_alloc(num_integers, KM_SLEEP); err = zap_lookup(mos, objid, nvpair_name(elem), 1, num_integers, strval); if (err) { kmem_free(strval, num_integers); break; } vdev_prop_add_list(outnvl, propname, strval, 0, src); kmem_free(strval, num_integers); break; } break; default: err = ENOENT; break; } if (err) break; } } else { /* * Get all properties from the MOS vdev property object. */ zap_cursor_t zc; zap_attribute_t za; for (zap_cursor_init(&zc, mos, objid); (err = zap_cursor_retrieve(&zc, &za)) == 0; zap_cursor_advance(&zc)) { intval = 0; strval = NULL; zprop_source_t src = ZPROP_SRC_DEFAULT; propname = za.za_name; switch (za.za_integer_length) { case 8: /* We do not allow integer user properties */ /* This is likely an internal value */ break; case 1: /* string property */ strval = kmem_alloc(za.za_num_integers, KM_SLEEP); err = zap_lookup(mos, objid, za.za_name, 1, za.za_num_integers, strval); if (err) { kmem_free(strval, za.za_num_integers); break; } vdev_prop_add_list(outnvl, propname, strval, 0, src); kmem_free(strval, za.za_num_integers); break; default: break; } } zap_cursor_fini(&zc); } mutex_exit(&spa->spa_props_lock); if (err && err != ENOENT) { return (err); } return (0); } EXPORT_SYMBOL(vdev_fault); EXPORT_SYMBOL(vdev_degrade); EXPORT_SYMBOL(vdev_online); EXPORT_SYMBOL(vdev_offline); EXPORT_SYMBOL(vdev_clear); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, default_ms_count, UINT, ZMOD_RW, "Target number of metaslabs per top-level vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, default_ms_shift, UINT, ZMOD_RW, "Default lower limit for metaslab size"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, max_ms_shift, UINT, ZMOD_RW, "Default upper limit for metaslab size"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, min_ms_count, UINT, ZMOD_RW, "Minimum number of metaslabs per top-level vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, ms_count_limit, UINT, ZMOD_RW, "Practical upper limit of total metaslabs per top-level vdev"); ZFS_MODULE_PARAM(zfs, zfs_, slow_io_events_per_second, UINT, ZMOD_RW, "Rate limit slow IO (delay) events to this many per second"); /* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs, zfs_, checksum_events_per_second, UINT, ZMOD_RW, "Rate limit checksum events to this many checksum errors per second " "(do not set below ZED threshold)."); /* END CSTYLED */ ZFS_MODULE_PARAM(zfs, zfs_, scan_ignore_errors, INT, ZMOD_RW, "Ignore errors during resilver/scrub"); ZFS_MODULE_PARAM(zfs_vdev, vdev_, validate_skip, INT, ZMOD_RW, "Bypass vdev_validate()"); ZFS_MODULE_PARAM(zfs, zfs_, nocacheflush, INT, ZMOD_RW, "Disable cache flushes"); ZFS_MODULE_PARAM(zfs, zfs_, embedded_slog_min_ms, UINT, ZMOD_RW, "Minimum number of metaslabs required to dedicate one for log blocks"); /* BEGIN CSTYLED */ ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, min_auto_ashift, param_set_min_auto_ashift, param_get_uint, ZMOD_RW, "Minimum ashift used when creating new top-level vdevs"); ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, max_auto_ashift, param_set_max_auto_ashift, param_get_uint, ZMOD_RW, "Maximum ashift used when optimizing for logical -> physical sector " "size on new top-level vdevs"); /* END CSTYLED */ diff --git a/module/zfs/zfs_fm.c b/module/zfs/zfs_fm.c index c4eb74e873db..481af2ba826b 100644 --- a/module/zfs/zfs_fm.c +++ b/module/zfs/zfs_fm.c @@ -1,1572 +1,1598 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* * Copyright (c) 2012,2021 by Delphix. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include /* * This general routine is responsible for generating all the different ZFS * ereports. The payload is dependent on the class, and which arguments are * supplied to the function: * * EREPORT POOL VDEV IO * block X X X * data X X * device X X * pool X * * If we are in a loading state, all errors are chained together by the same * SPA-wide ENA (Error Numeric Association). * * For isolated I/O requests, we get the ENA from the zio_t. The propagation * gets very complicated due to RAID-Z, gang blocks, and vdev caching. We want * to chain together all ereports associated with a logical piece of data. For * read I/Os, there are basically three 'types' of I/O, which form a roughly * layered diagram: * * +---------------+ * | Aggregate I/O | No associated logical data or device * +---------------+ * | * V * +---------------+ Reads associated with a piece of logical data. * | Read I/O | This includes reads on behalf of RAID-Z, * +---------------+ mirrors, gang blocks, retries, etc. * | * V * +---------------+ Reads associated with a particular device, but * | Physical I/O | no logical data. Issued as part of vdev caching * +---------------+ and I/O aggregation. * * Note that 'physical I/O' here is not the same terminology as used in the rest * of ZIO. Typically, 'physical I/O' simply means that there is no attached * blockpointer. But I/O with no associated block pointer can still be related * to a logical piece of data (i.e. RAID-Z requests). * * Purely physical I/O always have unique ENAs. They are not related to a * particular piece of logical data, and therefore cannot be chained together. * We still generate an ereport, but the DE doesn't correlate it with any * logical piece of data. When such an I/O fails, the delegated I/O requests * will issue a retry, which will trigger the 'real' ereport with the correct * ENA. * * We keep track of the ENA for a ZIO chain through the 'io_logical' member. * When a new logical I/O is issued, we set this to point to itself. Child I/Os * then inherit this pointer, so that when it is first set subsequent failures * will use the same ENA. For vdev cache fill and queue aggregation I/O, * this pointer is set to NULL, and no ereport will be generated (since it * doesn't actually correspond to any particular device or piece of data, * and the caller will always retry without caching or queueing anyway). * * For checksum errors, we want to include more information about the actual * error which occurs. Accordingly, we build an ereport when the error is * noticed, but instead of sending it in immediately, we hang it off of the * io_cksum_report field of the logical IO. When the logical IO completes * (successfully or not), zfs_ereport_finish_checksum() is called with the * good and bad versions of the buffer (if available), and we annotate the * ereport with information about the differences. */ #ifdef _KERNEL /* * Duplicate ereport Detection * * Some ereports are retained momentarily for detecting duplicates. These * are kept in a recent_events_node_t in both a time-ordered list and an AVL * tree of recent unique ereports. * * The lifespan of these recent ereports is bounded (15 mins) and a cleaner * task is used to purge stale entries. */ static list_t recent_events_list; static avl_tree_t recent_events_tree; static kmutex_t recent_events_lock; static taskqid_t recent_events_cleaner_tqid; /* * Each node is about 128 bytes so 2,000 would consume 1/4 MiB. * * This setting can be changed dynamically and setting it to zero * disables duplicate detection. */ static unsigned int zfs_zevent_retain_max = 2000; /* * The lifespan for a recent ereport entry. The default of 15 minutes is * intended to outlive the zfs diagnosis engine's threshold of 10 errors * over a period of 10 minutes. */ static unsigned int zfs_zevent_retain_expire_secs = 900; typedef enum zfs_subclass { ZSC_IO, ZSC_DATA, ZSC_CHECKSUM } zfs_subclass_t; typedef struct { /* common criteria */ uint64_t re_pool_guid; uint64_t re_vdev_guid; int re_io_error; uint64_t re_io_size; uint64_t re_io_offset; zfs_subclass_t re_subclass; zio_priority_t re_io_priority; /* logical zio criteria (optional) */ zbookmark_phys_t re_io_bookmark; /* internal state */ avl_node_t re_tree_link; list_node_t re_list_link; uint64_t re_timestamp; } recent_events_node_t; static int recent_events_compare(const void *a, const void *b) { const recent_events_node_t *node1 = a; const recent_events_node_t *node2 = b; int cmp; /* * The comparison order here is somewhat arbitrary. * What's important is that if every criteria matches, then it * is a duplicate (i.e. compare returns 0) */ if ((cmp = TREE_CMP(node1->re_subclass, node2->re_subclass)) != 0) return (cmp); if ((cmp = TREE_CMP(node1->re_pool_guid, node2->re_pool_guid)) != 0) return (cmp); if ((cmp = TREE_CMP(node1->re_vdev_guid, node2->re_vdev_guid)) != 0) return (cmp); if ((cmp = TREE_CMP(node1->re_io_error, node2->re_io_error)) != 0) return (cmp); if ((cmp = TREE_CMP(node1->re_io_priority, node2->re_io_priority)) != 0) return (cmp); if ((cmp = TREE_CMP(node1->re_io_size, node2->re_io_size)) != 0) return (cmp); if ((cmp = TREE_CMP(node1->re_io_offset, node2->re_io_offset)) != 0) return (cmp); const zbookmark_phys_t *zb1 = &node1->re_io_bookmark; const zbookmark_phys_t *zb2 = &node2->re_io_bookmark; if ((cmp = TREE_CMP(zb1->zb_objset, zb2->zb_objset)) != 0) return (cmp); if ((cmp = TREE_CMP(zb1->zb_object, zb2->zb_object)) != 0) return (cmp); if ((cmp = TREE_CMP(zb1->zb_level, zb2->zb_level)) != 0) return (cmp); if ((cmp = TREE_CMP(zb1->zb_blkid, zb2->zb_blkid)) != 0) return (cmp); return (0); } /* * workaround: vdev properties don't have inheritance */ static uint64_t vdev_prop_get_inherited(vdev_t *vd, vdev_prop_t prop) { uint64_t propdef, propval; propdef = vdev_prop_default_numeric(prop); switch (prop) { case VDEV_PROP_CHECKSUM_N: propval = vd->vdev_checksum_n; break; case VDEV_PROP_CHECKSUM_T: propval = vd->vdev_checksum_t; break; case VDEV_PROP_IO_N: propval = vd->vdev_io_n; break; case VDEV_PROP_IO_T: propval = vd->vdev_io_t; break; + case VDEV_PROP_SLOW_IO_N: + propval = vd->vdev_slow_io_n; + break; + case VDEV_PROP_SLOW_IO_T: + propval = vd->vdev_slow_io_t; + break; default: propval = propdef; break; } if (propval != propdef) return (propval); if (vd->vdev_parent == NULL) return (propdef); return (vdev_prop_get_inherited(vd->vdev_parent, prop)); } static void zfs_ereport_schedule_cleaner(void); /* * background task to clean stale recent event nodes. */ static void zfs_ereport_cleaner(void *arg) { recent_events_node_t *entry; uint64_t now = gethrtime(); /* * purge expired entries */ mutex_enter(&recent_events_lock); while ((entry = list_tail(&recent_events_list)) != NULL) { uint64_t age = NSEC2SEC(now - entry->re_timestamp); if (age <= zfs_zevent_retain_expire_secs) break; /* remove expired node */ avl_remove(&recent_events_tree, entry); list_remove(&recent_events_list, entry); kmem_free(entry, sizeof (*entry)); } /* Restart the cleaner if more entries remain */ recent_events_cleaner_tqid = 0; if (!list_is_empty(&recent_events_list)) zfs_ereport_schedule_cleaner(); mutex_exit(&recent_events_lock); } static void zfs_ereport_schedule_cleaner(void) { ASSERT(MUTEX_HELD(&recent_events_lock)); uint64_t timeout = SEC2NSEC(zfs_zevent_retain_expire_secs + 1); recent_events_cleaner_tqid = taskq_dispatch_delay( system_delay_taskq, zfs_ereport_cleaner, NULL, TQ_SLEEP, ddi_get_lbolt() + NSEC_TO_TICK(timeout)); } /* * Clear entries for a given vdev or all vdevs in a pool when vdev == NULL */ void zfs_ereport_clear(spa_t *spa, vdev_t *vd) { uint64_t vdev_guid, pool_guid; ASSERT(vd != NULL || spa != NULL); if (vd == NULL) { vdev_guid = 0; pool_guid = spa_guid(spa); } else { vdev_guid = vd->vdev_guid; pool_guid = 0; } mutex_enter(&recent_events_lock); recent_events_node_t *next = list_head(&recent_events_list); while (next != NULL) { recent_events_node_t *entry = next; next = list_next(&recent_events_list, next); if (entry->re_vdev_guid == vdev_guid || entry->re_pool_guid == pool_guid) { avl_remove(&recent_events_tree, entry); list_remove(&recent_events_list, entry); kmem_free(entry, sizeof (*entry)); } } mutex_exit(&recent_events_lock); } /* * Check if an ereport would be a duplicate of one recently posted. * * An ereport is considered a duplicate if the set of criteria in * recent_events_node_t all match. * * Only FM_EREPORT_ZFS_IO, FM_EREPORT_ZFS_DATA, and FM_EREPORT_ZFS_CHECKSUM * are candidates for duplicate checking. */ static boolean_t zfs_ereport_is_duplicate(const char *subclass, spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, zio_t *zio, uint64_t offset, uint64_t size) { recent_events_node_t search = {0}, *entry; if (vd == NULL || zio == NULL) return (B_FALSE); if (zfs_zevent_retain_max == 0) return (B_FALSE); if (strcmp(subclass, FM_EREPORT_ZFS_IO) == 0) search.re_subclass = ZSC_IO; else if (strcmp(subclass, FM_EREPORT_ZFS_DATA) == 0) search.re_subclass = ZSC_DATA; else if (strcmp(subclass, FM_EREPORT_ZFS_CHECKSUM) == 0) search.re_subclass = ZSC_CHECKSUM; else return (B_FALSE); search.re_pool_guid = spa_guid(spa); search.re_vdev_guid = vd->vdev_guid; search.re_io_error = zio->io_error; search.re_io_priority = zio->io_priority; /* if size is supplied use it over what's in zio */ if (size) { search.re_io_size = size; search.re_io_offset = offset; } else { search.re_io_size = zio->io_size; search.re_io_offset = zio->io_offset; } /* grab optional logical zio criteria */ if (zb != NULL) { search.re_io_bookmark.zb_objset = zb->zb_objset; search.re_io_bookmark.zb_object = zb->zb_object; search.re_io_bookmark.zb_level = zb->zb_level; search.re_io_bookmark.zb_blkid = zb->zb_blkid; } uint64_t now = gethrtime(); mutex_enter(&recent_events_lock); /* check if we have seen this one recently */ entry = avl_find(&recent_events_tree, &search, NULL); if (entry != NULL) { uint64_t age = NSEC2SEC(now - entry->re_timestamp); /* * There is still an active cleaner (since we're here). * Reset the last seen time for this duplicate entry * so that its lifespand gets extended. */ list_remove(&recent_events_list, entry); list_insert_head(&recent_events_list, entry); entry->re_timestamp = now; zfs_zevent_track_duplicate(); mutex_exit(&recent_events_lock); return (age <= zfs_zevent_retain_expire_secs); } if (avl_numnodes(&recent_events_tree) >= zfs_zevent_retain_max) { /* recycle oldest node */ entry = list_tail(&recent_events_list); ASSERT(entry != NULL); list_remove(&recent_events_list, entry); avl_remove(&recent_events_tree, entry); } else { entry = kmem_alloc(sizeof (recent_events_node_t), KM_SLEEP); } /* record this as a recent ereport */ *entry = search; avl_add(&recent_events_tree, entry); list_insert_head(&recent_events_list, entry); entry->re_timestamp = now; /* Start a cleaner if not already scheduled */ if (recent_events_cleaner_tqid == 0) zfs_ereport_schedule_cleaner(); mutex_exit(&recent_events_lock); return (B_FALSE); } void zfs_zevent_post_cb(nvlist_t *nvl, nvlist_t *detector) { if (nvl) fm_nvlist_destroy(nvl, FM_NVA_FREE); if (detector) fm_nvlist_destroy(detector, FM_NVA_FREE); } /* * We want to rate limit ZIO delay, deadman, and checksum events so as to not * flood zevent consumers when a disk is acting up. * * Returns 1 if we're ratelimiting, 0 if not. */ static int zfs_is_ratelimiting_event(const char *subclass, vdev_t *vd) { int rc = 0; /* * zfs_ratelimit() returns 1 if we're *not* ratelimiting and 0 if we * are. Invert it to get our return value. */ if (strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) { rc = !zfs_ratelimit(&vd->vdev_delay_rl); } else if (strcmp(subclass, FM_EREPORT_ZFS_DEADMAN) == 0) { rc = !zfs_ratelimit(&vd->vdev_deadman_rl); } else if (strcmp(subclass, FM_EREPORT_ZFS_CHECKSUM) == 0) { rc = !zfs_ratelimit(&vd->vdev_checksum_rl); } if (rc) { /* We're rate limiting */ fm_erpt_dropped_increment(); } return (rc); } /* * Return B_TRUE if the event actually posted, B_FALSE if not. */ static boolean_t zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, const char *subclass, spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, zio_t *zio, uint64_t stateoroffset, uint64_t size) { nvlist_t *ereport, *detector; uint64_t ena; char class[64]; if ((ereport = fm_nvlist_create(NULL)) == NULL) return (B_FALSE); if ((detector = fm_nvlist_create(NULL)) == NULL) { fm_nvlist_destroy(ereport, FM_NVA_FREE); return (B_FALSE); } /* * Serialize ereport generation */ mutex_enter(&spa->spa_errlist_lock); /* * Determine the ENA to use for this event. If we are in a loading * state, use a SPA-wide ENA. Otherwise, if we are in an I/O state, use * a root zio-wide ENA. Otherwise, simply use a unique ENA. */ if (spa_load_state(spa) != SPA_LOAD_NONE) { if (spa->spa_ena == 0) spa->spa_ena = fm_ena_generate(0, FM_ENA_FMT1); ena = spa->spa_ena; } else if (zio != NULL && zio->io_logical != NULL) { if (zio->io_logical->io_ena == 0) zio->io_logical->io_ena = fm_ena_generate(0, FM_ENA_FMT1); ena = zio->io_logical->io_ena; } else { ena = fm_ena_generate(0, FM_ENA_FMT1); } /* * Construct the full class, detector, and other standard FMA fields. */ (void) snprintf(class, sizeof (class), "%s.%s", ZFS_ERROR_CLASS, subclass); fm_fmri_zfs_set(detector, FM_ZFS_SCHEME_VERSION, spa_guid(spa), vd != NULL ? vd->vdev_guid : 0); fm_ereport_set(ereport, FM_EREPORT_VERSION, class, ena, detector, NULL); /* * Construct the per-ereport payload, depending on which parameters are * passed in. */ /* * Generic payload members common to all ereports. */ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL, DATA_TYPE_STRING, spa_name(spa), FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, DATA_TYPE_UINT64, spa_guid(spa), FM_EREPORT_PAYLOAD_ZFS_POOL_STATE, DATA_TYPE_UINT64, (uint64_t)spa_state(spa), FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, DATA_TYPE_INT32, (int32_t)spa_load_state(spa), NULL); fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE, DATA_TYPE_STRING, spa_get_failmode(spa) == ZIO_FAILURE_MODE_WAIT ? FM_EREPORT_FAILMODE_WAIT : spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE ? FM_EREPORT_FAILMODE_CONTINUE : FM_EREPORT_FAILMODE_PANIC, NULL); if (vd != NULL) { vdev_t *pvd = vd->vdev_parent; vdev_queue_t *vq = &vd->vdev_queue; vdev_stat_t *vs = &vd->vdev_stat; vdev_t *spare_vd; uint64_t *spare_guids; char **spare_paths; int i, spare_count; fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, DATA_TYPE_UINT64, vd->vdev_guid, FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE, DATA_TYPE_STRING, vd->vdev_ops->vdev_op_type, NULL); if (vd->vdev_path != NULL) fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH, DATA_TYPE_STRING, vd->vdev_path, NULL); if (vd->vdev_devid != NULL) fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID, DATA_TYPE_STRING, vd->vdev_devid, NULL); if (vd->vdev_fru != NULL) fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU, DATA_TYPE_STRING, vd->vdev_fru, NULL); if (vd->vdev_enc_sysfs_path != NULL) fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_ENC_SYSFS_PATH, DATA_TYPE_STRING, vd->vdev_enc_sysfs_path, NULL); if (vd->vdev_ashift) fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_ASHIFT, DATA_TYPE_UINT64, vd->vdev_ashift, NULL); if (vq != NULL) { fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_COMP_TS, DATA_TYPE_UINT64, vq->vq_io_complete_ts, NULL); fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_DELTA_TS, DATA_TYPE_UINT64, vq->vq_io_delta_ts, NULL); } if (vs != NULL) { fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_READ_ERRORS, DATA_TYPE_UINT64, vs->vs_read_errors, FM_EREPORT_PAYLOAD_ZFS_VDEV_WRITE_ERRORS, DATA_TYPE_UINT64, vs->vs_write_errors, FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_ERRORS, DATA_TYPE_UINT64, vs->vs_checksum_errors, FM_EREPORT_PAYLOAD_ZFS_VDEV_DELAYS, DATA_TYPE_UINT64, vs->vs_slow_ios, NULL); } if (pvd != NULL) { fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID, DATA_TYPE_UINT64, pvd->vdev_guid, FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE, DATA_TYPE_STRING, pvd->vdev_ops->vdev_op_type, NULL); if (pvd->vdev_path) fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH, DATA_TYPE_STRING, pvd->vdev_path, NULL); if (pvd->vdev_devid) fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_PARENT_DEVID, DATA_TYPE_STRING, pvd->vdev_devid, NULL); } spare_count = spa->spa_spares.sav_count; spare_paths = kmem_zalloc(sizeof (char *) * spare_count, KM_SLEEP); spare_guids = kmem_zalloc(sizeof (uint64_t) * spare_count, KM_SLEEP); for (i = 0; i < spare_count; i++) { spare_vd = spa->spa_spares.sav_vdevs[i]; if (spare_vd) { spare_paths[i] = spare_vd->vdev_path; spare_guids[i] = spare_vd->vdev_guid; } } fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_SPARE_PATHS, DATA_TYPE_STRING_ARRAY, spare_count, spare_paths, FM_EREPORT_PAYLOAD_ZFS_VDEV_SPARE_GUIDS, DATA_TYPE_UINT64_ARRAY, spare_count, spare_guids, NULL); kmem_free(spare_guids, sizeof (uint64_t) * spare_count); kmem_free(spare_paths, sizeof (char *) * spare_count); } if (zio != NULL) { /* * Payload common to all I/Os. */ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR, DATA_TYPE_INT32, zio->io_error, NULL); fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS, DATA_TYPE_INT32, zio->io_flags, NULL); fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_STAGE, DATA_TYPE_UINT32, zio->io_stage, NULL); fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_PIPELINE, DATA_TYPE_UINT32, zio->io_pipeline, NULL); fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DELAY, DATA_TYPE_UINT64, zio->io_delay, NULL); fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_TIMESTAMP, DATA_TYPE_UINT64, zio->io_timestamp, NULL); fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DELTA, DATA_TYPE_UINT64, zio->io_delta, NULL); fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_PRIORITY, DATA_TYPE_UINT32, zio->io_priority, NULL); /* * If the 'size' parameter is non-zero, it indicates this is a * RAID-Z or other I/O where the physical offset and length are * provided for us, instead of within the zio_t. */ if (vd != NULL) { if (size) fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET, DATA_TYPE_UINT64, stateoroffset, FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE, DATA_TYPE_UINT64, size, NULL); else fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET, DATA_TYPE_UINT64, zio->io_offset, FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE, DATA_TYPE_UINT64, zio->io_size, NULL); } } else if (vd != NULL) { /* * If we have a vdev but no zio, this is a device fault, and the * 'stateoroffset' parameter indicates the previous state of the * vdev. */ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_PREV_STATE, DATA_TYPE_UINT64, stateoroffset, NULL); } /* * Payload for I/Os with corresponding logical information. */ if (zb != NULL && (zio == NULL || zio->io_logical != NULL)) { fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET, DATA_TYPE_UINT64, zb->zb_objset, FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT, DATA_TYPE_UINT64, zb->zb_object, FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL, DATA_TYPE_INT64, zb->zb_level, FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID, DATA_TYPE_UINT64, zb->zb_blkid, NULL); } /* * Payload for tuning the zed */ if (vd != NULL && strcmp(subclass, FM_EREPORT_ZFS_CHECKSUM) == 0) { uint64_t cksum_n, cksum_t; cksum_n = vdev_prop_get_inherited(vd, VDEV_PROP_CHECKSUM_N); if (cksum_n != vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_N)) fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_N, DATA_TYPE_UINT64, cksum_n, NULL); cksum_t = vdev_prop_get_inherited(vd, VDEV_PROP_CHECKSUM_T); if (cksum_t != vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_T)) fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_T, DATA_TYPE_UINT64, cksum_t, NULL); } if (vd != NULL && strcmp(subclass, FM_EREPORT_ZFS_IO) == 0) { uint64_t io_n, io_t; io_n = vdev_prop_get_inherited(vd, VDEV_PROP_IO_N); if (io_n != vdev_prop_default_numeric(VDEV_PROP_IO_N)) fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_IO_N, DATA_TYPE_UINT64, io_n, NULL); io_t = vdev_prop_get_inherited(vd, VDEV_PROP_IO_T); if (io_t != vdev_prop_default_numeric(VDEV_PROP_IO_T)) fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_IO_T, DATA_TYPE_UINT64, io_t, NULL); } + if (vd != NULL && strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) { + uint64_t slow_io_n, slow_io_t; + + slow_io_n = vdev_prop_get_inherited(vd, VDEV_PROP_SLOW_IO_N); + if (slow_io_n != vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_N)) + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_N, + DATA_TYPE_UINT64, + slow_io_n, + NULL); + + slow_io_t = vdev_prop_get_inherited(vd, VDEV_PROP_SLOW_IO_T); + if (slow_io_t != vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_T)) + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_T, + DATA_TYPE_UINT64, + slow_io_t, + NULL); + } + mutex_exit(&spa->spa_errlist_lock); *ereport_out = ereport; *detector_out = detector; return (B_TRUE); } /* if it's <= 128 bytes, save the corruption directly */ #define ZFM_MAX_INLINE (128 / sizeof (uint64_t)) #define MAX_RANGES 16 typedef struct zfs_ecksum_info { /* inline arrays of bits set and cleared. */ uint64_t zei_bits_set[ZFM_MAX_INLINE]; uint64_t zei_bits_cleared[ZFM_MAX_INLINE]; /* * for each range, the number of bits set and cleared. The Hamming * distance between the good and bad buffers is the sum of them all. */ uint32_t zei_range_sets[MAX_RANGES]; uint32_t zei_range_clears[MAX_RANGES]; struct zei_ranges { uint32_t zr_start; uint32_t zr_end; } zei_ranges[MAX_RANGES]; size_t zei_range_count; uint32_t zei_mingap; uint32_t zei_allowed_mingap; } zfs_ecksum_info_t; static void update_bad_bits(uint64_t value_arg, uint32_t *count) { size_t i; size_t bits = 0; uint64_t value = BE_64(value_arg); /* We store the bits in big-endian (largest-first) order */ for (i = 0; i < 64; i++) { if (value & (1ull << i)) ++bits; } /* update the count of bits changed */ *count += bits; } /* * We've now filled up the range array, and need to increase "mingap" and * shrink the range list accordingly. zei_mingap is always the smallest * distance between array entries, so we set the new_allowed_gap to be * one greater than that. We then go through the list, joining together * any ranges which are closer than the new_allowed_gap. * * By construction, there will be at least one. We also update zei_mingap * to the new smallest gap, to prepare for our next invocation. */ static void zei_shrink_ranges(zfs_ecksum_info_t *eip) { uint32_t mingap = UINT32_MAX; uint32_t new_allowed_gap = eip->zei_mingap + 1; size_t idx, output; size_t max = eip->zei_range_count; struct zei_ranges *r = eip->zei_ranges; ASSERT3U(eip->zei_range_count, >, 0); ASSERT3U(eip->zei_range_count, <=, MAX_RANGES); output = idx = 0; while (idx < max - 1) { uint32_t start = r[idx].zr_start; uint32_t end = r[idx].zr_end; while (idx < max - 1) { idx++; uint32_t nstart = r[idx].zr_start; uint32_t nend = r[idx].zr_end; uint32_t gap = nstart - end; if (gap < new_allowed_gap) { end = nend; continue; } if (gap < mingap) mingap = gap; break; } r[output].zr_start = start; r[output].zr_end = end; output++; } ASSERT3U(output, <, eip->zei_range_count); eip->zei_range_count = output; eip->zei_mingap = mingap; eip->zei_allowed_mingap = new_allowed_gap; } static void zei_add_range(zfs_ecksum_info_t *eip, int start, int end) { struct zei_ranges *r = eip->zei_ranges; size_t count = eip->zei_range_count; if (count >= MAX_RANGES) { zei_shrink_ranges(eip); count = eip->zei_range_count; } if (count == 0) { eip->zei_mingap = UINT32_MAX; eip->zei_allowed_mingap = 1; } else { int gap = start - r[count - 1].zr_end; if (gap < eip->zei_allowed_mingap) { r[count - 1].zr_end = end; return; } if (gap < eip->zei_mingap) eip->zei_mingap = gap; } r[count].zr_start = start; r[count].zr_end = end; eip->zei_range_count++; } static size_t zei_range_total_size(zfs_ecksum_info_t *eip) { struct zei_ranges *r = eip->zei_ranges; size_t count = eip->zei_range_count; size_t result = 0; size_t idx; for (idx = 0; idx < count; idx++) result += (r[idx].zr_end - r[idx].zr_start); return (result); } static zfs_ecksum_info_t * annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info, const abd_t *goodabd, const abd_t *badabd, size_t size, boolean_t drop_if_identical) { const uint64_t *good; const uint64_t *bad; size_t nui64s = size / sizeof (uint64_t); size_t inline_size; int no_inline = 0; size_t idx; size_t range; size_t offset = 0; ssize_t start = -1; zfs_ecksum_info_t *eip = kmem_zalloc(sizeof (*eip), KM_SLEEP); /* don't do any annotation for injected checksum errors */ if (info != NULL && info->zbc_injected) return (eip); if (info != NULL && info->zbc_has_cksum) { fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_CKSUM_ALGO, DATA_TYPE_STRING, info->zbc_checksum_name, NULL); if (info->zbc_byteswapped) { fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_CKSUM_BYTESWAP, DATA_TYPE_BOOLEAN, 1, NULL); } } if (badabd == NULL || goodabd == NULL) return (eip); ASSERT3U(nui64s, <=, UINT32_MAX); ASSERT3U(size, ==, nui64s * sizeof (uint64_t)); ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); ASSERT3U(size, <=, UINT32_MAX); good = (const uint64_t *) abd_borrow_buf_copy((abd_t *)goodabd, size); bad = (const uint64_t *) abd_borrow_buf_copy((abd_t *)badabd, size); /* build up the range list by comparing the two buffers. */ for (idx = 0; idx < nui64s; idx++) { if (good[idx] == bad[idx]) { if (start == -1) continue; zei_add_range(eip, start, idx); start = -1; } else { if (start != -1) continue; start = idx; } } if (start != -1) zei_add_range(eip, start, idx); /* See if it will fit in our inline buffers */ inline_size = zei_range_total_size(eip); if (inline_size > ZFM_MAX_INLINE) no_inline = 1; /* * If there is no change and we want to drop if the buffers are * identical, do so. */ if (inline_size == 0 && drop_if_identical) { kmem_free(eip, sizeof (*eip)); abd_return_buf((abd_t *)goodabd, (void *)good, size); abd_return_buf((abd_t *)badabd, (void *)bad, size); return (NULL); } /* * Now walk through the ranges, filling in the details of the * differences. Also convert our uint64_t-array offsets to byte * offsets. */ for (range = 0; range < eip->zei_range_count; range++) { size_t start = eip->zei_ranges[range].zr_start; size_t end = eip->zei_ranges[range].zr_end; for (idx = start; idx < end; idx++) { uint64_t set, cleared; // bits set in bad, but not in good set = ((~good[idx]) & bad[idx]); // bits set in good, but not in bad cleared = (good[idx] & (~bad[idx])); if (!no_inline) { ASSERT3U(offset, <, inline_size); eip->zei_bits_set[offset] = set; eip->zei_bits_cleared[offset] = cleared; offset++; } update_bad_bits(set, &eip->zei_range_sets[range]); update_bad_bits(cleared, &eip->zei_range_clears[range]); } /* convert to byte offsets */ eip->zei_ranges[range].zr_start *= sizeof (uint64_t); eip->zei_ranges[range].zr_end *= sizeof (uint64_t); } abd_return_buf((abd_t *)goodabd, (void *)good, size); abd_return_buf((abd_t *)badabd, (void *)bad, size); eip->zei_allowed_mingap *= sizeof (uint64_t); inline_size *= sizeof (uint64_t); /* fill in ereport */ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_BAD_OFFSET_RANGES, DATA_TYPE_UINT32_ARRAY, 2 * eip->zei_range_count, (uint32_t *)eip->zei_ranges, FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_MIN_GAP, DATA_TYPE_UINT32, eip->zei_allowed_mingap, FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_SETS, DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_sets, FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_CLEARS, DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_clears, NULL); if (!no_inline) { fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_BAD_SET_BITS, DATA_TYPE_UINT8_ARRAY, inline_size, (uint8_t *)eip->zei_bits_set, FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_BITS, DATA_TYPE_UINT8_ARRAY, inline_size, (uint8_t *)eip->zei_bits_cleared, NULL); } return (eip); } #else void zfs_ereport_clear(spa_t *spa, vdev_t *vd) { (void) spa, (void) vd; } #endif /* * Make sure our event is still valid for the given zio/vdev/pool. For example, * we don't want to keep logging events for a faulted or missing vdev. */ boolean_t zfs_ereport_is_valid(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio) { #ifdef _KERNEL /* * If we are doing a spa_tryimport() or in recovery mode, * ignore errors. */ if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT || spa_load_state(spa) == SPA_LOAD_RECOVER) return (B_FALSE); /* * If we are in the middle of opening a pool, and the previous attempt * failed, don't bother logging any new ereports - we're just going to * get the same diagnosis anyway. */ if (spa_load_state(spa) != SPA_LOAD_NONE && spa->spa_last_open_failed) return (B_FALSE); if (zio != NULL) { /* * If this is not a read or write zio, ignore the error. This * can occur if the DKIOCFLUSHWRITECACHE ioctl fails. */ if (zio->io_type != ZIO_TYPE_READ && zio->io_type != ZIO_TYPE_WRITE) return (B_FALSE); if (vd != NULL) { /* * If the vdev has already been marked as failing due * to a failed probe, then ignore any subsequent I/O * errors, as the DE will automatically fault the vdev * on the first such failure. This also catches cases * where vdev_remove_wanted is set and the device has * not yet been asynchronously placed into the REMOVED * state. */ if (zio->io_vd == vd && !vdev_accessible(vd, zio)) return (B_FALSE); /* * Ignore checksum errors for reads from DTL regions of * leaf vdevs. */ if (zio->io_type == ZIO_TYPE_READ && zio->io_error == ECKSUM && vd->vdev_ops->vdev_op_leaf && vdev_dtl_contains(vd, DTL_MISSING, zio->io_txg, 1)) return (B_FALSE); } } /* * For probe failure, we want to avoid posting ereports if we've * already removed the device in the meantime. */ if (vd != NULL && strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) == 0 && (vd->vdev_remove_wanted || vd->vdev_state == VDEV_STATE_REMOVED)) return (B_FALSE); /* Ignore bogus delay events (like from ioctls or unqueued IOs) */ if ((strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) && (zio != NULL) && (!zio->io_timestamp)) { return (B_FALSE); } #else (void) subclass, (void) spa, (void) vd, (void) zio; #endif return (B_TRUE); } /* * Post an ereport for the given subclass * * Returns * - 0 if an event was posted * - EINVAL if there was a problem posting event * - EBUSY if the event was rate limited * - EALREADY if the event was already posted (duplicate) */ int zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, zio_t *zio, uint64_t state) { int rc = 0; #ifdef _KERNEL nvlist_t *ereport = NULL; nvlist_t *detector = NULL; if (!zfs_ereport_is_valid(subclass, spa, vd, zio)) return (EINVAL); if (zfs_ereport_is_duplicate(subclass, spa, vd, zb, zio, 0, 0)) return (SET_ERROR(EALREADY)); if (zfs_is_ratelimiting_event(subclass, vd)) return (SET_ERROR(EBUSY)); if (!zfs_ereport_start(&ereport, &detector, subclass, spa, vd, zb, zio, state, 0)) return (SET_ERROR(EINVAL)); /* couldn't post event */ if (ereport == NULL) return (SET_ERROR(EINVAL)); /* Cleanup is handled by the callback function */ rc = zfs_zevent_post(ereport, detector, zfs_zevent_post_cb); #else (void) subclass, (void) spa, (void) vd, (void) zb, (void) zio, (void) state; #endif return (rc); } /* * Prepare a checksum ereport * * Returns * - 0 if an event was posted * - EINVAL if there was a problem posting event * - EBUSY if the event was rate limited * - EALREADY if the event was already posted (duplicate) */ int zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, struct zio *zio, uint64_t offset, uint64_t length, zio_bad_cksum_t *info) { zio_cksum_report_t *report; #ifdef _KERNEL if (!zfs_ereport_is_valid(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio)) return (SET_ERROR(EINVAL)); if (zfs_ereport_is_duplicate(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio, offset, length)) return (SET_ERROR(EALREADY)); if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd)) return (SET_ERROR(EBUSY)); #else (void) zb, (void) offset; #endif report = kmem_zalloc(sizeof (*report), KM_SLEEP); zio_vsd_default_cksum_report(zio, report); /* copy the checksum failure information if it was provided */ if (info != NULL) { report->zcr_ckinfo = kmem_zalloc(sizeof (*info), KM_SLEEP); memcpy(report->zcr_ckinfo, info, sizeof (*info)); } report->zcr_sector = 1ULL << vd->vdev_top->vdev_ashift; report->zcr_align = vdev_psize_to_asize(vd->vdev_top, report->zcr_sector); report->zcr_length = length; #ifdef _KERNEL (void) zfs_ereport_start(&report->zcr_ereport, &report->zcr_detector, FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio, offset, length); if (report->zcr_ereport == NULL) { zfs_ereport_free_checksum(report); return (0); } #endif mutex_enter(&spa->spa_errlist_lock); report->zcr_next = zio->io_logical->io_cksum_report; zio->io_logical->io_cksum_report = report; mutex_exit(&spa->spa_errlist_lock); return (0); } void zfs_ereport_finish_checksum(zio_cksum_report_t *report, const abd_t *good_data, const abd_t *bad_data, boolean_t drop_if_identical) { #ifdef _KERNEL zfs_ecksum_info_t *info; info = annotate_ecksum(report->zcr_ereport, report->zcr_ckinfo, good_data, bad_data, report->zcr_length, drop_if_identical); if (info != NULL) zfs_zevent_post(report->zcr_ereport, report->zcr_detector, zfs_zevent_post_cb); else zfs_zevent_post_cb(report->zcr_ereport, report->zcr_detector); report->zcr_ereport = report->zcr_detector = NULL; if (info != NULL) kmem_free(info, sizeof (*info)); #else (void) report, (void) good_data, (void) bad_data, (void) drop_if_identical; #endif } void zfs_ereport_free_checksum(zio_cksum_report_t *rpt) { #ifdef _KERNEL if (rpt->zcr_ereport != NULL) { fm_nvlist_destroy(rpt->zcr_ereport, FM_NVA_FREE); fm_nvlist_destroy(rpt->zcr_detector, FM_NVA_FREE); } #endif rpt->zcr_free(rpt->zcr_cbdata, rpt->zcr_cbinfo); if (rpt->zcr_ckinfo != NULL) kmem_free(rpt->zcr_ckinfo, sizeof (*rpt->zcr_ckinfo)); kmem_free(rpt, sizeof (*rpt)); } /* * Post a checksum ereport * * Returns * - 0 if an event was posted * - EINVAL if there was a problem posting event * - EBUSY if the event was rate limited * - EALREADY if the event was already posted (duplicate) */ int zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, struct zio *zio, uint64_t offset, uint64_t length, const abd_t *good_data, const abd_t *bad_data, zio_bad_cksum_t *zbc) { int rc = 0; #ifdef _KERNEL nvlist_t *ereport = NULL; nvlist_t *detector = NULL; zfs_ecksum_info_t *info; if (!zfs_ereport_is_valid(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio)) return (SET_ERROR(EINVAL)); if (zfs_ereport_is_duplicate(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio, offset, length)) return (SET_ERROR(EALREADY)); if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd)) return (SET_ERROR(EBUSY)); if (!zfs_ereport_start(&ereport, &detector, FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio, offset, length) || (ereport == NULL)) { return (SET_ERROR(EINVAL)); } info = annotate_ecksum(ereport, zbc, good_data, bad_data, length, B_FALSE); if (info != NULL) { rc = zfs_zevent_post(ereport, detector, zfs_zevent_post_cb); kmem_free(info, sizeof (*info)); } #else (void) spa, (void) vd, (void) zb, (void) zio, (void) offset, (void) length, (void) good_data, (void) bad_data, (void) zbc; #endif return (rc); } /* * The 'sysevent.fs.zfs.*' events are signals posted to notify user space of * change in the pool. All sysevents are listed in sys/sysevent/eventdefs.h * and are designed to be consumed by the ZFS Event Daemon (ZED). For * additional details refer to the zed(8) man page. */ nvlist_t * zfs_event_create(spa_t *spa, vdev_t *vd, const char *type, const char *name, nvlist_t *aux) { nvlist_t *resource = NULL; #ifdef _KERNEL char class[64]; if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT) return (NULL); if ((resource = fm_nvlist_create(NULL)) == NULL) return (NULL); (void) snprintf(class, sizeof (class), "%s.%s.%s", type, ZFS_ERROR_CLASS, name); VERIFY0(nvlist_add_uint8(resource, FM_VERSION, FM_RSRC_VERSION)); VERIFY0(nvlist_add_string(resource, FM_CLASS, class)); VERIFY0(nvlist_add_string(resource, FM_EREPORT_PAYLOAD_ZFS_POOL, spa_name(spa))); VERIFY0(nvlist_add_uint64(resource, FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, spa_guid(spa))); VERIFY0(nvlist_add_uint64(resource, FM_EREPORT_PAYLOAD_ZFS_POOL_STATE, spa_state(spa))); VERIFY0(nvlist_add_int32(resource, FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, spa_load_state(spa))); if (vd) { VERIFY0(nvlist_add_uint64(resource, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, vd->vdev_guid)); VERIFY0(nvlist_add_uint64(resource, FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE, vd->vdev_state)); if (vd->vdev_path != NULL) VERIFY0(nvlist_add_string(resource, FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH, vd->vdev_path)); if (vd->vdev_devid != NULL) VERIFY0(nvlist_add_string(resource, FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID, vd->vdev_devid)); if (vd->vdev_fru != NULL) VERIFY0(nvlist_add_string(resource, FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU, vd->vdev_fru)); if (vd->vdev_enc_sysfs_path != NULL) VERIFY0(nvlist_add_string(resource, FM_EREPORT_PAYLOAD_ZFS_VDEV_ENC_SYSFS_PATH, vd->vdev_enc_sysfs_path)); } /* also copy any optional payload data */ if (aux) { nvpair_t *elem = NULL; while ((elem = nvlist_next_nvpair(aux, elem)) != NULL) (void) nvlist_add_nvpair(resource, elem); } #else (void) spa, (void) vd, (void) type, (void) name, (void) aux; #endif return (resource); } static void zfs_post_common(spa_t *spa, vdev_t *vd, const char *type, const char *name, nvlist_t *aux) { #ifdef _KERNEL nvlist_t *resource; resource = zfs_event_create(spa, vd, type, name, aux); if (resource) zfs_zevent_post(resource, NULL, zfs_zevent_post_cb); #else (void) spa, (void) vd, (void) type, (void) name, (void) aux; #endif } /* * The 'resource.fs.zfs.removed' event is an internal signal that the given vdev * has been removed from the system. This will cause the DE to ignore any * recent I/O errors, inferring that they are due to the asynchronous device * removal. */ void zfs_post_remove(spa_t *spa, vdev_t *vd) { zfs_post_common(spa, vd, FM_RSRC_CLASS, FM_RESOURCE_REMOVED, NULL); } /* * The 'resource.fs.zfs.autoreplace' event is an internal signal that the pool * has the 'autoreplace' property set, and therefore any broken vdevs will be * handled by higher level logic, and no vdev fault should be generated. */ void zfs_post_autoreplace(spa_t *spa, vdev_t *vd) { zfs_post_common(spa, vd, FM_RSRC_CLASS, FM_RESOURCE_AUTOREPLACE, NULL); } /* * The 'resource.fs.zfs.statechange' event is an internal signal that the * given vdev has transitioned its state to DEGRADED or HEALTHY. This will * cause the retire agent to repair any outstanding fault management cases * open because the device was not found (fault.fs.zfs.device). */ void zfs_post_state_change(spa_t *spa, vdev_t *vd, uint64_t laststate) { #ifdef _KERNEL nvlist_t *aux; /* * Add optional supplemental keys to payload */ aux = fm_nvlist_create(NULL); if (vd && aux) { if (vd->vdev_physpath) { fnvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_VDEV_PHYSPATH, vd->vdev_physpath); } if (vd->vdev_enc_sysfs_path) { fnvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_VDEV_ENC_SYSFS_PATH, vd->vdev_enc_sysfs_path); } fnvlist_add_uint64(aux, FM_EREPORT_PAYLOAD_ZFS_VDEV_LASTSTATE, laststate); } zfs_post_common(spa, vd, FM_RSRC_CLASS, FM_RESOURCE_STATECHANGE, aux); if (aux) fm_nvlist_destroy(aux, FM_NVA_FREE); #else (void) spa, (void) vd, (void) laststate; #endif } #ifdef _KERNEL void zfs_ereport_init(void) { mutex_init(&recent_events_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&recent_events_list, sizeof (recent_events_node_t), offsetof(recent_events_node_t, re_list_link)); avl_create(&recent_events_tree, recent_events_compare, sizeof (recent_events_node_t), offsetof(recent_events_node_t, re_tree_link)); } /* * This 'early' fini needs to run before zfs_fini() which on Linux waits * for the system_delay_taskq to drain. */ void zfs_ereport_taskq_fini(void) { mutex_enter(&recent_events_lock); if (recent_events_cleaner_tqid != 0) { taskq_cancel_id(system_delay_taskq, recent_events_cleaner_tqid); recent_events_cleaner_tqid = 0; } mutex_exit(&recent_events_lock); } void zfs_ereport_fini(void) { recent_events_node_t *entry; while ((entry = list_remove_head(&recent_events_list)) != NULL) { avl_remove(&recent_events_tree, entry); kmem_free(entry, sizeof (*entry)); } avl_destroy(&recent_events_tree); list_destroy(&recent_events_list); mutex_destroy(&recent_events_lock); } void zfs_ereport_snapshot_post(const char *subclass, spa_t *spa, const char *name) { nvlist_t *aux; aux = fm_nvlist_create(NULL); fnvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_SNAPSHOT_NAME, name); zfs_post_common(spa, NULL, FM_RSRC_CLASS, subclass, aux); fm_nvlist_destroy(aux, FM_NVA_FREE); } /* * Post when a event when a zvol is created or removed * * This is currently only used by macOS, since it uses the event to create * symlinks between the volume name (mypool/myvol) and the actual /dev * device (/dev/disk3). For example: * * /var/run/zfs/dsk/mypool/myvol -> /dev/disk3 * * name: The full name of the zvol ("mypool/myvol") * dev_name: The full /dev name for the zvol ("/dev/disk3") * raw_name: The raw /dev name for the zvol ("/dev/rdisk3") */ void zfs_ereport_zvol_post(const char *subclass, const char *name, const char *dev_name, const char *raw_name) { nvlist_t *aux; char *r; boolean_t locked = mutex_owned(&spa_namespace_lock); if (!locked) mutex_enter(&spa_namespace_lock); spa_t *spa = spa_lookup(name); if (!locked) mutex_exit(&spa_namespace_lock); if (spa == NULL) return; aux = fm_nvlist_create(NULL); fnvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_DEVICE_NAME, dev_name); fnvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_RAW_DEVICE_NAME, raw_name); r = strchr(name, '/'); if (r && r[1]) fnvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_VOLUME, &r[1]); zfs_post_common(spa, NULL, FM_RSRC_CLASS, subclass, aux); fm_nvlist_destroy(aux, FM_NVA_FREE); } EXPORT_SYMBOL(zfs_ereport_post); EXPORT_SYMBOL(zfs_ereport_is_valid); EXPORT_SYMBOL(zfs_ereport_post_checksum); EXPORT_SYMBOL(zfs_post_remove); EXPORT_SYMBOL(zfs_post_autoreplace); EXPORT_SYMBOL(zfs_post_state_change); ZFS_MODULE_PARAM(zfs_zevent, zfs_zevent_, retain_max, UINT, ZMOD_RW, "Maximum recent zevents records to retain for duplicate checking"); ZFS_MODULE_PARAM(zfs_zevent, zfs_zevent_, retain_expire_secs, UINT, ZMOD_RW, "Expiration time for recent zevents records"); #endif /* _KERNEL */ diff --git a/module/zfs/zio_inject.c b/module/zfs/zio_inject.c index 3598351c499d..609182f4a2cd 100644 --- a/module/zfs/zio_inject.c +++ b/module/zfs/zio_inject.c @@ -1,972 +1,976 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2015 by Delphix. All rights reserved. * Copyright (c) 2017, Intel Corporation. */ /* * ZFS fault injection * * To handle fault injection, we keep track of a series of zinject_record_t * structures which describe which logical block(s) should be injected with a * fault. These are kept in a global list. Each record corresponds to a given * spa_t and maintains a special hold on the spa_t so that it cannot be deleted * or exported while the injection record exists. * * Device level injection is done using the 'zi_guid' field. If this is set, it * means that the error is destined for a particular device, not a piece of * data. * * This is a rather poor data structure and algorithm, but we don't expect more * than a few faults at any one time, so it should be sufficient for our needs. */ #include #include #include #include #include #include #include uint32_t zio_injection_enabled = 0; /* * Data describing each zinject handler registered on the system, and * contains the list node linking the handler in the global zinject * handler list. */ typedef struct inject_handler { int zi_id; spa_t *zi_spa; zinject_record_t zi_record; uint64_t *zi_lanes; int zi_next_lane; list_node_t zi_link; } inject_handler_t; /* * List of all zinject handlers registered on the system, protected by * the inject_lock defined below. */ static list_t inject_handlers; /* * This protects insertion into, and traversal of, the inject handler * list defined above; as well as the inject_delay_count. Any time a * handler is inserted or removed from the list, this lock should be * taken as a RW_WRITER; and any time traversal is done over the list * (without modification to it) this lock should be taken as a RW_READER. */ static krwlock_t inject_lock; /* * This holds the number of zinject delay handlers that have been * registered on the system. It is protected by the inject_lock defined * above. Thus modifications to this count must be a RW_WRITER of the * inject_lock, and reads of this count must be (at least) a RW_READER * of the lock. */ static int inject_delay_count = 0; /* * This lock is used only in zio_handle_io_delay(), refer to the comment * in that function for more details. */ static kmutex_t inject_delay_mtx; /* * Used to assign unique identifying numbers to each new zinject handler. */ static int inject_next_id = 1; /* * Test if the requested frequency was triggered */ static boolean_t freq_triggered(uint32_t frequency) { /* * zero implies always (100%) */ if (frequency == 0) return (B_TRUE); /* * Note: we still handle legacy (unscaled) frequency values */ uint32_t maximum = (frequency <= 100) ? 100 : ZI_PERCENTAGE_MAX; return (random_in_range(maximum) < frequency); } /* * Returns true if the given record matches the I/O in progress. */ static boolean_t zio_match_handler(const zbookmark_phys_t *zb, uint64_t type, int dva, zinject_record_t *record, int error) { /* * Check for a match against the MOS, which is based on type */ if (zb->zb_objset == DMU_META_OBJSET && record->zi_objset == DMU_META_OBJSET && record->zi_object == DMU_META_DNODE_OBJECT) { if (record->zi_type == DMU_OT_NONE || type == record->zi_type) return (freq_triggered(record->zi_freq)); else return (B_FALSE); } /* * Check for an exact match. */ if (zb->zb_objset == record->zi_objset && zb->zb_object == record->zi_object && zb->zb_level == record->zi_level && zb->zb_blkid >= record->zi_start && zb->zb_blkid <= record->zi_end && (record->zi_dvas == 0 || (dva != ZI_NO_DVA && (record->zi_dvas & (1ULL << dva)))) && error == record->zi_error) { return (freq_triggered(record->zi_freq)); } return (B_FALSE); } /* * Panic the system when a config change happens in the function * specified by tag. */ void zio_handle_panic_injection(spa_t *spa, const char *tag, uint64_t type) { inject_handler_t *handler; rw_enter(&inject_lock, RW_READER); for (handler = list_head(&inject_handlers); handler != NULL; handler = list_next(&inject_handlers, handler)) { if (spa != handler->zi_spa) continue; if (handler->zi_record.zi_type == type && strcmp(tag, handler->zi_record.zi_func) == 0) panic("Panic requested in function %s\n", tag); } rw_exit(&inject_lock); } /* * Inject a decryption failure. Decryption failures can occur in * both the ARC and the ZIO layers. */ int zio_handle_decrypt_injection(spa_t *spa, const zbookmark_phys_t *zb, uint64_t type, int error) { int ret = 0; inject_handler_t *handler; rw_enter(&inject_lock, RW_READER); for (handler = list_head(&inject_handlers); handler != NULL; handler = list_next(&inject_handlers, handler)) { if (spa != handler->zi_spa || handler->zi_record.zi_cmd != ZINJECT_DECRYPT_FAULT) continue; if (zio_match_handler(zb, type, ZI_NO_DVA, &handler->zi_record, error)) { ret = error; break; } } rw_exit(&inject_lock); return (ret); } /* * If this is a physical I/O for a vdev child determine which DVA it is * for. We iterate backwards through the DVAs matching on the offset so * that we end up with ZI_NO_DVA (-1) if we don't find a match. */ static int zio_match_dva(zio_t *zio) { int i = ZI_NO_DVA; if (zio->io_bp != NULL && zio->io_vd != NULL && zio->io_child_type == ZIO_CHILD_VDEV) { for (i = BP_GET_NDVAS(zio->io_bp) - 1; i >= 0; i--) { dva_t *dva = &zio->io_bp->blk_dva[i]; uint64_t off = DVA_GET_OFFSET(dva); vdev_t *vd = vdev_lookup_top(zio->io_spa, DVA_GET_VDEV(dva)); /* Compensate for vdev label added to leaves */ if (zio->io_vd->vdev_ops->vdev_op_leaf) off += VDEV_LABEL_START_SIZE; if (zio->io_vd == vd && zio->io_offset == off) break; } } return (i); } /* * Determine if the I/O in question should return failure. Returns the errno * to be returned to the caller. */ int zio_handle_fault_injection(zio_t *zio, int error) { int ret = 0; inject_handler_t *handler; /* * Ignore I/O not associated with any logical data. */ if (zio->io_logical == NULL) return (0); /* * Currently, we only support fault injection on reads. */ if (zio->io_type != ZIO_TYPE_READ) return (0); /* * A rebuild I/O has no checksum to verify. */ if (zio->io_priority == ZIO_PRIORITY_REBUILD && error == ECKSUM) return (0); rw_enter(&inject_lock, RW_READER); for (handler = list_head(&inject_handlers); handler != NULL; handler = list_next(&inject_handlers, handler)) { if (zio->io_spa != handler->zi_spa || handler->zi_record.zi_cmd != ZINJECT_DATA_FAULT) continue; /* If this handler matches, return the specified error */ if (zio_match_handler(&zio->io_logical->io_bookmark, zio->io_bp ? BP_GET_TYPE(zio->io_bp) : DMU_OT_NONE, zio_match_dva(zio), &handler->zi_record, error)) { ret = error; break; } } rw_exit(&inject_lock); return (ret); } /* * Determine if the zio is part of a label update and has an injection * handler associated with that portion of the label. Currently, we * allow error injection in either the nvlist or the uberblock region of * of the vdev label. */ int zio_handle_label_injection(zio_t *zio, int error) { inject_handler_t *handler; vdev_t *vd = zio->io_vd; uint64_t offset = zio->io_offset; int label; int ret = 0; if (offset >= VDEV_LABEL_START_SIZE && offset < vd->vdev_psize - VDEV_LABEL_END_SIZE) return (0); rw_enter(&inject_lock, RW_READER); for (handler = list_head(&inject_handlers); handler != NULL; handler = list_next(&inject_handlers, handler)) { uint64_t start = handler->zi_record.zi_start; uint64_t end = handler->zi_record.zi_end; if (handler->zi_record.zi_cmd != ZINJECT_LABEL_FAULT) continue; /* * The injection region is the relative offsets within a * vdev label. We must determine the label which is being * updated and adjust our region accordingly. */ label = vdev_label_number(vd->vdev_psize, offset); start = vdev_label_offset(vd->vdev_psize, label, start); end = vdev_label_offset(vd->vdev_psize, label, end); if (zio->io_vd->vdev_guid == handler->zi_record.zi_guid && (offset >= start && offset <= end)) { ret = error; break; } } rw_exit(&inject_lock); return (ret); } static int zio_inject_bitflip_cb(void *data, size_t len, void *private) { zio_t *zio = private; uint8_t *buffer = data; uint_t byte = random_in_range(len); ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); /* flip a single random bit in an abd data buffer */ buffer[byte] ^= 1 << random_in_range(8); return (1); /* stop after first flip */ } static int zio_handle_device_injection_impl(vdev_t *vd, zio_t *zio, int err1, int err2) { inject_handler_t *handler; int ret = 0; /* * We skip over faults in the labels unless it's during * device open (i.e. zio == NULL). */ if (zio != NULL) { uint64_t offset = zio->io_offset; if (offset < VDEV_LABEL_START_SIZE || offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE) return (0); } rw_enter(&inject_lock, RW_READER); for (handler = list_head(&inject_handlers); handler != NULL; handler = list_next(&inject_handlers, handler)) { if (handler->zi_record.zi_cmd != ZINJECT_DEVICE_FAULT) continue; if (vd->vdev_guid == handler->zi_record.zi_guid) { if (handler->zi_record.zi_failfast && (zio == NULL || (zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))) { continue; } /* Handle type specific I/O failures */ if (zio != NULL && handler->zi_record.zi_iotype != ZIO_TYPES && handler->zi_record.zi_iotype != zio->io_type) continue; if (handler->zi_record.zi_error == err1 || handler->zi_record.zi_error == err2) { /* * limit error injection if requested */ if (!freq_triggered(handler->zi_record.zi_freq)) continue; /* * For a failed open, pretend like the device * has gone away. */ if (err1 == ENXIO) vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; /* * Treat these errors as if they had been * retried so that all the appropriate stats * and FMA events are generated. */ if (!handler->zi_record.zi_failfast && zio != NULL) zio->io_flags |= ZIO_FLAG_IO_RETRY; /* * EILSEQ means flip a bit after a read */ if (handler->zi_record.zi_error == EILSEQ) { if (zio == NULL) break; /* locate buffer data and flip a bit */ (void) abd_iterate_func(zio->io_abd, 0, zio->io_size, zio_inject_bitflip_cb, zio); break; } ret = handler->zi_record.zi_error; break; } if (handler->zi_record.zi_error == ENXIO) { ret = SET_ERROR(EIO); break; } } } rw_exit(&inject_lock); return (ret); } int zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error) { return (zio_handle_device_injection_impl(vd, zio, error, INT_MAX)); } int zio_handle_device_injections(vdev_t *vd, zio_t *zio, int err1, int err2) { return (zio_handle_device_injection_impl(vd, zio, err1, err2)); } /* * Simulate hardware that ignores cache flushes. For requested number * of seconds nix the actual writing to disk. */ void zio_handle_ignored_writes(zio_t *zio) { inject_handler_t *handler; rw_enter(&inject_lock, RW_READER); for (handler = list_head(&inject_handlers); handler != NULL; handler = list_next(&inject_handlers, handler)) { /* Ignore errors not destined for this pool */ if (zio->io_spa != handler->zi_spa || handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES) continue; /* * Positive duration implies # of seconds, negative * a number of txgs */ if (handler->zi_record.zi_timer == 0) { if (handler->zi_record.zi_duration > 0) handler->zi_record.zi_timer = ddi_get_lbolt64(); else handler->zi_record.zi_timer = zio->io_txg; } /* Have a "problem" writing 60% of the time */ if (random_in_range(100) < 60) zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; break; } rw_exit(&inject_lock); } void spa_handle_ignored_writes(spa_t *spa) { inject_handler_t *handler; if (zio_injection_enabled == 0) return; rw_enter(&inject_lock, RW_READER); for (handler = list_head(&inject_handlers); handler != NULL; handler = list_next(&inject_handlers, handler)) { if (spa != handler->zi_spa || handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES) continue; if (handler->zi_record.zi_duration > 0) { VERIFY(handler->zi_record.zi_timer == 0 || ddi_time_after64( (int64_t)handler->zi_record.zi_timer + handler->zi_record.zi_duration * hz, ddi_get_lbolt64())); } else { /* duration is negative so the subtraction here adds */ VERIFY(handler->zi_record.zi_timer == 0 || handler->zi_record.zi_timer - handler->zi_record.zi_duration >= spa_syncing_txg(spa)); } } rw_exit(&inject_lock); } hrtime_t zio_handle_io_delay(zio_t *zio) { vdev_t *vd = zio->io_vd; inject_handler_t *min_handler = NULL; hrtime_t min_target = 0; rw_enter(&inject_lock, RW_READER); /* * inject_delay_count is a subset of zio_injection_enabled that * is only incremented for delay handlers. These checks are * mainly added to remind the reader why we're not explicitly * checking zio_injection_enabled like the other functions. */ IMPLY(inject_delay_count > 0, zio_injection_enabled > 0); IMPLY(zio_injection_enabled == 0, inject_delay_count == 0); /* * If there aren't any inject delay handlers registered, then we * can short circuit and simply return 0 here. A value of zero * informs zio_delay_interrupt() that this request should not be * delayed. This short circuit keeps us from acquiring the * inject_delay_mutex unnecessarily. */ if (inject_delay_count == 0) { rw_exit(&inject_lock); return (0); } /* * Each inject handler has a number of "lanes" associated with * it. Each lane is able to handle requests independently of one * another, and at a latency defined by the inject handler * record's zi_timer field. Thus if a handler in configured with * a single lane with a 10ms latency, it will delay requests * such that only a single request is completed every 10ms. So, * if more than one request is attempted per each 10ms interval, * the average latency of the requests will be greater than * 10ms; but if only a single request is submitted each 10ms * interval the average latency will be 10ms. * * We need to acquire this mutex to prevent multiple concurrent * threads being assigned to the same lane of a given inject * handler. The mutex allows us to perform the following two * operations atomically: * * 1. determine the minimum handler and minimum target * value of all the possible handlers * 2. update that minimum handler's lane array * * Without atomicity, two (or more) threads could pick the same * lane in step (1), and then conflict with each other in step * (2). This could allow a single lane handler to process * multiple requests simultaneously, which shouldn't be possible. */ mutex_enter(&inject_delay_mtx); for (inject_handler_t *handler = list_head(&inject_handlers); handler != NULL; handler = list_next(&inject_handlers, handler)) { if (handler->zi_record.zi_cmd != ZINJECT_DELAY_IO) continue; if (!freq_triggered(handler->zi_record.zi_freq)) continue; if (vd->vdev_guid != handler->zi_record.zi_guid) continue; + if (handler->zi_record.zi_iotype != ZIO_TYPES && + handler->zi_record.zi_iotype != zio->io_type) + continue; + /* * Defensive; should never happen as the array allocation * occurs prior to inserting this handler on the list. */ ASSERT3P(handler->zi_lanes, !=, NULL); /* * This should never happen, the zinject command should * prevent a user from setting an IO delay with zero lanes. */ ASSERT3U(handler->zi_record.zi_nlanes, !=, 0); ASSERT3U(handler->zi_record.zi_nlanes, >, handler->zi_next_lane); /* * We want to issue this IO to the lane that will become * idle the soonest, so we compare the soonest this * specific handler can complete the IO with all other * handlers, to find the lowest value of all possible * lanes. We then use this lane to submit the request. * * Since each handler has a constant value for its * delay, we can just use the "next" lane for that * handler; as it will always be the lane with the * lowest value for that particular handler (i.e. the * lane that will become idle the soonest). This saves a * scan of each handler's lanes array. * * There's two cases to consider when determining when * this specific IO request should complete. If this * lane is idle, we want to "submit" the request now so * it will complete after zi_timer milliseconds. Thus, * we set the target to now + zi_timer. * * If the lane is busy, we want this request to complete * zi_timer milliseconds after the lane becomes idle. * Since the 'zi_lanes' array holds the time at which * each lane will become idle, we use that value to * determine when this request should complete. */ hrtime_t idle = handler->zi_record.zi_timer + gethrtime(); hrtime_t busy = handler->zi_record.zi_timer + handler->zi_lanes[handler->zi_next_lane]; hrtime_t target = MAX(idle, busy); if (min_handler == NULL) { min_handler = handler; min_target = target; continue; } ASSERT3P(min_handler, !=, NULL); ASSERT3U(min_target, !=, 0); /* * We don't yet increment the "next lane" variable since * we still might find a lower value lane in another * handler during any remaining iterations. Once we're * sure we've selected the absolute minimum, we'll claim * the lane and increment the handler's "next lane" * field below. */ if (target < min_target) { min_handler = handler; min_target = target; } } /* * 'min_handler' will be NULL if no IO delays are registered for * this vdev, otherwise it will point to the handler containing * the lane that will become idle the soonest. */ if (min_handler != NULL) { ASSERT3U(min_target, !=, 0); min_handler->zi_lanes[min_handler->zi_next_lane] = min_target; /* * If we've used all possible lanes for this handler, * loop back and start using the first lane again; * otherwise, just increment the lane index. */ min_handler->zi_next_lane = (min_handler->zi_next_lane + 1) % min_handler->zi_record.zi_nlanes; } mutex_exit(&inject_delay_mtx); rw_exit(&inject_lock); return (min_target); } static int zio_calculate_range(const char *pool, zinject_record_t *record) { dsl_pool_t *dp; dsl_dataset_t *ds; objset_t *os = NULL; dnode_t *dn = NULL; int error; /* * Obtain the dnode for object using pool, objset, and object */ error = dsl_pool_hold(pool, FTAG, &dp); if (error) return (error); error = dsl_dataset_hold_obj(dp, record->zi_objset, FTAG, &ds); dsl_pool_rele(dp, FTAG); if (error) return (error); error = dmu_objset_from_ds(ds, &os); dsl_dataset_rele(ds, FTAG); if (error) return (error); error = dnode_hold(os, record->zi_object, FTAG, &dn); if (error) return (error); /* * Translate the range into block IDs */ if (record->zi_start != 0 || record->zi_end != -1ULL) { record->zi_start >>= dn->dn_datablkshift; record->zi_end >>= dn->dn_datablkshift; } if (record->zi_level > 0) { if (record->zi_level >= dn->dn_nlevels) { dnode_rele(dn, FTAG); return (SET_ERROR(EDOM)); } if (record->zi_start != 0 || record->zi_end != 0) { int shift = dn->dn_indblkshift - SPA_BLKPTRSHIFT; for (int level = record->zi_level; level > 0; level--) { record->zi_start >>= shift; record->zi_end >>= shift; } } } dnode_rele(dn, FTAG); return (0); } /* * Create a new handler for the given record. We add it to the list, adding * a reference to the spa_t in the process. We increment zio_injection_enabled, * which is the switch to trigger all fault injection. */ int zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record) { inject_handler_t *handler; int error; spa_t *spa; /* * If this is pool-wide metadata, make sure we unload the corresponding * spa_t, so that the next attempt to load it will trigger the fault. * We call spa_reset() to unload the pool appropriately. */ if (flags & ZINJECT_UNLOAD_SPA) if ((error = spa_reset(name)) != 0) return (error); if (record->zi_cmd == ZINJECT_DELAY_IO) { /* * A value of zero for the number of lanes or for the * delay time doesn't make sense. */ if (record->zi_timer == 0 || record->zi_nlanes == 0) return (SET_ERROR(EINVAL)); /* * The number of lanes is directly mapped to the size of * an array used by the handler. Thus, to ensure the * user doesn't trigger an allocation that's "too large" * we cap the number of lanes here. */ if (record->zi_nlanes >= UINT16_MAX) return (SET_ERROR(EINVAL)); } /* * If the supplied range was in bytes -- calculate the actual blkid */ if (flags & ZINJECT_CALC_RANGE) { error = zio_calculate_range(name, record); if (error != 0) return (error); } if (!(flags & ZINJECT_NULL)) { /* * spa_inject_ref() will add an injection reference, which will * prevent the pool from being removed from the namespace while * still allowing it to be unloaded. */ if ((spa = spa_inject_addref(name)) == NULL) return (SET_ERROR(ENOENT)); handler = kmem_alloc(sizeof (inject_handler_t), KM_SLEEP); handler->zi_spa = spa; handler->zi_record = *record; if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) { handler->zi_lanes = kmem_zalloc( sizeof (*handler->zi_lanes) * handler->zi_record.zi_nlanes, KM_SLEEP); handler->zi_next_lane = 0; } else { handler->zi_lanes = NULL; handler->zi_next_lane = 0; } rw_enter(&inject_lock, RW_WRITER); /* * We can't move this increment into the conditional * above because we need to hold the RW_WRITER lock of * inject_lock, and we don't want to hold that while * allocating the handler's zi_lanes array. */ if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) { ASSERT3S(inject_delay_count, >=, 0); inject_delay_count++; ASSERT3S(inject_delay_count, >, 0); } *id = handler->zi_id = inject_next_id++; list_insert_tail(&inject_handlers, handler); atomic_inc_32(&zio_injection_enabled); rw_exit(&inject_lock); } /* * Flush the ARC, so that any attempts to read this data will end up * going to the ZIO layer. Note that this is a little overkill, but * we don't have the necessary ARC interfaces to do anything else, and * fault injection isn't a performance critical path. */ if (flags & ZINJECT_FLUSH_ARC) /* * We must use FALSE to ensure arc_flush returns, since * we're not preventing concurrent ARC insertions. */ arc_flush(NULL, FALSE); return (0); } /* * Returns the next record with an ID greater than that supplied to the * function. Used to iterate over all handlers in the system. */ int zio_inject_list_next(int *id, char *name, size_t buflen, zinject_record_t *record) { inject_handler_t *handler; int ret; mutex_enter(&spa_namespace_lock); rw_enter(&inject_lock, RW_READER); for (handler = list_head(&inject_handlers); handler != NULL; handler = list_next(&inject_handlers, handler)) if (handler->zi_id > *id) break; if (handler) { *record = handler->zi_record; *id = handler->zi_id; (void) strlcpy(name, spa_name(handler->zi_spa), buflen); ret = 0; } else { ret = SET_ERROR(ENOENT); } rw_exit(&inject_lock); mutex_exit(&spa_namespace_lock); return (ret); } /* * Clear the fault handler with the given identifier, or return ENOENT if none * exists. */ int zio_clear_fault(int id) { inject_handler_t *handler; rw_enter(&inject_lock, RW_WRITER); for (handler = list_head(&inject_handlers); handler != NULL; handler = list_next(&inject_handlers, handler)) if (handler->zi_id == id) break; if (handler == NULL) { rw_exit(&inject_lock); return (SET_ERROR(ENOENT)); } if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) { ASSERT3S(inject_delay_count, >, 0); inject_delay_count--; ASSERT3S(inject_delay_count, >=, 0); } list_remove(&inject_handlers, handler); rw_exit(&inject_lock); if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) { ASSERT3P(handler->zi_lanes, !=, NULL); kmem_free(handler->zi_lanes, sizeof (*handler->zi_lanes) * handler->zi_record.zi_nlanes); } else { ASSERT3P(handler->zi_lanes, ==, NULL); } spa_inject_delref(handler->zi_spa); kmem_free(handler, sizeof (inject_handler_t)); atomic_dec_32(&zio_injection_enabled); return (0); } void zio_inject_init(void) { rw_init(&inject_lock, NULL, RW_DEFAULT, NULL); mutex_init(&inject_delay_mtx, NULL, MUTEX_DEFAULT, NULL); list_create(&inject_handlers, sizeof (inject_handler_t), offsetof(inject_handler_t, zi_link)); } void zio_inject_fini(void) { list_destroy(&inject_handlers); mutex_destroy(&inject_delay_mtx); rw_destroy(&inject_lock); } #if defined(_KERNEL) EXPORT_SYMBOL(zio_injection_enabled); EXPORT_SYMBOL(zio_inject_fault); EXPORT_SYMBOL(zio_inject_list_next); EXPORT_SYMBOL(zio_clear_fault); EXPORT_SYMBOL(zio_handle_fault_injection); EXPORT_SYMBOL(zio_handle_device_injection); EXPORT_SYMBOL(zio_handle_label_injection); #endif diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index 6a4cd3fe691c..a0b74ef4a8c6 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -1,223 +1,224 @@ # # This file and its contents are supplied under the terms of the # Common Development and Distribution License ("CDDL"), version 1.0. # You may only use this file in accordance with the terms of version # 1.0 of the CDDL. # # A full copy of the text of the CDDL should have accompanied this # source. A copy of the CDDL is also available via the Internet at # http://www.illumos.org/license/CDDL. # [DEFAULT] pre = setup quiet = False pre_user = root user = root timeout = 600 post_user = root post = cleanup failsafe_user = root failsafe = callbacks/zfs_failsafe outputdir = /var/tmp/test_results tags = ['functional'] [tests/functional/acl/posix:Linux] tests = ['posix_001_pos', 'posix_002_pos', 'posix_003_pos', 'posix_004_pos'] tags = ['functional', 'acl', 'posix'] [tests/functional/acl/posix-sa:Linux] tests = ['posix_001_pos', 'posix_002_pos', 'posix_003_pos', 'posix_004_pos'] tags = ['functional', 'acl', 'posix-sa'] [tests/functional/atime:Linux] tests = ['atime_003_pos', 'root_relatime_on'] tags = ['functional', 'atime'] [tests/functional/block_cloning:Linux] tests = ['block_cloning_ficlone', 'block_cloning_ficlonerange', 'block_cloning_ficlonerange_partial', 'block_cloning_disabled_ficlone', 'block_cloning_disabled_ficlonerange'] tags = ['functional', 'block_cloning'] [tests/functional/chattr:Linux] tests = ['chattr_001_pos', 'chattr_002_neg'] tags = ['functional', 'chattr'] [tests/functional/cli_root/zfs:Linux] tests = ['zfs_003_neg'] tags = ['functional', 'cli_root', 'zfs'] [tests/functional/cli_root/zfs_mount:Linux] tests = ['zfs_mount_006_pos', 'zfs_mount_008_pos', 'zfs_mount_013_pos', 'zfs_mount_014_neg', 'zfs_multi_mount'] tags = ['functional', 'cli_root', 'zfs_mount'] [tests/functional/cli_root/zfs_share:Linux] tests = ['zfs_share_005_pos', 'zfs_share_007_neg', 'zfs_share_009_neg', 'zfs_share_012_pos', 'zfs_share_013_pos'] tags = ['functional', 'cli_root', 'zfs_share'] [tests/functional/cli_root/zfs_unshare:Linux] tests = ['zfs_unshare_008_pos'] tags = ['functional', 'cli_root', 'zfs_unshare'] [tests/functional/cli_root/zfs_sysfs:Linux] tests = ['zfeature_set_unsupported', 'zfs_get_unsupported', 'zfs_set_unsupported', 'zfs_sysfs_live', 'zpool_get_unsupported', 'zpool_set_unsupported'] tags = ['functional', 'cli_root', 'zfs_sysfs'] [tests/functional/cli_root/zpool_add:Linux] tests = ['add_nested_replacing_spare'] tags = ['functional', 'cli_root', 'zpool_add'] [tests/functional/cli_root/zpool_expand:Linux] tests = ['zpool_expand_001_pos', 'zpool_expand_002_pos', 'zpool_expand_003_neg', 'zpool_expand_004_pos', 'zpool_expand_005_pos'] tags = ['functional', 'cli_root', 'zpool_expand'] [tests/functional/cli_root/zpool_import:Linux] tests = ['zpool_import_hostid_changed', 'zpool_import_hostid_changed_unclean_export', 'zpool_import_hostid_changed_cachefile', 'zpool_import_hostid_changed_cachefile_unclean_export'] tags = ['functional', 'cli_root', 'zpool_import'] [tests/functional/cli_root/zpool_reopen:Linux] tests = ['zpool_reopen_001_pos', 'zpool_reopen_002_pos', 'zpool_reopen_003_pos', 'zpool_reopen_004_pos', 'zpool_reopen_005_pos', 'zpool_reopen_006_neg', 'zpool_reopen_007_pos'] tags = ['functional', 'cli_root', 'zpool_reopen'] [tests/functional/cli_root/zpool_split:Linux] tests = ['zpool_split_wholedisk'] tags = ['functional', 'cli_root', 'zpool_split'] [tests/functional/compression:Linux] tests = ['compress_004_pos'] tags = ['functional', 'compression'] [tests/functional/devices:Linux] tests = ['devices_001_pos', 'devices_002_neg', 'devices_003_pos'] tags = ['functional', 'devices'] [tests/functional/events:Linux] tests = ['events_001_pos', 'events_002_pos', 'zed_rc_filter', 'zed_fd_spill', - 'zed_cksum_reported', 'zed_cksum_config', 'zed_io_config'] + 'zed_cksum_reported', 'zed_cksum_config', 'zed_io_config', + 'zed_slow_io', 'zed_slow_io_many_vdevs'] tags = ['functional', 'events'] [tests/functional/fadvise:Linux] tests = ['fadvise_sequential'] tags = ['functional', 'fadvise'] [tests/functional/fallocate:Linux] tests = ['fallocate_prealloc', 'fallocate_zero-range'] tags = ['functional', 'fallocate'] [tests/functional/fault:Linux] tests = ['auto_offline_001_pos', 'auto_online_001_pos', 'auto_online_002_pos', 'auto_replace_001_pos', 'auto_replace_002_pos', 'auto_spare_001_pos', 'auto_spare_002_pos', 'auto_spare_multiple', 'auto_spare_ashift', 'auto_spare_shared', 'decrypt_fault', 'decompress_fault', 'scrub_after_resilver', 'zpool_status_-s'] tags = ['functional', 'fault'] [tests/functional/features/large_dnode:Linux] tests = ['large_dnode_002_pos', 'large_dnode_006_pos', 'large_dnode_008_pos'] tags = ['functional', 'features', 'large_dnode'] [tests/functional/io:Linux] tests = ['libaio', 'io_uring'] tags = ['functional', 'io'] [tests/functional/largest_pool:Linux] tests = ['largest_pool_001_pos'] pre = post = tags = ['functional', 'largest_pool'] [tests/functional/mmap:Linux] tests = ['mmap_libaio_001_pos', 'mmap_sync_001_pos'] tags = ['functional', 'mmap'] [tests/functional/mmp:Linux] tests = ['mmp_on_thread', 'mmp_on_uberblocks', 'mmp_on_off', 'mmp_interval', 'mmp_active_import', 'mmp_inactive_import', 'mmp_exported_import', 'mmp_write_uberblocks', 'mmp_reset_interval', 'multihost_history', 'mmp_on_zdb', 'mmp_write_distribution', 'mmp_hostid'] tags = ['functional', 'mmp'] [tests/functional/mount:Linux] tests = ['umount_unlinked_drain'] tags = ['functional', 'mount'] [tests/functional/pam:Linux] tests = ['pam_basic', 'pam_change_unmounted', 'pam_nounmount', 'pam_recursive', 'pam_short_password'] tags = ['functional', 'pam'] [tests/functional/procfs:Linux] tests = ['procfs_list_basic', 'procfs_list_concurrent_readers', 'procfs_list_stale_read', 'pool_state'] tags = ['functional', 'procfs'] [tests/functional/projectquota:Linux] tests = ['projectid_001_pos', 'projectid_002_pos', 'projectid_003_pos', 'projectquota_001_pos', 'projectquota_002_pos', 'projectquota_003_pos', 'projectquota_004_neg', 'projectquota_005_pos', 'projectquota_006_pos', 'projectquota_007_pos', 'projectquota_008_pos', 'projectquota_009_pos', 'projectspace_001_pos', 'projectspace_002_pos', 'projectspace_003_pos', 'projectspace_004_pos', 'projecttree_001_pos', 'projecttree_002_pos', 'projecttree_003_neg'] tags = ['functional', 'projectquota'] [tests/functional/dos_attributes:Linux] tests = ['read_dos_attrs_001', 'write_dos_attrs_001'] tags = ['functional', 'dos_attributes'] [tests/functional/renameat2:Linux] tests = ['renameat2_noreplace', 'renameat2_exchange', 'renameat2_whiteout'] tags = ['functional', 'renameat2'] [tests/functional/rsend:Linux] tests = ['send_realloc_dnode_size', 'send_encrypted_files'] tags = ['functional', 'rsend'] [tests/functional/simd:Linux] pre = post = tests = ['simd_supported'] tags = ['functional', 'simd'] [tests/functional/snapshot:Linux] tests = ['snapshot_015_pos', 'snapshot_016_pos'] tags = ['functional', 'snapshot'] [tests/functional/tmpfile:Linux] tests = ['tmpfile_001_pos', 'tmpfile_002_pos', 'tmpfile_003_pos', 'tmpfile_stat_mode'] tags = ['functional', 'tmpfile'] [tests/functional/upgrade:Linux] tests = ['upgrade_projectquota_001_pos'] tags = ['functional', 'upgrade'] [tests/functional/user_namespace:Linux] tests = ['user_namespace_001', 'user_namespace_002', 'user_namespace_003', 'user_namespace_004'] tags = ['functional', 'user_namespace'] [tests/functional/userquota:Linux] tests = ['groupspace_001_pos', 'groupspace_002_pos', 'groupspace_003_pos', 'userquota_013_pos', 'userspace_003_pos'] tags = ['functional', 'userquota'] [tests/functional/zvol/zvol_misc:Linux] tests = ['zvol_misc_fua'] tags = ['functional', 'zvol', 'zvol_misc'] [tests/functional/idmap_mount:Linux] tests = ['idmap_mount_001', 'idmap_mount_002', 'idmap_mount_003', 'idmap_mount_004', 'idmap_mount_005'] tags = ['functional', 'idmap_mount'] diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 01af258d59fe..fe9c92108725 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1,2112 +1,2114 @@ CLEANFILES = dist_noinst_DATA = include $(top_srcdir)/config/Substfiles.am datadir_zfs_tests_testsdir = $(datadir)/$(PACKAGE)/zfs-tests/tests nobase_dist_datadir_zfs_tests_tests_DATA = \ perf/nfs-sample.cfg \ perf/perf.shlib \ \ perf/fio/mkfiles.fio \ perf/fio/random_reads.fio \ perf/fio/random_readwrite.fio \ perf/fio/random_readwrite_fixed.fio \ perf/fio/random_writes.fio \ perf/fio/sequential_reads.fio \ perf/fio/sequential_readwrite.fio \ perf/fio/sequential_writes.fio nobase_dist_datadir_zfs_tests_tests_SCRIPTS = \ perf/regression/random_reads.ksh \ perf/regression/random_readwrite.ksh \ perf/regression/random_readwrite_fixed.ksh \ perf/regression/random_writes.ksh \ perf/regression/random_writes_zil.ksh \ perf/regression/sequential_reads_arc_cached_clone.ksh \ perf/regression/sequential_reads_arc_cached.ksh \ perf/regression/sequential_reads_dbuf_cached.ksh \ perf/regression/sequential_reads.ksh \ perf/regression/sequential_writes.ksh \ perf/regression/setup.ksh \ \ perf/scripts/prefetch_io.sh # These lists can be regenerated by running make regen-tests at the root, or, on a *clean* source: # find functional/ ! -type d ! -name .gitignore ! -name .dirstamp ! -name '*.Po' ! -executable -name '*.in' | sort | sed 's/\.in$//;s/^/\t/;$!s/$/ \\/' # find functional/ ! -type d ! -name .gitignore ! -name .dirstamp ! -name '*.Po' -executable -name '*.in' | sort | sed 's/\.in$//;s/^/\t/;$!s/$/ \\/' # find functional/ ! -type d ! -name .gitignore ! -name .dirstamp ! -name '*.Po' ! -name '*.in' ! -name '*.c' | grep -Fe /simd -e /tmpfile | sort | sed 's/^/\t/;$!s/$/ \\/' # find functional/ ! -type d ! -name .gitignore ! -name .dirstamp ! -name '*.Po' ! -executable ! -name '*.in' ! -name '*.c' | grep -vFe /simd -e /tmpfile | sort | sed 's/^/\t/;$!s/$/ \\/' # find functional/ ! -type d ! -name .gitignore ! -name .dirstamp ! -name '*.Po' -executable ! -name '*.in' ! -name '*.c' | grep -vFe /simd -e /tmpfile | sort | sed 's/^/\t/;$!s/$/ \\/' # # simd and tmpfile are Linux-only and not installed elsewhere # # C programs are specced in ../Makefile.am above as part of the main Makefile find_common := find functional/ ! -type d ! -name .gitignore ! -name .dirstamp ! -name '*.Po' regen: @$(MAKE) -C $(top_builddir) clean @$(MAKE) clean $(SED) $(ac_inplace) '/^# -- >8 --/q' Makefile.am echo >> Makefile.am echo 'nobase_nodist_datadir_zfs_tests_tests_DATA = \' >> Makefile.am $(find_common) ! -executable -name '*.in' | sort | sed 's/\.in$$//;s/^/\t/;$$!s/$$/ \\/' >> Makefile.am echo 'nobase_nodist_datadir_zfs_tests_tests_SCRIPTS = \' >> Makefile.am $(find_common) -executable -name '*.in' | sort | sed 's/\.in$$//;s/^/\t/;$$!s/$$/ \\/' >> Makefile.am echo >> Makefile.am echo 'SUBSTFILES += $$(nobase_nodist_datadir_zfs_tests_tests_DATA) $$(nobase_nodist_datadir_zfs_tests_tests_SCRIPTS)' >> Makefile.am echo >> Makefile.am echo 'if BUILD_LINUX' >> Makefile.am echo 'nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \' >> Makefile.am $(find_common) ! -name '*.in' ! -name '*.c' | grep -Fe /simd -e /tmpfile | sort | sed 's/^/\t/;$$!s/$$/ \\/' >> Makefile.am echo 'endif' >> Makefile.am echo >> Makefile.am echo 'nobase_dist_datadir_zfs_tests_tests_DATA += \' >> Makefile.am $(find_common) ! -executable ! -name '*.in' ! -name '*.c' | grep -vFe /simd -e /tmpfile | sort | sed 's/^/\t/;$$!s/$$/ \\/' >> Makefile.am echo >> Makefile.am echo 'nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \' >> Makefile.am $(find_common) -executable ! -name '*.in' ! -name '*.c' | grep -vFe /simd -e /tmpfile | sort | sed 's/^/\t/;$$!s/$$/ \\/' >> Makefile.am # -- >8 -- nobase_nodist_datadir_zfs_tests_tests_DATA = \ functional/pam/utilities.kshlib nobase_nodist_datadir_zfs_tests_tests_SCRIPTS = \ functional/pyzfs/pyzfs_unittest.ksh SUBSTFILES += $(nobase_nodist_datadir_zfs_tests_tests_DATA) $(nobase_nodist_datadir_zfs_tests_tests_SCRIPTS) if BUILD_LINUX nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/simd/simd_supported.ksh \ functional/tmpfile/cleanup.ksh \ functional/tmpfile/setup.ksh endif nobase_dist_datadir_zfs_tests_tests_DATA += \ functional/acl/acl.cfg \ functional/acl/acl_common.kshlib \ functional/alloc_class/alloc_class.cfg \ functional/alloc_class/alloc_class.kshlib \ functional/atime/atime.cfg \ functional/atime/atime_common.kshlib \ functional/bclone/bclone.cfg \ functional/bclone/bclone_common.kshlib \ functional/bclone/bclone_corner_cases.kshlib \ functional/block_cloning/block_cloning.kshlib \ functional/cache/cache.cfg \ functional/cache/cache.kshlib \ functional/cachefile/cachefile.cfg \ functional/cachefile/cachefile.kshlib \ functional/casenorm/casenorm.cfg \ functional/casenorm/casenorm.kshlib \ functional/channel_program/channel_common.kshlib \ functional/channel_program/lua_core/tst.args_to_lua.out \ functional/channel_program/lua_core/tst.args_to_lua.zcp \ functional/channel_program/lua_core/tst.divide_by_zero.err \ functional/channel_program/lua_core/tst.divide_by_zero.zcp \ functional/channel_program/lua_core/tst.exists.zcp \ functional/channel_program/lua_core/tst.large_prog.out \ functional/channel_program/lua_core/tst.large_prog.zcp \ functional/channel_program/lua_core/tst.lib_base.lua \ functional/channel_program/lua_core/tst.lib_coroutine.lua \ functional/channel_program/lua_core/tst.lib_strings.lua \ functional/channel_program/lua_core/tst.lib_table.lua \ functional/channel_program/lua_core/tst.nested_neg.zcp \ functional/channel_program/lua_core/tst.nested_pos.zcp \ functional/channel_program/lua_core/tst.recursive.zcp \ functional/channel_program/lua_core/tst.return_large.zcp \ functional/channel_program/lua_core/tst.return_recursive_table.zcp \ functional/channel_program/lua_core/tst.stack_gsub.err \ functional/channel_program/lua_core/tst.stack_gsub.zcp \ functional/channel_program/lua_core/tst.timeout.zcp \ functional/channel_program/synctask_core/tst.bookmark.copy.zcp \ functional/channel_program/synctask_core/tst.bookmark.create.zcp \ functional/channel_program/synctask_core/tst.get_index_props.out \ functional/channel_program/synctask_core/tst.get_index_props.zcp \ functional/channel_program/synctask_core/tst.get_number_props.out \ functional/channel_program/synctask_core/tst.get_number_props.zcp \ functional/channel_program/synctask_core/tst.get_string_props.out \ functional/channel_program/synctask_core/tst.get_string_props.zcp \ functional/channel_program/synctask_core/tst.promote_conflict.zcp \ functional/channel_program/synctask_core/tst.set_props.zcp \ functional/channel_program/synctask_core/tst.snapshot_destroy.zcp \ functional/channel_program/synctask_core/tst.snapshot_neg.zcp \ functional/channel_program/synctask_core/tst.snapshot_recursive.zcp \ functional/channel_program/synctask_core/tst.snapshot_rename.zcp \ functional/channel_program/synctask_core/tst.snapshot_simple.zcp \ functional/checksum/default.cfg \ functional/clean_mirror/clean_mirror_common.kshlib \ functional/clean_mirror/default.cfg \ functional/cli_root/cli_common.kshlib \ functional/cli_root/zfs_copies/zfs_copies.cfg \ functional/cli_root/zfs_copies/zfs_copies.kshlib \ functional/cli_root/zfs_create/properties.kshlib \ functional/cli_root/zfs_create/zfs_create.cfg \ functional/cli_root/zfs_create/zfs_create_common.kshlib \ functional/cli_root/zfs_destroy/zfs_destroy.cfg \ functional/cli_root/zfs_destroy/zfs_destroy_common.kshlib \ functional/cli_root/zfs_get/zfs_get_common.kshlib \ functional/cli_root/zfs_get/zfs_get_list_d.kshlib \ functional/cli_root/zfs_jail/jail.conf \ functional/cli_root/zfs_load-key/HEXKEY \ functional/cli_root/zfs_load-key/PASSPHRASE \ functional/cli_root/zfs_load-key/RAWKEY \ functional/cli_root/zfs_load-key/zfs_load-key.cfg \ functional/cli_root/zfs_load-key/zfs_load-key_common.kshlib \ functional/cli_root/zfs_mount/zfs_mount.cfg \ functional/cli_root/zfs_mount/zfs_mount.kshlib \ functional/cli_root/zfs_promote/zfs_promote.cfg \ functional/cli_root/zfs_receive/zstd_test_data.txt \ functional/cli_root/zfs_rename/zfs_rename.cfg \ functional/cli_root/zfs_rename/zfs_rename.kshlib \ functional/cli_root/zfs_rollback/zfs_rollback.cfg \ functional/cli_root/zfs_rollback/zfs_rollback_common.kshlib \ functional/cli_root/zfs_send/zfs_send.cfg \ functional/cli_root/zfs_set/zfs_set_common.kshlib \ functional/cli_root/zfs_share/zfs_share.cfg \ functional/cli_root/zfs_snapshot/zfs_snapshot.cfg \ functional/cli_root/zfs_unmount/zfs_unmount.cfg \ functional/cli_root/zfs_unmount/zfs_unmount.kshlib \ functional/cli_root/zfs_upgrade/zfs_upgrade.kshlib \ functional/cli_root/zfs_wait/zfs_wait.kshlib \ functional/cli_root/zpool_add/zpool_add.cfg \ functional/cli_root/zpool_add/zpool_add.kshlib \ functional/cli_root/zpool_clear/zpool_clear.cfg \ functional/cli_root/zpool_create/draidcfg.gz \ functional/cli_root/zpool_create/zpool_create.cfg \ functional/cli_root/zpool_create/zpool_create.shlib \ functional/cli_root/zpool_destroy/zpool_destroy.cfg \ functional/cli_root/zpool_events/zpool_events.cfg \ functional/cli_root/zpool_events/zpool_events.kshlib \ functional/cli_root/zpool_expand/zpool_expand.cfg \ functional/cli_root/zpool_export/zpool_export.cfg \ functional/cli_root/zpool_export/zpool_export.kshlib \ functional/cli_root/zpool_get/vdev_get.cfg \ functional/cli_root/zpool_get/zpool_get.cfg \ functional/cli_root/zpool_get/zpool_get_parsable.cfg \ functional/cli_root/zpool_import/blockfiles/cryptv0.dat.bz2 \ functional/cli_root/zpool_import/blockfiles/missing_ivset.dat.bz2 \ functional/cli_root/zpool_import/blockfiles/unclean_export.dat.bz2 \ functional/cli_root/zpool_import/zpool_import.cfg \ functional/cli_root/zpool_import/zpool_import.kshlib \ functional/cli_root/zpool_initialize/zpool_initialize.kshlib \ functional/cli_root/zpool_labelclear/labelclear.cfg \ functional/cli_root/zpool_remove/zpool_remove.cfg \ functional/cli_root/zpool_reopen/zpool_reopen.cfg \ functional/cli_root/zpool_reopen/zpool_reopen.shlib \ functional/cli_root/zpool_resilver/zpool_resilver.cfg \ functional/cli_root/zpool_scrub/zpool_scrub.cfg \ functional/cli_root/zpool_split/zpool_split.cfg \ functional/cli_root/zpool_trim/zpool_trim.kshlib \ functional/cli_root/zpool_upgrade/blockfiles/zfs-broken-mirror1.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-broken-mirror2.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v10.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v11.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v12.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v13.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v14.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v15.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1mirror1.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1mirror2.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1mirror3.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1raidz1.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1raidz2.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1raidz3.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1stripe1.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1stripe2.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1stripe3.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2mirror1.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2mirror2.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2mirror3.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2raidz1.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2raidz2.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2raidz3.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2stripe1.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2stripe2.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2stripe3.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3hotspare1.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3hotspare2.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3hotspare3.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3mirror1.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3mirror2.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3mirror3.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3raidz1.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3raidz21.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3raidz22.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3raidz23.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3raidz2.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3raidz3.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3stripe1.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3stripe2.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3stripe3.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v4.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v5.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v6.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v7.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v8.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v999.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v9.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-vBROKEN.dat.bz2 \ functional/cli_root/zpool_upgrade/zpool_upgrade.cfg \ functional/cli_root/zpool_upgrade/zpool_upgrade.kshlib \ functional/cli_root/zpool_wait/zpool_wait.kshlib \ functional/cli_root/zhack/library.kshlib \ functional/cli_user/misc/misc.cfg \ functional/cli_user/zfs_list/zfs_list.cfg \ functional/cli_user/zfs_list/zfs_list.kshlib \ functional/compression/compress.cfg \ functional/compression/testpool_zstd.tar.gz \ functional/deadman/deadman.cfg \ functional/delegate/delegate.cfg \ functional/delegate/delegate_common.kshlib \ functional/devices/devices.cfg \ functional/devices/devices_common.kshlib \ functional/events/events.cfg \ functional/events/events_common.kshlib \ functional/fault/fault.cfg \ functional/grow/grow.cfg \ functional/history/history.cfg \ functional/history/history_common.kshlib \ functional/history/i386.migratedpool.DAT.Z \ functional/history/i386.orig_history.txt \ functional/history/sparc.migratedpool.DAT.Z \ functional/history/sparc.orig_history.txt \ functional/history/zfs-pool-v4.dat.Z \ functional/inheritance/config001.cfg \ functional/inheritance/config002.cfg \ functional/inheritance/config003.cfg \ functional/inheritance/config004.cfg \ functional/inheritance/config005.cfg \ functional/inheritance/config006.cfg \ functional/inheritance/config007.cfg \ functional/inheritance/config008.cfg \ functional/inheritance/config009.cfg \ functional/inheritance/config010.cfg \ functional/inheritance/config011.cfg \ functional/inheritance/config012.cfg \ functional/inheritance/config013.cfg \ functional/inheritance/config014.cfg \ functional/inheritance/config015.cfg \ functional/inheritance/config016.cfg \ functional/inheritance/config017.cfg \ functional/inheritance/config018.cfg \ functional/inheritance/config019.cfg \ functional/inheritance/config020.cfg \ functional/inheritance/config021.cfg \ functional/inheritance/config022.cfg \ functional/inheritance/config023.cfg \ functional/inheritance/config024.cfg \ functional/inheritance/inherit.kshlib \ functional/inheritance/README.config \ functional/inheritance/README.state \ functional/inheritance/state001.cfg \ functional/inheritance/state002.cfg \ functional/inheritance/state003.cfg \ functional/inheritance/state004.cfg \ functional/inheritance/state005.cfg \ functional/inheritance/state006.cfg \ functional/inheritance/state007.cfg \ functional/inheritance/state008.cfg \ functional/inheritance/state009.cfg \ functional/inheritance/state010.cfg \ functional/inheritance/state011.cfg \ functional/inheritance/state012.cfg \ functional/inheritance/state013.cfg \ functional/inheritance/state014.cfg \ functional/inheritance/state015.cfg \ functional/inheritance/state016.cfg \ functional/inheritance/state017.cfg \ functional/inheritance/state018.cfg \ functional/inheritance/state019.cfg \ functional/inheritance/state020.cfg \ functional/inheritance/state021.cfg \ functional/inheritance/state022.cfg \ functional/inheritance/state023.cfg \ functional/inheritance/state024.cfg \ functional/inuse/inuse.cfg \ functional/io/io.cfg \ functional/l2arc/l2arc.cfg \ functional/largest_pool/largest_pool.cfg \ functional/migration/migration.cfg \ functional/migration/migration.kshlib \ functional/mmap/mmap.cfg \ functional/mmp/mmp.cfg \ functional/mmp/mmp.kshlib \ functional/mv_files/mv_files.cfg \ functional/mv_files/mv_files_common.kshlib \ functional/nopwrite/nopwrite.shlib \ functional/no_space/enospc.cfg \ functional/online_offline/online_offline.cfg \ functional/pool_checkpoint/pool_checkpoint.kshlib \ functional/projectquota/projectquota.cfg \ functional/projectquota/projectquota_common.kshlib \ functional/quota/quota.cfg \ functional/quota/quota.kshlib \ functional/redacted_send/redacted.cfg \ functional/redacted_send/redacted.kshlib \ functional/redundancy/redundancy.cfg \ functional/redundancy/redundancy.kshlib \ functional/refreserv/refreserv.cfg \ functional/removal/removal.kshlib \ functional/replacement/replacement.cfg \ functional/reservation/reservation.cfg \ functional/reservation/reservation.shlib \ functional/rsend/dedup_encrypted_zvol.bz2 \ functional/rsend/dedup_encrypted_zvol.zsend.bz2 \ functional/rsend/dedup.zsend.bz2 \ functional/rsend/fs.tar.gz \ functional/rsend/rsend.cfg \ functional/rsend/rsend.kshlib \ functional/scrub_mirror/default.cfg \ functional/scrub_mirror/scrub_mirror_common.kshlib \ functional/slog/slog.cfg \ functional/slog/slog.kshlib \ functional/snapshot/snapshot.cfg \ functional/snapused/snapused.kshlib \ functional/sparse/sparse.cfg \ functional/trim/trim.cfg \ functional/trim/trim.kshlib \ functional/truncate/truncate.cfg \ functional/upgrade/upgrade_common.kshlib \ functional/user_namespace/user_namespace.cfg \ functional/user_namespace/user_namespace_common.kshlib \ functional/userquota/13709_reproducer.bz2 \ functional/userquota/userquota.cfg \ functional/userquota/userquota_common.kshlib \ functional/vdev_zaps/vdev_zaps.kshlib \ functional/xattr/xattr.cfg \ functional/xattr/xattr_common.kshlib \ functional/zvol/zvol.cfg \ functional/zvol/zvol_cli/zvol_cli.cfg \ functional/zvol/zvol_common.shlib \ functional/zvol/zvol_ENOSPC/zvol_ENOSPC.cfg \ functional/zvol/zvol_misc/zvol_misc_common.kshlib \ functional/zvol/zvol_swap/zvol_swap.cfg \ functional/idmap_mount/idmap_mount.cfg \ functional/idmap_mount/idmap_mount_common.kshlib nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/acl/off/cleanup.ksh \ functional/acl/off/dosmode.ksh \ functional/acl/off/posixmode.ksh \ functional/acl/off/setup.ksh \ functional/acl/posix/cleanup.ksh \ functional/acl/posix/posix_001_pos.ksh \ functional/acl/posix/posix_002_pos.ksh \ functional/acl/posix/posix_003_pos.ksh \ functional/acl/posix/posix_004_pos.ksh \ functional/acl/posix-sa/cleanup.ksh \ functional/acl/posix-sa/posix_001_pos.ksh \ functional/acl/posix-sa/posix_002_pos.ksh \ functional/acl/posix-sa/posix_003_pos.ksh \ functional/acl/posix-sa/posix_004_pos.ksh \ functional/acl/posix-sa/setup.ksh \ functional/acl/posix/setup.ksh \ functional/alloc_class/alloc_class_001_pos.ksh \ functional/alloc_class/alloc_class_002_neg.ksh \ functional/alloc_class/alloc_class_003_pos.ksh \ functional/alloc_class/alloc_class_004_pos.ksh \ functional/alloc_class/alloc_class_005_pos.ksh \ functional/alloc_class/alloc_class_006_pos.ksh \ functional/alloc_class/alloc_class_007_pos.ksh \ functional/alloc_class/alloc_class_008_pos.ksh \ functional/alloc_class/alloc_class_009_pos.ksh \ functional/alloc_class/alloc_class_010_pos.ksh \ functional/alloc_class/alloc_class_011_neg.ksh \ functional/alloc_class/alloc_class_012_pos.ksh \ functional/alloc_class/alloc_class_013_pos.ksh \ functional/alloc_class/alloc_class_014_neg.ksh \ functional/alloc_class/alloc_class_015_pos.ksh \ functional/alloc_class/cleanup.ksh \ functional/alloc_class/setup.ksh \ functional/append/file_append.ksh \ functional/append/threadsappend_001_pos.ksh \ functional/append/cleanup.ksh \ functional/append/setup.ksh \ functional/arc/arcstats_runtime_tuning.ksh \ functional/arc/cleanup.ksh \ functional/arc/dbufstats_001_pos.ksh \ functional/arc/dbufstats_002_pos.ksh \ functional/arc/dbufstats_003_pos.ksh \ functional/arc/setup.ksh \ functional/atime/atime_001_pos.ksh \ functional/atime/atime_002_neg.ksh \ functional/atime/atime_003_pos.ksh \ functional/atime/cleanup.ksh \ functional/atime/root_atime_off.ksh \ functional/atime/root_atime_on.ksh \ functional/atime/root_relatime_on.ksh \ functional/atime/setup.ksh \ functional/bclone/bclone_crossfs_corner_cases.ksh \ functional/bclone/bclone_crossfs_corner_cases_limited.ksh \ functional/bclone/bclone_crossfs_data.ksh \ functional/bclone/bclone_crossfs_embedded.ksh \ functional/bclone/bclone_crossfs_hole.ksh \ functional/bclone/bclone_diffprops_all.ksh \ functional/bclone/bclone_diffprops_checksum.ksh \ functional/bclone/bclone_diffprops_compress.ksh \ functional/bclone/bclone_diffprops_copies.ksh \ functional/bclone/bclone_diffprops_recordsize.ksh \ functional/bclone/bclone_prop_sync.ksh \ functional/bclone/bclone_samefs_corner_cases.ksh \ functional/bclone/bclone_samefs_corner_cases_limited.ksh \ functional/bclone/bclone_samefs_data.ksh \ functional/bclone/bclone_samefs_embedded.ksh \ functional/bclone/bclone_samefs_hole.ksh \ functional/bclone/cleanup.ksh \ functional/bclone/setup.ksh \ functional/block_cloning/cleanup.ksh \ functional/block_cloning/setup.ksh \ functional/block_cloning/block_cloning_clone_mmap_cached.ksh \ functional/block_cloning/block_cloning_clone_mmap_write.ksh \ functional/block_cloning/block_cloning_copyfilerange_cross_dataset.ksh \ functional/block_cloning/block_cloning_copyfilerange_fallback.ksh \ functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh \ functional/block_cloning/block_cloning_copyfilerange.ksh \ functional/block_cloning/block_cloning_copyfilerange_partial.ksh \ functional/block_cloning/block_cloning_disabled_copyfilerange.ksh \ functional/block_cloning/block_cloning_disabled_ficlone.ksh \ functional/block_cloning/block_cloning_disabled_ficlonerange.ksh \ functional/block_cloning/block_cloning_ficlone.ksh \ functional/block_cloning/block_cloning_ficlonerange.ksh \ functional/block_cloning/block_cloning_ficlonerange_partial.ksh \ functional/block_cloning/block_cloning_cross_enc_dataset.ksh \ functional/block_cloning/block_cloning_replay.ksh \ functional/block_cloning/block_cloning_replay_encrypted.ksh \ functional/block_cloning/block_cloning_lwb_buffer_overflow.ksh \ functional/bootfs/bootfs_001_pos.ksh \ functional/bootfs/bootfs_002_neg.ksh \ functional/bootfs/bootfs_003_pos.ksh \ functional/bootfs/bootfs_004_neg.ksh \ functional/bootfs/bootfs_005_neg.ksh \ functional/bootfs/bootfs_006_pos.ksh \ functional/bootfs/bootfs_007_pos.ksh \ functional/bootfs/bootfs_008_pos.ksh \ functional/bootfs/cleanup.ksh \ functional/bootfs/setup.ksh \ functional/btree/btree_negative.ksh \ functional/btree/btree_positive.ksh \ functional/cache/cache_001_pos.ksh \ functional/cache/cache_002_pos.ksh \ functional/cache/cache_003_pos.ksh \ functional/cache/cache_004_neg.ksh \ functional/cache/cache_005_neg.ksh \ functional/cache/cache_006_pos.ksh \ functional/cache/cache_007_neg.ksh \ functional/cache/cache_008_neg.ksh \ functional/cache/cache_009_pos.ksh \ functional/cache/cache_010_pos.ksh \ functional/cache/cache_011_pos.ksh \ functional/cache/cache_012_pos.ksh \ functional/cache/cleanup.ksh \ functional/cachefile/cachefile_001_pos.ksh \ functional/cachefile/cachefile_002_pos.ksh \ functional/cachefile/cachefile_003_pos.ksh \ functional/cachefile/cachefile_004_pos.ksh \ functional/cachefile/cleanup.ksh \ functional/cachefile/setup.ksh \ functional/cache/setup.ksh \ functional/casenorm/case_all_values.ksh \ functional/casenorm/cleanup.ksh \ functional/casenorm/insensitive_formd_delete.ksh \ functional/casenorm/insensitive_formd_lookup.ksh \ functional/casenorm/insensitive_none_delete.ksh \ functional/casenorm/insensitive_none_lookup.ksh \ functional/casenorm/mixed_create_failure.ksh \ functional/casenorm/mixed_formd_delete.ksh \ functional/casenorm/mixed_formd_lookup_ci.ksh \ functional/casenorm/mixed_formd_lookup.ksh \ functional/casenorm/mixed_none_delete.ksh \ functional/casenorm/mixed_none_lookup_ci.ksh \ functional/casenorm/mixed_none_lookup.ksh \ functional/casenorm/norm_all_values.ksh \ functional/casenorm/sensitive_formd_delete.ksh \ functional/casenorm/sensitive_formd_lookup.ksh \ functional/casenorm/sensitive_none_delete.ksh \ functional/casenorm/sensitive_none_lookup.ksh \ functional/casenorm/setup.ksh \ functional/channel_program/lua_core/cleanup.ksh \ functional/channel_program/lua_core/setup.ksh \ functional/channel_program/lua_core/tst.args_to_lua.ksh \ functional/channel_program/lua_core/tst.divide_by_zero.ksh \ functional/channel_program/lua_core/tst.exists.ksh \ functional/channel_program/lua_core/tst.integer_illegal.ksh \ functional/channel_program/lua_core/tst.integer_overflow.ksh \ functional/channel_program/lua_core/tst.language_functions_neg.ksh \ functional/channel_program/lua_core/tst.language_functions_pos.ksh \ functional/channel_program/lua_core/tst.large_prog.ksh \ functional/channel_program/lua_core/tst.libraries.ksh \ functional/channel_program/lua_core/tst.memory_limit.ksh \ functional/channel_program/lua_core/tst.nested_neg.ksh \ functional/channel_program/lua_core/tst.nested_pos.ksh \ functional/channel_program/lua_core/tst.nvlist_to_lua.ksh \ functional/channel_program/lua_core/tst.recursive_neg.ksh \ functional/channel_program/lua_core/tst.recursive_pos.ksh \ functional/channel_program/lua_core/tst.return_large.ksh \ functional/channel_program/lua_core/tst.return_nvlist_neg.ksh \ functional/channel_program/lua_core/tst.return_nvlist_pos.ksh \ functional/channel_program/lua_core/tst.return_recursive_table.ksh \ functional/channel_program/lua_core/tst.stack_gsub.ksh \ functional/channel_program/lua_core/tst.timeout.ksh \ functional/channel_program/synctask_core/cleanup.ksh \ functional/channel_program/synctask_core/setup.ksh \ functional/channel_program/synctask_core/tst.bookmark.copy.ksh \ functional/channel_program/synctask_core/tst.bookmark.create.ksh \ functional/channel_program/synctask_core/tst.destroy_fs.ksh \ functional/channel_program/synctask_core/tst.destroy_snap.ksh \ functional/channel_program/synctask_core/tst.get_count_and_limit.ksh \ functional/channel_program/synctask_core/tst.get_index_props.ksh \ functional/channel_program/synctask_core/tst.get_mountpoint.ksh \ functional/channel_program/synctask_core/tst.get_neg.ksh \ functional/channel_program/synctask_core/tst.get_number_props.ksh \ functional/channel_program/synctask_core/tst.get_string_props.ksh \ functional/channel_program/synctask_core/tst.get_type.ksh \ functional/channel_program/synctask_core/tst.get_userquota.ksh \ functional/channel_program/synctask_core/tst.get_written.ksh \ functional/channel_program/synctask_core/tst.inherit.ksh \ functional/channel_program/synctask_core/tst.list_bookmarks.ksh \ functional/channel_program/synctask_core/tst.list_children.ksh \ functional/channel_program/synctask_core/tst.list_clones.ksh \ functional/channel_program/synctask_core/tst.list_holds.ksh \ functional/channel_program/synctask_core/tst.list_snapshots.ksh \ functional/channel_program/synctask_core/tst.list_system_props.ksh \ functional/channel_program/synctask_core/tst.list_user_props.ksh \ functional/channel_program/synctask_core/tst.parse_args_neg.ksh \ functional/channel_program/synctask_core/tst.promote_conflict.ksh \ functional/channel_program/synctask_core/tst.promote_multiple.ksh \ functional/channel_program/synctask_core/tst.promote_simple.ksh \ functional/channel_program/synctask_core/tst.rollback_mult.ksh \ functional/channel_program/synctask_core/tst.rollback_one.ksh \ functional/channel_program/synctask_core/tst.set_props.ksh \ functional/channel_program/synctask_core/tst.snapshot_destroy.ksh \ functional/channel_program/synctask_core/tst.snapshot_neg.ksh \ functional/channel_program/synctask_core/tst.snapshot_recursive.ksh \ functional/channel_program/synctask_core/tst.snapshot_rename.ksh \ functional/channel_program/synctask_core/tst.snapshot_simple.ksh \ functional/channel_program/synctask_core/tst.terminate_by_signal.ksh \ functional/chattr/chattr_001_pos.ksh \ functional/chattr/chattr_002_neg.ksh \ functional/chattr/cleanup.ksh \ functional/chattr/setup.ksh \ functional/checksum/cleanup.ksh \ functional/checksum/filetest_001_pos.ksh \ functional/checksum/filetest_002_pos.ksh \ functional/checksum/run_blake3_test.ksh \ functional/checksum/run_edonr_test.ksh \ functional/checksum/run_sha2_test.ksh \ functional/checksum/run_skein_test.ksh \ functional/checksum/setup.ksh \ functional/clean_mirror/clean_mirror_001_pos.ksh \ functional/clean_mirror/clean_mirror_002_pos.ksh \ functional/clean_mirror/clean_mirror_003_pos.ksh \ functional/clean_mirror/clean_mirror_004_pos.ksh \ functional/clean_mirror/cleanup.ksh \ functional/clean_mirror/setup.ksh \ functional/cli_root/zdb/zdb_002_pos.ksh \ functional/cli_root/zdb/zdb_003_pos.ksh \ functional/cli_root/zdb/zdb_004_pos.ksh \ functional/cli_root/zdb/zdb_005_pos.ksh \ functional/cli_root/zdb/zdb_006_pos.ksh \ functional/cli_root/zdb/zdb_args_neg.ksh \ functional/cli_root/zdb/zdb_args_pos.ksh \ functional/cli_root/zdb/zdb_backup.ksh \ functional/cli_root/zdb/zdb_block_size_histogram.ksh \ functional/cli_root/zdb/zdb_checksum.ksh \ functional/cli_root/zdb/zdb_decompress.ksh \ functional/cli_root/zdb/zdb_decompress_zstd.ksh \ functional/cli_root/zdb/zdb_display_block.ksh \ functional/cli_root/zdb/zdb_encrypted.ksh \ functional/cli_root/zdb/zdb_label_checksum.ksh \ functional/cli_root/zdb/zdb_object_range_neg.ksh \ functional/cli_root/zdb/zdb_object_range_pos.ksh \ functional/cli_root/zdb/zdb_objset_id.ksh \ functional/cli_root/zdb/zdb_recover_2.ksh \ functional/cli_root/zdb/zdb_recover.ksh \ functional/cli_root/zfs_bookmark/cleanup.ksh \ functional/cli_root/zfs_bookmark/setup.ksh \ functional/cli_root/zfs_bookmark/zfs_bookmark_cliargs.ksh \ functional/cli_root/zfs_change-key/cleanup.ksh \ functional/cli_root/zfs_change-key/setup.ksh \ functional/cli_root/zfs_change-key/zfs_change-key_child.ksh \ functional/cli_root/zfs_change-key/zfs_change-key_clones.ksh \ functional/cli_root/zfs_change-key/zfs_change-key_format.ksh \ functional/cli_root/zfs_change-key/zfs_change-key_inherit.ksh \ functional/cli_root/zfs_change-key/zfs_change-key.ksh \ functional/cli_root/zfs_change-key/zfs_change-key_load.ksh \ functional/cli_root/zfs_change-key/zfs_change-key_location.ksh \ functional/cli_root/zfs_change-key/zfs_change-key_pbkdf2iters.ksh \ functional/cli_root/zfs/cleanup.ksh \ functional/cli_root/zfs_clone/cleanup.ksh \ functional/cli_root/zfs_clone/setup.ksh \ functional/cli_root/zfs_clone/zfs_clone_001_neg.ksh \ functional/cli_root/zfs_clone/zfs_clone_002_pos.ksh \ functional/cli_root/zfs_clone/zfs_clone_003_pos.ksh \ functional/cli_root/zfs_clone/zfs_clone_004_pos.ksh \ functional/cli_root/zfs_clone/zfs_clone_005_pos.ksh \ functional/cli_root/zfs_clone/zfs_clone_006_pos.ksh \ functional/cli_root/zfs_clone/zfs_clone_007_pos.ksh \ functional/cli_root/zfs_clone/zfs_clone_008_neg.ksh \ functional/cli_root/zfs_clone/zfs_clone_009_neg.ksh \ functional/cli_root/zfs_clone/zfs_clone_010_pos.ksh \ functional/cli_root/zfs_clone/zfs_clone_deeply_nested.ksh \ functional/cli_root/zfs_clone/zfs_clone_encrypted.ksh \ functional/cli_root/zfs_clone/zfs_clone_rm_nested.ksh \ functional/cli_root/zfs_copies/cleanup.ksh \ functional/cli_root/zfs_copies/setup.ksh \ functional/cli_root/zfs_copies/zfs_copies_001_pos.ksh \ functional/cli_root/zfs_copies/zfs_copies_002_pos.ksh \ functional/cli_root/zfs_copies/zfs_copies_003_pos.ksh \ functional/cli_root/zfs_copies/zfs_copies_004_neg.ksh \ functional/cli_root/zfs_copies/zfs_copies_005_neg.ksh \ functional/cli_root/zfs_copies/zfs_copies_006_pos.ksh \ functional/cli_root/zfs_create/cleanup.ksh \ functional/cli_root/zfs_create/setup.ksh \ functional/cli_root/zfs_create/zfs_create_001_pos.ksh \ functional/cli_root/zfs_create/zfs_create_002_pos.ksh \ functional/cli_root/zfs_create/zfs_create_003_pos.ksh \ functional/cli_root/zfs_create/zfs_create_004_pos.ksh \ functional/cli_root/zfs_create/zfs_create_005_pos.ksh \ functional/cli_root/zfs_create/zfs_create_006_pos.ksh \ functional/cli_root/zfs_create/zfs_create_007_pos.ksh \ functional/cli_root/zfs_create/zfs_create_008_neg.ksh \ functional/cli_root/zfs_create/zfs_create_009_neg.ksh \ functional/cli_root/zfs_create/zfs_create_010_neg.ksh \ functional/cli_root/zfs_create/zfs_create_011_pos.ksh \ functional/cli_root/zfs_create/zfs_create_012_pos.ksh \ functional/cli_root/zfs_create/zfs_create_013_pos.ksh \ functional/cli_root/zfs_create/zfs_create_014_pos.ksh \ functional/cli_root/zfs_create/zfs_create_crypt_combos.ksh \ functional/cli_root/zfs_create/zfs_create_dryrun.ksh \ functional/cli_root/zfs_create/zfs_create_encrypted.ksh \ functional/cli_root/zfs_create/zfs_create_nomount.ksh \ functional/cli_root/zfs_create/zfs_create_verbose.ksh \ functional/cli_root/zfs_destroy/cleanup.ksh \ functional/cli_root/zfs_destroy/setup.ksh \ functional/cli_root/zfs_destroy/zfs_clone_livelist_condense_and_disable.ksh \ functional/cli_root/zfs_destroy/zfs_clone_livelist_condense_races.ksh \ functional/cli_root/zfs_destroy/zfs_clone_livelist_dedup.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_001_pos.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_002_pos.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_003_pos.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_004_pos.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_005_neg.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_006_neg.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_007_neg.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_008_pos.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_009_pos.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_010_pos.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_011_pos.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_012_pos.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_013_neg.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_014_pos.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_015_pos.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_016_pos.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_clone_livelist.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_dev_removal_condense.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_dev_removal.ksh \ functional/cli_root/zfs_diff/cleanup.ksh \ functional/cli_root/zfs_diff/setup.ksh \ functional/cli_root/zfs_diff/zfs_diff_changes.ksh \ functional/cli_root/zfs_diff/zfs_diff_cliargs.ksh \ functional/cli_root/zfs_diff/zfs_diff_encrypted.ksh \ functional/cli_root/zfs_diff/zfs_diff_mangle.ksh \ functional/cli_root/zfs_diff/zfs_diff_timestamp.ksh \ functional/cli_root/zfs_diff/zfs_diff_types.ksh \ functional/cli_root/zfs_get/cleanup.ksh \ functional/cli_root/zfs_get/setup.ksh \ functional/cli_root/zfs_get/zfs_get_001_pos.ksh \ functional/cli_root/zfs_get/zfs_get_002_pos.ksh \ functional/cli_root/zfs_get/zfs_get_003_pos.ksh \ functional/cli_root/zfs_get/zfs_get_004_pos.ksh \ functional/cli_root/zfs_get/zfs_get_005_neg.ksh \ functional/cli_root/zfs_get/zfs_get_006_neg.ksh \ functional/cli_root/zfs_get/zfs_get_007_neg.ksh \ functional/cli_root/zfs_get/zfs_get_008_pos.ksh \ functional/cli_root/zfs_get/zfs_get_009_pos.ksh \ functional/cli_root/zfs_get/zfs_get_010_neg.ksh \ functional/cli_root/zfs_ids_to_path/cleanup.ksh \ functional/cli_root/zfs_ids_to_path/setup.ksh \ functional/cli_root/zfs_ids_to_path/zfs_ids_to_path_001_pos.ksh \ functional/cli_root/zfs_inherit/cleanup.ksh \ functional/cli_root/zfs_inherit/setup.ksh \ functional/cli_root/zfs_inherit/zfs_inherit_001_neg.ksh \ functional/cli_root/zfs_inherit/zfs_inherit_002_neg.ksh \ functional/cli_root/zfs_inherit/zfs_inherit_003_pos.ksh \ functional/cli_root/zfs_inherit/zfs_inherit_mountpoint.ksh \ functional/cli_root/zfs_jail/cleanup.ksh \ functional/cli_root/zfs_jail/setup.ksh \ functional/cli_root/zfs_jail/zfs_jail_001_pos.ksh \ functional/cli_root/zfs_load-key/cleanup.ksh \ functional/cli_root/zfs_load-key/setup.ksh \ functional/cli_root/zfs_load-key/zfs_load-key_all.ksh \ functional/cli_root/zfs_load-key/zfs_load-key_file.ksh \ functional/cli_root/zfs_load-key/zfs_load-key_https.ksh \ functional/cli_root/zfs_load-key/zfs_load-key.ksh \ functional/cli_root/zfs_load-key/zfs_load-key_location.ksh \ functional/cli_root/zfs_load-key/zfs_load-key_noop.ksh \ functional/cli_root/zfs_load-key/zfs_load-key_recursive.ksh \ functional/cli_root/zfs_mount/cleanup.ksh \ functional/cli_root/zfs_mount/setup.ksh \ functional/cli_root/zfs_mount/zfs_mount_001_pos.ksh \ functional/cli_root/zfs_mount/zfs_mount_002_pos.ksh \ functional/cli_root/zfs_mount/zfs_mount_003_pos.ksh \ functional/cli_root/zfs_mount/zfs_mount_004_pos.ksh \ functional/cli_root/zfs_mount/zfs_mount_005_pos.ksh \ functional/cli_root/zfs_mount/zfs_mount_006_pos.ksh \ functional/cli_root/zfs_mount/zfs_mount_007_pos.ksh \ functional/cli_root/zfs_mount/zfs_mount_008_pos.ksh \ functional/cli_root/zfs_mount/zfs_mount_009_neg.ksh \ functional/cli_root/zfs_mount/zfs_mount_010_neg.ksh \ functional/cli_root/zfs_mount/zfs_mount_011_neg.ksh \ functional/cli_root/zfs_mount/zfs_mount_012_pos.ksh \ functional/cli_root/zfs_mount/zfs_mount_013_pos.ksh \ functional/cli_root/zfs_mount/zfs_mount_014_neg.ksh \ functional/cli_root/zfs_mount/zfs_mount_all_001_pos.ksh \ functional/cli_root/zfs_mount/zfs_mount_all_fail.ksh \ functional/cli_root/zfs_mount/zfs_mount_all_mountpoints.ksh \ functional/cli_root/zfs_mount/zfs_mount_encrypted.ksh \ functional/cli_root/zfs_mount/zfs_mount_remount.ksh \ functional/cli_root/zfs_mount/zfs_mount_test_race.ksh \ functional/cli_root/zfs_mount/zfs_multi_mount.ksh \ functional/cli_root/zfs_program/cleanup.ksh \ functional/cli_root/zfs_program/setup.ksh \ functional/cli_root/zfs_program/zfs_program_json.ksh \ functional/cli_root/zfs_promote/cleanup.ksh \ functional/cli_root/zfs_promote/setup.ksh \ functional/cli_root/zfs_promote/zfs_promote_001_pos.ksh \ functional/cli_root/zfs_promote/zfs_promote_002_pos.ksh \ functional/cli_root/zfs_promote/zfs_promote_003_pos.ksh \ functional/cli_root/zfs_promote/zfs_promote_004_pos.ksh \ functional/cli_root/zfs_promote/zfs_promote_005_pos.ksh \ functional/cli_root/zfs_promote/zfs_promote_006_neg.ksh \ functional/cli_root/zfs_promote/zfs_promote_007_neg.ksh \ functional/cli_root/zfs_promote/zfs_promote_008_pos.ksh \ functional/cli_root/zfs_promote/zfs_promote_encryptionroot.ksh \ functional/cli_root/zfs_property/cleanup.ksh \ functional/cli_root/zfs_property/setup.ksh \ functional/cli_root/zfs_property/zfs_written_property_001_pos.ksh \ functional/cli_root/zfs_receive/cleanup.ksh \ functional/cli_root/zfs_receive/receive-o-x_props_aliases.ksh \ functional/cli_root/zfs_receive/receive-o-x_props_override.ksh \ functional/cli_root/zfs_receive/setup.ksh \ functional/cli_root/zfs_receive/zfs_receive_001_pos.ksh \ functional/cli_root/zfs_receive/zfs_receive_002_pos.ksh \ functional/cli_root/zfs_receive/zfs_receive_003_pos.ksh \ functional/cli_root/zfs_receive/zfs_receive_004_neg.ksh \ functional/cli_root/zfs_receive/zfs_receive_005_neg.ksh \ functional/cli_root/zfs_receive/zfs_receive_006_pos.ksh \ functional/cli_root/zfs_receive/zfs_receive_007_neg.ksh \ functional/cli_root/zfs_receive/zfs_receive_008_pos.ksh \ functional/cli_root/zfs_receive/zfs_receive_009_neg.ksh \ functional/cli_root/zfs_receive/zfs_receive_010_pos.ksh \ functional/cli_root/zfs_receive/zfs_receive_011_pos.ksh \ functional/cli_root/zfs_receive/zfs_receive_012_pos.ksh \ functional/cli_root/zfs_receive/zfs_receive_013_pos.ksh \ functional/cli_root/zfs_receive/zfs_receive_014_pos.ksh \ functional/cli_root/zfs_receive/zfs_receive_015_pos.ksh \ functional/cli_root/zfs_receive/zfs_receive_016_pos.ksh \ functional/cli_root/zfs_receive/zfs_receive_-e.ksh \ functional/cli_root/zfs_receive/zfs_receive_from_encrypted.ksh \ functional/cli_root/zfs_receive/zfs_receive_from_zstd.ksh \ functional/cli_root/zfs_receive/zfs_receive_new_props.ksh \ functional/cli_root/zfs_receive/zfs_receive_raw_-d.ksh \ functional/cli_root/zfs_receive/zfs_receive_raw_incremental.ksh \ functional/cli_root/zfs_receive/zfs_receive_raw.ksh \ functional/cli_root/zfs_receive/zfs_receive_to_encrypted.ksh \ functional/cli_root/zfs_receive/zfs_receive_-wR-encrypted-mix.ksh \ functional/cli_root/zfs_receive/zfs_receive_corrective.ksh \ functional/cli_root/zfs_receive/zfs_receive_compressed_corrective.ksh \ functional/cli_root/zfs_receive/zfs_receive_large_block_corrective.ksh \ functional/cli_root/zfs_rename/cleanup.ksh \ functional/cli_root/zfs_rename/setup.ksh \ functional/cli_root/zfs_rename/zfs_rename_001_pos.ksh \ functional/cli_root/zfs_rename/zfs_rename_002_pos.ksh \ functional/cli_root/zfs_rename/zfs_rename_003_pos.ksh \ functional/cli_root/zfs_rename/zfs_rename_004_neg.ksh \ functional/cli_root/zfs_rename/zfs_rename_005_neg.ksh \ functional/cli_root/zfs_rename/zfs_rename_006_pos.ksh \ functional/cli_root/zfs_rename/zfs_rename_007_pos.ksh \ functional/cli_root/zfs_rename/zfs_rename_008_pos.ksh \ functional/cli_root/zfs_rename/zfs_rename_009_neg.ksh \ functional/cli_root/zfs_rename/zfs_rename_010_neg.ksh \ functional/cli_root/zfs_rename/zfs_rename_011_pos.ksh \ functional/cli_root/zfs_rename/zfs_rename_012_neg.ksh \ functional/cli_root/zfs_rename/zfs_rename_013_pos.ksh \ functional/cli_root/zfs_rename/zfs_rename_014_neg.ksh \ functional/cli_root/zfs_rename/zfs_rename_encrypted_child.ksh \ functional/cli_root/zfs_rename/zfs_rename_mountpoint.ksh \ functional/cli_root/zfs_rename/zfs_rename_nounmount.ksh \ functional/cli_root/zfs_rename/zfs_rename_to_encrypted.ksh \ functional/cli_root/zfs_reservation/cleanup.ksh \ functional/cli_root/zfs_reservation/setup.ksh \ functional/cli_root/zfs_reservation/zfs_reservation_001_pos.ksh \ functional/cli_root/zfs_reservation/zfs_reservation_002_pos.ksh \ functional/cli_root/zfs_rollback/cleanup.ksh \ functional/cli_root/zfs_rollback/setup.ksh \ functional/cli_root/zfs_rollback/zfs_rollback_001_pos.ksh \ functional/cli_root/zfs_rollback/zfs_rollback_002_pos.ksh \ functional/cli_root/zfs_rollback/zfs_rollback_003_neg.ksh \ functional/cli_root/zfs_rollback/zfs_rollback_004_neg.ksh \ functional/cli_root/zfs_send/cleanup.ksh \ functional/cli_root/zfs_send/setup.ksh \ functional/cli_root/zfs_send/zfs_send_001_pos.ksh \ functional/cli_root/zfs_send/zfs_send_002_pos.ksh \ functional/cli_root/zfs_send/zfs_send_003_pos.ksh \ functional/cli_root/zfs_send/zfs_send_004_neg.ksh \ functional/cli_root/zfs_send/zfs_send_005_pos.ksh \ functional/cli_root/zfs_send/zfs_send_006_pos.ksh \ functional/cli_root/zfs_send/zfs_send_007_pos.ksh \ functional/cli_root/zfs_send/zfs_send-b.ksh \ functional/cli_root/zfs_send/zfs_send_encrypted.ksh \ functional/cli_root/zfs_send/zfs_send_encrypted_unloaded.ksh \ functional/cli_root/zfs_send/zfs_send_raw.ksh \ functional/cli_root/zfs_send/zfs_send_skip_missing.ksh \ functional/cli_root/zfs_send/zfs_send_sparse.ksh \ functional/cli_root/zfs_set/cache_001_pos.ksh \ functional/cli_root/zfs_set/cache_002_neg.ksh \ functional/cli_root/zfs_set/canmount_001_pos.ksh \ functional/cli_root/zfs_set/canmount_002_pos.ksh \ functional/cli_root/zfs_set/canmount_003_pos.ksh \ functional/cli_root/zfs_set/canmount_004_pos.ksh \ functional/cli_root/zfs_set/checksum_001_pos.ksh \ functional/cli_root/zfs_set/cleanup.ksh \ functional/cli_root/zfs_set/compression_001_pos.ksh \ functional/cli_root/zfs_set/mountpoint_001_pos.ksh \ functional/cli_root/zfs_set/mountpoint_002_pos.ksh \ functional/cli_root/zfs_set/mountpoint_003_pos.ksh \ functional/cli_root/zfs_set/onoffs_001_pos.ksh \ functional/cli_root/zfs_set/property_alias_001_pos.ksh \ functional/cli_root/zfs_set/readonly_001_pos.ksh \ functional/cli_root/zfs_set/reservation_001_neg.ksh \ functional/cli_root/zfs_set/ro_props_001_pos.ksh \ functional/cli_root/zfs_set/setup.ksh \ functional/cli_root/zfs_set/share_mount_001_neg.ksh \ functional/cli_root/zfs_set/snapdir_001_pos.ksh \ functional/cli_root/zfs/setup.ksh \ functional/cli_root/zfs_set/user_property_001_pos.ksh \ functional/cli_root/zfs_set/user_property_002_pos.ksh \ functional/cli_root/zfs_set/user_property_003_neg.ksh \ functional/cli_root/zfs_set/user_property_004_pos.ksh \ functional/cli_root/zfs_set/version_001_neg.ksh \ functional/cli_root/zfs_set/zfs_set_001_neg.ksh \ functional/cli_root/zfs_set/zfs_set_002_neg.ksh \ functional/cli_root/zfs_set/zfs_set_003_neg.ksh \ functional/cli_root/zfs_set/zfs_set_feature_activation.ksh \ functional/cli_root/zfs_set/zfs_set_keylocation.ksh \ functional/cli_root/zfs_set/zfs_set_nomount.ksh \ functional/cli_root/zfs_share/cleanup.ksh \ functional/cli_root/zfs_share/setup.ksh \ functional/cli_root/zfs_share/zfs_share_001_pos.ksh \ functional/cli_root/zfs_share/zfs_share_002_pos.ksh \ functional/cli_root/zfs_share/zfs_share_003_pos.ksh \ functional/cli_root/zfs_share/zfs_share_004_pos.ksh \ functional/cli_root/zfs_share/zfs_share_005_pos.ksh \ functional/cli_root/zfs_share/zfs_share_006_pos.ksh \ functional/cli_root/zfs_share/zfs_share_007_neg.ksh \ functional/cli_root/zfs_share/zfs_share_008_neg.ksh \ functional/cli_root/zfs_share/zfs_share_009_neg.ksh \ functional/cli_root/zfs_share/zfs_share_010_neg.ksh \ functional/cli_root/zfs_share/zfs_share_011_pos.ksh \ functional/cli_root/zfs_share/zfs_share_012_pos.ksh \ functional/cli_root/zfs_share/zfs_share_013_pos.ksh \ functional/cli_root/zfs_share/zfs_share_concurrent_shares.ksh \ functional/cli_root/zfs_share/zfs_share_after_mount.ksh \ functional/cli_root/zfs_snapshot/cleanup.ksh \ functional/cli_root/zfs_snapshot/setup.ksh \ functional/cli_root/zfs_snapshot/zfs_snapshot_001_neg.ksh \ functional/cli_root/zfs_snapshot/zfs_snapshot_002_neg.ksh \ functional/cli_root/zfs_snapshot/zfs_snapshot_003_neg.ksh \ functional/cli_root/zfs_snapshot/zfs_snapshot_004_neg.ksh \ functional/cli_root/zfs_snapshot/zfs_snapshot_005_neg.ksh \ functional/cli_root/zfs_snapshot/zfs_snapshot_006_pos.ksh \ functional/cli_root/zfs_snapshot/zfs_snapshot_007_neg.ksh \ functional/cli_root/zfs_snapshot/zfs_snapshot_008_neg.ksh \ functional/cli_root/zfs_snapshot/zfs_snapshot_009_pos.ksh \ functional/cli_root/zfs_sysfs/cleanup.ksh \ functional/cli_root/zfs_sysfs/setup.ksh \ functional/cli_root/zfs_sysfs/zfeature_set_unsupported.ksh \ functional/cli_root/zfs_sysfs/zfs_get_unsupported.ksh \ functional/cli_root/zfs_sysfs/zfs_set_unsupported.ksh \ functional/cli_root/zfs_sysfs/zfs_sysfs_live.ksh \ functional/cli_root/zfs_sysfs/zpool_get_unsupported.ksh \ functional/cli_root/zfs_sysfs/zpool_set_unsupported.ksh \ functional/cli_root/zfs_unload-key/cleanup.ksh \ functional/cli_root/zfs_unload-key/setup.ksh \ functional/cli_root/zfs_unload-key/zfs_unload-key_all.ksh \ functional/cli_root/zfs_unload-key/zfs_unload-key.ksh \ functional/cli_root/zfs_unload-key/zfs_unload-key_recursive.ksh \ functional/cli_root/zfs_unmount/cleanup.ksh \ functional/cli_root/zfs_unmount/setup.ksh \ functional/cli_root/zfs_unmount/zfs_unmount_001_pos.ksh \ functional/cli_root/zfs_unmount/zfs_unmount_002_pos.ksh \ functional/cli_root/zfs_unmount/zfs_unmount_003_pos.ksh \ functional/cli_root/zfs_unmount/zfs_unmount_004_pos.ksh \ functional/cli_root/zfs_unmount/zfs_unmount_005_pos.ksh \ functional/cli_root/zfs_unmount/zfs_unmount_006_pos.ksh \ functional/cli_root/zfs_unmount/zfs_unmount_007_neg.ksh \ functional/cli_root/zfs_unmount/zfs_unmount_008_neg.ksh \ functional/cli_root/zfs_unmount/zfs_unmount_009_pos.ksh \ functional/cli_root/zfs_unmount/zfs_unmount_all_001_pos.ksh \ functional/cli_root/zfs_unmount/zfs_unmount_nested.ksh \ functional/cli_root/zfs_unmount/zfs_unmount_unload_keys.ksh \ functional/cli_root/zfs_unshare/cleanup.ksh \ functional/cli_root/zfs_unshare/setup.ksh \ functional/cli_root/zfs_unshare/zfs_unshare_001_pos.ksh \ functional/cli_root/zfs_unshare/zfs_unshare_002_pos.ksh \ functional/cli_root/zfs_unshare/zfs_unshare_003_pos.ksh \ functional/cli_root/zfs_unshare/zfs_unshare_004_neg.ksh \ functional/cli_root/zfs_unshare/zfs_unshare_005_neg.ksh \ functional/cli_root/zfs_unshare/zfs_unshare_006_pos.ksh \ functional/cli_root/zfs_unshare/zfs_unshare_007_pos.ksh \ functional/cli_root/zfs_unshare/zfs_unshare_008_pos.ksh \ functional/cli_root/zfs_upgrade/cleanup.ksh \ functional/cli_root/zfs_upgrade/setup.ksh \ functional/cli_root/zfs_upgrade/zfs_upgrade_001_pos.ksh \ functional/cli_root/zfs_upgrade/zfs_upgrade_002_pos.ksh \ functional/cli_root/zfs_upgrade/zfs_upgrade_003_pos.ksh \ functional/cli_root/zfs_upgrade/zfs_upgrade_004_pos.ksh \ functional/cli_root/zfs_upgrade/zfs_upgrade_005_pos.ksh \ functional/cli_root/zfs_upgrade/zfs_upgrade_006_neg.ksh \ functional/cli_root/zfs_upgrade/zfs_upgrade_007_neg.ksh \ functional/cli_root/zfs_wait/cleanup.ksh \ functional/cli_root/zfs_wait/setup.ksh \ functional/cli_root/zfs_wait/zfs_wait_deleteq.ksh \ functional/cli_root/zfs_wait/zfs_wait_getsubopt.ksh \ functional/cli_root/zfs/zfs_001_neg.ksh \ functional/cli_root/zfs/zfs_002_pos.ksh \ functional/cli_root/zfs/zfs_003_neg.ksh \ functional/cli_root/zhack/zhack_label_repair_001.ksh \ functional/cli_root/zhack/zhack_label_repair_002.ksh \ functional/cli_root/zhack/zhack_label_repair_003.ksh \ functional/cli_root/zhack/zhack_label_repair_004.ksh \ functional/cli_root/zpool_add/add_nested_replacing_spare.ksh \ functional/cli_root/zpool_add/add-o_ashift.ksh \ functional/cli_root/zpool_add/add_prop_ashift.ksh \ functional/cli_root/zpool_add/cleanup.ksh \ functional/cli_root/zpool_add/setup.ksh \ functional/cli_root/zpool_add/zpool_add_001_pos.ksh \ functional/cli_root/zpool_add/zpool_add_002_pos.ksh \ functional/cli_root/zpool_add/zpool_add_003_pos.ksh \ functional/cli_root/zpool_add/zpool_add_004_pos.ksh \ functional/cli_root/zpool_add/zpool_add_005_pos.ksh \ functional/cli_root/zpool_add/zpool_add_006_pos.ksh \ functional/cli_root/zpool_add/zpool_add_007_neg.ksh \ functional/cli_root/zpool_add/zpool_add_008_neg.ksh \ functional/cli_root/zpool_add/zpool_add_009_neg.ksh \ functional/cli_root/zpool_add/zpool_add_010_pos.ksh \ functional/cli_root/zpool_add/zpool_add_dryrun_output.ksh \ functional/cli_root/zpool_attach/attach-o_ashift.ksh \ functional/cli_root/zpool_attach/cleanup.ksh \ functional/cli_root/zpool_attach/setup.ksh \ functional/cli_root/zpool_attach/zpool_attach_001_neg.ksh \ functional/cli_root/zpool/cleanup.ksh \ functional/cli_root/zpool_clear/cleanup.ksh \ functional/cli_root/zpool_clear/setup.ksh \ functional/cli_root/zpool_clear/zpool_clear_001_pos.ksh \ functional/cli_root/zpool_clear/zpool_clear_002_neg.ksh \ functional/cli_root/zpool_clear/zpool_clear_003_neg.ksh \ functional/cli_root/zpool_clear/zpool_clear_readonly.ksh \ functional/cli_root/zpool_create/cleanup.ksh \ functional/cli_root/zpool_create/create-o_ashift.ksh \ functional/cli_root/zpool_create/setup.ksh \ functional/cli_root/zpool_create/zpool_create_001_pos.ksh \ functional/cli_root/zpool_create/zpool_create_002_pos.ksh \ functional/cli_root/zpool_create/zpool_create_003_pos.ksh \ functional/cli_root/zpool_create/zpool_create_004_pos.ksh \ functional/cli_root/zpool_create/zpool_create_005_pos.ksh \ functional/cli_root/zpool_create/zpool_create_006_pos.ksh \ functional/cli_root/zpool_create/zpool_create_007_neg.ksh \ functional/cli_root/zpool_create/zpool_create_008_pos.ksh \ functional/cli_root/zpool_create/zpool_create_009_neg.ksh \ functional/cli_root/zpool_create/zpool_create_010_neg.ksh \ functional/cli_root/zpool_create/zpool_create_011_neg.ksh \ functional/cli_root/zpool_create/zpool_create_012_neg.ksh \ functional/cli_root/zpool_create/zpool_create_014_neg.ksh \ functional/cli_root/zpool_create/zpool_create_015_neg.ksh \ functional/cli_root/zpool_create/zpool_create_016_pos.ksh \ functional/cli_root/zpool_create/zpool_create_017_neg.ksh \ functional/cli_root/zpool_create/zpool_create_018_pos.ksh \ functional/cli_root/zpool_create/zpool_create_019_pos.ksh \ functional/cli_root/zpool_create/zpool_create_020_pos.ksh \ functional/cli_root/zpool_create/zpool_create_021_pos.ksh \ functional/cli_root/zpool_create/zpool_create_022_pos.ksh \ functional/cli_root/zpool_create/zpool_create_023_neg.ksh \ functional/cli_root/zpool_create/zpool_create_024_pos.ksh \ functional/cli_root/zpool_create/zpool_create_crypt_combos.ksh \ functional/cli_root/zpool_create/zpool_create_draid_001_pos.ksh \ functional/cli_root/zpool_create/zpool_create_draid_002_pos.ksh \ functional/cli_root/zpool_create/zpool_create_draid_003_pos.ksh \ functional/cli_root/zpool_create/zpool_create_draid_004_pos.ksh \ functional/cli_root/zpool_create/zpool_create_dryrun_output.ksh \ functional/cli_root/zpool_create/zpool_create_encrypted.ksh \ functional/cli_root/zpool_create/zpool_create_features_001_pos.ksh \ functional/cli_root/zpool_create/zpool_create_features_002_pos.ksh \ functional/cli_root/zpool_create/zpool_create_features_003_pos.ksh \ functional/cli_root/zpool_create/zpool_create_features_004_neg.ksh \ functional/cli_root/zpool_create/zpool_create_features_005_pos.ksh \ functional/cli_root/zpool_create/zpool_create_features_006_pos.ksh \ functional/cli_root/zpool_create/zpool_create_features_007_pos.ksh \ functional/cli_root/zpool_create/zpool_create_features_008_pos.ksh \ functional/cli_root/zpool_create/zpool_create_features_009_pos.ksh \ functional/cli_root/zpool_create/zpool_create_tempname.ksh \ functional/cli_root/zpool_destroy/zpool_destroy_001_pos.ksh \ functional/cli_root/zpool_destroy/zpool_destroy_002_pos.ksh \ functional/cli_root/zpool_destroy/zpool_destroy_003_neg.ksh \ functional/cli_root/zpool_detach/cleanup.ksh \ functional/cli_root/zpool_detach/setup.ksh \ functional/cli_root/zpool_detach/zpool_detach_001_neg.ksh \ functional/cli_root/zpool_events/cleanup.ksh \ functional/cli_root/zpool_events/setup.ksh \ functional/cli_root/zpool_events/zpool_events_clear.ksh \ functional/cli_root/zpool_events/zpool_events_clear_retained.ksh \ functional/cli_root/zpool_events/zpool_events_cliargs.ksh \ functional/cli_root/zpool_events/zpool_events_duplicates.ksh \ functional/cli_root/zpool_events/zpool_events_errors.ksh \ functional/cli_root/zpool_events/zpool_events_follow.ksh \ functional/cli_root/zpool_events/zpool_events_poolname.ksh \ functional/cli_root/zpool_expand/cleanup.ksh \ functional/cli_root/zpool_expand/setup.ksh \ functional/cli_root/zpool_expand/zpool_expand_001_pos.ksh \ functional/cli_root/zpool_expand/zpool_expand_002_pos.ksh \ functional/cli_root/zpool_expand/zpool_expand_003_neg.ksh \ functional/cli_root/zpool_expand/zpool_expand_004_pos.ksh \ functional/cli_root/zpool_expand/zpool_expand_005_pos.ksh \ functional/cli_root/zpool_export/cleanup.ksh \ functional/cli_root/zpool_export/setup.ksh \ functional/cli_root/zpool_export/zpool_export_001_pos.ksh \ functional/cli_root/zpool_export/zpool_export_002_pos.ksh \ functional/cli_root/zpool_export/zpool_export_003_neg.ksh \ functional/cli_root/zpool_export/zpool_export_004_pos.ksh \ functional/cli_root/zpool_get/cleanup.ksh \ functional/cli_root/zpool_get/setup.ksh \ functional/cli_root/zpool_get/vdev_get_001_pos.ksh \ functional/cli_root/zpool_get/zpool_get_001_pos.ksh \ functional/cli_root/zpool_get/zpool_get_002_pos.ksh \ functional/cli_root/zpool_get/zpool_get_003_pos.ksh \ functional/cli_root/zpool_get/zpool_get_004_neg.ksh \ functional/cli_root/zpool_get/zpool_get_005_pos.ksh \ functional/cli_root/zpool_history/cleanup.ksh \ functional/cli_root/zpool_history/setup.ksh \ functional/cli_root/zpool_history/zpool_history_001_neg.ksh \ functional/cli_root/zpool_history/zpool_history_002_pos.ksh \ functional/cli_root/zpool_import/cleanup.ksh \ functional/cli_root/zpool_import/import_cachefile_device_added.ksh \ functional/cli_root/zpool_import/import_cachefile_device_removed.ksh \ functional/cli_root/zpool_import/import_cachefile_device_replaced.ksh \ functional/cli_root/zpool_import/import_cachefile_mirror_attached.ksh \ functional/cli_root/zpool_import/import_cachefile_mirror_detached.ksh \ functional/cli_root/zpool_import/import_cachefile_paths_changed.ksh \ functional/cli_root/zpool_import/import_cachefile_shared_device.ksh \ functional/cli_root/zpool_import/import_devices_missing.ksh \ functional/cli_root/zpool_import/import_log_missing.ksh \ functional/cli_root/zpool_import/import_paths_changed.ksh \ functional/cli_root/zpool_import/import_rewind_config_changed.ksh \ functional/cli_root/zpool_import/import_rewind_device_replaced.ksh \ functional/cli_root/zpool_import/setup.ksh \ functional/cli_root/zpool_import/zpool_import_001_pos.ksh \ functional/cli_root/zpool_import/zpool_import_002_pos.ksh \ functional/cli_root/zpool_import/zpool_import_003_pos.ksh \ functional/cli_root/zpool_import/zpool_import_004_pos.ksh \ functional/cli_root/zpool_import/zpool_import_005_pos.ksh \ functional/cli_root/zpool_import/zpool_import_006_pos.ksh \ functional/cli_root/zpool_import/zpool_import_007_pos.ksh \ functional/cli_root/zpool_import/zpool_import_008_pos.ksh \ functional/cli_root/zpool_import/zpool_import_009_neg.ksh \ functional/cli_root/zpool_import/zpool_import_010_pos.ksh \ functional/cli_root/zpool_import/zpool_import_011_neg.ksh \ functional/cli_root/zpool_import/zpool_import_012_pos.ksh \ functional/cli_root/zpool_import/zpool_import_013_neg.ksh \ functional/cli_root/zpool_import/zpool_import_014_pos.ksh \ functional/cli_root/zpool_import/zpool_import_015_pos.ksh \ functional/cli_root/zpool_import/zpool_import_016_pos.ksh \ functional/cli_root/zpool_import/zpool_import_017_pos.ksh \ functional/cli_root/zpool_import/zpool_import_all_001_pos.ksh \ functional/cli_root/zpool_import/zpool_import_encrypted.ksh \ functional/cli_root/zpool_import/zpool_import_encrypted_load.ksh \ functional/cli_root/zpool_import/zpool_import_errata3.ksh \ functional/cli_root/zpool_import/zpool_import_errata4.ksh \ functional/cli_root/zpool_import/zpool_import_features_001_pos.ksh \ functional/cli_root/zpool_import/zpool_import_features_002_neg.ksh \ functional/cli_root/zpool_import/zpool_import_features_003_pos.ksh \ functional/cli_root/zpool_import/zpool_import_hostid_changed.ksh \ functional/cli_root/zpool_import/zpool_import_hostid_changed_unclean_export.ksh \ functional/cli_root/zpool_import/zpool_import_hostid_changed_cachefile.ksh \ functional/cli_root/zpool_import/zpool_import_hostid_changed_cachefile_unclean_export.ksh \ functional/cli_root/zpool_import/zpool_import_missing_001_pos.ksh \ functional/cli_root/zpool_import/zpool_import_missing_002_pos.ksh \ functional/cli_root/zpool_import/zpool_import_missing_003_pos.ksh \ functional/cli_root/zpool_import/zpool_import_rename_001_pos.ksh \ functional/cli_root/zpool_import/zpool_import_status.ksh \ functional/cli_root/zpool_initialize/cleanup.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_attach_detach_add_remove.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_fault_export_import_online.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_import_export.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_offline_export_import_online.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_online_offline.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_split.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_start_and_cancel_neg.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_start_and_cancel_pos.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_suspend_resume.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_uninit.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_unsupported_vdevs.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_verify_checksums.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_verify_initialized.ksh \ functional/cli_root/zpool_labelclear/zpool_labelclear_active.ksh \ functional/cli_root/zpool_labelclear/zpool_labelclear_exported.ksh \ functional/cli_root/zpool_labelclear/zpool_labelclear_removed.ksh \ functional/cli_root/zpool_labelclear/zpool_labelclear_valid.ksh \ functional/cli_root/zpool_offline/cleanup.ksh \ functional/cli_root/zpool_offline/setup.ksh \ functional/cli_root/zpool_offline/zpool_offline_001_pos.ksh \ functional/cli_root/zpool_offline/zpool_offline_002_neg.ksh \ functional/cli_root/zpool_offline/zpool_offline_003_pos.ksh \ functional/cli_root/zpool_online/cleanup.ksh \ functional/cli_root/zpool_online/setup.ksh \ functional/cli_root/zpool_online/zpool_online_001_pos.ksh \ functional/cli_root/zpool_online/zpool_online_002_neg.ksh \ functional/cli_root/zpool_remove/cleanup.ksh \ functional/cli_root/zpool_remove/setup.ksh \ functional/cli_root/zpool_remove/zpool_remove_001_neg.ksh \ functional/cli_root/zpool_remove/zpool_remove_002_pos.ksh \ functional/cli_root/zpool_remove/zpool_remove_003_pos.ksh \ functional/cli_root/zpool_reopen/cleanup.ksh \ functional/cli_root/zpool_reopen/setup.ksh \ functional/cli_root/zpool_reopen/zpool_reopen_001_pos.ksh \ functional/cli_root/zpool_reopen/zpool_reopen_002_pos.ksh \ functional/cli_root/zpool_reopen/zpool_reopen_003_pos.ksh \ functional/cli_root/zpool_reopen/zpool_reopen_004_pos.ksh \ functional/cli_root/zpool_reopen/zpool_reopen_005_pos.ksh \ functional/cli_root/zpool_reopen/zpool_reopen_006_neg.ksh \ functional/cli_root/zpool_reopen/zpool_reopen_007_pos.ksh \ functional/cli_root/zpool_replace/cleanup.ksh \ functional/cli_root/zpool_replace/replace-o_ashift.ksh \ functional/cli_root/zpool_replace/replace_prop_ashift.ksh \ functional/cli_root/zpool_replace/setup.ksh \ functional/cli_root/zpool_replace/zpool_replace_001_neg.ksh \ functional/cli_root/zpool_resilver/cleanup.ksh \ functional/cli_root/zpool_resilver/setup.ksh \ functional/cli_root/zpool_resilver/zpool_resilver_bad_args.ksh \ functional/cli_root/zpool_resilver/zpool_resilver_restart.ksh \ functional/cli_root/zpool_resilver/zpool_resilver_concurrent.ksh \ functional/cli_root/zpool_scrub/cleanup.ksh \ functional/cli_root/zpool_scrub/setup.ksh \ functional/cli_root/zpool_scrub/zpool_scrub_001_neg.ksh \ functional/cli_root/zpool_scrub/zpool_scrub_002_pos.ksh \ functional/cli_root/zpool_scrub/zpool_scrub_003_pos.ksh \ functional/cli_root/zpool_scrub/zpool_scrub_004_pos.ksh \ functional/cli_root/zpool_scrub/zpool_scrub_005_pos.ksh \ functional/cli_root/zpool_scrub/zpool_scrub_encrypted_unloaded.ksh \ functional/cli_root/zpool_scrub/zpool_scrub_multiple_copies.ksh \ functional/cli_root/zpool_scrub/zpool_scrub_offline_device.ksh \ functional/cli_root/zpool_scrub/zpool_scrub_print_repairing.ksh \ functional/cli_root/zpool_scrub/zpool_error_scrub_001_pos.ksh \ functional/cli_root/zpool_scrub/zpool_error_scrub_002_pos.ksh \ functional/cli_root/zpool_scrub/zpool_error_scrub_003_pos.ksh \ functional/cli_root/zpool_scrub/zpool_error_scrub_004_pos.ksh \ functional/cli_root/zpool_set/cleanup.ksh \ functional/cli_root/zpool_set/setup.ksh \ functional/cli_root/zpool/setup.ksh \ functional/cli_root/zpool_set/vdev_set_001_pos.ksh \ functional/cli_root/zpool_set/zpool_set_common.kshlib \ functional/cli_root/zpool_set/zpool_set_001_pos.ksh \ functional/cli_root/zpool_set/zpool_set_002_neg.ksh \ functional/cli_root/zpool_set/zpool_set_003_neg.ksh \ functional/cli_root/zpool_set/zpool_set_ashift.ksh \ functional/cli_root/zpool_set/user_property_001_pos.ksh \ functional/cli_root/zpool_set/user_property_002_neg.ksh \ functional/cli_root/zpool_set/zpool_set_features.ksh \ functional/cli_root/zpool_split/cleanup.ksh \ functional/cli_root/zpool_split/setup.ksh \ functional/cli_root/zpool_split/zpool_split_cliargs.ksh \ functional/cli_root/zpool_split/zpool_split_devices.ksh \ functional/cli_root/zpool_split/zpool_split_dryrun_output.ksh \ functional/cli_root/zpool_split/zpool_split_encryption.ksh \ functional/cli_root/zpool_split/zpool_split_indirect.ksh \ functional/cli_root/zpool_split/zpool_split_props.ksh \ functional/cli_root/zpool_split/zpool_split_resilver.ksh \ functional/cli_root/zpool_split/zpool_split_vdevs.ksh \ functional/cli_root/zpool_split/zpool_split_wholedisk.ksh \ functional/cli_root/zpool_status/cleanup.ksh \ functional/cli_root/zpool_status/setup.ksh \ functional/cli_root/zpool_status/zpool_status_001_pos.ksh \ functional/cli_root/zpool_status/zpool_status_002_pos.ksh \ functional/cli_root/zpool_status/zpool_status_003_pos.ksh \ functional/cli_root/zpool_status/zpool_status_004_pos.ksh \ functional/cli_root/zpool_status/zpool_status_005_pos.ksh \ functional/cli_root/zpool_status/zpool_status_006_pos.ksh \ functional/cli_root/zpool_status/zpool_status_007_pos.ksh \ functional/cli_root/zpool_status/zpool_status_008_pos.ksh \ functional/cli_root/zpool_status/zpool_status_features_001_pos.ksh \ functional/cli_root/zpool_sync/cleanup.ksh \ functional/cli_root/zpool_sync/setup.ksh \ functional/cli_root/zpool_sync/zpool_sync_001_pos.ksh \ functional/cli_root/zpool_sync/zpool_sync_002_neg.ksh \ functional/cli_root/zpool_trim/cleanup.ksh \ functional/cli_root/zpool_trim/setup.ksh \ functional/cli_root/zpool_trim/zpool_trim_attach_detach_add_remove.ksh \ functional/cli_root/zpool_trim/zpool_trim_fault_export_import_online.ksh \ functional/cli_root/zpool_trim/zpool_trim_import_export.ksh \ functional/cli_root/zpool_trim/zpool_trim_multiple.ksh \ functional/cli_root/zpool_trim/zpool_trim_neg.ksh \ functional/cli_root/zpool_trim/zpool_trim_offline_export_import_online.ksh \ functional/cli_root/zpool_trim/zpool_trim_online_offline.ksh \ functional/cli_root/zpool_trim/zpool_trim_partial.ksh \ functional/cli_root/zpool_trim/zpool_trim_rate.ksh \ functional/cli_root/zpool_trim/zpool_trim_rate_neg.ksh \ functional/cli_root/zpool_trim/zpool_trim_secure.ksh \ functional/cli_root/zpool_trim/zpool_trim_split.ksh \ functional/cli_root/zpool_trim/zpool_trim_start_and_cancel_neg.ksh \ functional/cli_root/zpool_trim/zpool_trim_start_and_cancel_pos.ksh \ functional/cli_root/zpool_trim/zpool_trim_suspend_resume.ksh \ functional/cli_root/zpool_trim/zpool_trim_unsupported_vdevs.ksh \ functional/cli_root/zpool_trim/zpool_trim_verify_checksums.ksh \ functional/cli_root/zpool_trim/zpool_trim_verify_trimmed.ksh \ functional/cli_root/zpool_upgrade/cleanup.ksh \ functional/cli_root/zpool_upgrade/setup.ksh \ functional/cli_root/zpool_upgrade/zpool_upgrade_001_pos.ksh \ functional/cli_root/zpool_upgrade/zpool_upgrade_002_pos.ksh \ functional/cli_root/zpool_upgrade/zpool_upgrade_003_pos.ksh \ functional/cli_root/zpool_upgrade/zpool_upgrade_004_pos.ksh \ functional/cli_root/zpool_upgrade/zpool_upgrade_005_neg.ksh \ functional/cli_root/zpool_upgrade/zpool_upgrade_006_neg.ksh \ functional/cli_root/zpool_upgrade/zpool_upgrade_007_pos.ksh \ functional/cli_root/zpool_upgrade/zpool_upgrade_008_pos.ksh \ functional/cli_root/zpool_upgrade/zpool_upgrade_009_neg.ksh \ functional/cli_root/zpool_upgrade/zpool_upgrade_features_001_pos.ksh \ functional/cli_root/zpool_wait/cleanup.ksh \ functional/cli_root/zpool_wait/scan/cleanup.ksh \ functional/cli_root/zpool_wait/scan/setup.ksh \ functional/cli_root/zpool_wait/scan/zpool_wait_rebuild.ksh \ functional/cli_root/zpool_wait/scan/zpool_wait_replace_cancel.ksh \ functional/cli_root/zpool_wait/scan/zpool_wait_replace.ksh \ functional/cli_root/zpool_wait/scan/zpool_wait_resilver.ksh \ functional/cli_root/zpool_wait/scan/zpool_wait_scrub_basic.ksh \ functional/cli_root/zpool_wait/scan/zpool_wait_scrub_cancel.ksh \ functional/cli_root/zpool_wait/scan/zpool_wait_scrub_flag.ksh \ functional/cli_root/zpool_wait/setup.ksh \ functional/cli_root/zpool_wait/zpool_wait_discard.ksh \ functional/cli_root/zpool_wait/zpool_wait_freeing.ksh \ functional/cli_root/zpool_wait/zpool_wait_initialize_basic.ksh \ functional/cli_root/zpool_wait/zpool_wait_initialize_cancel.ksh \ functional/cli_root/zpool_wait/zpool_wait_initialize_flag.ksh \ functional/cli_root/zpool_wait/zpool_wait_multiple.ksh \ functional/cli_root/zpool_wait/zpool_wait_no_activity.ksh \ functional/cli_root/zpool_wait/zpool_wait_remove_cancel.ksh \ functional/cli_root/zpool_wait/zpool_wait_remove.ksh \ functional/cli_root/zpool_wait/zpool_wait_trim_basic.ksh \ functional/cli_root/zpool_wait/zpool_wait_trim_cancel.ksh \ functional/cli_root/zpool_wait/zpool_wait_trim_flag.ksh \ functional/cli_root/zpool_wait/zpool_wait_usage.ksh \ functional/cli_root/zpool/zpool_001_neg.ksh \ functional/cli_root/zpool/zpool_002_pos.ksh \ functional/cli_root/zpool/zpool_003_pos.ksh \ functional/cli_root/zpool/zpool_colors.ksh \ functional/cli_user/misc/arcstat_001_pos.ksh \ functional/cli_user/misc/arc_summary_001_pos.ksh \ functional/cli_user/misc/arc_summary_002_neg.ksh \ functional/cli_user/misc/zilstat_001_pos.ksh \ functional/cli_user/misc/cleanup.ksh \ functional/cli_user/misc/setup.ksh \ functional/cli_user/misc/zdb_001_neg.ksh \ functional/cli_user/misc/zfs_001_neg.ksh \ functional/cli_user/misc/zfs_allow_001_neg.ksh \ functional/cli_user/misc/zfs_clone_001_neg.ksh \ functional/cli_user/misc/zfs_create_001_neg.ksh \ functional/cli_user/misc/zfs_destroy_001_neg.ksh \ functional/cli_user/misc/zfs_get_001_neg.ksh \ functional/cli_user/misc/zfs_inherit_001_neg.ksh \ functional/cli_user/misc/zfs_mount_001_neg.ksh \ functional/cli_user/misc/zfs_promote_001_neg.ksh \ functional/cli_user/misc/zfs_receive_001_neg.ksh \ functional/cli_user/misc/zfs_rename_001_neg.ksh \ functional/cli_user/misc/zfs_rollback_001_neg.ksh \ functional/cli_user/misc/zfs_send_001_neg.ksh \ functional/cli_user/misc/zfs_set_001_neg.ksh \ functional/cli_user/misc/zfs_share_001_neg.ksh \ functional/cli_user/misc/zfs_snapshot_001_neg.ksh \ functional/cli_user/misc/zfs_unallow_001_neg.ksh \ functional/cli_user/misc/zfs_unmount_001_neg.ksh \ functional/cli_user/misc/zfs_unshare_001_neg.ksh \ functional/cli_user/misc/zfs_upgrade_001_neg.ksh \ functional/cli_user/misc/zpool_001_neg.ksh \ functional/cli_user/misc/zpool_add_001_neg.ksh \ functional/cli_user/misc/zpool_attach_001_neg.ksh \ functional/cli_user/misc/zpool_clear_001_neg.ksh \ functional/cli_user/misc/zpool_create_001_neg.ksh \ functional/cli_user/misc/zpool_destroy_001_neg.ksh \ functional/cli_user/misc/zpool_detach_001_neg.ksh \ functional/cli_user/misc/zpool_export_001_neg.ksh \ functional/cli_user/misc/zpool_get_001_neg.ksh \ functional/cli_user/misc/zpool_history_001_neg.ksh \ functional/cli_user/misc/zpool_import_001_neg.ksh \ functional/cli_user/misc/zpool_import_002_neg.ksh \ functional/cli_user/misc/zpool_offline_001_neg.ksh \ functional/cli_user/misc/zpool_online_001_neg.ksh \ functional/cli_user/misc/zpool_remove_001_neg.ksh \ functional/cli_user/misc/zpool_replace_001_neg.ksh \ functional/cli_user/misc/zpool_scrub_001_neg.ksh \ functional/cli_user/misc/zpool_set_001_neg.ksh \ functional/cli_user/misc/zpool_status_001_neg.ksh \ functional/cli_user/misc/zpool_upgrade_001_neg.ksh \ functional/cli_user/misc/zpool_wait_privilege.ksh \ functional/cli_user/zfs_list/cleanup.ksh \ functional/cli_user/zfs_list/setup.ksh \ functional/cli_user/zfs_list/zfs_list_001_pos.ksh \ functional/cli_user/zfs_list/zfs_list_002_pos.ksh \ functional/cli_user/zfs_list/zfs_list_003_pos.ksh \ functional/cli_user/zfs_list/zfs_list_004_neg.ksh \ functional/cli_user/zfs_list/zfs_list_005_neg.ksh \ functional/cli_user/zfs_list/zfs_list_007_pos.ksh \ functional/cli_user/zfs_list/zfs_list_008_neg.ksh \ functional/cli_user/zpool_iostat/cleanup.ksh \ functional/cli_user/zpool_iostat/setup.ksh \ functional/cli_user/zpool_iostat/zpool_iostat_001_neg.ksh \ functional/cli_user/zpool_iostat/zpool_iostat_002_pos.ksh \ functional/cli_user/zpool_iostat/zpool_iostat_003_neg.ksh \ functional/cli_user/zpool_iostat/zpool_iostat_004_pos.ksh \ functional/cli_user/zpool_iostat/zpool_iostat_005_pos.ksh \ functional/cli_user/zpool_iostat/zpool_iostat_-c_disable.ksh \ functional/cli_user/zpool_iostat/zpool_iostat_-c_homedir.ksh \ functional/cli_user/zpool_iostat/zpool_iostat_-c_searchpath.ksh \ functional/cli_user/zpool_list/cleanup.ksh \ functional/cli_user/zpool_list/setup.ksh \ functional/cli_user/zpool_list/zpool_list_001_pos.ksh \ functional/cli_user/zpool_list/zpool_list_002_neg.ksh \ functional/cli_user/zpool_status/cleanup.ksh \ functional/cli_user/zpool_status/setup.ksh \ functional/cli_user/zpool_status/zpool_status_003_pos.ksh \ functional/cli_user/zpool_status/zpool_status_-c_disable.ksh \ functional/cli_user/zpool_status/zpool_status_-c_homedir.ksh \ functional/cli_user/zpool_status/zpool_status_-c_searchpath.ksh \ functional/compression/cleanup.ksh \ functional/compression/compress_001_pos.ksh \ functional/compression/compress_002_pos.ksh \ functional/compression/compress_003_pos.ksh \ functional/compression/compress_004_pos.ksh \ functional/compression/compress_zstd_bswap.ksh \ functional/compression/l2arc_compressed_arc_disabled.ksh \ functional/compression/l2arc_compressed_arc.ksh \ functional/compression/l2arc_encrypted.ksh \ functional/compression/l2arc_encrypted_no_compressed_arc.ksh \ functional/compression/setup.ksh \ functional/cp_files/cleanup.ksh \ functional/cp_files/cp_files_001_pos.ksh \ functional/cp_files/cp_files_002_pos.ksh \ functional/cp_files/cp_stress.ksh \ functional/cp_files/setup.ksh \ functional/crtime/cleanup.ksh \ functional/crtime/crtime_001_pos.ksh \ functional/crtime/setup.ksh \ functional/ctime/cleanup.ksh \ functional/ctime/ctime_001_pos.ksh \ functional/ctime/setup.ksh \ functional/deadman/deadman_ratelimit.ksh \ functional/deadman/deadman_sync.ksh \ functional/deadman/deadman_zio.ksh \ functional/delegate/cleanup.ksh \ functional/delegate/setup.ksh \ functional/delegate/zfs_allow_001_pos.ksh \ functional/delegate/zfs_allow_002_pos.ksh \ functional/delegate/zfs_allow_003_pos.ksh \ functional/delegate/zfs_allow_004_pos.ksh \ functional/delegate/zfs_allow_005_pos.ksh \ functional/delegate/zfs_allow_006_pos.ksh \ functional/delegate/zfs_allow_007_pos.ksh \ functional/delegate/zfs_allow_008_pos.ksh \ functional/delegate/zfs_allow_009_neg.ksh \ functional/delegate/zfs_allow_010_pos.ksh \ functional/delegate/zfs_allow_011_neg.ksh \ functional/delegate/zfs_allow_012_neg.ksh \ functional/delegate/zfs_unallow_001_pos.ksh \ functional/delegate/zfs_unallow_002_pos.ksh \ functional/delegate/zfs_unallow_003_pos.ksh \ functional/delegate/zfs_unallow_004_pos.ksh \ functional/delegate/zfs_unallow_005_pos.ksh \ functional/delegate/zfs_unallow_006_pos.ksh \ functional/delegate/zfs_unallow_007_neg.ksh \ functional/delegate/zfs_unallow_008_neg.ksh \ functional/devices/cleanup.ksh \ functional/devices/devices_001_pos.ksh \ functional/devices/devices_002_neg.ksh \ functional/devices/devices_003_pos.ksh \ functional/devices/setup.ksh \ functional/dos_attributes/cleanup.ksh \ functional/dos_attributes/read_dos_attrs_001.ksh \ functional/dos_attributes/setup.ksh \ functional/dos_attributes/write_dos_attrs_001.ksh \ functional/events/cleanup.ksh \ functional/events/events_001_pos.ksh \ functional/events/events_002_pos.ksh \ functional/events/setup.ksh \ functional/events/zed_cksum_config.ksh \ functional/events/zed_cksum_reported.ksh \ functional/events/zed_fd_spill.ksh \ functional/events/zed_io_config.ksh \ functional/events/zed_rc_filter.ksh \ + functional/events/zed_slow_io.ksh \ + functional/events/zed_slow_io_many_vdevs.ksh \ functional/exec/cleanup.ksh \ functional/exec/exec_001_pos.ksh \ functional/exec/exec_002_neg.ksh \ functional/exec/setup.ksh \ functional/fadvise/cleanup.ksh \ functional/fadvise/fadvise_sequential.ksh \ functional/fadvise/setup.ksh \ functional/fallocate/cleanup.ksh \ functional/fallocate/fallocate_prealloc.ksh \ functional/fallocate/fallocate_punch-hole.ksh \ functional/fallocate/fallocate_zero-range.ksh \ functional/fallocate/setup.ksh \ functional/fault/auto_offline_001_pos.ksh \ functional/fault/auto_online_001_pos.ksh \ functional/fault/auto_online_002_pos.ksh \ functional/fault/auto_replace_001_pos.ksh \ functional/fault/auto_replace_002_pos.ksh \ functional/fault/auto_spare_001_pos.ksh \ functional/fault/auto_spare_002_pos.ksh \ functional/fault/auto_spare_ashift.ksh \ functional/fault/auto_spare_multiple.ksh \ functional/fault/auto_spare_shared.ksh \ functional/fault/cleanup.ksh \ functional/fault/decompress_fault.ksh \ functional/fault/decrypt_fault.ksh \ functional/fault/scrub_after_resilver.ksh \ functional/fault/setup.ksh \ functional/fault/zpool_status_-s.ksh \ functional/features/async_destroy/async_destroy_001_pos.ksh \ functional/features/async_destroy/cleanup.ksh \ functional/features/async_destroy/setup.ksh \ functional/features/large_dnode/cleanup.ksh \ functional/features/large_dnode/large_dnode_001_pos.ksh \ functional/features/large_dnode/large_dnode_002_pos.ksh \ functional/features/large_dnode/large_dnode_003_pos.ksh \ functional/features/large_dnode/large_dnode_004_neg.ksh \ functional/features/large_dnode/large_dnode_005_pos.ksh \ functional/features/large_dnode/large_dnode_006_pos.ksh \ functional/features/large_dnode/large_dnode_007_neg.ksh \ functional/features/large_dnode/large_dnode_008_pos.ksh \ functional/features/large_dnode/large_dnode_009_pos.ksh \ functional/features/large_dnode/setup.ksh \ functional/grow/grow_pool_001_pos.ksh \ functional/grow/grow_replicas_001_pos.ksh \ functional/history/cleanup.ksh \ functional/history/history_001_pos.ksh \ functional/history/history_002_pos.ksh \ functional/history/history_003_pos.ksh \ functional/history/history_004_pos.ksh \ functional/history/history_005_neg.ksh \ functional/history/history_006_neg.ksh \ functional/history/history_007_pos.ksh \ functional/history/history_008_pos.ksh \ functional/history/history_009_pos.ksh \ functional/history/history_010_pos.ksh \ functional/history/setup.ksh \ functional/inheritance/cleanup.ksh \ functional/inheritance/inherit_001_pos.ksh \ functional/inuse/inuse_001_pos.ksh \ functional/inuse/inuse_003_pos.ksh \ functional/inuse/inuse_004_pos.ksh \ functional/inuse/inuse_005_pos.ksh \ functional/inuse/inuse_006_pos.ksh \ functional/inuse/inuse_007_pos.ksh \ functional/inuse/inuse_008_pos.ksh \ functional/inuse/inuse_009_pos.ksh \ functional/inuse/setup.ksh \ functional/io/cleanup.ksh \ functional/io/io_uring.ksh \ functional/io/libaio.ksh \ functional/io/mmap.ksh \ functional/io/posixaio.ksh \ functional/io/psync.ksh \ functional/io/setup.ksh \ functional/io/sync.ksh \ functional/l2arc/cleanup.ksh \ functional/l2arc/l2arc_arcstats_pos.ksh \ functional/l2arc/l2arc_l2miss_pos.ksh \ functional/l2arc/l2arc_mfuonly_pos.ksh \ functional/l2arc/persist_l2arc_001_pos.ksh \ functional/l2arc/persist_l2arc_002_pos.ksh \ functional/l2arc/persist_l2arc_003_neg.ksh \ functional/l2arc/persist_l2arc_004_pos.ksh \ functional/l2arc/persist_l2arc_005_pos.ksh \ functional/l2arc/setup.ksh \ functional/large_files/cleanup.ksh \ functional/large_files/large_files_001_pos.ksh \ functional/large_files/large_files_002_pos.ksh \ functional/large_files/setup.ksh \ functional/largest_pool/largest_pool_001_pos.ksh \ functional/libzfs/cleanup.ksh \ functional/libzfs/libzfs_input.ksh \ functional/libzfs/setup.ksh \ functional/limits/cleanup.ksh \ functional/limits/filesystem_count.ksh \ functional/limits/filesystem_limit.ksh \ functional/limits/setup.ksh \ functional/limits/snapshot_count.ksh \ functional/limits/snapshot_limit.ksh \ functional/link_count/cleanup.ksh \ functional/link_count/link_count_001.ksh \ functional/link_count/link_count_root_inode.ksh \ functional/link_count/setup.ksh \ functional/log_spacemap/log_spacemap_import_logs.ksh \ functional/migration/cleanup.ksh \ functional/migration/migration_001_pos.ksh \ functional/migration/migration_002_pos.ksh \ functional/migration/migration_003_pos.ksh \ functional/migration/migration_004_pos.ksh \ functional/migration/migration_005_pos.ksh \ functional/migration/migration_006_pos.ksh \ functional/migration/migration_007_pos.ksh \ functional/migration/migration_008_pos.ksh \ functional/migration/migration_009_pos.ksh \ functional/migration/migration_010_pos.ksh \ functional/migration/migration_011_pos.ksh \ functional/migration/migration_012_pos.ksh \ functional/migration/setup.ksh \ functional/mmap/cleanup.ksh \ functional/mmap/mmap_libaio_001_pos.ksh \ functional/mmap/mmap_mixed.ksh \ functional/mmap/mmap_read_001_pos.ksh \ functional/mmap/mmap_seek_001_pos.ksh \ functional/mmap/mmap_sync_001_pos.ksh \ functional/mmap/mmap_write_001_pos.ksh \ functional/mmap/setup.ksh \ functional/mmp/cleanup.ksh \ functional/mmp/mmp_active_import.ksh \ functional/mmp/mmp_exported_import.ksh \ functional/mmp/mmp_hostid.ksh \ functional/mmp/mmp_inactive_import.ksh \ functional/mmp/mmp_interval.ksh \ functional/mmp/mmp_on_off.ksh \ functional/mmp/mmp_on_thread.ksh \ functional/mmp/mmp_on_uberblocks.ksh \ functional/mmp/mmp_on_zdb.ksh \ functional/mmp/mmp_reset_interval.ksh \ functional/mmp/mmp_write_distribution.ksh \ functional/mmp/mmp_write_uberblocks.ksh \ functional/mmp/multihost_history.ksh \ functional/mmp/setup.ksh \ functional/mount/cleanup.ksh \ functional/mount/setup.ksh \ functional/mount/umount_001.ksh \ functional/mount/umountall_001.ksh \ functional/mount/umount_unlinked_drain.ksh \ functional/mv_files/cleanup.ksh \ functional/mv_files/mv_files_001_pos.ksh \ functional/mv_files/mv_files_002_pos.ksh \ functional/mv_files/random_creation.ksh \ functional/mv_files/setup.ksh \ functional/nestedfs/cleanup.ksh \ functional/nestedfs/nestedfs_001_pos.ksh \ functional/nestedfs/setup.ksh \ functional/nopwrite/cleanup.ksh \ functional/nopwrite/nopwrite_copies.ksh \ functional/nopwrite/nopwrite_mtime.ksh \ functional/nopwrite/nopwrite_negative.ksh \ functional/nopwrite/nopwrite_promoted_clone.ksh \ functional/nopwrite/nopwrite_recsize.ksh \ functional/nopwrite/nopwrite_sync.ksh \ functional/nopwrite/nopwrite_varying_compression.ksh \ functional/nopwrite/nopwrite_volume.ksh \ functional/nopwrite/setup.ksh \ functional/no_space/cleanup.ksh \ functional/no_space/enospc_001_pos.ksh \ functional/no_space/enospc_002_pos.ksh \ functional/no_space/enospc_003_pos.ksh \ functional/no_space/enospc_df.ksh \ functional/no_space/enospc_ganging.ksh \ functional/no_space/enospc_rm.ksh \ functional/no_space/setup.ksh \ functional/online_offline/cleanup.ksh \ functional/online_offline/online_offline_001_pos.ksh \ functional/online_offline/online_offline_002_neg.ksh \ functional/online_offline/online_offline_003_neg.ksh \ functional/online_offline/setup.ksh \ functional/pam/cleanup.ksh \ functional/pam/pam_basic.ksh \ functional/pam/pam_change_unmounted.ksh \ functional/pam/pam_nounmount.ksh \ functional/pam/pam_recursive.ksh \ functional/pam/pam_short_password.ksh \ functional/pam/setup.ksh \ functional/pool_checkpoint/checkpoint_after_rewind.ksh \ functional/pool_checkpoint/checkpoint_big_rewind.ksh \ functional/pool_checkpoint/checkpoint_capacity.ksh \ functional/pool_checkpoint/checkpoint_conf_change.ksh \ functional/pool_checkpoint/checkpoint_discard_busy.ksh \ functional/pool_checkpoint/checkpoint_discard.ksh \ functional/pool_checkpoint/checkpoint_discard_many.ksh \ functional/pool_checkpoint/checkpoint_indirect.ksh \ functional/pool_checkpoint/checkpoint_invalid.ksh \ functional/pool_checkpoint/checkpoint_lun_expsz.ksh \ functional/pool_checkpoint/checkpoint_open.ksh \ functional/pool_checkpoint/checkpoint_removal.ksh \ functional/pool_checkpoint/checkpoint_rewind.ksh \ functional/pool_checkpoint/checkpoint_ro_rewind.ksh \ functional/pool_checkpoint/checkpoint_sm_scale.ksh \ functional/pool_checkpoint/checkpoint_twice.ksh \ functional/pool_checkpoint/checkpoint_vdev_add.ksh \ functional/pool_checkpoint/checkpoint_zdb.ksh \ functional/pool_checkpoint/checkpoint_zhack_feat.ksh \ functional/pool_checkpoint/cleanup.ksh \ functional/pool_checkpoint/setup.ksh \ functional/pool_names/pool_names_001_pos.ksh \ functional/pool_names/pool_names_002_neg.ksh \ functional/poolversion/cleanup.ksh \ functional/poolversion/poolversion_001_pos.ksh \ functional/poolversion/poolversion_002_pos.ksh \ functional/poolversion/setup.ksh \ functional/privilege/cleanup.ksh \ functional/privilege/privilege_001_pos.ksh \ functional/privilege/privilege_002_pos.ksh \ functional/privilege/setup.ksh \ functional/procfs/cleanup.ksh \ functional/procfs/pool_state.ksh \ functional/procfs/procfs_list_basic.ksh \ functional/procfs/procfs_list_concurrent_readers.ksh \ functional/procfs/procfs_list_stale_read.ksh \ functional/procfs/setup.ksh \ functional/projectquota/cleanup.ksh \ functional/projectquota/projectid_001_pos.ksh \ functional/projectquota/projectid_002_pos.ksh \ functional/projectquota/projectid_003_pos.ksh \ functional/projectquota/projectquota_001_pos.ksh \ functional/projectquota/projectquota_002_pos.ksh \ functional/projectquota/projectquota_003_pos.ksh \ functional/projectquota/projectquota_004_neg.ksh \ functional/projectquota/projectquota_005_pos.ksh \ functional/projectquota/projectquota_006_pos.ksh \ functional/projectquota/projectquota_007_pos.ksh \ functional/projectquota/projectquota_008_pos.ksh \ functional/projectquota/projectquota_009_pos.ksh \ functional/projectquota/projectspace_001_pos.ksh \ functional/projectquota/projectspace_002_pos.ksh \ functional/projectquota/projectspace_003_pos.ksh \ functional/projectquota/projectspace_004_pos.ksh \ functional/projectquota/projecttree_001_pos.ksh \ functional/projectquota/projecttree_002_pos.ksh \ functional/projectquota/projecttree_003_neg.ksh \ functional/projectquota/setup.ksh \ functional/quota/cleanup.ksh \ functional/quota/quota_001_pos.ksh \ functional/quota/quota_002_pos.ksh \ functional/quota/quota_003_pos.ksh \ functional/quota/quota_004_pos.ksh \ functional/quota/quota_005_pos.ksh \ functional/quota/quota_006_neg.ksh \ functional/quota/setup.ksh \ functional/raidz/cleanup.ksh \ functional/raidz/raidz_001_neg.ksh \ functional/raidz/raidz_002_pos.ksh \ functional/raidz/raidz_expand_001_pos.ksh \ functional/raidz/raidz_expand_002_pos.ksh \ functional/raidz/raidz_expand_003_neg.ksh \ functional/raidz/raidz_expand_003_pos.ksh \ functional/raidz/raidz_expand_004_pos.ksh \ functional/raidz/raidz_expand_005_pos.ksh \ functional/raidz/raidz_expand_006_neg.ksh \ functional/raidz/raidz_expand_007_neg.ksh \ functional/raidz/setup.ksh \ functional/redacted_send/cleanup.ksh \ functional/redacted_send/redacted_compressed.ksh \ functional/redacted_send/redacted_contents.ksh \ functional/redacted_send/redacted_deleted.ksh \ functional/redacted_send/redacted_disabled_feature.ksh \ functional/redacted_send/redacted_embedded.ksh \ functional/redacted_send/redacted_holes.ksh \ functional/redacted_send/redacted_incrementals.ksh \ functional/redacted_send/redacted_largeblocks.ksh \ functional/redacted_send/redacted_many_clones.ksh \ functional/redacted_send/redacted_mixed_recsize.ksh \ functional/redacted_send/redacted_mounts.ksh \ functional/redacted_send/redacted_negative.ksh \ functional/redacted_send/redacted_origin.ksh \ functional/redacted_send/redacted_panic.ksh \ functional/redacted_send/redacted_props.ksh \ functional/redacted_send/redacted_resume.ksh \ functional/redacted_send/redacted_size.ksh \ functional/redacted_send/redacted_volume.ksh \ functional/redacted_send/setup.ksh \ functional/redundancy/cleanup.ksh \ functional/redundancy/redundancy_draid1.ksh \ functional/redundancy/redundancy_draid2.ksh \ functional/redundancy/redundancy_draid3.ksh \ functional/redundancy/redundancy_draid_damaged1.ksh \ functional/redundancy/redundancy_draid_damaged2.ksh \ functional/redundancy/redundancy_draid.ksh \ functional/redundancy/redundancy_draid_spare1.ksh \ functional/redundancy/redundancy_draid_spare2.ksh \ functional/redundancy/redundancy_draid_spare3.ksh \ functional/redundancy/redundancy_mirror.ksh \ functional/redundancy/redundancy_raidz1.ksh \ functional/redundancy/redundancy_raidz2.ksh \ functional/redundancy/redundancy_raidz3.ksh \ functional/redundancy/redundancy_raidz.ksh \ functional/redundancy/redundancy_stripe.ksh \ functional/redundancy/setup.ksh \ functional/refquota/cleanup.ksh \ functional/refquota/refquota_001_pos.ksh \ functional/refquota/refquota_002_pos.ksh \ functional/refquota/refquota_003_pos.ksh \ functional/refquota/refquota_004_pos.ksh \ functional/refquota/refquota_005_pos.ksh \ functional/refquota/refquota_006_neg.ksh \ functional/refquota/refquota_007_neg.ksh \ functional/refquota/refquota_008_neg.ksh \ functional/refquota/setup.ksh \ functional/refreserv/cleanup.ksh \ functional/refreserv/refreserv_001_pos.ksh \ functional/refreserv/refreserv_002_pos.ksh \ functional/refreserv/refreserv_003_pos.ksh \ functional/refreserv/refreserv_004_pos.ksh \ functional/refreserv/refreserv_005_pos.ksh \ functional/refreserv/refreserv_multi_raidz.ksh \ functional/refreserv/refreserv_raidz.ksh \ functional/refreserv/setup.ksh \ functional/removal/cleanup.ksh \ functional/removal/removal_all_vdev.ksh \ functional/removal/removal_cancel.ksh \ functional/removal/removal_check_space.ksh \ functional/removal/removal_condense_export.ksh \ functional/removal/removal_multiple_indirection.ksh \ functional/removal/removal_nopwrite.ksh \ functional/removal/removal_remap_deadlists.ksh \ functional/removal/removal_reservation.ksh \ functional/removal/removal_resume_export.ksh \ functional/removal/removal_sanity.ksh \ functional/removal/removal_with_add.ksh \ functional/removal/removal_with_create_fs.ksh \ functional/removal/removal_with_dedup.ksh \ functional/removal/removal_with_errors.ksh \ functional/removal/removal_with_export.ksh \ functional/removal/removal_with_faulted.ksh \ functional/removal/removal_with_ganging.ksh \ functional/removal/removal_with_indirect.ksh \ functional/removal/removal_with_remove.ksh \ functional/removal/removal_with_scrub.ksh \ functional/removal/removal_with_send.ksh \ functional/removal/removal_with_send_recv.ksh \ functional/removal/removal_with_snapshot.ksh \ functional/removal/removal_with_write.ksh \ functional/removal/removal_with_zdb.ksh \ functional/removal/remove_attach_mirror.ksh \ functional/removal/remove_expanded.ksh \ functional/removal/remove_indirect.ksh \ functional/removal/remove_mirror.ksh \ functional/removal/remove_mirror_sanity.ksh \ functional/removal/remove_raidz.ksh \ functional/rename_dirs/cleanup.ksh \ functional/rename_dirs/rename_dirs_001_pos.ksh \ functional/rename_dirs/setup.ksh \ functional/renameat2/cleanup.ksh \ functional/renameat2/setup.ksh \ functional/renameat2/renameat2_exchange.ksh \ functional/renameat2/renameat2_noreplace.ksh \ functional/renameat2/renameat2_whiteout.ksh \ functional/replacement/attach_import.ksh \ functional/replacement/attach_multiple.ksh \ functional/replacement/attach_rebuild.ksh \ functional/replacement/attach_resilver.ksh \ functional/replacement/cleanup.ksh \ functional/replacement/detach.ksh \ functional/replacement/rebuild_disabled_feature.ksh \ functional/replacement/rebuild_multiple.ksh \ functional/replacement/rebuild_raidz.ksh \ functional/replacement/replace_import.ksh \ functional/replacement/replace_rebuild.ksh \ functional/replacement/replace_resilver.ksh \ functional/replacement/resilver_restart_001.ksh \ functional/replacement/resilver_restart_002.ksh \ functional/replacement/scrub_cancel.ksh \ functional/replacement/setup.ksh \ functional/reservation/cleanup.ksh \ functional/reservation/reservation_001_pos.ksh \ functional/reservation/reservation_002_pos.ksh \ functional/reservation/reservation_003_pos.ksh \ functional/reservation/reservation_004_pos.ksh \ functional/reservation/reservation_005_pos.ksh \ functional/reservation/reservation_006_pos.ksh \ functional/reservation/reservation_007_pos.ksh \ functional/reservation/reservation_008_pos.ksh \ functional/reservation/reservation_009_pos.ksh \ functional/reservation/reservation_010_pos.ksh \ functional/reservation/reservation_011_pos.ksh \ functional/reservation/reservation_012_pos.ksh \ functional/reservation/reservation_013_pos.ksh \ functional/reservation/reservation_014_pos.ksh \ functional/reservation/reservation_015_pos.ksh \ functional/reservation/reservation_016_pos.ksh \ functional/reservation/reservation_017_pos.ksh \ functional/reservation/reservation_018_pos.ksh \ functional/reservation/reservation_019_pos.ksh \ functional/reservation/reservation_020_pos.ksh \ functional/reservation/reservation_021_neg.ksh \ functional/reservation/reservation_022_pos.ksh \ functional/reservation/setup.ksh \ functional/rootpool/cleanup.ksh \ functional/rootpool/rootpool_002_neg.ksh \ functional/rootpool/rootpool_003_neg.ksh \ functional/rootpool/rootpool_007_pos.ksh \ functional/rootpool/setup.ksh \ functional/rsend/cleanup.ksh \ functional/rsend/recv_dedup_encrypted_zvol.ksh \ functional/rsend/recv_dedup.ksh \ functional/rsend/rsend_001_pos.ksh \ functional/rsend/rsend_002_pos.ksh \ functional/rsend/rsend_003_pos.ksh \ functional/rsend/rsend_004_pos.ksh \ functional/rsend/rsend_005_pos.ksh \ functional/rsend/rsend_006_pos.ksh \ functional/rsend/rsend_007_pos.ksh \ functional/rsend/rsend_008_pos.ksh \ functional/rsend/rsend_009_pos.ksh \ functional/rsend/rsend_010_pos.ksh \ functional/rsend/rsend_011_pos.ksh \ functional/rsend/rsend_012_pos.ksh \ functional/rsend/rsend_013_pos.ksh \ functional/rsend/rsend_014_pos.ksh \ functional/rsend/rsend_016_neg.ksh \ functional/rsend/rsend_019_pos.ksh \ functional/rsend/rsend_020_pos.ksh \ functional/rsend/rsend_021_pos.ksh \ functional/rsend/rsend_022_pos.ksh \ functional/rsend/rsend_024_pos.ksh \ functional/rsend/rsend_025_pos.ksh \ functional/rsend/rsend_026_neg.ksh \ functional/rsend/rsend_027_pos.ksh \ functional/rsend/rsend_028_neg.ksh \ functional/rsend/rsend_029_neg.ksh \ functional/rsend/rsend_030_pos.ksh \ functional/rsend/rsend_031_pos.ksh \ functional/rsend/send-c_embedded_blocks.ksh \ functional/rsend/send-c_incremental.ksh \ functional/rsend/send-c_lz4_disabled.ksh \ functional/rsend/send-c_mixed_compression.ksh \ functional/rsend/send-c_props.ksh \ functional/rsend/send-c_recv_dedup.ksh \ functional/rsend/send-c_recv_lz4_disabled.ksh \ functional/rsend/send-c_resume.ksh \ functional/rsend/send-c_stream_size_estimate.ksh \ functional/rsend/send-c_verify_contents.ksh \ functional/rsend/send-c_verify_ratio.ksh \ functional/rsend/send-c_volume.ksh \ functional/rsend/send-c_zstream_recompress.ksh \ functional/rsend/send-c_zstreamdump.ksh \ functional/rsend/send-cpL_varied_recsize.ksh \ functional/rsend/send_doall.ksh \ functional/rsend/send_encrypted_incremental.ksh \ functional/rsend/send_encrypted_files.ksh \ functional/rsend/send_encrypted_freeobjects.ksh \ functional/rsend/send_encrypted_hierarchy.ksh \ functional/rsend/send_encrypted_props.ksh \ functional/rsend/send_encrypted_truncated_files.ksh \ functional/rsend/send_freeobjects.ksh \ functional/rsend/send_holds.ksh \ functional/rsend/send_hole_birth.ksh \ functional/rsend/send_invalid.ksh \ functional/rsend/send-L_toggle.ksh \ functional/rsend/send_mixed_raw.ksh \ functional/rsend/send_partial_dataset.ksh \ functional/rsend/send_raw_ashift.ksh \ functional/rsend/send_raw_spill_block.ksh \ functional/rsend/send_raw_large_blocks.ksh \ functional/rsend/send_realloc_dnode_size.ksh \ functional/rsend/send_realloc_encrypted_files.ksh \ functional/rsend/send_realloc_files.ksh \ functional/rsend/send_spill_block.ksh \ functional/rsend/send-wR_encrypted_zvol.ksh \ functional/rsend/setup.ksh \ functional/scrub_mirror/cleanup.ksh \ functional/scrub_mirror/scrub_mirror_001_pos.ksh \ functional/scrub_mirror/scrub_mirror_002_pos.ksh \ functional/scrub_mirror/scrub_mirror_003_pos.ksh \ functional/scrub_mirror/scrub_mirror_004_pos.ksh \ functional/scrub_mirror/setup.ksh \ functional/slog/cleanup.ksh \ functional/slog/setup.ksh \ functional/slog/slog_001_pos.ksh \ functional/slog/slog_002_pos.ksh \ functional/slog/slog_003_pos.ksh \ functional/slog/slog_004_pos.ksh \ functional/slog/slog_005_pos.ksh \ functional/slog/slog_006_pos.ksh \ functional/slog/slog_007_pos.ksh \ functional/slog/slog_008_neg.ksh \ functional/slog/slog_009_neg.ksh \ functional/slog/slog_010_neg.ksh \ functional/slog/slog_011_neg.ksh \ functional/slog/slog_012_neg.ksh \ functional/slog/slog_013_pos.ksh \ functional/slog/slog_014_pos.ksh \ functional/slog/slog_015_neg.ksh \ functional/slog/slog_016_pos.ksh \ functional/slog/slog_replay_fs_001.ksh \ functional/slog/slog_replay_fs_002.ksh \ functional/slog/slog_replay_volume.ksh \ functional/snapshot/cleanup.ksh \ functional/snapshot/clone_001_pos.ksh \ functional/snapshot/rollback_001_pos.ksh \ functional/snapshot/rollback_002_pos.ksh \ functional/snapshot/rollback_003_pos.ksh \ functional/snapshot/setup.ksh \ functional/snapshot/snapshot_001_pos.ksh \ functional/snapshot/snapshot_002_pos.ksh \ functional/snapshot/snapshot_003_pos.ksh \ functional/snapshot/snapshot_004_pos.ksh \ functional/snapshot/snapshot_005_pos.ksh \ functional/snapshot/snapshot_006_pos.ksh \ functional/snapshot/snapshot_007_pos.ksh \ functional/snapshot/snapshot_008_pos.ksh \ functional/snapshot/snapshot_009_pos.ksh \ functional/snapshot/snapshot_010_pos.ksh \ functional/snapshot/snapshot_011_pos.ksh \ functional/snapshot/snapshot_012_pos.ksh \ functional/snapshot/snapshot_013_pos.ksh \ functional/snapshot/snapshot_014_pos.ksh \ functional/snapshot/snapshot_015_pos.ksh \ functional/snapshot/snapshot_016_pos.ksh \ functional/snapshot/snapshot_017_pos.ksh \ functional/snapshot/snapshot_018_pos.ksh \ functional/snapused/cleanup.ksh \ functional/snapused/setup.ksh \ functional/snapused/snapused_001_pos.ksh \ functional/snapused/snapused_002_pos.ksh \ functional/snapused/snapused_003_pos.ksh \ functional/snapused/snapused_004_pos.ksh \ functional/snapused/snapused_005_pos.ksh \ functional/sparse/cleanup.ksh \ functional/sparse/setup.ksh \ functional/sparse/sparse_001_pos.ksh \ functional/stat/cleanup.ksh \ functional/stat/setup.ksh \ functional/stat/stat_001_pos.ksh \ functional/suid/cleanup.ksh \ functional/suid/setup.ksh \ functional/suid/suid_write_to_none.ksh \ functional/suid/suid_write_to_sgid.ksh \ functional/suid/suid_write_to_suid.ksh \ functional/suid/suid_write_to_suid_sgid.ksh \ functional/suid/suid_write_zil_replay.ksh \ functional/trim/autotrim_config.ksh \ functional/trim/autotrim_integrity.ksh \ functional/trim/autotrim_trim_integrity.ksh \ functional/trim/cleanup.ksh \ functional/trim/setup.ksh \ functional/trim/trim_config.ksh \ functional/trim/trim_integrity.ksh \ functional/trim/trim_l2arc.ksh \ functional/truncate/cleanup.ksh \ functional/truncate/setup.ksh \ functional/truncate/truncate_001_pos.ksh \ functional/truncate/truncate_002_pos.ksh \ functional/truncate/truncate_timestamps.ksh \ functional/upgrade/cleanup.ksh \ functional/upgrade/setup.ksh \ functional/upgrade/upgrade_projectquota_001_pos.ksh \ functional/upgrade/upgrade_readonly_pool.ksh \ functional/upgrade/upgrade_userobj_001_pos.ksh \ functional/user_namespace/cleanup.ksh \ functional/user_namespace/setup.ksh \ functional/user_namespace/user_namespace_001.ksh \ functional/user_namespace/user_namespace_002.ksh \ functional/user_namespace/user_namespace_003.ksh \ functional/user_namespace/user_namespace_004.ksh \ functional/userquota/cleanup.ksh \ functional/userquota/groupspace_001_pos.ksh \ functional/userquota/groupspace_002_pos.ksh \ functional/userquota/groupspace_003_pos.ksh \ functional/userquota/setup.ksh \ functional/userquota/userquota_001_pos.ksh \ functional/userquota/userquota_002_pos.ksh \ functional/userquota/userquota_003_pos.ksh \ functional/userquota/userquota_004_pos.ksh \ functional/userquota/userquota_005_neg.ksh \ functional/userquota/userquota_006_pos.ksh \ functional/userquota/userquota_007_pos.ksh \ functional/userquota/userquota_008_pos.ksh \ functional/userquota/userquota_009_pos.ksh \ functional/userquota/userquota_010_pos.ksh \ functional/userquota/userquota_011_pos.ksh \ functional/userquota/userquota_012_neg.ksh \ functional/userquota/userquota_013_pos.ksh \ functional/userquota/userspace_001_pos.ksh \ functional/userquota/userspace_002_pos.ksh \ functional/userquota/userspace_003_pos.ksh \ functional/userquota/userspace_encrypted.ksh \ functional/userquota/userspace_send_encrypted.ksh \ functional/userquota/userspace_encrypted_13709.ksh \ functional/vdev_zaps/cleanup.ksh \ functional/vdev_zaps/setup.ksh \ functional/vdev_zaps/vdev_zaps_001_pos.ksh \ functional/vdev_zaps/vdev_zaps_002_pos.ksh \ functional/vdev_zaps/vdev_zaps_003_pos.ksh \ functional/vdev_zaps/vdev_zaps_004_pos.ksh \ functional/vdev_zaps/vdev_zaps_005_pos.ksh \ functional/vdev_zaps/vdev_zaps_006_pos.ksh \ functional/vdev_zaps/vdev_zaps_007_pos.ksh \ functional/write_dirs/cleanup.ksh \ functional/write_dirs/setup.ksh \ functional/write_dirs/write_dirs_001_pos.ksh \ functional/write_dirs/write_dirs_002_pos.ksh \ functional/xattr/cleanup.ksh \ functional/xattr/setup.ksh \ functional/xattr/xattr_001_pos.ksh \ functional/xattr/xattr_002_neg.ksh \ functional/xattr/xattr_003_neg.ksh \ functional/xattr/xattr_004_pos.ksh \ functional/xattr/xattr_005_pos.ksh \ functional/xattr/xattr_006_pos.ksh \ functional/xattr/xattr_007_neg.ksh \ functional/xattr/xattr_008_pos.ksh \ functional/xattr/xattr_009_neg.ksh \ functional/xattr/xattr_010_neg.ksh \ functional/xattr/xattr_011_pos.ksh \ functional/xattr/xattr_012_pos.ksh \ functional/xattr/xattr_013_pos.ksh \ functional/xattr/xattr_compat.ksh \ functional/zpool_influxdb/cleanup.ksh \ functional/zpool_influxdb/setup.ksh \ functional/zpool_influxdb/zpool_influxdb.ksh \ functional/zvol/zvol_cli/cleanup.ksh \ functional/zvol/zvol_cli/setup.ksh \ functional/zvol/zvol_cli/zvol_cli_001_pos.ksh \ functional/zvol/zvol_cli/zvol_cli_002_pos.ksh \ functional/zvol/zvol_cli/zvol_cli_003_neg.ksh \ functional/zvol/zvol_ENOSPC/cleanup.ksh \ functional/zvol/zvol_ENOSPC/setup.ksh \ functional/zvol/zvol_ENOSPC/zvol_ENOSPC_001_pos.ksh \ functional/zvol/zvol_misc/cleanup.ksh \ functional/zvol/zvol_misc/setup.ksh \ functional/zvol/zvol_misc/zvol_misc_001_neg.ksh \ functional/zvol/zvol_misc/zvol_misc_002_pos.ksh \ functional/zvol/zvol_misc/zvol_misc_003_neg.ksh \ functional/zvol/zvol_misc/zvol_misc_004_pos.ksh \ functional/zvol/zvol_misc/zvol_misc_005_neg.ksh \ functional/zvol/zvol_misc/zvol_misc_006_pos.ksh \ functional/zvol/zvol_misc/zvol_misc_fua.ksh \ functional/zvol/zvol_misc/zvol_misc_hierarchy.ksh \ functional/zvol/zvol_misc/zvol_misc_rename_inuse.ksh \ functional/zvol/zvol_misc/zvol_misc_snapdev.ksh \ functional/zvol/zvol_misc/zvol_misc_trim.ksh \ functional/zvol/zvol_misc/zvol_misc_volmode.ksh \ functional/zvol/zvol_misc/zvol_misc_zil.ksh \ functional/zvol/zvol_stress/cleanup.ksh \ functional/zvol/zvol_stress/setup.ksh \ functional/zvol/zvol_stress/zvol_stress.ksh \ functional/zvol/zvol_swap/cleanup.ksh \ functional/zvol/zvol_swap/setup.ksh \ functional/zvol/zvol_swap/zvol_swap_001_pos.ksh \ functional/zvol/zvol_swap/zvol_swap_002_pos.ksh \ functional/zvol/zvol_swap/zvol_swap_003_pos.ksh \ functional/zvol/zvol_swap/zvol_swap_004_pos.ksh \ functional/zvol/zvol_swap/zvol_swap_005_pos.ksh \ functional/zvol/zvol_swap/zvol_swap_006_pos.ksh \ functional/idmap_mount/cleanup.ksh \ functional/idmap_mount/setup.ksh \ functional/idmap_mount/idmap_mount_001.ksh \ functional/idmap_mount/idmap_mount_002.ksh \ functional/idmap_mount/idmap_mount_003.ksh \ functional/idmap_mount/idmap_mount_004.ksh \ functional/idmap_mount/idmap_mount_005.ksh diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_get/vdev_get.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_get/vdev_get.cfg index 71a64d4fae7a..c3b9efd6464a 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_get/vdev_get.cfg +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_get/vdev_get.cfg @@ -1,73 +1,75 @@ # # CDDL HEADER START # # The contents of this file are subject to the terms of the # Common Development and Distribution License (the "License"). # You may not use this file except in compliance with the License. # # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE # or https://opensource.org/licenses/CDDL-1.0. # See the License for the specific language governing permissions # and limitations under the License. # # When distributing Covered Code, include this CDDL HEADER in each # file and include the License file at usr/src/OPENSOLARIS.LICENSE. # If applicable, add the following below this CDDL HEADER, with the # fields enclosed by brackets "[]" replaced with your own identifying # information: Portions Copyright [yyyy] [name of copyright owner] # # CDDL HEADER END # # # Copyright (c) 2022, Klara Inc. # # Set the expected properties of a vdev typeset -a properties=( capacity state guid asize psize ashift size free allocated comment expandsize fragmentation bootsize parity path devid physpath encpath fru parent children numchildren read_errors write_errors checksum_errors initialize_errors null_ops read_ops write_ops free_ops claim_ops trim_ops null_bytes read_bytes write_bytes free_bytes claim_bytes trim_bytes removing allocating failfast checksum_n checksum_t io_n io_t + slow_io_n + slow_io_t ) diff --git a/tests/zfs-tests/tests/functional/events/cleanup.ksh b/tests/zfs-tests/tests/functional/events/cleanup.ksh index ef6e098cf42a..669b8ae99456 100755 --- a/tests/zfs-tests/tests/functional/events/cleanup.ksh +++ b/tests/zfs-tests/tests/functional/events/cleanup.ksh @@ -1,33 +1,35 @@ #!/bin/ksh -p # # CDDL HEADER START # # The contents of this file are subject to the terms of the # Common Development and Distribution License (the "License"). # You may not use this file except in compliance with the License. # # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE # or https://opensource.org/licenses/CDDL-1.0. # See the License for the specific language governing permissions # and limitations under the License. # # When distributing Covered Code, include this CDDL HEADER in each # file and include the License file at usr/src/OPENSOLARIS.LICENSE. # If applicable, add the following below this CDDL HEADER, with the # fields enclosed by brackets "[]" replaced with your own identifying # information: Portions Copyright [yyyy] [name of copyright owner] # # CDDL HEADER END # # # Copyright (c) 2017 by Lawrence Livermore National Security, LLC. # . $STF_SUITE/include/libtest.shlib +zed_stop + zed_cleanup all-debug.sh all-syslog.sh all-dumpfds -zed_stop +zed_events_drain default_cleanup diff --git a/tests/zfs-tests/tests/functional/events/zed_slow_io.ksh b/tests/zfs-tests/tests/functional/events/zed_slow_io.ksh new file mode 100755 index 000000000000..d9fabb2c3bc9 --- /dev/null +++ b/tests/zfs-tests/tests/functional/events/zed_slow_io.ksh @@ -0,0 +1,205 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023, Klara Inc. +# + +# DESCRIPTION: +# Verify that vdev properties, slow_io_n and slow_io_t, work with ZED. +# +# STRATEGY: +# 1. Create a pool with single vdev +# 2. Set slow_io_n/slow_io_t to non-default values +# 3. Inject slow io errors +# 4. Verify that ZED degrades vdev +# + +. $STF_SUITE/include/libtest.shlib + +TESTDIR="$TEST_BASE_DIR/zed_slow_io" +VDEV="$TEST_BASE_DIR/vdevfile.$$" +TESTPOOL="slow_io_pool" +FILEPATH="$TESTDIR/slow_io.testfile" + +OLD_SLOW_IO=$(get_tunable ZIO_SLOW_IO_MS) +OLD_SLOW_IO_EVENTS=$(get_tunable SLOW_IO_EVENTS_PER_SECOND) + +verify_runnable "both" + +function do_setup +{ + log_must truncate -s 1G $VDEV + default_setup_noexit $VDEV + zed_events_drain + log_must zfs set compression=off $TESTPOOL + log_must zfs set primarycache=none $TESTPOOL + log_must zfs set prefetch=none $TESTPOOL + log_must zfs set recordsize=512 $TESTPOOL + for i in {1..10}; do + dd if=/dev/urandom of=${FILEPATH}$i bs=512 count=1 2>/dev/null + done + zpool sync +} + +# intermediate cleanup +function do_clean +{ + log_must zinject -c all + log_must zpool destroy $TESTPOOL + log_must rm -f $VDEV +} + +# final cleanup +function cleanup +{ + log_must zinject -c all + + # if pool still exists then something failed so log additional info + if poolexists $TESTPOOL ; then + log_note "$(zpool status -s $TESTPOOL)" + echo "=================== zed log search ===================" + grep "Diagnosis Engine" $ZEDLET_DIR/zed.log + destroy_pool $TESTPOOL + fi + log_must zed_stop + + log_must rm -f $VDEV + + log_must set_tunable64 ZIO_SLOW_IO_MS $OLD_SLOW_IO + log_must set_tunable64 SLOW_IO_EVENTS_PER_SECOND $OLD_SLOW_IO_EVENTS +} + +function start_slow_io +{ + zpool sync + log_must set_tunable64 ZIO_SLOW_IO_MS 10 + log_must set_tunable64 SLOW_IO_EVENTS_PER_SECOND 1000 + + log_must zinject -d $VDEV -D10:1 -T read $TESTPOOL + zpool sync +} + +function stop_slow_io +{ + log_must set_tunable64 ZIO_SLOW_IO_MS $OLD_SLOW_IO + log_must set_tunable64 SLOW_IO_EVENTS_PER_SECOND $OLD_SLOW_IO_EVENTS + + log_must zinject -c all +} + +# Test default ZED settings: +# inject 10 events over 2.5 seconds, should not degrade. +function default_degrade +{ + do_setup + + start_slow_io + for i in {1..10}; do + dd if=${FILEPATH}$i of=/dev/null count=1 bs=512 2>/dev/null + sleep 0.25 + done + stop_slow_io + log_note "$(zpool status -s $TESTPOOL)" + + # give slow ZED a chance to process the delay events + sleep 18 + log_note "$(zpool status -s $TESTPOOL)" + + degrades=$(grep "zpool_vdev_degrade" $ZEDLET_DIR/zed.log | wc -l) + log_note $degrades vdev degrades in ZED log + [ $degrades -eq "0" ] || \ + log_fail "expecting no degrade events, found $degrades" + + do_clean +} + +# change slow_io_n, slow_io_t to 5 events in 60 seconds +# fire more than 5 events, should degrade +function slow_io_degrade +{ + do_setup + + zpool set slow_io_n=5 $TESTPOOL $VDEV + zpool set slow_io_t=60 $TESTPOOL $VDEV + + start_slow_io + for i in {1..16}; do + dd if=${FILEPATH}$i of=/dev/null count=1 bs=512 2>/dev/null + sleep 0.5 + done + stop_slow_io + zpool sync + + # + # wait up to 60 seconds for kernel to produce at least 5 delay events + # + typeset -i i=0 + typeset -i events=0 + while [[ $i -lt 60 ]]; do + events=$(zpool events | grep "ereport\.fs\.zfs.delay" | wc -l) + [[ $events -ge "5" ]] && break + i=$((i+1)) + sleep 1 + done + log_note "$events delay events found" + + if [[ $events -ge "5" ]]; then + log_must wait_vdev_state $TESTPOOL $VDEV "DEGRADED" 10 + fi + + do_clean +} + +# change slow_io_n, slow_io_t to 10 events in 1 second +# inject events spaced 0.5 seconds apart, should not degrade +function slow_io_no_degrade +{ + do_setup + + zpool set slow_io_n=10 $TESTPOOL $VDEV + zpool set slow_io_t=1 $TESTPOOL $VDEV + + start_slow_io + for i in {1..16}; do + dd if=${FILEPATH}$i of=/dev/null count=1 bs=512 2>/dev/null + sleep 0.5 + done + stop_slow_io + zpool sync + + log_mustnot wait_vdev_state $TESTPOOL $VDEV "DEGRADED" 45 + + do_clean +} + +log_assert "Test ZED slow io configurability" +log_onexit cleanup + +log_must zed_events_drain +log_must zed_start + +default_degrade +slow_io_degrade +slow_io_no_degrade + +log_pass "Test ZED slow io configurability" diff --git a/tests/zfs-tests/tests/functional/events/zed_slow_io_many_vdevs.ksh b/tests/zfs-tests/tests/functional/events/zed_slow_io_many_vdevs.ksh new file mode 100755 index 000000000000..3357ae2e3510 --- /dev/null +++ b/tests/zfs-tests/tests/functional/events/zed_slow_io_many_vdevs.ksh @@ -0,0 +1,177 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023, Klara Inc. +# + +# DESCRIPTION: +# Verify that delay events from multiple vdevs doesnt degrade +# +# STRATEGY: +# 1. Create a pool with a 3 disk raidz vdev +# 2. Inject slow io errors +# 3. Verify that ZED detects slow I/Os but doesn't degrade any vdevs +# + +. $STF_SUITE/include/libtest.shlib + +TESTDIR="$TEST_BASE_DIR/zed_slow_io" +VDEV1="$TEST_BASE_DIR/vdevfile1.$$" +VDEV2="$TEST_BASE_DIR/vdevfile2.$$" +VDEV3="$TEST_BASE_DIR/vdevfile3.$$" +VDEV4="$TEST_BASE_DIR/vdevfile4.$$" +VDEVS="$VDEV1 $VDEV2 $VDEV3 $VDEV4" +TESTPOOL="slow_io_pool" +FILEPATH="$TESTDIR/slow_io.testfile" + +OLD_SLOW_IO=$(get_tunable ZIO_SLOW_IO_MS) +OLD_SLOW_IO_EVENTS=$(get_tunable SLOW_IO_EVENTS_PER_SECOND) + +verify_runnable "both" + +function cleanup +{ + log_must zinject -c all + + # if pool still exists then something failed so log additional info + if poolexists $TESTPOOL ; then + log_note "$(zpool status -s $TESTPOOL)" + echo "=================== zed log search ===================" + grep "Diagnosis Engine" $ZEDLET_DIR/zed.log + destroy_pool $TESTPOOL + fi + log_must zed_stop + + log_must rm -f $VDEVS + log_must set_tunable64 ZIO_SLOW_IO_MS $OLD_SLOW_IO + log_must set_tunable64 SLOW_IO_EVENTS_PER_SECOND $OLD_SLOW_IO_EVENTS +} + +function start_slow_io +{ + for vdev in $VDEVS + do + log_must zpool set slow_io_n=4 $TESTPOOL $vdev + log_must zpool set slow_io_t=60 $TESTPOOL $vdev + done + zpool sync + + log_must set_tunable64 ZIO_SLOW_IO_MS 10 + log_must set_tunable64 SLOW_IO_EVENTS_PER_SECOND 1000 + + for vdev in $VDEVS + do + log_must zinject -d $vdev -D10:1 $TESTPOOL + done + zpool sync +} + +function stop_slow_io +{ + log_must set_tunable64 ZIO_SLOW_IO_MS $OLD_SLOW_IO + log_must set_tunable64 SLOW_IO_EVENTS_PER_SECOND $OLD_SLOW_IO_EVENTS + + log_must zinject -c all +} + +function multiple_slow_vdevs_test +{ + log_must truncate -s 1G $VDEVS + default_raidz_setup_noexit $VDEVS + + log_must zpool events -c + log_must zfs set compression=off $TESTPOOL + log_must zfs set primarycache=none $TESTPOOL + log_must zfs set recordsize=4K $TESTPOOL + + log_must dd if=/dev/urandom of=$FILEPATH bs=1M count=20 + zpool sync + + # + # Read the file with slow io injected on the disks + # This will cause multiple errors on each disk to trip ZED SERD + # + # pool: slow_io_pool + # state: ONLINE + # config: + # + # NAME STATE READ WRITE CKSUM SLOW + # slow_io_pool ONLINE 0 0 0 - + # raidz1-0 ONLINE 0 0 0 - + # /var/tmp/vdevfile1.499278 ONLINE 0 0 0 113 + # /var/tmp/vdevfile2.499278 ONLINE 0 0 0 109 + # /var/tmp/vdevfile3.499278 ONLINE 0 0 0 96 + # /var/tmp/vdevfile4.499278 ONLINE 0 0 0 109 + # + start_slow_io + dd if=$FILEPATH of=/dev/null bs=1M count=20 2>/dev/null + stop_slow_io + + # count events available for processing + typeset -i i=0 + typeset -i events=0 + while [[ $i -lt 60 ]]; do + events=$(zpool events | grep "ereport\.fs\.zfs.delay" | wc -l) + [[ $events -ge "50" ]] && break + i=$((i+1)) + sleep 1 + done + log_note "$events delay events found" + if [[ $events -lt "50" ]]; then + log_note "bailing: not enough events to complete the test" + destroy_pool $TESTPOOL + return + fi + + # + # give slow ZED a chance to process the delay events + # + typeset -i i=0 + typeset -i skips=0 + while [[ $i -lt 75 ]]; do + skips=$(grep "retiring case" \ + $ZEDLET_DIR/zed.log | wc -l) + [[ $skips -gt "0" ]] && break + i=$((i+1)) + sleep 1 + done + + log_note $skips degrade skips in ZED log after $i seconds + [ $skips -gt "0" ] || log_fail "expecting to see skips" + + degrades=$(grep "zpool_vdev_degrade" $ZEDLET_DIR/zed.log | wc -l) + log_note $degrades vdev degrades in ZED log + [ $degrades -eq "0" ] || \ + log_fail "expecting no degrade events, found $degrades" + + destroy_pool $TESTPOOL +} + +log_assert "Test ZED slow io across multiple vdevs" +log_onexit cleanup + +log_must zed_events_drain +log_must zed_start +multiple_slow_vdevs_test + +log_pass "Test ZED slow io across multiple vdevs" diff --git a/tests/zfs-tests/tests/functional/fault/cleanup.ksh b/tests/zfs-tests/tests/functional/fault/cleanup.ksh index 654343c0cf00..2959236b59a3 100755 --- a/tests/zfs-tests/tests/functional/fault/cleanup.ksh +++ b/tests/zfs-tests/tests/functional/fault/cleanup.ksh @@ -1,36 +1,37 @@ #!/bin/ksh -p # # CDDL HEADER START # # The contents of this file are subject to the terms of the # Common Development and Distribution License (the "License"). # You may not use this file except in compliance with the License. # # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE # or https://opensource.org/licenses/CDDL-1.0. # See the License for the specific language governing permissions # and limitations under the License. # # When distributing Covered Code, include this CDDL HEADER in each # file and include the License file at usr/src/OPENSOLARIS.LICENSE. # If applicable, add the following below this CDDL HEADER, with the # fields enclosed by brackets "[]" replaced with your own identifying # information: Portions Copyright [yyyy] [name of copyright owner] # # CDDL HEADER END # # Copyright (c) 2016, 2017 by Intel Corporation. All rights reserved. # . $STF_SUITE/include/libtest.shlib . $STF_SUITE/tests/functional/fault/fault.cfg verify_runnable "global" cleanup_devices $DISKS zed_stop zed_cleanup resilver_finish-start-scrub.sh +zed_events_drain log_pass diff --git a/tests/zfs-tests/tests/functional/fault/setup.ksh b/tests/zfs-tests/tests/functional/fault/setup.ksh index 62f1c8ab56cb..61b9206ec1a6 100755 --- a/tests/zfs-tests/tests/functional/fault/setup.ksh +++ b/tests/zfs-tests/tests/functional/fault/setup.ksh @@ -1,34 +1,35 @@ #!/bin/ksh -p # # CDDL HEADER START # # The contents of this file are subject to the terms of the # Common Development and Distribution License (the "License"). # You may not use this file except in compliance with the License. # # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE # or https://opensource.org/licenses/CDDL-1.0. # See the License for the specific language governing permissions # and limitations under the License. # # When distributing Covered Code, include this CDDL HEADER in each # file and include the License file at usr/src/OPENSOLARIS.LICENSE. # If applicable, add the following below this CDDL HEADER, with the # fields enclosed by brackets "[]" replaced with your own identifying # information: Portions Copyright [yyyy] [name of copyright owner] # # CDDL HEADER END # # Copyright (c) 2016, 2017 by Intel Corporation. All rights reserved. # . $STF_SUITE/include/libtest.shlib . $STF_SUITE/tests/functional/fault/fault.cfg verify_runnable "global" +zed_events_drain zed_setup resilver_finish-start-scrub.sh zed_start log_pass