diff --git a/cddl/lib/libzpool/Makefile b/cddl/lib/libzpool/Makefile index 819f67ceec5a..6893e31ff20a 100644 --- a/cddl/lib/libzpool/Makefile +++ b/cddl/lib/libzpool/Makefile @@ -1,267 +1,267 @@ # $FreeBSD$ ZFSTOP= ${SRCTOP}/sys/contrib/openzfs # ZFS_COMMON_SRCS .PATH: ${ZFSTOP}/module/zfs .PATH: ${ZFSTOP}/module/zcommon .PATH: ${ZFSTOP}/module/unicode # LUA_SRCS .PATH: ${ZFSTOP}/module/lua # ZSTD_SRCS .PATH: ${ZFSTOP}/module/zstd .PATH: ${ZFSTOP}/module/zstd/lib .PATH: ${ZFSTOP}/module/os/linux/zfs .PATH: ${ZFSTOP}/lib/libzpool .if exists(${SRCTOP}/sys/cddl/contrib/opensolaris/common/atomic/${MACHINE_ARCH}/opensolaris_atomic.S) .PATH: ${SRCTOP}/sys/cddl/contrib/opensolaris/common/atomic/${MACHINE_ARCH} ATOMIC_SRCS= opensolaris_atomic.S ACFLAGS+= -Wa,--noexecstack .else .PATH: ${SRCTOP}/sys/cddl/compat/opensolaris/kern ATOMIC_SRCS= opensolaris_atomic.c .endif .if ${MACHINE_ARCH} == "powerpc" || ${MACHINE_ARCH} == "powerpcspe" # Don't waste GOT entries on small data. PICFLAG= -fPIC .endif LIB= zpool USER_C = \ kernel.c \ taskq.c \ util.c KERNEL_C = \ zfeature_common.c \ zfs_comutil.c \ zfs_deleg.c \ zfs_fletcher.c \ zfs_fletcher_superscalar.c \ zfs_fletcher_superscalar4.c \ zfs_namecheck.c \ zfs_prop.c \ zfs_zstd.c \ zpool_prop.c \ zprop_common.c \ abd.c \ abd_os.c \ aggsum.c \ arc.c \ arc_os.c \ blkptr.c \ bplist.c \ bpobj.c \ bptree.c \ btree.c \ bqueue.c \ cityhash.c \ dbuf.c \ dbuf_stats.c \ ddt.c \ ddt_zap.c \ dmu.c \ dmu_diff.c \ dmu_object.c \ dmu_objset.c \ dmu_recv.c \ dmu_redact.c \ dmu_send.c \ dmu_traverse.c \ dmu_tx.c \ dmu_zfetch.c \ dnode.c \ dnode_sync.c \ dsl_bookmark.c \ dsl_dataset.c \ dsl_deadlist.c \ dsl_deleg.c \ dsl_dir.c \ dsl_crypt.c \ dsl_pool.c \ dsl_prop.c \ dsl_scan.c \ dsl_synctask.c \ dsl_destroy.c \ dsl_userhold.c \ edonr_zfs.c \ hkdf.c \ fm.c \ gzip.c \ lzjb.c \ lz4.c \ metaslab.c \ mmp.c \ multilist.c \ objlist.c \ pathname.c \ range_tree.c \ refcount.c \ rrwlock.c \ sa.c \ sha256.c \ skein_zfs.c \ spa.c \ spa_boot.c \ spa_checkpoint.c \ spa_config.c \ spa_errlog.c \ spa_history.c \ spa_log_spacemap.c \ spa_misc.c \ spa_stats.c \ space_map.c \ space_reftree.c \ txg.c \ trace.c \ uberblock.c \ unique.c \ vdev.c \ vdev_cache.c \ vdev_draid.c \ vdev_draid_rand.c \ vdev_file.c \ vdev_indirect_births.c \ vdev_indirect.c \ vdev_indirect_mapping.c \ vdev_initialize.c \ vdev_label.c \ vdev_mirror.c \ vdev_missing.c \ vdev_queue.c \ vdev_raidz.c \ vdev_raidz_math_aarch64_neon.c \ vdev_raidz_math_aarch64_neonx2.c \ vdev_raidz_math_avx2.c \ vdev_raidz_math_avx512bw.c \ vdev_raidz_math_avx512f.c \ vdev_raidz_math.c \ vdev_raidz_math_scalar.c \ vdev_rebuild.c \ vdev_removal.c \ vdev_root.c \ vdev_trim.c \ zap.c \ zap_leaf.c \ zap_micro.c \ zcp.c \ zcp_get.c \ zcp_global.c \ zcp_iter.c \ zcp_set.c \ zcp_synctask.c \ zfeature.c \ zfs_byteswap.c \ zfs_debug.c \ zfs_fm.c \ zfs_fuid.c \ - zfs_racct.c \ zfs_sa.c \ zfs_znode.c \ + zfs_racct.c \ zfs_ratelimit.c \ zfs_rlock.c \ zil.c \ zio.c \ zio_checksum.c \ zio_compress.c \ zio_crypt.c \ zio_inject.c \ zle.c \ zrlock.c \ zstd.c \ zthr.c ARCH_C = .if ${MACHINE_ARCH} == "amd64" || ${MACHINE_ARCH} == "i386" ARCH_C += vdev_raidz_math_sse2.c \ vdev_raidz_math_ssse3.c \ zfs_fletcher_intel.c \ zfs_fletcher_sse.c CFLAGS += -DHAVE_SSE2 -DHAVE_SSE3 .endif .if ${MACHINE_ARCH} == "amd64" ARCH_C += zfs_fletcher_avx512.c CFLAGS+= -DHAVE_AVX2 -DHAVE_AVX -D__x86_64 -DHAVE_AVX512F \ -DHAVE_AVX512BW .endif .if ${MACHINE_CPUARCH} == "aarch64" ARCH_C += zfs_fletcher_aarch64_neon.c .endif LUA_C = \ lapi.c \ lauxlib.c \ lbaselib.c \ lcode.c \ 
	lcompat.c \
	lcorolib.c \
	lctype.c \
	ldebug.c \
	ldo.c \
	lfunc.c \
	lgc.c \
	llex.c \
	lmem.c \
	lobject.c \
	lopcodes.c \
	lparser.c \
	lstate.c \
	lstring.c \
	lstrlib.c \
	ltable.c \
	ltablib.c \
	ltm.c \
	lvm.c \
	lzio.c

UNICODE_C = u8_textprep.c uconv.c

SRCS= ${USER_C} ${KERNEL_C} ${LUA_C} ${UNICODE_C} ${ARCH_C}

WARNS?= 2
CFLAGS+= \
	-DIN_BASE \
	-I${ZFSTOP}/include \
	-I${ZFSTOP}/lib/libspl/include \
	-I${ZFSTOP}/lib/libspl/include/os/freebsd \
	-I${SRCTOP}/sys \
	-I${ZFSTOP}/include/os/freebsd/zfs \
	-I${SRCTOP}/cddl/compat/opensolaris/include \
	-I${ZFSTOP}/module/icp/include \
	-include ${ZFSTOP}/include/os/freebsd/spl/sys/ccompile.h \
	-DHAVE_ISSETUGID \
	-include ${SRCTOP}/sys/modules/zfs/zfs_config.h \
	-I${SRCTOP}/sys/modules/zfs \
	-I${ZFSTOP}/include/os/freebsd/zfs \
	-DLIB_ZPOOL_BUILD -DZFS_DEBUG \

# XXX: pthread doesn't have mutex_owned() equivalent, so we need to look
# into libthr private structures. That's sooo evil, but it's only for
# ZFS debugging tools needs.
CFLAGS+= -DWANTS_MUTEX_OWNED
CFLAGS+= -I${SRCTOP}/lib/libpthread/thread
CFLAGS+= -I${SRCTOP}/lib/libpthread/sys
CFLAGS+= -I${SRCTOP}/lib/libthr/arch/${MACHINE_CPUARCH}/include
CFLAGS.gcc+= -fms-extensions

LIBADD=		md pthread z spl icp nvpair avl umem

# atomic.S doesn't like profiling.
MK_PROFILE=	no

CSTD=	c99

# Since there are many asserts in this library, it makes no sense to compile
# it without debugging.
CFLAGS+= -g -DDEBUG=1
CFLAGS.zfs_zstd.c= -Wno-cast-qual -Wno-pointer-arith
CFLAGS.zstd.c+= -fno-tree-vectorize

.include

diff --git a/stand/libsa/zfs/zstd_shim.c b/stand/libsa/zfs/zstd_shim.c
index b94df6f51913..91f5171a72b5 100644
--- a/stand/libsa/zfs/zstd_shim.c
+++ b/stand/libsa/zfs/zstd_shim.c
@@ -1,40 +1,45 @@
/*-
 * Copyright (c) 2020 M. Warner Losh
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

#include
__FBSDID("$FreeBSD$");

/*
 * Small amount of shim code needed to get zfs_zstd.c to compile. These items
 * here should all be defined in the SPL or as part of libstand somewhere, but
 * aren't for reasons that haven't been tracked down yet. Ideally, they would
 * all go away and we'd compile zfs_zstd.c directly. Based on an original by
 * Matt Macey, but only the #include remains untouched from that.
 */
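The '+' lines just below lean on a standard C preprocessor idiom: pre-defining a header's include guard makes any later inclusion of that header expand to nothing, because its guarded body is skipped. A minimal, self-contained sketch of the idea, using a made-up DEMO_HEADER_H guard rather than anything from OpenZFS:

#include <stdio.h>

#define DEMO_HEADER_H			/* claim the guard before the "header" is seen */

/* Stand-in for the header body we want to suppress; it is skipped. */
#ifndef DEMO_HEADER_H
#define DEMO_HEADER_H
int kernel_only_helper(void);		/* would not exist in a boot loader */
#endif

int
main(void)
{
	printf("guard pre-defined, header body skipped\n");
	return (0);
}

The added _SYS_UIO_IMPL_H definition appears to apply the same trick, so that any later pull-in of uio_impl.h (per the TODO comment, a header the loader environment must not see) becomes a no-op.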
#define ZFS_MODULE_PARAM_ARGS	void

typedef int boolean_t;	/* This one may be tough to get rid of */

+/* TODO: openzfs/include/sys/uio_impl.h must not be included */
+#ifndef _SYS_UIO_IMPL_H
+#define _SYS_UIO_IMPL_H
+#endif
+
#include

diff --git a/sys/contrib/openzfs/cmd/zfs/zfs_main.c b/sys/contrib/openzfs/cmd/zfs/zfs_main.c
index ab2b006ae460..9a59fddbf06d 100644
--- a/sys/contrib/openzfs/cmd/zfs/zfs_main.c
+++ b/sys/contrib/openzfs/cmd/zfs/zfs_main.c
@@ -1,8762 +1,8759 @@
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
 * Copyright 2012 Milan Jurik. All rights reserved.
 * Copyright (c) 2012, Joyent, Inc. All rights reserved.
 * Copyright (c) 2013 Steven Hartland. All rights reserved.
 * Copyright 2016 Igor Kozhukhov .
 * Copyright 2016 Nexenta Systems, Inc.
 * Copyright (c) 2019 Datto Inc.
 * Copyright (c) 2019, loli10K
 * Copyright 2019 Joyent, Inc.
 * Copyright (c) 2019, 2020 by Christian Schwarz. All rights reserved.
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HAVE_IDMAP #include #include #endif /* HAVE_IDMAP */ #include "zfs_iter.h" #include "zfs_util.h" #include "zfs_comutil.h" #include "libzfs_impl.h" #include "zfs_projectutil.h" libzfs_handle_t *g_zfs; static FILE *mnttab_file; static char history_str[HIS_MAX_RECORD_LEN]; static boolean_t log_history = B_TRUE; static int zfs_do_clone(int argc, char **argv); static int zfs_do_create(int argc, char **argv); static int zfs_do_destroy(int argc, char **argv); static int zfs_do_get(int argc, char **argv); static int zfs_do_inherit(int argc, char **argv); static int zfs_do_list(int argc, char **argv); static int zfs_do_mount(int argc, char **argv); static int zfs_do_rename(int argc, char **argv); static int zfs_do_rollback(int argc, char **argv); static int zfs_do_set(int argc, char **argv); static int zfs_do_upgrade(int argc, char **argv); static int zfs_do_snapshot(int argc, char **argv); static int zfs_do_unmount(int argc, char **argv); static int zfs_do_share(int argc, char **argv); static int zfs_do_unshare(int argc, char **argv); static int zfs_do_send(int argc, char **argv); static int zfs_do_receive(int argc, char **argv); static int zfs_do_promote(int argc, char **argv); static int zfs_do_userspace(int argc, char **argv); static int zfs_do_allow(int argc, char **argv); static int zfs_do_unallow(int argc, char **argv); static int zfs_do_hold(int argc, char **argv); static int zfs_do_holds(int argc, char **argv); static int zfs_do_release(int argc, char **argv); static int zfs_do_diff(int argc, char **argv); static int zfs_do_bookmark(int argc, char **argv); static int zfs_do_channel_program(int argc, char **argv); static int zfs_do_load_key(int argc, char **argv); static int zfs_do_unload_key(int argc, char **argv); static int zfs_do_change_key(int argc, char **argv); static int zfs_do_project(int argc, char **argv); static int zfs_do_version(int argc, char **argv); static int zfs_do_redact(int argc, char **argv); static int zfs_do_wait(int argc, char **argv); #ifdef __FreeBSD__ static int zfs_do_jail(int argc, char **argv); static int zfs_do_unjail(int argc, char **argv); #endif /* * Enable a reasonable set of defaults for libumem debugging on DEBUG builds. */ #ifdef DEBUG const char * _umem_debug_init(void) { return ("default,verbose"); /* $UMEM_DEBUG setting */ } const char * _umem_logging_init(void) { return ("fail,contents"); /* $UMEM_LOGGING setting */ } #endif typedef enum { HELP_CLONE, HELP_CREATE, HELP_DESTROY, HELP_GET, HELP_INHERIT, HELP_UPGRADE, HELP_LIST, HELP_MOUNT, HELP_PROMOTE, HELP_RECEIVE, HELP_RENAME, HELP_ROLLBACK, HELP_SEND, HELP_SET, HELP_SHARE, HELP_SNAPSHOT, HELP_UNMOUNT, HELP_UNSHARE, HELP_ALLOW, HELP_UNALLOW, HELP_USERSPACE, HELP_GROUPSPACE, HELP_PROJECTSPACE, HELP_PROJECT, HELP_HOLD, HELP_HOLDS, HELP_RELEASE, HELP_DIFF, HELP_BOOKMARK, HELP_CHANNEL_PROGRAM, HELP_LOAD_KEY, HELP_UNLOAD_KEY, HELP_CHANGE_KEY, HELP_VERSION, HELP_REDACT, HELP_JAIL, HELP_UNJAIL, HELP_WAIT, } zfs_help_t; typedef struct zfs_command { const char *name; int (*func)(int argc, char **argv); zfs_help_t usage; } zfs_command_t; /* * Master command table. Each ZFS command has a name, associated function, and * usage message. 
The usage messages need to be internationalized, so we have * to have a function to return the usage message based on a command index. * * These commands are organized according to how they are displayed in the usage * message. An empty command (one with a NULL name) indicates an empty line in * the generic usage message. */ static zfs_command_t command_table[] = { { "version", zfs_do_version, HELP_VERSION }, { NULL }, { "create", zfs_do_create, HELP_CREATE }, { "destroy", zfs_do_destroy, HELP_DESTROY }, { NULL }, { "snapshot", zfs_do_snapshot, HELP_SNAPSHOT }, { "rollback", zfs_do_rollback, HELP_ROLLBACK }, { "clone", zfs_do_clone, HELP_CLONE }, { "promote", zfs_do_promote, HELP_PROMOTE }, { "rename", zfs_do_rename, HELP_RENAME }, { "bookmark", zfs_do_bookmark, HELP_BOOKMARK }, { "program", zfs_do_channel_program, HELP_CHANNEL_PROGRAM }, { NULL }, { "list", zfs_do_list, HELP_LIST }, { NULL }, { "set", zfs_do_set, HELP_SET }, { "get", zfs_do_get, HELP_GET }, { "inherit", zfs_do_inherit, HELP_INHERIT }, { "upgrade", zfs_do_upgrade, HELP_UPGRADE }, { NULL }, { "userspace", zfs_do_userspace, HELP_USERSPACE }, { "groupspace", zfs_do_userspace, HELP_GROUPSPACE }, { "projectspace", zfs_do_userspace, HELP_PROJECTSPACE }, { NULL }, { "project", zfs_do_project, HELP_PROJECT }, { NULL }, { "mount", zfs_do_mount, HELP_MOUNT }, { "unmount", zfs_do_unmount, HELP_UNMOUNT }, { "share", zfs_do_share, HELP_SHARE }, { "unshare", zfs_do_unshare, HELP_UNSHARE }, { NULL }, { "send", zfs_do_send, HELP_SEND }, { "receive", zfs_do_receive, HELP_RECEIVE }, { NULL }, { "allow", zfs_do_allow, HELP_ALLOW }, { NULL }, { "unallow", zfs_do_unallow, HELP_UNALLOW }, { NULL }, { "hold", zfs_do_hold, HELP_HOLD }, { "holds", zfs_do_holds, HELP_HOLDS }, { "release", zfs_do_release, HELP_RELEASE }, { "diff", zfs_do_diff, HELP_DIFF }, { "load-key", zfs_do_load_key, HELP_LOAD_KEY }, { "unload-key", zfs_do_unload_key, HELP_UNLOAD_KEY }, { "change-key", zfs_do_change_key, HELP_CHANGE_KEY }, { "redact", zfs_do_redact, HELP_REDACT }, { "wait", zfs_do_wait, HELP_WAIT }, #ifdef __FreeBSD__ { "jail", zfs_do_jail, HELP_JAIL }, { "unjail", zfs_do_unjail, HELP_UNJAIL }, #endif }; #define NCOMMAND (sizeof (command_table) / sizeof (command_table[0])) zfs_command_t *current_command; static const char * get_usage(zfs_help_t idx) { switch (idx) { case HELP_CLONE: return (gettext("\tclone [-p] [-o property=value] ... " " \n")); case HELP_CREATE: return (gettext("\tcreate [-Pnpuv] [-o property=value] ... " "\n" "\tcreate [-Pnpsv] [-b blocksize] [-o property=value] ... " "-V \n")); case HELP_DESTROY: return (gettext("\tdestroy [-fnpRrv] \n" "\tdestroy [-dnpRrv] " "@[%][,...]\n" "\tdestroy #\n")); case HELP_GET: return (gettext("\tget [-rHp] [-d max] " "[-o \"all\" | field[,...]]\n" "\t [-t type[,...]] [-s source[,...]]\n" "\t <\"all\" | property[,...]> " "[filesystem|volume|snapshot|bookmark] ...\n")); case HELP_INHERIT: return (gettext("\tinherit [-rS] " " ...\n")); case HELP_UPGRADE: return (gettext("\tupgrade [-v]\n" "\tupgrade [-r] [-V version] <-a | filesystem ...>\n")); case HELP_LIST: return (gettext("\tlist [-Hp] [-r|-d max] [-o property[,...]] " "[-s property]...\n\t [-S property]... [-t type[,...]] " "[filesystem|volume|snapshot] ...\n")); case HELP_MOUNT: return (gettext("\tmount\n" "\tmount [-flvO] [-o opts] <-a | filesystem>\n")); case HELP_PROMOTE: return (gettext("\tpromote \n")); case HELP_RECEIVE: return (gettext("\treceive [-vMnsFhu] " "[-o =] ... [-x ] ...\n" "\t \n" "\treceive [-vMnsFhu] [-o =] ... " "[-x ] ... 
\n" "\t [-d | -e] \n" "\treceive -A \n")); case HELP_RENAME: return (gettext("\trename [-f] " "\n" "\trename -p [-f] \n" "\trename -u [-f] \n" "\trename -r \n")); case HELP_ROLLBACK: return (gettext("\trollback [-rRf] \n")); case HELP_SEND: return (gettext("\tsend [-DnPpRvLecwhb] [-[i|I] snapshot] " "\n" "\tsend [-nvPLecw] [-i snapshot|bookmark] " "\n" "\tsend [-DnPpvLec] [-i bookmark|snapshot] " "--redact \n" "\tsend [-nvPe] -t \n" "\tsend [-Pnv] --saved filesystem\n")); case HELP_SET: return (gettext("\tset ... " " ...\n")); case HELP_SHARE: return (gettext("\tshare [-l] <-a [nfs|smb] | filesystem>\n")); case HELP_SNAPSHOT: return (gettext("\tsnapshot [-r] [-o property=value] ... " "@ ...\n")); case HELP_UNMOUNT: return (gettext("\tunmount [-fu] " "<-a | filesystem|mountpoint>\n")); case HELP_UNSHARE: return (gettext("\tunshare " "<-a [nfs|smb] | filesystem|mountpoint>\n")); case HELP_ALLOW: return (gettext("\tallow \n" "\tallow [-ldug] " "<\"everyone\"|user|group>[,...] [,...]\n" "\t \n" "\tallow [-ld] -e [,...] " "\n" "\tallow -c [,...] \n" "\tallow -s @setname [,...] " "\n")); case HELP_UNALLOW: return (gettext("\tunallow [-rldug] " "<\"everyone\"|user|group>[,...]\n" "\t [[,...]] \n" "\tunallow [-rld] -e [[,...]] " "\n" "\tunallow [-r] -c [[,...]] " "\n" "\tunallow [-r] -s @setname [[,...]] " "\n")); case HELP_USERSPACE: return (gettext("\tuserspace [-Hinp] [-o field[,...]] " "[-s field] ...\n" "\t [-S field] ... [-t type[,...]] " "\n")); case HELP_GROUPSPACE: return (gettext("\tgroupspace [-Hinp] [-o field[,...]] " "[-s field] ...\n" "\t [-S field] ... [-t type[,...]] " "\n")); case HELP_PROJECTSPACE: return (gettext("\tprojectspace [-Hp] [-o field[,...]] " "[-s field] ... \n" "\t [-S field] ... \n")); case HELP_PROJECT: return (gettext("\tproject [-d|-r] \n" "\tproject -c [-0] [-d|-r] [-p id] \n" "\tproject -C [-k] [-r] \n" "\tproject [-p id] [-r] [-s] \n")); case HELP_HOLD: return (gettext("\thold [-r] ...\n")); case HELP_HOLDS: return (gettext("\tholds [-rH] ...\n")); case HELP_RELEASE: return (gettext("\trelease [-r] ...\n")); case HELP_DIFF: return (gettext("\tdiff [-FHt] " "[snapshot|filesystem]\n")); case HELP_BOOKMARK: return (gettext("\tbookmark " "\n")); case HELP_CHANNEL_PROGRAM: return (gettext("\tprogram [-jn] [-t ] " "[-m ]\n" "\t [lua args...]\n")); case HELP_LOAD_KEY: return (gettext("\tload-key [-rn] [-L ] " "<-a | filesystem|volume>\n")); case HELP_UNLOAD_KEY: return (gettext("\tunload-key [-r] " "<-a | filesystem|volume>\n")); case HELP_CHANGE_KEY: return (gettext("\tchange-key [-l] [-o keyformat=]\n" "\t [-o keylocation=] [-o pbkdf2iters=]\n" "\t \n" "\tchange-key -i [-l] \n")); case HELP_VERSION: return (gettext("\tversion\n")); case HELP_REDACT: return (gettext("\tredact " " ...\n")); case HELP_JAIL: return (gettext("\tjail \n")); case HELP_UNJAIL: return (gettext("\tunjail \n")); case HELP_WAIT: return (gettext("\twait [-t ] \n")); } abort(); /* NOTREACHED */ } void nomem(void) { (void) fprintf(stderr, gettext("internal error: out of memory\n")); exit(1); } /* * Utility function to guarantee malloc() success. 
*/ void * safe_malloc(size_t size) { void *data; if ((data = calloc(1, size)) == NULL) nomem(); return (data); } static void * safe_realloc(void *data, size_t size) { void *newp; if ((newp = realloc(data, size)) == NULL) { free(data); nomem(); } return (newp); } static char * safe_strdup(char *str) { char *dupstr = strdup(str); if (dupstr == NULL) nomem(); return (dupstr); } /* * Callback routine that will print out information for each of * the properties. */ static int usage_prop_cb(int prop, void *cb) { FILE *fp = cb; (void) fprintf(fp, "\t%-15s ", zfs_prop_to_name(prop)); if (zfs_prop_readonly(prop)) (void) fprintf(fp, " NO "); else (void) fprintf(fp, "YES "); if (zfs_prop_inheritable(prop)) (void) fprintf(fp, " YES "); else (void) fprintf(fp, " NO "); if (zfs_prop_values(prop) == NULL) (void) fprintf(fp, "-\n"); else (void) fprintf(fp, "%s\n", zfs_prop_values(prop)); return (ZPROP_CONT); } /* * Display usage message. If we're inside a command, display only the usage for * that command. Otherwise, iterate over the entire command table and display * a complete usage message. */ static void usage(boolean_t requested) { int i; boolean_t show_properties = B_FALSE; FILE *fp = requested ? stdout : stderr; if (current_command == NULL) { (void) fprintf(fp, gettext("usage: zfs command args ...\n")); (void) fprintf(fp, gettext("where 'command' is one of the following:\n\n")); for (i = 0; i < NCOMMAND; i++) { if (command_table[i].name == NULL) (void) fprintf(fp, "\n"); else (void) fprintf(fp, "%s", get_usage(command_table[i].usage)); } (void) fprintf(fp, gettext("\nEach dataset is of the form: " "pool/[dataset/]*dataset[@name]\n")); } else { (void) fprintf(fp, gettext("usage:\n")); (void) fprintf(fp, "%s", get_usage(current_command->usage)); } if (current_command != NULL && (strcmp(current_command->name, "set") == 0 || strcmp(current_command->name, "get") == 0 || strcmp(current_command->name, "inherit") == 0 || strcmp(current_command->name, "list") == 0)) show_properties = B_TRUE; if (show_properties) { (void) fprintf(fp, gettext("\nThe following properties are supported:\n")); (void) fprintf(fp, "\n\t%-14s %s %s %s\n\n", "PROPERTY", "EDIT", "INHERIT", "VALUES"); /* Iterate over all properties */ (void) zprop_iter(usage_prop_cb, fp, B_FALSE, B_TRUE, ZFS_TYPE_DATASET); (void) fprintf(fp, "\t%-15s ", "userused@..."); (void) fprintf(fp, " NO NO \n"); (void) fprintf(fp, "\t%-15s ", "groupused@..."); (void) fprintf(fp, " NO NO \n"); (void) fprintf(fp, "\t%-15s ", "projectused@..."); (void) fprintf(fp, " NO NO \n"); (void) fprintf(fp, "\t%-15s ", "userobjused@..."); (void) fprintf(fp, " NO NO \n"); (void) fprintf(fp, "\t%-15s ", "groupobjused@..."); (void) fprintf(fp, " NO NO \n"); (void) fprintf(fp, "\t%-15s ", "projectobjused@..."); (void) fprintf(fp, " NO NO \n"); (void) fprintf(fp, "\t%-15s ", "userquota@..."); (void) fprintf(fp, "YES NO | none\n"); (void) fprintf(fp, "\t%-15s ", "groupquota@..."); (void) fprintf(fp, "YES NO | none\n"); (void) fprintf(fp, "\t%-15s ", "projectquota@..."); (void) fprintf(fp, "YES NO | none\n"); (void) fprintf(fp, "\t%-15s ", "userobjquota@..."); (void) fprintf(fp, "YES NO | none\n"); (void) fprintf(fp, "\t%-15s ", "groupobjquota@..."); (void) fprintf(fp, "YES NO | none\n"); (void) fprintf(fp, "\t%-15s ", "projectobjquota@..."); (void) fprintf(fp, "YES NO | none\n"); (void) fprintf(fp, "\t%-15s ", "written@"); (void) fprintf(fp, " NO NO \n"); (void) fprintf(fp, "\t%-15s ", "written#"); (void) fprintf(fp, " NO NO \n"); (void) fprintf(fp, gettext("\nSizes are specified 
in bytes " "with standard units such as K, M, G, etc.\n")); (void) fprintf(fp, gettext("\nUser-defined properties can " "be specified by using a name containing a colon (:).\n")); (void) fprintf(fp, gettext("\nThe {user|group|project}" "[obj]{used|quota}@ properties must be appended with\n" "a user|group|project specifier of one of these forms:\n" " POSIX name (eg: \"matt\")\n" " POSIX id (eg: \"126829\")\n" " SMB name@domain (eg: \"matt@sun\")\n" " SMB SID (eg: \"S-1-234-567-89\")\n")); } else { (void) fprintf(fp, gettext("\nFor the property list, run: %s\n"), "zfs set|get"); (void) fprintf(fp, gettext("\nFor the delegated permission list, run: %s\n"), "zfs allow|unallow"); } /* * See comments at end of main(). */ if (getenv("ZFS_ABORT") != NULL) { (void) printf("dumping core by request\n"); abort(); } exit(requested ? 0 : 2); } /* * Take a property=value argument string and add it to the given nvlist. * Modifies the argument inplace. */ static boolean_t parseprop(nvlist_t *props, char *propname) { char *propval; if ((propval = strchr(propname, '=')) == NULL) { (void) fprintf(stderr, gettext("missing " "'=' for property=value argument\n")); return (B_FALSE); } *propval = '\0'; propval++; if (nvlist_exists(props, propname)) { (void) fprintf(stderr, gettext("property '%s' " "specified multiple times\n"), propname); return (B_FALSE); } if (nvlist_add_string(props, propname, propval) != 0) nomem(); return (B_TRUE); } /* * Take a property name argument and add it to the given nvlist. * Modifies the argument inplace. */ static boolean_t parsepropname(nvlist_t *props, char *propname) { if (strchr(propname, '=') != NULL) { (void) fprintf(stderr, gettext("invalid character " "'=' in property argument\n")); return (B_FALSE); } if (nvlist_exists(props, propname)) { (void) fprintf(stderr, gettext("property '%s' " "specified multiple times\n"), propname); return (B_FALSE); } if (nvlist_add_boolean(props, propname) != 0) nomem(); return (B_TRUE); } static int parse_depth(char *opt, int *flags) { char *tmp; int depth; depth = (int)strtol(opt, &tmp, 0); if (*tmp) { (void) fprintf(stderr, gettext("%s is not an integer\n"), optarg); usage(B_FALSE); } if (depth < 0) { (void) fprintf(stderr, gettext("Depth can not be negative.\n")); usage(B_FALSE); } *flags |= (ZFS_ITER_DEPTH_LIMIT|ZFS_ITER_RECURSE); return (depth); } #define PROGRESS_DELAY 2 /* seconds */ static char *pt_reverse = "\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b"; static time_t pt_begin; static char *pt_header = NULL; static boolean_t pt_shown; static void start_progress_timer(void) { pt_begin = time(NULL) + PROGRESS_DELAY; pt_shown = B_FALSE; } static void set_progress_header(char *header) { assert(pt_header == NULL); pt_header = safe_strdup(header); if (pt_shown) { (void) printf("%s: ", header); (void) fflush(stdout); } } static void update_progress(char *update) { if (!pt_shown && time(NULL) > pt_begin) { int len = strlen(update); (void) printf("%s: %s%*.*s", pt_header, update, len, len, pt_reverse); (void) fflush(stdout); pt_shown = B_TRUE; } else if (pt_shown) { int len = strlen(update); (void) printf("%s%*.*s", update, len, len, pt_reverse); (void) fflush(stdout); } } static void finish_progress(char *done) { if (pt_shown) { (void) printf("%s\n", done); (void) fflush(stdout); } free(pt_header); pt_header = NULL; } static int zfs_mount_and_share(libzfs_handle_t *hdl, const char *dataset, zfs_type_t type) { zfs_handle_t *zhp = NULL; int ret = 0; zhp = zfs_open(hdl, dataset, type); if (zhp == NULL) return (1); /* * Volumes may neither 
be mounted or shared. Potentially in the * future filesystems detected on these volumes could be mounted. */ if (zfs_get_type(zhp) == ZFS_TYPE_VOLUME) { zfs_close(zhp); return (0); } /* * Mount and/or share the new filesystem as appropriate. We provide a * verbose error message to let the user know that their filesystem was * in fact created, even if we failed to mount or share it. * * If the user doesn't want the dataset automatically mounted, then * skip the mount/share step */ if (zfs_prop_valid_for_type(ZFS_PROP_CANMOUNT, type, B_FALSE) && zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT) == ZFS_CANMOUNT_ON) { if (zfs_mount_delegation_check()) { (void) fprintf(stderr, gettext("filesystem " "successfully created, but it may only be " "mounted by root\n")); ret = 1; } else if (zfs_mount(zhp, NULL, 0) != 0) { (void) fprintf(stderr, gettext("filesystem " "successfully created, but not mounted\n")); ret = 1; } else if (zfs_share(zhp) != 0) { (void) fprintf(stderr, gettext("filesystem " "successfully created, but not shared\n")); ret = 1; } zfs_commit_all_shares(); } zfs_close(zhp); return (ret); } /* * zfs clone [-p] [-o prop=value] ... * * Given an existing dataset, create a writable copy whose initial contents * are the same as the source. The newly created dataset maintains a * dependency on the original; the original cannot be destroyed so long as * the clone exists. * * The '-p' flag creates all the non-existing ancestors of the target first. */ static int zfs_do_clone(int argc, char **argv) { zfs_handle_t *zhp = NULL; boolean_t parents = B_FALSE; nvlist_t *props; int ret = 0; int c; if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) nomem(); /* check options */ while ((c = getopt(argc, argv, "o:p")) != -1) { switch (c) { case 'o': if (!parseprop(props, optarg)) { nvlist_free(props); return (1); } break; case 'p': parents = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); goto usage; } } argc -= optind; argv += optind; /* check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing source dataset " "argument\n")); goto usage; } if (argc < 2) { (void) fprintf(stderr, gettext("missing target dataset " "argument\n")); goto usage; } if (argc > 2) { (void) fprintf(stderr, gettext("too many arguments\n")); goto usage; } /* open the source dataset */ if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_SNAPSHOT)) == NULL) { nvlist_free(props); return (1); } if (parents && zfs_name_valid(argv[1], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME)) { /* * Now create the ancestors of the target dataset. If the * target already exists and '-p' option was used we should not * complain. */ if (zfs_dataset_exists(g_zfs, argv[1], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME)) { zfs_close(zhp); nvlist_free(props); return (0); } if (zfs_create_ancestors(g_zfs, argv[1]) != 0) { zfs_close(zhp); nvlist_free(props); return (1); } } /* pass to libzfs */ ret = zfs_clone(zhp, argv[1], props); /* create the mountpoint if necessary */ if (ret == 0) { if (log_history) { (void) zpool_log_history(g_zfs, history_str); log_history = B_FALSE; } ret = zfs_mount_and_share(g_zfs, argv[1], ZFS_TYPE_DATASET); } zfs_close(zhp); nvlist_free(props); return (!!ret); usage: ASSERT3P(zhp, ==, NULL); nvlist_free(props); usage(B_FALSE); return (-1); } /* * Return a default volblocksize for the pool which always uses more than * half of the data sectors. This primarily applies to dRAID which always * writes full stripe widths. 
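As a worked illustration of the rule described here and implemented in default_volblocksize() just below: start from the 8K default and keep doubling while twice the candidate still fits within the vdev's minimum allocation width, so that more than half of every allocation holds data. A standalone sketch; the pool numbers are made up for the example, and only the 8K default mirrors OpenZFS:

#include <stdio.h>
#include <stdint.h>

#define EXAMPLE_DEFAULT_BLOCKSIZE	8192	/* 8K, like ZVOL_DEFAULT_BLOCKSIZE */

/*
 * Largest power-of-two block size that still uses more than half of
 * 'asize', the minimum allocation width of the pool's vdevs.
 */
static uint64_t
target_volblocksize(uint64_t asize)
{
	uint64_t bs = EXAMPLE_DEFAULT_BLOCKSIZE;

	while (bs * 2 <= asize)
		bs *= 2;
	return (bs);
}

int
main(void)
{
	/* Hypothetical dRAID group: 8 data disks, ashift=12 (4K sectors). */
	uint64_t asize = 8ULL * (1ULL << 12);	/* 32768 */

	printf("asize=%ju -> volblocksize=%ju\n",
	    (uintmax_t)asize, (uintmax_t)target_volblocksize(asize));
	return (0);
}

For the 8-data-disk, 4K-sector case this lands on 32K, consistent with the table in the function's comment.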
*/ static uint64_t default_volblocksize(zpool_handle_t *zhp, nvlist_t *props) { uint64_t volblocksize, asize = SPA_MINBLOCKSIZE; nvlist_t *tree, **vdevs; uint_t nvdevs; nvlist_t *config = zpool_get_config(zhp, NULL); if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &tree) != 0 || nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, &vdevs, &nvdevs) != 0) { return (ZVOL_DEFAULT_BLOCKSIZE); } for (int i = 0; i < nvdevs; i++) { nvlist_t *nv = vdevs[i]; uint64_t ashift, ndata, nparity; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &ashift) != 0) continue; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DRAID_NDATA, &ndata) == 0) { /* dRAID minimum allocation width */ asize = MAX(asize, ndata * (1ULL << ashift)); } else if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) { /* raidz minimum allocation width */ if (nparity == 1) asize = MAX(asize, 2 * (1ULL << ashift)); else asize = MAX(asize, 4 * (1ULL << ashift)); } else { /* mirror or (non-redundant) leaf vdev */ asize = MAX(asize, 1ULL << ashift); } } /* * Calculate the target volblocksize such that more than half * of the asize is used. The following table is for 4k sectors. * * n asize blksz used | n asize blksz used * -------------------------+--------------------------------- * 1 4,096 8,192 100% | 9 36,864 32,768 88% * 2 8,192 8,192 100% | 10 40,960 32,768 80% * 3 12,288 8,192 66% | 11 45,056 32,768 72% * 4 16,384 16,384 100% | 12 49,152 32,768 66% * 5 20,480 16,384 80% | 13 53,248 32,768 61% * 6 24,576 16,384 66% | 14 57,344 32,768 57% * 7 28,672 16,384 57% | 15 61,440 32,768 53% * 8 32,768 32,768 100% | 16 65,536 65,636 100% * * This is primarily a concern for dRAID which always allocates * a full stripe width. For dRAID the default stripe width is * n=8 in which case the volblocksize is set to 32k. Ignoring * compression there are no unused sectors. This same reasoning * applies to raidz[2,3] so target 4 sectors to minimize waste. */ uint64_t tgt_volblocksize = ZVOL_DEFAULT_BLOCKSIZE; while (tgt_volblocksize * 2 <= asize) tgt_volblocksize *= 2; const char *prop = zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE); if (nvlist_lookup_uint64(props, prop, &volblocksize) == 0) { /* Issue a warning when a non-optimal size is requested. */ if (volblocksize < ZVOL_DEFAULT_BLOCKSIZE) { (void) fprintf(stderr, gettext("Warning: " "volblocksize (%llu) is less than the default " "minimum block size (%llu).\nTo reduce wasted " "space a volblocksize of %llu is recommended.\n"), (u_longlong_t)volblocksize, (u_longlong_t)ZVOL_DEFAULT_BLOCKSIZE, (u_longlong_t)tgt_volblocksize); } else if (volblocksize < tgt_volblocksize) { (void) fprintf(stderr, gettext("Warning: " "volblocksize (%llu) is much less than the " "minimum allocation\nunit (%llu), which wastes " "at least %llu%% of space. To reduce wasted " "space,\nuse a larger volblocksize (%llu is " "recommended), fewer dRAID data disks\n" "per group, or smaller sector size (ashift).\n"), (u_longlong_t)volblocksize, (u_longlong_t)asize, (u_longlong_t)((100 * (asize - volblocksize)) / asize), (u_longlong_t)tgt_volblocksize); } } else { volblocksize = tgt_volblocksize; fnvlist_add_uint64(props, prop, volblocksize); } return (volblocksize); } /* * zfs create [-Pnpv] [-o prop=value] ... fs * zfs create [-Pnpsv] [-b blocksize] [-o prop=value] ... -V vol size * * Create a new dataset. This command can be used to create filesystems * and volumes. Snapshot creation is handled by 'zfs snapshot'. * For volumes, the user must specify a size to be used. 
* * The '-s' flag applies only to volumes, and indicates that we should not try * to set the reservation for this volume. By default we set a reservation * equal to the size for any volume. For pools with SPA_VERSION >= * SPA_VERSION_REFRESERVATION, we set a refreservation instead. * * The '-p' flag creates all the non-existing ancestors of the target first. * * The '-n' flag is no-op (dry run) mode. This will perform a user-space sanity * check of arguments and properties, but does not check for permissions, * available space, etc. * * The '-u' flag prevents the newly created file system from being mounted. * * The '-v' flag is for verbose output. * * The '-P' flag is used for parseable output. It implies '-v'. */ static int zfs_do_create(int argc, char **argv) { zfs_type_t type = ZFS_TYPE_FILESYSTEM; zpool_handle_t *zpool_handle = NULL; nvlist_t *real_props = NULL; uint64_t volsize = 0; int c; boolean_t noreserve = B_FALSE; boolean_t bflag = B_FALSE; boolean_t parents = B_FALSE; boolean_t dryrun = B_FALSE; boolean_t nomount = B_FALSE; boolean_t verbose = B_FALSE; boolean_t parseable = B_FALSE; int ret = 1; nvlist_t *props; uint64_t intval; char *strval; if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) nomem(); /* check options */ while ((c = getopt(argc, argv, ":PV:b:nso:puv")) != -1) { switch (c) { case 'V': type = ZFS_TYPE_VOLUME; if (zfs_nicestrtonum(g_zfs, optarg, &intval) != 0) { (void) fprintf(stderr, gettext("bad volume " "size '%s': %s\n"), optarg, libzfs_error_description(g_zfs)); goto error; } if (nvlist_add_uint64(props, zfs_prop_to_name(ZFS_PROP_VOLSIZE), intval) != 0) nomem(); volsize = intval; break; case 'P': verbose = B_TRUE; parseable = B_TRUE; break; case 'p': parents = B_TRUE; break; case 'b': bflag = B_TRUE; if (zfs_nicestrtonum(g_zfs, optarg, &intval) != 0) { (void) fprintf(stderr, gettext("bad volume " "block size '%s': %s\n"), optarg, libzfs_error_description(g_zfs)); goto error; } if (nvlist_add_uint64(props, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), intval) != 0) nomem(); break; case 'n': dryrun = B_TRUE; break; case 'o': if (!parseprop(props, optarg)) goto error; break; case 's': noreserve = B_TRUE; break; case 'u': nomount = B_TRUE; break; case 'v': verbose = B_TRUE; break; case ':': (void) fprintf(stderr, gettext("missing size " "argument\n")); goto badusage; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); goto badusage; } } if ((bflag || noreserve) && type != ZFS_TYPE_VOLUME) { (void) fprintf(stderr, gettext("'-s' and '-b' can only be " "used when creating a volume\n")); goto badusage; } if (nomount && type != ZFS_TYPE_FILESYSTEM) { (void) fprintf(stderr, gettext("'-u' can only be " "used when creating a filesystem\n")); goto badusage; } argc -= optind; argv += optind; /* check number of arguments */ if (argc == 0) { (void) fprintf(stderr, gettext("missing %s argument\n"), zfs_type_to_name(type)); goto badusage; } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); goto badusage; } if (dryrun || type == ZFS_TYPE_VOLUME) { char msg[ZFS_MAX_DATASET_NAME_LEN * 2]; char *p; if ((p = strchr(argv[0], '/')) != NULL) *p = '\0'; zpool_handle = zpool_open(g_zfs, argv[0]); if (p != NULL) *p = '/'; if (zpool_handle == NULL) goto error; (void) snprintf(msg, sizeof (msg), dryrun ? 
gettext("cannot verify '%s'") : gettext("cannot create '%s'"), argv[0]); if (props && (real_props = zfs_valid_proplist(g_zfs, type, props, 0, NULL, zpool_handle, B_TRUE, msg)) == NULL) { zpool_close(zpool_handle); goto error; } } if (type == ZFS_TYPE_VOLUME) { const char *prop = zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE); uint64_t volblocksize = default_volblocksize(zpool_handle, real_props); if (volblocksize != ZVOL_DEFAULT_BLOCKSIZE && nvlist_lookup_string(props, prop, &strval) != 0) { if (asprintf(&strval, "%llu", (u_longlong_t)volblocksize) == -1) nomem(); nvlist_add_string(props, prop, strval); free(strval); } /* * If volsize is not a multiple of volblocksize, round it * up to the nearest multiple of the volblocksize. */ if (volsize % volblocksize) { volsize = P2ROUNDUP_TYPED(volsize, volblocksize, uint64_t); if (nvlist_add_uint64(props, zfs_prop_to_name(ZFS_PROP_VOLSIZE), volsize) != 0) { nvlist_free(props); nomem(); } } } if (type == ZFS_TYPE_VOLUME && !noreserve) { uint64_t spa_version; zfs_prop_t resv_prop; spa_version = zpool_get_prop_int(zpool_handle, ZPOOL_PROP_VERSION, NULL); if (spa_version >= SPA_VERSION_REFRESERVATION) resv_prop = ZFS_PROP_REFRESERVATION; else resv_prop = ZFS_PROP_RESERVATION; volsize = zvol_volsize_to_reservation(zpool_handle, volsize, real_props); if (nvlist_lookup_string(props, zfs_prop_to_name(resv_prop), &strval) != 0) { if (nvlist_add_uint64(props, zfs_prop_to_name(resv_prop), volsize) != 0) { nvlist_free(props); nomem(); } } } if (zpool_handle != NULL) { zpool_close(zpool_handle); nvlist_free(real_props); } if (parents && zfs_name_valid(argv[0], type)) { /* * Now create the ancestors of target dataset. If the target * already exists and '-p' option was used we should not * complain. */ if (zfs_dataset_exists(g_zfs, argv[0], type)) { ret = 0; goto error; } if (verbose) { (void) printf(parseable ? "create_ancestors\t%s\n" : dryrun ? "would create ancestors of %s\n" : "create ancestors of %s\n", argv[0]); } if (!dryrun) { if (zfs_create_ancestors(g_zfs, argv[0]) != 0) { goto error; } } } if (verbose) { nvpair_t *nvp = NULL; (void) printf(parseable ? "create\t%s\n" : dryrun ? "would create %s\n" : "create %s\n", argv[0]); while ((nvp = nvlist_next_nvpair(props, nvp)) != NULL) { uint64_t uval; char *sval; switch (nvpair_type(nvp)) { case DATA_TYPE_UINT64: VERIFY0(nvpair_value_uint64(nvp, &uval)); (void) printf(parseable ? "property\t%s\t%llu\n" : "\t%s=%llu\n", nvpair_name(nvp), (u_longlong_t)uval); break; case DATA_TYPE_STRING: VERIFY0(nvpair_value_string(nvp, &sval)); (void) printf(parseable ? "property\t%s\t%s\n" : "\t%s=%s\n", nvpair_name(nvp), sval); break; default: (void) fprintf(stderr, "property '%s' " "has illegal type %d\n", nvpair_name(nvp), nvpair_type(nvp)); abort(); } } } if (dryrun) { ret = 0; goto error; } /* pass to libzfs */ if (zfs_create(g_zfs, argv[0], type, props) != 0) goto error; if (log_history) { (void) zpool_log_history(g_zfs, history_str); log_history = B_FALSE; } if (nomount) { ret = 0; goto error; } ret = zfs_mount_and_share(g_zfs, argv[0], ZFS_TYPE_DATASET); error: nvlist_free(props); return (ret); badusage: nvlist_free(props); usage(B_FALSE); return (2); } /* * zfs destroy [-rRf] * zfs destroy [-rRd] * * -r Recursively destroy all children * -R Recursively destroy all dependents, including clones * -f Force unmounting of any dependents * -d If we can't destroy now, mark for deferred destruction * * Destroys the given dataset. 
By default, it will unmount any filesystems, * and refuse to destroy a dataset that has any dependents. A dependent can * either be a child, or a clone of a child. */ typedef struct destroy_cbdata { boolean_t cb_first; boolean_t cb_force; boolean_t cb_recurse; boolean_t cb_error; boolean_t cb_doclones; zfs_handle_t *cb_target; boolean_t cb_defer_destroy; boolean_t cb_verbose; boolean_t cb_parsable; boolean_t cb_dryrun; nvlist_t *cb_nvl; nvlist_t *cb_batchedsnaps; /* first snap in contiguous run */ char *cb_firstsnap; /* previous snap in contiguous run */ char *cb_prevsnap; int64_t cb_snapused; char *cb_snapspec; char *cb_bookmark; uint64_t cb_snap_count; } destroy_cbdata_t; /* * Check for any dependents based on the '-r' or '-R' flags. */ static int destroy_check_dependent(zfs_handle_t *zhp, void *data) { destroy_cbdata_t *cbp = data; const char *tname = zfs_get_name(cbp->cb_target); const char *name = zfs_get_name(zhp); if (strncmp(tname, name, strlen(tname)) == 0 && (name[strlen(tname)] == '/' || name[strlen(tname)] == '@')) { /* * This is a direct descendant, not a clone somewhere else in * the hierarchy. */ if (cbp->cb_recurse) goto out; if (cbp->cb_first) { (void) fprintf(stderr, gettext("cannot destroy '%s': " "%s has children\n"), zfs_get_name(cbp->cb_target), zfs_type_to_name(zfs_get_type(cbp->cb_target))); (void) fprintf(stderr, gettext("use '-r' to destroy " "the following datasets:\n")); cbp->cb_first = B_FALSE; cbp->cb_error = B_TRUE; } (void) fprintf(stderr, "%s\n", zfs_get_name(zhp)); } else { /* * This is a clone. We only want to report this if the '-r' * wasn't specified, or the target is a snapshot. */ if (!cbp->cb_recurse && zfs_get_type(cbp->cb_target) != ZFS_TYPE_SNAPSHOT) goto out; if (cbp->cb_first) { (void) fprintf(stderr, gettext("cannot destroy '%s': " "%s has dependent clones\n"), zfs_get_name(cbp->cb_target), zfs_type_to_name(zfs_get_type(cbp->cb_target))); (void) fprintf(stderr, gettext("use '-R' to destroy " "the following datasets:\n")); cbp->cb_first = B_FALSE; cbp->cb_error = B_TRUE; cbp->cb_dryrun = B_TRUE; } (void) fprintf(stderr, "%s\n", zfs_get_name(zhp)); } out: zfs_close(zhp); return (0); } static int destroy_batched(destroy_cbdata_t *cb) { int error = zfs_destroy_snaps_nvl(g_zfs, cb->cb_batchedsnaps, B_FALSE); fnvlist_free(cb->cb_batchedsnaps); cb->cb_batchedsnaps = fnvlist_alloc(); return (error); } static int destroy_callback(zfs_handle_t *zhp, void *data) { destroy_cbdata_t *cb = data; const char *name = zfs_get_name(zhp); int error; if (cb->cb_verbose) { if (cb->cb_parsable) { (void) printf("destroy\t%s\n", name); } else if (cb->cb_dryrun) { (void) printf(gettext("would destroy %s\n"), name); } else { (void) printf(gettext("will destroy %s\n"), name); } } /* * Ignore pools (which we've already flagged as an error before getting * here). */ if (strchr(zfs_get_name(zhp), '/') == NULL && zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) { zfs_close(zhp); return (0); } if (cb->cb_dryrun) { zfs_close(zhp); return (0); } /* * We batch up all contiguous snapshots (even of different * filesystems) and destroy them with one ioctl. We can't * simply do all snap deletions and then all fs deletions, * because we must delete a clone before its origin. */ if (zfs_get_type(zhp) == ZFS_TYPE_SNAPSHOT) { cb->cb_snap_count++; fnvlist_add_boolean(cb->cb_batchedsnaps, name); if (cb->cb_snap_count % 10 == 0 && cb->cb_defer_destroy) error = destroy_batched(cb); } else { error = destroy_batched(cb); if (error != 0 || zfs_unmount(zhp, NULL, cb->cb_force ? 
MS_FORCE : 0) != 0 || zfs_destroy(zhp, cb->cb_defer_destroy) != 0) { zfs_close(zhp); /* * When performing a recursive destroy we ignore errors * so that the recursive destroy could continue * destroying past problem datasets */ if (cb->cb_recurse) { cb->cb_error = B_TRUE; return (0); } return (-1); } } zfs_close(zhp); return (0); } static int destroy_print_cb(zfs_handle_t *zhp, void *arg) { destroy_cbdata_t *cb = arg; const char *name = zfs_get_name(zhp); int err = 0; if (nvlist_exists(cb->cb_nvl, name)) { if (cb->cb_firstsnap == NULL) cb->cb_firstsnap = strdup(name); if (cb->cb_prevsnap != NULL) free(cb->cb_prevsnap); /* this snap continues the current range */ cb->cb_prevsnap = strdup(name); if (cb->cb_firstsnap == NULL || cb->cb_prevsnap == NULL) nomem(); if (cb->cb_verbose) { if (cb->cb_parsable) { (void) printf("destroy\t%s\n", name); } else if (cb->cb_dryrun) { (void) printf(gettext("would destroy %s\n"), name); } else { (void) printf(gettext("will destroy %s\n"), name); } } } else if (cb->cb_firstsnap != NULL) { /* end of this range */ uint64_t used = 0; err = lzc_snaprange_space(cb->cb_firstsnap, cb->cb_prevsnap, &used); cb->cb_snapused += used; free(cb->cb_firstsnap); cb->cb_firstsnap = NULL; free(cb->cb_prevsnap); cb->cb_prevsnap = NULL; } zfs_close(zhp); return (err); } static int destroy_print_snapshots(zfs_handle_t *fs_zhp, destroy_cbdata_t *cb) { int err; assert(cb->cb_firstsnap == NULL); assert(cb->cb_prevsnap == NULL); err = zfs_iter_snapshots_sorted(fs_zhp, destroy_print_cb, cb, 0, 0); if (cb->cb_firstsnap != NULL) { uint64_t used = 0; if (err == 0) { err = lzc_snaprange_space(cb->cb_firstsnap, cb->cb_prevsnap, &used); } cb->cb_snapused += used; free(cb->cb_firstsnap); cb->cb_firstsnap = NULL; free(cb->cb_prevsnap); cb->cb_prevsnap = NULL; } return (err); } static int snapshot_to_nvl_cb(zfs_handle_t *zhp, void *arg) { destroy_cbdata_t *cb = arg; int err = 0; /* Check for clones. */ if (!cb->cb_doclones && !cb->cb_defer_destroy) { cb->cb_target = zhp; cb->cb_first = B_TRUE; err = zfs_iter_dependents(zhp, B_TRUE, destroy_check_dependent, cb); } if (err == 0) { if (nvlist_add_boolean(cb->cb_nvl, zfs_get_name(zhp))) nomem(); } zfs_close(zhp); return (err); } static int gather_snapshots(zfs_handle_t *zhp, void *arg) { destroy_cbdata_t *cb = arg; int err = 0; err = zfs_iter_snapspec(zhp, cb->cb_snapspec, snapshot_to_nvl_cb, cb); if (err == ENOENT) err = 0; if (err != 0) goto out; if (cb->cb_verbose) { err = destroy_print_snapshots(zhp, cb); if (err != 0) goto out; } if (cb->cb_recurse) err = zfs_iter_filesystems(zhp, gather_snapshots, cb); out: zfs_close(zhp); return (err); } static int destroy_clones(destroy_cbdata_t *cb) { nvpair_t *pair; for (pair = nvlist_next_nvpair(cb->cb_nvl, NULL); pair != NULL; pair = nvlist_next_nvpair(cb->cb_nvl, pair)) { zfs_handle_t *zhp = zfs_open(g_zfs, nvpair_name(pair), ZFS_TYPE_SNAPSHOT); if (zhp != NULL) { boolean_t defer = cb->cb_defer_destroy; int err; /* * We can't defer destroy non-snapshots, so set it to * false while destroying the clones. 
*/ cb->cb_defer_destroy = B_FALSE; err = zfs_iter_dependents(zhp, B_FALSE, destroy_callback, cb); cb->cb_defer_destroy = defer; zfs_close(zhp); if (err != 0) return (err); } } return (0); } static int zfs_do_destroy(int argc, char **argv) { destroy_cbdata_t cb = { 0 }; int rv = 0; int err = 0; int c; zfs_handle_t *zhp = NULL; char *at, *pound; zfs_type_t type = ZFS_TYPE_DATASET; /* check options */ while ((c = getopt(argc, argv, "vpndfrR")) != -1) { switch (c) { case 'v': cb.cb_verbose = B_TRUE; break; case 'p': cb.cb_verbose = B_TRUE; cb.cb_parsable = B_TRUE; break; case 'n': cb.cb_dryrun = B_TRUE; break; case 'd': cb.cb_defer_destroy = B_TRUE; type = ZFS_TYPE_SNAPSHOT; break; case 'f': cb.cb_force = B_TRUE; break; case 'r': cb.cb_recurse = B_TRUE; break; case 'R': cb.cb_recurse = B_TRUE; cb.cb_doclones = B_TRUE; break; case '?': default: (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* check number of arguments */ if (argc == 0) { (void) fprintf(stderr, gettext("missing dataset argument\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } at = strchr(argv[0], '@'); pound = strchr(argv[0], '#'); if (at != NULL) { /* Build the list of snaps to destroy in cb_nvl. */ cb.cb_nvl = fnvlist_alloc(); *at = '\0'; zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); if (zhp == NULL) { nvlist_free(cb.cb_nvl); return (1); } cb.cb_snapspec = at + 1; if (gather_snapshots(zfs_handle_dup(zhp), &cb) != 0 || cb.cb_error) { rv = 1; goto out; } if (nvlist_empty(cb.cb_nvl)) { (void) fprintf(stderr, gettext("could not find any " "snapshots to destroy; check snapshot names.\n")); rv = 1; goto out; } if (cb.cb_verbose) { char buf[16]; zfs_nicebytes(cb.cb_snapused, buf, sizeof (buf)); if (cb.cb_parsable) { (void) printf("reclaim\t%llu\n", (u_longlong_t)cb.cb_snapused); } else if (cb.cb_dryrun) { (void) printf(gettext("would reclaim %s\n"), buf); } else { (void) printf(gettext("will reclaim %s\n"), buf); } } if (!cb.cb_dryrun) { if (cb.cb_doclones) { cb.cb_batchedsnaps = fnvlist_alloc(); err = destroy_clones(&cb); if (err == 0) { err = zfs_destroy_snaps_nvl(g_zfs, cb.cb_batchedsnaps, B_FALSE); } if (err != 0) { rv = 1; goto out; } } if (err == 0) { err = zfs_destroy_snaps_nvl(g_zfs, cb.cb_nvl, cb.cb_defer_destroy); } } if (err != 0) rv = 1; } else if (pound != NULL) { int err; nvlist_t *nvl; if (cb.cb_dryrun) { (void) fprintf(stderr, "dryrun is not supported with bookmark\n"); return (-1); } if (cb.cb_defer_destroy) { (void) fprintf(stderr, "defer destroy is not supported with bookmark\n"); return (-1); } if (cb.cb_recurse) { (void) fprintf(stderr, "recursive is not supported with bookmark\n"); return (-1); } /* * Unfortunately, zfs_bookmark() doesn't honor the * casesensitivity setting. However, we can't simply * remove this check, because lzc_destroy_bookmarks() * ignores non-existent bookmarks, so this is necessary * to get a proper error message. 
*/ if (!zfs_bookmark_exists(argv[0])) { (void) fprintf(stderr, gettext("bookmark '%s' " "does not exist.\n"), argv[0]); return (1); } nvl = fnvlist_alloc(); fnvlist_add_boolean(nvl, argv[0]); err = lzc_destroy_bookmarks(nvl, NULL); if (err != 0) { (void) zfs_standard_error(g_zfs, err, "cannot destroy bookmark"); } nvlist_free(nvl); return (err); } else { /* Open the given dataset */ if ((zhp = zfs_open(g_zfs, argv[0], type)) == NULL) return (1); cb.cb_target = zhp; /* * Perform an explicit check for pools before going any further. */ if (!cb.cb_recurse && strchr(zfs_get_name(zhp), '/') == NULL && zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) { (void) fprintf(stderr, gettext("cannot destroy '%s': " "operation does not apply to pools\n"), zfs_get_name(zhp)); (void) fprintf(stderr, gettext("use 'zfs destroy -r " "%s' to destroy all datasets in the pool\n"), zfs_get_name(zhp)); (void) fprintf(stderr, gettext("use 'zpool destroy %s' " "to destroy the pool itself\n"), zfs_get_name(zhp)); rv = 1; goto out; } /* * Check for any dependents and/or clones. */ cb.cb_first = B_TRUE; if (!cb.cb_doclones && zfs_iter_dependents(zhp, B_TRUE, destroy_check_dependent, &cb) != 0) { rv = 1; goto out; } if (cb.cb_error) { rv = 1; goto out; } cb.cb_batchedsnaps = fnvlist_alloc(); if (zfs_iter_dependents(zhp, B_FALSE, destroy_callback, &cb) != 0) { rv = 1; goto out; } /* * Do the real thing. The callback will close the * handle regardless of whether it succeeds or not. */ err = destroy_callback(zhp, &cb); zhp = NULL; if (err == 0) { err = zfs_destroy_snaps_nvl(g_zfs, cb.cb_batchedsnaps, cb.cb_defer_destroy); } if (err != 0 || cb.cb_error == B_TRUE) rv = 1; } out: fnvlist_free(cb.cb_batchedsnaps); fnvlist_free(cb.cb_nvl); if (zhp != NULL) zfs_close(zhp); return (rv); } static boolean_t is_recvd_column(zprop_get_cbdata_t *cbp) { int i; zfs_get_column_t col; for (i = 0; i < ZFS_GET_NCOLS && (col = cbp->cb_columns[i]) != GET_COL_NONE; i++) if (col == GET_COL_RECVD) return (B_TRUE); return (B_FALSE); } /* * zfs get [-rHp] [-o all | field[,field]...] [-s source[,source]...] * < all | property[,property]... > < fs | snap | vol > ... * * -r recurse over any child datasets * -H scripted mode. Headers are stripped, and fields are separated * by tabs instead of spaces. * -o Set of fields to display. One of "name,property,value, * received,source". Default is "name,property,value,source". * "all" is an alias for all five. * -s Set of sources to allow. One of * "local,default,inherited,received,temporary,none". Default is * all six. * -p Display values in parsable (literal) format. * * Prints properties for the given datasets. The user can control which * columns to display as well as which property types to allow. */ /* * Invoked to display the properties for a single dataset. */ static int get_callback(zfs_handle_t *zhp, void *data) { char buf[ZFS_MAXPROPLEN]; char rbuf[ZFS_MAXPROPLEN]; zprop_source_t sourcetype; char source[ZFS_MAX_DATASET_NAME_LEN]; zprop_get_cbdata_t *cbp = data; nvlist_t *user_props = zfs_get_user_props(zhp); zprop_list_t *pl = cbp->cb_proplist; nvlist_t *propval; char *strval; char *sourceval; boolean_t received = is_recvd_column(cbp); for (; pl != NULL; pl = pl->pl_next) { char *recvdval = NULL; /* * Skip the special fake placeholder. This will also skip over * the name property when 'all' is specified. 
*/ if (pl->pl_prop == ZFS_PROP_NAME && pl == cbp->cb_proplist) continue; if (pl->pl_prop != ZPROP_INVAL) { if (zfs_prop_get(zhp, pl->pl_prop, buf, sizeof (buf), &sourcetype, source, sizeof (source), cbp->cb_literal) != 0) { if (pl->pl_all) continue; if (!zfs_prop_valid_for_type(pl->pl_prop, ZFS_TYPE_DATASET, B_FALSE)) { (void) fprintf(stderr, gettext("No such property '%s'\n"), zfs_prop_to_name(pl->pl_prop)); continue; } sourcetype = ZPROP_SRC_NONE; (void) strlcpy(buf, "-", sizeof (buf)); } if (received && (zfs_prop_get_recvd(zhp, zfs_prop_to_name(pl->pl_prop), rbuf, sizeof (rbuf), cbp->cb_literal) == 0)) recvdval = rbuf; zprop_print_one_property(zfs_get_name(zhp), cbp, zfs_prop_to_name(pl->pl_prop), buf, sourcetype, source, recvdval); } else if (zfs_prop_userquota(pl->pl_user_prop)) { sourcetype = ZPROP_SRC_LOCAL; if (zfs_prop_get_userquota(zhp, pl->pl_user_prop, buf, sizeof (buf), cbp->cb_literal) != 0) { sourcetype = ZPROP_SRC_NONE; (void) strlcpy(buf, "-", sizeof (buf)); } zprop_print_one_property(zfs_get_name(zhp), cbp, pl->pl_user_prop, buf, sourcetype, source, NULL); } else if (zfs_prop_written(pl->pl_user_prop)) { sourcetype = ZPROP_SRC_LOCAL; if (zfs_prop_get_written(zhp, pl->pl_user_prop, buf, sizeof (buf), cbp->cb_literal) != 0) { sourcetype = ZPROP_SRC_NONE; (void) strlcpy(buf, "-", sizeof (buf)); } zprop_print_one_property(zfs_get_name(zhp), cbp, pl->pl_user_prop, buf, sourcetype, source, NULL); } else { if (nvlist_lookup_nvlist(user_props, pl->pl_user_prop, &propval) != 0) { if (pl->pl_all) continue; sourcetype = ZPROP_SRC_NONE; strval = "-"; } else { verify(nvlist_lookup_string(propval, ZPROP_VALUE, &strval) == 0); verify(nvlist_lookup_string(propval, ZPROP_SOURCE, &sourceval) == 0); if (strcmp(sourceval, zfs_get_name(zhp)) == 0) { sourcetype = ZPROP_SRC_LOCAL; } else if (strcmp(sourceval, ZPROP_SOURCE_VAL_RECVD) == 0) { sourcetype = ZPROP_SRC_RECEIVED; } else { sourcetype = ZPROP_SRC_INHERITED; (void) strlcpy(source, sourceval, sizeof (source)); } } if (received && (zfs_prop_get_recvd(zhp, pl->pl_user_prop, rbuf, sizeof (rbuf), cbp->cb_literal) == 0)) recvdval = rbuf; zprop_print_one_property(zfs_get_name(zhp), cbp, pl->pl_user_prop, strval, sourcetype, source, recvdval); } } return (0); } static int zfs_do_get(int argc, char **argv) { zprop_get_cbdata_t cb = { 0 }; int i, c, flags = ZFS_ITER_ARGS_CAN_BE_PATHS; int types = ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK; char *value, *fields; int ret = 0; int limit = 0; zprop_list_t fake_name = { 0 }; /* * Set up default columns and sources. */ cb.cb_sources = ZPROP_SRC_ALL; cb.cb_columns[0] = GET_COL_NAME; cb.cb_columns[1] = GET_COL_PROPERTY; cb.cb_columns[2] = GET_COL_VALUE; cb.cb_columns[3] = GET_COL_SOURCE; cb.cb_type = ZFS_TYPE_DATASET; /* check options */ while ((c = getopt(argc, argv, ":d:o:s:rt:Hp")) != -1) { switch (c) { case 'p': cb.cb_literal = B_TRUE; break; case 'd': limit = parse_depth(optarg, &flags); break; case 'r': flags |= ZFS_ITER_RECURSE; break; case 'H': cb.cb_scripted = B_TRUE; break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case 'o': /* * Process the set of columns to display. We zero out * the structure to give us a blank slate. 
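The '-o' handling just below walks its comma-separated argument with getsubopt(3). A stripped-down, standalone example of that pattern, with a shortened token table (the real command also accepts received, source, and all):

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int
main(void)
{
	char *const tokens[] = { "name", "property", "value", NULL };
	char buf[] = "value,name";	/* what a user might pass after -o */
	char *options = buf;
	char *value;

	while (*options != '\0') {
		switch (getsubopt(&options, tokens, &value)) {
		case 0:
			printf("column: NAME\n");
			break;
		case 1:
			printf("column: PROPERTY\n");
			break;
		case 2:
			printf("column: VALUE\n");
			break;
		default:
			fprintf(stderr, "invalid column name\n");
			return (2);
		}
	}
	return (0);
}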
*/ bzero(&cb.cb_columns, sizeof (cb.cb_columns)); i = 0; while (*optarg != '\0') { static char *col_subopts[] = { "name", "property", "value", "received", "source", "all", NULL }; if (i == ZFS_GET_NCOLS) { (void) fprintf(stderr, gettext("too " "many fields given to -o " "option\n")); usage(B_FALSE); } switch (getsubopt(&optarg, col_subopts, &value)) { case 0: cb.cb_columns[i++] = GET_COL_NAME; break; case 1: cb.cb_columns[i++] = GET_COL_PROPERTY; break; case 2: cb.cb_columns[i++] = GET_COL_VALUE; break; case 3: cb.cb_columns[i++] = GET_COL_RECVD; flags |= ZFS_ITER_RECVD_PROPS; break; case 4: cb.cb_columns[i++] = GET_COL_SOURCE; break; case 5: if (i > 0) { (void) fprintf(stderr, gettext("\"all\" conflicts " "with specific fields " "given to -o option\n")); usage(B_FALSE); } cb.cb_columns[0] = GET_COL_NAME; cb.cb_columns[1] = GET_COL_PROPERTY; cb.cb_columns[2] = GET_COL_VALUE; cb.cb_columns[3] = GET_COL_RECVD; cb.cb_columns[4] = GET_COL_SOURCE; flags |= ZFS_ITER_RECVD_PROPS; i = ZFS_GET_NCOLS; break; default: (void) fprintf(stderr, gettext("invalid column name " "'%s'\n"), value); usage(B_FALSE); } } break; case 's': cb.cb_sources = 0; while (*optarg != '\0') { static char *source_subopts[] = { "local", "default", "inherited", "received", "temporary", "none", NULL }; switch (getsubopt(&optarg, source_subopts, &value)) { case 0: cb.cb_sources |= ZPROP_SRC_LOCAL; break; case 1: cb.cb_sources |= ZPROP_SRC_DEFAULT; break; case 2: cb.cb_sources |= ZPROP_SRC_INHERITED; break; case 3: cb.cb_sources |= ZPROP_SRC_RECEIVED; break; case 4: cb.cb_sources |= ZPROP_SRC_TEMPORARY; break; case 5: cb.cb_sources |= ZPROP_SRC_NONE; break; default: (void) fprintf(stderr, gettext("invalid source " "'%s'\n"), value); usage(B_FALSE); } } break; case 't': types = 0; flags &= ~ZFS_ITER_PROP_LISTSNAPS; while (*optarg != '\0') { static char *type_subopts[] = { "filesystem", "volume", "snapshot", "snap", "bookmark", "all", NULL }; switch (getsubopt(&optarg, type_subopts, &value)) { case 0: types |= ZFS_TYPE_FILESYSTEM; break; case 1: types |= ZFS_TYPE_VOLUME; break; case 2: case 3: types |= ZFS_TYPE_SNAPSHOT; break; case 4: types |= ZFS_TYPE_BOOKMARK; break; case 5: types = ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK; break; default: (void) fprintf(stderr, gettext("invalid type '%s'\n"), value); usage(B_FALSE); } } break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing property " "argument\n")); usage(B_FALSE); } fields = argv[0]; /* * Handle users who want to get all snapshots or bookmarks * of a dataset (ex. 'zfs get -t snapshot refer '). */ if ((types == ZFS_TYPE_SNAPSHOT || types == ZFS_TYPE_BOOKMARK) && argc > 1 && (flags & ZFS_ITER_RECURSE) == 0 && limit == 0) { flags |= (ZFS_ITER_DEPTH_LIMIT | ZFS_ITER_RECURSE); limit = 1; } if (zprop_get_list(g_zfs, fields, &cb.cb_proplist, ZFS_TYPE_DATASET) != 0) usage(B_FALSE); argc--; argv++; /* * As part of zfs_expand_proplist(), we keep track of the maximum column * width for each property. For the 'NAME' (and 'SOURCE') columns, we * need to know the maximum name length. However, the user likely did * not specify 'name' as one of the properties to fetch, so we need to * make sure we always include at least this property for * print_get_headers() to work properly. 
*/ if (cb.cb_proplist != NULL) { fake_name.pl_prop = ZFS_PROP_NAME; fake_name.pl_width = strlen(gettext("NAME")); fake_name.pl_next = cb.cb_proplist; cb.cb_proplist = &fake_name; } cb.cb_first = B_TRUE; /* run for each object */ ret = zfs_for_each(argc, argv, flags, types, NULL, &cb.cb_proplist, limit, get_callback, &cb); if (cb.cb_proplist == &fake_name) zprop_free_list(fake_name.pl_next); else zprop_free_list(cb.cb_proplist); return (ret); } /* * inherit [-rS] ... * * -r Recurse over all children * -S Revert to received value, if any * * For each dataset specified on the command line, inherit the given property * from its parent. Inheriting a property at the pool level will cause it to * use the default value. The '-r' flag will recurse over all children, and is * useful for setting a property on a hierarchy-wide basis, regardless of any * local modifications for each dataset. */ typedef struct inherit_cbdata { const char *cb_propname; boolean_t cb_received; } inherit_cbdata_t; static int inherit_recurse_cb(zfs_handle_t *zhp, void *data) { inherit_cbdata_t *cb = data; zfs_prop_t prop = zfs_name_to_prop(cb->cb_propname); /* * If we're doing it recursively, then ignore properties that * are not valid for this type of dataset. */ if (prop != ZPROP_INVAL && !zfs_prop_valid_for_type(prop, zfs_get_type(zhp), B_FALSE)) return (0); return (zfs_prop_inherit(zhp, cb->cb_propname, cb->cb_received) != 0); } static int inherit_cb(zfs_handle_t *zhp, void *data) { inherit_cbdata_t *cb = data; return (zfs_prop_inherit(zhp, cb->cb_propname, cb->cb_received) != 0); } static int zfs_do_inherit(int argc, char **argv) { int c; zfs_prop_t prop; inherit_cbdata_t cb = { 0 }; char *propname; int ret = 0; int flags = 0; boolean_t received = B_FALSE; /* check options */ while ((c = getopt(argc, argv, "rS")) != -1) { switch (c) { case 'r': flags |= ZFS_ITER_RECURSE; break; case 'S': received = B_TRUE; break; case '?': default: (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing property argument\n")); usage(B_FALSE); } if (argc < 2) { (void) fprintf(stderr, gettext("missing dataset argument\n")); usage(B_FALSE); } propname = argv[0]; argc--; argv++; if ((prop = zfs_name_to_prop(propname)) != ZPROP_INVAL) { if (zfs_prop_readonly(prop)) { (void) fprintf(stderr, gettext( "%s property is read-only\n"), propname); return (1); } if (!zfs_prop_inheritable(prop) && !received) { (void) fprintf(stderr, gettext("'%s' property cannot " "be inherited\n"), propname); if (prop == ZFS_PROP_QUOTA || prop == ZFS_PROP_RESERVATION || prop == ZFS_PROP_REFQUOTA || prop == ZFS_PROP_REFRESERVATION) { (void) fprintf(stderr, gettext("use 'zfs set " "%s=none' to clear\n"), propname); (void) fprintf(stderr, gettext("use 'zfs " "inherit -S %s' to revert to received " "value\n"), propname); } return (1); } if (received && (prop == ZFS_PROP_VOLSIZE || prop == ZFS_PROP_VERSION)) { (void) fprintf(stderr, gettext("'%s' property cannot " "be reverted to a received value\n"), propname); return (1); } } else if (!zfs_prop_user(propname)) { (void) fprintf(stderr, gettext("invalid property '%s'\n"), propname); usage(B_FALSE); } cb.cb_propname = propname; cb.cb_received = received; if (flags & ZFS_ITER_RECURSE) { ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_DATASET, NULL, NULL, 0, inherit_recurse_cb, &cb); } else { ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_DATASET, NULL, NULL, 0, inherit_cb, 
&cb); } return (ret); } typedef struct upgrade_cbdata { uint64_t cb_numupgraded; uint64_t cb_numsamegraded; uint64_t cb_numfailed; uint64_t cb_version; boolean_t cb_newer; boolean_t cb_foundone; char cb_lastfs[ZFS_MAX_DATASET_NAME_LEN]; } upgrade_cbdata_t; static int same_pool(zfs_handle_t *zhp, const char *name) { int len1 = strcspn(name, "/@"); const char *zhname = zfs_get_name(zhp); int len2 = strcspn(zhname, "/@"); if (len1 != len2) return (B_FALSE); return (strncmp(name, zhname, len1) == 0); } static int upgrade_list_callback(zfs_handle_t *zhp, void *data) { upgrade_cbdata_t *cb = data; int version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION); /* list if it's old/new */ if ((!cb->cb_newer && version < ZPL_VERSION) || (cb->cb_newer && version > ZPL_VERSION)) { char *str; if (cb->cb_newer) { str = gettext("The following filesystems are " "formatted using a newer software version and\n" "cannot be accessed on the current system.\n\n"); } else { str = gettext("The following filesystems are " "out of date, and can be upgraded. After being\n" "upgraded, these filesystems (and any 'zfs send' " "streams generated from\n" "subsequent snapshots) will no longer be " "accessible by older software versions.\n\n"); } if (!cb->cb_foundone) { (void) puts(str); (void) printf(gettext("VER FILESYSTEM\n")); (void) printf(gettext("--- ------------\n")); cb->cb_foundone = B_TRUE; } (void) printf("%2u %s\n", version, zfs_get_name(zhp)); } return (0); } static int upgrade_set_callback(zfs_handle_t *zhp, void *data) { upgrade_cbdata_t *cb = data; int version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION); int needed_spa_version; int spa_version; if (zfs_spa_version(zhp, &spa_version) < 0) return (-1); needed_spa_version = zfs_spa_version_map(cb->cb_version); if (needed_spa_version < 0) return (-1); if (spa_version < needed_spa_version) { /* can't upgrade */ (void) printf(gettext("%s: can not be " "upgraded; the pool version needs to first " "be upgraded\nto version %d\n\n"), zfs_get_name(zhp), needed_spa_version); cb->cb_numfailed++; return (0); } /* upgrade */ if (version < cb->cb_version) { char verstr[16]; (void) snprintf(verstr, sizeof (verstr), "%llu", (u_longlong_t)cb->cb_version); if (cb->cb_lastfs[0] && !same_pool(zhp, cb->cb_lastfs)) { /* * If they did "zfs upgrade -a", then we could * be doing ioctls to different pools. We need * to log this history once to each pool, and bypass * the normal history logging that happens in main(). 
*/ (void) zpool_log_history(g_zfs, history_str); log_history = B_FALSE; } if (zfs_prop_set(zhp, "version", verstr) == 0) cb->cb_numupgraded++; else cb->cb_numfailed++; (void) strcpy(cb->cb_lastfs, zfs_get_name(zhp)); } else if (version > cb->cb_version) { /* can't downgrade */ (void) printf(gettext("%s: can not be downgraded; " "it is already at version %u\n"), zfs_get_name(zhp), version); cb->cb_numfailed++; } else { cb->cb_numsamegraded++; } return (0); } /* * zfs upgrade * zfs upgrade -v * zfs upgrade [-r] [-V ] <-a | filesystem> */ static int zfs_do_upgrade(int argc, char **argv) { boolean_t all = B_FALSE; boolean_t showversions = B_FALSE; int ret = 0; upgrade_cbdata_t cb = { 0 }; int c; int flags = ZFS_ITER_ARGS_CAN_BE_PATHS; /* check options */ while ((c = getopt(argc, argv, "rvV:a")) != -1) { switch (c) { case 'r': flags |= ZFS_ITER_RECURSE; break; case 'v': showversions = B_TRUE; break; case 'V': if (zfs_prop_string_to_index(ZFS_PROP_VERSION, optarg, &cb.cb_version) != 0) { (void) fprintf(stderr, gettext("invalid version %s\n"), optarg); usage(B_FALSE); } break; case 'a': all = B_TRUE; break; case '?': default: (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if ((!all && !argc) && ((flags & ZFS_ITER_RECURSE) | cb.cb_version)) usage(B_FALSE); if (showversions && (flags & ZFS_ITER_RECURSE || all || cb.cb_version || argc)) usage(B_FALSE); if ((all || argc) && (showversions)) usage(B_FALSE); if (all && argc) usage(B_FALSE); if (showversions) { /* Show info on available versions. */ (void) printf(gettext("The following filesystem versions are " "supported:\n\n")); (void) printf(gettext("VER DESCRIPTION\n")); (void) printf("--- -----------------------------------------" "---------------\n"); (void) printf(gettext(" 1 Initial ZFS filesystem version\n")); (void) printf(gettext(" 2 Enhanced directory entries\n")); (void) printf(gettext(" 3 Case insensitive and filesystem " "user identifier (FUID)\n")); (void) printf(gettext(" 4 userquota, groupquota " "properties\n")); (void) printf(gettext(" 5 System attributes\n")); (void) printf(gettext("\nFor more information on a particular " "version, including supported releases,\n")); (void) printf("see the ZFS Administration Guide.\n\n"); ret = 0; } else if (argc || all) { /* Upgrade filesystems */ if (cb.cb_version == 0) cb.cb_version = ZPL_VERSION; ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_FILESYSTEM, NULL, NULL, 0, upgrade_set_callback, &cb); (void) printf(gettext("%llu filesystems upgraded\n"), (u_longlong_t)cb.cb_numupgraded); if (cb.cb_numsamegraded) { (void) printf(gettext("%llu filesystems already at " "this version\n"), (u_longlong_t)cb.cb_numsamegraded); } if (cb.cb_numfailed != 0) ret = 1; } else { /* List old-version filesystems */ boolean_t found; (void) printf(gettext("This system is currently running " "ZFS filesystem version %llu.\n\n"), ZPL_VERSION); flags |= ZFS_ITER_RECURSE; ret = zfs_for_each(0, NULL, flags, ZFS_TYPE_FILESYSTEM, NULL, NULL, 0, upgrade_list_callback, &cb); found = cb.cb_foundone; cb.cb_foundone = B_FALSE; cb.cb_newer = B_TRUE; ret = zfs_for_each(0, NULL, flags, ZFS_TYPE_FILESYSTEM, NULL, NULL, 0, upgrade_list_callback, &cb); if (!cb.cb_foundone && !found) { (void) printf(gettext("All filesystems are " "formatted with the current version.\n")); } } return (ret); } /* * zfs userspace [-Hinp] [-o field[,...]] [-s field [-s field]...] * [-S field [-S field]...] 
[-t type[,...]] * filesystem | snapshot | path * zfs groupspace [-Hinp] [-o field[,...]] [-s field [-s field]...] * [-S field [-S field]...] [-t type[,...]] * filesystem | snapshot | path * zfs projectspace [-Hp] [-o field[,...]] [-s field [-s field]...] * [-S field [-S field]...] filesystem | snapshot | path * * -H Scripted mode; elide headers and separate columns by tabs. * -i Translate SID to POSIX ID. * -n Print numeric ID instead of user/group name. * -o Control which fields to display. * -p Use exact (parsable) numeric output. * -s Specify sort columns, descending order. * -S Specify sort columns, ascending order. * -t Control which object types to display. * * Displays space consumed by, and quotas on, each user in the specified * filesystem or snapshot. */ /* us_field_types, us_field_hdr and us_field_names should be kept in sync */ enum us_field_types { USFIELD_TYPE, USFIELD_NAME, USFIELD_USED, USFIELD_QUOTA, USFIELD_OBJUSED, USFIELD_OBJQUOTA }; static char *us_field_hdr[] = { "TYPE", "NAME", "USED", "QUOTA", "OBJUSED", "OBJQUOTA" }; static char *us_field_names[] = { "type", "name", "used", "quota", "objused", "objquota" }; #define USFIELD_LAST (sizeof (us_field_names) / sizeof (char *)) #define USTYPE_PSX_GRP (1 << 0) #define USTYPE_PSX_USR (1 << 1) #define USTYPE_SMB_GRP (1 << 2) #define USTYPE_SMB_USR (1 << 3) #define USTYPE_PROJ (1 << 4) #define USTYPE_ALL \ (USTYPE_PSX_GRP | USTYPE_PSX_USR | USTYPE_SMB_GRP | USTYPE_SMB_USR | \ USTYPE_PROJ) static int us_type_bits[] = { USTYPE_PSX_GRP, USTYPE_PSX_USR, USTYPE_SMB_GRP, USTYPE_SMB_USR, USTYPE_ALL }; static char *us_type_names[] = { "posixgroup", "posixuser", "smbgroup", "smbuser", "all" }; typedef struct us_node { nvlist_t *usn_nvl; uu_avl_node_t usn_avlnode; uu_list_node_t usn_listnode; } us_node_t; typedef struct us_cbdata { nvlist_t **cb_nvlp; uu_avl_pool_t *cb_avl_pool; uu_avl_t *cb_avl; boolean_t cb_numname; boolean_t cb_nicenum; boolean_t cb_sid2posix; zfs_userquota_prop_t cb_prop; zfs_sort_column_t *cb_sortcol; size_t cb_width[USFIELD_LAST]; } us_cbdata_t; static boolean_t us_populated = B_FALSE; typedef struct { zfs_sort_column_t *si_sortcol; boolean_t si_numname; } us_sort_info_t; static int us_field_index(char *field) { int i; for (i = 0; i < USFIELD_LAST; i++) { if (strcmp(field, us_field_names[i]) == 0) return (i); } return (-1); } static int us_compare(const void *larg, const void *rarg, void *unused) { const us_node_t *l = larg; const us_node_t *r = rarg; us_sort_info_t *si = (us_sort_info_t *)unused; zfs_sort_column_t *sortcol = si->si_sortcol; boolean_t numname = si->si_numname; nvlist_t *lnvl = l->usn_nvl; nvlist_t *rnvl = r->usn_nvl; int rc = 0; boolean_t lvb, rvb; for (; sortcol != NULL; sortcol = sortcol->sc_next) { char *lvstr = ""; char *rvstr = ""; uint32_t lv32 = 0; uint32_t rv32 = 0; uint64_t lv64 = 0; uint64_t rv64 = 0; zfs_prop_t prop = sortcol->sc_prop; const char *propname = NULL; boolean_t reverse = sortcol->sc_reverse; switch (prop) { case ZFS_PROP_TYPE: propname = "type"; (void) nvlist_lookup_uint32(lnvl, propname, &lv32); (void) nvlist_lookup_uint32(rnvl, propname, &rv32); if (rv32 != lv32) rc = (rv32 < lv32) ? 1 : -1; break; case ZFS_PROP_NAME: propname = "name"; if (numname) { compare_nums: (void) nvlist_lookup_uint64(lnvl, propname, &lv64); (void) nvlist_lookup_uint64(rnvl, propname, &rv64); if (rv64 != lv64) rc = (rv64 < lv64) ? 
1 : -1; } else { if ((nvlist_lookup_string(lnvl, propname, &lvstr) == ENOENT) || (nvlist_lookup_string(rnvl, propname, &rvstr) == ENOENT)) { goto compare_nums; } rc = strcmp(lvstr, rvstr); } break; case ZFS_PROP_USED: case ZFS_PROP_QUOTA: if (!us_populated) break; if (prop == ZFS_PROP_USED) propname = "used"; else propname = "quota"; (void) nvlist_lookup_uint64(lnvl, propname, &lv64); (void) nvlist_lookup_uint64(rnvl, propname, &rv64); if (rv64 != lv64) rc = (rv64 < lv64) ? 1 : -1; break; default: break; } if (rc != 0) { if (rc < 0) return (reverse ? 1 : -1); else return (reverse ? -1 : 1); } } /* * If entries still seem to be the same, check if they are of the same * type (smbentity is added only if we are doing SID to POSIX ID * translation where we can have duplicate type/name combinations). */ if (nvlist_lookup_boolean_value(lnvl, "smbentity", &lvb) == 0 && nvlist_lookup_boolean_value(rnvl, "smbentity", &rvb) == 0 && lvb != rvb) return (lvb < rvb ? -1 : 1); return (0); } static boolean_t zfs_prop_is_user(unsigned p) { return (p == ZFS_PROP_USERUSED || p == ZFS_PROP_USERQUOTA || p == ZFS_PROP_USEROBJUSED || p == ZFS_PROP_USEROBJQUOTA); } static boolean_t zfs_prop_is_group(unsigned p) { return (p == ZFS_PROP_GROUPUSED || p == ZFS_PROP_GROUPQUOTA || p == ZFS_PROP_GROUPOBJUSED || p == ZFS_PROP_GROUPOBJQUOTA); } static boolean_t zfs_prop_is_project(unsigned p) { return (p == ZFS_PROP_PROJECTUSED || p == ZFS_PROP_PROJECTQUOTA || p == ZFS_PROP_PROJECTOBJUSED || p == ZFS_PROP_PROJECTOBJQUOTA); } static inline const char * us_type2str(unsigned field_type) { switch (field_type) { case USTYPE_PSX_USR: return ("POSIX User"); case USTYPE_PSX_GRP: return ("POSIX Group"); case USTYPE_SMB_USR: return ("SMB User"); case USTYPE_SMB_GRP: return ("SMB Group"); case USTYPE_PROJ: return ("Project"); default: return ("Undefined"); } } static int userspace_cb(void *arg, const char *domain, uid_t rid, uint64_t space) { us_cbdata_t *cb = (us_cbdata_t *)arg; zfs_userquota_prop_t prop = cb->cb_prop; char *name = NULL; char *propname; char sizebuf[32]; us_node_t *node; uu_avl_pool_t *avl_pool = cb->cb_avl_pool; uu_avl_t *avl = cb->cb_avl; uu_avl_index_t idx; nvlist_t *props; us_node_t *n; zfs_sort_column_t *sortcol = cb->cb_sortcol; unsigned type = 0; const char *typestr; size_t namelen; size_t typelen; size_t sizelen; int typeidx, nameidx, sizeidx; us_sort_info_t sortinfo = { sortcol, cb->cb_numname }; boolean_t smbentity = B_FALSE; if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) nomem(); node = safe_malloc(sizeof (us_node_t)); uu_avl_node_init(node, &node->usn_avlnode, avl_pool); node->usn_nvl = props; if (domain != NULL && domain[0] != '\0') { #ifdef HAVE_IDMAP /* SMB */ char sid[MAXNAMELEN + 32]; uid_t id; uint64_t classes; int err; directory_error_t e; smbentity = B_TRUE; (void) snprintf(sid, sizeof (sid), "%s-%u", domain, rid); if (prop == ZFS_PROP_GROUPUSED || prop == ZFS_PROP_GROUPQUOTA) { type = USTYPE_SMB_GRP; err = sid_to_id(sid, B_FALSE, &id); } else { type = USTYPE_SMB_USR; err = sid_to_id(sid, B_TRUE, &id); } if (err == 0) { rid = id; if (!cb->cb_sid2posix) { e = directory_name_from_sid(NULL, sid, &name, &classes); if (e != NULL) directory_error_free(e); if (name == NULL) name = sid; } } #else nvlist_free(props); free(node); return (-1); #endif /* HAVE_IDMAP */ } if (cb->cb_sid2posix || domain == NULL || domain[0] == '\0') { /* POSIX or -i */ if (zfs_prop_is_group(prop)) { type = USTYPE_PSX_GRP; if (!cb->cb_numname) { struct group *g; if ((g = getgrgid(rid)) != NULL) name = g->gr_name; } } else if 
(zfs_prop_is_user(prop)) { type = USTYPE_PSX_USR; if (!cb->cb_numname) { struct passwd *p; if ((p = getpwuid(rid)) != NULL) name = p->pw_name; } } else { type = USTYPE_PROJ; } } /* * Make sure that the type/name combination is unique when doing * SID to POSIX ID translation (hence changing the type from SMB to * POSIX). */ if (cb->cb_sid2posix && nvlist_add_boolean_value(props, "smbentity", smbentity) != 0) nomem(); /* Calculate/update width of TYPE field */ typestr = us_type2str(type); typelen = strlen(gettext(typestr)); typeidx = us_field_index("type"); if (typelen > cb->cb_width[typeidx]) cb->cb_width[typeidx] = typelen; if (nvlist_add_uint32(props, "type", type) != 0) nomem(); /* Calculate/update width of NAME field */ if ((cb->cb_numname && cb->cb_sid2posix) || name == NULL) { if (nvlist_add_uint64(props, "name", rid) != 0) nomem(); namelen = snprintf(NULL, 0, "%u", rid); } else { if (nvlist_add_string(props, "name", name) != 0) nomem(); namelen = strlen(name); } nameidx = us_field_index("name"); if (nameidx >= 0 && namelen > cb->cb_width[nameidx]) cb->cb_width[nameidx] = namelen; /* * Check if this type/name combination is in the list and update it; * otherwise add new node to the list. */ if ((n = uu_avl_find(avl, node, &sortinfo, &idx)) == NULL) { uu_avl_insert(avl, node, idx); } else { nvlist_free(props); free(node); node = n; props = node->usn_nvl; } /* Calculate/update width of USED/QUOTA fields */ if (cb->cb_nicenum) { if (prop == ZFS_PROP_USERUSED || prop == ZFS_PROP_GROUPUSED || prop == ZFS_PROP_USERQUOTA || prop == ZFS_PROP_GROUPQUOTA || prop == ZFS_PROP_PROJECTUSED || prop == ZFS_PROP_PROJECTQUOTA) { zfs_nicebytes(space, sizebuf, sizeof (sizebuf)); } else { zfs_nicenum(space, sizebuf, sizeof (sizebuf)); } } else { (void) snprintf(sizebuf, sizeof (sizebuf), "%llu", (u_longlong_t)space); } sizelen = strlen(sizebuf); if (prop == ZFS_PROP_USERUSED || prop == ZFS_PROP_GROUPUSED || prop == ZFS_PROP_PROJECTUSED) { propname = "used"; if (!nvlist_exists(props, "quota")) (void) nvlist_add_uint64(props, "quota", 0); } else if (prop == ZFS_PROP_USERQUOTA || prop == ZFS_PROP_GROUPQUOTA || prop == ZFS_PROP_PROJECTQUOTA) { propname = "quota"; if (!nvlist_exists(props, "used")) (void) nvlist_add_uint64(props, "used", 0); } else if (prop == ZFS_PROP_USEROBJUSED || prop == ZFS_PROP_GROUPOBJUSED || prop == ZFS_PROP_PROJECTOBJUSED) { propname = "objused"; if (!nvlist_exists(props, "objquota")) (void) nvlist_add_uint64(props, "objquota", 0); } else if (prop == ZFS_PROP_USEROBJQUOTA || prop == ZFS_PROP_GROUPOBJQUOTA || prop == ZFS_PROP_PROJECTOBJQUOTA) { propname = "objquota"; if (!nvlist_exists(props, "objused")) (void) nvlist_add_uint64(props, "objused", 0); } else { return (-1); } sizeidx = us_field_index(propname); if (sizeidx >= 0 && sizelen > cb->cb_width[sizeidx]) cb->cb_width[sizeidx] = sizelen; if (nvlist_add_uint64(props, propname, space) != 0) nomem(); return (0); } static void print_us_node(boolean_t scripted, boolean_t parsable, int *fields, int types, size_t *width, us_node_t *node) { nvlist_t *nvl = node->usn_nvl; char valstr[MAXNAMELEN]; boolean_t first = B_TRUE; int cfield = 0; int field; uint32_t ustype; /* Check type */ (void) nvlist_lookup_uint32(nvl, "type", &ustype); if (!(ustype & types)) return; while ((field = fields[cfield]) != USFIELD_LAST) { nvpair_t *nvp = NULL; data_type_t type; uint32_t val32; uint64_t val64; char *strval = "-"; while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { if (strcmp(nvpair_name(nvp), us_field_names[field]) == 0) break; } type = nvp == 
NULL ? DATA_TYPE_UNKNOWN : nvpair_type(nvp); switch (type) { case DATA_TYPE_UINT32: (void) nvpair_value_uint32(nvp, &val32); break; case DATA_TYPE_UINT64: (void) nvpair_value_uint64(nvp, &val64); break; case DATA_TYPE_STRING: (void) nvpair_value_string(nvp, &strval); break; case DATA_TYPE_UNKNOWN: break; default: (void) fprintf(stderr, "invalid data type\n"); } switch (field) { case USFIELD_TYPE: if (type == DATA_TYPE_UINT32) strval = (char *)us_type2str(val32); break; case USFIELD_NAME: if (type == DATA_TYPE_UINT64) { (void) sprintf(valstr, "%llu", (u_longlong_t)val64); strval = valstr; } break; case USFIELD_USED: case USFIELD_QUOTA: if (type == DATA_TYPE_UINT64) { if (parsable) { (void) sprintf(valstr, "%llu", (u_longlong_t)val64); strval = valstr; } else if (field == USFIELD_QUOTA && val64 == 0) { strval = "none"; } else { zfs_nicebytes(val64, valstr, sizeof (valstr)); strval = valstr; } } break; case USFIELD_OBJUSED: case USFIELD_OBJQUOTA: if (type == DATA_TYPE_UINT64) { if (parsable) { (void) sprintf(valstr, "%llu", (u_longlong_t)val64); strval = valstr; } else if (field == USFIELD_OBJQUOTA && val64 == 0) { strval = "none"; } else { zfs_nicenum(val64, valstr, sizeof (valstr)); strval = valstr; } } break; } if (!first) { if (scripted) (void) printf("\t"); else (void) printf(" "); } if (scripted) (void) printf("%s", strval); else if (field == USFIELD_TYPE || field == USFIELD_NAME) (void) printf("%-*s", (int)width[field], strval); else (void) printf("%*s", (int)width[field], strval); first = B_FALSE; cfield++; } (void) printf("\n"); } static void print_us(boolean_t scripted, boolean_t parsable, int *fields, int types, size_t *width, boolean_t rmnode, uu_avl_t *avl) { us_node_t *node; const char *col; int cfield = 0; int field; if (!scripted) { boolean_t first = B_TRUE; while ((field = fields[cfield]) != USFIELD_LAST) { col = gettext(us_field_hdr[field]); if (field == USFIELD_TYPE || field == USFIELD_NAME) { (void) printf(first ? "%-*s" : " %-*s", (int)width[field], col); } else { (void) printf(first ? 
"%*s" : " %*s", (int)width[field], col); } first = B_FALSE; cfield++; } (void) printf("\n"); } for (node = uu_avl_first(avl); node; node = uu_avl_next(avl, node)) { print_us_node(scripted, parsable, fields, types, width, node); if (rmnode) nvlist_free(node->usn_nvl); } } static int zfs_do_userspace(int argc, char **argv) { zfs_handle_t *zhp; zfs_userquota_prop_t p; uu_avl_pool_t *avl_pool; uu_avl_t *avl_tree; uu_avl_walk_t *walk; char *delim; char deffields[] = "type,name,used,quota,objused,objquota"; char *ofield = NULL; char *tfield = NULL; int cfield = 0; int fields[256]; int i; boolean_t scripted = B_FALSE; boolean_t prtnum = B_FALSE; boolean_t parsable = B_FALSE; boolean_t sid2posix = B_FALSE; int ret = 0; int c; zfs_sort_column_t *sortcol = NULL; int types = USTYPE_PSX_USR | USTYPE_SMB_USR; us_cbdata_t cb; us_node_t *node; us_node_t *rmnode; uu_list_pool_t *listpool; uu_list_t *list; uu_avl_index_t idx = 0; uu_list_index_t idx2 = 0; if (argc < 2) usage(B_FALSE); if (strcmp(argv[0], "groupspace") == 0) { /* Toggle default group types */ types = USTYPE_PSX_GRP | USTYPE_SMB_GRP; } else if (strcmp(argv[0], "projectspace") == 0) { types = USTYPE_PROJ; prtnum = B_TRUE; } while ((c = getopt(argc, argv, "nHpo:s:S:t:i")) != -1) { switch (c) { case 'n': if (types == USTYPE_PROJ) { (void) fprintf(stderr, gettext("invalid option 'n'\n")); usage(B_FALSE); } prtnum = B_TRUE; break; case 'H': scripted = B_TRUE; break; case 'p': parsable = B_TRUE; break; case 'o': ofield = optarg; break; case 's': case 'S': if (zfs_add_sort_column(&sortcol, optarg, c == 's' ? B_FALSE : B_TRUE) != 0) { (void) fprintf(stderr, gettext("invalid field '%s'\n"), optarg); usage(B_FALSE); } break; case 't': if (types == USTYPE_PROJ) { (void) fprintf(stderr, gettext("invalid option 't'\n")); usage(B_FALSE); } tfield = optarg; break; case 'i': if (types == USTYPE_PROJ) { (void) fprintf(stderr, gettext("invalid option 'i'\n")); usage(B_FALSE); } sid2posix = B_TRUE; break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing dataset name\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } /* Use default output fields if not specified using -o */ if (ofield == NULL) ofield = deffields; do { if ((delim = strchr(ofield, ',')) != NULL) *delim = '\0'; if ((fields[cfield++] = us_field_index(ofield)) == -1) { (void) fprintf(stderr, gettext("invalid type '%s' " "for -o option\n"), ofield); return (-1); } if (delim != NULL) ofield = delim + 1; } while (delim != NULL); fields[cfield] = USFIELD_LAST; /* Override output types (-t option) */ if (tfield != NULL) { types = 0; do { boolean_t found = B_FALSE; if ((delim = strchr(tfield, ',')) != NULL) *delim = '\0'; for (i = 0; i < sizeof (us_type_bits) / sizeof (int); i++) { if (strcmp(tfield, us_type_names[i]) == 0) { found = B_TRUE; types |= us_type_bits[i]; break; } } if (!found) { (void) fprintf(stderr, gettext("invalid type " "'%s' for -t option\n"), tfield); return (-1); } if (delim != NULL) tfield = delim + 1; } while (delim != NULL); } if ((zhp = zfs_path_to_zhandle(g_zfs, argv[0], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT)) == NULL) return (1); if (zhp->zfs_head_type != ZFS_TYPE_FILESYSTEM) { (void) fprintf(stderr, gettext("operation is only applicable " "to filesystems and their 
snapshots\n")); zfs_close(zhp); return (1); } if ((avl_pool = uu_avl_pool_create("us_avl_pool", sizeof (us_node_t), offsetof(us_node_t, usn_avlnode), us_compare, UU_DEFAULT)) == NULL) nomem(); if ((avl_tree = uu_avl_create(avl_pool, NULL, UU_DEFAULT)) == NULL) nomem(); /* Always add default sorting columns */ (void) zfs_add_sort_column(&sortcol, "type", B_FALSE); (void) zfs_add_sort_column(&sortcol, "name", B_FALSE); cb.cb_sortcol = sortcol; cb.cb_numname = prtnum; cb.cb_nicenum = !parsable; cb.cb_avl_pool = avl_pool; cb.cb_avl = avl_tree; cb.cb_sid2posix = sid2posix; for (i = 0; i < USFIELD_LAST; i++) cb.cb_width[i] = strlen(gettext(us_field_hdr[i])); for (p = 0; p < ZFS_NUM_USERQUOTA_PROPS; p++) { if ((zfs_prop_is_user(p) && !(types & (USTYPE_PSX_USR | USTYPE_SMB_USR))) || (zfs_prop_is_group(p) && !(types & (USTYPE_PSX_GRP | USTYPE_SMB_GRP))) || (zfs_prop_is_project(p) && types != USTYPE_PROJ)) continue; cb.cb_prop = p; if ((ret = zfs_userspace(zhp, p, userspace_cb, &cb)) != 0) { zfs_close(zhp); return (ret); } } zfs_close(zhp); /* Sort the list */ if ((node = uu_avl_first(avl_tree)) == NULL) return (0); us_populated = B_TRUE; listpool = uu_list_pool_create("tmplist", sizeof (us_node_t), offsetof(us_node_t, usn_listnode), NULL, UU_DEFAULT); list = uu_list_create(listpool, NULL, UU_DEFAULT); uu_list_node_init(node, &node->usn_listnode, listpool); while (node != NULL) { rmnode = node; node = uu_avl_next(avl_tree, node); uu_avl_remove(avl_tree, rmnode); if (uu_list_find(list, rmnode, NULL, &idx2) == NULL) uu_list_insert(list, rmnode, idx2); } for (node = uu_list_first(list); node != NULL; node = uu_list_next(list, node)) { us_sort_info_t sortinfo = { sortcol, cb.cb_numname }; if (uu_avl_find(avl_tree, node, &sortinfo, &idx) == NULL) uu_avl_insert(avl_tree, node, idx); } uu_list_destroy(list); uu_list_pool_destroy(listpool); /* Print and free node nvlist memory */ print_us(scripted, parsable, fields, types, cb.cb_width, B_TRUE, cb.cb_avl); zfs_free_sort_columns(sortcol); /* Clean up the AVL tree */ if ((walk = uu_avl_walk_start(cb.cb_avl, UU_WALK_ROBUST)) == NULL) nomem(); while ((node = uu_avl_walk_next(walk)) != NULL) { uu_avl_remove(cb.cb_avl, node); free(node); } uu_avl_walk_end(walk); uu_avl_destroy(avl_tree); uu_avl_pool_destroy(avl_pool); return (ret); } /* * list [-Hp][-r|-d max] [-o property[,...]] [-s property] ... [-S property] * [-t type[,...]] [filesystem|volume|snapshot] ... * * -H Scripted mode; elide headers and separate columns by tabs * -p Display values in parsable (literal) format. * -r Recurse over all children * -d Limit recursion by depth. * -o Control which fields to display. * -s Specify sort columns, descending order. * -S Specify sort columns, ascending order. * -t Control which object types to display. * * When given no arguments, list all filesystems in the system. * Otherwise, list the specified datasets, optionally recursing down them if * '-r' is specified. */ typedef struct list_cbdata { boolean_t cb_first; boolean_t cb_literal; boolean_t cb_scripted; zprop_list_t *cb_proplist; } list_cbdata_t; /* * Given a list of columns to display, output appropriate headers for each one. 
*/ static void print_header(list_cbdata_t *cb) { zprop_list_t *pl = cb->cb_proplist; char headerbuf[ZFS_MAXPROPLEN]; const char *header; int i; boolean_t first = B_TRUE; boolean_t right_justify; for (; pl != NULL; pl = pl->pl_next) { if (!first) { (void) printf(" "); } else { first = B_FALSE; } right_justify = B_FALSE; if (pl->pl_prop != ZPROP_INVAL) { header = zfs_prop_column_name(pl->pl_prop); right_justify = zfs_prop_align_right(pl->pl_prop); } else { for (i = 0; pl->pl_user_prop[i] != '\0'; i++) headerbuf[i] = toupper(pl->pl_user_prop[i]); headerbuf[i] = '\0'; header = headerbuf; } if (pl->pl_next == NULL && !right_justify) (void) printf("%s", header); else if (right_justify) (void) printf("%*s", (int)pl->pl_width, header); else (void) printf("%-*s", (int)pl->pl_width, header); } (void) printf("\n"); } /* * Given a dataset and a list of fields, print out all the properties according * to the described layout. */ static void print_dataset(zfs_handle_t *zhp, list_cbdata_t *cb) { zprop_list_t *pl = cb->cb_proplist; boolean_t first = B_TRUE; char property[ZFS_MAXPROPLEN]; nvlist_t *userprops = zfs_get_user_props(zhp); nvlist_t *propval; char *propstr; boolean_t right_justify; for (; pl != NULL; pl = pl->pl_next) { if (!first) { if (cb->cb_scripted) (void) printf("\t"); else (void) printf(" "); } else { first = B_FALSE; } if (pl->pl_prop == ZFS_PROP_NAME) { (void) strlcpy(property, zfs_get_name(zhp), sizeof (property)); propstr = property; right_justify = zfs_prop_align_right(pl->pl_prop); } else if (pl->pl_prop != ZPROP_INVAL) { if (zfs_prop_get(zhp, pl->pl_prop, property, sizeof (property), NULL, NULL, 0, cb->cb_literal) != 0) propstr = "-"; else propstr = property; right_justify = zfs_prop_align_right(pl->pl_prop); } else if (zfs_prop_userquota(pl->pl_user_prop)) { if (zfs_prop_get_userquota(zhp, pl->pl_user_prop, property, sizeof (property), cb->cb_literal) != 0) propstr = "-"; else propstr = property; right_justify = B_TRUE; } else if (zfs_prop_written(pl->pl_user_prop)) { if (zfs_prop_get_written(zhp, pl->pl_user_prop, property, sizeof (property), cb->cb_literal) != 0) propstr = "-"; else propstr = property; right_justify = B_TRUE; } else { if (nvlist_lookup_nvlist(userprops, pl->pl_user_prop, &propval) != 0) propstr = "-"; else verify(nvlist_lookup_string(propval, ZPROP_VALUE, &propstr) == 0); right_justify = B_FALSE; } /* * If this is being called in scripted mode, or if this is the * last column and it is left-justified, don't include a width * format specifier. */ if (cb->cb_scripted || (pl->pl_next == NULL && !right_justify)) (void) printf("%s", propstr); else if (right_justify) (void) printf("%*s", (int)pl->pl_width, propstr); else (void) printf("%-*s", (int)pl->pl_width, propstr); } (void) printf("\n"); } /* * Generic callback function to list a dataset or snapshot. 
*/ static int list_callback(zfs_handle_t *zhp, void *data) { list_cbdata_t *cbp = data; if (cbp->cb_first) { if (!cbp->cb_scripted) print_header(cbp); cbp->cb_first = B_FALSE; } print_dataset(zhp, cbp); return (0); } static int zfs_do_list(int argc, char **argv) { int c; static char default_fields[] = "name,used,available,referenced,mountpoint"; int types = ZFS_TYPE_DATASET; boolean_t types_specified = B_FALSE; char *fields = NULL; list_cbdata_t cb = { 0 }; char *value; int limit = 0; int ret = 0; zfs_sort_column_t *sortcol = NULL; int flags = ZFS_ITER_PROP_LISTSNAPS | ZFS_ITER_ARGS_CAN_BE_PATHS; /* check options */ while ((c = getopt(argc, argv, "HS:d:o:prs:t:")) != -1) { switch (c) { case 'o': fields = optarg; break; case 'p': cb.cb_literal = B_TRUE; flags |= ZFS_ITER_LITERAL_PROPS; break; case 'd': limit = parse_depth(optarg, &flags); break; case 'r': flags |= ZFS_ITER_RECURSE; break; case 'H': cb.cb_scripted = B_TRUE; break; case 's': if (zfs_add_sort_column(&sortcol, optarg, B_FALSE) != 0) { (void) fprintf(stderr, gettext("invalid property '%s'\n"), optarg); usage(B_FALSE); } break; case 'S': if (zfs_add_sort_column(&sortcol, optarg, B_TRUE) != 0) { (void) fprintf(stderr, gettext("invalid property '%s'\n"), optarg); usage(B_FALSE); } break; case 't': types = 0; types_specified = B_TRUE; flags &= ~ZFS_ITER_PROP_LISTSNAPS; while (*optarg != '\0') { static char *type_subopts[] = { "filesystem", "volume", "snapshot", "snap", "bookmark", "all", NULL }; switch (getsubopt(&optarg, type_subopts, &value)) { case 0: types |= ZFS_TYPE_FILESYSTEM; break; case 1: types |= ZFS_TYPE_VOLUME; break; case 2: case 3: types |= ZFS_TYPE_SNAPSHOT; break; case 4: types |= ZFS_TYPE_BOOKMARK; break; case 5: types = ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK; break; default: (void) fprintf(stderr, gettext("invalid type '%s'\n"), value); usage(B_FALSE); } } break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (fields == NULL) fields = default_fields; /* * If we are only going to list snapshot names and sort by name, * then we can use faster version. */ if (strcmp(fields, "name") == 0 && zfs_sort_only_by_name(sortcol)) flags |= ZFS_ITER_SIMPLE; /* * If "-o space" and no types were specified, don't display snapshots. */ if (strcmp(fields, "space") == 0 && types_specified == B_FALSE) types &= ~ZFS_TYPE_SNAPSHOT; /* * Handle users who want to list all snapshots or bookmarks * of the current dataset (ex. 'zfs list -t snapshot '). */ if ((types == ZFS_TYPE_SNAPSHOT || types == ZFS_TYPE_BOOKMARK) && argc > 0 && (flags & ZFS_ITER_RECURSE) == 0 && limit == 0) { flags |= (ZFS_ITER_DEPTH_LIMIT | ZFS_ITER_RECURSE); limit = 1; } /* * If the user specifies '-o all', the zprop_get_list() doesn't * normally include the name of the dataset. For 'zfs list', we always * want this property to be first. */ if (zprop_get_list(g_zfs, fields, &cb.cb_proplist, ZFS_TYPE_DATASET) != 0) usage(B_FALSE); cb.cb_first = B_TRUE; ret = zfs_for_each(argc, argv, flags, types, sortcol, &cb.cb_proplist, limit, list_callback, &cb); zprop_free_list(cb.cb_proplist); zfs_free_sort_columns(sortcol); if (ret == 0 && cb.cb_first && !cb.cb_scripted) (void) fprintf(stderr, gettext("no datasets available\n")); return (ret); } /* * zfs rename [-fu] * zfs rename [-f] -p * zfs rename [-u] -r * * Renames the given dataset to another of the same type. 
* * The '-p' flag creates all the non-existing ancestors of the target first. * The '-u' flag prevents file systems from being remounted during rename. */ /* ARGSUSED */ static int zfs_do_rename(int argc, char **argv) { zfs_handle_t *zhp; renameflags_t flags = { 0 }; int c; int ret = 0; int types; boolean_t parents = B_FALSE; /* check options */ while ((c = getopt(argc, argv, "pruf")) != -1) { switch (c) { case 'p': parents = B_TRUE; break; case 'r': flags.recursive = B_TRUE; break; case 'u': flags.nounmount = B_TRUE; break; case 'f': flags.forceunmount = B_TRUE; break; case '?': default: (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing source dataset " "argument\n")); usage(B_FALSE); } if (argc < 2) { (void) fprintf(stderr, gettext("missing target dataset " "argument\n")); usage(B_FALSE); } if (argc > 2) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } if (flags.recursive && parents) { (void) fprintf(stderr, gettext("-p and -r options are mutually " "exclusive\n")); usage(B_FALSE); } if (flags.nounmount && parents) { (void) fprintf(stderr, gettext("-u and -p options are mutually " "exclusive\n")); usage(B_FALSE); } if (flags.recursive && strchr(argv[0], '@') == 0) { (void) fprintf(stderr, gettext("source dataset for recursive " "rename must be a snapshot\n")); usage(B_FALSE); } if (flags.nounmount) types = ZFS_TYPE_FILESYSTEM; else if (parents) types = ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME; else types = ZFS_TYPE_DATASET; if ((zhp = zfs_open(g_zfs, argv[0], types)) == NULL) return (1); /* If we were asked and the name looks good, try to create ancestors. */ if (parents && zfs_name_valid(argv[1], zfs_get_type(zhp)) && zfs_create_ancestors(g_zfs, argv[1]) != 0) { zfs_close(zhp); return (1); } ret = (zfs_rename(zhp, argv[1], flags) != 0); zfs_close(zhp); return (ret); } /* * zfs promote * * Promotes the given clone fs to be the parent */ /* ARGSUSED */ static int zfs_do_promote(int argc, char **argv) { zfs_handle_t *zhp; int ret = 0; /* check options */ if (argc > 1 && argv[1][0] == '-') { (void) fprintf(stderr, gettext("invalid option '%c'\n"), argv[1][1]); usage(B_FALSE); } /* check number of arguments */ if (argc < 2) { (void) fprintf(stderr, gettext("missing clone filesystem" " argument\n")); usage(B_FALSE); } if (argc > 2) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } zhp = zfs_open(g_zfs, argv[1], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); if (zhp == NULL) return (1); ret = (zfs_promote(zhp) != 0); zfs_close(zhp); return (ret); } static int zfs_do_redact(int argc, char **argv) { char *snap = NULL; char *bookname = NULL; char **rsnaps = NULL; int numrsnaps = 0; argv++; argc--; if (argc < 3) { (void) fprintf(stderr, gettext("too few arguments\n")); usage(B_FALSE); } snap = argv[0]; bookname = argv[1]; rsnaps = argv + 2; numrsnaps = argc - 2; nvlist_t *rsnapnv = fnvlist_alloc(); for (int i = 0; i < numrsnaps; i++) { fnvlist_add_boolean(rsnapnv, rsnaps[i]); } int err = lzc_redact(snap, bookname, rsnapnv); fnvlist_free(rsnapnv); switch (err) { case 0: break; case ENOENT: (void) fprintf(stderr, gettext("provided snapshot %s does not exist\n"), snap); break; case EEXIST: (void) fprintf(stderr, gettext("specified redaction bookmark " "(%s) provided already exists\n"), bookname); break; case ENAMETOOLONG: (void) fprintf(stderr, gettext("provided bookmark name cannot " "be used, final 
name would be too long\n")); break; case E2BIG: (void) fprintf(stderr, gettext("too many redaction snapshots " "specified\n")); break; case EINVAL: if (strchr(bookname, '#') != NULL) (void) fprintf(stderr, gettext( "redaction bookmark name must not contain '#'\n")); else (void) fprintf(stderr, gettext( "redaction snapshot must be descendent of " "snapshot being redacted\n")); break; case EALREADY: (void) fprintf(stderr, gettext("attempted to redact redacted " "dataset or with respect to redacted dataset\n")); break; case ENOTSUP: (void) fprintf(stderr, gettext("redaction bookmarks feature " "not enabled\n")); break; case EXDEV: (void) fprintf(stderr, gettext("potentially invalid redaction " "snapshot; full dataset names required\n")); break; default: (void) fprintf(stderr, gettext("internal error: %s\n"), strerror(errno)); } return (err); } /* * zfs rollback [-rRf] * * -r Delete any intervening snapshots before doing rollback * -R Delete any snapshots and their clones * -f ignored for backwards compatibility * * Given a filesystem, rollback to a specific snapshot, discarding any changes * since then and making it the active dataset. If more recent snapshots exist, * the command will complain unless the '-r' flag is given. */ typedef struct rollback_cbdata { uint64_t cb_create; uint8_t cb_younger_ds_printed; boolean_t cb_first; int cb_doclones; char *cb_target; int cb_error; boolean_t cb_recurse; } rollback_cbdata_t; static int rollback_check_dependent(zfs_handle_t *zhp, void *data) { rollback_cbdata_t *cbp = data; if (cbp->cb_first && cbp->cb_recurse) { (void) fprintf(stderr, gettext("cannot rollback to " "'%s': clones of previous snapshots exist\n"), cbp->cb_target); (void) fprintf(stderr, gettext("use '-R' to " "force deletion of the following clones and " "dependents:\n")); cbp->cb_first = 0; cbp->cb_error = 1; } (void) fprintf(stderr, "%s\n", zfs_get_name(zhp)); zfs_close(zhp); return (0); } /* * Report some snapshots/bookmarks more recent than the one specified. * Used when '-r' is not specified. We reuse this same callback for the * snapshot dependents - if 'cb_dependent' is set, then this is a * dependent and we should report it without checking the transaction group. */ static int rollback_check(zfs_handle_t *zhp, void *data) { rollback_cbdata_t *cbp = data; /* * Max number of younger snapshots and/or bookmarks to display before * we stop the iteration. */ const uint8_t max_younger = 32; if (cbp->cb_doclones) { zfs_close(zhp); return (0); } if (zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG) > cbp->cb_create) { if (cbp->cb_first && !cbp->cb_recurse) { (void) fprintf(stderr, gettext("cannot " "rollback to '%s': more recent snapshots " "or bookmarks exist\n"), cbp->cb_target); (void) fprintf(stderr, gettext("use '-r' to " "force deletion of the following " "snapshots and bookmarks:\n")); cbp->cb_first = 0; cbp->cb_error = 1; } if (cbp->cb_recurse) { if (zfs_iter_dependents(zhp, B_TRUE, rollback_check_dependent, cbp) != 0) { zfs_close(zhp); return (-1); } } else { (void) fprintf(stderr, "%s\n", zfs_get_name(zhp)); cbp->cb_younger_ds_printed++; } } zfs_close(zhp); if (cbp->cb_younger_ds_printed == max_younger) { /* * This non-recursive rollback is going to fail due to the * presence of snapshots and/or bookmarks that are younger than * the rollback target. 
* We printed some of the offending objects, now we stop * zfs_iter_snapshot/bookmark iteration so we can fail fast and * avoid iterating over the rest of the younger objects */ (void) fprintf(stderr, gettext("Output limited to %d " "snapshots/bookmarks\n"), max_younger); return (-1); } return (0); } static int zfs_do_rollback(int argc, char **argv) { int ret = 0; int c; boolean_t force = B_FALSE; rollback_cbdata_t cb = { 0 }; zfs_handle_t *zhp, *snap; char parentname[ZFS_MAX_DATASET_NAME_LEN]; char *delim; uint64_t min_txg = 0; /* check options */ while ((c = getopt(argc, argv, "rRf")) != -1) { switch (c) { case 'r': cb.cb_recurse = 1; break; case 'R': cb.cb_recurse = 1; cb.cb_doclones = 1; break; case 'f': force = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing dataset argument\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } /* open the snapshot */ if ((snap = zfs_open(g_zfs, argv[0], ZFS_TYPE_SNAPSHOT)) == NULL) return (1); /* open the parent dataset */ (void) strlcpy(parentname, argv[0], sizeof (parentname)); verify((delim = strrchr(parentname, '@')) != NULL); *delim = '\0'; if ((zhp = zfs_open(g_zfs, parentname, ZFS_TYPE_DATASET)) == NULL) { zfs_close(snap); return (1); } /* * Check for more recent snapshots and/or clones based on the presence * of '-r' and '-R'. */ cb.cb_target = argv[0]; cb.cb_create = zfs_prop_get_int(snap, ZFS_PROP_CREATETXG); cb.cb_first = B_TRUE; cb.cb_error = 0; if (cb.cb_create > 0) min_txg = cb.cb_create; if ((ret = zfs_iter_snapshots(zhp, B_FALSE, rollback_check, &cb, min_txg, 0)) != 0) goto out; if ((ret = zfs_iter_bookmarks(zhp, rollback_check, &cb)) != 0) goto out; if ((ret = cb.cb_error) != 0) goto out; /* * Rollback parent to the given snapshot. */ ret = zfs_rollback(zhp, snap, force); out: zfs_close(snap); zfs_close(zhp); if (ret == 0) return (0); else return (1); } /* * zfs set property=value ... { fs | snap | vol } ... * * Sets the given properties for all datasets specified on the command line. 
*/ static int set_callback(zfs_handle_t *zhp, void *data) { nvlist_t *props = data; if (zfs_prop_set_list(zhp, props) != 0) { switch (libzfs_errno(g_zfs)) { case EZFS_MOUNTFAILED: (void) fprintf(stderr, gettext("property may be set " "but unable to remount filesystem\n")); break; case EZFS_SHARENFSFAILED: (void) fprintf(stderr, gettext("property may be set " "but unable to reshare filesystem\n")); break; } return (1); } return (0); } static int zfs_do_set(int argc, char **argv) { nvlist_t *props = NULL; int ds_start = -1; /* argv idx of first dataset arg */ int ret = 0; int i; /* check for options */ if (argc > 1 && argv[1][0] == '-') { (void) fprintf(stderr, gettext("invalid option '%c'\n"), argv[1][1]); usage(B_FALSE); } /* check number of arguments */ if (argc < 2) { (void) fprintf(stderr, gettext("missing arguments\n")); usage(B_FALSE); } if (argc < 3) { if (strchr(argv[1], '=') == NULL) { (void) fprintf(stderr, gettext("missing property=value " "argument(s)\n")); } else { (void) fprintf(stderr, gettext("missing dataset " "name(s)\n")); } usage(B_FALSE); } /* validate argument order: prop=val args followed by dataset args */ for (i = 1; i < argc; i++) { if (strchr(argv[i], '=') != NULL) { if (ds_start > 0) { /* out-of-order prop=val argument */ (void) fprintf(stderr, gettext("invalid " "argument order\n")); usage(B_FALSE); } } else if (ds_start < 0) { ds_start = i; } } if (ds_start < 0) { (void) fprintf(stderr, gettext("missing dataset name(s)\n")); usage(B_FALSE); } /* Populate a list of property settings */ if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) nomem(); for (i = 1; i < ds_start; i++) { if (!parseprop(props, argv[i])) { ret = -1; goto error; } } ret = zfs_for_each(argc - ds_start, argv + ds_start, 0, ZFS_TYPE_DATASET, NULL, NULL, 0, set_callback, props); error: nvlist_free(props); return (ret); } typedef struct snap_cbdata { nvlist_t *sd_nvl; boolean_t sd_recursive; const char *sd_snapname; } snap_cbdata_t; static int zfs_snapshot_cb(zfs_handle_t *zhp, void *arg) { snap_cbdata_t *sd = arg; char *name; int rv = 0; int error; if (sd->sd_recursive && zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) != 0) { zfs_close(zhp); return (0); } error = asprintf(&name, "%s@%s", zfs_get_name(zhp), sd->sd_snapname); if (error == -1) nomem(); fnvlist_add_boolean(sd->sd_nvl, name); free(name); if (sd->sd_recursive) rv = zfs_iter_filesystems(zhp, zfs_snapshot_cb, sd); zfs_close(zhp); return (rv); } /* * zfs snapshot [-r] [-o prop=value] ... * * Creates a snapshot with the given name. While functionally equivalent to * 'zfs create', it is a separate command to differentiate intent. 
*/ static int zfs_do_snapshot(int argc, char **argv) { int ret = 0; int c; nvlist_t *props; snap_cbdata_t sd = { 0 }; boolean_t multiple_snaps = B_FALSE; if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) nomem(); if (nvlist_alloc(&sd.sd_nvl, NV_UNIQUE_NAME, 0) != 0) nomem(); /* check options */ while ((c = getopt(argc, argv, "ro:")) != -1) { switch (c) { case 'o': if (!parseprop(props, optarg)) { nvlist_free(sd.sd_nvl); nvlist_free(props); return (1); } break; case 'r': sd.sd_recursive = B_TRUE; multiple_snaps = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); goto usage; } } argc -= optind; argv += optind; /* check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing snapshot argument\n")); goto usage; } if (argc > 1) multiple_snaps = B_TRUE; for (; argc > 0; argc--, argv++) { char *atp; zfs_handle_t *zhp; atp = strchr(argv[0], '@'); if (atp == NULL) goto usage; *atp = '\0'; sd.sd_snapname = atp + 1; zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); if (zhp == NULL) goto usage; if (zfs_snapshot_cb(zhp, &sd) != 0) goto usage; } ret = zfs_snapshot_nvl(g_zfs, sd.sd_nvl, props); nvlist_free(sd.sd_nvl); nvlist_free(props); if (ret != 0 && multiple_snaps) (void) fprintf(stderr, gettext("no snapshots were created\n")); return (ret != 0); usage: nvlist_free(sd.sd_nvl); nvlist_free(props); usage(B_FALSE); return (-1); } /* * Send a backup stream to stdout. */ static int zfs_do_send(int argc, char **argv) { char *fromname = NULL; char *toname = NULL; char *resume_token = NULL; char *cp; zfs_handle_t *zhp; sendflags_t flags = { 0 }; int c, err; nvlist_t *dbgnv = NULL; char *redactbook = NULL; struct option long_options[] = { {"replicate", no_argument, NULL, 'R'}, {"redact", required_argument, NULL, 'd'}, {"props", no_argument, NULL, 'p'}, {"parsable", no_argument, NULL, 'P'}, {"dedup", no_argument, NULL, 'D'}, {"verbose", no_argument, NULL, 'v'}, {"dryrun", no_argument, NULL, 'n'}, {"large-block", no_argument, NULL, 'L'}, {"embed", no_argument, NULL, 'e'}, {"resume", required_argument, NULL, 't'}, {"compressed", no_argument, NULL, 'c'}, {"raw", no_argument, NULL, 'w'}, {"backup", no_argument, NULL, 'b'}, {"holds", no_argument, NULL, 'h'}, {"saved", no_argument, NULL, 'S'}, {0, 0, 0, 0} }; /* check options */ while ((c = getopt_long(argc, argv, ":i:I:RDpvnPLeht:cwbd:S", long_options, NULL)) != -1) { switch (c) { case 'i': if (fromname) usage(B_FALSE); fromname = optarg; break; case 'I': if (fromname) usage(B_FALSE); fromname = optarg; flags.doall = B_TRUE; break; case 'R': flags.replicate = B_TRUE; break; case 'd': redactbook = optarg; break; case 'p': flags.props = B_TRUE; break; case 'b': flags.backup = B_TRUE; break; case 'h': flags.holds = B_TRUE; break; case 'P': flags.parsable = B_TRUE; break; case 'v': flags.verbosity++; flags.progress = B_TRUE; break; case 'D': (void) fprintf(stderr, gettext("WARNING: deduplicated send is no " "longer supported. 
A regular,\n" "non-deduplicated stream will be generated.\n\n")); break; case 'n': flags.dryrun = B_TRUE; break; case 'L': flags.largeblock = B_TRUE; break; case 'e': flags.embed_data = B_TRUE; break; case 't': resume_token = optarg; break; case 'c': flags.compress = B_TRUE; break; case 'w': flags.raw = B_TRUE; flags.compress = B_TRUE; flags.embed_data = B_TRUE; flags.largeblock = B_TRUE; break; case 'S': flags.saved = B_TRUE; break; case ':': /* * If a parameter was not passed, optopt contains the * value that would normally lead us into the * appropriate case statement. If it's > 256, then this * must be a longopt and we should look at argv to get * the string. Otherwise it's just the character, so we * should use it directly. */ if (optopt <= UINT8_MAX) { (void) fprintf(stderr, gettext("missing argument for '%c' " "option\n"), optopt); } else { (void) fprintf(stderr, gettext("missing argument for '%s' " "option\n"), argv[optind - 1]); } usage(B_FALSE); break; case '?': /*FALLTHROUGH*/ default: /* * If an invalid flag was passed, optopt contains the * character if it was a short flag, or 0 if it was a * longopt. */ if (optopt != 0) { (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); } else { (void) fprintf(stderr, gettext("invalid option '%s'\n"), argv[optind - 1]); } usage(B_FALSE); } } if (flags.parsable && flags.verbosity == 0) flags.verbosity = 1; argc -= optind; argv += optind; if (resume_token != NULL) { if (fromname != NULL || flags.replicate || flags.props || flags.backup || flags.holds || flags.saved || redactbook != NULL) { (void) fprintf(stderr, gettext("invalid flags combined with -t\n")); usage(B_FALSE); } if (argc > 0) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } } else { if (argc < 1) { (void) fprintf(stderr, gettext("missing snapshot argument\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } } if (flags.saved) { if (fromname != NULL || flags.replicate || flags.props || flags.doall || flags.backup || flags.holds || flags.largeblock || flags.embed_data || flags.compress || flags.raw || redactbook != NULL) { (void) fprintf(stderr, gettext("incompatible flags " "combined with saved send flag\n")); usage(B_FALSE); } if (strchr(argv[0], '@') != NULL) { (void) fprintf(stderr, gettext("saved send must " "specify the dataset with partially-received " "state\n")); usage(B_FALSE); } } if (flags.raw && redactbook != NULL) { (void) fprintf(stderr, gettext("Error: raw sends may not be redacted.\n")); return (1); } if (!flags.dryrun && isatty(STDOUT_FILENO)) { (void) fprintf(stderr, gettext("Error: Stream can not be written to a terminal.\n" "You must redirect standard output.\n")); return (1); } if (flags.saved) { zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_DATASET); if (zhp == NULL) return (1); err = zfs_send_saved(zhp, &flags, STDOUT_FILENO, resume_token); zfs_close(zhp); return (err != 0); } else if (resume_token != NULL) { return (zfs_send_resume(g_zfs, &flags, STDOUT_FILENO, resume_token)); } /* * For everything except -R and -I, use the new, cleaner code path. */ if (!(flags.replicate || flags.doall)) { char frombuf[ZFS_MAX_DATASET_NAME_LEN]; if (fromname != NULL && (strchr(fromname, '#') == NULL && strchr(fromname, '@') == NULL)) { /* * Neither bookmark or snapshot was specified. Print a * warning, and assume snapshot. */ (void) fprintf(stderr, "Warning: incremental source " "didn't specify type, assuming snapshot. 
Use '@' " "or '#' prefix to avoid ambiguity.\n"); (void) snprintf(frombuf, sizeof (frombuf), "@%s", fromname); fromname = frombuf; } if (fromname != NULL && (fromname[0] == '#' || fromname[0] == '@')) { /* * Incremental source name begins with # or @. * Default to same fs as target. */ char tmpbuf[ZFS_MAX_DATASET_NAME_LEN]; (void) strlcpy(tmpbuf, fromname, sizeof (tmpbuf)); (void) strlcpy(frombuf, argv[0], sizeof (frombuf)); cp = strchr(frombuf, '@'); if (cp != NULL) *cp = '\0'; (void) strlcat(frombuf, tmpbuf, sizeof (frombuf)); fromname = frombuf; } zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_DATASET); if (zhp == NULL) return (1); err = zfs_send_one(zhp, fromname, STDOUT_FILENO, &flags, redactbook); zfs_close(zhp); return (err != 0); } if (fromname != NULL && strchr(fromname, '#')) { (void) fprintf(stderr, gettext("Error: multiple snapshots cannot be " "sent from a bookmark.\n")); return (1); } if (redactbook != NULL) { (void) fprintf(stderr, gettext("Error: multiple snapshots " "cannot be sent redacted.\n")); return (1); } if ((cp = strchr(argv[0], '@')) == NULL) { (void) fprintf(stderr, gettext("Error: " "Unsupported flag with filesystem or bookmark.\n")); return (1); } *cp = '\0'; toname = cp + 1; zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); if (zhp == NULL) return (1); /* * If they specified the full path to the snapshot, chop off * everything except the short name of the snapshot, but special * case if they specify the origin. */ if (fromname && (cp = strchr(fromname, '@')) != NULL) { char origin[ZFS_MAX_DATASET_NAME_LEN]; zprop_source_t src; (void) zfs_prop_get(zhp, ZFS_PROP_ORIGIN, origin, sizeof (origin), &src, NULL, 0, B_FALSE); if (strcmp(origin, fromname) == 0) { fromname = NULL; flags.fromorigin = B_TRUE; } else { *cp = '\0'; if (cp != fromname && strcmp(argv[0], fromname)) { (void) fprintf(stderr, gettext("incremental source must be " "in same filesystem\n")); usage(B_FALSE); } fromname = cp + 1; if (strchr(fromname, '@') || strchr(fromname, '/')) { (void) fprintf(stderr, gettext("invalid incremental source\n")); usage(B_FALSE); } } } if (flags.replicate && fromname == NULL) flags.doall = B_TRUE; err = zfs_send(zhp, fromname, toname, &flags, STDOUT_FILENO, NULL, 0, flags.verbosity >= 3 ? &dbgnv : NULL); if (flags.verbosity >= 3 && dbgnv != NULL) { /* * dump_nvlist prints to stdout, but that's been * redirected to a file. Make it print to stderr * instead. */ (void) dup2(STDERR_FILENO, STDOUT_FILENO); dump_nvlist(dbgnv, 0); nvlist_free(dbgnv); } zfs_close(zhp); return (err != 0); } /* * Restore a backup stream from stdin. 
*/ static int zfs_do_receive(int argc, char **argv) { int c, err = 0; recvflags_t flags = { 0 }; boolean_t abort_resumable = B_FALSE; nvlist_t *props; if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) nomem(); /* check options */ while ((c = getopt(argc, argv, ":o:x:dehMnuvFsA")) != -1) { switch (c) { case 'o': if (!parseprop(props, optarg)) { nvlist_free(props); usage(B_FALSE); } break; case 'x': if (!parsepropname(props, optarg)) { nvlist_free(props); usage(B_FALSE); } break; case 'd': if (flags.istail) { (void) fprintf(stderr, gettext("invalid option " "combination: -d and -e are mutually " "exclusive\n")); usage(B_FALSE); } flags.isprefix = B_TRUE; break; case 'e': if (flags.isprefix) { (void) fprintf(stderr, gettext("invalid option " "combination: -d and -e are mutually " "exclusive\n")); usage(B_FALSE); } flags.istail = B_TRUE; break; case 'h': flags.skipholds = B_TRUE; break; case 'M': flags.forceunmount = B_TRUE; break; case 'n': flags.dryrun = B_TRUE; break; case 'u': flags.nomount = B_TRUE; break; case 'v': flags.verbose = B_TRUE; break; case 's': flags.resumable = B_TRUE; break; case 'F': flags.force = B_TRUE; break; case 'A': abort_resumable = B_TRUE; break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* zfs recv -e (use "tail" name) implies -d (remove dataset "head") */ if (flags.istail) flags.isprefix = B_TRUE; /* check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing snapshot argument\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } if (abort_resumable) { if (flags.isprefix || flags.istail || flags.dryrun || flags.resumable || flags.nomount) { (void) fprintf(stderr, gettext("invalid option\n")); usage(B_FALSE); } char namebuf[ZFS_MAX_DATASET_NAME_LEN]; (void) snprintf(namebuf, sizeof (namebuf), "%s/%%recv", argv[0]); if (zfs_dataset_exists(g_zfs, namebuf, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME)) { zfs_handle_t *zhp = zfs_open(g_zfs, namebuf, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); if (zhp == NULL) { nvlist_free(props); return (1); } err = zfs_destroy(zhp, B_FALSE); zfs_close(zhp); } else { zfs_handle_t *zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); if (zhp == NULL) usage(B_FALSE); if (!zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) || zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN, NULL, 0, NULL, NULL, 0, B_TRUE) == -1) { (void) fprintf(stderr, gettext("'%s' does not have any " "resumable receive state to abort\n"), argv[0]); nvlist_free(props); zfs_close(zhp); return (1); } err = zfs_destroy(zhp, B_FALSE); zfs_close(zhp); } nvlist_free(props); return (err != 0); } if (isatty(STDIN_FILENO)) { (void) fprintf(stderr, gettext("Error: Backup stream can not be read " "from a terminal.\n" "You must redirect standard input.\n")); nvlist_free(props); return (1); } err = zfs_receive(g_zfs, argv[0], props, &flags, STDIN_FILENO, NULL); nvlist_free(props); return (err != 0); } /* * allow/unallow stuff */ /* copied from zfs/sys/dsl_deleg.h */ #define ZFS_DELEG_PERM_CREATE "create" #define ZFS_DELEG_PERM_DESTROY "destroy" #define ZFS_DELEG_PERM_SNAPSHOT "snapshot" #define ZFS_DELEG_PERM_ROLLBACK "rollback" #define ZFS_DELEG_PERM_CLONE "clone" #define ZFS_DELEG_PERM_PROMOTE "promote" #define ZFS_DELEG_PERM_RENAME "rename" #define ZFS_DELEG_PERM_MOUNT "mount" #define 
ZFS_DELEG_PERM_SHARE "share" #define ZFS_DELEG_PERM_SEND "send" #define ZFS_DELEG_PERM_RECEIVE "receive" #define ZFS_DELEG_PERM_ALLOW "allow" #define ZFS_DELEG_PERM_USERPROP "userprop" #define ZFS_DELEG_PERM_VSCAN "vscan" /* ??? */ #define ZFS_DELEG_PERM_USERQUOTA "userquota" #define ZFS_DELEG_PERM_GROUPQUOTA "groupquota" #define ZFS_DELEG_PERM_USERUSED "userused" #define ZFS_DELEG_PERM_GROUPUSED "groupused" #define ZFS_DELEG_PERM_USEROBJQUOTA "userobjquota" #define ZFS_DELEG_PERM_GROUPOBJQUOTA "groupobjquota" #define ZFS_DELEG_PERM_USEROBJUSED "userobjused" #define ZFS_DELEG_PERM_GROUPOBJUSED "groupobjused" #define ZFS_DELEG_PERM_HOLD "hold" #define ZFS_DELEG_PERM_RELEASE "release" #define ZFS_DELEG_PERM_DIFF "diff" #define ZFS_DELEG_PERM_BOOKMARK "bookmark" #define ZFS_DELEG_PERM_LOAD_KEY "load-key" #define ZFS_DELEG_PERM_CHANGE_KEY "change-key" #define ZFS_DELEG_PERM_PROJECTUSED "projectused" #define ZFS_DELEG_PERM_PROJECTQUOTA "projectquota" #define ZFS_DELEG_PERM_PROJECTOBJUSED "projectobjused" #define ZFS_DELEG_PERM_PROJECTOBJQUOTA "projectobjquota" #define ZFS_NUM_DELEG_NOTES ZFS_DELEG_NOTE_NONE static zfs_deleg_perm_tab_t zfs_deleg_perm_tbl[] = { { ZFS_DELEG_PERM_ALLOW, ZFS_DELEG_NOTE_ALLOW }, { ZFS_DELEG_PERM_CLONE, ZFS_DELEG_NOTE_CLONE }, { ZFS_DELEG_PERM_CREATE, ZFS_DELEG_NOTE_CREATE }, { ZFS_DELEG_PERM_DESTROY, ZFS_DELEG_NOTE_DESTROY }, { ZFS_DELEG_PERM_DIFF, ZFS_DELEG_NOTE_DIFF}, { ZFS_DELEG_PERM_HOLD, ZFS_DELEG_NOTE_HOLD }, { ZFS_DELEG_PERM_MOUNT, ZFS_DELEG_NOTE_MOUNT }, { ZFS_DELEG_PERM_PROMOTE, ZFS_DELEG_NOTE_PROMOTE }, { ZFS_DELEG_PERM_RECEIVE, ZFS_DELEG_NOTE_RECEIVE }, { ZFS_DELEG_PERM_RELEASE, ZFS_DELEG_NOTE_RELEASE }, { ZFS_DELEG_PERM_RENAME, ZFS_DELEG_NOTE_RENAME }, { ZFS_DELEG_PERM_ROLLBACK, ZFS_DELEG_NOTE_ROLLBACK }, { ZFS_DELEG_PERM_SEND, ZFS_DELEG_NOTE_SEND }, { ZFS_DELEG_PERM_SHARE, ZFS_DELEG_NOTE_SHARE }, { ZFS_DELEG_PERM_SNAPSHOT, ZFS_DELEG_NOTE_SNAPSHOT }, { ZFS_DELEG_PERM_BOOKMARK, ZFS_DELEG_NOTE_BOOKMARK }, { ZFS_DELEG_PERM_LOAD_KEY, ZFS_DELEG_NOTE_LOAD_KEY }, { ZFS_DELEG_PERM_CHANGE_KEY, ZFS_DELEG_NOTE_CHANGE_KEY }, { ZFS_DELEG_PERM_GROUPQUOTA, ZFS_DELEG_NOTE_GROUPQUOTA }, { ZFS_DELEG_PERM_GROUPUSED, ZFS_DELEG_NOTE_GROUPUSED }, { ZFS_DELEG_PERM_USERPROP, ZFS_DELEG_NOTE_USERPROP }, { ZFS_DELEG_PERM_USERQUOTA, ZFS_DELEG_NOTE_USERQUOTA }, { ZFS_DELEG_PERM_USERUSED, ZFS_DELEG_NOTE_USERUSED }, { ZFS_DELEG_PERM_USEROBJQUOTA, ZFS_DELEG_NOTE_USEROBJQUOTA }, { ZFS_DELEG_PERM_USEROBJUSED, ZFS_DELEG_NOTE_USEROBJUSED }, { ZFS_DELEG_PERM_GROUPOBJQUOTA, ZFS_DELEG_NOTE_GROUPOBJQUOTA }, { ZFS_DELEG_PERM_GROUPOBJUSED, ZFS_DELEG_NOTE_GROUPOBJUSED }, { ZFS_DELEG_PERM_PROJECTUSED, ZFS_DELEG_NOTE_PROJECTUSED }, { ZFS_DELEG_PERM_PROJECTQUOTA, ZFS_DELEG_NOTE_PROJECTQUOTA }, { ZFS_DELEG_PERM_PROJECTOBJUSED, ZFS_DELEG_NOTE_PROJECTOBJUSED }, { ZFS_DELEG_PERM_PROJECTOBJQUOTA, ZFS_DELEG_NOTE_PROJECTOBJQUOTA }, { NULL, ZFS_DELEG_NOTE_NONE } }; /* permission structure */ typedef struct deleg_perm { zfs_deleg_who_type_t dp_who_type; const char *dp_name; boolean_t dp_local; boolean_t dp_descend; } deleg_perm_t; /* */ typedef struct deleg_perm_node { deleg_perm_t dpn_perm; uu_avl_node_t dpn_avl_node; } deleg_perm_node_t; typedef struct fs_perm fs_perm_t; /* permissions set */ typedef struct who_perm { zfs_deleg_who_type_t who_type; const char *who_name; /* id */ char who_ug_name[256]; /* user/group name */ fs_perm_t *who_fsperm; /* uplink */ uu_avl_t *who_deleg_perm_avl; /* permissions */ } who_perm_t; /* */ typedef struct who_perm_node { who_perm_t who_perm; uu_avl_node_t who_avl_node; } 
who_perm_node_t; typedef struct fs_perm_set fs_perm_set_t; /* fs permissions */ struct fs_perm { const char *fsp_name; uu_avl_t *fsp_sc_avl; /* sets,create */ uu_avl_t *fsp_uge_avl; /* user,group,everyone */ fs_perm_set_t *fsp_set; /* uplink */ }; /* */ typedef struct fs_perm_node { fs_perm_t fspn_fsperm; uu_avl_t *fspn_avl; uu_list_node_t fspn_list_node; } fs_perm_node_t; /* top level structure */ struct fs_perm_set { uu_list_pool_t *fsps_list_pool; uu_list_t *fsps_list; /* list of fs_perms */ uu_avl_pool_t *fsps_named_set_avl_pool; uu_avl_pool_t *fsps_who_perm_avl_pool; uu_avl_pool_t *fsps_deleg_perm_avl_pool; }; static inline const char * deleg_perm_type(zfs_deleg_note_t note) { /* subcommands */ switch (note) { /* SUBCOMMANDS */ /* OTHER */ case ZFS_DELEG_NOTE_GROUPQUOTA: case ZFS_DELEG_NOTE_GROUPUSED: case ZFS_DELEG_NOTE_USERPROP: case ZFS_DELEG_NOTE_USERQUOTA: case ZFS_DELEG_NOTE_USERUSED: case ZFS_DELEG_NOTE_USEROBJQUOTA: case ZFS_DELEG_NOTE_USEROBJUSED: case ZFS_DELEG_NOTE_GROUPOBJQUOTA: case ZFS_DELEG_NOTE_GROUPOBJUSED: case ZFS_DELEG_NOTE_PROJECTUSED: case ZFS_DELEG_NOTE_PROJECTQUOTA: case ZFS_DELEG_NOTE_PROJECTOBJUSED: case ZFS_DELEG_NOTE_PROJECTOBJQUOTA: /* other */ return (gettext("other")); default: return (gettext("subcommand")); } } static int who_type2weight(zfs_deleg_who_type_t who_type) { int res; switch (who_type) { case ZFS_DELEG_NAMED_SET_SETS: case ZFS_DELEG_NAMED_SET: res = 0; break; case ZFS_DELEG_CREATE_SETS: case ZFS_DELEG_CREATE: res = 1; break; case ZFS_DELEG_USER_SETS: case ZFS_DELEG_USER: res = 2; break; case ZFS_DELEG_GROUP_SETS: case ZFS_DELEG_GROUP: res = 3; break; case ZFS_DELEG_EVERYONE_SETS: case ZFS_DELEG_EVERYONE: res = 4; break; default: res = -1; } return (res); } /* ARGSUSED */ static int who_perm_compare(const void *larg, const void *rarg, void *unused) { const who_perm_node_t *l = larg; const who_perm_node_t *r = rarg; zfs_deleg_who_type_t ltype = l->who_perm.who_type; zfs_deleg_who_type_t rtype = r->who_perm.who_type; int lweight = who_type2weight(ltype); int rweight = who_type2weight(rtype); int res = lweight - rweight; if (res == 0) res = strncmp(l->who_perm.who_name, r->who_perm.who_name, ZFS_MAX_DELEG_NAME-1); if (res == 0) return (0); if (res > 0) return (1); else return (-1); } /* ARGSUSED */ static int deleg_perm_compare(const void *larg, const void *rarg, void *unused) { const deleg_perm_node_t *l = larg; const deleg_perm_node_t *r = rarg; int res = strncmp(l->dpn_perm.dp_name, r->dpn_perm.dp_name, ZFS_MAX_DELEG_NAME-1); if (res == 0) return (0); if (res > 0) return (1); else return (-1); } static inline void fs_perm_set_init(fs_perm_set_t *fspset) { bzero(fspset, sizeof (fs_perm_set_t)); if ((fspset->fsps_list_pool = uu_list_pool_create("fsps_list_pool", sizeof (fs_perm_node_t), offsetof(fs_perm_node_t, fspn_list_node), NULL, UU_DEFAULT)) == NULL) nomem(); if ((fspset->fsps_list = uu_list_create(fspset->fsps_list_pool, NULL, UU_DEFAULT)) == NULL) nomem(); if ((fspset->fsps_named_set_avl_pool = uu_avl_pool_create( "named_set_avl_pool", sizeof (who_perm_node_t), offsetof( who_perm_node_t, who_avl_node), who_perm_compare, UU_DEFAULT)) == NULL) nomem(); if ((fspset->fsps_who_perm_avl_pool = uu_avl_pool_create( "who_perm_avl_pool", sizeof (who_perm_node_t), offsetof( who_perm_node_t, who_avl_node), who_perm_compare, UU_DEFAULT)) == NULL) nomem(); if ((fspset->fsps_deleg_perm_avl_pool = uu_avl_pool_create( "deleg_perm_avl_pool", sizeof (deleg_perm_node_t), offsetof( deleg_perm_node_t, dpn_avl_node), deleg_perm_compare, UU_DEFAULT)) == NULL) 
nomem(); } static inline void fs_perm_fini(fs_perm_t *); static inline void who_perm_fini(who_perm_t *); static inline void fs_perm_set_fini(fs_perm_set_t *fspset) { fs_perm_node_t *node = uu_list_first(fspset->fsps_list); while (node != NULL) { fs_perm_node_t *next_node = uu_list_next(fspset->fsps_list, node); fs_perm_t *fsperm = &node->fspn_fsperm; fs_perm_fini(fsperm); uu_list_remove(fspset->fsps_list, node); free(node); node = next_node; } uu_avl_pool_destroy(fspset->fsps_named_set_avl_pool); uu_avl_pool_destroy(fspset->fsps_who_perm_avl_pool); uu_avl_pool_destroy(fspset->fsps_deleg_perm_avl_pool); } static inline void deleg_perm_init(deleg_perm_t *deleg_perm, zfs_deleg_who_type_t type, const char *name) { deleg_perm->dp_who_type = type; deleg_perm->dp_name = name; } static inline void who_perm_init(who_perm_t *who_perm, fs_perm_t *fsperm, zfs_deleg_who_type_t type, const char *name) { uu_avl_pool_t *pool; pool = fsperm->fsp_set->fsps_deleg_perm_avl_pool; bzero(who_perm, sizeof (who_perm_t)); if ((who_perm->who_deleg_perm_avl = uu_avl_create(pool, NULL, UU_DEFAULT)) == NULL) nomem(); who_perm->who_type = type; who_perm->who_name = name; who_perm->who_fsperm = fsperm; } static inline void who_perm_fini(who_perm_t *who_perm) { deleg_perm_node_t *node = uu_avl_first(who_perm->who_deleg_perm_avl); while (node != NULL) { deleg_perm_node_t *next_node = uu_avl_next(who_perm->who_deleg_perm_avl, node); uu_avl_remove(who_perm->who_deleg_perm_avl, node); free(node); node = next_node; } uu_avl_destroy(who_perm->who_deleg_perm_avl); } static inline void fs_perm_init(fs_perm_t *fsperm, fs_perm_set_t *fspset, const char *fsname) { uu_avl_pool_t *nset_pool = fspset->fsps_named_set_avl_pool; uu_avl_pool_t *who_pool = fspset->fsps_who_perm_avl_pool; bzero(fsperm, sizeof (fs_perm_t)); if ((fsperm->fsp_sc_avl = uu_avl_create(nset_pool, NULL, UU_DEFAULT)) == NULL) nomem(); if ((fsperm->fsp_uge_avl = uu_avl_create(who_pool, NULL, UU_DEFAULT)) == NULL) nomem(); fsperm->fsp_set = fspset; fsperm->fsp_name = fsname; } static inline void fs_perm_fini(fs_perm_t *fsperm) { who_perm_node_t *node = uu_avl_first(fsperm->fsp_sc_avl); while (node != NULL) { who_perm_node_t *next_node = uu_avl_next(fsperm->fsp_sc_avl, node); who_perm_t *who_perm = &node->who_perm; who_perm_fini(who_perm); uu_avl_remove(fsperm->fsp_sc_avl, node); free(node); node = next_node; } node = uu_avl_first(fsperm->fsp_uge_avl); while (node != NULL) { who_perm_node_t *next_node = uu_avl_next(fsperm->fsp_uge_avl, node); who_perm_t *who_perm = &node->who_perm; who_perm_fini(who_perm); uu_avl_remove(fsperm->fsp_uge_avl, node); free(node); node = next_node; } uu_avl_destroy(fsperm->fsp_sc_avl); uu_avl_destroy(fsperm->fsp_uge_avl); } static void set_deleg_perm_node(uu_avl_t *avl, deleg_perm_node_t *node, zfs_deleg_who_type_t who_type, const char *name, char locality) { uu_avl_index_t idx = 0; deleg_perm_node_t *found_node = NULL; deleg_perm_t *deleg_perm = &node->dpn_perm; deleg_perm_init(deleg_perm, who_type, name); if ((found_node = uu_avl_find(avl, node, NULL, &idx)) == NULL) uu_avl_insert(avl, node, idx); else { node = found_node; deleg_perm = &node->dpn_perm; } switch (locality) { case ZFS_DELEG_LOCAL: deleg_perm->dp_local = B_TRUE; break; case ZFS_DELEG_DESCENDENT: deleg_perm->dp_descend = B_TRUE; break; case ZFS_DELEG_NA: break; default: assert(B_FALSE); /* invalid locality */ } } static inline int parse_who_perm(who_perm_t *who_perm, nvlist_t *nvl, char locality) { nvpair_t *nvp = NULL; fs_perm_set_t *fspset = who_perm->who_fsperm->fsp_set; 
uu_avl_t *avl = who_perm->who_deleg_perm_avl; zfs_deleg_who_type_t who_type = who_perm->who_type; while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { const char *name = nvpair_name(nvp); data_type_t type = nvpair_type(nvp); uu_avl_pool_t *avl_pool = fspset->fsps_deleg_perm_avl_pool; deleg_perm_node_t *node = safe_malloc(sizeof (deleg_perm_node_t)); VERIFY(type == DATA_TYPE_BOOLEAN); uu_avl_node_init(node, &node->dpn_avl_node, avl_pool); set_deleg_perm_node(avl, node, who_type, name, locality); } return (0); } static inline int parse_fs_perm(fs_perm_t *fsperm, nvlist_t *nvl) { nvpair_t *nvp = NULL; fs_perm_set_t *fspset = fsperm->fsp_set; while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { nvlist_t *nvl2 = NULL; const char *name = nvpair_name(nvp); uu_avl_t *avl = NULL; uu_avl_pool_t *avl_pool = NULL; zfs_deleg_who_type_t perm_type = name[0]; char perm_locality = name[1]; const char *perm_name = name + 3; who_perm_t *who_perm = NULL; assert('$' == name[2]); if (nvpair_value_nvlist(nvp, &nvl2) != 0) return (-1); switch (perm_type) { case ZFS_DELEG_CREATE: case ZFS_DELEG_CREATE_SETS: case ZFS_DELEG_NAMED_SET: case ZFS_DELEG_NAMED_SET_SETS: avl_pool = fspset->fsps_named_set_avl_pool; avl = fsperm->fsp_sc_avl; break; case ZFS_DELEG_USER: case ZFS_DELEG_USER_SETS: case ZFS_DELEG_GROUP: case ZFS_DELEG_GROUP_SETS: case ZFS_DELEG_EVERYONE: case ZFS_DELEG_EVERYONE_SETS: avl_pool = fspset->fsps_who_perm_avl_pool; avl = fsperm->fsp_uge_avl; break; default: assert(!"unhandled zfs_deleg_who_type_t"); } who_perm_node_t *found_node = NULL; who_perm_node_t *node = safe_malloc( sizeof (who_perm_node_t)); who_perm = &node->who_perm; uu_avl_index_t idx = 0; uu_avl_node_init(node, &node->who_avl_node, avl_pool); who_perm_init(who_perm, fsperm, perm_type, perm_name); if ((found_node = uu_avl_find(avl, node, NULL, &idx)) == NULL) { if (avl == fsperm->fsp_uge_avl) { uid_t rid = 0; struct passwd *p = NULL; struct group *g = NULL; const char *nice_name = NULL; switch (perm_type) { case ZFS_DELEG_USER_SETS: case ZFS_DELEG_USER: rid = atoi(perm_name); p = getpwuid(rid); if (p) nice_name = p->pw_name; break; case ZFS_DELEG_GROUP_SETS: case ZFS_DELEG_GROUP: rid = atoi(perm_name); g = getgrgid(rid); if (g) nice_name = g->gr_name; break; default: break; } if (nice_name != NULL) { (void) strlcpy( node->who_perm.who_ug_name, nice_name, 256); } else { /* User or group unknown */ (void) snprintf( node->who_perm.who_ug_name, sizeof (node->who_perm.who_ug_name), "(unknown: %d)", rid); } } uu_avl_insert(avl, node, idx); } else { node = found_node; who_perm = &node->who_perm; } assert(who_perm != NULL); (void) parse_who_perm(who_perm, nvl2, perm_locality); } return (0); } static inline int parse_fs_perm_set(fs_perm_set_t *fspset, nvlist_t *nvl) { nvpair_t *nvp = NULL; uu_avl_index_t idx = 0; while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { nvlist_t *nvl2 = NULL; const char *fsname = nvpair_name(nvp); data_type_t type = nvpair_type(nvp); fs_perm_t *fsperm = NULL; fs_perm_node_t *node = safe_malloc(sizeof (fs_perm_node_t)); if (node == NULL) nomem(); fsperm = &node->fspn_fsperm; VERIFY(DATA_TYPE_NVLIST == type); uu_list_node_init(node, &node->fspn_list_node, fspset->fsps_list_pool); idx = uu_list_numnodes(fspset->fsps_list); fs_perm_init(fsperm, fspset, fsname); if (nvpair_value_nvlist(nvp, &nvl2) != 0) return (-1); (void) parse_fs_perm(fsperm, nvl2); uu_list_insert(fspset->fsps_list, node, idx); } return (0); } static inline const char * deleg_perm_comment(zfs_deleg_note_t note) { const char *str = ""; /* subcommands 
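 * (each case below supplies the NOTES text that allow_usage()
 * prints beside the corresponding permission name)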
*/ switch (note) { /* SUBCOMMANDS */ case ZFS_DELEG_NOTE_ALLOW: str = gettext("Must also have the permission that is being" "\n\t\t\t\tallowed"); break; case ZFS_DELEG_NOTE_CLONE: str = gettext("Must also have the 'create' ability and 'mount'" "\n\t\t\t\tability in the origin file system"); break; case ZFS_DELEG_NOTE_CREATE: str = gettext("Must also have the 'mount' ability"); break; case ZFS_DELEG_NOTE_DESTROY: str = gettext("Must also have the 'mount' ability"); break; case ZFS_DELEG_NOTE_DIFF: str = gettext("Allows lookup of paths within a dataset;" "\n\t\t\t\tgiven an object number. Ordinary users need this" "\n\t\t\t\tin order to use zfs diff"); break; case ZFS_DELEG_NOTE_HOLD: str = gettext("Allows adding a user hold to a snapshot"); break; case ZFS_DELEG_NOTE_MOUNT: str = gettext("Allows mount/umount of ZFS datasets"); break; case ZFS_DELEG_NOTE_PROMOTE: str = gettext("Must also have the 'mount'\n\t\t\t\tand" " 'promote' ability in the origin file system"); break; case ZFS_DELEG_NOTE_RECEIVE: str = gettext("Must also have the 'mount' and 'create'" " ability"); break; case ZFS_DELEG_NOTE_RELEASE: str = gettext("Allows releasing a user hold which\n\t\t\t\t" "might destroy the snapshot"); break; case ZFS_DELEG_NOTE_RENAME: str = gettext("Must also have the 'mount' and 'create'" "\n\t\t\t\tability in the new parent"); break; case ZFS_DELEG_NOTE_ROLLBACK: str = gettext(""); break; case ZFS_DELEG_NOTE_SEND: str = gettext(""); break; case ZFS_DELEG_NOTE_SHARE: str = gettext("Allows sharing file systems over NFS or SMB" "\n\t\t\t\tprotocols"); break; case ZFS_DELEG_NOTE_SNAPSHOT: str = gettext(""); break; case ZFS_DELEG_NOTE_LOAD_KEY: str = gettext("Allows loading or unloading an encryption key"); break; case ZFS_DELEG_NOTE_CHANGE_KEY: str = gettext("Allows changing or adding an encryption key"); break; /* * case ZFS_DELEG_NOTE_VSCAN: * str = gettext(""); * break; */ /* OTHER */ case ZFS_DELEG_NOTE_GROUPQUOTA: str = gettext("Allows accessing any groupquota@... property"); break; case ZFS_DELEG_NOTE_GROUPUSED: str = gettext("Allows reading any groupused@... property"); break; case ZFS_DELEG_NOTE_USERPROP: str = gettext("Allows changing any user property"); break; case ZFS_DELEG_NOTE_USERQUOTA: str = gettext("Allows accessing any userquota@... property"); break; case ZFS_DELEG_NOTE_USERUSED: str = gettext("Allows reading any userused@... property"); break; case ZFS_DELEG_NOTE_USEROBJQUOTA: str = gettext("Allows accessing any userobjquota@... property"); break; case ZFS_DELEG_NOTE_GROUPOBJQUOTA: str = gettext("Allows accessing any \n\t\t\t\t" "groupobjquota@... property"); break; case ZFS_DELEG_NOTE_GROUPOBJUSED: str = gettext("Allows reading any groupobjused@... property"); break; case ZFS_DELEG_NOTE_USEROBJUSED: str = gettext("Allows reading any userobjused@... property"); break; case ZFS_DELEG_NOTE_PROJECTQUOTA: str = gettext("Allows accessing any projectquota@... property"); break; case ZFS_DELEG_NOTE_PROJECTOBJQUOTA: str = gettext("Allows accessing any \n\t\t\t\t" "projectobjquota@... property"); break; case ZFS_DELEG_NOTE_PROJECTUSED: str = gettext("Allows reading any projectused@... property"); break; case ZFS_DELEG_NOTE_PROJECTOBJUSED: str = gettext("Allows accessing any \n\t\t\t\t" "projectobjused@... 
property"); break; /* other */ default: str = ""; } return (str); } struct allow_opts { boolean_t local; boolean_t descend; boolean_t user; boolean_t group; boolean_t everyone; boolean_t create; boolean_t set; boolean_t recursive; /* unallow only */ boolean_t prt_usage; boolean_t prt_perms; char *who; char *perms; const char *dataset; }; static inline int prop_cmp(const void *a, const void *b) { const char *str1 = *(const char **)a; const char *str2 = *(const char **)b; return (strcmp(str1, str2)); } static void allow_usage(boolean_t un, boolean_t requested, const char *msg) { const char *opt_desc[] = { "-h", gettext("show this help message and exit"), "-l", gettext("set permission locally"), "-d", gettext("set permission for descents"), "-u", gettext("set permission for user"), "-g", gettext("set permission for group"), "-e", gettext("set permission for everyone"), "-c", gettext("set create time permission"), "-s", gettext("define permission set"), /* unallow only */ "-r", gettext("remove permissions recursively"), }; size_t unallow_size = sizeof (opt_desc) / sizeof (char *); size_t allow_size = unallow_size - 2; const char *props[ZFS_NUM_PROPS]; int i; size_t count = 0; FILE *fp = requested ? stdout : stderr; zprop_desc_t *pdtbl = zfs_prop_get_table(); const char *fmt = gettext("%-16s %-14s\t%s\n"); (void) fprintf(fp, gettext("Usage: %s\n"), get_usage(un ? HELP_UNALLOW : HELP_ALLOW)); (void) fprintf(fp, gettext("Options:\n")); for (i = 0; i < (un ? unallow_size : allow_size); i += 2) { const char *opt = opt_desc[i]; const char *optdsc = opt_desc[i + 1]; (void) fprintf(fp, gettext(" %-10s %s\n"), opt, optdsc); } (void) fprintf(fp, gettext("\nThe following permissions are " "supported:\n\n")); (void) fprintf(fp, fmt, gettext("NAME"), gettext("TYPE"), gettext("NOTES")); for (i = 0; i < ZFS_NUM_DELEG_NOTES; i++) { const char *perm_name = zfs_deleg_perm_tbl[i].z_perm; zfs_deleg_note_t perm_note = zfs_deleg_perm_tbl[i].z_note; const char *perm_type = deleg_perm_type(perm_note); const char *perm_comment = deleg_perm_comment(perm_note); (void) fprintf(fp, fmt, perm_name, perm_type, perm_comment); } for (i = 0; i < ZFS_NUM_PROPS; i++) { zprop_desc_t *pd = &pdtbl[i]; if (pd->pd_visible != B_TRUE) continue; if (pd->pd_attr == PROP_READONLY) continue; props[count++] = pd->pd_name; } props[count] = NULL; qsort(props, count, sizeof (char *), prop_cmp); for (i = 0; i < count; i++) (void) fprintf(fp, fmt, props[i], gettext("property"), ""); if (msg != NULL) (void) fprintf(fp, gettext("\nzfs: error: %s"), msg); exit(requested ? 0 : 2); } static inline const char * munge_args(int argc, char **argv, boolean_t un, size_t expected_argc, char **permsp) { if (un && argc == expected_argc - 1) *permsp = NULL; else if (argc == expected_argc) *permsp = argv[argc - 2]; else allow_usage(un, B_FALSE, gettext("wrong number of parameters\n")); return (argv[argc - 1]); } static void parse_allow_args(int argc, char **argv, boolean_t un, struct allow_opts *opts) { int uge_sum = opts->user + opts->group + opts->everyone; int csuge_sum = opts->create + opts->set + uge_sum; int ldcsuge_sum = csuge_sum + opts->local + opts->descend; int all_sum = un ? 
ldcsuge_sum + opts->recursive : ldcsuge_sum; if (uge_sum > 1) allow_usage(un, B_FALSE, gettext("-u, -g, and -e are mutually exclusive\n")); if (opts->prt_usage) { if (argc == 0 && all_sum == 0) allow_usage(un, B_TRUE, NULL); else usage(B_FALSE); } if (opts->set) { if (csuge_sum > 1) allow_usage(un, B_FALSE, gettext("invalid options combined with -s\n")); opts->dataset = munge_args(argc, argv, un, 3, &opts->perms); if (argv[0][0] != '@') allow_usage(un, B_FALSE, gettext("invalid set name: missing '@' prefix\n")); opts->who = argv[0]; } else if (opts->create) { if (ldcsuge_sum > 1) allow_usage(un, B_FALSE, gettext("invalid options combined with -c\n")); opts->dataset = munge_args(argc, argv, un, 2, &opts->perms); } else if (opts->everyone) { if (csuge_sum > 1) allow_usage(un, B_FALSE, gettext("invalid options combined with -e\n")); opts->dataset = munge_args(argc, argv, un, 2, &opts->perms); } else if (uge_sum == 0 && argc > 0 && strcmp(argv[0], "everyone") == 0) { opts->everyone = B_TRUE; argc--; argv++; opts->dataset = munge_args(argc, argv, un, 2, &opts->perms); } else if (argc == 1 && !un) { opts->prt_perms = B_TRUE; opts->dataset = argv[argc-1]; } else { opts->dataset = munge_args(argc, argv, un, 3, &opts->perms); opts->who = argv[0]; } if (!opts->local && !opts->descend) { opts->local = B_TRUE; opts->descend = B_TRUE; } } static void store_allow_perm(zfs_deleg_who_type_t type, boolean_t local, boolean_t descend, const char *who, char *perms, nvlist_t *top_nvl) { int i; char ld[2] = { '\0', '\0' }; char who_buf[MAXNAMELEN + 32]; char base_type = '\0'; char set_type = '\0'; nvlist_t *base_nvl = NULL; nvlist_t *set_nvl = NULL; nvlist_t *nvl; if (nvlist_alloc(&base_nvl, NV_UNIQUE_NAME, 0) != 0) nomem(); if (nvlist_alloc(&set_nvl, NV_UNIQUE_NAME, 0) != 0) nomem(); switch (type) { case ZFS_DELEG_NAMED_SET_SETS: case ZFS_DELEG_NAMED_SET: set_type = ZFS_DELEG_NAMED_SET_SETS; base_type = ZFS_DELEG_NAMED_SET; ld[0] = ZFS_DELEG_NA; break; case ZFS_DELEG_CREATE_SETS: case ZFS_DELEG_CREATE: set_type = ZFS_DELEG_CREATE_SETS; base_type = ZFS_DELEG_CREATE; ld[0] = ZFS_DELEG_NA; break; case ZFS_DELEG_USER_SETS: case ZFS_DELEG_USER: set_type = ZFS_DELEG_USER_SETS; base_type = ZFS_DELEG_USER; if (local) ld[0] = ZFS_DELEG_LOCAL; if (descend) ld[1] = ZFS_DELEG_DESCENDENT; break; case ZFS_DELEG_GROUP_SETS: case ZFS_DELEG_GROUP: set_type = ZFS_DELEG_GROUP_SETS; base_type = ZFS_DELEG_GROUP; if (local) ld[0] = ZFS_DELEG_LOCAL; if (descend) ld[1] = ZFS_DELEG_DESCENDENT; break; case ZFS_DELEG_EVERYONE_SETS: case ZFS_DELEG_EVERYONE: set_type = ZFS_DELEG_EVERYONE_SETS; base_type = ZFS_DELEG_EVERYONE; if (local) ld[0] = ZFS_DELEG_LOCAL; if (descend) ld[1] = ZFS_DELEG_DESCENDENT; break; default: assert(set_type != '\0' && base_type != '\0'); } if (perms != NULL) { char *curr = perms; char *end = curr + strlen(perms); while (curr < end) { char *delim = strchr(curr, ','); if (delim == NULL) delim = end; else *delim = '\0'; if (curr[0] == '@') nvl = set_nvl; else nvl = base_nvl; (void) nvlist_add_boolean(nvl, curr); if (delim != end) *delim = ','; curr = delim + 1; } for (i = 0; i < 2; i++) { char locality = ld[i]; if (locality == 0) continue; if (!nvlist_empty(base_nvl)) { if (who != NULL) (void) snprintf(who_buf, sizeof (who_buf), "%c%c$%s", base_type, locality, who); else (void) snprintf(who_buf, sizeof (who_buf), "%c%c$", base_type, locality); (void) nvlist_add_nvlist(top_nvl, who_buf, base_nvl); } if (!nvlist_empty(set_nvl)) { if (who != NULL) (void) snprintf(who_buf, sizeof (who_buf), "%c%c$%s", set_type, 
locality, who); else (void) snprintf(who_buf, sizeof (who_buf), "%c%c$", set_type, locality); (void) nvlist_add_nvlist(top_nvl, who_buf, set_nvl); } } } else { for (i = 0; i < 2; i++) { char locality = ld[i]; if (locality == 0) continue; if (who != NULL) (void) snprintf(who_buf, sizeof (who_buf), "%c%c$%s", base_type, locality, who); else (void) snprintf(who_buf, sizeof (who_buf), "%c%c$", base_type, locality); (void) nvlist_add_boolean(top_nvl, who_buf); if (who != NULL) (void) snprintf(who_buf, sizeof (who_buf), "%c%c$%s", set_type, locality, who); else (void) snprintf(who_buf, sizeof (who_buf), "%c%c$", set_type, locality); (void) nvlist_add_boolean(top_nvl, who_buf); } } } static int construct_fsacl_list(boolean_t un, struct allow_opts *opts, nvlist_t **nvlp) { if (nvlist_alloc(nvlp, NV_UNIQUE_NAME, 0) != 0) nomem(); if (opts->set) { store_allow_perm(ZFS_DELEG_NAMED_SET, opts->local, opts->descend, opts->who, opts->perms, *nvlp); } else if (opts->create) { store_allow_perm(ZFS_DELEG_CREATE, opts->local, opts->descend, NULL, opts->perms, *nvlp); } else if (opts->everyone) { store_allow_perm(ZFS_DELEG_EVERYONE, opts->local, opts->descend, NULL, opts->perms, *nvlp); } else { char *curr = opts->who; char *end = curr + strlen(curr); while (curr < end) { const char *who; zfs_deleg_who_type_t who_type = ZFS_DELEG_WHO_UNKNOWN; char *endch; char *delim = strchr(curr, ','); char errbuf[256]; char id[64]; struct passwd *p = NULL; struct group *g = NULL; uid_t rid; if (delim == NULL) delim = end; else *delim = '\0'; rid = (uid_t)strtol(curr, &endch, 0); if (opts->user) { who_type = ZFS_DELEG_USER; if (*endch != '\0') p = getpwnam(curr); else p = getpwuid(rid); if (p != NULL) rid = p->pw_uid; else if (*endch != '\0') { (void) snprintf(errbuf, 256, gettext( "invalid user %s\n"), curr); allow_usage(un, B_TRUE, errbuf); } } else if (opts->group) { who_type = ZFS_DELEG_GROUP; if (*endch != '\0') g = getgrnam(curr); else g = getgrgid(rid); if (g != NULL) rid = g->gr_gid; else if (*endch != '\0') { (void) snprintf(errbuf, 256, gettext( "invalid group %s\n"), curr); allow_usage(un, B_TRUE, errbuf); } } else { if (*endch != '\0') { p = getpwnam(curr); } else { p = getpwuid(rid); } if (p == NULL) { if (*endch != '\0') { g = getgrnam(curr); } else { g = getgrgid(rid); } } if (p != NULL) { who_type = ZFS_DELEG_USER; rid = p->pw_uid; } else if (g != NULL) { who_type = ZFS_DELEG_GROUP; rid = g->gr_gid; } else { (void) snprintf(errbuf, 256, gettext( "invalid user/group %s\n"), curr); allow_usage(un, B_TRUE, errbuf); } } (void) sprintf(id, "%u", rid); who = id; store_allow_perm(who_type, opts->local, opts->descend, who, opts->perms, *nvlp); curr = delim + 1; } } return (0); } static void print_set_creat_perms(uu_avl_t *who_avl) { const char *sc_title[] = { gettext("Permission sets:\n"), gettext("Create time permissions:\n"), NULL }; who_perm_node_t *who_node = NULL; int prev_weight = -1; for (who_node = uu_avl_first(who_avl); who_node != NULL; who_node = uu_avl_next(who_avl, who_node)) { uu_avl_t *avl = who_node->who_perm.who_deleg_perm_avl; zfs_deleg_who_type_t who_type = who_node->who_perm.who_type; const char *who_name = who_node->who_perm.who_name; int weight = who_type2weight(who_type); boolean_t first = B_TRUE; deleg_perm_node_t *deleg_node; if (prev_weight != weight) { (void) printf("%s", sc_title[weight]); prev_weight = weight; } if (who_name == NULL || strnlen(who_name, 1) == 0) (void) printf("\t"); else (void) printf("\t%s ", who_name); for (deleg_node = uu_avl_first(avl); deleg_node != NULL; deleg_node 
= uu_avl_next(avl, deleg_node)) { if (first) { (void) printf("%s", deleg_node->dpn_perm.dp_name); first = B_FALSE; } else (void) printf(",%s", deleg_node->dpn_perm.dp_name); } (void) printf("\n"); } } static void print_uge_deleg_perms(uu_avl_t *who_avl, boolean_t local, boolean_t descend, const char *title) { who_perm_node_t *who_node = NULL; boolean_t prt_title = B_TRUE; uu_avl_walk_t *walk; if ((walk = uu_avl_walk_start(who_avl, UU_WALK_ROBUST)) == NULL) nomem(); while ((who_node = uu_avl_walk_next(walk)) != NULL) { const char *who_name = who_node->who_perm.who_name; const char *nice_who_name = who_node->who_perm.who_ug_name; uu_avl_t *avl = who_node->who_perm.who_deleg_perm_avl; zfs_deleg_who_type_t who_type = who_node->who_perm.who_type; char delim = ' '; deleg_perm_node_t *deleg_node; boolean_t prt_who = B_TRUE; for (deleg_node = uu_avl_first(avl); deleg_node != NULL; deleg_node = uu_avl_next(avl, deleg_node)) { if (local != deleg_node->dpn_perm.dp_local || descend != deleg_node->dpn_perm.dp_descend) continue; if (prt_who) { const char *who = NULL; if (prt_title) { prt_title = B_FALSE; (void) printf("%s", title); } switch (who_type) { case ZFS_DELEG_USER_SETS: case ZFS_DELEG_USER: who = gettext("user"); if (nice_who_name) who_name = nice_who_name; break; case ZFS_DELEG_GROUP_SETS: case ZFS_DELEG_GROUP: who = gettext("group"); if (nice_who_name) who_name = nice_who_name; break; case ZFS_DELEG_EVERYONE_SETS: case ZFS_DELEG_EVERYONE: who = gettext("everyone"); who_name = NULL; break; default: assert(who != NULL); } prt_who = B_FALSE; if (who_name == NULL) (void) printf("\t%s", who); else (void) printf("\t%s %s", who, who_name); } (void) printf("%c%s", delim, deleg_node->dpn_perm.dp_name); delim = ','; } if (!prt_who) (void) printf("\n"); } uu_avl_walk_end(walk); } static void print_fs_perms(fs_perm_set_t *fspset) { fs_perm_node_t *node = NULL; char buf[MAXNAMELEN + 32]; const char *dsname = buf; for (node = uu_list_first(fspset->fsps_list); node != NULL; node = uu_list_next(fspset->fsps_list, node)) { uu_avl_t *sc_avl = node->fspn_fsperm.fsp_sc_avl; uu_avl_t *uge_avl = node->fspn_fsperm.fsp_uge_avl; int left = 0; (void) snprintf(buf, sizeof (buf), gettext("---- Permissions on %s "), node->fspn_fsperm.fsp_name); (void) printf("%s", dsname); left = 70 - strlen(buf); while (left-- > 0) (void) printf("-"); (void) printf("\n"); print_set_creat_perms(sc_avl); print_uge_deleg_perms(uge_avl, B_TRUE, B_FALSE, gettext("Local permissions:\n")); print_uge_deleg_perms(uge_avl, B_FALSE, B_TRUE, gettext("Descendent permissions:\n")); print_uge_deleg_perms(uge_avl, B_TRUE, B_TRUE, gettext("Local+Descendent permissions:\n")); } } static fs_perm_set_t fs_perm_set = { NULL, NULL, NULL, NULL }; struct deleg_perms { boolean_t un; nvlist_t *nvl; }; static int set_deleg_perms(zfs_handle_t *zhp, void *data) { struct deleg_perms *perms = (struct deleg_perms *)data; zfs_type_t zfs_type = zfs_get_type(zhp); if (zfs_type != ZFS_TYPE_FILESYSTEM && zfs_type != ZFS_TYPE_VOLUME) return (0); return (zfs_set_fsacl(zhp, perms->un, perms->nvl)); } static int zfs_do_allow_unallow_impl(int argc, char **argv, boolean_t un) { zfs_handle_t *zhp; nvlist_t *perm_nvl = NULL; nvlist_t *update_perm_nvl = NULL; int error = 1; int c; struct allow_opts opts = { 0 }; const char *optstr = un ? 
"ldugecsrh" : "ldugecsh"; /* check opts */ while ((c = getopt(argc, argv, optstr)) != -1) { switch (c) { case 'l': opts.local = B_TRUE; break; case 'd': opts.descend = B_TRUE; break; case 'u': opts.user = B_TRUE; break; case 'g': opts.group = B_TRUE; break; case 'e': opts.everyone = B_TRUE; break; case 's': opts.set = B_TRUE; break; case 'c': opts.create = B_TRUE; break; case 'r': opts.recursive = B_TRUE; break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case 'h': opts.prt_usage = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* check arguments */ parse_allow_args(argc, argv, un, &opts); /* try to open the dataset */ if ((zhp = zfs_open(g_zfs, opts.dataset, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME)) == NULL) { (void) fprintf(stderr, "Failed to open dataset: %s\n", opts.dataset); return (-1); } if (zfs_get_fsacl(zhp, &perm_nvl) != 0) goto cleanup2; fs_perm_set_init(&fs_perm_set); if (parse_fs_perm_set(&fs_perm_set, perm_nvl) != 0) { (void) fprintf(stderr, "Failed to parse fsacl permissions\n"); goto cleanup1; } if (opts.prt_perms) print_fs_perms(&fs_perm_set); else { (void) construct_fsacl_list(un, &opts, &update_perm_nvl); if (zfs_set_fsacl(zhp, un, update_perm_nvl) != 0) goto cleanup0; if (un && opts.recursive) { struct deleg_perms data = { un, update_perm_nvl }; if (zfs_iter_filesystems(zhp, set_deleg_perms, &data) != 0) goto cleanup0; } } error = 0; cleanup0: nvlist_free(perm_nvl); nvlist_free(update_perm_nvl); cleanup1: fs_perm_set_fini(&fs_perm_set); cleanup2: zfs_close(zhp); return (error); } static int zfs_do_allow(int argc, char **argv) { return (zfs_do_allow_unallow_impl(argc, argv, B_FALSE)); } static int zfs_do_unallow(int argc, char **argv) { return (zfs_do_allow_unallow_impl(argc, argv, B_TRUE)); } static int zfs_do_hold_rele_impl(int argc, char **argv, boolean_t holding) { int errors = 0; int i; const char *tag; boolean_t recursive = B_FALSE; const char *opts = holding ? "rt" : "r"; int c; /* check options */ while ((c = getopt(argc, argv, opts)) != -1) { switch (c) { case 'r': recursive = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* check number of arguments */ if (argc < 2) usage(B_FALSE); tag = argv[0]; --argc; ++argv; if (holding && tag[0] == '.') { /* tags starting with '.' are reserved for libzfs */ (void) fprintf(stderr, gettext("tag may not start with '.'\n")); usage(B_FALSE); } for (i = 0; i < argc; ++i) { zfs_handle_t *zhp; char parent[ZFS_MAX_DATASET_NAME_LEN]; const char *delim; char *path = argv[i]; delim = strchr(path, '@'); if (delim == NULL) { (void) fprintf(stderr, gettext("'%s' is not a snapshot\n"), path); ++errors; continue; } (void) strncpy(parent, path, delim - path); parent[delim - path] = '\0'; zhp = zfs_open(g_zfs, parent, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); if (zhp == NULL) { ++errors; continue; } if (holding) { if (zfs_hold(zhp, delim+1, tag, recursive, -1) != 0) ++errors; } else { if (zfs_release(zhp, delim+1, tag, recursive) != 0) ++errors; } zfs_close(zhp); } return (errors != 0); } /* * zfs hold [-r] [-t] ... * * -r Recursively hold * * Apply a user-hold with the given tag to the list of snapshots. */ static int zfs_do_hold(int argc, char **argv) { return (zfs_do_hold_rele_impl(argc, argv, B_TRUE)); } /* * zfs release [-r] ... 
* * -r Recursively release * * Release a user-hold with the given tag from the list of snapshots. */ static int zfs_do_release(int argc, char **argv) { return (zfs_do_hold_rele_impl(argc, argv, B_FALSE)); } typedef struct holds_cbdata { boolean_t cb_recursive; const char *cb_snapname; nvlist_t **cb_nvlp; size_t cb_max_namelen; size_t cb_max_taglen; } holds_cbdata_t; #define STRFTIME_FMT_STR "%a %b %e %H:%M %Y" #define DATETIME_BUF_LEN (32) /* * */ static void print_holds(boolean_t scripted, int nwidth, int tagwidth, nvlist_t *nvl) { int i; nvpair_t *nvp = NULL; char *hdr_cols[] = { "NAME", "TAG", "TIMESTAMP" }; const char *col; if (!scripted) { for (i = 0; i < 3; i++) { col = gettext(hdr_cols[i]); if (i < 2) (void) printf("%-*s ", i ? tagwidth : nwidth, col); else (void) printf("%s\n", col); } } while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { char *zname = nvpair_name(nvp); nvlist_t *nvl2; nvpair_t *nvp2 = NULL; (void) nvpair_value_nvlist(nvp, &nvl2); while ((nvp2 = nvlist_next_nvpair(nvl2, nvp2)) != NULL) { char tsbuf[DATETIME_BUF_LEN]; char *tagname = nvpair_name(nvp2); uint64_t val = 0; time_t time; struct tm t; (void) nvpair_value_uint64(nvp2, &val); time = (time_t)val; (void) localtime_r(&time, &t); (void) strftime(tsbuf, DATETIME_BUF_LEN, gettext(STRFTIME_FMT_STR), &t); if (scripted) { (void) printf("%s\t%s\t%s\n", zname, tagname, tsbuf); } else { (void) printf("%-*s %-*s %s\n", nwidth, zname, tagwidth, tagname, tsbuf); } } } } /* * Generic callback function to list a dataset or snapshot. */ static int holds_callback(zfs_handle_t *zhp, void *data) { holds_cbdata_t *cbp = data; nvlist_t *top_nvl = *cbp->cb_nvlp; nvlist_t *nvl = NULL; nvpair_t *nvp = NULL; const char *zname = zfs_get_name(zhp); size_t znamelen = strlen(zname); if (cbp->cb_recursive) { const char *snapname; char *delim = strchr(zname, '@'); if (delim == NULL) return (0); snapname = delim + 1; if (strcmp(cbp->cb_snapname, snapname)) return (0); } if (zfs_get_holds(zhp, &nvl) != 0) return (-1); if (znamelen > cbp->cb_max_namelen) cbp->cb_max_namelen = znamelen; while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { const char *tag = nvpair_name(nvp); size_t taglen = strlen(tag); if (taglen > cbp->cb_max_taglen) cbp->cb_max_taglen = taglen; } return (nvlist_add_nvlist(top_nvl, zname, nvl)); } /* * zfs holds [-rH] ... * * -r Lists holds that are set on the named snapshots recursively. * -H Scripted mode; elide headers and separate columns by tabs. 
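 *
 * Hold data for each snapshot argument is gathered by holds_callback()
 * via zfs_for_each(), merged into one nvlist, and printed by
 * print_holds() using the name/tag column widths recorded in
 * holds_cbdata_t.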
*/ static int zfs_do_holds(int argc, char **argv) { int errors = 0; int c; int i; boolean_t scripted = B_FALSE; boolean_t recursive = B_FALSE; const char *opts = "rH"; nvlist_t *nvl; int types = ZFS_TYPE_SNAPSHOT; holds_cbdata_t cb = { 0 }; int limit = 0; int ret = 0; int flags = 0; /* check options */ while ((c = getopt(argc, argv, opts)) != -1) { switch (c) { case 'r': recursive = B_TRUE; break; case 'H': scripted = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } if (recursive) { types |= ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME; flags |= ZFS_ITER_RECURSE; } argc -= optind; argv += optind; /* check number of arguments */ if (argc < 1) usage(B_FALSE); if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) nomem(); for (i = 0; i < argc; ++i) { char *snapshot = argv[i]; const char *delim; const char *snapname; delim = strchr(snapshot, '@'); if (delim == NULL) { (void) fprintf(stderr, gettext("'%s' is not a snapshot\n"), snapshot); ++errors; continue; } snapname = delim + 1; if (recursive) snapshot[delim - snapshot] = '\0'; cb.cb_recursive = recursive; cb.cb_snapname = snapname; cb.cb_nvlp = &nvl; /* * 1. collect holds data, set format options */ ret = zfs_for_each(argc, argv, flags, types, NULL, NULL, limit, holds_callback, &cb); if (ret != 0) ++errors; } /* * 2. print holds data */ print_holds(scripted, cb.cb_max_namelen, cb.cb_max_taglen, nvl); if (nvlist_empty(nvl)) (void) fprintf(stderr, gettext("no datasets available\n")); nvlist_free(nvl); return (0 != errors); } #define CHECK_SPINNER 30 #define SPINNER_TIME 3 /* seconds */ #define MOUNT_TIME 1 /* seconds */ typedef struct get_all_state { boolean_t ga_verbose; get_all_cb_t *ga_cbp; } get_all_state_t; static int get_one_dataset(zfs_handle_t *zhp, void *data) { static char *spin[] = { "-", "\\", "|", "/" }; static int spinval = 0; static int spincheck = 0; static time_t last_spin_time = (time_t)0; get_all_state_t *state = data; zfs_type_t type = zfs_get_type(zhp); if (state->ga_verbose) { if (--spincheck < 0) { time_t now = time(NULL); if (last_spin_time + SPINNER_TIME < now) { update_progress(spin[spinval++ % 4]); last_spin_time = now; } spincheck = CHECK_SPINNER; } } /* * Iterate over any nested datasets. */ if (zfs_iter_filesystems(zhp, get_one_dataset, data) != 0) { zfs_close(zhp); return (1); } /* * Skip any datasets whose type does not match. */ if ((type & ZFS_TYPE_FILESYSTEM) == 0) { zfs_close(zhp); return (0); } libzfs_add_handle(state->ga_cbp, zhp); assert(state->ga_cbp->cb_used <= state->ga_cbp->cb_alloc); return (0); } static void get_all_datasets(get_all_cb_t *cbp, boolean_t verbose) { get_all_state_t state = { .ga_verbose = verbose, .ga_cbp = cbp }; if (verbose) set_progress_header(gettext("Reading ZFS config")); (void) zfs_iter_root(g_zfs, get_one_dataset, &state); if (verbose) finish_progress(gettext("done.")); } /* * Generic callback for sharing or mounting filesystems. Because the code is so * similar, we have a common function with an extra parameter to determine which * mode we are using. 
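 *
 * share_mount_one() handles a single dataset; share_mount() parses the
 * command line and either iterates over every filesystem (-a) through
 * get_all_datasets() and zfs_foreach_mountpoint(), or operates on the
 * single dataset named as an argument.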
*/ typedef enum { OP_SHARE, OP_MOUNT } share_mount_op_t; typedef struct share_mount_state { share_mount_op_t sm_op; boolean_t sm_verbose; int sm_flags; char *sm_options; char *sm_proto; /* only valid for OP_SHARE */ pthread_mutex_t sm_lock; /* protects the remaining fields */ uint_t sm_total; /* number of filesystems to process */ uint_t sm_done; /* number of filesystems processed */ int sm_status; /* -1 if any of the share/mount operations failed */ } share_mount_state_t; /* * Share or mount a dataset. */ static int share_mount_one(zfs_handle_t *zhp, int op, int flags, char *protocol, boolean_t explicit, const char *options) { char mountpoint[ZFS_MAXPROPLEN]; char shareopts[ZFS_MAXPROPLEN]; char smbshareopts[ZFS_MAXPROPLEN]; const char *cmdname = op == OP_SHARE ? "share" : "mount"; struct mnttab mnt; uint64_t zoned, canmount; boolean_t shared_nfs, shared_smb; assert(zfs_get_type(zhp) & ZFS_TYPE_FILESYSTEM); /* * Check to make sure we can mount/share this dataset. If we * are in the global zone and the filesystem is exported to a * local zone, or if we are in a local zone and the * filesystem is not exported, then it is an error. */ zoned = zfs_prop_get_int(zhp, ZFS_PROP_ZONED); if (zoned && getzoneid() == GLOBAL_ZONEID) { if (!explicit) return (0); (void) fprintf(stderr, gettext("cannot %s '%s': " "dataset is exported to a local zone\n"), cmdname, zfs_get_name(zhp)); return (1); } else if (!zoned && getzoneid() != GLOBAL_ZONEID) { if (!explicit) return (0); (void) fprintf(stderr, gettext("cannot %s '%s': " "permission denied\n"), cmdname, zfs_get_name(zhp)); return (1); } /* * Ignore any filesystems which don't apply to us. This * includes those with a legacy mountpoint, or those with * legacy share options. */ verify(zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, mountpoint, sizeof (mountpoint), NULL, NULL, 0, B_FALSE) == 0); verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS, shareopts, sizeof (shareopts), NULL, NULL, 0, B_FALSE) == 0); verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB, smbshareopts, sizeof (smbshareopts), NULL, NULL, 0, B_FALSE) == 0); if (op == OP_SHARE && strcmp(shareopts, "off") == 0 && strcmp(smbshareopts, "off") == 0) { if (!explicit) return (0); (void) fprintf(stderr, gettext("cannot share '%s': " "legacy share\n"), zfs_get_name(zhp)); (void) fprintf(stderr, gettext("use exports(5) or " "smb.conf(5) to share this filesystem, or set " "the sharenfs or sharesmb property\n")); return (1); } /* * We cannot share or mount legacy filesystems. If the * shareopts is non-legacy but the mountpoint is legacy, we * treat it as a legacy share. 
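 *
 * The checks that follow likewise skip (or, for an explicit request,
 * report an error for) datasets with mountpoint=none, canmount=off,
 * an unloaded encryption key, partially received resumable state, or
 * a redacted receive.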
*/ if (strcmp(mountpoint, "legacy") == 0) { if (!explicit) return (0); (void) fprintf(stderr, gettext("cannot %s '%s': " "legacy mountpoint\n"), cmdname, zfs_get_name(zhp)); (void) fprintf(stderr, gettext("use %s(8) to " "%s this filesystem\n"), cmdname, cmdname); return (1); } if (strcmp(mountpoint, "none") == 0) { if (!explicit) return (0); (void) fprintf(stderr, gettext("cannot %s '%s': no " "mountpoint set\n"), cmdname, zfs_get_name(zhp)); return (1); } /* * canmount explicit outcome * on no pass through * on yes pass through * off no return 0 * off yes display error, return 1 * noauto no return 0 * noauto yes pass through */ canmount = zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT); if (canmount == ZFS_CANMOUNT_OFF) { if (!explicit) return (0); (void) fprintf(stderr, gettext("cannot %s '%s': " "'canmount' property is set to 'off'\n"), cmdname, zfs_get_name(zhp)); return (1); } else if (canmount == ZFS_CANMOUNT_NOAUTO && !explicit) { /* * When performing a 'zfs mount -a', we skip any mounts for * datasets that have 'noauto' set. Sharing a dataset with * 'noauto' set is only allowed if it's mounted. */ if (op == OP_MOUNT) return (0); if (op == OP_SHARE && !zfs_is_mounted(zhp, NULL)) { /* also purge it from existing exports */ zfs_unshareall_bypath(zhp, mountpoint); return (0); } } /* * If this filesystem is encrypted and does not have * a loaded key, we can not mount it. */ if ((flags & MS_CRYPT) == 0 && zfs_prop_get_int(zhp, ZFS_PROP_ENCRYPTION) != ZIO_CRYPT_OFF && zfs_prop_get_int(zhp, ZFS_PROP_KEYSTATUS) == ZFS_KEYSTATUS_UNAVAILABLE) { if (!explicit) return (0); (void) fprintf(stderr, gettext("cannot %s '%s': " "encryption key not loaded\n"), cmdname, zfs_get_name(zhp)); return (1); } /* * If this filesystem is inconsistent and has a receive resume * token, we can not mount it. */ if (zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) && zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN, NULL, 0, NULL, NULL, 0, B_TRUE) == 0) { if (!explicit) return (0); (void) fprintf(stderr, gettext("cannot %s '%s': " "Contains partially-completed state from " "\"zfs receive -s\", which can be resumed with " "\"zfs send -t\"\n"), cmdname, zfs_get_name(zhp)); return (1); } if (zfs_prop_get_int(zhp, ZFS_PROP_REDACTED) && !(flags & MS_FORCE)) { if (!explicit) return (0); (void) fprintf(stderr, gettext("cannot %s '%s': " "Dataset is not complete, was created by receiving " "a redacted zfs send stream.\n"), cmdname, zfs_get_name(zhp)); return (1); } /* * At this point, we have verified that the mountpoint and/or * shareopts are appropriate for auto management. If the * filesystem is already mounted or shared, return (failing * for explicit requests); otherwise mount or share the * filesystem. 
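 *
 * For OP_SHARE that means refusing a filesystem that is already shared
 * over the requested protocol, mounting it first if necessary, and then
 * sharing it over NFS and/or SMB; for OP_MOUNT it means refusing an
 * already mounted filesystem (unless remounting) and calling
 * zfs_mount().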
*/ switch (op) { case OP_SHARE: shared_nfs = zfs_is_shared_nfs(zhp, NULL); shared_smb = zfs_is_shared_smb(zhp, NULL); if ((shared_nfs && shared_smb) || (shared_nfs && strcmp(shareopts, "on") == 0 && strcmp(smbshareopts, "off") == 0) || (shared_smb && strcmp(smbshareopts, "on") == 0 && strcmp(shareopts, "off") == 0)) { if (!explicit) return (0); (void) fprintf(stderr, gettext("cannot share " "'%s': filesystem already shared\n"), zfs_get_name(zhp)); return (1); } if (!zfs_is_mounted(zhp, NULL) && zfs_mount(zhp, NULL, flags) != 0) return (1); if (protocol == NULL) { if (zfs_shareall(zhp) != 0) return (1); } else if (strcmp(protocol, "nfs") == 0) { if (zfs_share_nfs(zhp)) return (1); } else if (strcmp(protocol, "smb") == 0) { if (zfs_share_smb(zhp)) return (1); } else { (void) fprintf(stderr, gettext("cannot share " "'%s': invalid share type '%s' " "specified\n"), zfs_get_name(zhp), protocol); return (1); } break; case OP_MOUNT: if (options == NULL) mnt.mnt_mntopts = ""; else mnt.mnt_mntopts = (char *)options; if (!hasmntopt(&mnt, MNTOPT_REMOUNT) && zfs_is_mounted(zhp, NULL)) { if (!explicit) return (0); (void) fprintf(stderr, gettext("cannot mount " "'%s': filesystem already mounted\n"), zfs_get_name(zhp)); return (1); } if (zfs_mount(zhp, options, flags) != 0) return (1); break; } return (0); } /* * Reports progress in the form "(current/total)". Not thread-safe. */ static void report_mount_progress(int current, int total) { static time_t last_progress_time = 0; time_t now = time(NULL); char info[32]; - /* report 1..n instead of 0..n-1 */ - ++current; - /* display header if we're here for the first time */ if (current == 1) { set_progress_header(gettext("Mounting ZFS filesystems")); } else if (current != total && last_progress_time + MOUNT_TIME >= now) { /* too soon to report again */ return; } last_progress_time = now; (void) sprintf(info, "(%d/%d)", current, total); if (current == total) finish_progress(info); else update_progress(info); } /* * zfs_foreach_mountpoint() callback that mounts or shares one filesystem and * updates the progress meter. */ static int share_mount_one_cb(zfs_handle_t *zhp, void *arg) { share_mount_state_t *sms = arg; int ret; ret = share_mount_one(zhp, sms->sm_op, sms->sm_flags, sms->sm_proto, B_FALSE, sms->sm_options); pthread_mutex_lock(&sms->sm_lock); if (ret != 0) sms->sm_status = ret; sms->sm_done++; if (sms->sm_verbose) report_mount_progress(sms->sm_done, sms->sm_total); pthread_mutex_unlock(&sms->sm_lock); return (ret); } static void append_options(char *mntopts, char *newopts) { int len = strlen(mntopts); /* original length plus new string to append plus 1 for the comma */ if (len + 1 + strlen(newopts) >= MNT_LINE_MAX) { (void) fprintf(stderr, gettext("the opts argument for " "'%s' option is too long (more than %d chars)\n"), "-o", MNT_LINE_MAX); usage(B_FALSE); } if (*mntopts) mntopts[len++] = ','; (void) strcpy(&mntopts[len], newopts); } static int share_mount(int op, int argc, char **argv) { int do_all = 0; boolean_t verbose = B_FALSE; int c, ret = 0; char *options = NULL; int flags = 0; /* check options */ while ((c = getopt(argc, argv, op == OP_MOUNT ? 
":alvo:Of" : "al")) != -1) { switch (c) { case 'a': do_all = 1; break; case 'v': verbose = B_TRUE; break; case 'l': flags |= MS_CRYPT; break; case 'o': if (*optarg == '\0') { (void) fprintf(stderr, gettext("empty mount " "options (-o) specified\n")); usage(B_FALSE); } if (options == NULL) options = safe_malloc(MNT_LINE_MAX + 1); /* option validation is done later */ append_options(options, optarg); break; case 'O': flags |= MS_OVERLAY; break; case 'f': flags |= MS_FORCE; break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; /* check number of arguments */ if (do_all) { char *protocol = NULL; if (op == OP_SHARE && argc > 0) { if (strcmp(argv[0], "nfs") != 0 && strcmp(argv[0], "smb") != 0) { (void) fprintf(stderr, gettext("share type " "must be 'nfs' or 'smb'\n")); usage(B_FALSE); } protocol = argv[0]; argc--; argv++; } if (argc != 0) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } start_progress_timer(); get_all_cb_t cb = { 0 }; get_all_datasets(&cb, verbose); if (cb.cb_used == 0) { if (options != NULL) free(options); return (0); } share_mount_state_t share_mount_state = { 0 }; share_mount_state.sm_op = op; share_mount_state.sm_verbose = verbose; share_mount_state.sm_flags = flags; share_mount_state.sm_options = options; share_mount_state.sm_proto = protocol; share_mount_state.sm_total = cb.cb_used; pthread_mutex_init(&share_mount_state.sm_lock, NULL); /* * libshare isn't mt-safe, so only do the operation in parallel * if we're mounting. Additionally, the key-loading option must * be serialized so that we can prompt the user for their keys * in a consistent manner. */ zfs_foreach_mountpoint(g_zfs, cb.cb_handles, cb.cb_used, share_mount_one_cb, &share_mount_state, op == OP_MOUNT && !(flags & MS_CRYPT)); zfs_commit_all_shares(); ret = share_mount_state.sm_status; for (int i = 0; i < cb.cb_used; i++) zfs_close(cb.cb_handles[i]); free(cb.cb_handles); } else if (argc == 0) { struct mnttab entry; if ((op == OP_SHARE) || (options != NULL)) { (void) fprintf(stderr, gettext("missing filesystem " "argument (specify -a for all)\n")); usage(B_FALSE); } /* * When mount is given no arguments, go through * /proc/self/mounts and display any active ZFS mounts. * We hide any snapshots, since they are controlled * automatically. */ /* Reopen MNTTAB to prevent reading stale data from open file */ if (freopen(MNTTAB, "r", mnttab_file) == NULL) { if (options != NULL) free(options); return (ENOENT); } while (getmntent(mnttab_file, &entry) == 0) { if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0 || strchr(entry.mnt_special, '@') != NULL) continue; (void) printf("%-30s %s\n", entry.mnt_special, entry.mnt_mountp); } } else { zfs_handle_t *zhp; if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_FILESYSTEM)) == NULL) { ret = 1; } else { ret = share_mount_one(zhp, op, flags, NULL, B_TRUE, options); zfs_commit_all_shares(); zfs_close(zhp); } } if (options != NULL) free(options); return (ret); } /* * zfs mount -a [nfs] * zfs mount filesystem * * Mount all filesystems, or mount the given filesystem. */ static int zfs_do_mount(int argc, char **argv) { return (share_mount(OP_MOUNT, argc, argv)); } /* * zfs share -a [nfs | smb] * zfs share filesystem * * Share all filesystems, or share the given filesystem. 
*/ static int zfs_do_share(int argc, char **argv) { return (share_mount(OP_SHARE, argc, argv)); } typedef struct unshare_unmount_node { zfs_handle_t *un_zhp; char *un_mountp; uu_avl_node_t un_avlnode; } unshare_unmount_node_t; /* ARGSUSED */ static int unshare_unmount_compare(const void *larg, const void *rarg, void *unused) { const unshare_unmount_node_t *l = larg; const unshare_unmount_node_t *r = rarg; return (strcmp(l->un_mountp, r->un_mountp)); } /* * Convenience routine used by zfs_do_umount() and manual_unmount(). Given an * absolute path, find the entry /proc/self/mounts, verify that it's a * ZFS filesystem, and unmount it appropriately. */ static int unshare_unmount_path(int op, char *path, int flags, boolean_t is_manual) { zfs_handle_t *zhp; int ret = 0; struct stat64 statbuf; struct extmnttab entry; const char *cmdname = (op == OP_SHARE) ? "unshare" : "unmount"; ino_t path_inode; /* * Search for the given (major,minor) pair in the mount table. */ /* Reopen MNTTAB to prevent reading stale data from open file */ if (freopen(MNTTAB, "r", mnttab_file) == NULL) return (ENOENT); if (getextmntent(path, &entry, &statbuf) != 0) { if (op == OP_SHARE) { (void) fprintf(stderr, gettext("cannot %s '%s': not " "currently mounted\n"), cmdname, path); return (1); } (void) fprintf(stderr, gettext("warning: %s not in" "/proc/self/mounts\n"), path); if ((ret = umount2(path, flags)) != 0) (void) fprintf(stderr, gettext("%s: %s\n"), path, strerror(errno)); return (ret != 0); } path_inode = statbuf.st_ino; if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0) { (void) fprintf(stderr, gettext("cannot %s '%s': not a ZFS " "filesystem\n"), cmdname, path); return (1); } if ((zhp = zfs_open(g_zfs, entry.mnt_special, ZFS_TYPE_FILESYSTEM)) == NULL) return (1); ret = 1; if (stat64(entry.mnt_mountp, &statbuf) != 0) { (void) fprintf(stderr, gettext("cannot %s '%s': %s\n"), cmdname, path, strerror(errno)); goto out; } else if (statbuf.st_ino != path_inode) { (void) fprintf(stderr, gettext("cannot " "%s '%s': not a mountpoint\n"), cmdname, path); goto out; } if (op == OP_SHARE) { char nfs_mnt_prop[ZFS_MAXPROPLEN]; char smbshare_prop[ZFS_MAXPROPLEN]; verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS, nfs_mnt_prop, sizeof (nfs_mnt_prop), NULL, NULL, 0, B_FALSE) == 0); verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB, smbshare_prop, sizeof (smbshare_prop), NULL, NULL, 0, B_FALSE) == 0); if (strcmp(nfs_mnt_prop, "off") == 0 && strcmp(smbshare_prop, "off") == 0) { (void) fprintf(stderr, gettext("cannot unshare " "'%s': legacy share\n"), path); (void) fprintf(stderr, gettext("use exportfs(8) " "or smbcontrol(1) to unshare this filesystem\n")); } else if (!zfs_is_shared(zhp)) { (void) fprintf(stderr, gettext("cannot unshare '%s': " "not currently shared\n"), path); } else { ret = zfs_unshareall_bypath(zhp, path); zfs_commit_all_shares(); } } else { char mtpt_prop[ZFS_MAXPROPLEN]; verify(zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, mtpt_prop, sizeof (mtpt_prop), NULL, NULL, 0, B_FALSE) == 0); if (is_manual) { ret = zfs_unmount(zhp, NULL, flags); } else if (strcmp(mtpt_prop, "legacy") == 0) { (void) fprintf(stderr, gettext("cannot unmount " "'%s': legacy mountpoint\n"), zfs_get_name(zhp)); (void) fprintf(stderr, gettext("use umount(8) " "to unmount this filesystem\n")); } else { ret = zfs_unmountall(zhp, flags); } } out: zfs_close(zhp); return (ret != 0); } /* * Generic callback for unsharing or unmounting a filesystem. 
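 *
 * With -a, unshare_unmount() walks /proc/self/mounts, collects every
 * ZFS mount into an AVL tree ordered by mount point, and walks it in
 * reverse so the deepest mount points are handled first.  With a
 * single argument, an absolute path goes to unshare_unmount_path()
 * and a dataset name is opened and unshared or unmounted directly.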
*/ static int unshare_unmount(int op, int argc, char **argv) { int do_all = 0; int flags = 0; int ret = 0; int c; zfs_handle_t *zhp; char nfs_mnt_prop[ZFS_MAXPROPLEN]; char sharesmb[ZFS_MAXPROPLEN]; /* check options */ while ((c = getopt(argc, argv, op == OP_SHARE ? ":a" : "afu")) != -1) { switch (c) { case 'a': do_all = 1; break; case 'f': flags |= MS_FORCE; break; case 'u': flags |= MS_CRYPT; break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); usage(B_FALSE); break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (do_all) { /* * We could make use of zfs_for_each() to walk all datasets in * the system, but this would be very inefficient, especially * since we would have to linearly search /proc/self/mounts for * each one. Instead, do one pass through /proc/self/mounts * looking for zfs entries and call zfs_unmount() for each one. * * Things get a little tricky if the administrator has created * mountpoints beneath other ZFS filesystems. In this case, we * have to unmount the deepest filesystems first. To accomplish * this, we place all the mountpoints in an AVL tree sorted by * the special type (dataset name), and walk the result in * reverse to make sure to get any snapshots first. */ struct mnttab entry; uu_avl_pool_t *pool; uu_avl_t *tree = NULL; unshare_unmount_node_t *node; uu_avl_index_t idx; uu_avl_walk_t *walk; char *protocol = NULL; if (op == OP_SHARE && argc > 0) { if (strcmp(argv[0], "nfs") != 0 && strcmp(argv[0], "smb") != 0) { (void) fprintf(stderr, gettext("share type " "must be 'nfs' or 'smb'\n")); usage(B_FALSE); } protocol = argv[0]; argc--; argv++; } if (argc != 0) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } if (((pool = uu_avl_pool_create("unmount_pool", sizeof (unshare_unmount_node_t), offsetof(unshare_unmount_node_t, un_avlnode), unshare_unmount_compare, UU_DEFAULT)) == NULL) || ((tree = uu_avl_create(pool, NULL, UU_DEFAULT)) == NULL)) nomem(); /* Reopen MNTTAB to prevent reading stale data from open file */ if (freopen(MNTTAB, "r", mnttab_file) == NULL) return (ENOENT); while (getmntent(mnttab_file, &entry) == 0) { /* ignore non-ZFS entries */ if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0) continue; /* ignore snapshots */ if (strchr(entry.mnt_special, '@') != NULL) continue; if ((zhp = zfs_open(g_zfs, entry.mnt_special, ZFS_TYPE_FILESYSTEM)) == NULL) { ret = 1; continue; } /* * Ignore datasets that are excluded/restricted by * parent pool name. 
*/ if (zpool_skip_pool(zfs_get_pool_name(zhp))) { zfs_close(zhp); continue; } switch (op) { case OP_SHARE: verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS, nfs_mnt_prop, sizeof (nfs_mnt_prop), NULL, NULL, 0, B_FALSE) == 0); if (strcmp(nfs_mnt_prop, "off") != 0) break; verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB, nfs_mnt_prop, sizeof (nfs_mnt_prop), NULL, NULL, 0, B_FALSE) == 0); if (strcmp(nfs_mnt_prop, "off") == 0) continue; break; case OP_MOUNT: /* Ignore legacy mounts */ verify(zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, nfs_mnt_prop, sizeof (nfs_mnt_prop), NULL, NULL, 0, B_FALSE) == 0); if (strcmp(nfs_mnt_prop, "legacy") == 0) continue; /* Ignore canmount=noauto mounts */ if (zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT) == ZFS_CANMOUNT_NOAUTO) continue; default: break; } node = safe_malloc(sizeof (unshare_unmount_node_t)); node->un_zhp = zhp; node->un_mountp = safe_strdup(entry.mnt_mountp); uu_avl_node_init(node, &node->un_avlnode, pool); if (uu_avl_find(tree, node, NULL, &idx) == NULL) { uu_avl_insert(tree, node, idx); } else { zfs_close(node->un_zhp); free(node->un_mountp); free(node); } } /* * Walk the AVL tree in reverse, unmounting each filesystem and * removing it from the AVL tree in the process. */ if ((walk = uu_avl_walk_start(tree, UU_WALK_REVERSE | UU_WALK_ROBUST)) == NULL) nomem(); while ((node = uu_avl_walk_next(walk)) != NULL) { const char *mntarg = NULL; uu_avl_remove(tree, node); switch (op) { case OP_SHARE: if (zfs_unshareall_bytype(node->un_zhp, node->un_mountp, protocol) != 0) ret = 1; break; case OP_MOUNT: if (zfs_unmount(node->un_zhp, mntarg, flags) != 0) ret = 1; break; } zfs_close(node->un_zhp); free(node->un_mountp); free(node); } if (op == OP_SHARE) zfs_commit_shares(protocol); uu_avl_walk_end(walk); uu_avl_destroy(tree); uu_avl_pool_destroy(pool); } else { if (argc != 1) { if (argc == 0) (void) fprintf(stderr, gettext("missing filesystem argument\n")); else (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } /* * We have an argument, but it may be a full path or a ZFS * filesystem. Pass full paths off to unmount_path() (shared by * manual_unmount), otherwise open the filesystem and pass to * zfs_unmount(). */ if (argv[0][0] == '/') return (unshare_unmount_path(op, argv[0], flags, B_FALSE)); if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_FILESYSTEM)) == NULL) return (1); verify(zfs_prop_get(zhp, op == OP_SHARE ? 
ZFS_PROP_SHARENFS : ZFS_PROP_MOUNTPOINT, nfs_mnt_prop, sizeof (nfs_mnt_prop), NULL, NULL, 0, B_FALSE) == 0); switch (op) { case OP_SHARE: verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS, nfs_mnt_prop, sizeof (nfs_mnt_prop), NULL, NULL, 0, B_FALSE) == 0); verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB, sharesmb, sizeof (sharesmb), NULL, NULL, 0, B_FALSE) == 0); if (strcmp(nfs_mnt_prop, "off") == 0 && strcmp(sharesmb, "off") == 0) { (void) fprintf(stderr, gettext("cannot " "unshare '%s': legacy share\n"), zfs_get_name(zhp)); (void) fprintf(stderr, gettext("use " "exports(5) or smb.conf(5) to unshare " "this filesystem\n")); ret = 1; } else if (!zfs_is_shared(zhp)) { (void) fprintf(stderr, gettext("cannot " "unshare '%s': not currently " "shared\n"), zfs_get_name(zhp)); ret = 1; } else if (zfs_unshareall(zhp) != 0) { ret = 1; } break; case OP_MOUNT: if (strcmp(nfs_mnt_prop, "legacy") == 0) { (void) fprintf(stderr, gettext("cannot " "unmount '%s': legacy " "mountpoint\n"), zfs_get_name(zhp)); (void) fprintf(stderr, gettext("use " "umount(8) to unmount this " "filesystem\n")); ret = 1; } else if (!zfs_is_mounted(zhp, NULL)) { (void) fprintf(stderr, gettext("cannot " "unmount '%s': not currently " "mounted\n"), zfs_get_name(zhp)); ret = 1; } else if (zfs_unmountall(zhp, flags) != 0) { ret = 1; } break; } zfs_close(zhp); } return (ret); } /* * zfs unmount [-fu] -a * zfs unmount [-fu] filesystem * * Unmount all filesystems, or a specific ZFS filesystem. */ static int zfs_do_unmount(int argc, char **argv) { return (unshare_unmount(OP_MOUNT, argc, argv)); } /* * zfs unshare -a * zfs unshare filesystem * * Unshare all filesystems, or a specific ZFS filesystem. */ static int zfs_do_unshare(int argc, char **argv) { return (unshare_unmount(OP_SHARE, argc, argv)); } static int find_command_idx(char *command, int *idx) { int i; for (i = 0; i < NCOMMAND; i++) { if (command_table[i].name == NULL) continue; if (strcmp(command, command_table[i].name) == 0) { *idx = i; return (0); } } return (1); } static int zfs_do_diff(int argc, char **argv) { zfs_handle_t *zhp; int flags = 0; char *tosnap = NULL; char *fromsnap = NULL; char *atp, *copy; int err = 0; int c; struct sigaction sa; while ((c = getopt(argc, argv, "FHt")) != -1) { switch (c) { case 'F': flags |= ZFS_DIFF_CLASSIFY; break; case 'H': flags |= ZFS_DIFF_PARSEABLE; break; case 't': flags |= ZFS_DIFF_TIMESTAMP; break; default: (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("must provide at least one snapshot name\n")); usage(B_FALSE); } if (argc > 2) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } fromsnap = argv[0]; tosnap = (argc == 2) ? 
argv[1] : NULL; copy = NULL; if (*fromsnap != '@') copy = strdup(fromsnap); else if (tosnap) copy = strdup(tosnap); if (copy == NULL) usage(B_FALSE); if ((atp = strchr(copy, '@')) != NULL) *atp = '\0'; if ((zhp = zfs_open(g_zfs, copy, ZFS_TYPE_FILESYSTEM)) == NULL) { free(copy); return (1); } free(copy); /* * Ignore SIGPIPE so that the library can give us * information on any failure */ if (sigemptyset(&sa.sa_mask) == -1) { err = errno; goto out; } sa.sa_flags = 0; sa.sa_handler = SIG_IGN; if (sigaction(SIGPIPE, &sa, NULL) == -1) { err = errno; goto out; } err = zfs_show_diffs(zhp, STDOUT_FILENO, fromsnap, tosnap, flags); out: zfs_close(zhp); return (err != 0); } /* * zfs bookmark | * * Creates a bookmark with the given name from the source snapshot * or creates a copy of an existing source bookmark. */ static int zfs_do_bookmark(int argc, char **argv) { char *source, *bookname; char expbuf[ZFS_MAX_DATASET_NAME_LEN]; int source_type; nvlist_t *nvl; int ret = 0; int c; /* check options */ while ((c = getopt(argc, argv, "")) != -1) { switch (c) { case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); goto usage; } } argc -= optind; argv += optind; /* check number of arguments */ if (argc < 1) { (void) fprintf(stderr, gettext("missing source argument\n")); goto usage; } if (argc < 2) { (void) fprintf(stderr, gettext("missing bookmark argument\n")); goto usage; } source = argv[0]; bookname = argv[1]; if (strchr(source, '@') == NULL && strchr(source, '#') == NULL) { (void) fprintf(stderr, gettext("invalid source name '%s': " "must contain a '@' or '#'\n"), source); goto usage; } if (strchr(bookname, '#') == NULL) { (void) fprintf(stderr, gettext("invalid bookmark name '%s': " "must contain a '#'\n"), bookname); goto usage; } /* * expand source or bookname to full path: * one of them may be specified as short name */ { char **expand; char *source_short, *bookname_short; source_short = strpbrk(source, "@#"); bookname_short = strpbrk(bookname, "#"); if (source_short == source && bookname_short == bookname) { (void) fprintf(stderr, gettext( "either source or bookmark must be specified as " "full dataset paths")); goto usage; } else if (source_short != source && bookname_short != bookname) { expand = NULL; } else if (source_short != source) { strlcpy(expbuf, source, sizeof (expbuf)); expand = &bookname; } else if (bookname_short != bookname) { strlcpy(expbuf, bookname, sizeof (expbuf)); expand = &source; } else { abort(); } if (expand != NULL) { *strpbrk(expbuf, "@#") = '\0'; /* dataset name in buf */ (void) strlcat(expbuf, *expand, sizeof (expbuf)); *expand = expbuf; } } /* determine source type */ switch (*strpbrk(source, "@#")) { case '@': source_type = ZFS_TYPE_SNAPSHOT; break; case '#': source_type = ZFS_TYPE_BOOKMARK; break; default: abort(); } /* test the source exists */ zfs_handle_t *zhp; zhp = zfs_open(g_zfs, source, source_type); if (zhp == NULL) goto usage; zfs_close(zhp); nvl = fnvlist_alloc(); fnvlist_add_string(nvl, bookname, source); ret = lzc_bookmark(nvl, NULL); fnvlist_free(nvl); if (ret != 0) { const char *err_msg = NULL; char errbuf[1024]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot create bookmark '%s'"), bookname); switch (ret) { case EXDEV: err_msg = "bookmark is in a different pool"; break; case ZFS_ERR_BOOKMARK_SOURCE_NOT_ANCESTOR: err_msg = "source is not an ancestor of the " "new bookmark's dataset"; break; case EEXIST: err_msg = "bookmark exists"; break; case EINVAL: err_msg = "invalid argument"; break; case ENOTSUP: 
err_msg = "bookmark feature not enabled"; break; case ENOSPC: err_msg = "out of space"; break; case ENOENT: err_msg = "dataset does not exist"; break; default: (void) zfs_standard_error(g_zfs, ret, errbuf); break; } if (err_msg != NULL) { (void) fprintf(stderr, "%s: %s\n", errbuf, dgettext(TEXT_DOMAIN, err_msg)); } } return (ret != 0); usage: usage(B_FALSE); return (-1); } static int zfs_do_channel_program(int argc, char **argv) { int ret, fd, c; char *progbuf, *filename, *poolname; size_t progsize, progread; nvlist_t *outnvl = NULL; uint64_t instrlimit = ZCP_DEFAULT_INSTRLIMIT; uint64_t memlimit = ZCP_DEFAULT_MEMLIMIT; boolean_t sync_flag = B_TRUE, json_output = B_FALSE; zpool_handle_t *zhp; /* check options */ while ((c = getopt(argc, argv, "nt:m:j")) != -1) { switch (c) { case 't': case 'm': { uint64_t arg; char *endp; errno = 0; arg = strtoull(optarg, &endp, 0); if (errno != 0 || *endp != '\0') { (void) fprintf(stderr, gettext( "invalid argument " "'%s': expected integer\n"), optarg); goto usage; } if (c == 't') { instrlimit = arg; } else { ASSERT3U(c, ==, 'm'); memlimit = arg; } break; } case 'n': { sync_flag = B_FALSE; break; } case 'j': { json_output = B_TRUE; break; } case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); goto usage; } } argc -= optind; argv += optind; if (argc < 2) { (void) fprintf(stderr, gettext("invalid number of arguments\n")); goto usage; } poolname = argv[0]; filename = argv[1]; if (strcmp(filename, "-") == 0) { fd = 0; filename = "standard input"; } else if ((fd = open(filename, O_RDONLY)) < 0) { (void) fprintf(stderr, gettext("cannot open '%s': %s\n"), filename, strerror(errno)); return (1); } if ((zhp = zpool_open(g_zfs, poolname)) == NULL) { (void) fprintf(stderr, gettext("cannot open pool '%s'\n"), poolname); if (fd != 0) (void) close(fd); return (1); } zpool_close(zhp); /* * Read in the channel program, expanding the program buffer as * necessary. */ progread = 0; progsize = 1024; progbuf = safe_malloc(progsize); do { ret = read(fd, progbuf + progread, progsize - progread); progread += ret; if (progread == progsize && ret > 0) { progsize *= 2; progbuf = safe_realloc(progbuf, progsize); } } while (ret > 0); if (fd != 0) (void) close(fd); if (ret < 0) { free(progbuf); (void) fprintf(stderr, gettext("cannot read '%s': %s\n"), filename, strerror(errno)); return (1); } progbuf[progread] = '\0'; /* * Any remaining arguments are passed as arguments to the lua script as * a string array: * { * "argv" -> [ "arg 1", ... "arg n" ], * } */ nvlist_t *argnvl = fnvlist_alloc(); fnvlist_add_string_array(argnvl, ZCP_ARG_CLIARGV, argv + 2, argc - 2); if (sync_flag) { ret = lzc_channel_program(poolname, progbuf, instrlimit, memlimit, argnvl, &outnvl); } else { ret = lzc_channel_program_nosync(poolname, progbuf, instrlimit, memlimit, argnvl, &outnvl); } if (ret != 0) { /* * On error, report the error message handed back by lua if one * exists. Otherwise, generate an appropriate error message, * falling back on strerror() for an unexpected return code. 
*/ char *errstring = NULL; const char *msg = gettext("Channel program execution failed"); uint64_t instructions = 0; if (outnvl != NULL && nvlist_exists(outnvl, ZCP_RET_ERROR)) { (void) nvlist_lookup_string(outnvl, ZCP_RET_ERROR, &errstring); if (errstring == NULL) errstring = strerror(ret); if (ret == ETIME) { (void) nvlist_lookup_uint64(outnvl, ZCP_ARG_INSTRLIMIT, &instructions); } } else { switch (ret) { case EINVAL: errstring = "Invalid instruction or memory limit."; break; case ENOMEM: errstring = "Return value too large."; break; case ENOSPC: errstring = "Memory limit exhausted."; break; case ETIME: errstring = "Timed out."; break; case EPERM: errstring = "Permission denied. Channel " "programs must be run as root."; break; default: (void) zfs_standard_error(g_zfs, ret, msg); } } if (errstring != NULL) (void) fprintf(stderr, "%s:\n%s\n", msg, errstring); if (ret == ETIME && instructions != 0) (void) fprintf(stderr, gettext("%llu Lua instructions\n"), (u_longlong_t)instructions); } else { if (json_output) { (void) nvlist_print_json(stdout, outnvl); } else if (nvlist_empty(outnvl)) { (void) fprintf(stdout, gettext("Channel program fully " "executed and did not produce output.\n")); } else { (void) fprintf(stdout, gettext("Channel program fully " "executed and produced output:\n")); dump_nvlist(outnvl, 4); } } free(progbuf); fnvlist_free(outnvl); fnvlist_free(argnvl); return (ret != 0); usage: usage(B_FALSE); return (-1); } typedef struct loadkey_cbdata { boolean_t cb_loadkey; boolean_t cb_recursive; boolean_t cb_noop; char *cb_keylocation; uint64_t cb_numfailed; uint64_t cb_numattempted; } loadkey_cbdata_t; static int load_key_callback(zfs_handle_t *zhp, void *data) { int ret; boolean_t is_encroot; loadkey_cbdata_t *cb = data; uint64_t keystatus = zfs_prop_get_int(zhp, ZFS_PROP_KEYSTATUS); /* * If we are working recursively, we want to skip loading / unloading * keys for non-encryption roots and datasets whose keys are already * in the desired end-state. 
*/ if (cb->cb_recursive) { ret = zfs_crypto_get_encryption_root(zhp, &is_encroot, NULL); if (ret != 0) return (ret); if (!is_encroot) return (0); if ((cb->cb_loadkey && keystatus == ZFS_KEYSTATUS_AVAILABLE) || (!cb->cb_loadkey && keystatus == ZFS_KEYSTATUS_UNAVAILABLE)) return (0); } cb->cb_numattempted++; if (cb->cb_loadkey) ret = zfs_crypto_load_key(zhp, cb->cb_noop, cb->cb_keylocation); else ret = zfs_crypto_unload_key(zhp); if (ret != 0) { cb->cb_numfailed++; return (ret); } return (0); } static int load_unload_keys(int argc, char **argv, boolean_t loadkey) { int c, ret = 0, flags = 0; boolean_t do_all = B_FALSE; loadkey_cbdata_t cb = { 0 }; cb.cb_loadkey = loadkey; while ((c = getopt(argc, argv, "anrL:")) != -1) { /* noop and alternate keylocations only apply to zfs load-key */ if (loadkey) { switch (c) { case 'n': cb.cb_noop = B_TRUE; continue; case 'L': cb.cb_keylocation = optarg; continue; default: break; } } switch (c) { case 'a': do_all = B_TRUE; cb.cb_recursive = B_TRUE; break; case 'r': flags |= ZFS_ITER_RECURSE; cb.cb_recursive = B_TRUE; break; default: (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argc -= optind; argv += optind; if (!do_all && argc == 0) { (void) fprintf(stderr, gettext("Missing dataset argument or -a option\n")); usage(B_FALSE); } if (do_all && argc != 0) { (void) fprintf(stderr, gettext("Cannot specify dataset with -a option\n")); usage(B_FALSE); } if (cb.cb_recursive && cb.cb_keylocation != NULL && strcmp(cb.cb_keylocation, "prompt") != 0) { (void) fprintf(stderr, gettext("alternate keylocation may only " "be 'prompt' with -r or -a\n")); usage(B_FALSE); } ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, NULL, NULL, 0, load_key_callback, &cb); if (cb.cb_noop || (cb.cb_recursive && cb.cb_numattempted != 0)) { (void) printf(gettext("%llu / %llu key(s) successfully %s\n"), (u_longlong_t)(cb.cb_numattempted - cb.cb_numfailed), (u_longlong_t)cb.cb_numattempted, loadkey ? (cb.cb_noop ? 
"verified" : "loaded") : "unloaded"); } if (cb.cb_numfailed != 0) ret = -1; return (ret); } static int zfs_do_load_key(int argc, char **argv) { return (load_unload_keys(argc, argv, B_TRUE)); } static int zfs_do_unload_key(int argc, char **argv) { return (load_unload_keys(argc, argv, B_FALSE)); } static int zfs_do_change_key(int argc, char **argv) { int c, ret; uint64_t keystatus; boolean_t loadkey = B_FALSE, inheritkey = B_FALSE; zfs_handle_t *zhp = NULL; nvlist_t *props = fnvlist_alloc(); while ((c = getopt(argc, argv, "lio:")) != -1) { switch (c) { case 'l': loadkey = B_TRUE; break; case 'i': inheritkey = B_TRUE; break; case 'o': if (!parseprop(props, optarg)) { nvlist_free(props); return (1); } break; default: (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } if (inheritkey && !nvlist_empty(props)) { (void) fprintf(stderr, gettext("Properties not allowed for inheriting\n")); usage(B_FALSE); } argc -= optind; argv += optind; if (argc < 1) { (void) fprintf(stderr, gettext("Missing dataset argument\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("Too many arguments\n")); usage(B_FALSE); } zhp = zfs_open(g_zfs, argv[argc - 1], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); if (zhp == NULL) usage(B_FALSE); if (loadkey) { keystatus = zfs_prop_get_int(zhp, ZFS_PROP_KEYSTATUS); if (keystatus != ZFS_KEYSTATUS_AVAILABLE) { ret = zfs_crypto_load_key(zhp, B_FALSE, NULL); if (ret != 0) { nvlist_free(props); zfs_close(zhp); return (-1); } } /* refresh the properties so the new keystatus is visible */ zfs_refresh_properties(zhp); } ret = zfs_crypto_rewrap(zhp, props, inheritkey); if (ret != 0) { nvlist_free(props); zfs_close(zhp); return (-1); } nvlist_free(props); zfs_close(zhp); return (0); } /* * 1) zfs project [-d|-r] * List project ID and inherit flag of file(s) or directories. * -d: List the directory itself, not its children. * -r: List subdirectories recursively. * * 2) zfs project -C [-k] [-r] * Clear project inherit flag and/or ID on the file(s) or directories. * -k: Keep the project ID unchanged. If not specified, the project ID * will be reset as zero. * -r: Clear on subdirectories recursively. * * 3) zfs project -c [-0] [-d|-r] [-p id] * Check project ID and inherit flag on the file(s) or directories, * report the outliers. * -0: Print file name followed by a NUL instead of newline. * -d: Check the directory itself, not its children. * -p: Specify the referenced ID for comparing with the target file(s) * or directories' project IDs. If not specified, the target (top) * directory's project ID will be used as the referenced one. * -r: Check subdirectories recursively. * * 4) zfs project [-p id] [-r] [-s] * Set project ID and/or inherit flag on the file(s) or directories. * -p: Set the project ID as the given id. * -r: Set on subdirectories recursively. If not specify "-p" option, * it will use top-level directory's project ID as the given id, * then set both project ID and inherit flag on all descendants * of the top-level directory. * -s: Set project inherit flag. 
*/ static int zfs_do_project(int argc, char **argv) { zfs_project_control_t zpc = { .zpc_expected_projid = ZFS_INVALID_PROJID, .zpc_op = ZFS_PROJECT_OP_DEFAULT, .zpc_dironly = B_FALSE, .zpc_keep_projid = B_FALSE, .zpc_newline = B_TRUE, .zpc_recursive = B_FALSE, .zpc_set_flag = B_FALSE, }; int ret = 0, c; if (argc < 2) usage(B_FALSE); while ((c = getopt(argc, argv, "0Ccdkp:rs")) != -1) { switch (c) { case '0': zpc.zpc_newline = B_FALSE; break; case 'C': if (zpc.zpc_op != ZFS_PROJECT_OP_DEFAULT) { (void) fprintf(stderr, gettext("cannot " "specify '-C' '-c' '-s' together\n")); usage(B_FALSE); } zpc.zpc_op = ZFS_PROJECT_OP_CLEAR; break; case 'c': if (zpc.zpc_op != ZFS_PROJECT_OP_DEFAULT) { (void) fprintf(stderr, gettext("cannot " "specify '-C' '-c' '-s' together\n")); usage(B_FALSE); } zpc.zpc_op = ZFS_PROJECT_OP_CHECK; break; case 'd': zpc.zpc_dironly = B_TRUE; /* overwrite "-r" option */ zpc.zpc_recursive = B_FALSE; break; case 'k': zpc.zpc_keep_projid = B_TRUE; break; case 'p': { char *endptr; errno = 0; zpc.zpc_expected_projid = strtoull(optarg, &endptr, 0); if (errno != 0 || *endptr != '\0') { (void) fprintf(stderr, gettext("project ID must be less than " "%u\n"), UINT32_MAX); usage(B_FALSE); } if (zpc.zpc_expected_projid >= UINT32_MAX) { (void) fprintf(stderr, gettext("invalid project ID\n")); usage(B_FALSE); } break; } case 'r': zpc.zpc_recursive = B_TRUE; /* overwrite "-d" option */ zpc.zpc_dironly = B_FALSE; break; case 's': if (zpc.zpc_op != ZFS_PROJECT_OP_DEFAULT) { (void) fprintf(stderr, gettext("cannot " "specify '-C' '-c' '-s' together\n")); usage(B_FALSE); } zpc.zpc_set_flag = B_TRUE; zpc.zpc_op = ZFS_PROJECT_OP_SET; break; default: (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } if (zpc.zpc_op == ZFS_PROJECT_OP_DEFAULT) { if (zpc.zpc_expected_projid != ZFS_INVALID_PROJID) zpc.zpc_op = ZFS_PROJECT_OP_SET; else zpc.zpc_op = ZFS_PROJECT_OP_LIST; } switch (zpc.zpc_op) { case ZFS_PROJECT_OP_LIST: if (zpc.zpc_keep_projid) { (void) fprintf(stderr, gettext("'-k' is only valid together with '-C'\n")); usage(B_FALSE); } if (!zpc.zpc_newline) { (void) fprintf(stderr, gettext("'-0' is only valid together with '-c'\n")); usage(B_FALSE); } break; case ZFS_PROJECT_OP_CHECK: if (zpc.zpc_keep_projid) { (void) fprintf(stderr, gettext("'-k' is only valid together with '-C'\n")); usage(B_FALSE); } break; case ZFS_PROJECT_OP_CLEAR: if (zpc.zpc_dironly) { (void) fprintf(stderr, gettext("'-d' is useless together with '-C'\n")); usage(B_FALSE); } if (!zpc.zpc_newline) { (void) fprintf(stderr, gettext("'-0' is only valid together with '-c'\n")); usage(B_FALSE); } if (zpc.zpc_expected_projid != ZFS_INVALID_PROJID) { (void) fprintf(stderr, gettext("'-p' is useless together with '-C'\n")); usage(B_FALSE); } break; case ZFS_PROJECT_OP_SET: if (zpc.zpc_dironly) { (void) fprintf(stderr, gettext("'-d' is useless for set project ID and/or " "inherit flag\n")); usage(B_FALSE); } if (zpc.zpc_keep_projid) { (void) fprintf(stderr, gettext("'-k' is only valid together with '-C'\n")); usage(B_FALSE); } if (!zpc.zpc_newline) { (void) fprintf(stderr, gettext("'-0' is only valid together with '-c'\n")); usage(B_FALSE); } break; default: ASSERT(0); break; } argv += optind; argc -= optind; if (argc == 0) { (void) fprintf(stderr, gettext("missing file or directory target(s)\n")); usage(B_FALSE); } for (int i = 0; i < argc; i++) { int err; err = zfs_project_handle(argv[i], &zpc); if (err && !ret) ret = err; } return (ret); } static int zfs_do_wait(int argc, char **argv) { boolean_t 
enabled[ZFS_WAIT_NUM_ACTIVITIES]; int error, i; int c; /* By default, wait for all types of activity. */ for (i = 0; i < ZFS_WAIT_NUM_ACTIVITIES; i++) enabled[i] = B_TRUE; while ((c = getopt(argc, argv, "t:")) != -1) { switch (c) { case 't': { static char *col_subopts[] = { "deleteq", NULL }; char *value; /* Reset activities array */ bzero(&enabled, sizeof (enabled)); while (*optarg != '\0') { int activity = getsubopt(&optarg, col_subopts, &value); if (activity < 0) { (void) fprintf(stderr, gettext("invalid activity '%s'\n"), value); usage(B_FALSE); } enabled[activity] = B_TRUE; } break; } case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); usage(B_FALSE); } } argv += optind; argc -= optind; if (argc < 1) { (void) fprintf(stderr, gettext("missing 'filesystem' " "argument\n")); usage(B_FALSE); } if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } zfs_handle_t *zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_FILESYSTEM); if (zhp == NULL) return (1); for (;;) { boolean_t missing = B_FALSE; boolean_t any_waited = B_FALSE; for (int i = 0; i < ZFS_WAIT_NUM_ACTIVITIES; i++) { boolean_t waited; if (!enabled[i]) continue; error = zfs_wait_status(zhp, i, &missing, &waited); if (error != 0 || missing) break; any_waited = (any_waited || waited); } if (error != 0 || missing || !any_waited) break; } zfs_close(zhp); return (error); } /* * Display version message */ static int zfs_do_version(int argc, char **argv) { if (zfs_version_print() == -1) return (1); return (0); } int main(int argc, char **argv) { int ret = 0; int i = 0; char *cmdname; char **newargv; (void) setlocale(LC_ALL, ""); (void) setlocale(LC_NUMERIC, "C"); (void) textdomain(TEXT_DOMAIN); opterr = 0; /* * Make sure the user has specified some command. */ if (argc < 2) { (void) fprintf(stderr, gettext("missing command\n")); usage(B_FALSE); } cmdname = argv[1]; /* * The 'umount' command is an alias for 'unmount' */ if (strcmp(cmdname, "umount") == 0) cmdname = "unmount"; /* * The 'recv' command is an alias for 'receive' */ if (strcmp(cmdname, "recv") == 0) cmdname = "receive"; /* * The 'snap' command is an alias for 'snapshot' */ if (strcmp(cmdname, "snap") == 0) cmdname = "snapshot"; /* * Special case '-?' */ if ((strcmp(cmdname, "-?") == 0) || (strcmp(cmdname, "--help") == 0)) usage(B_TRUE); /* * Special case '-V|--version' */ if ((strcmp(cmdname, "-V") == 0) || (strcmp(cmdname, "--version") == 0)) return (zfs_do_version(argc, argv)); if ((g_zfs = libzfs_init()) == NULL) { (void) fprintf(stderr, "%s\n", libzfs_error_init(errno)); return (1); } mnttab_file = g_zfs->libzfs_mnttab; zfs_save_arguments(argc, argv, history_str, sizeof (history_str)); libzfs_print_on_error(g_zfs, B_TRUE); /* * Many commands modify input strings for string parsing reasons. * We create a copy to protect the original argv. */ newargv = malloc((argc + 1) * sizeof (newargv[0])); for (i = 0; i < argc; i++) newargv[i] = strdup(argv[i]); newargv[argc] = NULL; /* * Run the appropriate command. 
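 *
 * Note that an argument containing '=' where a command name is expected is
 * treated as shorthand for 'zfs set': the fallback below looks up the "set"
 * command and passes the original argument list through unchanged.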
*/ libzfs_mnttab_cache(g_zfs, B_TRUE); if (find_command_idx(cmdname, &i) == 0) { current_command = &command_table[i]; ret = command_table[i].func(argc - 1, newargv + 1); } else if (strchr(cmdname, '=') != NULL) { verify(find_command_idx("set", &i) == 0); current_command = &command_table[i]; ret = command_table[i].func(argc, newargv); } else { (void) fprintf(stderr, gettext("unrecognized " "command '%s'\n"), cmdname); usage(B_FALSE); ret = 1; } for (i = 0; i < argc; i++) free(newargv[i]); free(newargv); if (ret == 0 && log_history) (void) zpool_log_history(g_zfs, history_str); libzfs_fini(g_zfs); /* * The 'ZFS_ABORT' environment variable causes us to dump core on exit * for the purposes of running ::findleaks. */ if (getenv("ZFS_ABORT") != NULL) { (void) printf("dumping core by request\n"); abort(); } return (ret); } #ifdef __FreeBSD__ #include #include /* * Attach/detach the given dataset to/from the given jail */ /* ARGSUSED */ static int zfs_do_jail_impl(int argc, char **argv, boolean_t attach) { zfs_handle_t *zhp; int jailid, ret; /* check number of arguments */ if (argc < 3) { (void) fprintf(stderr, gettext("missing argument(s)\n")); usage(B_FALSE); } if (argc > 3) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } jailid = jail_getid(argv[1]); if (jailid < 0) { (void) fprintf(stderr, gettext("invalid jail id or name\n")); usage(B_FALSE); } zhp = zfs_open(g_zfs, argv[2], ZFS_TYPE_FILESYSTEM); if (zhp == NULL) return (1); ret = (zfs_jail(zhp, jailid, attach) != 0); zfs_close(zhp); return (ret); } /* * zfs jail jailid filesystem * * Attach the given dataset to the given jail */ /* ARGSUSED */ static int zfs_do_jail(int argc, char **argv) { return (zfs_do_jail_impl(argc, argv, B_TRUE)); } /* * zfs unjail jailid filesystem * * Detach the given dataset from the given jail */ /* ARGSUSED */ static int zfs_do_unjail(int argc, char **argv) { return (zfs_do_jail_impl(argc, argv, B_FALSE)); } #endif diff --git a/sys/contrib/openzfs/cmd/ztest/ztest.c b/sys/contrib/openzfs/cmd/ztest/ztest.c index d2bf0101993e..cfa1290d78d1 100644 --- a/sys/contrib/openzfs/cmd/ztest/ztest.c +++ b/sys/contrib/openzfs/cmd/ztest/ztest.c @@ -1,7955 +1,8060 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2017 Joyent, Inc. * Copyright (c) 2017, Intel Corporation. */ /* * The objective of this program is to provide a DMU/ZAP/SPA stress test * that runs entirely in userland, is easy to use, and easy to extend. 
* * The overall design of the ztest program is as follows: * * (1) For each major functional area (e.g. adding vdevs to a pool, * creating and destroying datasets, reading and writing objects, etc) * we have a simple routine to test that functionality. These * individual routines do not have to do anything "stressful". * * (2) We turn these simple functionality tests into a stress test by * running them all in parallel, with as many threads as desired, * and spread across as many datasets, objects, and vdevs as desired. * * (3) While all this is happening, we inject faults into the pool to * verify that self-healing data really works. * * (4) Every time we open a dataset, we change its checksum and compression * functions. Thus even individual objects vary from block to block * in which checksum they use and whether they're compressed. * * (5) To verify that we never lose on-disk consistency after a crash, * we run the entire test in a child of the main process. * At random times, the child self-immolates with a SIGKILL. * This is the software equivalent of pulling the power cord. * The parent then runs the test again, using the existing * storage pool, as many times as desired. If backwards compatibility * testing is enabled ztest will sometimes run the "older" version * of ztest after a SIGKILL. * * (6) To verify that we don't have future leaks or temporal incursions, * many of the functional tests record the transaction group number * as part of their data. When reading old data, they verify that * the transaction group number is less than the current, open txg. * If you add a new test, please do this if applicable. * * (7) Threads are created with a reduced stack size, for sanity checking. * Therefore, it's important not to allocate huge buffers on the stack. * * When run with no arguments, ztest runs for about five minutes and * produces no output if successful. To get a little bit of information, * specify -V. To get more information, specify -VV, and so on. * * To turn this into an overnight stress test, use -T to specify run time. * * You can ask more vdevs [-v], datasets [-d], or threads [-t] * to increase the pool capacity, fanout, and overall stress level. * * Use the -k option to set the desired frequency of kills. * * When ztest invokes itself it passes all relevant information through a * temporary file which is mmap-ed in the child process. This allows shared * memory to survive the exec syscall. The ztest_shared_hdr_t struct is always * stored at offset 0 of this file and contains information on the size and * number of shared structures in the file. The information stored in this file * must remain backwards compatible with older versions of ztest so that * ztest can invoke them during backwards compatibility testing (-B). 
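 *
 * A representative (hypothetical) overnight invocation, combining the
 * options described above:
 *
 *	ztest -VV -T 28800 -P 60 -k 70 -f /var/tmp
 *
 * i.e. run for eight hours in passes of sixty seconds, with a 70% kill
 * rate, vdev files kept under /var/tmp, and doubly verbose output.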
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if (__GLIBC__ && !__UCLIBC__) #include /* for backtrace() */ #endif static int ztest_fd_data = -1; static int ztest_fd_rand = -1; typedef struct ztest_shared_hdr { uint64_t zh_hdr_size; uint64_t zh_opts_size; uint64_t zh_size; uint64_t zh_stats_size; uint64_t zh_stats_count; uint64_t zh_ds_size; uint64_t zh_ds_count; } ztest_shared_hdr_t; static ztest_shared_hdr_t *ztest_shared_hdr; enum ztest_class_state { ZTEST_VDEV_CLASS_OFF, ZTEST_VDEV_CLASS_ON, ZTEST_VDEV_CLASS_RND }; +#define ZO_GVARS_MAX_ARGLEN ((size_t)64) +#define ZO_GVARS_MAX_COUNT ((size_t)10) + typedef struct ztest_shared_opts { char zo_pool[ZFS_MAX_DATASET_NAME_LEN]; char zo_dir[ZFS_MAX_DATASET_NAME_LEN]; char zo_alt_ztest[MAXNAMELEN]; char zo_alt_libpath[MAXNAMELEN]; uint64_t zo_vdevs; uint64_t zo_vdevtime; size_t zo_vdev_size; int zo_ashift; int zo_mirrors; int zo_raid_children; int zo_raid_parity; char zo_raid_type[8]; int zo_draid_data; int zo_draid_spares; int zo_datasets; int zo_threads; uint64_t zo_passtime; uint64_t zo_killrate; int zo_verbose; int zo_init; uint64_t zo_time; uint64_t zo_maxloops; uint64_t zo_metaslab_force_ganging; int zo_mmp_test; int zo_special_vdevs; int zo_dump_dbgmsg; + int zo_gvars_count; + char zo_gvars[ZO_GVARS_MAX_COUNT][ZO_GVARS_MAX_ARGLEN]; } ztest_shared_opts_t; static const ztest_shared_opts_t ztest_opts_defaults = { .zo_pool = "ztest", .zo_dir = "/tmp", .zo_alt_ztest = { '\0' }, .zo_alt_libpath = { '\0' }, .zo_vdevs = 5, .zo_ashift = SPA_MINBLOCKSHIFT, .zo_mirrors = 2, .zo_raid_children = 4, .zo_raid_parity = 1, .zo_raid_type = VDEV_TYPE_RAIDZ, .zo_vdev_size = SPA_MINDEVSIZE * 4, /* 256m default size */ .zo_draid_data = 4, /* data drives */ .zo_draid_spares = 1, /* distributed spares */ .zo_datasets = 7, .zo_threads = 23, .zo_passtime = 60, /* 60 seconds */ .zo_killrate = 70, /* 70% kill rate */ .zo_verbose = 0, .zo_mmp_test = 0, .zo_init = 1, .zo_time = 300, /* 5 minutes */ .zo_maxloops = 50, /* max loops during spa_freeze() */ .zo_metaslab_force_ganging = 64 << 10, .zo_special_vdevs = ZTEST_VDEV_CLASS_RND, + .zo_gvars_count = 0, }; extern uint64_t metaslab_force_ganging; extern uint64_t metaslab_df_alloc_threshold; extern unsigned long zfs_deadman_synctime_ms; extern int metaslab_preload_limit; extern boolean_t zfs_compressed_arc_enabled; extern int zfs_abd_scatter_enabled; extern int dmu_object_alloc_chunk_shift; extern boolean_t zfs_force_some_double_word_sm_entries; extern unsigned long zio_decompress_fail_fraction; extern unsigned long zfs_reconstruct_indirect_damage_fraction; static ztest_shared_opts_t *ztest_shared_opts; static ztest_shared_opts_t ztest_opts; static char *ztest_wkeydata = "abcdefghijklmnopqrstuvwxyz012345"; typedef struct ztest_shared_ds { uint64_t zd_seq; } ztest_shared_ds_t; static ztest_shared_ds_t *ztest_shared_ds; #define ZTEST_GET_SHARED_DS(d) (&ztest_shared_ds[d]) #define BT_MAGIC 0x123456789abcdefULL #define MAXFAULTS(zs) \ (MAX((zs)->zs_mirrors, 1) * (ztest_opts.zo_raid_parity + 1) - 1) enum ztest_io_type { ZTEST_IO_WRITE_TAG, ZTEST_IO_WRITE_PATTERN, ZTEST_IO_WRITE_ZEROES, ZTEST_IO_TRUNCATE, ZTEST_IO_SETATTR, ZTEST_IO_REWRITE, ZTEST_IO_TYPES }; typedef 
struct ztest_block_tag { uint64_t bt_magic; uint64_t bt_objset; uint64_t bt_object; uint64_t bt_dnodesize; uint64_t bt_offset; uint64_t bt_gen; uint64_t bt_txg; uint64_t bt_crtxg; } ztest_block_tag_t; typedef struct bufwad { uint64_t bw_index; uint64_t bw_txg; uint64_t bw_data; } bufwad_t; /* * It would be better to use a rangelock_t per object. Unfortunately * the rangelock_t is not a drop-in replacement for rl_t, because we * still need to map from object ID to rangelock_t. */ typedef enum { RL_READER, RL_WRITER, RL_APPEND } rl_type_t; typedef struct rll { void *rll_writer; int rll_readers; kmutex_t rll_lock; kcondvar_t rll_cv; } rll_t; typedef struct rl { uint64_t rl_object; uint64_t rl_offset; uint64_t rl_size; rll_t *rl_lock; } rl_t; #define ZTEST_RANGE_LOCKS 64 #define ZTEST_OBJECT_LOCKS 64 /* * Object descriptor. Used as a template for object lookup/create/remove. */ typedef struct ztest_od { uint64_t od_dir; uint64_t od_object; dmu_object_type_t od_type; dmu_object_type_t od_crtype; uint64_t od_blocksize; uint64_t od_crblocksize; uint64_t od_crdnodesize; uint64_t od_gen; uint64_t od_crgen; char od_name[ZFS_MAX_DATASET_NAME_LEN]; } ztest_od_t; /* * Per-dataset state. */ typedef struct ztest_ds { ztest_shared_ds_t *zd_shared; objset_t *zd_os; pthread_rwlock_t zd_zilog_lock; zilog_t *zd_zilog; ztest_od_t *zd_od; /* debugging aid */ char zd_name[ZFS_MAX_DATASET_NAME_LEN]; kmutex_t zd_dirobj_lock; rll_t zd_object_lock[ZTEST_OBJECT_LOCKS]; rll_t zd_range_lock[ZTEST_RANGE_LOCKS]; } ztest_ds_t; /* * Per-iteration state. */ typedef void ztest_func_t(ztest_ds_t *zd, uint64_t id); typedef struct ztest_info { ztest_func_t *zi_func; /* test function */ uint64_t zi_iters; /* iterations per execution */ uint64_t *zi_interval; /* execute every seconds */ const char *zi_funcname; /* name of test function */ } ztest_info_t; typedef struct ztest_shared_callstate { uint64_t zc_count; /* per-pass count */ uint64_t zc_time; /* per-pass time */ uint64_t zc_next; /* next time to call this function */ } ztest_shared_callstate_t; static ztest_shared_callstate_t *ztest_shared_callstate; #define ZTEST_GET_SHARED_CALLSTATE(c) (&ztest_shared_callstate[c]) ztest_func_t ztest_dmu_read_write; ztest_func_t ztest_dmu_write_parallel; ztest_func_t ztest_dmu_object_alloc_free; ztest_func_t ztest_dmu_object_next_chunk; ztest_func_t ztest_dmu_commit_callbacks; ztest_func_t ztest_zap; ztest_func_t ztest_zap_parallel; ztest_func_t ztest_zil_commit; ztest_func_t ztest_zil_remount; ztest_func_t ztest_dmu_read_write_zcopy; ztest_func_t ztest_dmu_objset_create_destroy; ztest_func_t ztest_dmu_prealloc; ztest_func_t ztest_fzap; ztest_func_t ztest_dmu_snapshot_create_destroy; ztest_func_t ztest_dsl_prop_get_set; ztest_func_t ztest_spa_prop_get_set; ztest_func_t ztest_spa_create_destroy; ztest_func_t ztest_fault_inject; ztest_func_t ztest_dmu_snapshot_hold; ztest_func_t ztest_mmp_enable_disable; ztest_func_t ztest_scrub; ztest_func_t ztest_dsl_dataset_promote_busy; ztest_func_t ztest_vdev_attach_detach; ztest_func_t ztest_vdev_LUN_growth; ztest_func_t ztest_vdev_add_remove; ztest_func_t ztest_vdev_class_add; ztest_func_t ztest_vdev_aux_add_remove; ztest_func_t ztest_split_pool; ztest_func_t ztest_reguid; ztest_func_t ztest_spa_upgrade; ztest_func_t ztest_device_removal; ztest_func_t ztest_spa_checkpoint_create_discard; ztest_func_t ztest_initialize; ztest_func_t ztest_trim; ztest_func_t ztest_fletcher; ztest_func_t ztest_fletcher_incr; ztest_func_t ztest_verify_dnode_bt; uint64_t zopt_always = 0ULL * NANOSEC; /* all the time */ 
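/*
 * The zopt_* values below are what ztest_info[].zi_interval points at:
 * they give, in nanoseconds, roughly how often the corresponding test
 * function is invoked. For example, an entry registered with &zopt_rarely
 * runs about once every 60 seconds.
 */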
uint64_t zopt_incessant = 1ULL * NANOSEC / 10; /* every 1/10 second */ uint64_t zopt_often = 1ULL * NANOSEC; /* every second */ uint64_t zopt_sometimes = 10ULL * NANOSEC; /* every 10 seconds */ uint64_t zopt_rarely = 60ULL * NANOSEC; /* every 60 seconds */ #define ZTI_INIT(func, iters, interval) \ { .zi_func = (func), \ .zi_iters = (iters), \ .zi_interval = (interval), \ .zi_funcname = # func } ztest_info_t ztest_info[] = { ZTI_INIT(ztest_dmu_read_write, 1, &zopt_always), ZTI_INIT(ztest_dmu_write_parallel, 10, &zopt_always), ZTI_INIT(ztest_dmu_object_alloc_free, 1, &zopt_always), ZTI_INIT(ztest_dmu_object_next_chunk, 1, &zopt_sometimes), ZTI_INIT(ztest_dmu_commit_callbacks, 1, &zopt_always), ZTI_INIT(ztest_zap, 30, &zopt_always), ZTI_INIT(ztest_zap_parallel, 100, &zopt_always), ZTI_INIT(ztest_split_pool, 1, &zopt_always), ZTI_INIT(ztest_zil_commit, 1, &zopt_incessant), ZTI_INIT(ztest_zil_remount, 1, &zopt_sometimes), ZTI_INIT(ztest_dmu_read_write_zcopy, 1, &zopt_often), ZTI_INIT(ztest_dmu_objset_create_destroy, 1, &zopt_often), ZTI_INIT(ztest_dsl_prop_get_set, 1, &zopt_often), ZTI_INIT(ztest_spa_prop_get_set, 1, &zopt_sometimes), #if 0 ZTI_INIT(ztest_dmu_prealloc, 1, &zopt_sometimes), #endif ZTI_INIT(ztest_fzap, 1, &zopt_sometimes), ZTI_INIT(ztest_dmu_snapshot_create_destroy, 1, &zopt_sometimes), ZTI_INIT(ztest_spa_create_destroy, 1, &zopt_sometimes), ZTI_INIT(ztest_fault_inject, 1, &zopt_sometimes), ZTI_INIT(ztest_dmu_snapshot_hold, 1, &zopt_sometimes), ZTI_INIT(ztest_mmp_enable_disable, 1, &zopt_sometimes), ZTI_INIT(ztest_reguid, 1, &zopt_rarely), ZTI_INIT(ztest_scrub, 1, &zopt_rarely), ZTI_INIT(ztest_spa_upgrade, 1, &zopt_rarely), ZTI_INIT(ztest_dsl_dataset_promote_busy, 1, &zopt_rarely), ZTI_INIT(ztest_vdev_attach_detach, 1, &zopt_sometimes), ZTI_INIT(ztest_vdev_LUN_growth, 1, &zopt_rarely), ZTI_INIT(ztest_vdev_add_remove, 1, &ztest_opts.zo_vdevtime), ZTI_INIT(ztest_vdev_class_add, 1, &ztest_opts.zo_vdevtime), ZTI_INIT(ztest_vdev_aux_add_remove, 1, &ztest_opts.zo_vdevtime), ZTI_INIT(ztest_device_removal, 1, &zopt_sometimes), ZTI_INIT(ztest_spa_checkpoint_create_discard, 1, &zopt_rarely), ZTI_INIT(ztest_initialize, 1, &zopt_sometimes), ZTI_INIT(ztest_trim, 1, &zopt_sometimes), ZTI_INIT(ztest_fletcher, 1, &zopt_rarely), ZTI_INIT(ztest_fletcher_incr, 1, &zopt_rarely), ZTI_INIT(ztest_verify_dnode_bt, 1, &zopt_sometimes), }; #define ZTEST_FUNCS (sizeof (ztest_info) / sizeof (ztest_info_t)) /* * The following struct is used to hold a list of uncalled commit callbacks. * The callbacks are ordered by txg number. */ typedef struct ztest_cb_list { kmutex_t zcl_callbacks_lock; list_t zcl_callbacks; } ztest_cb_list_t; /* * Stuff we need to share writably between parent and child. 
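 *
 * This structure lives in the mmap-ed temporary file described at the top
 * of the file, so its contents survive the exec of the child process (and
 * of any alternate-version ztest run with -B).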
*/ typedef struct ztest_shared { boolean_t zs_do_init; hrtime_t zs_proc_start; hrtime_t zs_proc_stop; hrtime_t zs_thread_start; hrtime_t zs_thread_stop; hrtime_t zs_thread_kill; uint64_t zs_enospc_count; uint64_t zs_vdev_next_leaf; uint64_t zs_vdev_aux; uint64_t zs_alloc; uint64_t zs_space; uint64_t zs_splits; uint64_t zs_mirrors; uint64_t zs_metaslab_sz; uint64_t zs_metaslab_df_alloc_threshold; uint64_t zs_guid; } ztest_shared_t; #define ID_PARALLEL -1ULL static char ztest_dev_template[] = "%s/%s.%llua"; static char ztest_aux_template[] = "%s/%s.%s.%llu"; ztest_shared_t *ztest_shared; static spa_t *ztest_spa = NULL; static ztest_ds_t *ztest_ds; static kmutex_t ztest_vdev_lock; static boolean_t ztest_device_removal_active = B_FALSE; static boolean_t ztest_pool_scrubbed = B_FALSE; static kmutex_t ztest_checkpoint_lock; /* * The ztest_name_lock protects the pool and dataset namespace used by * the individual tests. To modify the namespace, consumers must grab * this lock as writer. Grabbing the lock as reader will ensure that the * namespace does not change while the lock is held. */ static pthread_rwlock_t ztest_name_lock; static boolean_t ztest_dump_core = B_TRUE; static boolean_t ztest_exiting; /* Global commit callback list */ static ztest_cb_list_t zcl; /* Commit cb delay */ static uint64_t zc_min_txg_delay = UINT64_MAX; static int zc_cb_counter = 0; /* * Minimum number of commit callbacks that need to be registered for us to check * whether the minimum txg delay is acceptable. */ #define ZTEST_COMMIT_CB_MIN_REG 100 /* * If a number of txgs equal to this threshold have been created after a commit * callback has been registered but not called, then we assume there is an * implementation bug. */ #define ZTEST_COMMIT_CB_THRESH (TXG_CONCURRENT_STATES + 1000) enum ztest_object { ZTEST_META_DNODE = 0, ZTEST_DIROBJ, ZTEST_OBJECTS }; static void usage(boolean_t) __NORETURN; static int ztest_scrub_impl(spa_t *spa); /* * These libumem hooks provide a reasonable set of defaults for the allocator's * debugging facilities. */ const char * _umem_debug_init(void) { return ("default,verbose"); /* $UMEM_DEBUG setting */ } const char * _umem_logging_init(void) { return ("fail,contents"); /* $UMEM_LOGGING setting */ } static void dump_debug_buffer(void) { ssize_t ret __attribute__((unused)); if (!ztest_opts.zo_dump_dbgmsg) return; /* * We use write() instead of printf() so that this function * is safe to call from a signal handler. */ ret = write(STDOUT_FILENO, "\n", 1); zfs_dbgmsg_print("ztest"); } #define BACKTRACE_SZ 100 static void sig_handler(int signo) { struct sigaction action; #if (__GLIBC__ && !__UCLIBC__) /* backtrace() is a GNU extension */ int nptrs; void *buffer[BACKTRACE_SZ]; nptrs = backtrace(buffer, BACKTRACE_SZ); backtrace_symbols_fd(buffer, nptrs, STDERR_FILENO); #endif dump_debug_buffer(); /* * Restore default action and re-raise signal so SIGSEGV and * SIGABRT can trigger a core dump. */ action.sa_handler = SIG_DFL; sigemptyset(&action.sa_mask); action.sa_flags = 0; (void) sigaction(signo, &action, NULL); raise(signo); } #define FATAL_MSG_SZ 1024 char *fatal_msg; static void fatal(int do_perror, char *message, ...) 
{ va_list args; int save_errno = errno; char *buf; (void) fflush(stdout); buf = umem_alloc(FATAL_MSG_SZ, UMEM_NOFAIL); va_start(args, message); (void) sprintf(buf, "ztest: "); /* LINTED */ (void) vsprintf(buf + strlen(buf), message, args); va_end(args); if (do_perror) { (void) snprintf(buf + strlen(buf), FATAL_MSG_SZ - strlen(buf), ": %s", strerror(save_errno)); } (void) fprintf(stderr, "%s\n", buf); fatal_msg = buf; /* to ease debugging */ if (ztest_dump_core) abort(); else dump_debug_buffer(); exit(3); } static int str2shift(const char *buf) { const char *ends = "BKMGTPEZ"; int i; if (buf[0] == '\0') return (0); for (i = 0; i < strlen(ends); i++) { if (toupper(buf[0]) == ends[i]) break; } if (i == strlen(ends)) { (void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n", buf); usage(B_FALSE); } if (buf[1] == '\0' || (toupper(buf[1]) == 'B' && buf[2] == '\0')) { return (10*i); } (void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n", buf); usage(B_FALSE); /* NOTREACHED */ } static uint64_t nicenumtoull(const char *buf) { char *end; uint64_t val; val = strtoull(buf, &end, 0); if (end == buf) { (void) fprintf(stderr, "ztest: bad numeric value: %s\n", buf); usage(B_FALSE); } else if (end[0] == '.') { double fval = strtod(buf, &end); fval *= pow(2, str2shift(end)); /* * UINT64_MAX is not exactly representable as a double. * The closest representation is UINT64_MAX + 1, so we * use a >= comparison instead of > for the bounds check. */ if (fval >= (double)UINT64_MAX) { (void) fprintf(stderr, "ztest: value too large: %s\n", buf); usage(B_FALSE); } val = (uint64_t)fval; } else { int shift = str2shift(end); if (shift >= 64 || (val << shift) >> shift != val) { (void) fprintf(stderr, "ztest: value too large: %s\n", buf); usage(B_FALSE); } val <<= shift; } return (val); } static void usage(boolean_t requested) { const ztest_shared_opts_t *zo = &ztest_opts_defaults; char nice_vdev_size[NN_NUMBUF_SZ]; char nice_force_ganging[NN_NUMBUF_SZ]; FILE *fp = requested ? stdout : stderr; nicenum(zo->zo_vdev_size, nice_vdev_size, sizeof (nice_vdev_size)); nicenum(zo->zo_metaslab_force_ganging, nice_force_ganging, sizeof (nice_force_ganging)); (void) fprintf(fp, "Usage: %s\n" "\t[-v vdevs (default: %llu)]\n" "\t[-s size_of_each_vdev (default: %s)]\n" "\t[-a alignment_shift (default: %d)] use 0 for random\n" "\t[-m mirror_copies (default: %d)]\n" "\t[-r raidz_disks / draid_disks (default: %d)]\n" "\t[-R raid_parity (default: %d)]\n" "\t[-K raid_kind (default: random)] raidz|draid|random\n" "\t[-D draid_data (default: %d)] in config\n" "\t[-S draid_spares (default: %d)]\n" "\t[-d datasets (default: %d)]\n" "\t[-t threads (default: %d)]\n" "\t[-g gang_block_threshold (default: %s)]\n" "\t[-i init_count (default: %d)] initialize pool i times\n" "\t[-k kill_percentage (default: %llu%%)]\n" "\t[-p pool_name (default: %s)]\n" "\t[-f dir (default: %s)] file directory for vdev files\n" "\t[-M] Multi-host simulate pool imported on remote host\n" "\t[-V] verbose (use multiple times for ever more blather)\n" "\t[-E] use existing pool instead of creating new one\n" "\t[-T time (default: %llu sec)] total run time\n" "\t[-F freezeloops (default: %llu)] max loops in spa_freeze()\n" "\t[-P passtime (default: %llu sec)] time per pass\n" "\t[-B alt_ztest (default: )] alternate ztest path\n" "\t[-C vdev class state (default: random)] special=on|off|random\n" "\t[-o variable=value] ... 
set global variable to an unsigned\n" "\t 32-bit integer value\n" "\t[-G dump zfs_dbgmsg buffer before exiting due to an error\n" "\t[-h] (print help)\n" "", zo->zo_pool, (u_longlong_t)zo->zo_vdevs, /* -v */ nice_vdev_size, /* -s */ zo->zo_ashift, /* -a */ zo->zo_mirrors, /* -m */ zo->zo_raid_children, /* -r */ zo->zo_raid_parity, /* -R */ zo->zo_draid_data, /* -D */ zo->zo_draid_spares, /* -S */ zo->zo_datasets, /* -d */ zo->zo_threads, /* -t */ nice_force_ganging, /* -g */ zo->zo_init, /* -i */ (u_longlong_t)zo->zo_killrate, /* -k */ zo->zo_pool, /* -p */ zo->zo_dir, /* -f */ (u_longlong_t)zo->zo_time, /* -T */ (u_longlong_t)zo->zo_maxloops, /* -F */ (u_longlong_t)zo->zo_passtime); exit(requested ? 0 : 1); } static uint64_t ztest_random(uint64_t range) { uint64_t r; ASSERT3S(ztest_fd_rand, >=, 0); if (range == 0) return (0); if (read(ztest_fd_rand, &r, sizeof (r)) != sizeof (r)) fatal(1, "short read from /dev/urandom"); return (r % range); } static void ztest_parse_name_value(const char *input, ztest_shared_opts_t *zo) { char name[32]; char *value; int state = ZTEST_VDEV_CLASS_RND; (void) strlcpy(name, input, sizeof (name)); value = strchr(name, '='); if (value == NULL) { (void) fprintf(stderr, "missing value in property=value " "'-C' argument (%s)\n", input); usage(B_FALSE); } *(value) = '\0'; value++; if (strcmp(value, "on") == 0) { state = ZTEST_VDEV_CLASS_ON; } else if (strcmp(value, "off") == 0) { state = ZTEST_VDEV_CLASS_OFF; } else if (strcmp(value, "random") == 0) { state = ZTEST_VDEV_CLASS_RND; } else { (void) fprintf(stderr, "invalid property value '%s'\n", value); usage(B_FALSE); } if (strcmp(name, "special") == 0) { zo->zo_special_vdevs = state; } else { (void) fprintf(stderr, "invalid property name '%s'\n", name); usage(B_FALSE); } if (zo->zo_verbose >= 3) (void) printf("%s vdev state is '%s'\n", name, value); } static void process_options(int argc, char **argv) { char *path; ztest_shared_opts_t *zo = &ztest_opts; int opt; uint64_t value; char altdir[MAXNAMELEN] = { 0 }; char raid_kind[8] = { "random" }; bcopy(&ztest_opts_defaults, zo, sizeof (*zo)); while ((opt = getopt(argc, argv, "v:s:a:m:r:R:K:D:S:d:t:g:i:k:p:f:MVET:P:hF:B:C:o:G")) != EOF) { value = 0; switch (opt) { case 'v': case 's': case 'a': case 'm': case 'r': case 'R': case 'D': case 'S': case 'd': case 't': case 'g': case 'i': case 'k': case 'T': case 'P': case 'F': value = nicenumtoull(optarg); } switch (opt) { case 'v': zo->zo_vdevs = value; break; case 's': zo->zo_vdev_size = MAX(SPA_MINDEVSIZE, value); break; case 'a': zo->zo_ashift = value; break; case 'm': zo->zo_mirrors = value; break; case 'r': zo->zo_raid_children = MAX(1, value); break; case 'R': zo->zo_raid_parity = MIN(MAX(value, 1), 3); break; case 'K': (void) strlcpy(raid_kind, optarg, sizeof (raid_kind)); break; case 'D': zo->zo_draid_data = MAX(1, value); break; case 'S': zo->zo_draid_spares = MAX(1, value); break; case 'd': zo->zo_datasets = MAX(1, value); break; case 't': zo->zo_threads = MAX(1, value); break; case 'g': zo->zo_metaslab_force_ganging = MAX(SPA_MINBLOCKSIZE << 1, value); break; case 'i': zo->zo_init = value; break; case 'k': zo->zo_killrate = value; break; case 'p': (void) strlcpy(zo->zo_pool, optarg, sizeof (zo->zo_pool)); break; case 'f': path = realpath(optarg, NULL); if (path == NULL) { (void) fprintf(stderr, "error: %s: %s\n", optarg, strerror(errno)); usage(B_FALSE); } else { (void) strlcpy(zo->zo_dir, path, sizeof (zo->zo_dir)); free(path); } break; case 'M': zo->zo_mmp_test = 1; break; case 'V': zo->zo_verbose++; break; 
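/* -E (use existing pool) is simply shorthand for an init count of zero. */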
case 'E': zo->zo_init = 0; break; case 'T': zo->zo_time = value; break; case 'P': zo->zo_passtime = MAX(1, value); break; case 'F': zo->zo_maxloops = MAX(1, value); break; case 'B': (void) strlcpy(altdir, optarg, sizeof (altdir)); break; case 'C': ztest_parse_name_value(optarg, zo); break; case 'o': - if (set_global_var(optarg) != 0) + if (zo->zo_gvars_count >= ZO_GVARS_MAX_COUNT) { + (void) fprintf(stderr, + "max global var count (%zu) exceeded\n", + ZO_GVARS_MAX_COUNT); + usage(B_FALSE); + } + char *v = zo->zo_gvars[zo->zo_gvars_count]; + if (strlcpy(v, optarg, ZO_GVARS_MAX_ARGLEN) >= + ZO_GVARS_MAX_ARGLEN) { + (void) fprintf(stderr, + "global var option '%s' is too long\n", + optarg); usage(B_FALSE); + } + zo->zo_gvars_count++; break; case 'G': zo->zo_dump_dbgmsg = 1; break; case 'h': usage(B_TRUE); break; case '?': default: usage(B_FALSE); break; } } /* When raid choice is 'random' add a draid pool 50% of the time */ if (strcmp(raid_kind, "random") == 0) { (void) strlcpy(raid_kind, (ztest_random(2) == 0) ? "draid" : "raidz", sizeof (raid_kind)); if (ztest_opts.zo_verbose >= 3) (void) printf("choosing RAID type '%s'\n", raid_kind); } if (strcmp(raid_kind, "draid") == 0) { uint64_t min_devsize; /* With fewer disk use 256M, otherwise 128M is OK */ min_devsize = (ztest_opts.zo_raid_children < 16) ? (256ULL << 20) : (128ULL << 20); /* No top-level mirrors with dRAID for now */ zo->zo_mirrors = 0; /* Use more appropriate defaults for dRAID */ if (zo->zo_vdevs == ztest_opts_defaults.zo_vdevs) zo->zo_vdevs = 1; if (zo->zo_raid_children == ztest_opts_defaults.zo_raid_children) zo->zo_raid_children = 16; if (zo->zo_ashift < 12) zo->zo_ashift = 12; if (zo->zo_vdev_size < min_devsize) zo->zo_vdev_size = min_devsize; if (zo->zo_draid_data + zo->zo_raid_parity > zo->zo_raid_children - zo->zo_draid_spares) { (void) fprintf(stderr, "error: too few draid " "children (%d) for stripe width (%d)\n", zo->zo_raid_children, zo->zo_draid_data + zo->zo_raid_parity); usage(B_FALSE); } (void) strlcpy(zo->zo_raid_type, VDEV_TYPE_DRAID, sizeof (zo->zo_raid_type)); } else /* using raidz */ { ASSERT0(strcmp(raid_kind, "raidz")); zo->zo_raid_parity = MIN(zo->zo_raid_parity, zo->zo_raid_children - 1); } zo->zo_vdevtime = (zo->zo_vdevs > 0 ? zo->zo_time * NANOSEC / zo->zo_vdevs : UINT64_MAX >> 2); if (strlen(altdir) > 0) { char *cmd; char *realaltdir; char *bin; char *ztest; char *isa; int isalen; cmd = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); realaltdir = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); VERIFY3P(NULL, !=, realpath(getexecname(), cmd)); if (0 != access(altdir, F_OK)) { ztest_dump_core = B_FALSE; fatal(B_TRUE, "invalid alternate ztest path: %s", altdir); } VERIFY3P(NULL, !=, realpath(altdir, realaltdir)); /* * 'cmd' should be of the form "/usr/bin//ztest". * We want to extract to determine if we should use * 32 or 64 bit binaries. 
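 *
 * Concretely: whatever path component sits between "/usr/bin/" and
 * "/ztest" is taken as the ISA directory (isa/isalen below) and is reused
 * to build both the alternate ztest path and the alternate library path.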
*/ bin = strstr(cmd, "/usr/bin/"); ztest = strstr(bin, "/ztest"); isa = bin + 9; isalen = ztest - isa; (void) snprintf(zo->zo_alt_ztest, sizeof (zo->zo_alt_ztest), "%s/usr/bin/%.*s/ztest", realaltdir, isalen, isa); (void) snprintf(zo->zo_alt_libpath, sizeof (zo->zo_alt_libpath), "%s/usr/lib/%.*s", realaltdir, isalen, isa); if (0 != access(zo->zo_alt_ztest, X_OK)) { ztest_dump_core = B_FALSE; fatal(B_TRUE, "invalid alternate ztest: %s", zo->zo_alt_ztest); } else if (0 != access(zo->zo_alt_libpath, X_OK)) { ztest_dump_core = B_FALSE; fatal(B_TRUE, "invalid alternate lib directory %s", zo->zo_alt_libpath); } umem_free(cmd, MAXPATHLEN); umem_free(realaltdir, MAXPATHLEN); } } static void ztest_kill(ztest_shared_t *zs) { zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(ztest_spa)); zs->zs_space = metaslab_class_get_space(spa_normal_class(ztest_spa)); /* * Before we kill off ztest, make sure that the config is updated. * See comment above spa_write_cachefile(). */ mutex_enter(&spa_namespace_lock); spa_write_cachefile(ztest_spa, B_FALSE, B_FALSE); mutex_exit(&spa_namespace_lock); (void) kill(getpid(), SIGKILL); } /* ARGSUSED */ static void ztest_record_enospc(const char *s) { ztest_shared->zs_enospc_count++; } static uint64_t ztest_get_ashift(void) { if (ztest_opts.zo_ashift == 0) return (SPA_MINBLOCKSHIFT + ztest_random(5)); return (ztest_opts.zo_ashift); } static boolean_t ztest_is_draid_spare(const char *name) { uint64_t spare_id = 0, parity = 0, vdev_id = 0; if (sscanf(name, VDEV_TYPE_DRAID "%llu-%llu-%llu", (u_longlong_t *)&parity, (u_longlong_t *)&vdev_id, (u_longlong_t *)&spare_id) == 3) { return (B_TRUE); } return (B_FALSE); } static nvlist_t * make_vdev_file(char *path, char *aux, char *pool, size_t size, uint64_t ashift) { char *pathbuf; uint64_t vdev; nvlist_t *file; boolean_t draid_spare = B_FALSE; pathbuf = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); if (ashift == 0) ashift = ztest_get_ashift(); if (path == NULL) { path = pathbuf; if (aux != NULL) { vdev = ztest_shared->zs_vdev_aux; (void) snprintf(path, MAXPATHLEN, ztest_aux_template, ztest_opts.zo_dir, pool == NULL ? ztest_opts.zo_pool : pool, aux, vdev); } else { vdev = ztest_shared->zs_vdev_next_leaf++; (void) snprintf(path, MAXPATHLEN, ztest_dev_template, ztest_opts.zo_dir, pool == NULL ? ztest_opts.zo_pool : pool, vdev); } } else { draid_spare = ztest_is_draid_spare(path); } if (size != 0 && !draid_spare) { int fd = open(path, O_RDWR | O_CREAT | O_TRUNC, 0666); if (fd == -1) fatal(1, "can't open %s", path); if (ftruncate(fd, size) != 0) fatal(1, "can't ftruncate %s", path); (void) close(fd); } file = fnvlist_alloc(); fnvlist_add_string(file, ZPOOL_CONFIG_TYPE, draid_spare ? 
VDEV_TYPE_DRAID_SPARE : VDEV_TYPE_FILE); fnvlist_add_string(file, ZPOOL_CONFIG_PATH, path); fnvlist_add_uint64(file, ZPOOL_CONFIG_ASHIFT, ashift); umem_free(pathbuf, MAXPATHLEN); return (file); } static nvlist_t * make_vdev_raid(char *path, char *aux, char *pool, size_t size, uint64_t ashift, int r) { nvlist_t *raid, **child; int c; if (r < 2) return (make_vdev_file(path, aux, pool, size, ashift)); child = umem_alloc(r * sizeof (nvlist_t *), UMEM_NOFAIL); for (c = 0; c < r; c++) child[c] = make_vdev_file(path, aux, pool, size, ashift); raid = fnvlist_alloc(); fnvlist_add_string(raid, ZPOOL_CONFIG_TYPE, ztest_opts.zo_raid_type); fnvlist_add_uint64(raid, ZPOOL_CONFIG_NPARITY, ztest_opts.zo_raid_parity); fnvlist_add_nvlist_array(raid, ZPOOL_CONFIG_CHILDREN, child, r); if (strcmp(ztest_opts.zo_raid_type, VDEV_TYPE_DRAID) == 0) { uint64_t ndata = ztest_opts.zo_draid_data; uint64_t nparity = ztest_opts.zo_raid_parity; uint64_t nspares = ztest_opts.zo_draid_spares; uint64_t children = ztest_opts.zo_raid_children; uint64_t ngroups = 1; /* * Calculate the minimum number of groups required to fill a * slice. This is the LCM of the stripe width (data + parity) * and the number of data drives (children - spares). */ while (ngroups * (ndata + nparity) % (children - nspares) != 0) ngroups++; /* Store the basic dRAID configuration. */ fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NDATA, ndata); fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NSPARES, nspares); fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NGROUPS, ngroups); } for (c = 0; c < r; c++) fnvlist_free(child[c]); umem_free(child, r * sizeof (nvlist_t *)); return (raid); } static nvlist_t * make_vdev_mirror(char *path, char *aux, char *pool, size_t size, uint64_t ashift, int r, int m) { nvlist_t *mirror, **child; int c; if (m < 1) return (make_vdev_raid(path, aux, pool, size, ashift, r)); child = umem_alloc(m * sizeof (nvlist_t *), UMEM_NOFAIL); for (c = 0; c < m; c++) child[c] = make_vdev_raid(path, aux, pool, size, ashift, r); mirror = fnvlist_alloc(); fnvlist_add_string(mirror, ZPOOL_CONFIG_TYPE, VDEV_TYPE_MIRROR); fnvlist_add_nvlist_array(mirror, ZPOOL_CONFIG_CHILDREN, child, m); for (c = 0; c < m; c++) fnvlist_free(child[c]); umem_free(child, m * sizeof (nvlist_t *)); return (mirror); } static nvlist_t * make_vdev_root(char *path, char *aux, char *pool, size_t size, uint64_t ashift, const char *class, int r, int m, int t) { nvlist_t *root, **child; int c; boolean_t log; ASSERT3S(t, >, 0); log = (class != NULL && strcmp(class, "log") == 0); child = umem_alloc(t * sizeof (nvlist_t *), UMEM_NOFAIL); for (c = 0; c < t; c++) { child[c] = make_vdev_mirror(path, aux, pool, size, ashift, r, m); fnvlist_add_uint64(child[c], ZPOOL_CONFIG_IS_LOG, log); if (class != NULL && class[0] != '\0') { ASSERT(m > 1 || log); /* expecting a mirror */ fnvlist_add_string(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS, class); } } root = fnvlist_alloc(); fnvlist_add_string(root, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT); fnvlist_add_nvlist_array(root, aux ? aux : ZPOOL_CONFIG_CHILDREN, child, t); for (c = 0; c < t; c++) fnvlist_free(child[c]); umem_free(child, t * sizeof (nvlist_t *)); return (root); } /* * Find a random spa version. Returns back a random spa version in the * range [initial_version, SPA_VERSION_FEATURES]. 
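 * Any result newer than SPA_VERSION_BEFORE_FEATURES is reported as
 * SPA_VERSION_FEATURES, since feature-flag pools no longer advance the
 * numeric SPA version.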
*/ static uint64_t ztest_random_spa_version(uint64_t initial_version) { uint64_t version = initial_version; if (version <= SPA_VERSION_BEFORE_FEATURES) { version = version + ztest_random(SPA_VERSION_BEFORE_FEATURES - version + 1); } if (version > SPA_VERSION_BEFORE_FEATURES) version = SPA_VERSION_FEATURES; ASSERT(SPA_VERSION_IS_SUPPORTED(version)); return (version); } static int ztest_random_blocksize(void) { ASSERT3U(ztest_spa->spa_max_ashift, !=, 0); /* * Choose a block size >= the ashift. * If the SPA supports new MAXBLOCKSIZE, test up to 1MB blocks. */ int maxbs = SPA_OLD_MAXBLOCKSHIFT; if (spa_maxblocksize(ztest_spa) == SPA_MAXBLOCKSIZE) maxbs = 20; uint64_t block_shift = ztest_random(maxbs - ztest_spa->spa_max_ashift + 1); return (1 << (SPA_MINBLOCKSHIFT + block_shift)); } static int ztest_random_dnodesize(void) { int slots; int max_slots = spa_maxdnodesize(ztest_spa) >> DNODE_SHIFT; if (max_slots == DNODE_MIN_SLOTS) return (DNODE_MIN_SIZE); /* * Weight the random distribution more heavily toward smaller * dnode sizes since that is more likely to reflect real-world * usage. */ ASSERT3U(max_slots, >, 4); switch (ztest_random(10)) { case 0: slots = 5 + ztest_random(max_slots - 4); break; case 1 ... 4: slots = 2 + ztest_random(3); break; default: slots = 1; break; } return (slots << DNODE_SHIFT); } static int ztest_random_ibshift(void) { return (DN_MIN_INDBLKSHIFT + ztest_random(DN_MAX_INDBLKSHIFT - DN_MIN_INDBLKSHIFT + 1)); } static uint64_t ztest_random_vdev_top(spa_t *spa, boolean_t log_ok) { uint64_t top; vdev_t *rvd = spa->spa_root_vdev; vdev_t *tvd; ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); do { top = ztest_random(rvd->vdev_children); tvd = rvd->vdev_child[top]; } while (!vdev_is_concrete(tvd) || (tvd->vdev_islog && !log_ok) || tvd->vdev_mg == NULL || tvd->vdev_mg->mg_class == NULL); return (top); } static uint64_t ztest_random_dsl_prop(zfs_prop_t prop) { uint64_t value; do { value = zfs_prop_random_value(prop, ztest_random(-1ULL)); } while (prop == ZFS_PROP_CHECKSUM && value == ZIO_CHECKSUM_OFF); return (value); } static int ztest_dsl_prop_set_uint64(char *osname, zfs_prop_t prop, uint64_t value, boolean_t inherit) { const char *propname = zfs_prop_to_name(prop); const char *valname; char *setpoint; uint64_t curval; int error; error = dsl_prop_set_int(osname, propname, (inherit ? 
ZPROP_SRC_NONE : ZPROP_SRC_LOCAL), value); if (error == ENOSPC) { ztest_record_enospc(FTAG); return (error); } ASSERT0(error); setpoint = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); VERIFY0(dsl_prop_get_integer(osname, propname, &curval, setpoint)); if (ztest_opts.zo_verbose >= 6) { int err; err = zfs_prop_index_to_string(prop, curval, &valname); if (err) (void) printf("%s %s = %llu at '%s'\n", osname, propname, (unsigned long long)curval, setpoint); else (void) printf("%s %s = %s at '%s'\n", osname, propname, valname, setpoint); } umem_free(setpoint, MAXPATHLEN); return (error); } static int ztest_spa_prop_set_uint64(zpool_prop_t prop, uint64_t value) { spa_t *spa = ztest_spa; nvlist_t *props = NULL; int error; props = fnvlist_alloc(); fnvlist_add_uint64(props, zpool_prop_to_name(prop), value); error = spa_prop_set(spa, props); fnvlist_free(props); if (error == ENOSPC) { ztest_record_enospc(FTAG); return (error); } ASSERT0(error); return (error); } static int ztest_dmu_objset_own(const char *name, dmu_objset_type_t type, boolean_t readonly, boolean_t decrypt, void *tag, objset_t **osp) { int err; char *cp = NULL; char ddname[ZFS_MAX_DATASET_NAME_LEN]; strcpy(ddname, name); cp = strchr(ddname, '@'); if (cp != NULL) *cp = '\0'; err = dmu_objset_own(name, type, readonly, decrypt, tag, osp); while (decrypt && err == EACCES) { dsl_crypto_params_t *dcp; nvlist_t *crypto_args = fnvlist_alloc(); fnvlist_add_uint8_array(crypto_args, "wkeydata", (uint8_t *)ztest_wkeydata, WRAPPING_KEY_LEN); VERIFY0(dsl_crypto_params_create_nvlist(DCP_CMD_NONE, NULL, crypto_args, &dcp)); err = spa_keystore_load_wkey(ddname, dcp, B_FALSE); /* * Note: if there was an error loading, the wkey was not * consumed, and needs to be freed. */ dsl_crypto_params_free(dcp, (err != 0)); fnvlist_free(crypto_args); if (err == EINVAL) { /* * We couldn't load a key for this dataset so try * the parent. This loop will eventually hit the * encryption root since ztest only makes clones * as children of their origin datasets. 
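 * Each retry strips the last "/component" from ddname (for example
 * pool/ds/clone becomes pool/ds) and attempts the wrapping-key load one
 * level closer to the encryption root.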
*/ cp = strrchr(ddname, '/'); if (cp == NULL) return (err); *cp = '\0'; err = EACCES; continue; } else if (err != 0) { break; } err = dmu_objset_own(name, type, readonly, decrypt, tag, osp); break; } return (err); } static void ztest_rll_init(rll_t *rll) { rll->rll_writer = NULL; rll->rll_readers = 0; mutex_init(&rll->rll_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&rll->rll_cv, NULL, CV_DEFAULT, NULL); } static void ztest_rll_destroy(rll_t *rll) { ASSERT3P(rll->rll_writer, ==, NULL); ASSERT0(rll->rll_readers); mutex_destroy(&rll->rll_lock); cv_destroy(&rll->rll_cv); } static void ztest_rll_lock(rll_t *rll, rl_type_t type) { mutex_enter(&rll->rll_lock); if (type == RL_READER) { while (rll->rll_writer != NULL) (void) cv_wait(&rll->rll_cv, &rll->rll_lock); rll->rll_readers++; } else { while (rll->rll_writer != NULL || rll->rll_readers) (void) cv_wait(&rll->rll_cv, &rll->rll_lock); rll->rll_writer = curthread; } mutex_exit(&rll->rll_lock); } static void ztest_rll_unlock(rll_t *rll) { mutex_enter(&rll->rll_lock); if (rll->rll_writer) { ASSERT0(rll->rll_readers); rll->rll_writer = NULL; } else { ASSERT3S(rll->rll_readers, >, 0); ASSERT3P(rll->rll_writer, ==, NULL); rll->rll_readers--; } if (rll->rll_writer == NULL && rll->rll_readers == 0) cv_broadcast(&rll->rll_cv); mutex_exit(&rll->rll_lock); } static void ztest_object_lock(ztest_ds_t *zd, uint64_t object, rl_type_t type) { rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)]; ztest_rll_lock(rll, type); } static void ztest_object_unlock(ztest_ds_t *zd, uint64_t object) { rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)]; ztest_rll_unlock(rll); } static rl_t * ztest_range_lock(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size, rl_type_t type) { uint64_t hash = object ^ (offset % (ZTEST_RANGE_LOCKS + 1)); rll_t *rll = &zd->zd_range_lock[hash & (ZTEST_RANGE_LOCKS - 1)]; rl_t *rl; rl = umem_alloc(sizeof (*rl), UMEM_NOFAIL); rl->rl_object = object; rl->rl_offset = offset; rl->rl_size = size; rl->rl_lock = rll; ztest_rll_lock(rll, type); return (rl); } static void ztest_range_unlock(rl_t *rl) { rll_t *rll = rl->rl_lock; ztest_rll_unlock(rll); umem_free(rl, sizeof (*rl)); } static void ztest_zd_init(ztest_ds_t *zd, ztest_shared_ds_t *szd, objset_t *os) { zd->zd_os = os; zd->zd_zilog = dmu_objset_zil(os); zd->zd_shared = szd; dmu_objset_name(os, zd->zd_name); int l; if (zd->zd_shared != NULL) zd->zd_shared->zd_seq = 0; VERIFY0(pthread_rwlock_init(&zd->zd_zilog_lock, NULL)); mutex_init(&zd->zd_dirobj_lock, NULL, MUTEX_DEFAULT, NULL); for (l = 0; l < ZTEST_OBJECT_LOCKS; l++) ztest_rll_init(&zd->zd_object_lock[l]); for (l = 0; l < ZTEST_RANGE_LOCKS; l++) ztest_rll_init(&zd->zd_range_lock[l]); } static void ztest_zd_fini(ztest_ds_t *zd) { int l; mutex_destroy(&zd->zd_dirobj_lock); (void) pthread_rwlock_destroy(&zd->zd_zilog_lock); for (l = 0; l < ZTEST_OBJECT_LOCKS; l++) ztest_rll_destroy(&zd->zd_object_lock[l]); for (l = 0; l < ZTEST_RANGE_LOCKS; l++) ztest_rll_destroy(&zd->zd_range_lock[l]); } #define TXG_MIGHTWAIT (ztest_random(10) == 0 ? TXG_NOWAIT : TXG_WAIT) static uint64_t ztest_tx_assign(dmu_tx_t *tx, uint64_t txg_how, const char *tag) { uint64_t txg; int error; /* * Attempt to assign tx to some transaction group. 
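 * On ERESTART (only possible with TXG_NOWAIT) we wait for the next open
 * txg before returning 0; on ENOSPC we record the shortfall and return
 * 0.  In both failure cases the tx is aborted, so the caller simply
 * bails out or retries.  On success the assigned txg is returned.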
*/ error = dmu_tx_assign(tx, txg_how); if (error) { if (error == ERESTART) { ASSERT3U(txg_how, ==, TXG_NOWAIT); dmu_tx_wait(tx); } else { ASSERT3U(error, ==, ENOSPC); ztest_record_enospc(tag); } dmu_tx_abort(tx); return (0); } txg = dmu_tx_get_txg(tx); ASSERT3U(txg, !=, 0); return (txg); } static void ztest_bt_generate(ztest_block_tag_t *bt, objset_t *os, uint64_t object, uint64_t dnodesize, uint64_t offset, uint64_t gen, uint64_t txg, uint64_t crtxg) { bt->bt_magic = BT_MAGIC; bt->bt_objset = dmu_objset_id(os); bt->bt_object = object; bt->bt_dnodesize = dnodesize; bt->bt_offset = offset; bt->bt_gen = gen; bt->bt_txg = txg; bt->bt_crtxg = crtxg; } static void ztest_bt_verify(ztest_block_tag_t *bt, objset_t *os, uint64_t object, uint64_t dnodesize, uint64_t offset, uint64_t gen, uint64_t txg, uint64_t crtxg) { ASSERT3U(bt->bt_magic, ==, BT_MAGIC); ASSERT3U(bt->bt_objset, ==, dmu_objset_id(os)); ASSERT3U(bt->bt_object, ==, object); ASSERT3U(bt->bt_dnodesize, ==, dnodesize); ASSERT3U(bt->bt_offset, ==, offset); ASSERT3U(bt->bt_gen, <=, gen); ASSERT3U(bt->bt_txg, <=, txg); ASSERT3U(bt->bt_crtxg, ==, crtxg); } static ztest_block_tag_t * ztest_bt_bonus(dmu_buf_t *db) { dmu_object_info_t doi; ztest_block_tag_t *bt; dmu_object_info_from_db(db, &doi); ASSERT3U(doi.doi_bonus_size, <=, db->db_size); ASSERT3U(doi.doi_bonus_size, >=, sizeof (*bt)); bt = (void *)((char *)db->db_data + doi.doi_bonus_size - sizeof (*bt)); return (bt); } /* * Generate a token to fill up unused bonus buffer space. Try to make * it unique to the object, generation, and offset to verify that data * is not getting overwritten by data from other dnodes. */ #define ZTEST_BONUS_FILL_TOKEN(obj, ds, gen, offset) \ (((ds) << 48) | ((gen) << 32) | ((obj) << 8) | (offset)) /* * Fill up the unused bonus buffer region before the block tag with a * verifiable pattern. Filling the whole bonus area with non-zero data * helps ensure that all dnode traversal code properly skips the * interior regions of large dnodes. */ static void ztest_fill_unused_bonus(dmu_buf_t *db, void *end, uint64_t obj, objset_t *os, uint64_t gen) { uint64_t *bonusp; ASSERT(IS_P2ALIGNED((char *)end - (char *)db->db_data, 8)); for (bonusp = db->db_data; bonusp < (uint64_t *)end; bonusp++) { uint64_t token = ZTEST_BONUS_FILL_TOKEN(obj, dmu_objset_id(os), gen, bonusp - (uint64_t *)db->db_data); *bonusp = token; } } /* * Verify that the unused area of a bonus buffer is filled with the * expected tokens. 
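 * Each token is the value produced by ZTEST_BONUS_FILL_TOKEN() above,
 * nominally packing the objset id into bits 48 and up, the generation
 * into bits 32-47, the object number into bits 8-31, and the word
 * offset into bits 0-7, so data bleeding in from another dnode shows up
 * as an immediate mismatch.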
*/ static void ztest_verify_unused_bonus(dmu_buf_t *db, void *end, uint64_t obj, objset_t *os, uint64_t gen) { uint64_t *bonusp; for (bonusp = db->db_data; bonusp < (uint64_t *)end; bonusp++) { uint64_t token = ZTEST_BONUS_FILL_TOKEN(obj, dmu_objset_id(os), gen, bonusp - (uint64_t *)db->db_data); VERIFY3U(*bonusp, ==, token); } } /* * ZIL logging ops */ #define lrz_type lr_mode #define lrz_blocksize lr_uid #define lrz_ibshift lr_gid #define lrz_bonustype lr_rdev #define lrz_dnodesize lr_crtime[1] static void ztest_log_create(ztest_ds_t *zd, dmu_tx_t *tx, lr_create_t *lr) { char *name = (void *)(lr + 1); /* name follows lr */ size_t namesize = strlen(name) + 1; itx_t *itx; if (zil_replaying(zd->zd_zilog, tx)) return; itx = zil_itx_create(TX_CREATE, sizeof (*lr) + namesize); bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, sizeof (*lr) + namesize - sizeof (lr_t)); zil_itx_assign(zd->zd_zilog, itx, tx); } static void ztest_log_remove(ztest_ds_t *zd, dmu_tx_t *tx, lr_remove_t *lr, uint64_t object) { char *name = (void *)(lr + 1); /* name follows lr */ size_t namesize = strlen(name) + 1; itx_t *itx; if (zil_replaying(zd->zd_zilog, tx)) return; itx = zil_itx_create(TX_REMOVE, sizeof (*lr) + namesize); bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, sizeof (*lr) + namesize - sizeof (lr_t)); itx->itx_oid = object; zil_itx_assign(zd->zd_zilog, itx, tx); } static void ztest_log_write(ztest_ds_t *zd, dmu_tx_t *tx, lr_write_t *lr) { itx_t *itx; itx_wr_state_t write_state = ztest_random(WR_NUM_STATES); if (zil_replaying(zd->zd_zilog, tx)) return; if (lr->lr_length > zil_max_log_data(zd->zd_zilog)) write_state = WR_INDIRECT; itx = zil_itx_create(TX_WRITE, sizeof (*lr) + (write_state == WR_COPIED ? lr->lr_length : 0)); if (write_state == WR_COPIED && dmu_read(zd->zd_os, lr->lr_foid, lr->lr_offset, lr->lr_length, ((lr_write_t *)&itx->itx_lr) + 1, DMU_READ_NO_PREFETCH) != 0) { zil_itx_destroy(itx); itx = zil_itx_create(TX_WRITE, sizeof (*lr)); write_state = WR_NEED_COPY; } itx->itx_private = zd; itx->itx_wr_state = write_state; itx->itx_sync = (ztest_random(8) == 0); bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, sizeof (*lr) - sizeof (lr_t)); zil_itx_assign(zd->zd_zilog, itx, tx); } static void ztest_log_truncate(ztest_ds_t *zd, dmu_tx_t *tx, lr_truncate_t *lr) { itx_t *itx; if (zil_replaying(zd->zd_zilog, tx)) return; itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr)); bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, sizeof (*lr) - sizeof (lr_t)); itx->itx_sync = B_FALSE; zil_itx_assign(zd->zd_zilog, itx, tx); } static void ztest_log_setattr(ztest_ds_t *zd, dmu_tx_t *tx, lr_setattr_t *lr) { itx_t *itx; if (zil_replaying(zd->zd_zilog, tx)) return; itx = zil_itx_create(TX_SETATTR, sizeof (*lr)); bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, sizeof (*lr) - sizeof (lr_t)); itx->itx_sync = B_FALSE; zil_itx_assign(zd->zd_zilog, itx, tx); } /* * ZIL replay ops */ static int ztest_replay_create(void *arg1, void *arg2, boolean_t byteswap) { ztest_ds_t *zd = arg1; lr_create_t *lr = arg2; char *name = (void *)(lr + 1); /* name follows lr */ objset_t *os = zd->zd_os; ztest_block_tag_t *bbt; dmu_buf_t *db; dmu_tx_t *tx; uint64_t txg; int error = 0; int bonuslen; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); ASSERT3U(lr->lr_doid, ==, ZTEST_DIROBJ); ASSERT3S(name[0], !=, '\0'); tx = dmu_tx_create(os); dmu_tx_hold_zap(tx, lr->lr_doid, B_TRUE, name); if (lr->lrz_type == DMU_OT_ZAP_OTHER) { dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); } else { dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); } txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); if 
(txg == 0) return (ENOSPC); ASSERT3U(dmu_objset_zil(os)->zl_replay, ==, !!lr->lr_foid); bonuslen = DN_BONUS_SIZE(lr->lrz_dnodesize); if (lr->lrz_type == DMU_OT_ZAP_OTHER) { if (lr->lr_foid == 0) { lr->lr_foid = zap_create_dnsize(os, lr->lrz_type, lr->lrz_bonustype, bonuslen, lr->lrz_dnodesize, tx); } else { error = zap_create_claim_dnsize(os, lr->lr_foid, lr->lrz_type, lr->lrz_bonustype, bonuslen, lr->lrz_dnodesize, tx); } } else { if (lr->lr_foid == 0) { lr->lr_foid = dmu_object_alloc_dnsize(os, lr->lrz_type, 0, lr->lrz_bonustype, bonuslen, lr->lrz_dnodesize, tx); } else { error = dmu_object_claim_dnsize(os, lr->lr_foid, lr->lrz_type, 0, lr->lrz_bonustype, bonuslen, lr->lrz_dnodesize, tx); } } if (error) { ASSERT3U(error, ==, EEXIST); ASSERT(zd->zd_zilog->zl_replay); dmu_tx_commit(tx); return (error); } ASSERT3U(lr->lr_foid, !=, 0); if (lr->lrz_type != DMU_OT_ZAP_OTHER) VERIFY0(dmu_object_set_blocksize(os, lr->lr_foid, lr->lrz_blocksize, lr->lrz_ibshift, tx)); VERIFY0(dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); bbt = ztest_bt_bonus(db); dmu_buf_will_dirty(db, tx); ztest_bt_generate(bbt, os, lr->lr_foid, lr->lrz_dnodesize, -1ULL, lr->lr_gen, txg, txg); ztest_fill_unused_bonus(db, bbt, lr->lr_foid, os, lr->lr_gen); dmu_buf_rele(db, FTAG); VERIFY0(zap_add(os, lr->lr_doid, name, sizeof (uint64_t), 1, &lr->lr_foid, tx)); (void) ztest_log_create(zd, tx, lr); dmu_tx_commit(tx); return (0); } static int ztest_replay_remove(void *arg1, void *arg2, boolean_t byteswap) { ztest_ds_t *zd = arg1; lr_remove_t *lr = arg2; char *name = (void *)(lr + 1); /* name follows lr */ objset_t *os = zd->zd_os; dmu_object_info_t doi; dmu_tx_t *tx; uint64_t object, txg; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); ASSERT3U(lr->lr_doid, ==, ZTEST_DIROBJ); ASSERT3S(name[0], !=, '\0'); VERIFY0( zap_lookup(os, lr->lr_doid, name, sizeof (object), 1, &object)); ASSERT3U(object, !=, 0); ztest_object_lock(zd, object, RL_WRITER); VERIFY0(dmu_object_info(os, object, &doi)); tx = dmu_tx_create(os); dmu_tx_hold_zap(tx, lr->lr_doid, B_FALSE, name); dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END); txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); if (txg == 0) { ztest_object_unlock(zd, object); return (ENOSPC); } if (doi.doi_type == DMU_OT_ZAP_OTHER) { VERIFY0(zap_destroy(os, object, tx)); } else { VERIFY0(dmu_object_free(os, object, tx)); } VERIFY0(zap_remove(os, lr->lr_doid, name, tx)); (void) ztest_log_remove(zd, tx, lr, object); dmu_tx_commit(tx); ztest_object_unlock(zd, object); return (0); } static int ztest_replay_write(void *arg1, void *arg2, boolean_t byteswap) { ztest_ds_t *zd = arg1; lr_write_t *lr = arg2; objset_t *os = zd->zd_os; void *data = lr + 1; /* data follows lr */ uint64_t offset, length; ztest_block_tag_t *bt = data; ztest_block_tag_t *bbt; uint64_t gen, txg, lrtxg, crtxg; dmu_object_info_t doi; dmu_tx_t *tx; dmu_buf_t *db; arc_buf_t *abuf = NULL; rl_t *rl; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); offset = lr->lr_offset; length = lr->lr_length; /* If it's a dmu_sync() block, write the whole block */ if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr); if (length < blocksize) { offset -= offset % blocksize; length = blocksize; } } if (bt->bt_magic == BSWAP_64(BT_MAGIC)) byteswap_uint64_array(bt, sizeof (*bt)); if (bt->bt_magic != BT_MAGIC) bt = NULL; ztest_object_lock(zd, lr->lr_foid, RL_READER); rl = ztest_range_lock(zd, lr->lr_foid, offset, length, RL_WRITER); VERIFY0(dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); 
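/*
 * The bonus buffer's block tag supplies the object's generation and
 * creation txg; they seed the verification and regeneration of the
 * data block tag below.
 */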
dmu_object_info_from_db(db, &doi); bbt = ztest_bt_bonus(db); ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); gen = bbt->bt_gen; crtxg = bbt->bt_crtxg; lrtxg = lr->lr_common.lrc_txg; tx = dmu_tx_create(os); dmu_tx_hold_write(tx, lr->lr_foid, offset, length); if (ztest_random(8) == 0 && length == doi.doi_data_block_size && P2PHASE(offset, length) == 0) abuf = dmu_request_arcbuf(db, length); txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); if (txg == 0) { if (abuf != NULL) dmu_return_arcbuf(abuf); dmu_buf_rele(db, FTAG); ztest_range_unlock(rl); ztest_object_unlock(zd, lr->lr_foid); return (ENOSPC); } if (bt != NULL) { /* * Usually, verify the old data before writing new data -- * but not always, because we also want to verify correct * behavior when the data was not recently read into cache. */ ASSERT0(offset % doi.doi_data_block_size); if (ztest_random(4) != 0) { int prefetch = ztest_random(2) ? DMU_READ_PREFETCH : DMU_READ_NO_PREFETCH; ztest_block_tag_t rbt; VERIFY(dmu_read(os, lr->lr_foid, offset, sizeof (rbt), &rbt, prefetch) == 0); if (rbt.bt_magic == BT_MAGIC) { ztest_bt_verify(&rbt, os, lr->lr_foid, 0, offset, gen, txg, crtxg); } } /* * Writes can appear to be newer than the bonus buffer because * the ztest_get_data() callback does a dmu_read() of the * open-context data, which may be different than the data * as it was when the write was generated. */ if (zd->zd_zilog->zl_replay) { ztest_bt_verify(bt, os, lr->lr_foid, 0, offset, MAX(gen, bt->bt_gen), MAX(txg, lrtxg), bt->bt_crtxg); } /* * Set the bt's gen/txg to the bonus buffer's gen/txg * so that all of the usual ASSERTs will work. */ ztest_bt_generate(bt, os, lr->lr_foid, 0, offset, gen, txg, crtxg); } if (abuf == NULL) { dmu_write(os, lr->lr_foid, offset, length, data, tx); } else { bcopy(data, abuf->b_data, length); dmu_assign_arcbuf_by_dbuf(db, offset, abuf, tx); } (void) ztest_log_write(zd, tx, lr); dmu_buf_rele(db, FTAG); dmu_tx_commit(tx); ztest_range_unlock(rl); ztest_object_unlock(zd, lr->lr_foid); return (0); } static int ztest_replay_truncate(void *arg1, void *arg2, boolean_t byteswap) { ztest_ds_t *zd = arg1; lr_truncate_t *lr = arg2; objset_t *os = zd->zd_os; dmu_tx_t *tx; uint64_t txg; rl_t *rl; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); ztest_object_lock(zd, lr->lr_foid, RL_READER); rl = ztest_range_lock(zd, lr->lr_foid, lr->lr_offset, lr->lr_length, RL_WRITER); tx = dmu_tx_create(os); dmu_tx_hold_free(tx, lr->lr_foid, lr->lr_offset, lr->lr_length); txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); if (txg == 0) { ztest_range_unlock(rl); ztest_object_unlock(zd, lr->lr_foid); return (ENOSPC); } VERIFY0(dmu_free_range(os, lr->lr_foid, lr->lr_offset, lr->lr_length, tx)); (void) ztest_log_truncate(zd, tx, lr); dmu_tx_commit(tx); ztest_range_unlock(rl); ztest_object_unlock(zd, lr->lr_foid); return (0); } static int ztest_replay_setattr(void *arg1, void *arg2, boolean_t byteswap) { ztest_ds_t *zd = arg1; lr_setattr_t *lr = arg2; objset_t *os = zd->zd_os; dmu_tx_t *tx; dmu_buf_t *db; ztest_block_tag_t *bbt; uint64_t txg, lrtxg, crtxg, dnodesize; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); ztest_object_lock(zd, lr->lr_foid, RL_WRITER); VERIFY0(dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); tx = dmu_tx_create(os); dmu_tx_hold_bonus(tx, lr->lr_foid); txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); if (txg == 0) { dmu_buf_rele(db, FTAG); ztest_object_unlock(zd, lr->lr_foid); return (ENOSPC); } bbt = ztest_bt_bonus(db); ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); crtxg = bbt->bt_crtxg; lrtxg = lr->lr_common.lrc_txg; dnodesize = 
bbt->bt_dnodesize; if (zd->zd_zilog->zl_replay) { ASSERT3U(lr->lr_size, !=, 0); ASSERT3U(lr->lr_mode, !=, 0); ASSERT3U(lrtxg, !=, 0); } else { /* * Randomly change the size and increment the generation. */ lr->lr_size = (ztest_random(db->db_size / sizeof (*bbt)) + 1) * sizeof (*bbt); lr->lr_mode = bbt->bt_gen + 1; ASSERT0(lrtxg); } /* * Verify that the current bonus buffer is not newer than our txg. */ ztest_bt_verify(bbt, os, lr->lr_foid, dnodesize, -1ULL, lr->lr_mode, MAX(txg, lrtxg), crtxg); dmu_buf_will_dirty(db, tx); ASSERT3U(lr->lr_size, >=, sizeof (*bbt)); ASSERT3U(lr->lr_size, <=, db->db_size); VERIFY0(dmu_set_bonus(db, lr->lr_size, tx)); bbt = ztest_bt_bonus(db); ztest_bt_generate(bbt, os, lr->lr_foid, dnodesize, -1ULL, lr->lr_mode, txg, crtxg); ztest_fill_unused_bonus(db, bbt, lr->lr_foid, os, bbt->bt_gen); dmu_buf_rele(db, FTAG); (void) ztest_log_setattr(zd, tx, lr); dmu_tx_commit(tx); ztest_object_unlock(zd, lr->lr_foid); return (0); } zil_replay_func_t *ztest_replay_vector[TX_MAX_TYPE] = { NULL, /* 0 no such transaction type */ ztest_replay_create, /* TX_CREATE */ NULL, /* TX_MKDIR */ NULL, /* TX_MKXATTR */ NULL, /* TX_SYMLINK */ ztest_replay_remove, /* TX_REMOVE */ NULL, /* TX_RMDIR */ NULL, /* TX_LINK */ NULL, /* TX_RENAME */ ztest_replay_write, /* TX_WRITE */ ztest_replay_truncate, /* TX_TRUNCATE */ ztest_replay_setattr, /* TX_SETATTR */ NULL, /* TX_ACL */ NULL, /* TX_CREATE_ACL */ NULL, /* TX_CREATE_ATTR */ NULL, /* TX_CREATE_ACL_ATTR */ NULL, /* TX_MKDIR_ACL */ NULL, /* TX_MKDIR_ATTR */ NULL, /* TX_MKDIR_ACL_ATTR */ NULL, /* TX_WRITE2 */ }; /* * ZIL get_data callbacks */ /* ARGSUSED */ static void ztest_get_done(zgd_t *zgd, int error) { ztest_ds_t *zd = zgd->zgd_private; uint64_t object = ((rl_t *)zgd->zgd_lr)->rl_object; if (zgd->zgd_db) dmu_buf_rele(zgd->zgd_db, zgd); ztest_range_unlock((rl_t *)zgd->zgd_lr); ztest_object_unlock(zd, object); umem_free(zgd, sizeof (*zgd)); } static int ztest_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) { ztest_ds_t *zd = arg; objset_t *os = zd->zd_os; uint64_t object = lr->lr_foid; uint64_t offset = lr->lr_offset; uint64_t size = lr->lr_length; uint64_t txg = lr->lr_common.lrc_txg; uint64_t crtxg; dmu_object_info_t doi; dmu_buf_t *db; zgd_t *zgd; int error; ASSERT3P(lwb, !=, NULL); ASSERT3P(zio, !=, NULL); ASSERT3U(size, !=, 0); ztest_object_lock(zd, object, RL_READER); error = dmu_bonus_hold(os, object, FTAG, &db); if (error) { ztest_object_unlock(zd, object); return (error); } crtxg = ztest_bt_bonus(db)->bt_crtxg; if (crtxg == 0 || crtxg > txg) { dmu_buf_rele(db, FTAG); ztest_object_unlock(zd, object); return (ENOENT); } dmu_object_info_from_db(db, &doi); dmu_buf_rele(db, FTAG); db = NULL; zgd = umem_zalloc(sizeof (*zgd), UMEM_NOFAIL); zgd->zgd_lwb = lwb; zgd->zgd_private = zd; if (buf != NULL) { /* immediate write */ zgd->zgd_lr = (struct zfs_locked_range *)ztest_range_lock(zd, object, offset, size, RL_READER); error = dmu_read(os, object, offset, size, buf, DMU_READ_NO_PREFETCH); ASSERT0(error); } else { size = doi.doi_data_block_size; if (ISP2(size)) { offset = P2ALIGN(offset, size); } else { ASSERT3U(offset, <, size); offset = 0; } zgd->zgd_lr = (struct zfs_locked_range *)ztest_range_lock(zd, object, offset, size, RL_READER); error = dmu_buf_hold(os, object, offset, zgd, &db, DMU_READ_NO_PREFETCH); if (error == 0) { blkptr_t *bp = &lr->lr_blkptr; zgd->zgd_db = db; zgd->zgd_bp = bp; ASSERT3U(db->db_offset, ==, offset); ASSERT3U(db->db_size, ==, size); error = dmu_sync(zio, lr->lr_common.lrc_txg, 
ztest_get_done, zgd); if (error == 0) return (0); } } ztest_get_done(zgd, error); return (error); } static void * ztest_lr_alloc(size_t lrsize, char *name) { char *lr; size_t namesize = name ? strlen(name) + 1 : 0; lr = umem_zalloc(lrsize + namesize, UMEM_NOFAIL); if (name) bcopy(name, lr + lrsize, namesize); return (lr); } static void ztest_lr_free(void *lr, size_t lrsize, char *name) { size_t namesize = name ? strlen(name) + 1 : 0; umem_free(lr, lrsize + namesize); } /* * Lookup a bunch of objects. Returns the number of objects not found. */ static int ztest_lookup(ztest_ds_t *zd, ztest_od_t *od, int count) { int missing = 0; int error; int i; ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock)); for (i = 0; i < count; i++, od++) { od->od_object = 0; error = zap_lookup(zd->zd_os, od->od_dir, od->od_name, sizeof (uint64_t), 1, &od->od_object); if (error) { ASSERT3S(error, ==, ENOENT); ASSERT0(od->od_object); missing++; } else { dmu_buf_t *db; ztest_block_tag_t *bbt; dmu_object_info_t doi; ASSERT3U(od->od_object, !=, 0); ASSERT0(missing); /* there should be no gaps */ ztest_object_lock(zd, od->od_object, RL_READER); VERIFY0(dmu_bonus_hold(zd->zd_os, od->od_object, FTAG, &db)); dmu_object_info_from_db(db, &doi); bbt = ztest_bt_bonus(db); ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); od->od_type = doi.doi_type; od->od_blocksize = doi.doi_data_block_size; od->od_gen = bbt->bt_gen; dmu_buf_rele(db, FTAG); ztest_object_unlock(zd, od->od_object); } } return (missing); } static int ztest_create(ztest_ds_t *zd, ztest_od_t *od, int count) { int missing = 0; int i; ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock)); for (i = 0; i < count; i++, od++) { if (missing) { od->od_object = 0; missing++; continue; } lr_create_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name); lr->lr_doid = od->od_dir; lr->lr_foid = 0; /* 0 to allocate, > 0 to claim */ lr->lrz_type = od->od_crtype; lr->lrz_blocksize = od->od_crblocksize; lr->lrz_ibshift = ztest_random_ibshift(); lr->lrz_bonustype = DMU_OT_UINT64_OTHER; lr->lrz_dnodesize = od->od_crdnodesize; lr->lr_gen = od->od_crgen; lr->lr_crtime[0] = time(NULL); if (ztest_replay_create(zd, lr, B_FALSE) != 0) { ASSERT0(missing); od->od_object = 0; missing++; } else { od->od_object = lr->lr_foid; od->od_type = od->od_crtype; od->od_blocksize = od->od_crblocksize; od->od_gen = od->od_crgen; ASSERT3U(od->od_object, !=, 0); } ztest_lr_free(lr, sizeof (*lr), od->od_name); } return (missing); } static int ztest_remove(ztest_ds_t *zd, ztest_od_t *od, int count) { int missing = 0; int error; int i; ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock)); od += count - 1; for (i = count - 1; i >= 0; i--, od--) { if (missing) { missing++; continue; } /* * No object was found. 
*/ if (od->od_object == 0) continue; lr_remove_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name); lr->lr_doid = od->od_dir; if ((error = ztest_replay_remove(zd, lr, B_FALSE)) != 0) { ASSERT3U(error, ==, ENOSPC); missing++; } else { od->od_object = 0; } ztest_lr_free(lr, sizeof (*lr), od->od_name); } return (missing); } static int ztest_write(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size, void *data) { lr_write_t *lr; int error; lr = ztest_lr_alloc(sizeof (*lr) + size, NULL); lr->lr_foid = object; lr->lr_offset = offset; lr->lr_length = size; lr->lr_blkoff = 0; BP_ZERO(&lr->lr_blkptr); bcopy(data, lr + 1, size); error = ztest_replay_write(zd, lr, B_FALSE); ztest_lr_free(lr, sizeof (*lr) + size, NULL); return (error); } static int ztest_truncate(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size) { lr_truncate_t *lr; int error; lr = ztest_lr_alloc(sizeof (*lr), NULL); lr->lr_foid = object; lr->lr_offset = offset; lr->lr_length = size; error = ztest_replay_truncate(zd, lr, B_FALSE); ztest_lr_free(lr, sizeof (*lr), NULL); return (error); } static int ztest_setattr(ztest_ds_t *zd, uint64_t object) { lr_setattr_t *lr; int error; lr = ztest_lr_alloc(sizeof (*lr), NULL); lr->lr_foid = object; lr->lr_size = 0; lr->lr_mode = 0; error = ztest_replay_setattr(zd, lr, B_FALSE); ztest_lr_free(lr, sizeof (*lr), NULL); return (error); } static void ztest_prealloc(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size) { objset_t *os = zd->zd_os; dmu_tx_t *tx; uint64_t txg; rl_t *rl; txg_wait_synced(dmu_objset_pool(os), 0); ztest_object_lock(zd, object, RL_READER); rl = ztest_range_lock(zd, object, offset, size, RL_WRITER); tx = dmu_tx_create(os); dmu_tx_hold_write(tx, object, offset, size); txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); if (txg != 0) { dmu_prealloc(os, object, offset, size, tx); dmu_tx_commit(tx); txg_wait_synced(dmu_objset_pool(os), txg); } else { (void) dmu_free_long_range(os, object, offset, size); } ztest_range_unlock(rl); ztest_object_unlock(zd, object); } static void ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset) { int err; ztest_block_tag_t wbt; dmu_object_info_t doi; enum ztest_io_type io_type; uint64_t blocksize; void *data; VERIFY0(dmu_object_info(zd->zd_os, object, &doi)); blocksize = doi.doi_data_block_size; data = umem_alloc(blocksize, UMEM_NOFAIL); /* * Pick an i/o type at random, biased toward writing block tags. */ io_type = ztest_random(ZTEST_IO_TYPES); if (ztest_random(2) == 0) io_type = ZTEST_IO_WRITE_TAG; (void) pthread_rwlock_rdlock(&zd->zd_zilog_lock); switch (io_type) { case ZTEST_IO_WRITE_TAG: ztest_bt_generate(&wbt, zd->zd_os, object, doi.doi_dnodesize, offset, 0, 0, 0); (void) ztest_write(zd, object, offset, sizeof (wbt), &wbt); break; case ZTEST_IO_WRITE_PATTERN: (void) memset(data, 'a' + (object + offset) % 5, blocksize); if (ztest_random(2) == 0) { /* * Induce fletcher2 collisions to ensure that * zio_ddt_collision() detects and resolves them * when using fletcher2-verify for deduplication. 
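/*
 * Bit 63 of words 0 and 4 falls in the same fletcher2 lane two
 * iterations apart, so both of that lane's running sums change by a
 * multiple of 2^64 and wrap back to their original values; the damaged
 * block therefore checksums identically.
 */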
*/ ((uint64_t *)data)[0] ^= 1ULL << 63; ((uint64_t *)data)[4] ^= 1ULL << 63; } (void) ztest_write(zd, object, offset, blocksize, data); break; case ZTEST_IO_WRITE_ZEROES: bzero(data, blocksize); (void) ztest_write(zd, object, offset, blocksize, data); break; case ZTEST_IO_TRUNCATE: (void) ztest_truncate(zd, object, offset, blocksize); break; case ZTEST_IO_SETATTR: (void) ztest_setattr(zd, object); break; default: break; case ZTEST_IO_REWRITE: (void) pthread_rwlock_rdlock(&ztest_name_lock); err = ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_CHECKSUM, spa_dedup_checksum(ztest_spa), B_FALSE); VERIFY(err == 0 || err == ENOSPC); err = ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_COMPRESSION, ztest_random_dsl_prop(ZFS_PROP_COMPRESSION), B_FALSE); VERIFY(err == 0 || err == ENOSPC); (void) pthread_rwlock_unlock(&ztest_name_lock); VERIFY0(dmu_read(zd->zd_os, object, offset, blocksize, data, DMU_READ_NO_PREFETCH)); (void) ztest_write(zd, object, offset, blocksize, data); break; } (void) pthread_rwlock_unlock(&zd->zd_zilog_lock); umem_free(data, blocksize); } /* * Initialize an object description template. */ static void ztest_od_init(ztest_od_t *od, uint64_t id, char *tag, uint64_t index, dmu_object_type_t type, uint64_t blocksize, uint64_t dnodesize, uint64_t gen) { od->od_dir = ZTEST_DIROBJ; od->od_object = 0; od->od_crtype = type; od->od_crblocksize = blocksize ? blocksize : ztest_random_blocksize(); od->od_crdnodesize = dnodesize ? dnodesize : ztest_random_dnodesize(); od->od_crgen = gen; od->od_type = DMU_OT_NONE; od->od_blocksize = 0; od->od_gen = 0; (void) snprintf(od->od_name, sizeof (od->od_name), "%s(%lld)[%llu]", tag, (longlong_t)id, (u_longlong_t)index); } /* * Lookup or create the objects for a test using the od template. * If the objects do not all exist, or if 'remove' is specified, * remove any existing objects and create new ones. Otherwise, * use the existing objects. */ static int ztest_object_init(ztest_ds_t *zd, ztest_od_t *od, size_t size, boolean_t remove) { int count = size / sizeof (*od); int rv = 0; mutex_enter(&zd->zd_dirobj_lock); if ((ztest_lookup(zd, od, count) != 0 || remove) && (ztest_remove(zd, od, count) != 0 || ztest_create(zd, od, count) != 0)) rv = -1; zd->zd_od = od; mutex_exit(&zd->zd_dirobj_lock); return (rv); } /* ARGSUSED */ void ztest_zil_commit(ztest_ds_t *zd, uint64_t id) { zilog_t *zilog = zd->zd_zilog; (void) pthread_rwlock_rdlock(&zd->zd_zilog_lock); zil_commit(zilog, ztest_random(ZTEST_OBJECTS)); /* * Remember the committed values in zd, which is in parent/child * shared memory. If we die, the next iteration of ztest_run() * will verify that the log really does contain this record. */ mutex_enter(&zilog->zl_lock); ASSERT3P(zd->zd_shared, !=, NULL); ASSERT3U(zd->zd_shared->zd_seq, <=, zilog->zl_commit_lr_seq); zd->zd_shared->zd_seq = zilog->zl_commit_lr_seq; mutex_exit(&zilog->zl_lock); (void) pthread_rwlock_unlock(&zd->zd_zilog_lock); } /* * This function is designed to simulate the operations that occur during a * mount/unmount operation. We hold the dataset across these operations in an * attempt to expose any implicit assumptions about ZIL management. */ /* ARGSUSED */ void ztest_zil_remount(ztest_ds_t *zd, uint64_t id) { objset_t *os = zd->zd_os; /* * We hold the ztest_vdev_lock so we don't cause problems with * other threads that wish to remove a log device, such as * ztest_device_removal(). */ mutex_enter(&ztest_vdev_lock); /* * We grab the zd_dirobj_lock to ensure that no other thread is * updating the zil (i.e. 
adding in-memory log records) and the * zd_zilog_lock to block any I/O. */ mutex_enter(&zd->zd_dirobj_lock); (void) pthread_rwlock_wrlock(&zd->zd_zilog_lock); /* zfsvfs_teardown() */ zil_close(zd->zd_zilog); /* zfsvfs_setup() */ VERIFY3P(zil_open(os, ztest_get_data), ==, zd->zd_zilog); zil_replay(os, zd, ztest_replay_vector); (void) pthread_rwlock_unlock(&zd->zd_zilog_lock); mutex_exit(&zd->zd_dirobj_lock); mutex_exit(&ztest_vdev_lock); } /* * Verify that we can't destroy an active pool, create an existing pool, * or create a pool with a bad vdev spec. */ /* ARGSUSED */ void ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id) { ztest_shared_opts_t *zo = &ztest_opts; spa_t *spa; nvlist_t *nvroot; if (zo->zo_mmp_test) return; /* * Attempt to create using a bad file. */ nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 0, 1); VERIFY3U(ENOENT, ==, spa_create("ztest_bad_file", nvroot, NULL, NULL, NULL)); fnvlist_free(nvroot); /* * Attempt to create using a bad mirror. */ nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 2, 1); VERIFY3U(ENOENT, ==, spa_create("ztest_bad_mirror", nvroot, NULL, NULL, NULL)); fnvlist_free(nvroot); /* * Attempt to create an existing pool. It shouldn't matter * what's in the nvroot; we should fail with EEXIST. */ (void) pthread_rwlock_rdlock(&ztest_name_lock); nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 0, 1); VERIFY3U(EEXIST, ==, spa_create(zo->zo_pool, nvroot, NULL, NULL, NULL)); fnvlist_free(nvroot); /* * We open a reference to the spa and then we try to export it * expecting one of the following errors: * * EBUSY * Because of the reference we just opened. * * ZFS_ERR_EXPORT_IN_PROGRESS * For the case that there is another ztest thread doing * an export concurrently. */ VERIFY0(spa_open(zo->zo_pool, &spa, FTAG)); int error = spa_destroy(zo->zo_pool); if (error != EBUSY && error != ZFS_ERR_EXPORT_IN_PROGRESS) { fatal(0, "spa_destroy(%s) returned unexpected value %d", spa->spa_name, error); } spa_close(spa, FTAG); (void) pthread_rwlock_unlock(&ztest_name_lock); } /* * Start and then stop the MMP threads to ensure the startup and shutdown code * works properly. Actual protection and property-related code tested via ZTS. */ /* ARGSUSED */ void ztest_mmp_enable_disable(ztest_ds_t *zd, uint64_t id) { ztest_shared_opts_t *zo = &ztest_opts; spa_t *spa = ztest_spa; if (zo->zo_mmp_test) return; /* * Since enabling MMP involves setting a property, it could not be done * while the pool is suspended. */ if (spa_suspended(spa)) return; spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); mutex_enter(&spa->spa_props_lock); zfs_multihost_fail_intervals = 0; if (!spa_multihost(spa)) { spa->spa_multihost = B_TRUE; mmp_thread_start(spa); } mutex_exit(&spa->spa_props_lock); spa_config_exit(spa, SCL_CONFIG, FTAG); txg_wait_synced(spa_get_dsl(spa), 0); mmp_signal_all_threads(); txg_wait_synced(spa_get_dsl(spa), 0); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); mutex_enter(&spa->spa_props_lock); if (spa_multihost(spa)) { mmp_thread_stop(spa); spa->spa_multihost = B_FALSE; } mutex_exit(&spa->spa_props_lock); spa_config_exit(spa, SCL_CONFIG, FTAG); } /* ARGSUSED */ void ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id) { spa_t *spa; uint64_t initial_version = SPA_VERSION_INITIAL; uint64_t version, newversion; nvlist_t *nvroot, *props; char *name; if (ztest_opts.zo_mmp_test) return; /* dRAID added after feature flags, skip upgrade test. 
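 * (this test creates pools at pre-feature-flag SPA versions, which
 * cannot contain dRAID vdevs)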
*/ if (strcmp(ztest_opts.zo_raid_type, VDEV_TYPE_DRAID) == 0) return; mutex_enter(&ztest_vdev_lock); name = kmem_asprintf("%s_upgrade", ztest_opts.zo_pool); /* * Clean up from previous runs. */ (void) spa_destroy(name); nvroot = make_vdev_root(NULL, NULL, name, ztest_opts.zo_vdev_size, 0, NULL, ztest_opts.zo_raid_children, ztest_opts.zo_mirrors, 1); /* * If we're configuring a RAIDZ device then make sure that the * initial version is capable of supporting that feature. */ switch (ztest_opts.zo_raid_parity) { case 0: case 1: initial_version = SPA_VERSION_INITIAL; break; case 2: initial_version = SPA_VERSION_RAIDZ2; break; case 3: initial_version = SPA_VERSION_RAIDZ3; break; } /* * Create a pool with a spa version that can be upgraded. Pick * a value between initial_version and SPA_VERSION_BEFORE_FEATURES. */ do { version = ztest_random_spa_version(initial_version); } while (version > SPA_VERSION_BEFORE_FEATURES); props = fnvlist_alloc(); fnvlist_add_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION), version); VERIFY0(spa_create(name, nvroot, props, NULL, NULL)); fnvlist_free(nvroot); fnvlist_free(props); VERIFY0(spa_open(name, &spa, FTAG)); VERIFY3U(spa_version(spa), ==, version); newversion = ztest_random_spa_version(version + 1); if (ztest_opts.zo_verbose >= 4) { (void) printf("upgrading spa version from %llu to %llu\n", (u_longlong_t)version, (u_longlong_t)newversion); } spa_upgrade(spa, newversion); VERIFY3U(spa_version(spa), >, version); VERIFY3U(spa_version(spa), ==, fnvlist_lookup_uint64(spa->spa_config, zpool_prop_to_name(ZPOOL_PROP_VERSION))); spa_close(spa, FTAG); kmem_strfree(name); mutex_exit(&ztest_vdev_lock); } static void ztest_spa_checkpoint(spa_t *spa) { ASSERT(MUTEX_HELD(&ztest_checkpoint_lock)); int error = spa_checkpoint(spa->spa_name); switch (error) { case 0: case ZFS_ERR_DEVRM_IN_PROGRESS: case ZFS_ERR_DISCARDING_CHECKPOINT: case ZFS_ERR_CHECKPOINT_EXISTS: break; case ENOSPC: ztest_record_enospc(FTAG); break; default: fatal(0, "spa_checkpoint(%s) = %d", spa->spa_name, error); } } static void ztest_spa_discard_checkpoint(spa_t *spa) { ASSERT(MUTEX_HELD(&ztest_checkpoint_lock)); int error = spa_checkpoint_discard(spa->spa_name); switch (error) { case 0: case ZFS_ERR_DISCARDING_CHECKPOINT: case ZFS_ERR_NO_CHECKPOINT: break; default: fatal(0, "spa_discard_checkpoint(%s) = %d", spa->spa_name, error); } } /* ARGSUSED */ void ztest_spa_checkpoint_create_discard(ztest_ds_t *zd, uint64_t id) { spa_t *spa = ztest_spa; mutex_enter(&ztest_checkpoint_lock); if (ztest_random(2) == 0) { ztest_spa_checkpoint(spa); } else { ztest_spa_discard_checkpoint(spa); } mutex_exit(&ztest_checkpoint_lock); } static vdev_t * vdev_lookup_by_path(vdev_t *vd, const char *path) { vdev_t *mvd; int c; if (vd->vdev_path != NULL && strcmp(path, vd->vdev_path) == 0) return (vd); for (c = 0; c < vd->vdev_children; c++) if ((mvd = vdev_lookup_by_path(vd->vdev_child[c], path)) != NULL) return (mvd); return (NULL); } static int spa_num_top_vdevs(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; ASSERT3U(spa_config_held(spa, SCL_VDEV, RW_READER), ==, SCL_VDEV); return (rvd->vdev_children); } /* * Verify that vdev_add() works as expected. 
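 * When the pool already has slogs, 1/4 of the time a log device is
 * removed instead; otherwise a new top-level vdev is added, itself a
 * log device 1/4 of the time.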
*/ /* ARGSUSED */ void ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) { ztest_shared_t *zs = ztest_shared; spa_t *spa = ztest_spa; uint64_t leaves; uint64_t guid; nvlist_t *nvroot; int error; if (ztest_opts.zo_mmp_test) return; mutex_enter(&ztest_vdev_lock); leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * ztest_opts.zo_raid_children; spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); ztest_shared->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves; /* * If we have slogs then remove them 1/4 of the time. */ if (spa_has_slogs(spa) && ztest_random(4) == 0) { metaslab_group_t *mg; /* * find the first real slog in log allocation class */ mg = spa_log_class(spa)->mc_allocator[0].mca_rotor; while (!mg->mg_vd->vdev_islog) mg = mg->mg_next; guid = mg->mg_vd->vdev_guid; spa_config_exit(spa, SCL_VDEV, FTAG); /* * We have to grab the zs_name_lock as writer to * prevent a race between removing a slog (dmu_objset_find) * and destroying a dataset. Removing the slog will * grab a reference on the dataset which may cause * dsl_destroy_head() to fail with EBUSY thus * leaving the dataset in an inconsistent state. */ pthread_rwlock_wrlock(&ztest_name_lock); error = spa_vdev_remove(spa, guid, B_FALSE); pthread_rwlock_unlock(&ztest_name_lock); switch (error) { case 0: case EEXIST: /* Generic zil_reset() error */ case EBUSY: /* Replay required */ case EACCES: /* Crypto key not loaded */ case ZFS_ERR_CHECKPOINT_EXISTS: case ZFS_ERR_DISCARDING_CHECKPOINT: break; default: fatal(0, "spa_vdev_remove() = %d", error); } } else { spa_config_exit(spa, SCL_VDEV, FTAG); /* * Make 1/4 of the devices be log devices */ nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, (ztest_random(4) == 0) ? "log" : NULL, ztest_opts.zo_raid_children, zs->zs_mirrors, 1); error = spa_vdev_add(spa, nvroot); fnvlist_free(nvroot); switch (error) { case 0: break; case ENOSPC: ztest_record_enospc("spa_vdev_add"); break; default: fatal(0, "spa_vdev_add() = %d", error); } } mutex_exit(&ztest_vdev_lock); } /* ARGSUSED */ void ztest_vdev_class_add(ztest_ds_t *zd, uint64_t id) { ztest_shared_t *zs = ztest_shared; spa_t *spa = ztest_spa; uint64_t leaves; nvlist_t *nvroot; const char *class = (ztest_random(2) == 0) ? 
VDEV_ALLOC_BIAS_SPECIAL : VDEV_ALLOC_BIAS_DEDUP; int error; /* * By default add a special vdev 50% of the time */ if ((ztest_opts.zo_special_vdevs == ZTEST_VDEV_CLASS_OFF) || (ztest_opts.zo_special_vdevs == ZTEST_VDEV_CLASS_RND && ztest_random(2) == 0)) { return; } mutex_enter(&ztest_vdev_lock); /* Only test with mirrors */ if (zs->zs_mirrors < 2) { mutex_exit(&ztest_vdev_lock); return; } /* requires feature@allocation_classes */ if (!spa_feature_is_enabled(spa, SPA_FEATURE_ALLOCATION_CLASSES)) { mutex_exit(&ztest_vdev_lock); return; } leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * ztest_opts.zo_raid_children; spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); ztest_shared->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves; spa_config_exit(spa, SCL_VDEV, FTAG); nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, class, ztest_opts.zo_raid_children, zs->zs_mirrors, 1); error = spa_vdev_add(spa, nvroot); fnvlist_free(nvroot); if (error == ENOSPC) ztest_record_enospc("spa_vdev_add"); else if (error != 0) fatal(0, "spa_vdev_add() = %d", error); /* * 50% of the time allow small blocks in the special class */ if (error == 0 && spa_special_class(spa)->mc_groups == 1 && ztest_random(2) == 0) { if (ztest_opts.zo_verbose >= 3) (void) printf("Enabling special VDEV small blocks\n"); (void) ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_SPECIAL_SMALL_BLOCKS, 32768, B_FALSE); } mutex_exit(&ztest_vdev_lock); if (ztest_opts.zo_verbose >= 3) { metaslab_class_t *mc; if (strcmp(class, VDEV_ALLOC_BIAS_SPECIAL) == 0) mc = spa_special_class(spa); else mc = spa_dedup_class(spa); (void) printf("Added a %s mirrored vdev (of %d)\n", class, (int)mc->mc_groups); } } /* * Verify that adding/removing aux devices (l2arc, hot spare) works as expected. */ /* ARGSUSED */ void ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id) { ztest_shared_t *zs = ztest_shared; spa_t *spa = ztest_spa; vdev_t *rvd = spa->spa_root_vdev; spa_aux_vdev_t *sav; char *aux; char *path; uint64_t guid = 0; int error, ignore_err = 0; if (ztest_opts.zo_mmp_test) return; path = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); if (ztest_random(2) == 0) { sav = &spa->spa_spares; aux = ZPOOL_CONFIG_SPARES; } else { sav = &spa->spa_l2cache; aux = ZPOOL_CONFIG_L2CACHE; } mutex_enter(&ztest_vdev_lock); spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); if (sav->sav_count != 0 && ztest_random(4) == 0) { /* * Pick a random device to remove. */ vdev_t *svd = sav->sav_vdevs[ztest_random(sav->sav_count)]; /* dRAID spares cannot be removed; try anyways to see ENOTSUP */ if (strstr(svd->vdev_path, VDEV_TYPE_DRAID) != NULL) ignore_err = ENOTSUP; guid = svd->vdev_guid; } else { /* * Find an unused device we can add. */ zs->zs_vdev_aux = 0; for (;;) { int c; (void) snprintf(path, MAXPATHLEN, ztest_aux_template, ztest_opts.zo_dir, ztest_opts.zo_pool, aux, zs->zs_vdev_aux); for (c = 0; c < sav->sav_count; c++) if (strcmp(sav->sav_vdevs[c]->vdev_path, path) == 0) break; if (c == sav->sav_count && vdev_lookup_by_path(rvd, path) == NULL) break; zs->zs_vdev_aux++; } } spa_config_exit(spa, SCL_VDEV, FTAG); if (guid == 0) { /* * Add a new device. */ nvlist_t *nvroot = make_vdev_root(NULL, aux, NULL, (ztest_opts.zo_vdev_size * 5) / 4, 0, NULL, 0, 0, 1); error = spa_vdev_add(spa, nvroot); switch (error) { case 0: break; default: fatal(0, "spa_vdev_add(%p) = %d", nvroot, error); } fnvlist_free(nvroot); } else { /* * Remove an existing device. Sometimes, dirty its * vdev state first to make sure we handle removal * of devices that have pending state changes. 
*/ if (ztest_random(2) == 0) (void) vdev_online(spa, guid, 0, NULL); error = spa_vdev_remove(spa, guid, B_FALSE); switch (error) { case 0: case EBUSY: case ZFS_ERR_CHECKPOINT_EXISTS: case ZFS_ERR_DISCARDING_CHECKPOINT: break; default: if (error != ignore_err) fatal(0, "spa_vdev_remove(%llu) = %d", guid, error); } } mutex_exit(&ztest_vdev_lock); umem_free(path, MAXPATHLEN); } /* * split a pool if it has mirror tlvdevs */ /* ARGSUSED */ void ztest_split_pool(ztest_ds_t *zd, uint64_t id) { ztest_shared_t *zs = ztest_shared; spa_t *spa = ztest_spa; vdev_t *rvd = spa->spa_root_vdev; nvlist_t *tree, **child, *config, *split, **schild; uint_t c, children, schildren = 0, lastlogid = 0; int error = 0; if (ztest_opts.zo_mmp_test) return; mutex_enter(&ztest_vdev_lock); /* ensure we have a usable config; mirrors of raidz aren't supported */ if (zs->zs_mirrors < 3 || ztest_opts.zo_raid_children > 1) { mutex_exit(&ztest_vdev_lock); return; } /* clean up the old pool, if any */ (void) spa_destroy("splitp"); spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); /* generate a config from the existing config */ mutex_enter(&spa->spa_props_lock); tree = fnvlist_lookup_nvlist(spa->spa_config, ZPOOL_CONFIG_VDEV_TREE); mutex_exit(&spa->spa_props_lock); VERIFY0(nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, &child, &children)); schild = malloc(rvd->vdev_children * sizeof (nvlist_t *)); for (c = 0; c < children; c++) { vdev_t *tvd = rvd->vdev_child[c]; nvlist_t **mchild; uint_t mchildren; if (tvd->vdev_islog || tvd->vdev_ops == &vdev_hole_ops) { schild[schildren] = fnvlist_alloc(); fnvlist_add_string(schild[schildren], ZPOOL_CONFIG_TYPE, VDEV_TYPE_HOLE); fnvlist_add_uint64(schild[schildren], ZPOOL_CONFIG_IS_HOLE, 1); if (lastlogid == 0) lastlogid = schildren; ++schildren; continue; } lastlogid = 0; VERIFY0(nvlist_lookup_nvlist_array(child[c], ZPOOL_CONFIG_CHILDREN, &mchild, &mchildren)); schild[schildren++] = fnvlist_dup(mchild[0]); } /* OK, create a config that can be used to split */ split = fnvlist_alloc(); fnvlist_add_string(split, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT); fnvlist_add_nvlist_array(split, ZPOOL_CONFIG_CHILDREN, schild, lastlogid != 0 ? lastlogid : schildren); config = fnvlist_alloc(); fnvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, split); for (c = 0; c < schildren; c++) fnvlist_free(schild[c]); free(schild); fnvlist_free(split); spa_config_exit(spa, SCL_VDEV, FTAG); (void) pthread_rwlock_wrlock(&ztest_name_lock); error = spa_vdev_split_mirror(spa, "splitp", config, NULL, B_FALSE); (void) pthread_rwlock_unlock(&ztest_name_lock); fnvlist_free(config); if (error == 0) { (void) printf("successful split - results:\n"); mutex_enter(&spa_namespace_lock); show_pool_stats(spa); show_pool_stats(spa_lookup("splitp")); mutex_exit(&spa_namespace_lock); ++zs->zs_splits; --zs->zs_mirrors; } mutex_exit(&ztest_vdev_lock); } /* * Verify that we can attach and detach devices. 
*/ /* ARGSUSED */ void ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) { ztest_shared_t *zs = ztest_shared; spa_t *spa = ztest_spa; spa_aux_vdev_t *sav = &spa->spa_spares; vdev_t *rvd = spa->spa_root_vdev; vdev_t *oldvd, *newvd, *pvd; nvlist_t *root; uint64_t leaves; uint64_t leaf, top; uint64_t ashift = ztest_get_ashift(); uint64_t oldguid, pguid; uint64_t oldsize, newsize; char *oldpath, *newpath; int replacing; int oldvd_has_siblings = B_FALSE; int newvd_is_spare = B_FALSE; int newvd_is_dspare = B_FALSE; int oldvd_is_log; int error, expected_error; if (ztest_opts.zo_mmp_test) return; oldpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); newpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); mutex_enter(&ztest_vdev_lock); leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raid_children; spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); /* * If a vdev is in the process of being removed, its removal may * finish while we are in progress, leading to an unexpected error * value. Don't bother trying to attach while we are in the middle * of removal. */ if (ztest_device_removal_active) { spa_config_exit(spa, SCL_ALL, FTAG); goto out; } /* * Decide whether to do an attach or a replace. */ replacing = ztest_random(2); /* * Pick a random top-level vdev. */ top = ztest_random_vdev_top(spa, B_TRUE); /* * Pick a random leaf within it. */ leaf = ztest_random(leaves); /* * Locate this vdev. */ oldvd = rvd->vdev_child[top]; /* pick a child from the mirror */ if (zs->zs_mirrors >= 1) { ASSERT3P(oldvd->vdev_ops, ==, &vdev_mirror_ops); ASSERT3U(oldvd->vdev_children, >=, zs->zs_mirrors); oldvd = oldvd->vdev_child[leaf / ztest_opts.zo_raid_children]; } /* pick a child out of the raidz group */ if (ztest_opts.zo_raid_children > 1) { if (strcmp(oldvd->vdev_ops->vdev_op_type, "raidz") == 0) ASSERT3P(oldvd->vdev_ops, ==, &vdev_raidz_ops); else ASSERT3P(oldvd->vdev_ops, ==, &vdev_draid_ops); ASSERT3U(oldvd->vdev_children, ==, ztest_opts.zo_raid_children); oldvd = oldvd->vdev_child[leaf % ztest_opts.zo_raid_children]; } /* * If we're already doing an attach or replace, oldvd may be a * mirror vdev -- in which case, pick a random child. */ while (oldvd->vdev_children != 0) { oldvd_has_siblings = B_TRUE; ASSERT3U(oldvd->vdev_children, >=, 2); oldvd = oldvd->vdev_child[ztest_random(oldvd->vdev_children)]; } oldguid = oldvd->vdev_guid; oldsize = vdev_get_min_asize(oldvd); oldvd_is_log = oldvd->vdev_top->vdev_islog; (void) strcpy(oldpath, oldvd->vdev_path); pvd = oldvd->vdev_parent; pguid = pvd->vdev_guid; /* * If oldvd has siblings, then half of the time, detach it. Prior * to the detach the pool is scrubbed in order to prevent creating * unrepairable blocks as a result of the data corruption injection. */ if (oldvd_has_siblings && ztest_random(2) == 0) { spa_config_exit(spa, SCL_ALL, FTAG); error = ztest_scrub_impl(spa); if (error) goto out; error = spa_vdev_detach(spa, oldguid, pguid, B_FALSE); if (error != 0 && error != ENODEV && error != EBUSY && error != ENOTSUP && error != ZFS_ERR_CHECKPOINT_EXISTS && error != ZFS_ERR_DISCARDING_CHECKPOINT) fatal(0, "detach (%s) returned %d", oldpath, error); goto out; } /* * For the new vdev, choose with equal probability between the two * standard paths (ending in either 'a' or 'b') or a random hot spare. 
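 * The 'b' variant is produced simply by rewriting the final character
 * of the generated device path, so each leaf slot has a second
 * candidate file distinct from its primary ('a') device.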
*/ if (sav->sav_count != 0 && ztest_random(3) == 0) { newvd = sav->sav_vdevs[ztest_random(sav->sav_count)]; newvd_is_spare = B_TRUE; if (newvd->vdev_ops == &vdev_draid_spare_ops) newvd_is_dspare = B_TRUE; (void) strcpy(newpath, newvd->vdev_path); } else { (void) snprintf(newpath, MAXPATHLEN, ztest_dev_template, ztest_opts.zo_dir, ztest_opts.zo_pool, top * leaves + leaf); if (ztest_random(2) == 0) newpath[strlen(newpath) - 1] = 'b'; newvd = vdev_lookup_by_path(rvd, newpath); } if (newvd) { /* * Reopen to ensure the vdev's asize field isn't stale. */ vdev_reopen(newvd); newsize = vdev_get_min_asize(newvd); } else { /* * Make newsize a little bigger or smaller than oldsize. * If it's smaller, the attach should fail. * If it's larger, and we're doing a replace, * we should get dynamic LUN growth when we're done. */ newsize = 10 * oldsize / (9 + ztest_random(3)); } /* * If pvd is not a mirror or root, the attach should fail with ENOTSUP, * unless it's a replace; in that case any non-replacing parent is OK. * * If newvd is already part of the pool, it should fail with EBUSY. * * If newvd is too small, it should fail with EOVERFLOW. * * If newvd is a distributed spare and it's being attached to a * dRAID which is not its parent it should fail with EINVAL. */ if (pvd->vdev_ops != &vdev_mirror_ops && pvd->vdev_ops != &vdev_root_ops && (!replacing || pvd->vdev_ops == &vdev_replacing_ops || pvd->vdev_ops == &vdev_spare_ops)) expected_error = ENOTSUP; else if (newvd_is_spare && (!replacing || oldvd_is_log)) expected_error = ENOTSUP; else if (newvd == oldvd) expected_error = replacing ? 0 : EBUSY; else if (vdev_lookup_by_path(rvd, newpath) != NULL) expected_error = EBUSY; else if (!newvd_is_dspare && newsize < oldsize) expected_error = EOVERFLOW; else if (ashift > oldvd->vdev_top->vdev_ashift) expected_error = EDOM; else if (newvd_is_dspare && pvd != vdev_draid_spare_get_parent(newvd)) expected_error = ENOTSUP; else expected_error = 0; spa_config_exit(spa, SCL_ALL, FTAG); /* * Build the nvlist describing newpath. */ root = make_vdev_root(newpath, NULL, NULL, newvd == NULL ? newsize : 0, ashift, NULL, 0, 0, 1); /* * When supported select either a healing or sequential resilver. */ boolean_t rebuilding = B_FALSE; if (pvd->vdev_ops == &vdev_mirror_ops || pvd->vdev_ops == &vdev_root_ops) { rebuilding = !!ztest_random(2); } error = spa_vdev_attach(spa, oldguid, root, replacing, rebuilding); fnvlist_free(root); /* * If our parent was the replacing vdev, but the replace completed, * then instead of failing with ENOTSUP we may either succeed, * fail with ENODEV, or fail with EOVERFLOW. */ if (expected_error == ENOTSUP && (error == 0 || error == ENODEV || error == EOVERFLOW)) expected_error = error; /* * If someone grew the LUN, the replacement may be too small. 
*/ if (error == EOVERFLOW || error == EBUSY) expected_error = error; if (error == ZFS_ERR_CHECKPOINT_EXISTS || error == ZFS_ERR_DISCARDING_CHECKPOINT || error == ZFS_ERR_RESILVER_IN_PROGRESS || error == ZFS_ERR_REBUILD_IN_PROGRESS) expected_error = error; if (error != expected_error && expected_error != EBUSY) { fatal(0, "attach (%s %llu, %s %llu, %d) " "returned %d, expected %d", oldpath, oldsize, newpath, newsize, replacing, error, expected_error); } out: mutex_exit(&ztest_vdev_lock); umem_free(oldpath, MAXPATHLEN); umem_free(newpath, MAXPATHLEN); } /* ARGSUSED */ void ztest_device_removal(ztest_ds_t *zd, uint64_t id) { spa_t *spa = ztest_spa; vdev_t *vd; uint64_t guid; int error; mutex_enter(&ztest_vdev_lock); if (ztest_device_removal_active) { mutex_exit(&ztest_vdev_lock); return; } /* * Remove a random top-level vdev and wait for removal to finish. */ spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); vd = vdev_lookup_top(spa, ztest_random_vdev_top(spa, B_FALSE)); guid = vd->vdev_guid; spa_config_exit(spa, SCL_VDEV, FTAG); error = spa_vdev_remove(spa, guid, B_FALSE); if (error == 0) { ztest_device_removal_active = B_TRUE; mutex_exit(&ztest_vdev_lock); /* * spa->spa_vdev_removal is created in a sync task that * is initiated via dsl_sync_task_nowait(). Since the * task may not run before spa_vdev_remove() returns, we * must wait at least 1 txg to ensure that the removal * struct has been created. */ txg_wait_synced(spa_get_dsl(spa), 0); while (spa->spa_removing_phys.sr_state == DSS_SCANNING) txg_wait_synced(spa_get_dsl(spa), 0); } else { mutex_exit(&ztest_vdev_lock); return; } /* * The pool needs to be scrubbed after completing device removal. * Failure to do so may result in checksum errors due to the * strategy employed by ztest_fault_inject() when selecting which * offset are redundant and can be damaged. */ error = spa_scan(spa, POOL_SCAN_SCRUB); if (error == 0) { while (dsl_scan_scrubbing(spa_get_dsl(spa))) txg_wait_synced(spa_get_dsl(spa), 0); } mutex_enter(&ztest_vdev_lock); ztest_device_removal_active = B_FALSE; mutex_exit(&ztest_vdev_lock); } /* * Callback function which expands the physical size of the vdev. */ static vdev_t * grow_vdev(vdev_t *vd, void *arg) { spa_t *spa __maybe_unused = vd->vdev_spa; size_t *newsize = arg; size_t fsize; int fd; ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), ==, SCL_STATE); ASSERT(vd->vdev_ops->vdev_op_leaf); if ((fd = open(vd->vdev_path, O_RDWR)) == -1) return (vd); fsize = lseek(fd, 0, SEEK_END); VERIFY0(ftruncate(fd, *newsize)); if (ztest_opts.zo_verbose >= 6) { (void) printf("%s grew from %lu to %lu bytes\n", vd->vdev_path, (ulong_t)fsize, (ulong_t)*newsize); } (void) close(fd); return (NULL); } /* * Callback function which expands a given vdev by calling vdev_online(). */ /* ARGSUSED */ static vdev_t * online_vdev(vdev_t *vd, void *arg) { spa_t *spa = vd->vdev_spa; vdev_t *tvd = vd->vdev_top; uint64_t guid = vd->vdev_guid; uint64_t generation = spa->spa_config_generation + 1; vdev_state_t newstate = VDEV_STATE_UNKNOWN; int error; ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), ==, SCL_STATE); ASSERT(vd->vdev_ops->vdev_op_leaf); /* Calling vdev_online will initialize the new metaslabs */ spa_config_exit(spa, SCL_STATE, spa); error = vdev_online(spa, guid, ZFS_ONLINE_EXPAND, &newstate); spa_config_enter(spa, SCL_STATE, spa, RW_READER); /* * If vdev_online returned an error or the underlying vdev_open * failed then we abort the expand. The only way to know that * vdev_open fails is by checking the returned newstate. 
*/ if (error || newstate != VDEV_STATE_HEALTHY) { if (ztest_opts.zo_verbose >= 5) { (void) printf("Unable to expand vdev, state %llu, " "error %d\n", (u_longlong_t)newstate, error); } return (vd); } ASSERT3U(newstate, ==, VDEV_STATE_HEALTHY); /* * Since we dropped the lock we need to ensure that we're * still talking to the original vdev. It's possible this * vdev may have been detached/replaced while we were * trying to online it. */ if (generation != spa->spa_config_generation) { if (ztest_opts.zo_verbose >= 5) { (void) printf("vdev configuration has changed, " "guid %llu, state %llu, expected gen %llu, " "got gen %llu\n", (u_longlong_t)guid, (u_longlong_t)tvd->vdev_state, (u_longlong_t)generation, (u_longlong_t)spa->spa_config_generation); } return (vd); } return (NULL); } /* * Traverse the vdev tree calling the supplied function. * We continue to walk the tree until we either have walked all * children or we receive a non-NULL return from the callback. * If a NULL callback is passed, then we just return back the first * leaf vdev we encounter. */ static vdev_t * vdev_walk_tree(vdev_t *vd, vdev_t *(*func)(vdev_t *, void *), void *arg) { uint_t c; if (vd->vdev_ops->vdev_op_leaf) { if (func == NULL) return (vd); else return (func(vd, arg)); } for (c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; if ((cvd = vdev_walk_tree(cvd, func, arg)) != NULL) return (cvd); } return (NULL); } /* * Verify that dynamic LUN growth works as expected. */ /* ARGSUSED */ void ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id) { spa_t *spa = ztest_spa; vdev_t *vd, *tvd; metaslab_class_t *mc; metaslab_group_t *mg; size_t psize, newsize; uint64_t top; uint64_t old_class_space, new_class_space, old_ms_count, new_ms_count; mutex_enter(&ztest_checkpoint_lock); mutex_enter(&ztest_vdev_lock); spa_config_enter(spa, SCL_STATE, spa, RW_READER); /* * If there is a vdev removal in progress, it could complete while * we are running, in which case we would not be able to verify * that the metaslab_class space increased (because it decreases * when the device removal completes). */ if (ztest_device_removal_active) { spa_config_exit(spa, SCL_STATE, spa); mutex_exit(&ztest_vdev_lock); mutex_exit(&ztest_checkpoint_lock); return; } top = ztest_random_vdev_top(spa, B_TRUE); tvd = spa->spa_root_vdev->vdev_child[top]; mg = tvd->vdev_mg; mc = mg->mg_class; old_ms_count = tvd->vdev_ms_count; old_class_space = metaslab_class_get_space(mc); /* * Determine the size of the first leaf vdev associated with * our top-level device. */ vd = vdev_walk_tree(tvd, NULL, NULL); ASSERT3P(vd, !=, NULL); ASSERT(vd->vdev_ops->vdev_op_leaf); psize = vd->vdev_psize; /* * We only try to expand the vdev if it's healthy, less than 4x its * original size, and it has a valid psize. */ if (tvd->vdev_state != VDEV_STATE_HEALTHY || psize == 0 || psize >= 4 * ztest_opts.zo_vdev_size) { spa_config_exit(spa, SCL_STATE, spa); mutex_exit(&ztest_vdev_lock); mutex_exit(&ztest_checkpoint_lock); return; } ASSERT3U(psize, >, 0); newsize = psize + MAX(psize / 8, SPA_MAXBLOCKSIZE); ASSERT3U(newsize, >, psize); if (ztest_opts.zo_verbose >= 6) { (void) printf("Expanding LUN %s from %lu to %lu\n", vd->vdev_path, (ulong_t)psize, (ulong_t)newsize); } /* * Growing the vdev is a two step process: * 1). expand the physical size (i.e. relabel) * 2). 
online the vdev to create the new metaslabs */ if (vdev_walk_tree(tvd, grow_vdev, &newsize) != NULL || vdev_walk_tree(tvd, online_vdev, NULL) != NULL || tvd->vdev_state != VDEV_STATE_HEALTHY) { if (ztest_opts.zo_verbose >= 5) { (void) printf("Could not expand LUN because " "the vdev configuration changed.\n"); } spa_config_exit(spa, SCL_STATE, spa); mutex_exit(&ztest_vdev_lock); mutex_exit(&ztest_checkpoint_lock); return; } spa_config_exit(spa, SCL_STATE, spa); /* * Expanding the LUN will update the config asynchronously, * thus we must wait for the async thread to complete any * pending tasks before proceeding. */ for (;;) { boolean_t done; mutex_enter(&spa->spa_async_lock); done = (spa->spa_async_thread == NULL && !spa->spa_async_tasks); mutex_exit(&spa->spa_async_lock); if (done) break; txg_wait_synced(spa_get_dsl(spa), 0); (void) poll(NULL, 0, 100); } spa_config_enter(spa, SCL_STATE, spa, RW_READER); tvd = spa->spa_root_vdev->vdev_child[top]; new_ms_count = tvd->vdev_ms_count; new_class_space = metaslab_class_get_space(mc); if (tvd->vdev_mg != mg || mg->mg_class != mc) { if (ztest_opts.zo_verbose >= 5) { (void) printf("Could not verify LUN expansion due to " "intervening vdev offline or remove.\n"); } spa_config_exit(spa, SCL_STATE, spa); mutex_exit(&ztest_vdev_lock); mutex_exit(&ztest_checkpoint_lock); return; } /* * Make sure we were able to grow the vdev. */ if (new_ms_count <= old_ms_count) { fatal(0, "LUN expansion failed: ms_count %llu < %llu\n", old_ms_count, new_ms_count); } /* * Make sure we were able to grow the pool. */ if (new_class_space <= old_class_space) { fatal(0, "LUN expansion failed: class_space %llu < %llu\n", old_class_space, new_class_space); } if (ztest_opts.zo_verbose >= 5) { char oldnumbuf[NN_NUMBUF_SZ], newnumbuf[NN_NUMBUF_SZ]; nicenum(old_class_space, oldnumbuf, sizeof (oldnumbuf)); nicenum(new_class_space, newnumbuf, sizeof (newnumbuf)); (void) printf("%s grew from %s to %s\n", spa->spa_name, oldnumbuf, newnumbuf); } spa_config_exit(spa, SCL_STATE, spa); mutex_exit(&ztest_vdev_lock); mutex_exit(&ztest_checkpoint_lock); } /* * Verify that dmu_objset_{create,destroy,open,close} work as expected. */ /* ARGSUSED */ static void ztest_objset_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) { /* * Create the objects common to all ztest datasets. */ VERIFY0(zap_create_claim(os, ZTEST_DIROBJ, DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx)); } static int ztest_dataset_create(char *dsname) { int err; uint64_t rand; dsl_crypto_params_t *dcp = NULL; /* * 50% of the time, we create encrypted datasets * using a random cipher suite and a hard-coded * wrapping key. */ rand = ztest_random(2); if (rand != 0) { nvlist_t *crypto_args = fnvlist_alloc(); nvlist_t *props = fnvlist_alloc(); /* slight bias towards the default cipher suite */ rand = ztest_random(ZIO_CRYPT_FUNCTIONS); if (rand < ZIO_CRYPT_AES_128_CCM) rand = ZIO_CRYPT_ON; fnvlist_add_uint64(props, zfs_prop_to_name(ZFS_PROP_ENCRYPTION), rand); fnvlist_add_uint8_array(crypto_args, "wkeydata", (uint8_t *)ztest_wkeydata, WRAPPING_KEY_LEN); /* * These parameters aren't really used by the kernel. They * are simply stored so that userspace knows how to load * the wrapping key. 
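 *
 * The net effect is a property list of the following shape (a sketch;
 * the wrapping key itself is the hard-coded ztest_wkeydata buffer of
 * WRAPPING_KEY_LEN bytes):
 *
 *      props:       ZFS_PROP_ENCRYPTION   = <random suite>
 *                   ZFS_PROP_KEYFORMAT    = ZFS_KEYFORMAT_RAW
 *                   ZFS_PROP_KEYLOCATION  = "prompt"
 *                   ZFS_PROP_PBKDF2_SALT  = 0
 *                   ZFS_PROP_PBKDF2_ITERS = 0
 *      crypto_args: wkeydata = ztest_wkeydata
 *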
*/ fnvlist_add_uint64(props, zfs_prop_to_name(ZFS_PROP_KEYFORMAT), ZFS_KEYFORMAT_RAW); fnvlist_add_string(props, zfs_prop_to_name(ZFS_PROP_KEYLOCATION), "prompt"); fnvlist_add_uint64(props, zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT), 0ULL); fnvlist_add_uint64(props, zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS), 0ULL); VERIFY0(dsl_crypto_params_create_nvlist(DCP_CMD_NONE, props, crypto_args, &dcp)); /* * Cycle through all available encryption implementations * to verify interoperability. */ VERIFY0(gcm_impl_set("cycle")); VERIFY0(aes_impl_set("cycle")); fnvlist_free(crypto_args); fnvlist_free(props); } err = dmu_objset_create(dsname, DMU_OST_OTHER, 0, dcp, ztest_objset_create_cb, NULL); dsl_crypto_params_free(dcp, !!err); rand = ztest_random(100); if (err || rand < 80) return (err); if (ztest_opts.zo_verbose >= 5) (void) printf("Setting dataset %s to sync always\n", dsname); return (ztest_dsl_prop_set_uint64(dsname, ZFS_PROP_SYNC, ZFS_SYNC_ALWAYS, B_FALSE)); } /* ARGSUSED */ static int ztest_objset_destroy_cb(const char *name, void *arg) { objset_t *os; dmu_object_info_t doi; int error; /* * Verify that the dataset contains a directory object. */ VERIFY0(ztest_dmu_objset_own(name, DMU_OST_OTHER, B_TRUE, B_TRUE, FTAG, &os)); error = dmu_object_info(os, ZTEST_DIROBJ, &doi); if (error != ENOENT) { /* We could have crashed in the middle of destroying it */ ASSERT0(error); ASSERT3U(doi.doi_type, ==, DMU_OT_ZAP_OTHER); ASSERT3S(doi.doi_physical_blocks_512, >=, 0); } dmu_objset_disown(os, B_TRUE, FTAG); /* * Destroy the dataset. */ if (strchr(name, '@') != NULL) { VERIFY0(dsl_destroy_snapshot(name, B_TRUE)); } else { error = dsl_destroy_head(name); if (error == ENOSPC) { /* There could be checkpoint or insufficient slop */ ztest_record_enospc(FTAG); } else if (error != EBUSY) { /* There could be a hold on this dataset */ ASSERT0(error); } } return (0); } static boolean_t ztest_snapshot_create(char *osname, uint64_t id) { char snapname[ZFS_MAX_DATASET_NAME_LEN]; int error; (void) snprintf(snapname, sizeof (snapname), "%llu", (u_longlong_t)id); error = dmu_objset_snapshot_one(osname, snapname); if (error == ENOSPC) { ztest_record_enospc(FTAG); return (B_FALSE); } if (error != 0 && error != EEXIST) { fatal(0, "ztest_snapshot_create(%s@%s) = %d", osname, snapname, error); } return (B_TRUE); } static boolean_t ztest_snapshot_destroy(char *osname, uint64_t id) { char snapname[ZFS_MAX_DATASET_NAME_LEN]; int error; (void) snprintf(snapname, sizeof (snapname), "%s@%llu", osname, (u_longlong_t)id); error = dsl_destroy_snapshot(snapname, B_FALSE); if (error != 0 && error != ENOENT) fatal(0, "ztest_snapshot_destroy(%s) = %d", snapname, error); return (B_TRUE); } /* ARGSUSED */ void ztest_dmu_objset_create_destroy(ztest_ds_t *zd, uint64_t id) { ztest_ds_t *zdtmp; int iters; int error; objset_t *os, *os2; char name[ZFS_MAX_DATASET_NAME_LEN]; zilog_t *zilog; int i; zdtmp = umem_alloc(sizeof (ztest_ds_t), UMEM_NOFAIL); (void) pthread_rwlock_rdlock(&ztest_name_lock); (void) snprintf(name, sizeof (name), "%s/temp_%llu", ztest_opts.zo_pool, (u_longlong_t)id); /* * If this dataset exists from a previous run, process its replay log * half of the time. If we don't replay it, then dsl_destroy_head() * (invoked from ztest_objset_destroy_cb()) should just throw it away. 
*/ if (ztest_random(2) == 0 && ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, B_TRUE, FTAG, &os) == 0) { ztest_zd_init(zdtmp, NULL, os); zil_replay(os, zdtmp, ztest_replay_vector); ztest_zd_fini(zdtmp); dmu_objset_disown(os, B_TRUE, FTAG); } /* * There may be an old instance of the dataset we're about to * create lying around from a previous run. If so, destroy it * and all of its snapshots. */ (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); /* * Verify that the destroyed dataset is no longer in the namespace. */ VERIFY3U(ENOENT, ==, ztest_dmu_objset_own(name, DMU_OST_OTHER, B_TRUE, B_TRUE, FTAG, &os)); /* * Verify that we can create a new dataset. */ error = ztest_dataset_create(name); if (error) { if (error == ENOSPC) { ztest_record_enospc(FTAG); goto out; } fatal(0, "dmu_objset_create(%s) = %d", name, error); } VERIFY0(ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, B_TRUE, FTAG, &os)); ztest_zd_init(zdtmp, NULL, os); /* * Open the intent log for it. */ zilog = zil_open(os, ztest_get_data); /* * Put some objects in there, do a little I/O to them, * and randomly take a couple of snapshots along the way. */ iters = ztest_random(5); for (i = 0; i < iters; i++) { ztest_dmu_object_alloc_free(zdtmp, id); if (ztest_random(iters) == 0) (void) ztest_snapshot_create(name, i); } /* * Verify that we cannot create an existing dataset. */ VERIFY3U(EEXIST, ==, dmu_objset_create(name, DMU_OST_OTHER, 0, NULL, NULL, NULL)); /* * Verify that we can hold an objset that is also owned. */ VERIFY0(dmu_objset_hold(name, FTAG, &os2)); dmu_objset_rele(os2, FTAG); /* * Verify that we cannot own an objset that is already owned. */ VERIFY3U(EBUSY, ==, ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, B_TRUE, FTAG, &os2)); zil_close(zilog); dmu_objset_disown(os, B_TRUE, FTAG); ztest_zd_fini(zdtmp); out: (void) pthread_rwlock_unlock(&ztest_name_lock); umem_free(zdtmp, sizeof (ztest_ds_t)); } /* * Verify that dmu_snapshot_{create,destroy,open,close} work as expected. */ void ztest_dmu_snapshot_create_destroy(ztest_ds_t *zd, uint64_t id) { (void) pthread_rwlock_rdlock(&ztest_name_lock); (void) ztest_snapshot_destroy(zd->zd_name, id); (void) ztest_snapshot_create(zd->zd_name, id); (void) pthread_rwlock_unlock(&ztest_name_lock); } /* * Cleanup non-standard snapshots and clones. 
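 *
 * The names follow the fixed scheme used by the promote test below
 * (shown here for a single id, purely as an illustration):
 *
 *      <osname>@s1_<id>            first snapshot of the origin
 *      <osname>/c1_<id>            clone of s1
 *      <osname>/c1_<id>@s2_<id>    first snapshot of the clone
 *      <osname>/c1_<id>@s3_<id>    second snapshot of the clone
 *      <osname>/c2_<id>            clone of s3
 *
 * and they are destroyed leaf-first so that no clone is left holding
 * a snapshot we are about to remove.
 *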
*/ static void ztest_dsl_dataset_cleanup(char *osname, uint64_t id) { char *snap1name; char *clone1name; char *snap2name; char *clone2name; char *snap3name; int error; snap1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); clone1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); snap2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); clone2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); snap3name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); (void) snprintf(snap1name, ZFS_MAX_DATASET_NAME_LEN, "%s@s1_%llu", osname, (u_longlong_t)id); (void) snprintf(clone1name, ZFS_MAX_DATASET_NAME_LEN, "%s/c1_%llu", osname, (u_longlong_t)id); (void) snprintf(snap2name, ZFS_MAX_DATASET_NAME_LEN, "%s@s2_%llu", clone1name, (u_longlong_t)id); (void) snprintf(clone2name, ZFS_MAX_DATASET_NAME_LEN, "%s/c2_%llu", osname, (u_longlong_t)id); (void) snprintf(snap3name, ZFS_MAX_DATASET_NAME_LEN, "%s@s3_%llu", clone1name, (u_longlong_t)id); error = dsl_destroy_head(clone2name); if (error && error != ENOENT) fatal(0, "dsl_destroy_head(%s) = %d", clone2name, error); error = dsl_destroy_snapshot(snap3name, B_FALSE); if (error && error != ENOENT) fatal(0, "dsl_destroy_snapshot(%s) = %d", snap3name, error); error = dsl_destroy_snapshot(snap2name, B_FALSE); if (error && error != ENOENT) fatal(0, "dsl_destroy_snapshot(%s) = %d", snap2name, error); error = dsl_destroy_head(clone1name); if (error && error != ENOENT) fatal(0, "dsl_destroy_head(%s) = %d", clone1name, error); error = dsl_destroy_snapshot(snap1name, B_FALSE); if (error && error != ENOENT) fatal(0, "dsl_destroy_snapshot(%s) = %d", snap1name, error); umem_free(snap1name, ZFS_MAX_DATASET_NAME_LEN); umem_free(clone1name, ZFS_MAX_DATASET_NAME_LEN); umem_free(snap2name, ZFS_MAX_DATASET_NAME_LEN); umem_free(clone2name, ZFS_MAX_DATASET_NAME_LEN); umem_free(snap3name, ZFS_MAX_DATASET_NAME_LEN); } /* * Verify dsl_dataset_promote handles EBUSY */ void ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id) { objset_t *os; char *snap1name; char *clone1name; char *snap2name; char *clone2name; char *snap3name; char *osname = zd->zd_name; int error; snap1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); clone1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); snap2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); clone2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); snap3name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); (void) pthread_rwlock_rdlock(&ztest_name_lock); ztest_dsl_dataset_cleanup(osname, id); (void) snprintf(snap1name, ZFS_MAX_DATASET_NAME_LEN, "%s@s1_%llu", osname, (u_longlong_t)id); (void) snprintf(clone1name, ZFS_MAX_DATASET_NAME_LEN, "%s/c1_%llu", osname, (u_longlong_t)id); (void) snprintf(snap2name, ZFS_MAX_DATASET_NAME_LEN, "%s@s2_%llu", clone1name, (u_longlong_t)id); (void) snprintf(clone2name, ZFS_MAX_DATASET_NAME_LEN, "%s/c2_%llu", osname, (u_longlong_t)id); (void) snprintf(snap3name, ZFS_MAX_DATASET_NAME_LEN, "%s@s3_%llu", clone1name, (u_longlong_t)id); error = dmu_objset_snapshot_one(osname, strchr(snap1name, '@') + 1); if (error && error != EEXIST) { if (error == ENOSPC) { ztest_record_enospc(FTAG); goto out; } fatal(0, "dmu_take_snapshot(%s) = %d", snap1name, error); } error = dmu_objset_clone(clone1name, snap1name); if (error) { if (error == ENOSPC) { ztest_record_enospc(FTAG); goto out; } fatal(0, "dmu_objset_create(%s) = %d", clone1name, error); } error = dmu_objset_snapshot_one(clone1name, strchr(snap2name, '@') + 1); if (error && error != EEXIST) { if (error == 
ENOSPC) { ztest_record_enospc(FTAG); goto out; } fatal(0, "dmu_open_snapshot(%s) = %d", snap2name, error); } error = dmu_objset_snapshot_one(clone1name, strchr(snap3name, '@') + 1); if (error && error != EEXIST) { if (error == ENOSPC) { ztest_record_enospc(FTAG); goto out; } fatal(0, "dmu_open_snapshot(%s) = %d", snap3name, error); } error = dmu_objset_clone(clone2name, snap3name); if (error) { if (error == ENOSPC) { ztest_record_enospc(FTAG); goto out; } fatal(0, "dmu_objset_create(%s) = %d", clone2name, error); } error = ztest_dmu_objset_own(snap2name, DMU_OST_ANY, B_TRUE, B_TRUE, FTAG, &os); if (error) fatal(0, "dmu_objset_own(%s) = %d", snap2name, error); error = dsl_dataset_promote(clone2name, NULL); if (error == ENOSPC) { dmu_objset_disown(os, B_TRUE, FTAG); ztest_record_enospc(FTAG); goto out; } if (error != EBUSY) fatal(0, "dsl_dataset_promote(%s), %d, not EBUSY", clone2name, error); dmu_objset_disown(os, B_TRUE, FTAG); out: ztest_dsl_dataset_cleanup(osname, id); (void) pthread_rwlock_unlock(&ztest_name_lock); umem_free(snap1name, ZFS_MAX_DATASET_NAME_LEN); umem_free(clone1name, ZFS_MAX_DATASET_NAME_LEN); umem_free(snap2name, ZFS_MAX_DATASET_NAME_LEN); umem_free(clone2name, ZFS_MAX_DATASET_NAME_LEN); umem_free(snap3name, ZFS_MAX_DATASET_NAME_LEN); } #undef OD_ARRAY_SIZE #define OD_ARRAY_SIZE 4 /* * Verify that dmu_object_{alloc,free} work as expected. */ void ztest_dmu_object_alloc_free(ztest_ds_t *zd, uint64_t id) { ztest_od_t *od; int batchsize; int size; int b; size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; od = umem_alloc(size, UMEM_NOFAIL); batchsize = OD_ARRAY_SIZE; for (b = 0; b < batchsize; b++) ztest_od_init(od + b, id, FTAG, b, DMU_OT_UINT64_OTHER, 0, 0, 0); /* * Destroy the previous batch of objects, create a new batch, * and do some I/O on the new objects. */ if (ztest_object_init(zd, od, size, B_TRUE) != 0) return; while (ztest_random(4 * batchsize) != 0) ztest_io(zd, od[ztest_random(batchsize)].od_object, ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); umem_free(od, size); } /* * Rewind the global allocator to verify object allocation backfilling. */ void ztest_dmu_object_next_chunk(ztest_ds_t *zd, uint64_t id) { objset_t *os = zd->zd_os; int dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift; uint64_t object; /* * Rewind the global allocator randomly back to a lower object number * to force backfilling and reclamation of recently freed dnodes. */ mutex_enter(&os->os_obj_lock); object = ztest_random(os->os_obj_next_chunk); os->os_obj_next_chunk = P2ALIGN(object, dnodes_per_chunk); mutex_exit(&os->os_obj_lock); } #undef OD_ARRAY_SIZE #define OD_ARRAY_SIZE 2 /* * Verify that dmu_{read,write} work as expected. */ void ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id) { int size; ztest_od_t *od; objset_t *os = zd->zd_os; size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; od = umem_alloc(size, UMEM_NOFAIL); dmu_tx_t *tx; int i, freeit, error; uint64_t n, s, txg; bufwad_t *packbuf, *bigbuf, *pack, *bigH, *bigT; uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize; uint64_t chunksize = (1000 + ztest_random(1000)) * sizeof (uint64_t); uint64_t regions = 997; uint64_t stride = 123456789ULL; uint64_t width = 40; int free_percent = 5; /* * This test uses two objects, packobj and bigobj, that are always * updated together (i.e. in the same tx) so that their contents are * in sync and can be compared. Their contents relate to each other * in a simple way: packobj is a dense array of 'bufwad' structures, * while bigobj is a sparse array of the same bufwads. 
Specifically, * for any index n, there are three bufwads that should be identical: * * packobj, at offset n * sizeof (bufwad_t) * bigobj, at the head of the nth chunk * bigobj, at the tail of the nth chunk * * The chunk size is arbitrary. It doesn't have to be a power of two, * and it doesn't have any relation to the object blocksize. * The only requirement is that it can hold at least two bufwads. * * Normally, we write the bufwad to each of these locations. * However, free_percent of the time we instead write zeroes to * packobj and perform a dmu_free_range() on bigobj. By comparing * bigobj to packobj, we can verify that the DMU is correctly * tracking which parts of an object are allocated and free, * and that the contents of the allocated blocks are correct. */ /* * Read the directory info. If it's the first time, set things up. */ ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, chunksize); ztest_od_init(od + 1, id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, 0, chunksize); if (ztest_object_init(zd, od, size, B_FALSE) != 0) { umem_free(od, size); return; } bigobj = od[0].od_object; packobj = od[1].od_object; chunksize = od[0].od_gen; ASSERT3U(chunksize, ==, od[1].od_gen); /* * Prefetch a random chunk of the big object. * Our aim here is to get some async reads in flight * for blocks that we may free below; the DMU should * handle this race correctly. */ n = ztest_random(regions) * stride + ztest_random(width); s = 1 + ztest_random(2 * width - 1); dmu_prefetch(os, bigobj, 0, n * chunksize, s * chunksize, ZIO_PRIORITY_SYNC_READ); /* * Pick a random index and compute the offsets into packobj and bigobj. */ n = ztest_random(regions) * stride + ztest_random(width); s = 1 + ztest_random(width - 1); packoff = n * sizeof (bufwad_t); packsize = s * sizeof (bufwad_t); bigoff = n * chunksize; bigsize = s * chunksize; packbuf = umem_alloc(packsize, UMEM_NOFAIL); bigbuf = umem_alloc(bigsize, UMEM_NOFAIL); /* * free_percent of the time, free a range of bigobj rather than * overwriting it. */ freeit = (ztest_random(100) < free_percent); /* * Read the current contents of our objects. */ error = dmu_read(os, packobj, packoff, packsize, packbuf, DMU_READ_PREFETCH); ASSERT0(error); error = dmu_read(os, bigobj, bigoff, bigsize, bigbuf, DMU_READ_PREFETCH); ASSERT0(error); /* * Get a tx for the mods to both packobj and bigobj. */ tx = dmu_tx_create(os); dmu_tx_hold_write(tx, packobj, packoff, packsize); if (freeit) dmu_tx_hold_free(tx, bigobj, bigoff, bigsize); else dmu_tx_hold_write(tx, bigobj, bigoff, bigsize); /* This accounts for setting the checksum/compression. */ dmu_tx_hold_bonus(tx, bigobj); txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); if (txg == 0) { umem_free(packbuf, packsize); umem_free(bigbuf, bigsize); umem_free(od, size); return; } enum zio_checksum cksum; do { cksum = (enum zio_checksum) ztest_random_dsl_prop(ZFS_PROP_CHECKSUM); } while (cksum >= ZIO_CHECKSUM_LEGACY_FUNCTIONS); dmu_object_set_checksum(os, bigobj, cksum, tx); enum zio_compress comp; do { comp = (enum zio_compress) ztest_random_dsl_prop(ZFS_PROP_COMPRESSION); } while (comp >= ZIO_COMPRESS_LEGACY_FUNCTIONS); dmu_object_set_compress(os, bigobj, comp, tx); /* * For each index from n to n + s, verify that the existing bufwad * in packobj matches the bufwads at the head and tail of the * corresponding chunk in bigobj. Then update all three bufwads * with the new values we want to write out. 
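 *
 * Concretely, for chunk i of this update the three copies that must
 * agree are (this mirrors the pointer math in the loop below):
 *
 *      pack = packbuf + i * sizeof (bufwad_t)
 *      bigH = bigbuf + i * chunksize
 *      bigT = bigbuf + (i + 1) * chunksize - sizeof (bufwad_t)
 *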
*/ for (i = 0; i < s; i++) { /* LINTED */ pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t)); /* LINTED */ bigH = (bufwad_t *)((char *)bigbuf + i * chunksize); /* LINTED */ bigT = (bufwad_t *)((char *)bigH + chunksize) - 1; ASSERT3U((uintptr_t)bigH - (uintptr_t)bigbuf, <, bigsize); ASSERT3U((uintptr_t)bigT - (uintptr_t)bigbuf, <, bigsize); if (pack->bw_txg > txg) fatal(0, "future leak: got %llx, open txg is %llx", pack->bw_txg, txg); if (pack->bw_data != 0 && pack->bw_index != n + i) fatal(0, "wrong index: got %llx, wanted %llx+%llx", pack->bw_index, n, i); if (bcmp(pack, bigH, sizeof (bufwad_t)) != 0) fatal(0, "pack/bigH mismatch in %p/%p", pack, bigH); if (bcmp(pack, bigT, sizeof (bufwad_t)) != 0) fatal(0, "pack/bigT mismatch in %p/%p", pack, bigT); if (freeit) { bzero(pack, sizeof (bufwad_t)); } else { pack->bw_index = n + i; pack->bw_txg = txg; pack->bw_data = 1 + ztest_random(-2ULL); } *bigH = *pack; *bigT = *pack; } /* * We've verified all the old bufwads, and made new ones. * Now write them out. */ dmu_write(os, packobj, packoff, packsize, packbuf, tx); if (freeit) { if (ztest_opts.zo_verbose >= 7) { (void) printf("freeing offset %llx size %llx" " txg %llx\n", (u_longlong_t)bigoff, (u_longlong_t)bigsize, (u_longlong_t)txg); } VERIFY0(dmu_free_range(os, bigobj, bigoff, bigsize, tx)); } else { if (ztest_opts.zo_verbose >= 7) { (void) printf("writing offset %llx size %llx" " txg %llx\n", (u_longlong_t)bigoff, (u_longlong_t)bigsize, (u_longlong_t)txg); } dmu_write(os, bigobj, bigoff, bigsize, bigbuf, tx); } dmu_tx_commit(tx); /* * Sanity check the stuff we just wrote. */ { void *packcheck = umem_alloc(packsize, UMEM_NOFAIL); void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL); VERIFY0(dmu_read(os, packobj, packoff, packsize, packcheck, DMU_READ_PREFETCH)); VERIFY0(dmu_read(os, bigobj, bigoff, bigsize, bigcheck, DMU_READ_PREFETCH)); ASSERT0(bcmp(packbuf, packcheck, packsize)); ASSERT0(bcmp(bigbuf, bigcheck, bigsize)); umem_free(packcheck, packsize); umem_free(bigcheck, bigsize); } umem_free(packbuf, packsize); umem_free(bigbuf, bigsize); umem_free(od, size); } static void compare_and_update_pbbufs(uint64_t s, bufwad_t *packbuf, bufwad_t *bigbuf, uint64_t bigsize, uint64_t n, uint64_t chunksize, uint64_t txg) { uint64_t i; bufwad_t *pack; bufwad_t *bigH; bufwad_t *bigT; /* * For each index from n to n + s, verify that the existing bufwad * in packobj matches the bufwads at the head and tail of the * corresponding chunk in bigobj. Then update all three bufwads * with the new values we want to write out. 
*/ for (i = 0; i < s; i++) { /* LINTED */ pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t)); /* LINTED */ bigH = (bufwad_t *)((char *)bigbuf + i * chunksize); /* LINTED */ bigT = (bufwad_t *)((char *)bigH + chunksize) - 1; ASSERT3U((uintptr_t)bigH - (uintptr_t)bigbuf, <, bigsize); ASSERT3U((uintptr_t)bigT - (uintptr_t)bigbuf, <, bigsize); if (pack->bw_txg > txg) fatal(0, "future leak: got %llx, open txg is %llx", pack->bw_txg, txg); if (pack->bw_data != 0 && pack->bw_index != n + i) fatal(0, "wrong index: got %llx, wanted %llx+%llx", pack->bw_index, n, i); if (bcmp(pack, bigH, sizeof (bufwad_t)) != 0) fatal(0, "pack/bigH mismatch in %p/%p", pack, bigH); if (bcmp(pack, bigT, sizeof (bufwad_t)) != 0) fatal(0, "pack/bigT mismatch in %p/%p", pack, bigT); pack->bw_index = n + i; pack->bw_txg = txg; pack->bw_data = 1 + ztest_random(-2ULL); *bigH = *pack; *bigT = *pack; } } #undef OD_ARRAY_SIZE #define OD_ARRAY_SIZE 2 void ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id) { objset_t *os = zd->zd_os; ztest_od_t *od; dmu_tx_t *tx; uint64_t i; int error; int size; uint64_t n, s, txg; bufwad_t *packbuf, *bigbuf; uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize; uint64_t blocksize = ztest_random_blocksize(); uint64_t chunksize = blocksize; uint64_t regions = 997; uint64_t stride = 123456789ULL; uint64_t width = 9; dmu_buf_t *bonus_db; arc_buf_t **bigbuf_arcbufs; dmu_object_info_t doi; size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; od = umem_alloc(size, UMEM_NOFAIL); /* * This test uses two objects, packobj and bigobj, that are always * updated together (i.e. in the same tx) so that their contents are * in sync and can be compared. Their contents relate to each other * in a simple way: packobj is a dense array of 'bufwad' structures, * while bigobj is a sparse array of the same bufwads. Specifically, * for any index n, there are three bufwads that should be identical: * * packobj, at offset n * sizeof (bufwad_t) * bigobj, at the head of the nth chunk * bigobj, at the tail of the nth chunk * * The chunk size is set equal to bigobj block size so that * dmu_assign_arcbuf_by_dbuf() can be tested for object updates. */ /* * Read the directory info. If it's the first time, set things up. */ ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0, 0); ztest_od_init(od + 1, id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, 0, chunksize); if (ztest_object_init(zd, od, size, B_FALSE) != 0) { umem_free(od, size); return; } bigobj = od[0].od_object; packobj = od[1].od_object; blocksize = od[0].od_blocksize; chunksize = blocksize; ASSERT3U(chunksize, ==, od[1].od_gen); VERIFY0(dmu_object_info(os, bigobj, &doi)); VERIFY(ISP2(doi.doi_data_block_size)); VERIFY3U(chunksize, ==, doi.doi_data_block_size); VERIFY3U(chunksize, >=, 2 * sizeof (bufwad_t)); /* * Pick a random index and compute the offsets into packobj and bigobj. */ n = ztest_random(regions) * stride + ztest_random(width); s = 1 + ztest_random(width - 1); packoff = n * sizeof (bufwad_t); packsize = s * sizeof (bufwad_t); bigoff = n * chunksize; bigsize = s * chunksize; packbuf = umem_zalloc(packsize, UMEM_NOFAIL); bigbuf = umem_zalloc(bigsize, UMEM_NOFAIL); VERIFY0(dmu_bonus_hold(os, bigobj, FTAG, &bonus_db)); bigbuf_arcbufs = umem_zalloc(2 * s * sizeof (arc_buf_t *), UMEM_NOFAIL); /* * Iteration 0 test zcopy for DB_UNCACHED dbufs. * Iteration 1 test zcopy to already referenced dbufs. * Iteration 2 test zcopy to dirty dbuf in the same txg. * Iteration 3 test zcopy to dbuf dirty in previous txg. 
* Iteration 4 test zcopy when dbuf is no longer dirty. * Iteration 5 test zcopy when it can't be done. * Iteration 6 one more zcopy write. */ for (i = 0; i < 7; i++) { uint64_t j; uint64_t off; /* * In iteration 5 (i == 5) use arcbufs * that don't match bigobj blksz to test * dmu_assign_arcbuf_by_dbuf() when it can't directly * assign an arcbuf to a dbuf. */ for (j = 0; j < s; j++) { if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { bigbuf_arcbufs[j] = dmu_request_arcbuf(bonus_db, chunksize); } else { bigbuf_arcbufs[2 * j] = dmu_request_arcbuf(bonus_db, chunksize / 2); bigbuf_arcbufs[2 * j + 1] = dmu_request_arcbuf(bonus_db, chunksize / 2); } } /* * Get a tx for the mods to both packobj and bigobj. */ tx = dmu_tx_create(os); dmu_tx_hold_write(tx, packobj, packoff, packsize); dmu_tx_hold_write(tx, bigobj, bigoff, bigsize); txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); if (txg == 0) { umem_free(packbuf, packsize); umem_free(bigbuf, bigsize); for (j = 0; j < s; j++) { if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { dmu_return_arcbuf(bigbuf_arcbufs[j]); } else { dmu_return_arcbuf( bigbuf_arcbufs[2 * j]); dmu_return_arcbuf( bigbuf_arcbufs[2 * j + 1]); } } umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *)); umem_free(od, size); dmu_buf_rele(bonus_db, FTAG); return; } /* * 50% of the time don't read objects in the 1st iteration to * test dmu_assign_arcbuf_by_dbuf() for the case when there are * no existing dbufs for the specified offsets. */ if (i != 0 || ztest_random(2) != 0) { error = dmu_read(os, packobj, packoff, packsize, packbuf, DMU_READ_PREFETCH); ASSERT0(error); error = dmu_read(os, bigobj, bigoff, bigsize, bigbuf, DMU_READ_PREFETCH); ASSERT0(error); } compare_and_update_pbbufs(s, packbuf, bigbuf, bigsize, n, chunksize, txg); /* * We've verified all the old bufwads, and made new ones. * Now write them out. */ dmu_write(os, packobj, packoff, packsize, packbuf, tx); if (ztest_opts.zo_verbose >= 7) { (void) printf("writing offset %llx size %llx" " txg %llx\n", (u_longlong_t)bigoff, (u_longlong_t)bigsize, (u_longlong_t)txg); } for (off = bigoff, j = 0; j < s; j++, off += chunksize) { dmu_buf_t *dbt; if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { bcopy((caddr_t)bigbuf + (off - bigoff), bigbuf_arcbufs[j]->b_data, chunksize); } else { bcopy((caddr_t)bigbuf + (off - bigoff), bigbuf_arcbufs[2 * j]->b_data, chunksize / 2); bcopy((caddr_t)bigbuf + (off - bigoff) + chunksize / 2, bigbuf_arcbufs[2 * j + 1]->b_data, chunksize / 2); } if (i == 1) { VERIFY(dmu_buf_hold(os, bigobj, off, FTAG, &dbt, DMU_READ_NO_PREFETCH) == 0); } if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db, off, bigbuf_arcbufs[j], tx)); } else { VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db, off, bigbuf_arcbufs[2 * j], tx)); VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db, off + chunksize / 2, bigbuf_arcbufs[2 * j + 1], tx)); } if (i == 1) { dmu_buf_rele(dbt, FTAG); } } dmu_tx_commit(tx); /* * Sanity check the stuff we just wrote. 
*/ { void *packcheck = umem_alloc(packsize, UMEM_NOFAIL); void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL); VERIFY0(dmu_read(os, packobj, packoff, packsize, packcheck, DMU_READ_PREFETCH)); VERIFY0(dmu_read(os, bigobj, bigoff, bigsize, bigcheck, DMU_READ_PREFETCH)); ASSERT0(bcmp(packbuf, packcheck, packsize)); ASSERT0(bcmp(bigbuf, bigcheck, bigsize)); umem_free(packcheck, packsize); umem_free(bigcheck, bigsize); } if (i == 2) { txg_wait_open(dmu_objset_pool(os), 0, B_TRUE); } else if (i == 3) { txg_wait_synced(dmu_objset_pool(os), 0); } } dmu_buf_rele(bonus_db, FTAG); umem_free(packbuf, packsize); umem_free(bigbuf, bigsize); umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *)); umem_free(od, size); } /* ARGSUSED */ void ztest_dmu_write_parallel(ztest_ds_t *zd, uint64_t id) { ztest_od_t *od; od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); uint64_t offset = (1ULL << (ztest_random(20) + 43)) + (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); /* * Have multiple threads write to large offsets in an object * to verify that parallel writes to an object -- even to the * same blocks within the object -- doesn't cause any trouble. */ ztest_od_init(od, ID_PARALLEL, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0); if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) return; while (ztest_random(10) != 0) ztest_io(zd, od->od_object, offset); umem_free(od, sizeof (ztest_od_t)); } void ztest_dmu_prealloc(ztest_ds_t *zd, uint64_t id) { ztest_od_t *od; uint64_t offset = (1ULL << (ztest_random(4) + SPA_MAXBLOCKSHIFT)) + (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); uint64_t count = ztest_random(20) + 1; uint64_t blocksize = ztest_random_blocksize(); void *data; od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0, 0); if (ztest_object_init(zd, od, sizeof (ztest_od_t), !ztest_random(2)) != 0) { umem_free(od, sizeof (ztest_od_t)); return; } if (ztest_truncate(zd, od->od_object, offset, count * blocksize) != 0) { umem_free(od, sizeof (ztest_od_t)); return; } ztest_prealloc(zd, od->od_object, offset, count * blocksize); data = umem_zalloc(blocksize, UMEM_NOFAIL); while (ztest_random(count) != 0) { uint64_t randoff = offset + (ztest_random(count) * blocksize); if (ztest_write(zd, od->od_object, randoff, blocksize, data) != 0) break; while (ztest_random(4) != 0) ztest_io(zd, od->od_object, randoff); } umem_free(data, blocksize); umem_free(od, sizeof (ztest_od_t)); } /* * Verify that zap_{create,destroy,add,remove,update} work as expected. */ #define ZTEST_ZAP_MIN_INTS 1 #define ZTEST_ZAP_MAX_INTS 4 #define ZTEST_ZAP_MAX_PROPS 1000 void ztest_zap(ztest_ds_t *zd, uint64_t id) { objset_t *os = zd->zd_os; ztest_od_t *od; uint64_t object; uint64_t txg, last_txg; uint64_t value[ZTEST_ZAP_MAX_INTS]; uint64_t zl_ints, zl_intsize, prop; int i, ints; dmu_tx_t *tx; char propname[100], txgname[100]; int error; char *hc[2] = { "s.acl.h", ".s.open.h.hyLZlg" }; od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); ztest_od_init(od, id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0, 0); if (ztest_object_init(zd, od, sizeof (ztest_od_t), !ztest_random(2)) != 0) goto out; object = od->od_object; /* * Generate a known hash collision, and verify that * we can lookup and remove both entries. 
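 *
 * The two names in hc[] above are a pair known to collide in the ZAP
 * hash, so both entries land in the same bucket. The sequence below
 * adds both in one tx, checks that adding either again returns EEXIST
 * and that zap_length() reports a single uint64 for each, and then
 * removes them both.
 *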
*/ tx = dmu_tx_create(os); dmu_tx_hold_zap(tx, object, B_TRUE, NULL); txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); if (txg == 0) goto out; for (i = 0; i < 2; i++) { value[i] = i; VERIFY0(zap_add(os, object, hc[i], sizeof (uint64_t), 1, &value[i], tx)); } for (i = 0; i < 2; i++) { VERIFY3U(EEXIST, ==, zap_add(os, object, hc[i], sizeof (uint64_t), 1, &value[i], tx)); VERIFY0( zap_length(os, object, hc[i], &zl_intsize, &zl_ints)); ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); ASSERT3U(zl_ints, ==, 1); } for (i = 0; i < 2; i++) { VERIFY0(zap_remove(os, object, hc[i], tx)); } dmu_tx_commit(tx); /* * Generate a bunch of random entries. */ ints = MAX(ZTEST_ZAP_MIN_INTS, object % ZTEST_ZAP_MAX_INTS); prop = ztest_random(ZTEST_ZAP_MAX_PROPS); (void) sprintf(propname, "prop_%llu", (u_longlong_t)prop); (void) sprintf(txgname, "txg_%llu", (u_longlong_t)prop); bzero(value, sizeof (value)); last_txg = 0; /* * If these zap entries already exist, validate their contents. */ error = zap_length(os, object, txgname, &zl_intsize, &zl_ints); if (error == 0) { ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); ASSERT3U(zl_ints, ==, 1); VERIFY0(zap_lookup(os, object, txgname, zl_intsize, zl_ints, &last_txg)); VERIFY0(zap_length(os, object, propname, &zl_intsize, &zl_ints)); ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); ASSERT3U(zl_ints, ==, ints); VERIFY0(zap_lookup(os, object, propname, zl_intsize, zl_ints, value)); for (i = 0; i < ints; i++) { ASSERT3U(value[i], ==, last_txg + object + i); } } else { ASSERT3U(error, ==, ENOENT); } /* * Atomically update two entries in our zap object. * The first is named txg_%llu, and contains the txg * in which the property was last updated. The second * is named prop_%llu, and the nth element of its value * should be txg + object + n. */ tx = dmu_tx_create(os); dmu_tx_hold_zap(tx, object, B_TRUE, NULL); txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); if (txg == 0) goto out; if (last_txg > txg) fatal(0, "zap future leak: old %llu new %llu", last_txg, txg); for (i = 0; i < ints; i++) value[i] = txg + object + i; VERIFY0(zap_update(os, object, txgname, sizeof (uint64_t), 1, &txg, tx)); VERIFY0(zap_update(os, object, propname, sizeof (uint64_t), ints, value, tx)); dmu_tx_commit(tx); /* * Remove a random pair of entries. */ prop = ztest_random(ZTEST_ZAP_MAX_PROPS); (void) sprintf(propname, "prop_%llu", (u_longlong_t)prop); (void) sprintf(txgname, "txg_%llu", (u_longlong_t)prop); error = zap_length(os, object, txgname, &zl_intsize, &zl_ints); if (error == ENOENT) goto out; ASSERT0(error); tx = dmu_tx_create(os); dmu_tx_hold_zap(tx, object, B_TRUE, NULL); txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); if (txg == 0) goto out; VERIFY0(zap_remove(os, object, txgname, tx)); VERIFY0(zap_remove(os, object, propname, tx)); dmu_tx_commit(tx); out: umem_free(od, sizeof (ztest_od_t)); } /* * Test case to test the upgrading of a microzap to fatzap. */ void ztest_fzap(ztest_ds_t *zd, uint64_t id) { objset_t *os = zd->zd_os; ztest_od_t *od; uint64_t object, txg; int i; od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); ztest_od_init(od, id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0, 0); if (ztest_object_init(zd, od, sizeof (ztest_od_t), !ztest_random(2)) != 0) goto out; object = od->od_object; /* * Add entries to this ZAP and make sure it spills over * and gets upgraded to a fatzap. Also, since we are adding * 2050 entries we should see ptrtbl growth and leaf-block split. 
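 *
 * A rough sketch of why 2050 is enough (assuming the usual 64-byte
 * microzap entries): a microzap must fit in a single block, so even a
 * 128K block tops out at roughly two thousand entries; the
 * "fzap-<id>-<value>" names generated below stay short, so it is
 * purely the entry count that pushes the object over the limit and
 * forces the fatzap conversion.
 *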
*/ for (i = 0; i < 2050; i++) { char name[ZFS_MAX_DATASET_NAME_LEN]; uint64_t value = i; dmu_tx_t *tx; int error; (void) snprintf(name, sizeof (name), "fzap-%llu-%llu", (u_longlong_t)id, (u_longlong_t)value); tx = dmu_tx_create(os); dmu_tx_hold_zap(tx, object, B_TRUE, name); txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); if (txg == 0) goto out; error = zap_add(os, object, name, sizeof (uint64_t), 1, &value, tx); ASSERT(error == 0 || error == EEXIST); dmu_tx_commit(tx); } out: umem_free(od, sizeof (ztest_od_t)); } /* ARGSUSED */ void ztest_zap_parallel(ztest_ds_t *zd, uint64_t id) { objset_t *os = zd->zd_os; ztest_od_t *od; uint64_t txg, object, count, wsize, wc, zl_wsize, zl_wc; dmu_tx_t *tx; int i, namelen, error; int micro = ztest_random(2); char name[20], string_value[20]; void *data; od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); ztest_od_init(od, ID_PARALLEL, FTAG, micro, DMU_OT_ZAP_OTHER, 0, 0, 0); if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) { umem_free(od, sizeof (ztest_od_t)); return; } object = od->od_object; /* * Generate a random name of the form 'xxx.....' where each * x is a random printable character and the dots are dots. * There are 94 such characters, and the name length goes from * 6 to 20, so there are 94^3 * 15 = 12,458,760 possible names. */ namelen = ztest_random(sizeof (name) - 5) + 5 + 1; for (i = 0; i < 3; i++) name[i] = '!' + ztest_random('~' - '!' + 1); for (; i < namelen - 1; i++) name[i] = '.'; name[i] = '\0'; if ((namelen & 1) || micro) { wsize = sizeof (txg); wc = 1; data = &txg; } else { wsize = 1; wc = namelen; data = string_value; } count = -1ULL; VERIFY0(zap_count(os, object, &count)); ASSERT3S(count, !=, -1ULL); /* * Select an operation: length, lookup, add, update, remove. */ i = ztest_random(5); if (i >= 2) { tx = dmu_tx_create(os); dmu_tx_hold_zap(tx, object, B_TRUE, NULL); txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); if (txg == 0) { umem_free(od, sizeof (ztest_od_t)); return; } bcopy(name, string_value, namelen); } else { tx = NULL; txg = 0; bzero(string_value, namelen); } switch (i) { case 0: error = zap_length(os, object, name, &zl_wsize, &zl_wc); if (error == 0) { ASSERT3U(wsize, ==, zl_wsize); ASSERT3U(wc, ==, zl_wc); } else { ASSERT3U(error, ==, ENOENT); } break; case 1: error = zap_lookup(os, object, name, wsize, wc, data); if (error == 0) { if (data == string_value && bcmp(name, data, namelen) != 0) fatal(0, "name '%s' != val '%s' len %d", name, data, namelen); } else { ASSERT3U(error, ==, ENOENT); } break; case 2: error = zap_add(os, object, name, wsize, wc, data, tx); ASSERT(error == 0 || error == EEXIST); break; case 3: VERIFY0(zap_update(os, object, name, wsize, wc, data, tx)); break; case 4: error = zap_remove(os, object, name, tx); ASSERT(error == 0 || error == ENOENT); break; } if (tx != NULL) dmu_tx_commit(tx); umem_free(od, sizeof (ztest_od_t)); } /* * Commit callback data. 
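 *
 * Lifecycle of one of these records, as exercised below: it is
 * allocated and registered against a tx with
 * dmu_tx_callback_register(), linked onto zcl.zcl_callbacks once the
 * tx has been assigned a txg, and freed either by
 * ztest_commit_callback() after that txg syncs, or by the caller
 * after dmu_tx_abort() delivers ECANCELED.
 *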
*/ typedef struct ztest_cb_data { list_node_t zcd_node; uint64_t zcd_txg; int zcd_expected_err; boolean_t zcd_added; boolean_t zcd_called; spa_t *zcd_spa; } ztest_cb_data_t; /* This is the actual commit callback function */ static void ztest_commit_callback(void *arg, int error) { ztest_cb_data_t *data = arg; uint64_t synced_txg; VERIFY3P(data, !=, NULL); VERIFY3S(data->zcd_expected_err, ==, error); VERIFY(!data->zcd_called); synced_txg = spa_last_synced_txg(data->zcd_spa); if (data->zcd_txg > synced_txg) fatal(0, "commit callback of txg %" PRIu64 " called prematurely" ", last synced txg = %" PRIu64 "\n", data->zcd_txg, synced_txg); data->zcd_called = B_TRUE; if (error == ECANCELED) { ASSERT0(data->zcd_txg); ASSERT(!data->zcd_added); /* * The private callback data should be destroyed here, but * since we are going to check the zcd_called field after * dmu_tx_abort(), we will destroy it there. */ return; } ASSERT(data->zcd_added); ASSERT3U(data->zcd_txg, !=, 0); (void) mutex_enter(&zcl.zcl_callbacks_lock); /* See if this cb was called more quickly */ if ((synced_txg - data->zcd_txg) < zc_min_txg_delay) zc_min_txg_delay = synced_txg - data->zcd_txg; /* Remove our callback from the list */ list_remove(&zcl.zcl_callbacks, data); (void) mutex_exit(&zcl.zcl_callbacks_lock); umem_free(data, sizeof (ztest_cb_data_t)); } /* Allocate and initialize callback data structure */ static ztest_cb_data_t * ztest_create_cb_data(objset_t *os, uint64_t txg) { ztest_cb_data_t *cb_data; cb_data = umem_zalloc(sizeof (ztest_cb_data_t), UMEM_NOFAIL); cb_data->zcd_txg = txg; cb_data->zcd_spa = dmu_objset_spa(os); list_link_init(&cb_data->zcd_node); return (cb_data); } /* * Commit callback test. */ void ztest_dmu_commit_callbacks(ztest_ds_t *zd, uint64_t id) { objset_t *os = zd->zd_os; ztest_od_t *od; dmu_tx_t *tx; ztest_cb_data_t *cb_data[3], *tmp_cb; uint64_t old_txg, txg; int i, error = 0; od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0); if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) { umem_free(od, sizeof (ztest_od_t)); return; } tx = dmu_tx_create(os); cb_data[0] = ztest_create_cb_data(os, 0); dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[0]); dmu_tx_hold_write(tx, od->od_object, 0, sizeof (uint64_t)); /* Every once in a while, abort the transaction on purpose */ if (ztest_random(100) == 0) error = -1; if (!error) error = dmu_tx_assign(tx, TXG_NOWAIT); txg = error ? 0 : dmu_tx_get_txg(tx); cb_data[0]->zcd_txg = txg; cb_data[1] = ztest_create_cb_data(os, txg); dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[1]); if (error) { /* * It's not a strict requirement to call the registered * callbacks from inside dmu_tx_abort(), but that's what * it's supposed to happen in the current implementation * so we will check for that. */ for (i = 0; i < 2; i++) { cb_data[i]->zcd_expected_err = ECANCELED; VERIFY(!cb_data[i]->zcd_called); } dmu_tx_abort(tx); for (i = 0; i < 2; i++) { VERIFY(cb_data[i]->zcd_called); umem_free(cb_data[i], sizeof (ztest_cb_data_t)); } umem_free(od, sizeof (ztest_od_t)); return; } cb_data[2] = ztest_create_cb_data(os, txg); dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[2]); /* * Read existing data to make sure there isn't a future leak. 
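 *
 * (The object stores the txg of the last write made to it, so if the
 * value read back here is greater than the txg we just opened, a write
 * from a "future" txg has leaked back into the past -- hence the
 * fatal() just below.)
 *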
*/ VERIFY0(dmu_read(os, od->od_object, 0, sizeof (uint64_t), &old_txg, DMU_READ_PREFETCH)); if (old_txg > txg) fatal(0, "future leak: got %" PRIu64 ", open txg is %" PRIu64, old_txg, txg); dmu_write(os, od->od_object, 0, sizeof (uint64_t), &txg, tx); (void) mutex_enter(&zcl.zcl_callbacks_lock); /* * Since commit callbacks don't have any ordering requirement and since * it is theoretically possible for a commit callback to be called * after an arbitrary amount of time has elapsed since its txg has been * synced, it is difficult to reliably determine whether a commit * callback hasn't been called due to high load or due to a flawed * implementation. * * In practice, we will assume that if after a certain number of txgs a * commit callback hasn't been called, then most likely there's an * implementation bug.. */ tmp_cb = list_head(&zcl.zcl_callbacks); if (tmp_cb != NULL && tmp_cb->zcd_txg + ZTEST_COMMIT_CB_THRESH < txg) { fatal(0, "Commit callback threshold exceeded, oldest txg: %" PRIu64 ", open txg: %" PRIu64 "\n", tmp_cb->zcd_txg, txg); } /* * Let's find the place to insert our callbacks. * * Even though the list is ordered by txg, it is possible for the * insertion point to not be the end because our txg may already be * quiescing at this point and other callbacks in the open txg * (from other objsets) may have sneaked in. */ tmp_cb = list_tail(&zcl.zcl_callbacks); while (tmp_cb != NULL && tmp_cb->zcd_txg > txg) tmp_cb = list_prev(&zcl.zcl_callbacks, tmp_cb); /* Add the 3 callbacks to the list */ for (i = 0; i < 3; i++) { if (tmp_cb == NULL) list_insert_head(&zcl.zcl_callbacks, cb_data[i]); else list_insert_after(&zcl.zcl_callbacks, tmp_cb, cb_data[i]); cb_data[i]->zcd_added = B_TRUE; VERIFY(!cb_data[i]->zcd_called); tmp_cb = cb_data[i]; } zc_cb_counter += 3; (void) mutex_exit(&zcl.zcl_callbacks_lock); dmu_tx_commit(tx); umem_free(od, sizeof (ztest_od_t)); } /* * Visit each object in the dataset. Verify that its properties * are consistent what was stored in the block tag when it was created, * and that its unused bonus buffer space has not been overwritten. 
*/ /* ARGSUSED */ void ztest_verify_dnode_bt(ztest_ds_t *zd, uint64_t id) { objset_t *os = zd->zd_os; uint64_t obj; int err = 0; for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) { ztest_block_tag_t *bt = NULL; dmu_object_info_t doi; dmu_buf_t *db; ztest_object_lock(zd, obj, RL_READER); if (dmu_bonus_hold(os, obj, FTAG, &db) != 0) { ztest_object_unlock(zd, obj); continue; } dmu_object_info_from_db(db, &doi); if (doi.doi_bonus_size >= sizeof (*bt)) bt = ztest_bt_bonus(db); if (bt && bt->bt_magic == BT_MAGIC) { ztest_bt_verify(bt, os, obj, doi.doi_dnodesize, bt->bt_offset, bt->bt_gen, bt->bt_txg, bt->bt_crtxg); ztest_verify_unused_bonus(db, bt, obj, os, bt->bt_gen); } dmu_buf_rele(db, FTAG); ztest_object_unlock(zd, obj); } } /* ARGSUSED */ void ztest_dsl_prop_get_set(ztest_ds_t *zd, uint64_t id) { zfs_prop_t proplist[] = { ZFS_PROP_CHECKSUM, ZFS_PROP_COMPRESSION, ZFS_PROP_COPIES, ZFS_PROP_DEDUP }; int p; (void) pthread_rwlock_rdlock(&ztest_name_lock); for (p = 0; p < sizeof (proplist) / sizeof (proplist[0]); p++) (void) ztest_dsl_prop_set_uint64(zd->zd_name, proplist[p], ztest_random_dsl_prop(proplist[p]), (int)ztest_random(2)); VERIFY0(ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_RECORDSIZE, ztest_random_blocksize(), (int)ztest_random(2))); (void) pthread_rwlock_unlock(&ztest_name_lock); } /* ARGSUSED */ void ztest_spa_prop_get_set(ztest_ds_t *zd, uint64_t id) { nvlist_t *props = NULL; (void) pthread_rwlock_rdlock(&ztest_name_lock); (void) ztest_spa_prop_set_uint64(ZPOOL_PROP_AUTOTRIM, ztest_random(2)); VERIFY0(spa_prop_get(ztest_spa, &props)); if (ztest_opts.zo_verbose >= 6) dump_nvlist(props, 4); fnvlist_free(props); (void) pthread_rwlock_unlock(&ztest_name_lock); } static int user_release_one(const char *snapname, const char *holdname) { nvlist_t *snaps, *holds; int error; snaps = fnvlist_alloc(); holds = fnvlist_alloc(); fnvlist_add_boolean(holds, holdname); fnvlist_add_nvlist(snaps, snapname, holds); fnvlist_free(holds); error = dsl_dataset_user_release(snaps, NULL); fnvlist_free(snaps); return (error); } /* * Test snapshot hold/release and deferred destroy. */ void ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id) { int error; objset_t *os = zd->zd_os; objset_t *origin; char snapname[100]; char fullname[100]; char clonename[100]; char tag[100]; char osname[ZFS_MAX_DATASET_NAME_LEN]; nvlist_t *holds; (void) pthread_rwlock_rdlock(&ztest_name_lock); dmu_objset_name(os, osname); (void) snprintf(snapname, sizeof (snapname), "sh1_%llu", (u_longlong_t)id); (void) snprintf(fullname, sizeof (fullname), "%s@%s", osname, snapname); (void) snprintf(clonename, sizeof (clonename), "%s/ch1_%llu", osname, (u_longlong_t)id); (void) snprintf(tag, sizeof (tag), "tag_%llu", (u_longlong_t)id); /* * Clean up from any previous run. */ error = dsl_destroy_head(clonename); if (error != ENOENT) ASSERT0(error); error = user_release_one(fullname, tag); if (error != ESRCH && error != ENOENT) ASSERT0(error); error = dsl_destroy_snapshot(fullname, B_FALSE); if (error != ENOENT) ASSERT0(error); /* * Create snapshot, clone it, mark snap for deferred destroy, * destroy clone, verify snap was also destroyed. 
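 *
 * In zfs(8) terms the first half of this test is roughly (an
 * illustrative sketch, not what the code literally shells out to):
 *
 *      zfs snapshot   <osname>@sh1_<id>
 *      zfs clone      <osname>@sh1_<id> <osname>/ch1_<id>
 *      zfs destroy -d <osname>@sh1_<id>    (deferred: clone still exists)
 *      zfs destroy    <osname>/ch1_<id>    (last user gone, snap goes too)
 *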
*/ error = dmu_objset_snapshot_one(osname, snapname); if (error) { if (error == ENOSPC) { ztest_record_enospc("dmu_objset_snapshot"); goto out; } fatal(0, "dmu_objset_snapshot(%s) = %d", fullname, error); } error = dmu_objset_clone(clonename, fullname); if (error) { if (error == ENOSPC) { ztest_record_enospc("dmu_objset_clone"); goto out; } fatal(0, "dmu_objset_clone(%s) = %d", clonename, error); } error = dsl_destroy_snapshot(fullname, B_TRUE); if (error) { fatal(0, "dsl_destroy_snapshot(%s, B_TRUE) = %d", fullname, error); } error = dsl_destroy_head(clonename); if (error) fatal(0, "dsl_destroy_head(%s) = %d", clonename, error); error = dmu_objset_hold(fullname, FTAG, &origin); if (error != ENOENT) fatal(0, "dmu_objset_hold(%s) = %d", fullname, error); /* * Create snapshot, add temporary hold, verify that we can't * destroy a held snapshot, mark for deferred destroy, * release hold, verify snapshot was destroyed. */ error = dmu_objset_snapshot_one(osname, snapname); if (error) { if (error == ENOSPC) { ztest_record_enospc("dmu_objset_snapshot"); goto out; } fatal(0, "dmu_objset_snapshot(%s) = %d", fullname, error); } holds = fnvlist_alloc(); fnvlist_add_string(holds, fullname, tag); error = dsl_dataset_user_hold(holds, 0, NULL); fnvlist_free(holds); if (error == ENOSPC) { ztest_record_enospc("dsl_dataset_user_hold"); goto out; } else if (error) { fatal(0, "dsl_dataset_user_hold(%s, %s) = %u", fullname, tag, error); } error = dsl_destroy_snapshot(fullname, B_FALSE); if (error != EBUSY) { fatal(0, "dsl_destroy_snapshot(%s, B_FALSE) = %d", fullname, error); } error = dsl_destroy_snapshot(fullname, B_TRUE); if (error) { fatal(0, "dsl_destroy_snapshot(%s, B_TRUE) = %d", fullname, error); } error = user_release_one(fullname, tag); if (error) fatal(0, "user_release_one(%s, %s) = %d", fullname, tag, error); VERIFY3U(dmu_objset_hold(fullname, FTAG, &origin), ==, ENOENT); out: (void) pthread_rwlock_unlock(&ztest_name_lock); } /* * Inject random faults into the on-disk data. */ /* ARGSUSED */ void ztest_fault_inject(ztest_ds_t *zd, uint64_t id) { ztest_shared_t *zs = ztest_shared; spa_t *spa = ztest_spa; int fd; uint64_t offset; uint64_t leaves; uint64_t bad = 0x1990c0ffeedecadeull; uint64_t top, leaf; char *path0; char *pathrand; size_t fsize; int bshift = SPA_MAXBLOCKSHIFT + 2; int iters = 1000; int maxfaults; int mirror_save; vdev_t *vd0 = NULL; uint64_t guid0 = 0; boolean_t islog = B_FALSE; path0 = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); pathrand = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); mutex_enter(&ztest_vdev_lock); /* * Device removal is in progress, fault injection must be disabled * until it completes and the pool is scrubbed. The fault injection * strategy for damaging blocks does not take in to account evacuated * blocks which may have already been damaged. */ if (ztest_device_removal_active) { mutex_exit(&ztest_vdev_lock); goto out; } maxfaults = MAXFAULTS(zs); leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raid_children; mirror_save = zs->zs_mirrors; mutex_exit(&ztest_vdev_lock); ASSERT3U(leaves, >=, 1); /* * While ztest is running the number of leaves will not change. This * is critical for the fault injection logic as it determines where * errors can be safely injected such that they are always repairable. * * When restarting ztest a different number of leaves may be requested * which will shift the regions to be damaged. This is fine as long * as the pool has been scrubbed prior to using the new mapping. * Failure to do can result in non-repairable damage being injected. 
*/ if (ztest_pool_scrubbed == B_FALSE) goto out; /* * Grab the name lock as reader. There are some operations * which don't like to have their vdevs changed while * they are in progress (i.e. spa_change_guid). Those * operations will have grabbed the name lock as writer. */ (void) pthread_rwlock_rdlock(&ztest_name_lock); /* * We need SCL_STATE here because we're going to look at vd0->vdev_tsd. */ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); if (ztest_random(2) == 0) { /* * Inject errors on a normal data device or slog device. */ top = ztest_random_vdev_top(spa, B_TRUE); leaf = ztest_random(leaves) + zs->zs_splits; /* * Generate paths to the first leaf in this top-level vdev, * and to the random leaf we selected. We'll induce transient * write failures and random online/offline activity on leaf 0, * and we'll write random garbage to the randomly chosen leaf. */ (void) snprintf(path0, MAXPATHLEN, ztest_dev_template, ztest_opts.zo_dir, ztest_opts.zo_pool, top * leaves + zs->zs_splits); (void) snprintf(pathrand, MAXPATHLEN, ztest_dev_template, ztest_opts.zo_dir, ztest_opts.zo_pool, top * leaves + leaf); vd0 = vdev_lookup_by_path(spa->spa_root_vdev, path0); if (vd0 != NULL && vd0->vdev_top->vdev_islog) islog = B_TRUE; /* * If the top-level vdev needs to be resilvered * then we only allow faults on the device that is * resilvering. */ if (vd0 != NULL && maxfaults != 1 && (!vdev_resilver_needed(vd0->vdev_top, NULL, NULL) || vd0->vdev_resilver_txg != 0)) { /* * Make vd0 explicitly claim to be unreadable, * or unwriteable, or reach behind its back * and close the underlying fd. We can do this if * maxfaults == 0 because we'll fail and reexecute, * and we can do it if maxfaults >= 2 because we'll * have enough redundancy. If maxfaults == 1, the * combination of this with injection of random data * corruption below exceeds the pool's fault tolerance. */ vdev_file_t *vf = vd0->vdev_tsd; zfs_dbgmsg("injecting fault to vdev %llu; maxfaults=%d", (long long)vd0->vdev_id, (int)maxfaults); if (vf != NULL && ztest_random(3) == 0) { (void) close(vf->vf_file->f_fd); vf->vf_file->f_fd = -1; } else if (ztest_random(2) == 0) { vd0->vdev_cant_read = B_TRUE; } else { vd0->vdev_cant_write = B_TRUE; } guid0 = vd0->vdev_guid; } } else { /* * Inject errors on an l2cache device. */ spa_aux_vdev_t *sav = &spa->spa_l2cache; if (sav->sav_count == 0) { spa_config_exit(spa, SCL_STATE, FTAG); (void) pthread_rwlock_unlock(&ztest_name_lock); goto out; } vd0 = sav->sav_vdevs[ztest_random(sav->sav_count)]; guid0 = vd0->vdev_guid; (void) strcpy(path0, vd0->vdev_path); (void) strcpy(pathrand, vd0->vdev_path); leaf = 0; leaves = 1; maxfaults = INT_MAX; /* no limit on cache devices */ } spa_config_exit(spa, SCL_STATE, FTAG); (void) pthread_rwlock_unlock(&ztest_name_lock); /* * If we can tolerate two or more faults, or we're dealing * with a slog, randomly online/offline vd0. */ if ((maxfaults >= 2 || islog) && guid0 != 0) { if (ztest_random(10) < 6) { int flags = (ztest_random(2) == 0 ? ZFS_OFFLINE_TEMPORARY : 0); /* * We have to grab the zs_name_lock as writer to * prevent a race between offlining a slog and * destroying a dataset. Offlining the slog will * grab a reference on the dataset which may cause * dsl_destroy_head() to fail with EBUSY thus * leaving the dataset in an inconsistent state. 
*/ if (islog) (void) pthread_rwlock_wrlock(&ztest_name_lock); VERIFY3U(vdev_offline(spa, guid0, flags), !=, EBUSY); if (islog) (void) pthread_rwlock_unlock(&ztest_name_lock); } else { /* * Ideally we would like to be able to randomly * call vdev_[on|off]line without holding locks * to force unpredictable failures but the side * effects of vdev_[on|off]line prevent us from * doing so. We grab the ztest_vdev_lock here to * prevent a race between injection testing and * aux_vdev removal. */ mutex_enter(&ztest_vdev_lock); (void) vdev_online(spa, guid0, 0, NULL); mutex_exit(&ztest_vdev_lock); } } if (maxfaults == 0) goto out; /* * We have at least single-fault tolerance, so inject data corruption. */ fd = open(pathrand, O_RDWR); if (fd == -1) /* we hit a gap in the device namespace */ goto out; fsize = lseek(fd, 0, SEEK_END); while (--iters != 0) { /* * The offset must be chosen carefully to ensure that * we do not inject a given logical block with errors * on two different leaf devices, because ZFS can not * tolerate that (if maxfaults==1). * * To achieve this we divide each leaf device into * chunks of size (# leaves * SPA_MAXBLOCKSIZE * 4). * Each chunk is further divided into error-injection * ranges (can accept errors) and clear ranges (we do * not inject errors in those). Each error-injection * range can accept errors only for a single leaf vdev. * Error-injection ranges are separated by clear ranges. * * For example, with 3 leaves, each chunk looks like: * 0 to 32M: injection range for leaf 0 * 32M to 64M: clear range - no injection allowed * 64M to 96M: injection range for leaf 1 * 96M to 128M: clear range - no injection allowed * 128M to 160M: injection range for leaf 2 * 160M to 192M: clear range - no injection allowed * * Each clear range must be large enough such that a * single block cannot straddle it. This way a block * can't be a target in two different injection ranges * (on different leaf vdevs). */ offset = ztest_random(fsize / (leaves << bshift)) * (leaves << bshift) + (leaf << bshift) + (ztest_random(1ULL << (bshift - 1)) & -8ULL); /* * Only allow damage to the labels at one end of the vdev. * * If all labels are damaged, the device will be totally * inaccessible, which will result in loss of data, * because we also damage (parts of) the other side of * the mirror/raidz. * * Additionally, we will always have both an even and an * odd label, so that we can handle crashes in the * middle of vdev_config_sync(). */ if ((leaf & 1) == 0 && offset < VDEV_LABEL_START_SIZE) continue; /* * The two end labels are stored at the "end" of the disk, but * the end of the disk (vdev_psize) is aligned to * sizeof (vdev_label_t). */ uint64_t psize = P2ALIGN(fsize, sizeof (vdev_label_t)); if ((leaf & 1) == 1 && offset + sizeof (bad) > psize - VDEV_LABEL_END_SIZE) continue; mutex_enter(&ztest_vdev_lock); if (mirror_save != zs->zs_mirrors) { mutex_exit(&ztest_vdev_lock); (void) close(fd); goto out; } if (pwrite(fd, &bad, sizeof (bad), offset) != sizeof (bad)) fatal(1, "can't inject bad word at 0x%llx in %s", offset, pathrand); mutex_exit(&ztest_vdev_lock); if (ztest_opts.zo_verbose >= 7) (void) printf("injected bad word into %s," " offset 0x%llx\n", pathrand, (u_longlong_t)offset); } (void) close(fd); out: umem_free(path0, MAXPATHLEN); umem_free(pathrand, MAXPATHLEN); } /* * By design ztest will never inject uncorrectable damage in to the pool. * Issue a scrub, wait for it to complete, and verify there is never any * persistent damage. 
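 *
 * (Worked example for the offset computation in ztest_fault_inject()
 * above: bshift = SPA_MAXBLOCKSHIFT + 2 = 26, so each leaf owns a 64M
 * slot inside a chunk of (leaves << 26) bytes, and the random term
 * ztest_random(1ULL << 25) & -8ULL keeps the offset in the first 32M of
 * that slot, 8-byte aligned.  With 3 leaves the chunk is 192M and leaf 1
 * can only be hit at chunk_base + 64M + r with r < 32M, matching the
 * 64M-96M injection range described there.)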
* * Only after a full scrub has been completed is it safe to start injecting * data corruption. See the comment in zfs_fault_inject(). */ static int ztest_scrub_impl(spa_t *spa) { int error = spa_scan(spa, POOL_SCAN_SCRUB); if (error) return (error); while (dsl_scan_scrubbing(spa_get_dsl(spa))) txg_wait_synced(spa_get_dsl(spa), 0); if (spa_get_errlog_size(spa) > 0) return (ECKSUM); ztest_pool_scrubbed = B_TRUE; return (0); } /* * Scrub the pool. */ /* ARGSUSED */ void ztest_scrub(ztest_ds_t *zd, uint64_t id) { spa_t *spa = ztest_spa; int error; /* * Scrub in progress by device removal. */ if (ztest_device_removal_active) return; /* * Start a scrub, wait a moment, then force a restart. */ (void) spa_scan(spa, POOL_SCAN_SCRUB); (void) poll(NULL, 0, 100); error = ztest_scrub_impl(spa); if (error == EBUSY) error = 0; ASSERT0(error); } /* * Change the guid for the pool. */ /* ARGSUSED */ void ztest_reguid(ztest_ds_t *zd, uint64_t id) { spa_t *spa = ztest_spa; uint64_t orig, load; int error; if (ztest_opts.zo_mmp_test) return; orig = spa_guid(spa); load = spa_load_guid(spa); (void) pthread_rwlock_wrlock(&ztest_name_lock); error = spa_change_guid(spa); (void) pthread_rwlock_unlock(&ztest_name_lock); if (error != 0) return; if (ztest_opts.zo_verbose >= 4) { (void) printf("Changed guid old %llu -> %llu\n", (u_longlong_t)orig, (u_longlong_t)spa_guid(spa)); } VERIFY3U(orig, !=, spa_guid(spa)); VERIFY3U(load, ==, spa_load_guid(spa)); } void ztest_fletcher(ztest_ds_t *zd, uint64_t id) { hrtime_t end = gethrtime() + NANOSEC; while (gethrtime() <= end) { int run_count = 100; void *buf; struct abd *abd_data, *abd_meta; uint32_t size; int *ptr; int i; zio_cksum_t zc_ref; zio_cksum_t zc_ref_byteswap; size = ztest_random_blocksize(); buf = umem_alloc(size, UMEM_NOFAIL); abd_data = abd_alloc(size, B_FALSE); abd_meta = abd_alloc(size, B_TRUE); for (i = 0, ptr = buf; i < size / sizeof (*ptr); i++, ptr++) *ptr = ztest_random(UINT_MAX); abd_copy_from_buf_off(abd_data, buf, 0, size); abd_copy_from_buf_off(abd_meta, buf, 0, size); VERIFY0(fletcher_4_impl_set("scalar")); fletcher_4_native(buf, size, NULL, &zc_ref); fletcher_4_byteswap(buf, size, NULL, &zc_ref_byteswap); VERIFY0(fletcher_4_impl_set("cycle")); while (run_count-- > 0) { zio_cksum_t zc; zio_cksum_t zc_byteswap; fletcher_4_byteswap(buf, size, NULL, &zc_byteswap); fletcher_4_native(buf, size, NULL, &zc); VERIFY0(bcmp(&zc, &zc_ref, sizeof (zc))); VERIFY0(bcmp(&zc_byteswap, &zc_ref_byteswap, sizeof (zc_byteswap))); /* Test ABD - data */ abd_fletcher_4_byteswap(abd_data, size, NULL, &zc_byteswap); abd_fletcher_4_native(abd_data, size, NULL, &zc); VERIFY0(bcmp(&zc, &zc_ref, sizeof (zc))); VERIFY0(bcmp(&zc_byteswap, &zc_ref_byteswap, sizeof (zc_byteswap))); /* Test ABD - metadata */ abd_fletcher_4_byteswap(abd_meta, size, NULL, &zc_byteswap); abd_fletcher_4_native(abd_meta, size, NULL, &zc); VERIFY0(bcmp(&zc, &zc_ref, sizeof (zc))); VERIFY0(bcmp(&zc_byteswap, &zc_ref_byteswap, sizeof (zc_byteswap))); } umem_free(buf, size); abd_free(abd_data); abd_free(abd_meta); } } void ztest_fletcher_incr(ztest_ds_t *zd, uint64_t id) { void *buf; size_t size; int *ptr; int i; zio_cksum_t zc_ref; zio_cksum_t zc_ref_bswap; hrtime_t end = gethrtime() + NANOSEC; while (gethrtime() <= end) { int run_count = 100; size = ztest_random_blocksize(); buf = umem_alloc(size, UMEM_NOFAIL); for (i = 0, ptr = buf; i < size / sizeof (*ptr); i++, ptr++) *ptr = ztest_random(UINT_MAX); VERIFY0(fletcher_4_impl_set("scalar")); fletcher_4_native(buf, size, NULL, &zc_ref); fletcher_4_byteswap(buf, 
size, NULL, &zc_ref_bswap); VERIFY0(fletcher_4_impl_set("cycle")); while (run_count-- > 0) { zio_cksum_t zc; zio_cksum_t zc_bswap; size_t pos = 0; ZIO_SET_CHECKSUM(&zc, 0, 0, 0, 0); ZIO_SET_CHECKSUM(&zc_bswap, 0, 0, 0, 0); while (pos < size) { size_t inc = 64 * ztest_random(size / 67); /* sometimes add few bytes to test non-simd */ if (ztest_random(100) < 10) inc += P2ALIGN(ztest_random(64), sizeof (uint32_t)); if (inc > (size - pos)) inc = size - pos; fletcher_4_incremental_native(buf + pos, inc, &zc); fletcher_4_incremental_byteswap(buf + pos, inc, &zc_bswap); pos += inc; } VERIFY3U(pos, ==, size); VERIFY(ZIO_CHECKSUM_EQUAL(zc, zc_ref)); VERIFY(ZIO_CHECKSUM_EQUAL(zc_bswap, zc_ref_bswap)); /* * verify if incremental on the whole buffer is * equivalent to non-incremental version */ ZIO_SET_CHECKSUM(&zc, 0, 0, 0, 0); ZIO_SET_CHECKSUM(&zc_bswap, 0, 0, 0, 0); fletcher_4_incremental_native(buf, size, &zc); fletcher_4_incremental_byteswap(buf, size, &zc_bswap); VERIFY(ZIO_CHECKSUM_EQUAL(zc, zc_ref)); VERIFY(ZIO_CHECKSUM_EQUAL(zc_bswap, zc_ref_bswap)); } umem_free(buf, size); } } +static int +ztest_set_global_vars(void) +{ + for (size_t i = 0; i < ztest_opts.zo_gvars_count; i++) { + char *kv = ztest_opts.zo_gvars[i]; + VERIFY3U(strlen(kv), <=, ZO_GVARS_MAX_ARGLEN); + VERIFY3U(strlen(kv), >, 0); + int err = set_global_var(kv); + if (ztest_opts.zo_verbose > 0) { + (void) printf("setting global var %s ... %s\n", kv, + err ? "failed" : "ok"); + } + if (err != 0) { + (void) fprintf(stderr, + "failed to set global var '%s'\n", kv); + return (err); + } + } + return (0); +} + +static char ** +ztest_global_vars_to_zdb_args(void) +{ + char **args = calloc(2*ztest_opts.zo_gvars_count + 1, sizeof (char *)); + char **cur = args; + for (size_t i = 0; i < ztest_opts.zo_gvars_count; i++) { + char *kv = ztest_opts.zo_gvars[i]; + *cur = "-o"; + cur++; + *cur = strdup(kv); + cur++; + } + ASSERT3P(cur, ==, &args[2*ztest_opts.zo_gvars_count]); + *cur = NULL; + return (args); +} + +/* The end of strings is indicated by a NULL element */ +static char * +join_strings(char **strings, const char *sep) +{ + size_t totallen = 0; + for (char **sp = strings; *sp != NULL; sp++) { + totallen += strlen(*sp); + totallen += strlen(sep); + } + if (totallen > 0) { + ASSERT(totallen >= strlen(sep)); + totallen -= strlen(sep); + } + + size_t buflen = totallen + 1; + char *o = malloc(buflen); /* trailing 0 byte */ + o[0] = '\0'; + for (char **sp = strings; *sp != NULL; sp++) { + size_t would; + would = strlcat(o, *sp, buflen); + VERIFY3U(would, <, buflen); + if (*(sp+1) == NULL) { + break; + } + would = strlcat(o, sep, buflen); + VERIFY3U(would, <, buflen); + } + ASSERT3S(strlen(o), ==, totallen); + return (o); +} + static int ztest_check_path(char *path) { struct stat s; /* return true on success */ return (!stat(path, &s)); } static void ztest_get_zdb_bin(char *bin, int len) { char *zdb_path; /* * Try to use ZDB_PATH and in-tree zdb path. If not successful, just * let popen to search through PATH. 
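 *
 * (Illustrative sketch for the helpers added above, with hypothetical
 * tunables zfs_foo and zfs_bar:
 *
 *   zo_gvars = { "zfs_foo=1", "zfs_bar=2" }
 *   ztest_global_vars_to_zdb_args() -> { "-o", "zfs_foo=1",
 *                                        "-o", "zfs_bar=2", NULL }
 *   join_strings(args, " ")         -> "-o zfs_foo=1 -o zfs_bar=2"
 *
 * ztest_run_zdb() below splices the joined string into the zdb command
 * line.)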
*/ if ((zdb_path = getenv("ZDB_PATH"))) { strlcpy(bin, zdb_path, len); /* In env */ if (!ztest_check_path(bin)) { ztest_dump_core = 0; fatal(1, "invalid ZDB_PATH '%s'", bin); } return; } VERIFY3P(realpath(getexecname(), bin), !=, NULL); if (strstr(bin, "/ztest/")) { strstr(bin, "/ztest/")[0] = '\0'; /* In-tree */ strcat(bin, "/zdb/zdb"); if (ztest_check_path(bin)) return; } strcpy(bin, "zdb"); } static vdev_t * ztest_random_concrete_vdev_leaf(vdev_t *vd) { if (vd == NULL) return (NULL); if (vd->vdev_children == 0) return (vd); vdev_t *eligible[vd->vdev_children]; int eligible_idx = 0, i; for (i = 0; i < vd->vdev_children; i++) { vdev_t *cvd = vd->vdev_child[i]; if (cvd->vdev_top->vdev_removing) continue; if (cvd->vdev_children > 0 || (vdev_is_concrete(cvd) && !cvd->vdev_detached)) { eligible[eligible_idx++] = cvd; } } VERIFY3S(eligible_idx, >, 0); uint64_t child_no = ztest_random(eligible_idx); return (ztest_random_concrete_vdev_leaf(eligible[child_no])); } /* ARGSUSED */ void ztest_initialize(ztest_ds_t *zd, uint64_t id) { spa_t *spa = ztest_spa; int error = 0; mutex_enter(&ztest_vdev_lock); spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); /* Random leaf vdev */ vdev_t *rand_vd = ztest_random_concrete_vdev_leaf(spa->spa_root_vdev); if (rand_vd == NULL) { spa_config_exit(spa, SCL_VDEV, FTAG); mutex_exit(&ztest_vdev_lock); return; } /* * The random vdev we've selected may change as soon as we * drop the spa_config_lock. We create local copies of things * we're interested in. */ uint64_t guid = rand_vd->vdev_guid; char *path = strdup(rand_vd->vdev_path); boolean_t active = rand_vd->vdev_initialize_thread != NULL; zfs_dbgmsg("vd %px, guid %llu", rand_vd, guid); spa_config_exit(spa, SCL_VDEV, FTAG); uint64_t cmd = ztest_random(POOL_INITIALIZE_FUNCS); nvlist_t *vdev_guids = fnvlist_alloc(); nvlist_t *vdev_errlist = fnvlist_alloc(); fnvlist_add_uint64(vdev_guids, path, guid); error = spa_vdev_initialize(spa, vdev_guids, cmd, vdev_errlist); fnvlist_free(vdev_guids); fnvlist_free(vdev_errlist); switch (cmd) { case POOL_INITIALIZE_CANCEL: if (ztest_opts.zo_verbose >= 4) { (void) printf("Cancel initialize %s", path); if (!active) (void) printf(" failed (no initialize active)"); (void) printf("\n"); } break; case POOL_INITIALIZE_START: if (ztest_opts.zo_verbose >= 4) { (void) printf("Start initialize %s", path); if (active && error == 0) (void) printf(" failed (already active)"); else if (error != 0) (void) printf(" failed (error %d)", error); (void) printf("\n"); } break; case POOL_INITIALIZE_SUSPEND: if (ztest_opts.zo_verbose >= 4) { (void) printf("Suspend initialize %s", path); if (!active) (void) printf(" failed (no initialize active)"); (void) printf("\n"); } break; } free(path); mutex_exit(&ztest_vdev_lock); } /* ARGSUSED */ void ztest_trim(ztest_ds_t *zd, uint64_t id) { spa_t *spa = ztest_spa; int error = 0; mutex_enter(&ztest_vdev_lock); spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); /* Random leaf vdev */ vdev_t *rand_vd = ztest_random_concrete_vdev_leaf(spa->spa_root_vdev); if (rand_vd == NULL) { spa_config_exit(spa, SCL_VDEV, FTAG); mutex_exit(&ztest_vdev_lock); return; } /* * The random vdev we've selected may change as soon as we * drop the spa_config_lock. We create local copies of things * we're interested in. 
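 *
 * (Illustrative note: as in ztest_initialize() above, the request is an
 * nvlist mapping the vdev path to its guid --
 * fnvlist_add_uint64(vdev_guids, path, guid) -- and spa_vdev_initialize()
 * or spa_vdev_trim() reports per-vdev failures back through the separate
 * vdev_errlist nvlist.)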
*/ uint64_t guid = rand_vd->vdev_guid; char *path = strdup(rand_vd->vdev_path); boolean_t active = rand_vd->vdev_trim_thread != NULL; zfs_dbgmsg("vd %p, guid %llu", rand_vd, guid); spa_config_exit(spa, SCL_VDEV, FTAG); uint64_t cmd = ztest_random(POOL_TRIM_FUNCS); uint64_t rate = 1 << ztest_random(30); boolean_t partial = (ztest_random(5) > 0); boolean_t secure = (ztest_random(5) > 0); nvlist_t *vdev_guids = fnvlist_alloc(); nvlist_t *vdev_errlist = fnvlist_alloc(); fnvlist_add_uint64(vdev_guids, path, guid); error = spa_vdev_trim(spa, vdev_guids, cmd, rate, partial, secure, vdev_errlist); fnvlist_free(vdev_guids); fnvlist_free(vdev_errlist); switch (cmd) { case POOL_TRIM_CANCEL: if (ztest_opts.zo_verbose >= 4) { (void) printf("Cancel TRIM %s", path); if (!active) (void) printf(" failed (no TRIM active)"); (void) printf("\n"); } break; case POOL_TRIM_START: if (ztest_opts.zo_verbose >= 4) { (void) printf("Start TRIM %s", path); if (active && error == 0) (void) printf(" failed (already active)"); else if (error != 0) (void) printf(" failed (error %d)", error); (void) printf("\n"); } break; case POOL_TRIM_SUSPEND: if (ztest_opts.zo_verbose >= 4) { (void) printf("Suspend TRIM %s", path); if (!active) (void) printf(" failed (no TRIM active)"); (void) printf("\n"); } break; } free(path); mutex_exit(&ztest_vdev_lock); } /* * Verify pool integrity by running zdb. */ static void ztest_run_zdb(char *pool) { int status; char *bin; char *zdb; char *zbuf; const int len = MAXPATHLEN + MAXNAMELEN + 20; FILE *fp; bin = umem_alloc(len, UMEM_NOFAIL); zdb = umem_alloc(len, UMEM_NOFAIL); zbuf = umem_alloc(1024, UMEM_NOFAIL); ztest_get_zdb_bin(bin, len); - (void) sprintf(zdb, - "%s -bcc%s%s -G -d -Y -e -y -p %s %s", + char **set_gvars_args = ztest_global_vars_to_zdb_args(); + char *set_gvars_args_joined = join_strings(set_gvars_args, " "); + free(set_gvars_args); + + size_t would = snprintf(zdb, len, + "%s -bcc%s%s -G -d -Y -e -y %s -p %s %s", bin, ztest_opts.zo_verbose >= 3 ? "s" : "", ztest_opts.zo_verbose >= 4 ? "v" : "", + set_gvars_args_joined, ztest_opts.zo_dir, pool); + ASSERT3U(would, <, len); + + free(set_gvars_args_joined); if (ztest_opts.zo_verbose >= 5) (void) printf("Executing %s\n", strstr(zdb, "zdb ")); fp = popen(zdb, "r"); while (fgets(zbuf, 1024, fp) != NULL) if (ztest_opts.zo_verbose >= 3) (void) printf("%s", zbuf); status = pclose(fp); if (status == 0) goto out; ztest_dump_core = 0; if (WIFEXITED(status)) fatal(0, "'%s' exit code %d", zdb, WEXITSTATUS(status)); else fatal(0, "'%s' died with signal %d", zdb, WTERMSIG(status)); out: umem_free(bin, len); umem_free(zdb, len); umem_free(zbuf, 1024); } static void ztest_walk_pool_directory(char *header) { spa_t *spa = NULL; if (ztest_opts.zo_verbose >= 6) (void) printf("%s\n", header); mutex_enter(&spa_namespace_lock); while ((spa = spa_next(spa)) != NULL) if (ztest_opts.zo_verbose >= 6) (void) printf("\t%s\n", spa_name(spa)); mutex_exit(&spa_namespace_lock); } static void ztest_spa_import_export(char *oldname, char *newname) { nvlist_t *config, *newconfig; uint64_t pool_guid; spa_t *spa; int error; if (ztest_opts.zo_verbose >= 4) { (void) printf("import/export: old = %s, new = %s\n", oldname, newname); } /* * Clean up from previous runs. */ (void) spa_destroy(newname); /* * Get the pool's configuration and guid. */ VERIFY0(spa_open(oldname, &spa, FTAG)); /* * Kick off a scrub to tickle scrub/export races. 
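 *
 * (Illustrative note: with the hypothetical joined string
 * "-o zfs_foo=1 -o zfs_bar=2" from the earlier sketch and verbosity
 * below 3, the snprintf in ztest_run_zdb() above composes roughly
 *
 *   <zdb-bin> -bcc -G -d -Y -e -y -o zfs_foo=1 -o zfs_bar=2 -p <zo_dir> <pool>
 *
 * and the ASSERT3U(would, <, len) there guards against truncation.)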
*/ if (ztest_random(2) == 0) (void) spa_scan(spa, POOL_SCAN_SCRUB); pool_guid = spa_guid(spa); spa_close(spa, FTAG); ztest_walk_pool_directory("pools before export"); /* * Export it. */ VERIFY0(spa_export(oldname, &config, B_FALSE, B_FALSE)); ztest_walk_pool_directory("pools after export"); /* * Try to import it. */ newconfig = spa_tryimport(config); ASSERT3P(newconfig, !=, NULL); fnvlist_free(newconfig); /* * Import it under the new name. */ error = spa_import(newname, config, NULL, 0); if (error != 0) { dump_nvlist(config, 0); fatal(B_FALSE, "couldn't import pool %s as %s: error %u", oldname, newname, error); } ztest_walk_pool_directory("pools after import"); /* * Try to import it again -- should fail with EEXIST. */ VERIFY3U(EEXIST, ==, spa_import(newname, config, NULL, 0)); /* * Try to import it under a different name -- should fail with EEXIST. */ VERIFY3U(EEXIST, ==, spa_import(oldname, config, NULL, 0)); /* * Verify that the pool is no longer visible under the old name. */ VERIFY3U(ENOENT, ==, spa_open(oldname, &spa, FTAG)); /* * Verify that we can open and close the pool using the new name. */ VERIFY0(spa_open(newname, &spa, FTAG)); ASSERT3U(pool_guid, ==, spa_guid(spa)); spa_close(spa, FTAG); fnvlist_free(config); } static void ztest_resume(spa_t *spa) { if (spa_suspended(spa) && ztest_opts.zo_verbose >= 6) (void) printf("resuming from suspended state\n"); spa_vdev_state_enter(spa, SCL_NONE); vdev_clear(spa, NULL); (void) spa_vdev_state_exit(spa, NULL, 0); (void) zio_resume(spa); } static void ztest_resume_thread(void *arg) { spa_t *spa = arg; while (!ztest_exiting) { if (spa_suspended(spa)) ztest_resume(spa); (void) poll(NULL, 0, 100); /* * Periodically change the zfs_compressed_arc_enabled setting. */ if (ztest_random(10) == 0) zfs_compressed_arc_enabled = ztest_random(2); /* * Periodically change the zfs_abd_scatter_enabled setting. */ if (ztest_random(10) == 0) zfs_abd_scatter_enabled = ztest_random(2); } thread_exit(); } static void ztest_deadman_thread(void *arg) { ztest_shared_t *zs = arg; spa_t *spa = ztest_spa; hrtime_t delay, overdue, last_run = gethrtime(); delay = (zs->zs_thread_stop - zs->zs_thread_start) + MSEC2NSEC(zfs_deadman_synctime_ms); while (!ztest_exiting) { /* * Wait for the delay timer while checking occasionally * if we should stop. */ if (gethrtime() < last_run + delay) { (void) poll(NULL, 0, 1000); continue; } /* * If the pool is suspended then fail immediately. Otherwise, * check to see if the pool is making any progress. If * vdev_deadman() discovers that there hasn't been any recent * I/Os then it will end up aborting the tests. */ if (spa_suspended(spa) || spa->spa_root_vdev == NULL) { fatal(0, "aborting test after %llu seconds because " "pool has transitioned to a suspended state.", zfs_deadman_synctime_ms / 1000); } vdev_deadman(spa->spa_root_vdev, FTAG); /* * If the process doesn't complete within a grace period of * zfs_deadman_synctime_ms over the expected finish time, * then it may be hung and is terminated. 
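 *
 * (Illustrative arithmetic: main() below sets
 * zfs_deadman_synctime_ms = 300000, so the grace period is roughly five
 * minutes -- overdue = zs_proc_stop + 300 seconds -- before the deadman
 * aborts the run.)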
*/ overdue = zs->zs_proc_stop + MSEC2NSEC(zfs_deadman_synctime_ms); if (gethrtime() > overdue) { fatal(0, "aborting test after %llu seconds because " "the process is overdue for termination.", (gethrtime() - zs->zs_proc_start) / NANOSEC); } (void) printf("ztest has been running for %lld seconds\n", (gethrtime() - zs->zs_proc_start) / NANOSEC); last_run = gethrtime(); delay = MSEC2NSEC(zfs_deadman_checktime_ms); } thread_exit(); } static void ztest_execute(int test, ztest_info_t *zi, uint64_t id) { ztest_ds_t *zd = &ztest_ds[id % ztest_opts.zo_datasets]; ztest_shared_callstate_t *zc = ZTEST_GET_SHARED_CALLSTATE(test); hrtime_t functime = gethrtime(); int i; for (i = 0; i < zi->zi_iters; i++) zi->zi_func(zd, id); functime = gethrtime() - functime; atomic_add_64(&zc->zc_count, 1); atomic_add_64(&zc->zc_time, functime); if (ztest_opts.zo_verbose >= 4) (void) printf("%6.2f sec in %s\n", (double)functime / NANOSEC, zi->zi_funcname); } static void ztest_thread(void *arg) { int rand; uint64_t id = (uintptr_t)arg; ztest_shared_t *zs = ztest_shared; uint64_t call_next; hrtime_t now; ztest_info_t *zi; ztest_shared_callstate_t *zc; while ((now = gethrtime()) < zs->zs_thread_stop) { /* * See if it's time to force a crash. */ if (now > zs->zs_thread_kill) ztest_kill(zs); /* * If we're getting ENOSPC with some regularity, stop. */ if (zs->zs_enospc_count > 10) break; /* * Pick a random function to execute. */ rand = ztest_random(ZTEST_FUNCS); zi = &ztest_info[rand]; zc = ZTEST_GET_SHARED_CALLSTATE(rand); call_next = zc->zc_next; if (now >= call_next && atomic_cas_64(&zc->zc_next, call_next, call_next + ztest_random(2 * zi->zi_interval[0] + 1)) == call_next) { ztest_execute(rand, zi, id); } } thread_exit(); } static void ztest_dataset_name(char *dsname, char *pool, int d) { (void) snprintf(dsname, ZFS_MAX_DATASET_NAME_LEN, "%s/ds_%d", pool, d); } static void ztest_dataset_destroy(int d) { char name[ZFS_MAX_DATASET_NAME_LEN]; int t; ztest_dataset_name(name, ztest_opts.zo_pool, d); if (ztest_opts.zo_verbose >= 3) (void) printf("Destroying %s to free up space\n", name); /* * Cleanup any non-standard clones and snapshots. In general, * ztest thread t operates on dataset (t % zopt_datasets), * so there may be more than one thing to clean up. */ for (t = d; t < ztest_opts.zo_threads; t += ztest_opts.zo_datasets) ztest_dsl_dataset_cleanup(name, t); (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN); } static void ztest_dataset_dirobj_verify(ztest_ds_t *zd) { uint64_t usedobjs, dirobjs, scratch; /* * ZTEST_DIROBJ is the object directory for the entire dataset. * Therefore, the number of objects in use should equal the * number of ZTEST_DIROBJ entries, +1 for ZTEST_DIROBJ itself. * If not, we have an object leak. * * Note that we can only check this in ztest_dataset_open(), * when the open-context and syncing-context values agree. * That's because zap_count() returns the open-context value, * while dmu_objset_space() returns the rootbp fill count. 
*/ VERIFY0(zap_count(zd->zd_os, ZTEST_DIROBJ, &dirobjs)); dmu_objset_space(zd->zd_os, &scratch, &scratch, &usedobjs, &scratch); ASSERT3U(dirobjs + 1, ==, usedobjs); } static int ztest_dataset_open(int d) { ztest_ds_t *zd = &ztest_ds[d]; uint64_t committed_seq = ZTEST_GET_SHARED_DS(d)->zd_seq; objset_t *os; zilog_t *zilog; char name[ZFS_MAX_DATASET_NAME_LEN]; int error; ztest_dataset_name(name, ztest_opts.zo_pool, d); (void) pthread_rwlock_rdlock(&ztest_name_lock); error = ztest_dataset_create(name); if (error == ENOSPC) { (void) pthread_rwlock_unlock(&ztest_name_lock); ztest_record_enospc(FTAG); return (error); } ASSERT(error == 0 || error == EEXIST); VERIFY0(ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, B_TRUE, zd, &os)); (void) pthread_rwlock_unlock(&ztest_name_lock); ztest_zd_init(zd, ZTEST_GET_SHARED_DS(d), os); zilog = zd->zd_zilog; if (zilog->zl_header->zh_claim_lr_seq != 0 && zilog->zl_header->zh_claim_lr_seq < committed_seq) fatal(0, "missing log records: claimed %llu < committed %llu", zilog->zl_header->zh_claim_lr_seq, committed_seq); ztest_dataset_dirobj_verify(zd); zil_replay(os, zd, ztest_replay_vector); ztest_dataset_dirobj_verify(zd); if (ztest_opts.zo_verbose >= 6) (void) printf("%s replay %llu blocks, %llu records, seq %llu\n", zd->zd_name, (u_longlong_t)zilog->zl_parse_blk_count, (u_longlong_t)zilog->zl_parse_lr_count, (u_longlong_t)zilog->zl_replaying_seq); zilog = zil_open(os, ztest_get_data); if (zilog->zl_replaying_seq != 0 && zilog->zl_replaying_seq < committed_seq) fatal(0, "missing log records: replayed %llu < committed %llu", zilog->zl_replaying_seq, committed_seq); return (0); } static void ztest_dataset_close(int d) { ztest_ds_t *zd = &ztest_ds[d]; zil_close(zd->zd_zilog); dmu_objset_disown(zd->zd_os, B_TRUE, zd); ztest_zd_fini(zd); } /* ARGSUSED */ static int ztest_replay_zil_cb(const char *name, void *arg) { objset_t *os; ztest_ds_t *zdtmp; VERIFY0(ztest_dmu_objset_own(name, DMU_OST_ANY, B_TRUE, B_TRUE, FTAG, &os)); zdtmp = umem_alloc(sizeof (ztest_ds_t), UMEM_NOFAIL); ztest_zd_init(zdtmp, NULL, os); zil_replay(os, zdtmp, ztest_replay_vector); ztest_zd_fini(zdtmp); if (dmu_objset_zil(os)->zl_parse_lr_count != 0 && ztest_opts.zo_verbose >= 6) { zilog_t *zilog = dmu_objset_zil(os); (void) printf("%s replay %llu blocks, %llu records, seq %llu\n", name, (u_longlong_t)zilog->zl_parse_blk_count, (u_longlong_t)zilog->zl_parse_lr_count, (u_longlong_t)zilog->zl_replaying_seq); } umem_free(zdtmp, sizeof (ztest_ds_t)); dmu_objset_disown(os, B_TRUE, FTAG); return (0); } static void ztest_freeze(void) { ztest_ds_t *zd = &ztest_ds[0]; spa_t *spa; int numloops = 0; if (ztest_opts.zo_verbose >= 3) (void) printf("testing spa_freeze()...\n"); kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); VERIFY0(ztest_dataset_open(0)); ztest_spa = spa; /* * Force the first log block to be transactionally allocated. * We have to do this before we freeze the pool -- otherwise * the log chain won't be anchored. */ while (BP_IS_HOLE(&zd->zd_zilog->zl_header->zh_log)) { ztest_dmu_object_alloc_free(zd, 0); zil_commit(zd->zd_zilog, 0); } txg_wait_synced(spa_get_dsl(spa), 0); /* * Freeze the pool. This stops spa_sync() from doing anything, * so that the only way to record changes from now on is the ZIL. */ spa_freeze(spa); /* * Because it is hard to predict how much space a write will actually * require beforehand, we leave ourselves some fudge space to write over * capacity. 
*/ uint64_t capacity = metaslab_class_get_space(spa_normal_class(spa)) / 2; /* * Run tests that generate log records but don't alter the pool config * or depend on DSL sync tasks (snapshots, objset create/destroy, etc). * We do a txg_wait_synced() after each iteration to force the txg * to increase well beyond the last synced value in the uberblock. * The ZIL should be OK with that. * * Run a random number of times less than zo_maxloops and ensure we do * not run out of space on the pool. */ while (ztest_random(10) != 0 && numloops++ < ztest_opts.zo_maxloops && metaslab_class_get_alloc(spa_normal_class(spa)) < capacity) { ztest_od_t od; ztest_od_init(&od, 0, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0); VERIFY0(ztest_object_init(zd, &od, sizeof (od), B_FALSE)); ztest_io(zd, od.od_object, ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); txg_wait_synced(spa_get_dsl(spa), 0); } /* * Commit all of the changes we just generated. */ zil_commit(zd->zd_zilog, 0); txg_wait_synced(spa_get_dsl(spa), 0); /* * Close our dataset and close the pool. */ ztest_dataset_close(0); spa_close(spa, FTAG); kernel_fini(); /* * Open and close the pool and dataset to induce log replay. */ kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); ASSERT3U(spa_freeze_txg(spa), ==, UINT64_MAX); VERIFY0(ztest_dataset_open(0)); ztest_spa = spa; txg_wait_synced(spa_get_dsl(spa), 0); ztest_dataset_close(0); ztest_reguid(NULL, 0); spa_close(spa, FTAG); kernel_fini(); } static void ztest_import_impl(ztest_shared_t *zs) { importargs_t args = { 0 }; nvlist_t *cfg = NULL; int nsearch = 1; char *searchdirs[nsearch]; int flags = ZFS_IMPORT_MISSING_LOG; searchdirs[0] = ztest_opts.zo_dir; args.paths = nsearch; args.path = searchdirs; args.can_be_active = B_FALSE; VERIFY0(zpool_find_config(NULL, ztest_opts.zo_pool, &cfg, &args, &libzpool_config_ops)); VERIFY0(spa_import(ztest_opts.zo_pool, cfg, NULL, flags)); fnvlist_free(cfg); } /* * Import a storage pool with the given name. */ static void ztest_import(ztest_shared_t *zs) { spa_t *spa; mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL); VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL)); kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); ztest_import_impl(zs); VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); zs->zs_metaslab_sz = 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; spa_close(spa, FTAG); kernel_fini(); if (!ztest_opts.zo_mmp_test) { ztest_run_zdb(ztest_opts.zo_pool); ztest_freeze(); ztest_run_zdb(ztest_opts.zo_pool); } (void) pthread_rwlock_destroy(&ztest_name_lock); mutex_destroy(&ztest_vdev_lock); mutex_destroy(&ztest_checkpoint_lock); } /* * Kick off threads to run tests on all datasets in parallel. */ static void ztest_run(ztest_shared_t *zs) { spa_t *spa; objset_t *os; kthread_t *resume_thread, *deadman_thread; kthread_t **run_threads; uint64_t object; int error; int t, d; ztest_exiting = B_FALSE; /* * Initialize parent/child shared state. 
*/ mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL); VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL)); zs->zs_thread_start = gethrtime(); zs->zs_thread_stop = zs->zs_thread_start + ztest_opts.zo_passtime * NANOSEC; zs->zs_thread_stop = MIN(zs->zs_thread_stop, zs->zs_proc_stop); zs->zs_thread_kill = zs->zs_thread_stop; if (ztest_random(100) < ztest_opts.zo_killrate) { zs->zs_thread_kill -= ztest_random(ztest_opts.zo_passtime * NANOSEC); } mutex_init(&zcl.zcl_callbacks_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&zcl.zcl_callbacks, sizeof (ztest_cb_data_t), offsetof(ztest_cb_data_t, zcd_node)); /* * Open our pool. It may need to be imported first depending on * what tests were running when the previous pass was terminated. */ kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); error = spa_open(ztest_opts.zo_pool, &spa, FTAG); if (error) { VERIFY3S(error, ==, ENOENT); ztest_import_impl(zs); VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); zs->zs_metaslab_sz = 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; } metaslab_preload_limit = ztest_random(20) + 1; ztest_spa = spa; VERIFY0(vdev_raidz_impl_set("cycle")); dmu_objset_stats_t dds; VERIFY0(ztest_dmu_objset_own(ztest_opts.zo_pool, DMU_OST_ANY, B_TRUE, B_TRUE, FTAG, &os)); dsl_pool_config_enter(dmu_objset_pool(os), FTAG); dmu_objset_fast_stat(os, &dds); dsl_pool_config_exit(dmu_objset_pool(os), FTAG); zs->zs_guid = dds.dds_guid; dmu_objset_disown(os, B_TRUE, FTAG); /* * Create a thread to periodically resume suspended I/O. */ resume_thread = thread_create(NULL, 0, ztest_resume_thread, spa, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri); /* * Create a deadman thread and set to panic if we hang. */ deadman_thread = thread_create(NULL, 0, ztest_deadman_thread, zs, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri); spa->spa_deadman_failmode = ZIO_FAILURE_MODE_PANIC; /* * Verify that we can safely inquire about any object, * whether it's allocated or not. To make it interesting, * we probe a 5-wide window around each power of two. * This hits all edge cases, including zero and the max. */ for (t = 0; t < 64; t++) { for (d = -5; d <= 5; d++) { error = dmu_object_info(spa->spa_meta_objset, (1ULL << t) + d, NULL); ASSERT(error == 0 || error == ENOENT || error == EINVAL); } } /* * If we got any ENOSPC errors on the previous run, destroy something. */ if (zs->zs_enospc_count != 0) { int d = ztest_random(ztest_opts.zo_datasets); ztest_dataset_destroy(d); } zs->zs_enospc_count = 0; /* * If we were in the middle of ztest_device_removal() and were killed * we need to ensure the removal and scrub complete before running * any tests that check ztest_device_removal_active. The removal will * be restarted automatically when the spa is opened, but we need to * initiate the scrub manually if it is not already in progress. Note * that we always run the scrub whenever an indirect vdev exists * because we have no way of knowing for sure if ztest_device_removal() * fully completed its scrub before the pool was reimported. 
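 *
 * (Illustrative note: the dmu_object_info() probe loop above checks an
 * 11-object window around each power of two, e.g. for t = 10 it queries
 * object ids 1019 through 1029, and only 0, ENOENT, or EINVAL are
 * acceptable results.)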
*/ if (spa->spa_removing_phys.sr_state == DSS_SCANNING || spa->spa_removing_phys.sr_prev_indirect_vdev != -1) { while (spa->spa_removing_phys.sr_state == DSS_SCANNING) txg_wait_synced(spa_get_dsl(spa), 0); error = ztest_scrub_impl(spa); if (error == EBUSY) error = 0; ASSERT0(error); } run_threads = umem_zalloc(ztest_opts.zo_threads * sizeof (kthread_t *), UMEM_NOFAIL); if (ztest_opts.zo_verbose >= 4) (void) printf("starting main threads...\n"); /* * Replay all logs of all datasets in the pool. This is primarily for * temporary datasets which wouldn't otherwise get replayed, which * can trigger failures when attempting to offline a SLOG in * ztest_fault_inject(). */ (void) dmu_objset_find(ztest_opts.zo_pool, ztest_replay_zil_cb, NULL, DS_FIND_CHILDREN); /* * Kick off all the tests that run in parallel. */ for (t = 0; t < ztest_opts.zo_threads; t++) { if (t < ztest_opts.zo_datasets && ztest_dataset_open(t) != 0) { umem_free(run_threads, ztest_opts.zo_threads * sizeof (kthread_t *)); return; } run_threads[t] = thread_create(NULL, 0, ztest_thread, (void *)(uintptr_t)t, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri); } /* * Wait for all of the tests to complete. */ for (t = 0; t < ztest_opts.zo_threads; t++) VERIFY0(thread_join(run_threads[t])); /* * Close all datasets. This must be done after all the threads * are joined so we can be sure none of the datasets are in-use * by any of the threads. */ for (t = 0; t < ztest_opts.zo_threads; t++) { if (t < ztest_opts.zo_datasets) ztest_dataset_close(t); } txg_wait_synced(spa_get_dsl(spa), 0); zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa)); zs->zs_space = metaslab_class_get_space(spa_normal_class(spa)); umem_free(run_threads, ztest_opts.zo_threads * sizeof (kthread_t *)); /* Kill the resume and deadman threads */ ztest_exiting = B_TRUE; VERIFY0(thread_join(resume_thread)); VERIFY0(thread_join(deadman_thread)); ztest_resume(spa); /* * Right before closing the pool, kick off a bunch of async I/O; * spa_close() should wait for it to complete. */ for (object = 1; object < 50; object++) { dmu_prefetch(spa->spa_meta_objset, object, 0, 0, 1ULL << 20, ZIO_PRIORITY_SYNC_READ); } /* Verify that at least one commit cb was called in a timely fashion */ if (zc_cb_counter >= ZTEST_COMMIT_CB_MIN_REG) VERIFY0(zc_min_txg_delay); spa_close(spa, FTAG); /* * Verify that we can loop over all pools. */ mutex_enter(&spa_namespace_lock); for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa)) if (ztest_opts.zo_verbose > 3) (void) printf("spa_next: found %s\n", spa_name(spa)); mutex_exit(&spa_namespace_lock); /* * Verify that we can export the pool and reimport it under a * different name. 
*/ if ((ztest_random(2) == 0) && !ztest_opts.zo_mmp_test) { char name[ZFS_MAX_DATASET_NAME_LEN]; (void) snprintf(name, sizeof (name), "%s_import", ztest_opts.zo_pool); ztest_spa_import_export(ztest_opts.zo_pool, name); ztest_spa_import_export(name, ztest_opts.zo_pool); } kernel_fini(); list_destroy(&zcl.zcl_callbacks); mutex_destroy(&zcl.zcl_callbacks_lock); (void) pthread_rwlock_destroy(&ztest_name_lock); mutex_destroy(&ztest_vdev_lock); mutex_destroy(&ztest_checkpoint_lock); } static void print_time(hrtime_t t, char *timebuf) { hrtime_t s = t / NANOSEC; hrtime_t m = s / 60; hrtime_t h = m / 60; hrtime_t d = h / 24; s -= m * 60; m -= h * 60; h -= d * 24; timebuf[0] = '\0'; if (d) (void) sprintf(timebuf, "%llud%02lluh%02llum%02llus", d, h, m, s); else if (h) (void) sprintf(timebuf, "%lluh%02llum%02llus", h, m, s); else if (m) (void) sprintf(timebuf, "%llum%02llus", m, s); else (void) sprintf(timebuf, "%llus", s); } static nvlist_t * make_random_props(void) { nvlist_t *props; props = fnvlist_alloc(); if (ztest_random(2) == 0) return (props); fnvlist_add_uint64(props, zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE), 1); return (props); } /* * Create a storage pool with the given name and initial vdev size. * Then test spa_freeze() functionality. */ static void ztest_init(ztest_shared_t *zs) { spa_t *spa; nvlist_t *nvroot, *props; int i; mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL); VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL)); kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); /* * Create the storage pool. */ (void) spa_destroy(ztest_opts.zo_pool); ztest_shared->zs_vdev_next_leaf = 0; zs->zs_splits = 0; zs->zs_mirrors = ztest_opts.zo_mirrors; nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, NULL, ztest_opts.zo_raid_children, zs->zs_mirrors, 1); props = make_random_props(); /* * We don't expect the pool to suspend unless maxfaults == 0, * in which case ztest_fault_inject() temporarily takes away * the only valid replica. */ fnvlist_add_uint64(props, zpool_prop_to_name(ZPOOL_PROP_FAILUREMODE), MAXFAULTS(zs) ? ZIO_FAILURE_MODE_PANIC : ZIO_FAILURE_MODE_WAIT); for (i = 0; i < SPA_FEATURES; i++) { char *buf; /* * 75% chance of using the log space map feature. We want ztest * to exercise both the code paths that use the log space map * feature and the ones that don't. 
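 *
 * (Illustrative note: ztest_random(4) == 0 holds with probability 1/4,
 * so the "continue" below skips the log space map entry roughly 25% of
 * the time, which is where the 75% figure above comes from.)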
*/ if (i == SPA_FEATURE_LOG_SPACEMAP && ztest_random(4) == 0) continue; VERIFY3S(-1, !=, asprintf(&buf, "feature@%s", spa_feature_table[i].fi_uname)); fnvlist_add_uint64(props, buf, 0); free(buf); } VERIFY0(spa_create(ztest_opts.zo_pool, nvroot, props, NULL, NULL)); fnvlist_free(nvroot); fnvlist_free(props); VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); zs->zs_metaslab_sz = 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; spa_close(spa, FTAG); kernel_fini(); if (!ztest_opts.zo_mmp_test) { ztest_run_zdb(ztest_opts.zo_pool); ztest_freeze(); ztest_run_zdb(ztest_opts.zo_pool); } (void) pthread_rwlock_destroy(&ztest_name_lock); mutex_destroy(&ztest_vdev_lock); mutex_destroy(&ztest_checkpoint_lock); } static void setup_data_fd(void) { static char ztest_name_data[] = "/tmp/ztest.data.XXXXXX"; ztest_fd_data = mkstemp(ztest_name_data); ASSERT3S(ztest_fd_data, >=, 0); (void) unlink(ztest_name_data); } static int shared_data_size(ztest_shared_hdr_t *hdr) { int size; size = hdr->zh_hdr_size; size += hdr->zh_opts_size; size += hdr->zh_size; size += hdr->zh_stats_size * hdr->zh_stats_count; size += hdr->zh_ds_size * hdr->zh_ds_count; return (size); } static void setup_hdr(void) { int size; ztest_shared_hdr_t *hdr; hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()), PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0); ASSERT3P(hdr, !=, MAP_FAILED); VERIFY0(ftruncate(ztest_fd_data, sizeof (ztest_shared_hdr_t))); hdr->zh_hdr_size = sizeof (ztest_shared_hdr_t); hdr->zh_opts_size = sizeof (ztest_shared_opts_t); hdr->zh_size = sizeof (ztest_shared_t); hdr->zh_stats_size = sizeof (ztest_shared_callstate_t); hdr->zh_stats_count = ZTEST_FUNCS; hdr->zh_ds_size = sizeof (ztest_shared_ds_t); hdr->zh_ds_count = ztest_opts.zo_datasets; size = shared_data_size(hdr); VERIFY0(ftruncate(ztest_fd_data, size)); (void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize())); } static void setup_data(void) { int size, offset; ztest_shared_hdr_t *hdr; uint8_t *buf; hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()), PROT_READ, MAP_SHARED, ztest_fd_data, 0); ASSERT3P(hdr, !=, MAP_FAILED); size = shared_data_size(hdr); (void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize())); hdr = ztest_shared_hdr = (void *)mmap(0, P2ROUNDUP(size, getpagesize()), PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0); ASSERT3P(hdr, !=, MAP_FAILED); buf = (uint8_t *)hdr; offset = hdr->zh_hdr_size; ztest_shared_opts = (void *)&buf[offset]; offset += hdr->zh_opts_size; ztest_shared = (void *)&buf[offset]; offset += hdr->zh_size; ztest_shared_callstate = (void *)&buf[offset]; offset += hdr->zh_stats_size * hdr->zh_stats_count; ztest_shared_ds = (void *)&buf[offset]; } static boolean_t exec_child(char *cmd, char *libpath, boolean_t ignorekill, int *statusp) { pid_t pid; int status; char *cmdbuf = NULL; pid = fork(); if (cmd == NULL) { cmdbuf = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); (void) strlcpy(cmdbuf, getexecname(), MAXPATHLEN); cmd = cmdbuf; } if (pid == -1) fatal(1, "fork failed"); if (pid == 0) { /* child */ char *emptyargv[2] = { cmd, NULL }; char fd_data_str[12]; struct rlimit rl = { 1024, 1024 }; (void) setrlimit(RLIMIT_NOFILE, &rl); (void) close(ztest_fd_rand); VERIFY3S(11, >=, snprintf(fd_data_str, 12, "%d", ztest_fd_data)); VERIFY0(setenv("ZTEST_FD_DATA", fd_data_str, 1)); (void) enable_extended_FILE_stdio(-1, -1); if (libpath != NULL) VERIFY0(setenv("LD_LIBRARY_PATH", libpath, 1)); (void) execv(cmd, emptyargv); ztest_dump_core = B_FALSE; fatal(B_TRUE, "exec failed: %s", cmd); } if 
(cmdbuf != NULL) { umem_free(cmdbuf, MAXPATHLEN); cmd = NULL; } while (waitpid(pid, &status, 0) != pid) continue; if (statusp != NULL) *statusp = status; if (WIFEXITED(status)) { if (WEXITSTATUS(status) != 0) { (void) fprintf(stderr, "child exited with code %d\n", WEXITSTATUS(status)); exit(2); } return (B_FALSE); } else if (WIFSIGNALED(status)) { if (!ignorekill || WTERMSIG(status) != SIGKILL) { (void) fprintf(stderr, "child died with signal %d\n", WTERMSIG(status)); exit(3); } return (B_TRUE); } else { (void) fprintf(stderr, "something strange happened to child\n"); exit(4); /* NOTREACHED */ } } static void ztest_run_init(void) { int i; ztest_shared_t *zs = ztest_shared; /* * Blow away any existing copy of zpool.cache */ (void) remove(spa_config_path); if (ztest_opts.zo_init == 0) { if (ztest_opts.zo_verbose >= 1) (void) printf("Importing pool %s\n", ztest_opts.zo_pool); ztest_import(zs); return; } /* * Create and initialize our storage pool. */ for (i = 1; i <= ztest_opts.zo_init; i++) { bzero(zs, sizeof (ztest_shared_t)); if (ztest_opts.zo_verbose >= 3 && ztest_opts.zo_init != 1) { (void) printf("ztest_init(), pass %d\n", i); } ztest_init(zs); } } int main(int argc, char **argv) { int kills = 0; int iters = 0; int older = 0; int newer = 0; ztest_shared_t *zs; ztest_info_t *zi; ztest_shared_callstate_t *zc; char timebuf[100]; char numbuf[NN_NUMBUF_SZ]; char *cmd; boolean_t hasalt; - int f; + int f, err; char *fd_data_str = getenv("ZTEST_FD_DATA"); struct sigaction action; (void) setvbuf(stdout, NULL, _IOLBF, 0); dprintf_setup(&argc, argv); zfs_deadman_synctime_ms = 300000; zfs_deadman_checktime_ms = 30000; /* * As two-word space map entries may not come up often (especially * if pool and vdev sizes are small) we want to force at least some * of them so the feature get tested. */ zfs_force_some_double_word_sm_entries = B_TRUE; /* * Verify that even extensively damaged split blocks with many * segments can be reconstructed in a reasonable amount of time * when reconstruction is known to be possible. * * Note: the lower this value is, the more damage we inflict, and * the more time ztest spends in recovering that damage. We chose * to induce damage 1/100th of the time so recovery is tested but * not so frequently that ztest doesn't get to test other code paths. */ zfs_reconstruct_indirect_damage_fraction = 100; action.sa_handler = sig_handler; sigemptyset(&action.sa_mask); action.sa_flags = 0; if (sigaction(SIGSEGV, &action, NULL) < 0) { (void) fprintf(stderr, "ztest: cannot catch SIGSEGV: %s.\n", strerror(errno)); exit(EXIT_FAILURE); } if (sigaction(SIGABRT, &action, NULL) < 0) { (void) fprintf(stderr, "ztest: cannot catch SIGABRT: %s.\n", strerror(errno)); exit(EXIT_FAILURE); } /* * Force random_get_bytes() to use /dev/urandom in order to prevent * ztest from needlessly depleting the system entropy pool. 
*/ random_path = "/dev/urandom"; ztest_fd_rand = open(random_path, O_RDONLY); ASSERT3S(ztest_fd_rand, >=, 0); if (!fd_data_str) { process_options(argc, argv); setup_data_fd(); setup_hdr(); setup_data(); bcopy(&ztest_opts, ztest_shared_opts, sizeof (*ztest_shared_opts)); } else { ztest_fd_data = atoi(fd_data_str); setup_data(); bcopy(ztest_shared_opts, &ztest_opts, sizeof (ztest_opts)); } ASSERT3U(ztest_opts.zo_datasets, ==, ztest_shared_hdr->zh_ds_count); + err = ztest_set_global_vars(); + if (err != 0 && !fd_data_str) { + /* error message done by ztest_set_global_vars */ + exit(EXIT_FAILURE); + } else { + /* children should not be spawned if setting gvars fails */ + VERIFY3S(err, ==, 0); + } + /* Override location of zpool.cache */ VERIFY3S(asprintf((char **)&spa_config_path, "%s/zpool.cache", ztest_opts.zo_dir), !=, -1); ztest_ds = umem_alloc(ztest_opts.zo_datasets * sizeof (ztest_ds_t), UMEM_NOFAIL); zs = ztest_shared; if (fd_data_str) { metaslab_force_ganging = ztest_opts.zo_metaslab_force_ganging; metaslab_df_alloc_threshold = zs->zs_metaslab_df_alloc_threshold; if (zs->zs_do_init) ztest_run_init(); else ztest_run(zs); exit(0); } hasalt = (strlen(ztest_opts.zo_alt_ztest) != 0); if (ztest_opts.zo_verbose >= 1) { (void) printf("%llu vdevs, %d datasets, %d threads," "%d %s disks, %llu seconds...\n\n", (u_longlong_t)ztest_opts.zo_vdevs, ztest_opts.zo_datasets, ztest_opts.zo_threads, ztest_opts.zo_raid_children, ztest_opts.zo_raid_type, (u_longlong_t)ztest_opts.zo_time); } cmd = umem_alloc(MAXNAMELEN, UMEM_NOFAIL); (void) strlcpy(cmd, getexecname(), MAXNAMELEN); zs->zs_do_init = B_TRUE; if (strlen(ztest_opts.zo_alt_ztest) != 0) { if (ztest_opts.zo_verbose >= 1) { (void) printf("Executing older ztest for " "initialization: %s\n", ztest_opts.zo_alt_ztest); } VERIFY(!exec_child(ztest_opts.zo_alt_ztest, ztest_opts.zo_alt_libpath, B_FALSE, NULL)); } else { VERIFY(!exec_child(NULL, NULL, B_FALSE, NULL)); } zs->zs_do_init = B_FALSE; zs->zs_proc_start = gethrtime(); zs->zs_proc_stop = zs->zs_proc_start + ztest_opts.zo_time * NANOSEC; for (f = 0; f < ZTEST_FUNCS; f++) { zi = &ztest_info[f]; zc = ZTEST_GET_SHARED_CALLSTATE(f); if (zs->zs_proc_start + zi->zi_interval[0] > zs->zs_proc_stop) zc->zc_next = UINT64_MAX; else zc->zc_next = zs->zs_proc_start + ztest_random(2 * zi->zi_interval[0] + 1); } /* * Run the tests in a loop. These tests include fault injection * to verify that self-healing data works, and forced crashes * to verify that we never lose on-disk consistency. */ while (gethrtime() < zs->zs_proc_stop) { int status; boolean_t killed; /* * Initialize the workload counters for each function. 
*/ for (f = 0; f < ZTEST_FUNCS; f++) { zc = ZTEST_GET_SHARED_CALLSTATE(f); zc->zc_count = 0; zc->zc_time = 0; } /* Set the allocation switch size */ zs->zs_metaslab_df_alloc_threshold = ztest_random(zs->zs_metaslab_sz / 4) + 1; if (!hasalt || ztest_random(2) == 0) { if (hasalt && ztest_opts.zo_verbose >= 1) { (void) printf("Executing newer ztest: %s\n", cmd); } newer++; killed = exec_child(cmd, NULL, B_TRUE, &status); } else { if (hasalt && ztest_opts.zo_verbose >= 1) { (void) printf("Executing older ztest: %s\n", ztest_opts.zo_alt_ztest); } older++; killed = exec_child(ztest_opts.zo_alt_ztest, ztest_opts.zo_alt_libpath, B_TRUE, &status); } if (killed) kills++; iters++; if (ztest_opts.zo_verbose >= 1) { hrtime_t now = gethrtime(); now = MIN(now, zs->zs_proc_stop); print_time(zs->zs_proc_stop - now, timebuf); nicenum(zs->zs_space, numbuf, sizeof (numbuf)); (void) printf("Pass %3d, %8s, %3llu ENOSPC, " "%4.1f%% of %5s used, %3.0f%% done, %8s to go\n", iters, WIFEXITED(status) ? "Complete" : "SIGKILL", (u_longlong_t)zs->zs_enospc_count, 100.0 * zs->zs_alloc / zs->zs_space, numbuf, 100.0 * (now - zs->zs_proc_start) / (ztest_opts.zo_time * NANOSEC), timebuf); } if (ztest_opts.zo_verbose >= 2) { (void) printf("\nWorkload summary:\n\n"); (void) printf("%7s %9s %s\n", "Calls", "Time", "Function"); (void) printf("%7s %9s %s\n", "-----", "----", "--------"); for (f = 0; f < ZTEST_FUNCS; f++) { zi = &ztest_info[f]; zc = ZTEST_GET_SHARED_CALLSTATE(f); print_time(zc->zc_time, timebuf); (void) printf("%7llu %9s %s\n", (u_longlong_t)zc->zc_count, timebuf, zi->zi_funcname); } (void) printf("\n"); } if (!ztest_opts.zo_mmp_test) ztest_run_zdb(ztest_opts.zo_pool); } if (ztest_opts.zo_verbose >= 1) { if (hasalt) { (void) printf("%d runs of older ztest: %s\n", older, ztest_opts.zo_alt_ztest); (void) printf("%d runs of newer ztest: %s\n", newer, cmd); } (void) printf("%d killed, %d completed, %.0f%% kill rate\n", kills, iters - kills, (100.0 * kills) / MAX(1, iters)); } umem_free(cmd, MAXNAMELEN); return (0); } diff --git a/sys/contrib/openzfs/include/os/freebsd/spl/sys/uio.h b/sys/contrib/openzfs/include/os/freebsd/spl/sys/uio.h index f1d30195f048..b71f2f2e5625 100644 --- a/sys/contrib/openzfs/include/os/freebsd/spl/sys/uio.h +++ b/sys/contrib/openzfs/include/os/freebsd/spl/sys/uio.h @@ -1,112 +1,81 @@ /* * Copyright (c) 2010 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _OPENSOLARIS_SYS_UIO_H_ #define _OPENSOLARIS_SYS_UIO_H_ #ifndef _STANDALONE #include_next #include #include typedef struct iovec iovec_t; typedef enum uio_seg zfs_uio_seg_t; typedef enum uio_rw zfs_uio_rw_t; typedef struct zfs_uio { struct uio *uio; } zfs_uio_t; #define GET_UIO_STRUCT(u) (u)->uio #define zfs_uio_segflg(u) GET_UIO_STRUCT(u)->uio_segflg #define zfs_uio_offset(u) GET_UIO_STRUCT(u)->uio_offset #define zfs_uio_resid(u) GET_UIO_STRUCT(u)->uio_resid #define zfs_uio_iovcnt(u) GET_UIO_STRUCT(u)->uio_iovcnt #define zfs_uio_iovlen(u, idx) GET_UIO_STRUCT(u)->uio_iov[(idx)].iov_len #define zfs_uio_iovbase(u, idx) GET_UIO_STRUCT(u)->uio_iov[(idx)].iov_base #define zfs_uio_td(u) GET_UIO_STRUCT(u)->uio_td #define zfs_uio_rw(u) GET_UIO_STRUCT(u)->uio_rw #define zfs_uio_fault_disable(u, set) #define zfs_uio_prefaultpages(size, u) (0) - -static __inline void -zfs_uio_init(zfs_uio_t *uio, struct uio *uio_s) -{ - GET_UIO_STRUCT(uio) = uio_s; -} - -static __inline void +static inline void zfs_uio_setoffset(zfs_uio_t *uio, offset_t off) { zfs_uio_offset(uio) = off; } -static __inline int -zfs_uiomove(void *cp, size_t n, zfs_uio_rw_t dir, zfs_uio_t *uio) -{ - ASSERT(zfs_uio_rw(uio) == dir); - return (uiomove(cp, (int)n, GET_UIO_STRUCT(uio))); -} - -int zfs_uiocopy(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio, - size_t *cbytes); -void zfs_uioskip(zfs_uio_t *uiop, size_t n); -int zfs_uio_fault_move(void *p, size_t n, zfs_uio_rw_t dir, zfs_uio_t *uio); - -static inline void -zfs_uio_iov_at_index(zfs_uio_t *uio, uint_t idx, void **base, uint64_t *len) -{ - *base = zfs_uio_iovbase(uio, idx); - *len = zfs_uio_iovlen(uio, idx); -} - static inline void zfs_uio_advance(zfs_uio_t *uio, size_t size) { zfs_uio_resid(uio) -= size; zfs_uio_offset(uio) += size; } -static inline offset_t -zfs_uio_index_at_offset(zfs_uio_t *uio, offset_t off, uint_t *vec_idx) +static __inline void +zfs_uio_init(zfs_uio_t *uio, struct uio *uio_s) { - *vec_idx = 0; - while (*vec_idx < zfs_uio_iovcnt(uio) && - off >= zfs_uio_iovlen(uio, *vec_idx)) { - off -= zfs_uio_iovlen(uio, *vec_idx); - (*vec_idx)++; - } - - return (off); + GET_UIO_STRUCT(uio) = uio_s; } +int zfs_uio_fault_move(void *p, size_t n, zfs_uio_rw_t dir, zfs_uio_t *uio); + #endif /* !_STANDALONE */ #endif /* !_OPENSOLARIS_SYS_UIO_H_ */ diff --git a/sys/contrib/openzfs/include/os/linux/spl/sys/uio.h b/sys/contrib/openzfs/include/os/linux/spl/sys/uio.h index 0deed3c5736d..66af2b0b534c 100644 --- a/sys/contrib/openzfs/include/os/linux/spl/sys/uio.h +++ b/sys/contrib/openzfs/include/os/linux/spl/sys/uio.h @@ -1,163 +1,145 @@ /* * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. * Copyright (C) 2007 The Regents of the University of California. * Copyright (c) 2015 by Chunwei Chen. All rights reserved. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Brian Behlendorf . * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. 
* * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation; either version 2 of the License, or (at your * option) any later version. * * The SPL is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * for more details. * * You should have received a copy of the GNU General Public License along * with the SPL. If not, see . */ #ifndef _SPL_UIO_H #define _SPL_UIO_H #include #include #include #include #include #include #include #include typedef struct iovec iovec_t; typedef enum zfs_uio_rw { UIO_READ = 0, UIO_WRITE = 1, } zfs_uio_rw_t; typedef enum zfs_uio_seg { UIO_USERSPACE = 0, UIO_SYSSPACE = 1, UIO_BVEC = 2, #if defined(HAVE_VFS_IOV_ITER) UIO_ITER = 3, #endif } zfs_uio_seg_t; typedef struct zfs_uio { union { const struct iovec *uio_iov; const struct bio_vec *uio_bvec; #if defined(HAVE_VFS_IOV_ITER) struct iov_iter *uio_iter; #endif }; int uio_iovcnt; offset_t uio_loffset; zfs_uio_seg_t uio_segflg; boolean_t uio_fault_disable; uint16_t uio_fmode; uint16_t uio_extflg; ssize_t uio_resid; size_t uio_skip; } zfs_uio_t; #define zfs_uio_segflg(u) (u)->uio_segflg #define zfs_uio_offset(u) (u)->uio_loffset #define zfs_uio_resid(u) (u)->uio_resid #define zfs_uio_iovcnt(u) (u)->uio_iovcnt #define zfs_uio_iovlen(u, idx) (u)->uio_iov[(idx)].iov_len #define zfs_uio_iovbase(u, idx) (u)->uio_iov[(idx)].iov_base #define zfs_uio_fault_disable(u, set) (u)->uio_fault_disable = set #define zfs_uio_rlimit_fsize(z, u) (0) #define zfs_uio_fault_move(p, n, rw, u) zfs_uiomove((p), (n), (rw), (u)) +extern int zfs_uio_prefaultpages(ssize_t, zfs_uio_t *); + static inline void zfs_uio_setoffset(zfs_uio_t *uio, offset_t off) { uio->uio_loffset = off; } -static inline void -zfs_uio_iov_at_index(zfs_uio_t *uio, uint_t idx, void **base, uint64_t *len) -{ - *base = zfs_uio_iovbase(uio, idx); - *len = zfs_uio_iovlen(uio, idx); -} - static inline void zfs_uio_advance(zfs_uio_t *uio, size_t size) { uio->uio_resid -= size; uio->uio_loffset += size; } -static inline offset_t -zfs_uio_index_at_offset(zfs_uio_t *uio, offset_t off, uint_t *vec_idx) -{ - *vec_idx = 0; - while (*vec_idx < zfs_uio_iovcnt(uio) && - off >= zfs_uio_iovlen(uio, *vec_idx)) { - off -= zfs_uio_iovlen(uio, *vec_idx); - (*vec_idx)++; - } - - return (off); -} - static inline void zfs_uio_iovec_init(zfs_uio_t *uio, const struct iovec *iov, unsigned long nr_segs, offset_t offset, zfs_uio_seg_t seg, ssize_t resid, size_t skip) { ASSERT(seg == UIO_USERSPACE || seg == UIO_SYSSPACE); uio->uio_iov = iov; uio->uio_iovcnt = nr_segs; uio->uio_loffset = offset; uio->uio_segflg = seg; uio->uio_fault_disable = B_FALSE; uio->uio_fmode = 0; uio->uio_extflg = 0; uio->uio_resid = resid; uio->uio_skip = skip; } static inline void zfs_uio_bvec_init(zfs_uio_t *uio, struct bio *bio) { uio->uio_bvec = &bio->bi_io_vec[BIO_BI_IDX(bio)]; uio->uio_iovcnt = bio->bi_vcnt - BIO_BI_IDX(bio); uio->uio_loffset = BIO_BI_SECTOR(bio) << 9; uio->uio_segflg = UIO_BVEC; uio->uio_fault_disable = B_FALSE; uio->uio_fmode = 0; uio->uio_extflg = 0; uio->uio_resid = BIO_BI_SIZE(bio); uio->uio_skip = BIO_BI_SKIP(bio); } #if defined(HAVE_VFS_IOV_ITER) static inline void zfs_uio_iov_iter_init(zfs_uio_t *uio, struct iov_iter *iter, offset_t offset, ssize_t resid, size_t skip) { uio->uio_iter = iter; uio->uio_iovcnt = iter->nr_segs; uio->uio_loffset 
= offset; uio->uio_segflg = UIO_ITER; uio->uio_fault_disable = B_FALSE; uio->uio_fmode = 0; uio->uio_extflg = 0; uio->uio_resid = resid; uio->uio_skip = skip; } #endif #endif /* SPL_UIO_H */ diff --git a/sys/contrib/openzfs/include/os/linux/zfs/sys/zfs_context_os.h b/sys/contrib/openzfs/include/os/linux/zfs/sys/zfs_context_os.h index 9e5fdd79f019..de7015b929b6 100644 --- a/sys/contrib/openzfs/include/os/linux/zfs/sys/zfs_context_os.h +++ b/sys/contrib/openzfs/include/os/linux/zfs/sys/zfs_context_os.h @@ -1,30 +1,29 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License, Version 1.0 only * (the "License"). You may not use this file except in compliance * with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ #ifndef ZFS_CONTEXT_OS_H #define ZFS_CONTEXT_OS_H -#include #include #include #endif diff --git a/sys/contrib/openzfs/include/sys/spa.h b/sys/contrib/openzfs/include/sys/spa.h index 0762ae8a3e13..8391be8328b6 100644 --- a/sys/contrib/openzfs/include/sys/spa.h +++ b/sys/contrib/openzfs/include/sys/spa.h @@ -1,1225 +1,1226 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2020 by Delphix. All rights reserved. + * Copyright (c) 2011, 2021 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2017 Joyent, Inc. * Copyright (c) 2017, 2019, Datto Inc. All rights reserved. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019, Allan Jude * Copyright (c) 2019, Klara Inc. */ #ifndef _SYS_SPA_H #define _SYS_SPA_H #include #include #include #include #include #include #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif /* * Forward references that lots of things need. 
*/ typedef struct spa spa_t; typedef struct vdev vdev_t; typedef struct metaslab metaslab_t; typedef struct metaslab_group metaslab_group_t; typedef struct metaslab_class metaslab_class_t; typedef struct zio zio_t; typedef struct zilog zilog_t; typedef struct spa_aux_vdev spa_aux_vdev_t; typedef struct ddt ddt_t; typedef struct ddt_entry ddt_entry_t; typedef struct zbookmark_phys zbookmark_phys_t; struct bpobj; struct bplist; struct dsl_pool; struct dsl_dataset; struct dsl_crypto_params; /* * We currently support block sizes from 512 bytes to 16MB. * The benefits of larger blocks, and thus larger IO, need to be weighed * against the cost of COWing a giant block to modify one byte, and the * large latency of reading or writing a large block. * * Note that although blocks up to 16MB are supported, the recordsize * property can not be set larger than zfs_max_recordsize (default 1MB). * See the comment near zfs_max_recordsize in dsl_dataset.c for details. * * Note that although the LSIZE field of the blkptr_t can store sizes up * to 32MB, the dnode's dn_datablkszsec can only store sizes up to * 32MB - 512 bytes. Therefore, we limit SPA_MAXBLOCKSIZE to 16MB. */ #define SPA_MINBLOCKSHIFT 9 #define SPA_OLD_MAXBLOCKSHIFT 17 #define SPA_MAXBLOCKSHIFT 24 #define SPA_MINBLOCKSIZE (1ULL << SPA_MINBLOCKSHIFT) #define SPA_OLD_MAXBLOCKSIZE (1ULL << SPA_OLD_MAXBLOCKSHIFT) #define SPA_MAXBLOCKSIZE (1ULL << SPA_MAXBLOCKSHIFT) /* * Alignment Shift (ashift) is an immutable, internal top-level vdev property * which can only be set at vdev creation time. Physical writes are always done * according to it, which makes 2^ashift the smallest possible IO on a vdev. * * We currently allow values ranging from 512 bytes (2^9 = 512) to 64 KiB * (2^16 = 65,536). */ #define ASHIFT_MIN 9 #define ASHIFT_MAX 16 /* * Size of block to hold the configuration data (a packed nvlist) */ #define SPA_CONFIG_BLOCKSIZE (1ULL << 14) /* * The DVA size encodings for LSIZE and PSIZE support blocks up to 32MB. * The ASIZE encoding should be at least 64 times larger (6 more bits) * to support up to 4-way RAID-Z mirror mode with worst-case gang block * overhead, three DVAs per bp, plus one more bit in case we do anything * else that expands the ASIZE. */ #define SPA_LSIZEBITS 16 /* LSIZE up to 32M (2^16 * 512) */ #define SPA_PSIZEBITS 16 /* PSIZE up to 32M (2^16 * 512) */ #define SPA_ASIZEBITS 24 /* ASIZE up to 64 times larger */ #define SPA_COMPRESSBITS 7 #define SPA_VDEVBITS 24 #define SPA_COMPRESSMASK ((1U << SPA_COMPRESSBITS) - 1) /* * All SPA data is represented by 128-bit data virtual addresses (DVAs). * The members of the dva_t should be considered opaque outside the SPA. */ typedef struct dva { uint64_t dva_word[2]; } dva_t; /* * Some checksums/hashes need a 256-bit initialization salt. This salt is kept * secret and is suitable for use in MAC algorithms as the key. */ typedef struct zio_cksum_salt { uint8_t zcs_bytes[32]; } zio_cksum_salt_t; /* * Each block is described by its DVAs, time of birth, checksum, etc. 
* The word-by-word, bit-by-bit layout of the blkptr is as follows: * * 64 56 48 40 32 24 16 8 0 * +-------+-------+-------+-------+-------+-------+-------+-------+ * 0 | pad | vdev1 | GRID | ASIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 1 |G| offset1 | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 2 | pad | vdev2 | GRID | ASIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 3 |G| offset2 | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 4 | pad | vdev3 | GRID | ASIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 5 |G| offset3 | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 6 |BDX|lvl| type | cksum |E| comp| PSIZE | LSIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 7 | padding | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 8 | padding | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 9 | physical birth txg | * +-------+-------+-------+-------+-------+-------+-------+-------+ * a | logical birth txg | * +-------+-------+-------+-------+-------+-------+-------+-------+ * b | fill count | * +-------+-------+-------+-------+-------+-------+-------+-------+ * c | checksum[0] | * +-------+-------+-------+-------+-------+-------+-------+-------+ * d | checksum[1] | * +-------+-------+-------+-------+-------+-------+-------+-------+ * e | checksum[2] | * +-------+-------+-------+-------+-------+-------+-------+-------+ * f | checksum[3] | * +-------+-------+-------+-------+-------+-------+-------+-------+ * * Legend: * * vdev virtual device ID * offset offset into virtual device * LSIZE logical size * PSIZE physical size (after compression) * ASIZE allocated size (including RAID-Z parity and gang block headers) * GRID RAID-Z layout information (reserved for future use) * cksum checksum function * comp compression function * G gang block indicator * B byteorder (endianness) * D dedup * X encryption * E blkptr_t contains embedded data (see below) * lvl level of indirection * type DMU object type * phys birth txg when dva[0] was written; zero if same as logical birth txg * note that typically all the dva's would be written in this * txg, but they could be different if they were moved by * device removal. * log. birth transaction group in which the block was logically born * fill count number of non-zero blocks under this bp * checksum[4] 256-bit checksum of the data this bp describes */ /* * The blkptr_t's of encrypted blocks also need to store the encryption * parameters so that the block can be decrypted. 
This layout is as follows: * * 64 56 48 40 32 24 16 8 0 * +-------+-------+-------+-------+-------+-------+-------+-------+ * 0 | vdev1 | GRID | ASIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 1 |G| offset1 | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 2 | vdev2 | GRID | ASIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 3 |G| offset2 | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 4 | salt | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 5 | IV1 | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 6 |BDX|lvl| type | cksum |E| comp| PSIZE | LSIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 7 | padding | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 8 | padding | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 9 | physical birth txg | * +-------+-------+-------+-------+-------+-------+-------+-------+ * a | logical birth txg | * +-------+-------+-------+-------+-------+-------+-------+-------+ * b | IV2 | fill count | * +-------+-------+-------+-------+-------+-------+-------+-------+ * c | checksum[0] | * +-------+-------+-------+-------+-------+-------+-------+-------+ * d | checksum[1] | * +-------+-------+-------+-------+-------+-------+-------+-------+ * e | MAC[0] | * +-------+-------+-------+-------+-------+-------+-------+-------+ * f | MAC[1] | * +-------+-------+-------+-------+-------+-------+-------+-------+ * * Legend: * * salt Salt for generating encryption keys * IV1 First 64 bits of encryption IV * X Block requires encryption handling (set to 1) * E blkptr_t contains embedded data (set to 0, see below) * fill count number of non-zero blocks under this bp (truncated to 32 bits) * IV2 Last 32 bits of encryption IV * checksum[2] 128-bit checksum of the data this bp describes * MAC[2] 128-bit message authentication code for this data * * The X bit being set indicates that this block is one of 3 types. If this is * a level 0 block with an encrypted object type, the block is encrypted * (see BP_IS_ENCRYPTED()). If this is a level 0 block with an unencrypted * object type, this block is authenticated with an HMAC (see * BP_IS_AUTHENTICATED()). Otherwise (if level > 0), this bp will use the MAC * words to store a checksum-of-MACs from the level below (see * BP_HAS_INDIRECT_MAC_CKSUM()). For convenience in the code, BP_IS_PROTECTED() * refers to both encrypted and authenticated blocks and BP_USES_CRYPT() * refers to any of these 3 kinds of blocks. * * The additional encryption parameters are the salt, IV, and MAC which are * explained in greater detail in the block comment at the top of zio_crypt.c. * The MAC occupies half of the checksum space since it serves a very similar * purpose: to prevent data corruption on disk. The only functional difference * is that the checksum is used to detect on-disk corruption whether or not the * encryption key is loaded and the MAC provides additional protection against * malicious disk tampering. We use the 3rd DVA to store the salt and first * 64 bits of the IV. As a result encrypted blocks can only have 2 copies * maximum instead of the normal 3. The last 32 bits of the IV are stored in * the upper bits of what is usually the fill count. Note that only blocks at * level 0 or -2 are ever encrypted, which allows us to guarantee that these * 32 bits are not trampled over by other code (see zio_crypt.c for details). 
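 * As a concrete illustration of the layout above (using only fields and
 * macros defined later in this header): for an encrypted level 0 block,
 * the salt is blk_dva[2].dva_word[0], the first 64 bits of the IV are
 * blk_dva[2].dva_word[1], and the remaining 32 IV bits live in the upper
 * half of blk_fill, accessed with BP_GET_IV2()/BP_SET_IV2(), which is why
 * BP_GET_FILL() returns only the low 32 bits of blk_fill for encrypted
 * blocks.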
* The salt and IV are not used for authenticated bps or bps with an indirect * MAC checksum, so these blocks can utilize all 3 DVAs and the full 64 bits * for the fill count. */ /* * "Embedded" blkptr_t's don't actually point to a block, instead they * have a data payload embedded in the blkptr_t itself. See the comment * in blkptr.c for more details. * * The blkptr_t is laid out as follows: * * 64 56 48 40 32 24 16 8 0 * +-------+-------+-------+-------+-------+-------+-------+-------+ * 0 | payload | * 1 | payload | * 2 | payload | * 3 | payload | * 4 | payload | * 5 | payload | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 6 |BDX|lvl| type | etype |E| comp| PSIZE| LSIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 7 | payload | * 8 | payload | * 9 | payload | * +-------+-------+-------+-------+-------+-------+-------+-------+ * a | logical birth txg | * +-------+-------+-------+-------+-------+-------+-------+-------+ * b | payload | * c | payload | * d | payload | * e | payload | * f | payload | * +-------+-------+-------+-------+-------+-------+-------+-------+ * * Legend: * * payload contains the embedded data * B (byteorder) byteorder (endianness) * D (dedup) padding (set to zero) * X encryption (set to zero) * E (embedded) set to one * lvl indirection level * type DMU object type * etype how to interpret embedded data (BP_EMBEDDED_TYPE_*) * comp compression function of payload * PSIZE size of payload after compression, in bytes * LSIZE logical size of payload, in bytes * note that 25 bits is enough to store the largest * "normal" BP's LSIZE (2^16 * 2^9) in bytes * log. birth transaction group in which the block was logically born * * Note that LSIZE and PSIZE are stored in bytes, whereas for non-embedded * bp's they are stored in units of SPA_MINBLOCKSHIFT. * Generally, the generic BP_GET_*() macros can be used on embedded BP's. * The B, D, X, lvl, type, and comp fields are stored the same as with normal * BP's so the BP_SET_* macros can be used with them. etype, PSIZE, LSIZE must * be set with the BPE_SET_* macros. BP_SET_EMBEDDED() should be called before * other macros, as they assert that they are only used on BP's of the correct * "embedded-ness". Encrypted blkptr_t's cannot be embedded because they use * the payload space for encryption parameters (see the comment above on * how encryption parameters are stored). */ #define BPE_GET_ETYPE(bp) \ (ASSERT(BP_IS_EMBEDDED(bp)), \ BF64_GET((bp)->blk_prop, 40, 8)) #define BPE_SET_ETYPE(bp, t) do { \ ASSERT(BP_IS_EMBEDDED(bp)); \ BF64_SET((bp)->blk_prop, 40, 8, t); \ _NOTE(CONSTCOND) } while (0) #define BPE_GET_LSIZE(bp) \ (ASSERT(BP_IS_EMBEDDED(bp)), \ BF64_GET_SB((bp)->blk_prop, 0, 25, 0, 1)) #define BPE_SET_LSIZE(bp, x) do { \ ASSERT(BP_IS_EMBEDDED(bp)); \ BF64_SET_SB((bp)->blk_prop, 0, 25, 0, 1, x); \ _NOTE(CONSTCOND) } while (0) #define BPE_GET_PSIZE(bp) \ (ASSERT(BP_IS_EMBEDDED(bp)), \ BF64_GET_SB((bp)->blk_prop, 25, 7, 0, 1)) #define BPE_SET_PSIZE(bp, x) do { \ ASSERT(BP_IS_EMBEDDED(bp)); \ BF64_SET_SB((bp)->blk_prop, 25, 7, 0, 1, x); \ _NOTE(CONSTCOND) } while (0) typedef enum bp_embedded_type { BP_EMBEDDED_TYPE_DATA, BP_EMBEDDED_TYPE_RESERVED, /* Reserved for Delphix byteswap feature. 
*/ BP_EMBEDDED_TYPE_REDACTED, NUM_BP_EMBEDDED_TYPES } bp_embedded_type_t; #define BPE_NUM_WORDS 14 #define BPE_PAYLOAD_SIZE (BPE_NUM_WORDS * sizeof (uint64_t)) #define BPE_IS_PAYLOADWORD(bp, wp) \ ((wp) != &(bp)->blk_prop && (wp) != &(bp)->blk_birth) #define SPA_BLKPTRSHIFT 7 /* blkptr_t is 128 bytes */ #define SPA_DVAS_PER_BP 3 /* Number of DVAs in a bp */ #define SPA_SYNC_MIN_VDEVS 3 /* min vdevs to update during sync */ /* * A block is a hole when it has either 1) never been written to, or * 2) is zero-filled. In both cases, ZFS can return all zeroes for all reads * without physically allocating disk space. Holes are represented in the * blkptr_t structure by zeroed blk_dva. Correct checking for holes is * done through the BP_IS_HOLE macro. For holes, the logical size, level, * DMU object type, and birth times are all also stored for holes that * were written to at some point (i.e. were punched after having been filled). */ typedef struct blkptr { dva_t blk_dva[SPA_DVAS_PER_BP]; /* Data Virtual Addresses */ uint64_t blk_prop; /* size, compression, type, etc */ uint64_t blk_pad[2]; /* Extra space for the future */ uint64_t blk_phys_birth; /* txg when block was allocated */ uint64_t blk_birth; /* transaction group at birth */ uint64_t blk_fill; /* fill count */ zio_cksum_t blk_cksum; /* 256-bit checksum */ } blkptr_t; /* * Macros to get and set fields in a bp or DVA. */ #define DVA_GET_ASIZE(dva) \ BF64_GET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, SPA_MINBLOCKSHIFT, 0) #define DVA_SET_ASIZE(dva, x) \ BF64_SET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, \ SPA_MINBLOCKSHIFT, 0, x) #define DVA_GET_GRID(dva) BF64_GET((dva)->dva_word[0], 24, 8) #define DVA_SET_GRID(dva, x) BF64_SET((dva)->dva_word[0], 24, 8, x) #define DVA_GET_VDEV(dva) BF64_GET((dva)->dva_word[0], 32, SPA_VDEVBITS) #define DVA_SET_VDEV(dva, x) \ BF64_SET((dva)->dva_word[0], 32, SPA_VDEVBITS, x) #define DVA_GET_OFFSET(dva) \ BF64_GET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0) #define DVA_SET_OFFSET(dva, x) \ BF64_SET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0, x) #define DVA_GET_GANG(dva) BF64_GET((dva)->dva_word[1], 63, 1) #define DVA_SET_GANG(dva, x) BF64_SET((dva)->dva_word[1], 63, 1, x) #define BP_GET_LSIZE(bp) \ (BP_IS_EMBEDDED(bp) ? \ (BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA ? BPE_GET_LSIZE(bp) : 0): \ BF64_GET_SB((bp)->blk_prop, 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1)) #define BP_SET_LSIZE(bp, x) do { \ ASSERT(!BP_IS_EMBEDDED(bp)); \ BF64_SET_SB((bp)->blk_prop, \ 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \ _NOTE(CONSTCOND) } while (0) #define BP_GET_PSIZE(bp) \ (BP_IS_EMBEDDED(bp) ? 0 : \ BF64_GET_SB((bp)->blk_prop, 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1)) #define BP_SET_PSIZE(bp, x) do { \ ASSERT(!BP_IS_EMBEDDED(bp)); \ BF64_SET_SB((bp)->blk_prop, \ 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \ _NOTE(CONSTCOND) } while (0) #define BP_GET_COMPRESS(bp) \ BF64_GET((bp)->blk_prop, 32, SPA_COMPRESSBITS) #define BP_SET_COMPRESS(bp, x) \ BF64_SET((bp)->blk_prop, 32, SPA_COMPRESSBITS, x) #define BP_IS_EMBEDDED(bp) BF64_GET((bp)->blk_prop, 39, 1) #define BP_SET_EMBEDDED(bp, x) BF64_SET((bp)->blk_prop, 39, 1, x) #define BP_GET_CHECKSUM(bp) \ (BP_IS_EMBEDDED(bp) ? 
ZIO_CHECKSUM_OFF : \ BF64_GET((bp)->blk_prop, 40, 8)) #define BP_SET_CHECKSUM(bp, x) do { \ ASSERT(!BP_IS_EMBEDDED(bp)); \ BF64_SET((bp)->blk_prop, 40, 8, x); \ _NOTE(CONSTCOND) } while (0) #define BP_GET_TYPE(bp) BF64_GET((bp)->blk_prop, 48, 8) #define BP_SET_TYPE(bp, x) BF64_SET((bp)->blk_prop, 48, 8, x) #define BP_GET_LEVEL(bp) BF64_GET((bp)->blk_prop, 56, 5) #define BP_SET_LEVEL(bp, x) BF64_SET((bp)->blk_prop, 56, 5, x) /* encrypted, authenticated, and MAC cksum bps use the same bit */ #define BP_USES_CRYPT(bp) BF64_GET((bp)->blk_prop, 61, 1) #define BP_SET_CRYPT(bp, x) BF64_SET((bp)->blk_prop, 61, 1, x) #define BP_IS_ENCRYPTED(bp) \ (BP_USES_CRYPT(bp) && \ BP_GET_LEVEL(bp) <= 0 && \ DMU_OT_IS_ENCRYPTED(BP_GET_TYPE(bp))) #define BP_IS_AUTHENTICATED(bp) \ (BP_USES_CRYPT(bp) && \ BP_GET_LEVEL(bp) <= 0 && \ !DMU_OT_IS_ENCRYPTED(BP_GET_TYPE(bp))) #define BP_HAS_INDIRECT_MAC_CKSUM(bp) \ (BP_USES_CRYPT(bp) && BP_GET_LEVEL(bp) > 0) #define BP_IS_PROTECTED(bp) \ (BP_IS_ENCRYPTED(bp) || BP_IS_AUTHENTICATED(bp)) #define BP_GET_DEDUP(bp) BF64_GET((bp)->blk_prop, 62, 1) #define BP_SET_DEDUP(bp, x) BF64_SET((bp)->blk_prop, 62, 1, x) #define BP_GET_BYTEORDER(bp) BF64_GET((bp)->blk_prop, 63, 1) #define BP_SET_BYTEORDER(bp, x) BF64_SET((bp)->blk_prop, 63, 1, x) #define BP_GET_FREE(bp) BF64_GET((bp)->blk_fill, 0, 1) #define BP_SET_FREE(bp, x) BF64_SET((bp)->blk_fill, 0, 1, x) #define BP_PHYSICAL_BIRTH(bp) \ (BP_IS_EMBEDDED(bp) ? 0 : \ (bp)->blk_phys_birth ? (bp)->blk_phys_birth : (bp)->blk_birth) #define BP_SET_BIRTH(bp, logical, physical) \ { \ ASSERT(!BP_IS_EMBEDDED(bp)); \ (bp)->blk_birth = (logical); \ (bp)->blk_phys_birth = ((logical) == (physical) ? 0 : (physical)); \ } #define BP_GET_FILL(bp) \ ((BP_IS_ENCRYPTED(bp)) ? BF64_GET((bp)->blk_fill, 0, 32) : \ ((BP_IS_EMBEDDED(bp)) ? 1 : (bp)->blk_fill)) #define BP_SET_FILL(bp, fill) \ { \ if (BP_IS_ENCRYPTED(bp)) \ BF64_SET((bp)->blk_fill, 0, 32, fill); \ else \ (bp)->blk_fill = fill; \ } #define BP_GET_IV2(bp) \ (ASSERT(BP_IS_ENCRYPTED(bp)), \ BF64_GET((bp)->blk_fill, 32, 32)) #define BP_SET_IV2(bp, iv2) \ { \ ASSERT(BP_IS_ENCRYPTED(bp)); \ BF64_SET((bp)->blk_fill, 32, 32, iv2); \ } #define BP_IS_METADATA(bp) \ (BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp))) #define BP_GET_ASIZE(bp) \ (BP_IS_EMBEDDED(bp) ? 0 : \ DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \ DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \ (DVA_GET_ASIZE(&(bp)->blk_dva[2]) * !BP_IS_ENCRYPTED(bp))) #define BP_GET_UCSIZE(bp) \ (BP_IS_METADATA(bp) ? BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp)) #define BP_GET_NDVAS(bp) \ (BP_IS_EMBEDDED(bp) ? 0 : \ !!DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \ !!DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \ (!!DVA_GET_ASIZE(&(bp)->blk_dva[2]) * !BP_IS_ENCRYPTED(bp))) #define BP_COUNT_GANG(bp) \ (BP_IS_EMBEDDED(bp) ? 0 : \ (DVA_GET_GANG(&(bp)->blk_dva[0]) + \ DVA_GET_GANG(&(bp)->blk_dva[1]) + \ (DVA_GET_GANG(&(bp)->blk_dva[2]) * !BP_IS_ENCRYPTED(bp)))) #define DVA_EQUAL(dva1, dva2) \ ((dva1)->dva_word[1] == (dva2)->dva_word[1] && \ (dva1)->dva_word[0] == (dva2)->dva_word[0]) #define BP_EQUAL(bp1, bp2) \ (BP_PHYSICAL_BIRTH(bp1) == BP_PHYSICAL_BIRTH(bp2) && \ (bp1)->blk_birth == (bp2)->blk_birth && \ DVA_EQUAL(&(bp1)->blk_dva[0], &(bp2)->blk_dva[0]) && \ DVA_EQUAL(&(bp1)->blk_dva[1], &(bp2)->blk_dva[1]) && \ DVA_EQUAL(&(bp1)->blk_dva[2], &(bp2)->blk_dva[2])) #define DVA_IS_VALID(dva) (DVA_GET_ASIZE(dva) != 0) #define BP_IDENTITY(bp) (ASSERT(!BP_IS_EMBEDDED(bp)), &(bp)->blk_dva[0]) #define BP_IS_GANG(bp) \ (BP_IS_EMBEDDED(bp) ? 
B_FALSE : DVA_GET_GANG(BP_IDENTITY(bp))) #define DVA_IS_EMPTY(dva) ((dva)->dva_word[0] == 0ULL && \ (dva)->dva_word[1] == 0ULL) #define BP_IS_HOLE(bp) \ (!BP_IS_EMBEDDED(bp) && DVA_IS_EMPTY(BP_IDENTITY(bp))) #define BP_SET_REDACTED(bp) \ { \ BP_SET_EMBEDDED(bp, B_TRUE); \ BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_REDACTED); \ } #define BP_IS_REDACTED(bp) \ (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_REDACTED) /* BP_IS_RAIDZ(bp) assumes no block compression */ #define BP_IS_RAIDZ(bp) (DVA_GET_ASIZE(&(bp)->blk_dva[0]) > \ BP_GET_PSIZE(bp)) #define BP_ZERO(bp) \ { \ (bp)->blk_dva[0].dva_word[0] = 0; \ (bp)->blk_dva[0].dva_word[1] = 0; \ (bp)->blk_dva[1].dva_word[0] = 0; \ (bp)->blk_dva[1].dva_word[1] = 0; \ (bp)->blk_dva[2].dva_word[0] = 0; \ (bp)->blk_dva[2].dva_word[1] = 0; \ (bp)->blk_prop = 0; \ (bp)->blk_pad[0] = 0; \ (bp)->blk_pad[1] = 0; \ (bp)->blk_phys_birth = 0; \ (bp)->blk_birth = 0; \ (bp)->blk_fill = 0; \ ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0); \ } #ifdef _ZFS_BIG_ENDIAN #define ZFS_HOST_BYTEORDER (0ULL) #else #define ZFS_HOST_BYTEORDER (1ULL) #endif #define BP_SHOULD_BYTESWAP(bp) (BP_GET_BYTEORDER(bp) != ZFS_HOST_BYTEORDER) #define BP_SPRINTF_LEN 400 /* * This macro allows code sharing between zfs, libzpool, and mdb. * 'func' is either snprintf() or mdb_snprintf(). * 'ws' (whitespace) can be ' ' for single-line format, '\n' for multi-line. */ #define SNPRINTF_BLKPTR(func, ws, buf, size, bp, type, checksum, compress) \ { \ static const char *copyname[] = \ { "zero", "single", "double", "triple" }; \ int len = 0; \ int copies = 0; \ const char *crypt_type; \ if (bp != NULL) { \ if (BP_IS_ENCRYPTED(bp)) { \ crypt_type = "encrypted"; \ /* LINTED E_SUSPICIOUS_COMPARISON */ \ } else if (BP_IS_AUTHENTICATED(bp)) { \ crypt_type = "authenticated"; \ } else if (BP_HAS_INDIRECT_MAC_CKSUM(bp)) { \ crypt_type = "indirect-MAC"; \ } else { \ crypt_type = "unencrypted"; \ } \ } \ if (bp == NULL) { \ len += func(buf + len, size - len, ""); \ } else if (BP_IS_HOLE(bp)) { \ len += func(buf + len, size - len, \ "HOLE [L%llu %s] " \ "size=%llxL birth=%lluL", \ (u_longlong_t)BP_GET_LEVEL(bp), \ type, \ (u_longlong_t)BP_GET_LSIZE(bp), \ (u_longlong_t)bp->blk_birth); \ } else if (BP_IS_EMBEDDED(bp)) { \ len = func(buf + len, size - len, \ "EMBEDDED [L%llu %s] et=%u %s " \ "size=%llxL/%llxP birth=%lluL", \ (u_longlong_t)BP_GET_LEVEL(bp), \ type, \ (int)BPE_GET_ETYPE(bp), \ compress, \ (u_longlong_t)BPE_GET_LSIZE(bp), \ (u_longlong_t)BPE_GET_PSIZE(bp), \ (u_longlong_t)bp->blk_birth); \ } else if (BP_IS_REDACTED(bp)) { \ len += func(buf + len, size - len, \ "REDACTED [L%llu %s] size=%llxL birth=%lluL", \ (u_longlong_t)BP_GET_LEVEL(bp), \ type, \ (u_longlong_t)BP_GET_LSIZE(bp), \ (u_longlong_t)bp->blk_birth); \ } else { \ for (int d = 0; d < BP_GET_NDVAS(bp); d++) { \ const dva_t *dva = &bp->blk_dva[d]; \ if (DVA_IS_VALID(dva)) \ copies++; \ len += func(buf + len, size - len, \ "DVA[%d]=<%llu:%llx:%llx>%c", d, \ (u_longlong_t)DVA_GET_VDEV(dva), \ (u_longlong_t)DVA_GET_OFFSET(dva), \ (u_longlong_t)DVA_GET_ASIZE(dva), \ ws); \ } \ if (BP_IS_ENCRYPTED(bp)) { \ len += func(buf + len, size - len, \ "salt=%llx iv=%llx:%llx%c", \ (u_longlong_t)bp->blk_dva[2].dva_word[0], \ (u_longlong_t)bp->blk_dva[2].dva_word[1], \ (u_longlong_t)BP_GET_IV2(bp), \ ws); \ } \ if (BP_IS_GANG(bp) && \ DVA_GET_ASIZE(&bp->blk_dva[2]) <= \ DVA_GET_ASIZE(&bp->blk_dva[1]) / 2) \ copies--; \ len += func(buf + len, size - len, \ "[L%llu %s] %s %s %s %s %s %s %s%c" \ "size=%llxL/%llxP birth=%lluL/%lluP fill=%llu%c" \ 
"cksum=%llx:%llx:%llx:%llx", \ (u_longlong_t)BP_GET_LEVEL(bp), \ type, \ checksum, \ compress, \ crypt_type, \ BP_GET_BYTEORDER(bp) == 0 ? "BE" : "LE", \ BP_IS_GANG(bp) ? "gang" : "contiguous", \ BP_GET_DEDUP(bp) ? "dedup" : "unique", \ copyname[copies], \ ws, \ (u_longlong_t)BP_GET_LSIZE(bp), \ (u_longlong_t)BP_GET_PSIZE(bp), \ (u_longlong_t)bp->blk_birth, \ (u_longlong_t)BP_PHYSICAL_BIRTH(bp), \ (u_longlong_t)BP_GET_FILL(bp), \ ws, \ (u_longlong_t)bp->blk_cksum.zc_word[0], \ (u_longlong_t)bp->blk_cksum.zc_word[1], \ (u_longlong_t)bp->blk_cksum.zc_word[2], \ (u_longlong_t)bp->blk_cksum.zc_word[3]); \ } \ ASSERT(len < size); \ } #define BP_GET_BUFC_TYPE(bp) \ (BP_IS_METADATA(bp) ? ARC_BUFC_METADATA : ARC_BUFC_DATA) typedef enum spa_import_type { SPA_IMPORT_EXISTING, SPA_IMPORT_ASSEMBLE } spa_import_type_t; typedef enum spa_mode { SPA_MODE_UNINIT = 0, SPA_MODE_READ = 1, SPA_MODE_WRITE = 2, } spa_mode_t; /* * Send TRIM commands in-line during normal pool operation while deleting. * OFF: no * ON: yes * NB: IN_FREEBSD_BASE is defined within the FreeBSD sources. */ typedef enum { SPA_AUTOTRIM_OFF = 0, /* default */ SPA_AUTOTRIM_ON, #ifdef IN_FREEBSD_BASE SPA_AUTOTRIM_DEFAULT = SPA_AUTOTRIM_ON, #else SPA_AUTOTRIM_DEFAULT = SPA_AUTOTRIM_OFF, #endif } spa_autotrim_t; /* * Reason TRIM command was issued, used internally for accounting purposes. */ typedef enum trim_type { TRIM_TYPE_MANUAL = 0, TRIM_TYPE_AUTO = 1, TRIM_TYPE_SIMPLE = 2 } trim_type_t; /* state manipulation functions */ extern int spa_open(const char *pool, spa_t **, void *tag); extern int spa_open_rewind(const char *pool, spa_t **, void *tag, nvlist_t *policy, nvlist_t **config); extern int spa_get_stats(const char *pool, nvlist_t **config, char *altroot, size_t buflen); extern int spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, nvlist_t *zplprops, struct dsl_crypto_params *dcp); extern int spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags); extern nvlist_t *spa_tryimport(nvlist_t *tryconfig); extern int spa_destroy(const char *pool); extern int spa_checkpoint(const char *pool); extern int spa_checkpoint_discard(const char *pool); extern int spa_export(const char *pool, nvlist_t **oldconfig, boolean_t force, boolean_t hardforce); extern int spa_reset(const char *pool); extern void spa_async_request(spa_t *spa, int flag); extern void spa_async_unrequest(spa_t *spa, int flag); extern void spa_async_suspend(spa_t *spa); extern void spa_async_resume(spa_t *spa); extern int spa_async_tasks(spa_t *spa); extern spa_t *spa_inject_addref(char *pool); extern void spa_inject_delref(spa_t *spa); extern void spa_scan_stat_init(spa_t *spa); extern int spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps); extern int bpobj_enqueue_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx); extern int bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx); #define SPA_ASYNC_CONFIG_UPDATE 0x01 #define SPA_ASYNC_REMOVE 0x02 #define SPA_ASYNC_PROBE 0x04 #define SPA_ASYNC_RESILVER_DONE 0x08 #define SPA_ASYNC_RESILVER 0x10 #define SPA_ASYNC_AUTOEXPAND 0x20 #define SPA_ASYNC_REMOVE_DONE 0x40 #define SPA_ASYNC_REMOVE_STOP 0x80 #define SPA_ASYNC_INITIALIZE_RESTART 0x100 #define SPA_ASYNC_TRIM_RESTART 0x200 #define SPA_ASYNC_AUTOTRIM_RESTART 0x400 #define SPA_ASYNC_L2CACHE_REBUILD 0x800 #define SPA_ASYNC_L2CACHE_TRIM 0x1000 #define SPA_ASYNC_REBUILD_DONE 0x2000 /* device manipulation */ extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot); extern int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int 
replacing, int rebuild); extern int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done); extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare); extern boolean_t spa_vdev_remove_active(spa_t *spa); extern int spa_vdev_initialize(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, nvlist_t *vdev_errlist); extern int spa_vdev_trim(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, uint64_t rate, boolean_t partial, boolean_t secure, nvlist_t *vdev_errlist); extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath); extern int spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru); extern int spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, nvlist_t *props, boolean_t exp); /* spare state (which is global across all pools) */ extern void spa_spare_add(vdev_t *vd); extern void spa_spare_remove(vdev_t *vd); extern boolean_t spa_spare_exists(uint64_t guid, uint64_t *pool, int *refcnt); extern void spa_spare_activate(vdev_t *vd); /* L2ARC state (which is global across all pools) */ extern void spa_l2cache_add(vdev_t *vd); extern void spa_l2cache_remove(vdev_t *vd); extern boolean_t spa_l2cache_exists(uint64_t guid, uint64_t *pool); extern void spa_l2cache_activate(vdev_t *vd); extern void spa_l2cache_drop(spa_t *spa); /* scanning */ extern int spa_scan(spa_t *spa, pool_scan_func_t func); extern int spa_scan_stop(spa_t *spa); extern int spa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t flag); /* spa syncing */ extern void spa_sync(spa_t *spa, uint64_t txg); /* only for DMU use */ extern void spa_sync_allpools(void); extern int zfs_sync_pass_deferred_free; /* spa namespace global mutex */ extern kmutex_t spa_namespace_lock; /* * SPA configuration functions in spa_config.c */ #define SPA_CONFIG_UPDATE_POOL 0 #define SPA_CONFIG_UPDATE_VDEVS 1 extern void spa_write_cachefile(spa_t *, boolean_t, boolean_t); extern void spa_config_load(void); extern nvlist_t *spa_all_configs(uint64_t *); extern void spa_config_set(spa_t *spa, nvlist_t *config); extern nvlist_t *spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats); extern void spa_config_update(spa_t *spa, int what); extern int spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, int atype); /* * Miscellaneous SPA routines in spa_misc.c */ /* Namespace manipulation */ extern spa_t *spa_lookup(const char *name); extern spa_t *spa_add(const char *name, nvlist_t *config, const char *altroot); extern void spa_remove(spa_t *spa); extern spa_t *spa_next(spa_t *prev); /* Refcount functions */ extern void spa_open_ref(spa_t *spa, void *tag); extern void spa_close(spa_t *spa, void *tag); extern void spa_async_close(spa_t *spa, void *tag); extern boolean_t spa_refcount_zero(spa_t *spa); #define SCL_NONE 0x00 #define SCL_CONFIG 0x01 #define SCL_STATE 0x02 #define SCL_L2ARC 0x04 /* hack until L2ARC 2.0 */ #define SCL_ALLOC 0x08 #define SCL_ZIO 0x10 #define SCL_FREE 0x20 #define SCL_VDEV 0x40 #define SCL_LOCKS 7 #define SCL_ALL ((1 << SCL_LOCKS) - 1) #define SCL_STATE_ALL (SCL_STATE | SCL_L2ARC | SCL_ZIO) /* Historical pool statistics */ typedef struct spa_history_kstat { kmutex_t lock; uint64_t count; uint64_t size; kstat_t *kstat; void *priv; list_t list; } spa_history_kstat_t; typedef struct spa_history_list { uint64_t size; procfs_list_t procfs_list; } spa_history_list_t; typedef struct spa_stats { spa_history_list_t read_history; spa_history_list_t txg_history; spa_history_kstat_t tx_assign_histogram; spa_history_kstat_t io_history; 
spa_history_list_t mmp_history; spa_history_kstat_t state; /* pool state */ spa_history_kstat_t iostats; } spa_stats_t; typedef enum txg_state { TXG_STATE_BIRTH = 0, TXG_STATE_OPEN = 1, TXG_STATE_QUIESCED = 2, TXG_STATE_WAIT_FOR_SYNC = 3, TXG_STATE_SYNCED = 4, TXG_STATE_COMMITTED = 5, } txg_state_t; typedef struct txg_stat { vdev_stat_t vs1; vdev_stat_t vs2; uint64_t txg; uint64_t ndirty; } txg_stat_t; /* Assorted pool IO kstats */ typedef struct spa_iostats { kstat_named_t trim_extents_written; kstat_named_t trim_bytes_written; kstat_named_t trim_extents_skipped; kstat_named_t trim_bytes_skipped; kstat_named_t trim_extents_failed; kstat_named_t trim_bytes_failed; kstat_named_t autotrim_extents_written; kstat_named_t autotrim_bytes_written; kstat_named_t autotrim_extents_skipped; kstat_named_t autotrim_bytes_skipped; kstat_named_t autotrim_extents_failed; kstat_named_t autotrim_bytes_failed; kstat_named_t simple_trim_extents_written; kstat_named_t simple_trim_bytes_written; kstat_named_t simple_trim_extents_skipped; kstat_named_t simple_trim_bytes_skipped; kstat_named_t simple_trim_extents_failed; kstat_named_t simple_trim_bytes_failed; } spa_iostats_t; extern void spa_stats_init(spa_t *spa); extern void spa_stats_destroy(spa_t *spa); extern void spa_read_history_add(spa_t *spa, const zbookmark_phys_t *zb, uint32_t aflags); extern void spa_txg_history_add(spa_t *spa, uint64_t txg, hrtime_t birth_time); extern int spa_txg_history_set(spa_t *spa, uint64_t txg, txg_state_t completed_state, hrtime_t completed_time); extern txg_stat_t *spa_txg_history_init_io(spa_t *, uint64_t, struct dsl_pool *); extern void spa_txg_history_fini_io(spa_t *, txg_stat_t *); extern void spa_tx_assign_add_nsecs(spa_t *spa, uint64_t nsecs); extern int spa_mmp_history_set_skip(spa_t *spa, uint64_t mmp_kstat_id); extern int spa_mmp_history_set(spa_t *spa, uint64_t mmp_kstat_id, int io_error, hrtime_t duration); extern void spa_mmp_history_add(spa_t *spa, uint64_t txg, uint64_t timestamp, uint64_t mmp_delay, vdev_t *vd, int label, uint64_t mmp_kstat_id, int error); extern void spa_iostats_trim_add(spa_t *spa, trim_type_t type, uint64_t extents_written, uint64_t bytes_written, uint64_t extents_skipped, uint64_t bytes_skipped, uint64_t extents_failed, uint64_t bytes_failed); extern void spa_import_progress_add(spa_t *spa); extern void spa_import_progress_remove(uint64_t spa_guid); extern int spa_import_progress_set_mmp_check(uint64_t pool_guid, uint64_t mmp_sec_remaining); extern int spa_import_progress_set_max_txg(uint64_t pool_guid, uint64_t max_txg); extern int spa_import_progress_set_state(uint64_t pool_guid, spa_load_state_t spa_load_state); /* Pool configuration locks */ extern int spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw); extern void spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw); extern void spa_config_exit(spa_t *spa, int locks, const void *tag); extern int spa_config_held(spa_t *spa, int locks, krw_t rw); /* Pool vdev add/remove lock */ extern uint64_t spa_vdev_enter(spa_t *spa); extern uint64_t spa_vdev_detach_enter(spa_t *spa, uint64_t guid); extern uint64_t spa_vdev_config_enter(spa_t *spa); extern void spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag); extern int spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error); /* Pool vdev state change lock */ extern void spa_vdev_state_enter(spa_t *spa, int oplock); extern int spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error); /* Log state */ typedef enum spa_log_state { 
SPA_LOG_UNKNOWN = 0, /* unknown log state */ SPA_LOG_MISSING, /* missing log(s) */ SPA_LOG_CLEAR, /* clear the log(s) */ SPA_LOG_GOOD, /* log(s) are good */ } spa_log_state_t; extern spa_log_state_t spa_get_log_state(spa_t *spa); extern void spa_set_log_state(spa_t *spa, spa_log_state_t state); extern int spa_reset_logs(spa_t *spa); /* Log claim callback */ extern void spa_claim_notify(zio_t *zio); extern void spa_deadman(void *); /* Accessor functions */ extern boolean_t spa_shutting_down(spa_t *spa); extern struct dsl_pool *spa_get_dsl(spa_t *spa); extern boolean_t spa_is_initializing(spa_t *spa); extern boolean_t spa_indirect_vdevs_loaded(spa_t *spa); extern blkptr_t *spa_get_rootblkptr(spa_t *spa); extern void spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp); extern void spa_altroot(spa_t *, char *, size_t); extern int spa_sync_pass(spa_t *spa); extern char *spa_name(spa_t *spa); extern uint64_t spa_guid(spa_t *spa); extern uint64_t spa_load_guid(spa_t *spa); extern uint64_t spa_last_synced_txg(spa_t *spa); extern uint64_t spa_first_txg(spa_t *spa); extern uint64_t spa_syncing_txg(spa_t *spa); extern uint64_t spa_final_dirty_txg(spa_t *spa); extern uint64_t spa_version(spa_t *spa); extern pool_state_t spa_state(spa_t *spa); extern spa_load_state_t spa_load_state(spa_t *spa); extern uint64_t spa_freeze_txg(spa_t *spa); extern uint64_t spa_get_worst_case_asize(spa_t *spa, uint64_t lsize); extern uint64_t spa_get_dspace(spa_t *spa); extern uint64_t spa_get_checkpoint_space(spa_t *spa); extern uint64_t spa_get_slop_space(spa_t *spa); extern void spa_update_dspace(spa_t *spa); extern uint64_t spa_version(spa_t *spa); extern boolean_t spa_deflate(spa_t *spa); extern metaslab_class_t *spa_normal_class(spa_t *spa); extern metaslab_class_t *spa_log_class(spa_t *spa); extern metaslab_class_t *spa_embedded_log_class(spa_t *spa); extern metaslab_class_t *spa_special_class(spa_t *spa); extern metaslab_class_t *spa_dedup_class(spa_t *spa); extern metaslab_class_t *spa_preferred_class(spa_t *spa, uint64_t size, dmu_object_type_t objtype, uint_t level, uint_t special_smallblk); extern void spa_evicting_os_register(spa_t *, objset_t *os); extern void spa_evicting_os_deregister(spa_t *, objset_t *os); extern void spa_evicting_os_wait(spa_t *spa); extern int spa_max_replication(spa_t *spa); extern int spa_prev_software_version(spa_t *spa); extern uint64_t spa_get_failmode(spa_t *spa); extern uint64_t spa_get_deadman_failmode(spa_t *spa); extern void spa_set_deadman_failmode(spa_t *spa, const char *failmode); extern boolean_t spa_suspended(spa_t *spa); extern uint64_t spa_bootfs(spa_t *spa); extern uint64_t spa_delegation(spa_t *spa); extern objset_t *spa_meta_objset(spa_t *spa); extern space_map_t *spa_syncing_log_sm(spa_t *spa); extern uint64_t spa_deadman_synctime(spa_t *spa); extern uint64_t spa_deadman_ziotime(spa_t *spa); extern uint64_t spa_dirty_data(spa_t *spa); extern spa_autotrim_t spa_get_autotrim(spa_t *spa); /* Miscellaneous support routines */ extern void spa_load_failed(spa_t *spa, const char *fmt, ...); extern void spa_load_note(spa_t *spa, const char *fmt, ...); extern void spa_activate_mos_feature(spa_t *spa, const char *feature, dmu_tx_t *tx); extern void spa_deactivate_mos_feature(spa_t *spa, const char *feature); extern spa_t *spa_by_guid(uint64_t pool_guid, uint64_t device_guid); extern boolean_t spa_guid_exists(uint64_t pool_guid, uint64_t device_guid); extern char *spa_strdup(const char *); extern void spa_strfree(char *); extern uint64_t spa_get_random(uint64_t range); extern 
uint64_t spa_generate_guid(spa_t *spa); extern void snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp); extern void spa_freeze(spa_t *spa); extern int spa_change_guid(spa_t *spa); extern void spa_upgrade(spa_t *spa, uint64_t version); extern void spa_evict_all(void); extern vdev_t *spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t l2cache); extern boolean_t spa_has_spare(spa_t *, uint64_t guid); extern uint64_t dva_get_dsize_sync(spa_t *spa, const dva_t *dva); extern uint64_t bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp); extern uint64_t bp_get_dsize(spa_t *spa, const blkptr_t *bp); extern boolean_t spa_has_slogs(spa_t *spa); extern boolean_t spa_is_root(spa_t *spa); extern boolean_t spa_writeable(spa_t *spa); extern boolean_t spa_has_pending_synctask(spa_t *spa); extern int spa_maxblocksize(spa_t *spa); extern int spa_maxdnodesize(spa_t *spa); extern boolean_t spa_has_checkpoint(spa_t *spa); extern boolean_t spa_importing_readonly_checkpoint(spa_t *spa); extern boolean_t spa_suspend_async_destroy(spa_t *spa); extern uint64_t spa_min_claim_txg(spa_t *spa); extern boolean_t zfs_dva_valid(spa_t *spa, const dva_t *dva, const blkptr_t *bp); typedef void (*spa_remap_cb_t)(uint64_t vdev, uint64_t offset, uint64_t size, void *arg); extern boolean_t spa_remap_blkptr(spa_t *spa, blkptr_t *bp, spa_remap_cb_t callback, void *arg); extern uint64_t spa_get_last_removal_txg(spa_t *spa); extern boolean_t spa_trust_config(spa_t *spa); extern uint64_t spa_missing_tvds_allowed(spa_t *spa); extern void spa_set_missing_tvds(spa_t *spa, uint64_t missing); extern boolean_t spa_top_vdevs_spacemap_addressable(spa_t *spa); extern uint64_t spa_total_metaslabs(spa_t *spa); extern boolean_t spa_multihost(spa_t *spa); extern uint32_t spa_get_hostid(spa_t *spa); extern void spa_activate_allocation_classes(spa_t *, dmu_tx_t *); extern boolean_t spa_livelist_delete_check(spa_t *spa); extern spa_mode_t spa_mode(spa_t *spa); extern uint64_t zfs_strtonum(const char *str, char **nptr); extern char *spa_his_ievent_table[]; extern void spa_history_create_obj(spa_t *spa, dmu_tx_t *tx); extern int spa_history_get(spa_t *spa, uint64_t *offset, uint64_t *len_read, char *his_buf); extern int spa_history_log(spa_t *spa, const char *his_buf); extern int spa_history_log_nvl(spa_t *spa, nvlist_t *nvl); extern void spa_history_log_version(spa_t *spa, const char *operation, dmu_tx_t *tx); extern void spa_history_log_internal(spa_t *spa, const char *operation, dmu_tx_t *tx, const char *fmt, ...) __printflike(4, 5); extern void spa_history_log_internal_ds(struct dsl_dataset *ds, const char *op, dmu_tx_t *tx, const char *fmt, ...) __printflike(4, 5); extern void spa_history_log_internal_dd(dsl_dir_t *dd, const char *operation, dmu_tx_t *tx, const char *fmt, ...) 
__printflike(4, 5); extern const char *spa_state_to_name(spa_t *spa); /* error handling */ struct zbookmark_phys; extern void spa_log_error(spa_t *spa, const zbookmark_phys_t *zb); extern int zfs_ereport_post(const char *clazz, spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, zio_t *zio, uint64_t state); extern boolean_t zfs_ereport_is_valid(const char *clazz, spa_t *spa, vdev_t *vd, zio_t *zio); extern void zfs_ereport_taskq_fini(void); +extern void zfs_ereport_clear(spa_t *spa, vdev_t *vd); extern nvlist_t *zfs_event_create(spa_t *spa, vdev_t *vd, const char *type, const char *name, nvlist_t *aux); extern void zfs_post_remove(spa_t *spa, vdev_t *vd); extern void zfs_post_state_change(spa_t *spa, vdev_t *vd, uint64_t laststate); extern void zfs_post_autoreplace(spa_t *spa, vdev_t *vd); extern uint64_t spa_get_errlog_size(spa_t *spa); extern int spa_get_errlog(spa_t *spa, void *uaddr, size_t *count); extern void spa_errlog_rotate(spa_t *spa); extern void spa_errlog_drain(spa_t *spa); extern void spa_errlog_sync(spa_t *spa, uint64_t txg); extern void spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub); /* vdev cache */ extern void vdev_cache_stat_init(void); extern void vdev_cache_stat_fini(void); /* vdev mirror */ extern void vdev_mirror_stat_init(void); extern void vdev_mirror_stat_fini(void); /* Initialization and termination */ extern void spa_init(spa_mode_t mode); extern void spa_fini(void); extern void spa_boot_init(void); /* properties */ extern int spa_prop_set(spa_t *spa, nvlist_t *nvp); extern int spa_prop_get(spa_t *spa, nvlist_t **nvp); extern void spa_prop_clear_bootfs(spa_t *spa, uint64_t obj, dmu_tx_t *tx); extern void spa_configfile_set(spa_t *, nvlist_t *, boolean_t); /* asynchronous event notification */ extern void spa_event_notify(spa_t *spa, vdev_t *vdev, nvlist_t *hist_nvl, const char *name); /* waiting for pool activities to complete */ extern int spa_wait(const char *pool, zpool_wait_activity_t activity, boolean_t *waited); extern int spa_wait_tag(const char *name, zpool_wait_activity_t activity, uint64_t tag, boolean_t *waited); extern void spa_notify_waiters(spa_t *spa); extern void spa_wake_waiters(spa_t *spa); /* module param call functions */ int param_set_deadman_ziotime(ZFS_MODULE_PARAM_ARGS); int param_set_deadman_synctime(ZFS_MODULE_PARAM_ARGS); int param_set_slop_shift(ZFS_MODULE_PARAM_ARGS); int param_set_deadman_failmode(ZFS_MODULE_PARAM_ARGS); #ifdef ZFS_DEBUG #define dprintf_bp(bp, fmt, ...) do { \ if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP); \ snprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, (bp)); \ dprintf(fmt " %s\n", __VA_ARGS__, __blkbuf); \ kmem_free(__blkbuf, BP_SPRINTF_LEN); \ } \ _NOTE(CONSTCOND) } while (0) #else #define dprintf_bp(bp, fmt, ...) #endif extern spa_mode_t spa_mode_global; extern int zfs_deadman_enabled; extern unsigned long zfs_deadman_synctime_ms; extern unsigned long zfs_deadman_ziotime_ms; extern unsigned long zfs_deadman_checktime_ms; #ifdef __cplusplus } #endif #endif /* _SYS_SPA_H */ diff --git a/sys/contrib/openzfs/include/sys/uio_impl.h b/sys/contrib/openzfs/include/sys/uio_impl.h index be70cea54818..cde3ef40485b 100644 --- a/sys/contrib/openzfs/include/sys/uio_impl.h +++ b/sys/contrib/openzfs/include/sys/uio_impl.h @@ -1,49 +1,68 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. 
* * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ /* All Rights Reserved */ /* * University Copyright- Copyright (c) 1982, 1986, 1988 * The Regents of the University of California * All Rights Reserved * * University Acknowledgment- Portions of this document are derived from * software developed by the University of California, Berkeley, and its * contributors. */ #ifndef _SYS_UIO_IMPL_H #define _SYS_UIO_IMPL_H #include extern int zfs_uiomove(void *, size_t, zfs_uio_rw_t, zfs_uio_t *); -extern int zfs_uio_prefaultpages(ssize_t, zfs_uio_t *); extern int zfs_uiocopy(void *, size_t, zfs_uio_rw_t, zfs_uio_t *, size_t *); extern void zfs_uioskip(zfs_uio_t *, size_t); +static inline void +zfs_uio_iov_at_index(zfs_uio_t *uio, uint_t idx, void **base, uint64_t *len) +{ + *base = zfs_uio_iovbase(uio, idx); + *len = zfs_uio_iovlen(uio, idx); +} + +static inline offset_t +zfs_uio_index_at_offset(zfs_uio_t *uio, offset_t off, uint_t *vec_idx) +{ + *vec_idx = 0; + while (*vec_idx < zfs_uio_iovcnt(uio) && + off >= zfs_uio_iovlen(uio, *vec_idx)) { + off -= zfs_uio_iovlen(uio, *vec_idx); + (*vec_idx)++; + } + + return (off); +} + #endif /* _SYS_UIO_IMPL_H */ diff --git a/sys/contrib/openzfs/include/sys/zfs_context.h b/sys/contrib/openzfs/include/sys/zfs_context.h index ee3216d6763a..6cdcc6d30966 100644 --- a/sys/contrib/openzfs/include/sys/zfs_context.h +++ b/sys/contrib/openzfs/include/sys/zfs_context.h @@ -1,777 +1,778 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ #ifndef _SYS_ZFS_CONTEXT_H #define _SYS_ZFS_CONTEXT_H #ifdef __cplusplus extern "C" { #endif /* * This code compiles in three different contexts. When __KERNEL__ is defined, * the code uses "unix-like" kernel interfaces. 
When _STANDALONE is defined, the * code is running in a reduced capacity environment of the boot loader which is * generally a subset of both POSIX and kernel interfaces (with a few unique * interfaces too). When neither are defined, it's in a userland POSIX or * similar environment. */ #if defined(__KERNEL__) || defined(_STANDALONE) #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #else /* _KERNEL || _STANDALONE */ #define _SYS_MUTEX_H #define _SYS_RWLOCK_H #define _SYS_CONDVAR_H #define _SYS_VNODE_H #define _SYS_VFS_H #define _SYS_SUNDDI_H #define _SYS_CALLB_H #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Stack */ #define noinline __attribute__((noinline)) #define likely(x) __builtin_expect((x), 1) #define unlikely(x) __builtin_expect((x), 0) /* * Debugging */ /* * Note that we are not using the debugging levels. */ #define CE_CONT 0 /* continuation */ #define CE_NOTE 1 /* notice */ #define CE_WARN 2 /* warning */ #define CE_PANIC 3 /* panic */ #define CE_IGNORE 4 /* print nothing */ /* * ZFS debugging */ extern void dprintf_setup(int *argc, char **argv); extern void cmn_err(int, const char *, ...); extern void vcmn_err(int, const char *, va_list); extern void panic(const char *, ...) __NORETURN; extern void vpanic(const char *, va_list) __NORETURN; #define fm_panic panic extern int aok; /* * DTrace SDT probes have different signatures in userland than they do in * the kernel. If they're being used in kernel code, re-define them out of * existence for their counterparts in libzpool. * * Here's an example of how to use the set-error probes in userland: * zfs$target:::set-error /arg0 == EBUSY/ {stack();} * * Here's an example of how to use DTRACE_PROBE probes in userland: * If there is a probe declared as follows: * DTRACE_PROBE2(zfs__probe_name, uint64_t, blkid, dnode_t *, dn); * Then you can use it as follows: * zfs$target:::probe2 /copyinstr(arg0) == "zfs__probe_name"/ * {printf("%u %p\n", arg1, arg2);} */ #ifdef DTRACE_PROBE #undef DTRACE_PROBE #endif /* DTRACE_PROBE */ #define DTRACE_PROBE(a) #ifdef DTRACE_PROBE1 #undef DTRACE_PROBE1 #endif /* DTRACE_PROBE1 */ #define DTRACE_PROBE1(a, b, c) #ifdef DTRACE_PROBE2 #undef DTRACE_PROBE2 #endif /* DTRACE_PROBE2 */ #define DTRACE_PROBE2(a, b, c, d, e) #ifdef DTRACE_PROBE3 #undef DTRACE_PROBE3 #endif /* DTRACE_PROBE3 */ #define DTRACE_PROBE3(a, b, c, d, e, f, g) #ifdef DTRACE_PROBE4 #undef DTRACE_PROBE4 #endif /* DTRACE_PROBE4 */ #define DTRACE_PROBE4(a, b, c, d, e, f, g, h, i) /* * Tunables. */ typedef struct zfs_kernel_param { const char *name; /* unused stub */ } zfs_kernel_param_t; #define ZFS_MODULE_PARAM(scope_prefix, name_prefix, name, type, perm, desc) #define ZFS_MODULE_PARAM_ARGS void #define ZFS_MODULE_PARAM_CALL(scope_prefix, name_prefix, name, setfunc, \ getfunc, perm, desc) /* * Threads. 
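 * In this userland build the kernel thread API is emulated with POSIX
 * threads: kthread_t is simply a pthread_t, and the thread_create()
 * macros below resolve to zk_thread_create().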
*/ typedef pthread_t kthread_t; #define TS_RUN 0x00000002 #define TS_JOINABLE 0x00000004 #define curthread ((void *)(uintptr_t)pthread_self()) #define kpreempt(x) yield() #define getcomm() "unknown" #define thread_create_named(name, stk, stksize, func, arg, len, \ pp, state, pri) \ zk_thread_create(func, arg, stksize, state) #define thread_create(stk, stksize, func, arg, len, pp, state, pri) \ zk_thread_create(func, arg, stksize, state) #define thread_exit() pthread_exit(NULL) #define thread_join(t) pthread_join((pthread_t)(t), NULL) #define newproc(f, a, cid, pri, ctp, pid) (ENOSYS) /* in libzpool, p0 exists only to have its address taken */ typedef struct proc { uintptr_t this_is_never_used_dont_dereference_it; } proc_t; extern struct proc p0; #define curproc (&p0) #define PS_NONE -1 extern kthread_t *zk_thread_create(void (*func)(void *), void *arg, size_t stksize, int state); #define issig(why) (FALSE) #define ISSIG(thr, why) (FALSE) #define kpreempt_disable() ((void)0) #define kpreempt_enable() ((void)0) #define cond_resched() sched_yield() /* * Mutexes */ typedef struct kmutex { pthread_mutex_t m_lock; pthread_t m_owner; } kmutex_t; #define MUTEX_DEFAULT 0 #define MUTEX_NOLOCKDEP MUTEX_DEFAULT #define MUTEX_HELD(mp) pthread_equal((mp)->m_owner, pthread_self()) #define MUTEX_NOT_HELD(mp) !MUTEX_HELD(mp) extern void mutex_init(kmutex_t *mp, char *name, int type, void *cookie); extern void mutex_destroy(kmutex_t *mp); extern void mutex_enter(kmutex_t *mp); extern void mutex_exit(kmutex_t *mp); extern int mutex_tryenter(kmutex_t *mp); #define NESTED_SINGLE 1 #define mutex_enter_nested(mp, class) mutex_enter(mp) /* * RW locks */ typedef struct krwlock { pthread_rwlock_t rw_lock; pthread_t rw_owner; uint_t rw_readers; } krwlock_t; typedef int krw_t; #define RW_READER 0 #define RW_WRITER 1 #define RW_DEFAULT RW_READER #define RW_NOLOCKDEP RW_READER #define RW_READ_HELD(rw) ((rw)->rw_readers > 0) #define RW_WRITE_HELD(rw) pthread_equal((rw)->rw_owner, pthread_self()) #define RW_LOCK_HELD(rw) (RW_READ_HELD(rw) || RW_WRITE_HELD(rw)) extern void rw_init(krwlock_t *rwlp, char *name, int type, void *arg); extern void rw_destroy(krwlock_t *rwlp); extern void rw_enter(krwlock_t *rwlp, krw_t rw); extern int rw_tryenter(krwlock_t *rwlp, krw_t rw); extern int rw_tryupgrade(krwlock_t *rwlp); extern void rw_exit(krwlock_t *rwlp); #define rw_downgrade(rwlp) do { } while (0) /* * Credentials */ extern uid_t crgetuid(cred_t *cr); extern uid_t crgetruid(cred_t *cr); extern gid_t crgetgid(cred_t *cr); extern int crgetngroups(cred_t *cr); extern gid_t *crgetgroups(cred_t *cr); /* * Condition variables */ typedef pthread_cond_t kcondvar_t; #define CV_DEFAULT 0 #define CALLOUT_FLAG_ABSOLUTE 0x2 extern void cv_init(kcondvar_t *cv, char *name, int type, void *arg); extern void cv_destroy(kcondvar_t *cv); extern void cv_wait(kcondvar_t *cv, kmutex_t *mp); extern int cv_wait_sig(kcondvar_t *cv, kmutex_t *mp); extern int cv_timedwait(kcondvar_t *cv, kmutex_t *mp, clock_t abstime); extern int cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim, hrtime_t res, int flag); extern void cv_signal(kcondvar_t *cv); extern void cv_broadcast(kcondvar_t *cv); #define cv_timedwait_io(cv, mp, at) cv_timedwait(cv, mp, at) #define cv_timedwait_idle(cv, mp, at) cv_timedwait(cv, mp, at) #define cv_timedwait_sig(cv, mp, at) cv_timedwait(cv, mp, at) #define cv_wait_io(cv, mp) cv_wait(cv, mp) #define cv_wait_idle(cv, mp) cv_wait(cv, mp) #define cv_wait_io_sig(cv, mp) cv_wait_sig(cv, mp) #define cv_timedwait_sig_hires(cv, mp, 
t, r, f) \ cv_timedwait_hires(cv, mp, t, r, f) #define cv_timedwait_idle_hires(cv, mp, t, r, f) \ cv_timedwait_hires(cv, mp, t, r, f) /* * Thread-specific data */ #define tsd_get(k) pthread_getspecific(k) #define tsd_set(k, v) pthread_setspecific(k, v) #define tsd_create(kp, d) pthread_key_create((pthread_key_t *)kp, d) #define tsd_destroy(kp) /* nothing */ #ifdef __FreeBSD__ typedef off_t loff_t; #endif /* * kstat creation, installation and deletion */ extern kstat_t *kstat_create(const char *, int, const char *, const char *, uchar_t, ulong_t, uchar_t); extern void kstat_install(kstat_t *); extern void kstat_delete(kstat_t *); extern void kstat_waitq_enter(kstat_io_t *); extern void kstat_waitq_exit(kstat_io_t *); extern void kstat_runq_enter(kstat_io_t *); extern void kstat_runq_exit(kstat_io_t *); extern void kstat_waitq_to_runq(kstat_io_t *); extern void kstat_runq_back_to_waitq(kstat_io_t *); extern void kstat_set_raw_ops(kstat_t *ksp, int (*headers)(char *buf, size_t size), int (*data)(char *buf, size_t size, void *data), void *(*addr)(kstat_t *ksp, loff_t index)); /* * procfs list manipulation */ typedef struct procfs_list { void *pl_private; kmutex_t pl_lock; list_t pl_list; uint64_t pl_next_id; size_t pl_node_offset; } procfs_list_t; #ifndef __cplusplus struct seq_file { }; void seq_printf(struct seq_file *m, const char *fmt, ...); typedef struct procfs_list_node { list_node_t pln_link; uint64_t pln_id; } procfs_list_node_t; void procfs_list_install(const char *module, const char *submodule, const char *name, mode_t mode, procfs_list_t *procfs_list, int (*show)(struct seq_file *f, void *p), int (*show_header)(struct seq_file *f), int (*clear)(procfs_list_t *procfs_list), size_t procfs_list_node_off); void procfs_list_uninstall(procfs_list_t *procfs_list); void procfs_list_destroy(procfs_list_t *procfs_list); void procfs_list_add(procfs_list_t *procfs_list, void *p); #endif /* * Kernel memory */ #define KM_SLEEP UMEM_NOFAIL #define KM_PUSHPAGE KM_SLEEP #define KM_NOSLEEP UMEM_DEFAULT #define KM_NORMALPRI 0 /* not needed with UMEM_DEFAULT */ #define KMC_NODEBUG UMC_NODEBUG #define KMC_KVMEM 0x0 #define kmem_alloc(_s, _f) umem_alloc(_s, _f) #define kmem_zalloc(_s, _f) umem_zalloc(_s, _f) #define kmem_free(_b, _s) umem_free(_b, _s) #define vmem_alloc(_s, _f) kmem_alloc(_s, _f) #define vmem_zalloc(_s, _f) kmem_zalloc(_s, _f) #define vmem_free(_b, _s) kmem_free(_b, _s) #define kmem_cache_create(_a, _b, _c, _d, _e, _f, _g, _h, _i) \ umem_cache_create(_a, _b, _c, _d, _e, _f, _g, _h, _i) #define kmem_cache_destroy(_c) umem_cache_destroy(_c) #define kmem_cache_alloc(_c, _f) umem_cache_alloc(_c, _f) #define kmem_cache_free(_c, _b) umem_cache_free(_c, _b) #define kmem_debugging() 0 #define kmem_cache_reap_now(_c) umem_cache_reap_now(_c); #define kmem_cache_set_move(_c, _cb) /* nothing */ #define POINTER_INVALIDATE(_pp) /* nothing */ #define POINTER_IS_VALID(_p) 0 typedef umem_cache_t kmem_cache_t; typedef enum kmem_cbrc { KMEM_CBRC_YES, KMEM_CBRC_NO, KMEM_CBRC_LATER, KMEM_CBRC_DONT_NEED, KMEM_CBRC_DONT_KNOW } kmem_cbrc_t; /* * Task queues */ #define TASKQ_NAMELEN 31 typedef uintptr_t taskqid_t; typedef void (task_func_t)(void *); typedef struct taskq_ent { struct taskq_ent *tqent_next; struct taskq_ent *tqent_prev; task_func_t *tqent_func; void *tqent_arg; uintptr_t tqent_flags; } taskq_ent_t; typedef struct taskq { char tq_name[TASKQ_NAMELEN + 1]; kmutex_t tq_lock; krwlock_t tq_threadlock; kcondvar_t tq_dispatch_cv; kcondvar_t tq_wait_cv; kthread_t **tq_threadlist; int tq_flags; int 
tq_active; int tq_nthreads; int tq_nalloc; int tq_minalloc; int tq_maxalloc; kcondvar_t tq_maxalloc_cv; int tq_maxalloc_wait; taskq_ent_t *tq_freelist; taskq_ent_t tq_task; } taskq_t; #define TQENT_FLAG_PREALLOC 0x1 /* taskq_dispatch_ent used */ #define TASKQ_PREPOPULATE 0x0001 #define TASKQ_CPR_SAFE 0x0002 /* Use CPR safe protocol */ #define TASKQ_DYNAMIC 0x0004 /* Use dynamic thread scheduling */ #define TASKQ_THREADS_CPU_PCT 0x0008 /* Scale # threads by # cpus */ #define TASKQ_DC_BATCH 0x0010 /* Mark threads as batch */ #define TQ_SLEEP KM_SLEEP /* Can block for memory */ #define TQ_NOSLEEP KM_NOSLEEP /* cannot block for memory; may fail */ #define TQ_NOQUEUE 0x02 /* Do not enqueue if can't dispatch */ #define TQ_FRONT 0x08 /* Queue in front */ #define TASKQID_INVALID ((taskqid_t)0) extern taskq_t *system_taskq; extern taskq_t *system_delay_taskq; extern taskq_t *taskq_create(const char *, int, pri_t, int, int, uint_t); #define taskq_create_proc(a, b, c, d, e, p, f) \ (taskq_create(a, b, c, d, e, f)) #define taskq_create_sysdc(a, b, d, e, p, dc, f) \ (taskq_create(a, b, maxclsyspri, d, e, f)) extern taskqid_t taskq_dispatch(taskq_t *, task_func_t, void *, uint_t); extern taskqid_t taskq_dispatch_delay(taskq_t *, task_func_t, void *, uint_t, clock_t); extern void taskq_dispatch_ent(taskq_t *, task_func_t, void *, uint_t, taskq_ent_t *); extern int taskq_empty_ent(taskq_ent_t *); extern void taskq_init_ent(taskq_ent_t *); extern void taskq_destroy(taskq_t *); extern void taskq_wait(taskq_t *); extern void taskq_wait_id(taskq_t *, taskqid_t); extern void taskq_wait_outstanding(taskq_t *, taskqid_t); extern int taskq_member(taskq_t *, kthread_t *); extern taskq_t *taskq_of_curthread(void); extern int taskq_cancel_id(taskq_t *, taskqid_t); extern void system_taskq_init(void); extern void system_taskq_fini(void); #define XVA_MAPSIZE 3 #define XVA_MAGIC 0x78766174 extern char *vn_dumpdir; #define AV_SCANSTAMP_SZ 32 /* length of anti-virus scanstamp */ typedef struct xoptattr { inode_timespec_t xoa_createtime; /* Create time of file */ uint8_t xoa_archive; uint8_t xoa_system; uint8_t xoa_readonly; uint8_t xoa_hidden; uint8_t xoa_nounlink; uint8_t xoa_immutable; uint8_t xoa_appendonly; uint8_t xoa_nodump; uint8_t xoa_settable; uint8_t xoa_opaque; uint8_t xoa_av_quarantined; uint8_t xoa_av_modified; uint8_t xoa_av_scanstamp[AV_SCANSTAMP_SZ]; uint8_t xoa_reparse; uint8_t xoa_offline; uint8_t xoa_sparse; } xoptattr_t; typedef struct vattr { uint_t va_mask; /* bit-mask of attributes */ u_offset_t va_size; /* file size in bytes */ } vattr_t; typedef struct xvattr { vattr_t xva_vattr; /* Embedded vattr structure */ uint32_t xva_magic; /* Magic Number */ uint32_t xva_mapsize; /* Size of attr bitmap (32-bit words) */ uint32_t *xva_rtnattrmapp; /* Ptr to xva_rtnattrmap[] */ uint32_t xva_reqattrmap[XVA_MAPSIZE]; /* Requested attrs */ uint32_t xva_rtnattrmap[XVA_MAPSIZE]; /* Returned attrs */ xoptattr_t xva_xoptattrs; /* Optional attributes */ } xvattr_t; typedef struct vsecattr { uint_t vsa_mask; /* See below */ int vsa_aclcnt; /* ACL entry count */ void *vsa_aclentp; /* pointer to ACL entries */ int vsa_dfaclcnt; /* default ACL entry count */ void *vsa_dfaclentp; /* pointer to default ACL entries */ size_t vsa_aclentsz; /* ACE size in bytes of vsa_aclentp */ } vsecattr_t; #define AT_MODE 0x00002 #define AT_UID 0x00004 #define AT_GID 0x00008 #define AT_FSID 0x00010 #define AT_NODEID 0x00020 #define AT_NLINK 0x00040 #define AT_SIZE 0x00080 #define AT_ATIME 0x00100 #define AT_MTIME 0x00200 #define AT_CTIME 
0x00400 #define AT_RDEV 0x00800 #define AT_BLKSIZE 0x01000 #define AT_NBLOCKS 0x02000 #define AT_SEQ 0x08000 #define AT_XVATTR 0x10000 #define CRCREAT 0 #define F_FREESP 11 #define FIGNORECASE 0x80000 /* request case-insensitive lookups */ /* * Random stuff */ #define ddi_get_lbolt() (gethrtime() >> 23) #define ddi_get_lbolt64() (gethrtime() >> 23) #define hz 119 /* frequency when using gethrtime() >> 23 for lbolt */ #define ddi_time_before(a, b) (a < b) #define ddi_time_after(a, b) ddi_time_before(b, a) #define ddi_time_before_eq(a, b) (!ddi_time_after(a, b)) #define ddi_time_after_eq(a, b) ddi_time_before_eq(b, a) #define ddi_time_before64(a, b) (a < b) #define ddi_time_after64(a, b) ddi_time_before64(b, a) #define ddi_time_before_eq64(a, b) (!ddi_time_after64(a, b)) #define ddi_time_after_eq64(a, b) ddi_time_before_eq64(b, a) extern void delay(clock_t ticks); #define SEC_TO_TICK(sec) ((sec) * hz) #define MSEC_TO_TICK(msec) (howmany((hrtime_t)(msec) * hz, MILLISEC)) #define USEC_TO_TICK(usec) (howmany((hrtime_t)(usec) * hz, MICROSEC)) #define NSEC_TO_TICK(nsec) (howmany((hrtime_t)(nsec) * hz, NANOSEC)) #define max_ncpus 64 #define boot_ncpus (sysconf(_SC_NPROCESSORS_ONLN)) /* * Process priorities as defined by setpriority(2) and getpriority(2). */ #define minclsyspri 19 #define maxclsyspri -20 #define defclsyspri 0 #define CPU_SEQID ((uintptr_t)pthread_self() & (max_ncpus - 1)) #define CPU_SEQID_UNSTABLE CPU_SEQID #define kcred NULL #define CRED() NULL #define ptob(x) ((x) * PAGESIZE) #define NN_DIVISOR_1000 (1U << 0) #define NN_NUMBUF_SZ (6) extern uint64_t physmem; extern char *random_path; extern char *urandom_path; extern int highbit64(uint64_t i); extern int lowbit64(uint64_t i); extern int random_get_bytes(uint8_t *ptr, size_t len); extern int random_get_pseudo_bytes(uint8_t *ptr, size_t len); extern void kernel_init(int mode); extern void kernel_fini(void); extern void random_init(void); extern void random_fini(void); struct spa; extern void show_pool_stats(struct spa *); -extern int set_global_var(char *arg); +extern int set_global_var(char const *arg); typedef struct callb_cpr { kmutex_t *cc_lockp; } callb_cpr_t; #define CALLB_CPR_INIT(cp, lockp, func, name) { \ (cp)->cc_lockp = lockp; \ } #define CALLB_CPR_SAFE_BEGIN(cp) { \ ASSERT(MUTEX_HELD((cp)->cc_lockp)); \ } #define CALLB_CPR_SAFE_END(cp, lockp) { \ ASSERT(MUTEX_HELD((cp)->cc_lockp)); \ } #define CALLB_CPR_EXIT(cp) { \ ASSERT(MUTEX_HELD((cp)->cc_lockp)); \ mutex_exit((cp)->cc_lockp); \ } #define zone_dataset_visible(x, y) (1) #define INGLOBALZONE(z) (1) extern uint32_t zone_get_hostid(void *zonep); extern char *kmem_vasprintf(const char *fmt, va_list adx); extern char *kmem_asprintf(const char *fmt, ...); #define kmem_strfree(str) kmem_free((str), strlen(str) + 1) #define kmem_strdup(s) strdup(s) /* * Hostname information */ extern char hw_serial[]; /* for userland-emulated hostid access */ extern int ddi_strtoul(const char *str, char **nptr, int base, unsigned long *result); extern int ddi_strtoull(const char *str, char **nptr, int base, u_longlong_t *result); typedef struct utsname utsname_t; extern utsname_t *utsname(void); /* ZFS Boot Related stuff. 
*/ struct _buf { intptr_t _fd; }; struct bootstat { uint64_t st_size; }; typedef struct ace_object { uid_t a_who; uint32_t a_access_mask; uint16_t a_flags; uint16_t a_type; uint8_t a_obj_type[16]; uint8_t a_inherit_obj_type[16]; } ace_object_t; #define ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE 0x05 #define ACE_ACCESS_DENIED_OBJECT_ACE_TYPE 0x06 #define ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE 0x07 #define ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE 0x08 extern int zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr); extern int zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr); extern int zfs_secpolicy_destroy_perms(const char *name, cred_t *cr); extern int secpolicy_zfs(const cred_t *cr); extern int secpolicy_zfs_proc(const cred_t *cr, proc_t *proc); extern zoneid_t getzoneid(void); /* SID stuff */ typedef struct ksiddomain { uint_t kd_ref; uint_t kd_len; char *kd_name; } ksiddomain_t; ksiddomain_t *ksid_lookupdomain(const char *); void ksiddomain_rele(ksiddomain_t *); #define DDI_SLEEP KM_SLEEP #define ddi_log_sysevent(_a, _b, _c, _d, _e, _f, _g) \ sysevent_post_event(_c, _d, _b, "libzpool", _e, _f) #define zfs_sleep_until(wakeup) \ do { \ hrtime_t delta = wakeup - gethrtime(); \ struct timespec ts; \ ts.tv_sec = delta / NANOSEC; \ ts.tv_nsec = delta % NANOSEC; \ (void) nanosleep(&ts, NULL); \ } while (0) typedef int fstrans_cookie_t; extern fstrans_cookie_t spl_fstrans_mark(void); extern void spl_fstrans_unmark(fstrans_cookie_t); extern int __spl_pf_fstrans_check(void); extern int kmem_cache_reap_active(void); #define ____cacheline_aligned /* * Kernel modules */ #define __init #define __exit #endif /* _KERNEL || _STANDALONE */ #ifdef __cplusplus }; #endif #endif /* _SYS_ZFS_CONTEXT_H */ diff --git a/sys/contrib/openzfs/lib/libzpool/util.c b/sys/contrib/openzfs/lib/libzpool/util.c index ebfaa9b41a2a..2da2375a1d2d 100644 --- a/sys/contrib/openzfs/lib/libzpool/util.c +++ b/sys/contrib/openzfs/lib/libzpool/util.c @@ -1,257 +1,293 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2016 by Delphix. All rights reserved. * Copyright 2017 Jason King * Copyright (c) 2017, Intel Corporation. */ #include #include #include #include #include #include #include #include #include #include #include #include /* * Routines needed by more than one client of libzpool. 
*/ static void show_vdev_stats(const char *desc, const char *ctype, nvlist_t *nv, int indent) { vdev_stat_t *vs; vdev_stat_t *v0 = { 0 }; uint64_t sec; uint64_t is_log = 0; nvlist_t **child; uint_t c, children; char used[6], avail[6]; char rops[6], wops[6], rbytes[6], wbytes[6], rerr[6], werr[6], cerr[6]; v0 = umem_zalloc(sizeof (*v0), UMEM_NOFAIL); if (indent == 0 && desc != NULL) { (void) printf(" " " capacity operations bandwidth ---- errors ----\n"); (void) printf("description " "used avail read write read write read write cksum\n"); } if (desc != NULL) { char *suffix = "", *bias = NULL; char bias_suffix[32]; (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &is_log); (void) nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, &bias); if (nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &c) != 0) vs = v0; if (bias != NULL) { (void) snprintf(bias_suffix, sizeof (bias_suffix), " (%s)", bias); suffix = bias_suffix; } else if (is_log) { suffix = " (log)"; } sec = MAX(1, vs->vs_timestamp / NANOSEC); nicenum(vs->vs_alloc, used, sizeof (used)); nicenum(vs->vs_space - vs->vs_alloc, avail, sizeof (avail)); nicenum(vs->vs_ops[ZIO_TYPE_READ] / sec, rops, sizeof (rops)); nicenum(vs->vs_ops[ZIO_TYPE_WRITE] / sec, wops, sizeof (wops)); nicenum(vs->vs_bytes[ZIO_TYPE_READ] / sec, rbytes, sizeof (rbytes)); nicenum(vs->vs_bytes[ZIO_TYPE_WRITE] / sec, wbytes, sizeof (wbytes)); nicenum(vs->vs_read_errors, rerr, sizeof (rerr)); nicenum(vs->vs_write_errors, werr, sizeof (werr)); nicenum(vs->vs_checksum_errors, cerr, sizeof (cerr)); (void) printf("%*s%s%*s%*s%*s %5s %5s %5s %5s %5s %5s %5s\n", indent, "", desc, (int)(indent+strlen(desc)-25-(vs->vs_space ? 0 : 12)), suffix, vs->vs_space ? 6 : 0, vs->vs_space ? used : "", vs->vs_space ? 6 : 0, vs->vs_space ? 
avail : "", rops, wops, rbytes, wbytes, rerr, werr, cerr); } umem_free(v0, sizeof (*v0)); if (nvlist_lookup_nvlist_array(nv, ctype, &child, &children) != 0) return; for (c = 0; c < children; c++) { nvlist_t *cnv = child[c]; char *cname = NULL, *tname; uint64_t np; int len; if (nvlist_lookup_string(cnv, ZPOOL_CONFIG_PATH, &cname) && nvlist_lookup_string(cnv, ZPOOL_CONFIG_TYPE, &cname)) cname = ""; len = strlen(cname) + 2; tname = umem_zalloc(len, UMEM_NOFAIL); (void) strlcpy(tname, cname, len); if (nvlist_lookup_uint64(cnv, ZPOOL_CONFIG_NPARITY, &np) == 0) tname[strlen(tname)] = '0' + np; show_vdev_stats(tname, ctype, cnv, indent + 2); umem_free(tname, len); } } void show_pool_stats(spa_t *spa) { nvlist_t *config, *nvroot; char *name; VERIFY(spa_get_stats(spa_name(spa), &config, NULL, 0) == 0); VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, &name) == 0); show_vdev_stats(name, ZPOOL_CONFIG_CHILDREN, nvroot, 0); show_vdev_stats(NULL, ZPOOL_CONFIG_L2CACHE, nvroot, 0); show_vdev_stats(NULL, ZPOOL_CONFIG_SPARES, nvroot, 0); nvlist_free(config); } +/* *k_out must be freed by the caller */ +static int +set_global_var_parse_kv(const char *arg, char **k_out, u_longlong_t *v_out) +{ + int err; + VERIFY(arg); + char *d = strdup(arg); + + char *save = NULL; + char *k = strtok_r(d, "=", &save); + char *v_str = strtok_r(NULL, "=", &save); + char *follow = strtok_r(NULL, "=", &save); + if (k == NULL || v_str == NULL || follow != NULL) { + err = EINVAL; + goto err_free; + } + + u_longlong_t val = strtoull(v_str, NULL, 0); + if (val > UINT32_MAX) { + fprintf(stderr, "Value for global variable '%s' must " + "be a 32-bit unsigned integer, got '%s'\n", k, v_str); + err = EOVERFLOW; + goto err_free; + } + + *k_out = k; + *v_out = val; + return (0); + +err_free: + free(k); + + return (err); +} + /* * Sets given global variable in libzpool to given unsigned 32-bit value. * arg: "=" */ int -set_global_var(char *arg) +set_global_var(char const *arg) { void *zpoolhdl; - char *varname = arg, *varval; + char *varname; u_longlong_t val; + int ret; -#ifndef _LITTLE_ENDIAN +#ifndef _ZFS_LITTLE_ENDIAN /* * On big endian systems changing a 64-bit variable would set the high * 32 bits instead of the low 32 bits, which could cause unexpected * results. 
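 * For illustration only (assuming a big-endian 64-bit layout): if such a
 * tunable currently holds 0 and a uint32_t store of 1 lands on its first,
 * most significant, four bytes, the variable would afterwards read back
 * as 1ULL << 32 rather than 1.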
*/ fprintf(stderr, "Setting global variables is only supported on " "little-endian systems\n"); - return (ENOTSUP); + ret = ENOTSUP; + goto out_ret; #endif - if (arg != NULL && (varval = strchr(arg, '=')) != NULL) { - *varval = '\0'; - varval++; - val = strtoull(varval, NULL, 0); - if (val > UINT32_MAX) { - fprintf(stderr, "Value for global variable '%s' must " - "be a 32-bit unsigned integer\n", varname); - return (EOVERFLOW); - } - } else { - return (EINVAL); + + if ((ret = set_global_var_parse_kv(arg, &varname, &val)) != 0) { + goto out_ret; } zpoolhdl = dlopen("libzpool.so", RTLD_LAZY); if (zpoolhdl != NULL) { uint32_t *var; var = dlsym(zpoolhdl, varname); if (var == NULL) { fprintf(stderr, "Global variable '%s' does not exist " "in libzpool.so\n", varname); - return (EINVAL); + ret = EINVAL; + goto out_dlclose; } *var = (uint32_t)val; - dlclose(zpoolhdl); } else { fprintf(stderr, "Failed to open libzpool.so to set global " "variable\n"); - return (EIO); + ret = EIO; + goto out_dlclose; } - return (0); + ret = 0; + +out_dlclose: + dlclose(zpoolhdl); + free(varname); +out_ret: + return (ret); } static nvlist_t * refresh_config(void *unused, nvlist_t *tryconfig) { return (spa_tryimport(tryconfig)); } static int pool_active(void *unused, const char *name, uint64_t guid, boolean_t *isactive) { zfs_cmd_t *zcp; nvlist_t *innvl; char *packed = NULL; size_t size = 0; int fd, ret; /* * Use ZFS_IOC_POOL_SYNC to confirm if a pool is active */ fd = open(ZFS_DEV, O_RDWR); if (fd < 0) return (-1); zcp = umem_zalloc(sizeof (zfs_cmd_t), UMEM_NOFAIL); innvl = fnvlist_alloc(); fnvlist_add_boolean_value(innvl, "force", B_FALSE); (void) strlcpy(zcp->zc_name, name, sizeof (zcp->zc_name)); packed = fnvlist_pack(innvl, &size); zcp->zc_nvlist_src = (uint64_t)(uintptr_t)packed; zcp->zc_nvlist_src_size = size; ret = zfs_ioctl_fd(fd, ZFS_IOC_POOL_SYNC, zcp); fnvlist_pack_free(packed, size); free((void *)(uintptr_t)zcp->zc_nvlist_dst); nvlist_free(innvl); umem_free(zcp, sizeof (zfs_cmd_t)); (void) close(fd); *isactive = (ret == 0); return (0); } const pool_config_ops_t libzpool_config_ops = { .pco_refresh_config = refresh_config, .pco_pool_active = pool_active, }; diff --git a/sys/contrib/openzfs/lib/libzutil/os/linux/zutil_device_path_os.c b/sys/contrib/openzfs/lib/libzutil/os/linux/zutil_device_path_os.c index 36331fd72bf7..1f767bb7a6e7 100644 --- a/sys/contrib/openzfs/lib/libzutil/os/linux/zutil_device_path_os.c +++ b/sys/contrib/openzfs/lib/libzutil/os/linux/zutil_device_path_os.c @@ -1,506 +1,538 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
*/ #include #include #include #include #include #include #include #ifdef HAVE_LIBUDEV #include #endif #include /* * Append partition suffix to an otherwise fully qualified device path. * This is used to generate the name the full path as its stored in * ZPOOL_CONFIG_PATH for whole disk devices. On success the new length * of 'path' will be returned on error a negative value is returned. */ int zfs_append_partition(char *path, size_t max_len) { int len = strlen(path); if ((strncmp(path, UDISK_ROOT, strlen(UDISK_ROOT)) == 0) || (strncmp(path, ZVOL_ROOT, strlen(ZVOL_ROOT)) == 0)) { if (len + 6 >= max_len) return (-1); (void) strcat(path, "-part1"); len += 6; } else { if (len + 2 >= max_len) return (-1); if (isdigit(path[len-1])) { (void) strcat(path, "p1"); len += 2; } else { (void) strcat(path, "1"); len += 1; } } return (len); } /* * Remove partition suffix from a vdev path. Partition suffixes may take three * forms: "-partX", "pX", or "X", where X is a string of digits. The second * case only occurs when the suffix is preceded by a digit, i.e. "md0p0" The * third case only occurs when preceded by a string matching the regular * expression "^([hsv]|xv)d[a-z]+", i.e. a scsi, ide, virtio or xen disk. * * caller must free the returned string */ char * zfs_strip_partition(char *path) { char *tmp = strdup(path); char *part = NULL, *d = NULL; if (!tmp) return (NULL); if ((part = strstr(tmp, "-part")) && part != tmp) { d = part + 5; } else if ((part = strrchr(tmp, 'p')) && part > tmp + 1 && isdigit(*(part-1))) { d = part + 1; } else if ((tmp[0] == 'h' || tmp[0] == 's' || tmp[0] == 'v') && tmp[1] == 'd') { for (d = &tmp[2]; isalpha(*d); part = ++d) { } } else if (strncmp("xvd", tmp, 3) == 0) { for (d = &tmp[3]; isalpha(*d); part = ++d) { } } if (part && d && *d != '\0') { for (; isdigit(*d); d++) { } if (*d == '\0') *part = '\0'; } return (tmp); } /* * Same as zfs_strip_partition, but allows "/dev/" to be in the pathname * * path: /dev/sda1 * returns: /dev/sda * * Returned string must be freed. */ static char * zfs_strip_partition_path(char *path) { char *newpath = strdup(path); char *sd_offset; char *new_sd; if (!newpath) return (NULL); /* Point to "sda1" part of "/dev/sda1" */ sd_offset = strrchr(newpath, '/') + 1; /* Get our new name "sda" */ new_sd = zfs_strip_partition(sd_offset); if (!new_sd) { free(newpath); return (NULL); } /* Paste the "sda" where "sda1" was */ strlcpy(sd_offset, new_sd, strlen(sd_offset) + 1); /* Free temporary "sda" */ free(new_sd); return (newpath); } /* * Strip the unwanted portion of a device path. */ char * zfs_strip_path(char *path) { return (strrchr(path, '/') + 1); } +/* + * Given a dev name like "sda", return the full enclosure sysfs path to + * the disk. You can also pass in the name with "/dev" prepended + * to it (like /dev/sda). + * + * For example, disk "sda" in enclosure slot 1: + * dev: "sda" + * returns: "/sys/class/enclosure/1:0:3:0/Slot 1" + * + * 'dev' must be a non-devicemapper device. + * + * Returned string must be freed. 
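+ * A minimal caller sketch (illustrative only; the device name and the
+ * printf are assumptions, not part of the interface):
+ *
+ *	char *encl = zfs_get_enclosure_sysfs_path("sda");
+ *	if (encl != NULL) {
+ *		(void) printf("enclosure slot path: %s\n", encl);
+ *		free(encl);
+ *	}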
+ */ +char * +zfs_get_enclosure_sysfs_path(const char *dev_name) +{ + DIR *dp = NULL; + struct dirent *ep; + char buf[MAXPATHLEN]; + char *tmp1 = NULL; + char *tmp2 = NULL; + char *tmp3 = NULL; + char *path = NULL; + size_t size; + int tmpsize; + + if (dev_name == NULL) + return (NULL); + + /* If they preface 'dev' with a path (like "/dev") then strip it off */ + tmp1 = strrchr(dev_name, '/'); + if (tmp1 != NULL) + dev_name = tmp1 + 1; /* +1 since we want the chr after '/' */ + + tmpsize = asprintf(&tmp1, "/sys/block/%s/device", dev_name); + if (tmpsize == -1 || tmp1 == NULL) { + tmp1 = NULL; + goto end; + } + + dp = opendir(tmp1); + if (dp == NULL) { + tmp1 = NULL; /* To make free() at the end a NOP */ + goto end; + } + + /* + * Look through all sysfs entries in /sys/block//device for + * the enclosure symlink. + */ + while ((ep = readdir(dp))) { + /* Ignore everything that's not our enclosure_device link */ + if (strstr(ep->d_name, "enclosure_device") == NULL) + continue; + + if (asprintf(&tmp2, "%s/%s", tmp1, ep->d_name) == -1 || + tmp2 == NULL) + break; + + size = readlink(tmp2, buf, sizeof (buf)); + + /* Did readlink fail or crop the link name? */ + if (size == -1 || size >= sizeof (buf)) { + free(tmp2); + tmp2 = NULL; /* To make free() at the end a NOP */ + break; + } + + /* + * We got a valid link. readlink() doesn't terminate strings + * so we have to do it. + */ + buf[size] = '\0'; + + /* + * Our link will look like: + * + * "../../../../port-11:1:2/..STUFF../enclosure/1:0:3:0/SLOT 1" + * + * We want to grab the "enclosure/1:0:3:0/SLOT 1" part + */ + tmp3 = strstr(buf, "enclosure"); + if (tmp3 == NULL) + break; + + if (asprintf(&path, "/sys/class/%s", tmp3) == -1) { + /* If asprintf() fails, 'path' is undefined */ + path = NULL; + break; + } + + if (path == NULL) + break; + } + +end: + free(tmp2); + free(tmp1); + + if (dp != NULL) + closedir(dp); + + return (path); +} + /* * Allocate and return the underlying device name for a device mapper device. - * If a device mapper device maps to multiple devices, return the first device. * * For example, dm_name = "/dev/dm-0" could return "/dev/sda". Symlinks to a * DM device (like /dev/disk/by-vdev/A0) are also allowed. * - * Returns device name, or NULL on error or no match. If dm_name is not a DM - * device then return NULL. + * If the DM device has multiple underlying devices (like with multipath + * DM devices), then favor underlying devices that have a symlink back to + * their enclosure device in sysfs. This will be useful for the + * zedlet scripts that toggle the fault LED. + * + * Returns an underlying device name, or NULL on error or no match. If dm_name + * is not a DM device then return NULL. * * NOTE: The returned name string must be *freed*. */ static char * dm_get_underlying_path(const char *dm_name) { DIR *dp = NULL; struct dirent *ep; char *realp; char *tmp = NULL; char *path = NULL; char *dev_str; int size; + char *first_path = NULL; + char *enclosure_path; if (dm_name == NULL) return (NULL); /* dm name may be a symlink (like /dev/disk/by-vdev/A0) */ realp = realpath(dm_name, NULL); if (realp == NULL) return (NULL); /* * If they preface 'dev' with a path (like "/dev") then strip it off. * We just want the 'dm-N' part.
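 * For example (hypothetical paths): realpath("/dev/disk/by-vdev/A0") might
 * resolve to "/dev/dm-0", in which case dev_str points at "dm-0" and the
 * slaves directory consulted below would be "/sys/block/dm-0/slaves/".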
 */ tmp = strrchr(realp, '/'); if (tmp != NULL) dev_str = tmp + 1; /* +1 since we want the chr after '/' */ else dev_str = tmp; size = asprintf(&tmp, "/sys/block/%s/slaves/", dev_str); if (size == -1 || !tmp) goto end; dp = opendir(tmp); if (dp == NULL) goto end; /* - * Return first entry (that isn't itself a directory) in the - * directory containing device-mapper dependent (underlying) - * devices. + * A device-mapper device can have multiple paths to it (multipath). + * Favor paths that have a symlink back to their enclosure device. + * We have to do this since some enclosures may only provide a symlink + * back for one underlying path to a disk and not the other. + * + * If no paths have links back to their enclosure, then just return the + * first path. */ while ((ep = readdir(dp))) { if (ep->d_type != DT_DIR) { /* skip "." and ".." dirs */ + if (!first_path) + first_path = strdup(ep->d_name); + + enclosure_path = + zfs_get_enclosure_sysfs_path(ep->d_name); + + if (!enclosure_path) + continue; + size = asprintf(&path, "/dev/%s", ep->d_name); + free(enclosure_path); break; } } end: if (dp != NULL) closedir(dp); free(tmp); free(realp); + + if (!path) { + /* + * None of the underlying paths had a link back to their + * enclosure devices. Throw up our hands and return the first + * underlying path. + */ + size = asprintf(&path, "/dev/%s", first_path); + } + + free(first_path); return (path); } /* * Return B_TRUE if device is a device mapper or multipath device. * Return B_FALSE if not. */ boolean_t zfs_dev_is_dm(const char *dev_name) { char *tmp; tmp = dm_get_underlying_path(dev_name); if (tmp == NULL) return (B_FALSE); free(tmp); return (B_TRUE); } /* * By "whole disk" we mean an entire physical disk (something we can * label, toggle the write cache on, etc.) as opposed to the full * capacity of a pseudo-device such as lofi or did. We act as if we * are labeling the disk, which should be a pretty good test of whether * it's a viable device or not. Returns B_TRUE if it is and B_FALSE if * it isn't. */ boolean_t zfs_dev_is_whole_disk(const char *dev_name) { struct dk_gpt *label; int fd; if ((fd = open(dev_name, O_RDONLY | O_DIRECT)) < 0) return (B_FALSE); if (efi_alloc_and_init(fd, EFI_NUMPAR, &label) != 0) { (void) close(fd); return (B_FALSE); } efi_free(label); (void) close(fd); return (B_TRUE); } /* * Lookup the underlying device for a device name * * Often you'll have a symlink to a device, a partition device, * or a multipath device, and want to look up the underlying device. * This function returns the underlying device name. If the device * name is already the underlying device, then just return the same * name. If the device is a DM device with multiple underlying devices * then return the first one. * * For example: * * 1. /dev/disk/by-id/ata-QEMU_HARDDISK_QM00001 -> ../../sda * dev_name: /dev/disk/by-id/ata-QEMU_HARDDISK_QM00001 * returns: /dev/sda * * 2. /dev/mapper/mpatha (made up of /dev/sda and /dev/sdb) * dev_name: /dev/mapper/mpatha * returns: /dev/sda (first device) * * 3. /dev/sda (already the underlying device) * dev_name: /dev/sda * returns: /dev/sda * * 4. /dev/dm-3 (mapped to /dev/sda) * dev_name: /dev/dm-3 * returns: /dev/sda * * 5. /dev/disk/by-id/scsi-0QEMU_drive-scsi0-0-0-0-part9 -> ../../sdb9 * dev_name: /dev/disk/by-id/scsi-0QEMU_drive-scsi0-0-0-0-part9 * returns: /dev/sdb * * 6. 
/dev/disk/by-uuid/5df030cf-3cd9-46e4-8e99-3ccb462a4e9a -> ../dev/sda2 * dev_name: /dev/disk/by-uuid/5df030cf-3cd9-46e4-8e99-3ccb462a4e9a * returns: /dev/sda * * Returns underlying device name, or NULL on error or no match. * * NOTE: The returned name string must be *freed*. */ char * zfs_get_underlying_path(const char *dev_name) { char *name = NULL; char *tmp; if (dev_name == NULL) return (NULL); tmp = dm_get_underlying_path(dev_name); /* dev_name not a DM device, so just un-symlinkize it */ if (tmp == NULL) tmp = realpath(dev_name, NULL); if (tmp != NULL) { name = zfs_strip_partition_path(tmp); free(tmp); } return (name); } -/* - * Given a dev name like "sda", return the full enclosure sysfs path to - * the disk. You can also pass in the name with "/dev" prepended - * to it (like /dev/sda). - * - * For example, disk "sda" in enclosure slot 1: - * dev: "sda" - * returns: "/sys/class/enclosure/1:0:3:0/Slot 1" - * - * 'dev' must be a non-devicemapper device. - * - * Returned string must be freed. - */ -char * -zfs_get_enclosure_sysfs_path(const char *dev_name) -{ - DIR *dp = NULL; - struct dirent *ep; - char buf[MAXPATHLEN]; - char *tmp1 = NULL; - char *tmp2 = NULL; - char *tmp3 = NULL; - char *path = NULL; - size_t size; - int tmpsize; - - if (dev_name == NULL) - return (NULL); - - /* If they preface 'dev' with a path (like "/dev") then strip it off */ - tmp1 = strrchr(dev_name, '/'); - if (tmp1 != NULL) - dev_name = tmp1 + 1; /* +1 since we want the chr after '/' */ - - tmpsize = asprintf(&tmp1, "/sys/block/%s/device", dev_name); - if (tmpsize == -1 || tmp1 == NULL) { - tmp1 = NULL; - goto end; - } - - dp = opendir(tmp1); - if (dp == NULL) { - tmp1 = NULL; /* To make free() at the end a NOP */ - goto end; - } - - /* - * Look though all sysfs entries in /sys/block//device for - * the enclosure symlink. - */ - while ((ep = readdir(dp))) { - /* Ignore everything that's not our enclosure_device link */ - if (strstr(ep->d_name, "enclosure_device") == NULL) - continue; - - if (asprintf(&tmp2, "%s/%s", tmp1, ep->d_name) == -1 || - tmp2 == NULL) - break; - - size = readlink(tmp2, buf, sizeof (buf)); - - /* Did readlink fail or crop the link name? */ - if (size == -1 || size >= sizeof (buf)) { - free(tmp2); - tmp2 = NULL; /* To make free() at the end a NOP */ - break; - } - - /* - * We got a valid link. readlink() doesn't terminate strings - * so we have to do it. 
- */ - buf[size] = '\0'; - - /* - * Our link will look like: - * - * "../../../../port-11:1:2/..STUFF../enclosure/1:0:3:0/SLOT 1" - * - * We want to grab the "enclosure/1:0:3:0/SLOT 1" part - */ - tmp3 = strstr(buf, "enclosure"); - if (tmp3 == NULL) - break; - - if (asprintf(&path, "/sys/class/%s", tmp3) == -1) { - /* If asprintf() fails, 'path' is undefined */ - path = NULL; - break; - } - - if (path == NULL) - break; - } - -end: - free(tmp2); - free(tmp1); - - if (dp != NULL) - closedir(dp); - - return (path); -} #ifdef HAVE_LIBUDEV /* * A disk is considered a multipath whole disk when: * DEVNAME key value has "dm-" * DM_NAME key value has "mpath" prefix * DM_UUID key exists * ID_PART_TABLE_TYPE key does not exist or is not gpt */ static boolean_t udev_mpath_whole_disk(struct udev_device *dev) { const char *devname, *type, *uuid; devname = udev_device_get_property_value(dev, "DEVNAME"); type = udev_device_get_property_value(dev, "ID_PART_TABLE_TYPE"); uuid = udev_device_get_property_value(dev, "DM_UUID"); if ((devname != NULL && strncmp(devname, "/dev/dm-", 8) == 0) && ((type == NULL) || (strcmp(type, "gpt") != 0)) && (uuid != NULL)) { return (B_TRUE); } return (B_FALSE); } /* * Check if a disk is effectively a multipath whole disk */ boolean_t is_mpath_whole_disk(const char *path) { struct udev *udev; struct udev_device *dev = NULL; char nodepath[MAXPATHLEN]; char *sysname; boolean_t wholedisk = B_FALSE; if (realpath(path, nodepath) == NULL) return (B_FALSE); sysname = strrchr(nodepath, '/') + 1; if (strncmp(sysname, "dm-", 3) != 0) return (B_FALSE); if ((udev = udev_new()) == NULL) return (B_FALSE); if ((dev = udev_device_new_from_subsystem_sysname(udev, "block", sysname)) == NULL) { udev_device_unref(dev); return (B_FALSE); } wholedisk = udev_mpath_whole_disk(dev); udev_device_unref(dev); return (wholedisk); } #else /* HAVE_LIBUDEV */ /* ARGSUSED */ boolean_t is_mpath_whole_disk(const char *path) { return (B_FALSE); } #endif /* HAVE_LIBUDEV */ diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_uio.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_uio.c index f5f3524f7b9d..59a781ee1b64 100644 --- a/sys/contrib/openzfs/module/os/freebsd/spl/spl_uio.c +++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_uio.c @@ -1,100 +1,107 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ /* All Rights Reserved */ /* * University Copyright- Copyright (c) 1982, 1986, 1988 * The Regents of the University of California * All Rights Reserved * * University Acknowledgment- Portions of this document are derived from * software developed by the University of California, Berkeley, and its * contributors. 
*/ /* * $FreeBSD$ */ #include -#include +#include #include #include +int +zfs_uiomove(void *cp, size_t n, zfs_uio_rw_t dir, zfs_uio_t *uio) +{ + ASSERT(zfs_uio_rw(uio) == dir); + return (uiomove(cp, (int)n, GET_UIO_STRUCT(uio))); +} + /* * same as zfs_uiomove() but doesn't modify uio structure. * return in cbytes how many bytes were copied. */ int zfs_uiocopy(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio, size_t *cbytes) { struct iovec small_iovec[1]; struct uio small_uio_clone; struct uio *uio_clone; int error; ASSERT3U(zfs_uio_rw(uio), ==, rw); if (zfs_uio_iovcnt(uio) == 1) { small_uio_clone = *(GET_UIO_STRUCT(uio)); small_iovec[0] = *(GET_UIO_STRUCT(uio)->uio_iov); small_uio_clone.uio_iov = small_iovec; uio_clone = &small_uio_clone; } else { uio_clone = cloneuio(GET_UIO_STRUCT(uio)); } error = vn_io_fault_uiomove(p, n, uio_clone); *cbytes = zfs_uio_resid(uio) - uio_clone->uio_resid; if (uio_clone != &small_uio_clone) free(uio_clone, M_IOV); return (error); } /* * Drop the next n chars out of *uiop. */ void zfs_uioskip(zfs_uio_t *uio, size_t n) { zfs_uio_seg_t segflg; /* For the full compatibility with illumos. */ if (n > zfs_uio_resid(uio)) return; segflg = zfs_uio_segflg(uio); zfs_uio_segflg(uio) = UIO_NOCOPY; zfs_uiomove(NULL, n, zfs_uio_rw(uio), uio); zfs_uio_segflg(uio) = segflg; } int zfs_uio_fault_move(void *p, size_t n, zfs_uio_rw_t dir, zfs_uio_t *uio) { ASSERT(zfs_uio_rw(uio) == dir); return (vn_io_fault_uiomove(p, n, GET_UIO_STRUCT(uio))); } diff --git a/sys/contrib/openzfs/module/zfs/dsl_scan.c b/sys/contrib/openzfs/module/zfs/dsl_scan.c index 40adfbcee4e1..a54cd6ca800e 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_scan.c +++ b/sys/contrib/openzfs/module/zfs/dsl_scan.c @@ -1,4422 +1,4426 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright (c) 2011, 2021 by Delphix. All rights reserved. * Copyright 2016 Gary Mills * Copyright (c) 2017, 2019, Datto Inc. All rights reserved. * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved. * Copyright 2019 Joyent, Inc. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef _KERNEL #include #endif /* * Grand theory statement on scan queue sorting * * Scanning is implemented by recursively traversing all indirection levels * in an object and reading all blocks referenced from said objects. This * results in us approximately traversing the object from lowest logical * offset to the highest. 
For best performance, we would want the logical * blocks to be physically contiguous. However, this is frequently not the * case with pools given the allocation patterns of copy-on-write filesystems. * So instead, we put the I/Os into a reordering queue and issue them in a * way that will most benefit physical disks (LBA-order). * * Queue management: * * Ideally, we would want to scan all metadata and queue up all block I/O * prior to starting to issue it, because that allows us to do an optimal * sorting job. This can however consume large amounts of memory. Therefore * we continuously monitor the size of the queues and constrain them to 5% * (zfs_scan_mem_lim_fact) of physmem. If the queues grow larger than this * limit, we clear out a few of the largest extents at the head of the queues * to make room for more scanning. Hopefully, these extents will be fairly * large and contiguous, allowing us to approach sequential I/O throughput * even without a fully sorted tree. * * Metadata scanning takes place in dsl_scan_visit(), which is called from * dsl_scan_sync() every spa_sync(). If we have either fully scanned all * metadata on the pool, or we need to make room in memory because our * queues are too large, dsl_scan_visit() is postponed and * scan_io_queues_run() is called from dsl_scan_sync() instead. This implies * that metadata scanning and queued I/O issuing are mutually exclusive. This * allows us to provide maximum sequential I/O throughput for the majority of * I/O's issued since sequential I/O performance is significantly negatively * impacted if it is interleaved with random I/O. * * Implementation Notes * * One side effect of the queued scanning algorithm is that the scanning code * needs to be notified whenever a block is freed. This is needed to allow * the scanning code to remove these I/Os from the issuing queue. Additionally, * we do not attempt to queue gang blocks to be issued sequentially since this * is very hard to do and would have an extremely limited performance benefit. * Instead, we simply issue gang I/Os as soon as we find them using the legacy * algorithm. * * Backwards compatibility * * This new algorithm is backwards compatible with the legacy on-disk data * structures (and therefore does not require a new feature flag). * Periodically during scanning (see zfs_scan_checkpoint_intval), the scan * will stop scanning metadata (in logical order) and wait for all outstanding * sorted I/O to complete. Once this is done, we write out a checkpoint * bookmark, indicating that we have scanned everything logically before it. * If the pool is imported on a machine without the new sorting algorithm, * the scan simply resumes from the last checkpoint using the legacy algorithm. 
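 *
 * (A rough worked example of the 5% memory limit above: with the default
 * zfs_scan_mem_lim_fact of 20 defined below, a host with 64 GiB of physical
 * memory would allow on the order of 64 GiB / 20, roughly 3.2 GiB, of queued
 * scan bookkeeping before the largest extents are drained to make room.)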
*/ typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_phys_t *); static scan_cb_t dsl_scan_scrub_cb; static int scan_ds_queue_compare(const void *a, const void *b); static int scan_prefetch_queue_compare(const void *a, const void *b); static void scan_ds_queue_clear(dsl_scan_t *scn); static void scan_ds_prefetch_queue_clear(dsl_scan_t *scn); static boolean_t scan_ds_queue_contains(dsl_scan_t *scn, uint64_t dsobj, uint64_t *txg); static void scan_ds_queue_insert(dsl_scan_t *scn, uint64_t dsobj, uint64_t txg); static void scan_ds_queue_remove(dsl_scan_t *scn, uint64_t dsobj); static void scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx); static uint64_t dsl_scan_count_leaves(vdev_t *vd); extern int zfs_vdev_async_write_active_min_dirty_percent; /* * By default zfs will check to ensure it is not over the hard memory * limit before each txg. If finer-grained control of this is needed * this value can be set to 1 to enable checking before scanning each * block. */ int zfs_scan_strict_mem_lim = B_FALSE; /* * Maximum number of parallelly executed bytes per leaf vdev. We attempt * to strike a balance here between keeping the vdev queues full of I/Os * at all times and not overflowing the queues to cause long latency, * which would cause long txg sync times. No matter what, we will not * overload the drives with I/O, since that is protected by * zfs_vdev_scrub_max_active. */ unsigned long zfs_scan_vdev_limit = 4 << 20; int zfs_scan_issue_strategy = 0; int zfs_scan_legacy = B_FALSE; /* don't queue & sort zios, go direct */ unsigned long zfs_scan_max_ext_gap = 2 << 20; /* in bytes */ /* * fill_weight is non-tunable at runtime, so we copy it at module init from * zfs_scan_fill_weight. Runtime adjustments to zfs_scan_fill_weight would * break queue sorting. */ int zfs_scan_fill_weight = 3; static uint64_t fill_weight; /* See dsl_scan_should_clear() for details on the memory limit tunables */ uint64_t zfs_scan_mem_lim_min = 16 << 20; /* bytes */ uint64_t zfs_scan_mem_lim_soft_max = 128 << 20; /* bytes */ int zfs_scan_mem_lim_fact = 20; /* fraction of physmem */ int zfs_scan_mem_lim_soft_fact = 20; /* fraction of mem lim above */ int zfs_scrub_min_time_ms = 1000; /* min millisecs to scrub per txg */ int zfs_obsolete_min_time_ms = 500; /* min millisecs to obsolete per txg */ int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */ int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */ int zfs_scan_checkpoint_intval = 7200; /* in seconds */ int zfs_scan_suspend_progress = 0; /* set to prevent scans from progressing */ int zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */ int zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetch */ enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE; /* max number of blocks to free in a single TXG */ unsigned long zfs_async_block_max_blocks = ULONG_MAX; /* max number of dedup blocks to free in a single TXG */ unsigned long zfs_max_async_dedup_frees = 100000; int zfs_resilver_disable_defer = 0; /* set to disable resilver deferring */ /* * We wait a few txgs after importing a pool to begin scanning so that * the import / mounting code isn't held up by scrub / resilver IO. * Unfortunately, it is a bit difficult to determine exactly how long * this will take since userspace will trigger fs mounts asynchronously * and the kernel will create zvol minors asynchronously. 
As a result, * the value provided here is a bit arbitrary, but represents a * reasonable estimate of how many txgs it will take to finish fully * importing a pool */ #define SCAN_IMPORT_WAIT_TXGS 5 #define DSL_SCAN_IS_SCRUB_RESILVER(scn) \ ((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB || \ (scn)->scn_phys.scn_func == POOL_SCAN_RESILVER) /* * Enable/disable the processing of the free_bpobj object. */ int zfs_free_bpobj_enabled = 1; /* the order has to match pool_scan_type */ static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = { NULL, dsl_scan_scrub_cb, /* POOL_SCAN_SCRUB */ dsl_scan_scrub_cb, /* POOL_SCAN_RESILVER */ }; /* In core node for the scn->scn_queue. Represents a dataset to be scanned */ typedef struct { uint64_t sds_dsobj; uint64_t sds_txg; avl_node_t sds_node; } scan_ds_t; /* * This controls what conditions are placed on dsl_scan_sync_state(): * SYNC_OPTIONAL) write out scn_phys iff scn_bytes_pending == 0 * SYNC_MANDATORY) write out scn_phys always. scn_bytes_pending must be 0. * SYNC_CACHED) if scn_bytes_pending == 0, write out scn_phys. Otherwise * write out the scn_phys_cached version. * See dsl_scan_sync_state for details. */ typedef enum { SYNC_OPTIONAL, SYNC_MANDATORY, SYNC_CACHED } state_sync_type_t; /* * This struct represents the minimum information needed to reconstruct a * zio for sequential scanning. This is useful because many of these will * accumulate in the sequential IO queues before being issued, so saving * memory matters here. */ typedef struct scan_io { /* fields from blkptr_t */ uint64_t sio_blk_prop; uint64_t sio_phys_birth; uint64_t sio_birth; zio_cksum_t sio_cksum; uint32_t sio_nr_dvas; /* fields from zio_t */ uint32_t sio_flags; zbookmark_phys_t sio_zb; /* members for queue sorting */ union { avl_node_t sio_addr_node; /* link into issuing queue */ list_node_t sio_list_node; /* link for issuing to disk */ } sio_nodes; /* * There may be up to SPA_DVAS_PER_BP DVAs here from the bp, * depending on how many were in the original bp. Only the * first DVA is really used for sorting and issuing purposes. * The other DVAs (if provided) simply exist so that the zio * layer can find additional copies to repair from in the * event of an error. This array must go at the end of the * struct to allow this for the variable number of elements. 
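 * For example, a sio carrying a single DVA occupies
 * sizeof (scan_io_t) + 1 * sizeof (dva_t) bytes; the SIO_GET_MUSED()
 * macro below computes exactly this per-entry footprint.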
*/ dva_t sio_dva[0]; } scan_io_t; #define SIO_SET_OFFSET(sio, x) DVA_SET_OFFSET(&(sio)->sio_dva[0], x) #define SIO_SET_ASIZE(sio, x) DVA_SET_ASIZE(&(sio)->sio_dva[0], x) #define SIO_GET_OFFSET(sio) DVA_GET_OFFSET(&(sio)->sio_dva[0]) #define SIO_GET_ASIZE(sio) DVA_GET_ASIZE(&(sio)->sio_dva[0]) #define SIO_GET_END_OFFSET(sio) \ (SIO_GET_OFFSET(sio) + SIO_GET_ASIZE(sio)) #define SIO_GET_MUSED(sio) \ (sizeof (scan_io_t) + ((sio)->sio_nr_dvas * sizeof (dva_t))) struct dsl_scan_io_queue { dsl_scan_t *q_scn; /* associated dsl_scan_t */ vdev_t *q_vd; /* top-level vdev that this queue represents */ /* trees used for sorting I/Os and extents of I/Os */ range_tree_t *q_exts_by_addr; zfs_btree_t q_exts_by_size; avl_tree_t q_sios_by_addr; uint64_t q_sio_memused; /* members for zio rate limiting */ uint64_t q_maxinflight_bytes; uint64_t q_inflight_bytes; kcondvar_t q_zio_cv; /* used under vd->vdev_scan_io_queue_lock */ /* per txg statistics */ uint64_t q_total_seg_size_this_txg; uint64_t q_segs_this_txg; uint64_t q_total_zio_size_this_txg; uint64_t q_zios_this_txg; }; /* private data for dsl_scan_prefetch_cb() */ typedef struct scan_prefetch_ctx { zfs_refcount_t spc_refcnt; /* refcount for memory management */ dsl_scan_t *spc_scn; /* dsl_scan_t for the pool */ boolean_t spc_root; /* is this prefetch for an objset? */ uint8_t spc_indblkshift; /* dn_indblkshift of current dnode */ uint16_t spc_datablkszsec; /* dn_idatablkszsec of current dnode */ } scan_prefetch_ctx_t; /* private data for dsl_scan_prefetch() */ typedef struct scan_prefetch_issue_ctx { avl_node_t spic_avl_node; /* link into scn->scn_prefetch_queue */ scan_prefetch_ctx_t *spic_spc; /* spc for the callback */ blkptr_t spic_bp; /* bp to prefetch */ zbookmark_phys_t spic_zb; /* bookmark to prefetch */ } scan_prefetch_issue_ctx_t; static void scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags, const zbookmark_phys_t *zb, dsl_scan_io_queue_t *queue); static void scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue, scan_io_t *sio); static dsl_scan_io_queue_t *scan_io_queue_create(vdev_t *vd); static void scan_io_queues_destroy(dsl_scan_t *scn); static kmem_cache_t *sio_cache[SPA_DVAS_PER_BP]; /* sio->sio_nr_dvas must be set so we know which cache to free from */ static void sio_free(scan_io_t *sio) { ASSERT3U(sio->sio_nr_dvas, >, 0); ASSERT3U(sio->sio_nr_dvas, <=, SPA_DVAS_PER_BP); kmem_cache_free(sio_cache[sio->sio_nr_dvas - 1], sio); } /* It is up to the caller to set sio->sio_nr_dvas for freeing */ static scan_io_t * sio_alloc(unsigned short nr_dvas) { ASSERT3U(nr_dvas, >, 0); ASSERT3U(nr_dvas, <=, SPA_DVAS_PER_BP); return (kmem_cache_alloc(sio_cache[nr_dvas - 1], KM_SLEEP)); } void scan_init(void) { /* * This is used in ext_size_compare() to weight segments * based on how sparse they are. This cannot be changed * mid-scan and the tree comparison functions don't currently * have a mechanism for passing additional context to the * compare functions. 
Thus we store this value globally and * we only allow it to be set at module initialization time */ fill_weight = zfs_scan_fill_weight; for (int i = 0; i < SPA_DVAS_PER_BP; i++) { char name[36]; (void) snprintf(name, sizeof (name), "sio_cache_%d", i); sio_cache[i] = kmem_cache_create(name, (sizeof (scan_io_t) + ((i + 1) * sizeof (dva_t))), 0, NULL, NULL, NULL, NULL, NULL, 0); } } void scan_fini(void) { for (int i = 0; i < SPA_DVAS_PER_BP; i++) { kmem_cache_destroy(sio_cache[i]); } } static inline boolean_t dsl_scan_is_running(const dsl_scan_t *scn) { return (scn->scn_phys.scn_state == DSS_SCANNING); } boolean_t dsl_scan_resilvering(dsl_pool_t *dp) { return (dsl_scan_is_running(dp->dp_scan) && dp->dp_scan->scn_phys.scn_func == POOL_SCAN_RESILVER); } static inline void sio2bp(const scan_io_t *sio, blkptr_t *bp) { bzero(bp, sizeof (*bp)); bp->blk_prop = sio->sio_blk_prop; bp->blk_phys_birth = sio->sio_phys_birth; bp->blk_birth = sio->sio_birth; bp->blk_fill = 1; /* we always only work with data pointers */ bp->blk_cksum = sio->sio_cksum; ASSERT3U(sio->sio_nr_dvas, >, 0); ASSERT3U(sio->sio_nr_dvas, <=, SPA_DVAS_PER_BP); bcopy(sio->sio_dva, bp->blk_dva, sio->sio_nr_dvas * sizeof (dva_t)); } static inline void bp2sio(const blkptr_t *bp, scan_io_t *sio, int dva_i) { sio->sio_blk_prop = bp->blk_prop; sio->sio_phys_birth = bp->blk_phys_birth; sio->sio_birth = bp->blk_birth; sio->sio_cksum = bp->blk_cksum; sio->sio_nr_dvas = BP_GET_NDVAS(bp); /* * Copy the DVAs to the sio. We need all copies of the block so * that the self healing code can use the alternate copies if the * first is corrupted. We want the DVA at index dva_i to be first * in the sio since this is the primary one that we want to issue. */ for (int i = 0, j = dva_i; i < sio->sio_nr_dvas; i++, j++) { sio->sio_dva[i] = bp->blk_dva[j % sio->sio_nr_dvas]; } } int dsl_scan_init(dsl_pool_t *dp, uint64_t txg) { int err; dsl_scan_t *scn; spa_t *spa = dp->dp_spa; uint64_t f; scn = dp->dp_scan = kmem_zalloc(sizeof (dsl_scan_t), KM_SLEEP); scn->scn_dp = dp; /* * It's possible that we're resuming a scan after a reboot so * make sure that the scan_async_destroying flag is initialized * appropriately. */ ASSERT(!scn->scn_async_destroying); scn->scn_async_destroying = spa_feature_is_active(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY); /* * Calculate the max number of in-flight bytes for pool-wide * scanning operations (minimum 1MB). Limits for the issuing * phase are done per top-level vdev and are handled separately. */ scn->scn_maxinflight_bytes = MAX(zfs_scan_vdev_limit * dsl_scan_count_leaves(spa->spa_root_vdev), 1ULL << 20); avl_create(&scn->scn_queue, scan_ds_queue_compare, sizeof (scan_ds_t), offsetof(scan_ds_t, sds_node)); avl_create(&scn->scn_prefetch_queue, scan_prefetch_queue_compare, sizeof (scan_prefetch_issue_ctx_t), offsetof(scan_prefetch_issue_ctx_t, spic_avl_node)); err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, "scrub_func", sizeof (uint64_t), 1, &f); if (err == 0) { /* * There was an old-style scrub in progress. Restart a * new-style scrub from the beginning. */ scn->scn_restart_txg = txg; zfs_dbgmsg("old-style scrub was in progress; " "restarting new-style scrub in txg %llu", (longlong_t)scn->scn_restart_txg); /* * Load the queue obj from the old location so that it * can be freed by dsl_scan_done(). 
*/ (void) zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, "scrub_queue", sizeof (uint64_t), 1, &scn->scn_phys.scn_queue_obj); } else { err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS, &scn->scn_phys); /* * Detect if the pool contains the signature of #2094. If it * does properly update the scn->scn_phys structure and notify * the administrator by setting an errata for the pool. */ if (err == EOVERFLOW) { uint64_t zaptmp[SCAN_PHYS_NUMINTS + 1]; VERIFY3S(SCAN_PHYS_NUMINTS, ==, 24); VERIFY3S(offsetof(dsl_scan_phys_t, scn_flags), ==, (23 * sizeof (uint64_t))); err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS + 1, &zaptmp); if (err == 0) { uint64_t overflow = zaptmp[SCAN_PHYS_NUMINTS]; if (overflow & ~DSL_SCAN_FLAGS_MASK || scn->scn_async_destroying) { spa->spa_errata = ZPOOL_ERRATA_ZOL_2094_ASYNC_DESTROY; return (EOVERFLOW); } bcopy(zaptmp, &scn->scn_phys, SCAN_PHYS_NUMINTS * sizeof (uint64_t)); scn->scn_phys.scn_flags = overflow; /* Required scrub already in progress. */ if (scn->scn_phys.scn_state == DSS_FINISHED || scn->scn_phys.scn_state == DSS_CANCELED) spa->spa_errata = ZPOOL_ERRATA_ZOL_2094_SCRUB; } } if (err == ENOENT) return (0); else if (err) return (err); /* * We might be restarting after a reboot, so jump the issued * counter to how far we've scanned. We know we're consistent * up to here. */ scn->scn_issued_before_pass = scn->scn_phys.scn_examined; if (dsl_scan_is_running(scn) && spa_prev_software_version(dp->dp_spa) < SPA_VERSION_SCAN) { /* * A new-type scrub was in progress on an old * pool, and the pool was accessed by old * software. Restart from the beginning, since * the old software may have changed the pool in * the meantime. */ scn->scn_restart_txg = txg; zfs_dbgmsg("new-style scrub was modified " "by old software; restarting in txg %llu", (longlong_t)scn->scn_restart_txg); } else if (dsl_scan_resilvering(dp)) { /* * If a resilver is in progress and there are already * errors, restart it instead of finishing this scan and * then restarting it. If there haven't been any errors * then remember that the incore DTL is valid. 
*/ if (scn->scn_phys.scn_errors > 0) { scn->scn_restart_txg = txg; zfs_dbgmsg("resilver can't excise DTL_MISSING " "when finished; restarting in txg %llu", (u_longlong_t)scn->scn_restart_txg); } else { /* it's safe to excise DTL when finished */ spa->spa_scrub_started = B_TRUE; } } } bcopy(&scn->scn_phys, &scn->scn_phys_cached, sizeof (scn->scn_phys)); /* reload the queue into the in-core state */ if (scn->scn_phys.scn_queue_obj != 0) { zap_cursor_t zc; zap_attribute_t za; for (zap_cursor_init(&zc, dp->dp_meta_objset, scn->scn_phys.scn_queue_obj); zap_cursor_retrieve(&zc, &za) == 0; (void) zap_cursor_advance(&zc)) { scan_ds_queue_insert(scn, zfs_strtonum(za.za_name, NULL), za.za_first_integer); } zap_cursor_fini(&zc); } spa_scan_stat_init(spa); return (0); } void dsl_scan_fini(dsl_pool_t *dp) { if (dp->dp_scan != NULL) { dsl_scan_t *scn = dp->dp_scan; if (scn->scn_taskq != NULL) taskq_destroy(scn->scn_taskq); scan_ds_queue_clear(scn); avl_destroy(&scn->scn_queue); scan_ds_prefetch_queue_clear(scn); avl_destroy(&scn->scn_prefetch_queue); kmem_free(dp->dp_scan, sizeof (dsl_scan_t)); dp->dp_scan = NULL; } } static boolean_t dsl_scan_restarting(dsl_scan_t *scn, dmu_tx_t *tx) { return (scn->scn_restart_txg != 0 && scn->scn_restart_txg <= tx->tx_txg); } boolean_t dsl_scan_resilver_scheduled(dsl_pool_t *dp) { return ((dp->dp_scan && dp->dp_scan->scn_restart_txg != 0) || (spa_async_tasks(dp->dp_spa) & SPA_ASYNC_RESILVER)); } boolean_t dsl_scan_scrubbing(const dsl_pool_t *dp) { dsl_scan_phys_t *scn_phys = &dp->dp_scan->scn_phys; return (scn_phys->scn_state == DSS_SCANNING && scn_phys->scn_func == POOL_SCAN_SCRUB); } boolean_t dsl_scan_is_paused_scrub(const dsl_scan_t *scn) { return (dsl_scan_scrubbing(scn->scn_dp) && scn->scn_phys.scn_flags & DSF_SCRUB_PAUSED); } /* * Writes out a persistent dsl_scan_phys_t record to the pool directory. * Because we can be running in the block sorting algorithm, we do not always * want to write out the record, only when it is "safe" to do so. This safety * condition is achieved by making sure that the sorting queues are empty * (scn_bytes_pending == 0). When this condition is not true, the sync'd state * is inconsistent with how much actual scanning progress has been made. The * kind of sync to be performed is specified by the sync_type argument. If the * sync is optional, we only sync if the queues are empty. If the sync is * mandatory, we do a hard ASSERT to make sure that the queues are empty. The * third possible state is a "cached" sync. This is done in response to: * 1) The dataset that was in the last sync'd dsl_scan_phys_t having been * destroyed, so we wouldn't be able to restart scanning from it. * 2) The snapshot that was in the last sync'd dsl_scan_phys_t having been * superseded by a newer snapshot. * 3) The dataset that was in the last sync'd dsl_scan_phys_t having been * swapped with its clone. * In all cases, a cached sync simply rewrites the last record we've written, * just slightly modified. For the modifications that are performed to the * last written dsl_scan_phys_t, see dsl_scan_ds_destroyed, * dsl_scan_ds_snapshotted and dsl_scan_ds_clone_swapped. 
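 *
 * As an illustrative summary of the function below: when
 * scn_bytes_pending is zero, every sync type writes the live scn_phys
 * record; otherwise SYNC_OPTIONAL writes nothing, SYNC_MANDATORY is a
 * caller bug (the queues must already be empty), and SYNC_CACHED
 * rewrites the previously written scn_phys_cached copy instead.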
*/ static void dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx, state_sync_type_t sync_type) { int i; spa_t *spa = scn->scn_dp->dp_spa; ASSERT(sync_type != SYNC_MANDATORY || scn->scn_bytes_pending == 0); if (scn->scn_bytes_pending == 0) { for (i = 0; i < spa->spa_root_vdev->vdev_children; i++) { vdev_t *vd = spa->spa_root_vdev->vdev_child[i]; dsl_scan_io_queue_t *q = vd->vdev_scan_io_queue; if (q == NULL) continue; mutex_enter(&vd->vdev_scan_io_queue_lock); ASSERT3P(avl_first(&q->q_sios_by_addr), ==, NULL); ASSERT3P(zfs_btree_first(&q->q_exts_by_size, NULL), ==, NULL); ASSERT3P(range_tree_first(q->q_exts_by_addr), ==, NULL); mutex_exit(&vd->vdev_scan_io_queue_lock); } if (scn->scn_phys.scn_queue_obj != 0) scan_ds_queue_sync(scn, tx); VERIFY0(zap_update(scn->scn_dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS, &scn->scn_phys, tx)); bcopy(&scn->scn_phys, &scn->scn_phys_cached, sizeof (scn->scn_phys)); if (scn->scn_checkpointing) zfs_dbgmsg("finish scan checkpoint"); scn->scn_checkpointing = B_FALSE; scn->scn_last_checkpoint = ddi_get_lbolt(); } else if (sync_type == SYNC_CACHED) { VERIFY0(zap_update(scn->scn_dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS, &scn->scn_phys_cached, tx)); } } /* ARGSUSED */ static int dsl_scan_setup_check(void *arg, dmu_tx_t *tx) { dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev; if (dsl_scan_is_running(scn) || vdev_rebuild_active(rvd)) return (SET_ERROR(EBUSY)); return (0); } void dsl_scan_setup_sync(void *arg, dmu_tx_t *tx) { dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; pool_scan_func_t *funcp = arg; dmu_object_type_t ot = 0; dsl_pool_t *dp = scn->scn_dp; spa_t *spa = dp->dp_spa; ASSERT(!dsl_scan_is_running(scn)); ASSERT(*funcp > POOL_SCAN_NONE && *funcp < POOL_SCAN_FUNCS); bzero(&scn->scn_phys, sizeof (scn->scn_phys)); scn->scn_phys.scn_func = *funcp; scn->scn_phys.scn_state = DSS_SCANNING; scn->scn_phys.scn_min_txg = 0; scn->scn_phys.scn_max_txg = tx->tx_txg; scn->scn_phys.scn_ddt_class_max = DDT_CLASSES - 1; /* the entire DDT */ scn->scn_phys.scn_start_time = gethrestime_sec(); scn->scn_phys.scn_errors = 0; scn->scn_phys.scn_to_examine = spa->spa_root_vdev->vdev_stat.vs_alloc; scn->scn_issued_before_pass = 0; scn->scn_restart_txg = 0; scn->scn_done_txg = 0; scn->scn_last_checkpoint = 0; scn->scn_checkpointing = B_FALSE; spa_scan_stat_init(spa); if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { scn->scn_phys.scn_ddt_class_max = zfs_scrub_ddt_class_max; /* rewrite all disk labels */ vdev_config_dirty(spa->spa_root_vdev); if (vdev_resilver_needed(spa->spa_root_vdev, &scn->scn_phys.scn_min_txg, &scn->scn_phys.scn_max_txg)) { nvlist_t *aux = fnvlist_alloc(); fnvlist_add_string(aux, ZFS_EV_RESILVER_TYPE, "healing"); spa_event_notify(spa, NULL, aux, ESC_ZFS_RESILVER_START); nvlist_free(aux); } else { spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_START); } spa->spa_scrub_started = B_TRUE; /* * If this is an incremental scrub, limit the DDT scrub phase * to just the auto-ditto class (for correctness); the rest * of the scrub should go faster using top-down pruning. */ if (scn->scn_phys.scn_min_txg > TXG_INITIAL) scn->scn_phys.scn_ddt_class_max = DDT_CLASS_DITTO; /* * When starting a resilver clear any existing rebuild state. * This is required to prevent stale rebuild status from * being reported when a rebuild is run, then a resilver and * finally a scrub. 
In which case only the scrub status * should be reported by 'zpool status'. */ if (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) { vdev_t *rvd = spa->spa_root_vdev; for (uint64_t i = 0; i < rvd->vdev_children; i++) { vdev_t *vd = rvd->vdev_child[i]; vdev_rebuild_clear_sync( (void *)(uintptr_t)vd->vdev_id, tx); } } } /* back to the generic stuff */ if (dp->dp_blkstats == NULL) { dp->dp_blkstats = vmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP); mutex_init(&dp->dp_blkstats->zab_lock, NULL, MUTEX_DEFAULT, NULL); } bzero(&dp->dp_blkstats->zab_type, sizeof (dp->dp_blkstats->zab_type)); if (spa_version(spa) < SPA_VERSION_DSL_SCRUB) ot = DMU_OT_ZAP_OTHER; scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset, ot ? ot : DMU_OT_SCAN_QUEUE, DMU_OT_NONE, 0, tx); bcopy(&scn->scn_phys, &scn->scn_phys_cached, sizeof (scn->scn_phys)); dsl_scan_sync_state(scn, tx, SYNC_MANDATORY); spa_history_log_internal(spa, "scan setup", tx, "func=%u mintxg=%llu maxtxg=%llu", *funcp, (u_longlong_t)scn->scn_phys.scn_min_txg, (u_longlong_t)scn->scn_phys.scn_max_txg); } /* * Called by the ZFS_IOC_POOL_SCAN ioctl to start a scrub or resilver. * Can also be called to resume a paused scrub. */ int dsl_scan(dsl_pool_t *dp, pool_scan_func_t func) { spa_t *spa = dp->dp_spa; dsl_scan_t *scn = dp->dp_scan; /* * Purge all vdev caches and probe all devices. We do this here * rather than in sync context because this requires a writer lock * on the spa_config lock, which we can't do from sync context. The * spa_scrub_reopen flag indicates that vdev_open() should not * attempt to start another scrub. */ spa_vdev_state_enter(spa, SCL_NONE); spa->spa_scrub_reopen = B_TRUE; vdev_reopen(spa->spa_root_vdev); spa->spa_scrub_reopen = B_FALSE; (void) spa_vdev_state_exit(spa, NULL, 0); if (func == POOL_SCAN_RESILVER) { dsl_scan_restart_resilver(spa->spa_dsl_pool, 0); return (0); } if (func == POOL_SCAN_SCRUB && dsl_scan_is_paused_scrub(scn)) { /* got scrub start cmd, resume paused scrub */ int err = dsl_scrub_set_pause_resume(scn->scn_dp, POOL_SCRUB_NORMAL); if (err == 0) { spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_RESUME); return (SET_ERROR(ECANCELED)); } return (SET_ERROR(err)); } return (dsl_sync_task(spa_name(spa), dsl_scan_setup_check, dsl_scan_setup_sync, &func, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED)); } /* ARGSUSED */ static void dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) { static const char *old_names[] = { "scrub_bookmark", "scrub_ddt_bookmark", "scrub_ddt_class_max", "scrub_queue", "scrub_min_txg", "scrub_max_txg", "scrub_func", "scrub_errors", NULL }; dsl_pool_t *dp = scn->scn_dp; spa_t *spa = dp->dp_spa; int i; /* Remove any remnants of an old-style scrub. */ for (i = 0; old_names[i]; i++) { (void) zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, old_names[i], tx); } if (scn->scn_phys.scn_queue_obj != 0) { VERIFY0(dmu_object_free(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, tx)); scn->scn_phys.scn_queue_obj = 0; } scan_ds_queue_clear(scn); scan_ds_prefetch_queue_clear(scn); scn->scn_phys.scn_flags &= ~DSF_SCRUB_PAUSED; /* * If we were "restarted" from a stopped state, don't bother * with anything else. */ if (!dsl_scan_is_running(scn)) { ASSERT(!scn->scn_is_sorted); return; } if (scn->scn_is_sorted) { scan_io_queues_destroy(scn); scn->scn_is_sorted = B_FALSE; if (scn->scn_taskq != NULL) { taskq_destroy(scn->scn_taskq); scn->scn_taskq = NULL; } } scn->scn_phys.scn_state = complete ? 
DSS_FINISHED : DSS_CANCELED; spa_notify_waiters(spa); if (dsl_scan_restarting(scn, tx)) spa_history_log_internal(spa, "scan aborted, restarting", tx, "errors=%llu", (u_longlong_t)spa_get_errlog_size(spa)); else if (!complete) spa_history_log_internal(spa, "scan cancelled", tx, "errors=%llu", (u_longlong_t)spa_get_errlog_size(spa)); else spa_history_log_internal(spa, "scan done", tx, "errors=%llu", (u_longlong_t)spa_get_errlog_size(spa)); if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { spa->spa_scrub_active = B_FALSE; /* * If the scrub/resilver completed, update all DTLs to * reflect this. Whether it succeeded or not, vacate * all temporary scrub DTLs. * * As the scrub does not currently support traversing * data that have been freed but are part of a checkpoint, * we don't mark the scrub as done in the DTLs as faults * may still exist in those vdevs. */ if (complete && !spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg, scn->scn_phys.scn_max_txg, B_TRUE, B_FALSE); if (scn->scn_phys.scn_min_txg) { nvlist_t *aux = fnvlist_alloc(); fnvlist_add_string(aux, ZFS_EV_RESILVER_TYPE, "healing"); spa_event_notify(spa, NULL, aux, ESC_ZFS_RESILVER_FINISH); nvlist_free(aux); } else { spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_FINISH); } } else { vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg, 0, B_TRUE, B_FALSE); } spa_errlog_rotate(spa); /* * Don't clear flag until after vdev_dtl_reassess to ensure that * DTL_MISSING will get updated when possible. */ spa->spa_scrub_started = B_FALSE; /* * We may have finished replacing a device. * Let the async thread assess this and handle the detach. */ spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); /* * Clear any resilver_deferred flags in the config. * If there are drives that need resilvering, kick * off an asynchronous request to start resilver. * vdev_clear_resilver_deferred() may update the config * before the resilver can restart. In the event of * a crash during this period, the spa loading code * will find the drives that need to be resilvered * and start the resilver then. */ if (spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER) && vdev_clear_resilver_deferred(spa->spa_root_vdev, tx)) { spa_history_log_internal(spa, "starting deferred resilver", tx, "errors=%llu", (u_longlong_t)spa_get_errlog_size(spa)); spa_async_request(spa, SPA_ASYNC_RESILVER); } + + /* Clear recent error events (i.e. 
duplicate events tracking) */ + if (complete) + zfs_ereport_clear(spa, NULL); } scn->scn_phys.scn_end_time = gethrestime_sec(); if (spa->spa_errata == ZPOOL_ERRATA_ZOL_2094_SCRUB) spa->spa_errata = 0; ASSERT(!dsl_scan_is_running(scn)); } /* ARGSUSED */ static int dsl_scan_cancel_check(void *arg, dmu_tx_t *tx) { dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; if (!dsl_scan_is_running(scn)) return (SET_ERROR(ENOENT)); return (0); } /* ARGSUSED */ static void dsl_scan_cancel_sync(void *arg, dmu_tx_t *tx) { dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; dsl_scan_done(scn, B_FALSE, tx); dsl_scan_sync_state(scn, tx, SYNC_MANDATORY); spa_event_notify(scn->scn_dp->dp_spa, NULL, NULL, ESC_ZFS_SCRUB_ABORT); } int dsl_scan_cancel(dsl_pool_t *dp) { return (dsl_sync_task(spa_name(dp->dp_spa), dsl_scan_cancel_check, dsl_scan_cancel_sync, NULL, 3, ZFS_SPACE_CHECK_RESERVED)); } static int dsl_scrub_pause_resume_check(void *arg, dmu_tx_t *tx) { pool_scrub_cmd_t *cmd = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_scan_t *scn = dp->dp_scan; if (*cmd == POOL_SCRUB_PAUSE) { /* can't pause a scrub when there is no in-progress scrub */ if (!dsl_scan_scrubbing(dp)) return (SET_ERROR(ENOENT)); /* can't pause a paused scrub */ if (dsl_scan_is_paused_scrub(scn)) return (SET_ERROR(EBUSY)); } else if (*cmd != POOL_SCRUB_NORMAL) { return (SET_ERROR(ENOTSUP)); } return (0); } static void dsl_scrub_pause_resume_sync(void *arg, dmu_tx_t *tx) { pool_scrub_cmd_t *cmd = arg; dsl_pool_t *dp = dmu_tx_pool(tx); spa_t *spa = dp->dp_spa; dsl_scan_t *scn = dp->dp_scan; if (*cmd == POOL_SCRUB_PAUSE) { /* can't pause a scrub when there is no in-progress scrub */ spa->spa_scan_pass_scrub_pause = gethrestime_sec(); scn->scn_phys.scn_flags |= DSF_SCRUB_PAUSED; scn->scn_phys_cached.scn_flags |= DSF_SCRUB_PAUSED; dsl_scan_sync_state(scn, tx, SYNC_CACHED); spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_PAUSED); spa_notify_waiters(spa); } else { ASSERT3U(*cmd, ==, POOL_SCRUB_NORMAL); if (dsl_scan_is_paused_scrub(scn)) { /* * We need to keep track of how much time we spend * paused per pass so that we can adjust the scrub rate * shown in the output of 'zpool status' */ spa->spa_scan_pass_scrub_spent_paused += gethrestime_sec() - spa->spa_scan_pass_scrub_pause; spa->spa_scan_pass_scrub_pause = 0; scn->scn_phys.scn_flags &= ~DSF_SCRUB_PAUSED; scn->scn_phys_cached.scn_flags &= ~DSF_SCRUB_PAUSED; dsl_scan_sync_state(scn, tx, SYNC_CACHED); } } } /* * Set scrub pause/resume state if it makes sense to do so */ int dsl_scrub_set_pause_resume(const dsl_pool_t *dp, pool_scrub_cmd_t cmd) { return (dsl_sync_task(spa_name(dp->dp_spa), dsl_scrub_pause_resume_check, dsl_scrub_pause_resume_sync, &cmd, 3, ZFS_SPACE_CHECK_RESERVED)); } /* start a new scan, or restart an existing one. 
*/ void dsl_scan_restart_resilver(dsl_pool_t *dp, uint64_t txg) { if (txg == 0) { dmu_tx_t *tx; tx = dmu_tx_create_dd(dp->dp_mos_dir); VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT)); txg = dmu_tx_get_txg(tx); dp->dp_scan->scn_restart_txg = txg; dmu_tx_commit(tx); } else { dp->dp_scan->scn_restart_txg = txg; } zfs_dbgmsg("restarting resilver txg=%llu", (longlong_t)txg); } void dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bp) { zio_free(dp->dp_spa, txg, bp); } void dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp) { ASSERT(dsl_pool_sync_context(dp)); zio_nowait(zio_free_sync(pio, dp->dp_spa, txg, bpp, pio->io_flags)); } static int scan_ds_queue_compare(const void *a, const void *b) { const scan_ds_t *sds_a = a, *sds_b = b; if (sds_a->sds_dsobj < sds_b->sds_dsobj) return (-1); if (sds_a->sds_dsobj == sds_b->sds_dsobj) return (0); return (1); } static void scan_ds_queue_clear(dsl_scan_t *scn) { void *cookie = NULL; scan_ds_t *sds; while ((sds = avl_destroy_nodes(&scn->scn_queue, &cookie)) != NULL) { kmem_free(sds, sizeof (*sds)); } } static boolean_t scan_ds_queue_contains(dsl_scan_t *scn, uint64_t dsobj, uint64_t *txg) { scan_ds_t srch, *sds; srch.sds_dsobj = dsobj; sds = avl_find(&scn->scn_queue, &srch, NULL); if (sds != NULL && txg != NULL) *txg = sds->sds_txg; return (sds != NULL); } static void scan_ds_queue_insert(dsl_scan_t *scn, uint64_t dsobj, uint64_t txg) { scan_ds_t *sds; avl_index_t where; sds = kmem_zalloc(sizeof (*sds), KM_SLEEP); sds->sds_dsobj = dsobj; sds->sds_txg = txg; VERIFY3P(avl_find(&scn->scn_queue, sds, &where), ==, NULL); avl_insert(&scn->scn_queue, sds, where); } static void scan_ds_queue_remove(dsl_scan_t *scn, uint64_t dsobj) { scan_ds_t srch, *sds; srch.sds_dsobj = dsobj; sds = avl_find(&scn->scn_queue, &srch, NULL); VERIFY(sds != NULL); avl_remove(&scn->scn_queue, sds); kmem_free(sds, sizeof (*sds)); } static void scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx) { dsl_pool_t *dp = scn->scn_dp; spa_t *spa = dp->dp_spa; dmu_object_type_t ot = (spa_version(spa) >= SPA_VERSION_DSL_SCRUB) ? DMU_OT_SCAN_QUEUE : DMU_OT_ZAP_OTHER; ASSERT0(scn->scn_bytes_pending); ASSERT(scn->scn_phys.scn_queue_obj != 0); VERIFY0(dmu_object_free(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, tx)); scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset, ot, DMU_OT_NONE, 0, tx); for (scan_ds_t *sds = avl_first(&scn->scn_queue); sds != NULL; sds = AVL_NEXT(&scn->scn_queue, sds)) { VERIFY0(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, sds->sds_dsobj, sds->sds_txg, tx)); } } /* * Computes the memory limit state that we're currently in. A sorted scan * needs quite a bit of memory to hold the sorting queue, so we need to * reasonably constrain the size so it doesn't impact overall system * performance. We compute two limits: * 1) Hard memory limit: if the amount of memory used by the sorting * queues on a pool gets above this value, we stop the metadata * scanning portion and start issuing the queued up and sorted * I/Os to reduce memory usage. * This limit is calculated as a fraction of physmem (by default 5%). * We constrain the lower bound of the hard limit to an absolute * minimum of zfs_scan_mem_lim_min (default: 16 MiB). We also constrain * the upper bound to 5% of the total pool size - no chance we'll * ever need that much memory, but just to keep the value in check. 
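 *    As a purely illustrative example with the defaults above: on a
 *    machine with 16 GiB of physical memory and 2 TiB allocated in the
 *    pool, 5% of physmem is roughly 820 MiB while 5% of the allocated
 *    space is far larger, so the hard limit works out to about 820 MiB
 *    (and the soft limit derived from it in 2) below to roughly 780 MiB).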
* 2) Soft memory limit: once we hit the hard memory limit, we start * issuing I/O to reduce queue memory usage, but we don't want to * completely empty out the queues, since we might be able to find I/Os * that will fill in the gaps of our non-sequential IOs at some point * in the future. So we stop the issuing of I/Os once the amount of * memory used drops below the soft limit (at which point we stop issuing * I/O and start scanning metadata again). * * This limit is calculated by subtracting a fraction of the hard * limit from the hard limit. By default this fraction is 5%, so * the soft limit is 95% of the hard limit. We cap the size of the * difference between the hard and soft limits at an absolute * maximum of zfs_scan_mem_lim_soft_max (default: 128 MiB) - this is * sufficient to not cause too frequent switching between the * metadata scan and I/O issue (even at 2k recordsize, 128 MiB's * worth of queues is about 1.2 GiB of on-pool data, so scanning * that should take at least a decent fraction of a second). */ static boolean_t dsl_scan_should_clear(dsl_scan_t *scn) { spa_t *spa = scn->scn_dp->dp_spa; vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev; uint64_t alloc, mlim_hard, mlim_soft, mused; alloc = metaslab_class_get_alloc(spa_normal_class(spa)); alloc += metaslab_class_get_alloc(spa_special_class(spa)); alloc += metaslab_class_get_alloc(spa_dedup_class(spa)); mlim_hard = MAX((physmem / zfs_scan_mem_lim_fact) * PAGESIZE, zfs_scan_mem_lim_min); mlim_hard = MIN(mlim_hard, alloc / 20); mlim_soft = mlim_hard - MIN(mlim_hard / zfs_scan_mem_lim_soft_fact, zfs_scan_mem_lim_soft_max); mused = 0; for (uint64_t i = 0; i < rvd->vdev_children; i++) { vdev_t *tvd = rvd->vdev_child[i]; dsl_scan_io_queue_t *queue; mutex_enter(&tvd->vdev_scan_io_queue_lock); queue = tvd->vdev_scan_io_queue; if (queue != NULL) { /* # extents in exts_by_size = # in exts_by_addr */ mused += zfs_btree_numnodes(&queue->q_exts_by_size) * sizeof (range_seg_gap_t) + queue->q_sio_memused; } mutex_exit(&tvd->vdev_scan_io_queue_lock); } dprintf("current scan memory usage: %llu bytes\n", (longlong_t)mused); if (mused == 0) ASSERT0(scn->scn_bytes_pending); /* * If we are above our hard limit, we need to clear out memory. * If we are below our soft limit, we need to accumulate sequential IOs. * Otherwise, we should keep doing whatever we are currently doing. */ if (mused >= mlim_hard) return (B_TRUE); else if (mused < mlim_soft) return (B_FALSE); else return (scn->scn_clearing); } static boolean_t dsl_scan_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb) { /* we never skip user/group accounting objects */ if (zb && (int64_t)zb->zb_object < 0) return (B_FALSE); if (scn->scn_suspending) return (B_TRUE); /* we're already suspending */ if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark)) return (B_FALSE); /* we're resuming */ /* We only know how to resume from level-0 and objset blocks. */ if (zb && (zb->zb_level != 0 && zb->zb_level != ZB_ROOT_LEVEL)) return (B_FALSE); /* * We suspend if: * - we have scanned for at least the minimum time (default 1 sec * for scrub, 3 sec for resilver), and either we have sufficient * dirty data that we are starting to write more quickly * (default 30%), someone is explicitly waiting for this txg * to complete, or we have used up all of the time in the txg * timeout (default 5 sec). * or * - the spa is shutting down because this pool is being exported * or the machine is rebooting. 
* or * - the scan queue has reached its memory use limit */ uint64_t curr_time_ns = gethrtime(); uint64_t scan_time_ns = curr_time_ns - scn->scn_sync_start_time; uint64_t sync_time_ns = curr_time_ns - scn->scn_dp->dp_spa->spa_sync_starttime; int dirty_pct = scn->scn_dp->dp_dirty_total * 100 / zfs_dirty_data_max; int mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ? zfs_resilver_min_time_ms : zfs_scrub_min_time_ms; if ((NSEC2MSEC(scan_time_ns) > mintime && (dirty_pct >= zfs_vdev_async_write_active_min_dirty_percent || txg_sync_waiting(scn->scn_dp) || NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) || spa_shutting_down(scn->scn_dp->dp_spa) || (zfs_scan_strict_mem_lim && dsl_scan_should_clear(scn))) { if (zb && zb->zb_level == ZB_ROOT_LEVEL) { dprintf("suspending at first available bookmark " "%llx/%llx/%llx/%llx\n", (longlong_t)zb->zb_objset, (longlong_t)zb->zb_object, (longlong_t)zb->zb_level, (longlong_t)zb->zb_blkid); SET_BOOKMARK(&scn->scn_phys.scn_bookmark, zb->zb_objset, 0, 0, 0); } else if (zb != NULL) { dprintf("suspending at bookmark %llx/%llx/%llx/%llx\n", (longlong_t)zb->zb_objset, (longlong_t)zb->zb_object, (longlong_t)zb->zb_level, (longlong_t)zb->zb_blkid); scn->scn_phys.scn_bookmark = *zb; } else { #ifdef ZFS_DEBUG dsl_scan_phys_t *scnp = &scn->scn_phys; dprintf("suspending at at DDT bookmark " "%llx/%llx/%llx/%llx\n", (longlong_t)scnp->scn_ddt_bookmark.ddb_class, (longlong_t)scnp->scn_ddt_bookmark.ddb_type, (longlong_t)scnp->scn_ddt_bookmark.ddb_checksum, (longlong_t)scnp->scn_ddt_bookmark.ddb_cursor); #endif } scn->scn_suspending = B_TRUE; return (B_TRUE); } return (B_FALSE); } typedef struct zil_scan_arg { dsl_pool_t *zsa_dp; zil_header_t *zsa_zh; } zil_scan_arg_t; /* ARGSUSED */ static int dsl_scan_zil_block(zilog_t *zilog, const blkptr_t *bp, void *arg, uint64_t claim_txg) { zil_scan_arg_t *zsa = arg; dsl_pool_t *dp = zsa->zsa_dp; dsl_scan_t *scn = dp->dp_scan; zil_header_t *zh = zsa->zsa_zh; zbookmark_phys_t zb; ASSERT(!BP_IS_REDACTED(bp)); if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) return (0); /* * One block ("stubby") can be allocated a long time ago; we * want to visit that one because it has been allocated * (on-disk) even if it hasn't been claimed (even though for * scrub there's nothing to do to it). 
*/ if (claim_txg == 0 && bp->blk_birth >= spa_min_claim_txg(dp->dp_spa)) return (0); SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET], ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb)); return (0); } /* ARGSUSED */ static int dsl_scan_zil_record(zilog_t *zilog, const lr_t *lrc, void *arg, uint64_t claim_txg) { if (lrc->lrc_txtype == TX_WRITE) { zil_scan_arg_t *zsa = arg; dsl_pool_t *dp = zsa->zsa_dp; dsl_scan_t *scn = dp->dp_scan; zil_header_t *zh = zsa->zsa_zh; const lr_write_t *lr = (const lr_write_t *)lrc; const blkptr_t *bp = &lr->lr_blkptr; zbookmark_phys_t zb; ASSERT(!BP_IS_REDACTED(bp)); if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) return (0); /* * birth can be < claim_txg if this record's txg is * already txg sync'ed (but this log block contains * other records that are not synced) */ if (claim_txg == 0 || bp->blk_birth < claim_txg) return (0); SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET], lr->lr_foid, ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp)); VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb)); } return (0); } static void dsl_scan_zil(dsl_pool_t *dp, zil_header_t *zh) { uint64_t claim_txg = zh->zh_claim_txg; zil_scan_arg_t zsa = { dp, zh }; zilog_t *zilog; ASSERT(spa_writeable(dp->dp_spa)); /* * We only want to visit blocks that have been claimed but not yet * replayed (or, in read-only mode, blocks that *would* be claimed). */ if (claim_txg == 0) return; zilog = zil_alloc(dp->dp_meta_objset, zh); (void) zil_parse(zilog, dsl_scan_zil_block, dsl_scan_zil_record, &zsa, claim_txg, B_FALSE); zil_free(zilog); } /* * We compare scan_prefetch_issue_ctx_t's based on their bookmarks. The idea * here is to sort the AVL tree by the order each block will be needed. 
*/ static int scan_prefetch_queue_compare(const void *a, const void *b) { const scan_prefetch_issue_ctx_t *spic_a = a, *spic_b = b; const scan_prefetch_ctx_t *spc_a = spic_a->spic_spc; const scan_prefetch_ctx_t *spc_b = spic_b->spic_spc; return (zbookmark_compare(spc_a->spc_datablkszsec, spc_a->spc_indblkshift, spc_b->spc_datablkszsec, spc_b->spc_indblkshift, &spic_a->spic_zb, &spic_b->spic_zb)); } static void scan_prefetch_ctx_rele(scan_prefetch_ctx_t *spc, void *tag) { if (zfs_refcount_remove(&spc->spc_refcnt, tag) == 0) { zfs_refcount_destroy(&spc->spc_refcnt); kmem_free(spc, sizeof (scan_prefetch_ctx_t)); } } static scan_prefetch_ctx_t * scan_prefetch_ctx_create(dsl_scan_t *scn, dnode_phys_t *dnp, void *tag) { scan_prefetch_ctx_t *spc; spc = kmem_alloc(sizeof (scan_prefetch_ctx_t), KM_SLEEP); zfs_refcount_create(&spc->spc_refcnt); zfs_refcount_add(&spc->spc_refcnt, tag); spc->spc_scn = scn; if (dnp != NULL) { spc->spc_datablkszsec = dnp->dn_datablkszsec; spc->spc_indblkshift = dnp->dn_indblkshift; spc->spc_root = B_FALSE; } else { spc->spc_datablkszsec = 0; spc->spc_indblkshift = 0; spc->spc_root = B_TRUE; } return (spc); } static void scan_prefetch_ctx_add_ref(scan_prefetch_ctx_t *spc, void *tag) { zfs_refcount_add(&spc->spc_refcnt, tag); } static void scan_ds_prefetch_queue_clear(dsl_scan_t *scn) { spa_t *spa = scn->scn_dp->dp_spa; void *cookie = NULL; scan_prefetch_issue_ctx_t *spic = NULL; mutex_enter(&spa->spa_scrub_lock); while ((spic = avl_destroy_nodes(&scn->scn_prefetch_queue, &cookie)) != NULL) { scan_prefetch_ctx_rele(spic->spic_spc, scn); kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t)); } mutex_exit(&spa->spa_scrub_lock); } static boolean_t dsl_scan_check_prefetch_resume(scan_prefetch_ctx_t *spc, const zbookmark_phys_t *zb) { zbookmark_phys_t *last_zb = &spc->spc_scn->scn_prefetch_bookmark; dnode_phys_t tmp_dnp; dnode_phys_t *dnp = (spc->spc_root) ? NULL : &tmp_dnp; if (zb->zb_objset != last_zb->zb_objset) return (B_TRUE); if ((int64_t)zb->zb_object < 0) return (B_FALSE); tmp_dnp.dn_datablkszsec = spc->spc_datablkszsec; tmp_dnp.dn_indblkshift = spc->spc_indblkshift; if (zbookmark_subtree_completed(dnp, zb, last_zb)) return (B_TRUE); return (B_FALSE); } static void dsl_scan_prefetch(scan_prefetch_ctx_t *spc, blkptr_t *bp, zbookmark_phys_t *zb) { avl_index_t idx; dsl_scan_t *scn = spc->spc_scn; spa_t *spa = scn->scn_dp->dp_spa; scan_prefetch_issue_ctx_t *spic; if (zfs_no_scrub_prefetch || BP_IS_REDACTED(bp)) return; if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_cur_min_txg || (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE && BP_GET_TYPE(bp) != DMU_OT_OBJSET)) return; if (dsl_scan_check_prefetch_resume(spc, zb)) return; scan_prefetch_ctx_add_ref(spc, scn); spic = kmem_alloc(sizeof (scan_prefetch_issue_ctx_t), KM_SLEEP); spic->spic_spc = spc; spic->spic_bp = *bp; spic->spic_zb = *zb; /* * Add the IO to the queue of blocks to prefetch. This allows us to * prioritize blocks that we will need first for the main traversal * thread. 
*/ mutex_enter(&spa->spa_scrub_lock); if (avl_find(&scn->scn_prefetch_queue, spic, &idx) != NULL) { /* this block is already queued for prefetch */ kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t)); scan_prefetch_ctx_rele(spc, scn); mutex_exit(&spa->spa_scrub_lock); return; } avl_insert(&scn->scn_prefetch_queue, spic, idx); cv_broadcast(&spa->spa_scrub_io_cv); mutex_exit(&spa->spa_scrub_lock); } static void dsl_scan_prefetch_dnode(dsl_scan_t *scn, dnode_phys_t *dnp, uint64_t objset, uint64_t object) { int i; zbookmark_phys_t zb; scan_prefetch_ctx_t *spc; if (dnp->dn_nblkptr == 0 && !(dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) return; SET_BOOKMARK(&zb, objset, object, 0, 0); spc = scan_prefetch_ctx_create(scn, dnp, FTAG); for (i = 0; i < dnp->dn_nblkptr; i++) { zb.zb_level = BP_GET_LEVEL(&dnp->dn_blkptr[i]); zb.zb_blkid = i; dsl_scan_prefetch(spc, &dnp->dn_blkptr[i], &zb); } if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { zb.zb_level = 0; zb.zb_blkid = DMU_SPILL_BLKID; dsl_scan_prefetch(spc, DN_SPILL_BLKPTR(dnp), &zb); } scan_prefetch_ctx_rele(spc, FTAG); } static void dsl_scan_prefetch_cb(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, arc_buf_t *buf, void *private) { scan_prefetch_ctx_t *spc = private; dsl_scan_t *scn = spc->spc_scn; spa_t *spa = scn->scn_dp->dp_spa; /* broadcast that the IO has completed for rate limiting purposes */ mutex_enter(&spa->spa_scrub_lock); ASSERT3U(spa->spa_scrub_inflight, >=, BP_GET_PSIZE(bp)); spa->spa_scrub_inflight -= BP_GET_PSIZE(bp); cv_broadcast(&spa->spa_scrub_io_cv); mutex_exit(&spa->spa_scrub_lock); /* if there was an error or we are done prefetching, just cleanup */ if (buf == NULL || scn->scn_prefetch_stop) goto out; if (BP_GET_LEVEL(bp) > 0) { int i; blkptr_t *cbp; int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; zbookmark_phys_t czb; for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) { SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, zb->zb_level - 1, zb->zb_blkid * epb + i); dsl_scan_prefetch(spc, cbp, &czb); } } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { dnode_phys_t *cdnp; int i; int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; for (i = 0, cdnp = buf->b_data; i < epb; i += cdnp->dn_extra_slots + 1, cdnp += cdnp->dn_extra_slots + 1) { dsl_scan_prefetch_dnode(scn, cdnp, zb->zb_objset, zb->zb_blkid * epb + i); } } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { objset_phys_t *osp = buf->b_data; dsl_scan_prefetch_dnode(scn, &osp->os_meta_dnode, zb->zb_objset, DMU_META_DNODE_OBJECT); if (OBJSET_BUF_HAS_USERUSED(buf)) { dsl_scan_prefetch_dnode(scn, &osp->os_groupused_dnode, zb->zb_objset, DMU_GROUPUSED_OBJECT); dsl_scan_prefetch_dnode(scn, &osp->os_userused_dnode, zb->zb_objset, DMU_USERUSED_OBJECT); } } out: if (buf != NULL) arc_buf_destroy(buf, private); scan_prefetch_ctx_rele(spc, scn); } /* ARGSUSED */ static void dsl_scan_prefetch_thread(void *arg) { dsl_scan_t *scn = arg; spa_t *spa = scn->scn_dp->dp_spa; scan_prefetch_issue_ctx_t *spic; /* loop until we are told to stop */ while (!scn->scn_prefetch_stop) { arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PRESCIENT_PREFETCH | ARC_FLAG_PREFETCH; int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD; mutex_enter(&spa->spa_scrub_lock); /* * Wait until we have an IO to issue and are not above our * maximum in flight limit. 
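 * Both producers of progress wake us here: dsl_scan_prefetch() broadcasts
 * spa_scrub_io_cv after queueing a new block, and dsl_scan_prefetch_cb()
 * broadcasts it after retiring in-flight bytes, so the loop below
 * re-checks both conditions whenever either of them may have changed.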
*/ while (!scn->scn_prefetch_stop && (avl_numnodes(&scn->scn_prefetch_queue) == 0 || spa->spa_scrub_inflight >= scn->scn_maxinflight_bytes)) { cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); } /* recheck if we should stop since we waited for the cv */ if (scn->scn_prefetch_stop) { mutex_exit(&spa->spa_scrub_lock); break; } /* remove the prefetch IO from the tree */ spic = avl_first(&scn->scn_prefetch_queue); spa->spa_scrub_inflight += BP_GET_PSIZE(&spic->spic_bp); avl_remove(&scn->scn_prefetch_queue, spic); mutex_exit(&spa->spa_scrub_lock); if (BP_IS_PROTECTED(&spic->spic_bp)) { ASSERT(BP_GET_TYPE(&spic->spic_bp) == DMU_OT_DNODE || BP_GET_TYPE(&spic->spic_bp) == DMU_OT_OBJSET); ASSERT3U(BP_GET_LEVEL(&spic->spic_bp), ==, 0); zio_flags |= ZIO_FLAG_RAW; } /* issue the prefetch asynchronously */ (void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa, &spic->spic_bp, dsl_scan_prefetch_cb, spic->spic_spc, ZIO_PRIORITY_SCRUB, zio_flags, &flags, &spic->spic_zb); kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t)); } ASSERT(scn->scn_prefetch_stop); /* free any prefetches we didn't get to complete */ mutex_enter(&spa->spa_scrub_lock); while ((spic = avl_first(&scn->scn_prefetch_queue)) != NULL) { avl_remove(&scn->scn_prefetch_queue, spic); scan_prefetch_ctx_rele(spic->spic_spc, scn); kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t)); } ASSERT0(avl_numnodes(&scn->scn_prefetch_queue)); mutex_exit(&spa->spa_scrub_lock); } static boolean_t dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp, const zbookmark_phys_t *zb) { /* * We never skip over user/group accounting objects (obj<0) */ if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark) && (int64_t)zb->zb_object >= 0) { /* * If we already visited this bp & everything below (in * a prior txg sync), don't bother doing it again. */ if (zbookmark_subtree_completed(dnp, zb, &scn->scn_phys.scn_bookmark)) return (B_TRUE); /* * If we found the block we're trying to resume from, or * we went past it to a different object, zero it out to * indicate that it's OK to start checking for suspending * again. */ if (bcmp(zb, &scn->scn_phys.scn_bookmark, sizeof (*zb)) == 0 || zb->zb_object > scn->scn_phys.scn_bookmark.zb_object) { dprintf("resuming at %llx/%llx/%llx/%llx\n", (longlong_t)zb->zb_objset, (longlong_t)zb->zb_object, (longlong_t)zb->zb_level, (longlong_t)zb->zb_blkid); bzero(&scn->scn_phys.scn_bookmark, sizeof (*zb)); } } return (B_FALSE); } static void dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb, dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn, dmu_objset_type_t ostype, dmu_tx_t *tx); inline __attribute__((always_inline)) static void dsl_scan_visitdnode( dsl_scan_t *, dsl_dataset_t *ds, dmu_objset_type_t ostype, dnode_phys_t *dnp, uint64_t object, dmu_tx_t *tx); /* * Return nonzero on i/o error. * Return new buf to write out in *bufp. 
*/ inline __attribute__((always_inline)) static int dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, dnode_phys_t *dnp, const blkptr_t *bp, const zbookmark_phys_t *zb, dmu_tx_t *tx) { dsl_pool_t *dp = scn->scn_dp; int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD; int err; ASSERT(!BP_IS_REDACTED(bp)); if (BP_GET_LEVEL(bp) > 0) { arc_flags_t flags = ARC_FLAG_WAIT; int i; blkptr_t *cbp; int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; arc_buf_t *buf; err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb); if (err) { scn->scn_phys.scn_errors++; return (err); } for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) { zbookmark_phys_t czb; SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, zb->zb_level - 1, zb->zb_blkid * epb + i); dsl_scan_visitbp(cbp, &czb, dnp, ds, scn, ostype, tx); } arc_buf_destroy(buf, &buf); } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { arc_flags_t flags = ARC_FLAG_WAIT; dnode_phys_t *cdnp; int i; int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; arc_buf_t *buf; if (BP_IS_PROTECTED(bp)) { ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF); zio_flags |= ZIO_FLAG_RAW; } err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb); if (err) { scn->scn_phys.scn_errors++; return (err); } for (i = 0, cdnp = buf->b_data; i < epb; i += cdnp->dn_extra_slots + 1, cdnp += cdnp->dn_extra_slots + 1) { dsl_scan_visitdnode(scn, ds, ostype, cdnp, zb->zb_blkid * epb + i, tx); } arc_buf_destroy(buf, &buf); } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { arc_flags_t flags = ARC_FLAG_WAIT; objset_phys_t *osp; arc_buf_t *buf; err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb); if (err) { scn->scn_phys.scn_errors++; return (err); } osp = buf->b_data; dsl_scan_visitdnode(scn, ds, osp->os_type, &osp->os_meta_dnode, DMU_META_DNODE_OBJECT, tx); if (OBJSET_BUF_HAS_USERUSED(buf)) { /* * We also always visit user/group/project accounting * objects, and never skip them, even if we are * suspending. This is necessary so that the * space deltas from this txg get integrated. */ if (OBJSET_BUF_HAS_PROJECTUSED(buf)) dsl_scan_visitdnode(scn, ds, osp->os_type, &osp->os_projectused_dnode, DMU_PROJECTUSED_OBJECT, tx); dsl_scan_visitdnode(scn, ds, osp->os_type, &osp->os_groupused_dnode, DMU_GROUPUSED_OBJECT, tx); dsl_scan_visitdnode(scn, ds, osp->os_type, &osp->os_userused_dnode, DMU_USERUSED_OBJECT, tx); } arc_buf_destroy(buf, &buf); } return (0); } inline __attribute__((always_inline)) static void dsl_scan_visitdnode(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, dnode_phys_t *dnp, uint64_t object, dmu_tx_t *tx) { int j; for (j = 0; j < dnp->dn_nblkptr; j++) { zbookmark_phys_t czb; SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object, dnp->dn_nlevels - 1, j); dsl_scan_visitbp(&dnp->dn_blkptr[j], &czb, dnp, ds, scn, ostype, tx); } if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { zbookmark_phys_t czb; SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object, 0, DMU_SPILL_BLKID); dsl_scan_visitbp(DN_SPILL_BLKPTR(dnp), &czb, dnp, ds, scn, ostype, tx); } } /* * The arguments are in this order because mdb can only print the * first 5; we want them to be useful. 
*/ static void dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb, dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn, dmu_objset_type_t ostype, dmu_tx_t *tx) { dsl_pool_t *dp = scn->scn_dp; blkptr_t *bp_toread = NULL; if (dsl_scan_check_suspend(scn, zb)) return; if (dsl_scan_check_resume(scn, dnp, zb)) return; scn->scn_visited_this_txg++; /* * This debugging is commented out to conserve stack space. This * function is called recursively and the debugging adds several * bytes to the stack for each call. It can be commented back in * if required to debug an issue in dsl_scan_visitbp(). * * dprintf_bp(bp, * "visiting ds=%p/%llu zb=%llx/%llx/%llx/%llx bp=%p", * ds, ds ? ds->ds_object : 0, * zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid, * bp); */ if (BP_IS_HOLE(bp)) { scn->scn_holes_this_txg++; return; } if (BP_IS_REDACTED(bp)) { ASSERT(dsl_dataset_feature_is_active(ds, SPA_FEATURE_REDACTED_DATASETS)); return; } if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) { scn->scn_lt_min_this_txg++; return; } bp_toread = kmem_alloc(sizeof (blkptr_t), KM_SLEEP); *bp_toread = *bp; if (dsl_scan_recurse(scn, ds, ostype, dnp, bp_toread, zb, tx) != 0) goto out; /* * If dsl_scan_ddt() has already visited this block, it will have * already done any translations or scrubbing, so don't call the * callback again. */ if (ddt_class_contains(dp->dp_spa, scn->scn_phys.scn_ddt_class_max, bp)) { scn->scn_ddt_contained_this_txg++; goto out; } /* * If this block is from the future (after cur_max_txg), then we * are doing this on behalf of a deleted snapshot, and we will * revisit the future block on the next pass of this dataset. * Don't scan it now unless we need to because something * under it was modified. */ if (BP_PHYSICAL_BIRTH(bp) > scn->scn_phys.scn_cur_max_txg) { scn->scn_gt_max_this_txg++; goto out; } scan_funcs[scn->scn_phys.scn_func](dp, bp, zb); out: kmem_free(bp_toread, sizeof (blkptr_t)); } static void dsl_scan_visit_rootbp(dsl_scan_t *scn, dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx) { zbookmark_phys_t zb; scan_prefetch_ctx_t *spc; SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); if (ZB_IS_ZERO(&scn->scn_phys.scn_bookmark)) { SET_BOOKMARK(&scn->scn_prefetch_bookmark, zb.zb_objset, 0, 0, 0); } else { scn->scn_prefetch_bookmark = scn->scn_phys.scn_bookmark; } scn->scn_objsets_visited_this_txg++; spc = scan_prefetch_ctx_create(scn, NULL, FTAG); dsl_scan_prefetch(spc, bp, &zb); scan_prefetch_ctx_rele(spc, FTAG); dsl_scan_visitbp(bp, &zb, NULL, ds, scn, DMU_OST_NONE, tx); dprintf_ds(ds, "finished scan%s", ""); } static void ds_destroyed_scn_phys(dsl_dataset_t *ds, dsl_scan_phys_t *scn_phys) { if (scn_phys->scn_bookmark.zb_objset == ds->ds_object) { if (ds->ds_is_snapshot) { /* * Note: * - scn_cur_{min,max}_txg stays the same. * - Setting the flag is not really necessary if * scn_cur_max_txg == scn_max_txg, because there * is nothing after this snapshot that we care * about. However, we set it anyway and then * ignore it when we retraverse it in * dsl_scan_visitds(). 
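 *   For example (illustrative): if the bookmark points at a snapshot
 *   that is being destroyed, the code below redirects it to that
 *   snapshot's ds_next_snap_obj and sets DSF_VISIT_DS_AGAIN, so the
 *   successor dataset is traversed again rather than losing our place.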
*/ scn_phys->scn_bookmark.zb_objset = dsl_dataset_phys(ds)->ds_next_snap_obj; zfs_dbgmsg("destroying ds %llu; currently traversing; " "reset zb_objset to %llu", (u_longlong_t)ds->ds_object, (u_longlong_t)dsl_dataset_phys(ds)-> ds_next_snap_obj); scn_phys->scn_flags |= DSF_VISIT_DS_AGAIN; } else { SET_BOOKMARK(&scn_phys->scn_bookmark, ZB_DESTROYED_OBJSET, 0, 0, 0); zfs_dbgmsg("destroying ds %llu; currently traversing; " "reset bookmark to -1,0,0,0", (u_longlong_t)ds->ds_object); } } } /* * Invoked when a dataset is destroyed. We need to make sure that: * * 1) If it is the dataset that was currently being scanned, we write * a new dsl_scan_phys_t and marking the objset reference in it * as destroyed. * 2) Remove it from the work queue, if it was present. * * If the dataset was actually a snapshot, instead of marking the dataset * as destroyed, we instead substitute the next snapshot in line. */ void dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx) { dsl_pool_t *dp = ds->ds_dir->dd_pool; dsl_scan_t *scn = dp->dp_scan; uint64_t mintxg; if (!dsl_scan_is_running(scn)) return; ds_destroyed_scn_phys(ds, &scn->scn_phys); ds_destroyed_scn_phys(ds, &scn->scn_phys_cached); if (scan_ds_queue_contains(scn, ds->ds_object, &mintxg)) { scan_ds_queue_remove(scn, ds->ds_object); if (ds->ds_is_snapshot) scan_ds_queue_insert(scn, dsl_dataset_phys(ds)->ds_next_snap_obj, mintxg); } if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) { ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1); VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds->ds_object, tx)); if (ds->ds_is_snapshot) { /* * We keep the same mintxg; it could be > * ds_creation_txg if the previous snapshot was * deleted too. */ VERIFY(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, dsl_dataset_phys(ds)->ds_next_snap_obj, mintxg, tx) == 0); zfs_dbgmsg("destroying ds %llu; in queue; " "replacing with %llu", (u_longlong_t)ds->ds_object, (u_longlong_t)dsl_dataset_phys(ds)-> ds_next_snap_obj); } else { zfs_dbgmsg("destroying ds %llu; in queue; removing", (u_longlong_t)ds->ds_object); } } /* * dsl_scan_sync() should be called after this, and should sync * out our changed state, but just to be safe, do it here. */ dsl_scan_sync_state(scn, tx, SYNC_CACHED); } static void ds_snapshotted_bookmark(dsl_dataset_t *ds, zbookmark_phys_t *scn_bookmark) { if (scn_bookmark->zb_objset == ds->ds_object) { scn_bookmark->zb_objset = dsl_dataset_phys(ds)->ds_prev_snap_obj; zfs_dbgmsg("snapshotting ds %llu; currently traversing; " "reset zb_objset to %llu", (u_longlong_t)ds->ds_object, (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj); } } /* * Called when a dataset is snapshotted. If we were currently traversing * this snapshot, we reset our bookmark to point at the newly created * snapshot. We also modify our work queue to remove the old snapshot and * replace with the new one. 
*/ void dsl_scan_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx) { dsl_pool_t *dp = ds->ds_dir->dd_pool; dsl_scan_t *scn = dp->dp_scan; uint64_t mintxg; if (!dsl_scan_is_running(scn)) return; ASSERT(dsl_dataset_phys(ds)->ds_prev_snap_obj != 0); ds_snapshotted_bookmark(ds, &scn->scn_phys.scn_bookmark); ds_snapshotted_bookmark(ds, &scn->scn_phys_cached.scn_bookmark); if (scan_ds_queue_contains(scn, ds->ds_object, &mintxg)) { scan_ds_queue_remove(scn, ds->ds_object); scan_ds_queue_insert(scn, dsl_dataset_phys(ds)->ds_prev_snap_obj, mintxg); } if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) { VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds->ds_object, tx)); VERIFY(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, dsl_dataset_phys(ds)->ds_prev_snap_obj, mintxg, tx) == 0); zfs_dbgmsg("snapshotting ds %llu; in queue; " "replacing with %llu", (u_longlong_t)ds->ds_object, (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj); } dsl_scan_sync_state(scn, tx, SYNC_CACHED); } static void ds_clone_swapped_bookmark(dsl_dataset_t *ds1, dsl_dataset_t *ds2, zbookmark_phys_t *scn_bookmark) { if (scn_bookmark->zb_objset == ds1->ds_object) { scn_bookmark->zb_objset = ds2->ds_object; zfs_dbgmsg("clone_swap ds %llu; currently traversing; " "reset zb_objset to %llu", (u_longlong_t)ds1->ds_object, (u_longlong_t)ds2->ds_object); } else if (scn_bookmark->zb_objset == ds2->ds_object) { scn_bookmark->zb_objset = ds1->ds_object; zfs_dbgmsg("clone_swap ds %llu; currently traversing; " "reset zb_objset to %llu", (u_longlong_t)ds2->ds_object, (u_longlong_t)ds1->ds_object); } } /* * Called when an origin dataset and its clone are swapped. If we were * currently traversing the dataset, we need to switch to traversing the * newly promoted clone. */ void dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx) { dsl_pool_t *dp = ds1->ds_dir->dd_pool; dsl_scan_t *scn = dp->dp_scan; uint64_t mintxg1, mintxg2; boolean_t ds1_queued, ds2_queued; if (!dsl_scan_is_running(scn)) return; ds_clone_swapped_bookmark(ds1, ds2, &scn->scn_phys.scn_bookmark); ds_clone_swapped_bookmark(ds1, ds2, &scn->scn_phys_cached.scn_bookmark); /* * Handle the in-memory scan queue. */ ds1_queued = scan_ds_queue_contains(scn, ds1->ds_object, &mintxg1); ds2_queued = scan_ds_queue_contains(scn, ds2->ds_object, &mintxg2); /* Sanity checking. */ if (ds1_queued) { ASSERT3U(mintxg1, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg); ASSERT3U(mintxg1, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg); } if (ds2_queued) { ASSERT3U(mintxg2, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg); ASSERT3U(mintxg2, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg); } if (ds1_queued && ds2_queued) { /* * If both are queued, we don't need to do anything. * The swapping code below would not handle this case correctly, * since we can't insert ds2 if it is already there. That's * because scan_ds_queue_insert() prohibits a duplicate insert * and panics. */ } else if (ds1_queued) { scan_ds_queue_remove(scn, ds1->ds_object); scan_ds_queue_insert(scn, ds2->ds_object, mintxg1); } else if (ds2_queued) { scan_ds_queue_remove(scn, ds2->ds_object); scan_ds_queue_insert(scn, ds1->ds_object, mintxg2); } /* * Handle the on-disk scan queue. * The on-disk state is an out-of-date version of the in-memory state, * so the in-memory and on-disk values for ds1_queued and ds2_queued may * be different. 
Therefore we need to apply the swap logic to the * on-disk state independently of the in-memory state. */ ds1_queued = zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds1->ds_object, &mintxg1) == 0; ds2_queued = zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds2->ds_object, &mintxg2) == 0; /* Sanity checking. */ if (ds1_queued) { ASSERT3U(mintxg1, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg); ASSERT3U(mintxg1, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg); } if (ds2_queued) { ASSERT3U(mintxg2, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg); ASSERT3U(mintxg2, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg); } if (ds1_queued && ds2_queued) { /* * If both are queued, we don't need to do anything. * Alternatively, we could check for EEXIST from * zap_add_int_key() and back out to the original state, but * that would be more work than checking for this case upfront. */ } else if (ds1_queued) { VERIFY3S(0, ==, zap_remove_int(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds1->ds_object, tx)); VERIFY3S(0, ==, zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds2->ds_object, mintxg1, tx)); zfs_dbgmsg("clone_swap ds %llu; in queue; " "replacing with %llu", (u_longlong_t)ds1->ds_object, (u_longlong_t)ds2->ds_object); } else if (ds2_queued) { VERIFY3S(0, ==, zap_remove_int(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds2->ds_object, tx)); VERIFY3S(0, ==, zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds1->ds_object, mintxg2, tx)); zfs_dbgmsg("clone_swap ds %llu; in queue; " "replacing with %llu", (u_longlong_t)ds2->ds_object, (u_longlong_t)ds1->ds_object); } dsl_scan_sync_state(scn, tx, SYNC_CACHED); } /* ARGSUSED */ static int enqueue_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) { uint64_t originobj = *(uint64_t *)arg; dsl_dataset_t *ds; int err; dsl_scan_t *scn = dp->dp_scan; if (dsl_dir_phys(hds->ds_dir)->dd_origin_obj != originobj) return (0); err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds); if (err) return (err); while (dsl_dataset_phys(ds)->ds_prev_snap_obj != originobj) { dsl_dataset_t *prev; err = dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev); dsl_dataset_rele(ds, FTAG); if (err) return (err); ds = prev; } scan_ds_queue_insert(scn, ds->ds_object, dsl_dataset_phys(ds)->ds_prev_snap_txg); dsl_dataset_rele(ds, FTAG); return (0); } static void dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx) { dsl_pool_t *dp = scn->scn_dp; dsl_dataset_t *ds; VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); if (scn->scn_phys.scn_cur_min_txg >= scn->scn_phys.scn_max_txg) { /* * This can happen if this snapshot was created after the * scan started, and we already completed a previous snapshot * that was created after the scan started. This snapshot * only references blocks with: * * birth < our ds_creation_txg * cur_min_txg is no less than ds_creation_txg. * We have already visited these blocks. * or * birth > scn_max_txg * The scan requested not to visit these blocks. * * Subsequent snapshots (and clones) can reference our * blocks, or blocks with even higher birth times. * Therefore we do not need to visit them either, * so we do not add them to the work queue. * * Note that checking for cur_min_txg >= cur_max_txg * is not sufficient, because in that case we may need to * visit subsequent snapshots. This happens when min_txg > 0, * which raises cur_min_txg. 
In this case we will visit * this dataset but skip all of its blocks, because the * rootbp's birth time is < cur_min_txg. Then we will * add the next snapshots/clones to the work queue. */ char *dsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); dsl_dataset_name(ds, dsname); zfs_dbgmsg("scanning dataset %llu (%s) is unnecessary because " "cur_min_txg (%llu) >= max_txg (%llu)", (longlong_t)dsobj, dsname, (longlong_t)scn->scn_phys.scn_cur_min_txg, (longlong_t)scn->scn_phys.scn_max_txg); kmem_free(dsname, MAXNAMELEN); goto out; } /* * Only the ZIL in the head (non-snapshot) is valid. Even though * snapshots can have ZIL block pointers (which may be the same * BP as in the head), they must be ignored. In addition, $ORIGIN * doesn't have a objset (i.e. its ds_bp is a hole) so we don't * need to look for a ZIL in it either. So we traverse the ZIL here, * rather than in scan_recurse(), because the regular snapshot * block-sharing rules don't apply to it. */ if (!dsl_dataset_is_snapshot(ds) && (dp->dp_origin_snap == NULL || ds->ds_dir != dp->dp_origin_snap->ds_dir)) { objset_t *os; if (dmu_objset_from_ds(ds, &os) != 0) { goto out; } dsl_scan_zil(dp, &os->os_zil_header); } /* * Iterate over the bps in this ds. */ dmu_buf_will_dirty(ds->ds_dbuf, tx); rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); dsl_scan_visit_rootbp(scn, ds, &dsl_dataset_phys(ds)->ds_bp, tx); rrw_exit(&ds->ds_bp_rwlock, FTAG); char *dsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); dsl_dataset_name(ds, dsname); zfs_dbgmsg("scanned dataset %llu (%s) with min=%llu max=%llu; " "suspending=%u", (longlong_t)dsobj, dsname, (longlong_t)scn->scn_phys.scn_cur_min_txg, (longlong_t)scn->scn_phys.scn_cur_max_txg, (int)scn->scn_suspending); kmem_free(dsname, ZFS_MAX_DATASET_NAME_LEN); if (scn->scn_suspending) goto out; /* * We've finished this pass over this dataset. */ /* * If we did not completely visit this dataset, do another pass. */ if (scn->scn_phys.scn_flags & DSF_VISIT_DS_AGAIN) { zfs_dbgmsg("incomplete pass; visiting again"); scn->scn_phys.scn_flags &= ~DSF_VISIT_DS_AGAIN; scan_ds_queue_insert(scn, ds->ds_object, scn->scn_phys.scn_cur_max_txg); goto out; } /* * Add descendant datasets to work queue. */ if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0) { scan_ds_queue_insert(scn, dsl_dataset_phys(ds)->ds_next_snap_obj, dsl_dataset_phys(ds)->ds_creation_txg); } if (dsl_dataset_phys(ds)->ds_num_children > 1) { boolean_t usenext = B_FALSE; if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) { uint64_t count; /* * A bug in a previous version of the code could * cause upgrade_clones_cb() to not set * ds_next_snap_obj when it should, leading to a * missing entry. Therefore we can only use the * next_clones_obj when its count is correct. 
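 * (Concretely: a snapshot with ds_num_children == N must have exactly
 * N - 1 entries in ds_next_clones_obj, one per clone, the remaining
 * child being the next snapshot in the chain, before we trust it below.)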
*/ int err = zap_count(dp->dp_meta_objset, dsl_dataset_phys(ds)->ds_next_clones_obj, &count); if (err == 0 && count == dsl_dataset_phys(ds)->ds_num_children - 1) usenext = B_TRUE; } if (usenext) { zap_cursor_t zc; zap_attribute_t za; for (zap_cursor_init(&zc, dp->dp_meta_objset, dsl_dataset_phys(ds)->ds_next_clones_obj); zap_cursor_retrieve(&zc, &za) == 0; (void) zap_cursor_advance(&zc)) { scan_ds_queue_insert(scn, zfs_strtonum(za.za_name, NULL), dsl_dataset_phys(ds)->ds_creation_txg); } zap_cursor_fini(&zc); } else { VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, enqueue_clones_cb, &ds->ds_object, DS_FIND_CHILDREN)); } } out: dsl_dataset_rele(ds, FTAG); } /* ARGSUSED */ static int enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) { dsl_dataset_t *ds; int err; dsl_scan_t *scn = dp->dp_scan; err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds); if (err) return (err); while (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) { dsl_dataset_t *prev; err = dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev); if (err) { dsl_dataset_rele(ds, FTAG); return (err); } /* * If this is a clone, we don't need to worry about it for now. */ if (dsl_dataset_phys(prev)->ds_next_snap_obj != ds->ds_object) { dsl_dataset_rele(ds, FTAG); dsl_dataset_rele(prev, FTAG); return (0); } dsl_dataset_rele(ds, FTAG); ds = prev; } scan_ds_queue_insert(scn, ds->ds_object, dsl_dataset_phys(ds)->ds_prev_snap_txg); dsl_dataset_rele(ds, FTAG); return (0); } /* ARGSUSED */ void dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum, ddt_entry_t *dde, dmu_tx_t *tx) { const ddt_key_t *ddk = &dde->dde_key; ddt_phys_t *ddp = dde->dde_phys; blkptr_t bp; zbookmark_phys_t zb = { 0 }; int p; if (!dsl_scan_is_running(scn)) return; /* * This function is special because it is the only thing * that can add scan_io_t's to the vdev scan queues from * outside dsl_scan_sync(). For the most part this is ok * as long as it is called from within syncing context. * However, dsl_scan_sync() expects that no new sio's will * be added between when all the work for a scan is done * and the next txg when the scan is actually marked as * completed. This check ensures we do not issue new sio's * during this period. */ if (scn->scn_done_txg != 0) return; for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { if (ddp->ddp_phys_birth == 0 || ddp->ddp_phys_birth > scn->scn_phys.scn_max_txg) continue; ddt_bp_create(checksum, ddk, ddp, &bp); scn->scn_visited_this_txg++; scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp, &zb); } } /* * Scrub/dedup interaction. * * If there are N references to a deduped block, we don't want to scrub it * N times -- ideally, we should scrub it exactly once. * * We leverage the fact that the dde's replication class (enum ddt_class) * is ordered from highest replication class (DDT_CLASS_DITTO) to lowest * (DDT_CLASS_UNIQUE) so that we may walk the DDT in that order. * * To prevent excess scrubbing, the scrub begins by walking the DDT * to find all blocks with refcnt > 1, and scrubs each of these once. * Since there are two replication classes which contain blocks with * refcnt > 1, we scrub the highest replication class (DDT_CLASS_DITTO) first. * Finally the top-down scrub begins, only visiting blocks with refcnt == 1. * * There would be nothing more to say if a block's refcnt couldn't change * during a scrub, but of course it can so we must account for changes * in a block's replication class. 
* * Here's an example of what can occur: * * If a block has refcnt > 1 during the DDT scrub phase, but has refcnt == 1 * when visited during the top-down scrub phase, it will be scrubbed twice. * This negates our scrub optimization, but is otherwise harmless. * * If a block has refcnt == 1 during the DDT scrub phase, but has refcnt > 1 * on each visit during the top-down scrub phase, it will never be scrubbed. * To catch this, ddt_sync_entry() notifies the scrub code whenever a block's * reference class transitions to a higher level (i.e DDT_CLASS_UNIQUE to * DDT_CLASS_DUPLICATE); if it transitions from refcnt == 1 to refcnt > 1 * while a scrub is in progress, it scrubs the block right then. */ static void dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx) { ddt_bookmark_t *ddb = &scn->scn_phys.scn_ddt_bookmark; ddt_entry_t dde; int error; uint64_t n = 0; bzero(&dde, sizeof (ddt_entry_t)); while ((error = ddt_walk(scn->scn_dp->dp_spa, ddb, &dde)) == 0) { ddt_t *ddt; if (ddb->ddb_class > scn->scn_phys.scn_ddt_class_max) break; dprintf("visiting ddb=%llu/%llu/%llu/%llx\n", (longlong_t)ddb->ddb_class, (longlong_t)ddb->ddb_type, (longlong_t)ddb->ddb_checksum, (longlong_t)ddb->ddb_cursor); /* There should be no pending changes to the dedup table */ ddt = scn->scn_dp->dp_spa->spa_ddt[ddb->ddb_checksum]; ASSERT(avl_first(&ddt->ddt_tree) == NULL); dsl_scan_ddt_entry(scn, ddb->ddb_checksum, &dde, tx); n++; if (dsl_scan_check_suspend(scn, NULL)) break; } zfs_dbgmsg("scanned %llu ddt entries with class_max = %u; " "suspending=%u", (longlong_t)n, (int)scn->scn_phys.scn_ddt_class_max, (int)scn->scn_suspending); ASSERT(error == 0 || error == ENOENT); ASSERT(error != ENOENT || ddb->ddb_class > scn->scn_phys.scn_ddt_class_max); } static uint64_t dsl_scan_ds_maxtxg(dsl_dataset_t *ds) { uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg; if (ds->ds_is_snapshot) return (MIN(smt, dsl_dataset_phys(ds)->ds_creation_txg)); return (smt); } static void dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx) { scan_ds_t *sds; dsl_pool_t *dp = scn->scn_dp; if (scn->scn_phys.scn_ddt_bookmark.ddb_class <= scn->scn_phys.scn_ddt_class_max) { scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg; scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg; dsl_scan_ddt(scn, tx); if (scn->scn_suspending) return; } if (scn->scn_phys.scn_bookmark.zb_objset == DMU_META_OBJSET) { /* First do the MOS & ORIGIN */ scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg; scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg; dsl_scan_visit_rootbp(scn, NULL, &dp->dp_meta_rootbp, tx); spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp); if (scn->scn_suspending) return; if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) { VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, enqueue_cb, NULL, DS_FIND_CHILDREN)); } else { dsl_scan_visitds(scn, dp->dp_origin_snap->ds_object, tx); } ASSERT(!scn->scn_suspending); } else if (scn->scn_phys.scn_bookmark.zb_objset != ZB_DESTROYED_OBJSET) { uint64_t dsobj = scn->scn_phys.scn_bookmark.zb_objset; /* * If we were suspended, continue from here. Note if the * ds we were suspended on was deleted, the zb_objset may * be -1, so we will skip this and find a new objset * below. */ dsl_scan_visitds(scn, dsobj, tx); if (scn->scn_suspending) return; } /* * In case we suspended right at the end of the ds, zero the * bookmark so we don't think that we're still trying to resume. */ bzero(&scn->scn_phys.scn_bookmark, sizeof (zbookmark_phys_t)); /* * Keep pulling things out of the dataset avl queue. 
Updates to the * persistent zap-object-as-queue happen only at checkpoints. */ while ((sds = avl_first(&scn->scn_queue)) != NULL) { dsl_dataset_t *ds; uint64_t dsobj = sds->sds_dsobj; uint64_t txg = sds->sds_txg; /* dequeue and free the ds from the queue */ scan_ds_queue_remove(scn, dsobj); sds = NULL; /* set up min / max txg */ VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); if (txg != 0) { scn->scn_phys.scn_cur_min_txg = MAX(scn->scn_phys.scn_min_txg, txg); } else { scn->scn_phys.scn_cur_min_txg = MAX(scn->scn_phys.scn_min_txg, dsl_dataset_phys(ds)->ds_prev_snap_txg); } scn->scn_phys.scn_cur_max_txg = dsl_scan_ds_maxtxg(ds); dsl_dataset_rele(ds, FTAG); dsl_scan_visitds(scn, dsobj, tx); if (scn->scn_suspending) return; } /* No more objsets to fetch, we're done */ scn->scn_phys.scn_bookmark.zb_objset = ZB_DESTROYED_OBJSET; ASSERT0(scn->scn_suspending); } static uint64_t dsl_scan_count_leaves(vdev_t *vd) { uint64_t i, leaves = 0; /* we only count leaves that belong to the main pool and are readable */ if (vd->vdev_islog || vd->vdev_isspare || vd->vdev_isl2cache || !vdev_readable(vd)) return (0); if (vd->vdev_ops->vdev_op_leaf) return (1); for (i = 0; i < vd->vdev_children; i++) { leaves += dsl_scan_count_leaves(vd->vdev_child[i]); } return (leaves); } static void scan_io_queues_update_zio_stats(dsl_scan_io_queue_t *q, const blkptr_t *bp) { int i; uint64_t cur_size = 0; for (i = 0; i < BP_GET_NDVAS(bp); i++) { cur_size += DVA_GET_ASIZE(&bp->blk_dva[i]); } q->q_total_zio_size_this_txg += cur_size; q->q_zios_this_txg++; } static void scan_io_queues_update_seg_stats(dsl_scan_io_queue_t *q, uint64_t start, uint64_t end) { q->q_total_seg_size_this_txg += end - start; q->q_segs_this_txg++; } static boolean_t scan_io_queue_check_suspend(dsl_scan_t *scn) { /* See comment in dsl_scan_check_suspend() */ uint64_t curr_time_ns = gethrtime(); uint64_t scan_time_ns = curr_time_ns - scn->scn_sync_start_time; uint64_t sync_time_ns = curr_time_ns - scn->scn_dp->dp_spa->spa_sync_starttime; int dirty_pct = scn->scn_dp->dp_dirty_total * 100 / zfs_dirty_data_max; int mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ? zfs_resilver_min_time_ms : zfs_scrub_min_time_ms; return ((NSEC2MSEC(scan_time_ns) > mintime && (dirty_pct >= zfs_vdev_async_write_active_min_dirty_percent || txg_sync_waiting(scn->scn_dp) || NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) || spa_shutting_down(scn->scn_dp->dp_spa)); } /* * Given a list of scan_io_t's in io_list, this issues the I/Os out to * disk. This consumes the io_list and frees the scan_io_t's. This is * called when emptying queues, either when we're up against the memory * limit or when we have finished scanning. Returns B_TRUE if we stopped * processing the list before we finished. Any sios that were not issued * will remain in the io_list. 
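 *
 * Note that when we do stop early, the caller (scan_io_queues_run_one())
 * re-inserts the remaining sios into the queue, so suspending here never
 * loses queued I/Os.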
*/ static boolean_t scan_io_queue_issue(dsl_scan_io_queue_t *queue, list_t *io_list) { dsl_scan_t *scn = queue->q_scn; scan_io_t *sio; int64_t bytes_issued = 0; boolean_t suspended = B_FALSE; while ((sio = list_head(io_list)) != NULL) { blkptr_t bp; if (scan_io_queue_check_suspend(scn)) { suspended = B_TRUE; break; } sio2bp(sio, &bp); bytes_issued += SIO_GET_ASIZE(sio); scan_exec_io(scn->scn_dp, &bp, sio->sio_flags, &sio->sio_zb, queue); (void) list_remove_head(io_list); scan_io_queues_update_zio_stats(queue, &bp); sio_free(sio); } atomic_add_64(&scn->scn_bytes_pending, -bytes_issued); return (suspended); } /* * This function removes sios from an IO queue which reside within a given * range_seg_t and inserts them (in offset order) into a list. Note that * we only ever return a maximum of 32 sios at once. If there are more sios * to process within this segment that did not make it onto the list we * return B_TRUE and otherwise B_FALSE. */ static boolean_t scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list) { scan_io_t *srch_sio, *sio, *next_sio; avl_index_t idx; uint_t num_sios = 0; int64_t bytes_issued = 0; ASSERT(rs != NULL); ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); srch_sio = sio_alloc(1); srch_sio->sio_nr_dvas = 1; SIO_SET_OFFSET(srch_sio, rs_get_start(rs, queue->q_exts_by_addr)); /* * The exact start of the extent might not contain any matching zios, * so if that's the case, examine the next one in the tree. */ sio = avl_find(&queue->q_sios_by_addr, srch_sio, &idx); sio_free(srch_sio); if (sio == NULL) sio = avl_nearest(&queue->q_sios_by_addr, idx, AVL_AFTER); while (sio != NULL && SIO_GET_OFFSET(sio) < rs_get_end(rs, queue->q_exts_by_addr) && num_sios <= 32) { ASSERT3U(SIO_GET_OFFSET(sio), >=, rs_get_start(rs, queue->q_exts_by_addr)); ASSERT3U(SIO_GET_END_OFFSET(sio), <=, rs_get_end(rs, queue->q_exts_by_addr)); next_sio = AVL_NEXT(&queue->q_sios_by_addr, sio); avl_remove(&queue->q_sios_by_addr, sio); queue->q_sio_memused -= SIO_GET_MUSED(sio); bytes_issued += SIO_GET_ASIZE(sio); num_sios++; list_insert_tail(list, sio); sio = next_sio; } /* * We limit the number of sios we process at once to 32 to avoid * biting off more than we can chew. If we didn't take everything * in the segment we update it to reflect the work we were able to * complete. Otherwise, we remove it from the range tree entirely. */ if (sio != NULL && SIO_GET_OFFSET(sio) < rs_get_end(rs, queue->q_exts_by_addr)) { range_tree_adjust_fill(queue->q_exts_by_addr, rs, -bytes_issued); range_tree_resize_segment(queue->q_exts_by_addr, rs, SIO_GET_OFFSET(sio), rs_get_end(rs, queue->q_exts_by_addr) - SIO_GET_OFFSET(sio)); return (B_TRUE); } else { uint64_t rstart = rs_get_start(rs, queue->q_exts_by_addr); uint64_t rend = rs_get_end(rs, queue->q_exts_by_addr); range_tree_remove(queue->q_exts_by_addr, rstart, rend - rstart); return (B_FALSE); } } /* * This is called from the queue emptying thread and selects the next * extent from which we are to issue I/Os. The behavior of this function * depends on the state of the scan, the current memory consumption and * whether or not we are performing a scan shutdown. * 1) We select extents in an elevator algorithm (LBA-order) if the scan * needs to perform a checkpoint * 2) We select the largest available extent if we are up against the * memory limit. * 3) Otherwise we don't select any extents. 
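 *
 * The zfs_scan_issue_strategy tunable can override the choice between
 * (1) and (2): while checkpointing or clearing, a value of 1 forces
 * LBA-order selection and a value of 2 forces selection of the
 * best-scored extent from the q_exts_by_size tree.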
*/ static range_seg_t * scan_io_queue_fetch_ext(dsl_scan_io_queue_t *queue) { dsl_scan_t *scn = queue->q_scn; range_tree_t *rt = queue->q_exts_by_addr; ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); ASSERT(scn->scn_is_sorted); /* handle tunable overrides */ if (scn->scn_checkpointing || scn->scn_clearing) { if (zfs_scan_issue_strategy == 1) { return (range_tree_first(rt)); } else if (zfs_scan_issue_strategy == 2) { /* * We need to get the original entry in the by_addr * tree so we can modify it. */ range_seg_t *size_rs = zfs_btree_first(&queue->q_exts_by_size, NULL); if (size_rs == NULL) return (NULL); uint64_t start = rs_get_start(size_rs, rt); uint64_t size = rs_get_end(size_rs, rt) - start; range_seg_t *addr_rs = range_tree_find(rt, start, size); ASSERT3P(addr_rs, !=, NULL); ASSERT3U(rs_get_start(size_rs, rt), ==, rs_get_start(addr_rs, rt)); ASSERT3U(rs_get_end(size_rs, rt), ==, rs_get_end(addr_rs, rt)); return (addr_rs); } } /* * During normal clearing, we want to issue our largest segments * first, keeping IO as sequential as possible, and leaving the * smaller extents for later with the hope that they might eventually * grow to larger sequential segments. However, when the scan is * checkpointing, no new extents will be added to the sorting queue, * so the way we are sorted now is as good as it will ever get. * In this case, we instead switch to issuing extents in LBA order. */ if (scn->scn_checkpointing) { return (range_tree_first(rt)); } else if (scn->scn_clearing) { /* * We need to get the original entry in the by_addr * tree so we can modify it. */ range_seg_t *size_rs = zfs_btree_first(&queue->q_exts_by_size, NULL); if (size_rs == NULL) return (NULL); uint64_t start = rs_get_start(size_rs, rt); uint64_t size = rs_get_end(size_rs, rt) - start; range_seg_t *addr_rs = range_tree_find(rt, start, size); ASSERT3P(addr_rs, !=, NULL); ASSERT3U(rs_get_start(size_rs, rt), ==, rs_get_start(addr_rs, rt)); ASSERT3U(rs_get_end(size_rs, rt), ==, rs_get_end(addr_rs, rt)); return (addr_rs); } else { return (NULL); } } static void scan_io_queues_run_one(void *arg) { dsl_scan_io_queue_t *queue = arg; kmutex_t *q_lock = &queue->q_vd->vdev_scan_io_queue_lock; boolean_t suspended = B_FALSE; range_seg_t *rs = NULL; scan_io_t *sio = NULL; list_t sio_list; uint64_t bytes_per_leaf = zfs_scan_vdev_limit; uint64_t nr_leaves = dsl_scan_count_leaves(queue->q_vd); ASSERT(queue->q_scn->scn_is_sorted); list_create(&sio_list, sizeof (scan_io_t), offsetof(scan_io_t, sio_nodes.sio_list_node)); mutex_enter(q_lock); /* calculate maximum in-flight bytes for this txg (min 1MB) */ queue->q_maxinflight_bytes = MAX(nr_leaves * bytes_per_leaf, 1ULL << 20); /* reset per-queue scan statistics for this txg */ queue->q_total_seg_size_this_txg = 0; queue->q_segs_this_txg = 0; queue->q_total_zio_size_this_txg = 0; queue->q_zios_this_txg = 0; /* loop until we run out of time or sios */ while ((rs = scan_io_queue_fetch_ext(queue)) != NULL) { uint64_t seg_start = 0, seg_end = 0; boolean_t more_left = B_TRUE; ASSERT(list_is_empty(&sio_list)); /* loop while we still have sios left to process in this rs */ while (more_left) { scan_io_t *first_sio, *last_sio; /* * We have selected which extent needs to be * processed next. Gather up the corresponding sios. 
*/ more_left = scan_io_queue_gather(queue, rs, &sio_list); ASSERT(!list_is_empty(&sio_list)); first_sio = list_head(&sio_list); last_sio = list_tail(&sio_list); seg_end = SIO_GET_END_OFFSET(last_sio); if (seg_start == 0) seg_start = SIO_GET_OFFSET(first_sio); /* * Issuing sios can take a long time so drop the * queue lock. The sio queue won't be updated by * other threads since we're in syncing context so * we can be sure that our trees will remain exactly * as we left them. */ mutex_exit(q_lock); suspended = scan_io_queue_issue(queue, &sio_list); mutex_enter(q_lock); if (suspended) break; } /* update statistics for debugging purposes */ scan_io_queues_update_seg_stats(queue, seg_start, seg_end); if (suspended) break; } /* * If we were suspended in the middle of processing, * requeue any unfinished sios and exit. */ while ((sio = list_head(&sio_list)) != NULL) { list_remove(&sio_list, sio); scan_io_queue_insert_impl(queue, sio); } mutex_exit(q_lock); list_destroy(&sio_list); } /* * Performs an emptying run on all scan queues in the pool. This just * punches out one thread per top-level vdev, each of which processes * only that vdev's scan queue. We can parallelize the I/O here because * we know that each queue's I/Os only affect its own top-level vdev. * * This function waits for the queue runs to complete, and must be * called from dsl_scan_sync (or in general, syncing context). */ static void scan_io_queues_run(dsl_scan_t *scn) { spa_t *spa = scn->scn_dp->dp_spa; ASSERT(scn->scn_is_sorted); ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); if (scn->scn_bytes_pending == 0) return; if (scn->scn_taskq == NULL) { int nthreads = spa->spa_root_vdev->vdev_children; /* * We need to make this taskq *always* execute as many * threads in parallel as we have top-level vdevs and no * less, otherwise strange serialization of the calls to * scan_io_queues_run_one can occur during spa_sync runs * and that significantly impacts performance. */ scn->scn_taskq = taskq_create("dsl_scan_iss", nthreads, minclsyspri, nthreads, nthreads, TASKQ_PREPOPULATE); } for (uint64_t i = 0; i < spa->spa_root_vdev->vdev_children; i++) { vdev_t *vd = spa->spa_root_vdev->vdev_child[i]; mutex_enter(&vd->vdev_scan_io_queue_lock); if (vd->vdev_scan_io_queue != NULL) { VERIFY(taskq_dispatch(scn->scn_taskq, scan_io_queues_run_one, vd->vdev_scan_io_queue, TQ_SLEEP) != TASKQID_INVALID); } mutex_exit(&vd->vdev_scan_io_queue_lock); } /* * Wait for the queues to finish issuing their IOs for this run * before we return. There may still be IOs in flight at this * point. 
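	 * (The zios themselves are reaped later in dsl_scan_sync() via
	 * zio_wait() on scn_zio_root, which is why I/O may still be in
	 * flight when this function returns.)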
*/ taskq_wait(scn->scn_taskq); } static boolean_t dsl_scan_async_block_should_pause(dsl_scan_t *scn) { uint64_t elapsed_nanosecs; if (zfs_recover) return (B_FALSE); if (zfs_async_block_max_blocks != 0 && scn->scn_visited_this_txg >= zfs_async_block_max_blocks) { return (B_TRUE); } if (zfs_max_async_dedup_frees != 0 && scn->scn_dedup_frees_this_txg >= zfs_max_async_dedup_frees) { return (B_TRUE); } elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time; return (elapsed_nanosecs / NANOSEC > zfs_txg_timeout || (NSEC2MSEC(elapsed_nanosecs) > scn->scn_async_block_min_time_ms && txg_sync_waiting(scn->scn_dp)) || spa_shutting_down(scn->scn_dp->dp_spa)); } static int dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { dsl_scan_t *scn = arg; if (!scn->scn_is_bptree || (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)) { if (dsl_scan_async_block_should_pause(scn)) return (SET_ERROR(ERESTART)); } zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa, dmu_tx_get_txg(tx), bp, 0)); dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD, -bp_get_dsize_sync(scn->scn_dp->dp_spa, bp), -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx); scn->scn_visited_this_txg++; if (BP_GET_DEDUP(bp)) scn->scn_dedup_frees_this_txg++; return (0); } static void dsl_scan_update_stats(dsl_scan_t *scn) { spa_t *spa = scn->scn_dp->dp_spa; uint64_t i; uint64_t seg_size_total = 0, zio_size_total = 0; uint64_t seg_count_total = 0, zio_count_total = 0; for (i = 0; i < spa->spa_root_vdev->vdev_children; i++) { vdev_t *vd = spa->spa_root_vdev->vdev_child[i]; dsl_scan_io_queue_t *queue = vd->vdev_scan_io_queue; if (queue == NULL) continue; seg_size_total += queue->q_total_seg_size_this_txg; zio_size_total += queue->q_total_zio_size_this_txg; seg_count_total += queue->q_segs_this_txg; zio_count_total += queue->q_zios_this_txg; } if (seg_count_total == 0 || zio_count_total == 0) { scn->scn_avg_seg_size_this_txg = 0; scn->scn_avg_zio_size_this_txg = 0; scn->scn_segs_this_txg = 0; scn->scn_zios_this_txg = 0; return; } scn->scn_avg_seg_size_this_txg = seg_size_total / seg_count_total; scn->scn_avg_zio_size_this_txg = zio_size_total / zio_count_total; scn->scn_segs_this_txg = seg_count_total; scn->scn_zios_this_txg = zio_count_total; } static int bpobj_dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) { ASSERT(!bp_freed); return (dsl_scan_free_block_cb(arg, bp, tx)); } static int dsl_scan_obsolete_block_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) { ASSERT(!bp_freed); dsl_scan_t *scn = arg; const dva_t *dva = &bp->blk_dva[0]; if (dsl_scan_async_block_should_pause(scn)) return (SET_ERROR(ERESTART)); spa_vdev_indirect_mark_obsolete(scn->scn_dp->dp_spa, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), DVA_GET_ASIZE(dva), tx); scn->scn_visited_this_txg++; return (0); } boolean_t dsl_scan_active(dsl_scan_t *scn) { spa_t *spa = scn->scn_dp->dp_spa; uint64_t used = 0, comp, uncomp; boolean_t clones_left; if (spa->spa_load_state != SPA_LOAD_NONE) return (B_FALSE); if (spa_shutting_down(spa)) return (B_FALSE); if ((dsl_scan_is_running(scn) && !dsl_scan_is_paused_scrub(scn)) || (scn->scn_async_destroying && !scn->scn_async_stalled)) return (B_TRUE); if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) { (void) bpobj_space(&scn->scn_dp->dp_free_bpobj, &used, &comp, &uncomp); } clones_left = spa_livelist_delete_check(spa); return ((used != 0) || (clones_left)); } static boolean_t dsl_scan_check_deferred(vdev_t *vd) { boolean_t need_resilver = B_FALSE; for 
(int c = 0; c < vd->vdev_children; c++) { need_resilver |= dsl_scan_check_deferred(vd->vdev_child[c]); } if (!vdev_is_concrete(vd) || vd->vdev_aux || !vd->vdev_ops->vdev_op_leaf) return (need_resilver); if (!vd->vdev_resilver_deferred) need_resilver = B_TRUE; return (need_resilver); } static boolean_t dsl_scan_need_resilver(spa_t *spa, const dva_t *dva, size_t psize, uint64_t phys_birth) { vdev_t *vd; vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva)); if (vd->vdev_ops == &vdev_indirect_ops) { /* * The indirect vdev can point to multiple * vdevs. For simplicity, always create * the resilver zio_t. zio_vdev_io_start() * will bypass the child resilver i/o's if * they are on vdevs that don't have DTL's. */ return (B_TRUE); } if (DVA_GET_GANG(dva)) { /* * Gang members may be spread across multiple * vdevs, so the best estimate we have is the * scrub range, which has already been checked. * XXX -- it would be better to change our * allocation policy to ensure that all * gang members reside on the same vdev. */ return (B_TRUE); } /* * Check if the top-level vdev must resilver this offset. * When the offset does not intersect with a dirty leaf DTL * then it may be possible to skip the resilver IO. The psize * is provided instead of asize to simplify the check for RAIDZ. */ if (!vdev_dtl_need_resilver(vd, dva, psize, phys_birth)) return (B_FALSE); /* * Check that this top-level vdev has a device under it which * is resilvering and is not deferred. */ if (!dsl_scan_check_deferred(vd)) return (B_FALSE); return (B_TRUE); } static int dsl_process_async_destroys(dsl_pool_t *dp, dmu_tx_t *tx) { dsl_scan_t *scn = dp->dp_scan; spa_t *spa = dp->dp_spa; int err = 0; if (spa_suspend_async_destroy(spa)) return (0); if (zfs_free_bpobj_enabled && spa_version(spa) >= SPA_VERSION_DEADLISTS) { scn->scn_is_bptree = B_FALSE; scn->scn_async_block_min_time_ms = zfs_free_min_time_ms; scn->scn_zio_root = zio_root(spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); err = bpobj_iterate(&dp->dp_free_bpobj, bpobj_dsl_scan_free_block_cb, scn, tx); VERIFY0(zio_wait(scn->scn_zio_root)); scn->scn_zio_root = NULL; if (err != 0 && err != ERESTART) zfs_panic_recover("error %u from bpobj_iterate()", err); } if (err == 0 && spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) { ASSERT(scn->scn_async_destroying); scn->scn_is_bptree = B_TRUE; scn->scn_zio_root = zio_root(spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); err = bptree_iterate(dp->dp_meta_objset, dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb, scn, tx); VERIFY0(zio_wait(scn->scn_zio_root)); scn->scn_zio_root = NULL; if (err == EIO || err == ECKSUM) { err = 0; } else if (err != 0 && err != ERESTART) { zfs_panic_recover("error %u from " "traverse_dataset_destroyed()", err); } if (bptree_is_empty(dp->dp_meta_objset, dp->dp_bptree_obj)) { /* finished; deactivate async destroy feature */ spa_feature_decr(spa, SPA_FEATURE_ASYNC_DESTROY, tx); ASSERT(!spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)); VERIFY0(zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_BPTREE_OBJ, tx)); VERIFY0(bptree_free(dp->dp_meta_objset, dp->dp_bptree_obj, tx)); dp->dp_bptree_obj = 0; scn->scn_async_destroying = B_FALSE; scn->scn_async_stalled = B_FALSE; } else { /* * If we didn't make progress, mark the async * destroy as stalled, so that we will not initiate * a spa_sync() on its behalf. Note that we only * check this if we are not finished, because if the * bptree had no blocks for us to visit, we can * finish without "making progress". 
*/ scn->scn_async_stalled = (scn->scn_visited_this_txg == 0); } } if (scn->scn_visited_this_txg) { zfs_dbgmsg("freed %llu blocks in %llums from " "free_bpobj/bptree txg %llu; err=%u", (longlong_t)scn->scn_visited_this_txg, (longlong_t) NSEC2MSEC(gethrtime() - scn->scn_sync_start_time), (longlong_t)tx->tx_txg, err); scn->scn_visited_this_txg = 0; scn->scn_dedup_frees_this_txg = 0; /* * Write out changes to the DDT that may be required as a * result of the blocks freed. This ensures that the DDT * is clean when a scrub/resilver runs. */ ddt_sync(spa, tx->tx_txg); } if (err != 0) return (err); if (dp->dp_free_dir != NULL && !scn->scn_async_destroying && zfs_free_leak_on_eio && (dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes != 0 || dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes != 0 || dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes != 0)) { /* * We have finished background destroying, but there is still * some space left in the dp_free_dir. Transfer this leaked * space to the dp_leak_dir. */ if (dp->dp_leak_dir == NULL) { rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); (void) dsl_dir_create_sync(dp, dp->dp_root_dir, LEAK_DIR_NAME, tx); VERIFY0(dsl_pool_open_special_dir(dp, LEAK_DIR_NAME, &dp->dp_leak_dir)); rrw_exit(&dp->dp_config_rwlock, FTAG); } dsl_dir_diduse_space(dp->dp_leak_dir, DD_USED_HEAD, dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes, dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes, dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx); dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD, -dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes, -dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes, -dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx); } if (dp->dp_free_dir != NULL && !scn->scn_async_destroying && !spa_livelist_delete_check(spa)) { /* finished; verify that space accounting went to zero */ ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes); ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes); ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes); } spa_notify_waiters(spa); EQUIV(bpobj_is_open(&dp->dp_obsolete_bpobj), 0 == zap_contains(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_OBSOLETE_BPOBJ)); if (err == 0 && bpobj_is_open(&dp->dp_obsolete_bpobj)) { ASSERT(spa_feature_is_active(dp->dp_spa, SPA_FEATURE_OBSOLETE_COUNTS)); scn->scn_is_bptree = B_FALSE; scn->scn_async_block_min_time_ms = zfs_obsolete_min_time_ms; err = bpobj_iterate(&dp->dp_obsolete_bpobj, dsl_scan_obsolete_block_cb, scn, tx); if (err != 0 && err != ERESTART) zfs_panic_recover("error %u from bpobj_iterate()", err); if (bpobj_is_empty(&dp->dp_obsolete_bpobj)) dsl_pool_destroy_obsolete_bpobj(dp, tx); } return (0); } /* * This is the primary entry point for scans that is called from syncing * context. Scans must happen entirely during syncing context so that we * can guarantee that blocks we are currently scanning will not change out * from under us. While a scan is active, this function controls how quickly * transaction groups proceed, instead of the normal handling provided by * txg_sync_thread(). 
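 *
 * In outline, each call handles any requested scan restart, processes
 * pending async destroys, then either traverses metadata to gather more
 * blocks (dsl_scan_visit()) or drains the per-vdev issue queues
 * (scan_io_queues_run()), and finally checkpoints the scan state with
 * dsl_scan_sync_state().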
*/ void dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) { int err = 0; dsl_scan_t *scn = dp->dp_scan; spa_t *spa = dp->dp_spa; state_sync_type_t sync_type = SYNC_OPTIONAL; if (spa->spa_resilver_deferred && !spa_feature_is_active(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER)) spa_feature_incr(spa, SPA_FEATURE_RESILVER_DEFER, tx); /* * Check for scn_restart_txg before checking spa_load_state, so * that we can restart an old-style scan while the pool is being * imported (see dsl_scan_init). We also restart scans if there * is a deferred resilver and the user has manually disabled * deferred resilvers via the tunable. */ if (dsl_scan_restarting(scn, tx) || (spa->spa_resilver_deferred && zfs_resilver_disable_defer)) { pool_scan_func_t func = POOL_SCAN_SCRUB; dsl_scan_done(scn, B_FALSE, tx); if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) func = POOL_SCAN_RESILVER; zfs_dbgmsg("restarting scan func=%u txg=%llu", func, (longlong_t)tx->tx_txg); dsl_scan_setup_sync(&func, tx); } /* * Only process scans in sync pass 1. */ if (spa_sync_pass(spa) > 1) return; /* * If the spa is shutting down, then stop scanning. This will * ensure that the scan does not dirty any new data during the * shutdown phase. */ if (spa_shutting_down(spa)) return; /* * If the scan is inactive due to a stalled async destroy, try again. */ if (!scn->scn_async_stalled && !dsl_scan_active(scn)) return; /* reset scan statistics */ scn->scn_visited_this_txg = 0; scn->scn_dedup_frees_this_txg = 0; scn->scn_holes_this_txg = 0; scn->scn_lt_min_this_txg = 0; scn->scn_gt_max_this_txg = 0; scn->scn_ddt_contained_this_txg = 0; scn->scn_objsets_visited_this_txg = 0; scn->scn_avg_seg_size_this_txg = 0; scn->scn_segs_this_txg = 0; scn->scn_avg_zio_size_this_txg = 0; scn->scn_zios_this_txg = 0; scn->scn_suspending = B_FALSE; scn->scn_sync_start_time = gethrtime(); spa->spa_scrub_active = B_TRUE; /* * First process the async destroys. If we suspend, don't do * any scrubbing or resilvering. This ensures that there are no * async destroys while we are scanning, so the scan code doesn't * have to worry about traversing it. It is also faster to free the * blocks than to scrub them. */ err = dsl_process_async_destroys(dp, tx); if (err != 0) return; if (!dsl_scan_is_running(scn) || dsl_scan_is_paused_scrub(scn)) return; /* * Wait a few txgs after importing to begin scanning so that * we can get the pool imported quickly. */ if (spa->spa_syncing_txg < spa->spa_first_txg + SCAN_IMPORT_WAIT_TXGS) return; /* * zfs_scan_suspend_progress can be set to disable scan progress. * We don't want to spin the txg_sync thread, so we add a delay * here to simulate the time spent doing a scan. This is mostly * useful for testing and debugging. */ if (zfs_scan_suspend_progress) { uint64_t scan_time_ns = gethrtime() - scn->scn_sync_start_time; int mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ? zfs_resilver_min_time_ms : zfs_scrub_min_time_ms; while (zfs_scan_suspend_progress && !txg_sync_waiting(scn->scn_dp) && !spa_shutting_down(scn->scn_dp->dp_spa) && NSEC2MSEC(scan_time_ns) < mintime) { delay(hz); scan_time_ns = gethrtime() - scn->scn_sync_start_time; } return; } /* * It is possible to switch from unsorted to sorted at any time, * but afterwards the scan will remain sorted unless reloaded from * a checkpoint after a reboot. 
*/ if (!zfs_scan_legacy) { scn->scn_is_sorted = B_TRUE; if (scn->scn_last_checkpoint == 0) scn->scn_last_checkpoint = ddi_get_lbolt(); } /* * For sorted scans, determine what kind of work we will be doing * this txg based on our memory limitations and whether or not we * need to perform a checkpoint. */ if (scn->scn_is_sorted) { /* * If we are over our checkpoint interval, set scn_clearing * so that we can begin checkpointing immediately. The * checkpoint allows us to save a consistent bookmark * representing how much data we have scrubbed so far. * Otherwise, use the memory limit to determine if we should * scan for metadata or start issue scrub IOs. We accumulate * metadata until we hit our hard memory limit at which point * we issue scrub IOs until we are at our soft memory limit. */ if (scn->scn_checkpointing || ddi_get_lbolt() - scn->scn_last_checkpoint > SEC_TO_TICK(zfs_scan_checkpoint_intval)) { if (!scn->scn_checkpointing) zfs_dbgmsg("begin scan checkpoint"); scn->scn_checkpointing = B_TRUE; scn->scn_clearing = B_TRUE; } else { boolean_t should_clear = dsl_scan_should_clear(scn); if (should_clear && !scn->scn_clearing) { zfs_dbgmsg("begin scan clearing"); scn->scn_clearing = B_TRUE; } else if (!should_clear && scn->scn_clearing) { zfs_dbgmsg("finish scan clearing"); scn->scn_clearing = B_FALSE; } } } else { ASSERT0(scn->scn_checkpointing); ASSERT0(scn->scn_clearing); } if (!scn->scn_clearing && scn->scn_done_txg == 0) { /* Need to scan metadata for more blocks to scrub */ dsl_scan_phys_t *scnp = &scn->scn_phys; taskqid_t prefetch_tqid; uint64_t bytes_per_leaf = zfs_scan_vdev_limit; uint64_t nr_leaves = dsl_scan_count_leaves(spa->spa_root_vdev); /* * Recalculate the max number of in-flight bytes for pool-wide * scanning operations (minimum 1MB). Limits for the issuing * phase are done per top-level vdev and are handled separately. 
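		 *
		 * For example, with 8 readable leaf vdevs and
		 * zfs_scan_vdev_limit set to 4 MiB, up to 32 MiB of scan I/O
		 * may be in flight pool-wide during the gathering phase.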
*/ scn->scn_maxinflight_bytes = MAX(nr_leaves * bytes_per_leaf, 1ULL << 20); if (scnp->scn_ddt_bookmark.ddb_class <= scnp->scn_ddt_class_max) { ASSERT(ZB_IS_ZERO(&scnp->scn_bookmark)); zfs_dbgmsg("doing scan sync txg %llu; " "ddt bm=%llu/%llu/%llu/%llx", (longlong_t)tx->tx_txg, (longlong_t)scnp->scn_ddt_bookmark.ddb_class, (longlong_t)scnp->scn_ddt_bookmark.ddb_type, (longlong_t)scnp->scn_ddt_bookmark.ddb_checksum, (longlong_t)scnp->scn_ddt_bookmark.ddb_cursor); } else { zfs_dbgmsg("doing scan sync txg %llu; " "bm=%llu/%llu/%llu/%llu", (longlong_t)tx->tx_txg, (longlong_t)scnp->scn_bookmark.zb_objset, (longlong_t)scnp->scn_bookmark.zb_object, (longlong_t)scnp->scn_bookmark.zb_level, (longlong_t)scnp->scn_bookmark.zb_blkid); } scn->scn_zio_root = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_CANFAIL); scn->scn_prefetch_stop = B_FALSE; prefetch_tqid = taskq_dispatch(dp->dp_sync_taskq, dsl_scan_prefetch_thread, scn, TQ_SLEEP); ASSERT(prefetch_tqid != TASKQID_INVALID); dsl_pool_config_enter(dp, FTAG); dsl_scan_visit(scn, tx); dsl_pool_config_exit(dp, FTAG); mutex_enter(&dp->dp_spa->spa_scrub_lock); scn->scn_prefetch_stop = B_TRUE; cv_broadcast(&spa->spa_scrub_io_cv); mutex_exit(&dp->dp_spa->spa_scrub_lock); taskq_wait_id(dp->dp_sync_taskq, prefetch_tqid); (void) zio_wait(scn->scn_zio_root); scn->scn_zio_root = NULL; zfs_dbgmsg("scan visited %llu blocks in %llums " "(%llu os's, %llu holes, %llu < mintxg, " "%llu in ddt, %llu > maxtxg)", (longlong_t)scn->scn_visited_this_txg, (longlong_t)NSEC2MSEC(gethrtime() - scn->scn_sync_start_time), (longlong_t)scn->scn_objsets_visited_this_txg, (longlong_t)scn->scn_holes_this_txg, (longlong_t)scn->scn_lt_min_this_txg, (longlong_t)scn->scn_ddt_contained_this_txg, (longlong_t)scn->scn_gt_max_this_txg); if (!scn->scn_suspending) { ASSERT0(avl_numnodes(&scn->scn_queue)); scn->scn_done_txg = tx->tx_txg + 1; if (scn->scn_is_sorted) { scn->scn_checkpointing = B_TRUE; scn->scn_clearing = B_TRUE; } zfs_dbgmsg("scan complete txg %llu", (longlong_t)tx->tx_txg); } } else if (scn->scn_is_sorted && scn->scn_bytes_pending != 0) { ASSERT(scn->scn_clearing); /* need to issue scrubbing IOs from per-vdev queues */ scn->scn_zio_root = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_CANFAIL); scan_io_queues_run(scn); (void) zio_wait(scn->scn_zio_root); scn->scn_zio_root = NULL; /* calculate and dprintf the current memory usage */ (void) dsl_scan_should_clear(scn); dsl_scan_update_stats(scn); zfs_dbgmsg("scan issued %llu blocks (%llu segs) in %llums " "(avg_block_size = %llu, avg_seg_size = %llu)", (longlong_t)scn->scn_zios_this_txg, (longlong_t)scn->scn_segs_this_txg, (longlong_t)NSEC2MSEC(gethrtime() - scn->scn_sync_start_time), (longlong_t)scn->scn_avg_zio_size_this_txg, (longlong_t)scn->scn_avg_seg_size_this_txg); } else if (scn->scn_done_txg != 0 && scn->scn_done_txg <= tx->tx_txg) { /* Finished with everything. Mark the scrub as complete */ zfs_dbgmsg("scan issuing complete txg %llu", (longlong_t)tx->tx_txg); ASSERT3U(scn->scn_done_txg, !=, 0); ASSERT0(spa->spa_scrub_inflight); ASSERT0(scn->scn_bytes_pending); dsl_scan_done(scn, B_TRUE, tx); sync_type = SYNC_MANDATORY; } dsl_scan_sync_state(scn, tx, sync_type); } static void count_block(dsl_scan_t *scn, zfs_all_blkstats_t *zab, const blkptr_t *bp) { int i; /* * Don't count embedded bp's, since we already did the work of * scanning these when we scanned the containing block. */ if (BP_IS_EMBEDDED(bp)) return; /* * Update the spa's stats on how many bytes we have issued. * Sequential scrubs create a zio for each DVA of the bp. 
Each * of these will include all DVAs for repair purposes, but the * zio code will only try the first one unless there is an issue. * Therefore, we should only count the first DVA for these IOs. */ if (scn->scn_is_sorted) { atomic_add_64(&scn->scn_dp->dp_spa->spa_scan_pass_issued, DVA_GET_ASIZE(&bp->blk_dva[0])); } else { spa_t *spa = scn->scn_dp->dp_spa; for (i = 0; i < BP_GET_NDVAS(bp); i++) { atomic_add_64(&spa->spa_scan_pass_issued, DVA_GET_ASIZE(&bp->blk_dva[i])); } } /* * If we resume after a reboot, zab will be NULL; don't record * incomplete stats in that case. */ if (zab == NULL) return; mutex_enter(&zab->zab_lock); for (i = 0; i < 4; i++) { int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS; int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL; if (t & DMU_OT_NEWTYPE) t = DMU_OT_OTHER; zfs_blkstat_t *zb = &zab->zab_type[l][t]; int equal; zb->zb_count++; zb->zb_asize += BP_GET_ASIZE(bp); zb->zb_lsize += BP_GET_LSIZE(bp); zb->zb_psize += BP_GET_PSIZE(bp); zb->zb_gangs += BP_COUNT_GANG(bp); switch (BP_GET_NDVAS(bp)) { case 2: if (DVA_GET_VDEV(&bp->blk_dva[0]) == DVA_GET_VDEV(&bp->blk_dva[1])) zb->zb_ditto_2_of_2_samevdev++; break; case 3: equal = (DVA_GET_VDEV(&bp->blk_dva[0]) == DVA_GET_VDEV(&bp->blk_dva[1])) + (DVA_GET_VDEV(&bp->blk_dva[0]) == DVA_GET_VDEV(&bp->blk_dva[2])) + (DVA_GET_VDEV(&bp->blk_dva[1]) == DVA_GET_VDEV(&bp->blk_dva[2])); if (equal == 1) zb->zb_ditto_2_of_3_samevdev++; else if (equal == 3) zb->zb_ditto_3_of_3_samevdev++; break; } } mutex_exit(&zab->zab_lock); } static void scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue, scan_io_t *sio) { avl_index_t idx; int64_t asize = SIO_GET_ASIZE(sio); dsl_scan_t *scn = queue->q_scn; ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); if (avl_find(&queue->q_sios_by_addr, sio, &idx) != NULL) { /* block is already scheduled for reading */ atomic_add_64(&scn->scn_bytes_pending, -asize); sio_free(sio); return; } avl_insert(&queue->q_sios_by_addr, sio, idx); queue->q_sio_memused += SIO_GET_MUSED(sio); range_tree_add(queue->q_exts_by_addr, SIO_GET_OFFSET(sio), asize); } /* * Given all the info we got from our metadata scanning process, we * construct a scan_io_t and insert it into the scan sorting queue. The * I/O must already be suitable for us to process. This is controlled * by dsl_scan_enqueue(). */ static void scan_io_queue_insert(dsl_scan_io_queue_t *queue, const blkptr_t *bp, int dva_i, int zio_flags, const zbookmark_phys_t *zb) { dsl_scan_t *scn = queue->q_scn; scan_io_t *sio = sio_alloc(BP_GET_NDVAS(bp)); ASSERT0(BP_IS_GANG(bp)); ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); bp2sio(bp, sio, dva_i); sio->sio_flags = zio_flags; sio->sio_zb = *zb; /* * Increment the bytes pending counter now so that we can't * get an integer underflow in case the worker processes the * zio before we get to incrementing this counter. */ atomic_add_64(&scn->scn_bytes_pending, SIO_GET_ASIZE(sio)); scan_io_queue_insert_impl(queue, sio); } /* * Given a set of I/O parameters as discovered by the metadata traversal * process, attempts to place the I/O into the sorted queues (if allowed), * or immediately executes the I/O. */ static void dsl_scan_enqueue(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags, const zbookmark_phys_t *zb) { spa_t *spa = dp->dp_spa; ASSERT(!BP_IS_EMBEDDED(bp)); /* * Gang blocks are hard to issue sequentially, so we just issue them * here immediately instead of queuing them. 
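	 * The same immediate path is also taken whenever the scan is not
	 * sorted (legacy, non-sequential scrub behavior).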
*/ if (!dp->dp_scan->scn_is_sorted || BP_IS_GANG(bp)) { scan_exec_io(dp, bp, zio_flags, zb, NULL); return; } for (int i = 0; i < BP_GET_NDVAS(bp); i++) { dva_t dva; vdev_t *vdev; dva = bp->blk_dva[i]; vdev = vdev_lookup_top(spa, DVA_GET_VDEV(&dva)); ASSERT(vdev != NULL); mutex_enter(&vdev->vdev_scan_io_queue_lock); if (vdev->vdev_scan_io_queue == NULL) vdev->vdev_scan_io_queue = scan_io_queue_create(vdev); ASSERT(dp->dp_scan != NULL); scan_io_queue_insert(vdev->vdev_scan_io_queue, bp, i, zio_flags, zb); mutex_exit(&vdev->vdev_scan_io_queue_lock); } } static int dsl_scan_scrub_cb(dsl_pool_t *dp, const blkptr_t *bp, const zbookmark_phys_t *zb) { dsl_scan_t *scn = dp->dp_scan; spa_t *spa = dp->dp_spa; uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp); size_t psize = BP_GET_PSIZE(bp); boolean_t needs_io = B_FALSE; int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL; if (phys_birth <= scn->scn_phys.scn_min_txg || phys_birth >= scn->scn_phys.scn_max_txg) { count_block(scn, dp->dp_blkstats, bp); return (0); } /* Embedded BP's have phys_birth==0, so we reject them above. */ ASSERT(!BP_IS_EMBEDDED(bp)); ASSERT(DSL_SCAN_IS_SCRUB_RESILVER(scn)); if (scn->scn_phys.scn_func == POOL_SCAN_SCRUB) { zio_flags |= ZIO_FLAG_SCRUB; needs_io = B_TRUE; } else { ASSERT3U(scn->scn_phys.scn_func, ==, POOL_SCAN_RESILVER); zio_flags |= ZIO_FLAG_RESILVER; needs_io = B_FALSE; } /* If it's an intent log block, failure is expected. */ if (zb->zb_level == ZB_ZIL_LEVEL) zio_flags |= ZIO_FLAG_SPECULATIVE; for (int d = 0; d < BP_GET_NDVAS(bp); d++) { const dva_t *dva = &bp->blk_dva[d]; /* * Keep track of how much data we've examined so that * zpool(8) status can make useful progress reports. */ scn->scn_phys.scn_examined += DVA_GET_ASIZE(dva); spa->spa_scan_pass_exam += DVA_GET_ASIZE(dva); /* if it's a resilver, this may not be in the target range */ if (!needs_io) needs_io = dsl_scan_need_resilver(spa, dva, psize, phys_birth); } if (needs_io && !zfs_no_scrub_io) { dsl_scan_enqueue(dp, bp, zio_flags, zb); } else { count_block(scn, dp->dp_blkstats, bp); } /* do not relocate this block */ return (0); } static void dsl_scan_scrub_done(zio_t *zio) { spa_t *spa = zio->io_spa; blkptr_t *bp = zio->io_bp; dsl_scan_io_queue_t *queue = zio->io_private; abd_free(zio->io_abd); if (queue == NULL) { mutex_enter(&spa->spa_scrub_lock); ASSERT3U(spa->spa_scrub_inflight, >=, BP_GET_PSIZE(bp)); spa->spa_scrub_inflight -= BP_GET_PSIZE(bp); cv_broadcast(&spa->spa_scrub_io_cv); mutex_exit(&spa->spa_scrub_lock); } else { mutex_enter(&queue->q_vd->vdev_scan_io_queue_lock); ASSERT3U(queue->q_inflight_bytes, >=, BP_GET_PSIZE(bp)); queue->q_inflight_bytes -= BP_GET_PSIZE(bp); cv_broadcast(&queue->q_zio_cv); mutex_exit(&queue->q_vd->vdev_scan_io_queue_lock); } if (zio->io_error && (zio->io_error != ECKSUM || !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) { atomic_inc_64(&spa->spa_dsl_pool->dp_scan->scn_phys.scn_errors); } } /* * Given a scanning zio's information, executes the zio. The zio need * not necessarily be only sortable, this function simply executes the * zio, no matter what it is. The optional queue argument allows the * caller to specify that they want per top level vdev IO rate limiting * instead of the legacy global limiting. 
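 *
 * Concretely, a NULL queue throttles against the pool-wide
 * spa_scrub_inflight / scn_maxinflight_bytes limit, while a non-NULL
 * queue throttles against that vdev's q_inflight_bytes and
 * q_maxinflight_bytes.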
*/ static void scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags, const zbookmark_phys_t *zb, dsl_scan_io_queue_t *queue) { spa_t *spa = dp->dp_spa; dsl_scan_t *scn = dp->dp_scan; size_t size = BP_GET_PSIZE(bp); abd_t *data = abd_alloc_for_io(size, B_FALSE); ASSERT3U(scn->scn_maxinflight_bytes, >, 0); if (queue == NULL) { mutex_enter(&spa->spa_scrub_lock); while (spa->spa_scrub_inflight >= scn->scn_maxinflight_bytes) cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); spa->spa_scrub_inflight += BP_GET_PSIZE(bp); mutex_exit(&spa->spa_scrub_lock); } else { kmutex_t *q_lock = &queue->q_vd->vdev_scan_io_queue_lock; mutex_enter(q_lock); while (queue->q_inflight_bytes >= queue->q_maxinflight_bytes) cv_wait(&queue->q_zio_cv, q_lock); queue->q_inflight_bytes += BP_GET_PSIZE(bp); mutex_exit(q_lock); } count_block(scn, dp->dp_blkstats, bp); zio_nowait(zio_read(scn->scn_zio_root, spa, bp, data, size, dsl_scan_scrub_done, queue, ZIO_PRIORITY_SCRUB, zio_flags, zb)); } /* * This is the primary extent sorting algorithm. We balance two parameters: * 1) how many bytes of I/O are in an extent * 2) how well the extent is filled with I/O (as a fraction of its total size) * Since we allow extents to have gaps between their constituent I/Os, it's * possible to have a fairly large extent that contains the same amount of * I/O bytes than a much smaller extent, which just packs the I/O more tightly. * The algorithm sorts based on a score calculated from the extent's size, * the relative fill volume (in %) and a "fill weight" parameter that controls * the split between whether we prefer larger extents or more well populated * extents: * * SCORE = FILL_IN_BYTES + (FILL_IN_PERCENT * FILL_IN_BYTES * FILL_WEIGHT) * * Example: * 1) assume extsz = 64 MiB * 2) assume fill = 32 MiB (extent is half full) * 3) assume fill_weight = 3 * 4) SCORE = 32M + (((32M * 100) / 64M) * 3 * 32M) / 100 * SCORE = 32M + (50 * 3 * 32M) / 100 * SCORE = 32M + (4800M / 100) * SCORE = 32M + 48M * ^ ^ * | +--- final total relative fill-based score * +--------- final total fill-based score * SCORE = 80M * * As can be seen, at fill_ratio=3, the algorithm is slightly biased towards * extents that are more completely filled (in a 3:2 ratio) vs just larger. * Note that as an optimization, we replace multiplication and division by * 100 with bitshifting by 7 (which effectively multiplies and divides by 128). */ static int ext_size_compare(const void *x, const void *y) { const range_seg_gap_t *rsa = x, *rsb = y; uint64_t sa = rsa->rs_end - rsa->rs_start; uint64_t sb = rsb->rs_end - rsb->rs_start; uint64_t score_a, score_b; score_a = rsa->rs_fill + ((((rsa->rs_fill << 7) / sa) * fill_weight * rsa->rs_fill) >> 7); score_b = rsb->rs_fill + ((((rsb->rs_fill << 7) / sb) * fill_weight * rsb->rs_fill) >> 7); if (score_a > score_b) return (-1); if (score_a == score_b) { if (rsa->rs_start < rsb->rs_start) return (-1); if (rsa->rs_start == rsb->rs_start) return (0); return (1); } return (1); } /* * Comparator for the q_sios_by_addr tree. Sorting is simply performed * based on LBA-order (from lowest to highest). */ static int sio_addr_compare(const void *x, const void *y) { const scan_io_t *a = x, *b = y; return (TREE_CMP(SIO_GET_OFFSET(a), SIO_GET_OFFSET(b))); } /* IO queues are created on demand when they are needed. 
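 * Specifically, dsl_scan_enqueue() allocates a queue the first time a
 * sortable block is queued for a given top-level vdev.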
*/ static dsl_scan_io_queue_t * scan_io_queue_create(vdev_t *vd) { dsl_scan_t *scn = vd->vdev_spa->spa_dsl_pool->dp_scan; dsl_scan_io_queue_t *q = kmem_zalloc(sizeof (*q), KM_SLEEP); q->q_scn = scn; q->q_vd = vd; q->q_sio_memused = 0; cv_init(&q->q_zio_cv, NULL, CV_DEFAULT, NULL); q->q_exts_by_addr = range_tree_create_impl(&rt_btree_ops, RANGE_SEG_GAP, &q->q_exts_by_size, 0, 0, ext_size_compare, zfs_scan_max_ext_gap); avl_create(&q->q_sios_by_addr, sio_addr_compare, sizeof (scan_io_t), offsetof(scan_io_t, sio_nodes.sio_addr_node)); return (q); } /* * Destroys a scan queue and all segments and scan_io_t's contained in it. * No further execution of I/O occurs, anything pending in the queue is * simply freed without being executed. */ void dsl_scan_io_queue_destroy(dsl_scan_io_queue_t *queue) { dsl_scan_t *scn = queue->q_scn; scan_io_t *sio; void *cookie = NULL; int64_t bytes_dequeued = 0; ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); while ((sio = avl_destroy_nodes(&queue->q_sios_by_addr, &cookie)) != NULL) { ASSERT(range_tree_contains(queue->q_exts_by_addr, SIO_GET_OFFSET(sio), SIO_GET_ASIZE(sio))); bytes_dequeued += SIO_GET_ASIZE(sio); queue->q_sio_memused -= SIO_GET_MUSED(sio); sio_free(sio); } ASSERT0(queue->q_sio_memused); atomic_add_64(&scn->scn_bytes_pending, -bytes_dequeued); range_tree_vacate(queue->q_exts_by_addr, NULL, queue); range_tree_destroy(queue->q_exts_by_addr); avl_destroy(&queue->q_sios_by_addr); cv_destroy(&queue->q_zio_cv); kmem_free(queue, sizeof (*queue)); } /* * Properly transfers a dsl_scan_queue_t from `svd' to `tvd'. This is * called on behalf of vdev_top_transfer when creating or destroying * a mirror vdev due to zpool attach/detach. */ void dsl_scan_io_queue_vdev_xfer(vdev_t *svd, vdev_t *tvd) { mutex_enter(&svd->vdev_scan_io_queue_lock); mutex_enter(&tvd->vdev_scan_io_queue_lock); VERIFY3P(tvd->vdev_scan_io_queue, ==, NULL); tvd->vdev_scan_io_queue = svd->vdev_scan_io_queue; svd->vdev_scan_io_queue = NULL; if (tvd->vdev_scan_io_queue != NULL) tvd->vdev_scan_io_queue->q_vd = tvd; mutex_exit(&tvd->vdev_scan_io_queue_lock); mutex_exit(&svd->vdev_scan_io_queue_lock); } static void scan_io_queues_destroy(dsl_scan_t *scn) { vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev; for (uint64_t i = 0; i < rvd->vdev_children; i++) { vdev_t *tvd = rvd->vdev_child[i]; mutex_enter(&tvd->vdev_scan_io_queue_lock); if (tvd->vdev_scan_io_queue != NULL) dsl_scan_io_queue_destroy(tvd->vdev_scan_io_queue); tvd->vdev_scan_io_queue = NULL; mutex_exit(&tvd->vdev_scan_io_queue_lock); } } static void dsl_scan_freed_dva(spa_t *spa, const blkptr_t *bp, int dva_i) { dsl_pool_t *dp = spa->spa_dsl_pool; dsl_scan_t *scn = dp->dp_scan; vdev_t *vdev; kmutex_t *q_lock; dsl_scan_io_queue_t *queue; scan_io_t *srch_sio, *sio; avl_index_t idx; uint64_t start, size; vdev = vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[dva_i])); ASSERT(vdev != NULL); q_lock = &vdev->vdev_scan_io_queue_lock; queue = vdev->vdev_scan_io_queue; mutex_enter(q_lock); if (queue == NULL) { mutex_exit(q_lock); return; } srch_sio = sio_alloc(BP_GET_NDVAS(bp)); bp2sio(bp, srch_sio, dva_i); start = SIO_GET_OFFSET(srch_sio); size = SIO_GET_ASIZE(srch_sio); /* * We can find the zio in two states: * 1) Cold, just sitting in the queue of zio's to be issued at * some point in the future. In this case, all we do is * remove the zio from the q_sios_by_addr tree, decrement * its data volume from the containing range_seg_t and * resort the q_exts_by_size tree to reflect that the * range_seg_t has lost some of its 'fill'. 
We don't shorten * the range_seg_t - this is usually rare enough not to be * worth the extra hassle of trying keep track of precise * extent boundaries. * 2) Hot, where the zio is currently in-flight in * dsl_scan_issue_ios. In this case, we can't simply * reach in and stop the in-flight zio's, so we instead * block the caller. Eventually, dsl_scan_issue_ios will * be done with issuing the zio's it gathered and will * signal us. */ sio = avl_find(&queue->q_sios_by_addr, srch_sio, &idx); sio_free(srch_sio); if (sio != NULL) { int64_t asize = SIO_GET_ASIZE(sio); blkptr_t tmpbp; /* Got it while it was cold in the queue */ ASSERT3U(start, ==, SIO_GET_OFFSET(sio)); ASSERT3U(size, ==, asize); avl_remove(&queue->q_sios_by_addr, sio); queue->q_sio_memused -= SIO_GET_MUSED(sio); ASSERT(range_tree_contains(queue->q_exts_by_addr, start, size)); range_tree_remove_fill(queue->q_exts_by_addr, start, size); /* * We only update scn_bytes_pending in the cold path, * otherwise it will already have been accounted for as * part of the zio's execution. */ atomic_add_64(&scn->scn_bytes_pending, -asize); /* count the block as though we issued it */ sio2bp(sio, &tmpbp); count_block(scn, dp->dp_blkstats, &tmpbp); sio_free(sio); } mutex_exit(q_lock); } /* * Callback invoked when a zio_free() zio is executing. This needs to be * intercepted to prevent the zio from deallocating a particular portion * of disk space and it then getting reallocated and written to, while we * still have it queued up for processing. */ void dsl_scan_freed(spa_t *spa, const blkptr_t *bp) { dsl_pool_t *dp = spa->spa_dsl_pool; dsl_scan_t *scn = dp->dp_scan; ASSERT(!BP_IS_EMBEDDED(bp)); ASSERT(scn != NULL); if (!dsl_scan_is_running(scn)) return; for (int i = 0; i < BP_GET_NDVAS(bp); i++) dsl_scan_freed_dva(spa, bp, i); } /* * Check if a vdev needs resilvering (non-empty DTL), if so, and resilver has * not started, start it. Otherwise, only restart if max txg in DTL range is * greater than the max txg in the current scan. If the DTL max is less than * the scan max, then the vdev has not missed any new data since the resilver * started, so a restart is not needed. 
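 *
 * For example, a device whose DTL covers txgs only up to 100 while a
 * resilver is already scanning through txg 200 does not trigger a
 * restart; only a DTL max beyond the current scan's max txg requests a
 * new (possibly deferred) resilver.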
*/ void dsl_scan_assess_vdev(dsl_pool_t *dp, vdev_t *vd) { uint64_t min, max; if (!vdev_resilver_needed(vd, &min, &max)) return; if (!dsl_scan_resilvering(dp)) { spa_async_request(dp->dp_spa, SPA_ASYNC_RESILVER); return; } if (max <= dp->dp_scan->scn_phys.scn_max_txg) return; /* restart is needed, check if it can be deferred */ if (spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER)) vdev_defer_resilver(vd); else spa_async_request(dp->dp_spa, SPA_ASYNC_RESILVER); } /* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs, zfs_, scan_vdev_limit, ULONG, ZMOD_RW, "Max bytes in flight per leaf vdev for scrubs and resilvers"); ZFS_MODULE_PARAM(zfs, zfs_, scrub_min_time_ms, INT, ZMOD_RW, "Min millisecs to scrub per txg"); ZFS_MODULE_PARAM(zfs, zfs_, obsolete_min_time_ms, INT, ZMOD_RW, "Min millisecs to obsolete per txg"); ZFS_MODULE_PARAM(zfs, zfs_, free_min_time_ms, INT, ZMOD_RW, "Min millisecs to free per txg"); ZFS_MODULE_PARAM(zfs, zfs_, resilver_min_time_ms, INT, ZMOD_RW, "Min millisecs to resilver per txg"); ZFS_MODULE_PARAM(zfs, zfs_, scan_suspend_progress, INT, ZMOD_RW, "Set to prevent scans from progressing"); ZFS_MODULE_PARAM(zfs, zfs_, no_scrub_io, INT, ZMOD_RW, "Set to disable scrub I/O"); ZFS_MODULE_PARAM(zfs, zfs_, no_scrub_prefetch, INT, ZMOD_RW, "Set to disable scrub prefetching"); ZFS_MODULE_PARAM(zfs, zfs_, async_block_max_blocks, ULONG, ZMOD_RW, "Max number of blocks freed in one txg"); ZFS_MODULE_PARAM(zfs, zfs_, max_async_dedup_frees, ULONG, ZMOD_RW, "Max number of dedup blocks freed in one txg"); ZFS_MODULE_PARAM(zfs, zfs_, free_bpobj_enabled, INT, ZMOD_RW, "Enable processing of the free_bpobj"); ZFS_MODULE_PARAM(zfs, zfs_, scan_mem_lim_fact, INT, ZMOD_RW, "Fraction of RAM for scan hard limit"); ZFS_MODULE_PARAM(zfs, zfs_, scan_issue_strategy, INT, ZMOD_RW, "IO issuing strategy during scrubbing. " "0 = default, 1 = LBA, 2 = size"); ZFS_MODULE_PARAM(zfs, zfs_, scan_legacy, INT, ZMOD_RW, "Scrub using legacy non-sequential method"); ZFS_MODULE_PARAM(zfs, zfs_, scan_checkpoint_intval, INT, ZMOD_RW, "Scan progress on-disk checkpointing interval"); ZFS_MODULE_PARAM(zfs, zfs_, scan_max_ext_gap, ULONG, ZMOD_RW, "Max gap in bytes between sequential scrub / resilver I/Os"); ZFS_MODULE_PARAM(zfs, zfs_, scan_mem_lim_soft_fact, INT, ZMOD_RW, "Fraction of hard limit used as soft limit"); ZFS_MODULE_PARAM(zfs, zfs_, scan_strict_mem_lim, INT, ZMOD_RW, "Tunable to attempt to reduce lock contention"); ZFS_MODULE_PARAM(zfs, zfs_, scan_fill_weight, INT, ZMOD_RW, "Tunable to adjust bias towards more filled segments during scans"); ZFS_MODULE_PARAM(zfs, zfs_, resilver_disable_defer, INT, ZMOD_RW, "Process all resilvers immediately"); /* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/spa_config.c b/sys/contrib/openzfs/module/zfs/spa_config.c index 4a3144313267..ad82932ce567 100644 --- a/sys/contrib/openzfs/module/zfs/spa_config.c +++ b/sys/contrib/openzfs/module/zfs/spa_config.c @@ -1,623 +1,623 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
* If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright 2017 Joyent, Inc. * Copyright (c) 2021, Colm Buckley */ #include #include #include #include #include -#include #include #include #include #include #include #include #include +#include #ifdef _KERNEL #include #endif /* * Pool configuration repository. * * Pool configuration is stored as a packed nvlist on the filesystem. By * default, all pools are stored in /etc/zfs/zpool.cache and loaded on boot * (when the ZFS module is loaded). Pools can also have the 'cachefile' * property set that allows them to be stored in an alternate location until * the control of external software. * * For each cache file, we have a single nvlist which holds all the * configuration information. When the module loads, we read this information * from /etc/zfs/zpool.cache and populate the SPA namespace. This namespace is * maintained independently in spa.c. Whenever the namespace is modified, or * the configuration of a pool is changed, we call spa_write_cachefile(), which * walks through all the active pools and writes the configuration to disk. */ static uint64_t spa_config_generation = 1; /* * This can be overridden in userland to preserve an alternate namespace for * userland pools when doing testing. */ char *spa_config_path = ZPOOL_CACHE; int zfs_autoimport_disable = 1; /* * Called when the module is first loaded, this routine loads the configuration * file into the SPA namespace. It does not actually open or load the pools; it * only populates the namespace. */ void spa_config_load(void) { void *buf = NULL; nvlist_t *nvlist, *child; nvpair_t *nvpair; char *pathname; zfs_file_t *fp; zfs_file_attr_t zfa; uint64_t fsize; int err; #ifdef _KERNEL if (zfs_autoimport_disable) return; #endif /* * Open the configuration file. */ pathname = kmem_alloc(MAXPATHLEN, KM_SLEEP); (void) snprintf(pathname, MAXPATHLEN, "%s", spa_config_path); err = zfs_file_open(pathname, O_RDONLY, 0, &fp); #ifdef __FreeBSD__ if (err) err = zfs_file_open(ZPOOL_CACHE_BOOT, O_RDONLY, 0, &fp); #endif kmem_free(pathname, MAXPATHLEN); if (err) return; if (zfs_file_getattr(fp, &zfa)) goto out; fsize = zfa.zfa_size; buf = kmem_alloc(fsize, KM_SLEEP); /* * Read the nvlist from the file. */ if (zfs_file_read(fp, buf, fsize, NULL) < 0) goto out; /* * Unpack the nvlist. */ if (nvlist_unpack(buf, fsize, &nvlist, KM_SLEEP) != 0) goto out; /* * Iterate over all elements in the nvlist, creating a new spa_t for * each one with the specified configuration. */ mutex_enter(&spa_namespace_lock); nvpair = NULL; while ((nvpair = nvlist_next_nvpair(nvlist, nvpair)) != NULL) { if (nvpair_type(nvpair) != DATA_TYPE_NVLIST) continue; child = fnvpair_value_nvlist(nvpair); if (spa_lookup(nvpair_name(nvpair)) != NULL) continue; (void) spa_add(nvpair_name(nvpair), child, NULL); } mutex_exit(&spa_namespace_lock); nvlist_free(nvlist); out: if (buf != NULL) kmem_free(buf, fsize); zfs_file_close(fp); } static int spa_config_remove(spa_config_dirent_t *dp) { int error = 0; /* * Remove the cache file. If zfs_file_unlink() in not supported by the * platform fallback to truncating the file which is functionally * equivalent. 
*/ error = zfs_file_unlink(dp->scd_path); if (error == EOPNOTSUPP) { int flags = O_RDWR | O_TRUNC; zfs_file_t *fp; error = zfs_file_open(dp->scd_path, flags, 0644, &fp); if (error == 0) { (void) zfs_file_fsync(fp, O_SYNC); (void) zfs_file_close(fp); } } return (error); } static int spa_config_write(spa_config_dirent_t *dp, nvlist_t *nvl) { size_t buflen; char *buf; int oflags = O_RDWR | O_TRUNC | O_CREAT | O_LARGEFILE; char *temp; int err; zfs_file_t *fp; /* * If the nvlist is empty (NULL), then remove the old cachefile. */ if (nvl == NULL) { err = spa_config_remove(dp); if (err == ENOENT) err = 0; return (err); } /* * Pack the configuration into a buffer. */ buf = fnvlist_pack(nvl, &buflen); temp = kmem_zalloc(MAXPATHLEN, KM_SLEEP); /* * Write the configuration to disk. Due to the complexity involved * in performing a rename and remove from within the kernel the file * is instead truncated and overwritten in place. This way we always * have a consistent view of the data or a zero length file. */ err = zfs_file_open(dp->scd_path, oflags, 0644, &fp); if (err == 0) { err = zfs_file_write(fp, buf, buflen, NULL); if (err == 0) err = zfs_file_fsync(fp, O_SYNC); zfs_file_close(fp); if (err) (void) spa_config_remove(dp); } fnvlist_pack_free(buf, buflen); kmem_free(temp, MAXPATHLEN); return (err); } /* * Synchronize pool configuration to disk. This must be called with the * namespace lock held. Synchronizing the pool cache is typically done after * the configuration has been synced to the MOS. This exposes a window where * the MOS config will have been updated but the cache file has not. If * the system were to crash at that instant then the cached config may not * contain the correct information to open the pool and an explicit import * would be required. */ void spa_write_cachefile(spa_t *target, boolean_t removing, boolean_t postsysevent) { spa_config_dirent_t *dp, *tdp; nvlist_t *nvl; char *pool_name; boolean_t ccw_failure; int error = 0; ASSERT(MUTEX_HELD(&spa_namespace_lock)); if (!(spa_mode_global & SPA_MODE_WRITE)) return; /* * Iterate over all cachefiles for the pool, past or present. When the * cachefile is changed, the new one is pushed onto this list, allowing * us to update previous cachefiles that no longer contain this pool. */ ccw_failure = B_FALSE; for (dp = list_head(&target->spa_config_list); dp != NULL; dp = list_next(&target->spa_config_list, dp)) { spa_t *spa = NULL; if (dp->scd_path == NULL) continue; /* * Iterate over all pools, adding any matching pools to 'nvl'. */ nvl = NULL; while ((spa = spa_next(spa)) != NULL) { /* * Skip over our own pool if we're about to remove * ourselves from the spa namespace or any pool that * is readonly. Since we cannot guarantee that a * readonly pool would successfully import upon reboot, * we don't allow them to be written to the cache file. 
*/ if ((spa == target && removing) || !spa_writeable(spa)) continue; mutex_enter(&spa->spa_props_lock); tdp = list_head(&spa->spa_config_list); if (spa->spa_config == NULL || tdp == NULL || tdp->scd_path == NULL || strcmp(tdp->scd_path, dp->scd_path) != 0) { mutex_exit(&spa->spa_props_lock); continue; } if (nvl == NULL) nvl = fnvlist_alloc(); if (spa->spa_import_flags & ZFS_IMPORT_TEMP_NAME) pool_name = fnvlist_lookup_string( spa->spa_config, ZPOOL_CONFIG_POOL_NAME); else pool_name = spa_name(spa); fnvlist_add_nvlist(nvl, pool_name, spa->spa_config); mutex_exit(&spa->spa_props_lock); } error = spa_config_write(dp, nvl); if (error != 0) ccw_failure = B_TRUE; nvlist_free(nvl); } if (ccw_failure) { /* * Keep trying so that configuration data is * written if/when any temporary filesystem * resource issues are resolved. */ if (target->spa_ccw_fail_time == 0) { (void) zfs_ereport_post( FM_EREPORT_ZFS_CONFIG_CACHE_WRITE, target, NULL, NULL, NULL, 0); } target->spa_ccw_fail_time = gethrtime(); spa_async_request(target, SPA_ASYNC_CONFIG_UPDATE); } else { /* * Do not rate limit future attempts to update * the config cache. */ target->spa_ccw_fail_time = 0; } /* * Remove any config entries older than the current one. */ dp = list_head(&target->spa_config_list); while ((tdp = list_next(&target->spa_config_list, dp)) != NULL) { list_remove(&target->spa_config_list, tdp); if (tdp->scd_path != NULL) spa_strfree(tdp->scd_path); kmem_free(tdp, sizeof (spa_config_dirent_t)); } spa_config_generation++; if (postsysevent) spa_event_notify(target, NULL, NULL, ESC_ZFS_CONFIG_SYNC); } /* * Sigh. Inside a local zone, we don't have access to /etc/zfs/zpool.cache, * and we don't want to allow the local zone to see all the pools anyway. * So we have to invent the ZFS_IOC_CONFIG ioctl to grab the configuration * information for all pool visible within the zone. */ nvlist_t * spa_all_configs(uint64_t *generation) { nvlist_t *pools; spa_t *spa = NULL; if (*generation == spa_config_generation) return (NULL); pools = fnvlist_alloc(); mutex_enter(&spa_namespace_lock); while ((spa = spa_next(spa)) != NULL) { if (INGLOBALZONE(curproc) || zone_dataset_visible(spa_name(spa), NULL)) { mutex_enter(&spa->spa_props_lock); fnvlist_add_nvlist(pools, spa_name(spa), spa->spa_config); mutex_exit(&spa->spa_props_lock); } } *generation = spa_config_generation; mutex_exit(&spa_namespace_lock); return (pools); } void spa_config_set(spa_t *spa, nvlist_t *config) { mutex_enter(&spa->spa_props_lock); if (spa->spa_config != NULL && spa->spa_config != config) nvlist_free(spa->spa_config); spa->spa_config = config; mutex_exit(&spa->spa_props_lock); } /* * Generate the pool's configuration based on the current in-core state. * * We infer whether to generate a complete config or just one top-level config * based on whether vd is the root vdev. */ nvlist_t * spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) { nvlist_t *config, *nvroot; vdev_t *rvd = spa->spa_root_vdev; unsigned long hostid = 0; boolean_t locked = B_FALSE; uint64_t split_guid; char *pool_name; if (vd == NULL) { vd = rvd; locked = B_TRUE; spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); } ASSERT(spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_READER) == (SCL_CONFIG | SCL_STATE)); /* * If txg is -1, report the current value of spa->spa_config_txg. 
*/ if (txg == -1ULL) txg = spa->spa_config_txg; /* * Originally, users had to handle spa namespace collisions by either * exporting the already imported pool or by specifying a new name for * the pool with a conflicting name. In the case of root pools from * virtual guests, neither approach to collision resolution is * reasonable. This is addressed by extending the new name syntax with * an option to specify that the new name is temporary. When specified, * ZFS_IMPORT_TEMP_NAME will be set in spa->spa_import_flags to tell us * to use the previous name, which we do below. */ if (spa->spa_import_flags & ZFS_IMPORT_TEMP_NAME) { VERIFY0(nvlist_lookup_string(spa->spa_config, ZPOOL_CONFIG_POOL_NAME, &pool_name)); } else pool_name = spa_name(spa); config = fnvlist_alloc(); fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, spa_version(spa)); fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, pool_name); fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, spa_state(spa)); fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, txg); fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, spa_guid(spa)); fnvlist_add_uint64(config, ZPOOL_CONFIG_ERRATA, spa->spa_errata); if (spa->spa_comment != NULL) fnvlist_add_string(config, ZPOOL_CONFIG_COMMENT, spa->spa_comment); if (spa->spa_compatibility != NULL) fnvlist_add_string(config, ZPOOL_CONFIG_COMPATIBILITY, spa->spa_compatibility); hostid = spa_get_hostid(spa); if (hostid != 0) fnvlist_add_uint64(config, ZPOOL_CONFIG_HOSTID, hostid); fnvlist_add_string(config, ZPOOL_CONFIG_HOSTNAME, utsname()->nodename); int config_gen_flags = 0; if (vd != rvd) { fnvlist_add_uint64(config, ZPOOL_CONFIG_TOP_GUID, vd->vdev_top->vdev_guid); fnvlist_add_uint64(config, ZPOOL_CONFIG_GUID, vd->vdev_guid); if (vd->vdev_isspare) fnvlist_add_uint64(config, ZPOOL_CONFIG_IS_SPARE, 1ULL); if (vd->vdev_islog) fnvlist_add_uint64(config, ZPOOL_CONFIG_IS_LOG, 1ULL); vd = vd->vdev_top; /* label contains top config */ } else { /* * Only add the (potentially large) split information * in the mos config, and not in the vdev labels */ if (spa->spa_config_splitting != NULL) fnvlist_add_nvlist(config, ZPOOL_CONFIG_SPLIT, spa->spa_config_splitting); fnvlist_add_boolean(config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS); config_gen_flags |= VDEV_CONFIG_MOS; } /* * Add the top-level config. We even add this on pools which * don't support holes in the namespace. */ vdev_top_config_generate(spa, config); /* * If we're splitting, record the original pool's guid. */ if (spa->spa_config_splitting != NULL && nvlist_lookup_uint64(spa->spa_config_splitting, ZPOOL_CONFIG_SPLIT_GUID, &split_guid) == 0) { fnvlist_add_uint64(config, ZPOOL_CONFIG_SPLIT_GUID, split_guid); } nvroot = vdev_config_generate(spa, vd, getstats, config_gen_flags); fnvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot); nvlist_free(nvroot); /* * Store what's necessary for reading the MOS in the label. 
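 * (spa_label_features is the set of features a system must understand in
 * order to read this pool at all; publishing it in the label lets an
 * importing system tell up front whether it can parse the MOS.)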
*/ fnvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURES_FOR_READ, spa->spa_label_features); if (getstats && spa_load_state(spa) == SPA_LOAD_NONE) { ddt_histogram_t *ddh; ddt_stat_t *dds; ddt_object_t *ddo; ddh = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP); ddt_get_dedup_histogram(spa, ddh); fnvlist_add_uint64_array(config, ZPOOL_CONFIG_DDT_HISTOGRAM, (uint64_t *)ddh, sizeof (*ddh) / sizeof (uint64_t)); kmem_free(ddh, sizeof (ddt_histogram_t)); ddo = kmem_zalloc(sizeof (ddt_object_t), KM_SLEEP); ddt_get_dedup_object_stats(spa, ddo); fnvlist_add_uint64_array(config, ZPOOL_CONFIG_DDT_OBJ_STATS, (uint64_t *)ddo, sizeof (*ddo) / sizeof (uint64_t)); kmem_free(ddo, sizeof (ddt_object_t)); dds = kmem_zalloc(sizeof (ddt_stat_t), KM_SLEEP); ddt_get_dedup_stats(spa, dds); fnvlist_add_uint64_array(config, ZPOOL_CONFIG_DDT_STATS, (uint64_t *)dds, sizeof (*dds) / sizeof (uint64_t)); kmem_free(dds, sizeof (ddt_stat_t)); } if (locked) spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); return (config); } /* * Update all disk labels, generate a fresh config based on the current * in-core state, and sync the global config cache (do not sync the config * cache if this is a booting rootpool). */ void spa_config_update(spa_t *spa, int what) { vdev_t *rvd = spa->spa_root_vdev; uint64_t txg; int c; ASSERT(MUTEX_HELD(&spa_namespace_lock)); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); txg = spa_last_synced_txg(spa) + 1; if (what == SPA_CONFIG_UPDATE_POOL) { vdev_config_dirty(rvd); } else { /* * If we have top-level vdevs that were added but have * not yet been prepared for allocation, do that now. * (It's safe now because the config cache is up to date, * so it will be able to translate the new DVAs.) * See comments in spa_vdev_add() for full details. */ for (c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; /* * Explicitly skip vdevs that are indirect or * log vdevs that are being removed. The reason * is that both of those can have vdev_ms_array * set to 0 and we wouldn't want to change their * metaslab size nor call vdev_expand() on them. */ if (!vdev_is_concrete(tvd) || (tvd->vdev_islog && tvd->vdev_removing)) continue; if (tvd->vdev_ms_array == 0) vdev_metaslab_set_size(tvd); vdev_expand(tvd, txg); } } spa_config_exit(spa, SCL_ALL, FTAG); /* * Wait for the mosconfig to be regenerated and synced. */ txg_wait_synced(spa->spa_dsl_pool, txg); /* * Update the global config cache to reflect the new mosconfig. */ if (!spa->spa_is_root) { spa_write_cachefile(spa, B_FALSE, what != SPA_CONFIG_UPDATE_POOL); } if (what == SPA_CONFIG_UPDATE_POOL) spa_config_update(spa, SPA_CONFIG_UPDATE_VDEVS); } EXPORT_SYMBOL(spa_config_load); EXPORT_SYMBOL(spa_all_configs); EXPORT_SYMBOL(spa_config_set); EXPORT_SYMBOL(spa_config_generate); EXPORT_SYMBOL(spa_config_update); /* BEGIN CSTYLED */ #ifdef __linux__ /* string sysctls require a char array on FreeBSD */ ZFS_MODULE_PARAM(zfs_spa, spa_, config_path, STRING, ZMOD_RD, "SPA config file (/etc/zfs/zpool.cache)"); #endif ZFS_MODULE_PARAM(zfs, zfs_, autoimport_disable, INT, ZMOD_RW, "Disable pool import at module load"); /* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/vdev.c b/sys/contrib/openzfs/module/zfs/vdev.c index 553e470ad461..ad4f3efb87b1 100644 --- a/sys/contrib/openzfs/module/zfs/vdev.c +++ b/sys/contrib/openzfs/module/zfs/vdev.c @@ -1,5420 +1,5423 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). 
* You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2020 by Delphix. All rights reserved. + * Copyright (c) 2011, 2021 by Delphix. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Toomas Soome * Copyright 2017 Joyent, Inc. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019, Datto Inc. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * One metaslab from each (normal-class) vdev is used by the ZIL. These are * called "embedded slog metaslabs", are referenced by vdev_log_mg, and are * part of the spa_embedded_log_class. The metaslab with the most free space * in each vdev is selected for this purpose when the pool is opened (or a * vdev is added). See vdev_metaslab_init(). * * Log blocks can be allocated from the following locations. Each one is tried * in order until the allocation succeeds: * 1. dedicated log vdevs, aka "slog" (spa_log_class) * 2. embedded slog metaslabs (spa_embedded_log_class) * 3. other metaslabs in normal vdevs (spa_normal_class) * * zfs_embedded_slog_min_ms disables the embedded slog if there are fewer * than this number of metaslabs in the vdev. This ensures that we don't set * aside an unreasonable amount of space for the ZIL. 
If set to less than * 1 << (spa_slop_shift + 1), on small pools the usable space may be reduced * (by more than 1<vdev_path != NULL) { zfs_dbgmsg("%s vdev '%s': %s", vd->vdev_ops->vdev_op_type, vd->vdev_path, buf); } else { zfs_dbgmsg("%s-%llu vdev (guid %llu): %s", vd->vdev_ops->vdev_op_type, (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid, buf); } } void vdev_dbgmsg_print_tree(vdev_t *vd, int indent) { char state[20]; if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) { zfs_dbgmsg("%*svdev %u: %s", indent, "", vd->vdev_id, vd->vdev_ops->vdev_op_type); return; } switch (vd->vdev_state) { case VDEV_STATE_UNKNOWN: (void) snprintf(state, sizeof (state), "unknown"); break; case VDEV_STATE_CLOSED: (void) snprintf(state, sizeof (state), "closed"); break; case VDEV_STATE_OFFLINE: (void) snprintf(state, sizeof (state), "offline"); break; case VDEV_STATE_REMOVED: (void) snprintf(state, sizeof (state), "removed"); break; case VDEV_STATE_CANT_OPEN: (void) snprintf(state, sizeof (state), "can't open"); break; case VDEV_STATE_FAULTED: (void) snprintf(state, sizeof (state), "faulted"); break; case VDEV_STATE_DEGRADED: (void) snprintf(state, sizeof (state), "degraded"); break; case VDEV_STATE_HEALTHY: (void) snprintf(state, sizeof (state), "healthy"); break; default: (void) snprintf(state, sizeof (state), "", (uint_t)vd->vdev_state); } zfs_dbgmsg("%*svdev %u: %s%s, guid: %llu, path: %s, %s", indent, "", (int)vd->vdev_id, vd->vdev_ops->vdev_op_type, vd->vdev_islog ? " (log)" : "", (u_longlong_t)vd->vdev_guid, vd->vdev_path ? vd->vdev_path : "N/A", state); for (uint64_t i = 0; i < vd->vdev_children; i++) vdev_dbgmsg_print_tree(vd->vdev_child[i], indent + 2); } /* * Virtual device management. */ static vdev_ops_t *vdev_ops_table[] = { &vdev_root_ops, &vdev_raidz_ops, &vdev_draid_ops, &vdev_draid_spare_ops, &vdev_mirror_ops, &vdev_replacing_ops, &vdev_spare_ops, &vdev_disk_ops, &vdev_file_ops, &vdev_missing_ops, &vdev_hole_ops, &vdev_indirect_ops, NULL }; /* * Given a vdev type, return the appropriate ops vector. */ static vdev_ops_t * vdev_getops(const char *type) { vdev_ops_t *ops, **opspp; for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++) if (strcmp(ops->vdev_op_type, type) == 0) break; return (ops); } /* * Given a vdev and a metaslab class, find which metaslab group we're * interested in. All vdevs may belong to two different metaslab classes. * Dedicated slog devices use only the primary metaslab group, rather than a * separate log group. For embedded slogs, the vdev_log_mg will be non-NULL. */ metaslab_group_t * vdev_get_mg(vdev_t *vd, metaslab_class_t *mc) { if (mc == spa_embedded_log_class(vd->vdev_spa) && vd->vdev_log_mg != NULL) return (vd->vdev_log_mg); else return (vd->vdev_mg); } /* ARGSUSED */ void vdev_default_xlate(vdev_t *vd, const range_seg64_t *logical_rs, range_seg64_t *physical_rs, range_seg64_t *remain_rs) { physical_rs->rs_start = logical_rs->rs_start; physical_rs->rs_end = logical_rs->rs_end; } /* * Derive the enumerated allocation bias from string input. * String origin is either the per-vdev zap or zpool(8). 
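 * For example, VDEV_ALLOC_BIAS_LOG maps to VDEV_BIAS_LOG, _SPECIAL to
 * VDEV_BIAS_SPECIAL and _DEDUP to VDEV_BIAS_DEDUP; any unrecognized
 * string falls back to VDEV_BIAS_NONE.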
*/ static vdev_alloc_bias_t vdev_derive_alloc_bias(const char *bias) { vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE; if (strcmp(bias, VDEV_ALLOC_BIAS_LOG) == 0) alloc_bias = VDEV_BIAS_LOG; else if (strcmp(bias, VDEV_ALLOC_BIAS_SPECIAL) == 0) alloc_bias = VDEV_BIAS_SPECIAL; else if (strcmp(bias, VDEV_ALLOC_BIAS_DEDUP) == 0) alloc_bias = VDEV_BIAS_DEDUP; return (alloc_bias); } /* * Default asize function: return the MAX of psize with the asize of * all children. This is what's used by anything other than RAID-Z. */ uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize) { uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift); uint64_t csize; for (int c = 0; c < vd->vdev_children; c++) { csize = vdev_psize_to_asize(vd->vdev_child[c], psize); asize = MAX(asize, csize); } return (asize); } uint64_t vdev_default_min_asize(vdev_t *vd) { return (vd->vdev_min_asize); } /* * Get the minimum allocatable size. We define the allocatable size as * the vdev's asize rounded to the nearest metaslab. This allows us to * replace or attach devices which don't have the same physical size but * can still satisfy the same number of allocations. */ uint64_t vdev_get_min_asize(vdev_t *vd) { vdev_t *pvd = vd->vdev_parent; /* * If our parent is NULL (inactive spare or cache) or is the root, * just return our own asize. */ if (pvd == NULL) return (vd->vdev_asize); /* * The top-level vdev just returns the allocatable size rounded * to the nearest metaslab. */ if (vd == vd->vdev_top) return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift)); return (pvd->vdev_ops->vdev_op_min_asize(pvd)); } void vdev_set_min_asize(vdev_t *vd) { vd->vdev_min_asize = vdev_get_min_asize(vd); for (int c = 0; c < vd->vdev_children; c++) vdev_set_min_asize(vd->vdev_child[c]); } /* * Get the minimal allocation size for the top-level vdev. */ uint64_t vdev_get_min_alloc(vdev_t *vd) { uint64_t min_alloc = 1ULL << vd->vdev_ashift; if (vd->vdev_ops->vdev_op_min_alloc != NULL) min_alloc = vd->vdev_ops->vdev_op_min_alloc(vd); return (min_alloc); } /* * Get the parity level for a top-level vdev. */ uint64_t vdev_get_nparity(vdev_t *vd) { uint64_t nparity = 0; if (vd->vdev_ops->vdev_op_nparity != NULL) nparity = vd->vdev_ops->vdev_op_nparity(vd); return (nparity); } /* * Get the number of data disks for a top-level vdev. 
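 * Defaults to 1 for vdev types that don't supply a vdev_op_ndisks
 * callback; types that stripe data across multiple children (e.g.
 * raidz/draid) report their own value.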
*/ uint64_t vdev_get_ndisks(vdev_t *vd) { uint64_t ndisks = 1; if (vd->vdev_ops->vdev_op_ndisks != NULL) ndisks = vd->vdev_ops->vdev_op_ndisks(vd); return (ndisks); } vdev_t * vdev_lookup_top(spa_t *spa, uint64_t vdev) { vdev_t *rvd = spa->spa_root_vdev; ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); if (vdev < rvd->vdev_children) { ASSERT(rvd->vdev_child[vdev] != NULL); return (rvd->vdev_child[vdev]); } return (NULL); } vdev_t * vdev_lookup_by_guid(vdev_t *vd, uint64_t guid) { vdev_t *mvd; if (vd->vdev_guid == guid) return (vd); for (int c = 0; c < vd->vdev_children; c++) if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) != NULL) return (mvd); return (NULL); } static int vdev_count_leaves_impl(vdev_t *vd) { int n = 0; if (vd->vdev_ops->vdev_op_leaf) return (1); for (int c = 0; c < vd->vdev_children; c++) n += vdev_count_leaves_impl(vd->vdev_child[c]); return (n); } int vdev_count_leaves(spa_t *spa) { int rc; spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); rc = vdev_count_leaves_impl(spa->spa_root_vdev); spa_config_exit(spa, SCL_VDEV, FTAG); return (rc); } void vdev_add_child(vdev_t *pvd, vdev_t *cvd) { size_t oldsize, newsize; uint64_t id = cvd->vdev_id; vdev_t **newchild; ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); ASSERT(cvd->vdev_parent == NULL); cvd->vdev_parent = pvd; if (pvd == NULL) return; ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL); oldsize = pvd->vdev_children * sizeof (vdev_t *); pvd->vdev_children = MAX(pvd->vdev_children, id + 1); newsize = pvd->vdev_children * sizeof (vdev_t *); newchild = kmem_alloc(newsize, KM_SLEEP); if (pvd->vdev_child != NULL) { bcopy(pvd->vdev_child, newchild, oldsize); kmem_free(pvd->vdev_child, oldsize); } pvd->vdev_child = newchild; pvd->vdev_child[id] = cvd; cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd); ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL); /* * Walk up all ancestors to update guid sum. */ for (; pvd != NULL; pvd = pvd->vdev_parent) pvd->vdev_guid_sum += cvd->vdev_guid_sum; if (cvd->vdev_ops->vdev_op_leaf) { list_insert_head(&cvd->vdev_spa->spa_leaf_list, cvd); cvd->vdev_spa->spa_leaf_list_gen++; } } void vdev_remove_child(vdev_t *pvd, vdev_t *cvd) { int c; uint_t id = cvd->vdev_id; ASSERT(cvd->vdev_parent == pvd); if (pvd == NULL) return; ASSERT(id < pvd->vdev_children); ASSERT(pvd->vdev_child[id] == cvd); pvd->vdev_child[id] = NULL; cvd->vdev_parent = NULL; for (c = 0; c < pvd->vdev_children; c++) if (pvd->vdev_child[c]) break; if (c == pvd->vdev_children) { kmem_free(pvd->vdev_child, c * sizeof (vdev_t *)); pvd->vdev_child = NULL; pvd->vdev_children = 0; } if (cvd->vdev_ops->vdev_op_leaf) { spa_t *spa = cvd->vdev_spa; list_remove(&spa->spa_leaf_list, cvd); spa->spa_leaf_list_gen++; } /* * Walk up all ancestors to update guid sum. */ for (; pvd != NULL; pvd = pvd->vdev_parent) pvd->vdev_guid_sum -= cvd->vdev_guid_sum; } /* * Remove any holes in the child array. 
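 * For example, a child array of [c0, NULL, c2] is compacted to [c0, c2]
 * and the surviving children are renumbered, so c2's vdev_id becomes 1.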
*/ void vdev_compact_children(vdev_t *pvd) { vdev_t **newchild, *cvd; int oldc = pvd->vdev_children; int newc; ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); if (oldc == 0) return; for (int c = newc = 0; c < oldc; c++) if (pvd->vdev_child[c]) newc++; if (newc > 0) { newchild = kmem_zalloc(newc * sizeof (vdev_t *), KM_SLEEP); for (int c = newc = 0; c < oldc; c++) { if ((cvd = pvd->vdev_child[c]) != NULL) { newchild[newc] = cvd; cvd->vdev_id = newc++; } } } else { newchild = NULL; } kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *)); pvd->vdev_child = newchild; pvd->vdev_children = newc; } /* * Allocate and minimally initialize a vdev_t. */ vdev_t * vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) { vdev_t *vd; vdev_indirect_config_t *vic; vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP); vic = &vd->vdev_indirect_config; if (spa->spa_root_vdev == NULL) { ASSERT(ops == &vdev_root_ops); spa->spa_root_vdev = vd; spa->spa_load_guid = spa_generate_guid(NULL); } if (guid == 0 && ops != &vdev_hole_ops) { if (spa->spa_root_vdev == vd) { /* * The root vdev's guid will also be the pool guid, * which must be unique among all pools. */ guid = spa_generate_guid(NULL); } else { /* * Any other vdev's guid must be unique within the pool. */ guid = spa_generate_guid(spa); } ASSERT(!spa_guid_exists(spa_guid(spa), guid)); } vd->vdev_spa = spa; vd->vdev_id = id; vd->vdev_guid = guid; vd->vdev_guid_sum = guid; vd->vdev_ops = ops; vd->vdev_state = VDEV_STATE_CLOSED; vd->vdev_ishole = (ops == &vdev_hole_ops); vic->vic_prev_indirect_vdev = UINT64_MAX; rw_init(&vd->vdev_indirect_rwlock, NULL, RW_DEFAULT, NULL); mutex_init(&vd->vdev_obsolete_lock, NULL, MUTEX_DEFAULT, NULL); vd->vdev_obsolete_segments = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); /* * Initialize rate limit structs for events. We rate limit ZIO delay * and checksum events so that we don't overwhelm ZED with thousands * of events when a disk is acting up. 
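 * The per-second limits come from the zfs_slow_io_events_per_second and
 * zfs_checksum_events_per_second tunables.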
*/ zfs_ratelimit_init(&vd->vdev_delay_rl, &zfs_slow_io_events_per_second, 1); zfs_ratelimit_init(&vd->vdev_checksum_rl, &zfs_checksum_events_per_second, 1); list_link_init(&vd->vdev_config_dirty_node); list_link_init(&vd->vdev_state_dirty_node); list_link_init(&vd->vdev_initialize_node); list_link_init(&vd->vdev_leaf_node); list_link_init(&vd->vdev_trim_node); mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_NOLOCKDEP, NULL); mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_scan_io_queue_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_initialize_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_initialize_io_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&vd->vdev_initialize_cv, NULL, CV_DEFAULT, NULL); cv_init(&vd->vdev_initialize_io_cv, NULL, CV_DEFAULT, NULL); mutex_init(&vd->vdev_trim_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_autotrim_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_trim_io_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&vd->vdev_trim_cv, NULL, CV_DEFAULT, NULL); cv_init(&vd->vdev_autotrim_cv, NULL, CV_DEFAULT, NULL); cv_init(&vd->vdev_trim_io_cv, NULL, CV_DEFAULT, NULL); mutex_init(&vd->vdev_rebuild_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&vd->vdev_rebuild_cv, NULL, CV_DEFAULT, NULL); for (int t = 0; t < DTL_TYPES; t++) { vd->vdev_dtl[t] = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); } txg_list_create(&vd->vdev_ms_list, spa, offsetof(struct metaslab, ms_txg_node)); txg_list_create(&vd->vdev_dtl_list, spa, offsetof(struct vdev, vdev_dtl_node)); vd->vdev_stat.vs_timestamp = gethrtime(); vdev_queue_init(vd); vdev_cache_init(vd); return (vd); } /* * Allocate a new vdev. The 'alloctype' is used to control whether we are * creating a new vdev or loading an existing one - the behavior is slightly * different for each case. */ int vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, int alloctype) { vdev_ops_t *ops; char *type; uint64_t guid = 0, islog; vdev_t *vd; vdev_indirect_config_t *vic; char *tmp = NULL; int rc; vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE; boolean_t top_level = (parent && !parent->vdev_parent); ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0) return (SET_ERROR(EINVAL)); if ((ops = vdev_getops(type)) == NULL) return (SET_ERROR(EINVAL)); /* * If this is a load, get the vdev guid from the nvlist. * Otherwise, vdev_alloc_common() will generate one for us. */ if (alloctype == VDEV_ALLOC_LOAD) { uint64_t label_id; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) || label_id != id) return (SET_ERROR(EINVAL)); if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (SET_ERROR(EINVAL)); } else if (alloctype == VDEV_ALLOC_SPARE) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (SET_ERROR(EINVAL)); } else if (alloctype == VDEV_ALLOC_L2CACHE) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (SET_ERROR(EINVAL)); } else if (alloctype == VDEV_ALLOC_ROOTPOOL) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (SET_ERROR(EINVAL)); } /* * The first allocated vdev must be of type 'root'. */ if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL) return (SET_ERROR(EINVAL)); /* * Determine whether we're a log vdev. 
*/ islog = 0; (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog); if (islog && spa_version(spa) < SPA_VERSION_SLOGS) return (SET_ERROR(ENOTSUP)); if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES) return (SET_ERROR(ENOTSUP)); if (top_level && alloctype == VDEV_ALLOC_ADD) { char *bias; /* * If creating a top-level vdev, check for allocation * classes input. */ if (nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, &bias) == 0) { alloc_bias = vdev_derive_alloc_bias(bias); /* spa_vdev_add() expects feature to be enabled */ if (spa->spa_load_state != SPA_LOAD_CREATE && !spa_feature_is_enabled(spa, SPA_FEATURE_ALLOCATION_CLASSES)) { return (SET_ERROR(ENOTSUP)); } } /* spa_vdev_add() expects feature to be enabled */ if (ops == &vdev_draid_ops && spa->spa_load_state != SPA_LOAD_CREATE && !spa_feature_is_enabled(spa, SPA_FEATURE_DRAID)) { return (SET_ERROR(ENOTSUP)); } } /* * Initialize the vdev specific data. This is done before calling * vdev_alloc_common() since it may fail and this simplifies the * error reporting and cleanup code paths. */ void *tsd = NULL; if (ops->vdev_op_init != NULL) { rc = ops->vdev_op_init(spa, nv, &tsd); if (rc != 0) { return (rc); } } vd = vdev_alloc_common(spa, id, guid, ops); vd->vdev_tsd = tsd; vd->vdev_islog = islog; if (top_level && alloc_bias != VDEV_BIAS_NONE) vd->vdev_alloc_bias = alloc_bias; if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0) vd->vdev_path = spa_strdup(vd->vdev_path); /* * ZPOOL_CONFIG_AUX_STATE = "external" means we previously forced a * fault on a vdev and want it to persist across imports (like with * zpool offline -f). */ rc = nvlist_lookup_string(nv, ZPOOL_CONFIG_AUX_STATE, &tmp); if (rc == 0 && tmp != NULL && strcmp(tmp, "external") == 0) { vd->vdev_stat.vs_aux = VDEV_AUX_EXTERNAL; vd->vdev_faulted = 1; vd->vdev_label_aux = VDEV_AUX_EXTERNAL; } if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0) vd->vdev_devid = spa_strdup(vd->vdev_devid); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH, &vd->vdev_physpath) == 0) vd->vdev_physpath = spa_strdup(vd->vdev_physpath); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, &vd->vdev_enc_sysfs_path) == 0) vd->vdev_enc_sysfs_path = spa_strdup(vd->vdev_enc_sysfs_path); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &vd->vdev_fru) == 0) vd->vdev_fru = spa_strdup(vd->vdev_fru); /* * Set the whole_disk property. If it's not specified, leave the value * as -1. */ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &vd->vdev_wholedisk) != 0) vd->vdev_wholedisk = -1ULL; vic = &vd->vdev_indirect_config; ASSERT0(vic->vic_mapping_object); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT, &vic->vic_mapping_object); ASSERT0(vic->vic_births_object); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_BIRTHS, &vic->vic_births_object); ASSERT3U(vic->vic_prev_indirect_vdev, ==, UINT64_MAX); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_PREV_INDIRECT_VDEV, &vic->vic_prev_indirect_vdev); /* * Look for the 'not present' flag. This will only be set if the device * was not present at the time of import. */ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, &vd->vdev_not_present); /* * Get the alignment requirement. */ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift); /* * Retrieve the vdev creation time. */ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, &vd->vdev_crtxg); /* * If we're a top-level vdev, try to load the allocation parameters. 
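 * (metaslab array object and shift, asize, the removing flag and the
 * top-level vdev ZAP); these are only read for VDEV_ALLOC_LOAD and
 * VDEV_ALLOC_SPLIT.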
*/ if (top_level && (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, &vd->vdev_ms_array); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT, &vd->vdev_ms_shift); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE, &vd->vdev_asize); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING, &vd->vdev_removing); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP, &vd->vdev_top_zap); } else { ASSERT0(vd->vdev_top_zap); } if (top_level && alloctype != VDEV_ALLOC_ATTACH) { ASSERT(alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_ADD || alloctype == VDEV_ALLOC_SPLIT || alloctype == VDEV_ALLOC_ROOTPOOL); /* Note: metaslab_group_create() is now deferred */ } if (vd->vdev_ops->vdev_op_leaf && (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_LEAF_ZAP, &vd->vdev_leaf_zap); } else { ASSERT0(vd->vdev_leaf_zap); } /* * If we're a leaf vdev, try to load the DTL object and other state. */ if (vd->vdev_ops->vdev_op_leaf && (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE || alloctype == VDEV_ALLOC_ROOTPOOL)) { if (alloctype == VDEV_ALLOC_LOAD) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL, &vd->vdev_dtl_object); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE, &vd->vdev_unspare); } if (alloctype == VDEV_ALLOC_ROOTPOOL) { uint64_t spare = 0; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE, &spare) == 0 && spare) spa_spare_add(vd); } (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, &vd->vdev_offline); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG, &vd->vdev_resilver_txg); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REBUILD_TXG, &vd->vdev_rebuild_txg); if (nvlist_exists(nv, ZPOOL_CONFIG_RESILVER_DEFER)) vdev_defer_resilver(vd); /* * In general, when importing a pool we want to ignore the * persistent fault state, as the diagnosis made on another * system may not be valid in the current context. The only * exception is if we forced a vdev to a persistently faulted * state with 'zpool offline -f'. The persistent fault will * remain across imports until cleared. * * Local vdevs will remain in the faulted state. */ if (spa_load_state(spa) == SPA_LOAD_OPEN || spa_load_state(spa) == SPA_LOAD_IMPORT) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, &vd->vdev_faulted); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED, &vd->vdev_degraded); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, &vd->vdev_removed); if (vd->vdev_faulted || vd->vdev_degraded) { char *aux; vd->vdev_label_aux = VDEV_AUX_ERR_EXCEEDED; if (nvlist_lookup_string(nv, ZPOOL_CONFIG_AUX_STATE, &aux) == 0 && strcmp(aux, "external") == 0) vd->vdev_label_aux = VDEV_AUX_EXTERNAL; else vd->vdev_faulted = 0ULL; } } } /* * Add ourselves to the parent's list of children. */ vdev_add_child(parent, vd); *vdp = vd; return (0); } void vdev_free(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT3P(vd->vdev_initialize_thread, ==, NULL); ASSERT3P(vd->vdev_trim_thread, ==, NULL); ASSERT3P(vd->vdev_autotrim_thread, ==, NULL); ASSERT3P(vd->vdev_rebuild_thread, ==, NULL); /* * Scan queues are normally destroyed at the end of a scan. If the * queue exists here, that implies the vdev is being removed while * the scan is still running. 
*/ if (vd->vdev_scan_io_queue != NULL) { mutex_enter(&vd->vdev_scan_io_queue_lock); dsl_scan_io_queue_destroy(vd->vdev_scan_io_queue); vd->vdev_scan_io_queue = NULL; mutex_exit(&vd->vdev_scan_io_queue_lock); } /* * vdev_free() implies closing the vdev first. This is simpler than * trying to ensure complicated semantics for all callers. */ vdev_close(vd); ASSERT(!list_link_active(&vd->vdev_config_dirty_node)); ASSERT(!list_link_active(&vd->vdev_state_dirty_node)); /* * Free all children. */ for (int c = 0; c < vd->vdev_children; c++) vdev_free(vd->vdev_child[c]); ASSERT(vd->vdev_child == NULL); ASSERT(vd->vdev_guid_sum == vd->vdev_guid); if (vd->vdev_ops->vdev_op_fini != NULL) vd->vdev_ops->vdev_op_fini(vd); /* * Discard allocation state. */ if (vd->vdev_mg != NULL) { vdev_metaslab_fini(vd); metaslab_group_destroy(vd->vdev_mg); vd->vdev_mg = NULL; } if (vd->vdev_log_mg != NULL) { ASSERT0(vd->vdev_ms_count); metaslab_group_destroy(vd->vdev_log_mg); vd->vdev_log_mg = NULL; } ASSERT0(vd->vdev_stat.vs_space); ASSERT0(vd->vdev_stat.vs_dspace); ASSERT0(vd->vdev_stat.vs_alloc); /* * Remove this vdev from its parent's child list. */ vdev_remove_child(vd->vdev_parent, vd); ASSERT(vd->vdev_parent == NULL); ASSERT(!list_link_active(&vd->vdev_leaf_node)); /* * Clean up vdev structure. */ vdev_queue_fini(vd); vdev_cache_fini(vd); if (vd->vdev_path) spa_strfree(vd->vdev_path); if (vd->vdev_devid) spa_strfree(vd->vdev_devid); if (vd->vdev_physpath) spa_strfree(vd->vdev_physpath); if (vd->vdev_enc_sysfs_path) spa_strfree(vd->vdev_enc_sysfs_path); if (vd->vdev_fru) spa_strfree(vd->vdev_fru); if (vd->vdev_isspare) spa_spare_remove(vd); if (vd->vdev_isl2cache) spa_l2cache_remove(vd); txg_list_destroy(&vd->vdev_ms_list); txg_list_destroy(&vd->vdev_dtl_list); mutex_enter(&vd->vdev_dtl_lock); space_map_close(vd->vdev_dtl_sm); for (int t = 0; t < DTL_TYPES; t++) { range_tree_vacate(vd->vdev_dtl[t], NULL, NULL); range_tree_destroy(vd->vdev_dtl[t]); } mutex_exit(&vd->vdev_dtl_lock); EQUIV(vd->vdev_indirect_births != NULL, vd->vdev_indirect_mapping != NULL); if (vd->vdev_indirect_births != NULL) { vdev_indirect_mapping_close(vd->vdev_indirect_mapping); vdev_indirect_births_close(vd->vdev_indirect_births); } if (vd->vdev_obsolete_sm != NULL) { ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops); space_map_close(vd->vdev_obsolete_sm); vd->vdev_obsolete_sm = NULL; } range_tree_destroy(vd->vdev_obsolete_segments); rw_destroy(&vd->vdev_indirect_rwlock); mutex_destroy(&vd->vdev_obsolete_lock); mutex_destroy(&vd->vdev_dtl_lock); mutex_destroy(&vd->vdev_stat_lock); mutex_destroy(&vd->vdev_probe_lock); mutex_destroy(&vd->vdev_scan_io_queue_lock); mutex_destroy(&vd->vdev_initialize_lock); mutex_destroy(&vd->vdev_initialize_io_lock); cv_destroy(&vd->vdev_initialize_io_cv); cv_destroy(&vd->vdev_initialize_cv); mutex_destroy(&vd->vdev_trim_lock); mutex_destroy(&vd->vdev_autotrim_lock); mutex_destroy(&vd->vdev_trim_io_lock); cv_destroy(&vd->vdev_trim_cv); cv_destroy(&vd->vdev_autotrim_cv); cv_destroy(&vd->vdev_trim_io_cv); mutex_destroy(&vd->vdev_rebuild_lock); cv_destroy(&vd->vdev_rebuild_cv); zfs_ratelimit_fini(&vd->vdev_delay_rl); zfs_ratelimit_fini(&vd->vdev_checksum_rl); if (vd == spa->spa_root_vdev) spa->spa_root_vdev = NULL; kmem_free(vd, sizeof (vdev_t)); } /* * Transfer top-level vdev state from svd to tvd. 
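 * This is used when a mirror/replacing parent is spliced in above (or
 * removed from above) a top-level vdev: metaslab state, per-vdev ZAPs,
 * removal/indirect state and dirty-list membership all move from svd to
 * tvd so the new top-level vdev takes over seamlessly.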
*/ static void vdev_top_transfer(vdev_t *svd, vdev_t *tvd) { spa_t *spa = svd->vdev_spa; metaslab_t *msp; vdev_t *vd; int t; ASSERT(tvd == tvd->vdev_top); tvd->vdev_pending_fastwrite = svd->vdev_pending_fastwrite; tvd->vdev_ms_array = svd->vdev_ms_array; tvd->vdev_ms_shift = svd->vdev_ms_shift; tvd->vdev_ms_count = svd->vdev_ms_count; tvd->vdev_top_zap = svd->vdev_top_zap; svd->vdev_ms_array = 0; svd->vdev_ms_shift = 0; svd->vdev_ms_count = 0; svd->vdev_top_zap = 0; if (tvd->vdev_mg) ASSERT3P(tvd->vdev_mg, ==, svd->vdev_mg); if (tvd->vdev_log_mg) ASSERT3P(tvd->vdev_log_mg, ==, svd->vdev_log_mg); tvd->vdev_mg = svd->vdev_mg; tvd->vdev_log_mg = svd->vdev_log_mg; tvd->vdev_ms = svd->vdev_ms; svd->vdev_mg = NULL; svd->vdev_log_mg = NULL; svd->vdev_ms = NULL; if (tvd->vdev_mg != NULL) tvd->vdev_mg->mg_vd = tvd; if (tvd->vdev_log_mg != NULL) tvd->vdev_log_mg->mg_vd = tvd; tvd->vdev_checkpoint_sm = svd->vdev_checkpoint_sm; svd->vdev_checkpoint_sm = NULL; tvd->vdev_alloc_bias = svd->vdev_alloc_bias; svd->vdev_alloc_bias = VDEV_BIAS_NONE; tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc; tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space; tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace; svd->vdev_stat.vs_alloc = 0; svd->vdev_stat.vs_space = 0; svd->vdev_stat.vs_dspace = 0; /* * State which may be set on a top-level vdev that's in the * process of being removed. */ ASSERT0(tvd->vdev_indirect_config.vic_births_object); ASSERT0(tvd->vdev_indirect_config.vic_mapping_object); ASSERT3U(tvd->vdev_indirect_config.vic_prev_indirect_vdev, ==, -1ULL); ASSERT3P(tvd->vdev_indirect_mapping, ==, NULL); ASSERT3P(tvd->vdev_indirect_births, ==, NULL); ASSERT3P(tvd->vdev_obsolete_sm, ==, NULL); ASSERT0(tvd->vdev_removing); ASSERT0(tvd->vdev_rebuilding); tvd->vdev_removing = svd->vdev_removing; tvd->vdev_rebuilding = svd->vdev_rebuilding; tvd->vdev_rebuild_config = svd->vdev_rebuild_config; tvd->vdev_indirect_config = svd->vdev_indirect_config; tvd->vdev_indirect_mapping = svd->vdev_indirect_mapping; tvd->vdev_indirect_births = svd->vdev_indirect_births; range_tree_swap(&svd->vdev_obsolete_segments, &tvd->vdev_obsolete_segments); tvd->vdev_obsolete_sm = svd->vdev_obsolete_sm; svd->vdev_indirect_config.vic_mapping_object = 0; svd->vdev_indirect_config.vic_births_object = 0; svd->vdev_indirect_config.vic_prev_indirect_vdev = -1ULL; svd->vdev_indirect_mapping = NULL; svd->vdev_indirect_births = NULL; svd->vdev_obsolete_sm = NULL; svd->vdev_removing = 0; svd->vdev_rebuilding = 0; for (t = 0; t < TXG_SIZE; t++) { while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL) (void) txg_list_add(&tvd->vdev_ms_list, msp, t); while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL) (void) txg_list_add(&tvd->vdev_dtl_list, vd, t); if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t)) (void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t); } if (list_link_active(&svd->vdev_config_dirty_node)) { vdev_config_clean(svd); vdev_config_dirty(tvd); } if (list_link_active(&svd->vdev_state_dirty_node)) { vdev_state_clean(svd); vdev_state_dirty(tvd); } tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio; svd->vdev_deflate_ratio = 0; tvd->vdev_islog = svd->vdev_islog; svd->vdev_islog = 0; dsl_scan_io_queue_vdev_xfer(svd, tvd); } static void vdev_top_update(vdev_t *tvd, vdev_t *vd) { if (vd == NULL) return; vd->vdev_top = tvd; for (int c = 0; c < vd->vdev_children; c++) vdev_top_update(tvd, vd->vdev_child[c]); } /* * Add a mirror/replacing vdev above an existing vdev. 
There is no need to * call .vdev_op_init() since mirror/replacing vdevs do not have private state. */ vdev_t * vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops) { spa_t *spa = cvd->vdev_spa; vdev_t *pvd = cvd->vdev_parent; vdev_t *mvd; ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops); mvd->vdev_asize = cvd->vdev_asize; mvd->vdev_min_asize = cvd->vdev_min_asize; mvd->vdev_max_asize = cvd->vdev_max_asize; mvd->vdev_psize = cvd->vdev_psize; mvd->vdev_ashift = cvd->vdev_ashift; mvd->vdev_logical_ashift = cvd->vdev_logical_ashift; mvd->vdev_physical_ashift = cvd->vdev_physical_ashift; mvd->vdev_state = cvd->vdev_state; mvd->vdev_crtxg = cvd->vdev_crtxg; vdev_remove_child(pvd, cvd); vdev_add_child(pvd, mvd); cvd->vdev_id = mvd->vdev_children; vdev_add_child(mvd, cvd); vdev_top_update(cvd->vdev_top, cvd->vdev_top); if (mvd == mvd->vdev_top) vdev_top_transfer(cvd, mvd); return (mvd); } /* * Remove a 1-way mirror/replacing vdev from the tree. */ void vdev_remove_parent(vdev_t *cvd) { vdev_t *mvd = cvd->vdev_parent; vdev_t *pvd = mvd->vdev_parent; ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); ASSERT(mvd->vdev_children == 1); ASSERT(mvd->vdev_ops == &vdev_mirror_ops || mvd->vdev_ops == &vdev_replacing_ops || mvd->vdev_ops == &vdev_spare_ops); cvd->vdev_ashift = mvd->vdev_ashift; cvd->vdev_logical_ashift = mvd->vdev_logical_ashift; cvd->vdev_physical_ashift = mvd->vdev_physical_ashift; vdev_remove_child(mvd, cvd); vdev_remove_child(pvd, mvd); /* * If cvd will replace mvd as a top-level vdev, preserve mvd's guid. * Otherwise, we could have detached an offline device, and when we * go to import the pool we'll think we have two top-level vdevs, * instead of a different version of the same top-level vdev. */ if (mvd->vdev_top == mvd) { uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid; cvd->vdev_orig_guid = cvd->vdev_guid; cvd->vdev_guid += guid_delta; cvd->vdev_guid_sum += guid_delta; /* * If pool not set for autoexpand, we need to also preserve * mvd's asize to prevent automatic expansion of cvd. * Otherwise if we are adjusting the mirror by attaching and * detaching children of non-uniform sizes, the mirror could * autoexpand, unexpectedly requiring larger devices to * re-establish the mirror. */ if (!cvd->vdev_spa->spa_autoexpand) cvd->vdev_asize = mvd->vdev_asize; } cvd->vdev_id = mvd->vdev_id; vdev_add_child(pvd, cvd); vdev_top_update(cvd->vdev_top, cvd->vdev_top); if (cvd == cvd->vdev_top) vdev_top_transfer(mvd, cvd); ASSERT(mvd->vdev_children == 0); vdev_free(mvd); } void vdev_metaslab_group_create(vdev_t *vd) { spa_t *spa = vd->vdev_spa; /* * metaslab_group_create was delayed until allocation bias was available */ if (vd->vdev_mg == NULL) { metaslab_class_t *mc; if (vd->vdev_islog && vd->vdev_alloc_bias == VDEV_BIAS_NONE) vd->vdev_alloc_bias = VDEV_BIAS_LOG; ASSERT3U(vd->vdev_islog, ==, (vd->vdev_alloc_bias == VDEV_BIAS_LOG)); switch (vd->vdev_alloc_bias) { case VDEV_BIAS_LOG: mc = spa_log_class(spa); break; case VDEV_BIAS_SPECIAL: mc = spa_special_class(spa); break; case VDEV_BIAS_DEDUP: mc = spa_dedup_class(spa); break; default: mc = spa_normal_class(spa); } vd->vdev_mg = metaslab_group_create(mc, vd, spa->spa_alloc_count); if (!vd->vdev_islog) { vd->vdev_log_mg = metaslab_group_create( spa_embedded_log_class(spa), vd, 1); } /* * The spa ashift min/max only apply for the normal metaslab * class. Class destination is late binding so ashift boundry * setting had to wait until now. 
*/ if (vd->vdev_top == vd && vd->vdev_ashift != 0 && mc == spa_normal_class(spa) && vd->vdev_aux == NULL) { if (vd->vdev_ashift > spa->spa_max_ashift) spa->spa_max_ashift = vd->vdev_ashift; if (vd->vdev_ashift < spa->spa_min_ashift) spa->spa_min_ashift = vd->vdev_ashift; uint64_t min_alloc = vdev_get_min_alloc(vd); if (min_alloc < spa->spa_min_alloc) spa->spa_min_alloc = min_alloc; } } } int vdev_metaslab_init(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; uint64_t oldc = vd->vdev_ms_count; uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift; metaslab_t **mspp; int error; boolean_t expanding = (oldc != 0); ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER)); /* * This vdev is not being allocated from yet or is a hole. */ if (vd->vdev_ms_shift == 0) return (0); ASSERT(!vd->vdev_ishole); ASSERT(oldc <= newc); mspp = vmem_zalloc(newc * sizeof (*mspp), KM_SLEEP); if (expanding) { bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp)); vmem_free(vd->vdev_ms, oldc * sizeof (*mspp)); } vd->vdev_ms = mspp; vd->vdev_ms_count = newc; for (uint64_t m = oldc; m < newc; m++) { uint64_t object = 0; /* * vdev_ms_array may be 0 if we are creating the "fake" * metaslabs for an indirect vdev for zdb's leak detection. * See zdb_leak_init(). */ if (txg == 0 && vd->vdev_ms_array != 0) { error = dmu_read(spa->spa_meta_objset, vd->vdev_ms_array, m * sizeof (uint64_t), sizeof (uint64_t), &object, DMU_READ_PREFETCH); if (error != 0) { vdev_dbgmsg(vd, "unable to read the metaslab " "array [error=%d]", error); return (error); } } error = metaslab_init(vd->vdev_mg, m, object, txg, &(vd->vdev_ms[m])); if (error != 0) { vdev_dbgmsg(vd, "metaslab_init failed [error=%d]", error); return (error); } } /* * Find the emptiest metaslab on the vdev and mark it for use for * embedded slog by moving it from the regular to the log metaslab * group. */ if (vd->vdev_mg->mg_class == spa_normal_class(spa) && vd->vdev_ms_count > zfs_embedded_slog_min_ms && avl_is_empty(&vd->vdev_log_mg->mg_metaslab_tree)) { uint64_t slog_msid = 0; uint64_t smallest = UINT64_MAX; /* * Note, we only search the new metaslabs, because the old * (pre-existing) ones may be active (e.g. have non-empty * range_tree's), and we don't move them to the new * metaslab_t. */ for (uint64_t m = oldc; m < newc; m++) { uint64_t alloc = space_map_allocated(vd->vdev_ms[m]->ms_sm); if (alloc < smallest) { slog_msid = m; smallest = alloc; } } metaslab_t *slog_ms = vd->vdev_ms[slog_msid]; /* * The metaslab was marked as dirty at the end of * metaslab_init(). Remove it from the dirty list so that we * can uninitialize and reinitialize it to the new class. */ if (txg != 0) { (void) txg_list_remove_this(&vd->vdev_ms_list, slog_ms, txg); } uint64_t sm_obj = space_map_object(slog_ms->ms_sm); metaslab_fini(slog_ms); VERIFY0(metaslab_init(vd->vdev_log_mg, slog_msid, sm_obj, txg, &vd->vdev_ms[slog_msid])); } if (txg == 0) spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER); /* * If the vdev is being removed we don't activate * the metaslabs since we want to ensure that no new * allocations are performed on this device. */ if (!expanding && !vd->vdev_removing) { metaslab_group_activate(vd->vdev_mg); if (vd->vdev_log_mg != NULL) metaslab_group_activate(vd->vdev_log_mg); } if (txg == 0) spa_config_exit(spa, SCL_ALLOC, FTAG); /* * Regardless whether this vdev was just added or it is being * expanded, the metaslab count has changed. Recalculate the * block limit. 
*/ spa_log_sm_set_blocklimit(spa); return (0); } void vdev_metaslab_fini(vdev_t *vd) { if (vd->vdev_checkpoint_sm != NULL) { ASSERT(spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_POOL_CHECKPOINT)); space_map_close(vd->vdev_checkpoint_sm); /* * Even though we close the space map, we need to set its * pointer to NULL. The reason is that vdev_metaslab_fini() * may be called multiple times for certain operations * (i.e. when destroying a pool) so we need to ensure that * this clause never executes twice. This logic is similar * to the one used for the vdev_ms clause below. */ vd->vdev_checkpoint_sm = NULL; } if (vd->vdev_ms != NULL) { metaslab_group_t *mg = vd->vdev_mg; metaslab_group_passivate(mg); if (vd->vdev_log_mg != NULL) { ASSERT(!vd->vdev_islog); metaslab_group_passivate(vd->vdev_log_mg); } uint64_t count = vd->vdev_ms_count; for (uint64_t m = 0; m < count; m++) { metaslab_t *msp = vd->vdev_ms[m]; if (msp != NULL) metaslab_fini(msp); } vmem_free(vd->vdev_ms, count * sizeof (metaslab_t *)); vd->vdev_ms = NULL; vd->vdev_ms_count = 0; for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) { ASSERT0(mg->mg_histogram[i]); if (vd->vdev_log_mg != NULL) ASSERT0(vd->vdev_log_mg->mg_histogram[i]); } } ASSERT0(vd->vdev_ms_count); ASSERT3U(vd->vdev_pending_fastwrite, ==, 0); } typedef struct vdev_probe_stats { boolean_t vps_readable; boolean_t vps_writeable; int vps_flags; } vdev_probe_stats_t; static void vdev_probe_done(zio_t *zio) { spa_t *spa = zio->io_spa; vdev_t *vd = zio->io_vd; vdev_probe_stats_t *vps = zio->io_private; ASSERT(vd->vdev_probe_zio != NULL); if (zio->io_type == ZIO_TYPE_READ) { if (zio->io_error == 0) vps->vps_readable = 1; if (zio->io_error == 0 && spa_writeable(spa)) { zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd, zio->io_offset, zio->io_size, zio->io_abd, ZIO_CHECKSUM_OFF, vdev_probe_done, vps, ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE)); } else { abd_free(zio->io_abd); } } else if (zio->io_type == ZIO_TYPE_WRITE) { if (zio->io_error == 0) vps->vps_writeable = 1; abd_free(zio->io_abd); } else if (zio->io_type == ZIO_TYPE_NULL) { zio_t *pio; zio_link_t *zl; vd->vdev_cant_read |= !vps->vps_readable; vd->vdev_cant_write |= !vps->vps_writeable; if (vdev_readable(vd) && (vdev_writeable(vd) || !spa_writeable(spa))) { zio->io_error = 0; } else { ASSERT(zio->io_error != 0); vdev_dbgmsg(vd, "failed probe"); (void) zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE, spa, vd, NULL, NULL, 0); zio->io_error = SET_ERROR(ENXIO); } mutex_enter(&vd->vdev_probe_lock); ASSERT(vd->vdev_probe_zio == zio); vd->vdev_probe_zio = NULL; mutex_exit(&vd->vdev_probe_lock); zl = NULL; while ((pio = zio_walk_parents(zio, &zl)) != NULL) if (!vdev_accessible(vd, pio)) pio->io_error = SET_ERROR(ENXIO); kmem_free(vps, sizeof (*vps)); } } /* * Determine whether this device is accessible. * * Read and write to several known locations: the pad regions of each * vdev label but the first, which we leave alone in case it contains * a VTOC. */ zio_t * vdev_probe(vdev_t *vd, zio_t *zio) { spa_t *spa = vd->vdev_spa; vdev_probe_stats_t *vps = NULL; zio_t *pio; ASSERT(vd->vdev_ops->vdev_op_leaf); /* * Don't probe the probe. */ if (zio && (zio->io_flags & ZIO_FLAG_PROBE)) return (NULL); /* * To prevent 'probe storms' when a device fails, we create * just one probe i/o at a time. All zios that want to probe * this vdev will become parents of the probe io. 
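 * (Subsequent callers find vd->vdev_probe_zio already set and simply
 * attach to it with zio_add_child(), so a flapping device generates a
 * single probe no matter how many zios notice the failure.)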
*/ mutex_enter(&vd->vdev_probe_lock); if ((pio = vd->vdev_probe_zio) == NULL) { vps = kmem_zalloc(sizeof (*vps), KM_SLEEP); vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE | ZIO_FLAG_TRYHARD; if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) { /* * vdev_cant_read and vdev_cant_write can only * transition from TRUE to FALSE when we have the * SCL_ZIO lock as writer; otherwise they can only * transition from FALSE to TRUE. This ensures that * any zio looking at these values can assume that * failures persist for the life of the I/O. That's * important because when a device has intermittent * connectivity problems, we want to ensure that * they're ascribed to the device (ENXIO) and not * the zio (EIO). * * Since we hold SCL_ZIO as writer here, clear both * values so the probe can reevaluate from first * principles. */ vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER; vd->vdev_cant_read = B_FALSE; vd->vdev_cant_write = B_FALSE; } vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd, vdev_probe_done, vps, vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE); /* * We can't change the vdev state in this context, so we * kick off an async task to do it on our behalf. */ if (zio != NULL) { vd->vdev_probe_wanted = B_TRUE; spa_async_request(spa, SPA_ASYNC_PROBE); } } if (zio != NULL) zio_add_child(zio, pio); mutex_exit(&vd->vdev_probe_lock); if (vps == NULL) { ASSERT(zio != NULL); return (NULL); } for (int l = 1; l < VDEV_LABELS; l++) { zio_nowait(zio_read_phys(pio, vd, vdev_label_offset(vd->vdev_psize, l, offsetof(vdev_label_t, vl_be)), VDEV_PAD_SIZE, abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE), ZIO_CHECKSUM_OFF, vdev_probe_done, vps, ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE)); } if (zio == NULL) return (pio); zio_nowait(pio); return (NULL); } static void vdev_load_child(void *arg) { vdev_t *vd = arg; vd->vdev_load_error = vdev_load(vd); } static void vdev_open_child(void *arg) { vdev_t *vd = arg; vd->vdev_open_thread = curthread; vd->vdev_open_error = vdev_open(vd); vd->vdev_open_thread = NULL; } static boolean_t vdev_uses_zvols(vdev_t *vd) { #ifdef _KERNEL if (zvol_is_zvol(vd->vdev_path)) return (B_TRUE); #endif for (int c = 0; c < vd->vdev_children; c++) if (vdev_uses_zvols(vd->vdev_child[c])) return (B_TRUE); return (B_FALSE); } /* * Returns B_TRUE if the passed child should be opened. */ static boolean_t vdev_default_open_children_func(vdev_t *vd) { return (B_TRUE); } /* * Open the requested child vdevs. If any of the leaf vdevs are using * a ZFS volume then do the opens in a single thread. This avoids a * deadlock when the current thread is holding the spa_namespace_lock. */ static void vdev_open_children_impl(vdev_t *vd, vdev_open_children_func_t *open_func) { int children = vd->vdev_children; taskq_t *tq = taskq_create("vdev_open", children, minclsyspri, children, children, TASKQ_PREPOPULATE); vd->vdev_nonrot = B_TRUE; for (int c = 0; c < children; c++) { vdev_t *cvd = vd->vdev_child[c]; if (open_func(cvd) == B_FALSE) continue; if (tq == NULL || vdev_uses_zvols(vd)) { cvd->vdev_open_error = vdev_open(cvd); } else { VERIFY(taskq_dispatch(tq, vdev_open_child, cvd, TQ_SLEEP) != TASKQID_INVALID); } vd->vdev_nonrot &= cvd->vdev_nonrot; } if (tq != NULL) { taskq_wait(tq); taskq_destroy(tq); } } /* * Open all child vdevs. */ void vdev_open_children(vdev_t *vd) { vdev_open_children_impl(vd, vdev_default_open_children_func); } /* * Conditionally open a subset of child vdevs. 
*/ void vdev_open_children_subset(vdev_t *vd, vdev_open_children_func_t *open_func) { vdev_open_children_impl(vd, open_func); } /* * Compute the raidz-deflation ratio. Note, we hard-code * in 128k (1 << 17) because it is the "typical" blocksize. * Even though SPA_MAXBLOCKSIZE changed, this algorithm can not change, * otherwise it would inconsistently account for existing bp's. */ static void vdev_set_deflate_ratio(vdev_t *vd) { if (vd == vd->vdev_top && !vd->vdev_ishole && vd->vdev_ashift != 0) { vd->vdev_deflate_ratio = (1 << 17) / (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT); } } /* * Maximize performance by inflating the configured ashift for top level * vdevs to be as close to the physical ashift as possible while maintaining * administrator defined limits and ensuring it doesn't go below the * logical ashift. */ static void vdev_ashift_optimize(vdev_t *vd) { ASSERT(vd == vd->vdev_top); if (vd->vdev_ashift < vd->vdev_physical_ashift) { vd->vdev_ashift = MIN( MAX(zfs_vdev_max_auto_ashift, vd->vdev_ashift), MAX(zfs_vdev_min_auto_ashift, vd->vdev_physical_ashift)); } else { /* * If the logical and physical ashifts are the same, then * we ensure that the top-level vdev's ashift is not smaller * than our minimum ashift value. For the unusual case * where logical ashift > physical ashift, we can't cap * the calculated ashift based on max ashift as that * would cause failures. * We still check if we need to increase it to match * the min ashift. */ vd->vdev_ashift = MAX(zfs_vdev_min_auto_ashift, vd->vdev_ashift); } } /* * Prepare a virtual device for access. */ int vdev_open(vdev_t *vd) { spa_t *spa = vd->vdev_spa; int error; uint64_t osize = 0; uint64_t max_osize = 0; uint64_t asize, max_asize, psize; uint64_t logical_ashift = 0; uint64_t physical_ashift = 0; ASSERT(vd->vdev_open_thread == curthread || spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); ASSERT(vd->vdev_state == VDEV_STATE_CLOSED || vd->vdev_state == VDEV_STATE_CANT_OPEN || vd->vdev_state == VDEV_STATE_OFFLINE); vd->vdev_stat.vs_aux = VDEV_AUX_NONE; vd->vdev_cant_read = B_FALSE; vd->vdev_cant_write = B_FALSE; vd->vdev_min_asize = vdev_get_min_asize(vd); /* * If this vdev is not removed, check its fault status. If it's * faulted, bail out of the open. */ if (!vd->vdev_removed && vd->vdev_faulted) { ASSERT(vd->vdev_children == 0); ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED || vd->vdev_label_aux == VDEV_AUX_EXTERNAL); vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, vd->vdev_label_aux); return (SET_ERROR(ENXIO)); } else if (vd->vdev_offline) { ASSERT(vd->vdev_children == 0); vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE); return (SET_ERROR(ENXIO)); } error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize, &logical_ashift, &physical_ashift); /* * Physical volume size should never be larger than its max size, unless * the disk has shrunk while we were reading it or the device is buggy * or damaged: either way it's not safe for use, bail out of the open. */ if (osize > max_osize) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_OPEN_FAILED); return (SET_ERROR(ENXIO)); } /* * Reset the vdev_reopening flag so that we actually close * the vdev on error. 
*/ vd->vdev_reopening = B_FALSE; if (zio_injection_enabled && error == 0) error = zio_handle_device_injection(vd, NULL, SET_ERROR(ENXIO)); if (error) { if (vd->vdev_removed && vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED) vd->vdev_removed = B_FALSE; if (vd->vdev_stat.vs_aux == VDEV_AUX_CHILDREN_OFFLINE) { vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, vd->vdev_stat.vs_aux); } else { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, vd->vdev_stat.vs_aux); } return (error); } vd->vdev_removed = B_FALSE; /* * Recheck the faulted flag now that we have confirmed that * the vdev is accessible. If we're faulted, bail. */ if (vd->vdev_faulted) { ASSERT(vd->vdev_children == 0); ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED || vd->vdev_label_aux == VDEV_AUX_EXTERNAL); vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, vd->vdev_label_aux); return (SET_ERROR(ENXIO)); } if (vd->vdev_degraded) { ASSERT(vd->vdev_children == 0); vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, VDEV_AUX_ERR_EXCEEDED); } else { vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0); } /* * For hole or missing vdevs we just return success. */ if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) return (0); for (int c = 0; c < vd->vdev_children; c++) { if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) { vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); break; } } osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t)); max_osize = P2ALIGN(max_osize, (uint64_t)sizeof (vdev_label_t)); if (vd->vdev_children == 0) { if (osize < SPA_MINDEVSIZE) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_TOO_SMALL); return (SET_ERROR(EOVERFLOW)); } psize = osize; asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE); max_asize = max_osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE); } else { if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_TOO_SMALL); return (SET_ERROR(EOVERFLOW)); } psize = 0; asize = osize; max_asize = max_osize; } /* * If the vdev was expanded, record this so that we can re-create the * uberblock rings in labels {2,3}, during the next sync. */ if ((psize > vd->vdev_psize) && (vd->vdev_psize != 0)) vd->vdev_copy_uberblocks = B_TRUE; vd->vdev_psize = psize; /* * Make sure the allocatable size hasn't shrunk too much. */ if (asize < vd->vdev_min_asize) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_LABEL); return (SET_ERROR(EINVAL)); } /* * We can always set the logical/physical ashift members since * their values are only used to calculate the vdev_ashift when * the device is first added to the config. These values should * not be used for anything else since they may change whenever * the device is reopened and we don't store them in the label. */ vd->vdev_physical_ashift = MAX(physical_ashift, vd->vdev_physical_ashift); vd->vdev_logical_ashift = MAX(logical_ashift, vd->vdev_logical_ashift); if (vd->vdev_asize == 0) { /* * This is the first-ever open, so use the computed values. * For compatibility, a different ashift can be requested. */ vd->vdev_asize = asize; vd->vdev_max_asize = max_asize; /* * If the vdev_ashift was not overriden at creation time, * then set it the logical ashift and optimize the ashift. 
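 * For example, a drive reporting a 512-byte logical and 4096-byte
 * physical sector size starts with an ashift of 9 here and is then
 * raised toward 12 by vdev_ashift_optimize(), within the bounds set
 * by the zfs_vdev_min/max_auto_ashift tunables (illustrative values).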
*/ if (vd->vdev_ashift == 0) { vd->vdev_ashift = vd->vdev_logical_ashift; if (vd->vdev_logical_ashift > ASHIFT_MAX) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_ASHIFT_TOO_BIG); return (SET_ERROR(EDOM)); } if (vd->vdev_top == vd) { vdev_ashift_optimize(vd); } } if (vd->vdev_ashift != 0 && (vd->vdev_ashift < ASHIFT_MIN || vd->vdev_ashift > ASHIFT_MAX)) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_ASHIFT); return (SET_ERROR(EDOM)); } } else { /* * Make sure the alignment required hasn't increased. */ if (vd->vdev_ashift > vd->vdev_top->vdev_ashift && vd->vdev_ops->vdev_op_leaf) { (void) zfs_ereport_post( FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT, spa, vd, NULL, NULL, 0); vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_LABEL); return (SET_ERROR(EDOM)); } vd->vdev_max_asize = max_asize; } /* * If all children are healthy we update asize if either: * The asize has increased, due to a device expansion caused by dynamic * LUN growth or vdev replacement, and automatic expansion is enabled; * making the additional space available. * * The asize has decreased, due to a device shrink usually caused by a * vdev replace with a smaller device. This ensures that calculations * based of max_asize and asize e.g. esize are always valid. It's safe * to do this as we've already validated that asize is greater than * vdev_min_asize. */ if (vd->vdev_state == VDEV_STATE_HEALTHY && ((asize > vd->vdev_asize && (vd->vdev_expanding || spa->spa_autoexpand)) || (asize < vd->vdev_asize))) vd->vdev_asize = asize; vdev_set_min_asize(vd); /* * Ensure we can issue some IO before declaring the * vdev open for business. */ if (vd->vdev_ops->vdev_op_leaf && (error = zio_wait(vdev_probe(vd, NULL))) != 0) { vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, VDEV_AUX_ERR_EXCEEDED); return (error); } /* * Track the the minimum allocation size. */ if (vd->vdev_top == vd && vd->vdev_ashift != 0 && vd->vdev_islog == 0 && vd->vdev_aux == NULL) { uint64_t min_alloc = vdev_get_min_alloc(vd); if (min_alloc < spa->spa_min_alloc) spa->spa_min_alloc = min_alloc; } /* * If this is a leaf vdev, assess whether a resilver is needed. * But don't do this if we are doing a reopen for a scrub, since * this would just restart the scrub we are already doing. */ if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen) dsl_scan_assess_vdev(spa->spa_dsl_pool, vd); return (0); } static void vdev_validate_child(void *arg) { vdev_t *vd = arg; vd->vdev_validate_thread = curthread; vd->vdev_validate_error = vdev_validate(vd); vd->vdev_validate_thread = NULL; } /* * Called once the vdevs are all opened, this routine validates the label * contents. This needs to be done before vdev_load() so that we don't * inadvertently do repair I/Os to the wrong device. * * This function will only return failure if one of the vdevs indicates that it * has since been destroyed or exported. This is only possible if * /etc/zfs/zpool.cache was readonly at the time. Otherwise, the vdev state * will be updated but the function will return 0. 
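 * Concretely, a label that cannot be read or that carries mismatched
 * guids only moves the vdev to VDEV_STATE_CANT_OPEN and still returns
 * 0; EBADF is returned only when a child fails validation or when the
 * label's pool state is no longer POOL_STATE_ACTIVE during a
 * non-verbatim open.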
*/ int vdev_validate(vdev_t *vd) { spa_t *spa = vd->vdev_spa; taskq_t *tq = NULL; nvlist_t *label; uint64_t guid = 0, aux_guid = 0, top_guid; uint64_t state; nvlist_t *nvl; uint64_t txg; int children = vd->vdev_children; if (vdev_validate_skip) return (0); if (children > 0) { tq = taskq_create("vdev_validate", children, minclsyspri, children, children, TASKQ_PREPOPULATE); } for (uint64_t c = 0; c < children; c++) { vdev_t *cvd = vd->vdev_child[c]; if (tq == NULL || vdev_uses_zvols(cvd)) { vdev_validate_child(cvd); } else { VERIFY(taskq_dispatch(tq, vdev_validate_child, cvd, TQ_SLEEP) != TASKQID_INVALID); } } if (tq != NULL) { taskq_wait(tq); taskq_destroy(tq); } for (int c = 0; c < children; c++) { int error = vd->vdev_child[c]->vdev_validate_error; if (error != 0) return (SET_ERROR(EBADF)); } /* * If the device has already failed, or was marked offline, don't do * any further validation. Otherwise, label I/O will fail and we will * overwrite the previous state. */ if (!vd->vdev_ops->vdev_op_leaf || !vdev_readable(vd)) return (0); /* * If we are performing an extreme rewind, we allow for a label that * was modified at a point after the current txg. * If config lock is not held do not check for the txg. spa_sync could * be updating the vdev's label before updating spa_last_synced_txg. */ if (spa->spa_extreme_rewind || spa_last_synced_txg(spa) == 0 || spa_config_held(spa, SCL_CONFIG, RW_WRITER) != SCL_CONFIG) txg = UINT64_MAX; else txg = spa_last_synced_txg(spa); if ((label = vdev_label_read_config(vd, txg)) == NULL) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_LABEL); vdev_dbgmsg(vd, "vdev_validate: failed reading config for " "txg %llu", (u_longlong_t)txg); return (0); } /* * Determine if this vdev has been split off into another * pool. If so, then refuse to open it. */ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID, &aux_guid) == 0 && aux_guid == spa_guid(spa)) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_SPLIT_POOL); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: vdev split into other pool"); return (0); } if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &guid) != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", ZPOOL_CONFIG_POOL_GUID); return (0); } /* * If config is not trusted then ignore the spa guid check. This is * necessary because if the machine crashed during a re-guid the new * guid might have been written to all of the vdev labels, but not the * cached config. The check will be performed again once we have the * trusted config from the MOS. 
*/ if (spa->spa_trust_config && guid != spa_guid(spa)) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: vdev label pool_guid doesn't " "match config (%llu != %llu)", (u_longlong_t)guid, (u_longlong_t)spa_guid(spa)); return (0); } if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID, &aux_guid) != 0) aux_guid = 0; if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", ZPOOL_CONFIG_GUID); return (0); } if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID, &top_guid) != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", ZPOOL_CONFIG_TOP_GUID); return (0); } /* * If this vdev just became a top-level vdev because its sibling was * detached, it will have adopted the parent's vdev guid -- but the * label may or may not be on disk yet. Fortunately, either version * of the label will have the same top guid, so if we're a top-level * vdev, we can safely compare to that instead. * However, if the config comes from a cachefile that failed to update * after the detach, a top-level vdev will appear as a non top-level * vdev in the config. Also relax the constraints if we perform an * extreme rewind. * * If we split this vdev off instead, then we also check the * original pool's guid. We don't want to consider the vdev * corrupt if it is partway through a split operation. */ if (vd->vdev_guid != guid && vd->vdev_guid != aux_guid) { boolean_t mismatch = B_FALSE; if (spa->spa_trust_config && !spa->spa_extreme_rewind) { if (vd != vd->vdev_top || vd->vdev_guid != top_guid) mismatch = B_TRUE; } else { if (vd->vdev_guid != top_guid && vd->vdev_top->vdev_guid != guid) mismatch = B_TRUE; } if (mismatch) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: config guid " "doesn't match label guid"); vdev_dbgmsg(vd, "CONFIG: guid %llu, top_guid %llu", (u_longlong_t)vd->vdev_guid, (u_longlong_t)vd->vdev_top->vdev_guid); vdev_dbgmsg(vd, "LABEL: guid %llu, top_guid %llu, " "aux_guid %llu", (u_longlong_t)guid, (u_longlong_t)top_guid, (u_longlong_t)aux_guid); return (0); } } if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", ZPOOL_CONFIG_POOL_STATE); return (0); } nvlist_free(label); /* * If this is a verbatim import, no need to check the * state of the pool. */ if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) && spa_load_state(spa) == SPA_LOAD_OPEN && state != POOL_STATE_ACTIVE) { vdev_dbgmsg(vd, "vdev_validate: invalid pool state (%llu) " "for spa %s", (u_longlong_t)state, spa->spa_name); return (SET_ERROR(EBADF)); } /* * If we were able to open and validate a vdev that was * previously marked permanently unavailable, clear that state * now. 
*/ if (vd->vdev_not_present) vd->vdev_not_present = 0; return (0); } static void vdev_copy_path_impl(vdev_t *svd, vdev_t *dvd) { if (svd->vdev_path != NULL && dvd->vdev_path != NULL) { if (strcmp(svd->vdev_path, dvd->vdev_path) != 0) { zfs_dbgmsg("vdev_copy_path: vdev %llu: path changed " "from '%s' to '%s'", (u_longlong_t)dvd->vdev_guid, dvd->vdev_path, svd->vdev_path); spa_strfree(dvd->vdev_path); dvd->vdev_path = spa_strdup(svd->vdev_path); } } else if (svd->vdev_path != NULL) { dvd->vdev_path = spa_strdup(svd->vdev_path); zfs_dbgmsg("vdev_copy_path: vdev %llu: path set to '%s'", (u_longlong_t)dvd->vdev_guid, dvd->vdev_path); } } /* * Recursively copy vdev paths from one vdev to another. Source and destination * vdev trees must have same geometry otherwise return error. Intended to copy * paths from userland config into MOS config. */ int vdev_copy_path_strict(vdev_t *svd, vdev_t *dvd) { if ((svd->vdev_ops == &vdev_missing_ops) || (svd->vdev_ishole && dvd->vdev_ishole) || (dvd->vdev_ops == &vdev_indirect_ops)) return (0); if (svd->vdev_ops != dvd->vdev_ops) { vdev_dbgmsg(svd, "vdev_copy_path: vdev type mismatch: %s != %s", svd->vdev_ops->vdev_op_type, dvd->vdev_ops->vdev_op_type); return (SET_ERROR(EINVAL)); } if (svd->vdev_guid != dvd->vdev_guid) { vdev_dbgmsg(svd, "vdev_copy_path: guids mismatch (%llu != " "%llu)", (u_longlong_t)svd->vdev_guid, (u_longlong_t)dvd->vdev_guid); return (SET_ERROR(EINVAL)); } if (svd->vdev_children != dvd->vdev_children) { vdev_dbgmsg(svd, "vdev_copy_path: children count mismatch: " "%llu != %llu", (u_longlong_t)svd->vdev_children, (u_longlong_t)dvd->vdev_children); return (SET_ERROR(EINVAL)); } for (uint64_t i = 0; i < svd->vdev_children; i++) { int error = vdev_copy_path_strict(svd->vdev_child[i], dvd->vdev_child[i]); if (error != 0) return (error); } if (svd->vdev_ops->vdev_op_leaf) vdev_copy_path_impl(svd, dvd); return (0); } static void vdev_copy_path_search(vdev_t *stvd, vdev_t *dvd) { ASSERT(stvd->vdev_top == stvd); ASSERT3U(stvd->vdev_id, ==, dvd->vdev_top->vdev_id); for (uint64_t i = 0; i < dvd->vdev_children; i++) { vdev_copy_path_search(stvd, dvd->vdev_child[i]); } if (!dvd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(dvd)) return; /* * The idea here is that while a vdev can shift positions within * a top vdev (when replacing, attaching mirror, etc.) it cannot * step outside of it. */ vdev_t *vd = vdev_lookup_by_guid(stvd, dvd->vdev_guid); if (vd == NULL || vd->vdev_ops != dvd->vdev_ops) return; ASSERT(vd->vdev_ops->vdev_op_leaf); vdev_copy_path_impl(vd, dvd); } /* * Recursively copy vdev paths from one root vdev to another. Source and * destination vdev trees may differ in geometry. For each destination leaf * vdev, search a vdev with the same guid and top vdev id in the source. * Intended to copy paths from userland config into MOS config. */ void vdev_copy_path_relaxed(vdev_t *srvd, vdev_t *drvd) { uint64_t children = MIN(srvd->vdev_children, drvd->vdev_children); ASSERT(srvd->vdev_ops == &vdev_root_ops); ASSERT(drvd->vdev_ops == &vdev_root_ops); for (uint64_t i = 0; i < children; i++) { vdev_copy_path_search(srvd->vdev_child[i], drvd->vdev_child[i]); } } /* * Close a virtual device. */ void vdev_close(vdev_t *vd) { vdev_t *pvd = vd->vdev_parent; spa_t *spa __maybe_unused = vd->vdev_spa; ASSERT(vd != NULL); ASSERT(vd->vdev_open_thread == curthread || spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); /* * If our parent is reopening, then we are as well, unless we are * going offline. 
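 * Propagating the flag downward means a vdev_reopen() of an interior
 * vdev is treated as a reopen by every child that is not being taken
 * offline, so previously opened leaves avoid a full close/open cycle.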
*/ if (pvd != NULL && pvd->vdev_reopening) vd->vdev_reopening = (pvd->vdev_reopening && !vd->vdev_offline); vd->vdev_ops->vdev_op_close(vd); vdev_cache_purge(vd); /* * We record the previous state before we close it, so that if we are * doing a reopen(), we don't generate FMA ereports if we notice that * it's still faulted. */ vd->vdev_prevstate = vd->vdev_state; if (vd->vdev_offline) vd->vdev_state = VDEV_STATE_OFFLINE; else vd->vdev_state = VDEV_STATE_CLOSED; vd->vdev_stat.vs_aux = VDEV_AUX_NONE; } void vdev_hold(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_is_root(spa)); if (spa->spa_state == POOL_STATE_UNINITIALIZED) return; for (int c = 0; c < vd->vdev_children; c++) vdev_hold(vd->vdev_child[c]); if (vd->vdev_ops->vdev_op_leaf && vd->vdev_ops->vdev_op_hold != NULL) vd->vdev_ops->vdev_op_hold(vd); } void vdev_rele(vdev_t *vd) { ASSERT(spa_is_root(vd->vdev_spa)); for (int c = 0; c < vd->vdev_children; c++) vdev_rele(vd->vdev_child[c]); if (vd->vdev_ops->vdev_op_leaf && vd->vdev_ops->vdev_op_rele != NULL) vd->vdev_ops->vdev_op_rele(vd); } /* * Reopen all interior vdevs and any unopened leaves. We don't actually * reopen leaf vdevs which had previously been opened as they might deadlock * on the spa_config_lock. Instead we only obtain the leaf's physical size. * If the leaf has never been opened then open it, as usual. */ void vdev_reopen(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); /* set the reopening flag unless we're taking the vdev offline */ vd->vdev_reopening = !vd->vdev_offline; vdev_close(vd); (void) vdev_open(vd); /* * Call vdev_validate() here to make sure we have the same device. * Otherwise, a device with an invalid label could be successfully * opened in response to vdev_reopen(). */ if (vd->vdev_aux) { (void) vdev_validate_aux(vd); if (vdev_readable(vd) && vdev_writeable(vd) && vd->vdev_aux == &spa->spa_l2cache) { /* * In case the vdev is present we should evict all ARC * buffers and pointers to log blocks and reclaim their * space before restoring its contents to L2ARC. */ if (l2arc_vdev_present(vd)) { l2arc_rebuild_vdev(vd, B_TRUE); } else { l2arc_add_vdev(spa, vd); } spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD); spa_async_request(spa, SPA_ASYNC_L2CACHE_TRIM); } } else { (void) vdev_validate(vd); } /* * Reassess parent vdev's health. */ vdev_propagate_state(vd); } int vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing) { int error; /* * Normally, partial opens (e.g. of a mirror) are allowed. * For a create, however, we want to fail the request if * there are any components we can't open. */ error = vdev_open(vd); if (error || vd->vdev_state != VDEV_STATE_HEALTHY) { vdev_close(vd); return (error ? error : SET_ERROR(ENXIO)); } /* * Recursively load DTLs and initialize all labels. */ if ((error = vdev_dtl_load(vd)) != 0 || (error = vdev_label_init(vd, txg, isreplacing ? VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) { vdev_close(vd); return (error); } return (0); } void vdev_metaslab_set_size(vdev_t *vd) { uint64_t asize = vd->vdev_asize; uint64_t ms_count = asize >> zfs_vdev_default_ms_shift; uint64_t ms_shift; /* * There are two dimensions to the metaslab sizing calculation: * the size of the metaslab and the count of metaslabs per vdev. 
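 *
 * (As a worked illustration under the default tunables described
 * below: a 100 GiB top-level vdev computes asize >> 29 == 200
 * candidate metaslabs, which is within the allowed count range, so
 * 512 MiB metaslabs are used; a 10 TiB vdev would exceed the default
 * count, so the metaslab size is grown instead, up to the 16 GiB cap.)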
* * The default values used below are a good balance between memory * usage (larger metaslab size means more memory needed for loaded * metaslabs; more metaslabs means more memory needed for the * metaslab_t structs), metaslab load time (larger metaslabs take * longer to load), and metaslab sync time (more metaslabs means * more time spent syncing all of them). * * In general, we aim for zfs_vdev_default_ms_count (200) metaslabs. * The range of the dimensions are as follows: * * 2^29 <= ms_size <= 2^34 * 16 <= ms_count <= 131,072 * * On the lower end of vdev sizes, we aim for metaslabs sizes of * at least 512MB (2^29) to minimize fragmentation effects when * testing with smaller devices. However, the count constraint * of at least 16 metaslabs will override this minimum size goal. * * On the upper end of vdev sizes, we aim for a maximum metaslab * size of 16GB. However, we will cap the total count to 2^17 * metaslabs to keep our memory footprint in check and let the * metaslab size grow from there if that limit is hit. * * The net effect of applying above constrains is summarized below. * * vdev size metaslab count * --------------|----------------- * < 8GB ~16 * 8GB - 100GB one per 512MB * 100GB - 3TB ~200 * 3TB - 2PB one per 16GB * > 2PB ~131,072 * -------------------------------- * * Finally, note that all of the above calculate the initial * number of metaslabs. Expanding a top-level vdev will result * in additional metaslabs being allocated making it possible * to exceed the zfs_vdev_ms_count_limit. */ if (ms_count < zfs_vdev_min_ms_count) ms_shift = highbit64(asize / zfs_vdev_min_ms_count); else if (ms_count > zfs_vdev_default_ms_count) ms_shift = highbit64(asize / zfs_vdev_default_ms_count); else ms_shift = zfs_vdev_default_ms_shift; if (ms_shift < SPA_MAXBLOCKSHIFT) { ms_shift = SPA_MAXBLOCKSHIFT; } else if (ms_shift > zfs_vdev_max_ms_shift) { ms_shift = zfs_vdev_max_ms_shift; /* cap the total count to constrain memory footprint */ if ((asize >> ms_shift) > zfs_vdev_ms_count_limit) ms_shift = highbit64(asize / zfs_vdev_ms_count_limit); } vd->vdev_ms_shift = ms_shift; ASSERT3U(vd->vdev_ms_shift, >=, SPA_MAXBLOCKSHIFT); } void vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg) { ASSERT(vd == vd->vdev_top); /* indirect vdevs don't have metaslabs or dtls */ ASSERT(vdev_is_concrete(vd) || flags == 0); ASSERT(ISP2(flags)); ASSERT(spa_writeable(vd->vdev_spa)); if (flags & VDD_METASLAB) (void) txg_list_add(&vd->vdev_ms_list, arg, txg); if (flags & VDD_DTL) (void) txg_list_add(&vd->vdev_dtl_list, arg, txg); (void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg); } void vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg) { for (int c = 0; c < vd->vdev_children; c++) vdev_dirty_leaves(vd->vdev_child[c], flags, txg); if (vd->vdev_ops->vdev_op_leaf) vdev_dirty(vd->vdev_top, flags, vd, txg); } /* * DTLs. * * A vdev's DTL (dirty time log) is the set of transaction groups for which * the vdev has less than perfect replication. There are four kinds of DTL: * * DTL_MISSING: txgs for which the vdev has no valid copies of the data * * DTL_PARTIAL: txgs for which data is available, but not fully replicated * * DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon * scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of * txgs that was scrubbed. * * DTL_OUTAGE: txgs which cannot currently be read, whether due to * persistent errors or just some device being offline. 
* Unlike the other three, the DTL_OUTAGE map is not generally * maintained; it's only computed when needed, typically to * determine whether a device can be detached. * * For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device * either has the data or it doesn't. * * For interior vdevs such as mirror and RAID-Z the picture is more complex. * A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because * if any child is less than fully replicated, then so is its parent. * A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs, * comprising only those txgs which appear in 'maxfaults' or more children; * those are the txgs we don't have enough replication to read. For example, * double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2); * thus, its DTL_MISSING consists of the set of txgs that appear in more than * two child DTL_MISSING maps. * * It should be clear from the above that to compute the DTLs and outage maps * for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps. * Therefore, that is all we keep on disk. When loading the pool, or after * a configuration change, we generate all other DTLs from first principles. */ void vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) { range_tree_t *rt = vd->vdev_dtl[t]; ASSERT(t < DTL_TYPES); ASSERT(vd != vd->vdev_spa->spa_root_vdev); ASSERT(spa_writeable(vd->vdev_spa)); mutex_enter(&vd->vdev_dtl_lock); if (!range_tree_contains(rt, txg, size)) range_tree_add(rt, txg, size); mutex_exit(&vd->vdev_dtl_lock); } boolean_t vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) { range_tree_t *rt = vd->vdev_dtl[t]; boolean_t dirty = B_FALSE; ASSERT(t < DTL_TYPES); ASSERT(vd != vd->vdev_spa->spa_root_vdev); /* * While we are loading the pool, the DTLs have not been loaded yet. * This isn't a problem but it can result in devices being tried * which are known to not have the data. In which case, the import * is relying on the checksum to ensure that we get the right data. * Note that while importing we are only reading the MOS, which is * always checksummed. */ mutex_enter(&vd->vdev_dtl_lock); if (!range_tree_is_empty(rt)) dirty = range_tree_contains(rt, txg, size); mutex_exit(&vd->vdev_dtl_lock); return (dirty); } boolean_t vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t) { range_tree_t *rt = vd->vdev_dtl[t]; boolean_t empty; mutex_enter(&vd->vdev_dtl_lock); empty = range_tree_is_empty(rt); mutex_exit(&vd->vdev_dtl_lock); return (empty); } /* * Check if the txg falls within the range which must be * resilvered. DVAs outside this range can always be skipped. */ boolean_t vdev_default_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, uint64_t phys_birth) { /* Set by sequential resilver. */ if (phys_birth == TXG_UNKNOWN) return (B_TRUE); return (vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1)); } /* * Returns B_TRUE if the vdev determines the DVA needs to be resilvered. */ boolean_t vdev_dtl_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, uint64_t phys_birth) { ASSERT(vd != vd->vdev_spa->spa_root_vdev); if (vd->vdev_ops->vdev_op_need_resilver == NULL || vd->vdev_ops->vdev_op_leaf) return (B_TRUE); return (vd->vdev_ops->vdev_op_need_resilver(vd, dva, psize, phys_birth)); } /* * Returns the lowest txg in the DTL range. 
*/ static uint64_t vdev_dtl_min(vdev_t *vd) { ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock)); ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0); ASSERT0(vd->vdev_children); return (range_tree_min(vd->vdev_dtl[DTL_MISSING]) - 1); } /* * Returns the highest txg in the DTL. */ static uint64_t vdev_dtl_max(vdev_t *vd) { ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock)); ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0); ASSERT0(vd->vdev_children); return (range_tree_max(vd->vdev_dtl[DTL_MISSING])); } /* * Determine if a resilvering vdev should remove any DTL entries from * its range. If the vdev was resilvering for the entire duration of the * scan then it should excise that range from its DTLs. Otherwise, this * vdev is considered partially resilvered and should leave its DTL * entries intact. The comment in vdev_dtl_reassess() describes how we * excise the DTLs. */ static boolean_t vdev_dtl_should_excise(vdev_t *vd, boolean_t rebuild_done) { ASSERT0(vd->vdev_children); if (vd->vdev_state < VDEV_STATE_DEGRADED) return (B_FALSE); if (vd->vdev_resilver_deferred) return (B_FALSE); if (range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) return (B_TRUE); if (rebuild_done) { vdev_rebuild_t *vr = &vd->vdev_top->vdev_rebuild_config; vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; /* Rebuild not initiated by attach */ if (vd->vdev_rebuild_txg == 0) return (B_TRUE); /* * When a rebuild completes without error then all missing data * up to the rebuild max txg has been reconstructed and the DTL * is eligible for excision. */ if (vrp->vrp_rebuild_state == VDEV_REBUILD_COMPLETE && vdev_dtl_max(vd) <= vrp->vrp_max_txg) { ASSERT3U(vrp->vrp_min_txg, <=, vdev_dtl_min(vd)); ASSERT3U(vrp->vrp_min_txg, <, vd->vdev_rebuild_txg); ASSERT3U(vd->vdev_rebuild_txg, <=, vrp->vrp_max_txg); return (B_TRUE); } } else { dsl_scan_t *scn = vd->vdev_spa->spa_dsl_pool->dp_scan; dsl_scan_phys_t *scnp __maybe_unused = &scn->scn_phys; /* Resilver not initiated by attach */ if (vd->vdev_resilver_txg == 0) return (B_TRUE); /* * When a resilver is initiated the scan will assign the * scn_max_txg value to the highest txg value that exists * in all DTLs. If this device's max DTL is not part of this * scan (i.e. it is not in the range (scn_min_txg, scn_max_txg] * then it is not eligible for excision. */ if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) { ASSERT3U(scnp->scn_min_txg, <=, vdev_dtl_min(vd)); ASSERT3U(scnp->scn_min_txg, <, vd->vdev_resilver_txg); ASSERT3U(vd->vdev_resilver_txg, <=, scnp->scn_max_txg); return (B_TRUE); } } return (B_FALSE); } /* * Reassess DTLs after a config change or scrub completion. If txg == 0 no * write operations will be issued to the pool. */ void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, boolean_t scrub_done, boolean_t rebuild_done) { spa_t *spa = vd->vdev_spa; avl_tree_t reftree; int minref; ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); for (int c = 0; c < vd->vdev_children; c++) vdev_dtl_reassess(vd->vdev_child[c], txg, scrub_txg, scrub_done, rebuild_done); if (vd == spa->spa_root_vdev || !vdev_is_concrete(vd) || vd->vdev_aux) return; if (vd->vdev_ops->vdev_op_leaf) { dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; vdev_rebuild_t *vr = &vd->vdev_top->vdev_rebuild_config; boolean_t check_excise = B_FALSE; boolean_t wasempty = B_TRUE; mutex_enter(&vd->vdev_dtl_lock); /* * If requested, pretend the scan or rebuild completed cleanly. 
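 * "If requested" means the zfs_scan_ignore_errors tunable is set; it
 * zeroes the recorded error counts so that the DTL excision below can
 * proceed even though the scan or rebuild hit errors.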
*/ if (zfs_scan_ignore_errors) { if (scn != NULL) scn->scn_phys.scn_errors = 0; if (vr != NULL) vr->vr_rebuild_phys.vrp_errors = 0; } if (scrub_txg != 0 && !range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) { wasempty = B_FALSE; zfs_dbgmsg("guid:%llu txg:%llu scrub:%llu started:%d " "dtl:%llu/%llu errors:%llu", (u_longlong_t)vd->vdev_guid, (u_longlong_t)txg, (u_longlong_t)scrub_txg, spa->spa_scrub_started, (u_longlong_t)vdev_dtl_min(vd), (u_longlong_t)vdev_dtl_max(vd), (u_longlong_t)(scn ? scn->scn_phys.scn_errors : 0)); } /* * If we've completed a scrub/resilver or a rebuild cleanly * then determine if this vdev should remove any DTLs. We * only want to excise regions on vdevs that were available * during the entire duration of this scan. */ if (rebuild_done && vr != NULL && vr->vr_rebuild_phys.vrp_errors == 0) { check_excise = B_TRUE; } else { if (spa->spa_scrub_started || (scn != NULL && scn->scn_phys.scn_errors == 0)) { check_excise = B_TRUE; } } if (scrub_txg && check_excise && vdev_dtl_should_excise(vd, rebuild_done)) { /* * We completed a scrub, resilver or rebuild up to * scrub_txg. If we did it without rebooting, then * the scrub dtl will be valid, so excise the old * region and fold in the scrub dtl. Otherwise, * leave the dtl as-is if there was an error. * * There's little trick here: to excise the beginning * of the DTL_MISSING map, we put it into a reference * tree and then add a segment with refcnt -1 that * covers the range [0, scrub_txg). This means * that each txg in that range has refcnt -1 or 0. * We then add DTL_SCRUB with a refcnt of 2, so that * entries in the range [0, scrub_txg) will have a * positive refcnt -- either 1 or 2. We then convert * the reference tree into the new DTL_MISSING map. */ space_reftree_create(&reftree); space_reftree_add_map(&reftree, vd->vdev_dtl[DTL_MISSING], 1); space_reftree_add_seg(&reftree, 0, scrub_txg, -1); space_reftree_add_map(&reftree, vd->vdev_dtl[DTL_SCRUB], 2); space_reftree_generate_map(&reftree, vd->vdev_dtl[DTL_MISSING], 1); space_reftree_destroy(&reftree); if (!range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) { zfs_dbgmsg("update DTL_MISSING:%llu/%llu", (u_longlong_t)vdev_dtl_min(vd), (u_longlong_t)vdev_dtl_max(vd)); } else if (!wasempty) { zfs_dbgmsg("DTL_MISSING is now empty"); } } range_tree_vacate(vd->vdev_dtl[DTL_PARTIAL], NULL, NULL); range_tree_walk(vd->vdev_dtl[DTL_MISSING], range_tree_add, vd->vdev_dtl[DTL_PARTIAL]); if (scrub_done) range_tree_vacate(vd->vdev_dtl[DTL_SCRUB], NULL, NULL); range_tree_vacate(vd->vdev_dtl[DTL_OUTAGE], NULL, NULL); if (!vdev_readable(vd)) range_tree_add(vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL); else range_tree_walk(vd->vdev_dtl[DTL_MISSING], range_tree_add, vd->vdev_dtl[DTL_OUTAGE]); /* * If the vdev was resilvering or rebuilding and no longer * has any DTLs then reset the appropriate flag and dirty * the top level so that we persist the change. */ if (txg != 0 && range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) && range_tree_is_empty(vd->vdev_dtl[DTL_OUTAGE])) { if (vd->vdev_rebuild_txg != 0) { vd->vdev_rebuild_txg = 0; vdev_config_dirty(vd->vdev_top); } else if (vd->vdev_resilver_txg != 0) { vd->vdev_resilver_txg = 0; vdev_config_dirty(vd->vdev_top); } } mutex_exit(&vd->vdev_dtl_lock); if (txg != 0) vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg); return; } mutex_enter(&vd->vdev_dtl_lock); for (int t = 0; t < DTL_TYPES; t++) { /* account for child's outage in parent's missing map */ int s = (t == DTL_MISSING) ? 
DTL_OUTAGE: t; if (t == DTL_SCRUB) continue; /* leaf vdevs only */ if (t == DTL_PARTIAL) minref = 1; /* i.e. non-zero */ else if (vdev_get_nparity(vd) != 0) minref = vdev_get_nparity(vd) + 1; /* RAID-Z, dRAID */ else minref = vd->vdev_children; /* any kind of mirror */ space_reftree_create(&reftree); for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; mutex_enter(&cvd->vdev_dtl_lock); space_reftree_add_map(&reftree, cvd->vdev_dtl[s], 1); mutex_exit(&cvd->vdev_dtl_lock); } space_reftree_generate_map(&reftree, vd->vdev_dtl[t], minref); space_reftree_destroy(&reftree); } mutex_exit(&vd->vdev_dtl_lock); } int vdev_dtl_load(vdev_t *vd) { spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; range_tree_t *rt; int error = 0; if (vd->vdev_ops->vdev_op_leaf && vd->vdev_dtl_object != 0) { ASSERT(vdev_is_concrete(vd)); error = space_map_open(&vd->vdev_dtl_sm, mos, vd->vdev_dtl_object, 0, -1ULL, 0); if (error) return (error); ASSERT(vd->vdev_dtl_sm != NULL); rt = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); error = space_map_load(vd->vdev_dtl_sm, rt, SM_ALLOC); if (error == 0) { mutex_enter(&vd->vdev_dtl_lock); range_tree_walk(rt, range_tree_add, vd->vdev_dtl[DTL_MISSING]); mutex_exit(&vd->vdev_dtl_lock); } range_tree_vacate(rt, NULL, NULL); range_tree_destroy(rt); return (error); } for (int c = 0; c < vd->vdev_children; c++) { error = vdev_dtl_load(vd->vdev_child[c]); if (error != 0) break; } return (error); } static void vdev_zap_allocation_data(vdev_t *vd, dmu_tx_t *tx) { spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; vdev_alloc_bias_t alloc_bias = vd->vdev_alloc_bias; const char *string; ASSERT(alloc_bias != VDEV_BIAS_NONE); string = (alloc_bias == VDEV_BIAS_LOG) ? VDEV_ALLOC_BIAS_LOG : (alloc_bias == VDEV_BIAS_SPECIAL) ? VDEV_ALLOC_BIAS_SPECIAL : (alloc_bias == VDEV_BIAS_DEDUP) ? 
VDEV_ALLOC_BIAS_DEDUP : NULL; ASSERT(string != NULL); VERIFY0(zap_add(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_ALLOCATION_BIAS, 1, strlen(string) + 1, string, tx)); if (alloc_bias == VDEV_BIAS_SPECIAL || alloc_bias == VDEV_BIAS_DEDUP) { spa_activate_allocation_classes(spa, tx); } } void vdev_destroy_unlink_zap(vdev_t *vd, uint64_t zapobj, dmu_tx_t *tx) { spa_t *spa = vd->vdev_spa; VERIFY0(zap_destroy(spa->spa_meta_objset, zapobj, tx)); VERIFY0(zap_remove_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, zapobj, tx)); } uint64_t vdev_create_link_zap(vdev_t *vd, dmu_tx_t *tx) { spa_t *spa = vd->vdev_spa; uint64_t zap = zap_create(spa->spa_meta_objset, DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx); ASSERT(zap != 0); VERIFY0(zap_add_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, zap, tx)); return (zap); } void vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx) { if (vd->vdev_ops != &vdev_hole_ops && vd->vdev_ops != &vdev_missing_ops && vd->vdev_ops != &vdev_root_ops && !vd->vdev_top->vdev_removing) { if (vd->vdev_ops->vdev_op_leaf && vd->vdev_leaf_zap == 0) { vd->vdev_leaf_zap = vdev_create_link_zap(vd, tx); } if (vd == vd->vdev_top && vd->vdev_top_zap == 0) { vd->vdev_top_zap = vdev_create_link_zap(vd, tx); if (vd->vdev_alloc_bias != VDEV_BIAS_NONE) vdev_zap_allocation_data(vd, tx); } } for (uint64_t i = 0; i < vd->vdev_children; i++) { vdev_construct_zaps(vd->vdev_child[i], tx); } } static void vdev_dtl_sync(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; range_tree_t *rt = vd->vdev_dtl[DTL_MISSING]; objset_t *mos = spa->spa_meta_objset; range_tree_t *rtsync; dmu_tx_t *tx; uint64_t object = space_map_object(vd->vdev_dtl_sm); ASSERT(vdev_is_concrete(vd)); ASSERT(vd->vdev_ops->vdev_op_leaf); tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); if (vd->vdev_detached || vd->vdev_top->vdev_removing) { mutex_enter(&vd->vdev_dtl_lock); space_map_free(vd->vdev_dtl_sm, tx); space_map_close(vd->vdev_dtl_sm); vd->vdev_dtl_sm = NULL; mutex_exit(&vd->vdev_dtl_lock); /* * We only destroy the leaf ZAP for detached leaves or for * removed log devices. Removed data devices handle leaf ZAP * cleanup later, once cancellation is no longer possible. */ if (vd->vdev_leaf_zap != 0 && (vd->vdev_detached || vd->vdev_top->vdev_islog)) { vdev_destroy_unlink_zap(vd, vd->vdev_leaf_zap, tx); vd->vdev_leaf_zap = 0; } dmu_tx_commit(tx); return; } if (vd->vdev_dtl_sm == NULL) { uint64_t new_object; new_object = space_map_alloc(mos, zfs_vdev_dtl_sm_blksz, tx); VERIFY3U(new_object, !=, 0); VERIFY0(space_map_open(&vd->vdev_dtl_sm, mos, new_object, 0, -1ULL, 0)); ASSERT(vd->vdev_dtl_sm != NULL); } rtsync = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); mutex_enter(&vd->vdev_dtl_lock); range_tree_walk(rt, range_tree_add, rtsync); mutex_exit(&vd->vdev_dtl_lock); space_map_truncate(vd->vdev_dtl_sm, zfs_vdev_dtl_sm_blksz, tx); space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, SM_NO_VDEVID, tx); range_tree_vacate(rtsync, NULL, NULL); range_tree_destroy(rtsync); /* * If the object for the space map has changed then dirty * the top level so that we update the config. */ if (object != space_map_object(vd->vdev_dtl_sm)) { vdev_dbgmsg(vd, "txg %llu, spa %s, DTL old object %llu, " "new object %llu", (u_longlong_t)txg, spa_name(spa), (u_longlong_t)object, (u_longlong_t)space_map_object(vd->vdev_dtl_sm)); vdev_config_dirty(vd->vdev_top); } dmu_tx_commit(tx); } /* * Determine whether the specified vdev can be offlined/detached/removed * without losing data. 
*/ boolean_t vdev_dtl_required(vdev_t *vd) { spa_t *spa = vd->vdev_spa; vdev_t *tvd = vd->vdev_top; uint8_t cant_read = vd->vdev_cant_read; boolean_t required; ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); if (vd == spa->spa_root_vdev || vd == tvd) return (B_TRUE); /* * Temporarily mark the device as unreadable, and then determine * whether this results in any DTL outages in the top-level vdev. * If not, we can safely offline/detach/remove the device. */ vd->vdev_cant_read = B_TRUE; vdev_dtl_reassess(tvd, 0, 0, B_FALSE, B_FALSE); required = !vdev_dtl_empty(tvd, DTL_OUTAGE); vd->vdev_cant_read = cant_read; vdev_dtl_reassess(tvd, 0, 0, B_FALSE, B_FALSE); if (!required && zio_injection_enabled) { required = !!zio_handle_device_injection(vd, NULL, SET_ERROR(ECHILD)); } return (required); } /* * Determine if resilver is needed, and if so the txg range. */ boolean_t vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp) { boolean_t needed = B_FALSE; uint64_t thismin = UINT64_MAX; uint64_t thismax = 0; if (vd->vdev_children == 0) { mutex_enter(&vd->vdev_dtl_lock); if (!range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) && vdev_writeable(vd)) { thismin = vdev_dtl_min(vd); thismax = vdev_dtl_max(vd); needed = B_TRUE; } mutex_exit(&vd->vdev_dtl_lock); } else { for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; uint64_t cmin, cmax; if (vdev_resilver_needed(cvd, &cmin, &cmax)) { thismin = MIN(thismin, cmin); thismax = MAX(thismax, cmax); needed = B_TRUE; } } } if (needed && minp) { *minp = thismin; *maxp = thismax; } return (needed); } /* * Gets the checkpoint space map object from the vdev's ZAP. On success sm_obj * will contain either the checkpoint spacemap object or zero if none exists. * All other errors are returned to the caller. */ int vdev_checkpoint_sm_object(vdev_t *vd, uint64_t *sm_obj) { ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER)); if (vd->vdev_top_zap == 0) { *sm_obj = 0; return (0); } int error = zap_lookup(spa_meta_objset(vd->vdev_spa), vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1, sm_obj); if (error == ENOENT) { *sm_obj = 0; error = 0; } return (error); } int vdev_load(vdev_t *vd) { int children = vd->vdev_children; int error = 0; taskq_t *tq = NULL; /* * It's only worthwhile to use the taskq for the root vdev, because the * slow part is metaslab_init, and that only happens for top-level * vdevs. */ if (vd->vdev_ops == &vdev_root_ops && vd->vdev_children > 0) { tq = taskq_create("vdev_load", children, minclsyspri, children, children, TASKQ_PREPOPULATE); } /* * Recursively load all children. 
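 * As in vdev_open_children_impl(), children backed by zvols are
 * loaded inline rather than dispatched to the taskq, mirroring the
 * single-threaded handling used to avoid deadlocks on the
 * spa_namespace_lock in the open path.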
*/ for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; if (tq == NULL || vdev_uses_zvols(cvd)) { cvd->vdev_load_error = vdev_load(cvd); } else { VERIFY(taskq_dispatch(tq, vdev_load_child, cvd, TQ_SLEEP) != TASKQID_INVALID); } } if (tq != NULL) { taskq_wait(tq); taskq_destroy(tq); } for (int c = 0; c < vd->vdev_children; c++) { int error = vd->vdev_child[c]->vdev_load_error; if (error != 0) return (error); } vdev_set_deflate_ratio(vd); /* * On spa_load path, grab the allocation bias from our zap */ if (vd == vd->vdev_top && vd->vdev_top_zap != 0) { spa_t *spa = vd->vdev_spa; char bias_str[64]; error = zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_ALLOCATION_BIAS, 1, sizeof (bias_str), bias_str); if (error == 0) { ASSERT(vd->vdev_alloc_bias == VDEV_BIAS_NONE); vd->vdev_alloc_bias = vdev_derive_alloc_bias(bias_str); } else if (error != ENOENT) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); vdev_dbgmsg(vd, "vdev_load: zap_lookup(top_zap=%llu) " "failed [error=%d]", vd->vdev_top_zap, error); return (error); } } /* * Load any rebuild state from the top-level vdev zap. */ if (vd == vd->vdev_top && vd->vdev_top_zap != 0) { error = vdev_rebuild_load(vd); if (error && error != ENOTSUP) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); vdev_dbgmsg(vd, "vdev_load: vdev_rebuild_load " "failed [error=%d]", error); return (error); } } /* * If this is a top-level vdev, initialize its metaslabs. */ if (vd == vd->vdev_top && vdev_is_concrete(vd)) { vdev_metaslab_group_create(vd); if (vd->vdev_ashift == 0 || vd->vdev_asize == 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); vdev_dbgmsg(vd, "vdev_load: invalid size. ashift=%llu, " "asize=%llu", (u_longlong_t)vd->vdev_ashift, (u_longlong_t)vd->vdev_asize); return (SET_ERROR(ENXIO)); } error = vdev_metaslab_init(vd, 0); if (error != 0) { vdev_dbgmsg(vd, "vdev_load: metaslab_init failed " "[error=%d]", error); vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); return (error); } uint64_t checkpoint_sm_obj; error = vdev_checkpoint_sm_object(vd, &checkpoint_sm_obj); if (error == 0 && checkpoint_sm_obj != 0) { objset_t *mos = spa_meta_objset(vd->vdev_spa); ASSERT(vd->vdev_asize != 0); ASSERT3P(vd->vdev_checkpoint_sm, ==, NULL); error = space_map_open(&vd->vdev_checkpoint_sm, mos, checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift); if (error != 0) { vdev_dbgmsg(vd, "vdev_load: space_map_open " "failed for checkpoint spacemap (obj %llu) " "[error=%d]", (u_longlong_t)checkpoint_sm_obj, error); return (error); } ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); /* * Since the checkpoint_sm contains free entries * exclusively we can use space_map_allocated() to * indicate the cumulative checkpointed space that * has been freed. */ vd->vdev_stat.vs_checkpoint_space = -space_map_allocated(vd->vdev_checkpoint_sm); vd->vdev_spa->spa_checkpoint_info.sci_dspace += vd->vdev_stat.vs_checkpoint_space; } else if (error != 0) { vdev_dbgmsg(vd, "vdev_load: failed to retrieve " "checkpoint space map object from vdev ZAP " "[error=%d]", error); return (error); } } /* * If this is a leaf vdev, load its DTL. 
*/ if (vd->vdev_ops->vdev_op_leaf && (error = vdev_dtl_load(vd)) != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); vdev_dbgmsg(vd, "vdev_load: vdev_dtl_load failed " "[error=%d]", error); return (error); } uint64_t obsolete_sm_object; error = vdev_obsolete_sm_object(vd, &obsolete_sm_object); if (error == 0 && obsolete_sm_object != 0) { objset_t *mos = vd->vdev_spa->spa_meta_objset; ASSERT(vd->vdev_asize != 0); ASSERT3P(vd->vdev_obsolete_sm, ==, NULL); if ((error = space_map_open(&vd->vdev_obsolete_sm, mos, obsolete_sm_object, 0, vd->vdev_asize, 0))) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); vdev_dbgmsg(vd, "vdev_load: space_map_open failed for " "obsolete spacemap (obj %llu) [error=%d]", (u_longlong_t)obsolete_sm_object, error); return (error); } } else if (error != 0) { vdev_dbgmsg(vd, "vdev_load: failed to retrieve obsolete " "space map object from vdev ZAP [error=%d]", error); return (error); } return (0); } /* * The special vdev case is used for hot spares and l2cache devices. Its * sole purpose it to set the vdev state for the associated vdev. To do this, * we make sure that we can open the underlying device, then try to read the * label, and make sure that the label is sane and that it hasn't been * repurposed to another pool. */ int vdev_validate_aux(vdev_t *vd) { nvlist_t *label; uint64_t guid, version; uint64_t state; if (!vdev_readable(vd)) return (0); if ((label = vdev_label_read_config(vd, -1ULL)) == NULL) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); return (-1); } if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 || !SPA_VERSION_IS_SUPPORTED(version) || nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 || guid != vd->vdev_guid || nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); return (-1); } /* * We don't actually check the pool state here. If it's in fact in * use by another pool, we update this fact on the fly when requested. */ nvlist_free(label); return (0); } static void vdev_destroy_ms_flush_data(vdev_t *vd, dmu_tx_t *tx) { objset_t *mos = spa_meta_objset(vd->vdev_spa); if (vd->vdev_top_zap == 0) return; uint64_t object = 0; int err = zap_lookup(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1, &object); if (err == ENOENT) return; VERIFY0(err); VERIFY0(dmu_object_free(mos, object, tx)); VERIFY0(zap_remove(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, tx)); } /* * Free the objects used to store this vdev's spacemaps, and the array * that points to them. 
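 * The object array (vdev_ms_array) holds one space map object number
 * per metaslab; each referenced space map object is freed first and
 * then the array object itself is destroyed.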
*/ void vdev_destroy_spacemaps(vdev_t *vd, dmu_tx_t *tx) { if (vd->vdev_ms_array == 0) return; objset_t *mos = vd->vdev_spa->spa_meta_objset; uint64_t array_count = vd->vdev_asize >> vd->vdev_ms_shift; size_t array_bytes = array_count * sizeof (uint64_t); uint64_t *smobj_array = kmem_alloc(array_bytes, KM_SLEEP); VERIFY0(dmu_read(mos, vd->vdev_ms_array, 0, array_bytes, smobj_array, 0)); for (uint64_t i = 0; i < array_count; i++) { uint64_t smobj = smobj_array[i]; if (smobj == 0) continue; space_map_free_obj(mos, smobj, tx); } kmem_free(smobj_array, array_bytes); VERIFY0(dmu_object_free(mos, vd->vdev_ms_array, tx)); vdev_destroy_ms_flush_data(vd, tx); vd->vdev_ms_array = 0; } static void vdev_remove_empty_log(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; ASSERT(vd->vdev_islog); ASSERT(vd == vd->vdev_top); ASSERT3U(txg, ==, spa_syncing_txg(spa)); dmu_tx_t *tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); vdev_destroy_spacemaps(vd, tx); if (vd->vdev_top_zap != 0) { vdev_destroy_unlink_zap(vd, vd->vdev_top_zap, tx); vd->vdev_top_zap = 0; } dmu_tx_commit(tx); } void vdev_sync_done(vdev_t *vd, uint64_t txg) { metaslab_t *msp; boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg)); ASSERT(vdev_is_concrete(vd)); while ((msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg))) != NULL) metaslab_sync_done(msp, txg); if (reassess) { metaslab_sync_reassess(vd->vdev_mg); if (vd->vdev_log_mg != NULL) metaslab_sync_reassess(vd->vdev_log_mg); } } void vdev_sync(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; vdev_t *lvd; metaslab_t *msp; ASSERT3U(txg, ==, spa->spa_syncing_txg); dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); if (range_tree_space(vd->vdev_obsolete_segments) > 0) { ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops); vdev_indirect_sync_obsolete(vd, tx); /* * If the vdev is indirect, it can't have dirty * metaslabs or DTLs. */ if (vd->vdev_ops == &vdev_indirect_ops) { ASSERT(txg_list_empty(&vd->vdev_ms_list, txg)); ASSERT(txg_list_empty(&vd->vdev_dtl_list, txg)); dmu_tx_commit(tx); return; } } ASSERT(vdev_is_concrete(vd)); if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0 && !vd->vdev_removing) { ASSERT(vd == vd->vdev_top); ASSERT0(vd->vdev_indirect_config.vic_mapping_object); vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset, DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx); ASSERT(vd->vdev_ms_array != 0); vdev_config_dirty(vd); } while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) { metaslab_sync(msp, txg); (void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg)); } while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL) vdev_dtl_sync(lvd, txg); /* * If this is an empty log device being removed, destroy the * metadata associated with it. */ if (vd->vdev_islog && vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing) vdev_remove_empty_log(vd, txg); (void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg)); dmu_tx_commit(tx); } uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize) { return (vd->vdev_ops->vdev_op_asize(vd, psize)); } /* * Mark the given vdev faulted. A faulted vdev behaves as if the device could * not be opened, and no I/O is attempted. 
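 * If faulting the device would leave some data with no valid copy,
 * the fault is downgraded and the vdev is only marked degraded
 * instead (see the vdev_dtl_required() check below).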
*/ int vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux) { vdev_t *vd, *tvd; spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP))); tvd = vd->vdev_top; /* * If user did a 'zpool offline -f' then make the fault persist across * reboots. */ if (aux == VDEV_AUX_EXTERNAL_PERSIST) { /* * There are two kinds of forced faults: temporary and * persistent. Temporary faults go away at pool import, while * persistent faults stay set. Both types of faults can be * cleared with a zpool clear. * * We tell if a vdev is persistently faulted by looking at the * ZPOOL_CONFIG_AUX_STATE nvpair. If it's set to "external" at * import then it's a persistent fault. Otherwise, it's * temporary. We get ZPOOL_CONFIG_AUX_STATE set to "external" * by setting vd.vdev_stat.vs_aux to VDEV_AUX_EXTERNAL. This * tells vdev_config_generate() (which gets run later) to set * ZPOOL_CONFIG_AUX_STATE to "external" in the nvlist. */ vd->vdev_stat.vs_aux = VDEV_AUX_EXTERNAL; vd->vdev_tmpoffline = B_FALSE; aux = VDEV_AUX_EXTERNAL; } else { vd->vdev_tmpoffline = B_TRUE; } /* * We don't directly use the aux state here, but if we do a * vdev_reopen(), we need this value to be present to remember why we * were faulted. */ vd->vdev_label_aux = aux; /* * Faulted state takes precedence over degraded. */ vd->vdev_delayed_close = B_FALSE; vd->vdev_faulted = 1ULL; vd->vdev_degraded = 0ULL; vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux); /* * If this device has the only valid copy of the data, then * back off and simply mark the vdev as degraded instead. */ if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) { vd->vdev_degraded = 1ULL; vd->vdev_faulted = 0ULL; /* * If we reopen the device and it's not dead, only then do we * mark it degraded. */ vdev_reopen(tvd); if (vdev_readable(vd)) vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux); } return (spa_vdev_state_exit(spa, vd, 0)); } /* * Mark the given vdev degraded. A degraded vdev is purely an indication to the * user that something is wrong. The vdev continues to operate as normal as far * as I/O is concerned. */ int vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux) { vdev_t *vd; spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP))); /* * If the vdev is already faulted, then don't do anything. */ if (vd->vdev_faulted || vd->vdev_degraded) return (spa_vdev_state_exit(spa, NULL, 0)); vd->vdev_degraded = 1ULL; if (!vdev_is_dead(vd)) vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux); return (spa_vdev_state_exit(spa, vd, 0)); } /* * Online the given vdev. * * If 'ZFS_ONLINE_UNSPARE' is set, it implies two things. First, any attached * spare device should be detached when the device finishes resilvering. * Second, the online should be treated like a 'test' online case, so no FMA * events are generated if the device fails to open. 
*/ int vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) { vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev; boolean_t wasoffline; vdev_state_t oldstate; spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP))); wasoffline = (vd->vdev_offline || vd->vdev_tmpoffline); oldstate = vd->vdev_state; tvd = vd->vdev_top; vd->vdev_offline = B_FALSE; vd->vdev_tmpoffline = B_FALSE; vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE); vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT); /* XXX - L2ARC 1.0 does not support expansion */ if (!vd->vdev_aux) { for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) pvd->vdev_expanding = !!((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand); vd->vdev_expansion_time = gethrestime_sec(); } vdev_reopen(tvd); vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE; if (!vd->vdev_aux) { for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) pvd->vdev_expanding = B_FALSE; } if (newstate) *newstate = vd->vdev_state; if ((flags & ZFS_ONLINE_UNSPARE) && !vdev_is_dead(vd) && vd->vdev_parent && vd->vdev_parent->vdev_ops == &vdev_spare_ops && vd->vdev_parent->vdev_child[0] == vd) vd->vdev_unspare = B_TRUE; if ((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand) { /* XXX - L2ARC 1.0 does not support expansion */ if (vd->vdev_aux) return (spa_vdev_state_exit(spa, vd, ENOTSUP)); spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); } /* Restart initializing if necessary */ mutex_enter(&vd->vdev_initialize_lock); if (vdev_writeable(vd) && vd->vdev_initialize_thread == NULL && vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) { (void) vdev_initialize(vd); } mutex_exit(&vd->vdev_initialize_lock); /* * Restart trimming if necessary. We do not restart trimming for cache * devices here. This is triggered by l2arc_rebuild_vdev() * asynchronously for the whole device or in l2arc_evict() as it evicts * space for upcoming writes. */ mutex_enter(&vd->vdev_trim_lock); if (vdev_writeable(vd) && !vd->vdev_isl2cache && vd->vdev_trim_thread == NULL && vd->vdev_trim_state == VDEV_TRIM_ACTIVE) { (void) vdev_trim(vd, vd->vdev_trim_rate, vd->vdev_trim_partial, vd->vdev_trim_secure); } mutex_exit(&vd->vdev_trim_lock); if (wasoffline || (oldstate < VDEV_STATE_DEGRADED && vd->vdev_state >= VDEV_STATE_DEGRADED)) spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_ONLINE); return (spa_vdev_state_exit(spa, vd, 0)); } static int vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags) { vdev_t *vd, *tvd; int error = 0; uint64_t generation; metaslab_group_t *mg; top: spa_vdev_state_enter(spa, SCL_ALLOC); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP))); if (vd->vdev_ops == &vdev_draid_spare_ops) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); tvd = vd->vdev_top; mg = tvd->vdev_mg; generation = spa->spa_config_generation + 1; /* * If the device isn't already offline, try to offline it. */ if (!vd->vdev_offline) { /* * If this device has the only valid copy of some data, * don't allow it to be offlined. Log devices are always * expendable. 
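 * In that case the request fails with EBUSY rather than risking an
 * unreadable range of txgs.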
*/ if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(EBUSY))); /* * If the top-level is a slog and it has had allocations * then proceed. We check that the vdev's metaslab group * is not NULL since it's possible that we may have just * added this vdev but not yet initialized its metaslabs. */ if (tvd->vdev_islog && mg != NULL) { /* * Prevent any future allocations. */ ASSERT3P(tvd->vdev_log_mg, ==, NULL); metaslab_group_passivate(mg); (void) spa_vdev_state_exit(spa, vd, 0); error = spa_reset_logs(spa); /* * If the log device was successfully reset but has * checkpointed data, do not offline it. */ if (error == 0 && tvd->vdev_checkpoint_sm != NULL) { ASSERT3U(space_map_allocated( tvd->vdev_checkpoint_sm), !=, 0); error = ZFS_ERR_CHECKPOINT_EXISTS; } spa_vdev_state_enter(spa, SCL_ALLOC); /* * Check to see if the config has changed. */ if (error || generation != spa->spa_config_generation) { metaslab_group_activate(mg); if (error) return (spa_vdev_state_exit(spa, vd, error)); (void) spa_vdev_state_exit(spa, vd, 0); goto top; } ASSERT0(tvd->vdev_stat.vs_alloc); } /* * Offline this device and reopen its top-level vdev. * If the top-level vdev is a log device then just offline * it. Otherwise, if this action results in the top-level * vdev becoming unusable, undo it and fail the request. */ vd->vdev_offline = B_TRUE; vdev_reopen(tvd); if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_is_dead(tvd)) { vd->vdev_offline = B_FALSE; vdev_reopen(tvd); return (spa_vdev_state_exit(spa, NULL, SET_ERROR(EBUSY))); } /* * Add the device back into the metaslab rotor so that * once we online the device it's open for business. */ if (tvd->vdev_islog && mg != NULL) metaslab_group_activate(mg); } vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY); return (spa_vdev_state_exit(spa, vd, 0)); } int vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) { int error; mutex_enter(&spa->spa_vdev_top_lock); error = vdev_offline_locked(spa, guid, flags); mutex_exit(&spa->spa_vdev_top_lock); return (error); } /* * Clear the error counts associated with this vdev. Unlike vdev_online() and * vdev_offline(), we assume the spa config is locked. We also clear all * children. If 'vd' is NULL, then the user wants to clear all vdevs. */ void vdev_clear(spa_t *spa, vdev_t *vd) { vdev_t *rvd = spa->spa_root_vdev; ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); if (vd == NULL) vd = rvd; vd->vdev_stat.vs_read_errors = 0; vd->vdev_stat.vs_write_errors = 0; vd->vdev_stat.vs_checksum_errors = 0; vd->vdev_stat.vs_slow_ios = 0; for (int c = 0; c < vd->vdev_children; c++) vdev_clear(spa, vd->vdev_child[c]); /* * It makes no sense to "clear" an indirect vdev. */ if (!vdev_is_concrete(vd)) return; /* * If we're in the FAULTED state or have experienced failed I/O, then * clear the persistent state and attempt to reopen the device. We * also mark the vdev config dirty, so that the new faulted state is * written out to disk. */ if (vd->vdev_faulted || vd->vdev_degraded || !vdev_readable(vd) || !vdev_writeable(vd)) { /* * When reopening in response to a clear event, it may be due to * a fmadm repair request. In this case, if the device is * still broken, we want to still post the ereport again. */ vd->vdev_forcefault = B_TRUE; vd->vdev_faulted = vd->vdev_degraded = 0ULL; vd->vdev_cant_read = B_FALSE; vd->vdev_cant_write = B_FALSE; vd->vdev_stat.vs_aux = 0; vdev_reopen(vd == rvd ? 
rvd : vd->vdev_top); vd->vdev_forcefault = B_FALSE; if (vd != rvd && vdev_writeable(vd->vdev_top)) vdev_state_dirty(vd->vdev_top); /* If a resilver isn't required, check if vdevs can be culled */ if (vd->vdev_aux == NULL && !vdev_is_dead(vd) && !dsl_scan_resilvering(spa->spa_dsl_pool) && !dsl_scan_resilver_scheduled(spa->spa_dsl_pool)) spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_CLEAR); } /* * When clearing a FMA-diagnosed fault, we always want to * unspare the device, as we assume that the original spare was * done in response to the FMA fault. */ if (!vdev_is_dead(vd) && vd->vdev_parent != NULL && vd->vdev_parent->vdev_ops == &vdev_spare_ops && vd->vdev_parent->vdev_child[0] == vd) vd->vdev_unspare = B_TRUE; + + /* Clear recent error events cache (i.e. duplicate events tracking) */ + zfs_ereport_clear(spa, vd); } boolean_t vdev_is_dead(vdev_t *vd) { /* * Holes and missing devices are always considered "dead". * This simplifies the code since we don't have to check for * these types of devices in the various code paths. * Instead we rely on the fact that we skip over dead devices * before issuing I/O to them. */ return (vd->vdev_state < VDEV_STATE_DEGRADED || vd->vdev_ops == &vdev_hole_ops || vd->vdev_ops == &vdev_missing_ops); } boolean_t vdev_readable(vdev_t *vd) { return (!vdev_is_dead(vd) && !vd->vdev_cant_read); } boolean_t vdev_writeable(vdev_t *vd) { return (!vdev_is_dead(vd) && !vd->vdev_cant_write && vdev_is_concrete(vd)); } boolean_t vdev_allocatable(vdev_t *vd) { uint64_t state = vd->vdev_state; /* * We currently allow allocations from vdevs which may be in the * process of reopening (i.e. VDEV_STATE_CLOSED). If the device * fails to reopen then we'll catch it later when we're holding * the proper locks. Note that we have to get the vdev state * in a local variable because although it changes atomically, * we're asking two separate questions about it. */ return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) && !vd->vdev_cant_write && vdev_is_concrete(vd) && vd->vdev_mg->mg_initialized); } boolean_t vdev_accessible(vdev_t *vd, zio_t *zio) { ASSERT(zio->io_vd == vd); if (vdev_is_dead(vd) || vd->vdev_remove_wanted) return (B_FALSE); if (zio->io_type == ZIO_TYPE_READ) return (!vd->vdev_cant_read); if (zio->io_type == ZIO_TYPE_WRITE) return (!vd->vdev_cant_write); return (B_TRUE); } static void vdev_get_child_stat(vdev_t *cvd, vdev_stat_t *vs, vdev_stat_t *cvs) { /* * Exclude the dRAID spare when aggregating to avoid double counting * the ops and bytes. These IOs are counted by the physical leaves. 
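 * For every other child, the per-zio-type op and byte counters below are simply summed into the parent's aggregate stats.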
*/ if (cvd->vdev_ops == &vdev_draid_spare_ops) return; for (int t = 0; t < VS_ZIO_TYPES; t++) { vs->vs_ops[t] += cvs->vs_ops[t]; vs->vs_bytes[t] += cvs->vs_bytes[t]; } cvs->vs_scan_removing = cvd->vdev_removing; } /* * Get extended stats */ static void vdev_get_child_stat_ex(vdev_t *cvd, vdev_stat_ex_t *vsx, vdev_stat_ex_t *cvsx) { int t, b; for (t = 0; t < ZIO_TYPES; t++) { for (b = 0; b < ARRAY_SIZE(vsx->vsx_disk_histo[0]); b++) vsx->vsx_disk_histo[t][b] += cvsx->vsx_disk_histo[t][b]; for (b = 0; b < ARRAY_SIZE(vsx->vsx_total_histo[0]); b++) { vsx->vsx_total_histo[t][b] += cvsx->vsx_total_histo[t][b]; } } for (t = 0; t < ZIO_PRIORITY_NUM_QUEUEABLE; t++) { for (b = 0; b < ARRAY_SIZE(vsx->vsx_queue_histo[0]); b++) { vsx->vsx_queue_histo[t][b] += cvsx->vsx_queue_histo[t][b]; } vsx->vsx_active_queue[t] += cvsx->vsx_active_queue[t]; vsx->vsx_pend_queue[t] += cvsx->vsx_pend_queue[t]; for (b = 0; b < ARRAY_SIZE(vsx->vsx_ind_histo[0]); b++) vsx->vsx_ind_histo[t][b] += cvsx->vsx_ind_histo[t][b]; for (b = 0; b < ARRAY_SIZE(vsx->vsx_agg_histo[0]); b++) vsx->vsx_agg_histo[t][b] += cvsx->vsx_agg_histo[t][b]; } } boolean_t vdev_is_spacemap_addressable(vdev_t *vd) { if (spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_SPACEMAP_V2)) return (B_TRUE); /* * If double-word space map entries are not enabled we assume * 47 bits of the space map entry are dedicated to the entry's * offset (see SM_OFFSET_BITS in space_map.h). We then use that * to calculate the maximum address that can be described by a * space map entry for the given device. */ uint64_t shift = vd->vdev_ashift + SM_OFFSET_BITS; if (shift >= 63) /* detect potential overflow */ return (B_TRUE); return (vd->vdev_asize < (1ULL << shift)); } /* * Get statistics for the given vdev. */ static void vdev_get_stats_ex_impl(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) { int t; /* * If we're getting stats on the root vdev, aggregate the I/O counts * over all top-level vdevs (i.e. the direct children of the root). */ if (!vd->vdev_ops->vdev_op_leaf) { if (vs) { memset(vs->vs_ops, 0, sizeof (vs->vs_ops)); memset(vs->vs_bytes, 0, sizeof (vs->vs_bytes)); } if (vsx) memset(vsx, 0, sizeof (*vsx)); for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; vdev_stat_t *cvs = &cvd->vdev_stat; vdev_stat_ex_t *cvsx = &cvd->vdev_stat_ex; vdev_get_stats_ex_impl(cvd, cvs, cvsx); if (vs) vdev_get_child_stat(cvd, vs, cvs); if (vsx) vdev_get_child_stat_ex(cvd, vsx, cvsx); } } else { /* * We're a leaf. Just copy our ZIO active queue stats in. The * other leaf stats are updated in vdev_stat_update(). */ if (!vsx) return; memcpy(vsx, &vd->vdev_stat_ex, sizeof (vd->vdev_stat_ex)); for (t = 0; t < ARRAY_SIZE(vd->vdev_queue.vq_class); t++) { vsx->vsx_active_queue[t] = vd->vdev_queue.vq_class[t].vqc_active; vsx->vsx_pend_queue[t] = avl_numnodes( &vd->vdev_queue.vq_class[t].vqc_queued_tree); } } } void vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) { vdev_t *tvd = vd->vdev_top; mutex_enter(&vd->vdev_stat_lock); if (vs) { bcopy(&vd->vdev_stat, vs, sizeof (*vs)); vs->vs_timestamp = gethrtime() - vs->vs_timestamp; vs->vs_state = vd->vdev_state; vs->vs_rsize = vdev_get_min_asize(vd); if (vd->vdev_ops->vdev_op_leaf) { vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; /* * Report initializing progress. Since we don't * have the initializing locks held, this is only * an estimate (although a fairly accurate one). 
*/ vs->vs_initialize_bytes_done = vd->vdev_initialize_bytes_done; vs->vs_initialize_bytes_est = vd->vdev_initialize_bytes_est; vs->vs_initialize_state = vd->vdev_initialize_state; vs->vs_initialize_action_time = vd->vdev_initialize_action_time; /* * Report manual TRIM progress. Since we don't have * the manual TRIM locks held, this is only an * estimate (although fairly accurate one). */ vs->vs_trim_notsup = !vd->vdev_has_trim; vs->vs_trim_bytes_done = vd->vdev_trim_bytes_done; vs->vs_trim_bytes_est = vd->vdev_trim_bytes_est; vs->vs_trim_state = vd->vdev_trim_state; vs->vs_trim_action_time = vd->vdev_trim_action_time; /* Set when there is a deferred resilver. */ vs->vs_resilver_deferred = vd->vdev_resilver_deferred; } /* * Report expandable space on top-level, non-auxiliary devices * only. The expandable space is reported in terms of metaslab * sized units since that determines how much space the pool * can expand. */ if (vd->vdev_aux == NULL && tvd != NULL) { vs->vs_esize = P2ALIGN( vd->vdev_max_asize - vd->vdev_asize, 1ULL << tvd->vdev_ms_shift); } vs->vs_configured_ashift = vd->vdev_top != NULL ? vd->vdev_top->vdev_ashift : vd->vdev_ashift; vs->vs_logical_ashift = vd->vdev_logical_ashift; vs->vs_physical_ashift = vd->vdev_physical_ashift; /* * Report fragmentation and rebuild progress for top-level, * non-auxiliary, concrete devices. */ if (vd->vdev_aux == NULL && vd == vd->vdev_top && vdev_is_concrete(vd)) { /* * The vdev fragmentation rating doesn't take into * account the embedded slog metaslab (vdev_log_mg). * Since it's only one metaslab, it would have a tiny * impact on the overall fragmentation. */ vs->vs_fragmentation = (vd->vdev_mg != NULL) ? vd->vdev_mg->mg_fragmentation : 0; } } vdev_get_stats_ex_impl(vd, vs, vsx); mutex_exit(&vd->vdev_stat_lock); } void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs) { return (vdev_get_stats_ex(vd, vs, NULL)); } void vdev_clear_stats(vdev_t *vd) { mutex_enter(&vd->vdev_stat_lock); vd->vdev_stat.vs_space = 0; vd->vdev_stat.vs_dspace = 0; vd->vdev_stat.vs_alloc = 0; mutex_exit(&vd->vdev_stat_lock); } void vdev_scan_stat_init(vdev_t *vd) { vdev_stat_t *vs = &vd->vdev_stat; for (int c = 0; c < vd->vdev_children; c++) vdev_scan_stat_init(vd->vdev_child[c]); mutex_enter(&vd->vdev_stat_lock); vs->vs_scan_processed = 0; mutex_exit(&vd->vdev_stat_lock); } void vdev_stat_update(zio_t *zio, uint64_t psize) { spa_t *spa = zio->io_spa; vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd = zio->io_vd ? zio->io_vd : rvd; vdev_t *pvd; uint64_t txg = zio->io_txg; vdev_stat_t *vs = &vd->vdev_stat; vdev_stat_ex_t *vsx = &vd->vdev_stat_ex; zio_type_t type = zio->io_type; int flags = zio->io_flags; /* * If this i/o is a gang leader, it didn't do any actual work. */ if (zio->io_gang_tree) return; if (zio->io_error == 0) { /* * If this is a root i/o, don't count it -- we've already * counted the top-level vdevs, and vdev_get_stats() will * aggregate them when asked. This reduces contention on * the root vdev_stat_lock and implicitly handles blocks * that compress away to holes, for which there is no i/o. * (Holes never create vdev children, so all the counters * remain zero, which is what we want.) * * Note: this only applies to successful i/o (io_error == 0) * because unlike i/o counts, errors are not additive. * When reading a ditto block, for example, failure of * one top-level vdev does not imply a root-level error. 
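 * The early return below is what makes this work: root-level stats are never updated here, and vdev_get_stats_ex_impl() re-aggregates the top-level children's counters whenever the root's stats are requested.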
*/ if (vd == rvd) return; ASSERT(vd == zio->io_vd); if (flags & ZIO_FLAG_IO_BYPASS) return; mutex_enter(&vd->vdev_stat_lock); if (flags & ZIO_FLAG_IO_REPAIR) { /* * Repair is the result of a resilver issued by the * scan thread (spa_sync). */ if (flags & ZIO_FLAG_SCAN_THREAD) { dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; dsl_scan_phys_t *scn_phys = &scn->scn_phys; uint64_t *processed = &scn_phys->scn_processed; if (vd->vdev_ops->vdev_op_leaf) atomic_add_64(processed, psize); vs->vs_scan_processed += psize; } /* * Repair is the result of a rebuild issued by the * rebuild thread (vdev_rebuild_thread). To avoid * double counting repaired bytes the virtual dRAID * spare vdev is excluded from the processed bytes. */ if (zio->io_priority == ZIO_PRIORITY_REBUILD) { vdev_t *tvd = vd->vdev_top; vdev_rebuild_t *vr = &tvd->vdev_rebuild_config; vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; uint64_t *rebuilt = &vrp->vrp_bytes_rebuilt; if (vd->vdev_ops->vdev_op_leaf && vd->vdev_ops != &vdev_draid_spare_ops) { atomic_add_64(rebuilt, psize); } vs->vs_rebuild_processed += psize; } if (flags & ZIO_FLAG_SELF_HEAL) vs->vs_self_healed += psize; } /* * The bytes/ops/histograms are recorded at the leaf level and * aggregated into the higher level vdevs in vdev_get_stats(). */ if (vd->vdev_ops->vdev_op_leaf && (zio->io_priority < ZIO_PRIORITY_NUM_QUEUEABLE)) { zio_type_t vs_type = type; zio_priority_t priority = zio->io_priority; /* * TRIM ops and bytes are reported to user space as * ZIO_TYPE_IOCTL. This is done to preserve the * vdev_stat_t structure layout for user space. */ if (type == ZIO_TYPE_TRIM) vs_type = ZIO_TYPE_IOCTL; /* * Solely for the purposes of 'zpool iostat -lqrw' * reporting use the priority to catagorize the IO. * Only the following are reported to user space: * * ZIO_PRIORITY_SYNC_READ, * ZIO_PRIORITY_SYNC_WRITE, * ZIO_PRIORITY_ASYNC_READ, * ZIO_PRIORITY_ASYNC_WRITE, * ZIO_PRIORITY_SCRUB, * ZIO_PRIORITY_TRIM. */ if (priority == ZIO_PRIORITY_REBUILD) { priority = ((type == ZIO_TYPE_WRITE) ? ZIO_PRIORITY_ASYNC_WRITE : ZIO_PRIORITY_SCRUB); } else if (priority == ZIO_PRIORITY_INITIALIZING) { ASSERT3U(type, ==, ZIO_TYPE_WRITE); priority = ZIO_PRIORITY_ASYNC_WRITE; } else if (priority == ZIO_PRIORITY_REMOVAL) { priority = ((type == ZIO_TYPE_WRITE) ? ZIO_PRIORITY_ASYNC_WRITE : ZIO_PRIORITY_ASYNC_READ); } vs->vs_ops[vs_type]++; vs->vs_bytes[vs_type] += psize; if (flags & ZIO_FLAG_DELEGATED) { vsx->vsx_agg_histo[priority] [RQ_HISTO(zio->io_size)]++; } else { vsx->vsx_ind_histo[priority] [RQ_HISTO(zio->io_size)]++; } if (zio->io_delta && zio->io_delay) { vsx->vsx_queue_histo[priority] [L_HISTO(zio->io_delta - zio->io_delay)]++; vsx->vsx_disk_histo[type] [L_HISTO(zio->io_delay)]++; vsx->vsx_total_histo[type] [L_HISTO(zio->io_delta)]++; } } mutex_exit(&vd->vdev_stat_lock); return; } if (flags & ZIO_FLAG_SPECULATIVE) return; /* * If this is an I/O error that is going to be retried, then ignore the * error. Otherwise, the user may interpret B_FAILFAST I/O errors as * hard errors, when in reality they can happen for any number of * innocuous reasons (bus resets, MPxIO link failure, etc). */ if (zio->io_error == EIO && !(zio->io_flags & ZIO_FLAG_IO_RETRY)) return; /* * Intent logs writes won't propagate their error to the root * I/O so don't mark these types of failures as pool-level * errors. 
*/ if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) return; if (type == ZIO_TYPE_WRITE && txg != 0 && (!(flags & ZIO_FLAG_IO_REPAIR) || (flags & ZIO_FLAG_SCAN_THREAD) || spa->spa_claiming)) { /* * This is either a normal write (not a repair), or it's * a repair induced by the scrub thread, or it's a repair * made by zil_claim() during spa_load() in the first txg. * In the normal case, we commit the DTL change in the same * txg as the block was born. In the scrub-induced repair * case, we know that scrubs run in first-pass syncing context, * so we commit the DTL change in spa_syncing_txg(spa). * In the zil_claim() case, we commit in spa_first_txg(spa). * * We currently do not make DTL entries for failed spontaneous * self-healing writes triggered by normal (non-scrubbing) * reads, because we have no transactional context in which to * do so -- and it's not clear that it'd be desirable anyway. */ if (vd->vdev_ops->vdev_op_leaf) { uint64_t commit_txg = txg; if (flags & ZIO_FLAG_SCAN_THREAD) { ASSERT(flags & ZIO_FLAG_IO_REPAIR); ASSERT(spa_sync_pass(spa) == 1); vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1); commit_txg = spa_syncing_txg(spa); } else if (spa->spa_claiming) { ASSERT(flags & ZIO_FLAG_IO_REPAIR); commit_txg = spa_first_txg(spa); } ASSERT(commit_txg >= spa_syncing_txg(spa)); if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1)) return; for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1); vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg); } if (vd != rvd) vdev_dtl_dirty(vd, DTL_MISSING, txg, 1); } } int64_t vdev_deflated_space(vdev_t *vd, int64_t space) { ASSERT((space & (SPA_MINBLOCKSIZE-1)) == 0); ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache); return ((space >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio); } /* * Update the in-core space usage stats for this vdev, its metaslab class, * and the root vdev. */ void vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta, int64_t space_delta) { int64_t dspace_delta; spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; ASSERT(vd == vd->vdev_top); /* * Apply the inverse of the psize-to-asize (ie. RAID-Z) space-expansion * factor. We must calculate this here and not at the root vdev * because the root vdev's psize-to-asize is simply the max of its * children's, thus not accurate enough for us. */ dspace_delta = vdev_deflated_space(vd, space_delta); mutex_enter(&vd->vdev_stat_lock); /* ensure we won't underflow */ if (alloc_delta < 0) { ASSERT3U(vd->vdev_stat.vs_alloc, >=, -alloc_delta); } vd->vdev_stat.vs_alloc += alloc_delta; vd->vdev_stat.vs_space += space_delta; vd->vdev_stat.vs_dspace += dspace_delta; mutex_exit(&vd->vdev_stat_lock); /* every class but log contributes to root space stats */ if (vd->vdev_mg != NULL && !vd->vdev_islog) { ASSERT(!vd->vdev_isl2cache); mutex_enter(&rvd->vdev_stat_lock); rvd->vdev_stat.vs_alloc += alloc_delta; rvd->vdev_stat.vs_space += space_delta; rvd->vdev_stat.vs_dspace += dspace_delta; mutex_exit(&rvd->vdev_stat_lock); } /* Note: metaslab_class_space_update moved to metaslab_space_update */ } /* * Mark a top-level vdev's config as dirty, placing it on the dirty list * so that it will be written out next time the vdev configuration is synced. * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs. 
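 * Aux vdevs (hot spares and l2cache devices) never go on the dirty list; instead their config nvlist is regenerated in place and the owning spa_aux_vdev_t is flagged for sync.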
*/ void vdev_config_dirty(vdev_t *vd) { spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; int c; ASSERT(spa_writeable(spa)); /* * If this is an aux vdev (as with l2cache and spare devices), then we * update the vdev config manually and set the sync flag. */ if (vd->vdev_aux != NULL) { spa_aux_vdev_t *sav = vd->vdev_aux; nvlist_t **aux; uint_t naux; for (c = 0; c < sav->sav_count; c++) { if (sav->sav_vdevs[c] == vd) break; } if (c == sav->sav_count) { /* * We're being removed. There's nothing more to do. */ ASSERT(sav->sav_sync == B_TRUE); return; } sav->sav_sync = B_TRUE; if (nvlist_lookup_nvlist_array(sav->sav_config, ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) { VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, ZPOOL_CONFIG_SPARES, &aux, &naux) == 0); } ASSERT(c < naux); /* * Setting the nvlist in the middle if the array is a little * sketchy, but it will work. */ nvlist_free(aux[c]); aux[c] = vdev_config_generate(spa, vd, B_TRUE, 0); return; } /* * The dirty list is protected by the SCL_CONFIG lock. The caller * must either hold SCL_CONFIG as writer, or must be the sync thread * (which holds SCL_CONFIG as reader). There's only one sync thread, * so this is sufficient to ensure mutual exclusion. */ ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) || (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_CONFIG, RW_READER))); if (vd == rvd) { for (c = 0; c < rvd->vdev_children; c++) vdev_config_dirty(rvd->vdev_child[c]); } else { ASSERT(vd == vd->vdev_top); if (!list_link_active(&vd->vdev_config_dirty_node) && vdev_is_concrete(vd)) { list_insert_head(&spa->spa_config_dirty_list, vd); } } } void vdev_config_clean(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) || (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_CONFIG, RW_READER))); ASSERT(list_link_active(&vd->vdev_config_dirty_node)); list_remove(&spa->spa_config_dirty_list, vd); } /* * Mark a top-level vdev's state as dirty, so that the next pass of * spa_sync() can convert this into vdev_config_dirty(). We distinguish * the state changes from larger config changes because they require * much less locking, and are often needed for administrative actions. */ void vdev_state_dirty(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_writeable(spa)); ASSERT(vd == vd->vdev_top); /* * The state list is protected by the SCL_STATE lock. The caller * must either hold SCL_STATE as writer, or must be the sync thread * (which holds SCL_STATE as reader). There's only one sync thread, * so this is sufficient to ensure mutual exclusion. */ ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) || (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_STATE, RW_READER))); if (!list_link_active(&vd->vdev_state_dirty_node) && vdev_is_concrete(vd)) list_insert_head(&spa->spa_state_dirty_list, vd); } void vdev_state_clean(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) || (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_STATE, RW_READER))); ASSERT(list_link_active(&vd->vdev_state_dirty_node)); list_remove(&spa->spa_state_dirty_list, vd); } /* * Propagate vdev state up from children to parent. 
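 * Children that are unreadable (or unwriteable on a writeable pool) are counted as faulted, children at or below DEGRADED as degraded, and the parent's vdev_op_state_change callback chooses the resulting state; the walk then recurses toward the root vdev.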
*/ void vdev_propagate_state(vdev_t *vd) { spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; int degraded = 0, faulted = 0; int corrupted = 0; vdev_t *child; if (vd->vdev_children > 0) { for (int c = 0; c < vd->vdev_children; c++) { child = vd->vdev_child[c]; /* * Don't factor holes or indirect vdevs into the * decision. */ if (!vdev_is_concrete(child)) continue; if (!vdev_readable(child) || (!vdev_writeable(child) && spa_writeable(spa))) { /* * Root special: if there is a top-level log * device, treat the root vdev as if it were * degraded. */ if (child->vdev_islog && vd == rvd) degraded++; else faulted++; } else if (child->vdev_state <= VDEV_STATE_DEGRADED) { degraded++; } if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA) corrupted++; } vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded); /* * Root special: if there is a top-level vdev that cannot be * opened due to corrupted metadata, then propagate the root * vdev's aux state as 'corrupt' rather than 'insufficient * replicas'. */ if (corrupted && vd == rvd && rvd->vdev_state == VDEV_STATE_CANT_OPEN) vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); } if (vd->vdev_parent) vdev_propagate_state(vd->vdev_parent); } /* * Set a vdev's state. If this is during an open, we don't update the parent * state, because we're in the process of opening children depth-first. * Otherwise, we propagate the change to the parent. * * If this routine places a device in a faulted state, an appropriate ereport is * generated. */ void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) { uint64_t save_state; spa_t *spa = vd->vdev_spa; if (state == vd->vdev_state) { /* * Since vdev_offline() code path is already in an offline * state we can miss a statechange event to OFFLINE. Check * the previous state to catch this condition. */ if (vd->vdev_ops->vdev_op_leaf && (state == VDEV_STATE_OFFLINE) && (vd->vdev_prevstate >= VDEV_STATE_FAULTED)) { /* post an offline state change */ zfs_post_state_change(spa, vd, vd->vdev_prevstate); } vd->vdev_stat.vs_aux = aux; return; } save_state = vd->vdev_state; vd->vdev_state = state; vd->vdev_stat.vs_aux = aux; /* * If we are setting the vdev state to anything but an open state, then * always close the underlying device unless the device has requested * a delayed close (i.e. we're about to remove or fault the device). * Otherwise, we keep accessible but invalid devices open forever. * We don't call vdev_close() itself, because that implies some extra * checks (offline, etc) that we don't want here. This is limited to * leaf devices, because otherwise closing the device will affect other * children. */ if (!vd->vdev_delayed_close && vdev_is_dead(vd) && vd->vdev_ops->vdev_op_leaf) vd->vdev_ops->vdev_op_close(vd); if (vd->vdev_removed && state == VDEV_STATE_CANT_OPEN && (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) { /* * If the previous state is set to VDEV_STATE_REMOVED, then this * device was previously marked removed and someone attempted to * reopen it. If this failed due to a nonexistent device, then * keep the device in the REMOVED state. We also let this be if * it is one of our special test online cases, which is only * attempting to online the device and shouldn't generate an FMA * fault. 
*/ vd->vdev_state = VDEV_STATE_REMOVED; vd->vdev_stat.vs_aux = VDEV_AUX_NONE; } else if (state == VDEV_STATE_REMOVED) { vd->vdev_removed = B_TRUE; } else if (state == VDEV_STATE_CANT_OPEN) { /* * If we fail to open a vdev during an import or recovery, we * mark it as "not available", which signifies that it was * never there to begin with. Failure to open such a device * is not considered an error. */ if ((spa_load_state(spa) == SPA_LOAD_IMPORT || spa_load_state(spa) == SPA_LOAD_RECOVER) && vd->vdev_ops->vdev_op_leaf) vd->vdev_not_present = 1; /* * Post the appropriate ereport. If the 'prevstate' field is * set to something other than VDEV_STATE_UNKNOWN, it indicates * that this is part of a vdev_reopen(). In this case, we don't * want to post the ereport if the device was already in the * CANT_OPEN state beforehand. * * If the 'checkremove' flag is set, then this is an attempt to * online the device in response to an insertion event. If we * hit this case, then we have detected an insertion event for a * faulted or offline device that wasn't in the removed state. * In this scenario, we don't post an ereport because we are * about to replace the device, or attempt an online with * vdev_forcefault, which will generate the fault for us. */ if ((vd->vdev_prevstate != state || vd->vdev_forcefault) && !vd->vdev_not_present && !vd->vdev_checkremove && vd != spa->spa_root_vdev) { const char *class; switch (aux) { case VDEV_AUX_OPEN_FAILED: class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED; break; case VDEV_AUX_CORRUPT_DATA: class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA; break; case VDEV_AUX_NO_REPLICAS: class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS; break; case VDEV_AUX_BAD_GUID_SUM: class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM; break; case VDEV_AUX_TOO_SMALL: class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL; break; case VDEV_AUX_BAD_LABEL: class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL; break; case VDEV_AUX_BAD_ASHIFT: class = FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT; break; default: class = FM_EREPORT_ZFS_DEVICE_UNKNOWN; } (void) zfs_ereport_post(class, spa, vd, NULL, NULL, save_state); } /* Erase any notion of persistent removed state */ vd->vdev_removed = B_FALSE; } else { vd->vdev_removed = B_FALSE; } /* * Notify ZED of any significant state-change on a leaf vdev. * */ if (vd->vdev_ops->vdev_op_leaf) { /* preserve original state from a vdev_reopen() */ if ((vd->vdev_prevstate != VDEV_STATE_UNKNOWN) && (vd->vdev_prevstate != vd->vdev_state) && (save_state <= VDEV_STATE_CLOSED)) save_state = vd->vdev_prevstate; /* filter out state change due to initial vdev_open */ if (save_state > VDEV_STATE_CLOSED) zfs_post_state_change(spa, vd, save_state); } if (!isopen && vd->vdev_parent) vdev_propagate_state(vd->vdev_parent); } boolean_t vdev_children_are_offline(vdev_t *vd) { ASSERT(!vd->vdev_ops->vdev_op_leaf); for (uint64_t i = 0; i < vd->vdev_children; i++) { if (vd->vdev_child[i]->vdev_state != VDEV_STATE_OFFLINE) return (B_FALSE); } return (B_TRUE); } /* * Check the vdev configuration to ensure that it's capable of supporting * a root pool. We do not support partial configuration. 
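 * In practice this means rejecting any configuration that contains a missing or indirect (previously removed) vdev anywhere in the tree.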
*/ boolean_t vdev_is_bootable(vdev_t *vd) { if (!vd->vdev_ops->vdev_op_leaf) { const char *vdev_type = vd->vdev_ops->vdev_op_type; if (strcmp(vdev_type, VDEV_TYPE_MISSING) == 0 || strcmp(vdev_type, VDEV_TYPE_INDIRECT) == 0) { return (B_FALSE); } } for (int c = 0; c < vd->vdev_children; c++) { if (!vdev_is_bootable(vd->vdev_child[c])) return (B_FALSE); } return (B_TRUE); } boolean_t vdev_is_concrete(vdev_t *vd) { vdev_ops_t *ops = vd->vdev_ops; if (ops == &vdev_indirect_ops || ops == &vdev_hole_ops || ops == &vdev_missing_ops || ops == &vdev_root_ops) { return (B_FALSE); } else { return (B_TRUE); } } /* * Determine if a log device has valid content. If the vdev was * removed or faulted in the MOS config then we know that * the content on the log device has already been written to the pool. */ boolean_t vdev_log_state_valid(vdev_t *vd) { if (vd->vdev_ops->vdev_op_leaf && !vd->vdev_faulted && !vd->vdev_removed) return (B_TRUE); for (int c = 0; c < vd->vdev_children; c++) if (vdev_log_state_valid(vd->vdev_child[c])) return (B_TRUE); return (B_FALSE); } /* * Expand a vdev if possible. */ void vdev_expand(vdev_t *vd, uint64_t txg) { ASSERT(vd->vdev_top == vd); ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); ASSERT(vdev_is_concrete(vd)); vdev_set_deflate_ratio(vd); if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count && vdev_is_concrete(vd)) { vdev_metaslab_group_create(vd); VERIFY(vdev_metaslab_init(vd, txg) == 0); vdev_config_dirty(vd); } } /* * Split a vdev. */ void vdev_split(vdev_t *vd) { vdev_t *cvd, *pvd = vd->vdev_parent; vdev_remove_child(pvd, vd); vdev_compact_children(pvd); cvd = pvd->vdev_child[0]; if (pvd->vdev_children == 1) { vdev_remove_parent(cvd); cvd->vdev_splitting = B_TRUE; } vdev_propagate_state(cvd); } void vdev_deadman(vdev_t *vd, char *tag) { for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; vdev_deadman(cvd, tag); } if (vd->vdev_ops->vdev_op_leaf) { vdev_queue_t *vq = &vd->vdev_queue; mutex_enter(&vq->vq_lock); if (avl_numnodes(&vq->vq_active_tree) > 0) { spa_t *spa = vd->vdev_spa; zio_t *fio; uint64_t delta; zfs_dbgmsg("slow vdev: %s has %d active IOs", vd->vdev_path, avl_numnodes(&vq->vq_active_tree)); /* * Look at the head of all the pending queues, * if any I/O has been outstanding for longer than * the spa_deadman_synctime invoke the deadman logic. */ fio = avl_first(&vq->vq_active_tree); delta = gethrtime() - fio->io_timestamp; if (delta > spa_deadman_synctime(spa)) zio_deadman(fio, tag); } mutex_exit(&vq->vq_lock); } } void vdev_defer_resilver(vdev_t *vd) { ASSERT(vd->vdev_ops->vdev_op_leaf); vd->vdev_resilver_deferred = B_TRUE; vd->vdev_spa->spa_resilver_deferred = B_TRUE; } /* * Clears the resilver deferred flag on all leaf devs under vd. Returns * B_TRUE if we have devices that need to be resilvered and are available to * accept resilver I/Os. 
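 * When called on the root vdev with SPA_FEATURE_RESILVER_DEFER active, it also decrements that feature's refcount and clears spa_resilver_deferred.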
*/ boolean_t vdev_clear_resilver_deferred(vdev_t *vd, dmu_tx_t *tx) { boolean_t resilver_needed = B_FALSE; spa_t *spa = vd->vdev_spa; for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; resilver_needed |= vdev_clear_resilver_deferred(cvd, tx); } if (vd == spa->spa_root_vdev && spa_feature_is_active(spa, SPA_FEATURE_RESILVER_DEFER)) { spa_feature_decr(spa, SPA_FEATURE_RESILVER_DEFER, tx); vdev_config_dirty(vd); spa->spa_resilver_deferred = B_FALSE; return (resilver_needed); } if (!vdev_is_concrete(vd) || vd->vdev_aux || !vd->vdev_ops->vdev_op_leaf) return (resilver_needed); vd->vdev_resilver_deferred = B_FALSE; return (!vdev_is_dead(vd) && !vd->vdev_offline && vdev_resilver_needed(vd, NULL, NULL)); } boolean_t vdev_xlate_is_empty(range_seg64_t *rs) { return (rs->rs_start == rs->rs_end); } /* * Translate a logical range to the first contiguous physical range for the * specified vdev_t. This function is initially called with a leaf vdev and * will walk each parent vdev until it reaches a top-level vdev. Once the * top-level is reached the physical range is initialized and the recursive * function begins to unwind. As it unwinds it calls the parent's vdev * specific translation function to do the real conversion. */ void vdev_xlate(vdev_t *vd, const range_seg64_t *logical_rs, range_seg64_t *physical_rs, range_seg64_t *remain_rs) { /* * Walk up the vdev tree */ if (vd != vd->vdev_top) { vdev_xlate(vd->vdev_parent, logical_rs, physical_rs, remain_rs); } else { /* * We've reached the top-level vdev, initialize the physical * range to the logical range and set an empty remaining * range then start to unwind. */ physical_rs->rs_start = logical_rs->rs_start; physical_rs->rs_end = logical_rs->rs_end; remain_rs->rs_start = logical_rs->rs_start; remain_rs->rs_end = logical_rs->rs_start; return; } vdev_t *pvd = vd->vdev_parent; ASSERT3P(pvd, !=, NULL); ASSERT3P(pvd->vdev_ops->vdev_op_xlate, !=, NULL); /* * As this recursive function unwinds, translate the logical * range into its physical and any remaining components by calling * the vdev specific translate function. */ range_seg64_t intermediate = { 0 }; pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate, remain_rs); physical_rs->rs_start = intermediate.rs_start; physical_rs->rs_end = intermediate.rs_end; } void vdev_xlate_walk(vdev_t *vd, const range_seg64_t *logical_rs, vdev_xlate_func_t *func, void *arg) { range_seg64_t iter_rs = *logical_rs; range_seg64_t physical_rs; range_seg64_t remain_rs; while (!vdev_xlate_is_empty(&iter_rs)) { vdev_xlate(vd, &iter_rs, &physical_rs, &remain_rs); /* * With raidz and dRAID, it's possible that the logical range * does not live on this leaf vdev. Only when there is a non- * zero physical size call the provided function. */ if (!vdev_xlate_is_empty(&physical_rs)) func(arg, &physical_rs); iter_rs = remain_rs; } } /* * Look at the vdev tree and determine whether any devices are currently being * replaced. */ boolean_t vdev_replace_in_progress(vdev_t *vdev) { ASSERT(spa_config_held(vdev->vdev_spa, SCL_ALL, RW_READER) != 0); if (vdev->vdev_ops == &vdev_replacing_ops) return (B_TRUE); /* * A 'spare' vdev indicates that we have a replace in progress, unless * it has exactly two children, and the second, the hot spare, has * finished being resilvered. 
*/ if (vdev->vdev_ops == &vdev_spare_ops && (vdev->vdev_children > 2 || !vdev_dtl_empty(vdev->vdev_child[1], DTL_MISSING))) return (B_TRUE); for (int i = 0; i < vdev->vdev_children; i++) { if (vdev_replace_in_progress(vdev->vdev_child[i])) return (B_TRUE); } return (B_FALSE); } EXPORT_SYMBOL(vdev_fault); EXPORT_SYMBOL(vdev_degrade); EXPORT_SYMBOL(vdev_online); EXPORT_SYMBOL(vdev_offline); EXPORT_SYMBOL(vdev_clear); /* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, default_ms_count, INT, ZMOD_RW, "Target number of metaslabs per top-level vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, default_ms_shift, INT, ZMOD_RW, "Default limit for metaslab size"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, min_ms_count, INT, ZMOD_RW, "Minimum number of metaslabs per top-level vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, ms_count_limit, INT, ZMOD_RW, "Practical upper limit of total metaslabs per top-level vdev"); ZFS_MODULE_PARAM(zfs, zfs_, slow_io_events_per_second, UINT, ZMOD_RW, "Rate limit slow IO (delay) events to this many per second"); ZFS_MODULE_PARAM(zfs, zfs_, checksum_events_per_second, UINT, ZMOD_RW, "Rate limit checksum events to this many checksum errors per second " "(do not set below zed threshold)."); ZFS_MODULE_PARAM(zfs, zfs_, scan_ignore_errors, INT, ZMOD_RW, "Ignore errors during resilver/scrub"); ZFS_MODULE_PARAM(zfs_vdev, vdev_, validate_skip, INT, ZMOD_RW, "Bypass vdev_validate()"); ZFS_MODULE_PARAM(zfs, zfs_, nocacheflush, INT, ZMOD_RW, "Disable cache flushes"); ZFS_MODULE_PARAM(zfs, zfs_, embedded_slog_min_ms, INT, ZMOD_RW, "Minimum number of metaslabs required to dedicate one for log blocks"); ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, min_auto_ashift, param_set_min_auto_ashift, param_get_ulong, ZMOD_RW, "Minimum ashift used when creating new top-level vdevs"); ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, max_auto_ashift, param_set_max_auto_ashift, param_get_ulong, ZMOD_RW, "Maximum ashift used when optimizing for logical -> physical sector " "size on new top-level vdevs"); /* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/vdev_indirect.c b/sys/contrib/openzfs/module/zfs/vdev_indirect.c index b26d0993711a..416f4c54d8e8 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_indirect.c +++ b/sys/contrib/openzfs/module/zfs/vdev_indirect.c @@ -1,1911 +1,1909 @@ /* * CDDL HEADER START * * This file and its contents are supplied under the terms of the * Common Development and Distribution License ("CDDL"), version 1.0. * You may only use this file in accordance with the terms of version * 1.0 of the CDDL. * * A full copy of the text of the CDDL should have accompanied this * source. A copy of the CDDL is also available via the Internet at * http://www.illumos.org/license/CDDL. * * CDDL HEADER END */ /* * Copyright (c) 2014, 2017 by Delphix. All rights reserved. * Copyright (c) 2019, loli10K . All rights reserved. * Copyright (c) 2014, 2020 by Delphix. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * An indirect vdev corresponds to a vdev that has been removed. Since * we cannot rewrite block pointers of snapshots, etc., we keep a * mapping from old location on the removed device to the new location * on another device in the pool and use this mapping whenever we need * to access the DVA. 
Unfortunately, this mapping did not respect * logical block boundaries when it was first created, and so a DVA on * this indirect vdev may be "split" into multiple sections that each * map to a different location. As a consequence, not all DVAs can be * translated to an equivalent new DVA. Instead we must provide a * "vdev_remap" operation that executes a callback on each contiguous * segment of the new location. This function is used in multiple ways: * * - i/os to this vdev use the callback to determine where the * data is now located, and issue child i/os for each segment's new * location. * * - frees and claims to this vdev use the callback to free or claim * each mapped segment. (Note that we don't actually need to claim * log blocks on indirect vdevs, because we don't allocate to * removing vdevs. However, zdb uses zio_claim() for its leak * detection.) */ /* * "Big theory statement" for how we mark blocks obsolete. * * When a block on an indirect vdev is freed or remapped, a section of * that vdev's mapping may no longer be referenced (aka "obsolete"). We * keep track of how much of each mapping entry is obsolete. When * an entry becomes completely obsolete, we can remove it, thus reducing * the memory used by the mapping. The complete picture of obsolescence * is given by the following data structures, described below: * - the entry-specific obsolete count * - the vdev-specific obsolete spacemap * - the pool-specific obsolete bpobj * * == On disk data structures used == * * We track the obsolete space for the pool using several objects. Each * of these objects is created on demand and freed when no longer * needed, and is assumed to be empty if it does not exist. * SPA_FEATURE_OBSOLETE_COUNTS includes the count of these objects. * * - Each vic_mapping_object (associated with an indirect vdev) can * have a vimp_counts_object. This is an array of uint32_t's * with the same number of entries as the vic_mapping_object. When * the mapping is condensed, entries from the vic_obsolete_sm_object * (see below) are folded into the counts. Therefore, each * obsolete_counts entry tells us the number of bytes in the * corresponding mapping entry that were not referenced when the * mapping was last condensed. * * - Each indirect or removing vdev can have a vic_obsolete_sm_object. * This is a space map containing an alloc entry for every DVA that * has been obsoleted since the last time this indirect vdev was * condensed. We use this object in order to improve performance * when marking a DVA as obsolete. Instead of modifying an arbitrary * offset of the vimp_counts_object, we only need to append an entry * to the end of this object. When a DVA becomes obsolete, it is * added to the obsolete space map. This happens when the DVA is * freed, remapped and not referenced by a snapshot, or the last * snapshot referencing it is destroyed. * * - Each dataset can have a ds_remap_deadlist object. This is a * deadlist object containing all blocks that were remapped in this * dataset but referenced in a previous snapshot. Blocks can *only* * appear on this list if they were remapped (dsl_dataset_block_remapped); * blocks that were killed in a head dataset are put on the normal * ds_deadlist and marked obsolete when they are freed. * * - The pool can have a dp_obsolete_bpobj. This is a list of blocks * in the pool that need to be marked obsolete. When a snapshot is * destroyed, we move some of the ds_remap_deadlist to the obsolete * bpobj (see dsl_destroy_snapshot_handle_remaps()). 
We then * asynchronously process the obsolete bpobj, moving its entries to * the specific vdevs' obsolete space maps. * * == Summary of how we mark blocks as obsolete == * * - When freeing a block: if any DVA is on an indirect vdev, append to * vic_obsolete_sm_object. * - When remapping a block, add dva to ds_remap_deadlist (if prev snap * references; otherwise append to vic_obsolete_sm_object). * - When freeing a snapshot: move parts of ds_remap_deadlist to * dp_obsolete_bpobj (same algorithm as ds_deadlist). * - When syncing the spa: process dp_obsolete_bpobj, moving ranges to * individual vdev's vic_obsolete_sm_object. */ /* * "Big theory statement" for how we condense indirect vdevs. * * Condensing an indirect vdev's mapping is the process of determining * the precise counts of obsolete space for each mapping entry (by * integrating the obsolete spacemap into the obsolete counts) and * writing out a new mapping that contains only referenced entries. * * We condense a vdev when we expect the mapping to shrink (see * vdev_indirect_should_condense()), but only perform one condense at a * time to limit the memory usage. In addition, we use a separate * open-context thread (spa_condense_indirect_thread) to incrementally * create the new mapping object in a way that minimizes the impact on * the rest of the system. * * == Generating a new mapping == * * To generate a new mapping, we follow these steps: * * 1. Save the old obsolete space map and create a new mapping object * (see spa_condense_indirect_start_sync()). This initializes the * spa_condensing_indirect_phys with the "previous obsolete space map", * which is now read only. Newly obsolete DVAs will be added to a * new (initially empty) obsolete space map, and will not be * considered as part of this condense operation. * * 2. Construct in memory the precise counts of obsolete space for each * mapping entry, by incorporating the obsolete space map into the * counts. (See vdev_indirect_mapping_load_obsolete_{counts,spacemap}().) * * 3. Iterate through each mapping entry, writing to the new mapping any * entries that are not completely obsolete (i.e. which don't have * obsolete count == mapping length). (See * spa_condense_indirect_generate_new_mapping().) * * 4. Destroy the old mapping object and switch over to the new one * (spa_condense_indirect_complete_sync). * * == Restarting from failure == * * To restart the condense when we import/open the pool, we must start * at the 2nd step above: reconstruct the precise counts in memory, * based on the space map + counts. Then in the 3rd step, we start * iterating where we left off: at vimp_max_offset of the new mapping * object. */ int zfs_condense_indirect_vdevs_enable = B_TRUE; /* * Condense if at least this percent of the bytes in the mapping is * obsolete. With the default of 25%, the amount of space mapped * will be reduced to 1% of its original size after at most 16 * condenses. Higher values will condense less often (causing less * i/o); lower values will reduce the mapping size more quickly. */ int zfs_indirect_condense_obsolete_pct = 25; /* * Condense if the obsolete space map takes up more than this amount of * space on disk (logically). This limits the amount of disk space * consumed by the obsolete space map; the default of 1GB is small enough * that we typically don't mind "wasting" it. */ unsigned long zfs_condense_max_obsolete_bytes = 1024 * 1024 * 1024; /* * Don't bother condensing if the mapping uses less than this amount of * memory. 
The default of 128KB is considered a "trivial" amount of * memory and not worth reducing. */ unsigned long zfs_condense_min_mapping_bytes = 128 * 1024; /* * This is used by the test suite so that it can ensure that certain * actions happen while in the middle of a condense (which might otherwise * complete too quickly). If used to reduce the performance impact of * condensing in production, a maximum value of 1 should be sufficient. */ int zfs_condense_indirect_commit_entry_delay_ms = 0; /* * If an indirect split block contains more than this many possible unique * combinations when being reconstructed, consider it too computationally * expensive to check them all. Instead, try at most 100 randomly-selected * combinations each time the block is accessed. This allows all segment * copies to participate fairly in the reconstruction when all combinations * cannot be checked and prevents repeated use of one bad copy. */ int zfs_reconstruct_indirect_combinations_max = 4096; /* * Enable to simulate damaged segments and validate reconstruction. This * is intentionally not exposed as a module parameter. */ unsigned long zfs_reconstruct_indirect_damage_fraction = 0; /* * The indirect_child_t represents the vdev that we will read from, when we * need to read all copies of the data (e.g. for scrub or reconstruction). * For plain (non-mirror) top-level vdevs (i.e. is_vdev is not a mirror), * ic_vdev is the same as is_vdev. However, for mirror top-level vdevs, * ic_vdev is a child of the mirror. */ typedef struct indirect_child { abd_t *ic_data; vdev_t *ic_vdev; /* * ic_duplicate is NULL when the ic_data contents are unique, when it * is determined to be a duplicate it references the primary child. */ struct indirect_child *ic_duplicate; list_node_t ic_node; /* node on is_unique_child */ int ic_error; /* set when a child does not contain the data */ } indirect_child_t; /* * The indirect_split_t represents one mapped segment of an i/o to the * indirect vdev. For non-split (contiguously-mapped) blocks, there will be * only one indirect_split_t, with is_split_offset==0 and is_size==io_size. * For split blocks, there will be several of these. */ typedef struct indirect_split { list_node_t is_node; /* link on iv_splits */ /* * is_split_offset is the offset into the i/o. * This is the sum of the previous splits' is_size's. */ uint64_t is_split_offset; vdev_t *is_vdev; /* top-level vdev */ uint64_t is_target_offset; /* offset on is_vdev */ uint64_t is_size; int is_children; /* number of entries in is_child[] */ int is_unique_children; /* number of entries in is_unique_child */ list_t is_unique_child; /* * is_good_child is the child that we are currently using to * attempt reconstruction. */ indirect_child_t *is_good_child; indirect_child_t is_child[1]; /* variable-length */ } indirect_split_t; /* * The indirect_vsd_t is associated with each i/o to the indirect vdev. * It is the "Vdev-Specific Data" in the zio_t's io_vsd. 
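 * It is released via vdev_indirect_vsd_ops's vsd_free callback, vdev_indirect_map_free(), which also frees each split's child ABDs and unique-child list.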
*/ typedef struct indirect_vsd { boolean_t iv_split_block; boolean_t iv_reconstruct; uint64_t iv_unique_combinations; uint64_t iv_attempts; uint64_t iv_attempts_max; list_t iv_splits; /* list of indirect_split_t's */ } indirect_vsd_t; static void vdev_indirect_map_free(zio_t *zio) { indirect_vsd_t *iv = zio->io_vsd; indirect_split_t *is; while ((is = list_head(&iv->iv_splits)) != NULL) { for (int c = 0; c < is->is_children; c++) { indirect_child_t *ic = &is->is_child[c]; if (ic->ic_data != NULL) abd_free(ic->ic_data); } list_remove(&iv->iv_splits, is); indirect_child_t *ic; while ((ic = list_head(&is->is_unique_child)) != NULL) list_remove(&is->is_unique_child, ic); list_destroy(&is->is_unique_child); kmem_free(is, offsetof(indirect_split_t, is_child[is->is_children])); } kmem_free(iv, sizeof (*iv)); } static const zio_vsd_ops_t vdev_indirect_vsd_ops = { .vsd_free = vdev_indirect_map_free, .vsd_cksum_report = zio_vsd_default_cksum_report }; /* * Mark the given offset and size as being obsolete. */ void vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset, uint64_t size) { spa_t *spa = vd->vdev_spa; ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, !=, 0); ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops); ASSERT(size > 0); VERIFY(vdev_indirect_mapping_entry_for_offset( vd->vdev_indirect_mapping, offset) != NULL); if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) { mutex_enter(&vd->vdev_obsolete_lock); range_tree_add(vd->vdev_obsolete_segments, offset, size); mutex_exit(&vd->vdev_obsolete_lock); vdev_dirty(vd, 0, NULL, spa_syncing_txg(spa)); } } /* * Mark the DVA vdev_id:offset:size as being obsolete in the given tx. This * wrapper is provided because the DMU does not know about vdev_t's and * cannot directly call vdev_indirect_mark_obsolete. */ void spa_vdev_indirect_mark_obsolete(spa_t *spa, uint64_t vdev_id, uint64_t offset, uint64_t size, dmu_tx_t *tx) { vdev_t *vd = vdev_lookup_top(spa, vdev_id); ASSERT(dmu_tx_is_syncing(tx)); /* The DMU can only remap indirect vdevs. */ ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); vdev_indirect_mark_obsolete(vd, offset, size); } static spa_condensing_indirect_t * spa_condensing_indirect_create(spa_t *spa) { spa_condensing_indirect_phys_t *scip = &spa->spa_condensing_indirect_phys; spa_condensing_indirect_t *sci = kmem_zalloc(sizeof (*sci), KM_SLEEP); objset_t *mos = spa->spa_meta_objset; for (int i = 0; i < TXG_SIZE; i++) { list_create(&sci->sci_new_mapping_entries[i], sizeof (vdev_indirect_mapping_entry_t), offsetof(vdev_indirect_mapping_entry_t, vime_node)); } sci->sci_new_mapping = vdev_indirect_mapping_open(mos, scip->scip_next_mapping_object); return (sci); } static void spa_condensing_indirect_destroy(spa_condensing_indirect_t *sci) { for (int i = 0; i < TXG_SIZE; i++) list_destroy(&sci->sci_new_mapping_entries[i]); if (sci->sci_new_mapping != NULL) vdev_indirect_mapping_close(sci->sci_new_mapping); kmem_free(sci, sizeof (*sci)); } boolean_t vdev_indirect_should_condense(vdev_t *vd) { vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; spa_t *spa = vd->vdev_spa; ASSERT(dsl_pool_sync_context(spa->spa_dsl_pool)); if (!zfs_condense_indirect_vdevs_enable) return (B_FALSE); /* * We can only condense one indirect vdev at a time. */ if (spa->spa_condensing_indirect != NULL) return (B_FALSE); if (spa_shutting_down(spa)) return (B_FALSE); /* * The mapping object size must not change while we are * condensing, so we can only condense indirect vdevs * (not vdevs that are still in the middle of being removed). 
*/ if (vd->vdev_ops != &vdev_indirect_ops) return (B_FALSE); /* * If nothing new has been marked obsolete, there is no * point in condensing. */ uint64_t obsolete_sm_obj __maybe_unused; ASSERT0(vdev_obsolete_sm_object(vd, &obsolete_sm_obj)); if (vd->vdev_obsolete_sm == NULL) { ASSERT0(obsolete_sm_obj); return (B_FALSE); } ASSERT(vd->vdev_obsolete_sm != NULL); ASSERT3U(obsolete_sm_obj, ==, space_map_object(vd->vdev_obsolete_sm)); uint64_t bytes_mapped = vdev_indirect_mapping_bytes_mapped(vim); uint64_t bytes_obsolete = space_map_allocated(vd->vdev_obsolete_sm); uint64_t mapping_size = vdev_indirect_mapping_size(vim); uint64_t obsolete_sm_size = space_map_length(vd->vdev_obsolete_sm); ASSERT3U(bytes_obsolete, <=, bytes_mapped); /* * If a high percentage of the bytes that are mapped have become * obsolete, condense (unless the mapping is already small enough). * This has a good chance of reducing the amount of memory used * by the mapping. */ if (bytes_obsolete * 100 / bytes_mapped >= zfs_indirect_condense_obsolete_pct && mapping_size > zfs_condense_min_mapping_bytes) { zfs_dbgmsg("should condense vdev %llu because obsolete " "spacemap covers %d%% of %lluMB mapping", (u_longlong_t)vd->vdev_id, (int)(bytes_obsolete * 100 / bytes_mapped), (u_longlong_t)bytes_mapped / 1024 / 1024); return (B_TRUE); } /* * If the obsolete space map takes up too much space on disk, * condense in order to free up this disk space. */ if (obsolete_sm_size >= zfs_condense_max_obsolete_bytes) { zfs_dbgmsg("should condense vdev %llu because obsolete sm " "length %lluMB >= max size %lluMB", (u_longlong_t)vd->vdev_id, (u_longlong_t)obsolete_sm_size / 1024 / 1024, (u_longlong_t)zfs_condense_max_obsolete_bytes / 1024 / 1024); return (B_TRUE); } return (B_FALSE); } /* * This sync task completes (finishes) a condense, deleting the old * mapping and replacing it with the new one. */ static void spa_condense_indirect_complete_sync(void *arg, dmu_tx_t *tx) { spa_condensing_indirect_t *sci = arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; spa_condensing_indirect_phys_t *scip = &spa->spa_condensing_indirect_phys; vdev_t *vd = vdev_lookup_top(spa, scip->scip_vdev); vdev_indirect_config_t *vic = &vd->vdev_indirect_config; objset_t *mos = spa->spa_meta_objset; vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping; uint64_t old_count = vdev_indirect_mapping_num_entries(old_mapping); uint64_t new_count = vdev_indirect_mapping_num_entries(sci->sci_new_mapping); ASSERT(dmu_tx_is_syncing(tx)); ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); ASSERT3P(sci, ==, spa->spa_condensing_indirect); for (int i = 0; i < TXG_SIZE; i++) { ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i])); } ASSERT(vic->vic_mapping_object != 0); ASSERT3U(vd->vdev_id, ==, scip->scip_vdev); ASSERT(scip->scip_next_mapping_object != 0); ASSERT(scip->scip_prev_obsolete_sm_object != 0); /* * Reset vdev_indirect_mapping to refer to the new object. 
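 * The swap is done with vdev_indirect_rwlock held as writer so that concurrent readers of the mapping never see it half-updated while the old object is being freed.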
*/ rw_enter(&vd->vdev_indirect_rwlock, RW_WRITER); vdev_indirect_mapping_close(vd->vdev_indirect_mapping); vd->vdev_indirect_mapping = sci->sci_new_mapping; rw_exit(&vd->vdev_indirect_rwlock); sci->sci_new_mapping = NULL; vdev_indirect_mapping_free(mos, vic->vic_mapping_object, tx); vic->vic_mapping_object = scip->scip_next_mapping_object; scip->scip_next_mapping_object = 0; space_map_free_obj(mos, scip->scip_prev_obsolete_sm_object, tx); spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); scip->scip_prev_obsolete_sm_object = 0; scip->scip_vdev = 0; VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONDENSING_INDIRECT, tx)); spa_condensing_indirect_destroy(spa->spa_condensing_indirect); spa->spa_condensing_indirect = NULL; zfs_dbgmsg("finished condense of vdev %llu in txg %llu: " "new mapping object %llu has %llu entries " "(was %llu entries)", vd->vdev_id, dmu_tx_get_txg(tx), vic->vic_mapping_object, new_count, old_count); vdev_config_dirty(spa->spa_root_vdev); } /* * This sync task appends entries to the new mapping object. */ static void spa_condense_indirect_commit_sync(void *arg, dmu_tx_t *tx) { spa_condensing_indirect_t *sci = arg; uint64_t txg = dmu_tx_get_txg(tx); spa_t *spa __maybe_unused = dmu_tx_pool(tx)->dp_spa; ASSERT(dmu_tx_is_syncing(tx)); ASSERT3P(sci, ==, spa->spa_condensing_indirect); vdev_indirect_mapping_add_entries(sci->sci_new_mapping, &sci->sci_new_mapping_entries[txg & TXG_MASK], tx); ASSERT(list_is_empty(&sci->sci_new_mapping_entries[txg & TXG_MASK])); } /* * Open-context function to add one entry to the new mapping. The new * entry will be remembered and written from syncing context. */ static void spa_condense_indirect_commit_entry(spa_t *spa, vdev_indirect_mapping_entry_phys_t *vimep, uint32_t count) { spa_condensing_indirect_t *sci = spa->spa_condensing_indirect; ASSERT3U(count, <, DVA_GET_ASIZE(&vimep->vimep_dst)); dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); dmu_tx_hold_space(tx, sizeof (*vimep) + sizeof (count)); VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; /* * If we are the first entry committed this txg, kick off the sync * task to write to the MOS on our behalf. 
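 * Entries committed later in the same txg only append to the per-txg list; the already-registered sync task will write them all out together.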
*/ if (list_is_empty(&sci->sci_new_mapping_entries[txgoff])) { dsl_sync_task_nowait(dmu_tx_pool(tx), spa_condense_indirect_commit_sync, sci, tx); } vdev_indirect_mapping_entry_t *vime = kmem_alloc(sizeof (*vime), KM_SLEEP); vime->vime_mapping = *vimep; vime->vime_obsolete_count = count; list_insert_tail(&sci->sci_new_mapping_entries[txgoff], vime); dmu_tx_commit(tx); } static void spa_condense_indirect_generate_new_mapping(vdev_t *vd, uint32_t *obsolete_counts, uint64_t start_index, zthr_t *zthr) { spa_t *spa = vd->vdev_spa; uint64_t mapi = start_index; vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping; uint64_t old_num_entries = vdev_indirect_mapping_num_entries(old_mapping); ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); ASSERT3U(vd->vdev_id, ==, spa->spa_condensing_indirect_phys.scip_vdev); zfs_dbgmsg("starting condense of vdev %llu from index %llu", (u_longlong_t)vd->vdev_id, (u_longlong_t)mapi); while (mapi < old_num_entries) { if (zthr_iscancelled(zthr)) { zfs_dbgmsg("pausing condense of vdev %llu " "at index %llu", (u_longlong_t)vd->vdev_id, (u_longlong_t)mapi); break; } vdev_indirect_mapping_entry_phys_t *entry = &old_mapping->vim_entries[mapi]; uint64_t entry_size = DVA_GET_ASIZE(&entry->vimep_dst); ASSERT3U(obsolete_counts[mapi], <=, entry_size); if (obsolete_counts[mapi] < entry_size) { spa_condense_indirect_commit_entry(spa, entry, obsolete_counts[mapi]); /* * This delay may be requested for testing, debugging, * or performance reasons. */ hrtime_t now = gethrtime(); hrtime_t sleep_until = now + MSEC2NSEC( zfs_condense_indirect_commit_entry_delay_ms); zfs_sleep_until(sleep_until); } mapi++; } } /* ARGSUSED */ static boolean_t spa_condense_indirect_thread_check(void *arg, zthr_t *zthr) { spa_t *spa = arg; return (spa->spa_condensing_indirect != NULL); } /* ARGSUSED */ static void spa_condense_indirect_thread(void *arg, zthr_t *zthr) { spa_t *spa = arg; vdev_t *vd; ASSERT3P(spa->spa_condensing_indirect, !=, NULL); spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); vd = vdev_lookup_top(spa, spa->spa_condensing_indirect_phys.scip_vdev); ASSERT3P(vd, !=, NULL); spa_config_exit(spa, SCL_VDEV, FTAG); spa_condensing_indirect_t *sci = spa->spa_condensing_indirect; spa_condensing_indirect_phys_t *scip = &spa->spa_condensing_indirect_phys; uint32_t *counts; uint64_t start_index; vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping; space_map_t *prev_obsolete_sm = NULL; ASSERT3U(vd->vdev_id, ==, scip->scip_vdev); ASSERT(scip->scip_next_mapping_object != 0); ASSERT(scip->scip_prev_obsolete_sm_object != 0); ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); for (int i = 0; i < TXG_SIZE; i++) { /* * The list must start out empty in order for the * _commit_sync() sync task to be properly registered * on the first call to _commit_entry(); so it's wise * to double check and ensure we actually are starting * with empty lists. */ ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i])); } VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset, scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0)); counts = vdev_indirect_mapping_load_obsolete_counts(old_mapping); if (prev_obsolete_sm != NULL) { vdev_indirect_mapping_load_obsolete_spacemap(old_mapping, counts, prev_obsolete_sm); } space_map_close(prev_obsolete_sm); /* * Generate new mapping. Determine what index to continue from * based on the max offset that we've already written in the * new mapping. 
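 * A max offset of zero means nothing has been written yet, so we start from index 0. If every old entry is already covered, start_index becomes UINT64_MAX and the generation step is skipped, leaving only the completion sync task to run.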
*/ uint64_t max_offset = vdev_indirect_mapping_max_offset(sci->sci_new_mapping); if (max_offset == 0) { /* We haven't written anything to the new mapping yet. */ start_index = 0; } else { /* * Pick up from where we left off. _entry_for_offset() * returns a pointer into the vim_entries array. If * max_offset is greater than any of the mappings * contained in the table NULL will be returned and * that indicates we've exhausted our iteration of the * old_mapping. */ vdev_indirect_mapping_entry_phys_t *entry = vdev_indirect_mapping_entry_for_offset_or_next(old_mapping, max_offset); if (entry == NULL) { /* * We've already written the whole new mapping. * This special value will cause us to skip the * generate_new_mapping step and just do the sync * task to complete the condense. */ start_index = UINT64_MAX; } else { start_index = entry - old_mapping->vim_entries; ASSERT3U(start_index, <, vdev_indirect_mapping_num_entries(old_mapping)); } } spa_condense_indirect_generate_new_mapping(vd, counts, start_index, zthr); vdev_indirect_mapping_free_obsolete_counts(old_mapping, counts); /* * If the zthr has received a cancellation signal while running * in generate_new_mapping() or at any point after that, then bail * early. We don't want to complete the condense if the spa is * shutting down. */ if (zthr_iscancelled(zthr)) return; VERIFY0(dsl_sync_task(spa_name(spa), NULL, spa_condense_indirect_complete_sync, sci, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED)); } /* * Sync task to begin the condensing process. */ void spa_condense_indirect_start_sync(vdev_t *vd, dmu_tx_t *tx) { spa_t *spa = vd->vdev_spa; spa_condensing_indirect_phys_t *scip = &spa->spa_condensing_indirect_phys; ASSERT0(scip->scip_next_mapping_object); ASSERT0(scip->scip_prev_obsolete_sm_object); ASSERT0(scip->scip_vdev); ASSERT(dmu_tx_is_syncing(tx)); ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); ASSERT(spa_feature_is_active(spa, SPA_FEATURE_OBSOLETE_COUNTS)); ASSERT(vdev_indirect_mapping_num_entries(vd->vdev_indirect_mapping)); uint64_t obsolete_sm_obj; VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_obj)); ASSERT3U(obsolete_sm_obj, !=, 0); scip->scip_vdev = vd->vdev_id; scip->scip_next_mapping_object = vdev_indirect_mapping_alloc(spa->spa_meta_objset, tx); scip->scip_prev_obsolete_sm_object = obsolete_sm_obj; /* * We don't need to allocate a new space map object, since * vdev_indirect_sync_obsolete will allocate one when needed. */ space_map_close(vd->vdev_obsolete_sm); vd->vdev_obsolete_sm = NULL; VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, tx)); VERIFY0(zap_add(spa->spa_dsl_pool->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONDENSING_INDIRECT, sizeof (uint64_t), sizeof (*scip) / sizeof (uint64_t), scip, tx)); ASSERT3P(spa->spa_condensing_indirect, ==, NULL); spa->spa_condensing_indirect = spa_condensing_indirect_create(spa); zfs_dbgmsg("starting condense of vdev %llu in txg %llu: " "posm=%llu nm=%llu", vd->vdev_id, dmu_tx_get_txg(tx), (u_longlong_t)scip->scip_prev_obsolete_sm_object, (u_longlong_t)scip->scip_next_mapping_object); zthr_wakeup(spa->spa_condense_zthr); } /* * Sync to the given vdev's obsolete space map any segments that are no longer * referenced as of the given txg. * * If the obsolete space map doesn't exist yet, create and open it. 
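The resume logic above reduces to: if nothing has been written yet, start at index 0; otherwise continue from the first old-mapping entry not yet fully covered by the new mapping; and treat "no such entry" as "already finished". A simplified standalone sketch over a plain array of {offset, size} entries (not the kernel's vdev_indirect_mapping_entry_for_offset_or_next()):

#include <stdint.h>
#include <stdio.h>

struct entry {
	uint64_t off;		/* source offset of this mapping entry */
	uint64_t size;
};

#define	DONE	UINT64_MAX	/* sentinel: nothing left to copy */

/*
 * Return the index of the first entry that ends beyond max_written, i.e.
 * where a restarted condense should resume; DONE if nothing remains.
 */
static uint64_t
resume_index(const struct entry *e, uint64_t n, uint64_t max_written)
{
	if (max_written == 0)
		return (0);
	for (uint64_t i = 0; i < n; i++) {
		if (e[i].off + e[i].size > max_written)
			return (i);
	}
	return (DONE);
}

int
main(void)
{
	struct entry map[] = { { 0, 16 }, { 16, 32 }, { 48, 16 } };

	printf("%llu\n", (unsigned long long)resume_index(map, 3, 0));  /* 0 */
	printf("%llu\n", (unsigned long long)resume_index(map, 3, 20)); /* 1 */
	printf("%llu\n", (unsigned long long)resume_index(map, 3, 64)); /* DONE */
	return (0);
}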
*/ void vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx) { spa_t *spa = vd->vdev_spa; vdev_indirect_config_t *vic __maybe_unused = &vd->vdev_indirect_config; ASSERT3U(vic->vic_mapping_object, !=, 0); ASSERT(range_tree_space(vd->vdev_obsolete_segments) > 0); ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops); ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)); uint64_t obsolete_sm_object; VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object)); if (obsolete_sm_object == 0) { obsolete_sm_object = space_map_alloc(spa->spa_meta_objset, zfs_vdev_standard_sm_blksz, tx); ASSERT(vd->vdev_top_zap != 0); VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, sizeof (obsolete_sm_object), 1, &obsolete_sm_object, tx)); ASSERT0(vdev_obsolete_sm_object(vd, &obsolete_sm_object)); ASSERT3U(obsolete_sm_object, !=, 0); spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); VERIFY0(space_map_open(&vd->vdev_obsolete_sm, spa->spa_meta_objset, obsolete_sm_object, 0, vd->vdev_asize, 0)); } ASSERT(vd->vdev_obsolete_sm != NULL); ASSERT3U(obsolete_sm_object, ==, space_map_object(vd->vdev_obsolete_sm)); space_map_write(vd->vdev_obsolete_sm, vd->vdev_obsolete_segments, SM_ALLOC, SM_NO_VDEVID, tx); range_tree_vacate(vd->vdev_obsolete_segments, NULL, NULL); } int spa_condense_init(spa_t *spa) { int error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONDENSING_INDIRECT, sizeof (uint64_t), sizeof (spa->spa_condensing_indirect_phys) / sizeof (uint64_t), &spa->spa_condensing_indirect_phys); if (error == 0) { if (spa_writeable(spa)) { spa->spa_condensing_indirect = spa_condensing_indirect_create(spa); } return (0); } else if (error == ENOENT) { return (0); } else { return (error); } } void spa_condense_fini(spa_t *spa) { if (spa->spa_condensing_indirect != NULL) { spa_condensing_indirect_destroy(spa->spa_condensing_indirect); spa->spa_condensing_indirect = NULL; } } void spa_start_indirect_condensing_thread(spa_t *spa) { ASSERT3P(spa->spa_condense_zthr, ==, NULL); spa->spa_condense_zthr = zthr_create("z_indirect_condense", spa_condense_indirect_thread_check, spa_condense_indirect_thread, spa); } /* * Gets the obsolete spacemap object from the vdev's ZAP. On success sm_obj * will contain either the obsolete spacemap object or zero if none exists. * All other errors are returned to the caller. */ int vdev_obsolete_sm_object(vdev_t *vd, uint64_t *sm_obj) { ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER)); if (vd->vdev_top_zap == 0) { *sm_obj = 0; return (0); } int error = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, sizeof (uint64_t), 1, sm_obj); if (error == ENOENT) { *sm_obj = 0; error = 0; } return (error); } /* * Gets the obsolete count are precise spacemap object from the vdev's ZAP. * On success are_precise will be set to reflect if the counts are precise. * All other errors are returned to the caller. 
*/ int vdev_obsolete_counts_are_precise(vdev_t *vd, boolean_t *are_precise) { ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER)); if (vd->vdev_top_zap == 0) { *are_precise = B_FALSE; return (0); } uint64_t val = 0; int error = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, sizeof (val), 1, &val); if (error == 0) { *are_precise = (val != 0); } else if (error == ENOENT) { *are_precise = B_FALSE; error = 0; } return (error); } /* ARGSUSED */ static void vdev_indirect_close(vdev_t *vd) { } /* ARGSUSED */ static int vdev_indirect_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, uint64_t *logical_ashift, uint64_t *physical_ashift) { *psize = *max_psize = vd->vdev_asize + VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; *logical_ashift = vd->vdev_ashift; *physical_ashift = vd->vdev_physical_ashift; return (0); } typedef struct remap_segment { vdev_t *rs_vd; uint64_t rs_offset; uint64_t rs_asize; uint64_t rs_split_offset; list_node_t rs_node; } remap_segment_t; static remap_segment_t * rs_alloc(vdev_t *vd, uint64_t offset, uint64_t asize, uint64_t split_offset) { remap_segment_t *rs = kmem_alloc(sizeof (remap_segment_t), KM_SLEEP); rs->rs_vd = vd; rs->rs_offset = offset; rs->rs_asize = asize; rs->rs_split_offset = split_offset; return (rs); } /* * Given an indirect vdev and an extent on that vdev, it duplicates the * physical entries of the indirect mapping that correspond to the extent * to a new array and returns a pointer to it. In addition, copied_entries * is populated with the number of mapping entries that were duplicated. * * Note that the function assumes that the caller holds vdev_indirect_rwlock. * This ensures that the mapping won't change due to condensing as we * copy over its contents. * * Finally, since we are doing an allocation, it is up to the caller to * free the array allocated in this function. */ static vdev_indirect_mapping_entry_phys_t * vdev_indirect_mapping_duplicate_adjacent_entries(vdev_t *vd, uint64_t offset, uint64_t asize, uint64_t *copied_entries) { vdev_indirect_mapping_entry_phys_t *duplicate_mappings = NULL; vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; uint64_t entries = 0; ASSERT(RW_READ_HELD(&vd->vdev_indirect_rwlock)); vdev_indirect_mapping_entry_phys_t *first_mapping = vdev_indirect_mapping_entry_for_offset(vim, offset); ASSERT3P(first_mapping, !=, NULL); vdev_indirect_mapping_entry_phys_t *m = first_mapping; while (asize > 0) { uint64_t size = DVA_GET_ASIZE(&m->vimep_dst); ASSERT3U(offset, >=, DVA_MAPPING_GET_SRC_OFFSET(m)); ASSERT3U(offset, <, DVA_MAPPING_GET_SRC_OFFSET(m) + size); uint64_t inner_offset = offset - DVA_MAPPING_GET_SRC_OFFSET(m); uint64_t inner_size = MIN(asize, size - inner_offset); offset += inner_size; asize -= inner_size; entries++; m++; } size_t copy_length = entries * sizeof (*first_mapping); duplicate_mappings = kmem_alloc(copy_length, KM_SLEEP); bcopy(first_mapping, duplicate_mappings, copy_length); *copied_entries = entries; return (duplicate_mappings); } /* * Goes through the relevant indirect mappings until it hits a concrete vdev * and issues the callback. On the way to the concrete vdev, if any other * indirect vdevs are encountered, then the callback will also be called on * each of those indirect vdevs. For example, if the segment is mapped to * segment A on indirect vdev 1, and then segment A on indirect vdev 1 is * mapped to segment B on concrete vdev 2, then the callback will be called on * both vdev 1 and vdev 2. 
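vdev_indirect_mapping_duplicate_adjacent_entries() snapshots just the run of mapping entries that covers the requested extent, so the caller can drop the rwlock before iterating. A simplified standalone sketch of that walk, assuming a plain sorted array, a caller-supplied index of the first covering entry, and an extent fully covered by the mapping:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct map_entry {
	uint64_t src_off;	/* where the extent used to live */
	uint64_t size;
};

/*
 * Copy the contiguous run of entries covering [offset, offset + asize) out
 * of the mapping, so it can be used without holding the mapping's lock.
 * The caller frees the result.
 */
static struct map_entry *
dup_covering_entries(const struct map_entry *vim, uint64_t first,
    uint64_t offset, uint64_t asize, uint64_t *copied)
{
	uint64_t n = 0;

	for (uint64_t i = first; asize > 0; i++) {
		const struct map_entry *m = &vim[i];
		uint64_t inner_off = offset - m->src_off;
		uint64_t inner_size = (asize < m->size - inner_off) ?
		    asize : m->size - inner_off;

		offset += inner_size;
		asize -= inner_size;
		n++;
	}

	struct map_entry *dup = malloc(n * sizeof (*dup));
	memcpy(dup, &vim[first], n * sizeof (*dup));
	*copied = n;
	return (dup);
}

int
main(void)
{
	struct map_entry vim[] = { { 0, 8 }, { 8, 8 }, { 16, 8 } };
	uint64_t n;
	struct map_entry *dup = dup_covering_entries(vim, 0, 4, 10, &n);

	printf("copied %llu entries\n", (unsigned long long)n);	/* 2 */
	free(dup);
	return (0);
}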
* * While the callback passed to vdev_indirect_remap() is called on every vdev * the function encounters, certain callbacks only care about concrete vdevs. * These types of callbacks should return immediately and explicitly when they * are called on an indirect vdev. * * Because there is a possibility that a DVA section in the indirect device * has been split into multiple sections in our mapping, we keep track * of the relevant contiguous segments of the new location (remap_segment_t) * in a stack. This way we can call the callback for each of the new sections * created by a single section of the indirect device. Note though, that in * this scenario the callbacks in each split block won't occur in-order in * terms of offset, so callers should not make any assumptions about that. * * For callbacks that don't handle split blocks and immediately return when * they encounter them (as is the case for remap_blkptr_cb), the caller can * assume that its callback will be applied from the first indirect vdev * encountered to the last one and then the concrete vdev, in that order. */ static void vdev_indirect_remap(vdev_t *vd, uint64_t offset, uint64_t asize, void (*func)(uint64_t, vdev_t *, uint64_t, uint64_t, void *), void *arg) { list_t stack; spa_t *spa = vd->vdev_spa; list_create(&stack, sizeof (remap_segment_t), offsetof(remap_segment_t, rs_node)); for (remap_segment_t *rs = rs_alloc(vd, offset, asize, 0); rs != NULL; rs = list_remove_head(&stack)) { vdev_t *v = rs->rs_vd; uint64_t num_entries = 0; ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); ASSERT(rs->rs_asize > 0); /* * Note: As this function can be called from open context * (e.g. zio_read()), we need the following rwlock to * prevent the mapping from being changed by condensing. * * So we grab the lock and we make a copy of the entries * that are relevant to the extent that we are working on. * Once that is done, we drop the lock and iterate over * our copy of the mapping. Once we are done with the with * the remap segment and we free it, we also free our copy * of the indirect mapping entries that are relevant to it. * * This way we don't need to wait until the function is * finished with a segment, to condense it. In addition, we * don't need a recursive rwlock for the case that a call to * vdev_indirect_remap() needs to call itself (through the * codepath of its callback) for the same vdev in the middle * of its execution. */ rw_enter(&v->vdev_indirect_rwlock, RW_READER); ASSERT3P(v->vdev_indirect_mapping, !=, NULL); vdev_indirect_mapping_entry_phys_t *mapping = vdev_indirect_mapping_duplicate_adjacent_entries(v, rs->rs_offset, rs->rs_asize, &num_entries); ASSERT3P(mapping, !=, NULL); ASSERT3U(num_entries, >, 0); rw_exit(&v->vdev_indirect_rwlock); for (uint64_t i = 0; i < num_entries; i++) { /* * Note: the vdev_indirect_mapping can not change * while we are running. It only changes while the * removal is in progress, and then only from syncing * context. While a removal is in progress, this * function is only called for frees, which also only * happen from syncing context. 
*/ vdev_indirect_mapping_entry_phys_t *m = &mapping[i]; ASSERT3P(m, !=, NULL); ASSERT3U(rs->rs_asize, >, 0); uint64_t size = DVA_GET_ASIZE(&m->vimep_dst); uint64_t dst_offset = DVA_GET_OFFSET(&m->vimep_dst); uint64_t dst_vdev = DVA_GET_VDEV(&m->vimep_dst); ASSERT3U(rs->rs_offset, >=, DVA_MAPPING_GET_SRC_OFFSET(m)); ASSERT3U(rs->rs_offset, <, DVA_MAPPING_GET_SRC_OFFSET(m) + size); ASSERT3U(dst_vdev, !=, v->vdev_id); uint64_t inner_offset = rs->rs_offset - DVA_MAPPING_GET_SRC_OFFSET(m); uint64_t inner_size = MIN(rs->rs_asize, size - inner_offset); vdev_t *dst_v = vdev_lookup_top(spa, dst_vdev); ASSERT3P(dst_v, !=, NULL); if (dst_v->vdev_ops == &vdev_indirect_ops) { list_insert_head(&stack, rs_alloc(dst_v, dst_offset + inner_offset, inner_size, rs->rs_split_offset)); } if ((zfs_flags & ZFS_DEBUG_INDIRECT_REMAP) && IS_P2ALIGNED(inner_size, 2 * SPA_MINBLOCKSIZE)) { /* * Note: This clause exists only solely for * testing purposes. We use it to ensure that * split blocks work and that the callbacks * using them yield the same result if issued * in reverse order. */ uint64_t inner_half = inner_size / 2; func(rs->rs_split_offset + inner_half, dst_v, dst_offset + inner_offset + inner_half, inner_half, arg); func(rs->rs_split_offset, dst_v, dst_offset + inner_offset, inner_half, arg); } else { func(rs->rs_split_offset, dst_v, dst_offset + inner_offset, inner_size, arg); } rs->rs_offset += inner_size; rs->rs_asize -= inner_size; rs->rs_split_offset += inner_size; } VERIFY0(rs->rs_asize); kmem_free(mapping, num_entries * sizeof (*mapping)); kmem_free(rs, sizeof (remap_segment_t)); } list_destroy(&stack); } static void vdev_indirect_child_io_done(zio_t *zio) { zio_t *pio = zio->io_private; mutex_enter(&pio->io_lock); pio->io_error = zio_worst_error(pio->io_error, zio->io_error); mutex_exit(&pio->io_lock); abd_free(zio->io_abd); } /* * This is a callback for vdev_indirect_remap() which allocates an * indirect_split_t for each split segment and adds it to iv_splits. */ static void vdev_indirect_gather_splits(uint64_t split_offset, vdev_t *vd, uint64_t offset, uint64_t size, void *arg) { zio_t *zio = arg; indirect_vsd_t *iv = zio->io_vsd; ASSERT3P(vd, !=, NULL); if (vd->vdev_ops == &vdev_indirect_ops) return; int n = 1; if (vd->vdev_ops == &vdev_mirror_ops) n = vd->vdev_children; indirect_split_t *is = kmem_zalloc(offsetof(indirect_split_t, is_child[n]), KM_SLEEP); is->is_children = n; is->is_size = size; is->is_split_offset = split_offset; is->is_target_offset = offset; is->is_vdev = vd; list_create(&is->is_unique_child, sizeof (indirect_child_t), offsetof(indirect_child_t, ic_node)); /* * Note that we only consider multiple copies of the data for * *mirror* vdevs. We don't for "replacing" or "spare" vdevs, even * though they use the same ops as mirror, because there's only one * "good" copy under the replacing/spare. */ if (vd->vdev_ops == &vdev_mirror_ops) { for (int i = 0; i < n; i++) { is->is_child[i].ic_vdev = vd->vdev_child[i]; list_link_init(&is->is_child[i].ic_node); } } else { is->is_child[0].ic_vdev = vd; } list_insert_tail(&iv->iv_splits, is); } static void vdev_indirect_read_split_done(zio_t *zio) { indirect_child_t *ic = zio->io_private; if (zio->io_error != 0) { /* * Clear ic_data to indicate that we do not have data for this * child. */ abd_free(ic->ic_data); ic->ic_data = NULL; } } /* * Issue reads for all copies (mirror children) of all splits. 
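The control flow of vdev_indirect_remap() is essentially a depth-first walk with an explicit stack: pop a segment, invoke the callback, and push any still-indirect result back on the stack instead of recursing. A standalone sketch of that shape, with an invented struct seg and a depth counter standing in for "which indirect vdev we are on":

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* A segment waiting to be resolved; rs_depth stands in for the vdev level. */
struct seg {
	int		rs_depth;
	uint64_t	rs_offset;
	struct seg	*rs_next;	/* stack link */
};

static struct seg *
seg_alloc(int depth, uint64_t off)
{
	struct seg *rs = malloc(sizeof (*rs));
	rs->rs_depth = depth;
	rs->rs_offset = off;
	rs->rs_next = NULL;
	return (rs);
}

/*
 * Resolve a segment through `levels` layers of indirection without
 * recursion: pop a segment, call the callback on it, and push its remapped
 * form back on the stack until a "concrete" (depth == 0) segment remains.
 */
static void
remap(int levels, uint64_t offset, void (*func)(int, uint64_t))
{
	struct seg *stack = seg_alloc(levels, offset);

	while (stack != NULL) {
		struct seg *rs = stack;
		stack = rs->rs_next;

		func(rs->rs_depth, rs->rs_offset);
		if (rs->rs_depth > 0) {
			/* Pretend each layer shifts the data by 100 bytes. */
			struct seg *child =
			    seg_alloc(rs->rs_depth - 1, rs->rs_offset + 100);
			child->rs_next = stack;
			stack = child;
		}
		free(rs);
	}
}

static void
print_cb(int depth, uint64_t off)
{
	printf("depth %d -> offset %llu\n", depth, (unsigned long long)off);
}

int
main(void)
{
	remap(2, 4096, print_cb);
	return (0);
}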
*/ static void vdev_indirect_read_all(zio_t *zio) { indirect_vsd_t *iv = zio->io_vsd; ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) { for (int i = 0; i < is->is_children; i++) { indirect_child_t *ic = &is->is_child[i]; if (!vdev_readable(ic->ic_vdev)) continue; /* * If a child is missing the data, set ic_error. Used * in vdev_indirect_repair(). We perform the read * nevertheless which provides the opportunity to * reconstruct the split block if at all possible. */ if (vdev_dtl_contains(ic->ic_vdev, DTL_MISSING, zio->io_txg, 1)) ic->ic_error = SET_ERROR(ESTALE); ic->ic_data = abd_alloc_sametype(zio->io_abd, is->is_size); ic->ic_duplicate = NULL; zio_nowait(zio_vdev_child_io(zio, NULL, ic->ic_vdev, is->is_target_offset, ic->ic_data, is->is_size, zio->io_type, zio->io_priority, 0, vdev_indirect_read_split_done, ic)); } } iv->iv_reconstruct = B_TRUE; } static void vdev_indirect_io_start(zio_t *zio) { spa_t *spa __maybe_unused = zio->io_spa; indirect_vsd_t *iv = kmem_zalloc(sizeof (*iv), KM_SLEEP); list_create(&iv->iv_splits, sizeof (indirect_split_t), offsetof(indirect_split_t, is_node)); zio->io_vsd = iv; zio->io_vsd_ops = &vdev_indirect_vsd_ops; ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); if (zio->io_type != ZIO_TYPE_READ) { ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); /* * Note: this code can handle other kinds of writes, * but we don't expect them. */ ASSERT((zio->io_flags & (ZIO_FLAG_SELF_HEAL | ZIO_FLAG_RESILVER | ZIO_FLAG_INDUCE_DAMAGE)) != 0); } vdev_indirect_remap(zio->io_vd, zio->io_offset, zio->io_size, vdev_indirect_gather_splits, zio); indirect_split_t *first = list_head(&iv->iv_splits); if (first->is_size == zio->io_size) { /* * This is not a split block; we are pointing to the entire * data, which will checksum the same as the original data. * Pass the BP down so that the child i/o can verify the * checksum, and try a different location if available * (e.g. on a mirror). * * While this special case could be handled the same as the * general (split block) case, doing it this way ensures * that the vast majority of blocks on indirect vdevs * (which are not split) are handled identically to blocks * on non-indirect vdevs. This allows us to be less strict * about performance in the general (but rare) case. */ ASSERT0(first->is_split_offset); ASSERT3P(list_next(&iv->iv_splits, first), ==, NULL); zio_nowait(zio_vdev_child_io(zio, zio->io_bp, first->is_vdev, first->is_target_offset, abd_get_offset(zio->io_abd, 0), zio->io_size, zio->io_type, zio->io_priority, 0, vdev_indirect_child_io_done, zio)); } else { iv->iv_split_block = B_TRUE; if (zio->io_type == ZIO_TYPE_READ && zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) { /* * Read all copies. Note that for simplicity, * we don't bother consulting the DTL in the * resilver case. */ vdev_indirect_read_all(zio); } else { /* * If this is a read zio, we read one copy of each * split segment, from the top-level vdev. Since * we don't know the checksum of each split * individually, the child zio can't ensure that * we get the right data. E.g. if it's a mirror, * it will just read from a random (healthy) leaf * vdev. We have to verify the checksum in * vdev_indirect_io_done(). * * For write zios, the vdev code will ensure we write * to all children. 
*/ for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) { zio_nowait(zio_vdev_child_io(zio, NULL, is->is_vdev, is->is_target_offset, abd_get_offset(zio->io_abd, is->is_split_offset), is->is_size, zio->io_type, zio->io_priority, 0, vdev_indirect_child_io_done, zio)); } } } zio_execute(zio); } /* * Report a checksum error for a child. */ static void vdev_indirect_checksum_error(zio_t *zio, indirect_split_t *is, indirect_child_t *ic) { vdev_t *vd = ic->ic_vdev; if (zio->io_flags & ZIO_FLAG_SPECULATIVE) return; mutex_enter(&vd->vdev_stat_lock); vd->vdev_stat.vs_checksum_errors++; mutex_exit(&vd->vdev_stat_lock); zio_bad_cksum_t zbc = {{{ 0 }}}; abd_t *bad_abd = ic->ic_data; abd_t *good_abd = is->is_good_child->ic_data; (void) zfs_ereport_post_checksum(zio->io_spa, vd, NULL, zio, is->is_target_offset, is->is_size, good_abd, bad_abd, &zbc); } /* * Issue repair i/os for any incorrect copies. We do this by comparing * each split segment's correct data (is_good_child's ic_data) with each * other copy of the data. If they differ, then we overwrite the bad data * with the good copy. The DTL is checked in vdev_indirect_read_all() and * if a vdev is missing a copy of the data we set ic_error and the read is * performed. This provides the opportunity to reconstruct the split block * if at all possible. ic_error is checked here and if set it suppresses * incrementing the checksum counter. Aside from this DTLs are not checked, * which simplifies this code and also issues the optimal number of writes * (based on which copies actually read bad data, as opposed to which we * think might be wrong). For the same reason, we always use * ZIO_FLAG_SELF_HEAL, to bypass the DTL check in zio_vdev_io_start(). */ static void vdev_indirect_repair(zio_t *zio) { indirect_vsd_t *iv = zio->io_vsd; enum zio_flag flags = ZIO_FLAG_IO_REPAIR; if (!(zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) flags |= ZIO_FLAG_SELF_HEAL; if (!spa_writeable(zio->io_spa)) return; for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) { for (int c = 0; c < is->is_children; c++) { indirect_child_t *ic = &is->is_child[c]; if (ic == is->is_good_child) continue; if (ic->ic_data == NULL) continue; if (ic->ic_duplicate == is->is_good_child) continue; zio_nowait(zio_vdev_child_io(zio, NULL, ic->ic_vdev, is->is_target_offset, is->is_good_child->ic_data, is->is_size, ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL, NULL, NULL)); /* * If ic_error is set the current child does not have * a copy of the data, so suppress incrementing the * checksum counter. */ if (ic->ic_error == ESTALE) continue; vdev_indirect_checksum_error(zio, is, ic); } } } /* * Report checksum errors on all children that we read from. 
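vdev_indirect_repair() only rewrites copies whose bytes differ from the copy that validated. A standalone sketch of that compare-and-overwrite loop, using flat buffers and memcpy in place of ABDs and repair child I/Os:

#include <stdio.h>
#include <string.h>

#define	NCOPIES	3
#define	SEGSZ	8

/*
 * good is the index of the copy that passed the checksum; overwrite any
 * other copy whose bytes differ, and report how many repairs were issued.
 */
static int
repair_split(char copies[NCOPIES][SEGSZ], int good)
{
	int repairs = 0;

	for (int c = 0; c < NCOPIES; c++) {
		if (c == good)
			continue;
		if (memcmp(copies[c], copies[good], SEGSZ) == 0)
			continue;
		memcpy(copies[c], copies[good], SEGSZ);	/* "repair write" */
		repairs++;
	}
	return (repairs);
}

int
main(void)
{
	char copies[NCOPIES][SEGSZ] = { "gooddat", "baddat!", "gooddat" };

	printf("repaired %d copies\n", repair_split(copies, 0));	/* 1 */
	return (0);
}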
*/ static void vdev_indirect_all_checksum_errors(zio_t *zio) { indirect_vsd_t *iv = zio->io_vsd; if (zio->io_flags & ZIO_FLAG_SPECULATIVE) return; for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) { for (int c = 0; c < is->is_children; c++) { indirect_child_t *ic = &is->is_child[c]; if (ic->ic_data == NULL) continue; vdev_t *vd = ic->ic_vdev; - int ret = zfs_ereport_post_checksum(zio->io_spa, vd, + (void) zfs_ereport_post_checksum(zio->io_spa, vd, NULL, zio, is->is_target_offset, is->is_size, NULL, NULL, NULL); - if (ret != EALREADY) { - mutex_enter(&vd->vdev_stat_lock); - vd->vdev_stat.vs_checksum_errors++; - mutex_exit(&vd->vdev_stat_lock); - } + mutex_enter(&vd->vdev_stat_lock); + vd->vdev_stat.vs_checksum_errors++; + mutex_exit(&vd->vdev_stat_lock); } } } /* * Copy data from all the splits to a main zio then validate the checksum. * If then checksum is successfully validated return success. */ static int vdev_indirect_splits_checksum_validate(indirect_vsd_t *iv, zio_t *zio) { zio_bad_cksum_t zbc; for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) { ASSERT3P(is->is_good_child->ic_data, !=, NULL); ASSERT3P(is->is_good_child->ic_duplicate, ==, NULL); abd_copy_off(zio->io_abd, is->is_good_child->ic_data, is->is_split_offset, 0, is->is_size); } return (zio_checksum_error(zio, &zbc)); } /* * There are relatively few possible combinations making it feasible to * deterministically check them all. We do this by setting the good_child * to the next unique split version. If we reach the end of the list then * "carry over" to the next unique split version (like counting in base * is_unique_children, but each digit can have a different base). */ static int vdev_indirect_splits_enumerate_all(indirect_vsd_t *iv, zio_t *zio) { boolean_t more = B_TRUE; iv->iv_attempts = 0; for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) is->is_good_child = list_head(&is->is_unique_child); while (more == B_TRUE) { iv->iv_attempts++; more = B_FALSE; if (vdev_indirect_splits_checksum_validate(iv, zio) == 0) return (0); for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) { is->is_good_child = list_next(&is->is_unique_child, is->is_good_child); if (is->is_good_child != NULL) { more = B_TRUE; break; } is->is_good_child = list_head(&is->is_unique_child); } } ASSERT3S(iv->iv_attempts, <=, iv->iv_unique_combinations); return (SET_ERROR(ECKSUM)); } /* * There are too many combinations to try all of them in a reasonable amount * of time. So try a fixed number of random combinations from the unique * split versions, after which we'll consider the block unrecoverable. */ static int vdev_indirect_splits_enumerate_randomly(indirect_vsd_t *iv, zio_t *zio) { iv->iv_attempts = 0; while (iv->iv_attempts < iv->iv_attempts_max) { iv->iv_attempts++; for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) { indirect_child_t *ic = list_head(&is->is_unique_child); int children = is->is_unique_children; for (int i = spa_get_random(children); i > 0; i--) ic = list_next(&is->is_unique_child, ic); ASSERT3P(ic, !=, NULL); is->is_good_child = ic; } if (vdev_indirect_splits_checksum_validate(iv, zio) == 0) return (0); } return (SET_ERROR(ECKSUM)); } /* * This is a validation function for reconstruction. 
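The exhaustive search in vdev_indirect_splits_enumerate_all() is the mixed-radix counting described in its comment: advance the first split to its next unique copy, and on wrap-around carry into the next split. A standalone sketch using arrays of indices in place of the is_unique_child lists:

#include <stdio.h>

#define	NSPLITS	3

/*
 * Step through every combination of choices where split i has base[i]
 * possibilities, exactly like counting with a different base per digit.
 * Returns the number of combinations visited.
 */
static int
enumerate_all(const int base[NSPLITS], void (*visit)(const int *))
{
	int choice[NSPLITS] = { 0 };
	int visited = 0;
	int more = 1;

	while (more) {
		visited++;
		visit(choice);

		more = 0;
		for (int i = 0; i < NSPLITS; i++) {
			if (++choice[i] < base[i]) {
				more = 1;	/* no carry needed */
				break;
			}
			choice[i] = 0;		/* wrap and carry */
		}
	}
	return (visited);
}

static void
print_combo(const int *choice)
{
	printf("%d %d %d\n", choice[0], choice[1], choice[2]);
}

int
main(void)
{
	int base[NSPLITS] = { 2, 1, 3 };	/* unique children per split */

	printf("tried %d combinations\n", enumerate_all(base, print_combo));
	return (0);
}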
It randomly selects * a good combination, if one can be found, and then it intentionally * damages all other segment copes by zeroing them. This forces the * reconstruction algorithm to locate the one remaining known good copy. */ static int vdev_indirect_splits_damage(indirect_vsd_t *iv, zio_t *zio) { int error; /* Presume all the copies are unique for initial selection. */ for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) { is->is_unique_children = 0; for (int i = 0; i < is->is_children; i++) { indirect_child_t *ic = &is->is_child[i]; if (ic->ic_data != NULL) { is->is_unique_children++; list_insert_tail(&is->is_unique_child, ic); } } if (list_is_empty(&is->is_unique_child)) { error = SET_ERROR(EIO); goto out; } } /* * Set each is_good_child to a randomly-selected child which * is known to contain validated data. */ error = vdev_indirect_splits_enumerate_randomly(iv, zio); if (error) goto out; /* * Damage all but the known good copy by zeroing it. This will * result in two or less unique copies per indirect_child_t. * Both may need to be checked in order to reconstruct the block. * Set iv->iv_attempts_max such that all unique combinations will * enumerated, but limit the damage to at most 12 indirect splits. */ iv->iv_attempts_max = 1; for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) { for (int c = 0; c < is->is_children; c++) { indirect_child_t *ic = &is->is_child[c]; if (ic == is->is_good_child) continue; if (ic->ic_data == NULL) continue; abd_zero(ic->ic_data, abd_get_size(ic->ic_data)); } iv->iv_attempts_max *= 2; if (iv->iv_attempts_max >= (1ULL << 12)) { iv->iv_attempts_max = UINT64_MAX; break; } } out: /* Empty the unique children lists so they can be reconstructed. */ for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) { indirect_child_t *ic; while ((ic = list_head(&is->is_unique_child)) != NULL) list_remove(&is->is_unique_child, ic); is->is_unique_children = 0; } return (error); } /* * This function is called when we have read all copies of the data and need * to try to find a combination of copies that gives us the right checksum. * * If we pointed to any mirror vdevs, this effectively does the job of the * mirror. The mirror vdev code can't do its own job because we don't know * the checksum of each split segment individually. * * We have to try every unique combination of copies of split segments, until * we find one that checksums correctly. Duplicate segment copies are first * identified and latter skipped during reconstruction. This optimization * reduces the search space and ensures that of the remaining combinations * at most one is correct. * * When the total number of combinations is small they can all be checked. * For example, if we have 3 segments in the split, and each points to a * 2-way mirror with unique copies, we will have the following pieces of data: * * | mirror child * split | [0] [1] * ======|===================== * A | data_A_0 data_A_1 * B | data_B_0 data_B_1 * C | data_C_0 data_C_1 * * We will try the following (mirror children)^(number of splits) (2^3=8) * combinations, which is similar to bitwise-little-endian counting in * binary. In general each "digit" corresponds to a split segment, and the * base of each digit is is_children, which can be different for each * digit. 
* * "low bit" "high bit" * v v * data_A_0 data_B_0 data_C_0 * data_A_1 data_B_0 data_C_0 * data_A_0 data_B_1 data_C_0 * data_A_1 data_B_1 data_C_0 * data_A_0 data_B_0 data_C_1 * data_A_1 data_B_0 data_C_1 * data_A_0 data_B_1 data_C_1 * data_A_1 data_B_1 data_C_1 * * Note that the split segments may be on the same or different top-level * vdevs. In either case, we may need to try lots of combinations (see * zfs_reconstruct_indirect_combinations_max). This ensures that if a mirror * has small silent errors on all of its children, we can still reconstruct * the correct data, as long as those errors are at sufficiently-separated * offsets (specifically, separated by the largest block size - default of * 128KB, but up to 16MB). */ static void vdev_indirect_reconstruct_io_done(zio_t *zio) { indirect_vsd_t *iv = zio->io_vsd; boolean_t known_good = B_FALSE; int error; iv->iv_unique_combinations = 1; iv->iv_attempts_max = UINT64_MAX; if (zfs_reconstruct_indirect_combinations_max > 0) iv->iv_attempts_max = zfs_reconstruct_indirect_combinations_max; /* * If nonzero, every 1/x blocks will be damaged, in order to validate * reconstruction when there are split segments with damaged copies. * Known_good will be TRUE when reconstruction is known to be possible. */ if (zfs_reconstruct_indirect_damage_fraction != 0 && spa_get_random(zfs_reconstruct_indirect_damage_fraction) == 0) known_good = (vdev_indirect_splits_damage(iv, zio) == 0); /* * Determine the unique children for a split segment and add them * to the is_unique_child list. By restricting reconstruction * to these children, only unique combinations will be considered. * This can vastly reduce the search space when there are a large * number of indirect splits. */ for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) { is->is_unique_children = 0; for (int i = 0; i < is->is_children; i++) { indirect_child_t *ic_i = &is->is_child[i]; if (ic_i->ic_data == NULL || ic_i->ic_duplicate != NULL) continue; for (int j = i + 1; j < is->is_children; j++) { indirect_child_t *ic_j = &is->is_child[j]; if (ic_j->ic_data == NULL || ic_j->ic_duplicate != NULL) continue; if (abd_cmp(ic_i->ic_data, ic_j->ic_data) == 0) ic_j->ic_duplicate = ic_i; } is->is_unique_children++; list_insert_tail(&is->is_unique_child, ic_i); } /* Reconstruction is impossible, no valid children */ EQUIV(list_is_empty(&is->is_unique_child), is->is_unique_children == 0); if (list_is_empty(&is->is_unique_child)) { zio->io_error = EIO; vdev_indirect_all_checksum_errors(zio); zio_checksum_verified(zio); return; } iv->iv_unique_combinations *= is->is_unique_children; } if (iv->iv_unique_combinations <= iv->iv_attempts_max) error = vdev_indirect_splits_enumerate_all(iv, zio); else error = vdev_indirect_splits_enumerate_randomly(iv, zio); if (error != 0) { /* All attempted combinations failed. */ ASSERT3B(known_good, ==, B_FALSE); zio->io_error = error; vdev_indirect_all_checksum_errors(zio); } else { /* * The checksum has been successfully validated. Issue * repair I/Os to any copies of splits which don't match * the validated version. */ ASSERT0(vdev_indirect_splits_checksum_validate(iv, zio)); vdev_indirect_repair(zio); zio_checksum_verified(zio); } } static void vdev_indirect_io_done(zio_t *zio) { indirect_vsd_t *iv = zio->io_vsd; if (iv->iv_reconstruct) { /* * We have read all copies of the data (e.g. from mirrors), * either because this was a scrub/resilver, or because the * one-copy read didn't checksum correctly. 
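The duplicate-detection pass in vdev_indirect_reconstruct_io_done() shrinks the search space by comparing each copy against the earlier copies of the same split and keeping only the unique ones. A standalone sketch with flat buffers and an index array in place of the ic_duplicate pointers:

#include <stdio.h>
#include <string.h>

#define	NCHILDREN	4
#define	SEGSZ		4

/*
 * Mark dup[i] with the index of an identical earlier copy (-1 if the copy
 * is unique) and return the number of unique copies, which is the factor
 * this split contributes to the total combination count.
 */
static int
find_unique(char data[NCHILDREN][SEGSZ], int dup[NCHILDREN])
{
	int unique = 0;

	for (int i = 0; i < NCHILDREN; i++) {
		dup[i] = -1;
		for (int j = 0; j < i; j++) {
			if (dup[j] == -1 &&
			    memcmp(data[i], data[j], SEGSZ) == 0) {
				dup[i] = j;
				break;
			}
		}
		if (dup[i] == -1)
			unique++;
	}
	return (unique);
}

int
main(void)
{
	char data[NCHILDREN][SEGSZ] = { "aaa", "bbb", "aaa", "bbb" };
	int dup[NCHILDREN];

	printf("%d unique copies\n", find_unique(data, dup));	/* 2 */
	return (0);
}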
*/ vdev_indirect_reconstruct_io_done(zio); return; } if (!iv->iv_split_block) { /* * This was not a split block, so we passed the BP down, * and the checksum was handled by the (one) child zio. */ return; } zio_bad_cksum_t zbc; int ret = zio_checksum_error(zio, &zbc); if (ret == 0) { zio_checksum_verified(zio); return; } /* * The checksum didn't match. Read all copies of all splits, and * then we will try to reconstruct. The next time * vdev_indirect_io_done() is called, iv_reconstruct will be set. */ vdev_indirect_read_all(zio); zio_vdev_io_redone(zio); } vdev_ops_t vdev_indirect_ops = { .vdev_op_init = NULL, .vdev_op_fini = NULL, .vdev_op_open = vdev_indirect_open, .vdev_op_close = vdev_indirect_close, .vdev_op_asize = vdev_default_asize, .vdev_op_min_asize = vdev_default_min_asize, .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_indirect_io_start, .vdev_op_io_done = vdev_indirect_io_done, .vdev_op_state_change = NULL, .vdev_op_need_resilver = NULL, .vdev_op_hold = NULL, .vdev_op_rele = NULL, .vdev_op_remap = vdev_indirect_remap, .vdev_op_xlate = NULL, .vdev_op_rebuild_asize = NULL, .vdev_op_metaslab_init = NULL, .vdev_op_config_generate = NULL, .vdev_op_nparity = NULL, .vdev_op_ndisks = NULL, .vdev_op_type = VDEV_TYPE_INDIRECT, /* name of this vdev type */ .vdev_op_leaf = B_FALSE /* leaf vdev */ }; EXPORT_SYMBOL(spa_condense_fini); EXPORT_SYMBOL(spa_start_indirect_condensing_thread); EXPORT_SYMBOL(spa_condense_indirect_start_sync); EXPORT_SYMBOL(spa_condense_init); EXPORT_SYMBOL(spa_vdev_indirect_mark_obsolete); EXPORT_SYMBOL(vdev_indirect_mark_obsolete); EXPORT_SYMBOL(vdev_indirect_should_condense); EXPORT_SYMBOL(vdev_indirect_sync_obsolete); EXPORT_SYMBOL(vdev_obsolete_counts_are_precise); EXPORT_SYMBOL(vdev_obsolete_sm_object); /* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, indirect_vdevs_enable, INT, ZMOD_RW, "Whether to attempt condensing indirect vdev mappings"); ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, min_mapping_bytes, ULONG, ZMOD_RW, "Don't bother condensing if the mapping uses less than this amount of " "memory"); ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, max_obsolete_bytes, ULONG, ZMOD_RW, "Minimum size obsolete spacemap to attempt condensing"); ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, indirect_commit_entry_delay_ms, INT, ZMOD_RW, "Used by tests to ensure certain actions happen in the middle of a " "condense. A maximum value of 1 should be sufficient."); ZFS_MODULE_PARAM(zfs_reconstruct, zfs_reconstruct_, indirect_combinations_max, INT, ZMOD_RW, "Maximum number of combinations when reconstructing split segments"); /* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz.c b/sys/contrib/openzfs/module/zfs/vdev_raidz.c index f4812e61252c..57a594c80ce3 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_raidz.c +++ b/sys/contrib/openzfs/module/zfs/vdev_raidz.c @@ -1,2747 +1,2743 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
* If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2020 by Delphix. All rights reserved. * Copyright (c) 2016 Gvozden Nešković. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #ifdef ZFS_DEBUG #include /* For vdev_xlate() in vdev_raidz_io_verify() */ #endif /* * Virtual device vector for RAID-Z. * * This vdev supports single, double, and triple parity. For single parity, * we use a simple XOR of all the data columns. For double or triple parity, * we use a special case of Reed-Solomon coding. This extends the * technique described in "The mathematics of RAID-6" by H. Peter Anvin by * drawing on the system described in "A Tutorial on Reed-Solomon Coding for * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the * former is also based. The latter is designed to provide higher performance * for writes. * * Note that the Plank paper claimed to support arbitrary N+M, but was then * amended six years later identifying a critical flaw that invalidates its * claims. Nevertheless, the technique can be adapted to work for up to * triple parity. For additional parity, the amendment "Note: Correction to * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding * is viable, but the additional complexity means that write performance will * suffer. * * All of the methods above operate on a Galois field, defined over the * integers mod 2^N. In our case we choose N=8 for GF(8) so that all elements * can be expressed with a single byte. Briefly, the operations on the * field are defined as follows: * * o addition (+) is represented by a bitwise XOR * o subtraction (-) is therefore identical to addition: A + B = A - B * o multiplication of A by 2 is defined by the following bitwise expression: * * (A * 2)_7 = A_6 * (A * 2)_6 = A_5 * (A * 2)_5 = A_4 * (A * 2)_4 = A_3 + A_7 * (A * 2)_3 = A_2 + A_7 * (A * 2)_2 = A_1 + A_7 * (A * 2)_1 = A_0 * (A * 2)_0 = A_7 * * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)). * As an aside, this multiplication is derived from the error correcting * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1. * * Observe that any number in the field (except for 0) can be expressed as a * power of 2 -- a generator for the field. We store a table of the powers of * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather * than field addition). The inverse of a field element A (A^-1) is therefore * A ^ (255 - 1) = A^254. * * The up-to-three parity columns, P, Q, R over several data columns, * D_0, ... D_n-1, can be expressed by field operations: * * P = D_0 + D_1 + ... + D_n-2 + D_n-1 * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1 * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1 * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1 * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1 * * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial * XOR operation, and 2 and 4 can be computed quickly and generate linearly- * independent coefficients. 
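The parity definitions above translate directly into a few lines of C: P is a running XOR, and Q is the running "multiply the accumulator by 2, then add the next column" product. A standalone sketch for one byte from each of four data columns, using the same multiply-by-2 expression (this is an illustration only, not the kernel's vectorized path):

#include <stdint.h>
#include <stdio.h>

/* GF(2^8) multiply by 2, from the bitwise expression in the comment above. */
static uint8_t
gf_mul2(uint8_t a)
{
	return ((uint8_t)((a << 1) ^ ((a & 0x80) ? 0x1d : 0)));
}

int
main(void)
{
	/* One byte from each of four data columns D_0 .. D_3. */
	uint8_t d[4] = { 0x11, 0x22, 0x33, 0x44 };
	uint8_t p = 0, q = 0;

	for (int c = 0; c < 4; c++) {
		p ^= d[c];		/* P = D_0 + D_1 + ... + D_n-1 */
		q = gf_mul2(q) ^ d[c];	/* Q = ((D_0 * 2 + D_1) * 2 + ...) */
	}

	printf("P = %02x, Q = %02x\n", p, q);
	return (0);
}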
(There are no additional coefficients that have * this property which is why the uncorrected Plank method breaks down.) * * See the reconstruction code below for how P, Q and R can used individually * or in concert to recover missing data columns. */ #define VDEV_RAIDZ_P 0 #define VDEV_RAIDZ_Q 1 #define VDEV_RAIDZ_R 2 #define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0)) #define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x))) /* * We provide a mechanism to perform the field multiplication operation on a * 64-bit value all at once rather than a byte at a time. This works by * creating a mask from the top bit in each byte and using that to * conditionally apply the XOR of 0x1d. */ #define VDEV_RAIDZ_64MUL_2(x, mask) \ { \ (mask) = (x) & 0x8080808080808080ULL; \ (mask) = ((mask) << 1) - ((mask) >> 7); \ (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \ ((mask) & 0x1d1d1d1d1d1d1d1dULL); \ } #define VDEV_RAIDZ_64MUL_4(x, mask) \ { \ VDEV_RAIDZ_64MUL_2((x), mask); \ VDEV_RAIDZ_64MUL_2((x), mask); \ } static void vdev_raidz_row_free(raidz_row_t *rr) { for (int c = 0; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; if (rc->rc_size != 0) abd_free(rc->rc_abd); if (rc->rc_gdata != NULL) abd_free(rc->rc_gdata); if (rc->rc_orig_data != NULL) zio_buf_free(rc->rc_orig_data, rc->rc_size); } if (rr->rr_abd_copy != NULL) abd_free(rr->rr_abd_copy); if (rr->rr_abd_empty != NULL) abd_free(rr->rr_abd_empty); kmem_free(rr, offsetof(raidz_row_t, rr_col[rr->rr_scols])); } void vdev_raidz_map_free(raidz_map_t *rm) { for (int i = 0; i < rm->rm_nrows; i++) vdev_raidz_row_free(rm->rm_row[i]); kmem_free(rm, offsetof(raidz_map_t, rm_row[rm->rm_nrows])); } static void vdev_raidz_map_free_vsd(zio_t *zio) { raidz_map_t *rm = zio->io_vsd; ASSERT0(rm->rm_freed); rm->rm_freed = B_TRUE; if (rm->rm_reports == 0) { vdev_raidz_map_free(rm); } } /*ARGSUSED*/ static void vdev_raidz_cksum_free(void *arg, size_t ignored) { raidz_map_t *rm = arg; ASSERT3U(rm->rm_reports, >, 0); if (--rm->rm_reports == 0 && rm->rm_freed) vdev_raidz_map_free(rm); } static void vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const abd_t *good_data) { raidz_map_t *rm = zcr->zcr_cbdata; const size_t c = zcr->zcr_cbinfo; size_t x, offset; if (good_data == NULL) { zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE); return; } ASSERT3U(rm->rm_nrows, ==, 1); raidz_row_t *rr = rm->rm_row[0]; const abd_t *good = NULL; const abd_t *bad = rr->rr_col[c].rc_abd; if (c < rr->rr_firstdatacol) { /* * The first time through, calculate the parity blocks for * the good data (this relies on the fact that the good * data never changes for a given logical ZIO) */ if (rr->rr_col[0].rc_gdata == NULL) { abd_t *bad_parity[VDEV_RAIDZ_MAXPARITY]; /* * Set up the rr_col[]s to generate the parity for * good_data, first saving the parity bufs and * replacing them with buffers to hold the result. */ for (x = 0; x < rr->rr_firstdatacol; x++) { bad_parity[x] = rr->rr_col[x].rc_abd; rr->rr_col[x].rc_abd = rr->rr_col[x].rc_gdata = abd_alloc_sametype(rr->rr_col[x].rc_abd, rr->rr_col[x].rc_size); } /* fill in the data columns from good_data */ offset = 0; for (; x < rr->rr_cols; x++) { abd_free(rr->rr_col[x].rc_abd); rr->rr_col[x].rc_abd = abd_get_offset_size((abd_t *)good_data, offset, rr->rr_col[x].rc_size); offset += rr->rr_col[x].rc_size; } /* * Construct the parity from the good data. 
*/ vdev_raidz_generate_parity_row(rm, rr); /* restore everything back to its original state */ for (x = 0; x < rr->rr_firstdatacol; x++) rr->rr_col[x].rc_abd = bad_parity[x]; offset = 0; for (x = rr->rr_firstdatacol; x < rr->rr_cols; x++) { abd_free(rr->rr_col[x].rc_abd); rr->rr_col[x].rc_abd = abd_get_offset_size( rr->rr_abd_copy, offset, rr->rr_col[x].rc_size); offset += rr->rr_col[x].rc_size; } } ASSERT3P(rr->rr_col[c].rc_gdata, !=, NULL); good = abd_get_offset_size(rr->rr_col[c].rc_gdata, 0, rr->rr_col[c].rc_size); } else { /* adjust good_data to point at the start of our column */ offset = 0; for (x = rr->rr_firstdatacol; x < c; x++) offset += rr->rr_col[x].rc_size; good = abd_get_offset_size((abd_t *)good_data, offset, rr->rr_col[c].rc_size); } /* we drop the ereport if it ends up that the data was good */ zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE); abd_free((abd_t *)good); } /* * Invoked indirectly by zfs_ereport_start_checksum(), called * below when our read operation fails completely. The main point * is to keep a copy of everything we read from disk, so that at * vdev_raidz_cksum_finish() time we can compare it with the good data. */ static void vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg) { size_t c = (size_t)(uintptr_t)arg; raidz_map_t *rm = zio->io_vsd; /* set up the report and bump the refcount */ zcr->zcr_cbdata = rm; zcr->zcr_cbinfo = c; zcr->zcr_finish = vdev_raidz_cksum_finish; zcr->zcr_free = vdev_raidz_cksum_free; rm->rm_reports++; ASSERT3U(rm->rm_reports, >, 0); ASSERT3U(rm->rm_nrows, ==, 1); if (rm->rm_row[0]->rr_abd_copy != NULL) return; /* * It's the first time we're called for this raidz_map_t, so we need * to copy the data aside; there's no guarantee that our zio's buffer * won't be re-used for something else. * * Our parity data is already in separate buffers, so there's no need * to copy them. */ for (int i = 0; i < rm->rm_nrows; i++) { raidz_row_t *rr = rm->rm_row[i]; size_t offset = 0; size_t size = 0; for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) size += rr->rr_col[c].rc_size; rr->rr_abd_copy = abd_alloc_for_io(size, B_FALSE); for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { raidz_col_t *col = &rr->rr_col[c]; abd_t *tmp = abd_get_offset_size(rr->rr_abd_copy, offset, col->rc_size); abd_copy(tmp, col->rc_abd, col->rc_size); abd_free(col->rc_abd); col->rc_abd = tmp; offset += col->rc_size; } ASSERT3U(offset, ==, size); } } static const zio_vsd_ops_t vdev_raidz_vsd_ops = { .vsd_free = vdev_raidz_map_free_vsd, .vsd_cksum_report = vdev_raidz_cksum_report }; /* * Divides the IO evenly across all child vdevs; usually, dcols is * the number of children in the target vdev. * * Avoid inlining the function to keep vdev_raidz_io_start(), which * is this functions only caller, as small as possible on the stack. */ noinline raidz_map_t * vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, uint64_t nparity) { raidz_row_t *rr; /* The starting RAIDZ (parent) vdev sector of the block. */ uint64_t b = zio->io_offset >> ashift; /* The zio's size in units of the vdev's minimum sector size. */ uint64_t s = zio->io_size >> ashift; /* The first column for this stripe. */ uint64_t f = b % dcols; /* The starting byte offset on each child vdev. 
*/ uint64_t o = (b / dcols) << ashift; uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; raidz_map_t *rm = kmem_zalloc(offsetof(raidz_map_t, rm_row[1]), KM_SLEEP); rm->rm_nrows = 1; /* * "Quotient": The number of data sectors for this stripe on all but * the "big column" child vdevs that also contain "remainder" data. */ q = s / (dcols - nparity); /* * "Remainder": The number of partial stripe data sectors in this I/O. * This will add a sector to some, but not all, child vdevs. */ r = s - q * (dcols - nparity); /* The number of "big columns" - those which contain remainder data. */ bc = (r == 0 ? 0 : r + nparity); /* * The total number of data and parity sectors associated with * this I/O. */ tot = s + nparity * (q + (r == 0 ? 0 : 1)); /* * acols: The columns that will be accessed. * scols: The columns that will be accessed or skipped. */ if (q == 0) { /* Our I/O request doesn't span all child vdevs. */ acols = bc; scols = MIN(dcols, roundup(bc, nparity + 1)); } else { acols = dcols; scols = dcols; } ASSERT3U(acols, <=, scols); rr = kmem_alloc(offsetof(raidz_row_t, rr_col[scols]), KM_SLEEP); rm->rm_row[0] = rr; rr->rr_cols = acols; rr->rr_scols = scols; rr->rr_bigcols = bc; rr->rr_missingdata = 0; rr->rr_missingparity = 0; rr->rr_firstdatacol = nparity; rr->rr_abd_copy = NULL; rr->rr_abd_empty = NULL; rr->rr_nempty = 0; #ifdef ZFS_DEBUG rr->rr_offset = zio->io_offset; rr->rr_size = zio->io_size; #endif asize = 0; for (c = 0; c < scols; c++) { raidz_col_t *rc = &rr->rr_col[c]; col = f + c; coff = o; if (col >= dcols) { col -= dcols; coff += 1ULL << ashift; } rc->rc_devidx = col; rc->rc_offset = coff; rc->rc_abd = NULL; rc->rc_gdata = NULL; rc->rc_orig_data = NULL; rc->rc_error = 0; rc->rc_tried = 0; rc->rc_skipped = 0; rc->rc_repair = 0; rc->rc_need_orig_restore = B_FALSE; if (c >= acols) rc->rc_size = 0; else if (c < bc) rc->rc_size = (q + 1) << ashift; else rc->rc_size = q << ashift; asize += rc->rc_size; } ASSERT3U(asize, ==, tot << ashift); rm->rm_nskip = roundup(tot, nparity + 1) - tot; rm->rm_skipstart = bc; for (c = 0; c < rr->rr_firstdatacol; c++) rr->rr_col[c].rc_abd = abd_alloc_linear(rr->rr_col[c].rc_size, B_FALSE); for (uint64_t off = 0; c < acols; c++) { raidz_col_t *rc = &rr->rr_col[c]; rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct, zio->io_abd, off, rc->rc_size); off += rc->rc_size; } /* * If all data stored spans all columns, there's a danger that parity * will always be on the same device and, since parity isn't read * during normal operation, that device's I/O bandwidth won't be * used effectively. We therefore switch the parity every 1MB. * * ... at least that was, ostensibly, the theory. As a practical * matter unless we juggle the parity between all devices evenly, we * won't see any benefit. Further, occasional writes that aren't a * multiple of the LCM of the number of children and the minimum * stripe width are sufficient to avoid pessimal behavior. * Unfortunately, this decision created an implicit on-disk format * requirement that we need to support for all eternity, but only * for single-parity RAID-Z. * * If we intend to skip a sector in the zeroth column for padding * we must make sure to note this swap. We will never intend to * skip the first column since at least one data and one parity * column must appear in each row. 
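The geometry computed above (q, r, bc, tot and the per-column sizes) can be checked in isolation. A standalone sketch that prints the layout for an assumed 512-byte sector size, five children, and single parity:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t ashift = 9;		/* 512-byte sectors (assumed) */
	uint64_t dcols = 5;		/* children in the raidz vdev */
	uint64_t nparity = 1;		/* raidz1 */
	uint64_t io_size = 7ULL << ashift;	/* a 7-sector write */

	uint64_t s = io_size >> ashift;		/* data sectors */
	uint64_t q = s / (dcols - nparity);	/* full rows per column */
	uint64_t r = s - q * (dcols - nparity);	/* leftover data sectors */
	uint64_t bc = (r == 0 ? 0 : r + nparity);	/* "big" columns */
	uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1));

	printf("q=%llu r=%llu bc=%llu total sectors=%llu\n",
	    (unsigned long long)q, (unsigned long long)r,
	    (unsigned long long)bc, (unsigned long long)tot);

	for (uint64_t c = 0; c < dcols; c++) {
		uint64_t sectors = (c < bc) ? q + 1 : q;

		printf("col %llu: %llu sectors%s\n", (unsigned long long)c,
		    (unsigned long long)sectors,
		    c < nparity ? " (parity)" : "");
	}
	return (0);
}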
*/ ASSERT(rr->rr_cols >= 2); ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size); if (rr->rr_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) { devidx = rr->rr_col[0].rc_devidx; o = rr->rr_col[0].rc_offset; rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx; rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset; rr->rr_col[1].rc_devidx = devidx; rr->rr_col[1].rc_offset = o; if (rm->rm_skipstart == 0) rm->rm_skipstart = 1; } /* init RAIDZ parity ops */ rm->rm_ops = vdev_raidz_math_get_ops(); return (rm); } struct pqr_struct { uint64_t *p; uint64_t *q; uint64_t *r; }; static int vdev_raidz_p_func(void *buf, size_t size, void *private) { struct pqr_struct *pqr = private; const uint64_t *src = buf; int i, cnt = size / sizeof (src[0]); ASSERT(pqr->p && !pqr->q && !pqr->r); for (i = 0; i < cnt; i++, src++, pqr->p++) *pqr->p ^= *src; return (0); } static int vdev_raidz_pq_func(void *buf, size_t size, void *private) { struct pqr_struct *pqr = private; const uint64_t *src = buf; uint64_t mask; int i, cnt = size / sizeof (src[0]); ASSERT(pqr->p && pqr->q && !pqr->r); for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) { *pqr->p ^= *src; VDEV_RAIDZ_64MUL_2(*pqr->q, mask); *pqr->q ^= *src; } return (0); } static int vdev_raidz_pqr_func(void *buf, size_t size, void *private) { struct pqr_struct *pqr = private; const uint64_t *src = buf; uint64_t mask; int i, cnt = size / sizeof (src[0]); ASSERT(pqr->p && pqr->q && pqr->r); for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) { *pqr->p ^= *src; VDEV_RAIDZ_64MUL_2(*pqr->q, mask); *pqr->q ^= *src; VDEV_RAIDZ_64MUL_4(*pqr->r, mask); *pqr->r ^= *src; } return (0); } static void vdev_raidz_generate_parity_p(raidz_row_t *rr) { uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { abd_t *src = rr->rr_col[c].rc_abd; if (c == rr->rr_firstdatacol) { abd_copy_to_buf(p, src, rr->rr_col[c].rc_size); } else { struct pqr_struct pqr = { p, NULL, NULL }; (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size, vdev_raidz_p_func, &pqr); } } } static void vdev_raidz_generate_parity_pq(raidz_row_t *rr) { uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size == rr->rr_col[VDEV_RAIDZ_Q].rc_size); for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { abd_t *src = rr->rr_col[c].rc_abd; uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]); if (c == rr->rr_firstdatacol) { ASSERT(ccnt == pcnt || ccnt == 0); abd_copy_to_buf(p, src, rr->rr_col[c].rc_size); (void) memcpy(q, p, rr->rr_col[c].rc_size); for (uint64_t i = ccnt; i < pcnt; i++) { p[i] = 0; q[i] = 0; } } else { struct pqr_struct pqr = { p, q, NULL }; ASSERT(ccnt <= pcnt); (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size, vdev_raidz_pq_func, &pqr); /* * Treat short columns as though they are full of 0s. * Note that there's therefore nothing needed for P. 
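The VDEV_RAIDZ_64MUL_2 macro used in these short-column loops multiplies eight field elements by 2 in a single 64-bit operation by turning each byte's top bit into a full-byte mask for the conditional XOR of 0x1d. A standalone sketch that checks the word-at-a-time form against the byte-at-a-time expression:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Same trick as the kernel macro: eight GF(2^8) multiplies by 2 per word. */
static uint64_t
mul2_64(uint64_t x)
{
	uint64_t mask = x & 0x8080808080808080ULL;

	mask = (mask << 1) - (mask >> 7);
	return (((x << 1) & 0xfefefefefefefefeULL) ^
	    (mask & 0x1d1d1d1d1d1d1d1dULL));
}

static uint8_t
mul2_8(uint8_t a)
{
	return ((uint8_t)((a << 1) ^ ((a & 0x80) ? 0x1d : 0)));
}

int
main(void)
{
	uint64_t x = 0x0123456789abcdefULL;
	uint8_t b[8];

	memcpy(b, &x, sizeof (b));
	for (int i = 0; i < 8; i++)
		b[i] = mul2_8(b[i]);

	uint64_t bytewise;
	memcpy(&bytewise, b, sizeof (bytewise));
	printf("%s\n", bytewise == mul2_64(x) ? "match" : "MISMATCH");
	return (0);
}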
*/ uint64_t mask; for (uint64_t i = ccnt; i < pcnt; i++) { VDEV_RAIDZ_64MUL_2(q[i], mask); } } } } static void vdev_raidz_generate_parity_pqr(raidz_row_t *rr) { uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); uint64_t *r = abd_to_buf(rr->rr_col[VDEV_RAIDZ_R].rc_abd); uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size == rr->rr_col[VDEV_RAIDZ_Q].rc_size); ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size == rr->rr_col[VDEV_RAIDZ_R].rc_size); for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { abd_t *src = rr->rr_col[c].rc_abd; uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]); if (c == rr->rr_firstdatacol) { ASSERT(ccnt == pcnt || ccnt == 0); abd_copy_to_buf(p, src, rr->rr_col[c].rc_size); (void) memcpy(q, p, rr->rr_col[c].rc_size); (void) memcpy(r, p, rr->rr_col[c].rc_size); for (uint64_t i = ccnt; i < pcnt; i++) { p[i] = 0; q[i] = 0; r[i] = 0; } } else { struct pqr_struct pqr = { p, q, r }; ASSERT(ccnt <= pcnt); (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size, vdev_raidz_pqr_func, &pqr); /* * Treat short columns as though they are full of 0s. * Note that there's therefore nothing needed for P. */ uint64_t mask; for (uint64_t i = ccnt; i < pcnt; i++) { VDEV_RAIDZ_64MUL_2(q[i], mask); VDEV_RAIDZ_64MUL_4(r[i], mask); } } } } /* * Generate RAID parity in the first virtual columns according to the number of * parity columns available. */ void vdev_raidz_generate_parity_row(raidz_map_t *rm, raidz_row_t *rr) { ASSERT3U(rr->rr_cols, !=, 0); /* Generate using the new math implementation */ if (vdev_raidz_math_generate(rm, rr) != RAIDZ_ORIGINAL_IMPL) return; switch (rr->rr_firstdatacol) { case 1: vdev_raidz_generate_parity_p(rr); break; case 2: vdev_raidz_generate_parity_pq(rr); break; case 3: vdev_raidz_generate_parity_pqr(rr); break; default: cmn_err(CE_PANIC, "invalid RAID-Z configuration"); } } void vdev_raidz_generate_parity(raidz_map_t *rm) { for (int i = 0; i < rm->rm_nrows; i++) { raidz_row_t *rr = rm->rm_row[i]; vdev_raidz_generate_parity_row(rm, rr); } } /* ARGSUSED */ static int vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private) { uint64_t *dst = dbuf; uint64_t *src = sbuf; int cnt = size / sizeof (src[0]); for (int i = 0; i < cnt; i++) { dst[i] ^= src[i]; } return (0); } /* ARGSUSED */ static int vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size, void *private) { uint64_t *dst = dbuf; uint64_t *src = sbuf; uint64_t mask; int cnt = size / sizeof (dst[0]); for (int i = 0; i < cnt; i++, dst++, src++) { VDEV_RAIDZ_64MUL_2(*dst, mask); *dst ^= *src; } return (0); } /* ARGSUSED */ static int vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private) { uint64_t *dst = buf; uint64_t mask; int cnt = size / sizeof (dst[0]); for (int i = 0; i < cnt; i++, dst++) { /* same operation as vdev_raidz_reconst_q_pre_func() on dst */ VDEV_RAIDZ_64MUL_2(*dst, mask); } return (0); } struct reconst_q_struct { uint64_t *q; int exp; }; static int vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private) { struct reconst_q_struct *rq = private; uint64_t *dst = buf; int cnt = size / sizeof (dst[0]); for (int i = 0; i < cnt; i++, dst++, rq->q++) { int j; uint8_t *b; *dst ^= *rq->q; for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) { *b = vdev_raidz_exp2(*b, rq->exp); } } return (0); } struct reconst_pq_struct { uint8_t *p; uint8_t *q; uint8_t *pxy; uint8_t *qxy; int aexp; int bexp; }; static int 
vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private) { struct reconst_pq_struct *rpq = private; uint8_t *xd = xbuf; uint8_t *yd = ybuf; for (int i = 0; i < size; i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) { *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^ vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp); *yd = *rpq->p ^ *rpq->pxy ^ *xd; } return (0); } static int vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private) { struct reconst_pq_struct *rpq = private; uint8_t *xd = xbuf; for (int i = 0; i < size; i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) { /* same operation as vdev_raidz_reconst_pq_func() on xd */ *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^ vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp); } return (0); } static int vdev_raidz_reconstruct_p(raidz_row_t *rr, int *tgts, int ntgts) { int x = tgts[0]; abd_t *dst, *src; ASSERT3U(ntgts, ==, 1); ASSERT3U(x, >=, rr->rr_firstdatacol); ASSERT3U(x, <, rr->rr_cols); ASSERT3U(rr->rr_col[x].rc_size, <=, rr->rr_col[VDEV_RAIDZ_P].rc_size); src = rr->rr_col[VDEV_RAIDZ_P].rc_abd; dst = rr->rr_col[x].rc_abd; abd_copy_from_buf(dst, abd_to_buf(src), rr->rr_col[x].rc_size); for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { uint64_t size = MIN(rr->rr_col[x].rc_size, rr->rr_col[c].rc_size); src = rr->rr_col[c].rc_abd; if (c == x) continue; (void) abd_iterate_func2(dst, src, 0, 0, size, vdev_raidz_reconst_p_func, NULL); } return (1 << VDEV_RAIDZ_P); } static int vdev_raidz_reconstruct_q(raidz_row_t *rr, int *tgts, int ntgts) { int x = tgts[0]; int c, exp; abd_t *dst, *src; ASSERT(ntgts == 1); ASSERT(rr->rr_col[x].rc_size <= rr->rr_col[VDEV_RAIDZ_Q].rc_size); for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { uint64_t size = (c == x) ? 0 : MIN(rr->rr_col[x].rc_size, rr->rr_col[c].rc_size); src = rr->rr_col[c].rc_abd; dst = rr->rr_col[x].rc_abd; if (c == rr->rr_firstdatacol) { abd_copy(dst, src, size); if (rr->rr_col[x].rc_size > size) { abd_zero_off(dst, size, rr->rr_col[x].rc_size - size); } } else { ASSERT3U(size, <=, rr->rr_col[x].rc_size); (void) abd_iterate_func2(dst, src, 0, 0, size, vdev_raidz_reconst_q_pre_func, NULL); (void) abd_iterate_func(dst, size, rr->rr_col[x].rc_size - size, vdev_raidz_reconst_q_pre_tail_func, NULL); } } src = rr->rr_col[VDEV_RAIDZ_Q].rc_abd; dst = rr->rr_col[x].rc_abd; exp = 255 - (rr->rr_cols - 1 - x); struct reconst_q_struct rq = { abd_to_buf(src), exp }; (void) abd_iterate_func(dst, 0, rr->rr_col[x].rc_size, vdev_raidz_reconst_q_post_func, &rq); return (1 << VDEV_RAIDZ_Q); } static int vdev_raidz_reconstruct_pq(raidz_row_t *rr, int *tgts, int ntgts) { uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp; abd_t *pdata, *qdata; uint64_t xsize, ysize; int x = tgts[0]; int y = tgts[1]; abd_t *xd, *yd; ASSERT(ntgts == 2); ASSERT(x < y); ASSERT(x >= rr->rr_firstdatacol); ASSERT(y < rr->rr_cols); ASSERT(rr->rr_col[x].rc_size >= rr->rr_col[y].rc_size); /* * Move the parity data aside -- we're going to compute parity as * though columns x and y were full of zeros -- Pxy and Qxy. We want to * reuse the parity generation mechanism without trashing the actual * parity so we make those columns appear to be full of zeros by * setting their lengths to zero. 
*/ pdata = rr->rr_col[VDEV_RAIDZ_P].rc_abd; qdata = rr->rr_col[VDEV_RAIDZ_Q].rc_abd; xsize = rr->rr_col[x].rc_size; ysize = rr->rr_col[y].rc_size; rr->rr_col[VDEV_RAIDZ_P].rc_abd = abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_P].rc_size, B_TRUE); rr->rr_col[VDEV_RAIDZ_Q].rc_abd = abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_Q].rc_size, B_TRUE); rr->rr_col[x].rc_size = 0; rr->rr_col[y].rc_size = 0; vdev_raidz_generate_parity_pq(rr); rr->rr_col[x].rc_size = xsize; rr->rr_col[y].rc_size = ysize; p = abd_to_buf(pdata); q = abd_to_buf(qdata); pxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); qxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); xd = rr->rr_col[x].rc_abd; yd = rr->rr_col[y].rc_abd; /* * We now have: * Pxy = P + D_x + D_y * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y * * We can then solve for D_x: * D_x = A * (P + Pxy) + B * (Q + Qxy) * where * A = 2^(x - y) * (2^(x - y) + 1)^-1 * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1 * * With D_x in hand, we can easily solve for D_y: * D_y = P + Pxy + D_x */ a = vdev_raidz_pow2[255 + x - y]; b = vdev_raidz_pow2[255 - (rr->rr_cols - 1 - x)]; tmp = 255 - vdev_raidz_log2[a ^ 1]; aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)]; bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)]; ASSERT3U(xsize, >=, ysize); struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp }; (void) abd_iterate_func2(xd, yd, 0, 0, ysize, vdev_raidz_reconst_pq_func, &rpq); (void) abd_iterate_func(xd, ysize, xsize - ysize, vdev_raidz_reconst_pq_tail_func, &rpq); abd_free(rr->rr_col[VDEV_RAIDZ_P].rc_abd); abd_free(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); /* * Restore the saved parity data. */ rr->rr_col[VDEV_RAIDZ_P].rc_abd = pdata; rr->rr_col[VDEV_RAIDZ_Q].rc_abd = qdata; return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q)); } /* BEGIN CSTYLED */ /* * In the general case of reconstruction, we must solve the system of linear * equations defined by the coefficients used to generate parity as well as * the contents of the data and parity disks. This can be expressed with * vectors for the original data (D) and the actual data (d) and parity (p) * and a matrix composed of the identity matrix (I) and a dispersal matrix (V): * * __ __ __ __ * | | __ __ | p_0 | * | V | | D_0 | | p_m-1 | * | | x | : | = | d_0 | * | I | | D_n-1 | | : | * | | ~~ ~~ | d_n-1 | * ~~ ~~ ~~ ~~ * * I is simply a square identity matrix of size n, and V is a vandermonde * matrix defined by the coefficients we chose for the various parity columns * (1, 2, 4). Note that these values were chosen both for simplicity, speedy * computation as well as linear separability. * * __ __ __ __ * | 1 .. 1 1 1 | | p_0 | * | 2^n-1 .. 4 2 1 | __ __ | : | * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 | * | 1 .. 0 0 0 | | D_1 | | d_0 | * | 0 .. 0 0 0 | x | D_2 | = | d_1 | * | : : : : | | : | | d_2 | * | 0 .. 1 0 0 | | D_n-1 | | : | * | 0 .. 0 1 0 | ~~ ~~ | : | * | 0 .. 0 0 1 | | d_n-1 | * ~~ ~~ ~~ ~~ * * Note that I, V, d, and p are known. To compute D, we must invert the * matrix and use the known data and parity values to reconstruct the unknown * data values. We begin by removing the rows in V|I and d|p that correspond * to failed or missing columns; we then make V|I square (n x n) and d|p * sized n by removing rows corresponding to unused parity from the bottom up * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)' * using Gauss-Jordan elimination. 
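 * All of this arithmetic is over GF(2^8): addition is XOR and
 * multiplication goes through the vdev_raidz_log2/vdev_raidz_pow2
 * lookup tables used throughout this file.  As an illustrative sketch
 * only (gf_mul() is not an actual helper here; the driver uses
 * vdev_raidz_exp2()), such a multiply is:
 *
 *	static uint8_t
 *	gf_mul(uint8_t a, uint8_t b)
 *	{
 *		if (a == 0 || b == 0)
 *			return (0);
 *		return (vdev_raidz_pow2[(vdev_raidz_log2[a] +
 *		    vdev_raidz_log2[b]) % 255]);
 *	}
 *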
In the example below we use m=3 parity * columns, n=8 data columns, with errors in d_1, d_2, and p_1: * __ __ * | 1 1 1 1 1 1 1 1 | * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks * | 19 205 116 29 64 16 4 1 | / / * | 1 0 0 0 0 0 0 0 | / / * | 0 1 0 0 0 0 0 0 | <--' / * (V|I) = | 0 0 1 0 0 0 0 0 | <---' * | 0 0 0 1 0 0 0 0 | * | 0 0 0 0 1 0 0 0 | * | 0 0 0 0 0 1 0 0 | * | 0 0 0 0 0 0 1 0 | * | 0 0 0 0 0 0 0 1 | * ~~ ~~ * __ __ * | 1 1 1 1 1 1 1 1 | * | 128 64 32 16 8 4 2 1 | * | 19 205 116 29 64 16 4 1 | * | 1 0 0 0 0 0 0 0 | * | 0 1 0 0 0 0 0 0 | * (V|I)' = | 0 0 1 0 0 0 0 0 | * | 0 0 0 1 0 0 0 0 | * | 0 0 0 0 1 0 0 0 | * | 0 0 0 0 0 1 0 0 | * | 0 0 0 0 0 0 1 0 | * | 0 0 0 0 0 0 0 1 | * ~~ ~~ * * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We * have carefully chosen the seed values 1, 2, and 4 to ensure that this * matrix is not singular. * __ __ * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | * ~~ ~~ * __ __ * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | * ~~ ~~ * __ __ * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 | * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | * ~~ ~~ * __ __ * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 | * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | * ~~ ~~ * __ __ * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | * ~~ ~~ * __ __ * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 | * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | * ~~ ~~ * __ __ * | 0 0 1 0 0 0 0 0 | * | 167 100 5 41 159 169 217 208 | * | 166 100 4 40 158 168 216 209 | * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 | * | 0 0 0 0 1 0 0 0 | * | 0 0 0 0 0 1 0 0 | * | 0 0 0 0 0 0 1 0 | * | 0 0 0 0 0 0 0 1 | * ~~ ~~ * * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values * of the missing data. * * As is apparent from the example above, the only non-trivial rows in the * inverse matrix correspond to the data disks that we're trying to * reconstruct. Indeed, those are the only rows we need as the others would * only be useful for reconstructing data known or assumed to be valid. 
For * that reason, we only build the coefficients in the rows that correspond to * targeted columns. */ /* END CSTYLED */ static void vdev_raidz_matrix_init(raidz_row_t *rr, int n, int nmap, int *map, uint8_t **rows) { int i, j; int pow; ASSERT(n == rr->rr_cols - rr->rr_firstdatacol); /* * Fill in the missing rows of interest. */ for (i = 0; i < nmap; i++) { ASSERT3S(0, <=, map[i]); ASSERT3S(map[i], <=, 2); pow = map[i] * n; if (pow > 255) pow -= 255; ASSERT(pow <= 255); for (j = 0; j < n; j++) { pow -= map[i]; if (pow < 0) pow += 255; rows[i][j] = vdev_raidz_pow2[pow]; } } } static void vdev_raidz_matrix_invert(raidz_row_t *rr, int n, int nmissing, int *missing, uint8_t **rows, uint8_t **invrows, const uint8_t *used) { int i, j, ii, jj; uint8_t log; /* * Assert that the first nmissing entries from the array of used * columns correspond to parity columns and that subsequent entries * correspond to data columns. */ for (i = 0; i < nmissing; i++) { ASSERT3S(used[i], <, rr->rr_firstdatacol); } for (; i < n; i++) { ASSERT3S(used[i], >=, rr->rr_firstdatacol); } /* * First initialize the storage where we'll compute the inverse rows. */ for (i = 0; i < nmissing; i++) { for (j = 0; j < n; j++) { invrows[i][j] = (i == j) ? 1 : 0; } } /* * Subtract all trivial rows from the rows of consequence. */ for (i = 0; i < nmissing; i++) { for (j = nmissing; j < n; j++) { ASSERT3U(used[j], >=, rr->rr_firstdatacol); jj = used[j] - rr->rr_firstdatacol; ASSERT3S(jj, <, n); invrows[i][j] = rows[i][jj]; rows[i][jj] = 0; } } /* * For each of the rows of interest, we must normalize it and subtract * a multiple of it from the other rows. */ for (i = 0; i < nmissing; i++) { for (j = 0; j < missing[i]; j++) { ASSERT0(rows[i][j]); } ASSERT3U(rows[i][missing[i]], !=, 0); /* * Compute the inverse of the first element and multiply each * element in the row by that value. */ log = 255 - vdev_raidz_log2[rows[i][missing[i]]]; for (j = 0; j < n; j++) { rows[i][j] = vdev_raidz_exp2(rows[i][j], log); invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log); } for (ii = 0; ii < nmissing; ii++) { if (i == ii) continue; ASSERT3U(rows[ii][missing[i]], !=, 0); log = vdev_raidz_log2[rows[ii][missing[i]]]; for (j = 0; j < n; j++) { rows[ii][j] ^= vdev_raidz_exp2(rows[i][j], log); invrows[ii][j] ^= vdev_raidz_exp2(invrows[i][j], log); } } } /* * Verify that the data that is left in the rows are properly part of * an identity matrix. 
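 * That is, each remaining row must now contain a 1 in the column of
 * the data disk it reconstructs and 0 everywhere else, which is what
 * the asserts below check.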
*/ for (i = 0; i < nmissing; i++) { for (j = 0; j < n; j++) { if (j == missing[i]) { ASSERT3U(rows[i][j], ==, 1); } else { ASSERT0(rows[i][j]); } } } } static void vdev_raidz_matrix_reconstruct(raidz_row_t *rr, int n, int nmissing, int *missing, uint8_t **invrows, const uint8_t *used) { int i, j, x, cc, c; uint8_t *src; uint64_t ccount; uint8_t *dst[VDEV_RAIDZ_MAXPARITY] = { NULL }; uint64_t dcount[VDEV_RAIDZ_MAXPARITY] = { 0 }; uint8_t log = 0; uint8_t val; int ll; uint8_t *invlog[VDEV_RAIDZ_MAXPARITY]; uint8_t *p, *pp; size_t psize; psize = sizeof (invlog[0][0]) * n * nmissing; p = kmem_alloc(psize, KM_SLEEP); for (pp = p, i = 0; i < nmissing; i++) { invlog[i] = pp; pp += n; } for (i = 0; i < nmissing; i++) { for (j = 0; j < n; j++) { ASSERT3U(invrows[i][j], !=, 0); invlog[i][j] = vdev_raidz_log2[invrows[i][j]]; } } for (i = 0; i < n; i++) { c = used[i]; ASSERT3U(c, <, rr->rr_cols); ccount = rr->rr_col[c].rc_size; ASSERT(ccount >= rr->rr_col[missing[0]].rc_size || i > 0); if (ccount == 0) continue; src = abd_to_buf(rr->rr_col[c].rc_abd); for (j = 0; j < nmissing; j++) { cc = missing[j] + rr->rr_firstdatacol; ASSERT3U(cc, >=, rr->rr_firstdatacol); ASSERT3U(cc, <, rr->rr_cols); ASSERT3U(cc, !=, c); dcount[j] = rr->rr_col[cc].rc_size; if (dcount[j] != 0) dst[j] = abd_to_buf(rr->rr_col[cc].rc_abd); } for (x = 0; x < ccount; x++, src++) { if (*src != 0) log = vdev_raidz_log2[*src]; for (cc = 0; cc < nmissing; cc++) { if (x >= dcount[cc]) continue; if (*src == 0) { val = 0; } else { if ((ll = log + invlog[cc][i]) >= 255) ll -= 255; val = vdev_raidz_pow2[ll]; } if (i == 0) dst[cc][x] = val; else dst[cc][x] ^= val; } } } kmem_free(p, psize); } static int vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts) { int n, i, c, t, tt; int nmissing_rows; int missing_rows[VDEV_RAIDZ_MAXPARITY]; int parity_map[VDEV_RAIDZ_MAXPARITY]; uint8_t *p, *pp; size_t psize; uint8_t *rows[VDEV_RAIDZ_MAXPARITY]; uint8_t *invrows[VDEV_RAIDZ_MAXPARITY]; uint8_t *used; abd_t **bufs = NULL; int code = 0; /* * Matrix reconstruction can't use scatter ABDs yet, so we allocate * temporary linear ABDs if any non-linear ABDs are found. */ for (i = rr->rr_firstdatacol; i < rr->rr_cols; i++) { if (!abd_is_linear(rr->rr_col[i].rc_abd)) { bufs = kmem_alloc(rr->rr_cols * sizeof (abd_t *), KM_PUSHPAGE); for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { raidz_col_t *col = &rr->rr_col[c]; bufs[c] = col->rc_abd; if (bufs[c] != NULL) { col->rc_abd = abd_alloc_linear( col->rc_size, B_TRUE); abd_copy(col->rc_abd, bufs[c], col->rc_size); } } break; } } n = rr->rr_cols - rr->rr_firstdatacol; /* * Figure out which data columns are missing. */ nmissing_rows = 0; for (t = 0; t < ntgts; t++) { if (tgts[t] >= rr->rr_firstdatacol) { missing_rows[nmissing_rows++] = tgts[t] - rr->rr_firstdatacol; } } /* * Figure out which parity columns to use to help generate the missing * data columns. */ for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) { ASSERT(tt < ntgts); ASSERT(c < rr->rr_firstdatacol); /* * Skip any targeted parity columns. 
*/ if (c == tgts[tt]) { tt++; continue; } code |= 1 << c; parity_map[i] = c; i++; } ASSERT(code != 0); ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY); psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) * nmissing_rows * n + sizeof (used[0]) * n; p = kmem_alloc(psize, KM_SLEEP); for (pp = p, i = 0; i < nmissing_rows; i++) { rows[i] = pp; pp += n; invrows[i] = pp; pp += n; } used = pp; for (i = 0; i < nmissing_rows; i++) { used[i] = parity_map[i]; } for (tt = 0, c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { if (tt < nmissing_rows && c == missing_rows[tt] + rr->rr_firstdatacol) { tt++; continue; } ASSERT3S(i, <, n); used[i] = c; i++; } /* * Initialize the interesting rows of the matrix. */ vdev_raidz_matrix_init(rr, n, nmissing_rows, parity_map, rows); /* * Invert the matrix. */ vdev_raidz_matrix_invert(rr, n, nmissing_rows, missing_rows, rows, invrows, used); /* * Reconstruct the missing data using the generated matrix. */ vdev_raidz_matrix_reconstruct(rr, n, nmissing_rows, missing_rows, invrows, used); kmem_free(p, psize); /* * copy back from temporary linear abds and free them */ if (bufs) { for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { raidz_col_t *col = &rr->rr_col[c]; if (bufs[c] != NULL) { abd_copy(bufs[c], col->rc_abd, col->rc_size); abd_free(col->rc_abd); } col->rc_abd = bufs[c]; } kmem_free(bufs, rr->rr_cols * sizeof (abd_t *)); } return (code); } static int vdev_raidz_reconstruct_row(raidz_map_t *rm, raidz_row_t *rr, const int *t, int nt) { int tgts[VDEV_RAIDZ_MAXPARITY], *dt; int ntgts; int i, c, ret; int code; int nbadparity, nbaddata; int parity_valid[VDEV_RAIDZ_MAXPARITY]; nbadparity = rr->rr_firstdatacol; nbaddata = rr->rr_cols - nbadparity; ntgts = 0; for (i = 0, c = 0; c < rr->rr_cols; c++) { if (c < rr->rr_firstdatacol) parity_valid[c] = B_FALSE; if (i < nt && c == t[i]) { tgts[ntgts++] = c; i++; } else if (rr->rr_col[c].rc_error != 0) { tgts[ntgts++] = c; } else if (c >= rr->rr_firstdatacol) { nbaddata--; } else { parity_valid[c] = B_TRUE; nbadparity--; } } ASSERT(ntgts >= nt); ASSERT(nbaddata >= 0); ASSERT(nbaddata + nbadparity == ntgts); dt = &tgts[nbadparity]; /* Reconstruct using the new math implementation */ ret = vdev_raidz_math_reconstruct(rm, rr, parity_valid, dt, nbaddata); if (ret != RAIDZ_ORIGINAL_IMPL) return (ret); /* * See if we can use any of our optimized reconstruction routines. 
*/ switch (nbaddata) { case 1: if (parity_valid[VDEV_RAIDZ_P]) return (vdev_raidz_reconstruct_p(rr, dt, 1)); ASSERT(rr->rr_firstdatacol > 1); if (parity_valid[VDEV_RAIDZ_Q]) return (vdev_raidz_reconstruct_q(rr, dt, 1)); ASSERT(rr->rr_firstdatacol > 2); break; case 2: ASSERT(rr->rr_firstdatacol > 1); if (parity_valid[VDEV_RAIDZ_P] && parity_valid[VDEV_RAIDZ_Q]) return (vdev_raidz_reconstruct_pq(rr, dt, 2)); ASSERT(rr->rr_firstdatacol > 2); break; } code = vdev_raidz_reconstruct_general(rr, tgts, ntgts); ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY)); ASSERT(code > 0); return (code); } static int vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, uint64_t *logical_ashift, uint64_t *physical_ashift) { vdev_raidz_t *vdrz = vd->vdev_tsd; uint64_t nparity = vdrz->vd_nparity; int c; int lasterror = 0; int numerrors = 0; ASSERT(nparity > 0); if (nparity > VDEV_RAIDZ_MAXPARITY || vd->vdev_children < nparity + 1) { vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; return (SET_ERROR(EINVAL)); } vdev_open_children(vd); for (c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; if (cvd->vdev_open_error != 0) { lasterror = cvd->vdev_open_error; numerrors++; continue; } *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift); *physical_ashift = MAX(*physical_ashift, cvd->vdev_physical_ashift); } *asize *= vd->vdev_children; *max_asize *= vd->vdev_children; if (numerrors > nparity) { vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; return (lasterror); } return (0); } static void vdev_raidz_close(vdev_t *vd) { for (int c = 0; c < vd->vdev_children; c++) { if (vd->vdev_child[c] != NULL) vdev_close(vd->vdev_child[c]); } } static uint64_t vdev_raidz_asize(vdev_t *vd, uint64_t psize) { vdev_raidz_t *vdrz = vd->vdev_tsd; uint64_t asize; uint64_t ashift = vd->vdev_top->vdev_ashift; uint64_t cols = vdrz->vd_logical_width; uint64_t nparity = vdrz->vd_nparity; asize = ((psize - 1) >> ashift) + 1; asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity)); asize = roundup(asize, nparity + 1) << ashift; return (asize); } /* * The allocatable space for a raidz vdev is N * sizeof(smallest child) * so each child must provide at least 1/Nth of its asize. */ static uint64_t vdev_raidz_min_asize(vdev_t *vd) { return ((vd->vdev_min_asize + vd->vdev_children - 1) / vd->vdev_children); } void vdev_raidz_child_done(zio_t *zio) { raidz_col_t *rc = zio->io_private; rc->rc_error = zio->io_error; rc->rc_tried = 1; rc->rc_skipped = 0; } static void vdev_raidz_io_verify(vdev_t *vd, raidz_row_t *rr, int col) { #ifdef ZFS_DEBUG vdev_t *tvd = vd->vdev_top; range_seg64_t logical_rs, physical_rs, remain_rs; logical_rs.rs_start = rr->rr_offset; logical_rs.rs_end = logical_rs.rs_start + vdev_raidz_asize(vd, rr->rr_size); raidz_col_t *rc = &rr->rr_col[col]; vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; vdev_xlate(cvd, &logical_rs, &physical_rs, &remain_rs); ASSERT(vdev_xlate_is_empty(&remain_rs)); ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start); ASSERT3U(rc->rc_offset, <, physical_rs.rs_end); /* * It would be nice to assert that rs_end is equal * to rc_offset + rc_size but there might be an * optional I/O at the end that is not accounted in * rc_size. 
*/ if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) { ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size + (1 << tvd->vdev_ashift)); } else { ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size); } #endif } static void vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr, uint64_t ashift) { vdev_t *vd = zio->io_vd; raidz_map_t *rm = zio->io_vsd; int c, i; vdev_raidz_generate_parity_row(rm, rr); for (int c = 0; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; if (rc->rc_size == 0) continue; /* Verify physical to logical translation */ vdev_raidz_io_verify(vd, rr, c); zio_nowait(zio_vdev_child_io(zio, NULL, vd->vdev_child[rc->rc_devidx], rc->rc_offset, rc->rc_abd, rc->rc_size, zio->io_type, zio->io_priority, 0, vdev_raidz_child_done, rc)); } /* * Generate optional I/Os for skip sectors to improve aggregation * contiguity. */ for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) { ASSERT(c <= rr->rr_scols); if (c == rr->rr_scols) c = 0; raidz_col_t *rc = &rr->rr_col[c]; vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset + rc->rc_size, NULL, 1ULL << ashift, zio->io_type, zio->io_priority, ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL)); } } static void vdev_raidz_io_start_read(zio_t *zio, raidz_row_t *rr) { vdev_t *vd = zio->io_vd; /* * Iterate over the columns in reverse order so that we hit the parity * last -- any errors along the way will force us to read the parity. */ for (int c = rr->rr_cols - 1; c >= 0; c--) { raidz_col_t *rc = &rr->rr_col[c]; if (rc->rc_size == 0) continue; vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; if (!vdev_readable(cvd)) { if (c >= rr->rr_firstdatacol) rr->rr_missingdata++; else rr->rr_missingparity++; rc->rc_error = SET_ERROR(ENXIO); rc->rc_tried = 1; /* don't even try */ rc->rc_skipped = 1; continue; } if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { if (c >= rr->rr_firstdatacol) rr->rr_missingdata++; else rr->rr_missingparity++; rc->rc_error = SET_ERROR(ESTALE); rc->rc_skipped = 1; continue; } if (c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 || (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, rc->rc_abd, rc->rc_size, zio->io_type, zio->io_priority, 0, vdev_raidz_child_done, rc)); } } } /* * Start an IO operation on a RAIDZ VDev * * Outline: * - For write operations: * 1. Generate the parity data * 2. Create child zio write operations to each column's vdev, for both * data and parity. * 3. If the column skips any sectors for padding, create optional dummy * write zio children for those areas to improve aggregation continuity. * - For read operations: * 1. Create child zio read operations to each data column's vdev to read * the range of data required for zio. * 2. If this is a scrub or resilver operation, or if any of the data * vdevs have had errors, then create zio read operations to the parity * columns' VDevs as well. */ static void vdev_raidz_io_start(zio_t *zio) { vdev_t *vd = zio->io_vd; vdev_t *tvd = vd->vdev_top; vdev_raidz_t *vdrz = vd->vdev_tsd; raidz_map_t *rm; rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vdrz->vd_logical_width, vdrz->vd_nparity); /* * Until raidz expansion is implemented all maps for a raidz vdev * contain a single row. 
*/ ASSERT3U(rm->rm_nrows, ==, 1); raidz_row_t *rr = rm->rm_row[0]; zio->io_vsd = rm; zio->io_vsd_ops = &vdev_raidz_vsd_ops; if (zio->io_type == ZIO_TYPE_WRITE) { vdev_raidz_io_start_write(zio, rr, tvd->vdev_ashift); } else { ASSERT(zio->io_type == ZIO_TYPE_READ); vdev_raidz_io_start_read(zio, rr); } zio_execute(zio); } /* * Report a checksum error for a child of a RAID-Z device. */ static void raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data) { vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE) && zio->io_priority != ZIO_PRIORITY_REBUILD) { zio_bad_cksum_t zbc; raidz_map_t *rm = zio->io_vsd; zbc.zbc_has_cksum = 0; zbc.zbc_injected = rm->rm_ecksuminjected; - int ret = zfs_ereport_post_checksum(zio->io_spa, vd, + (void) zfs_ereport_post_checksum(zio->io_spa, vd, &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size, rc->rc_abd, bad_data, &zbc); - if (ret != EALREADY) { - mutex_enter(&vd->vdev_stat_lock); - vd->vdev_stat.vs_checksum_errors++; - mutex_exit(&vd->vdev_stat_lock); - } + mutex_enter(&vd->vdev_stat_lock); + vd->vdev_stat.vs_checksum_errors++; + mutex_exit(&vd->vdev_stat_lock); } } /* * We keep track of whether or not there were any injected errors, so that * any ereports we generate can note it. */ static int raidz_checksum_verify(zio_t *zio) { zio_bad_cksum_t zbc; raidz_map_t *rm = zio->io_vsd; bzero(&zbc, sizeof (zio_bad_cksum_t)); int ret = zio_checksum_error(zio, &zbc); if (ret != 0 && zbc.zbc_injected != 0) rm->rm_ecksuminjected = 1; return (ret); } /* * Generate the parity from the data columns. If we tried and were able to * read the parity without error, verify that the generated parity matches the * data we read. If it doesn't, we fire off a checksum error. Return the * number of such failures. */ static int raidz_parity_verify(zio_t *zio, raidz_row_t *rr) { abd_t *orig[VDEV_RAIDZ_MAXPARITY]; int c, ret = 0; raidz_map_t *rm = zio->io_vsd; raidz_col_t *rc; blkptr_t *bp = zio->io_bp; enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum : (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp))); if (checksum == ZIO_CHECKSUM_NOPARITY) return (ret); for (c = 0; c < rr->rr_firstdatacol; c++) { rc = &rr->rr_col[c]; if (!rc->rc_tried || rc->rc_error != 0) continue; orig[c] = abd_alloc_sametype(rc->rc_abd, rc->rc_size); abd_copy(orig[c], rc->rc_abd, rc->rc_size); } /* * Regenerates parity even for !tried||rc_error!=0 columns. This * isn't harmful but it does have the side effect of fixing stuff * we didn't realize was necessary (i.e. even if we return 0). 
*/ vdev_raidz_generate_parity_row(rm, rr); for (c = 0; c < rr->rr_firstdatacol; c++) { rc = &rr->rr_col[c]; if (!rc->rc_tried || rc->rc_error != 0) continue; if (abd_cmp(orig[c], rc->rc_abd) != 0) { raidz_checksum_error(zio, rc, orig[c]); rc->rc_error = SET_ERROR(ECKSUM); ret++; } abd_free(orig[c]); } return (ret); } static int vdev_raidz_worst_error(raidz_row_t *rr) { int error = 0; for (int c = 0; c < rr->rr_cols; c++) error = zio_worst_error(error, rr->rr_col[c].rc_error); return (error); } static void vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr) { int unexpected_errors = 0; int parity_errors = 0; int parity_untried = 0; int data_errors = 0; ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); for (int c = 0; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; if (rc->rc_error) { if (c < rr->rr_firstdatacol) parity_errors++; else data_errors++; if (!rc->rc_skipped) unexpected_errors++; } else if (c < rr->rr_firstdatacol && !rc->rc_tried) { parity_untried++; } } /* * If we read more parity disks than were used for * reconstruction, confirm that the other parity disks produced * correct data. * * Note that we also regenerate parity when resilvering so we * can write it out to failed devices later. */ if (parity_errors + parity_untried < rr->rr_firstdatacol - data_errors || (zio->io_flags & ZIO_FLAG_RESILVER)) { int n = raidz_parity_verify(zio, rr); unexpected_errors += n; ASSERT3U(parity_errors + n, <=, rr->rr_firstdatacol); } if (zio->io_error == 0 && spa_writeable(zio->io_spa) && (unexpected_errors > 0 || (zio->io_flags & ZIO_FLAG_RESILVER))) { /* * Use the good data we have in hand to repair damaged children. */ for (int c = 0; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; vdev_t *vd = zio->io_vd; vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; if ((rc->rc_error == 0 || rc->rc_size == 0) && (rc->rc_repair == 0)) { continue; } zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, rc->rc_abd, rc->rc_size, ZIO_TYPE_WRITE, zio->io_priority == ZIO_PRIORITY_REBUILD ? ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_IO_REPAIR | (unexpected_errors ? 
ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); } } } static void raidz_restore_orig_data(raidz_map_t *rm) { for (int i = 0; i < rm->rm_nrows; i++) { raidz_row_t *rr = rm->rm_row[i]; for (int c = 0; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; if (rc->rc_need_orig_restore) { abd_copy_from_buf(rc->rc_abd, rc->rc_orig_data, rc->rc_size); rc->rc_need_orig_restore = B_FALSE; } } } } /* * returns EINVAL if reconstruction of the block will not be possible * returns ECKSUM if this specific reconstruction failed * returns 0 on successful reconstruction */ static int raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) { raidz_map_t *rm = zio->io_vsd; /* Reconstruct each row */ for (int r = 0; r < rm->rm_nrows; r++) { raidz_row_t *rr = rm->rm_row[r]; int my_tgts[VDEV_RAIDZ_MAXPARITY]; /* value is child id */ int t = 0; int dead = 0; int dead_data = 0; for (int c = 0; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; ASSERT0(rc->rc_need_orig_restore); if (rc->rc_error != 0) { dead++; if (c >= nparity) dead_data++; continue; } if (rc->rc_size == 0) continue; for (int lt = 0; lt < ntgts; lt++) { if (rc->rc_devidx == ltgts[lt]) { if (rc->rc_orig_data == NULL) { rc->rc_orig_data = zio_buf_alloc(rc->rc_size); abd_copy_to_buf( rc->rc_orig_data, rc->rc_abd, rc->rc_size); } rc->rc_need_orig_restore = B_TRUE; dead++; if (c >= nparity) dead_data++; my_tgts[t++] = c; break; } } } if (dead > nparity) { /* reconstruction not possible */ raidz_restore_orig_data(rm); return (EINVAL); } rr->rr_code = 0; if (dead_data > 0) rr->rr_code = vdev_raidz_reconstruct_row(rm, rr, my_tgts, t); } /* Check for success */ if (raidz_checksum_verify(zio) == 0) { /* Reconstruction succeeded - report errors */ for (int i = 0; i < rm->rm_nrows; i++) { raidz_row_t *rr = rm->rm_row[i]; for (int c = 0; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; if (rc->rc_need_orig_restore) { /* * Note: if this is a parity column, * we don't really know if it's wrong. * We need to let * vdev_raidz_io_done_verified() check * it, and if we set rc_error, it will * think that it is a "known" error * that doesn't need to be checked * or corrected. */ if (rc->rc_error == 0 && c >= rr->rr_firstdatacol) { raidz_checksum_error(zio, rc, rc->rc_gdata); rc->rc_error = SET_ERROR(ECKSUM); } rc->rc_need_orig_restore = B_FALSE; } } vdev_raidz_io_done_verified(zio, rr); } zio_checksum_verified(zio); return (0); } /* Reconstruction failed - restore original data */ raidz_restore_orig_data(rm); return (ECKSUM); } /* * Iterate over all combinations of N bad vdevs and attempt a reconstruction. * Note that the algorithm below is non-optimal because it doesn't take into * account how reconstruction is actually performed. For example, with * triple-parity RAID-Z the reconstruction procedure is the same if column 4 * is targeted as invalid as if columns 1 and 4 are targeted since in both * cases we'd only use parity information in column 0. 
* * The order that we find the various possible combinations of failed * disks is dictated by these rules: * - Examine each "slot" (the "i" in tgts[i]) * - Try to increment this slot (tgts[i] = tgts[i] + 1) * - if we can't increment because it runs into the next slot, * reset our slot to the minimum, and examine the next slot * * For example, with a 6-wide RAIDZ3, and no known errors (so we have to choose * 3 columns to reconstruct), we will generate the following sequence: * * STATE ACTION * 0 1 2 special case: skip since these are all parity * 0 1 3 first slot: reset to 0; middle slot: increment to 2 * 0 2 3 first slot: increment to 1 * 1 2 3 first: reset to 0; middle: reset to 1; last: increment to 4 * 0 1 4 first: reset to 0; middle: increment to 2 * 0 2 4 first: increment to 1 * 1 2 4 first: reset to 0; middle: increment to 3 * 0 3 4 first: increment to 1 * 1 3 4 first: increment to 2 * 2 3 4 first: reset to 0; middle: reset to 1; last: increment to 5 * 0 1 5 first: reset to 0; middle: increment to 2 * 0 2 5 first: increment to 1 * 1 2 5 first: reset to 0; middle: increment to 3 * 0 3 5 first: increment to 1 * 1 3 5 first: increment to 2 * 2 3 5 first: reset to 0; middle: increment to 4 * 0 4 5 first: increment to 1 * 1 4 5 first: increment to 2 * 2 4 5 first: increment to 3 * 3 4 5 done * * This strategy works for dRAID but is less effecient when there are a large * number of child vdevs and therefore permutations to check. Furthermore, * since the raidz_map_t rows likely do not overlap reconstruction would be * possible as long as there are no more than nparity data errors per row. * These additional permutations are not currently checked but could be as * a future improvement. */ static int vdev_raidz_combrec(zio_t *zio) { int nparity = vdev_get_nparity(zio->io_vd); raidz_map_t *rm = zio->io_vsd; /* Check if there's enough data to attempt reconstrution. */ for (int i = 0; i < rm->rm_nrows; i++) { raidz_row_t *rr = rm->rm_row[i]; int total_errors = 0; for (int c = 0; c < rr->rr_cols; c++) { if (rr->rr_col[c].rc_error) total_errors++; } if (total_errors > nparity) return (vdev_raidz_worst_error(rr)); } for (int num_failures = 1; num_failures <= nparity; num_failures++) { int tstore[VDEV_RAIDZ_MAXPARITY + 2]; int *ltgts = &tstore[1]; /* value is logical child ID */ /* Determine number of logical children, n */ int n = zio->io_vd->vdev_children; ASSERT3U(num_failures, <=, nparity); ASSERT3U(num_failures, <=, VDEV_RAIDZ_MAXPARITY); /* Handle corner cases in combrec logic */ ltgts[-1] = -1; for (int i = 0; i < num_failures; i++) { ltgts[i] = i; } ltgts[num_failures] = n; for (;;) { int err = raidz_reconstruct(zio, ltgts, num_failures, nparity); if (err == EINVAL) { /* * Reconstruction not possible with this # * failures; try more failures. */ break; } else if (err == 0) return (0); /* Compute next targets to try */ for (int t = 0; ; t++) { ASSERT3U(t, <, num_failures); ltgts[t]++; if (ltgts[t] == n) { /* try more failures */ ASSERT3U(t, ==, num_failures - 1); break; } ASSERT3U(ltgts[t], <, n); ASSERT3U(ltgts[t], <=, ltgts[t + 1]); /* * If that spot is available, we're done here. * Try the next combination. */ if (ltgts[t] != ltgts[t + 1]) break; /* * Otherwise, reset this tgt to the minimum, * and move on to the next tgt. */ ltgts[t] = ltgts[t - 1] + 1; ASSERT3U(ltgts[t], ==, t); } /* Increase the number of failures and keep trying. 
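 * (ltgts[num_failures] is a sentinel initialized to n, so once the
 * last real slot itself reaches n every combination for this failure
 * count has been exhausted and we fall out to try num_failures + 1.)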
*/ if (ltgts[num_failures - 1] == n) break; } } return (ECKSUM); } void vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt) { for (uint64_t row = 0; row < rm->rm_nrows; row++) { raidz_row_t *rr = rm->rm_row[row]; vdev_raidz_reconstruct_row(rm, rr, t, nt); } } /* * Complete a write IO operation on a RAIDZ VDev * * Outline: * 1. Check for errors on the child IOs. * 2. Return, setting an error code if too few child VDevs were written * to reconstruct the data later. Note that partial writes are * considered successful if they can be reconstructed at all. */ static void vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr) { int total_errors = 0; ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol); ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol); ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); for (int c = 0; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; if (rc->rc_error) { ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ total_errors++; } } /* * Treat partial writes as a success. If we couldn't write enough * columns to reconstruct the data, the I/O failed. Otherwise, * good enough. * * Now that we support write reallocation, it would be better * to treat partial failure as real failure unless there are * no non-degraded top-level vdevs left, and not update DTLs * if we intend to reallocate. */ if (total_errors > rr->rr_firstdatacol) { zio->io_error = zio_worst_error(zio->io_error, vdev_raidz_worst_error(rr)); } } /* * return 0 if no reconstruction occurred, otherwise the "code" from * vdev_raidz_reconstruct(). */ static int vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm, raidz_row_t *rr) { int parity_errors = 0; int parity_untried = 0; int data_errors = 0; int total_errors = 0; int code = 0; ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol); ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol); ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); for (int c = 0; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; if (rc->rc_error) { ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ if (c < rr->rr_firstdatacol) parity_errors++; else data_errors++; total_errors++; } else if (c < rr->rr_firstdatacol && !rc->rc_tried) { parity_untried++; } } /* * If there were data errors and the number of errors we saw was * correctable -- less than or equal to the number of parity disks read * -- reconstruct based on the missing data. */ if (data_errors != 0 && total_errors <= rr->rr_firstdatacol - parity_untried) { /* * We either attempt to read all the parity columns or * none of them. If we didn't try to read parity, we * wouldn't be here in the correctable case. There must * also have been fewer parity errors than parity * columns or, again, we wouldn't be in this code path. */ ASSERT(parity_untried == 0); ASSERT(parity_errors < rr->rr_firstdatacol); /* * Identify the data columns that reported an error. */ int n = 0; int tgts[VDEV_RAIDZ_MAXPARITY]; for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; if (rc->rc_error != 0) { ASSERT(n < VDEV_RAIDZ_MAXPARITY); tgts[n++] = c; } } ASSERT(rr->rr_firstdatacol >= n); code = vdev_raidz_reconstruct_row(rm, rr, tgts, n); } return (code); } /* * Return the number of reads issued. 
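 * Columns that were already tried, and zero-length columns, are
 * skipped, so a return value of zero tells the caller that there is
 * nothing left to read and it can proceed directly to combinatorial
 * reconstruction.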
*/ static int vdev_raidz_read_all(zio_t *zio, raidz_row_t *rr) { vdev_t *vd = zio->io_vd; int nread = 0; rr->rr_missingdata = 0; rr->rr_missingparity = 0; /* * If this rows contains empty sectors which are not required * for a normal read then allocate an ABD for them now so they * may be read, verified, and any needed repairs performed. */ if (rr->rr_nempty && rr->rr_abd_empty == NULL) vdev_draid_map_alloc_empty(zio, rr); for (int c = 0; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; if (rc->rc_tried || rc->rc_size == 0) continue; zio_nowait(zio_vdev_child_io(zio, NULL, vd->vdev_child[rc->rc_devidx], rc->rc_offset, rc->rc_abd, rc->rc_size, zio->io_type, zio->io_priority, 0, vdev_raidz_child_done, rc)); nread++; } return (nread); } /* * We're here because either there were too many errors to even attempt * reconstruction (total_errors == rm_first_datacol), or vdev_*_combrec() * failed. In either case, there is enough bad data to prevent reconstruction. * Start checksum ereports for all children which haven't failed. */ static void vdev_raidz_io_done_unrecoverable(zio_t *zio) { raidz_map_t *rm = zio->io_vsd; for (int i = 0; i < rm->rm_nrows; i++) { raidz_row_t *rr = rm->rm_row[i]; for (int c = 0; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx]; if (rc->rc_error != 0) continue; zio_bad_cksum_t zbc; zbc.zbc_has_cksum = 0; zbc.zbc_injected = rm->rm_ecksuminjected; - int ret = zfs_ereport_start_checksum(zio->io_spa, + (void) zfs_ereport_start_checksum(zio->io_spa, cvd, &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size, (void *)(uintptr_t)c, &zbc); - if (ret != EALREADY) { - mutex_enter(&cvd->vdev_stat_lock); - cvd->vdev_stat.vs_checksum_errors++; - mutex_exit(&cvd->vdev_stat_lock); - } + mutex_enter(&cvd->vdev_stat_lock); + cvd->vdev_stat.vs_checksum_errors++; + mutex_exit(&cvd->vdev_stat_lock); } } } void vdev_raidz_io_done(zio_t *zio) { raidz_map_t *rm = zio->io_vsd; if (zio->io_type == ZIO_TYPE_WRITE) { for (int i = 0; i < rm->rm_nrows; i++) { vdev_raidz_io_done_write_impl(zio, rm->rm_row[i]); } } else { for (int i = 0; i < rm->rm_nrows; i++) { raidz_row_t *rr = rm->rm_row[i]; rr->rr_code = vdev_raidz_io_done_reconstruct_known_missing(zio, rm, rr); } if (raidz_checksum_verify(zio) == 0) { for (int i = 0; i < rm->rm_nrows; i++) { raidz_row_t *rr = rm->rm_row[i]; vdev_raidz_io_done_verified(zio, rr); } zio_checksum_verified(zio); } else { /* * A sequential resilver has no checksum which makes * combinatoral reconstruction impossible. This code * path is unreachable since raidz_checksum_verify() * has no checksum to verify and must succeed. */ ASSERT3U(zio->io_priority, !=, ZIO_PRIORITY_REBUILD); /* * This isn't a typical situation -- either we got a * read error or a child silently returned bad data. * Read every block so we can try again with as much * data and parity as we can track down. If we've * already been through once before, all children will * be marked as tried so we'll proceed to combinatorial * reconstruction. */ int nread = 0; for (int i = 0; i < rm->rm_nrows; i++) { nread += vdev_raidz_read_all(zio, rm->rm_row[i]); } if (nread != 0) { /* * Normally our stage is VDEV_IO_DONE, but if * we've already called redone(), it will have * changed to VDEV_IO_START, in which case we * don't want to call redone() again. 
*/ if (zio->io_stage != ZIO_STAGE_VDEV_IO_START) zio_vdev_io_redone(zio); return; } zio->io_error = vdev_raidz_combrec(zio); if (zio->io_error == ECKSUM && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { vdev_raidz_io_done_unrecoverable(zio); } } } } static void vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) { vdev_raidz_t *vdrz = vd->vdev_tsd; if (faulted > vdrz->vd_nparity) vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_NO_REPLICAS); else if (degraded + faulted != 0) vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); else vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); } /* * Determine if any portion of the provided block resides on a child vdev * with a dirty DTL and therefore needs to be resilvered. The function * assumes that at least one DTL is dirty which implies that full stripe * width blocks must be resilvered. */ static boolean_t vdev_raidz_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, uint64_t phys_birth) { vdev_raidz_t *vdrz = vd->vdev_tsd; uint64_t dcols = vd->vdev_children; uint64_t nparity = vdrz->vd_nparity; uint64_t ashift = vd->vdev_top->vdev_ashift; /* The starting RAIDZ (parent) vdev sector of the block. */ uint64_t b = DVA_GET_OFFSET(dva) >> ashift; /* The zio's size in units of the vdev's minimum sector size. */ uint64_t s = ((psize - 1) >> ashift) + 1; /* The first column for this stripe. */ uint64_t f = b % dcols; /* Unreachable by sequential resilver. */ ASSERT3U(phys_birth, !=, TXG_UNKNOWN); if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1)) return (B_FALSE); if (s + nparity >= dcols) return (B_TRUE); for (uint64_t c = 0; c < s + nparity; c++) { uint64_t devidx = (f + c) % dcols; vdev_t *cvd = vd->vdev_child[devidx]; /* * dsl_scan_need_resilver() already checked vd with * vdev_dtl_contains(). So here just check cvd with * vdev_dtl_empty(), cheaper and a good approximation. */ if (!vdev_dtl_empty(cvd, DTL_PARTIAL)) return (B_TRUE); } return (B_FALSE); } static void vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *logical_rs, range_seg64_t *physical_rs, range_seg64_t *remain_rs) { vdev_t *raidvd = cvd->vdev_parent; ASSERT(raidvd->vdev_ops == &vdev_raidz_ops); uint64_t width = raidvd->vdev_children; uint64_t tgt_col = cvd->vdev_id; uint64_t ashift = raidvd->vdev_top->vdev_ashift; /* make sure the offsets are block-aligned */ ASSERT0(logical_rs->rs_start % (1 << ashift)); ASSERT0(logical_rs->rs_end % (1 << ashift)); uint64_t b_start = logical_rs->rs_start >> ashift; uint64_t b_end = logical_rs->rs_end >> ashift; uint64_t start_row = 0; if (b_start > tgt_col) /* avoid underflow */ start_row = ((b_start - tgt_col - 1) / width) + 1; uint64_t end_row = 0; if (b_end > tgt_col) end_row = ((b_end - tgt_col - 1) / width) + 1; physical_rs->rs_start = start_row << ashift; physical_rs->rs_end = end_row << ashift; ASSERT3U(physical_rs->rs_start, <=, logical_rs->rs_start); ASSERT3U(physical_rs->rs_end - physical_rs->rs_start, <=, logical_rs->rs_end - logical_rs->rs_start); } /* * Initialize private RAIDZ specific fields from the nvlist. 
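 * As a rough illustration (not part of this change), the portion of
 * the vdev config consumed here for a raidz2 top-level vdev looks
 * something like:
 *
 *	type='raidz'
 *	nparity=2
 *	children[0] ... children[N-1]   (the leaf vdev configs)
 *
 * Only ZPOOL_CONFIG_NPARITY and the number of ZPOOL_CONFIG_CHILDREN
 * entries are read below; everything else is handled by the generic
 * vdev code.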
*/ static int vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd) { vdev_raidz_t *vdrz; uint64_t nparity; uint_t children; nvlist_t **child; int error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children); if (error != 0) return (SET_ERROR(EINVAL)); if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) { if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY) return (SET_ERROR(EINVAL)); /* * Previous versions could only support 1 or 2 parity * device. */ if (nparity > 1 && spa_version(spa) < SPA_VERSION_RAIDZ2) return (SET_ERROR(EINVAL)); else if (nparity > 2 && spa_version(spa) < SPA_VERSION_RAIDZ3) return (SET_ERROR(EINVAL)); } else { /* * We require the parity to be specified for SPAs that * support multiple parity levels. */ if (spa_version(spa) >= SPA_VERSION_RAIDZ2) return (SET_ERROR(EINVAL)); /* * Otherwise, we default to 1 parity device for RAID-Z. */ nparity = 1; } vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP); vdrz->vd_logical_width = children; vdrz->vd_nparity = nparity; *tsd = vdrz; return (0); } static void vdev_raidz_fini(vdev_t *vd) { kmem_free(vd->vdev_tsd, sizeof (vdev_raidz_t)); } /* * Add RAIDZ specific fields to the config nvlist. */ static void vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv) { ASSERT3P(vd->vdev_ops, ==, &vdev_raidz_ops); vdev_raidz_t *vdrz = vd->vdev_tsd; /* * Make sure someone hasn't managed to sneak a fancy new vdev * into a crufty old storage pool. */ ASSERT(vdrz->vd_nparity == 1 || (vdrz->vd_nparity <= 2 && spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ2) || (vdrz->vd_nparity <= 3 && spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ3)); /* * Note that we'll add these even on storage pools where they * aren't strictly required -- older software will just ignore * it. */ fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdrz->vd_nparity); } static uint64_t vdev_raidz_nparity(vdev_t *vd) { vdev_raidz_t *vdrz = vd->vdev_tsd; return (vdrz->vd_nparity); } static uint64_t vdev_raidz_ndisks(vdev_t *vd) { return (vd->vdev_children); } vdev_ops_t vdev_raidz_ops = { .vdev_op_init = vdev_raidz_init, .vdev_op_fini = vdev_raidz_fini, .vdev_op_open = vdev_raidz_open, .vdev_op_close = vdev_raidz_close, .vdev_op_asize = vdev_raidz_asize, .vdev_op_min_asize = vdev_raidz_min_asize, .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_raidz_io_start, .vdev_op_io_done = vdev_raidz_io_done, .vdev_op_state_change = vdev_raidz_state_change, .vdev_op_need_resilver = vdev_raidz_need_resilver, .vdev_op_hold = NULL, .vdev_op_rele = NULL, .vdev_op_remap = NULL, .vdev_op_xlate = vdev_raidz_xlate, .vdev_op_rebuild_asize = NULL, .vdev_op_metaslab_init = NULL, .vdev_op_config_generate = vdev_raidz_config_generate, .vdev_op_nparity = vdev_raidz_nparity, .vdev_op_ndisks = vdev_raidz_ndisks, .vdev_op_type = VDEV_TYPE_RAIDZ, /* name of this vdev type */ .vdev_op_leaf = B_FALSE /* not a leaf vdev */ }; diff --git a/sys/contrib/openzfs/module/zfs/vdev_rebuild.c b/sys/contrib/openzfs/module/zfs/vdev_rebuild.c index 112c04d0ba04..a77ff99faa92 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_rebuild.c +++ b/sys/contrib/openzfs/module/zfs/vdev_rebuild.c @@ -1,1147 +1,1150 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. 
* See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * * Copyright (c) 2018, Intel Corporation. * Copyright (c) 2020 by Lawrence Livermore National Security, LLC. */ #include #include #include #include #include #include #include #include #include #include /* * This file contains the sequential reconstruction implementation for * resilvering. This form of resilvering is internally referred to as device * rebuild to avoid conflating it with the traditional healing reconstruction * performed by the dsl scan code. * * When replacing a device, or scrubbing the pool, ZFS has historically used * a process called resilvering which is a form of healing reconstruction. * This approach has the advantage that as blocks are read from disk their * checksums can be immediately verified and the data repaired. Unfortunately, * it also results in a random IO pattern to the disk even when extra care * is taken to sequentialize the IO as much as possible. This substantially * increases the time required to resilver the pool and restore redundancy. * * For mirrored devices it's possible to implement an alternate sequential * reconstruction strategy when resilvering. Sequential reconstruction * behaves like a traditional RAID rebuild and reconstructs a device in LBA * order without verifying the checksum. After this phase completes a second * scrub phase is started to verify all of the checksums. This two phase * process will take longer than the healing reconstruction described above. * However, it has that advantage that after the reconstruction first phase * completes redundancy has been restored. At this point the pool can incur * another device failure without risking data loss. * * There are a few noteworthy limitations and other advantages of resilvering * using sequential reconstruction vs healing reconstruction. * * Limitations: * * - Sequential reconstruction is not possible on RAIDZ due to its * variable stripe width. Note dRAID uses a fixed stripe width which * avoids this issue, but comes at the expense of some usable capacity. * * - Block checksums are not verified during sequential reconstruction. * Similar to traditional RAID the parity/mirror data is reconstructed * but cannot be immediately double checked. For this reason when the * last active resilver completes the pool is automatically scrubbed * by default. * * - Deferred resilvers using sequential reconstruction are not currently * supported. When adding another vdev to an active top-level resilver * it must be restarted. * * Advantages: * * - Sequential reconstruction is performed in LBA order which may be faster * than healing reconstruction particularly when using using HDDs (or * especially with SMR devices). Only allocated capacity is resilvered. * * - Sequential reconstruction is not constrained by ZFS block boundaries. * This allows it to issue larger IOs to disk which span multiple blocks * allowing all of these logical blocks to be repaired with a single IO. * * - Unlike a healing resilver or scrub which are pool wide operations, * sequential reconstruction is handled by the top-level vdevs. 
This * allows for it to be started or canceled on a top-level vdev without * impacting any other top-level vdevs in the pool. * * - Data only referenced by a pool checkpoint will be repaired because * that space is reflected in the space maps. This differs for a * healing resilver or scrub which will not repair that data. */ /* * Size of rebuild reads; defaults to 1MiB per data disk and is capped at * SPA_MAXBLOCKSIZE. */ unsigned long zfs_rebuild_max_segment = 1024 * 1024; /* * Maximum number of parallelly executed bytes per leaf vdev caused by a * sequential resilver. We attempt to strike a balance here between keeping * the vdev queues full of I/Os at all times and not overflowing the queues * to cause long latency, which would cause long txg sync times. * * A large default value can be safely used here because the default target * segment size is also large (zfs_rebuild_max_segment=1M). This helps keep * the queue depth short. * * 32MB was selected as the default value to achieve good performance with * a large 90-drive dRAID HDD configuration (draid2:8d:90c:2s). A sequential * rebuild was unable to saturate all of the drives using smaller values. * With a value of 32MB the sequential resilver write rate was measured at * 800MB/s sustained while rebuilding to a distributed spare. */ unsigned long zfs_rebuild_vdev_limit = 32 << 20; /* * Automatically start a pool scrub when the last active sequential resilver * completes in order to verify the checksums of all blocks which have been * resilvered. This option is enabled by default and is strongly recommended. */ int zfs_rebuild_scrub_enabled = 1; /* * For vdev_rebuild_initiate_sync() and vdev_rebuild_reset_sync(). */ static void vdev_rebuild_thread(void *arg); /* * Clear the per-vdev rebuild bytes value for a vdev tree. */ static void clear_rebuild_bytes(vdev_t *vd) { vdev_stat_t *vs = &vd->vdev_stat; for (uint64_t i = 0; i < vd->vdev_children; i++) clear_rebuild_bytes(vd->vdev_child[i]); mutex_enter(&vd->vdev_stat_lock); vs->vs_rebuild_processed = 0; mutex_exit(&vd->vdev_stat_lock); } /* * Determines whether a vdev_rebuild_thread() should be stopped. */ static boolean_t vdev_rebuild_should_stop(vdev_t *vd) { return (!vdev_writeable(vd) || vd->vdev_removing || vd->vdev_rebuild_exit_wanted || vd->vdev_rebuild_cancel_wanted || vd->vdev_rebuild_reset_wanted); } /* * Determine if the rebuild should be canceled. This may happen when all * vdevs with MISSING DTLs are detached. */ static boolean_t vdev_rebuild_should_cancel(vdev_t *vd) { vdev_rebuild_t *vr = &vd->vdev_rebuild_config; vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; if (!vdev_resilver_needed(vd, &vrp->vrp_min_txg, &vrp->vrp_max_txg)) return (B_TRUE); return (B_FALSE); } /* * The sync task for updating the on-disk state of a rebuild. This is * scheduled by vdev_rebuild_range(). 
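 * The vr_scan_offset[] array is indexed by txg & TXG_MASK, so each
 * open txg tracks its own most recently issued offset; the entry is
 * consumed and cleared here once that txg syncs.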
*/ static void vdev_rebuild_update_sync(void *arg, dmu_tx_t *tx) { int vdev_id = (uintptr_t)arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; vdev_t *vd = vdev_lookup_top(spa, vdev_id); vdev_rebuild_t *vr = &vd->vdev_rebuild_config; vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; uint64_t txg = dmu_tx_get_txg(tx); mutex_enter(&vd->vdev_rebuild_lock); if (vr->vr_scan_offset[txg & TXG_MASK] > 0) { vrp->vrp_last_offset = vr->vr_scan_offset[txg & TXG_MASK]; vr->vr_scan_offset[txg & TXG_MASK] = 0; } vrp->vrp_scan_time_ms = vr->vr_prev_scan_time_ms + NSEC2MSEC(gethrtime() - vr->vr_pass_start_time); VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t), REBUILD_PHYS_ENTRIES, vrp, tx)); mutex_exit(&vd->vdev_rebuild_lock); } /* * Initialize the on-disk state for a new rebuild, start the rebuild thread. */ static void vdev_rebuild_initiate_sync(void *arg, dmu_tx_t *tx) { int vdev_id = (uintptr_t)arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; vdev_t *vd = vdev_lookup_top(spa, vdev_id); vdev_rebuild_t *vr = &vd->vdev_rebuild_config; vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; ASSERT(vd->vdev_rebuilding); spa_feature_incr(vd->vdev_spa, SPA_FEATURE_DEVICE_REBUILD, tx); mutex_enter(&vd->vdev_rebuild_lock); bzero(vrp, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES); vrp->vrp_rebuild_state = VDEV_REBUILD_ACTIVE; vrp->vrp_min_txg = 0; vrp->vrp_max_txg = dmu_tx_get_txg(tx); vrp->vrp_start_time = gethrestime_sec(); vrp->vrp_scan_time_ms = 0; vr->vr_prev_scan_time_ms = 0; /* * Rebuilds are currently only used when replacing a device, in which * case there must be DTL_MISSING entries. In the future, we could * allow rebuilds to be used in a way similar to a scrub. This would * be useful because it would allow us to rebuild the space used by * pool checkpoints. */ VERIFY(vdev_resilver_needed(vd, &vrp->vrp_min_txg, &vrp->vrp_max_txg)); VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t), REBUILD_PHYS_ENTRIES, vrp, tx)); spa_history_log_internal(spa, "rebuild", tx, "vdev_id=%llu vdev_guid=%llu started", (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid); ASSERT3P(vd->vdev_rebuild_thread, ==, NULL); vd->vdev_rebuild_thread = thread_create(NULL, 0, vdev_rebuild_thread, vd, 0, &p0, TS_RUN, maxclsyspri); mutex_exit(&vd->vdev_rebuild_lock); } static void vdev_rebuild_log_notify(spa_t *spa, vdev_t *vd, char *name) { nvlist_t *aux = fnvlist_alloc(); fnvlist_add_string(aux, ZFS_EV_RESILVER_TYPE, "sequential"); spa_event_notify(spa, vd, aux, name); nvlist_free(aux); } /* * Called to request that a new rebuild be started. The feature will remain * active for the duration of the rebuild, then revert to the enabled state. */ static void vdev_rebuild_initiate(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(vd->vdev_top == vd); ASSERT(MUTEX_HELD(&vd->vdev_rebuild_lock)); ASSERT(!vd->vdev_rebuilding); dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); vd->vdev_rebuilding = B_TRUE; dsl_sync_task_nowait(spa_get_dsl(spa), vdev_rebuild_initiate_sync, (void *)(uintptr_t)vd->vdev_id, tx); dmu_tx_commit(tx); vdev_rebuild_log_notify(spa, vd, ESC_ZFS_RESILVER_START); } /* * Update the on-disk state to completed when a rebuild finishes. 
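 * Besides recording the COMPLETE state, this kicks off the follow-up
 * scrub once no other rebuild remains active (and
 * zfs_rebuild_scrub_enabled is set) and, with this change, clears the
 * recently-seen ereport list via zfs_ereport_clear() so duplicate-event
 * tracking starts fresh after the rebuild.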
*/ static void vdev_rebuild_complete_sync(void *arg, dmu_tx_t *tx) { int vdev_id = (uintptr_t)arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; vdev_t *vd = vdev_lookup_top(spa, vdev_id); vdev_rebuild_t *vr = &vd->vdev_rebuild_config; vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; mutex_enter(&vd->vdev_rebuild_lock); vrp->vrp_rebuild_state = VDEV_REBUILD_COMPLETE; vrp->vrp_end_time = gethrestime_sec(); VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t), REBUILD_PHYS_ENTRIES, vrp, tx)); vdev_dtl_reassess(vd, tx->tx_txg, vrp->vrp_max_txg, B_TRUE, B_TRUE); spa_feature_decr(vd->vdev_spa, SPA_FEATURE_DEVICE_REBUILD, tx); spa_history_log_internal(spa, "rebuild", tx, "vdev_id=%llu vdev_guid=%llu complete", (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid); vdev_rebuild_log_notify(spa, vd, ESC_ZFS_RESILVER_FINISH); /* Handles detaching of spares */ spa_async_request(spa, SPA_ASYNC_REBUILD_DONE); vd->vdev_rebuilding = B_FALSE; mutex_exit(&vd->vdev_rebuild_lock); /* * While we're in syncing context take the opportunity to * setup the scrub when there are no more active rebuilds. */ if (!vdev_rebuild_active(spa->spa_root_vdev) && zfs_rebuild_scrub_enabled) { pool_scan_func_t func = POOL_SCAN_SCRUB; dsl_scan_setup_sync(&func, tx); } cv_broadcast(&vd->vdev_rebuild_cv); + + /* Clear recent error events (i.e. duplicate events tracking) */ + zfs_ereport_clear(spa, NULL); } /* * Update the on-disk state to canceled when a rebuild finishes. */ static void vdev_rebuild_cancel_sync(void *arg, dmu_tx_t *tx) { int vdev_id = (uintptr_t)arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; vdev_t *vd = vdev_lookup_top(spa, vdev_id); vdev_rebuild_t *vr = &vd->vdev_rebuild_config; vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; mutex_enter(&vd->vdev_rebuild_lock); vrp->vrp_rebuild_state = VDEV_REBUILD_CANCELED; vrp->vrp_end_time = gethrestime_sec(); VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t), REBUILD_PHYS_ENTRIES, vrp, tx)); spa_feature_decr(vd->vdev_spa, SPA_FEATURE_DEVICE_REBUILD, tx); spa_history_log_internal(spa, "rebuild", tx, "vdev_id=%llu vdev_guid=%llu canceled", (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid); vdev_rebuild_log_notify(spa, vd, ESC_ZFS_RESILVER_FINISH); vd->vdev_rebuild_cancel_wanted = B_FALSE; vd->vdev_rebuilding = B_FALSE; mutex_exit(&vd->vdev_rebuild_lock); spa_notify_waiters(spa); cv_broadcast(&vd->vdev_rebuild_cv); } /* * Resets the progress of a running rebuild. This will occur when a new * vdev is added to rebuild. 
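 * Only the progress counters and offsets are cleared; the rebuild
 * stays in the ACTIVE state and its thread is simply restarted from
 * this same syncing context.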
*/ static void vdev_rebuild_reset_sync(void *arg, dmu_tx_t *tx) { int vdev_id = (uintptr_t)arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; vdev_t *vd = vdev_lookup_top(spa, vdev_id); vdev_rebuild_t *vr = &vd->vdev_rebuild_config; vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; mutex_enter(&vd->vdev_rebuild_lock); ASSERT(vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE); ASSERT3P(vd->vdev_rebuild_thread, ==, NULL); vrp->vrp_last_offset = 0; vrp->vrp_min_txg = 0; vrp->vrp_max_txg = dmu_tx_get_txg(tx); vrp->vrp_bytes_scanned = 0; vrp->vrp_bytes_issued = 0; vrp->vrp_bytes_rebuilt = 0; vrp->vrp_bytes_est = 0; vrp->vrp_scan_time_ms = 0; vr->vr_prev_scan_time_ms = 0; /* See vdev_rebuild_initiate_sync comment */ VERIFY(vdev_resilver_needed(vd, &vrp->vrp_min_txg, &vrp->vrp_max_txg)); VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t), REBUILD_PHYS_ENTRIES, vrp, tx)); spa_history_log_internal(spa, "rebuild", tx, "vdev_id=%llu vdev_guid=%llu reset", (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid); vd->vdev_rebuild_reset_wanted = B_FALSE; ASSERT(vd->vdev_rebuilding); vd->vdev_rebuild_thread = thread_create(NULL, 0, vdev_rebuild_thread, vd, 0, &p0, TS_RUN, maxclsyspri); mutex_exit(&vd->vdev_rebuild_lock); } /* * Clear the last rebuild status. */ void vdev_rebuild_clear_sync(void *arg, dmu_tx_t *tx) { int vdev_id = (uintptr_t)arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; vdev_t *vd = vdev_lookup_top(spa, vdev_id); vdev_rebuild_t *vr = &vd->vdev_rebuild_config; vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; objset_t *mos = spa_meta_objset(spa); mutex_enter(&vd->vdev_rebuild_lock); if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD) || vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE) { mutex_exit(&vd->vdev_rebuild_lock); return; } clear_rebuild_bytes(vd); bzero(vrp, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES); if (vd->vdev_top_zap != 0 && zap_contains(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_VDEV_REBUILD_PHYS) == 0) { VERIFY0(zap_update(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t), REBUILD_PHYS_ENTRIES, vrp, tx)); } mutex_exit(&vd->vdev_rebuild_lock); } /* * The zio_done_func_t callback for each rebuild I/O issued. It's responsible * for updating the rebuild stats and limiting the number of in flight I/Os. */ static void vdev_rebuild_cb(zio_t *zio) { vdev_rebuild_t *vr = zio->io_private; vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; vdev_t *vd = vr->vr_top_vdev; mutex_enter(&vr->vr_io_lock); if (zio->io_error == ENXIO && !vdev_writeable(vd)) { /* * The I/O failed because the top-level vdev was unavailable. * Attempt to roll back to the last completed offset, in order * resume from the correct location if the pool is resumed. * (This works because spa_sync waits on spa_txg_zio before * it runs sync tasks.) */ uint64_t *off = &vr->vr_scan_offset[zio->io_txg & TXG_MASK]; *off = MIN(*off, zio->io_offset); } else if (zio->io_error) { vrp->vrp_errors++; } abd_free(zio->io_abd); ASSERT3U(vr->vr_bytes_inflight, >, 0); vr->vr_bytes_inflight -= zio->io_size; cv_broadcast(&vr->vr_io_cv); mutex_exit(&vr->vr_io_lock); spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd); } /* * Initialize a block pointer that can be used to read the given segment * for sequential rebuild. 
*/ static void vdev_rebuild_blkptr_init(blkptr_t *bp, vdev_t *vd, uint64_t start, uint64_t asize) { ASSERT(vd->vdev_ops == &vdev_draid_ops || vd->vdev_ops == &vdev_mirror_ops || vd->vdev_ops == &vdev_replacing_ops || vd->vdev_ops == &vdev_spare_ops); uint64_t psize = vd->vdev_ops == &vdev_draid_ops ? vdev_draid_asize_to_psize(vd, asize) : asize; BP_ZERO(bp); DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id); DVA_SET_OFFSET(&bp->blk_dva[0], start); DVA_SET_GANG(&bp->blk_dva[0], 0); DVA_SET_ASIZE(&bp->blk_dva[0], asize); BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL); BP_SET_LSIZE(bp, psize); BP_SET_PSIZE(bp, psize); BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF); BP_SET_TYPE(bp, DMU_OT_NONE); BP_SET_LEVEL(bp, 0); BP_SET_DEDUP(bp, 0); BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); } /* * Issues a rebuild I/O and takes care of rate limiting the number of queued * rebuild I/Os. The provided start and size must be properly aligned for the * top-level vdev type being rebuilt. */ static int vdev_rebuild_range(vdev_rebuild_t *vr, uint64_t start, uint64_t size) { uint64_t ms_id __maybe_unused = vr->vr_scan_msp->ms_id; vdev_t *vd = vr->vr_top_vdev; spa_t *spa = vd->vdev_spa; blkptr_t blk; ASSERT3U(ms_id, ==, start >> vd->vdev_ms_shift); ASSERT3U(ms_id, ==, (start + size - 1) >> vd->vdev_ms_shift); vr->vr_pass_bytes_scanned += size; vr->vr_rebuild_phys.vrp_bytes_scanned += size; /* * Rebuild the data in this range by constructing a special block * pointer. It has no relation to any existing blocks in the pool. * However, by disabling checksum verification and issuing a scrub IO * we can reconstruct and repair any children with missing data. */ vdev_rebuild_blkptr_init(&blk, vd, start, size); uint64_t psize = BP_GET_PSIZE(&blk); if (!vdev_dtl_need_resilver(vd, &blk.blk_dva[0], psize, TXG_UNKNOWN)) return (0); mutex_enter(&vr->vr_io_lock); /* Limit in flight rebuild I/Os */ while (vr->vr_bytes_inflight >= vr->vr_bytes_inflight_max) cv_wait(&vr->vr_io_cv, &vr->vr_io_lock); vr->vr_bytes_inflight += psize; mutex_exit(&vr->vr_io_lock); dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); uint64_t txg = dmu_tx_get_txg(tx); spa_config_enter(spa, SCL_STATE_ALL, vd, RW_READER); mutex_enter(&vd->vdev_rebuild_lock); /* This is the first I/O for this txg. */ if (vr->vr_scan_offset[txg & TXG_MASK] == 0) { vr->vr_scan_offset[txg & TXG_MASK] = start; dsl_sync_task_nowait(spa_get_dsl(spa), vdev_rebuild_update_sync, (void *)(uintptr_t)vd->vdev_id, tx); } /* When exiting write out our progress. */ if (vdev_rebuild_should_stop(vd)) { mutex_enter(&vr->vr_io_lock); vr->vr_bytes_inflight -= psize; mutex_exit(&vr->vr_io_lock); spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd); mutex_exit(&vd->vdev_rebuild_lock); dmu_tx_commit(tx); return (SET_ERROR(EINTR)); } mutex_exit(&vd->vdev_rebuild_lock); dmu_tx_commit(tx); vr->vr_scan_offset[txg & TXG_MASK] = start + size; vr->vr_pass_bytes_issued += size; vr->vr_rebuild_phys.vrp_bytes_issued += size; zio_nowait(zio_read(spa->spa_txg_zio[txg & TXG_MASK], spa, &blk, abd_alloc(psize, B_FALSE), psize, vdev_rebuild_cb, vr, ZIO_PRIORITY_REBUILD, ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL | ZIO_FLAG_RESILVER, NULL)); return (0); } /* * Issues rebuild I/Os for all ranges in the provided vr->vr_tree range tree. 
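Aside: the in-flight limiting in vdev_rebuild_range() above is a generic back-pressure pattern — a byte counter protected by a mutex and a condition variable, where issuers sleep while the counter is at the cap and the I/O completion callback releases bytes and wakes them. A minimal user-space sketch of the same idea (inflight_t, inflight_issue and inflight_done are hypothetical names, not OpenZFS interfaces):

#include <pthread.h>
#include <stdint.h>

/*
 * Hypothetical stand-in for vr_io_lock / vr_io_cv / vr_bytes_inflight.
 * Fields must be set up with pthread_mutex_init()/pthread_cond_init()
 * before use.
 */
typedef struct inflight {
	pthread_mutex_t	lock;
	pthread_cond_t	cv;
	uint64_t	bytes;		/* bytes currently in flight */
	uint64_t	max_bytes;	/* cap, cf. vr_bytes_inflight_max */
} inflight_t;

/* Issuer side: wait until we are under the cap, then account for the I/O. */
static void
inflight_issue(inflight_t *f, uint64_t size)
{
	pthread_mutex_lock(&f->lock);
	while (f->bytes >= f->max_bytes)
		pthread_cond_wait(&f->cv, &f->lock);
	f->bytes += size;
	pthread_mutex_unlock(&f->lock);
}

/* Completion side: release the bytes and wake any waiting issuer. */
static void
inflight_done(inflight_t *f, uint64_t size)
{
	pthread_mutex_lock(&f->lock);
	f->bytes -= size;
	pthread_cond_broadcast(&f->cv);
	pthread_mutex_unlock(&f->lock);
}

As in the kernel code, a single oversized I/O may briefly exceed the cap; the check is made before the addition, not after.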
*/ static int vdev_rebuild_ranges(vdev_rebuild_t *vr) { vdev_t *vd = vr->vr_top_vdev; zfs_btree_t *t = &vr->vr_scan_tree->rt_root; zfs_btree_index_t idx; int error; for (range_seg_t *rs = zfs_btree_first(t, &idx); rs != NULL; rs = zfs_btree_next(t, &idx, &idx)) { uint64_t start = rs_get_start(rs, vr->vr_scan_tree); uint64_t size = rs_get_end(rs, vr->vr_scan_tree) - start; /* * zfs_scan_suspend_progress can be set to disable rebuild * progress for testing. See comment in dsl_scan_sync(). */ while (zfs_scan_suspend_progress && !vdev_rebuild_should_stop(vd)) { delay(hz); } while (size > 0) { uint64_t chunk_size; /* * Split range into legally-sized logical chunks * given the constraints of the top-level vdev * being rebuilt (dRAID or mirror). */ ASSERT3P(vd->vdev_ops, !=, NULL); chunk_size = vd->vdev_ops->vdev_op_rebuild_asize(vd, start, size, zfs_rebuild_max_segment); error = vdev_rebuild_range(vr, start, chunk_size); if (error != 0) return (error); size -= chunk_size; start += chunk_size; } } return (0); } /* * Calculates the estimated capacity which remains to be scanned. Since * we traverse the pool in metaslab order only allocated capacity beyond * the vrp_last_offset need be considered. All lower offsets must have * already been rebuilt and are thus already included in vrp_bytes_scanned. */ static void vdev_rebuild_update_bytes_est(vdev_t *vd, uint64_t ms_id) { vdev_rebuild_t *vr = &vd->vdev_rebuild_config; vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; uint64_t bytes_est = vrp->vrp_bytes_scanned; if (vrp->vrp_last_offset < vd->vdev_ms[ms_id]->ms_start) return; for (uint64_t i = ms_id; i < vd->vdev_ms_count; i++) { metaslab_t *msp = vd->vdev_ms[i]; mutex_enter(&msp->ms_lock); bytes_est += metaslab_allocated_space(msp); mutex_exit(&msp->ms_lock); } vrp->vrp_bytes_est = bytes_est; } /* * Load from disk the top-level vdev's rebuild information. */ int vdev_rebuild_load(vdev_t *vd) { vdev_rebuild_t *vr = &vd->vdev_rebuild_config; vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; spa_t *spa = vd->vdev_spa; int err = 0; mutex_enter(&vd->vdev_rebuild_lock); vd->vdev_rebuilding = B_FALSE; if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD)) { bzero(vrp, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES); mutex_exit(&vd->vdev_rebuild_lock); return (SET_ERROR(ENOTSUP)); } ASSERT(vd->vdev_top == vd); err = zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t), REBUILD_PHYS_ENTRIES, vrp); /* * A missing or damaged VDEV_TOP_ZAP_VDEV_REBUILD_PHYS should * not prevent a pool from being imported. Clear the rebuild * status allowing a new resilver/rebuild to be started. */ if (err == ENOENT || err == EOVERFLOW || err == ECKSUM) { bzero(vrp, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES); } else if (err) { mutex_exit(&vd->vdev_rebuild_lock); return (err); } vr->vr_prev_scan_time_ms = vrp->vrp_scan_time_ms; vr->vr_top_vdev = vd; mutex_exit(&vd->vdev_rebuild_lock); return (0); } /* * Each scan thread is responsible for rebuilding a top-level vdev. The * rebuild progress in tracked on-disk in VDEV_TOP_ZAP_VDEV_REBUILD_PHYS. */ static void vdev_rebuild_thread(void *arg) { vdev_t *vd = arg; spa_t *spa = vd->vdev_spa; int error = 0; /* * If there's a scrub in process request that it be stopped. This * is not required for a correct rebuild, but we do want rebuilds to * emulate the resilver behavior as much as possible. 
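Aside: the estimate maintained by vdev_rebuild_update_bytes_est() reduces to "bytes already scanned plus the allocated space of every metaslab from the current one onward", since metaslabs are visited in order. A toy illustration of that arithmetic over plain arrays; ms_alloc and rebuild_bytes_est are hypothetical, and the real code takes each metaslab's allocated space from metaslab_allocated_space() under ms_lock:

#include <stdint.h>
#include <stddef.h>

/*
 * Estimate of the total bytes a rebuild will touch: everything scanned so
 * far plus the allocated space of the metaslabs not yet visited.
 */
static uint64_t
rebuild_bytes_est(uint64_t bytes_scanned, const uint64_t *ms_alloc,
    size_t ms_count, size_t current_ms)
{
	uint64_t est = bytes_scanned;

	for (size_t i = current_ms; i < ms_count; i++)
		est += ms_alloc[i];

	return (est);
}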
*/ dsl_pool_t *dsl = spa_get_dsl(spa); if (dsl_scan_scrubbing(dsl)) dsl_scan_cancel(dsl); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); mutex_enter(&vd->vdev_rebuild_lock); ASSERT3P(vd->vdev_top, ==, vd); ASSERT3P(vd->vdev_rebuild_thread, !=, NULL); ASSERT(vd->vdev_rebuilding); ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REBUILD)); ASSERT3B(vd->vdev_rebuild_cancel_wanted, ==, B_FALSE); ASSERT3B(vd->vdev_rebuild_reset_wanted, ==, B_FALSE); vdev_rebuild_t *vr = &vd->vdev_rebuild_config; vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; vr->vr_top_vdev = vd; vr->vr_scan_msp = NULL; vr->vr_scan_tree = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); mutex_init(&vr->vr_io_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&vr->vr_io_cv, NULL, CV_DEFAULT, NULL); vr->vr_pass_start_time = gethrtime(); vr->vr_pass_bytes_scanned = 0; vr->vr_pass_bytes_issued = 0; vr->vr_bytes_inflight_max = MAX(1ULL << 20, zfs_rebuild_vdev_limit * vd->vdev_children); uint64_t update_est_time = gethrtime(); vdev_rebuild_update_bytes_est(vd, 0); clear_rebuild_bytes(vr->vr_top_vdev); mutex_exit(&vd->vdev_rebuild_lock); /* * Systematically walk the metaslabs and issue rebuild I/Os for * all ranges in the allocated space map. */ for (uint64_t i = 0; i < vd->vdev_ms_count; i++) { metaslab_t *msp = vd->vdev_ms[i]; vr->vr_scan_msp = msp; /* * Removal of vdevs from the vdev tree may eliminate the need * for the rebuild, in which case it should be canceled. The * vdev_rebuild_cancel_wanted flag is set until the sync task * completes. This may be after the rebuild thread exits. */ if (vdev_rebuild_should_cancel(vd)) { vd->vdev_rebuild_cancel_wanted = B_TRUE; error = EINTR; break; } ASSERT0(range_tree_space(vr->vr_scan_tree)); /* Disable any new allocations to this metaslab */ spa_config_exit(spa, SCL_CONFIG, FTAG); metaslab_disable(msp); mutex_enter(&msp->ms_sync_lock); mutex_enter(&msp->ms_lock); /* * If there are outstanding allocations wait for them to be * synced. This is needed to ensure all allocated ranges are * on disk and therefore will be rebuilt. */ for (int j = 0; j < TXG_SIZE; j++) { if (range_tree_space(msp->ms_allocating[j])) { mutex_exit(&msp->ms_lock); mutex_exit(&msp->ms_sync_lock); txg_wait_synced(dsl, 0); mutex_enter(&msp->ms_sync_lock); mutex_enter(&msp->ms_lock); break; } } /* * When a metaslab has been allocated from read its allocated * ranges from the space map object into the vr_scan_tree. * Then add inflight / unflushed ranges and remove inflight / * unflushed frees. This is the minimum range to be rebuilt. */ if (msp->ms_sm != NULL) { VERIFY0(space_map_load(msp->ms_sm, vr->vr_scan_tree, SM_ALLOC)); for (int i = 0; i < TXG_SIZE; i++) { ASSERT0(range_tree_space( msp->ms_allocating[i])); } range_tree_walk(msp->ms_unflushed_allocs, range_tree_add, vr->vr_scan_tree); range_tree_walk(msp->ms_unflushed_frees, range_tree_remove, vr->vr_scan_tree); /* * Remove ranges which have already been rebuilt based * on the last offset. This can happen when restarting * a scan after exporting and re-importing the pool. */ range_tree_clear(vr->vr_scan_tree, 0, vrp->vrp_last_offset); } mutex_exit(&msp->ms_lock); mutex_exit(&msp->ms_sync_lock); /* * To provide an accurate estimate re-calculate the estimated * size every 5 minutes to account for recent allocations and * frees made to space maps which have not yet been rebuilt. */ if (gethrtime() > update_est_time + SEC2NSEC(300)) { update_est_time = gethrtime(); vdev_rebuild_update_bytes_est(vd, i); } /* * Walk the allocated space map and issue the rebuild I/O. 
*/ error = vdev_rebuild_ranges(vr); range_tree_vacate(vr->vr_scan_tree, NULL, NULL); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); metaslab_enable(msp, B_FALSE, B_FALSE); if (error != 0) break; } range_tree_destroy(vr->vr_scan_tree); spa_config_exit(spa, SCL_CONFIG, FTAG); /* Wait for any remaining rebuild I/O to complete */ mutex_enter(&vr->vr_io_lock); while (vr->vr_bytes_inflight > 0) cv_wait(&vr->vr_io_cv, &vr->vr_io_lock); mutex_exit(&vr->vr_io_lock); mutex_destroy(&vr->vr_io_lock); cv_destroy(&vr->vr_io_cv); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); dsl_pool_t *dp = spa_get_dsl(spa); dmu_tx_t *tx = dmu_tx_create_dd(dp->dp_mos_dir); VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); mutex_enter(&vd->vdev_rebuild_lock); if (error == 0) { /* * After a successful rebuild clear the DTLs of all ranges * which were missing when the rebuild was started. These * ranges must have been rebuilt as a consequence of rebuilding * all allocated space. Note that unlike a scrub or resilver * the rebuild operation will reconstruct data only referenced * by a pool checkpoint. See the dsl_scan_done() comments. */ dsl_sync_task_nowait(dp, vdev_rebuild_complete_sync, (void *)(uintptr_t)vd->vdev_id, tx); } else if (vd->vdev_rebuild_cancel_wanted) { /* * The rebuild operation was canceled. This will occur when * a device participating in the rebuild is detached. */ dsl_sync_task_nowait(dp, vdev_rebuild_cancel_sync, (void *)(uintptr_t)vd->vdev_id, tx); } else if (vd->vdev_rebuild_reset_wanted) { /* * Reset the running rebuild without canceling and restarting * it. This will occur when a new device is attached and must * participate in the rebuild. */ dsl_sync_task_nowait(dp, vdev_rebuild_reset_sync, (void *)(uintptr_t)vd->vdev_id, tx); } else { /* * The rebuild operation should be suspended. This may occur * when detaching a child vdev or when exporting the pool. The * rebuild is left in the active state so it will be resumed. */ ASSERT(vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE); vd->vdev_rebuilding = B_FALSE; } dmu_tx_commit(tx); vd->vdev_rebuild_thread = NULL; mutex_exit(&vd->vdev_rebuild_lock); spa_config_exit(spa, SCL_CONFIG, FTAG); cv_broadcast(&vd->vdev_rebuild_cv); thread_exit(); } /* * Returns B_TRUE if any top-level vdev are rebuilding. */ boolean_t vdev_rebuild_active(vdev_t *vd) { spa_t *spa = vd->vdev_spa; boolean_t ret = B_FALSE; if (vd == spa->spa_root_vdev) { for (uint64_t i = 0; i < vd->vdev_children; i++) { ret = vdev_rebuild_active(vd->vdev_child[i]); if (ret) return (ret); } } else if (vd->vdev_top_zap != 0) { vdev_rebuild_t *vr = &vd->vdev_rebuild_config; vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; mutex_enter(&vd->vdev_rebuild_lock); ret = (vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE); mutex_exit(&vd->vdev_rebuild_lock); } return (ret); } /* * Start a rebuild operation. The rebuild may be restarted when the * top-level vdev is currently actively rebuilding. */ void vdev_rebuild(vdev_t *vd) { vdev_rebuild_t *vr = &vd->vdev_rebuild_config; vdev_rebuild_phys_t *vrp __maybe_unused = &vr->vr_rebuild_phys; ASSERT(vd->vdev_top == vd); ASSERT(vdev_is_concrete(vd)); ASSERT(!vd->vdev_removing); ASSERT(spa_feature_is_enabled(vd->vdev_spa, SPA_FEATURE_DEVICE_REBUILD)); mutex_enter(&vd->vdev_rebuild_lock); if (vd->vdev_rebuilding) { ASSERT3U(vrp->vrp_rebuild_state, ==, VDEV_REBUILD_ACTIVE); /* * Signal a running rebuild operation that it should restart * from the beginning because a new device was attached. The * vdev_rebuild_reset_wanted flag is set until the sync task * completes. 
This may be after the rebuild thread exits. */ if (!vd->vdev_rebuild_reset_wanted) vd->vdev_rebuild_reset_wanted = B_TRUE; } else { vdev_rebuild_initiate(vd); } mutex_exit(&vd->vdev_rebuild_lock); } static void vdev_rebuild_restart_impl(vdev_t *vd) { spa_t *spa = vd->vdev_spa; if (vd == spa->spa_root_vdev) { for (uint64_t i = 0; i < vd->vdev_children; i++) vdev_rebuild_restart_impl(vd->vdev_child[i]); } else if (vd->vdev_top_zap != 0) { vdev_rebuild_t *vr = &vd->vdev_rebuild_config; vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; mutex_enter(&vd->vdev_rebuild_lock); if (vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE && vdev_writeable(vd) && !vd->vdev_rebuilding) { ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REBUILD)); vd->vdev_rebuilding = B_TRUE; vd->vdev_rebuild_thread = thread_create(NULL, 0, vdev_rebuild_thread, vd, 0, &p0, TS_RUN, maxclsyspri); } mutex_exit(&vd->vdev_rebuild_lock); } } /* * Conditionally restart all of the vdev_rebuild_thread's for a pool. The * feature flag must be active and the rebuild in the active state. This * cannot be used to start a new rebuild. */ void vdev_rebuild_restart(spa_t *spa) { ASSERT(MUTEX_HELD(&spa_namespace_lock)); vdev_rebuild_restart_impl(spa->spa_root_vdev); } /* * Stop and wait for all of the vdev_rebuild_thread's associated with the * vdev tree provide to be terminated (canceled or stopped). */ void vdev_rebuild_stop_wait(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(MUTEX_HELD(&spa_namespace_lock)); if (vd == spa->spa_root_vdev) { for (uint64_t i = 0; i < vd->vdev_children; i++) vdev_rebuild_stop_wait(vd->vdev_child[i]); } else if (vd->vdev_top_zap != 0) { ASSERT(vd == vd->vdev_top); mutex_enter(&vd->vdev_rebuild_lock); if (vd->vdev_rebuild_thread != NULL) { vd->vdev_rebuild_exit_wanted = B_TRUE; while (vd->vdev_rebuilding) { cv_wait(&vd->vdev_rebuild_cv, &vd->vdev_rebuild_lock); } vd->vdev_rebuild_exit_wanted = B_FALSE; } mutex_exit(&vd->vdev_rebuild_lock); } } /* * Stop all rebuild operations but leave them in the active state so they * will be resumed when importing the pool. */ void vdev_rebuild_stop_all(spa_t *spa) { vdev_rebuild_stop_wait(spa->spa_root_vdev); } /* * Rebuild statistics reported per top-level vdev. 
*/ int vdev_rebuild_get_stats(vdev_t *tvd, vdev_rebuild_stat_t *vrs) { spa_t *spa = tvd->vdev_spa; if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD)) return (SET_ERROR(ENOTSUP)); if (tvd != tvd->vdev_top || tvd->vdev_top_zap == 0) return (SET_ERROR(EINVAL)); int error = zap_contains(spa_meta_objset(spa), tvd->vdev_top_zap, VDEV_TOP_ZAP_VDEV_REBUILD_PHYS); if (error == ENOENT) { bzero(vrs, sizeof (vdev_rebuild_stat_t)); vrs->vrs_state = VDEV_REBUILD_NONE; error = 0; } else if (error == 0) { vdev_rebuild_t *vr = &tvd->vdev_rebuild_config; vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; mutex_enter(&tvd->vdev_rebuild_lock); vrs->vrs_state = vrp->vrp_rebuild_state; vrs->vrs_start_time = vrp->vrp_start_time; vrs->vrs_end_time = vrp->vrp_end_time; vrs->vrs_scan_time_ms = vrp->vrp_scan_time_ms; vrs->vrs_bytes_scanned = vrp->vrp_bytes_scanned; vrs->vrs_bytes_issued = vrp->vrp_bytes_issued; vrs->vrs_bytes_rebuilt = vrp->vrp_bytes_rebuilt; vrs->vrs_bytes_est = vrp->vrp_bytes_est; vrs->vrs_errors = vrp->vrp_errors; vrs->vrs_pass_time_ms = NSEC2MSEC(gethrtime() - vr->vr_pass_start_time); vrs->vrs_pass_bytes_scanned = vr->vr_pass_bytes_scanned; vrs->vrs_pass_bytes_issued = vr->vr_pass_bytes_issued; mutex_exit(&tvd->vdev_rebuild_lock); } return (error); } /* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs, zfs_, rebuild_max_segment, ULONG, ZMOD_RW, "Max segment size in bytes of rebuild reads"); ZFS_MODULE_PARAM(zfs, zfs_, rebuild_vdev_limit, ULONG, ZMOD_RW, "Max bytes in flight per leaf vdev for sequential resilvers"); ZFS_MODULE_PARAM(zfs, zfs_, rebuild_scrub_enabled, INT, ZMOD_RW, "Automatically scrub after sequential resilver completes"); /* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/zfs_fm.c b/sys/contrib/openzfs/module/zfs/zfs_fm.c index ea71ef325c89..9e9f4a80ba1d 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_fm.c +++ b/sys/contrib/openzfs/module/zfs/zfs_fm.c @@ -1,1416 +1,1460 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* - * Copyright (c) 2012,2020 by Delphix. All rights reserved. + * Copyright (c) 2012,2021 by Delphix. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include /* * This general routine is responsible for generating all the different ZFS * ereports. The payload is dependent on the class, and which arguments are * supplied to the function: * * EREPORT POOL VDEV IO * block X X X * data X X * device X X * pool X * * If we are in a loading state, all errors are chained together by the same * SPA-wide ENA (Error Numeric Association). * * For isolated I/O requests, we get the ENA from the zio_t. 
The propagation * gets very complicated due to RAID-Z, gang blocks, and vdev caching. We want * to chain together all ereports associated with a logical piece of data. For * read I/Os, there are basically three 'types' of I/O, which form a roughly * layered diagram: * * +---------------+ * | Aggregate I/O | No associated logical data or device * +---------------+ * | * V * +---------------+ Reads associated with a piece of logical data. * | Read I/O | This includes reads on behalf of RAID-Z, * +---------------+ mirrors, gang blocks, retries, etc. * | * V * +---------------+ Reads associated with a particular device, but * | Physical I/O | no logical data. Issued as part of vdev caching * +---------------+ and I/O aggregation. * * Note that 'physical I/O' here is not the same terminology as used in the rest * of ZIO. Typically, 'physical I/O' simply means that there is no attached * blockpointer. But I/O with no associated block pointer can still be related * to a logical piece of data (i.e. RAID-Z requests). * * Purely physical I/O always have unique ENAs. They are not related to a * particular piece of logical data, and therefore cannot be chained together. * We still generate an ereport, but the DE doesn't correlate it with any * logical piece of data. When such an I/O fails, the delegated I/O requests * will issue a retry, which will trigger the 'real' ereport with the correct * ENA. * * We keep track of the ENA for a ZIO chain through the 'io_logical' member. * When a new logical I/O is issued, we set this to point to itself. Child I/Os * then inherit this pointer, so that when it is first set subsequent failures * will use the same ENA. For vdev cache fill and queue aggregation I/O, * this pointer is set to NULL, and no ereport will be generated (since it * doesn't actually correspond to any particular device or piece of data, * and the caller will always retry without caching or queueing anyway). * * For checksum errors, we want to include more information about the actual * error which occurs. Accordingly, we build an ereport when the error is * noticed, but instead of sending it in immediately, we hang it off of the * io_cksum_report field of the logical IO. When the logical IO completes * (successfully or not), zfs_ereport_finish_checksum() is called with the * good and bad versions of the buffer (if available), and we annotate the * ereport with information about the differences. */ #ifdef _KERNEL /* * Duplicate ereport Detection * * Some ereports are retained momentarily for detecting duplicates. These * are kept in a recent_events_node_t in both a time-ordered list and an AVL * tree of recent unique ereports. * * The lifespan of these recent ereports is bounded (15 mins) and a cleaner * task is used to purge stale entries. */ static list_t recent_events_list; static avl_tree_t recent_events_tree; static kmutex_t recent_events_lock; static taskqid_t recent_events_cleaner_tqid; /* * Each node is about 128 bytes so 2,000 would consume 1/4 MiB. * * This setting can be changed dynamically and setting it to zero * disables duplicate detection. */ unsigned int zfs_zevent_retain_max = 2000; /* * The lifespan for a recent ereport entry. The default of 15 minutes is * intended to outlive the zfs diagnosis engine's threshold of 10 errors * over a period of 10 minutes. 
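Aside: the duplicate-detection scheme described above amounts to a bounded cache of recently seen (pool, vdev, error, offset, size) tuples with a time-based expiry. The kernel keeps the entries in both an AVL tree (for lookup) and a time-ordered list (for eviction and cleaning); the sketch below collapses that into one fixed-size array with linear search, purely to illustrate the policy. All names here are hypothetical, and the constants only mirror the defaults mentioned above:

#include <stdint.h>
#include <time.h>

#define	RECENT_MAX	2000		/* cf. zfs_zevent_retain_max */
#define	RECENT_EXPIRE	900		/* seconds, cf. retain_expire_secs */

typedef struct recent_key {
	uint64_t	pool_guid;
	uint64_t	vdev_guid;
	int		io_error;
	uint64_t	offset;
	uint64_t	size;
} recent_key_t;

typedef struct recent_ent {
	recent_key_t	key;
	time_t		seen;		/* last time this tuple was posted */
	int		valid;
} recent_ent_t;

static recent_ent_t recent[RECENT_MAX];

/*
 * Return 1 if an identical event was posted within the expiry window
 * (and refresh its timestamp); otherwise record the event and return 0.
 */
static int
event_is_duplicate(const recent_key_t *key)
{
	time_t now = time(NULL);
	int free_slot = -1;

	for (int i = 0; i < RECENT_MAX; i++) {
		if (!recent[i].valid || now - recent[i].seen > RECENT_EXPIRE) {
			if (free_slot < 0)
				free_slot = i;
			continue;
		}
		if (recent[i].key.pool_guid == key->pool_guid &&
		    recent[i].key.vdev_guid == key->vdev_guid &&
		    recent[i].key.io_error == key->io_error &&
		    recent[i].key.offset == key->offset &&
		    recent[i].key.size == key->size) {
			recent[i].seen = now;	/* extend its lifespan */
			return (1);
		}
	}
	if (free_slot >= 0) {
		recent[free_slot].key = *key;
		recent[free_slot].seen = now;
		recent[free_slot].valid = 1;
	}
	return (0);
}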
*/ unsigned int zfs_zevent_retain_expire_secs = 900; typedef enum zfs_subclass { ZSC_IO, ZSC_DATA, ZSC_CHECKSUM } zfs_subclass_t; typedef struct { /* common criteria */ uint64_t re_pool_guid; uint64_t re_vdev_guid; int re_io_error; uint64_t re_io_size; uint64_t re_io_offset; zfs_subclass_t re_subclass; zio_priority_t re_io_priority; /* logical zio criteria (optional) */ zbookmark_phys_t re_io_bookmark; /* internal state */ avl_node_t re_tree_link; list_node_t re_list_link; uint64_t re_timestamp; } recent_events_node_t; static int recent_events_compare(const void *a, const void *b) { const recent_events_node_t *node1 = a; const recent_events_node_t *node2 = b; int cmp; /* * The comparison order here is somewhat arbitrary. * What's important is that if every criteria matches, then it * is a duplicate (i.e. compare returns 0) */ if ((cmp = TREE_CMP(node1->re_subclass, node2->re_subclass)) != 0) return (cmp); if ((cmp = TREE_CMP(node1->re_pool_guid, node2->re_pool_guid)) != 0) return (cmp); if ((cmp = TREE_CMP(node1->re_vdev_guid, node2->re_vdev_guid)) != 0) return (cmp); if ((cmp = TREE_CMP(node1->re_io_error, node2->re_io_error)) != 0) return (cmp); if ((cmp = TREE_CMP(node1->re_io_priority, node2->re_io_priority)) != 0) return (cmp); if ((cmp = TREE_CMP(node1->re_io_size, node2->re_io_size)) != 0) return (cmp); if ((cmp = TREE_CMP(node1->re_io_offset, node2->re_io_offset)) != 0) return (cmp); const zbookmark_phys_t *zb1 = &node1->re_io_bookmark; const zbookmark_phys_t *zb2 = &node2->re_io_bookmark; if ((cmp = TREE_CMP(zb1->zb_objset, zb2->zb_objset)) != 0) return (cmp); if ((cmp = TREE_CMP(zb1->zb_object, zb2->zb_object)) != 0) return (cmp); if ((cmp = TREE_CMP(zb1->zb_level, zb2->zb_level)) != 0) return (cmp); if ((cmp = TREE_CMP(zb1->zb_blkid, zb2->zb_blkid)) != 0) return (cmp); return (0); } static void zfs_ereport_schedule_cleaner(void); /* * background task to clean stale recent event nodes. 
*/ /*ARGSUSED*/ static void zfs_ereport_cleaner(void *arg) { recent_events_node_t *entry; uint64_t now = gethrtime(); /* * purge expired entries */ mutex_enter(&recent_events_lock); while ((entry = list_tail(&recent_events_list)) != NULL) { uint64_t age = NSEC2SEC(now - entry->re_timestamp); if (age <= zfs_zevent_retain_expire_secs) break; /* remove expired node */ avl_remove(&recent_events_tree, entry); list_remove(&recent_events_list, entry); kmem_free(entry, sizeof (*entry)); } /* Restart the cleaner if more entries remain */ recent_events_cleaner_tqid = 0; if (!list_is_empty(&recent_events_list)) zfs_ereport_schedule_cleaner(); mutex_exit(&recent_events_lock); } static void zfs_ereport_schedule_cleaner(void) { ASSERT(MUTEX_HELD(&recent_events_lock)); uint64_t timeout = SEC2NSEC(zfs_zevent_retain_expire_secs + 1); recent_events_cleaner_tqid = taskq_dispatch_delay( system_delay_taskq, zfs_ereport_cleaner, NULL, TQ_SLEEP, ddi_get_lbolt() + NSEC_TO_TICK(timeout)); } +/* + * Clear entries for a given vdev or all vdevs in a pool when vdev == NULL + */ +void +zfs_ereport_clear(spa_t *spa, vdev_t *vd) +{ + uint64_t vdev_guid, pool_guid; + int cnt = 0; + + ASSERT(vd != NULL || spa != NULL); + if (vd == NULL) { + vdev_guid = 0; + pool_guid = spa_guid(spa); + } else { + vdev_guid = vd->vdev_guid; + pool_guid = 0; + } + + mutex_enter(&recent_events_lock); + + recent_events_node_t *next = list_head(&recent_events_list); + while (next != NULL) { + recent_events_node_t *entry = next; + + next = list_next(&recent_events_list, next); + + if (entry->re_vdev_guid == vdev_guid || + entry->re_pool_guid == pool_guid) { + avl_remove(&recent_events_tree, entry); + list_remove(&recent_events_list, entry); + kmem_free(entry, sizeof (*entry)); + cnt++; + } + } + + mutex_exit(&recent_events_lock); +} + /* * Check if an ereport would be a duplicate of one recently posted. * * An ereport is considered a duplicate if the set of criteria in * recent_events_node_t all match. * * Only FM_EREPORT_ZFS_IO, FM_EREPORT_ZFS_DATA, and FM_EREPORT_ZFS_CHECKSUM * are candidates for duplicate checking. 
*/ static boolean_t zfs_ereport_is_duplicate(const char *subclass, spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, zio_t *zio, uint64_t offset, uint64_t size) { recent_events_node_t search = {0}, *entry; if (vd == NULL || zio == NULL) return (B_FALSE); if (zfs_zevent_retain_max == 0) return (B_FALSE); if (strcmp(subclass, FM_EREPORT_ZFS_IO) == 0) search.re_subclass = ZSC_IO; else if (strcmp(subclass, FM_EREPORT_ZFS_DATA) == 0) search.re_subclass = ZSC_DATA; else if (strcmp(subclass, FM_EREPORT_ZFS_CHECKSUM) == 0) search.re_subclass = ZSC_CHECKSUM; else return (B_FALSE); search.re_pool_guid = spa_guid(spa); search.re_vdev_guid = vd->vdev_guid; search.re_io_error = zio->io_error; search.re_io_priority = zio->io_priority; /* if size is supplied use it over what's in zio */ if (size) { search.re_io_size = size; search.re_io_offset = offset; } else { search.re_io_size = zio->io_size; search.re_io_offset = zio->io_offset; } /* grab optional logical zio criteria */ if (zb != NULL) { search.re_io_bookmark.zb_objset = zb->zb_objset; search.re_io_bookmark.zb_object = zb->zb_object; search.re_io_bookmark.zb_level = zb->zb_level; search.re_io_bookmark.zb_blkid = zb->zb_blkid; } uint64_t now = gethrtime(); mutex_enter(&recent_events_lock); /* check if we have seen this one recently */ entry = avl_find(&recent_events_tree, &search, NULL); if (entry != NULL) { uint64_t age = NSEC2SEC(now - entry->re_timestamp); /* * There is still an active cleaner (since we're here). * Reset the last seen time for this duplicate entry * so that its lifespand gets extended. */ list_remove(&recent_events_list, entry); list_insert_head(&recent_events_list, entry); entry->re_timestamp = now; zfs_zevent_track_duplicate(); mutex_exit(&recent_events_lock); return (age <= zfs_zevent_retain_expire_secs); } if (avl_numnodes(&recent_events_tree) >= zfs_zevent_retain_max) { /* recycle oldest node */ entry = list_tail(&recent_events_list); ASSERT(entry != NULL); list_remove(&recent_events_list, entry); avl_remove(&recent_events_tree, entry); } else { entry = kmem_alloc(sizeof (recent_events_node_t), KM_SLEEP); } /* record this as a recent ereport */ *entry = search; avl_add(&recent_events_tree, entry); list_insert_head(&recent_events_list, entry); entry->re_timestamp = now; /* Start a cleaner if not already scheduled */ if (recent_events_cleaner_tqid == 0) zfs_ereport_schedule_cleaner(); mutex_exit(&recent_events_lock); return (B_FALSE); } void zfs_zevent_post_cb(nvlist_t *nvl, nvlist_t *detector) { if (nvl) fm_nvlist_destroy(nvl, FM_NVA_FREE); if (detector) fm_nvlist_destroy(detector, FM_NVA_FREE); } /* * We want to rate limit ZIO delay and checksum events so as to not * flood ZED when a disk is acting up. * * Returns 1 if we're ratelimiting, 0 if not. */ static int zfs_is_ratelimiting_event(const char *subclass, vdev_t *vd) { int rc = 0; /* * __ratelimit() returns 1 if we're *not* ratelimiting and 0 if we * are. Invert it to get our return value. */ if (strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) { rc = !zfs_ratelimit(&vd->vdev_delay_rl); } else if (strcmp(subclass, FM_EREPORT_ZFS_CHECKSUM) == 0) { rc = !zfs_ratelimit(&vd->vdev_checksum_rl); } if (rc) { /* We're rate limiting */ fm_erpt_dropped_increment(); } return (rc); } /* * Return B_TRUE if the event actually posted, B_FALSE if not. 
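Aside: zfs_is_ratelimiting_event() above inherits the Linux __ratelimit() convention, in which the limiter returns nonzero when an event is *allowed*, so the caller inverts the result to answer "am I rate limiting?". A minimal fixed-window limiter with that same convention; the names and the windowing strategy are hypothetical and do not reflect the real zfs_ratelimit() internals:

#include <time.h>

/* Hypothetical fixed-window limiter: at most 'burst' events per 'interval'. */
typedef struct ratelimit {
	time_t		window_start;
	unsigned int	count;
	unsigned int	burst;		/* events allowed per window */
	unsigned int	interval;	/* window length in seconds */
} ratelimit_t;

/* Returns 1 if the event may be posted (i.e. NOT rate limited), 0 otherwise. */
static int
ratelimit_allow(ratelimit_t *rl)
{
	time_t now = time(NULL);

	if (now - rl->window_start >= rl->interval) {
		rl->window_start = now;
		rl->count = 0;
	}
	if (rl->count < rl->burst) {
		rl->count++;
		return (1);
	}
	return (0);
}

/* Mirror of the inverted calling convention used above. */
static int
is_ratelimiting_event(ratelimit_t *rl)
{
	return (!ratelimit_allow(rl));	/* 1 means we ARE rate limiting */
}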
*/ static boolean_t zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, const char *subclass, spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, zio_t *zio, uint64_t stateoroffset, uint64_t size) { nvlist_t *ereport, *detector; uint64_t ena; char class[64]; if ((ereport = fm_nvlist_create(NULL)) == NULL) return (B_FALSE); if ((detector = fm_nvlist_create(NULL)) == NULL) { fm_nvlist_destroy(ereport, FM_NVA_FREE); return (B_FALSE); } /* * Serialize ereport generation */ mutex_enter(&spa->spa_errlist_lock); /* * Determine the ENA to use for this event. If we are in a loading * state, use a SPA-wide ENA. Otherwise, if we are in an I/O state, use * a root zio-wide ENA. Otherwise, simply use a unique ENA. */ if (spa_load_state(spa) != SPA_LOAD_NONE) { if (spa->spa_ena == 0) spa->spa_ena = fm_ena_generate(0, FM_ENA_FMT1); ena = spa->spa_ena; } else if (zio != NULL && zio->io_logical != NULL) { if (zio->io_logical->io_ena == 0) zio->io_logical->io_ena = fm_ena_generate(0, FM_ENA_FMT1); ena = zio->io_logical->io_ena; } else { ena = fm_ena_generate(0, FM_ENA_FMT1); } /* * Construct the full class, detector, and other standard FMA fields. */ (void) snprintf(class, sizeof (class), "%s.%s", ZFS_ERROR_CLASS, subclass); fm_fmri_zfs_set(detector, FM_ZFS_SCHEME_VERSION, spa_guid(spa), vd != NULL ? vd->vdev_guid : 0); fm_ereport_set(ereport, FM_EREPORT_VERSION, class, ena, detector, NULL); /* * Construct the per-ereport payload, depending on which parameters are * passed in. */ /* * Generic payload members common to all ereports. */ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL, DATA_TYPE_STRING, spa_name(spa), FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, DATA_TYPE_UINT64, spa_guid(spa), FM_EREPORT_PAYLOAD_ZFS_POOL_STATE, DATA_TYPE_UINT64, (uint64_t)spa_state(spa), FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, DATA_TYPE_INT32, (int32_t)spa_load_state(spa), NULL); fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE, DATA_TYPE_STRING, spa_get_failmode(spa) == ZIO_FAILURE_MODE_WAIT ? FM_EREPORT_FAILMODE_WAIT : spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE ? 
FM_EREPORT_FAILMODE_CONTINUE : FM_EREPORT_FAILMODE_PANIC, NULL); if (vd != NULL) { vdev_t *pvd = vd->vdev_parent; vdev_queue_t *vq = &vd->vdev_queue; vdev_stat_t *vs = &vd->vdev_stat; vdev_t *spare_vd; uint64_t *spare_guids; char **spare_paths; int i, spare_count; fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, DATA_TYPE_UINT64, vd->vdev_guid, FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE, DATA_TYPE_STRING, vd->vdev_ops->vdev_op_type, NULL); if (vd->vdev_path != NULL) fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH, DATA_TYPE_STRING, vd->vdev_path, NULL); if (vd->vdev_devid != NULL) fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID, DATA_TYPE_STRING, vd->vdev_devid, NULL); if (vd->vdev_fru != NULL) fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU, DATA_TYPE_STRING, vd->vdev_fru, NULL); if (vd->vdev_enc_sysfs_path != NULL) fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_ENC_SYSFS_PATH, DATA_TYPE_STRING, vd->vdev_enc_sysfs_path, NULL); if (vd->vdev_ashift) fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_ASHIFT, DATA_TYPE_UINT64, vd->vdev_ashift, NULL); if (vq != NULL) { fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_COMP_TS, DATA_TYPE_UINT64, vq->vq_io_complete_ts, NULL); fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_DELTA_TS, DATA_TYPE_UINT64, vq->vq_io_delta_ts, NULL); } if (vs != NULL) { fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_READ_ERRORS, DATA_TYPE_UINT64, vs->vs_read_errors, FM_EREPORT_PAYLOAD_ZFS_VDEV_WRITE_ERRORS, DATA_TYPE_UINT64, vs->vs_write_errors, FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_ERRORS, DATA_TYPE_UINT64, vs->vs_checksum_errors, FM_EREPORT_PAYLOAD_ZFS_VDEV_DELAYS, DATA_TYPE_UINT64, vs->vs_slow_ios, NULL); } if (pvd != NULL) { fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID, DATA_TYPE_UINT64, pvd->vdev_guid, FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE, DATA_TYPE_STRING, pvd->vdev_ops->vdev_op_type, NULL); if (pvd->vdev_path) fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH, DATA_TYPE_STRING, pvd->vdev_path, NULL); if (pvd->vdev_devid) fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_PARENT_DEVID, DATA_TYPE_STRING, pvd->vdev_devid, NULL); } spare_count = spa->spa_spares.sav_count; spare_paths = kmem_zalloc(sizeof (char *) * spare_count, KM_SLEEP); spare_guids = kmem_zalloc(sizeof (uint64_t) * spare_count, KM_SLEEP); for (i = 0; i < spare_count; i++) { spare_vd = spa->spa_spares.sav_vdevs[i]; if (spare_vd) { spare_paths[i] = spare_vd->vdev_path; spare_guids[i] = spare_vd->vdev_guid; } } fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_SPARE_PATHS, DATA_TYPE_STRING_ARRAY, spare_count, spare_paths, FM_EREPORT_PAYLOAD_ZFS_VDEV_SPARE_GUIDS, DATA_TYPE_UINT64_ARRAY, spare_count, spare_guids, NULL); kmem_free(spare_guids, sizeof (uint64_t) * spare_count); kmem_free(spare_paths, sizeof (char *) * spare_count); } if (zio != NULL) { /* * Payload common to all I/Os. 
*/ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR, DATA_TYPE_INT32, zio->io_error, NULL); fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS, DATA_TYPE_INT32, zio->io_flags, NULL); fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_STAGE, DATA_TYPE_UINT32, zio->io_stage, NULL); fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_PIPELINE, DATA_TYPE_UINT32, zio->io_pipeline, NULL); fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DELAY, DATA_TYPE_UINT64, zio->io_delay, NULL); fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_TIMESTAMP, DATA_TYPE_UINT64, zio->io_timestamp, NULL); fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DELTA, DATA_TYPE_UINT64, zio->io_delta, NULL); fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_PRIORITY, DATA_TYPE_UINT32, zio->io_priority, NULL); /* * If the 'size' parameter is non-zero, it indicates this is a * RAID-Z or other I/O where the physical offset and length are * provided for us, instead of within the zio_t. */ if (vd != NULL) { if (size) fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET, DATA_TYPE_UINT64, stateoroffset, FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE, DATA_TYPE_UINT64, size, NULL); else fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET, DATA_TYPE_UINT64, zio->io_offset, FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE, DATA_TYPE_UINT64, zio->io_size, NULL); } } else if (vd != NULL) { /* * If we have a vdev but no zio, this is a device fault, and the * 'stateoroffset' parameter indicates the previous state of the * vdev. */ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_PREV_STATE, DATA_TYPE_UINT64, stateoroffset, NULL); } /* * Payload for I/Os with corresponding logical information. */ if (zb != NULL && (zio == NULL || zio->io_logical != NULL)) { fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET, DATA_TYPE_UINT64, zb->zb_objset, FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT, DATA_TYPE_UINT64, zb->zb_object, FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL, DATA_TYPE_INT64, zb->zb_level, FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID, DATA_TYPE_UINT64, zb->zb_blkid, NULL); } mutex_exit(&spa->spa_errlist_lock); *ereport_out = ereport; *detector_out = detector; return (B_TRUE); } /* if it's <= 128 bytes, save the corruption directly */ #define ZFM_MAX_INLINE (128 / sizeof (uint64_t)) #define MAX_RANGES 16 typedef struct zfs_ecksum_info { /* histograms of set and cleared bits by bit number in a 64-bit word */ uint32_t zei_histogram_set[sizeof (uint64_t) * NBBY]; uint32_t zei_histogram_cleared[sizeof (uint64_t) * NBBY]; /* inline arrays of bits set and cleared. */ uint64_t zei_bits_set[ZFM_MAX_INLINE]; uint64_t zei_bits_cleared[ZFM_MAX_INLINE]; /* * for each range, the number of bits set and cleared. The Hamming * distance between the good and bad buffers is the sum of them all. */ uint32_t zei_range_sets[MAX_RANGES]; uint32_t zei_range_clears[MAX_RANGES]; struct zei_ranges { uint32_t zr_start; uint32_t zr_end; } zei_ranges[MAX_RANGES]; size_t zei_range_count; uint32_t zei_mingap; uint32_t zei_allowed_mingap; } zfs_ecksum_info_t; static void update_histogram(uint64_t value_arg, uint32_t *hist, uint32_t *count) { size_t i; size_t bits = 0; uint64_t value = BE_64(value_arg); /* We store the bits in big-endian (largest-first) order */ for (i = 0; i < 64; i++) { if (value & (1ull << i)) { hist[63 - i]++; ++bits; } } /* update the count of bits changed */ *count += bits; } /* * We've now filled up the range array, and need to increase "mingap" and * shrink the range list accordingly. 
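Aside: the checksum annotation machinery above (zfs_ecksum_info_t and update_histogram()) classifies each differing 64-bit word into bits that were set in the bad copy but not the good one, and bits that were cleared, then tallies them per bit position. A compact stand-alone illustration of that bookkeeping; the byte-order normalization done by BE_64 and the bit-numbering convention of the real code are deliberately omitted, and tally_bit_flips is a hypothetical name:

#include <stdint.h>
#include <stddef.h>

/*
 * For each 64-bit word that differs between the good and bad buffers,
 * record which bits were flipped on ("set") and which were flipped off
 * ("cleared"), and count them per bit position.
 */
static void
tally_bit_flips(const uint64_t *good, const uint64_t *bad, size_t nwords,
    uint32_t hist_set[64], uint32_t hist_cleared[64])
{
	for (size_t i = 0; i < nwords; i++) {
		if (good[i] == bad[i])
			continue;

		uint64_t set = ~good[i] & bad[i];	/* 0 -> 1 flips */
		uint64_t cleared = good[i] & ~bad[i];	/* 1 -> 0 flips */

		for (int b = 0; b < 64; b++) {
			if (set & (1ULL << b))
				hist_set[b]++;
			if (cleared & (1ULL << b))
				hist_cleared[b]++;
		}
	}
}

The sum of both histograms over all bit positions is the Hamming distance between the two buffers, which is what the per-range set/clear counters in the ereport payload add up to.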
zei_mingap is always the smallest * distance between array entries, so we set the new_allowed_gap to be * one greater than that. We then go through the list, joining together * any ranges which are closer than the new_allowed_gap. * * By construction, there will be at least one. We also update zei_mingap * to the new smallest gap, to prepare for our next invocation. */ static void zei_shrink_ranges(zfs_ecksum_info_t *eip) { uint32_t mingap = UINT32_MAX; uint32_t new_allowed_gap = eip->zei_mingap + 1; size_t idx, output; size_t max = eip->zei_range_count; struct zei_ranges *r = eip->zei_ranges; ASSERT3U(eip->zei_range_count, >, 0); ASSERT3U(eip->zei_range_count, <=, MAX_RANGES); output = idx = 0; while (idx < max - 1) { uint32_t start = r[idx].zr_start; uint32_t end = r[idx].zr_end; while (idx < max - 1) { idx++; uint32_t nstart = r[idx].zr_start; uint32_t nend = r[idx].zr_end; uint32_t gap = nstart - end; if (gap < new_allowed_gap) { end = nend; continue; } if (gap < mingap) mingap = gap; break; } r[output].zr_start = start; r[output].zr_end = end; output++; } ASSERT3U(output, <, eip->zei_range_count); eip->zei_range_count = output; eip->zei_mingap = mingap; eip->zei_allowed_mingap = new_allowed_gap; } static void zei_add_range(zfs_ecksum_info_t *eip, int start, int end) { struct zei_ranges *r = eip->zei_ranges; size_t count = eip->zei_range_count; if (count >= MAX_RANGES) { zei_shrink_ranges(eip); count = eip->zei_range_count; } if (count == 0) { eip->zei_mingap = UINT32_MAX; eip->zei_allowed_mingap = 1; } else { int gap = start - r[count - 1].zr_end; if (gap < eip->zei_allowed_mingap) { r[count - 1].zr_end = end; return; } if (gap < eip->zei_mingap) eip->zei_mingap = gap; } r[count].zr_start = start; r[count].zr_end = end; eip->zei_range_count++; } static size_t zei_range_total_size(zfs_ecksum_info_t *eip) { struct zei_ranges *r = eip->zei_ranges; size_t count = eip->zei_range_count; size_t result = 0; size_t idx; for (idx = 0; idx < count; idx++) result += (r[idx].zr_end - r[idx].zr_start); return (result); } static zfs_ecksum_info_t * annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info, const abd_t *goodabd, const abd_t *badabd, size_t size, boolean_t drop_if_identical) { const uint64_t *good; const uint64_t *bad; uint64_t allset = 0; uint64_t allcleared = 0; size_t nui64s = size / sizeof (uint64_t); size_t inline_size; int no_inline = 0; size_t idx; size_t range; size_t offset = 0; ssize_t start = -1; zfs_ecksum_info_t *eip = kmem_zalloc(sizeof (*eip), KM_SLEEP); /* don't do any annotation for injected checksum errors */ if (info != NULL && info->zbc_injected) return (eip); if (info != NULL && info->zbc_has_cksum) { fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_CKSUM_EXPECTED, DATA_TYPE_UINT64_ARRAY, sizeof (info->zbc_expected) / sizeof (uint64_t), (uint64_t *)&info->zbc_expected, FM_EREPORT_PAYLOAD_ZFS_CKSUM_ACTUAL, DATA_TYPE_UINT64_ARRAY, sizeof (info->zbc_actual) / sizeof (uint64_t), (uint64_t *)&info->zbc_actual, FM_EREPORT_PAYLOAD_ZFS_CKSUM_ALGO, DATA_TYPE_STRING, info->zbc_checksum_name, NULL); if (info->zbc_byteswapped) { fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_CKSUM_BYTESWAP, DATA_TYPE_BOOLEAN, 1, NULL); } } if (badabd == NULL || goodabd == NULL) return (eip); ASSERT3U(nui64s, <=, UINT32_MAX); ASSERT3U(size, ==, nui64s * sizeof (uint64_t)); ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); ASSERT3U(size, <=, UINT32_MAX); good = (const uint64_t *) abd_borrow_buf_copy((abd_t *)goodabd, size); bad = (const uint64_t *) abd_borrow_buf_copy((abd_t *)badabd, size); /* build up the 
range list by comparing the two buffers. */ for (idx = 0; idx < nui64s; idx++) { if (good[idx] == bad[idx]) { if (start == -1) continue; zei_add_range(eip, start, idx); start = -1; } else { if (start != -1) continue; start = idx; } } if (start != -1) zei_add_range(eip, start, idx); /* See if it will fit in our inline buffers */ inline_size = zei_range_total_size(eip); if (inline_size > ZFM_MAX_INLINE) no_inline = 1; /* * If there is no change and we want to drop if the buffers are * identical, do so. */ if (inline_size == 0 && drop_if_identical) { kmem_free(eip, sizeof (*eip)); abd_return_buf((abd_t *)goodabd, (void *)good, size); abd_return_buf((abd_t *)badabd, (void *)bad, size); return (NULL); } /* * Now walk through the ranges, filling in the details of the * differences. Also convert our uint64_t-array offsets to byte * offsets. */ for (range = 0; range < eip->zei_range_count; range++) { size_t start = eip->zei_ranges[range].zr_start; size_t end = eip->zei_ranges[range].zr_end; for (idx = start; idx < end; idx++) { uint64_t set, cleared; // bits set in bad, but not in good set = ((~good[idx]) & bad[idx]); // bits set in good, but not in bad cleared = (good[idx] & (~bad[idx])); allset |= set; allcleared |= cleared; if (!no_inline) { ASSERT3U(offset, <, inline_size); eip->zei_bits_set[offset] = set; eip->zei_bits_cleared[offset] = cleared; offset++; } update_histogram(set, eip->zei_histogram_set, &eip->zei_range_sets[range]); update_histogram(cleared, eip->zei_histogram_cleared, &eip->zei_range_clears[range]); } /* convert to byte offsets */ eip->zei_ranges[range].zr_start *= sizeof (uint64_t); eip->zei_ranges[range].zr_end *= sizeof (uint64_t); } abd_return_buf((abd_t *)goodabd, (void *)good, size); abd_return_buf((abd_t *)badabd, (void *)bad, size); eip->zei_allowed_mingap *= sizeof (uint64_t); inline_size *= sizeof (uint64_t); /* fill in ereport */ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_BAD_OFFSET_RANGES, DATA_TYPE_UINT32_ARRAY, 2 * eip->zei_range_count, (uint32_t *)eip->zei_ranges, FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_MIN_GAP, DATA_TYPE_UINT32, eip->zei_allowed_mingap, FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_SETS, DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_sets, FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_CLEARS, DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_clears, NULL); if (!no_inline) { fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_BAD_SET_BITS, DATA_TYPE_UINT8_ARRAY, inline_size, (uint8_t *)eip->zei_bits_set, FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_BITS, DATA_TYPE_UINT8_ARRAY, inline_size, (uint8_t *)eip->zei_bits_cleared, NULL); } else { fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_BAD_SET_HISTOGRAM, DATA_TYPE_UINT32_ARRAY, NBBY * sizeof (uint64_t), eip->zei_histogram_set, FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_HISTOGRAM, DATA_TYPE_UINT32_ARRAY, NBBY * sizeof (uint64_t), eip->zei_histogram_cleared, NULL); } return (eip); } +#else +/*ARGSUSED*/ +void +zfs_ereport_clear(spa_t *spa, vdev_t *vd) +{ +} #endif /* * Make sure our event is still valid for the given zio/vdev/pool. For example, * we don't want to keep logging events for a faulted or missing vdev. */ boolean_t zfs_ereport_is_valid(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio) { #ifdef _KERNEL /* * If we are doing a spa_tryimport() or in recovery mode, * ignore errors. 
*/ if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT || spa_load_state(spa) == SPA_LOAD_RECOVER) return (B_FALSE); /* * If we are in the middle of opening a pool, and the previous attempt * failed, don't bother logging any new ereports - we're just going to * get the same diagnosis anyway. */ if (spa_load_state(spa) != SPA_LOAD_NONE && spa->spa_last_open_failed) return (B_FALSE); if (zio != NULL) { /* * If this is not a read or write zio, ignore the error. This * can occur if the DKIOCFLUSHWRITECACHE ioctl fails. */ if (zio->io_type != ZIO_TYPE_READ && zio->io_type != ZIO_TYPE_WRITE) return (B_FALSE); if (vd != NULL) { /* * If the vdev has already been marked as failing due * to a failed probe, then ignore any subsequent I/O * errors, as the DE will automatically fault the vdev * on the first such failure. This also catches cases * where vdev_remove_wanted is set and the device has * not yet been asynchronously placed into the REMOVED * state. */ if (zio->io_vd == vd && !vdev_accessible(vd, zio)) return (B_FALSE); /* * Ignore checksum errors for reads from DTL regions of * leaf vdevs. */ if (zio->io_type == ZIO_TYPE_READ && zio->io_error == ECKSUM && vd->vdev_ops->vdev_op_leaf && vdev_dtl_contains(vd, DTL_MISSING, zio->io_txg, 1)) return (B_FALSE); } } /* * For probe failure, we want to avoid posting ereports if we've * already removed the device in the meantime. */ if (vd != NULL && strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) == 0 && (vd->vdev_remove_wanted || vd->vdev_state == VDEV_STATE_REMOVED)) return (B_FALSE); /* Ignore bogus delay events (like from ioctls or unqueued IOs) */ if ((strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) && (zio != NULL) && (!zio->io_timestamp)) { return (B_FALSE); } #endif return (B_TRUE); } /* * Post an ereport for the given subclass * * Returns * - 0 if an event was posted * - EINVAL if there was a problem posting event * - EBUSY if the event was rate limited * - EALREADY if the event was already posted (duplicate) */ int zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, zio_t *zio, uint64_t state) { int rc = 0; #ifdef _KERNEL nvlist_t *ereport = NULL; nvlist_t *detector = NULL; if (!zfs_ereport_is_valid(subclass, spa, vd, zio)) return (EINVAL); if (zfs_ereport_is_duplicate(subclass, spa, vd, zb, zio, 0, 0)) return (SET_ERROR(EALREADY)); if (zfs_is_ratelimiting_event(subclass, vd)) return (SET_ERROR(EBUSY)); if (!zfs_ereport_start(&ereport, &detector, subclass, spa, vd, zb, zio, state, 0)) return (SET_ERROR(EINVAL)); /* couldn't post event */ if (ereport == NULL) return (SET_ERROR(EINVAL)); /* Cleanup is handled by the callback function */ rc = zfs_zevent_post(ereport, detector, zfs_zevent_post_cb); #endif return (rc); } /* * Prepare a checksum ereport * * Returns * - 0 if an event was posted * - EINVAL if there was a problem posting event * - EBUSY if the event was rate limited * - EALREADY if the event was already posted (duplicate) */ int zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, struct zio *zio, uint64_t offset, uint64_t length, void *arg, zio_bad_cksum_t *info) { zio_cksum_report_t *report; #ifdef _KERNEL if (!zfs_ereport_is_valid(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio)) return (SET_ERROR(EINVAL)); if (zfs_ereport_is_duplicate(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio, offset, length)) return (SET_ERROR(EALREADY)); if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd)) return (SET_ERROR(EBUSY)); #endif report = kmem_zalloc(sizeof (*report), KM_SLEEP); if (zio->io_vsd != 
NULL) zio->io_vsd_ops->vsd_cksum_report(zio, report, arg); else zio_vsd_default_cksum_report(zio, report, arg); /* copy the checksum failure information if it was provided */ if (info != NULL) { report->zcr_ckinfo = kmem_zalloc(sizeof (*info), KM_SLEEP); bcopy(info, report->zcr_ckinfo, sizeof (*info)); } report->zcr_sector = 1ULL << vd->vdev_top->vdev_ashift; report->zcr_align = vdev_psize_to_asize(vd->vdev_top, report->zcr_sector); report->zcr_length = length; #ifdef _KERNEL (void) zfs_ereport_start(&report->zcr_ereport, &report->zcr_detector, FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio, offset, length); if (report->zcr_ereport == NULL) { zfs_ereport_free_checksum(report); return (0); } #endif mutex_enter(&spa->spa_errlist_lock); report->zcr_next = zio->io_logical->io_cksum_report; zio->io_logical->io_cksum_report = report; mutex_exit(&spa->spa_errlist_lock); return (0); } void zfs_ereport_finish_checksum(zio_cksum_report_t *report, const abd_t *good_data, const abd_t *bad_data, boolean_t drop_if_identical) { #ifdef _KERNEL zfs_ecksum_info_t *info; info = annotate_ecksum(report->zcr_ereport, report->zcr_ckinfo, good_data, bad_data, report->zcr_length, drop_if_identical); if (info != NULL) zfs_zevent_post(report->zcr_ereport, report->zcr_detector, zfs_zevent_post_cb); else zfs_zevent_post_cb(report->zcr_ereport, report->zcr_detector); report->zcr_ereport = report->zcr_detector = NULL; if (info != NULL) kmem_free(info, sizeof (*info)); #endif } void zfs_ereport_free_checksum(zio_cksum_report_t *rpt) { #ifdef _KERNEL if (rpt->zcr_ereport != NULL) { fm_nvlist_destroy(rpt->zcr_ereport, FM_NVA_FREE); fm_nvlist_destroy(rpt->zcr_detector, FM_NVA_FREE); } #endif rpt->zcr_free(rpt->zcr_cbdata, rpt->zcr_cbinfo); if (rpt->zcr_ckinfo != NULL) kmem_free(rpt->zcr_ckinfo, sizeof (*rpt->zcr_ckinfo)); kmem_free(rpt, sizeof (*rpt)); } /* * Post a checksum ereport * * Returns * - 0 if an event was posted * - EINVAL if there was a problem posting event * - EBUSY if the event was rate limited * - EALREADY if the event was already posted (duplicate) */ int zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, struct zio *zio, uint64_t offset, uint64_t length, const abd_t *good_data, const abd_t *bad_data, zio_bad_cksum_t *zbc) { int rc = 0; #ifdef _KERNEL nvlist_t *ereport = NULL; nvlist_t *detector = NULL; zfs_ecksum_info_t *info; if (!zfs_ereport_is_valid(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio)) return (SET_ERROR(EINVAL)); if (zfs_ereport_is_duplicate(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio, offset, length)) return (SET_ERROR(EALREADY)); if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd)) return (SET_ERROR(EBUSY)); if (!zfs_ereport_start(&ereport, &detector, FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio, offset, length) || (ereport == NULL)) { return (SET_ERROR(EINVAL)); } info = annotate_ecksum(ereport, zbc, good_data, bad_data, length, B_FALSE); if (info != NULL) { rc = zfs_zevent_post(ereport, detector, zfs_zevent_post_cb); kmem_free(info, sizeof (*info)); } #endif return (rc); } /* * The 'sysevent.fs.zfs.*' events are signals posted to notify user space of * change in the pool. All sysevents are listed in sys/sysevent/eventdefs.h * and are designed to be consumed by the ZFS Event Daemon (ZED). For * additional details refer to the zed(8) man page. 
*/ nvlist_t * zfs_event_create(spa_t *spa, vdev_t *vd, const char *type, const char *name, nvlist_t *aux) { nvlist_t *resource = NULL; #ifdef _KERNEL char class[64]; if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT) return (NULL); if ((resource = fm_nvlist_create(NULL)) == NULL) return (NULL); (void) snprintf(class, sizeof (class), "%s.%s.%s", type, ZFS_ERROR_CLASS, name); VERIFY0(nvlist_add_uint8(resource, FM_VERSION, FM_RSRC_VERSION)); VERIFY0(nvlist_add_string(resource, FM_CLASS, class)); VERIFY0(nvlist_add_string(resource, FM_EREPORT_PAYLOAD_ZFS_POOL, spa_name(spa))); VERIFY0(nvlist_add_uint64(resource, FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, spa_guid(spa))); VERIFY0(nvlist_add_uint64(resource, FM_EREPORT_PAYLOAD_ZFS_POOL_STATE, spa_state(spa))); VERIFY0(nvlist_add_int32(resource, FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, spa_load_state(spa))); if (vd) { VERIFY0(nvlist_add_uint64(resource, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, vd->vdev_guid)); VERIFY0(nvlist_add_uint64(resource, FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE, vd->vdev_state)); if (vd->vdev_path != NULL) VERIFY0(nvlist_add_string(resource, FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH, vd->vdev_path)); if (vd->vdev_devid != NULL) VERIFY0(nvlist_add_string(resource, FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID, vd->vdev_devid)); if (vd->vdev_fru != NULL) VERIFY0(nvlist_add_string(resource, FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU, vd->vdev_fru)); if (vd->vdev_enc_sysfs_path != NULL) VERIFY0(nvlist_add_string(resource, FM_EREPORT_PAYLOAD_ZFS_VDEV_ENC_SYSFS_PATH, vd->vdev_enc_sysfs_path)); } /* also copy any optional payload data */ if (aux) { nvpair_t *elem = NULL; while ((elem = nvlist_next_nvpair(aux, elem)) != NULL) (void) nvlist_add_nvpair(resource, elem); } #endif return (resource); } static void zfs_post_common(spa_t *spa, vdev_t *vd, const char *type, const char *name, nvlist_t *aux) { #ifdef _KERNEL nvlist_t *resource; resource = zfs_event_create(spa, vd, type, name, aux); if (resource) zfs_zevent_post(resource, NULL, zfs_zevent_post_cb); #endif } /* * The 'resource.fs.zfs.removed' event is an internal signal that the given vdev * has been removed from the system. This will cause the DE to ignore any * recent I/O errors, inferring that they are due to the asynchronous device * removal. */ void zfs_post_remove(spa_t *spa, vdev_t *vd) { zfs_post_common(spa, vd, FM_RSRC_CLASS, FM_RESOURCE_REMOVED, NULL); } /* * The 'resource.fs.zfs.autoreplace' event is an internal signal that the pool * has the 'autoreplace' property set, and therefore any broken vdevs will be * handled by higher level logic, and no vdev fault should be generated. */ void zfs_post_autoreplace(spa_t *spa, vdev_t *vd) { zfs_post_common(spa, vd, FM_RSRC_CLASS, FM_RESOURCE_AUTOREPLACE, NULL); } /* * The 'resource.fs.zfs.statechange' event is an internal signal that the * given vdev has transitioned its state to DEGRADED or HEALTHY. This will * cause the retire agent to repair any outstanding fault management cases * open because the device was not found (fault.fs.zfs.device). 
*/ void zfs_post_state_change(spa_t *spa, vdev_t *vd, uint64_t laststate) { #ifdef _KERNEL nvlist_t *aux; /* * Add optional supplemental keys to payload */ aux = fm_nvlist_create(NULL); if (vd && aux) { if (vd->vdev_physpath) { (void) nvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_VDEV_PHYSPATH, vd->vdev_physpath); } if (vd->vdev_enc_sysfs_path) { (void) nvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_VDEV_ENC_SYSFS_PATH, vd->vdev_enc_sysfs_path); } (void) nvlist_add_uint64(aux, FM_EREPORT_PAYLOAD_ZFS_VDEV_LASTSTATE, laststate); } zfs_post_common(spa, vd, FM_RSRC_CLASS, FM_RESOURCE_STATECHANGE, aux); if (aux) fm_nvlist_destroy(aux, FM_NVA_FREE); #endif } #ifdef _KERNEL void zfs_ereport_init(void) { mutex_init(&recent_events_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&recent_events_list, sizeof (recent_events_node_t), offsetof(recent_events_node_t, re_list_link)); avl_create(&recent_events_tree, recent_events_compare, sizeof (recent_events_node_t), offsetof(recent_events_node_t, re_tree_link)); } /* * This 'early' fini needs to run before zfs_fini() which on Linux waits * for the system_delay_taskq to drain. */ void zfs_ereport_taskq_fini(void) { mutex_enter(&recent_events_lock); if (recent_events_cleaner_tqid != 0) { taskq_cancel_id(system_delay_taskq, recent_events_cleaner_tqid); recent_events_cleaner_tqid = 0; } mutex_exit(&recent_events_lock); } void zfs_ereport_fini(void) { recent_events_node_t *entry; while ((entry = list_head(&recent_events_list)) != NULL) { avl_remove(&recent_events_tree, entry); list_remove(&recent_events_list, entry); kmem_free(entry, sizeof (*entry)); } avl_destroy(&recent_events_tree); list_destroy(&recent_events_list); mutex_destroy(&recent_events_lock); } EXPORT_SYMBOL(zfs_ereport_post); EXPORT_SYMBOL(zfs_ereport_is_valid); EXPORT_SYMBOL(zfs_ereport_post_checksum); EXPORT_SYMBOL(zfs_post_remove); EXPORT_SYMBOL(zfs_post_autoreplace); EXPORT_SYMBOL(zfs_post_state_change); ZFS_MODULE_PARAM(zfs_zevent, zfs_zevent_, retain_max, UINT, ZMOD_RW, "Maximum recent zevents records to retain for duplicate checking"); ZFS_MODULE_PARAM(zfs_zevent, zfs_zevent_, retain_expire_secs, UINT, ZMOD_RW, "Expiration time for recent zevents records"); #endif /* _KERNEL */ diff --git a/sys/contrib/openzfs/module/zfs/zfs_ioctl.c b/sys/contrib/openzfs/module/zfs/zfs_ioctl.c index 0e35fd069cbb..922253469fba 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_ioctl.c +++ b/sys/contrib/openzfs/module/zfs/zfs_ioctl.c @@ -1,7688 +1,7688 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Portions Copyright 2011 Martin Matuska * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved. 
* Portions Copyright 2012 Pawel Jakub Dawidek * Copyright (c) 2014, 2016 Joyent, Inc. All rights reserved. * Copyright 2016 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014, Joyent, Inc. All rights reserved. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Toomas Soome * Copyright (c) 2016 Actifio, Inc. All rights reserved. * Copyright (c) 2018, loli10K . All rights reserved. * Copyright 2017 RackTop Systems. * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. * Copyright (c) 2019 Datto Inc. * Copyright (c) 2019, 2020 by Christian Schwarz. All rights reserved. * Copyright (c) 2019, Klara Inc. * Copyright (c) 2019, Allan Jude */ /* * ZFS ioctls. * * This file handles the ioctls to /dev/zfs, used for configuring ZFS storage * pools and filesystems, e.g. with /sbin/zfs and /sbin/zpool. * * There are two ways that we handle ioctls: the legacy way where almost * all of the logic is in the ioctl callback, and the new way where most * of the marshalling is handled in the common entry point, zfsdev_ioctl(). * * Non-legacy ioctls should be registered by calling * zfs_ioctl_register() from zfs_ioctl_init(). The ioctl is invoked * from userland by lzc_ioctl(). * * The registration arguments are as follows: * * const char *name * The name of the ioctl. This is used for history logging. If the * ioctl returns successfully (the callback returns 0), and allow_log * is true, then a history log entry will be recorded with the input & * output nvlists. The log entry can be printed with "zpool history -i". * * zfs_ioc_t ioc * The ioctl request number, which userland will pass to ioctl(2). * We want newer versions of libzfs and libzfs_core to run against * existing zfs kernel modules (i.e. a deferred reboot after an update). * Therefore the ioctl numbers cannot change from release to release. * * zfs_secpolicy_func_t *secpolicy * This function will be called before the zfs_ioc_func_t, to * determine if this operation is permitted. It should return EPERM * on failure, and 0 on success. Checks include determining if the * dataset is visible in this zone, and if the user has either all * zfs privileges in the zone (SYS_MOUNT), or has been granted permission * to do this operation on this dataset with "zfs allow". * * zfs_ioc_namecheck_t namecheck * This specifies what to expect in the zfs_cmd_t:zc_name -- a pool * name, a dataset name, or nothing. If the name is not well-formed, * the ioctl will fail and the callback will not be called. * Therefore, the callback can assume that the name is well-formed * (e.g. is null-terminated, doesn't have more than one '@' character, * doesn't have invalid characters). * * zfs_ioc_poolcheck_t pool_check * This specifies requirements on the pool state. If the pool does * not meet them (is suspended or is readonly), the ioctl will fail * and the callback will not be called. If any checks are specified * (i.e. it is not POOL_CHECK_NONE), namecheck must not be NO_NAME. * Multiple checks can be or-ed together (e.g. POOL_CHECK_SUSPENDED | * POOL_CHECK_READONLY). * * zfs_ioc_key_t *nvl_keys * The list of expected/allowable innvl input keys. This list is used * to validate the nvlist input to the ioctl. 
* * boolean_t smush_outnvlist * If smush_outnvlist is true, then the output is presumed to be a * list of errors, and it will be "smushed" down to fit into the * caller's buffer, by removing some entries and replacing them with a * single "N_MORE_ERRORS" entry indicating how many were removed. See * nvlist_smush() for details. If smush_outnvlist is false, and the * outnvlist does not fit into the userland-provided buffer, then the * ioctl will fail with ENOMEM. * * zfs_ioc_func_t *func * The callback function that will perform the operation. * * The callback should return 0 on success, or an error number on * failure. If the function fails, the userland ioctl will return -1, * and errno will be set to the callback's return value. The callback * will be called with the following arguments: * * const char *name * The name of the pool or dataset to operate on, from * zfs_cmd_t:zc_name. The 'namecheck' argument specifies the * expected type (pool, dataset, or none). * * nvlist_t *innvl * The input nvlist, deserialized from zfs_cmd_t:zc_nvlist_src. Or * NULL if no input nvlist was provided. Changes to this nvlist are * ignored. If the input nvlist could not be deserialized, the * ioctl will fail and the callback will not be called. * * nvlist_t *outnvl * The output nvlist, initially empty. The callback can fill it in, * and it will be returned to userland by serializing it into * zfs_cmd_t:zc_nvlist_dst. If it is non-empty, and serialization * fails (e.g. because the caller didn't supply a large enough * buffer), then the overall ioctl will fail. See the * 'smush_nvlist' argument above for additional behaviors. * * There are two typical uses of the output nvlist: * - To return state, e.g. property values. In this case, * smush_outnvlist should be false. If the buffer was not large * enough, the caller will reallocate a larger buffer and try * the ioctl again. * * - To return multiple errors from an ioctl which makes on-disk * changes. In this case, smush_outnvlist should be true. * Ioctls which make on-disk modifications should generally not * use the outnvl if they succeed, because the caller can not * distinguish between the operation failing, and * deserialization failing. * * IOCTL Interface Errors * * The following ioctl input errors can be returned: * ZFS_ERR_IOC_CMD_UNAVAIL the ioctl number is not supported by kernel * ZFS_ERR_IOC_ARG_UNAVAIL an input argument is not supported by kernel * ZFS_ERR_IOC_ARG_REQUIRED a required input argument is missing * ZFS_ERR_IOC_ARG_BADTYPE an input argument has an invalid type */ #include #include #include -#include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_namecheck.h" #include "zfs_prop.h" #include "zfs_deleg.h" #include "zfs_comutil.h" #include #include #include kmutex_t zfsdev_state_lock; zfsdev_state_t *zfsdev_state_list; /* * Limit maximum nvlist size. We don't want users passing in insane values * for zc->zc_nvlist_src_size, since we will need to allocate that much memory. * Defaults to 0=auto which is handled by platform code. 
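 * For example, a receive or channel-program request whose packed input
 * nvlist claims a size well beyond this limit is refused up front instead
 * of the kernel attempting an allocation of that size.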
*/ unsigned long zfs_max_nvlist_src_size = 0; /* * When logging the output nvlist of an ioctl in the on-disk history, limit * the logged size to this many bytes. This must be less then DMU_MAX_ACCESS. * This applies primarily to zfs_ioc_channel_program(). */ unsigned long zfs_history_output_max = 1024 * 1024; uint_t zfs_fsyncer_key; uint_t zfs_allow_log_key; /* DATA_TYPE_ANY is used when zkey_type can vary. */ #define DATA_TYPE_ANY DATA_TYPE_UNKNOWN typedef struct zfs_ioc_vec { zfs_ioc_legacy_func_t *zvec_legacy_func; zfs_ioc_func_t *zvec_func; zfs_secpolicy_func_t *zvec_secpolicy; zfs_ioc_namecheck_t zvec_namecheck; boolean_t zvec_allow_log; zfs_ioc_poolcheck_t zvec_pool_check; boolean_t zvec_smush_outnvlist; const char *zvec_name; const zfs_ioc_key_t *zvec_nvl_keys; size_t zvec_nvl_key_count; } zfs_ioc_vec_t; /* This array is indexed by zfs_userquota_prop_t */ static const char *userquota_perms[] = { ZFS_DELEG_PERM_USERUSED, ZFS_DELEG_PERM_USERQUOTA, ZFS_DELEG_PERM_GROUPUSED, ZFS_DELEG_PERM_GROUPQUOTA, ZFS_DELEG_PERM_USEROBJUSED, ZFS_DELEG_PERM_USEROBJQUOTA, ZFS_DELEG_PERM_GROUPOBJUSED, ZFS_DELEG_PERM_GROUPOBJQUOTA, ZFS_DELEG_PERM_PROJECTUSED, ZFS_DELEG_PERM_PROJECTQUOTA, ZFS_DELEG_PERM_PROJECTOBJUSED, ZFS_DELEG_PERM_PROJECTOBJQUOTA, }; static int zfs_ioc_userspace_upgrade(zfs_cmd_t *zc); static int zfs_ioc_id_quota_upgrade(zfs_cmd_t *zc); static int zfs_check_settable(const char *name, nvpair_t *property, cred_t *cr); static int zfs_check_clearable(const char *dataset, nvlist_t *props, nvlist_t **errors); static int zfs_fill_zplprops_root(uint64_t, nvlist_t *, nvlist_t *, boolean_t *); int zfs_set_prop_nvlist(const char *, zprop_source_t, nvlist_t *, nvlist_t *); static int get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp); static void history_str_free(char *buf) { kmem_free(buf, HIS_MAX_RECORD_LEN); } static char * history_str_get(zfs_cmd_t *zc) { char *buf; if (zc->zc_history == 0) return (NULL); buf = kmem_alloc(HIS_MAX_RECORD_LEN, KM_SLEEP); if (copyinstr((void *)(uintptr_t)zc->zc_history, buf, HIS_MAX_RECORD_LEN, NULL) != 0) { history_str_free(buf); return (NULL); } buf[HIS_MAX_RECORD_LEN -1] = '\0'; return (buf); } /* * Return non-zero if the spa version is less than requested version. */ static int zfs_earlier_version(const char *name, int version) { spa_t *spa; if (spa_open(name, &spa, FTAG) == 0) { if (spa_version(spa) < version) { spa_close(spa, FTAG); return (1); } spa_close(spa, FTAG); } return (0); } /* * Return TRUE if the ZPL version is less than requested version. */ static boolean_t zpl_earlier_version(const char *name, int version) { objset_t *os; boolean_t rc = B_TRUE; if (dmu_objset_hold(name, FTAG, &os) == 0) { uint64_t zplversion; if (dmu_objset_type(os) != DMU_OST_ZFS) { dmu_objset_rele(os, FTAG); return (B_TRUE); } /* XXX reading from non-owned objset */ if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &zplversion) == 0) rc = zplversion < version; dmu_objset_rele(os, FTAG); } return (rc); } static void zfs_log_history(zfs_cmd_t *zc) { spa_t *spa; char *buf; if ((buf = history_str_get(zc)) == NULL) return; if (spa_open(zc->zc_name, &spa, FTAG) == 0) { if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY) (void) spa_history_log(spa, buf); spa_close(spa, FTAG); } history_str_free(buf); } /* * Policy for top-level read operations (list pools). Requires no privileges, * and can be used in the local zone, as there is no associated dataset. 
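 * This is the policy used for simple pool queries (for example the config
 * and stat lookups behind "zpool list"), which is why those work for
 * unprivileged users.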
*/ /* ARGSUSED */ static int zfs_secpolicy_none(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (0); } /* * Policy for dataset read operations (list children, get statistics). Requires * no privileges, but must be visible in the local zone. */ /* ARGSUSED */ static int zfs_secpolicy_read(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { if (INGLOBALZONE(curproc) || zone_dataset_visible(zc->zc_name, NULL)) return (0); return (SET_ERROR(ENOENT)); } static int zfs_dozonecheck_impl(const char *dataset, uint64_t zoned, cred_t *cr) { int writable = 1; /* * The dataset must be visible by this zone -- check this first * so they don't see EPERM on something they shouldn't know about. */ if (!INGLOBALZONE(curproc) && !zone_dataset_visible(dataset, &writable)) return (SET_ERROR(ENOENT)); if (INGLOBALZONE(curproc)) { /* * If the fs is zoned, only root can access it from the * global zone. */ if (secpolicy_zfs(cr) && zoned) return (SET_ERROR(EPERM)); } else { /* * If we are in a local zone, the 'zoned' property must be set. */ if (!zoned) return (SET_ERROR(EPERM)); /* must be writable by this zone */ if (!writable) return (SET_ERROR(EPERM)); } return (0); } static int zfs_dozonecheck(const char *dataset, cred_t *cr) { uint64_t zoned; if (dsl_prop_get_integer(dataset, zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL)) return (SET_ERROR(ENOENT)); return (zfs_dozonecheck_impl(dataset, zoned, cr)); } static int zfs_dozonecheck_ds(const char *dataset, dsl_dataset_t *ds, cred_t *cr) { uint64_t zoned; if (dsl_prop_get_int_ds(ds, zfs_prop_to_name(ZFS_PROP_ZONED), &zoned)) return (SET_ERROR(ENOENT)); return (zfs_dozonecheck_impl(dataset, zoned, cr)); } static int zfs_secpolicy_write_perms_ds(const char *name, dsl_dataset_t *ds, const char *perm, cred_t *cr) { int error; error = zfs_dozonecheck_ds(name, ds, cr); if (error == 0) { error = secpolicy_zfs(cr); if (error != 0) error = dsl_deleg_access_impl(ds, perm, cr); } return (error); } static int zfs_secpolicy_write_perms(const char *name, const char *perm, cred_t *cr) { int error; dsl_dataset_t *ds; dsl_pool_t *dp; /* * First do a quick check for root in the global zone, which * is allowed to do all write_perms. This ensures that zfs_ioc_* * will get to handle nonexistent datasets. */ if (INGLOBALZONE(curproc) && secpolicy_zfs(cr) == 0) return (0); error = dsl_pool_hold(name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, name, FTAG, &ds); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } error = zfs_secpolicy_write_perms_ds(name, ds, perm, cr); dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); return (error); } /* * Policy for setting the security label property. * * Returns 0 for success, non-zero for access and other errors. */ static int zfs_set_slabel_policy(const char *name, const char *strval, cred_t *cr) { #ifdef HAVE_MLSLABEL char ds_hexsl[MAXNAMELEN]; bslabel_t ds_sl, new_sl; boolean_t new_default = FALSE; uint64_t zoned; int needed_priv = -1; int error; /* First get the existing dataset label. */ error = dsl_prop_get(name, zfs_prop_to_name(ZFS_PROP_MLSLABEL), 1, sizeof (ds_hexsl), &ds_hexsl, NULL); if (error != 0) return (SET_ERROR(EPERM)); if (strcasecmp(strval, ZFS_MLSLABEL_DEFAULT) == 0) new_default = TRUE; /* The label must be translatable */ if (!new_default && (hexstr_to_label(strval, &new_sl) != 0)) return (SET_ERROR(EINVAL)); /* * In a non-global zone, disallow attempts to set a label that * doesn't match that of the zone; otherwise no other checks * are needed. 
*/ if (!INGLOBALZONE(curproc)) { if (new_default || !blequal(&new_sl, CR_SL(CRED()))) return (SET_ERROR(EPERM)); return (0); } /* * For global-zone datasets (i.e., those whose zoned property is * "off", verify that the specified new label is valid for the * global zone. */ if (dsl_prop_get_integer(name, zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL)) return (SET_ERROR(EPERM)); if (!zoned) { if (zfs_check_global_label(name, strval) != 0) return (SET_ERROR(EPERM)); } /* * If the existing dataset label is nondefault, check if the * dataset is mounted (label cannot be changed while mounted). * Get the zfsvfs_t; if there isn't one, then the dataset isn't * mounted (or isn't a dataset, doesn't exist, ...). */ if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) != 0) { objset_t *os; static const char *setsl_tag = "setsl_tag"; /* * Try to own the dataset; abort if there is any error, * (e.g., already mounted, in use, or other error). */ error = dmu_objset_own(name, DMU_OST_ZFS, B_TRUE, B_TRUE, setsl_tag, &os); if (error != 0) return (SET_ERROR(EPERM)); dmu_objset_disown(os, B_TRUE, setsl_tag); if (new_default) { needed_priv = PRIV_FILE_DOWNGRADE_SL; goto out_check; } if (hexstr_to_label(strval, &new_sl) != 0) return (SET_ERROR(EPERM)); if (blstrictdom(&ds_sl, &new_sl)) needed_priv = PRIV_FILE_DOWNGRADE_SL; else if (blstrictdom(&new_sl, &ds_sl)) needed_priv = PRIV_FILE_UPGRADE_SL; } else { /* dataset currently has a default label */ if (!new_default) needed_priv = PRIV_FILE_UPGRADE_SL; } out_check: if (needed_priv != -1) return (PRIV_POLICY(cr, needed_priv, B_FALSE, EPERM, NULL)); return (0); #else return (SET_ERROR(ENOTSUP)); #endif /* HAVE_MLSLABEL */ } static int zfs_secpolicy_setprop(const char *dsname, zfs_prop_t prop, nvpair_t *propval, cred_t *cr) { char *strval; /* * Check permissions for special properties. */ switch (prop) { default: break; case ZFS_PROP_ZONED: /* * Disallow setting of 'zoned' from within a local zone. */ if (!INGLOBALZONE(curproc)) return (SET_ERROR(EPERM)); break; case ZFS_PROP_QUOTA: case ZFS_PROP_FILESYSTEM_LIMIT: case ZFS_PROP_SNAPSHOT_LIMIT: if (!INGLOBALZONE(curproc)) { uint64_t zoned; char setpoint[ZFS_MAX_DATASET_NAME_LEN]; /* * Unprivileged users are allowed to modify the * limit on things *under* (ie. contained by) * the thing they own. */ if (dsl_prop_get_integer(dsname, zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, setpoint)) return (SET_ERROR(EPERM)); if (!zoned || strlen(dsname) <= strlen(setpoint)) return (SET_ERROR(EPERM)); } break; case ZFS_PROP_MLSLABEL: if (!is_system_labeled()) return (SET_ERROR(EPERM)); if (nvpair_value_string(propval, &strval) == 0) { int err; err = zfs_set_slabel_policy(dsname, strval, CRED()); if (err != 0) return (err); } break; } return (zfs_secpolicy_write_perms(dsname, zfs_prop_to_name(prop), cr)); } /* ARGSUSED */ static int zfs_secpolicy_set_fsacl(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { int error; error = zfs_dozonecheck(zc->zc_name, cr); if (error != 0) return (error); /* * permission to set permissions will be evaluated later in * dsl_deleg_can_allow() */ return (0); } /* ARGSUSED */ static int zfs_secpolicy_rollback(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_ROLLBACK, cr)); } /* ARGSUSED */ static int zfs_secpolicy_send(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { dsl_pool_t *dp; dsl_dataset_t *ds; const char *cp; int error; /* * Generate the current snapshot name from the given objsetid, then * use that name for the secpolicy/zone checks. 
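 * (zc_sendobj pins the snapshot by object number, so even if it was renamed
 * after the caller resolved it, the permission and zone checks run against
 * its current name.)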
*/ cp = strchr(zc->zc_name, '@'); if (cp == NULL) return (SET_ERROR(EINVAL)); error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &ds); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } dsl_dataset_name(ds, zc->zc_name); error = zfs_secpolicy_write_perms_ds(zc->zc_name, ds, ZFS_DELEG_PERM_SEND, cr); dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); return (error); } /* ARGSUSED */ static int zfs_secpolicy_send_new(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_SEND, cr)); } static int zfs_secpolicy_share(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (SET_ERROR(ENOTSUP)); } static int zfs_secpolicy_smb_acl(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (SET_ERROR(ENOTSUP)); } static int zfs_get_parent(const char *datasetname, char *parent, int parentsize) { char *cp; /* * Remove the @bla or /bla from the end of the name to get the parent. */ (void) strncpy(parent, datasetname, parentsize); cp = strrchr(parent, '@'); if (cp != NULL) { cp[0] = '\0'; } else { cp = strrchr(parent, '/'); if (cp == NULL) return (SET_ERROR(ENOENT)); cp[0] = '\0'; } return (0); } int zfs_secpolicy_destroy_perms(const char *name, cred_t *cr) { int error; if ((error = zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_MOUNT, cr)) != 0) return (error); return (zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_DESTROY, cr)); } /* ARGSUSED */ static int zfs_secpolicy_destroy(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (zfs_secpolicy_destroy_perms(zc->zc_name, cr)); } /* * Destroying snapshots with delegated permissions requires * descendant mount and destroy permissions. */ /* ARGSUSED */ static int zfs_secpolicy_destroy_snaps(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { nvlist_t *snaps; nvpair_t *pair, *nextpair; int error = 0; snaps = fnvlist_lookup_nvlist(innvl, "snaps"); for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nextpair) { nextpair = nvlist_next_nvpair(snaps, pair); error = zfs_secpolicy_destroy_perms(nvpair_name(pair), cr); if (error == ENOENT) { /* * Ignore any snapshots that don't exist (we consider * them "already destroyed"). Remove the name from the * nvl here in case the snapshot is created between * now and when we try to destroy it (in which case * we don't want to destroy it since we haven't * checked for permission). 
*/ fnvlist_remove_nvpair(snaps, pair); error = 0; } if (error != 0) break; } return (error); } int zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr) { char parentname[ZFS_MAX_DATASET_NAME_LEN]; int error; if ((error = zfs_secpolicy_write_perms(from, ZFS_DELEG_PERM_RENAME, cr)) != 0) return (error); if ((error = zfs_secpolicy_write_perms(from, ZFS_DELEG_PERM_MOUNT, cr)) != 0) return (error); if ((error = zfs_get_parent(to, parentname, sizeof (parentname))) != 0) return (error); if ((error = zfs_secpolicy_write_perms(parentname, ZFS_DELEG_PERM_CREATE, cr)) != 0) return (error); if ((error = zfs_secpolicy_write_perms(parentname, ZFS_DELEG_PERM_MOUNT, cr)) != 0) return (error); return (error); } /* ARGSUSED */ static int zfs_secpolicy_rename(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (zfs_secpolicy_rename_perms(zc->zc_name, zc->zc_value, cr)); } /* ARGSUSED */ static int zfs_secpolicy_promote(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { dsl_pool_t *dp; dsl_dataset_t *clone; int error; error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_PROMOTE, cr); if (error != 0) return (error); error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &clone); if (error == 0) { char parentname[ZFS_MAX_DATASET_NAME_LEN]; dsl_dataset_t *origin = NULL; dsl_dir_t *dd; dd = clone->ds_dir; error = dsl_dataset_hold_obj(dd->dd_pool, dsl_dir_phys(dd)->dd_origin_obj, FTAG, &origin); if (error != 0) { dsl_dataset_rele(clone, FTAG); dsl_pool_rele(dp, FTAG); return (error); } error = zfs_secpolicy_write_perms_ds(zc->zc_name, clone, ZFS_DELEG_PERM_MOUNT, cr); dsl_dataset_name(origin, parentname); if (error == 0) { error = zfs_secpolicy_write_perms_ds(parentname, origin, ZFS_DELEG_PERM_PROMOTE, cr); } dsl_dataset_rele(clone, FTAG); dsl_dataset_rele(origin, FTAG); } dsl_pool_rele(dp, FTAG); return (error); } /* ARGSUSED */ static int zfs_secpolicy_recv(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { int error; if ((error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_RECEIVE, cr)) != 0) return (error); if ((error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_MOUNT, cr)) != 0) return (error); return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_CREATE, cr)); } /* ARGSUSED */ static int zfs_secpolicy_recv_new(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (zfs_secpolicy_recv(zc, innvl, cr)); } int zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr) { return (zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_SNAPSHOT, cr)); } /* * Check for permission to create each snapshot in the nvlist. */ /* ARGSUSED */ static int zfs_secpolicy_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { nvlist_t *snaps; int error = 0; nvpair_t *pair; snaps = fnvlist_lookup_nvlist(innvl, "snaps"); for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(snaps, pair)) { char *name = nvpair_name(pair); char *atp = strchr(name, '@'); if (atp == NULL) { error = SET_ERROR(EINVAL); break; } *atp = '\0'; error = zfs_secpolicy_snapshot_perms(name, cr); *atp = '@'; if (error != 0) break; } return (error); } /* * Check for permission to create each bookmark in the nvlist. 
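 * Each entry is named like "pool/fs#bookmark"; the '#' is temporarily
 * replaced with a NUL so the ZFS_DELEG_PERM_BOOKMARK check runs against the
 * containing filesystem, then restored afterwards.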
*/ /* ARGSUSED */ static int zfs_secpolicy_bookmark(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { int error = 0; for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) { char *name = nvpair_name(pair); char *hashp = strchr(name, '#'); if (hashp == NULL) { error = SET_ERROR(EINVAL); break; } *hashp = '\0'; error = zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_BOOKMARK, cr); *hashp = '#'; if (error != 0) break; } return (error); } /* ARGSUSED */ static int zfs_secpolicy_destroy_bookmarks(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { nvpair_t *pair, *nextpair; int error = 0; for (pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; pair = nextpair) { char *name = nvpair_name(pair); char *hashp = strchr(name, '#'); nextpair = nvlist_next_nvpair(innvl, pair); if (hashp == NULL) { error = SET_ERROR(EINVAL); break; } *hashp = '\0'; error = zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_DESTROY, cr); *hashp = '#'; if (error == ENOENT) { /* * Ignore any filesystems that don't exist (we consider * their bookmarks "already destroyed"). Remove * the name from the nvl here in case the filesystem * is created between now and when we try to destroy * the bookmark (in which case we don't want to * destroy it since we haven't checked for permission). */ fnvlist_remove_nvpair(innvl, pair); error = 0; } if (error != 0) break; } return (error); } /* ARGSUSED */ static int zfs_secpolicy_log_history(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { /* * Even root must have a proper TSD so that we know what pool * to log to. */ if (tsd_get(zfs_allow_log_key) == NULL) return (SET_ERROR(EPERM)); return (0); } static int zfs_secpolicy_create_clone(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { char parentname[ZFS_MAX_DATASET_NAME_LEN]; int error; char *origin; if ((error = zfs_get_parent(zc->zc_name, parentname, sizeof (parentname))) != 0) return (error); if (nvlist_lookup_string(innvl, "origin", &origin) == 0 && (error = zfs_secpolicy_write_perms(origin, ZFS_DELEG_PERM_CLONE, cr)) != 0) return (error); if ((error = zfs_secpolicy_write_perms(parentname, ZFS_DELEG_PERM_CREATE, cr)) != 0) return (error); return (zfs_secpolicy_write_perms(parentname, ZFS_DELEG_PERM_MOUNT, cr)); } /* * Policy for pool operations - create/destroy pools, add vdevs, etc. Requires * SYS_CONFIG privilege, which is not available in a local zone. */ /* ARGSUSED */ int zfs_secpolicy_config(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { if (secpolicy_sys_config(cr, B_FALSE) != 0) return (SET_ERROR(EPERM)); return (0); } /* * Policy for object to name lookups. */ /* ARGSUSED */ static int zfs_secpolicy_diff(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { int error; if ((error = secpolicy_sys_config(cr, B_FALSE)) == 0) return (0); error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_DIFF, cr); return (error); } /* * Policy for fault injection. Requires all privileges. 
*/ /* ARGSUSED */ static int zfs_secpolicy_inject(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (secpolicy_zinject(cr)); } /* ARGSUSED */ static int zfs_secpolicy_inherit_prop(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { zfs_prop_t prop = zfs_name_to_prop(zc->zc_value); if (prop == ZPROP_INVAL) { if (!zfs_prop_user(zc->zc_value)) return (SET_ERROR(EINVAL)); return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_USERPROP, cr)); } else { return (zfs_secpolicy_setprop(zc->zc_name, prop, NULL, cr)); } } static int zfs_secpolicy_userspace_one(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { int err = zfs_secpolicy_read(zc, innvl, cr); if (err) return (err); if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS) return (SET_ERROR(EINVAL)); if (zc->zc_value[0] == 0) { /* * They are asking about a posix uid/gid. If it's * themself, allow it. */ if (zc->zc_objset_type == ZFS_PROP_USERUSED || zc->zc_objset_type == ZFS_PROP_USERQUOTA || zc->zc_objset_type == ZFS_PROP_USEROBJUSED || zc->zc_objset_type == ZFS_PROP_USEROBJQUOTA) { if (zc->zc_guid == crgetuid(cr)) return (0); } else if (zc->zc_objset_type == ZFS_PROP_GROUPUSED || zc->zc_objset_type == ZFS_PROP_GROUPQUOTA || zc->zc_objset_type == ZFS_PROP_GROUPOBJUSED || zc->zc_objset_type == ZFS_PROP_GROUPOBJQUOTA) { if (groupmember(zc->zc_guid, cr)) return (0); } /* else is for project quota/used */ } return (zfs_secpolicy_write_perms(zc->zc_name, userquota_perms[zc->zc_objset_type], cr)); } static int zfs_secpolicy_userspace_many(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { int err = zfs_secpolicy_read(zc, innvl, cr); if (err) return (err); if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS) return (SET_ERROR(EINVAL)); return (zfs_secpolicy_write_perms(zc->zc_name, userquota_perms[zc->zc_objset_type], cr)); } /* ARGSUSED */ static int zfs_secpolicy_userspace_upgrade(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (zfs_secpolicy_setprop(zc->zc_name, ZFS_PROP_VERSION, NULL, cr)); } /* ARGSUSED */ static int zfs_secpolicy_hold(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { nvpair_t *pair; nvlist_t *holds; int error; holds = fnvlist_lookup_nvlist(innvl, "holds"); for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL; pair = nvlist_next_nvpair(holds, pair)) { char fsname[ZFS_MAX_DATASET_NAME_LEN]; error = dmu_fsname(nvpair_name(pair), fsname); if (error != 0) return (error); error = zfs_secpolicy_write_perms(fsname, ZFS_DELEG_PERM_HOLD, cr); if (error != 0) return (error); } return (0); } /* ARGSUSED */ static int zfs_secpolicy_release(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { nvpair_t *pair; int error; for (pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) { char fsname[ZFS_MAX_DATASET_NAME_LEN]; error = dmu_fsname(nvpair_name(pair), fsname); if (error != 0) return (error); error = zfs_secpolicy_write_perms(fsname, ZFS_DELEG_PERM_RELEASE, cr); if (error != 0) return (error); } return (0); } /* * Policy for allowing temporary snapshots to be taken or released */ static int zfs_secpolicy_tmp_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { /* * A temporary snapshot is the same as a snapshot, * hold, destroy and release all rolled into one. * Delegated diff alone is sufficient that we allow this. 
*/ int error; if ((error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_DIFF, cr)) == 0) return (0); error = zfs_secpolicy_snapshot_perms(zc->zc_name, cr); if (innvl != NULL) { if (error == 0) error = zfs_secpolicy_hold(zc, innvl, cr); if (error == 0) error = zfs_secpolicy_release(zc, innvl, cr); if (error == 0) error = zfs_secpolicy_destroy(zc, innvl, cr); } return (error); } static int zfs_secpolicy_load_key(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_LOAD_KEY, cr)); } static int zfs_secpolicy_change_key(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_CHANGE_KEY, cr)); } /* * Returns the nvlist as specified by the user in the zfs_cmd_t. */ static int get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp) { char *packed; int error; nvlist_t *list = NULL; /* * Read in and unpack the user-supplied nvlist. */ if (size == 0) return (SET_ERROR(EINVAL)); packed = vmem_alloc(size, KM_SLEEP); if ((error = ddi_copyin((void *)(uintptr_t)nvl, packed, size, iflag)) != 0) { vmem_free(packed, size); return (SET_ERROR(EFAULT)); } if ((error = nvlist_unpack(packed, size, &list, 0)) != 0) { vmem_free(packed, size); return (error); } vmem_free(packed, size); *nvp = list; return (0); } /* * Reduce the size of this nvlist until it can be serialized in 'max' bytes. * Entries will be removed from the end of the nvlist, and one int32 entry * named "N_MORE_ERRORS" will be added indicating how many entries were * removed. */ static int nvlist_smush(nvlist_t *errors, size_t max) { size_t size; size = fnvlist_size(errors); if (size > max) { nvpair_t *more_errors; int n = 0; if (max < 1024) return (SET_ERROR(ENOMEM)); fnvlist_add_int32(errors, ZPROP_N_MORE_ERRORS, 0); more_errors = nvlist_prev_nvpair(errors, NULL); do { nvpair_t *pair = nvlist_prev_nvpair(errors, more_errors); fnvlist_remove_nvpair(errors, pair); n++; size = fnvlist_size(errors); } while (size > max); fnvlist_remove_nvpair(errors, more_errors); fnvlist_add_int32(errors, ZPROP_N_MORE_ERRORS, n); ASSERT3U(fnvlist_size(errors), <=, max); } return (0); } static int put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl) { char *packed = NULL; int error = 0; size_t size; size = fnvlist_size(nvl); if (size > zc->zc_nvlist_dst_size) { error = SET_ERROR(ENOMEM); } else { packed = fnvlist_pack(nvl, &size); if (ddi_copyout(packed, (void *)(uintptr_t)zc->zc_nvlist_dst, size, zc->zc_iflags) != 0) error = SET_ERROR(EFAULT); fnvlist_pack_free(packed, size); } zc->zc_nvlist_dst_size = size; zc->zc_nvlist_dst_filled = B_TRUE; return (error); } int getzfsvfs_impl(objset_t *os, zfsvfs_t **zfvp) { int error = 0; if (dmu_objset_type(os) != DMU_OST_ZFS) { return (SET_ERROR(EINVAL)); } mutex_enter(&os->os_user_ptr_lock); *zfvp = dmu_objset_get_user(os); /* bump s_active only when non-zero to prevent umount race */ error = zfs_vfs_ref(zfvp); mutex_exit(&os->os_user_ptr_lock); return (error); } int getzfsvfs(const char *dsname, zfsvfs_t **zfvp) { objset_t *os; int error; error = dmu_objset_hold(dsname, FTAG, &os); if (error != 0) return (error); error = getzfsvfs_impl(os, zfvp); dmu_objset_rele(os, FTAG); return (error); } /* * Find a zfsvfs_t for a mounted filesystem, or create our own, in which * case its z_sb will be NULL, and it will be opened as the owner. * If 'writer' is set, the z_teardown_lock will be held for RW_WRITER, * which prevents all inode ops from running. 
*/ static int zfsvfs_hold(const char *name, void *tag, zfsvfs_t **zfvp, boolean_t writer) { int error = 0; if (getzfsvfs(name, zfvp) != 0) error = zfsvfs_create(name, B_FALSE, zfvp); if (error == 0) { if (writer) ZFS_TEARDOWN_ENTER_WRITE(*zfvp, tag); else ZFS_TEARDOWN_ENTER_READ(*zfvp, tag); if ((*zfvp)->z_unmounted) { /* * XXX we could probably try again, since the unmounting * thread should be just about to disassociate the * objset from the zfsvfs. */ ZFS_TEARDOWN_EXIT(*zfvp, tag); return (SET_ERROR(EBUSY)); } } return (error); } static void zfsvfs_rele(zfsvfs_t *zfsvfs, void *tag) { ZFS_TEARDOWN_EXIT(zfsvfs, tag); if (zfs_vfs_held(zfsvfs)) { zfs_vfs_rele(zfsvfs); } else { dmu_objset_disown(zfsvfs->z_os, B_TRUE, zfsvfs); zfsvfs_free(zfsvfs); } } static int zfs_ioc_pool_create(zfs_cmd_t *zc) { int error; nvlist_t *config, *props = NULL; nvlist_t *rootprops = NULL; nvlist_t *zplprops = NULL; dsl_crypto_params_t *dcp = NULL; const char *spa_name = zc->zc_name; boolean_t unload_wkey = B_TRUE; if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &config))) return (error); if (zc->zc_nvlist_src_size != 0 && (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &props))) { nvlist_free(config); return (error); } if (props) { nvlist_t *nvl = NULL; nvlist_t *hidden_args = NULL; uint64_t version = SPA_VERSION; char *tname; (void) nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION), &version); if (!SPA_VERSION_IS_SUPPORTED(version)) { error = SET_ERROR(EINVAL); goto pool_props_bad; } (void) nvlist_lookup_nvlist(props, ZPOOL_ROOTFS_PROPS, &nvl); if (nvl) { error = nvlist_dup(nvl, &rootprops, KM_SLEEP); if (error != 0) goto pool_props_bad; (void) nvlist_remove_all(props, ZPOOL_ROOTFS_PROPS); } (void) nvlist_lookup_nvlist(props, ZPOOL_HIDDEN_ARGS, &hidden_args); error = dsl_crypto_params_create_nvlist(DCP_CMD_NONE, rootprops, hidden_args, &dcp); if (error != 0) goto pool_props_bad; (void) nvlist_remove_all(props, ZPOOL_HIDDEN_ARGS); VERIFY(nvlist_alloc(&zplprops, NV_UNIQUE_NAME, KM_SLEEP) == 0); error = zfs_fill_zplprops_root(version, rootprops, zplprops, NULL); if (error != 0) goto pool_props_bad; if (nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_TNAME), &tname) == 0) spa_name = tname; } error = spa_create(zc->zc_name, config, props, zplprops, dcp); /* * Set the remaining root properties */ if (!error && (error = zfs_set_prop_nvlist(spa_name, ZPROP_SRC_LOCAL, rootprops, NULL)) != 0) { (void) spa_destroy(spa_name); unload_wkey = B_FALSE; /* spa_destroy() unloads wrapping keys */ } pool_props_bad: nvlist_free(rootprops); nvlist_free(zplprops); nvlist_free(config); nvlist_free(props); dsl_crypto_params_free(dcp, unload_wkey && !!error); return (error); } static int zfs_ioc_pool_destroy(zfs_cmd_t *zc) { int error; zfs_log_history(zc); error = spa_destroy(zc->zc_name); return (error); } static int zfs_ioc_pool_import(zfs_cmd_t *zc) { nvlist_t *config, *props = NULL; uint64_t guid; int error; if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &config)) != 0) return (error); if (zc->zc_nvlist_src_size != 0 && (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &props))) { nvlist_free(config); return (error); } if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) != 0 || guid != zc->zc_guid) error = SET_ERROR(EINVAL); else error = spa_import(zc->zc_name, config, props, zc->zc_cookie); if (zc->zc_nvlist_dst != 0) { int err; if ((err = put_nvlist(zc, config)) 
!= 0) error = err; } nvlist_free(config); nvlist_free(props); return (error); } static int zfs_ioc_pool_export(zfs_cmd_t *zc) { int error; boolean_t force = (boolean_t)zc->zc_cookie; boolean_t hardforce = (boolean_t)zc->zc_guid; zfs_log_history(zc); error = spa_export(zc->zc_name, NULL, force, hardforce); return (error); } static int zfs_ioc_pool_configs(zfs_cmd_t *zc) { nvlist_t *configs; int error; if ((configs = spa_all_configs(&zc->zc_cookie)) == NULL) return (SET_ERROR(EEXIST)); error = put_nvlist(zc, configs); nvlist_free(configs); return (error); } /* * inputs: * zc_name name of the pool * * outputs: * zc_cookie real errno * zc_nvlist_dst config nvlist * zc_nvlist_dst_size size of config nvlist */ static int zfs_ioc_pool_stats(zfs_cmd_t *zc) { nvlist_t *config; int error; int ret = 0; error = spa_get_stats(zc->zc_name, &config, zc->zc_value, sizeof (zc->zc_value)); if (config != NULL) { ret = put_nvlist(zc, config); nvlist_free(config); /* * The config may be present even if 'error' is non-zero. * In this case we return success, and preserve the real errno * in 'zc_cookie'. */ zc->zc_cookie = error; } else { ret = error; } return (ret); } /* * Try to import the given pool, returning pool stats as appropriate so that * user land knows which devices are available and overall pool health. */ static int zfs_ioc_pool_tryimport(zfs_cmd_t *zc) { nvlist_t *tryconfig, *config = NULL; int error; if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &tryconfig)) != 0) return (error); config = spa_tryimport(tryconfig); nvlist_free(tryconfig); if (config == NULL) return (SET_ERROR(EINVAL)); error = put_nvlist(zc, config); nvlist_free(config); return (error); } /* * inputs: * zc_name name of the pool * zc_cookie scan func (pool_scan_func_t) * zc_flags scrub pause/resume flag (pool_scrub_cmd_t) */ static int zfs_ioc_pool_scan(zfs_cmd_t *zc) { spa_t *spa; int error; if (zc->zc_flags >= POOL_SCRUB_FLAGS_END) return (SET_ERROR(EINVAL)); if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); if (zc->zc_flags == POOL_SCRUB_PAUSE) error = spa_scrub_pause_resume(spa, POOL_SCRUB_PAUSE); else if (zc->zc_cookie == POOL_SCAN_NONE) error = spa_scan_stop(spa); else error = spa_scan(spa, zc->zc_cookie); spa_close(spa, FTAG); return (error); } static int zfs_ioc_pool_freeze(zfs_cmd_t *zc) { spa_t *spa; int error; error = spa_open(zc->zc_name, &spa, FTAG); if (error == 0) { spa_freeze(spa); spa_close(spa, FTAG); } return (error); } static int zfs_ioc_pool_upgrade(zfs_cmd_t *zc) { spa_t *spa; int error; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); if (zc->zc_cookie < spa_version(spa) || !SPA_VERSION_IS_SUPPORTED(zc->zc_cookie)) { spa_close(spa, FTAG); return (SET_ERROR(EINVAL)); } spa_upgrade(spa, zc->zc_cookie); spa_close(spa, FTAG); return (error); } static int zfs_ioc_pool_get_history(zfs_cmd_t *zc) { spa_t *spa; char *hist_buf; uint64_t size; int error; if ((size = zc->zc_history_len) == 0) return (SET_ERROR(EINVAL)); if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } hist_buf = vmem_alloc(size, KM_SLEEP); if ((error = spa_history_get(spa, &zc->zc_history_offset, &zc->zc_history_len, hist_buf)) == 0) { error = ddi_copyout(hist_buf, (void *)(uintptr_t)zc->zc_history, zc->zc_history_len, zc->zc_iflags); } spa_close(spa, FTAG); vmem_free(hist_buf, size); return (error); } static int zfs_ioc_pool_reguid(zfs_cmd_t *zc) { 
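	/*
	 * Assign the pool a freshly generated GUID via spa_change_guid();
	 * this is what backs "zpool reguid".
	 */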
spa_t *spa; int error; error = spa_open(zc->zc_name, &spa, FTAG); if (error == 0) { error = spa_change_guid(spa); spa_close(spa, FTAG); } return (error); } static int zfs_ioc_dsobj_to_dsname(zfs_cmd_t *zc) { return (dsl_dsobj_to_dsname(zc->zc_name, zc->zc_obj, zc->zc_value)); } /* * inputs: * zc_name name of filesystem * zc_obj object to find * * outputs: * zc_value name of object */ static int zfs_ioc_obj_to_path(zfs_cmd_t *zc) { objset_t *os; int error; /* XXX reading from objset not owned */ if ((error = dmu_objset_hold_flags(zc->zc_name, B_TRUE, FTAG, &os)) != 0) return (error); if (dmu_objset_type(os) != DMU_OST_ZFS) { dmu_objset_rele_flags(os, B_TRUE, FTAG); return (SET_ERROR(EINVAL)); } error = zfs_obj_to_path(os, zc->zc_obj, zc->zc_value, sizeof (zc->zc_value)); dmu_objset_rele_flags(os, B_TRUE, FTAG); return (error); } /* * inputs: * zc_name name of filesystem * zc_obj object to find * * outputs: * zc_stat stats on object * zc_value path to object */ static int zfs_ioc_obj_to_stats(zfs_cmd_t *zc) { objset_t *os; int error; /* XXX reading from objset not owned */ if ((error = dmu_objset_hold_flags(zc->zc_name, B_TRUE, FTAG, &os)) != 0) return (error); if (dmu_objset_type(os) != DMU_OST_ZFS) { dmu_objset_rele_flags(os, B_TRUE, FTAG); return (SET_ERROR(EINVAL)); } error = zfs_obj_to_stats(os, zc->zc_obj, &zc->zc_stat, zc->zc_value, sizeof (zc->zc_value)); dmu_objset_rele_flags(os, B_TRUE, FTAG); return (error); } static int zfs_ioc_vdev_add(zfs_cmd_t *zc) { spa_t *spa; int error; nvlist_t *config; error = spa_open(zc->zc_name, &spa, FTAG); if (error != 0) return (error); error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &config); if (error == 0) { error = spa_vdev_add(spa, config); nvlist_free(config); } spa_close(spa, FTAG); return (error); } /* * inputs: * zc_name name of the pool * zc_guid guid of vdev to remove * zc_cookie cancel removal */ static int zfs_ioc_vdev_remove(zfs_cmd_t *zc) { spa_t *spa; int error; error = spa_open(zc->zc_name, &spa, FTAG); if (error != 0) return (error); if (zc->zc_cookie != 0) { error = spa_vdev_remove_cancel(spa); } else { error = spa_vdev_remove(spa, zc->zc_guid, B_FALSE); } spa_close(spa, FTAG); return (error); } static int zfs_ioc_vdev_set_state(zfs_cmd_t *zc) { spa_t *spa; int error; vdev_state_t newstate = VDEV_STATE_UNKNOWN; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); switch (zc->zc_cookie) { case VDEV_STATE_ONLINE: error = vdev_online(spa, zc->zc_guid, zc->zc_obj, &newstate); break; case VDEV_STATE_OFFLINE: error = vdev_offline(spa, zc->zc_guid, zc->zc_obj); break; case VDEV_STATE_FAULTED: if (zc->zc_obj != VDEV_AUX_ERR_EXCEEDED && zc->zc_obj != VDEV_AUX_EXTERNAL && zc->zc_obj != VDEV_AUX_EXTERNAL_PERSIST) zc->zc_obj = VDEV_AUX_ERR_EXCEEDED; error = vdev_fault(spa, zc->zc_guid, zc->zc_obj); break; case VDEV_STATE_DEGRADED: if (zc->zc_obj != VDEV_AUX_ERR_EXCEEDED && zc->zc_obj != VDEV_AUX_EXTERNAL) zc->zc_obj = VDEV_AUX_ERR_EXCEEDED; error = vdev_degrade(spa, zc->zc_guid, zc->zc_obj); break; default: error = SET_ERROR(EINVAL); } zc->zc_cookie = newstate; spa_close(spa, FTAG); return (error); } static int zfs_ioc_vdev_attach(zfs_cmd_t *zc) { spa_t *spa; nvlist_t *config; int replacing = zc->zc_cookie; int rebuild = zc->zc_simple; int error; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &config)) == 0) { error = spa_vdev_attach(spa, zc->zc_guid, config, replacing, rebuild); 
nvlist_free(config); } spa_close(spa, FTAG); return (error); } static int zfs_ioc_vdev_detach(zfs_cmd_t *zc) { spa_t *spa; int error; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); error = spa_vdev_detach(spa, zc->zc_guid, 0, B_FALSE); spa_close(spa, FTAG); return (error); } static int zfs_ioc_vdev_split(zfs_cmd_t *zc) { spa_t *spa; nvlist_t *config, *props = NULL; int error; boolean_t exp = !!(zc->zc_cookie & ZPOOL_EXPORT_AFTER_SPLIT); if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &config))) { spa_close(spa, FTAG); return (error); } if (zc->zc_nvlist_src_size != 0 && (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &props))) { spa_close(spa, FTAG); nvlist_free(config); return (error); } error = spa_vdev_split_mirror(spa, zc->zc_string, config, props, exp); spa_close(spa, FTAG); nvlist_free(config); nvlist_free(props); return (error); } static int zfs_ioc_vdev_setpath(zfs_cmd_t *zc) { spa_t *spa; const char *path = zc->zc_value; uint64_t guid = zc->zc_guid; int error; error = spa_open(zc->zc_name, &spa, FTAG); if (error != 0) return (error); error = spa_vdev_setpath(spa, guid, path); spa_close(spa, FTAG); return (error); } static int zfs_ioc_vdev_setfru(zfs_cmd_t *zc) { spa_t *spa; const char *fru = zc->zc_value; uint64_t guid = zc->zc_guid; int error; error = spa_open(zc->zc_name, &spa, FTAG); if (error != 0) return (error); error = spa_vdev_setfru(spa, guid, fru); spa_close(spa, FTAG); return (error); } static int zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os) { int error = 0; nvlist_t *nv; dmu_objset_fast_stat(os, &zc->zc_objset_stats); if (zc->zc_nvlist_dst != 0 && (error = dsl_prop_get_all(os, &nv)) == 0) { dmu_objset_stats(os, nv); /* * NB: zvol_get_stats() will read the objset contents, * which we aren't supposed to do with a * DS_MODE_USER hold, because it could be * inconsistent. So this is a bit of a workaround... * XXX reading without owning */ if (!zc->zc_objset_stats.dds_inconsistent && dmu_objset_type(os) == DMU_OST_ZVOL) { error = zvol_get_stats(os, nv); if (error == EIO) { nvlist_free(nv); return (error); } VERIFY0(error); } if (error == 0) error = put_nvlist(zc, nv); nvlist_free(nv); } return (error); } /* * inputs: * zc_name name of filesystem * zc_nvlist_dst_size size of buffer for property nvlist * * outputs: * zc_objset_stats stats * zc_nvlist_dst property nvlist * zc_nvlist_dst_size size of property nvlist */ static int zfs_ioc_objset_stats(zfs_cmd_t *zc) { objset_t *os; int error; error = dmu_objset_hold(zc->zc_name, FTAG, &os); if (error == 0) { error = zfs_ioc_objset_stats_impl(zc, os); dmu_objset_rele(os, FTAG); } return (error); } /* * inputs: * zc_name name of filesystem * zc_nvlist_dst_size size of buffer for property nvlist * * outputs: * zc_nvlist_dst received property nvlist * zc_nvlist_dst_size size of received property nvlist * * Gets received properties (distinct from local properties on or after * SPA_VERSION_RECVD_PROPS) for callers who want to differentiate received from * local property values. */ static int zfs_ioc_objset_recvd_props(zfs_cmd_t *zc) { int error = 0; nvlist_t *nv; /* * Without this check, we would return local property values if the * caller has not already received properties on or after * SPA_VERSION_RECVD_PROPS. 
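 * (Userland exposes this distinction as the "received" property source,
 * e.g. "zfs get -s received" and "zfs inherit -S" rely on it.)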
*/ if (!dsl_prop_get_hasrecvd(zc->zc_name)) return (SET_ERROR(ENOTSUP)); if (zc->zc_nvlist_dst != 0 && (error = dsl_prop_get_received(zc->zc_name, &nv)) == 0) { error = put_nvlist(zc, nv); nvlist_free(nv); } return (error); } static int nvl_add_zplprop(objset_t *os, nvlist_t *props, zfs_prop_t prop) { uint64_t value; int error; /* * zfs_get_zplprop() will either find a value or give us * the default value (if there is one). */ if ((error = zfs_get_zplprop(os, prop, &value)) != 0) return (error); VERIFY(nvlist_add_uint64(props, zfs_prop_to_name(prop), value) == 0); return (0); } /* * inputs: * zc_name name of filesystem * zc_nvlist_dst_size size of buffer for zpl property nvlist * * outputs: * zc_nvlist_dst zpl property nvlist * zc_nvlist_dst_size size of zpl property nvlist */ static int zfs_ioc_objset_zplprops(zfs_cmd_t *zc) { objset_t *os; int err; /* XXX reading without owning */ if ((err = dmu_objset_hold(zc->zc_name, FTAG, &os))) return (err); dmu_objset_fast_stat(os, &zc->zc_objset_stats); /* * NB: nvl_add_zplprop() will read the objset contents, * which we aren't supposed to do with a DS_MODE_USER * hold, because it could be inconsistent. */ if (zc->zc_nvlist_dst != 0 && !zc->zc_objset_stats.dds_inconsistent && dmu_objset_type(os) == DMU_OST_ZFS) { nvlist_t *nv; VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); if ((err = nvl_add_zplprop(os, nv, ZFS_PROP_VERSION)) == 0 && (err = nvl_add_zplprop(os, nv, ZFS_PROP_NORMALIZE)) == 0 && (err = nvl_add_zplprop(os, nv, ZFS_PROP_UTF8ONLY)) == 0 && (err = nvl_add_zplprop(os, nv, ZFS_PROP_CASE)) == 0) err = put_nvlist(zc, nv); nvlist_free(nv); } else { err = SET_ERROR(ENOENT); } dmu_objset_rele(os, FTAG); return (err); } /* * inputs: * zc_name name of filesystem * zc_cookie zap cursor * zc_nvlist_dst_size size of buffer for property nvlist * * outputs: * zc_name name of next filesystem * zc_cookie zap cursor * zc_objset_stats stats * zc_nvlist_dst property nvlist * zc_nvlist_dst_size size of property nvlist */ static int zfs_ioc_dataset_list_next(zfs_cmd_t *zc) { objset_t *os; int error; char *p; size_t orig_len = strlen(zc->zc_name); top: if ((error = dmu_objset_hold(zc->zc_name, FTAG, &os))) { if (error == ENOENT) error = SET_ERROR(ESRCH); return (error); } p = strrchr(zc->zc_name, '/'); if (p == NULL || p[1] != '\0') (void) strlcat(zc->zc_name, "/", sizeof (zc->zc_name)); p = zc->zc_name + strlen(zc->zc_name); do { error = dmu_dir_list_next(os, sizeof (zc->zc_name) - (p - zc->zc_name), p, NULL, &zc->zc_cookie); if (error == ENOENT) error = SET_ERROR(ESRCH); } while (error == 0 && zfs_dataset_name_hidden(zc->zc_name)); dmu_objset_rele(os, FTAG); /* * If it's an internal dataset (ie. with a '$' in its name), * don't try to get stats for it, otherwise we'll return ENOENT. */ if (error == 0 && strchr(zc->zc_name, '$') == NULL) { error = zfs_ioc_objset_stats(zc); /* fill in the stats */ if (error == ENOENT) { /* We lost a race with destroy, get the next one. 
*/ zc->zc_name[orig_len] = '\0'; goto top; } } return (error); } /* * inputs: * zc_name name of filesystem * zc_cookie zap cursor * zc_nvlist_src iteration range nvlist * zc_nvlist_src_size size of iteration range nvlist * * outputs: * zc_name name of next snapshot * zc_objset_stats stats * zc_nvlist_dst property nvlist * zc_nvlist_dst_size size of property nvlist */ static int zfs_ioc_snapshot_list_next(zfs_cmd_t *zc) { int error; objset_t *os, *ossnap; dsl_dataset_t *ds; uint64_t min_txg = 0, max_txg = 0; if (zc->zc_nvlist_src_size != 0) { nvlist_t *props = NULL; error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &props); if (error != 0) return (error); (void) nvlist_lookup_uint64(props, SNAP_ITER_MIN_TXG, &min_txg); (void) nvlist_lookup_uint64(props, SNAP_ITER_MAX_TXG, &max_txg); nvlist_free(props); } error = dmu_objset_hold(zc->zc_name, FTAG, &os); if (error != 0) { return (error == ENOENT ? SET_ERROR(ESRCH) : error); } /* * A dataset name of maximum length cannot have any snapshots, * so exit immediately. */ if (strlcat(zc->zc_name, "@", sizeof (zc->zc_name)) >= ZFS_MAX_DATASET_NAME_LEN) { dmu_objset_rele(os, FTAG); return (SET_ERROR(ESRCH)); } while (error == 0) { if (issig(JUSTLOOKING) && issig(FORREAL)) { error = SET_ERROR(EINTR); break; } error = dmu_snapshot_list_next(os, sizeof (zc->zc_name) - strlen(zc->zc_name), zc->zc_name + strlen(zc->zc_name), &zc->zc_obj, &zc->zc_cookie, NULL); if (error == ENOENT) { error = SET_ERROR(ESRCH); break; } else if (error != 0) { break; } error = dsl_dataset_hold_obj(dmu_objset_pool(os), zc->zc_obj, FTAG, &ds); if (error != 0) break; if ((min_txg != 0 && dsl_get_creationtxg(ds) < min_txg) || (max_txg != 0 && dsl_get_creationtxg(ds) > max_txg)) { dsl_dataset_rele(ds, FTAG); /* undo snapshot name append */ *(strchr(zc->zc_name, '@') + 1) = '\0'; /* skip snapshot */ continue; } if (zc->zc_simple) { dsl_dataset_rele(ds, FTAG); break; } if ((error = dmu_objset_from_ds(ds, &ossnap)) != 0) { dsl_dataset_rele(ds, FTAG); break; } if ((error = zfs_ioc_objset_stats_impl(zc, ossnap)) != 0) { dsl_dataset_rele(ds, FTAG); break; } dsl_dataset_rele(ds, FTAG); break; } dmu_objset_rele(os, FTAG); /* if we failed, undo the @ that we tacked on to zc_name */ if (error != 0) *strchr(zc->zc_name, '@') = '\0'; return (error); } static int zfs_prop_set_userquota(const char *dsname, nvpair_t *pair) { const char *propname = nvpair_name(pair); uint64_t *valary; unsigned int vallen; const char *dash, *domain; zfs_userquota_prop_t type; uint64_t rid; uint64_t quota; zfsvfs_t *zfsvfs; int err; if (nvpair_type(pair) == DATA_TYPE_NVLIST) { nvlist_t *attrs; VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); if (nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &pair) != 0) return (SET_ERROR(EINVAL)); } /* * A correctly constructed propname is encoded as * userquota@-. */ if ((dash = strchr(propname, '-')) == NULL || nvpair_value_uint64_array(pair, &valary, &vallen) != 0 || vallen != 3) return (SET_ERROR(EINVAL)); domain = dash + 1; type = valary[0]; rid = valary[1]; quota = valary[2]; err = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_FALSE); if (err == 0) { err = zfs_set_userquota(zfsvfs, type, domain, rid, quota); zfsvfs_rele(zfsvfs, FTAG); } return (err); } /* * If the named property is one that has a special function to set its value, * return 0 on success and a positive error code on failure; otherwise if it is * not one of the special properties handled by this function, return -1. 
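 * For example, ZFS_PROP_QUOTA is handled here by calling dsl_dir_set_quota(),
 * while a user property such as "org.example:note" is not special, so -1 is
 * returned and zfs_set_prop_nvlist() falls back to the generic
 * dsl_props_set() path.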
* * XXX: It would be better for callers of the property interface if we handled * these special cases in dsl_prop.c (in the dsl layer). */ static int zfs_prop_set_special(const char *dsname, zprop_source_t source, nvpair_t *pair) { const char *propname = nvpair_name(pair); zfs_prop_t prop = zfs_name_to_prop(propname); uint64_t intval = 0; const char *strval = NULL; int err = -1; if (prop == ZPROP_INVAL) { if (zfs_prop_userquota(propname)) return (zfs_prop_set_userquota(dsname, pair)); return (-1); } if (nvpair_type(pair) == DATA_TYPE_NVLIST) { nvlist_t *attrs; VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &pair) == 0); } /* all special properties are numeric except for keylocation */ if (zfs_prop_get_type(prop) == PROP_TYPE_STRING) { strval = fnvpair_value_string(pair); } else { intval = fnvpair_value_uint64(pair); } switch (prop) { case ZFS_PROP_QUOTA: err = dsl_dir_set_quota(dsname, source, intval); break; case ZFS_PROP_REFQUOTA: err = dsl_dataset_set_refquota(dsname, source, intval); break; case ZFS_PROP_FILESYSTEM_LIMIT: case ZFS_PROP_SNAPSHOT_LIMIT: if (intval == UINT64_MAX) { /* clearing the limit, just do it */ err = 0; } else { err = dsl_dir_activate_fs_ss_limit(dsname); } /* * Set err to -1 to force the zfs_set_prop_nvlist code down the * default path to set the value in the nvlist. */ if (err == 0) err = -1; break; case ZFS_PROP_KEYLOCATION: err = dsl_crypto_can_set_keylocation(dsname, strval); /* * Set err to -1 to force the zfs_set_prop_nvlist code down the * default path to set the value in the nvlist. */ if (err == 0) err = -1; break; case ZFS_PROP_RESERVATION: err = dsl_dir_set_reservation(dsname, source, intval); break; case ZFS_PROP_REFRESERVATION: err = dsl_dataset_set_refreservation(dsname, source, intval); break; case ZFS_PROP_COMPRESSION: err = dsl_dataset_set_compression(dsname, source, intval); /* * Set err to -1 to force the zfs_set_prop_nvlist code down the * default path to set the value in the nvlist. */ if (err == 0) err = -1; break; case ZFS_PROP_VOLSIZE: err = zvol_set_volsize(dsname, intval); break; case ZFS_PROP_SNAPDEV: err = zvol_set_snapdev(dsname, source, intval); break; case ZFS_PROP_VOLMODE: err = zvol_set_volmode(dsname, source, intval); break; case ZFS_PROP_VERSION: { zfsvfs_t *zfsvfs; if ((err = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_TRUE)) != 0) break; err = zfs_set_version(zfsvfs, intval); zfsvfs_rele(zfsvfs, FTAG); if (err == 0 && intval >= ZPL_VERSION_USERSPACE) { zfs_cmd_t *zc; zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP); (void) strlcpy(zc->zc_name, dsname, sizeof (zc->zc_name)); (void) zfs_ioc_userspace_upgrade(zc); (void) zfs_ioc_id_quota_upgrade(zc); kmem_free(zc, sizeof (zfs_cmd_t)); } break; } default: err = -1; } return (err); } /* * This function is best effort. If it fails to set any of the given properties, * it continues to set as many as it can and returns the last error * encountered. If the caller provides a non-NULL errlist, it will be filled in * with the list of names of all the properties that failed along with the * corresponding error numbers. * * If every property is set successfully, zero is returned and errlist is not * modified. 
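 * Illustrative in-kernel caller sketch (the dataset name and values are made
 * up, not part of this change):
 *
 *	nvlist_t *props = fnvlist_alloc();
 *	nvlist_t *errors = fnvlist_alloc();
 *	fnvlist_add_uint64(props, "quota", 1ULL << 30);
 *	fnvlist_add_string(props, "org.example:note", "hello");
 *	int err = zfs_set_prop_nvlist("tank/fs", ZPROP_SRC_LOCAL,
 *	    props, errors);
 *
 * On partial failure, 'errors' maps each failing property name to its errno
 * and 'err' is the last error encountered; both nvlists remain the caller's
 * to free.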
*/ int zfs_set_prop_nvlist(const char *dsname, zprop_source_t source, nvlist_t *nvl, nvlist_t *errlist) { nvpair_t *pair; nvpair_t *propval; int rv = 0; uint64_t intval; const char *strval; nvlist_t *genericnvl = fnvlist_alloc(); nvlist_t *retrynvl = fnvlist_alloc(); retry: pair = NULL; while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) { const char *propname = nvpair_name(pair); zfs_prop_t prop = zfs_name_to_prop(propname); int err = 0; /* decode the property value */ propval = pair; if (nvpair_type(pair) == DATA_TYPE_NVLIST) { nvlist_t *attrs; attrs = fnvpair_value_nvlist(pair); if (nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &propval) != 0) err = SET_ERROR(EINVAL); } /* Validate value type */ if (err == 0 && source == ZPROP_SRC_INHERITED) { /* inherited properties are expected to be booleans */ if (nvpair_type(propval) != DATA_TYPE_BOOLEAN) err = SET_ERROR(EINVAL); } else if (err == 0 && prop == ZPROP_INVAL) { if (zfs_prop_user(propname)) { if (nvpair_type(propval) != DATA_TYPE_STRING) err = SET_ERROR(EINVAL); } else if (zfs_prop_userquota(propname)) { if (nvpair_type(propval) != DATA_TYPE_UINT64_ARRAY) err = SET_ERROR(EINVAL); } else { err = SET_ERROR(EINVAL); } } else if (err == 0) { if (nvpair_type(propval) == DATA_TYPE_STRING) { if (zfs_prop_get_type(prop) != PROP_TYPE_STRING) err = SET_ERROR(EINVAL); } else if (nvpair_type(propval) == DATA_TYPE_UINT64) { const char *unused; intval = fnvpair_value_uint64(propval); switch (zfs_prop_get_type(prop)) { case PROP_TYPE_NUMBER: break; case PROP_TYPE_STRING: err = SET_ERROR(EINVAL); break; case PROP_TYPE_INDEX: if (zfs_prop_index_to_string(prop, intval, &unused) != 0) err = SET_ERROR(ZFS_ERR_BADPROP); break; default: cmn_err(CE_PANIC, "unknown property type"); } } else { err = SET_ERROR(EINVAL); } } /* Validate permissions */ if (err == 0) err = zfs_check_settable(dsname, pair, CRED()); if (err == 0) { if (source == ZPROP_SRC_INHERITED) err = -1; /* does not need special handling */ else err = zfs_prop_set_special(dsname, source, pair); if (err == -1) { /* * For better performance we build up a list of * properties to set in a single transaction. */ err = nvlist_add_nvpair(genericnvl, pair); } else if (err != 0 && nvl != retrynvl) { /* * This may be a spurious error caused by * receiving quota and reservation out of order. * Try again in a second pass. */ err = nvlist_add_nvpair(retrynvl, pair); } } if (err != 0) { if (errlist != NULL) fnvlist_add_int32(errlist, propname, err); rv = err; } } if (nvl != retrynvl && !nvlist_empty(retrynvl)) { nvl = retrynvl; goto retry; } if (!nvlist_empty(genericnvl) && dsl_props_set(dsname, source, genericnvl) != 0) { /* * If this fails, we still want to set as many properties as we * can, so try setting them individually. 
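 * (dsl_props_set() applies all of genericnvl at once; the loop below
 * retries each pair individually so that one bad property does not
 * prevent the others from being set.)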
*/ pair = NULL; while ((pair = nvlist_next_nvpair(genericnvl, pair)) != NULL) { const char *propname = nvpair_name(pair); int err = 0; propval = pair; if (nvpair_type(pair) == DATA_TYPE_NVLIST) { nvlist_t *attrs; attrs = fnvpair_value_nvlist(pair); propval = fnvlist_lookup_nvpair(attrs, ZPROP_VALUE); } if (nvpair_type(propval) == DATA_TYPE_STRING) { strval = fnvpair_value_string(propval); err = dsl_prop_set_string(dsname, propname, source, strval); } else if (nvpair_type(propval) == DATA_TYPE_BOOLEAN) { err = dsl_prop_inherit(dsname, propname, source); } else { intval = fnvpair_value_uint64(propval); err = dsl_prop_set_int(dsname, propname, source, intval); } if (err != 0) { if (errlist != NULL) { fnvlist_add_int32(errlist, propname, err); } rv = err; } } } nvlist_free(genericnvl); nvlist_free(retrynvl); return (rv); } /* * Check that all the properties are valid user properties. */ static int zfs_check_userprops(nvlist_t *nvl) { nvpair_t *pair = NULL; while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) { const char *propname = nvpair_name(pair); if (!zfs_prop_user(propname) || nvpair_type(pair) != DATA_TYPE_STRING) return (SET_ERROR(EINVAL)); if (strlen(propname) >= ZAP_MAXNAMELEN) return (SET_ERROR(ENAMETOOLONG)); if (strlen(fnvpair_value_string(pair)) >= ZAP_MAXVALUELEN) return (SET_ERROR(E2BIG)); } return (0); } static void props_skip(nvlist_t *props, nvlist_t *skipped, nvlist_t **newprops) { nvpair_t *pair; VERIFY(nvlist_alloc(newprops, NV_UNIQUE_NAME, KM_SLEEP) == 0); pair = NULL; while ((pair = nvlist_next_nvpair(props, pair)) != NULL) { if (nvlist_exists(skipped, nvpair_name(pair))) continue; VERIFY(nvlist_add_nvpair(*newprops, pair) == 0); } } static int clear_received_props(const char *dsname, nvlist_t *props, nvlist_t *skipped) { int err = 0; nvlist_t *cleared_props = NULL; props_skip(props, skipped, &cleared_props); if (!nvlist_empty(cleared_props)) { /* * Acts on local properties until the dataset has received * properties at least once on or after SPA_VERSION_RECVD_PROPS. */ zprop_source_t flags = (ZPROP_SRC_NONE | (dsl_prop_get_hasrecvd(dsname) ? ZPROP_SRC_RECEIVED : 0)); err = zfs_set_prop_nvlist(dsname, flags, cleared_props, NULL); } nvlist_free(cleared_props); return (err); } /* * inputs: * zc_name name of filesystem * zc_value name of property to set * zc_nvlist_src{_size} nvlist of properties to apply * zc_cookie received properties flag * * outputs: * zc_nvlist_dst{_size} error for each unapplied received property */ static int zfs_ioc_set_prop(zfs_cmd_t *zc) { nvlist_t *nvl; boolean_t received = zc->zc_cookie; zprop_source_t source = (received ? 
ZPROP_SRC_RECEIVED : ZPROP_SRC_LOCAL); nvlist_t *errors; int error; if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &nvl)) != 0) return (error); if (received) { nvlist_t *origprops; if (dsl_prop_get_received(zc->zc_name, &origprops) == 0) { (void) clear_received_props(zc->zc_name, origprops, nvl); nvlist_free(origprops); } error = dsl_prop_set_hasrecvd(zc->zc_name); } errors = fnvlist_alloc(); if (error == 0) error = zfs_set_prop_nvlist(zc->zc_name, source, nvl, errors); if (zc->zc_nvlist_dst != 0 && errors != NULL) { (void) put_nvlist(zc, errors); } nvlist_free(errors); nvlist_free(nvl); return (error); } /* * inputs: * zc_name name of filesystem * zc_value name of property to inherit * zc_cookie revert to received value if TRUE * * outputs: none */ static int zfs_ioc_inherit_prop(zfs_cmd_t *zc) { const char *propname = zc->zc_value; zfs_prop_t prop = zfs_name_to_prop(propname); boolean_t received = zc->zc_cookie; zprop_source_t source = (received ? ZPROP_SRC_NONE /* revert to received value, if any */ : ZPROP_SRC_INHERITED); /* explicitly inherit */ nvlist_t *dummy; nvpair_t *pair; zprop_type_t type; int err; if (!received) { /* * Only check this in the non-received case. We want to allow * 'inherit -S' to revert non-inheritable properties like quota * and reservation to the received or default values even though * they are not considered inheritable. */ if (prop != ZPROP_INVAL && !zfs_prop_inheritable(prop)) return (SET_ERROR(EINVAL)); } if (prop == ZPROP_INVAL) { if (!zfs_prop_user(propname)) return (SET_ERROR(EINVAL)); type = PROP_TYPE_STRING; } else if (prop == ZFS_PROP_VOLSIZE || prop == ZFS_PROP_VERSION) { return (SET_ERROR(EINVAL)); } else { type = zfs_prop_get_type(prop); } /* * zfs_prop_set_special() expects properties in the form of an * nvpair with type info. */ dummy = fnvlist_alloc(); switch (type) { case PROP_TYPE_STRING: VERIFY(0 == nvlist_add_string(dummy, propname, "")); break; case PROP_TYPE_NUMBER: case PROP_TYPE_INDEX: VERIFY(0 == nvlist_add_uint64(dummy, propname, 0)); break; default: err = SET_ERROR(EINVAL); goto errout; } pair = nvlist_next_nvpair(dummy, NULL); if (pair == NULL) { err = SET_ERROR(EINVAL); } else { err = zfs_prop_set_special(zc->zc_name, source, pair); if (err == -1) /* property is not "special", needs handling */ err = dsl_prop_inherit(zc->zc_name, zc->zc_value, source); } errout: nvlist_free(dummy); return (err); } static int zfs_ioc_pool_set_props(zfs_cmd_t *zc) { nvlist_t *props; spa_t *spa; int error; nvpair_t *pair; if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &props))) return (error); /* * If the only property is the configfile, then just do a spa_lookup() * to handle the faulted case. 
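 * (The fast path below applies only when props holds exactly one pair and
 * that pair is the pool's "cachefile" property.)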
*/ pair = nvlist_next_nvpair(props, NULL); if (pair != NULL && strcmp(nvpair_name(pair), zpool_prop_to_name(ZPOOL_PROP_CACHEFILE)) == 0 && nvlist_next_nvpair(props, pair) == NULL) { mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(zc->zc_name)) != NULL) { spa_configfile_set(spa, props, B_FALSE); spa_write_cachefile(spa, B_FALSE, B_TRUE); } mutex_exit(&spa_namespace_lock); if (spa != NULL) { nvlist_free(props); return (0); } } if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) { nvlist_free(props); return (error); } error = spa_prop_set(spa, props); nvlist_free(props); spa_close(spa, FTAG); return (error); } static int zfs_ioc_pool_get_props(zfs_cmd_t *zc) { spa_t *spa; int error; nvlist_t *nvp = NULL; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) { /* * If the pool is faulted, there may be properties we can still * get (such as altroot and cachefile), so attempt to get them * anyway. */ mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(zc->zc_name)) != NULL) error = spa_prop_get(spa, &nvp); mutex_exit(&spa_namespace_lock); } else { error = spa_prop_get(spa, &nvp); spa_close(spa, FTAG); } if (error == 0 && zc->zc_nvlist_dst != 0) error = put_nvlist(zc, nvp); else error = SET_ERROR(EFAULT); nvlist_free(nvp); return (error); } /* * inputs: * zc_name name of filesystem * zc_nvlist_src{_size} nvlist of delegated permissions * zc_perm_action allow/unallow flag * * outputs: none */ static int zfs_ioc_set_fsacl(zfs_cmd_t *zc) { int error; nvlist_t *fsaclnv = NULL; if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &fsaclnv)) != 0) return (error); /* * Verify nvlist is constructed correctly */ if ((error = zfs_deleg_verify_nvlist(fsaclnv)) != 0) { nvlist_free(fsaclnv); return (SET_ERROR(EINVAL)); } /* * If we don't have PRIV_SYS_MOUNT, then validate * that user is allowed to hand out each permission in * the nvlist(s) */ error = secpolicy_zfs(CRED()); if (error != 0) { if (zc->zc_perm_action == B_FALSE) { error = dsl_deleg_can_allow(zc->zc_name, fsaclnv, CRED()); } else { error = dsl_deleg_can_unallow(zc->zc_name, fsaclnv, CRED()); } } if (error == 0) error = dsl_deleg_set(zc->zc_name, fsaclnv, zc->zc_perm_action); nvlist_free(fsaclnv); return (error); } /* * inputs: * zc_name name of filesystem * * outputs: * zc_nvlist_src{_size} nvlist of delegated permissions */ static int zfs_ioc_get_fsacl(zfs_cmd_t *zc) { nvlist_t *nvp; int error; if ((error = dsl_deleg_get(zc->zc_name, &nvp)) == 0) { error = put_nvlist(zc, nvp); nvlist_free(nvp); } return (error); } /* ARGSUSED */ static void zfs_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) { zfs_creat_t *zct = arg; zfs_create_fs(os, cr, zct->zct_zplprops, tx); } #define ZFS_PROP_UNDEFINED ((uint64_t)-1) /* * inputs: * os parent objset pointer (NULL if root fs) * fuids_ok fuids allowed in this version of the spa? * sa_ok SAs allowed in this version of the spa? * createprops list of properties requested by creator * * outputs: * zplprops values for the zplprops we attach to the master node object * is_ci true if requested file system will be purely case-insensitive * * Determine the settings for utf8only, normalization and * casesensitivity. Specific values may have been requested by the * creator and/or we can inherit values from the parent dataset. If * the file system is of too early a vintage, a creator can not * request settings for these properties, even if the requested * setting is the default value. 
We don't actually want to create dsl * properties for these, so remove them from the source nvlist after * processing. */ static int zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver, boolean_t fuids_ok, boolean_t sa_ok, nvlist_t *createprops, nvlist_t *zplprops, boolean_t *is_ci) { uint64_t sense = ZFS_PROP_UNDEFINED; uint64_t norm = ZFS_PROP_UNDEFINED; uint64_t u8 = ZFS_PROP_UNDEFINED; int error; ASSERT(zplprops != NULL); /* parent dataset must be a filesystem */ if (os != NULL && os->os_phys->os_type != DMU_OST_ZFS) return (SET_ERROR(ZFS_ERR_WRONG_PARENT)); /* * Pull out creator prop choices, if any. */ if (createprops) { (void) nvlist_lookup_uint64(createprops, zfs_prop_to_name(ZFS_PROP_VERSION), &zplver); (void) nvlist_lookup_uint64(createprops, zfs_prop_to_name(ZFS_PROP_NORMALIZE), &norm); (void) nvlist_remove_all(createprops, zfs_prop_to_name(ZFS_PROP_NORMALIZE)); (void) nvlist_lookup_uint64(createprops, zfs_prop_to_name(ZFS_PROP_UTF8ONLY), &u8); (void) nvlist_remove_all(createprops, zfs_prop_to_name(ZFS_PROP_UTF8ONLY)); (void) nvlist_lookup_uint64(createprops, zfs_prop_to_name(ZFS_PROP_CASE), &sense); (void) nvlist_remove_all(createprops, zfs_prop_to_name(ZFS_PROP_CASE)); } /* * If the zpl version requested is whacky or the file system * or pool is version is too "young" to support normalization * and the creator tried to set a value for one of the props, * error out. */ if ((zplver < ZPL_VERSION_INITIAL || zplver > ZPL_VERSION) || (zplver >= ZPL_VERSION_FUID && !fuids_ok) || (zplver >= ZPL_VERSION_SA && !sa_ok) || (zplver < ZPL_VERSION_NORMALIZATION && (norm != ZFS_PROP_UNDEFINED || u8 != ZFS_PROP_UNDEFINED || sense != ZFS_PROP_UNDEFINED))) return (SET_ERROR(ENOTSUP)); /* * Put the version in the zplprops */ VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_VERSION), zplver) == 0); if (norm == ZFS_PROP_UNDEFINED && (error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &norm)) != 0) return (error); VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_NORMALIZE), norm) == 0); /* * If we're normalizing, names must always be valid UTF-8 strings. */ if (norm) u8 = 1; if (u8 == ZFS_PROP_UNDEFINED && (error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &u8)) != 0) return (error); VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_UTF8ONLY), u8) == 0); if (sense == ZFS_PROP_UNDEFINED && (error = zfs_get_zplprop(os, ZFS_PROP_CASE, &sense)) != 0) return (error); VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_CASE), sense) == 0); if (is_ci) *is_ci = (sense == ZFS_CASE_INSENSITIVE); return (0); } static int zfs_fill_zplprops(const char *dataset, nvlist_t *createprops, nvlist_t *zplprops, boolean_t *is_ci) { boolean_t fuids_ok, sa_ok; uint64_t zplver = ZPL_VERSION; objset_t *os = NULL; char parentname[ZFS_MAX_DATASET_NAME_LEN]; spa_t *spa; uint64_t spa_vers; int error; zfs_get_parent(dataset, parentname, sizeof (parentname)); if ((error = spa_open(dataset, &spa, FTAG)) != 0) return (error); spa_vers = spa_version(spa); spa_close(spa, FTAG); zplver = zfs_zpl_version_map(spa_vers); fuids_ok = (zplver >= ZPL_VERSION_FUID); sa_ok = (zplver >= ZPL_VERSION_SA); /* * Open parent object set so we can inherit zplprop values. 
*/ if ((error = dmu_objset_hold(parentname, FTAG, &os)) != 0) return (error); error = zfs_fill_zplprops_impl(os, zplver, fuids_ok, sa_ok, createprops, zplprops, is_ci); dmu_objset_rele(os, FTAG); return (error); } static int zfs_fill_zplprops_root(uint64_t spa_vers, nvlist_t *createprops, nvlist_t *zplprops, boolean_t *is_ci) { boolean_t fuids_ok; boolean_t sa_ok; uint64_t zplver = ZPL_VERSION; int error; zplver = zfs_zpl_version_map(spa_vers); fuids_ok = (zplver >= ZPL_VERSION_FUID); sa_ok = (zplver >= ZPL_VERSION_SA); error = zfs_fill_zplprops_impl(NULL, zplver, fuids_ok, sa_ok, createprops, zplprops, is_ci); return (error); } /* * innvl: { * "type" -> dmu_objset_type_t (int32) * (optional) "props" -> { prop -> value } * (optional) "hidden_args" -> { "wkeydata" -> value } * raw uint8_t array of encryption wrapping key data (32 bytes) * } * * outnvl: propname -> error code (int32) */ static const zfs_ioc_key_t zfs_keys_create[] = { {"type", DATA_TYPE_INT32, 0}, {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL}, {"hidden_args", DATA_TYPE_NVLIST, ZK_OPTIONAL}, }; static int zfs_ioc_create(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { int error = 0; zfs_creat_t zct = { 0 }; nvlist_t *nvprops = NULL; nvlist_t *hidden_args = NULL; void (*cbfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); dmu_objset_type_t type; boolean_t is_insensitive = B_FALSE; dsl_crypto_params_t *dcp = NULL; type = (dmu_objset_type_t)fnvlist_lookup_int32(innvl, "type"); (void) nvlist_lookup_nvlist(innvl, "props", &nvprops); (void) nvlist_lookup_nvlist(innvl, ZPOOL_HIDDEN_ARGS, &hidden_args); switch (type) { case DMU_OST_ZFS: cbfunc = zfs_create_cb; break; case DMU_OST_ZVOL: cbfunc = zvol_create_cb; break; default: cbfunc = NULL; break; } if (strchr(fsname, '@') || strchr(fsname, '%')) return (SET_ERROR(EINVAL)); zct.zct_props = nvprops; if (cbfunc == NULL) return (SET_ERROR(EINVAL)); if (type == DMU_OST_ZVOL) { uint64_t volsize, volblocksize; if (nvprops == NULL) return (SET_ERROR(EINVAL)); if (nvlist_lookup_uint64(nvprops, zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) != 0) return (SET_ERROR(EINVAL)); if ((error = nvlist_lookup_uint64(nvprops, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize)) != 0 && error != ENOENT) return (SET_ERROR(EINVAL)); if (error != 0) volblocksize = zfs_prop_default_numeric( ZFS_PROP_VOLBLOCKSIZE); if ((error = zvol_check_volblocksize(fsname, volblocksize)) != 0 || (error = zvol_check_volsize(volsize, volblocksize)) != 0) return (error); } else if (type == DMU_OST_ZFS) { int error; /* * We have to have normalization and * case-folding flags correct when we do the * file system creation, so go figure them out * now. */ VERIFY(nvlist_alloc(&zct.zct_zplprops, NV_UNIQUE_NAME, KM_SLEEP) == 0); error = zfs_fill_zplprops(fsname, nvprops, zct.zct_zplprops, &is_insensitive); if (error != 0) { nvlist_free(zct.zct_zplprops); return (error); } } error = dsl_crypto_params_create_nvlist(DCP_CMD_NONE, nvprops, hidden_args, &dcp); if (error != 0) { nvlist_free(zct.zct_zplprops); return (error); } error = dmu_objset_create(fsname, type, is_insensitive ? DS_FLAG_CI_DATASET : 0, dcp, cbfunc, &zct); nvlist_free(zct.zct_zplprops); dsl_crypto_params_free(dcp, !!error); /* * It would be nice to do this atomically. */ if (error == 0) { error = zfs_set_prop_nvlist(fsname, ZPROP_SRC_LOCAL, nvprops, outnvl); if (error != 0) { spa_t *spa; int error2; /* * Volumes will return EBUSY and cannot be destroyed * until all asynchronous minor handling (e.g. from * setting the volmode property) has completed. 
Wait for * the spa_zvol_taskq to drain then retry. */ error2 = dsl_destroy_head(fsname); while ((error2 == EBUSY) && (type == DMU_OST_ZVOL)) { error2 = spa_open(fsname, &spa, FTAG); if (error2 == 0) { taskq_wait(spa->spa_zvol_taskq); spa_close(spa, FTAG); } error2 = dsl_destroy_head(fsname); } } } return (error); } /* * innvl: { * "origin" -> name of origin snapshot * (optional) "props" -> { prop -> value } * (optional) "hidden_args" -> { "wkeydata" -> value } * raw uint8_t array of encryption wrapping key data (32 bytes) * } * * outputs: * outnvl: propname -> error code (int32) */ static const zfs_ioc_key_t zfs_keys_clone[] = { {"origin", DATA_TYPE_STRING, 0}, {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL}, {"hidden_args", DATA_TYPE_NVLIST, ZK_OPTIONAL}, }; static int zfs_ioc_clone(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { int error = 0; nvlist_t *nvprops = NULL; const char *origin_name; origin_name = fnvlist_lookup_string(innvl, "origin"); (void) nvlist_lookup_nvlist(innvl, "props", &nvprops); if (strchr(fsname, '@') || strchr(fsname, '%')) return (SET_ERROR(EINVAL)); if (dataset_namecheck(origin_name, NULL, NULL) != 0) return (SET_ERROR(EINVAL)); error = dmu_objset_clone(fsname, origin_name); /* * It would be nice to do this atomically. */ if (error == 0) { error = zfs_set_prop_nvlist(fsname, ZPROP_SRC_LOCAL, nvprops, outnvl); if (error != 0) (void) dsl_destroy_head(fsname); } return (error); } static const zfs_ioc_key_t zfs_keys_remap[] = { /* no nvl keys */ }; /* ARGSUSED */ static int zfs_ioc_remap(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { /* This IOCTL is no longer supported. */ return (0); } /* * innvl: { * "snaps" -> { snapshot1, snapshot2 } * (optional) "props" -> { prop -> value (string) } * } * * outnvl: snapshot -> error code (int32) */ static const zfs_ioc_key_t zfs_keys_snapshot[] = { {"snaps", DATA_TYPE_NVLIST, 0}, {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL}, }; static int zfs_ioc_snapshot(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { nvlist_t *snaps; nvlist_t *props = NULL; int error, poollen; nvpair_t *pair; (void) nvlist_lookup_nvlist(innvl, "props", &props); if (!nvlist_empty(props) && zfs_earlier_version(poolname, SPA_VERSION_SNAP_PROPS)) return (SET_ERROR(ENOTSUP)); if ((error = zfs_check_userprops(props)) != 0) return (error); snaps = fnvlist_lookup_nvlist(innvl, "snaps"); poollen = strlen(poolname); for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(snaps, pair)) { const char *name = nvpair_name(pair); char *cp = strchr(name, '@'); /* * The snap name must contain an @, and the part after it must * contain only valid characters. */ if (cp == NULL || zfs_component_namecheck(cp + 1, NULL, NULL) != 0) return (SET_ERROR(EINVAL)); /* * The snap must be in the specified pool. */ if (strncmp(name, poolname, poollen) != 0 || (name[poollen] != '/' && name[poollen] != '@')) return (SET_ERROR(EXDEV)); /* * Check for permission to set the properties on the fs. */ if (!nvlist_empty(props)) { *cp = '\0'; error = zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_USERPROP, CRED()); *cp = '@'; if (error != 0) return (error); } /* This must be the only snap of this fs. 
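 * Creating two snapshots of the same filesystem in one request is not
 * supported, so duplicates in "snaps" are rejected here with EXDEV.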
*/ for (nvpair_t *pair2 = nvlist_next_nvpair(snaps, pair); pair2 != NULL; pair2 = nvlist_next_nvpair(snaps, pair2)) { if (strncmp(name, nvpair_name(pair2), cp - name + 1) == 0) { return (SET_ERROR(EXDEV)); } } } error = dsl_dataset_snapshot(snaps, props, outnvl); return (error); } /* * innvl: "message" -> string */ static const zfs_ioc_key_t zfs_keys_log_history[] = { {"message", DATA_TYPE_STRING, 0}, }; /* ARGSUSED */ static int zfs_ioc_log_history(const char *unused, nvlist_t *innvl, nvlist_t *outnvl) { const char *message; char *poolname; spa_t *spa; int error; /* * The poolname in the ioctl is not set, we get it from the TSD, * which was set at the end of the last successful ioctl that allows * logging. The secpolicy func already checked that it is set. * Only one log ioctl is allowed after each successful ioctl, so * we clear the TSD here. */ poolname = tsd_get(zfs_allow_log_key); if (poolname == NULL) return (SET_ERROR(EINVAL)); (void) tsd_set(zfs_allow_log_key, NULL); error = spa_open(poolname, &spa, FTAG); kmem_strfree(poolname); if (error != 0) return (error); message = fnvlist_lookup_string(innvl, "message"); if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } error = spa_history_log(spa, message); spa_close(spa, FTAG); return (error); } /* * This ioctl is used to set the bootenv configuration on the current * pool. This configuration is stored in the second padding area of the label, * and it is used by the bootloader(s) to store the bootloader and/or system * specific data. * The data is stored as nvlist data stream, and is protected by * an embedded checksum. * The version can have two possible values: * VB_RAW: nvlist should have key GRUB_ENVMAP, value DATA_TYPE_STRING. * VB_NVLIST: nvlist with arbitrary pairs. */ static const zfs_ioc_key_t zfs_keys_set_bootenv[] = { {"version", DATA_TYPE_UINT64, 0}, {"", DATA_TYPE_ANY, ZK_OPTIONAL | ZK_WILDCARDLIST}, }; static int zfs_ioc_set_bootenv(const char *name, nvlist_t *innvl, nvlist_t *outnvl) { int error; spa_t *spa; if ((error = spa_open(name, &spa, FTAG)) != 0) return (error); spa_vdev_state_enter(spa, SCL_ALL); error = vdev_label_write_bootenv(spa->spa_root_vdev, innvl); (void) spa_vdev_state_exit(spa, NULL, 0); spa_close(spa, FTAG); return (error); } static const zfs_ioc_key_t zfs_keys_get_bootenv[] = { /* no nvl keys */ }; static int zfs_ioc_get_bootenv(const char *name, nvlist_t *innvl, nvlist_t *outnvl) { spa_t *spa; int error; if ((error = spa_open(name, &spa, FTAG)) != 0) return (error); spa_vdev_state_enter(spa, SCL_ALL); error = vdev_label_read_bootenv(spa->spa_root_vdev, outnvl); (void) spa_vdev_state_exit(spa, NULL, 0); spa_close(spa, FTAG); return (error); } /* * The dp_config_rwlock must not be held when calling this, because the * unmount may need to write out data. * * This function is best-effort. Callers must deal gracefully if it * remains mounted (or is remounted after this call). * * Returns 0 if the argument is not a snapshot, or it is not currently a * filesystem, or we were able to unmount it. Returns error code otherwise. */ void zfs_unmount_snap(const char *snapname) { if (strchr(snapname, '@') == NULL) return; (void) zfsctl_snapshot_unmount(snapname, MNT_FORCE); } /* ARGSUSED */ static int zfs_unmount_snap_cb(const char *snapname, void *arg) { zfs_unmount_snap(snapname); return (0); } /* * When a clone is destroyed, its origin may also need to be destroyed, * in which case it must be unmounted. This routine will do that unmount * if necessary. 
*/ void zfs_destroy_unmount_origin(const char *fsname) { int error; objset_t *os; dsl_dataset_t *ds; error = dmu_objset_hold(fsname, FTAG, &os); if (error != 0) return; ds = dmu_objset_ds(os); if (dsl_dir_is_clone(ds->ds_dir) && DS_IS_DEFER_DESTROY(ds->ds_prev)) { char originname[ZFS_MAX_DATASET_NAME_LEN]; dsl_dataset_name(ds->ds_prev, originname); dmu_objset_rele(os, FTAG); zfs_unmount_snap(originname); } else { dmu_objset_rele(os, FTAG); } } /* * innvl: { * "snaps" -> { snapshot1, snapshot2 } * (optional boolean) "defer" * } * * outnvl: snapshot -> error code (int32) */ static const zfs_ioc_key_t zfs_keys_destroy_snaps[] = { {"snaps", DATA_TYPE_NVLIST, 0}, {"defer", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, }; /* ARGSUSED */ static int zfs_ioc_destroy_snaps(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { int poollen; nvlist_t *snaps; nvpair_t *pair; boolean_t defer; spa_t *spa; snaps = fnvlist_lookup_nvlist(innvl, "snaps"); defer = nvlist_exists(innvl, "defer"); poollen = strlen(poolname); for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(snaps, pair)) { const char *name = nvpair_name(pair); /* * The snap must be in the specified pool to prevent the * invalid removal of zvol minors below. */ if (strncmp(name, poolname, poollen) != 0 || (name[poollen] != '/' && name[poollen] != '@')) return (SET_ERROR(EXDEV)); zfs_unmount_snap(nvpair_name(pair)); if (spa_open(name, &spa, FTAG) == 0) { zvol_remove_minors(spa, name, B_TRUE); spa_close(spa, FTAG); } } return (dsl_destroy_snapshots_nvl(snaps, defer, outnvl)); } /* * Create bookmarks. The bookmark names are of the form #. * All bookmarks and snapshots must be in the same pool. * dsl_bookmark_create_nvl_validate describes the nvlist schema in more detail. * * innvl: { * new_bookmark1 -> existing_snapshot, * new_bookmark2 -> existing_bookmark, * } * * outnvl: bookmark -> error code (int32) * */ static const zfs_ioc_key_t zfs_keys_bookmark[] = { {"...", DATA_TYPE_STRING, ZK_WILDCARDLIST}, }; /* ARGSUSED */ static int zfs_ioc_bookmark(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { return (dsl_bookmark_create(innvl, outnvl)); } /* * innvl: { * property 1, property 2, ... * } * * outnvl: { * bookmark name 1 -> { property 1, property 2, ... }, * bookmark name 2 -> { property 1, property 2, ... } * } * */ static const zfs_ioc_key_t zfs_keys_get_bookmarks[] = { {"...", DATA_TYPE_BOOLEAN, ZK_WILDCARDLIST | ZK_OPTIONAL}, }; static int zfs_ioc_get_bookmarks(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { return (dsl_get_bookmarks(fsname, innvl, outnvl)); } /* * innvl is not used. * * outnvl: { * property 1, property 2, ... 
* } * */ static const zfs_ioc_key_t zfs_keys_get_bookmark_props[] = { /* no nvl keys */ }; /* ARGSUSED */ static int zfs_ioc_get_bookmark_props(const char *bookmark, nvlist_t *innvl, nvlist_t *outnvl) { char fsname[ZFS_MAX_DATASET_NAME_LEN]; char *bmname; bmname = strchr(bookmark, '#'); if (bmname == NULL) return (SET_ERROR(EINVAL)); bmname++; (void) strlcpy(fsname, bookmark, sizeof (fsname)); *(strchr(fsname, '#')) = '\0'; return (dsl_get_bookmark_props(fsname, bmname, outnvl)); } /* * innvl: { * bookmark name 1, bookmark name 2 * } * * outnvl: bookmark -> error code (int32) * */ static const zfs_ioc_key_t zfs_keys_destroy_bookmarks[] = { {"...", DATA_TYPE_BOOLEAN, ZK_WILDCARDLIST}, }; static int zfs_ioc_destroy_bookmarks(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { int error, poollen; poollen = strlen(poolname); for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) { const char *name = nvpair_name(pair); const char *cp = strchr(name, '#'); /* * The bookmark name must contain an #, and the part after it * must contain only valid characters. */ if (cp == NULL || zfs_component_namecheck(cp + 1, NULL, NULL) != 0) return (SET_ERROR(EINVAL)); /* * The bookmark must be in the specified pool. */ if (strncmp(name, poolname, poollen) != 0 || (name[poollen] != '/' && name[poollen] != '#')) return (SET_ERROR(EXDEV)); } error = dsl_bookmark_destroy(innvl, outnvl); return (error); } static const zfs_ioc_key_t zfs_keys_channel_program[] = { {"program", DATA_TYPE_STRING, 0}, {"arg", DATA_TYPE_ANY, 0}, {"sync", DATA_TYPE_BOOLEAN_VALUE, ZK_OPTIONAL}, {"instrlimit", DATA_TYPE_UINT64, ZK_OPTIONAL}, {"memlimit", DATA_TYPE_UINT64, ZK_OPTIONAL}, }; static int zfs_ioc_channel_program(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { char *program; uint64_t instrlimit, memlimit; boolean_t sync_flag; nvpair_t *nvarg = NULL; program = fnvlist_lookup_string(innvl, ZCP_ARG_PROGRAM); if (0 != nvlist_lookup_boolean_value(innvl, ZCP_ARG_SYNC, &sync_flag)) { sync_flag = B_TRUE; } if (0 != nvlist_lookup_uint64(innvl, ZCP_ARG_INSTRLIMIT, &instrlimit)) { instrlimit = ZCP_DEFAULT_INSTRLIMIT; } if (0 != nvlist_lookup_uint64(innvl, ZCP_ARG_MEMLIMIT, &memlimit)) { memlimit = ZCP_DEFAULT_MEMLIMIT; } nvarg = fnvlist_lookup_nvpair(innvl, ZCP_ARG_ARGLIST); if (instrlimit == 0 || instrlimit > zfs_lua_max_instrlimit) return (SET_ERROR(EINVAL)); if (memlimit == 0 || memlimit > zfs_lua_max_memlimit) return (SET_ERROR(EINVAL)); return (zcp_eval(poolname, program, sync_flag, instrlimit, memlimit, nvarg, outnvl)); } /* * innvl: unused * outnvl: empty */ static const zfs_ioc_key_t zfs_keys_pool_checkpoint[] = { /* no nvl keys */ }; /* ARGSUSED */ static int zfs_ioc_pool_checkpoint(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { return (spa_checkpoint(poolname)); } /* * innvl: unused * outnvl: empty */ static const zfs_ioc_key_t zfs_keys_pool_discard_checkpoint[] = { /* no nvl keys */ }; /* ARGSUSED */ static int zfs_ioc_pool_discard_checkpoint(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { return (spa_checkpoint_discard(poolname)); } /* * inputs: * zc_name name of dataset to destroy * zc_defer_destroy mark for deferred destroy * * outputs: none */ static int zfs_ioc_destroy(zfs_cmd_t *zc) { objset_t *os; dmu_objset_type_t ost; int err; err = dmu_objset_hold(zc->zc_name, FTAG, &os); if (err != 0) return (err); ost = dmu_objset_type(os); dmu_objset_rele(os, FTAG); if (ost == DMU_OST_ZFS) zfs_unmount_snap(zc->zc_name); if (strchr(zc->zc_name, 
'@')) { err = dsl_destroy_snapshot(zc->zc_name, zc->zc_defer_destroy); } else { err = dsl_destroy_head(zc->zc_name); if (err == EEXIST) { /* * It is possible that the given DS may have * hidden child (%recv) datasets - "leftovers" * resulting from the previously interrupted * 'zfs receive'. * * 6 extra bytes for /%recv */ char namebuf[ZFS_MAX_DATASET_NAME_LEN + 6]; if (snprintf(namebuf, sizeof (namebuf), "%s/%s", zc->zc_name, recv_clone_name) >= sizeof (namebuf)) return (SET_ERROR(EINVAL)); /* * Try to remove the hidden child (%recv) and after * that try to remove the target dataset. * If the hidden child (%recv) does not exist * the original error (EEXIST) will be returned */ err = dsl_destroy_head(namebuf); if (err == 0) err = dsl_destroy_head(zc->zc_name); else if (err == ENOENT) err = SET_ERROR(EEXIST); } } return (err); } /* * innvl: { * "initialize_command" -> POOL_INITIALIZE_{CANCEL|START|SUSPEND} (uint64) * "initialize_vdevs": { -> guids to initialize (nvlist) * "vdev_path_1": vdev_guid_1, (uint64), * "vdev_path_2": vdev_guid_2, (uint64), * ... * }, * } * * outnvl: { * "initialize_vdevs": { -> initialization errors (nvlist) * "vdev_path_1": errno, see function body for possible errnos (uint64) * "vdev_path_2": errno, ... (uint64) * ... * } * } * * EINVAL is returned for an unknown commands or if any of the provided vdev * guids have be specified with a type other than uint64. */ static const zfs_ioc_key_t zfs_keys_pool_initialize[] = { {ZPOOL_INITIALIZE_COMMAND, DATA_TYPE_UINT64, 0}, {ZPOOL_INITIALIZE_VDEVS, DATA_TYPE_NVLIST, 0} }; static int zfs_ioc_pool_initialize(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { uint64_t cmd_type; if (nvlist_lookup_uint64(innvl, ZPOOL_INITIALIZE_COMMAND, &cmd_type) != 0) { return (SET_ERROR(EINVAL)); } if (!(cmd_type == POOL_INITIALIZE_CANCEL || cmd_type == POOL_INITIALIZE_START || cmd_type == POOL_INITIALIZE_SUSPEND)) { return (SET_ERROR(EINVAL)); } nvlist_t *vdev_guids; if (nvlist_lookup_nvlist(innvl, ZPOOL_INITIALIZE_VDEVS, &vdev_guids) != 0) { return (SET_ERROR(EINVAL)); } for (nvpair_t *pair = nvlist_next_nvpair(vdev_guids, NULL); pair != NULL; pair = nvlist_next_nvpair(vdev_guids, pair)) { uint64_t vdev_guid; if (nvpair_value_uint64(pair, &vdev_guid) != 0) { return (SET_ERROR(EINVAL)); } } spa_t *spa; int error = spa_open(poolname, &spa, FTAG); if (error != 0) return (error); nvlist_t *vdev_errlist = fnvlist_alloc(); int total_errors = spa_vdev_initialize(spa, vdev_guids, cmd_type, vdev_errlist); if (fnvlist_size(vdev_errlist) > 0) { fnvlist_add_nvlist(outnvl, ZPOOL_INITIALIZE_VDEVS, vdev_errlist); } fnvlist_free(vdev_errlist); spa_close(spa, FTAG); return (total_errors > 0 ? EINVAL : 0); } /* * innvl: { * "trim_command" -> POOL_TRIM_{CANCEL|START|SUSPEND} (uint64) * "trim_vdevs": { -> guids to TRIM (nvlist) * "vdev_path_1": vdev_guid_1, (uint64), * "vdev_path_2": vdev_guid_2, (uint64), * ... * }, * "trim_rate" -> Target TRIM rate in bytes/sec. * "trim_secure" -> Set to request a secure TRIM. * } * * outnvl: { * "trim_vdevs": { -> TRIM errors (nvlist) * "vdev_path_1": errno, see function body for possible errnos (uint64) * "vdev_path_2": errno, ... (uint64) * ... * } * } * * EINVAL is returned for an unknown commands or if any of the provided vdev * guids have be specified with a type other than uint64. 
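 *
 * For illustration only, a caller starting a secure TRIM of a single vdev
 * might build innvl roughly as follows; the vdev path and the vdev_guid
 * variable are hypothetical:
 *
 *	nvlist_t *vdevs = fnvlist_alloc();
 *	fnvlist_add_uint64(vdevs, "/dev/da0", vdev_guid);
 *	nvlist_t *innvl = fnvlist_alloc();
 *	fnvlist_add_uint64(innvl, ZPOOL_TRIM_COMMAND, POOL_TRIM_START);
 *	fnvlist_add_nvlist(innvl, ZPOOL_TRIM_VDEVS, vdevs);
 *	fnvlist_add_boolean_value(innvl, ZPOOL_TRIM_SECURE, B_TRUE);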
*/ static const zfs_ioc_key_t zfs_keys_pool_trim[] = { {ZPOOL_TRIM_COMMAND, DATA_TYPE_UINT64, 0}, {ZPOOL_TRIM_VDEVS, DATA_TYPE_NVLIST, 0}, {ZPOOL_TRIM_RATE, DATA_TYPE_UINT64, ZK_OPTIONAL}, {ZPOOL_TRIM_SECURE, DATA_TYPE_BOOLEAN_VALUE, ZK_OPTIONAL}, }; static int zfs_ioc_pool_trim(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { uint64_t cmd_type; if (nvlist_lookup_uint64(innvl, ZPOOL_TRIM_COMMAND, &cmd_type) != 0) return (SET_ERROR(EINVAL)); if (!(cmd_type == POOL_TRIM_CANCEL || cmd_type == POOL_TRIM_START || cmd_type == POOL_TRIM_SUSPEND)) { return (SET_ERROR(EINVAL)); } nvlist_t *vdev_guids; if (nvlist_lookup_nvlist(innvl, ZPOOL_TRIM_VDEVS, &vdev_guids) != 0) return (SET_ERROR(EINVAL)); for (nvpair_t *pair = nvlist_next_nvpair(vdev_guids, NULL); pair != NULL; pair = nvlist_next_nvpair(vdev_guids, pair)) { uint64_t vdev_guid; if (nvpair_value_uint64(pair, &vdev_guid) != 0) { return (SET_ERROR(EINVAL)); } } /* Optional, defaults to maximum rate when not provided */ uint64_t rate; if (nvlist_lookup_uint64(innvl, ZPOOL_TRIM_RATE, &rate) != 0) rate = 0; /* Optional, defaults to standard TRIM when not provided */ boolean_t secure; if (nvlist_lookup_boolean_value(innvl, ZPOOL_TRIM_SECURE, &secure) != 0) { secure = B_FALSE; } spa_t *spa; int error = spa_open(poolname, &spa, FTAG); if (error != 0) return (error); nvlist_t *vdev_errlist = fnvlist_alloc(); int total_errors = spa_vdev_trim(spa, vdev_guids, cmd_type, rate, !!zfs_trim_metaslab_skip, secure, vdev_errlist); if (fnvlist_size(vdev_errlist) > 0) fnvlist_add_nvlist(outnvl, ZPOOL_TRIM_VDEVS, vdev_errlist); fnvlist_free(vdev_errlist); spa_close(spa, FTAG); return (total_errors > 0 ? EINVAL : 0); } /* * This ioctl waits for activity of a particular type to complete. If there is * no activity of that type in progress, it returns immediately, and the * returned value "waited" is false. If there is activity in progress, and no * tag is passed in, the ioctl blocks until all activity of that type is * complete, and then returns with "waited" set to true. * * If a tag is provided, it identifies a particular instance of an activity to * wait for. Currently, this is only valid for use with 'initialize', because * that is the only activity for which there can be multiple instances running * concurrently. In the case of 'initialize', the tag corresponds to the guid of * the vdev on which to wait. * * If a thread waiting in the ioctl receives a signal, the call will return * immediately, and the return value will be EINTR. * * innvl: { * "wait_activity" -> int32_t * (optional) "wait_tag" -> uint64_t * } * * outnvl: "waited" -> boolean_t */ static const zfs_ioc_key_t zfs_keys_pool_wait[] = { {ZPOOL_WAIT_ACTIVITY, DATA_TYPE_INT32, 0}, {ZPOOL_WAIT_TAG, DATA_TYPE_UINT64, ZK_OPTIONAL}, }; static int zfs_ioc_wait(const char *name, nvlist_t *innvl, nvlist_t *outnvl) { int32_t activity; uint64_t tag; boolean_t waited; int error; if (nvlist_lookup_int32(innvl, ZPOOL_WAIT_ACTIVITY, &activity) != 0) return (EINVAL); if (nvlist_lookup_uint64(innvl, ZPOOL_WAIT_TAG, &tag) == 0) error = spa_wait_tag(name, activity, tag, &waited); else error = spa_wait(name, activity, &waited); if (error == 0) fnvlist_add_boolean_value(outnvl, ZPOOL_WAIT_WAITED, waited); return (error); } /* * This ioctl waits for activity of a particular type to complete. If there is * no activity of that type in progress, it returns immediately, and the * returned value "waited" is false. 
If there is activity in progress, and no * tag is passed in, the ioctl blocks until all activity of that type is * complete, and then returns with "waited" set to true. * * If a thread waiting in the ioctl receives a signal, the call will return * immediately, and the return value will be EINTR. * * innvl: { * "wait_activity" -> int32_t * } * * outnvl: "waited" -> boolean_t */ static const zfs_ioc_key_t zfs_keys_fs_wait[] = { {ZFS_WAIT_ACTIVITY, DATA_TYPE_INT32, 0}, }; static int zfs_ioc_wait_fs(const char *name, nvlist_t *innvl, nvlist_t *outnvl) { int32_t activity; boolean_t waited = B_FALSE; int error; dsl_pool_t *dp; dsl_dir_t *dd; dsl_dataset_t *ds; if (nvlist_lookup_int32(innvl, ZFS_WAIT_ACTIVITY, &activity) != 0) return (SET_ERROR(EINVAL)); if (activity >= ZFS_WAIT_NUM_ACTIVITIES || activity < 0) return (SET_ERROR(EINVAL)); if ((error = dsl_pool_hold(name, FTAG, &dp)) != 0) return (error); if ((error = dsl_dataset_hold(dp, name, FTAG, &ds)) != 0) { dsl_pool_rele(dp, FTAG); return (error); } dd = ds->ds_dir; mutex_enter(&dd->dd_activity_lock); dd->dd_activity_waiters++; /* * We get a long-hold here so that the dsl_dataset_t and dsl_dir_t * aren't evicted while we're waiting. Normally this is prevented by * holding the pool, but we can't do that while we're waiting since * that would prevent TXGs from syncing out. Some of the functionality * of long-holds (e.g. preventing deletion) is unnecessary for this * case, since we would cancel the waiters before proceeding with a * deletion. An alternative mechanism for keeping the dataset around * could be developed but this is simpler. */ dsl_dataset_long_hold(ds, FTAG); dsl_pool_rele(dp, FTAG); error = dsl_dir_wait(dd, ds, activity, &waited); dsl_dataset_long_rele(ds, FTAG); dd->dd_activity_waiters--; if (dd->dd_activity_waiters == 0) cv_signal(&dd->dd_activity_cv); mutex_exit(&dd->dd_activity_lock); dsl_dataset_rele(ds, FTAG); if (error == 0) fnvlist_add_boolean_value(outnvl, ZFS_WAIT_WAITED, waited); return (error); } /* * fsname is name of dataset to rollback (to most recent snapshot) * * innvl may contain name of expected target snapshot * * outnvl: "target" -> name of most recent snapshot * } */ static const zfs_ioc_key_t zfs_keys_rollback[] = { {"target", DATA_TYPE_STRING, ZK_OPTIONAL}, }; /* ARGSUSED */ static int zfs_ioc_rollback(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { zfsvfs_t *zfsvfs; zvol_state_handle_t *zv; char *target = NULL; int error; (void) nvlist_lookup_string(innvl, "target", &target); if (target != NULL) { const char *cp = strchr(target, '@'); /* * The snap name must contain an @, and the part after it must * contain only valid characters. */ if (cp == NULL || zfs_component_namecheck(cp + 1, NULL, NULL) != 0) return (SET_ERROR(EINVAL)); } if (getzfsvfs(fsname, &zfsvfs) == 0) { dsl_dataset_t *ds; ds = dmu_objset_ds(zfsvfs->z_os); error = zfs_suspend_fs(zfsvfs); if (error == 0) { int resume_err; error = dsl_dataset_rollback(fsname, target, zfsvfs, outnvl); resume_err = zfs_resume_fs(zfsvfs, ds); error = error ? 
error : resume_err; } zfs_vfs_rele(zfsvfs); } else if ((zv = zvol_suspend(fsname)) != NULL) { error = dsl_dataset_rollback(fsname, target, zvol_tag(zv), outnvl); zvol_resume(zv); } else { error = dsl_dataset_rollback(fsname, target, NULL, outnvl); } return (error); } static int recursive_unmount(const char *fsname, void *arg) { const char *snapname = arg; char *fullname; fullname = kmem_asprintf("%s@%s", fsname, snapname); zfs_unmount_snap(fullname); kmem_strfree(fullname); return (0); } /* * * snapname is the snapshot to redact. * innvl: { * "bookname" -> (string) * shortname of the redaction bookmark to generate * "snapnv" -> (nvlist, values ignored) * snapshots to redact snapname with respect to * } * * outnvl is unused */ /* ARGSUSED */ static const zfs_ioc_key_t zfs_keys_redact[] = { {"bookname", DATA_TYPE_STRING, 0}, {"snapnv", DATA_TYPE_NVLIST, 0}, }; static int zfs_ioc_redact(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) { nvlist_t *redactnvl = NULL; char *redactbook = NULL; if (nvlist_lookup_nvlist(innvl, "snapnv", &redactnvl) != 0) return (SET_ERROR(EINVAL)); if (fnvlist_num_pairs(redactnvl) == 0) return (SET_ERROR(ENXIO)); if (nvlist_lookup_string(innvl, "bookname", &redactbook) != 0) return (SET_ERROR(EINVAL)); return (dmu_redact_snap(snapname, redactnvl, redactbook)); } /* * inputs: * zc_name old name of dataset * zc_value new name of dataset * zc_cookie recursive flag (only valid for snapshots) * * outputs: none */ static int zfs_ioc_rename(zfs_cmd_t *zc) { objset_t *os; dmu_objset_type_t ost; boolean_t recursive = zc->zc_cookie & 1; boolean_t nounmount = !!(zc->zc_cookie & 2); char *at; int err; /* "zfs rename" from and to ...%recv datasets should both fail */ zc->zc_name[sizeof (zc->zc_name) - 1] = '\0'; zc->zc_value[sizeof (zc->zc_value) - 1] = '\0'; if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0 || dataset_namecheck(zc->zc_value, NULL, NULL) != 0 || strchr(zc->zc_name, '%') || strchr(zc->zc_value, '%')) return (SET_ERROR(EINVAL)); err = dmu_objset_hold(zc->zc_name, FTAG, &os); if (err != 0) return (err); ost = dmu_objset_type(os); dmu_objset_rele(os, FTAG); at = strchr(zc->zc_name, '@'); if (at != NULL) { /* snaps must be in same fs */ int error; if (strncmp(zc->zc_name, zc->zc_value, at - zc->zc_name + 1)) return (SET_ERROR(EXDEV)); *at = '\0'; if (ost == DMU_OST_ZFS && !nounmount) { error = dmu_objset_find(zc->zc_name, recursive_unmount, at + 1, recursive ? 
DS_FIND_CHILDREN : 0); if (error != 0) { *at = '@'; return (error); } } error = dsl_dataset_rename_snapshot(zc->zc_name, at + 1, strchr(zc->zc_value, '@') + 1, recursive); *at = '@'; return (error); } else { return (dsl_dir_rename(zc->zc_name, zc->zc_value)); } } static int zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr) { const char *propname = nvpair_name(pair); boolean_t issnap = (strchr(dsname, '@') != NULL); zfs_prop_t prop = zfs_name_to_prop(propname); uint64_t intval, compval; int err; if (prop == ZPROP_INVAL) { if (zfs_prop_user(propname)) { if ((err = zfs_secpolicy_write_perms(dsname, ZFS_DELEG_PERM_USERPROP, cr))) return (err); return (0); } if (!issnap && zfs_prop_userquota(propname)) { const char *perm = NULL; const char *uq_prefix = zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA]; const char *gq_prefix = zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA]; const char *uiq_prefix = zfs_userquota_prop_prefixes[ZFS_PROP_USEROBJQUOTA]; const char *giq_prefix = zfs_userquota_prop_prefixes[ZFS_PROP_GROUPOBJQUOTA]; const char *pq_prefix = zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTQUOTA]; const char *piq_prefix = zfs_userquota_prop_prefixes[\ ZFS_PROP_PROJECTOBJQUOTA]; if (strncmp(propname, uq_prefix, strlen(uq_prefix)) == 0) { perm = ZFS_DELEG_PERM_USERQUOTA; } else if (strncmp(propname, uiq_prefix, strlen(uiq_prefix)) == 0) { perm = ZFS_DELEG_PERM_USEROBJQUOTA; } else if (strncmp(propname, gq_prefix, strlen(gq_prefix)) == 0) { perm = ZFS_DELEG_PERM_GROUPQUOTA; } else if (strncmp(propname, giq_prefix, strlen(giq_prefix)) == 0) { perm = ZFS_DELEG_PERM_GROUPOBJQUOTA; } else if (strncmp(propname, pq_prefix, strlen(pq_prefix)) == 0) { perm = ZFS_DELEG_PERM_PROJECTQUOTA; } else if (strncmp(propname, piq_prefix, strlen(piq_prefix)) == 0) { perm = ZFS_DELEG_PERM_PROJECTOBJQUOTA; } else { /* {USER|GROUP|PROJECT}USED are read-only */ return (SET_ERROR(EINVAL)); } if ((err = zfs_secpolicy_write_perms(dsname, perm, cr))) return (err); return (0); } return (SET_ERROR(EINVAL)); } if (issnap) return (SET_ERROR(EINVAL)); if (nvpair_type(pair) == DATA_TYPE_NVLIST) { /* * dsl_prop_get_all_impl() returns properties in this * format. */ nvlist_t *attrs; VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &pair) == 0); } /* * Check that this value is valid for this pool version */ switch (prop) { case ZFS_PROP_COMPRESSION: /* * If the user specified gzip compression, make sure * the SPA supports it. We ignore any errors here since * we'll catch them later. 
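 * (ZIO_COMPRESS_ALGO() below strips any encoded compression level, e.g.
 * the zstd level, so only the base algorithm is checked against the pool
 * version and feature flags.)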
*/ if (nvpair_value_uint64(pair, &intval) == 0) { compval = ZIO_COMPRESS_ALGO(intval); if (compval >= ZIO_COMPRESS_GZIP_1 && compval <= ZIO_COMPRESS_GZIP_9 && zfs_earlier_version(dsname, SPA_VERSION_GZIP_COMPRESSION)) { return (SET_ERROR(ENOTSUP)); } if (compval == ZIO_COMPRESS_ZLE && zfs_earlier_version(dsname, SPA_VERSION_ZLE_COMPRESSION)) return (SET_ERROR(ENOTSUP)); if (compval == ZIO_COMPRESS_LZ4) { spa_t *spa; if ((err = spa_open(dsname, &spa, FTAG)) != 0) return (err); if (!spa_feature_is_enabled(spa, SPA_FEATURE_LZ4_COMPRESS)) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } spa_close(spa, FTAG); } if (compval == ZIO_COMPRESS_ZSTD) { spa_t *spa; if ((err = spa_open(dsname, &spa, FTAG)) != 0) return (err); if (!spa_feature_is_enabled(spa, SPA_FEATURE_ZSTD_COMPRESS)) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } spa_close(spa, FTAG); } } break; case ZFS_PROP_COPIES: if (zfs_earlier_version(dsname, SPA_VERSION_DITTO_BLOCKS)) return (SET_ERROR(ENOTSUP)); break; case ZFS_PROP_VOLBLOCKSIZE: case ZFS_PROP_RECORDSIZE: /* Record sizes above 128k need the feature to be enabled */ if (nvpair_value_uint64(pair, &intval) == 0 && intval > SPA_OLD_MAXBLOCKSIZE) { spa_t *spa; /* * We don't allow setting the property above 1MB, * unless the tunable has been changed. */ if (intval > zfs_max_recordsize || intval > SPA_MAXBLOCKSIZE) return (SET_ERROR(ERANGE)); if ((err = spa_open(dsname, &spa, FTAG)) != 0) return (err); if (!spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } spa_close(spa, FTAG); } break; case ZFS_PROP_DNODESIZE: /* Dnode sizes above 512 need the feature to be enabled */ if (nvpair_value_uint64(pair, &intval) == 0 && intval != ZFS_DNSIZE_LEGACY) { spa_t *spa; if ((err = spa_open(dsname, &spa, FTAG)) != 0) return (err); if (!spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } spa_close(spa, FTAG); } break; case ZFS_PROP_SPECIAL_SMALL_BLOCKS: /* * This property could require the allocation classes * feature to be active for setting, however we allow * it so that tests of settable properties succeed. * The CLI will issue a warning in this case. */ break; case ZFS_PROP_SHARESMB: if (zpl_earlier_version(dsname, ZPL_VERSION_FUID)) return (SET_ERROR(ENOTSUP)); break; case ZFS_PROP_ACLINHERIT: if (nvpair_type(pair) == DATA_TYPE_UINT64 && nvpair_value_uint64(pair, &intval) == 0) { if (intval == ZFS_ACL_PASSTHROUGH_X && zfs_earlier_version(dsname, SPA_VERSION_PASSTHROUGH_X)) return (SET_ERROR(ENOTSUP)); } break; case ZFS_PROP_CHECKSUM: case ZFS_PROP_DEDUP: { spa_feature_t feature; spa_t *spa; int err; /* dedup feature version checks */ if (prop == ZFS_PROP_DEDUP && zfs_earlier_version(dsname, SPA_VERSION_DEDUP)) return (SET_ERROR(ENOTSUP)); if (nvpair_type(pair) == DATA_TYPE_UINT64 && nvpair_value_uint64(pair, &intval) == 0) { /* check prop value is enabled in features */ feature = zio_checksum_to_feature( intval & ZIO_CHECKSUM_MASK); if (feature == SPA_FEATURE_NONE) break; if ((err = spa_open(dsname, &spa, FTAG)) != 0) return (err); if (!spa_feature_is_enabled(spa, feature)) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } spa_close(spa, FTAG); } break; } default: break; } return (zfs_secpolicy_setprop(dsname, prop, pair, CRED())); } /* * Removes properties from the given props list that fail permission checks * needed to clear them and to restore them in case of a receive error. For each * property, make sure we have both set and inherit permissions. 
* * Returns the first error encountered if any permission checks fail. If the * caller provides a non-NULL errlist, it also gives the complete list of names * of all the properties that failed a permission check along with the * corresponding error numbers. The caller is responsible for freeing the * returned errlist. * * If every property checks out successfully, zero is returned and the list * pointed at by errlist is NULL. */ static int zfs_check_clearable(const char *dataset, nvlist_t *props, nvlist_t **errlist) { zfs_cmd_t *zc; nvpair_t *pair, *next_pair; nvlist_t *errors; int err, rv = 0; if (props == NULL) return (0); VERIFY(nvlist_alloc(&errors, NV_UNIQUE_NAME, KM_SLEEP) == 0); zc = kmem_alloc(sizeof (zfs_cmd_t), KM_SLEEP); (void) strlcpy(zc->zc_name, dataset, sizeof (zc->zc_name)); pair = nvlist_next_nvpair(props, NULL); while (pair != NULL) { next_pair = nvlist_next_nvpair(props, pair); (void) strlcpy(zc->zc_value, nvpair_name(pair), sizeof (zc->zc_value)); if ((err = zfs_check_settable(dataset, pair, CRED())) != 0 || (err = zfs_secpolicy_inherit_prop(zc, NULL, CRED())) != 0) { VERIFY(nvlist_remove_nvpair(props, pair) == 0); VERIFY(nvlist_add_int32(errors, zc->zc_value, err) == 0); } pair = next_pair; } kmem_free(zc, sizeof (zfs_cmd_t)); if ((pair = nvlist_next_nvpair(errors, NULL)) == NULL) { nvlist_free(errors); errors = NULL; } else { VERIFY(nvpair_value_int32(pair, &rv) == 0); } if (errlist == NULL) nvlist_free(errors); else *errlist = errors; return (rv); } static boolean_t propval_equals(nvpair_t *p1, nvpair_t *p2) { if (nvpair_type(p1) == DATA_TYPE_NVLIST) { /* dsl_prop_get_all_impl() format */ nvlist_t *attrs; VERIFY(nvpair_value_nvlist(p1, &attrs) == 0); VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &p1) == 0); } if (nvpair_type(p2) == DATA_TYPE_NVLIST) { nvlist_t *attrs; VERIFY(nvpair_value_nvlist(p2, &attrs) == 0); VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &p2) == 0); } if (nvpair_type(p1) != nvpair_type(p2)) return (B_FALSE); if (nvpair_type(p1) == DATA_TYPE_STRING) { char *valstr1, *valstr2; VERIFY(nvpair_value_string(p1, (char **)&valstr1) == 0); VERIFY(nvpair_value_string(p2, (char **)&valstr2) == 0); return (strcmp(valstr1, valstr2) == 0); } else { uint64_t intval1, intval2; VERIFY(nvpair_value_uint64(p1, &intval1) == 0); VERIFY(nvpair_value_uint64(p2, &intval2) == 0); return (intval1 == intval2); } } /* * Remove properties from props if they are not going to change (as determined * by comparison with origprops). Remove them from origprops as well, since we * do not need to clear or restore properties that won't change. */ static void props_reduce(nvlist_t *props, nvlist_t *origprops) { nvpair_t *pair, *next_pair; if (origprops == NULL) return; /* all props need to be received */ pair = nvlist_next_nvpair(props, NULL); while (pair != NULL) { const char *propname = nvpair_name(pair); nvpair_t *match; next_pair = nvlist_next_nvpair(props, pair); if ((nvlist_lookup_nvpair(origprops, propname, &match) != 0) || !propval_equals(pair, match)) goto next; /* need to set received value */ /* don't clear the existing received value */ (void) nvlist_remove_nvpair(origprops, match); /* don't bother receiving the property */ (void) nvlist_remove_nvpair(props, pair); next: pair = next_pair; } } /* * Extract properties that cannot be set PRIOR to the receipt of a dataset. * For example, refquota cannot be set until after the receipt of a dataset, * because in replication streams, an older/earlier snapshot may exceed the * refquota. 
We want to receive the older/earlier snapshot, but setting * refquota pre-receipt will set the dsl's ACTUAL quota, which will prevent * the older/earlier snapshot from being received (with EDQUOT). * * The ZFS test "zfs_receive_011_pos" demonstrates such a scenario. * * libzfs will need to be judicious handling errors encountered by props * extracted by this function. */ static nvlist_t * extract_delay_props(nvlist_t *props) { nvlist_t *delayprops; nvpair_t *nvp, *tmp; static const zfs_prop_t delayable[] = { ZFS_PROP_REFQUOTA, ZFS_PROP_KEYLOCATION, 0 }; int i; VERIFY(nvlist_alloc(&delayprops, NV_UNIQUE_NAME, KM_SLEEP) == 0); for (nvp = nvlist_next_nvpair(props, NULL); nvp != NULL; nvp = nvlist_next_nvpair(props, nvp)) { /* * strcmp() is safe because zfs_prop_to_name() always returns * a bounded string. */ for (i = 0; delayable[i] != 0; i++) { if (strcmp(zfs_prop_to_name(delayable[i]), nvpair_name(nvp)) == 0) { break; } } if (delayable[i] != 0) { tmp = nvlist_prev_nvpair(props, nvp); VERIFY(nvlist_add_nvpair(delayprops, nvp) == 0); VERIFY(nvlist_remove_nvpair(props, nvp) == 0); nvp = tmp; } } if (nvlist_empty(delayprops)) { nvlist_free(delayprops); delayprops = NULL; } return (delayprops); } static void zfs_allow_log_destroy(void *arg) { char *poolname = arg; if (poolname != NULL) kmem_strfree(poolname); } #ifdef ZFS_DEBUG static boolean_t zfs_ioc_recv_inject_err; #endif /* * nvlist 'errors' is always allocated. It will contain descriptions of * encountered errors, if any. It's the callers responsibility to free. */ static int zfs_ioc_recv_impl(char *tofs, char *tosnap, char *origin, nvlist_t *recvprops, nvlist_t *localprops, nvlist_t *hidden_args, boolean_t force, boolean_t resumable, int input_fd, dmu_replay_record_t *begin_record, uint64_t *read_bytes, uint64_t *errflags, nvlist_t **errors) { dmu_recv_cookie_t drc; int error = 0; int props_error = 0; offset_t off, noff; nvlist_t *local_delayprops = NULL; nvlist_t *recv_delayprops = NULL; nvlist_t *origprops = NULL; /* existing properties */ nvlist_t *origrecvd = NULL; /* existing received properties */ boolean_t first_recvd_props = B_FALSE; boolean_t tofs_was_redacted; zfs_file_t *input_fp; *read_bytes = 0; *errflags = 0; *errors = fnvlist_alloc(); off = 0; if ((error = zfs_file_get(input_fd, &input_fp))) return (error); noff = off = zfs_file_off(input_fp); error = dmu_recv_begin(tofs, tosnap, begin_record, force, resumable, localprops, hidden_args, origin, &drc, input_fp, &off); if (error != 0) goto out; tofs_was_redacted = dsl_get_redacted(drc.drc_ds); /* * Set properties before we receive the stream so that they are applied * to the new data. Note that we must call dmu_recv_stream() if * dmu_recv_begin() succeeds. */ if (recvprops != NULL && !drc.drc_newfs) { if (spa_version(dsl_dataset_get_spa(drc.drc_ds)) >= SPA_VERSION_RECVD_PROPS && !dsl_prop_get_hasrecvd(tofs)) first_recvd_props = B_TRUE; /* * If new received properties are supplied, they are to * completely replace the existing received properties, * so stash away the existing ones. */ if (dsl_prop_get_received(tofs, &origrecvd) == 0) { nvlist_t *errlist = NULL; /* * Don't bother writing a property if its value won't * change (and avoid the unnecessary security checks). * * The first receive after SPA_VERSION_RECVD_PROPS is a * special case where we blow away all local properties * regardless. 
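 * In that case props_reduce() is skipped below, so every received
 * property is written rather than only those whose values changed.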
*/ if (!first_recvd_props) props_reduce(recvprops, origrecvd); if (zfs_check_clearable(tofs, origrecvd, &errlist) != 0) (void) nvlist_merge(*errors, errlist, 0); nvlist_free(errlist); if (clear_received_props(tofs, origrecvd, first_recvd_props ? NULL : recvprops) != 0) *errflags |= ZPROP_ERR_NOCLEAR; } else { *errflags |= ZPROP_ERR_NOCLEAR; } } /* * Stash away existing properties so we can restore them on error unless * we're doing the first receive after SPA_VERSION_RECVD_PROPS, in which * case "origrecvd" will take care of that. */ if (localprops != NULL && !drc.drc_newfs && !first_recvd_props) { objset_t *os; if (dmu_objset_hold(tofs, FTAG, &os) == 0) { if (dsl_prop_get_all(os, &origprops) != 0) { *errflags |= ZPROP_ERR_NOCLEAR; } dmu_objset_rele(os, FTAG); } else { *errflags |= ZPROP_ERR_NOCLEAR; } } if (recvprops != NULL) { props_error = dsl_prop_set_hasrecvd(tofs); if (props_error == 0) { recv_delayprops = extract_delay_props(recvprops); (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED, recvprops, *errors); } } if (localprops != NULL) { nvlist_t *oprops = fnvlist_alloc(); nvlist_t *xprops = fnvlist_alloc(); nvpair_t *nvp = NULL; while ((nvp = nvlist_next_nvpair(localprops, nvp)) != NULL) { if (nvpair_type(nvp) == DATA_TYPE_BOOLEAN) { /* -x property */ const char *name = nvpair_name(nvp); zfs_prop_t prop = zfs_name_to_prop(name); if (prop != ZPROP_INVAL) { if (!zfs_prop_inheritable(prop)) continue; } else if (!zfs_prop_user(name)) continue; fnvlist_add_boolean(xprops, name); } else { /* -o property=value */ fnvlist_add_nvpair(oprops, nvp); } } local_delayprops = extract_delay_props(oprops); (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_LOCAL, oprops, *errors); (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_INHERITED, xprops, *errors); nvlist_free(oprops); nvlist_free(xprops); } error = dmu_recv_stream(&drc, &off); if (error == 0) { zfsvfs_t *zfsvfs = NULL; zvol_state_handle_t *zv = NULL; if (getzfsvfs(tofs, &zfsvfs) == 0) { /* online recv */ dsl_dataset_t *ds; int end_err; boolean_t stream_is_redacted = DMU_GET_FEATUREFLAGS( begin_record->drr_u.drr_begin. drr_versioninfo) & DMU_BACKUP_FEATURE_REDACTED; ds = dmu_objset_ds(zfsvfs->z_os); error = zfs_suspend_fs(zfsvfs); /* * If the suspend fails, then the recv_end will * likely also fail, and clean up after itself. */ end_err = dmu_recv_end(&drc, zfsvfs); /* * If the dataset was not redacted, but we received a * redacted stream onto it, we need to unmount the * dataset. Otherwise, resume the filesystem. */ if (error == 0 && !drc.drc_newfs && stream_is_redacted && !tofs_was_redacted) { error = zfs_end_fs(zfsvfs, ds); } else if (error == 0) { error = zfs_resume_fs(zfsvfs, ds); } error = error ? error : end_err; zfs_vfs_rele(zfsvfs); } else if ((zv = zvol_suspend(tofs)) != NULL) { error = dmu_recv_end(&drc, zvol_tag(zv)); zvol_resume(zv); } else { error = dmu_recv_end(&drc, NULL); } /* Set delayed properties now, after we're done receiving. */ if (recv_delayprops != NULL && error == 0) { (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED, recv_delayprops, *errors); } if (local_delayprops != NULL && error == 0) { (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_LOCAL, local_delayprops, *errors); } } /* * Merge delayed props back in with initial props, in case * we're DEBUG and zfs_ioc_recv_inject_err is set (which means * we have to make sure clear_received_props() includes * the delayed properties). * * Since zfs_ioc_recv_inject_err is only in DEBUG kernels, * using ASSERT() will be just like a VERIFY. 
*/ if (recv_delayprops != NULL) { ASSERT(nvlist_merge(recvprops, recv_delayprops, 0) == 0); nvlist_free(recv_delayprops); } if (local_delayprops != NULL) { ASSERT(nvlist_merge(localprops, local_delayprops, 0) == 0); nvlist_free(local_delayprops); } *read_bytes = off - noff; #ifdef ZFS_DEBUG if (zfs_ioc_recv_inject_err) { zfs_ioc_recv_inject_err = B_FALSE; error = 1; } #endif /* * On error, restore the original props. */ if (error != 0 && recvprops != NULL && !drc.drc_newfs) { if (clear_received_props(tofs, recvprops, NULL) != 0) { /* * We failed to clear the received properties. * Since we may have left a $recvd value on the * system, we can't clear the $hasrecvd flag. */ *errflags |= ZPROP_ERR_NORESTORE; } else if (first_recvd_props) { dsl_prop_unset_hasrecvd(tofs); } if (origrecvd == NULL && !drc.drc_newfs) { /* We failed to stash the original properties. */ *errflags |= ZPROP_ERR_NORESTORE; } /* * dsl_props_set() will not convert RECEIVED to LOCAL on or * after SPA_VERSION_RECVD_PROPS, so we need to specify LOCAL * explicitly if we're restoring local properties cleared in the * first new-style receive. */ if (origrecvd != NULL && zfs_set_prop_nvlist(tofs, (first_recvd_props ? ZPROP_SRC_LOCAL : ZPROP_SRC_RECEIVED), origrecvd, NULL) != 0) { /* * We stashed the original properties but failed to * restore them. */ *errflags |= ZPROP_ERR_NORESTORE; } } if (error != 0 && localprops != NULL && !drc.drc_newfs && !first_recvd_props) { nvlist_t *setprops; nvlist_t *inheritprops; nvpair_t *nvp; if (origprops == NULL) { /* We failed to stash the original properties. */ *errflags |= ZPROP_ERR_NORESTORE; goto out; } /* Restore original props */ setprops = fnvlist_alloc(); inheritprops = fnvlist_alloc(); nvp = NULL; while ((nvp = nvlist_next_nvpair(localprops, nvp)) != NULL) { const char *name = nvpair_name(nvp); const char *source; nvlist_t *attrs; if (!nvlist_exists(origprops, name)) { /* * Property was not present or was explicitly * inherited before the receive, restore this. 
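			 * Restoring is done with ZPROP_SRC_INHERITED below,
			 * which reverts the property to its inherited value
			 * instead of pinning a local one.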
*/ fnvlist_add_boolean(inheritprops, name); continue; } attrs = fnvlist_lookup_nvlist(origprops, name); source = fnvlist_lookup_string(attrs, ZPROP_SOURCE); /* Skip received properties */ if (strcmp(source, ZPROP_SOURCE_VAL_RECVD) == 0) continue; if (strcmp(source, tofs) == 0) { /* Property was locally set */ fnvlist_add_nvlist(setprops, name, attrs); } else { /* Property was implicitly inherited */ fnvlist_add_boolean(inheritprops, name); } } if (zfs_set_prop_nvlist(tofs, ZPROP_SRC_LOCAL, setprops, NULL) != 0) *errflags |= ZPROP_ERR_NORESTORE; if (zfs_set_prop_nvlist(tofs, ZPROP_SRC_INHERITED, inheritprops, NULL) != 0) *errflags |= ZPROP_ERR_NORESTORE; nvlist_free(setprops); nvlist_free(inheritprops); } out: zfs_file_put(input_fd); nvlist_free(origrecvd); nvlist_free(origprops); if (error == 0) error = props_error; return (error); } /* * inputs: * zc_name name of containing filesystem (unused) * zc_nvlist_src{_size} nvlist of properties to apply * zc_nvlist_conf{_size} nvlist of properties to exclude * (DATA_TYPE_BOOLEAN) and override (everything else) * zc_value name of snapshot to create * zc_string name of clone origin (if DRR_FLAG_CLONE) * zc_cookie file descriptor to recv from * zc_begin_record the BEGIN record of the stream (not byteswapped) * zc_guid force flag * * outputs: * zc_cookie number of bytes read * zc_obj zprop_errflags_t * zc_nvlist_dst{_size} error for each unapplied received property */ static int zfs_ioc_recv(zfs_cmd_t *zc) { dmu_replay_record_t begin_record; nvlist_t *errors = NULL; nvlist_t *recvdprops = NULL; nvlist_t *localprops = NULL; char *origin = NULL; char *tosnap; char tofs[ZFS_MAX_DATASET_NAME_LEN]; int error = 0; if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 || strchr(zc->zc_value, '@') == NULL || strchr(zc->zc_value, '%')) return (SET_ERROR(EINVAL)); (void) strlcpy(tofs, zc->zc_value, sizeof (tofs)); tosnap = strchr(tofs, '@'); *tosnap++ = '\0'; if (zc->zc_nvlist_src != 0 && (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &recvdprops)) != 0) return (error); if (zc->zc_nvlist_conf != 0 && (error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &localprops)) != 0) return (error); if (zc->zc_string[0]) origin = zc->zc_string; begin_record.drr_type = DRR_BEGIN; begin_record.drr_payloadlen = 0; begin_record.drr_u.drr_begin = zc->zc_begin_record; error = zfs_ioc_recv_impl(tofs, tosnap, origin, recvdprops, localprops, NULL, zc->zc_guid, B_FALSE, zc->zc_cookie, &begin_record, &zc->zc_cookie, &zc->zc_obj, &errors); nvlist_free(recvdprops); nvlist_free(localprops); /* * Now that all props, initial and delayed, are set, report the prop * errors to the caller. */ if (zc->zc_nvlist_dst_size != 0 && errors != NULL && (nvlist_smush(errors, zc->zc_nvlist_dst_size) != 0 || put_nvlist(zc, errors) != 0)) { /* * Caller made zc->zc_nvlist_dst less than the minimum expected * size or supplied an invalid address. 
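		 * (nvlist_smush() above has already tried to shrink the
		 * errors nvlist to fit within zc_nvlist_dst_size.)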
*/ error = SET_ERROR(EINVAL); } nvlist_free(errors); return (error); } /* * innvl: { * "snapname" -> full name of the snapshot to create * (optional) "props" -> received properties to set (nvlist) * (optional) "localprops" -> override and exclude properties (nvlist) * (optional) "origin" -> name of clone origin (DRR_FLAG_CLONE) * "begin_record" -> non-byteswapped dmu_replay_record_t * "input_fd" -> file descriptor to read stream from (int32) * (optional) "force" -> force flag (value ignored) * (optional) "resumable" -> resumable flag (value ignored) * (optional) "cleanup_fd" -> unused * (optional) "action_handle" -> unused * (optional) "hidden_args" -> { "wkeydata" -> value } * } * * outnvl: { * "read_bytes" -> number of bytes read * "error_flags" -> zprop_errflags_t * "errors" -> error for each unapplied received property (nvlist) * } */ static const zfs_ioc_key_t zfs_keys_recv_new[] = { {"snapname", DATA_TYPE_STRING, 0}, {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL}, {"localprops", DATA_TYPE_NVLIST, ZK_OPTIONAL}, {"origin", DATA_TYPE_STRING, ZK_OPTIONAL}, {"begin_record", DATA_TYPE_BYTE_ARRAY, 0}, {"input_fd", DATA_TYPE_INT32, 0}, {"force", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"resumable", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"cleanup_fd", DATA_TYPE_INT32, ZK_OPTIONAL}, {"action_handle", DATA_TYPE_UINT64, ZK_OPTIONAL}, {"hidden_args", DATA_TYPE_NVLIST, ZK_OPTIONAL}, }; static int zfs_ioc_recv_new(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { dmu_replay_record_t *begin_record; uint_t begin_record_size; nvlist_t *errors = NULL; nvlist_t *recvprops = NULL; nvlist_t *localprops = NULL; nvlist_t *hidden_args = NULL; char *snapname; char *origin = NULL; char *tosnap; char tofs[ZFS_MAX_DATASET_NAME_LEN]; boolean_t force; boolean_t resumable; uint64_t read_bytes = 0; uint64_t errflags = 0; int input_fd = -1; int error; snapname = fnvlist_lookup_string(innvl, "snapname"); if (dataset_namecheck(snapname, NULL, NULL) != 0 || strchr(snapname, '@') == NULL || strchr(snapname, '%')) return (SET_ERROR(EINVAL)); (void) strlcpy(tofs, snapname, sizeof (tofs)); tosnap = strchr(tofs, '@'); *tosnap++ = '\0'; error = nvlist_lookup_string(innvl, "origin", &origin); if (error && error != ENOENT) return (error); error = nvlist_lookup_byte_array(innvl, "begin_record", (uchar_t **)&begin_record, &begin_record_size); if (error != 0 || begin_record_size != sizeof (*begin_record)) return (SET_ERROR(EINVAL)); input_fd = fnvlist_lookup_int32(innvl, "input_fd"); force = nvlist_exists(innvl, "force"); resumable = nvlist_exists(innvl, "resumable"); /* we still use "props" here for backwards compatibility */ error = nvlist_lookup_nvlist(innvl, "props", &recvprops); if (error && error != ENOENT) return (error); error = nvlist_lookup_nvlist(innvl, "localprops", &localprops); if (error && error != ENOENT) return (error); error = nvlist_lookup_nvlist(innvl, ZPOOL_HIDDEN_ARGS, &hidden_args); if (error && error != ENOENT) return (error); error = zfs_ioc_recv_impl(tofs, tosnap, origin, recvprops, localprops, hidden_args, force, resumable, input_fd, begin_record, &read_bytes, &errflags, &errors); fnvlist_add_uint64(outnvl, "read_bytes", read_bytes); fnvlist_add_uint64(outnvl, "error_flags", errflags); fnvlist_add_nvlist(outnvl, "errors", errors); nvlist_free(errors); nvlist_free(recvprops); nvlist_free(localprops); return (error); } typedef struct dump_bytes_io { zfs_file_t *dbi_fp; caddr_t dbi_buf; int dbi_len; int dbi_err; } dump_bytes_io_t; static void dump_bytes_cb(void *arg) { dump_bytes_io_t *dbi = (dump_bytes_io_t *)arg; 
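	/*
	 * May run inline or from a taskq thread (see dump_bytes() below);
	 * everything needed travels in the dump_bytes_io_t and the write
	 * result is handed back through dbi_err.
	 */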
zfs_file_t *fp; caddr_t buf; fp = dbi->dbi_fp; buf = dbi->dbi_buf; dbi->dbi_err = zfs_file_write(fp, buf, dbi->dbi_len, NULL); } static int dump_bytes(objset_t *os, void *buf, int len, void *arg) { dump_bytes_io_t dbi; dbi.dbi_fp = arg; dbi.dbi_buf = buf; dbi.dbi_len = len; #if defined(HAVE_LARGE_STACKS) dump_bytes_cb(&dbi); #else /* * The vn_rdwr() call is performed in a taskq to ensure that there is * always enough stack space to write safely to the target filesystem. * The ZIO_TYPE_FREE threads are used because there can be a lot of * them and they are used in vdev_file.c for a similar purpose. */ spa_taskq_dispatch_sync(dmu_objset_spa(os), ZIO_TYPE_FREE, ZIO_TASKQ_ISSUE, dump_bytes_cb, &dbi, TQ_SLEEP); #endif /* HAVE_LARGE_STACKS */ return (dbi.dbi_err); } /* * inputs: * zc_name name of snapshot to send * zc_cookie file descriptor to send stream to * zc_obj fromorigin flag (mutually exclusive with zc_fromobj) * zc_sendobj objsetid of snapshot to send * zc_fromobj objsetid of incremental fromsnap (may be zero) * zc_guid if set, estimate size of stream only. zc_cookie is ignored. * output size in zc_objset_type. * zc_flags lzc_send_flags * * outputs: * zc_objset_type estimated size, if zc_guid is set * * NOTE: This is no longer the preferred interface, any new functionality * should be added to zfs_ioc_send_new() instead. */ static int zfs_ioc_send(zfs_cmd_t *zc) { int error; offset_t off; boolean_t estimate = (zc->zc_guid != 0); boolean_t embedok = (zc->zc_flags & 0x1); boolean_t large_block_ok = (zc->zc_flags & 0x2); boolean_t compressok = (zc->zc_flags & 0x4); boolean_t rawok = (zc->zc_flags & 0x8); boolean_t savedok = (zc->zc_flags & 0x10); if (zc->zc_obj != 0) { dsl_pool_t *dp; dsl_dataset_t *tosnap; error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &tosnap); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } if (dsl_dir_is_clone(tosnap->ds_dir)) zc->zc_fromobj = dsl_dir_phys(tosnap->ds_dir)->dd_origin_obj; dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); } if (estimate) { dsl_pool_t *dp; dsl_dataset_t *tosnap; dsl_dataset_t *fromsnap = NULL; error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &tosnap); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } if (zc->zc_fromobj != 0) { error = dsl_dataset_hold_obj(dp, zc->zc_fromobj, FTAG, &fromsnap); if (error != 0) { dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); return (error); } } error = dmu_send_estimate_fast(tosnap, fromsnap, NULL, compressok || rawok, savedok, &zc->zc_objset_type); if (fromsnap != NULL) dsl_dataset_rele(fromsnap, FTAG); dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); } else { zfs_file_t *fp; dmu_send_outparams_t out = {0}; if ((error = zfs_file_get(zc->zc_cookie, &fp))) return (error); off = zfs_file_off(fp); out.dso_outfunc = dump_bytes; out.dso_arg = fp; out.dso_dryrun = B_FALSE; error = dmu_send_obj(zc->zc_name, zc->zc_sendobj, zc->zc_fromobj, embedok, large_block_ok, compressok, rawok, savedok, zc->zc_cookie, &off, &out); zfs_file_put(zc->zc_cookie); } return (error); } /* * inputs: * zc_name name of snapshot on which to report progress * zc_cookie file descriptor of send stream * * outputs: * zc_cookie number of bytes written in send stream thus far * zc_objset_type logical size of data traversed by send thus far */ static int zfs_ioc_send_progress(zfs_cmd_t *zc) { dsl_pool_t *dp; dsl_dataset_t *ds; 
dmu_sendstatus_t *dsp = NULL; int error; error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &ds); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } mutex_enter(&ds->ds_sendstream_lock); /* * Iterate over all the send streams currently active on this dataset. * If there's one which matches the specified file descriptor _and_ the * stream was started by the current process, return the progress of * that stream. */ for (dsp = list_head(&ds->ds_sendstreams); dsp != NULL; dsp = list_next(&ds->ds_sendstreams, dsp)) { if (dsp->dss_outfd == zc->zc_cookie && zfs_proc_is_caller(dsp->dss_proc)) break; } if (dsp != NULL) { zc->zc_cookie = atomic_cas_64((volatile uint64_t *)dsp->dss_off, 0, 0); /* This is the closest thing we have to atomic_read_64. */ zc->zc_objset_type = atomic_cas_64(&dsp->dss_blocks, 0, 0); } else { error = SET_ERROR(ENOENT); } mutex_exit(&ds->ds_sendstream_lock); dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); return (error); } static int zfs_ioc_inject_fault(zfs_cmd_t *zc) { int id, error; error = zio_inject_fault(zc->zc_name, (int)zc->zc_guid, &id, &zc->zc_inject_record); if (error == 0) zc->zc_guid = (uint64_t)id; return (error); } static int zfs_ioc_clear_fault(zfs_cmd_t *zc) { return (zio_clear_fault((int)zc->zc_guid)); } static int zfs_ioc_inject_list_next(zfs_cmd_t *zc) { int id = (int)zc->zc_guid; int error; error = zio_inject_list_next(&id, zc->zc_name, sizeof (zc->zc_name), &zc->zc_inject_record); zc->zc_guid = id; return (error); } static int zfs_ioc_error_log(zfs_cmd_t *zc) { spa_t *spa; int error; size_t count = (size_t)zc->zc_nvlist_dst_size; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); error = spa_get_errlog(spa, (void *)(uintptr_t)zc->zc_nvlist_dst, &count); if (error == 0) zc->zc_nvlist_dst_size = count; else zc->zc_nvlist_dst_size = spa_get_errlog_size(spa); spa_close(spa, FTAG); return (error); } static int zfs_ioc_clear(zfs_cmd_t *zc) { spa_t *spa; vdev_t *vd; int error; /* * On zpool clear we also fix up missing slogs */ mutex_enter(&spa_namespace_lock); spa = spa_lookup(zc->zc_name); if (spa == NULL) { mutex_exit(&spa_namespace_lock); return (SET_ERROR(EIO)); } if (spa_get_log_state(spa) == SPA_LOG_MISSING) { /* we need to let spa_open/spa_load clear the chains */ spa_set_log_state(spa, SPA_LOG_CLEAR); } spa->spa_last_open_failed = 0; mutex_exit(&spa_namespace_lock); if (zc->zc_cookie & ZPOOL_NO_REWIND) { error = spa_open(zc->zc_name, &spa, FTAG); } else { nvlist_t *policy; nvlist_t *config = NULL; if (zc->zc_nvlist_src == 0) return (SET_ERROR(EINVAL)); if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &policy)) == 0) { error = spa_open_rewind(zc->zc_name, &spa, FTAG, policy, &config); if (config != NULL) { int err; if ((err = put_nvlist(zc, config)) != 0) error = err; nvlist_free(config); } nvlist_free(policy); } } if (error != 0) return (error); /* * If multihost is enabled, resuming I/O is unsafe as another * host may have imported the pool. */ if (spa_multihost(spa) && spa_suspended(spa)) return (SET_ERROR(EINVAL)); spa_vdev_state_enter(spa, SCL_NONE); if (zc->zc_guid == 0) { vd = NULL; } else { vd = spa_lookup_by_guid(spa, zc->zc_guid, B_TRUE); if (vd == NULL) { error = SET_ERROR(ENODEV); (void) spa_vdev_state_exit(spa, NULL, error); spa_close(spa, FTAG); return (error); } } vdev_clear(spa, vd); (void) spa_vdev_state_exit(spa, spa_suspended(spa) ? 
NULL : spa->spa_root_vdev, 0); /* * Resume any suspended I/Os. */ if (zio_resume(spa) != 0) error = SET_ERROR(EIO); spa_close(spa, FTAG); return (error); } /* * Reopen all the vdevs associated with the pool. * * innvl: { * "scrub_restart" -> when true and scrub is running, allow to restart * scrub as the side effect of the reopen (boolean). * } * * outnvl is unused */ static const zfs_ioc_key_t zfs_keys_pool_reopen[] = { {"scrub_restart", DATA_TYPE_BOOLEAN_VALUE, ZK_OPTIONAL}, }; /* ARGSUSED */ static int zfs_ioc_pool_reopen(const char *pool, nvlist_t *innvl, nvlist_t *outnvl) { spa_t *spa; int error; boolean_t rc, scrub_restart = B_TRUE; if (innvl) { error = nvlist_lookup_boolean_value(innvl, "scrub_restart", &rc); if (error == 0) scrub_restart = rc; } error = spa_open(pool, &spa, FTAG); if (error != 0) return (error); spa_vdev_state_enter(spa, SCL_NONE); /* * If the scrub_restart flag is B_FALSE and a scrub is already * in progress then set spa_scrub_reopen flag to B_TRUE so that * we don't restart the scrub as a side effect of the reopen. * Otherwise, let vdev_open() decided if a resilver is required. */ spa->spa_scrub_reopen = (!scrub_restart && dsl_scan_scrubbing(spa->spa_dsl_pool)); vdev_reopen(spa->spa_root_vdev); spa->spa_scrub_reopen = B_FALSE; (void) spa_vdev_state_exit(spa, NULL, 0); spa_close(spa, FTAG); return (0); } /* * inputs: * zc_name name of filesystem * * outputs: * zc_string name of conflicting snapshot, if there is one */ static int zfs_ioc_promote(zfs_cmd_t *zc) { dsl_pool_t *dp; dsl_dataset_t *ds, *ods; char origin[ZFS_MAX_DATASET_NAME_LEN]; char *cp; int error; zc->zc_name[sizeof (zc->zc_name) - 1] = '\0'; if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0 || strchr(zc->zc_name, '%')) return (SET_ERROR(EINVAL)); error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &ds); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } if (!dsl_dir_is_clone(ds->ds_dir)) { dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); return (SET_ERROR(EINVAL)); } error = dsl_dataset_hold_obj(dp, dsl_dir_phys(ds->ds_dir)->dd_origin_obj, FTAG, &ods); if (error != 0) { dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); return (error); } dsl_dataset_name(ods, origin); dsl_dataset_rele(ods, FTAG); dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); /* * We don't need to unmount *all* the origin fs's snapshots, but * it's easier. */ cp = strchr(origin, '@'); if (cp) *cp = '\0'; (void) dmu_objset_find(origin, zfs_unmount_snap_cb, NULL, DS_FIND_SNAPSHOTS); return (dsl_dataset_promote(zc->zc_name, zc->zc_string)); } /* * Retrieve a single {user|group|project}{used|quota}@... property. * * inputs: * zc_name name of filesystem * zc_objset_type zfs_userquota_prop_t * zc_value domain name (eg. 
"S-1-234-567-89") * zc_guid RID/UID/GID * * outputs: * zc_cookie property value */ static int zfs_ioc_userspace_one(zfs_cmd_t *zc) { zfsvfs_t *zfsvfs; int error; if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS) return (SET_ERROR(EINVAL)); error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs, B_FALSE); if (error != 0) return (error); error = zfs_userspace_one(zfsvfs, zc->zc_objset_type, zc->zc_value, zc->zc_guid, &zc->zc_cookie); zfsvfs_rele(zfsvfs, FTAG); return (error); } /* * inputs: * zc_name name of filesystem * zc_cookie zap cursor * zc_objset_type zfs_userquota_prop_t * zc_nvlist_dst[_size] buffer to fill (not really an nvlist) * * outputs: * zc_nvlist_dst[_size] data buffer (array of zfs_useracct_t) * zc_cookie zap cursor */ static int zfs_ioc_userspace_many(zfs_cmd_t *zc) { zfsvfs_t *zfsvfs; int bufsize = zc->zc_nvlist_dst_size; if (bufsize <= 0) return (SET_ERROR(ENOMEM)); int error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs, B_FALSE); if (error != 0) return (error); void *buf = vmem_alloc(bufsize, KM_SLEEP); error = zfs_userspace_many(zfsvfs, zc->zc_objset_type, &zc->zc_cookie, buf, &zc->zc_nvlist_dst_size); if (error == 0) { error = xcopyout(buf, (void *)(uintptr_t)zc->zc_nvlist_dst, zc->zc_nvlist_dst_size); } vmem_free(buf, bufsize); zfsvfs_rele(zfsvfs, FTAG); return (error); } /* * inputs: * zc_name name of filesystem * * outputs: * none */ static int zfs_ioc_userspace_upgrade(zfs_cmd_t *zc) { int error = 0; zfsvfs_t *zfsvfs; if (getzfsvfs(zc->zc_name, &zfsvfs) == 0) { if (!dmu_objset_userused_enabled(zfsvfs->z_os)) { /* * If userused is not enabled, it may be because the * objset needs to be closed & reopened (to grow the * objset_phys_t). Suspend/resume the fs will do that. */ dsl_dataset_t *ds, *newds; ds = dmu_objset_ds(zfsvfs->z_os); error = zfs_suspend_fs(zfsvfs); if (error == 0) { dmu_objset_refresh_ownership(ds, &newds, B_TRUE, zfsvfs); error = zfs_resume_fs(zfsvfs, newds); } } if (error == 0) { mutex_enter(&zfsvfs->z_os->os_upgrade_lock); if (zfsvfs->z_os->os_upgrade_id == 0) { /* clear potential error code and retry */ zfsvfs->z_os->os_upgrade_status = 0; mutex_exit(&zfsvfs->z_os->os_upgrade_lock); dsl_pool_config_enter( dmu_objset_pool(zfsvfs->z_os), FTAG); dmu_objset_userspace_upgrade(zfsvfs->z_os); dsl_pool_config_exit( dmu_objset_pool(zfsvfs->z_os), FTAG); } else { mutex_exit(&zfsvfs->z_os->os_upgrade_lock); } taskq_wait_id(zfsvfs->z_os->os_spa->spa_upgrade_taskq, zfsvfs->z_os->os_upgrade_id); error = zfsvfs->z_os->os_upgrade_status; } zfs_vfs_rele(zfsvfs); } else { objset_t *os; /* XXX kind of reading contents without owning */ error = dmu_objset_hold_flags(zc->zc_name, B_TRUE, FTAG, &os); if (error != 0) return (error); mutex_enter(&os->os_upgrade_lock); if (os->os_upgrade_id == 0) { /* clear potential error code and retry */ os->os_upgrade_status = 0; mutex_exit(&os->os_upgrade_lock); dmu_objset_userspace_upgrade(os); } else { mutex_exit(&os->os_upgrade_lock); } dsl_pool_rele(dmu_objset_pool(os), FTAG); taskq_wait_id(os->os_spa->spa_upgrade_taskq, os->os_upgrade_id); error = os->os_upgrade_status; dsl_dataset_rele_flags(dmu_objset_ds(os), DS_HOLD_FLAG_DECRYPT, FTAG); } return (error); } /* * inputs: * zc_name name of filesystem * * outputs: * none */ static int zfs_ioc_id_quota_upgrade(zfs_cmd_t *zc) { objset_t *os; int error; error = dmu_objset_hold_flags(zc->zc_name, B_TRUE, FTAG, &os); if (error != 0) return (error); if (dmu_objset_userobjspace_upgradable(os) || dmu_objset_projectquota_upgradable(os)) { mutex_enter(&os->os_upgrade_lock); if (os->os_upgrade_id == 
0) { /* clear potential error code and retry */ os->os_upgrade_status = 0; mutex_exit(&os->os_upgrade_lock); dmu_objset_id_quota_upgrade(os); } else { mutex_exit(&os->os_upgrade_lock); } dsl_pool_rele(dmu_objset_pool(os), FTAG); taskq_wait_id(os->os_spa->spa_upgrade_taskq, os->os_upgrade_id); error = os->os_upgrade_status; } else { dsl_pool_rele(dmu_objset_pool(os), FTAG); } dsl_dataset_rele_flags(dmu_objset_ds(os), DS_HOLD_FLAG_DECRYPT, FTAG); return (error); } static int zfs_ioc_share(zfs_cmd_t *zc) { return (SET_ERROR(ENOSYS)); } ace_t full_access[] = { {(uid_t)-1, ACE_ALL_PERMS, ACE_EVERYONE, 0} }; /* * inputs: * zc_name name of containing filesystem * zc_obj object # beyond which we want next in-use object # * * outputs: * zc_obj next in-use object # */ static int zfs_ioc_next_obj(zfs_cmd_t *zc) { objset_t *os = NULL; int error; error = dmu_objset_hold(zc->zc_name, FTAG, &os); if (error != 0) return (error); error = dmu_object_next(os, &zc->zc_obj, B_FALSE, 0); dmu_objset_rele(os, FTAG); return (error); } /* * inputs: * zc_name name of filesystem * zc_value prefix name for snapshot * zc_cleanup_fd cleanup-on-exit file descriptor for calling process * * outputs: * zc_value short name of new snapshot */ static int zfs_ioc_tmp_snapshot(zfs_cmd_t *zc) { char *snap_name; char *hold_name; int error; minor_t minor; error = zfs_onexit_fd_hold(zc->zc_cleanup_fd, &minor); if (error != 0) return (error); snap_name = kmem_asprintf("%s-%016llx", zc->zc_value, (u_longlong_t)ddi_get_lbolt64()); hold_name = kmem_asprintf("%%%s", zc->zc_value); error = dsl_dataset_snapshot_tmp(zc->zc_name, snap_name, minor, hold_name); if (error == 0) (void) strlcpy(zc->zc_value, snap_name, sizeof (zc->zc_value)); kmem_strfree(snap_name); kmem_strfree(hold_name); zfs_onexit_fd_rele(zc->zc_cleanup_fd); return (error); } /* * inputs: * zc_name name of "to" snapshot * zc_value name of "from" snapshot * zc_cookie file descriptor to write diff data on * * outputs: * dmu_diff_record_t's to the file descriptor */ static int zfs_ioc_diff(zfs_cmd_t *zc) { zfs_file_t *fp; offset_t off; int error; if ((error = zfs_file_get(zc->zc_cookie, &fp))) return (error); off = zfs_file_off(fp); error = dmu_diff(zc->zc_name, zc->zc_value, fp, &off); zfs_file_put(zc->zc_cookie); return (error); } static int zfs_ioc_smb_acl(zfs_cmd_t *zc) { return (SET_ERROR(ENOTSUP)); } /* * innvl: { * "holds" -> { snapname -> holdname (string), ... } * (optional) "cleanup_fd" -> fd (int32) * } * * outnvl: { * snapname -> error value (int32) * ... * } */ static const zfs_ioc_key_t zfs_keys_hold[] = { {"holds", DATA_TYPE_NVLIST, 0}, {"cleanup_fd", DATA_TYPE_INT32, ZK_OPTIONAL}, }; /* ARGSUSED */ static int zfs_ioc_hold(const char *pool, nvlist_t *args, nvlist_t *errlist) { nvpair_t *pair; nvlist_t *holds; int cleanup_fd = -1; int error; minor_t minor = 0; holds = fnvlist_lookup_nvlist(args, "holds"); /* make sure the user didn't pass us any invalid (empty) tags */ for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL; pair = nvlist_next_nvpair(holds, pair)) { char *htag; error = nvpair_value_string(pair, &htag); if (error != 0) return (SET_ERROR(error)); if (strlen(htag) == 0) return (SET_ERROR(EINVAL)); } if (nvlist_lookup_int32(args, "cleanup_fd", &cleanup_fd) == 0) { error = zfs_onexit_fd_hold(cleanup_fd, &minor); if (error != 0) return (SET_ERROR(error)); } error = dsl_dataset_user_hold(holds, minor, errlist); if (minor != 0) zfs_onexit_fd_rele(cleanup_fd); return (SET_ERROR(error)); } /* * innvl is not used. 
* * outnvl: { * holdname -> time added (uint64 seconds since epoch) * ... * } */ static const zfs_ioc_key_t zfs_keys_get_holds[] = { /* no nvl keys */ }; /* ARGSUSED */ static int zfs_ioc_get_holds(const char *snapname, nvlist_t *args, nvlist_t *outnvl) { return (dsl_dataset_get_holds(snapname, outnvl)); } /* * innvl: { * snapname -> { holdname, ... } * ... * } * * outnvl: { * snapname -> error value (int32) * ... * } */ static const zfs_ioc_key_t zfs_keys_release[] = { {"...", DATA_TYPE_NVLIST, ZK_WILDCARDLIST}, }; /* ARGSUSED */ static int zfs_ioc_release(const char *pool, nvlist_t *holds, nvlist_t *errlist) { return (dsl_dataset_user_release(holds, errlist)); } /* * inputs: * zc_guid flags (ZEVENT_NONBLOCK) * zc_cleanup_fd zevent file descriptor * * outputs: * zc_nvlist_dst next nvlist event * zc_cookie dropped events since last get */ static int zfs_ioc_events_next(zfs_cmd_t *zc) { zfs_zevent_t *ze; nvlist_t *event = NULL; minor_t minor; uint64_t dropped = 0; int error; error = zfs_zevent_fd_hold(zc->zc_cleanup_fd, &minor, &ze); if (error != 0) return (error); do { error = zfs_zevent_next(ze, &event, &zc->zc_nvlist_dst_size, &dropped); if (event != NULL) { zc->zc_cookie = dropped; error = put_nvlist(zc, event); nvlist_free(event); } if (zc->zc_guid & ZEVENT_NONBLOCK) break; if ((error == 0) || (error != ENOENT)) break; error = zfs_zevent_wait(ze); if (error != 0) break; } while (1); zfs_zevent_fd_rele(zc->zc_cleanup_fd); return (error); } /* * outputs: * zc_cookie cleared events count */ static int zfs_ioc_events_clear(zfs_cmd_t *zc) { int count; zfs_zevent_drain_all(&count); zc->zc_cookie = count; return (0); } /* * inputs: * zc_guid eid | ZEVENT_SEEK_START | ZEVENT_SEEK_END * zc_cleanup zevent file descriptor */ static int zfs_ioc_events_seek(zfs_cmd_t *zc) { zfs_zevent_t *ze; minor_t minor; int error; error = zfs_zevent_fd_hold(zc->zc_cleanup_fd, &minor, &ze); if (error != 0) return (error); error = zfs_zevent_seek(ze, zc->zc_guid); zfs_zevent_fd_rele(zc->zc_cleanup_fd); return (error); } /* * inputs: * zc_name name of later filesystem or snapshot * zc_value full name of old snapshot or bookmark * * outputs: * zc_cookie space in bytes * zc_objset_type compressed space in bytes * zc_perm_action uncompressed space in bytes */ static int zfs_ioc_space_written(zfs_cmd_t *zc) { int error; dsl_pool_t *dp; dsl_dataset_t *new; error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &new); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } if (strchr(zc->zc_value, '#') != NULL) { zfs_bookmark_phys_t bmp; error = dsl_bookmark_lookup(dp, zc->zc_value, new, &bmp); if (error == 0) { error = dsl_dataset_space_written_bookmark(&bmp, new, &zc->zc_cookie, &zc->zc_objset_type, &zc->zc_perm_action); } } else { dsl_dataset_t *old; error = dsl_dataset_hold(dp, zc->zc_value, FTAG, &old); if (error == 0) { error = dsl_dataset_space_written(old, new, &zc->zc_cookie, &zc->zc_objset_type, &zc->zc_perm_action); dsl_dataset_rele(old, FTAG); } } dsl_dataset_rele(new, FTAG); dsl_pool_rele(dp, FTAG); return (error); } /* * innvl: { * "firstsnap" -> snapshot name * } * * outnvl: { * "used" -> space in bytes * "compressed" -> compressed space in bytes * "uncompressed" -> uncompressed space in bytes * } */ static const zfs_ioc_key_t zfs_keys_space_snaps[] = { {"firstsnap", DATA_TYPE_STRING, 0}, }; static int zfs_ioc_space_snaps(const char *lastsnap, nvlist_t *innvl, nvlist_t *outnvl) { int error; dsl_pool_t *dp; dsl_dataset_t *new, 
*old; char *firstsnap; uint64_t used, comp, uncomp; firstsnap = fnvlist_lookup_string(innvl, "firstsnap"); error = dsl_pool_hold(lastsnap, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, lastsnap, FTAG, &new); if (error == 0 && !new->ds_is_snapshot) { dsl_dataset_rele(new, FTAG); error = SET_ERROR(EINVAL); } if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } error = dsl_dataset_hold(dp, firstsnap, FTAG, &old); if (error == 0 && !old->ds_is_snapshot) { dsl_dataset_rele(old, FTAG); error = SET_ERROR(EINVAL); } if (error != 0) { dsl_dataset_rele(new, FTAG); dsl_pool_rele(dp, FTAG); return (error); } error = dsl_dataset_space_wouldfree(old, new, &used, &comp, &uncomp); dsl_dataset_rele(old, FTAG); dsl_dataset_rele(new, FTAG); dsl_pool_rele(dp, FTAG); fnvlist_add_uint64(outnvl, "used", used); fnvlist_add_uint64(outnvl, "compressed", comp); fnvlist_add_uint64(outnvl, "uncompressed", uncomp); return (error); } /* * innvl: { * "fd" -> file descriptor to write stream to (int32) * (optional) "fromsnap" -> full snap name to send an incremental from * (optional) "largeblockok" -> (value ignored) * indicates that blocks > 128KB are permitted * (optional) "embedok" -> (value ignored) * presence indicates DRR_WRITE_EMBEDDED records are permitted * (optional) "compressok" -> (value ignored) * presence indicates compressed DRR_WRITE records are permitted * (optional) "rawok" -> (value ignored) * presence indicates raw encrypted records should be used. * (optional) "savedok" -> (value ignored) * presence indicates we should send a partially received snapshot * (optional) "resume_object" and "resume_offset" -> (uint64) * if present, resume send stream from specified object and offset. * (optional) "redactbook" -> (string) * if present, use this bookmark's redaction list to generate a redacted * send stream * } * * outnvl is unused */ static const zfs_ioc_key_t zfs_keys_send_new[] = { {"fd", DATA_TYPE_INT32, 0}, {"fromsnap", DATA_TYPE_STRING, ZK_OPTIONAL}, {"largeblockok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"embedok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"compressok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"rawok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"savedok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"resume_object", DATA_TYPE_UINT64, ZK_OPTIONAL}, {"resume_offset", DATA_TYPE_UINT64, ZK_OPTIONAL}, {"redactbook", DATA_TYPE_STRING, ZK_OPTIONAL}, }; /* ARGSUSED */ static int zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) { int error; offset_t off; char *fromname = NULL; int fd; zfs_file_t *fp; boolean_t largeblockok; boolean_t embedok; boolean_t compressok; boolean_t rawok; boolean_t savedok; uint64_t resumeobj = 0; uint64_t resumeoff = 0; char *redactbook = NULL; fd = fnvlist_lookup_int32(innvl, "fd"); (void) nvlist_lookup_string(innvl, "fromsnap", &fromname); largeblockok = nvlist_exists(innvl, "largeblockok"); embedok = nvlist_exists(innvl, "embedok"); compressok = nvlist_exists(innvl, "compressok"); rawok = nvlist_exists(innvl, "rawok"); savedok = nvlist_exists(innvl, "savedok"); (void) nvlist_lookup_uint64(innvl, "resume_object", &resumeobj); (void) nvlist_lookup_uint64(innvl, "resume_offset", &resumeoff); (void) nvlist_lookup_string(innvl, "redactbook", &redactbook); if ((error = zfs_file_get(fd, &fp))) return (error); off = zfs_file_off(fp); dmu_send_outparams_t out = {0}; out.dso_outfunc = dump_bytes; out.dso_arg = fp; out.dso_dryrun = B_FALSE; error = dmu_send(snapname, fromname, embedok, largeblockok, compressok, rawok, savedok, resumeobj, resumeoff, 
redactbook, fd, &off, &out); zfs_file_put(fd); return (error); } /* ARGSUSED */ static int send_space_sum(objset_t *os, void *buf, int len, void *arg) { uint64_t *size = arg; *size += len; return (0); } /* * Determine approximately how large a zfs send stream will be -- the number * of bytes that will be written to the fd supplied to zfs_ioc_send_new(). * * innvl: { * (optional) "from" -> full snap or bookmark name to send an incremental * from * (optional) "largeblockok" -> (value ignored) * indicates that blocks > 128KB are permitted * (optional) "embedok" -> (value ignored) * presence indicates DRR_WRITE_EMBEDDED records are permitted * (optional) "compressok" -> (value ignored) * presence indicates compressed DRR_WRITE records are permitted * (optional) "rawok" -> (value ignored) * presence indicates raw encrypted records should be used. * (optional) "resume_object" and "resume_offset" -> (uint64) * if present, resume send stream from specified object and offset. * (optional) "fd" -> file descriptor to use as a cookie for progress * tracking (int32) * } * * outnvl: { * "space" -> bytes of space (uint64) * } */ static const zfs_ioc_key_t zfs_keys_send_space[] = { {"from", DATA_TYPE_STRING, ZK_OPTIONAL}, {"fromsnap", DATA_TYPE_STRING, ZK_OPTIONAL}, {"largeblockok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"embedok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"compressok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"rawok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"fd", DATA_TYPE_INT32, ZK_OPTIONAL}, {"redactbook", DATA_TYPE_STRING, ZK_OPTIONAL}, {"resume_object", DATA_TYPE_UINT64, ZK_OPTIONAL}, {"resume_offset", DATA_TYPE_UINT64, ZK_OPTIONAL}, {"bytes", DATA_TYPE_UINT64, ZK_OPTIONAL}, }; static int zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) { dsl_pool_t *dp; dsl_dataset_t *tosnap; dsl_dataset_t *fromsnap = NULL; int error; char *fromname = NULL; char *redactlist_book = NULL; boolean_t largeblockok; boolean_t embedok; boolean_t compressok; boolean_t rawok; boolean_t savedok; uint64_t space = 0; boolean_t full_estimate = B_FALSE; uint64_t resumeobj = 0; uint64_t resumeoff = 0; uint64_t resume_bytes = 0; int32_t fd = -1; zfs_bookmark_phys_t zbm = {0}; error = dsl_pool_hold(snapname, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, snapname, FTAG, &tosnap); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } (void) nvlist_lookup_int32(innvl, "fd", &fd); largeblockok = nvlist_exists(innvl, "largeblockok"); embedok = nvlist_exists(innvl, "embedok"); compressok = nvlist_exists(innvl, "compressok"); rawok = nvlist_exists(innvl, "rawok"); savedok = nvlist_exists(innvl, "savedok"); boolean_t from = (nvlist_lookup_string(innvl, "from", &fromname) == 0); boolean_t altbook = (nvlist_lookup_string(innvl, "redactbook", &redactlist_book) == 0); (void) nvlist_lookup_uint64(innvl, "resume_object", &resumeobj); (void) nvlist_lookup_uint64(innvl, "resume_offset", &resumeoff); (void) nvlist_lookup_uint64(innvl, "bytes", &resume_bytes); if (altbook) { full_estimate = B_TRUE; } else if (from) { if (strchr(fromname, '#')) { error = dsl_bookmark_lookup(dp, fromname, tosnap, &zbm); /* * dsl_bookmark_lookup() will fail with EXDEV if * the from-bookmark and tosnap are at the same txg. * However, it's valid to do a send (and therefore, * a send estimate) from and to the same time point, * if the bookmark is redacted (the incremental send * can change what's redacted on the target). In * this case, dsl_bookmark_lookup() fills in zbm * but returns EXDEV. Ignore this error. 
*/ if (error == EXDEV && zbm.zbm_redaction_obj != 0 && zbm.zbm_guid == dsl_dataset_phys(tosnap)->ds_guid) error = 0; if (error != 0) { dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); return (error); } if (zbm.zbm_redaction_obj != 0 || !(zbm.zbm_flags & ZBM_FLAG_HAS_FBN)) { full_estimate = B_TRUE; } } else if (strchr(fromname, '@')) { error = dsl_dataset_hold(dp, fromname, FTAG, &fromsnap); if (error != 0) { dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); return (error); } if (!dsl_dataset_is_before(tosnap, fromsnap, 0)) { full_estimate = B_TRUE; dsl_dataset_rele(fromsnap, FTAG); } } else { /* * from is not properly formatted as a snapshot or * bookmark */ dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); return (SET_ERROR(EINVAL)); } } if (full_estimate) { dmu_send_outparams_t out = {0}; offset_t off = 0; out.dso_outfunc = send_space_sum; out.dso_arg = &space; out.dso_dryrun = B_TRUE; /* * We have to release these holds so dmu_send can take them. It * will do all the error checking we need. */ dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); error = dmu_send(snapname, fromname, embedok, largeblockok, compressok, rawok, savedok, resumeobj, resumeoff, redactlist_book, fd, &off, &out); } else { error = dmu_send_estimate_fast(tosnap, fromsnap, (from && strchr(fromname, '#') != NULL ? &zbm : NULL), compressok || rawok, savedok, &space); space -= resume_bytes; if (fromsnap != NULL) dsl_dataset_rele(fromsnap, FTAG); dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); } fnvlist_add_uint64(outnvl, "space", space); return (error); } /* * Sync the currently open TXG to disk for the specified pool. * This is somewhat similar to 'zfs_sync()'. * For cases that do not result in error this ioctl will wait for * the currently open TXG to commit before returning back to the caller. * * innvl: { * "force" -> when true, force uberblock update even if there is no dirty data. * In addition this will cause the vdev configuration to be written * out including updating the zpool cache file. (boolean_t) * } * * onvl is unused */ static const zfs_ioc_key_t zfs_keys_pool_sync[] = { {"force", DATA_TYPE_BOOLEAN_VALUE, 0}, }; /* ARGSUSED */ static int zfs_ioc_pool_sync(const char *pool, nvlist_t *innvl, nvlist_t *onvl) { int err; boolean_t rc, force = B_FALSE; spa_t *spa; if ((err = spa_open(pool, &spa, FTAG)) != 0) return (err); if (innvl) { err = nvlist_lookup_boolean_value(innvl, "force", &rc); if (err == 0) force = rc; } if (force) { spa_config_enter(spa, SCL_CONFIG, FTAG, RW_WRITER); vdev_config_dirty(spa->spa_root_vdev); spa_config_exit(spa, SCL_CONFIG, FTAG); } txg_wait_synced(spa_get_dsl(spa), 0); spa_close(spa, FTAG); return (0); } /* * Load a user's wrapping key into the kernel. 
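 * This is typically reached from userspace via lzc_load_key() in
 * libzfs_core; the raw key material is carried in the "hidden_args"
 * nvlist (ZPOOL_HIDDEN_ARGS) and handed to
 * dsl_crypto_params_create_nvlist() below.
 *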
* innvl: { * "hidden_args" -> { "wkeydata" -> value } * raw uint8_t array of encryption wrapping key data (32 bytes) * (optional) "noop" -> (value ignored) * presence indicated key should only be verified, not loaded * } */ static const zfs_ioc_key_t zfs_keys_load_key[] = { {"hidden_args", DATA_TYPE_NVLIST, 0}, {"noop", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, }; /* ARGSUSED */ static int zfs_ioc_load_key(const char *dsname, nvlist_t *innvl, nvlist_t *outnvl) { int ret; dsl_crypto_params_t *dcp = NULL; nvlist_t *hidden_args; boolean_t noop = nvlist_exists(innvl, "noop"); if (strchr(dsname, '@') != NULL || strchr(dsname, '%') != NULL) { ret = SET_ERROR(EINVAL); goto error; } hidden_args = fnvlist_lookup_nvlist(innvl, ZPOOL_HIDDEN_ARGS); ret = dsl_crypto_params_create_nvlist(DCP_CMD_NONE, NULL, hidden_args, &dcp); if (ret != 0) goto error; ret = spa_keystore_load_wkey(dsname, dcp, noop); if (ret != 0) goto error; dsl_crypto_params_free(dcp, noop); return (0); error: dsl_crypto_params_free(dcp, B_TRUE); return (ret); } /* * Unload a user's wrapping key from the kernel. * Both innvl and outnvl are unused. */ static const zfs_ioc_key_t zfs_keys_unload_key[] = { /* no nvl keys */ }; /* ARGSUSED */ static int zfs_ioc_unload_key(const char *dsname, nvlist_t *innvl, nvlist_t *outnvl) { int ret = 0; if (strchr(dsname, '@') != NULL || strchr(dsname, '%') != NULL) { ret = (SET_ERROR(EINVAL)); goto out; } ret = spa_keystore_unload_wkey(dsname); if (ret != 0) goto out; out: return (ret); } /* * Changes a user's wrapping key used to decrypt a dataset. The keyformat, * keylocation, pbkdf2salt, and pbkdf2iters properties can also be specified * here to change how the key is derived in userspace. * * innvl: { * "hidden_args" (optional) -> { "wkeydata" -> value } * raw uint8_t array of new encryption wrapping key data (32 bytes) * "props" (optional) -> { prop -> value } * } * * outnvl is unused */ static const zfs_ioc_key_t zfs_keys_change_key[] = { {"crypt_cmd", DATA_TYPE_UINT64, ZK_OPTIONAL}, {"hidden_args", DATA_TYPE_NVLIST, ZK_OPTIONAL}, {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL}, }; /* ARGSUSED */ static int zfs_ioc_change_key(const char *dsname, nvlist_t *innvl, nvlist_t *outnvl) { int ret; uint64_t cmd = DCP_CMD_NONE; dsl_crypto_params_t *dcp = NULL; nvlist_t *args = NULL, *hidden_args = NULL; if (strchr(dsname, '@') != NULL || strchr(dsname, '%') != NULL) { ret = (SET_ERROR(EINVAL)); goto error; } (void) nvlist_lookup_uint64(innvl, "crypt_cmd", &cmd); (void) nvlist_lookup_nvlist(innvl, "props", &args); (void) nvlist_lookup_nvlist(innvl, ZPOOL_HIDDEN_ARGS, &hidden_args); ret = dsl_crypto_params_create_nvlist(cmd, args, hidden_args, &dcp); if (ret != 0) goto error; ret = spa_keystore_change_key(dsname, dcp); if (ret != 0) goto error; dsl_crypto_params_free(dcp, B_FALSE); return (0); error: dsl_crypto_params_free(dcp, B_TRUE); return (ret); } static zfs_ioc_vec_t zfs_ioc_vec[ZFS_IOC_LAST - ZFS_IOC_FIRST]; static void zfs_ioctl_register_legacy(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy, zfs_ioc_namecheck_t namecheck, boolean_t log_history, zfs_ioc_poolcheck_t pool_check) { zfs_ioc_vec_t *vec = &zfs_ioc_vec[ioc - ZFS_IOC_FIRST]; ASSERT3U(ioc, >=, ZFS_IOC_FIRST); ASSERT3U(ioc, <, ZFS_IOC_LAST); ASSERT3P(vec->zvec_legacy_func, ==, NULL); ASSERT3P(vec->zvec_func, ==, NULL); vec->zvec_legacy_func = func; vec->zvec_secpolicy = secpolicy; vec->zvec_namecheck = namecheck; vec->zvec_allow_log = log_history; vec->zvec_pool_check = pool_check; } /* * See the block comment at the beginning of 
this file for details on * each argument to this function. */ void zfs_ioctl_register(const char *name, zfs_ioc_t ioc, zfs_ioc_func_t *func, zfs_secpolicy_func_t *secpolicy, zfs_ioc_namecheck_t namecheck, zfs_ioc_poolcheck_t pool_check, boolean_t smush_outnvlist, boolean_t allow_log, const zfs_ioc_key_t *nvl_keys, size_t num_keys) { zfs_ioc_vec_t *vec = &zfs_ioc_vec[ioc - ZFS_IOC_FIRST]; ASSERT3U(ioc, >=, ZFS_IOC_FIRST); ASSERT3U(ioc, <, ZFS_IOC_LAST); ASSERT3P(vec->zvec_legacy_func, ==, NULL); ASSERT3P(vec->zvec_func, ==, NULL); /* if we are logging, the name must be valid */ ASSERT(!allow_log || namecheck != NO_NAME); vec->zvec_name = name; vec->zvec_func = func; vec->zvec_secpolicy = secpolicy; vec->zvec_namecheck = namecheck; vec->zvec_pool_check = pool_check; vec->zvec_smush_outnvlist = smush_outnvlist; vec->zvec_allow_log = allow_log; vec->zvec_nvl_keys = nvl_keys; vec->zvec_nvl_key_count = num_keys; } static void zfs_ioctl_register_pool(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy, boolean_t log_history, zfs_ioc_poolcheck_t pool_check) { zfs_ioctl_register_legacy(ioc, func, secpolicy, POOL_NAME, log_history, pool_check); } void zfs_ioctl_register_dataset_nolog(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy, zfs_ioc_poolcheck_t pool_check) { zfs_ioctl_register_legacy(ioc, func, secpolicy, DATASET_NAME, B_FALSE, pool_check); } static void zfs_ioctl_register_pool_modify(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func) { zfs_ioctl_register_legacy(ioc, func, zfs_secpolicy_config, POOL_NAME, B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); } static void zfs_ioctl_register_pool_meta(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy) { zfs_ioctl_register_legacy(ioc, func, secpolicy, NO_NAME, B_FALSE, POOL_CHECK_NONE); } static void zfs_ioctl_register_dataset_read_secpolicy(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy) { zfs_ioctl_register_legacy(ioc, func, secpolicy, DATASET_NAME, B_FALSE, POOL_CHECK_SUSPENDED); } static void zfs_ioctl_register_dataset_read(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func) { zfs_ioctl_register_dataset_read_secpolicy(ioc, func, zfs_secpolicy_read); } static void zfs_ioctl_register_dataset_modify(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy) { zfs_ioctl_register_legacy(ioc, func, secpolicy, DATASET_NAME, B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); } static void zfs_ioctl_init(void) { zfs_ioctl_register("snapshot", ZFS_IOC_SNAPSHOT, zfs_ioc_snapshot, zfs_secpolicy_snapshot, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_snapshot, ARRAY_SIZE(zfs_keys_snapshot)); zfs_ioctl_register("log_history", ZFS_IOC_LOG_HISTORY, zfs_ioc_log_history, zfs_secpolicy_log_history, NO_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE, zfs_keys_log_history, ARRAY_SIZE(zfs_keys_log_history)); zfs_ioctl_register("space_snaps", ZFS_IOC_SPACE_SNAPS, zfs_ioc_space_snaps, zfs_secpolicy_read, DATASET_NAME, POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE, zfs_keys_space_snaps, ARRAY_SIZE(zfs_keys_space_snaps)); zfs_ioctl_register("send", ZFS_IOC_SEND_NEW, zfs_ioc_send_new, zfs_secpolicy_send_new, DATASET_NAME, POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE, zfs_keys_send_new, ARRAY_SIZE(zfs_keys_send_new)); zfs_ioctl_register("send_space", ZFS_IOC_SEND_SPACE, zfs_ioc_send_space, zfs_secpolicy_read, DATASET_NAME, POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE, zfs_keys_send_space, ARRAY_SIZE(zfs_keys_send_space)); 
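	/*
	 * The string name passed to zfs_ioctl_register() is what gets logged
	 * to pool history (ZPOOL_HIST_IOCTL) when allow_log is set, so it is
	 * what users see from "zpool history -i".
	 */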
zfs_ioctl_register("create", ZFS_IOC_CREATE, zfs_ioc_create, zfs_secpolicy_create_clone, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_create, ARRAY_SIZE(zfs_keys_create)); zfs_ioctl_register("clone", ZFS_IOC_CLONE, zfs_ioc_clone, zfs_secpolicy_create_clone, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_clone, ARRAY_SIZE(zfs_keys_clone)); zfs_ioctl_register("remap", ZFS_IOC_REMAP, zfs_ioc_remap, zfs_secpolicy_none, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE, zfs_keys_remap, ARRAY_SIZE(zfs_keys_remap)); zfs_ioctl_register("destroy_snaps", ZFS_IOC_DESTROY_SNAPS, zfs_ioc_destroy_snaps, zfs_secpolicy_destroy_snaps, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_destroy_snaps, ARRAY_SIZE(zfs_keys_destroy_snaps)); zfs_ioctl_register("hold", ZFS_IOC_HOLD, zfs_ioc_hold, zfs_secpolicy_hold, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_hold, ARRAY_SIZE(zfs_keys_hold)); zfs_ioctl_register("release", ZFS_IOC_RELEASE, zfs_ioc_release, zfs_secpolicy_release, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_release, ARRAY_SIZE(zfs_keys_release)); zfs_ioctl_register("get_holds", ZFS_IOC_GET_HOLDS, zfs_ioc_get_holds, zfs_secpolicy_read, DATASET_NAME, POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE, zfs_keys_get_holds, ARRAY_SIZE(zfs_keys_get_holds)); zfs_ioctl_register("rollback", ZFS_IOC_ROLLBACK, zfs_ioc_rollback, zfs_secpolicy_rollback, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE, zfs_keys_rollback, ARRAY_SIZE(zfs_keys_rollback)); zfs_ioctl_register("bookmark", ZFS_IOC_BOOKMARK, zfs_ioc_bookmark, zfs_secpolicy_bookmark, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_bookmark, ARRAY_SIZE(zfs_keys_bookmark)); zfs_ioctl_register("get_bookmarks", ZFS_IOC_GET_BOOKMARKS, zfs_ioc_get_bookmarks, zfs_secpolicy_read, DATASET_NAME, POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE, zfs_keys_get_bookmarks, ARRAY_SIZE(zfs_keys_get_bookmarks)); zfs_ioctl_register("get_bookmark_props", ZFS_IOC_GET_BOOKMARK_PROPS, zfs_ioc_get_bookmark_props, zfs_secpolicy_read, ENTITY_NAME, POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE, zfs_keys_get_bookmark_props, ARRAY_SIZE(zfs_keys_get_bookmark_props)); zfs_ioctl_register("destroy_bookmarks", ZFS_IOC_DESTROY_BOOKMARKS, zfs_ioc_destroy_bookmarks, zfs_secpolicy_destroy_bookmarks, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_destroy_bookmarks, ARRAY_SIZE(zfs_keys_destroy_bookmarks)); zfs_ioctl_register("receive", ZFS_IOC_RECV_NEW, zfs_ioc_recv_new, zfs_secpolicy_recv_new, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_recv_new, ARRAY_SIZE(zfs_keys_recv_new)); zfs_ioctl_register("load-key", ZFS_IOC_LOAD_KEY, zfs_ioc_load_key, zfs_secpolicy_load_key, DATASET_NAME, POOL_CHECK_SUSPENDED, B_TRUE, B_TRUE, zfs_keys_load_key, ARRAY_SIZE(zfs_keys_load_key)); zfs_ioctl_register("unload-key", ZFS_IOC_UNLOAD_KEY, zfs_ioc_unload_key, zfs_secpolicy_load_key, DATASET_NAME, POOL_CHECK_SUSPENDED, B_TRUE, B_TRUE, zfs_keys_unload_key, ARRAY_SIZE(zfs_keys_unload_key)); zfs_ioctl_register("change-key", ZFS_IOC_CHANGE_KEY, zfs_ioc_change_key, zfs_secpolicy_change_key, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_change_key, ARRAY_SIZE(zfs_keys_change_key)); zfs_ioctl_register("sync", ZFS_IOC_POOL_SYNC, zfs_ioc_pool_sync, zfs_secpolicy_none, POOL_NAME, 
POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE, zfs_keys_pool_sync, ARRAY_SIZE(zfs_keys_pool_sync)); zfs_ioctl_register("reopen", ZFS_IOC_POOL_REOPEN, zfs_ioc_pool_reopen, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED, B_TRUE, B_TRUE, zfs_keys_pool_reopen, ARRAY_SIZE(zfs_keys_pool_reopen)); zfs_ioctl_register("channel_program", ZFS_IOC_CHANNEL_PROGRAM, zfs_ioc_channel_program, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_channel_program, ARRAY_SIZE(zfs_keys_channel_program)); zfs_ioctl_register("redact", ZFS_IOC_REDACT, zfs_ioc_redact, zfs_secpolicy_config, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_redact, ARRAY_SIZE(zfs_keys_redact)); zfs_ioctl_register("zpool_checkpoint", ZFS_IOC_POOL_CHECKPOINT, zfs_ioc_pool_checkpoint, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_pool_checkpoint, ARRAY_SIZE(zfs_keys_pool_checkpoint)); zfs_ioctl_register("zpool_discard_checkpoint", ZFS_IOC_POOL_DISCARD_CHECKPOINT, zfs_ioc_pool_discard_checkpoint, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_pool_discard_checkpoint, ARRAY_SIZE(zfs_keys_pool_discard_checkpoint)); zfs_ioctl_register("initialize", ZFS_IOC_POOL_INITIALIZE, zfs_ioc_pool_initialize, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_pool_initialize, ARRAY_SIZE(zfs_keys_pool_initialize)); zfs_ioctl_register("trim", ZFS_IOC_POOL_TRIM, zfs_ioc_pool_trim, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_pool_trim, ARRAY_SIZE(zfs_keys_pool_trim)); zfs_ioctl_register("wait", ZFS_IOC_WAIT, zfs_ioc_wait, zfs_secpolicy_none, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE, zfs_keys_pool_wait, ARRAY_SIZE(zfs_keys_pool_wait)); zfs_ioctl_register("wait_fs", ZFS_IOC_WAIT_FS, zfs_ioc_wait_fs, zfs_secpolicy_none, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE, zfs_keys_fs_wait, ARRAY_SIZE(zfs_keys_fs_wait)); zfs_ioctl_register("set_bootenv", ZFS_IOC_SET_BOOTENV, zfs_ioc_set_bootenv, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE, zfs_keys_set_bootenv, ARRAY_SIZE(zfs_keys_set_bootenv)); zfs_ioctl_register("get_bootenv", ZFS_IOC_GET_BOOTENV, zfs_ioc_get_bootenv, zfs_secpolicy_none, POOL_NAME, POOL_CHECK_SUSPENDED, B_FALSE, B_TRUE, zfs_keys_get_bootenv, ARRAY_SIZE(zfs_keys_get_bootenv)); /* IOCTLS that use the legacy function signature */ zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, zfs_ioc_pool_freeze, zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_READONLY); zfs_ioctl_register_pool(ZFS_IOC_POOL_CREATE, zfs_ioc_pool_create, zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE); zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_SCAN, zfs_ioc_pool_scan); zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_UPGRADE, zfs_ioc_pool_upgrade); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_ADD, zfs_ioc_vdev_add); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_REMOVE, zfs_ioc_vdev_remove); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SET_STATE, zfs_ioc_vdev_set_state); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_ATTACH, zfs_ioc_vdev_attach); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_DETACH, zfs_ioc_vdev_detach); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SETPATH, zfs_ioc_vdev_setpath); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SETFRU, zfs_ioc_vdev_setfru); 
zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_SET_PROPS, zfs_ioc_pool_set_props); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SPLIT, zfs_ioc_vdev_split); zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_REGUID, zfs_ioc_pool_reguid); zfs_ioctl_register_pool_meta(ZFS_IOC_POOL_CONFIGS, zfs_ioc_pool_configs, zfs_secpolicy_none); zfs_ioctl_register_pool_meta(ZFS_IOC_POOL_TRYIMPORT, zfs_ioc_pool_tryimport, zfs_secpolicy_config); zfs_ioctl_register_pool_meta(ZFS_IOC_INJECT_FAULT, zfs_ioc_inject_fault, zfs_secpolicy_inject); zfs_ioctl_register_pool_meta(ZFS_IOC_CLEAR_FAULT, zfs_ioc_clear_fault, zfs_secpolicy_inject); zfs_ioctl_register_pool_meta(ZFS_IOC_INJECT_LIST_NEXT, zfs_ioc_inject_list_next, zfs_secpolicy_inject); /* * pool destroy, and export don't log the history as part of * zfsdev_ioctl, but rather zfs_ioc_pool_export * does the logging of those commands. */ zfs_ioctl_register_pool(ZFS_IOC_POOL_DESTROY, zfs_ioc_pool_destroy, zfs_secpolicy_config, B_FALSE, POOL_CHECK_SUSPENDED); zfs_ioctl_register_pool(ZFS_IOC_POOL_EXPORT, zfs_ioc_pool_export, zfs_secpolicy_config, B_FALSE, POOL_CHECK_SUSPENDED); zfs_ioctl_register_pool(ZFS_IOC_POOL_STATS, zfs_ioc_pool_stats, zfs_secpolicy_read, B_FALSE, POOL_CHECK_NONE); zfs_ioctl_register_pool(ZFS_IOC_POOL_GET_PROPS, zfs_ioc_pool_get_props, zfs_secpolicy_read, B_FALSE, POOL_CHECK_NONE); zfs_ioctl_register_pool(ZFS_IOC_ERROR_LOG, zfs_ioc_error_log, zfs_secpolicy_inject, B_FALSE, POOL_CHECK_SUSPENDED); zfs_ioctl_register_pool(ZFS_IOC_DSOBJ_TO_DSNAME, zfs_ioc_dsobj_to_dsname, zfs_secpolicy_diff, B_FALSE, POOL_CHECK_SUSPENDED); zfs_ioctl_register_pool(ZFS_IOC_POOL_GET_HISTORY, zfs_ioc_pool_get_history, zfs_secpolicy_config, B_FALSE, POOL_CHECK_SUSPENDED); zfs_ioctl_register_pool(ZFS_IOC_POOL_IMPORT, zfs_ioc_pool_import, zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE); zfs_ioctl_register_pool(ZFS_IOC_CLEAR, zfs_ioc_clear, zfs_secpolicy_config, B_TRUE, POOL_CHECK_READONLY); zfs_ioctl_register_dataset_read(ZFS_IOC_SPACE_WRITTEN, zfs_ioc_space_written); zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_RECVD_PROPS, zfs_ioc_objset_recvd_props); zfs_ioctl_register_dataset_read(ZFS_IOC_NEXT_OBJ, zfs_ioc_next_obj); zfs_ioctl_register_dataset_read(ZFS_IOC_GET_FSACL, zfs_ioc_get_fsacl); zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_STATS, zfs_ioc_objset_stats); zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_ZPLPROPS, zfs_ioc_objset_zplprops); zfs_ioctl_register_dataset_read(ZFS_IOC_DATASET_LIST_NEXT, zfs_ioc_dataset_list_next); zfs_ioctl_register_dataset_read(ZFS_IOC_SNAPSHOT_LIST_NEXT, zfs_ioc_snapshot_list_next); zfs_ioctl_register_dataset_read(ZFS_IOC_SEND_PROGRESS, zfs_ioc_send_progress); zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_DIFF, zfs_ioc_diff, zfs_secpolicy_diff); zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_OBJ_TO_STATS, zfs_ioc_obj_to_stats, zfs_secpolicy_diff); zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_OBJ_TO_PATH, zfs_ioc_obj_to_path, zfs_secpolicy_diff); zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_USERSPACE_ONE, zfs_ioc_userspace_one, zfs_secpolicy_userspace_one); zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_USERSPACE_MANY, zfs_ioc_userspace_many, zfs_secpolicy_userspace_many); zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_SEND, zfs_ioc_send, zfs_secpolicy_send); zfs_ioctl_register_dataset_modify(ZFS_IOC_SET_PROP, zfs_ioc_set_prop, zfs_secpolicy_none); zfs_ioctl_register_dataset_modify(ZFS_IOC_DESTROY, zfs_ioc_destroy, zfs_secpolicy_destroy); zfs_ioctl_register_dataset_modify(ZFS_IOC_RENAME, zfs_ioc_rename, 
zfs_secpolicy_rename); zfs_ioctl_register_dataset_modify(ZFS_IOC_RECV, zfs_ioc_recv, zfs_secpolicy_recv); zfs_ioctl_register_dataset_modify(ZFS_IOC_PROMOTE, zfs_ioc_promote, zfs_secpolicy_promote); zfs_ioctl_register_dataset_modify(ZFS_IOC_INHERIT_PROP, zfs_ioc_inherit_prop, zfs_secpolicy_inherit_prop); zfs_ioctl_register_dataset_modify(ZFS_IOC_SET_FSACL, zfs_ioc_set_fsacl, zfs_secpolicy_set_fsacl); zfs_ioctl_register_dataset_nolog(ZFS_IOC_SHARE, zfs_ioc_share, zfs_secpolicy_share, POOL_CHECK_NONE); zfs_ioctl_register_dataset_nolog(ZFS_IOC_SMB_ACL, zfs_ioc_smb_acl, zfs_secpolicy_smb_acl, POOL_CHECK_NONE); zfs_ioctl_register_dataset_nolog(ZFS_IOC_USERSPACE_UPGRADE, zfs_ioc_userspace_upgrade, zfs_secpolicy_userspace_upgrade, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); zfs_ioctl_register_dataset_nolog(ZFS_IOC_TMP_SNAPSHOT, zfs_ioc_tmp_snapshot, zfs_secpolicy_tmp_snapshot, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); zfs_ioctl_register_legacy(ZFS_IOC_EVENTS_NEXT, zfs_ioc_events_next, zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_NONE); zfs_ioctl_register_legacy(ZFS_IOC_EVENTS_CLEAR, zfs_ioc_events_clear, zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_NONE); zfs_ioctl_register_legacy(ZFS_IOC_EVENTS_SEEK, zfs_ioc_events_seek, zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_NONE); zfs_ioctl_init_os(); } /* * Verify that for non-legacy ioctls the input nvlist * pairs match against the expected input. * * Possible errors are: * ZFS_ERR_IOC_ARG_UNAVAIL An unrecognized nvpair was encountered * ZFS_ERR_IOC_ARG_REQUIRED A required nvpair is missing * ZFS_ERR_IOC_ARG_BADTYPE Invalid type for nvpair */ static int zfs_check_input_nvpairs(nvlist_t *innvl, const zfs_ioc_vec_t *vec) { const zfs_ioc_key_t *nvl_keys = vec->zvec_nvl_keys; boolean_t required_keys_found = B_FALSE; /* * examine each input pair */ for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) { char *name = nvpair_name(pair); data_type_t type = nvpair_type(pair); boolean_t identified = B_FALSE; /* * check pair against the documented names and type */ for (int k = 0; k < vec->zvec_nvl_key_count; k++) { /* if not a wild card name, check for an exact match */ if ((nvl_keys[k].zkey_flags & ZK_WILDCARDLIST) == 0 && strcmp(nvl_keys[k].zkey_name, name) != 0) continue; identified = B_TRUE; if (nvl_keys[k].zkey_type != DATA_TYPE_ANY && nvl_keys[k].zkey_type != type) { return (SET_ERROR(ZFS_ERR_IOC_ARG_BADTYPE)); } if (nvl_keys[k].zkey_flags & ZK_OPTIONAL) continue; required_keys_found = B_TRUE; break; } /* allow an 'optional' key, everything else is invalid */ if (!identified && (strcmp(name, "optional") != 0 || type != DATA_TYPE_NVLIST)) { return (SET_ERROR(ZFS_ERR_IOC_ARG_UNAVAIL)); } } /* verify that all required keys were found */ for (int k = 0; k < vec->zvec_nvl_key_count; k++) { if (nvl_keys[k].zkey_flags & ZK_OPTIONAL) continue; if (nvl_keys[k].zkey_flags & ZK_WILDCARDLIST) { /* at least one non-optional key is expected here */ if (!required_keys_found) return (SET_ERROR(ZFS_ERR_IOC_ARG_REQUIRED)); continue; } if (!nvlist_exists(innvl, nvl_keys[k].zkey_name)) return (SET_ERROR(ZFS_ERR_IOC_ARG_REQUIRED)); } return (0); } static int pool_status_check(const char *name, zfs_ioc_namecheck_t type, zfs_ioc_poolcheck_t check) { spa_t *spa; int error; ASSERT(type == POOL_NAME || type == DATASET_NAME || type == ENTITY_NAME); if (check & POOL_CHECK_NONE) return (0); error = spa_open(name, &spa, FTAG); if (error == 0) { if ((check & POOL_CHECK_SUSPENDED) && spa_suspended(spa)) error = 
SET_ERROR(EAGAIN); else if ((check & POOL_CHECK_READONLY) && !spa_writeable(spa)) error = SET_ERROR(EROFS); spa_close(spa, FTAG); } return (error); } int zfsdev_getminor(int fd, minor_t *minorp) { zfsdev_state_t *zs, *fpd; zfs_file_t *fp; int rc; ASSERT(!MUTEX_HELD(&zfsdev_state_lock)); if ((rc = zfs_file_get(fd, &fp))) return (rc); fpd = zfs_file_private(fp); if (fpd == NULL) return (SET_ERROR(EBADF)); mutex_enter(&zfsdev_state_lock); for (zs = zfsdev_state_list; zs != NULL; zs = zs->zs_next) { if (zs->zs_minor == -1) continue; if (fpd == zs) { *minorp = fpd->zs_minor; mutex_exit(&zfsdev_state_lock); return (0); } } mutex_exit(&zfsdev_state_lock); return (SET_ERROR(EBADF)); } static void * zfsdev_get_state_impl(minor_t minor, enum zfsdev_state_type which) { zfsdev_state_t *zs; for (zs = zfsdev_state_list; zs != NULL; zs = zs->zs_next) { if (zs->zs_minor == minor) { smp_rmb(); switch (which) { case ZST_ONEXIT: return (zs->zs_onexit); case ZST_ZEVENT: return (zs->zs_zevent); case ZST_ALL: return (zs); } } } return (NULL); } void * zfsdev_get_state(minor_t minor, enum zfsdev_state_type which) { void *ptr; ptr = zfsdev_get_state_impl(minor, which); return (ptr); } /* * Find a free minor number. The zfsdev_state_list is expected to * be short since it is only a list of currently open file handles. */ minor_t zfsdev_minor_alloc(void) { static minor_t last_minor = 0; minor_t m; ASSERT(MUTEX_HELD(&zfsdev_state_lock)); for (m = last_minor + 1; m != last_minor; m++) { if (m > ZFSDEV_MAX_MINOR) m = 1; if (zfsdev_get_state_impl(m, ZST_ALL) == NULL) { last_minor = m; return (m); } } return (0); } long zfsdev_ioctl_common(uint_t vecnum, zfs_cmd_t *zc, int flag) { int error, cmd; const zfs_ioc_vec_t *vec; char *saved_poolname = NULL; uint64_t max_nvlist_src_size; size_t saved_poolname_len = 0; nvlist_t *innvl = NULL; fstrans_cookie_t cookie; hrtime_t start_time = gethrtime(); cmd = vecnum; error = 0; if (vecnum >= sizeof (zfs_ioc_vec) / sizeof (zfs_ioc_vec[0])) return (SET_ERROR(ZFS_ERR_IOC_CMD_UNAVAIL)); vec = &zfs_ioc_vec[vecnum]; /* * The registered ioctl list may be sparse, verify that either * a normal or legacy handler are registered. */ if (vec->zvec_func == NULL && vec->zvec_legacy_func == NULL) return (SET_ERROR(ZFS_ERR_IOC_CMD_UNAVAIL)); zc->zc_iflags = flag & FKIOCTL; max_nvlist_src_size = zfs_max_nvlist_src_size_os(); if (zc->zc_nvlist_src_size > max_nvlist_src_size) { /* * Make sure the user doesn't pass in an insane value for * zc_nvlist_src_size. We have to check, since we will end * up allocating that much memory inside of get_nvlist(). This * prevents a nefarious user from allocating tons of kernel * memory. * * Also, we return EINVAL instead of ENOMEM here. The reason * being that returning ENOMEM from an ioctl() has a special * connotation; that the user's size value is too small and * needs to be expanded to hold the nvlist. See * zcmd_expand_dst_nvlist() for details. */ error = SET_ERROR(EINVAL); /* User's size too big */ } else if (zc->zc_nvlist_src_size != 0) { error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &innvl); if (error != 0) goto out; } /* * Ensure that all pool/dataset names are valid before we pass down to * the lower layers. 
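* The name in zc_name is checked syntactically according to the vector's zvec_namecheck type (pool, dataset, or entity name), and pool_status_check() then enforces the vector's POOL_CHECK_SUSPENDED / POOL_CHECK_READONLY requirements before any handler runs (NO_NAME vectors skip both checks).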
*/ zc->zc_name[sizeof (zc->zc_name) - 1] = '\0'; switch (vec->zvec_namecheck) { case POOL_NAME: if (pool_namecheck(zc->zc_name, NULL, NULL) != 0) error = SET_ERROR(EINVAL); else error = pool_status_check(zc->zc_name, vec->zvec_namecheck, vec->zvec_pool_check); break; case DATASET_NAME: if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0) error = SET_ERROR(EINVAL); else error = pool_status_check(zc->zc_name, vec->zvec_namecheck, vec->zvec_pool_check); break; case ENTITY_NAME: if (entity_namecheck(zc->zc_name, NULL, NULL) != 0) { error = SET_ERROR(EINVAL); } else { error = pool_status_check(zc->zc_name, vec->zvec_namecheck, vec->zvec_pool_check); } break; case NO_NAME: break; } /* * Ensure that all input pairs are valid before we pass them down * to the lower layers. * * The vectored functions can use fnvlist_lookup_{type} for any * required pairs since zfs_check_input_nvpairs() confirmed that * they exist and are of the correct type. */ if (error == 0 && vec->zvec_func != NULL) { error = zfs_check_input_nvpairs(innvl, vec); if (error != 0) goto out; } if (error == 0) { cookie = spl_fstrans_mark(); error = vec->zvec_secpolicy(zc, innvl, CRED()); spl_fstrans_unmark(cookie); } if (error != 0) goto out; /* legacy ioctls can modify zc_name */ /* * Can't use kmem_strdup() as we might truncate the string and * kmem_strfree() would then free with incorrect size. */ saved_poolname_len = strlen(zc->zc_name) + 1; saved_poolname = kmem_alloc(saved_poolname_len, KM_SLEEP); strlcpy(saved_poolname, zc->zc_name, saved_poolname_len); saved_poolname[strcspn(saved_poolname, "/@#")] = '\0'; if (vec->zvec_func != NULL) { nvlist_t *outnvl; int puterror = 0; spa_t *spa; nvlist_t *lognv = NULL; ASSERT(vec->zvec_legacy_func == NULL); /* * Add the innvl to the lognv before calling the func, * in case the func changes the innvl. */ if (vec->zvec_allow_log) { lognv = fnvlist_alloc(); fnvlist_add_string(lognv, ZPOOL_HIST_IOCTL, vec->zvec_name); if (!nvlist_empty(innvl)) { fnvlist_add_nvlist(lognv, ZPOOL_HIST_INPUT_NVL, innvl); } } outnvl = fnvlist_alloc(); cookie = spl_fstrans_mark(); error = vec->zvec_func(zc->zc_name, innvl, outnvl); spl_fstrans_unmark(cookie); /* * Some commands can partially execute, modify state, and still * return an error. In these cases, attempt to record what * was modified. 
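* In practice this covers ZFS_IOC_CHANNEL_PROGRAM below: a channel program can commit changes and then fail, so its history entry is still logged on error, except for EINVAL (presumably because the program never ran).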
*/ if ((error == 0 || (cmd == ZFS_IOC_CHANNEL_PROGRAM && error != EINVAL)) && vec->zvec_allow_log && spa_open(zc->zc_name, &spa, FTAG) == 0) { if (!nvlist_empty(outnvl)) { size_t out_size = fnvlist_size(outnvl); if (out_size > zfs_history_output_max) { fnvlist_add_int64(lognv, ZPOOL_HIST_OUTPUT_SIZE, out_size); } else { fnvlist_add_nvlist(lognv, ZPOOL_HIST_OUTPUT_NVL, outnvl); } } if (error != 0) { fnvlist_add_int64(lognv, ZPOOL_HIST_ERRNO, error); } fnvlist_add_int64(lognv, ZPOOL_HIST_ELAPSED_NS, gethrtime() - start_time); (void) spa_history_log_nvl(spa, lognv); spa_close(spa, FTAG); } fnvlist_free(lognv); if (!nvlist_empty(outnvl) || zc->zc_nvlist_dst_size != 0) { int smusherror = 0; if (vec->zvec_smush_outnvlist) { smusherror = nvlist_smush(outnvl, zc->zc_nvlist_dst_size); } if (smusherror == 0) puterror = put_nvlist(zc, outnvl); } if (puterror != 0) error = puterror; nvlist_free(outnvl); } else { cookie = spl_fstrans_mark(); error = vec->zvec_legacy_func(zc); spl_fstrans_unmark(cookie); } out: nvlist_free(innvl); if (error == 0 && vec->zvec_allow_log) { char *s = tsd_get(zfs_allow_log_key); if (s != NULL) kmem_strfree(s); (void) tsd_set(zfs_allow_log_key, kmem_strdup(saved_poolname)); } if (saved_poolname != NULL) kmem_free(saved_poolname, saved_poolname_len); return (error); } int zfs_kmod_init(void) { int error; if ((error = zvol_init()) != 0) return (error); spa_init(SPA_MODE_READ | SPA_MODE_WRITE); zfs_init(); zfs_ioctl_init(); mutex_init(&zfsdev_state_lock, NULL, MUTEX_DEFAULT, NULL); zfsdev_state_list = kmem_zalloc(sizeof (zfsdev_state_t), KM_SLEEP); zfsdev_state_list->zs_minor = -1; if ((error = zfsdev_attach()) != 0) goto out; tsd_create(&zfs_fsyncer_key, NULL); tsd_create(&rrw_tsd_key, rrw_tsd_destroy); tsd_create(&zfs_allow_log_key, zfs_allow_log_destroy); return (0); out: zfs_fini(); spa_fini(); zvol_fini(); return (error); } void zfs_kmod_fini(void) { zfsdev_state_t *zs, *zsnext = NULL; zfsdev_detach(); mutex_destroy(&zfsdev_state_lock); for (zs = zfsdev_state_list; zs != NULL; zs = zsnext) { zsnext = zs->zs_next; if (zs->zs_onexit) zfs_onexit_destroy(zs->zs_onexit); if (zs->zs_zevent) zfs_zevent_destroy(zs->zs_zevent); kmem_free(zs, sizeof (zfsdev_state_t)); } zfs_ereport_taskq_fini(); /* run before zfs_fini() on Linux */ zfs_fini(); spa_fini(); zvol_fini(); tsd_destroy(&zfs_fsyncer_key); tsd_destroy(&rrw_tsd_key); tsd_destroy(&zfs_allow_log_key); } /* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs, zfs_, max_nvlist_src_size, ULONG, ZMOD_RW, "Maximum size in bytes allowed for src nvlist passed with ZFS ioctls"); ZFS_MODULE_PARAM(zfs, zfs_, history_output_max, ULONG, ZMOD_RW, "Maximum size in bytes of ZFS ioctl output that will be logged"); /* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/zfs_vnops.c b/sys/contrib/openzfs/module/zfs/zfs_vnops.c index 61d5f06c6455..a35c17f86f93 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_vnops.c +++ b/sys/contrib/openzfs/module/zfs/zfs_vnops.c @@ -1,897 +1,897 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
* If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2015 by Chunwei Chen. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. */ /* Portions Copyright 2007 Jeremy Teo */ /* Portions Copyright 2010 Robert Milkowski */ #include #include #include #include #include -#include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static ulong_t zfs_fsync_sync_cnt = 4; int zfs_fsync(znode_t *zp, int syncflag, cred_t *cr) { zfsvfs_t *zfsvfs = ZTOZSB(zp); (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt); if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) { ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); zil_commit(zfsvfs->z_log, zp->z_id); ZFS_EXIT(zfsvfs); } tsd_set(zfs_fsyncer_key, NULL); return (0); } #if defined(SEEK_HOLE) && defined(SEEK_DATA) /* * Lseek support for finding holes (cmd == SEEK_HOLE) and * data (cmd == SEEK_DATA). "off" is an in/out parameter. */ static int zfs_holey_common(znode_t *zp, ulong_t cmd, loff_t *off) { uint64_t noff = (uint64_t)*off; /* new offset */ uint64_t file_sz; int error; boolean_t hole; file_sz = zp->z_size; if (noff >= file_sz) { return (SET_ERROR(ENXIO)); } if (cmd == F_SEEK_HOLE) hole = B_TRUE; else hole = B_FALSE; error = dmu_offset_next(ZTOZSB(zp)->z_os, zp->z_id, hole, &noff); if (error == ESRCH) return (SET_ERROR(ENXIO)); /* file was dirty, so fall back to using generic logic */ if (error == EBUSY) { if (hole) *off = file_sz; return (0); } /* * We could find a hole that begins after the logical end-of-file, * because dmu_offset_next() only works on whole blocks. If the * EOF falls mid-block, then indicate that the "virtual hole" * at the end of the file begins at the logical EOF, rather than * at the end of the last block. */ if (noff > file_sz) { ASSERT(hole); noff = file_sz; } if (noff < *off) return (error); *off = noff; return (error); } int zfs_holey(znode_t *zp, ulong_t cmd, loff_t *off) { zfsvfs_t *zfsvfs = ZTOZSB(zp); int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); error = zfs_holey_common(zp, cmd, off); ZFS_EXIT(zfsvfs); return (error); } #endif /* SEEK_HOLE && SEEK_DATA */ /*ARGSUSED*/ int zfs_access(znode_t *zp, int mode, int flag, cred_t *cr) { zfsvfs_t *zfsvfs = ZTOZSB(zp); int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if (flag & V_ACE_MASK) error = zfs_zaccess(zp, mode, flag, B_FALSE, cr); else error = zfs_zaccess_rwx(zp, mode, flag, cr); ZFS_EXIT(zfsvfs); return (error); } static unsigned long zfs_vnops_read_chunk_size = 1024 * 1024; /* Tunable */ /* * Read bytes from specified file into supplied buffer. * * IN: zp - inode of file to be read from. * uio - structure supplying read location, range info, * and return buffer. * ioflag - O_SYNC flags; used to provide FRSYNC semantics. * O_DIRECT flag; used to bypass page cache. * cr - credentials of caller. * * OUT: uio - updated offset and range, buffer filled. * * RETURN: 0 on success, error code on failure. 
* * Side Effects: * inode - atime updated if byte count > 0 */ /* ARGSUSED */ int zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) { int error = 0; boolean_t frsync = B_FALSE; zfsvfs_t *zfsvfs = ZTOZSB(zp); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if (zp->z_pflags & ZFS_AV_QUARANTINED) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EACCES)); } /* We don't copy out anything useful for directories. */ if (Z_ISDIR(ZTOTYPE(zp))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EISDIR)); } /* * Validate file offset */ if (zfs_uio_offset(uio) < (offset_t)0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } /* * Fasttrack empty reads */ if (zfs_uio_resid(uio) == 0) { ZFS_EXIT(zfsvfs); return (0); } #ifdef FRSYNC /* * If we're in FRSYNC mode, sync out this znode before reading it. * Only do this for non-snapshots. * * Some platforms do not support FRSYNC and instead map it * to O_SYNC, which results in unnecessary calls to zil_commit. We * only honor FRSYNC requests on platforms which support it. */ frsync = !!(ioflag & FRSYNC); #endif if (zfsvfs->z_log && (frsync || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)) zil_commit(zfsvfs->z_log, zp->z_id); /* * Lock the range against changes. */ zfs_locked_range_t *lr = zfs_rangelock_enter(&zp->z_rangelock, zfs_uio_offset(uio), zfs_uio_resid(uio), RL_READER); /* * If we are reading past end-of-file we can skip * to the end; but we might still need to set atime. */ if (zfs_uio_offset(uio) >= zp->z_size) { error = 0; goto out; } ASSERT(zfs_uio_offset(uio) < zp->z_size); ssize_t n = MIN(zfs_uio_resid(uio), zp->z_size - zfs_uio_offset(uio)); ssize_t start_resid = n; while (n > 0) { ssize_t nbytes = MIN(n, zfs_vnops_read_chunk_size - P2PHASE(zfs_uio_offset(uio), zfs_vnops_read_chunk_size)); #ifdef UIO_NOCOPY if (zfs_uio_segflg(uio) == UIO_NOCOPY) error = mappedread_sf(zp, nbytes, uio); else #endif if (zn_has_cached_data(zp) && !(ioflag & O_DIRECT)) { error = mappedread(zp, nbytes, uio); } else { error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, nbytes); } if (error) { /* convert checksum errors into IO errors */ if (error == ECKSUM) error = SET_ERROR(EIO); break; } n -= nbytes; } int64_t nread = start_resid - n; dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nread); task_io_account_read(nread); out: zfs_rangelock_exit(lr); ZFS_ACCESSTIME_STAMP(zfsvfs, zp); ZFS_EXIT(zfsvfs); return (error); } /* * Write the bytes to a file. * * IN: zp - znode of file to be written to. * uio - structure supplying write location, range info, * and data buffer. * ioflag - O_APPEND flag set if in append mode. * O_DIRECT flag; used to bypass page cache. * cr - credentials of caller. * * OUT: uio - updated offset and range. * * RETURN: 0 if success * error code if failure * * Timestamps: * ip - ctime|mtime updated if byte count > 0 */ /* ARGSUSED */ int zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) { int error = 0; ssize_t start_resid = zfs_uio_resid(uio); /* * Fasttrack empty write */ ssize_t n = start_resid; if (n == 0) return (0); zfsvfs_t *zfsvfs = ZTOZSB(zp); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); sa_bulk_attr_t bulk[4]; int count = 0; uint64_t mtime[2], ctime[2]; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, &zp->z_size, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); /* * Callers might not be able to detect properly that we are read-only, * so check it explicitly here. 
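* (For example, the dataset may have been switched to read-only after the file was opened.)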
*/ if (zfs_is_readonly(zfsvfs)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EROFS)); } /* * If immutable or not appending then return EPERM */ if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) || ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & O_APPEND) && (zfs_uio_offset(uio) < zp->z_size))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } /* * Validate file offset */ offset_t woff = ioflag & O_APPEND ? zp->z_size : zfs_uio_offset(uio); if (woff < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } const uint64_t max_blksz = zfsvfs->z_max_blksz; /* * Pre-fault the pages to ensure slow (eg NFS) pages * don't hold up txg. * Skip this if uio contains loaned arc_buf. */ if (zfs_uio_prefaultpages(MIN(n, max_blksz), uio)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EFAULT)); } /* * If in append mode, set the io offset pointer to eof. */ zfs_locked_range_t *lr; if (ioflag & O_APPEND) { /* * Obtain an appending range lock to guarantee file append * semantics. We reset the write offset once we have the lock. */ lr = zfs_rangelock_enter(&zp->z_rangelock, 0, n, RL_APPEND); woff = lr->lr_offset; if (lr->lr_length == UINT64_MAX) { /* * We overlocked the file because this write will cause * the file block size to increase. * Note that zp_size cannot change with this lock held. */ woff = zp->z_size; } zfs_uio_setoffset(uio, woff); } else { /* * Note that if the file block size will change as a result of * this write, then this range lock will lock the entire file * so that we can re-write the block safely. */ lr = zfs_rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER); } if (zn_rlimit_fsize(zp, uio)) { zfs_rangelock_exit(lr); ZFS_EXIT(zfsvfs); return (SET_ERROR(EFBIG)); } const rlim64_t limit = MAXOFFSET_T; if (woff >= limit) { zfs_rangelock_exit(lr); ZFS_EXIT(zfsvfs); return (SET_ERROR(EFBIG)); } if (n > limit - woff) n = limit - woff; uint64_t end_size = MAX(zp->z_size, woff + n); zilog_t *zilog = zfsvfs->z_log; const uint64_t uid = KUID_TO_SUID(ZTOUID(zp)); const uint64_t gid = KGID_TO_SGID(ZTOGID(zp)); const uint64_t projid = zp->z_projid; /* * Write the file in reasonable size chunks. Each chunk is written * in a separate transaction; this keeps the intent log records small * and allows us to do more fine-grained space accounting. */ while (n > 0) { woff = zfs_uio_offset(uio); if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, uid) || zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, gid) || (projid != ZFS_DEFAULT_PROJID && zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT, projid))) { error = SET_ERROR(EDQUOT); break; } arc_buf_t *abuf = NULL; if (n >= max_blksz && woff >= zp->z_size && P2PHASE(woff, max_blksz) == 0 && zp->z_blksz == max_blksz) { /* * This write covers a full block. "Borrow" a buffer * from the dmu so that we can fill it before we enter * a transaction. This avoids the possibility of * holding up the transaction if the data copy hangs * up on a pagefault (e.g., from an NFS server mapping). */ size_t cbytes; abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), max_blksz); ASSERT(abuf != NULL); ASSERT(arc_buf_size(abuf) == max_blksz); if ((error = zfs_uiocopy(abuf->b_data, max_blksz, UIO_WRITE, uio, &cbytes))) { dmu_return_arcbuf(abuf); break; } ASSERT3S(cbytes, ==, max_blksz); } /* * Start a transaction. 
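* The tx holds the SA handle and the byte range being written (capped at max_blksz); dmu_tx_assign(TXG_WAIT) waits for a txg that can accept the holds, and on failure the tx is aborted and any borrowed ARC buffer is returned.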
*/ dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl); DB_DNODE_ENTER(db); dmu_tx_hold_write_by_dnode(tx, DB_DNODE(db), woff, MIN(n, max_blksz)); DB_DNODE_EXIT(db); zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); if (abuf != NULL) dmu_return_arcbuf(abuf); break; } /* * If rangelock_enter() over-locked we grow the blocksize * and then reduce the lock range. This will only happen * on the first iteration since rangelock_reduce() will * shrink down lr_length to the appropriate size. */ if (lr->lr_length == UINT64_MAX) { uint64_t new_blksz; if (zp->z_blksz > max_blksz) { /* * File's blocksize is already larger than the * "recordsize" property. Only let it grow to * the next power of 2. */ ASSERT(!ISP2(zp->z_blksz)); new_blksz = MIN(end_size, 1 << highbit64(zp->z_blksz)); } else { new_blksz = MIN(end_size, max_blksz); } zfs_grow_blocksize(zp, new_blksz, tx); zfs_rangelock_reduce(lr, woff, n); } /* * XXX - should we really limit each write to z_max_blksz? * Perhaps we should use SPA_MAXBLOCKSIZE chunks? */ const ssize_t nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz)); ssize_t tx_bytes; if (abuf == NULL) { tx_bytes = zfs_uio_resid(uio); zfs_uio_fault_disable(uio, B_TRUE); error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, nbytes, tx); zfs_uio_fault_disable(uio, B_FALSE); #ifdef __linux__ if (error == EFAULT) { dmu_tx_commit(tx); /* * Account for partial writes before * continuing the loop. * Update needs to occur before the next * zfs_uio_prefaultpages, or prefaultpages may * error, and we may break the loop early. */ if (tx_bytes != zfs_uio_resid(uio)) n -= tx_bytes - zfs_uio_resid(uio); if (zfs_uio_prefaultpages(MIN(n, max_blksz), uio)) { break; } continue; } #endif if (error != 0) { dmu_tx_commit(tx); break; } tx_bytes -= zfs_uio_resid(uio); } else { /* Implied by abuf != NULL: */ ASSERT3S(n, >=, max_blksz); ASSERT0(P2PHASE(woff, max_blksz)); /* * We can simplify nbytes to MIN(n, max_blksz) since * P2PHASE(woff, max_blksz) is 0, and knowing * n >= max_blksz lets us simplify further: */ ASSERT3S(nbytes, ==, max_blksz); /* * Thus, we're writing a full block at a block-aligned * offset and extending the file past EOF. * * dmu_assign_arcbuf_by_dbuf() will directly assign the * arc buffer to a dbuf. */ error = dmu_assign_arcbuf_by_dbuf( sa_get_db(zp->z_sa_hdl), woff, abuf, tx); if (error != 0) { dmu_return_arcbuf(abuf); dmu_tx_commit(tx); break; } ASSERT3S(nbytes, <=, zfs_uio_resid(uio)); zfs_uioskip(uio, nbytes); tx_bytes = nbytes; } if (tx_bytes && zn_has_cached_data(zp) && !(ioflag & O_DIRECT)) { update_pages(zp, woff, tx_bytes, zfsvfs->z_os); } /* * If we made no progress, we're done. If we made even * partial progress, update the znode and ZIL accordingly. */ if (tx_bytes == 0) { (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), (void *)&zp->z_size, sizeof (uint64_t), tx); dmu_tx_commit(tx); ASSERT(error != 0); break; } /* * Clear Set-UID/Set-GID bits on successful write if not * privileged and at least one of the execute bits is set. * * It would be nice to do this after all writes have * been done, but that would still expose the ISUID/ISGID * to another app after the partial write is committed. * * Note: we don't call zfs_fuid_map_id() here because * user 0 is not an ephemeral uid. 
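* The updated mode is written with sa_update() inside the same transaction as the data, so the cleared bits are committed in the same txg as the write that triggered them.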
*/ mutex_enter(&zp->z_acl_lock); if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) | (S_IXUSR >> 6))) != 0 && (zp->z_mode & (S_ISUID | S_ISGID)) != 0 && secpolicy_vnode_setid_retain(zp, cr, ((zp->z_mode & S_ISUID) != 0 && uid == 0)) != 0) { uint64_t newmode; zp->z_mode &= ~(S_ISUID | S_ISGID); newmode = zp->z_mode; (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), (void *)&newmode, sizeof (uint64_t), tx); } mutex_exit(&zp->z_acl_lock); zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); /* * Update the file size (zp_size) if it has changed; * account for possible concurrent updates. */ while ((end_size = zp->z_size) < zfs_uio_offset(uio)) { (void) atomic_cas_64(&zp->z_size, end_size, zfs_uio_offset(uio)); ASSERT(error == 0); } /* * If we are replaying and eof is non zero then force * the file size to the specified eof. Note, there's no * concurrency during replay. */ if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0) zp->z_size = zfsvfs->z_replay_eof; error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag, NULL, NULL); dmu_tx_commit(tx); if (error != 0) break; ASSERT3S(tx_bytes, ==, nbytes); n -= nbytes; if (n > 0) { if (zfs_uio_prefaultpages(MIN(n, max_blksz), uio)) { error = SET_ERROR(EFAULT); break; } } } zfs_znode_update_vfs(zp); zfs_rangelock_exit(lr); /* * If we're in replay mode, or we made no progress, or the * uio data is inaccessible return an error. Otherwise, it's * at least a partial write, so it's successful. */ if (zfsvfs->z_replay || zfs_uio_resid(uio) == start_resid || error == EFAULT) { ZFS_EXIT(zfsvfs); return (error); } if (ioflag & (O_SYNC | O_DSYNC) || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, zp->z_id); const int64_t nwritten = start_resid - zfs_uio_resid(uio); dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, nwritten); task_io_account_write(nwritten); ZFS_EXIT(zfsvfs); return (0); } /*ARGSUSED*/ int zfs_getsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr) { zfsvfs_t *zfsvfs = ZTOZSB(zp); int error; boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); error = zfs_getacl(zp, vsecp, skipaclchk, cr); ZFS_EXIT(zfsvfs); return (error); } /*ARGSUSED*/ int zfs_setsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr) { zfsvfs_t *zfsvfs = ZTOZSB(zp); int error; boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; zilog_t *zilog = zfsvfs->z_log; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); error = zfs_setacl(zp, vsecp, skipaclchk, cr); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } #ifdef ZFS_DEBUG static int zil_fault_io = 0; #endif static void zfs_get_done(zgd_t *zgd, int error); /* * Get data to generate a TX_WRITE intent log record. */ int zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) { zfsvfs_t *zfsvfs = arg; objset_t *os = zfsvfs->z_os; znode_t *zp; uint64_t object = lr->lr_foid; uint64_t offset = lr->lr_offset; uint64_t size = lr->lr_length; dmu_buf_t *db; zgd_t *zgd; int error = 0; ASSERT3P(lwb, !=, NULL); ASSERT3P(zio, !=, NULL); ASSERT3U(size, !=, 0); /* * Nothing to do if the file has been removed */ if (zfs_zget(zfsvfs, object, &zp) != 0) return (SET_ERROR(ENOENT)); if (zp->z_unlinked) { /* * Release the vnode asynchronously as we currently have the * txg stopped from syncing. 
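* (A synchronous release could require work that itself waits on the stopped txg, so the hold is dropped via zfs_zrele_async() instead.)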
*/ zfs_zrele_async(zp); return (SET_ERROR(ENOENT)); } zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP); zgd->zgd_lwb = lwb; zgd->zgd_private = zp; /* * Write records come in two flavors: immediate and indirect. * For small writes it's cheaper to store the data with the * log record (immediate); for large writes it's cheaper to * sync the data and get a pointer to it (indirect) so that * we don't have to write the data twice. */ if (buf != NULL) { /* immediate write */ zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock, offset, size, RL_READER); /* test for truncation needs to be done while range locked */ if (offset >= zp->z_size) { error = SET_ERROR(ENOENT); } else { error = dmu_read(os, object, offset, size, buf, DMU_READ_NO_PREFETCH); } ASSERT(error == 0 || error == ENOENT); } else { /* indirect write */ /* * Have to lock the whole block to ensure when it's * written out and its checksum is being calculated * that no one can change the data. We need to re-check * blocksize after we get the lock in case it's changed! */ for (;;) { uint64_t blkoff; size = zp->z_blksz; blkoff = ISP2(size) ? P2PHASE(offset, size) : offset; offset -= blkoff; zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock, offset, size, RL_READER); if (zp->z_blksz == size) break; offset += blkoff; zfs_rangelock_exit(zgd->zgd_lr); } /* test for truncation needs to be done while range locked */ if (lr->lr_offset >= zp->z_size) error = SET_ERROR(ENOENT); #ifdef ZFS_DEBUG if (zil_fault_io) { error = SET_ERROR(EIO); zil_fault_io = 0; } #endif if (error == 0) error = dmu_buf_hold(os, object, offset, zgd, &db, DMU_READ_NO_PREFETCH); if (error == 0) { blkptr_t *bp = &lr->lr_blkptr; zgd->zgd_db = db; zgd->zgd_bp = bp; ASSERT(db->db_offset == offset); ASSERT(db->db_size == size); error = dmu_sync(zio, lr->lr_common.lrc_txg, zfs_get_done, zgd); ASSERT(error || lr->lr_length <= size); /* * On success, we need to wait for the write I/O * initiated by dmu_sync() to complete before we can * release this dbuf. We will finish everything up * in the zfs_get_done() callback. */ if (error == 0) return (0); if (error == EALREADY) { lr->lr_common.lrc_txtype = TX_WRITE2; /* * TX_WRITE2 relies on the data previously * written by the TX_WRITE that caused * EALREADY. We zero out the BP because * it is the old, currently-on-disk BP. */ zgd->zgd_bp = NULL; BP_ZERO(bp); error = 0; } } } zfs_get_done(zgd, error); return (error); } /* ARGSUSED */ static void zfs_get_done(zgd_t *zgd, int error) { znode_t *zp = zgd->zgd_private; if (zgd->zgd_db) dmu_buf_rele(zgd->zgd_db, zgd); zfs_rangelock_exit(zgd->zgd_lr); /* * Release the vnode asynchronously as we currently have the * txg stopped from syncing. */ zfs_zrele_async(zp); kmem_free(zgd, sizeof (zgd_t)); } EXPORT_SYMBOL(zfs_access); EXPORT_SYMBOL(zfs_fsync); EXPORT_SYMBOL(zfs_holey); EXPORT_SYMBOL(zfs_read); EXPORT_SYMBOL(zfs_write); EXPORT_SYMBOL(zfs_getsecattr); EXPORT_SYMBOL(zfs_setsecattr); ZFS_MODULE_PARAM(zfs_vnops, zfs_vnops_, read_chunk_size, ULONG, ZMOD_RW, "Bytes to read per chunk"); diff --git a/sys/contrib/openzfs/module/zfs/zio.c b/sys/contrib/openzfs/module/zfs/zio.c index 7f3cb19d46db..262ca24b1443 100644 --- a/sys/contrib/openzfs/module/zfs/zio.c +++ b/sys/contrib/openzfs/module/zfs/zio.c @@ -1,5039 +1,5036 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. 
* * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019, Klara Inc. * Copyright (c) 2019, Allan Jude */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * ========================================================================== * I/O type descriptions * ========================================================================== */ const char *zio_type_name[ZIO_TYPES] = { /* * Note: Linux kernel thread name length is limited * so these names will differ from upstream open zfs. */ "z_null", "z_rd", "z_wr", "z_fr", "z_cl", "z_ioctl", "z_trim" }; int zio_dva_throttle_enabled = B_TRUE; int zio_deadman_log_all = B_FALSE; /* * ========================================================================== * I/O kmem caches * ========================================================================== */ kmem_cache_t *zio_cache; kmem_cache_t *zio_link_cache; kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; #if defined(ZFS_DEBUG) && !defined(_KERNEL) uint64_t zio_buf_cache_allocs[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; uint64_t zio_buf_cache_frees[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; #endif /* Mark IOs as "slow" if they take longer than 30 seconds */ int zio_slow_io_ms = (30 * MILLISEC); #define BP_SPANB(indblkshift, level) \ (((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT))) #define COMPARE_META_LEVEL 0x80000000ul /* * The following actions directly effect the spa's sync-to-convergence logic. * The values below define the sync pass when we start performing the action. * Care should be taken when changing these values as they directly impact * spa_sync() performance. Tuning these values may introduce subtle performance * pathologies and should only be done in the context of performance analysis. * These tunables will eventually be removed and replaced with #defines once * enough analysis has been done to determine optimal values. * * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that * regular blocks are not deferred. * * Starting in sync pass 8 (zfs_sync_pass_dont_compress), we disable * compression (including of metadata). In practice, we don't have this * many sync passes, so this has no effect. * * The original intent was that disabling compression would help the sync * passes to converge. However, in practice disabling compression increases * the average number of sync passes, because when we turn compression off, a * lot of block's size will change and thus we have to re-allocate (not * overwrite) them. 
It also increases the number of 128KB allocations (e.g. * for indirect blocks and spacemaps) because these will not be compressed. * The 128K allocations are especially detrimental to performance on highly * fragmented systems, which may have very few free segments of this size, * and may need to load new metaslabs to satisfy 128K allocations. */ int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */ int zfs_sync_pass_dont_compress = 8; /* don't compress starting in this pass */ int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */ /* * An allocating zio is one that either currently has the DVA allocate * stage set or will have it later in its lifetime. */ #define IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE) /* * Enable smaller cores by excluding metadata * allocations as well. */ int zio_exclude_metadata = 0; int zio_requeue_io_start_cut_in_line = 1; #ifdef ZFS_DEBUG int zio_buf_debug_limit = 16384; #else int zio_buf_debug_limit = 0; #endif static inline void __zio_execute(zio_t *zio); static void zio_taskq_dispatch(zio_t *, zio_taskq_type_t, boolean_t); void zio_init(void) { size_t c; zio_cache = kmem_cache_create("zio_cache", sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0); zio_link_cache = kmem_cache_create("zio_link_cache", sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0); /* * For small buffers, we want a cache for each multiple of * SPA_MINBLOCKSIZE. For larger buffers, we want a cache * for each quarter-power of 2. */ for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { size_t size = (c + 1) << SPA_MINBLOCKSHIFT; size_t p2 = size; size_t align = 0; size_t data_cflags, cflags; data_cflags = KMC_NODEBUG; cflags = (zio_exclude_metadata || size > zio_buf_debug_limit) ? KMC_NODEBUG : 0; #if defined(_ILP32) && defined(_KERNEL) /* * Cache size limited to 1M on 32-bit platforms until ARC * buffers no longer require virtual address space. */ if (size > zfs_max_recordsize) break; #endif while (!ISP2(p2)) p2 &= p2 - 1; #ifndef _KERNEL /* * If we are using watchpoints, put each buffer on its own page, * to eliminate the performance overhead of trapping to the * kernel when modifying a non-watched buffer that shares the * page with a watched buffer. */ if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE)) continue; /* * Here's the problem - on 4K native devices in userland on * Linux using O_DIRECT, buffers must be 4K aligned or I/O * will fail with EINVAL, causing zdb (and others) to coredump. * Since userland probably doesn't need optimized buffer caches, * we just force 4K alignment on everything. */ align = 8 * SPA_MINBLOCKSIZE; #else if (size < PAGESIZE) { align = SPA_MINBLOCKSIZE; } else if (IS_P2ALIGNED(size, p2 >> 2)) { align = PAGESIZE; } #endif if (align != 0) { char name[36]; if (cflags == data_cflags) { /* * Resulting kmem caches would be identical. * Save memory by creating only one. 
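* In that case a single "zio_buf_comb_%lu" cache backs both zio_buf_cache[c] and zio_data_buf_cache[c] for this size class.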
*/ (void) snprintf(name, sizeof (name), "zio_buf_comb_%lu", (ulong_t)size); zio_buf_cache[c] = kmem_cache_create(name, size, align, NULL, NULL, NULL, NULL, NULL, cflags); zio_data_buf_cache[c] = zio_buf_cache[c]; continue; } (void) snprintf(name, sizeof (name), "zio_buf_%lu", (ulong_t)size); zio_buf_cache[c] = kmem_cache_create(name, size, align, NULL, NULL, NULL, NULL, NULL, cflags); (void) snprintf(name, sizeof (name), "zio_data_buf_%lu", (ulong_t)size); zio_data_buf_cache[c] = kmem_cache_create(name, size, align, NULL, NULL, NULL, NULL, NULL, data_cflags); } } while (--c != 0) { ASSERT(zio_buf_cache[c] != NULL); if (zio_buf_cache[c - 1] == NULL) zio_buf_cache[c - 1] = zio_buf_cache[c]; ASSERT(zio_data_buf_cache[c] != NULL); if (zio_data_buf_cache[c - 1] == NULL) zio_data_buf_cache[c - 1] = zio_data_buf_cache[c]; } zio_inject_init(); lz4_init(); } void zio_fini(void) { size_t i, j, n; kmem_cache_t *cache; n = SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; #if defined(ZFS_DEBUG) && !defined(_KERNEL) for (i = 0; i < n; i++) { if (zio_buf_cache_allocs[i] != zio_buf_cache_frees[i]) (void) printf("zio_fini: [%d] %llu != %llu\n", (int)((i + 1) << SPA_MINBLOCKSHIFT), (long long unsigned)zio_buf_cache_allocs[i], (long long unsigned)zio_buf_cache_frees[i]); } #endif /* * The same kmem cache can show up multiple times in both zio_buf_cache * and zio_data_buf_cache. Do a wasteful but trivially correct scan to * sort it out. */ for (i = 0; i < n; i++) { cache = zio_buf_cache[i]; if (cache == NULL) continue; for (j = i; j < n; j++) { if (cache == zio_buf_cache[j]) zio_buf_cache[j] = NULL; if (cache == zio_data_buf_cache[j]) zio_data_buf_cache[j] = NULL; } kmem_cache_destroy(cache); } for (i = 0; i < n; i++) { cache = zio_data_buf_cache[i]; if (cache == NULL) continue; for (j = i; j < n; j++) { if (cache == zio_data_buf_cache[j]) zio_data_buf_cache[j] = NULL; } kmem_cache_destroy(cache); } for (i = 0; i < n; i++) { if (zio_buf_cache[i] != NULL) panic("zio_fini: zio_buf_cache[%d] != NULL", (int)i); if (zio_data_buf_cache[i] != NULL) panic("zio_fini: zio_data_buf_cache[%d] != NULL", (int)i); } kmem_cache_destroy(zio_link_cache); kmem_cache_destroy(zio_cache); zio_inject_fini(); lz4_fini(); } /* * ========================================================================== * Allocate and free I/O buffers * ========================================================================== */ /* * Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a * crashdump if the kernel panics, so use it judiciously. Obviously, it's * useful to inspect ZFS metadata, but if possible, we should avoid keeping * excess / transient data in-core during a crashdump. */ void * zio_buf_alloc(size_t size) { size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); #if defined(ZFS_DEBUG) && !defined(_KERNEL) atomic_add_64(&zio_buf_cache_allocs[c], 1); #endif return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE)); } /* * Use zio_data_buf_alloc to allocate data. The data will not appear in a * crashdump if the kernel panics. This exists so that we will limit the amount * of ZFS data that shows up in a kernel crashdump. 
(Thus reducing the amount * of kernel heap dumped to disk when the kernel panics) */ void * zio_data_buf_alloc(size_t size) { size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE)); } void zio_buf_free(void *buf, size_t size) { size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); #if defined(ZFS_DEBUG) && !defined(_KERNEL) atomic_add_64(&zio_buf_cache_frees[c], 1); #endif kmem_cache_free(zio_buf_cache[c], buf); } void zio_data_buf_free(void *buf, size_t size) { size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); kmem_cache_free(zio_data_buf_cache[c], buf); } static void zio_abd_free(void *abd, size_t size) { abd_free((abd_t *)abd); } /* * ========================================================================== * Push and pop I/O transform buffers * ========================================================================== */ void zio_push_transform(zio_t *zio, abd_t *data, uint64_t size, uint64_t bufsize, zio_transform_func_t *transform) { zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP); zt->zt_orig_abd = zio->io_abd; zt->zt_orig_size = zio->io_size; zt->zt_bufsize = bufsize; zt->zt_transform = transform; zt->zt_next = zio->io_transform_stack; zio->io_transform_stack = zt; zio->io_abd = data; zio->io_size = size; } void zio_pop_transforms(zio_t *zio) { zio_transform_t *zt; while ((zt = zio->io_transform_stack) != NULL) { if (zt->zt_transform != NULL) zt->zt_transform(zio, zt->zt_orig_abd, zt->zt_orig_size); if (zt->zt_bufsize != 0) abd_free(zio->io_abd); zio->io_abd = zt->zt_orig_abd; zio->io_size = zt->zt_orig_size; zio->io_transform_stack = zt->zt_next; kmem_free(zt, sizeof (zio_transform_t)); } } /* * ========================================================================== * I/O transform callbacks for subblocks, decompression, and decryption * ========================================================================== */ static void zio_subblock(zio_t *zio, abd_t *data, uint64_t size) { ASSERT(zio->io_size > size); if (zio->io_type == ZIO_TYPE_READ) abd_copy(data, zio->io_abd, size); } static void zio_decompress(zio_t *zio, abd_t *data, uint64_t size) { if (zio->io_error == 0) { void *tmp = abd_borrow_buf(data, size); int ret = zio_decompress_data(BP_GET_COMPRESS(zio->io_bp), zio->io_abd, tmp, zio->io_size, size, &zio->io_prop.zp_complevel); abd_return_buf_copy(data, tmp, size); if (zio_injection_enabled && ret == 0) ret = zio_handle_fault_injection(zio, EINVAL); if (ret != 0) zio->io_error = SET_ERROR(EIO); } } static void zio_decrypt(zio_t *zio, abd_t *data, uint64_t size) { int ret; void *tmp; blkptr_t *bp = zio->io_bp; spa_t *spa = zio->io_spa; uint64_t dsobj = zio->io_bookmark.zb_objset; uint64_t lsize = BP_GET_LSIZE(bp); dmu_object_type_t ot = BP_GET_TYPE(bp); uint8_t salt[ZIO_DATA_SALT_LEN]; uint8_t iv[ZIO_DATA_IV_LEN]; uint8_t mac[ZIO_DATA_MAC_LEN]; boolean_t no_crypt = B_FALSE; ASSERT(BP_USES_CRYPT(bp)); ASSERT3U(size, !=, 0); if (zio->io_error != 0) return; /* * Verify the cksum of MACs stored in an indirect bp. It will always * be possible to verify this since it does not require an encryption * key. 
*/ if (BP_HAS_INDIRECT_MAC_CKSUM(bp)) { zio_crypt_decode_mac_bp(bp, mac); if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) { /* * We haven't decompressed the data yet, but * zio_crypt_do_indirect_mac_checksum() requires * decompressed data to be able to parse out the MACs * from the indirect block. We decompress it now and * throw away the result after we are finished. */ tmp = zio_buf_alloc(lsize); ret = zio_decompress_data(BP_GET_COMPRESS(bp), zio->io_abd, tmp, zio->io_size, lsize, &zio->io_prop.zp_complevel); if (ret != 0) { ret = SET_ERROR(EIO); goto error; } ret = zio_crypt_do_indirect_mac_checksum(B_FALSE, tmp, lsize, BP_SHOULD_BYTESWAP(bp), mac); zio_buf_free(tmp, lsize); } else { ret = zio_crypt_do_indirect_mac_checksum_abd(B_FALSE, zio->io_abd, size, BP_SHOULD_BYTESWAP(bp), mac); } abd_copy(data, zio->io_abd, size); if (zio_injection_enabled && ot != DMU_OT_DNODE && ret == 0) { ret = zio_handle_decrypt_injection(spa, &zio->io_bookmark, ot, ECKSUM); } if (ret != 0) goto error; return; } /* * If this is an authenticated block, just check the MAC. It would be * nice to separate this out into its own flag, but for the moment * enum zio_flag is out of bits. */ if (BP_IS_AUTHENTICATED(bp)) { if (ot == DMU_OT_OBJSET) { ret = spa_do_crypt_objset_mac_abd(B_FALSE, spa, dsobj, zio->io_abd, size, BP_SHOULD_BYTESWAP(bp)); } else { zio_crypt_decode_mac_bp(bp, mac); ret = spa_do_crypt_mac_abd(B_FALSE, spa, dsobj, zio->io_abd, size, mac); if (zio_injection_enabled && ret == 0) { ret = zio_handle_decrypt_injection(spa, &zio->io_bookmark, ot, ECKSUM); } } abd_copy(data, zio->io_abd, size); if (ret != 0) goto error; return; } zio_crypt_decode_params_bp(bp, salt, iv); if (ot == DMU_OT_INTENT_LOG) { tmp = abd_borrow_buf_copy(zio->io_abd, sizeof (zil_chain_t)); zio_crypt_decode_mac_zil(tmp, mac); abd_return_buf(zio->io_abd, tmp, sizeof (zil_chain_t)); } else { zio_crypt_decode_mac_bp(bp, mac); } ret = spa_do_crypt_abd(B_FALSE, spa, &zio->io_bookmark, BP_GET_TYPE(bp), BP_GET_DEDUP(bp), BP_SHOULD_BYTESWAP(bp), salt, iv, mac, size, data, zio->io_abd, &no_crypt); if (no_crypt) abd_copy(data, zio->io_abd, size); if (ret != 0) goto error; return; error: /* assert that the key was found unless this was speculative */ ASSERT(ret != EACCES || (zio->io_flags & ZIO_FLAG_SPECULATIVE)); /* * If there was a decryption / authentication error return EIO as * the io_error. If this was not a speculative zio, create an ereport. */ if (ret == ECKSUM) { zio->io_error = SET_ERROR(EIO); if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) { spa_log_error(spa, &zio->io_bookmark); (void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION, spa, NULL, &zio->io_bookmark, zio, 0); } } else { zio->io_error = ret; } } /* * ========================================================================== * I/O parent/child relationships and pipeline interlocks * ========================================================================== */ zio_t * zio_walk_parents(zio_t *cio, zio_link_t **zl) { list_t *pl = &cio->io_parent_list; *zl = (*zl == NULL) ? list_head(pl) : list_next(pl, *zl); if (*zl == NULL) return (NULL); ASSERT((*zl)->zl_child == cio); return ((*zl)->zl_parent); } zio_t * zio_walk_children(zio_t *pio, zio_link_t **zl) { list_t *cl = &pio->io_child_list; ASSERT(MUTEX_HELD(&pio->io_lock)); *zl = (*zl == NULL) ? 
list_head(cl) : list_next(cl, *zl); if (*zl == NULL) return (NULL); ASSERT((*zl)->zl_parent == pio); return ((*zl)->zl_child); } zio_t * zio_unique_parent(zio_t *cio) { zio_link_t *zl = NULL; zio_t *pio = zio_walk_parents(cio, &zl); VERIFY3P(zio_walk_parents(cio, &zl), ==, NULL); return (pio); } void zio_add_child(zio_t *pio, zio_t *cio) { zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP); /* * Logical I/Os can have logical, gang, or vdev children. * Gang I/Os can have gang or vdev children. * Vdev I/Os can only have vdev children. * The following ASSERT captures all of these constraints. */ ASSERT3S(cio->io_child_type, <=, pio->io_child_type); zl->zl_parent = pio; zl->zl_child = cio; mutex_enter(&pio->io_lock); mutex_enter(&cio->io_lock); ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0); for (int w = 0; w < ZIO_WAIT_TYPES; w++) pio->io_children[cio->io_child_type][w] += !cio->io_state[w]; list_insert_head(&pio->io_child_list, zl); list_insert_head(&cio->io_parent_list, zl); pio->io_child_count++; cio->io_parent_count++; mutex_exit(&cio->io_lock); mutex_exit(&pio->io_lock); } static void zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl) { ASSERT(zl->zl_parent == pio); ASSERT(zl->zl_child == cio); mutex_enter(&pio->io_lock); mutex_enter(&cio->io_lock); list_remove(&pio->io_child_list, zl); list_remove(&cio->io_parent_list, zl); pio->io_child_count--; cio->io_parent_count--; mutex_exit(&cio->io_lock); mutex_exit(&pio->io_lock); kmem_cache_free(zio_link_cache, zl); } static boolean_t zio_wait_for_children(zio_t *zio, uint8_t childbits, enum zio_wait_type wait) { boolean_t waiting = B_FALSE; mutex_enter(&zio->io_lock); ASSERT(zio->io_stall == NULL); for (int c = 0; c < ZIO_CHILD_TYPES; c++) { if (!(ZIO_CHILD_BIT_IS_SET(childbits, c))) continue; uint64_t *countp = &zio->io_children[c][wait]; if (*countp != 0) { zio->io_stage >>= 1; ASSERT3U(zio->io_stage, !=, ZIO_STAGE_OPEN); zio->io_stall = countp; waiting = B_TRUE; break; } } mutex_exit(&zio->io_lock); return (waiting); } __attribute__((always_inline)) static inline void zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait, zio_t **next_to_executep) { uint64_t *countp = &pio->io_children[zio->io_child_type][wait]; int *errorp = &pio->io_child_error[zio->io_child_type]; mutex_enter(&pio->io_lock); if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) *errorp = zio_worst_error(*errorp, zio->io_error); pio->io_reexecute |= zio->io_reexecute; ASSERT3U(*countp, >, 0); (*countp)--; if (*countp == 0 && pio->io_stall == countp) { zio_taskq_type_t type = pio->io_stage < ZIO_STAGE_VDEV_IO_START ? ZIO_TASKQ_ISSUE : ZIO_TASKQ_INTERRUPT; pio->io_stall = NULL; mutex_exit(&pio->io_lock); /* * If we can tell the caller to execute this parent next, do * so. Otherwise dispatch the parent zio as its own task. * * Having the caller execute the parent when possible reduces * locking on the zio taskq's, reduces context switch * overhead, and has no recursion penalty. Note that one * read from disk typically causes at least 3 zio's: a * zio_null(), the logical zio_read(), and then a physical * zio. When the physical ZIO completes, we are able to call * zio_done() on all 3 of these zio's from one invocation of * zio_execute() by returning the parent back to * zio_execute(). Since the parent isn't executed until this * thread returns back to zio_execute(), the caller should do * so promptly. 
* * In other cases, dispatching the parent prevents * overflowing the stack when we have deeply nested * parent-child relationships, as we do with the "mega zio" * of writes for spa_sync(), and the chain of ZIL blocks. */ if (next_to_executep != NULL && *next_to_executep == NULL) { *next_to_executep = pio; } else { zio_taskq_dispatch(pio, type, B_FALSE); } } else { mutex_exit(&pio->io_lock); } } static void zio_inherit_child_errors(zio_t *zio, enum zio_child c) { if (zio->io_child_error[c] != 0 && zio->io_error == 0) zio->io_error = zio->io_child_error[c]; } int zio_bookmark_compare(const void *x1, const void *x2) { const zio_t *z1 = x1; const zio_t *z2 = x2; if (z1->io_bookmark.zb_objset < z2->io_bookmark.zb_objset) return (-1); if (z1->io_bookmark.zb_objset > z2->io_bookmark.zb_objset) return (1); if (z1->io_bookmark.zb_object < z2->io_bookmark.zb_object) return (-1); if (z1->io_bookmark.zb_object > z2->io_bookmark.zb_object) return (1); if (z1->io_bookmark.zb_level < z2->io_bookmark.zb_level) return (-1); if (z1->io_bookmark.zb_level > z2->io_bookmark.zb_level) return (1); if (z1->io_bookmark.zb_blkid < z2->io_bookmark.zb_blkid) return (-1); if (z1->io_bookmark.zb_blkid > z2->io_bookmark.zb_blkid) return (1); if (z1 < z2) return (-1); if (z1 > z2) return (1); return (0); } /* * ========================================================================== * Create the various types of I/O (read, write, free, etc) * ========================================================================== */ static zio_t * zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, abd_t *data, uint64_t lsize, uint64_t psize, zio_done_func_t *done, void *private, zio_type_t type, zio_priority_t priority, enum zio_flag flags, vdev_t *vd, uint64_t offset, const zbookmark_phys_t *zb, enum zio_stage stage, enum zio_stage pipeline) { zio_t *zio; IMPLY(type != ZIO_TYPE_TRIM, psize <= SPA_MAXBLOCKSIZE); ASSERT(P2PHASE(psize, SPA_MINBLOCKSIZE) == 0); ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0); ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER)); ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER)); ASSERT(vd || stage == ZIO_STAGE_OPEN); IMPLY(lsize != psize, (flags & ZIO_FLAG_RAW_COMPRESS) != 0); zio = kmem_cache_alloc(zio_cache, KM_SLEEP); bzero(zio, sizeof (zio_t)); mutex_init(&zio->io_lock, NULL, MUTEX_NOLOCKDEP, NULL); cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL); list_create(&zio->io_parent_list, sizeof (zio_link_t), offsetof(zio_link_t, zl_parent_node)); list_create(&zio->io_child_list, sizeof (zio_link_t), offsetof(zio_link_t, zl_child_node)); metaslab_trace_init(&zio->io_alloc_list); if (vd != NULL) zio->io_child_type = ZIO_CHILD_VDEV; else if (flags & ZIO_FLAG_GANG_CHILD) zio->io_child_type = ZIO_CHILD_GANG; else if (flags & ZIO_FLAG_DDT_CHILD) zio->io_child_type = ZIO_CHILD_DDT; else zio->io_child_type = ZIO_CHILD_LOGICAL; if (bp != NULL) { zio->io_bp = (blkptr_t *)bp; zio->io_bp_copy = *bp; zio->io_bp_orig = *bp; if (type != ZIO_TYPE_WRITE || zio->io_child_type == ZIO_CHILD_DDT) zio->io_bp = &zio->io_bp_copy; /* so caller can free */ if (zio->io_child_type == ZIO_CHILD_LOGICAL) zio->io_logical = zio; if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp)) pipeline |= ZIO_GANG_STAGES; } zio->io_spa = spa; zio->io_txg = txg; zio->io_done = done; zio->io_private = private; zio->io_type = type; zio->io_priority = priority; zio->io_vd = vd; zio->io_offset = offset; zio->io_orig_abd = zio->io_abd = data; zio->io_orig_size = zio->io_size = psize; zio->io_lsize = lsize; zio->io_orig_flags = 
zio->io_flags = flags; zio->io_orig_stage = zio->io_stage = stage; zio->io_orig_pipeline = zio->io_pipeline = pipeline; zio->io_pipeline_trace = ZIO_STAGE_OPEN; zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY); zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE); if (zb != NULL) zio->io_bookmark = *zb; if (pio != NULL) { if (zio->io_metaslab_class == NULL) zio->io_metaslab_class = pio->io_metaslab_class; if (zio->io_logical == NULL) zio->io_logical = pio->io_logical; if (zio->io_child_type == ZIO_CHILD_GANG) zio->io_gang_leader = pio->io_gang_leader; zio_add_child(pio, zio); } taskq_init_ent(&zio->io_tqent); return (zio); } static void zio_destroy(zio_t *zio) { metaslab_trace_fini(&zio->io_alloc_list); list_destroy(&zio->io_parent_list); list_destroy(&zio->io_child_list); mutex_destroy(&zio->io_lock); cv_destroy(&zio->io_cv); kmem_cache_free(zio_cache, zio); } zio_t * zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done, void *private, enum zio_flag flags) { zio_t *zio; zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private, ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL, ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE); return (zio); } zio_t * zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags) { return (zio_null(NULL, spa, NULL, done, private, flags)); } static int zfs_blkptr_verify_log(spa_t *spa, const blkptr_t *bp, enum blk_verify_flag blk_verify, const char *fmt, ...) { va_list adx; char buf[256]; va_start(adx, fmt); (void) vsnprintf(buf, sizeof (buf), fmt, adx); va_end(adx); switch (blk_verify) { case BLK_VERIFY_HALT: dprintf_bp(bp, "blkptr at %p dprintf_bp():", bp); zfs_panic_recover("%s: %s", spa_name(spa), buf); break; case BLK_VERIFY_LOG: zfs_dbgmsg("%s: %s", spa_name(spa), buf); break; case BLK_VERIFY_ONLY: break; } return (1); } /* * Verify the block pointer fields contain reasonable values. This means * it only contains known object types, checksum/compression identifiers, * block sizes within the maximum allowed limits, valid DVAs, etc. * * If everything checks out, B_TRUE is returned. The blk_verify * argument controls the behavior when an invalid field is detected.
* * Modes for zfs_blkptr_verify: * 1) BLK_VERIFY_ONLY (evaluate the block) * 2) BLK_VERIFY_LOG (evaluate the block and log problems) * 3) BLK_VERIFY_HALT (call zfs_panic_recover on error) */ boolean_t zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, boolean_t config_held, enum blk_verify_flag blk_verify) { int errors = 0; if (!DMU_OT_IS_VALID(BP_GET_TYPE(bp))) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %p has invalid TYPE %llu", bp, (longlong_t)BP_GET_TYPE(bp)); } if (BP_GET_CHECKSUM(bp) >= ZIO_CHECKSUM_FUNCTIONS || BP_GET_CHECKSUM(bp) <= ZIO_CHECKSUM_ON) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %p has invalid CHECKSUM %llu", bp, (longlong_t)BP_GET_CHECKSUM(bp)); } if (BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_FUNCTIONS || BP_GET_COMPRESS(bp) <= ZIO_COMPRESS_ON) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %p has invalid COMPRESS %llu", bp, (longlong_t)BP_GET_COMPRESS(bp)); } if (BP_GET_LSIZE(bp) > SPA_MAXBLOCKSIZE) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %p has invalid LSIZE %llu", bp, (longlong_t)BP_GET_LSIZE(bp)); } if (BP_GET_PSIZE(bp) > SPA_MAXBLOCKSIZE) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %p has invalid PSIZE %llu", bp, (longlong_t)BP_GET_PSIZE(bp)); } if (BP_IS_EMBEDDED(bp)) { if (BPE_GET_ETYPE(bp) >= NUM_BP_EMBEDDED_TYPES) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %p has invalid ETYPE %llu", bp, (longlong_t)BPE_GET_ETYPE(bp)); } } /* * Do not verify individual DVAs if the config is not trusted. This * will be done once the zio is executed in vdev_mirror_map_alloc. */ if (!spa->spa_trust_config) return (B_TRUE); if (!config_held) spa_config_enter(spa, SCL_VDEV, bp, RW_READER); else ASSERT(spa_config_held(spa, SCL_VDEV, RW_WRITER)); /* * Pool-specific checks. * * Note: it would be nice to verify that the blk_birth and * BP_PHYSICAL_BIRTH() are not too large. However, spa_freeze() * allows the birth time of log blocks (and dmu_sync()-ed blocks * that are in the log) to be arbitrarily large. */ for (int i = 0; i < BP_GET_NDVAS(bp); i++) { uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[i]); if (vdevid >= spa->spa_root_vdev->vdev_children) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %p DVA %u has invalid VDEV %llu", bp, i, (longlong_t)vdevid); continue; } vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid]; if (vd == NULL) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %p DVA %u has invalid VDEV %llu", bp, i, (longlong_t)vdevid); continue; } if (vd->vdev_ops == &vdev_hole_ops) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %p DVA %u has hole VDEV %llu", bp, i, (longlong_t)vdevid); continue; } if (vd->vdev_ops == &vdev_missing_ops) { /* * "missing" vdevs are valid during import, but we * don't have their detailed info (e.g. asize), so * we can't perform any more checks on them. 
*/ continue; } uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]); uint64_t asize = DVA_GET_ASIZE(&bp->blk_dva[i]); if (BP_IS_GANG(bp)) asize = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); if (offset + asize > vd->vdev_asize) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %p DVA %u has invalid OFFSET %llu", bp, i, (longlong_t)offset); } } if (errors > 0) dprintf_bp(bp, "blkptr at %p dprintf_bp():", bp); if (!config_held) spa_config_exit(spa, SCL_VDEV, bp); return (errors == 0); } boolean_t zfs_dva_valid(spa_t *spa, const dva_t *dva, const blkptr_t *bp) { uint64_t vdevid = DVA_GET_VDEV(dva); if (vdevid >= spa->spa_root_vdev->vdev_children) return (B_FALSE); vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid]; if (vd == NULL) return (B_FALSE); if (vd->vdev_ops == &vdev_hole_ops) return (B_FALSE); if (vd->vdev_ops == &vdev_missing_ops) { return (B_FALSE); } uint64_t offset = DVA_GET_OFFSET(dva); uint64_t asize = DVA_GET_ASIZE(dva); if (BP_IS_GANG(bp)) asize = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); if (offset + asize > vd->vdev_asize) return (B_FALSE); return (B_TRUE); } zio_t * zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, abd_t *data, uint64_t size, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb) { zio_t *zio; (void) zfs_blkptr_verify(spa, bp, flags & ZIO_FLAG_CONFIG_WRITER, BLK_VERIFY_HALT); zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp, data, size, size, done, private, ZIO_TYPE_READ, priority, flags, NULL, 0, zb, ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE); return (zio); } zio_t * zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data, uint64_t lsize, uint64_t psize, const zio_prop_t *zp, zio_done_func_t *ready, zio_done_func_t *children_ready, zio_done_func_t *physdone, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb) { zio_t *zio; ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF && zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS && zp->zp_compress >= ZIO_COMPRESS_OFF && zp->zp_compress < ZIO_COMPRESS_FUNCTIONS && DMU_OT_IS_VALID(zp->zp_type) && zp->zp_level < 32 && zp->zp_copies > 0 && zp->zp_copies <= spa_max_replication(spa)); zio = zio_create(pio, spa, txg, bp, data, lsize, psize, done, private, ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb, ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE); zio->io_ready = ready; zio->io_children_ready = children_ready; zio->io_physdone = physdone; zio->io_prop = *zp; /* * Data can be NULL if we are going to call zio_write_override() to * provide the already-allocated BP. But we may need the data to * verify a dedup hit (if requested). In this case, don't try to * dedup (just take the already-allocated BP verbatim). Encrypted * dedup blocks need data as well so we also disable dedup in this * case. 
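 * Concretely, the check below clears both zp_dedup and zp_dedup_verify
 * whenever data is NULL and either dedup verification or encryption was
 * requested, so the override BP is taken verbatim.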
*/ if (data == NULL && (zio->io_prop.zp_dedup_verify || zio->io_prop.zp_encrypt)) { zio->io_prop.zp_dedup = zio->io_prop.zp_dedup_verify = B_FALSE; } return (zio); } zio_t * zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data, uint64_t size, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb) { zio_t *zio; zio = zio_create(pio, spa, txg, bp, data, size, size, done, private, ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_IO_REWRITE, NULL, 0, zb, ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE); return (zio); } void zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite) { ASSERT(zio->io_type == ZIO_TYPE_WRITE); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(zio->io_stage == ZIO_STAGE_OPEN); ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa)); /* * We must reset the io_prop to match the values that existed * when the bp was first written by dmu_sync() keeping in mind * that nopwrite and dedup are mutually exclusive. */ zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup; zio->io_prop.zp_nopwrite = nopwrite; zio->io_prop.zp_copies = copies; zio->io_bp_override = bp; } void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp) { (void) zfs_blkptr_verify(spa, bp, B_FALSE, BLK_VERIFY_HALT); /* * The check for EMBEDDED is a performance optimization. We * process the free here (by ignoring it) rather than * putting it on the list and then processing it in zio_free_sync(). */ if (BP_IS_EMBEDDED(bp)) return; metaslab_check_free(spa, bp); /* * Frees that are for the currently-syncing txg, are not going to be * deferred, and which will not need to do a read (i.e. not GANG or * DEDUP), can be processed immediately. Otherwise, put them on the * in-memory list for later processing. * * Note that we only defer frees after zfs_sync_pass_deferred_free * when the log space map feature is disabled. [see relevant comment * in spa_sync_iterate_to_convergence()] */ if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp) || txg != spa->spa_syncing_txg || (spa_sync_pass(spa) >= zfs_sync_pass_deferred_free && !spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))) { bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp); } else { VERIFY3P(zio_free_sync(NULL, spa, txg, bp, 0), ==, NULL); } } /* * To improve performance, this function may return NULL if we were able * to do the free immediately. This avoids the cost of creating a zio * (and linking it to the parent, etc). */ zio_t * zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, enum zio_flag flags) { ASSERT(!BP_IS_HOLE(bp)); ASSERT(spa_syncing_txg(spa) == txg); if (BP_IS_EMBEDDED(bp)) return (NULL); metaslab_check_free(spa, bp); arc_freed(spa, bp); dsl_scan_freed(spa, bp); if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp)) { /* * GANG and DEDUP blocks can induce a read (for the gang block * header, or the DDT), so issue them asynchronously so that * this thread is not tied up. 
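 * This is done by OR-ing ZIO_STAGE_ISSUE_ASYNC into the free pipeline
 * below, which makes the free run from a taskq instead of the caller's
 * thread.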
*/ enum zio_stage stage = ZIO_FREE_PIPELINE | ZIO_STAGE_ISSUE_ASYNC; return (zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), BP_GET_PSIZE(bp), NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW, flags, NULL, 0, NULL, ZIO_STAGE_OPEN, stage)); } else { metaslab_free(spa, bp, txg, B_FALSE); return (NULL); } } zio_t * zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio_done_func_t *done, void *private, enum zio_flag flags) { zio_t *zio; (void) zfs_blkptr_verify(spa, bp, flags & ZIO_FLAG_CONFIG_WRITER, BLK_VERIFY_HALT); if (BP_IS_EMBEDDED(bp)) return (zio_null(pio, spa, NULL, NULL, NULL, 0)); /* * A claim is an allocation of a specific block. Claims are needed * to support immediate writes in the intent log. The issue is that * immediate writes contain committed data, but in a txg that was * *not* committed. Upon opening the pool after an unclean shutdown, * the intent log claims all blocks that contain immediate write data * so that the SPA knows they're in use. * * All claims *must* be resolved in the first txg -- before the SPA * starts allocating blocks -- so that nothing is allocated twice. * If txg == 0 we just verify that the block is claimable. */ ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_min_claim_txg(spa)); ASSERT(txg == spa_min_claim_txg(spa) || txg == 0); ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa)); /* zdb(8) */ zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), BP_GET_PSIZE(bp), done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags, NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE); ASSERT0(zio->io_queued_timestamp); return (zio); } zio_t * zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, zio_done_func_t *done, void *private, enum zio_flag flags) { zio_t *zio; int c; if (vd->vdev_children == 0) { zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private, ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL, ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); zio->io_cmd = cmd; } else { zio = zio_null(pio, spa, NULL, NULL, NULL, flags); for (c = 0; c < vd->vdev_children; c++) zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd, done, private, flags)); } return (zio); } zio_t * zio_trim(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, enum trim_flag trim_flags) { zio_t *zio; ASSERT0(vd->vdev_children); ASSERT0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); ASSERT0(P2PHASE(size, 1ULL << vd->vdev_ashift)); ASSERT3U(size, !=, 0); zio = zio_create(pio, vd->vdev_spa, 0, NULL, NULL, size, size, done, private, ZIO_TYPE_TRIM, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_TRIM_PIPELINE); zio->io_trim_flags = trim_flags; return (zio); } zio_t * zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, abd_t *data, int checksum, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, boolean_t labels) { zio_t *zio; ASSERT(vd->vdev_children == 0); ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); ASSERT3U(offset + size, <=, vd->vdev_psize); zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, size, done, private, ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE); zio->io_prop.zp_checksum = checksum; return (zio); } zio_t * zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, abd_t *data, int checksum, zio_done_func_t *done, void *private, zio_priority_t 
priority, enum zio_flag flags, boolean_t labels) { zio_t *zio; ASSERT(vd->vdev_children == 0); ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); ASSERT3U(offset + size, <=, vd->vdev_psize); zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, size, done, private, ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE); zio->io_prop.zp_checksum = checksum; if (zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_EMBEDDED) { /* * zec checksums are necessarily destructive -- they modify * the end of the write buffer to hold the verifier/checksum. * Therefore, we must make a local copy in case the data is * being written to multiple places in parallel. */ abd_t *wbuf = abd_alloc_sametype(data, size); abd_copy(wbuf, data, size); zio_push_transform(zio, wbuf, size, size, NULL); } return (zio); } /* * Create a child I/O to do some work for us. */ zio_t * zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size, int type, zio_priority_t priority, enum zio_flag flags, zio_done_func_t *done, void *private) { enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE; zio_t *zio; /* * vdev child I/Os do not propagate their error to the parent. * Therefore, for correct operation the caller *must* check for * and handle the error in the child i/o's done callback. * The only exceptions are i/os that we don't care about * (OPTIONAL or REPAIR). */ ASSERT((flags & ZIO_FLAG_OPTIONAL) || (flags & ZIO_FLAG_IO_REPAIR) || done != NULL); if (type == ZIO_TYPE_READ && bp != NULL) { /* * If we have the bp, then the child should perform the * checksum and the parent need not. This pushes error * detection as close to the leaves as possible and * eliminates redundant checksums in the interior nodes. */ pipeline |= ZIO_STAGE_CHECKSUM_VERIFY; pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; } if (vd->vdev_ops->vdev_op_leaf) { ASSERT0(vd->vdev_children); offset += VDEV_LABEL_START_SIZE; } flags |= ZIO_VDEV_CHILD_FLAGS(pio); /* * If we've decided to do a repair, the write is not speculative -- * even if the original read was. */ if (flags & ZIO_FLAG_IO_REPAIR) flags &= ~ZIO_FLAG_SPECULATIVE; /* * If we're creating a child I/O that is not associated with a * top-level vdev, then the child zio is not an allocating I/O. * If this is a retried I/O then we ignore it since we will * have already processed the original allocating I/O. 
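 * In both cases the code below simply clears ZIO_FLAG_IO_ALLOCATING on
 * the child; the parent's reservation against the allocation throttle
 * stays in place and is released through its normal ready/done path.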
*/ if (flags & ZIO_FLAG_IO_ALLOCATING && (vd != vd->vdev_top || (flags & ZIO_FLAG_IO_RETRY))) { ASSERT(pio->io_metaslab_class != NULL); ASSERT(pio->io_metaslab_class->mc_alloc_throttle_enabled); ASSERT(type == ZIO_TYPE_WRITE); ASSERT(priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(!(flags & ZIO_FLAG_IO_REPAIR)); ASSERT(!(pio->io_flags & ZIO_FLAG_IO_REWRITE) || pio->io_child_type == ZIO_CHILD_GANG); flags &= ~ZIO_FLAG_IO_ALLOCATING; } zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, size, done, private, type, priority, flags, vd, offset, &pio->io_bookmark, ZIO_STAGE_VDEV_IO_START >> 1, pipeline); ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV); zio->io_physdone = pio->io_physdone; if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL) zio->io_logical->io_phys_children++; return (zio); } zio_t * zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size, zio_type_t type, zio_priority_t priority, enum zio_flag flags, zio_done_func_t *done, void *private) { zio_t *zio; ASSERT(vd->vdev_ops->vdev_op_leaf); zio = zio_create(NULL, vd->vdev_spa, 0, NULL, data, size, size, done, private, type, priority, flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED, vd, offset, NULL, ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE); return (zio); } void zio_flush(zio_t *zio, vdev_t *vd) { zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, NULL, NULL, ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY)); } void zio_shrink(zio_t *zio, uint64_t size) { ASSERT3P(zio->io_executor, ==, NULL); ASSERT3U(zio->io_orig_size, ==, zio->io_size); ASSERT3U(size, <=, zio->io_size); /* * We don't shrink for raidz because of problems with the * reconstruction when reading back less than the block size. * Note, BP_IS_RAIDZ() assumes no compression. */ ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF); if (!BP_IS_RAIDZ(zio->io_bp)) { /* we are not doing a raw write */ ASSERT3U(zio->io_size, ==, zio->io_lsize); zio->io_orig_size = zio->io_size = zio->io_lsize = size; } } /* * ========================================================================== * Prepare to read and write logical blocks * ========================================================================== */ static zio_t * zio_read_bp_init(zio_t *zio) { blkptr_t *bp = zio->io_bp; uint64_t psize = BP_IS_EMBEDDED(bp) ? 
BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp); ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy); if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF && zio->io_child_type == ZIO_CHILD_LOGICAL && !(zio->io_flags & ZIO_FLAG_RAW_COMPRESS)) { zio_push_transform(zio, abd_alloc_sametype(zio->io_abd, psize), psize, psize, zio_decompress); } if (((BP_IS_PROTECTED(bp) && !(zio->io_flags & ZIO_FLAG_RAW_ENCRYPT)) || BP_HAS_INDIRECT_MAC_CKSUM(bp)) && zio->io_child_type == ZIO_CHILD_LOGICAL) { zio_push_transform(zio, abd_alloc_sametype(zio->io_abd, psize), psize, psize, zio_decrypt); } if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) { int psize = BPE_GET_PSIZE(bp); void *data = abd_borrow_buf(zio->io_abd, psize); zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; decode_embedded_bp_compressed(bp, data); abd_return_buf_copy(zio->io_abd, data, psize); } else { ASSERT(!BP_IS_EMBEDDED(bp)); ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy); } if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0) zio->io_flags |= ZIO_FLAG_DONT_CACHE; if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP) zio->io_flags |= ZIO_FLAG_DONT_CACHE; if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL) zio->io_pipeline = ZIO_DDT_READ_PIPELINE; return (zio); } static zio_t * zio_write_bp_init(zio_t *zio) { if (!IO_IS_ALLOCATING(zio)) return (zio); ASSERT(zio->io_child_type != ZIO_CHILD_DDT); if (zio->io_bp_override) { blkptr_t *bp = zio->io_bp; zio_prop_t *zp = &zio->io_prop; ASSERT(bp->blk_birth != zio->io_txg); ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0); *bp = *zio->io_bp_override; zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; if (BP_IS_EMBEDDED(bp)) return (zio); /* * If we've been overridden and nopwrite is set then * set the flag accordingly to indicate that a nopwrite * has already occurred. */ if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) { ASSERT(!zp->zp_dedup); ASSERT3U(BP_GET_CHECKSUM(bp), ==, zp->zp_checksum); zio->io_flags |= ZIO_FLAG_NOPWRITE; return (zio); } ASSERT(!zp->zp_nopwrite); if (BP_IS_HOLE(bp) || !zp->zp_dedup) return (zio); ASSERT((zio_checksum_table[zp->zp_checksum].ci_flags & ZCHECKSUM_FLAG_DEDUP) || zp->zp_dedup_verify); if (BP_GET_CHECKSUM(bp) == zp->zp_checksum && !zp->zp_encrypt) { BP_SET_DEDUP(bp, 1); zio->io_pipeline |= ZIO_STAGE_DDT_WRITE; return (zio); } /* * We were unable to handle this as an override bp, treat * it as a regular write I/O. */ zio->io_bp_override = NULL; *bp = zio->io_bp_orig; zio->io_pipeline = zio->io_orig_pipeline; } return (zio); } static zio_t * zio_write_compress(zio_t *zio) { spa_t *spa = zio->io_spa; zio_prop_t *zp = &zio->io_prop; enum zio_compress compress = zp->zp_compress; blkptr_t *bp = zio->io_bp; uint64_t lsize = zio->io_lsize; uint64_t psize = zio->io_size; int pass = 1; /* * If our children haven't all reached the ready stage, * wait for them and then repeat this pipeline stage. */ if (zio_wait_for_children(zio, ZIO_CHILD_LOGICAL_BIT | ZIO_CHILD_GANG_BIT, ZIO_WAIT_READY)) { return (NULL); } if (!IO_IS_ALLOCATING(zio)) return (zio); if (zio->io_children_ready != NULL) { /* * Now that all our children are ready, run the callback * associated with this zio in case it wants to modify the * data to be written. */ ASSERT3U(zp->zp_level, >, 0); zio->io_children_ready(zio); } ASSERT(zio->io_child_type != ZIO_CHILD_DDT); ASSERT(zio->io_bp_override == NULL); if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) { /* * We're rewriting an existing block, which means we're * working on behalf of spa_sync(). 
For spa_sync() to * converge, it must eventually be the case that we don't * have to allocate new blocks. But compression changes * the blocksize, which forces a reallocate, and makes * convergence take longer. Therefore, after the first * few passes, stop compressing to ensure convergence. */ pass = spa_sync_pass(spa); ASSERT(zio->io_txg == spa_syncing_txg(spa)); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(!BP_GET_DEDUP(bp)); if (pass >= zfs_sync_pass_dont_compress) compress = ZIO_COMPRESS_OFF; /* Make sure someone doesn't change their mind on overwrites */ ASSERT(BP_IS_EMBEDDED(bp) || MIN(zp->zp_copies + BP_IS_GANG(bp), spa_max_replication(spa)) == BP_GET_NDVAS(bp)); } /* If it's a compressed write that is not raw, compress the buffer. */ if (compress != ZIO_COMPRESS_OFF && !(zio->io_flags & ZIO_FLAG_RAW_COMPRESS)) { void *cbuf = zio_buf_alloc(lsize); psize = zio_compress_data(compress, zio->io_abd, cbuf, lsize, zp->zp_complevel); if (psize == 0 || psize >= lsize) { compress = ZIO_COMPRESS_OFF; zio_buf_free(cbuf, lsize); } else if (!zp->zp_dedup && !zp->zp_encrypt && psize <= BPE_PAYLOAD_SIZE && zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) && spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) { encode_embedded_bp_compressed(bp, cbuf, compress, lsize, psize); BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA); BP_SET_TYPE(bp, zio->io_prop.zp_type); BP_SET_LEVEL(bp, zio->io_prop.zp_level); zio_buf_free(cbuf, lsize); bp->blk_birth = zio->io_txg; zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; ASSERT(spa_feature_is_active(spa, SPA_FEATURE_EMBEDDED_DATA)); return (zio); } else { /* * Round compressed size up to the minimum allocation * size of the smallest-ashift device, and zero the * tail. This ensures that the compressed size of the * BP (and thus compressratio property) are correct, * in that we charge for the padding used to fill out * the last sector. */ ASSERT3U(spa->spa_min_alloc, >=, SPA_MINBLOCKSHIFT); size_t rounded = (size_t)roundup(psize, spa->spa_min_alloc); if (rounded >= lsize) { compress = ZIO_COMPRESS_OFF; zio_buf_free(cbuf, lsize); psize = lsize; } else { abd_t *cdata = abd_get_from_buf(cbuf, lsize); abd_take_ownership_of_buf(cdata, B_TRUE); abd_zero_off(cdata, psize, rounded - psize); psize = rounded; zio_push_transform(zio, cdata, psize, lsize, NULL); } } /* * We were unable to handle this as an override bp, treat * it as a regular write I/O. */ zio->io_bp_override = NULL; *bp = zio->io_bp_orig; zio->io_pipeline = zio->io_orig_pipeline; } else if ((zio->io_flags & ZIO_FLAG_RAW_ENCRYPT) != 0 && zp->zp_type == DMU_OT_DNODE) { /* * The DMU actually relies on the zio layer's compression * to free metadnode blocks that have had all contained * dnodes freed. As a result, even when doing a raw * receive, we must check whether the block can be compressed * to a hole. */ psize = zio_compress_data(ZIO_COMPRESS_EMPTY, zio->io_abd, NULL, lsize, zp->zp_complevel); if (psize == 0 || psize >= lsize) compress = ZIO_COMPRESS_OFF; } else { ASSERT3U(psize, !=, 0); } /* * The final pass of spa_sync() must be all rewrites, but the first * few passes offer a trade-off: allocating blocks defers convergence, * but newly allocated blocks are sequential, so they can be written * to disk faster. Therefore, we allow the first few passes of * spa_sync() to allocate new blocks, but force rewrites after that. * There should only be a handful of blocks after pass 1 in any case. 
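 * The cutover points are tunable: zfs_sync_pass_dont_compress governs
 * when compression is abandoned (checked above) and zfs_sync_pass_rewrite
 * governs when we switch to the rewrite pipeline (checked below).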
*/ if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == psize && pass >= zfs_sync_pass_rewrite) { VERIFY3U(psize, !=, 0); enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES; zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages; zio->io_flags |= ZIO_FLAG_IO_REWRITE; } else { BP_ZERO(bp); zio->io_pipeline = ZIO_WRITE_PIPELINE; } if (psize == 0) { if (zio->io_bp_orig.blk_birth != 0 && spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) { BP_SET_LSIZE(bp, lsize); BP_SET_TYPE(bp, zp->zp_type); BP_SET_LEVEL(bp, zp->zp_level); BP_SET_BIRTH(bp, zio->io_txg, 0); } zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; } else { ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER); BP_SET_LSIZE(bp, lsize); BP_SET_TYPE(bp, zp->zp_type); BP_SET_LEVEL(bp, zp->zp_level); BP_SET_PSIZE(bp, psize); BP_SET_COMPRESS(bp, compress); BP_SET_CHECKSUM(bp, zp->zp_checksum); BP_SET_DEDUP(bp, zp->zp_dedup); BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); if (zp->zp_dedup) { ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); ASSERT(!zp->zp_encrypt || DMU_OT_IS_ENCRYPTED(zp->zp_type)); zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE; } if (zp->zp_nopwrite) { ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); zio->io_pipeline |= ZIO_STAGE_NOP_WRITE; } } return (zio); } static zio_t * zio_free_bp_init(zio_t *zio) { blkptr_t *bp = zio->io_bp; if (zio->io_child_type == ZIO_CHILD_LOGICAL) { if (BP_GET_DEDUP(bp)) zio->io_pipeline = ZIO_DDT_FREE_PIPELINE; } ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy); return (zio); } /* * ========================================================================== * Execute the I/O pipeline * ========================================================================== */ static void zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline) { spa_t *spa = zio->io_spa; zio_type_t t = zio->io_type; int flags = (cutinline ? TQ_FRONT : 0); /* * If we're a config writer or a probe, the normal issue and * interrupt threads may all be blocked waiting for the config lock. * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL. */ if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE)) t = ZIO_TYPE_NULL; /* * A similar issue exists for the L2ARC write thread until L2ARC 2.0. */ if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux) t = ZIO_TYPE_NULL; /* * If this is a high priority I/O, then use the high priority taskq if * available. */ if ((zio->io_priority == ZIO_PRIORITY_NOW || zio->io_priority == ZIO_PRIORITY_SYNC_WRITE) && spa->spa_zio_taskq[t][q + 1].stqs_count != 0) q++; ASSERT3U(q, <, ZIO_TASKQ_TYPES); /* * NB: We are assuming that the zio can only be dispatched * to a single taskq at a time. It would be a grievous error * to dispatch the zio to another taskq at the same time. 
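 * The ASSERT on taskq_empty_ent() below enforces this: a zio whose
 * embedded taskq entry is still queued elsewhere would trip it.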
*/ ASSERT(taskq_empty_ent(&zio->io_tqent)); spa_taskq_dispatch_ent(spa, t, q, (task_func_t *)zio_execute, zio, flags, &zio->io_tqent); } static boolean_t zio_taskq_member(zio_t *zio, zio_taskq_type_t q) { spa_t *spa = zio->io_spa; taskq_t *tq = taskq_of_curthread(); for (zio_type_t t = 0; t < ZIO_TYPES; t++) { spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; uint_t i; for (i = 0; i < tqs->stqs_count; i++) { if (tqs->stqs_taskq[i] == tq) return (B_TRUE); } } return (B_FALSE); } static zio_t * zio_issue_async(zio_t *zio) { zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); return (NULL); } void zio_interrupt(zio_t *zio) { zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE); } void zio_delay_interrupt(zio_t *zio) { /* * The timeout_generic() function isn't defined in userspace, so * rather than trying to implement the function, the zio delay * functionality has been disabled for userspace builds. */ #ifdef _KERNEL /* * If io_target_timestamp is zero, then no delay has been registered * for this IO, thus jump to the end of this function and "skip" the * delay; issuing it directly to the zio layer. */ if (zio->io_target_timestamp != 0) { hrtime_t now = gethrtime(); if (now >= zio->io_target_timestamp) { /* * This IO has already taken longer than the target * delay to complete, so we don't want to delay it * any longer; we "miss" the delay and issue it * directly to the zio layer. This is likely due to * the target latency being set to a value less than * the underlying hardware can satisfy (e.g. delay * set to 1ms, but the disks take 10ms to complete an * IO request). */ DTRACE_PROBE2(zio__delay__miss, zio_t *, zio, hrtime_t, now); zio_interrupt(zio); } else { taskqid_t tid; hrtime_t diff = zio->io_target_timestamp - now; clock_t expire_at_tick = ddi_get_lbolt() + NSEC_TO_TICK(diff); DTRACE_PROBE3(zio__delay__hit, zio_t *, zio, hrtime_t, now, hrtime_t, diff); if (NSEC_TO_TICK(diff) == 0) { /* Our delay is less than a jiffy - just spin */ zfs_sleep_until(zio->io_target_timestamp); zio_interrupt(zio); } else { /* * Use taskq_dispatch_delay() in the place of * OpenZFS's timeout_generic(). */ tid = taskq_dispatch_delay(system_taskq, (task_func_t *)zio_interrupt, zio, TQ_NOSLEEP, expire_at_tick); if (tid == TASKQID_INVALID) { /* * Couldn't allocate a task. Just * finish the zio without a delay. */ zio_interrupt(zio); } } } return; } #endif DTRACE_PROBE1(zio__delay__skip, zio_t *, zio); zio_interrupt(zio); } static void zio_deadman_impl(zio_t *pio, int ziodepth) { zio_t *cio, *cio_next; zio_link_t *zl = NULL; vdev_t *vd = pio->io_vd; if (zio_deadman_log_all || (vd != NULL && vd->vdev_ops->vdev_op_leaf)) { vdev_queue_t *vq = vd ? &vd->vdev_queue : NULL; zbookmark_phys_t *zb = &pio->io_bookmark; uint64_t delta = gethrtime() - pio->io_timestamp; uint64_t failmode = spa_get_deadman_failmode(pio->io_spa); zfs_dbgmsg("slow zio[%d]: zio=%px timestamp=%llu " "delta=%llu queued=%llu io=%llu " "path=%s last=%llu " "type=%d priority=%d flags=0x%x " "stage=0x%x pipeline=0x%x pipeline-trace=0x%x " "objset=%llu object=%llu level=%llu blkid=%llu " "offset=%llu size=%llu error=%d", ziodepth, pio, pio->io_timestamp, delta, pio->io_delta, pio->io_delay, vd ? vd->vdev_path : "NULL", vq ? 
vq->vq_io_complete_ts : 0, pio->io_type, pio->io_priority, pio->io_flags, pio->io_stage, pio->io_pipeline, pio->io_pipeline_trace, zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid, pio->io_offset, pio->io_size, pio->io_error); (void) zfs_ereport_post(FM_EREPORT_ZFS_DEADMAN, pio->io_spa, vd, zb, pio, 0); if (failmode == ZIO_FAILURE_MODE_CONTINUE && taskq_empty_ent(&pio->io_tqent)) { zio_interrupt(pio); } } mutex_enter(&pio->io_lock); for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) { cio_next = zio_walk_children(pio, &zl); zio_deadman_impl(cio, ziodepth + 1); } mutex_exit(&pio->io_lock); } /* * Log the critical information describing this zio and all of its children * using the zfs_dbgmsg() interface then post deadman event for the ZED. */ void zio_deadman(zio_t *pio, char *tag) { spa_t *spa = pio->io_spa; char *name = spa_name(spa); if (!zfs_deadman_enabled || spa_suspended(spa)) return; zio_deadman_impl(pio, 0); switch (spa_get_deadman_failmode(spa)) { case ZIO_FAILURE_MODE_WAIT: zfs_dbgmsg("%s waiting for hung I/O to pool '%s'", tag, name); break; case ZIO_FAILURE_MODE_CONTINUE: zfs_dbgmsg("%s restarting hung I/O for pool '%s'", tag, name); break; case ZIO_FAILURE_MODE_PANIC: fm_panic("%s determined I/O to pool '%s' is hung.", tag, name); break; } } /* * Execute the I/O pipeline until one of the following occurs: * (1) the I/O completes; (2) the pipeline stalls waiting for * dependent child I/Os; (3) the I/O issues, so we're waiting * for an I/O completion interrupt; (4) the I/O is delegated by * vdev-level caching or aggregation; (5) the I/O is deferred * due to vdev-level queueing; (6) the I/O is handed off to * another thread. In all cases, the pipeline stops whenever * there's no CPU work; it never burns a thread in cv_wait_io(). * * There's no locking on io_stage because there's no legitimate way * for multiple threads to be attempting to process the same I/O. */ static zio_pipe_stage_t *zio_pipeline[]; /* * zio_execute() is a wrapper around the static function * __zio_execute() so that we can force __zio_execute() to be * inlined. This reduces stack overhead which is important * because __zio_execute() is called recursively in several zio * code paths. zio_execute() itself cannot be inlined because * it is externally visible. */ void zio_execute(zio_t *zio) { fstrans_cookie_t cookie; cookie = spl_fstrans_mark(); __zio_execute(zio); spl_fstrans_unmark(cookie); } /* * Used to determine if in the current context the stack is sized large * enough to allow zio_execute() to be called recursively. A minimum * stack size of 16K is required to avoid needing to re-dispatch the zio. */ static boolean_t zio_execute_stack_check(zio_t *zio) { #if !defined(HAVE_LARGE_STACKS) dsl_pool_t *dp = spa_get_dsl(zio->io_spa); /* Executing in txg_sync_thread() context. */ if (dp && curthread == dp->dp_tx.tx_sync_thread) return (B_TRUE); /* Pool initialization outside of zio_taskq context. 
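 * (As with the txg_sync case above, returning B_TRUE here causes
 * __zio_execute() to re-dispatch the zio to an ISSUE taskq rather than
 * recursing on a potentially small stack.)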
*/ if (dp && spa_is_initializing(dp->dp_spa) && !zio_taskq_member(zio, ZIO_TASKQ_ISSUE) && !zio_taskq_member(zio, ZIO_TASKQ_ISSUE_HIGH)) return (B_TRUE); #endif /* HAVE_LARGE_STACKS */ return (B_FALSE); } __attribute__((always_inline)) static inline void __zio_execute(zio_t *zio) { ASSERT3U(zio->io_queued_timestamp, >, 0); while (zio->io_stage < ZIO_STAGE_DONE) { enum zio_stage pipeline = zio->io_pipeline; enum zio_stage stage = zio->io_stage; zio->io_executor = curthread; ASSERT(!MUTEX_HELD(&zio->io_lock)); ASSERT(ISP2(stage)); ASSERT(zio->io_stall == NULL); do { stage <<= 1; } while ((stage & pipeline) == 0); ASSERT(stage <= ZIO_STAGE_DONE); /* * If we are in interrupt context and this pipeline stage * will grab a config lock that is held across I/O, * or may wait for an I/O that needs an interrupt thread * to complete, issue async to avoid deadlock. * * For VDEV_IO_START, we cut in line so that the io will * be sent to disk promptly. */ if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL && zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) { boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ? zio_requeue_io_start_cut_in_line : B_FALSE; zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut); return; } /* * If the current context doesn't have large enough stacks * the zio must be issued asynchronously to prevent overflow. */ if (zio_execute_stack_check(zio)) { boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ? zio_requeue_io_start_cut_in_line : B_FALSE; zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut); return; } zio->io_stage = stage; zio->io_pipeline_trace |= zio->io_stage; /* * The zio pipeline stage returns the next zio to execute * (typically the same as this one), or NULL if we should * stop. */ zio = zio_pipeline[highbit64(stage) - 1](zio); if (zio == NULL) return; } } /* * ========================================================================== * Initiate I/O, either sync or async * ========================================================================== */ int zio_wait(zio_t *zio) { /* * Some routines, like zio_free_sync(), may return a NULL zio * to avoid the performance overhead of creating and then destroying * an unneeded zio. For the callers' simplicity, we accept a NULL * zio and ignore it. */ if (zio == NULL) return (0); long timeout = MSEC_TO_TICK(zfs_deadman_ziotime_ms); int error; ASSERT3S(zio->io_stage, ==, ZIO_STAGE_OPEN); ASSERT3P(zio->io_executor, ==, NULL); zio->io_waiter = curthread; ASSERT0(zio->io_queued_timestamp); zio->io_queued_timestamp = gethrtime(); __zio_execute(zio); mutex_enter(&zio->io_lock); while (zio->io_executor != NULL) { error = cv_timedwait_io(&zio->io_cv, &zio->io_lock, ddi_get_lbolt() + timeout); if (zfs_deadman_enabled && error == -1 && gethrtime() - zio->io_queued_timestamp > spa_deadman_ziotime(zio->io_spa)) { mutex_exit(&zio->io_lock); timeout = MSEC_TO_TICK(zfs_deadman_checktime_ms); zio_deadman(zio, FTAG); mutex_enter(&zio->io_lock); } } mutex_exit(&zio->io_lock); error = zio->io_error; zio_destroy(zio); return (error); } void zio_nowait(zio_t *zio) { /* * See comment in zio_wait(). */ if (zio == NULL) return; ASSERT3P(zio->io_executor, ==, NULL); if (zio->io_child_type == ZIO_CHILD_LOGICAL && zio_unique_parent(zio) == NULL) { zio_t *pio; /* * This is a logical async I/O with no parent to wait for it. * We add it to the spa_async_root_zio "Godfather" I/O which * will ensure they complete prior to unloading the pool. 
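 * (A per-CPU godfather root is picked below, indexed by
 * CPU_SEQID_UNSTABLE; any of them would be correct, the per-CPU spread
 * presumably only exists to reduce contention on the parent's locks.)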
*/ spa_t *spa = zio->io_spa; pio = spa->spa_async_zio_root[CPU_SEQID_UNSTABLE]; zio_add_child(pio, zio); } ASSERT0(zio->io_queued_timestamp); zio->io_queued_timestamp = gethrtime(); __zio_execute(zio); } /* * ========================================================================== * Reexecute, cancel, or suspend/resume failed I/O * ========================================================================== */ static void zio_reexecute(zio_t *pio) { zio_t *cio, *cio_next; ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN); ASSERT(pio->io_gang_leader == NULL); ASSERT(pio->io_gang_tree == NULL); pio->io_flags = pio->io_orig_flags; pio->io_stage = pio->io_orig_stage; pio->io_pipeline = pio->io_orig_pipeline; pio->io_reexecute = 0; pio->io_flags |= ZIO_FLAG_REEXECUTED; pio->io_pipeline_trace = 0; pio->io_error = 0; for (int w = 0; w < ZIO_WAIT_TYPES; w++) pio->io_state[w] = 0; for (int c = 0; c < ZIO_CHILD_TYPES; c++) pio->io_child_error[c] = 0; if (IO_IS_ALLOCATING(pio)) BP_ZERO(pio->io_bp); /* * As we reexecute pio's children, new children could be created. * New children go to the head of pio's io_child_list, however, * so we will (correctly) not reexecute them. The key is that * the remainder of pio's io_child_list, from 'cio_next' onward, * cannot be affected by any side effects of reexecuting 'cio'. */ zio_link_t *zl = NULL; mutex_enter(&pio->io_lock); for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) { cio_next = zio_walk_children(pio, &zl); for (int w = 0; w < ZIO_WAIT_TYPES; w++) pio->io_children[cio->io_child_type][w]++; mutex_exit(&pio->io_lock); zio_reexecute(cio); mutex_enter(&pio->io_lock); } mutex_exit(&pio->io_lock); /* * Now that all children have been reexecuted, execute the parent. * We don't reexecute "The Godfather" I/O here as it's the * responsibility of the caller to wait on it. */ if (!(pio->io_flags & ZIO_FLAG_GODFATHER)) { pio->io_queued_timestamp = gethrtime(); __zio_execute(pio); } } void zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t reason) { if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC) fm_panic("Pool '%s' has encountered an uncorrectable I/O " "failure and the failure mode property for this pool " "is set to panic.", spa_name(spa)); cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable I/O " "failure and has been suspended.\n", spa_name(spa)); (void) zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, NULL, 0); mutex_enter(&spa->spa_suspend_lock); if (spa->spa_suspend_zio_root == NULL) spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); spa->spa_suspended = reason; if (zio != NULL) { ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); ASSERT(zio != spa->spa_suspend_zio_root); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(zio_unique_parent(zio) == NULL); ASSERT(zio->io_stage == ZIO_STAGE_DONE); zio_add_child(spa->spa_suspend_zio_root, zio); } mutex_exit(&spa->spa_suspend_lock); } int zio_resume(spa_t *spa) { zio_t *pio; /* * Reexecute all previously suspended i/o. 
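 * Every zio that failed while the pool was suspended was parented to
 * spa_suspend_zio_root by zio_suspend(); reexecuting that single root
 * below and then waiting on it is what retries them all.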
*/ mutex_enter(&spa->spa_suspend_lock); spa->spa_suspended = ZIO_SUSPEND_NONE; cv_broadcast(&spa->spa_suspend_cv); pio = spa->spa_suspend_zio_root; spa->spa_suspend_zio_root = NULL; mutex_exit(&spa->spa_suspend_lock); if (pio == NULL) return (0); zio_reexecute(pio); return (zio_wait(pio)); } void zio_resume_wait(spa_t *spa) { mutex_enter(&spa->spa_suspend_lock); while (spa_suspended(spa)) cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock); mutex_exit(&spa->spa_suspend_lock); } /* * ========================================================================== * Gang blocks. * * A gang block is a collection of small blocks that looks to the DMU * like one large block. When zio_dva_allocate() cannot find a block * of the requested size, due to either severe fragmentation or the pool * being nearly full, it calls zio_write_gang_block() to construct the * block from smaller fragments. * * A gang block consists of a gang header (zio_gbh_phys_t) and up to * three (SPA_GBH_NBLKPTRS) gang members. The gang header is just like * an indirect block: it's an array of block pointers. It consumes * only one sector and hence is allocatable regardless of fragmentation. * The gang header's bps point to its gang members, which hold the data. * * Gang blocks are self-checksumming, using the bp's * as the verifier to ensure uniqueness of the SHA256 checksum. * Critically, the gang block bp's blk_cksum is the checksum of the data, * not the gang header. This ensures that data block signatures (needed for * deduplication) are independent of how the block is physically stored. * * Gang blocks can be nested: a gang member may itself be a gang block. * Thus every gang block is a tree in which root and all interior nodes are * gang headers, and the leaves are normal blocks that contain user data. * The root of the gang tree is called the gang leader. * * To perform any operation (read, rewrite, free, claim) on a gang block, * zio_gang_assemble() first assembles the gang tree (minus data leaves) * in the io_gang_tree field of the original logical i/o by recursively * reading the gang leader and all gang headers below it. This yields * an in-core tree containing the contents of every gang header and the * bps for every constituent of the gang block. * * With the gang tree now assembled, zio_gang_issue() just walks the gang tree * and invokes a callback on each bp. To free a gang block, zio_gang_issue() * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp. * zio_claim_gang() provides a similarly trivial wrapper for zio_claim(). * zio_read_gang() is a wrapper around zio_read() that omits reading gang * headers, since we already have those in io_gang_tree. zio_rewrite_gang() * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite() * of the gang header plus zio_checksum_compute() of the data to update the * gang header's blk_cksum as described above. * * The two-phase assemble/issue model solves the problem of partial failure -- * what if you'd freed part of a gang block but then couldn't read the * gang header for another part? Assembling the entire gang tree first * ensures that all the necessary gang header I/O has succeeded before * starting the actual work of free, claim, or write. Once the gang tree * is assembled, free and claim are in-memory operations that cannot fail. * * In the event that a gang write fails, zio_dva_unallocate() walks the * gang tree to immediately free (i.e. insert back into the space map) * everything we've allocated. 
This ensures that we don't get ENOSPC * errors during repeated suspend/resume cycles due to a flaky device. * * Gang rewrites only happen during sync-to-convergence. If we can't assemble * the gang tree, we won't modify the block, so we can safely defer the free * (knowing that the block is still intact). If we *can* assemble the gang * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free * each constituent bp and we can allocate a new block on the next sync pass. * * In all cases, the gang tree allows complete recovery from partial failure. * ========================================================================== */ static void zio_gang_issue_func_done(zio_t *zio) { abd_free(zio->io_abd); } static zio_t * zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, uint64_t offset) { if (gn != NULL) return (pio); return (zio_read(pio, pio->io_spa, bp, abd_get_offset(data, offset), BP_GET_PSIZE(bp), zio_gang_issue_func_done, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark)); } static zio_t * zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, uint64_t offset) { zio_t *zio; if (gn != NULL) { abd_t *gbh_abd = abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE); zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, gbh_abd, SPA_GANGBLOCKSIZE, zio_gang_issue_func_done, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); /* * As we rewrite each gang header, the pipeline will compute * a new gang block header checksum for it; but no one will * compute a new data checksum, so we do that here. The one * exception is the gang leader: the pipeline already computed * its data checksum because that stage precedes gang assembly. * (Presently, nothing actually uses interior data checksums; * this is just good hygiene.) */ if (gn != pio->io_gang_leader->io_gang_tree) { abd_t *buf = abd_get_offset(data, offset); zio_checksum_compute(zio, BP_GET_CHECKSUM(bp), buf, BP_GET_PSIZE(bp)); abd_free(buf); } /* * If we are here to damage data for testing purposes, * leave the GBH alone so that we can detect the damage. 
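 * Stripping the vdev I/O stages below keeps the rewritten header from
 * ever being issued to disk, so the on-disk GBH (and the injected
 * damage) is left untouched.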
*/ if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE) zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; } else { zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, abd_get_offset(data, offset), BP_GET_PSIZE(bp), zio_gang_issue_func_done, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); } return (zio); } /* ARGSUSED */ static zio_t * zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, uint64_t offset) { zio_t *zio = zio_free_sync(pio, pio->io_spa, pio->io_txg, bp, ZIO_GANG_CHILD_FLAGS(pio)); if (zio == NULL) { zio = zio_null(pio, pio->io_spa, NULL, NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)); } return (zio); } /* ARGSUSED */ static zio_t * zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, uint64_t offset) { return (zio_claim(pio, pio->io_spa, pio->io_txg, bp, NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio))); } static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = { NULL, zio_read_gang, zio_rewrite_gang, zio_free_gang, zio_claim_gang, NULL }; static void zio_gang_tree_assemble_done(zio_t *zio); static zio_gang_node_t * zio_gang_node_alloc(zio_gang_node_t **gnpp) { zio_gang_node_t *gn; ASSERT(*gnpp == NULL); gn = kmem_zalloc(sizeof (*gn), KM_SLEEP); gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE); *gnpp = gn; return (gn); } static void zio_gang_node_free(zio_gang_node_t **gnpp) { zio_gang_node_t *gn = *gnpp; for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) ASSERT(gn->gn_child[g] == NULL); zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE); kmem_free(gn, sizeof (*gn)); *gnpp = NULL; } static void zio_gang_tree_free(zio_gang_node_t **gnpp) { zio_gang_node_t *gn = *gnpp; if (gn == NULL) return; for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) zio_gang_tree_free(&gn->gn_child[g]); zio_gang_node_free(gnpp); } static void zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp) { zio_gang_node_t *gn = zio_gang_node_alloc(gnpp); abd_t *gbh_abd = abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE); ASSERT(gio->io_gang_leader == gio); ASSERT(BP_IS_GANG(bp)); zio_nowait(zio_read(gio, gio->io_spa, bp, gbh_abd, SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn, gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark)); } static void zio_gang_tree_assemble_done(zio_t *zio) { zio_t *gio = zio->io_gang_leader; zio_gang_node_t *gn = zio->io_private; blkptr_t *bp = zio->io_bp; ASSERT(gio == zio_unique_parent(zio)); ASSERT(zio->io_child_count == 0); if (zio->io_error) return; /* this ABD was created from a linear buf in zio_gang_tree_assemble */ if (BP_SHOULD_BYTESWAP(bp)) byteswap_uint64_array(abd_to_buf(zio->io_abd), zio->io_size); ASSERT3P(abd_to_buf(zio->io_abd), ==, gn->gn_gbh); ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); abd_free(zio->io_abd); for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; if (!BP_IS_GANG(gbp)) continue; zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]); } } static void zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, abd_t *data, uint64_t offset) { zio_t *gio = pio->io_gang_leader; zio_t *zio; ASSERT(BP_IS_GANG(bp) == !!gn); ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp)); ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree); /* * If you're a gang header, your data is in gn->gn_gbh. * If you're a gang member, your data is in 'data' and gn == NULL. 
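 * The per-type callback is looked up in zio_gang_issue_func[] below, so
 * the same tree walk serves reads, rewrites, frees and claims.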
*/ zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data, offset); if (gn != NULL) { ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; if (BP_IS_HOLE(gbp)) continue; zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data, offset); offset += BP_GET_PSIZE(gbp); } } if (gn == gio->io_gang_tree) ASSERT3U(gio->io_size, ==, offset); if (zio != pio) zio_nowait(zio); } static zio_t * zio_gang_assemble(zio_t *zio) { blkptr_t *bp = zio->io_bp; ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL); ASSERT(zio->io_child_type > ZIO_CHILD_GANG); zio->io_gang_leader = zio; zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree); return (zio); } static zio_t * zio_gang_issue(zio_t *zio) { blkptr_t *bp = zio->io_bp; if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT, ZIO_WAIT_DONE)) { return (NULL); } ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio); ASSERT(zio->io_child_type > ZIO_CHILD_GANG); if (zio->io_child_error[ZIO_CHILD_GANG] == 0) zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_abd, 0); else zio_gang_tree_free(&zio->io_gang_tree); zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; return (zio); } static void zio_write_gang_member_ready(zio_t *zio) { zio_t *pio = zio_unique_parent(zio); dva_t *cdva = zio->io_bp->blk_dva; dva_t *pdva = pio->io_bp->blk_dva; uint64_t asize; zio_t *gio __maybe_unused = zio->io_gang_leader; if (BP_IS_HOLE(zio->io_bp)) return; ASSERT(BP_IS_HOLE(&zio->io_bp_orig)); ASSERT(zio->io_child_type == ZIO_CHILD_GANG); ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies); ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp)); ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp)); ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp)); mutex_enter(&pio->io_lock); for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) { ASSERT(DVA_GET_GANG(&pdva[d])); asize = DVA_GET_ASIZE(&pdva[d]); asize += DVA_GET_ASIZE(&cdva[d]); DVA_SET_ASIZE(&pdva[d], asize); } mutex_exit(&pio->io_lock); } static void zio_write_gang_done(zio_t *zio) { /* * The io_abd field will be NULL for a zio with no data. The io_flags * will initially have the ZIO_FLAG_NODATA bit flag set, but we can't * check for it here as it is cleared in zio_ready. */ if (zio->io_abd != NULL) abd_free(zio->io_abd); } static zio_t * zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) { spa_t *spa = pio->io_spa; blkptr_t *bp = pio->io_bp; zio_t *gio = pio->io_gang_leader; zio_t *zio; zio_gang_node_t *gn, **gnpp; zio_gbh_phys_t *gbh; abd_t *gbh_abd; uint64_t txg = pio->io_txg; uint64_t resid = pio->io_size; uint64_t lsize; int copies = gio->io_prop.zp_copies; int gbh_copies; zio_prop_t zp; int error; boolean_t has_data = !(pio->io_flags & ZIO_FLAG_NODATA); /* * encrypted blocks need DVA[2] free so encrypted gang headers can't * have a third copy. */ gbh_copies = MIN(copies + 1, spa_max_replication(spa)); if (gio->io_prop.zp_encrypt && gbh_copies >= SPA_DVAS_PER_BP) gbh_copies = SPA_DVAS_PER_BP - 1; int flags = METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER; if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(has_data); flags |= METASLAB_ASYNC_ALLOC; VERIFY(zfs_refcount_held(&mc->mc_allocator[pio->io_allocator]. mca_alloc_slots, pio)); /* * The logical zio has already placed a reservation for * 'copies' allocation slots but gang blocks may require * additional copies. These additional copies * (i.e. 
gbh_copies - copies) are guaranteed to succeed * since metaslab_class_throttle_reserve() always allows * additional reservations for gang blocks. */ VERIFY(metaslab_class_throttle_reserve(mc, gbh_copies - copies, pio->io_allocator, pio, flags)); } error = metaslab_alloc(spa, mc, SPA_GANGBLOCKSIZE, bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, flags, &pio->io_alloc_list, pio, pio->io_allocator); if (error) { if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(has_data); /* * If we failed to allocate the gang block header then * we remove any additional allocation reservations that * we placed here. The original reservation will * be removed when the logical I/O goes to the ready * stage. */ metaslab_class_throttle_unreserve(mc, gbh_copies - copies, pio->io_allocator, pio); } pio->io_error = error; return (pio); } if (pio == gio) { gnpp = &gio->io_gang_tree; } else { gnpp = pio->io_private; ASSERT(pio->io_ready == zio_write_gang_member_ready); } gn = zio_gang_node_alloc(gnpp); gbh = gn->gn_gbh; bzero(gbh, SPA_GANGBLOCKSIZE); gbh_abd = abd_get_from_buf(gbh, SPA_GANGBLOCKSIZE); /* * Create the gang header. */ zio = zio_rewrite(pio, spa, txg, bp, gbh_abd, SPA_GANGBLOCKSIZE, zio_write_gang_done, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); /* * Create and nowait the gang children. */ for (int g = 0; resid != 0; resid -= lsize, g++) { lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g), SPA_MINBLOCKSIZE); ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid); zp.zp_checksum = gio->io_prop.zp_checksum; zp.zp_compress = ZIO_COMPRESS_OFF; zp.zp_complevel = gio->io_prop.zp_complevel; zp.zp_type = DMU_OT_NONE; zp.zp_level = 0; zp.zp_copies = gio->io_prop.zp_copies; zp.zp_dedup = B_FALSE; zp.zp_dedup_verify = B_FALSE; zp.zp_nopwrite = B_FALSE; zp.zp_encrypt = gio->io_prop.zp_encrypt; zp.zp_byteorder = gio->io_prop.zp_byteorder; bzero(zp.zp_salt, ZIO_DATA_SALT_LEN); bzero(zp.zp_iv, ZIO_DATA_IV_LEN); bzero(zp.zp_mac, ZIO_DATA_MAC_LEN); zio_t *cio = zio_write(zio, spa, txg, &gbh->zg_blkptr[g], has_data ? abd_get_offset(pio->io_abd, pio->io_size - resid) : NULL, lsize, lsize, &zp, zio_write_gang_member_ready, NULL, NULL, zio_write_gang_done, &gn->gn_child[g], pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(has_data); /* * Gang children won't throttle but we should * account for their work, so reserve an allocation * slot for them here. */ VERIFY(metaslab_class_throttle_reserve(mc, zp.zp_copies, cio->io_allocator, cio, flags)); } zio_nowait(cio); } /* * Set pio's pipeline to just wait for zio to finish. */ pio->io_pipeline = ZIO_INTERLOCK_PIPELINE; /* * We didn't allocate this bp, so make sure it doesn't get unmarked. */ pio->io_flags &= ~ZIO_FLAG_FASTWRITE; zio_nowait(zio); return (pio); } /* * The zio_nop_write stage in the pipeline determines if allocating a * new bp is necessary. The nopwrite feature can handle writes in * either syncing or open context (i.e. zil writes) and as a result is * mutually exclusive with dedup. * * By leveraging a cryptographically secure checksum, such as SHA256, we * can compare the checksums of the new data and the old to determine if * allocating a new block is required. Note that our requirements for * cryptographic strength are fairly weak: there can't be any accidental * hash collisions, but we don't need to be secure against intentional * (malicious) collisions. 
To trigger a nopwrite, you have to be able * to write the file to begin with, and triggering an incorrect (hash * collision) nopwrite is no worse than simply writing to the file. * That said, there are no known attacks against the checksum algorithms * used for nopwrite, assuming that the salt and the checksums * themselves remain secret. */ static zio_t * zio_nop_write(zio_t *zio) { blkptr_t *bp = zio->io_bp; blkptr_t *bp_orig = &zio->io_bp_orig; zio_prop_t *zp = &zio->io_prop; ASSERT(BP_GET_LEVEL(bp) == 0); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); ASSERT(zp->zp_nopwrite); ASSERT(!zp->zp_dedup); ASSERT(zio->io_bp_override == NULL); ASSERT(IO_IS_ALLOCATING(zio)); /* * Check to see if the original bp and the new bp have matching * characteristics (i.e. same checksum, compression algorithms, etc). * If they don't then just continue with the pipeline which will * allocate a new bp. */ if (BP_IS_HOLE(bp_orig) || !(zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_flags & ZCHECKSUM_FLAG_NOPWRITE) || BP_IS_ENCRYPTED(bp) || BP_IS_ENCRYPTED(bp_orig) || BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) || BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) || BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) || zp->zp_copies != BP_GET_NDVAS(bp_orig)) return (zio); /* * If the checksums match then reset the pipeline so that we * avoid allocating a new bp and issuing any I/O. */ if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) { ASSERT(zio_checksum_table[zp->zp_checksum].ci_flags & ZCHECKSUM_FLAG_NOPWRITE); ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig)); ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig)); ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF); ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop, sizeof (uint64_t)) == 0); /* * If we're overwriting a block that is currently on an * indirect vdev, then ignore the nopwrite request and * allow a new block to be allocated on a concrete vdev. 
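 * (Indirect vdevs are what remains after device removal; writing a
 * fresh copy to a concrete vdev avoids paying for the remap on every
 * future read of this block.)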
*/ spa_config_enter(zio->io_spa, SCL_VDEV, FTAG, RW_READER); vdev_t *tvd = vdev_lookup_top(zio->io_spa, DVA_GET_VDEV(&bp->blk_dva[0])); if (tvd->vdev_ops == &vdev_indirect_ops) { spa_config_exit(zio->io_spa, SCL_VDEV, FTAG); return (zio); } spa_config_exit(zio->io_spa, SCL_VDEV, FTAG); *bp = *bp_orig; zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; zio->io_flags |= ZIO_FLAG_NOPWRITE; } return (zio); } /* * ========================================================================== * Dedup * ========================================================================== */ static void zio_ddt_child_read_done(zio_t *zio) { blkptr_t *bp = zio->io_bp; ddt_entry_t *dde = zio->io_private; ddt_phys_t *ddp; zio_t *pio = zio_unique_parent(zio); mutex_enter(&pio->io_lock); ddp = ddt_phys_select(dde, bp); if (zio->io_error == 0) ddt_phys_clear(ddp); /* this ddp doesn't need repair */ if (zio->io_error == 0 && dde->dde_repair_abd == NULL) dde->dde_repair_abd = zio->io_abd; else abd_free(zio->io_abd); mutex_exit(&pio->io_lock); } static zio_t * zio_ddt_read_start(zio_t *zio) { blkptr_t *bp = zio->io_bp; ASSERT(BP_GET_DEDUP(bp)); ASSERT(BP_GET_PSIZE(bp) == zio->io_size); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); if (zio->io_child_error[ZIO_CHILD_DDT]) { ddt_t *ddt = ddt_select(zio->io_spa, bp); ddt_entry_t *dde = ddt_repair_start(ddt, bp); ddt_phys_t *ddp = dde->dde_phys; ddt_phys_t *ddp_self = ddt_phys_select(dde, bp); blkptr_t blk; ASSERT(zio->io_vsd == NULL); zio->io_vsd = dde; if (ddp_self == NULL) return (zio); for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { if (ddp->ddp_phys_birth == 0 || ddp == ddp_self) continue; ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp, &blk); zio_nowait(zio_read(zio, zio->io_spa, &blk, abd_alloc_for_io(zio->io_size, B_TRUE), zio->io_size, zio_ddt_child_read_done, dde, zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE, &zio->io_bookmark)); } return (zio); } zio_nowait(zio_read(zio, zio->io_spa, bp, zio->io_abd, zio->io_size, NULL, NULL, zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark)); return (zio); } static zio_t * zio_ddt_read_done(zio_t *zio) { blkptr_t *bp = zio->io_bp; if (zio_wait_for_children(zio, ZIO_CHILD_DDT_BIT, ZIO_WAIT_DONE)) { return (NULL); } ASSERT(BP_GET_DEDUP(bp)); ASSERT(BP_GET_PSIZE(bp) == zio->io_size); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); if (zio->io_child_error[ZIO_CHILD_DDT]) { ddt_t *ddt = ddt_select(zio->io_spa, bp); ddt_entry_t *dde = zio->io_vsd; if (ddt == NULL) { ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE); return (zio); } if (dde == NULL) { zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1; zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); return (NULL); } if (dde->dde_repair_abd != NULL) { abd_copy(zio->io_abd, dde->dde_repair_abd, zio->io_size); zio->io_child_error[ZIO_CHILD_DDT] = 0; } ddt_repair_done(ddt, dde); zio->io_vsd = NULL; } ASSERT(zio->io_vsd == NULL); return (zio); } static boolean_t zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) { spa_t *spa = zio->io_spa; boolean_t do_raw = !!(zio->io_flags & ZIO_FLAG_RAW); ASSERT(!(zio->io_bp_override && do_raw)); /* * Note: we compare the original data, not the transformed data, * because when zio->io_bp is an override bp, we will not have * pushed the I/O transforms. That's an important optimization * because otherwise we'd compress/encrypt all dmu_sync() data twice. * However, we should never get a raw, override zio so in these * cases we can compare the io_abd directly. 
This is useful because * it allows us to do dedup verification even if we don't have access * to the original data (for instance, if the encryption keys aren't * loaded). */ for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { zio_t *lio = dde->dde_lead_zio[p]; if (lio != NULL && do_raw) { return (lio->io_size != zio->io_size || abd_cmp(zio->io_abd, lio->io_abd) != 0); } else if (lio != NULL) { return (lio->io_orig_size != zio->io_orig_size || abd_cmp(zio->io_orig_abd, lio->io_orig_abd) != 0); } } for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { ddt_phys_t *ddp = &dde->dde_phys[p]; if (ddp->ddp_phys_birth != 0 && do_raw) { blkptr_t blk = *zio->io_bp; uint64_t psize; abd_t *tmpabd; int error; ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth); psize = BP_GET_PSIZE(&blk); if (psize != zio->io_size) return (B_TRUE); ddt_exit(ddt); tmpabd = abd_alloc_for_io(psize, B_TRUE); error = zio_wait(zio_read(NULL, spa, &blk, tmpabd, psize, NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_RAW, &zio->io_bookmark)); if (error == 0) { if (abd_cmp(tmpabd, zio->io_abd) != 0) error = SET_ERROR(ENOENT); } abd_free(tmpabd); ddt_enter(ddt); return (error != 0); } else if (ddp->ddp_phys_birth != 0) { arc_buf_t *abuf = NULL; arc_flags_t aflags = ARC_FLAG_WAIT; blkptr_t blk = *zio->io_bp; int error; ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth); if (BP_GET_LSIZE(&blk) != zio->io_orig_size) return (B_TRUE); ddt_exit(ddt); error = arc_read(NULL, spa, &blk, arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &aflags, &zio->io_bookmark); if (error == 0) { if (abd_cmp_buf(zio->io_orig_abd, abuf->b_data, zio->io_orig_size) != 0) error = SET_ERROR(ENOENT); arc_buf_destroy(abuf, &abuf); } ddt_enter(ddt); return (error != 0); } } return (B_FALSE); } static void zio_ddt_child_write_ready(zio_t *zio) { int p = zio->io_prop.zp_copies; ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); ddt_entry_t *dde = zio->io_private; ddt_phys_t *ddp = &dde->dde_phys[p]; zio_t *pio; if (zio->io_error) return; ddt_enter(ddt); ASSERT(dde->dde_lead_zio[p] == zio); ddt_phys_fill(ddp, zio->io_bp); zio_link_t *zl = NULL; while ((pio = zio_walk_parents(zio, &zl)) != NULL) ddt_bp_fill(ddp, pio->io_bp, zio->io_txg); ddt_exit(ddt); } static void zio_ddt_child_write_done(zio_t *zio) { int p = zio->io_prop.zp_copies; ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); ddt_entry_t *dde = zio->io_private; ddt_phys_t *ddp = &dde->dde_phys[p]; ddt_enter(ddt); ASSERT(ddp->ddp_refcnt == 0); ASSERT(dde->dde_lead_zio[p] == zio); dde->dde_lead_zio[p] = NULL; if (zio->io_error == 0) { zio_link_t *zl = NULL; while (zio_walk_parents(zio, &zl) != NULL) ddt_phys_addref(ddp); } else { ddt_phys_clear(ddp); } ddt_exit(ddt); } static zio_t * zio_ddt_write(zio_t *zio) { spa_t *spa = zio->io_spa; blkptr_t *bp = zio->io_bp; uint64_t txg = zio->io_txg; zio_prop_t *zp = &zio->io_prop; int p = zp->zp_copies; zio_t *cio = NULL; ddt_t *ddt = ddt_select(spa, bp); ddt_entry_t *dde; ddt_phys_t *ddp; ASSERT(BP_GET_DEDUP(bp)); ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum); ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override); ASSERT(!(zio->io_bp_override && (zio->io_flags & ZIO_FLAG_RAW))); ddt_enter(ddt); dde = ddt_lookup(ddt, bp, B_TRUE); ddp = &dde->dde_phys[p]; if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) { /* * If we're using a weak checksum, upgrade to a strong checksum * and try again. If we're already using a strong checksum, * we can't resolve it, so just convert to an ordinary write. 
* (And automatically e-mail a paper to Nature?) */ if (!(zio_checksum_table[zp->zp_checksum].ci_flags & ZCHECKSUM_FLAG_DEDUP)) { zp->zp_checksum = spa_dedup_checksum(spa); zio_pop_transforms(zio); zio->io_stage = ZIO_STAGE_OPEN; BP_ZERO(bp); } else { zp->zp_dedup = B_FALSE; BP_SET_DEDUP(bp, B_FALSE); } ASSERT(!BP_GET_DEDUP(bp)); zio->io_pipeline = ZIO_WRITE_PIPELINE; ddt_exit(ddt); return (zio); } if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) { if (ddp->ddp_phys_birth != 0) ddt_bp_fill(ddp, bp, txg); if (dde->dde_lead_zio[p] != NULL) zio_add_child(zio, dde->dde_lead_zio[p]); else ddt_phys_addref(ddp); } else if (zio->io_bp_override) { ASSERT(bp->blk_birth == txg); ASSERT(BP_EQUAL(bp, zio->io_bp_override)); ddt_phys_fill(ddp, bp); ddt_phys_addref(ddp); } else { cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd, zio->io_orig_size, zio->io_orig_size, zp, zio_ddt_child_write_ready, NULL, NULL, zio_ddt_child_write_done, dde, zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); zio_push_transform(cio, zio->io_abd, zio->io_size, 0, NULL); dde->dde_lead_zio[p] = cio; } ddt_exit(ddt); zio_nowait(cio); return (zio); } ddt_entry_t *freedde; /* for debugging */ static zio_t * zio_ddt_free(zio_t *zio) { spa_t *spa = zio->io_spa; blkptr_t *bp = zio->io_bp; ddt_t *ddt = ddt_select(spa, bp); ddt_entry_t *dde; ddt_phys_t *ddp; ASSERT(BP_GET_DEDUP(bp)); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ddt_enter(ddt); freedde = dde = ddt_lookup(ddt, bp, B_TRUE); if (dde) { ddp = ddt_phys_select(dde, bp); if (ddp) ddt_phys_decref(ddp); } ddt_exit(ddt); return (zio); } /* * ========================================================================== * Allocate and free blocks * ========================================================================== */ static zio_t * zio_io_to_allocate(spa_t *spa, int allocator) { zio_t *zio; ASSERT(MUTEX_HELD(&spa->spa_alloc_locks[allocator])); zio = avl_first(&spa->spa_alloc_trees[allocator]); if (zio == NULL) return (NULL); ASSERT(IO_IS_ALLOCATING(zio)); /* * Try to place a reservation for this zio. If we're unable to * reserve then we throttle. */ ASSERT3U(zio->io_allocator, ==, allocator); if (!metaslab_class_throttle_reserve(zio->io_metaslab_class, zio->io_prop.zp_copies, zio->io_allocator, zio, 0)) { return (NULL); } avl_remove(&spa->spa_alloc_trees[allocator], zio); ASSERT3U(zio->io_stage, <, ZIO_STAGE_DVA_ALLOCATE); return (zio); } static zio_t * zio_dva_throttle(zio_t *zio) { spa_t *spa = zio->io_spa; zio_t *nio; metaslab_class_t *mc; /* locate an appropriate allocation class */ mc = spa_preferred_class(spa, zio->io_size, zio->io_prop.zp_type, zio->io_prop.zp_level, zio->io_prop.zp_zpl_smallblk); if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE || !mc->mc_alloc_throttle_enabled || zio->io_child_type == ZIO_CHILD_GANG || zio->io_flags & ZIO_FLAG_NODATA) { return (zio); } ASSERT(zio->io_child_type > ZIO_CHILD_GANG); ASSERT3U(zio->io_queued_timestamp, >, 0); ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE); zbookmark_phys_t *bm = &zio->io_bookmark; /* * We want to try to use as many allocators as possible to help improve * performance, but we also want logically adjacent IOs to be physically * adjacent to improve sequential read performance. We chunk each object * into 2^20 block regions, and then hash based on the objset, object, * level, and region to accomplish both of these goals. 
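 *
 * For example (hypothetical numbers): with spa_alloc_count = 4, two
 * level 0 writes to the same objset and object with blkids 100 and
 * 1000 both fall in region (blkid >> 20) == 0, so they hash to the
 * same allocator and stay physically adjacent, while a write at blkid
 * 3000000 falls in region 2 and may land on a different allocator.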
*/ zio->io_allocator = cityhash4(bm->zb_objset, bm->zb_object, bm->zb_level, bm->zb_blkid >> 20) % spa->spa_alloc_count; mutex_enter(&spa->spa_alloc_locks[zio->io_allocator]); ASSERT(zio->io_type == ZIO_TYPE_WRITE); zio->io_metaslab_class = mc; avl_add(&spa->spa_alloc_trees[zio->io_allocator], zio); nio = zio_io_to_allocate(spa, zio->io_allocator); mutex_exit(&spa->spa_alloc_locks[zio->io_allocator]); return (nio); } static void zio_allocate_dispatch(spa_t *spa, int allocator) { zio_t *zio; mutex_enter(&spa->spa_alloc_locks[allocator]); zio = zio_io_to_allocate(spa, allocator); mutex_exit(&spa->spa_alloc_locks[allocator]); if (zio == NULL) return; ASSERT3U(zio->io_stage, ==, ZIO_STAGE_DVA_THROTTLE); ASSERT0(zio->io_error); zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_TRUE); } static zio_t * zio_dva_allocate(zio_t *zio) { spa_t *spa = zio->io_spa; metaslab_class_t *mc; blkptr_t *bp = zio->io_bp; int error; int flags = 0; if (zio->io_gang_leader == NULL) { ASSERT(zio->io_child_type > ZIO_CHILD_GANG); zio->io_gang_leader = zio; } ASSERT(BP_IS_HOLE(bp)); ASSERT0(BP_GET_NDVAS(bp)); ASSERT3U(zio->io_prop.zp_copies, >, 0); ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa)); ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); flags |= (zio->io_flags & ZIO_FLAG_FASTWRITE) ? METASLAB_FASTWRITE : 0; if (zio->io_flags & ZIO_FLAG_NODATA) flags |= METASLAB_DONT_THROTTLE; if (zio->io_flags & ZIO_FLAG_GANG_CHILD) flags |= METASLAB_GANG_CHILD; if (zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE) flags |= METASLAB_ASYNC_ALLOC; /* * if not already chosen, locate an appropriate allocation class */ mc = zio->io_metaslab_class; if (mc == NULL) { mc = spa_preferred_class(spa, zio->io_size, zio->io_prop.zp_type, zio->io_prop.zp_level, zio->io_prop.zp_zpl_smallblk); zio->io_metaslab_class = mc; } /* * Try allocating the block in the usual metaslab class. * If that's full, allocate it in the normal class. * If that's full, allocate as a gang block, * and if all are full, the allocation fails (which shouldn't happen). * * Note that we do not fall back on embedded slog (ZIL) space, to * preserve unfragmented slog space, which is critical for decent * sync write performance. If a log allocation fails, we will fall * back to spa_sync() which is abysmal for performance. */ error = metaslab_alloc(spa, mc, zio->io_size, bp, zio->io_prop.zp_copies, zio->io_txg, NULL, flags, &zio->io_alloc_list, zio, zio->io_allocator); /* * Fallback to normal class when an alloc class is full */ if (error == ENOSPC && mc != spa_normal_class(spa)) { /* * If throttling, transfer reservation over to normal class. * The io_allocator slot can remain the same even though we * are switching classes. 
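 *
 * After the transfer, the allocation is retried in the normal class;
 * if that also returns ENOSPC and the block is larger than
 * SPA_MINBLOCKSIZE, it is retried once more as a gang block before the
 * error is finally surfaced on the zio.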
*/ if (mc->mc_alloc_throttle_enabled && (zio->io_flags & ZIO_FLAG_IO_ALLOCATING)) { metaslab_class_throttle_unreserve(mc, zio->io_prop.zp_copies, zio->io_allocator, zio); zio->io_flags &= ~ZIO_FLAG_IO_ALLOCATING; VERIFY(metaslab_class_throttle_reserve( spa_normal_class(spa), zio->io_prop.zp_copies, zio->io_allocator, zio, flags | METASLAB_MUST_RESERVE)); } zio->io_metaslab_class = mc = spa_normal_class(spa); if (zfs_flags & ZFS_DEBUG_METASLAB_ALLOC) { zfs_dbgmsg("%s: metaslab allocation failure, " "trying normal class: zio %px, size %llu, error %d", spa_name(spa), zio, zio->io_size, error); } error = metaslab_alloc(spa, mc, zio->io_size, bp, zio->io_prop.zp_copies, zio->io_txg, NULL, flags, &zio->io_alloc_list, zio, zio->io_allocator); } if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) { if (zfs_flags & ZFS_DEBUG_METASLAB_ALLOC) { zfs_dbgmsg("%s: metaslab allocation failure, " "trying ganging: zio %px, size %llu, error %d", spa_name(spa), zio, zio->io_size, error); } return (zio_write_gang_block(zio, mc)); } if (error != 0) { if (error != ENOSPC || (zfs_flags & ZFS_DEBUG_METASLAB_ALLOC)) { zfs_dbgmsg("%s: metaslab allocation failure: zio %px, " "size %llu, error %d", spa_name(spa), zio, zio->io_size, error); } zio->io_error = error; } return (zio); } static zio_t * zio_dva_free(zio_t *zio) { metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE); return (zio); } static zio_t * zio_dva_claim(zio_t *zio) { int error; error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg); if (error) zio->io_error = error; return (zio); } /* * Undo an allocation. This is used by zio_done() when an I/O fails * and we want to give back the block we just allocated. * This handles both normal blocks and gang blocks. */ static void zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp) { ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp)); ASSERT(zio->io_bp_override == NULL); if (!BP_IS_HOLE(bp)) metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE); if (gn != NULL) { for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { zio_dva_unallocate(zio, gn->gn_child[g], &gn->gn_gbh->zg_blkptr[g]); } } } /* * Try to allocate an intent log block. Return 0 on success, errno on failure. */ int zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp, uint64_t size, boolean_t *slog) { int error = 1; zio_alloc_list_t io_alloc_list; ASSERT(txg > spa_syncing_txg(spa)); metaslab_trace_init(&io_alloc_list); /* * Block pointer fields are useful to metaslabs for stats and debugging. * Fill in the obvious ones before calling into metaslab_alloc(). */ BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); BP_SET_PSIZE(new_bp, size); BP_SET_LEVEL(new_bp, 0); /* * When allocating a zil block, we don't have information about * the final destination of the block except the objset it's part * of, so we just hash the objset ID to pick the allocator to get * some parallelism. 
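 *
 * The attempts below fall back in order: the dedicated slog class
 * first, then the embedded log class, then the normal class.  *slog is
 * set only when the dedicated slog class attempt itself succeeds.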
*/ int flags = METASLAB_FASTWRITE | METASLAB_ZIL; int allocator = cityhash4(0, 0, 0, os->os_dsl_dataset->ds_object) % spa->spa_alloc_count; error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1, txg, NULL, flags, &io_alloc_list, NULL, allocator); *slog = (error == 0); if (error != 0) { error = metaslab_alloc(spa, spa_embedded_log_class(spa), size, new_bp, 1, txg, NULL, flags, &io_alloc_list, NULL, allocator); } if (error != 0) { error = metaslab_alloc(spa, spa_normal_class(spa), size, new_bp, 1, txg, NULL, flags, &io_alloc_list, NULL, allocator); } metaslab_trace_fini(&io_alloc_list); if (error == 0) { BP_SET_LSIZE(new_bp, size); BP_SET_PSIZE(new_bp, size); BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF); BP_SET_CHECKSUM(new_bp, spa_version(spa) >= SPA_VERSION_SLIM_ZIL ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG); BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); BP_SET_LEVEL(new_bp, 0); BP_SET_DEDUP(new_bp, 0); BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER); /* * encrypted blocks will require an IV and salt. We generate * these now since we will not be rewriting the bp at * rewrite time. */ if (os->os_encrypted) { uint8_t iv[ZIO_DATA_IV_LEN]; uint8_t salt[ZIO_DATA_SALT_LEN]; BP_SET_CRYPT(new_bp, B_TRUE); VERIFY0(spa_crypt_get_salt(spa, dmu_objset_id(os), salt)); VERIFY0(zio_crypt_generate_iv(iv)); zio_crypt_encode_params_bp(new_bp, salt, iv); } } else { zfs_dbgmsg("%s: zil block allocation failure: " "size %llu, error %d", spa_name(spa), size, error); } return (error); } /* * ========================================================================== * Read and write to physical devices * ========================================================================== */ /* * Issue an I/O to the underlying vdev. Typically the issue pipeline * stops after this stage and will resume upon I/O completion. * However, there are instances where the vdev layer may need to * continue the pipeline when an I/O was not issued. Since the I/O * that was sent to the vdev layer might be different than the one * currently active in the pipeline (see vdev_queue_io()), we explicitly * force the underlying vdev layers to call either zio_execute() or * zio_interrupt() to ensure that the pipeline continues with the correct I/O. */ static zio_t * zio_vdev_io_start(zio_t *zio) { vdev_t *vd = zio->io_vd; uint64_t align; spa_t *spa = zio->io_spa; zio->io_delay = 0; ASSERT(zio->io_error == 0); ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0); if (vd == NULL) { if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) spa_config_enter(spa, SCL_ZIO, zio, RW_READER); /* * The mirror_ops handle multiple DVAs in a single BP. */ vdev_mirror_ops.vdev_op_io_start(zio); return (NULL); } ASSERT3P(zio->io_logical, !=, zio); if (zio->io_type == ZIO_TYPE_WRITE) { ASSERT(spa->spa_trust_config); /* * Note: the code can handle other kinds of writes, * but we don't expect them. */ if (zio->io_vd->vdev_removing) { ASSERT(zio->io_flags & (ZIO_FLAG_PHYSICAL | ZIO_FLAG_SELF_HEAL | ZIO_FLAG_RESILVER | ZIO_FLAG_INDUCE_DAMAGE)); } } align = 1ULL << vd->vdev_top->vdev_ashift; if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && P2PHASE(zio->io_size, align) != 0) { /* Transform logical writes to be a full physical block size. 
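 * For example (hypothetical sizes): with vdev_ashift = 12 the alignment
 * is 1 << 12 = 4096 bytes, so a 2560-byte logical write is copied into
 * a 4096-byte buffer and the trailing 4096 - 2560 = 1536 bytes are
 * zero-filled; reads are simply widened and trimmed back afterwards by
 * the zio_subblock transform.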
*/ uint64_t asize = P2ROUNDUP(zio->io_size, align); abd_t *abuf = abd_alloc_sametype(zio->io_abd, asize); ASSERT(vd == vd->vdev_top); if (zio->io_type == ZIO_TYPE_WRITE) { abd_copy(abuf, zio->io_abd, zio->io_size); abd_zero_off(abuf, zio->io_size, asize - zio->io_size); } zio_push_transform(zio, abuf, asize, asize, zio_subblock); } /* * If this is not a physical io, make sure that it is properly aligned * before proceeding. */ if (!(zio->io_flags & ZIO_FLAG_PHYSICAL)) { ASSERT0(P2PHASE(zio->io_offset, align)); ASSERT0(P2PHASE(zio->io_size, align)); } else { /* * For physical writes, we allow 512b aligned writes and assume * the device will perform a read-modify-write as necessary. */ ASSERT0(P2PHASE(zio->io_offset, SPA_MINBLOCKSIZE)); ASSERT0(P2PHASE(zio->io_size, SPA_MINBLOCKSIZE)); } VERIFY(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa)); /* * If this is a repair I/O, and there's no self-healing involved -- * that is, we're just resilvering what we expect to resilver -- * then don't do the I/O unless zio's txg is actually in vd's DTL. * This prevents spurious resilvering. * * There are a few ways that we can end up creating these spurious * resilver i/os: * * 1. A resilver i/o will be issued if any DVA in the BP has a * dirty DTL. The mirror code will issue resilver writes to * each DVA, including the one(s) that are not on vdevs with dirty * DTLs. * * 2. With nested replication, which happens when we have a * "replacing" or "spare" vdev that's a child of a mirror or raidz. * For example, given mirror(replacing(A+B), C), it's likely that * only A is out of date (it's the new device). In this case, we'll * read from C, then use the data to resilver A+B -- but we don't * actually want to resilver B, just A. The top-level mirror has no * way to know this, so instead we just discard unnecessary repairs * as we work our way down the vdev tree. * * 3. ZTEST also creates mirrors of mirrors, mirrors of raidz, etc. * The same logic applies to any form of nested replication: ditto * + mirror, RAID-Z + replacing, etc. * * However, indirect vdevs point off to other vdevs which may have * DTL's, so we never bypass them. The child i/os on concrete vdevs * will be properly bypassed instead. * * Leaf DTL_PARTIAL can be empty when a legitimate write comes from * a dRAID spare vdev. For example, when a dRAID spare is first * used, its spare blocks need to be written to but the leaf vdev's * of such blocks can have empty DTL_PARTIAL. * * There seemed no clean way to allow such writes while bypassing * spurious ones. At this point, just avoid all bypassing for dRAID * for correctness. */ if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) && !(zio->io_flags & ZIO_FLAG_SELF_HEAL) && zio->io_txg != 0 && /* not a delegated i/o */ vd->vdev_ops != &vdev_indirect_ops && vd->vdev_top->vdev_ops != &vdev_draid_ops && !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) { ASSERT(zio->io_type == ZIO_TYPE_WRITE); zio_vdev_io_bypass(zio); return (zio); } /* * Select the next best leaf I/O to process. Distributed spares are * excluded since they dispatch the I/O directly to a leaf vdev after * applying the dRAID mapping. 
*/ if (vd->vdev_ops->vdev_op_leaf && vd->vdev_ops != &vdev_draid_spare_ops && (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_TRIM)) { if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio)) return (zio); if ((zio = vdev_queue_io(zio)) == NULL) return (NULL); if (!vdev_accessible(vd, zio)) { zio->io_error = SET_ERROR(ENXIO); zio_interrupt(zio); return (NULL); } zio->io_delay = gethrtime(); } vd->vdev_ops->vdev_op_io_start(zio); return (NULL); } static zio_t * zio_vdev_io_done(zio_t *zio) { vdev_t *vd = zio->io_vd; vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops; boolean_t unexpected_error = B_FALSE; if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) { return (NULL); } ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_TRIM); if (zio->io_delay) zio->io_delay = gethrtime() - zio->io_delay; if (vd != NULL && vd->vdev_ops->vdev_op_leaf && vd->vdev_ops != &vdev_draid_spare_ops) { vdev_queue_io_done(zio); if (zio->io_type == ZIO_TYPE_WRITE) vdev_cache_write(zio); if (zio_injection_enabled && zio->io_error == 0) zio->io_error = zio_handle_device_injections(vd, zio, EIO, EILSEQ); if (zio_injection_enabled && zio->io_error == 0) zio->io_error = zio_handle_label_injection(zio, EIO); if (zio->io_error && zio->io_type != ZIO_TYPE_TRIM) { if (!vdev_accessible(vd, zio)) { zio->io_error = SET_ERROR(ENXIO); } else { unexpected_error = B_TRUE; } } } ops->vdev_op_io_done(zio); if (unexpected_error) VERIFY(vdev_probe(vd, zio) == NULL); return (zio); } /* * This function is used to change the priority of an existing zio that is * currently in-flight. This is used by the arc to upgrade priority in the * event that a demand read is made for a block that is currently queued * as a scrub or async read IO. Otherwise, the high priority read request * would end up having to wait for the lower priority IO. */ void zio_change_priority(zio_t *pio, zio_priority_t priority) { zio_t *cio, *cio_next; zio_link_t *zl = NULL; ASSERT3U(priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); if (pio->io_vd != NULL && pio->io_vd->vdev_ops->vdev_op_leaf) { vdev_queue_change_io_priority(pio, priority); } else { pio->io_priority = priority; } mutex_enter(&pio->io_lock); for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) { cio_next = zio_walk_children(pio, &zl); zio_change_priority(cio, priority); } mutex_exit(&pio->io_lock); } /* * For non-raidz ZIOs, we can just copy aside the bad data read from the * disk, and use that to finish the checksum ereport later. 
*/ static void zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr, const abd_t *good_buf) { /* no processing needed */ zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE); } /*ARGSUSED*/ void zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored) { void *abd = abd_alloc_sametype(zio->io_abd, zio->io_size); abd_copy(abd, zio->io_abd, zio->io_size); zcr->zcr_cbinfo = zio->io_size; zcr->zcr_cbdata = abd; zcr->zcr_finish = zio_vsd_default_cksum_finish; zcr->zcr_free = zio_abd_free; } static zio_t * zio_vdev_io_assess(zio_t *zio) { vdev_t *vd = zio->io_vd; if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) { return (NULL); } if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) spa_config_exit(zio->io_spa, SCL_ZIO, zio); if (zio->io_vsd != NULL) { zio->io_vsd_ops->vsd_free(zio); zio->io_vsd = NULL; } if (zio_injection_enabled && zio->io_error == 0) zio->io_error = zio_handle_fault_injection(zio, EIO); /* * If the I/O failed, determine whether we should attempt to retry it. * * On retry, we cut in line in the issue queue, since we don't want * compression/checksumming/etc. work to prevent our (cheap) IO reissue. */ if (zio->io_error && vd == NULL && !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) { ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */ ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf */ zio->io_error = 0; zio->io_flags |= ZIO_FLAG_IO_RETRY | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE; zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1; zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, zio_requeue_io_start_cut_in_line); return (NULL); } /* * If we got an error on a leaf device, convert it to ENXIO * if the device is not accessible at all. */ if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf && !vdev_accessible(vd, zio)) zio->io_error = SET_ERROR(ENXIO); /* * If we can't write to an interior vdev (mirror or RAID-Z), * set vdev_cant_write so that we stop trying to allocate from it. */ if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE && vd != NULL && !vd->vdev_ops->vdev_op_leaf) { vd->vdev_cant_write = B_TRUE; } /* * If a cache flush returns ENOTSUP or ENOTTY, we know that no future * attempts will ever succeed. In this case we set a persistent * boolean flag so that we don't bother with it in the future. */ if ((zio->io_error == ENOTSUP || zio->io_error == ENOTTY) && zio->io_type == ZIO_TYPE_IOCTL && zio->io_cmd == DKIOCFLUSHWRITECACHE && vd != NULL) vd->vdev_nowritecache = B_TRUE; if (zio->io_error) zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; if (vd != NULL && vd->vdev_ops->vdev_op_leaf && zio->io_physdone != NULL) { ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED)); ASSERT(zio->io_child_type == ZIO_CHILD_VDEV); zio->io_physdone(zio->io_logical); } return (zio); } void zio_vdev_io_reissue(zio_t *zio) { ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); ASSERT(zio->io_error == 0); zio->io_stage >>= 1; } void zio_vdev_io_redone(zio_t *zio) { ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE); zio->io_stage >>= 1; } void zio_vdev_io_bypass(zio_t *zio) { ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); ASSERT(zio->io_error == 0); zio->io_flags |= ZIO_FLAG_IO_BYPASS; zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1; } /* * ========================================================================== * Encrypt and store encryption parameters * ========================================================================== */ /* * This function is used for ZIO_STAGE_ENCRYPT. 
It is responsible for * managing the storage of encryption parameters and passing them to the * lower-level encryption functions. */ static zio_t * zio_encrypt(zio_t *zio) { zio_prop_t *zp = &zio->io_prop; spa_t *spa = zio->io_spa; blkptr_t *bp = zio->io_bp; uint64_t psize = BP_GET_PSIZE(bp); uint64_t dsobj = zio->io_bookmark.zb_objset; dmu_object_type_t ot = BP_GET_TYPE(bp); void *enc_buf = NULL; abd_t *eabd = NULL; uint8_t salt[ZIO_DATA_SALT_LEN]; uint8_t iv[ZIO_DATA_IV_LEN]; uint8_t mac[ZIO_DATA_MAC_LEN]; boolean_t no_crypt = B_FALSE; /* the root zio already encrypted the data */ if (zio->io_child_type == ZIO_CHILD_GANG) return (zio); /* only ZIL blocks are re-encrypted on rewrite */ if (!IO_IS_ALLOCATING(zio) && ot != DMU_OT_INTENT_LOG) return (zio); if (!(zp->zp_encrypt || BP_IS_ENCRYPTED(bp))) { BP_SET_CRYPT(bp, B_FALSE); return (zio); } /* if we are doing raw encryption set the provided encryption params */ if (zio->io_flags & ZIO_FLAG_RAW_ENCRYPT) { ASSERT0(BP_GET_LEVEL(bp)); BP_SET_CRYPT(bp, B_TRUE); BP_SET_BYTEORDER(bp, zp->zp_byteorder); if (ot != DMU_OT_OBJSET) zio_crypt_encode_mac_bp(bp, zp->zp_mac); /* dnode blocks must be written out in the provided byteorder */ if (zp->zp_byteorder != ZFS_HOST_BYTEORDER && ot == DMU_OT_DNODE) { void *bswap_buf = zio_buf_alloc(psize); abd_t *babd = abd_get_from_buf(bswap_buf, psize); ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF); abd_copy_to_buf(bswap_buf, zio->io_abd, psize); dmu_ot_byteswap[DMU_OT_BYTESWAP(ot)].ob_func(bswap_buf, psize); abd_take_ownership_of_buf(babd, B_TRUE); zio_push_transform(zio, babd, psize, psize, NULL); } if (DMU_OT_IS_ENCRYPTED(ot)) zio_crypt_encode_params_bp(bp, zp->zp_salt, zp->zp_iv); return (zio); } /* indirect blocks only maintain a cksum of the lower level MACs */ if (BP_GET_LEVEL(bp) > 0) { BP_SET_CRYPT(bp, B_TRUE); VERIFY0(zio_crypt_do_indirect_mac_checksum_abd(B_TRUE, zio->io_orig_abd, BP_GET_LSIZE(bp), BP_SHOULD_BYTESWAP(bp), mac)); zio_crypt_encode_mac_bp(bp, mac); return (zio); } /* * Objset blocks are a special case since they have 2 256-bit MACs * embedded within them. */ if (ot == DMU_OT_OBJSET) { ASSERT0(DMU_OT_IS_ENCRYPTED(ot)); ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF); BP_SET_CRYPT(bp, B_TRUE); VERIFY0(spa_do_crypt_objset_mac_abd(B_TRUE, spa, dsobj, zio->io_abd, psize, BP_SHOULD_BYTESWAP(bp))); return (zio); } /* unencrypted object types are only authenticated with a MAC */ if (!DMU_OT_IS_ENCRYPTED(ot)) { BP_SET_CRYPT(bp, B_TRUE); VERIFY0(spa_do_crypt_mac_abd(B_TRUE, spa, dsobj, zio->io_abd, psize, mac)); zio_crypt_encode_mac_bp(bp, mac); return (zio); } /* * Later passes of sync-to-convergence may decide to rewrite data * in place to avoid more disk reallocations. This presents a problem * for encryption because this constitutes rewriting the new data with * the same encryption key and IV. However, this only applies to blocks * in the MOS (particularly the spacemaps) and we do not encrypt the * MOS. We assert that the zio is allocating or an intent log write * to enforce this. */ ASSERT(IO_IS_ALLOCATING(zio) || ot == DMU_OT_INTENT_LOG); ASSERT(BP_GET_LEVEL(bp) == 0 || ot == DMU_OT_INTENT_LOG); ASSERT(spa_feature_is_active(spa, SPA_FEATURE_ENCRYPTION)); ASSERT3U(psize, !=, 0); enc_buf = zio_buf_alloc(psize); eabd = abd_get_from_buf(enc_buf, psize); abd_take_ownership_of_buf(eabd, B_TRUE); /* * For an explanation of what encryption parameters are stored * where, see the block comment in zio_crypt.c. 
*/ if (ot == DMU_OT_INTENT_LOG) { zio_crypt_decode_params_bp(bp, salt, iv); } else { BP_SET_CRYPT(bp, B_TRUE); } /* Perform the encryption. This should not fail */ VERIFY0(spa_do_crypt_abd(B_TRUE, spa, &zio->io_bookmark, BP_GET_TYPE(bp), BP_GET_DEDUP(bp), BP_SHOULD_BYTESWAP(bp), salt, iv, mac, psize, zio->io_abd, eabd, &no_crypt)); /* encode encryption metadata into the bp */ if (ot == DMU_OT_INTENT_LOG) { /* * ZIL blocks store the MAC in the embedded checksum, so the * transform must always be applied. */ zio_crypt_encode_mac_zil(enc_buf, mac); zio_push_transform(zio, eabd, psize, psize, NULL); } else { BP_SET_CRYPT(bp, B_TRUE); zio_crypt_encode_params_bp(bp, salt, iv); zio_crypt_encode_mac_bp(bp, mac); if (no_crypt) { ASSERT3U(ot, ==, DMU_OT_DNODE); abd_free(eabd); } else { zio_push_transform(zio, eabd, psize, psize, NULL); } } return (zio); } /* * ========================================================================== * Generate and verify checksums * ========================================================================== */ static zio_t * zio_checksum_generate(zio_t *zio) { blkptr_t *bp = zio->io_bp; enum zio_checksum checksum; if (bp == NULL) { /* * This is zio_write_phys(). * We're either generating a label checksum, or none at all. */ checksum = zio->io_prop.zp_checksum; if (checksum == ZIO_CHECKSUM_OFF) return (zio); ASSERT(checksum == ZIO_CHECKSUM_LABEL); } else { if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) { ASSERT(!IO_IS_ALLOCATING(zio)); checksum = ZIO_CHECKSUM_GANG_HEADER; } else { checksum = BP_GET_CHECKSUM(bp); } } zio_checksum_compute(zio, checksum, zio->io_abd, zio->io_size); return (zio); } static zio_t * zio_checksum_verify(zio_t *zio) { zio_bad_cksum_t info; blkptr_t *bp = zio->io_bp; int error; ASSERT(zio->io_vd != NULL); if (bp == NULL) { /* * This is zio_read_phys(). * We're either verifying a label checksum, or nothing at all. */ if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF) return (zio); ASSERT3U(zio->io_prop.zp_checksum, ==, ZIO_CHECKSUM_LABEL); } if ((error = zio_checksum_error(zio, &info)) != 0) { zio->io_error = error; if (error == ECKSUM && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { - int ret = zfs_ereport_start_checksum(zio->io_spa, + (void) zfs_ereport_start_checksum(zio->io_spa, zio->io_vd, &zio->io_bookmark, zio, zio->io_offset, zio->io_size, NULL, &info); - - if (ret != EALREADY) { - mutex_enter(&zio->io_vd->vdev_stat_lock); - zio->io_vd->vdev_stat.vs_checksum_errors++; - mutex_exit(&zio->io_vd->vdev_stat_lock); - } + mutex_enter(&zio->io_vd->vdev_stat_lock); + zio->io_vd->vdev_stat.vs_checksum_errors++; + mutex_exit(&zio->io_vd->vdev_stat_lock); } } return (zio); } /* * Called by RAID-Z to ensure we don't compute the checksum twice. */ void zio_checksum_verified(zio_t *zio) { zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; } /* * ========================================================================== * Error rank. Error are ranked in the order 0, ENXIO, ECKSUM, EIO, other. * An error of 0 indicates success. ENXIO indicates whole-device failure, * which may be transient (e.g. unplugged) or permanent. ECKSUM and EIO * indicate errors that are specific to one I/O, and most likely permanent. * Any other error is presumed to be worse because we weren't expecting it. 
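 *
 * For example: zio_worst_error(ENXIO, ECKSUM) returns ECKSUM, since
 * ECKSUM ranks after ENXIO, and zio_worst_error(EIO, EFAULT) returns
 * EFAULT, because an unranked error sorts after all of the ranked ones.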
* ========================================================================== */ int zio_worst_error(int e1, int e2) { static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO }; int r1, r2; for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++) if (e1 == zio_error_rank[r1]) break; for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++) if (e2 == zio_error_rank[r2]) break; return (r1 > r2 ? e1 : e2); } /* * ========================================================================== * I/O completion * ========================================================================== */ static zio_t * zio_ready(zio_t *zio) { blkptr_t *bp = zio->io_bp; zio_t *pio, *pio_next; zio_link_t *zl = NULL; if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT | ZIO_CHILD_DDT_BIT, ZIO_WAIT_READY)) { return (NULL); } if (zio->io_ready) { ASSERT(IO_IS_ALLOCATING(zio)); ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) || (zio->io_flags & ZIO_FLAG_NOPWRITE)); ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0); zio->io_ready(zio); } if (bp != NULL && bp != &zio->io_bp_copy) zio->io_bp_copy = *bp; if (zio->io_error != 0) { zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) { ASSERT(IO_IS_ALLOCATING(zio)); ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(zio->io_metaslab_class != NULL); /* * We were unable to allocate anything, unreserve and * issue the next I/O to allocate. */ metaslab_class_throttle_unreserve( zio->io_metaslab_class, zio->io_prop.zp_copies, zio->io_allocator, zio); zio_allocate_dispatch(zio->io_spa, zio->io_allocator); } } mutex_enter(&zio->io_lock); zio->io_state[ZIO_WAIT_READY] = 1; pio = zio_walk_parents(zio, &zl); mutex_exit(&zio->io_lock); /* * As we notify zio's parents, new parents could be added. * New parents go to the head of zio's io_parent_list, however, * so we will (correctly) not notify them. The remainder of zio's * io_parent_list, from 'pio_next' onward, cannot change because * all parents must wait for us to be done before they can be done. */ for (; pio != NULL; pio = pio_next) { pio_next = zio_walk_parents(zio, &zl); zio_notify_parent(pio, zio, ZIO_WAIT_READY, NULL); } if (zio->io_flags & ZIO_FLAG_NODATA) { if (BP_IS_GANG(bp)) { zio->io_flags &= ~ZIO_FLAG_NODATA; } else { ASSERT((uintptr_t)zio->io_abd < SPA_MAXBLOCKSIZE); zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; } } if (zio_injection_enabled && zio->io_spa->spa_syncing_txg == zio->io_txg) zio_handle_ignored_writes(zio); return (zio); } /* * Update the allocation throttle accounting. */ static void zio_dva_throttle_done(zio_t *zio) { zio_t *lio __maybe_unused = zio->io_logical; zio_t *pio = zio_unique_parent(zio); vdev_t *vd = zio->io_vd; int flags = METASLAB_ASYNC_ALLOC; ASSERT3P(zio->io_bp, !=, NULL); ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); ASSERT3U(zio->io_priority, ==, ZIO_PRIORITY_ASYNC_WRITE); ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV); ASSERT(vd != NULL); ASSERT3P(vd, ==, vd->vdev_top); ASSERT(zio_injection_enabled || !(zio->io_flags & ZIO_FLAG_IO_RETRY)); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR)); ASSERT(zio->io_flags & ZIO_FLAG_IO_ALLOCATING); ASSERT(!(lio->io_flags & ZIO_FLAG_IO_REWRITE)); ASSERT(!(lio->io_orig_flags & ZIO_FLAG_NODATA)); /* * Parents of gang children can have two flavors -- ones that * allocated the gang header (will have ZIO_FLAG_IO_REWRITE set) * and ones that allocated the constituent blocks. The allocation * throttle needs to know the allocating parent zio so we must find * it here. 
*/ if (pio->io_child_type == ZIO_CHILD_GANG) { /* * If our parent is a rewrite gang child then our grandparent * would have been the one that performed the allocation. */ if (pio->io_flags & ZIO_FLAG_IO_REWRITE) pio = zio_unique_parent(pio); flags |= METASLAB_GANG_CHILD; } ASSERT(IO_IS_ALLOCATING(pio)); ASSERT3P(zio, !=, zio->io_logical); ASSERT(zio->io_logical != NULL); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR)); ASSERT0(zio->io_flags & ZIO_FLAG_NOPWRITE); ASSERT(zio->io_metaslab_class != NULL); mutex_enter(&pio->io_lock); metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, pio, flags, pio->io_allocator, B_TRUE); mutex_exit(&pio->io_lock); metaslab_class_throttle_unreserve(zio->io_metaslab_class, 1, pio->io_allocator, pio); /* * Call into the pipeline to see if there is more work that * needs to be done. If there is work to be done it will be * dispatched to another taskq thread. */ zio_allocate_dispatch(zio->io_spa, pio->io_allocator); } static zio_t * zio_done(zio_t *zio) { /* * Always attempt to keep stack usage minimal here since * we can be called recursively up to 19 levels deep. */ const uint64_t psize = zio->io_size; zio_t *pio, *pio_next; zio_link_t *zl = NULL; /* * If our children haven't all completed, * wait for them and then repeat this pipeline stage. */ if (zio_wait_for_children(zio, ZIO_CHILD_ALL_BITS, ZIO_WAIT_DONE)) { return (NULL); } /* * If the allocation throttle is enabled, then update the accounting. * We only track child I/Os that are part of an allocating async * write. We must do this since the allocation is performed * by the logical I/O but the actual write is done by child I/Os. */ if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING && zio->io_child_type == ZIO_CHILD_VDEV) { ASSERT(zio->io_metaslab_class != NULL); ASSERT(zio->io_metaslab_class->mc_alloc_throttle_enabled); zio_dva_throttle_done(zio); } /* * If the allocation throttle is enabled, verify that * we have decremented the refcounts for every I/O that was throttled. */ if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) { ASSERT(zio->io_type == ZIO_TYPE_WRITE); ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(zio->io_bp != NULL); metaslab_group_alloc_verify(zio->io_spa, zio->io_bp, zio, zio->io_allocator); VERIFY(zfs_refcount_not_held(&zio->io_metaslab_class-> mc_allocator[zio->io_allocator].mca_alloc_slots, zio)); } for (int c = 0; c < ZIO_CHILD_TYPES; c++) for (int w = 0; w < ZIO_WAIT_TYPES; w++) ASSERT(zio->io_children[c][w] == 0); if (zio->io_bp != NULL && !BP_IS_EMBEDDED(zio->io_bp)) { ASSERT(zio->io_bp->blk_pad[0] == 0); ASSERT(zio->io_bp->blk_pad[1] == 0); ASSERT(bcmp(zio->io_bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 || (zio->io_bp == zio_unique_parent(zio)->io_bp)); if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(zio->io_bp) && zio->io_bp_override == NULL && !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp)); ASSERT(BP_COUNT_GANG(zio->io_bp) == 0 || (BP_COUNT_GANG(zio->io_bp) == BP_GET_NDVAS(zio->io_bp))); } if (zio->io_flags & ZIO_FLAG_NOPWRITE) VERIFY(BP_EQUAL(zio->io_bp, &zio->io_bp_orig)); } /* * If there were child vdev/gang/ddt errors, they apply to us now. */ zio_inherit_child_errors(zio, ZIO_CHILD_VDEV); zio_inherit_child_errors(zio, ZIO_CHILD_GANG); zio_inherit_child_errors(zio, ZIO_CHILD_DDT); /* * If the I/O on the transformed data was successful, generate any * checksum reports now while we still have the transformed data. 
*/ if (zio->io_error == 0) { while (zio->io_cksum_report != NULL) { zio_cksum_report_t *zcr = zio->io_cksum_report; uint64_t align = zcr->zcr_align; uint64_t asize = P2ROUNDUP(psize, align); abd_t *adata = zio->io_abd; if (asize != psize) { adata = abd_alloc(asize, B_TRUE); abd_copy(adata, zio->io_abd, psize); abd_zero_off(adata, psize, asize - psize); } zio->io_cksum_report = zcr->zcr_next; zcr->zcr_next = NULL; zcr->zcr_finish(zcr, adata); zfs_ereport_free_checksum(zcr); if (asize != psize) abd_free(adata); } } zio_pop_transforms(zio); /* note: may set zio->io_error */ vdev_stat_update(zio, psize); /* * If this I/O is attached to a particular vdev is slow, exceeding * 30 seconds to complete, post an error described the I/O delay. * We ignore these errors if the device is currently unavailable. */ if (zio->io_delay >= MSEC2NSEC(zio_slow_io_ms)) { if (zio->io_vd != NULL && !vdev_is_dead(zio->io_vd)) { /* * We want to only increment our slow IO counters if * the IO is valid (i.e. not if the drive is removed). * * zfs_ereport_post() will also do these checks, but * it can also ratelimit and have other failures, so we * need to increment the slow_io counters independent * of it. */ if (zfs_ereport_is_valid(FM_EREPORT_ZFS_DELAY, zio->io_spa, zio->io_vd, zio)) { mutex_enter(&zio->io_vd->vdev_stat_lock); zio->io_vd->vdev_stat.vs_slow_ios++; mutex_exit(&zio->io_vd->vdev_stat_lock); (void) zfs_ereport_post(FM_EREPORT_ZFS_DELAY, zio->io_spa, zio->io_vd, &zio->io_bookmark, zio, 0); } } } if (zio->io_error) { /* * If this I/O is attached to a particular vdev, * generate an error message describing the I/O failure * at the block level. We ignore these errors if the * device is currently unavailable. */ if (zio->io_error != ECKSUM && zio->io_vd != NULL && !vdev_is_dead(zio->io_vd)) { int ret = zfs_ereport_post(FM_EREPORT_ZFS_IO, zio->io_spa, zio->io_vd, &zio->io_bookmark, zio, 0); if (ret != EALREADY) { mutex_enter(&zio->io_vd->vdev_stat_lock); if (zio->io_type == ZIO_TYPE_READ) zio->io_vd->vdev_stat.vs_read_errors++; else if (zio->io_type == ZIO_TYPE_WRITE) zio->io_vd->vdev_stat.vs_write_errors++; mutex_exit(&zio->io_vd->vdev_stat_lock); } } if ((zio->io_error == EIO || !(zio->io_flags & (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) && zio == zio->io_logical) { /* * For logical I/O requests, tell the SPA to log the * error and generate a logical data ereport. */ spa_log_error(zio->io_spa, &zio->io_bookmark); (void) zfs_ereport_post(FM_EREPORT_ZFS_DATA, zio->io_spa, NULL, &zio->io_bookmark, zio, 0); } } if (zio->io_error && zio == zio->io_logical) { /* * Determine whether zio should be reexecuted. This will * propagate all the way to the root via zio_notify_parent(). */ ASSERT(zio->io_vd == NULL && zio->io_bp != NULL); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); if (IO_IS_ALLOCATING(zio) && !(zio->io_flags & ZIO_FLAG_CANFAIL)) { if (zio->io_error != ENOSPC) zio->io_reexecute |= ZIO_REEXECUTE_NOW; else zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; } if ((zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_FREE) && !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_error == ENXIO && spa_load_state(zio->io_spa) == SPA_LOAD_NONE && spa_get_failmode(zio->io_spa) != ZIO_FAILURE_MODE_CONTINUE) zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute) zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; /* * Here is a possibly good place to attempt to do * either combinatorial reconstruction or error correction * based on checksums. 
It also might be a good place * to send out preliminary ereports before we suspend * processing. */ } /* * If there were logical child errors, they apply to us now. * We defer this until now to avoid conflating logical child * errors with errors that happened to the zio itself when * updating vdev stats and reporting FMA events above. */ zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL); if ((zio->io_error || zio->io_reexecute) && IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio && !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE))) zio_dva_unallocate(zio, zio->io_gang_tree, zio->io_bp); zio_gang_tree_free(&zio->io_gang_tree); /* * Godfather I/Os should never suspend. */ if ((zio->io_flags & ZIO_FLAG_GODFATHER) && (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) zio->io_reexecute &= ~ZIO_REEXECUTE_SUSPEND; if (zio->io_reexecute) { /* * This is a logical I/O that wants to reexecute. * * Reexecute is top-down. When an i/o fails, if it's not * the root, it simply notifies its parent and sticks around. * The parent, seeing that it still has children in zio_done(), * does the same. This percolates all the way up to the root. * The root i/o will reexecute or suspend the entire tree. * * This approach ensures that zio_reexecute() honors * all the original i/o dependency relationships, e.g. * parents not executing until children are ready. */ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); zio->io_gang_leader = NULL; mutex_enter(&zio->io_lock); zio->io_state[ZIO_WAIT_DONE] = 1; mutex_exit(&zio->io_lock); /* * "The Godfather" I/O monitors its children but is * not a true parent to them. It will track them through * the pipeline but severs its ties whenever they get into * trouble (e.g. suspended). This allows "The Godfather" * I/O to return status without blocking. */ zl = NULL; for (pio = zio_walk_parents(zio, &zl); pio != NULL; pio = pio_next) { zio_link_t *remove_zl = zl; pio_next = zio_walk_parents(zio, &zl); if ((pio->io_flags & ZIO_FLAG_GODFATHER) && (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) { zio_remove_child(pio, zio, remove_zl); /* * This is a rare code path, so we don't * bother with "next_to_execute". */ zio_notify_parent(pio, zio, ZIO_WAIT_DONE, NULL); } } if ((pio = zio_unique_parent(zio)) != NULL) { /* * We're not a root i/o, so there's nothing to do * but notify our parent. Don't propagate errors * upward since we haven't permanently failed yet. */ ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE; /* * This is a rare code path, so we don't bother with * "next_to_execute". */ zio_notify_parent(pio, zio, ZIO_WAIT_DONE, NULL); } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) { /* * We'd fail again if we reexecuted now, so suspend * until conditions improve (e.g. device comes online). */ zio_suspend(zio->io_spa, zio, ZIO_SUSPEND_IOERR); } else { /* * Reexecution is potentially a huge amount of work. * Hand it off to the otherwise-unused claim taskq. */ ASSERT(taskq_empty_ent(&zio->io_tqent)); spa_taskq_dispatch_ent(zio->io_spa, ZIO_TYPE_CLAIM, ZIO_TASKQ_ISSUE, (task_func_t *)zio_reexecute, zio, 0, &zio->io_tqent); } return (NULL); } ASSERT(zio->io_child_count == 0); ASSERT(zio->io_reexecute == 0); ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL)); /* * Report any checksum errors, since the I/O is complete. 
*/ while (zio->io_cksum_report != NULL) { zio_cksum_report_t *zcr = zio->io_cksum_report; zio->io_cksum_report = zcr->zcr_next; zcr->zcr_next = NULL; zcr->zcr_finish(zcr, NULL); zfs_ereport_free_checksum(zcr); } if (zio->io_flags & ZIO_FLAG_FASTWRITE && zio->io_bp && !BP_IS_HOLE(zio->io_bp) && !BP_IS_EMBEDDED(zio->io_bp) && !(zio->io_flags & ZIO_FLAG_NOPWRITE)) { metaslab_fastwrite_unmark(zio->io_spa, zio->io_bp); } /* * It is the responsibility of the done callback to ensure that this * particular zio is no longer discoverable for adoption, and as * such, cannot acquire any new parents. */ if (zio->io_done) zio->io_done(zio); mutex_enter(&zio->io_lock); zio->io_state[ZIO_WAIT_DONE] = 1; mutex_exit(&zio->io_lock); /* * We are done executing this zio. We may want to execute a parent * next. See the comment in zio_notify_parent(). */ zio_t *next_to_execute = NULL; zl = NULL; for (pio = zio_walk_parents(zio, &zl); pio != NULL; pio = pio_next) { zio_link_t *remove_zl = zl; pio_next = zio_walk_parents(zio, &zl); zio_remove_child(pio, zio, remove_zl); zio_notify_parent(pio, zio, ZIO_WAIT_DONE, &next_to_execute); } if (zio->io_waiter != NULL) { mutex_enter(&zio->io_lock); zio->io_executor = NULL; cv_broadcast(&zio->io_cv); mutex_exit(&zio->io_lock); } else { zio_destroy(zio); } return (next_to_execute); } /* * ========================================================================== * I/O pipeline definition * ========================================================================== */ static zio_pipe_stage_t *zio_pipeline[] = { NULL, zio_read_bp_init, zio_write_bp_init, zio_free_bp_init, zio_issue_async, zio_write_compress, zio_encrypt, zio_checksum_generate, zio_nop_write, zio_ddt_read_start, zio_ddt_read_done, zio_ddt_write, zio_ddt_free, zio_gang_assemble, zio_gang_issue, zio_dva_throttle, zio_dva_allocate, zio_dva_free, zio_dva_claim, zio_ready, zio_vdev_io_start, zio_vdev_io_done, zio_vdev_io_assess, zio_checksum_verify, zio_done }; /* * Compare two zbookmark_phys_t's to see which we would reach first in a * pre-order traversal of the object tree. * * This is simple in every case aside from the meta-dnode object. For all other * objects, we traverse them in order (object 1 before object 2, and so on). * However, all of these objects are traversed while traversing object 0, since * the data it points to is the list of objects. Thus, we need to convert to a * canonical representation so we can compare meta-dnode bookmarks to * non-meta-dnode bookmarks. * * We do this by calculating "equivalents" for each field of the zbookmark. * zbookmarks outside of the meta-dnode use their own object and level, and * calculate the level 0 equivalent (the first L0 blkid that is contained in the * blocks this bookmark refers to) by multiplying their blkid by their span * (the number of L0 blocks contained within one block at their level). * zbookmarks inside the meta-dnode calculate their object equivalent * (which is L0equiv * dnodes per data block), use 0 for their L0equiv, and use * level + 1<<31 (any value larger than a level could ever be) for their level. * This causes them to always compare before a bookmark in their object * equivalent, compare appropriately to bookmarks in other objects, and to * compare appropriately to other bookmarks in the meta-dnode. 
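 *
 * For example (hypothetical geometry): with 16K meta-dnode data blocks
 * (dbss = 32, i.e. 32 dnodes per block), the level 0 meta-dnode
 * bookmark with blkid 5 canonicalizes to object 5 * 32 = 160, L0
 * equivalent 0 and level 0 + 2^31, so it sorts before every bookmark
 * that refers to blocks inside object 160 itself.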
*/ int zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, uint8_t ibs2, const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2) { /* * These variables represent the "equivalent" values for the zbookmark, * after converting zbookmarks inside the meta dnode to their * normal-object equivalents. */ uint64_t zb1obj, zb2obj; uint64_t zb1L0, zb2L0; uint64_t zb1level, zb2level; if (zb1->zb_object == zb2->zb_object && zb1->zb_level == zb2->zb_level && zb1->zb_blkid == zb2->zb_blkid) return (0); IMPLY(zb1->zb_level > 0, ibs1 >= SPA_MINBLOCKSHIFT); IMPLY(zb2->zb_level > 0, ibs2 >= SPA_MINBLOCKSHIFT); /* * BP_SPANB calculates the span in blocks. */ zb1L0 = (zb1->zb_blkid) * BP_SPANB(ibs1, zb1->zb_level); zb2L0 = (zb2->zb_blkid) * BP_SPANB(ibs2, zb2->zb_level); if (zb1->zb_object == DMU_META_DNODE_OBJECT) { zb1obj = zb1L0 * (dbss1 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT)); zb1L0 = 0; zb1level = zb1->zb_level + COMPARE_META_LEVEL; } else { zb1obj = zb1->zb_object; zb1level = zb1->zb_level; } if (zb2->zb_object == DMU_META_DNODE_OBJECT) { zb2obj = zb2L0 * (dbss2 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT)); zb2L0 = 0; zb2level = zb2->zb_level + COMPARE_META_LEVEL; } else { zb2obj = zb2->zb_object; zb2level = zb2->zb_level; } /* Now that we have a canonical representation, do the comparison. */ if (zb1obj != zb2obj) return (zb1obj < zb2obj ? -1 : 1); else if (zb1L0 != zb2L0) return (zb1L0 < zb2L0 ? -1 : 1); else if (zb1level != zb2level) return (zb1level > zb2level ? -1 : 1); /* * This can (theoretically) happen if the bookmarks have the same object * and level, but different blkids, if the block sizes are not the same. * There is presently no way to change the indirect block sizes */ return (0); } /* * This function checks the following: given that last_block is the place that * our traversal stopped last time, does that guarantee that we've visited * every node under subtree_root? Therefore, we can't just use the raw output * of zbookmark_compare. We have to pass in a modified version of * subtree_root; by incrementing the block id, and then checking whether * last_block is before or equal to that, we can tell whether or not having * visited last_block implies that all of subtree_root's children have been * visited. */ boolean_t zbookmark_subtree_completed(const dnode_phys_t *dnp, const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block) { zbookmark_phys_t mod_zb = *subtree_root; mod_zb.zb_blkid++; ASSERT(last_block->zb_level == 0); /* The objset_phys_t isn't before anything. */ if (dnp == NULL) return (B_FALSE); /* * We pass in 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT) for the * data block size in sectors, because that variable is only used if * the bookmark refers to a block in the meta-dnode. Since we don't * know without examining it what object it refers to, and there's no * harm in passing in this value in other cases, we always pass it in. * * We pass in 0 for the indirect block size shift because zb2 must be * level 0. The indirect block size is only used to calculate the span * of the bookmark, but since the bookmark must be level 0, the span is * always 1, so the math works out. * * If you make changes to how the zbookmark_compare code works, be sure * to make sure that this code still works afterwards. 
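 *
 * For example (hypothetical span): if each level 1 indirect block spans
 * 128 level 0 blocks, a subtree_root at level 1 with blkid 3 covers L0
 * blkids 384..511.  Bumping the blkid to 4 gives an L0 equivalent of
 * 512, so any level 0 last_block at blkid 512 or beyond in the same
 * object proves the whole subtree has already been visited.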
*/ return (zbookmark_compare(dnp->dn_datablkszsec, dnp->dn_indblkshift, 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT), 0, &mod_zb, last_block) <= 0); } EXPORT_SYMBOL(zio_type_name); EXPORT_SYMBOL(zio_buf_alloc); EXPORT_SYMBOL(zio_data_buf_alloc); EXPORT_SYMBOL(zio_buf_free); EXPORT_SYMBOL(zio_data_buf_free); /* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs_zio, zio_, slow_io_ms, INT, ZMOD_RW, "Max I/O completion time (milliseconds) before marking it as slow"); ZFS_MODULE_PARAM(zfs_zio, zio_, requeue_io_start_cut_in_line, INT, ZMOD_RW, "Prioritize requeued I/O"); ZFS_MODULE_PARAM(zfs, zfs_, sync_pass_deferred_free, INT, ZMOD_RW, "Defer frees starting in this pass"); ZFS_MODULE_PARAM(zfs, zfs_, sync_pass_dont_compress, INT, ZMOD_RW, "Don't compress starting in this pass"); ZFS_MODULE_PARAM(zfs, zfs_, sync_pass_rewrite, INT, ZMOD_RW, "Rewrite new bps starting in this pass"); ZFS_MODULE_PARAM(zfs_zio, zio_, dva_throttle_enabled, INT, ZMOD_RW, "Throttle block allocations in the ZIO pipeline"); ZFS_MODULE_PARAM(zfs_zio, zio_, deadman_log_all, INT, ZMOD_RW, "Log all slow ZIOs, not just those with vdevs"); /* END CSTYLED */ diff --git a/sys/contrib/openzfs/scripts/zfs.sh b/sys/contrib/openzfs/scripts/zfs.sh index c131429c919c..2f5f3f8fdb54 100755 --- a/sys/contrib/openzfs/scripts/zfs.sh +++ b/sys/contrib/openzfs/scripts/zfs.sh @@ -1,282 +1,282 @@ #!/bin/sh # # A simple script to load/unload the ZFS module stack. # BASE_DIR=$(dirname "$0") SCRIPT_COMMON=common.sh if [ -f "${BASE_DIR}/${SCRIPT_COMMON}" ]; then . "${BASE_DIR}/${SCRIPT_COMMON}" else echo "Missing helper script ${SCRIPT_COMMON}" && exit 1 fi PROG=zfs.sh VERBOSE="no" UNLOAD="no" STACK_TRACER="no" ZED_PIDFILE=${ZED_PIDFILE:-/var/run/zed.pid} LDMOD=${LDMOD:-/sbin/modprobe} KMOD_ZLIB_DEFLATE=${KMOD_ZLIB_DEFLATE:-zlib_deflate} KMOD_ZLIB_INFLATE=${KMOD_ZLIB_INFLATE:-zlib_inflate} KMOD_SPL=${KMOD_SPL:-spl} KMOD_ZAVL=${KMOD_ZAVL:-zavl} KMOD_ZNVPAIR=${KMOD_ZNVPAIR:-znvpair} KMOD_ZUNICODE=${KMOD_ZUNICODE:-zunicode} KMOD_ZCOMMON=${KMOD_ZCOMMON:-zcommon} KMOD_ZLUA=${KMOD_ZLUA:-zlua} KMOD_ICP=${KMOD_ICP:-icp} KMOD_ZFS=${KMOD_ZFS:-zfs} KMOD_FREEBSD=${KMOD_FREEBSD:-openzfs} KMOD_ZZSTD=${KMOD_ZZSTD:-zzstd} usage() { cat << EOF USAGE: $0 [hvudS] [module-options] DESCRIPTION: Load/unload the ZFS module stack. OPTIONS: -h Show this message -v Verbose -u Unload modules -S Enable kernel stack tracer EOF } while getopts 'hvuS' OPTION; do case $OPTION in h) usage exit 1 ;; v) VERBOSE="yes" ;; u) UNLOAD="yes" ;; S) STACK_TRACER="yes" ;; ?) usage exit ;; esac done kill_zed() { if [ -f "$ZED_PIDFILE" ]; then PID=$(cat "$ZED_PIDFILE") kill "$PID" fi } check_modules_linux() { LOADED_MODULES="" MISSING_MODULES="" for KMOD in $KMOD_SPL $KMOD_ZAVL $KMOD_ZNVPAIR $KMOD_ZUNICODE $KMOD_ZCOMMON \ $KMOD_ZLUA $KMOD_ZZSTD $KMOD_ICP $KMOD_ZFS; do NAME=$(basename "$KMOD" .ko) if lsmod | grep -E -q "^${NAME}"; then LOADED_MODULES="$LOADED_MODULES\t$NAME\n" fi if ! 
modinfo "$KMOD" >/dev/null 2>&1; then MISSING_MODULES="$MISSING_MODULES\t${KMOD}\n" fi done if [ -n "$LOADED_MODULES" ]; then printf "Unload the kernel modules by running '%s -u':\n" "$PROG" printf "%b" "$LOADED_MODULES" exit 1 fi if [ -n "$MISSING_MODULES" ]; then printf "The following kernel modules can not be found:\n" printf "%b" "$MISSING_MODULES" exit 1 fi return 0 } load_module_linux() { KMOD=$1 FILE=$(modinfo "$KMOD" | awk '/^filename:/ {print $2}') VERSION=$(modinfo "$KMOD" | awk '/^version:/ {print $2}') if [ "$VERBOSE" = "yes" ]; then echo "Loading: $FILE ($VERSION)" fi $LDMOD "$KMOD" >/dev/null 2>&1 # shellcheck disable=SC2181 if [ $? -ne 0 ]; then echo "Failed to load $KMOD" return 1 fi return 0 } load_modules_freebsd() { kldload "$KMOD_FREEBSD" || return 1 if [ "$VERBOSE" = "yes" ]; then echo "Successfully loaded ZFS module stack" fi return 0 } load_modules_linux() { mkdir -p /etc/zfs if modinfo "$KMOD_ZLIB_DEFLATE" >/dev/null 2>&1; then modprobe "$KMOD_ZLIB_DEFLATE" >/dev/null 2>&1 fi if modinfo "$KMOD_ZLIB_INFLATE">/dev/null 2>&1; then modprobe "$KMOD_ZLIB_INFLATE" >/dev/null 2>&1 fi for KMOD in $KMOD_SPL $KMOD_ZAVL $KMOD_ZNVPAIR \ $KMOD_ZUNICODE $KMOD_ZCOMMON $KMOD_ZLUA $KMOD_ZZSTD \ $KMOD_ICP $KMOD_ZFS; do load_module_linux "$KMOD" || return 1 done if [ "$VERBOSE" = "yes" ]; then echo "Successfully loaded ZFS module stack" fi return 0 } unload_module_linux() { KMOD=$1 NAME=$(basename "$KMOD" .ko) FILE=$(modinfo "$KMOD" | awk '/^filename:/ {print $2}') VERSION=$(modinfo "$KMOD" | awk '/^version:/ {print $2}') if [ "$VERBOSE" = "yes" ]; then echo "Unloading: $KMOD ($VERSION)" fi rmmod "$NAME" || echo "Failed to unload $NAME" return 0 } unload_modules_freebsd() { kldunload "$KMOD_FREEBSD" || echo "Failed to unload $KMOD_FREEBSD" if [ "$VERBOSE" = "yes" ]; then echo "Successfully unloaded ZFS module stack" fi return 0 } unload_modules_linux() { for KMOD in $KMOD_ZFS $KMOD_ICP $KMOD_ZZSTD $KMOD_ZLUA $KMOD_ZCOMMON \ $KMOD_ZUNICODE $KMOD_ZNVPAIR $KMOD_ZAVL $KMOD_SPL; do NAME=$(basename "$KMOD" .ko) USE_COUNT=$(lsmod | grep -E "^${NAME} " | awk '{print $3}') if [ "$USE_COUNT" = "0" ] ; then unload_module_linux "$KMOD" || return 1 - else + elif [ "$USE_COUNT" != "" ] ; then echo "Module ${NAME} is still in use!" 
return 1 fi done if modinfo "$KMOD_ZLIB_DEFLATE" >/dev/null 2>&1; then modprobe -r "$KMOD_ZLIB_DEFLATE" >/dev/null 2>&1 fi if modinfo "$KMOD_ZLIB_INFLATE">/dev/null 2>&1; then modprobe -r "$KMOD_ZLIB_INFLATE" >/dev/null 2>&1 fi if [ "$VERBOSE" = "yes" ]; then echo "Successfully unloaded ZFS module stack" fi return 0 } stack_clear_linux() { STACK_MAX_SIZE=/sys/kernel/debug/tracing/stack_max_size STACK_TRACER_ENABLED=/proc/sys/kernel/stack_tracer_enabled if [ "$STACK_TRACER" = "yes" ] && [ -e "$STACK_MAX_SIZE" ]; then echo 1 >"$STACK_TRACER_ENABLED" echo 0 >"$STACK_MAX_SIZE" fi } stack_check_linux() { STACK_MAX_SIZE=/sys/kernel/debug/tracing/stack_max_size STACK_TRACE=/sys/kernel/debug/tracing/stack_trace STACK_LIMIT=15362 if [ -e "$STACK_MAX_SIZE" ]; then STACK_SIZE=$(cat "$STACK_MAX_SIZE") if [ "$STACK_SIZE" -ge "$STACK_LIMIT" ]; then echo echo "Warning: max stack size $STACK_SIZE bytes" cat "$STACK_TRACE" fi fi } if [ "$(id -u)" != 0 ]; then echo "Must run as root" exit 1 fi UNAME=$(uname -s) if [ "$UNLOAD" = "yes" ]; then kill_zed umount -t zfs -a case $UNAME in FreeBSD) unload_modules_freebsd ;; Linux) stack_check_linux unload_modules_linux ;; esac else case $UNAME in FreeBSD) load_modules_freebsd ;; Linux) stack_clear_linux check_modules_linux load_modules_linux "$@" udevadm trigger udevadm settle ;; esac fi exit 0 diff --git a/sys/contrib/openzfs/tests/runfiles/common.run b/sys/contrib/openzfs/tests/runfiles/common.run index 1351b76fae16..f86d0283bf37 100644 --- a/sys/contrib/openzfs/tests/runfiles/common.run +++ b/sys/contrib/openzfs/tests/runfiles/common.run @@ -1,930 +1,931 @@ # # This file and its contents are supplied under the terms of the # Common Development and Distribution License ("CDDL"), version 1.0. # You may only use this file in accordance with the terms of version # 1.0 of the CDDL. # # A full copy of the text of the CDDL should have accompanied this # source. A copy of the CDDL is also available via the Internet at # http://www.illumos.org/license/CDDL. # # This run file contains all of the common functional tests. When # adding a new test consider also adding it to the sanity.run file # if the new test runs to completion in only a few seconds. 
# # Approximate run time: 4-5 hours # [DEFAULT] pre = setup quiet = False pre_user = root user = root timeout = 600 post_user = root post = cleanup failsafe_user = root failsafe = callbacks/zfs_failsafe outputdir = /var/tmp/test_results tags = ['functional'] [tests/functional/alloc_class] tests = ['alloc_class_001_pos', 'alloc_class_002_neg', 'alloc_class_003_pos', 'alloc_class_004_pos', 'alloc_class_005_pos', 'alloc_class_006_pos', 'alloc_class_007_pos', 'alloc_class_008_pos', 'alloc_class_009_pos', 'alloc_class_010_pos', 'alloc_class_011_neg', 'alloc_class_012_pos', 'alloc_class_013_pos'] tags = ['functional', 'alloc_class'] [tests/functional/arc] tests = ['dbufstats_001_pos', 'dbufstats_002_pos', 'dbufstats_003_pos', 'arcstats_runtime_tuning'] tags = ['functional', 'arc'] [tests/functional/atime] tests = ['atime_001_pos', 'atime_002_neg', 'root_atime_off', 'root_atime_on'] tags = ['functional', 'atime'] [tests/functional/bootfs] tests = ['bootfs_001_pos', 'bootfs_002_neg', 'bootfs_003_pos', 'bootfs_004_neg', 'bootfs_005_neg', 'bootfs_006_pos', 'bootfs_007_pos', 'bootfs_008_pos'] tags = ['functional', 'bootfs'] [tests/functional/btree] tests = ['btree_positive', 'btree_negative'] tags = ['functional', 'btree'] pre = post = [tests/functional/cache] tests = ['cache_001_pos', 'cache_002_pos', 'cache_003_pos', 'cache_004_neg', 'cache_005_neg', 'cache_006_pos', 'cache_007_neg', 'cache_008_neg', 'cache_009_pos', 'cache_010_pos', 'cache_011_pos', 'cache_012_pos'] tags = ['functional', 'cache'] [tests/functional/cachefile] tests = ['cachefile_001_pos', 'cachefile_002_pos', 'cachefile_003_pos', 'cachefile_004_pos'] tags = ['functional', 'cachefile'] [tests/functional/casenorm] tests = ['case_all_values', 'norm_all_values', 'mixed_create_failure', 'sensitive_none_lookup', 'sensitive_none_delete', 'sensitive_formd_lookup', 'sensitive_formd_delete', 'insensitive_none_lookup', 'insensitive_none_delete', 'insensitive_formd_lookup', 'insensitive_formd_delete', 'mixed_none_lookup', 'mixed_none_lookup_ci', 'mixed_none_delete', 'mixed_formd_lookup', 'mixed_formd_lookup_ci', 'mixed_formd_delete'] tags = ['functional', 'casenorm'] [tests/functional/channel_program/lua_core] tests = ['tst.args_to_lua', 'tst.divide_by_zero', 'tst.exists', 'tst.integer_illegal', 'tst.integer_overflow', 'tst.language_functions_neg', 'tst.language_functions_pos', 'tst.large_prog', 'tst.libraries', 'tst.memory_limit', 'tst.nested_neg', 'tst.nested_pos', 'tst.nvlist_to_lua', 'tst.recursive_neg', 'tst.recursive_pos', 'tst.return_large', 'tst.return_nvlist_neg', 'tst.return_nvlist_pos', 'tst.return_recursive_table', 'tst.stack_gsub', 'tst.timeout'] tags = ['functional', 'channel_program', 'lua_core'] [tests/functional/channel_program/synctask_core] tests = ['tst.destroy_fs', 'tst.destroy_snap', 'tst.get_count_and_limit', 'tst.get_index_props', 'tst.get_mountpoint', 'tst.get_neg', 'tst.get_number_props', 'tst.get_string_props', 'tst.get_type', 'tst.get_userquota', 'tst.get_written', 'tst.inherit', 'tst.list_bookmarks', 'tst.list_children', 'tst.list_clones', 'tst.list_holds', 'tst.list_snapshots', 'tst.list_system_props', 'tst.list_user_props', 'tst.parse_args_neg','tst.promote_conflict', 'tst.promote_multiple', 'tst.promote_simple', 'tst.rollback_mult', 'tst.rollback_one', 'tst.set_props', 'tst.snapshot_destroy', 'tst.snapshot_neg', 'tst.snapshot_recursive', 'tst.snapshot_simple', 'tst.bookmark.create', 'tst.bookmark.copy', 'tst.terminate_by_signal' ] tags = ['functional', 'channel_program', 'synctask_core'] 
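The [DEFAULT] block above carries the settings every group below inherits: setup/cleanup scripts, root as the test user, a 600-second per-test timeout, and results under /var/tmp/test_results; individual sections override only what differs (for example, the btree section blanks pre and post). As a rough sketch of how such a runfile is consumed, and assuming an OpenZFS source tree whose zfs-tests.sh wrapper accepts the -r (runfile) and -T (tag) options as in recent releases, the zpool_events group extended by this change could be run on its own with something like:

    # illustrative invocation; the wrapper path and flags may differ by release
    ./scripts/zfs-tests.sh -v -r common.run -T zpool_events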
[tests/functional/checksum] tests = ['run_sha2_test', 'run_skein_test', 'filetest_001_pos', 'filetest_002_pos'] tags = ['functional', 'checksum'] [tests/functional/clean_mirror] tests = [ 'clean_mirror_001_pos', 'clean_mirror_002_pos', 'clean_mirror_003_pos', 'clean_mirror_004_pos'] tags = ['functional', 'clean_mirror'] [tests/functional/cli_root/zdb] tests = ['zdb_002_pos', 'zdb_003_pos', 'zdb_004_pos', 'zdb_005_pos', 'zdb_006_pos', 'zdb_args_neg', 'zdb_args_pos', 'zdb_block_size_histogram', 'zdb_checksum', 'zdb_decompress', 'zdb_display_block', 'zdb_object_range_neg', 'zdb_object_range_pos', 'zdb_objset_id', 'zdb_decompress_zstd', 'zdb_recover', 'zdb_recover_2'] pre = post = tags = ['functional', 'cli_root', 'zdb'] [tests/functional/cli_root/zfs] tests = ['zfs_001_neg', 'zfs_002_pos'] tags = ['functional', 'cli_root', 'zfs'] [tests/functional/cli_root/zfs_bookmark] tests = ['zfs_bookmark_cliargs'] tags = ['functional', 'cli_root', 'zfs_bookmark'] [tests/functional/cli_root/zfs_change-key] tests = ['zfs_change-key', 'zfs_change-key_child', 'zfs_change-key_format', 'zfs_change-key_inherit', 'zfs_change-key_load', 'zfs_change-key_location', 'zfs_change-key_pbkdf2iters', 'zfs_change-key_clones'] tags = ['functional', 'cli_root', 'zfs_change-key'] [tests/functional/cli_root/zfs_clone] tests = ['zfs_clone_001_neg', 'zfs_clone_002_pos', 'zfs_clone_003_pos', 'zfs_clone_004_pos', 'zfs_clone_005_pos', 'zfs_clone_006_pos', 'zfs_clone_007_pos', 'zfs_clone_008_neg', 'zfs_clone_009_neg', 'zfs_clone_010_pos', 'zfs_clone_encrypted', 'zfs_clone_deeply_nested'] tags = ['functional', 'cli_root', 'zfs_clone'] [tests/functional/cli_root/zfs_copies] tests = ['zfs_copies_001_pos', 'zfs_copies_002_pos', 'zfs_copies_003_pos', 'zfs_copies_004_neg', 'zfs_copies_005_neg', 'zfs_copies_006_pos'] tags = ['functional', 'cli_root', 'zfs_copies'] [tests/functional/cli_root/zfs_create] tests = ['zfs_create_001_pos', 'zfs_create_002_pos', 'zfs_create_003_pos', 'zfs_create_004_pos', 'zfs_create_005_pos', 'zfs_create_006_pos', 'zfs_create_007_pos', 'zfs_create_008_neg', 'zfs_create_009_neg', 'zfs_create_010_neg', 'zfs_create_011_pos', 'zfs_create_012_pos', 'zfs_create_013_pos', 'zfs_create_014_pos', 'zfs_create_encrypted', 'zfs_create_crypt_combos', 'zfs_create_dryrun', 'zfs_create_nomount', 'zfs_create_verbose'] tags = ['functional', 'cli_root', 'zfs_create'] [tests/functional/cli_root/zfs_destroy] tests = ['zfs_clone_livelist_condense_and_disable', 'zfs_clone_livelist_condense_races', 'zfs_destroy_001_pos', 'zfs_destroy_002_pos', 'zfs_destroy_003_pos', 'zfs_destroy_004_pos', 'zfs_destroy_005_neg', 'zfs_destroy_006_neg', 'zfs_destroy_007_neg', 'zfs_destroy_008_pos', 'zfs_destroy_009_pos', 'zfs_destroy_010_pos', 'zfs_destroy_011_pos', 'zfs_destroy_012_pos', 'zfs_destroy_013_neg', 'zfs_destroy_014_pos', 'zfs_destroy_015_pos', 'zfs_destroy_016_pos', 'zfs_destroy_clone_livelist', 'zfs_destroy_dev_removal', 'zfs_destroy_dev_removal_condense'] tags = ['functional', 'cli_root', 'zfs_destroy'] [tests/functional/cli_root/zfs_diff] tests = ['zfs_diff_changes', 'zfs_diff_cliargs', 'zfs_diff_timestamp', 'zfs_diff_types', 'zfs_diff_encrypted'] tags = ['functional', 'cli_root', 'zfs_diff'] [tests/functional/cli_root/zfs_get] tests = ['zfs_get_001_pos', 'zfs_get_002_pos', 'zfs_get_003_pos', 'zfs_get_004_pos', 'zfs_get_005_neg', 'zfs_get_006_neg', 'zfs_get_007_neg', 'zfs_get_008_pos', 'zfs_get_009_pos', 'zfs_get_010_neg'] tags = ['functional', 'cli_root', 'zfs_get'] [tests/functional/cli_root/zfs_ids_to_path] tests = 
['zfs_ids_to_path_001_pos'] tags = ['functional', 'cli_root', 'zfs_ids_to_path'] [tests/functional/cli_root/zfs_inherit] tests = ['zfs_inherit_001_neg', 'zfs_inherit_002_neg', 'zfs_inherit_003_pos', 'zfs_inherit_mountpoint'] tags = ['functional', 'cli_root', 'zfs_inherit'] [tests/functional/cli_root/zfs_load-key] tests = ['zfs_load-key', 'zfs_load-key_all', 'zfs_load-key_file', 'zfs_load-key_location', 'zfs_load-key_noop', 'zfs_load-key_recursive'] tags = ['functional', 'cli_root', 'zfs_load-key'] [tests/functional/cli_root/zfs_mount] tests = ['zfs_mount_001_pos', 'zfs_mount_002_pos', 'zfs_mount_003_pos', 'zfs_mount_004_pos', 'zfs_mount_005_pos', 'zfs_mount_007_pos', 'zfs_mount_009_neg', 'zfs_mount_010_neg', 'zfs_mount_011_neg', 'zfs_mount_012_pos', 'zfs_mount_all_001_pos', 'zfs_mount_encrypted', 'zfs_mount_remount', 'zfs_mount_all_fail', 'zfs_mount_all_mountpoints', 'zfs_mount_test_race'] tags = ['functional', 'cli_root', 'zfs_mount'] [tests/functional/cli_root/zfs_program] tests = ['zfs_program_json'] tags = ['functional', 'cli_root', 'zfs_program'] [tests/functional/cli_root/zfs_promote] tests = ['zfs_promote_001_pos', 'zfs_promote_002_pos', 'zfs_promote_003_pos', 'zfs_promote_004_pos', 'zfs_promote_005_pos', 'zfs_promote_006_neg', 'zfs_promote_007_neg', 'zfs_promote_008_pos', 'zfs_promote_encryptionroot'] tags = ['functional', 'cli_root', 'zfs_promote'] [tests/functional/cli_root/zfs_property] tests = ['zfs_written_property_001_pos'] tags = ['functional', 'cli_root', 'zfs_property'] [tests/functional/cli_root/zfs_receive] tests = ['zfs_receive_001_pos', 'zfs_receive_002_pos', 'zfs_receive_003_pos', 'zfs_receive_004_neg', 'zfs_receive_005_neg', 'zfs_receive_006_pos', 'zfs_receive_007_neg', 'zfs_receive_008_pos', 'zfs_receive_009_neg', 'zfs_receive_010_pos', 'zfs_receive_011_pos', 'zfs_receive_012_pos', 'zfs_receive_013_pos', 'zfs_receive_014_pos', 'zfs_receive_015_pos', 'zfs_receive_016_pos', 'receive-o-x_props_override', 'zfs_receive_from_encrypted', 'zfs_receive_to_encrypted', 'zfs_receive_raw', 'zfs_receive_raw_incremental', 'zfs_receive_-e', 'zfs_receive_raw_-d', 'zfs_receive_from_zstd', 'zfs_receive_new_props'] tags = ['functional', 'cli_root', 'zfs_receive'] [tests/functional/cli_root/zfs_rename] tests = ['zfs_rename_001_pos', 'zfs_rename_002_pos', 'zfs_rename_003_pos', 'zfs_rename_004_neg', 'zfs_rename_005_neg', 'zfs_rename_006_pos', 'zfs_rename_007_pos', 'zfs_rename_008_pos', 'zfs_rename_009_neg', 'zfs_rename_010_neg', 'zfs_rename_011_pos', 'zfs_rename_012_neg', 'zfs_rename_013_pos', 'zfs_rename_014_neg', 'zfs_rename_encrypted_child', 'zfs_rename_to_encrypted', 'zfs_rename_mountpoint', 'zfs_rename_nounmount'] tags = ['functional', 'cli_root', 'zfs_rename'] [tests/functional/cli_root/zfs_reservation] tests = ['zfs_reservation_001_pos', 'zfs_reservation_002_pos'] tags = ['functional', 'cli_root', 'zfs_reservation'] [tests/functional/cli_root/zfs_rollback] tests = ['zfs_rollback_001_pos', 'zfs_rollback_002_pos', 'zfs_rollback_003_neg', 'zfs_rollback_004_neg'] tags = ['functional', 'cli_root', 'zfs_rollback'] [tests/functional/cli_root/zfs_send] tests = ['zfs_send_001_pos', 'zfs_send_002_pos', 'zfs_send_003_pos', 'zfs_send_004_neg', 'zfs_send_005_pos', 'zfs_send_006_pos', 'zfs_send_007_pos', 'zfs_send_encrypted', 'zfs_send_raw', 'zfs_send_sparse', 'zfs_send-b'] tags = ['functional', 'cli_root', 'zfs_send'] [tests/functional/cli_root/zfs_set] tests = ['cache_001_pos', 'cache_002_neg', 'canmount_001_pos', 'canmount_002_pos', 'canmount_003_pos', 'canmount_004_pos', 'checksum_001_pos', 
'compression_001_pos', 'mountpoint_001_pos', 'mountpoint_002_pos', 'reservation_001_neg', 'user_property_002_pos', 'share_mount_001_neg', 'snapdir_001_pos', 'onoffs_001_pos', 'user_property_001_pos', 'user_property_003_neg', 'readonly_001_pos', 'user_property_004_pos', 'version_001_neg', 'zfs_set_001_neg', 'zfs_set_002_neg', 'zfs_set_003_neg', 'property_alias_001_pos', 'mountpoint_003_pos', 'ro_props_001_pos', 'zfs_set_keylocation', 'zfs_set_feature_activation'] tags = ['functional', 'cli_root', 'zfs_set'] [tests/functional/cli_root/zfs_share] tests = ['zfs_share_001_pos', 'zfs_share_002_pos', 'zfs_share_003_pos', 'zfs_share_004_pos', 'zfs_share_006_pos', 'zfs_share_008_neg', 'zfs_share_010_neg', 'zfs_share_011_pos', 'zfs_share_concurrent_shares'] tags = ['functional', 'cli_root', 'zfs_share'] [tests/functional/cli_root/zfs_snapshot] tests = ['zfs_snapshot_001_neg', 'zfs_snapshot_002_neg', 'zfs_snapshot_003_neg', 'zfs_snapshot_004_neg', 'zfs_snapshot_005_neg', 'zfs_snapshot_006_pos', 'zfs_snapshot_007_neg', 'zfs_snapshot_008_neg', 'zfs_snapshot_009_pos'] tags = ['functional', 'cli_root', 'zfs_snapshot'] [tests/functional/cli_root/zfs_unload-key] tests = ['zfs_unload-key', 'zfs_unload-key_all', 'zfs_unload-key_recursive'] tags = ['functional', 'cli_root', 'zfs_unload-key'] [tests/functional/cli_root/zfs_unmount] tests = ['zfs_unmount_001_pos', 'zfs_unmount_002_pos', 'zfs_unmount_003_pos', 'zfs_unmount_004_pos', 'zfs_unmount_005_pos', 'zfs_unmount_006_pos', 'zfs_unmount_007_neg', 'zfs_unmount_008_neg', 'zfs_unmount_009_pos', 'zfs_unmount_all_001_pos', 'zfs_unmount_nested', 'zfs_unmount_unload_keys'] tags = ['functional', 'cli_root', 'zfs_unmount'] [tests/functional/cli_root/zfs_unshare] tests = ['zfs_unshare_001_pos', 'zfs_unshare_002_pos', 'zfs_unshare_003_pos', 'zfs_unshare_004_neg', 'zfs_unshare_005_neg', 'zfs_unshare_006_pos', 'zfs_unshare_007_pos'] tags = ['functional', 'cli_root', 'zfs_unshare'] [tests/functional/cli_root/zfs_upgrade] tests = ['zfs_upgrade_001_pos', 'zfs_upgrade_002_pos', 'zfs_upgrade_003_pos', 'zfs_upgrade_004_pos', 'zfs_upgrade_005_pos', 'zfs_upgrade_006_neg', 'zfs_upgrade_007_neg'] tags = ['functional', 'cli_root', 'zfs_upgrade'] [tests/functional/cli_root/zfs_wait] tests = ['zfs_wait_deleteq'] tags = ['functional', 'cli_root', 'zfs_wait'] [tests/functional/cli_root/zpool] tests = ['zpool_001_neg', 'zpool_002_pos', 'zpool_003_pos', 'zpool_colors'] tags = ['functional', 'cli_root', 'zpool'] [tests/functional/cli_root/zpool_add] tests = ['zpool_add_001_pos', 'zpool_add_002_pos', 'zpool_add_003_pos', 'zpool_add_004_pos', 'zpool_add_006_pos', 'zpool_add_007_neg', 'zpool_add_008_neg', 'zpool_add_009_neg', 'zpool_add_010_pos', 'add-o_ashift', 'add_prop_ashift', 'zpool_add_dryrun_output'] tags = ['functional', 'cli_root', 'zpool_add'] [tests/functional/cli_root/zpool_attach] tests = ['zpool_attach_001_neg', 'attach-o_ashift'] tags = ['functional', 'cli_root', 'zpool_attach'] [tests/functional/cli_root/zpool_clear] tests = ['zpool_clear_001_pos', 'zpool_clear_002_neg', 'zpool_clear_003_neg', 'zpool_clear_readonly'] tags = ['functional', 'cli_root', 'zpool_clear'] [tests/functional/cli_root/zpool_create] tests = ['zpool_create_001_pos', 'zpool_create_002_pos', 'zpool_create_003_pos', 'zpool_create_004_pos', 'zpool_create_005_pos', 'zpool_create_006_pos', 'zpool_create_007_neg', 'zpool_create_008_pos', 'zpool_create_009_neg', 'zpool_create_010_neg', 'zpool_create_011_neg', 'zpool_create_012_neg', 'zpool_create_014_neg', 'zpool_create_015_neg', 'zpool_create_017_neg', 
'zpool_create_018_pos', 'zpool_create_019_pos', 'zpool_create_020_pos', 'zpool_create_021_pos', 'zpool_create_022_pos', 'zpool_create_023_neg', 'zpool_create_024_pos', 'zpool_create_encrypted', 'zpool_create_crypt_combos', 'zpool_create_draid_001_pos', 'zpool_create_draid_002_pos', 'zpool_create_draid_003_pos', 'zpool_create_draid_004_pos', 'zpool_create_features_001_pos', 'zpool_create_features_002_pos', 'zpool_create_features_003_pos', 'zpool_create_features_004_neg', 'zpool_create_features_005_pos', 'zpool_create_features_006_pos', 'zpool_create_features_007_pos', 'zpool_create_features_008_pos', 'create-o_ashift', 'zpool_create_tempname', 'zpool_create_dryrun_output'] tags = ['functional', 'cli_root', 'zpool_create'] [tests/functional/cli_root/zpool_destroy] tests = ['zpool_destroy_001_pos', 'zpool_destroy_002_pos', 'zpool_destroy_003_neg'] pre = post = tags = ['functional', 'cli_root', 'zpool_destroy'] [tests/functional/cli_root/zpool_detach] tests = ['zpool_detach_001_neg'] tags = ['functional', 'cli_root', 'zpool_detach'] [tests/functional/cli_root/zpool_events] tests = ['zpool_events_clear', 'zpool_events_cliargs', 'zpool_events_follow', - 'zpool_events_poolname', 'zpool_events_errors', 'zpool_events_duplicates'] + 'zpool_events_poolname', 'zpool_events_errors', 'zpool_events_duplicates', + 'zpool_events_clear_retained'] tags = ['functional', 'cli_root', 'zpool_events'] [tests/functional/cli_root/zpool_export] tests = ['zpool_export_001_pos', 'zpool_export_002_pos', 'zpool_export_003_neg', 'zpool_export_004_pos'] tags = ['functional', 'cli_root', 'zpool_export'] [tests/functional/cli_root/zpool_get] tests = ['zpool_get_001_pos', 'zpool_get_002_pos', 'zpool_get_003_pos', 'zpool_get_004_neg', 'zpool_get_005_pos'] tags = ['functional', 'cli_root', 'zpool_get'] [tests/functional/cli_root/zpool_history] tests = ['zpool_history_001_neg', 'zpool_history_002_pos'] tags = ['functional', 'cli_root', 'zpool_history'] [tests/functional/cli_root/zpool_import] tests = ['zpool_import_001_pos', 'zpool_import_002_pos', 'zpool_import_003_pos', 'zpool_import_004_pos', 'zpool_import_005_pos', 'zpool_import_006_pos', 'zpool_import_007_pos', 'zpool_import_008_pos', 'zpool_import_009_neg', 'zpool_import_010_pos', 'zpool_import_011_neg', 'zpool_import_012_pos', 'zpool_import_013_neg', 'zpool_import_014_pos', 'zpool_import_015_pos', 'zpool_import_016_pos', 'zpool_import_017_pos', 'zpool_import_features_001_pos', 'zpool_import_features_002_neg', 'zpool_import_features_003_pos', 'zpool_import_missing_001_pos', 'zpool_import_missing_002_pos', 'zpool_import_missing_003_pos', 'zpool_import_rename_001_pos', 'zpool_import_all_001_pos', 'zpool_import_encrypted', 'zpool_import_encrypted_load', 'zpool_import_errata3', 'zpool_import_errata4', 'import_cachefile_device_added', 'import_cachefile_device_removed', 'import_cachefile_device_replaced', 'import_cachefile_mirror_attached', 'import_cachefile_mirror_detached', 'import_cachefile_shared_device', 'import_devices_missing', 'import_paths_changed', 'import_rewind_config_changed', 'import_rewind_device_replaced'] tags = ['functional', 'cli_root', 'zpool_import'] timeout = 1200 [tests/functional/cli_root/zpool_labelclear] tests = ['zpool_labelclear_active', 'zpool_labelclear_exported', 'zpool_labelclear_removed', 'zpool_labelclear_valid'] pre = post = tags = ['functional', 'cli_root', 'zpool_labelclear'] [tests/functional/cli_root/zpool_initialize] tests = ['zpool_initialize_attach_detach_add_remove', 'zpool_initialize_fault_export_import_online', 
'zpool_initialize_import_export', 'zpool_initialize_offline_export_import_online', 'zpool_initialize_online_offline', 'zpool_initialize_split', 'zpool_initialize_start_and_cancel_neg', 'zpool_initialize_start_and_cancel_pos', 'zpool_initialize_suspend_resume', 'zpool_initialize_unsupported_vdevs', 'zpool_initialize_verify_checksums', 'zpool_initialize_verify_initialized'] pre = tags = ['functional', 'cli_root', 'zpool_initialize'] [tests/functional/cli_root/zpool_offline] tests = ['zpool_offline_001_pos', 'zpool_offline_002_neg', 'zpool_offline_003_pos'] tags = ['functional', 'cli_root', 'zpool_offline'] [tests/functional/cli_root/zpool_online] tests = ['zpool_online_001_pos', 'zpool_online_002_neg'] tags = ['functional', 'cli_root', 'zpool_online'] [tests/functional/cli_root/zpool_remove] tests = ['zpool_remove_001_neg', 'zpool_remove_002_pos', 'zpool_remove_003_pos'] tags = ['functional', 'cli_root', 'zpool_remove'] [tests/functional/cli_root/zpool_replace] tests = ['zpool_replace_001_neg', 'replace-o_ashift', 'replace_prop_ashift'] tags = ['functional', 'cli_root', 'zpool_replace'] [tests/functional/cli_root/zpool_resilver] tests = ['zpool_resilver_bad_args', 'zpool_resilver_restart'] tags = ['functional', 'cli_root', 'zpool_resilver'] [tests/functional/cli_root/zpool_scrub] tests = ['zpool_scrub_001_neg', 'zpool_scrub_002_pos', 'zpool_scrub_003_pos', 'zpool_scrub_004_pos', 'zpool_scrub_005_pos', 'zpool_scrub_encrypted_unloaded', 'zpool_scrub_print_repairing', 'zpool_scrub_offline_device', 'zpool_scrub_multiple_copies'] tags = ['functional', 'cli_root', 'zpool_scrub'] [tests/functional/cli_root/zpool_set] tests = ['zpool_set_001_pos', 'zpool_set_002_neg', 'zpool_set_003_neg', 'zpool_set_ashift', 'zpool_set_features'] tags = ['functional', 'cli_root', 'zpool_set'] [tests/functional/cli_root/zpool_split] tests = ['zpool_split_cliargs', 'zpool_split_devices', 'zpool_split_encryption', 'zpool_split_props', 'zpool_split_vdevs', 'zpool_split_resilver', 'zpool_split_indirect', 'zpool_split_dryrun_output'] tags = ['functional', 'cli_root', 'zpool_split'] [tests/functional/cli_root/zpool_status] tests = ['zpool_status_001_pos', 'zpool_status_002_pos', 'zpool_status_features_001_pos'] tags = ['functional', 'cli_root', 'zpool_status'] [tests/functional/cli_root/zpool_sync] tests = ['zpool_sync_001_pos', 'zpool_sync_002_neg'] tags = ['functional', 'cli_root', 'zpool_sync'] [tests/functional/cli_root/zpool_trim] tests = ['zpool_trim_attach_detach_add_remove', 'zpool_trim_fault_export_import_online', 'zpool_trim_import_export', 'zpool_trim_multiple', 'zpool_trim_neg', 'zpool_trim_offline_export_import_online', 'zpool_trim_online_offline', 'zpool_trim_partial', 'zpool_trim_rate', 'zpool_trim_rate_neg', 'zpool_trim_secure', 'zpool_trim_split', 'zpool_trim_start_and_cancel_neg', 'zpool_trim_start_and_cancel_pos', 'zpool_trim_suspend_resume', 'zpool_trim_unsupported_vdevs', 'zpool_trim_verify_checksums', 'zpool_trim_verify_trimmed'] tags = ['functional', 'zpool_trim'] [tests/functional/cli_root/zpool_upgrade] tests = ['zpool_upgrade_001_pos', 'zpool_upgrade_002_pos', 'zpool_upgrade_003_pos', 'zpool_upgrade_004_pos', 'zpool_upgrade_005_neg', 'zpool_upgrade_006_neg', 'zpool_upgrade_007_pos', 'zpool_upgrade_008_pos', 'zpool_upgrade_009_neg', 'zpool_upgrade_features_001_pos'] tags = ['functional', 'cli_root', 'zpool_upgrade'] [tests/functional/cli_root/zpool_wait] tests = ['zpool_wait_discard', 'zpool_wait_freeing', 'zpool_wait_initialize_basic', 'zpool_wait_initialize_cancel', 'zpool_wait_initialize_flag', 
'zpool_wait_multiple', 'zpool_wait_no_activity', 'zpool_wait_remove', 'zpool_wait_remove_cancel', 'zpool_wait_trim_basic', 'zpool_wait_trim_cancel', 'zpool_wait_trim_flag', 'zpool_wait_usage'] tags = ['functional', 'cli_root', 'zpool_wait'] [tests/functional/cli_root/zpool_wait/scan] tests = ['zpool_wait_replace_cancel', 'zpool_wait_rebuild', 'zpool_wait_resilver', 'zpool_wait_scrub_cancel', 'zpool_wait_replace', 'zpool_wait_scrub_basic', 'zpool_wait_scrub_flag'] tags = ['functional', 'cli_root', 'zpool_wait'] [tests/functional/cli_user/misc] tests = ['zdb_001_neg', 'zfs_001_neg', 'zfs_allow_001_neg', 'zfs_clone_001_neg', 'zfs_create_001_neg', 'zfs_destroy_001_neg', 'zfs_get_001_neg', 'zfs_inherit_001_neg', 'zfs_mount_001_neg', 'zfs_promote_001_neg', 'zfs_receive_001_neg', 'zfs_rename_001_neg', 'zfs_rollback_001_neg', 'zfs_send_001_neg', 'zfs_set_001_neg', 'zfs_share_001_neg', 'zfs_snapshot_001_neg', 'zfs_unallow_001_neg', 'zfs_unmount_001_neg', 'zfs_unshare_001_neg', 'zfs_upgrade_001_neg', 'zpool_001_neg', 'zpool_add_001_neg', 'zpool_attach_001_neg', 'zpool_clear_001_neg', 'zpool_create_001_neg', 'zpool_destroy_001_neg', 'zpool_detach_001_neg', 'zpool_export_001_neg', 'zpool_get_001_neg', 'zpool_history_001_neg', 'zpool_import_001_neg', 'zpool_import_002_neg', 'zpool_offline_001_neg', 'zpool_online_001_neg', 'zpool_remove_001_neg', 'zpool_replace_001_neg', 'zpool_scrub_001_neg', 'zpool_set_001_neg', 'zpool_status_001_neg', 'zpool_upgrade_001_neg', 'arcstat_001_pos', 'arc_summary_001_pos', 'arc_summary_002_neg', 'zpool_wait_privilege'] user = tags = ['functional', 'cli_user', 'misc'] [tests/functional/cli_user/zfs_list] tests = ['zfs_list_001_pos', 'zfs_list_002_pos', 'zfs_list_003_pos', 'zfs_list_004_neg', 'zfs_list_007_pos', 'zfs_list_008_neg'] user = tags = ['functional', 'cli_user', 'zfs_list'] [tests/functional/cli_user/zpool_iostat] tests = ['zpool_iostat_001_neg', 'zpool_iostat_002_pos', 'zpool_iostat_003_neg', 'zpool_iostat_004_pos', 'zpool_iostat_005_pos', 'zpool_iostat_-c_disable', 'zpool_iostat_-c_homedir', 'zpool_iostat_-c_searchpath'] user = tags = ['functional', 'cli_user', 'zpool_iostat'] [tests/functional/cli_user/zpool_list] tests = ['zpool_list_001_pos', 'zpool_list_002_neg'] user = tags = ['functional', 'cli_user', 'zpool_list'] [tests/functional/cli_user/zpool_status] tests = ['zpool_status_003_pos', 'zpool_status_-c_disable', 'zpool_status_-c_homedir', 'zpool_status_-c_searchpath'] user = tags = ['functional', 'cli_user', 'zpool_status'] [tests/functional/compression] tests = ['compress_001_pos', 'compress_002_pos', 'compress_003_pos', 'l2arc_compressed_arc', 'l2arc_compressed_arc_disabled', 'l2arc_encrypted', 'l2arc_encrypted_no_compressed_arc'] tags = ['functional', 'compression'] [tests/functional/cp_files] tests = ['cp_files_001_pos'] tags = ['functional', 'cp_files'] [tests/functional/ctime] tests = ['ctime_001_pos' ] tags = ['functional', 'ctime'] [tests/functional/delegate] tests = ['zfs_allow_001_pos', 'zfs_allow_002_pos', 'zfs_allow_003_pos', 'zfs_allow_004_pos', 'zfs_allow_005_pos', 'zfs_allow_006_pos', 'zfs_allow_007_pos', 'zfs_allow_008_pos', 'zfs_allow_009_neg', 'zfs_allow_010_pos', 'zfs_allow_011_neg', 'zfs_allow_012_neg', 'zfs_unallow_001_pos', 'zfs_unallow_002_pos', 'zfs_unallow_003_pos', 'zfs_unallow_004_pos', 'zfs_unallow_005_pos', 'zfs_unallow_006_pos', 'zfs_unallow_007_neg', 'zfs_unallow_008_neg'] tags = ['functional', 'delegate'] [tests/functional/exec] tests = ['exec_001_pos', 'exec_002_neg'] tags = ['functional', 'exec'] 
[tests/functional/features/async_destroy] tests = ['async_destroy_001_pos'] tags = ['functional', 'features', 'async_destroy'] [tests/functional/features/large_dnode] tests = ['large_dnode_001_pos', 'large_dnode_003_pos', 'large_dnode_004_neg', 'large_dnode_005_pos', 'large_dnode_007_neg', 'large_dnode_009_pos'] tags = ['functional', 'features', 'large_dnode'] [tests/functional/grow] pre = post = tests = ['grow_pool_001_pos', 'grow_replicas_001_pos'] tags = ['functional', 'grow'] [tests/functional/history] tests = ['history_001_pos', 'history_002_pos', 'history_003_pos', 'history_004_pos', 'history_005_neg', 'history_006_neg', 'history_007_pos', 'history_008_pos', 'history_009_pos', 'history_010_pos'] tags = ['functional', 'history'] [tests/functional/hkdf] tests = ['run_hkdf_test'] tags = ['functional', 'hkdf'] [tests/functional/inheritance] tests = ['inherit_001_pos'] pre = tags = ['functional', 'inheritance'] [tests/functional/io] tests = ['sync', 'psync', 'posixaio', 'mmap'] tags = ['functional', 'io'] [tests/functional/inuse] tests = ['inuse_004_pos', 'inuse_005_pos', 'inuse_008_pos', 'inuse_009_pos'] post = tags = ['functional', 'inuse'] [tests/functional/large_files] tests = ['large_files_001_pos', 'large_files_002_pos'] tags = ['functional', 'large_files'] [tests/functional/largest_pool] tests = ['largest_pool_001_pos'] pre = post = tags = ['functional', 'largest_pool'] [tests/functional/limits] tests = ['filesystem_count', 'filesystem_limit', 'snapshot_count', 'snapshot_limit'] tags = ['functional', 'limits'] [tests/functional/link_count] tests = ['link_count_001', 'link_count_root_inode'] tags = ['functional', 'link_count'] [tests/functional/migration] tests = ['migration_001_pos', 'migration_002_pos', 'migration_003_pos', 'migration_004_pos', 'migration_005_pos', 'migration_006_pos', 'migration_007_pos', 'migration_008_pos', 'migration_009_pos', 'migration_010_pos', 'migration_011_pos', 'migration_012_pos'] tags = ['functional', 'migration'] [tests/functional/mmap] tests = ['mmap_write_001_pos', 'mmap_read_001_pos'] tags = ['functional', 'mmap'] [tests/functional/mount] tests = ['umount_001', 'umountall_001'] tags = ['functional', 'mount'] [tests/functional/mv_files] tests = ['mv_files_001_pos', 'mv_files_002_pos', 'random_creation'] tags = ['functional', 'mv_files'] [tests/functional/nestedfs] tests = ['nestedfs_001_pos'] tags = ['functional', 'nestedfs'] [tests/functional/no_space] tests = ['enospc_001_pos', 'enospc_002_pos', 'enospc_003_pos', 'enospc_df'] tags = ['functional', 'no_space'] [tests/functional/nopwrite] tests = ['nopwrite_copies', 'nopwrite_mtime', 'nopwrite_negative', 'nopwrite_promoted_clone', 'nopwrite_recsize', 'nopwrite_sync', 'nopwrite_varying_compression', 'nopwrite_volume'] tags = ['functional', 'nopwrite'] [tests/functional/online_offline] tests = ['online_offline_001_pos', 'online_offline_002_neg', 'online_offline_003_neg'] tags = ['functional', 'online_offline'] [tests/functional/pool_checkpoint] tests = ['checkpoint_after_rewind', 'checkpoint_big_rewind', 'checkpoint_capacity', 'checkpoint_conf_change', 'checkpoint_discard', 'checkpoint_discard_busy', 'checkpoint_discard_many', 'checkpoint_indirect', 'checkpoint_invalid', 'checkpoint_lun_expsz', 'checkpoint_open', 'checkpoint_removal', 'checkpoint_rewind', 'checkpoint_ro_rewind', 'checkpoint_sm_scale', 'checkpoint_twice', 'checkpoint_vdev_add', 'checkpoint_zdb', 'checkpoint_zhack_feat'] tags = ['functional', 'pool_checkpoint'] timeout = 1800 [tests/functional/pool_names] tests = ['pool_names_001_pos', 
'pool_names_002_neg'] pre = post = tags = ['functional', 'pool_names'] [tests/functional/poolversion] tests = ['poolversion_001_pos', 'poolversion_002_pos'] tags = ['functional', 'poolversion'] [tests/functional/pyzfs] tests = ['pyzfs_unittest'] pre = post = tags = ['functional', 'pyzfs'] [tests/functional/quota] tests = ['quota_001_pos', 'quota_002_pos', 'quota_003_pos', 'quota_004_pos', 'quota_005_pos', 'quota_006_neg'] tags = ['functional', 'quota'] [tests/functional/redacted_send] tests = ['redacted_compressed', 'redacted_contents', 'redacted_deleted', 'redacted_disabled_feature', 'redacted_embedded', 'redacted_holes', 'redacted_incrementals', 'redacted_largeblocks', 'redacted_many_clones', 'redacted_mixed_recsize', 'redacted_mounts', 'redacted_negative', 'redacted_origin', 'redacted_props', 'redacted_resume', 'redacted_size', 'redacted_volume'] tags = ['functional', 'redacted_send'] [tests/functional/raidz] tests = ['raidz_001_neg', 'raidz_002_pos', 'raidz_003_pos', 'raidz_004_pos'] tags = ['functional', 'raidz'] [tests/functional/redundancy] tests = ['redundancy_draid1', 'redundancy_draid2', 'redundancy_draid3', 'redundancy_draid_spare1', 'redundancy_draid_spare2', 'redundancy_draid_spare3', 'redundancy_mirror', 'redundancy_raidz', 'redundancy_raidz1', 'redundancy_raidz2', 'redundancy_raidz3', 'redundancy_stripe'] tags = ['functional', 'redundancy'] [tests/functional/refquota] tests = ['refquota_001_pos', 'refquota_002_pos', 'refquota_003_pos', 'refquota_004_pos', 'refquota_005_pos', 'refquota_006_neg', 'refquota_007_neg', 'refquota_008_neg'] tags = ['functional', 'refquota'] [tests/functional/refreserv] tests = ['refreserv_001_pos', 'refreserv_002_pos', 'refreserv_003_pos', 'refreserv_004_pos', 'refreserv_005_pos', 'refreserv_multi_raidz', 'refreserv_raidz'] tags = ['functional', 'refreserv'] [tests/functional/removal] pre = tests = ['removal_all_vdev', 'removal_cancel', 'removal_check_space', 'removal_condense_export', 'removal_multiple_indirection', 'removal_nopwrite', 'removal_remap_deadlists', 'removal_resume_export', 'removal_sanity', 'removal_with_add', 'removal_with_create_fs', 'removal_with_dedup', 'removal_with_errors', 'removal_with_export', 'removal_with_ganging', 'removal_with_faulted', 'removal_with_remove', 'removal_with_scrub', 'removal_with_send', 'removal_with_send_recv', 'removal_with_snapshot', 'removal_with_write', 'removal_with_zdb', 'remove_expanded', 'remove_mirror', 'remove_mirror_sanity', 'remove_raidz', 'remove_indirect', 'remove_attach_mirror'] tags = ['functional', 'removal'] [tests/functional/rename_dirs] tests = ['rename_dirs_001_pos'] tags = ['functional', 'rename_dirs'] [tests/functional/replacement] tests = ['attach_import', 'attach_multiple', 'attach_rebuild', 'attach_resilver', 'detach', 'rebuild_disabled_feature', 'rebuild_multiple', 'rebuild_raidz', 'replace_import', 'replace_rebuild', 'replace_resilver', 'resilver_restart_001', 'resilver_restart_002', 'scrub_cancel'] tags = ['functional', 'replacement'] [tests/functional/reservation] tests = ['reservation_001_pos', 'reservation_002_pos', 'reservation_003_pos', 'reservation_004_pos', 'reservation_005_pos', 'reservation_006_pos', 'reservation_007_pos', 'reservation_008_pos', 'reservation_009_pos', 'reservation_010_pos', 'reservation_011_pos', 'reservation_012_pos', 'reservation_013_pos', 'reservation_014_pos', 'reservation_015_pos', 'reservation_016_pos', 'reservation_017_pos', 'reservation_018_pos', 'reservation_019_pos', 'reservation_020_pos', 'reservation_021_neg', 'reservation_022_pos'] tags = 
['functional', 'reservation'] [tests/functional/rootpool] tests = ['rootpool_002_neg', 'rootpool_003_neg', 'rootpool_007_pos'] tags = ['functional', 'rootpool'] [tests/functional/rsend] tests = ['recv_dedup', 'recv_dedup_encrypted_zvol', 'rsend_001_pos', 'rsend_002_pos', 'rsend_003_pos', 'rsend_004_pos', 'rsend_005_pos', 'rsend_006_pos', 'rsend_007_pos', 'rsend_008_pos', 'rsend_009_pos', 'rsend_010_pos', 'rsend_011_pos', 'rsend_012_pos', 'rsend_013_pos', 'rsend_014_pos', 'rsend_016_neg', 'rsend_019_pos', 'rsend_020_pos', 'rsend_021_pos', 'rsend_022_pos', 'rsend_024_pos', 'send-c_verify_ratio', 'send-c_verify_contents', 'send-c_props', 'send-c_incremental', 'send-c_volume', 'send-c_zstreamdump', 'send-c_lz4_disabled', 'send-c_recv_lz4_disabled', 'send-c_mixed_compression', 'send-c_stream_size_estimate', 'send-c_embedded_blocks', 'send-c_resume', 'send-cpL_varied_recsize', 'send-c_recv_dedup', 'send-L_toggle', 'send_encrypted_hierarchy', 'send_encrypted_props', 'send_encrypted_truncated_files', 'send_freeobjects', 'send_realloc_files', 'send_realloc_encrypted_files', 'send_spill_block', 'send_holds', 'send_hole_birth', 'send_mixed_raw', 'send-wR_encrypted_zvol', 'send_partial_dataset', 'send_invalid'] tags = ['functional', 'rsend'] [tests/functional/scrub_mirror] tests = ['scrub_mirror_001_pos', 'scrub_mirror_002_pos', 'scrub_mirror_003_pos', 'scrub_mirror_004_pos'] tags = ['functional', 'scrub_mirror'] [tests/functional/slog] tests = ['slog_001_pos', 'slog_002_pos', 'slog_003_pos', 'slog_004_pos', 'slog_005_pos', 'slog_006_pos', 'slog_007_pos', 'slog_008_neg', 'slog_009_neg', 'slog_010_neg', 'slog_011_neg', 'slog_012_neg', 'slog_013_pos', 'slog_014_pos', 'slog_015_neg', 'slog_replay_fs_001', 'slog_replay_fs_002', 'slog_replay_volume'] tags = ['functional', 'slog'] [tests/functional/snapshot] tests = ['clone_001_pos', 'rollback_001_pos', 'rollback_002_pos', 'rollback_003_pos', 'snapshot_001_pos', 'snapshot_002_pos', 'snapshot_003_pos', 'snapshot_004_pos', 'snapshot_005_pos', 'snapshot_006_pos', 'snapshot_007_pos', 'snapshot_008_pos', 'snapshot_009_pos', 'snapshot_010_pos', 'snapshot_011_pos', 'snapshot_012_pos', 'snapshot_013_pos', 'snapshot_014_pos', 'snapshot_017_pos'] tags = ['functional', 'snapshot'] [tests/functional/snapused] tests = ['snapused_001_pos', 'snapused_002_pos', 'snapused_003_pos', 'snapused_004_pos', 'snapused_005_pos'] tags = ['functional', 'snapused'] [tests/functional/sparse] tests = ['sparse_001_pos'] tags = ['functional', 'sparse'] [tests/functional/suid] tests = ['suid_write_to_suid', 'suid_write_to_sgid', 'suid_write_to_suid_sgid', 'suid_write_to_none'] tags = ['functional', 'suid'] [tests/functional/threadsappend] tests = ['threadsappend_001_pos'] tags = ['functional', 'threadsappend'] [tests/functional/trim] tests = ['autotrim_integrity', 'autotrim_config', 'autotrim_trim_integrity', 'trim_integrity', 'trim_config', 'trim_l2arc'] tags = ['functional', 'trim'] [tests/functional/truncate] tests = ['truncate_001_pos', 'truncate_002_pos', 'truncate_timestamps'] tags = ['functional', 'truncate'] [tests/functional/upgrade] tests = ['upgrade_userobj_001_pos', 'upgrade_readonly_pool'] tags = ['functional', 'upgrade'] [tests/functional/userquota] tests = [ 'userquota_001_pos', 'userquota_002_pos', 'userquota_003_pos', 'userquota_004_pos', 'userquota_005_neg', 'userquota_006_pos', 'userquota_007_pos', 'userquota_008_pos', 'userquota_009_pos', 'userquota_010_pos', 'userquota_011_pos', 'userquota_012_neg', 'userspace_001_pos', 'userspace_002_pos', 'userspace_encrypted', 
'userspace_send_encrypted'] tags = ['functional', 'userquota'] [tests/functional/vdev_zaps] tests = ['vdev_zaps_001_pos', 'vdev_zaps_002_pos', 'vdev_zaps_003_pos', 'vdev_zaps_004_pos', 'vdev_zaps_005_pos', 'vdev_zaps_006_pos', 'vdev_zaps_007_pos'] tags = ['functional', 'vdev_zaps'] [tests/functional/write_dirs] tests = ['write_dirs_001_pos', 'write_dirs_002_pos'] tags = ['functional', 'write_dirs'] [tests/functional/xattr] tests = ['xattr_001_pos', 'xattr_002_neg', 'xattr_003_neg', 'xattr_004_pos', 'xattr_005_pos', 'xattr_006_pos', 'xattr_007_neg', 'xattr_011_pos', 'xattr_012_pos', 'xattr_013_pos'] tags = ['functional', 'xattr'] [tests/functional/zvol/zvol_ENOSPC] tests = ['zvol_ENOSPC_001_pos'] tags = ['functional', 'zvol', 'zvol_ENOSPC'] [tests/functional/zvol/zvol_cli] tests = ['zvol_cli_001_pos', 'zvol_cli_002_pos', 'zvol_cli_003_neg'] tags = ['functional', 'zvol', 'zvol_cli'] [tests/functional/zvol/zvol_misc] tests = ['zvol_misc_002_pos', 'zvol_misc_hierarchy', 'zvol_misc_rename_inuse', 'zvol_misc_snapdev', 'zvol_misc_volmode', 'zvol_misc_zil'] tags = ['functional', 'zvol', 'zvol_misc'] [tests/functional/zvol/zvol_swap] tests = ['zvol_swap_001_pos', 'zvol_swap_002_pos', 'zvol_swap_004_pos'] tags = ['functional', 'zvol', 'zvol_swap'] [tests/functional/libzfs] tests = ['many_fds', 'libzfs_input'] tags = ['functional', 'libzfs'] [tests/functional/log_spacemap] tests = ['log_spacemap_import_logs'] pre = post = tags = ['functional', 'log_spacemap'] [tests/functional/l2arc] tests = ['l2arc_arcstats_pos', 'l2arc_mfuonly_pos', 'l2arc_l2miss_pos', 'persist_l2arc_001_pos', 'persist_l2arc_002_pos', 'persist_l2arc_003_neg', 'persist_l2arc_004_pos', 'persist_l2arc_005_pos', 'persist_l2arc_006_pos', 'persist_l2arc_007_pos', 'persist_l2arc_008_pos'] tags = ['functional', 'l2arc'] [tests/functional/zpool_influxdb] tests = ['zpool_influxdb'] tags = ['functional', 'zpool_influxdb'] diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_events/Makefile.am b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_events/Makefile.am index 99c46f0143c2..765df102229d 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_events/Makefile.am +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_events/Makefile.am @@ -1,25 +1,26 @@ include $(top_srcdir)/config/Rules.am pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/cli_root/zpool_events pkgexecdir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/cli_root/zpool_events dist_pkgdata_SCRIPTS = \ setup.ksh \ cleanup.ksh \ zpool_events_clear.ksh \ zpool_events_cliargs.ksh \ zpool_events_follow.ksh \ zpool_events_poolname.ksh \ zpool_events_errors.ksh \ - zpool_events_duplicates.ksh + zpool_events_duplicates.ksh \ + zpool_events_clear_retained.ksh dist_pkgdata_DATA = \ zpool_events.cfg \ zpool_events.kshlib ereports_LDADD = \ $(abs_top_builddir)/lib/libnvpair/libnvpair.la \ $(abs_top_builddir)/lib/libzfs/libzfs.la pkgexec_PROGRAMS = ereports ereports_SOURCES = ereports.c diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_events/zpool_events_clear_retained.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_events/zpool_events_clear_retained.ksh new file mode 100755 index 000000000000..fdf56b2cf9a6 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_events/zpool_events_clear_retained.ksh @@ -0,0 +1,135 @@ +#!/bin/ksh -p +# CDDL HEADER START +# +# The contents of this file are subject 
to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2018 by Lawrence Livermore National Security, LLC. +# Copyright (c) 2021 by Delphix. All rights reserved. +# + +# DESCRIPTION: +# Verify that new errors after a pool scrub are not considered a duplicate +# +# STRATEGY: +# 1. Create a raidz pool with a file +# 2. Inject garbage into one of the vdevs +# 3. Scrub the pool +# 4. Observe the checksum error counts +# 5. Repeat inject and pool scrub +# 6. Verify that second pass also produces similar errors (i.e. not +# treated as a duplicate) +# + +. $STF_SUITE/include/libtest.shlib + +verify_runnable "both" + +MOUNTDIR=$TEST_BASE_DIR/mount +FILEPATH=$MOUNTDIR/target +VDEV1=$TEST_BASE_DIR/vfile1 +VDEV2=$TEST_BASE_DIR/vfile2 +VDEV3=$TEST_BASE_DIR/vfile3 +SUPPLY=$TEST_BASE_DIR/supply +POOL=test_pool +FILESIZE="15M" +DAMAGEBLKS=10 + +OLD_LEN_MAX=$(get_tunable ZEVENT_LEN_MAX) +RETAIN_MAX=$(get_tunable ZEVENT_RETAIN_MAX) +OLD_CHECKSUMS=$(get_tunable CHECKSUM_EVENTS_PER_SECOND) + +EREPORTS="$STF_SUITE/tests/functional/cli_root/zpool_events/ereports" + +function cleanup +{ + log_must set_tunable64 CHECKSUM_EVENTS_PER_SECOND $OLD_CHECKSUMS + log_must set_tunable64 ZEVENT_LEN_MAX $OLD_LEN_MAX + + zpool events -c + if poolexists $POOL ; then + zpool export $POOL + fi + log_must rm -f $VDEV1 $VDEV2 $VDEV3 +} + +function damage_and_repair +{ + log_must zpool clear $POOL $VDEV1 + log_must zpool events -c + + log_note injecting damage to $VDEV1 + log_must dd conv=notrunc if=$SUPPLY of=$VDEV1 bs=1M seek=4 count=$DAMAGEBLKS + log_must zpool scrub $POOL + log_must zpool wait -t scrub $POOL + log_note "pass $1 observed $($EREPORTS | grep -c checksum) checksum ereports" + + repaired=$(zpool status $POOL | grep "scan: scrub repaired" | awk '{print $4}') + if [ "$repaired" == "0B" ]; then + log_fail "INVALID TEST -- expected scrub to repair some blocks" + else + log_note "$repaired repaired during scrub" + fi +} + +function checksum_error_count +{ + zpool status -p $POOL | grep $VDEV1 | awk '{print $5}' +} + +assertion="Damage to recently repaired blocks should be reported/counted" +log_assert "$assertion" +log_note "zevent retain max setting: $RETAIN_MAX" + +log_onexit cleanup + +# Set our threshold high to avoid dropping events.
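+# A hedged aside rather than part of the original test: ZEVENT_LEN_MAX bounds
+# how many zevents the kernel keeps queued, and CHECKSUM_EVENTS_PER_SECOND
+# rate-limits checksum ereports, so raising both below should keep every
+# injected error visible to the $EREPORTS helper. Logging the saved values
+# (captured above) can make a failed run easier to interpret:
+log_note "tunables before override: ZEVENT_LEN_MAX=$OLD_LEN_MAX CHECKSUM_EVENTS_PER_SECOND=$OLD_CHECKSUMS"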
+set_tunable64 ZEVENT_LEN_MAX 20000 +set_tunable64 CHECKSUM_EVENTS_PER_SECOND 20000 + +# Initialize resources for the test +log_must truncate -s $MINVDEVSIZE $VDEV1 $VDEV2 $VDEV3 +log_must dd if=/dev/urandom of=$SUPPLY bs=1M count=$DAMAGEBLKS +log_must mkdir -p $MOUNTDIR +log_must zpool create -f -m $MOUNTDIR -o failmode=continue $POOL raidz $VDEV1 $VDEV2 $VDEV3 +log_must zfs set compression=off recordsize=16k $POOL +# create a file full of zeros +log_must mkfile -v $FILESIZE $FILEPATH +log_must zpool sync $POOL + +# run once and observe the checksum errors +damage_and_repair 1 +errcnt=$(checksum_error_count) +log_note "$errcnt errors observed" +# set expectation of at least 75% of what we observed in first pass +(( expected = (errcnt * 75) / 100 )) + +# run again and we should observe new checksum errors +damage_and_repair 2 +errcnt=$(checksum_error_count) + +log_must zpool destroy $POOL + +if (( errcnt < expected )); then + log_fail "FAILED -- expecting at least $expected checksum errors but only observed $errcnt" +else + log_note observed $errcnt new checksum errors after a scrub + log_pass "$assertion" +fi + diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_events/zpool_events_duplicates.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_events/zpool_events_duplicates.ksh index 1ba7b1b34496..d4194a5b8f5b 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_events/zpool_events_duplicates.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_events/zpool_events_duplicates.ksh @@ -1,155 +1,144 @@ #!/bin/ksh -p # CDDL HEADER START # # The contents of this file are subject to the terms of the # Common Development and Distribution License (the "License"). # You may not use this file except in compliance with the License. # # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE # or http://www.opensolaris.org/os/licensing. # See the License for the specific language governing permissions # and limitations under the License. # # When distributing Covered Code, include this CDDL HEADER in each # file and include the License file at usr/src/OPENSOLARIS.LICENSE. # If applicable, add the following below this CDDL HEADER, with the # fields enclosed by brackets "[]" replaced with your own identifying # information: Portions Copyright [yyyy] [name of copyright owner] # # CDDL HEADER END # # # Copyright (c) 2018 by Lawrence Livermore National Security, LLC. # Copyright (c) 2020 by Delphix. All rights reserved. # # DESCRIPTION: # Verify that duplicate I/O ereport errors are not posted # # STRATEGY: # 1. Create a mirror pool # 2. Inject duplicate read/write IO errors and checksum errors # 3. Verify there are no duplicate events being posted # . $STF_SUITE/include/libtest.shlib verify_runnable "both" MOUNTDIR=$TEST_BASE_DIR/mount FILEPATH=$MOUNTDIR/badfile VDEV1=$TEST_BASE_DIR/vfile1 VDEV2=$TEST_BASE_DIR/vfile2 POOL=error_pool FILESIZE="10M" OLD_LEN_MAX=$(get_tunable ZEVENT_LEN_MAX) RETAIN_MAX=$(get_tunable ZEVENT_RETAIN_MAX) EREPORTS="$STF_SUITE/tests/functional/cli_root/zpool_events/ereports" duplicates=false function cleanup { log_must set_tunable64 ZEVENT_LEN_MAX $OLD_LEN_MAX log_must zinject -c all if poolexists $POOL ; then destroy_pool $POOL fi log_must rm -f $VDEV1 $VDEV2 } log_assert "Duplicate I/O ereport errors are not posted" log_note "zevent retain max setting: $RETAIN_MAX" log_onexit cleanup # Set our threshold high to avoid dropping events.
set_tunable64 ZEVENT_LEN_MAX 20000 log_must truncate -s $MINVDEVSIZE $VDEV1 $VDEV2 log_must mkdir -p $MOUNTDIR # # $1: test type - corrupt (checksum error), io # $2: read, write function do_dup_test { ERR=$1 RW=$2 log_note "Testing $ERR $RW ereports" log_must zpool create -f -m $MOUNTDIR -o failmode=continue $POOL mirror $VDEV1 $VDEV2 log_must zpool events -c log_must zfs set compression=off $POOL if [ "$RW" == "read" ] ; then log_must mkfile $FILESIZE $FILEPATH # unmount and mount filesystems to purge file from ARC # to force reads to go through error inject handler log_must zfs unmount $POOL log_must zfs mount $POOL # all reads from this file get an error if [ "$ERR" == "corrupt" ] ; then log_must zinject -a -t data -e checksum -T read $FILEPATH else log_must zinject -a -t data -e io -T read $FILEPATH fi # Read the file a few times to generate some # duplicate errors of the same blocks # shellcheck disable=SC2034 for i in {1..15}; do dd if=$FILEPATH of=/dev/null bs=128K > /dev/null 2>&1 done log_must zinject -c all fi log_must zinject -d $VDEV1 -e $ERR -T $RW -f 100 $POOL if [ "$RW" == "write" ] ; then log_must mkfile $FILESIZE $FILEPATH log_must zpool sync $POOL - else - # scrub twice to generate some duplicates - log_must zpool scrub $POOL - log_must zpool wait -t scrub $POOL - log_must zpool scrub $POOL - log_must zpool wait -t scrub $POOL fi log_must zinject -c all - # Wait for the pool to settle down and finish resilvering (if - # necessary). We want the errors to stop incrementing before we - # check for duplicates. - zpool wait -t resilver $POOL - ereports="$($EREPORTS | sort)" actual=$(echo "$ereports" | wc -l) unique=$(echo "$ereports" | uniq | wc -l) log_note "$actual total $ERR $RW ereports where $unique were unique" if [ $actual -gt $unique ] ; then log_note "UNEXPECTED -- $((actual-unique)) duplicate $ERR $RW ereports" echo "$ereports" duplicates=true fi log_must zpool destroy $POOL } do_dup_test "corrupt" "read" do_dup_test "io" "read" do_dup_test "io" "write" if $duplicates; then log_fail "FAILED -- Duplicate I/O ereport errors encountered" else log_pass "Duplicate I/O ereport errors are not posted" fi diff --git a/sys/modules/zfs/zfs_config.h b/sys/modules/zfs/zfs_config.h index 18ca44d1b3c1..0ef2200d1940 100644 --- a/sys/modules/zfs/zfs_config.h +++ b/sys/modules/zfs/zfs_config.h @@ -1,774 +1,774 @@ /* * $FreeBSD$ */ /* zfs_config.h. Generated from zfs_config.h.in by configure. */ /* zfs_config.h.in. Generated from configure.ac by autoheader. */ /* Define to 1 if translation of program messages to the user's native language is requested. 
*/ /* #undef ENABLE_NLS */ /* bio_end_io_t wants 1 arg */ /* #undef HAVE_1ARG_BIO_END_IO_T */ /* lookup_bdev() wants 1 arg */ /* #undef HAVE_1ARG_LOOKUP_BDEV */ /* submit_bio() wants 1 arg */ /* #undef HAVE_1ARG_SUBMIT_BIO */ /* bdi_setup_and_register() wants 2 args */ /* #undef HAVE_2ARGS_BDI_SETUP_AND_REGISTER */ /* vfs_getattr wants 2 args */ /* #undef HAVE_2ARGS_VFS_GETATTR */ /* zlib_deflate_workspacesize() wants 2 args */ /* #undef HAVE_2ARGS_ZLIB_DEFLATE_WORKSPACESIZE */ /* bdi_setup_and_register() wants 3 args */ /* #undef HAVE_3ARGS_BDI_SETUP_AND_REGISTER */ /* vfs_getattr wants 3 args */ /* #undef HAVE_3ARGS_VFS_GETATTR */ /* vfs_getattr wants 4 args */ /* #undef HAVE_4ARGS_VFS_GETATTR */ /* kernel has access_ok with 'type' parameter */ /* #undef HAVE_ACCESS_OK_TYPE */ /* posix_acl has refcount_t */ /* #undef HAVE_ACL_REFCOUNT */ /* Define if host toolchain supports AES */ #define HAVE_AES 1 #ifdef __amd64__ #ifndef RESCUE /* Define if host toolchain supports AVX */ #define HAVE_AVX 1 #endif /* Define if host toolchain supports AVX2 */ #define HAVE_AVX2 1 /* Define if host toolchain supports AVX512BW */ #define HAVE_AVX512BW 1 /* Define if host toolchain supports AVX512CD */ #define HAVE_AVX512CD 1 /* Define if host toolchain supports AVX512DQ */ #define HAVE_AVX512DQ 1 /* Define if host toolchain supports AVX512ER */ #define HAVE_AVX512ER 1 /* Define if host toolchain supports AVX512F */ #define HAVE_AVX512F 1 /* Define if host toolchain supports AVX512IFMA */ #define HAVE_AVX512IFMA 1 /* Define if host toolchain supports AVX512PF */ #define HAVE_AVX512PF 1 /* Define if host toolchain supports AVX512VBMI */ #define HAVE_AVX512VBMI 1 /* Define if host toolchain supports AVX512VL */ #define HAVE_AVX512VL 1 #endif /* bdev_check_media_change() exists */ /* #undef HAVE_BDEV_CHECK_MEDIA_CHANGE */ /* bdev_whole() is available */ /* #undef HAVE_BDEV_WHOLE */ /* bio->bi_opf is defined */ /* #undef HAVE_BIO_BI_OPF */ /* bio->bi_status exists */ /* #undef HAVE_BIO_BI_STATUS */ /* bio has bi_iter */ /* #undef HAVE_BIO_BVEC_ITER */ /* bio_*_io_acct() available */ /* #undef HAVE_BIO_IO_ACCT */ /* bio_set_dev() is available */ /* #undef HAVE_BIO_SET_DEV */ /* bio_set_dev() GPL-only */ /* #undef HAVE_BIO_SET_DEV_GPL_ONLY */ /* bio_set_op_attrs is available */ /* #undef HAVE_BIO_SET_OP_ATTRS */ /* blkdev_reread_part() exists */ /* #undef HAVE_BLKDEV_REREAD_PART */ /* blkg_tryget() is available */ /* #undef HAVE_BLKG_TRYGET */ /* blkg_tryget() GPL-only */ /* #undef HAVE_BLKG_TRYGET_GPL_ONLY */ /* blk_alloc_queue() expects request function */ /* #undef HAVE_BLK_ALLOC_QUEUE_REQUEST_FN */ /* blk_alloc_queue_rh() expects request function */ /* #undef HAVE_BLK_ALLOC_QUEUE_REQUEST_FN_RH */ /* blk queue backing_dev_info is dynamic */ /* #undef HAVE_BLK_QUEUE_BDI_DYNAMIC */ /* blk_queue_flag_clear() exists */ /* #undef HAVE_BLK_QUEUE_FLAG_CLEAR */ /* blk_queue_flag_set() exists */ /* #undef HAVE_BLK_QUEUE_FLAG_SET */ /* blk_queue_flush() is available */ /* #undef HAVE_BLK_QUEUE_FLUSH */ /* blk_queue_flush() is GPL-only */ /* #undef HAVE_BLK_QUEUE_FLUSH_GPL_ONLY */ /* blk_queue_secdiscard() is available */ /* #undef HAVE_BLK_QUEUE_SECDISCARD */ /* blk_queue_secure_erase() is available */ /* #undef HAVE_BLK_QUEUE_SECURE_ERASE */ /* blk_queue_write_cache() exists */ /* #undef HAVE_BLK_QUEUE_WRITE_CACHE */ /* blk_queue_write_cache() is GPL-only */ /* #undef HAVE_BLK_QUEUE_WRITE_CACHE_GPL_ONLY */ /* Define to 1 if you have the Mac OS X function CFLocaleCopyCurrent in the CoreFoundation framework. 
*/ /* #undef HAVE_CFLOCALECOPYCURRENT */ /* Define to 1 if you have the Mac OS X function CFLocaleCopyPreferredLanguages in the CoreFoundation framework. */ /* #undef HAVE_CFLOCALECOPYPREFERREDLANGUAGES */ /* Define to 1 if you have the Mac OS X function CFPreferencesCopyAppValue in the CoreFoundation framework. */ /* #undef HAVE_CFPREFERENCESCOPYAPPVALUE */ /* check_disk_change() exists */ /* #undef HAVE_CHECK_DISK_CHANGE */ /* clear_inode() is available */ /* #undef HAVE_CLEAR_INODE */ /* dentry uses const struct dentry_operations */ /* #undef HAVE_CONST_DENTRY_OPERATIONS */ /* copy_from_iter() is available */ /* #undef HAVE_COPY_FROM_ITER */ /* copy_to_iter() is available */ /* #undef HAVE_COPY_TO_ITER */ /* yes */ /* #undef HAVE_CPU_HOTPLUG */ /* current_time() exists */ /* #undef HAVE_CURRENT_TIME */ /* Define if the GNU dcgettext() function is already present or preinstalled. */ /* #undef HAVE_DCGETTEXT */ /* DECLARE_EVENT_CLASS() is available */ /* #undef HAVE_DECLARE_EVENT_CLASS */ /* lookup_bdev() wants dev_t arg */ /* #undef HAVE_DEVT_LOOKUP_BDEV */ /* sops->dirty_inode() wants flags */ /* #undef HAVE_DIRTY_INODE_WITH_FLAGS */ /* Define to 1 if you have the header file. */ #define HAVE_DLFCN_H 1 /* d_make_root() is available */ /* #undef HAVE_D_MAKE_ROOT */ /* d_prune_aliases() is available */ /* #undef HAVE_D_PRUNE_ALIASES */ /* dops->d_revalidate() operation takes nameidata */ /* #undef HAVE_D_REVALIDATE_NAMEIDATA */ /* eops->encode_fh() wants child and parent inodes */ /* #undef HAVE_ENCODE_FH_WITH_INODE */ /* sops->evict_inode() exists */ /* #undef HAVE_EVICT_INODE */ /* fops->aio_fsync() exists */ /* #undef HAVE_FILE_AIO_FSYNC */ /* file_dentry() is available */ /* #undef HAVE_FILE_DENTRY */ /* file_inode() is available */ /* #undef HAVE_FILE_INODE */ /* iops->follow_link() cookie */ /* #undef HAVE_FOLLOW_LINK_COOKIE */ /* iops->follow_link() nameidata */ /* #undef HAVE_FOLLOW_LINK_NAMEIDATA */ /* fops->fsync() with range */ /* #undef HAVE_FSYNC_RANGE */ /* fops->fsync() without dentry */ /* #undef HAVE_FSYNC_WITHOUT_DENTRY */ /* generic_*_io_acct() 3 arg available */ /* #undef HAVE_GENERIC_IO_ACCT_3ARG */ /* generic_*_io_acct() 4 arg available */ /* #undef HAVE_GENERIC_IO_ACCT_4ARG */ /* generic_readlink is global */ /* #undef HAVE_GENERIC_READLINK */ /* generic_setxattr() exists */ /* #undef HAVE_GENERIC_SETXATTR */ /* generic_write_checks() takes kiocb */ /* #undef HAVE_GENERIC_WRITE_CHECKS_KIOCB */ /* Define if the GNU gettext() function is already present or preinstalled. */ /* #undef HAVE_GETTEXT */ /* iops->get_link() cookie */ /* #undef HAVE_GET_LINK_COOKIE */ /* iops->get_link() delayed */ /* #undef HAVE_GET_LINK_DELAYED */ /* group_info->gid exists */ /* #undef HAVE_GROUP_INFO_GID */ /* has_capability() is available */ /* #undef HAVE_HAS_CAPABILITY */ /* Define if you have the iconv() function and it works. */ #define HAVE_ICONV 1 /* yes */ /* #undef HAVE_INODE_LOCK_SHARED */ /* inode_set_flags() exists */ /* #undef HAVE_INODE_SET_FLAGS */ /* inode_set_iversion() exists */ /* #undef HAVE_INODE_SET_IVERSION */ /* inode->i_*time's are timespec64 */ /* #undef HAVE_INODE_TIMESPEC64_TIMES */ /* timestamp_truncate() exists */ /* #undef HAVE_INODE_TIMESTAMP_TRUNCATE */ /* Define to 1 if you have the header file. 
   */
#define HAVE_INTTYPES_H 1

/* in_compat_syscall() is available */
/* #undef HAVE_IN_COMPAT_SYSCALL */

/* iov_iter_advance() is available */
/* #undef HAVE_IOV_ITER_ADVANCE */

/* iov_iter_count() is available */
/* #undef HAVE_IOV_ITER_COUNT */

/* iov_iter_fault_in_readable() is available */
/* #undef HAVE_IOV_ITER_FAULT_IN_READABLE */

/* iov_iter_init() is available */
/* #undef HAVE_IOV_ITER_INIT */

/* iov_iter_init() is available */
/* #undef HAVE_IOV_ITER_INIT_LEGACY */

/* iov_iter_revert() is available */
/* #undef HAVE_IOV_ITER_REVERT */

/* iov_iter types are available */
/* #undef HAVE_IOV_ITER_TYPES */

/* yes */
/* #undef HAVE_IO_SCHEDULE_TIMEOUT */

/* Define to 1 if you have the `issetugid' function. */
#define HAVE_ISSETUGID 1

/* kernel has kernel_fpu_* functions */
/* #undef HAVE_KERNEL_FPU */

/* kernel has asm/fpu/api.h */
/* #undef HAVE_KERNEL_FPU_API_HEADER */

/* kernel fpu internal */
/* #undef HAVE_KERNEL_FPU_INTERNAL */

/* uncached_acl_sentinel() exists */
/* #undef HAVE_KERNEL_GET_ACL_HANDLE_CACHE */

/* kernel does stack verification */
/* #undef HAVE_KERNEL_OBJTOOL */

/* kernel has linux/objtool.h */
/* #undef HAVE_KERNEL_OBJTOOL_HEADER */

/* kernel_read() take loff_t pointer */
/* #undef HAVE_KERNEL_READ_PPOS */

/* timer_list.function gets a timer_list */
/* #undef HAVE_KERNEL_TIMER_FUNCTION_TIMER_LIST */

/* struct timer_list has a flags member */
/* #undef HAVE_KERNEL_TIMER_LIST_FLAGS */

/* timer_setup() is available */
/* #undef HAVE_KERNEL_TIMER_SETUP */

/* kernel_write() take loff_t pointer */
/* #undef HAVE_KERNEL_WRITE_PPOS */

/* kmem_cache_create_usercopy() exists */
/* #undef HAVE_KMEM_CACHE_CREATE_USERCOPY */

/* kstrtoul() exists */
/* #undef HAVE_KSTRTOUL */

/* ktime_get_coarse_real_ts64() exists */
/* #undef HAVE_KTIME_GET_COARSE_REAL_TS64 */

/* ktime_get_raw_ts64() exists */
/* #undef HAVE_KTIME_GET_RAW_TS64 */

/* kvmalloc exists */
/* #undef HAVE_KVMALLOC */

/* kernel has large stacks */
/* #undef HAVE_LARGE_STACKS */

/* Define if you have [aio] */
/* #undef HAVE_LIBAIO */

/* Define if you have [blkid] */
/* #undef HAVE_LIBBLKID */

/* Define if you have [crypto] */
#define HAVE_LIBCRYPTO 1

/* Define if you have [tirpc] */
/* #undef HAVE_LIBTIRPC */

/* Define if you have [udev] */
/* #undef HAVE_LIBUDEV */

/* Define if you have [uuid] */
/* #undef HAVE_LIBUUID */

/* lseek_execute() is available */
/* #undef HAVE_LSEEK_EXECUTE */

/* makedev() is declared in sys/mkdev.h */
/* #undef HAVE_MAKEDEV_IN_MKDEV */

/* makedev() is declared in sys/sysmacros.h */
/* #undef HAVE_MAKEDEV_IN_SYSMACROS */

/* Noting that make_request_fn() returns blk_qc_t */
/* #undef HAVE_MAKE_REQUEST_FN_RET_QC */

/* Noting that make_request_fn() returns void */
/* #undef HAVE_MAKE_REQUEST_FN_RET_VOID */

/* Define to 1 if you have the <memory.h> header file. */
#define HAVE_MEMORY_H 1

/* iops->create()/mkdir()/mknod() take umode_t */
/* #undef HAVE_MKDIR_UMODE_T */

/* Define to 1 if you have the `mlockall' function.
   */
#define HAVE_MLOCKALL 1

/* lookup_bdev() wants mode arg */
/* #undef HAVE_MODE_LOOKUP_BDEV */

/* Define if host toolchain supports MOVBE */
#define HAVE_MOVBE 1

/* new_sync_read()/new_sync_write() are available */
/* #undef HAVE_NEW_SYNC_READ */

/* iops->getattr() takes a path */
/* #undef HAVE_PATH_IOPS_GETATTR */

/* Define if host toolchain supports PCLMULQDQ */
#define HAVE_PCLMULQDQ 1

/* percpu_counter_init() wants gfp_t */
/* #undef HAVE_PERCPU_COUNTER_INIT_WITH_GFP */

/* posix_acl_chmod() exists */
/* #undef HAVE_POSIX_ACL_CHMOD */

/* posix_acl_from_xattr() needs user_ns */
/* #undef HAVE_POSIX_ACL_FROM_XATTR_USERNS */

/* posix_acl_release() is available */
/* #undef HAVE_POSIX_ACL_RELEASE */

/* posix_acl_release() is GPL-only */
/* #undef HAVE_POSIX_ACL_RELEASE_GPL_ONLY */

/* posix_acl_valid() wants user namespace */
/* #undef HAVE_POSIX_ACL_VALID_WITH_NS */

/* proc_ops structure exists */
/* #undef HAVE_PROC_OPS_STRUCT */

/* iops->put_link() cookie */
/* #undef HAVE_PUT_LINK_COOKIE */

/* iops->put_link() delayed */
/* #undef HAVE_PUT_LINK_DELAYED */

/* iops->put_link() nameidata */
/* #undef HAVE_PUT_LINK_NAMEIDATA */

/* If available, contains the Python version number currently in use. */
#define HAVE_PYTHON "3.7"

/* qat is enabled and existed */
/* #undef HAVE_QAT */

/* iops->rename() wants flags */
/* #undef HAVE_RENAME_WANTS_FLAGS */

/* REQ_DISCARD is defined */
/* #undef HAVE_REQ_DISCARD */

/* REQ_FLUSH is defined */
/* #undef HAVE_REQ_FLUSH */

/* REQ_OP_DISCARD is defined */
/* #undef HAVE_REQ_OP_DISCARD */

/* REQ_OP_FLUSH is defined */
/* #undef HAVE_REQ_OP_FLUSH */

/* REQ_OP_SECURE_ERASE is defined */
/* #undef HAVE_REQ_OP_SECURE_ERASE */

/* REQ_PREFLUSH is defined */
/* #undef HAVE_REQ_PREFLUSH */

/* revalidate_disk() is available */
/* #undef HAVE_REVALIDATE_DISK */

/* revalidate_disk_size() is available */
/* #undef HAVE_REVALIDATE_DISK_SIZE */

/* struct rw_semaphore has member activity */
/* #undef HAVE_RWSEM_ACTIVITY */

/* struct rw_semaphore has atomic_long_t member count */
/* #undef HAVE_RWSEM_ATOMIC_LONG_COUNT */

/* linux/sched/signal.h exists */
/* #undef HAVE_SCHED_SIGNAL_HEADER */

/* Define to 1 if you have the <security/pam_modules.h> header file. */
#define HAVE_SECURITY_PAM_MODULES_H 1

/* setattr_prepare() is available */
/* #undef HAVE_SETATTR_PREPARE */

/* iops->set_acl() exists */
/* #undef HAVE_SET_ACL */

/* set_cached_acl() is usable */
/* #undef HAVE_SET_CACHED_ACL_USABLE */

/* struct shrink_control exists */
/* #undef HAVE_SHRINK_CONTROL_STRUCT */

/* new shrinker callback wants 2 args */
/* #undef HAVE_SINGLE_SHRINKER_CALLBACK */

/* ->count_objects exists */
/* #undef HAVE_SPLIT_SHRINKER_CALLBACK */

#if defined(__amd64__) || defined(__i386__)
/* Define if host toolchain supports SSE */
#define HAVE_SSE 1

/* Define if host toolchain supports SSE2 */
#define HAVE_SSE2 1

/* Define if host toolchain supports SSE3 */
#define HAVE_SSE3 1

/* Define if host toolchain supports SSE4.1 */
#define HAVE_SSE4_1 1

/* Define if host toolchain supports SSE4.2 */
#define HAVE_SSE4_2 1

/* Define if host toolchain supports SSSE3 */
#define HAVE_SSSE3 1
#endif

/* STACK_FRAME_NON_STANDARD is defined */
/* #undef HAVE_STACK_FRAME_NON_STANDARD */

/* Define to 1 if you have the <stdint.h> header file. */
#define HAVE_STDINT_H 1

/* Define to 1 if you have the <stdlib.h> header file. */
#define HAVE_STDLIB_H 1

/* Define to 1 if you have the <strings.h> header file. */
#define HAVE_STRINGS_H 1

/* Define to 1 if you have the <string.h> header file. */
#define HAVE_STRING_H 1

/* Define to 1 if you have the `strlcat' function.
   */
#define HAVE_STRLCAT 1

/* Define to 1 if you have the `strlcpy' function. */
#define HAVE_STRLCPY 1

/* submit_bio is member of struct block_device_operations */
/* #undef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */

/* super_setup_bdi_name() exits */
/* #undef HAVE_SUPER_SETUP_BDI_NAME */

/* super_block->s_user_ns exists */
/* #undef HAVE_SUPER_USER_NS */

/* Define to 1 if you have the <sys/stat.h> header file. */
#define HAVE_SYS_STAT_H 1

/* Define to 1 if you have the <sys/types.h> header file. */
#define HAVE_SYS_TYPES_H 1

/* i_op->tmpfile() exists */
/* #undef HAVE_TMPFILE */

/* totalhigh_pages() exists */
/* #undef HAVE_TOTALHIGH_PAGES */

/* kernel has totalram_pages() */
/* #undef HAVE_TOTALRAM_PAGES_FUNC */

/* Define to 1 if you have the `udev_device_get_is_initialized' function. */
/* #undef HAVE_UDEV_DEVICE_GET_IS_INITIALIZED */

/* kernel has __kernel_fpu_* functions */
/* #undef HAVE_UNDERSCORE_KERNEL_FPU */

/* Define to 1 if you have the <unistd.h> header file. */
#define HAVE_UNISTD_H 1

/* iops->getattr() takes a vfsmount */
/* #undef HAVE_VFSMOUNT_IOPS_GETATTR */

/* aops->direct_IO() uses iovec */
/* #undef HAVE_VFS_DIRECT_IO_IOVEC */

/* aops->direct_IO() uses iov_iter without rw */
/* #undef HAVE_VFS_DIRECT_IO_ITER */

/* aops->direct_IO() uses iov_iter with offset */
/* #undef HAVE_VFS_DIRECT_IO_ITER_OFFSET */

/* aops->direct_IO() uses iov_iter with rw and offset */
/* #undef HAVE_VFS_DIRECT_IO_ITER_RW_OFFSET */

/* All required iov_iter interfaces are available */
/* #undef HAVE_VFS_IOV_ITER */

/* fops->iterate() is available */
/* #undef HAVE_VFS_ITERATE */

/* fops->iterate_shared() is available */
/* #undef HAVE_VFS_ITERATE_SHARED */

/* fops->readdir() is available */
/* #undef HAVE_VFS_READDIR */

/* fops->read/write_iter() are available */
/* #undef HAVE_VFS_RW_ITERATE */

/* __vmalloc page flags exists */
/* #undef HAVE_VMALLOC_PAGE_KERNEL */

/* yes */
/* #undef HAVE_WAIT_ON_BIT_ACTION */

/* wait_queue_entry_t exists */
/* #undef HAVE_WAIT_QUEUE_ENTRY_T */

/* wq_head->head and wq_entry->entry exist */
/* #undef HAVE_WAIT_QUEUE_HEAD_ENTRY */

/* xattr_handler->get() wants dentry */
/* #undef HAVE_XATTR_GET_DENTRY */

/* xattr_handler->get() wants both dentry and inode */
/* #undef HAVE_XATTR_GET_DENTRY_INODE */

/* xattr_handler->get() wants xattr_handler */
/* #undef HAVE_XATTR_GET_HANDLER */

/* xattr_handler has name */
/* #undef HAVE_XATTR_HANDLER_NAME */

/* xattr_handler->list() wants dentry */
/* #undef HAVE_XATTR_LIST_DENTRY */

/* xattr_handler->list() wants xattr_handler */
/* #undef HAVE_XATTR_LIST_HANDLER */

/* xattr_handler->list() wants simple */
/* #undef HAVE_XATTR_LIST_SIMPLE */

/* xattr_handler->set() wants dentry */
/* #undef HAVE_XATTR_SET_DENTRY */

/* xattr_handler->set() wants both dentry and inode */
/* #undef HAVE_XATTR_SET_DENTRY_INODE */

/* xattr_handler->set() wants xattr_handler */
/* #undef HAVE_XATTR_SET_HANDLER */

/* Define if you have [z] */
#define HAVE_ZLIB 1

/* __posix_acl_chmod() exists */
/* #undef HAVE___POSIX_ACL_CHMOD */

/* kernel exports FPU functions */
/* #undef KERNEL_EXPORTS_X86_FPU */

/* Define to the sub-directory where libtool stores uninstalled libraries. */
#define LT_OBJDIR ".libs/"

/* make_request_fn() return type */
/* #undef MAKE_REQUEST_FN_RET */

/* hardened module_param_call */
/* #undef MODULE_PARAM_CALL_CONST */

/* struct shrink_control has nid */
/* #undef SHRINK_CONTROL_HAS_NID */

/* Defined for legacy compatibility. */
#define SPL_META_ALIAS ZFS_META_ALIAS

/* Defined for legacy compatibility.
   */
#define SPL_META_RELEASE ZFS_META_RELEASE

/* Defined for legacy compatibility. */
#define SPL_META_VERSION ZFS_META_VERSION

/* True if ZFS is to be compiled for a FreeBSD system */
#define SYSTEM_FREEBSD 1

/* True if ZFS is to be compiled for a Linux system */
/* #undef SYSTEM_LINUX */

/* zfs debugging enabled */
/* #undef ZFS_DEBUG */

/* /dev/zfs minor */
/* #undef ZFS_DEVICE_MINOR */

/* enum node_stat_item contains NR_FILE_PAGES */
/* #undef ZFS_ENUM_NODE_STAT_ITEM_NR_FILE_PAGES */

/* enum node_stat_item contains NR_INACTIVE_ANON */
/* #undef ZFS_ENUM_NODE_STAT_ITEM_NR_INACTIVE_ANON */

/* enum node_stat_item contains NR_INACTIVE_FILE */
/* #undef ZFS_ENUM_NODE_STAT_ITEM_NR_INACTIVE_FILE */

/* enum zone_stat_item contains NR_FILE_PAGES */
/* #undef ZFS_ENUM_ZONE_STAT_ITEM_NR_FILE_PAGES */

/* enum zone_stat_item contains NR_INACTIVE_ANON */
/* #undef ZFS_ENUM_ZONE_STAT_ITEM_NR_INACTIVE_ANON */

/* enum zone_stat_item contains NR_INACTIVE_FILE */
/* #undef ZFS_ENUM_ZONE_STAT_ITEM_NR_INACTIVE_FILE */

/* global_node_page_state() exists */
/* #undef ZFS_GLOBAL_NODE_PAGE_STATE */

/* global_zone_page_state() exists */
/* #undef ZFS_GLOBAL_ZONE_PAGE_STATE */

/* Define to 1 if GPL-only symbols can be used */
/* #undef ZFS_IS_GPL_COMPATIBLE */

/* Define the project alias string. */
#define ZFS_META_ALIAS "zfs-2.0.0-FreeBSD_gbf156c966"

/* Define the project author. */
#define ZFS_META_AUTHOR "OpenZFS"

/* Define the project release date. */
/* #undef ZFS_META_DATA */

/* Define the maximum compatible kernel version. */
#define ZFS_META_KVER_MAX "5.11"

/* Define the minimum compatible kernel version. */
#define ZFS_META_KVER_MIN "3.10"

/* Define the project license. */
#define ZFS_META_LICENSE "CDDL"

/* Define the libtool library 'age' version information. */
/* #undef ZFS_META_LT_AGE */

/* Define the libtool library 'current' version information. */
/* #undef ZFS_META_LT_CURRENT */

/* Define the libtool library 'revision' version information. */
/* #undef ZFS_META_LT_REVISION */

/* Define the project name. */
#define ZFS_META_NAME "zfs"

/* Define the project release. */
-#define ZFS_META_RELEASE "FreeBSD_gbf156c966"
+#define ZFS_META_RELEASE "FreeBSD_g9312e0fd1"

/* Define the project version. */
#define ZFS_META_VERSION "2.0.0"

/* count is located in percpu_ref.data */
/* #undef ZFS_PERCPU_REF_COUNT_IN_DATA */
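Note on how this generated header is consumed: the HAVE_* macros above are only ever tested by the C preprocessor, so a translation unit built with the header forced in (as the libzpool Makefile does via -include .../zfs_config.h) can pick an implementation at compile time. The fragment below is a minimal illustrative sketch of that gating pattern, not code from this change; the checksum_* names are hypothetical placeholders rather than actual OpenZFS symbols.

/*
 * Illustrative sketch only: demonstrates the compile-time feature gating
 * driven by zfs_config.h.  The functions here are hypothetical stand-ins.
 */
#include <stddef.h>
#include <stdint.h>

static uint64_t
checksum_scalar(const uint8_t *buf, size_t len)
{
	uint64_t sum = 0;

	for (size_t i = 0; i < len; i++)
		sum += buf[i];
	return (sum);
}

#if defined(HAVE_AVX2) && defined(__amd64__)
/* Only compiled when the config header advertised AVX2 toolchain support. */
static uint64_t
checksum_avx2(const uint8_t *buf, size_t len)
{
	/* A real implementation would use AVX2 intrinsics here. */
	return (checksum_scalar(buf, len));
}
#endif

uint64_t
checksum(const uint8_t *buf, size_t len)
{
#if defined(HAVE_AVX2) && defined(__amd64__)
	return (checksum_avx2(buf, len));
#else
	return (checksum_scalar(buf, len));
#endif
}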