diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 44afa763283a..588c72f6e4fa 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -1,474 +1,475 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2019 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. * Copyright (c) 2016 Actifio, Inc. All rights reserved. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019 Datto Inc. */ #ifndef _SYS_SPA_IMPL_H #define _SYS_SPA_IMPL_H #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif typedef struct spa_alloc { kmutex_t spaa_lock; avl_tree_t spaa_tree; } ____cacheline_aligned spa_alloc_t; typedef struct spa_error_entry { zbookmark_phys_t se_bookmark; char *se_name; avl_node_t se_avl; zbookmark_err_phys_t se_zep; /* not accounted in avl_find */ } spa_error_entry_t; typedef struct spa_history_phys { uint64_t sh_pool_create_len; /* ending offset of zpool create */ uint64_t sh_phys_max_off; /* physical EOF */ uint64_t sh_bof; /* logical BOF */ uint64_t sh_eof; /* logical EOF */ uint64_t sh_records_lost; /* num of records overwritten */ } spa_history_phys_t; /* * All members must be uint64_t, for byteswap purposes. */ typedef struct spa_removing_phys { uint64_t sr_state; /* dsl_scan_state_t */ /* * The vdev ID that we most recently attempted to remove, * or -1 if no removal has been attempted. */ uint64_t sr_removing_vdev; /* * The vdev ID that we most recently successfully removed, * or -1 if no devices have been removed. */ uint64_t sr_prev_indirect_vdev; uint64_t sr_start_time; uint64_t sr_end_time; /* * Note that we can not use the space map's or indirect mapping's * accounting as a substitute for these values, because we need to * count frees of not-yet-copied data as though it did the copy. * Otherwise, we could get into a situation where copied > to_copy, * or we complete before copied == to_copy. */ uint64_t sr_to_copy; /* bytes that need to be copied */ uint64_t sr_copied; /* bytes that have been copied or freed */ } spa_removing_phys_t; /* * This struct is stored as an entry in the DMU_POOL_DIRECTORY_OBJECT * (with key DMU_POOL_CONDENSING_INDIRECT). It is present if a condense * of an indirect vdev's mapping object is in progress. */ typedef struct spa_condensing_indirect_phys { /* * The vdev ID of the indirect vdev whose indirect mapping is * being condensed. */ uint64_t scip_vdev; /* * The vdev's old obsolete spacemap. This spacemap's contents are * being integrated into the new mapping. */ uint64_t scip_prev_obsolete_sm_object; /* * The new mapping object that is being created. */ uint64_t scip_next_mapping_object; } spa_condensing_indirect_phys_t; struct spa_aux_vdev { uint64_t sav_object; /* MOS object for device list */ nvlist_t *sav_config; /* cached device config */ vdev_t **sav_vdevs; /* devices */ int sav_count; /* number devices */ boolean_t sav_sync; /* sync the device list */ nvlist_t **sav_pending; /* pending device additions */ uint_t sav_npending; /* # pending devices */ }; typedef struct spa_config_lock { kmutex_t scl_lock; kthread_t *scl_writer; int scl_write_wanted; int scl_count; kcondvar_t scl_cv; } ____cacheline_aligned spa_config_lock_t; typedef struct spa_config_dirent { list_node_t scd_link; char *scd_path; } spa_config_dirent_t; typedef enum zio_taskq_type { ZIO_TASKQ_ISSUE = 0, ZIO_TASKQ_ISSUE_HIGH, ZIO_TASKQ_INTERRUPT, ZIO_TASKQ_INTERRUPT_HIGH, ZIO_TASKQ_TYPES } zio_taskq_type_t; /* * State machine for the zpool-poolname process. The states transitions * are done as follows: * * From To Routine * PROC_NONE -> PROC_CREATED spa_activate() * PROC_CREATED -> PROC_ACTIVE spa_thread() * PROC_ACTIVE -> PROC_DEACTIVATE spa_deactivate() * PROC_DEACTIVATE -> PROC_GONE spa_thread() * PROC_GONE -> PROC_NONE spa_deactivate() */ typedef enum spa_proc_state { SPA_PROC_NONE, /* spa_proc = &p0, no process created */ SPA_PROC_CREATED, /* spa_activate() has proc, is waiting */ SPA_PROC_ACTIVE, /* taskqs created, spa_proc set */ SPA_PROC_DEACTIVATE, /* spa_deactivate() requests process exit */ SPA_PROC_GONE /* spa_thread() is exiting, spa_proc = &p0 */ } spa_proc_state_t; typedef struct spa_taskqs { uint_t stqs_count; taskq_t **stqs_taskq; } spa_taskqs_t; typedef enum spa_all_vdev_zap_action { AVZ_ACTION_NONE = 0, AVZ_ACTION_DESTROY, /* Destroy all per-vdev ZAPs and the AVZ. */ AVZ_ACTION_REBUILD, /* Populate the new AVZ, see spa_avz_rebuild */ AVZ_ACTION_INITIALIZE } spa_avz_action_t; typedef enum spa_config_source { SPA_CONFIG_SRC_NONE = 0, SPA_CONFIG_SRC_SCAN, /* scan of path (default: /dev/dsk) */ SPA_CONFIG_SRC_CACHEFILE, /* any cachefile */ SPA_CONFIG_SRC_TRYIMPORT, /* returned from call to tryimport */ SPA_CONFIG_SRC_SPLIT, /* new pool in a pool split */ SPA_CONFIG_SRC_MOS /* MOS, but not always from right txg */ } spa_config_source_t; struct spa { /* * Fields protected by spa_namespace_lock. */ char spa_name[ZFS_MAX_DATASET_NAME_LEN]; /* pool name */ char *spa_comment; /* comment */ avl_node_t spa_avl; /* node in spa_namespace_avl */ nvlist_t *spa_config; /* last synced config */ nvlist_t *spa_config_syncing; /* currently syncing config */ nvlist_t *spa_config_splitting; /* config for splitting */ nvlist_t *spa_load_info; /* info and errors from load */ uint64_t spa_config_txg; /* txg of last config change */ uint32_t spa_sync_pass; /* iterate-to-convergence */ pool_state_t spa_state; /* pool state */ int spa_inject_ref; /* injection references */ uint8_t spa_sync_on; /* sync threads are running */ spa_load_state_t spa_load_state; /* current load operation */ boolean_t spa_indirect_vdevs_loaded; /* mappings loaded? */ boolean_t spa_trust_config; /* do we trust vdev tree? */ boolean_t spa_is_splitting; /* in the middle of a split? */ spa_config_source_t spa_config_source; /* where config comes from? */ uint64_t spa_import_flags; /* import specific flags */ spa_taskqs_t spa_zio_taskq[ZIO_TYPES][ZIO_TASKQ_TYPES]; dsl_pool_t *spa_dsl_pool; boolean_t spa_is_initializing; /* true while opening pool */ boolean_t spa_is_exporting; /* true while exporting pool */ metaslab_class_t *spa_normal_class; /* normal data class */ metaslab_class_t *spa_log_class; /* intent log data class */ metaslab_class_t *spa_embedded_log_class; /* log on normal vdevs */ metaslab_class_t *spa_special_class; /* special allocation class */ metaslab_class_t *spa_dedup_class; /* dedup allocation class */ uint64_t spa_first_txg; /* first txg after spa_open() */ uint64_t spa_final_txg; /* txg of export/destroy */ uint64_t spa_freeze_txg; /* freeze pool at this txg */ uint64_t spa_load_max_txg; /* best initial ub_txg */ uint64_t spa_claim_max_txg; /* highest claimed birth txg */ inode_timespec_t spa_loaded_ts; /* 1st successful open time */ objset_t *spa_meta_objset; /* copy of dp->dp_meta_objset */ kmutex_t spa_evicting_os_lock; /* Evicting objset list lock */ list_t spa_evicting_os_list; /* Objsets being evicted. */ kcondvar_t spa_evicting_os_cv; /* Objset Eviction Completion */ txg_list_t spa_vdev_txg_list; /* per-txg dirty vdev list */ vdev_t *spa_root_vdev; /* top-level vdev container */ uint64_t spa_min_ashift; /* of vdevs in normal class */ uint64_t spa_max_ashift; /* of vdevs in normal class */ uint64_t spa_min_alloc; /* of vdevs in normal class */ + uint64_t spa_gcd_alloc; /* of vdevs in normal class */ uint64_t spa_config_guid; /* config pool guid */ uint64_t spa_load_guid; /* spa_load initialized guid */ uint64_t spa_last_synced_guid; /* last synced guid */ list_t spa_config_dirty_list; /* vdevs with dirty config */ list_t spa_state_dirty_list; /* vdevs with dirty state */ /* * spa_allocs is an array, whose lengths is stored in spa_alloc_count. * There is one tree and one lock for each allocator, to help improve * allocation performance in write-heavy workloads. */ spa_alloc_t *spa_allocs; int spa_alloc_count; spa_aux_vdev_t spa_spares; /* hot spares */ spa_aux_vdev_t spa_l2cache; /* L2ARC cache devices */ nvlist_t *spa_label_features; /* Features for reading MOS */ uint64_t spa_config_object; /* MOS object for pool config */ uint64_t spa_config_generation; /* config generation number */ uint64_t spa_syncing_txg; /* txg currently syncing */ bpobj_t spa_deferred_bpobj; /* deferred-free bplist */ bplist_t spa_free_bplist[TXG_SIZE]; /* bplist of stuff to free */ zio_cksum_salt_t spa_cksum_salt; /* secret salt for cksum */ /* checksum context templates */ kmutex_t spa_cksum_tmpls_lock; void *spa_cksum_tmpls[ZIO_CHECKSUM_FUNCTIONS]; uberblock_t spa_ubsync; /* last synced uberblock */ uberblock_t spa_uberblock; /* current uberblock */ boolean_t spa_extreme_rewind; /* rewind past deferred frees */ kmutex_t spa_scrub_lock; /* resilver/scrub lock */ uint64_t spa_scrub_inflight; /* in-flight scrub bytes */ /* in-flight verification bytes */ uint64_t spa_load_verify_bytes; kcondvar_t spa_scrub_io_cv; /* scrub I/O completion */ uint8_t spa_scrub_active; /* active or suspended? */ uint8_t spa_scrub_type; /* type of scrub we're doing */ uint8_t spa_scrub_finished; /* indicator to rotate logs */ uint8_t spa_scrub_started; /* started since last boot */ uint8_t spa_scrub_reopen; /* scrub doing vdev_reopen */ uint64_t spa_scan_pass_start; /* start time per pass/reboot */ uint64_t spa_scan_pass_scrub_pause; /* scrub pause time */ uint64_t spa_scan_pass_scrub_spent_paused; /* total paused */ uint64_t spa_scan_pass_exam; /* examined bytes per pass */ uint64_t spa_scan_pass_issued; /* issued bytes per pass */ /* error scrub pause time in milliseconds */ uint64_t spa_scan_pass_errorscrub_pause; /* total error scrub paused time in milliseconds */ uint64_t spa_scan_pass_errorscrub_spent_paused; /* * We are in the middle of a resilver, and another resilver * is needed once this one completes. This is set iff any * vdev_resilver_deferred is set. */ boolean_t spa_resilver_deferred; kmutex_t spa_async_lock; /* protect async state */ kthread_t *spa_async_thread; /* thread doing async task */ int spa_async_suspended; /* async tasks suspended */ kcondvar_t spa_async_cv; /* wait for thread_exit() */ uint16_t spa_async_tasks; /* async task mask */ uint64_t spa_missing_tvds; /* unopenable tvds on load */ uint64_t spa_missing_tvds_allowed; /* allow loading spa? */ uint64_t spa_nonallocating_dspace; spa_removing_phys_t spa_removing_phys; spa_vdev_removal_t *spa_vdev_removal; spa_condensing_indirect_phys_t spa_condensing_indirect_phys; spa_condensing_indirect_t *spa_condensing_indirect; zthr_t *spa_condense_zthr; /* zthr doing condense. */ uint64_t spa_checkpoint_txg; /* the txg of the checkpoint */ spa_checkpoint_info_t spa_checkpoint_info; /* checkpoint accounting */ zthr_t *spa_checkpoint_discard_zthr; space_map_t *spa_syncing_log_sm; /* current log space map */ avl_tree_t spa_sm_logs_by_txg; kmutex_t spa_flushed_ms_lock; /* for metaslabs_by_flushed */ avl_tree_t spa_metaslabs_by_flushed; spa_unflushed_stats_t spa_unflushed_stats; list_t spa_log_summary; uint64_t spa_log_flushall_txg; zthr_t *spa_livelist_delete_zthr; /* deleting livelists */ zthr_t *spa_livelist_condense_zthr; /* condensing livelists */ uint64_t spa_livelists_to_delete; /* set of livelists to free */ livelist_condense_entry_t spa_to_condense; /* next to condense */ char *spa_root; /* alternate root directory */ uint64_t spa_ena; /* spa-wide ereport ENA */ int spa_last_open_failed; /* error if last open failed */ uint64_t spa_last_ubsync_txg; /* "best" uberblock txg */ uint64_t spa_last_ubsync_txg_ts; /* timestamp from that ub */ uint64_t spa_load_txg; /* ub txg that loaded */ uint64_t spa_load_txg_ts; /* timestamp from that ub */ uint64_t spa_load_meta_errors; /* verify metadata err count */ uint64_t spa_load_data_errors; /* verify data err count */ uint64_t spa_verify_min_txg; /* start txg of verify scrub */ kmutex_t spa_errlog_lock; /* error log lock */ uint64_t spa_errlog_last; /* last error log object */ uint64_t spa_errlog_scrub; /* scrub error log object */ kmutex_t spa_errlist_lock; /* error list/ereport lock */ avl_tree_t spa_errlist_last; /* last error list */ avl_tree_t spa_errlist_scrub; /* scrub error list */ avl_tree_t spa_errlist_healed; /* list of healed blocks */ uint64_t spa_deflate; /* should we deflate? */ uint64_t spa_history; /* history object */ kmutex_t spa_history_lock; /* history lock */ vdev_t *spa_pending_vdev; /* pending vdev additions */ kmutex_t spa_props_lock; /* property lock */ uint64_t spa_pool_props_object; /* object for properties */ uint64_t spa_bootfs; /* default boot filesystem */ uint64_t spa_failmode; /* failure mode for the pool */ uint64_t spa_deadman_failmode; /* failure mode for deadman */ uint64_t spa_delegation; /* delegation on/off */ list_t spa_config_list; /* previous cache file(s) */ /* per-CPU array of root of async I/O: */ zio_t **spa_async_zio_root; zio_t *spa_suspend_zio_root; /* root of all suspended I/O */ zio_t *spa_txg_zio[TXG_SIZE]; /* spa_sync() waits for this */ kmutex_t spa_suspend_lock; /* protects suspend_zio_root */ kcondvar_t spa_suspend_cv; /* notification of resume */ zio_suspend_reason_t spa_suspended; /* pool is suspended */ uint8_t spa_claiming; /* pool is doing zil_claim() */ boolean_t spa_is_root; /* pool is root */ int spa_minref; /* num refs when first opened */ spa_mode_t spa_mode; /* SPA_MODE_{READ|WRITE} */ boolean_t spa_read_spacemaps; /* spacemaps available if ro */ spa_log_state_t spa_log_state; /* log state */ uint64_t spa_autoexpand; /* lun expansion on/off */ ddt_t *spa_ddt[ZIO_CHECKSUM_FUNCTIONS]; /* in-core DDTs */ uint64_t spa_ddt_stat_object; /* DDT statistics */ uint64_t spa_dedup_dspace; /* Cache get_dedup_dspace() */ uint64_t spa_dedup_checksum; /* default dedup checksum */ uint64_t spa_dspace; /* dspace in normal class */ struct brt *spa_brt; /* in-core BRT */ kmutex_t spa_vdev_top_lock; /* dueling offline/remove */ kmutex_t spa_proc_lock; /* protects spa_proc* */ kcondvar_t spa_proc_cv; /* spa_proc_state transitions */ spa_proc_state_t spa_proc_state; /* see definition */ proc_t *spa_proc; /* "zpool-poolname" process */ uintptr_t spa_did; /* if procp != p0, did of t1 */ boolean_t spa_autoreplace; /* autoreplace set in open */ int spa_vdev_locks; /* locks grabbed */ uint64_t spa_creation_version; /* version at pool creation */ uint64_t spa_prev_software_version; /* See ub_software_version */ uint64_t spa_feat_for_write_obj; /* required to write to pool */ uint64_t spa_feat_for_read_obj; /* required to read from pool */ uint64_t spa_feat_desc_obj; /* Feature descriptions */ uint64_t spa_feat_enabled_txg_obj; /* Feature enabled txg */ kmutex_t spa_feat_stats_lock; /* protects spa_feat_stats */ nvlist_t *spa_feat_stats; /* Cache of enabled features */ /* cache feature refcounts */ uint64_t spa_feat_refcount_cache[SPA_FEATURES]; taskqid_t spa_deadman_tqid; /* Task id */ uint64_t spa_deadman_calls; /* number of deadman calls */ hrtime_t spa_sync_starttime; /* starting time of spa_sync */ uint64_t spa_deadman_synctime; /* deadman sync expiration */ uint64_t spa_deadman_ziotime; /* deadman zio expiration */ uint64_t spa_all_vdev_zaps; /* ZAP of per-vd ZAP obj #s */ spa_avz_action_t spa_avz_action; /* destroy/rebuild AVZ? */ uint64_t spa_autotrim; /* automatic background trim? */ uint64_t spa_errata; /* errata issues detected */ spa_stats_t spa_stats; /* assorted spa statistics */ spa_keystore_t spa_keystore; /* loaded crypto keys */ /* arc_memory_throttle() parameters during low memory condition */ uint64_t spa_lowmem_page_load; /* memory load during txg */ uint64_t spa_lowmem_last_txg; /* txg window start */ hrtime_t spa_ccw_fail_time; /* Conf cache write fail time */ taskq_t *spa_zvol_taskq; /* Taskq for minor management */ taskq_t *spa_prefetch_taskq; /* Taskq for prefetch threads */ uint64_t spa_multihost; /* multihost aware (mmp) */ mmp_thread_t spa_mmp; /* multihost mmp thread */ list_t spa_leaf_list; /* list of leaf vdevs */ uint64_t spa_leaf_list_gen; /* track leaf_list changes */ uint32_t spa_hostid; /* cached system hostid */ /* synchronization for threads in spa_wait */ kmutex_t spa_activities_lock; kcondvar_t spa_activities_cv; kcondvar_t spa_waiters_cv; int spa_waiters; /* number of waiting threads */ boolean_t spa_waiters_cancel; /* waiters should return */ char *spa_compatibility; /* compatibility file(s) */ /* * spa_refcount & spa_config_lock must be the last elements * because zfs_refcount_t changes size based on compilation options. * In order for the MDB module to function correctly, the other * fields must remain in the same location. */ spa_config_lock_t spa_config_lock[SCL_LOCKS]; /* config changes */ zfs_refcount_t spa_refcount; /* number of opens */ taskq_t *spa_upgrade_taskq; /* taskq for upgrade jobs */ }; extern char *spa_config_path; extern const char *zfs_deadman_failmode; extern uint_t spa_slop_shift; extern void spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent); extern void spa_taskq_dispatch_sync(spa_t *, zio_type_t t, zio_taskq_type_t q, task_func_t *func, void *arg, uint_t flags); extern void spa_load_spares(spa_t *spa); extern void spa_load_l2cache(spa_t *spa); extern sysevent_t *spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name); extern void spa_event_post(sysevent_t *ev); extern int param_set_deadman_failmode_common(const char *val); extern void spa_set_deadman_synctime(hrtime_t ns); extern void spa_set_deadman_ziotime(hrtime_t ns); extern const char *spa_history_zone(void); #ifdef __cplusplus } #endif #endif /* _SYS_SPA_IMPL_H */ diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 06f640769043..3b355e0debcc 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -1,3004 +1,3005 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2019 by Delphix. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. * Copyright (c) 2017 Datto Inc. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019, loli10K . All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_prop.h" #include #include #include #include /* * SPA locking * * There are three basic locks for managing spa_t structures: * * spa_namespace_lock (global mutex) * * This lock must be acquired to do any of the following: * * - Lookup a spa_t by name * - Add or remove a spa_t from the namespace * - Increase spa_refcount from non-zero * - Check if spa_refcount is zero * - Rename a spa_t * - add/remove/attach/detach devices * - Held for the duration of create/destroy/import/export * * It does not need to handle recursion. A create or destroy may * reference objects (files or zvols) in other pools, but by * definition they must have an existing reference, and will never need * to lookup a spa_t by name. * * spa_refcount (per-spa zfs_refcount_t protected by mutex) * * This reference count keep track of any active users of the spa_t. The * spa_t cannot be destroyed or freed while this is non-zero. Internally, * the refcount is never really 'zero' - opening a pool implicitly keeps * some references in the DMU. Internally we check against spa_minref, but * present the image of a zero/non-zero value to consumers. * * spa_config_lock[] (per-spa array of rwlocks) * * This protects the spa_t from config changes, and must be held in * the following circumstances: * * - RW_READER to perform I/O to the spa * - RW_WRITER to change the vdev config * * The locking order is fairly straightforward: * * spa_namespace_lock -> spa_refcount * * The namespace lock must be acquired to increase the refcount from 0 * or to check if it is zero. * * spa_refcount -> spa_config_lock[] * * There must be at least one valid reference on the spa_t to acquire * the config lock. * * spa_namespace_lock -> spa_config_lock[] * * The namespace lock must always be taken before the config lock. * * * The spa_namespace_lock can be acquired directly and is globally visible. * * The namespace is manipulated using the following functions, all of which * require the spa_namespace_lock to be held. * * spa_lookup() Lookup a spa_t by name. * * spa_add() Create a new spa_t in the namespace. * * spa_remove() Remove a spa_t from the namespace. This also * frees up any memory associated with the spa_t. * * spa_next() Returns the next spa_t in the system, or the * first if NULL is passed. * * spa_evict_all() Shutdown and remove all spa_t structures in * the system. * * spa_guid_exists() Determine whether a pool/device guid exists. * * The spa_refcount is manipulated using the following functions: * * spa_open_ref() Adds a reference to the given spa_t. Must be * called with spa_namespace_lock held if the * refcount is currently zero. * * spa_close() Remove a reference from the spa_t. This will * not free the spa_t or remove it from the * namespace. No locking is required. * * spa_refcount_zero() Returns true if the refcount is currently * zero. Must be called with spa_namespace_lock * held. * * The spa_config_lock[] is an array of rwlocks, ordered as follows: * SCL_CONFIG > SCL_STATE > SCL_ALLOC > SCL_ZIO > SCL_FREE > SCL_VDEV. * spa_config_lock[] is manipulated with spa_config_{enter,exit,held}(). * * To read the configuration, it suffices to hold one of these locks as reader. * To modify the configuration, you must hold all locks as writer. To modify * vdev state without altering the vdev tree's topology (e.g. online/offline), * you must hold SCL_STATE and SCL_ZIO as writer. * * We use these distinct config locks to avoid recursive lock entry. * For example, spa_sync() (which holds SCL_CONFIG as reader) induces * block allocations (SCL_ALLOC), which may require reading space maps * from disk (dmu_read() -> zio_read() -> SCL_ZIO). * * The spa config locks cannot be normal rwlocks because we need the * ability to hand off ownership. For example, SCL_ZIO is acquired * by the issuing thread and later released by an interrupt thread. * They do, however, obey the usual write-wanted semantics to prevent * writer (i.e. system administrator) starvation. * * The lock acquisition rules are as follows: * * SCL_CONFIG * Protects changes to the vdev tree topology, such as vdev * add/remove/attach/detach. Protects the dirty config list * (spa_config_dirty_list) and the set of spares and l2arc devices. * * SCL_STATE * Protects changes to pool state and vdev state, such as vdev * online/offline/fault/degrade/clear. Protects the dirty state list * (spa_state_dirty_list) and global pool state (spa_state). * * SCL_ALLOC * Protects changes to metaslab groups and classes. * Held as reader by metaslab_alloc() and metaslab_claim(). * * SCL_ZIO * Held by bp-level zios (those which have no io_vd upon entry) * to prevent changes to the vdev tree. The bp-level zio implicitly * protects all of its vdev child zios, which do not hold SCL_ZIO. * * SCL_FREE * Protects changes to metaslab groups and classes. * Held as reader by metaslab_free(). SCL_FREE is distinct from * SCL_ALLOC, and lower than SCL_ZIO, so that we can safely free * blocks in zio_done() while another i/o that holds either * SCL_ALLOC or SCL_ZIO is waiting for this i/o to complete. * * SCL_VDEV * Held as reader to prevent changes to the vdev tree during trivial * inquiries such as bp_get_dsize(). SCL_VDEV is distinct from the * other locks, and lower than all of them, to ensure that it's safe * to acquire regardless of caller context. * * In addition, the following rules apply: * * (a) spa_props_lock protects pool properties, spa_config and spa_config_list. * The lock ordering is SCL_CONFIG > spa_props_lock. * * (b) I/O operations on leaf vdevs. For any zio operation that takes * an explicit vdev_t argument -- such as zio_ioctl(), zio_read_phys(), * or zio_write_phys() -- the caller must ensure that the config cannot * cannot change in the interim, and that the vdev cannot be reopened. * SCL_STATE as reader suffices for both. * * The vdev configuration is protected by spa_vdev_enter() / spa_vdev_exit(). * * spa_vdev_enter() Acquire the namespace lock and the config lock * for writing. * * spa_vdev_exit() Release the config lock, wait for all I/O * to complete, sync the updated configs to the * cache, and release the namespace lock. * * vdev state is protected by spa_vdev_state_enter() / spa_vdev_state_exit(). * Like spa_vdev_enter/exit, these are convenience wrappers -- the actual * locking is, always, based on spa_namespace_lock and spa_config_lock[]. */ static avl_tree_t spa_namespace_avl; kmutex_t spa_namespace_lock; static kcondvar_t spa_namespace_cv; static const int spa_max_replication_override = SPA_DVAS_PER_BP; static kmutex_t spa_spare_lock; static avl_tree_t spa_spare_avl; static kmutex_t spa_l2cache_lock; static avl_tree_t spa_l2cache_avl; spa_mode_t spa_mode_global = SPA_MODE_UNINIT; #ifdef ZFS_DEBUG /* * Everything except dprintf, set_error, spa, and indirect_remap is on * by default in debug builds. */ int zfs_flags = ~(ZFS_DEBUG_DPRINTF | ZFS_DEBUG_SET_ERROR | ZFS_DEBUG_INDIRECT_REMAP); #else int zfs_flags = 0; #endif /* * zfs_recover can be set to nonzero to attempt to recover from * otherwise-fatal errors, typically caused by on-disk corruption. When * set, calls to zfs_panic_recover() will turn into warning messages. * This should only be used as a last resort, as it typically results * in leaked space, or worse. */ int zfs_recover = B_FALSE; /* * If destroy encounters an EIO while reading metadata (e.g. indirect * blocks), space referenced by the missing metadata can not be freed. * Normally this causes the background destroy to become "stalled", as * it is unable to make forward progress. While in this stalled state, * all remaining space to free from the error-encountering filesystem is * "temporarily leaked". Set this flag to cause it to ignore the EIO, * permanently leak the space from indirect blocks that can not be read, * and continue to free everything else that it can. * * The default, "stalling" behavior is useful if the storage partially * fails (i.e. some but not all i/os fail), and then later recovers. In * this case, we will be able to continue pool operations while it is * partially failed, and when it recovers, we can continue to free the * space, with no leaks. However, note that this case is actually * fairly rare. * * Typically pools either (a) fail completely (but perhaps temporarily, * e.g. a top-level vdev going offline), or (b) have localized, * permanent errors (e.g. disk returns the wrong data due to bit flip or * firmware bug). In case (a), this setting does not matter because the * pool will be suspended and the sync thread will not be able to make * forward progress regardless. In case (b), because the error is * permanent, the best we can do is leak the minimum amount of space, * which is what setting this flag will do. Therefore, it is reasonable * for this flag to normally be set, but we chose the more conservative * approach of not setting it, so that there is no possibility of * leaking space in the "partial temporary" failure case. */ int zfs_free_leak_on_eio = B_FALSE; /* * Expiration time in milliseconds. This value has two meanings. First it is * used to determine when the spa_deadman() logic should fire. By default the * spa_deadman() will fire if spa_sync() has not completed in 600 seconds. * Secondly, the value determines if an I/O is considered "hung". Any I/O that * has not completed in zfs_deadman_synctime_ms is considered "hung" resulting * in one of three behaviors controlled by zfs_deadman_failmode. */ uint64_t zfs_deadman_synctime_ms = 600000UL; /* 10 min. */ /* * This value controls the maximum amount of time zio_wait() will block for an * outstanding IO. By default this is 300 seconds at which point the "hung" * behavior will be applied as described for zfs_deadman_synctime_ms. */ uint64_t zfs_deadman_ziotime_ms = 300000UL; /* 5 min. */ /* * Check time in milliseconds. This defines the frequency at which we check * for hung I/O. */ uint64_t zfs_deadman_checktime_ms = 60000UL; /* 1 min. */ /* * By default the deadman is enabled. */ int zfs_deadman_enabled = B_TRUE; /* * Controls the behavior of the deadman when it detects a "hung" I/O. * Valid values are zfs_deadman_failmode=. * * wait - Wait for the "hung" I/O (default) * continue - Attempt to recover from a "hung" I/O * panic - Panic the system */ const char *zfs_deadman_failmode = "wait"; /* * The worst case is single-sector max-parity RAID-Z blocks, in which * case the space requirement is exactly (VDEV_RAIDZ_MAXPARITY + 1) * times the size; so just assume that. Add to this the fact that * we can have up to 3 DVAs per bp, and one more factor of 2 because * the block may be dittoed with up to 3 DVAs by ddt_sync(). All together, * the worst case is: * (VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2 == 24 */ uint_t spa_asize_inflation = 24; /* * Normally, we don't allow the last 3.2% (1/(2^spa_slop_shift)) of space in * the pool to be consumed (bounded by spa_max_slop). This ensures that we * don't run the pool completely out of space, due to unaccounted changes (e.g. * to the MOS). It also limits the worst-case time to allocate space. If we * have less than this amount of free space, most ZPL operations (e.g. write, * create) will return ENOSPC. The ZIL metaslabs (spa_embedded_log_class) are * also part of this 3.2% of space which can't be consumed by normal writes; * the slop space "proper" (spa_get_slop_space()) is decreased by the embedded * log space. * * Certain operations (e.g. file removal, most administrative actions) can * use half the slop space. They will only return ENOSPC if less than half * the slop space is free. Typically, once the pool has less than the slop * space free, the user will use these operations to free up space in the pool. * These are the operations that call dsl_pool_adjustedsize() with the netfree * argument set to TRUE. * * Operations that are almost guaranteed to free up space in the absence of * a pool checkpoint can use up to three quarters of the slop space * (e.g zfs destroy). * * A very restricted set of operations are always permitted, regardless of * the amount of free space. These are the operations that call * dsl_sync_task(ZFS_SPACE_CHECK_NONE). If these operations result in a net * increase in the amount of space used, it is possible to run the pool * completely out of space, causing it to be permanently read-only. * * Note that on very small pools, the slop space will be larger than * 3.2%, in an effort to have it be at least spa_min_slop (128MB), * but we never allow it to be more than half the pool size. * * Further, on very large pools, the slop space will be smaller than * 3.2%, to avoid reserving much more space than we actually need; bounded * by spa_max_slop (128GB). * * See also the comments in zfs_space_check_t. */ uint_t spa_slop_shift = 5; static const uint64_t spa_min_slop = 128ULL * 1024 * 1024; static const uint64_t spa_max_slop = 128ULL * 1024 * 1024 * 1024; static const int spa_allocators = 4; void spa_load_failed(spa_t *spa, const char *fmt, ...) { va_list adx; char buf[256]; va_start(adx, fmt); (void) vsnprintf(buf, sizeof (buf), fmt, adx); va_end(adx); zfs_dbgmsg("spa_load(%s, config %s): FAILED: %s", spa->spa_name, spa->spa_trust_config ? "trusted" : "untrusted", buf); } void spa_load_note(spa_t *spa, const char *fmt, ...) { va_list adx; char buf[256]; va_start(adx, fmt); (void) vsnprintf(buf, sizeof (buf), fmt, adx); va_end(adx); zfs_dbgmsg("spa_load(%s, config %s): %s", spa->spa_name, spa->spa_trust_config ? "trusted" : "untrusted", buf); } /* * By default dedup and user data indirects land in the special class */ static int zfs_ddt_data_is_special = B_TRUE; static int zfs_user_indirect_is_special = B_TRUE; /* * The percentage of special class final space reserved for metadata only. * Once we allocate 100 - zfs_special_class_metadata_reserve_pct we only * let metadata into the class. */ static uint_t zfs_special_class_metadata_reserve_pct = 25; /* * ========================================================================== * SPA config locking * ========================================================================== */ static void spa_config_lock_init(spa_t *spa) { for (int i = 0; i < SCL_LOCKS; i++) { spa_config_lock_t *scl = &spa->spa_config_lock[i]; mutex_init(&scl->scl_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&scl->scl_cv, NULL, CV_DEFAULT, NULL); scl->scl_writer = NULL; scl->scl_write_wanted = 0; scl->scl_count = 0; } } static void spa_config_lock_destroy(spa_t *spa) { for (int i = 0; i < SCL_LOCKS; i++) { spa_config_lock_t *scl = &spa->spa_config_lock[i]; mutex_destroy(&scl->scl_lock); cv_destroy(&scl->scl_cv); ASSERT(scl->scl_writer == NULL); ASSERT(scl->scl_write_wanted == 0); ASSERT(scl->scl_count == 0); } } int spa_config_tryenter(spa_t *spa, int locks, const void *tag, krw_t rw) { for (int i = 0; i < SCL_LOCKS; i++) { spa_config_lock_t *scl = &spa->spa_config_lock[i]; if (!(locks & (1 << i))) continue; mutex_enter(&scl->scl_lock); if (rw == RW_READER) { if (scl->scl_writer || scl->scl_write_wanted) { mutex_exit(&scl->scl_lock); spa_config_exit(spa, locks & ((1 << i) - 1), tag); return (0); } } else { ASSERT(scl->scl_writer != curthread); if (scl->scl_count != 0) { mutex_exit(&scl->scl_lock); spa_config_exit(spa, locks & ((1 << i) - 1), tag); return (0); } scl->scl_writer = curthread; } scl->scl_count++; mutex_exit(&scl->scl_lock); } return (1); } static void spa_config_enter_impl(spa_t *spa, int locks, const void *tag, krw_t rw, int mmp_flag) { (void) tag; int wlocks_held = 0; ASSERT3U(SCL_LOCKS, <, sizeof (wlocks_held) * NBBY); for (int i = 0; i < SCL_LOCKS; i++) { spa_config_lock_t *scl = &spa->spa_config_lock[i]; if (scl->scl_writer == curthread) wlocks_held |= (1 << i); if (!(locks & (1 << i))) continue; mutex_enter(&scl->scl_lock); if (rw == RW_READER) { while (scl->scl_writer || (!mmp_flag && scl->scl_write_wanted)) { cv_wait(&scl->scl_cv, &scl->scl_lock); } } else { ASSERT(scl->scl_writer != curthread); while (scl->scl_count != 0) { scl->scl_write_wanted++; cv_wait(&scl->scl_cv, &scl->scl_lock); scl->scl_write_wanted--; } scl->scl_writer = curthread; } scl->scl_count++; mutex_exit(&scl->scl_lock); } ASSERT3U(wlocks_held, <=, locks); } void spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw) { spa_config_enter_impl(spa, locks, tag, rw, 0); } /* * The spa_config_enter_mmp() allows the mmp thread to cut in front of * outstanding write lock requests. This is needed since the mmp updates are * time sensitive and failure to service them promptly will result in a * suspended pool. This pool suspension has been seen in practice when there is * a single disk in a pool that is responding slowly and presumably about to * fail. */ void spa_config_enter_mmp(spa_t *spa, int locks, const void *tag, krw_t rw) { spa_config_enter_impl(spa, locks, tag, rw, 1); } void spa_config_exit(spa_t *spa, int locks, const void *tag) { (void) tag; for (int i = SCL_LOCKS - 1; i >= 0; i--) { spa_config_lock_t *scl = &spa->spa_config_lock[i]; if (!(locks & (1 << i))) continue; mutex_enter(&scl->scl_lock); ASSERT(scl->scl_count > 0); if (--scl->scl_count == 0) { ASSERT(scl->scl_writer == NULL || scl->scl_writer == curthread); scl->scl_writer = NULL; /* OK in either case */ cv_broadcast(&scl->scl_cv); } mutex_exit(&scl->scl_lock); } } int spa_config_held(spa_t *spa, int locks, krw_t rw) { int locks_held = 0; for (int i = 0; i < SCL_LOCKS; i++) { spa_config_lock_t *scl = &spa->spa_config_lock[i]; if (!(locks & (1 << i))) continue; if ((rw == RW_READER && scl->scl_count != 0) || (rw == RW_WRITER && scl->scl_writer == curthread)) locks_held |= 1 << i; } return (locks_held); } /* * ========================================================================== * SPA namespace functions * ========================================================================== */ /* * Lookup the named spa_t in the AVL tree. The spa_namespace_lock must be held. * Returns NULL if no matching spa_t is found. */ spa_t * spa_lookup(const char *name) { static spa_t search; /* spa_t is large; don't allocate on stack */ spa_t *spa; avl_index_t where; char *cp; ASSERT(MUTEX_HELD(&spa_namespace_lock)); (void) strlcpy(search.spa_name, name, sizeof (search.spa_name)); /* * If it's a full dataset name, figure out the pool name and * just use that. */ cp = strpbrk(search.spa_name, "/@#"); if (cp != NULL) *cp = '\0'; spa = avl_find(&spa_namespace_avl, &search, &where); return (spa); } /* * Fires when spa_sync has not completed within zfs_deadman_synctime_ms. * If the zfs_deadman_enabled flag is set then it inspects all vdev queues * looking for potentially hung I/Os. */ void spa_deadman(void *arg) { spa_t *spa = arg; /* Disable the deadman if the pool is suspended. */ if (spa_suspended(spa)) return; zfs_dbgmsg("slow spa_sync: started %llu seconds ago, calls %llu", (gethrtime() - spa->spa_sync_starttime) / NANOSEC, (u_longlong_t)++spa->spa_deadman_calls); if (zfs_deadman_enabled) vdev_deadman(spa->spa_root_vdev, FTAG); spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq, spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() + MSEC_TO_TICK(zfs_deadman_checktime_ms)); } static int spa_log_sm_sort_by_txg(const void *va, const void *vb) { const spa_log_sm_t *a = va; const spa_log_sm_t *b = vb; return (TREE_CMP(a->sls_txg, b->sls_txg)); } /* * Create an uninitialized spa_t with the given name. Requires * spa_namespace_lock. The caller must ensure that the spa_t doesn't already * exist by calling spa_lookup() first. */ spa_t * spa_add(const char *name, nvlist_t *config, const char *altroot) { spa_t *spa; spa_config_dirent_t *dp; ASSERT(MUTEX_HELD(&spa_namespace_lock)); spa = kmem_zalloc(sizeof (spa_t), KM_SLEEP); mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_evicting_os_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_proc_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_cksum_tmpls_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_feat_stats_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_flushed_ms_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_activities_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_evicting_os_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_proc_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_activities_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_waiters_cv, NULL, CV_DEFAULT, NULL); for (int t = 0; t < TXG_SIZE; t++) bplist_create(&spa->spa_free_bplist[t]); (void) strlcpy(spa->spa_name, name, sizeof (spa->spa_name)); spa->spa_state = POOL_STATE_UNINITIALIZED; spa->spa_freeze_txg = UINT64_MAX; spa->spa_final_txg = UINT64_MAX; spa->spa_load_max_txg = UINT64_MAX; spa->spa_proc = &p0; spa->spa_proc_state = SPA_PROC_NONE; spa->spa_trust_config = B_TRUE; spa->spa_hostid = zone_get_hostid(NULL); spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime_ms); spa->spa_deadman_ziotime = MSEC2NSEC(zfs_deadman_ziotime_ms); spa_set_deadman_failmode(spa, zfs_deadman_failmode); zfs_refcount_create(&spa->spa_refcount); spa_config_lock_init(spa); spa_stats_init(spa); avl_add(&spa_namespace_avl, spa); /* * Set the alternate root, if there is one. */ if (altroot) spa->spa_root = spa_strdup(altroot); spa->spa_alloc_count = spa_allocators; spa->spa_allocs = kmem_zalloc(spa->spa_alloc_count * sizeof (spa_alloc_t), KM_SLEEP); for (int i = 0; i < spa->spa_alloc_count; i++) { mutex_init(&spa->spa_allocs[i].spaa_lock, NULL, MUTEX_DEFAULT, NULL); avl_create(&spa->spa_allocs[i].spaa_tree, zio_bookmark_compare, sizeof (zio_t), offsetof(zio_t, io_queue_node.a)); } avl_create(&spa->spa_metaslabs_by_flushed, metaslab_sort_by_flushed, sizeof (metaslab_t), offsetof(metaslab_t, ms_spa_txg_node)); avl_create(&spa->spa_sm_logs_by_txg, spa_log_sm_sort_by_txg, sizeof (spa_log_sm_t), offsetof(spa_log_sm_t, sls_node)); list_create(&spa->spa_log_summary, sizeof (log_summary_entry_t), offsetof(log_summary_entry_t, lse_node)); /* * Every pool starts with the default cachefile */ list_create(&spa->spa_config_list, sizeof (spa_config_dirent_t), offsetof(spa_config_dirent_t, scd_link)); dp = kmem_zalloc(sizeof (spa_config_dirent_t), KM_SLEEP); dp->scd_path = altroot ? NULL : spa_strdup(spa_config_path); list_insert_head(&spa->spa_config_list, dp); VERIFY(nvlist_alloc(&spa->spa_load_info, NV_UNIQUE_NAME, KM_SLEEP) == 0); if (config != NULL) { nvlist_t *features; if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_FEATURES_FOR_READ, &features) == 0) { VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0); } VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0); } if (spa->spa_label_features == NULL) { VERIFY(nvlist_alloc(&spa->spa_label_features, NV_UNIQUE_NAME, KM_SLEEP) == 0); } spa->spa_min_ashift = INT_MAX; spa->spa_max_ashift = 0; spa->spa_min_alloc = INT_MAX; + spa->spa_gcd_alloc = INT_MAX; /* Reset cached value */ spa->spa_dedup_dspace = ~0ULL; /* * As a pool is being created, treat all features as disabled by * setting SPA_FEATURE_DISABLED for all entries in the feature * refcount cache. */ for (int i = 0; i < SPA_FEATURES; i++) { spa->spa_feat_refcount_cache[i] = SPA_FEATURE_DISABLED; } list_create(&spa->spa_leaf_list, sizeof (vdev_t), offsetof(vdev_t, vdev_leaf_node)); return (spa); } /* * Removes a spa_t from the namespace, freeing up any memory used. Requires * spa_namespace_lock. This is called only after the spa_t has been closed and * deactivated. */ void spa_remove(spa_t *spa) { spa_config_dirent_t *dp; ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa_state(spa) == POOL_STATE_UNINITIALIZED); ASSERT3U(zfs_refcount_count(&spa->spa_refcount), ==, 0); ASSERT0(spa->spa_waiters); nvlist_free(spa->spa_config_splitting); avl_remove(&spa_namespace_avl, spa); cv_broadcast(&spa_namespace_cv); if (spa->spa_root) spa_strfree(spa->spa_root); while ((dp = list_remove_head(&spa->spa_config_list)) != NULL) { if (dp->scd_path != NULL) spa_strfree(dp->scd_path); kmem_free(dp, sizeof (spa_config_dirent_t)); } for (int i = 0; i < spa->spa_alloc_count; i++) { avl_destroy(&spa->spa_allocs[i].spaa_tree); mutex_destroy(&spa->spa_allocs[i].spaa_lock); } kmem_free(spa->spa_allocs, spa->spa_alloc_count * sizeof (spa_alloc_t)); avl_destroy(&spa->spa_metaslabs_by_flushed); avl_destroy(&spa->spa_sm_logs_by_txg); list_destroy(&spa->spa_log_summary); list_destroy(&spa->spa_config_list); list_destroy(&spa->spa_leaf_list); nvlist_free(spa->spa_label_features); nvlist_free(spa->spa_load_info); nvlist_free(spa->spa_feat_stats); spa_config_set(spa, NULL); zfs_refcount_destroy(&spa->spa_refcount); spa_stats_destroy(spa); spa_config_lock_destroy(spa); for (int t = 0; t < TXG_SIZE; t++) bplist_destroy(&spa->spa_free_bplist[t]); zio_checksum_templates_free(spa); cv_destroy(&spa->spa_async_cv); cv_destroy(&spa->spa_evicting_os_cv); cv_destroy(&spa->spa_proc_cv); cv_destroy(&spa->spa_scrub_io_cv); cv_destroy(&spa->spa_suspend_cv); cv_destroy(&spa->spa_activities_cv); cv_destroy(&spa->spa_waiters_cv); mutex_destroy(&spa->spa_flushed_ms_lock); mutex_destroy(&spa->spa_async_lock); mutex_destroy(&spa->spa_errlist_lock); mutex_destroy(&spa->spa_errlog_lock); mutex_destroy(&spa->spa_evicting_os_lock); mutex_destroy(&spa->spa_history_lock); mutex_destroy(&spa->spa_proc_lock); mutex_destroy(&spa->spa_props_lock); mutex_destroy(&spa->spa_cksum_tmpls_lock); mutex_destroy(&spa->spa_scrub_lock); mutex_destroy(&spa->spa_suspend_lock); mutex_destroy(&spa->spa_vdev_top_lock); mutex_destroy(&spa->spa_feat_stats_lock); mutex_destroy(&spa->spa_activities_lock); kmem_free(spa, sizeof (spa_t)); } /* * Given a pool, return the next pool in the namespace, or NULL if there is * none. If 'prev' is NULL, return the first pool. */ spa_t * spa_next(spa_t *prev) { ASSERT(MUTEX_HELD(&spa_namespace_lock)); if (prev) return (AVL_NEXT(&spa_namespace_avl, prev)); else return (avl_first(&spa_namespace_avl)); } /* * ========================================================================== * SPA refcount functions * ========================================================================== */ /* * Add a reference to the given spa_t. Must have at least one reference, or * have the namespace lock held. */ void spa_open_ref(spa_t *spa, const void *tag) { ASSERT(zfs_refcount_count(&spa->spa_refcount) >= spa->spa_minref || MUTEX_HELD(&spa_namespace_lock)); (void) zfs_refcount_add(&spa->spa_refcount, tag); } /* * Remove a reference to the given spa_t. Must have at least one reference, or * have the namespace lock held. */ void spa_close(spa_t *spa, const void *tag) { ASSERT(zfs_refcount_count(&spa->spa_refcount) > spa->spa_minref || MUTEX_HELD(&spa_namespace_lock)); (void) zfs_refcount_remove(&spa->spa_refcount, tag); } /* * Remove a reference to the given spa_t held by a dsl dir that is * being asynchronously released. Async releases occur from a taskq * performing eviction of dsl datasets and dirs. The namespace lock * isn't held and the hold by the object being evicted may contribute to * spa_minref (e.g. dataset or directory released during pool export), * so the asserts in spa_close() do not apply. */ void spa_async_close(spa_t *spa, const void *tag) { (void) zfs_refcount_remove(&spa->spa_refcount, tag); } /* * Check to see if the spa refcount is zero. Must be called with * spa_namespace_lock held. We really compare against spa_minref, which is the * number of references acquired when opening a pool */ boolean_t spa_refcount_zero(spa_t *spa) { ASSERT(MUTEX_HELD(&spa_namespace_lock)); return (zfs_refcount_count(&spa->spa_refcount) == spa->spa_minref); } /* * ========================================================================== * SPA spare and l2cache tracking * ========================================================================== */ /* * Hot spares and cache devices are tracked using the same code below, * for 'auxiliary' devices. */ typedef struct spa_aux { uint64_t aux_guid; uint64_t aux_pool; avl_node_t aux_avl; int aux_count; } spa_aux_t; static inline int spa_aux_compare(const void *a, const void *b) { const spa_aux_t *sa = (const spa_aux_t *)a; const spa_aux_t *sb = (const spa_aux_t *)b; return (TREE_CMP(sa->aux_guid, sb->aux_guid)); } static void spa_aux_add(vdev_t *vd, avl_tree_t *avl) { avl_index_t where; spa_aux_t search; spa_aux_t *aux; search.aux_guid = vd->vdev_guid; if ((aux = avl_find(avl, &search, &where)) != NULL) { aux->aux_count++; } else { aux = kmem_zalloc(sizeof (spa_aux_t), KM_SLEEP); aux->aux_guid = vd->vdev_guid; aux->aux_count = 1; avl_insert(avl, aux, where); } } static void spa_aux_remove(vdev_t *vd, avl_tree_t *avl) { spa_aux_t search; spa_aux_t *aux; avl_index_t where; search.aux_guid = vd->vdev_guid; aux = avl_find(avl, &search, &where); ASSERT(aux != NULL); if (--aux->aux_count == 0) { avl_remove(avl, aux); kmem_free(aux, sizeof (spa_aux_t)); } else if (aux->aux_pool == spa_guid(vd->vdev_spa)) { aux->aux_pool = 0ULL; } } static boolean_t spa_aux_exists(uint64_t guid, uint64_t *pool, int *refcnt, avl_tree_t *avl) { spa_aux_t search, *found; search.aux_guid = guid; found = avl_find(avl, &search, NULL); if (pool) { if (found) *pool = found->aux_pool; else *pool = 0ULL; } if (refcnt) { if (found) *refcnt = found->aux_count; else *refcnt = 0; } return (found != NULL); } static void spa_aux_activate(vdev_t *vd, avl_tree_t *avl) { spa_aux_t search, *found; avl_index_t where; search.aux_guid = vd->vdev_guid; found = avl_find(avl, &search, &where); ASSERT(found != NULL); ASSERT(found->aux_pool == 0ULL); found->aux_pool = spa_guid(vd->vdev_spa); } /* * Spares are tracked globally due to the following constraints: * * - A spare may be part of multiple pools. * - A spare may be added to a pool even if it's actively in use within * another pool. * - A spare in use in any pool can only be the source of a replacement if * the target is a spare in the same pool. * * We keep track of all spares on the system through the use of a reference * counted AVL tree. When a vdev is added as a spare, or used as a replacement * spare, then we bump the reference count in the AVL tree. In addition, we set * the 'vdev_isspare' member to indicate that the device is a spare (active or * inactive). When a spare is made active (used to replace a device in the * pool), we also keep track of which pool its been made a part of. * * The 'spa_spare_lock' protects the AVL tree. These functions are normally * called under the spa_namespace lock as part of vdev reconfiguration. The * separate spare lock exists for the status query path, which does not need to * be completely consistent with respect to other vdev configuration changes. */ static int spa_spare_compare(const void *a, const void *b) { return (spa_aux_compare(a, b)); } void spa_spare_add(vdev_t *vd) { mutex_enter(&spa_spare_lock); ASSERT(!vd->vdev_isspare); spa_aux_add(vd, &spa_spare_avl); vd->vdev_isspare = B_TRUE; mutex_exit(&spa_spare_lock); } void spa_spare_remove(vdev_t *vd) { mutex_enter(&spa_spare_lock); ASSERT(vd->vdev_isspare); spa_aux_remove(vd, &spa_spare_avl); vd->vdev_isspare = B_FALSE; mutex_exit(&spa_spare_lock); } boolean_t spa_spare_exists(uint64_t guid, uint64_t *pool, int *refcnt) { boolean_t found; mutex_enter(&spa_spare_lock); found = spa_aux_exists(guid, pool, refcnt, &spa_spare_avl); mutex_exit(&spa_spare_lock); return (found); } void spa_spare_activate(vdev_t *vd) { mutex_enter(&spa_spare_lock); ASSERT(vd->vdev_isspare); spa_aux_activate(vd, &spa_spare_avl); mutex_exit(&spa_spare_lock); } /* * Level 2 ARC devices are tracked globally for the same reasons as spares. * Cache devices currently only support one pool per cache device, and so * for these devices the aux reference count is currently unused beyond 1. */ static int spa_l2cache_compare(const void *a, const void *b) { return (spa_aux_compare(a, b)); } void spa_l2cache_add(vdev_t *vd) { mutex_enter(&spa_l2cache_lock); ASSERT(!vd->vdev_isl2cache); spa_aux_add(vd, &spa_l2cache_avl); vd->vdev_isl2cache = B_TRUE; mutex_exit(&spa_l2cache_lock); } void spa_l2cache_remove(vdev_t *vd) { mutex_enter(&spa_l2cache_lock); ASSERT(vd->vdev_isl2cache); spa_aux_remove(vd, &spa_l2cache_avl); vd->vdev_isl2cache = B_FALSE; mutex_exit(&spa_l2cache_lock); } boolean_t spa_l2cache_exists(uint64_t guid, uint64_t *pool) { boolean_t found; mutex_enter(&spa_l2cache_lock); found = spa_aux_exists(guid, pool, NULL, &spa_l2cache_avl); mutex_exit(&spa_l2cache_lock); return (found); } void spa_l2cache_activate(vdev_t *vd) { mutex_enter(&spa_l2cache_lock); ASSERT(vd->vdev_isl2cache); spa_aux_activate(vd, &spa_l2cache_avl); mutex_exit(&spa_l2cache_lock); } /* * ========================================================================== * SPA vdev locking * ========================================================================== */ /* * Lock the given spa_t for the purpose of adding or removing a vdev. * Grabs the global spa_namespace_lock plus the spa config lock for writing. * It returns the next transaction group for the spa_t. */ uint64_t spa_vdev_enter(spa_t *spa) { mutex_enter(&spa->spa_vdev_top_lock); mutex_enter(&spa_namespace_lock); vdev_autotrim_stop_all(spa); return (spa_vdev_config_enter(spa)); } /* * The same as spa_vdev_enter() above but additionally takes the guid of * the vdev being detached. When there is a rebuild in process it will be * suspended while the vdev tree is modified then resumed by spa_vdev_exit(). * The rebuild is canceled if only a single child remains after the detach. */ uint64_t spa_vdev_detach_enter(spa_t *spa, uint64_t guid) { mutex_enter(&spa->spa_vdev_top_lock); mutex_enter(&spa_namespace_lock); vdev_autotrim_stop_all(spa); if (guid != 0) { vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE); if (vd) { vdev_rebuild_stop_wait(vd->vdev_top); } } return (spa_vdev_config_enter(spa)); } /* * Internal implementation for spa_vdev_enter(). Used when a vdev * operation requires multiple syncs (i.e. removing a device) while * keeping the spa_namespace_lock held. */ uint64_t spa_vdev_config_enter(spa_t *spa) { ASSERT(MUTEX_HELD(&spa_namespace_lock)); spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); return (spa_last_synced_txg(spa) + 1); } /* * Used in combination with spa_vdev_config_enter() to allow the syncing * of multiple transactions without releasing the spa_namespace_lock. */ void spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, const char *tag) { ASSERT(MUTEX_HELD(&spa_namespace_lock)); int config_changed = B_FALSE; ASSERT(txg > spa_last_synced_txg(spa)); spa->spa_pending_vdev = NULL; /* * Reassess the DTLs. */ vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE, B_FALSE); if (error == 0 && !list_is_empty(&spa->spa_config_dirty_list)) { config_changed = B_TRUE; spa->spa_config_generation++; } /* * Verify the metaslab classes. */ ASSERT(metaslab_class_validate(spa_normal_class(spa)) == 0); ASSERT(metaslab_class_validate(spa_log_class(spa)) == 0); ASSERT(metaslab_class_validate(spa_embedded_log_class(spa)) == 0); ASSERT(metaslab_class_validate(spa_special_class(spa)) == 0); ASSERT(metaslab_class_validate(spa_dedup_class(spa)) == 0); spa_config_exit(spa, SCL_ALL, spa); /* * Panic the system if the specified tag requires it. This * is useful for ensuring that configurations are updated * transactionally. */ if (zio_injection_enabled) zio_handle_panic_injection(spa, tag, 0); /* * Note: this txg_wait_synced() is important because it ensures * that there won't be more than one config change per txg. * This allows us to use the txg as the generation number. */ if (error == 0) txg_wait_synced(spa->spa_dsl_pool, txg); if (vd != NULL) { ASSERT(!vd->vdev_detached || vd->vdev_dtl_sm == NULL); if (vd->vdev_ops->vdev_op_leaf) { mutex_enter(&vd->vdev_initialize_lock); vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED, NULL); mutex_exit(&vd->vdev_initialize_lock); mutex_enter(&vd->vdev_trim_lock); vdev_trim_stop(vd, VDEV_TRIM_CANCELED, NULL); mutex_exit(&vd->vdev_trim_lock); } /* * The vdev may be both a leaf and top-level device. */ vdev_autotrim_stop_wait(vd); spa_config_enter(spa, SCL_STATE_ALL, spa, RW_WRITER); vdev_free(vd); spa_config_exit(spa, SCL_STATE_ALL, spa); } /* * If the config changed, update the config cache. */ if (config_changed) spa_write_cachefile(spa, B_FALSE, B_TRUE, B_TRUE); } /* * Unlock the spa_t after adding or removing a vdev. Besides undoing the * locking of spa_vdev_enter(), we also want make sure the transactions have * synced to disk, and then update the global configuration cache with the new * information. */ int spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error) { vdev_autotrim_restart(spa); vdev_rebuild_restart(spa); spa_vdev_config_exit(spa, vd, txg, error, FTAG); mutex_exit(&spa_namespace_lock); mutex_exit(&spa->spa_vdev_top_lock); return (error); } /* * Lock the given spa_t for the purpose of changing vdev state. */ void spa_vdev_state_enter(spa_t *spa, int oplocks) { int locks = SCL_STATE_ALL | oplocks; /* * Root pools may need to read of the underlying devfs filesystem * when opening up a vdev. Unfortunately if we're holding the * SCL_ZIO lock it will result in a deadlock when we try to issue * the read from the root filesystem. Instead we "prefetch" * the associated vnodes that we need prior to opening the * underlying devices and cache them so that we can prevent * any I/O when we are doing the actual open. */ if (spa_is_root(spa)) { int low = locks & ~(SCL_ZIO - 1); int high = locks & ~low; spa_config_enter(spa, high, spa, RW_WRITER); vdev_hold(spa->spa_root_vdev); spa_config_enter(spa, low, spa, RW_WRITER); } else { spa_config_enter(spa, locks, spa, RW_WRITER); } spa->spa_vdev_locks = locks; } int spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error) { boolean_t config_changed = B_FALSE; vdev_t *vdev_top; if (vd == NULL || vd == spa->spa_root_vdev) { vdev_top = spa->spa_root_vdev; } else { vdev_top = vd->vdev_top; } if (vd != NULL || error == 0) vdev_dtl_reassess(vdev_top, 0, 0, B_FALSE, B_FALSE); if (vd != NULL) { if (vd != spa->spa_root_vdev) vdev_state_dirty(vdev_top); config_changed = B_TRUE; spa->spa_config_generation++; } if (spa_is_root(spa)) vdev_rele(spa->spa_root_vdev); ASSERT3U(spa->spa_vdev_locks, >=, SCL_STATE_ALL); spa_config_exit(spa, spa->spa_vdev_locks, spa); /* * If anything changed, wait for it to sync. This ensures that, * from the system administrator's perspective, zpool(8) commands * are synchronous. This is important for things like zpool offline: * when the command completes, you expect no further I/O from ZFS. */ if (vd != NULL) txg_wait_synced(spa->spa_dsl_pool, 0); /* * If the config changed, update the config cache. */ if (config_changed) { mutex_enter(&spa_namespace_lock); spa_write_cachefile(spa, B_FALSE, B_TRUE, B_FALSE); mutex_exit(&spa_namespace_lock); } return (error); } /* * ========================================================================== * Miscellaneous functions * ========================================================================== */ void spa_activate_mos_feature(spa_t *spa, const char *feature, dmu_tx_t *tx) { if (!nvlist_exists(spa->spa_label_features, feature)) { fnvlist_add_boolean(spa->spa_label_features, feature); /* * When we are creating the pool (tx_txg==TXG_INITIAL), we can't * dirty the vdev config because lock SCL_CONFIG is not held. * Thankfully, in this case we don't need to dirty the config * because it will be written out anyway when we finish * creating the pool. */ if (tx->tx_txg != TXG_INITIAL) vdev_config_dirty(spa->spa_root_vdev); } } void spa_deactivate_mos_feature(spa_t *spa, const char *feature) { if (nvlist_remove_all(spa->spa_label_features, feature) == 0) vdev_config_dirty(spa->spa_root_vdev); } /* * Return the spa_t associated with given pool_guid, if it exists. If * device_guid is non-zero, determine whether the pool exists *and* contains * a device with the specified device_guid. */ spa_t * spa_by_guid(uint64_t pool_guid, uint64_t device_guid) { spa_t *spa; avl_tree_t *t = &spa_namespace_avl; ASSERT(MUTEX_HELD(&spa_namespace_lock)); for (spa = avl_first(t); spa != NULL; spa = AVL_NEXT(t, spa)) { if (spa->spa_state == POOL_STATE_UNINITIALIZED) continue; if (spa->spa_root_vdev == NULL) continue; if (spa_guid(spa) == pool_guid) { if (device_guid == 0) break; if (vdev_lookup_by_guid(spa->spa_root_vdev, device_guid) != NULL) break; /* * Check any devices we may be in the process of adding. */ if (spa->spa_pending_vdev) { if (vdev_lookup_by_guid(spa->spa_pending_vdev, device_guid) != NULL) break; } } } return (spa); } /* * Determine whether a pool with the given pool_guid exists. */ boolean_t spa_guid_exists(uint64_t pool_guid, uint64_t device_guid) { return (spa_by_guid(pool_guid, device_guid) != NULL); } char * spa_strdup(const char *s) { size_t len; char *new; len = strlen(s); new = kmem_alloc(len + 1, KM_SLEEP); memcpy(new, s, len + 1); return (new); } void spa_strfree(char *s) { kmem_free(s, strlen(s) + 1); } uint64_t spa_generate_guid(spa_t *spa) { uint64_t guid; if (spa != NULL) { do { (void) random_get_pseudo_bytes((void *)&guid, sizeof (guid)); } while (guid == 0 || spa_guid_exists(spa_guid(spa), guid)); } else { do { (void) random_get_pseudo_bytes((void *)&guid, sizeof (guid)); } while (guid == 0 || spa_guid_exists(guid, 0)); } return (guid); } void snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp) { char type[256]; const char *checksum = NULL; const char *compress = NULL; if (bp != NULL) { if (BP_GET_TYPE(bp) & DMU_OT_NEWTYPE) { dmu_object_byteswap_t bswap = DMU_OT_BYTESWAP(BP_GET_TYPE(bp)); (void) snprintf(type, sizeof (type), "bswap %s %s", DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) ? "metadata" : "data", dmu_ot_byteswap[bswap].ob_name); } else { (void) strlcpy(type, dmu_ot[BP_GET_TYPE(bp)].ot_name, sizeof (type)); } if (!BP_IS_EMBEDDED(bp)) { checksum = zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name; } compress = zio_compress_table[BP_GET_COMPRESS(bp)].ci_name; } SNPRINTF_BLKPTR(kmem_scnprintf, ' ', buf, buflen, bp, type, checksum, compress); } void spa_freeze(spa_t *spa) { uint64_t freeze_txg = 0; spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); if (spa->spa_freeze_txg == UINT64_MAX) { freeze_txg = spa_last_synced_txg(spa) + TXG_SIZE; spa->spa_freeze_txg = freeze_txg; } spa_config_exit(spa, SCL_ALL, FTAG); if (freeze_txg != 0) txg_wait_synced(spa_get_dsl(spa), freeze_txg); } void zfs_panic_recover(const char *fmt, ...) { va_list adx; va_start(adx, fmt); vcmn_err(zfs_recover ? CE_WARN : CE_PANIC, fmt, adx); va_end(adx); } /* * This is a stripped-down version of strtoull, suitable only for converting * lowercase hexadecimal numbers that don't overflow. */ uint64_t zfs_strtonum(const char *str, char **nptr) { uint64_t val = 0; char c; int digit; while ((c = *str) != '\0') { if (c >= '0' && c <= '9') digit = c - '0'; else if (c >= 'a' && c <= 'f') digit = 10 + c - 'a'; else break; val *= 16; val += digit; str++; } if (nptr) *nptr = (char *)str; return (val); } void spa_activate_allocation_classes(spa_t *spa, dmu_tx_t *tx) { /* * We bump the feature refcount for each special vdev added to the pool */ ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_ALLOCATION_CLASSES)); spa_feature_incr(spa, SPA_FEATURE_ALLOCATION_CLASSES, tx); } /* * ========================================================================== * Accessor functions * ========================================================================== */ boolean_t spa_shutting_down(spa_t *spa) { return (spa->spa_async_suspended); } dsl_pool_t * spa_get_dsl(spa_t *spa) { return (spa->spa_dsl_pool); } boolean_t spa_is_initializing(spa_t *spa) { return (spa->spa_is_initializing); } boolean_t spa_indirect_vdevs_loaded(spa_t *spa) { return (spa->spa_indirect_vdevs_loaded); } blkptr_t * spa_get_rootblkptr(spa_t *spa) { return (&spa->spa_ubsync.ub_rootbp); } void spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp) { spa->spa_uberblock.ub_rootbp = *bp; } void spa_altroot(spa_t *spa, char *buf, size_t buflen) { if (spa->spa_root == NULL) buf[0] = '\0'; else (void) strlcpy(buf, spa->spa_root, buflen); } uint32_t spa_sync_pass(spa_t *spa) { return (spa->spa_sync_pass); } char * spa_name(spa_t *spa) { return (spa->spa_name); } uint64_t spa_guid(spa_t *spa) { dsl_pool_t *dp = spa_get_dsl(spa); uint64_t guid; /* * If we fail to parse the config during spa_load(), we can go through * the error path (which posts an ereport) and end up here with no root * vdev. We stash the original pool guid in 'spa_config_guid' to handle * this case. */ if (spa->spa_root_vdev == NULL) return (spa->spa_config_guid); guid = spa->spa_last_synced_guid != 0 ? spa->spa_last_synced_guid : spa->spa_root_vdev->vdev_guid; /* * Return the most recently synced out guid unless we're * in syncing context. */ if (dp && dsl_pool_sync_context(dp)) return (spa->spa_root_vdev->vdev_guid); else return (guid); } uint64_t spa_load_guid(spa_t *spa) { /* * This is a GUID that exists solely as a reference for the * purposes of the arc. It is generated at load time, and * is never written to persistent storage. */ return (spa->spa_load_guid); } uint64_t spa_last_synced_txg(spa_t *spa) { return (spa->spa_ubsync.ub_txg); } uint64_t spa_first_txg(spa_t *spa) { return (spa->spa_first_txg); } uint64_t spa_syncing_txg(spa_t *spa) { return (spa->spa_syncing_txg); } /* * Return the last txg where data can be dirtied. The final txgs * will be used to just clear out any deferred frees that remain. */ uint64_t spa_final_dirty_txg(spa_t *spa) { return (spa->spa_final_txg - TXG_DEFER_SIZE); } pool_state_t spa_state(spa_t *spa) { return (spa->spa_state); } spa_load_state_t spa_load_state(spa_t *spa) { return (spa->spa_load_state); } uint64_t spa_freeze_txg(spa_t *spa) { return (spa->spa_freeze_txg); } /* * Return the inflated asize for a logical write in bytes. This is used by the * DMU to calculate the space a logical write will require on disk. * If lsize is smaller than the largest physical block size allocatable on this * pool we use its value instead, since the write will end up using the whole * block anyway. */ uint64_t spa_get_worst_case_asize(spa_t *spa, uint64_t lsize) { if (lsize == 0) return (0); /* No inflation needed */ return (MAX(lsize, 1 << spa->spa_max_ashift) * spa_asize_inflation); } /* * Return the amount of slop space in bytes. It is typically 1/32 of the pool * (3.2%), minus the embedded log space. On very small pools, it may be * slightly larger than this. On very large pools, it will be capped to * the value of spa_max_slop. The embedded log space is not included in * spa_dspace. By subtracting it, the usable space (per "zfs list") is a * constant 97% of the total space, regardless of metaslab size (assuming the * default spa_slop_shift=5 and a non-tiny pool). * * See the comment above spa_slop_shift for more details. */ uint64_t spa_get_slop_space(spa_t *spa) { uint64_t space = 0; uint64_t slop = 0; /* * Make sure spa_dedup_dspace has been set. */ if (spa->spa_dedup_dspace == ~0ULL) spa_update_dspace(spa); /* * spa_get_dspace() includes the space only logically "used" by * deduplicated data, so since it's not useful to reserve more * space with more deduplicated data, we subtract that out here. */ space = spa_get_dspace(spa) - spa->spa_dedup_dspace; slop = MIN(space >> spa_slop_shift, spa_max_slop); /* * Subtract the embedded log space, but no more than half the (3.2%) * unusable space. Note, the "no more than half" is only relevant if * zfs_embedded_slog_min_ms >> spa_slop_shift < 2, which is not true by * default. */ uint64_t embedded_log = metaslab_class_get_dspace(spa_embedded_log_class(spa)); slop -= MIN(embedded_log, slop >> 1); /* * Slop space should be at least spa_min_slop, but no more than half * the entire pool. */ slop = MAX(slop, MIN(space >> 1, spa_min_slop)); return (slop); } uint64_t spa_get_dspace(spa_t *spa) { return (spa->spa_dspace); } uint64_t spa_get_checkpoint_space(spa_t *spa) { return (spa->spa_checkpoint_info.sci_dspace); } void spa_update_dspace(spa_t *spa) { spa->spa_dspace = metaslab_class_get_dspace(spa_normal_class(spa)) + ddt_get_dedup_dspace(spa) + brt_get_dspace(spa); if (spa->spa_nonallocating_dspace > 0) { /* * Subtract the space provided by all non-allocating vdevs that * contribute to dspace. If a file is overwritten, its old * blocks are freed and new blocks are allocated. If there are * no snapshots of the file, the available space should remain * the same. The old blocks could be freed from the * non-allocating vdev, but the new blocks must be allocated on * other (allocating) vdevs. By reserving the entire size of * the non-allocating vdevs (including allocated space), we * ensure that there will be enough space on the allocating * vdevs for this file overwrite to succeed. * * Note that the DMU/DSL doesn't actually know or care * how much space is allocated (it does its own tracking * of how much space has been logically used). So it * doesn't matter that the data we are moving may be * allocated twice (on the old device and the new device). */ ASSERT3U(spa->spa_dspace, >=, spa->spa_nonallocating_dspace); spa->spa_dspace -= spa->spa_nonallocating_dspace; } } /* * Return the failure mode that has been set to this pool. The default * behavior will be to block all I/Os when a complete failure occurs. */ uint64_t spa_get_failmode(spa_t *spa) { return (spa->spa_failmode); } boolean_t spa_suspended(spa_t *spa) { return (spa->spa_suspended != ZIO_SUSPEND_NONE); } uint64_t spa_version(spa_t *spa) { return (spa->spa_ubsync.ub_version); } boolean_t spa_deflate(spa_t *spa) { return (spa->spa_deflate); } metaslab_class_t * spa_normal_class(spa_t *spa) { return (spa->spa_normal_class); } metaslab_class_t * spa_log_class(spa_t *spa) { return (spa->spa_log_class); } metaslab_class_t * spa_embedded_log_class(spa_t *spa) { return (spa->spa_embedded_log_class); } metaslab_class_t * spa_special_class(spa_t *spa) { return (spa->spa_special_class); } metaslab_class_t * spa_dedup_class(spa_t *spa) { return (spa->spa_dedup_class); } /* * Locate an appropriate allocation class */ metaslab_class_t * spa_preferred_class(spa_t *spa, uint64_t size, dmu_object_type_t objtype, uint_t level, uint_t special_smallblk) { /* * ZIL allocations determine their class in zio_alloc_zil(). */ ASSERT(objtype != DMU_OT_INTENT_LOG); boolean_t has_special_class = spa->spa_special_class->mc_groups != 0; if (DMU_OT_IS_DDT(objtype)) { if (spa->spa_dedup_class->mc_groups != 0) return (spa_dedup_class(spa)); else if (has_special_class && zfs_ddt_data_is_special) return (spa_special_class(spa)); else return (spa_normal_class(spa)); } /* Indirect blocks for user data can land in special if allowed */ if (level > 0 && (DMU_OT_IS_FILE(objtype) || objtype == DMU_OT_ZVOL)) { if (has_special_class && zfs_user_indirect_is_special) return (spa_special_class(spa)); else return (spa_normal_class(spa)); } if (DMU_OT_IS_METADATA(objtype) || level > 0) { if (has_special_class) return (spa_special_class(spa)); else return (spa_normal_class(spa)); } /* * Allow small file blocks in special class in some cases (like * for the dRAID vdev feature). But always leave a reserve of * zfs_special_class_metadata_reserve_pct exclusively for metadata. */ if (DMU_OT_IS_FILE(objtype) && has_special_class && size <= special_smallblk) { metaslab_class_t *special = spa_special_class(spa); uint64_t alloc = metaslab_class_get_alloc(special); uint64_t space = metaslab_class_get_space(special); uint64_t limit = (space * (100 - zfs_special_class_metadata_reserve_pct)) / 100; if (alloc < limit) return (special); } return (spa_normal_class(spa)); } void spa_evicting_os_register(spa_t *spa, objset_t *os) { mutex_enter(&spa->spa_evicting_os_lock); list_insert_head(&spa->spa_evicting_os_list, os); mutex_exit(&spa->spa_evicting_os_lock); } void spa_evicting_os_deregister(spa_t *spa, objset_t *os) { mutex_enter(&spa->spa_evicting_os_lock); list_remove(&spa->spa_evicting_os_list, os); cv_broadcast(&spa->spa_evicting_os_cv); mutex_exit(&spa->spa_evicting_os_lock); } void spa_evicting_os_wait(spa_t *spa) { mutex_enter(&spa->spa_evicting_os_lock); while (!list_is_empty(&spa->spa_evicting_os_list)) cv_wait(&spa->spa_evicting_os_cv, &spa->spa_evicting_os_lock); mutex_exit(&spa->spa_evicting_os_lock); dmu_buf_user_evict_wait(); } int spa_max_replication(spa_t *spa) { /* * As of SPA_VERSION == SPA_VERSION_DITTO_BLOCKS, we are able to * handle BPs with more than one DVA allocated. Set our max * replication level accordingly. */ if (spa_version(spa) < SPA_VERSION_DITTO_BLOCKS) return (1); return (MIN(SPA_DVAS_PER_BP, spa_max_replication_override)); } int spa_prev_software_version(spa_t *spa) { return (spa->spa_prev_software_version); } uint64_t spa_deadman_synctime(spa_t *spa) { return (spa->spa_deadman_synctime); } spa_autotrim_t spa_get_autotrim(spa_t *spa) { return (spa->spa_autotrim); } uint64_t spa_deadman_ziotime(spa_t *spa) { return (spa->spa_deadman_ziotime); } uint64_t spa_get_deadman_failmode(spa_t *spa) { return (spa->spa_deadman_failmode); } void spa_set_deadman_failmode(spa_t *spa, const char *failmode) { if (strcmp(failmode, "wait") == 0) spa->spa_deadman_failmode = ZIO_FAILURE_MODE_WAIT; else if (strcmp(failmode, "continue") == 0) spa->spa_deadman_failmode = ZIO_FAILURE_MODE_CONTINUE; else if (strcmp(failmode, "panic") == 0) spa->spa_deadman_failmode = ZIO_FAILURE_MODE_PANIC; else spa->spa_deadman_failmode = ZIO_FAILURE_MODE_WAIT; } void spa_set_deadman_ziotime(hrtime_t ns) { spa_t *spa = NULL; if (spa_mode_global != SPA_MODE_UNINIT) { mutex_enter(&spa_namespace_lock); while ((spa = spa_next(spa)) != NULL) spa->spa_deadman_ziotime = ns; mutex_exit(&spa_namespace_lock); } } void spa_set_deadman_synctime(hrtime_t ns) { spa_t *spa = NULL; if (spa_mode_global != SPA_MODE_UNINIT) { mutex_enter(&spa_namespace_lock); while ((spa = spa_next(spa)) != NULL) spa->spa_deadman_synctime = ns; mutex_exit(&spa_namespace_lock); } } uint64_t dva_get_dsize_sync(spa_t *spa, const dva_t *dva) { uint64_t asize = DVA_GET_ASIZE(dva); uint64_t dsize = asize; ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); if (asize != 0 && spa->spa_deflate) { vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva)); if (vd != NULL) dsize = (asize >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio; } return (dsize); } uint64_t bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp) { uint64_t dsize = 0; for (int d = 0; d < BP_GET_NDVAS(bp); d++) dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]); return (dsize); } uint64_t bp_get_dsize(spa_t *spa, const blkptr_t *bp) { uint64_t dsize = 0; spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); for (int d = 0; d < BP_GET_NDVAS(bp); d++) dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]); spa_config_exit(spa, SCL_VDEV, FTAG); return (dsize); } uint64_t spa_dirty_data(spa_t *spa) { return (spa->spa_dsl_pool->dp_dirty_total); } /* * ========================================================================== * SPA Import Progress Routines * ========================================================================== */ typedef struct spa_import_progress { uint64_t pool_guid; /* unique id for updates */ char *pool_name; spa_load_state_t spa_load_state; uint64_t mmp_sec_remaining; /* MMP activity check */ uint64_t spa_load_max_txg; /* rewind txg */ procfs_list_node_t smh_node; } spa_import_progress_t; spa_history_list_t *spa_import_progress_list = NULL; static int spa_import_progress_show_header(struct seq_file *f) { seq_printf(f, "%-20s %-14s %-14s %-12s %s\n", "pool_guid", "load_state", "multihost_secs", "max_txg", "pool_name"); return (0); } static int spa_import_progress_show(struct seq_file *f, void *data) { spa_import_progress_t *sip = (spa_import_progress_t *)data; seq_printf(f, "%-20llu %-14llu %-14llu %-12llu %s\n", (u_longlong_t)sip->pool_guid, (u_longlong_t)sip->spa_load_state, (u_longlong_t)sip->mmp_sec_remaining, (u_longlong_t)sip->spa_load_max_txg, (sip->pool_name ? sip->pool_name : "-")); return (0); } /* Remove oldest elements from list until there are no more than 'size' left */ static void spa_import_progress_truncate(spa_history_list_t *shl, unsigned int size) { spa_import_progress_t *sip; while (shl->size > size) { sip = list_remove_head(&shl->procfs_list.pl_list); if (sip->pool_name) spa_strfree(sip->pool_name); kmem_free(sip, sizeof (spa_import_progress_t)); shl->size--; } IMPLY(size == 0, list_is_empty(&shl->procfs_list.pl_list)); } static void spa_import_progress_init(void) { spa_import_progress_list = kmem_zalloc(sizeof (spa_history_list_t), KM_SLEEP); spa_import_progress_list->size = 0; spa_import_progress_list->procfs_list.pl_private = spa_import_progress_list; procfs_list_install("zfs", NULL, "import_progress", 0644, &spa_import_progress_list->procfs_list, spa_import_progress_show, spa_import_progress_show_header, NULL, offsetof(spa_import_progress_t, smh_node)); } static void spa_import_progress_destroy(void) { spa_history_list_t *shl = spa_import_progress_list; procfs_list_uninstall(&shl->procfs_list); spa_import_progress_truncate(shl, 0); procfs_list_destroy(&shl->procfs_list); kmem_free(shl, sizeof (spa_history_list_t)); } int spa_import_progress_set_state(uint64_t pool_guid, spa_load_state_t load_state) { spa_history_list_t *shl = spa_import_progress_list; spa_import_progress_t *sip; int error = ENOENT; if (shl->size == 0) return (0); mutex_enter(&shl->procfs_list.pl_lock); for (sip = list_tail(&shl->procfs_list.pl_list); sip != NULL; sip = list_prev(&shl->procfs_list.pl_list, sip)) { if (sip->pool_guid == pool_guid) { sip->spa_load_state = load_state; error = 0; break; } } mutex_exit(&shl->procfs_list.pl_lock); return (error); } int spa_import_progress_set_max_txg(uint64_t pool_guid, uint64_t load_max_txg) { spa_history_list_t *shl = spa_import_progress_list; spa_import_progress_t *sip; int error = ENOENT; if (shl->size == 0) return (0); mutex_enter(&shl->procfs_list.pl_lock); for (sip = list_tail(&shl->procfs_list.pl_list); sip != NULL; sip = list_prev(&shl->procfs_list.pl_list, sip)) { if (sip->pool_guid == pool_guid) { sip->spa_load_max_txg = load_max_txg; error = 0; break; } } mutex_exit(&shl->procfs_list.pl_lock); return (error); } int spa_import_progress_set_mmp_check(uint64_t pool_guid, uint64_t mmp_sec_remaining) { spa_history_list_t *shl = spa_import_progress_list; spa_import_progress_t *sip; int error = ENOENT; if (shl->size == 0) return (0); mutex_enter(&shl->procfs_list.pl_lock); for (sip = list_tail(&shl->procfs_list.pl_list); sip != NULL; sip = list_prev(&shl->procfs_list.pl_list, sip)) { if (sip->pool_guid == pool_guid) { sip->mmp_sec_remaining = mmp_sec_remaining; error = 0; break; } } mutex_exit(&shl->procfs_list.pl_lock); return (error); } /* * A new import is in progress, add an entry. */ void spa_import_progress_add(spa_t *spa) { spa_history_list_t *shl = spa_import_progress_list; spa_import_progress_t *sip; const char *poolname = NULL; sip = kmem_zalloc(sizeof (spa_import_progress_t), KM_SLEEP); sip->pool_guid = spa_guid(spa); (void) nvlist_lookup_string(spa->spa_config, ZPOOL_CONFIG_POOL_NAME, &poolname); if (poolname == NULL) poolname = spa_name(spa); sip->pool_name = spa_strdup(poolname); sip->spa_load_state = spa_load_state(spa); mutex_enter(&shl->procfs_list.pl_lock); procfs_list_add(&shl->procfs_list, sip); shl->size++; mutex_exit(&shl->procfs_list.pl_lock); } void spa_import_progress_remove(uint64_t pool_guid) { spa_history_list_t *shl = spa_import_progress_list; spa_import_progress_t *sip; mutex_enter(&shl->procfs_list.pl_lock); for (sip = list_tail(&shl->procfs_list.pl_list); sip != NULL; sip = list_prev(&shl->procfs_list.pl_list, sip)) { if (sip->pool_guid == pool_guid) { if (sip->pool_name) spa_strfree(sip->pool_name); list_remove(&shl->procfs_list.pl_list, sip); shl->size--; kmem_free(sip, sizeof (spa_import_progress_t)); break; } } mutex_exit(&shl->procfs_list.pl_lock); } /* * ========================================================================== * Initialization and Termination * ========================================================================== */ static int spa_name_compare(const void *a1, const void *a2) { const spa_t *s1 = a1; const spa_t *s2 = a2; int s; s = strcmp(s1->spa_name, s2->spa_name); return (TREE_ISIGN(s)); } void spa_boot_init(void) { spa_config_load(); } void spa_init(spa_mode_t mode) { mutex_init(&spa_namespace_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa_spare_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa_l2cache_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&spa_namespace_cv, NULL, CV_DEFAULT, NULL); avl_create(&spa_namespace_avl, spa_name_compare, sizeof (spa_t), offsetof(spa_t, spa_avl)); avl_create(&spa_spare_avl, spa_spare_compare, sizeof (spa_aux_t), offsetof(spa_aux_t, aux_avl)); avl_create(&spa_l2cache_avl, spa_l2cache_compare, sizeof (spa_aux_t), offsetof(spa_aux_t, aux_avl)); spa_mode_global = mode; #ifndef _KERNEL if (spa_mode_global != SPA_MODE_READ && dprintf_find_string("watch")) { struct sigaction sa; sa.sa_flags = SA_SIGINFO; sigemptyset(&sa.sa_mask); sa.sa_sigaction = arc_buf_sigsegv; if (sigaction(SIGSEGV, &sa, NULL) == -1) { perror("could not enable watchpoints: " "sigaction(SIGSEGV, ...) = "); } else { arc_watch = B_TRUE; } } #endif fm_init(); zfs_refcount_init(); unique_init(); zfs_btree_init(); metaslab_stat_init(); brt_init(); ddt_init(); zio_init(); dmu_init(); zil_init(); vdev_mirror_stat_init(); vdev_raidz_math_init(); vdev_file_init(); zfs_prop_init(); chksum_init(); zpool_prop_init(); zpool_feature_init(); spa_config_load(); vdev_prop_init(); l2arc_start(); scan_init(); qat_init(); spa_import_progress_init(); } void spa_fini(void) { l2arc_stop(); spa_evict_all(); vdev_file_fini(); vdev_mirror_stat_fini(); vdev_raidz_math_fini(); chksum_fini(); zil_fini(); dmu_fini(); zio_fini(); ddt_fini(); brt_fini(); metaslab_stat_fini(); zfs_btree_fini(); unique_fini(); zfs_refcount_fini(); fm_fini(); scan_fini(); qat_fini(); spa_import_progress_destroy(); avl_destroy(&spa_namespace_avl); avl_destroy(&spa_spare_avl); avl_destroy(&spa_l2cache_avl); cv_destroy(&spa_namespace_cv); mutex_destroy(&spa_namespace_lock); mutex_destroy(&spa_spare_lock); mutex_destroy(&spa_l2cache_lock); } /* * Return whether this pool has a dedicated slog device. No locking needed. * It's not a problem if the wrong answer is returned as it's only for * performance and not correctness. */ boolean_t spa_has_slogs(spa_t *spa) { return (spa->spa_log_class->mc_groups != 0); } spa_log_state_t spa_get_log_state(spa_t *spa) { return (spa->spa_log_state); } void spa_set_log_state(spa_t *spa, spa_log_state_t state) { spa->spa_log_state = state; } boolean_t spa_is_root(spa_t *spa) { return (spa->spa_is_root); } boolean_t spa_writeable(spa_t *spa) { return (!!(spa->spa_mode & SPA_MODE_WRITE) && spa->spa_trust_config); } /* * Returns true if there is a pending sync task in any of the current * syncing txg, the current quiescing txg, or the current open txg. */ boolean_t spa_has_pending_synctask(spa_t *spa) { return (!txg_all_lists_empty(&spa->spa_dsl_pool->dp_sync_tasks) || !txg_all_lists_empty(&spa->spa_dsl_pool->dp_early_sync_tasks)); } spa_mode_t spa_mode(spa_t *spa) { return (spa->spa_mode); } uint64_t spa_bootfs(spa_t *spa) { return (spa->spa_bootfs); } uint64_t spa_delegation(spa_t *spa) { return (spa->spa_delegation); } objset_t * spa_meta_objset(spa_t *spa) { return (spa->spa_meta_objset); } enum zio_checksum spa_dedup_checksum(spa_t *spa) { return (spa->spa_dedup_checksum); } /* * Reset pool scan stat per scan pass (or reboot). */ void spa_scan_stat_init(spa_t *spa) { /* data not stored on disk */ spa->spa_scan_pass_start = gethrestime_sec(); if (dsl_scan_is_paused_scrub(spa->spa_dsl_pool->dp_scan)) spa->spa_scan_pass_scrub_pause = spa->spa_scan_pass_start; else spa->spa_scan_pass_scrub_pause = 0; if (dsl_errorscrub_is_paused(spa->spa_dsl_pool->dp_scan)) spa->spa_scan_pass_errorscrub_pause = spa->spa_scan_pass_start; else spa->spa_scan_pass_errorscrub_pause = 0; spa->spa_scan_pass_scrub_spent_paused = 0; spa->spa_scan_pass_exam = 0; spa->spa_scan_pass_issued = 0; // error scrub stats spa->spa_scan_pass_errorscrub_spent_paused = 0; } /* * Get scan stats for zpool status reports */ int spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps) { dsl_scan_t *scn = spa->spa_dsl_pool ? spa->spa_dsl_pool->dp_scan : NULL; if (scn == NULL || (scn->scn_phys.scn_func == POOL_SCAN_NONE && scn->errorscrub_phys.dep_func == POOL_SCAN_NONE)) return (SET_ERROR(ENOENT)); memset(ps, 0, sizeof (pool_scan_stat_t)); /* data stored on disk */ ps->pss_func = scn->scn_phys.scn_func; ps->pss_state = scn->scn_phys.scn_state; ps->pss_start_time = scn->scn_phys.scn_start_time; ps->pss_end_time = scn->scn_phys.scn_end_time; ps->pss_to_examine = scn->scn_phys.scn_to_examine; ps->pss_examined = scn->scn_phys.scn_examined; ps->pss_skipped = scn->scn_phys.scn_skipped; ps->pss_processed = scn->scn_phys.scn_processed; ps->pss_errors = scn->scn_phys.scn_errors; /* data not stored on disk */ ps->pss_pass_exam = spa->spa_scan_pass_exam; ps->pss_pass_start = spa->spa_scan_pass_start; ps->pss_pass_scrub_pause = spa->spa_scan_pass_scrub_pause; ps->pss_pass_scrub_spent_paused = spa->spa_scan_pass_scrub_spent_paused; ps->pss_pass_issued = spa->spa_scan_pass_issued; ps->pss_issued = scn->scn_issued_before_pass + spa->spa_scan_pass_issued; /* error scrub data stored on disk */ ps->pss_error_scrub_func = scn->errorscrub_phys.dep_func; ps->pss_error_scrub_state = scn->errorscrub_phys.dep_state; ps->pss_error_scrub_start = scn->errorscrub_phys.dep_start_time; ps->pss_error_scrub_end = scn->errorscrub_phys.dep_end_time; ps->pss_error_scrub_examined = scn->errorscrub_phys.dep_examined; ps->pss_error_scrub_to_be_examined = scn->errorscrub_phys.dep_to_examine; /* error scrub data not stored on disk */ ps->pss_pass_error_scrub_pause = spa->spa_scan_pass_errorscrub_pause; return (0); } int spa_maxblocksize(spa_t *spa) { if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) return (SPA_MAXBLOCKSIZE); else return (SPA_OLD_MAXBLOCKSIZE); } /* * Returns the txg that the last device removal completed. No indirect mappings * have been added since this txg. */ uint64_t spa_get_last_removal_txg(spa_t *spa) { uint64_t vdevid; uint64_t ret = -1ULL; spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); /* * sr_prev_indirect_vdev is only modified while holding all the * config locks, so it is sufficient to hold SCL_VDEV as reader when * examining it. */ vdevid = spa->spa_removing_phys.sr_prev_indirect_vdev; while (vdevid != -1ULL) { vdev_t *vd = vdev_lookup_top(spa, vdevid); vdev_indirect_births_t *vib = vd->vdev_indirect_births; ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); /* * If the removal did not remap any data, we don't care. */ if (vdev_indirect_births_count(vib) != 0) { ret = vdev_indirect_births_last_entry_txg(vib); break; } vdevid = vd->vdev_indirect_config.vic_prev_indirect_vdev; } spa_config_exit(spa, SCL_VDEV, FTAG); IMPLY(ret != -1ULL, spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL)); return (ret); } int spa_maxdnodesize(spa_t *spa) { if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) return (DNODE_MAX_SIZE); else return (DNODE_MIN_SIZE); } boolean_t spa_multihost(spa_t *spa) { return (spa->spa_multihost ? B_TRUE : B_FALSE); } uint32_t spa_get_hostid(spa_t *spa) { return (spa->spa_hostid); } boolean_t spa_trust_config(spa_t *spa) { return (spa->spa_trust_config); } uint64_t spa_missing_tvds_allowed(spa_t *spa) { return (spa->spa_missing_tvds_allowed); } space_map_t * spa_syncing_log_sm(spa_t *spa) { return (spa->spa_syncing_log_sm); } void spa_set_missing_tvds(spa_t *spa, uint64_t missing) { spa->spa_missing_tvds = missing; } /* * Return the pool state string ("ONLINE", "DEGRADED", "SUSPENDED", etc). */ const char * spa_state_to_name(spa_t *spa) { ASSERT3P(spa, !=, NULL); /* * it is possible for the spa to exist, without root vdev * as the spa transitions during import/export */ vdev_t *rvd = spa->spa_root_vdev; if (rvd == NULL) { return ("TRANSITIONING"); } vdev_state_t state = rvd->vdev_state; vdev_aux_t aux = rvd->vdev_stat.vs_aux; if (spa_suspended(spa) && (spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE)) return ("SUSPENDED"); switch (state) { case VDEV_STATE_CLOSED: case VDEV_STATE_OFFLINE: return ("OFFLINE"); case VDEV_STATE_REMOVED: return ("REMOVED"); case VDEV_STATE_CANT_OPEN: if (aux == VDEV_AUX_CORRUPT_DATA || aux == VDEV_AUX_BAD_LOG) return ("FAULTED"); else if (aux == VDEV_AUX_SPLIT_POOL) return ("SPLIT"); else return ("UNAVAIL"); case VDEV_STATE_FAULTED: return ("FAULTED"); case VDEV_STATE_DEGRADED: return ("DEGRADED"); case VDEV_STATE_HEALTHY: return ("ONLINE"); default: break; } return ("UNKNOWN"); } boolean_t spa_top_vdevs_spacemap_addressable(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; for (uint64_t c = 0; c < rvd->vdev_children; c++) { if (!vdev_is_spacemap_addressable(rvd->vdev_child[c])) return (B_FALSE); } return (B_TRUE); } boolean_t spa_has_checkpoint(spa_t *spa) { return (spa->spa_checkpoint_txg != 0); } boolean_t spa_importing_readonly_checkpoint(spa_t *spa) { return ((spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT) && spa->spa_mode == SPA_MODE_READ); } uint64_t spa_min_claim_txg(spa_t *spa) { uint64_t checkpoint_txg = spa->spa_uberblock.ub_checkpoint_txg; if (checkpoint_txg != 0) return (checkpoint_txg + 1); return (spa->spa_first_txg); } /* * If there is a checkpoint, async destroys may consume more space from * the pool instead of freeing it. In an attempt to save the pool from * getting suspended when it is about to run out of space, we stop * processing async destroys. */ boolean_t spa_suspend_async_destroy(spa_t *spa) { dsl_pool_t *dp = spa_get_dsl(spa); uint64_t unreserved = dsl_pool_unreserved_space(dp, ZFS_SPACE_CHECK_EXTRA_RESERVED); uint64_t used = dsl_dir_phys(dp->dp_root_dir)->dd_used_bytes; uint64_t avail = (unreserved > used) ? (unreserved - used) : 0; if (spa_has_checkpoint(spa) && avail == 0) return (B_TRUE); return (B_FALSE); } #if defined(_KERNEL) int param_set_deadman_failmode_common(const char *val) { spa_t *spa = NULL; char *p; if (val == NULL) return (SET_ERROR(EINVAL)); if ((p = strchr(val, '\n')) != NULL) *p = '\0'; if (strcmp(val, "wait") != 0 && strcmp(val, "continue") != 0 && strcmp(val, "panic")) return (SET_ERROR(EINVAL)); if (spa_mode_global != SPA_MODE_UNINIT) { mutex_enter(&spa_namespace_lock); while ((spa = spa_next(spa)) != NULL) spa_set_deadman_failmode(spa, val); mutex_exit(&spa_namespace_lock); } return (0); } #endif /* Namespace manipulation */ EXPORT_SYMBOL(spa_lookup); EXPORT_SYMBOL(spa_add); EXPORT_SYMBOL(spa_remove); EXPORT_SYMBOL(spa_next); /* Refcount functions */ EXPORT_SYMBOL(spa_open_ref); EXPORT_SYMBOL(spa_close); EXPORT_SYMBOL(spa_refcount_zero); /* Pool configuration lock */ EXPORT_SYMBOL(spa_config_tryenter); EXPORT_SYMBOL(spa_config_enter); EXPORT_SYMBOL(spa_config_exit); EXPORT_SYMBOL(spa_config_held); /* Pool vdev add/remove lock */ EXPORT_SYMBOL(spa_vdev_enter); EXPORT_SYMBOL(spa_vdev_exit); /* Pool vdev state change lock */ EXPORT_SYMBOL(spa_vdev_state_enter); EXPORT_SYMBOL(spa_vdev_state_exit); /* Accessor functions */ EXPORT_SYMBOL(spa_shutting_down); EXPORT_SYMBOL(spa_get_dsl); EXPORT_SYMBOL(spa_get_rootblkptr); EXPORT_SYMBOL(spa_set_rootblkptr); EXPORT_SYMBOL(spa_altroot); EXPORT_SYMBOL(spa_sync_pass); EXPORT_SYMBOL(spa_name); EXPORT_SYMBOL(spa_guid); EXPORT_SYMBOL(spa_last_synced_txg); EXPORT_SYMBOL(spa_first_txg); EXPORT_SYMBOL(spa_syncing_txg); EXPORT_SYMBOL(spa_version); EXPORT_SYMBOL(spa_state); EXPORT_SYMBOL(spa_load_state); EXPORT_SYMBOL(spa_freeze_txg); EXPORT_SYMBOL(spa_get_dspace); EXPORT_SYMBOL(spa_update_dspace); EXPORT_SYMBOL(spa_deflate); EXPORT_SYMBOL(spa_normal_class); EXPORT_SYMBOL(spa_log_class); EXPORT_SYMBOL(spa_special_class); EXPORT_SYMBOL(spa_preferred_class); EXPORT_SYMBOL(spa_max_replication); EXPORT_SYMBOL(spa_prev_software_version); EXPORT_SYMBOL(spa_get_failmode); EXPORT_SYMBOL(spa_suspended); EXPORT_SYMBOL(spa_bootfs); EXPORT_SYMBOL(spa_delegation); EXPORT_SYMBOL(spa_meta_objset); EXPORT_SYMBOL(spa_maxblocksize); EXPORT_SYMBOL(spa_maxdnodesize); /* Miscellaneous support routines */ EXPORT_SYMBOL(spa_guid_exists); EXPORT_SYMBOL(spa_strdup); EXPORT_SYMBOL(spa_strfree); EXPORT_SYMBOL(spa_generate_guid); EXPORT_SYMBOL(snprintf_blkptr); EXPORT_SYMBOL(spa_freeze); EXPORT_SYMBOL(spa_upgrade); EXPORT_SYMBOL(spa_evict_all); EXPORT_SYMBOL(spa_lookup_by_guid); EXPORT_SYMBOL(spa_has_spare); EXPORT_SYMBOL(dva_get_dsize_sync); EXPORT_SYMBOL(bp_get_dsize_sync); EXPORT_SYMBOL(bp_get_dsize); EXPORT_SYMBOL(spa_has_slogs); EXPORT_SYMBOL(spa_is_root); EXPORT_SYMBOL(spa_writeable); EXPORT_SYMBOL(spa_mode); EXPORT_SYMBOL(spa_namespace_lock); EXPORT_SYMBOL(spa_trust_config); EXPORT_SYMBOL(spa_missing_tvds_allowed); EXPORT_SYMBOL(spa_set_missing_tvds); EXPORT_SYMBOL(spa_state_to_name); EXPORT_SYMBOL(spa_importing_readonly_checkpoint); EXPORT_SYMBOL(spa_min_claim_txg); EXPORT_SYMBOL(spa_suspend_async_destroy); EXPORT_SYMBOL(spa_has_checkpoint); EXPORT_SYMBOL(spa_top_vdevs_spacemap_addressable); ZFS_MODULE_PARAM(zfs, zfs_, flags, UINT, ZMOD_RW, "Set additional debugging flags"); ZFS_MODULE_PARAM(zfs, zfs_, recover, INT, ZMOD_RW, "Set to attempt to recover from fatal errors"); ZFS_MODULE_PARAM(zfs, zfs_, free_leak_on_eio, INT, ZMOD_RW, "Set to ignore IO errors during free and permanently leak the space"); ZFS_MODULE_PARAM(zfs_deadman, zfs_deadman_, checktime_ms, U64, ZMOD_RW, "Dead I/O check interval in milliseconds"); ZFS_MODULE_PARAM(zfs_deadman, zfs_deadman_, enabled, INT, ZMOD_RW, "Enable deadman timer"); ZFS_MODULE_PARAM(zfs_spa, spa_, asize_inflation, UINT, ZMOD_RW, "SPA size estimate multiplication factor"); ZFS_MODULE_PARAM(zfs, zfs_, ddt_data_is_special, INT, ZMOD_RW, "Place DDT data into the special class"); ZFS_MODULE_PARAM(zfs, zfs_, user_indirect_is_special, INT, ZMOD_RW, "Place user data indirect blocks into the special class"); /* BEGIN CSTYLED */ ZFS_MODULE_PARAM_CALL(zfs_deadman, zfs_deadman_, failmode, param_set_deadman_failmode, param_get_charp, ZMOD_RW, "Failmode for deadman timer"); ZFS_MODULE_PARAM_CALL(zfs_deadman, zfs_deadman_, synctime_ms, param_set_deadman_synctime, spl_param_get_u64, ZMOD_RW, "Pool sync expiration time in milliseconds"); ZFS_MODULE_PARAM_CALL(zfs_deadman, zfs_deadman_, ziotime_ms, param_set_deadman_ziotime, spl_param_get_u64, ZMOD_RW, "IO expiration time in milliseconds"); ZFS_MODULE_PARAM(zfs, zfs_, special_class_metadata_reserve_pct, UINT, ZMOD_RW, "Small file blocks in special vdevs depends on this much " "free space available"); /* END CSTYLED */ ZFS_MODULE_PARAM_CALL(zfs_spa, spa_, slop_shift, param_set_slop_shift, param_get_uint, ZMOD_RW, "Reserved free space in pool"); diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index b6f8c0ab302e..f3812b843e95 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -1,6386 +1,6414 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2021 by Delphix. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Toomas Soome * Copyright 2017 Joyent, Inc. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019, Datto Inc. All rights reserved. * Copyright (c) 2021, Klara Inc. * Copyright (c) 2021, 2023 Hewlett Packard Enterprise Development LP. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_prop.h" /* * One metaslab from each (normal-class) vdev is used by the ZIL. These are * called "embedded slog metaslabs", are referenced by vdev_log_mg, and are * part of the spa_embedded_log_class. The metaslab with the most free space * in each vdev is selected for this purpose when the pool is opened (or a * vdev is added). See vdev_metaslab_init(). * * Log blocks can be allocated from the following locations. Each one is tried * in order until the allocation succeeds: * 1. dedicated log vdevs, aka "slog" (spa_log_class) * 2. embedded slog metaslabs (spa_embedded_log_class) * 3. other metaslabs in normal vdevs (spa_normal_class) * * zfs_embedded_slog_min_ms disables the embedded slog if there are fewer * than this number of metaslabs in the vdev. This ensures that we don't set * aside an unreasonable amount of space for the ZIL. If set to less than * 1 << (spa_slop_shift + 1), on small pools the usable space may be reduced * (by more than 1<vdev_path != NULL) { zfs_dbgmsg("%s vdev '%s': %s", vd->vdev_ops->vdev_op_type, vd->vdev_path, buf); } else { zfs_dbgmsg("%s-%llu vdev (guid %llu): %s", vd->vdev_ops->vdev_op_type, (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid, buf); } } void vdev_dbgmsg_print_tree(vdev_t *vd, int indent) { char state[20]; if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) { zfs_dbgmsg("%*svdev %llu: %s", indent, "", (u_longlong_t)vd->vdev_id, vd->vdev_ops->vdev_op_type); return; } switch (vd->vdev_state) { case VDEV_STATE_UNKNOWN: (void) snprintf(state, sizeof (state), "unknown"); break; case VDEV_STATE_CLOSED: (void) snprintf(state, sizeof (state), "closed"); break; case VDEV_STATE_OFFLINE: (void) snprintf(state, sizeof (state), "offline"); break; case VDEV_STATE_REMOVED: (void) snprintf(state, sizeof (state), "removed"); break; case VDEV_STATE_CANT_OPEN: (void) snprintf(state, sizeof (state), "can't open"); break; case VDEV_STATE_FAULTED: (void) snprintf(state, sizeof (state), "faulted"); break; case VDEV_STATE_DEGRADED: (void) snprintf(state, sizeof (state), "degraded"); break; case VDEV_STATE_HEALTHY: (void) snprintf(state, sizeof (state), "healthy"); break; default: (void) snprintf(state, sizeof (state), "", (uint_t)vd->vdev_state); } zfs_dbgmsg("%*svdev %u: %s%s, guid: %llu, path: %s, %s", indent, "", (int)vd->vdev_id, vd->vdev_ops->vdev_op_type, vd->vdev_islog ? " (log)" : "", (u_longlong_t)vd->vdev_guid, vd->vdev_path ? vd->vdev_path : "N/A", state); for (uint64_t i = 0; i < vd->vdev_children; i++) vdev_dbgmsg_print_tree(vd->vdev_child[i], indent + 2); } /* * Virtual device management. */ static vdev_ops_t *const vdev_ops_table[] = { &vdev_root_ops, &vdev_raidz_ops, &vdev_draid_ops, &vdev_draid_spare_ops, &vdev_mirror_ops, &vdev_replacing_ops, &vdev_spare_ops, &vdev_disk_ops, &vdev_file_ops, &vdev_missing_ops, &vdev_hole_ops, &vdev_indirect_ops, NULL }; /* * Given a vdev type, return the appropriate ops vector. */ static vdev_ops_t * vdev_getops(const char *type) { vdev_ops_t *ops, *const *opspp; for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++) if (strcmp(ops->vdev_op_type, type) == 0) break; return (ops); } /* * Given a vdev and a metaslab class, find which metaslab group we're * interested in. All vdevs may belong to two different metaslab classes. * Dedicated slog devices use only the primary metaslab group, rather than a * separate log group. For embedded slogs, the vdev_log_mg will be non-NULL. */ metaslab_group_t * vdev_get_mg(vdev_t *vd, metaslab_class_t *mc) { if (mc == spa_embedded_log_class(vd->vdev_spa) && vd->vdev_log_mg != NULL) return (vd->vdev_log_mg); else return (vd->vdev_mg); } void vdev_default_xlate(vdev_t *vd, const range_seg64_t *logical_rs, range_seg64_t *physical_rs, range_seg64_t *remain_rs) { (void) vd, (void) remain_rs; physical_rs->rs_start = logical_rs->rs_start; physical_rs->rs_end = logical_rs->rs_end; } /* * Derive the enumerated allocation bias from string input. * String origin is either the per-vdev zap or zpool(8). */ static vdev_alloc_bias_t vdev_derive_alloc_bias(const char *bias) { vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE; if (strcmp(bias, VDEV_ALLOC_BIAS_LOG) == 0) alloc_bias = VDEV_BIAS_LOG; else if (strcmp(bias, VDEV_ALLOC_BIAS_SPECIAL) == 0) alloc_bias = VDEV_BIAS_SPECIAL; else if (strcmp(bias, VDEV_ALLOC_BIAS_DEDUP) == 0) alloc_bias = VDEV_BIAS_DEDUP; return (alloc_bias); } /* * Default asize function: return the MAX of psize with the asize of * all children. This is what's used by anything other than RAID-Z. */ uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize) { uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift); uint64_t csize; for (int c = 0; c < vd->vdev_children; c++) { csize = vdev_psize_to_asize(vd->vdev_child[c], psize); asize = MAX(asize, csize); } return (asize); } uint64_t vdev_default_min_asize(vdev_t *vd) { return (vd->vdev_min_asize); } /* * Get the minimum allocatable size. We define the allocatable size as * the vdev's asize rounded to the nearest metaslab. This allows us to * replace or attach devices which don't have the same physical size but * can still satisfy the same number of allocations. */ uint64_t vdev_get_min_asize(vdev_t *vd) { vdev_t *pvd = vd->vdev_parent; /* * If our parent is NULL (inactive spare or cache) or is the root, * just return our own asize. */ if (pvd == NULL) return (vd->vdev_asize); /* * The top-level vdev just returns the allocatable size rounded * to the nearest metaslab. */ if (vd == vd->vdev_top) return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift)); return (pvd->vdev_ops->vdev_op_min_asize(pvd)); } void vdev_set_min_asize(vdev_t *vd) { vd->vdev_min_asize = vdev_get_min_asize(vd); for (int c = 0; c < vd->vdev_children; c++) vdev_set_min_asize(vd->vdev_child[c]); } /* * Get the minimal allocation size for the top-level vdev. */ uint64_t vdev_get_min_alloc(vdev_t *vd) { uint64_t min_alloc = 1ULL << vd->vdev_ashift; if (vd->vdev_ops->vdev_op_min_alloc != NULL) min_alloc = vd->vdev_ops->vdev_op_min_alloc(vd); return (min_alloc); } /* * Get the parity level for a top-level vdev. */ uint64_t vdev_get_nparity(vdev_t *vd) { uint64_t nparity = 0; if (vd->vdev_ops->vdev_op_nparity != NULL) nparity = vd->vdev_ops->vdev_op_nparity(vd); return (nparity); } static int vdev_prop_get_int(vdev_t *vd, vdev_prop_t prop, uint64_t *value) { spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; uint64_t objid; int err; if (vd->vdev_root_zap != 0) { objid = vd->vdev_root_zap; } else if (vd->vdev_top_zap != 0) { objid = vd->vdev_top_zap; } else if (vd->vdev_leaf_zap != 0) { objid = vd->vdev_leaf_zap; } else { return (EINVAL); } err = zap_lookup(mos, objid, vdev_prop_to_name(prop), sizeof (uint64_t), 1, value); if (err == ENOENT) *value = vdev_prop_default_numeric(prop); return (err); } /* * Get the number of data disks for a top-level vdev. */ uint64_t vdev_get_ndisks(vdev_t *vd) { uint64_t ndisks = 1; if (vd->vdev_ops->vdev_op_ndisks != NULL) ndisks = vd->vdev_ops->vdev_op_ndisks(vd); return (ndisks); } vdev_t * vdev_lookup_top(spa_t *spa, uint64_t vdev) { vdev_t *rvd = spa->spa_root_vdev; ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); if (vdev < rvd->vdev_children) { ASSERT(rvd->vdev_child[vdev] != NULL); return (rvd->vdev_child[vdev]); } return (NULL); } vdev_t * vdev_lookup_by_guid(vdev_t *vd, uint64_t guid) { vdev_t *mvd; if (vd->vdev_guid == guid) return (vd); for (int c = 0; c < vd->vdev_children; c++) if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) != NULL) return (mvd); return (NULL); } static int vdev_count_leaves_impl(vdev_t *vd) { int n = 0; if (vd->vdev_ops->vdev_op_leaf) return (1); for (int c = 0; c < vd->vdev_children; c++) n += vdev_count_leaves_impl(vd->vdev_child[c]); return (n); } int vdev_count_leaves(spa_t *spa) { int rc; spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); rc = vdev_count_leaves_impl(spa->spa_root_vdev); spa_config_exit(spa, SCL_VDEV, FTAG); return (rc); } void vdev_add_child(vdev_t *pvd, vdev_t *cvd) { size_t oldsize, newsize; uint64_t id = cvd->vdev_id; vdev_t **newchild; ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); ASSERT(cvd->vdev_parent == NULL); cvd->vdev_parent = pvd; if (pvd == NULL) return; ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL); oldsize = pvd->vdev_children * sizeof (vdev_t *); pvd->vdev_children = MAX(pvd->vdev_children, id + 1); newsize = pvd->vdev_children * sizeof (vdev_t *); newchild = kmem_alloc(newsize, KM_SLEEP); if (pvd->vdev_child != NULL) { memcpy(newchild, pvd->vdev_child, oldsize); kmem_free(pvd->vdev_child, oldsize); } pvd->vdev_child = newchild; pvd->vdev_child[id] = cvd; cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd); ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL); /* * Walk up all ancestors to update guid sum. */ for (; pvd != NULL; pvd = pvd->vdev_parent) pvd->vdev_guid_sum += cvd->vdev_guid_sum; if (cvd->vdev_ops->vdev_op_leaf) { list_insert_head(&cvd->vdev_spa->spa_leaf_list, cvd); cvd->vdev_spa->spa_leaf_list_gen++; } } void vdev_remove_child(vdev_t *pvd, vdev_t *cvd) { int c; uint_t id = cvd->vdev_id; ASSERT(cvd->vdev_parent == pvd); if (pvd == NULL) return; ASSERT(id < pvd->vdev_children); ASSERT(pvd->vdev_child[id] == cvd); pvd->vdev_child[id] = NULL; cvd->vdev_parent = NULL; for (c = 0; c < pvd->vdev_children; c++) if (pvd->vdev_child[c]) break; if (c == pvd->vdev_children) { kmem_free(pvd->vdev_child, c * sizeof (vdev_t *)); pvd->vdev_child = NULL; pvd->vdev_children = 0; } if (cvd->vdev_ops->vdev_op_leaf) { spa_t *spa = cvd->vdev_spa; list_remove(&spa->spa_leaf_list, cvd); spa->spa_leaf_list_gen++; } /* * Walk up all ancestors to update guid sum. */ for (; pvd != NULL; pvd = pvd->vdev_parent) pvd->vdev_guid_sum -= cvd->vdev_guid_sum; } /* * Remove any holes in the child array. */ void vdev_compact_children(vdev_t *pvd) { vdev_t **newchild, *cvd; int oldc = pvd->vdev_children; int newc; ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); if (oldc == 0) return; for (int c = newc = 0; c < oldc; c++) if (pvd->vdev_child[c]) newc++; if (newc > 0) { newchild = kmem_zalloc(newc * sizeof (vdev_t *), KM_SLEEP); for (int c = newc = 0; c < oldc; c++) { if ((cvd = pvd->vdev_child[c]) != NULL) { newchild[newc] = cvd; cvd->vdev_id = newc++; } } } else { newchild = NULL; } kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *)); pvd->vdev_child = newchild; pvd->vdev_children = newc; } /* * Allocate and minimally initialize a vdev_t. */ vdev_t * vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) { vdev_t *vd; vdev_indirect_config_t *vic; vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP); vic = &vd->vdev_indirect_config; if (spa->spa_root_vdev == NULL) { ASSERT(ops == &vdev_root_ops); spa->spa_root_vdev = vd; spa->spa_load_guid = spa_generate_guid(NULL); } if (guid == 0 && ops != &vdev_hole_ops) { if (spa->spa_root_vdev == vd) { /* * The root vdev's guid will also be the pool guid, * which must be unique among all pools. */ guid = spa_generate_guid(NULL); } else { /* * Any other vdev's guid must be unique within the pool. */ guid = spa_generate_guid(spa); } ASSERT(!spa_guid_exists(spa_guid(spa), guid)); } vd->vdev_spa = spa; vd->vdev_id = id; vd->vdev_guid = guid; vd->vdev_guid_sum = guid; vd->vdev_ops = ops; vd->vdev_state = VDEV_STATE_CLOSED; vd->vdev_ishole = (ops == &vdev_hole_ops); vic->vic_prev_indirect_vdev = UINT64_MAX; rw_init(&vd->vdev_indirect_rwlock, NULL, RW_DEFAULT, NULL); mutex_init(&vd->vdev_obsolete_lock, NULL, MUTEX_DEFAULT, NULL); vd->vdev_obsolete_segments = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); /* * Initialize rate limit structs for events. We rate limit ZIO delay * and checksum events so that we don't overwhelm ZED with thousands * of events when a disk is acting up. */ zfs_ratelimit_init(&vd->vdev_delay_rl, &zfs_slow_io_events_per_second, 1); zfs_ratelimit_init(&vd->vdev_deadman_rl, &zfs_slow_io_events_per_second, 1); zfs_ratelimit_init(&vd->vdev_checksum_rl, &zfs_checksum_events_per_second, 1); /* * Default Thresholds for tuning ZED */ vd->vdev_checksum_n = vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_N); vd->vdev_checksum_t = vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_T); vd->vdev_io_n = vdev_prop_default_numeric(VDEV_PROP_IO_N); vd->vdev_io_t = vdev_prop_default_numeric(VDEV_PROP_IO_T); list_link_init(&vd->vdev_config_dirty_node); list_link_init(&vd->vdev_state_dirty_node); list_link_init(&vd->vdev_initialize_node); list_link_init(&vd->vdev_leaf_node); list_link_init(&vd->vdev_trim_node); mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_NOLOCKDEP, NULL); mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_scan_io_queue_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_initialize_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_initialize_io_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&vd->vdev_initialize_cv, NULL, CV_DEFAULT, NULL); cv_init(&vd->vdev_initialize_io_cv, NULL, CV_DEFAULT, NULL); mutex_init(&vd->vdev_trim_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_autotrim_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_trim_io_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&vd->vdev_trim_cv, NULL, CV_DEFAULT, NULL); cv_init(&vd->vdev_autotrim_cv, NULL, CV_DEFAULT, NULL); cv_init(&vd->vdev_autotrim_kick_cv, NULL, CV_DEFAULT, NULL); cv_init(&vd->vdev_trim_io_cv, NULL, CV_DEFAULT, NULL); mutex_init(&vd->vdev_rebuild_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&vd->vdev_rebuild_cv, NULL, CV_DEFAULT, NULL); for (int t = 0; t < DTL_TYPES; t++) { vd->vdev_dtl[t] = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); } txg_list_create(&vd->vdev_ms_list, spa, offsetof(struct metaslab, ms_txg_node)); txg_list_create(&vd->vdev_dtl_list, spa, offsetof(struct vdev, vdev_dtl_node)); vd->vdev_stat.vs_timestamp = gethrtime(); vdev_queue_init(vd); return (vd); } /* * Allocate a new vdev. The 'alloctype' is used to control whether we are * creating a new vdev or loading an existing one - the behavior is slightly * different for each case. */ int vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, int alloctype) { vdev_ops_t *ops; const char *type; uint64_t guid = 0, islog; vdev_t *vd; vdev_indirect_config_t *vic; const char *tmp = NULL; int rc; vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE; boolean_t top_level = (parent && !parent->vdev_parent); ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0) return (SET_ERROR(EINVAL)); if ((ops = vdev_getops(type)) == NULL) return (SET_ERROR(EINVAL)); /* * If this is a load, get the vdev guid from the nvlist. * Otherwise, vdev_alloc_common() will generate one for us. */ if (alloctype == VDEV_ALLOC_LOAD) { uint64_t label_id; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) || label_id != id) return (SET_ERROR(EINVAL)); if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (SET_ERROR(EINVAL)); } else if (alloctype == VDEV_ALLOC_SPARE) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (SET_ERROR(EINVAL)); } else if (alloctype == VDEV_ALLOC_L2CACHE) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (SET_ERROR(EINVAL)); } else if (alloctype == VDEV_ALLOC_ROOTPOOL) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (SET_ERROR(EINVAL)); } /* * The first allocated vdev must be of type 'root'. */ if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL) return (SET_ERROR(EINVAL)); /* * Determine whether we're a log vdev. */ islog = 0; (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog); if (islog && spa_version(spa) < SPA_VERSION_SLOGS) return (SET_ERROR(ENOTSUP)); if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES) return (SET_ERROR(ENOTSUP)); if (top_level && alloctype == VDEV_ALLOC_ADD) { const char *bias; /* * If creating a top-level vdev, check for allocation * classes input. */ if (nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, &bias) == 0) { alloc_bias = vdev_derive_alloc_bias(bias); /* spa_vdev_add() expects feature to be enabled */ if (spa->spa_load_state != SPA_LOAD_CREATE && !spa_feature_is_enabled(spa, SPA_FEATURE_ALLOCATION_CLASSES)) { return (SET_ERROR(ENOTSUP)); } } /* spa_vdev_add() expects feature to be enabled */ if (ops == &vdev_draid_ops && spa->spa_load_state != SPA_LOAD_CREATE && !spa_feature_is_enabled(spa, SPA_FEATURE_DRAID)) { return (SET_ERROR(ENOTSUP)); } } /* * Initialize the vdev specific data. This is done before calling * vdev_alloc_common() since it may fail and this simplifies the * error reporting and cleanup code paths. */ void *tsd = NULL; if (ops->vdev_op_init != NULL) { rc = ops->vdev_op_init(spa, nv, &tsd); if (rc != 0) { return (rc); } } vd = vdev_alloc_common(spa, id, guid, ops); vd->vdev_tsd = tsd; vd->vdev_islog = islog; if (top_level && alloc_bias != VDEV_BIAS_NONE) vd->vdev_alloc_bias = alloc_bias; if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &tmp) == 0) vd->vdev_path = spa_strdup(tmp); /* * ZPOOL_CONFIG_AUX_STATE = "external" means we previously forced a * fault on a vdev and want it to persist across imports (like with * zpool offline -f). */ rc = nvlist_lookup_string(nv, ZPOOL_CONFIG_AUX_STATE, &tmp); if (rc == 0 && tmp != NULL && strcmp(tmp, "external") == 0) { vd->vdev_stat.vs_aux = VDEV_AUX_EXTERNAL; vd->vdev_faulted = 1; vd->vdev_label_aux = VDEV_AUX_EXTERNAL; } if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &tmp) == 0) vd->vdev_devid = spa_strdup(tmp); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH, &tmp) == 0) vd->vdev_physpath = spa_strdup(tmp); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, &tmp) == 0) vd->vdev_enc_sysfs_path = spa_strdup(tmp); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &tmp) == 0) vd->vdev_fru = spa_strdup(tmp); /* * Set the whole_disk property. If it's not specified, leave the value * as -1. */ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &vd->vdev_wholedisk) != 0) vd->vdev_wholedisk = -1ULL; vic = &vd->vdev_indirect_config; ASSERT0(vic->vic_mapping_object); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT, &vic->vic_mapping_object); ASSERT0(vic->vic_births_object); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_BIRTHS, &vic->vic_births_object); ASSERT3U(vic->vic_prev_indirect_vdev, ==, UINT64_MAX); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_PREV_INDIRECT_VDEV, &vic->vic_prev_indirect_vdev); /* * Look for the 'not present' flag. This will only be set if the device * was not present at the time of import. */ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, &vd->vdev_not_present); /* * Get the alignment requirement. Ignore pool ashift for vdev * attach case. */ if (alloctype != VDEV_ALLOC_ATTACH) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift); } else { vd->vdev_attaching = B_TRUE; } /* * Retrieve the vdev creation time. */ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, &vd->vdev_crtxg); if (vd->vdev_ops == &vdev_root_ops && (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT || alloctype == VDEV_ALLOC_ROOTPOOL)) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_ROOT_ZAP, &vd->vdev_root_zap); } /* * If we're a top-level vdev, try to load the allocation parameters. */ if (top_level && (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, &vd->vdev_ms_array); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT, &vd->vdev_ms_shift); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE, &vd->vdev_asize); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NONALLOCATING, &vd->vdev_noalloc); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING, &vd->vdev_removing); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP, &vd->vdev_top_zap); } else { ASSERT0(vd->vdev_top_zap); } if (top_level && alloctype != VDEV_ALLOC_ATTACH) { ASSERT(alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_ADD || alloctype == VDEV_ALLOC_SPLIT || alloctype == VDEV_ALLOC_ROOTPOOL); /* Note: metaslab_group_create() is now deferred */ } if (vd->vdev_ops->vdev_op_leaf && (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_LEAF_ZAP, &vd->vdev_leaf_zap); } else { ASSERT0(vd->vdev_leaf_zap); } /* * If we're a leaf vdev, try to load the DTL object and other state. */ if (vd->vdev_ops->vdev_op_leaf && (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE || alloctype == VDEV_ALLOC_ROOTPOOL)) { if (alloctype == VDEV_ALLOC_LOAD) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL, &vd->vdev_dtl_object); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE, &vd->vdev_unspare); } if (alloctype == VDEV_ALLOC_ROOTPOOL) { uint64_t spare = 0; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE, &spare) == 0 && spare) spa_spare_add(vd); } (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, &vd->vdev_offline); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG, &vd->vdev_resilver_txg); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REBUILD_TXG, &vd->vdev_rebuild_txg); if (nvlist_exists(nv, ZPOOL_CONFIG_RESILVER_DEFER)) vdev_defer_resilver(vd); /* * In general, when importing a pool we want to ignore the * persistent fault state, as the diagnosis made on another * system may not be valid in the current context. The only * exception is if we forced a vdev to a persistently faulted * state with 'zpool offline -f'. The persistent fault will * remain across imports until cleared. * * Local vdevs will remain in the faulted state. */ if (spa_load_state(spa) == SPA_LOAD_OPEN || spa_load_state(spa) == SPA_LOAD_IMPORT) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, &vd->vdev_faulted); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED, &vd->vdev_degraded); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, &vd->vdev_removed); if (vd->vdev_faulted || vd->vdev_degraded) { const char *aux; vd->vdev_label_aux = VDEV_AUX_ERR_EXCEEDED; if (nvlist_lookup_string(nv, ZPOOL_CONFIG_AUX_STATE, &aux) == 0 && strcmp(aux, "external") == 0) vd->vdev_label_aux = VDEV_AUX_EXTERNAL; else vd->vdev_faulted = 0ULL; } } } /* * Add ourselves to the parent's list of children. */ vdev_add_child(parent, vd); *vdp = vd; return (0); } void vdev_free(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT3P(vd->vdev_initialize_thread, ==, NULL); ASSERT3P(vd->vdev_trim_thread, ==, NULL); ASSERT3P(vd->vdev_autotrim_thread, ==, NULL); ASSERT3P(vd->vdev_rebuild_thread, ==, NULL); /* * Scan queues are normally destroyed at the end of a scan. If the * queue exists here, that implies the vdev is being removed while * the scan is still running. */ if (vd->vdev_scan_io_queue != NULL) { mutex_enter(&vd->vdev_scan_io_queue_lock); dsl_scan_io_queue_destroy(vd->vdev_scan_io_queue); vd->vdev_scan_io_queue = NULL; mutex_exit(&vd->vdev_scan_io_queue_lock); } /* * vdev_free() implies closing the vdev first. This is simpler than * trying to ensure complicated semantics for all callers. */ vdev_close(vd); ASSERT(!list_link_active(&vd->vdev_config_dirty_node)); ASSERT(!list_link_active(&vd->vdev_state_dirty_node)); /* * Free all children. */ for (int c = 0; c < vd->vdev_children; c++) vdev_free(vd->vdev_child[c]); ASSERT(vd->vdev_child == NULL); ASSERT(vd->vdev_guid_sum == vd->vdev_guid); if (vd->vdev_ops->vdev_op_fini != NULL) vd->vdev_ops->vdev_op_fini(vd); /* * Discard allocation state. */ if (vd->vdev_mg != NULL) { vdev_metaslab_fini(vd); metaslab_group_destroy(vd->vdev_mg); vd->vdev_mg = NULL; } if (vd->vdev_log_mg != NULL) { ASSERT0(vd->vdev_ms_count); metaslab_group_destroy(vd->vdev_log_mg); vd->vdev_log_mg = NULL; } ASSERT0(vd->vdev_stat.vs_space); ASSERT0(vd->vdev_stat.vs_dspace); ASSERT0(vd->vdev_stat.vs_alloc); /* * Remove this vdev from its parent's child list. */ vdev_remove_child(vd->vdev_parent, vd); ASSERT(vd->vdev_parent == NULL); ASSERT(!list_link_active(&vd->vdev_leaf_node)); /* * Clean up vdev structure. */ vdev_queue_fini(vd); if (vd->vdev_path) spa_strfree(vd->vdev_path); if (vd->vdev_devid) spa_strfree(vd->vdev_devid); if (vd->vdev_physpath) spa_strfree(vd->vdev_physpath); if (vd->vdev_enc_sysfs_path) spa_strfree(vd->vdev_enc_sysfs_path); if (vd->vdev_fru) spa_strfree(vd->vdev_fru); if (vd->vdev_isspare) spa_spare_remove(vd); if (vd->vdev_isl2cache) spa_l2cache_remove(vd); txg_list_destroy(&vd->vdev_ms_list); txg_list_destroy(&vd->vdev_dtl_list); mutex_enter(&vd->vdev_dtl_lock); space_map_close(vd->vdev_dtl_sm); for (int t = 0; t < DTL_TYPES; t++) { range_tree_vacate(vd->vdev_dtl[t], NULL, NULL); range_tree_destroy(vd->vdev_dtl[t]); } mutex_exit(&vd->vdev_dtl_lock); EQUIV(vd->vdev_indirect_births != NULL, vd->vdev_indirect_mapping != NULL); if (vd->vdev_indirect_births != NULL) { vdev_indirect_mapping_close(vd->vdev_indirect_mapping); vdev_indirect_births_close(vd->vdev_indirect_births); } if (vd->vdev_obsolete_sm != NULL) { ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops); space_map_close(vd->vdev_obsolete_sm); vd->vdev_obsolete_sm = NULL; } range_tree_destroy(vd->vdev_obsolete_segments); rw_destroy(&vd->vdev_indirect_rwlock); mutex_destroy(&vd->vdev_obsolete_lock); mutex_destroy(&vd->vdev_dtl_lock); mutex_destroy(&vd->vdev_stat_lock); mutex_destroy(&vd->vdev_probe_lock); mutex_destroy(&vd->vdev_scan_io_queue_lock); mutex_destroy(&vd->vdev_initialize_lock); mutex_destroy(&vd->vdev_initialize_io_lock); cv_destroy(&vd->vdev_initialize_io_cv); cv_destroy(&vd->vdev_initialize_cv); mutex_destroy(&vd->vdev_trim_lock); mutex_destroy(&vd->vdev_autotrim_lock); mutex_destroy(&vd->vdev_trim_io_lock); cv_destroy(&vd->vdev_trim_cv); cv_destroy(&vd->vdev_autotrim_cv); cv_destroy(&vd->vdev_autotrim_kick_cv); cv_destroy(&vd->vdev_trim_io_cv); mutex_destroy(&vd->vdev_rebuild_lock); cv_destroy(&vd->vdev_rebuild_cv); zfs_ratelimit_fini(&vd->vdev_delay_rl); zfs_ratelimit_fini(&vd->vdev_deadman_rl); zfs_ratelimit_fini(&vd->vdev_checksum_rl); if (vd == spa->spa_root_vdev) spa->spa_root_vdev = NULL; kmem_free(vd, sizeof (vdev_t)); } /* * Transfer top-level vdev state from svd to tvd. */ static void vdev_top_transfer(vdev_t *svd, vdev_t *tvd) { spa_t *spa = svd->vdev_spa; metaslab_t *msp; vdev_t *vd; int t; ASSERT(tvd == tvd->vdev_top); tvd->vdev_pending_fastwrite = svd->vdev_pending_fastwrite; tvd->vdev_ms_array = svd->vdev_ms_array; tvd->vdev_ms_shift = svd->vdev_ms_shift; tvd->vdev_ms_count = svd->vdev_ms_count; tvd->vdev_top_zap = svd->vdev_top_zap; svd->vdev_ms_array = 0; svd->vdev_ms_shift = 0; svd->vdev_ms_count = 0; svd->vdev_top_zap = 0; if (tvd->vdev_mg) ASSERT3P(tvd->vdev_mg, ==, svd->vdev_mg); if (tvd->vdev_log_mg) ASSERT3P(tvd->vdev_log_mg, ==, svd->vdev_log_mg); tvd->vdev_mg = svd->vdev_mg; tvd->vdev_log_mg = svd->vdev_log_mg; tvd->vdev_ms = svd->vdev_ms; svd->vdev_mg = NULL; svd->vdev_log_mg = NULL; svd->vdev_ms = NULL; if (tvd->vdev_mg != NULL) tvd->vdev_mg->mg_vd = tvd; if (tvd->vdev_log_mg != NULL) tvd->vdev_log_mg->mg_vd = tvd; tvd->vdev_checkpoint_sm = svd->vdev_checkpoint_sm; svd->vdev_checkpoint_sm = NULL; tvd->vdev_alloc_bias = svd->vdev_alloc_bias; svd->vdev_alloc_bias = VDEV_BIAS_NONE; tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc; tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space; tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace; svd->vdev_stat.vs_alloc = 0; svd->vdev_stat.vs_space = 0; svd->vdev_stat.vs_dspace = 0; /* * State which may be set on a top-level vdev that's in the * process of being removed. */ ASSERT0(tvd->vdev_indirect_config.vic_births_object); ASSERT0(tvd->vdev_indirect_config.vic_mapping_object); ASSERT3U(tvd->vdev_indirect_config.vic_prev_indirect_vdev, ==, -1ULL); ASSERT3P(tvd->vdev_indirect_mapping, ==, NULL); ASSERT3P(tvd->vdev_indirect_births, ==, NULL); ASSERT3P(tvd->vdev_obsolete_sm, ==, NULL); ASSERT0(tvd->vdev_noalloc); ASSERT0(tvd->vdev_removing); ASSERT0(tvd->vdev_rebuilding); tvd->vdev_noalloc = svd->vdev_noalloc; tvd->vdev_removing = svd->vdev_removing; tvd->vdev_rebuilding = svd->vdev_rebuilding; tvd->vdev_rebuild_config = svd->vdev_rebuild_config; tvd->vdev_indirect_config = svd->vdev_indirect_config; tvd->vdev_indirect_mapping = svd->vdev_indirect_mapping; tvd->vdev_indirect_births = svd->vdev_indirect_births; range_tree_swap(&svd->vdev_obsolete_segments, &tvd->vdev_obsolete_segments); tvd->vdev_obsolete_sm = svd->vdev_obsolete_sm; svd->vdev_indirect_config.vic_mapping_object = 0; svd->vdev_indirect_config.vic_births_object = 0; svd->vdev_indirect_config.vic_prev_indirect_vdev = -1ULL; svd->vdev_indirect_mapping = NULL; svd->vdev_indirect_births = NULL; svd->vdev_obsolete_sm = NULL; svd->vdev_noalloc = 0; svd->vdev_removing = 0; svd->vdev_rebuilding = 0; for (t = 0; t < TXG_SIZE; t++) { while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL) (void) txg_list_add(&tvd->vdev_ms_list, msp, t); while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL) (void) txg_list_add(&tvd->vdev_dtl_list, vd, t); if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t)) (void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t); } if (list_link_active(&svd->vdev_config_dirty_node)) { vdev_config_clean(svd); vdev_config_dirty(tvd); } if (list_link_active(&svd->vdev_state_dirty_node)) { vdev_state_clean(svd); vdev_state_dirty(tvd); } tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio; svd->vdev_deflate_ratio = 0; tvd->vdev_islog = svd->vdev_islog; svd->vdev_islog = 0; dsl_scan_io_queue_vdev_xfer(svd, tvd); } static void vdev_top_update(vdev_t *tvd, vdev_t *vd) { if (vd == NULL) return; vd->vdev_top = tvd; for (int c = 0; c < vd->vdev_children; c++) vdev_top_update(tvd, vd->vdev_child[c]); } /* * Add a mirror/replacing vdev above an existing vdev. There is no need to * call .vdev_op_init() since mirror/replacing vdevs do not have private state. */ vdev_t * vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops) { spa_t *spa = cvd->vdev_spa; vdev_t *pvd = cvd->vdev_parent; vdev_t *mvd; ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops); mvd->vdev_asize = cvd->vdev_asize; mvd->vdev_min_asize = cvd->vdev_min_asize; mvd->vdev_max_asize = cvd->vdev_max_asize; mvd->vdev_psize = cvd->vdev_psize; mvd->vdev_ashift = cvd->vdev_ashift; mvd->vdev_logical_ashift = cvd->vdev_logical_ashift; mvd->vdev_physical_ashift = cvd->vdev_physical_ashift; mvd->vdev_state = cvd->vdev_state; mvd->vdev_crtxg = cvd->vdev_crtxg; vdev_remove_child(pvd, cvd); vdev_add_child(pvd, mvd); cvd->vdev_id = mvd->vdev_children; vdev_add_child(mvd, cvd); vdev_top_update(cvd->vdev_top, cvd->vdev_top); if (mvd == mvd->vdev_top) vdev_top_transfer(cvd, mvd); return (mvd); } /* * Remove a 1-way mirror/replacing vdev from the tree. */ void vdev_remove_parent(vdev_t *cvd) { vdev_t *mvd = cvd->vdev_parent; vdev_t *pvd = mvd->vdev_parent; ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); ASSERT(mvd->vdev_children == 1); ASSERT(mvd->vdev_ops == &vdev_mirror_ops || mvd->vdev_ops == &vdev_replacing_ops || mvd->vdev_ops == &vdev_spare_ops); cvd->vdev_ashift = mvd->vdev_ashift; cvd->vdev_logical_ashift = mvd->vdev_logical_ashift; cvd->vdev_physical_ashift = mvd->vdev_physical_ashift; vdev_remove_child(mvd, cvd); vdev_remove_child(pvd, mvd); /* * If cvd will replace mvd as a top-level vdev, preserve mvd's guid. * Otherwise, we could have detached an offline device, and when we * go to import the pool we'll think we have two top-level vdevs, * instead of a different version of the same top-level vdev. */ if (mvd->vdev_top == mvd) { uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid; cvd->vdev_orig_guid = cvd->vdev_guid; cvd->vdev_guid += guid_delta; cvd->vdev_guid_sum += guid_delta; /* * If pool not set for autoexpand, we need to also preserve * mvd's asize to prevent automatic expansion of cvd. * Otherwise if we are adjusting the mirror by attaching and * detaching children of non-uniform sizes, the mirror could * autoexpand, unexpectedly requiring larger devices to * re-establish the mirror. */ if (!cvd->vdev_spa->spa_autoexpand) cvd->vdev_asize = mvd->vdev_asize; } cvd->vdev_id = mvd->vdev_id; vdev_add_child(pvd, cvd); vdev_top_update(cvd->vdev_top, cvd->vdev_top); if (cvd == cvd->vdev_top) vdev_top_transfer(mvd, cvd); ASSERT(mvd->vdev_children == 0); vdev_free(mvd); } +/* + * Choose GCD for spa_gcd_alloc. + */ +static uint64_t +vdev_gcd(uint64_t a, uint64_t b) +{ + while (b != 0) { + uint64_t t = b; + b = a % b; + a = t; + } + return (a); +} + +/* + * Set spa_min_alloc and spa_gcd_alloc. + */ +static void +vdev_spa_set_alloc(spa_t *spa, uint64_t min_alloc) +{ + if (min_alloc < spa->spa_min_alloc) + spa->spa_min_alloc = min_alloc; + if (spa->spa_gcd_alloc == INT_MAX) { + spa->spa_gcd_alloc = min_alloc; + } else { + spa->spa_gcd_alloc = vdev_gcd(min_alloc, + spa->spa_gcd_alloc); + } +} + void vdev_metaslab_group_create(vdev_t *vd) { spa_t *spa = vd->vdev_spa; /* * metaslab_group_create was delayed until allocation bias was available */ if (vd->vdev_mg == NULL) { metaslab_class_t *mc; if (vd->vdev_islog && vd->vdev_alloc_bias == VDEV_BIAS_NONE) vd->vdev_alloc_bias = VDEV_BIAS_LOG; ASSERT3U(vd->vdev_islog, ==, (vd->vdev_alloc_bias == VDEV_BIAS_LOG)); switch (vd->vdev_alloc_bias) { case VDEV_BIAS_LOG: mc = spa_log_class(spa); break; case VDEV_BIAS_SPECIAL: mc = spa_special_class(spa); break; case VDEV_BIAS_DEDUP: mc = spa_dedup_class(spa); break; default: mc = spa_normal_class(spa); } vd->vdev_mg = metaslab_group_create(mc, vd, spa->spa_alloc_count); if (!vd->vdev_islog) { vd->vdev_log_mg = metaslab_group_create( spa_embedded_log_class(spa), vd, 1); } /* * The spa ashift min/max only apply for the normal metaslab * class. Class destination is late binding so ashift boundary * setting had to wait until now. */ if (vd->vdev_top == vd && vd->vdev_ashift != 0 && mc == spa_normal_class(spa) && vd->vdev_aux == NULL) { if (vd->vdev_ashift > spa->spa_max_ashift) spa->spa_max_ashift = vd->vdev_ashift; if (vd->vdev_ashift < spa->spa_min_ashift) spa->spa_min_ashift = vd->vdev_ashift; uint64_t min_alloc = vdev_get_min_alloc(vd); - if (min_alloc < spa->spa_min_alloc) - spa->spa_min_alloc = min_alloc; + vdev_spa_set_alloc(spa, min_alloc); } } } int vdev_metaslab_init(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; uint64_t oldc = vd->vdev_ms_count; uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift; metaslab_t **mspp; int error; boolean_t expanding = (oldc != 0); ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER)); /* * This vdev is not being allocated from yet or is a hole. */ if (vd->vdev_ms_shift == 0) return (0); ASSERT(!vd->vdev_ishole); ASSERT(oldc <= newc); mspp = vmem_zalloc(newc * sizeof (*mspp), KM_SLEEP); if (expanding) { memcpy(mspp, vd->vdev_ms, oldc * sizeof (*mspp)); vmem_free(vd->vdev_ms, oldc * sizeof (*mspp)); } vd->vdev_ms = mspp; vd->vdev_ms_count = newc; for (uint64_t m = oldc; m < newc; m++) { uint64_t object = 0; /* * vdev_ms_array may be 0 if we are creating the "fake" * metaslabs for an indirect vdev for zdb's leak detection. * See zdb_leak_init(). */ if (txg == 0 && vd->vdev_ms_array != 0) { error = dmu_read(spa->spa_meta_objset, vd->vdev_ms_array, m * sizeof (uint64_t), sizeof (uint64_t), &object, DMU_READ_PREFETCH); if (error != 0) { vdev_dbgmsg(vd, "unable to read the metaslab " "array [error=%d]", error); return (error); } } error = metaslab_init(vd->vdev_mg, m, object, txg, &(vd->vdev_ms[m])); if (error != 0) { vdev_dbgmsg(vd, "metaslab_init failed [error=%d]", error); return (error); } } /* * Find the emptiest metaslab on the vdev and mark it for use for * embedded slog by moving it from the regular to the log metaslab * group. */ if (vd->vdev_mg->mg_class == spa_normal_class(spa) && vd->vdev_ms_count > zfs_embedded_slog_min_ms && avl_is_empty(&vd->vdev_log_mg->mg_metaslab_tree)) { uint64_t slog_msid = 0; uint64_t smallest = UINT64_MAX; /* * Note, we only search the new metaslabs, because the old * (pre-existing) ones may be active (e.g. have non-empty * range_tree's), and we don't move them to the new * metaslab_t. */ for (uint64_t m = oldc; m < newc; m++) { uint64_t alloc = space_map_allocated(vd->vdev_ms[m]->ms_sm); if (alloc < smallest) { slog_msid = m; smallest = alloc; } } metaslab_t *slog_ms = vd->vdev_ms[slog_msid]; /* * The metaslab was marked as dirty at the end of * metaslab_init(). Remove it from the dirty list so that we * can uninitialize and reinitialize it to the new class. */ if (txg != 0) { (void) txg_list_remove_this(&vd->vdev_ms_list, slog_ms, txg); } uint64_t sm_obj = space_map_object(slog_ms->ms_sm); metaslab_fini(slog_ms); VERIFY0(metaslab_init(vd->vdev_log_mg, slog_msid, sm_obj, txg, &vd->vdev_ms[slog_msid])); } if (txg == 0) spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER); /* * If the vdev is marked as non-allocating then don't * activate the metaslabs since we want to ensure that * no allocations are performed on this device. */ if (vd->vdev_noalloc) { /* track non-allocating vdev space */ spa->spa_nonallocating_dspace += spa_deflate(spa) ? vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space; } else if (!expanding) { metaslab_group_activate(vd->vdev_mg); if (vd->vdev_log_mg != NULL) metaslab_group_activate(vd->vdev_log_mg); } if (txg == 0) spa_config_exit(spa, SCL_ALLOC, FTAG); return (0); } void vdev_metaslab_fini(vdev_t *vd) { if (vd->vdev_checkpoint_sm != NULL) { ASSERT(spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_POOL_CHECKPOINT)); space_map_close(vd->vdev_checkpoint_sm); /* * Even though we close the space map, we need to set its * pointer to NULL. The reason is that vdev_metaslab_fini() * may be called multiple times for certain operations * (i.e. when destroying a pool) so we need to ensure that * this clause never executes twice. This logic is similar * to the one used for the vdev_ms clause below. */ vd->vdev_checkpoint_sm = NULL; } if (vd->vdev_ms != NULL) { metaslab_group_t *mg = vd->vdev_mg; metaslab_group_passivate(mg); if (vd->vdev_log_mg != NULL) { ASSERT(!vd->vdev_islog); metaslab_group_passivate(vd->vdev_log_mg); } uint64_t count = vd->vdev_ms_count; for (uint64_t m = 0; m < count; m++) { metaslab_t *msp = vd->vdev_ms[m]; if (msp != NULL) metaslab_fini(msp); } vmem_free(vd->vdev_ms, count * sizeof (metaslab_t *)); vd->vdev_ms = NULL; vd->vdev_ms_count = 0; for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) { ASSERT0(mg->mg_histogram[i]); if (vd->vdev_log_mg != NULL) ASSERT0(vd->vdev_log_mg->mg_histogram[i]); } } ASSERT0(vd->vdev_ms_count); ASSERT3U(vd->vdev_pending_fastwrite, ==, 0); } typedef struct vdev_probe_stats { boolean_t vps_readable; boolean_t vps_writeable; int vps_flags; } vdev_probe_stats_t; static void vdev_probe_done(zio_t *zio) { spa_t *spa = zio->io_spa; vdev_t *vd = zio->io_vd; vdev_probe_stats_t *vps = zio->io_private; ASSERT(vd->vdev_probe_zio != NULL); if (zio->io_type == ZIO_TYPE_READ) { if (zio->io_error == 0) vps->vps_readable = 1; if (zio->io_error == 0 && spa_writeable(spa)) { zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd, zio->io_offset, zio->io_size, zio->io_abd, ZIO_CHECKSUM_OFF, vdev_probe_done, vps, ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE)); } else { abd_free(zio->io_abd); } } else if (zio->io_type == ZIO_TYPE_WRITE) { if (zio->io_error == 0) vps->vps_writeable = 1; abd_free(zio->io_abd); } else if (zio->io_type == ZIO_TYPE_NULL) { zio_t *pio; zio_link_t *zl; vd->vdev_cant_read |= !vps->vps_readable; vd->vdev_cant_write |= !vps->vps_writeable; if (vdev_readable(vd) && (vdev_writeable(vd) || !spa_writeable(spa))) { zio->io_error = 0; } else { ASSERT(zio->io_error != 0); vdev_dbgmsg(vd, "failed probe"); (void) zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE, spa, vd, NULL, NULL, 0); zio->io_error = SET_ERROR(ENXIO); } mutex_enter(&vd->vdev_probe_lock); ASSERT(vd->vdev_probe_zio == zio); vd->vdev_probe_zio = NULL; mutex_exit(&vd->vdev_probe_lock); zl = NULL; while ((pio = zio_walk_parents(zio, &zl)) != NULL) if (!vdev_accessible(vd, pio)) pio->io_error = SET_ERROR(ENXIO); kmem_free(vps, sizeof (*vps)); } } /* * Determine whether this device is accessible. * * Read and write to several known locations: the pad regions of each * vdev label but the first, which we leave alone in case it contains * a VTOC. */ zio_t * vdev_probe(vdev_t *vd, zio_t *zio) { spa_t *spa = vd->vdev_spa; vdev_probe_stats_t *vps = NULL; zio_t *pio; ASSERT(vd->vdev_ops->vdev_op_leaf); /* * Don't probe the probe. */ if (zio && (zio->io_flags & ZIO_FLAG_PROBE)) return (NULL); /* * To prevent 'probe storms' when a device fails, we create * just one probe i/o at a time. All zios that want to probe * this vdev will become parents of the probe io. */ mutex_enter(&vd->vdev_probe_lock); if ((pio = vd->vdev_probe_zio) == NULL) { vps = kmem_zalloc(sizeof (*vps), KM_SLEEP); vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE | ZIO_FLAG_DONT_AGGREGATE | ZIO_FLAG_TRYHARD; if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) { /* * vdev_cant_read and vdev_cant_write can only * transition from TRUE to FALSE when we have the * SCL_ZIO lock as writer; otherwise they can only * transition from FALSE to TRUE. This ensures that * any zio looking at these values can assume that * failures persist for the life of the I/O. That's * important because when a device has intermittent * connectivity problems, we want to ensure that * they're ascribed to the device (ENXIO) and not * the zio (EIO). * * Since we hold SCL_ZIO as writer here, clear both * values so the probe can reevaluate from first * principles. */ vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER; vd->vdev_cant_read = B_FALSE; vd->vdev_cant_write = B_FALSE; } vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd, vdev_probe_done, vps, vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE); /* * We can't change the vdev state in this context, so we * kick off an async task to do it on our behalf. */ if (zio != NULL) { vd->vdev_probe_wanted = B_TRUE; spa_async_request(spa, SPA_ASYNC_PROBE); } } if (zio != NULL) zio_add_child(zio, pio); mutex_exit(&vd->vdev_probe_lock); if (vps == NULL) { ASSERT(zio != NULL); return (NULL); } for (int l = 1; l < VDEV_LABELS; l++) { zio_nowait(zio_read_phys(pio, vd, vdev_label_offset(vd->vdev_psize, l, offsetof(vdev_label_t, vl_be)), VDEV_PAD_SIZE, abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE), ZIO_CHECKSUM_OFF, vdev_probe_done, vps, ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE)); } if (zio == NULL) return (pio); zio_nowait(pio); return (NULL); } static void vdev_load_child(void *arg) { vdev_t *vd = arg; vd->vdev_load_error = vdev_load(vd); } static void vdev_open_child(void *arg) { vdev_t *vd = arg; vd->vdev_open_thread = curthread; vd->vdev_open_error = vdev_open(vd); vd->vdev_open_thread = NULL; } static boolean_t vdev_uses_zvols(vdev_t *vd) { #ifdef _KERNEL if (zvol_is_zvol(vd->vdev_path)) return (B_TRUE); #endif for (int c = 0; c < vd->vdev_children; c++) if (vdev_uses_zvols(vd->vdev_child[c])) return (B_TRUE); return (B_FALSE); } /* * Returns B_TRUE if the passed child should be opened. */ static boolean_t vdev_default_open_children_func(vdev_t *vd) { (void) vd; return (B_TRUE); } /* * Open the requested child vdevs. If any of the leaf vdevs are using * a ZFS volume then do the opens in a single thread. This avoids a * deadlock when the current thread is holding the spa_namespace_lock. */ static void vdev_open_children_impl(vdev_t *vd, vdev_open_children_func_t *open_func) { int children = vd->vdev_children; taskq_t *tq = taskq_create("vdev_open", children, minclsyspri, children, children, TASKQ_PREPOPULATE); vd->vdev_nonrot = B_TRUE; for (int c = 0; c < children; c++) { vdev_t *cvd = vd->vdev_child[c]; if (open_func(cvd) == B_FALSE) continue; if (tq == NULL || vdev_uses_zvols(vd)) { cvd->vdev_open_error = vdev_open(cvd); } else { VERIFY(taskq_dispatch(tq, vdev_open_child, cvd, TQ_SLEEP) != TASKQID_INVALID); } vd->vdev_nonrot &= cvd->vdev_nonrot; } if (tq != NULL) { taskq_wait(tq); taskq_destroy(tq); } } /* * Open all child vdevs. */ void vdev_open_children(vdev_t *vd) { vdev_open_children_impl(vd, vdev_default_open_children_func); } /* * Conditionally open a subset of child vdevs. */ void vdev_open_children_subset(vdev_t *vd, vdev_open_children_func_t *open_func) { vdev_open_children_impl(vd, open_func); } /* * Compute the raidz-deflation ratio. Note, we hard-code * in 128k (1 << 17) because it is the "typical" blocksize. * Even though SPA_MAXBLOCKSIZE changed, this algorithm can not change, * otherwise it would inconsistently account for existing bp's. */ static void vdev_set_deflate_ratio(vdev_t *vd) { if (vd == vd->vdev_top && !vd->vdev_ishole && vd->vdev_ashift != 0) { vd->vdev_deflate_ratio = (1 << 17) / (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT); } } /* * Choose the best of two ashifts, preferring one between logical ashift * (absolute minimum) and administrator defined maximum, otherwise take * the biggest of the two. */ uint64_t vdev_best_ashift(uint64_t logical, uint64_t a, uint64_t b) { if (a > logical && a <= zfs_vdev_max_auto_ashift) { if (b <= logical || b > zfs_vdev_max_auto_ashift) return (a); else return (MAX(a, b)); } else if (b <= logical || b > zfs_vdev_max_auto_ashift) return (MAX(a, b)); return (b); } /* * Maximize performance by inflating the configured ashift for top level * vdevs to be as close to the physical ashift as possible while maintaining * administrator defined limits and ensuring it doesn't go below the * logical ashift. */ static void vdev_ashift_optimize(vdev_t *vd) { ASSERT(vd == vd->vdev_top); if (vd->vdev_ashift < vd->vdev_physical_ashift && vd->vdev_physical_ashift <= zfs_vdev_max_auto_ashift) { vd->vdev_ashift = MIN( MAX(zfs_vdev_max_auto_ashift, vd->vdev_ashift), MAX(zfs_vdev_min_auto_ashift, vd->vdev_physical_ashift)); } else { /* * If the logical and physical ashifts are the same, then * we ensure that the top-level vdev's ashift is not smaller * than our minimum ashift value. For the unusual case * where logical ashift > physical ashift, we can't cap * the calculated ashift based on max ashift as that * would cause failures. * We still check if we need to increase it to match * the min ashift. */ vd->vdev_ashift = MAX(zfs_vdev_min_auto_ashift, vd->vdev_ashift); } } /* * Prepare a virtual device for access. */ int vdev_open(vdev_t *vd) { spa_t *spa = vd->vdev_spa; int error; uint64_t osize = 0; uint64_t max_osize = 0; uint64_t asize, max_asize, psize; uint64_t logical_ashift = 0; uint64_t physical_ashift = 0; ASSERT(vd->vdev_open_thread == curthread || spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); ASSERT(vd->vdev_state == VDEV_STATE_CLOSED || vd->vdev_state == VDEV_STATE_CANT_OPEN || vd->vdev_state == VDEV_STATE_OFFLINE); vd->vdev_stat.vs_aux = VDEV_AUX_NONE; vd->vdev_cant_read = B_FALSE; vd->vdev_cant_write = B_FALSE; vd->vdev_min_asize = vdev_get_min_asize(vd); /* * If this vdev is not removed, check its fault status. If it's * faulted, bail out of the open. */ if (!vd->vdev_removed && vd->vdev_faulted) { ASSERT(vd->vdev_children == 0); ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED || vd->vdev_label_aux == VDEV_AUX_EXTERNAL); vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, vd->vdev_label_aux); return (SET_ERROR(ENXIO)); } else if (vd->vdev_offline) { ASSERT(vd->vdev_children == 0); vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE); return (SET_ERROR(ENXIO)); } error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize, &logical_ashift, &physical_ashift); /* Keep the device in removed state if unplugged */ if (error == ENOENT && vd->vdev_removed) { vdev_set_state(vd, B_TRUE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); return (error); } /* * Physical volume size should never be larger than its max size, unless * the disk has shrunk while we were reading it or the device is buggy * or damaged: either way it's not safe for use, bail out of the open. */ if (osize > max_osize) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_OPEN_FAILED); return (SET_ERROR(ENXIO)); } /* * Reset the vdev_reopening flag so that we actually close * the vdev on error. */ vd->vdev_reopening = B_FALSE; if (zio_injection_enabled && error == 0) error = zio_handle_device_injection(vd, NULL, SET_ERROR(ENXIO)); if (error) { if (vd->vdev_removed && vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED) vd->vdev_removed = B_FALSE; if (vd->vdev_stat.vs_aux == VDEV_AUX_CHILDREN_OFFLINE) { vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, vd->vdev_stat.vs_aux); } else { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, vd->vdev_stat.vs_aux); } return (error); } vd->vdev_removed = B_FALSE; /* * Recheck the faulted flag now that we have confirmed that * the vdev is accessible. If we're faulted, bail. */ if (vd->vdev_faulted) { ASSERT(vd->vdev_children == 0); ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED || vd->vdev_label_aux == VDEV_AUX_EXTERNAL); vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, vd->vdev_label_aux); return (SET_ERROR(ENXIO)); } if (vd->vdev_degraded) { ASSERT(vd->vdev_children == 0); vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, VDEV_AUX_ERR_EXCEEDED); } else { vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0); } /* * For hole or missing vdevs we just return success. */ if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) return (0); for (int c = 0; c < vd->vdev_children; c++) { if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) { vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); break; } } osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t)); max_osize = P2ALIGN(max_osize, (uint64_t)sizeof (vdev_label_t)); if (vd->vdev_children == 0) { if (osize < SPA_MINDEVSIZE) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_TOO_SMALL); return (SET_ERROR(EOVERFLOW)); } psize = osize; asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE); max_asize = max_osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE); } else { if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_TOO_SMALL); return (SET_ERROR(EOVERFLOW)); } psize = 0; asize = osize; max_asize = max_osize; } /* * If the vdev was expanded, record this so that we can re-create the * uberblock rings in labels {2,3}, during the next sync. */ if ((psize > vd->vdev_psize) && (vd->vdev_psize != 0)) vd->vdev_copy_uberblocks = B_TRUE; vd->vdev_psize = psize; /* * Make sure the allocatable size hasn't shrunk too much. */ if (asize < vd->vdev_min_asize) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_LABEL); return (SET_ERROR(EINVAL)); } /* * We can always set the logical/physical ashift members since * their values are only used to calculate the vdev_ashift when * the device is first added to the config. These values should * not be used for anything else since they may change whenever * the device is reopened and we don't store them in the label. */ vd->vdev_physical_ashift = MAX(physical_ashift, vd->vdev_physical_ashift); vd->vdev_logical_ashift = MAX(logical_ashift, vd->vdev_logical_ashift); if (vd->vdev_asize == 0) { /* * This is the first-ever open, so use the computed values. * For compatibility, a different ashift can be requested. */ vd->vdev_asize = asize; vd->vdev_max_asize = max_asize; /* * If the vdev_ashift was not overridden at creation time, * then set it the logical ashift and optimize the ashift. */ if (vd->vdev_ashift == 0) { vd->vdev_ashift = vd->vdev_logical_ashift; if (vd->vdev_logical_ashift > ASHIFT_MAX) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_ASHIFT_TOO_BIG); return (SET_ERROR(EDOM)); } if (vd->vdev_top == vd && vd->vdev_attaching == B_FALSE) vdev_ashift_optimize(vd); vd->vdev_attaching = B_FALSE; } if (vd->vdev_ashift != 0 && (vd->vdev_ashift < ASHIFT_MIN || vd->vdev_ashift > ASHIFT_MAX)) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_ASHIFT); return (SET_ERROR(EDOM)); } } else { /* * Make sure the alignment required hasn't increased. */ if (vd->vdev_ashift > vd->vdev_top->vdev_ashift && vd->vdev_ops->vdev_op_leaf) { (void) zfs_ereport_post( FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT, spa, vd, NULL, NULL, 0); vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_LABEL); return (SET_ERROR(EDOM)); } vd->vdev_max_asize = max_asize; } /* * If all children are healthy we update asize if either: * The asize has increased, due to a device expansion caused by dynamic * LUN growth or vdev replacement, and automatic expansion is enabled; * making the additional space available. * * The asize has decreased, due to a device shrink usually caused by a * vdev replace with a smaller device. This ensures that calculations * based of max_asize and asize e.g. esize are always valid. It's safe * to do this as we've already validated that asize is greater than * vdev_min_asize. */ if (vd->vdev_state == VDEV_STATE_HEALTHY && ((asize > vd->vdev_asize && (vd->vdev_expanding || spa->spa_autoexpand)) || (asize < vd->vdev_asize))) vd->vdev_asize = asize; vdev_set_min_asize(vd); /* * Ensure we can issue some IO before declaring the * vdev open for business. */ if (vd->vdev_ops->vdev_op_leaf && (error = zio_wait(vdev_probe(vd, NULL))) != 0) { vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, VDEV_AUX_ERR_EXCEEDED); return (error); } /* * Track the minimum allocation size. */ if (vd->vdev_top == vd && vd->vdev_ashift != 0 && vd->vdev_islog == 0 && vd->vdev_aux == NULL) { uint64_t min_alloc = vdev_get_min_alloc(vd); - if (min_alloc < spa->spa_min_alloc) - spa->spa_min_alloc = min_alloc; + vdev_spa_set_alloc(spa, min_alloc); } /* * If this is a leaf vdev, assess whether a resilver is needed. * But don't do this if we are doing a reopen for a scrub, since * this would just restart the scrub we are already doing. */ if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen) dsl_scan_assess_vdev(spa->spa_dsl_pool, vd); return (0); } static void vdev_validate_child(void *arg) { vdev_t *vd = arg; vd->vdev_validate_thread = curthread; vd->vdev_validate_error = vdev_validate(vd); vd->vdev_validate_thread = NULL; } /* * Called once the vdevs are all opened, this routine validates the label * contents. This needs to be done before vdev_load() so that we don't * inadvertently do repair I/Os to the wrong device. * * This function will only return failure if one of the vdevs indicates that it * has since been destroyed or exported. This is only possible if * /etc/zfs/zpool.cache was readonly at the time. Otherwise, the vdev state * will be updated but the function will return 0. */ int vdev_validate(vdev_t *vd) { spa_t *spa = vd->vdev_spa; taskq_t *tq = NULL; nvlist_t *label; uint64_t guid = 0, aux_guid = 0, top_guid; uint64_t state; nvlist_t *nvl; uint64_t txg; int children = vd->vdev_children; if (vdev_validate_skip) return (0); if (children > 0) { tq = taskq_create("vdev_validate", children, minclsyspri, children, children, TASKQ_PREPOPULATE); } for (uint64_t c = 0; c < children; c++) { vdev_t *cvd = vd->vdev_child[c]; if (tq == NULL || vdev_uses_zvols(cvd)) { vdev_validate_child(cvd); } else { VERIFY(taskq_dispatch(tq, vdev_validate_child, cvd, TQ_SLEEP) != TASKQID_INVALID); } } if (tq != NULL) { taskq_wait(tq); taskq_destroy(tq); } for (int c = 0; c < children; c++) { int error = vd->vdev_child[c]->vdev_validate_error; if (error != 0) return (SET_ERROR(EBADF)); } /* * If the device has already failed, or was marked offline, don't do * any further validation. Otherwise, label I/O will fail and we will * overwrite the previous state. */ if (!vd->vdev_ops->vdev_op_leaf || !vdev_readable(vd)) return (0); /* * If we are performing an extreme rewind, we allow for a label that * was modified at a point after the current txg. * If config lock is not held do not check for the txg. spa_sync could * be updating the vdev's label before updating spa_last_synced_txg. */ if (spa->spa_extreme_rewind || spa_last_synced_txg(spa) == 0 || spa_config_held(spa, SCL_CONFIG, RW_WRITER) != SCL_CONFIG) txg = UINT64_MAX; else txg = spa_last_synced_txg(spa); if ((label = vdev_label_read_config(vd, txg)) == NULL) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_LABEL); vdev_dbgmsg(vd, "vdev_validate: failed reading config for " "txg %llu", (u_longlong_t)txg); return (0); } /* * Determine if this vdev has been split off into another * pool. If so, then refuse to open it. */ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID, &aux_guid) == 0 && aux_guid == spa_guid(spa)) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_SPLIT_POOL); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: vdev split into other pool"); return (0); } if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &guid) != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", ZPOOL_CONFIG_POOL_GUID); return (0); } /* * If config is not trusted then ignore the spa guid check. This is * necessary because if the machine crashed during a re-guid the new * guid might have been written to all of the vdev labels, but not the * cached config. The check will be performed again once we have the * trusted config from the MOS. */ if (spa->spa_trust_config && guid != spa_guid(spa)) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: vdev label pool_guid doesn't " "match config (%llu != %llu)", (u_longlong_t)guid, (u_longlong_t)spa_guid(spa)); return (0); } if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID, &aux_guid) != 0) aux_guid = 0; if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", ZPOOL_CONFIG_GUID); return (0); } if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID, &top_guid) != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", ZPOOL_CONFIG_TOP_GUID); return (0); } /* * If this vdev just became a top-level vdev because its sibling was * detached, it will have adopted the parent's vdev guid -- but the * label may or may not be on disk yet. Fortunately, either version * of the label will have the same top guid, so if we're a top-level * vdev, we can safely compare to that instead. * However, if the config comes from a cachefile that failed to update * after the detach, a top-level vdev will appear as a non top-level * vdev in the config. Also relax the constraints if we perform an * extreme rewind. * * If we split this vdev off instead, then we also check the * original pool's guid. We don't want to consider the vdev * corrupt if it is partway through a split operation. */ if (vd->vdev_guid != guid && vd->vdev_guid != aux_guid) { boolean_t mismatch = B_FALSE; if (spa->spa_trust_config && !spa->spa_extreme_rewind) { if (vd != vd->vdev_top || vd->vdev_guid != top_guid) mismatch = B_TRUE; } else { if (vd->vdev_guid != top_guid && vd->vdev_top->vdev_guid != guid) mismatch = B_TRUE; } if (mismatch) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: config guid " "doesn't match label guid"); vdev_dbgmsg(vd, "CONFIG: guid %llu, top_guid %llu", (u_longlong_t)vd->vdev_guid, (u_longlong_t)vd->vdev_top->vdev_guid); vdev_dbgmsg(vd, "LABEL: guid %llu, top_guid %llu, " "aux_guid %llu", (u_longlong_t)guid, (u_longlong_t)top_guid, (u_longlong_t)aux_guid); return (0); } } if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", ZPOOL_CONFIG_POOL_STATE); return (0); } nvlist_free(label); /* * If this is a verbatim import, no need to check the * state of the pool. */ if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) && spa_load_state(spa) == SPA_LOAD_OPEN && state != POOL_STATE_ACTIVE) { vdev_dbgmsg(vd, "vdev_validate: invalid pool state (%llu) " "for spa %s", (u_longlong_t)state, spa->spa_name); return (SET_ERROR(EBADF)); } /* * If we were able to open and validate a vdev that was * previously marked permanently unavailable, clear that state * now. */ if (vd->vdev_not_present) vd->vdev_not_present = 0; return (0); } static void vdev_copy_path_impl(vdev_t *svd, vdev_t *dvd) { char *old, *new; if (svd->vdev_path != NULL && dvd->vdev_path != NULL) { if (strcmp(svd->vdev_path, dvd->vdev_path) != 0) { zfs_dbgmsg("vdev_copy_path: vdev %llu: path changed " "from '%s' to '%s'", (u_longlong_t)dvd->vdev_guid, dvd->vdev_path, svd->vdev_path); spa_strfree(dvd->vdev_path); dvd->vdev_path = spa_strdup(svd->vdev_path); } } else if (svd->vdev_path != NULL) { dvd->vdev_path = spa_strdup(svd->vdev_path); zfs_dbgmsg("vdev_copy_path: vdev %llu: path set to '%s'", (u_longlong_t)dvd->vdev_guid, dvd->vdev_path); } /* * Our enclosure sysfs path may have changed between imports */ old = dvd->vdev_enc_sysfs_path; new = svd->vdev_enc_sysfs_path; if ((old != NULL && new == NULL) || (old == NULL && new != NULL) || ((old != NULL && new != NULL) && strcmp(new, old) != 0)) { zfs_dbgmsg("vdev_copy_path: vdev %llu: vdev_enc_sysfs_path " "changed from '%s' to '%s'", (u_longlong_t)dvd->vdev_guid, old, new); if (dvd->vdev_enc_sysfs_path) spa_strfree(dvd->vdev_enc_sysfs_path); if (svd->vdev_enc_sysfs_path) { dvd->vdev_enc_sysfs_path = spa_strdup( svd->vdev_enc_sysfs_path); } else { dvd->vdev_enc_sysfs_path = NULL; } } } /* * Recursively copy vdev paths from one vdev to another. Source and destination * vdev trees must have same geometry otherwise return error. Intended to copy * paths from userland config into MOS config. */ int vdev_copy_path_strict(vdev_t *svd, vdev_t *dvd) { if ((svd->vdev_ops == &vdev_missing_ops) || (svd->vdev_ishole && dvd->vdev_ishole) || (dvd->vdev_ops == &vdev_indirect_ops)) return (0); if (svd->vdev_ops != dvd->vdev_ops) { vdev_dbgmsg(svd, "vdev_copy_path: vdev type mismatch: %s != %s", svd->vdev_ops->vdev_op_type, dvd->vdev_ops->vdev_op_type); return (SET_ERROR(EINVAL)); } if (svd->vdev_guid != dvd->vdev_guid) { vdev_dbgmsg(svd, "vdev_copy_path: guids mismatch (%llu != " "%llu)", (u_longlong_t)svd->vdev_guid, (u_longlong_t)dvd->vdev_guid); return (SET_ERROR(EINVAL)); } if (svd->vdev_children != dvd->vdev_children) { vdev_dbgmsg(svd, "vdev_copy_path: children count mismatch: " "%llu != %llu", (u_longlong_t)svd->vdev_children, (u_longlong_t)dvd->vdev_children); return (SET_ERROR(EINVAL)); } for (uint64_t i = 0; i < svd->vdev_children; i++) { int error = vdev_copy_path_strict(svd->vdev_child[i], dvd->vdev_child[i]); if (error != 0) return (error); } if (svd->vdev_ops->vdev_op_leaf) vdev_copy_path_impl(svd, dvd); return (0); } static void vdev_copy_path_search(vdev_t *stvd, vdev_t *dvd) { ASSERT(stvd->vdev_top == stvd); ASSERT3U(stvd->vdev_id, ==, dvd->vdev_top->vdev_id); for (uint64_t i = 0; i < dvd->vdev_children; i++) { vdev_copy_path_search(stvd, dvd->vdev_child[i]); } if (!dvd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(dvd)) return; /* * The idea here is that while a vdev can shift positions within * a top vdev (when replacing, attaching mirror, etc.) it cannot * step outside of it. */ vdev_t *vd = vdev_lookup_by_guid(stvd, dvd->vdev_guid); if (vd == NULL || vd->vdev_ops != dvd->vdev_ops) return; ASSERT(vd->vdev_ops->vdev_op_leaf); vdev_copy_path_impl(vd, dvd); } /* * Recursively copy vdev paths from one root vdev to another. Source and * destination vdev trees may differ in geometry. For each destination leaf * vdev, search a vdev with the same guid and top vdev id in the source. * Intended to copy paths from userland config into MOS config. */ void vdev_copy_path_relaxed(vdev_t *srvd, vdev_t *drvd) { uint64_t children = MIN(srvd->vdev_children, drvd->vdev_children); ASSERT(srvd->vdev_ops == &vdev_root_ops); ASSERT(drvd->vdev_ops == &vdev_root_ops); for (uint64_t i = 0; i < children; i++) { vdev_copy_path_search(srvd->vdev_child[i], drvd->vdev_child[i]); } } /* * Close a virtual device. */ void vdev_close(vdev_t *vd) { vdev_t *pvd = vd->vdev_parent; spa_t *spa __maybe_unused = vd->vdev_spa; ASSERT(vd != NULL); ASSERT(vd->vdev_open_thread == curthread || spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); /* * If our parent is reopening, then we are as well, unless we are * going offline. */ if (pvd != NULL && pvd->vdev_reopening) vd->vdev_reopening = (pvd->vdev_reopening && !vd->vdev_offline); vd->vdev_ops->vdev_op_close(vd); /* * We record the previous state before we close it, so that if we are * doing a reopen(), we don't generate FMA ereports if we notice that * it's still faulted. */ vd->vdev_prevstate = vd->vdev_state; if (vd->vdev_offline) vd->vdev_state = VDEV_STATE_OFFLINE; else vd->vdev_state = VDEV_STATE_CLOSED; vd->vdev_stat.vs_aux = VDEV_AUX_NONE; } void vdev_hold(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_is_root(spa)); if (spa->spa_state == POOL_STATE_UNINITIALIZED) return; for (int c = 0; c < vd->vdev_children; c++) vdev_hold(vd->vdev_child[c]); if (vd->vdev_ops->vdev_op_leaf && vd->vdev_ops->vdev_op_hold != NULL) vd->vdev_ops->vdev_op_hold(vd); } void vdev_rele(vdev_t *vd) { ASSERT(spa_is_root(vd->vdev_spa)); for (int c = 0; c < vd->vdev_children; c++) vdev_rele(vd->vdev_child[c]); if (vd->vdev_ops->vdev_op_leaf && vd->vdev_ops->vdev_op_rele != NULL) vd->vdev_ops->vdev_op_rele(vd); } /* * Reopen all interior vdevs and any unopened leaves. We don't actually * reopen leaf vdevs which had previously been opened as they might deadlock * on the spa_config_lock. Instead we only obtain the leaf's physical size. * If the leaf has never been opened then open it, as usual. */ void vdev_reopen(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); /* set the reopening flag unless we're taking the vdev offline */ vd->vdev_reopening = !vd->vdev_offline; vdev_close(vd); (void) vdev_open(vd); /* * Call vdev_validate() here to make sure we have the same device. * Otherwise, a device with an invalid label could be successfully * opened in response to vdev_reopen(). */ if (vd->vdev_aux) { (void) vdev_validate_aux(vd); if (vdev_readable(vd) && vdev_writeable(vd) && vd->vdev_aux == &spa->spa_l2cache) { /* * In case the vdev is present we should evict all ARC * buffers and pointers to log blocks and reclaim their * space before restoring its contents to L2ARC. */ if (l2arc_vdev_present(vd)) { l2arc_rebuild_vdev(vd, B_TRUE); } else { l2arc_add_vdev(spa, vd); } spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD); spa_async_request(spa, SPA_ASYNC_L2CACHE_TRIM); } } else { (void) vdev_validate(vd); } /* * Recheck if resilver is still needed and cancel any * scheduled resilver if resilver is unneeded. */ if (!vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL) && spa->spa_async_tasks & SPA_ASYNC_RESILVER) { mutex_enter(&spa->spa_async_lock); spa->spa_async_tasks &= ~SPA_ASYNC_RESILVER; mutex_exit(&spa->spa_async_lock); } /* * Reassess parent vdev's health. */ vdev_propagate_state(vd); } int vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing) { int error; /* * Normally, partial opens (e.g. of a mirror) are allowed. * For a create, however, we want to fail the request if * there are any components we can't open. */ error = vdev_open(vd); if (error || vd->vdev_state != VDEV_STATE_HEALTHY) { vdev_close(vd); return (error ? error : SET_ERROR(ENXIO)); } /* * Recursively load DTLs and initialize all labels. */ if ((error = vdev_dtl_load(vd)) != 0 || (error = vdev_label_init(vd, txg, isreplacing ? VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) { vdev_close(vd); return (error); } return (0); } void vdev_metaslab_set_size(vdev_t *vd) { uint64_t asize = vd->vdev_asize; uint64_t ms_count = asize >> zfs_vdev_default_ms_shift; uint64_t ms_shift; /* * There are two dimensions to the metaslab sizing calculation: * the size of the metaslab and the count of metaslabs per vdev. * * The default values used below are a good balance between memory * usage (larger metaslab size means more memory needed for loaded * metaslabs; more metaslabs means more memory needed for the * metaslab_t structs), metaslab load time (larger metaslabs take * longer to load), and metaslab sync time (more metaslabs means * more time spent syncing all of them). * * In general, we aim for zfs_vdev_default_ms_count (200) metaslabs. * The range of the dimensions are as follows: * * 2^29 <= ms_size <= 2^34 * 16 <= ms_count <= 131,072 * * On the lower end of vdev sizes, we aim for metaslabs sizes of * at least 512MB (2^29) to minimize fragmentation effects when * testing with smaller devices. However, the count constraint * of at least 16 metaslabs will override this minimum size goal. * * On the upper end of vdev sizes, we aim for a maximum metaslab * size of 16GB. However, we will cap the total count to 2^17 * metaslabs to keep our memory footprint in check and let the * metaslab size grow from there if that limit is hit. * * The net effect of applying above constrains is summarized below. * * vdev size metaslab count * --------------|----------------- * < 8GB ~16 * 8GB - 100GB one per 512MB * 100GB - 3TB ~200 * 3TB - 2PB one per 16GB * > 2PB ~131,072 * -------------------------------- * * Finally, note that all of the above calculate the initial * number of metaslabs. Expanding a top-level vdev will result * in additional metaslabs being allocated making it possible * to exceed the zfs_vdev_ms_count_limit. */ if (ms_count < zfs_vdev_min_ms_count) ms_shift = highbit64(asize / zfs_vdev_min_ms_count); else if (ms_count > zfs_vdev_default_ms_count) ms_shift = highbit64(asize / zfs_vdev_default_ms_count); else ms_shift = zfs_vdev_default_ms_shift; if (ms_shift < SPA_MAXBLOCKSHIFT) { ms_shift = SPA_MAXBLOCKSHIFT; } else if (ms_shift > zfs_vdev_max_ms_shift) { ms_shift = zfs_vdev_max_ms_shift; /* cap the total count to constrain memory footprint */ if ((asize >> ms_shift) > zfs_vdev_ms_count_limit) ms_shift = highbit64(asize / zfs_vdev_ms_count_limit); } vd->vdev_ms_shift = ms_shift; ASSERT3U(vd->vdev_ms_shift, >=, SPA_MAXBLOCKSHIFT); } void vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg) { ASSERT(vd == vd->vdev_top); /* indirect vdevs don't have metaslabs or dtls */ ASSERT(vdev_is_concrete(vd) || flags == 0); ASSERT(ISP2(flags)); ASSERT(spa_writeable(vd->vdev_spa)); if (flags & VDD_METASLAB) (void) txg_list_add(&vd->vdev_ms_list, arg, txg); if (flags & VDD_DTL) (void) txg_list_add(&vd->vdev_dtl_list, arg, txg); (void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg); } void vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg) { for (int c = 0; c < vd->vdev_children; c++) vdev_dirty_leaves(vd->vdev_child[c], flags, txg); if (vd->vdev_ops->vdev_op_leaf) vdev_dirty(vd->vdev_top, flags, vd, txg); } /* * DTLs. * * A vdev's DTL (dirty time log) is the set of transaction groups for which * the vdev has less than perfect replication. There are four kinds of DTL: * * DTL_MISSING: txgs for which the vdev has no valid copies of the data * * DTL_PARTIAL: txgs for which data is available, but not fully replicated * * DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon * scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of * txgs that was scrubbed. * * DTL_OUTAGE: txgs which cannot currently be read, whether due to * persistent errors or just some device being offline. * Unlike the other three, the DTL_OUTAGE map is not generally * maintained; it's only computed when needed, typically to * determine whether a device can be detached. * * For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device * either has the data or it doesn't. * * For interior vdevs such as mirror and RAID-Z the picture is more complex. * A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because * if any child is less than fully replicated, then so is its parent. * A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs, * comprising only those txgs which appear in 'maxfaults' or more children; * those are the txgs we don't have enough replication to read. For example, * double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2); * thus, its DTL_MISSING consists of the set of txgs that appear in more than * two child DTL_MISSING maps. * * It should be clear from the above that to compute the DTLs and outage maps * for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps. * Therefore, that is all we keep on disk. When loading the pool, or after * a configuration change, we generate all other DTLs from first principles. */ void vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) { range_tree_t *rt = vd->vdev_dtl[t]; ASSERT(t < DTL_TYPES); ASSERT(vd != vd->vdev_spa->spa_root_vdev); ASSERT(spa_writeable(vd->vdev_spa)); mutex_enter(&vd->vdev_dtl_lock); if (!range_tree_contains(rt, txg, size)) range_tree_add(rt, txg, size); mutex_exit(&vd->vdev_dtl_lock); } boolean_t vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) { range_tree_t *rt = vd->vdev_dtl[t]; boolean_t dirty = B_FALSE; ASSERT(t < DTL_TYPES); ASSERT(vd != vd->vdev_spa->spa_root_vdev); /* * While we are loading the pool, the DTLs have not been loaded yet. * This isn't a problem but it can result in devices being tried * which are known to not have the data. In which case, the import * is relying on the checksum to ensure that we get the right data. * Note that while importing we are only reading the MOS, which is * always checksummed. */ mutex_enter(&vd->vdev_dtl_lock); if (!range_tree_is_empty(rt)) dirty = range_tree_contains(rt, txg, size); mutex_exit(&vd->vdev_dtl_lock); return (dirty); } boolean_t vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t) { range_tree_t *rt = vd->vdev_dtl[t]; boolean_t empty; mutex_enter(&vd->vdev_dtl_lock); empty = range_tree_is_empty(rt); mutex_exit(&vd->vdev_dtl_lock); return (empty); } /* * Check if the txg falls within the range which must be * resilvered. DVAs outside this range can always be skipped. */ boolean_t vdev_default_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, uint64_t phys_birth) { (void) dva, (void) psize; /* Set by sequential resilver. */ if (phys_birth == TXG_UNKNOWN) return (B_TRUE); return (vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1)); } /* * Returns B_TRUE if the vdev determines the DVA needs to be resilvered. */ boolean_t vdev_dtl_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, uint64_t phys_birth) { ASSERT(vd != vd->vdev_spa->spa_root_vdev); if (vd->vdev_ops->vdev_op_need_resilver == NULL || vd->vdev_ops->vdev_op_leaf) return (B_TRUE); return (vd->vdev_ops->vdev_op_need_resilver(vd, dva, psize, phys_birth)); } /* * Returns the lowest txg in the DTL range. */ static uint64_t vdev_dtl_min(vdev_t *vd) { ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock)); ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0); ASSERT0(vd->vdev_children); return (range_tree_min(vd->vdev_dtl[DTL_MISSING]) - 1); } /* * Returns the highest txg in the DTL. */ static uint64_t vdev_dtl_max(vdev_t *vd) { ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock)); ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0); ASSERT0(vd->vdev_children); return (range_tree_max(vd->vdev_dtl[DTL_MISSING])); } /* * Determine if a resilvering vdev should remove any DTL entries from * its range. If the vdev was resilvering for the entire duration of the * scan then it should excise that range from its DTLs. Otherwise, this * vdev is considered partially resilvered and should leave its DTL * entries intact. The comment in vdev_dtl_reassess() describes how we * excise the DTLs. */ static boolean_t vdev_dtl_should_excise(vdev_t *vd, boolean_t rebuild_done) { ASSERT0(vd->vdev_children); if (vd->vdev_state < VDEV_STATE_DEGRADED) return (B_FALSE); if (vd->vdev_resilver_deferred) return (B_FALSE); if (range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) return (B_TRUE); if (rebuild_done) { vdev_rebuild_t *vr = &vd->vdev_top->vdev_rebuild_config; vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; /* Rebuild not initiated by attach */ if (vd->vdev_rebuild_txg == 0) return (B_TRUE); /* * When a rebuild completes without error then all missing data * up to the rebuild max txg has been reconstructed and the DTL * is eligible for excision. */ if (vrp->vrp_rebuild_state == VDEV_REBUILD_COMPLETE && vdev_dtl_max(vd) <= vrp->vrp_max_txg) { ASSERT3U(vrp->vrp_min_txg, <=, vdev_dtl_min(vd)); ASSERT3U(vrp->vrp_min_txg, <, vd->vdev_rebuild_txg); ASSERT3U(vd->vdev_rebuild_txg, <=, vrp->vrp_max_txg); return (B_TRUE); } } else { dsl_scan_t *scn = vd->vdev_spa->spa_dsl_pool->dp_scan; dsl_scan_phys_t *scnp __maybe_unused = &scn->scn_phys; /* Resilver not initiated by attach */ if (vd->vdev_resilver_txg == 0) return (B_TRUE); /* * When a resilver is initiated the scan will assign the * scn_max_txg value to the highest txg value that exists * in all DTLs. If this device's max DTL is not part of this * scan (i.e. it is not in the range (scn_min_txg, scn_max_txg] * then it is not eligible for excision. */ if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) { ASSERT3U(scnp->scn_min_txg, <=, vdev_dtl_min(vd)); ASSERT3U(scnp->scn_min_txg, <, vd->vdev_resilver_txg); ASSERT3U(vd->vdev_resilver_txg, <=, scnp->scn_max_txg); return (B_TRUE); } } return (B_FALSE); } /* * Reassess DTLs after a config change or scrub completion. If txg == 0 no * write operations will be issued to the pool. */ void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, boolean_t scrub_done, boolean_t rebuild_done) { spa_t *spa = vd->vdev_spa; avl_tree_t reftree; int minref; ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); for (int c = 0; c < vd->vdev_children; c++) vdev_dtl_reassess(vd->vdev_child[c], txg, scrub_txg, scrub_done, rebuild_done); if (vd == spa->spa_root_vdev || !vdev_is_concrete(vd) || vd->vdev_aux) return; if (vd->vdev_ops->vdev_op_leaf) { dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; vdev_rebuild_t *vr = &vd->vdev_top->vdev_rebuild_config; boolean_t check_excise = B_FALSE; boolean_t wasempty = B_TRUE; mutex_enter(&vd->vdev_dtl_lock); /* * If requested, pretend the scan or rebuild completed cleanly. */ if (zfs_scan_ignore_errors) { if (scn != NULL) scn->scn_phys.scn_errors = 0; if (vr != NULL) vr->vr_rebuild_phys.vrp_errors = 0; } if (scrub_txg != 0 && !range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) { wasempty = B_FALSE; zfs_dbgmsg("guid:%llu txg:%llu scrub:%llu started:%d " "dtl:%llu/%llu errors:%llu", (u_longlong_t)vd->vdev_guid, (u_longlong_t)txg, (u_longlong_t)scrub_txg, spa->spa_scrub_started, (u_longlong_t)vdev_dtl_min(vd), (u_longlong_t)vdev_dtl_max(vd), (u_longlong_t)(scn ? scn->scn_phys.scn_errors : 0)); } /* * If we've completed a scrub/resilver or a rebuild cleanly * then determine if this vdev should remove any DTLs. We * only want to excise regions on vdevs that were available * during the entire duration of this scan. */ if (rebuild_done && vr != NULL && vr->vr_rebuild_phys.vrp_errors == 0) { check_excise = B_TRUE; } else { if (spa->spa_scrub_started || (scn != NULL && scn->scn_phys.scn_errors == 0)) { check_excise = B_TRUE; } } if (scrub_txg && check_excise && vdev_dtl_should_excise(vd, rebuild_done)) { /* * We completed a scrub, resilver or rebuild up to * scrub_txg. If we did it without rebooting, then * the scrub dtl will be valid, so excise the old * region and fold in the scrub dtl. Otherwise, * leave the dtl as-is if there was an error. * * There's little trick here: to excise the beginning * of the DTL_MISSING map, we put it into a reference * tree and then add a segment with refcnt -1 that * covers the range [0, scrub_txg). This means * that each txg in that range has refcnt -1 or 0. * We then add DTL_SCRUB with a refcnt of 2, so that * entries in the range [0, scrub_txg) will have a * positive refcnt -- either 1 or 2. We then convert * the reference tree into the new DTL_MISSING map. */ space_reftree_create(&reftree); space_reftree_add_map(&reftree, vd->vdev_dtl[DTL_MISSING], 1); space_reftree_add_seg(&reftree, 0, scrub_txg, -1); space_reftree_add_map(&reftree, vd->vdev_dtl[DTL_SCRUB], 2); space_reftree_generate_map(&reftree, vd->vdev_dtl[DTL_MISSING], 1); space_reftree_destroy(&reftree); if (!range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) { zfs_dbgmsg("update DTL_MISSING:%llu/%llu", (u_longlong_t)vdev_dtl_min(vd), (u_longlong_t)vdev_dtl_max(vd)); } else if (!wasempty) { zfs_dbgmsg("DTL_MISSING is now empty"); } } range_tree_vacate(vd->vdev_dtl[DTL_PARTIAL], NULL, NULL); range_tree_walk(vd->vdev_dtl[DTL_MISSING], range_tree_add, vd->vdev_dtl[DTL_PARTIAL]); if (scrub_done) range_tree_vacate(vd->vdev_dtl[DTL_SCRUB], NULL, NULL); range_tree_vacate(vd->vdev_dtl[DTL_OUTAGE], NULL, NULL); if (!vdev_readable(vd)) range_tree_add(vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL); else range_tree_walk(vd->vdev_dtl[DTL_MISSING], range_tree_add, vd->vdev_dtl[DTL_OUTAGE]); /* * If the vdev was resilvering or rebuilding and no longer * has any DTLs then reset the appropriate flag and dirty * the top level so that we persist the change. */ if (txg != 0 && range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) && range_tree_is_empty(vd->vdev_dtl[DTL_OUTAGE])) { if (vd->vdev_rebuild_txg != 0) { vd->vdev_rebuild_txg = 0; vdev_config_dirty(vd->vdev_top); } else if (vd->vdev_resilver_txg != 0) { vd->vdev_resilver_txg = 0; vdev_config_dirty(vd->vdev_top); } } mutex_exit(&vd->vdev_dtl_lock); if (txg != 0) vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg); return; } mutex_enter(&vd->vdev_dtl_lock); for (int t = 0; t < DTL_TYPES; t++) { /* account for child's outage in parent's missing map */ int s = (t == DTL_MISSING) ? DTL_OUTAGE: t; if (t == DTL_SCRUB) continue; /* leaf vdevs only */ if (t == DTL_PARTIAL) minref = 1; /* i.e. non-zero */ else if (vdev_get_nparity(vd) != 0) minref = vdev_get_nparity(vd) + 1; /* RAID-Z, dRAID */ else minref = vd->vdev_children; /* any kind of mirror */ space_reftree_create(&reftree); for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; mutex_enter(&cvd->vdev_dtl_lock); space_reftree_add_map(&reftree, cvd->vdev_dtl[s], 1); mutex_exit(&cvd->vdev_dtl_lock); } space_reftree_generate_map(&reftree, vd->vdev_dtl[t], minref); space_reftree_destroy(&reftree); } mutex_exit(&vd->vdev_dtl_lock); } /* * Iterate over all the vdevs except spare, and post kobj events */ void vdev_post_kobj_evt(vdev_t *vd) { if (vd->vdev_ops->vdev_op_kobj_evt_post && vd->vdev_kobj_flag == B_FALSE) { vd->vdev_kobj_flag = B_TRUE; vd->vdev_ops->vdev_op_kobj_evt_post(vd); } for (int c = 0; c < vd->vdev_children; c++) vdev_post_kobj_evt(vd->vdev_child[c]); } /* * Iterate over all the vdevs except spare, and clear kobj events */ void vdev_clear_kobj_evt(vdev_t *vd) { vd->vdev_kobj_flag = B_FALSE; for (int c = 0; c < vd->vdev_children; c++) vdev_clear_kobj_evt(vd->vdev_child[c]); } int vdev_dtl_load(vdev_t *vd) { spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; range_tree_t *rt; int error = 0; if (vd->vdev_ops->vdev_op_leaf && vd->vdev_dtl_object != 0) { ASSERT(vdev_is_concrete(vd)); /* * If the dtl cannot be sync'd there is no need to open it. */ if (spa->spa_mode == SPA_MODE_READ && !spa->spa_read_spacemaps) return (0); error = space_map_open(&vd->vdev_dtl_sm, mos, vd->vdev_dtl_object, 0, -1ULL, 0); if (error) return (error); ASSERT(vd->vdev_dtl_sm != NULL); rt = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); error = space_map_load(vd->vdev_dtl_sm, rt, SM_ALLOC); if (error == 0) { mutex_enter(&vd->vdev_dtl_lock); range_tree_walk(rt, range_tree_add, vd->vdev_dtl[DTL_MISSING]); mutex_exit(&vd->vdev_dtl_lock); } range_tree_vacate(rt, NULL, NULL); range_tree_destroy(rt); return (error); } for (int c = 0; c < vd->vdev_children; c++) { error = vdev_dtl_load(vd->vdev_child[c]); if (error != 0) break; } return (error); } static void vdev_zap_allocation_data(vdev_t *vd, dmu_tx_t *tx) { spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; vdev_alloc_bias_t alloc_bias = vd->vdev_alloc_bias; const char *string; ASSERT(alloc_bias != VDEV_BIAS_NONE); string = (alloc_bias == VDEV_BIAS_LOG) ? VDEV_ALLOC_BIAS_LOG : (alloc_bias == VDEV_BIAS_SPECIAL) ? VDEV_ALLOC_BIAS_SPECIAL : (alloc_bias == VDEV_BIAS_DEDUP) ? VDEV_ALLOC_BIAS_DEDUP : NULL; ASSERT(string != NULL); VERIFY0(zap_add(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_ALLOCATION_BIAS, 1, strlen(string) + 1, string, tx)); if (alloc_bias == VDEV_BIAS_SPECIAL || alloc_bias == VDEV_BIAS_DEDUP) { spa_activate_allocation_classes(spa, tx); } } void vdev_destroy_unlink_zap(vdev_t *vd, uint64_t zapobj, dmu_tx_t *tx) { spa_t *spa = vd->vdev_spa; VERIFY0(zap_destroy(spa->spa_meta_objset, zapobj, tx)); VERIFY0(zap_remove_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, zapobj, tx)); } uint64_t vdev_create_link_zap(vdev_t *vd, dmu_tx_t *tx) { spa_t *spa = vd->vdev_spa; uint64_t zap = zap_create(spa->spa_meta_objset, DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx); ASSERT(zap != 0); VERIFY0(zap_add_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, zap, tx)); return (zap); } void vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx) { if (vd->vdev_ops != &vdev_hole_ops && vd->vdev_ops != &vdev_missing_ops && vd->vdev_ops != &vdev_root_ops && !vd->vdev_top->vdev_removing) { if (vd->vdev_ops->vdev_op_leaf && vd->vdev_leaf_zap == 0) { vd->vdev_leaf_zap = vdev_create_link_zap(vd, tx); } if (vd == vd->vdev_top && vd->vdev_top_zap == 0) { vd->vdev_top_zap = vdev_create_link_zap(vd, tx); if (vd->vdev_alloc_bias != VDEV_BIAS_NONE) vdev_zap_allocation_data(vd, tx); } } if (vd->vdev_ops == &vdev_root_ops && vd->vdev_root_zap == 0 && spa_feature_is_enabled(vd->vdev_spa, SPA_FEATURE_AVZ_V2)) { if (!spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_AVZ_V2)) spa_feature_incr(vd->vdev_spa, SPA_FEATURE_AVZ_V2, tx); vd->vdev_root_zap = vdev_create_link_zap(vd, tx); } for (uint64_t i = 0; i < vd->vdev_children; i++) { vdev_construct_zaps(vd->vdev_child[i], tx); } } static void vdev_dtl_sync(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; range_tree_t *rt = vd->vdev_dtl[DTL_MISSING]; objset_t *mos = spa->spa_meta_objset; range_tree_t *rtsync; dmu_tx_t *tx; uint64_t object = space_map_object(vd->vdev_dtl_sm); ASSERT(vdev_is_concrete(vd)); ASSERT(vd->vdev_ops->vdev_op_leaf); tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); if (vd->vdev_detached || vd->vdev_top->vdev_removing) { mutex_enter(&vd->vdev_dtl_lock); space_map_free(vd->vdev_dtl_sm, tx); space_map_close(vd->vdev_dtl_sm); vd->vdev_dtl_sm = NULL; mutex_exit(&vd->vdev_dtl_lock); /* * We only destroy the leaf ZAP for detached leaves or for * removed log devices. Removed data devices handle leaf ZAP * cleanup later, once cancellation is no longer possible. */ if (vd->vdev_leaf_zap != 0 && (vd->vdev_detached || vd->vdev_top->vdev_islog)) { vdev_destroy_unlink_zap(vd, vd->vdev_leaf_zap, tx); vd->vdev_leaf_zap = 0; } dmu_tx_commit(tx); return; } if (vd->vdev_dtl_sm == NULL) { uint64_t new_object; new_object = space_map_alloc(mos, zfs_vdev_dtl_sm_blksz, tx); VERIFY3U(new_object, !=, 0); VERIFY0(space_map_open(&vd->vdev_dtl_sm, mos, new_object, 0, -1ULL, 0)); ASSERT(vd->vdev_dtl_sm != NULL); } rtsync = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); mutex_enter(&vd->vdev_dtl_lock); range_tree_walk(rt, range_tree_add, rtsync); mutex_exit(&vd->vdev_dtl_lock); space_map_truncate(vd->vdev_dtl_sm, zfs_vdev_dtl_sm_blksz, tx); space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, SM_NO_VDEVID, tx); range_tree_vacate(rtsync, NULL, NULL); range_tree_destroy(rtsync); /* * If the object for the space map has changed then dirty * the top level so that we update the config. */ if (object != space_map_object(vd->vdev_dtl_sm)) { vdev_dbgmsg(vd, "txg %llu, spa %s, DTL old object %llu, " "new object %llu", (u_longlong_t)txg, spa_name(spa), (u_longlong_t)object, (u_longlong_t)space_map_object(vd->vdev_dtl_sm)); vdev_config_dirty(vd->vdev_top); } dmu_tx_commit(tx); } /* * Determine whether the specified vdev can be offlined/detached/removed * without losing data. */ boolean_t vdev_dtl_required(vdev_t *vd) { spa_t *spa = vd->vdev_spa; vdev_t *tvd = vd->vdev_top; uint8_t cant_read = vd->vdev_cant_read; boolean_t required; ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); if (vd == spa->spa_root_vdev || vd == tvd) return (B_TRUE); /* * Temporarily mark the device as unreadable, and then determine * whether this results in any DTL outages in the top-level vdev. * If not, we can safely offline/detach/remove the device. */ vd->vdev_cant_read = B_TRUE; vdev_dtl_reassess(tvd, 0, 0, B_FALSE, B_FALSE); required = !vdev_dtl_empty(tvd, DTL_OUTAGE); vd->vdev_cant_read = cant_read; vdev_dtl_reassess(tvd, 0, 0, B_FALSE, B_FALSE); if (!required && zio_injection_enabled) { required = !!zio_handle_device_injection(vd, NULL, SET_ERROR(ECHILD)); } return (required); } /* * Determine if resilver is needed, and if so the txg range. */ boolean_t vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp) { boolean_t needed = B_FALSE; uint64_t thismin = UINT64_MAX; uint64_t thismax = 0; if (vd->vdev_children == 0) { mutex_enter(&vd->vdev_dtl_lock); if (!range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) && vdev_writeable(vd)) { thismin = vdev_dtl_min(vd); thismax = vdev_dtl_max(vd); needed = B_TRUE; } mutex_exit(&vd->vdev_dtl_lock); } else { for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; uint64_t cmin, cmax; if (vdev_resilver_needed(cvd, &cmin, &cmax)) { thismin = MIN(thismin, cmin); thismax = MAX(thismax, cmax); needed = B_TRUE; } } } if (needed && minp) { *minp = thismin; *maxp = thismax; } return (needed); } /* * Gets the checkpoint space map object from the vdev's ZAP. On success sm_obj * will contain either the checkpoint spacemap object or zero if none exists. * All other errors are returned to the caller. */ int vdev_checkpoint_sm_object(vdev_t *vd, uint64_t *sm_obj) { ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER)); if (vd->vdev_top_zap == 0) { *sm_obj = 0; return (0); } int error = zap_lookup(spa_meta_objset(vd->vdev_spa), vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1, sm_obj); if (error == ENOENT) { *sm_obj = 0; error = 0; } return (error); } int vdev_load(vdev_t *vd) { int children = vd->vdev_children; int error = 0; taskq_t *tq = NULL; /* * It's only worthwhile to use the taskq for the root vdev, because the * slow part is metaslab_init, and that only happens for top-level * vdevs. */ if (vd->vdev_ops == &vdev_root_ops && vd->vdev_children > 0) { tq = taskq_create("vdev_load", children, minclsyspri, children, children, TASKQ_PREPOPULATE); } /* * Recursively load all children. */ for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; if (tq == NULL || vdev_uses_zvols(cvd)) { cvd->vdev_load_error = vdev_load(cvd); } else { VERIFY(taskq_dispatch(tq, vdev_load_child, cvd, TQ_SLEEP) != TASKQID_INVALID); } } if (tq != NULL) { taskq_wait(tq); taskq_destroy(tq); } for (int c = 0; c < vd->vdev_children; c++) { int error = vd->vdev_child[c]->vdev_load_error; if (error != 0) return (error); } vdev_set_deflate_ratio(vd); /* * On spa_load path, grab the allocation bias from our zap */ if (vd == vd->vdev_top && vd->vdev_top_zap != 0) { spa_t *spa = vd->vdev_spa; char bias_str[64]; error = zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_ALLOCATION_BIAS, 1, sizeof (bias_str), bias_str); if (error == 0) { ASSERT(vd->vdev_alloc_bias == VDEV_BIAS_NONE); vd->vdev_alloc_bias = vdev_derive_alloc_bias(bias_str); } else if (error != ENOENT) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); vdev_dbgmsg(vd, "vdev_load: zap_lookup(top_zap=%llu) " "failed [error=%d]", (u_longlong_t)vd->vdev_top_zap, error); return (error); } } if (vd == vd->vdev_top && vd->vdev_top_zap != 0) { spa_t *spa = vd->vdev_spa; uint64_t failfast; error = zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap, vdev_prop_to_name(VDEV_PROP_FAILFAST), sizeof (failfast), 1, &failfast); if (error == 0) { vd->vdev_failfast = failfast & 1; } else if (error == ENOENT) { vd->vdev_failfast = vdev_prop_default_numeric( VDEV_PROP_FAILFAST); } else { vdev_dbgmsg(vd, "vdev_load: zap_lookup(top_zap=%llu) " "failed [error=%d]", (u_longlong_t)vd->vdev_top_zap, error); } } /* * Load any rebuild state from the top-level vdev zap. */ if (vd == vd->vdev_top && vd->vdev_top_zap != 0) { error = vdev_rebuild_load(vd); if (error && error != ENOTSUP) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); vdev_dbgmsg(vd, "vdev_load: vdev_rebuild_load " "failed [error=%d]", error); return (error); } } if (vd->vdev_top_zap != 0 || vd->vdev_leaf_zap != 0) { uint64_t zapobj; if (vd->vdev_top_zap != 0) zapobj = vd->vdev_top_zap; else zapobj = vd->vdev_leaf_zap; error = vdev_prop_get_int(vd, VDEV_PROP_CHECKSUM_N, &vd->vdev_checksum_n); if (error && error != ENOENT) vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) " "failed [error=%d]", (u_longlong_t)zapobj, error); error = vdev_prop_get_int(vd, VDEV_PROP_CHECKSUM_T, &vd->vdev_checksum_t); if (error && error != ENOENT) vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) " "failed [error=%d]", (u_longlong_t)zapobj, error); error = vdev_prop_get_int(vd, VDEV_PROP_IO_N, &vd->vdev_io_n); if (error && error != ENOENT) vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) " "failed [error=%d]", (u_longlong_t)zapobj, error); error = vdev_prop_get_int(vd, VDEV_PROP_IO_T, &vd->vdev_io_t); if (error && error != ENOENT) vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) " "failed [error=%d]", (u_longlong_t)zapobj, error); } /* * If this is a top-level vdev, initialize its metaslabs. */ if (vd == vd->vdev_top && vdev_is_concrete(vd)) { vdev_metaslab_group_create(vd); if (vd->vdev_ashift == 0 || vd->vdev_asize == 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); vdev_dbgmsg(vd, "vdev_load: invalid size. ashift=%llu, " "asize=%llu", (u_longlong_t)vd->vdev_ashift, (u_longlong_t)vd->vdev_asize); return (SET_ERROR(ENXIO)); } error = vdev_metaslab_init(vd, 0); if (error != 0) { vdev_dbgmsg(vd, "vdev_load: metaslab_init failed " "[error=%d]", error); vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); return (error); } uint64_t checkpoint_sm_obj; error = vdev_checkpoint_sm_object(vd, &checkpoint_sm_obj); if (error == 0 && checkpoint_sm_obj != 0) { objset_t *mos = spa_meta_objset(vd->vdev_spa); ASSERT(vd->vdev_asize != 0); ASSERT3P(vd->vdev_checkpoint_sm, ==, NULL); error = space_map_open(&vd->vdev_checkpoint_sm, mos, checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift); if (error != 0) { vdev_dbgmsg(vd, "vdev_load: space_map_open " "failed for checkpoint spacemap (obj %llu) " "[error=%d]", (u_longlong_t)checkpoint_sm_obj, error); return (error); } ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); /* * Since the checkpoint_sm contains free entries * exclusively we can use space_map_allocated() to * indicate the cumulative checkpointed space that * has been freed. */ vd->vdev_stat.vs_checkpoint_space = -space_map_allocated(vd->vdev_checkpoint_sm); vd->vdev_spa->spa_checkpoint_info.sci_dspace += vd->vdev_stat.vs_checkpoint_space; } else if (error != 0) { vdev_dbgmsg(vd, "vdev_load: failed to retrieve " "checkpoint space map object from vdev ZAP " "[error=%d]", error); return (error); } } /* * If this is a leaf vdev, load its DTL. */ if (vd->vdev_ops->vdev_op_leaf && (error = vdev_dtl_load(vd)) != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); vdev_dbgmsg(vd, "vdev_load: vdev_dtl_load failed " "[error=%d]", error); return (error); } uint64_t obsolete_sm_object; error = vdev_obsolete_sm_object(vd, &obsolete_sm_object); if (error == 0 && obsolete_sm_object != 0) { objset_t *mos = vd->vdev_spa->spa_meta_objset; ASSERT(vd->vdev_asize != 0); ASSERT3P(vd->vdev_obsolete_sm, ==, NULL); if ((error = space_map_open(&vd->vdev_obsolete_sm, mos, obsolete_sm_object, 0, vd->vdev_asize, 0))) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); vdev_dbgmsg(vd, "vdev_load: space_map_open failed for " "obsolete spacemap (obj %llu) [error=%d]", (u_longlong_t)obsolete_sm_object, error); return (error); } } else if (error != 0) { vdev_dbgmsg(vd, "vdev_load: failed to retrieve obsolete " "space map object from vdev ZAP [error=%d]", error); return (error); } return (0); } /* * The special vdev case is used for hot spares and l2cache devices. Its * sole purpose it to set the vdev state for the associated vdev. To do this, * we make sure that we can open the underlying device, then try to read the * label, and make sure that the label is sane and that it hasn't been * repurposed to another pool. */ int vdev_validate_aux(vdev_t *vd) { nvlist_t *label; uint64_t guid, version; uint64_t state; if (!vdev_readable(vd)) return (0); if ((label = vdev_label_read_config(vd, -1ULL)) == NULL) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); return (-1); } if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 || !SPA_VERSION_IS_SUPPORTED(version) || nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 || guid != vd->vdev_guid || nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); return (-1); } /* * We don't actually check the pool state here. If it's in fact in * use by another pool, we update this fact on the fly when requested. */ nvlist_free(label); return (0); } static void vdev_destroy_ms_flush_data(vdev_t *vd, dmu_tx_t *tx) { objset_t *mos = spa_meta_objset(vd->vdev_spa); if (vd->vdev_top_zap == 0) return; uint64_t object = 0; int err = zap_lookup(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1, &object); if (err == ENOENT) return; VERIFY0(err); VERIFY0(dmu_object_free(mos, object, tx)); VERIFY0(zap_remove(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, tx)); } /* * Free the objects used to store this vdev's spacemaps, and the array * that points to them. */ void vdev_destroy_spacemaps(vdev_t *vd, dmu_tx_t *tx) { if (vd->vdev_ms_array == 0) return; objset_t *mos = vd->vdev_spa->spa_meta_objset; uint64_t array_count = vd->vdev_asize >> vd->vdev_ms_shift; size_t array_bytes = array_count * sizeof (uint64_t); uint64_t *smobj_array = kmem_alloc(array_bytes, KM_SLEEP); VERIFY0(dmu_read(mos, vd->vdev_ms_array, 0, array_bytes, smobj_array, 0)); for (uint64_t i = 0; i < array_count; i++) { uint64_t smobj = smobj_array[i]; if (smobj == 0) continue; space_map_free_obj(mos, smobj, tx); } kmem_free(smobj_array, array_bytes); VERIFY0(dmu_object_free(mos, vd->vdev_ms_array, tx)); vdev_destroy_ms_flush_data(vd, tx); vd->vdev_ms_array = 0; } static void vdev_remove_empty_log(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; ASSERT(vd->vdev_islog); ASSERT(vd == vd->vdev_top); ASSERT3U(txg, ==, spa_syncing_txg(spa)); dmu_tx_t *tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); vdev_destroy_spacemaps(vd, tx); if (vd->vdev_top_zap != 0) { vdev_destroy_unlink_zap(vd, vd->vdev_top_zap, tx); vd->vdev_top_zap = 0; } dmu_tx_commit(tx); } void vdev_sync_done(vdev_t *vd, uint64_t txg) { metaslab_t *msp; boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg)); ASSERT(vdev_is_concrete(vd)); while ((msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg))) != NULL) metaslab_sync_done(msp, txg); if (reassess) { metaslab_sync_reassess(vd->vdev_mg); if (vd->vdev_log_mg != NULL) metaslab_sync_reassess(vd->vdev_log_mg); } } void vdev_sync(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; vdev_t *lvd; metaslab_t *msp; ASSERT3U(txg, ==, spa->spa_syncing_txg); dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); if (range_tree_space(vd->vdev_obsolete_segments) > 0) { ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops); vdev_indirect_sync_obsolete(vd, tx); /* * If the vdev is indirect, it can't have dirty * metaslabs or DTLs. */ if (vd->vdev_ops == &vdev_indirect_ops) { ASSERT(txg_list_empty(&vd->vdev_ms_list, txg)); ASSERT(txg_list_empty(&vd->vdev_dtl_list, txg)); dmu_tx_commit(tx); return; } } ASSERT(vdev_is_concrete(vd)); if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0 && !vd->vdev_removing) { ASSERT(vd == vd->vdev_top); ASSERT0(vd->vdev_indirect_config.vic_mapping_object); vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset, DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx); ASSERT(vd->vdev_ms_array != 0); vdev_config_dirty(vd); } while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) { metaslab_sync(msp, txg); (void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg)); } while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL) vdev_dtl_sync(lvd, txg); /* * If this is an empty log device being removed, destroy the * metadata associated with it. */ if (vd->vdev_islog && vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing) vdev_remove_empty_log(vd, txg); (void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg)); dmu_tx_commit(tx); } uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize) { return (vd->vdev_ops->vdev_op_asize(vd, psize)); } /* * Mark the given vdev faulted. A faulted vdev behaves as if the device could * not be opened, and no I/O is attempted. */ int vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux) { vdev_t *vd, *tvd; spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP))); tvd = vd->vdev_top; /* * If user did a 'zpool offline -f' then make the fault persist across * reboots. */ if (aux == VDEV_AUX_EXTERNAL_PERSIST) { /* * There are two kinds of forced faults: temporary and * persistent. Temporary faults go away at pool import, while * persistent faults stay set. Both types of faults can be * cleared with a zpool clear. * * We tell if a vdev is persistently faulted by looking at the * ZPOOL_CONFIG_AUX_STATE nvpair. If it's set to "external" at * import then it's a persistent fault. Otherwise, it's * temporary. We get ZPOOL_CONFIG_AUX_STATE set to "external" * by setting vd.vdev_stat.vs_aux to VDEV_AUX_EXTERNAL. This * tells vdev_config_generate() (which gets run later) to set * ZPOOL_CONFIG_AUX_STATE to "external" in the nvlist. */ vd->vdev_stat.vs_aux = VDEV_AUX_EXTERNAL; vd->vdev_tmpoffline = B_FALSE; aux = VDEV_AUX_EXTERNAL; } else { vd->vdev_tmpoffline = B_TRUE; } /* * We don't directly use the aux state here, but if we do a * vdev_reopen(), we need this value to be present to remember why we * were faulted. */ vd->vdev_label_aux = aux; /* * Faulted state takes precedence over degraded. */ vd->vdev_delayed_close = B_FALSE; vd->vdev_faulted = 1ULL; vd->vdev_degraded = 0ULL; vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux); /* * If this device has the only valid copy of the data, then * back off and simply mark the vdev as degraded instead. */ if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) { vd->vdev_degraded = 1ULL; vd->vdev_faulted = 0ULL; /* * If we reopen the device and it's not dead, only then do we * mark it degraded. */ vdev_reopen(tvd); if (vdev_readable(vd)) vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux); } return (spa_vdev_state_exit(spa, vd, 0)); } /* * Mark the given vdev degraded. A degraded vdev is purely an indication to the * user that something is wrong. The vdev continues to operate as normal as far * as I/O is concerned. */ int vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux) { vdev_t *vd; spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP))); /* * If the vdev is already faulted, then don't do anything. */ if (vd->vdev_faulted || vd->vdev_degraded) return (spa_vdev_state_exit(spa, NULL, 0)); vd->vdev_degraded = 1ULL; if (!vdev_is_dead(vd)) vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux); return (spa_vdev_state_exit(spa, vd, 0)); } int vdev_remove_wanted(spa_t *spa, uint64_t guid) { vdev_t *vd; spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); /* * If the vdev is already removed, or expanding which can trigger * repartition add/remove events, then don't do anything. */ if (vd->vdev_removed || vd->vdev_expanding) return (spa_vdev_state_exit(spa, NULL, 0)); /* * Confirm the vdev has been removed, otherwise don't do anything. */ if (vd->vdev_ops->vdev_op_leaf && !zio_wait(vdev_probe(vd, NULL))) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(EEXIST))); vd->vdev_remove_wanted = B_TRUE; spa_async_request(spa, SPA_ASYNC_REMOVE); return (spa_vdev_state_exit(spa, vd, 0)); } /* * Online the given vdev. * * If 'ZFS_ONLINE_UNSPARE' is set, it implies two things. First, any attached * spare device should be detached when the device finishes resilvering. * Second, the online should be treated like a 'test' online case, so no FMA * events are generated if the device fails to open. */ int vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) { vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev; boolean_t wasoffline; vdev_state_t oldstate; spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP))); wasoffline = (vd->vdev_offline || vd->vdev_tmpoffline); oldstate = vd->vdev_state; tvd = vd->vdev_top; vd->vdev_offline = B_FALSE; vd->vdev_tmpoffline = B_FALSE; vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE); vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT); /* XXX - L2ARC 1.0 does not support expansion */ if (!vd->vdev_aux) { for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) pvd->vdev_expanding = !!((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand); vd->vdev_expansion_time = gethrestime_sec(); } vdev_reopen(tvd); vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE; if (!vd->vdev_aux) { for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) pvd->vdev_expanding = B_FALSE; } if (newstate) *newstate = vd->vdev_state; if ((flags & ZFS_ONLINE_UNSPARE) && !vdev_is_dead(vd) && vd->vdev_parent && vd->vdev_parent->vdev_ops == &vdev_spare_ops && vd->vdev_parent->vdev_child[0] == vd) vd->vdev_unspare = B_TRUE; if ((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand) { /* XXX - L2ARC 1.0 does not support expansion */ if (vd->vdev_aux) return (spa_vdev_state_exit(spa, vd, ENOTSUP)); spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); } /* Restart initializing if necessary */ mutex_enter(&vd->vdev_initialize_lock); if (vdev_writeable(vd) && vd->vdev_initialize_thread == NULL && vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) { (void) vdev_initialize(vd); } mutex_exit(&vd->vdev_initialize_lock); /* * Restart trimming if necessary. We do not restart trimming for cache * devices here. This is triggered by l2arc_rebuild_vdev() * asynchronously for the whole device or in l2arc_evict() as it evicts * space for upcoming writes. */ mutex_enter(&vd->vdev_trim_lock); if (vdev_writeable(vd) && !vd->vdev_isl2cache && vd->vdev_trim_thread == NULL && vd->vdev_trim_state == VDEV_TRIM_ACTIVE) { (void) vdev_trim(vd, vd->vdev_trim_rate, vd->vdev_trim_partial, vd->vdev_trim_secure); } mutex_exit(&vd->vdev_trim_lock); if (wasoffline || (oldstate < VDEV_STATE_DEGRADED && vd->vdev_state >= VDEV_STATE_DEGRADED)) { spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_ONLINE); /* * Asynchronously detach spare vdev if resilver or * rebuild is not required */ if (vd->vdev_unspare && !dsl_scan_resilvering(spa->spa_dsl_pool) && !dsl_scan_resilver_scheduled(spa->spa_dsl_pool) && !vdev_rebuild_active(tvd)) spa_async_request(spa, SPA_ASYNC_DETACH_SPARE); } return (spa_vdev_state_exit(spa, vd, 0)); } static int vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags) { vdev_t *vd, *tvd; int error = 0; uint64_t generation; metaslab_group_t *mg; top: spa_vdev_state_enter(spa, SCL_ALLOC); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP))); if (vd->vdev_ops == &vdev_draid_spare_ops) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); tvd = vd->vdev_top; mg = tvd->vdev_mg; generation = spa->spa_config_generation + 1; /* * If the device isn't already offline, try to offline it. */ if (!vd->vdev_offline) { /* * If this device has the only valid copy of some data, * don't allow it to be offlined. Log devices are always * expendable. */ if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(EBUSY))); /* * If the top-level is a slog and it has had allocations * then proceed. We check that the vdev's metaslab group * is not NULL since it's possible that we may have just * added this vdev but not yet initialized its metaslabs. */ if (tvd->vdev_islog && mg != NULL) { /* * Prevent any future allocations. */ ASSERT3P(tvd->vdev_log_mg, ==, NULL); metaslab_group_passivate(mg); (void) spa_vdev_state_exit(spa, vd, 0); error = spa_reset_logs(spa); /* * If the log device was successfully reset but has * checkpointed data, do not offline it. */ if (error == 0 && tvd->vdev_checkpoint_sm != NULL) { ASSERT3U(space_map_allocated( tvd->vdev_checkpoint_sm), !=, 0); error = ZFS_ERR_CHECKPOINT_EXISTS; } spa_vdev_state_enter(spa, SCL_ALLOC); /* * Check to see if the config has changed. */ if (error || generation != spa->spa_config_generation) { metaslab_group_activate(mg); if (error) return (spa_vdev_state_exit(spa, vd, error)); (void) spa_vdev_state_exit(spa, vd, 0); goto top; } ASSERT0(tvd->vdev_stat.vs_alloc); } /* * Offline this device and reopen its top-level vdev. * If the top-level vdev is a log device then just offline * it. Otherwise, if this action results in the top-level * vdev becoming unusable, undo it and fail the request. */ vd->vdev_offline = B_TRUE; vdev_reopen(tvd); if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_is_dead(tvd)) { vd->vdev_offline = B_FALSE; vdev_reopen(tvd); return (spa_vdev_state_exit(spa, NULL, SET_ERROR(EBUSY))); } /* * Add the device back into the metaslab rotor so that * once we online the device it's open for business. */ if (tvd->vdev_islog && mg != NULL) metaslab_group_activate(mg); } vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY); return (spa_vdev_state_exit(spa, vd, 0)); } int vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) { int error; mutex_enter(&spa->spa_vdev_top_lock); error = vdev_offline_locked(spa, guid, flags); mutex_exit(&spa->spa_vdev_top_lock); return (error); } /* * Clear the error counts associated with this vdev. Unlike vdev_online() and * vdev_offline(), we assume the spa config is locked. We also clear all * children. If 'vd' is NULL, then the user wants to clear all vdevs. */ void vdev_clear(spa_t *spa, vdev_t *vd) { vdev_t *rvd = spa->spa_root_vdev; ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); if (vd == NULL) vd = rvd; vd->vdev_stat.vs_read_errors = 0; vd->vdev_stat.vs_write_errors = 0; vd->vdev_stat.vs_checksum_errors = 0; vd->vdev_stat.vs_slow_ios = 0; for (int c = 0; c < vd->vdev_children; c++) vdev_clear(spa, vd->vdev_child[c]); /* * It makes no sense to "clear" an indirect or removed vdev. */ if (!vdev_is_concrete(vd) || vd->vdev_removed) return; /* * If we're in the FAULTED state or have experienced failed I/O, then * clear the persistent state and attempt to reopen the device. We * also mark the vdev config dirty, so that the new faulted state is * written out to disk. */ if (vd->vdev_faulted || vd->vdev_degraded || !vdev_readable(vd) || !vdev_writeable(vd)) { /* * When reopening in response to a clear event, it may be due to * a fmadm repair request. In this case, if the device is * still broken, we want to still post the ereport again. */ vd->vdev_forcefault = B_TRUE; vd->vdev_faulted = vd->vdev_degraded = 0ULL; vd->vdev_cant_read = B_FALSE; vd->vdev_cant_write = B_FALSE; vd->vdev_stat.vs_aux = 0; vdev_reopen(vd == rvd ? rvd : vd->vdev_top); vd->vdev_forcefault = B_FALSE; if (vd != rvd && vdev_writeable(vd->vdev_top)) vdev_state_dirty(vd->vdev_top); /* If a resilver isn't required, check if vdevs can be culled */ if (vd->vdev_aux == NULL && !vdev_is_dead(vd) && !dsl_scan_resilvering(spa->spa_dsl_pool) && !dsl_scan_resilver_scheduled(spa->spa_dsl_pool)) spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_CLEAR); } /* * When clearing a FMA-diagnosed fault, we always want to * unspare the device, as we assume that the original spare was * done in response to the FMA fault. */ if (!vdev_is_dead(vd) && vd->vdev_parent != NULL && vd->vdev_parent->vdev_ops == &vdev_spare_ops && vd->vdev_parent->vdev_child[0] == vd) vd->vdev_unspare = B_TRUE; /* Clear recent error events cache (i.e. duplicate events tracking) */ zfs_ereport_clear(spa, vd); } boolean_t vdev_is_dead(vdev_t *vd) { /* * Holes and missing devices are always considered "dead". * This simplifies the code since we don't have to check for * these types of devices in the various code paths. * Instead we rely on the fact that we skip over dead devices * before issuing I/O to them. */ return (vd->vdev_state < VDEV_STATE_DEGRADED || vd->vdev_ops == &vdev_hole_ops || vd->vdev_ops == &vdev_missing_ops); } boolean_t vdev_readable(vdev_t *vd) { return (!vdev_is_dead(vd) && !vd->vdev_cant_read); } boolean_t vdev_writeable(vdev_t *vd) { return (!vdev_is_dead(vd) && !vd->vdev_cant_write && vdev_is_concrete(vd)); } boolean_t vdev_allocatable(vdev_t *vd) { uint64_t state = vd->vdev_state; /* * We currently allow allocations from vdevs which may be in the * process of reopening (i.e. VDEV_STATE_CLOSED). If the device * fails to reopen then we'll catch it later when we're holding * the proper locks. Note that we have to get the vdev state * in a local variable because although it changes atomically, * we're asking two separate questions about it. */ return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) && !vd->vdev_cant_write && vdev_is_concrete(vd) && vd->vdev_mg->mg_initialized); } boolean_t vdev_accessible(vdev_t *vd, zio_t *zio) { ASSERT(zio->io_vd == vd); if (vdev_is_dead(vd) || vd->vdev_remove_wanted) return (B_FALSE); if (zio->io_type == ZIO_TYPE_READ) return (!vd->vdev_cant_read); if (zio->io_type == ZIO_TYPE_WRITE) return (!vd->vdev_cant_write); return (B_TRUE); } static void vdev_get_child_stat(vdev_t *cvd, vdev_stat_t *vs, vdev_stat_t *cvs) { /* * Exclude the dRAID spare when aggregating to avoid double counting * the ops and bytes. These IOs are counted by the physical leaves. */ if (cvd->vdev_ops == &vdev_draid_spare_ops) return; for (int t = 0; t < VS_ZIO_TYPES; t++) { vs->vs_ops[t] += cvs->vs_ops[t]; vs->vs_bytes[t] += cvs->vs_bytes[t]; } cvs->vs_scan_removing = cvd->vdev_removing; } /* * Get extended stats */ static void vdev_get_child_stat_ex(vdev_t *cvd, vdev_stat_ex_t *vsx, vdev_stat_ex_t *cvsx) { (void) cvd; int t, b; for (t = 0; t < ZIO_TYPES; t++) { for (b = 0; b < ARRAY_SIZE(vsx->vsx_disk_histo[0]); b++) vsx->vsx_disk_histo[t][b] += cvsx->vsx_disk_histo[t][b]; for (b = 0; b < ARRAY_SIZE(vsx->vsx_total_histo[0]); b++) { vsx->vsx_total_histo[t][b] += cvsx->vsx_total_histo[t][b]; } } for (t = 0; t < ZIO_PRIORITY_NUM_QUEUEABLE; t++) { for (b = 0; b < ARRAY_SIZE(vsx->vsx_queue_histo[0]); b++) { vsx->vsx_queue_histo[t][b] += cvsx->vsx_queue_histo[t][b]; } vsx->vsx_active_queue[t] += cvsx->vsx_active_queue[t]; vsx->vsx_pend_queue[t] += cvsx->vsx_pend_queue[t]; for (b = 0; b < ARRAY_SIZE(vsx->vsx_ind_histo[0]); b++) vsx->vsx_ind_histo[t][b] += cvsx->vsx_ind_histo[t][b]; for (b = 0; b < ARRAY_SIZE(vsx->vsx_agg_histo[0]); b++) vsx->vsx_agg_histo[t][b] += cvsx->vsx_agg_histo[t][b]; } } boolean_t vdev_is_spacemap_addressable(vdev_t *vd) { if (spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_SPACEMAP_V2)) return (B_TRUE); /* * If double-word space map entries are not enabled we assume * 47 bits of the space map entry are dedicated to the entry's * offset (see SM_OFFSET_BITS in space_map.h). We then use that * to calculate the maximum address that can be described by a * space map entry for the given device. */ uint64_t shift = vd->vdev_ashift + SM_OFFSET_BITS; if (shift >= 63) /* detect potential overflow */ return (B_TRUE); return (vd->vdev_asize < (1ULL << shift)); } /* * Get statistics for the given vdev. */ static void vdev_get_stats_ex_impl(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) { int t; /* * If we're getting stats on the root vdev, aggregate the I/O counts * over all top-level vdevs (i.e. the direct children of the root). */ if (!vd->vdev_ops->vdev_op_leaf) { if (vs) { memset(vs->vs_ops, 0, sizeof (vs->vs_ops)); memset(vs->vs_bytes, 0, sizeof (vs->vs_bytes)); } if (vsx) memset(vsx, 0, sizeof (*vsx)); for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; vdev_stat_t *cvs = &cvd->vdev_stat; vdev_stat_ex_t *cvsx = &cvd->vdev_stat_ex; vdev_get_stats_ex_impl(cvd, cvs, cvsx); if (vs) vdev_get_child_stat(cvd, vs, cvs); if (vsx) vdev_get_child_stat_ex(cvd, vsx, cvsx); } } else { /* * We're a leaf. Just copy our ZIO active queue stats in. The * other leaf stats are updated in vdev_stat_update(). */ if (!vsx) return; memcpy(vsx, &vd->vdev_stat_ex, sizeof (vd->vdev_stat_ex)); for (t = 0; t < ZIO_PRIORITY_NUM_QUEUEABLE; t++) { vsx->vsx_active_queue[t] = vd->vdev_queue.vq_cactive[t]; vsx->vsx_pend_queue[t] = vdev_queue_class_length(vd, t); } } } void vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) { vdev_t *tvd = vd->vdev_top; mutex_enter(&vd->vdev_stat_lock); if (vs) { memcpy(vs, &vd->vdev_stat, sizeof (*vs)); vs->vs_timestamp = gethrtime() - vs->vs_timestamp; vs->vs_state = vd->vdev_state; vs->vs_rsize = vdev_get_min_asize(vd); if (vd->vdev_ops->vdev_op_leaf) { vs->vs_pspace = vd->vdev_psize; vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; /* * Report initializing progress. Since we don't * have the initializing locks held, this is only * an estimate (although a fairly accurate one). */ vs->vs_initialize_bytes_done = vd->vdev_initialize_bytes_done; vs->vs_initialize_bytes_est = vd->vdev_initialize_bytes_est; vs->vs_initialize_state = vd->vdev_initialize_state; vs->vs_initialize_action_time = vd->vdev_initialize_action_time; /* * Report manual TRIM progress. Since we don't have * the manual TRIM locks held, this is only an * estimate (although fairly accurate one). */ vs->vs_trim_notsup = !vd->vdev_has_trim; vs->vs_trim_bytes_done = vd->vdev_trim_bytes_done; vs->vs_trim_bytes_est = vd->vdev_trim_bytes_est; vs->vs_trim_state = vd->vdev_trim_state; vs->vs_trim_action_time = vd->vdev_trim_action_time; /* Set when there is a deferred resilver. */ vs->vs_resilver_deferred = vd->vdev_resilver_deferred; } /* * Report expandable space on top-level, non-auxiliary devices * only. The expandable space is reported in terms of metaslab * sized units since that determines how much space the pool * can expand. */ if (vd->vdev_aux == NULL && tvd != NULL) { vs->vs_esize = P2ALIGN( vd->vdev_max_asize - vd->vdev_asize, 1ULL << tvd->vdev_ms_shift); } vs->vs_configured_ashift = vd->vdev_top != NULL ? vd->vdev_top->vdev_ashift : vd->vdev_ashift; vs->vs_logical_ashift = vd->vdev_logical_ashift; if (vd->vdev_physical_ashift <= ASHIFT_MAX) vs->vs_physical_ashift = vd->vdev_physical_ashift; else vs->vs_physical_ashift = 0; /* * Report fragmentation and rebuild progress for top-level, * non-auxiliary, concrete devices. */ if (vd->vdev_aux == NULL && vd == vd->vdev_top && vdev_is_concrete(vd)) { /* * The vdev fragmentation rating doesn't take into * account the embedded slog metaslab (vdev_log_mg). * Since it's only one metaslab, it would have a tiny * impact on the overall fragmentation. */ vs->vs_fragmentation = (vd->vdev_mg != NULL) ? vd->vdev_mg->mg_fragmentation : 0; } vs->vs_noalloc = MAX(vd->vdev_noalloc, tvd ? tvd->vdev_noalloc : 0); } vdev_get_stats_ex_impl(vd, vs, vsx); mutex_exit(&vd->vdev_stat_lock); } void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs) { return (vdev_get_stats_ex(vd, vs, NULL)); } void vdev_clear_stats(vdev_t *vd) { mutex_enter(&vd->vdev_stat_lock); vd->vdev_stat.vs_space = 0; vd->vdev_stat.vs_dspace = 0; vd->vdev_stat.vs_alloc = 0; mutex_exit(&vd->vdev_stat_lock); } void vdev_scan_stat_init(vdev_t *vd) { vdev_stat_t *vs = &vd->vdev_stat; for (int c = 0; c < vd->vdev_children; c++) vdev_scan_stat_init(vd->vdev_child[c]); mutex_enter(&vd->vdev_stat_lock); vs->vs_scan_processed = 0; mutex_exit(&vd->vdev_stat_lock); } void vdev_stat_update(zio_t *zio, uint64_t psize) { spa_t *spa = zio->io_spa; vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd = zio->io_vd ? zio->io_vd : rvd; vdev_t *pvd; uint64_t txg = zio->io_txg; /* Suppress ASAN false positive */ #ifdef __SANITIZE_ADDRESS__ vdev_stat_t *vs = vd ? &vd->vdev_stat : NULL; vdev_stat_ex_t *vsx = vd ? &vd->vdev_stat_ex : NULL; #else vdev_stat_t *vs = &vd->vdev_stat; vdev_stat_ex_t *vsx = &vd->vdev_stat_ex; #endif zio_type_t type = zio->io_type; int flags = zio->io_flags; /* * If this i/o is a gang leader, it didn't do any actual work. */ if (zio->io_gang_tree) return; if (zio->io_error == 0) { /* * If this is a root i/o, don't count it -- we've already * counted the top-level vdevs, and vdev_get_stats() will * aggregate them when asked. This reduces contention on * the root vdev_stat_lock and implicitly handles blocks * that compress away to holes, for which there is no i/o. * (Holes never create vdev children, so all the counters * remain zero, which is what we want.) * * Note: this only applies to successful i/o (io_error == 0) * because unlike i/o counts, errors are not additive. * When reading a ditto block, for example, failure of * one top-level vdev does not imply a root-level error. */ if (vd == rvd) return; ASSERT(vd == zio->io_vd); if (flags & ZIO_FLAG_IO_BYPASS) return; mutex_enter(&vd->vdev_stat_lock); if (flags & ZIO_FLAG_IO_REPAIR) { /* * Repair is the result of a resilver issued by the * scan thread (spa_sync). */ if (flags & ZIO_FLAG_SCAN_THREAD) { dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; dsl_scan_phys_t *scn_phys = &scn->scn_phys; uint64_t *processed = &scn_phys->scn_processed; if (vd->vdev_ops->vdev_op_leaf) atomic_add_64(processed, psize); vs->vs_scan_processed += psize; } /* * Repair is the result of a rebuild issued by the * rebuild thread (vdev_rebuild_thread). To avoid * double counting repaired bytes the virtual dRAID * spare vdev is excluded from the processed bytes. */ if (zio->io_priority == ZIO_PRIORITY_REBUILD) { vdev_t *tvd = vd->vdev_top; vdev_rebuild_t *vr = &tvd->vdev_rebuild_config; vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; uint64_t *rebuilt = &vrp->vrp_bytes_rebuilt; if (vd->vdev_ops->vdev_op_leaf && vd->vdev_ops != &vdev_draid_spare_ops) { atomic_add_64(rebuilt, psize); } vs->vs_rebuild_processed += psize; } if (flags & ZIO_FLAG_SELF_HEAL) vs->vs_self_healed += psize; } /* * The bytes/ops/histograms are recorded at the leaf level and * aggregated into the higher level vdevs in vdev_get_stats(). */ if (vd->vdev_ops->vdev_op_leaf && (zio->io_priority < ZIO_PRIORITY_NUM_QUEUEABLE)) { zio_type_t vs_type = type; zio_priority_t priority = zio->io_priority; /* * TRIM ops and bytes are reported to user space as * ZIO_TYPE_IOCTL. This is done to preserve the * vdev_stat_t structure layout for user space. */ if (type == ZIO_TYPE_TRIM) vs_type = ZIO_TYPE_IOCTL; /* * Solely for the purposes of 'zpool iostat -lqrw' * reporting use the priority to categorize the IO. * Only the following are reported to user space: * * ZIO_PRIORITY_SYNC_READ, * ZIO_PRIORITY_SYNC_WRITE, * ZIO_PRIORITY_ASYNC_READ, * ZIO_PRIORITY_ASYNC_WRITE, * ZIO_PRIORITY_SCRUB, * ZIO_PRIORITY_TRIM, * ZIO_PRIORITY_REBUILD. */ if (priority == ZIO_PRIORITY_INITIALIZING) { ASSERT3U(type, ==, ZIO_TYPE_WRITE); priority = ZIO_PRIORITY_ASYNC_WRITE; } else if (priority == ZIO_PRIORITY_REMOVAL) { priority = ((type == ZIO_TYPE_WRITE) ? ZIO_PRIORITY_ASYNC_WRITE : ZIO_PRIORITY_ASYNC_READ); } vs->vs_ops[vs_type]++; vs->vs_bytes[vs_type] += psize; if (flags & ZIO_FLAG_DELEGATED) { vsx->vsx_agg_histo[priority] [RQ_HISTO(zio->io_size)]++; } else { vsx->vsx_ind_histo[priority] [RQ_HISTO(zio->io_size)]++; } if (zio->io_delta && zio->io_delay) { vsx->vsx_queue_histo[priority] [L_HISTO(zio->io_delta - zio->io_delay)]++; vsx->vsx_disk_histo[type] [L_HISTO(zio->io_delay)]++; vsx->vsx_total_histo[type] [L_HISTO(zio->io_delta)]++; } } mutex_exit(&vd->vdev_stat_lock); return; } if (flags & ZIO_FLAG_SPECULATIVE) return; /* * If this is an I/O error that is going to be retried, then ignore the * error. Otherwise, the user may interpret B_FAILFAST I/O errors as * hard errors, when in reality they can happen for any number of * innocuous reasons (bus resets, MPxIO link failure, etc). */ if (zio->io_error == EIO && !(zio->io_flags & ZIO_FLAG_IO_RETRY)) return; /* * Intent logs writes won't propagate their error to the root * I/O so don't mark these types of failures as pool-level * errors. */ if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) return; if (type == ZIO_TYPE_WRITE && txg != 0 && (!(flags & ZIO_FLAG_IO_REPAIR) || (flags & ZIO_FLAG_SCAN_THREAD) || spa->spa_claiming)) { /* * This is either a normal write (not a repair), or it's * a repair induced by the scrub thread, or it's a repair * made by zil_claim() during spa_load() in the first txg. * In the normal case, we commit the DTL change in the same * txg as the block was born. In the scrub-induced repair * case, we know that scrubs run in first-pass syncing context, * so we commit the DTL change in spa_syncing_txg(spa). * In the zil_claim() case, we commit in spa_first_txg(spa). * * We currently do not make DTL entries for failed spontaneous * self-healing writes triggered by normal (non-scrubbing) * reads, because we have no transactional context in which to * do so -- and it's not clear that it'd be desirable anyway. */ if (vd->vdev_ops->vdev_op_leaf) { uint64_t commit_txg = txg; if (flags & ZIO_FLAG_SCAN_THREAD) { ASSERT(flags & ZIO_FLAG_IO_REPAIR); ASSERT(spa_sync_pass(spa) == 1); vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1); commit_txg = spa_syncing_txg(spa); } else if (spa->spa_claiming) { ASSERT(flags & ZIO_FLAG_IO_REPAIR); commit_txg = spa_first_txg(spa); } ASSERT(commit_txg >= spa_syncing_txg(spa)); if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1)) return; for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1); vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg); } if (vd != rvd) vdev_dtl_dirty(vd, DTL_MISSING, txg, 1); } } int64_t vdev_deflated_space(vdev_t *vd, int64_t space) { ASSERT((space & (SPA_MINBLOCKSIZE-1)) == 0); ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache); return ((space >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio); } /* * Update the in-core space usage stats for this vdev, its metaslab class, * and the root vdev. */ void vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta, int64_t space_delta) { (void) defer_delta; int64_t dspace_delta; spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; ASSERT(vd == vd->vdev_top); /* * Apply the inverse of the psize-to-asize (ie. RAID-Z) space-expansion * factor. We must calculate this here and not at the root vdev * because the root vdev's psize-to-asize is simply the max of its * children's, thus not accurate enough for us. */ dspace_delta = vdev_deflated_space(vd, space_delta); mutex_enter(&vd->vdev_stat_lock); /* ensure we won't underflow */ if (alloc_delta < 0) { ASSERT3U(vd->vdev_stat.vs_alloc, >=, -alloc_delta); } vd->vdev_stat.vs_alloc += alloc_delta; vd->vdev_stat.vs_space += space_delta; vd->vdev_stat.vs_dspace += dspace_delta; mutex_exit(&vd->vdev_stat_lock); /* every class but log contributes to root space stats */ if (vd->vdev_mg != NULL && !vd->vdev_islog) { ASSERT(!vd->vdev_isl2cache); mutex_enter(&rvd->vdev_stat_lock); rvd->vdev_stat.vs_alloc += alloc_delta; rvd->vdev_stat.vs_space += space_delta; rvd->vdev_stat.vs_dspace += dspace_delta; mutex_exit(&rvd->vdev_stat_lock); } /* Note: metaslab_class_space_update moved to metaslab_space_update */ } /* * Mark a top-level vdev's config as dirty, placing it on the dirty list * so that it will be written out next time the vdev configuration is synced. * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs. */ void vdev_config_dirty(vdev_t *vd) { spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; int c; ASSERT(spa_writeable(spa)); /* * If this is an aux vdev (as with l2cache and spare devices), then we * update the vdev config manually and set the sync flag. */ if (vd->vdev_aux != NULL) { spa_aux_vdev_t *sav = vd->vdev_aux; nvlist_t **aux; uint_t naux; for (c = 0; c < sav->sav_count; c++) { if (sav->sav_vdevs[c] == vd) break; } if (c == sav->sav_count) { /* * We're being removed. There's nothing more to do. */ ASSERT(sav->sav_sync == B_TRUE); return; } sav->sav_sync = B_TRUE; if (nvlist_lookup_nvlist_array(sav->sav_config, ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) { VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, ZPOOL_CONFIG_SPARES, &aux, &naux) == 0); } ASSERT(c < naux); /* * Setting the nvlist in the middle if the array is a little * sketchy, but it will work. */ nvlist_free(aux[c]); aux[c] = vdev_config_generate(spa, vd, B_TRUE, 0); return; } /* * The dirty list is protected by the SCL_CONFIG lock. The caller * must either hold SCL_CONFIG as writer, or must be the sync thread * (which holds SCL_CONFIG as reader). There's only one sync thread, * so this is sufficient to ensure mutual exclusion. */ ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) || (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_CONFIG, RW_READER))); if (vd == rvd) { for (c = 0; c < rvd->vdev_children; c++) vdev_config_dirty(rvd->vdev_child[c]); } else { ASSERT(vd == vd->vdev_top); if (!list_link_active(&vd->vdev_config_dirty_node) && vdev_is_concrete(vd)) { list_insert_head(&spa->spa_config_dirty_list, vd); } } } void vdev_config_clean(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) || (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_CONFIG, RW_READER))); ASSERT(list_link_active(&vd->vdev_config_dirty_node)); list_remove(&spa->spa_config_dirty_list, vd); } /* * Mark a top-level vdev's state as dirty, so that the next pass of * spa_sync() can convert this into vdev_config_dirty(). We distinguish * the state changes from larger config changes because they require * much less locking, and are often needed for administrative actions. */ void vdev_state_dirty(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_writeable(spa)); ASSERT(vd == vd->vdev_top); /* * The state list is protected by the SCL_STATE lock. The caller * must either hold SCL_STATE as writer, or must be the sync thread * (which holds SCL_STATE as reader). There's only one sync thread, * so this is sufficient to ensure mutual exclusion. */ ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) || (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_STATE, RW_READER))); if (!list_link_active(&vd->vdev_state_dirty_node) && vdev_is_concrete(vd)) list_insert_head(&spa->spa_state_dirty_list, vd); } void vdev_state_clean(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) || (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_STATE, RW_READER))); ASSERT(list_link_active(&vd->vdev_state_dirty_node)); list_remove(&spa->spa_state_dirty_list, vd); } /* * Propagate vdev state up from children to parent. */ void vdev_propagate_state(vdev_t *vd) { spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; int degraded = 0, faulted = 0; int corrupted = 0; vdev_t *child; if (vd->vdev_children > 0) { for (int c = 0; c < vd->vdev_children; c++) { child = vd->vdev_child[c]; /* * Don't factor holes or indirect vdevs into the * decision. */ if (!vdev_is_concrete(child)) continue; if (!vdev_readable(child) || (!vdev_writeable(child) && spa_writeable(spa))) { /* * Root special: if there is a top-level log * device, treat the root vdev as if it were * degraded. */ if (child->vdev_islog && vd == rvd) degraded++; else faulted++; } else if (child->vdev_state <= VDEV_STATE_DEGRADED) { degraded++; } if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA) corrupted++; } vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded); /* * Root special: if there is a top-level vdev that cannot be * opened due to corrupted metadata, then propagate the root * vdev's aux state as 'corrupt' rather than 'insufficient * replicas'. */ if (corrupted && vd == rvd && rvd->vdev_state == VDEV_STATE_CANT_OPEN) vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); } if (vd->vdev_parent) vdev_propagate_state(vd->vdev_parent); } /* * Set a vdev's state. If this is during an open, we don't update the parent * state, because we're in the process of opening children depth-first. * Otherwise, we propagate the change to the parent. * * If this routine places a device in a faulted state, an appropriate ereport is * generated. */ void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) { uint64_t save_state; spa_t *spa = vd->vdev_spa; if (state == vd->vdev_state) { /* * Since vdev_offline() code path is already in an offline * state we can miss a statechange event to OFFLINE. Check * the previous state to catch this condition. */ if (vd->vdev_ops->vdev_op_leaf && (state == VDEV_STATE_OFFLINE) && (vd->vdev_prevstate >= VDEV_STATE_FAULTED)) { /* post an offline state change */ zfs_post_state_change(spa, vd, vd->vdev_prevstate); } vd->vdev_stat.vs_aux = aux; return; } save_state = vd->vdev_state; vd->vdev_state = state; vd->vdev_stat.vs_aux = aux; /* * If we are setting the vdev state to anything but an open state, then * always close the underlying device unless the device has requested * a delayed close (i.e. we're about to remove or fault the device). * Otherwise, we keep accessible but invalid devices open forever. * We don't call vdev_close() itself, because that implies some extra * checks (offline, etc) that we don't want here. This is limited to * leaf devices, because otherwise closing the device will affect other * children. */ if (!vd->vdev_delayed_close && vdev_is_dead(vd) && vd->vdev_ops->vdev_op_leaf) vd->vdev_ops->vdev_op_close(vd); if (vd->vdev_removed && state == VDEV_STATE_CANT_OPEN && (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) { /* * If the previous state is set to VDEV_STATE_REMOVED, then this * device was previously marked removed and someone attempted to * reopen it. If this failed due to a nonexistent device, then * keep the device in the REMOVED state. We also let this be if * it is one of our special test online cases, which is only * attempting to online the device and shouldn't generate an FMA * fault. */ vd->vdev_state = VDEV_STATE_REMOVED; vd->vdev_stat.vs_aux = VDEV_AUX_NONE; } else if (state == VDEV_STATE_REMOVED) { vd->vdev_removed = B_TRUE; } else if (state == VDEV_STATE_CANT_OPEN) { /* * If we fail to open a vdev during an import or recovery, we * mark it as "not available", which signifies that it was * never there to begin with. Failure to open such a device * is not considered an error. */ if ((spa_load_state(spa) == SPA_LOAD_IMPORT || spa_load_state(spa) == SPA_LOAD_RECOVER) && vd->vdev_ops->vdev_op_leaf) vd->vdev_not_present = 1; /* * Post the appropriate ereport. If the 'prevstate' field is * set to something other than VDEV_STATE_UNKNOWN, it indicates * that this is part of a vdev_reopen(). In this case, we don't * want to post the ereport if the device was already in the * CANT_OPEN state beforehand. * * If the 'checkremove' flag is set, then this is an attempt to * online the device in response to an insertion event. If we * hit this case, then we have detected an insertion event for a * faulted or offline device that wasn't in the removed state. * In this scenario, we don't post an ereport because we are * about to replace the device, or attempt an online with * vdev_forcefault, which will generate the fault for us. */ if ((vd->vdev_prevstate != state || vd->vdev_forcefault) && !vd->vdev_not_present && !vd->vdev_checkremove && vd != spa->spa_root_vdev) { const char *class; switch (aux) { case VDEV_AUX_OPEN_FAILED: class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED; break; case VDEV_AUX_CORRUPT_DATA: class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA; break; case VDEV_AUX_NO_REPLICAS: class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS; break; case VDEV_AUX_BAD_GUID_SUM: class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM; break; case VDEV_AUX_TOO_SMALL: class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL; break; case VDEV_AUX_BAD_LABEL: class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL; break; case VDEV_AUX_BAD_ASHIFT: class = FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT; break; default: class = FM_EREPORT_ZFS_DEVICE_UNKNOWN; } (void) zfs_ereport_post(class, spa, vd, NULL, NULL, save_state); } /* Erase any notion of persistent removed state */ vd->vdev_removed = B_FALSE; } else { vd->vdev_removed = B_FALSE; } /* * Notify ZED of any significant state-change on a leaf vdev. * */ if (vd->vdev_ops->vdev_op_leaf) { /* preserve original state from a vdev_reopen() */ if ((vd->vdev_prevstate != VDEV_STATE_UNKNOWN) && (vd->vdev_prevstate != vd->vdev_state) && (save_state <= VDEV_STATE_CLOSED)) save_state = vd->vdev_prevstate; /* filter out state change due to initial vdev_open */ if (save_state > VDEV_STATE_CLOSED) zfs_post_state_change(spa, vd, save_state); } if (!isopen && vd->vdev_parent) vdev_propagate_state(vd->vdev_parent); } boolean_t vdev_children_are_offline(vdev_t *vd) { ASSERT(!vd->vdev_ops->vdev_op_leaf); for (uint64_t i = 0; i < vd->vdev_children; i++) { if (vd->vdev_child[i]->vdev_state != VDEV_STATE_OFFLINE) return (B_FALSE); } return (B_TRUE); } /* * Check the vdev configuration to ensure that it's capable of supporting * a root pool. We do not support partial configuration. */ boolean_t vdev_is_bootable(vdev_t *vd) { if (!vd->vdev_ops->vdev_op_leaf) { const char *vdev_type = vd->vdev_ops->vdev_op_type; if (strcmp(vdev_type, VDEV_TYPE_MISSING) == 0) return (B_FALSE); } for (int c = 0; c < vd->vdev_children; c++) { if (!vdev_is_bootable(vd->vdev_child[c])) return (B_FALSE); } return (B_TRUE); } boolean_t vdev_is_concrete(vdev_t *vd) { vdev_ops_t *ops = vd->vdev_ops; if (ops == &vdev_indirect_ops || ops == &vdev_hole_ops || ops == &vdev_missing_ops || ops == &vdev_root_ops) { return (B_FALSE); } else { return (B_TRUE); } } /* * Determine if a log device has valid content. If the vdev was * removed or faulted in the MOS config then we know that * the content on the log device has already been written to the pool. */ boolean_t vdev_log_state_valid(vdev_t *vd) { if (vd->vdev_ops->vdev_op_leaf && !vd->vdev_faulted && !vd->vdev_removed) return (B_TRUE); for (int c = 0; c < vd->vdev_children; c++) if (vdev_log_state_valid(vd->vdev_child[c])) return (B_TRUE); return (B_FALSE); } /* * Expand a vdev if possible. */ void vdev_expand(vdev_t *vd, uint64_t txg) { ASSERT(vd->vdev_top == vd); ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); ASSERT(vdev_is_concrete(vd)); vdev_set_deflate_ratio(vd); if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count && vdev_is_concrete(vd)) { vdev_metaslab_group_create(vd); VERIFY(vdev_metaslab_init(vd, txg) == 0); vdev_config_dirty(vd); } } /* * Split a vdev. */ void vdev_split(vdev_t *vd) { vdev_t *cvd, *pvd = vd->vdev_parent; VERIFY3U(pvd->vdev_children, >, 1); vdev_remove_child(pvd, vd); vdev_compact_children(pvd); ASSERT3P(pvd->vdev_child, !=, NULL); cvd = pvd->vdev_child[0]; if (pvd->vdev_children == 1) { vdev_remove_parent(cvd); cvd->vdev_splitting = B_TRUE; } vdev_propagate_state(cvd); } void vdev_deadman(vdev_t *vd, const char *tag) { for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; vdev_deadman(cvd, tag); } if (vd->vdev_ops->vdev_op_leaf) { vdev_queue_t *vq = &vd->vdev_queue; mutex_enter(&vq->vq_lock); if (vq->vq_active > 0) { spa_t *spa = vd->vdev_spa; zio_t *fio; uint64_t delta; zfs_dbgmsg("slow vdev: %s has %u active IOs", vd->vdev_path, vq->vq_active); /* * Look at the head of all the pending queues, * if any I/O has been outstanding for longer than * the spa_deadman_synctime invoke the deadman logic. */ fio = list_head(&vq->vq_active_list); delta = gethrtime() - fio->io_timestamp; if (delta > spa_deadman_synctime(spa)) zio_deadman(fio, tag); } mutex_exit(&vq->vq_lock); } } void vdev_defer_resilver(vdev_t *vd) { ASSERT(vd->vdev_ops->vdev_op_leaf); vd->vdev_resilver_deferred = B_TRUE; vd->vdev_spa->spa_resilver_deferred = B_TRUE; } /* * Clears the resilver deferred flag on all leaf devs under vd. Returns * B_TRUE if we have devices that need to be resilvered and are available to * accept resilver I/Os. */ boolean_t vdev_clear_resilver_deferred(vdev_t *vd, dmu_tx_t *tx) { boolean_t resilver_needed = B_FALSE; spa_t *spa = vd->vdev_spa; for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; resilver_needed |= vdev_clear_resilver_deferred(cvd, tx); } if (vd == spa->spa_root_vdev && spa_feature_is_active(spa, SPA_FEATURE_RESILVER_DEFER)) { spa_feature_decr(spa, SPA_FEATURE_RESILVER_DEFER, tx); vdev_config_dirty(vd); spa->spa_resilver_deferred = B_FALSE; return (resilver_needed); } if (!vdev_is_concrete(vd) || vd->vdev_aux || !vd->vdev_ops->vdev_op_leaf) return (resilver_needed); vd->vdev_resilver_deferred = B_FALSE; return (!vdev_is_dead(vd) && !vd->vdev_offline && vdev_resilver_needed(vd, NULL, NULL)); } boolean_t vdev_xlate_is_empty(range_seg64_t *rs) { return (rs->rs_start == rs->rs_end); } /* * Translate a logical range to the first contiguous physical range for the * specified vdev_t. This function is initially called with a leaf vdev and * will walk each parent vdev until it reaches a top-level vdev. Once the * top-level is reached the physical range is initialized and the recursive * function begins to unwind. As it unwinds it calls the parent's vdev * specific translation function to do the real conversion. */ void vdev_xlate(vdev_t *vd, const range_seg64_t *logical_rs, range_seg64_t *physical_rs, range_seg64_t *remain_rs) { /* * Walk up the vdev tree */ if (vd != vd->vdev_top) { vdev_xlate(vd->vdev_parent, logical_rs, physical_rs, remain_rs); } else { /* * We've reached the top-level vdev, initialize the physical * range to the logical range and set an empty remaining * range then start to unwind. */ physical_rs->rs_start = logical_rs->rs_start; physical_rs->rs_end = logical_rs->rs_end; remain_rs->rs_start = logical_rs->rs_start; remain_rs->rs_end = logical_rs->rs_start; return; } vdev_t *pvd = vd->vdev_parent; ASSERT3P(pvd, !=, NULL); ASSERT3P(pvd->vdev_ops->vdev_op_xlate, !=, NULL); /* * As this recursive function unwinds, translate the logical * range into its physical and any remaining components by calling * the vdev specific translate function. */ range_seg64_t intermediate = { 0 }; pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate, remain_rs); physical_rs->rs_start = intermediate.rs_start; physical_rs->rs_end = intermediate.rs_end; } void vdev_xlate_walk(vdev_t *vd, const range_seg64_t *logical_rs, vdev_xlate_func_t *func, void *arg) { range_seg64_t iter_rs = *logical_rs; range_seg64_t physical_rs; range_seg64_t remain_rs; while (!vdev_xlate_is_empty(&iter_rs)) { vdev_xlate(vd, &iter_rs, &physical_rs, &remain_rs); /* * With raidz and dRAID, it's possible that the logical range * does not live on this leaf vdev. Only when there is a non- * zero physical size call the provided function. */ if (!vdev_xlate_is_empty(&physical_rs)) func(arg, &physical_rs); iter_rs = remain_rs; } } static char * vdev_name(vdev_t *vd, char *buf, int buflen) { if (vd->vdev_path == NULL) { if (strcmp(vd->vdev_ops->vdev_op_type, "root") == 0) { strlcpy(buf, vd->vdev_spa->spa_name, buflen); } else if (!vd->vdev_ops->vdev_op_leaf) { snprintf(buf, buflen, "%s-%llu", vd->vdev_ops->vdev_op_type, (u_longlong_t)vd->vdev_id); } } else { strlcpy(buf, vd->vdev_path, buflen); } return (buf); } /* * Look at the vdev tree and determine whether any devices are currently being * replaced. */ boolean_t vdev_replace_in_progress(vdev_t *vdev) { ASSERT(spa_config_held(vdev->vdev_spa, SCL_ALL, RW_READER) != 0); if (vdev->vdev_ops == &vdev_replacing_ops) return (B_TRUE); /* * A 'spare' vdev indicates that we have a replace in progress, unless * it has exactly two children, and the second, the hot spare, has * finished being resilvered. */ if (vdev->vdev_ops == &vdev_spare_ops && (vdev->vdev_children > 2 || !vdev_dtl_empty(vdev->vdev_child[1], DTL_MISSING))) return (B_TRUE); for (int i = 0; i < vdev->vdev_children; i++) { if (vdev_replace_in_progress(vdev->vdev_child[i])) return (B_TRUE); } return (B_FALSE); } /* * Add a (source=src, propname=propval) list to an nvlist. */ static void vdev_prop_add_list(nvlist_t *nvl, const char *propname, const char *strval, uint64_t intval, zprop_source_t src) { nvlist_t *propval; propval = fnvlist_alloc(); fnvlist_add_uint64(propval, ZPROP_SOURCE, src); if (strval != NULL) fnvlist_add_string(propval, ZPROP_VALUE, strval); else fnvlist_add_uint64(propval, ZPROP_VALUE, intval); fnvlist_add_nvlist(nvl, propname, propval); nvlist_free(propval); } static void vdev_props_set_sync(void *arg, dmu_tx_t *tx) { vdev_t *vd; nvlist_t *nvp = arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; objset_t *mos = spa->spa_meta_objset; nvpair_t *elem = NULL; uint64_t vdev_guid; uint64_t objid; nvlist_t *nvprops; vdev_guid = fnvlist_lookup_uint64(nvp, ZPOOL_VDEV_PROPS_SET_VDEV); nvprops = fnvlist_lookup_nvlist(nvp, ZPOOL_VDEV_PROPS_SET_PROPS); vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE); /* this vdev could get removed while waiting for this sync task */ if (vd == NULL) return; /* * Set vdev property values in the vdev props mos object. */ if (vd->vdev_root_zap != 0) { objid = vd->vdev_root_zap; } else if (vd->vdev_top_zap != 0) { objid = vd->vdev_top_zap; } else if (vd->vdev_leaf_zap != 0) { objid = vd->vdev_leaf_zap; } else { panic("unexpected vdev type"); } mutex_enter(&spa->spa_props_lock); while ((elem = nvlist_next_nvpair(nvprops, elem)) != NULL) { uint64_t intval; const char *strval; vdev_prop_t prop; const char *propname = nvpair_name(elem); zprop_type_t proptype; switch (prop = vdev_name_to_prop(propname)) { case VDEV_PROP_USERPROP: if (vdev_prop_user(propname)) { strval = fnvpair_value_string(elem); if (strlen(strval) == 0) { /* remove the property if value == "" */ (void) zap_remove(mos, objid, propname, tx); } else { VERIFY0(zap_update(mos, objid, propname, 1, strlen(strval) + 1, strval, tx)); } spa_history_log_internal(spa, "vdev set", tx, "vdev_guid=%llu: %s=%s", (u_longlong_t)vdev_guid, nvpair_name(elem), strval); } break; default: /* normalize the property name */ propname = vdev_prop_to_name(prop); proptype = vdev_prop_get_type(prop); if (nvpair_type(elem) == DATA_TYPE_STRING) { ASSERT(proptype == PROP_TYPE_STRING); strval = fnvpair_value_string(elem); VERIFY0(zap_update(mos, objid, propname, 1, strlen(strval) + 1, strval, tx)); spa_history_log_internal(spa, "vdev set", tx, "vdev_guid=%llu: %s=%s", (u_longlong_t)vdev_guid, nvpair_name(elem), strval); } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { intval = fnvpair_value_uint64(elem); if (proptype == PROP_TYPE_INDEX) { const char *unused; VERIFY0(vdev_prop_index_to_string( prop, intval, &unused)); } VERIFY0(zap_update(mos, objid, propname, sizeof (uint64_t), 1, &intval, tx)); spa_history_log_internal(spa, "vdev set", tx, "vdev_guid=%llu: %s=%lld", (u_longlong_t)vdev_guid, nvpair_name(elem), (longlong_t)intval); } else { panic("invalid vdev property type %u", nvpair_type(elem)); } } } mutex_exit(&spa->spa_props_lock); } int vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) { spa_t *spa = vd->vdev_spa; nvpair_t *elem = NULL; uint64_t vdev_guid; nvlist_t *nvprops; int error = 0; ASSERT(vd != NULL); /* Check that vdev has a zap we can use */ if (vd->vdev_root_zap == 0 && vd->vdev_top_zap == 0 && vd->vdev_leaf_zap == 0) return (SET_ERROR(EINVAL)); if (nvlist_lookup_uint64(innvl, ZPOOL_VDEV_PROPS_SET_VDEV, &vdev_guid) != 0) return (SET_ERROR(EINVAL)); if (nvlist_lookup_nvlist(innvl, ZPOOL_VDEV_PROPS_SET_PROPS, &nvprops) != 0) return (SET_ERROR(EINVAL)); if ((vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE)) == NULL) return (SET_ERROR(EINVAL)); while ((elem = nvlist_next_nvpair(nvprops, elem)) != NULL) { const char *propname = nvpair_name(elem); vdev_prop_t prop = vdev_name_to_prop(propname); uint64_t intval = 0; const char *strval = NULL; if (prop == VDEV_PROP_USERPROP && !vdev_prop_user(propname)) { error = EINVAL; goto end; } if (vdev_prop_readonly(prop)) { error = EROFS; goto end; } /* Special Processing */ switch (prop) { case VDEV_PROP_PATH: if (vd->vdev_path == NULL) { error = EROFS; break; } if (nvpair_value_string(elem, &strval) != 0) { error = EINVAL; break; } /* New path must start with /dev/ */ if (strncmp(strval, "/dev/", 5)) { error = EINVAL; break; } error = spa_vdev_setpath(spa, vdev_guid, strval); break; case VDEV_PROP_ALLOCATING: if (nvpair_value_uint64(elem, &intval) != 0) { error = EINVAL; break; } if (intval != vd->vdev_noalloc) break; if (intval == 0) error = spa_vdev_noalloc(spa, vdev_guid); else error = spa_vdev_alloc(spa, vdev_guid); break; case VDEV_PROP_FAILFAST: if (nvpair_value_uint64(elem, &intval) != 0) { error = EINVAL; break; } vd->vdev_failfast = intval & 1; break; case VDEV_PROP_CHECKSUM_N: if (nvpair_value_uint64(elem, &intval) != 0) { error = EINVAL; break; } vd->vdev_checksum_n = intval; break; case VDEV_PROP_CHECKSUM_T: if (nvpair_value_uint64(elem, &intval) != 0) { error = EINVAL; break; } vd->vdev_checksum_t = intval; break; case VDEV_PROP_IO_N: if (nvpair_value_uint64(elem, &intval) != 0) { error = EINVAL; break; } vd->vdev_io_n = intval; break; case VDEV_PROP_IO_T: if (nvpair_value_uint64(elem, &intval) != 0) { error = EINVAL; break; } vd->vdev_io_t = intval; break; default: /* Most processing is done in vdev_props_set_sync */ break; } end: if (error != 0) { intval = error; vdev_prop_add_list(outnvl, propname, strval, intval, 0); return (error); } } return (dsl_sync_task(spa->spa_name, NULL, vdev_props_set_sync, innvl, 6, ZFS_SPACE_CHECK_EXTRA_RESERVED)); } int vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) { spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; int err = 0; uint64_t objid; uint64_t vdev_guid; nvpair_t *elem = NULL; nvlist_t *nvprops = NULL; uint64_t intval = 0; char *strval = NULL; const char *propname = NULL; vdev_prop_t prop; ASSERT(vd != NULL); ASSERT(mos != NULL); if (nvlist_lookup_uint64(innvl, ZPOOL_VDEV_PROPS_GET_VDEV, &vdev_guid) != 0) return (SET_ERROR(EINVAL)); nvlist_lookup_nvlist(innvl, ZPOOL_VDEV_PROPS_GET_PROPS, &nvprops); if (vd->vdev_root_zap != 0) { objid = vd->vdev_root_zap; } else if (vd->vdev_top_zap != 0) { objid = vd->vdev_top_zap; } else if (vd->vdev_leaf_zap != 0) { objid = vd->vdev_leaf_zap; } else { return (SET_ERROR(EINVAL)); } ASSERT(objid != 0); mutex_enter(&spa->spa_props_lock); if (nvprops != NULL) { char namebuf[64] = { 0 }; while ((elem = nvlist_next_nvpair(nvprops, elem)) != NULL) { intval = 0; strval = NULL; propname = nvpair_name(elem); prop = vdev_name_to_prop(propname); zprop_source_t src = ZPROP_SRC_DEFAULT; uint64_t integer_size, num_integers; switch (prop) { /* Special Read-only Properties */ case VDEV_PROP_NAME: strval = vdev_name(vd, namebuf, sizeof (namebuf)); if (strval == NULL) continue; vdev_prop_add_list(outnvl, propname, strval, 0, ZPROP_SRC_NONE); continue; case VDEV_PROP_CAPACITY: /* percent used */ intval = (vd->vdev_stat.vs_dspace == 0) ? 0 : (vd->vdev_stat.vs_alloc * 100 / vd->vdev_stat.vs_dspace); vdev_prop_add_list(outnvl, propname, NULL, intval, ZPROP_SRC_NONE); continue; case VDEV_PROP_STATE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_state, ZPROP_SRC_NONE); continue; case VDEV_PROP_GUID: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_guid, ZPROP_SRC_NONE); continue; case VDEV_PROP_ASIZE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_asize, ZPROP_SRC_NONE); continue; case VDEV_PROP_PSIZE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_psize, ZPROP_SRC_NONE); continue; case VDEV_PROP_ASHIFT: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_ashift, ZPROP_SRC_NONE); continue; case VDEV_PROP_SIZE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_dspace, ZPROP_SRC_NONE); continue; case VDEV_PROP_FREE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_dspace - vd->vdev_stat.vs_alloc, ZPROP_SRC_NONE); continue; case VDEV_PROP_ALLOCATED: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_alloc, ZPROP_SRC_NONE); continue; case VDEV_PROP_EXPANDSZ: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_esize, ZPROP_SRC_NONE); continue; case VDEV_PROP_FRAGMENTATION: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_fragmentation, ZPROP_SRC_NONE); continue; case VDEV_PROP_PARITY: vdev_prop_add_list(outnvl, propname, NULL, vdev_get_nparity(vd), ZPROP_SRC_NONE); continue; case VDEV_PROP_PATH: if (vd->vdev_path == NULL) continue; vdev_prop_add_list(outnvl, propname, vd->vdev_path, 0, ZPROP_SRC_NONE); continue; case VDEV_PROP_DEVID: if (vd->vdev_devid == NULL) continue; vdev_prop_add_list(outnvl, propname, vd->vdev_devid, 0, ZPROP_SRC_NONE); continue; case VDEV_PROP_PHYS_PATH: if (vd->vdev_physpath == NULL) continue; vdev_prop_add_list(outnvl, propname, vd->vdev_physpath, 0, ZPROP_SRC_NONE); continue; case VDEV_PROP_ENC_PATH: if (vd->vdev_enc_sysfs_path == NULL) continue; vdev_prop_add_list(outnvl, propname, vd->vdev_enc_sysfs_path, 0, ZPROP_SRC_NONE); continue; case VDEV_PROP_FRU: if (vd->vdev_fru == NULL) continue; vdev_prop_add_list(outnvl, propname, vd->vdev_fru, 0, ZPROP_SRC_NONE); continue; case VDEV_PROP_PARENT: if (vd->vdev_parent != NULL) { strval = vdev_name(vd->vdev_parent, namebuf, sizeof (namebuf)); vdev_prop_add_list(outnvl, propname, strval, 0, ZPROP_SRC_NONE); } continue; case VDEV_PROP_CHILDREN: if (vd->vdev_children > 0) strval = kmem_zalloc(ZAP_MAXVALUELEN, KM_SLEEP); for (uint64_t i = 0; i < vd->vdev_children; i++) { const char *vname; vname = vdev_name(vd->vdev_child[i], namebuf, sizeof (namebuf)); if (vname == NULL) vname = "(unknown)"; if (strlen(strval) > 0) strlcat(strval, ",", ZAP_MAXVALUELEN); strlcat(strval, vname, ZAP_MAXVALUELEN); } if (strval != NULL) { vdev_prop_add_list(outnvl, propname, strval, 0, ZPROP_SRC_NONE); kmem_free(strval, ZAP_MAXVALUELEN); } continue; case VDEV_PROP_NUMCHILDREN: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_children, ZPROP_SRC_NONE); continue; case VDEV_PROP_READ_ERRORS: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_read_errors, ZPROP_SRC_NONE); continue; case VDEV_PROP_WRITE_ERRORS: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_write_errors, ZPROP_SRC_NONE); continue; case VDEV_PROP_CHECKSUM_ERRORS: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_checksum_errors, ZPROP_SRC_NONE); continue; case VDEV_PROP_INITIALIZE_ERRORS: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_initialize_errors, ZPROP_SRC_NONE); continue; case VDEV_PROP_OPS_NULL: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_ops[ZIO_TYPE_NULL], ZPROP_SRC_NONE); continue; case VDEV_PROP_OPS_READ: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_ops[ZIO_TYPE_READ], ZPROP_SRC_NONE); continue; case VDEV_PROP_OPS_WRITE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_ops[ZIO_TYPE_WRITE], ZPROP_SRC_NONE); continue; case VDEV_PROP_OPS_FREE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_ops[ZIO_TYPE_FREE], ZPROP_SRC_NONE); continue; case VDEV_PROP_OPS_CLAIM: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_ops[ZIO_TYPE_CLAIM], ZPROP_SRC_NONE); continue; case VDEV_PROP_OPS_TRIM: /* * TRIM ops and bytes are reported to user * space as ZIO_TYPE_IOCTL. This is done to * preserve the vdev_stat_t structure layout * for user space. */ vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_ops[ZIO_TYPE_IOCTL], ZPROP_SRC_NONE); continue; case VDEV_PROP_BYTES_NULL: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_bytes[ZIO_TYPE_NULL], ZPROP_SRC_NONE); continue; case VDEV_PROP_BYTES_READ: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_bytes[ZIO_TYPE_READ], ZPROP_SRC_NONE); continue; case VDEV_PROP_BYTES_WRITE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_bytes[ZIO_TYPE_WRITE], ZPROP_SRC_NONE); continue; case VDEV_PROP_BYTES_FREE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_bytes[ZIO_TYPE_FREE], ZPROP_SRC_NONE); continue; case VDEV_PROP_BYTES_CLAIM: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_bytes[ZIO_TYPE_CLAIM], ZPROP_SRC_NONE); continue; case VDEV_PROP_BYTES_TRIM: /* * TRIM ops and bytes are reported to user * space as ZIO_TYPE_IOCTL. This is done to * preserve the vdev_stat_t structure layout * for user space. */ vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_bytes[ZIO_TYPE_IOCTL], ZPROP_SRC_NONE); continue; case VDEV_PROP_REMOVING: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_removing, ZPROP_SRC_NONE); continue; /* Numeric Properites */ case VDEV_PROP_ALLOCATING: /* Leaf vdevs cannot have this property */ if (vd->vdev_mg == NULL && vd->vdev_top != NULL) { src = ZPROP_SRC_NONE; intval = ZPROP_BOOLEAN_NA; } else { err = vdev_prop_get_int(vd, prop, &intval); if (err && err != ENOENT) break; if (intval == vdev_prop_default_numeric(prop)) src = ZPROP_SRC_DEFAULT; else src = ZPROP_SRC_LOCAL; } vdev_prop_add_list(outnvl, propname, NULL, intval, src); break; case VDEV_PROP_FAILFAST: src = ZPROP_SRC_LOCAL; strval = NULL; err = zap_lookup(mos, objid, nvpair_name(elem), sizeof (uint64_t), 1, &intval); if (err == ENOENT) { intval = vdev_prop_default_numeric( prop); err = 0; } else if (err) { break; } if (intval == vdev_prop_default_numeric(prop)) src = ZPROP_SRC_DEFAULT; vdev_prop_add_list(outnvl, propname, strval, intval, src); break; case VDEV_PROP_CHECKSUM_N: case VDEV_PROP_CHECKSUM_T: case VDEV_PROP_IO_N: case VDEV_PROP_IO_T: err = vdev_prop_get_int(vd, prop, &intval); if (err && err != ENOENT) break; if (intval == vdev_prop_default_numeric(prop)) src = ZPROP_SRC_DEFAULT; else src = ZPROP_SRC_LOCAL; vdev_prop_add_list(outnvl, propname, NULL, intval, src); break; /* Text Properties */ case VDEV_PROP_COMMENT: /* Exists in the ZAP below */ /* FALLTHRU */ case VDEV_PROP_USERPROP: /* User Properites */ src = ZPROP_SRC_LOCAL; err = zap_length(mos, objid, nvpair_name(elem), &integer_size, &num_integers); if (err) break; switch (integer_size) { case 8: /* User properties cannot be integers */ err = EINVAL; break; case 1: /* string property */ strval = kmem_alloc(num_integers, KM_SLEEP); err = zap_lookup(mos, objid, nvpair_name(elem), 1, num_integers, strval); if (err) { kmem_free(strval, num_integers); break; } vdev_prop_add_list(outnvl, propname, strval, 0, src); kmem_free(strval, num_integers); break; } break; default: err = ENOENT; break; } if (err) break; } } else { /* * Get all properties from the MOS vdev property object. */ zap_cursor_t zc; zap_attribute_t za; for (zap_cursor_init(&zc, mos, objid); (err = zap_cursor_retrieve(&zc, &za)) == 0; zap_cursor_advance(&zc)) { intval = 0; strval = NULL; zprop_source_t src = ZPROP_SRC_DEFAULT; propname = za.za_name; switch (za.za_integer_length) { case 8: /* We do not allow integer user properties */ /* This is likely an internal value */ break; case 1: /* string property */ strval = kmem_alloc(za.za_num_integers, KM_SLEEP); err = zap_lookup(mos, objid, za.za_name, 1, za.za_num_integers, strval); if (err) { kmem_free(strval, za.za_num_integers); break; } vdev_prop_add_list(outnvl, propname, strval, 0, src); kmem_free(strval, za.za_num_integers); break; default: break; } } zap_cursor_fini(&zc); } mutex_exit(&spa->spa_props_lock); if (err && err != ENOENT) { return (err); } return (0); } EXPORT_SYMBOL(vdev_fault); EXPORT_SYMBOL(vdev_degrade); EXPORT_SYMBOL(vdev_online); EXPORT_SYMBOL(vdev_offline); EXPORT_SYMBOL(vdev_clear); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, default_ms_count, UINT, ZMOD_RW, "Target number of metaslabs per top-level vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, default_ms_shift, UINT, ZMOD_RW, "Default lower limit for metaslab size"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, max_ms_shift, UINT, ZMOD_RW, "Default upper limit for metaslab size"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, min_ms_count, UINT, ZMOD_RW, "Minimum number of metaslabs per top-level vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, ms_count_limit, UINT, ZMOD_RW, "Practical upper limit of total metaslabs per top-level vdev"); ZFS_MODULE_PARAM(zfs, zfs_, slow_io_events_per_second, UINT, ZMOD_RW, "Rate limit slow IO (delay) events to this many per second"); /* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs, zfs_, checksum_events_per_second, UINT, ZMOD_RW, "Rate limit checksum events to this many checksum errors per second " "(do not set below ZED threshold)."); /* END CSTYLED */ ZFS_MODULE_PARAM(zfs, zfs_, scan_ignore_errors, INT, ZMOD_RW, "Ignore errors during resilver/scrub"); ZFS_MODULE_PARAM(zfs_vdev, vdev_, validate_skip, INT, ZMOD_RW, "Bypass vdev_validate()"); ZFS_MODULE_PARAM(zfs, zfs_, nocacheflush, INT, ZMOD_RW, "Disable cache flushes"); ZFS_MODULE_PARAM(zfs, zfs_, embedded_slog_min_ms, UINT, ZMOD_RW, "Minimum number of metaslabs required to dedicate one for log blocks"); /* BEGIN CSTYLED */ ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, min_auto_ashift, param_set_min_auto_ashift, param_get_uint, ZMOD_RW, "Minimum ashift used when creating new top-level vdevs"); ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, max_auto_ashift, param_set_max_auto_ashift, param_get_uint, ZMOD_RW, "Maximum ashift used when optimizing for logical -> physical sector " "size on new top-level vdevs"); /* END CSTYLED */ diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 10279fde89df..3f5e6a08d89c 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -1,5168 +1,5180 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2022 by Delphix. All rights reserved. * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019, Klara Inc. * Copyright (c) 2019, Allan Jude * Copyright (c) 2021, Datto, Inc. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * ========================================================================== * I/O type descriptions * ========================================================================== */ const char *const zio_type_name[ZIO_TYPES] = { /* * Note: Linux kernel thread name length is limited * so these names will differ from upstream open zfs. */ "z_null", "z_rd", "z_wr", "z_fr", "z_cl", "z_ioctl", "z_trim" }; int zio_dva_throttle_enabled = B_TRUE; static int zio_deadman_log_all = B_FALSE; /* * ========================================================================== * I/O kmem caches * ========================================================================== */ static kmem_cache_t *zio_cache; static kmem_cache_t *zio_link_cache; kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; #if defined(ZFS_DEBUG) && !defined(_KERNEL) static uint64_t zio_buf_cache_allocs[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; static uint64_t zio_buf_cache_frees[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; #endif /* Mark IOs as "slow" if they take longer than 30 seconds */ static uint_t zio_slow_io_ms = (30 * MILLISEC); #define BP_SPANB(indblkshift, level) \ (((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT))) #define COMPARE_META_LEVEL 0x80000000ul /* * The following actions directly effect the spa's sync-to-convergence logic. * The values below define the sync pass when we start performing the action. * Care should be taken when changing these values as they directly impact * spa_sync() performance. Tuning these values may introduce subtle performance * pathologies and should only be done in the context of performance analysis. * These tunables will eventually be removed and replaced with #defines once * enough analysis has been done to determine optimal values. * * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that * regular blocks are not deferred. * * Starting in sync pass 8 (zfs_sync_pass_dont_compress), we disable * compression (including of metadata). In practice, we don't have this * many sync passes, so this has no effect. * * The original intent was that disabling compression would help the sync * passes to converge. However, in practice disabling compression increases * the average number of sync passes, because when we turn compression off, a * lot of block's size will change and thus we have to re-allocate (not * overwrite) them. It also increases the number of 128KB allocations (e.g. * for indirect blocks and spacemaps) because these will not be compressed. * The 128K allocations are especially detrimental to performance on highly * fragmented systems, which may have very few free segments of this size, * and may need to load new metaslabs to satisfy 128K allocations. */ /* defer frees starting in this pass */ uint_t zfs_sync_pass_deferred_free = 2; /* don't compress starting in this pass */ static uint_t zfs_sync_pass_dont_compress = 8; /* rewrite new bps starting in this pass */ static uint_t zfs_sync_pass_rewrite = 2; /* * An allocating zio is one that either currently has the DVA allocate * stage set or will have it later in its lifetime. */ #define IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE) /* * Enable smaller cores by excluding metadata * allocations as well. */ int zio_exclude_metadata = 0; static int zio_requeue_io_start_cut_in_line = 1; #ifdef ZFS_DEBUG static const int zio_buf_debug_limit = 16384; #else static const int zio_buf_debug_limit = 0; #endif static inline void __zio_execute(zio_t *zio); static void zio_taskq_dispatch(zio_t *, zio_taskq_type_t, boolean_t); void zio_init(void) { size_t c; zio_cache = kmem_cache_create("zio_cache", sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0); zio_link_cache = kmem_cache_create("zio_link_cache", sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0); /* * For small buffers, we want a cache for each multiple of * SPA_MINBLOCKSIZE. For larger buffers, we want a cache * for each quarter-power of 2. */ for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { size_t size = (c + 1) << SPA_MINBLOCKSHIFT; size_t p2 = size; size_t align = 0; size_t data_cflags, cflags; data_cflags = KMC_NODEBUG; cflags = (zio_exclude_metadata || size > zio_buf_debug_limit) ? KMC_NODEBUG : 0; while (!ISP2(p2)) p2 &= p2 - 1; #ifndef _KERNEL /* * If we are using watchpoints, put each buffer on its own page, * to eliminate the performance overhead of trapping to the * kernel when modifying a non-watched buffer that shares the * page with a watched buffer. */ if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE)) continue; /* * Here's the problem - on 4K native devices in userland on * Linux using O_DIRECT, buffers must be 4K aligned or I/O * will fail with EINVAL, causing zdb (and others) to coredump. * Since userland probably doesn't need optimized buffer caches, * we just force 4K alignment on everything. */ align = 8 * SPA_MINBLOCKSIZE; #else if (size < PAGESIZE) { align = SPA_MINBLOCKSIZE; } else if (IS_P2ALIGNED(size, p2 >> 2)) { align = PAGESIZE; } #endif if (align != 0) { char name[36]; if (cflags == data_cflags) { /* * Resulting kmem caches would be identical. * Save memory by creating only one. */ (void) snprintf(name, sizeof (name), "zio_buf_comb_%lu", (ulong_t)size); zio_buf_cache[c] = kmem_cache_create(name, size, align, NULL, NULL, NULL, NULL, NULL, cflags); zio_data_buf_cache[c] = zio_buf_cache[c]; continue; } (void) snprintf(name, sizeof (name), "zio_buf_%lu", (ulong_t)size); zio_buf_cache[c] = kmem_cache_create(name, size, align, NULL, NULL, NULL, NULL, NULL, cflags); (void) snprintf(name, sizeof (name), "zio_data_buf_%lu", (ulong_t)size); zio_data_buf_cache[c] = kmem_cache_create(name, size, align, NULL, NULL, NULL, NULL, NULL, data_cflags); } } while (--c != 0) { ASSERT(zio_buf_cache[c] != NULL); if (zio_buf_cache[c - 1] == NULL) zio_buf_cache[c - 1] = zio_buf_cache[c]; ASSERT(zio_data_buf_cache[c] != NULL); if (zio_data_buf_cache[c - 1] == NULL) zio_data_buf_cache[c - 1] = zio_data_buf_cache[c]; } zio_inject_init(); lz4_init(); } void zio_fini(void) { size_t n = SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; #if defined(ZFS_DEBUG) && !defined(_KERNEL) for (size_t i = 0; i < n; i++) { if (zio_buf_cache_allocs[i] != zio_buf_cache_frees[i]) (void) printf("zio_fini: [%d] %llu != %llu\n", (int)((i + 1) << SPA_MINBLOCKSHIFT), (long long unsigned)zio_buf_cache_allocs[i], (long long unsigned)zio_buf_cache_frees[i]); } #endif /* * The same kmem cache can show up multiple times in both zio_buf_cache * and zio_data_buf_cache. Do a wasteful but trivially correct scan to * sort it out. */ for (size_t i = 0; i < n; i++) { kmem_cache_t *cache = zio_buf_cache[i]; if (cache == NULL) continue; for (size_t j = i; j < n; j++) { if (cache == zio_buf_cache[j]) zio_buf_cache[j] = NULL; if (cache == zio_data_buf_cache[j]) zio_data_buf_cache[j] = NULL; } kmem_cache_destroy(cache); } for (size_t i = 0; i < n; i++) { kmem_cache_t *cache = zio_data_buf_cache[i]; if (cache == NULL) continue; for (size_t j = i; j < n; j++) { if (cache == zio_data_buf_cache[j]) zio_data_buf_cache[j] = NULL; } kmem_cache_destroy(cache); } for (size_t i = 0; i < n; i++) { VERIFY3P(zio_buf_cache[i], ==, NULL); VERIFY3P(zio_data_buf_cache[i], ==, NULL); } kmem_cache_destroy(zio_link_cache); kmem_cache_destroy(zio_cache); zio_inject_fini(); lz4_fini(); } /* * ========================================================================== * Allocate and free I/O buffers * ========================================================================== */ /* * Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a * crashdump if the kernel panics, so use it judiciously. Obviously, it's * useful to inspect ZFS metadata, but if possible, we should avoid keeping * excess / transient data in-core during a crashdump. */ void * zio_buf_alloc(size_t size) { size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); #if defined(ZFS_DEBUG) && !defined(_KERNEL) atomic_add_64(&zio_buf_cache_allocs[c], 1); #endif return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE)); } /* * Use zio_data_buf_alloc to allocate data. The data will not appear in a * crashdump if the kernel panics. This exists so that we will limit the amount * of ZFS data that shows up in a kernel crashdump. (Thus reducing the amount * of kernel heap dumped to disk when the kernel panics) */ void * zio_data_buf_alloc(size_t size) { size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE)); } void zio_buf_free(void *buf, size_t size) { size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); #if defined(ZFS_DEBUG) && !defined(_KERNEL) atomic_add_64(&zio_buf_cache_frees[c], 1); #endif kmem_cache_free(zio_buf_cache[c], buf); } void zio_data_buf_free(void *buf, size_t size) { size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); kmem_cache_free(zio_data_buf_cache[c], buf); } static void zio_abd_free(void *abd, size_t size) { (void) size; abd_free((abd_t *)abd); } /* * ========================================================================== * Push and pop I/O transform buffers * ========================================================================== */ void zio_push_transform(zio_t *zio, abd_t *data, uint64_t size, uint64_t bufsize, zio_transform_func_t *transform) { zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP); zt->zt_orig_abd = zio->io_abd; zt->zt_orig_size = zio->io_size; zt->zt_bufsize = bufsize; zt->zt_transform = transform; zt->zt_next = zio->io_transform_stack; zio->io_transform_stack = zt; zio->io_abd = data; zio->io_size = size; } void zio_pop_transforms(zio_t *zio) { zio_transform_t *zt; while ((zt = zio->io_transform_stack) != NULL) { if (zt->zt_transform != NULL) zt->zt_transform(zio, zt->zt_orig_abd, zt->zt_orig_size); if (zt->zt_bufsize != 0) abd_free(zio->io_abd); zio->io_abd = zt->zt_orig_abd; zio->io_size = zt->zt_orig_size; zio->io_transform_stack = zt->zt_next; kmem_free(zt, sizeof (zio_transform_t)); } } /* * ========================================================================== * I/O transform callbacks for subblocks, decompression, and decryption * ========================================================================== */ static void zio_subblock(zio_t *zio, abd_t *data, uint64_t size) { ASSERT(zio->io_size > size); if (zio->io_type == ZIO_TYPE_READ) abd_copy(data, zio->io_abd, size); } static void zio_decompress(zio_t *zio, abd_t *data, uint64_t size) { if (zio->io_error == 0) { void *tmp = abd_borrow_buf(data, size); int ret = zio_decompress_data(BP_GET_COMPRESS(zio->io_bp), zio->io_abd, tmp, zio->io_size, size, &zio->io_prop.zp_complevel); abd_return_buf_copy(data, tmp, size); if (zio_injection_enabled && ret == 0) ret = zio_handle_fault_injection(zio, EINVAL); if (ret != 0) zio->io_error = SET_ERROR(EIO); } } static void zio_decrypt(zio_t *zio, abd_t *data, uint64_t size) { int ret; void *tmp; blkptr_t *bp = zio->io_bp; spa_t *spa = zio->io_spa; uint64_t dsobj = zio->io_bookmark.zb_objset; uint64_t lsize = BP_GET_LSIZE(bp); dmu_object_type_t ot = BP_GET_TYPE(bp); uint8_t salt[ZIO_DATA_SALT_LEN]; uint8_t iv[ZIO_DATA_IV_LEN]; uint8_t mac[ZIO_DATA_MAC_LEN]; boolean_t no_crypt = B_FALSE; ASSERT(BP_USES_CRYPT(bp)); ASSERT3U(size, !=, 0); if (zio->io_error != 0) return; /* * Verify the cksum of MACs stored in an indirect bp. It will always * be possible to verify this since it does not require an encryption * key. */ if (BP_HAS_INDIRECT_MAC_CKSUM(bp)) { zio_crypt_decode_mac_bp(bp, mac); if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) { /* * We haven't decompressed the data yet, but * zio_crypt_do_indirect_mac_checksum() requires * decompressed data to be able to parse out the MACs * from the indirect block. We decompress it now and * throw away the result after we are finished. */ tmp = zio_buf_alloc(lsize); ret = zio_decompress_data(BP_GET_COMPRESS(bp), zio->io_abd, tmp, zio->io_size, lsize, &zio->io_prop.zp_complevel); if (ret != 0) { ret = SET_ERROR(EIO); goto error; } ret = zio_crypt_do_indirect_mac_checksum(B_FALSE, tmp, lsize, BP_SHOULD_BYTESWAP(bp), mac); zio_buf_free(tmp, lsize); } else { ret = zio_crypt_do_indirect_mac_checksum_abd(B_FALSE, zio->io_abd, size, BP_SHOULD_BYTESWAP(bp), mac); } abd_copy(data, zio->io_abd, size); if (zio_injection_enabled && ot != DMU_OT_DNODE && ret == 0) { ret = zio_handle_decrypt_injection(spa, &zio->io_bookmark, ot, ECKSUM); } if (ret != 0) goto error; return; } /* * If this is an authenticated block, just check the MAC. It would be * nice to separate this out into its own flag, but when this was done, * we had run out of bits in what is now zio_flag_t. Future cleanup * could make this a flag bit. */ if (BP_IS_AUTHENTICATED(bp)) { if (ot == DMU_OT_OBJSET) { ret = spa_do_crypt_objset_mac_abd(B_FALSE, spa, dsobj, zio->io_abd, size, BP_SHOULD_BYTESWAP(bp)); } else { zio_crypt_decode_mac_bp(bp, mac); ret = spa_do_crypt_mac_abd(B_FALSE, spa, dsobj, zio->io_abd, size, mac); if (zio_injection_enabled && ret == 0) { ret = zio_handle_decrypt_injection(spa, &zio->io_bookmark, ot, ECKSUM); } } abd_copy(data, zio->io_abd, size); if (ret != 0) goto error; return; } zio_crypt_decode_params_bp(bp, salt, iv); if (ot == DMU_OT_INTENT_LOG) { tmp = abd_borrow_buf_copy(zio->io_abd, sizeof (zil_chain_t)); zio_crypt_decode_mac_zil(tmp, mac); abd_return_buf(zio->io_abd, tmp, sizeof (zil_chain_t)); } else { zio_crypt_decode_mac_bp(bp, mac); } ret = spa_do_crypt_abd(B_FALSE, spa, &zio->io_bookmark, BP_GET_TYPE(bp), BP_GET_DEDUP(bp), BP_SHOULD_BYTESWAP(bp), salt, iv, mac, size, data, zio->io_abd, &no_crypt); if (no_crypt) abd_copy(data, zio->io_abd, size); if (ret != 0) goto error; return; error: /* assert that the key was found unless this was speculative */ ASSERT(ret != EACCES || (zio->io_flags & ZIO_FLAG_SPECULATIVE)); /* * If there was a decryption / authentication error return EIO as * the io_error. If this was not a speculative zio, create an ereport. */ if (ret == ECKSUM) { zio->io_error = SET_ERROR(EIO); if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) { spa_log_error(spa, &zio->io_bookmark, &zio->io_bp->blk_birth); (void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION, spa, NULL, &zio->io_bookmark, zio, 0); } } else { zio->io_error = ret; } } /* * ========================================================================== * I/O parent/child relationships and pipeline interlocks * ========================================================================== */ zio_t * zio_walk_parents(zio_t *cio, zio_link_t **zl) { list_t *pl = &cio->io_parent_list; *zl = (*zl == NULL) ? list_head(pl) : list_next(pl, *zl); if (*zl == NULL) return (NULL); ASSERT((*zl)->zl_child == cio); return ((*zl)->zl_parent); } zio_t * zio_walk_children(zio_t *pio, zio_link_t **zl) { list_t *cl = &pio->io_child_list; ASSERT(MUTEX_HELD(&pio->io_lock)); *zl = (*zl == NULL) ? list_head(cl) : list_next(cl, *zl); if (*zl == NULL) return (NULL); ASSERT((*zl)->zl_parent == pio); return ((*zl)->zl_child); } zio_t * zio_unique_parent(zio_t *cio) { zio_link_t *zl = NULL; zio_t *pio = zio_walk_parents(cio, &zl); VERIFY3P(zio_walk_parents(cio, &zl), ==, NULL); return (pio); } void zio_add_child(zio_t *pio, zio_t *cio) { /* * Logical I/Os can have logical, gang, or vdev children. * Gang I/Os can have gang or vdev children. * Vdev I/Os can only have vdev children. * The following ASSERT captures all of these constraints. */ ASSERT3S(cio->io_child_type, <=, pio->io_child_type); zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP); zl->zl_parent = pio; zl->zl_child = cio; mutex_enter(&pio->io_lock); mutex_enter(&cio->io_lock); ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0); uint64_t *countp = pio->io_children[cio->io_child_type]; for (int w = 0; w < ZIO_WAIT_TYPES; w++) countp[w] += !cio->io_state[w]; list_insert_head(&pio->io_child_list, zl); list_insert_head(&cio->io_parent_list, zl); mutex_exit(&cio->io_lock); mutex_exit(&pio->io_lock); } void zio_add_child_first(zio_t *pio, zio_t *cio) { /* * Logical I/Os can have logical, gang, or vdev children. * Gang I/Os can have gang or vdev children. * Vdev I/Os can only have vdev children. * The following ASSERT captures all of these constraints. */ ASSERT3S(cio->io_child_type, <=, pio->io_child_type); zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP); zl->zl_parent = pio; zl->zl_child = cio; ASSERT(list_is_empty(&cio->io_parent_list)); list_insert_head(&cio->io_parent_list, zl); mutex_enter(&pio->io_lock); ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0); uint64_t *countp = pio->io_children[cio->io_child_type]; for (int w = 0; w < ZIO_WAIT_TYPES; w++) countp[w] += !cio->io_state[w]; list_insert_head(&pio->io_child_list, zl); mutex_exit(&pio->io_lock); } static void zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl) { ASSERT(zl->zl_parent == pio); ASSERT(zl->zl_child == cio); mutex_enter(&pio->io_lock); mutex_enter(&cio->io_lock); list_remove(&pio->io_child_list, zl); list_remove(&cio->io_parent_list, zl); mutex_exit(&cio->io_lock); mutex_exit(&pio->io_lock); kmem_cache_free(zio_link_cache, zl); } static boolean_t zio_wait_for_children(zio_t *zio, uint8_t childbits, enum zio_wait_type wait) { boolean_t waiting = B_FALSE; mutex_enter(&zio->io_lock); ASSERT(zio->io_stall == NULL); for (int c = 0; c < ZIO_CHILD_TYPES; c++) { if (!(ZIO_CHILD_BIT_IS_SET(childbits, c))) continue; uint64_t *countp = &zio->io_children[c][wait]; if (*countp != 0) { zio->io_stage >>= 1; ASSERT3U(zio->io_stage, !=, ZIO_STAGE_OPEN); zio->io_stall = countp; waiting = B_TRUE; break; } } mutex_exit(&zio->io_lock); return (waiting); } __attribute__((always_inline)) static inline void zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait, zio_t **next_to_executep) { uint64_t *countp = &pio->io_children[zio->io_child_type][wait]; int *errorp = &pio->io_child_error[zio->io_child_type]; mutex_enter(&pio->io_lock); if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) *errorp = zio_worst_error(*errorp, zio->io_error); pio->io_reexecute |= zio->io_reexecute; ASSERT3U(*countp, >, 0); (*countp)--; if (*countp == 0 && pio->io_stall == countp) { zio_taskq_type_t type = pio->io_stage < ZIO_STAGE_VDEV_IO_START ? ZIO_TASKQ_ISSUE : ZIO_TASKQ_INTERRUPT; pio->io_stall = NULL; mutex_exit(&pio->io_lock); /* * If we can tell the caller to execute this parent next, do * so. We only do this if the parent's zio type matches the * child's type. Otherwise dispatch the parent zio in its * own taskq. * * Having the caller execute the parent when possible reduces * locking on the zio taskq's, reduces context switch * overhead, and has no recursion penalty. Note that one * read from disk typically causes at least 3 zio's: a * zio_null(), the logical zio_read(), and then a physical * zio. When the physical ZIO completes, we are able to call * zio_done() on all 3 of these zio's from one invocation of * zio_execute() by returning the parent back to * zio_execute(). Since the parent isn't executed until this * thread returns back to zio_execute(), the caller should do * so promptly. * * In other cases, dispatching the parent prevents * overflowing the stack when we have deeply nested * parent-child relationships, as we do with the "mega zio" * of writes for spa_sync(), and the chain of ZIL blocks. */ if (next_to_executep != NULL && *next_to_executep == NULL && pio->io_type == zio->io_type) { *next_to_executep = pio; } else { zio_taskq_dispatch(pio, type, B_FALSE); } } else { mutex_exit(&pio->io_lock); } } static void zio_inherit_child_errors(zio_t *zio, enum zio_child c) { if (zio->io_child_error[c] != 0 && zio->io_error == 0) zio->io_error = zio->io_child_error[c]; } int zio_bookmark_compare(const void *x1, const void *x2) { const zio_t *z1 = x1; const zio_t *z2 = x2; if (z1->io_bookmark.zb_objset < z2->io_bookmark.zb_objset) return (-1); if (z1->io_bookmark.zb_objset > z2->io_bookmark.zb_objset) return (1); if (z1->io_bookmark.zb_object < z2->io_bookmark.zb_object) return (-1); if (z1->io_bookmark.zb_object > z2->io_bookmark.zb_object) return (1); if (z1->io_bookmark.zb_level < z2->io_bookmark.zb_level) return (-1); if (z1->io_bookmark.zb_level > z2->io_bookmark.zb_level) return (1); if (z1->io_bookmark.zb_blkid < z2->io_bookmark.zb_blkid) return (-1); if (z1->io_bookmark.zb_blkid > z2->io_bookmark.zb_blkid) return (1); if (z1 < z2) return (-1); if (z1 > z2) return (1); return (0); } /* * ========================================================================== * Create the various types of I/O (read, write, free, etc) * ========================================================================== */ static zio_t * zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, abd_t *data, uint64_t lsize, uint64_t psize, zio_done_func_t *done, void *private, zio_type_t type, zio_priority_t priority, zio_flag_t flags, vdev_t *vd, uint64_t offset, const zbookmark_phys_t *zb, enum zio_stage stage, enum zio_stage pipeline) { zio_t *zio; IMPLY(type != ZIO_TYPE_TRIM, psize <= SPA_MAXBLOCKSIZE); ASSERT(P2PHASE(psize, SPA_MINBLOCKSIZE) == 0); ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0); ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER)); ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER)); ASSERT(vd || stage == ZIO_STAGE_OPEN); IMPLY(lsize != psize, (flags & ZIO_FLAG_RAW_COMPRESS) != 0); zio = kmem_cache_alloc(zio_cache, KM_SLEEP); memset(zio, 0, sizeof (zio_t)); mutex_init(&zio->io_lock, NULL, MUTEX_NOLOCKDEP, NULL); cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL); list_create(&zio->io_parent_list, sizeof (zio_link_t), offsetof(zio_link_t, zl_parent_node)); list_create(&zio->io_child_list, sizeof (zio_link_t), offsetof(zio_link_t, zl_child_node)); metaslab_trace_init(&zio->io_alloc_list); if (vd != NULL) zio->io_child_type = ZIO_CHILD_VDEV; else if (flags & ZIO_FLAG_GANG_CHILD) zio->io_child_type = ZIO_CHILD_GANG; else if (flags & ZIO_FLAG_DDT_CHILD) zio->io_child_type = ZIO_CHILD_DDT; else zio->io_child_type = ZIO_CHILD_LOGICAL; if (bp != NULL) { if (type != ZIO_TYPE_WRITE || zio->io_child_type == ZIO_CHILD_DDT) { zio->io_bp_copy = *bp; zio->io_bp = &zio->io_bp_copy; /* so caller can free */ } else { zio->io_bp = (blkptr_t *)bp; } zio->io_bp_orig = *bp; if (zio->io_child_type == ZIO_CHILD_LOGICAL) zio->io_logical = zio; if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp)) pipeline |= ZIO_GANG_STAGES; } zio->io_spa = spa; zio->io_txg = txg; zio->io_done = done; zio->io_private = private; zio->io_type = type; zio->io_priority = priority; zio->io_vd = vd; zio->io_offset = offset; zio->io_orig_abd = zio->io_abd = data; zio->io_orig_size = zio->io_size = psize; zio->io_lsize = lsize; zio->io_orig_flags = zio->io_flags = flags; zio->io_orig_stage = zio->io_stage = stage; zio->io_orig_pipeline = zio->io_pipeline = pipeline; zio->io_pipeline_trace = ZIO_STAGE_OPEN; zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY); zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE); if (zb != NULL) zio->io_bookmark = *zb; if (pio != NULL) { zio->io_metaslab_class = pio->io_metaslab_class; if (zio->io_logical == NULL) zio->io_logical = pio->io_logical; if (zio->io_child_type == ZIO_CHILD_GANG) zio->io_gang_leader = pio->io_gang_leader; zio_add_child_first(pio, zio); } taskq_init_ent(&zio->io_tqent); return (zio); } void zio_destroy(zio_t *zio) { metaslab_trace_fini(&zio->io_alloc_list); list_destroy(&zio->io_parent_list); list_destroy(&zio->io_child_list); mutex_destroy(&zio->io_lock); cv_destroy(&zio->io_cv); kmem_cache_free(zio_cache, zio); } zio_t * zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done, void *private, zio_flag_t flags) { zio_t *zio; zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private, ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL, ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE); return (zio); } zio_t * zio_root(spa_t *spa, zio_done_func_t *done, void *private, zio_flag_t flags) { return (zio_null(NULL, spa, NULL, done, private, flags)); } static int zfs_blkptr_verify_log(spa_t *spa, const blkptr_t *bp, enum blk_verify_flag blk_verify, const char *fmt, ...) { va_list adx; char buf[256]; va_start(adx, fmt); (void) vsnprintf(buf, sizeof (buf), fmt, adx); va_end(adx); zfs_dbgmsg("bad blkptr at %px: " "DVA[0]=%#llx/%#llx " "DVA[1]=%#llx/%#llx " "DVA[2]=%#llx/%#llx " "prop=%#llx " "pad=%#llx,%#llx " "phys_birth=%#llx " "birth=%#llx " "fill=%#llx " "cksum=%#llx/%#llx/%#llx/%#llx", bp, (long long)bp->blk_dva[0].dva_word[0], (long long)bp->blk_dva[0].dva_word[1], (long long)bp->blk_dva[1].dva_word[0], (long long)bp->blk_dva[1].dva_word[1], (long long)bp->blk_dva[2].dva_word[0], (long long)bp->blk_dva[2].dva_word[1], (long long)bp->blk_prop, (long long)bp->blk_pad[0], (long long)bp->blk_pad[1], (long long)bp->blk_phys_birth, (long long)bp->blk_birth, (long long)bp->blk_fill, (long long)bp->blk_cksum.zc_word[0], (long long)bp->blk_cksum.zc_word[1], (long long)bp->blk_cksum.zc_word[2], (long long)bp->blk_cksum.zc_word[3]); switch (blk_verify) { case BLK_VERIFY_HALT: zfs_panic_recover("%s: %s", spa_name(spa), buf); break; case BLK_VERIFY_LOG: zfs_dbgmsg("%s: %s", spa_name(spa), buf); break; case BLK_VERIFY_ONLY: break; } return (1); } /* * Verify the block pointer fields contain reasonable values. This means * it only contains known object types, checksum/compression identifiers, * block sizes within the maximum allowed limits, valid DVAs, etc. * * If everything checks out B_TRUE is returned. The zfs_blkptr_verify * argument controls the behavior when an invalid field is detected. * * Values for blk_verify_flag: * BLK_VERIFY_ONLY: evaluate the block * BLK_VERIFY_LOG: evaluate the block and log problems * BLK_VERIFY_HALT: call zfs_panic_recover on error * * Values for blk_config_flag: * BLK_CONFIG_HELD: caller holds SCL_VDEV for writer * BLK_CONFIG_NEEDED: caller holds no config lock, SCL_VDEV will be * obtained for reader * BLK_CONFIG_SKIP: skip checks which require SCL_VDEV, for better * performance */ boolean_t zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, enum blk_config_flag blk_config, enum blk_verify_flag blk_verify) { int errors = 0; if (!DMU_OT_IS_VALID(BP_GET_TYPE(bp))) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %px has invalid TYPE %llu", bp, (longlong_t)BP_GET_TYPE(bp)); } if (BP_GET_CHECKSUM(bp) >= ZIO_CHECKSUM_FUNCTIONS) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %px has invalid CHECKSUM %llu", bp, (longlong_t)BP_GET_CHECKSUM(bp)); } if (BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_FUNCTIONS) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %px has invalid COMPRESS %llu", bp, (longlong_t)BP_GET_COMPRESS(bp)); } if (BP_GET_LSIZE(bp) > SPA_MAXBLOCKSIZE) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %px has invalid LSIZE %llu", bp, (longlong_t)BP_GET_LSIZE(bp)); } if (BP_GET_PSIZE(bp) > SPA_MAXBLOCKSIZE) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %px has invalid PSIZE %llu", bp, (longlong_t)BP_GET_PSIZE(bp)); } if (BP_IS_EMBEDDED(bp)) { if (BPE_GET_ETYPE(bp) >= NUM_BP_EMBEDDED_TYPES) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %px has invalid ETYPE %llu", bp, (longlong_t)BPE_GET_ETYPE(bp)); } } /* * Do not verify individual DVAs if the config is not trusted. This * will be done once the zio is executed in vdev_mirror_map_alloc. */ if (!spa->spa_trust_config) return (errors == 0); switch (blk_config) { case BLK_CONFIG_HELD: ASSERT(spa_config_held(spa, SCL_VDEV, RW_WRITER)); break; case BLK_CONFIG_NEEDED: spa_config_enter(spa, SCL_VDEV, bp, RW_READER); break; case BLK_CONFIG_SKIP: return (errors == 0); default: panic("invalid blk_config %u", blk_config); } /* * Pool-specific checks. * * Note: it would be nice to verify that the blk_birth and * BP_PHYSICAL_BIRTH() are not too large. However, spa_freeze() * allows the birth time of log blocks (and dmu_sync()-ed blocks * that are in the log) to be arbitrarily large. */ for (int i = 0; i < BP_GET_NDVAS(bp); i++) { const dva_t *dva = &bp->blk_dva[i]; uint64_t vdevid = DVA_GET_VDEV(dva); if (vdevid >= spa->spa_root_vdev->vdev_children) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %px DVA %u has invalid VDEV %llu", bp, i, (longlong_t)vdevid); continue; } vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid]; if (vd == NULL) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %px DVA %u has invalid VDEV %llu", bp, i, (longlong_t)vdevid); continue; } if (vd->vdev_ops == &vdev_hole_ops) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %px DVA %u has hole VDEV %llu", bp, i, (longlong_t)vdevid); continue; } if (vd->vdev_ops == &vdev_missing_ops) { /* * "missing" vdevs are valid during import, but we * don't have their detailed info (e.g. asize), so * we can't perform any more checks on them. */ continue; } uint64_t offset = DVA_GET_OFFSET(dva); uint64_t asize = DVA_GET_ASIZE(dva); if (DVA_GET_GANG(dva)) asize = vdev_gang_header_asize(vd); if (offset + asize > vd->vdev_asize) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, "blkptr at %px DVA %u has invalid OFFSET %llu", bp, i, (longlong_t)offset); } } if (blk_config == BLK_CONFIG_NEEDED) spa_config_exit(spa, SCL_VDEV, bp); return (errors == 0); } boolean_t zfs_dva_valid(spa_t *spa, const dva_t *dva, const blkptr_t *bp) { (void) bp; uint64_t vdevid = DVA_GET_VDEV(dva); if (vdevid >= spa->spa_root_vdev->vdev_children) return (B_FALSE); vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid]; if (vd == NULL) return (B_FALSE); if (vd->vdev_ops == &vdev_hole_ops) return (B_FALSE); if (vd->vdev_ops == &vdev_missing_ops) { return (B_FALSE); } uint64_t offset = DVA_GET_OFFSET(dva); uint64_t asize = DVA_GET_ASIZE(dva); if (DVA_GET_GANG(dva)) asize = vdev_gang_header_asize(vd); if (offset + asize > vd->vdev_asize) return (B_FALSE); return (B_TRUE); } zio_t * zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, abd_t *data, uint64_t size, zio_done_func_t *done, void *private, zio_priority_t priority, zio_flag_t flags, const zbookmark_phys_t *zb) { zio_t *zio; zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp, data, size, size, done, private, ZIO_TYPE_READ, priority, flags, NULL, 0, zb, ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE); return (zio); } zio_t * zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data, uint64_t lsize, uint64_t psize, const zio_prop_t *zp, zio_done_func_t *ready, zio_done_func_t *children_ready, zio_done_func_t *done, void *private, zio_priority_t priority, zio_flag_t flags, const zbookmark_phys_t *zb) { zio_t *zio; ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF && zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS && zp->zp_compress >= ZIO_COMPRESS_OFF && zp->zp_compress < ZIO_COMPRESS_FUNCTIONS && DMU_OT_IS_VALID(zp->zp_type) && zp->zp_level < 32 && zp->zp_copies > 0 && zp->zp_copies <= spa_max_replication(spa)); zio = zio_create(pio, spa, txg, bp, data, lsize, psize, done, private, ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb, ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE); zio->io_ready = ready; zio->io_children_ready = children_ready; zio->io_prop = *zp; /* * Data can be NULL if we are going to call zio_write_override() to * provide the already-allocated BP. But we may need the data to * verify a dedup hit (if requested). In this case, don't try to * dedup (just take the already-allocated BP verbatim). Encrypted * dedup blocks need data as well so we also disable dedup in this * case. */ if (data == NULL && (zio->io_prop.zp_dedup_verify || zio->io_prop.zp_encrypt)) { zio->io_prop.zp_dedup = zio->io_prop.zp_dedup_verify = B_FALSE; } return (zio); } zio_t * zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data, uint64_t size, zio_done_func_t *done, void *private, zio_priority_t priority, zio_flag_t flags, zbookmark_phys_t *zb) { zio_t *zio; zio = zio_create(pio, spa, txg, bp, data, size, size, done, private, ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_IO_REWRITE, NULL, 0, zb, ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE); return (zio); } void zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite, boolean_t brtwrite) { ASSERT(zio->io_type == ZIO_TYPE_WRITE); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(zio->io_stage == ZIO_STAGE_OPEN); ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa)); ASSERT(!brtwrite || !nopwrite); /* * We must reset the io_prop to match the values that existed * when the bp was first written by dmu_sync() keeping in mind * that nopwrite and dedup are mutually exclusive. */ zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup; zio->io_prop.zp_nopwrite = nopwrite; zio->io_prop.zp_brtwrite = brtwrite; zio->io_prop.zp_copies = copies; zio->io_bp_override = bp; } void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp) { (void) zfs_blkptr_verify(spa, bp, BLK_CONFIG_NEEDED, BLK_VERIFY_HALT); /* * The check for EMBEDDED is a performance optimization. We * process the free here (by ignoring it) rather than * putting it on the list and then processing it in zio_free_sync(). */ if (BP_IS_EMBEDDED(bp)) return; /* * Frees that are for the currently-syncing txg, are not going to be * deferred, and which will not need to do a read (i.e. not GANG or * DEDUP), can be processed immediately. Otherwise, put them on the * in-memory list for later processing. * * Note that we only defer frees after zfs_sync_pass_deferred_free * when the log space map feature is disabled. [see relevant comment * in spa_sync_iterate_to_convergence()] */ if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp) || txg != spa->spa_syncing_txg || (spa_sync_pass(spa) >= zfs_sync_pass_deferred_free && !spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) || brt_maybe_exists(spa, bp)) { metaslab_check_free(spa, bp); bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp); } else { VERIFY3P(zio_free_sync(NULL, spa, txg, bp, 0), ==, NULL); } } /* * To improve performance, this function may return NULL if we were able * to do the free immediately. This avoids the cost of creating a zio * (and linking it to the parent, etc). */ zio_t * zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio_flag_t flags) { ASSERT(!BP_IS_HOLE(bp)); ASSERT(spa_syncing_txg(spa) == txg); if (BP_IS_EMBEDDED(bp)) return (NULL); metaslab_check_free(spa, bp); arc_freed(spa, bp); dsl_scan_freed(spa, bp); if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp) || brt_maybe_exists(spa, bp)) { /* * GANG, DEDUP and BRT blocks can induce a read (for the gang * block header, the DDT or the BRT), so issue them * asynchronously so that this thread is not tied up. */ enum zio_stage stage = ZIO_FREE_PIPELINE | ZIO_STAGE_ISSUE_ASYNC; return (zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), BP_GET_PSIZE(bp), NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW, flags, NULL, 0, NULL, ZIO_STAGE_OPEN, stage)); } else { metaslab_free(spa, bp, txg, B_FALSE); return (NULL); } } zio_t * zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio_done_func_t *done, void *private, zio_flag_t flags) { zio_t *zio; (void) zfs_blkptr_verify(spa, bp, (flags & ZIO_FLAG_CONFIG_WRITER) ? BLK_CONFIG_HELD : BLK_CONFIG_NEEDED, BLK_VERIFY_HALT); if (BP_IS_EMBEDDED(bp)) return (zio_null(pio, spa, NULL, NULL, NULL, 0)); /* * A claim is an allocation of a specific block. Claims are needed * to support immediate writes in the intent log. The issue is that * immediate writes contain committed data, but in a txg that was * *not* committed. Upon opening the pool after an unclean shutdown, * the intent log claims all blocks that contain immediate write data * so that the SPA knows they're in use. * * All claims *must* be resolved in the first txg -- before the SPA * starts allocating blocks -- so that nothing is allocated twice. * If txg == 0 we just verify that the block is claimable. */ ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_min_claim_txg(spa)); ASSERT(txg == spa_min_claim_txg(spa) || txg == 0); ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa)); /* zdb(8) */ zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), BP_GET_PSIZE(bp), done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags, NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE); ASSERT0(zio->io_queued_timestamp); return (zio); } zio_t * zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, zio_done_func_t *done, void *private, zio_flag_t flags) { zio_t *zio; int c; if (vd->vdev_children == 0) { zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private, ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL, ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); zio->io_cmd = cmd; } else { zio = zio_null(pio, spa, NULL, NULL, NULL, flags); for (c = 0; c < vd->vdev_children; c++) zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd, done, private, flags)); } return (zio); } zio_t * zio_trim(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, zio_done_func_t *done, void *private, zio_priority_t priority, zio_flag_t flags, enum trim_flag trim_flags) { zio_t *zio; ASSERT0(vd->vdev_children); ASSERT0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); ASSERT0(P2PHASE(size, 1ULL << vd->vdev_ashift)); ASSERT3U(size, !=, 0); zio = zio_create(pio, vd->vdev_spa, 0, NULL, NULL, size, size, done, private, ZIO_TYPE_TRIM, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_TRIM_PIPELINE); zio->io_trim_flags = trim_flags; return (zio); } zio_t * zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, abd_t *data, int checksum, zio_done_func_t *done, void *private, zio_priority_t priority, zio_flag_t flags, boolean_t labels) { zio_t *zio; ASSERT(vd->vdev_children == 0); ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); ASSERT3U(offset + size, <=, vd->vdev_psize); zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, size, done, private, ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE); zio->io_prop.zp_checksum = checksum; return (zio); } zio_t * zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, abd_t *data, int checksum, zio_done_func_t *done, void *private, zio_priority_t priority, zio_flag_t flags, boolean_t labels) { zio_t *zio; ASSERT(vd->vdev_children == 0); ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); ASSERT3U(offset + size, <=, vd->vdev_psize); zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, size, done, private, ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE); zio->io_prop.zp_checksum = checksum; if (zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_EMBEDDED) { /* * zec checksums are necessarily destructive -- they modify * the end of the write buffer to hold the verifier/checksum. * Therefore, we must make a local copy in case the data is * being written to multiple places in parallel. */ abd_t *wbuf = abd_alloc_sametype(data, size); abd_copy(wbuf, data, size); zio_push_transform(zio, wbuf, size, size, NULL); } return (zio); } /* * Create a child I/O to do some work for us. */ zio_t * zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size, int type, zio_priority_t priority, zio_flag_t flags, zio_done_func_t *done, void *private) { enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE; zio_t *zio; /* * vdev child I/Os do not propagate their error to the parent. * Therefore, for correct operation the caller *must* check for * and handle the error in the child i/o's done callback. * The only exceptions are i/os that we don't care about * (OPTIONAL or REPAIR). */ ASSERT((flags & ZIO_FLAG_OPTIONAL) || (flags & ZIO_FLAG_IO_REPAIR) || done != NULL); if (type == ZIO_TYPE_READ && bp != NULL) { /* * If we have the bp, then the child should perform the * checksum and the parent need not. This pushes error * detection as close to the leaves as possible and * eliminates redundant checksums in the interior nodes. */ pipeline |= ZIO_STAGE_CHECKSUM_VERIFY; pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; } if (vd->vdev_ops->vdev_op_leaf) { ASSERT0(vd->vdev_children); offset += VDEV_LABEL_START_SIZE; } flags |= ZIO_VDEV_CHILD_FLAGS(pio); /* * If we've decided to do a repair, the write is not speculative -- * even if the original read was. */ if (flags & ZIO_FLAG_IO_REPAIR) flags &= ~ZIO_FLAG_SPECULATIVE; /* * If we're creating a child I/O that is not associated with a * top-level vdev, then the child zio is not an allocating I/O. * If this is a retried I/O then we ignore it since we will * have already processed the original allocating I/O. */ if (flags & ZIO_FLAG_IO_ALLOCATING && (vd != vd->vdev_top || (flags & ZIO_FLAG_IO_RETRY))) { ASSERT(pio->io_metaslab_class != NULL); ASSERT(pio->io_metaslab_class->mc_alloc_throttle_enabled); ASSERT(type == ZIO_TYPE_WRITE); ASSERT(priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(!(flags & ZIO_FLAG_IO_REPAIR)); ASSERT(!(pio->io_flags & ZIO_FLAG_IO_REWRITE) || pio->io_child_type == ZIO_CHILD_GANG); flags &= ~ZIO_FLAG_IO_ALLOCATING; } zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, size, done, private, type, priority, flags, vd, offset, &pio->io_bookmark, ZIO_STAGE_VDEV_IO_START >> 1, pipeline); ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV); return (zio); } zio_t * zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size, zio_type_t type, zio_priority_t priority, zio_flag_t flags, zio_done_func_t *done, void *private) { zio_t *zio; ASSERT(vd->vdev_ops->vdev_op_leaf); zio = zio_create(NULL, vd->vdev_spa, 0, NULL, data, size, size, done, private, type, priority, flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED, vd, offset, NULL, ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE); return (zio); } void zio_flush(zio_t *zio, vdev_t *vd) { zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, NULL, NULL, ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY)); } void zio_shrink(zio_t *zio, uint64_t size) { ASSERT3P(zio->io_executor, ==, NULL); ASSERT3U(zio->io_orig_size, ==, zio->io_size); ASSERT3U(size, <=, zio->io_size); /* * We don't shrink for raidz because of problems with the * reconstruction when reading back less than the block size. * Note, BP_IS_RAIDZ() assumes no compression. */ ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF); if (!BP_IS_RAIDZ(zio->io_bp)) { /* we are not doing a raw write */ ASSERT3U(zio->io_size, ==, zio->io_lsize); zio->io_orig_size = zio->io_size = zio->io_lsize = size; } } +/* + * Round provided allocation size up to a value that can be allocated + * by at least some vdev(s) in the pool with minimum or no additional + * padding and without extra space usage on others + */ +static uint64_t +zio_roundup_alloc_size(spa_t *spa, uint64_t size) +{ + if (size > spa->spa_min_alloc) + return (roundup(size, spa->spa_gcd_alloc)); + return (spa->spa_min_alloc); +} + /* * ========================================================================== * Prepare to read and write logical blocks * ========================================================================== */ static zio_t * zio_read_bp_init(zio_t *zio) { blkptr_t *bp = zio->io_bp; uint64_t psize = BP_IS_EMBEDDED(bp) ? BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp); ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy); if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF && zio->io_child_type == ZIO_CHILD_LOGICAL && !(zio->io_flags & ZIO_FLAG_RAW_COMPRESS)) { zio_push_transform(zio, abd_alloc_sametype(zio->io_abd, psize), psize, psize, zio_decompress); } if (((BP_IS_PROTECTED(bp) && !(zio->io_flags & ZIO_FLAG_RAW_ENCRYPT)) || BP_HAS_INDIRECT_MAC_CKSUM(bp)) && zio->io_child_type == ZIO_CHILD_LOGICAL) { zio_push_transform(zio, abd_alloc_sametype(zio->io_abd, psize), psize, psize, zio_decrypt); } if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) { int psize = BPE_GET_PSIZE(bp); void *data = abd_borrow_buf(zio->io_abd, psize); zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; decode_embedded_bp_compressed(bp, data); abd_return_buf_copy(zio->io_abd, data, psize); } else { ASSERT(!BP_IS_EMBEDDED(bp)); } if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL) zio->io_pipeline = ZIO_DDT_READ_PIPELINE; return (zio); } static zio_t * zio_write_bp_init(zio_t *zio) { if (!IO_IS_ALLOCATING(zio)) return (zio); ASSERT(zio->io_child_type != ZIO_CHILD_DDT); if (zio->io_bp_override) { blkptr_t *bp = zio->io_bp; zio_prop_t *zp = &zio->io_prop; ASSERT(bp->blk_birth != zio->io_txg); *bp = *zio->io_bp_override; zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; if (zp->zp_brtwrite) return (zio); ASSERT(!BP_GET_DEDUP(zio->io_bp_override)); if (BP_IS_EMBEDDED(bp)) return (zio); /* * If we've been overridden and nopwrite is set then * set the flag accordingly to indicate that a nopwrite * has already occurred. */ if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) { ASSERT(!zp->zp_dedup); ASSERT3U(BP_GET_CHECKSUM(bp), ==, zp->zp_checksum); zio->io_flags |= ZIO_FLAG_NOPWRITE; return (zio); } ASSERT(!zp->zp_nopwrite); if (BP_IS_HOLE(bp) || !zp->zp_dedup) return (zio); ASSERT((zio_checksum_table[zp->zp_checksum].ci_flags & ZCHECKSUM_FLAG_DEDUP) || zp->zp_dedup_verify); if (BP_GET_CHECKSUM(bp) == zp->zp_checksum && !zp->zp_encrypt) { BP_SET_DEDUP(bp, 1); zio->io_pipeline |= ZIO_STAGE_DDT_WRITE; return (zio); } /* * We were unable to handle this as an override bp, treat * it as a regular write I/O. */ zio->io_bp_override = NULL; *bp = zio->io_bp_orig; zio->io_pipeline = zio->io_orig_pipeline; } return (zio); } static zio_t * zio_write_compress(zio_t *zio) { spa_t *spa = zio->io_spa; zio_prop_t *zp = &zio->io_prop; enum zio_compress compress = zp->zp_compress; blkptr_t *bp = zio->io_bp; uint64_t lsize = zio->io_lsize; uint64_t psize = zio->io_size; uint32_t pass = 1; /* * If our children haven't all reached the ready stage, * wait for them and then repeat this pipeline stage. */ if (zio_wait_for_children(zio, ZIO_CHILD_LOGICAL_BIT | ZIO_CHILD_GANG_BIT, ZIO_WAIT_READY)) { return (NULL); } if (!IO_IS_ALLOCATING(zio)) return (zio); if (zio->io_children_ready != NULL) { /* * Now that all our children are ready, run the callback * associated with this zio in case it wants to modify the * data to be written. */ ASSERT3U(zp->zp_level, >, 0); zio->io_children_ready(zio); } ASSERT(zio->io_child_type != ZIO_CHILD_DDT); ASSERT(zio->io_bp_override == NULL); if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) { /* * We're rewriting an existing block, which means we're * working on behalf of spa_sync(). For spa_sync() to * converge, it must eventually be the case that we don't * have to allocate new blocks. But compression changes * the blocksize, which forces a reallocate, and makes * convergence take longer. Therefore, after the first * few passes, stop compressing to ensure convergence. */ pass = spa_sync_pass(spa); ASSERT(zio->io_txg == spa_syncing_txg(spa)); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(!BP_GET_DEDUP(bp)); if (pass >= zfs_sync_pass_dont_compress) compress = ZIO_COMPRESS_OFF; /* Make sure someone doesn't change their mind on overwrites */ ASSERT(BP_IS_EMBEDDED(bp) || MIN(zp->zp_copies + BP_IS_GANG(bp), spa_max_replication(spa)) == BP_GET_NDVAS(bp)); } /* If it's a compressed write that is not raw, compress the buffer. */ if (compress != ZIO_COMPRESS_OFF && !(zio->io_flags & ZIO_FLAG_RAW_COMPRESS)) { void *cbuf = NULL; psize = zio_compress_data(compress, zio->io_abd, &cbuf, lsize, zp->zp_complevel); if (psize == 0) { compress = ZIO_COMPRESS_OFF; } else if (psize >= lsize) { compress = ZIO_COMPRESS_OFF; if (cbuf != NULL) zio_buf_free(cbuf, lsize); } else if (!zp->zp_dedup && !zp->zp_encrypt && psize <= BPE_PAYLOAD_SIZE && zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) && spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) { encode_embedded_bp_compressed(bp, cbuf, compress, lsize, psize); BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA); BP_SET_TYPE(bp, zio->io_prop.zp_type); BP_SET_LEVEL(bp, zio->io_prop.zp_level); zio_buf_free(cbuf, lsize); bp->blk_birth = zio->io_txg; zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; ASSERT(spa_feature_is_active(spa, SPA_FEATURE_EMBEDDED_DATA)); return (zio); } else { /* * Round compressed size up to the minimum allocation * size of the smallest-ashift device, and zero the * tail. This ensures that the compressed size of the * BP (and thus compressratio property) are correct, * in that we charge for the padding used to fill out * the last sector. */ - ASSERT3U(spa->spa_min_alloc, >=, SPA_MINBLOCKSHIFT); - size_t rounded = (size_t)roundup(psize, - spa->spa_min_alloc); + size_t rounded = (size_t)zio_roundup_alloc_size(spa, + psize); if (rounded >= lsize) { compress = ZIO_COMPRESS_OFF; zio_buf_free(cbuf, lsize); psize = lsize; } else { abd_t *cdata = abd_get_from_buf(cbuf, lsize); abd_take_ownership_of_buf(cdata, B_TRUE); abd_zero_off(cdata, psize, rounded - psize); psize = rounded; zio_push_transform(zio, cdata, psize, lsize, NULL); } } /* * We were unable to handle this as an override bp, treat * it as a regular write I/O. */ zio->io_bp_override = NULL; *bp = zio->io_bp_orig; zio->io_pipeline = zio->io_orig_pipeline; } else if ((zio->io_flags & ZIO_FLAG_RAW_ENCRYPT) != 0 && zp->zp_type == DMU_OT_DNODE) { /* * The DMU actually relies on the zio layer's compression * to free metadnode blocks that have had all contained * dnodes freed. As a result, even when doing a raw * receive, we must check whether the block can be compressed * to a hole. */ psize = zio_compress_data(ZIO_COMPRESS_EMPTY, zio->io_abd, NULL, lsize, zp->zp_complevel); if (psize == 0 || psize >= lsize) compress = ZIO_COMPRESS_OFF; } else if (zio->io_flags & ZIO_FLAG_RAW_COMPRESS && !(zio->io_flags & ZIO_FLAG_RAW_ENCRYPT)) { /* * If we are raw receiving an encrypted dataset we should not * take this codepath because it will change the on-disk block * and decryption will fail. */ - size_t rounded = MIN((size_t)roundup(psize, - spa->spa_min_alloc), lsize); + size_t rounded = MIN((size_t)zio_roundup_alloc_size(spa, psize), + lsize); if (rounded != psize) { abd_t *cdata = abd_alloc_linear(rounded, B_TRUE); abd_zero_off(cdata, psize, rounded - psize); abd_copy_off(cdata, zio->io_abd, 0, 0, psize); psize = rounded; zio_push_transform(zio, cdata, psize, rounded, NULL); } } else { ASSERT3U(psize, !=, 0); } /* * The final pass of spa_sync() must be all rewrites, but the first * few passes offer a trade-off: allocating blocks defers convergence, * but newly allocated blocks are sequential, so they can be written * to disk faster. Therefore, we allow the first few passes of * spa_sync() to allocate new blocks, but force rewrites after that. * There should only be a handful of blocks after pass 1 in any case. */ if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == psize && pass >= zfs_sync_pass_rewrite) { VERIFY3U(psize, !=, 0); enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES; zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages; zio->io_flags |= ZIO_FLAG_IO_REWRITE; } else { BP_ZERO(bp); zio->io_pipeline = ZIO_WRITE_PIPELINE; } if (psize == 0) { if (zio->io_bp_orig.blk_birth != 0 && spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) { BP_SET_LSIZE(bp, lsize); BP_SET_TYPE(bp, zp->zp_type); BP_SET_LEVEL(bp, zp->zp_level); BP_SET_BIRTH(bp, zio->io_txg, 0); } zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; } else { ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER); BP_SET_LSIZE(bp, lsize); BP_SET_TYPE(bp, zp->zp_type); BP_SET_LEVEL(bp, zp->zp_level); BP_SET_PSIZE(bp, psize); BP_SET_COMPRESS(bp, compress); BP_SET_CHECKSUM(bp, zp->zp_checksum); BP_SET_DEDUP(bp, zp->zp_dedup); BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); if (zp->zp_dedup) { ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); ASSERT(!zp->zp_encrypt || DMU_OT_IS_ENCRYPTED(zp->zp_type)); zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE; } if (zp->zp_nopwrite) { ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); zio->io_pipeline |= ZIO_STAGE_NOP_WRITE; } } return (zio); } static zio_t * zio_free_bp_init(zio_t *zio) { blkptr_t *bp = zio->io_bp; if (zio->io_child_type == ZIO_CHILD_LOGICAL) { if (BP_GET_DEDUP(bp)) zio->io_pipeline = ZIO_DDT_FREE_PIPELINE; } ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy); return (zio); } /* * ========================================================================== * Execute the I/O pipeline * ========================================================================== */ static void zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline) { spa_t *spa = zio->io_spa; zio_type_t t = zio->io_type; int flags = (cutinline ? TQ_FRONT : 0); /* * If we're a config writer or a probe, the normal issue and * interrupt threads may all be blocked waiting for the config lock. * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL. */ if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE)) t = ZIO_TYPE_NULL; /* * A similar issue exists for the L2ARC write thread until L2ARC 2.0. */ if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux) t = ZIO_TYPE_NULL; /* * If this is a high priority I/O, then use the high priority taskq if * available. */ if ((zio->io_priority == ZIO_PRIORITY_NOW || zio->io_priority == ZIO_PRIORITY_SYNC_WRITE) && spa->spa_zio_taskq[t][q + 1].stqs_count != 0) q++; ASSERT3U(q, <, ZIO_TASKQ_TYPES); /* * NB: We are assuming that the zio can only be dispatched * to a single taskq at a time. It would be a grievous error * to dispatch the zio to another taskq at the same time. */ ASSERT(taskq_empty_ent(&zio->io_tqent)); spa_taskq_dispatch_ent(spa, t, q, zio_execute, zio, flags, &zio->io_tqent); } static boolean_t zio_taskq_member(zio_t *zio, zio_taskq_type_t q) { spa_t *spa = zio->io_spa; taskq_t *tq = taskq_of_curthread(); for (zio_type_t t = 0; t < ZIO_TYPES; t++) { spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; uint_t i; for (i = 0; i < tqs->stqs_count; i++) { if (tqs->stqs_taskq[i] == tq) return (B_TRUE); } } return (B_FALSE); } static zio_t * zio_issue_async(zio_t *zio) { zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); return (NULL); } void zio_interrupt(void *zio) { zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE); } void zio_delay_interrupt(zio_t *zio) { /* * The timeout_generic() function isn't defined in userspace, so * rather than trying to implement the function, the zio delay * functionality has been disabled for userspace builds. */ #ifdef _KERNEL /* * If io_target_timestamp is zero, then no delay has been registered * for this IO, thus jump to the end of this function and "skip" the * delay; issuing it directly to the zio layer. */ if (zio->io_target_timestamp != 0) { hrtime_t now = gethrtime(); if (now >= zio->io_target_timestamp) { /* * This IO has already taken longer than the target * delay to complete, so we don't want to delay it * any longer; we "miss" the delay and issue it * directly to the zio layer. This is likely due to * the target latency being set to a value less than * the underlying hardware can satisfy (e.g. delay * set to 1ms, but the disks take 10ms to complete an * IO request). */ DTRACE_PROBE2(zio__delay__miss, zio_t *, zio, hrtime_t, now); zio_interrupt(zio); } else { taskqid_t tid; hrtime_t diff = zio->io_target_timestamp - now; clock_t expire_at_tick = ddi_get_lbolt() + NSEC_TO_TICK(diff); DTRACE_PROBE3(zio__delay__hit, zio_t *, zio, hrtime_t, now, hrtime_t, diff); if (NSEC_TO_TICK(diff) == 0) { /* Our delay is less than a jiffy - just spin */ zfs_sleep_until(zio->io_target_timestamp); zio_interrupt(zio); } else { /* * Use taskq_dispatch_delay() in the place of * OpenZFS's timeout_generic(). */ tid = taskq_dispatch_delay(system_taskq, zio_interrupt, zio, TQ_NOSLEEP, expire_at_tick); if (tid == TASKQID_INVALID) { /* * Couldn't allocate a task. Just * finish the zio without a delay. */ zio_interrupt(zio); } } } return; } #endif DTRACE_PROBE1(zio__delay__skip, zio_t *, zio); zio_interrupt(zio); } static void zio_deadman_impl(zio_t *pio, int ziodepth) { zio_t *cio, *cio_next; zio_link_t *zl = NULL; vdev_t *vd = pio->io_vd; if (zio_deadman_log_all || (vd != NULL && vd->vdev_ops->vdev_op_leaf)) { vdev_queue_t *vq = vd ? &vd->vdev_queue : NULL; zbookmark_phys_t *zb = &pio->io_bookmark; uint64_t delta = gethrtime() - pio->io_timestamp; uint64_t failmode = spa_get_deadman_failmode(pio->io_spa); zfs_dbgmsg("slow zio[%d]: zio=%px timestamp=%llu " "delta=%llu queued=%llu io=%llu " "path=%s " "last=%llu type=%d " "priority=%d flags=0x%llx stage=0x%x " "pipeline=0x%x pipeline-trace=0x%x " "objset=%llu object=%llu " "level=%llu blkid=%llu " "offset=%llu size=%llu " "error=%d", ziodepth, pio, pio->io_timestamp, (u_longlong_t)delta, pio->io_delta, pio->io_delay, vd ? vd->vdev_path : "NULL", vq ? vq->vq_io_complete_ts : 0, pio->io_type, pio->io_priority, (u_longlong_t)pio->io_flags, pio->io_stage, pio->io_pipeline, pio->io_pipeline_trace, (u_longlong_t)zb->zb_objset, (u_longlong_t)zb->zb_object, (u_longlong_t)zb->zb_level, (u_longlong_t)zb->zb_blkid, (u_longlong_t)pio->io_offset, (u_longlong_t)pio->io_size, pio->io_error); (void) zfs_ereport_post(FM_EREPORT_ZFS_DEADMAN, pio->io_spa, vd, zb, pio, 0); if (failmode == ZIO_FAILURE_MODE_CONTINUE && taskq_empty_ent(&pio->io_tqent)) { zio_interrupt(pio); } } mutex_enter(&pio->io_lock); for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) { cio_next = zio_walk_children(pio, &zl); zio_deadman_impl(cio, ziodepth + 1); } mutex_exit(&pio->io_lock); } /* * Log the critical information describing this zio and all of its children * using the zfs_dbgmsg() interface then post deadman event for the ZED. */ void zio_deadman(zio_t *pio, const char *tag) { spa_t *spa = pio->io_spa; char *name = spa_name(spa); if (!zfs_deadman_enabled || spa_suspended(spa)) return; zio_deadman_impl(pio, 0); switch (spa_get_deadman_failmode(spa)) { case ZIO_FAILURE_MODE_WAIT: zfs_dbgmsg("%s waiting for hung I/O to pool '%s'", tag, name); break; case ZIO_FAILURE_MODE_CONTINUE: zfs_dbgmsg("%s restarting hung I/O for pool '%s'", tag, name); break; case ZIO_FAILURE_MODE_PANIC: fm_panic("%s determined I/O to pool '%s' is hung.", tag, name); break; } } /* * Execute the I/O pipeline until one of the following occurs: * (1) the I/O completes; (2) the pipeline stalls waiting for * dependent child I/Os; (3) the I/O issues, so we're waiting * for an I/O completion interrupt; (4) the I/O is delegated by * vdev-level caching or aggregation; (5) the I/O is deferred * due to vdev-level queueing; (6) the I/O is handed off to * another thread. In all cases, the pipeline stops whenever * there's no CPU work; it never burns a thread in cv_wait_io(). * * There's no locking on io_stage because there's no legitimate way * for multiple threads to be attempting to process the same I/O. */ static zio_pipe_stage_t *zio_pipeline[]; /* * zio_execute() is a wrapper around the static function * __zio_execute() so that we can force __zio_execute() to be * inlined. This reduces stack overhead which is important * because __zio_execute() is called recursively in several zio * code paths. zio_execute() itself cannot be inlined because * it is externally visible. */ void zio_execute(void *zio) { fstrans_cookie_t cookie; cookie = spl_fstrans_mark(); __zio_execute(zio); spl_fstrans_unmark(cookie); } /* * Used to determine if in the current context the stack is sized large * enough to allow zio_execute() to be called recursively. A minimum * stack size of 16K is required to avoid needing to re-dispatch the zio. */ static boolean_t zio_execute_stack_check(zio_t *zio) { #if !defined(HAVE_LARGE_STACKS) dsl_pool_t *dp = spa_get_dsl(zio->io_spa); /* Executing in txg_sync_thread() context. */ if (dp && curthread == dp->dp_tx.tx_sync_thread) return (B_TRUE); /* Pool initialization outside of zio_taskq context. */ if (dp && spa_is_initializing(dp->dp_spa) && !zio_taskq_member(zio, ZIO_TASKQ_ISSUE) && !zio_taskq_member(zio, ZIO_TASKQ_ISSUE_HIGH)) return (B_TRUE); #else (void) zio; #endif /* HAVE_LARGE_STACKS */ return (B_FALSE); } __attribute__((always_inline)) static inline void __zio_execute(zio_t *zio) { ASSERT3U(zio->io_queued_timestamp, >, 0); while (zio->io_stage < ZIO_STAGE_DONE) { enum zio_stage pipeline = zio->io_pipeline; enum zio_stage stage = zio->io_stage; zio->io_executor = curthread; ASSERT(!MUTEX_HELD(&zio->io_lock)); ASSERT(ISP2(stage)); ASSERT(zio->io_stall == NULL); do { stage <<= 1; } while ((stage & pipeline) == 0); ASSERT(stage <= ZIO_STAGE_DONE); /* * If we are in interrupt context and this pipeline stage * will grab a config lock that is held across I/O, * or may wait for an I/O that needs an interrupt thread * to complete, issue async to avoid deadlock. * * For VDEV_IO_START, we cut in line so that the io will * be sent to disk promptly. */ if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL && zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) { boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ? zio_requeue_io_start_cut_in_line : B_FALSE; zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut); return; } /* * If the current context doesn't have large enough stacks * the zio must be issued asynchronously to prevent overflow. */ if (zio_execute_stack_check(zio)) { boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ? zio_requeue_io_start_cut_in_line : B_FALSE; zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut); return; } zio->io_stage = stage; zio->io_pipeline_trace |= zio->io_stage; /* * The zio pipeline stage returns the next zio to execute * (typically the same as this one), or NULL if we should * stop. */ zio = zio_pipeline[highbit64(stage) - 1](zio); if (zio == NULL) return; } } /* * ========================================================================== * Initiate I/O, either sync or async * ========================================================================== */ int zio_wait(zio_t *zio) { /* * Some routines, like zio_free_sync(), may return a NULL zio * to avoid the performance overhead of creating and then destroying * an unneeded zio. For the callers' simplicity, we accept a NULL * zio and ignore it. */ if (zio == NULL) return (0); long timeout = MSEC_TO_TICK(zfs_deadman_ziotime_ms); int error; ASSERT3S(zio->io_stage, ==, ZIO_STAGE_OPEN); ASSERT3P(zio->io_executor, ==, NULL); zio->io_waiter = curthread; ASSERT0(zio->io_queued_timestamp); zio->io_queued_timestamp = gethrtime(); __zio_execute(zio); mutex_enter(&zio->io_lock); while (zio->io_executor != NULL) { error = cv_timedwait_io(&zio->io_cv, &zio->io_lock, ddi_get_lbolt() + timeout); if (zfs_deadman_enabled && error == -1 && gethrtime() - zio->io_queued_timestamp > spa_deadman_ziotime(zio->io_spa)) { mutex_exit(&zio->io_lock); timeout = MSEC_TO_TICK(zfs_deadman_checktime_ms); zio_deadman(zio, FTAG); mutex_enter(&zio->io_lock); } } mutex_exit(&zio->io_lock); error = zio->io_error; zio_destroy(zio); return (error); } void zio_nowait(zio_t *zio) { /* * See comment in zio_wait(). */ if (zio == NULL) return; ASSERT3P(zio->io_executor, ==, NULL); if (zio->io_child_type == ZIO_CHILD_LOGICAL && list_is_empty(&zio->io_parent_list)) { zio_t *pio; /* * This is a logical async I/O with no parent to wait for it. * We add it to the spa_async_root_zio "Godfather" I/O which * will ensure they complete prior to unloading the pool. */ spa_t *spa = zio->io_spa; pio = spa->spa_async_zio_root[CPU_SEQID_UNSTABLE]; zio_add_child(pio, zio); } ASSERT0(zio->io_queued_timestamp); zio->io_queued_timestamp = gethrtime(); __zio_execute(zio); } /* * ========================================================================== * Reexecute, cancel, or suspend/resume failed I/O * ========================================================================== */ static void zio_reexecute(void *arg) { zio_t *pio = arg; zio_t *cio, *cio_next; ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN); ASSERT(pio->io_gang_leader == NULL); ASSERT(pio->io_gang_tree == NULL); pio->io_flags = pio->io_orig_flags; pio->io_stage = pio->io_orig_stage; pio->io_pipeline = pio->io_orig_pipeline; pio->io_reexecute = 0; pio->io_flags |= ZIO_FLAG_REEXECUTED; pio->io_pipeline_trace = 0; pio->io_error = 0; for (int w = 0; w < ZIO_WAIT_TYPES; w++) pio->io_state[w] = 0; for (int c = 0; c < ZIO_CHILD_TYPES; c++) pio->io_child_error[c] = 0; if (IO_IS_ALLOCATING(pio)) BP_ZERO(pio->io_bp); /* * As we reexecute pio's children, new children could be created. * New children go to the head of pio's io_child_list, however, * so we will (correctly) not reexecute them. The key is that * the remainder of pio's io_child_list, from 'cio_next' onward, * cannot be affected by any side effects of reexecuting 'cio'. */ zio_link_t *zl = NULL; mutex_enter(&pio->io_lock); for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) { cio_next = zio_walk_children(pio, &zl); for (int w = 0; w < ZIO_WAIT_TYPES; w++) pio->io_children[cio->io_child_type][w]++; mutex_exit(&pio->io_lock); zio_reexecute(cio); mutex_enter(&pio->io_lock); } mutex_exit(&pio->io_lock); /* * Now that all children have been reexecuted, execute the parent. * We don't reexecute "The Godfather" I/O here as it's the * responsibility of the caller to wait on it. */ if (!(pio->io_flags & ZIO_FLAG_GODFATHER)) { pio->io_queued_timestamp = gethrtime(); __zio_execute(pio); } } void zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t reason) { if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC) fm_panic("Pool '%s' has encountered an uncorrectable I/O " "failure and the failure mode property for this pool " "is set to panic.", spa_name(spa)); cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable I/O " "failure and has been suspended.\n", spa_name(spa)); (void) zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, NULL, 0); mutex_enter(&spa->spa_suspend_lock); if (spa->spa_suspend_zio_root == NULL) spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); spa->spa_suspended = reason; if (zio != NULL) { ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); ASSERT(zio != spa->spa_suspend_zio_root); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(zio_unique_parent(zio) == NULL); ASSERT(zio->io_stage == ZIO_STAGE_DONE); zio_add_child(spa->spa_suspend_zio_root, zio); } mutex_exit(&spa->spa_suspend_lock); } int zio_resume(spa_t *spa) { zio_t *pio; /* * Reexecute all previously suspended i/o. */ mutex_enter(&spa->spa_suspend_lock); spa->spa_suspended = ZIO_SUSPEND_NONE; cv_broadcast(&spa->spa_suspend_cv); pio = spa->spa_suspend_zio_root; spa->spa_suspend_zio_root = NULL; mutex_exit(&spa->spa_suspend_lock); if (pio == NULL) return (0); zio_reexecute(pio); return (zio_wait(pio)); } void zio_resume_wait(spa_t *spa) { mutex_enter(&spa->spa_suspend_lock); while (spa_suspended(spa)) cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock); mutex_exit(&spa->spa_suspend_lock); } /* * ========================================================================== * Gang blocks. * * A gang block is a collection of small blocks that looks to the DMU * like one large block. When zio_dva_allocate() cannot find a block * of the requested size, due to either severe fragmentation or the pool * being nearly full, it calls zio_write_gang_block() to construct the * block from smaller fragments. * * A gang block consists of a gang header (zio_gbh_phys_t) and up to * three (SPA_GBH_NBLKPTRS) gang members. The gang header is just like * an indirect block: it's an array of block pointers. It consumes * only one sector and hence is allocatable regardless of fragmentation. * The gang header's bps point to its gang members, which hold the data. * * Gang blocks are self-checksumming, using the bp's * as the verifier to ensure uniqueness of the SHA256 checksum. * Critically, the gang block bp's blk_cksum is the checksum of the data, * not the gang header. This ensures that data block signatures (needed for * deduplication) are independent of how the block is physically stored. * * Gang blocks can be nested: a gang member may itself be a gang block. * Thus every gang block is a tree in which root and all interior nodes are * gang headers, and the leaves are normal blocks that contain user data. * The root of the gang tree is called the gang leader. * * To perform any operation (read, rewrite, free, claim) on a gang block, * zio_gang_assemble() first assembles the gang tree (minus data leaves) * in the io_gang_tree field of the original logical i/o by recursively * reading the gang leader and all gang headers below it. This yields * an in-core tree containing the contents of every gang header and the * bps for every constituent of the gang block. * * With the gang tree now assembled, zio_gang_issue() just walks the gang tree * and invokes a callback on each bp. To free a gang block, zio_gang_issue() * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp. * zio_claim_gang() provides a similarly trivial wrapper for zio_claim(). * zio_read_gang() is a wrapper around zio_read() that omits reading gang * headers, since we already have those in io_gang_tree. zio_rewrite_gang() * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite() * of the gang header plus zio_checksum_compute() of the data to update the * gang header's blk_cksum as described above. * * The two-phase assemble/issue model solves the problem of partial failure -- * what if you'd freed part of a gang block but then couldn't read the * gang header for another part? Assembling the entire gang tree first * ensures that all the necessary gang header I/O has succeeded before * starting the actual work of free, claim, or write. Once the gang tree * is assembled, free and claim are in-memory operations that cannot fail. * * In the event that a gang write fails, zio_dva_unallocate() walks the * gang tree to immediately free (i.e. insert back into the space map) * everything we've allocated. This ensures that we don't get ENOSPC * errors during repeated suspend/resume cycles due to a flaky device. * * Gang rewrites only happen during sync-to-convergence. If we can't assemble * the gang tree, we won't modify the block, so we can safely defer the free * (knowing that the block is still intact). If we *can* assemble the gang * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free * each constituent bp and we can allocate a new block on the next sync pass. * * In all cases, the gang tree allows complete recovery from partial failure. * ========================================================================== */ static void zio_gang_issue_func_done(zio_t *zio) { abd_free(zio->io_abd); } static zio_t * zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, uint64_t offset) { if (gn != NULL) return (pio); return (zio_read(pio, pio->io_spa, bp, abd_get_offset(data, offset), BP_GET_PSIZE(bp), zio_gang_issue_func_done, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark)); } static zio_t * zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, uint64_t offset) { zio_t *zio; if (gn != NULL) { abd_t *gbh_abd = abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE); zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, gbh_abd, SPA_GANGBLOCKSIZE, zio_gang_issue_func_done, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); /* * As we rewrite each gang header, the pipeline will compute * a new gang block header checksum for it; but no one will * compute a new data checksum, so we do that here. The one * exception is the gang leader: the pipeline already computed * its data checksum because that stage precedes gang assembly. * (Presently, nothing actually uses interior data checksums; * this is just good hygiene.) */ if (gn != pio->io_gang_leader->io_gang_tree) { abd_t *buf = abd_get_offset(data, offset); zio_checksum_compute(zio, BP_GET_CHECKSUM(bp), buf, BP_GET_PSIZE(bp)); abd_free(buf); } /* * If we are here to damage data for testing purposes, * leave the GBH alone so that we can detect the damage. */ if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE) zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; } else { zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, abd_get_offset(data, offset), BP_GET_PSIZE(bp), zio_gang_issue_func_done, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); } return (zio); } static zio_t * zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, uint64_t offset) { (void) gn, (void) data, (void) offset; zio_t *zio = zio_free_sync(pio, pio->io_spa, pio->io_txg, bp, ZIO_GANG_CHILD_FLAGS(pio)); if (zio == NULL) { zio = zio_null(pio, pio->io_spa, NULL, NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)); } return (zio); } static zio_t * zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, uint64_t offset) { (void) gn, (void) data, (void) offset; return (zio_claim(pio, pio->io_spa, pio->io_txg, bp, NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio))); } static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = { NULL, zio_read_gang, zio_rewrite_gang, zio_free_gang, zio_claim_gang, NULL }; static void zio_gang_tree_assemble_done(zio_t *zio); static zio_gang_node_t * zio_gang_node_alloc(zio_gang_node_t **gnpp) { zio_gang_node_t *gn; ASSERT(*gnpp == NULL); gn = kmem_zalloc(sizeof (*gn), KM_SLEEP); gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE); *gnpp = gn; return (gn); } static void zio_gang_node_free(zio_gang_node_t **gnpp) { zio_gang_node_t *gn = *gnpp; for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) ASSERT(gn->gn_child[g] == NULL); zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE); kmem_free(gn, sizeof (*gn)); *gnpp = NULL; } static void zio_gang_tree_free(zio_gang_node_t **gnpp) { zio_gang_node_t *gn = *gnpp; if (gn == NULL) return; for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) zio_gang_tree_free(&gn->gn_child[g]); zio_gang_node_free(gnpp); } static void zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp) { zio_gang_node_t *gn = zio_gang_node_alloc(gnpp); abd_t *gbh_abd = abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE); ASSERT(gio->io_gang_leader == gio); ASSERT(BP_IS_GANG(bp)); zio_nowait(zio_read(gio, gio->io_spa, bp, gbh_abd, SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn, gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark)); } static void zio_gang_tree_assemble_done(zio_t *zio) { zio_t *gio = zio->io_gang_leader; zio_gang_node_t *gn = zio->io_private; blkptr_t *bp = zio->io_bp; ASSERT(gio == zio_unique_parent(zio)); ASSERT(list_is_empty(&zio->io_child_list)); if (zio->io_error) return; /* this ABD was created from a linear buf in zio_gang_tree_assemble */ if (BP_SHOULD_BYTESWAP(bp)) byteswap_uint64_array(abd_to_buf(zio->io_abd), zio->io_size); ASSERT3P(abd_to_buf(zio->io_abd), ==, gn->gn_gbh); ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); abd_free(zio->io_abd); for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; if (!BP_IS_GANG(gbp)) continue; zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]); } } static void zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, abd_t *data, uint64_t offset) { zio_t *gio = pio->io_gang_leader; zio_t *zio; ASSERT(BP_IS_GANG(bp) == !!gn); ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp)); ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree); /* * If you're a gang header, your data is in gn->gn_gbh. * If you're a gang member, your data is in 'data' and gn == NULL. */ zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data, offset); if (gn != NULL) { ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; if (BP_IS_HOLE(gbp)) continue; zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data, offset); offset += BP_GET_PSIZE(gbp); } } if (gn == gio->io_gang_tree) ASSERT3U(gio->io_size, ==, offset); if (zio != pio) zio_nowait(zio); } static zio_t * zio_gang_assemble(zio_t *zio) { blkptr_t *bp = zio->io_bp; ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL); ASSERT(zio->io_child_type > ZIO_CHILD_GANG); zio->io_gang_leader = zio; zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree); return (zio); } static zio_t * zio_gang_issue(zio_t *zio) { blkptr_t *bp = zio->io_bp; if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT, ZIO_WAIT_DONE)) { return (NULL); } ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio); ASSERT(zio->io_child_type > ZIO_CHILD_GANG); if (zio->io_child_error[ZIO_CHILD_GANG] == 0) zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_abd, 0); else zio_gang_tree_free(&zio->io_gang_tree); zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; return (zio); } static void zio_write_gang_member_ready(zio_t *zio) { zio_t *pio = zio_unique_parent(zio); dva_t *cdva = zio->io_bp->blk_dva; dva_t *pdva = pio->io_bp->blk_dva; uint64_t asize; zio_t *gio __maybe_unused = zio->io_gang_leader; if (BP_IS_HOLE(zio->io_bp)) return; ASSERT(BP_IS_HOLE(&zio->io_bp_orig)); ASSERT(zio->io_child_type == ZIO_CHILD_GANG); ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies); ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp)); ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp)); VERIFY3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp)); mutex_enter(&pio->io_lock); for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) { ASSERT(DVA_GET_GANG(&pdva[d])); asize = DVA_GET_ASIZE(&pdva[d]); asize += DVA_GET_ASIZE(&cdva[d]); DVA_SET_ASIZE(&pdva[d], asize); } mutex_exit(&pio->io_lock); } static void zio_write_gang_done(zio_t *zio) { /* * The io_abd field will be NULL for a zio with no data. The io_flags * will initially have the ZIO_FLAG_NODATA bit flag set, but we can't * check for it here as it is cleared in zio_ready. */ if (zio->io_abd != NULL) abd_free(zio->io_abd); } static zio_t * zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) { spa_t *spa = pio->io_spa; blkptr_t *bp = pio->io_bp; zio_t *gio = pio->io_gang_leader; zio_t *zio; zio_gang_node_t *gn, **gnpp; zio_gbh_phys_t *gbh; abd_t *gbh_abd; uint64_t txg = pio->io_txg; uint64_t resid = pio->io_size; uint64_t lsize; int copies = gio->io_prop.zp_copies; zio_prop_t zp; int error; boolean_t has_data = !(pio->io_flags & ZIO_FLAG_NODATA); /* * If one copy was requested, store 2 copies of the GBH, so that we * can still traverse all the data (e.g. to free or scrub) even if a * block is damaged. Note that we can't store 3 copies of the GBH in * all cases, e.g. with encryption, which uses DVA[2] for the IV+salt. */ int gbh_copies = copies; if (gbh_copies == 1) { gbh_copies = MIN(2, spa_max_replication(spa)); } int flags = METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER; if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(has_data); flags |= METASLAB_ASYNC_ALLOC; VERIFY(zfs_refcount_held(&mc->mc_allocator[pio->io_allocator]. mca_alloc_slots, pio)); /* * The logical zio has already placed a reservation for * 'copies' allocation slots but gang blocks may require * additional copies. These additional copies * (i.e. gbh_copies - copies) are guaranteed to succeed * since metaslab_class_throttle_reserve() always allows * additional reservations for gang blocks. */ VERIFY(metaslab_class_throttle_reserve(mc, gbh_copies - copies, pio->io_allocator, pio, flags)); } error = metaslab_alloc(spa, mc, SPA_GANGBLOCKSIZE, bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, flags, &pio->io_alloc_list, pio, pio->io_allocator); if (error) { if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(has_data); /* * If we failed to allocate the gang block header then * we remove any additional allocation reservations that * we placed here. The original reservation will * be removed when the logical I/O goes to the ready * stage. */ metaslab_class_throttle_unreserve(mc, gbh_copies - copies, pio->io_allocator, pio); } pio->io_error = error; return (pio); } if (pio == gio) { gnpp = &gio->io_gang_tree; } else { gnpp = pio->io_private; ASSERT(pio->io_ready == zio_write_gang_member_ready); } gn = zio_gang_node_alloc(gnpp); gbh = gn->gn_gbh; memset(gbh, 0, SPA_GANGBLOCKSIZE); gbh_abd = abd_get_from_buf(gbh, SPA_GANGBLOCKSIZE); /* * Create the gang header. */ zio = zio_rewrite(pio, spa, txg, bp, gbh_abd, SPA_GANGBLOCKSIZE, zio_write_gang_done, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); /* * Create and nowait the gang children. */ for (int g = 0; resid != 0; resid -= lsize, g++) { lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g), SPA_MINBLOCKSIZE); ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid); zp.zp_checksum = gio->io_prop.zp_checksum; zp.zp_compress = ZIO_COMPRESS_OFF; zp.zp_complevel = gio->io_prop.zp_complevel; zp.zp_type = DMU_OT_NONE; zp.zp_level = 0; zp.zp_copies = gio->io_prop.zp_copies; zp.zp_dedup = B_FALSE; zp.zp_dedup_verify = B_FALSE; zp.zp_nopwrite = B_FALSE; zp.zp_encrypt = gio->io_prop.zp_encrypt; zp.zp_byteorder = gio->io_prop.zp_byteorder; memset(zp.zp_salt, 0, ZIO_DATA_SALT_LEN); memset(zp.zp_iv, 0, ZIO_DATA_IV_LEN); memset(zp.zp_mac, 0, ZIO_DATA_MAC_LEN); zio_t *cio = zio_write(zio, spa, txg, &gbh->zg_blkptr[g], has_data ? abd_get_offset(pio->io_abd, pio->io_size - resid) : NULL, lsize, lsize, &zp, zio_write_gang_member_ready, NULL, zio_write_gang_done, &gn->gn_child[g], pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(has_data); /* * Gang children won't throttle but we should * account for their work, so reserve an allocation * slot for them here. */ VERIFY(metaslab_class_throttle_reserve(mc, zp.zp_copies, cio->io_allocator, cio, flags)); } zio_nowait(cio); } /* * Set pio's pipeline to just wait for zio to finish. */ pio->io_pipeline = ZIO_INTERLOCK_PIPELINE; /* * We didn't allocate this bp, so make sure it doesn't get unmarked. */ pio->io_flags &= ~ZIO_FLAG_FASTWRITE; zio_nowait(zio); return (pio); } /* * The zio_nop_write stage in the pipeline determines if allocating a * new bp is necessary. The nopwrite feature can handle writes in * either syncing or open context (i.e. zil writes) and as a result is * mutually exclusive with dedup. * * By leveraging a cryptographically secure checksum, such as SHA256, we * can compare the checksums of the new data and the old to determine if * allocating a new block is required. Note that our requirements for * cryptographic strength are fairly weak: there can't be any accidental * hash collisions, but we don't need to be secure against intentional * (malicious) collisions. To trigger a nopwrite, you have to be able * to write the file to begin with, and triggering an incorrect (hash * collision) nopwrite is no worse than simply writing to the file. * That said, there are no known attacks against the checksum algorithms * used for nopwrite, assuming that the salt and the checksums * themselves remain secret. */ static zio_t * zio_nop_write(zio_t *zio) { blkptr_t *bp = zio->io_bp; blkptr_t *bp_orig = &zio->io_bp_orig; zio_prop_t *zp = &zio->io_prop; ASSERT(BP_IS_HOLE(bp)); ASSERT(BP_GET_LEVEL(bp) == 0); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); ASSERT(zp->zp_nopwrite); ASSERT(!zp->zp_dedup); ASSERT(zio->io_bp_override == NULL); ASSERT(IO_IS_ALLOCATING(zio)); /* * Check to see if the original bp and the new bp have matching * characteristics (i.e. same checksum, compression algorithms, etc). * If they don't then just continue with the pipeline which will * allocate a new bp. */ if (BP_IS_HOLE(bp_orig) || !(zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_flags & ZCHECKSUM_FLAG_NOPWRITE) || BP_IS_ENCRYPTED(bp) || BP_IS_ENCRYPTED(bp_orig) || BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) || BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) || BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) || zp->zp_copies != BP_GET_NDVAS(bp_orig)) return (zio); /* * If the checksums match then reset the pipeline so that we * avoid allocating a new bp and issuing any I/O. */ if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) { ASSERT(zio_checksum_table[zp->zp_checksum].ci_flags & ZCHECKSUM_FLAG_NOPWRITE); ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig)); ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig)); ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF); ASSERT3U(bp->blk_prop, ==, bp_orig->blk_prop); /* * If we're overwriting a block that is currently on an * indirect vdev, then ignore the nopwrite request and * allow a new block to be allocated on a concrete vdev. */ spa_config_enter(zio->io_spa, SCL_VDEV, FTAG, RW_READER); for (int d = 0; d < BP_GET_NDVAS(bp_orig); d++) { vdev_t *tvd = vdev_lookup_top(zio->io_spa, DVA_GET_VDEV(&bp_orig->blk_dva[d])); if (tvd->vdev_ops == &vdev_indirect_ops) { spa_config_exit(zio->io_spa, SCL_VDEV, FTAG); return (zio); } } spa_config_exit(zio->io_spa, SCL_VDEV, FTAG); *bp = *bp_orig; zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; zio->io_flags |= ZIO_FLAG_NOPWRITE; } return (zio); } /* * ========================================================================== * Block Reference Table * ========================================================================== */ static zio_t * zio_brt_free(zio_t *zio) { blkptr_t *bp; bp = zio->io_bp; if (BP_GET_LEVEL(bp) > 0 || BP_IS_METADATA(bp) || !brt_maybe_exists(zio->io_spa, bp)) { return (zio); } if (!brt_entry_decref(zio->io_spa, bp)) { /* * This isn't the last reference, so we cannot free * the data yet. */ zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; } return (zio); } /* * ========================================================================== * Dedup * ========================================================================== */ static void zio_ddt_child_read_done(zio_t *zio) { blkptr_t *bp = zio->io_bp; ddt_entry_t *dde = zio->io_private; ddt_phys_t *ddp; zio_t *pio = zio_unique_parent(zio); mutex_enter(&pio->io_lock); ddp = ddt_phys_select(dde, bp); if (zio->io_error == 0) ddt_phys_clear(ddp); /* this ddp doesn't need repair */ if (zio->io_error == 0 && dde->dde_repair_abd == NULL) dde->dde_repair_abd = zio->io_abd; else abd_free(zio->io_abd); mutex_exit(&pio->io_lock); } static zio_t * zio_ddt_read_start(zio_t *zio) { blkptr_t *bp = zio->io_bp; ASSERT(BP_GET_DEDUP(bp)); ASSERT(BP_GET_PSIZE(bp) == zio->io_size); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); if (zio->io_child_error[ZIO_CHILD_DDT]) { ddt_t *ddt = ddt_select(zio->io_spa, bp); ddt_entry_t *dde = ddt_repair_start(ddt, bp); ddt_phys_t *ddp = dde->dde_phys; ddt_phys_t *ddp_self = ddt_phys_select(dde, bp); blkptr_t blk; ASSERT(zio->io_vsd == NULL); zio->io_vsd = dde; if (ddp_self == NULL) return (zio); for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { if (ddp->ddp_phys_birth == 0 || ddp == ddp_self) continue; ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp, &blk); zio_nowait(zio_read(zio, zio->io_spa, &blk, abd_alloc_for_io(zio->io_size, B_TRUE), zio->io_size, zio_ddt_child_read_done, dde, zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE, &zio->io_bookmark)); } return (zio); } zio_nowait(zio_read(zio, zio->io_spa, bp, zio->io_abd, zio->io_size, NULL, NULL, zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark)); return (zio); } static zio_t * zio_ddt_read_done(zio_t *zio) { blkptr_t *bp = zio->io_bp; if (zio_wait_for_children(zio, ZIO_CHILD_DDT_BIT, ZIO_WAIT_DONE)) { return (NULL); } ASSERT(BP_GET_DEDUP(bp)); ASSERT(BP_GET_PSIZE(bp) == zio->io_size); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); if (zio->io_child_error[ZIO_CHILD_DDT]) { ddt_t *ddt = ddt_select(zio->io_spa, bp); ddt_entry_t *dde = zio->io_vsd; if (ddt == NULL) { ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE); return (zio); } if (dde == NULL) { zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1; zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); return (NULL); } if (dde->dde_repair_abd != NULL) { abd_copy(zio->io_abd, dde->dde_repair_abd, zio->io_size); zio->io_child_error[ZIO_CHILD_DDT] = 0; } ddt_repair_done(ddt, dde); zio->io_vsd = NULL; } ASSERT(zio->io_vsd == NULL); return (zio); } static boolean_t zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) { spa_t *spa = zio->io_spa; boolean_t do_raw = !!(zio->io_flags & ZIO_FLAG_RAW); ASSERT(!(zio->io_bp_override && do_raw)); /* * Note: we compare the original data, not the transformed data, * because when zio->io_bp is an override bp, we will not have * pushed the I/O transforms. That's an important optimization * because otherwise we'd compress/encrypt all dmu_sync() data twice. * However, we should never get a raw, override zio so in these * cases we can compare the io_abd directly. This is useful because * it allows us to do dedup verification even if we don't have access * to the original data (for instance, if the encryption keys aren't * loaded). */ for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { zio_t *lio = dde->dde_lead_zio[p]; if (lio != NULL && do_raw) { return (lio->io_size != zio->io_size || abd_cmp(zio->io_abd, lio->io_abd) != 0); } else if (lio != NULL) { return (lio->io_orig_size != zio->io_orig_size || abd_cmp(zio->io_orig_abd, lio->io_orig_abd) != 0); } } for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { ddt_phys_t *ddp = &dde->dde_phys[p]; if (ddp->ddp_phys_birth != 0 && do_raw) { blkptr_t blk = *zio->io_bp; uint64_t psize; abd_t *tmpabd; int error; ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth); psize = BP_GET_PSIZE(&blk); if (psize != zio->io_size) return (B_TRUE); ddt_exit(ddt); tmpabd = abd_alloc_for_io(psize, B_TRUE); error = zio_wait(zio_read(NULL, spa, &blk, tmpabd, psize, NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_RAW, &zio->io_bookmark)); if (error == 0) { if (abd_cmp(tmpabd, zio->io_abd) != 0) error = SET_ERROR(ENOENT); } abd_free(tmpabd); ddt_enter(ddt); return (error != 0); } else if (ddp->ddp_phys_birth != 0) { arc_buf_t *abuf = NULL; arc_flags_t aflags = ARC_FLAG_WAIT; blkptr_t blk = *zio->io_bp; int error; ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth); if (BP_GET_LSIZE(&blk) != zio->io_orig_size) return (B_TRUE); ddt_exit(ddt); error = arc_read(NULL, spa, &blk, arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &aflags, &zio->io_bookmark); if (error == 0) { if (abd_cmp_buf(zio->io_orig_abd, abuf->b_data, zio->io_orig_size) != 0) error = SET_ERROR(ENOENT); arc_buf_destroy(abuf, &abuf); } ddt_enter(ddt); return (error != 0); } } return (B_FALSE); } static void zio_ddt_child_write_ready(zio_t *zio) { int p = zio->io_prop.zp_copies; ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); ddt_entry_t *dde = zio->io_private; ddt_phys_t *ddp = &dde->dde_phys[p]; zio_t *pio; if (zio->io_error) return; ddt_enter(ddt); ASSERT(dde->dde_lead_zio[p] == zio); ddt_phys_fill(ddp, zio->io_bp); zio_link_t *zl = NULL; while ((pio = zio_walk_parents(zio, &zl)) != NULL) ddt_bp_fill(ddp, pio->io_bp, zio->io_txg); ddt_exit(ddt); } static void zio_ddt_child_write_done(zio_t *zio) { int p = zio->io_prop.zp_copies; ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); ddt_entry_t *dde = zio->io_private; ddt_phys_t *ddp = &dde->dde_phys[p]; ddt_enter(ddt); ASSERT(ddp->ddp_refcnt == 0); ASSERT(dde->dde_lead_zio[p] == zio); dde->dde_lead_zio[p] = NULL; if (zio->io_error == 0) { zio_link_t *zl = NULL; while (zio_walk_parents(zio, &zl) != NULL) ddt_phys_addref(ddp); } else { ddt_phys_clear(ddp); } ddt_exit(ddt); } static zio_t * zio_ddt_write(zio_t *zio) { spa_t *spa = zio->io_spa; blkptr_t *bp = zio->io_bp; uint64_t txg = zio->io_txg; zio_prop_t *zp = &zio->io_prop; int p = zp->zp_copies; zio_t *cio = NULL; ddt_t *ddt = ddt_select(spa, bp); ddt_entry_t *dde; ddt_phys_t *ddp; ASSERT(BP_GET_DEDUP(bp)); ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum); ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override); ASSERT(!(zio->io_bp_override && (zio->io_flags & ZIO_FLAG_RAW))); ddt_enter(ddt); dde = ddt_lookup(ddt, bp, B_TRUE); ddp = &dde->dde_phys[p]; if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) { /* * If we're using a weak checksum, upgrade to a strong checksum * and try again. If we're already using a strong checksum, * we can't resolve it, so just convert to an ordinary write. * (And automatically e-mail a paper to Nature?) */ if (!(zio_checksum_table[zp->zp_checksum].ci_flags & ZCHECKSUM_FLAG_DEDUP)) { zp->zp_checksum = spa_dedup_checksum(spa); zio_pop_transforms(zio); zio->io_stage = ZIO_STAGE_OPEN; BP_ZERO(bp); } else { zp->zp_dedup = B_FALSE; BP_SET_DEDUP(bp, B_FALSE); } ASSERT(!BP_GET_DEDUP(bp)); zio->io_pipeline = ZIO_WRITE_PIPELINE; ddt_exit(ddt); return (zio); } if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) { if (ddp->ddp_phys_birth != 0) ddt_bp_fill(ddp, bp, txg); if (dde->dde_lead_zio[p] != NULL) zio_add_child(zio, dde->dde_lead_zio[p]); else ddt_phys_addref(ddp); } else if (zio->io_bp_override) { ASSERT(bp->blk_birth == txg); ASSERT(BP_EQUAL(bp, zio->io_bp_override)); ddt_phys_fill(ddp, bp); ddt_phys_addref(ddp); } else { cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd, zio->io_orig_size, zio->io_orig_size, zp, zio_ddt_child_write_ready, NULL, zio_ddt_child_write_done, dde, zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); zio_push_transform(cio, zio->io_abd, zio->io_size, 0, NULL); dde->dde_lead_zio[p] = cio; } ddt_exit(ddt); zio_nowait(cio); return (zio); } static ddt_entry_t *freedde; /* for debugging */ static zio_t * zio_ddt_free(zio_t *zio) { spa_t *spa = zio->io_spa; blkptr_t *bp = zio->io_bp; ddt_t *ddt = ddt_select(spa, bp); ddt_entry_t *dde; ddt_phys_t *ddp; ASSERT(BP_GET_DEDUP(bp)); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ddt_enter(ddt); freedde = dde = ddt_lookup(ddt, bp, B_TRUE); if (dde) { ddp = ddt_phys_select(dde, bp); if (ddp) ddt_phys_decref(ddp); } ddt_exit(ddt); return (zio); } /* * ========================================================================== * Allocate and free blocks * ========================================================================== */ static zio_t * zio_io_to_allocate(spa_t *spa, int allocator) { zio_t *zio; ASSERT(MUTEX_HELD(&spa->spa_allocs[allocator].spaa_lock)); zio = avl_first(&spa->spa_allocs[allocator].spaa_tree); if (zio == NULL) return (NULL); ASSERT(IO_IS_ALLOCATING(zio)); /* * Try to place a reservation for this zio. If we're unable to * reserve then we throttle. */ ASSERT3U(zio->io_allocator, ==, allocator); if (!metaslab_class_throttle_reserve(zio->io_metaslab_class, zio->io_prop.zp_copies, allocator, zio, 0)) { return (NULL); } avl_remove(&spa->spa_allocs[allocator].spaa_tree, zio); ASSERT3U(zio->io_stage, <, ZIO_STAGE_DVA_ALLOCATE); return (zio); } static zio_t * zio_dva_throttle(zio_t *zio) { spa_t *spa = zio->io_spa; zio_t *nio; metaslab_class_t *mc; /* locate an appropriate allocation class */ mc = spa_preferred_class(spa, zio->io_size, zio->io_prop.zp_type, zio->io_prop.zp_level, zio->io_prop.zp_zpl_smallblk); if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE || !mc->mc_alloc_throttle_enabled || zio->io_child_type == ZIO_CHILD_GANG || zio->io_flags & ZIO_FLAG_NODATA) { return (zio); } ASSERT(zio->io_type == ZIO_TYPE_WRITE); ASSERT(zio->io_child_type > ZIO_CHILD_GANG); ASSERT3U(zio->io_queued_timestamp, >, 0); ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE); zbookmark_phys_t *bm = &zio->io_bookmark; /* * We want to try to use as many allocators as possible to help improve * performance, but we also want logically adjacent IOs to be physically * adjacent to improve sequential read performance. We chunk each object * into 2^20 block regions, and then hash based on the objset, object, * level, and region to accomplish both of these goals. */ int allocator = (uint_t)cityhash4(bm->zb_objset, bm->zb_object, bm->zb_level, bm->zb_blkid >> 20) % spa->spa_alloc_count; zio->io_allocator = allocator; zio->io_metaslab_class = mc; mutex_enter(&spa->spa_allocs[allocator].spaa_lock); avl_add(&spa->spa_allocs[allocator].spaa_tree, zio); nio = zio_io_to_allocate(spa, allocator); mutex_exit(&spa->spa_allocs[allocator].spaa_lock); return (nio); } static void zio_allocate_dispatch(spa_t *spa, int allocator) { zio_t *zio; mutex_enter(&spa->spa_allocs[allocator].spaa_lock); zio = zio_io_to_allocate(spa, allocator); mutex_exit(&spa->spa_allocs[allocator].spaa_lock); if (zio == NULL) return; ASSERT3U(zio->io_stage, ==, ZIO_STAGE_DVA_THROTTLE); ASSERT0(zio->io_error); zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_TRUE); } static zio_t * zio_dva_allocate(zio_t *zio) { spa_t *spa = zio->io_spa; metaslab_class_t *mc; blkptr_t *bp = zio->io_bp; int error; int flags = 0; if (zio->io_gang_leader == NULL) { ASSERT(zio->io_child_type > ZIO_CHILD_GANG); zio->io_gang_leader = zio; } ASSERT(BP_IS_HOLE(bp)); ASSERT0(BP_GET_NDVAS(bp)); ASSERT3U(zio->io_prop.zp_copies, >, 0); ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa)); ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); flags |= (zio->io_flags & ZIO_FLAG_FASTWRITE) ? METASLAB_FASTWRITE : 0; if (zio->io_flags & ZIO_FLAG_NODATA) flags |= METASLAB_DONT_THROTTLE; if (zio->io_flags & ZIO_FLAG_GANG_CHILD) flags |= METASLAB_GANG_CHILD; if (zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE) flags |= METASLAB_ASYNC_ALLOC; /* * if not already chosen, locate an appropriate allocation class */ mc = zio->io_metaslab_class; if (mc == NULL) { mc = spa_preferred_class(spa, zio->io_size, zio->io_prop.zp_type, zio->io_prop.zp_level, zio->io_prop.zp_zpl_smallblk); zio->io_metaslab_class = mc; } /* * Try allocating the block in the usual metaslab class. * If that's full, allocate it in the normal class. * If that's full, allocate as a gang block, * and if all are full, the allocation fails (which shouldn't happen). * * Note that we do not fall back on embedded slog (ZIL) space, to * preserve unfragmented slog space, which is critical for decent * sync write performance. If a log allocation fails, we will fall * back to spa_sync() which is abysmal for performance. */ error = metaslab_alloc(spa, mc, zio->io_size, bp, zio->io_prop.zp_copies, zio->io_txg, NULL, flags, &zio->io_alloc_list, zio, zio->io_allocator); /* * Fallback to normal class when an alloc class is full */ if (error == ENOSPC && mc != spa_normal_class(spa)) { /* * If throttling, transfer reservation over to normal class. * The io_allocator slot can remain the same even though we * are switching classes. */ if (mc->mc_alloc_throttle_enabled && (zio->io_flags & ZIO_FLAG_IO_ALLOCATING)) { metaslab_class_throttle_unreserve(mc, zio->io_prop.zp_copies, zio->io_allocator, zio); zio->io_flags &= ~ZIO_FLAG_IO_ALLOCATING; VERIFY(metaslab_class_throttle_reserve( spa_normal_class(spa), zio->io_prop.zp_copies, zio->io_allocator, zio, flags | METASLAB_MUST_RESERVE)); } zio->io_metaslab_class = mc = spa_normal_class(spa); if (zfs_flags & ZFS_DEBUG_METASLAB_ALLOC) { zfs_dbgmsg("%s: metaslab allocation failure, " "trying normal class: zio %px, size %llu, error %d", spa_name(spa), zio, (u_longlong_t)zio->io_size, error); } error = metaslab_alloc(spa, mc, zio->io_size, bp, zio->io_prop.zp_copies, zio->io_txg, NULL, flags, &zio->io_alloc_list, zio, zio->io_allocator); } if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) { if (zfs_flags & ZFS_DEBUG_METASLAB_ALLOC) { zfs_dbgmsg("%s: metaslab allocation failure, " "trying ganging: zio %px, size %llu, error %d", spa_name(spa), zio, (u_longlong_t)zio->io_size, error); } return (zio_write_gang_block(zio, mc)); } if (error != 0) { if (error != ENOSPC || (zfs_flags & ZFS_DEBUG_METASLAB_ALLOC)) { zfs_dbgmsg("%s: metaslab allocation failure: zio %px, " "size %llu, error %d", spa_name(spa), zio, (u_longlong_t)zio->io_size, error); } zio->io_error = error; } return (zio); } static zio_t * zio_dva_free(zio_t *zio) { metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE); return (zio); } static zio_t * zio_dva_claim(zio_t *zio) { int error; error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg); if (error) zio->io_error = error; return (zio); } /* * Undo an allocation. This is used by zio_done() when an I/O fails * and we want to give back the block we just allocated. * This handles both normal blocks and gang blocks. */ static void zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp) { ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp)); ASSERT(zio->io_bp_override == NULL); if (!BP_IS_HOLE(bp)) metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE); if (gn != NULL) { for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { zio_dva_unallocate(zio, gn->gn_child[g], &gn->gn_gbh->zg_blkptr[g]); } } } /* * Try to allocate an intent log block. Return 0 on success, errno on failure. */ int zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp, uint64_t size, boolean_t *slog) { int error = 1; zio_alloc_list_t io_alloc_list; ASSERT(txg > spa_syncing_txg(spa)); metaslab_trace_init(&io_alloc_list); /* * Block pointer fields are useful to metaslabs for stats and debugging. * Fill in the obvious ones before calling into metaslab_alloc(). */ BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); BP_SET_PSIZE(new_bp, size); BP_SET_LEVEL(new_bp, 0); /* * When allocating a zil block, we don't have information about * the final destination of the block except the objset it's part * of, so we just hash the objset ID to pick the allocator to get * some parallelism. */ int flags = METASLAB_FASTWRITE | METASLAB_ZIL; int allocator = (uint_t)cityhash4(0, 0, 0, os->os_dsl_dataset->ds_object) % spa->spa_alloc_count; error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1, txg, NULL, flags, &io_alloc_list, NULL, allocator); *slog = (error == 0); if (error != 0) { error = metaslab_alloc(spa, spa_embedded_log_class(spa), size, new_bp, 1, txg, NULL, flags, &io_alloc_list, NULL, allocator); } if (error != 0) { error = metaslab_alloc(spa, spa_normal_class(spa), size, new_bp, 1, txg, NULL, flags, &io_alloc_list, NULL, allocator); } metaslab_trace_fini(&io_alloc_list); if (error == 0) { BP_SET_LSIZE(new_bp, size); BP_SET_PSIZE(new_bp, size); BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF); BP_SET_CHECKSUM(new_bp, spa_version(spa) >= SPA_VERSION_SLIM_ZIL ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG); BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); BP_SET_LEVEL(new_bp, 0); BP_SET_DEDUP(new_bp, 0); BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER); /* * encrypted blocks will require an IV and salt. We generate * these now since we will not be rewriting the bp at * rewrite time. */ if (os->os_encrypted) { uint8_t iv[ZIO_DATA_IV_LEN]; uint8_t salt[ZIO_DATA_SALT_LEN]; BP_SET_CRYPT(new_bp, B_TRUE); VERIFY0(spa_crypt_get_salt(spa, dmu_objset_id(os), salt)); VERIFY0(zio_crypt_generate_iv(iv)); zio_crypt_encode_params_bp(new_bp, salt, iv); } } else { zfs_dbgmsg("%s: zil block allocation failure: " "size %llu, error %d", spa_name(spa), (u_longlong_t)size, error); } return (error); } /* * ========================================================================== * Read and write to physical devices * ========================================================================== */ /* * Issue an I/O to the underlying vdev. Typically the issue pipeline * stops after this stage and will resume upon I/O completion. * However, there are instances where the vdev layer may need to * continue the pipeline when an I/O was not issued. Since the I/O * that was sent to the vdev layer might be different than the one * currently active in the pipeline (see vdev_queue_io()), we explicitly * force the underlying vdev layers to call either zio_execute() or * zio_interrupt() to ensure that the pipeline continues with the correct I/O. */ static zio_t * zio_vdev_io_start(zio_t *zio) { vdev_t *vd = zio->io_vd; uint64_t align; spa_t *spa = zio->io_spa; zio->io_delay = 0; ASSERT(zio->io_error == 0); ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0); if (vd == NULL) { if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) spa_config_enter(spa, SCL_ZIO, zio, RW_READER); /* * The mirror_ops handle multiple DVAs in a single BP. */ vdev_mirror_ops.vdev_op_io_start(zio); return (NULL); } ASSERT3P(zio->io_logical, !=, zio); if (zio->io_type == ZIO_TYPE_WRITE) { ASSERT(spa->spa_trust_config); /* * Note: the code can handle other kinds of writes, * but we don't expect them. */ if (zio->io_vd->vdev_noalloc) { ASSERT(zio->io_flags & (ZIO_FLAG_PHYSICAL | ZIO_FLAG_SELF_HEAL | ZIO_FLAG_RESILVER | ZIO_FLAG_INDUCE_DAMAGE)); } } align = 1ULL << vd->vdev_top->vdev_ashift; if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && P2PHASE(zio->io_size, align) != 0) { /* Transform logical writes to be a full physical block size. */ uint64_t asize = P2ROUNDUP(zio->io_size, align); abd_t *abuf = abd_alloc_sametype(zio->io_abd, asize); ASSERT(vd == vd->vdev_top); if (zio->io_type == ZIO_TYPE_WRITE) { abd_copy(abuf, zio->io_abd, zio->io_size); abd_zero_off(abuf, zio->io_size, asize - zio->io_size); } zio_push_transform(zio, abuf, asize, asize, zio_subblock); } /* * If this is not a physical io, make sure that it is properly aligned * before proceeding. */ if (!(zio->io_flags & ZIO_FLAG_PHYSICAL)) { ASSERT0(P2PHASE(zio->io_offset, align)); ASSERT0(P2PHASE(zio->io_size, align)); } else { /* * For physical writes, we allow 512b aligned writes and assume * the device will perform a read-modify-write as necessary. */ ASSERT0(P2PHASE(zio->io_offset, SPA_MINBLOCKSIZE)); ASSERT0(P2PHASE(zio->io_size, SPA_MINBLOCKSIZE)); } VERIFY(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa)); /* * If this is a repair I/O, and there's no self-healing involved -- * that is, we're just resilvering what we expect to resilver -- * then don't do the I/O unless zio's txg is actually in vd's DTL. * This prevents spurious resilvering. * * There are a few ways that we can end up creating these spurious * resilver i/os: * * 1. A resilver i/o will be issued if any DVA in the BP has a * dirty DTL. The mirror code will issue resilver writes to * each DVA, including the one(s) that are not on vdevs with dirty * DTLs. * * 2. With nested replication, which happens when we have a * "replacing" or "spare" vdev that's a child of a mirror or raidz. * For example, given mirror(replacing(A+B), C), it's likely that * only A is out of date (it's the new device). In this case, we'll * read from C, then use the data to resilver A+B -- but we don't * actually want to resilver B, just A. The top-level mirror has no * way to know this, so instead we just discard unnecessary repairs * as we work our way down the vdev tree. * * 3. ZTEST also creates mirrors of mirrors, mirrors of raidz, etc. * The same logic applies to any form of nested replication: ditto * + mirror, RAID-Z + replacing, etc. * * However, indirect vdevs point off to other vdevs which may have * DTL's, so we never bypass them. The child i/os on concrete vdevs * will be properly bypassed instead. * * Leaf DTL_PARTIAL can be empty when a legitimate write comes from * a dRAID spare vdev. For example, when a dRAID spare is first * used, its spare blocks need to be written to but the leaf vdev's * of such blocks can have empty DTL_PARTIAL. * * There seemed no clean way to allow such writes while bypassing * spurious ones. At this point, just avoid all bypassing for dRAID * for correctness. */ if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) && !(zio->io_flags & ZIO_FLAG_SELF_HEAL) && zio->io_txg != 0 && /* not a delegated i/o */ vd->vdev_ops != &vdev_indirect_ops && vd->vdev_top->vdev_ops != &vdev_draid_ops && !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) { ASSERT(zio->io_type == ZIO_TYPE_WRITE); zio_vdev_io_bypass(zio); return (zio); } /* * Select the next best leaf I/O to process. Distributed spares are * excluded since they dispatch the I/O directly to a leaf vdev after * applying the dRAID mapping. */ if (vd->vdev_ops->vdev_op_leaf && vd->vdev_ops != &vdev_draid_spare_ops && (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_TRIM)) { if ((zio = vdev_queue_io(zio)) == NULL) return (NULL); if (!vdev_accessible(vd, zio)) { zio->io_error = SET_ERROR(ENXIO); zio_interrupt(zio); return (NULL); } zio->io_delay = gethrtime(); } vd->vdev_ops->vdev_op_io_start(zio); return (NULL); } static zio_t * zio_vdev_io_done(zio_t *zio) { vdev_t *vd = zio->io_vd; vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops; boolean_t unexpected_error = B_FALSE; if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) { return (NULL); } ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_TRIM); if (zio->io_delay) zio->io_delay = gethrtime() - zio->io_delay; if (vd != NULL && vd->vdev_ops->vdev_op_leaf && vd->vdev_ops != &vdev_draid_spare_ops) { vdev_queue_io_done(zio); if (zio_injection_enabled && zio->io_error == 0) zio->io_error = zio_handle_device_injections(vd, zio, EIO, EILSEQ); if (zio_injection_enabled && zio->io_error == 0) zio->io_error = zio_handle_label_injection(zio, EIO); if (zio->io_error && zio->io_type != ZIO_TYPE_TRIM) { if (!vdev_accessible(vd, zio)) { zio->io_error = SET_ERROR(ENXIO); } else { unexpected_error = B_TRUE; } } } ops->vdev_op_io_done(zio); if (unexpected_error && vd->vdev_remove_wanted == B_FALSE) VERIFY(vdev_probe(vd, zio) == NULL); return (zio); } /* * This function is used to change the priority of an existing zio that is * currently in-flight. This is used by the arc to upgrade priority in the * event that a demand read is made for a block that is currently queued * as a scrub or async read IO. Otherwise, the high priority read request * would end up having to wait for the lower priority IO. */ void zio_change_priority(zio_t *pio, zio_priority_t priority) { zio_t *cio, *cio_next; zio_link_t *zl = NULL; ASSERT3U(priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); if (pio->io_vd != NULL && pio->io_vd->vdev_ops->vdev_op_leaf) { vdev_queue_change_io_priority(pio, priority); } else { pio->io_priority = priority; } mutex_enter(&pio->io_lock); for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) { cio_next = zio_walk_children(pio, &zl); zio_change_priority(cio, priority); } mutex_exit(&pio->io_lock); } /* * For non-raidz ZIOs, we can just copy aside the bad data read from the * disk, and use that to finish the checksum ereport later. */ static void zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr, const abd_t *good_buf) { /* no processing needed */ zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE); } void zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr) { void *abd = abd_alloc_sametype(zio->io_abd, zio->io_size); abd_copy(abd, zio->io_abd, zio->io_size); zcr->zcr_cbinfo = zio->io_size; zcr->zcr_cbdata = abd; zcr->zcr_finish = zio_vsd_default_cksum_finish; zcr->zcr_free = zio_abd_free; } static zio_t * zio_vdev_io_assess(zio_t *zio) { vdev_t *vd = zio->io_vd; if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) { return (NULL); } if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) spa_config_exit(zio->io_spa, SCL_ZIO, zio); if (zio->io_vsd != NULL) { zio->io_vsd_ops->vsd_free(zio); zio->io_vsd = NULL; } if (zio_injection_enabled && zio->io_error == 0) zio->io_error = zio_handle_fault_injection(zio, EIO); /* * If the I/O failed, determine whether we should attempt to retry it. * * On retry, we cut in line in the issue queue, since we don't want * compression/checksumming/etc. work to prevent our (cheap) IO reissue. */ if (zio->io_error && vd == NULL && !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) { ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */ ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf */ zio->io_error = 0; zio->io_flags |= ZIO_FLAG_IO_RETRY | ZIO_FLAG_DONT_AGGREGATE; zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1; zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, zio_requeue_io_start_cut_in_line); return (NULL); } /* * If we got an error on a leaf device, convert it to ENXIO * if the device is not accessible at all. */ if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf && !vdev_accessible(vd, zio)) zio->io_error = SET_ERROR(ENXIO); /* * If we can't write to an interior vdev (mirror or RAID-Z), * set vdev_cant_write so that we stop trying to allocate from it. */ if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE && vd != NULL && !vd->vdev_ops->vdev_op_leaf) { vdev_dbgmsg(vd, "zio_vdev_io_assess(zio=%px) setting " "cant_write=TRUE due to write failure with ENXIO", zio); vd->vdev_cant_write = B_TRUE; } /* * If a cache flush returns ENOTSUP or ENOTTY, we know that no future * attempts will ever succeed. In this case we set a persistent * boolean flag so that we don't bother with it in the future. */ if ((zio->io_error == ENOTSUP || zio->io_error == ENOTTY) && zio->io_type == ZIO_TYPE_IOCTL && zio->io_cmd == DKIOCFLUSHWRITECACHE && vd != NULL) vd->vdev_nowritecache = B_TRUE; if (zio->io_error) zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; return (zio); } void zio_vdev_io_reissue(zio_t *zio) { ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); ASSERT(zio->io_error == 0); zio->io_stage >>= 1; } void zio_vdev_io_redone(zio_t *zio) { ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE); zio->io_stage >>= 1; } void zio_vdev_io_bypass(zio_t *zio) { ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); ASSERT(zio->io_error == 0); zio->io_flags |= ZIO_FLAG_IO_BYPASS; zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1; } /* * ========================================================================== * Encrypt and store encryption parameters * ========================================================================== */ /* * This function is used for ZIO_STAGE_ENCRYPT. It is responsible for * managing the storage of encryption parameters and passing them to the * lower-level encryption functions. */ static zio_t * zio_encrypt(zio_t *zio) { zio_prop_t *zp = &zio->io_prop; spa_t *spa = zio->io_spa; blkptr_t *bp = zio->io_bp; uint64_t psize = BP_GET_PSIZE(bp); uint64_t dsobj = zio->io_bookmark.zb_objset; dmu_object_type_t ot = BP_GET_TYPE(bp); void *enc_buf = NULL; abd_t *eabd = NULL; uint8_t salt[ZIO_DATA_SALT_LEN]; uint8_t iv[ZIO_DATA_IV_LEN]; uint8_t mac[ZIO_DATA_MAC_LEN]; boolean_t no_crypt = B_FALSE; /* the root zio already encrypted the data */ if (zio->io_child_type == ZIO_CHILD_GANG) return (zio); /* only ZIL blocks are re-encrypted on rewrite */ if (!IO_IS_ALLOCATING(zio) && ot != DMU_OT_INTENT_LOG) return (zio); if (!(zp->zp_encrypt || BP_IS_ENCRYPTED(bp))) { BP_SET_CRYPT(bp, B_FALSE); return (zio); } /* if we are doing raw encryption set the provided encryption params */ if (zio->io_flags & ZIO_FLAG_RAW_ENCRYPT) { ASSERT0(BP_GET_LEVEL(bp)); BP_SET_CRYPT(bp, B_TRUE); BP_SET_BYTEORDER(bp, zp->zp_byteorder); if (ot != DMU_OT_OBJSET) zio_crypt_encode_mac_bp(bp, zp->zp_mac); /* dnode blocks must be written out in the provided byteorder */ if (zp->zp_byteorder != ZFS_HOST_BYTEORDER && ot == DMU_OT_DNODE) { void *bswap_buf = zio_buf_alloc(psize); abd_t *babd = abd_get_from_buf(bswap_buf, psize); ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF); abd_copy_to_buf(bswap_buf, zio->io_abd, psize); dmu_ot_byteswap[DMU_OT_BYTESWAP(ot)].ob_func(bswap_buf, psize); abd_take_ownership_of_buf(babd, B_TRUE); zio_push_transform(zio, babd, psize, psize, NULL); } if (DMU_OT_IS_ENCRYPTED(ot)) zio_crypt_encode_params_bp(bp, zp->zp_salt, zp->zp_iv); return (zio); } /* indirect blocks only maintain a cksum of the lower level MACs */ if (BP_GET_LEVEL(bp) > 0) { BP_SET_CRYPT(bp, B_TRUE); VERIFY0(zio_crypt_do_indirect_mac_checksum_abd(B_TRUE, zio->io_orig_abd, BP_GET_LSIZE(bp), BP_SHOULD_BYTESWAP(bp), mac)); zio_crypt_encode_mac_bp(bp, mac); return (zio); } /* * Objset blocks are a special case since they have 2 256-bit MACs * embedded within them. */ if (ot == DMU_OT_OBJSET) { ASSERT0(DMU_OT_IS_ENCRYPTED(ot)); ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF); BP_SET_CRYPT(bp, B_TRUE); VERIFY0(spa_do_crypt_objset_mac_abd(B_TRUE, spa, dsobj, zio->io_abd, psize, BP_SHOULD_BYTESWAP(bp))); return (zio); } /* unencrypted object types are only authenticated with a MAC */ if (!DMU_OT_IS_ENCRYPTED(ot)) { BP_SET_CRYPT(bp, B_TRUE); VERIFY0(spa_do_crypt_mac_abd(B_TRUE, spa, dsobj, zio->io_abd, psize, mac)); zio_crypt_encode_mac_bp(bp, mac); return (zio); } /* * Later passes of sync-to-convergence may decide to rewrite data * in place to avoid more disk reallocations. This presents a problem * for encryption because this constitutes rewriting the new data with * the same encryption key and IV. However, this only applies to blocks * in the MOS (particularly the spacemaps) and we do not encrypt the * MOS. We assert that the zio is allocating or an intent log write * to enforce this. */ ASSERT(IO_IS_ALLOCATING(zio) || ot == DMU_OT_INTENT_LOG); ASSERT(BP_GET_LEVEL(bp) == 0 || ot == DMU_OT_INTENT_LOG); ASSERT(spa_feature_is_active(spa, SPA_FEATURE_ENCRYPTION)); ASSERT3U(psize, !=, 0); enc_buf = zio_buf_alloc(psize); eabd = abd_get_from_buf(enc_buf, psize); abd_take_ownership_of_buf(eabd, B_TRUE); /* * For an explanation of what encryption parameters are stored * where, see the block comment in zio_crypt.c. */ if (ot == DMU_OT_INTENT_LOG) { zio_crypt_decode_params_bp(bp, salt, iv); } else { BP_SET_CRYPT(bp, B_TRUE); } /* Perform the encryption. This should not fail */ VERIFY0(spa_do_crypt_abd(B_TRUE, spa, &zio->io_bookmark, BP_GET_TYPE(bp), BP_GET_DEDUP(bp), BP_SHOULD_BYTESWAP(bp), salt, iv, mac, psize, zio->io_abd, eabd, &no_crypt)); /* encode encryption metadata into the bp */ if (ot == DMU_OT_INTENT_LOG) { /* * ZIL blocks store the MAC in the embedded checksum, so the * transform must always be applied. */ zio_crypt_encode_mac_zil(enc_buf, mac); zio_push_transform(zio, eabd, psize, psize, NULL); } else { BP_SET_CRYPT(bp, B_TRUE); zio_crypt_encode_params_bp(bp, salt, iv); zio_crypt_encode_mac_bp(bp, mac); if (no_crypt) { ASSERT3U(ot, ==, DMU_OT_DNODE); abd_free(eabd); } else { zio_push_transform(zio, eabd, psize, psize, NULL); } } return (zio); } /* * ========================================================================== * Generate and verify checksums * ========================================================================== */ static zio_t * zio_checksum_generate(zio_t *zio) { blkptr_t *bp = zio->io_bp; enum zio_checksum checksum; if (bp == NULL) { /* * This is zio_write_phys(). * We're either generating a label checksum, or none at all. */ checksum = zio->io_prop.zp_checksum; if (checksum == ZIO_CHECKSUM_OFF) return (zio); ASSERT(checksum == ZIO_CHECKSUM_LABEL); } else { if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) { ASSERT(!IO_IS_ALLOCATING(zio)); checksum = ZIO_CHECKSUM_GANG_HEADER; } else { checksum = BP_GET_CHECKSUM(bp); } } zio_checksum_compute(zio, checksum, zio->io_abd, zio->io_size); return (zio); } static zio_t * zio_checksum_verify(zio_t *zio) { zio_bad_cksum_t info; blkptr_t *bp = zio->io_bp; int error; ASSERT(zio->io_vd != NULL); if (bp == NULL) { /* * This is zio_read_phys(). * We're either verifying a label checksum, or nothing at all. */ if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF) return (zio); ASSERT3U(zio->io_prop.zp_checksum, ==, ZIO_CHECKSUM_LABEL); } if ((error = zio_checksum_error(zio, &info)) != 0) { zio->io_error = error; if (error == ECKSUM && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { mutex_enter(&zio->io_vd->vdev_stat_lock); zio->io_vd->vdev_stat.vs_checksum_errors++; mutex_exit(&zio->io_vd->vdev_stat_lock); (void) zfs_ereport_start_checksum(zio->io_spa, zio->io_vd, &zio->io_bookmark, zio, zio->io_offset, zio->io_size, &info); } } return (zio); } /* * Called by RAID-Z to ensure we don't compute the checksum twice. */ void zio_checksum_verified(zio_t *zio) { zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; } /* * ========================================================================== * Error rank. Error are ranked in the order 0, ENXIO, ECKSUM, EIO, other. * An error of 0 indicates success. ENXIO indicates whole-device failure, * which may be transient (e.g. unplugged) or permanent. ECKSUM and EIO * indicate errors that are specific to one I/O, and most likely permanent. * Any other error is presumed to be worse because we weren't expecting it. * ========================================================================== */ int zio_worst_error(int e1, int e2) { static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO }; int r1, r2; for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++) if (e1 == zio_error_rank[r1]) break; for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++) if (e2 == zio_error_rank[r2]) break; return (r1 > r2 ? e1 : e2); } /* * ========================================================================== * I/O completion * ========================================================================== */ static zio_t * zio_ready(zio_t *zio) { blkptr_t *bp = zio->io_bp; zio_t *pio, *pio_next; zio_link_t *zl = NULL; if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT | ZIO_CHILD_DDT_BIT, ZIO_WAIT_READY)) { return (NULL); } if (zio->io_ready) { ASSERT(IO_IS_ALLOCATING(zio)); ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) || (zio->io_flags & ZIO_FLAG_NOPWRITE)); ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0); zio->io_ready(zio); } #ifdef ZFS_DEBUG if (bp != NULL && bp != &zio->io_bp_copy) zio->io_bp_copy = *bp; #endif if (zio->io_error != 0) { zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) { ASSERT(IO_IS_ALLOCATING(zio)); ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(zio->io_metaslab_class != NULL); /* * We were unable to allocate anything, unreserve and * issue the next I/O to allocate. */ metaslab_class_throttle_unreserve( zio->io_metaslab_class, zio->io_prop.zp_copies, zio->io_allocator, zio); zio_allocate_dispatch(zio->io_spa, zio->io_allocator); } } mutex_enter(&zio->io_lock); zio->io_state[ZIO_WAIT_READY] = 1; pio = zio_walk_parents(zio, &zl); mutex_exit(&zio->io_lock); /* * As we notify zio's parents, new parents could be added. * New parents go to the head of zio's io_parent_list, however, * so we will (correctly) not notify them. The remainder of zio's * io_parent_list, from 'pio_next' onward, cannot change because * all parents must wait for us to be done before they can be done. */ for (; pio != NULL; pio = pio_next) { pio_next = zio_walk_parents(zio, &zl); zio_notify_parent(pio, zio, ZIO_WAIT_READY, NULL); } if (zio->io_flags & ZIO_FLAG_NODATA) { if (bp != NULL && BP_IS_GANG(bp)) { zio->io_flags &= ~ZIO_FLAG_NODATA; } else { ASSERT((uintptr_t)zio->io_abd < SPA_MAXBLOCKSIZE); zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; } } if (zio_injection_enabled && zio->io_spa->spa_syncing_txg == zio->io_txg) zio_handle_ignored_writes(zio); return (zio); } /* * Update the allocation throttle accounting. */ static void zio_dva_throttle_done(zio_t *zio) { zio_t *lio __maybe_unused = zio->io_logical; zio_t *pio = zio_unique_parent(zio); vdev_t *vd = zio->io_vd; int flags = METASLAB_ASYNC_ALLOC; ASSERT3P(zio->io_bp, !=, NULL); ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); ASSERT3U(zio->io_priority, ==, ZIO_PRIORITY_ASYNC_WRITE); ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV); ASSERT(vd != NULL); ASSERT3P(vd, ==, vd->vdev_top); ASSERT(zio_injection_enabled || !(zio->io_flags & ZIO_FLAG_IO_RETRY)); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR)); ASSERT(zio->io_flags & ZIO_FLAG_IO_ALLOCATING); ASSERT(!(lio->io_flags & ZIO_FLAG_IO_REWRITE)); ASSERT(!(lio->io_orig_flags & ZIO_FLAG_NODATA)); /* * Parents of gang children can have two flavors -- ones that * allocated the gang header (will have ZIO_FLAG_IO_REWRITE set) * and ones that allocated the constituent blocks. The allocation * throttle needs to know the allocating parent zio so we must find * it here. */ if (pio->io_child_type == ZIO_CHILD_GANG) { /* * If our parent is a rewrite gang child then our grandparent * would have been the one that performed the allocation. */ if (pio->io_flags & ZIO_FLAG_IO_REWRITE) pio = zio_unique_parent(pio); flags |= METASLAB_GANG_CHILD; } ASSERT(IO_IS_ALLOCATING(pio)); ASSERT3P(zio, !=, zio->io_logical); ASSERT(zio->io_logical != NULL); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR)); ASSERT0(zio->io_flags & ZIO_FLAG_NOPWRITE); ASSERT(zio->io_metaslab_class != NULL); mutex_enter(&pio->io_lock); metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, pio, flags, pio->io_allocator, B_TRUE); mutex_exit(&pio->io_lock); metaslab_class_throttle_unreserve(zio->io_metaslab_class, 1, pio->io_allocator, pio); /* * Call into the pipeline to see if there is more work that * needs to be done. If there is work to be done it will be * dispatched to another taskq thread. */ zio_allocate_dispatch(zio->io_spa, pio->io_allocator); } static zio_t * zio_done(zio_t *zio) { /* * Always attempt to keep stack usage minimal here since * we can be called recursively up to 19 levels deep. */ const uint64_t psize = zio->io_size; zio_t *pio, *pio_next; zio_link_t *zl = NULL; /* * If our children haven't all completed, * wait for them and then repeat this pipeline stage. */ if (zio_wait_for_children(zio, ZIO_CHILD_ALL_BITS, ZIO_WAIT_DONE)) { return (NULL); } /* * If the allocation throttle is enabled, then update the accounting. * We only track child I/Os that are part of an allocating async * write. We must do this since the allocation is performed * by the logical I/O but the actual write is done by child I/Os. */ if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING && zio->io_child_type == ZIO_CHILD_VDEV) { ASSERT(zio->io_metaslab_class != NULL); ASSERT(zio->io_metaslab_class->mc_alloc_throttle_enabled); zio_dva_throttle_done(zio); } /* * If the allocation throttle is enabled, verify that * we have decremented the refcounts for every I/O that was throttled. */ if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) { ASSERT(zio->io_type == ZIO_TYPE_WRITE); ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(zio->io_bp != NULL); metaslab_group_alloc_verify(zio->io_spa, zio->io_bp, zio, zio->io_allocator); VERIFY(zfs_refcount_not_held(&zio->io_metaslab_class-> mc_allocator[zio->io_allocator].mca_alloc_slots, zio)); } for (int c = 0; c < ZIO_CHILD_TYPES; c++) for (int w = 0; w < ZIO_WAIT_TYPES; w++) ASSERT(zio->io_children[c][w] == 0); if (zio->io_bp != NULL && !BP_IS_EMBEDDED(zio->io_bp)) { ASSERT(zio->io_bp->blk_pad[0] == 0); ASSERT(zio->io_bp->blk_pad[1] == 0); ASSERT(memcmp(zio->io_bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 || (zio->io_bp == zio_unique_parent(zio)->io_bp)); if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(zio->io_bp) && zio->io_bp_override == NULL && !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp)); ASSERT(BP_COUNT_GANG(zio->io_bp) == 0 || (BP_COUNT_GANG(zio->io_bp) == BP_GET_NDVAS(zio->io_bp))); } if (zio->io_flags & ZIO_FLAG_NOPWRITE) VERIFY(BP_EQUAL(zio->io_bp, &zio->io_bp_orig)); } /* * If there were child vdev/gang/ddt errors, they apply to us now. */ zio_inherit_child_errors(zio, ZIO_CHILD_VDEV); zio_inherit_child_errors(zio, ZIO_CHILD_GANG); zio_inherit_child_errors(zio, ZIO_CHILD_DDT); /* * If the I/O on the transformed data was successful, generate any * checksum reports now while we still have the transformed data. */ if (zio->io_error == 0) { while (zio->io_cksum_report != NULL) { zio_cksum_report_t *zcr = zio->io_cksum_report; uint64_t align = zcr->zcr_align; uint64_t asize = P2ROUNDUP(psize, align); abd_t *adata = zio->io_abd; if (adata != NULL && asize != psize) { adata = abd_alloc(asize, B_TRUE); abd_copy(adata, zio->io_abd, psize); abd_zero_off(adata, psize, asize - psize); } zio->io_cksum_report = zcr->zcr_next; zcr->zcr_next = NULL; zcr->zcr_finish(zcr, adata); zfs_ereport_free_checksum(zcr); if (adata != NULL && asize != psize) abd_free(adata); } } zio_pop_transforms(zio); /* note: may set zio->io_error */ vdev_stat_update(zio, psize); /* * If this I/O is attached to a particular vdev is slow, exceeding * 30 seconds to complete, post an error described the I/O delay. * We ignore these errors if the device is currently unavailable. */ if (zio->io_delay >= MSEC2NSEC(zio_slow_io_ms)) { if (zio->io_vd != NULL && !vdev_is_dead(zio->io_vd)) { /* * We want to only increment our slow IO counters if * the IO is valid (i.e. not if the drive is removed). * * zfs_ereport_post() will also do these checks, but * it can also ratelimit and have other failures, so we * need to increment the slow_io counters independent * of it. */ if (zfs_ereport_is_valid(FM_EREPORT_ZFS_DELAY, zio->io_spa, zio->io_vd, zio)) { mutex_enter(&zio->io_vd->vdev_stat_lock); zio->io_vd->vdev_stat.vs_slow_ios++; mutex_exit(&zio->io_vd->vdev_stat_lock); (void) zfs_ereport_post(FM_EREPORT_ZFS_DELAY, zio->io_spa, zio->io_vd, &zio->io_bookmark, zio, 0); } } } if (zio->io_error) { /* * If this I/O is attached to a particular vdev, * generate an error message describing the I/O failure * at the block level. We ignore these errors if the * device is currently unavailable. */ if (zio->io_error != ECKSUM && zio->io_vd != NULL && !vdev_is_dead(zio->io_vd)) { int ret = zfs_ereport_post(FM_EREPORT_ZFS_IO, zio->io_spa, zio->io_vd, &zio->io_bookmark, zio, 0); if (ret != EALREADY) { mutex_enter(&zio->io_vd->vdev_stat_lock); if (zio->io_type == ZIO_TYPE_READ) zio->io_vd->vdev_stat.vs_read_errors++; else if (zio->io_type == ZIO_TYPE_WRITE) zio->io_vd->vdev_stat.vs_write_errors++; mutex_exit(&zio->io_vd->vdev_stat_lock); } } if ((zio->io_error == EIO || !(zio->io_flags & (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) && zio == zio->io_logical) { /* * For logical I/O requests, tell the SPA to log the * error and generate a logical data ereport. */ spa_log_error(zio->io_spa, &zio->io_bookmark, &zio->io_bp->blk_birth); (void) zfs_ereport_post(FM_EREPORT_ZFS_DATA, zio->io_spa, NULL, &zio->io_bookmark, zio, 0); } } if (zio->io_error && zio == zio->io_logical) { /* * Determine whether zio should be reexecuted. This will * propagate all the way to the root via zio_notify_parent(). */ ASSERT(zio->io_vd == NULL && zio->io_bp != NULL); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); if (IO_IS_ALLOCATING(zio) && !(zio->io_flags & ZIO_FLAG_CANFAIL)) { if (zio->io_error != ENOSPC) zio->io_reexecute |= ZIO_REEXECUTE_NOW; else zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; } if ((zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_FREE) && !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_error == ENXIO && spa_load_state(zio->io_spa) == SPA_LOAD_NONE && spa_get_failmode(zio->io_spa) != ZIO_FAILURE_MODE_CONTINUE) zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute) zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; /* * Here is a possibly good place to attempt to do * either combinatorial reconstruction or error correction * based on checksums. It also might be a good place * to send out preliminary ereports before we suspend * processing. */ } /* * If there were logical child errors, they apply to us now. * We defer this until now to avoid conflating logical child * errors with errors that happened to the zio itself when * updating vdev stats and reporting FMA events above. */ zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL); if ((zio->io_error || zio->io_reexecute) && IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio && !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE))) zio_dva_unallocate(zio, zio->io_gang_tree, zio->io_bp); zio_gang_tree_free(&zio->io_gang_tree); /* * Godfather I/Os should never suspend. */ if ((zio->io_flags & ZIO_FLAG_GODFATHER) && (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) zio->io_reexecute &= ~ZIO_REEXECUTE_SUSPEND; if (zio->io_reexecute) { /* * This is a logical I/O that wants to reexecute. * * Reexecute is top-down. When an i/o fails, if it's not * the root, it simply notifies its parent and sticks around. * The parent, seeing that it still has children in zio_done(), * does the same. This percolates all the way up to the root. * The root i/o will reexecute or suspend the entire tree. * * This approach ensures that zio_reexecute() honors * all the original i/o dependency relationships, e.g. * parents not executing until children are ready. */ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); zio->io_gang_leader = NULL; mutex_enter(&zio->io_lock); zio->io_state[ZIO_WAIT_DONE] = 1; mutex_exit(&zio->io_lock); /* * "The Godfather" I/O monitors its children but is * not a true parent to them. It will track them through * the pipeline but severs its ties whenever they get into * trouble (e.g. suspended). This allows "The Godfather" * I/O to return status without blocking. */ zl = NULL; for (pio = zio_walk_parents(zio, &zl); pio != NULL; pio = pio_next) { zio_link_t *remove_zl = zl; pio_next = zio_walk_parents(zio, &zl); if ((pio->io_flags & ZIO_FLAG_GODFATHER) && (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) { zio_remove_child(pio, zio, remove_zl); /* * This is a rare code path, so we don't * bother with "next_to_execute". */ zio_notify_parent(pio, zio, ZIO_WAIT_DONE, NULL); } } if ((pio = zio_unique_parent(zio)) != NULL) { /* * We're not a root i/o, so there's nothing to do * but notify our parent. Don't propagate errors * upward since we haven't permanently failed yet. */ ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE; /* * This is a rare code path, so we don't bother with * "next_to_execute". */ zio_notify_parent(pio, zio, ZIO_WAIT_DONE, NULL); } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) { /* * We'd fail again if we reexecuted now, so suspend * until conditions improve (e.g. device comes online). */ zio_suspend(zio->io_spa, zio, ZIO_SUSPEND_IOERR); } else { /* * Reexecution is potentially a huge amount of work. * Hand it off to the otherwise-unused claim taskq. */ ASSERT(taskq_empty_ent(&zio->io_tqent)); spa_taskq_dispatch_ent(zio->io_spa, ZIO_TYPE_CLAIM, ZIO_TASKQ_ISSUE, zio_reexecute, zio, 0, &zio->io_tqent); } return (NULL); } ASSERT(list_is_empty(&zio->io_child_list)); ASSERT(zio->io_reexecute == 0); ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL)); /* * Report any checksum errors, since the I/O is complete. */ while (zio->io_cksum_report != NULL) { zio_cksum_report_t *zcr = zio->io_cksum_report; zio->io_cksum_report = zcr->zcr_next; zcr->zcr_next = NULL; zcr->zcr_finish(zcr, NULL); zfs_ereport_free_checksum(zcr); } if (zio->io_flags & ZIO_FLAG_FASTWRITE && zio->io_bp && !BP_IS_HOLE(zio->io_bp) && !BP_IS_EMBEDDED(zio->io_bp) && !(zio->io_flags & ZIO_FLAG_NOPWRITE)) { metaslab_fastwrite_unmark(zio->io_spa, zio->io_bp); } /* * It is the responsibility of the done callback to ensure that this * particular zio is no longer discoverable for adoption, and as * such, cannot acquire any new parents. */ if (zio->io_done) zio->io_done(zio); mutex_enter(&zio->io_lock); zio->io_state[ZIO_WAIT_DONE] = 1; mutex_exit(&zio->io_lock); /* * We are done executing this zio. We may want to execute a parent * next. See the comment in zio_notify_parent(). */ zio_t *next_to_execute = NULL; zl = NULL; for (pio = zio_walk_parents(zio, &zl); pio != NULL; pio = pio_next) { zio_link_t *remove_zl = zl; pio_next = zio_walk_parents(zio, &zl); zio_remove_child(pio, zio, remove_zl); zio_notify_parent(pio, zio, ZIO_WAIT_DONE, &next_to_execute); } if (zio->io_waiter != NULL) { mutex_enter(&zio->io_lock); zio->io_executor = NULL; cv_broadcast(&zio->io_cv); mutex_exit(&zio->io_lock); } else { zio_destroy(zio); } return (next_to_execute); } /* * ========================================================================== * I/O pipeline definition * ========================================================================== */ static zio_pipe_stage_t *zio_pipeline[] = { NULL, zio_read_bp_init, zio_write_bp_init, zio_free_bp_init, zio_issue_async, zio_write_compress, zio_encrypt, zio_checksum_generate, zio_nop_write, zio_brt_free, zio_ddt_read_start, zio_ddt_read_done, zio_ddt_write, zio_ddt_free, zio_gang_assemble, zio_gang_issue, zio_dva_throttle, zio_dva_allocate, zio_dva_free, zio_dva_claim, zio_ready, zio_vdev_io_start, zio_vdev_io_done, zio_vdev_io_assess, zio_checksum_verify, zio_done }; /* * Compare two zbookmark_phys_t's to see which we would reach first in a * pre-order traversal of the object tree. * * This is simple in every case aside from the meta-dnode object. For all other * objects, we traverse them in order (object 1 before object 2, and so on). * However, all of these objects are traversed while traversing object 0, since * the data it points to is the list of objects. Thus, we need to convert to a * canonical representation so we can compare meta-dnode bookmarks to * non-meta-dnode bookmarks. * * We do this by calculating "equivalents" for each field of the zbookmark. * zbookmarks outside of the meta-dnode use their own object and level, and * calculate the level 0 equivalent (the first L0 blkid that is contained in the * blocks this bookmark refers to) by multiplying their blkid by their span * (the number of L0 blocks contained within one block at their level). * zbookmarks inside the meta-dnode calculate their object equivalent * (which is L0equiv * dnodes per data block), use 0 for their L0equiv, and use * level + 1<<31 (any value larger than a level could ever be) for their level. * This causes them to always compare before a bookmark in their object * equivalent, compare appropriately to bookmarks in other objects, and to * compare appropriately to other bookmarks in the meta-dnode. */ int zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, uint8_t ibs2, const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2) { /* * These variables represent the "equivalent" values for the zbookmark, * after converting zbookmarks inside the meta dnode to their * normal-object equivalents. */ uint64_t zb1obj, zb2obj; uint64_t zb1L0, zb2L0; uint64_t zb1level, zb2level; if (zb1->zb_object == zb2->zb_object && zb1->zb_level == zb2->zb_level && zb1->zb_blkid == zb2->zb_blkid) return (0); IMPLY(zb1->zb_level > 0, ibs1 >= SPA_MINBLOCKSHIFT); IMPLY(zb2->zb_level > 0, ibs2 >= SPA_MINBLOCKSHIFT); /* * BP_SPANB calculates the span in blocks. */ zb1L0 = (zb1->zb_blkid) * BP_SPANB(ibs1, zb1->zb_level); zb2L0 = (zb2->zb_blkid) * BP_SPANB(ibs2, zb2->zb_level); if (zb1->zb_object == DMU_META_DNODE_OBJECT) { zb1obj = zb1L0 * (dbss1 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT)); zb1L0 = 0; zb1level = zb1->zb_level + COMPARE_META_LEVEL; } else { zb1obj = zb1->zb_object; zb1level = zb1->zb_level; } if (zb2->zb_object == DMU_META_DNODE_OBJECT) { zb2obj = zb2L0 * (dbss2 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT)); zb2L0 = 0; zb2level = zb2->zb_level + COMPARE_META_LEVEL; } else { zb2obj = zb2->zb_object; zb2level = zb2->zb_level; } /* Now that we have a canonical representation, do the comparison. */ if (zb1obj != zb2obj) return (zb1obj < zb2obj ? -1 : 1); else if (zb1L0 != zb2L0) return (zb1L0 < zb2L0 ? -1 : 1); else if (zb1level != zb2level) return (zb1level > zb2level ? -1 : 1); /* * This can (theoretically) happen if the bookmarks have the same object * and level, but different blkids, if the block sizes are not the same. * There is presently no way to change the indirect block sizes */ return (0); } /* * This function checks the following: given that last_block is the place that * our traversal stopped last time, does that guarantee that we've visited * every node under subtree_root? Therefore, we can't just use the raw output * of zbookmark_compare. We have to pass in a modified version of * subtree_root; by incrementing the block id, and then checking whether * last_block is before or equal to that, we can tell whether or not having * visited last_block implies that all of subtree_root's children have been * visited. */ boolean_t zbookmark_subtree_completed(const dnode_phys_t *dnp, const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block) { zbookmark_phys_t mod_zb = *subtree_root; mod_zb.zb_blkid++; ASSERT0(last_block->zb_level); /* The objset_phys_t isn't before anything. */ if (dnp == NULL) return (B_FALSE); /* * We pass in 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT) for the * data block size in sectors, because that variable is only used if * the bookmark refers to a block in the meta-dnode. Since we don't * know without examining it what object it refers to, and there's no * harm in passing in this value in other cases, we always pass it in. * * We pass in 0 for the indirect block size shift because zb2 must be * level 0. The indirect block size is only used to calculate the span * of the bookmark, but since the bookmark must be level 0, the span is * always 1, so the math works out. * * If you make changes to how the zbookmark_compare code works, be sure * to make sure that this code still works afterwards. */ return (zbookmark_compare(dnp->dn_datablkszsec, dnp->dn_indblkshift, 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT), 0, &mod_zb, last_block) <= 0); } /* * This function is similar to zbookmark_subtree_completed(), but returns true * if subtree_root is equal or ahead of last_block, i.e. still to be done. */ boolean_t zbookmark_subtree_tbd(const dnode_phys_t *dnp, const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block) { ASSERT0(last_block->zb_level); if (dnp == NULL) return (B_FALSE); return (zbookmark_compare(dnp->dn_datablkszsec, dnp->dn_indblkshift, 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT), 0, subtree_root, last_block) >= 0); } EXPORT_SYMBOL(zio_type_name); EXPORT_SYMBOL(zio_buf_alloc); EXPORT_SYMBOL(zio_data_buf_alloc); EXPORT_SYMBOL(zio_buf_free); EXPORT_SYMBOL(zio_data_buf_free); ZFS_MODULE_PARAM(zfs_zio, zio_, slow_io_ms, INT, ZMOD_RW, "Max I/O completion time (milliseconds) before marking it as slow"); ZFS_MODULE_PARAM(zfs_zio, zio_, requeue_io_start_cut_in_line, INT, ZMOD_RW, "Prioritize requeued I/O"); ZFS_MODULE_PARAM(zfs, zfs_, sync_pass_deferred_free, UINT, ZMOD_RW, "Defer frees starting in this pass"); ZFS_MODULE_PARAM(zfs, zfs_, sync_pass_dont_compress, UINT, ZMOD_RW, "Don't compress starting in this pass"); ZFS_MODULE_PARAM(zfs, zfs_, sync_pass_rewrite, UINT, ZMOD_RW, "Rewrite new bps starting in this pass"); ZFS_MODULE_PARAM(zfs_zio, zio_, dva_throttle_enabled, INT, ZMOD_RW, "Throttle block allocations in the ZIO pipeline"); ZFS_MODULE_PARAM(zfs_zio, zio_, deadman_log_all, INT, ZMOD_RW, "Log all slow ZIOs, not just those with vdevs");