Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c (revision 329689) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c (revision 329690) @@ -1,2034 +1,2037 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2016 Gary Mills * Copyright (c) 2011, 2016 by Delphix. All rights reserved. + * Copyright 2017 Joyent, Inc. * Copyright (c) 2017 Datto Inc. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef _KERNEL #include #endif typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_phys_t *); static scan_cb_t dsl_scan_scrub_cb; static void dsl_scan_cancel_sync(void *, dmu_tx_t *); static void dsl_scan_sync_state(dsl_scan_t *, dmu_tx_t *); static boolean_t dsl_scan_restarting(dsl_scan_t *, dmu_tx_t *); unsigned int zfs_top_maxinflight = 32; /* maximum I/Os per top-level */ unsigned int zfs_resilver_delay = 2; /* number of ticks to delay resilver */ unsigned int zfs_scrub_delay = 4; /* number of ticks to delay scrub */ unsigned int zfs_scan_idle = 50; /* idle window in clock ticks */ unsigned int zfs_scan_min_time_ms = 1000; /* min millisecs to scrub per txg */ unsigned int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */ unsigned int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */ boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */ boolean_t zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetch */ SYSCTL_DECL(_vfs_zfs); SYSCTL_UINT(_vfs_zfs, OID_AUTO, top_maxinflight, CTLFLAG_RWTUN, &zfs_top_maxinflight, 0, "Maximum I/Os per top-level vdev"); SYSCTL_UINT(_vfs_zfs, OID_AUTO, resilver_delay, CTLFLAG_RWTUN, &zfs_resilver_delay, 0, "Number of ticks to delay resilver"); SYSCTL_UINT(_vfs_zfs, OID_AUTO, scrub_delay, CTLFLAG_RWTUN, &zfs_scrub_delay, 0, "Number of ticks to delay scrub"); SYSCTL_UINT(_vfs_zfs, OID_AUTO, scan_idle, CTLFLAG_RWTUN, &zfs_scan_idle, 0, "Idle scan window in clock ticks"); SYSCTL_UINT(_vfs_zfs, OID_AUTO, scan_min_time_ms, CTLFLAG_RWTUN, &zfs_scan_min_time_ms, 0, "Min millisecs to scrub per txg"); SYSCTL_UINT(_vfs_zfs, OID_AUTO, free_min_time_ms, CTLFLAG_RWTUN, &zfs_free_min_time_ms, 0, "Min millisecs to free per txg"); SYSCTL_UINT(_vfs_zfs, OID_AUTO, resilver_min_time_ms, CTLFLAG_RWTUN, &zfs_resilver_min_time_ms, 0, "Min millisecs to resilver per txg"); SYSCTL_INT(_vfs_zfs, OID_AUTO, no_scrub_io, CTLFLAG_RWTUN, &zfs_no_scrub_io, 
0, "Disable scrub I/O"); SYSCTL_INT(_vfs_zfs, OID_AUTO, no_scrub_prefetch, CTLFLAG_RWTUN, &zfs_no_scrub_prefetch, 0, "Disable scrub prefetching"); enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE; /* max number of blocks to free in a single TXG */ uint64_t zfs_free_max_blocks = UINT64_MAX; SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, free_max_blocks, CTLFLAG_RWTUN, &zfs_free_max_blocks, 0, "Maximum number of blocks to free in one TXG"); #define DSL_SCAN_IS_SCRUB_RESILVER(scn) \ ((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB || \ (scn)->scn_phys.scn_func == POOL_SCAN_RESILVER) extern int zfs_txg_timeout; /* * Enable/disable the processing of the free_bpobj object. */ boolean_t zfs_free_bpobj_enabled = B_TRUE; SYSCTL_INT(_vfs_zfs, OID_AUTO, free_bpobj_enabled, CTLFLAG_RWTUN, &zfs_free_bpobj_enabled, 0, "Enable free_bpobj processing"); /* the order has to match pool_scan_type */ static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = { NULL, dsl_scan_scrub_cb, /* POOL_SCAN_SCRUB */ dsl_scan_scrub_cb, /* POOL_SCAN_RESILVER */ }; int dsl_scan_init(dsl_pool_t *dp, uint64_t txg) { int err; dsl_scan_t *scn; spa_t *spa = dp->dp_spa; uint64_t f; scn = dp->dp_scan = kmem_zalloc(sizeof (dsl_scan_t), KM_SLEEP); scn->scn_dp = dp; /* * It's possible that we're resuming a scan after a reboot so * make sure that the scan_async_destroying flag is initialized * appropriately. */ ASSERT(!scn->scn_async_destroying); scn->scn_async_destroying = spa_feature_is_active(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY); err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, "scrub_func", sizeof (uint64_t), 1, &f); if (err == 0) { /* * There was an old-style scrub in progress. Restart a * new-style scrub from the beginning. */ scn->scn_restart_txg = txg; zfs_dbgmsg("old-style scrub was in progress; " "restarting new-style scrub in txg %llu", scn->scn_restart_txg); /* * Load the queue obj from the old location so that it * can be freed by dsl_scan_done(). */ (void) zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, "scrub_queue", sizeof (uint64_t), 1, &scn->scn_phys.scn_queue_obj); } else { err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS, &scn->scn_phys); if (err == ENOENT) return (0); else if (err) return (err); if (scn->scn_phys.scn_state == DSS_SCANNING && spa_prev_software_version(dp->dp_spa) < SPA_VERSION_SCAN) { /* * A new-type scrub was in progress on an old * pool, and the pool was accessed by old * software. Restart from the beginning, since * the old software may have changed the pool in * the meantime. 
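/*
 * Illustrative userland sketch (an editor's example, not part of this
 * change): the CTLFLAG_RWTUN knobs declared at the top of this file are
 * exported on FreeBSD as vfs.zfs.* sysctls, e.g. vfs.zfs.scrub_delay and
 * vfs.zfs.top_maxinflight.  A monitoring tool could read them with
 * sysctlbyname(3); the sysctl names come from the SYSCTL_UINT()
 * declarations above, everything else here is hypothetical.
 */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	unsigned int scrub_delay, maxinflight;
	size_t len;

	len = sizeof (scrub_delay);
	if (sysctlbyname("vfs.zfs.scrub_delay", &scrub_delay, &len,
	    NULL, 0) != 0)
		return (1);
	len = sizeof (maxinflight);
	if (sysctlbyname("vfs.zfs.top_maxinflight", &maxinflight, &len,
	    NULL, 0) != 0)
		return (1);
	printf("scrub delay %u ticks, %u scrub I/Os per top-level vdev\n",
	    scrub_delay, maxinflight);
	return (0);
}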
*/ scn->scn_restart_txg = txg; zfs_dbgmsg("new-style scrub was modified " "by old software; restarting in txg %llu", scn->scn_restart_txg); } } spa_scan_stat_init(spa); return (0); } void dsl_scan_fini(dsl_pool_t *dp) { if (dp->dp_scan) { kmem_free(dp->dp_scan, sizeof (dsl_scan_t)); dp->dp_scan = NULL; } } /* ARGSUSED */ static int dsl_scan_setup_check(void *arg, dmu_tx_t *tx) { dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; if (scn->scn_phys.scn_state == DSS_SCANNING) return (SET_ERROR(EBUSY)); return (0); } static void dsl_scan_setup_sync(void *arg, dmu_tx_t *tx) { dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; pool_scan_func_t *funcp = arg; dmu_object_type_t ot = 0; dsl_pool_t *dp = scn->scn_dp; spa_t *spa = dp->dp_spa; ASSERT(scn->scn_phys.scn_state != DSS_SCANNING); ASSERT(*funcp > POOL_SCAN_NONE && *funcp < POOL_SCAN_FUNCS); bzero(&scn->scn_phys, sizeof (scn->scn_phys)); scn->scn_phys.scn_func = *funcp; scn->scn_phys.scn_state = DSS_SCANNING; scn->scn_phys.scn_min_txg = 0; scn->scn_phys.scn_max_txg = tx->tx_txg; scn->scn_phys.scn_ddt_class_max = DDT_CLASSES - 1; /* the entire DDT */ scn->scn_phys.scn_start_time = gethrestime_sec(); scn->scn_phys.scn_errors = 0; scn->scn_phys.scn_to_examine = spa->spa_root_vdev->vdev_stat.vs_alloc; scn->scn_restart_txg = 0; scn->scn_done_txg = 0; spa_scan_stat_init(spa); if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { scn->scn_phys.scn_ddt_class_max = zfs_scrub_ddt_class_max; /* rewrite all disk labels */ vdev_config_dirty(spa->spa_root_vdev); if (vdev_resilver_needed(spa->spa_root_vdev, &scn->scn_phys.scn_min_txg, &scn->scn_phys.scn_max_txg)) { - spa_event_notify(spa, NULL, ESC_ZFS_RESILVER_START); + spa_event_notify(spa, NULL, NULL, + ESC_ZFS_RESILVER_START); } else { - spa_event_notify(spa, NULL, ESC_ZFS_SCRUB_START); + spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_START); } spa->spa_scrub_started = B_TRUE; /* * If this is an incremental scrub, limit the DDT scrub phase * to just the auto-ditto class (for correctness); the rest * of the scrub should go faster using top-down pruning. */ if (scn->scn_phys.scn_min_txg > TXG_INITIAL) scn->scn_phys.scn_ddt_class_max = DDT_CLASS_DITTO; } /* back to the generic stuff */ if (dp->dp_blkstats == NULL) { dp->dp_blkstats = kmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP); } bzero(dp->dp_blkstats, sizeof (zfs_all_blkstats_t)); if (spa_version(spa) < SPA_VERSION_DSL_SCRUB) ot = DMU_OT_ZAP_OTHER; scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset, ot ? ot : DMU_OT_SCAN_QUEUE, DMU_OT_NONE, 0, tx); dsl_scan_sync_state(scn, tx); spa_history_log_internal(spa, "scan setup", tx, "func=%u mintxg=%llu maxtxg=%llu", *funcp, scn->scn_phys.scn_min_txg, scn->scn_phys.scn_max_txg); } /* ARGSUSED */ static void dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) { static const char *old_names[] = { "scrub_bookmark", "scrub_ddt_bookmark", "scrub_ddt_class_max", "scrub_queue", "scrub_min_txg", "scrub_max_txg", "scrub_func", "scrub_errors", NULL }; dsl_pool_t *dp = scn->scn_dp; spa_t *spa = dp->dp_spa; int i; /* Remove any remnants of an old-style scrub. */ for (i = 0; old_names[i]; i++) { (void) zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, old_names[i], tx); } if (scn->scn_phys.scn_queue_obj != 0) { VERIFY(0 == dmu_object_free(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, tx)); scn->scn_phys.scn_queue_obj = 0; } scn->scn_phys.scn_flags &= ~DSF_SCRUB_PAUSED; /* * If we were "restarted" from a stopped state, don't bother * with anything else. 
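/*
 * Hedged sketch of the interface change visible in the hunks above:
 * spa_event_notify() now takes an optional nvlist (hist_nvl) between the
 * vdev and the event name, and every call site in this file passes NULL
 * for it.  A hypothetical caller that wanted to attach extra payload to a
 * resilver-start event might look like the fragment below; the
 * "start_txg" key is illustrative only, and the sketch assumes the callee
 * copies what it needs so the caller keeps ownership of the nvlist.
 */
static void
example_notify_resilver_start(spa_t *spa, uint64_t start_txg)
{
	nvlist_t *payload = fnvlist_alloc();

	fnvlist_add_uint64(payload, "start_txg", start_txg);
	/* NULL vdev: the event describes the pool as a whole. */
	spa_event_notify(spa, NULL, payload, ESC_ZFS_RESILVER_START);
	fnvlist_free(payload);
}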
*/ if (scn->scn_phys.scn_state != DSS_SCANNING) return; if (complete) scn->scn_phys.scn_state = DSS_FINISHED; else scn->scn_phys.scn_state = DSS_CANCELED; if (dsl_scan_restarting(scn, tx)) spa_history_log_internal(spa, "scan aborted, restarting", tx, "errors=%llu", spa_get_errlog_size(spa)); else if (!complete) spa_history_log_internal(spa, "scan cancelled", tx, "errors=%llu", spa_get_errlog_size(spa)); else spa_history_log_internal(spa, "scan done", tx, "errors=%llu", spa_get_errlog_size(spa)); if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { mutex_enter(&spa->spa_scrub_lock); while (spa->spa_scrub_inflight > 0) { cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); } mutex_exit(&spa->spa_scrub_lock); spa->spa_scrub_started = B_FALSE; spa->spa_scrub_active = B_FALSE; /* * If the scrub/resilver completed, update all DTLs to * reflect this. Whether it succeeded or not, vacate * all temporary scrub DTLs. */ vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg, complete ? scn->scn_phys.scn_max_txg : 0, B_TRUE); if (complete) { - spa_event_notify(spa, NULL, scn->scn_phys.scn_min_txg ? + spa_event_notify(spa, NULL, NULL, + scn->scn_phys.scn_min_txg ? ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH); } spa_errlog_rotate(spa); /* * We may have finished replacing a device. * Let the async thread assess this and handle the detach. */ spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); } scn->scn_phys.scn_end_time = gethrestime_sec(); } /* ARGSUSED */ static int dsl_scan_cancel_check(void *arg, dmu_tx_t *tx) { dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; if (scn->scn_phys.scn_state != DSS_SCANNING) return (SET_ERROR(ENOENT)); return (0); } /* ARGSUSED */ static void dsl_scan_cancel_sync(void *arg, dmu_tx_t *tx) { dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; dsl_scan_done(scn, B_FALSE, tx); dsl_scan_sync_state(scn, tx); - spa_event_notify(scn->scn_dp->dp_spa, NULL, ESC_ZFS_SCRUB_ABORT); + spa_event_notify(scn->scn_dp->dp_spa, NULL, NULL, ESC_ZFS_SCRUB_ABORT); } int dsl_scan_cancel(dsl_pool_t *dp) { return (dsl_sync_task(spa_name(dp->dp_spa), dsl_scan_cancel_check, dsl_scan_cancel_sync, NULL, 3, ZFS_SPACE_CHECK_RESERVED)); } boolean_t dsl_scan_is_paused_scrub(const dsl_scan_t *scn) { if (dsl_scan_scrubbing(scn->scn_dp) && scn->scn_phys.scn_flags & DSF_SCRUB_PAUSED) return (B_TRUE); return (B_FALSE); } static int dsl_scrub_pause_resume_check(void *arg, dmu_tx_t *tx) { pool_scrub_cmd_t *cmd = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_scan_t *scn = dp->dp_scan; if (*cmd == POOL_SCRUB_PAUSE) { /* can't pause a scrub when there is no in-progress scrub */ if (!dsl_scan_scrubbing(dp)) return (SET_ERROR(ENOENT)); /* can't pause a paused scrub */ if (dsl_scan_is_paused_scrub(scn)) return (SET_ERROR(EBUSY)); } else if (*cmd != POOL_SCRUB_NORMAL) { return (SET_ERROR(ENOTSUP)); } return (0); } static void dsl_scrub_pause_resume_sync(void *arg, dmu_tx_t *tx) { pool_scrub_cmd_t *cmd = arg; dsl_pool_t *dp = dmu_tx_pool(tx); spa_t *spa = dp->dp_spa; dsl_scan_t *scn = dp->dp_scan; if (*cmd == POOL_SCRUB_PAUSE) { /* can't pause a scrub when there is no in-progress scrub */ spa->spa_scan_pass_scrub_pause = gethrestime_sec(); scn->scn_phys.scn_flags |= DSF_SCRUB_PAUSED; dsl_scan_sync_state(scn, tx); - spa_event_notify(spa, NULL, ESC_ZFS_SCRUB_PAUSED); + spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_PAUSED); } else { ASSERT3U(*cmd, ==, POOL_SCRUB_NORMAL); if (dsl_scan_is_paused_scrub(scn)) { /* * We need to keep track of how much time we spend * paused per pass so that we can adjust the scrub rate * shown in the output of 'zpool status' 
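/*
 * Hedged sketch of the pause bookkeeping around this point:
 * spa_scan_pass_scrub_pause records when the pause began (0 when not
 * paused), and the resume path just below folds the gap into
 * spa_scan_pass_scrub_spent_paused.  A scrub-rate calculation of the kind
 * 'zpool status' wants would subtract the paused time from the wall-clock
 * pass time, roughly as follows; spa_scan_pass_start is assumed here to
 * be the pass start time recorded by spa_scan_stat_init().
 */
static uint64_t
example_pass_active_seconds(const spa_t *spa)
{
	uint64_t now = gethrestime_sec();
	uint64_t paused = spa->spa_scan_pass_scrub_spent_paused;

	/* If a pause is in progress, count it as paused time too. */
	if (spa->spa_scan_pass_scrub_pause != 0)
		paused += now - spa->spa_scan_pass_scrub_pause;

	return ((now - spa->spa_scan_pass_start) - paused);
}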
*/ spa->spa_scan_pass_scrub_spent_paused += gethrestime_sec() - spa->spa_scan_pass_scrub_pause; spa->spa_scan_pass_scrub_pause = 0; scn->scn_phys.scn_flags &= ~DSF_SCRUB_PAUSED; dsl_scan_sync_state(scn, tx); } } } /* * Set scrub pause/resume state if it makes sense to do so */ int dsl_scrub_set_pause_resume(const dsl_pool_t *dp, pool_scrub_cmd_t cmd) { return (dsl_sync_task(spa_name(dp->dp_spa), dsl_scrub_pause_resume_check, dsl_scrub_pause_resume_sync, &cmd, 3, ZFS_SPACE_CHECK_RESERVED)); } boolean_t dsl_scan_scrubbing(const dsl_pool_t *dp) { dsl_scan_t *scn = dp->dp_scan; if (scn->scn_phys.scn_state == DSS_SCANNING && scn->scn_phys.scn_func == POOL_SCAN_SCRUB) return (B_TRUE); return (B_FALSE); } static void dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb, dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn, dmu_objset_type_t ostype, dmu_tx_t *tx); static void dsl_scan_visitdnode(dsl_scan_t *, dsl_dataset_t *ds, dmu_objset_type_t ostype, dnode_phys_t *dnp, uint64_t object, dmu_tx_t *tx); void dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bp) { zio_free(dp->dp_spa, txg, bp); } void dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp) { ASSERT(dsl_pool_sync_context(dp)); zio_nowait(zio_free_sync(pio, dp->dp_spa, txg, bpp, BP_GET_PSIZE(bpp), pio->io_flags)); } static uint64_t dsl_scan_ds_maxtxg(dsl_dataset_t *ds) { uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg; if (ds->ds_is_snapshot) return (MIN(smt, dsl_dataset_phys(ds)->ds_creation_txg)); return (smt); } static void dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx) { VERIFY0(zap_update(scn->scn_dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS, &scn->scn_phys, tx)); } extern int zfs_vdev_async_write_active_min_dirty_percent; static boolean_t dsl_scan_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb) { /* we never skip user/group accounting objects */ if (zb && (int64_t)zb->zb_object < 0) return (B_FALSE); if (scn->scn_suspending) return (B_TRUE); /* we're already suspending */ if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark)) return (B_FALSE); /* we're resuming */ /* We only know how to resume from level-0 blocks. */ if (zb && zb->zb_level != 0) return (B_FALSE); /* * We suspend if: * - we have scanned for the maximum time: an entire txg * timeout (default 5 sec) * or * - we have scanned for at least the minimum time (default 1 sec * for scrub, 3 sec for resilver), and either we have sufficient * dirty data that we are starting to write more quickly * (default 30%), or someone is explicitly waiting for this txg * to complete. * or * - the spa is shutting down because this pool is being exported * or the machine is rebooting. */ int mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ? 
zfs_resilver_min_time_ms : zfs_scan_min_time_ms; uint64_t elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time; int dirty_pct = scn->scn_dp->dp_dirty_total * 100 / zfs_dirty_data_max; if (elapsed_nanosecs / NANOSEC >= zfs_txg_timeout || (NSEC2MSEC(elapsed_nanosecs) > mintime && (txg_sync_waiting(scn->scn_dp) || dirty_pct >= zfs_vdev_async_write_active_min_dirty_percent)) || spa_shutting_down(scn->scn_dp->dp_spa)) { if (zb) { dprintf("suspending at bookmark %llx/%llx/%llx/%llx\n", (longlong_t)zb->zb_objset, (longlong_t)zb->zb_object, (longlong_t)zb->zb_level, (longlong_t)zb->zb_blkid); scn->scn_phys.scn_bookmark = *zb; } dprintf("suspending at DDT bookmark %llx/%llx/%llx/%llx\n", (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class, (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type, (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum, (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor); scn->scn_suspending = B_TRUE; return (B_TRUE); } return (B_FALSE); } typedef struct zil_scan_arg { dsl_pool_t *zsa_dp; zil_header_t *zsa_zh; } zil_scan_arg_t; /* ARGSUSED */ static int dsl_scan_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) { zil_scan_arg_t *zsa = arg; dsl_pool_t *dp = zsa->zsa_dp; dsl_scan_t *scn = dp->dp_scan; zil_header_t *zh = zsa->zsa_zh; zbookmark_phys_t zb; if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) return (0); /* * One block ("stubby") can be allocated a long time ago; we * want to visit that one because it has been allocated * (on-disk) even if it hasn't been claimed (even though for * scrub there's nothing to do to it). */ if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(dp->dp_spa)) return (0); SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET], ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb)); return (0); } /* ARGSUSED */ static int dsl_scan_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg) { if (lrc->lrc_txtype == TX_WRITE) { zil_scan_arg_t *zsa = arg; dsl_pool_t *dp = zsa->zsa_dp; dsl_scan_t *scn = dp->dp_scan; zil_header_t *zh = zsa->zsa_zh; lr_write_t *lr = (lr_write_t *)lrc; blkptr_t *bp = &lr->lr_blkptr; zbookmark_phys_t zb; if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) return (0); /* * birth can be < claim_txg if this record's txg is * already txg sync'ed (but this log block contains * other records that are not synced) */ if (claim_txg == 0 || bp->blk_birth < claim_txg) return (0); SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET], lr->lr_foid, ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp)); VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb)); } return (0); } static void dsl_scan_zil(dsl_pool_t *dp, zil_header_t *zh) { uint64_t claim_txg = zh->zh_claim_txg; zil_scan_arg_t zsa = { dp, zh }; zilog_t *zilog; /* * We only want to visit blocks that have been claimed but not yet * replayed (or, in read-only mode, blocks that *would* be claimed). 
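/*
 * Worked summary (editor's sketch, standalone userland C) of the suspend
 * thresholds applied by dsl_scan_check_suspend() above, using the defaults
 * quoted in that comment and in the tunables at the top of this file.
 */
#include <stdio.h>

int
main(void)
{
	unsigned txg_timeout_s = 5;	 /* zfs_txg_timeout default */
	unsigned scrub_min_ms = 1000;	 /* zfs_scan_min_time_ms */
	unsigned resilver_min_ms = 3000; /* zfs_resilver_min_time_ms */
	unsigned dirty_hi_pct = 30;	 /* async_write_active_min_dirty_percent */

	printf("hard cap: suspend after %u s of scanning in one txg\n",
	    txg_timeout_s);
	printf("soft cap: after %u ms (scrub) or %u ms (resilver), yield if "
	    "dirty data >= %u%% or a txg sync is waiting\n",
	    scrub_min_ms, resilver_min_ms, dirty_hi_pct);
	return (0);
}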
*/ if (claim_txg == 0 && spa_writeable(dp->dp_spa)) return; zilog = zil_alloc(dp->dp_meta_objset, zh); (void) zil_parse(zilog, dsl_scan_zil_block, dsl_scan_zil_record, &zsa, claim_txg); zil_free(zilog); } /* ARGSUSED */ static void dsl_scan_prefetch(dsl_scan_t *scn, arc_buf_t *buf, blkptr_t *bp, uint64_t objset, uint64_t object, uint64_t blkid) { zbookmark_phys_t czb; arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; if (zfs_no_scrub_prefetch) return; if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_min_txg || (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE)) return; SET_BOOKMARK(&czb, objset, object, BP_GET_LEVEL(bp), blkid); (void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa, bp, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD, &flags, &czb); } static boolean_t dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp, const zbookmark_phys_t *zb) { /* * We never skip over user/group accounting objects (obj<0) */ if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark) && (int64_t)zb->zb_object >= 0) { /* * If we already visited this bp & everything below (in * a prior txg sync), don't bother doing it again. */ if (zbookmark_subtree_completed(dnp, zb, &scn->scn_phys.scn_bookmark)) return (B_TRUE); /* * If we found the block we're trying to resume from, or * we went past it to a different object, zero it out to * indicate that it's OK to start checking for suspending * again. */ if (bcmp(zb, &scn->scn_phys.scn_bookmark, sizeof (*zb)) == 0 || zb->zb_object > scn->scn_phys.scn_bookmark.zb_object) { dprintf("resuming at %llx/%llx/%llx/%llx\n", (longlong_t)zb->zb_objset, (longlong_t)zb->zb_object, (longlong_t)zb->zb_level, (longlong_t)zb->zb_blkid); bzero(&scn->scn_phys.scn_bookmark, sizeof (*zb)); } } return (B_FALSE); } /* * Return nonzero on i/o error. * Return new buf to write out in *bufp. 
*/ static int dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, dnode_phys_t *dnp, const blkptr_t *bp, const zbookmark_phys_t *zb, dmu_tx_t *tx) { dsl_pool_t *dp = scn->scn_dp; int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD; int err; if (BP_GET_LEVEL(bp) > 0) { arc_flags_t flags = ARC_FLAG_WAIT; int i; blkptr_t *cbp; int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; arc_buf_t *buf; err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb); if (err) { scn->scn_phys.scn_errors++; return (err); } for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) { dsl_scan_prefetch(scn, buf, cbp, zb->zb_objset, zb->zb_object, zb->zb_blkid * epb + i); } for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) { zbookmark_phys_t czb; SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, zb->zb_level - 1, zb->zb_blkid * epb + i); dsl_scan_visitbp(cbp, &czb, dnp, ds, scn, ostype, tx); } arc_buf_destroy(buf, &buf); } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { arc_flags_t flags = ARC_FLAG_WAIT; dnode_phys_t *cdnp; int i, j; int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; arc_buf_t *buf; err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb); if (err) { scn->scn_phys.scn_errors++; return (err); } for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) { for (j = 0; j < cdnp->dn_nblkptr; j++) { blkptr_t *cbp = &cdnp->dn_blkptr[j]; dsl_scan_prefetch(scn, buf, cbp, zb->zb_objset, zb->zb_blkid * epb + i, j); } } for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) { dsl_scan_visitdnode(scn, ds, ostype, cdnp, zb->zb_blkid * epb + i, tx); } arc_buf_destroy(buf, &buf); } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { arc_flags_t flags = ARC_FLAG_WAIT; objset_phys_t *osp; arc_buf_t *buf; err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb); if (err) { scn->scn_phys.scn_errors++; return (err); } osp = buf->b_data; dsl_scan_visitdnode(scn, ds, osp->os_type, &osp->os_meta_dnode, DMU_META_DNODE_OBJECT, tx); if (OBJSET_BUF_HAS_USERUSED(buf)) { /* * We also always visit user/group accounting * objects, and never skip them, even if we are * suspending. This is necessary so that the space * deltas from this txg get integrated. */ dsl_scan_visitdnode(scn, ds, osp->os_type, &osp->os_groupused_dnode, DMU_GROUPUSED_OBJECT, tx); dsl_scan_visitdnode(scn, ds, osp->os_type, &osp->os_userused_dnode, DMU_USERUSED_OBJECT, tx); } arc_buf_destroy(buf, &buf); } return (0); } static void dsl_scan_visitdnode(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, dnode_phys_t *dnp, uint64_t object, dmu_tx_t *tx) { int j; for (j = 0; j < dnp->dn_nblkptr; j++) { zbookmark_phys_t czb; SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object, dnp->dn_nlevels - 1, j); dsl_scan_visitbp(&dnp->dn_blkptr[j], &czb, dnp, ds, scn, ostype, tx); } if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { zbookmark_phys_t czb; SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object, 0, DMU_SPILL_BLKID); dsl_scan_visitbp(&dnp->dn_spill, &czb, dnp, ds, scn, ostype, tx); } } /* * The arguments are in this order because mdb can only print the * first 5; we want them to be useful. 
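/*
 * Hedged worked example for the epb ("entries per block") computations in
 * dsl_scan_recurse() above: an indirect block holds lsize >> SPA_BLKPTRSHIFT
 * block pointers (128-byte blkptrs) and a dnode block holds
 * lsize >> DNODE_SHIFT dnodes (512-byte dnode_phys).  The EX_* constants
 * mirror those on-disk sizes; the program is standalone userland C.
 */
#include <stdio.h>

#define	EX_SPA_BLKPTRSHIFT	7	/* sizeof (blkptr_t) == 128 */
#define	EX_DNODE_SHIFT		9	/* sizeof (dnode_phys_t) == 512 */

int
main(void)
{
	unsigned lsize = 128 * 1024;	/* a common 128K indirect block */

	printf("%u-byte indirect block -> %u child blkptrs\n",
	    lsize, lsize >> EX_SPA_BLKPTRSHIFT);	/* 1024 */
	lsize = 16 * 1024;		/* a 16K dnode block */
	printf("%u-byte dnode block -> %u dnodes\n",
	    lsize, lsize >> EX_DNODE_SHIFT);		/* 32 */
	return (0);
}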
*/ static void dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb, dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn, dmu_objset_type_t ostype, dmu_tx_t *tx) { dsl_pool_t *dp = scn->scn_dp; arc_buf_t *buf = NULL; blkptr_t bp_toread = *bp; /* ASSERT(pbuf == NULL || arc_released(pbuf)); */ if (dsl_scan_check_suspend(scn, zb)) return; if (dsl_scan_check_resume(scn, dnp, zb)) return; if (BP_IS_HOLE(bp)) return; scn->scn_visited_this_txg++; dprintf_bp(bp, "visiting ds=%p/%llu zb=%llx/%llx/%llx/%llx bp=%p", ds, ds ? ds->ds_object : 0, zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid, bp); if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) return; if (dsl_scan_recurse(scn, ds, ostype, dnp, &bp_toread, zb, tx) != 0) return; /* * If dsl_scan_ddt() has already visited this block, it will have * already done any translations or scrubbing, so don't call the * callback again. */ if (ddt_class_contains(dp->dp_spa, scn->scn_phys.scn_ddt_class_max, bp)) { ASSERT(buf == NULL); return; } /* * If this block is from the future (after cur_max_txg), then we * are doing this on behalf of a deleted snapshot, and we will * revisit the future block on the next pass of this dataset. * Don't scan it now unless we need to because something * under it was modified. */ if (BP_PHYSICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_max_txg) { scan_funcs[scn->scn_phys.scn_func](dp, bp, zb); } } static void dsl_scan_visit_rootbp(dsl_scan_t *scn, dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx) { zbookmark_phys_t zb; SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); dsl_scan_visitbp(bp, &zb, NULL, ds, scn, DMU_OST_NONE, tx); dprintf_ds(ds, "finished scan%s", ""); } void dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx) { dsl_pool_t *dp = ds->ds_dir->dd_pool; dsl_scan_t *scn = dp->dp_scan; uint64_t mintxg; if (scn->scn_phys.scn_state != DSS_SCANNING) return; if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) { if (ds->ds_is_snapshot) { /* * Note: * - scn_cur_{min,max}_txg stays the same. * - Setting the flag is not really necessary if * scn_cur_max_txg == scn_max_txg, because there * is nothing after this snapshot that we care * about. However, we set it anyway and then * ignore it when we retraverse it in * dsl_scan_visitds(). */ scn->scn_phys.scn_bookmark.zb_objset = dsl_dataset_phys(ds)->ds_next_snap_obj; zfs_dbgmsg("destroying ds %llu; currently traversing; " "reset zb_objset to %llu", (u_longlong_t)ds->ds_object, (u_longlong_t)dsl_dataset_phys(ds)-> ds_next_snap_obj); scn->scn_phys.scn_flags |= DSF_VISIT_DS_AGAIN; } else { SET_BOOKMARK(&scn->scn_phys.scn_bookmark, ZB_DESTROYED_OBJSET, 0, 0, 0); zfs_dbgmsg("destroying ds %llu; currently traversing; " "reset bookmark to -1,0,0,0", (u_longlong_t)ds->ds_object); } } else if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) { ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1); VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds->ds_object, tx)); if (ds->ds_is_snapshot) { /* * We keep the same mintxg; it could be > * ds_creation_txg if the previous snapshot was * deleted too. 
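/*
 * Hedged sketch: the scan work queue (scn_phys.scn_queue_obj) is a ZAP
 * whose integer keys are dataset object numbers and whose values are the
 * min txg to scan that dataset from.  The destroy/snapshot/clone-swap
 * hooks around this point move entries with the int-key ZAP helpers,
 * essentially like the illustrative helper below (not part of this diff).
 */
static void
example_requeue_dataset(dsl_pool_t *dp, dsl_scan_t *scn,
    uint64_t old_dsobj, uint64_t new_dsobj, uint64_t mintxg, dmu_tx_t *tx)
{
	/* Drop the stale entry... */
	VERIFY0(zap_remove_int(dp->dp_meta_objset,
	    scn->scn_phys.scn_queue_obj, old_dsobj, tx));
	/* ...and queue the replacement with the same starting txg. */
	VERIFY0(zap_add_int_key(dp->dp_meta_objset,
	    scn->scn_phys.scn_queue_obj, new_dsobj, mintxg, tx));
}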
*/ VERIFY(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, dsl_dataset_phys(ds)->ds_next_snap_obj, mintxg, tx) == 0); zfs_dbgmsg("destroying ds %llu; in queue; " "replacing with %llu", (u_longlong_t)ds->ds_object, (u_longlong_t)dsl_dataset_phys(ds)-> ds_next_snap_obj); } else { zfs_dbgmsg("destroying ds %llu; in queue; removing", (u_longlong_t)ds->ds_object); } } /* * dsl_scan_sync() should be called after this, and should sync * out our changed state, but just to be safe, do it here. */ dsl_scan_sync_state(scn, tx); } void dsl_scan_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx) { dsl_pool_t *dp = ds->ds_dir->dd_pool; dsl_scan_t *scn = dp->dp_scan; uint64_t mintxg; if (scn->scn_phys.scn_state != DSS_SCANNING) return; ASSERT(dsl_dataset_phys(ds)->ds_prev_snap_obj != 0); if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) { scn->scn_phys.scn_bookmark.zb_objset = dsl_dataset_phys(ds)->ds_prev_snap_obj; zfs_dbgmsg("snapshotting ds %llu; currently traversing; " "reset zb_objset to %llu", (u_longlong_t)ds->ds_object, (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj); } else if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) { VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds->ds_object, tx)); VERIFY(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, dsl_dataset_phys(ds)->ds_prev_snap_obj, mintxg, tx) == 0); zfs_dbgmsg("snapshotting ds %llu; in queue; " "replacing with %llu", (u_longlong_t)ds->ds_object, (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj); } dsl_scan_sync_state(scn, tx); } void dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx) { dsl_pool_t *dp = ds1->ds_dir->dd_pool; dsl_scan_t *scn = dp->dp_scan; uint64_t mintxg; if (scn->scn_phys.scn_state != DSS_SCANNING) return; if (scn->scn_phys.scn_bookmark.zb_objset == ds1->ds_object) { scn->scn_phys.scn_bookmark.zb_objset = ds2->ds_object; zfs_dbgmsg("clone_swap ds %llu; currently traversing; " "reset zb_objset to %llu", (u_longlong_t)ds1->ds_object, (u_longlong_t)ds2->ds_object); } else if (scn->scn_phys.scn_bookmark.zb_objset == ds2->ds_object) { scn->scn_phys.scn_bookmark.zb_objset = ds1->ds_object; zfs_dbgmsg("clone_swap ds %llu; currently traversing; " "reset zb_objset to %llu", (u_longlong_t)ds2->ds_object, (u_longlong_t)ds1->ds_object); } if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds1->ds_object, &mintxg) == 0) { int err; ASSERT3U(mintxg, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg); ASSERT3U(mintxg, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg); VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds1->ds_object, tx)); err = zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds2->ds_object, mintxg, tx); VERIFY(err == 0 || err == EEXIST); if (err == EEXIST) { /* Both were there to begin with */ VERIFY(0 == zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds1->ds_object, mintxg, tx)); } zfs_dbgmsg("clone_swap ds %llu; in queue; " "replacing with %llu", (u_longlong_t)ds1->ds_object, (u_longlong_t)ds2->ds_object); } else if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds2->ds_object, &mintxg) == 0) { ASSERT3U(mintxg, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg); ASSERT3U(mintxg, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg); VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds2->ds_object, tx)); VERIFY(0 == zap_add_int_key(dp->dp_meta_objset, 
scn->scn_phys.scn_queue_obj, ds1->ds_object, mintxg, tx)); zfs_dbgmsg("clone_swap ds %llu; in queue; " "replacing with %llu", (u_longlong_t)ds2->ds_object, (u_longlong_t)ds1->ds_object); } dsl_scan_sync_state(scn, tx); } struct enqueue_clones_arg { dmu_tx_t *tx; uint64_t originobj; }; /* ARGSUSED */ static int enqueue_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) { struct enqueue_clones_arg *eca = arg; dsl_dataset_t *ds; int err; dsl_scan_t *scn = dp->dp_scan; if (dsl_dir_phys(hds->ds_dir)->dd_origin_obj != eca->originobj) return (0); err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds); if (err) return (err); while (dsl_dataset_phys(ds)->ds_prev_snap_obj != eca->originobj) { dsl_dataset_t *prev; err = dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev); dsl_dataset_rele(ds, FTAG); if (err) return (err); ds = prev; } VERIFY(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds->ds_object, dsl_dataset_phys(ds)->ds_prev_snap_txg, eca->tx) == 0); dsl_dataset_rele(ds, FTAG); return (0); } static void dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx) { dsl_pool_t *dp = scn->scn_dp; dsl_dataset_t *ds; objset_t *os; VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); if (scn->scn_phys.scn_cur_min_txg >= scn->scn_phys.scn_max_txg) { /* * This can happen if this snapshot was created after the * scan started, and we already completed a previous snapshot * that was created after the scan started. This snapshot * only references blocks with: * * birth < our ds_creation_txg * cur_min_txg is no less than ds_creation_txg. * We have already visited these blocks. * or * birth > scn_max_txg * The scan requested not to visit these blocks. * * Subsequent snapshots (and clones) can reference our * blocks, or blocks with even higher birth times. * Therefore we do not need to visit them either, * so we do not add them to the work queue. * * Note that checking for cur_min_txg >= cur_max_txg * is not sufficient, because in that case we may need to * visit subsequent snapshots. This happens when min_txg > 0, * which raises cur_min_txg. In this case we will visit * this dataset but skip all of its blocks, because the * rootbp's birth time is < cur_min_txg. Then we will * add the next snapshots/clones to the work queue. */ char *dsname = kmem_alloc(MAXNAMELEN, KM_SLEEP); dsl_dataset_name(ds, dsname); zfs_dbgmsg("scanning dataset %llu (%s) is unnecessary because " "cur_min_txg (%llu) >= max_txg (%llu)", dsobj, dsname, scn->scn_phys.scn_cur_min_txg, scn->scn_phys.scn_max_txg); kmem_free(dsname, MAXNAMELEN); goto out; } if (dmu_objset_from_ds(ds, &os)) goto out; /* * Only the ZIL in the head (non-snapshot) is valid. Even though * snapshots can have ZIL block pointers (which may be the same * BP as in the head), they must be ignored. So we traverse the * ZIL here, rather than in scan_recurse(), because the regular * snapshot block-sharing rules don't apply to it. */ if (DSL_SCAN_IS_SCRUB_RESILVER(scn) && !ds->ds_is_snapshot) dsl_scan_zil(dp, &os->os_zil_header); /* * Iterate over the bps in this ds. 
*/ dmu_buf_will_dirty(ds->ds_dbuf, tx); rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); dsl_scan_visit_rootbp(scn, ds, &dsl_dataset_phys(ds)->ds_bp, tx); rrw_exit(&ds->ds_bp_rwlock, FTAG); char *dsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); dsl_dataset_name(ds, dsname); zfs_dbgmsg("scanned dataset %llu (%s) with min=%llu max=%llu; " "suspending=%u", (longlong_t)dsobj, dsname, (longlong_t)scn->scn_phys.scn_cur_min_txg, (longlong_t)scn->scn_phys.scn_cur_max_txg, (int)scn->scn_suspending); kmem_free(dsname, ZFS_MAX_DATASET_NAME_LEN); if (scn->scn_suspending) goto out; /* * We've finished this pass over this dataset. */ /* * If we did not completely visit this dataset, do another pass. */ if (scn->scn_phys.scn_flags & DSF_VISIT_DS_AGAIN) { zfs_dbgmsg("incomplete pass; visiting again"); scn->scn_phys.scn_flags &= ~DSF_VISIT_DS_AGAIN; VERIFY(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds->ds_object, scn->scn_phys.scn_cur_max_txg, tx) == 0); goto out; } /* * Add descendent datasets to work queue. */ if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0) { VERIFY(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, dsl_dataset_phys(ds)->ds_next_snap_obj, dsl_dataset_phys(ds)->ds_creation_txg, tx) == 0); } if (dsl_dataset_phys(ds)->ds_num_children > 1) { boolean_t usenext = B_FALSE; if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) { uint64_t count; /* * A bug in a previous version of the code could * cause upgrade_clones_cb() to not set * ds_next_snap_obj when it should, leading to a * missing entry. Therefore we can only use the * next_clones_obj when its count is correct. */ int err = zap_count(dp->dp_meta_objset, dsl_dataset_phys(ds)->ds_next_clones_obj, &count); if (err == 0 && count == dsl_dataset_phys(ds)->ds_num_children - 1) usenext = B_TRUE; } if (usenext) { VERIFY0(zap_join_key(dp->dp_meta_objset, dsl_dataset_phys(ds)->ds_next_clones_obj, scn->scn_phys.scn_queue_obj, dsl_dataset_phys(ds)->ds_creation_txg, tx)); } else { struct enqueue_clones_arg eca; eca.tx = tx; eca.originobj = ds->ds_object; VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, enqueue_clones_cb, &eca, DS_FIND_CHILDREN)); } } out: dsl_dataset_rele(ds, FTAG); } /* ARGSUSED */ static int enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) { dmu_tx_t *tx = arg; dsl_dataset_t *ds; int err; dsl_scan_t *scn = dp->dp_scan; err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds); if (err) return (err); while (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) { dsl_dataset_t *prev; err = dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev); if (err) { dsl_dataset_rele(ds, FTAG); return (err); } /* * If this is a clone, we don't need to worry about it for now. */ if (dsl_dataset_phys(prev)->ds_next_snap_obj != ds->ds_object) { dsl_dataset_rele(ds, FTAG); dsl_dataset_rele(prev, FTAG); return (0); } dsl_dataset_rele(ds, FTAG); ds = prev; } VERIFY(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds->ds_object, dsl_dataset_phys(ds)->ds_prev_snap_txg, tx) == 0); dsl_dataset_rele(ds, FTAG); return (0); } /* * Scrub/dedup interaction. * * If there are N references to a deduped block, we don't want to scrub it * N times -- ideally, we should scrub it exactly once. * * We leverage the fact that the dde's replication class (enum ddt_class) * is ordered from highest replication class (DDT_CLASS_DITTO) to lowest * (DDT_CLASS_UNIQUE) so that we may walk the DDT in that order. 
* * To prevent excess scrubbing, the scrub begins by walking the DDT * to find all blocks with refcnt > 1, and scrubs each of these once. * Since there are two replication classes which contain blocks with * refcnt > 1, we scrub the highest replication class (DDT_CLASS_DITTO) first. * Finally the top-down scrub begins, only visiting blocks with refcnt == 1. * * There would be nothing more to say if a block's refcnt couldn't change * during a scrub, but of course it can so we must account for changes * in a block's replication class. * * Here's an example of what can occur: * * If a block has refcnt > 1 during the DDT scrub phase, but has refcnt == 1 * when visited during the top-down scrub phase, it will be scrubbed twice. * This negates our scrub optimization, but is otherwise harmless. * * If a block has refcnt == 1 during the DDT scrub phase, but has refcnt > 1 * on each visit during the top-down scrub phase, it will never be scrubbed. * To catch this, ddt_sync_entry() notifies the scrub code whenever a block's * reference class transitions to a higher level (i.e DDT_CLASS_UNIQUE to * DDT_CLASS_DUPLICATE); if it transitions from refcnt == 1 to refcnt > 1 * while a scrub is in progress, it scrubs the block right then. */ static void dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx) { ddt_bookmark_t *ddb = &scn->scn_phys.scn_ddt_bookmark; ddt_entry_t dde = { 0 }; int error; uint64_t n = 0; while ((error = ddt_walk(scn->scn_dp->dp_spa, ddb, &dde)) == 0) { ddt_t *ddt; if (ddb->ddb_class > scn->scn_phys.scn_ddt_class_max) break; dprintf("visiting ddb=%llu/%llu/%llu/%llx\n", (longlong_t)ddb->ddb_class, (longlong_t)ddb->ddb_type, (longlong_t)ddb->ddb_checksum, (longlong_t)ddb->ddb_cursor); /* There should be no pending changes to the dedup table */ ddt = scn->scn_dp->dp_spa->spa_ddt[ddb->ddb_checksum]; ASSERT(avl_first(&ddt->ddt_tree) == NULL); dsl_scan_ddt_entry(scn, ddb->ddb_checksum, &dde, tx); n++; if (dsl_scan_check_suspend(scn, NULL)) break; } zfs_dbgmsg("scanned %llu ddt entries with class_max = %u; " "suspending=%u", (longlong_t)n, (int)scn->scn_phys.scn_ddt_class_max, (int)scn->scn_suspending); ASSERT(error == 0 || error == ENOENT); ASSERT(error != ENOENT || ddb->ddb_class > scn->scn_phys.scn_ddt_class_max); } /* ARGSUSED */ void dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum, ddt_entry_t *dde, dmu_tx_t *tx) { const ddt_key_t *ddk = &dde->dde_key; ddt_phys_t *ddp = dde->dde_phys; blkptr_t bp; zbookmark_phys_t zb = { 0 }; if (scn->scn_phys.scn_state != DSS_SCANNING) return; for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { if (ddp->ddp_phys_birth == 0 || ddp->ddp_phys_birth > scn->scn_phys.scn_max_txg) continue; ddt_bp_create(checksum, ddk, ddp, &bp); scn->scn_visited_this_txg++; scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp, &zb); } } static void dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx) { dsl_pool_t *dp = scn->scn_dp; zap_cursor_t zc; zap_attribute_t za; if (scn->scn_phys.scn_ddt_bookmark.ddb_class <= scn->scn_phys.scn_ddt_class_max) { scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg; scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg; dsl_scan_ddt(scn, tx); if (scn->scn_suspending) return; } if (scn->scn_phys.scn_bookmark.zb_objset == DMU_META_OBJSET) { /* First do the MOS & ORIGIN */ scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg; scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg; dsl_scan_visit_rootbp(scn, NULL, &dp->dp_meta_rootbp, tx); spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp); if (scn->scn_suspending) return; if 
(spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) { VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, enqueue_cb, tx, DS_FIND_CHILDREN)); } else { dsl_scan_visitds(scn, dp->dp_origin_snap->ds_object, tx); } ASSERT(!scn->scn_suspending); } else if (scn->scn_phys.scn_bookmark.zb_objset != ZB_DESTROYED_OBJSET) { /* * If we were suspended, continue from here. Note if the * ds we were suspended on was deleted, the zb_objset may * be -1, so we will skip this and find a new objset * below. */ dsl_scan_visitds(scn, scn->scn_phys.scn_bookmark.zb_objset, tx); if (scn->scn_suspending) return; } /* * In case we were suspended right at the end of the ds, zero the * bookmark so we don't think that we're still trying to resume. */ bzero(&scn->scn_phys.scn_bookmark, sizeof (zbookmark_phys_t)); /* keep pulling things out of the zap-object-as-queue */ while (zap_cursor_init(&zc, dp->dp_meta_objset, scn->scn_phys.scn_queue_obj), zap_cursor_retrieve(&zc, &za) == 0) { dsl_dataset_t *ds; uint64_t dsobj; dsobj = zfs_strtonum(za.za_name, NULL); VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, dsobj, tx)); /* Set up min/max txg */ VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); if (za.za_first_integer != 0) { scn->scn_phys.scn_cur_min_txg = MAX(scn->scn_phys.scn_min_txg, za.za_first_integer); } else { scn->scn_phys.scn_cur_min_txg = MAX(scn->scn_phys.scn_min_txg, dsl_dataset_phys(ds)->ds_prev_snap_txg); } scn->scn_phys.scn_cur_max_txg = dsl_scan_ds_maxtxg(ds); dsl_dataset_rele(ds, FTAG); dsl_scan_visitds(scn, dsobj, tx); zap_cursor_fini(&zc); if (scn->scn_suspending) return; } zap_cursor_fini(&zc); } static boolean_t dsl_scan_free_should_suspend(dsl_scan_t *scn) { uint64_t elapsed_nanosecs; if (zfs_recover) return (B_FALSE); if (scn->scn_visited_this_txg >= zfs_free_max_blocks) return (B_TRUE); elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time; return (elapsed_nanosecs / NANOSEC > zfs_txg_timeout || (NSEC2MSEC(elapsed_nanosecs) > zfs_free_min_time_ms && txg_sync_waiting(scn->scn_dp)) || spa_shutting_down(scn->scn_dp->dp_spa)); } static int dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { dsl_scan_t *scn = arg; if (!scn->scn_is_bptree || (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)) { if (dsl_scan_free_should_suspend(scn)) return (SET_ERROR(ERESTART)); } zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa, dmu_tx_get_txg(tx), bp, BP_GET_PSIZE(bp), 0)); dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD, -bp_get_dsize_sync(scn->scn_dp->dp_spa, bp), -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx); scn->scn_visited_this_txg++; return (0); } boolean_t dsl_scan_active(dsl_scan_t *scn) { spa_t *spa = scn->scn_dp->dp_spa; uint64_t used = 0, comp, uncomp; if (spa->spa_load_state != SPA_LOAD_NONE) return (B_FALSE); if (spa_shutting_down(spa)) return (B_FALSE); if ((scn->scn_phys.scn_state == DSS_SCANNING && !dsl_scan_is_paused_scrub(scn)) || (scn->scn_async_destroying && !scn->scn_async_stalled)) return (B_TRUE); if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) { (void) bpobj_space(&scn->scn_dp->dp_free_bpobj, &used, &comp, &uncomp); } return (used != 0); } /* Called whenever a txg syncs. */ void dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) { dsl_scan_t *scn = dp->dp_scan; spa_t *spa = dp->dp_spa; int err = 0; /* * Check for scn_restart_txg before checking spa_load_state, so * that we can restart an old-style scan while the pool is being * imported (see dsl_scan_init). 
*/ if (dsl_scan_restarting(scn, tx)) { pool_scan_func_t func = POOL_SCAN_SCRUB; dsl_scan_done(scn, B_FALSE, tx); if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) func = POOL_SCAN_RESILVER; zfs_dbgmsg("restarting scan func=%u txg=%llu", func, tx->tx_txg); dsl_scan_setup_sync(&func, tx); } /* * Only process scans in sync pass 1. */ if (spa_sync_pass(dp->dp_spa) > 1) return; /* * If the spa is shutting down, then stop scanning. This will * ensure that the scan does not dirty any new data during the * shutdown phase. */ if (spa_shutting_down(spa)) return; /* * If the scan is inactive due to a stalled async destroy, try again. */ if (!scn->scn_async_stalled && !dsl_scan_active(scn)) return; scn->scn_visited_this_txg = 0; scn->scn_suspending = B_FALSE; scn->scn_sync_start_time = gethrtime(); spa->spa_scrub_active = B_TRUE; /* * First process the async destroys. If we suspend, don't do * any scrubbing or resilvering. This ensures that there are no * async destroys while we are scanning, so the scan code doesn't * have to worry about traversing it. It is also faster to free the * blocks than to scrub them. */ if (zfs_free_bpobj_enabled && spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) { scn->scn_is_bptree = B_FALSE; scn->scn_zio_root = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); err = bpobj_iterate(&dp->dp_free_bpobj, dsl_scan_free_block_cb, scn, tx); VERIFY3U(0, ==, zio_wait(scn->scn_zio_root)); if (err != 0 && err != ERESTART) zfs_panic_recover("error %u from bpobj_iterate()", err); } if (err == 0 && spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) { ASSERT(scn->scn_async_destroying); scn->scn_is_bptree = B_TRUE; scn->scn_zio_root = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); err = bptree_iterate(dp->dp_meta_objset, dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb, scn, tx); VERIFY0(zio_wait(scn->scn_zio_root)); if (err == EIO || err == ECKSUM) { err = 0; } else if (err != 0 && err != ERESTART) { zfs_panic_recover("error %u from " "traverse_dataset_destroyed()", err); } if (bptree_is_empty(dp->dp_meta_objset, dp->dp_bptree_obj)) { /* finished; deactivate async destroy feature */ spa_feature_decr(spa, SPA_FEATURE_ASYNC_DESTROY, tx); ASSERT(!spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)); VERIFY0(zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_BPTREE_OBJ, tx)); VERIFY0(bptree_free(dp->dp_meta_objset, dp->dp_bptree_obj, tx)); dp->dp_bptree_obj = 0; scn->scn_async_destroying = B_FALSE; scn->scn_async_stalled = B_FALSE; } else { /* * If we didn't make progress, mark the async * destroy as stalled, so that we will not initiate * a spa_sync() on its behalf. Note that we only * check this if we are not finished, because if the * bptree had no blocks for us to visit, we can * finish without "making progress". */ scn->scn_async_stalled = (scn->scn_visited_this_txg == 0); } } if (scn->scn_visited_this_txg) { zfs_dbgmsg("freed %llu blocks in %llums from " "free_bpobj/bptree txg %llu; err=%d", (longlong_t)scn->scn_visited_this_txg, (longlong_t) NSEC2MSEC(gethrtime() - scn->scn_sync_start_time), (longlong_t)tx->tx_txg, err); scn->scn_visited_this_txg = 0; /* * Write out changes to the DDT that may be required as a * result of the blocks freed. This ensures that the DDT * is clean when a scrub/resilver runs. 
*/ ddt_sync(spa, tx->tx_txg); } if (err != 0) return; if (dp->dp_free_dir != NULL && !scn->scn_async_destroying && zfs_free_leak_on_eio && (dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes != 0 || dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes != 0 || dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes != 0)) { /* * We have finished background destroying, but there is still * some space left in the dp_free_dir. Transfer this leaked * space to the dp_leak_dir. */ if (dp->dp_leak_dir == NULL) { rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); (void) dsl_dir_create_sync(dp, dp->dp_root_dir, LEAK_DIR_NAME, tx); VERIFY0(dsl_pool_open_special_dir(dp, LEAK_DIR_NAME, &dp->dp_leak_dir)); rrw_exit(&dp->dp_config_rwlock, FTAG); } dsl_dir_diduse_space(dp->dp_leak_dir, DD_USED_HEAD, dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes, dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes, dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx); dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD, -dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes, -dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes, -dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx); } if (dp->dp_free_dir != NULL && !scn->scn_async_destroying) { /* finished; verify that space accounting went to zero */ ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes); ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes); ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes); } if (scn->scn_phys.scn_state != DSS_SCANNING) return; if (scn->scn_done_txg == tx->tx_txg) { ASSERT(!scn->scn_suspending); /* finished with scan. */ zfs_dbgmsg("txg %llu scan complete", tx->tx_txg); dsl_scan_done(scn, B_TRUE, tx); ASSERT3U(spa->spa_scrub_inflight, ==, 0); dsl_scan_sync_state(scn, tx); return; } if (dsl_scan_is_paused_scrub(scn)) return; if (scn->scn_phys.scn_ddt_bookmark.ddb_class <= scn->scn_phys.scn_ddt_class_max) { zfs_dbgmsg("doing scan sync txg %llu; " "ddt bm=%llu/%llu/%llu/%llx", (longlong_t)tx->tx_txg, (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class, (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type, (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum, (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor); ASSERT(scn->scn_phys.scn_bookmark.zb_objset == 0); ASSERT(scn->scn_phys.scn_bookmark.zb_object == 0); ASSERT(scn->scn_phys.scn_bookmark.zb_level == 0); ASSERT(scn->scn_phys.scn_bookmark.zb_blkid == 0); } else { zfs_dbgmsg("doing scan sync txg %llu; bm=%llu/%llu/%llu/%llu", (longlong_t)tx->tx_txg, (longlong_t)scn->scn_phys.scn_bookmark.zb_objset, (longlong_t)scn->scn_phys.scn_bookmark.zb_object, (longlong_t)scn->scn_phys.scn_bookmark.zb_level, (longlong_t)scn->scn_phys.scn_bookmark.zb_blkid); } scn->scn_zio_root = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_CANFAIL); dsl_pool_config_enter(dp, FTAG); dsl_scan_visit(scn, tx); dsl_pool_config_exit(dp, FTAG); (void) zio_wait(scn->scn_zio_root); scn->scn_zio_root = NULL; zfs_dbgmsg("visited %llu blocks in %llums", (longlong_t)scn->scn_visited_this_txg, (longlong_t)NSEC2MSEC(gethrtime() - scn->scn_sync_start_time)); if (!scn->scn_suspending) { scn->scn_done_txg = tx->tx_txg + 1; zfs_dbgmsg("txg %llu traversal complete, waiting till txg %llu", tx->tx_txg, scn->scn_done_txg); } if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { mutex_enter(&spa->spa_scrub_lock); while (spa->spa_scrub_inflight > 0) { cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); } mutex_exit(&spa->spa_scrub_lock); } dsl_scan_sync_state(scn, tx); } /* * This will start a new scan, or restart an existing one. 
*/ void dsl_resilver_restart(dsl_pool_t *dp, uint64_t txg) { if (txg == 0) { dmu_tx_t *tx; tx = dmu_tx_create_dd(dp->dp_mos_dir); VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT)); txg = dmu_tx_get_txg(tx); dp->dp_scan->scn_restart_txg = txg; dmu_tx_commit(tx); } else { dp->dp_scan->scn_restart_txg = txg; } zfs_dbgmsg("restarting resilver txg=%llu", txg); } boolean_t dsl_scan_resilvering(dsl_pool_t *dp) { return (dp->dp_scan->scn_phys.scn_state == DSS_SCANNING && dp->dp_scan->scn_phys.scn_func == POOL_SCAN_RESILVER); } /* * scrub consumers */ static void count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp) { int i; /* * If we resume after a reboot, zab will be NULL; don't record * incomplete stats in that case. */ if (zab == NULL) return; for (i = 0; i < 4; i++) { int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS; int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL; if (t & DMU_OT_NEWTYPE) t = DMU_OT_OTHER; zfs_blkstat_t *zb = &zab->zab_type[l][t]; int equal; zb->zb_count++; zb->zb_asize += BP_GET_ASIZE(bp); zb->zb_lsize += BP_GET_LSIZE(bp); zb->zb_psize += BP_GET_PSIZE(bp); zb->zb_gangs += BP_COUNT_GANG(bp); switch (BP_GET_NDVAS(bp)) { case 2: if (DVA_GET_VDEV(&bp->blk_dva[0]) == DVA_GET_VDEV(&bp->blk_dva[1])) zb->zb_ditto_2_of_2_samevdev++; break; case 3: equal = (DVA_GET_VDEV(&bp->blk_dva[0]) == DVA_GET_VDEV(&bp->blk_dva[1])) + (DVA_GET_VDEV(&bp->blk_dva[0]) == DVA_GET_VDEV(&bp->blk_dva[2])) + (DVA_GET_VDEV(&bp->blk_dva[1]) == DVA_GET_VDEV(&bp->blk_dva[2])); if (equal == 1) zb->zb_ditto_2_of_3_samevdev++; else if (equal == 3) zb->zb_ditto_3_of_3_samevdev++; break; } } } static void dsl_scan_scrub_done(zio_t *zio) { spa_t *spa = zio->io_spa; abd_free(zio->io_abd); mutex_enter(&spa->spa_scrub_lock); spa->spa_scrub_inflight--; cv_broadcast(&spa->spa_scrub_io_cv); if (zio->io_error && (zio->io_error != ECKSUM || !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) { spa->spa_dsl_pool->dp_scan->scn_phys.scn_errors++; } mutex_exit(&spa->spa_scrub_lock); } static int dsl_scan_scrub_cb(dsl_pool_t *dp, const blkptr_t *bp, const zbookmark_phys_t *zb) { dsl_scan_t *scn = dp->dp_scan; size_t size = BP_GET_PSIZE(bp); spa_t *spa = dp->dp_spa; uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp); boolean_t needs_io; int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL; unsigned int scan_delay = 0; if (phys_birth <= scn->scn_phys.scn_min_txg || phys_birth >= scn->scn_phys.scn_max_txg) return (0); count_block(dp->dp_blkstats, bp); if (BP_IS_EMBEDDED(bp)) return (0); ASSERT(DSL_SCAN_IS_SCRUB_RESILVER(scn)); if (scn->scn_phys.scn_func == POOL_SCAN_SCRUB) { zio_flags |= ZIO_FLAG_SCRUB; needs_io = B_TRUE; scan_delay = zfs_scrub_delay; } else { ASSERT3U(scn->scn_phys.scn_func, ==, POOL_SCAN_RESILVER); zio_flags |= ZIO_FLAG_RESILVER; needs_io = B_FALSE; scan_delay = zfs_resilver_delay; } /* If it's an intent log block, failure is expected. */ if (zb->zb_level == ZB_ZIL_LEVEL) zio_flags |= ZIO_FLAG_SPECULATIVE; for (int d = 0; d < BP_GET_NDVAS(bp); d++) { vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[d])); /* * Keep track of how much data we've examined so that * zpool(1M) status can make useful progress reports. */ scn->scn_phys.scn_examined += DVA_GET_ASIZE(&bp->blk_dva[d]); spa->spa_scan_pass_exam += DVA_GET_ASIZE(&bp->blk_dva[d]); /* if it's a resilver, this may not be in the target range */ if (!needs_io) { if (DVA_GET_GANG(&bp->blk_dva[d])) { /* * Gang members may be spread across multiple * vdevs, so the best estimate we have is the * scrub range, which has already been checked. 
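/*
 * Editor's worked example (standalone userland C): the in-flight cap and
 * the idle-window delay applied a little further below in
 * dsl_scan_scrub_cb() behave roughly as computed here, using the defaults
 * declared at the top of this file; the 4-vdev pool is hypothetical.
 */
#include <stdio.h>

int
main(void)
{
	unsigned top_maxinflight = 32;	/* zfs_top_maxinflight default */
	unsigned scrub_delay = 4;	/* zfs_scrub_delay default (ticks) */
	unsigned resilver_delay = 2;	/* zfs_resilver_delay default (ticks) */
	unsigned scan_idle = 50;	/* zfs_scan_idle window (ticks) */
	unsigned children = 4;		/* hypothetical: 4 top-level vdevs */

	printf("scrub I/O cap: %u * %u = %u in flight\n",
	    children, top_maxinflight, children * top_maxinflight);
	printf("if other I/O was seen within the last %u ticks, each scrub "
	    "read is delayed %u ticks (%u for resilver)\n",
	    scan_idle, scrub_delay, resilver_delay);
	return (0);
}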
* XXX -- it would be better to change our * allocation policy to ensure that all * gang members reside on the same vdev. */ needs_io = B_TRUE; } else { needs_io = vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1); } } } if (needs_io && !zfs_no_scrub_io) { vdev_t *rvd = spa->spa_root_vdev; uint64_t maxinflight = rvd->vdev_children * MAX(zfs_top_maxinflight, 1); mutex_enter(&spa->spa_scrub_lock); while (spa->spa_scrub_inflight >= maxinflight) cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); spa->spa_scrub_inflight++; mutex_exit(&spa->spa_scrub_lock); /* * If we're seeing recent (zfs_scan_idle) "important" I/Os * then throttle our workload to limit the impact of a scan. */ if (ddi_get_lbolt64() - spa->spa_last_io <= zfs_scan_idle) delay(MAX((int)scan_delay, 0)); zio_nowait(zio_read(NULL, spa, bp, abd_alloc_for_io(size, B_FALSE), size, dsl_scan_scrub_done, NULL, ZIO_PRIORITY_SCRUB, zio_flags, zb)); } /* do not relocate this block */ return (0); } /* * Called by the ZFS_IOC_POOL_SCAN ioctl to start a scrub or resilver. * Can also be called to resume a paused scrub. */ int dsl_scan(dsl_pool_t *dp, pool_scan_func_t func) { spa_t *spa = dp->dp_spa; dsl_scan_t *scn = dp->dp_scan; /* * Purge all vdev caches and probe all devices. We do this here * rather than in sync context because this requires a writer lock * on the spa_config lock, which we can't do from sync context. The * spa_scrub_reopen flag indicates that vdev_open() should not * attempt to start another scrub. */ spa_vdev_state_enter(spa, SCL_NONE); spa->spa_scrub_reopen = B_TRUE; vdev_reopen(spa->spa_root_vdev); spa->spa_scrub_reopen = B_FALSE; (void) spa_vdev_state_exit(spa, NULL, 0); if (func == POOL_SCAN_SCRUB && dsl_scan_is_paused_scrub(scn)) { /* got scrub start cmd, resume paused scrub */ int err = dsl_scrub_set_pause_resume(scn->scn_dp, POOL_SCRUB_NORMAL); if (err == 0) { - spa_event_notify(spa, NULL, ESC_ZFS_SCRUB_RESUME); + spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_RESUME); return (ECANCELED); } return (SET_ERROR(err)); } return (dsl_sync_task(spa_name(spa), dsl_scan_setup_check, dsl_scan_setup_sync, &func, 0, ZFS_SPACE_CHECK_NONE)); } static boolean_t dsl_scan_restarting(dsl_scan_t *scn, dmu_tx_t *tx) { return (scn->scn_restart_txg != 0 && scn->scn_restart_txg <= tx->tx_txg); } Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c (revision 329689) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c (revision 329690) @@ -1,7400 +1,7407 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2017 by Delphix. All rights reserved. 
* Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2013 Martin Matuska . All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Toomas Soome + * Copyright 2017 Joyent, Inc. * Copyright (c) 2017 Datto Inc. */ /* * SPA: Storage Pool Allocator * * This file contains all the routines used when modifying on-disk SPA state. * This includes opening, importing, destroying, exporting a pool, and syncing a * pool. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef _KERNEL #include #include #include #endif /* _KERNEL */ #include "zfs_prop.h" #include "zfs_comutil.h" /* Check hostid on import? */ static int check_hostid = 1; /* * The interval, in seconds, at which failed configuration cache file writes * should be retried. */ static int zfs_ccw_retry_interval = 300; SYSCTL_DECL(_vfs_zfs); SYSCTL_INT(_vfs_zfs, OID_AUTO, check_hostid, CTLFLAG_RWTUN, &check_hostid, 0, "Check hostid on import?"); TUNABLE_INT("vfs.zfs.ccw_retry_interval", &zfs_ccw_retry_interval); SYSCTL_INT(_vfs_zfs, OID_AUTO, ccw_retry_interval, CTLFLAG_RW, &zfs_ccw_retry_interval, 0, "Configuration cache file write, retry after failure, interval (seconds)"); typedef enum zti_modes { ZTI_MODE_FIXED, /* value is # of threads (min 1) */ ZTI_MODE_BATCH, /* cpu-intensive; value is ignored */ ZTI_MODE_NULL, /* don't create a taskq */ ZTI_NMODES } zti_modes_t; #define ZTI_P(n, q) { ZTI_MODE_FIXED, (n), (q) } #define ZTI_BATCH { ZTI_MODE_BATCH, 0, 1 } #define ZTI_NULL { ZTI_MODE_NULL, 0, 0 } #define ZTI_N(n) ZTI_P(n, 1) #define ZTI_ONE ZTI_N(1) typedef struct zio_taskq_info { zti_modes_t zti_mode; uint_t zti_value; uint_t zti_count; } zio_taskq_info_t; static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = { "issue", "issue_high", "intr", "intr_high" }; /* * This table defines the taskq settings for each ZFS I/O type. When * initializing a pool, we use this table to create an appropriately sized * taskq. Some operations are low volume and therefore have a small, static * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE * macros. Other operations process a large amount of data; the ZTI_BATCH * macro causes us to create a taskq oriented for throughput. Some operations * are so high frequency and short-lived that the taskq itself can become a a * point of lock contention. The ZTI_P(#, #) macro indicates that we need an * additional degree of parallelism specified by the number of threads per- * taskq and the number of taskqs; when dispatching an event in this case, the * particular taskq is chosen at random. * * The different taskq priorities are to handle the different contexts (issue * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that * need to be handled with minimum delay. 
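/*
 * A small sketch of how the ZTI_* entries are interpreted when the taskqs
 * are created (see spa_taskqs_init() further down): zti_count is the
 * number of taskqs, and zti_value is either a fixed thread count
 * (ZTI_MODE_FIXED) or, for ZTI_MODE_BATCH, is replaced by
 * zio_taskq_batch_pct percent of the CPUs via TASKQ_THREADS_CPU_PCT.
 * The struct and function names here are illustrative; ncpus and
 * batch_pct are assumed inputs.
 */
#include <stdint.h>

typedef struct zti_shape {
	unsigned ntaskqs;		/* how many taskqs to create */
	unsigned nthreads;		/* threads per taskq */
} zti_shape_t;

static zti_shape_t
zti_resolve(int mode, unsigned value, unsigned count,
    unsigned ncpus, unsigned batch_pct)
{
	zti_shape_t s = { 0, 0 };

	switch (mode) {
	case 0:	/* ZTI_MODE_FIXED: value is the thread count (min 1) */
		s.ntaskqs = count;
		s.nthreads = value > 0 ? value : 1;
		break;
	case 1:	/* ZTI_MODE_BATCH: thread count is a percentage of CPUs */
		s.ntaskqs = count;
		s.nthreads = (ncpus * batch_pct) / 100;
		if (s.nthreads == 0)
			s.nthreads = 1;
		break;
	default:	/* ZTI_MODE_NULL: no taskq at all */
		break;
	}
	return (s);
}

/*
 * Example: a ZTI_P(12, 8) entry (as in the READ row of the table below)
 * resolves to 8 taskqs of 12 threads each, while a ZTI_BATCH entry
 * resolves to a single taskq sized at zio_taskq_batch_pct (75%) of the
 * CPUs.
 */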
*/ const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { /* ISSUE ISSUE_HIGH INTR INTR_HIGH */ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */ { ZTI_N(8), ZTI_NULL, ZTI_P(12, 8), ZTI_NULL }, /* READ */ { ZTI_BATCH, ZTI_N(5), ZTI_N(8), ZTI_N(5) }, /* WRITE */ { ZTI_P(12, 8), ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FREE */ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* CLAIM */ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* IOCTL */ }; -static sysevent_t *spa_event_create(spa_t *spa, vdev_t *vd, const char *name); +static sysevent_t *spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, + const char *name); static void spa_event_post(sysevent_t *ev); static void spa_sync_version(void *arg, dmu_tx_t *tx); static void spa_sync_props(void *arg, dmu_tx_t *tx); static boolean_t spa_has_active_shared_spare(spa_t *spa); static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config, spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig, char **ereport); static void spa_vdev_resilver_done(spa_t *spa); uint_t zio_taskq_batch_pct = 75; /* 1 thread per cpu in pset */ #ifdef PSRSET_BIND id_t zio_taskq_psrset_bind = PS_NONE; #endif #ifdef SYSDC boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */ uint_t zio_taskq_basedc = 80; /* base duty cycle */ #endif boolean_t spa_create_process = B_TRUE; /* no process ==> no sysdc */ extern int zfs_sync_pass_deferred_free; /* * This (illegal) pool name is used when temporarily importing a spa_t in order * to get the vdev stats associated with the imported devices. */ #define TRYIMPORT_NAME "$import" /* * ========================================================================== * SPA properties routines * ========================================================================== */ /* * Add a (source=src, propname=propval) list to an nvlist. */ static void spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval, uint64_t intval, zprop_source_t src) { const char *propname = zpool_prop_to_name(prop); nvlist_t *propval; VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0); if (strval != NULL) VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0); else VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0); VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0); nvlist_free(propval); } /* * Get property values from the spa configuration. */ static void spa_prop_get_config(spa_t *spa, nvlist_t **nvp) { vdev_t *rvd = spa->spa_root_vdev; dsl_pool_t *pool = spa->spa_dsl_pool; uint64_t size, alloc, cap, version; zprop_source_t src = ZPROP_SRC_NONE; spa_config_dirent_t *dp; metaslab_class_t *mc = spa_normal_class(spa); ASSERT(MUTEX_HELD(&spa->spa_props_lock)); if (rvd != NULL) { alloc = metaslab_class_get_alloc(spa_normal_class(spa)); size = metaslab_class_get_space(spa_normal_class(spa)); spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src); spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src); spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src); spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL, size - alloc, src); spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL, metaslab_class_fragmentation(mc), src); spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL, metaslab_class_expandable_space(mc), src); spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL, (spa_mode(spa) == FREAD), src); cap = (size == 0) ? 
0 : (alloc * 100 / size); spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src); spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL, ddt_get_pool_dedup_ratio(spa), src); spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL, rvd->vdev_state, src); version = spa_version(spa); if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) src = ZPROP_SRC_DEFAULT; else src = ZPROP_SRC_LOCAL; spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src); } if (pool != NULL) { /* * The $FREE directory was introduced in SPA_VERSION_DEADLISTS, * when opening pools before this version freedir will be NULL. */ if (pool->dp_free_dir != NULL) { spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL, dsl_dir_phys(pool->dp_free_dir)->dd_used_bytes, src); } else { spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL, 0, src); } if (pool->dp_leak_dir != NULL) { spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL, dsl_dir_phys(pool->dp_leak_dir)->dd_used_bytes, src); } else { spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL, 0, src); } } spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src); if (spa->spa_comment != NULL) { spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment, 0, ZPROP_SRC_LOCAL); } if (spa->spa_root != NULL) spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root, 0, ZPROP_SRC_LOCAL); if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) { spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL, MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE); } else { spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL, SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE); } if ((dp = list_head(&spa->spa_config_list)) != NULL) { if (dp->scd_path == NULL) { spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, "none", 0, ZPROP_SRC_LOCAL); } else if (strcmp(dp->scd_path, spa_config_path) != 0) { spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, dp->scd_path, 0, ZPROP_SRC_LOCAL); } } } /* * Get zpool property values. */ int spa_prop_get(spa_t *spa, nvlist_t **nvp) { objset_t *mos = spa->spa_meta_objset; zap_cursor_t zc; zap_attribute_t za; int err; VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); mutex_enter(&spa->spa_props_lock); /* * Get properties from the spa config. */ spa_prop_get_config(spa, nvp); /* If no pool property object, no more prop to get. */ if (mos == NULL || spa->spa_pool_props_object == 0) { mutex_exit(&spa->spa_props_lock); return (0); } /* * Get properties from the MOS pool property object. 
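/*
 * The shape of the per-property nvlist built by spa_prop_add_list()
 * above, as a standalone userland sketch against libnvpair: each pool
 * property is a nested nvlist carrying its source and either a string
 * or a numeric value.  The ZPROP_* key macros are taken as given from
 * the zfs headers; add_prop() is an illustrative name, and error
 * handling is reduced to the first allocation.
 */
#include <stdint.h>
#include <libnvpair.h>
#include <sys/fs/zfs.h>		/* ZPROP_SOURCE, ZPROP_VALUE */

static int
add_prop(nvlist_t *nvl, const char *propname, const char *strval,
    uint64_t intval, uint64_t src)
{
	nvlist_t *propval;

	if (nvlist_alloc(&propval, NV_UNIQUE_NAME, 0) != 0)
		return (-1);
	(void) nvlist_add_uint64(propval, ZPROP_SOURCE, src);
	if (strval != NULL)
		(void) nvlist_add_string(propval, ZPROP_VALUE, strval);
	else
		(void) nvlist_add_uint64(propval, ZPROP_VALUE, intval);
	(void) nvlist_add_nvlist(nvl, propname, propval);
	nvlist_free(propval);
	return (0);
}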
*/ for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object); (err = zap_cursor_retrieve(&zc, &za)) == 0; zap_cursor_advance(&zc)) { uint64_t intval = 0; char *strval = NULL; zprop_source_t src = ZPROP_SRC_DEFAULT; zpool_prop_t prop; if ((prop = zpool_name_to_prop(za.za_name)) == ZPOOL_PROP_INVAL) continue; switch (za.za_integer_length) { case 8: /* integer property */ if (za.za_first_integer != zpool_prop_default_numeric(prop)) src = ZPROP_SRC_LOCAL; if (prop == ZPOOL_PROP_BOOTFS) { dsl_pool_t *dp; dsl_dataset_t *ds = NULL; dp = spa_get_dsl(spa); dsl_pool_config_enter(dp, FTAG); if (err = dsl_dataset_hold_obj(dp, za.za_first_integer, FTAG, &ds)) { dsl_pool_config_exit(dp, FTAG); break; } strval = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); dsl_dataset_name(ds, strval); dsl_dataset_rele(ds, FTAG); dsl_pool_config_exit(dp, FTAG); } else { strval = NULL; intval = za.za_first_integer; } spa_prop_add_list(*nvp, prop, strval, intval, src); if (strval != NULL) kmem_free(strval, ZFS_MAX_DATASET_NAME_LEN); break; case 1: /* string property */ strval = kmem_alloc(za.za_num_integers, KM_SLEEP); err = zap_lookup(mos, spa->spa_pool_props_object, za.za_name, 1, za.za_num_integers, strval); if (err) { kmem_free(strval, za.za_num_integers); break; } spa_prop_add_list(*nvp, prop, strval, 0, src); kmem_free(strval, za.za_num_integers); break; default: break; } } zap_cursor_fini(&zc); mutex_exit(&spa->spa_props_lock); out: if (err && err != ENOENT) { nvlist_free(*nvp); *nvp = NULL; return (err); } return (0); } /* * Validate the given pool properties nvlist and modify the list * for the property values to be set. */ static int spa_prop_validate(spa_t *spa, nvlist_t *props) { nvpair_t *elem; int error = 0, reset_bootfs = 0; uint64_t objnum = 0; boolean_t has_feature = B_FALSE; elem = NULL; while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { uint64_t intval; char *strval, *slash, *check, *fname; const char *propname = nvpair_name(elem); zpool_prop_t prop = zpool_name_to_prop(propname); switch (prop) { case ZPOOL_PROP_INVAL: if (!zpool_prop_feature(propname)) { error = SET_ERROR(EINVAL); break; } /* * Sanitize the input. */ if (nvpair_type(elem) != DATA_TYPE_UINT64) { error = SET_ERROR(EINVAL); break; } if (nvpair_value_uint64(elem, &intval) != 0) { error = SET_ERROR(EINVAL); break; } if (intval != 0) { error = SET_ERROR(EINVAL); break; } fname = strchr(propname, '@') + 1; if (zfeature_lookup_name(fname, NULL) != 0) { error = SET_ERROR(EINVAL); break; } has_feature = B_TRUE; break; case ZPOOL_PROP_VERSION: error = nvpair_value_uint64(elem, &intval); if (!error && (intval < spa_version(spa) || intval > SPA_VERSION_BEFORE_FEATURES || has_feature)) error = SET_ERROR(EINVAL); break; case ZPOOL_PROP_DELEGATION: case ZPOOL_PROP_AUTOREPLACE: case ZPOOL_PROP_LISTSNAPS: case ZPOOL_PROP_AUTOEXPAND: error = nvpair_value_uint64(elem, &intval); if (!error && intval > 1) error = SET_ERROR(EINVAL); break; case ZPOOL_PROP_BOOTFS: /* * If the pool version is less than SPA_VERSION_BOOTFS, * or the pool is still being created (version == 0), * the bootfs property cannot be set. 
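/*
 * A userland sketch of the "feature@..." property handling shown in
 * spa_prop_validate() above: everything after the '@' is the feature
 * name, which must be known, and the value must be a uint64 equal to 0
 * (features are only ever enabled, never set to arbitrary values).
 * The known_features[] table is a stand-in for zfeature_lookup_name().
 */
#include <string.h>
#include <stdint.h>
#include <errno.h>

static const char *known_features[] = {
	"async_destroy", "empty_bpobj", "lz4_compress", NULL
};

static int
validate_feature_prop(const char *propname, uint64_t intval)
{
	const char *fname = strchr(propname, '@');

	if (fname == NULL)
		return (EINVAL);	/* not a feature@... property */
	fname++;			/* skip the '@' itself */
	if (intval != 0)
		return (EINVAL);	/* only "enabled" (0) is accepted */
	for (int i = 0; known_features[i] != NULL; i++) {
		if (strcmp(fname, known_features[i]) == 0)
			return (0);
	}
	return (EINVAL);		/* unknown feature name */
}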
*/ if (spa_version(spa) < SPA_VERSION_BOOTFS) { error = SET_ERROR(ENOTSUP); break; } /* * Make sure the vdev config is bootable */ if (!vdev_is_bootable(spa->spa_root_vdev)) { error = SET_ERROR(ENOTSUP); break; } reset_bootfs = 1; error = nvpair_value_string(elem, &strval); if (!error) { objset_t *os; uint64_t propval; if (strval == NULL || strval[0] == '\0') { objnum = zpool_prop_default_numeric( ZPOOL_PROP_BOOTFS); break; } if (error = dmu_objset_hold(strval, FTAG, &os)) break; /* * Must be ZPL, and its property settings * must be supported by GRUB (compression * is not gzip, and large blocks are not used). */ if (dmu_objset_type(os) != DMU_OST_ZFS) { error = SET_ERROR(ENOTSUP); } else if ((error = dsl_prop_get_int_ds(dmu_objset_ds(os), zfs_prop_to_name(ZFS_PROP_COMPRESSION), &propval)) == 0 && !BOOTFS_COMPRESS_VALID(propval)) { error = SET_ERROR(ENOTSUP); } else { objnum = dmu_objset_id(os); } dmu_objset_rele(os, FTAG); } break; case ZPOOL_PROP_FAILUREMODE: error = nvpair_value_uint64(elem, &intval); if (!error && (intval < ZIO_FAILURE_MODE_WAIT || intval > ZIO_FAILURE_MODE_PANIC)) error = SET_ERROR(EINVAL); /* * This is a special case which only occurs when * the pool has completely failed. This allows * the user to change the in-core failmode property * without syncing it out to disk (I/Os might * currently be blocked). We do this by returning * EIO to the caller (spa_prop_set) to trick it * into thinking we encountered a property validation * error. */ if (!error && spa_suspended(spa)) { spa->spa_failmode = intval; error = SET_ERROR(EIO); } break; case ZPOOL_PROP_CACHEFILE: if ((error = nvpair_value_string(elem, &strval)) != 0) break; if (strval[0] == '\0') break; if (strcmp(strval, "none") == 0) break; if (strval[0] != '/') { error = SET_ERROR(EINVAL); break; } slash = strrchr(strval, '/'); ASSERT(slash != NULL); if (slash[1] == '\0' || strcmp(slash, "/.") == 0 || strcmp(slash, "/..") == 0) error = SET_ERROR(EINVAL); break; case ZPOOL_PROP_COMMENT: if ((error = nvpair_value_string(elem, &strval)) != 0) break; for (check = strval; *check != '\0'; check++) { /* * The kernel doesn't have an easy isprint() * check. For this kernel check, we merely * check ASCII apart from DEL. Fix this if * there is an easy-to-use kernel isprint(). 
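/*
 * Standalone restatement of the cachefile checks in spa_prop_validate()
 * above, for illustration: an empty string and "none" are accepted
 * as-is, anything else must be an absolute path whose final component
 * is not empty, "." or "..".  valid_cachefile() is an illustrative name.
 */
#include <string.h>

static int
valid_cachefile(const char *path)
{
	const char *slash;

	if (path[0] == '\0' || strcmp(path, "none") == 0)
		return (1);		/* special values, always fine */
	if (path[0] != '/')
		return (0);		/* must be absolute */
	slash = strrchr(path, '/');	/* cannot be NULL here */
	if (slash[1] == '\0' ||
	    strcmp(slash, "/.") == 0 || strcmp(slash, "/..") == 0)
		return (0);		/* no empty, "." or ".." tail */
	return (1);
}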
*/ if (*check >= 0x7f) { error = SET_ERROR(EINVAL); break; } } if (strlen(strval) > ZPROP_MAX_COMMENT) error = E2BIG; break; case ZPOOL_PROP_DEDUPDITTO: if (spa_version(spa) < SPA_VERSION_DEDUP) error = SET_ERROR(ENOTSUP); else error = nvpair_value_uint64(elem, &intval); if (error == 0 && intval != 0 && intval < ZIO_DEDUPDITTO_MIN) error = SET_ERROR(EINVAL); break; } if (error) break; } if (!error && reset_bootfs) { error = nvlist_remove(props, zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING); if (!error) { error = nvlist_add_uint64(props, zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum); } } return (error); } void spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync) { char *cachefile; spa_config_dirent_t *dp; if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), &cachefile) != 0) return; dp = kmem_alloc(sizeof (spa_config_dirent_t), KM_SLEEP); if (cachefile[0] == '\0') dp->scd_path = spa_strdup(spa_config_path); else if (strcmp(cachefile, "none") == 0) dp->scd_path = NULL; else dp->scd_path = spa_strdup(cachefile); list_insert_head(&spa->spa_config_list, dp); if (need_sync) spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); } int spa_prop_set(spa_t *spa, nvlist_t *nvp) { int error; nvpair_t *elem = NULL; boolean_t need_sync = B_FALSE; if ((error = spa_prop_validate(spa, nvp)) != 0) return (error); while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) { zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem)); if (prop == ZPOOL_PROP_CACHEFILE || prop == ZPOOL_PROP_ALTROOT || prop == ZPOOL_PROP_READONLY) continue; if (prop == ZPOOL_PROP_VERSION || prop == ZPOOL_PROP_INVAL) { uint64_t ver; if (prop == ZPOOL_PROP_VERSION) { VERIFY(nvpair_value_uint64(elem, &ver) == 0); } else { ASSERT(zpool_prop_feature(nvpair_name(elem))); ver = SPA_VERSION_FEATURES; need_sync = B_TRUE; } /* Save time if the version is already set. */ if (ver == spa_version(spa)) continue; /* * In addition to the pool directory object, we might * create the pool properties object, the features for * read object, the features for write object, or the * feature descriptions object. */ error = dsl_sync_task(spa->spa_name, NULL, spa_sync_version, &ver, 6, ZFS_SPACE_CHECK_RESERVED); if (error) return (error); continue; } need_sync = B_TRUE; break; } if (need_sync) { return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props, nvp, 6, ZFS_SPACE_CHECK_RESERVED)); } return (0); } /* * If the bootfs property value is dsobj, clear it. 
*/ void spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) { if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) { VERIFY(zap_remove(spa->spa_meta_objset, spa->spa_pool_props_object, zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0); spa->spa_bootfs = 0; } } /*ARGSUSED*/ static int spa_change_guid_check(void *arg, dmu_tx_t *tx) { uint64_t *newguid = arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; vdev_t *rvd = spa->spa_root_vdev; uint64_t vdev_state; spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); vdev_state = rvd->vdev_state; spa_config_exit(spa, SCL_STATE, FTAG); if (vdev_state != VDEV_STATE_HEALTHY) return (SET_ERROR(ENXIO)); ASSERT3U(spa_guid(spa), !=, *newguid); return (0); } static void spa_change_guid_sync(void *arg, dmu_tx_t *tx) { uint64_t *newguid = arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; uint64_t oldguid; vdev_t *rvd = spa->spa_root_vdev; oldguid = spa_guid(spa); spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); rvd->vdev_guid = *newguid; rvd->vdev_guid_sum += (*newguid - oldguid); vdev_config_dirty(rvd); spa_config_exit(spa, SCL_STATE, FTAG); spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu", oldguid, *newguid); } /* * Change the GUID for the pool. This is done so that we can later * re-import a pool built from a clone of our own vdevs. We will modify * the root vdev's guid, our own pool guid, and then mark all of our * vdevs dirty. Note that we must make sure that all our vdevs are * online when we do this, or else any vdevs that weren't present * would be orphaned from our pool. We are also going to issue a * sysevent to update any watchers. */ int spa_change_guid(spa_t *spa) { int error; uint64_t guid; mutex_enter(&spa->spa_vdev_top_lock); mutex_enter(&spa_namespace_lock); guid = spa_generate_guid(NULL); error = dsl_sync_task(spa->spa_name, spa_change_guid_check, spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED); if (error == 0) { spa_config_sync(spa, B_FALSE, B_TRUE); - spa_event_notify(spa, NULL, ESC_ZFS_POOL_REGUID); + spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_REGUID); } mutex_exit(&spa_namespace_lock); mutex_exit(&spa->spa_vdev_top_lock); return (error); } /* * ========================================================================== * SPA state manipulation (open/create/destroy/import/export) * ========================================================================== */ static int spa_error_entry_compare(const void *a, const void *b) { spa_error_entry_t *sa = (spa_error_entry_t *)a; spa_error_entry_t *sb = (spa_error_entry_t *)b; int ret; ret = bcmp(&sa->se_bookmark, &sb->se_bookmark, sizeof (zbookmark_phys_t)); if (ret < 0) return (-1); else if (ret > 0) return (1); else return (0); } /* * Utility function which retrieves copies of the current logs and * re-initializes them in the process. 
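/*
 * The comparator pattern used by spa_error_entry_compare() above: AVL
 * ordering functions must return exactly -1, 0 or 1, so the raw
 * bcmp()/memcmp() result is normalized before being handed back.  The
 * ordering is byte-wise, which is fine because the tree only needs a
 * consistent total order.  bookmark_t and cmp_bookmark() are
 * illustrative names.
 */
#include <string.h>

typedef struct bookmark {
	unsigned long long objset, object, level, blkid;
} bookmark_t;

static int
cmp_bookmark(const void *a, const void *b)
{
	int ret = memcmp(a, b, sizeof (bookmark_t));

	if (ret < 0)
		return (-1);
	if (ret > 0)
		return (1);
	return (0);
}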
*/ void spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) { ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t)); bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t)); avl_create(&spa->spa_errlist_scrub, spa_error_entry_compare, sizeof (spa_error_entry_t), offsetof(spa_error_entry_t, se_avl)); avl_create(&spa->spa_errlist_last, spa_error_entry_compare, sizeof (spa_error_entry_t), offsetof(spa_error_entry_t, se_avl)); } static void spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) { const zio_taskq_info_t *ztip = &zio_taskqs[t][q]; enum zti_modes mode = ztip->zti_mode; uint_t value = ztip->zti_value; uint_t count = ztip->zti_count; spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; char name[32]; uint_t flags = 0; boolean_t batch = B_FALSE; if (mode == ZTI_MODE_NULL) { tqs->stqs_count = 0; tqs->stqs_taskq = NULL; return; } ASSERT3U(count, >, 0); tqs->stqs_count = count; tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP); switch (mode) { case ZTI_MODE_FIXED: ASSERT3U(value, >=, 1); value = MAX(value, 1); break; case ZTI_MODE_BATCH: batch = B_TRUE; flags |= TASKQ_THREADS_CPU_PCT; value = zio_taskq_batch_pct; break; default: panic("unrecognized mode for %s_%s taskq (%u:%u) in " "spa_activate()", zio_type_name[t], zio_taskq_types[q], mode, value); break; } for (uint_t i = 0; i < count; i++) { taskq_t *tq; if (count > 1) { (void) snprintf(name, sizeof (name), "%s_%s_%u", zio_type_name[t], zio_taskq_types[q], i); } else { (void) snprintf(name, sizeof (name), "%s_%s", zio_type_name[t], zio_taskq_types[q]); } #ifdef SYSDC if (zio_taskq_sysdc && spa->spa_proc != &p0) { if (batch) flags |= TASKQ_DC_BATCH; tq = taskq_create_sysdc(name, value, 50, INT_MAX, spa->spa_proc, zio_taskq_basedc, flags); } else { #endif pri_t pri = maxclsyspri; /* * The write issue taskq can be extremely CPU * intensive. Run it at slightly lower priority * than the other taskqs. * FreeBSD notes: * - numerically higher priorities are lower priorities; * - if priorities divided by four (RQ_PPQ) are equal * then a difference between them is insignificant. */ if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE) #ifdef illumos pri--; #else pri += 4; #endif tq = taskq_create_proc(name, value, pri, 50, INT_MAX, spa->spa_proc, flags); #ifdef SYSDC } #endif tqs->stqs_taskq[i] = tq; } } static void spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q) { spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; if (tqs->stqs_taskq == NULL) { ASSERT0(tqs->stqs_count); return; } for (uint_t i = 0; i < tqs->stqs_count; i++) { ASSERT3P(tqs->stqs_taskq[i], !=, NULL); taskq_destroy(tqs->stqs_taskq[i]); } kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *)); tqs->stqs_taskq = NULL; } /* * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority. * Note that a type may have multiple discrete taskqs to avoid lock contention * on the taskq itself. In that case we choose which taskq at random by using * the low bits of gethrtime(). 
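/*
 * Sketch of the taskq-selection step described in the comment above and
 * implemented in spa_taskq_dispatch_ent() below: when a (type, queue)
 * pair has several taskqs, one is picked by reducing a cheap time
 * source modulo the count, spreading dispatches without any shared
 * state.  pick_taskq() and now_ticks() are illustrative names.
 */
#include <stdint.h>
#include <time.h>

static uint64_t
now_ticks(void)
{
	/* Userland stand-in for cpu_ticks()/gethrtime(). */
	struct timespec ts;

	(void) clock_gettime(CLOCK_MONOTONIC, &ts);
	return ((uint64_t)ts.tv_sec * 1000000000ULL + (uint64_t)ts.tv_nsec);
}

static unsigned
pick_taskq(unsigned count)
{
	return (count == 1 ? 0 : (unsigned)(now_ticks() % count));
}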
*/ void spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent) { spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; taskq_t *tq; ASSERT3P(tqs->stqs_taskq, !=, NULL); ASSERT3U(tqs->stqs_count, !=, 0); if (tqs->stqs_count == 1) { tq = tqs->stqs_taskq[0]; } else { #ifdef _KERNEL tq = tqs->stqs_taskq[cpu_ticks() % tqs->stqs_count]; #else tq = tqs->stqs_taskq[gethrtime() % tqs->stqs_count]; #endif } taskq_dispatch_ent(tq, func, arg, flags, ent); } static void spa_create_zio_taskqs(spa_t *spa) { for (int t = 0; t < ZIO_TYPES; t++) { for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { spa_taskqs_init(spa, t, q); } } } #ifdef _KERNEL #ifdef SPA_PROCESS static void spa_thread(void *arg) { callb_cpr_t cprinfo; spa_t *spa = arg; user_t *pu = PTOU(curproc); CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr, spa->spa_name); ASSERT(curproc != &p0); (void) snprintf(pu->u_psargs, sizeof (pu->u_psargs), "zpool-%s", spa->spa_name); (void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm)); #ifdef PSRSET_BIND /* bind this thread to the requested psrset */ if (zio_taskq_psrset_bind != PS_NONE) { pool_lock(); mutex_enter(&cpu_lock); mutex_enter(&pidlock); mutex_enter(&curproc->p_lock); if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind, 0, NULL, NULL) == 0) { curthread->t_bind_pset = zio_taskq_psrset_bind; } else { cmn_err(CE_WARN, "Couldn't bind process for zfs pool \"%s\" to " "pset %d\n", spa->spa_name, zio_taskq_psrset_bind); } mutex_exit(&curproc->p_lock); mutex_exit(&pidlock); mutex_exit(&cpu_lock); pool_unlock(); } #endif #ifdef SYSDC if (zio_taskq_sysdc) { sysdc_thread_enter(curthread, 100, 0); } #endif spa->spa_proc = curproc; spa->spa_did = curthread->t_did; spa_create_zio_taskqs(spa); mutex_enter(&spa->spa_proc_lock); ASSERT(spa->spa_proc_state == SPA_PROC_CREATED); spa->spa_proc_state = SPA_PROC_ACTIVE; cv_broadcast(&spa->spa_proc_cv); CALLB_CPR_SAFE_BEGIN(&cprinfo); while (spa->spa_proc_state == SPA_PROC_ACTIVE) cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock); ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE); spa->spa_proc_state = SPA_PROC_GONE; spa->spa_proc = &p0; cv_broadcast(&spa->spa_proc_cv); CALLB_CPR_EXIT(&cprinfo); /* drops spa_proc_lock */ mutex_enter(&curproc->p_lock); lwp_exit(); } #endif /* SPA_PROCESS */ #endif /* * Activate an uninitialized pool. */ static void spa_activate(spa_t *spa, int mode) { ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); spa->spa_state = POOL_STATE_ACTIVE; spa->spa_mode = mode; spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops); spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops); /* Try to create a covering process */ mutex_enter(&spa->spa_proc_lock); ASSERT(spa->spa_proc_state == SPA_PROC_NONE); ASSERT(spa->spa_proc == &p0); spa->spa_did = 0; #ifdef SPA_PROCESS /* Only create a process if we're going to be around a while. 
*/ if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) { if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri, NULL, 0) == 0) { spa->spa_proc_state = SPA_PROC_CREATED; while (spa->spa_proc_state == SPA_PROC_CREATED) { cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); } ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); ASSERT(spa->spa_proc != &p0); ASSERT(spa->spa_did != 0); } else { #ifdef _KERNEL cmn_err(CE_WARN, "Couldn't create process for zfs pool \"%s\"\n", spa->spa_name); #endif } } #endif /* SPA_PROCESS */ mutex_exit(&spa->spa_proc_lock); /* If we didn't create a process, we need to create our taskqs. */ ASSERT(spa->spa_proc == &p0); if (spa->spa_proc == &p0) { spa_create_zio_taskqs(spa); } /* * Start TRIM thread. */ trim_thread_create(spa); list_create(&spa->spa_config_dirty_list, sizeof (vdev_t), offsetof(vdev_t, vdev_config_dirty_node)); list_create(&spa->spa_evicting_os_list, sizeof (objset_t), offsetof(objset_t, os_evicting_node)); list_create(&spa->spa_state_dirty_list, sizeof (vdev_t), offsetof(vdev_t, vdev_state_dirty_node)); txg_list_create(&spa->spa_vdev_txg_list, spa, offsetof(struct vdev, vdev_txg_node)); avl_create(&spa->spa_errlist_scrub, spa_error_entry_compare, sizeof (spa_error_entry_t), offsetof(spa_error_entry_t, se_avl)); avl_create(&spa->spa_errlist_last, spa_error_entry_compare, sizeof (spa_error_entry_t), offsetof(spa_error_entry_t, se_avl)); } /* * Opposite of spa_activate(). */ static void spa_deactivate(spa_t *spa) { ASSERT(spa->spa_sync_on == B_FALSE); ASSERT(spa->spa_dsl_pool == NULL); ASSERT(spa->spa_root_vdev == NULL); ASSERT(spa->spa_async_zio_root == NULL); ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); /* * Stop TRIM thread in case spa_unload() wasn't called directly * before spa_deactivate(). */ trim_thread_destroy(spa); spa_evicting_os_wait(spa); txg_list_destroy(&spa->spa_vdev_txg_list); list_destroy(&spa->spa_config_dirty_list); list_destroy(&spa->spa_evicting_os_list); list_destroy(&spa->spa_state_dirty_list); for (int t = 0; t < ZIO_TYPES; t++) { for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { spa_taskqs_fini(spa, t, q); } } metaslab_class_destroy(spa->spa_normal_class); spa->spa_normal_class = NULL; metaslab_class_destroy(spa->spa_log_class); spa->spa_log_class = NULL; /* * If this was part of an import or the open otherwise failed, we may * still have errors left in the queues. Empty them just in case. */ spa_errlog_drain(spa); avl_destroy(&spa->spa_errlist_scrub); avl_destroy(&spa->spa_errlist_last); spa->spa_state = POOL_STATE_UNINITIALIZED; mutex_enter(&spa->spa_proc_lock); if (spa->spa_proc_state != SPA_PROC_NONE) { ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); spa->spa_proc_state = SPA_PROC_DEACTIVATE; cv_broadcast(&spa->spa_proc_cv); while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) { ASSERT(spa->spa_proc != &p0); cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); } ASSERT(spa->spa_proc_state == SPA_PROC_GONE); spa->spa_proc_state = SPA_PROC_NONE; } ASSERT(spa->spa_proc == &p0); mutex_exit(&spa->spa_proc_lock); #ifdef SPA_PROCESS /* * We want to make sure spa_thread() has actually exited the ZFS * module, so that the module can't be unloaded out from underneath * it. */ if (spa->spa_did != 0) { thread_join(spa->spa_did); spa->spa_did = 0; } #endif /* SPA_PROCESS */ } /* * Verify a pool configuration, and construct the vdev tree appropriately. This * will create all the necessary vdevs in the appropriate layout, with each vdev * in the CLOSED state. This will prep the pool before open/creation/import. 
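/*
 * Minimal model of the list_create(size, offset) pattern used in
 * spa_activate() above: the list records the offset of an embedded node
 * inside each object, so it can convert between object pointers and
 * node pointers without separate allocations.  The names (node_t,
 * xlist_t, ...) are illustrative; the real interfaces live in
 * sys/list.h.
 */
#include <stddef.h>
#include <stdint.h>

typedef struct node {
	struct node *next;
	struct node *prev;
} node_t;

typedef struct xlist {
	size_t	xl_offset;		/* offset of node_t within the object */
	node_t	xl_head;		/* circular sentinel */
} xlist_t;

static void
xlist_create(xlist_t *l, size_t size, size_t offset)
{
	(void) size;			/* kept for parity with list_create() */
	l->xl_offset = offset;
	l->xl_head.next = l->xl_head.prev = &l->xl_head;
}

static void
xlist_insert_head(xlist_t *l, void *object)
{
	node_t *n = (node_t *)((uint8_t *)object + l->xl_offset);

	n->next = l->xl_head.next;
	n->prev = &l->xl_head;
	l->xl_head.next->prev = n;
	l->xl_head.next = n;
}

/*
 * Usage mirrors the call above:
 *	xlist_create(&dirty, sizeof (vdev_t),
 *	    offsetof(vdev_t, vdev_config_dirty_node));
 */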
* All vdev validation is done by the vdev_alloc() routine. */ static int spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, int atype) { nvlist_t **child; uint_t children; int error; if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) return (error); if ((*vdp)->vdev_ops->vdev_op_leaf) return (0); error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children); if (error == ENOENT) return (0); if (error) { vdev_free(*vdp); *vdp = NULL; return (SET_ERROR(EINVAL)); } for (int c = 0; c < children; c++) { vdev_t *vd; if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, atype)) != 0) { vdev_free(*vdp); *vdp = NULL; return (error); } } ASSERT(*vdp != NULL); return (0); } /* * Opposite of spa_load(). */ static void spa_unload(spa_t *spa) { int i; ASSERT(MUTEX_HELD(&spa_namespace_lock)); /* * Stop TRIM thread. */ trim_thread_destroy(spa); /* * Stop async tasks. */ spa_async_suspend(spa); /* * Stop syncing. */ if (spa->spa_sync_on) { txg_sync_stop(spa->spa_dsl_pool); spa->spa_sync_on = B_FALSE; } /* * Even though vdev_free() also calls vdev_metaslab_fini, we need * to call it earlier, before we wait for async i/o to complete. * This ensures that there is no async metaslab prefetching, by * calling taskq_wait(mg_taskq). */ if (spa->spa_root_vdev != NULL) { spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) vdev_metaslab_fini(spa->spa_root_vdev->vdev_child[c]); spa_config_exit(spa, SCL_ALL, FTAG); } /* * Wait for any outstanding async I/O to complete. */ if (spa->spa_async_zio_root != NULL) { for (int i = 0; i < max_ncpus; i++) (void) zio_wait(spa->spa_async_zio_root[i]); kmem_free(spa->spa_async_zio_root, max_ncpus * sizeof (void *)); spa->spa_async_zio_root = NULL; } bpobj_close(&spa->spa_deferred_bpobj); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); /* * Close all vdevs. */ if (spa->spa_root_vdev) vdev_free(spa->spa_root_vdev); ASSERT(spa->spa_root_vdev == NULL); /* * Close the dsl pool. */ if (spa->spa_dsl_pool) { dsl_pool_close(spa->spa_dsl_pool); spa->spa_dsl_pool = NULL; spa->spa_meta_objset = NULL; } ddt_unload(spa); /* * Drop and purge level 2 cache */ spa_l2cache_drop(spa); for (i = 0; i < spa->spa_spares.sav_count; i++) vdev_free(spa->spa_spares.sav_vdevs[i]); if (spa->spa_spares.sav_vdevs) { kmem_free(spa->spa_spares.sav_vdevs, spa->spa_spares.sav_count * sizeof (void *)); spa->spa_spares.sav_vdevs = NULL; } if (spa->spa_spares.sav_config) { nvlist_free(spa->spa_spares.sav_config); spa->spa_spares.sav_config = NULL; } spa->spa_spares.sav_count = 0; for (i = 0; i < spa->spa_l2cache.sav_count; i++) { vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]); vdev_free(spa->spa_l2cache.sav_vdevs[i]); } if (spa->spa_l2cache.sav_vdevs) { kmem_free(spa->spa_l2cache.sav_vdevs, spa->spa_l2cache.sav_count * sizeof (void *)); spa->spa_l2cache.sav_vdevs = NULL; } if (spa->spa_l2cache.sav_config) { nvlist_free(spa->spa_l2cache.sav_config); spa->spa_l2cache.sav_config = NULL; } spa->spa_l2cache.sav_count = 0; spa->spa_async_suspended = 0; if (spa->spa_comment != NULL) { spa_strfree(spa->spa_comment); spa->spa_comment = NULL; } spa_config_exit(spa, SCL_ALL, FTAG); } /* * Load (or re-load) the current list of vdevs describing the active spares for * this pool. When this is called, we have some form of basic information in * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and * then re-generate a more complete list including status information. 
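/*
 * Shape of the recursive descent in spa_config_parse() above, reduced to
 * a generic tree build: allocate the current node, recurse over its
 * children, and on any child failure free everything built so far and
 * hand back NULL.  cfg_t/tnode_t and the helpers are illustrative types
 * standing in for nvlists and vdevs.
 */
#include <stdlib.h>

typedef struct cfg {
	int		nchildren;
	struct cfg	**children;
} cfg_t;

typedef struct tnode {
	int		nchildren;
	struct tnode	**children;
} tnode_t;

static void
tree_free(tnode_t *t)
{
	if (t == NULL)
		return;
	for (int c = 0; c < t->nchildren; c++)
		tree_free(t->children[c]);
	free(t->children);
	free(t);
}

static int
tree_parse(const cfg_t *cfg, tnode_t **out)
{
	tnode_t *t = calloc(1, sizeof (*t));

	if (t == NULL)
		return (-1);
	*out = t;
	if (cfg->nchildren == 0)	/* leaf: nothing more to do */
		return (0);
	t->children = calloc(cfg->nchildren, sizeof (tnode_t *));
	if (t->children == NULL) {
		tree_free(t);
		*out = NULL;
		return (-1);
	}
	t->nchildren = cfg->nchildren;
	for (int c = 0; c < cfg->nchildren; c++) {
		if (tree_parse(cfg->children[c], &t->children[c]) != 0) {
			/* Same unwinding as spa_config_parse(): drop it all. */
			tree_free(t);
			*out = NULL;
			return (-1);
		}
	}
	return (0);
}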
*/ static void spa_load_spares(spa_t *spa) { nvlist_t **spares; uint_t nspares; int i; vdev_t *vd, *tvd; ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); /* * First, close and free any existing spare vdevs. */ for (i = 0; i < spa->spa_spares.sav_count; i++) { vd = spa->spa_spares.sav_vdevs[i]; /* Undo the call to spa_activate() below */ if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, B_FALSE)) != NULL && tvd->vdev_isspare) spa_spare_remove(tvd); vdev_close(vd); vdev_free(vd); } if (spa->spa_spares.sav_vdevs) kmem_free(spa->spa_spares.sav_vdevs, spa->spa_spares.sav_count * sizeof (void *)); if (spa->spa_spares.sav_config == NULL) nspares = 0; else VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); spa->spa_spares.sav_count = (int)nspares; spa->spa_spares.sav_vdevs = NULL; if (nspares == 0) return; /* * Construct the array of vdevs, opening them to get status in the * process. For each spare, there is potentially two different vdev_t * structures associated with it: one in the list of spares (used only * for basic validation purposes) and one in the active vdev * configuration (if it's spared in). During this phase we open and * validate each vdev on the spare list. If the vdev also exists in the * active configuration, then we also mark this vdev as an active spare. */ spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *), KM_SLEEP); for (i = 0; i < spa->spa_spares.sav_count; i++) { VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0, VDEV_ALLOC_SPARE) == 0); ASSERT(vd != NULL); spa->spa_spares.sav_vdevs[i] = vd; if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, B_FALSE)) != NULL) { if (!tvd->vdev_isspare) spa_spare_add(tvd); /* * We only mark the spare active if we were successfully * able to load the vdev. Otherwise, importing a pool * with a bad active spare would result in strange * behavior, because multiple pool would think the spare * is actively in use. * * There is a vulnerability here to an equally bizarre * circumstance, where a dead active spare is later * brought back to life (onlined or otherwise). Given * the rarity of this scenario, and the extra complexity * it adds, we ignore the possibility. */ if (!vdev_is_dead(tvd)) spa_spare_activate(tvd); } vd->vdev_top = vd; vd->vdev_aux = &spa->spa_spares; if (vdev_open(vd) != 0) continue; if (vdev_validate_aux(vd) == 0) spa_spare_add(vd); } /* * Recompute the stashed list of spares, with status information * this time. */ VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *), KM_SLEEP); for (i = 0; i < spa->spa_spares.sav_count; i++) spares[i] = vdev_config_generate(spa, spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE); VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0); for (i = 0; i < spa->spa_spares.sav_count; i++) nvlist_free(spares[i]); kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *)); } /* * Load (or re-load) the current list of vdevs describing the active l2cache for * this pool. When this is called, we have some form of basic information in * 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and * then re-generate a more complete list including status information. * Devices which are already active have their details maintained, and are * not re-opened. 
*/ static void spa_load_l2cache(spa_t *spa) { nvlist_t **l2cache; uint_t nl2cache; int i, j, oldnvdevs; uint64_t guid; vdev_t *vd, **oldvdevs, **newvdevs; spa_aux_vdev_t *sav = &spa->spa_l2cache; ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); if (sav->sav_config != NULL) { VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP); } else { nl2cache = 0; newvdevs = NULL; } oldvdevs = sav->sav_vdevs; oldnvdevs = sav->sav_count; sav->sav_vdevs = NULL; sav->sav_count = 0; /* * Process new nvlist of vdevs. */ for (i = 0; i < nl2cache; i++) { VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID, &guid) == 0); newvdevs[i] = NULL; for (j = 0; j < oldnvdevs; j++) { vd = oldvdevs[j]; if (vd != NULL && guid == vd->vdev_guid) { /* * Retain previous vdev for add/remove ops. */ newvdevs[i] = vd; oldvdevs[j] = NULL; break; } } if (newvdevs[i] == NULL) { /* * Create new vdev */ VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0, VDEV_ALLOC_L2CACHE) == 0); ASSERT(vd != NULL); newvdevs[i] = vd; /* * Commit this vdev as an l2cache device, * even if it fails to open. */ spa_l2cache_add(vd); vd->vdev_top = vd; vd->vdev_aux = sav; spa_l2cache_activate(vd); if (vdev_open(vd) != 0) continue; (void) vdev_validate_aux(vd); if (!vdev_is_dead(vd)) l2arc_add_vdev(spa, vd); } } /* * Purge vdevs that were dropped */ for (i = 0; i < oldnvdevs; i++) { uint64_t pool; vd = oldvdevs[i]; if (vd != NULL) { ASSERT(vd->vdev_isl2cache); if (spa_l2cache_exists(vd->vdev_guid, &pool) && pool != 0ULL && l2arc_vdev_present(vd)) l2arc_remove_vdev(vd); vdev_clear_stats(vd); vdev_free(vd); } } if (oldvdevs) kmem_free(oldvdevs, oldnvdevs * sizeof (void *)); if (sav->sav_config == NULL) goto out; sav->sav_vdevs = newvdevs; sav->sav_count = (int)nl2cache; /* * Recompute the stashed list of l2cache devices, with status * information this time. */ VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); for (i = 0; i < sav->sav_count; i++) l2cache[i] = vdev_config_generate(spa, sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE); VERIFY(nvlist_add_nvlist_array(sav->sav_config, ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0); out: for (i = 0; i < sav->sav_count; i++) nvlist_free(l2cache[i]); if (sav->sav_count) kmem_free(l2cache, sav->sav_count * sizeof (void *)); } static int load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) { dmu_buf_t *db; char *packed = NULL; size_t nvsize = 0; int error; *value = NULL; error = dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db); if (error != 0) return (error); nvsize = *(uint64_t *)db->db_data; dmu_buf_rele(db, FTAG); packed = kmem_alloc(nvsize, KM_SLEEP); error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed, DMU_READ_PREFETCH); if (error == 0) error = nvlist_unpack(packed, nvsize, value, 0); kmem_free(packed, nvsize); return (error); } /* * Checks to see if the given vdev could not be opened, in which case we post a * sysevent to notify the autoreplace code that the device has been removed. 
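/*
 * The guid-matching reuse done in spa_load_l2cache() above, as a plain
 * array sketch: devices named in the new config keep their existing
 * open object if one with the same guid is already present, and
 * whatever remains in the old array afterwards is the set to tear down.
 * cache_dev_t and reuse_by_guid() are illustrative names.
 */
#include <stdint.h>
#include <stddef.h>

typedef struct cache_dev {
	uint64_t guid;
	/* ... open device state ... */
} cache_dev_t;

static void
reuse_by_guid(const uint64_t *wanted, int nwanted,
    cache_dev_t **olddevs, int nold, cache_dev_t **newdevs)
{
	for (int i = 0; i < nwanted; i++) {
		newdevs[i] = NULL;	/* caller creates one if no match */
		for (int j = 0; j < nold; j++) {
			if (olddevs[j] != NULL &&
			    olddevs[j]->guid == wanted[i]) {
				newdevs[i] = olddevs[j];
				olddevs[j] = NULL;	/* don't purge it */
				break;
			}
		}
	}
}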
*/ static void spa_check_removed(vdev_t *vd) { for (int c = 0; c < vd->vdev_children; c++) spa_check_removed(vd->vdev_child[c]); if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) && !vd->vdev_ishole) { zfs_post_autoreplace(vd->vdev_spa, vd); - spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK); + spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_CHECK); } } static void spa_config_valid_zaps(vdev_t *vd, vdev_t *mvd) { ASSERT3U(vd->vdev_children, ==, mvd->vdev_children); vd->vdev_top_zap = mvd->vdev_top_zap; vd->vdev_leaf_zap = mvd->vdev_leaf_zap; for (uint64_t i = 0; i < vd->vdev_children; i++) { spa_config_valid_zaps(vd->vdev_child[i], mvd->vdev_child[i]); } } /* * Validate the current config against the MOS config */ static boolean_t spa_config_valid(spa_t *spa, nvlist_t *config) { vdev_t *mrvd, *rvd = spa->spa_root_vdev; nvlist_t *nv; VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0); ASSERT3U(rvd->vdev_children, ==, mrvd->vdev_children); /* * If we're doing a normal import, then build up any additional * diagnostic information about missing devices in this config. * We'll pass this up to the user for further processing. */ if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) { nvlist_t **child, *nv; uint64_t idx = 0; child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **), KM_SLEEP); VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; vdev_t *mtvd = mrvd->vdev_child[c]; if (tvd->vdev_ops == &vdev_missing_ops && mtvd->vdev_ops != &vdev_missing_ops && mtvd->vdev_islog) child[idx++] = vdev_config_generate(spa, mtvd, B_FALSE, 0); } if (idx) { VERIFY(nvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, child, idx) == 0); VERIFY(nvlist_add_nvlist(spa->spa_load_info, ZPOOL_CONFIG_MISSING_DEVICES, nv) == 0); for (int i = 0; i < idx; i++) nvlist_free(child[i]); } nvlist_free(nv); kmem_free(child, rvd->vdev_children * sizeof (char **)); } /* * Compare the root vdev tree with the information we have * from the MOS config (mrvd). Check each top-level vdev * with the corresponding MOS config top-level (mtvd). */ for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; vdev_t *mtvd = mrvd->vdev_child[c]; /* * Resolve any "missing" vdevs in the current configuration. * If we find that the MOS config has more accurate information * about the top-level vdev then use that vdev instead. */ if (tvd->vdev_ops == &vdev_missing_ops && mtvd->vdev_ops != &vdev_missing_ops) { if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) continue; /* * Device specific actions. */ if (mtvd->vdev_islog) { spa_set_log_state(spa, SPA_LOG_CLEAR); } else { /* * XXX - once we have 'readonly' pool * support we should be able to handle * missing data devices by transitioning * the pool to readonly. */ continue; } /* * Swap the missing vdev with the data we were * able to obtain from the MOS config. */ vdev_remove_child(rvd, tvd); vdev_remove_child(mrvd, mtvd); vdev_add_child(rvd, mtvd); vdev_add_child(mrvd, tvd); spa_config_exit(spa, SCL_ALL, FTAG); vdev_load(mtvd); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); vdev_reopen(rvd); } else { if (mtvd->vdev_islog) { /* * Load the slog device's state from the MOS * config since it's possible that the label * does not contain the most up-to-date * information. 
*/ vdev_load_log_state(tvd, mtvd); vdev_reopen(tvd); } /* * Per-vdev ZAP info is stored exclusively in the MOS. */ spa_config_valid_zaps(tvd, mtvd); } } vdev_free(mrvd); spa_config_exit(spa, SCL_ALL, FTAG); /* * Ensure we were able to validate the config. */ return (rvd->vdev_guid_sum == spa->spa_uberblock.ub_guid_sum); } /* * Check for missing log devices */ static boolean_t spa_check_logs(spa_t *spa) { boolean_t rv = B_FALSE; dsl_pool_t *dp = spa_get_dsl(spa); switch (spa->spa_log_state) { case SPA_LOG_MISSING: /* need to recheck in case slog has been restored */ case SPA_LOG_UNKNOWN: rv = (dmu_objset_find_dp(dp, dp->dp_root_dir_obj, zil_check_log_chain, NULL, DS_FIND_CHILDREN) != 0); if (rv) spa_set_log_state(spa, SPA_LOG_MISSING); break; } return (rv); } static boolean_t spa_passivate_log(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; boolean_t slog_found = B_FALSE; ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); if (!spa_has_slogs(spa)) return (B_FALSE); for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; metaslab_group_t *mg = tvd->vdev_mg; if (tvd->vdev_islog) { metaslab_group_passivate(mg); slog_found = B_TRUE; } } return (slog_found); } static void spa_activate_log(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; metaslab_group_t *mg = tvd->vdev_mg; if (tvd->vdev_islog) metaslab_group_activate(mg); } } int spa_offline_log(spa_t *spa) { int error; error = dmu_objset_find(spa_name(spa), zil_vdev_offline, NULL, DS_FIND_CHILDREN); if (error == 0) { /* * We successfully offlined the log device, sync out the * current txg so that the "stubby" block can be removed * by zil_sync(). */ txg_wait_synced(spa->spa_dsl_pool, 0); } return (error); } static void spa_aux_check_removed(spa_aux_vdev_t *sav) { int i; for (i = 0; i < sav->sav_count; i++) spa_check_removed(sav->sav_vdevs[i]); } void spa_claim_notify(zio_t *zio) { spa_t *spa = zio->io_spa; if (zio->io_error) return; mutex_enter(&spa->spa_props_lock); /* any mutex will do */ if (spa->spa_claim_max_txg < zio->io_bp->blk_birth) spa->spa_claim_max_txg = zio->io_bp->blk_birth; mutex_exit(&spa->spa_props_lock); } typedef struct spa_load_error { uint64_t sle_meta_count; uint64_t sle_data_count; } spa_load_error_t; static void spa_load_verify_done(zio_t *zio) { blkptr_t *bp = zio->io_bp; spa_load_error_t *sle = zio->io_private; dmu_object_type_t type = BP_GET_TYPE(bp); int error = zio->io_error; spa_t *spa = zio->io_spa; abd_free(zio->io_abd); if (error) { if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) && type != DMU_OT_INTENT_LOG) atomic_inc_64(&sle->sle_meta_count); else atomic_inc_64(&sle->sle_data_count); } mutex_enter(&spa->spa_scrub_lock); spa->spa_scrub_inflight--; cv_broadcast(&spa->spa_scrub_io_cv); mutex_exit(&spa->spa_scrub_lock); } /* * Maximum number of concurrent scrub i/os to create while verifying * a pool while importing it. 
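/*
 * The error bucketing from spa_load_verify_done() above, restated with
 * C11 atomics standing in for atomic_inc_64(): a failed read counts as
 * a metadata error if the block is an indirect block or a metadata
 * object type, except for intent-log blocks, which count as data.
 * The boolean parameters stand in for DMU_OT_IS_METADATA() and the
 * DMU_OT_INTENT_LOG comparison.
 */
#include <stdatomic.h>
#include <stdint.h>
#include <stdbool.h>

typedef struct verify_errors {
	atomic_uint_fast64_t meta_count;
	atomic_uint_fast64_t data_count;
} verify_errors_t;

static void
count_verify_error(verify_errors_t *ve, int level,
    bool is_metadata_type, bool is_intent_log)
{
	if ((level != 0 || is_metadata_type) && !is_intent_log)
		atomic_fetch_add(&ve->meta_count, 1);
	else
		atomic_fetch_add(&ve->data_count, 1);
}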
*/ int spa_load_verify_maxinflight = 10000; boolean_t spa_load_verify_metadata = B_TRUE; boolean_t spa_load_verify_data = B_TRUE; SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_maxinflight, CTLFLAG_RWTUN, &spa_load_verify_maxinflight, 0, "Maximum number of concurrent scrub I/Os to create while verifying a " "pool while importing it"); SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_metadata, CTLFLAG_RWTUN, &spa_load_verify_metadata, 0, "Check metadata on import?"); SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_data, CTLFLAG_RWTUN, &spa_load_verify_data, 0, "Check user data on import?"); /*ARGSUSED*/ static int spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) { if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) return (0); /* * Note: normally this routine will not be called if * spa_load_verify_metadata is not set. However, it may be useful * to manually set the flag after the traversal has begun. */ if (!spa_load_verify_metadata) return (0); if (!BP_IS_METADATA(bp) && !spa_load_verify_data) return (0); zio_t *rio = arg; size_t size = BP_GET_PSIZE(bp); mutex_enter(&spa->spa_scrub_lock); while (spa->spa_scrub_inflight >= spa_load_verify_maxinflight) cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); spa->spa_scrub_inflight++; mutex_exit(&spa->spa_scrub_lock); zio_nowait(zio_read(rio, spa, bp, abd_alloc_for_io(size, B_FALSE), size, spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB, ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb)); return (0); } /* ARGSUSED */ int verify_dataset_name_len(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) { if (dsl_dataset_namelen(ds) >= ZFS_MAX_DATASET_NAME_LEN) return (SET_ERROR(ENAMETOOLONG)); return (0); } static int spa_load_verify(spa_t *spa) { zio_t *rio; spa_load_error_t sle = { 0 }; zpool_rewind_policy_t policy; boolean_t verify_ok = B_FALSE; int error = 0; zpool_get_rewind_policy(spa->spa_config, &policy); if (policy.zrp_request & ZPOOL_NEVER_REWIND) return (0); dsl_pool_config_enter(spa->spa_dsl_pool, FTAG); error = dmu_objset_find_dp(spa->spa_dsl_pool, spa->spa_dsl_pool->dp_root_dir_obj, verify_dataset_name_len, NULL, DS_FIND_CHILDREN); dsl_pool_config_exit(spa->spa_dsl_pool, FTAG); if (error != 0) return (error); rio = zio_root(spa, NULL, &sle, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); if (spa_load_verify_metadata) { error = traverse_pool(spa, spa->spa_verify_min_txg, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, spa_load_verify_cb, rio); } (void) zio_wait(rio); spa->spa_load_meta_errors = sle.sle_meta_count; spa->spa_load_data_errors = sle.sle_data_count; if (!error && sle.sle_meta_count <= policy.zrp_maxmeta && sle.sle_data_count <= policy.zrp_maxdata) { int64_t loss = 0; verify_ok = B_TRUE; spa->spa_load_txg = spa->spa_uberblock.ub_txg; spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp; loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts; VERIFY(nvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0); VERIFY(nvlist_add_int64(spa->spa_load_info, ZPOOL_CONFIG_REWIND_TIME, loss) == 0); VERIFY(nvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0); } else { spa->spa_load_max_txg = spa->spa_uberblock.ub_txg; } if (error) { if (error != ENXIO && error != EIO) error = SET_ERROR(EIO); return (error); } return (verify_ok ? 0 : EIO); } /* * Find a value in the pool props object. 
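/*
 * A pthread sketch of the in-flight throttle used by
 * spa_load_verify_cb() above (and by the scrub I/O path earlier in this
 * change): issuers block on a condition variable while the in-flight
 * count is at the limit, and each completion decrements the count and
 * wakes them.  iolimit_t and its helpers are illustrative names.
 */
#include <pthread.h>

typedef struct iolimit {
	pthread_mutex_t	lock;
	pthread_cond_t	cv;
	int		inflight;
	int		max;
} iolimit_t;

static void
iolimit_enter(iolimit_t *il)	/* call before issuing an I/O */
{
	pthread_mutex_lock(&il->lock);
	while (il->inflight >= il->max)
		pthread_cond_wait(&il->cv, &il->lock);
	il->inflight++;
	pthread_mutex_unlock(&il->lock);
}

static void
iolimit_exit(iolimit_t *il)	/* call from the completion callback */
{
	pthread_mutex_lock(&il->lock);
	il->inflight--;
	pthread_cond_broadcast(&il->cv);
	pthread_mutex_unlock(&il->lock);
}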
*/ static void spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val) { (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object, zpool_prop_to_name(prop), sizeof (uint64_t), 1, val); } /* * Find a value in the pool directory object. */ static int spa_dir_prop(spa_t *spa, const char *name, uint64_t *val) { return (zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, name, sizeof (uint64_t), 1, val)); } static int spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err) { vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux); return (err); } /* * Fix up config after a partly-completed split. This is done with the * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off * pool have that entry in their config, but only the splitting one contains * a list of all the guids of the vdevs that are being split off. * * This function determines what to do with that list: either rejoin * all the disks to the pool, or complete the splitting process. To attempt * the rejoin, each disk that is offlined is marked online again, and * we do a reopen() call. If the vdev label for every disk that was * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL) * then we call vdev_split() on each disk, and complete the split. * * Otherwise we leave the config alone, with all the vdevs in place in * the original pool. */ static void spa_try_repair(spa_t *spa, nvlist_t *config) { uint_t extracted; uint64_t *glist; uint_t i, gcount; nvlist_t *nvl; vdev_t **vd; boolean_t attempt_reopen; if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0) return; /* check that the config is complete */ if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, &glist, &gcount) != 0) return; vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP); /* attempt to online all the vdevs & validate */ attempt_reopen = B_TRUE; for (i = 0; i < gcount; i++) { if (glist[i] == 0) /* vdev is hole */ continue; vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE); if (vd[i] == NULL) { /* * Don't bother attempting to reopen the disks; * just do the split. */ attempt_reopen = B_FALSE; } else { /* attempt to re-online it */ vd[i]->vdev_offline = B_FALSE; } } if (attempt_reopen) { vdev_reopen(spa->spa_root_vdev); /* check each device to see what state it's in */ for (extracted = 0, i = 0; i < gcount; i++) { if (vd[i] != NULL && vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL) break; ++extracted; } } /* * If every disk has been moved to the new pool, or if we never * even attempted to look at them, then we split them off for * good. */ if (!attempt_reopen || gcount == extracted) { for (i = 0; i < gcount; i++) if (vd[i] != NULL) vdev_split(vd[i]); vdev_reopen(spa->spa_root_vdev); } kmem_free(vd, gcount * sizeof (vdev_t *)); } static int spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig) { nvlist_t *config = spa->spa_config; char *ereport = FM_EREPORT_ZFS_POOL; char *comment; int error; uint64_t pool_guid; nvlist_t *nvl; if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) return (SET_ERROR(EINVAL)); ASSERT(spa->spa_comment == NULL); if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) spa->spa_comment = spa_strdup(comment); /* * Versioning wasn't explicitly added to the label until later, so if * it's not present treat it as the initial version. 
*/ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &spa->spa_ubsync.ub_version) != 0) spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &spa->spa_config_txg); if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && spa_guid_exists(pool_guid, 0)) { error = SET_ERROR(EEXIST); } else { spa->spa_config_guid = pool_guid; if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) == 0) { VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting, KM_SLEEP) == 0); } nvlist_free(spa->spa_load_info); spa->spa_load_info = fnvlist_alloc(); gethrestime(&spa->spa_loaded_ts); error = spa_load_impl(spa, pool_guid, config, state, type, mosconfig, &ereport); } /* * Don't count references from objsets that are already closed * and are making their way through the eviction process. */ spa_evicting_os_wait(spa); spa->spa_minref = refcount_count(&spa->spa_refcount); if (error) { if (error != EEXIST) { spa->spa_loaded_ts.tv_sec = 0; spa->spa_loaded_ts.tv_nsec = 0; } if (error != EBADF) { zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0); } } spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; spa->spa_ena = 0; return (error); } /* * Count the number of per-vdev ZAPs associated with all of the vdevs in the * vdev tree rooted in the given vd, and ensure that each ZAP is present in the * spa's per-vdev ZAP list. */ static uint64_t vdev_count_verify_zaps(vdev_t *vd) { spa_t *spa = vd->vdev_spa; uint64_t total = 0; if (vd->vdev_top_zap != 0) { total++; ASSERT0(zap_lookup_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, vd->vdev_top_zap)); } if (vd->vdev_leaf_zap != 0) { total++; ASSERT0(zap_lookup_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, vd->vdev_leaf_zap)); } for (uint64_t i = 0; i < vd->vdev_children; i++) { total += vdev_count_verify_zaps(vd->vdev_child[i]); } return (total); } /* * Load an existing storage pool, using the pool's builtin spa_config as a * source of configuration information. */ static int spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig, char **ereport) { int error = 0; nvlist_t *nvroot = NULL; nvlist_t *label; vdev_t *rvd; uberblock_t *ub = &spa->spa_uberblock; uint64_t children, config_cache_txg = spa->spa_config_txg; int orig_mode = spa->spa_mode; int parse; uint64_t obj; boolean_t missing_feat_write = B_FALSE; /* * If this is an untrusted config, access the pool in read-only mode. * This prevents things like resilvering recently removed devices. */ if (!mosconfig) spa->spa_mode = FREAD; ASSERT(MUTEX_HELD(&spa_namespace_lock)); spa->spa_load_state = state; if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot)) return (SET_ERROR(EINVAL)); parse = (type == SPA_IMPORT_EXISTING ? VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT); /* * Create "The Godfather" zio to hold all async IOs */ spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), KM_SLEEP); for (int i = 0; i < max_ncpus; i++) { spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); } /* * Parse the configuration into a vdev tree. We explicitly set the * value that will be returned by spa_version() since parsing the * configuration requires knowing the version number. 
*/ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, parse); spa_config_exit(spa, SCL_ALL, FTAG); if (error != 0) return (error); ASSERT(spa->spa_root_vdev == rvd); ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT); ASSERT3U(spa->spa_max_ashift, <=, SPA_MAXBLOCKSHIFT); if (type != SPA_IMPORT_ASSEMBLE) { ASSERT(spa_guid(spa) == pool_guid); } /* * Try to open all vdevs, loading each label in the process. */ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); error = vdev_open(rvd); spa_config_exit(spa, SCL_ALL, FTAG); if (error != 0) return (error); /* * We need to validate the vdev labels against the configuration that * we have in hand, which is dependent on the setting of mosconfig. If * mosconfig is true then we're validating the vdev labels based on * that config. Otherwise, we're validating against the cached config * (zpool.cache) that was read when we loaded the zfs module, and then * later we will recursively call spa_load() and validate against * the vdev config. * * If we're assembling a new pool that's been split off from an * existing pool, the labels haven't yet been updated so we skip * validation for now. */ if (type != SPA_IMPORT_ASSEMBLE) { spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); error = vdev_validate(rvd, mosconfig); spa_config_exit(spa, SCL_ALL, FTAG); if (error != 0) return (error); if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) return (SET_ERROR(ENXIO)); } /* * Find the best uberblock. */ vdev_uberblock_load(rvd, ub, &label); /* * If we weren't able to find a single valid uberblock, return failure. */ if (ub->ub_txg == 0) { nvlist_free(label); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); } /* * If the pool has an unsupported version we can't open it. */ if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) { nvlist_free(label); return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP)); } if (ub->ub_version >= SPA_VERSION_FEATURES) { nvlist_t *features; /* * If we weren't able to find what's necessary for reading the * MOS in the label, return failure. */ if (label == NULL || nvlist_lookup_nvlist(label, ZPOOL_CONFIG_FEATURES_FOR_READ, &features) != 0) { nvlist_free(label); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); } /* * Update our in-core representation with the definitive values * from the label. */ nvlist_free(spa->spa_label_features); VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0); } nvlist_free(label); /* * Look through entries in the label nvlist's features_for_read. If * there is a feature listed there which we don't understand then we * cannot open a pool. */ if (ub->ub_version >= SPA_VERSION_FEATURES) { nvlist_t *unsup_feat; VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) == 0); for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features, NULL); nvp != NULL; nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) { if (!zfeature_is_supported(nvpair_name(nvp))) { VERIFY(nvlist_add_string(unsup_feat, nvpair_name(nvp), "") == 0); } } if (!nvlist_empty(unsup_feat)) { VERIFY(nvlist_add_nvlist(spa->spa_load_info, ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0); nvlist_free(unsup_feat); return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP)); } nvlist_free(unsup_feat); } /* * If the vdev guid sum doesn't match the uberblock, we have an * incomplete configuration. We first check to see if the pool * is aware of the complete config (i.e ZPOOL_CONFIG_VDEV_CHILDREN). * If it is, defer the vdev_guid_sum check till later so we * can handle missing vdevs. 
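/*
 * The features_for_read screening shown above, reduced to string sets:
 * any feature named in the label that this code does not understand is
 * collected for reporting, and a non-empty result means the pool cannot
 * be opened.  supported[]/labelled[] are illustrative stand-ins for
 * zfeature_is_supported() and the label nvlist.
 */
#include <string.h>
#include <stddef.h>

static int
is_supported(const char *name, const char *const *supported)
{
	for (int i = 0; supported[i] != NULL; i++) {
		if (strcmp(name, supported[i]) == 0)
			return (1);
	}
	return (0);
}

/* Returns the number of unsupported feature names copied into unsup[]. */
static int
collect_unsupported(const char *const *labelled,
    const char *const *supported, const char **unsup, int maxunsup)
{
	int n = 0;

	for (int i = 0; labelled[i] != NULL; i++) {
		if (!is_supported(labelled[i], supported) && n < maxunsup)
			unsup[n++] = labelled[i];
	}
	return (n);
}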
*/ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN, &children) != 0 && mosconfig && type != SPA_IMPORT_ASSEMBLE && rvd->vdev_guid_sum != ub->ub_guid_sum) return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) { spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_try_repair(spa, config); spa_config_exit(spa, SCL_ALL, FTAG); nvlist_free(spa->spa_config_splitting); spa->spa_config_splitting = NULL; } /* * Initialize internal SPA structures. */ spa->spa_state = POOL_STATE_ACTIVE; spa->spa_ubsync = spa->spa_uberblock; spa->spa_verify_min_txg = spa->spa_extreme_rewind ? TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1; spa->spa_first_txg = spa->spa_last_ubsync_txg ? spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; spa->spa_claim_max_txg = spa->spa_first_txg; spa->spa_prev_software_version = ub->ub_software_version; error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool); if (error) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); if (spa_version(spa) >= SPA_VERSION_FEATURES) { boolean_t missing_feat_read = B_FALSE; nvlist_t *unsup_feat, *enabled_feat; if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ, &spa->spa_feat_for_read_obj) != 0) { return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE, &spa->spa_feat_for_write_obj) != 0) { return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS, &spa->spa_feat_desc_obj) != 0) { return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } enabled_feat = fnvlist_alloc(); unsup_feat = fnvlist_alloc(); if (!spa_features_check(spa, B_FALSE, unsup_feat, enabled_feat)) missing_feat_read = B_TRUE; if (spa_writeable(spa) || state == SPA_LOAD_TRYIMPORT) { if (!spa_features_check(spa, B_TRUE, unsup_feat, enabled_feat)) { missing_feat_write = B_TRUE; } } fnvlist_add_nvlist(spa->spa_load_info, ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat); if (!nvlist_empty(unsup_feat)) { fnvlist_add_nvlist(spa->spa_load_info, ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat); } fnvlist_free(enabled_feat); fnvlist_free(unsup_feat); if (!missing_feat_read) { fnvlist_add_boolean(spa->spa_load_info, ZPOOL_CONFIG_CAN_RDONLY); } /* * If the state is SPA_LOAD_TRYIMPORT, our objective is * twofold: to determine whether the pool is available for * import in read-write mode and (if it is not) whether the * pool is available for import in read-only mode. If the pool * is available for import in read-write mode, it is displayed * as available in userland; if it is not available for import * in read-only mode, it is displayed as unavailable in * userland. If the pool is available for import in read-only * mode but not read-write mode, it is displayed as unavailable * in userland with a special note that the pool is actually * available for open in read-only mode. * * As a result, if the state is SPA_LOAD_TRYIMPORT and we are * missing a feature for write, we must first determine whether * the pool can be opened read-only before returning to * userland in order to know whether to display the * abovementioned note. 
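 *
 * Summarizing the check that follows:
 *
 *	missing_feat_read                      -> the pool cannot be opened
 *	missing_feat_write, writeable open     -> the open fails
 *	missing_feat_write, read-only open     -> carry on; for TRYIMPORT
 *	                                          this lets us report the
 *	                                          read-only-only case later
 *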
*/ if (missing_feat_read || (missing_feat_write && spa_writeable(spa))) { return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP)); } /* * Load refcounts for ZFS features from disk into an in-memory * cache during SPA initialization. */ for (spa_feature_t i = 0; i < SPA_FEATURES; i++) { uint64_t refcount; error = feature_get_refcount_from_disk(spa, &spa_feature_table[i], &refcount); if (error == 0) { spa->spa_feat_refcount_cache[i] = refcount; } else if (error == ENOTSUP) { spa->spa_feat_refcount_cache[i] = SPA_FEATURE_DISABLED; } else { return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } } } if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) { if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG, &spa->spa_feat_enabled_txg_obj) != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } spa->spa_is_initializing = B_TRUE; error = dsl_pool_open(spa->spa_dsl_pool); spa->spa_is_initializing = B_FALSE; if (error != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); if (!mosconfig) { uint64_t hostid; nvlist_t *policy = NULL, *nvconfig; if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig, ZPOOL_CONFIG_HOSTID, &hostid) == 0) { char *hostname; unsigned long myhostid = 0; VERIFY(nvlist_lookup_string(nvconfig, ZPOOL_CONFIG_HOSTNAME, &hostname) == 0); #ifdef _KERNEL myhostid = zone_get_hostid(NULL); #else /* _KERNEL */ /* * We're emulating the system's hostid in userland, so * we can't use zone_get_hostid(). */ (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid); #endif /* _KERNEL */ if (check_hostid && hostid != 0 && myhostid != 0 && hostid != myhostid) { nvlist_free(nvconfig); cmn_err(CE_WARN, "pool '%s' could not be " "loaded as it was last accessed by " "another system (host: %s hostid: 0x%lx). " "See: http://illumos.org/msg/ZFS-8000-EY", spa_name(spa), hostname, (unsigned long)hostid); return (SET_ERROR(EBADF)); } } if (nvlist_lookup_nvlist(spa->spa_config, ZPOOL_REWIND_POLICY, &policy) == 0) VERIFY(nvlist_add_nvlist(nvconfig, ZPOOL_REWIND_POLICY, policy) == 0); spa_config_set(spa, nvconfig); spa_unload(spa); spa_deactivate(spa); spa_activate(spa, orig_mode); return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE)); } /* Grab the secret checksum salt from the MOS. */ error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT, 1, sizeof (spa->spa_cksum_salt.zcs_bytes), spa->spa_cksum_salt.zcs_bytes); if (error == ENOENT) { /* Generate a new salt for subsequent use */ (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, sizeof (spa->spa_cksum_salt.zcs_bytes)); } else if (error != 0) { return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj) != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj); if (error != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); /* * Load the bit that tells us to use the new accounting function * (raid-z deflation). If we have an older pool, this will not * be present. */ error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION, &spa->spa_creation_version); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); /* * Load the persistent error log. 
If we have an older pool, this will * not be present. */ error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB, &spa->spa_errlog_scrub); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); /* * Load the history object. If we have an older pool, this * will not be present. */ error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); /* * Load the per-vdev ZAP map. If we have an older pool, this will not * be present; in this case, defer its creation to a later time to * avoid dirtying the MOS this early / out of sync context. See * spa_sync_config_object. */ /* The sentinel is only available in the MOS config. */ nvlist_t *mos_config; if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); error = spa_dir_prop(spa, DMU_POOL_VDEV_ZAP_MAP, &spa->spa_all_vdev_zaps); if (error == ENOENT) { VERIFY(!nvlist_exists(mos_config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)); spa->spa_avz_action = AVZ_ACTION_INITIALIZE; ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev)); } else if (error != 0) { return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } else if (!nvlist_exists(mos_config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)) { /* * An older version of ZFS overwrote the sentinel value, so * we have orphaned per-vdev ZAPs in the MOS. Defer their * destruction to later; see spa_sync_config_object. */ spa->spa_avz_action = AVZ_ACTION_DESTROY; /* * We're assuming that no vdevs have had their ZAPs created * before this. Better be sure of it. */ ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev)); } nvlist_free(mos_config); /* * If we're assembling the pool from the split-off vdevs of * an existing pool, we don't want to attach the spares & cache * devices. */ /* * Load any hot spares for this pool. */ error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); if (load_nvlist(spa, spa->spa_spares.sav_object, &spa->spa_spares.sav_config) != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_spares(spa); spa_config_exit(spa, SCL_ALL, FTAG); } else if (error == 0) { spa->spa_spares.sav_sync = B_TRUE; } /* * Load any level 2 ARC devices for this pool. 
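 *
 * The hot-spare block above and the L2ARC block below follow the same
 * ENOENT-tolerant pattern, sketched here with a placeholder directory
 * name (DMU_POOL_<AUX> is not a real symbol):
 *
 *	error = spa_dir_prop(spa, DMU_POOL_<AUX>, &sav->sav_object);
 *	error != 0 && error != ENOENT  -> corrupt MOS, fail the load
 *	error == ENOENT                -> the pool predates this feature
 *	error == 0                     -> unpack the nvlist and, unless we
 *	                                  are assembling a split, open the
 *	                                  aux vdevs
 *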
*/ error = spa_dir_prop(spa, DMU_POOL_L2CACHE, &spa->spa_l2cache.sav_object); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); if (load_nvlist(spa, spa->spa_l2cache.sav_object, &spa->spa_l2cache.sav_config) != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_l2cache(spa); spa_config_exit(spa, SCL_ALL, FTAG); } else if (error == 0) { spa->spa_l2cache.sav_sync = B_TRUE; } spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object); if (error && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); if (error == 0) { uint64_t autoreplace; spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs); spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace); spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation); spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode); spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand); spa_prop_find(spa, ZPOOL_PROP_BOOTSIZE, &spa->spa_bootsize); spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO, &spa->spa_dedup_ditto); spa->spa_autoreplace = (autoreplace != 0); } /* * If the 'autoreplace' property is set, then post a resource notifying * the ZFS DE that it should not issue any faults for unopenable * devices. We also iterate over the vdevs, and post a sysevent for any * unopenable vdevs so that the normal autoreplace handler can take * over. */ if (spa->spa_autoreplace && state != SPA_LOAD_TRYIMPORT) { spa_check_removed(spa->spa_root_vdev); /* * For the import case, this is done in spa_import(), because * at this point we're using the spare definitions from * the MOS config, not necessarily from the userland config. */ if (state != SPA_LOAD_IMPORT) { spa_aux_check_removed(&spa->spa_spares); spa_aux_check_removed(&spa->spa_l2cache); } } /* * Load the vdev state for all toplevel vdevs. */ vdev_load(rvd); /* * Propagate the leaf DTLs we just loaded all the way up the tree. */ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); vdev_dtl_reassess(rvd, 0, 0, B_FALSE); spa_config_exit(spa, SCL_ALL, FTAG); /* * Load the DDTs (dedup tables). */ error = ddt_load(spa); if (error != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); spa_update_dspace(spa); /* * Validate the config, using the MOS config to fill in any * information which might be missing. If we fail to validate * the config then declare the pool unfit for use. If we're * assembling a pool from a split, the log is not transferred * over. */ if (type != SPA_IMPORT_ASSEMBLE) { nvlist_t *nvconfig; if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); if (!spa_config_valid(spa, nvconfig)) { nvlist_free(nvconfig); return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); } nvlist_free(nvconfig); /* * Now that we've validated the config, check the state of the * root vdev. If it can't be opened, it indicates one or * more toplevel vdevs are faulted. */ if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) return (SET_ERROR(ENXIO)); if (spa_writeable(spa) && spa_check_logs(spa)) { *ereport = FM_EREPORT_ZFS_LOG_REPLAY; return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO)); } } if (missing_feat_write) { ASSERT(state == SPA_LOAD_TRYIMPORT); /* * At this point, we know that we can open the pool in * read-only mode but not read-write mode. 
We now have enough * information and can return to userland. */ return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP)); } /* * We've successfully opened the pool, verify that we're ready * to start pushing transactions. */ if (state != SPA_LOAD_TRYIMPORT) { if (error = spa_load_verify(spa)) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); } if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER || spa->spa_load_max_txg == UINT64_MAX)) { dmu_tx_t *tx; int need_update = B_FALSE; dsl_pool_t *dp = spa_get_dsl(spa); ASSERT(state != SPA_LOAD_TRYIMPORT); /* * Claim log blocks that haven't been committed yet. * This must all happen in a single txg. * Note: spa_claim_max_txg is updated by spa_claim_notify(), * invoked from zil_claim_log_block()'s i/o done callback. * Price of rollback is that we abandon the log. */ spa->spa_claiming = B_TRUE; tx = dmu_tx_create_assigned(dp, spa_first_txg(spa)); (void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj, zil_claim, tx, DS_FIND_CHILDREN); dmu_tx_commit(tx); spa->spa_claiming = B_FALSE; spa_set_log_state(spa, SPA_LOG_GOOD); spa->spa_sync_on = B_TRUE; txg_sync_start(spa->spa_dsl_pool); /* * Wait for all claims to sync. We sync up to the highest * claimed log block birth time so that claimed log blocks * don't appear to be from the future. spa_claim_max_txg * will have been set for us by either zil_check_log_chain() * (invoked from spa_check_logs()) or zil_claim() above. */ txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg); /* * If the config cache is stale, or we have uninitialized * metaslabs (see spa_vdev_add()), then update the config. * * If this is a verbatim import, trust the current * in-core spa_config and update the disk labels. */ if (config_cache_txg != spa->spa_config_txg || state == SPA_LOAD_IMPORT || state == SPA_LOAD_RECOVER || (spa->spa_import_flags & ZFS_IMPORT_VERBATIM)) need_update = B_TRUE; for (int c = 0; c < rvd->vdev_children; c++) if (rvd->vdev_child[c]->vdev_ms_array == 0) need_update = B_TRUE; /* * Update the config cache asychronously in case we're the * root pool, in which case the config cache isn't writable yet. */ if (need_update) spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); /* * Check all DTLs to see if anything needs resilvering. */ if (!dsl_scan_resilvering(spa->spa_dsl_pool) && vdev_resilver_needed(rvd, NULL, NULL)) spa_async_request(spa, SPA_ASYNC_RESILVER); /* * Log the fact that we booted up (so that we can detect if * we rebooted in the middle of an operation). */ spa_history_log_version(spa, "open"); /* * Delete any inconsistent datasets. */ (void) dmu_objset_find(spa_name(spa), dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN); /* * Clean up any stale temporary dataset userrefs. */ dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); } return (0); } static int spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig) { int mode = spa->spa_mode; spa_unload(spa); spa_deactivate(spa); spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1; spa_activate(spa, mode); spa_async_suspend(spa); return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig)); } /* * If spa_load() fails this function will try loading prior txg's. If * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this * function will not rewind the pool and will return the same error as * spa_load(). 
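 *
 * As an illustrative sketch (the real logic is inline below), the rewind
 * window is:
 *
 *	min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ?
 *	    TXG_INITIAL : last_ubsync_txg - TXG_DEFER_SIZE;
 *
 * and each spa_load_retry() lowers spa_load_max_txg to ub_txg - 1, so the
 * retry loop steps backwards one candidate uberblock at a time until a
 * load succeeds, the window is exhausted, or no older uberblock exists.
 *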
*/ static int spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig, uint64_t max_request, int rewind_flags) { nvlist_t *loadinfo = NULL; nvlist_t *config = NULL; int load_error, rewind_error; uint64_t safe_rewind_txg; uint64_t min_txg; if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) { spa->spa_load_max_txg = spa->spa_load_txg; spa_set_log_state(spa, SPA_LOG_CLEAR); } else { spa->spa_load_max_txg = max_request; if (max_request != UINT64_MAX) spa->spa_extreme_rewind = B_TRUE; } load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig); if (load_error == 0) return (0); if (spa->spa_root_vdev != NULL) config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg; spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp; if (rewind_flags & ZPOOL_NEVER_REWIND) { nvlist_free(config); return (load_error); } if (state == SPA_LOAD_RECOVER) { /* Price of rolling back is discarding txgs, including log */ spa_set_log_state(spa, SPA_LOG_CLEAR); } else { /* * If we aren't rolling back save the load info from our first * import attempt so that we can restore it after attempting * to rewind. */ loadinfo = spa->spa_load_info; spa->spa_load_info = fnvlist_alloc(); } spa->spa_load_max_txg = spa->spa_last_ubsync_txg; safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE; min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ? TXG_INITIAL : safe_rewind_txg; /* * Continue as long as we're finding errors, we're still within * the acceptable rewind range, and we're still finding uberblocks */ while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg && spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) { if (spa->spa_load_max_txg < safe_rewind_txg) spa->spa_extreme_rewind = B_TRUE; rewind_error = spa_load_retry(spa, state, mosconfig); } spa->spa_extreme_rewind = B_FALSE; spa->spa_load_max_txg = UINT64_MAX; if (config && (rewind_error || state != SPA_LOAD_RECOVER)) spa_config_set(spa, config); else nvlist_free(config); if (state == SPA_LOAD_RECOVER) { ASSERT3P(loadinfo, ==, NULL); return (rewind_error); } else { /* Store the rewind info as part of the initial load info */ fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO, spa->spa_load_info); /* Restore the initial load info */ fnvlist_free(spa->spa_load_info); spa->spa_load_info = loadinfo; return (load_error); } } /* * Pool Open/Import * * The import case is identical to an open except that the configuration is sent * down from userland, instead of grabbed from the configuration cache. For the * case of an open, the pool configuration will exist in the * POOL_STATE_UNINITIALIZED state. * * The stats information (gen/count/ustats) is used to gather vdev statistics at * the same time open the pool, without having to keep around the spa_t in some * ambiguous state. */ static int spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, nvlist_t **config) { spa_t *spa; spa_load_state_t state = SPA_LOAD_OPEN; int error; int locked = B_FALSE; int firstopen = B_FALSE; *spapp = NULL; /* * As disgusting as this is, we need to support recursive calls to this * function because dsl_dir_open() is called during spa_load(), and ends * up calling spa_open() again. The real fix is to figure out how to * avoid dsl_dir_open() calling this in the first place. 
*/ if (mutex_owner(&spa_namespace_lock) != curthread) { mutex_enter(&spa_namespace_lock); locked = B_TRUE; } if ((spa = spa_lookup(pool)) == NULL) { if (locked) mutex_exit(&spa_namespace_lock); return (SET_ERROR(ENOENT)); } if (spa->spa_state == POOL_STATE_UNINITIALIZED) { zpool_rewind_policy_t policy; firstopen = B_TRUE; zpool_get_rewind_policy(nvpolicy ? nvpolicy : spa->spa_config, &policy); if (policy.zrp_request & ZPOOL_DO_REWIND) state = SPA_LOAD_RECOVER; spa_activate(spa, spa_mode_global); if (state != SPA_LOAD_RECOVER) spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg, policy.zrp_request); if (error == EBADF) { /* * If vdev_validate() returns failure (indicated by * EBADF), it indicates that one of the vdevs indicates * that the pool has been exported or destroyed. If * this is the case, the config cache is out of sync and * we should remove the pool from the namespace. */ spa_unload(spa); spa_deactivate(spa); spa_config_sync(spa, B_TRUE, B_TRUE); spa_remove(spa); if (locked) mutex_exit(&spa_namespace_lock); return (SET_ERROR(ENOENT)); } if (error) { /* * We can't open the pool, but we still have useful * information: the state of each vdev after the * attempted vdev_open(). Return this to the user. */ if (config != NULL && spa->spa_config) { VERIFY(nvlist_dup(spa->spa_config, config, KM_SLEEP) == 0); VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, spa->spa_load_info) == 0); } spa_unload(spa); spa_deactivate(spa); spa->spa_last_open_failed = error; if (locked) mutex_exit(&spa_namespace_lock); *spapp = NULL; return (error); } } spa_open_ref(spa, tag); if (config != NULL) *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); /* * If we've recovered the pool, pass back any information we * gathered while doing the load. */ if (state == SPA_LOAD_RECOVER) { VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, spa->spa_load_info) == 0); } if (locked) { spa->spa_last_open_failed = 0; spa->spa_last_ubsync_txg = 0; spa->spa_load_txg = 0; mutex_exit(&spa_namespace_lock); #ifdef __FreeBSD__ #ifdef _KERNEL if (firstopen) zvol_create_minors(spa->spa_name); #endif #endif } *spapp = spa; return (0); } int spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy, nvlist_t **config) { return (spa_open_common(name, spapp, tag, policy, config)); } int spa_open(const char *name, spa_t **spapp, void *tag) { return (spa_open_common(name, spapp, tag, NULL, NULL)); } /* * Lookup the given spa_t, incrementing the inject count in the process, * preventing it from being exported or destroyed. */ spa_t * spa_inject_addref(char *name) { spa_t *spa; mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(name)) == NULL) { mutex_exit(&spa_namespace_lock); return (NULL); } spa->spa_inject_ref++; mutex_exit(&spa_namespace_lock); return (spa); } void spa_inject_delref(spa_t *spa) { mutex_enter(&spa_namespace_lock); spa->spa_inject_ref--; mutex_exit(&spa_namespace_lock); } /* * Add spares device information to the nvlist. 
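 *
 * As an illustrative sketch, a consumer of the resulting config would
 * read the spares back with the standard libnvpair calls (variable names
 * are placeholders, error handling omitted):
 *
 *	nvlist_t *nvroot, **spares;
 *	uint_t nspares, vsc;
 *	uint64_t *vs;
 *
 *	nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot);
 *	nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
 *	    &spares, &nspares);
 *	for (uint_t i = 0; i < nspares; i++) {
 *		nvlist_lookup_uint64_array(spares[i],
 *		    ZPOOL_CONFIG_VDEV_STATS, &vs, &vsc);
 *		... a vs_aux of VDEV_AUX_SPARED marks a spare in use ...
 *	}
 *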
*/ static void spa_add_spares(spa_t *spa, nvlist_t *config) { nvlist_t **spares; uint_t i, nspares; nvlist_t *nvroot; uint64_t guid; vdev_stat_t *vs; uint_t vsc; uint64_t pool; ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); if (spa->spa_spares.sav_count == 0) return; VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); if (nspares != 0) { VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, spares, nspares) == 0); VERIFY(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); /* * Go through and find any spares which have since been * repurposed as an active spare. If this is the case, update * their status appropriately. */ for (i = 0; i < nspares; i++) { VERIFY(nvlist_lookup_uint64(spares[i], ZPOOL_CONFIG_GUID, &guid) == 0); if (spa_spare_exists(guid, &pool, NULL) && pool != 0ULL) { VERIFY(nvlist_lookup_uint64_array( spares[i], ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) == 0); vs->vs_state = VDEV_STATE_CANT_OPEN; vs->vs_aux = VDEV_AUX_SPARED; } } } } /* * Add l2cache device information to the nvlist, including vdev stats. */ static void spa_add_l2cache(spa_t *spa, nvlist_t *config) { nvlist_t **l2cache; uint_t i, j, nl2cache; nvlist_t *nvroot; uint64_t guid; vdev_t *vd; vdev_stat_t *vs; uint_t vsc; ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); if (spa->spa_l2cache.sav_count == 0) return; VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); if (nl2cache != 0) { VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); VERIFY(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); /* * Update level 2 cache device stats. */ for (i = 0; i < nl2cache; i++) { VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID, &guid) == 0); vd = NULL; for (j = 0; j < spa->spa_l2cache.sav_count; j++) { if (guid == spa->spa_l2cache.sav_vdevs[j]->vdev_guid) { vd = spa->spa_l2cache.sav_vdevs[j]; break; } } ASSERT(vd != NULL); VERIFY(nvlist_lookup_uint64_array(l2cache[i], ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) == 0); vdev_get_stats(vd, vs); } } } static void spa_add_feature_stats(spa_t *spa, nvlist_t *config) { nvlist_t *features; zap_cursor_t zc; zap_attribute_t za; ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); VERIFY(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP) == 0); /* We may be unable to read features if pool is suspended. 
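 *
 * As an illustrative sketch, the "feature_stats" nvlist built here maps
 * feature guids (e.g. "com.delphix:hole_birth") to reference counts, so
 * a consumer can walk it with plain nvpair iteration (variable names are
 * placeholders):
 *
 *	for (nvpair_t *nvp = nvlist_next_nvpair(features, NULL);
 *	    nvp != NULL; nvp = nvlist_next_nvpair(features, nvp)) {
 *		uint64_t refcount;
 *
 *		(void) nvpair_value_uint64(nvp, &refcount);
 *		... nvpair_name(nvp) is the feature guid ...
 *	}
 *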
*/ if (spa_suspended(spa)) goto out; if (spa->spa_feat_for_read_obj != 0) { for (zap_cursor_init(&zc, spa->spa_meta_objset, spa->spa_feat_for_read_obj); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { ASSERT(za.za_integer_length == sizeof (uint64_t) && za.za_num_integers == 1); VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name, za.za_first_integer)); } zap_cursor_fini(&zc); } if (spa->spa_feat_for_write_obj != 0) { for (zap_cursor_init(&zc, spa->spa_meta_objset, spa->spa_feat_for_write_obj); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { ASSERT(za.za_integer_length == sizeof (uint64_t) && za.za_num_integers == 1); VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name, za.za_first_integer)); } zap_cursor_fini(&zc); } out: VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS, features) == 0); nvlist_free(features); } int spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) { int error; spa_t *spa; *config = NULL; error = spa_open_common(name, &spa, FTAG, NULL, config); if (spa != NULL) { /* * This still leaves a window of inconsistency where the spares * or l2cache devices could change and the config would be * self-inconsistent. */ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); if (*config != NULL) { uint64_t loadtimes[2]; loadtimes[0] = spa->spa_loaded_ts.tv_sec; loadtimes[1] = spa->spa_loaded_ts.tv_nsec; VERIFY(nvlist_add_uint64_array(*config, ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0); VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT, spa_get_errlog_size(spa)) == 0); if (spa_suspended(spa)) VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_SUSPENDED, spa->spa_failmode) == 0); spa_add_spares(spa, *config); spa_add_l2cache(spa, *config); spa_add_feature_stats(spa, *config); } } /* * We want to get the alternate root even for faulted pools, so we cheat * and call spa_lookup() directly. */ if (altroot) { if (spa == NULL) { mutex_enter(&spa_namespace_lock); spa = spa_lookup(name); if (spa) spa_altroot(spa, altroot, buflen); else altroot[0] = '\0'; spa = NULL; mutex_exit(&spa_namespace_lock); } else { spa_altroot(spa, altroot, buflen); } } if (spa != NULL) { spa_config_exit(spa, SCL_CONFIG, FTAG); spa_close(spa, FTAG); } return (error); } /* * Validate that the auxiliary device array is well formed. We must have an * array of nvlists, each which describes a valid leaf vdev. If this is an * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be * specified, as long as they are well-formed. */ static int spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, spa_aux_vdev_t *sav, const char *config, uint64_t version, vdev_labeltype_t label) { nvlist_t **dev; uint_t i, ndev; vdev_t *vd; int error; ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); /* * It's acceptable to have no devs specified. */ if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0) return (0); if (ndev == 0) return (SET_ERROR(EINVAL)); /* * Make sure the pool is formatted with a version that supports this * device type. */ if (spa_version(spa) < version) return (SET_ERROR(ENOTSUP)); /* * Set the pending device list so we correctly handle device in-use * checking. */ sav->sav_pending = dev; sav->sav_npending = ndev; for (i = 0; i < ndev; i++) { if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0, mode)) != 0) goto out; if (!vd->vdev_ops->vdev_op_leaf) { vdev_free(vd); error = SET_ERROR(EINVAL); goto out; } /* * The L2ARC currently only supports disk devices in * kernel context. 
For user-level testing, we allow it. */ #ifdef _KERNEL if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) && strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) { error = SET_ERROR(ENOTBLK); vdev_free(vd); goto out; } #endif vd->vdev_top = vd; if ((error = vdev_open(vd)) == 0 && (error = vdev_label_init(vd, crtxg, label)) == 0) { VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID, vd->vdev_guid) == 0); } vdev_free(vd); if (error && (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE)) goto out; else error = 0; } out: sav->sav_pending = NULL; sav->sav_npending = 0; return (error); } static int spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) { int error; ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode, &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES, VDEV_LABEL_SPARE)) != 0) { return (error); } return (spa_validate_aux_devs(spa, nvroot, crtxg, mode, &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE, VDEV_LABEL_L2CACHE)); } static void spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs, const char *config) { int i; if (sav->sav_config != NULL) { nvlist_t **olddevs; uint_t oldndevs; nvlist_t **newdevs; /* * Generate new dev list by concatentating with the * current dev list. */ VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config, &olddevs, &oldndevs) == 0); newdevs = kmem_alloc(sizeof (void *) * (ndevs + oldndevs), KM_SLEEP); for (i = 0; i < oldndevs; i++) VERIFY(nvlist_dup(olddevs[i], &newdevs[i], KM_SLEEP) == 0); for (i = 0; i < ndevs; i++) VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs], KM_SLEEP) == 0); VERIFY(nvlist_remove(sav->sav_config, config, DATA_TYPE_NVLIST_ARRAY) == 0); VERIFY(nvlist_add_nvlist_array(sav->sav_config, config, newdevs, ndevs + oldndevs) == 0); for (i = 0; i < oldndevs + ndevs; i++) nvlist_free(newdevs[i]); kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *)); } else { /* * Generate a new dev list. */ VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_nvlist_array(sav->sav_config, config, devs, ndevs) == 0); } } /* * Stop and drop level 2 ARC devices */ void spa_l2cache_drop(spa_t *spa) { vdev_t *vd; int i; spa_aux_vdev_t *sav = &spa->spa_l2cache; for (i = 0; i < sav->sav_count; i++) { uint64_t pool; vd = sav->sav_vdevs[i]; ASSERT(vd != NULL); if (spa_l2cache_exists(vd->vdev_guid, &pool) && pool != 0ULL && l2arc_vdev_present(vd)) l2arc_remove_vdev(vd); } } /* * Pool Creation */ int spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, nvlist_t *zplprops) { spa_t *spa; char *altroot = NULL; vdev_t *rvd; dsl_pool_t *dp; dmu_tx_t *tx; int error = 0; uint64_t txg = TXG_INITIAL; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; uint64_t version, obj; boolean_t has_features; /* * If this pool already exists, return failure. */ mutex_enter(&spa_namespace_lock); if (spa_lookup(pool) != NULL) { mutex_exit(&spa_namespace_lock); return (SET_ERROR(EEXIST)); } /* * Allocate a new spa_t structure. 
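 *
 * As an illustrative sketch, the optional 'props' nvlist consulted below
 * is keyed by zpool_prop_to_name(); a caller requesting an alternate
 * root and an explicit on-disk version might pass (values are
 * placeholders):
 *
 *	nvlist_t *props = fnvlist_alloc();
 *	fnvlist_add_string(props,
 *	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), "/mnt");
 *	fnvlist_add_uint64(props,
 *	    zpool_prop_to_name(ZPOOL_PROP_VERSION), SPA_VERSION);
 *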
*/ (void) nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); spa = spa_add(pool, NULL, altroot); spa_activate(spa, spa_mode_global); if (props && (error = spa_prop_validate(spa, props))) { spa_deactivate(spa); spa_remove(spa); mutex_exit(&spa_namespace_lock); return (error); } has_features = B_FALSE; for (nvpair_t *elem = nvlist_next_nvpair(props, NULL); elem != NULL; elem = nvlist_next_nvpair(props, elem)) { if (zpool_prop_feature(nvpair_name(elem))) has_features = B_TRUE; } if (has_features || nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) { version = SPA_VERSION; } ASSERT(SPA_VERSION_IS_SUPPORTED(version)); spa->spa_first_txg = txg; spa->spa_uberblock.ub_txg = txg - 1; spa->spa_uberblock.ub_version = version; spa->spa_ubsync = spa->spa_uberblock; spa->spa_load_state = SPA_LOAD_CREATE; /* * Create "The Godfather" zio to hold all async IOs */ spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), KM_SLEEP); for (int i = 0; i < max_ncpus; i++) { spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); } /* * Create the root vdev. */ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); ASSERT(error != 0 || rvd != NULL); ASSERT(error != 0 || spa->spa_root_vdev == rvd); if (error == 0 && !zfs_allocatable_devs(nvroot)) error = SET_ERROR(EINVAL); if (error == 0 && (error = vdev_create(rvd, txg, B_FALSE)) == 0 && (error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) == 0) { for (int c = 0; c < rvd->vdev_children; c++) { vdev_ashift_optimize(rvd->vdev_child[c]); vdev_metaslab_set_size(rvd->vdev_child[c]); vdev_expand(rvd->vdev_child[c], txg); } } spa_config_exit(spa, SCL_ALL, FTAG); if (error != 0) { spa_unload(spa); spa_deactivate(spa); spa_remove(spa); mutex_exit(&spa_namespace_lock); return (error); } /* * Get the list of spares, if specified. */ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) { VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, spares, nspares) == 0); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_spares(spa); spa_config_exit(spa, SCL_ALL, FTAG); spa->spa_spares.sav_sync = B_TRUE; } /* * Get the list of level 2 cache devices, if specified. */ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0) { VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_l2cache(spa); spa_config_exit(spa, SCL_ALL, FTAG); spa->spa_l2cache.sav_sync = B_TRUE; } spa->spa_is_initializing = B_TRUE; spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg); spa->spa_meta_objset = dp->dp_meta_objset; spa->spa_is_initializing = B_FALSE; /* * Create DDTs (dedup tables). */ ddt_create(spa); spa_update_dspace(spa); tx = dmu_tx_create_assigned(dp, txg); /* * Create the pool config object. 
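 *
 * The code below seeds the MOS pool directory
 * (DMU_POOL_DIRECTORY_OBJECT) for the new pool; the entries created in
 * this function are, roughly:
 *
 *	DMU_POOL_CONFIG            -> packed-nvlist config object
 *	DMU_POOL_CREATION_VERSION  -> SPA version at creation time
 *	DMU_POOL_DEFLATE           -> raid-z deflate flag (new versions)
 *	DMU_POOL_SYNC_BPOBJ        -> deferred-free bpobj
 *	plus the feature ZAPs and the history object, version permitting
 *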
*/ spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE, DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); if (zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { cmn_err(CE_PANIC, "failed to add pool config"); } if (spa_version(spa) >= SPA_VERSION_FEATURES) spa_feature_create_zap_objects(spa, tx); if (zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION, sizeof (uint64_t), 1, &version, tx) != 0) { cmn_err(CE_PANIC, "failed to add pool version"); } /* Newly created pools with the right version are always deflated. */ if (version >= SPA_VERSION_RAIDZ_DEFLATE) { spa->spa_deflate = TRUE; if (zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { cmn_err(CE_PANIC, "failed to add deflate"); } } /* * Create the deferred-free bpobj. Turn off compression * because sync-to-convergence takes longer if the blocksize * keeps changing. */ obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx); dmu_object_set_compress(spa->spa_meta_objset, obj, ZIO_COMPRESS_OFF, tx); if (zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ, sizeof (uint64_t), 1, &obj, tx) != 0) { cmn_err(CE_PANIC, "failed to add bpobj"); } VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj)); /* * Create the pool's history object. */ if (version >= SPA_VERSION_ZPOOL_HISTORY) spa_history_create_obj(spa, tx); /* * Generate some random noise for salted checksums to operate on. */ (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, sizeof (spa->spa_cksum_salt.zcs_bytes)); /* * Set pool properties. */ spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); if (props != NULL) { spa_configfile_set(spa, props, B_FALSE); spa_sync_props(props, tx); } dmu_tx_commit(tx); spa->spa_sync_on = B_TRUE; txg_sync_start(spa->spa_dsl_pool); /* * We explicitly wait for the first transaction to complete so that our * bean counters are appropriately updated. */ txg_wait_synced(spa->spa_dsl_pool, txg); spa_config_sync(spa, B_FALSE, B_TRUE); - spa_event_notify(spa, NULL, ESC_ZFS_POOL_CREATE); + spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_CREATE); spa_history_log_version(spa, "create"); /* * Don't count references from objsets that are already closed * and are making their way through the eviction process. */ spa_evicting_os_wait(spa); spa->spa_minref = refcount_count(&spa->spa_refcount); spa->spa_load_state = SPA_LOAD_NONE; mutex_exit(&spa_namespace_lock); return (0); } #ifdef _KERNEL #ifdef illumos /* * Get the root pool information from the root disk, then import the root pool * during the system boot up time. */ extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **); static nvlist_t * spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid) { nvlist_t *config; nvlist_t *nvtop, *nvroot; uint64_t pgid; if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0) return (NULL); /* * Add this top-level vdev to the child array. 
*/ VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtop) == 0); VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pgid) == 0); VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0); /* * Put this pool's top-level vdevs into a root vdev. */ VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) == 0); VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &nvtop, 1) == 0); /* * Replace the existing vdev_tree with the new root vdev in * this pool's configuration (remove the old, add the new). */ VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); nvlist_free(nvroot); return (config); } /* * Walk the vdev tree and see if we can find a device with "better" * configuration. A configuration is "better" if the label on that * device has a more recent txg. */ static void spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg) { for (int c = 0; c < vd->vdev_children; c++) spa_alt_rootvdev(vd->vdev_child[c], avd, txg); if (vd->vdev_ops->vdev_op_leaf) { nvlist_t *label; uint64_t label_txg; if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid, &label) != 0) return; VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG, &label_txg) == 0); /* * Do we have a better boot device? */ if (label_txg > *txg) { *txg = label_txg; *avd = vd; } nvlist_free(label); } } /* * Import a root pool. * * For x86. devpath_list will consist of devid and/or physpath name of * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a"). * The GRUB "findroot" command will return the vdev we should boot. * * For Sparc, devpath_list consists the physpath name of the booting device * no matter the rootpool is a single device pool or a mirrored pool. * e.g. * "/pci@1f,0/ide@d/disk@0,0:a" */ int spa_import_rootpool(char *devpath, char *devid) { spa_t *spa; vdev_t *rvd, *bvd, *avd = NULL; nvlist_t *config, *nvtop; uint64_t guid, txg; char *pname; int error; /* * Read the label from the boot device and generate a configuration. */ config = spa_generate_rootconf(devpath, devid, &guid); #if defined(_OBP) && defined(_KERNEL) if (config == NULL) { if (strstr(devpath, "/iscsi/ssd") != NULL) { /* iscsi boot */ get_iscsi_bootpath_phy(devpath); config = spa_generate_rootconf(devpath, devid, &guid); } } #endif if (config == NULL) { cmn_err(CE_NOTE, "Cannot read the pool label from '%s'", devpath); return (SET_ERROR(EIO)); } VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, &pname) == 0); VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(pname)) != NULL) { /* * Remove the existing root pool from the namespace so that we * can replace it with the correct config we just read in. */ spa_remove(spa); } spa = spa_add(pname, config, NULL); spa->spa_is_root = B_TRUE; spa->spa_import_flags = ZFS_IMPORT_VERBATIM; /* * Build up a vdev tree based on the boot device's label config. 
*/ VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtop) == 0); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, VDEV_ALLOC_ROOTPOOL); spa_config_exit(spa, SCL_ALL, FTAG); if (error) { mutex_exit(&spa_namespace_lock); nvlist_free(config); cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", pname); return (error); } /* * Get the boot vdev. */ if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) { cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu", (u_longlong_t)guid); error = SET_ERROR(ENOENT); goto out; } /* * Determine if there is a better boot device. */ avd = bvd; spa_alt_rootvdev(rvd, &avd, &txg); if (avd != bvd) { cmn_err(CE_NOTE, "The boot device is 'degraded'. Please " "try booting from '%s'", avd->vdev_path); error = SET_ERROR(EINVAL); goto out; } /* * If the boot device is part of a spare vdev then ensure that * we're booting off the active spare. */ if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops && !bvd->vdev_isspare) { cmn_err(CE_NOTE, "The boot device is currently spared. Please " "try booting from '%s'", bvd->vdev_parent-> vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path); error = SET_ERROR(EINVAL); goto out; } error = 0; out: spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); vdev_free(rvd); spa_config_exit(spa, SCL_ALL, FTAG); mutex_exit(&spa_namespace_lock); nvlist_free(config); return (error); } #else /* !illumos */ extern int vdev_geom_read_pool_label(const char *name, nvlist_t ***configs, uint64_t *count); static nvlist_t * spa_generate_rootconf(const char *name) { nvlist_t **configs, **tops; nvlist_t *config; nvlist_t *best_cfg, *nvtop, *nvroot; uint64_t *holes; uint64_t best_txg; uint64_t nchildren; uint64_t pgid; uint64_t count; uint64_t i; uint_t nholes; if (vdev_geom_read_pool_label(name, &configs, &count) != 0) return (NULL); ASSERT3U(count, !=, 0); best_txg = 0; for (i = 0; i < count; i++) { uint64_t txg; VERIFY(nvlist_lookup_uint64(configs[i], ZPOOL_CONFIG_POOL_TXG, &txg) == 0); if (txg > best_txg) { best_txg = txg; best_cfg = configs[i]; } } nchildren = 1; nvlist_lookup_uint64(best_cfg, ZPOOL_CONFIG_VDEV_CHILDREN, &nchildren); holes = NULL; nvlist_lookup_uint64_array(best_cfg, ZPOOL_CONFIG_HOLE_ARRAY, &holes, &nholes); tops = kmem_zalloc(nchildren * sizeof(void *), KM_SLEEP); for (i = 0; i < nchildren; i++) { if (i >= count) break; if (configs[i] == NULL) continue; VERIFY(nvlist_lookup_nvlist(configs[i], ZPOOL_CONFIG_VDEV_TREE, &nvtop) == 0); nvlist_dup(nvtop, &tops[i], KM_SLEEP); } for (i = 0; holes != NULL && i < nholes; i++) { if (i >= nchildren) continue; if (tops[holes[i]] != NULL) continue; nvlist_alloc(&tops[holes[i]], NV_UNIQUE_NAME, KM_SLEEP); VERIFY(nvlist_add_string(tops[holes[i]], ZPOOL_CONFIG_TYPE, VDEV_TYPE_HOLE) == 0); VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_ID, holes[i]) == 0); VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_GUID, 0) == 0); } for (i = 0; i < nchildren; i++) { if (tops[i] != NULL) continue; nvlist_alloc(&tops[i], NV_UNIQUE_NAME, KM_SLEEP); VERIFY(nvlist_add_string(tops[i], ZPOOL_CONFIG_TYPE, VDEV_TYPE_MISSING) == 0); VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_ID, i) == 0); VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_GUID, 0) == 0); } /* * Create pool config based on the best vdev config. */ nvlist_dup(best_cfg, &config, KM_SLEEP); /* * Put this pool's top-level vdevs into a root vdev. 
*/ VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pgid) == 0); VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) == 0); VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, tops, nchildren) == 0); /* * Replace the existing vdev_tree with the new root vdev in * this pool's configuration (remove the old, add the new). */ VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); /* * Drop vdev config elements that should not be present at pool level. */ nvlist_remove(config, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64); nvlist_remove(config, ZPOOL_CONFIG_TOP_GUID, DATA_TYPE_UINT64); for (i = 0; i < count; i++) nvlist_free(configs[i]); kmem_free(configs, count * sizeof(void *)); for (i = 0; i < nchildren; i++) nvlist_free(tops[i]); kmem_free(tops, nchildren * sizeof(void *)); nvlist_free(nvroot); return (config); } int spa_import_rootpool(const char *name) { spa_t *spa; vdev_t *rvd, *bvd, *avd = NULL; nvlist_t *config, *nvtop; uint64_t txg; char *pname; int error; /* * Read the label from the boot device and generate a configuration. */ config = spa_generate_rootconf(name); mutex_enter(&spa_namespace_lock); if (config != NULL) { VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, &pname) == 0 && strcmp(name, pname) == 0); VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); if ((spa = spa_lookup(pname)) != NULL) { /* * The pool could already be imported, * e.g., after reboot -r. */ if (spa->spa_state == POOL_STATE_ACTIVE) { mutex_exit(&spa_namespace_lock); nvlist_free(config); return (0); } /* * Remove the existing root pool from the namespace so * that we can replace it with the correct config * we just read in. */ spa_remove(spa); } spa = spa_add(pname, config, NULL); /* * Set spa_ubsync.ub_version as it can be used in vdev_alloc() * via spa_version(). */ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &spa->spa_ubsync.ub_version) != 0) spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; } else if ((spa = spa_lookup(name)) == NULL) { mutex_exit(&spa_namespace_lock); nvlist_free(config); cmn_err(CE_NOTE, "Cannot find the pool label for '%s'", name); return (EIO); } else { VERIFY(nvlist_dup(spa->spa_config, &config, KM_SLEEP) == 0); } spa->spa_is_root = B_TRUE; spa->spa_import_flags = ZFS_IMPORT_VERBATIM; /* * Build up a vdev tree based on the boot device's label config. */ VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtop) == 0); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, VDEV_ALLOC_ROOTPOOL); spa_config_exit(spa, SCL_ALL, FTAG); if (error) { mutex_exit(&spa_namespace_lock); nvlist_free(config); cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", pname); return (error); } spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); vdev_free(rvd); spa_config_exit(spa, SCL_ALL, FTAG); mutex_exit(&spa_namespace_lock); nvlist_free(config); return (0); } #endif /* illumos */ #endif /* _KERNEL */ /* * Import a non-root pool into the system. 
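 *
 * Two paths follow.  A verbatim import (ZFS_IMPORT_VERBATIM, as used by
 * the root-pool import above) only inserts the supplied config into the
 * namespace and syncs the cache file; a normal import goes through
 * spa_load_best() with the user-supplied config trusted as the MOS
 * config, rewinding if the rewind policy asks for it.
 *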
*/ int spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) { spa_t *spa; char *altroot = NULL; spa_load_state_t state = SPA_LOAD_IMPORT; zpool_rewind_policy_t policy; uint64_t mode = spa_mode_global; uint64_t readonly = B_FALSE; int error; nvlist_t *nvroot; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; /* * If a pool with this name exists, return failure. */ mutex_enter(&spa_namespace_lock); if (spa_lookup(pool) != NULL) { mutex_exit(&spa_namespace_lock); return (SET_ERROR(EEXIST)); } /* * Create and initialize the spa structure. */ (void) nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); (void) nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly); if (readonly) mode = FREAD; spa = spa_add(pool, config, altroot); spa->spa_import_flags = flags; /* * Verbatim import - Take a pool and insert it into the namespace * as if it had been loaded at boot. */ if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) { if (props != NULL) spa_configfile_set(spa, props, B_FALSE); spa_config_sync(spa, B_FALSE, B_TRUE); - spa_event_notify(spa, NULL, ESC_ZFS_POOL_IMPORT); + spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT); mutex_exit(&spa_namespace_lock); return (0); } spa_activate(spa, mode); /* * Don't start async tasks until we know everything is healthy. */ spa_async_suspend(spa); zpool_get_rewind_policy(config, &policy); if (policy.zrp_request & ZPOOL_DO_REWIND) state = SPA_LOAD_RECOVER; /* * Pass off the heavy lifting to spa_load(). Pass TRUE for mosconfig * because the user-supplied config is actually the one to trust when * doing an import. */ if (state != SPA_LOAD_RECOVER) spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg, policy.zrp_request); /* * Propagate anything learned while loading the pool and pass it * back to caller (i.e. rewind info, missing devices, etc). */ VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, spa->spa_load_info) == 0); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); /* * Toss any existing sparelist, as it doesn't have any validity * anymore, and conflicts with spa_has_spare(). */ if (spa->spa_spares.sav_config) { nvlist_free(spa->spa_spares.sav_config); spa->spa_spares.sav_config = NULL; spa_load_spares(spa); } if (spa->spa_l2cache.sav_config) { nvlist_free(spa->spa_l2cache.sav_config); spa->spa_l2cache.sav_config = NULL; spa_load_l2cache(spa); } VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); if (error == 0) error = spa_validate_aux(spa, nvroot, -1ULL, VDEV_ALLOC_SPARE); if (error == 0) error = spa_validate_aux(spa, nvroot, -1ULL, VDEV_ALLOC_L2CACHE); spa_config_exit(spa, SCL_ALL, FTAG); if (props != NULL) spa_configfile_set(spa, props, B_FALSE); if (error != 0 || (props && spa_writeable(spa) && (error = spa_prop_set(spa, props)))) { spa_unload(spa); spa_deactivate(spa); spa_remove(spa); mutex_exit(&spa_namespace_lock); return (error); } spa_async_resume(spa); /* * Override any spares and level 2 cache devices as specified by * the user, as these may have correct device names/devids, etc. 
*/ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) { if (spa->spa_spares.sav_config) VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); else VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, spares, nspares) == 0); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_spares(spa); spa_config_exit(spa, SCL_ALL, FTAG); spa->spa_spares.sav_sync = B_TRUE; } if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0) { if (spa->spa_l2cache.sav_config) VERIFY(nvlist_remove(spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); else VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_l2cache(spa); spa_config_exit(spa, SCL_ALL, FTAG); spa->spa_l2cache.sav_sync = B_TRUE; } /* * Check for any removed devices. */ if (spa->spa_autoreplace) { spa_aux_check_removed(&spa->spa_spares); spa_aux_check_removed(&spa->spa_l2cache); } if (spa_writeable(spa)) { /* * Update the config cache to include the newly-imported pool. */ spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); } /* * It's possible that the pool was expanded while it was exported. * We kick off an async task to handle this for us. */ spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); spa_history_log_version(spa, "import"); - spa_event_notify(spa, NULL, ESC_ZFS_POOL_IMPORT); + spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT); mutex_exit(&spa_namespace_lock); #ifdef __FreeBSD__ #ifdef _KERNEL zvol_create_minors(pool); #endif #endif return (0); } nvlist_t * spa_tryimport(nvlist_t *tryconfig) { nvlist_t *config = NULL; char *poolname; spa_t *spa; uint64_t state; int error; if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) return (NULL); if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) return (NULL); /* * Create and initialize the spa structure. */ mutex_enter(&spa_namespace_lock); spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL); spa_activate(spa, FREAD); /* * Pass off the heavy lifting to spa_load(). * Pass TRUE for mosconfig because the user-supplied config * is actually the one to trust when doing an import. */ error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE); /* * If 'tryconfig' was at least parsable, return the current config. */ if (spa->spa_root_vdev != NULL) { config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, poolname) == 0); VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, state) == 0); VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, spa->spa_uberblock.ub_timestamp) == 0); VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, spa->spa_load_info) == 0); /* * If the bootfs property exists on this pool then we * copy it out so that external consumers can tell which * pools are bootable. */ if ((!error || error == EEXIST) && spa->spa_bootfs) { char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); /* * We have to play games with the name since the * pool was opened as TRYIMPORT_NAME. 
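 *
 * Concretely (the dataset names here are illustrative): the pool was
 * opened under the placeholder TRYIMPORT_NAME, so if
 * dsl_dsobj_to_dsname() returns something like "$import/ROOT/default"
 * (the exact prefix is whatever TRYIMPORT_NAME expands to) and the pool
 * being probed is "tank", the code below rewrites it to
 * "tank/ROOT/default" before adding it to the config as
 * ZPOOL_CONFIG_BOOTFS.
 *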
*/ if (dsl_dsobj_to_dsname(spa_name(spa), spa->spa_bootfs, tmpname) == 0) { char *cp; char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); cp = strchr(tmpname, '/'); if (cp == NULL) { (void) strlcpy(dsname, tmpname, MAXPATHLEN); } else { (void) snprintf(dsname, MAXPATHLEN, "%s/%s", poolname, ++cp); } VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_BOOTFS, dsname) == 0); kmem_free(dsname, MAXPATHLEN); } kmem_free(tmpname, MAXPATHLEN); } /* * Add the list of hot spares and level 2 cache devices. */ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); spa_add_spares(spa, config); spa_add_l2cache(spa, config); spa_config_exit(spa, SCL_CONFIG, FTAG); } spa_unload(spa); spa_deactivate(spa); spa_remove(spa); mutex_exit(&spa_namespace_lock); return (config); } /* * Pool export/destroy * * The act of destroying or exporting a pool is very simple. We make sure there * is no more pending I/O and any references to the pool are gone. Then, we * update the pool state and sync all the labels to disk, removing the * configuration from the cache afterwards. If the 'hardforce' flag is set, then * we don't sync the labels or remove the configuration cache. */ static int spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, boolean_t force, boolean_t hardforce) { spa_t *spa; if (oldconfig) *oldconfig = NULL; if (!(spa_mode_global & FWRITE)) return (SET_ERROR(EROFS)); mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(pool)) == NULL) { mutex_exit(&spa_namespace_lock); return (SET_ERROR(ENOENT)); } /* * Put a hold on the pool, drop the namespace lock, stop async tasks, * reacquire the namespace lock, and see if we can export. */ spa_open_ref(spa, FTAG); mutex_exit(&spa_namespace_lock); spa_async_suspend(spa); mutex_enter(&spa_namespace_lock); spa_close(spa, FTAG); /* * The pool will be in core if it's openable, * in which case we can modify its state. */ if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { /* * Objsets may be open only because they're dirty, so we * have to force it to sync before checking spa_refcnt. */ txg_wait_synced(spa->spa_dsl_pool, 0); spa_evicting_os_wait(spa); /* * A pool cannot be exported or destroyed if there are active * references. If we are resetting a pool, allow references by * fault injection handlers. */ if (!spa_refcount_zero(spa) || (spa->spa_inject_ref != 0 && new_state != POOL_STATE_UNINITIALIZED)) { spa_async_resume(spa); mutex_exit(&spa_namespace_lock); return (SET_ERROR(EBUSY)); } /* * A pool cannot be exported if it has an active shared spare. * This is to prevent other pools stealing the active spare * from an exported pool. At user's own will, such pool can * be forcedly exported. */ if (!force && new_state == POOL_STATE_EXPORTED && spa_has_active_shared_spare(spa)) { spa_async_resume(spa); mutex_exit(&spa_namespace_lock); return (SET_ERROR(EXDEV)); } /* * We want this to be reflected on every label, * so mark them all dirty. spa_unload() will do the * final sync that pushes these changes out. 
*/ if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa->spa_state = new_state; spa->spa_final_txg = spa_last_synced_txg(spa) + TXG_DEFER_SIZE + 1; vdev_config_dirty(spa->spa_root_vdev); spa_config_exit(spa, SCL_ALL, FTAG); } } - spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY); + spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_DESTROY); if (spa->spa_state != POOL_STATE_UNINITIALIZED) { spa_unload(spa); spa_deactivate(spa); } if (oldconfig && spa->spa_config) VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); if (new_state != POOL_STATE_UNINITIALIZED) { if (!hardforce) spa_config_sync(spa, B_TRUE, B_TRUE); spa_remove(spa); } mutex_exit(&spa_namespace_lock); return (0); } /* * Destroy a storage pool. */ int spa_destroy(char *pool) { return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, B_FALSE, B_FALSE)); } /* * Export a storage pool. */ int spa_export(char *pool, nvlist_t **oldconfig, boolean_t force, boolean_t hardforce) { return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, force, hardforce)); } /* * Similar to spa_export(), this unloads the spa_t without actually removing it * from the namespace in any way. */ int spa_reset(char *pool) { return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, B_FALSE, B_FALSE)); } /* * ========================================================================== * Device manipulation * ========================================================================== */ /* * Add a device to a storage pool. */ int spa_vdev_add(spa_t *spa, nvlist_t *nvroot) { uint64_t txg, id; int error; vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd, *tvd; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; ASSERT(spa_writeable(spa)); txg = spa_vdev_enter(spa); if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, VDEV_ALLOC_ADD)) != 0) return (spa_vdev_exit(spa, NULL, txg, error)); spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) != 0) nspares = 0; if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) != 0) nl2cache = 0; if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) return (spa_vdev_exit(spa, vd, txg, EINVAL)); if (vd->vdev_children != 0 && (error = vdev_create(vd, txg, B_FALSE)) != 0) return (spa_vdev_exit(spa, vd, txg, error)); /* * We must validate the spares and l2cache devices after checking the * children. Otherwise, vdev_inuse() will blindly overwrite the spare. */ if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) return (spa_vdev_exit(spa, vd, txg, error)); /* * Transfer each new top-level vdev from vd to rvd. */ for (int c = 0; c < vd->vdev_children; c++) { /* * Set the vdev id to the first hole, if one exists. */ for (id = 0; id < rvd->vdev_children; id++) { if (rvd->vdev_child[id]->vdev_ishole) { vdev_free(rvd->vdev_child[id]); break; } } tvd = vd->vdev_child[c]; vdev_remove_child(vd, tvd); tvd->vdev_id = id; vdev_add_child(rvd, tvd); vdev_config_dirty(tvd); } if (nspares != 0) { spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, ZPOOL_CONFIG_SPARES); spa_load_spares(spa); spa->spa_spares.sav_sync = B_TRUE; } if (nl2cache != 0) { spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, ZPOOL_CONFIG_L2CACHE); spa_load_l2cache(spa); spa->spa_l2cache.sav_sync = B_TRUE; } /* * We have to be careful when adding new vdevs to an existing pool. 
* If other threads start allocating from these vdevs before we * sync the config cache, and we lose power, then upon reboot we may * fail to open the pool because there are DVAs that the config cache * can't translate. Therefore, we first add the vdevs without * initializing metaslabs; sync the config cache (via spa_vdev_exit()); * and then let spa_config_update() initialize the new metaslabs. * * spa_load() checks for added-but-not-initialized vdevs, so that * if we lose power at any point in this sequence, the remaining * steps will be completed the next time we load the pool. */ (void) spa_vdev_exit(spa, vd, txg, 0); mutex_enter(&spa_namespace_lock); spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); - spa_event_notify(spa, NULL, ESC_ZFS_VDEV_ADD); + spa_event_notify(spa, NULL, NULL, ESC_ZFS_VDEV_ADD); mutex_exit(&spa_namespace_lock); return (0); } /* * Attach a device to a mirror. The arguments are the path to any device * in the mirror, and the nvroot for the new device. If the path specifies * a device that is not mirrored, we automatically insert the mirror vdev. * * If 'replacing' is specified, the new device is intended to replace the * existing device; in this case the two devices are made into their own * mirror using the 'replacing' vdev, which is functionally identical to * the mirror vdev (it actually reuses all the same ops) but has a few * extra rules: you can't attach to it after it's been created, and upon * completion of resilvering, the first disk (the one being replaced) * is automatically detached. */ int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) { uint64_t txg, dtl_max_txg; vdev_t *rvd = spa->spa_root_vdev; vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; vdev_ops_t *pvops; char *oldvdpath, *newvdpath; int newvd_isspare; int error; ASSERT(spa_writeable(spa)); txg = spa_vdev_enter(spa); oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); if (oldvd == NULL) return (spa_vdev_exit(spa, NULL, txg, ENODEV)); if (!oldvd->vdev_ops->vdev_op_leaf) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); pvd = oldvd->vdev_parent; if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, VDEV_ALLOC_ATTACH)) != 0) return (spa_vdev_exit(spa, NULL, txg, EINVAL)); if (newrootvd->vdev_children != 1) return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); newvd = newrootvd->vdev_child[0]; if (!newvd->vdev_ops->vdev_op_leaf) return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); if ((error = vdev_create(newrootvd, txg, replacing)) != 0) return (spa_vdev_exit(spa, newrootvd, txg, error)); /* * Spares can't replace logs */ if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); if (!replacing) { /* * For attach, the only allowable parent is a mirror or the root * vdev. */ if (pvd->vdev_ops != &vdev_mirror_ops && pvd->vdev_ops != &vdev_root_ops) return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); pvops = &vdev_mirror_ops; } else { /* * Active hot spares can only be replaced by inactive hot * spares. */ if (pvd->vdev_ops == &vdev_spare_ops && oldvd->vdev_isspare && !spa_has_spare(spa, newvd->vdev_guid)) return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); /* * If the source is a hot spare, and the parent isn't already a * spare, then we want to create a new hot spare. Otherwise, we * want to create a replacing vdev. The user is not allowed to * attach to a spared vdev child unless the 'isspare' state is * the same (spare replaces spare, non-spare replaces * non-spare). 
*/ if (pvd->vdev_ops == &vdev_replacing_ops && spa_version(spa) < SPA_VERSION_MULTI_REPLACE) { return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); } else if (pvd->vdev_ops == &vdev_spare_ops && newvd->vdev_isspare != oldvd->vdev_isspare) { return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); } if (newvd->vdev_isspare) pvops = &vdev_spare_ops; else pvops = &vdev_replacing_ops; } /* * Make sure the new device is big enough. */ if (newvd->vdev_asize < vdev_get_min_asize(oldvd)) return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); /* * The new device cannot have a higher alignment requirement * than the top-level vdev. */ if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); /* * If this is an in-place replacement, update oldvd's path and devid * to make it distinguishable from newvd, and unopenable from now on. */ if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { spa_strfree(oldvd->vdev_path); oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, KM_SLEEP); (void) sprintf(oldvd->vdev_path, "%s/%s", newvd->vdev_path, "old"); if (oldvd->vdev_devid != NULL) { spa_strfree(oldvd->vdev_devid); oldvd->vdev_devid = NULL; } } /* mark the device being resilvered */ newvd->vdev_resilver_txg = txg; /* * If the parent is not a mirror, or if we're replacing, insert the new * mirror/replacing/spare vdev above oldvd. */ if (pvd->vdev_ops != pvops) pvd = vdev_add_parent(oldvd, pvops); ASSERT(pvd->vdev_top->vdev_parent == rvd); ASSERT(pvd->vdev_ops == pvops); ASSERT(oldvd->vdev_parent == pvd); /* * Extract the new device from its root and add it to pvd. */ vdev_remove_child(newrootvd, newvd); newvd->vdev_id = pvd->vdev_children; newvd->vdev_crtxg = oldvd->vdev_crtxg; vdev_add_child(pvd, newvd); tvd = newvd->vdev_top; ASSERT(pvd->vdev_top == tvd); ASSERT(tvd->vdev_parent == rvd); vdev_config_dirty(tvd); /* * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account * for any dmu_sync-ed blocks. It will propagate upward when * spa_vdev_exit() calls vdev_dtl_reassess(). */ dtl_max_txg = txg + TXG_CONCURRENT_STATES; vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, dtl_max_txg - TXG_INITIAL); if (newvd->vdev_isspare) { spa_spare_activate(newvd); - spa_event_notify(spa, newvd, ESC_ZFS_VDEV_SPARE); + spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE); } oldvdpath = spa_strdup(oldvd->vdev_path); newvdpath = spa_strdup(newvd->vdev_path); newvd_isspare = newvd->vdev_isspare; /* * Mark newvd's DTL dirty in this txg. */ vdev_dirty(tvd, VDD_DTL, newvd, txg); /* * Schedule the resilver to restart in the future. We do this to * ensure that dmu_sync-ed blocks have been stitched into the * respective datasets. */ dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg); if (spa->spa_bootfs) - spa_event_notify(spa, newvd, ESC_ZFS_BOOTFS_VDEV_ATTACH); + spa_event_notify(spa, newvd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH); - spa_event_notify(spa, newvd, ESC_ZFS_VDEV_ATTACH); + spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_ATTACH); /* * Commit the config */ (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0); spa_history_log_internal(spa, "vdev attach", NULL, "%s vdev=%s %s vdev=%s", replacing && newvd_isspare ? "spare in" : replacing ? "replace" : "attach", newvdpath, replacing ? "for" : "to", oldvdpath); spa_strfree(oldvdpath); spa_strfree(newvdpath); return (0); } /* * Detach a device from a mirror or replacing vdev. * * If 'replace_done' is specified, only detach if the parent * is a replacing vdev. 
*/ int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) { uint64_t txg; int error; vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd, *pvd, *cvd, *tvd; boolean_t unspare = B_FALSE; uint64_t unspare_guid = 0; char *vdpath; ASSERT(spa_writeable(spa)); txg = spa_vdev_enter(spa); vd = spa_lookup_by_guid(spa, guid, B_FALSE); if (vd == NULL) return (spa_vdev_exit(spa, NULL, txg, ENODEV)); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); pvd = vd->vdev_parent; /* * If the parent/child relationship is not as expected, don't do it. * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing * vdev that's replacing B with C. The user's intent in replacing * is to go from M(A,B) to M(A,C). If the user decides to cancel * the replace by detaching C, the expected behavior is to end up * M(A,B). But suppose that right after deciding to detach C, * the replacement of B completes. We would have M(A,C), and then * ask to detach C, which would leave us with just A -- not what * the user wanted. To prevent this, we make sure that the * parent/child relationship hasn't changed -- in this example, * that C's parent is still the replacing vdev R. */ if (pvd->vdev_guid != pguid && pguid != 0) return (spa_vdev_exit(spa, NULL, txg, EBUSY)); /* * Only 'replacing' or 'spare' vdevs can be replaced. */ if (replace_done && pvd->vdev_ops != &vdev_replacing_ops && pvd->vdev_ops != &vdev_spare_ops) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); ASSERT(pvd->vdev_ops != &vdev_spare_ops || spa_version(spa) >= SPA_VERSION_SPARES); /* * Only mirror, replacing, and spare vdevs support detach. */ if (pvd->vdev_ops != &vdev_replacing_ops && pvd->vdev_ops != &vdev_mirror_ops && pvd->vdev_ops != &vdev_spare_ops) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); /* * If this device has the only valid copy of some data, * we cannot safely detach it. */ if (vdev_dtl_required(vd)) return (spa_vdev_exit(spa, NULL, txg, EBUSY)); ASSERT(pvd->vdev_children >= 2); /* * If we are detaching the second disk from a replacing vdev, then * check to see if we changed the original vdev's path to have "/old" * at the end in spa_vdev_attach(). If so, undo that change now. */ if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 && vd->vdev_path != NULL) { size_t len = strlen(vd->vdev_path); for (int c = 0; c < pvd->vdev_children; c++) { cvd = pvd->vdev_child[c]; if (cvd == vd || cvd->vdev_path == NULL) continue; if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && strcmp(cvd->vdev_path + len, "/old") == 0) { spa_strfree(cvd->vdev_path); cvd->vdev_path = spa_strdup(vd->vdev_path); break; } } } /* * If we are detaching the original disk from a spare, then it implies * that the spare should become a real disk, and be removed from the * active spare list for the pool. */ if (pvd->vdev_ops == &vdev_spare_ops && vd->vdev_id == 0 && pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare) unspare = B_TRUE; /* * Erase the disk labels so the disk can be used for other things. * This must be done after all other error cases are handled, * but before we disembowel vd (so we can still do I/O to it). * But if we can't do it, don't treat the error as fatal -- * it may be that the unwritability of the disk is the reason * it's being detached! */ error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); /* * Remove vd from its parent and compact the parent's children. */ vdev_remove_child(pvd, vd); vdev_compact_children(pvd); /* * Remember one of the remaining children so we can get tvd below. 
*/ cvd = pvd->vdev_child[pvd->vdev_children - 1]; /* * If we need to remove the remaining child from the list of hot spares, * do it now, marking the vdev as no longer a spare in the process. * We must do this before vdev_remove_parent(), because that can * change the GUID if it creates a new toplevel GUID. For a similar * reason, we must remove the spare now, in the same txg as the detach; * otherwise someone could attach a new sibling, change the GUID, and * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail. */ if (unspare) { ASSERT(cvd->vdev_isspare); spa_spare_remove(cvd); unspare_guid = cvd->vdev_guid; (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); cvd->vdev_unspare = B_TRUE; } /* * If the parent mirror/replacing vdev only has one child, * the parent is no longer needed. Remove it from the tree. */ if (pvd->vdev_children == 1) { if (pvd->vdev_ops == &vdev_spare_ops) cvd->vdev_unspare = B_FALSE; vdev_remove_parent(cvd); } /* * We don't set tvd until now because the parent we just removed * may have been the previous top-level vdev. */ tvd = cvd->vdev_top; ASSERT(tvd->vdev_parent == rvd); /* * Reevaluate the parent vdev state. */ vdev_propagate_state(cvd); /* * If the 'autoexpand' property is set on the pool then automatically * try to expand the size of the pool. For example if the device we * just detached was smaller than the others, it may be possible to * add metaslabs (i.e. grow the pool). We need to reopen the vdev * first so that we can obtain the updated sizes of the leaf vdevs. */ if (spa->spa_autoexpand) { vdev_reopen(tvd); vdev_expand(tvd, txg); } vdev_config_dirty(tvd); /* * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that * vd->vdev_detached is set and free vd's DTL object in syncing context. * But first make sure we're not on any *other* txg's DTL list, to * prevent vd from being accessed after it's freed. */ vdpath = spa_strdup(vd->vdev_path); for (int t = 0; t < TXG_SIZE; t++) (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); vd->vdev_detached = B_TRUE; vdev_dirty(tvd, VDD_DTL, vd, txg); - spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE); + spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE); /* hang on to the spa before we release the lock */ spa_open_ref(spa, FTAG); error = spa_vdev_exit(spa, vd, txg, 0); spa_history_log_internal(spa, "detach", NULL, "vdev=%s", vdpath); spa_strfree(vdpath); /* * If this was the removal of the original device in a hot spare vdev, * then we want to go through and remove the device from the hot spare * list of every other pool. */ if (unspare) { spa_t *altspa = NULL; mutex_enter(&spa_namespace_lock); while ((altspa = spa_next(altspa)) != NULL) { if (altspa->spa_state != POOL_STATE_ACTIVE || altspa == spa) continue; spa_open_ref(altspa, FTAG); mutex_exit(&spa_namespace_lock); (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE); mutex_enter(&spa_namespace_lock); spa_close(altspa, FTAG); } mutex_exit(&spa_namespace_lock); /* search the rest of the vdevs for spares to remove */ spa_vdev_resilver_done(spa); } /* all done with the spa; OK to release */ mutex_enter(&spa_namespace_lock); spa_close(spa, FTAG); mutex_exit(&spa_namespace_lock); return (error); } /* * Split a set of devices from their mirrors, and create a new pool from them. 
*/ int spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, nvlist_t *props, boolean_t exp) { int error = 0; uint64_t txg, *glist; spa_t *newspa; uint_t c, children, lastlog; nvlist_t **child, *nvl, *tmp; dmu_tx_t *tx; char *altroot = NULL; vdev_t *rvd, **vml = NULL; /* vdev modify list */ boolean_t activate_slog; ASSERT(spa_writeable(spa)); txg = spa_vdev_enter(spa); /* clear the log and flush everything up to now */ activate_slog = spa_passivate_log(spa); (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); error = spa_offline_log(spa); txg = spa_vdev_config_enter(spa); if (activate_slog) spa_activate_log(spa); if (error != 0) return (spa_vdev_exit(spa, NULL, txg, error)); /* check new spa name before going any further */ if (spa_lookup(newname) != NULL) return (spa_vdev_exit(spa, NULL, txg, EEXIST)); /* * scan through all the children to ensure they're all mirrors */ if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) return (spa_vdev_exit(spa, NULL, txg, EINVAL)); /* first, check to ensure we've got the right child count */ rvd = spa->spa_root_vdev; lastlog = 0; for (c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; /* don't count the holes & logs as children */ if (vd->vdev_islog || vd->vdev_ishole) { if (lastlog == 0) lastlog = c; continue; } lastlog = 0; } if (children != (lastlog != 0 ? lastlog : rvd->vdev_children)) return (spa_vdev_exit(spa, NULL, txg, EINVAL)); /* next, ensure no spare or cache devices are part of the split */ if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 || nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0) return (spa_vdev_exit(spa, NULL, txg, EINVAL)); vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP); glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP); /* then, loop over each vdev and validate it */ for (c = 0; c < children; c++) { uint64_t is_hole = 0; (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, &is_hole); if (is_hole != 0) { if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole || spa->spa_root_vdev->vdev_child[c]->vdev_islog) { continue; } else { error = SET_ERROR(EINVAL); break; } } /* which disk is going to be split? 
*/ if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID, &glist[c]) != 0) { error = SET_ERROR(EINVAL); break; } /* look it up in the spa */ vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE); if (vml[c] == NULL) { error = SET_ERROR(ENODEV); break; } /* make sure there's nothing stopping the split */ if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops || vml[c]->vdev_islog || vml[c]->vdev_ishole || vml[c]->vdev_isspare || vml[c]->vdev_isl2cache || !vdev_writeable(vml[c]) || vml[c]->vdev_children != 0 || vml[c]->vdev_state != VDEV_STATE_HEALTHY || c != spa->spa_root_vdev->vdev_child[c]->vdev_id) { error = SET_ERROR(EINVAL); break; } if (vdev_dtl_required(vml[c])) { error = SET_ERROR(EBUSY); break; } /* we need certain info from the top level */ VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY, vml[c]->vdev_top->vdev_ms_array) == 0); VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT, vml[c]->vdev_top->vdev_ms_shift) == 0); VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE, vml[c]->vdev_top->vdev_asize) == 0); VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT, vml[c]->vdev_top->vdev_ashift) == 0); /* transfer per-vdev ZAPs */ ASSERT3U(vml[c]->vdev_leaf_zap, !=, 0); VERIFY0(nvlist_add_uint64(child[c], ZPOOL_CONFIG_VDEV_LEAF_ZAP, vml[c]->vdev_leaf_zap)); ASSERT3U(vml[c]->vdev_top->vdev_top_zap, !=, 0); VERIFY0(nvlist_add_uint64(child[c], ZPOOL_CONFIG_VDEV_TOP_ZAP, vml[c]->vdev_parent->vdev_top_zap)); } if (error != 0) { kmem_free(vml, children * sizeof (vdev_t *)); kmem_free(glist, children * sizeof (uint64_t)); return (spa_vdev_exit(spa, NULL, txg, error)); } /* stop writers from using the disks */ for (c = 0; c < children; c++) { if (vml[c] != NULL) vml[c]->vdev_offline = B_TRUE; } vdev_reopen(spa->spa_root_vdev); /* * Temporarily record the splitting vdevs in the spa config. This * will disappear once the config is regenerated. */ VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, glist, children) == 0); kmem_free(glist, children * sizeof (uint64_t)); mutex_enter(&spa->spa_props_lock); VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, nvl) == 0); mutex_exit(&spa->spa_props_lock); spa->spa_config_splitting = nvl; vdev_config_dirty(spa->spa_root_vdev); /* configure and create the new pool */ VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0); VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, exp ? 
POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0); VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, spa_version(spa)) == 0); VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, spa->spa_config_txg) == 0); VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, spa_generate_guid(NULL)) == 0); VERIFY0(nvlist_add_boolean(config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)); (void) nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); /* add the new pool to the namespace */ newspa = spa_add(newname, config, altroot); newspa->spa_avz_action = AVZ_ACTION_REBUILD; newspa->spa_config_txg = spa->spa_config_txg; spa_set_log_state(newspa, SPA_LOG_CLEAR); /* release the spa config lock, retaining the namespace lock */ spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); if (zio_injection_enabled) zio_handle_panic_injection(spa, FTAG, 1); spa_activate(newspa, spa_mode_global); spa_async_suspend(newspa); #ifndef illumos /* mark that we are creating new spa by splitting */ newspa->spa_splitting_newspa = B_TRUE; #endif /* create the new pool from the disks of the original pool */ error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE); #ifndef illumos newspa->spa_splitting_newspa = B_FALSE; #endif if (error) goto out; /* if that worked, generate a real config for the new pool */ if (newspa->spa_root_vdev != NULL) { VERIFY(nvlist_alloc(&newspa->spa_config_splitting, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_uint64(newspa->spa_config_splitting, ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0); spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL, B_TRUE)); } /* set the props */ if (props != NULL) { spa_configfile_set(newspa, props, B_FALSE); error = spa_prop_set(newspa, props); if (error) goto out; } /* flush everything */ txg = spa_vdev_config_enter(newspa); vdev_config_dirty(newspa->spa_root_vdev); (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG); if (zio_injection_enabled) zio_handle_panic_injection(spa, FTAG, 2); spa_async_resume(newspa); /* finally, update the original pool's config */ txg = spa_vdev_config_enter(spa); tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); error = dmu_tx_assign(tx, TXG_WAIT); if (error != 0) dmu_tx_abort(tx); for (c = 0; c < children; c++) { if (vml[c] != NULL) { vdev_split(vml[c]); if (error == 0) spa_history_log_internal(spa, "detach", tx, "vdev=%s", vml[c]->vdev_path); vdev_free(vml[c]); } } spa->spa_avz_action = AVZ_ACTION_REBUILD; vdev_config_dirty(spa->spa_root_vdev); spa->spa_config_splitting = NULL; nvlist_free(nvl); if (error == 0) dmu_tx_commit(tx); (void) spa_vdev_exit(spa, NULL, txg, 0); if (zio_injection_enabled) zio_handle_panic_injection(spa, FTAG, 3); /* split is complete; log a history record */ spa_history_log_internal(newspa, "split", NULL, "from pool %s", spa_name(spa)); kmem_free(vml, children * sizeof (vdev_t *)); /* if we're not going to mount the filesystems in userland, export */ if (exp) error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL, B_FALSE, B_FALSE); return (error); out: spa_unload(newspa); spa_deactivate(newspa); spa_remove(newspa); txg = spa_vdev_config_enter(spa); /* re-online all offlined disks */ for (c = 0; c < children; c++) { if (vml[c] != NULL) vml[c]->vdev_offline = B_FALSE; } vdev_reopen(spa->spa_root_vdev); nvlist_free(spa->spa_config_splitting); spa->spa_config_splitting = NULL; (void) spa_vdev_exit(spa, NULL, txg, error); kmem_free(vml, children * sizeof (vdev_t *)); return (error); } static nvlist_t * spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, 
uint64_t target_guid) { for (int i = 0; i < count; i++) { uint64_t guid; VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID, &guid) == 0); if (guid == target_guid) return (nvpp[i]); } return (NULL); } static void spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count, nvlist_t *dev_to_remove) { nvlist_t **newdev = NULL; if (count > 1) newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP); for (int i = 0, j = 0; i < count; i++) { if (dev[i] == dev_to_remove) continue; VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0); } VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0); VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0); for (int i = 0; i < count - 1; i++) nvlist_free(newdev[i]); if (count > 1) kmem_free(newdev, (count - 1) * sizeof (void *)); } /* * Evacuate the device. */ static int spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd) { uint64_t txg; int error = 0; ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); ASSERT(vd == vd->vdev_top); /* * Evacuate the device. We don't hold the config lock as writer * since we need to do I/O but we do keep the * spa_namespace_lock held. Once this completes the device * should no longer have any blocks allocated on it. */ if (vd->vdev_islog) { if (vd->vdev_stat.vs_alloc != 0) error = spa_offline_log(spa); } else { error = SET_ERROR(ENOTSUP); } if (error) return (error); /* * The evacuation succeeded. Remove any remaining MOS metadata * associated with this vdev, and wait for these changes to sync. */ ASSERT0(vd->vdev_stat.vs_alloc); txg = spa_vdev_config_enter(spa); vd->vdev_removing = B_TRUE; vdev_dirty_leaves(vd, VDD_DTL, txg); vdev_config_dirty(vd); spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); return (0); } /* * Complete the removal by cleaning up the namespace. */ static void spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd) { vdev_t *rvd = spa->spa_root_vdev; uint64_t id = vd->vdev_id; boolean_t last_vdev = (id == (rvd->vdev_children - 1)); ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); ASSERT(vd == vd->vdev_top); /* * Only remove any devices which are empty. */ if (vd->vdev_stat.vs_alloc != 0) return; (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); if (list_link_active(&vd->vdev_state_dirty_node)) vdev_state_clean(vd); if (list_link_active(&vd->vdev_config_dirty_node)) vdev_config_clean(vd); vdev_free(vd); if (last_vdev) { vdev_compact_children(rvd); } else { vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops); vdev_add_child(rvd, vd); } vdev_config_dirty(rvd); /* * Reassess the health of our root vdev. */ vdev_reopen(rvd); } /* * Remove a device from the pool - * * Removing a device from the vdev namespace requires several steps * and can take a significant amount of time. As a result we use * the spa_vdev_config_[enter/exit] functions which allow us to * grab and release the spa_config_lock while still holding the namespace * lock. During each step the configuration is synced out. * * Currently, this supports removing only hot spares, slogs, and level 2 ARC * devices. 
*/ int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) { vdev_t *vd; sysevent_t *ev = NULL; metaslab_group_t *mg; nvlist_t **spares, **l2cache, *nv; uint64_t txg = 0; uint_t nspares, nl2cache; int error = 0; boolean_t locked = MUTEX_HELD(&spa_namespace_lock); ASSERT(spa_writeable(spa)); if (!locked) txg = spa_vdev_enter(spa); vd = spa_lookup_by_guid(spa, guid, B_FALSE); if (spa->spa_spares.sav_vdevs != NULL && nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 && (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) { /* * Only remove the hot spare if it's not currently in use * in this pool. */ if (vd == NULL || unspare) { if (vd == NULL) vd = spa_lookup_by_guid(spa, guid, B_TRUE); - ev = spa_event_create(spa, vd, ESC_ZFS_VDEV_REMOVE_AUX); + ev = spa_event_create(spa, vd, NULL, + ESC_ZFS_VDEV_REMOVE_AUX); spa_vdev_remove_aux(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, spares, nspares, nv); spa_load_spares(spa); spa->spa_spares.sav_sync = B_TRUE; } else { error = SET_ERROR(EBUSY); } } else if (spa->spa_l2cache.sav_vdevs != NULL && nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 && (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) { /* * Cache devices can always be removed. */ vd = spa_lookup_by_guid(spa, guid, B_TRUE); - ev = spa_event_create(spa, vd, ESC_ZFS_VDEV_REMOVE_AUX); + ev = spa_event_create(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE_AUX); spa_vdev_remove_aux(spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv); spa_load_l2cache(spa); spa->spa_l2cache.sav_sync = B_TRUE; } else if (vd != NULL && vd->vdev_islog) { ASSERT(!locked); ASSERT(vd == vd->vdev_top); mg = vd->vdev_mg; /* * Stop allocating from this vdev. */ metaslab_group_passivate(mg); /* * Wait for the youngest allocations and frees to sync, * and then wait for the deferral of those frees to finish. */ spa_vdev_config_exit(spa, NULL, txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); /* * Attempt to evacuate the vdev. */ error = spa_vdev_remove_evacuate(spa, vd); txg = spa_vdev_config_enter(spa); /* * If we couldn't evacuate the vdev, unwind. */ if (error) { metaslab_group_activate(mg); return (spa_vdev_exit(spa, NULL, txg, error)); } /* * Clean up the vdev namespace. */ - ev = spa_event_create(spa, vd, ESC_ZFS_VDEV_REMOVE_DEV); + ev = spa_event_create(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE_DEV); spa_vdev_remove_from_namespace(spa, vd); } else if (vd != NULL) { /* * Normal vdevs cannot be removed (yet). */ error = SET_ERROR(ENOTSUP); } else { /* * There is no vdev of any kind with the specified guid. */ error = SET_ERROR(ENOENT); } if (!locked) error = spa_vdev_exit(spa, NULL, txg, error); if (ev) spa_event_post(ev); return (error); } /* * Find any device that's done replacing, or a vdev marked 'unspare' that's * currently spared, so we can detach it. */ static vdev_t * spa_vdev_resilver_done_hunt(vdev_t *vd) { vdev_t *newvd, *oldvd; for (int c = 0; c < vd->vdev_children; c++) { oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); if (oldvd != NULL) return (oldvd); } /* * Check for a completed replacement. We always consider the first * vdev in the list to be the oldest vdev, and the last one to be * the newest (see spa_vdev_attach() for how that works). In * the case where the newest vdev is faulted, we will not automatically * remove it after a resilver completes. 
This is OK as it will require * user intervention to determine which disk the admin wishes to keep. */ if (vd->vdev_ops == &vdev_replacing_ops) { ASSERT(vd->vdev_children > 1); newvd = vd->vdev_child[vd->vdev_children - 1]; oldvd = vd->vdev_child[0]; if (vdev_dtl_empty(newvd, DTL_MISSING) && vdev_dtl_empty(newvd, DTL_OUTAGE) && !vdev_dtl_required(oldvd)) return (oldvd); } /* * Check for a completed resilver with the 'unspare' flag set. */ if (vd->vdev_ops == &vdev_spare_ops) { vdev_t *first = vd->vdev_child[0]; vdev_t *last = vd->vdev_child[vd->vdev_children - 1]; if (last->vdev_unspare) { oldvd = first; newvd = last; } else if (first->vdev_unspare) { oldvd = last; newvd = first; } else { oldvd = NULL; } if (oldvd != NULL && vdev_dtl_empty(newvd, DTL_MISSING) && vdev_dtl_empty(newvd, DTL_OUTAGE) && !vdev_dtl_required(oldvd)) return (oldvd); /* * If there are more than two spares attached to a disk, * and those spares are not required, then we want to * attempt to free them up now so that they can be used * by other pools. Once we're back down to a single * disk+spare, we stop removing them. */ if (vd->vdev_children > 2) { newvd = vd->vdev_child[1]; if (newvd->vdev_isspare && last->vdev_isspare && vdev_dtl_empty(last, DTL_MISSING) && vdev_dtl_empty(last, DTL_OUTAGE) && !vdev_dtl_required(newvd)) return (newvd); } } return (NULL); } static void spa_vdev_resilver_done(spa_t *spa) { vdev_t *vd, *pvd, *ppvd; uint64_t guid, sguid, pguid, ppguid; spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { pvd = vd->vdev_parent; ppvd = pvd->vdev_parent; guid = vd->vdev_guid; pguid = pvd->vdev_guid; ppguid = ppvd->vdev_guid; sguid = 0; /* * If we have just finished replacing a hot spared device, then * we need to detach the parent's first child (the original hot * spare) as well. */ if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 && ppvd->vdev_children == 2) { ASSERT(pvd->vdev_ops == &vdev_replacing_ops); sguid = ppvd->vdev_child[1]->vdev_guid; } ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd)); spa_config_exit(spa, SCL_ALL, FTAG); if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0) return; if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0) return; spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); } spa_config_exit(spa, SCL_ALL, FTAG); } /* * Update the stored path or FRU for this vdev. */ int spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, boolean_t ispath) { vdev_t *vd; boolean_t sync = B_FALSE; ASSERT(spa_writeable(spa)); spa_vdev_state_enter(spa, SCL_ALL); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, ENOENT)); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); if (ispath) { if (strcmp(value, vd->vdev_path) != 0) { spa_strfree(vd->vdev_path); vd->vdev_path = spa_strdup(value); sync = B_TRUE; } } else { if (vd->vdev_fru == NULL) { vd->vdev_fru = spa_strdup(value); sync = B_TRUE; } else if (strcmp(value, vd->vdev_fru) != 0) { spa_strfree(vd->vdev_fru); vd->vdev_fru = spa_strdup(value); sync = B_TRUE; } } return (spa_vdev_state_exit(spa, sync ? 
vd : NULL, 0)); } int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) { return (spa_vdev_set_common(spa, guid, newpath, B_TRUE)); } int spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) { return (spa_vdev_set_common(spa, guid, newfru, B_FALSE)); } /* * ========================================================================== * SPA Scanning * ========================================================================== */ int spa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t cmd) { ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); if (dsl_scan_resilvering(spa->spa_dsl_pool)) return (SET_ERROR(EBUSY)); return (dsl_scrub_set_pause_resume(spa->spa_dsl_pool, cmd)); } int spa_scan_stop(spa_t *spa) { ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); if (dsl_scan_resilvering(spa->spa_dsl_pool)) return (SET_ERROR(EBUSY)); return (dsl_scan_cancel(spa->spa_dsl_pool)); } int spa_scan(spa_t *spa, pool_scan_func_t func) { ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE) return (SET_ERROR(ENOTSUP)); /* * If a resilver was requested, but there is no DTL on a * writeable leaf device, we have nothing to do. */ if (func == POOL_SCAN_RESILVER && !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); return (0); } return (dsl_scan(spa->spa_dsl_pool, func)); } /* * ========================================================================== * SPA async task processing * ========================================================================== */ static void spa_async_remove(spa_t *spa, vdev_t *vd) { if (vd->vdev_remove_wanted) { vd->vdev_remove_wanted = B_FALSE; vd->vdev_delayed_close = B_FALSE; vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); /* * We want to clear the stats, but we don't want to do a full * vdev_clear() as that will cause us to throw away * degraded/faulted state as well as attempt to reopen the * device, all of which is a waste. */ vd->vdev_stat.vs_read_errors = 0; vd->vdev_stat.vs_write_errors = 0; vd->vdev_stat.vs_checksum_errors = 0; vdev_state_dirty(vd->vdev_top); /* Tell userspace that the vdev is gone. 
*/ zfs_post_remove(spa, vd); } for (int c = 0; c < vd->vdev_children; c++) spa_async_remove(spa, vd->vdev_child[c]); } static void spa_async_probe(spa_t *spa, vdev_t *vd) { if (vd->vdev_probe_wanted) { vd->vdev_probe_wanted = B_FALSE; vdev_reopen(vd); /* vdev_open() does the actual probe */ } for (int c = 0; c < vd->vdev_children; c++) spa_async_probe(spa, vd->vdev_child[c]); } static void spa_async_autoexpand(spa_t *spa, vdev_t *vd) { sysevent_id_t eid; nvlist_t *attr; char *physpath; if (!spa->spa_autoexpand) return; for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; spa_async_autoexpand(spa, cvd); } if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL) return; physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP); (void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath); VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0); (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS, ESC_ZFS_VDEV_AUTOEXPAND, attr, &eid, DDI_SLEEP); nvlist_free(attr); kmem_free(physpath, MAXPATHLEN); } static void spa_async_thread(void *arg) { spa_t *spa = arg; int tasks; ASSERT(spa->spa_sync_on); mutex_enter(&spa->spa_async_lock); tasks = spa->spa_async_tasks; spa->spa_async_tasks &= SPA_ASYNC_REMOVE; mutex_exit(&spa->spa_async_lock); /* * See if the config needs to be updated. */ if (tasks & SPA_ASYNC_CONFIG_UPDATE) { uint64_t old_space, new_space; mutex_enter(&spa_namespace_lock); old_space = metaslab_class_get_space(spa_normal_class(spa)); spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); new_space = metaslab_class_get_space(spa_normal_class(spa)); mutex_exit(&spa_namespace_lock); /* * If the pool grew as a result of the config update, * then log an internal history event. */ if (new_space != old_space) { spa_history_log_internal(spa, "vdev online", NULL, "pool '%s' size: %llu(+%llu)", spa_name(spa), new_space, new_space - old_space); } } if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) { spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); spa_async_autoexpand(spa, spa->spa_root_vdev); spa_config_exit(spa, SCL_CONFIG, FTAG); } /* * See if any devices need to be probed. */ if (tasks & SPA_ASYNC_PROBE) { spa_vdev_state_enter(spa, SCL_NONE); spa_async_probe(spa, spa->spa_root_vdev); (void) spa_vdev_state_exit(spa, NULL, 0); } /* * If any devices are done replacing, detach them. */ if (tasks & SPA_ASYNC_RESILVER_DONE) spa_vdev_resilver_done(spa); /* * Kick off a resilver. */ if (tasks & SPA_ASYNC_RESILVER) dsl_resilver_restart(spa->spa_dsl_pool, 0); /* * Let the world know that we're done. */ mutex_enter(&spa->spa_async_lock); spa->spa_async_thread = NULL; cv_broadcast(&spa->spa_async_cv); mutex_exit(&spa->spa_async_lock); thread_exit(); } static void spa_async_thread_vd(void *arg) { spa_t *spa = arg; int tasks; mutex_enter(&spa->spa_async_lock); tasks = spa->spa_async_tasks; retry: spa->spa_async_tasks &= ~SPA_ASYNC_REMOVE; mutex_exit(&spa->spa_async_lock); /* * See if any devices need to be marked REMOVED. */ if (tasks & SPA_ASYNC_REMOVE) { spa_vdev_state_enter(spa, SCL_NONE); spa_async_remove(spa, spa->spa_root_vdev); for (int i = 0; i < spa->spa_l2cache.sav_count; i++) spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); for (int i = 0; i < spa->spa_spares.sav_count; i++) spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); (void) spa_vdev_state_exit(spa, NULL, 0); } /* * Let the world know that we're done. 
*/ mutex_enter(&spa->spa_async_lock); tasks = spa->spa_async_tasks; if ((tasks & SPA_ASYNC_REMOVE) != 0) goto retry; spa->spa_async_thread_vd = NULL; cv_broadcast(&spa->spa_async_cv); mutex_exit(&spa->spa_async_lock); thread_exit(); } void spa_async_suspend(spa_t *spa) { mutex_enter(&spa->spa_async_lock); spa->spa_async_suspended++; while (spa->spa_async_thread != NULL && spa->spa_async_thread_vd != NULL) cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); mutex_exit(&spa->spa_async_lock); } void spa_async_resume(spa_t *spa) { mutex_enter(&spa->spa_async_lock); ASSERT(spa->spa_async_suspended != 0); spa->spa_async_suspended--; mutex_exit(&spa->spa_async_lock); } static boolean_t spa_async_tasks_pending(spa_t *spa) { uint_t non_config_tasks; uint_t config_task; boolean_t config_task_suspended; non_config_tasks = spa->spa_async_tasks & ~(SPA_ASYNC_CONFIG_UPDATE | SPA_ASYNC_REMOVE); config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE; if (spa->spa_ccw_fail_time == 0) { config_task_suspended = B_FALSE; } else { config_task_suspended = (gethrtime() - spa->spa_ccw_fail_time) < (zfs_ccw_retry_interval * NANOSEC); } return (non_config_tasks || (config_task && !config_task_suspended)); } static void spa_async_dispatch(spa_t *spa) { mutex_enter(&spa->spa_async_lock); if (spa_async_tasks_pending(spa) && !spa->spa_async_suspended && spa->spa_async_thread == NULL && rootdir != NULL) spa->spa_async_thread = thread_create(NULL, 0, spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); mutex_exit(&spa->spa_async_lock); } static void spa_async_dispatch_vd(spa_t *spa) { mutex_enter(&spa->spa_async_lock); if ((spa->spa_async_tasks & SPA_ASYNC_REMOVE) != 0 && !spa->spa_async_suspended && spa->spa_async_thread_vd == NULL && rootdir != NULL) spa->spa_async_thread_vd = thread_create(NULL, 0, spa_async_thread_vd, spa, 0, &p0, TS_RUN, maxclsyspri); mutex_exit(&spa->spa_async_lock); } void spa_async_request(spa_t *spa, int task) { zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task); mutex_enter(&spa->spa_async_lock); spa->spa_async_tasks |= task; mutex_exit(&spa->spa_async_lock); spa_async_dispatch_vd(spa); } /* * ========================================================================== * SPA syncing routines * ========================================================================== */ static int bpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { bpobj_t *bpo = arg; bpobj_enqueue(bpo, bp, tx); return (0); } static int spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { zio_t *zio = arg; zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp, BP_GET_PSIZE(bp), zio->io_flags)); return (0); } /* * Note: this simple function is not inlined to make it easier to dtrace the * amount of time spent syncing frees. */ static void spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx) { zio_t *zio = zio_root(spa, NULL, NULL, 0); bplist_iterate(bpl, spa_free_sync_cb, zio, tx); VERIFY(zio_wait(zio) == 0); } /* * Note: this simple function is not inlined to make it easier to dtrace the * amount of time spent syncing deferred frees. 
*/ static void spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx) { zio_t *zio = zio_root(spa, NULL, NULL, 0); VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj, spa_free_sync_cb, zio, tx), ==, 0); VERIFY0(zio_wait(zio)); } static void spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) { char *packed = NULL; size_t bufsize; size_t nvsize = 0; dmu_buf_t *db; VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); /* * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration * information. This avoids the dmu_buf_will_dirty() path and * saves us a pre-read to get data we don't actually care about. */ bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE); packed = kmem_alloc(bufsize, KM_SLEEP); VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, KM_SLEEP) == 0); bzero(packed + nvsize, bufsize - nvsize); dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); kmem_free(packed, bufsize); VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); dmu_buf_will_dirty(db, tx); *(uint64_t *)db->db_data = nvsize; dmu_buf_rele(db, FTAG); } static void spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, const char *config, const char *entry) { nvlist_t *nvroot; nvlist_t **list; int i; if (!sav->sav_sync) return; /* * Update the MOS nvlist describing the list of available devices. * spa_validate_aux() will have already made sure this nvlist is * valid and the vdevs are labeled appropriately. */ if (sav->sav_object == 0) { sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); VERIFY(zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, &sav->sav_object, tx) == 0); } VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); if (sav->sav_count == 0) { VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0); } else { list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); for (i = 0; i < sav->sav_count; i++) list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], B_FALSE, VDEV_CONFIG_L2CACHE); VERIFY(nvlist_add_nvlist_array(nvroot, config, list, sav->sav_count) == 0); for (i = 0; i < sav->sav_count; i++) nvlist_free(list[i]); kmem_free(list, sav->sav_count * sizeof (void *)); } spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); nvlist_free(nvroot); sav->sav_sync = B_FALSE; } /* * Rebuild spa's all-vdev ZAP from the vdev ZAPs indicated in each vdev_t. * The all-vdev ZAP must be empty. */ static void spa_avz_build(vdev_t *vd, uint64_t avz, dmu_tx_t *tx) { spa_t *spa = vd->vdev_spa; if (vd->vdev_top_zap != 0) { VERIFY0(zap_add_int(spa->spa_meta_objset, avz, vd->vdev_top_zap, tx)); } if (vd->vdev_leaf_zap != 0) { VERIFY0(zap_add_int(spa->spa_meta_objset, avz, vd->vdev_leaf_zap, tx)); } for (uint64_t i = 0; i < vd->vdev_children; i++) { spa_avz_build(vd->vdev_child[i], avz, tx); } } static void spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) { nvlist_t *config; /* * If the pool is being imported from a pre-per-vdev-ZAP version of ZFS, * its config may not be dirty but we still need to build per-vdev ZAPs. * Similarly, if the pool is being assembled (e.g. after a split), we * need to rebuild the AVZ although the config may not be dirty. 
*/ if (list_is_empty(&spa->spa_config_dirty_list) && spa->spa_avz_action == AVZ_ACTION_NONE) return; spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); ASSERT(spa->spa_avz_action == AVZ_ACTION_NONE || spa->spa_avz_action == AVZ_ACTION_INITIALIZE || spa->spa_all_vdev_zaps != 0); if (spa->spa_avz_action == AVZ_ACTION_REBUILD) { /* Make and build the new AVZ */ uint64_t new_avz = zap_create(spa->spa_meta_objset, DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx); spa_avz_build(spa->spa_root_vdev, new_avz, tx); /* Diff old AVZ with new one */ zap_cursor_t zc; zap_attribute_t za; for (zap_cursor_init(&zc, spa->spa_meta_objset, spa->spa_all_vdev_zaps); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { uint64_t vdzap = za.za_first_integer; if (zap_lookup_int(spa->spa_meta_objset, new_avz, vdzap) == ENOENT) { /* * ZAP is listed in old AVZ but not in new one; * destroy it */ VERIFY0(zap_destroy(spa->spa_meta_objset, vdzap, tx)); } } zap_cursor_fini(&zc); /* Destroy the old AVZ */ VERIFY0(zap_destroy(spa->spa_meta_objset, spa->spa_all_vdev_zaps, tx)); /* Replace the old AVZ in the dir obj with the new one */ VERIFY0(zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, sizeof (new_avz), 1, &new_avz, tx)); spa->spa_all_vdev_zaps = new_avz; } else if (spa->spa_avz_action == AVZ_ACTION_DESTROY) { zap_cursor_t zc; zap_attribute_t za; /* Walk through the AVZ and destroy all listed ZAPs */ for (zap_cursor_init(&zc, spa->spa_meta_objset, spa->spa_all_vdev_zaps); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { uint64_t zap = za.za_first_integer; VERIFY0(zap_destroy(spa->spa_meta_objset, zap, tx)); } zap_cursor_fini(&zc); /* Destroy and unlink the AVZ itself */ VERIFY0(zap_destroy(spa->spa_meta_objset, spa->spa_all_vdev_zaps, tx)); VERIFY0(zap_remove(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, tx)); spa->spa_all_vdev_zaps = 0; } if (spa->spa_all_vdev_zaps == 0) { spa->spa_all_vdev_zaps = zap_create_link(spa->spa_meta_objset, DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, tx); } spa->spa_avz_action = AVZ_ACTION_NONE; /* Create ZAPs for vdevs that don't have them. */ vdev_construct_zaps(spa->spa_root_vdev, tx); config = spa_config_generate(spa, spa->spa_root_vdev, dmu_tx_get_txg(tx), B_FALSE); /* * If we're upgrading the spa version then make sure that * the config object gets updated with the correct version. */ if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version) fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, spa->spa_uberblock.ub_version); spa_config_exit(spa, SCL_STATE, FTAG); nvlist_free(spa->spa_config_syncing); spa->spa_config_syncing = config; spa_sync_nvlist(spa, spa->spa_config_object, config, tx); } static void spa_sync_version(void *arg, dmu_tx_t *tx) { uint64_t *versionp = arg; uint64_t version = *versionp; spa_t *spa = dmu_tx_pool(tx)->dp_spa; /* * Setting the version is special cased when first creating the pool. */ ASSERT(tx->tx_txg != TXG_INITIAL); ASSERT(SPA_VERSION_IS_SUPPORTED(version)); ASSERT(version >= spa_version(spa)); spa->spa_uberblock.ub_version = version; vdev_config_dirty(spa->spa_root_vdev); spa_history_log_internal(spa, "set", tx, "version=%lld", version); } /* * Set zpool properties. 
*/ static void spa_sync_props(void *arg, dmu_tx_t *tx) { nvlist_t *nvp = arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; objset_t *mos = spa->spa_meta_objset; nvpair_t *elem = NULL; mutex_enter(&spa->spa_props_lock); while ((elem = nvlist_next_nvpair(nvp, elem))) { uint64_t intval; char *strval, *fname; zpool_prop_t prop; const char *propname; zprop_type_t proptype; spa_feature_t fid; switch (prop = zpool_name_to_prop(nvpair_name(elem))) { case ZPOOL_PROP_INVAL: /* * We checked this earlier in spa_prop_validate(). */ ASSERT(zpool_prop_feature(nvpair_name(elem))); fname = strchr(nvpair_name(elem), '@') + 1; VERIFY0(zfeature_lookup_name(fname, &fid)); spa_feature_enable(spa, fid, tx); spa_history_log_internal(spa, "set", tx, "%s=enabled", nvpair_name(elem)); break; case ZPOOL_PROP_VERSION: intval = fnvpair_value_uint64(elem); /* * The version is synced separately before other * properties and should be correct by now. */ ASSERT3U(spa_version(spa), >=, intval); break; case ZPOOL_PROP_ALTROOT: /* * 'altroot' is a non-persistent property. It should * have been set temporarily at creation or import time. */ ASSERT(spa->spa_root != NULL); break; case ZPOOL_PROP_READONLY: case ZPOOL_PROP_CACHEFILE: /* * 'readonly' and 'cachefile' are also non-persistent * properties. */ break; case ZPOOL_PROP_COMMENT: strval = fnvpair_value_string(elem); if (spa->spa_comment != NULL) spa_strfree(spa->spa_comment); spa->spa_comment = spa_strdup(strval); /* * We need to dirty the configuration on all the vdevs * so that their labels get updated. It's unnecessary * to do this for pool creation since the vdev's * configuration has already been dirtied. */ if (tx->tx_txg != TXG_INITIAL) vdev_config_dirty(spa->spa_root_vdev); spa_history_log_internal(spa, "set", tx, "%s=%s", nvpair_name(elem), strval); break; default: /* * Set pool property values in the poolprops mos object. */ if (spa->spa_pool_props_object == 0) { spa->spa_pool_props_object = zap_create_link(mos, DMU_OT_POOL_PROPS, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS, tx); } /* normalize the property name */ propname = zpool_prop_to_name(prop); proptype = zpool_prop_get_type(prop); if (nvpair_type(elem) == DATA_TYPE_STRING) { ASSERT(proptype == PROP_TYPE_STRING); strval = fnvpair_value_string(elem); VERIFY0(zap_update(mos, spa->spa_pool_props_object, propname, 1, strlen(strval) + 1, strval, tx)); spa_history_log_internal(spa, "set", tx, "%s=%s", nvpair_name(elem), strval); } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { intval = fnvpair_value_uint64(elem); if (proptype == PROP_TYPE_INDEX) { const char *unused; VERIFY0(zpool_prop_index_to_string( prop, intval, &unused)); } VERIFY0(zap_update(mos, spa->spa_pool_props_object, propname, 8, 1, &intval, tx)); spa_history_log_internal(spa, "set", tx, "%s=%lld", nvpair_name(elem), intval); } else { ASSERT(0); /* not allowed */ } switch (prop) { case ZPOOL_PROP_DELEGATION: spa->spa_delegation = intval; break; case ZPOOL_PROP_BOOTFS: spa->spa_bootfs = intval; break; case ZPOOL_PROP_FAILUREMODE: spa->spa_failmode = intval; break; case ZPOOL_PROP_AUTOEXPAND: spa->spa_autoexpand = intval; if (tx->tx_txg != TXG_INITIAL) spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); break; case ZPOOL_PROP_DEDUPDITTO: spa->spa_dedup_ditto = intval; break; default: break; } } } mutex_exit(&spa->spa_props_lock); } /* * Perform one-time upgrade on-disk changes. spa_version() does not * reflect the new version this txg, so there must be no changes this * txg to anything that the upgrade code depends on after it executes.
* Therefore this must be called after dsl_pool_sync() does the sync * tasks. */ static void spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx) { dsl_pool_t *dp = spa->spa_dsl_pool; ASSERT(spa->spa_sync_pass == 1); rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN && spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) { dsl_pool_create_origin(dp, tx); /* Keeping the origin open increases spa_minref */ spa->spa_minref += 3; } if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES && spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) { dsl_pool_upgrade_clones(dp, tx); } if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES && spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) { dsl_pool_upgrade_dir_clones(dp, tx); /* Keeping the freedir open increases spa_minref */ spa->spa_minref += 3; } if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES && spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { spa_feature_create_zap_objects(spa, tx); } /* * The LZ4_COMPRESS feature's behaviour was changed to activate_on_enable * when the possibility to use lz4 compression for metadata was added. * Old pools that have this feature enabled must be upgraded to have * this feature active. */ if (spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { boolean_t lz4_en = spa_feature_is_enabled(spa, SPA_FEATURE_LZ4_COMPRESS); boolean_t lz4_ac = spa_feature_is_active(spa, SPA_FEATURE_LZ4_COMPRESS); if (lz4_en && !lz4_ac) spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx); } /* * If we haven't written the salt, do so now. Note that the * feature may not be activated yet, but that's fine since * the presence of this ZAP entry is backwards compatible. */ if (zap_contains(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT) == ENOENT) { VERIFY0(zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT, 1, sizeof (spa->spa_cksum_salt.zcs_bytes), spa->spa_cksum_salt.zcs_bytes, tx)); } rrw_exit(&dp->dp_config_rwlock, FTAG); } /* * Sync the specified transaction group. New blocks may be dirtied as * part of the process, so we iterate until it converges. */ void spa_sync(spa_t *spa, uint64_t txg) { dsl_pool_t *dp = spa->spa_dsl_pool; objset_t *mos = spa->spa_meta_objset; bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK]; vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd; dmu_tx_t *tx; int error; uint32_t max_queue_depth = zfs_vdev_async_write_max_active * zfs_vdev_queue_depth_pct / 100; VERIFY(spa_writeable(spa)); /* * Lock out configuration changes. */ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); spa->spa_syncing_txg = txg; spa->spa_sync_pass = 0; mutex_enter(&spa->spa_alloc_lock); VERIFY0(avl_numnodes(&spa->spa_alloc_tree)); mutex_exit(&spa->spa_alloc_lock); /* * If there are any pending vdev state changes, convert them * into config changes that go out with this transaction group. */ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); while (list_head(&spa->spa_state_dirty_list) != NULL) { /* * We need the write lock here because, for aux vdevs, * calling vdev_config_dirty() modifies sav_config. * This is ugly and will become unnecessary when we * eliminate the aux vdev wart by integrating all vdevs * into the root vdev tree.
*/ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER); while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { vdev_state_clean(vd); vdev_config_dirty(vd); } spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); } spa_config_exit(spa, SCL_STATE, FTAG); tx = dmu_tx_create_assigned(dp, txg); spa->spa_sync_starttime = gethrtime(); #ifdef illumos VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, spa->spa_sync_starttime + spa->spa_deadman_synctime)); #else /* !illumos */ #ifdef _KERNEL callout_schedule(&spa->spa_deadman_cycid, hz * spa->spa_deadman_synctime / NANOSEC); #endif #endif /* illumos */ /* * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, * set spa_deflate if we have no raid-z vdevs. */ if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE && spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) { int i; for (i = 0; i < rvd->vdev_children; i++) { vd = rvd->vdev_child[i]; if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) break; } if (i == rvd->vdev_children) { spa->spa_deflate = TRUE; VERIFY(0 == zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, sizeof (uint64_t), 1, &spa->spa_deflate, tx)); } } /* * Set the top-level vdev's max queue depth. Evaluate each * top-level's async write queue depth in case it changed. * The max queue depth will not change in the middle of syncing * out this txg. */ uint64_t queue_depth_total = 0; for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; metaslab_group_t *mg = tvd->vdev_mg; if (mg == NULL || mg->mg_class != spa_normal_class(spa) || !metaslab_group_initialized(mg)) continue; /* * It is safe to do a lock-free check here because only async * allocations look at mg_max_alloc_queue_depth, and async * allocations all happen from spa_sync(). */ ASSERT0(refcount_count(&mg->mg_alloc_queue_depth)); mg->mg_max_alloc_queue_depth = max_queue_depth; queue_depth_total += mg->mg_max_alloc_queue_depth; } metaslab_class_t *mc = spa_normal_class(spa); ASSERT0(refcount_count(&mc->mc_alloc_slots)); mc->mc_alloc_max_slots = queue_depth_total; mc->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; ASSERT3U(mc->mc_alloc_max_slots, <=, max_queue_depth * rvd->vdev_children); /* * Iterate to convergence. */ do { int pass = ++spa->spa_sync_pass; spa_sync_config_object(spa, tx); spa_sync_aux_dev(spa, &spa->spa_spares, tx, ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES); spa_sync_aux_dev(spa, &spa->spa_l2cache, tx, ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE); spa_errlog_sync(spa, txg); dsl_pool_sync(dp, txg); if (pass < zfs_sync_pass_deferred_free) { spa_sync_frees(spa, free_bpl, tx); } else { /* * We can not defer frees in pass 1, because * we sync the deferred frees later in pass 1. */ ASSERT3U(pass, >, 1); bplist_iterate(free_bpl, bpobj_enqueue_cb, &spa->spa_deferred_bpobj, tx); } ddt_sync(spa, txg); dsl_scan_sync(dp, tx); while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) vdev_sync(vd, txg); if (pass == 1) { spa_sync_upgrades(spa, tx); ASSERT3U(txg, >=, spa->spa_uberblock.ub_rootbp.blk_birth); /* * Note: We need to check if the MOS is dirty * because we could have marked the MOS dirty * without updating the uberblock (e.g. if we * have sync tasks but no dirty user data). 
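The allocation-throttle setup above is simple arithmetic: derive a per-top-level-vdev queue budget from the two tunables, hand the same budget to every eligible metaslab group, and make the class-wide slot count the sum of those budgets. A tiny sketch of that computation; the numbers below are invented, not the actual ZFS defaults for zfs_vdev_async_write_max_active or zfs_vdev_queue_depth_pct:

#include <stdio.h>

int
main(void)
{
    /* illustrative values only; the real knobs are ZFS tunables */
    unsigned async_write_max_active = 10;
    unsigned queue_depth_pct = 300;
    unsigned children = 4;          /* eligible top-level vdevs */

    unsigned max_queue_depth =
        async_write_max_active * queue_depth_pct / 100;
    unsigned long long queue_depth_total = 0;

    /* every eligible top-level vdev gets the same budget */
    for (unsigned c = 0; c < children; c++)
        queue_depth_total += max_queue_depth;

    /* the class-wide maximum is the sum of the per-vdev budgets */
    printf("per-vdev %u, class total %llu\n",
        max_queue_depth, queue_depth_total);
    return (0);
}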
We * need to check the uberblock's rootbp because * it is updated if we have synced out dirty * data (though in this case the MOS will most * likely also be dirty due to second order * effects, we don't want to rely on that here). */ if (spa->spa_uberblock.ub_rootbp.blk_birth < txg && !dmu_objset_is_dirty(mos, txg)) { /* * Nothing changed on the first pass, * therefore this TXG is a no-op. Avoid * syncing deferred frees, so that we * can keep this TXG as a no-op. */ ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg)); break; } spa_sync_deferred_frees(spa, tx); } } while (dmu_objset_is_dirty(mos, txg)); if (!list_is_empty(&spa->spa_config_dirty_list)) { /* * Make sure that the number of ZAPs for all the vdevs matches * the number of ZAPs in the per-vdev ZAP list. This only gets * called if the config is dirty; otherwise there may be * outstanding AVZ operations that weren't completed in * spa_sync_config_object. */ uint64_t all_vdev_zap_entry_count; ASSERT0(zap_count(spa->spa_meta_objset, spa->spa_all_vdev_zaps, &all_vdev_zap_entry_count)); ASSERT3U(vdev_count_verify_zaps(spa->spa_root_vdev), ==, all_vdev_zap_entry_count); } /* * Rewrite the vdev configuration (which includes the uberblock) * to commit the transaction group. * * If there are no dirty vdevs, we sync the uberblock to a few * random top-level vdevs that are known to be visible in the * config cache (see spa_vdev_add() for a complete description). * If there *are* dirty vdevs, sync the uberblock to all vdevs. */ for (;;) { /* * We hold SCL_STATE to prevent vdev open/close/etc. * while we're attempting to write the vdev labels. */ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); if (list_is_empty(&spa->spa_config_dirty_list)) { vdev_t *svd[SPA_DVAS_PER_BP]; int svdcount = 0; int children = rvd->vdev_children; int c0 = spa_get_random(children); for (int c = 0; c < children; c++) { vd = rvd->vdev_child[(c0 + c) % children]; if (vd->vdev_ms_array == 0 || vd->vdev_islog) continue; svd[svdcount++] = vd; if (svdcount == SPA_DVAS_PER_BP) break; } error = vdev_config_sync(svd, svdcount, txg); } else { error = vdev_config_sync(rvd->vdev_child, rvd->vdev_children, txg); } if (error == 0) spa->spa_last_synced_guid = rvd->vdev_guid; spa_config_exit(spa, SCL_STATE, FTAG); if (error == 0) break; zio_suspend(spa, NULL); zio_resume_wait(spa); } dmu_tx_commit(tx); #ifdef illumos VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY)); #else /* !illumos */ #ifdef _KERNEL callout_drain(&spa->spa_deadman_cycid); #endif #endif /* illumos */ /* * Clear the dirty config list. */ while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL) vdev_config_clean(vd); /* * Now that the new config has synced transactionally, * let it become visible to the config cache. */ if (spa->spa_config_syncing != NULL) { spa_config_set(spa, spa->spa_config_syncing); spa->spa_config_txg = txg; spa->spa_config_syncing = NULL; } dsl_pool_sync_done(dp, txg); mutex_enter(&spa->spa_alloc_lock); VERIFY0(avl_numnodes(&spa->spa_alloc_tree)); mutex_exit(&spa->spa_alloc_lock); /* * Update usable space statistics. */ while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) vdev_sync_done(vd, txg); spa_update_dspace(spa); /* * It had better be the case that we didn't dirty anything * since vdev_config_sync(). 
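The sync loop above is an iterate-to-convergence pattern: each pass may dirty more metadata, so the loop repeats until nothing is dirty, with an early break when pass 1 turns out to be a no-op. A schematic sketch of that control flow in plain C, with a fake "pending work" counter standing in for the dirty MOS:

#include <stdio.h>
#include <stdbool.h>

static int pending = 3;     /* stand-in for dirty metadata left to sync */

static bool
sync_one_pass(int pass)
{
    /* pretend each pass settles one unit of work and may leave more */
    if (pending > 0)
        pending--;
    printf("pass %d, %d unit(s) still dirty\n", pass, pending);
    return (pending > 0);   /* true == still dirty, loop again */
}

int
main(void)
{
    int pass = 0;
    bool dirty;

    do {
        dirty = sync_one_pass(++pass);
        if (pass == 1 && !dirty) {
            /* nothing changed on pass 1: treat the whole txg as a no-op */
            printf("no-op txg\n");
            break;
        }
    } while (dirty);
    return (0);
}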
*/ ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); spa->spa_sync_pass = 0; /* * Update the last synced uberblock here. We want to do this at * the end of spa_sync() so that consumers of spa_last_synced_txg() * will be guaranteed that all the processing associated with * that txg has been completed. */ spa->spa_ubsync = spa->spa_uberblock; spa_config_exit(spa, SCL_CONFIG, FTAG); spa_handle_ignored_writes(spa); /* * If any async tasks have been requested, kick them off. */ spa_async_dispatch(spa); spa_async_dispatch_vd(spa); } /* * Sync all pools. We don't want to hold the namespace lock across these * operations, so we take a reference on the spa_t and drop the lock during the * sync. */ void spa_sync_allpools(void) { spa_t *spa = NULL; mutex_enter(&spa_namespace_lock); while ((spa = spa_next(spa)) != NULL) { if (spa_state(spa) != POOL_STATE_ACTIVE || !spa_writeable(spa) || spa_suspended(spa)) continue; spa_open_ref(spa, FTAG); mutex_exit(&spa_namespace_lock); txg_wait_synced(spa_get_dsl(spa), 0); mutex_enter(&spa_namespace_lock); spa_close(spa, FTAG); } mutex_exit(&spa_namespace_lock); } /* * ========================================================================== * Miscellaneous routines * ========================================================================== */ /* * Remove all pools in the system. */ void spa_evict_all(void) { spa_t *spa; /* * Remove all cached state. All pools should be closed now, * so every spa in the AVL tree should be unreferenced. */ mutex_enter(&spa_namespace_lock); while ((spa = spa_next(NULL)) != NULL) { /* * Stop async tasks. The async thread may need to detach * a device that's been replaced, which requires grabbing * spa_namespace_lock, so we must drop it here. */ spa_open_ref(spa, FTAG); mutex_exit(&spa_namespace_lock); spa_async_suspend(spa); mutex_enter(&spa_namespace_lock); spa_close(spa, FTAG); if (spa->spa_state != POOL_STATE_UNINITIALIZED) { spa_unload(spa); spa_deactivate(spa); } spa_remove(spa); } mutex_exit(&spa_namespace_lock); } vdev_t * spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux) { vdev_t *vd; int i; if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL) return (vd); if (aux) { for (i = 0; i < spa->spa_l2cache.sav_count; i++) { vd = spa->spa_l2cache.sav_vdevs[i]; if (vd->vdev_guid == guid) return (vd); } for (i = 0; i < spa->spa_spares.sav_count; i++) { vd = spa->spa_spares.sav_vdevs[i]; if (vd->vdev_guid == guid) return (vd); } } return (NULL); } void spa_upgrade(spa_t *spa, uint64_t version) { ASSERT(spa_writeable(spa)); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); /* * This should only be called for a non-faulted pool, and since a * future version would result in an unopenable pool, this shouldn't be * possible. 
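spa_sync_allpools() and spa_evict_all() above both use the same locking idiom: take a reference on the object under the namespace lock, drop the lock across the slow call so other namespace operations (or the object's own async work) can make progress, then reacquire the lock and release the reference. A minimal pthread sketch of that shape; the names and the bare integer refcount are illustrative only:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t ns_lock = PTHREAD_MUTEX_INITIALIZER;
static int refcount;        /* stands in for the per-pool reference */

static void
slow_operation(void)
{
    /* e.g. waiting for a txg to sync; must not hold ns_lock across this */
    printf("working without the namespace lock held\n");
}

int
main(void)
{
    pthread_mutex_lock(&ns_lock);
    refcount++;                     /* hold so the object cannot vanish */
    pthread_mutex_unlock(&ns_lock); /* drop the lock across the slow call */

    slow_operation();

    pthread_mutex_lock(&ns_lock);
    refcount--;                     /* release the hold under the lock again */
    pthread_mutex_unlock(&ns_lock);
    printf("refcount back to %d\n", refcount);
    return (0);
}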
*/ ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version)); ASSERT3U(version, >=, spa->spa_uberblock.ub_version); spa->spa_uberblock.ub_version = version; vdev_config_dirty(spa->spa_root_vdev); spa_config_exit(spa, SCL_ALL, FTAG); txg_wait_synced(spa_get_dsl(spa), 0); } boolean_t spa_has_spare(spa_t *spa, uint64_t guid) { int i; uint64_t spareguid; spa_aux_vdev_t *sav = &spa->spa_spares; for (i = 0; i < sav->sav_count; i++) if (sav->sav_vdevs[i]->vdev_guid == guid) return (B_TRUE); for (i = 0; i < sav->sav_npending; i++) { if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID, &spareguid) == 0 && spareguid == guid) return (B_TRUE); } return (B_FALSE); } /* * Check if a pool has an active shared spare device. * Note: reference count of an active spare is 2, as a spare and as a replace */ static boolean_t spa_has_active_shared_spare(spa_t *spa) { int i, refcnt; uint64_t pool; spa_aux_vdev_t *sav = &spa->spa_spares; for (i = 0; i < sav->sav_count; i++) { if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool, &refcnt) && pool != 0ULL && pool == spa_guid(spa) && refcnt > 2) return (B_TRUE); } return (B_FALSE); } static sysevent_t * -spa_event_create(spa_t *spa, vdev_t *vd, const char *name) +spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name) { sysevent_t *ev = NULL; #ifdef _KERNEL sysevent_attr_list_t *attr = NULL; sysevent_value_t value; ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs", SE_SLEEP); ASSERT(ev != NULL); value.value_type = SE_DATA_TYPE_STRING; value.value.sv_string = spa_name(spa); if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0) goto done; value.value_type = SE_DATA_TYPE_UINT64; value.value.sv_uint64 = spa_guid(spa); if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0) goto done; if (vd) { value.value_type = SE_DATA_TYPE_UINT64; value.value.sv_uint64 = vd->vdev_guid; if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value, SE_SLEEP) != 0) goto done; if (vd->vdev_path) { value.value_type = SE_DATA_TYPE_STRING; value.value.sv_string = vd->vdev_path; if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH, &value, SE_SLEEP) != 0) goto done; } } + if (hist_nvl != NULL) { + fnvlist_merge((nvlist_t *)attr, hist_nvl); + } + if (sysevent_attach_attributes(ev, attr) != 0) goto done; attr = NULL; done: if (attr) sysevent_free_attr(attr); #endif return (ev); } static void spa_event_post(sysevent_t *ev) { #ifdef _KERNEL sysevent_id_t eid; (void) log_sysevent(ev, SE_SLEEP, &eid); sysevent_free(ev); #endif } /* * Post a sysevent corresponding to the given event. The 'name' must be one of * the event definitions in sys/sysevent/eventdefs.h. The payload will be - * filled in from the spa and (optionally) the vdev. This doesn't do anything - * in the userland libzpool, as we don't want consumers to misinterpret ztest - * or zdb as real changes. + * filled in from the spa and (optionally) the vdev and history nvl. This + * doesn't do anything in the userland libzpool, as we don't want consumers to + * misinterpret ztest or zdb as real changes. 
*/ void -spa_event_notify(spa_t *spa, vdev_t *vd, const char *name) +spa_event_notify(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name) { - spa_event_post(spa_event_create(spa, vd, name)); + spa_event_post(spa_event_create(spa, vd, hist_nvl, name)); } Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c (revision 329689) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c (revision 329690) @@ -1,562 +1,563 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2011, 2015 by Delphix. All rights reserved. + * Copyright 2017 Joyent, Inc. */ #include #include #include #include #include #include #include #include #include #include #include #include #ifdef _KERNEL #include #include #endif /* * Pool configuration repository. * * Pool configuration is stored as a packed nvlist on the filesystem. By * default, all pools are stored in /etc/zfs/zpool.cache and loaded on boot * (when the ZFS module is loaded). Pools can also have the 'cachefile' * property set that allows them to be stored in an alternate location until * the control of external software. * * For each cache file, we have a single nvlist which holds all the * configuration information. When the module loads, we read this information * from /etc/zfs/zpool.cache and populate the SPA namespace. This namespace is * maintained independently in spa.c. Whenever the namespace is modified, or * the configuration of a pool is changed, we call spa_config_sync(), which * walks through all the active pools and writes the configuration to disk. */ static uint64_t spa_config_generation = 1; /* * This can be overridden in userland to preserve an alternate namespace for * userland pools when doing testing. */ const char *spa_config_path = ZPOOL_CACHE; /* * Called when the module is first loaded, this routine loads the configuration * file into the SPA namespace. It does not actually open or load the pools; it * only populates the namespace. */ void spa_config_load(void) { void *buf = NULL; nvlist_t *nvlist, *child; nvpair_t *nvpair; char *pathname; struct _buf *file; uint64_t fsize; /* * Open the configuration file. */ pathname = kmem_alloc(MAXPATHLEN, KM_SLEEP); (void) snprintf(pathname, MAXPATHLEN, "%s", spa_config_path); file = kobj_open_file(pathname); kmem_free(pathname, MAXPATHLEN); if (file == (struct _buf *)-1) return; if (kobj_get_filesize(file, &fsize) != 0) goto out; buf = kmem_alloc(fsize, KM_SLEEP); /* * Read the nvlist from the file. 
*/ if (kobj_read_file(file, buf, fsize, 0) < 0) goto out; /* * Unpack the nvlist. */ if (nvlist_unpack(buf, fsize, &nvlist, KM_SLEEP) != 0) goto out; /* * Iterate over all elements in the nvlist, creating a new spa_t for * each one with the specified configuration. */ mutex_enter(&spa_namespace_lock); nvpair = NULL; while ((nvpair = nvlist_next_nvpair(nvlist, nvpair)) != NULL) { if (nvpair_type(nvpair) != DATA_TYPE_NVLIST) continue; child = fnvpair_value_nvlist(nvpair); if (spa_lookup(nvpair_name(nvpair)) != NULL) continue; (void) spa_add(nvpair_name(nvpair), child, NULL); } mutex_exit(&spa_namespace_lock); nvlist_free(nvlist); out: if (buf != NULL) kmem_free(buf, fsize); kobj_close_file(file); } static void spa_config_clean(nvlist_t *nvl) { nvlist_t **child; nvlist_t *nvroot = NULL; uint_t c, children; if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child, &children) == 0) { for (c = 0; c < children; c++) spa_config_clean(child[c]); } if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0) spa_config_clean(nvroot); nvlist_remove(nvl, ZPOOL_CONFIG_VDEV_STATS, DATA_TYPE_UINT64_ARRAY); nvlist_remove(nvl, ZPOOL_CONFIG_SCAN_STATS, DATA_TYPE_UINT64_ARRAY); } static int spa_config_write(spa_config_dirent_t *dp, nvlist_t *nvl) { size_t buflen; char *buf; vnode_t *vp; int oflags = FWRITE | FTRUNC | FCREAT | FOFFMAX; char *temp; int err; /* * If the nvlist is empty (NULL), then remove the old cachefile. */ if (nvl == NULL) { err = vn_remove(dp->scd_path, UIO_SYSSPACE, RMFILE); return (err); } /* * Pack the configuration into a buffer. */ buf = fnvlist_pack(nvl, &buflen); temp = kmem_zalloc(MAXPATHLEN, KM_SLEEP); /* * Write the configuration to disk. We need to do the traditional * 'write to temporary file, sync, move over original' to make sure we * always have a consistent view of the data. */ (void) snprintf(temp, MAXPATHLEN, "%s.tmp", dp->scd_path); err = vn_open(temp, UIO_SYSSPACE, oflags, 0644, &vp, CRCREAT, 0); if (err == 0) { err = vn_rdwr(UIO_WRITE, vp, buf, buflen, 0, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, NULL); if (err == 0) err = VOP_FSYNC(vp, FSYNC, kcred, NULL); if (err == 0) err = vn_rename(temp, dp->scd_path, UIO_SYSSPACE); (void) VOP_CLOSE(vp, oflags, 1, 0, kcred, NULL); } (void) vn_remove(temp, UIO_SYSSPACE, RMFILE); fnvlist_pack_free(buf, buflen); kmem_free(temp, MAXPATHLEN); return (err); } /* * Synchronize pool configuration to disk. This must be called with the * namespace lock held. Synchronizing the pool cache is typically done after * the configuration has been synced to the MOS. This exposes a window where * the MOS config will have been updated but the cache file has not. If * the system were to crash at that instant then the cached config may not * contain the correct information to open the pool and an explicity import * would be required. */ void spa_config_sync(spa_t *target, boolean_t removing, boolean_t postsysevent) { spa_config_dirent_t *dp, *tdp; nvlist_t *nvl; boolean_t ccw_failure; int error; ASSERT(MUTEX_HELD(&spa_namespace_lock)); if (rootdir == NULL || !(spa_mode_global & FWRITE)) return; /* * Iterate over all cachefiles for the pool, past or present. When the * cachefile is changed, the new one is pushed onto this list, allowing * us to update previous cachefiles that no longer contain this pool. 
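spa_config_write() above uses the classic crash-safe replacement recipe: write a sibling temporary file, fsync it, then rename() it over the original so readers only ever see a complete cachefile. A userland equivalent of that recipe with plain POSIX calls; the file name and payload are placeholders:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static int
write_file_atomically(const char *path, const void *buf, size_t len)
{
    char temp[1024];
    int fd, err = 0;

    /* same "<path>.tmp" convention as spa_config_write() above */
    (void) snprintf(temp, sizeof (temp), "%s.tmp", path);
    fd = open(temp, O_WRONLY | O_CREAT | O_TRUNC, 0644);
    if (fd < 0)
        return (-1);
    if (write(fd, buf, len) != (ssize_t)len || fsync(fd) != 0 ||
        rename(temp, path) != 0)
        err = -1;
    (void) close(fd);
    if (err != 0)
        (void) unlink(temp);    /* best-effort cleanup on failure */
    return (err);
}

int
main(void)
{
    const char msg[] = "packed nvlist would go here\n";

    return (write_file_atomically("demo.cache", msg, sizeof (msg) - 1));
}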
*/ ccw_failure = B_FALSE; for (dp = list_head(&target->spa_config_list); dp != NULL; dp = list_next(&target->spa_config_list, dp)) { spa_t *spa = NULL; if (dp->scd_path == NULL) continue; /* * Iterate over all pools, adding any matching pools to 'nvl'. */ nvl = NULL; while ((spa = spa_next(spa)) != NULL) { nvlist_t *nvroot = NULL; /* * Skip over our own pool if we're about to remove * ourselves from the spa namespace or any pool that * is readonly. Since we cannot guarantee that a * readonly pool would successfully import upon reboot, * we don't allow them to be written to the cache file. */ if ((spa == target && removing) || (spa_state(spa) == POOL_STATE_ACTIVE && !spa_writeable(spa))) continue; mutex_enter(&spa->spa_props_lock); tdp = list_head(&spa->spa_config_list); if (spa->spa_config == NULL || tdp->scd_path == NULL || strcmp(tdp->scd_path, dp->scd_path) != 0) { mutex_exit(&spa->spa_props_lock); continue; } if (nvl == NULL) nvl = fnvlist_alloc(); fnvlist_add_nvlist(nvl, spa->spa_name, spa->spa_config); mutex_exit(&spa->spa_props_lock); if (nvlist_lookup_nvlist(nvl, spa->spa_name, &nvroot) == 0) spa_config_clean(nvroot); } error = spa_config_write(dp, nvl); if (error != 0) ccw_failure = B_TRUE; nvlist_free(nvl); } if (ccw_failure) { /* * Keep trying so that configuration data is * written if/when any temporary filesystem * resource issues are resolved. */ if (target->spa_ccw_fail_time == 0) { zfs_ereport_post(FM_EREPORT_ZFS_CONFIG_CACHE_WRITE, target, NULL, NULL, 0, 0); } target->spa_ccw_fail_time = gethrtime(); spa_async_request(target, SPA_ASYNC_CONFIG_UPDATE); } else { /* * Do not rate limit future attempts to update * the config cache. */ target->spa_ccw_fail_time = 0; } /* * Remove any config entries older than the current one. */ dp = list_head(&target->spa_config_list); while ((tdp = list_next(&target->spa_config_list, dp)) != NULL) { list_remove(&target->spa_config_list, tdp); if (tdp->scd_path != NULL) spa_strfree(tdp->scd_path); kmem_free(tdp, sizeof (spa_config_dirent_t)); } spa_config_generation++; if (postsysevent) - spa_event_notify(target, NULL, ESC_ZFS_CONFIG_SYNC); + spa_event_notify(target, NULL, NULL, ESC_ZFS_CONFIG_SYNC); } /* * Sigh. Inside a local zone, we don't have access to /etc/zfs/zpool.cache, * and we don't want to allow the local zone to see all the pools anyway. * So we have to invent the ZFS_IOC_CONFIG ioctl to grab the configuration * information for all pool visible within the zone. */ nvlist_t * spa_all_configs(uint64_t *generation) { nvlist_t *pools; spa_t *spa = NULL; if (*generation == spa_config_generation) return (NULL); pools = fnvlist_alloc(); mutex_enter(&spa_namespace_lock); while ((spa = spa_next(spa)) != NULL) { if (INGLOBALZONE(curthread) || zone_dataset_visible(spa_name(spa), NULL)) { mutex_enter(&spa->spa_props_lock); fnvlist_add_nvlist(pools, spa_name(spa), spa->spa_config); mutex_exit(&spa->spa_props_lock); } } *generation = spa_config_generation; mutex_exit(&spa_namespace_lock); return (pools); } void spa_config_set(spa_t *spa, nvlist_t *config) { mutex_enter(&spa->spa_props_lock); nvlist_free(spa->spa_config); spa->spa_config = config; mutex_exit(&spa->spa_props_lock); } /* * Generate the pool's configuration based on the current in-core state. * * We infer whether to generate a complete config or just one top-level config * based on whether vd is the root vdev. 
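The ccw_failure handling above is a small retry/rate-limit idiom: remember when the first failure happened so it is reported only once, keep re-requesting the update, and clear the timestamp as soon as a write finally succeeds. A compact sketch of that state machine; the function and variable names are invented:

#include <stdio.h>
#include <time.h>

static time_t first_fail;   /* 0 == no outstanding failure */

static void
record_write_result(int err)
{
    if (err != 0) {
        if (first_fail == 0) {
            first_fail = time(NULL);    /* report only the first failure */
            printf("config write failed, will keep retrying\n");
        }
        /* ...re-queue the update here... */
    } else {
        first_fail = 0;                 /* success: stop rate-limiting */
    }
}

int
main(void)
{
    record_write_result(-1);    /* first failure: reported */
    record_write_result(-1);    /* repeat failure: silent */
    record_write_result(0);     /* success clears the state */
    return (0);
}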
*/ nvlist_t * spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) { nvlist_t *config, *nvroot; vdev_t *rvd = spa->spa_root_vdev; unsigned long hostid = 0; boolean_t locked = B_FALSE; uint64_t split_guid; if (vd == NULL) { vd = rvd; locked = B_TRUE; spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); } ASSERT(spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_READER) == (SCL_CONFIG | SCL_STATE)); /* * If txg is -1, report the current value of spa->spa_config_txg. */ if (txg == -1ULL) txg = spa->spa_config_txg; config = fnvlist_alloc(); fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, spa_version(spa)); fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, spa_name(spa)); fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, spa_state(spa)); fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, txg); fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, spa_guid(spa)); if (spa->spa_comment != NULL) { fnvlist_add_string(config, ZPOOL_CONFIG_COMMENT, spa->spa_comment); } #ifdef _KERNEL hostid = zone_get_hostid(NULL); #else /* _KERNEL */ /* * We're emulating the system's hostid in userland, so we can't use * zone_get_hostid(). */ (void) ddi_strtoul(hw_serial, NULL, 10, &hostid); #endif /* _KERNEL */ if (hostid != 0) { fnvlist_add_uint64(config, ZPOOL_CONFIG_HOSTID, hostid); } fnvlist_add_string(config, ZPOOL_CONFIG_HOSTNAME, utsname.nodename); int config_gen_flags = 0; if (vd != rvd) { fnvlist_add_uint64(config, ZPOOL_CONFIG_TOP_GUID, vd->vdev_top->vdev_guid); fnvlist_add_uint64(config, ZPOOL_CONFIG_GUID, vd->vdev_guid); if (vd->vdev_isspare) { fnvlist_add_uint64(config, ZPOOL_CONFIG_IS_SPARE, 1ULL); } if (vd->vdev_islog) { fnvlist_add_uint64(config, ZPOOL_CONFIG_IS_LOG, 1ULL); } vd = vd->vdev_top; /* label contains top config */ } else { /* * Only add the (potentially large) split information * in the mos config, and not in the vdev labels */ if (spa->spa_config_splitting != NULL) fnvlist_add_nvlist(config, ZPOOL_CONFIG_SPLIT, spa->spa_config_splitting); fnvlist_add_boolean(config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS); config_gen_flags |= VDEV_CONFIG_MOS; } /* * Add the top-level config. We even add this on pools which * don't support holes in the namespace. */ vdev_top_config_generate(spa, config); /* * If we're splitting, record the original pool's guid. */ if (spa->spa_config_splitting != NULL && nvlist_lookup_uint64(spa->spa_config_splitting, ZPOOL_CONFIG_SPLIT_GUID, &split_guid) == 0) { fnvlist_add_uint64(config, ZPOOL_CONFIG_SPLIT_GUID, split_guid); } nvroot = vdev_config_generate(spa, vd, getstats, config_gen_flags); fnvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot); nvlist_free(nvroot); /* * Store what's necessary for reading the MOS in the label. 
*/ fnvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURES_FOR_READ, spa->spa_label_features); if (getstats && spa_load_state(spa) == SPA_LOAD_NONE) { ddt_histogram_t *ddh; ddt_stat_t *dds; ddt_object_t *ddo; ddh = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP); ddt_get_dedup_histogram(spa, ddh); fnvlist_add_uint64_array(config, ZPOOL_CONFIG_DDT_HISTOGRAM, (uint64_t *)ddh, sizeof (*ddh) / sizeof (uint64_t)); kmem_free(ddh, sizeof (ddt_histogram_t)); ddo = kmem_zalloc(sizeof (ddt_object_t), KM_SLEEP); ddt_get_dedup_object_stats(spa, ddo); fnvlist_add_uint64_array(config, ZPOOL_CONFIG_DDT_OBJ_STATS, (uint64_t *)ddo, sizeof (*ddo) / sizeof (uint64_t)); kmem_free(ddo, sizeof (ddt_object_t)); dds = kmem_zalloc(sizeof (ddt_stat_t), KM_SLEEP); ddt_get_dedup_stats(spa, dds); fnvlist_add_uint64_array(config, ZPOOL_CONFIG_DDT_STATS, (uint64_t *)dds, sizeof (*dds) / sizeof (uint64_t)); kmem_free(dds, sizeof (ddt_stat_t)); } if (locked) spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); return (config); } /* * Update all disk labels, generate a fresh config based on the current * in-core state, and sync the global config cache (do not sync the config * cache if this is a booting rootpool). */ void spa_config_update(spa_t *spa, int what) { vdev_t *rvd = spa->spa_root_vdev; uint64_t txg; int c; ASSERT(MUTEX_HELD(&spa_namespace_lock)); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); txg = spa_last_synced_txg(spa) + 1; if (what == SPA_CONFIG_UPDATE_POOL) { vdev_config_dirty(rvd); } else { /* * If we have top-level vdevs that were added but have * not yet been prepared for allocation, do that now. * (It's safe now because the config cache is up to date, * so it will be able to translate the new DVAs.) * See comments in spa_vdev_add() for full details. */ for (c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; if (tvd->vdev_ms_array == 0) { vdev_ashift_optimize(tvd); vdev_metaslab_set_size(tvd); } vdev_expand(tvd, txg); } } spa_config_exit(spa, SCL_ALL, FTAG); /* * Wait for the mosconfig to be regenerated and synced. */ txg_wait_synced(spa->spa_dsl_pool, txg); /* * Update the global config cache to reflect the new mosconfig. */ spa_config_sync(spa, B_FALSE, what != SPA_CONFIG_UPDATE_POOL); if (what == SPA_CONFIG_UPDATE_POOL) spa_config_update(spa, SPA_CONFIG_UPDATE_VDEVS); } Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c (revision 329689) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c (revision 329690) @@ -1,546 +1,628 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. 
* Copyright (c) 2011, 2015 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] + * Copyright 2017 Joyent, Inc. */ #include #include #include #include #include #include #include #include #include #include #include #include "zfs_comutil.h" #ifdef _KERNEL #include #include #endif /* * Routines to manage the on-disk history log. * * The history log is stored as a dmu object containing * tuples. * * Where "record nvlist" is a nvlist containing uint64_ts and strings, and * "packed record length" is the packed length of the "record nvlist" stored * as a little endian uint64_t. * * The log is implemented as a ring buffer, though the original creation * of the pool ('zpool create') is never overwritten. * * The history log is tracked as object 'spa_t::spa_history'. The bonus buffer * of 'spa_history' stores the offsets for logging/retrieving history as * 'spa_history_phys_t'. 'sh_pool_create_len' is the ending offset in bytes of * where the 'zpool create' record is stored. This allows us to never * overwrite the original creation of the pool. 'sh_phys_max_off' is the * physical ending offset in bytes of the log. This tells you the length of * the buffer. 'sh_eof' is the logical EOF (in bytes). Whenever a record * is added, 'sh_eof' is incremented by the the size of the record. * 'sh_eof' is never decremented. 'sh_bof' is the logical BOF (in bytes). * This is where the consumer should start reading from after reading in * the 'zpool create' portion of the log. * * 'sh_records_lost' keeps track of how many records have been overwritten * and permanently lost. */ /* convert a logical offset to physical */ static uint64_t spa_history_log_to_phys(uint64_t log_off, spa_history_phys_t *shpp) { uint64_t phys_len; phys_len = shpp->sh_phys_max_off - shpp->sh_pool_create_len; return ((log_off - shpp->sh_pool_create_len) % phys_len + shpp->sh_pool_create_len); } void spa_history_create_obj(spa_t *spa, dmu_tx_t *tx) { dmu_buf_t *dbp; spa_history_phys_t *shpp; objset_t *mos = spa->spa_meta_objset; ASSERT(spa->spa_history == 0); spa->spa_history = dmu_object_alloc(mos, DMU_OT_SPA_HISTORY, SPA_OLD_MAXBLOCKSIZE, DMU_OT_SPA_HISTORY_OFFSETS, sizeof (spa_history_phys_t), tx); VERIFY(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_HISTORY, sizeof (uint64_t), 1, &spa->spa_history, tx) == 0); VERIFY(0 == dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)); ASSERT(dbp->db_size >= sizeof (spa_history_phys_t)); shpp = dbp->db_data; dmu_buf_will_dirty(dbp, tx); /* * Figure out maximum size of history log. We set it at * 0.1% of pool size, with a max of 1G and min of 128KB. */ shpp->sh_phys_max_off = metaslab_class_get_dspace(spa_normal_class(spa)) / 1000; shpp->sh_phys_max_off = MIN(shpp->sh_phys_max_off, 1<<30); shpp->sh_phys_max_off = MAX(shpp->sh_phys_max_off, 128<<10); dmu_buf_rele(dbp, FTAG); } /* * Change 'sh_bof' to the beginning of the next record. 
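Two small pieces of arithmetic drive the history object described above: the log size is 0.1% of pool space clamped to the range 128 KB to 1 GB, and any logical offset past the preserved 'zpool create' prefix wraps around the remaining physical space. A standalone sketch of both computations, mirroring spa_history_create_obj() and spa_history_log_to_phys():

#include <stdio.h>
#include <stdint.h>

#define MIN(a, b)   ((a) < (b) ? (a) : (b))
#define MAX(a, b)   ((a) > (b) ? (a) : (b))

/* 0.1% of pool space, clamped to [128 KB, 1 GB] */
static uint64_t
history_size(uint64_t pool_bytes)
{
    uint64_t sz = pool_bytes / 1000;

    sz = MIN(sz, 1ULL << 30);
    sz = MAX(sz, 128ULL << 10);
    return (sz);
}

/* logical-to-physical mapping: everything after the create record wraps */
static uint64_t
log_to_phys(uint64_t log_off, uint64_t create_len, uint64_t phys_max_off)
{
    uint64_t phys_len = phys_max_off - create_len;

    return ((log_off - create_len) % phys_len + create_len);
}

int
main(void)
{
    uint64_t max = history_size(10ULL << 30);   /* 10 GB pool */

    printf("log size %llu bytes\n", (unsigned long long)max);
    /* one byte past the end wraps back to just after the create prefix */
    printf("phys %llu\n",
        (unsigned long long)log_to_phys(max + 1, 512, max));
    return (0);
}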
*/ static int spa_history_advance_bof(spa_t *spa, spa_history_phys_t *shpp) { objset_t *mos = spa->spa_meta_objset; uint64_t firstread, reclen, phys_bof; char buf[sizeof (reclen)]; int err; phys_bof = spa_history_log_to_phys(shpp->sh_bof, shpp); firstread = MIN(sizeof (reclen), shpp->sh_phys_max_off - phys_bof); if ((err = dmu_read(mos, spa->spa_history, phys_bof, firstread, buf, DMU_READ_PREFETCH)) != 0) return (err); if (firstread != sizeof (reclen)) { if ((err = dmu_read(mos, spa->spa_history, shpp->sh_pool_create_len, sizeof (reclen) - firstread, buf + firstread, DMU_READ_PREFETCH)) != 0) return (err); } reclen = LE_64(*((uint64_t *)buf)); shpp->sh_bof += reclen + sizeof (reclen); shpp->sh_records_lost++; return (0); } static int spa_history_write(spa_t *spa, void *buf, uint64_t len, spa_history_phys_t *shpp, dmu_tx_t *tx) { uint64_t firstwrite, phys_eof; objset_t *mos = spa->spa_meta_objset; int err; ASSERT(MUTEX_HELD(&spa->spa_history_lock)); /* see if we need to reset logical BOF */ while (shpp->sh_phys_max_off - shpp->sh_pool_create_len - (shpp->sh_eof - shpp->sh_bof) <= len) { if ((err = spa_history_advance_bof(spa, shpp)) != 0) { return (err); } } phys_eof = spa_history_log_to_phys(shpp->sh_eof, shpp); firstwrite = MIN(len, shpp->sh_phys_max_off - phys_eof); shpp->sh_eof += len; dmu_write(mos, spa->spa_history, phys_eof, firstwrite, buf, tx); len -= firstwrite; if (len > 0) { /* write out the rest at the beginning of physical file */ dmu_write(mos, spa->spa_history, shpp->sh_pool_create_len, len, (char *)buf + firstwrite, tx); } return (0); } static char * spa_history_zone(void) { #ifdef _KERNEL /* XXX: pr_hostname can be changed by default from within a jail! */ if (jailed(curthread->td_ucred)) return (curthread->td_ucred->cr_prison->pr_hostname); #endif return (NULL); } /* + * Post a history sysevent. + * + * The nvlist_t* passed into this function will be transformed into a new + * nvlist where: + * + * 1. Nested nvlists will be flattened to a single level + * 2. Keys will have their names normalized (to remove any problematic + * characters, such as whitespace) + * + * The nvlist_t passed into this function will be duplicated and should be + * freed by the caller.
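spa_history_write() above splits a record that crosses the physical end of the object into two writes: 'firstwrite' bytes at the current physical EOF and the remainder at the start of the wrappable region. The same split applied to an in-memory buffer, as a self-contained sketch:

#include <stdio.h>
#include <string.h>

#define MIN(a, b)   ((a) < (b) ? (a) : (b))

/* write 'len' bytes at offset 'phys_eof', wrapping at 'max' back to 'start' */
static void
ring_write(char *ring, size_t start, size_t max, size_t phys_eof,
    const char *buf, size_t len)
{
    size_t firstwrite = MIN(len, max - phys_eof);

    memcpy(ring + phys_eof, buf, firstwrite);
    if (len > firstwrite)       /* remainder wraps to the region start */
        memcpy(ring + start, buf + firstwrite, len - firstwrite);
}

int
main(void)
{
    char ring[16];

    memset(ring, '.', sizeof (ring));
    /* region [4,16) wraps; a 6-byte record at offset 13 spills 3 bytes to 4 */
    ring_write(ring, 4, sizeof (ring), 13, "ABCDEF", 6);
    printf("%.16s\n", ring);    /* prints ....DEF......ABC */
    return (0);
}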
+ * + */ +static void +spa_history_log_notify(spa_t *spa, nvlist_t *nvl) +{ + nvlist_t *hist_nvl = fnvlist_alloc(); + uint64_t uint64; + char *string; + + if (nvlist_lookup_string(nvl, ZPOOL_HIST_CMD, &string) == 0) + fnvlist_add_string(hist_nvl, ZFS_EV_HIST_CMD, string); + + if (nvlist_lookup_string(nvl, ZPOOL_HIST_INT_NAME, &string) == 0) + fnvlist_add_string(hist_nvl, ZFS_EV_HIST_INT_NAME, string); + + if (nvlist_lookup_string(nvl, ZPOOL_HIST_ZONE, &string) == 0) + fnvlist_add_string(hist_nvl, ZFS_EV_HIST_ZONE, string); + + if (nvlist_lookup_string(nvl, ZPOOL_HIST_HOST, &string) == 0) + fnvlist_add_string(hist_nvl, ZFS_EV_HIST_HOST, string); + + if (nvlist_lookup_string(nvl, ZPOOL_HIST_DSNAME, &string) == 0) + fnvlist_add_string(hist_nvl, ZFS_EV_HIST_DSNAME, string); + + if (nvlist_lookup_string(nvl, ZPOOL_HIST_INT_STR, &string) == 0) + fnvlist_add_string(hist_nvl, ZFS_EV_HIST_INT_STR, string); + + if (nvlist_lookup_string(nvl, ZPOOL_HIST_IOCTL, &string) == 0) + fnvlist_add_string(hist_nvl, ZFS_EV_HIST_IOCTL, string); + + if (nvlist_lookup_string(nvl, ZPOOL_HIST_INT_NAME, &string) == 0) + fnvlist_add_string(hist_nvl, ZFS_EV_HIST_INT_NAME, string); + + if (nvlist_lookup_uint64(nvl, ZPOOL_HIST_DSID, &uint64) == 0) + fnvlist_add_uint64(hist_nvl, ZFS_EV_HIST_DSID, uint64); + + if (nvlist_lookup_uint64(nvl, ZPOOL_HIST_TXG, &uint64) == 0) + fnvlist_add_uint64(hist_nvl, ZFS_EV_HIST_TXG, uint64); + + if (nvlist_lookup_uint64(nvl, ZPOOL_HIST_TIME, &uint64) == 0) + fnvlist_add_uint64(hist_nvl, ZFS_EV_HIST_TIME, uint64); + + if (nvlist_lookup_uint64(nvl, ZPOOL_HIST_WHO, &uint64) == 0) + fnvlist_add_uint64(hist_nvl, ZFS_EV_HIST_WHO, uint64); + + if (nvlist_lookup_uint64(nvl, ZPOOL_HIST_INT_EVENT, &uint64) == 0) + fnvlist_add_uint64(hist_nvl, ZFS_EV_HIST_INT_EVENT, uint64); + + spa_event_notify(spa, NULL, hist_nvl, ESC_ZFS_HISTORY_EVENT); + + nvlist_free(hist_nvl); +} + +/* * Write out a history event. */ /*ARGSUSED*/ static void spa_history_log_sync(void *arg, dmu_tx_t *tx) { nvlist_t *nvl = arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; objset_t *mos = spa->spa_meta_objset; dmu_buf_t *dbp; spa_history_phys_t *shpp; size_t reclen; uint64_t le_len; char *record_packed = NULL; int ret; /* * If we have an older pool that doesn't have a command * history object, create it now. */ mutex_enter(&spa->spa_history_lock); if (!spa->spa_history) spa_history_create_obj(spa, tx); mutex_exit(&spa->spa_history_lock); /* * Get the offset of where we need to write via the bonus buffer. * Update the offset when the write completes. 
*/ VERIFY0(dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)); shpp = dbp->db_data; dmu_buf_will_dirty(dbp, tx); #ifdef ZFS_DEBUG { dmu_object_info_t doi; dmu_object_info_from_db(dbp, &doi); ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_SPA_HISTORY_OFFSETS); } #endif fnvlist_add_uint64(nvl, ZPOOL_HIST_TIME, gethrestime_sec()); #ifdef _KERNEL fnvlist_add_string(nvl, ZPOOL_HIST_HOST, utsname.nodename); #endif if (nvlist_exists(nvl, ZPOOL_HIST_CMD)) { zfs_dbgmsg("command: %s", fnvlist_lookup_string(nvl, ZPOOL_HIST_CMD)); } else if (nvlist_exists(nvl, ZPOOL_HIST_INT_NAME)) { if (nvlist_exists(nvl, ZPOOL_HIST_DSNAME)) { zfs_dbgmsg("txg %lld %s %s (id %llu) %s", fnvlist_lookup_uint64(nvl, ZPOOL_HIST_TXG), fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_NAME), fnvlist_lookup_string(nvl, ZPOOL_HIST_DSNAME), fnvlist_lookup_uint64(nvl, ZPOOL_HIST_DSID), fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_STR)); } else { zfs_dbgmsg("txg %lld %s %s", fnvlist_lookup_uint64(nvl, ZPOOL_HIST_TXG), fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_NAME), fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_STR)); } + /* + * The history sysevent is posted only for internal history + * messages to show what has happened, not how it happened. For + * example, the following command: + * + * # zfs destroy -r tank/foo + * + * will result in one sysevent posted per dataset that is + * destroyed as a result of the command - which could be more + * than one event in total. By contrast, if the sysevent was + * posted as a result of the ZPOOL_HIST_CMD key being present + * it would result in only one sysevent being posted with the + * full command line arguments, requiring the consumer to know + * how to parse and understand zfs(1M) command invocations. + */ + spa_history_log_notify(spa, nvl); } else if (nvlist_exists(nvl, ZPOOL_HIST_IOCTL)) { zfs_dbgmsg("ioctl %s", fnvlist_lookup_string(nvl, ZPOOL_HIST_IOCTL)); } record_packed = fnvlist_pack(nvl, &reclen); mutex_enter(&spa->spa_history_lock); /* write out the packed length as little endian */ le_len = LE_64((uint64_t)reclen); ret = spa_history_write(spa, &le_len, sizeof (le_len), shpp, tx); if (!ret) ret = spa_history_write(spa, record_packed, reclen, shpp, tx); /* The first command is the create, which we keep forever */ if (ret == 0 && shpp->sh_pool_create_len == 0 && nvlist_exists(nvl, ZPOOL_HIST_CMD)) { shpp->sh_pool_create_len = shpp->sh_bof = shpp->sh_eof; } mutex_exit(&spa->spa_history_lock); fnvlist_pack_free(record_packed, reclen); dmu_buf_rele(dbp, FTAG); fnvlist_free(nvl); } /* * Write out a history event. */ int spa_history_log(spa_t *spa, const char *msg) { int err; nvlist_t *nvl = fnvlist_alloc(); fnvlist_add_string(nvl, ZPOOL_HIST_CMD, msg); err = spa_history_log_nvl(spa, nvl); fnvlist_free(nvl); return (err); } int spa_history_log_nvl(spa_t *spa, nvlist_t *nvl) { int err = 0; dmu_tx_t *tx; nvlist_t *nvarg; if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY) return (EINVAL); if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY || !spa_writeable(spa)) return (SET_ERROR(EINVAL)); tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); err = dmu_tx_assign(tx, TXG_WAIT); if (err) { dmu_tx_abort(tx); return (err); } nvarg = fnvlist_dup(nvl); if (spa_history_zone() != NULL) { fnvlist_add_string(nvarg, ZPOOL_HIST_ZONE, spa_history_zone()); } fnvlist_add_uint64(nvarg, ZPOOL_HIST_WHO, crgetruid(CRED())); /* Kick this off asynchronously; errors are ignored. 
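Each history record above is framed as a little-endian uint64 length followed by the packed nvlist bytes. A self-contained sketch of writing and reading that framing in a memory buffer, using explicit byte shuffling in place of the kernel's LE_64 macro:

#include <stdio.h>
#include <string.h>
#include <stdint.h>

static void
put_le64(unsigned char *dst, uint64_t v)
{
    for (int i = 0; i < 8; i++)
        dst[i] = (unsigned char)(v >> (8 * i));
}

static uint64_t
get_le64(const unsigned char *src)
{
    uint64_t v = 0;

    for (int i = 0; i < 8; i++)
        v |= (uint64_t)src[i] << (8 * i);
    return (v);
}

int
main(void)
{
    const char record[] = "packed-nvlist-bytes";
    unsigned char log[64];

    /* frame: <little-endian length><record bytes> */
    put_le64(log, sizeof (record));
    memcpy(log + 8, record, sizeof (record));

    uint64_t reclen = get_le64(log);
    printf("len=%llu payload=%s\n", (unsigned long long)reclen,
        (const char *)(log + 8));
    return (0);
}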
*/ dsl_sync_task_nowait(spa_get_dsl(spa), spa_history_log_sync, nvarg, 0, ZFS_SPACE_CHECK_NONE, tx); dmu_tx_commit(tx); /* spa_history_log_sync will free nvl */ return (err); } /* * Read out the command history. */ int spa_history_get(spa_t *spa, uint64_t *offp, uint64_t *len, char *buf) { objset_t *mos = spa->spa_meta_objset; dmu_buf_t *dbp; uint64_t read_len, phys_read_off, phys_eof; uint64_t leftover = 0; spa_history_phys_t *shpp; int err; /* * If the command history doesn't exist (older pool), * that's ok, just return ENOENT. */ if (!spa->spa_history) return (SET_ERROR(ENOENT)); /* * The history is logged asynchronously, so when they request * the first chunk of history, make sure everything has been * synced to disk so that we get it. */ if (*offp == 0 && spa_writeable(spa)) txg_wait_synced(spa_get_dsl(spa), 0); if ((err = dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)) != 0) return (err); shpp = dbp->db_data; #ifdef ZFS_DEBUG { dmu_object_info_t doi; dmu_object_info_from_db(dbp, &doi); ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_SPA_HISTORY_OFFSETS); } #endif mutex_enter(&spa->spa_history_lock); phys_eof = spa_history_log_to_phys(shpp->sh_eof, shpp); if (*offp < shpp->sh_pool_create_len) { /* read in just the zpool create history */ phys_read_off = *offp; read_len = MIN(*len, shpp->sh_pool_create_len - phys_read_off); } else { /* * Need to reset passed in offset to BOF if the passed in * offset has since been overwritten. */ *offp = MAX(*offp, shpp->sh_bof); phys_read_off = spa_history_log_to_phys(*offp, shpp); /* * Read up to the minimum of what the user passed down or * the EOF (physical or logical). If we hit physical EOF, * use 'leftover' to read from the physical BOF. */ if (phys_read_off <= phys_eof) { read_len = MIN(*len, phys_eof - phys_read_off); } else { read_len = MIN(*len, shpp->sh_phys_max_off - phys_read_off); if (phys_read_off + *len > shpp->sh_phys_max_off) { leftover = MIN(*len - read_len, phys_eof - shpp->sh_pool_create_len); } } } /* offset for consumer to use next */ *offp += read_len + leftover; /* tell the consumer how much you actually read */ *len = read_len + leftover; if (read_len == 0) { mutex_exit(&spa->spa_history_lock); dmu_buf_rele(dbp, FTAG); return (0); } err = dmu_read(mos, spa->spa_history, phys_read_off, read_len, buf, DMU_READ_PREFETCH); if (leftover && err == 0) { err = dmu_read(mos, spa->spa_history, shpp->sh_pool_create_len, leftover, buf + read_len, DMU_READ_PREFETCH); } mutex_exit(&spa->spa_history_lock); dmu_buf_rele(dbp, FTAG); return (err); } /* * The nvlist will be consumed by this call. */ static void log_internal(nvlist_t *nvl, const char *operation, spa_t *spa, dmu_tx_t *tx, const char *fmt, va_list adx) { char *msg; va_list adx2; /* * If this is part of creating a pool, not everything is * initialized yet, so don't bother logging the internal events. * Likewise if the pool is not writeable. 
*/ if (tx->tx_txg == TXG_INITIAL || !spa_writeable(spa)) { fnvlist_free(nvl); return; } va_copy(adx2, adx); msg = kmem_alloc(vsnprintf(NULL, 0, fmt, adx) + 1, KM_SLEEP); (void) vsprintf(msg, fmt, adx2); fnvlist_add_string(nvl, ZPOOL_HIST_INT_STR, msg); strfree(msg); va_end(adx2); fnvlist_add_string(nvl, ZPOOL_HIST_INT_NAME, operation); fnvlist_add_uint64(nvl, ZPOOL_HIST_TXG, tx->tx_txg); if (dmu_tx_is_syncing(tx)) { spa_history_log_sync(nvl, tx); } else { dsl_sync_task_nowait(spa_get_dsl(spa), spa_history_log_sync, nvl, 0, ZFS_SPACE_CHECK_NONE, tx); } /* spa_history_log_sync() will free nvl */ } void spa_history_log_internal(spa_t *spa, const char *operation, dmu_tx_t *tx, const char *fmt, ...) { dmu_tx_t *htx = tx; va_list adx; /* create a tx if we didn't get one */ if (tx == NULL) { htx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); if (dmu_tx_assign(htx, TXG_WAIT) != 0) { dmu_tx_abort(htx); return; } } va_start(adx, fmt); log_internal(fnvlist_alloc(), operation, spa, htx, fmt, adx); va_end(adx); /* if we didn't get a tx from the caller, commit the one we made */ if (tx == NULL) dmu_tx_commit(htx); } void spa_history_log_internal_ds(dsl_dataset_t *ds, const char *operation, dmu_tx_t *tx, const char *fmt, ...) { va_list adx; char namebuf[ZFS_MAX_DATASET_NAME_LEN]; nvlist_t *nvl = fnvlist_alloc(); ASSERT(tx != NULL); dsl_dataset_name(ds, namebuf); fnvlist_add_string(nvl, ZPOOL_HIST_DSNAME, namebuf); fnvlist_add_uint64(nvl, ZPOOL_HIST_DSID, ds->ds_object); va_start(adx, fmt); log_internal(nvl, operation, dsl_dataset_get_spa(ds), tx, fmt, adx); va_end(adx); } void spa_history_log_internal_dd(dsl_dir_t *dd, const char *operation, dmu_tx_t *tx, const char *fmt, ...) { va_list adx; char namebuf[ZFS_MAX_DATASET_NAME_LEN]; nvlist_t *nvl = fnvlist_alloc(); ASSERT(tx != NULL); dsl_dir_name(dd, namebuf); fnvlist_add_string(nvl, ZPOOL_HIST_DSNAME, namebuf); fnvlist_add_uint64(nvl, ZPOOL_HIST_DSID, dsl_dir_phys(dd)->dd_head_dataset_obj); va_start(adx, fmt); log_internal(nvl, operation, dd->dd_pool->dp_spa, tx, fmt, adx); va_end(adx); } void spa_history_log_version(spa_t *spa, const char *operation) { spa_history_log_internal(spa, operation, NULL, "pool version %llu; software version %llu/%llu; uts %s %s %s %s", (u_longlong_t)spa_version(spa), SPA_VERSION, ZPL_VERSION, utsname.nodename, utsname.release, utsname.version, utsname.machine); } Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h (revision 329689) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h (revision 329690) @@ -1,929 +1,931 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
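log_internal() above uses the standard two-pass varargs formatting trick: measure the message with vsnprintf(NULL, 0, ...), allocate an exact-fit buffer, then format into it, using va_copy() because the va_list is consumed twice. A userland equivalent; the format string is just an example:

#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>

static char *
format_message(const char *fmt, ...)
{
    va_list ap, ap2;
    char *msg;
    int len;

    va_start(ap, fmt);
    va_copy(ap2, ap);                   /* the list is walked twice */
    len = vsnprintf(NULL, 0, fmt, ap);  /* pass 1: measure */
    va_end(ap);

    msg = malloc(len + 1);
    if (msg != NULL)
        (void) vsnprintf(msg, len + 1, fmt, ap2);   /* pass 2: format */
    va_end(ap2);
    return (msg);
}

int
main(void)
{
    char *m = format_message("txg %lld %s", 1234LL, "destroy");

    if (m != NULL) {
        printf("%s\n", m);
        free(m);
    }
    return (0);
}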
* If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2016 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. * Copyright (c) 2014 Integros [integros.com] + * Copyright 2017 Joyent, Inc. * Copyright (c) 2017 Datto Inc. */ #ifndef _SYS_SPA_H #define _SYS_SPA_H #include #include #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif /* * Forward references that lots of things need. */ typedef struct spa spa_t; typedef struct vdev vdev_t; typedef struct metaslab metaslab_t; typedef struct metaslab_group metaslab_group_t; typedef struct metaslab_class metaslab_class_t; typedef struct zio zio_t; typedef struct zilog zilog_t; typedef struct spa_aux_vdev spa_aux_vdev_t; typedef struct ddt ddt_t; typedef struct ddt_entry ddt_entry_t; struct dsl_pool; struct dsl_dataset; /* * General-purpose 32-bit and 64-bit bitfield encodings. */ #define BF32_DECODE(x, low, len) P2PHASE((x) >> (low), 1U << (len)) #define BF64_DECODE(x, low, len) P2PHASE((x) >> (low), 1ULL << (len)) #define BF32_ENCODE(x, low, len) (P2PHASE((x), 1U << (len)) << (low)) #define BF64_ENCODE(x, low, len) (P2PHASE((x), 1ULL << (len)) << (low)) #define BF32_GET(x, low, len) BF32_DECODE(x, low, len) #define BF64_GET(x, low, len) BF64_DECODE(x, low, len) #define BF32_SET(x, low, len, val) do { \ ASSERT3U(val, <, 1U << (len)); \ ASSERT3U(low + len, <=, 32); \ (x) ^= BF32_ENCODE((x >> low) ^ (val), low, len); \ _NOTE(CONSTCOND) } while (0) #define BF64_SET(x, low, len, val) do { \ ASSERT3U(val, <, 1ULL << (len)); \ ASSERT3U(low + len, <=, 64); \ ((x) ^= BF64_ENCODE((x >> low) ^ (val), low, len)); \ _NOTE(CONSTCOND) } while (0) #define BF32_GET_SB(x, low, len, shift, bias) \ ((BF32_GET(x, low, len) + (bias)) << (shift)) #define BF64_GET_SB(x, low, len, shift, bias) \ ((BF64_GET(x, low, len) + (bias)) << (shift)) #define BF32_SET_SB(x, low, len, shift, bias, val) do { \ ASSERT(IS_P2ALIGNED(val, 1U << shift)); \ ASSERT3S((val) >> (shift), >=, bias); \ BF32_SET(x, low, len, ((val) >> (shift)) - (bias)); \ _NOTE(CONSTCOND) } while (0) #define BF64_SET_SB(x, low, len, shift, bias, val) do { \ ASSERT(IS_P2ALIGNED(val, 1ULL << shift)); \ ASSERT3S((val) >> (shift), >=, bias); \ BF64_SET(x, low, len, ((val) >> (shift)) - (bias)); \ _NOTE(CONSTCOND) } while (0) /* * We currently support block sizes from 512 bytes to 16MB. * The benefits of larger blocks, and thus larger IO, need to be weighed * against the cost of COWing a giant block to modify one byte, and the * large latency of reading or writing a large block. * * Note that although blocks up to 16MB are supported, the recordsize * property can not be set larger than zfs_max_recordsize (default 1MB). * See the comment near zfs_max_recordsize in dsl_dataset.c for details. * * Note that although the LSIZE field of the blkptr_t can store sizes up * to 32MB, the dnode's dn_datablkszsec can only store sizes up to * 32MB - 512 bytes. Therefore, we limit SPA_MAXBLOCKSIZE to 16MB. 
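The BF32/BF64 macros above pack several fields into a single word; functionally they are shift-and-mask accessors. A plain-function version of the 64-bit get/set for reference, written without the XOR-in-place trick the macros use; the field positions in main() are arbitrary examples:

#include <stdio.h>
#include <stdint.h>

/* extract 'len' bits starting at bit 'low' */
static uint64_t
bf64_get(uint64_t x, int low, int len)
{
    return ((x >> low) & ((1ULL << len) - 1));
}

/* store 'val' into 'len' bits starting at bit 'low' */
static uint64_t
bf64_set(uint64_t x, int low, int len, uint64_t val)
{
    uint64_t mask = ((1ULL << len) - 1) << low;

    return ((x & ~mask) | ((val << low) & mask));
}

int
main(void)
{
    uint64_t word = 0;

    word = bf64_set(word, 32, 7, 15);   /* e.g. a 7-bit field at bit 32 */
    word = bf64_set(word, 40, 8, 7);    /* e.g. an 8-bit field at bit 40 */
    printf("field@32=%llu field@40=%llu\n",
        (unsigned long long)bf64_get(word, 32, 7),
        (unsigned long long)bf64_get(word, 40, 8));
    return (0);
}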
*/ #define SPA_MINBLOCKSHIFT 9 #define SPA_OLD_MAXBLOCKSHIFT 17 #define SPA_MAXBLOCKSHIFT 24 #define SPA_MINBLOCKSIZE (1ULL << SPA_MINBLOCKSHIFT) #define SPA_OLD_MAXBLOCKSIZE (1ULL << SPA_OLD_MAXBLOCKSHIFT) #define SPA_MAXBLOCKSIZE (1ULL << SPA_MAXBLOCKSHIFT) /* * Default maximum supported logical ashift. * * The current 8k allocation block size limit is due to the 8k * aligned/sized operations performed by vdev_probe() on * vdev_label->vl_pad2. Using another "safe region" for these tests * would allow the limit to be raised to 16k, at the expense of * only having 8 available uberblocks in the label area. */ #define SPA_MAXASHIFT 13 /* * Default minimum supported logical ashift. */ #define SPA_MINASHIFT SPA_MINBLOCKSHIFT /* * Size of block to hold the configuration data (a packed nvlist) */ #define SPA_CONFIG_BLOCKSIZE (1ULL << 14) /* * The DVA size encodings for LSIZE and PSIZE support blocks up to 32MB. * The ASIZE encoding should be at least 64 times larger (6 more bits) * to support up to 4-way RAID-Z mirror mode with worst-case gang block * overhead, three DVAs per bp, plus one more bit in case we do anything * else that expands the ASIZE. */ #define SPA_LSIZEBITS 16 /* LSIZE up to 32M (2^16 * 512) */ #define SPA_PSIZEBITS 16 /* PSIZE up to 32M (2^16 * 512) */ #define SPA_ASIZEBITS 24 /* ASIZE up to 64 times larger */ #define SPA_COMPRESSBITS 7 /* * All SPA data is represented by 128-bit data virtual addresses (DVAs). * The members of the dva_t should be considered opaque outside the SPA. */ typedef struct dva { uint64_t dva_word[2]; } dva_t; /* * Each block has a 256-bit checksum -- strong enough for cryptographic hashes. */ typedef struct zio_cksum { uint64_t zc_word[4]; } zio_cksum_t; /* * Some checksums/hashes need a 256-bit initialization salt. This salt is kept * secret and is suitable for use in MAC algorithms as the key. */ typedef struct zio_cksum_salt { uint8_t zcs_bytes[32]; } zio_cksum_salt_t; /* * Each block is described by its DVAs, time of birth, checksum, etc. 
* The word-by-word, bit-by-bit layout of the blkptr is as follows: * * 64 56 48 40 32 24 16 8 0 * +-------+-------+-------+-------+-------+-------+-------+-------+ * 0 | vdev1 | GRID | ASIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 1 |G| offset1 | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 2 | vdev2 | GRID | ASIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 3 |G| offset2 | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 4 | vdev3 | GRID | ASIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 5 |G| offset3 | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 6 |BDX|lvl| type | cksum |E| comp| PSIZE | LSIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 7 | padding | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 8 | padding | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 9 | physical birth txg | * +-------+-------+-------+-------+-------+-------+-------+-------+ * a | logical birth txg | * +-------+-------+-------+-------+-------+-------+-------+-------+ * b | fill count | * +-------+-------+-------+-------+-------+-------+-------+-------+ * c | checksum[0] | * +-------+-------+-------+-------+-------+-------+-------+-------+ * d | checksum[1] | * +-------+-------+-------+-------+-------+-------+-------+-------+ * e | checksum[2] | * +-------+-------+-------+-------+-------+-------+-------+-------+ * f | checksum[3] | * +-------+-------+-------+-------+-------+-------+-------+-------+ * * Legend: * * vdev virtual device ID * offset offset into virtual device * LSIZE logical size * PSIZE physical size (after compression) * ASIZE allocated size (including RAID-Z parity and gang block headers) * GRID RAID-Z layout information (reserved for future use) * cksum checksum function * comp compression function * G gang block indicator * B byteorder (endianness) * D dedup * X encryption (on version 30, which is not supported) * E blkptr_t contains embedded data (see below) * lvl level of indirection * type DMU object type * phys birth txg of block allocation; zero if same as logical birth txg * log. birth transaction group in which the block was logically born * fill count number of non-zero blocks under this bp * checksum[4] 256-bit checksum of the data this bp describes */ /* * "Embedded" blkptr_t's don't actually point to a block, instead they * have a data payload embedded in the blkptr_t itself. See the comment * in blkptr.c for more details. 
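For readers decoding the layout diagram above by hand, word 6 (blk_prop) is a packed bitfield. The sketch below pulls out its fields using the same bit positions as the BP_GET_* macros defined further down in this header, with LSIZE and PSIZE stored as 512-byte sectors minus one; the sample blk_prop value in main() is made up purely to exercise the decoder:

#include <stdio.h>
#include <stdint.h>

static uint64_t
bits(uint64_t x, int low, int len)
{
    return ((x >> low) & ((1ULL << len) - 1));
}

static void
decode_blk_prop(uint64_t prop)
{
    printf("lsize     %llu bytes\n",
        (unsigned long long)((bits(prop, 0, 16) + 1) << 9));
    printf("psize     %llu bytes\n",
        (unsigned long long)((bits(prop, 16, 16) + 1) << 9));
    printf("comp      %llu\n", (unsigned long long)bits(prop, 32, 7));
    printf("embedded  %llu\n", (unsigned long long)bits(prop, 39, 1));
    printf("cksum     %llu\n", (unsigned long long)bits(prop, 40, 8));
    printf("type      %llu\n", (unsigned long long)bits(prop, 48, 8));
    printf("level     %llu\n", (unsigned long long)bits(prop, 56, 5));
    printf("dedup     %llu\n", (unsigned long long)bits(prop, 62, 1));
    printf("byteorder %llu\n", (unsigned long long)bits(prop, 63, 1));
}

int
main(void)
{
    uint64_t prop = 0;

    prop |= (uint64_t)(32 - 1) << 0;    /* LSIZE: 32 sectors = 16K */
    prop |= (uint64_t)(8 - 1) << 16;    /* PSIZE: 8 sectors = 4K */
    prop |= 15ULL << 32;                /* compression algorithm id */
    prop |= 7ULL << 40;                 /* checksum algorithm id */
    prop |= 19ULL << 48;                /* DMU object type */
    prop |= 1ULL << 63;                 /* byteorder bit */
    decode_blk_prop(prop);
    return (0);
}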
* * The blkptr_t is laid out as follows: * * 64 56 48 40 32 24 16 8 0 * +-------+-------+-------+-------+-------+-------+-------+-------+ * 0 | payload | * 1 | payload | * 2 | payload | * 3 | payload | * 4 | payload | * 5 | payload | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 6 |BDX|lvl| type | etype |E| comp| PSIZE| LSIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 7 | payload | * 8 | payload | * 9 | payload | * +-------+-------+-------+-------+-------+-------+-------+-------+ * a | logical birth txg | * +-------+-------+-------+-------+-------+-------+-------+-------+ * b | payload | * c | payload | * d | payload | * e | payload | * f | payload | * +-------+-------+-------+-------+-------+-------+-------+-------+ * * Legend: * * payload contains the embedded data * B (byteorder) byteorder (endianness) * D (dedup) padding (set to zero) * X encryption (set to zero; see above) * E (embedded) set to one * lvl indirection level * type DMU object type * etype how to interpret embedded data (BP_EMBEDDED_TYPE_*) * comp compression function of payload * PSIZE size of payload after compression, in bytes * LSIZE logical size of payload, in bytes * note that 25 bits is enough to store the largest * "normal" BP's LSIZE (2^16 * 2^9) in bytes * log. birth transaction group in which the block was logically born * * Note that LSIZE and PSIZE are stored in bytes, whereas for non-embedded * bp's they are stored in units of SPA_MINBLOCKSHIFT. * Generally, the generic BP_GET_*() macros can be used on embedded BP's. * The B, D, X, lvl, type, and comp fields are stored the same as with normal * BP's so the BP_SET_* macros can be used with them. etype, PSIZE, LSIZE must * be set with the BPE_SET_* macros. BP_SET_EMBEDDED() should be called before * other macros, as they assert that they are only used on BP's of the correct * "embedded-ness". */ #define BPE_GET_ETYPE(bp) \ (ASSERT(BP_IS_EMBEDDED(bp)), \ BF64_GET((bp)->blk_prop, 40, 8)) #define BPE_SET_ETYPE(bp, t) do { \ ASSERT(BP_IS_EMBEDDED(bp)); \ BF64_SET((bp)->blk_prop, 40, 8, t); \ _NOTE(CONSTCOND) } while (0) #define BPE_GET_LSIZE(bp) \ (ASSERT(BP_IS_EMBEDDED(bp)), \ BF64_GET_SB((bp)->blk_prop, 0, 25, 0, 1)) #define BPE_SET_LSIZE(bp, x) do { \ ASSERT(BP_IS_EMBEDDED(bp)); \ BF64_SET_SB((bp)->blk_prop, 0, 25, 0, 1, x); \ _NOTE(CONSTCOND) } while (0) #define BPE_GET_PSIZE(bp) \ (ASSERT(BP_IS_EMBEDDED(bp)), \ BF64_GET_SB((bp)->blk_prop, 25, 7, 0, 1)) #define BPE_SET_PSIZE(bp, x) do { \ ASSERT(BP_IS_EMBEDDED(bp)); \ BF64_SET_SB((bp)->blk_prop, 25, 7, 0, 1, x); \ _NOTE(CONSTCOND) } while (0) typedef enum bp_embedded_type { BP_EMBEDDED_TYPE_DATA, BP_EMBEDDED_TYPE_RESERVED, /* Reserved for an unintegrated feature. */ NUM_BP_EMBEDDED_TYPES = BP_EMBEDDED_TYPE_RESERVED } bp_embedded_type_t; #define BPE_NUM_WORDS 14 #define BPE_PAYLOAD_SIZE (BPE_NUM_WORDS * sizeof (uint64_t)) #define BPE_IS_PAYLOADWORD(bp, wp) \ ((wp) != &(bp)->blk_prop && (wp) != &(bp)->blk_birth) #define SPA_BLKPTRSHIFT 7 /* blkptr_t is 128 bytes */ #define SPA_DVAS_PER_BP 3 /* Number of DVAs in a bp */ /* * A block is a hole when it has either 1) never been written to, or * 2) is zero-filled. In both cases, ZFS can return all zeroes for all reads * without physically allocating disk space. Holes are represented in the * blkptr_t structure by zeroed blk_dva. Correct checking for holes is * done through the BP_IS_HOLE macro. 
For holes, the logical size, level, * DMU object type, and birth times are all also stored for holes that * were written to at some point (i.e. were punched after having been filled). */ typedef struct blkptr { dva_t blk_dva[SPA_DVAS_PER_BP]; /* Data Virtual Addresses */ uint64_t blk_prop; /* size, compression, type, etc */ uint64_t blk_pad[2]; /* Extra space for the future */ uint64_t blk_phys_birth; /* txg when block was allocated */ uint64_t blk_birth; /* transaction group at birth */ uint64_t blk_fill; /* fill count */ zio_cksum_t blk_cksum; /* 256-bit checksum */ } blkptr_t; /* * Macros to get and set fields in a bp or DVA. */ #define DVA_GET_ASIZE(dva) \ BF64_GET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, SPA_MINBLOCKSHIFT, 0) #define DVA_SET_ASIZE(dva, x) \ BF64_SET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, \ SPA_MINBLOCKSHIFT, 0, x) #define DVA_GET_GRID(dva) BF64_GET((dva)->dva_word[0], 24, 8) #define DVA_SET_GRID(dva, x) BF64_SET((dva)->dva_word[0], 24, 8, x) #define DVA_GET_VDEV(dva) BF64_GET((dva)->dva_word[0], 32, 32) #define DVA_SET_VDEV(dva, x) BF64_SET((dva)->dva_word[0], 32, 32, x) #define DVA_GET_OFFSET(dva) \ BF64_GET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0) #define DVA_SET_OFFSET(dva, x) \ BF64_SET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0, x) #define DVA_GET_GANG(dva) BF64_GET((dva)->dva_word[1], 63, 1) #define DVA_SET_GANG(dva, x) BF64_SET((dva)->dva_word[1], 63, 1, x) #define BP_GET_LSIZE(bp) \ (BP_IS_EMBEDDED(bp) ? \ (BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA ? BPE_GET_LSIZE(bp) : 0): \ BF64_GET_SB((bp)->blk_prop, 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1)) #define BP_SET_LSIZE(bp, x) do { \ ASSERT(!BP_IS_EMBEDDED(bp)); \ BF64_SET_SB((bp)->blk_prop, \ 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \ _NOTE(CONSTCOND) } while (0) #define BP_GET_PSIZE(bp) \ (BP_IS_EMBEDDED(bp) ? 0 : \ BF64_GET_SB((bp)->blk_prop, 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1)) #define BP_SET_PSIZE(bp, x) do { \ ASSERT(!BP_IS_EMBEDDED(bp)); \ BF64_SET_SB((bp)->blk_prop, \ 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \ _NOTE(CONSTCOND) } while (0) #define BP_GET_COMPRESS(bp) \ BF64_GET((bp)->blk_prop, 32, SPA_COMPRESSBITS) #define BP_SET_COMPRESS(bp, x) \ BF64_SET((bp)->blk_prop, 32, SPA_COMPRESSBITS, x) #define BP_IS_EMBEDDED(bp) BF64_GET((bp)->blk_prop, 39, 1) #define BP_SET_EMBEDDED(bp, x) BF64_SET((bp)->blk_prop, 39, 1, x) #define BP_GET_CHECKSUM(bp) \ (BP_IS_EMBEDDED(bp) ? ZIO_CHECKSUM_OFF : \ BF64_GET((bp)->blk_prop, 40, 8)) #define BP_SET_CHECKSUM(bp, x) do { \ ASSERT(!BP_IS_EMBEDDED(bp)); \ BF64_SET((bp)->blk_prop, 40, 8, x); \ _NOTE(CONSTCOND) } while (0) #define BP_GET_TYPE(bp) BF64_GET((bp)->blk_prop, 48, 8) #define BP_SET_TYPE(bp, x) BF64_SET((bp)->blk_prop, 48, 8, x) #define BP_GET_LEVEL(bp) BF64_GET((bp)->blk_prop, 56, 5) #define BP_SET_LEVEL(bp, x) BF64_SET((bp)->blk_prop, 56, 5, x) #define BP_GET_DEDUP(bp) BF64_GET((bp)->blk_prop, 62, 1) #define BP_SET_DEDUP(bp, x) BF64_SET((bp)->blk_prop, 62, 1, x) #define BP_GET_BYTEORDER(bp) BF64_GET((bp)->blk_prop, 63, 1) #define BP_SET_BYTEORDER(bp, x) BF64_SET((bp)->blk_prop, 63, 1, x) #define BP_PHYSICAL_BIRTH(bp) \ (BP_IS_EMBEDDED(bp) ? 0 : \ (bp)->blk_phys_birth ? (bp)->blk_phys_birth : (bp)->blk_birth) #define BP_SET_BIRTH(bp, logical, physical) \ { \ ASSERT(!BP_IS_EMBEDDED(bp)); \ (bp)->blk_birth = (logical); \ (bp)->blk_phys_birth = ((logical) == (physical) ? 0 : (physical)); \ } #define BP_GET_FILL(bp) (BP_IS_EMBEDDED(bp) ? 
1 : (bp)->blk_fill) #define BP_IS_METADATA(bp) \ (BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp))) #define BP_GET_ASIZE(bp) \ (BP_IS_EMBEDDED(bp) ? 0 : \ DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \ DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \ DVA_GET_ASIZE(&(bp)->blk_dva[2])) #define BP_GET_UCSIZE(bp) \ (BP_IS_METADATA(bp) ? BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp)) #define BP_GET_NDVAS(bp) \ (BP_IS_EMBEDDED(bp) ? 0 : \ !!DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \ !!DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \ !!DVA_GET_ASIZE(&(bp)->blk_dva[2])) #define BP_COUNT_GANG(bp) \ (BP_IS_EMBEDDED(bp) ? 0 : \ (DVA_GET_GANG(&(bp)->blk_dva[0]) + \ DVA_GET_GANG(&(bp)->blk_dva[1]) + \ DVA_GET_GANG(&(bp)->blk_dva[2]))) #define DVA_EQUAL(dva1, dva2) \ ((dva1)->dva_word[1] == (dva2)->dva_word[1] && \ (dva1)->dva_word[0] == (dva2)->dva_word[0]) #define BP_EQUAL(bp1, bp2) \ (BP_PHYSICAL_BIRTH(bp1) == BP_PHYSICAL_BIRTH(bp2) && \ (bp1)->blk_birth == (bp2)->blk_birth && \ DVA_EQUAL(&(bp1)->blk_dva[0], &(bp2)->blk_dva[0]) && \ DVA_EQUAL(&(bp1)->blk_dva[1], &(bp2)->blk_dva[1]) && \ DVA_EQUAL(&(bp1)->blk_dva[2], &(bp2)->blk_dva[2])) #define ZIO_CHECKSUM_EQUAL(zc1, zc2) \ (0 == (((zc1).zc_word[0] - (zc2).zc_word[0]) | \ ((zc1).zc_word[1] - (zc2).zc_word[1]) | \ ((zc1).zc_word[2] - (zc2).zc_word[2]) | \ ((zc1).zc_word[3] - (zc2).zc_word[3]))) #define ZIO_CHECKSUM_IS_ZERO(zc) \ (0 == ((zc)->zc_word[0] | (zc)->zc_word[1] | \ (zc)->zc_word[2] | (zc)->zc_word[3])) #define ZIO_CHECKSUM_BSWAP(zcp) \ { \ (zcp)->zc_word[0] = BSWAP_64((zcp)->zc_word[0]); \ (zcp)->zc_word[1] = BSWAP_64((zcp)->zc_word[1]); \ (zcp)->zc_word[2] = BSWAP_64((zcp)->zc_word[2]); \ (zcp)->zc_word[3] = BSWAP_64((zcp)->zc_word[3]); \ } #define DVA_IS_VALID(dva) (DVA_GET_ASIZE(dva) != 0) #define ZIO_SET_CHECKSUM(zcp, w0, w1, w2, w3) \ { \ (zcp)->zc_word[0] = w0; \ (zcp)->zc_word[1] = w1; \ (zcp)->zc_word[2] = w2; \ (zcp)->zc_word[3] = w3; \ } #define BP_IDENTITY(bp) (ASSERT(!BP_IS_EMBEDDED(bp)), &(bp)->blk_dva[0]) #define BP_IS_GANG(bp) \ (BP_IS_EMBEDDED(bp) ? B_FALSE : DVA_GET_GANG(BP_IDENTITY(bp))) #define DVA_IS_EMPTY(dva) ((dva)->dva_word[0] == 0ULL && \ (dva)->dva_word[1] == 0ULL) #define BP_IS_HOLE(bp) \ (!BP_IS_EMBEDDED(bp) && DVA_IS_EMPTY(BP_IDENTITY(bp))) /* BP_IS_RAIDZ(bp) assumes no block compression */ #define BP_IS_RAIDZ(bp) (DVA_GET_ASIZE(&(bp)->blk_dva[0]) > \ BP_GET_PSIZE(bp)) #define BP_ZERO(bp) \ { \ (bp)->blk_dva[0].dva_word[0] = 0; \ (bp)->blk_dva[0].dva_word[1] = 0; \ (bp)->blk_dva[1].dva_word[0] = 0; \ (bp)->blk_dva[1].dva_word[1] = 0; \ (bp)->blk_dva[2].dva_word[0] = 0; \ (bp)->blk_dva[2].dva_word[1] = 0; \ (bp)->blk_prop = 0; \ (bp)->blk_pad[0] = 0; \ (bp)->blk_pad[1] = 0; \ (bp)->blk_phys_birth = 0; \ (bp)->blk_birth = 0; \ (bp)->blk_fill = 0; \ ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0); \ } #if BYTE_ORDER == _BIG_ENDIAN #define ZFS_HOST_BYTEORDER (0ULL) #else #define ZFS_HOST_BYTEORDER (1ULL) #endif #define BP_SHOULD_BYTESWAP(bp) (BP_GET_BYTEORDER(bp) != ZFS_HOST_BYTEORDER) #define BP_SPRINTF_LEN 320 /* * This macro allows code sharing between zfs, libzpool, and mdb. * 'func' is either snprintf() or mdb_snprintf(). * 'ws' (whitespace) can be ' ' for single-line format, '\n' for multi-line. 
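Editor's note: the DVA_* macros above slice the two 64-bit dva_words: word 0 carries ASIZE (bits 0-23, in SPA_MINBLOCKSHIFT units), GRID (bits 24-31) and the vdev id (bits 32-63); word 1 carries the device offset (bits 0-62, also in sector units) and the gang bit (63). A minimal userland sketch of that packing, assuming 512-byte sectors and leaving GRID at zero; toy_dva_t is a stand-in, not the real dva_t.

#include <stdint.h>
#include <stdio.h>

#define	SECT_SHIFT	9	/* assumed SPA_MINBLOCKSHIFT */

/* Toy DVA with the same two-word layout described above (sketch only). */
typedef struct toy_dva {
	uint64_t dva_word[2];
} toy_dva_t;

static void
toy_dva_set(toy_dva_t *dva, uint64_t vdev, uint64_t offset, uint64_t asize,
    int gang)
{
	dva->dva_word[0] = (vdev << 32) |
	    ((asize >> SECT_SHIFT) & ((1ULL << 24) - 1));
	dva->dva_word[1] = ((uint64_t)gang << 63) |
	    ((offset >> SECT_SHIFT) & ((1ULL << 63) - 1));
}

int
main(void)
{
	toy_dva_t dva;

	toy_dva_set(&dva, 3, 0x10000, 0x20000, 0);

	printf("vdev=%llu offset=0x%llx asize=0x%llx gang=%llu\n",
	    (unsigned long long)(dva.dva_word[0] >> 32),
	    (unsigned long long)((dva.dva_word[1] &
	    ((1ULL << 63) - 1)) << SECT_SHIFT),
	    (unsigned long long)((dva.dva_word[0] &
	    ((1ULL << 24) - 1)) << SECT_SHIFT),
	    (unsigned long long)(dva.dva_word[1] >> 63));
	return (0);
}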
*/ #define SNPRINTF_BLKPTR(func, ws, buf, size, bp, type, checksum, compress) \ { \ static const char *copyname[] = \ { "zero", "single", "double", "triple" }; \ int len = 0; \ int copies = 0; \ \ if (bp == NULL) { \ len += func(buf + len, size - len, ""); \ } else if (BP_IS_HOLE(bp)) { \ len += func(buf + len, size - len, \ "HOLE [L%llu %s] " \ "size=%llxL birth=%lluL", \ (u_longlong_t)BP_GET_LEVEL(bp), \ type, \ (u_longlong_t)BP_GET_LSIZE(bp), \ (u_longlong_t)bp->blk_birth); \ } else if (BP_IS_EMBEDDED(bp)) { \ len = func(buf + len, size - len, \ "EMBEDDED [L%llu %s] et=%u %s " \ "size=%llxL/%llxP birth=%lluL", \ (u_longlong_t)BP_GET_LEVEL(bp), \ type, \ (int)BPE_GET_ETYPE(bp), \ compress, \ (u_longlong_t)BPE_GET_LSIZE(bp), \ (u_longlong_t)BPE_GET_PSIZE(bp), \ (u_longlong_t)bp->blk_birth); \ } else { \ for (int d = 0; d < BP_GET_NDVAS(bp); d++) { \ const dva_t *dva = &bp->blk_dva[d]; \ if (DVA_IS_VALID(dva)) \ copies++; \ len += func(buf + len, size - len, \ "DVA[%d]=<%llu:%llx:%llx>%c", d, \ (u_longlong_t)DVA_GET_VDEV(dva), \ (u_longlong_t)DVA_GET_OFFSET(dva), \ (u_longlong_t)DVA_GET_ASIZE(dva), \ ws); \ } \ if (BP_IS_GANG(bp) && \ DVA_GET_ASIZE(&bp->blk_dva[2]) <= \ DVA_GET_ASIZE(&bp->blk_dva[1]) / 2) \ copies--; \ len += func(buf + len, size - len, \ "[L%llu %s] %s %s %s %s %s %s%c" \ "size=%llxL/%llxP birth=%lluL/%lluP fill=%llu%c" \ "cksum=%llx:%llx:%llx:%llx", \ (u_longlong_t)BP_GET_LEVEL(bp), \ type, \ checksum, \ compress, \ BP_GET_BYTEORDER(bp) == 0 ? "BE" : "LE", \ BP_IS_GANG(bp) ? "gang" : "contiguous", \ BP_GET_DEDUP(bp) ? "dedup" : "unique", \ copyname[copies], \ ws, \ (u_longlong_t)BP_GET_LSIZE(bp), \ (u_longlong_t)BP_GET_PSIZE(bp), \ (u_longlong_t)bp->blk_birth, \ (u_longlong_t)BP_PHYSICAL_BIRTH(bp), \ (u_longlong_t)BP_GET_FILL(bp), \ ws, \ (u_longlong_t)bp->blk_cksum.zc_word[0], \ (u_longlong_t)bp->blk_cksum.zc_word[1], \ (u_longlong_t)bp->blk_cksum.zc_word[2], \ (u_longlong_t)bp->blk_cksum.zc_word[3]); \ } \ ASSERT(len < size); \ } #define BP_GET_BUFC_TYPE(bp) \ (BP_IS_METADATA(bp) ? 
ARC_BUFC_METADATA : ARC_BUFC_DATA) typedef enum spa_import_type { SPA_IMPORT_EXISTING, SPA_IMPORT_ASSEMBLE } spa_import_type_t; /* state manipulation functions */ extern int spa_open(const char *pool, spa_t **, void *tag); extern int spa_open_rewind(const char *pool, spa_t **, void *tag, nvlist_t *policy, nvlist_t **config); extern int spa_get_stats(const char *pool, nvlist_t **config, char *altroot, size_t buflen); extern int spa_create(const char *pool, nvlist_t *config, nvlist_t *props, nvlist_t *zplprops); #ifdef illumos extern int spa_import_rootpool(char *devpath, char *devid); #else extern int spa_import_rootpool(const char *name); #endif extern int spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags); extern nvlist_t *spa_tryimport(nvlist_t *tryconfig); extern int spa_destroy(char *pool); extern int spa_export(char *pool, nvlist_t **oldconfig, boolean_t force, boolean_t hardforce); extern int spa_reset(char *pool); extern void spa_async_request(spa_t *spa, int flag); extern void spa_async_unrequest(spa_t *spa, int flag); extern void spa_async_suspend(spa_t *spa); extern void spa_async_resume(spa_t *spa); extern spa_t *spa_inject_addref(char *pool); extern void spa_inject_delref(spa_t *spa); extern void spa_scan_stat_init(spa_t *spa); extern int spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps); #define SPA_ASYNC_CONFIG_UPDATE 0x01 #define SPA_ASYNC_REMOVE 0x02 #define SPA_ASYNC_PROBE 0x04 #define SPA_ASYNC_RESILVER_DONE 0x08 #define SPA_ASYNC_RESILVER 0x10 #define SPA_ASYNC_AUTOEXPAND 0x20 #define SPA_ASYNC_REMOVE_DONE 0x40 #define SPA_ASYNC_REMOVE_STOP 0x80 /* * Controls the behavior of spa_vdev_remove(). */ #define SPA_REMOVE_UNSPARE 0x01 #define SPA_REMOVE_DONE 0x02 /* device manipulation */ extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot); extern int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing); extern int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done); extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare); extern boolean_t spa_vdev_remove_active(spa_t *spa); extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath); extern int spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru); extern int spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, nvlist_t *props, boolean_t exp); /* spare state (which is global across all pools) */ extern void spa_spare_add(vdev_t *vd); extern void spa_spare_remove(vdev_t *vd); extern boolean_t spa_spare_exists(uint64_t guid, uint64_t *pool, int *refcnt); extern void spa_spare_activate(vdev_t *vd); /* L2ARC state (which is global across all pools) */ extern void spa_l2cache_add(vdev_t *vd); extern void spa_l2cache_remove(vdev_t *vd); extern boolean_t spa_l2cache_exists(uint64_t guid, uint64_t *pool); extern void spa_l2cache_activate(vdev_t *vd); extern void spa_l2cache_drop(spa_t *spa); /* scanning */ extern int spa_scan(spa_t *spa, pool_scan_func_t func); extern int spa_scan_stop(spa_t *spa); extern int spa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t flag); /* spa syncing */ extern void spa_sync(spa_t *spa, uint64_t txg); /* only for DMU use */ extern void spa_sync_allpools(void); /* spa namespace global mutex */ extern kmutex_t spa_namespace_lock; /* * SPA configuration functions in spa_config.c */ #define SPA_CONFIG_UPDATE_POOL 0 #define SPA_CONFIG_UPDATE_VDEVS 1 extern void spa_config_sync(spa_t *, boolean_t, boolean_t); extern void spa_config_load(void); extern nvlist_t 
*spa_all_configs(uint64_t *); extern void spa_config_set(spa_t *spa, nvlist_t *config); extern nvlist_t *spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats); extern void spa_config_update(spa_t *spa, int what); /* * Miscellaneous SPA routines in spa_misc.c */ /* Namespace manipulation */ extern spa_t *spa_lookup(const char *name); extern spa_t *spa_add(const char *name, nvlist_t *config, const char *altroot); extern void spa_remove(spa_t *spa); extern spa_t *spa_next(spa_t *prev); /* Refcount functions */ extern void spa_open_ref(spa_t *spa, void *tag); extern void spa_close(spa_t *spa, void *tag); extern void spa_async_close(spa_t *spa, void *tag); extern boolean_t spa_refcount_zero(spa_t *spa); #define SCL_NONE 0x00 #define SCL_CONFIG 0x01 #define SCL_STATE 0x02 #define SCL_L2ARC 0x04 /* hack until L2ARC 2.0 */ #define SCL_ALLOC 0x08 #define SCL_ZIO 0x10 #define SCL_FREE 0x20 #define SCL_VDEV 0x40 #define SCL_LOCKS 7 #define SCL_ALL ((1 << SCL_LOCKS) - 1) #define SCL_STATE_ALL (SCL_STATE | SCL_L2ARC | SCL_ZIO) /* Pool configuration locks */ extern int spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw); extern void spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw); extern void spa_config_exit(spa_t *spa, int locks, void *tag); extern int spa_config_held(spa_t *spa, int locks, krw_t rw); /* Pool vdev add/remove lock */ extern uint64_t spa_vdev_enter(spa_t *spa); extern uint64_t spa_vdev_config_enter(spa_t *spa); extern void spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag); extern int spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error); /* Pool vdev state change lock */ extern void spa_vdev_state_enter(spa_t *spa, int oplock); extern int spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error); /* Log state */ typedef enum spa_log_state { SPA_LOG_UNKNOWN = 0, /* unknown log state */ SPA_LOG_MISSING, /* missing log(s) */ SPA_LOG_CLEAR, /* clear the log(s) */ SPA_LOG_GOOD, /* log(s) are good */ } spa_log_state_t; extern spa_log_state_t spa_get_log_state(spa_t *spa); extern void spa_set_log_state(spa_t *spa, spa_log_state_t state); extern int spa_offline_log(spa_t *spa); /* Log claim callback */ extern void spa_claim_notify(zio_t *zio); /* Accessor functions */ extern boolean_t spa_shutting_down(spa_t *spa); extern struct dsl_pool *spa_get_dsl(spa_t *spa); extern boolean_t spa_is_initializing(spa_t *spa); extern blkptr_t *spa_get_rootblkptr(spa_t *spa); extern void spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp); extern void spa_altroot(spa_t *, char *, size_t); extern int spa_sync_pass(spa_t *spa); extern char *spa_name(spa_t *spa); extern uint64_t spa_guid(spa_t *spa); extern uint64_t spa_load_guid(spa_t *spa); extern uint64_t spa_last_synced_txg(spa_t *spa); extern uint64_t spa_first_txg(spa_t *spa); extern uint64_t spa_syncing_txg(spa_t *spa); extern uint64_t spa_final_dirty_txg(spa_t *spa); extern uint64_t spa_version(spa_t *spa); extern pool_state_t spa_state(spa_t *spa); extern spa_load_state_t spa_load_state(spa_t *spa); extern uint64_t spa_freeze_txg(spa_t *spa); extern uint64_t spa_get_worst_case_asize(spa_t *spa, uint64_t lsize); extern uint64_t spa_get_dspace(spa_t *spa); extern uint64_t spa_get_slop_space(spa_t *spa); extern void spa_update_dspace(spa_t *spa); extern uint64_t spa_version(spa_t *spa); extern boolean_t spa_deflate(spa_t *spa); extern metaslab_class_t *spa_normal_class(spa_t *spa); extern metaslab_class_t *spa_log_class(spa_t *spa); extern void spa_evicting_os_register(spa_t *, 
objset_t *os); extern void spa_evicting_os_deregister(spa_t *, objset_t *os); extern void spa_evicting_os_wait(spa_t *spa); extern int spa_max_replication(spa_t *spa); extern int spa_prev_software_version(spa_t *spa); extern int spa_busy(void); extern uint8_t spa_get_failmode(spa_t *spa); extern boolean_t spa_suspended(spa_t *spa); extern uint64_t spa_bootfs(spa_t *spa); extern uint64_t spa_delegation(spa_t *spa); extern objset_t *spa_meta_objset(spa_t *spa); extern uint64_t spa_deadman_synctime(spa_t *spa); /* Miscellaneous support routines */ extern void spa_activate_mos_feature(spa_t *spa, const char *feature, dmu_tx_t *tx); extern void spa_deactivate_mos_feature(spa_t *spa, const char *feature); extern int spa_rename(const char *oldname, const char *newname); extern spa_t *spa_by_guid(uint64_t pool_guid, uint64_t device_guid); extern boolean_t spa_guid_exists(uint64_t pool_guid, uint64_t device_guid); extern char *spa_strdup(const char *); extern void spa_strfree(char *); extern uint64_t spa_get_random(uint64_t range); extern uint64_t spa_generate_guid(spa_t *spa); extern void snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp); extern void spa_freeze(spa_t *spa); extern int spa_change_guid(spa_t *spa); extern void spa_upgrade(spa_t *spa, uint64_t version); extern void spa_evict_all(void); extern vdev_t *spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t l2cache); extern boolean_t spa_has_spare(spa_t *, uint64_t guid); extern uint64_t dva_get_dsize_sync(spa_t *spa, const dva_t *dva); extern uint64_t bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp); extern uint64_t bp_get_dsize(spa_t *spa, const blkptr_t *bp); extern boolean_t spa_has_slogs(spa_t *spa); extern boolean_t spa_is_root(spa_t *spa); extern boolean_t spa_writeable(spa_t *spa); extern boolean_t spa_has_pending_synctask(spa_t *spa); extern int spa_maxblocksize(spa_t *spa); extern void zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp); extern int spa_mode(spa_t *spa); extern uint64_t zfs_strtonum(const char *str, char **nptr); extern char *spa_his_ievent_table[]; extern void spa_history_create_obj(spa_t *spa, dmu_tx_t *tx); extern int spa_history_get(spa_t *spa, uint64_t *offset, uint64_t *len_read, char *his_buf); extern int spa_history_log(spa_t *spa, const char *his_buf); extern int spa_history_log_nvl(spa_t *spa, nvlist_t *nvl); extern void spa_history_log_version(spa_t *spa, const char *operation); extern void spa_history_log_internal(spa_t *spa, const char *operation, dmu_tx_t *tx, const char *fmt, ...); extern void spa_history_log_internal_ds(struct dsl_dataset *ds, const char *op, dmu_tx_t *tx, const char *fmt, ...); extern void spa_history_log_internal_dd(dsl_dir_t *dd, const char *operation, dmu_tx_t *tx, const char *fmt, ...); /* error handling */ struct zbookmark_phys; extern void spa_log_error(spa_t *spa, zio_t *zio); extern void zfs_ereport_post(const char *cls, spa_t *spa, vdev_t *vd, zio_t *zio, uint64_t stateoroffset, uint64_t length); extern void zfs_post_remove(spa_t *spa, vdev_t *vd); extern void zfs_post_state_change(spa_t *spa, vdev_t *vd); extern void zfs_post_autoreplace(spa_t *spa, vdev_t *vd); extern uint64_t spa_get_errlog_size(spa_t *spa); extern int spa_get_errlog(spa_t *spa, void *uaddr, size_t *count); extern void spa_errlog_rotate(spa_t *spa); extern void spa_errlog_drain(spa_t *spa); extern void spa_errlog_sync(spa_t *spa, uint64_t txg); extern void spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub); /* vdev cache */ extern void vdev_cache_stat_init(void); extern 
void vdev_cache_stat_fini(void); /* Initialization and termination */ extern void spa_init(int flags); extern void spa_fini(void); extern void spa_boot_init(); /* properties */ extern int spa_prop_set(spa_t *spa, nvlist_t *nvp); extern int spa_prop_get(spa_t *spa, nvlist_t **nvp); extern void spa_prop_clear_bootfs(spa_t *spa, uint64_t obj, dmu_tx_t *tx); extern void spa_configfile_set(spa_t *, nvlist_t *, boolean_t); /* asynchronous event notification */ -extern void spa_event_notify(spa_t *spa, vdev_t *vdev, const char *name); +extern void spa_event_notify(spa_t *spa, vdev_t *vdev, nvlist_t *hist_nvl, + const char *name); #ifdef ZFS_DEBUG #define dprintf_bp(bp, fmt, ...) do { \ if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP); \ snprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, (bp)); \ dprintf(fmt " %s\n", __VA_ARGS__, __blkbuf); \ kmem_free(__blkbuf, BP_SPRINTF_LEN); \ } \ _NOTE(CONSTCOND) } while (0) #else #define dprintf_bp(bp, fmt, ...) #endif extern boolean_t spa_debug_enabled(spa_t *spa); #define spa_dbgmsg(spa, ...) \ { \ if (spa_debug_enabled(spa)) \ zfs_dbgmsg(__VA_ARGS__); \ } extern int spa_mode_global; /* mode, e.g. FREAD | FWRITE */ #ifdef __cplusplus } #endif #endif /* _SYS_SPA_H */ Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c (revision 329689) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c (revision 329690) @@ -1,3643 +1,3644 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2015 by Delphix. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. * Copyright 2013 Martin Matuska . All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Toomas Soome + * Copyright 2017 Joyent, Inc. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include SYSCTL_DECL(_vfs_zfs); SYSCTL_NODE(_vfs_zfs, OID_AUTO, vdev, CTLFLAG_RW, 0, "ZFS VDEV"); /* * Virtual device management. */ /* * The limit for ZFS to automatically increase a top-level vdev's ashift * from logical ashift to physical ashift. * * Example: one or more 512B emulation child vdevs * child->vdev_ashift = 9 (512 bytes) * child->vdev_physical_ashift = 12 (4096 bytes) * zfs_max_auto_ashift = 11 (2048 bytes) * zfs_min_auto_ashift = 9 (512 bytes) * * On pool creation or the addition of a new top-level vdev, ZFS will * increase the ashift of the top-level vdev to 2048 as limited by * zfs_max_auto_ashift. 
* * Example: one or more 512B emulation child vdevs * child->vdev_ashift = 9 (512 bytes) * child->vdev_physical_ashift = 12 (4096 bytes) * zfs_max_auto_ashift = 13 (8192 bytes) * zfs_min_auto_ashift = 9 (512 bytes) * * On pool creation or the addition of a new top-level vdev, ZFS will * increase the ashift of the top-level vdev to 4096 to match the * max vdev_physical_ashift. * * Example: one or more 512B emulation child vdevs * child->vdev_ashift = 9 (512 bytes) * child->vdev_physical_ashift = 9 (512 bytes) * zfs_max_auto_ashift = 13 (8192 bytes) * zfs_min_auto_ashift = 12 (4096 bytes) * * On pool creation or the addition of a new top-level vdev, ZFS will * increase the ashift of the top-level vdev to 4096 to match the * zfs_min_auto_ashift. */ static uint64_t zfs_max_auto_ashift = SPA_MAXASHIFT; static uint64_t zfs_min_auto_ashift = SPA_MINASHIFT; static int sysctl_vfs_zfs_max_auto_ashift(SYSCTL_HANDLER_ARGS) { uint64_t val; int err; val = zfs_max_auto_ashift; err = sysctl_handle_64(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (err); if (val > SPA_MAXASHIFT || val < zfs_min_auto_ashift) return (EINVAL); zfs_max_auto_ashift = val; return (0); } SYSCTL_PROC(_vfs_zfs, OID_AUTO, max_auto_ashift, CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t), sysctl_vfs_zfs_max_auto_ashift, "QU", "Max ashift used when optimising for logical -> physical sectors size on " "new top-level vdevs."); static int sysctl_vfs_zfs_min_auto_ashift(SYSCTL_HANDLER_ARGS) { uint64_t val; int err; val = zfs_min_auto_ashift; err = sysctl_handle_64(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (err); if (val < SPA_MINASHIFT || val > zfs_max_auto_ashift) return (EINVAL); zfs_min_auto_ashift = val; return (0); } SYSCTL_PROC(_vfs_zfs, OID_AUTO, min_auto_ashift, CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t), sysctl_vfs_zfs_min_auto_ashift, "QU", "Min ashift used when creating new top-level vdevs."); static vdev_ops_t *vdev_ops_table[] = { &vdev_root_ops, &vdev_raidz_ops, &vdev_mirror_ops, &vdev_replacing_ops, &vdev_spare_ops, #ifdef _KERNEL &vdev_geom_ops, #else &vdev_disk_ops, #endif &vdev_file_ops, &vdev_missing_ops, &vdev_hole_ops, NULL }; /* * When a vdev is added, it will be divided into approximately (but no * more than) this number of metaslabs. */ int metaslabs_per_vdev = 200; SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, metaslabs_per_vdev, CTLFLAG_RDTUN, &metaslabs_per_vdev, 0, "When a vdev is added, how many metaslabs the vdev should be divided into"); /* * Given a vdev type, return the appropriate ops vector. */ static vdev_ops_t * vdev_getops(const char *type) { vdev_ops_t *ops, **opspp; for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++) if (strcmp(ops->vdev_op_type, type) == 0) break; return (ops); } /* * Default asize function: return the MAX of psize with the asize of * all children. This is what's used by anything other than RAID-Z. */ uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize) { uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift); uint64_t csize; for (int c = 0; c < vd->vdev_children; c++) { csize = vdev_psize_to_asize(vd->vdev_child[c], psize); asize = MAX(asize, csize); } return (asize); } /* * Get the minimum allocatable size. We define the allocatable size as * the vdev's asize rounded to the nearest metaslab. This allows us to * replace or attach devices which don't have the same physical size but * can still satisfy the same number of allocations. 
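Editor's note: the three examples above all reduce to one clamp, applied later in this file by vdev_ashift_optimize(): raise the ashift toward the physical ashift, but no higher than zfs_max_auto_ashift and never below zfs_min_auto_ashift. A standalone sketch reproducing the examples (the helper is illustrative, not the kernel function):

#include <stdint.h>
#include <stdio.h>

#define	MY_MAX(a, b)	((a) > (b) ? (a) : (b))
#define	MY_MIN(a, b)	((a) < (b) ? (a) : (b))

/* Sketch of the clamp performed by vdev_ashift_optimize() (see below). */
static uint64_t
optimize_ashift(uint64_t ashift, uint64_t physical_ashift,
    uint64_t min_auto, uint64_t max_auto)
{
	if (ashift < physical_ashift) {
		return (MY_MIN(MY_MAX(max_auto, ashift),
		    MY_MAX(min_auto, physical_ashift)));
	}
	/* logical >= physical: only enforce the configured minimum. */
	return (MY_MAX(min_auto, ashift));
}

int
main(void)
{
	/* The three examples from the comment above. */
	printf("%llu %llu %llu\n",
	    (unsigned long long)optimize_ashift(9, 12, 9, 11),	/* -> 11 */
	    (unsigned long long)optimize_ashift(9, 12, 9, 13),	/* -> 12 */
	    (unsigned long long)optimize_ashift(9, 9, 12, 13));	/* -> 12 */
	return (0);
}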
*/ uint64_t vdev_get_min_asize(vdev_t *vd) { vdev_t *pvd = vd->vdev_parent; /* * If our parent is NULL (inactive spare or cache) or is the root, * just return our own asize. */ if (pvd == NULL) return (vd->vdev_asize); /* * The top-level vdev just returns the allocatable size rounded * to the nearest metaslab. */ if (vd == vd->vdev_top) return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift)); /* * The allocatable space for a raidz vdev is N * sizeof(smallest child), * so each child must provide at least 1/Nth of its asize. */ if (pvd->vdev_ops == &vdev_raidz_ops) return ((pvd->vdev_min_asize + pvd->vdev_children - 1) / pvd->vdev_children); return (pvd->vdev_min_asize); } void vdev_set_min_asize(vdev_t *vd) { vd->vdev_min_asize = vdev_get_min_asize(vd); for (int c = 0; c < vd->vdev_children; c++) vdev_set_min_asize(vd->vdev_child[c]); } vdev_t * vdev_lookup_top(spa_t *spa, uint64_t vdev) { vdev_t *rvd = spa->spa_root_vdev; ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); if (vdev < rvd->vdev_children) { ASSERT(rvd->vdev_child[vdev] != NULL); return (rvd->vdev_child[vdev]); } return (NULL); } vdev_t * vdev_lookup_by_guid(vdev_t *vd, uint64_t guid) { vdev_t *mvd; if (vd->vdev_guid == guid) return (vd); for (int c = 0; c < vd->vdev_children; c++) if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) != NULL) return (mvd); return (NULL); } static int vdev_count_leaves_impl(vdev_t *vd) { int n = 0; if (vd->vdev_ops->vdev_op_leaf) return (1); for (int c = 0; c < vd->vdev_children; c++) n += vdev_count_leaves_impl(vd->vdev_child[c]); return (n); } int vdev_count_leaves(spa_t *spa) { return (vdev_count_leaves_impl(spa->spa_root_vdev)); } void vdev_add_child(vdev_t *pvd, vdev_t *cvd) { size_t oldsize, newsize; uint64_t id = cvd->vdev_id; vdev_t **newchild; spa_t *spa = cvd->vdev_spa; ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); ASSERT(cvd->vdev_parent == NULL); cvd->vdev_parent = pvd; if (pvd == NULL) return; ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL); oldsize = pvd->vdev_children * sizeof (vdev_t *); pvd->vdev_children = MAX(pvd->vdev_children, id + 1); newsize = pvd->vdev_children * sizeof (vdev_t *); newchild = kmem_zalloc(newsize, KM_SLEEP); if (pvd->vdev_child != NULL) { bcopy(pvd->vdev_child, newchild, oldsize); kmem_free(pvd->vdev_child, oldsize); } pvd->vdev_child = newchild; pvd->vdev_child[id] = cvd; cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd); ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL); /* * Walk up all ancestors to update guid sum. */ for (; pvd != NULL; pvd = pvd->vdev_parent) pvd->vdev_guid_sum += cvd->vdev_guid_sum; } void vdev_remove_child(vdev_t *pvd, vdev_t *cvd) { int c; uint_t id = cvd->vdev_id; ASSERT(cvd->vdev_parent == pvd); if (pvd == NULL) return; ASSERT(id < pvd->vdev_children); ASSERT(pvd->vdev_child[id] == cvd); pvd->vdev_child[id] = NULL; cvd->vdev_parent = NULL; for (c = 0; c < pvd->vdev_children; c++) if (pvd->vdev_child[c]) break; if (c == pvd->vdev_children) { kmem_free(pvd->vdev_child, c * sizeof (vdev_t *)); pvd->vdev_child = NULL; pvd->vdev_children = 0; } /* * Walk up all ancestors to update guid sum. */ for (; pvd != NULL; pvd = pvd->vdev_parent) pvd->vdev_guid_sum -= cvd->vdev_guid_sum; } /* * Remove any holes in the child array. 
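Editor's note: vdev_add_child() and vdev_remove_child() keep vdev_guid_sum consistent by walking every ancestor and adding or subtracting the child's own guid sum, so each node's sum always covers itself plus all descendants. A toy illustration of that invariant; node_t is a simplified stand-in for vdev_t.

#include <stdint.h>
#include <stdio.h>

/* Toy node: guid_sum covers the node itself plus all of its descendants. */
typedef struct node {
	struct node *parent;
	uint64_t guid;
	uint64_t guid_sum;
} node_t;

static void
add_child(node_t *pvd, node_t *cvd)
{
	cvd->parent = pvd;
	/* Walk up all ancestors to update guid sum, as vdev_add_child does. */
	for (; pvd != NULL; pvd = pvd->parent)
		pvd->guid_sum += cvd->guid_sum;
}

int
main(void)
{
	node_t root = { NULL, 1, 1 };
	node_t mirror = { NULL, 10, 10 };
	node_t disk = { NULL, 100, 100 };

	add_child(&root, &mirror);
	add_child(&mirror, &disk);

	/* root covers 1 + 10 + 100 = 111, mirror covers 10 + 100 = 110. */
	printf("root=%llu mirror=%llu\n",
	    (unsigned long long)root.guid_sum,
	    (unsigned long long)mirror.guid_sum);
	return (0);
}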
*/ void vdev_compact_children(vdev_t *pvd) { vdev_t **newchild, *cvd; int oldc = pvd->vdev_children; int newc; ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); for (int c = newc = 0; c < oldc; c++) if (pvd->vdev_child[c]) newc++; newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP); for (int c = newc = 0; c < oldc; c++) { if ((cvd = pvd->vdev_child[c]) != NULL) { newchild[newc] = cvd; cvd->vdev_id = newc++; } } kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *)); pvd->vdev_child = newchild; pvd->vdev_children = newc; } /* * Allocate and minimally initialize a vdev_t. */ vdev_t * vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) { vdev_t *vd; vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP); if (spa->spa_root_vdev == NULL) { ASSERT(ops == &vdev_root_ops); spa->spa_root_vdev = vd; spa->spa_load_guid = spa_generate_guid(NULL); } if (guid == 0 && ops != &vdev_hole_ops) { if (spa->spa_root_vdev == vd) { /* * The root vdev's guid will also be the pool guid, * which must be unique among all pools. */ guid = spa_generate_guid(NULL); } else { /* * Any other vdev's guid must be unique within the pool. */ guid = spa_generate_guid(spa); } ASSERT(!spa_guid_exists(spa_guid(spa), guid)); } vd->vdev_spa = spa; vd->vdev_id = id; vd->vdev_guid = guid; vd->vdev_guid_sum = guid; vd->vdev_ops = ops; vd->vdev_state = VDEV_STATE_CLOSED; vd->vdev_ishole = (ops == &vdev_hole_ops); mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_queue_lock, NULL, MUTEX_DEFAULT, NULL); for (int t = 0; t < DTL_TYPES; t++) { vd->vdev_dtl[t] = range_tree_create(NULL, NULL, &vd->vdev_dtl_lock); } txg_list_create(&vd->vdev_ms_list, spa, offsetof(struct metaslab, ms_txg_node)); txg_list_create(&vd->vdev_dtl_list, spa, offsetof(struct vdev, vdev_dtl_node)); vd->vdev_stat.vs_timestamp = gethrtime(); vdev_queue_init(vd); vdev_cache_init(vd); return (vd); } /* * Allocate a new vdev. The 'alloctype' is used to control whether we are * creating a new vdev or loading an existing one - the behavior is slightly * different for each case. */ int vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, int alloctype) { vdev_ops_t *ops; char *type; uint64_t guid = 0, islog, nparity; vdev_t *vd; ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0) return (SET_ERROR(EINVAL)); if ((ops = vdev_getops(type)) == NULL) return (SET_ERROR(EINVAL)); /* * If this is a load, get the vdev guid from the nvlist. * Otherwise, vdev_alloc_common() will generate one for us. */ if (alloctype == VDEV_ALLOC_LOAD) { uint64_t label_id; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) || label_id != id) return (SET_ERROR(EINVAL)); if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (SET_ERROR(EINVAL)); } else if (alloctype == VDEV_ALLOC_SPARE) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (SET_ERROR(EINVAL)); } else if (alloctype == VDEV_ALLOC_L2CACHE) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (SET_ERROR(EINVAL)); } else if (alloctype == VDEV_ALLOC_ROOTPOOL) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (SET_ERROR(EINVAL)); } /* * The first allocated vdev must be of type 'root'. 
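Editor's note: vdev_compact_children() above removes the NULL holes in two passes: count the surviving children, allocate a right-sized array, then copy them across while handing out dense new ids. The same pattern over a plain pointer array, with malloc/free standing in for kmem_alloc/kmem_free:

#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	const char *old[] = { "a", NULL, "b", NULL, "c" };
	int oldc = 5, newc = 0;
	const char **newchild;

	/* Pass 1: count the non-NULL children. */
	for (int c = 0; c < oldc; c++)
		if (old[c] != NULL)
			newc++;

	newchild = malloc(newc * sizeof (*newchild));
	if (newchild == NULL)
		return (1);

	/* Pass 2: copy the survivors and assign new, dense indices. */
	for (int c = 0, n = 0; c < oldc; c++)
		if (old[c] != NULL)
			newchild[n++] = old[c];

	for (int n = 0; n < newc; n++)
		printf("child[%d] = %s\n", n, newchild[n]);
	free(newchild);
	return (0);
}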
*/ if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL) return (SET_ERROR(EINVAL)); /* * Determine whether we're a log vdev. */ islog = 0; (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog); if (islog && spa_version(spa) < SPA_VERSION_SLOGS) return (SET_ERROR(ENOTSUP)); if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES) return (SET_ERROR(ENOTSUP)); /* * Set the nparity property for RAID-Z vdevs. */ nparity = -1ULL; if (ops == &vdev_raidz_ops) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) { if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY) return (SET_ERROR(EINVAL)); /* * Previous versions could only support 1 or 2 parity * device. */ if (nparity > 1 && spa_version(spa) < SPA_VERSION_RAIDZ2) return (SET_ERROR(ENOTSUP)); if (nparity > 2 && spa_version(spa) < SPA_VERSION_RAIDZ3) return (SET_ERROR(ENOTSUP)); } else { /* * We require the parity to be specified for SPAs that * support multiple parity levels. */ if (spa_version(spa) >= SPA_VERSION_RAIDZ2) return (SET_ERROR(EINVAL)); /* * Otherwise, we default to 1 parity device for RAID-Z. */ nparity = 1; } } else { nparity = 0; } ASSERT(nparity != -1ULL); vd = vdev_alloc_common(spa, id, guid, ops); vd->vdev_islog = islog; vd->vdev_nparity = nparity; if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0) vd->vdev_path = spa_strdup(vd->vdev_path); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0) vd->vdev_devid = spa_strdup(vd->vdev_devid); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH, &vd->vdev_physpath) == 0) vd->vdev_physpath = spa_strdup(vd->vdev_physpath); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &vd->vdev_fru) == 0) vd->vdev_fru = spa_strdup(vd->vdev_fru); /* * Set the whole_disk property. If it's not specified, leave the value * as -1. */ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &vd->vdev_wholedisk) != 0) vd->vdev_wholedisk = -1ULL; /* * Look for the 'not present' flag. This will only be set if the device * was not present at the time of import. */ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, &vd->vdev_not_present); /* * Get the alignment requirement. */ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift); /* * Retrieve the vdev creation time. */ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, &vd->vdev_crtxg); /* * If we're a top-level vdev, try to load the allocation parameters. */ if (parent && !parent->vdev_parent && (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, &vd->vdev_ms_array); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT, &vd->vdev_ms_shift); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE, &vd->vdev_asize); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING, &vd->vdev_removing); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP, &vd->vdev_top_zap); } else { ASSERT0(vd->vdev_top_zap); } if (parent && !parent->vdev_parent && alloctype != VDEV_ALLOC_ATTACH) { ASSERT(alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_ADD || alloctype == VDEV_ALLOC_SPLIT || alloctype == VDEV_ALLOC_ROOTPOOL); vd->vdev_mg = metaslab_group_create(islog ? 
spa_log_class(spa) : spa_normal_class(spa), vd); } if (vd->vdev_ops->vdev_op_leaf && (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_LEAF_ZAP, &vd->vdev_leaf_zap); } else { ASSERT0(vd->vdev_leaf_zap); } /* * If we're a leaf vdev, try to load the DTL object and other state. */ if (vd->vdev_ops->vdev_op_leaf && (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE || alloctype == VDEV_ALLOC_ROOTPOOL)) { if (alloctype == VDEV_ALLOC_LOAD) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL, &vd->vdev_dtl_object); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE, &vd->vdev_unspare); } if (alloctype == VDEV_ALLOC_ROOTPOOL) { uint64_t spare = 0; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE, &spare) == 0 && spare) spa_spare_add(vd); } (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, &vd->vdev_offline); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG, &vd->vdev_resilver_txg); /* * When importing a pool, we want to ignore the persistent fault * state, as the diagnosis made on another system may not be * valid in the current context. Local vdevs will * remain in the faulted state. */ if (spa_load_state(spa) == SPA_LOAD_OPEN) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, &vd->vdev_faulted); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED, &vd->vdev_degraded); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, &vd->vdev_removed); if (vd->vdev_faulted || vd->vdev_degraded) { char *aux; vd->vdev_label_aux = VDEV_AUX_ERR_EXCEEDED; if (nvlist_lookup_string(nv, ZPOOL_CONFIG_AUX_STATE, &aux) == 0 && strcmp(aux, "external") == 0) vd->vdev_label_aux = VDEV_AUX_EXTERNAL; } } } /* * Add ourselves to the parent's list of children. */ vdev_add_child(parent, vd); *vdp = vd; return (0); } void vdev_free(vdev_t *vd) { spa_t *spa = vd->vdev_spa; /* * vdev_free() implies closing the vdev first. This is simpler than * trying to ensure complicated semantics for all callers. */ vdev_close(vd); ASSERT(!list_link_active(&vd->vdev_config_dirty_node)); ASSERT(!list_link_active(&vd->vdev_state_dirty_node)); /* * Free all children. */ for (int c = 0; c < vd->vdev_children; c++) vdev_free(vd->vdev_child[c]); ASSERT(vd->vdev_child == NULL); ASSERT(vd->vdev_guid_sum == vd->vdev_guid); /* * Discard allocation state. */ if (vd->vdev_mg != NULL) { vdev_metaslab_fini(vd); metaslab_group_destroy(vd->vdev_mg); } ASSERT0(vd->vdev_stat.vs_space); ASSERT0(vd->vdev_stat.vs_dspace); ASSERT0(vd->vdev_stat.vs_alloc); /* * Remove this vdev from its parent's child list. */ vdev_remove_child(vd->vdev_parent, vd); ASSERT(vd->vdev_parent == NULL); /* * Clean up vdev structure. 
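Editor's note: for RAID-Z, vdev_alloc() above accepts an explicit nparity of 1..VDEV_RAIDZ_MAXPARITY, rejects parity levels the pool version cannot support, and falls back to single parity only for pools that predate configurable parity. A compact restatement of that decision table, with pool capabilities passed as booleans instead of SPA_VERSION_* comparisons; the return codes are illustrative, not the kernel errnos.

#include <stdio.h>

#define	VDEV_RAIDZ_MAXPARITY	3
#define	EINVAL_RC		(-1)
#define	ENOTSUP_RC		(-2)

/* Sketch of the nparity checks in vdev_alloc() (capabilities as booleans). */
static int
check_nparity(int have_nparity, long long nparity, int supports_raidz2,
    int supports_raidz3)
{
	if (have_nparity) {
		if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
			return (EINVAL_RC);
		if (nparity > 1 && !supports_raidz2)
			return (ENOTSUP_RC);
		if (nparity > 2 && !supports_raidz3)
			return (ENOTSUP_RC);
		return ((int)nparity);
	}
	/* Parity must be explicit on pools that support multiple levels. */
	if (supports_raidz2)
		return (EINVAL_RC);
	return (1);	/* old pools default to single parity */
}

int
main(void)
{
	printf("%d %d %d\n",
	    check_nparity(1, 2, 1, 1),	/* raidz2 on a capable pool -> 2 */
	    check_nparity(1, 3, 1, 0),	/* raidz3 unsupported -> -2 */
	    check_nparity(0, 0, 0, 0));	/* legacy default -> 1 */
	return (0);
}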
*/ vdev_queue_fini(vd); vdev_cache_fini(vd); if (vd->vdev_path) spa_strfree(vd->vdev_path); if (vd->vdev_devid) spa_strfree(vd->vdev_devid); if (vd->vdev_physpath) spa_strfree(vd->vdev_physpath); if (vd->vdev_fru) spa_strfree(vd->vdev_fru); if (vd->vdev_isspare) spa_spare_remove(vd); if (vd->vdev_isl2cache) spa_l2cache_remove(vd); txg_list_destroy(&vd->vdev_ms_list); txg_list_destroy(&vd->vdev_dtl_list); mutex_enter(&vd->vdev_dtl_lock); space_map_close(vd->vdev_dtl_sm); for (int t = 0; t < DTL_TYPES; t++) { range_tree_vacate(vd->vdev_dtl[t], NULL, NULL); range_tree_destroy(vd->vdev_dtl[t]); } mutex_exit(&vd->vdev_dtl_lock); mutex_destroy(&vd->vdev_queue_lock); mutex_destroy(&vd->vdev_dtl_lock); mutex_destroy(&vd->vdev_stat_lock); mutex_destroy(&vd->vdev_probe_lock); if (vd == spa->spa_root_vdev) spa->spa_root_vdev = NULL; kmem_free(vd, sizeof (vdev_t)); } /* * Transfer top-level vdev state from svd to tvd. */ static void vdev_top_transfer(vdev_t *svd, vdev_t *tvd) { spa_t *spa = svd->vdev_spa; metaslab_t *msp; vdev_t *vd; int t; ASSERT(tvd == tvd->vdev_top); tvd->vdev_ms_array = svd->vdev_ms_array; tvd->vdev_ms_shift = svd->vdev_ms_shift; tvd->vdev_ms_count = svd->vdev_ms_count; tvd->vdev_top_zap = svd->vdev_top_zap; svd->vdev_ms_array = 0; svd->vdev_ms_shift = 0; svd->vdev_ms_count = 0; svd->vdev_top_zap = 0; if (tvd->vdev_mg) ASSERT3P(tvd->vdev_mg, ==, svd->vdev_mg); tvd->vdev_mg = svd->vdev_mg; tvd->vdev_ms = svd->vdev_ms; svd->vdev_mg = NULL; svd->vdev_ms = NULL; if (tvd->vdev_mg != NULL) tvd->vdev_mg->mg_vd = tvd; tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc; tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space; tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace; svd->vdev_stat.vs_alloc = 0; svd->vdev_stat.vs_space = 0; svd->vdev_stat.vs_dspace = 0; for (t = 0; t < TXG_SIZE; t++) { while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL) (void) txg_list_add(&tvd->vdev_ms_list, msp, t); while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL) (void) txg_list_add(&tvd->vdev_dtl_list, vd, t); if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t)) (void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t); } if (list_link_active(&svd->vdev_config_dirty_node)) { vdev_config_clean(svd); vdev_config_dirty(tvd); } if (list_link_active(&svd->vdev_state_dirty_node)) { vdev_state_clean(svd); vdev_state_dirty(tvd); } tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio; svd->vdev_deflate_ratio = 0; tvd->vdev_islog = svd->vdev_islog; svd->vdev_islog = 0; } static void vdev_top_update(vdev_t *tvd, vdev_t *vd) { if (vd == NULL) return; vd->vdev_top = tvd; for (int c = 0; c < vd->vdev_children; c++) vdev_top_update(tvd, vd->vdev_child[c]); } /* * Add a mirror/replacing vdev above an existing vdev. 
*/ vdev_t * vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops) { spa_t *spa = cvd->vdev_spa; vdev_t *pvd = cvd->vdev_parent; vdev_t *mvd; ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops); mvd->vdev_asize = cvd->vdev_asize; mvd->vdev_min_asize = cvd->vdev_min_asize; mvd->vdev_max_asize = cvd->vdev_max_asize; mvd->vdev_ashift = cvd->vdev_ashift; mvd->vdev_logical_ashift = cvd->vdev_logical_ashift; mvd->vdev_physical_ashift = cvd->vdev_physical_ashift; mvd->vdev_state = cvd->vdev_state; mvd->vdev_crtxg = cvd->vdev_crtxg; vdev_remove_child(pvd, cvd); vdev_add_child(pvd, mvd); cvd->vdev_id = mvd->vdev_children; vdev_add_child(mvd, cvd); vdev_top_update(cvd->vdev_top, cvd->vdev_top); if (mvd == mvd->vdev_top) vdev_top_transfer(cvd, mvd); return (mvd); } /* * Remove a 1-way mirror/replacing vdev from the tree. */ void vdev_remove_parent(vdev_t *cvd) { vdev_t *mvd = cvd->vdev_parent; vdev_t *pvd = mvd->vdev_parent; ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); ASSERT(mvd->vdev_children == 1); ASSERT(mvd->vdev_ops == &vdev_mirror_ops || mvd->vdev_ops == &vdev_replacing_ops || mvd->vdev_ops == &vdev_spare_ops); cvd->vdev_ashift = mvd->vdev_ashift; cvd->vdev_logical_ashift = mvd->vdev_logical_ashift; cvd->vdev_physical_ashift = mvd->vdev_physical_ashift; vdev_remove_child(mvd, cvd); vdev_remove_child(pvd, mvd); /* * If cvd will replace mvd as a top-level vdev, preserve mvd's guid. * Otherwise, we could have detached an offline device, and when we * go to import the pool we'll think we have two top-level vdevs, * instead of a different version of the same top-level vdev. */ if (mvd->vdev_top == mvd) { uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid; cvd->vdev_orig_guid = cvd->vdev_guid; cvd->vdev_guid += guid_delta; cvd->vdev_guid_sum += guid_delta; } cvd->vdev_id = mvd->vdev_id; vdev_add_child(pvd, cvd); vdev_top_update(cvd->vdev_top, cvd->vdev_top); if (cvd == cvd->vdev_top) vdev_top_transfer(mvd, cvd); ASSERT(mvd->vdev_children == 0); vdev_free(mvd); } int vdev_metaslab_init(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; uint64_t m; uint64_t oldc = vd->vdev_ms_count; uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift; metaslab_t **mspp; int error; ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER)); /* * This vdev is not being allocated from yet or is a hole. */ if (vd->vdev_ms_shift == 0) return (0); ASSERT(!vd->vdev_ishole); /* * Compute the raidz-deflation ratio. Note, we hard-code * in 128k (1 << 17) because it is the "typical" blocksize. * Even though SPA_MAXBLOCKSIZE changed, this algorithm can not change, * otherwise it would inconsistently account for existing bp's. 
*/ vd->vdev_deflate_ratio = (1 << 17) / (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT); ASSERT(oldc <= newc); mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP); if (oldc != 0) { bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp)); kmem_free(vd->vdev_ms, oldc * sizeof (*mspp)); } vd->vdev_ms = mspp; vd->vdev_ms_count = newc; for (m = oldc; m < newc; m++) { uint64_t object = 0; if (txg == 0) { error = dmu_read(mos, vd->vdev_ms_array, m * sizeof (uint64_t), sizeof (uint64_t), &object, DMU_READ_PREFETCH); if (error) return (error); } error = metaslab_init(vd->vdev_mg, m, object, txg, &(vd->vdev_ms[m])); if (error) return (error); } if (txg == 0) spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER); /* * If the vdev is being removed we don't activate * the metaslabs since we want to ensure that no new * allocations are performed on this device. */ if (oldc == 0 && !vd->vdev_removing) metaslab_group_activate(vd->vdev_mg); if (txg == 0) spa_config_exit(spa, SCL_ALLOC, FTAG); return (0); } void vdev_metaslab_fini(vdev_t *vd) { uint64_t m; uint64_t count = vd->vdev_ms_count; if (vd->vdev_ms != NULL) { metaslab_group_passivate(vd->vdev_mg); for (m = 0; m < count; m++) { metaslab_t *msp = vd->vdev_ms[m]; if (msp != NULL) metaslab_fini(msp); } kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *)); vd->vdev_ms = NULL; } } typedef struct vdev_probe_stats { boolean_t vps_readable; boolean_t vps_writeable; int vps_flags; } vdev_probe_stats_t; static void vdev_probe_done(zio_t *zio) { spa_t *spa = zio->io_spa; vdev_t *vd = zio->io_vd; vdev_probe_stats_t *vps = zio->io_private; ASSERT(vd->vdev_probe_zio != NULL); if (zio->io_type == ZIO_TYPE_READ) { if (zio->io_error == 0) vps->vps_readable = 1; if (zio->io_error == 0 && spa_writeable(spa)) { zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd, zio->io_offset, zio->io_size, zio->io_abd, ZIO_CHECKSUM_OFF, vdev_probe_done, vps, ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE)); } else { abd_free(zio->io_abd); } } else if (zio->io_type == ZIO_TYPE_WRITE) { if (zio->io_error == 0) vps->vps_writeable = 1; abd_free(zio->io_abd); } else if (zio->io_type == ZIO_TYPE_NULL) { zio_t *pio; vd->vdev_cant_read |= !vps->vps_readable; vd->vdev_cant_write |= !vps->vps_writeable; if (vdev_readable(vd) && (vdev_writeable(vd) || !spa_writeable(spa))) { zio->io_error = 0; } else { ASSERT(zio->io_error != 0); zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE, spa, vd, NULL, 0, 0); zio->io_error = SET_ERROR(ENXIO); } mutex_enter(&vd->vdev_probe_lock); ASSERT(vd->vdev_probe_zio == zio); vd->vdev_probe_zio = NULL; mutex_exit(&vd->vdev_probe_lock); zio_link_t *zl = NULL; while ((pio = zio_walk_parents(zio, &zl)) != NULL) if (!vdev_accessible(vd, pio)) pio->io_error = SET_ERROR(ENXIO); kmem_free(vps, sizeof (*vps)); } } /* * Determine whether this device is accessible. * * Read and write to several known locations: the pad regions of each * vdev label but the first, which we leave alone in case it contains * a VTOC. */ zio_t * vdev_probe(vdev_t *vd, zio_t *zio) { spa_t *spa = vd->vdev_spa; vdev_probe_stats_t *vps = NULL; zio_t *pio; ASSERT(vd->vdev_ops->vdev_op_leaf); /* * Don't probe the probe. */ if (zio && (zio->io_flags & ZIO_FLAG_PROBE)) return (NULL); /* * To prevent 'probe storms' when a device fails, we create * just one probe i/o at a time. All zios that want to probe * this vdev will become parents of the probe io. 
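Editor's note: the deflate ratio set at the top of vdev_metaslab_init() divides the "typical" 128K block by its allocated size expressed in SPA_MINBLOCKSHIFT units, so parity overhead lowers the ratio. A rough worked example, assuming 512-byte sectors and approximating single-parity RAID-Z allocation as ndisks/(ndisks-1) times the data size; the real vdev_psize_to_asize() also accounts for ashift rounding and skip sectors, which this ignores.

#include <stdint.h>
#include <stdio.h>

#define	SECT_SHIFT	9	/* assumed SPA_MINBLOCKSHIFT */

/* Very rough stand-in for vdev_psize_to_asize() on single-parity RAID-Z. */
static uint64_t
rough_raidz1_asize(uint64_t psize, int ndisks)
{
	return (psize * ndisks / (ndisks - 1));
}

int
main(void)
{
	uint64_t psize = 1ULL << 17;	/* the "typical" 128K block */
	int ndisks = 5;
	uint64_t asize = rough_raidz1_asize(psize, ndisks);

	/* Same shape as: (1 << 17) / (asize >> SPA_MINBLOCKSHIFT). */
	uint64_t deflate_ratio = psize / (asize >> SECT_SHIFT);

	printf("asize=%llu deflate_ratio=%llu\n",
	    (unsigned long long)asize,
	    (unsigned long long)deflate_ratio);
	return (0);
}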
*/ mutex_enter(&vd->vdev_probe_lock); if ((pio = vd->vdev_probe_zio) == NULL) { vps = kmem_zalloc(sizeof (*vps), KM_SLEEP); vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE | ZIO_FLAG_TRYHARD; if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) { /* * vdev_cant_read and vdev_cant_write can only * transition from TRUE to FALSE when we have the * SCL_ZIO lock as writer; otherwise they can only * transition from FALSE to TRUE. This ensures that * any zio looking at these values can assume that * failures persist for the life of the I/O. That's * important because when a device has intermittent * connectivity problems, we want to ensure that * they're ascribed to the device (ENXIO) and not * the zio (EIO). * * Since we hold SCL_ZIO as writer here, clear both * values so the probe can reevaluate from first * principles. */ vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER; vd->vdev_cant_read = B_FALSE; vd->vdev_cant_write = B_FALSE; } vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd, vdev_probe_done, vps, vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE); /* * We can't change the vdev state in this context, so we * kick off an async task to do it on our behalf. */ if (zio != NULL) { vd->vdev_probe_wanted = B_TRUE; spa_async_request(spa, SPA_ASYNC_PROBE); } } if (zio != NULL) zio_add_child(zio, pio); mutex_exit(&vd->vdev_probe_lock); if (vps == NULL) { ASSERT(zio != NULL); return (NULL); } for (int l = 1; l < VDEV_LABELS; l++) { zio_nowait(zio_read_phys(pio, vd, vdev_label_offset(vd->vdev_psize, l, offsetof(vdev_label_t, vl_pad2)), VDEV_PAD_SIZE, abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE), ZIO_CHECKSUM_OFF, vdev_probe_done, vps, ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE)); } if (zio == NULL) return (pio); zio_nowait(pio); return (NULL); } static void vdev_open_child(void *arg) { vdev_t *vd = arg; vd->vdev_open_thread = curthread; vd->vdev_open_error = vdev_open(vd); vd->vdev_open_thread = NULL; } boolean_t vdev_uses_zvols(vdev_t *vd) { if (vd->vdev_path && strncmp(vd->vdev_path, ZVOL_DIR, strlen(ZVOL_DIR)) == 0) return (B_TRUE); for (int c = 0; c < vd->vdev_children; c++) if (vdev_uses_zvols(vd->vdev_child[c])) return (B_TRUE); return (B_FALSE); } void vdev_open_children(vdev_t *vd) { taskq_t *tq; int children = vd->vdev_children; /* * in order to handle pools on top of zvols, do the opens * in a single thread so that the same thread holds the * spa_namespace_lock */ if (B_TRUE || vdev_uses_zvols(vd)) { for (int c = 0; c < children; c++) vd->vdev_child[c]->vdev_open_error = vdev_open(vd->vdev_child[c]); return; } tq = taskq_create("vdev_open", children, minclsyspri, children, children, TASKQ_PREPOPULATE); for (int c = 0; c < children; c++) VERIFY(taskq_dispatch(tq, vdev_open_child, vd->vdev_child[c], TQ_SLEEP) != 0); taskq_destroy(tq); } /* * Prepare a virtual device for access. */ int vdev_open(vdev_t *vd) { spa_t *spa = vd->vdev_spa; int error; uint64_t osize = 0; uint64_t max_osize = 0; uint64_t asize, max_asize, psize; uint64_t logical_ashift = 0; uint64_t physical_ashift = 0; ASSERT(vd->vdev_open_thread == curthread || spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); ASSERT(vd->vdev_state == VDEV_STATE_CLOSED || vd->vdev_state == VDEV_STATE_CANT_OPEN || vd->vdev_state == VDEV_STATE_OFFLINE); vd->vdev_stat.vs_aux = VDEV_AUX_NONE; vd->vdev_cant_read = B_FALSE; vd->vdev_cant_write = B_FALSE; vd->vdev_notrim = B_FALSE; vd->vdev_min_asize = vdev_get_min_asize(vd); /* * If this vdev is not removed, check its fault status. 
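Editor's note: vdev_probe() avoids "probe storms" by keeping at most one probe zio per vdev: under vdev_probe_lock the first caller creates it, and later callers merely attach themselves as parents of the existing probe. A pthread sketch of that create-once-under-a-lock shape; the shared work item is a plain flag, not a zio.

#include <pthread.h>
#include <stdio.h>

/* Shared, lazily created work item guarded by a mutex. */
static pthread_mutex_t probe_lock = PTHREAD_MUTEX_INITIALIZER;
static int probe_exists = 0;
static int probe_parents = 0;

/* First caller creates the probe; later callers just attach to it. */
static void
request_probe(void)
{
	pthread_mutex_lock(&probe_lock);
	if (!probe_exists) {
		probe_exists = 1;
		printf("created the one outstanding probe\n");
	}
	probe_parents++;	/* analogous to zio_add_child() on the probe */
	pthread_mutex_unlock(&probe_lock);
}

int
main(void)
{
	request_probe();
	request_probe();	/* reuses the existing probe */
	printf("parents attached = %d\n", probe_parents);
	return (0);
}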
If it's * faulted, bail out of the open. */ if (!vd->vdev_removed && vd->vdev_faulted) { ASSERT(vd->vdev_children == 0); ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED || vd->vdev_label_aux == VDEV_AUX_EXTERNAL); vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, vd->vdev_label_aux); return (SET_ERROR(ENXIO)); } else if (vd->vdev_offline) { ASSERT(vd->vdev_children == 0); vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE); return (SET_ERROR(ENXIO)); } error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize, &logical_ashift, &physical_ashift); /* * Reset the vdev_reopening flag so that we actually close * the vdev on error. */ vd->vdev_reopening = B_FALSE; if (zio_injection_enabled && error == 0) error = zio_handle_device_injection(vd, NULL, ENXIO); if (error) { if (vd->vdev_removed && vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED) vd->vdev_removed = B_FALSE; vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, vd->vdev_stat.vs_aux); return (error); } vd->vdev_removed = B_FALSE; /* * Recheck the faulted flag now that we have confirmed that * the vdev is accessible. If we're faulted, bail. */ if (vd->vdev_faulted) { ASSERT(vd->vdev_children == 0); ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED || vd->vdev_label_aux == VDEV_AUX_EXTERNAL); vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, vd->vdev_label_aux); return (SET_ERROR(ENXIO)); } if (vd->vdev_degraded) { ASSERT(vd->vdev_children == 0); vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, VDEV_AUX_ERR_EXCEEDED); } else { vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0); } /* * For hole or missing vdevs we just return success. */ if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) return (0); if (zfs_trim_enabled && !vd->vdev_notrim && vd->vdev_ops->vdev_op_leaf) trim_map_create(vd); for (int c = 0; c < vd->vdev_children; c++) { if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) { vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); break; } } osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t)); max_osize = P2ALIGN(max_osize, (uint64_t)sizeof (vdev_label_t)); if (vd->vdev_children == 0) { if (osize < SPA_MINDEVSIZE) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_TOO_SMALL); return (SET_ERROR(EOVERFLOW)); } psize = osize; asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE); max_asize = max_osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE); } else { if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_TOO_SMALL); return (SET_ERROR(EOVERFLOW)); } psize = 0; asize = osize; max_asize = max_osize; } vd->vdev_psize = psize; /* * Make sure the allocatable size hasn't shrunk too much. */ if (asize < vd->vdev_min_asize) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_LABEL); return (SET_ERROR(EINVAL)); } vd->vdev_physical_ashift = MAX(physical_ashift, vd->vdev_physical_ashift); vd->vdev_logical_ashift = MAX(logical_ashift, vd->vdev_logical_ashift); vd->vdev_ashift = MAX(vd->vdev_logical_ashift, vd->vdev_ashift); if (vd->vdev_logical_ashift > SPA_MAXASHIFT) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_ASHIFT_TOO_BIG); return (EINVAL); } if (vd->vdev_asize == 0) { /* * This is the first-ever open, so use the computed values. * For testing purposes, a higher ashift can be requested. */ vd->vdev_asize = asize; vd->vdev_max_asize = max_asize; } else { /* * Make sure the alignment requirement hasn't increased. 
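Editor's note: for a leaf, vdev_open() aligns the reported size down to a label multiple with P2ALIGN and subtracts the front and back label reservations to get the allocatable asize. A small sketch of that arithmetic; the label sizes below are placeholders, the real VDEV_LABEL_START_SIZE/VDEV_LABEL_END_SIZE come from the vdev label layout, not from this file.

#include <stdint.h>
#include <stdio.h>

/* Round x down to a multiple of align (a power of two), like P2ALIGN. */
#define	P2ALIGN_DOWN(x, align)	((x) & -(align))

int
main(void)
{
	/* Placeholder sizes chosen only for illustration. */
	const uint64_t label_size = 256ULL * 1024;
	const uint64_t label_start = 4ULL * 1024 * 1024;
	const uint64_t label_end = 512ULL * 1024;
	uint64_t osize = 100ULL * 1024 * 1024 + 12345;	/* raw device size */

	osize = P2ALIGN_DOWN(osize, label_size);
	printf("aligned osize = %llu\n", (unsigned long long)osize);
	printf("asize = %llu\n",
	    (unsigned long long)(osize - (label_start + label_end)));
	return (0);
}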
*/ if (vd->vdev_ashift > vd->vdev_top->vdev_ashift && vd->vdev_ops->vdev_op_leaf) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_LABEL); return (EINVAL); } vd->vdev_max_asize = max_asize; } /* * If all children are healthy we update asize if either: * The asize has increased, due to a device expansion caused by dynamic * LUN growth or vdev replacement, and automatic expansion is enabled; * making the additional space available. * * The asize has decreased, due to a device shrink usually caused by a * vdev replace with a smaller device. This ensures that calculations * based of max_asize and asize e.g. esize are always valid. It's safe * to do this as we've already validated that asize is greater than * vdev_min_asize. */ if (vd->vdev_state == VDEV_STATE_HEALTHY && ((asize > vd->vdev_asize && (vd->vdev_expanding || spa->spa_autoexpand)) || (asize < vd->vdev_asize))) vd->vdev_asize = asize; vdev_set_min_asize(vd); /* * Ensure we can issue some IO before declaring the * vdev open for business. */ if (vd->vdev_ops->vdev_op_leaf && (error = zio_wait(vdev_probe(vd, NULL))) != 0) { vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, VDEV_AUX_ERR_EXCEEDED); return (error); } /* * Track the min and max ashift values for normal data devices. */ if (vd->vdev_top == vd && vd->vdev_ashift != 0 && !vd->vdev_islog && vd->vdev_aux == NULL) { if (vd->vdev_ashift > spa->spa_max_ashift) spa->spa_max_ashift = vd->vdev_ashift; if (vd->vdev_ashift < spa->spa_min_ashift) spa->spa_min_ashift = vd->vdev_ashift; } /* * If a leaf vdev has a DTL, and seems healthy, then kick off a * resilver. But don't do this if we are doing a reopen for a scrub, * since this would just restart the scrub we are already doing. */ if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen && vdev_resilver_needed(vd, NULL, NULL)) spa_async_request(spa, SPA_ASYNC_RESILVER); return (0); } /* * Called once the vdevs are all opened, this routine validates the label * contents. This needs to be done before vdev_load() so that we don't * inadvertently do repair I/Os to the wrong device. * * If 'strict' is false ignore the spa guid check. This is necessary because * if the machine crashed during a re-guid the new guid might have been written * to all of the vdev labels, but not the cached config. The strict check * will be performed when the pool is opened again using the mos config. * * This function will only return failure if one of the vdevs indicates that it * has since been destroyed or exported. This is only possible if * /etc/zfs/zpool.cache was readonly at the time. Otherwise, the vdev state * will be updated but the function will return 0. */ int vdev_validate(vdev_t *vd, boolean_t strict) { spa_t *spa = vd->vdev_spa; nvlist_t *label; uint64_t guid = 0, top_guid; uint64_t state; for (int c = 0; c < vd->vdev_children; c++) if (vdev_validate(vd->vdev_child[c], strict) != 0) return (SET_ERROR(EBADF)); /* * If the device has already failed, or was marked offline, don't do * any further validation. Otherwise, label I/O will fail and we will * overwrite the previous state. */ if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) { uint64_t aux_guid = 0; nvlist_t *nvl; uint64_t txg = spa_last_synced_txg(spa) != 0 ? spa_last_synced_txg(spa) : -1ULL; if ((label = vdev_label_read_config(vd, txg)) == NULL) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_LABEL); return (0); } /* * Determine if this vdev has been split off into another * pool. If so, then refuse to open it. 
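Editor's note: the asize-update comment above boils down to one predicate: update asize only when all children are healthy and either the device grew and expansion is permitted (vdev_expanding or the pool's autoexpand property) or it shrank. Restated as a tiny helper for illustration:

#include <stdio.h>

/* Mirrors the condition guarding the vdev_asize update in vdev_open(). */
static int
should_update_asize(int healthy, unsigned long long asize,
    unsigned long long cur_asize, int expanding, int autoexpand)
{
	return (healthy &&
	    ((asize > cur_asize && (expanding || autoexpand)) ||
	    (asize < cur_asize)));
}

int
main(void)
{
	/* Grew but expansion disabled: keep the old asize. */
	printf("%d\n", should_update_asize(1, 200, 100, 0, 0));
	/* Grew with autoexpand on: take the new asize. */
	printf("%d\n", should_update_asize(1, 200, 100, 0, 1));
	/* Shrank: always record the smaller size. */
	printf("%d\n", should_update_asize(1, 50, 100, 0, 0));
	return (0);
}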
*/ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID, &aux_guid) == 0 && aux_guid == spa_guid(spa)) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_SPLIT_POOL); nvlist_free(label); return (0); } if (strict && (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &guid) != 0 || guid != spa_guid(spa))) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); return (0); } if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID, &aux_guid) != 0) aux_guid = 0; /* * If this vdev just became a top-level vdev because its * sibling was detached, it will have adopted the parent's * vdev guid -- but the label may or may not be on disk yet. * Fortunately, either version of the label will have the * same top guid, so if we're a top-level vdev, we can * safely compare to that instead. * * If we split this vdev off instead, then we also check the * original pool's guid. We don't want to consider the vdev * corrupt if it is partway through a split operation. */ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 || nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID, &top_guid) != 0 || ((vd->vdev_guid != guid && vd->vdev_guid != aux_guid) && (vd->vdev_guid != top_guid || vd != vd->vdev_top))) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); return (0); } if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); return (0); } nvlist_free(label); /* * If this is a verbatim import, no need to check the * state of the pool. */ if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) && spa_load_state(spa) == SPA_LOAD_OPEN && state != POOL_STATE_ACTIVE) return (SET_ERROR(EBADF)); /* * If we were able to open and validate a vdev that was * previously marked permanently unavailable, clear that state * now. */ if (vd->vdev_not_present) vd->vdev_not_present = 0; } return (0); } /* * Close a virtual device. */ void vdev_close(vdev_t *vd) { spa_t *spa = vd->vdev_spa; vdev_t *pvd = vd->vdev_parent; ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); /* * If our parent is reopening, then we are as well, unless we are * going offline. */ if (pvd != NULL && pvd->vdev_reopening) vd->vdev_reopening = (pvd->vdev_reopening && !vd->vdev_offline); vd->vdev_ops->vdev_op_close(vd); vdev_cache_purge(vd); if (vd->vdev_ops->vdev_op_leaf) trim_map_destroy(vd); /* * We record the previous state before we close it, so that if we are * doing a reopen(), we don't generate FMA ereports if we notice that * it's still faulted. */ vd->vdev_prevstate = vd->vdev_state; if (vd->vdev_offline) vd->vdev_state = VDEV_STATE_OFFLINE; else vd->vdev_state = VDEV_STATE_CLOSED; vd->vdev_stat.vs_aux = VDEV_AUX_NONE; } void vdev_hold(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_is_root(spa)); if (spa->spa_state == POOL_STATE_UNINITIALIZED) return; for (int c = 0; c < vd->vdev_children; c++) vdev_hold(vd->vdev_child[c]); if (vd->vdev_ops->vdev_op_leaf) vd->vdev_ops->vdev_op_hold(vd); } void vdev_rele(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_is_root(spa)); for (int c = 0; c < vd->vdev_children; c++) vdev_rele(vd->vdev_child[c]); if (vd->vdev_ops->vdev_op_leaf) vd->vdev_ops->vdev_op_rele(vd); } /* * Reopen all interior vdevs and any unopened leaves. 
We don't actually * reopen leaf vdevs which had previously been opened as they might deadlock * on the spa_config_lock. Instead we only obtain the leaf's physical size. * If the leaf has never been opened then open it, as usual. */ void vdev_reopen(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); /* set the reopening flag unless we're taking the vdev offline */ vd->vdev_reopening = !vd->vdev_offline; vdev_close(vd); (void) vdev_open(vd); /* * Call vdev_validate() here to make sure we have the same device. * Otherwise, a device with an invalid label could be successfully * opened in response to vdev_reopen(). */ if (vd->vdev_aux) { (void) vdev_validate_aux(vd); if (vdev_readable(vd) && vdev_writeable(vd) && vd->vdev_aux == &spa->spa_l2cache && !l2arc_vdev_present(vd)) l2arc_add_vdev(spa, vd); } else { (void) vdev_validate(vd, B_TRUE); } /* * Reassess parent vdev's health. */ vdev_propagate_state(vd); } int vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing) { int error; /* * Normally, partial opens (e.g. of a mirror) are allowed. * For a create, however, we want to fail the request if * there are any components we can't open. */ error = vdev_open(vd); if (error || vd->vdev_state != VDEV_STATE_HEALTHY) { vdev_close(vd); return (error ? error : ENXIO); } /* * Recursively load DTLs and initialize all labels. */ if ((error = vdev_dtl_load(vd)) != 0 || (error = vdev_label_init(vd, txg, isreplacing ? VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) { vdev_close(vd); return (error); } return (0); } void vdev_metaslab_set_size(vdev_t *vd) { /* * Aim for roughly metaslabs_per_vdev (default 200) metaslabs per vdev. */ vd->vdev_ms_shift = highbit64(vd->vdev_asize / metaslabs_per_vdev); vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT); } /* * Maximize performance by inflating the configured ashift for top level * vdevs to be as close to the physical ashift as possible while maintaining * administrator defined limits and ensuring it doesn't go below the * logical ashift. */ void vdev_ashift_optimize(vdev_t *vd) { if (vd == vd->vdev_top) { if (vd->vdev_ashift < vd->vdev_physical_ashift) { vd->vdev_ashift = MIN( MAX(zfs_max_auto_ashift, vd->vdev_ashift), MAX(zfs_min_auto_ashift, vd->vdev_physical_ashift)); } else { /* * Unusual case where logical ashift > physical ashift * so we can't cap the calculated ashift based on max * ashift as that would cause failures. * We still check if we need to increase it to match * the min ashift. */ vd->vdev_ashift = MAX(zfs_min_auto_ashift, vd->vdev_ashift); } } } void vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg) { ASSERT(vd == vd->vdev_top); ASSERT(!vd->vdev_ishole); ASSERT(ISP2(flags)); ASSERT(spa_writeable(vd->vdev_spa)); if (flags & VDD_METASLAB) (void) txg_list_add(&vd->vdev_ms_list, arg, txg); if (flags & VDD_DTL) (void) txg_list_add(&vd->vdev_dtl_list, arg, txg); (void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg); } void vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg) { for (int c = 0; c < vd->vdev_children; c++) vdev_dirty_leaves(vd->vdev_child[c], flags, txg); if (vd->vdev_ops->vdev_op_leaf) vdev_dirty(vd->vdev_top, flags, vd, txg); } /* * DTLs. * * A vdev's DTL (dirty time log) is the set of transaction groups for which * the vdev has less than perfect replication. 
There are four kinds of DTL: * * DTL_MISSING: txgs for which the vdev has no valid copies of the data * * DTL_PARTIAL: txgs for which data is available, but not fully replicated * * DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon * scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of * txgs that was scrubbed. * * DTL_OUTAGE: txgs which cannot currently be read, whether due to * persistent errors or just some device being offline. * Unlike the other three, the DTL_OUTAGE map is not generally * maintained; it's only computed when needed, typically to * determine whether a device can be detached. * * For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device * either has the data or it doesn't. * * For interior vdevs such as mirror and RAID-Z the picture is more complex. * A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because * if any child is less than fully replicated, then so is its parent. * A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs, * comprising only those txgs which appear in 'maxfaults' or more children; * those are the txgs we don't have enough replication to read. For example, * double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2); * thus, its DTL_MISSING consists of the set of txgs that appear in more than * two child DTL_MISSING maps. * * It should be clear from the above that to compute the DTLs and outage maps * for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps. * Therefore, that is all we keep on disk. When loading the pool, or after * a configuration change, we generate all other DTLs from first principles. */ void vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) { range_tree_t *rt = vd->vdev_dtl[t]; ASSERT(t < DTL_TYPES); ASSERT(vd != vd->vdev_spa->spa_root_vdev); ASSERT(spa_writeable(vd->vdev_spa)); mutex_enter(rt->rt_lock); if (!range_tree_contains(rt, txg, size)) range_tree_add(rt, txg, size); mutex_exit(rt->rt_lock); } boolean_t vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) { range_tree_t *rt = vd->vdev_dtl[t]; boolean_t dirty = B_FALSE; ASSERT(t < DTL_TYPES); ASSERT(vd != vd->vdev_spa->spa_root_vdev); mutex_enter(rt->rt_lock); if (range_tree_space(rt) != 0) dirty = range_tree_contains(rt, txg, size); mutex_exit(rt->rt_lock); return (dirty); } boolean_t vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t) { range_tree_t *rt = vd->vdev_dtl[t]; boolean_t empty; mutex_enter(rt->rt_lock); empty = (range_tree_space(rt) == 0); mutex_exit(rt->rt_lock); return (empty); } /* * Returns the lowest txg in the DTL range. */ static uint64_t vdev_dtl_min(vdev_t *vd) { range_seg_t *rs; ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock)); ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0); ASSERT0(vd->vdev_children); rs = avl_first(&vd->vdev_dtl[DTL_MISSING]->rt_root); return (rs->rs_start - 1); } /* * Returns the highest txg in the DTL. */ static uint64_t vdev_dtl_max(vdev_t *vd) { range_seg_t *rs; ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock)); ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0); ASSERT0(vd->vdev_children); rs = avl_last(&vd->vdev_dtl[DTL_MISSING]->rt_root); return (rs->rs_end); } /* * Determine if a resilvering vdev should remove any DTL entries from * its range. If the vdev was resilvering for the entire duration of the * scan then it should excise that range from its DTLs. Otherwise, this * vdev is considered partially resilvered and should leave its DTL * entries intact. 
The comment in vdev_dtl_reassess() describes how we * excise the DTLs. */ static boolean_t vdev_dtl_should_excise(vdev_t *vd) { spa_t *spa = vd->vdev_spa; dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; ASSERT0(scn->scn_phys.scn_errors); ASSERT0(vd->vdev_children); if (vd->vdev_state < VDEV_STATE_DEGRADED) return (B_FALSE); if (vd->vdev_resilver_txg == 0 || range_tree_space(vd->vdev_dtl[DTL_MISSING]) == 0) return (B_TRUE); /* * When a resilver is initiated the scan will assign the scn_max_txg * value to the highest txg value that exists in all DTLs. If this * device's max DTL is not part of this scan (i.e. it is not in * the range (scn_min_txg, scn_max_txg]) then it is not eligible * for excision. */ if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) { ASSERT3U(scn->scn_phys.scn_min_txg, <=, vdev_dtl_min(vd)); ASSERT3U(scn->scn_phys.scn_min_txg, <, vd->vdev_resilver_txg); ASSERT3U(vd->vdev_resilver_txg, <=, scn->scn_phys.scn_max_txg); return (B_TRUE); } return (B_FALSE); } /* * Reassess DTLs after a config change or scrub completion. */ void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) { spa_t *spa = vd->vdev_spa; avl_tree_t reftree; int minref; ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); for (int c = 0; c < vd->vdev_children; c++) vdev_dtl_reassess(vd->vdev_child[c], txg, scrub_txg, scrub_done); if (vd == spa->spa_root_vdev || vd->vdev_ishole || vd->vdev_aux) return; if (vd->vdev_ops->vdev_op_leaf) { dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; mutex_enter(&vd->vdev_dtl_lock); /* * If we've completed a scan cleanly then determine * if this vdev should remove any DTLs. We only want to * excise regions on vdevs that were available during * the entire duration of this scan. */ if (scrub_txg != 0 && (spa->spa_scrub_started || (scn != NULL && scn->scn_phys.scn_errors == 0)) && vdev_dtl_should_excise(vd)) { /* * We completed a scrub up to scrub_txg. If we * did it without rebooting, then the scrub dtl * will be valid, so excise the old region and * fold in the scrub dtl. Otherwise, leave the * dtl as-is if there was an error. * * There's a little trick here: to excise the beginning * of the DTL_MISSING map, we put it into a reference * tree and then add a segment with refcnt -1 that * covers the range [0, scrub_txg). This means * that each txg in that range has refcnt -1 or 0. * We then add DTL_SCRUB with a refcnt of 2, so that * entries in the range [0, scrub_txg) will have a * positive refcnt -- either 1 or 2. We then convert * the reference tree into the new DTL_MISSING map. */ space_reftree_create(&reftree); space_reftree_add_map(&reftree, vd->vdev_dtl[DTL_MISSING], 1); space_reftree_add_seg(&reftree, 0, scrub_txg, -1); space_reftree_add_map(&reftree, vd->vdev_dtl[DTL_SCRUB], 2); space_reftree_generate_map(&reftree, vd->vdev_dtl[DTL_MISSING], 1); space_reftree_destroy(&reftree); } range_tree_vacate(vd->vdev_dtl[DTL_PARTIAL], NULL, NULL); range_tree_walk(vd->vdev_dtl[DTL_MISSING], range_tree_add, vd->vdev_dtl[DTL_PARTIAL]); if (scrub_done) range_tree_vacate(vd->vdev_dtl[DTL_SCRUB], NULL, NULL); range_tree_vacate(vd->vdev_dtl[DTL_OUTAGE], NULL, NULL); if (!vdev_readable(vd)) range_tree_add(vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL); else range_tree_walk(vd->vdev_dtl[DTL_MISSING], range_tree_add, vd->vdev_dtl[DTL_OUTAGE]); /* * If the vdev was resilvering and no longer has any * DTLs then reset its resilvering flag and dirty * the top level so that we persist the change.
*/ if (vd->vdev_resilver_txg != 0 && range_tree_space(vd->vdev_dtl[DTL_MISSING]) == 0 && range_tree_space(vd->vdev_dtl[DTL_OUTAGE]) == 0) { vd->vdev_resilver_txg = 0; vdev_config_dirty(vd->vdev_top); } mutex_exit(&vd->vdev_dtl_lock); if (txg != 0) vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg); return; } mutex_enter(&vd->vdev_dtl_lock); for (int t = 0; t < DTL_TYPES; t++) { /* account for child's outage in parent's missing map */ int s = (t == DTL_MISSING) ? DTL_OUTAGE: t; if (t == DTL_SCRUB) continue; /* leaf vdevs only */ if (t == DTL_PARTIAL) minref = 1; /* i.e. non-zero */ else if (vd->vdev_nparity != 0) minref = vd->vdev_nparity + 1; /* RAID-Z */ else minref = vd->vdev_children; /* any kind of mirror */ space_reftree_create(&reftree); for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; mutex_enter(&cvd->vdev_dtl_lock); space_reftree_add_map(&reftree, cvd->vdev_dtl[s], 1); mutex_exit(&cvd->vdev_dtl_lock); } space_reftree_generate_map(&reftree, vd->vdev_dtl[t], minref); space_reftree_destroy(&reftree); } mutex_exit(&vd->vdev_dtl_lock); } int vdev_dtl_load(vdev_t *vd) { spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; int error = 0; if (vd->vdev_ops->vdev_op_leaf && vd->vdev_dtl_object != 0) { ASSERT(!vd->vdev_ishole); error = space_map_open(&vd->vdev_dtl_sm, mos, vd->vdev_dtl_object, 0, -1ULL, 0, &vd->vdev_dtl_lock); if (error) return (error); ASSERT(vd->vdev_dtl_sm != NULL); mutex_enter(&vd->vdev_dtl_lock); /* * Now that we've opened the space_map we need to update * the in-core DTL. */ space_map_update(vd->vdev_dtl_sm); error = space_map_load(vd->vdev_dtl_sm, vd->vdev_dtl[DTL_MISSING], SM_ALLOC); mutex_exit(&vd->vdev_dtl_lock); return (error); } for (int c = 0; c < vd->vdev_children; c++) { error = vdev_dtl_load(vd->vdev_child[c]); if (error != 0) break; } return (error); } void vdev_destroy_unlink_zap(vdev_t *vd, uint64_t zapobj, dmu_tx_t *tx) { spa_t *spa = vd->vdev_spa; VERIFY0(zap_destroy(spa->spa_meta_objset, zapobj, tx)); VERIFY0(zap_remove_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, zapobj, tx)); } uint64_t vdev_create_link_zap(vdev_t *vd, dmu_tx_t *tx) { spa_t *spa = vd->vdev_spa; uint64_t zap = zap_create(spa->spa_meta_objset, DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx); ASSERT(zap != 0); VERIFY0(zap_add_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, zap, tx)); return (zap); } void vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx) { if (vd->vdev_ops != &vdev_hole_ops && vd->vdev_ops != &vdev_missing_ops && vd->vdev_ops != &vdev_root_ops && !vd->vdev_top->vdev_removing) { if (vd->vdev_ops->vdev_op_leaf && vd->vdev_leaf_zap == 0) { vd->vdev_leaf_zap = vdev_create_link_zap(vd, tx); } if (vd == vd->vdev_top && vd->vdev_top_zap == 0) { vd->vdev_top_zap = vdev_create_link_zap(vd, tx); } } for (uint64_t i = 0; i < vd->vdev_children; i++) { vdev_construct_zaps(vd->vdev_child[i], tx); } } void vdev_dtl_sync(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; range_tree_t *rt = vd->vdev_dtl[DTL_MISSING]; objset_t *mos = spa->spa_meta_objset; range_tree_t *rtsync; kmutex_t rtlock; dmu_tx_t *tx; uint64_t object = space_map_object(vd->vdev_dtl_sm); ASSERT(!vd->vdev_ishole); ASSERT(vd->vdev_ops->vdev_op_leaf); tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); if (vd->vdev_detached || vd->vdev_top->vdev_removing) { mutex_enter(&vd->vdev_dtl_lock); space_map_free(vd->vdev_dtl_sm, tx); space_map_close(vd->vdev_dtl_sm); vd->vdev_dtl_sm = NULL; mutex_exit(&vd->vdev_dtl_lock); /* * We only destroy the leaf ZAP for detached leaves or for * removed 
log devices. Removed data devices handle leaf ZAP * cleanup later, once cancellation is no longer possible. */ if (vd->vdev_leaf_zap != 0 && (vd->vdev_detached || vd->vdev_top->vdev_islog)) { vdev_destroy_unlink_zap(vd, vd->vdev_leaf_zap, tx); vd->vdev_leaf_zap = 0; } dmu_tx_commit(tx); return; } if (vd->vdev_dtl_sm == NULL) { uint64_t new_object; new_object = space_map_alloc(mos, tx); VERIFY3U(new_object, !=, 0); VERIFY0(space_map_open(&vd->vdev_dtl_sm, mos, new_object, 0, -1ULL, 0, &vd->vdev_dtl_lock)); ASSERT(vd->vdev_dtl_sm != NULL); } bzero(&rtlock, sizeof(rtlock)); mutex_init(&rtlock, NULL, MUTEX_DEFAULT, NULL); rtsync = range_tree_create(NULL, NULL, &rtlock); mutex_enter(&rtlock); mutex_enter(&vd->vdev_dtl_lock); range_tree_walk(rt, range_tree_add, rtsync); mutex_exit(&vd->vdev_dtl_lock); space_map_truncate(vd->vdev_dtl_sm, tx); space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, tx); range_tree_vacate(rtsync, NULL, NULL); range_tree_destroy(rtsync); mutex_exit(&rtlock); mutex_destroy(&rtlock); /* * If the object for the space map has changed then dirty * the top level so that we update the config. */ if (object != space_map_object(vd->vdev_dtl_sm)) { zfs_dbgmsg("txg %llu, spa %s, DTL old object %llu, " "new object %llu", txg, spa_name(spa), object, space_map_object(vd->vdev_dtl_sm)); vdev_config_dirty(vd->vdev_top); } dmu_tx_commit(tx); mutex_enter(&vd->vdev_dtl_lock); space_map_update(vd->vdev_dtl_sm); mutex_exit(&vd->vdev_dtl_lock); } /* * Determine whether the specified vdev can be offlined/detached/removed * without losing data. */ boolean_t vdev_dtl_required(vdev_t *vd) { spa_t *spa = vd->vdev_spa; vdev_t *tvd = vd->vdev_top; uint8_t cant_read = vd->vdev_cant_read; boolean_t required; ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); if (vd == spa->spa_root_vdev || vd == tvd) return (B_TRUE); /* * Temporarily mark the device as unreadable, and then determine * whether this results in any DTL outages in the top-level vdev. * If not, we can safely offline/detach/remove the device. */ vd->vdev_cant_read = B_TRUE; vdev_dtl_reassess(tvd, 0, 0, B_FALSE); required = !vdev_dtl_empty(tvd, DTL_OUTAGE); vd->vdev_cant_read = cant_read; vdev_dtl_reassess(tvd, 0, 0, B_FALSE); if (!required && zio_injection_enabled) required = !!zio_handle_device_injection(vd, NULL, ECHILD); return (required); } /* * Determine if resilver is needed, and if so the txg range. */ boolean_t vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp) { boolean_t needed = B_FALSE; uint64_t thismin = UINT64_MAX; uint64_t thismax = 0; if (vd->vdev_children == 0) { mutex_enter(&vd->vdev_dtl_lock); if (range_tree_space(vd->vdev_dtl[DTL_MISSING]) != 0 && vdev_writeable(vd)) { thismin = vdev_dtl_min(vd); thismax = vdev_dtl_max(vd); needed = B_TRUE; } mutex_exit(&vd->vdev_dtl_lock); } else { for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; uint64_t cmin, cmax; if (vdev_resilver_needed(cvd, &cmin, &cmax)) { thismin = MIN(thismin, cmin); thismax = MAX(thismax, cmax); needed = B_TRUE; } } } if (needed && minp) { *minp = thismin; *maxp = thismax; } return (needed); } void vdev_load(vdev_t *vd) { /* * Recursively load all children. */ for (int c = 0; c < vd->vdev_children; c++) vdev_load(vd->vdev_child[c]); /* * If this is a top-level vdev, initialize its metaslabs. 
*/ if (vd == vd->vdev_top && !vd->vdev_ishole && (vd->vdev_ashift == 0 || vd->vdev_asize == 0 || vdev_metaslab_init(vd, 0) != 0)) vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); /* * If this is a leaf vdev, load its DTL. */ if (vd->vdev_ops->vdev_op_leaf && vdev_dtl_load(vd) != 0) vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); } /* * The special vdev case is used for hot spares and l2cache devices. Its * sole purpose is to set the vdev state for the associated vdev. To do this, * we make sure that we can open the underlying device, then try to read the * label, and make sure that the label is sane and that it hasn't been * repurposed to another pool. */ int vdev_validate_aux(vdev_t *vd) { nvlist_t *label; uint64_t guid, version; uint64_t state; if (!vdev_readable(vd)) return (0); if ((label = vdev_label_read_config(vd, -1ULL)) == NULL) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); return (-1); } if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 || !SPA_VERSION_IS_SUPPORTED(version) || nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 || guid != vd->vdev_guid || nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); return (-1); } /* * We don't actually check the pool state here. If it's in fact in * use by another pool, we update this fact on the fly when requested. */ nvlist_free(label); return (0); } void vdev_remove(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; dmu_tx_t *tx; tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); ASSERT(vd == vd->vdev_top); ASSERT3U(txg, ==, spa_syncing_txg(spa)); if (vd->vdev_ms != NULL) { metaslab_group_t *mg = vd->vdev_mg; metaslab_group_histogram_verify(mg); metaslab_class_histogram_verify(mg->mg_class); for (int m = 0; m < vd->vdev_ms_count; m++) { metaslab_t *msp = vd->vdev_ms[m]; if (msp == NULL || msp->ms_sm == NULL) continue; mutex_enter(&msp->ms_lock); /* * If the metaslab was not loaded when the vdev * was removed then the histogram accounting may * not be accurate. Update the histogram information * here so that we ensure that the metaslab group * and metaslab class are up-to-date.
*/ metaslab_group_histogram_remove(mg, msp); VERIFY0(space_map_allocated(msp->ms_sm)); space_map_free(msp->ms_sm, tx); space_map_close(msp->ms_sm); msp->ms_sm = NULL; mutex_exit(&msp->ms_lock); } metaslab_group_histogram_verify(mg); metaslab_class_histogram_verify(mg->mg_class); for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) ASSERT0(mg->mg_histogram[i]); } if (vd->vdev_ms_array) { (void) dmu_object_free(mos, vd->vdev_ms_array, tx); vd->vdev_ms_array = 0; } if (vd->vdev_islog && vd->vdev_top_zap != 0) { vdev_destroy_unlink_zap(vd, vd->vdev_top_zap, tx); vd->vdev_top_zap = 0; } dmu_tx_commit(tx); } void vdev_sync_done(vdev_t *vd, uint64_t txg) { metaslab_t *msp; boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg)); ASSERT(!vd->vdev_ishole); while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg))) metaslab_sync_done(msp, txg); if (reassess) metaslab_sync_reassess(vd->vdev_mg); } void vdev_sync(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; vdev_t *lvd; metaslab_t *msp; dmu_tx_t *tx; ASSERT(!vd->vdev_ishole); if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0) { ASSERT(vd == vd->vdev_top); tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset, DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx); ASSERT(vd->vdev_ms_array != 0); vdev_config_dirty(vd); dmu_tx_commit(tx); } /* * Remove the metadata associated with this vdev once it's empty. */ if (vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing) vdev_remove(vd, txg); while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) { metaslab_sync(msp, txg); (void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg)); } while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL) vdev_dtl_sync(lvd, txg); (void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg)); } uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize) { return (vd->vdev_ops->vdev_op_asize(vd, psize)); } /* * Mark the given vdev faulted. A faulted vdev behaves as if the device could * not be opened, and no I/O is attempted. */ int vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux) { vdev_t *vd, *tvd; spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, ENODEV)); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); tvd = vd->vdev_top; /* * We don't directly use the aux state here, but if we do a * vdev_reopen(), we need this value to be present to remember why we * were faulted. */ vd->vdev_label_aux = aux; /* * Faulted state takes precedence over degraded. */ vd->vdev_delayed_close = B_FALSE; vd->vdev_faulted = 1ULL; vd->vdev_degraded = 0ULL; vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux); /* * If this device has the only valid copy of the data, then * back off and simply mark the vdev as degraded instead. */ if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) { vd->vdev_degraded = 1ULL; vd->vdev_faulted = 0ULL; /* * If we reopen the device and it's not dead, only then do we * mark it degraded. */ vdev_reopen(tvd); if (vdev_readable(vd)) vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux); } return (spa_vdev_state_exit(spa, vd, 0)); } /* * Mark the given vdev degraded. A degraded vdev is purely an indication to the * user that something is wrong. The vdev continues to operate as normal as far * as I/O is concerned. 
*/ int vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux) { vdev_t *vd; spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, ENODEV)); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); /* * If the vdev is already faulted, then don't do anything. */ if (vd->vdev_faulted || vd->vdev_degraded) return (spa_vdev_state_exit(spa, NULL, 0)); vd->vdev_degraded = 1ULL; if (!vdev_is_dead(vd)) vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux); return (spa_vdev_state_exit(spa, vd, 0)); } /* * Online the given vdev. * * If 'ZFS_ONLINE_UNSPARE' is set, it implies two things. First, any attached * spare device should be detached when the device finishes resilvering. * Second, the online should be treated like a 'test' online case, so no FMA * events are generated if the device fails to open. */ int vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) { vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev; boolean_t wasoffline; vdev_state_t oldstate; spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, ENODEV)); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); wasoffline = (vd->vdev_offline || vd->vdev_tmpoffline); oldstate = vd->vdev_state; tvd = vd->vdev_top; vd->vdev_offline = B_FALSE; vd->vdev_tmpoffline = B_FALSE; vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE); vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT); /* XXX - L2ARC 1.0 does not support expansion */ if (!vd->vdev_aux) { for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) pvd->vdev_expanding = !!(flags & ZFS_ONLINE_EXPAND); } vdev_reopen(tvd); vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE; if (!vd->vdev_aux) { for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) pvd->vdev_expanding = B_FALSE; } if (newstate) *newstate = vd->vdev_state; if ((flags & ZFS_ONLINE_UNSPARE) && !vdev_is_dead(vd) && vd->vdev_parent && vd->vdev_parent->vdev_ops == &vdev_spare_ops && vd->vdev_parent->vdev_child[0] == vd) vd->vdev_unspare = B_TRUE; if ((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand) { /* XXX - L2ARC 1.0 does not support expansion */ if (vd->vdev_aux) return (spa_vdev_state_exit(spa, vd, ENOTSUP)); spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); } if (wasoffline || (oldstate < VDEV_STATE_DEGRADED && vd->vdev_state >= VDEV_STATE_DEGRADED)) - spa_event_notify(spa, vd, ESC_ZFS_VDEV_ONLINE); + spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_ONLINE); return (spa_vdev_state_exit(spa, vd, 0)); } static int vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags) { vdev_t *vd, *tvd; int error = 0; uint64_t generation; metaslab_group_t *mg; top: spa_vdev_state_enter(spa, SCL_ALLOC); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, ENODEV)); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); tvd = vd->vdev_top; mg = tvd->vdev_mg; generation = spa->spa_config_generation + 1; /* * If the device isn't already offline, try to offline it. */ if (!vd->vdev_offline) { /* * If this device has the only valid copy of some data, * don't allow it to be offlined. Log devices are always * expendable. */ if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) return (spa_vdev_state_exit(spa, NULL, EBUSY)); /* * If the top-level is a slog and it has had allocations * then proceed. 
We check that the vdev's metaslab group * is not NULL since it's possible that we may have just * added this vdev but not yet initialized its metaslabs. */ if (tvd->vdev_islog && mg != NULL) { /* * Prevent any future allocations. */ metaslab_group_passivate(mg); (void) spa_vdev_state_exit(spa, vd, 0); error = spa_offline_log(spa); spa_vdev_state_enter(spa, SCL_ALLOC); /* * Check to see if the config has changed. */ if (error || generation != spa->spa_config_generation) { metaslab_group_activate(mg); if (error) return (spa_vdev_state_exit(spa, vd, error)); (void) spa_vdev_state_exit(spa, vd, 0); goto top; } ASSERT0(tvd->vdev_stat.vs_alloc); } /* * Offline this device and reopen its top-level vdev. * If the top-level vdev is a log device then just offline * it. Otherwise, if this action results in the top-level * vdev becoming unusable, undo it and fail the request. */ vd->vdev_offline = B_TRUE; vdev_reopen(tvd); if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_is_dead(tvd)) { vd->vdev_offline = B_FALSE; vdev_reopen(tvd); return (spa_vdev_state_exit(spa, NULL, EBUSY)); } /* * Add the device back into the metaslab rotor so that * once we online the device it's open for business. */ if (tvd->vdev_islog && mg != NULL) metaslab_group_activate(mg); } vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY); return (spa_vdev_state_exit(spa, vd, 0)); } int vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) { int error; mutex_enter(&spa->spa_vdev_top_lock); error = vdev_offline_locked(spa, guid, flags); mutex_exit(&spa->spa_vdev_top_lock); return (error); } /* * Clear the error counts associated with this vdev. Unlike vdev_online() and * vdev_offline(), we assume the spa config is locked. We also clear all * children. If 'vd' is NULL, then the user wants to clear all vdevs. */ void vdev_clear(spa_t *spa, vdev_t *vd) { vdev_t *rvd = spa->spa_root_vdev; ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); if (vd == NULL) vd = rvd; vd->vdev_stat.vs_read_errors = 0; vd->vdev_stat.vs_write_errors = 0; vd->vdev_stat.vs_checksum_errors = 0; for (int c = 0; c < vd->vdev_children; c++) vdev_clear(spa, vd->vdev_child[c]); if (vd == rvd) { for (int c = 0; c < spa->spa_l2cache.sav_count; c++) vdev_clear(spa, spa->spa_l2cache.sav_vdevs[c]); for (int c = 0; c < spa->spa_spares.sav_count; c++) vdev_clear(spa, spa->spa_spares.sav_vdevs[c]); } /* * If we're in the FAULTED state or have experienced failed I/O, then * clear the persistent state and attempt to reopen the device. We * also mark the vdev config dirty, so that the new faulted state is * written out to disk. */ if (vd->vdev_faulted || vd->vdev_degraded || !vdev_readable(vd) || !vdev_writeable(vd)) { /* * When reopening in response to a clear event, it may be due to * a fmadm repair request. In this case, if the device is * still broken, we want to still post the ereport again. */ vd->vdev_forcefault = B_TRUE; vd->vdev_faulted = vd->vdev_degraded = 0ULL; vd->vdev_cant_read = B_FALSE; vd->vdev_cant_write = B_FALSE; vdev_reopen(vd == rvd ? rvd : vd->vdev_top); vd->vdev_forcefault = B_FALSE; if (vd != rvd && vdev_writeable(vd->vdev_top)) vdev_state_dirty(vd->vdev_top); if (vd->vdev_aux == NULL && !vdev_is_dead(vd)) spa_async_request(spa, SPA_ASYNC_RESILVER); - spa_event_notify(spa, vd, ESC_ZFS_VDEV_CLEAR); + spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_CLEAR); } /* * When clearing a FMA-diagnosed fault, we always want to * unspare the device, as we assume that the original spare was * done in response to the FMA fault.
*/ if (!vdev_is_dead(vd) && vd->vdev_parent != NULL && vd->vdev_parent->vdev_ops == &vdev_spare_ops && vd->vdev_parent->vdev_child[0] == vd) vd->vdev_unspare = B_TRUE; } boolean_t vdev_is_dead(vdev_t *vd) { /* * Holes and missing devices are always considered "dead". * This simplifies the code since we don't have to check for * these types of devices in the various code paths. * Instead we rely on the fact that we skip over dead devices * before issuing I/O to them. */ return (vd->vdev_state < VDEV_STATE_DEGRADED || vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops); } boolean_t vdev_readable(vdev_t *vd) { return (!vdev_is_dead(vd) && !vd->vdev_cant_read); } boolean_t vdev_writeable(vdev_t *vd) { return (!vdev_is_dead(vd) && !vd->vdev_cant_write); } boolean_t vdev_allocatable(vdev_t *vd) { uint64_t state = vd->vdev_state; /* * We currently allow allocations from vdevs which may be in the * process of reopening (i.e. VDEV_STATE_CLOSED). If the device * fails to reopen then we'll catch it later when we're holding * the proper locks. Note that we have to get the vdev state * in a local variable because although it changes atomically, * we're asking two separate questions about it. */ return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) && !vd->vdev_cant_write && !vd->vdev_ishole && vd->vdev_mg->mg_initialized); } boolean_t vdev_accessible(vdev_t *vd, zio_t *zio) { ASSERT(zio->io_vd == vd); if (vdev_is_dead(vd) || vd->vdev_remove_wanted) return (B_FALSE); if (zio->io_type == ZIO_TYPE_READ) return (!vd->vdev_cant_read); if (zio->io_type == ZIO_TYPE_WRITE) return (!vd->vdev_cant_write); return (B_TRUE); } /* * Get statistics for the given vdev. */ void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs) { spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; vdev_t *tvd = vd->vdev_top; ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); mutex_enter(&vd->vdev_stat_lock); bcopy(&vd->vdev_stat, vs, sizeof (*vs)); vs->vs_timestamp = gethrtime() - vs->vs_timestamp; vs->vs_state = vd->vdev_state; vs->vs_rsize = vdev_get_min_asize(vd); if (vd->vdev_ops->vdev_op_leaf) vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; /* * Report expandable space on top-level, non-auxiliary devices only. * The expandable space is reported in terms of metaslab sized units * since that determines how much space the pool can expand. */ if (vd->vdev_aux == NULL && tvd != NULL && vd->vdev_max_asize != 0) { vs->vs_esize = P2ALIGN(vd->vdev_max_asize - vd->vdev_asize - spa->spa_bootsize, 1ULL << tvd->vdev_ms_shift); } vs->vs_configured_ashift = vd->vdev_top != NULL ? vd->vdev_top->vdev_ashift : vd->vdev_ashift; vs->vs_logical_ashift = vd->vdev_logical_ashift; vs->vs_physical_ashift = vd->vdev_physical_ashift; if (vd->vdev_aux == NULL && vd == vd->vdev_top && !vd->vdev_ishole) { vs->vs_fragmentation = vd->vdev_mg->mg_fragmentation; } /* * If we're getting stats on the root vdev, aggregate the I/O counts * over all top-level vdevs (i.e. the direct children of the root).
*/ if (vd == rvd) { for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *cvd = rvd->vdev_child[c]; vdev_stat_t *cvs = &cvd->vdev_stat; for (int t = 0; t < ZIO_TYPES; t++) { vs->vs_ops[t] += cvs->vs_ops[t]; vs->vs_bytes[t] += cvs->vs_bytes[t]; } cvs->vs_scan_removing = cvd->vdev_removing; } } mutex_exit(&vd->vdev_stat_lock); } void vdev_clear_stats(vdev_t *vd) { mutex_enter(&vd->vdev_stat_lock); vd->vdev_stat.vs_space = 0; vd->vdev_stat.vs_dspace = 0; vd->vdev_stat.vs_alloc = 0; mutex_exit(&vd->vdev_stat_lock); } void vdev_scan_stat_init(vdev_t *vd) { vdev_stat_t *vs = &vd->vdev_stat; for (int c = 0; c < vd->vdev_children; c++) vdev_scan_stat_init(vd->vdev_child[c]); mutex_enter(&vd->vdev_stat_lock); vs->vs_scan_processed = 0; mutex_exit(&vd->vdev_stat_lock); } void vdev_stat_update(zio_t *zio, uint64_t psize) { spa_t *spa = zio->io_spa; vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd = zio->io_vd ? zio->io_vd : rvd; vdev_t *pvd; uint64_t txg = zio->io_txg; vdev_stat_t *vs = &vd->vdev_stat; zio_type_t type = zio->io_type; int flags = zio->io_flags; /* * If this i/o is a gang leader, it didn't do any actual work. */ if (zio->io_gang_tree) return; if (zio->io_error == 0) { /* * If this is a root i/o, don't count it -- we've already * counted the top-level vdevs, and vdev_get_stats() will * aggregate them when asked. This reduces contention on * the root vdev_stat_lock and implicitly handles blocks * that compress away to holes, for which there is no i/o. * (Holes never create vdev children, so all the counters * remain zero, which is what we want.) * * Note: this only applies to successful i/o (io_error == 0) * because unlike i/o counts, errors are not additive. * When reading a ditto block, for example, failure of * one top-level vdev does not imply a root-level error. */ if (vd == rvd) return; ASSERT(vd == zio->io_vd); if (flags & ZIO_FLAG_IO_BYPASS) return; mutex_enter(&vd->vdev_stat_lock); if (flags & ZIO_FLAG_IO_REPAIR) { if (flags & ZIO_FLAG_SCAN_THREAD) { dsl_scan_phys_t *scn_phys = &spa->spa_dsl_pool->dp_scan->scn_phys; uint64_t *processed = &scn_phys->scn_processed; /* XXX cleanup? */ if (vd->vdev_ops->vdev_op_leaf) atomic_add_64(processed, psize); vs->vs_scan_processed += psize; } if (flags & ZIO_FLAG_SELF_HEAL) vs->vs_self_healed += psize; } vs->vs_ops[type]++; vs->vs_bytes[type] += psize; mutex_exit(&vd->vdev_stat_lock); return; } if (flags & ZIO_FLAG_SPECULATIVE) return; /* * If this is an I/O error that is going to be retried, then ignore the * error. Otherwise, the user may interpret B_FAILFAST I/O errors as * hard errors, when in reality they can happen for any number of * innocuous reasons (bus resets, MPxIO link failure, etc). */ if (zio->io_error == EIO && !(zio->io_flags & ZIO_FLAG_IO_RETRY)) return; /* * Intent log writes won't propagate their error to the root * I/O so don't mark these types of failures as pool-level * errors.
*/ if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) return; mutex_enter(&vd->vdev_stat_lock); if (type == ZIO_TYPE_READ && !vdev_is_dead(vd)) { if (zio->io_error == ECKSUM) vs->vs_checksum_errors++; else vs->vs_read_errors++; } if (type == ZIO_TYPE_WRITE && !vdev_is_dead(vd)) vs->vs_write_errors++; mutex_exit(&vd->vdev_stat_lock); if (type == ZIO_TYPE_WRITE && txg != 0 && (!(flags & ZIO_FLAG_IO_REPAIR) || (flags & ZIO_FLAG_SCAN_THREAD) || spa->spa_claiming)) { /* * This is either a normal write (not a repair), or it's * a repair induced by the scrub thread, or it's a repair * made by zil_claim() during spa_load() in the first txg. * In the normal case, we commit the DTL change in the same * txg as the block was born. In the scrub-induced repair * case, we know that scrubs run in first-pass syncing context, * so we commit the DTL change in spa_syncing_txg(spa). * In the zil_claim() case, we commit in spa_first_txg(spa). * * We currently do not make DTL entries for failed spontaneous * self-healing writes triggered by normal (non-scrubbing) * reads, because we have no transactional context in which to * do so -- and it's not clear that it'd be desirable anyway. */ if (vd->vdev_ops->vdev_op_leaf) { uint64_t commit_txg = txg; if (flags & ZIO_FLAG_SCAN_THREAD) { ASSERT(flags & ZIO_FLAG_IO_REPAIR); ASSERT(spa_sync_pass(spa) == 1); vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1); commit_txg = spa_syncing_txg(spa); } else if (spa->spa_claiming) { ASSERT(flags & ZIO_FLAG_IO_REPAIR); commit_txg = spa_first_txg(spa); } ASSERT(commit_txg >= spa_syncing_txg(spa)); if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1)) return; for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1); vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg); } if (vd != rvd) vdev_dtl_dirty(vd, DTL_MISSING, txg, 1); } } /* * Update the in-core space usage stats for this vdev, its metaslab class, * and the root vdev. */ void vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta, int64_t space_delta) { int64_t dspace_delta = space_delta; spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; metaslab_group_t *mg = vd->vdev_mg; metaslab_class_t *mc = mg ? mg->mg_class : NULL; ASSERT(vd == vd->vdev_top); /* * Apply the inverse of the psize-to-asize (ie. RAID-Z) space-expansion * factor. We must calculate this here and not at the root vdev * because the root vdev's psize-to-asize is simply the max of its * childrens', thus not accurate enough for us. */ ASSERT((dspace_delta & (SPA_MINBLOCKSIZE-1)) == 0); ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache); dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio; mutex_enter(&vd->vdev_stat_lock); vd->vdev_stat.vs_alloc += alloc_delta; vd->vdev_stat.vs_space += space_delta; vd->vdev_stat.vs_dspace += dspace_delta; mutex_exit(&vd->vdev_stat_lock); if (mc == spa_normal_class(spa)) { mutex_enter(&rvd->vdev_stat_lock); rvd->vdev_stat.vs_alloc += alloc_delta; rvd->vdev_stat.vs_space += space_delta; rvd->vdev_stat.vs_dspace += dspace_delta; mutex_exit(&rvd->vdev_stat_lock); } if (mc != NULL) { ASSERT(rvd == vd->vdev_parent); ASSERT(vd->vdev_ms_count != 0); metaslab_class_space_update(mc, alloc_delta, defer_delta, space_delta, dspace_delta); } } /* * Mark a top-level vdev's config as dirty, placing it on the dirty list * so that it will be written out next time the vdev configuration is synced. * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs. 
*/ void vdev_config_dirty(vdev_t *vd) { spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; int c; ASSERT(spa_writeable(spa)); /* * If this is an aux vdev (as with l2cache and spare devices), then we * update the vdev config manually and set the sync flag. */ if (vd->vdev_aux != NULL) { spa_aux_vdev_t *sav = vd->vdev_aux; nvlist_t **aux; uint_t naux; for (c = 0; c < sav->sav_count; c++) { if (sav->sav_vdevs[c] == vd) break; } if (c == sav->sav_count) { /* * We're being removed. There's nothing more to do. */ ASSERT(sav->sav_sync == B_TRUE); return; } sav->sav_sync = B_TRUE; if (nvlist_lookup_nvlist_array(sav->sav_config, ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) { VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, ZPOOL_CONFIG_SPARES, &aux, &naux) == 0); } ASSERT(c < naux); /* * Setting the nvlist in the middle of the array is a little * sketchy, but it will work. */ nvlist_free(aux[c]); aux[c] = vdev_config_generate(spa, vd, B_TRUE, 0); return; } /* * The dirty list is protected by the SCL_CONFIG lock. The caller * must either hold SCL_CONFIG as writer, or must be the sync thread * (which holds SCL_CONFIG as reader). There's only one sync thread, * so this is sufficient to ensure mutual exclusion. */ ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) || (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_CONFIG, RW_READER))); if (vd == rvd) { for (c = 0; c < rvd->vdev_children; c++) vdev_config_dirty(rvd->vdev_child[c]); } else { ASSERT(vd == vd->vdev_top); if (!list_link_active(&vd->vdev_config_dirty_node) && !vd->vdev_ishole) list_insert_head(&spa->spa_config_dirty_list, vd); } } void vdev_config_clean(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) || (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_CONFIG, RW_READER))); ASSERT(list_link_active(&vd->vdev_config_dirty_node)); list_remove(&spa->spa_config_dirty_list, vd); } /* * Mark a top-level vdev's state as dirty, so that the next pass of * spa_sync() can convert this into vdev_config_dirty(). We distinguish * the state changes from larger config changes because they require * much less locking, and are often needed for administrative actions. */ void vdev_state_dirty(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_writeable(spa)); ASSERT(vd == vd->vdev_top); /* * The state list is protected by the SCL_STATE lock. The caller * must either hold SCL_STATE as writer, or must be the sync thread * (which holds SCL_STATE as reader). There's only one sync thread, * so this is sufficient to ensure mutual exclusion. */ ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) || (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_STATE, RW_READER))); if (!list_link_active(&vd->vdev_state_dirty_node) && !vd->vdev_ishole) list_insert_head(&spa->spa_state_dirty_list, vd); } void vdev_state_clean(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) || (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_STATE, RW_READER))); ASSERT(list_link_active(&vd->vdev_state_dirty_node)); list_remove(&spa->spa_state_dirty_list, vd); } /* * Propagate vdev state up from children to parent. */ void vdev_propagate_state(vdev_t *vd) { spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; int degraded = 0, faulted = 0; int corrupted = 0; vdev_t *child; if (vd->vdev_children > 0) { for (int c = 0; c < vd->vdev_children; c++) { child = vd->vdev_child[c]; /* * Don't factor holes into the decision.
*/ if (child->vdev_ishole) continue; if (!vdev_readable(child) || (!vdev_writeable(child) && spa_writeable(spa))) { /* * Root special: if there is a top-level log * device, treat the root vdev as if it were * degraded. */ if (child->vdev_islog && vd == rvd) degraded++; else faulted++; } else if (child->vdev_state <= VDEV_STATE_DEGRADED) { degraded++; } if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA) corrupted++; } vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded); /* * Root special: if there is a top-level vdev that cannot be * opened due to corrupted metadata, then propagate the root * vdev's aux state as 'corrupt' rather than 'insufficient * replicas'. */ if (corrupted && vd == rvd && rvd->vdev_state == VDEV_STATE_CANT_OPEN) vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); } if (vd->vdev_parent) vdev_propagate_state(vd->vdev_parent); } /* * Set a vdev's state. If this is during an open, we don't update the parent * state, because we're in the process of opening children depth-first. * Otherwise, we propagate the change to the parent. * * If this routine places a device in a faulted state, an appropriate ereport is * generated. */ void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) { uint64_t save_state; spa_t *spa = vd->vdev_spa; if (state == vd->vdev_state) { vd->vdev_stat.vs_aux = aux; return; } save_state = vd->vdev_state; vd->vdev_state = state; vd->vdev_stat.vs_aux = aux; /* * If we are setting the vdev state to anything but an open state, then * always close the underlying device unless the device has requested * a delayed close (i.e. we're about to remove or fault the device). * Otherwise, we keep accessible but invalid devices open forever. * We don't call vdev_close() itself, because that implies some extra * checks (offline, etc) that we don't want here. This is limited to * leaf devices, because otherwise closing the device will affect other * children. */ if (!vd->vdev_delayed_close && vdev_is_dead(vd) && vd->vdev_ops->vdev_op_leaf) vd->vdev_ops->vdev_op_close(vd); if (vd->vdev_removed && state == VDEV_STATE_CANT_OPEN && (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) { /* * If the previous state is set to VDEV_STATE_REMOVED, then this * device was previously marked removed and someone attempted to * reopen it. If this failed due to a nonexistent device, then * keep the device in the REMOVED state. We also let this be if * it is one of our special test online cases, which is only * attempting to online the device and shouldn't generate an FMA * fault. */ vd->vdev_state = VDEV_STATE_REMOVED; vd->vdev_stat.vs_aux = VDEV_AUX_NONE; } else if (state == VDEV_STATE_REMOVED) { vd->vdev_removed = B_TRUE; } else if (state == VDEV_STATE_CANT_OPEN) { /* * If we fail to open a vdev during an import or recovery, we * mark it as "not available", which signifies that it was * never there to begin with. Failure to open such a device * is not considered an error. */ if ((spa_load_state(spa) == SPA_LOAD_IMPORT || spa_load_state(spa) == SPA_LOAD_RECOVER) && vd->vdev_ops->vdev_op_leaf) vd->vdev_not_present = 1; /* * Post the appropriate ereport. If the 'prevstate' field is * set to something other than VDEV_STATE_UNKNOWN, it indicates * that this is part of a vdev_reopen(). In this case, we don't * want to post the ereport if the device was already in the * CANT_OPEN state beforehand. * * If the 'checkremove' flag is set, then this is an attempt to * online the device in response to an insertion event. 
If we * hit this case, then we have detected an insertion event for a * faulted or offline device that wasn't in the removed state. * In this scenario, we don't post an ereport because we are * about to replace the device, or attempt an online with * vdev_forcefault, which will generate the fault for us. */ if ((vd->vdev_prevstate != state || vd->vdev_forcefault) && !vd->vdev_not_present && !vd->vdev_checkremove && vd != spa->spa_root_vdev) { const char *class; switch (aux) { case VDEV_AUX_OPEN_FAILED: class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED; break; case VDEV_AUX_CORRUPT_DATA: class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA; break; case VDEV_AUX_NO_REPLICAS: class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS; break; case VDEV_AUX_BAD_GUID_SUM: class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM; break; case VDEV_AUX_TOO_SMALL: class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL; break; case VDEV_AUX_BAD_LABEL: class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL; break; default: class = FM_EREPORT_ZFS_DEVICE_UNKNOWN; } zfs_ereport_post(class, spa, vd, NULL, save_state, 0); } /* Erase any notion of persistent removed state */ vd->vdev_removed = B_FALSE; } else { vd->vdev_removed = B_FALSE; } /* * Notify the fmd of the state change. Be verbose and post * notifications even for stuff that's not important; the fmd agent can * sort it out. Don't emit state change events for non-leaf vdevs since * they can't change state on their own. The FMD can check their state * if it wants to when it sees that a leaf vdev had a state change. */ if (vd->vdev_ops->vdev_op_leaf) zfs_post_state_change(spa, vd); if (!isopen && vd->vdev_parent) vdev_propagate_state(vd->vdev_parent); } /* * Check the vdev configuration to ensure that it's capable of supporting * a root pool. We do not support partial configuration. * In addition, only a single top-level vdev is allowed. * * FreeBSD does not have above limitations. */ boolean_t vdev_is_bootable(vdev_t *vd) { #ifdef illumos if (!vd->vdev_ops->vdev_op_leaf) { char *vdev_type = vd->vdev_ops->vdev_op_type; if (strcmp(vdev_type, VDEV_TYPE_ROOT) == 0 && vd->vdev_children > 1) { return (B_FALSE); } else if (strcmp(vdev_type, VDEV_TYPE_MISSING) == 0) { return (B_FALSE); } } for (int c = 0; c < vd->vdev_children; c++) { if (!vdev_is_bootable(vd->vdev_child[c])) return (B_FALSE); } #endif /* illumos */ return (B_TRUE); } /* * Load the state from the original vdev tree (ovd) which * we've retrieved from the MOS config object. If the original * vdev was offline or faulted then we transfer that state to the * device in the current vdev tree (nvd). */ void vdev_load_log_state(vdev_t *nvd, vdev_t *ovd) { spa_t *spa = nvd->vdev_spa; ASSERT(nvd->vdev_top->vdev_islog); ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); ASSERT3U(nvd->vdev_guid, ==, ovd->vdev_guid); for (int c = 0; c < nvd->vdev_children; c++) vdev_load_log_state(nvd->vdev_child[c], ovd->vdev_child[c]); if (nvd->vdev_ops->vdev_op_leaf) { /* * Restore the persistent vdev state */ nvd->vdev_offline = ovd->vdev_offline; nvd->vdev_faulted = ovd->vdev_faulted; nvd->vdev_degraded = ovd->vdev_degraded; nvd->vdev_removed = ovd->vdev_removed; } } /* * Determine if a log device has valid content. If the vdev was * removed or faulted in the MOS config then we know that * the content on the log device has already been written to the pool. 
*/ boolean_t vdev_log_state_valid(vdev_t *vd) { if (vd->vdev_ops->vdev_op_leaf && !vd->vdev_faulted && !vd->vdev_removed) return (B_TRUE); for (int c = 0; c < vd->vdev_children; c++) if (vdev_log_state_valid(vd->vdev_child[c])) return (B_TRUE); return (B_FALSE); } /* * Expand a vdev if possible. */ void vdev_expand(vdev_t *vd, uint64_t txg) { ASSERT(vd->vdev_top == vd); ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count) { VERIFY(vdev_metaslab_init(vd, txg) == 0); vdev_config_dirty(vd); } } /* * Split a vdev. */ void vdev_split(vdev_t *vd) { vdev_t *cvd, *pvd = vd->vdev_parent; vdev_remove_child(pvd, vd); vdev_compact_children(pvd); cvd = pvd->vdev_child[0]; if (pvd->vdev_children == 1) { vdev_remove_parent(cvd); cvd->vdev_splitting = B_TRUE; } vdev_propagate_state(cvd); } void vdev_deadman(vdev_t *vd) { for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; vdev_deadman(cvd); } if (vd->vdev_ops->vdev_op_leaf) { vdev_queue_t *vq = &vd->vdev_queue; mutex_enter(&vq->vq_lock); if (avl_numnodes(&vq->vq_active_tree) > 0) { spa_t *spa = vd->vdev_spa; zio_t *fio; uint64_t delta; /* * Look at the head of all the pending queues, * if any I/O has been outstanding for longer than * the spa_deadman_synctime we panic the system. */ fio = avl_first(&vq->vq_active_tree); delta = gethrtime() - fio->io_timestamp; if (delta > spa_deadman_synctime(spa)) { zfs_dbgmsg("SLOW IO: zio timestamp %lluns, " "delta %lluns, last io %lluns", fio->io_timestamp, delta, vq->vq_io_complete_ts); fm_panic("I/O to pool '%s' appears to be " "hung on vdev guid %llu at '%s'.", spa_name(spa), (long long unsigned int) vd->vdev_guid, vd->vdev_path); } } mutex_exit(&vq->vq_lock); } } Index: head/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h (revision 329689) +++ head/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h (revision 329690) @@ -1,1032 +1,1066 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2016 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright (c) 2012, Martin Matuska . All rights reserved. * Copyright (c) 2014 Integros [integros.com] + * Copyright 2017 Joyent, Inc. * Copyright (c) 2017 Datto Inc. */ /* Portions Copyright 2010 Robert Milkowski */ #ifndef _SYS_FS_ZFS_H #define _SYS_FS_ZFS_H #include #include #include #ifdef __cplusplus extern "C" { #endif /* * Types and constants shared between userland and the kernel. 
*/ /* * Each dataset can be one of the following types. These constants can be * combined into masks that can be passed to various functions. */ typedef enum { ZFS_TYPE_FILESYSTEM = (1 << 0), ZFS_TYPE_SNAPSHOT = (1 << 1), ZFS_TYPE_VOLUME = (1 << 2), ZFS_TYPE_POOL = (1 << 3), ZFS_TYPE_BOOKMARK = (1 << 4) } zfs_type_t; /* * NB: lzc_dataset_type should be updated whenever a new objset type is added, * if it represents a real type of a dataset that can be created from userland. */ typedef enum dmu_objset_type { DMU_OST_NONE, DMU_OST_META, DMU_OST_ZFS, DMU_OST_ZVOL, DMU_OST_OTHER, /* For testing only! */ DMU_OST_ANY, /* Be careful! */ DMU_OST_NUMTYPES } dmu_objset_type_t; #define ZFS_TYPE_DATASET \ (ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME | ZFS_TYPE_SNAPSHOT) /* * All of these include the terminating NUL byte. */ #define ZAP_MAXNAMELEN 256 #define ZAP_MAXVALUELEN (1024 * 8) #define ZAP_OLDMAXVALUELEN 1024 #define ZFS_MAX_DATASET_NAME_LEN 256 /* * Dataset properties are identified by these constants and must be added to * the end of this list to ensure that external consumers are not affected * by the change. If you make any changes to this list, be sure to update * the property table in usr/src/common/zfs/zfs_prop.c. */ typedef enum { ZPROP_CONT = -2, ZPROP_INVAL = -1, ZFS_PROP_TYPE = 0, ZFS_PROP_CREATION, ZFS_PROP_USED, ZFS_PROP_AVAILABLE, ZFS_PROP_REFERENCED, ZFS_PROP_COMPRESSRATIO, ZFS_PROP_MOUNTED, ZFS_PROP_ORIGIN, ZFS_PROP_QUOTA, ZFS_PROP_RESERVATION, ZFS_PROP_VOLSIZE, ZFS_PROP_VOLBLOCKSIZE, ZFS_PROP_RECORDSIZE, ZFS_PROP_MOUNTPOINT, ZFS_PROP_SHARENFS, ZFS_PROP_CHECKSUM, ZFS_PROP_COMPRESSION, ZFS_PROP_ATIME, ZFS_PROP_DEVICES, ZFS_PROP_EXEC, ZFS_PROP_SETUID, ZFS_PROP_READONLY, ZFS_PROP_ZONED, ZFS_PROP_SNAPDIR, ZFS_PROP_ACLMODE, ZFS_PROP_ACLINHERIT, ZFS_PROP_CREATETXG, /* not exposed to the user */ ZFS_PROP_NAME, /* not exposed to the user */ ZFS_PROP_CANMOUNT, ZFS_PROP_ISCSIOPTIONS, /* not exposed to the user */ ZFS_PROP_XATTR, ZFS_PROP_NUMCLONES, /* not exposed to the user */ ZFS_PROP_COPIES, ZFS_PROP_VERSION, ZFS_PROP_UTF8ONLY, ZFS_PROP_NORMALIZE, ZFS_PROP_CASE, ZFS_PROP_VSCAN, ZFS_PROP_NBMAND, ZFS_PROP_SHARESMB, ZFS_PROP_REFQUOTA, ZFS_PROP_REFRESERVATION, ZFS_PROP_GUID, ZFS_PROP_PRIMARYCACHE, ZFS_PROP_SECONDARYCACHE, ZFS_PROP_USEDSNAP, ZFS_PROP_USEDDS, ZFS_PROP_USEDCHILD, ZFS_PROP_USEDREFRESERV, ZFS_PROP_USERACCOUNTING, /* not exposed to the user */ ZFS_PROP_STMF_SHAREINFO, /* not exposed to the user */ ZFS_PROP_DEFER_DESTROY, ZFS_PROP_USERREFS, ZFS_PROP_LOGBIAS, ZFS_PROP_UNIQUE, /* not exposed to the user */ ZFS_PROP_OBJSETID, /* not exposed to the user */ ZFS_PROP_DEDUP, ZFS_PROP_MLSLABEL, ZFS_PROP_SYNC, ZFS_PROP_REFRATIO, ZFS_PROP_WRITTEN, ZFS_PROP_CLONES, ZFS_PROP_LOGICALUSED, ZFS_PROP_LOGICALREFERENCED, ZFS_PROP_INCONSISTENT, /* not exposed to the user */ ZFS_PROP_VOLMODE, ZFS_PROP_FILESYSTEM_LIMIT, ZFS_PROP_SNAPSHOT_LIMIT, ZFS_PROP_FILESYSTEM_COUNT, ZFS_PROP_SNAPSHOT_COUNT, ZFS_PROP_REDUNDANT_METADATA, ZFS_PROP_PREV_SNAP, ZFS_PROP_RECEIVE_RESUME_TOKEN, ZFS_NUM_PROPS } zfs_prop_t; typedef enum { ZFS_PROP_USERUSED, ZFS_PROP_USERQUOTA, ZFS_PROP_GROUPUSED, ZFS_PROP_GROUPQUOTA, ZFS_NUM_USERQUOTA_PROPS } zfs_userquota_prop_t; extern const char *zfs_userquota_prop_prefixes[ZFS_NUM_USERQUOTA_PROPS]; /* * Pool properties are identified by these constants and must be added to the * end of this list to ensure that external consumers are not affected * by the change. If you make any changes to this list, be sure to update * the property table in usr/src/common/zfs/zpool_prop.c. 
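The zfs_type_t values defined above are single-bit flags, so callers combine and test them as masks (ZFS_TYPE_DATASET being one such mask). A minimal userland sketch of that usage, assuming <sys/fs/zfs.h> is installed and usable from user code:

#include <stdio.h>
#include <sys/fs/zfs.h>

/* True when the given type names a dataset (filesystem, volume, snapshot). */
static int
type_is_dataset(zfs_type_t t)
{
        return ((t & ZFS_TYPE_DATASET) != 0);
}

int
main(void)
{
        printf("snapshot: %d\n", type_is_dataset(ZFS_TYPE_SNAPSHOT)); /* 1 */
        printf("pool:     %d\n", type_is_dataset(ZFS_TYPE_POOL));     /* 0 */
        return (0);
}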
*/ typedef enum { ZPOOL_PROP_INVAL = -1, ZPOOL_PROP_NAME, ZPOOL_PROP_SIZE, ZPOOL_PROP_CAPACITY, ZPOOL_PROP_ALTROOT, ZPOOL_PROP_HEALTH, ZPOOL_PROP_GUID, ZPOOL_PROP_VERSION, ZPOOL_PROP_BOOTFS, ZPOOL_PROP_DELEGATION, ZPOOL_PROP_AUTOREPLACE, ZPOOL_PROP_CACHEFILE, ZPOOL_PROP_FAILUREMODE, ZPOOL_PROP_LISTSNAPS, ZPOOL_PROP_AUTOEXPAND, ZPOOL_PROP_DEDUPDITTO, ZPOOL_PROP_DEDUPRATIO, ZPOOL_PROP_FREE, ZPOOL_PROP_ALLOCATED, ZPOOL_PROP_READONLY, ZPOOL_PROP_COMMENT, ZPOOL_PROP_EXPANDSZ, ZPOOL_PROP_FREEING, ZPOOL_PROP_FRAGMENTATION, ZPOOL_PROP_LEAKED, ZPOOL_PROP_MAXBLOCKSIZE, ZPOOL_PROP_BOOTSIZE, ZPOOL_NUM_PROPS } zpool_prop_t; /* Small enough to not hog a whole line of printout in zpool(1M). */ #define ZPROP_MAX_COMMENT 32 #define ZPROP_VALUE "value" #define ZPROP_SOURCE "source" typedef enum { ZPROP_SRC_NONE = 0x1, ZPROP_SRC_DEFAULT = 0x2, ZPROP_SRC_TEMPORARY = 0x4, ZPROP_SRC_LOCAL = 0x8, ZPROP_SRC_INHERITED = 0x10, ZPROP_SRC_RECEIVED = 0x20 } zprop_source_t; #define ZPROP_SRC_ALL 0x3f #define ZPROP_SOURCE_VAL_RECVD "$recvd" #define ZPROP_N_MORE_ERRORS "N_MORE_ERRORS" /* * Dataset flag implemented as a special entry in the props zap object * indicating that the dataset has received properties on or after * SPA_VERSION_RECVD_PROPS. The first such receive blows away local properties * just as it did in earlier versions, and thereafter, local properties are * preserved. */ #define ZPROP_HAS_RECVD "$hasrecvd" typedef enum { ZPROP_ERR_NOCLEAR = 0x1, /* failure to clear existing props */ ZPROP_ERR_NORESTORE = 0x2 /* failure to restore props on error */ } zprop_errflags_t; typedef int (*zprop_func)(int, void *); /* * Properties to be set on the root file system of a new pool * are stuffed into their own nvlist, which is then included in * the properties nvlist with the pool properties. */ #define ZPOOL_ROOTFS_PROPS "root-props-nvl" /* * Length of 'written@' and 'written#' */ #define ZFS_WRITTEN_PROP_PREFIX_LEN 8 /* * Dataset property functions shared between libzfs and kernel. */ const char *zfs_prop_default_string(zfs_prop_t); uint64_t zfs_prop_default_numeric(zfs_prop_t); boolean_t zfs_prop_readonly(zfs_prop_t); boolean_t zfs_prop_visible(zfs_prop_t prop); boolean_t zfs_prop_inheritable(zfs_prop_t); boolean_t zfs_prop_setonce(zfs_prop_t); const char *zfs_prop_to_name(zfs_prop_t); zfs_prop_t zfs_name_to_prop(const char *); boolean_t zfs_prop_user(const char *); boolean_t zfs_prop_userquota(const char *); int zfs_prop_index_to_string(zfs_prop_t, uint64_t, const char **); int zfs_prop_string_to_index(zfs_prop_t, const char *, uint64_t *); uint64_t zfs_prop_random_value(zfs_prop_t, uint64_t seed); boolean_t zfs_prop_valid_for_type(int, zfs_type_t); /* * Pool property functions shared between libzfs and kernel. */ zpool_prop_t zpool_name_to_prop(const char *); const char *zpool_prop_to_name(zpool_prop_t); const char *zpool_prop_default_string(zpool_prop_t); uint64_t zpool_prop_default_numeric(zpool_prop_t); boolean_t zpool_prop_readonly(zpool_prop_t); boolean_t zpool_prop_feature(const char *); boolean_t zpool_prop_unsupported(const char *name); int zpool_prop_index_to_string(zpool_prop_t, uint64_t, const char **); int zpool_prop_string_to_index(zpool_prop_t, const char *, uint64_t *); uint64_t zpool_prop_random_value(zpool_prop_t, uint64_t seed); /* * Definitions for the Delegation. 
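The property lookup functions declared above are shared between libzfs and the kernel. A short sketch of round-tripping a property name through them, assuming a userland build linked against a library that provides these symbols (e.g. libzfs):

#include <stdio.h>
#include <sys/fs/zfs.h>

int
main(void)
{
        zfs_prop_t p = zfs_name_to_prop("compression");

        if (p == ZPROP_INVAL) {
                fprintf(stderr, "unknown property\n");
                return (1);
        }
        printf("prop %d -> %s\n", (int)p, zfs_prop_to_name(p));
        return (0);
}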
*/ typedef enum { ZFS_DELEG_WHO_UNKNOWN = 0, ZFS_DELEG_USER = 'u', ZFS_DELEG_USER_SETS = 'U', ZFS_DELEG_GROUP = 'g', ZFS_DELEG_GROUP_SETS = 'G', ZFS_DELEG_EVERYONE = 'e', ZFS_DELEG_EVERYONE_SETS = 'E', ZFS_DELEG_CREATE = 'c', ZFS_DELEG_CREATE_SETS = 'C', ZFS_DELEG_NAMED_SET = 's', ZFS_DELEG_NAMED_SET_SETS = 'S' } zfs_deleg_who_type_t; typedef enum { ZFS_DELEG_NONE = 0, ZFS_DELEG_PERM_LOCAL = 1, ZFS_DELEG_PERM_DESCENDENT = 2, ZFS_DELEG_PERM_LOCALDESCENDENT = 3, ZFS_DELEG_PERM_CREATE = 4 } zfs_deleg_inherit_t; #define ZFS_DELEG_PERM_UID "uid" #define ZFS_DELEG_PERM_GID "gid" #define ZFS_DELEG_PERM_GROUPS "groups" #define ZFS_MLSLABEL_DEFAULT "none" #define ZFS_SMB_ACL_SRC "src" #define ZFS_SMB_ACL_TARGET "target" typedef enum { ZFS_CANMOUNT_OFF = 0, ZFS_CANMOUNT_ON = 1, ZFS_CANMOUNT_NOAUTO = 2 } zfs_canmount_type_t; typedef enum { ZFS_LOGBIAS_LATENCY = 0, ZFS_LOGBIAS_THROUGHPUT = 1 } zfs_logbias_op_t; typedef enum zfs_share_op { ZFS_SHARE_NFS = 0, ZFS_UNSHARE_NFS = 1, ZFS_SHARE_SMB = 2, ZFS_UNSHARE_SMB = 3 } zfs_share_op_t; typedef enum zfs_smb_acl_op { ZFS_SMB_ACL_ADD, ZFS_SMB_ACL_REMOVE, ZFS_SMB_ACL_RENAME, ZFS_SMB_ACL_PURGE } zfs_smb_acl_op_t; typedef enum zfs_cache_type { ZFS_CACHE_NONE = 0, ZFS_CACHE_METADATA = 1, ZFS_CACHE_ALL = 2 } zfs_cache_type_t; typedef enum { ZFS_SYNC_STANDARD = 0, ZFS_SYNC_ALWAYS = 1, ZFS_SYNC_DISABLED = 2 } zfs_sync_type_t; typedef enum { ZFS_VOLMODE_DEFAULT = 0, ZFS_VOLMODE_GEOM = 1, ZFS_VOLMODE_DEV = 2, ZFS_VOLMODE_NONE = 3 } zfs_volmode_t; typedef enum { ZFS_REDUNDANT_METADATA_ALL, ZFS_REDUNDANT_METADATA_MOST } zfs_redundant_metadata_type_t; /* * On-disk version number. */ #define SPA_VERSION_1 1ULL #define SPA_VERSION_2 2ULL #define SPA_VERSION_3 3ULL #define SPA_VERSION_4 4ULL #define SPA_VERSION_5 5ULL #define SPA_VERSION_6 6ULL #define SPA_VERSION_7 7ULL #define SPA_VERSION_8 8ULL #define SPA_VERSION_9 9ULL #define SPA_VERSION_10 10ULL #define SPA_VERSION_11 11ULL #define SPA_VERSION_12 12ULL #define SPA_VERSION_13 13ULL #define SPA_VERSION_14 14ULL #define SPA_VERSION_15 15ULL #define SPA_VERSION_16 16ULL #define SPA_VERSION_17 17ULL #define SPA_VERSION_18 18ULL #define SPA_VERSION_19 19ULL #define SPA_VERSION_20 20ULL #define SPA_VERSION_21 21ULL #define SPA_VERSION_22 22ULL #define SPA_VERSION_23 23ULL #define SPA_VERSION_24 24ULL #define SPA_VERSION_25 25ULL #define SPA_VERSION_26 26ULL #define SPA_VERSION_27 27ULL #define SPA_VERSION_28 28ULL #define SPA_VERSION_5000 5000ULL /* * When bumping up SPA_VERSION, make sure GRUB ZFS understands the on-disk * format change. Go to usr/src/grub/grub-0.97/stage2/{zfs-include/, fsys_zfs*}, * and do the appropriate changes. Also bump the version number in * usr/src/grub/capability. */ #define SPA_VERSION SPA_VERSION_5000 #define SPA_VERSION_STRING "5000" /* * Symbolic names for the changes that caused a SPA_VERSION switch. * Used in the code when checking for presence or absence of a feature. * Feel free to define multiple symbolic names for each version if there * were multiple changes to on-disk structures during that version. * * NOTE: When checking the current SPA_VERSION in your code, be sure * to use spa_version() since it reports the version of the * last synced uberblock. Checking the in-flight version can * be dangerous in some cases. 
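One detail of the zfs_deleg_who_type_t encoding above is that the *_SETS variants are the uppercase counterparts of the basic who-types. A small sketch of testing for a "sets" entry, using only the enumerators defined above:

/* True for the *_SETS variants of zfs_deleg_who_type_t. */
static int
deleg_who_is_set(zfs_deleg_who_type_t who)
{
        switch (who) {
        case ZFS_DELEG_USER_SETS:
        case ZFS_DELEG_GROUP_SETS:
        case ZFS_DELEG_EVERYONE_SETS:
        case ZFS_DELEG_CREATE_SETS:
        case ZFS_DELEG_NAMED_SET_SETS:
                return (1);
        default:
                return (0);
        }
}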
*/ #define SPA_VERSION_INITIAL SPA_VERSION_1 #define SPA_VERSION_DITTO_BLOCKS SPA_VERSION_2 #define SPA_VERSION_SPARES SPA_VERSION_3 #define SPA_VERSION_RAIDZ2 SPA_VERSION_3 #define SPA_VERSION_BPOBJ_ACCOUNT SPA_VERSION_3 #define SPA_VERSION_RAIDZ_DEFLATE SPA_VERSION_3 #define SPA_VERSION_DNODE_BYTES SPA_VERSION_3 #define SPA_VERSION_ZPOOL_HISTORY SPA_VERSION_4 #define SPA_VERSION_GZIP_COMPRESSION SPA_VERSION_5 #define SPA_VERSION_BOOTFS SPA_VERSION_6 #define SPA_VERSION_SLOGS SPA_VERSION_7 #define SPA_VERSION_DELEGATED_PERMS SPA_VERSION_8 #define SPA_VERSION_FUID SPA_VERSION_9 #define SPA_VERSION_REFRESERVATION SPA_VERSION_9 #define SPA_VERSION_REFQUOTA SPA_VERSION_9 #define SPA_VERSION_UNIQUE_ACCURATE SPA_VERSION_9 #define SPA_VERSION_L2CACHE SPA_VERSION_10 #define SPA_VERSION_NEXT_CLONES SPA_VERSION_11 #define SPA_VERSION_ORIGIN SPA_VERSION_11 #define SPA_VERSION_DSL_SCRUB SPA_VERSION_11 #define SPA_VERSION_SNAP_PROPS SPA_VERSION_12 #define SPA_VERSION_USED_BREAKDOWN SPA_VERSION_13 #define SPA_VERSION_PASSTHROUGH_X SPA_VERSION_14 #define SPA_VERSION_USERSPACE SPA_VERSION_15 #define SPA_VERSION_STMF_PROP SPA_VERSION_16 #define SPA_VERSION_RAIDZ3 SPA_VERSION_17 #define SPA_VERSION_USERREFS SPA_VERSION_18 #define SPA_VERSION_HOLES SPA_VERSION_19 #define SPA_VERSION_ZLE_COMPRESSION SPA_VERSION_20 #define SPA_VERSION_DEDUP SPA_VERSION_21 #define SPA_VERSION_RECVD_PROPS SPA_VERSION_22 #define SPA_VERSION_SLIM_ZIL SPA_VERSION_23 #define SPA_VERSION_SA SPA_VERSION_24 #define SPA_VERSION_SCAN SPA_VERSION_25 #define SPA_VERSION_DIR_CLONES SPA_VERSION_26 #define SPA_VERSION_DEADLISTS SPA_VERSION_26 #define SPA_VERSION_FAST_SNAP SPA_VERSION_27 #define SPA_VERSION_MULTI_REPLACE SPA_VERSION_28 #define SPA_VERSION_BEFORE_FEATURES SPA_VERSION_28 #define SPA_VERSION_FEATURES SPA_VERSION_5000 #define SPA_VERSION_IS_SUPPORTED(v) \ (((v) >= SPA_VERSION_INITIAL && (v) <= SPA_VERSION_BEFORE_FEATURES) || \ ((v) >= SPA_VERSION_FEATURES && (v) <= SPA_VERSION)) /* * ZPL version - rev'd whenever an incompatible on-disk format change * occurs. This is independent of SPA/DMU/ZAP versioning. You must * also update the version_table[] and help message in zfs_prop.c. * * When changing, be sure to teach GRUB how to read the new format! 
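A minimal check of the SPA_VERSION_IS_SUPPORTED() macro defined above: legacy versions 1 through 28 and version 5000 pass, values in between do not. Assumes <sys/fs/zfs.h> is available to a userland program:

#include <stdio.h>
#include <stdint.h>
#include <sys/fs/zfs.h>

int
main(void)
{
        uint64_t v[] = { SPA_VERSION_28, 29ULL, SPA_VERSION_5000 };

        for (int i = 0; i < 3; i++)
                printf("version %llu supported: %s\n",
                    (unsigned long long)v[i],
                    SPA_VERSION_IS_SUPPORTED(v[i]) ? "yes" : "no");
        return (0);
}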
* See usr/src/grub/grub-0.97/stage2/{zfs-include/,fsys_zfs*} */ #define ZPL_VERSION_1 1ULL #define ZPL_VERSION_2 2ULL #define ZPL_VERSION_3 3ULL #define ZPL_VERSION_4 4ULL #define ZPL_VERSION_5 5ULL #define ZPL_VERSION ZPL_VERSION_5 #define ZPL_VERSION_STRING "5" #define ZPL_VERSION_INITIAL ZPL_VERSION_1 #define ZPL_VERSION_DIRENT_TYPE ZPL_VERSION_2 #define ZPL_VERSION_FUID ZPL_VERSION_3 #define ZPL_VERSION_NORMALIZATION ZPL_VERSION_3 #define ZPL_VERSION_SYSATTR ZPL_VERSION_3 #define ZPL_VERSION_USERSPACE ZPL_VERSION_4 #define ZPL_VERSION_SA ZPL_VERSION_5 /* Rewind request information */ #define ZPOOL_NO_REWIND 1 /* No policy - default behavior */ #define ZPOOL_NEVER_REWIND 2 /* Do not search for best txg or rewind */ #define ZPOOL_TRY_REWIND 4 /* Search for best txg, but do not rewind */ #define ZPOOL_DO_REWIND 8 /* Rewind to best txg w/in deferred frees */ #define ZPOOL_EXTREME_REWIND 16 /* Allow extreme measures to find best txg */ #define ZPOOL_REWIND_MASK 28 /* All the possible rewind bits */ #define ZPOOL_REWIND_POLICIES 31 /* All the possible policy bits */ typedef struct zpool_rewind_policy { uint32_t zrp_request; /* rewind behavior requested */ uint64_t zrp_maxmeta; /* max acceptable meta-data errors */ uint64_t zrp_maxdata; /* max acceptable data errors */ uint64_t zrp_txg; /* specific txg to load */ } zpool_rewind_policy_t; /* * The following are configuration names used in the nvlist describing a pool's * configuration. */ #define ZPOOL_CONFIG_VERSION "version" #define ZPOOL_CONFIG_POOL_NAME "name" #define ZPOOL_CONFIG_POOL_STATE "state" #define ZPOOL_CONFIG_POOL_TXG "txg" #define ZPOOL_CONFIG_POOL_GUID "pool_guid" #define ZPOOL_CONFIG_CREATE_TXG "create_txg" #define ZPOOL_CONFIG_TOP_GUID "top_guid" #define ZPOOL_CONFIG_VDEV_TREE "vdev_tree" #define ZPOOL_CONFIG_TYPE "type" #define ZPOOL_CONFIG_CHILDREN "children" #define ZPOOL_CONFIG_ID "id" #define ZPOOL_CONFIG_GUID "guid" #define ZPOOL_CONFIG_PATH "path" #define ZPOOL_CONFIG_DEVID "devid" #define ZPOOL_CONFIG_METASLAB_ARRAY "metaslab_array" #define ZPOOL_CONFIG_METASLAB_SHIFT "metaslab_shift" #define ZPOOL_CONFIG_ASHIFT "ashift" #define ZPOOL_CONFIG_ASIZE "asize" #define ZPOOL_CONFIG_DTL "DTL" #define ZPOOL_CONFIG_SCAN_STATS "scan_stats" /* not stored on disk */ #define ZPOOL_CONFIG_VDEV_STATS "vdev_stats" /* not stored on disk */ #define ZPOOL_CONFIG_WHOLE_DISK "whole_disk" #define ZPOOL_CONFIG_ERRCOUNT "error_count" #define ZPOOL_CONFIG_NOT_PRESENT "not_present" #define ZPOOL_CONFIG_SPARES "spares" #define ZPOOL_CONFIG_IS_SPARE "is_spare" #define ZPOOL_CONFIG_NPARITY "nparity" #define ZPOOL_CONFIG_HOSTID "hostid" #define ZPOOL_CONFIG_HOSTNAME "hostname" #define ZPOOL_CONFIG_LOADED_TIME "initial_load_time" #define ZPOOL_CONFIG_UNSPARE "unspare" #define ZPOOL_CONFIG_PHYS_PATH "phys_path" #define ZPOOL_CONFIG_IS_LOG "is_log" #define ZPOOL_CONFIG_L2CACHE "l2cache" #define ZPOOL_CONFIG_HOLE_ARRAY "hole_array" #define ZPOOL_CONFIG_VDEV_CHILDREN "vdev_children" #define ZPOOL_CONFIG_IS_HOLE "is_hole" #define ZPOOL_CONFIG_DDT_HISTOGRAM "ddt_histogram" #define ZPOOL_CONFIG_DDT_OBJ_STATS "ddt_object_stats" #define ZPOOL_CONFIG_DDT_STATS "ddt_stats" #define ZPOOL_CONFIG_SPLIT "splitcfg" #define ZPOOL_CONFIG_ORIG_GUID "orig_guid" #define ZPOOL_CONFIG_SPLIT_GUID "split_guid" #define ZPOOL_CONFIG_SPLIT_LIST "guid_list" #define ZPOOL_CONFIG_REMOVING "removing" #define ZPOOL_CONFIG_RESILVER_TXG "resilver_txg" #define ZPOOL_CONFIG_COMMENT "comment" #define ZPOOL_CONFIG_SUSPENDED "suspended" /* not stored on disk */ #define 
ZPOOL_CONFIG_TIMESTAMP "timestamp" /* not stored on disk */ #define ZPOOL_CONFIG_BOOTFS "bootfs" /* not stored on disk */ #define ZPOOL_CONFIG_MISSING_DEVICES "missing_vdevs" /* not stored on disk */ #define ZPOOL_CONFIG_LOAD_INFO "load_info" /* not stored on disk */ #define ZPOOL_CONFIG_REWIND_INFO "rewind_info" /* not stored on disk */ #define ZPOOL_CONFIG_UNSUP_FEAT "unsup_feat" /* not stored on disk */ #define ZPOOL_CONFIG_ENABLED_FEAT "enabled_feat" /* not stored on disk */ #define ZPOOL_CONFIG_CAN_RDONLY "can_rdonly" /* not stored on disk */ #define ZPOOL_CONFIG_FEATURES_FOR_READ "features_for_read" #define ZPOOL_CONFIG_FEATURE_STATS "feature_stats" /* not stored on disk */ #define ZPOOL_CONFIG_VDEV_TOP_ZAP "com.delphix:vdev_zap_top" #define ZPOOL_CONFIG_VDEV_LEAF_ZAP "com.delphix:vdev_zap_leaf" #define ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS "com.delphix:has_per_vdev_zaps" /* * The persistent vdev state is stored as separate values rather than a single * 'vdev_state' entry. This is because a device can be in multiple states, such * as offline and degraded. */ #define ZPOOL_CONFIG_OFFLINE "offline" #define ZPOOL_CONFIG_FAULTED "faulted" #define ZPOOL_CONFIG_DEGRADED "degraded" #define ZPOOL_CONFIG_REMOVED "removed" #define ZPOOL_CONFIG_FRU "fru" #define ZPOOL_CONFIG_AUX_STATE "aux_state" /* Rewind policy parameters */ #define ZPOOL_REWIND_POLICY "rewind-policy" #define ZPOOL_REWIND_REQUEST "rewind-request" #define ZPOOL_REWIND_REQUEST_TXG "rewind-request-txg" #define ZPOOL_REWIND_META_THRESH "rewind-meta-thresh" #define ZPOOL_REWIND_DATA_THRESH "rewind-data-thresh" /* Rewind data discovered */ #define ZPOOL_CONFIG_LOAD_TIME "rewind_txg_ts" #define ZPOOL_CONFIG_LOAD_DATA_ERRORS "verify_data_errors" #define ZPOOL_CONFIG_REWIND_TIME "seconds_of_rewind" #define VDEV_TYPE_ROOT "root" #define VDEV_TYPE_MIRROR "mirror" #define VDEV_TYPE_REPLACING "replacing" #define VDEV_TYPE_RAIDZ "raidz" #define VDEV_TYPE_DISK "disk" #define VDEV_TYPE_FILE "file" #define VDEV_TYPE_MISSING "missing" #define VDEV_TYPE_HOLE "hole" #define VDEV_TYPE_SPARE "spare" #define VDEV_TYPE_LOG "log" #define VDEV_TYPE_L2CACHE "l2cache" /* * This is needed in userland to report the minimum necessary device size. * * Note that the zfs test suite uses 64MB vdevs. */ #define SPA_MINDEVSIZE (64ULL << 20) /* * Set if the fragmentation has not yet been calculated. This can happen * because the space maps have not been upgraded or the histogram feature * is not enabled. */ #define ZFS_FRAG_INVALID UINT64_MAX /* * The location of the pool configuration repository, shared between kernel and * userland. */ #define ZPOOL_CACHE "/boot/zfs/zpool.cache" /* * vdev states are ordered from least to most healthy. * A vdev that's CANT_OPEN or below is considered unusable. */ typedef enum vdev_state { VDEV_STATE_UNKNOWN = 0, /* Uninitialized vdev */ VDEV_STATE_CLOSED, /* Not currently open */ VDEV_STATE_OFFLINE, /* Not allowed to open */ VDEV_STATE_REMOVED, /* Explicitly removed from system */ VDEV_STATE_CANT_OPEN, /* Tried to open, but failed */ VDEV_STATE_FAULTED, /* External request to fault device */ VDEV_STATE_DEGRADED, /* Replicated vdev with unhealthy kids */ VDEV_STATE_HEALTHY /* Presumed good */ } vdev_state_t; #define VDEV_STATE_ONLINE VDEV_STATE_HEALTHY /* * vdev aux states. When a vdev is in the CANT_OPEN state, the aux field * of the vdev stats structure uses these constants to distinguish why. 
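Per the ordering comment on vdev_state_t above (states run from least to most healthy, and anything at or below CANT_OPEN is unusable), a usability test reduces to a single comparison. A sketch, using only the enum defined above:

/* Per the state ordering above: CANT_OPEN and below are unusable. */
static int
vdev_state_usable(vdev_state_t state)
{
        return (state > VDEV_STATE_CANT_OPEN);
}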
*/ typedef enum vdev_aux { VDEV_AUX_NONE, /* no error */ VDEV_AUX_OPEN_FAILED, /* ldi_open_*() or vn_open() failed */ VDEV_AUX_CORRUPT_DATA, /* bad label or disk contents */ VDEV_AUX_NO_REPLICAS, /* insufficient number of replicas */ VDEV_AUX_BAD_GUID_SUM, /* vdev guid sum doesn't match */ VDEV_AUX_TOO_SMALL, /* vdev size is too small */ VDEV_AUX_BAD_LABEL, /* the label is OK but invalid */ VDEV_AUX_VERSION_NEWER, /* on-disk version is too new */ VDEV_AUX_VERSION_OLDER, /* on-disk version is too old */ VDEV_AUX_UNSUP_FEAT, /* unsupported features */ VDEV_AUX_SPARED, /* hot spare used in another pool */ VDEV_AUX_ERR_EXCEEDED, /* too many errors */ VDEV_AUX_IO_FAILURE, /* experienced I/O failure */ VDEV_AUX_BAD_LOG, /* cannot read log chain(s) */ VDEV_AUX_EXTERNAL, /* external diagnosis */ VDEV_AUX_SPLIT_POOL, /* vdev was split off into another pool */ VDEV_AUX_ASHIFT_TOO_BIG /* vdev's min block size is too large */ } vdev_aux_t; /* * pool state. The following states are written to disk as part of the normal * SPA lifecycle: ACTIVE, EXPORTED, DESTROYED, SPARE, L2CACHE. The remaining * states are software abstractions used at various levels to communicate * pool state. */ typedef enum pool_state { POOL_STATE_ACTIVE = 0, /* In active use */ POOL_STATE_EXPORTED, /* Explicitly exported */ POOL_STATE_DESTROYED, /* Explicitly destroyed */ POOL_STATE_SPARE, /* Reserved for hot spare use */ POOL_STATE_L2CACHE, /* Level 2 ARC device */ POOL_STATE_UNINITIALIZED, /* Internal spa_t state */ POOL_STATE_UNAVAIL, /* Internal libzfs state */ POOL_STATE_POTENTIALLY_ACTIVE /* Internal libzfs state */ } pool_state_t; /* * Scan Functions. */ typedef enum pool_scan_func { POOL_SCAN_NONE, POOL_SCAN_SCRUB, POOL_SCAN_RESILVER, POOL_SCAN_FUNCS } pool_scan_func_t; /* * Used to control scrub pause and resume. */ typedef enum pool_scrub_cmd { POOL_SCRUB_NORMAL = 0, POOL_SCRUB_PAUSE, POOL_SCRUB_FLAGS_END } pool_scrub_cmd_t; /* * ZIO types. Needed to interpret vdev statistics below. */ typedef enum zio_type { ZIO_TYPE_NULL = 0, ZIO_TYPE_READ, ZIO_TYPE_WRITE, ZIO_TYPE_FREE, ZIO_TYPE_CLAIM, ZIO_TYPE_IOCTL, ZIO_TYPES } zio_type_t; /* * Pool statistics. Note: all fields should be 64-bit because this * is passed between kernel and userland as an nvlist uint64 array. */ typedef struct pool_scan_stat { /* values stored on disk */ uint64_t pss_func; /* pool_scan_func_t */ uint64_t pss_state; /* dsl_scan_state_t */ uint64_t pss_start_time; /* scan start time */ uint64_t pss_end_time; /* scan end time */ uint64_t pss_to_examine; /* total bytes to scan */ uint64_t pss_examined; /* total examined bytes */ uint64_t pss_to_process; /* total bytes to process */ uint64_t pss_processed; /* total processed bytes */ uint64_t pss_errors; /* scan errors */ /* values not stored on disk */ uint64_t pss_pass_exam; /* examined bytes per scan pass */ uint64_t pss_pass_start; /* start time of a scan pass */ uint64_t pss_pass_scrub_pause; /* pause time of a scurb pass */ /* cumulative time scrub spent paused, needed for rate calculation */ uint64_t pss_pass_scrub_spent_paused; } pool_scan_stat_t; typedef enum dsl_scan_state { DSS_NONE, DSS_SCANNING, DSS_FINISHED, DSS_CANCELED, DSS_NUM_STATES } dsl_scan_state_t; /* * Vdev statistics. Note: all fields should be 64-bit because this * is passed between kernel and userland as an nvlist uint64 array. 
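A sketch of reporting scan progress from a pool_scan_stat_t as delivered to userland; pss_to_examine can be zero early in a scan, so the division is guarded. This is an illustration built on the fields defined above, not code from the patch:

/* Percentage of bytes examined so far, 0.0 if nothing is known yet. */
static double
scan_progress_pct(const pool_scan_stat_t *pss)
{
        if (pss->pss_to_examine == 0)
                return (0.0);
        return (100.0 * (double)pss->pss_examined /
            (double)pss->pss_to_examine);
}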
*/ typedef struct vdev_stat { hrtime_t vs_timestamp; /* time since vdev load */ uint64_t vs_state; /* vdev state */ uint64_t vs_aux; /* see vdev_aux_t */ uint64_t vs_alloc; /* space allocated */ uint64_t vs_space; /* total capacity */ uint64_t vs_dspace; /* deflated capacity */ uint64_t vs_rsize; /* replaceable dev size */ uint64_t vs_esize; /* expandable dev size */ uint64_t vs_ops[ZIO_TYPES]; /* operation count */ uint64_t vs_bytes[ZIO_TYPES]; /* bytes read/written */ uint64_t vs_read_errors; /* read errors */ uint64_t vs_write_errors; /* write errors */ uint64_t vs_checksum_errors; /* checksum errors */ uint64_t vs_self_healed; /* self-healed bytes */ uint64_t vs_scan_removing; /* removing? */ uint64_t vs_scan_processed; /* scan processed bytes */ uint64_t vs_configured_ashift; /* TLV vdev_ashift */ uint64_t vs_logical_ashift; /* vdev_logical_ashift */ uint64_t vs_physical_ashift; /* vdev_physical_ashift */ uint64_t vs_fragmentation; /* device fragmentation */ } vdev_stat_t; #define VDEV_STAT_VALID(field, uint64_t_field_count) \ ((uint64_t_field_count * sizeof(uint64_t)) >= \ (offsetof(vdev_stat_t, field) + sizeof(((vdev_stat_t *)NULL)->field))) /* * DDT statistics. Note: all fields should be 64-bit because this * is passed between kernel and userland as an nvlist uint64 array. */ typedef struct ddt_object { uint64_t ddo_count; /* number of elments in ddt */ uint64_t ddo_dspace; /* size of ddt on disk */ uint64_t ddo_mspace; /* size of ddt in-core */ } ddt_object_t; typedef struct ddt_stat { uint64_t dds_blocks; /* blocks */ uint64_t dds_lsize; /* logical size */ uint64_t dds_psize; /* physical size */ uint64_t dds_dsize; /* deflated allocated size */ uint64_t dds_ref_blocks; /* referenced blocks */ uint64_t dds_ref_lsize; /* referenced lsize * refcnt */ uint64_t dds_ref_psize; /* referenced psize * refcnt */ uint64_t dds_ref_dsize; /* referenced dsize * refcnt */ } ddt_stat_t; typedef struct ddt_histogram { ddt_stat_t ddh_stat[64]; /* power-of-two histogram buckets */ } ddt_histogram_t; #define ZVOL_DRIVER "zvol" #define ZFS_DRIVER "zfs" #define ZFS_DEV_NAME "zfs" #define ZFS_DEV "/dev/" ZFS_DEV_NAME #define ZFS_DISK_ROOT "/dev/dsk" #define ZFS_DISK_ROOTD ZFS_DISK_ROOT "/" #define ZFS_RDISK_ROOT "/dev/rdsk" #define ZFS_RDISK_ROOTD ZFS_RDISK_ROOT "/" /* general zvol path */ #define ZVOL_DIR "/dev/zvol" /* expansion */ #define ZVOL_PSEUDO_DEV "/devices/pseudo/zfs@0:" /* for dump and swap */ #define ZVOL_FULL_DEV_DIR ZVOL_DIR "/dsk/" #define ZVOL_FULL_RDEV_DIR ZVOL_DIR "/rdsk/" #define ZVOL_PROP_NAME "name" #define ZVOL_DEFAULT_BLOCKSIZE 8192 /* * /dev/zfs ioctl numbers. 
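VDEV_STAT_VALID() above exists because vdev_stat_t crosses the kernel/userland boundary as a uint64 array, and an older sender may supply fewer fields than the receiver's structure has. A sketch of guarding access to a newer field, returning the ZFS_FRAG_INVALID marker defined earlier when the field is absent:

#include <stddef.h>             /* offsetof(), used by VDEV_STAT_VALID() */
#include <sys/fs/zfs.h>

/* Return the fragmentation field only if the received array contains it. */
static uint64_t
vdev_frag_or_invalid(const vdev_stat_t *vs, unsigned int nfields)
{
        if (VDEV_STAT_VALID(vs_fragmentation, nfields))
                return (vs->vs_fragmentation);
        return (ZFS_FRAG_INVALID);
}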
*/ typedef enum zfs_ioc { ZFS_IOC_FIRST = 0, ZFS_IOC_POOL_CREATE = ZFS_IOC_FIRST, ZFS_IOC_POOL_DESTROY, ZFS_IOC_POOL_IMPORT, ZFS_IOC_POOL_EXPORT, ZFS_IOC_POOL_CONFIGS, ZFS_IOC_POOL_STATS, ZFS_IOC_POOL_TRYIMPORT, ZFS_IOC_POOL_SCAN, ZFS_IOC_POOL_FREEZE, ZFS_IOC_POOL_UPGRADE, ZFS_IOC_POOL_GET_HISTORY, ZFS_IOC_VDEV_ADD, ZFS_IOC_VDEV_REMOVE, ZFS_IOC_VDEV_SET_STATE, ZFS_IOC_VDEV_ATTACH, ZFS_IOC_VDEV_DETACH, ZFS_IOC_VDEV_SETPATH, ZFS_IOC_VDEV_SETFRU, ZFS_IOC_OBJSET_STATS, ZFS_IOC_OBJSET_ZPLPROPS, ZFS_IOC_DATASET_LIST_NEXT, ZFS_IOC_SNAPSHOT_LIST_NEXT, ZFS_IOC_SET_PROP, ZFS_IOC_CREATE, ZFS_IOC_DESTROY, ZFS_IOC_ROLLBACK, ZFS_IOC_RENAME, ZFS_IOC_RECV, ZFS_IOC_SEND, ZFS_IOC_INJECT_FAULT, ZFS_IOC_CLEAR_FAULT, ZFS_IOC_INJECT_LIST_NEXT, ZFS_IOC_ERROR_LOG, ZFS_IOC_CLEAR, ZFS_IOC_PROMOTE, ZFS_IOC_DESTROY_SNAPS, ZFS_IOC_SNAPSHOT, ZFS_IOC_DSOBJ_TO_DSNAME, ZFS_IOC_OBJ_TO_PATH, ZFS_IOC_POOL_SET_PROPS, ZFS_IOC_POOL_GET_PROPS, ZFS_IOC_SET_FSACL, ZFS_IOC_GET_FSACL, ZFS_IOC_SHARE, ZFS_IOC_INHERIT_PROP, ZFS_IOC_SMB_ACL, ZFS_IOC_USERSPACE_ONE, ZFS_IOC_USERSPACE_MANY, ZFS_IOC_USERSPACE_UPGRADE, ZFS_IOC_HOLD, ZFS_IOC_RELEASE, ZFS_IOC_GET_HOLDS, ZFS_IOC_OBJSET_RECVD_PROPS, ZFS_IOC_VDEV_SPLIT, ZFS_IOC_NEXT_OBJ, ZFS_IOC_DIFF, ZFS_IOC_TMP_SNAPSHOT, ZFS_IOC_OBJ_TO_STATS, ZFS_IOC_JAIL, ZFS_IOC_UNJAIL, ZFS_IOC_POOL_REGUID, ZFS_IOC_SPACE_WRITTEN, ZFS_IOC_SPACE_SNAPS, ZFS_IOC_SEND_PROGRESS, ZFS_IOC_POOL_REOPEN, ZFS_IOC_LOG_HISTORY, ZFS_IOC_SEND_NEW, ZFS_IOC_SEND_SPACE, ZFS_IOC_CLONE, ZFS_IOC_BOOKMARK, ZFS_IOC_GET_BOOKMARKS, ZFS_IOC_DESTROY_BOOKMARKS, #ifdef __FreeBSD__ ZFS_IOC_NEXTBOOT, #endif ZFS_IOC_CHANNEL_PROGRAM, ZFS_IOC_LAST } zfs_ioc_t; /* * Internal SPA load state. Used by FMA diagnosis engine. */ typedef enum { SPA_LOAD_NONE, /* no load in progress */ SPA_LOAD_OPEN, /* normal open */ SPA_LOAD_IMPORT, /* import in progress */ SPA_LOAD_TRYIMPORT, /* tryimport in progress */ SPA_LOAD_RECOVER, /* recovery requested */ SPA_LOAD_ERROR, /* load failed */ SPA_LOAD_CREATE /* creation in progress */ } spa_load_state_t; /* * Bookmark name values. */ #define ZPOOL_ERR_LIST "error list" #define ZPOOL_ERR_DATASET "dataset" #define ZPOOL_ERR_OBJECT "object" #define HIS_MAX_RECORD_LEN (MAXPATHLEN + MAXPATHLEN + 1) /* * The following are names used in the nvlist describing * the pool's history log. */ #define ZPOOL_HIST_RECORD "history record" #define ZPOOL_HIST_TIME "history time" #define ZPOOL_HIST_CMD "history command" #define ZPOOL_HIST_WHO "history who" #define ZPOOL_HIST_ZONE "history zone" #define ZPOOL_HIST_HOST "history hostname" #define ZPOOL_HIST_TXG "history txg" #define ZPOOL_HIST_INT_EVENT "history internal event" #define ZPOOL_HIST_INT_STR "history internal str" #define ZPOOL_HIST_INT_NAME "internal_name" #define ZPOOL_HIST_IOCTL "ioctl" #define ZPOOL_HIST_INPUT_NVL "in_nvl" #define ZPOOL_HIST_OUTPUT_NVL "out_nvl" #define ZPOOL_HIST_DSNAME "dsname" #define ZPOOL_HIST_DSID "dsid" #define ZPOOL_HIST_ERRNO "errno" /* * Flags for ZFS_IOC_VDEV_SET_STATE */ #define ZFS_ONLINE_CHECKREMOVE 0x1 #define ZFS_ONLINE_UNSPARE 0x2 #define ZFS_ONLINE_FORCEFAULT 0x4 #define ZFS_ONLINE_EXPAND 0x8 #define ZFS_OFFLINE_TEMPORARY 0x1 /* * Flags for ZFS_IOC_POOL_IMPORT */ #define ZFS_IMPORT_NORMAL 0x0 #define ZFS_IMPORT_VERBATIM 0x1 #define ZFS_IMPORT_ANY_HOST 0x2 #define ZFS_IMPORT_MISSING_LOG 0x4 #define ZFS_IMPORT_ONLY 0x8 /* * Channel program argument/return nvlist keys and defaults. 
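The ZPOOL_HIST_* strings above are the key names of each record in the pool history nvlist. A sketch of decoding one record, assuming a userland consumer using libnvpair:

#include <stdio.h>
#include <libnvpair.h>
#include <sys/fs/zfs.h>

/* Print the timestamp and command of one history record, if present. */
static void
print_history_record(nvlist_t *rec)
{
        uint64_t when = 0;
        char *cmd = NULL;

        (void) nvlist_lookup_uint64(rec, ZPOOL_HIST_TIME, &when);
        if (nvlist_lookup_string(rec, ZPOOL_HIST_CMD, &cmd) == 0)
                printf("[%llu] %s\n", (unsigned long long)when, cmd);
}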
*/ #define ZCP_ARG_PROGRAM "program" #define ZCP_ARG_ARGLIST "arg" #define ZCP_ARG_SYNC "sync" #define ZCP_ARG_INSTRLIMIT "instrlimit" #define ZCP_ARG_MEMLIMIT "memlimit" #define ZCP_ARG_CLIARGV "argv" #define ZCP_RET_ERROR "error" #define ZCP_RET_RETURN "return" #define ZCP_DEFAULT_INSTRLIMIT (10 * 1000 * 1000) #define ZCP_MAX_INSTRLIMIT (10 * ZCP_DEFAULT_INSTRLIMIT) #define ZCP_DEFAULT_MEMLIMIT (10 * 1024 * 1024) #define ZCP_MAX_MEMLIMIT (10 * ZCP_DEFAULT_MEMLIMIT) /* * Sysevent payload members. ZFS will generate the following sysevents with the * given payloads: * * ESC_ZFS_RESILVER_START * ESC_ZFS_RESILVER_END * ESC_ZFS_POOL_DESTROY * ESC_ZFS_POOL_REGUID * * ZFS_EV_POOL_NAME DATA_TYPE_STRING * ZFS_EV_POOL_GUID DATA_TYPE_UINT64 * * ESC_ZFS_VDEV_REMOVE * ESC_ZFS_VDEV_CLEAR * ESC_ZFS_VDEV_CHECK * * ZFS_EV_POOL_NAME DATA_TYPE_STRING * ZFS_EV_POOL_GUID DATA_TYPE_UINT64 * ZFS_EV_VDEV_PATH DATA_TYPE_STRING (optional) * ZFS_EV_VDEV_GUID DATA_TYPE_UINT64 + * + * ESC_ZFS_HISTORY_EVENT + * + * ZFS_EV_POOL_NAME DATA_TYPE_STRING + * ZFS_EV_POOL_GUID DATA_TYPE_UINT64 + * ZFS_EV_HIST_TIME DATA_TYPE_UINT64 (optional) + * ZFS_EV_HIST_CMD DATA_TYPE_STRING (optional) + * ZFS_EV_HIST_WHO DATA_TYPE_UINT64 (optional) + * ZFS_EV_HIST_ZONE DATA_TYPE_STRING (optional) + * ZFS_EV_HIST_HOST DATA_TYPE_STRING (optional) + * ZFS_EV_HIST_TXG DATA_TYPE_UINT64 (optional) + * ZFS_EV_HIST_INT_EVENT DATA_TYPE_UINT64 (optional) + * ZFS_EV_HIST_INT_STR DATA_TYPE_STRING (optional) + * ZFS_EV_HIST_INT_NAME DATA_TYPE_STRING (optional) + * ZFS_EV_HIST_IOCTL DATA_TYPE_STRING (optional) + * ZFS_EV_HIST_DSNAME DATA_TYPE_STRING (optional) + * ZFS_EV_HIST_DSID DATA_TYPE_UINT64 (optional) + * + * The ZFS_EV_HIST_* members will correspond to the ZPOOL_HIST_* members in the + * history log nvlist. The keynames will be free of any spaces or other + * characters that could be potentially unexpected to consumers of the + * sysevents. */ #define ZFS_EV_POOL_NAME "pool_name" #define ZFS_EV_POOL_GUID "pool_guid" #define ZFS_EV_VDEV_PATH "vdev_path" #define ZFS_EV_VDEV_GUID "vdev_guid" +#define ZFS_EV_HIST_TIME "history_time" +#define ZFS_EV_HIST_CMD "history_command" +#define ZFS_EV_HIST_WHO "history_who" +#define ZFS_EV_HIST_ZONE "history_zone" +#define ZFS_EV_HIST_HOST "history_hostname" +#define ZFS_EV_HIST_TXG "history_txg" +#define ZFS_EV_HIST_INT_EVENT "history_internal_event" +#define ZFS_EV_HIST_INT_STR "history_internal_str" +#define ZFS_EV_HIST_INT_NAME "history_internal_name" +#define ZFS_EV_HIST_IOCTL "history_ioctl" +#define ZFS_EV_HIST_DSNAME "history_dsname" +#define ZFS_EV_HIST_DSID "history_dsid" #ifdef __cplusplus } #endif #endif /* _SYS_FS_ZFS_H */ Index: head/sys/cddl/contrib/opensolaris/uts/common/sys/sysevent/eventdefs.h =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/sys/sysevent/eventdefs.h (revision 329689) +++ head/sys/cddl/contrib/opensolaris/uts/common/sys/sysevent/eventdefs.h (revision 329690) @@ -1,228 +1,230 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. 
* * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2016 Nexenta Systems, Inc. + * Copyright 2017 Joyent, Inc. */ #ifndef _SYS_SYSEVENT_EVENTDEFS_H #define _SYS_SYSEVENT_EVENTDEFS_H #ifdef __cplusplus extern "C" { #endif /* * eventdefs.h contains public definitions for sysevent types (classes * and subclasses). All additions/removal/changes are subject * to PSARC approval. */ /* Sysevent Class definitions */ #define EC_NONE "EC_none" #define EC_PRIV "EC_priv" #define EC_PLATFORM "EC_platform" /* events private to platform */ #define EC_DR "EC_dr" /* Dynamic reconfiguration event class */ #define EC_ENV "EC_env" /* Environmental monitor event class */ #define EC_DOMAIN "EC_domain" /* Domain event class */ #define EC_AP_DRIVER "EC_ap_driver" /* Alternate Pathing event class */ #define EC_IPMP "EC_ipmp" /* IP Multipathing event class */ #define EC_DEV_ADD "EC_dev_add" /* device add event class */ #define EC_DEV_REMOVE "EC_dev_remove" /* device remove event class */ #define EC_DEV_BRANCH "EC_dev_branch" /* device tree branch event class */ #define EC_DEV_STATUS "EC_dev_status" /* device status event class */ #define EC_FM "EC_fm" /* FMA error report event */ #define EC_ZFS "EC_zfs" /* ZFS event */ #define EC_DATALINK "EC_datalink" /* datalink event */ #define EC_VRRP "EC_vrrp" /* VRRP event */ /* * The following event class is reserved for exclusive use * by Sun Cluster software. 
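Sysevent classes are plain strings, so consumers filter on them with string comparison. A minimal sketch selecting only ZFS events, assuming the header is installed as <sys/sysevent/eventdefs.h>:

#include <string.h>
#include <sys/sysevent/eventdefs.h>

/* True when the sysevent class string names a ZFS event. */
static int
is_zfs_event(const char *class)
{
        return (strcmp(class, EC_ZFS) == 0);
}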
*/ #define EC_CLUSTER "EC_Cluster" /* * EC_DR subclass definitions - supporting attributes (name/value pairs) * are found in sys/sysevent/dr.h */ /* Attachment point state change */ #define ESC_DR_AP_STATE_CHANGE "ESC_dr_ap_state_change" #define ESC_DR_REQ "ESC_dr_req" /* Request DR */ #define ESC_DR_TARGET_STATE_CHANGE "ESC_dr_target_state_change" /* * EC_ENV subclass definitions - supporting attributes (name/value pairs) * are found in sys/sysevent/env.h */ #define ESC_ENV_TEMP "ESC_env_temp" /* Temperature change event subclass */ #define ESC_ENV_FAN "ESC_env_fan" /* Fan status change event subclass */ #define ESC_ENV_POWER "ESC_env_power" /* Power supply change event subclass */ #define ESC_ENV_LED "ESC_env_led" /* LED change event subclass */ /* * EC_DOMAIN subclass definitions - supporting attributes (name/value pairs) * are found in sys/sysevent/domain.h */ /* Domain state change */ #define ESC_DOMAIN_STATE_CHANGE "ESC_domain_state_change" /* Domain loghost name change */ #define ESC_DOMAIN_LOGHOST_CHANGE "ESC_domain_loghost_change" /* * EC_AP_DRIVER subclass definitions - supporting attributes (name/value pairs) * are found in sys/sysevent/ap_driver.h */ /* Alternate Pathing path switch */ #define ESC_AP_DRIVER_PATHSWITCH "ESC_ap_driver_pathswitch" /* Alternate Pathing database commit */ #define ESC_AP_DRIVER_COMMIT "ESC_ap_driver_commit" /* Alternate Pathing physical path status change */ #define ESC_AP_DRIVER_PHYS_PATH_STATUS_CHANGE \ "ESC_ap_driver_phys_path_status_change" /* * EC_IPMP subclass definitions - supporting attributes (name/value pairs) * are found in sys/sysevent/ipmp.h */ /* IPMP group has changed state */ #define ESC_IPMP_GROUP_STATE "ESC_ipmp_group_state" /* IPMP group has been created or removed */ #define ESC_IPMP_GROUP_CHANGE "ESC_ipmp_group_change" /* IPMP group has had an interface added or removed */ #define ESC_IPMP_GROUP_MEMBER_CHANGE "ESC_ipmp_group_member_change" /* Interface within an IPMP group has changed state or type */ #define ESC_IPMP_IF_CHANGE "ESC_ipmp_if_change" /* IPMP probe has changed state */ #define ESC_IPMP_PROBE_STATE "ESC_ipmp_probe_state" /* * EC_DEV_ADD and EC_DEV_REMOVE subclass definitions - supporting attributes * (name/value pairs) are found in sys/sysevent/dev.h */ #define ESC_DISK "disk" /* disk device */ #define ESC_NETWORK "network" /* network interface */ #define ESC_PRINTER "printer" /* printer device */ #define ESC_LOFI "lofi" /* lofi device */ /* * EC_DEV_BRANCH subclass definitions - supporting attributes (name/value pairs) * are found in sys/sysevent/dev.h */ /* device tree branch added */ #define ESC_DEV_BRANCH_ADD "ESC_dev_branch_add" /* device tree branch removed */ #define ESC_DEV_BRANCH_REMOVE "ESC_dev_branch_remove" /* * EC_DEV_STATUS subclass definitions * * device capacity dynamically changed */ #define ESC_DEV_DLE "ESC_dev_dle" /* LUN has received an eject request from the user */ #define ESC_DEV_EJECT_REQUEST "ESC_dev_eject_request" /* FMA Fault and Error event protocol subclass */ #define ESC_FM_ERROR "ESC_FM_error" #define ESC_FM_ERROR_REPLAY "ESC_FM_error_replay" /* Service processor subclass definitions */ #define ESC_PLATFORM_SP_RESET "ESC_platform_sp_reset" /* * EC_PWRCTL subclass definitions */ #define EC_PWRCTL "EC_pwrctl" #define ESC_PWRCTL_ADD "ESC_pwrctl_add" #define ESC_PWRCTL_REMOVE "ESC_pwrctl_remove" #define ESC_PWRCTL_WARN "ESC_pwrctl_warn" #define ESC_PWRCTL_LOW "ESC_pwrctl_low" #define ESC_PWRCTL_STATE_CHANGE "ESC_pwrctl_state_change" #define ESC_PWRCTL_POWER_BUTTON "ESC_pwrctl_power_button" 
#define ESC_PWRCTL_BRIGHTNESS_UP "ESC_pwrctl_brightness_up" #define ESC_PWRCTL_BRIGHTNESS_DOWN "ESC_pwrctl_brightness_down" /* EC_ACPIEV subclass definitions */ #define EC_ACPIEV "EC_acpiev" #define ESC_ACPIEV_DISPLAY_SWITCH "ESC_acpiev_display_switch" #define ESC_ACPIEV_SCREEN_LOCK "ESC_acpiev_screen_lock" #define ESC_ACPIEV_SLEEP "ESC_acpiev_sleep" #define ESC_ACPIEV_AUDIO_MUTE "ESC_acpiev_audio_mute" #define ESC_ACPIEV_WIFI "ESC_acpiev_wifi" #define ESC_ACPIEV_TOUCHPAD "ESC_acpiev_touchpad" /* * ZFS subclass definitions. supporting attributes (name/value paris) are found * in sys/fs/zfs.h */ #define ESC_ZFS_RESILVER_START "ESC_ZFS_resilver_start" #define ESC_ZFS_RESILVER_FINISH "ESC_ZFS_resilver_finish" #define ESC_ZFS_VDEV_REMOVE "ESC_ZFS_vdev_remove" #define ESC_ZFS_VDEV_REMOVE_AUX "ESC_ZFS_vdev_remove_aux" #define ESC_ZFS_VDEV_REMOVE_DEV "ESC_ZFS_vdev_remove_dev" #define ESC_ZFS_POOL_CREATE "ESC_ZFS_pool_create" #define ESC_ZFS_POOL_DESTROY "ESC_ZFS_pool_destroy" #define ESC_ZFS_POOL_IMPORT "ESC_ZFS_pool_import" #define ESC_ZFS_VDEV_ADD "ESC_ZFS_vdev_add" #define ESC_ZFS_VDEV_ATTACH "ESC_ZFS_vdev_attach" #define ESC_ZFS_VDEV_CLEAR "ESC_ZFS_vdev_clear" #define ESC_ZFS_VDEV_CHECK "ESC_ZFS_vdev_check" #define ESC_ZFS_VDEV_ONLINE "ESC_ZFS_vdev_online" #define ESC_ZFS_CONFIG_SYNC "ESC_ZFS_config_sync" #define ESC_ZFS_SCRUB_START "ESC_ZFS_scrub_start" #define ESC_ZFS_SCRUB_FINISH "ESC_ZFS_scrub_finish" #define ESC_ZFS_SCRUB_ABORT "ESC_ZFS_scrub_abort" #define ESC_ZFS_SCRUB_RESUME "ESC_ZFS_scrub_resume" #define ESC_ZFS_SCRUB_PAUSED "ESC_ZFS_scrub_paused" #define ESC_ZFS_VDEV_SPARE "ESC_ZFS_vdev_spare" #define ESC_ZFS_BOOTFS_VDEV_ATTACH "ESC_ZFS_bootfs_vdev_attach" #define ESC_ZFS_POOL_REGUID "ESC_ZFS_pool_reguid" +#define ESC_ZFS_HISTORY_EVENT "ESC_ZFS_history_event" #define ESC_ZFS_VDEV_AUTOEXPAND "ESC_ZFS_vdev_autoexpand" /* * datalink subclass definitions. */ #define ESC_DATALINK_PHYS_ADD "ESC_datalink_phys_add" /* new physical link */ /* * VRRP subclass definitions. Supporting attributes (name/value paris) are * found in sys/sysevent/vrrp.h */ #define ESC_VRRP_STATE_CHANGE "ESC_vrrp_state_change" #ifdef __cplusplus } #endif #endif /* _SYS_SYSEVENT_EVENTDEFS_H */ Index: head/sys/cddl/contrib/opensolaris =================================================================== --- head/sys/cddl/contrib/opensolaris (revision 329689) +++ head/sys/cddl/contrib/opensolaris (revision 329690) Property changes on: head/sys/cddl/contrib/opensolaris ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /vendor-sys/illumos/dist:r319737
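The new ESC_ZFS_HISTORY_EVENT sysevent carries the ZFS_EV_HIST_* payload members added above, which mirror the ZPOOL_HIST_* history-log keys. A sketch of that correspondence as a lookup table with a small helper; the table and function are illustrative only, not something either header defines:

#include <string.h>
#include <sys/fs/zfs.h>

/* History-log nvlist key -> sysevent payload key, per the comments above. */
static const struct {
        const char *hist_key;
        const char *event_key;
} zfs_hist_event_map[] = {
        { ZPOOL_HIST_TIME,      ZFS_EV_HIST_TIME },
        { ZPOOL_HIST_CMD,       ZFS_EV_HIST_CMD },
        { ZPOOL_HIST_WHO,       ZFS_EV_HIST_WHO },
        { ZPOOL_HIST_ZONE,      ZFS_EV_HIST_ZONE },
        { ZPOOL_HIST_HOST,      ZFS_EV_HIST_HOST },
        { ZPOOL_HIST_TXG,       ZFS_EV_HIST_TXG },
        { ZPOOL_HIST_INT_EVENT, ZFS_EV_HIST_INT_EVENT },
        { ZPOOL_HIST_INT_STR,   ZFS_EV_HIST_INT_STR },
        { ZPOOL_HIST_INT_NAME,  ZFS_EV_HIST_INT_NAME },
        { ZPOOL_HIST_IOCTL,     ZFS_EV_HIST_IOCTL },
        { ZPOOL_HIST_DSNAME,    ZFS_EV_HIST_DSNAME },
        { ZPOOL_HIST_DSID,      ZFS_EV_HIST_DSID },
};

/* Map a history-log key to its sysevent payload name, or NULL if unknown. */
static const char *
hist_key_to_event_key(const char *hist_key)
{
        for (size_t i = 0;
            i < sizeof (zfs_hist_event_map) / sizeof (zfs_hist_event_map[0]);
            i++) {
                if (strcmp(zfs_hist_event_map[i].hist_key, hist_key) == 0)
                        return (zfs_hist_event_map[i].event_key);
        }
        return (NULL);
}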