diff --git a/include/sys/dsl_synctask.h b/include/sys/dsl_synctask.h index 957963ffe553..0bb602e8f7ff 100644 --- a/include/sys/dsl_synctask.h +++ b/include/sys/dsl_synctask.h @@ -1,127 +1,127 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2017 by Delphix. All rights reserved. */ #ifndef _SYS_DSL_SYNCTASK_H #define _SYS_DSL_SYNCTASK_H #include #include #ifdef __cplusplus extern "C" { #endif struct dsl_pool; typedef int (dsl_checkfunc_t)(void *, dmu_tx_t *); typedef void (dsl_syncfunc_t)(void *, dmu_tx_t *); typedef void (dsl_sigfunc_t)(void *, dmu_tx_t *); typedef enum zfs_space_check { /* * Normal space check: if there is less than 3.2% free space, * the operation will fail. Operations which are logically * creating things should use this (e.g. "zfs create", "zfs snapshot"). * User writes (via the ZPL / ZVOL) also fail at this point. */ ZFS_SPACE_CHECK_NORMAL, /* * Space check allows use of half the slop space. If there * is less than 1.6% free space, the operation will fail. Most * operations should use this (e.g. "zfs set", "zfs rename"), * because we want them to succeed even after user writes are failing, * so that they can be used as part of the space recovery process. */ ZFS_SPACE_CHECK_RESERVED, /* * Space check allows use of three quarters of the slop space. * If there is less than 0.8% free space, the operation will * fail. */ ZFS_SPACE_CHECK_EXTRA_RESERVED, /* * In all cases "zfs destroy" is expected to result in an net * reduction of space, except one. When the pool has a * checkpoint, space freed by "zfs destroy" will not actually * free anything internally. Thus, it starts failing after * three quarters of the slop space is exceeded. */ ZFS_SPACE_CHECK_DESTROY = ZFS_SPACE_CHECK_EXTRA_RESERVED, /* * A channel program can run a "zfs destroy" as part of its * script and therefore has the same space_check policy when * being evaluated. */ ZFS_SPACE_CHECK_ZCP_EVAL = ZFS_SPACE_CHECK_DESTROY, /* * No space check is performed. This level of space check should * be used cautiously as operations that use it can even run when * 0.8% capacity is left for use. In this scenario, if there is a * checkpoint, async destroys are suspended and any kind of freeing * can potentially add space instead of freeing it. * * See also the comments above spa_slop_shift. */ ZFS_SPACE_CHECK_NONE, ZFS_SPACE_CHECK_DISCARD_CHECKPOINT = ZFS_SPACE_CHECK_NONE, } zfs_space_check_t; typedef struct dsl_sync_task { txg_node_t dst_node; struct dsl_pool *dst_pool; uint64_t dst_txg; int dst_space; zfs_space_check_t dst_space_check; dsl_checkfunc_t *dst_checkfunc; dsl_syncfunc_t *dst_syncfunc; void *dst_arg; int dst_error; boolean_t dst_nowaiter; } dsl_sync_task_t; void dsl_sync_task_sync(dsl_sync_task_t *, dmu_tx_t *); int dsl_sync_task(const char *, dsl_checkfunc_t *, dsl_syncfunc_t *, void *, int, zfs_space_check_t); void dsl_sync_task_nowait(struct dsl_pool *, dsl_syncfunc_t *, - void *, int, zfs_space_check_t, dmu_tx_t *); + void *, dmu_tx_t *); int dsl_early_sync_task(const char *, dsl_checkfunc_t *, dsl_syncfunc_t *, void *, int, zfs_space_check_t); void dsl_early_sync_task_nowait(struct dsl_pool *, dsl_syncfunc_t *, - void *, int, zfs_space_check_t, dmu_tx_t *); + void *, dmu_tx_t *); int dsl_sync_task_sig(const char *, dsl_checkfunc_t *, dsl_syncfunc_t *, dsl_sigfunc_t *, void *, int, zfs_space_check_t); #ifdef __cplusplus } #endif #endif /* _SYS_DSL_SYNCTASK_H */ diff --git a/module/zfs/dmu_redact.c b/module/zfs/dmu_redact.c index df10d8d6faae..1c5f1bd24860 100644 --- a/module/zfs/dmu_redact.c +++ b/module/zfs/dmu_redact.c @@ -1,1186 +1,1185 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2017, 2018 by Delphix. All rights reserved. */ #include #include #include #include #include #include #include #include #ifdef _KERNEL #include #include #include #endif /* * This controls the number of entries in the buffer the redaction_list_update * synctask uses to buffer writes to the redaction list. */ int redact_sync_bufsize = 1024; /* * Controls how often to update the redaction list when creating a redaction * list. */ uint64_t redaction_list_update_interval_ns = 1000 * 1000 * 1000ULL; /* NS */ /* * This tunable controls the length of the queues that zfs redact worker threads * use to communicate. If the dmu_redact_snap thread is blocking on these * queues, this variable may need to be increased. If there is a significant * slowdown at the start of a redact operation as these threads consume all the * available IO resources, or the queues are consuming too much memory, this * variable may need to be decreased. */ int zfs_redact_queue_length = 1024 * 1024; /* * These tunables control the fill fraction of the queues by zfs redact. The * fill fraction controls the frequency with which threads have to be * cv_signaled. If a lot of cpu time is being spent on cv_signal, then these * should be tuned down. If the queues empty before the signalled thread can * catch up, then these should be tuned up. */ uint64_t zfs_redact_queue_ff = 20; struct redact_record { bqueue_node_t ln; boolean_t eos_marker; /* Marks the end of the stream */ uint64_t start_object; uint64_t start_blkid; uint64_t end_object; uint64_t end_blkid; uint8_t indblkshift; uint32_t datablksz; }; struct redact_thread_arg { bqueue_t q; objset_t *os; /* Objset to traverse */ dsl_dataset_t *ds; /* Dataset to traverse */ struct redact_record *current_record; int error_code; boolean_t cancel; zbookmark_phys_t resume; objlist_t *deleted_objs; uint64_t *num_blocks_visited; uint64_t ignore_object; /* ignore further callbacks on this */ uint64_t txg; /* txg to traverse since */ }; /* * The redaction node is a wrapper around the redaction record that is used * by the redaction merging thread to sort the records and determine overlaps. * * It contains two nodes; one sorts the records by their start_zb, and the other * sorts the records by their end_zb. */ struct redact_node { avl_node_t avl_node_start; avl_node_t avl_node_end; struct redact_record *record; struct redact_thread_arg *rt_arg; uint32_t thread_num; }; struct merge_data { list_t md_redact_block_pending; redact_block_phys_t md_coalesce_block; uint64_t md_last_time; redact_block_phys_t md_furthest[TXG_SIZE]; /* Lists of struct redact_block_list_node. */ list_t md_blocks[TXG_SIZE]; boolean_t md_synctask_txg[TXG_SIZE]; uint64_t md_latest_synctask_txg; redaction_list_t *md_redaction_list; }; /* * A wrapper around struct redact_block so it can be stored in a list_t. */ struct redact_block_list_node { redact_block_phys_t block; list_node_t node; }; /* * We've found a new redaction candidate. In order to improve performance, we * coalesce these blocks when they're adjacent to each other. This function * handles that. If the new candidate block range is immediately after the * range we're building, coalesce it into the range we're building. Otherwise, * put the record we're building on the queue, and update the build pointer to * point to the new record. */ static void record_merge_enqueue(bqueue_t *q, struct redact_record **build, struct redact_record *new) { if (new->eos_marker) { if (*build != NULL) bqueue_enqueue(q, *build, sizeof (*build)); bqueue_enqueue_flush(q, new, sizeof (*new)); return; } if (*build == NULL) { *build = new; return; } struct redact_record *curbuild = *build; if ((curbuild->end_object == new->start_object && curbuild->end_blkid + 1 == new->start_blkid && curbuild->end_blkid != UINT64_MAX) || (curbuild->end_object + 1 == new->start_object && curbuild->end_blkid == UINT64_MAX && new->start_blkid == 0)) { curbuild->end_object = new->end_object; curbuild->end_blkid = new->end_blkid; kmem_free(new, sizeof (*new)); } else { bqueue_enqueue(q, curbuild, sizeof (*curbuild)); *build = new; } } #ifdef _KERNEL struct objnode { avl_node_t node; uint64_t obj; }; static int objnode_compare(const void *o1, const void *o2) { const struct objnode *obj1 = o1; const struct objnode *obj2 = o2; if (obj1->obj < obj2->obj) return (-1); if (obj1->obj > obj2->obj) return (1); return (0); } static objlist_t * zfs_get_deleteq(objset_t *os) { objlist_t *deleteq_objlist = objlist_create(); uint64_t deleteq_obj; zap_cursor_t zc; zap_attribute_t za; dmu_object_info_t doi; ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS); VERIFY0(dmu_object_info(os, MASTER_NODE_OBJ, &doi)); ASSERT3U(doi.doi_type, ==, DMU_OT_MASTER_NODE); VERIFY0(zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, sizeof (uint64_t), 1, &deleteq_obj)); /* * In order to insert objects into the objlist, they must be in sorted * order. We don't know what order we'll get them out of the ZAP in, so * we insert them into and remove them from an avl_tree_t to sort them. */ avl_tree_t at; avl_create(&at, objnode_compare, sizeof (struct objnode), offsetof(struct objnode, node)); for (zap_cursor_init(&zc, os, deleteq_obj); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { struct objnode *obj = kmem_zalloc(sizeof (*obj), KM_SLEEP); obj->obj = za.za_first_integer; avl_add(&at, obj); } zap_cursor_fini(&zc); struct objnode *next, *found = avl_first(&at); while (found != NULL) { next = AVL_NEXT(&at, found); objlist_insert(deleteq_objlist, found->obj); found = next; } void *cookie = NULL; while ((found = avl_destroy_nodes(&at, &cookie)) != NULL) kmem_free(found, sizeof (*found)); avl_destroy(&at); return (deleteq_objlist); } #endif /* * This is the callback function to traverse_dataset for the redaction threads * for dmu_redact_snap. This thread is responsible for creating redaction * records for all the data that is modified by the snapshots we're redacting * with respect to. Redaction records represent ranges of data that have been * modified by one of the redaction snapshots, and are stored in the * redact_record struct. We need to create redaction records for three * cases: * * First, if there's a normal write, we need to create a redaction record for * that block. * * Second, if there's a hole, we need to create a redaction record that covers * the whole range of the hole. If the hole is in the meta-dnode, it must cover * every block in all of the objects in the hole. * * Third, if there is a deleted object, we need to create a redaction record for * all of the blocks in that object. */ /*ARGSUSED*/ static int redact_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_phys_t *zb, const struct dnode_phys *dnp, void *arg) { struct redact_thread_arg *rta = arg; struct redact_record *record; ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT || zb->zb_object >= rta->resume.zb_object); if (rta->cancel) return (SET_ERROR(EINTR)); if (rta->ignore_object == zb->zb_object) return (0); /* * If we're visiting a dnode, we need to handle the case where the * object has been deleted. */ if (zb->zb_level == ZB_DNODE_LEVEL) { ASSERT3U(zb->zb_level, ==, ZB_DNODE_LEVEL); if (zb->zb_object == 0) return (0); /* * If the object has been deleted, redact all of the blocks in * it. */ if (dnp->dn_type == DMU_OT_NONE || objlist_exists(rta->deleted_objs, zb->zb_object)) { rta->ignore_object = zb->zb_object; record = kmem_zalloc(sizeof (struct redact_record), KM_SLEEP); record->eos_marker = B_FALSE; record->start_object = record->end_object = zb->zb_object; record->start_blkid = 0; record->end_blkid = UINT64_MAX; record_merge_enqueue(&rta->q, &rta->current_record, record); } return (0); } else if (zb->zb_level < 0) { return (0); } else if (zb->zb_level > 0 && !BP_IS_HOLE(bp)) { /* * If this is an indirect block, but not a hole, it doesn't * provide any useful information for redaction, so ignore it. */ return (0); } /* * At this point, there are two options left for the type of block we're * looking at. Either this is a hole (which could be in the dnode or * the meta-dnode), or it's a level 0 block of some sort. If it's a * hole, we create a redaction record that covers the whole range. If * the hole is in a dnode, we need to redact all the blocks in that * hole. If the hole is in the meta-dnode, we instead need to redact * all blocks in every object covered by that hole. If it's a level 0 * block, we only need to redact that single block. */ record = kmem_zalloc(sizeof (struct redact_record), KM_SLEEP); record->eos_marker = B_FALSE; record->start_object = record->end_object = zb->zb_object; if (BP_IS_HOLE(bp)) { record->start_blkid = zb->zb_blkid * bp_span_in_blocks(dnp->dn_indblkshift, zb->zb_level); record->end_blkid = ((zb->zb_blkid + 1) * bp_span_in_blocks(dnp->dn_indblkshift, zb->zb_level)) - 1; if (zb->zb_object == DMU_META_DNODE_OBJECT) { record->start_object = record->start_blkid * ((SPA_MINBLOCKSIZE * dnp->dn_datablkszsec) / sizeof (dnode_phys_t)); record->start_blkid = 0; record->end_object = ((record->end_blkid + 1) * ((SPA_MINBLOCKSIZE * dnp->dn_datablkszsec) / sizeof (dnode_phys_t))) - 1; record->end_blkid = UINT64_MAX; } } else if (zb->zb_level != 0 || zb->zb_object == DMU_META_DNODE_OBJECT) { kmem_free(record, sizeof (*record)); return (0); } else { record->start_blkid = record->end_blkid = zb->zb_blkid; } record->indblkshift = dnp->dn_indblkshift; record->datablksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT; record_merge_enqueue(&rta->q, &rta->current_record, record); return (0); } static void redact_traverse_thread(void *arg) { struct redact_thread_arg *rt_arg = arg; int err; struct redact_record *data; #ifdef _KERNEL if (rt_arg->os->os_phys->os_type == DMU_OST_ZFS) rt_arg->deleted_objs = zfs_get_deleteq(rt_arg->os); else rt_arg->deleted_objs = objlist_create(); #else rt_arg->deleted_objs = objlist_create(); #endif err = traverse_dataset_resume(rt_arg->ds, rt_arg->txg, &rt_arg->resume, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, redact_cb, rt_arg); if (err != EINTR) rt_arg->error_code = err; objlist_destroy(rt_arg->deleted_objs); data = kmem_zalloc(sizeof (*data), KM_SLEEP); data->eos_marker = B_TRUE; record_merge_enqueue(&rt_arg->q, &rt_arg->current_record, data); thread_exit(); } static inline void create_zbookmark_from_obj_off(zbookmark_phys_t *zb, uint64_t object, uint64_t blkid) { zb->zb_object = object; zb->zb_level = 0; zb->zb_blkid = blkid; } /* * This is a utility function that can do the comparison for the start or ends * of the ranges in a redact_record. */ static int redact_range_compare(uint64_t obj1, uint64_t off1, uint32_t dbss1, uint64_t obj2, uint64_t off2, uint32_t dbss2) { zbookmark_phys_t z1, z2; create_zbookmark_from_obj_off(&z1, obj1, off1); create_zbookmark_from_obj_off(&z2, obj2, off2); return (zbookmark_compare(dbss1 >> SPA_MINBLOCKSHIFT, 0, dbss2 >> SPA_MINBLOCKSHIFT, 0, &z1, &z2)); } /* * Compare two redaction records by their range's start location. Also makes * eos records always compare last. We use the thread number in the redact_node * to ensure that records do not compare equal (which is not allowed in our avl * trees). */ static int redact_node_compare_start(const void *arg1, const void *arg2) { const struct redact_node *rn1 = arg1; const struct redact_node *rn2 = arg2; const struct redact_record *rr1 = rn1->record; const struct redact_record *rr2 = rn2->record; if (rr1->eos_marker) return (1); if (rr2->eos_marker) return (-1); int cmp = redact_range_compare(rr1->start_object, rr1->start_blkid, rr1->datablksz, rr2->start_object, rr2->start_blkid, rr2->datablksz); if (cmp == 0) cmp = (rn1->thread_num < rn2->thread_num ? -1 : 1); return (cmp); } /* * Compare two redaction records by their range's end location. Also makes * eos records always compare last. We use the thread number in the redact_node * to ensure that records do not compare equal (which is not allowed in our avl * trees). */ static int redact_node_compare_end(const void *arg1, const void *arg2) { const struct redact_node *rn1 = arg1; const struct redact_node *rn2 = arg2; const struct redact_record *srr1 = rn1->record; const struct redact_record *srr2 = rn2->record; if (srr1->eos_marker) return (1); if (srr2->eos_marker) return (-1); int cmp = redact_range_compare(srr1->end_object, srr1->end_blkid, srr1->datablksz, srr2->end_object, srr2->end_blkid, srr2->datablksz); if (cmp == 0) cmp = (rn1->thread_num < rn2->thread_num ? -1 : 1); return (cmp); } /* * Utility function that compares two redaction records to determine if any part * of the "from" record is before any part of the "to" record. Also causes End * of Stream redaction records to compare after all others, so that the * redaction merging logic can stay simple. */ static boolean_t redact_record_before(const struct redact_record *from, const struct redact_record *to) { if (from->eos_marker == B_TRUE) return (B_FALSE); else if (to->eos_marker == B_TRUE) return (B_TRUE); return (redact_range_compare(from->start_object, from->start_blkid, from->datablksz, to->end_object, to->end_blkid, to->datablksz) <= 0); } /* * Pop a new redaction record off the queue, check that the records are in the * right order, and free the old data. */ static struct redact_record * get_next_redact_record(bqueue_t *bq, struct redact_record *prev) { struct redact_record *next = bqueue_dequeue(bq); ASSERT(redact_record_before(prev, next)); kmem_free(prev, sizeof (*prev)); return (next); } /* * Remove the given redaction node from both trees, pull a new redaction record * off the queue, free the old redaction record, update the redaction node, and * reinsert the node into the trees. */ static int update_avl_trees(avl_tree_t *start_tree, avl_tree_t *end_tree, struct redact_node *redact_node) { avl_remove(start_tree, redact_node); avl_remove(end_tree, redact_node); redact_node->record = get_next_redact_record(&redact_node->rt_arg->q, redact_node->record); avl_add(end_tree, redact_node); avl_add(start_tree, redact_node); return (redact_node->rt_arg->error_code); } /* * Synctask for updating redaction lists. We first take this txg's list of * redacted blocks and append those to the redaction list. We then update the * redaction list's bonus buffer. We store the furthest blocks we visited and * the list of snapshots that we're redacting with respect to. We need these so * that redacted sends and receives can be correctly resumed. */ static void redaction_list_update_sync(void *arg, dmu_tx_t *tx) { struct merge_data *md = arg; uint64_t txg = dmu_tx_get_txg(tx); list_t *list = &md->md_blocks[txg & TXG_MASK]; redact_block_phys_t *furthest_visited = &md->md_furthest[txg & TXG_MASK]; objset_t *mos = tx->tx_pool->dp_meta_objset; redaction_list_t *rl = md->md_redaction_list; int bufsize = redact_sync_bufsize; redact_block_phys_t *buf = kmem_alloc(bufsize * sizeof (*buf), KM_SLEEP); int index = 0; dmu_buf_will_dirty(rl->rl_dbuf, tx); for (struct redact_block_list_node *rbln = list_remove_head(list); rbln != NULL; rbln = list_remove_head(list)) { ASSERT3U(rbln->block.rbp_object, <=, furthest_visited->rbp_object); ASSERT(rbln->block.rbp_object < furthest_visited->rbp_object || rbln->block.rbp_blkid <= furthest_visited->rbp_blkid); buf[index] = rbln->block; index++; if (index == bufsize) { dmu_write(mos, rl->rl_object, rl->rl_phys->rlp_num_entries * sizeof (*buf), bufsize * sizeof (*buf), buf, tx); rl->rl_phys->rlp_num_entries += bufsize; index = 0; } kmem_free(rbln, sizeof (*rbln)); } if (index > 0) { dmu_write(mos, rl->rl_object, rl->rl_phys->rlp_num_entries * sizeof (*buf), index * sizeof (*buf), buf, tx); rl->rl_phys->rlp_num_entries += index; } kmem_free(buf, bufsize * sizeof (*buf)); md->md_synctask_txg[txg & TXG_MASK] = B_FALSE; rl->rl_phys->rlp_last_object = furthest_visited->rbp_object; rl->rl_phys->rlp_last_blkid = furthest_visited->rbp_blkid; } static void commit_rl_updates(objset_t *os, struct merge_data *md, uint64_t object, uint64_t blkid) { dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(os->os_spa)->dp_mos_dir); dmu_tx_hold_space(tx, sizeof (struct redact_block_list_node)); VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); uint64_t txg = dmu_tx_get_txg(tx); if (!md->md_synctask_txg[txg & TXG_MASK]) { dsl_sync_task_nowait(dmu_tx_pool(tx), - redaction_list_update_sync, md, 5, ZFS_SPACE_CHECK_NONE, - tx); + redaction_list_update_sync, md, tx); md->md_synctask_txg[txg & TXG_MASK] = B_TRUE; md->md_latest_synctask_txg = txg; } md->md_furthest[txg & TXG_MASK].rbp_object = object; md->md_furthest[txg & TXG_MASK].rbp_blkid = blkid; list_move_tail(&md->md_blocks[txg & TXG_MASK], &md->md_redact_block_pending); dmu_tx_commit(tx); md->md_last_time = gethrtime(); } /* * We want to store the list of blocks that we're redacting in the bookmark's * redaction list. However, this list is stored in the MOS, which means it can * only be written to in syncing context. To get around this, we create a * synctask that will write to the mos for us. We tell it what to write by * a linked list for each current transaction group; every time we decide to * redact a block, we append it to the transaction group that is currently in * open context. We also update some progress information that the synctask * will store to enable resumable redacted sends. */ static void update_redaction_list(struct merge_data *md, objset_t *os, uint64_t object, uint64_t blkid, uint64_t endblkid, uint32_t blksz) { boolean_t enqueue = B_FALSE; redact_block_phys_t cur = {0}; uint64_t count = endblkid - blkid + 1; while (count > REDACT_BLOCK_MAX_COUNT) { update_redaction_list(md, os, object, blkid, blkid + REDACT_BLOCK_MAX_COUNT - 1, blksz); blkid += REDACT_BLOCK_MAX_COUNT; count -= REDACT_BLOCK_MAX_COUNT; } redact_block_phys_t *coalesce = &md->md_coalesce_block; boolean_t new; if (coalesce->rbp_size_count == 0) { new = B_TRUE; enqueue = B_FALSE; } else { uint64_t old_count = redact_block_get_count(coalesce); if (coalesce->rbp_object == object && coalesce->rbp_blkid + old_count == blkid && old_count + count <= REDACT_BLOCK_MAX_COUNT) { ASSERT3U(redact_block_get_size(coalesce), ==, blksz); redact_block_set_count(coalesce, old_count + count); new = B_FALSE; enqueue = B_FALSE; } else { new = B_TRUE; enqueue = B_TRUE; } } if (new) { cur = *coalesce; coalesce->rbp_blkid = blkid; coalesce->rbp_object = object; redact_block_set_count(coalesce, count); redact_block_set_size(coalesce, blksz); } if (enqueue && redact_block_get_size(&cur) != 0) { struct redact_block_list_node *rbln = kmem_alloc(sizeof (struct redact_block_list_node), KM_SLEEP); rbln->block = cur; list_insert_tail(&md->md_redact_block_pending, rbln); } if (gethrtime() > md->md_last_time + redaction_list_update_interval_ns) { commit_rl_updates(os, md, object, blkid); } } /* * This thread merges all the redaction records provided by the worker threads, * and determines which blocks are redacted by all the snapshots. The algorithm * for doing so is similar to performing a merge in mergesort with n sub-lists * instead of 2, with some added complexity due to the fact that the entries are * ranges, not just single blocks. This algorithm relies on the fact that the * queues are sorted, which is ensured by the fact that traverse_dataset * traverses the dataset in a consistent order. We pull one entry off the front * of the queues of each secure dataset traversal thread. Then we repeat the * following: each record represents a range of blocks modified by one of the * redaction snapshots, and each block in that range may need to be redacted in * the send stream. Find the record with the latest start of its range, and the * record with the earliest end of its range. If the last start is before the * first end, then we know that the blocks in the range [last_start, first_end] * are covered by all of the ranges at the front of the queues, which means * every thread redacts that whole range. For example, let's say the ranges on * each queue look like this: * * Block Id 1 2 3 4 5 6 7 8 9 10 11 * Thread 1 | [====================] * Thread 2 | [========] * Thread 3 | [=================] * * Thread 3 has the last start (5), and the thread 2 has the last end (6). All * three threads modified the range [5,6], so that data should not be sent over * the wire. After we've determined whether or not to redact anything, we take * the record with the first end. We discard that record, and pull a new one * off the front of the queue it came from. In the above example, we would * discard Thread 2's record, and pull a new one. Let's say the next record we * pulled from Thread 2 covered range [10,11]. The new layout would look like * this: * * Block Id 1 2 3 4 5 6 7 8 9 10 11 * Thread 1 | [====================] * Thread 2 | [==] * Thread 3 | [=================] * * When we compare the last start (10, from Thread 2) and the first end (9, from * Thread 1), we see that the last start is greater than the first end. * Therefore, we do not redact anything from these records. We'll iterate by * replacing the record from Thread 1. * * We iterate by replacing the record with the lowest end because we know * that the record with the lowest end has helped us as much as it can. All the * ranges before it that we will ever redact have been redacted. In addition, * by replacing the one with the lowest end, we guarantee we catch all ranges * that need to be redacted. For example, if in the case above we had replaced * the record from Thread 1 instead, we might have ended up with the following: * * Block Id 1 2 3 4 5 6 7 8 9 10 11 12 * Thread 1 | [==] * Thread 2 | [========] * Thread 3 | [=================] * * If the next record from Thread 2 had been [8,10], for example, we should have * redacted part of that range, but because we updated Thread 1's record, we * missed it. * * We implement this algorithm by using two trees. The first sorts the * redaction records by their start_zb, and the second sorts them by their * end_zb. We use these to find the record with the last start and the record * with the first end. We create a record with that start and end, and send it * on. The overall runtime of this implementation is O(n log m), where n is the * total number of redaction records from all the different redaction snapshots, * and m is the number of redaction snapshots. * * If we redact with respect to zero snapshots, we create a redaction * record with the start object and blkid to 0, and the end object and blkid to * UINT64_MAX. This will result in us redacting every block. */ static int perform_thread_merge(bqueue_t *q, uint32_t num_threads, struct redact_thread_arg *thread_args, boolean_t *cancel) { struct redact_node *redact_nodes = NULL; avl_tree_t start_tree, end_tree; struct redact_record *record; struct redact_record *current_record = NULL; int err = 0; struct merge_data md = { {0} }; list_create(&md.md_redact_block_pending, sizeof (struct redact_block_list_node), offsetof(struct redact_block_list_node, node)); /* * If we're redacting with respect to zero snapshots, then no data is * permitted to be sent. We enqueue a record that redacts all blocks, * and an eos marker. */ if (num_threads == 0) { record = kmem_zalloc(sizeof (struct redact_record), KM_SLEEP); // We can't redact object 0, so don't try. record->start_object = 1; record->start_blkid = 0; record->end_object = record->end_blkid = UINT64_MAX; bqueue_enqueue(q, record, sizeof (*record)); return (0); } if (num_threads > 0) { redact_nodes = kmem_zalloc(num_threads * sizeof (*redact_nodes), KM_SLEEP); } avl_create(&start_tree, redact_node_compare_start, sizeof (struct redact_node), offsetof(struct redact_node, avl_node_start)); avl_create(&end_tree, redact_node_compare_end, sizeof (struct redact_node), offsetof(struct redact_node, avl_node_end)); for (int i = 0; i < num_threads; i++) { struct redact_node *node = &redact_nodes[i]; struct redact_thread_arg *targ = &thread_args[i]; node->record = bqueue_dequeue(&targ->q); node->rt_arg = targ; node->thread_num = i; avl_add(&start_tree, node); avl_add(&end_tree, node); } /* * Once the first record in the end tree has returned EOS, every record * must be an EOS record, so we should stop. */ while (err == 0 && !((struct redact_node *)avl_first(&end_tree))-> record->eos_marker) { if (*cancel) { err = EINTR; break; } struct redact_node *last_start = avl_last(&start_tree); struct redact_node *first_end = avl_first(&end_tree); /* * If the last start record is before the first end record, * then we have blocks that are redacted by all threads. * Therefore, we should redact them. Copy the record, and send * it to the main thread. */ if (redact_record_before(last_start->record, first_end->record)) { record = kmem_zalloc(sizeof (struct redact_record), KM_SLEEP); *record = *first_end->record; record->start_object = last_start->record->start_object; record->start_blkid = last_start->record->start_blkid; record_merge_enqueue(q, ¤t_record, record); } err = update_avl_trees(&start_tree, &end_tree, first_end); } /* * We're done; if we were cancelled, we need to cancel our workers and * clear out their queues. Either way, we need to remove every thread's * redact_node struct from the avl trees. */ for (int i = 0; i < num_threads; i++) { if (err != 0) { thread_args[i].cancel = B_TRUE; while (!redact_nodes[i].record->eos_marker) { (void) update_avl_trees(&start_tree, &end_tree, &redact_nodes[i]); } } avl_remove(&start_tree, &redact_nodes[i]); avl_remove(&end_tree, &redact_nodes[i]); kmem_free(redact_nodes[i].record, sizeof (struct redact_record)); } avl_destroy(&start_tree); avl_destroy(&end_tree); kmem_free(redact_nodes, num_threads * sizeof (*redact_nodes)); if (current_record != NULL) bqueue_enqueue(q, current_record, sizeof (current_record)); return (err); } struct redact_merge_thread_arg { bqueue_t q; spa_t *spa; int numsnaps; struct redact_thread_arg *thr_args; boolean_t cancel; int error_code; }; static void redact_merge_thread(void *arg) { struct redact_merge_thread_arg *rmta = arg; rmta->error_code = perform_thread_merge(&rmta->q, rmta->numsnaps, rmta->thr_args, &rmta->cancel); struct redact_record *rec = kmem_zalloc(sizeof (*rec), KM_SLEEP); rec->eos_marker = B_TRUE; bqueue_enqueue_flush(&rmta->q, rec, 1); thread_exit(); } /* * Find the next object in or after the redaction range passed in, and hold * its dnode with the provided tag. Also update *object to contain the new * object number. */ static int hold_next_object(objset_t *os, struct redact_record *rec, void *tag, uint64_t *object, dnode_t **dn) { int err = 0; if (*dn != NULL) dnode_rele(*dn, FTAG); *dn = NULL; if (*object < rec->start_object) { *object = rec->start_object - 1; } err = dmu_object_next(os, object, B_FALSE, 0); if (err != 0) return (err); err = dnode_hold(os, *object, tag, dn); while (err == 0 && (*object < rec->start_object || DMU_OT_IS_METADATA((*dn)->dn_type))) { dnode_rele(*dn, tag); *dn = NULL; err = dmu_object_next(os, object, B_FALSE, 0); if (err != 0) break; err = dnode_hold(os, *object, tag, dn); } return (err); } static int perform_redaction(objset_t *os, redaction_list_t *rl, struct redact_merge_thread_arg *rmta) { int err = 0; bqueue_t *q = &rmta->q; struct redact_record *rec = NULL; struct merge_data md = { {0} }; list_create(&md.md_redact_block_pending, sizeof (struct redact_block_list_node), offsetof(struct redact_block_list_node, node)); md.md_redaction_list = rl; for (int i = 0; i < TXG_SIZE; i++) { list_create(&md.md_blocks[i], sizeof (struct redact_block_list_node), offsetof(struct redact_block_list_node, node)); } dnode_t *dn = NULL; uint64_t prev_obj = 0; for (rec = bqueue_dequeue(q); !rec->eos_marker && err == 0; rec = get_next_redact_record(q, rec)) { ASSERT3U(rec->start_object, !=, 0); uint64_t object; if (prev_obj != rec->start_object) { object = rec->start_object - 1; err = hold_next_object(os, rec, FTAG, &object, &dn); } else { object = prev_obj; } while (err == 0 && object <= rec->end_object) { if (issig(JUSTLOOKING) && issig(FORREAL)) { err = EINTR; break; } /* * Part of the current object is contained somewhere in * the range covered by rec. */ uint64_t startblkid; uint64_t endblkid; uint64_t maxblkid = dn->dn_phys->dn_maxblkid; if (rec->start_object < object) startblkid = 0; else if (rec->start_blkid > maxblkid) break; else startblkid = rec->start_blkid; if (rec->end_object > object || rec->end_blkid > maxblkid) { endblkid = maxblkid; } else { endblkid = rec->end_blkid; } update_redaction_list(&md, os, object, startblkid, endblkid, dn->dn_datablksz); if (object == rec->end_object) break; err = hold_next_object(os, rec, FTAG, &object, &dn); } if (err == ESRCH) err = 0; if (dn != NULL) prev_obj = object; } if (err == 0 && dn != NULL) dnode_rele(dn, FTAG); if (err == ESRCH) err = 0; rmta->cancel = B_TRUE; while (!rec->eos_marker) rec = get_next_redact_record(q, rec); kmem_free(rec, sizeof (*rec)); /* * There may be a block that's being coalesced, sync that out before we * return. */ if (err == 0 && md.md_coalesce_block.rbp_size_count != 0) { struct redact_block_list_node *rbln = kmem_alloc(sizeof (struct redact_block_list_node), KM_SLEEP); rbln->block = md.md_coalesce_block; list_insert_tail(&md.md_redact_block_pending, rbln); } commit_rl_updates(os, &md, UINT64_MAX, UINT64_MAX); /* * Wait for all the redaction info to sync out before we return, so that * anyone who attempts to resume this redaction will have all the data * they need. */ dsl_pool_t *dp = spa_get_dsl(os->os_spa); if (md.md_latest_synctask_txg != 0) txg_wait_synced(dp, md.md_latest_synctask_txg); for (int i = 0; i < TXG_SIZE; i++) list_destroy(&md.md_blocks[i]); return (err); } static boolean_t redact_snaps_contains(uint64_t *snaps, uint64_t num_snaps, uint64_t guid) { for (int i = 0; i < num_snaps; i++) { if (snaps[i] == guid) return (B_TRUE); } return (B_FALSE); } int dmu_redact_snap(const char *snapname, nvlist_t *redactnvl, const char *redactbook) { int err = 0; dsl_pool_t *dp = NULL; dsl_dataset_t *ds = NULL; int numsnaps = 0; objset_t *os; struct redact_thread_arg *args = NULL; redaction_list_t *new_rl = NULL; if ((err = dsl_pool_hold(snapname, FTAG, &dp)) != 0) return (err); if ((err = dsl_dataset_hold_flags(dp, snapname, DS_HOLD_FLAG_DECRYPT, FTAG, &ds)) != 0) { goto out; } dsl_dataset_long_hold(ds, FTAG); if (!ds->ds_is_snapshot || dmu_objset_from_ds(ds, &os) != 0) { err = EINVAL; goto out; } if (dsl_dataset_feature_is_active(ds, SPA_FEATURE_REDACTED_DATASETS)) { err = EALREADY; goto out; } numsnaps = fnvlist_num_pairs(redactnvl); if (numsnaps > 0) args = kmem_zalloc(numsnaps * sizeof (*args), KM_SLEEP); nvpair_t *pair = NULL; for (int i = 0; i < numsnaps; i++) { pair = nvlist_next_nvpair(redactnvl, pair); const char *name = nvpair_name(pair); struct redact_thread_arg *rta = &args[i]; err = dsl_dataset_hold_flags(dp, name, DS_HOLD_FLAG_DECRYPT, FTAG, &rta->ds); if (err != 0) break; /* * We want to do the long hold before we can get any other * errors, because the cleanup code will release the long * hold if rta->ds is filled in. */ dsl_dataset_long_hold(rta->ds, FTAG); err = dmu_objset_from_ds(rta->ds, &rta->os); if (err != 0) break; if (!dsl_dataset_is_before(rta->ds, ds, 0)) { err = EINVAL; break; } if (dsl_dataset_feature_is_active(rta->ds, SPA_FEATURE_REDACTED_DATASETS)) { err = EALREADY; break; } } VERIFY3P(nvlist_next_nvpair(redactnvl, pair), ==, NULL); if (err != 0) goto out; boolean_t resuming = B_FALSE; char newredactbook[ZFS_MAX_DATASET_NAME_LEN]; zfs_bookmark_phys_t bookmark; (void) strlcpy(newredactbook, snapname, ZFS_MAX_DATASET_NAME_LEN); char *c = strchr(newredactbook, '@'); ASSERT3P(c, !=, NULL); int n = snprintf(c, ZFS_MAX_DATASET_NAME_LEN - (c - newredactbook), "#%s", redactbook); if (n >= ZFS_MAX_DATASET_NAME_LEN - (c - newredactbook)) { dsl_pool_rele(dp, FTAG); return (SET_ERROR(ENAMETOOLONG)); } err = dsl_bookmark_lookup(dp, newredactbook, NULL, &bookmark); if (err == 0) { resuming = B_TRUE; if (bookmark.zbm_redaction_obj == 0) { err = EEXIST; goto out; } err = dsl_redaction_list_hold_obj(dp, bookmark.zbm_redaction_obj, FTAG, &new_rl); if (err != 0) { err = EIO; goto out; } dsl_redaction_list_long_hold(dp, new_rl, FTAG); if (new_rl->rl_phys->rlp_num_snaps != numsnaps) { err = ESRCH; goto out; } for (int i = 0; i < numsnaps; i++) { struct redact_thread_arg *rta = &args[i]; if (!redact_snaps_contains(new_rl->rl_phys->rlp_snaps, new_rl->rl_phys->rlp_num_snaps, dsl_dataset_phys(rta->ds)->ds_guid)) { err = ESRCH; goto out; } } if (new_rl->rl_phys->rlp_last_blkid == UINT64_MAX && new_rl->rl_phys->rlp_last_object == UINT64_MAX) { err = EEXIST; goto out; } dsl_pool_rele(dp, FTAG); dp = NULL; } else { uint64_t *guids = NULL; if (numsnaps > 0) { guids = kmem_zalloc(numsnaps * sizeof (uint64_t), KM_SLEEP); } for (int i = 0; i < numsnaps; i++) { struct redact_thread_arg *rta = &args[i]; guids[i] = dsl_dataset_phys(rta->ds)->ds_guid; } dsl_pool_rele(dp, FTAG); dp = NULL; err = dsl_bookmark_create_redacted(newredactbook, snapname, numsnaps, guids, FTAG, &new_rl); kmem_free(guids, numsnaps * sizeof (uint64_t)); if (err != 0) { goto out; } } for (int i = 0; i < numsnaps; i++) { struct redact_thread_arg *rta = &args[i]; (void) bqueue_init(&rta->q, zfs_redact_queue_ff, zfs_redact_queue_length, offsetof(struct redact_record, ln)); if (resuming) { rta->resume.zb_blkid = new_rl->rl_phys->rlp_last_blkid; rta->resume.zb_object = new_rl->rl_phys->rlp_last_object; } rta->txg = dsl_dataset_phys(ds)->ds_creation_txg; (void) thread_create(NULL, 0, redact_traverse_thread, rta, 0, curproc, TS_RUN, minclsyspri); } struct redact_merge_thread_arg rmta = { { {0} } }; (void) bqueue_init(&rmta.q, zfs_redact_queue_ff, zfs_redact_queue_length, offsetof(struct redact_record, ln)); rmta.numsnaps = numsnaps; rmta.spa = os->os_spa; rmta.thr_args = args; (void) thread_create(NULL, 0, redact_merge_thread, &rmta, 0, curproc, TS_RUN, minclsyspri); err = perform_redaction(os, new_rl, &rmta); out: if (new_rl != NULL) { dsl_redaction_list_long_rele(new_rl, FTAG); dsl_redaction_list_rele(new_rl, FTAG); } for (int i = 0; i < numsnaps; i++) { struct redact_thread_arg *rta = &args[i]; /* * rta->ds may be NULL if we got an error while filling * it in. */ if (rta->ds != NULL) { dsl_dataset_long_rele(rta->ds, FTAG); dsl_dataset_rele_flags(rta->ds, DS_HOLD_FLAG_DECRYPT, FTAG); } } if (args != NULL) kmem_free(args, numsnaps * sizeof (*args)); if (dp != NULL) dsl_pool_rele(dp, FTAG); if (ds != NULL) { dsl_dataset_long_rele(ds, FTAG); dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); } return (SET_ERROR(err)); } diff --git a/module/zfs/dsl_synctask.c b/module/zfs/dsl_synctask.c index 2d6ca8549eb9..148e8fff2437 100644 --- a/module/zfs/dsl_synctask.c +++ b/module/zfs/dsl_synctask.c @@ -1,261 +1,257 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2017 by Delphix. All rights reserved. */ #include #include #include #include #include #include #define DST_AVG_BLKSHIFT 14 /* ARGSUSED */ static int dsl_null_checkfunc(void *arg, dmu_tx_t *tx) { return (0); } static int dsl_sync_task_common(const char *pool, dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc, dsl_sigfunc_t *sigfunc, void *arg, int blocks_modified, zfs_space_check_t space_check, boolean_t early) { spa_t *spa; dmu_tx_t *tx; int err; dsl_sync_task_t dst = { { { NULL } } }; dsl_pool_t *dp; err = spa_open(pool, &spa, FTAG); if (err != 0) return (err); dp = spa_get_dsl(spa); top: tx = dmu_tx_create_dd(dp->dp_mos_dir); VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); dst.dst_pool = dp; dst.dst_txg = dmu_tx_get_txg(tx); dst.dst_space = blocks_modified << DST_AVG_BLKSHIFT; dst.dst_space_check = space_check; dst.dst_checkfunc = checkfunc != NULL ? checkfunc : dsl_null_checkfunc; dst.dst_syncfunc = syncfunc; dst.dst_arg = arg; dst.dst_error = 0; dst.dst_nowaiter = B_FALSE; dsl_pool_config_enter(dp, FTAG); err = dst.dst_checkfunc(arg, tx); dsl_pool_config_exit(dp, FTAG); if (err != 0) { dmu_tx_commit(tx); spa_close(spa, FTAG); return (err); } txg_list_t *task_list = (early) ? &dp->dp_early_sync_tasks : &dp->dp_sync_tasks; VERIFY(txg_list_add_tail(task_list, &dst, dst.dst_txg)); dmu_tx_commit(tx); if (sigfunc != NULL && txg_wait_synced_sig(dp, dst.dst_txg)) { /* current contract is to call func once */ sigfunc(arg, tx); sigfunc = NULL; /* in case we're performing an EAGAIN retry */ } txg_wait_synced(dp, dst.dst_txg); if (dst.dst_error == EAGAIN) { txg_wait_synced(dp, dst.dst_txg + TXG_DEFER_SIZE); goto top; } spa_close(spa, FTAG); return (dst.dst_error); } /* * Called from open context to perform a callback in syncing context. Waits * for the operation to complete. * * The checkfunc will be called from open context as a preliminary check * which can quickly fail. If it succeeds, it will be called again from * syncing context. The checkfunc should generally be designed to work * properly in either context, but if necessary it can check * dmu_tx_is_syncing(tx). * * The synctask infrastructure enforces proper locking strategy with respect * to the dp_config_rwlock -- the lock will always be held when the callbacks * are called. It will be held for read during the open-context (preliminary) * call to the checkfunc, and then held for write from syncing context during * the calls to the check and sync funcs. * * A dataset or pool name can be passed as the first argument. Typically, * the check func will hold, check the return value of the hold, and then * release the dataset. The sync func will VERIFYO(hold()) the dataset. * This is safe because no changes can be made between the check and sync funcs, * and the sync func will only be called if the check func successfully opened * the dataset. */ int dsl_sync_task(const char *pool, dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc, void *arg, int blocks_modified, zfs_space_check_t space_check) { return (dsl_sync_task_common(pool, checkfunc, syncfunc, NULL, arg, blocks_modified, space_check, B_FALSE)); } /* * An early synctask works exactly as a standard synctask with one important * difference on the way it is handled during syncing context. Standard * synctasks run after we've written out all the dirty blocks of dirty * datasets. Early synctasks are executed before writing out any dirty data, * and thus before standard synctasks. * * For that reason, early synctasks can affect the process of writing dirty * changes to disk for the txg that they run and should be used with caution. * In addition, early synctasks should not dirty any metaslabs as this would * invalidate the precondition/invariant for subsequent early synctasks. * [see dsl_pool_sync() and dsl_early_sync_task_verify()] */ int dsl_early_sync_task(const char *pool, dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc, void *arg, int blocks_modified, zfs_space_check_t space_check) { return (dsl_sync_task_common(pool, checkfunc, syncfunc, NULL, arg, blocks_modified, space_check, B_TRUE)); } /* * A standard synctask that can be interrupted from a signal. The sigfunc * is called once if a signal occurred while waiting for the task to sync. */ int dsl_sync_task_sig(const char *pool, dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc, dsl_sigfunc_t *sigfunc, void *arg, int blocks_modified, zfs_space_check_t space_check) { return (dsl_sync_task_common(pool, checkfunc, syncfunc, sigfunc, arg, blocks_modified, space_check, B_FALSE)); } static void dsl_sync_task_nowait_common(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg, - int blocks_modified, zfs_space_check_t space_check, dmu_tx_t *tx, - boolean_t early) + dmu_tx_t *tx, boolean_t early) { dsl_sync_task_t *dst = kmem_zalloc(sizeof (*dst), KM_SLEEP); dst->dst_pool = dp; dst->dst_txg = dmu_tx_get_txg(tx); - dst->dst_space = blocks_modified << DST_AVG_BLKSHIFT; - dst->dst_space_check = space_check; + dst->dst_space_check = ZFS_SPACE_CHECK_NONE; dst->dst_checkfunc = dsl_null_checkfunc; dst->dst_syncfunc = syncfunc; dst->dst_arg = arg; dst->dst_error = 0; dst->dst_nowaiter = B_TRUE; txg_list_t *task_list = (early) ? &dp->dp_early_sync_tasks : &dp->dp_sync_tasks; VERIFY(txg_list_add_tail(task_list, dst, dst->dst_txg)); } void dsl_sync_task_nowait(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg, - int blocks_modified, zfs_space_check_t space_check, dmu_tx_t *tx) + dmu_tx_t *tx) { - dsl_sync_task_nowait_common(dp, syncfunc, arg, - blocks_modified, space_check, tx, B_FALSE); + dsl_sync_task_nowait_common(dp, syncfunc, arg, tx, B_FALSE); } void dsl_early_sync_task_nowait(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg, - int blocks_modified, zfs_space_check_t space_check, dmu_tx_t *tx) + dmu_tx_t *tx) { - dsl_sync_task_nowait_common(dp, syncfunc, arg, - blocks_modified, space_check, tx, B_TRUE); + dsl_sync_task_nowait_common(dp, syncfunc, arg, tx, B_TRUE); } /* * Called in syncing context to execute the synctask. */ void dsl_sync_task_sync(dsl_sync_task_t *dst, dmu_tx_t *tx) { dsl_pool_t *dp = dst->dst_pool; ASSERT0(dst->dst_error); /* * Check for sufficient space. * * When the sync task was created, the caller specified the * type of space checking required. See the comment in * zfs_space_check_t for details on the semantics of each * type of space checking. * * We just check against what's on-disk; we don't want any * in-flight accounting to get in our way, because open context * may have already used up various in-core limits * (arc_tempreserve, dsl_pool_tempreserve). */ if (dst->dst_space_check != ZFS_SPACE_CHECK_NONE) { uint64_t quota = dsl_pool_unreserved_space(dp, dst->dst_space_check); uint64_t used = dsl_dir_phys(dp->dp_root_dir)->dd_used_bytes; /* MOS space is triple-dittoed, so we multiply by 3. */ if (used + dst->dst_space * 3 > quota) { dst->dst_error = SET_ERROR(ENOSPC); if (dst->dst_nowaiter) kmem_free(dst, sizeof (*dst)); return; } } /* * Check for errors by calling checkfunc. */ rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); dst->dst_error = dst->dst_checkfunc(dst->dst_arg, tx); if (dst->dst_error == 0) dst->dst_syncfunc(dst->dst_arg, tx); rrw_exit(&dp->dp_config_rwlock, FTAG); if (dst->dst_nowaiter) kmem_free(dst, sizeof (*dst)); } #if defined(_KERNEL) EXPORT_SYMBOL(dsl_sync_task); EXPORT_SYMBOL(dsl_sync_task_nowait); #endif diff --git a/module/zfs/spa.c b/module/zfs/spa.c index aac469f44b59..31fa52d1d034 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -1,9756 +1,9755 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2019 by Delphix. All rights reserved. * Copyright (c) 2018, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Toomas Soome * Copyright (c) 2016 Actifio, Inc. All rights reserved. * Copyright 2018 Joyent, Inc. * Copyright (c) 2017, 2019, Datto Inc. All rights reserved. * Copyright 2017 Joyent, Inc. * Copyright (c) 2017, Intel Corporation. */ /* * SPA: Storage Pool Allocator * * This file contains all the routines used when modifying on-disk SPA state. * This includes opening, importing, destroying, exporting a pool, and syncing a * pool. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef _KERNEL #include #include #include #include #include #endif /* _KERNEL */ #include "zfs_prop.h" #include "zfs_comutil.h" /* * The interval, in seconds, at which failed configuration cache file writes * should be retried. */ int zfs_ccw_retry_interval = 300; typedef enum zti_modes { ZTI_MODE_FIXED, /* value is # of threads (min 1) */ ZTI_MODE_BATCH, /* cpu-intensive; value is ignored */ ZTI_MODE_NULL, /* don't create a taskq */ ZTI_NMODES } zti_modes_t; #define ZTI_P(n, q) { ZTI_MODE_FIXED, (n), (q) } #define ZTI_PCT(n) { ZTI_MODE_ONLINE_PERCENT, (n), 1 } #define ZTI_BATCH { ZTI_MODE_BATCH, 0, 1 } #define ZTI_NULL { ZTI_MODE_NULL, 0, 0 } #define ZTI_N(n) ZTI_P(n, 1) #define ZTI_ONE ZTI_N(1) typedef struct zio_taskq_info { zti_modes_t zti_mode; uint_t zti_value; uint_t zti_count; } zio_taskq_info_t; static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = { "iss", "iss_h", "int", "int_h" }; /* * This table defines the taskq settings for each ZFS I/O type. When * initializing a pool, we use this table to create an appropriately sized * taskq. Some operations are low volume and therefore have a small, static * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE * macros. Other operations process a large amount of data; the ZTI_BATCH * macro causes us to create a taskq oriented for throughput. Some operations * are so high frequency and short-lived that the taskq itself can become a * point of lock contention. The ZTI_P(#, #) macro indicates that we need an * additional degree of parallelism specified by the number of threads per- * taskq and the number of taskqs; when dispatching an event in this case, the * particular taskq is chosen at random. * * The different taskq priorities are to handle the different contexts (issue * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that * need to be handled with minimum delay. */ const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { /* ISSUE ISSUE_HIGH INTR INTR_HIGH */ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */ { ZTI_N(8), ZTI_NULL, ZTI_P(12, 8), ZTI_NULL }, /* READ */ { ZTI_BATCH, ZTI_N(5), ZTI_P(12, 8), ZTI_N(5) }, /* WRITE */ { ZTI_P(12, 8), ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FREE */ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* CLAIM */ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* IOCTL */ { ZTI_N(4), ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* TRIM */ }; static void spa_sync_version(void *arg, dmu_tx_t *tx); static void spa_sync_props(void *arg, dmu_tx_t *tx); static boolean_t spa_has_active_shared_spare(spa_t *spa); static int spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport); static void spa_vdev_resilver_done(spa_t *spa); uint_t zio_taskq_batch_pct = 75; /* 1 thread per cpu in pset */ boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */ uint_t zio_taskq_basedc = 80; /* base duty cycle */ boolean_t spa_create_process = B_TRUE; /* no process ==> no sysdc */ /* * Report any spa_load_verify errors found, but do not fail spa_load. * This is used by zdb to analyze non-idle pools. */ boolean_t spa_load_verify_dryrun = B_FALSE; /* * This (illegal) pool name is used when temporarily importing a spa_t in order * to get the vdev stats associated with the imported devices. */ #define TRYIMPORT_NAME "$import" /* * For debugging purposes: print out vdev tree during pool import. */ int spa_load_print_vdev_tree = B_FALSE; /* * A non-zero value for zfs_max_missing_tvds means that we allow importing * pools with missing top-level vdevs. This is strictly intended for advanced * pool recovery cases since missing data is almost inevitable. Pools with * missing devices can only be imported read-only for safety reasons, and their * fail-mode will be automatically set to "continue". * * With 1 missing vdev we should be able to import the pool and mount all * datasets. User data that was not modified after the missing device has been * added should be recoverable. This means that snapshots created prior to the * addition of that device should be completely intact. * * With 2 missing vdevs, some datasets may fail to mount since there are * dataset statistics that are stored as regular metadata. Some data might be * recoverable if those vdevs were added recently. * * With 3 or more missing vdevs, the pool is severely damaged and MOS entries * may be missing entirely. Chances of data recovery are very low. Note that * there are also risks of performing an inadvertent rewind as we might be * missing all the vdevs with the latest uberblocks. */ unsigned long zfs_max_missing_tvds = 0; /* * The parameters below are similar to zfs_max_missing_tvds but are only * intended for a preliminary open of the pool with an untrusted config which * might be incomplete or out-dated. * * We are more tolerant for pools opened from a cachefile since we could have * an out-dated cachefile where a device removal was not registered. * We could have set the limit arbitrarily high but in the case where devices * are really missing we would want to return the proper error codes; we chose * SPA_DVAS_PER_BP - 1 so that some copies of the MOS would still be available * and we get a chance to retrieve the trusted config. */ uint64_t zfs_max_missing_tvds_cachefile = SPA_DVAS_PER_BP - 1; /* * In the case where config was assembled by scanning device paths (/dev/dsks * by default) we are less tolerant since all the existing devices should have * been detected and we want spa_load to return the right error codes. */ uint64_t zfs_max_missing_tvds_scan = 0; /* * Debugging aid that pauses spa_sync() towards the end. */ boolean_t zfs_pause_spa_sync = B_FALSE; /* * Variables to indicate the livelist condense zthr func should wait at certain * points for the livelist to be removed - used to test condense/destroy races */ int zfs_livelist_condense_zthr_pause = 0; int zfs_livelist_condense_sync_pause = 0; /* * Variables to track whether or not condense cancellation has been * triggered in testing. */ int zfs_livelist_condense_sync_cancel = 0; int zfs_livelist_condense_zthr_cancel = 0; /* * Variable to track whether or not extra ALLOC blkptrs were added to a * livelist entry while it was being condensed (caused by the way we track * remapped blkptrs in dbuf_remap_impl) */ int zfs_livelist_condense_new_alloc = 0; /* * ========================================================================== * SPA properties routines * ========================================================================== */ /* * Add a (source=src, propname=propval) list to an nvlist. */ static void spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval, uint64_t intval, zprop_source_t src) { const char *propname = zpool_prop_to_name(prop); nvlist_t *propval; VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0); if (strval != NULL) VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0); else VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0); VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0); nvlist_free(propval); } /* * Get property values from the spa configuration. */ static void spa_prop_get_config(spa_t *spa, nvlist_t **nvp) { vdev_t *rvd = spa->spa_root_vdev; dsl_pool_t *pool = spa->spa_dsl_pool; uint64_t size, alloc, cap, version; const zprop_source_t src = ZPROP_SRC_NONE; spa_config_dirent_t *dp; metaslab_class_t *mc = spa_normal_class(spa); ASSERT(MUTEX_HELD(&spa->spa_props_lock)); if (rvd != NULL) { alloc = metaslab_class_get_alloc(mc); alloc += metaslab_class_get_alloc(spa_special_class(spa)); alloc += metaslab_class_get_alloc(spa_dedup_class(spa)); size = metaslab_class_get_space(mc); size += metaslab_class_get_space(spa_special_class(spa)); size += metaslab_class_get_space(spa_dedup_class(spa)); spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src); spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src); spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src); spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL, size - alloc, src); spa_prop_add_list(*nvp, ZPOOL_PROP_CHECKPOINT, NULL, spa->spa_checkpoint_info.sci_dspace, src); spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL, metaslab_class_fragmentation(mc), src); spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL, metaslab_class_expandable_space(mc), src); spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL, (spa_mode(spa) == SPA_MODE_READ), src); cap = (size == 0) ? 0 : (alloc * 100 / size); spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src); spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL, ddt_get_pool_dedup_ratio(spa), src); spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL, rvd->vdev_state, src); version = spa_version(spa); if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) { spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, ZPROP_SRC_DEFAULT); } else { spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, ZPROP_SRC_LOCAL); } spa_prop_add_list(*nvp, ZPOOL_PROP_LOAD_GUID, NULL, spa_load_guid(spa), src); } if (pool != NULL) { /* * The $FREE directory was introduced in SPA_VERSION_DEADLISTS, * when opening pools before this version freedir will be NULL. */ if (pool->dp_free_dir != NULL) { spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL, dsl_dir_phys(pool->dp_free_dir)->dd_used_bytes, src); } else { spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL, 0, src); } if (pool->dp_leak_dir != NULL) { spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL, dsl_dir_phys(pool->dp_leak_dir)->dd_used_bytes, src); } else { spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL, 0, src); } } spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src); if (spa->spa_comment != NULL) { spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment, 0, ZPROP_SRC_LOCAL); } if (spa->spa_root != NULL) spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root, 0, ZPROP_SRC_LOCAL); if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) { spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL, MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE); } else { spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL, SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE); } if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) { spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL, DNODE_MAX_SIZE, ZPROP_SRC_NONE); } else { spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL, DNODE_MIN_SIZE, ZPROP_SRC_NONE); } if ((dp = list_head(&spa->spa_config_list)) != NULL) { if (dp->scd_path == NULL) { spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, "none", 0, ZPROP_SRC_LOCAL); } else if (strcmp(dp->scd_path, spa_config_path) != 0) { spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, dp->scd_path, 0, ZPROP_SRC_LOCAL); } } } /* * Get zpool property values. */ int spa_prop_get(spa_t *spa, nvlist_t **nvp) { objset_t *mos = spa->spa_meta_objset; zap_cursor_t zc; zap_attribute_t za; dsl_pool_t *dp; int err; err = nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP); if (err) return (err); dp = spa_get_dsl(spa); dsl_pool_config_enter(dp, FTAG); mutex_enter(&spa->spa_props_lock); /* * Get properties from the spa config. */ spa_prop_get_config(spa, nvp); /* If no pool property object, no more prop to get. */ if (mos == NULL || spa->spa_pool_props_object == 0) goto out; /* * Get properties from the MOS pool property object. */ for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object); (err = zap_cursor_retrieve(&zc, &za)) == 0; zap_cursor_advance(&zc)) { uint64_t intval = 0; char *strval = NULL; zprop_source_t src = ZPROP_SRC_DEFAULT; zpool_prop_t prop; if ((prop = zpool_name_to_prop(za.za_name)) == ZPOOL_PROP_INVAL) continue; switch (za.za_integer_length) { case 8: /* integer property */ if (za.za_first_integer != zpool_prop_default_numeric(prop)) src = ZPROP_SRC_LOCAL; if (prop == ZPOOL_PROP_BOOTFS) { dsl_dataset_t *ds = NULL; err = dsl_dataset_hold_obj(dp, za.za_first_integer, FTAG, &ds); if (err != 0) break; strval = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); dsl_dataset_name(ds, strval); dsl_dataset_rele(ds, FTAG); } else { strval = NULL; intval = za.za_first_integer; } spa_prop_add_list(*nvp, prop, strval, intval, src); if (strval != NULL) kmem_free(strval, ZFS_MAX_DATASET_NAME_LEN); break; case 1: /* string property */ strval = kmem_alloc(za.za_num_integers, KM_SLEEP); err = zap_lookup(mos, spa->spa_pool_props_object, za.za_name, 1, za.za_num_integers, strval); if (err) { kmem_free(strval, za.za_num_integers); break; } spa_prop_add_list(*nvp, prop, strval, 0, src); kmem_free(strval, za.za_num_integers); break; default: break; } } zap_cursor_fini(&zc); out: mutex_exit(&spa->spa_props_lock); dsl_pool_config_exit(dp, FTAG); if (err && err != ENOENT) { nvlist_free(*nvp); *nvp = NULL; return (err); } return (0); } /* * Validate the given pool properties nvlist and modify the list * for the property values to be set. */ static int spa_prop_validate(spa_t *spa, nvlist_t *props) { nvpair_t *elem; int error = 0, reset_bootfs = 0; uint64_t objnum = 0; boolean_t has_feature = B_FALSE; elem = NULL; while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { uint64_t intval; char *strval, *slash, *check, *fname; const char *propname = nvpair_name(elem); zpool_prop_t prop = zpool_name_to_prop(propname); switch (prop) { case ZPOOL_PROP_INVAL: if (!zpool_prop_feature(propname)) { error = SET_ERROR(EINVAL); break; } /* * Sanitize the input. */ if (nvpair_type(elem) != DATA_TYPE_UINT64) { error = SET_ERROR(EINVAL); break; } if (nvpair_value_uint64(elem, &intval) != 0) { error = SET_ERROR(EINVAL); break; } if (intval != 0) { error = SET_ERROR(EINVAL); break; } fname = strchr(propname, '@') + 1; if (zfeature_lookup_name(fname, NULL) != 0) { error = SET_ERROR(EINVAL); break; } has_feature = B_TRUE; break; case ZPOOL_PROP_VERSION: error = nvpair_value_uint64(elem, &intval); if (!error && (intval < spa_version(spa) || intval > SPA_VERSION_BEFORE_FEATURES || has_feature)) error = SET_ERROR(EINVAL); break; case ZPOOL_PROP_DELEGATION: case ZPOOL_PROP_AUTOREPLACE: case ZPOOL_PROP_LISTSNAPS: case ZPOOL_PROP_AUTOEXPAND: case ZPOOL_PROP_AUTOTRIM: error = nvpair_value_uint64(elem, &intval); if (!error && intval > 1) error = SET_ERROR(EINVAL); break; case ZPOOL_PROP_MULTIHOST: error = nvpair_value_uint64(elem, &intval); if (!error && intval > 1) error = SET_ERROR(EINVAL); if (!error) { uint32_t hostid = zone_get_hostid(NULL); if (hostid) spa->spa_hostid = hostid; else error = SET_ERROR(ENOTSUP); } break; case ZPOOL_PROP_BOOTFS: /* * If the pool version is less than SPA_VERSION_BOOTFS, * or the pool is still being created (version == 0), * the bootfs property cannot be set. */ if (spa_version(spa) < SPA_VERSION_BOOTFS) { error = SET_ERROR(ENOTSUP); break; } /* * Make sure the vdev config is bootable */ if (!vdev_is_bootable(spa->spa_root_vdev)) { error = SET_ERROR(ENOTSUP); break; } reset_bootfs = 1; error = nvpair_value_string(elem, &strval); if (!error) { objset_t *os; if (strval == NULL || strval[0] == '\0') { objnum = zpool_prop_default_numeric( ZPOOL_PROP_BOOTFS); break; } error = dmu_objset_hold(strval, FTAG, &os); if (error != 0) break; /* Must be ZPL. */ if (dmu_objset_type(os) != DMU_OST_ZFS) { error = SET_ERROR(ENOTSUP); } else { objnum = dmu_objset_id(os); } dmu_objset_rele(os, FTAG); } break; case ZPOOL_PROP_FAILUREMODE: error = nvpair_value_uint64(elem, &intval); if (!error && intval > ZIO_FAILURE_MODE_PANIC) error = SET_ERROR(EINVAL); /* * This is a special case which only occurs when * the pool has completely failed. This allows * the user to change the in-core failmode property * without syncing it out to disk (I/Os might * currently be blocked). We do this by returning * EIO to the caller (spa_prop_set) to trick it * into thinking we encountered a property validation * error. */ if (!error && spa_suspended(spa)) { spa->spa_failmode = intval; error = SET_ERROR(EIO); } break; case ZPOOL_PROP_CACHEFILE: if ((error = nvpair_value_string(elem, &strval)) != 0) break; if (strval[0] == '\0') break; if (strcmp(strval, "none") == 0) break; if (strval[0] != '/') { error = SET_ERROR(EINVAL); break; } slash = strrchr(strval, '/'); ASSERT(slash != NULL); if (slash[1] == '\0' || strcmp(slash, "/.") == 0 || strcmp(slash, "/..") == 0) error = SET_ERROR(EINVAL); break; case ZPOOL_PROP_COMMENT: if ((error = nvpair_value_string(elem, &strval)) != 0) break; for (check = strval; *check != '\0'; check++) { if (!isprint(*check)) { error = SET_ERROR(EINVAL); break; } } if (strlen(strval) > ZPROP_MAX_COMMENT) error = SET_ERROR(E2BIG); break; default: break; } if (error) break; } (void) nvlist_remove_all(props, zpool_prop_to_name(ZPOOL_PROP_DEDUPDITTO)); if (!error && reset_bootfs) { error = nvlist_remove(props, zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING); if (!error) { error = nvlist_add_uint64(props, zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum); } } return (error); } void spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync) { char *cachefile; spa_config_dirent_t *dp; if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), &cachefile) != 0) return; dp = kmem_alloc(sizeof (spa_config_dirent_t), KM_SLEEP); if (cachefile[0] == '\0') dp->scd_path = spa_strdup(spa_config_path); else if (strcmp(cachefile, "none") == 0) dp->scd_path = NULL; else dp->scd_path = spa_strdup(cachefile); list_insert_head(&spa->spa_config_list, dp); if (need_sync) spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); } int spa_prop_set(spa_t *spa, nvlist_t *nvp) { int error; nvpair_t *elem = NULL; boolean_t need_sync = B_FALSE; if ((error = spa_prop_validate(spa, nvp)) != 0) return (error); while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) { zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem)); if (prop == ZPOOL_PROP_CACHEFILE || prop == ZPOOL_PROP_ALTROOT || prop == ZPOOL_PROP_READONLY) continue; if (prop == ZPOOL_PROP_VERSION || prop == ZPOOL_PROP_INVAL) { uint64_t ver; if (prop == ZPOOL_PROP_VERSION) { VERIFY(nvpair_value_uint64(elem, &ver) == 0); } else { ASSERT(zpool_prop_feature(nvpair_name(elem))); ver = SPA_VERSION_FEATURES; need_sync = B_TRUE; } /* Save time if the version is already set. */ if (ver == spa_version(spa)) continue; /* * In addition to the pool directory object, we might * create the pool properties object, the features for * read object, the features for write object, or the * feature descriptions object. */ error = dsl_sync_task(spa->spa_name, NULL, spa_sync_version, &ver, 6, ZFS_SPACE_CHECK_RESERVED); if (error) return (error); continue; } need_sync = B_TRUE; break; } if (need_sync) { return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props, nvp, 6, ZFS_SPACE_CHECK_RESERVED)); } return (0); } /* * If the bootfs property value is dsobj, clear it. */ void spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) { if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) { VERIFY(zap_remove(spa->spa_meta_objset, spa->spa_pool_props_object, zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0); spa->spa_bootfs = 0; } } /*ARGSUSED*/ static int spa_change_guid_check(void *arg, dmu_tx_t *tx) { uint64_t *newguid __maybe_unused = arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; vdev_t *rvd = spa->spa_root_vdev; uint64_t vdev_state; if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { int error = (spa_has_checkpoint(spa)) ? ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; return (SET_ERROR(error)); } spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); vdev_state = rvd->vdev_state; spa_config_exit(spa, SCL_STATE, FTAG); if (vdev_state != VDEV_STATE_HEALTHY) return (SET_ERROR(ENXIO)); ASSERT3U(spa_guid(spa), !=, *newguid); return (0); } static void spa_change_guid_sync(void *arg, dmu_tx_t *tx) { uint64_t *newguid = arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; uint64_t oldguid; vdev_t *rvd = spa->spa_root_vdev; oldguid = spa_guid(spa); spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); rvd->vdev_guid = *newguid; rvd->vdev_guid_sum += (*newguid - oldguid); vdev_config_dirty(rvd); spa_config_exit(spa, SCL_STATE, FTAG); spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu", (u_longlong_t)oldguid, (u_longlong_t)*newguid); } /* * Change the GUID for the pool. This is done so that we can later * re-import a pool built from a clone of our own vdevs. We will modify * the root vdev's guid, our own pool guid, and then mark all of our * vdevs dirty. Note that we must make sure that all our vdevs are * online when we do this, or else any vdevs that weren't present * would be orphaned from our pool. We are also going to issue a * sysevent to update any watchers. */ int spa_change_guid(spa_t *spa) { int error; uint64_t guid; mutex_enter(&spa->spa_vdev_top_lock); mutex_enter(&spa_namespace_lock); guid = spa_generate_guid(NULL); error = dsl_sync_task(spa->spa_name, spa_change_guid_check, spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED); if (error == 0) { spa_write_cachefile(spa, B_FALSE, B_TRUE); spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_REGUID); } mutex_exit(&spa_namespace_lock); mutex_exit(&spa->spa_vdev_top_lock); return (error); } /* * ========================================================================== * SPA state manipulation (open/create/destroy/import/export) * ========================================================================== */ static int spa_error_entry_compare(const void *a, const void *b) { const spa_error_entry_t *sa = (const spa_error_entry_t *)a; const spa_error_entry_t *sb = (const spa_error_entry_t *)b; int ret; ret = memcmp(&sa->se_bookmark, &sb->se_bookmark, sizeof (zbookmark_phys_t)); return (TREE_ISIGN(ret)); } /* * Utility function which retrieves copies of the current logs and * re-initializes them in the process. */ void spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) { ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t)); bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t)); avl_create(&spa->spa_errlist_scrub, spa_error_entry_compare, sizeof (spa_error_entry_t), offsetof(spa_error_entry_t, se_avl)); avl_create(&spa->spa_errlist_last, spa_error_entry_compare, sizeof (spa_error_entry_t), offsetof(spa_error_entry_t, se_avl)); } static void spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) { const zio_taskq_info_t *ztip = &zio_taskqs[t][q]; enum zti_modes mode = ztip->zti_mode; uint_t value = ztip->zti_value; uint_t count = ztip->zti_count; spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; uint_t flags = 0; boolean_t batch = B_FALSE; if (mode == ZTI_MODE_NULL) { tqs->stqs_count = 0; tqs->stqs_taskq = NULL; return; } ASSERT3U(count, >, 0); tqs->stqs_count = count; tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP); switch (mode) { case ZTI_MODE_FIXED: ASSERT3U(value, >=, 1); value = MAX(value, 1); flags |= TASKQ_DYNAMIC; break; case ZTI_MODE_BATCH: batch = B_TRUE; flags |= TASKQ_THREADS_CPU_PCT; value = MIN(zio_taskq_batch_pct, 100); break; default: panic("unrecognized mode for %s_%s taskq (%u:%u) in " "spa_activate()", zio_type_name[t], zio_taskq_types[q], mode, value); break; } for (uint_t i = 0; i < count; i++) { taskq_t *tq; char name[32]; (void) snprintf(name, sizeof (name), "%s_%s", zio_type_name[t], zio_taskq_types[q]); if (zio_taskq_sysdc && spa->spa_proc != &p0) { if (batch) flags |= TASKQ_DC_BATCH; tq = taskq_create_sysdc(name, value, 50, INT_MAX, spa->spa_proc, zio_taskq_basedc, flags); } else { pri_t pri = maxclsyspri; /* * The write issue taskq can be extremely CPU * intensive. Run it at slightly less important * priority than the other taskqs. Under Linux this * means incrementing the priority value on platforms * like illumos it should be decremented. */ if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE) pri++; tq = taskq_create_proc(name, value, pri, 50, INT_MAX, spa->spa_proc, flags); } tqs->stqs_taskq[i] = tq; } } static void spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q) { spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; if (tqs->stqs_taskq == NULL) { ASSERT3U(tqs->stqs_count, ==, 0); return; } for (uint_t i = 0; i < tqs->stqs_count; i++) { ASSERT3P(tqs->stqs_taskq[i], !=, NULL); taskq_destroy(tqs->stqs_taskq[i]); } kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *)); tqs->stqs_taskq = NULL; } /* * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority. * Note that a type may have multiple discrete taskqs to avoid lock contention * on the taskq itself. In that case we choose which taskq at random by using * the low bits of gethrtime(). */ void spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent) { spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; taskq_t *tq; ASSERT3P(tqs->stqs_taskq, !=, NULL); ASSERT3U(tqs->stqs_count, !=, 0); if (tqs->stqs_count == 1) { tq = tqs->stqs_taskq[0]; } else { tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count]; } taskq_dispatch_ent(tq, func, arg, flags, ent); } /* * Same as spa_taskq_dispatch_ent() but block on the task until completion. */ void spa_taskq_dispatch_sync(spa_t *spa, zio_type_t t, zio_taskq_type_t q, task_func_t *func, void *arg, uint_t flags) { spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; taskq_t *tq; taskqid_t id; ASSERT3P(tqs->stqs_taskq, !=, NULL); ASSERT3U(tqs->stqs_count, !=, 0); if (tqs->stqs_count == 1) { tq = tqs->stqs_taskq[0]; } else { tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count]; } id = taskq_dispatch(tq, func, arg, flags); if (id) taskq_wait_id(tq, id); } static void spa_create_zio_taskqs(spa_t *spa) { for (int t = 0; t < ZIO_TYPES; t++) { for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { spa_taskqs_init(spa, t, q); } } } /* * Disabled until spa_thread() can be adapted for Linux. */ #undef HAVE_SPA_THREAD #if defined(_KERNEL) && defined(HAVE_SPA_THREAD) static void spa_thread(void *arg) { psetid_t zio_taskq_psrset_bind = PS_NONE; callb_cpr_t cprinfo; spa_t *spa = arg; user_t *pu = PTOU(curproc); CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr, spa->spa_name); ASSERT(curproc != &p0); (void) snprintf(pu->u_psargs, sizeof (pu->u_psargs), "zpool-%s", spa->spa_name); (void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm)); /* bind this thread to the requested psrset */ if (zio_taskq_psrset_bind != PS_NONE) { pool_lock(); mutex_enter(&cpu_lock); mutex_enter(&pidlock); mutex_enter(&curproc->p_lock); if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind, 0, NULL, NULL) == 0) { curthread->t_bind_pset = zio_taskq_psrset_bind; } else { cmn_err(CE_WARN, "Couldn't bind process for zfs pool \"%s\" to " "pset %d\n", spa->spa_name, zio_taskq_psrset_bind); } mutex_exit(&curproc->p_lock); mutex_exit(&pidlock); mutex_exit(&cpu_lock); pool_unlock(); } if (zio_taskq_sysdc) { sysdc_thread_enter(curthread, 100, 0); } spa->spa_proc = curproc; spa->spa_did = curthread->t_did; spa_create_zio_taskqs(spa); mutex_enter(&spa->spa_proc_lock); ASSERT(spa->spa_proc_state == SPA_PROC_CREATED); spa->spa_proc_state = SPA_PROC_ACTIVE; cv_broadcast(&spa->spa_proc_cv); CALLB_CPR_SAFE_BEGIN(&cprinfo); while (spa->spa_proc_state == SPA_PROC_ACTIVE) cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock); ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE); spa->spa_proc_state = SPA_PROC_GONE; spa->spa_proc = &p0; cv_broadcast(&spa->spa_proc_cv); CALLB_CPR_EXIT(&cprinfo); /* drops spa_proc_lock */ mutex_enter(&curproc->p_lock); lwp_exit(); } #endif /* * Activate an uninitialized pool. */ static void spa_activate(spa_t *spa, spa_mode_t mode) { ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); spa->spa_state = POOL_STATE_ACTIVE; spa->spa_mode = mode; spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops); spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops); spa->spa_special_class = metaslab_class_create(spa, zfs_metaslab_ops); spa->spa_dedup_class = metaslab_class_create(spa, zfs_metaslab_ops); /* Try to create a covering process */ mutex_enter(&spa->spa_proc_lock); ASSERT(spa->spa_proc_state == SPA_PROC_NONE); ASSERT(spa->spa_proc == &p0); spa->spa_did = 0; #ifdef HAVE_SPA_THREAD /* Only create a process if we're going to be around a while. */ if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) { if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri, NULL, 0) == 0) { spa->spa_proc_state = SPA_PROC_CREATED; while (spa->spa_proc_state == SPA_PROC_CREATED) { cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); } ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); ASSERT(spa->spa_proc != &p0); ASSERT(spa->spa_did != 0); } else { #ifdef _KERNEL cmn_err(CE_WARN, "Couldn't create process for zfs pool \"%s\"\n", spa->spa_name); #endif } } #endif /* HAVE_SPA_THREAD */ mutex_exit(&spa->spa_proc_lock); /* If we didn't create a process, we need to create our taskqs. */ if (spa->spa_proc == &p0) { spa_create_zio_taskqs(spa); } for (size_t i = 0; i < TXG_SIZE; i++) { spa->spa_txg_zio[i] = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); } list_create(&spa->spa_config_dirty_list, sizeof (vdev_t), offsetof(vdev_t, vdev_config_dirty_node)); list_create(&spa->spa_evicting_os_list, sizeof (objset_t), offsetof(objset_t, os_evicting_node)); list_create(&spa->spa_state_dirty_list, sizeof (vdev_t), offsetof(vdev_t, vdev_state_dirty_node)); txg_list_create(&spa->spa_vdev_txg_list, spa, offsetof(struct vdev, vdev_txg_node)); avl_create(&spa->spa_errlist_scrub, spa_error_entry_compare, sizeof (spa_error_entry_t), offsetof(spa_error_entry_t, se_avl)); avl_create(&spa->spa_errlist_last, spa_error_entry_compare, sizeof (spa_error_entry_t), offsetof(spa_error_entry_t, se_avl)); spa_keystore_init(&spa->spa_keystore); /* * This taskq is used to perform zvol-minor-related tasks * asynchronously. This has several advantages, including easy * resolution of various deadlocks (zfsonlinux bug #3681). * * The taskq must be single threaded to ensure tasks are always * processed in the order in which they were dispatched. * * A taskq per pool allows one to keep the pools independent. * This way if one pool is suspended, it will not impact another. * * The preferred location to dispatch a zvol minor task is a sync * task. In this context, there is easy access to the spa_t and minimal * error handling is required because the sync task must succeed. */ spa->spa_zvol_taskq = taskq_create("z_zvol", 1, defclsyspri, 1, INT_MAX, 0); /* * Taskq dedicated to prefetcher threads: this is used to prevent the * pool traverse code from monopolizing the global (and limited) * system_taskq by inappropriately scheduling long running tasks on it. */ spa->spa_prefetch_taskq = taskq_create("z_prefetch", boot_ncpus, defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC); /* * The taskq to upgrade datasets in this pool. Currently used by * feature SPA_FEATURE_USEROBJ_ACCOUNTING/SPA_FEATURE_PROJECT_QUOTA. */ spa->spa_upgrade_taskq = taskq_create("z_upgrade", boot_ncpus, defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC); } /* * Opposite of spa_activate(). */ static void spa_deactivate(spa_t *spa) { ASSERT(spa->spa_sync_on == B_FALSE); ASSERT(spa->spa_dsl_pool == NULL); ASSERT(spa->spa_root_vdev == NULL); ASSERT(spa->spa_async_zio_root == NULL); ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); spa_evicting_os_wait(spa); if (spa->spa_zvol_taskq) { taskq_destroy(spa->spa_zvol_taskq); spa->spa_zvol_taskq = NULL; } if (spa->spa_prefetch_taskq) { taskq_destroy(spa->spa_prefetch_taskq); spa->spa_prefetch_taskq = NULL; } if (spa->spa_upgrade_taskq) { taskq_destroy(spa->spa_upgrade_taskq); spa->spa_upgrade_taskq = NULL; } txg_list_destroy(&spa->spa_vdev_txg_list); list_destroy(&spa->spa_config_dirty_list); list_destroy(&spa->spa_evicting_os_list); list_destroy(&spa->spa_state_dirty_list); taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); for (int t = 0; t < ZIO_TYPES; t++) { for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { spa_taskqs_fini(spa, t, q); } } for (size_t i = 0; i < TXG_SIZE; i++) { ASSERT3P(spa->spa_txg_zio[i], !=, NULL); VERIFY0(zio_wait(spa->spa_txg_zio[i])); spa->spa_txg_zio[i] = NULL; } metaslab_class_destroy(spa->spa_normal_class); spa->spa_normal_class = NULL; metaslab_class_destroy(spa->spa_log_class); spa->spa_log_class = NULL; metaslab_class_destroy(spa->spa_special_class); spa->spa_special_class = NULL; metaslab_class_destroy(spa->spa_dedup_class); spa->spa_dedup_class = NULL; /* * If this was part of an import or the open otherwise failed, we may * still have errors left in the queues. Empty them just in case. */ spa_errlog_drain(spa); avl_destroy(&spa->spa_errlist_scrub); avl_destroy(&spa->spa_errlist_last); spa_keystore_fini(&spa->spa_keystore); spa->spa_state = POOL_STATE_UNINITIALIZED; mutex_enter(&spa->spa_proc_lock); if (spa->spa_proc_state != SPA_PROC_NONE) { ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); spa->spa_proc_state = SPA_PROC_DEACTIVATE; cv_broadcast(&spa->spa_proc_cv); while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) { ASSERT(spa->spa_proc != &p0); cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); } ASSERT(spa->spa_proc_state == SPA_PROC_GONE); spa->spa_proc_state = SPA_PROC_NONE; } ASSERT(spa->spa_proc == &p0); mutex_exit(&spa->spa_proc_lock); /* * We want to make sure spa_thread() has actually exited the ZFS * module, so that the module can't be unloaded out from underneath * it. */ if (spa->spa_did != 0) { thread_join(spa->spa_did); spa->spa_did = 0; } } /* * Verify a pool configuration, and construct the vdev tree appropriately. This * will create all the necessary vdevs in the appropriate layout, with each vdev * in the CLOSED state. This will prep the pool before open/creation/import. * All vdev validation is done by the vdev_alloc() routine. */ int spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, int atype) { nvlist_t **child; uint_t children; int error; if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) return (error); if ((*vdp)->vdev_ops->vdev_op_leaf) return (0); error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children); if (error == ENOENT) return (0); if (error) { vdev_free(*vdp); *vdp = NULL; return (SET_ERROR(EINVAL)); } for (int c = 0; c < children; c++) { vdev_t *vd; if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, atype)) != 0) { vdev_free(*vdp); *vdp = NULL; return (error); } } ASSERT(*vdp != NULL); return (0); } static boolean_t spa_should_flush_logs_on_unload(spa_t *spa) { if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) return (B_FALSE); if (!spa_writeable(spa)) return (B_FALSE); if (!spa->spa_sync_on) return (B_FALSE); if (spa_state(spa) != POOL_STATE_EXPORTED) return (B_FALSE); if (zfs_keep_log_spacemaps_at_export) return (B_FALSE); return (B_TRUE); } /* * Opens a transaction that will set the flag that will instruct * spa_sync to attempt to flush all the metaslabs for that txg. */ static void spa_unload_log_sm_flush_all(spa_t *spa) { dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); ASSERT3U(spa->spa_log_flushall_txg, ==, 0); spa->spa_log_flushall_txg = dmu_tx_get_txg(tx); dmu_tx_commit(tx); txg_wait_synced(spa_get_dsl(spa), spa->spa_log_flushall_txg); } static void spa_unload_log_sm_metadata(spa_t *spa) { void *cookie = NULL; spa_log_sm_t *sls; while ((sls = avl_destroy_nodes(&spa->spa_sm_logs_by_txg, &cookie)) != NULL) { VERIFY0(sls->sls_mscount); kmem_free(sls, sizeof (spa_log_sm_t)); } for (log_summary_entry_t *e = list_head(&spa->spa_log_summary); e != NULL; e = list_head(&spa->spa_log_summary)) { VERIFY0(e->lse_mscount); list_remove(&spa->spa_log_summary, e); kmem_free(e, sizeof (log_summary_entry_t)); } spa->spa_unflushed_stats.sus_nblocks = 0; spa->spa_unflushed_stats.sus_memused = 0; spa->spa_unflushed_stats.sus_blocklimit = 0; } static void spa_destroy_aux_threads(spa_t *spa) { if (spa->spa_condense_zthr != NULL) { zthr_destroy(spa->spa_condense_zthr); spa->spa_condense_zthr = NULL; } if (spa->spa_checkpoint_discard_zthr != NULL) { zthr_destroy(spa->spa_checkpoint_discard_zthr); spa->spa_checkpoint_discard_zthr = NULL; } if (spa->spa_livelist_delete_zthr != NULL) { zthr_destroy(spa->spa_livelist_delete_zthr); spa->spa_livelist_delete_zthr = NULL; } if (spa->spa_livelist_condense_zthr != NULL) { zthr_destroy(spa->spa_livelist_condense_zthr); spa->spa_livelist_condense_zthr = NULL; } } /* * Opposite of spa_load(). */ static void spa_unload(spa_t *spa) { ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa_state(spa) != POOL_STATE_UNINITIALIZED); spa_import_progress_remove(spa_guid(spa)); spa_load_note(spa, "UNLOADING"); spa_wake_waiters(spa); /* * If the log space map feature is enabled and the pool is getting * exported (but not destroyed), we want to spend some time flushing * as many metaslabs as we can in an attempt to destroy log space * maps and save import time. */ if (spa_should_flush_logs_on_unload(spa)) spa_unload_log_sm_flush_all(spa); /* * Stop async tasks. */ spa_async_suspend(spa); if (spa->spa_root_vdev) { vdev_t *root_vdev = spa->spa_root_vdev; vdev_initialize_stop_all(root_vdev, VDEV_INITIALIZE_ACTIVE); vdev_trim_stop_all(root_vdev, VDEV_TRIM_ACTIVE); vdev_autotrim_stop_all(spa); vdev_rebuild_stop_all(spa); } /* * Stop syncing. */ if (spa->spa_sync_on) { txg_sync_stop(spa->spa_dsl_pool); spa->spa_sync_on = B_FALSE; } /* * This ensures that there is no async metaslab prefetching * while we attempt to unload the spa. */ if (spa->spa_root_vdev != NULL) { for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) { vdev_t *vc = spa->spa_root_vdev->vdev_child[c]; if (vc->vdev_mg != NULL) taskq_wait(vc->vdev_mg->mg_taskq); } } if (spa->spa_mmp.mmp_thread) mmp_thread_stop(spa); /* * Wait for any outstanding async I/O to complete. */ if (spa->spa_async_zio_root != NULL) { for (int i = 0; i < max_ncpus; i++) (void) zio_wait(spa->spa_async_zio_root[i]); kmem_free(spa->spa_async_zio_root, max_ncpus * sizeof (void *)); spa->spa_async_zio_root = NULL; } if (spa->spa_vdev_removal != NULL) { spa_vdev_removal_destroy(spa->spa_vdev_removal); spa->spa_vdev_removal = NULL; } spa_destroy_aux_threads(spa); spa_condense_fini(spa); bpobj_close(&spa->spa_deferred_bpobj); spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); /* * Close all vdevs. */ if (spa->spa_root_vdev) vdev_free(spa->spa_root_vdev); ASSERT(spa->spa_root_vdev == NULL); /* * Close the dsl pool. */ if (spa->spa_dsl_pool) { dsl_pool_close(spa->spa_dsl_pool); spa->spa_dsl_pool = NULL; spa->spa_meta_objset = NULL; } ddt_unload(spa); spa_unload_log_sm_metadata(spa); /* * Drop and purge level 2 cache */ spa_l2cache_drop(spa); for (int i = 0; i < spa->spa_spares.sav_count; i++) vdev_free(spa->spa_spares.sav_vdevs[i]); if (spa->spa_spares.sav_vdevs) { kmem_free(spa->spa_spares.sav_vdevs, spa->spa_spares.sav_count * sizeof (void *)); spa->spa_spares.sav_vdevs = NULL; } if (spa->spa_spares.sav_config) { nvlist_free(spa->spa_spares.sav_config); spa->spa_spares.sav_config = NULL; } spa->spa_spares.sav_count = 0; for (int i = 0; i < spa->spa_l2cache.sav_count; i++) { vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]); vdev_free(spa->spa_l2cache.sav_vdevs[i]); } if (spa->spa_l2cache.sav_vdevs) { kmem_free(spa->spa_l2cache.sav_vdevs, spa->spa_l2cache.sav_count * sizeof (void *)); spa->spa_l2cache.sav_vdevs = NULL; } if (spa->spa_l2cache.sav_config) { nvlist_free(spa->spa_l2cache.sav_config); spa->spa_l2cache.sav_config = NULL; } spa->spa_l2cache.sav_count = 0; spa->spa_async_suspended = 0; spa->spa_indirect_vdevs_loaded = B_FALSE; if (spa->spa_comment != NULL) { spa_strfree(spa->spa_comment); spa->spa_comment = NULL; } spa_config_exit(spa, SCL_ALL, spa); } /* * Load (or re-load) the current list of vdevs describing the active spares for * this pool. When this is called, we have some form of basic information in * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and * then re-generate a more complete list including status information. */ void spa_load_spares(spa_t *spa) { nvlist_t **spares; uint_t nspares; int i; vdev_t *vd, *tvd; #ifndef _KERNEL /* * zdb opens both the current state of the pool and the * checkpointed state (if present), with a different spa_t. * * As spare vdevs are shared among open pools, we skip loading * them when we load the checkpointed state of the pool. */ if (!spa_writeable(spa)) return; #endif ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); /* * First, close and free any existing spare vdevs. */ for (i = 0; i < spa->spa_spares.sav_count; i++) { vd = spa->spa_spares.sav_vdevs[i]; /* Undo the call to spa_activate() below */ if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, B_FALSE)) != NULL && tvd->vdev_isspare) spa_spare_remove(tvd); vdev_close(vd); vdev_free(vd); } if (spa->spa_spares.sav_vdevs) kmem_free(spa->spa_spares.sav_vdevs, spa->spa_spares.sav_count * sizeof (void *)); if (spa->spa_spares.sav_config == NULL) nspares = 0; else VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); spa->spa_spares.sav_count = (int)nspares; spa->spa_spares.sav_vdevs = NULL; if (nspares == 0) return; /* * Construct the array of vdevs, opening them to get status in the * process. For each spare, there is potentially two different vdev_t * structures associated with it: one in the list of spares (used only * for basic validation purposes) and one in the active vdev * configuration (if it's spared in). During this phase we open and * validate each vdev on the spare list. If the vdev also exists in the * active configuration, then we also mark this vdev as an active spare. */ spa->spa_spares.sav_vdevs = kmem_zalloc(nspares * sizeof (void *), KM_SLEEP); for (i = 0; i < spa->spa_spares.sav_count; i++) { VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0, VDEV_ALLOC_SPARE) == 0); ASSERT(vd != NULL); spa->spa_spares.sav_vdevs[i] = vd; if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, B_FALSE)) != NULL) { if (!tvd->vdev_isspare) spa_spare_add(tvd); /* * We only mark the spare active if we were successfully * able to load the vdev. Otherwise, importing a pool * with a bad active spare would result in strange * behavior, because multiple pool would think the spare * is actively in use. * * There is a vulnerability here to an equally bizarre * circumstance, where a dead active spare is later * brought back to life (onlined or otherwise). Given * the rarity of this scenario, and the extra complexity * it adds, we ignore the possibility. */ if (!vdev_is_dead(tvd)) spa_spare_activate(tvd); } vd->vdev_top = vd; vd->vdev_aux = &spa->spa_spares; if (vdev_open(vd) != 0) continue; if (vdev_validate_aux(vd) == 0) spa_spare_add(vd); } /* * Recompute the stashed list of spares, with status information * this time. */ VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *), KM_SLEEP); for (i = 0; i < spa->spa_spares.sav_count; i++) spares[i] = vdev_config_generate(spa, spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE); VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0); for (i = 0; i < spa->spa_spares.sav_count; i++) nvlist_free(spares[i]); kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *)); } /* * Load (or re-load) the current list of vdevs describing the active l2cache for * this pool. When this is called, we have some form of basic information in * 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and * then re-generate a more complete list including status information. * Devices which are already active have their details maintained, and are * not re-opened. */ void spa_load_l2cache(spa_t *spa) { nvlist_t **l2cache = NULL; uint_t nl2cache; int i, j, oldnvdevs; uint64_t guid; vdev_t *vd, **oldvdevs, **newvdevs; spa_aux_vdev_t *sav = &spa->spa_l2cache; #ifndef _KERNEL /* * zdb opens both the current state of the pool and the * checkpointed state (if present), with a different spa_t. * * As L2 caches are part of the ARC which is shared among open * pools, we skip loading them when we load the checkpointed * state of the pool. */ if (!spa_writeable(spa)) return; #endif ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); oldvdevs = sav->sav_vdevs; oldnvdevs = sav->sav_count; sav->sav_vdevs = NULL; sav->sav_count = 0; if (sav->sav_config == NULL) { nl2cache = 0; newvdevs = NULL; goto out; } VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP); /* * Process new nvlist of vdevs. */ for (i = 0; i < nl2cache; i++) { VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID, &guid) == 0); newvdevs[i] = NULL; for (j = 0; j < oldnvdevs; j++) { vd = oldvdevs[j]; if (vd != NULL && guid == vd->vdev_guid) { /* * Retain previous vdev for add/remove ops. */ newvdevs[i] = vd; oldvdevs[j] = NULL; break; } } if (newvdevs[i] == NULL) { /* * Create new vdev */ VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0, VDEV_ALLOC_L2CACHE) == 0); ASSERT(vd != NULL); newvdevs[i] = vd; /* * Commit this vdev as an l2cache device, * even if it fails to open. */ spa_l2cache_add(vd); vd->vdev_top = vd; vd->vdev_aux = sav; spa_l2cache_activate(vd); if (vdev_open(vd) != 0) continue; (void) vdev_validate_aux(vd); if (!vdev_is_dead(vd)) l2arc_add_vdev(spa, vd); /* * Upon cache device addition to a pool or pool * creation with a cache device or if the header * of the device is invalid we issue an async * TRIM command for the whole device which will * execute if l2arc_trim_ahead > 0. */ spa_async_request(spa, SPA_ASYNC_L2CACHE_TRIM); } } sav->sav_vdevs = newvdevs; sav->sav_count = (int)nl2cache; /* * Recompute the stashed list of l2cache devices, with status * information this time. */ VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); if (sav->sav_count > 0) l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); for (i = 0; i < sav->sav_count; i++) l2cache[i] = vdev_config_generate(spa, sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE); VERIFY(nvlist_add_nvlist_array(sav->sav_config, ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0); out: /* * Purge vdevs that were dropped */ for (i = 0; i < oldnvdevs; i++) { uint64_t pool; vd = oldvdevs[i]; if (vd != NULL) { ASSERT(vd->vdev_isl2cache); if (spa_l2cache_exists(vd->vdev_guid, &pool) && pool != 0ULL && l2arc_vdev_present(vd)) l2arc_remove_vdev(vd); vdev_clear_stats(vd); vdev_free(vd); } } if (oldvdevs) kmem_free(oldvdevs, oldnvdevs * sizeof (void *)); for (i = 0; i < sav->sav_count; i++) nvlist_free(l2cache[i]); if (sav->sav_count) kmem_free(l2cache, sav->sav_count * sizeof (void *)); } static int load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) { dmu_buf_t *db; char *packed = NULL; size_t nvsize = 0; int error; *value = NULL; error = dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db); if (error) return (error); nvsize = *(uint64_t *)db->db_data; dmu_buf_rele(db, FTAG); packed = vmem_alloc(nvsize, KM_SLEEP); error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed, DMU_READ_PREFETCH); if (error == 0) error = nvlist_unpack(packed, nvsize, value, 0); vmem_free(packed, nvsize); return (error); } /* * Concrete top-level vdevs that are not missing and are not logs. At every * spa_sync we write new uberblocks to at least SPA_SYNC_MIN_VDEVS core tvds. */ static uint64_t spa_healthy_core_tvds(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; uint64_t tvds = 0; for (uint64_t i = 0; i < rvd->vdev_children; i++) { vdev_t *vd = rvd->vdev_child[i]; if (vd->vdev_islog) continue; if (vdev_is_concrete(vd) && !vdev_is_dead(vd)) tvds++; } return (tvds); } /* * Checks to see if the given vdev could not be opened, in which case we post a * sysevent to notify the autoreplace code that the device has been removed. */ static void spa_check_removed(vdev_t *vd) { for (uint64_t c = 0; c < vd->vdev_children; c++) spa_check_removed(vd->vdev_child[c]); if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) && vdev_is_concrete(vd)) { zfs_post_autoreplace(vd->vdev_spa, vd); spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_CHECK); } } static int spa_check_for_missing_logs(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; /* * If we're doing a normal import, then build up any additional * diagnostic information about missing log devices. * We'll pass this up to the user for further processing. */ if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) { nvlist_t **child, *nv; uint64_t idx = 0; child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t *), KM_SLEEP); VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); for (uint64_t c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; /* * We consider a device as missing only if it failed * to open (i.e. offline or faulted is not considered * as missing). */ if (tvd->vdev_islog && tvd->vdev_state == VDEV_STATE_CANT_OPEN) { child[idx++] = vdev_config_generate(spa, tvd, B_FALSE, VDEV_CONFIG_MISSING); } } if (idx > 0) { fnvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, child, idx); fnvlist_add_nvlist(spa->spa_load_info, ZPOOL_CONFIG_MISSING_DEVICES, nv); for (uint64_t i = 0; i < idx; i++) nvlist_free(child[i]); } nvlist_free(nv); kmem_free(child, rvd->vdev_children * sizeof (char **)); if (idx > 0) { spa_load_failed(spa, "some log devices are missing"); vdev_dbgmsg_print_tree(rvd, 2); return (SET_ERROR(ENXIO)); } } else { for (uint64_t c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; if (tvd->vdev_islog && tvd->vdev_state == VDEV_STATE_CANT_OPEN) { spa_set_log_state(spa, SPA_LOG_CLEAR); spa_load_note(spa, "some log devices are " "missing, ZIL is dropped."); vdev_dbgmsg_print_tree(rvd, 2); break; } } } return (0); } /* * Check for missing log devices */ static boolean_t spa_check_logs(spa_t *spa) { boolean_t rv = B_FALSE; dsl_pool_t *dp = spa_get_dsl(spa); switch (spa->spa_log_state) { default: break; case SPA_LOG_MISSING: /* need to recheck in case slog has been restored */ case SPA_LOG_UNKNOWN: rv = (dmu_objset_find_dp(dp, dp->dp_root_dir_obj, zil_check_log_chain, NULL, DS_FIND_CHILDREN) != 0); if (rv) spa_set_log_state(spa, SPA_LOG_MISSING); break; } return (rv); } static boolean_t spa_passivate_log(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; boolean_t slog_found = B_FALSE; ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); if (!spa_has_slogs(spa)) return (B_FALSE); for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; metaslab_group_t *mg = tvd->vdev_mg; if (tvd->vdev_islog) { metaslab_group_passivate(mg); slog_found = B_TRUE; } } return (slog_found); } static void spa_activate_log(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; metaslab_group_t *mg = tvd->vdev_mg; if (tvd->vdev_islog) metaslab_group_activate(mg); } } int spa_reset_logs(spa_t *spa) { int error; error = dmu_objset_find(spa_name(spa), zil_reset, NULL, DS_FIND_CHILDREN); if (error == 0) { /* * We successfully offlined the log device, sync out the * current txg so that the "stubby" block can be removed * by zil_sync(). */ txg_wait_synced(spa->spa_dsl_pool, 0); } return (error); } static void spa_aux_check_removed(spa_aux_vdev_t *sav) { for (int i = 0; i < sav->sav_count; i++) spa_check_removed(sav->sav_vdevs[i]); } void spa_claim_notify(zio_t *zio) { spa_t *spa = zio->io_spa; if (zio->io_error) return; mutex_enter(&spa->spa_props_lock); /* any mutex will do */ if (spa->spa_claim_max_txg < zio->io_bp->blk_birth) spa->spa_claim_max_txg = zio->io_bp->blk_birth; mutex_exit(&spa->spa_props_lock); } typedef struct spa_load_error { uint64_t sle_meta_count; uint64_t sle_data_count; } spa_load_error_t; static void spa_load_verify_done(zio_t *zio) { blkptr_t *bp = zio->io_bp; spa_load_error_t *sle = zio->io_private; dmu_object_type_t type = BP_GET_TYPE(bp); int error = zio->io_error; spa_t *spa = zio->io_spa; abd_free(zio->io_abd); if (error) { if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) && type != DMU_OT_INTENT_LOG) atomic_inc_64(&sle->sle_meta_count); else atomic_inc_64(&sle->sle_data_count); } mutex_enter(&spa->spa_scrub_lock); spa->spa_load_verify_bytes -= BP_GET_PSIZE(bp); cv_broadcast(&spa->spa_scrub_io_cv); mutex_exit(&spa->spa_scrub_lock); } /* * Maximum number of inflight bytes is the log2 fraction of the arc size. * By default, we set it to 1/16th of the arc. */ int spa_load_verify_shift = 4; int spa_load_verify_metadata = B_TRUE; int spa_load_verify_data = B_TRUE; /*ARGSUSED*/ static int spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) { if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp) || BP_IS_REDACTED(bp)) return (0); /* * Note: normally this routine will not be called if * spa_load_verify_metadata is not set. However, it may be useful * to manually set the flag after the traversal has begun. */ if (!spa_load_verify_metadata) return (0); if (!BP_IS_METADATA(bp) && !spa_load_verify_data) return (0); uint64_t maxinflight_bytes = arc_target_bytes() >> spa_load_verify_shift; zio_t *rio = arg; size_t size = BP_GET_PSIZE(bp); mutex_enter(&spa->spa_scrub_lock); while (spa->spa_load_verify_bytes >= maxinflight_bytes) cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); spa->spa_load_verify_bytes += size; mutex_exit(&spa->spa_scrub_lock); zio_nowait(zio_read(rio, spa, bp, abd_alloc_for_io(size, B_FALSE), size, spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB, ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb)); return (0); } /* ARGSUSED */ static int verify_dataset_name_len(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) { if (dsl_dataset_namelen(ds) >= ZFS_MAX_DATASET_NAME_LEN) return (SET_ERROR(ENAMETOOLONG)); return (0); } static int spa_load_verify(spa_t *spa) { zio_t *rio; spa_load_error_t sle = { 0 }; zpool_load_policy_t policy; boolean_t verify_ok = B_FALSE; int error = 0; zpool_get_load_policy(spa->spa_config, &policy); if (policy.zlp_rewind & ZPOOL_NEVER_REWIND) return (0); dsl_pool_config_enter(spa->spa_dsl_pool, FTAG); error = dmu_objset_find_dp(spa->spa_dsl_pool, spa->spa_dsl_pool->dp_root_dir_obj, verify_dataset_name_len, NULL, DS_FIND_CHILDREN); dsl_pool_config_exit(spa->spa_dsl_pool, FTAG); if (error != 0) return (error); rio = zio_root(spa, NULL, &sle, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); if (spa_load_verify_metadata) { if (spa->spa_extreme_rewind) { spa_load_note(spa, "performing a complete scan of the " "pool since extreme rewind is on. This may take " "a very long time.\n (spa_load_verify_data=%u, " "spa_load_verify_metadata=%u)", spa_load_verify_data, spa_load_verify_metadata); } error = traverse_pool(spa, spa->spa_verify_min_txg, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | TRAVERSE_NO_DECRYPT, spa_load_verify_cb, rio); } (void) zio_wait(rio); ASSERT0(spa->spa_load_verify_bytes); spa->spa_load_meta_errors = sle.sle_meta_count; spa->spa_load_data_errors = sle.sle_data_count; if (sle.sle_meta_count != 0 || sle.sle_data_count != 0) { spa_load_note(spa, "spa_load_verify found %llu metadata errors " "and %llu data errors", (u_longlong_t)sle.sle_meta_count, (u_longlong_t)sle.sle_data_count); } if (spa_load_verify_dryrun || (!error && sle.sle_meta_count <= policy.zlp_maxmeta && sle.sle_data_count <= policy.zlp_maxdata)) { int64_t loss = 0; verify_ok = B_TRUE; spa->spa_load_txg = spa->spa_uberblock.ub_txg; spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp; loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts; VERIFY(nvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0); VERIFY(nvlist_add_int64(spa->spa_load_info, ZPOOL_CONFIG_REWIND_TIME, loss) == 0); VERIFY(nvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0); } else { spa->spa_load_max_txg = spa->spa_uberblock.ub_txg; } if (spa_load_verify_dryrun) return (0); if (error) { if (error != ENXIO && error != EIO) error = SET_ERROR(EIO); return (error); } return (verify_ok ? 0 : EIO); } /* * Find a value in the pool props object. */ static void spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val) { (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object, zpool_prop_to_name(prop), sizeof (uint64_t), 1, val); } /* * Find a value in the pool directory object. */ static int spa_dir_prop(spa_t *spa, const char *name, uint64_t *val, boolean_t log_enoent) { int error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, name, sizeof (uint64_t), 1, val); if (error != 0 && (error != ENOENT || log_enoent)) { spa_load_failed(spa, "couldn't get '%s' value in MOS directory " "[error=%d]", name, error); } return (error); } static int spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err) { vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux); return (SET_ERROR(err)); } boolean_t spa_livelist_delete_check(spa_t *spa) { return (spa->spa_livelists_to_delete != 0); } /* ARGSUSED */ static boolean_t spa_livelist_delete_cb_check(void *arg, zthr_t *z) { spa_t *spa = arg; return (spa_livelist_delete_check(spa)); } static int delete_blkptr_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { spa_t *spa = arg; zio_free(spa, tx->tx_txg, bp); dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD, -bp_get_dsize_sync(spa, bp), -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx); return (0); } static int dsl_get_next_livelist_obj(objset_t *os, uint64_t zap_obj, uint64_t *llp) { int err; zap_cursor_t zc; zap_attribute_t za; zap_cursor_init(&zc, os, zap_obj); err = zap_cursor_retrieve(&zc, &za); zap_cursor_fini(&zc); if (err == 0) *llp = za.za_first_integer; return (err); } /* * Components of livelist deletion that must be performed in syncing * context: freeing block pointers and updating the pool-wide data * structures to indicate how much work is left to do */ typedef struct sublist_delete_arg { spa_t *spa; dsl_deadlist_t *ll; uint64_t key; bplist_t *to_free; } sublist_delete_arg_t; static void sublist_delete_sync(void *arg, dmu_tx_t *tx) { sublist_delete_arg_t *sda = arg; spa_t *spa = sda->spa; dsl_deadlist_t *ll = sda->ll; uint64_t key = sda->key; bplist_t *to_free = sda->to_free; bplist_iterate(to_free, delete_blkptr_cb, spa, tx); dsl_deadlist_remove_entry(ll, key, tx); } typedef struct livelist_delete_arg { spa_t *spa; uint64_t ll_obj; uint64_t zap_obj; } livelist_delete_arg_t; static void livelist_delete_sync(void *arg, dmu_tx_t *tx) { livelist_delete_arg_t *lda = arg; spa_t *spa = lda->spa; uint64_t ll_obj = lda->ll_obj; uint64_t zap_obj = lda->zap_obj; objset_t *mos = spa->spa_meta_objset; uint64_t count; /* free the livelist and decrement the feature count */ VERIFY0(zap_remove_int(mos, zap_obj, ll_obj, tx)); dsl_deadlist_free(mos, ll_obj, tx); spa_feature_decr(spa, SPA_FEATURE_LIVELIST, tx); VERIFY0(zap_count(mos, zap_obj, &count)); if (count == 0) { /* no more livelists to delete */ VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DELETED_CLONES, tx)); VERIFY0(zap_destroy(mos, zap_obj, tx)); spa->spa_livelists_to_delete = 0; spa_notify_waiters(spa); } } /* * Load in the value for the livelist to be removed and open it. Then, * load its first sublist and determine which block pointers should actually * be freed. Then, call a synctask which performs the actual frees and updates * the pool-wide livelist data. */ /* ARGSUSED */ static void spa_livelist_delete_cb(void *arg, zthr_t *z) { spa_t *spa = arg; uint64_t ll_obj = 0, count; objset_t *mos = spa->spa_meta_objset; uint64_t zap_obj = spa->spa_livelists_to_delete; /* * Determine the next livelist to delete. This function should only * be called if there is at least one deleted clone. */ VERIFY0(dsl_get_next_livelist_obj(mos, zap_obj, &ll_obj)); VERIFY0(zap_count(mos, ll_obj, &count)); if (count > 0) { dsl_deadlist_t ll = { 0 }; dsl_deadlist_entry_t *dle; bplist_t to_free; dsl_deadlist_open(&ll, mos, ll_obj); dle = dsl_deadlist_first(&ll); ASSERT3P(dle, !=, NULL); bplist_create(&to_free); int err = dsl_process_sub_livelist(&dle->dle_bpobj, &to_free, z, NULL); if (err == 0) { sublist_delete_arg_t sync_arg = { .spa = spa, .ll = &ll, .key = dle->dle_mintxg, .to_free = &to_free }; zfs_dbgmsg("deleting sublist (id %llu) from" " livelist %llu, %d remaining", dle->dle_bpobj.bpo_object, ll_obj, count - 1); VERIFY0(dsl_sync_task(spa_name(spa), NULL, sublist_delete_sync, &sync_arg, 0, ZFS_SPACE_CHECK_DESTROY)); } else { VERIFY3U(err, ==, EINTR); } bplist_clear(&to_free); bplist_destroy(&to_free); dsl_deadlist_close(&ll); } else { livelist_delete_arg_t sync_arg = { .spa = spa, .ll_obj = ll_obj, .zap_obj = zap_obj }; zfs_dbgmsg("deletion of livelist %llu completed", ll_obj); VERIFY0(dsl_sync_task(spa_name(spa), NULL, livelist_delete_sync, &sync_arg, 0, ZFS_SPACE_CHECK_DESTROY)); } } static void spa_start_livelist_destroy_thread(spa_t *spa) { ASSERT3P(spa->spa_livelist_delete_zthr, ==, NULL); spa->spa_livelist_delete_zthr = zthr_create("z_livelist_destroy", spa_livelist_delete_cb_check, spa_livelist_delete_cb, spa); } typedef struct livelist_new_arg { bplist_t *allocs; bplist_t *frees; } livelist_new_arg_t; static int livelist_track_new_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) { ASSERT(tx == NULL); livelist_new_arg_t *lna = arg; if (bp_freed) { bplist_append(lna->frees, bp); } else { bplist_append(lna->allocs, bp); zfs_livelist_condense_new_alloc++; } return (0); } typedef struct livelist_condense_arg { spa_t *spa; bplist_t to_keep; uint64_t first_size; uint64_t next_size; } livelist_condense_arg_t; static void spa_livelist_condense_sync(void *arg, dmu_tx_t *tx) { livelist_condense_arg_t *lca = arg; spa_t *spa = lca->spa; bplist_t new_frees; dsl_dataset_t *ds = spa->spa_to_condense.ds; /* Have we been cancelled? */ if (spa->spa_to_condense.cancelled) { zfs_livelist_condense_sync_cancel++; goto out; } dsl_deadlist_entry_t *first = spa->spa_to_condense.first; dsl_deadlist_entry_t *next = spa->spa_to_condense.next; dsl_deadlist_t *ll = &ds->ds_dir->dd_livelist; /* * It's possible that the livelist was changed while the zthr was * running. Therefore, we need to check for new blkptrs in the two * entries being condensed and continue to track them in the livelist. * Because of the way we handle remapped blkptrs (see dbuf_remap_impl), * it's possible that the newly added blkptrs are FREEs or ALLOCs so * we need to sort them into two different bplists. */ uint64_t first_obj = first->dle_bpobj.bpo_object; uint64_t next_obj = next->dle_bpobj.bpo_object; uint64_t cur_first_size = first->dle_bpobj.bpo_phys->bpo_num_blkptrs; uint64_t cur_next_size = next->dle_bpobj.bpo_phys->bpo_num_blkptrs; bplist_create(&new_frees); livelist_new_arg_t new_bps = { .allocs = &lca->to_keep, .frees = &new_frees, }; if (cur_first_size > lca->first_size) { VERIFY0(livelist_bpobj_iterate_from_nofree(&first->dle_bpobj, livelist_track_new_cb, &new_bps, lca->first_size)); } if (cur_next_size > lca->next_size) { VERIFY0(livelist_bpobj_iterate_from_nofree(&next->dle_bpobj, livelist_track_new_cb, &new_bps, lca->next_size)); } dsl_deadlist_clear_entry(first, ll, tx); ASSERT(bpobj_is_empty(&first->dle_bpobj)); dsl_deadlist_remove_entry(ll, next->dle_mintxg, tx); bplist_iterate(&lca->to_keep, dsl_deadlist_insert_alloc_cb, ll, tx); bplist_iterate(&new_frees, dsl_deadlist_insert_free_cb, ll, tx); bplist_destroy(&new_frees); char dsname[ZFS_MAX_DATASET_NAME_LEN]; dsl_dataset_name(ds, dsname); zfs_dbgmsg("txg %llu condensing livelist of %s (id %llu), bpobj %llu " "(%llu blkptrs) and bpobj %llu (%llu blkptrs) -> bpobj %llu " "(%llu blkptrs)", tx->tx_txg, dsname, ds->ds_object, first_obj, cur_first_size, next_obj, cur_next_size, first->dle_bpobj.bpo_object, first->dle_bpobj.bpo_phys->bpo_num_blkptrs); out: dmu_buf_rele(ds->ds_dbuf, spa); spa->spa_to_condense.ds = NULL; bplist_clear(&lca->to_keep); bplist_destroy(&lca->to_keep); kmem_free(lca, sizeof (livelist_condense_arg_t)); spa->spa_to_condense.syncing = B_FALSE; } static void spa_livelist_condense_cb(void *arg, zthr_t *t) { while (zfs_livelist_condense_zthr_pause && !(zthr_has_waiters(t) || zthr_iscancelled(t))) delay(1); spa_t *spa = arg; dsl_deadlist_entry_t *first = spa->spa_to_condense.first; dsl_deadlist_entry_t *next = spa->spa_to_condense.next; uint64_t first_size, next_size; livelist_condense_arg_t *lca = kmem_alloc(sizeof (livelist_condense_arg_t), KM_SLEEP); bplist_create(&lca->to_keep); /* * Process the livelists (matching FREEs and ALLOCs) in open context * so we have minimal work in syncing context to condense. * * We save bpobj sizes (first_size and next_size) to use later in * syncing context to determine if entries were added to these sublists * while in open context. This is possible because the clone is still * active and open for normal writes and we want to make sure the new, * unprocessed blockpointers are inserted into the livelist normally. * * Note that dsl_process_sub_livelist() both stores the size number of * blockpointers and iterates over them while the bpobj's lock held, so * the sizes returned to us are consistent which what was actually * processed. */ int err = dsl_process_sub_livelist(&first->dle_bpobj, &lca->to_keep, t, &first_size); if (err == 0) err = dsl_process_sub_livelist(&next->dle_bpobj, &lca->to_keep, t, &next_size); if (err == 0) { while (zfs_livelist_condense_sync_pause && !(zthr_has_waiters(t) || zthr_iscancelled(t))) delay(1); dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); dmu_tx_mark_netfree(tx); dmu_tx_hold_space(tx, 1); err = dmu_tx_assign(tx, TXG_NOWAIT | TXG_NOTHROTTLE); if (err == 0) { /* * Prevent the condense zthr restarting before * the synctask completes. */ spa->spa_to_condense.syncing = B_TRUE; lca->spa = spa; lca->first_size = first_size; lca->next_size = next_size; dsl_sync_task_nowait(spa_get_dsl(spa), - spa_livelist_condense_sync, lca, 0, - ZFS_SPACE_CHECK_NONE, tx); + spa_livelist_condense_sync, lca, tx); dmu_tx_commit(tx); return; } } /* * Condensing can not continue: either it was externally stopped or * we were unable to assign to a tx because the pool has run out of * space. In the second case, we'll just end up trying to condense * again in a later txg. */ ASSERT(err != 0); bplist_clear(&lca->to_keep); bplist_destroy(&lca->to_keep); kmem_free(lca, sizeof (livelist_condense_arg_t)); dmu_buf_rele(spa->spa_to_condense.ds->ds_dbuf, spa); spa->spa_to_condense.ds = NULL; if (err == EINTR) zfs_livelist_condense_zthr_cancel++; } /* ARGSUSED */ /* * Check that there is something to condense but that a condense is not * already in progress and that condensing has not been cancelled. */ static boolean_t spa_livelist_condense_cb_check(void *arg, zthr_t *z) { spa_t *spa = arg; if ((spa->spa_to_condense.ds != NULL) && (spa->spa_to_condense.syncing == B_FALSE) && (spa->spa_to_condense.cancelled == B_FALSE)) { return (B_TRUE); } return (B_FALSE); } static void spa_start_livelist_condensing_thread(spa_t *spa) { spa->spa_to_condense.ds = NULL; spa->spa_to_condense.first = NULL; spa->spa_to_condense.next = NULL; spa->spa_to_condense.syncing = B_FALSE; spa->spa_to_condense.cancelled = B_FALSE; ASSERT3P(spa->spa_livelist_condense_zthr, ==, NULL); spa->spa_livelist_condense_zthr = zthr_create("z_livelist_condense", spa_livelist_condense_cb_check, spa_livelist_condense_cb, spa); } static void spa_spawn_aux_threads(spa_t *spa) { ASSERT(spa_writeable(spa)); ASSERT(MUTEX_HELD(&spa_namespace_lock)); spa_start_indirect_condensing_thread(spa); spa_start_livelist_destroy_thread(spa); spa_start_livelist_condensing_thread(spa); ASSERT3P(spa->spa_checkpoint_discard_zthr, ==, NULL); spa->spa_checkpoint_discard_zthr = zthr_create("z_checkpoint_discard", spa_checkpoint_discard_thread_check, spa_checkpoint_discard_thread, spa); } /* * Fix up config after a partly-completed split. This is done with the * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off * pool have that entry in their config, but only the splitting one contains * a list of all the guids of the vdevs that are being split off. * * This function determines what to do with that list: either rejoin * all the disks to the pool, or complete the splitting process. To attempt * the rejoin, each disk that is offlined is marked online again, and * we do a reopen() call. If the vdev label for every disk that was * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL) * then we call vdev_split() on each disk, and complete the split. * * Otherwise we leave the config alone, with all the vdevs in place in * the original pool. */ static void spa_try_repair(spa_t *spa, nvlist_t *config) { uint_t extracted; uint64_t *glist; uint_t i, gcount; nvlist_t *nvl; vdev_t **vd; boolean_t attempt_reopen; if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0) return; /* check that the config is complete */ if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, &glist, &gcount) != 0) return; vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP); /* attempt to online all the vdevs & validate */ attempt_reopen = B_TRUE; for (i = 0; i < gcount; i++) { if (glist[i] == 0) /* vdev is hole */ continue; vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE); if (vd[i] == NULL) { /* * Don't bother attempting to reopen the disks; * just do the split. */ attempt_reopen = B_FALSE; } else { /* attempt to re-online it */ vd[i]->vdev_offline = B_FALSE; } } if (attempt_reopen) { vdev_reopen(spa->spa_root_vdev); /* check each device to see what state it's in */ for (extracted = 0, i = 0; i < gcount; i++) { if (vd[i] != NULL && vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL) break; ++extracted; } } /* * If every disk has been moved to the new pool, or if we never * even attempted to look at them, then we split them off for * good. */ if (!attempt_reopen || gcount == extracted) { for (i = 0; i < gcount; i++) if (vd[i] != NULL) vdev_split(vd[i]); vdev_reopen(spa->spa_root_vdev); } kmem_free(vd, gcount * sizeof (vdev_t *)); } static int spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type) { char *ereport = FM_EREPORT_ZFS_POOL; int error; spa->spa_load_state = state; (void) spa_import_progress_set_state(spa_guid(spa), spa_load_state(spa)); gethrestime(&spa->spa_loaded_ts); error = spa_load_impl(spa, type, &ereport); /* * Don't count references from objsets that are already closed * and are making their way through the eviction process. */ spa_evicting_os_wait(spa); spa->spa_minref = zfs_refcount_count(&spa->spa_refcount); if (error) { if (error != EEXIST) { spa->spa_loaded_ts.tv_sec = 0; spa->spa_loaded_ts.tv_nsec = 0; } if (error != EBADF) { (void) zfs_ereport_post(ereport, spa, NULL, NULL, NULL, 0, 0); } } spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; spa->spa_ena = 0; (void) spa_import_progress_set_state(spa_guid(spa), spa_load_state(spa)); return (error); } #ifdef ZFS_DEBUG /* * Count the number of per-vdev ZAPs associated with all of the vdevs in the * vdev tree rooted in the given vd, and ensure that each ZAP is present in the * spa's per-vdev ZAP list. */ static uint64_t vdev_count_verify_zaps(vdev_t *vd) { spa_t *spa = vd->vdev_spa; uint64_t total = 0; if (vd->vdev_top_zap != 0) { total++; ASSERT0(zap_lookup_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, vd->vdev_top_zap)); } if (vd->vdev_leaf_zap != 0) { total++; ASSERT0(zap_lookup_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, vd->vdev_leaf_zap)); } for (uint64_t i = 0; i < vd->vdev_children; i++) { total += vdev_count_verify_zaps(vd->vdev_child[i]); } return (total); } #endif /* * Determine whether the activity check is required. */ static boolean_t spa_activity_check_required(spa_t *spa, uberblock_t *ub, nvlist_t *label, nvlist_t *config) { uint64_t state = 0; uint64_t hostid = 0; uint64_t tryconfig_txg = 0; uint64_t tryconfig_timestamp = 0; uint16_t tryconfig_mmp_seq = 0; nvlist_t *nvinfo; if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) { nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO); (void) nvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG, &tryconfig_txg); (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TIMESTAMP, &tryconfig_timestamp); (void) nvlist_lookup_uint16(nvinfo, ZPOOL_CONFIG_MMP_SEQ, &tryconfig_mmp_seq); } (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &state); /* * Disable the MMP activity check - This is used by zdb which * is intended to be used on potentially active pools. */ if (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP) return (B_FALSE); /* * Skip the activity check when the MMP feature is disabled. */ if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay == 0) return (B_FALSE); /* * If the tryconfig_ values are nonzero, they are the results of an * earlier tryimport. If they all match the uberblock we just found, * then the pool has not changed and we return false so we do not test * a second time. */ if (tryconfig_txg && tryconfig_txg == ub->ub_txg && tryconfig_timestamp && tryconfig_timestamp == ub->ub_timestamp && tryconfig_mmp_seq && tryconfig_mmp_seq == (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)) return (B_FALSE); /* * Allow the activity check to be skipped when importing the pool * on the same host which last imported it. Since the hostid from * configuration may be stale use the one read from the label. */ if (nvlist_exists(label, ZPOOL_CONFIG_HOSTID)) hostid = fnvlist_lookup_uint64(label, ZPOOL_CONFIG_HOSTID); if (hostid == spa_get_hostid(spa)) return (B_FALSE); /* * Skip the activity test when the pool was cleanly exported. */ if (state != POOL_STATE_ACTIVE) return (B_FALSE); return (B_TRUE); } /* * Nanoseconds the activity check must watch for changes on-disk. */ static uint64_t spa_activity_check_duration(spa_t *spa, uberblock_t *ub) { uint64_t import_intervals = MAX(zfs_multihost_import_intervals, 1); uint64_t multihost_interval = MSEC2NSEC( MMP_INTERVAL_OK(zfs_multihost_interval)); uint64_t import_delay = MAX(NANOSEC, import_intervals * multihost_interval); /* * Local tunables determine a minimum duration except for the case * where we know when the remote host will suspend the pool if MMP * writes do not land. * * See Big Theory comment at the top of mmp.c for the reasoning behind * these cases and times. */ ASSERT(MMP_IMPORT_SAFETY_FACTOR >= 100); if (MMP_INTERVAL_VALID(ub) && MMP_FAIL_INT_VALID(ub) && MMP_FAIL_INT(ub) > 0) { /* MMP on remote host will suspend pool after failed writes */ import_delay = MMP_FAIL_INT(ub) * MSEC2NSEC(MMP_INTERVAL(ub)) * MMP_IMPORT_SAFETY_FACTOR / 100; zfs_dbgmsg("fail_intvals>0 import_delay=%llu ub_mmp " "mmp_fails=%llu ub_mmp mmp_interval=%llu " "import_intervals=%u", import_delay, MMP_FAIL_INT(ub), MMP_INTERVAL(ub), import_intervals); } else if (MMP_INTERVAL_VALID(ub) && MMP_FAIL_INT_VALID(ub) && MMP_FAIL_INT(ub) == 0) { /* MMP on remote host will never suspend pool */ import_delay = MAX(import_delay, (MSEC2NSEC(MMP_INTERVAL(ub)) + ub->ub_mmp_delay) * import_intervals); zfs_dbgmsg("fail_intvals=0 import_delay=%llu ub_mmp " "mmp_interval=%llu ub_mmp_delay=%llu " "import_intervals=%u", import_delay, MMP_INTERVAL(ub), ub->ub_mmp_delay, import_intervals); } else if (MMP_VALID(ub)) { /* * zfs-0.7 compatibility case */ import_delay = MAX(import_delay, (multihost_interval + ub->ub_mmp_delay) * import_intervals); zfs_dbgmsg("import_delay=%llu ub_mmp_delay=%llu " "import_intervals=%u leaves=%u", import_delay, ub->ub_mmp_delay, import_intervals, vdev_count_leaves(spa)); } else { /* Using local tunings is the only reasonable option */ zfs_dbgmsg("pool last imported on non-MMP aware " "host using import_delay=%llu multihost_interval=%llu " "import_intervals=%u", import_delay, multihost_interval, import_intervals); } return (import_delay); } /* * Perform the import activity check. If the user canceled the import or * we detected activity then fail. */ static int spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config) { uint64_t txg = ub->ub_txg; uint64_t timestamp = ub->ub_timestamp; uint64_t mmp_config = ub->ub_mmp_config; uint16_t mmp_seq = MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0; uint64_t import_delay; hrtime_t import_expire; nvlist_t *mmp_label = NULL; vdev_t *rvd = spa->spa_root_vdev; kcondvar_t cv; kmutex_t mtx; int error = 0; cv_init(&cv, NULL, CV_DEFAULT, NULL); mutex_init(&mtx, NULL, MUTEX_DEFAULT, NULL); mutex_enter(&mtx); /* * If ZPOOL_CONFIG_MMP_TXG is present an activity check was performed * during the earlier tryimport. If the txg recorded there is 0 then * the pool is known to be active on another host. * * Otherwise, the pool might be in use on another host. Check for * changes in the uberblocks on disk if necessary. */ if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) { nvlist_t *nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO); if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_TXG) && fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG) == 0) { vdev_uberblock_load(rvd, ub, &mmp_label); error = SET_ERROR(EREMOTEIO); goto out; } } import_delay = spa_activity_check_duration(spa, ub); /* Add a small random factor in case of simultaneous imports (0-25%) */ import_delay += import_delay * spa_get_random(250) / 1000; import_expire = gethrtime() + import_delay; while (gethrtime() < import_expire) { (void) spa_import_progress_set_mmp_check(spa_guid(spa), NSEC2SEC(import_expire - gethrtime())); vdev_uberblock_load(rvd, ub, &mmp_label); if (txg != ub->ub_txg || timestamp != ub->ub_timestamp || mmp_seq != (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)) { zfs_dbgmsg("multihost activity detected " "txg %llu ub_txg %llu " "timestamp %llu ub_timestamp %llu " "mmp_config %#llx ub_mmp_config %#llx", txg, ub->ub_txg, timestamp, ub->ub_timestamp, mmp_config, ub->ub_mmp_config); error = SET_ERROR(EREMOTEIO); break; } if (mmp_label) { nvlist_free(mmp_label); mmp_label = NULL; } error = cv_timedwait_sig(&cv, &mtx, ddi_get_lbolt() + hz); if (error != -1) { error = SET_ERROR(EINTR); break; } error = 0; } out: mutex_exit(&mtx); mutex_destroy(&mtx); cv_destroy(&cv); /* * If the pool is determined to be active store the status in the * spa->spa_load_info nvlist. If the remote hostname or hostid are * available from configuration read from disk store them as well. * This allows 'zpool import' to generate a more useful message. * * ZPOOL_CONFIG_MMP_STATE - observed pool status (mandatory) * ZPOOL_CONFIG_MMP_HOSTNAME - hostname from the active pool * ZPOOL_CONFIG_MMP_HOSTID - hostid from the active pool */ if (error == EREMOTEIO) { char *hostname = ""; uint64_t hostid = 0; if (mmp_label) { if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTNAME)) { hostname = fnvlist_lookup_string(mmp_label, ZPOOL_CONFIG_HOSTNAME); fnvlist_add_string(spa->spa_load_info, ZPOOL_CONFIG_MMP_HOSTNAME, hostname); } if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTID)) { hostid = fnvlist_lookup_uint64(mmp_label, ZPOOL_CONFIG_HOSTID); fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_MMP_HOSTID, hostid); } } fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_MMP_STATE, MMP_STATE_ACTIVE); fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_MMP_TXG, 0); error = spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO); } if (mmp_label) nvlist_free(mmp_label); return (error); } static int spa_verify_host(spa_t *spa, nvlist_t *mos_config) { uint64_t hostid; char *hostname; uint64_t myhostid = 0; if (!spa_is_root(spa) && nvlist_lookup_uint64(mos_config, ZPOOL_CONFIG_HOSTID, &hostid) == 0) { hostname = fnvlist_lookup_string(mos_config, ZPOOL_CONFIG_HOSTNAME); myhostid = zone_get_hostid(NULL); if (hostid != 0 && myhostid != 0 && hostid != myhostid) { cmn_err(CE_WARN, "pool '%s' could not be " "loaded as it was last accessed by " "another system (host: %s hostid: 0x%llx). " "See: https://openzfs.github.io/openzfs-docs/msg/" "ZFS-8000-EY", spa_name(spa), hostname, (u_longlong_t)hostid); spa_load_failed(spa, "hostid verification failed: pool " "last accessed by host: %s (hostid: 0x%llx)", hostname, (u_longlong_t)hostid); return (SET_ERROR(EBADF)); } } return (0); } static int spa_ld_parse_config(spa_t *spa, spa_import_type_t type) { int error = 0; nvlist_t *nvtree, *nvl, *config = spa->spa_config; int parse; vdev_t *rvd; uint64_t pool_guid; char *comment; /* * Versioning wasn't explicitly added to the label until later, so if * it's not present treat it as the initial version. */ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &spa->spa_ubsync.ub_version) != 0) spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) { spa_load_failed(spa, "invalid config provided: '%s' missing", ZPOOL_CONFIG_POOL_GUID); return (SET_ERROR(EINVAL)); } /* * If we are doing an import, ensure that the pool is not already * imported by checking if its pool guid already exists in the * spa namespace. * * The only case that we allow an already imported pool to be * imported again, is when the pool is checkpointed and we want to * look at its checkpointed state from userland tools like zdb. */ #ifdef _KERNEL if ((spa->spa_load_state == SPA_LOAD_IMPORT || spa->spa_load_state == SPA_LOAD_TRYIMPORT) && spa_guid_exists(pool_guid, 0)) { #else if ((spa->spa_load_state == SPA_LOAD_IMPORT || spa->spa_load_state == SPA_LOAD_TRYIMPORT) && spa_guid_exists(pool_guid, 0) && !spa_importing_readonly_checkpoint(spa)) { #endif spa_load_failed(spa, "a pool with guid %llu is already open", (u_longlong_t)pool_guid); return (SET_ERROR(EEXIST)); } spa->spa_config_guid = pool_guid; nvlist_free(spa->spa_load_info); spa->spa_load_info = fnvlist_alloc(); ASSERT(spa->spa_comment == NULL); if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) spa->spa_comment = spa_strdup(comment); (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &spa->spa_config_txg); if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) == 0) spa->spa_config_splitting = fnvlist_dup(nvl); if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtree)) { spa_load_failed(spa, "invalid config provided: '%s' missing", ZPOOL_CONFIG_VDEV_TREE); return (SET_ERROR(EINVAL)); } /* * Create "The Godfather" zio to hold all async IOs */ spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), KM_SLEEP); for (int i = 0; i < max_ncpus; i++) { spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); } /* * Parse the configuration into a vdev tree. We explicitly set the * value that will be returned by spa_version() since parsing the * configuration requires knowing the version number. */ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); parse = (type == SPA_IMPORT_EXISTING ? VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT); error = spa_config_parse(spa, &rvd, nvtree, NULL, 0, parse); spa_config_exit(spa, SCL_ALL, FTAG); if (error != 0) { spa_load_failed(spa, "unable to parse config [error=%d]", error); return (error); } ASSERT(spa->spa_root_vdev == rvd); ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT); ASSERT3U(spa->spa_max_ashift, <=, SPA_MAXBLOCKSHIFT); if (type != SPA_IMPORT_ASSEMBLE) { ASSERT(spa_guid(spa) == pool_guid); } return (0); } /* * Recursively open all vdevs in the vdev tree. This function is called twice: * first with the untrusted config, then with the trusted config. */ static int spa_ld_open_vdevs(spa_t *spa) { int error = 0; /* * spa_missing_tvds_allowed defines how many top-level vdevs can be * missing/unopenable for the root vdev to be still considered openable. */ if (spa->spa_trust_config) { spa->spa_missing_tvds_allowed = zfs_max_missing_tvds; } else if (spa->spa_config_source == SPA_CONFIG_SRC_CACHEFILE) { spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_cachefile; } else if (spa->spa_config_source == SPA_CONFIG_SRC_SCAN) { spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_scan; } else { spa->spa_missing_tvds_allowed = 0; } spa->spa_missing_tvds_allowed = MAX(zfs_max_missing_tvds, spa->spa_missing_tvds_allowed); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); error = vdev_open(spa->spa_root_vdev); spa_config_exit(spa, SCL_ALL, FTAG); if (spa->spa_missing_tvds != 0) { spa_load_note(spa, "vdev tree has %lld missing top-level " "vdevs.", (u_longlong_t)spa->spa_missing_tvds); if (spa->spa_trust_config && (spa->spa_mode & SPA_MODE_WRITE)) { /* * Although theoretically we could allow users to open * incomplete pools in RW mode, we'd need to add a lot * of extra logic (e.g. adjust pool space to account * for missing vdevs). * This limitation also prevents users from accidentally * opening the pool in RW mode during data recovery and * damaging it further. */ spa_load_note(spa, "pools with missing top-level " "vdevs can only be opened in read-only mode."); error = SET_ERROR(ENXIO); } else { spa_load_note(spa, "current settings allow for maximum " "%lld missing top-level vdevs at this stage.", (u_longlong_t)spa->spa_missing_tvds_allowed); } } if (error != 0) { spa_load_failed(spa, "unable to open vdev tree [error=%d]", error); } if (spa->spa_missing_tvds != 0 || error != 0) vdev_dbgmsg_print_tree(spa->spa_root_vdev, 2); return (error); } /* * We need to validate the vdev labels against the configuration that * we have in hand. This function is called twice: first with an untrusted * config, then with a trusted config. The validation is more strict when the * config is trusted. */ static int spa_ld_validate_vdevs(spa_t *spa) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); error = vdev_validate(rvd); spa_config_exit(spa, SCL_ALL, FTAG); if (error != 0) { spa_load_failed(spa, "vdev_validate failed [error=%d]", error); return (error); } if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { spa_load_failed(spa, "cannot open vdev tree after invalidating " "some vdevs"); vdev_dbgmsg_print_tree(rvd, 2); return (SET_ERROR(ENXIO)); } return (0); } static void spa_ld_select_uberblock_done(spa_t *spa, uberblock_t *ub) { spa->spa_state = POOL_STATE_ACTIVE; spa->spa_ubsync = spa->spa_uberblock; spa->spa_verify_min_txg = spa->spa_extreme_rewind ? TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1; spa->spa_first_txg = spa->spa_last_ubsync_txg ? spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; spa->spa_claim_max_txg = spa->spa_first_txg; spa->spa_prev_software_version = ub->ub_software_version; } static int spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type) { vdev_t *rvd = spa->spa_root_vdev; nvlist_t *label; uberblock_t *ub = &spa->spa_uberblock; boolean_t activity_check = B_FALSE; /* * If we are opening the checkpointed state of the pool by * rewinding to it, at this point we will have written the * checkpointed uberblock to the vdev labels, so searching * the labels will find the right uberblock. However, if * we are opening the checkpointed state read-only, we have * not modified the labels. Therefore, we must ignore the * labels and continue using the spa_uberblock that was set * by spa_ld_checkpoint_rewind. * * Note that it would be fine to ignore the labels when * rewinding (opening writeable) as well. However, if we * crash just after writing the labels, we will end up * searching the labels. Doing so in the common case means * that this code path gets exercised normally, rather than * just in the edge case. */ if (ub->ub_checkpoint_txg != 0 && spa_importing_readonly_checkpoint(spa)) { spa_ld_select_uberblock_done(spa, ub); return (0); } /* * Find the best uberblock. */ vdev_uberblock_load(rvd, ub, &label); /* * If we weren't able to find a single valid uberblock, return failure. */ if (ub->ub_txg == 0) { nvlist_free(label); spa_load_failed(spa, "no valid uberblock found"); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); } if (spa->spa_load_max_txg != UINT64_MAX) { (void) spa_import_progress_set_max_txg(spa_guid(spa), (u_longlong_t)spa->spa_load_max_txg); } spa_load_note(spa, "using uberblock with txg=%llu", (u_longlong_t)ub->ub_txg); /* * For pools which have the multihost property on determine if the * pool is truly inactive and can be safely imported. Prevent * hosts which don't have a hostid set from importing the pool. */ activity_check = spa_activity_check_required(spa, ub, label, spa->spa_config); if (activity_check) { if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay && spa_get_hostid(spa) == 0) { nvlist_free(label); fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID); return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO)); } int error = spa_activity_check(spa, ub, spa->spa_config); if (error) { nvlist_free(label); return (error); } fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_MMP_STATE, MMP_STATE_INACTIVE); fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_MMP_TXG, ub->ub_txg); fnvlist_add_uint16(spa->spa_load_info, ZPOOL_CONFIG_MMP_SEQ, (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)); } /* * If the pool has an unsupported version we can't open it. */ if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) { nvlist_free(label); spa_load_failed(spa, "version %llu is not supported", (u_longlong_t)ub->ub_version); return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP)); } if (ub->ub_version >= SPA_VERSION_FEATURES) { nvlist_t *features; /* * If we weren't able to find what's necessary for reading the * MOS in the label, return failure. */ if (label == NULL) { spa_load_failed(spa, "label config unavailable"); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); } if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_FEATURES_FOR_READ, &features) != 0) { nvlist_free(label); spa_load_failed(spa, "invalid label: '%s' missing", ZPOOL_CONFIG_FEATURES_FOR_READ); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); } /* * Update our in-core representation with the definitive values * from the label. */ nvlist_free(spa->spa_label_features); VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0); } nvlist_free(label); /* * Look through entries in the label nvlist's features_for_read. If * there is a feature listed there which we don't understand then we * cannot open a pool. */ if (ub->ub_version >= SPA_VERSION_FEATURES) { nvlist_t *unsup_feat; VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) == 0); for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features, NULL); nvp != NULL; nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) { if (!zfeature_is_supported(nvpair_name(nvp))) { VERIFY(nvlist_add_string(unsup_feat, nvpair_name(nvp), "") == 0); } } if (!nvlist_empty(unsup_feat)) { VERIFY(nvlist_add_nvlist(spa->spa_load_info, ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0); nvlist_free(unsup_feat); spa_load_failed(spa, "some features are unsupported"); return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP)); } nvlist_free(unsup_feat); } if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) { spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_try_repair(spa, spa->spa_config); spa_config_exit(spa, SCL_ALL, FTAG); nvlist_free(spa->spa_config_splitting); spa->spa_config_splitting = NULL; } /* * Initialize internal SPA structures. */ spa_ld_select_uberblock_done(spa, ub); return (0); } static int spa_ld_open_rootbp(spa_t *spa) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool); if (error != 0) { spa_load_failed(spa, "unable to open rootbp in dsl_pool_init " "[error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; return (0); } static int spa_ld_trusted_config(spa_t *spa, spa_import_type_t type, boolean_t reloading) { vdev_t *mrvd, *rvd = spa->spa_root_vdev; nvlist_t *nv, *mos_config, *policy; int error = 0, copy_error; uint64_t healthy_tvds, healthy_tvds_mos; uint64_t mos_config_txg; if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object, B_TRUE) != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); /* * If we're assembling a pool from a split, the config provided is * already trusted so there is nothing to do. */ if (type == SPA_IMPORT_ASSEMBLE) return (0); healthy_tvds = spa_healthy_core_tvds(spa); if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) { spa_load_failed(spa, "unable to retrieve MOS config"); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } /* * If we are doing an open, pool owner wasn't verified yet, thus do * the verification here. */ if (spa->spa_load_state == SPA_LOAD_OPEN) { error = spa_verify_host(spa, mos_config); if (error != 0) { nvlist_free(mos_config); return (error); } } nv = fnvlist_lookup_nvlist(mos_config, ZPOOL_CONFIG_VDEV_TREE); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); /* * Build a new vdev tree from the trusted config */ VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0); /* * Vdev paths in the MOS may be obsolete. If the untrusted config was * obtained by scanning /dev/dsk, then it will have the right vdev * paths. We update the trusted MOS config with this information. * We first try to copy the paths with vdev_copy_path_strict, which * succeeds only when both configs have exactly the same vdev tree. * If that fails, we fall back to a more flexible method that has a * best effort policy. */ copy_error = vdev_copy_path_strict(rvd, mrvd); if (copy_error != 0 || spa_load_print_vdev_tree) { spa_load_note(spa, "provided vdev tree:"); vdev_dbgmsg_print_tree(rvd, 2); spa_load_note(spa, "MOS vdev tree:"); vdev_dbgmsg_print_tree(mrvd, 2); } if (copy_error != 0) { spa_load_note(spa, "vdev_copy_path_strict failed, falling " "back to vdev_copy_path_relaxed"); vdev_copy_path_relaxed(rvd, mrvd); } vdev_close(rvd); vdev_free(rvd); spa->spa_root_vdev = mrvd; rvd = mrvd; spa_config_exit(spa, SCL_ALL, FTAG); /* * We will use spa_config if we decide to reload the spa or if spa_load * fails and we rewind. We must thus regenerate the config using the * MOS information with the updated paths. ZPOOL_LOAD_POLICY is used to * pass settings on how to load the pool and is not stored in the MOS. * We copy it over to our new, trusted config. */ mos_config_txg = fnvlist_lookup_uint64(mos_config, ZPOOL_CONFIG_POOL_TXG); nvlist_free(mos_config); mos_config = spa_config_generate(spa, NULL, mos_config_txg, B_FALSE); if (nvlist_lookup_nvlist(spa->spa_config, ZPOOL_LOAD_POLICY, &policy) == 0) fnvlist_add_nvlist(mos_config, ZPOOL_LOAD_POLICY, policy); spa_config_set(spa, mos_config); spa->spa_config_source = SPA_CONFIG_SRC_MOS; /* * Now that we got the config from the MOS, we should be more strict * in checking blkptrs and can make assumptions about the consistency * of the vdev tree. spa_trust_config must be set to true before opening * vdevs in order for them to be writeable. */ spa->spa_trust_config = B_TRUE; /* * Open and validate the new vdev tree */ error = spa_ld_open_vdevs(spa); if (error != 0) return (error); error = spa_ld_validate_vdevs(spa); if (error != 0) return (error); if (copy_error != 0 || spa_load_print_vdev_tree) { spa_load_note(spa, "final vdev tree:"); vdev_dbgmsg_print_tree(rvd, 2); } if (spa->spa_load_state != SPA_LOAD_TRYIMPORT && !spa->spa_extreme_rewind && zfs_max_missing_tvds == 0) { /* * Sanity check to make sure that we are indeed loading the * latest uberblock. If we missed SPA_SYNC_MIN_VDEVS tvds * in the config provided and they happened to be the only ones * to have the latest uberblock, we could involuntarily perform * an extreme rewind. */ healthy_tvds_mos = spa_healthy_core_tvds(spa); if (healthy_tvds_mos - healthy_tvds >= SPA_SYNC_MIN_VDEVS) { spa_load_note(spa, "config provided misses too many " "top-level vdevs compared to MOS (%lld vs %lld). ", (u_longlong_t)healthy_tvds, (u_longlong_t)healthy_tvds_mos); spa_load_note(spa, "vdev tree:"); vdev_dbgmsg_print_tree(rvd, 2); if (reloading) { spa_load_failed(spa, "config was already " "provided from MOS. Aborting."); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } spa_load_note(spa, "spa must be reloaded using MOS " "config"); return (SET_ERROR(EAGAIN)); } } error = spa_check_for_missing_logs(spa); if (error != 0) return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); if (rvd->vdev_guid_sum != spa->spa_uberblock.ub_guid_sum) { spa_load_failed(spa, "uberblock guid sum doesn't match MOS " "guid sum (%llu != %llu)", (u_longlong_t)spa->spa_uberblock.ub_guid_sum, (u_longlong_t)rvd->vdev_guid_sum); return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); } return (0); } static int spa_ld_open_indirect_vdev_metadata(spa_t *spa) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; /* * Everything that we read before spa_remove_init() must be stored * on concreted vdevs. Therefore we do this as early as possible. */ error = spa_remove_init(spa); if (error != 0) { spa_load_failed(spa, "spa_remove_init failed [error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } /* * Retrieve information needed to condense indirect vdev mappings. */ error = spa_condense_init(spa); if (error != 0) { spa_load_failed(spa, "spa_condense_init failed [error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); } return (0); } static int spa_ld_check_features(spa_t *spa, boolean_t *missing_feat_writep) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; if (spa_version(spa) >= SPA_VERSION_FEATURES) { boolean_t missing_feat_read = B_FALSE; nvlist_t *unsup_feat, *enabled_feat; if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ, &spa->spa_feat_for_read_obj, B_TRUE) != 0) { return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE, &spa->spa_feat_for_write_obj, B_TRUE) != 0) { return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS, &spa->spa_feat_desc_obj, B_TRUE) != 0) { return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } enabled_feat = fnvlist_alloc(); unsup_feat = fnvlist_alloc(); if (!spa_features_check(spa, B_FALSE, unsup_feat, enabled_feat)) missing_feat_read = B_TRUE; if (spa_writeable(spa) || spa->spa_load_state == SPA_LOAD_TRYIMPORT) { if (!spa_features_check(spa, B_TRUE, unsup_feat, enabled_feat)) { *missing_feat_writep = B_TRUE; } } fnvlist_add_nvlist(spa->spa_load_info, ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat); if (!nvlist_empty(unsup_feat)) { fnvlist_add_nvlist(spa->spa_load_info, ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat); } fnvlist_free(enabled_feat); fnvlist_free(unsup_feat); if (!missing_feat_read) { fnvlist_add_boolean(spa->spa_load_info, ZPOOL_CONFIG_CAN_RDONLY); } /* * If the state is SPA_LOAD_TRYIMPORT, our objective is * twofold: to determine whether the pool is available for * import in read-write mode and (if it is not) whether the * pool is available for import in read-only mode. If the pool * is available for import in read-write mode, it is displayed * as available in userland; if it is not available for import * in read-only mode, it is displayed as unavailable in * userland. If the pool is available for import in read-only * mode but not read-write mode, it is displayed as unavailable * in userland with a special note that the pool is actually * available for open in read-only mode. * * As a result, if the state is SPA_LOAD_TRYIMPORT and we are * missing a feature for write, we must first determine whether * the pool can be opened read-only before returning to * userland in order to know whether to display the * abovementioned note. */ if (missing_feat_read || (*missing_feat_writep && spa_writeable(spa))) { spa_load_failed(spa, "pool uses unsupported features"); return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP)); } /* * Load refcounts for ZFS features from disk into an in-memory * cache during SPA initialization. */ for (spa_feature_t i = 0; i < SPA_FEATURES; i++) { uint64_t refcount; error = feature_get_refcount_from_disk(spa, &spa_feature_table[i], &refcount); if (error == 0) { spa->spa_feat_refcount_cache[i] = refcount; } else if (error == ENOTSUP) { spa->spa_feat_refcount_cache[i] = SPA_FEATURE_DISABLED; } else { spa_load_failed(spa, "error getting refcount " "for feature %s [error=%d]", spa_feature_table[i].fi_guid, error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } } } if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) { if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG, &spa->spa_feat_enabled_txg_obj, B_TRUE) != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } /* * Encryption was added before bookmark_v2, even though bookmark_v2 * is now a dependency. If this pool has encryption enabled without * bookmark_v2, trigger an errata message. */ if (spa_feature_is_enabled(spa, SPA_FEATURE_ENCRYPTION) && !spa_feature_is_enabled(spa, SPA_FEATURE_BOOKMARK_V2)) { spa->spa_errata = ZPOOL_ERRATA_ZOL_8308_ENCRYPTION; } return (0); } static int spa_ld_load_special_directories(spa_t *spa) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; spa->spa_is_initializing = B_TRUE; error = dsl_pool_open(spa->spa_dsl_pool); spa->spa_is_initializing = B_FALSE; if (error != 0) { spa_load_failed(spa, "dsl_pool_open failed [error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } return (0); } static int spa_ld_get_props(spa_t *spa) { int error = 0; uint64_t obj; vdev_t *rvd = spa->spa_root_vdev; /* Grab the checksum salt from the MOS. */ error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT, 1, sizeof (spa->spa_cksum_salt.zcs_bytes), spa->spa_cksum_salt.zcs_bytes); if (error == ENOENT) { /* Generate a new salt for subsequent use */ (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, sizeof (spa->spa_cksum_salt.zcs_bytes)); } else if (error != 0) { spa_load_failed(spa, "unable to retrieve checksum salt from " "MOS [error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj, B_TRUE) != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj); if (error != 0) { spa_load_failed(spa, "error opening deferred-frees bpobj " "[error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } /* * Load the bit that tells us to use the new accounting function * (raid-z deflation). If we have an older pool, this will not * be present. */ error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate, B_FALSE); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION, &spa->spa_creation_version, B_FALSE); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); /* * Load the persistent error log. If we have an older pool, this will * not be present. */ error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last, B_FALSE); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB, &spa->spa_errlog_scrub, B_FALSE); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); /* * Load the livelist deletion field. If a livelist is queued for * deletion, indicate that in the spa */ error = spa_dir_prop(spa, DMU_POOL_DELETED_CLONES, &spa->spa_livelists_to_delete, B_FALSE); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); /* * Load the history object. If we have an older pool, this * will not be present. */ error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history, B_FALSE); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); /* * Load the per-vdev ZAP map. If we have an older pool, this will not * be present; in this case, defer its creation to a later time to * avoid dirtying the MOS this early / out of sync context. See * spa_sync_config_object. */ /* The sentinel is only available in the MOS config. */ nvlist_t *mos_config; if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) { spa_load_failed(spa, "unable to retrieve MOS config"); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } error = spa_dir_prop(spa, DMU_POOL_VDEV_ZAP_MAP, &spa->spa_all_vdev_zaps, B_FALSE); if (error == ENOENT) { VERIFY(!nvlist_exists(mos_config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)); spa->spa_avz_action = AVZ_ACTION_INITIALIZE; ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev)); } else if (error != 0) { return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } else if (!nvlist_exists(mos_config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)) { /* * An older version of ZFS overwrote the sentinel value, so * we have orphaned per-vdev ZAPs in the MOS. Defer their * destruction to later; see spa_sync_config_object. */ spa->spa_avz_action = AVZ_ACTION_DESTROY; /* * We're assuming that no vdevs have had their ZAPs created * before this. Better be sure of it. */ ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev)); } nvlist_free(mos_config); spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object, B_FALSE); if (error && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); if (error == 0) { uint64_t autoreplace; spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs); spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace); spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation); spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode); spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand); spa_prop_find(spa, ZPOOL_PROP_MULTIHOST, &spa->spa_multihost); spa_prop_find(spa, ZPOOL_PROP_AUTOTRIM, &spa->spa_autotrim); spa->spa_autoreplace = (autoreplace != 0); } /* * If we are importing a pool with missing top-level vdevs, * we enforce that the pool doesn't panic or get suspended on * error since the likelihood of missing data is extremely high. */ if (spa->spa_missing_tvds > 0 && spa->spa_failmode != ZIO_FAILURE_MODE_CONTINUE && spa->spa_load_state != SPA_LOAD_TRYIMPORT) { spa_load_note(spa, "forcing failmode to 'continue' " "as some top level vdevs are missing"); spa->spa_failmode = ZIO_FAILURE_MODE_CONTINUE; } return (0); } static int spa_ld_open_aux_vdevs(spa_t *spa, spa_import_type_t type) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; /* * If we're assembling the pool from the split-off vdevs of * an existing pool, we don't want to attach the spares & cache * devices. */ /* * Load any hot spares for this pool. */ error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object, B_FALSE); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); if (load_nvlist(spa, spa->spa_spares.sav_object, &spa->spa_spares.sav_config) != 0) { spa_load_failed(spa, "error loading spares nvlist"); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_spares(spa); spa_config_exit(spa, SCL_ALL, FTAG); } else if (error == 0) { spa->spa_spares.sav_sync = B_TRUE; } /* * Load any level 2 ARC devices for this pool. */ error = spa_dir_prop(spa, DMU_POOL_L2CACHE, &spa->spa_l2cache.sav_object, B_FALSE); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); if (load_nvlist(spa, spa->spa_l2cache.sav_object, &spa->spa_l2cache.sav_config) != 0) { spa_load_failed(spa, "error loading l2cache nvlist"); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_l2cache(spa); spa_config_exit(spa, SCL_ALL, FTAG); } else if (error == 0) { spa->spa_l2cache.sav_sync = B_TRUE; } return (0); } static int spa_ld_load_vdev_metadata(spa_t *spa) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; /* * If the 'multihost' property is set, then never allow a pool to * be imported when the system hostid is zero. The exception to * this rule is zdb which is always allowed to access pools. */ if (spa_multihost(spa) && spa_get_hostid(spa) == 0 && (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP) == 0) { fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID); return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO)); } /* * If the 'autoreplace' property is set, then post a resource notifying * the ZFS DE that it should not issue any faults for unopenable * devices. We also iterate over the vdevs, and post a sysevent for any * unopenable vdevs so that the normal autoreplace handler can take * over. */ if (spa->spa_autoreplace && spa->spa_load_state != SPA_LOAD_TRYIMPORT) { spa_check_removed(spa->spa_root_vdev); /* * For the import case, this is done in spa_import(), because * at this point we're using the spare definitions from * the MOS config, not necessarily from the userland config. */ if (spa->spa_load_state != SPA_LOAD_IMPORT) { spa_aux_check_removed(&spa->spa_spares); spa_aux_check_removed(&spa->spa_l2cache); } } /* * Load the vdev metadata such as metaslabs, DTLs, spacemap object, etc. */ error = vdev_load(rvd); if (error != 0) { spa_load_failed(spa, "vdev_load failed [error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); } error = spa_ld_log_spacemaps(spa); if (error != 0) { spa_load_failed(spa, "spa_ld_log_sm_data failed [error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); } /* * Propagate the leaf DTLs we just loaded all the way up the vdev tree. */ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); vdev_dtl_reassess(rvd, 0, 0, B_FALSE, B_FALSE); spa_config_exit(spa, SCL_ALL, FTAG); return (0); } static int spa_ld_load_dedup_tables(spa_t *spa) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; error = ddt_load(spa); if (error != 0) { spa_load_failed(spa, "ddt_load failed [error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } return (0); } static int spa_ld_verify_logs(spa_t *spa, spa_import_type_t type, char **ereport) { vdev_t *rvd = spa->spa_root_vdev; if (type != SPA_IMPORT_ASSEMBLE && spa_writeable(spa)) { boolean_t missing = spa_check_logs(spa); if (missing) { if (spa->spa_missing_tvds != 0) { spa_load_note(spa, "spa_check_logs failed " "so dropping the logs"); } else { *ereport = FM_EREPORT_ZFS_LOG_REPLAY; spa_load_failed(spa, "spa_check_logs failed"); return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO)); } } } return (0); } static int spa_ld_verify_pool_data(spa_t *spa) { int error = 0; vdev_t *rvd = spa->spa_root_vdev; /* * We've successfully opened the pool, verify that we're ready * to start pushing transactions. */ if (spa->spa_load_state != SPA_LOAD_TRYIMPORT) { error = spa_load_verify(spa); if (error != 0) { spa_load_failed(spa, "spa_load_verify failed " "[error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); } } return (0); } static void spa_ld_claim_log_blocks(spa_t *spa) { dmu_tx_t *tx; dsl_pool_t *dp = spa_get_dsl(spa); /* * Claim log blocks that haven't been committed yet. * This must all happen in a single txg. * Note: spa_claim_max_txg is updated by spa_claim_notify(), * invoked from zil_claim_log_block()'s i/o done callback. * Price of rollback is that we abandon the log. */ spa->spa_claiming = B_TRUE; tx = dmu_tx_create_assigned(dp, spa_first_txg(spa)); (void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj, zil_claim, tx, DS_FIND_CHILDREN); dmu_tx_commit(tx); spa->spa_claiming = B_FALSE; spa_set_log_state(spa, SPA_LOG_GOOD); } static void spa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg, boolean_t update_config_cache) { vdev_t *rvd = spa->spa_root_vdev; int need_update = B_FALSE; /* * If the config cache is stale, or we have uninitialized * metaslabs (see spa_vdev_add()), then update the config. * * If this is a verbatim import, trust the current * in-core spa_config and update the disk labels. */ if (update_config_cache || config_cache_txg != spa->spa_config_txg || spa->spa_load_state == SPA_LOAD_IMPORT || spa->spa_load_state == SPA_LOAD_RECOVER || (spa->spa_import_flags & ZFS_IMPORT_VERBATIM)) need_update = B_TRUE; for (int c = 0; c < rvd->vdev_children; c++) if (rvd->vdev_child[c]->vdev_ms_array == 0) need_update = B_TRUE; /* * Update the config cache asynchronously in case we're the * root pool, in which case the config cache isn't writable yet. */ if (need_update) spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); } static void spa_ld_prepare_for_reload(spa_t *spa) { spa_mode_t mode = spa->spa_mode; int async_suspended = spa->spa_async_suspended; spa_unload(spa); spa_deactivate(spa); spa_activate(spa, mode); /* * We save the value of spa_async_suspended as it gets reset to 0 by * spa_unload(). We want to restore it back to the original value before * returning as we might be calling spa_async_resume() later. */ spa->spa_async_suspended = async_suspended; } static int spa_ld_read_checkpoint_txg(spa_t *spa) { uberblock_t checkpoint; int error = 0; ASSERT0(spa->spa_checkpoint_txg); ASSERT(MUTEX_HELD(&spa_namespace_lock)); error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint); if (error == ENOENT) return (0); if (error != 0) return (error); ASSERT3U(checkpoint.ub_txg, !=, 0); ASSERT3U(checkpoint.ub_checkpoint_txg, !=, 0); ASSERT3U(checkpoint.ub_timestamp, !=, 0); spa->spa_checkpoint_txg = checkpoint.ub_txg; spa->spa_checkpoint_info.sci_timestamp = checkpoint.ub_timestamp; return (0); } static int spa_ld_mos_init(spa_t *spa, spa_import_type_t type) { int error = 0; ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE); /* * Never trust the config that is provided unless we are assembling * a pool following a split. * This means don't trust blkptrs and the vdev tree in general. This * also effectively puts the spa in read-only mode since * spa_writeable() checks for spa_trust_config to be true. * We will later load a trusted config from the MOS. */ if (type != SPA_IMPORT_ASSEMBLE) spa->spa_trust_config = B_FALSE; /* * Parse the config provided to create a vdev tree. */ error = spa_ld_parse_config(spa, type); if (error != 0) return (error); spa_import_progress_add(spa); /* * Now that we have the vdev tree, try to open each vdev. This involves * opening the underlying physical device, retrieving its geometry and * probing the vdev with a dummy I/O. The state of each vdev will be set * based on the success of those operations. After this we'll be ready * to read from the vdevs. */ error = spa_ld_open_vdevs(spa); if (error != 0) return (error); /* * Read the label of each vdev and make sure that the GUIDs stored * there match the GUIDs in the config provided. * If we're assembling a new pool that's been split off from an * existing pool, the labels haven't yet been updated so we skip * validation for now. */ if (type != SPA_IMPORT_ASSEMBLE) { error = spa_ld_validate_vdevs(spa); if (error != 0) return (error); } /* * Read all vdev labels to find the best uberblock (i.e. latest, * unless spa_load_max_txg is set) and store it in spa_uberblock. We * get the list of features required to read blkptrs in the MOS from * the vdev label with the best uberblock and verify that our version * of zfs supports them all. */ error = spa_ld_select_uberblock(spa, type); if (error != 0) return (error); /* * Pass that uberblock to the dsl_pool layer which will open the root * blkptr. This blkptr points to the latest version of the MOS and will * allow us to read its contents. */ error = spa_ld_open_rootbp(spa); if (error != 0) return (error); return (0); } static int spa_ld_checkpoint_rewind(spa_t *spa) { uberblock_t checkpoint; int error = 0; ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint); if (error != 0) { spa_load_failed(spa, "unable to retrieve checkpointed " "uberblock from the MOS config [error=%d]", error); if (error == ENOENT) error = ZFS_ERR_NO_CHECKPOINT; return (error); } ASSERT3U(checkpoint.ub_txg, <, spa->spa_uberblock.ub_txg); ASSERT3U(checkpoint.ub_txg, ==, checkpoint.ub_checkpoint_txg); /* * We need to update the txg and timestamp of the checkpointed * uberblock to be higher than the latest one. This ensures that * the checkpointed uberblock is selected if we were to close and * reopen the pool right after we've written it in the vdev labels. * (also see block comment in vdev_uberblock_compare) */ checkpoint.ub_txg = spa->spa_uberblock.ub_txg + 1; checkpoint.ub_timestamp = gethrestime_sec(); /* * Set current uberblock to be the checkpointed uberblock. */ spa->spa_uberblock = checkpoint; /* * If we are doing a normal rewind, then the pool is open for * writing and we sync the "updated" checkpointed uberblock to * disk. Once this is done, we've basically rewound the whole * pool and there is no way back. * * There are cases when we don't want to attempt and sync the * checkpointed uberblock to disk because we are opening a * pool as read-only. Specifically, verifying the checkpointed * state with zdb, and importing the checkpointed state to get * a "preview" of its content. */ if (spa_writeable(spa)) { vdev_t *rvd = spa->spa_root_vdev; spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL }; int svdcount = 0; int children = rvd->vdev_children; int c0 = spa_get_random(children); for (int c = 0; c < children; c++) { vdev_t *vd = rvd->vdev_child[(c0 + c) % children]; /* Stop when revisiting the first vdev */ if (c > 0 && svd[0] == vd) break; if (vd->vdev_ms_array == 0 || vd->vdev_islog || !vdev_is_concrete(vd)) continue; svd[svdcount++] = vd; if (svdcount == SPA_SYNC_MIN_VDEVS) break; } error = vdev_config_sync(svd, svdcount, spa->spa_first_txg); if (error == 0) spa->spa_last_synced_guid = rvd->vdev_guid; spa_config_exit(spa, SCL_ALL, FTAG); if (error != 0) { spa_load_failed(spa, "failed to write checkpointed " "uberblock to the vdev labels [error=%d]", error); return (error); } } return (0); } static int spa_ld_mos_with_trusted_config(spa_t *spa, spa_import_type_t type, boolean_t *update_config_cache) { int error; /* * Parse the config for pool, open and validate vdevs, * select an uberblock, and use that uberblock to open * the MOS. */ error = spa_ld_mos_init(spa, type); if (error != 0) return (error); /* * Retrieve the trusted config stored in the MOS and use it to create * a new, exact version of the vdev tree, then reopen all vdevs. */ error = spa_ld_trusted_config(spa, type, B_FALSE); if (error == EAGAIN) { if (update_config_cache != NULL) *update_config_cache = B_TRUE; /* * Redo the loading process with the trusted config if it is * too different from the untrusted config. */ spa_ld_prepare_for_reload(spa); spa_load_note(spa, "RELOADING"); error = spa_ld_mos_init(spa, type); if (error != 0) return (error); error = spa_ld_trusted_config(spa, type, B_TRUE); if (error != 0) return (error); } else if (error != 0) { return (error); } return (0); } /* * Load an existing storage pool, using the config provided. This config * describes which vdevs are part of the pool and is later validated against * partial configs present in each vdev's label and an entire copy of the * config stored in the MOS. */ static int spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) { int error = 0; boolean_t missing_feat_write = B_FALSE; boolean_t checkpoint_rewind = (spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); boolean_t update_config_cache = B_FALSE; ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE); spa_load_note(spa, "LOADING"); error = spa_ld_mos_with_trusted_config(spa, type, &update_config_cache); if (error != 0) return (error); /* * If we are rewinding to the checkpoint then we need to repeat * everything we've done so far in this function but this time * selecting the checkpointed uberblock and using that to open * the MOS. */ if (checkpoint_rewind) { /* * If we are rewinding to the checkpoint update config cache * anyway. */ update_config_cache = B_TRUE; /* * Extract the checkpointed uberblock from the current MOS * and use this as the pool's uberblock from now on. If the * pool is imported as writeable we also write the checkpoint * uberblock to the labels, making the rewind permanent. */ error = spa_ld_checkpoint_rewind(spa); if (error != 0) return (error); /* * Redo the loading process again with the * checkpointed uberblock. */ spa_ld_prepare_for_reload(spa); spa_load_note(spa, "LOADING checkpointed uberblock"); error = spa_ld_mos_with_trusted_config(spa, type, NULL); if (error != 0) return (error); } /* * Retrieve the checkpoint txg if the pool has a checkpoint. */ error = spa_ld_read_checkpoint_txg(spa); if (error != 0) return (error); /* * Retrieve the mapping of indirect vdevs. Those vdevs were removed * from the pool and their contents were re-mapped to other vdevs. Note * that everything that we read before this step must have been * rewritten on concrete vdevs after the last device removal was * initiated. Otherwise we could be reading from indirect vdevs before * we have loaded their mappings. */ error = spa_ld_open_indirect_vdev_metadata(spa); if (error != 0) return (error); /* * Retrieve the full list of active features from the MOS and check if * they are all supported. */ error = spa_ld_check_features(spa, &missing_feat_write); if (error != 0) return (error); /* * Load several special directories from the MOS needed by the dsl_pool * layer. */ error = spa_ld_load_special_directories(spa); if (error != 0) return (error); /* * Retrieve pool properties from the MOS. */ error = spa_ld_get_props(spa); if (error != 0) return (error); /* * Retrieve the list of auxiliary devices - cache devices and spares - * and open them. */ error = spa_ld_open_aux_vdevs(spa, type); if (error != 0) return (error); /* * Load the metadata for all vdevs. Also check if unopenable devices * should be autoreplaced. */ error = spa_ld_load_vdev_metadata(spa); if (error != 0) return (error); error = spa_ld_load_dedup_tables(spa); if (error != 0) return (error); /* * Verify the logs now to make sure we don't have any unexpected errors * when we claim log blocks later. */ error = spa_ld_verify_logs(spa, type, ereport); if (error != 0) return (error); if (missing_feat_write) { ASSERT(spa->spa_load_state == SPA_LOAD_TRYIMPORT); /* * At this point, we know that we can open the pool in * read-only mode but not read-write mode. We now have enough * information and can return to userland. */ return (spa_vdev_err(spa->spa_root_vdev, VDEV_AUX_UNSUP_FEAT, ENOTSUP)); } /* * Traverse the last txgs to make sure the pool was left off in a safe * state. When performing an extreme rewind, we verify the whole pool, * which can take a very long time. */ error = spa_ld_verify_pool_data(spa); if (error != 0) return (error); /* * Calculate the deflated space for the pool. This must be done before * we write anything to the pool because we'd need to update the space * accounting using the deflated sizes. */ spa_update_dspace(spa); /* * We have now retrieved all the information we needed to open the * pool. If we are importing the pool in read-write mode, a few * additional steps must be performed to finish the import. */ if (spa_writeable(spa) && (spa->spa_load_state == SPA_LOAD_RECOVER || spa->spa_load_max_txg == UINT64_MAX)) { uint64_t config_cache_txg = spa->spa_config_txg; ASSERT(spa->spa_load_state != SPA_LOAD_TRYIMPORT); /* * In case of a checkpoint rewind, log the original txg * of the checkpointed uberblock. */ if (checkpoint_rewind) { spa_history_log_internal(spa, "checkpoint rewind", NULL, "rewound state to txg=%llu", (u_longlong_t)spa->spa_uberblock.ub_checkpoint_txg); } /* * Traverse the ZIL and claim all blocks. */ spa_ld_claim_log_blocks(spa); /* * Kick-off the syncing thread. */ spa->spa_sync_on = B_TRUE; txg_sync_start(spa->spa_dsl_pool); mmp_thread_start(spa); /* * Wait for all claims to sync. We sync up to the highest * claimed log block birth time so that claimed log blocks * don't appear to be from the future. spa_claim_max_txg * will have been set for us by ZIL traversal operations * performed above. */ txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg); /* * Check if we need to request an update of the config. On the * next sync, we would update the config stored in vdev labels * and the cachefile (by default /etc/zfs/zpool.cache). */ spa_ld_check_for_config_update(spa, config_cache_txg, update_config_cache); /* * Check if a rebuild was in progress and if so resume it. * Then check all DTLs to see if anything needs resilvering. * The resilver will be deferred if a rebuild was started. */ if (vdev_rebuild_active(spa->spa_root_vdev)) { vdev_rebuild_restart(spa); } else if (!dsl_scan_resilvering(spa->spa_dsl_pool) && vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { spa_async_request(spa, SPA_ASYNC_RESILVER); } /* * Log the fact that we booted up (so that we can detect if * we rebooted in the middle of an operation). */ spa_history_log_version(spa, "open", NULL); spa_restart_removal(spa); spa_spawn_aux_threads(spa); /* * Delete any inconsistent datasets. * * Note: * Since we may be issuing deletes for clones here, * we make sure to do so after we've spawned all the * auxiliary threads above (from which the livelist * deletion zthr is part of). */ (void) dmu_objset_find(spa_name(spa), dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN); /* * Clean up any stale temporary dataset userrefs. */ dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); vdev_initialize_restart(spa->spa_root_vdev); vdev_trim_restart(spa->spa_root_vdev); vdev_autotrim_restart(spa); spa_config_exit(spa, SCL_CONFIG, FTAG); } spa_import_progress_remove(spa_guid(spa)); spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD); spa_load_note(spa, "LOADED"); return (0); } static int spa_load_retry(spa_t *spa, spa_load_state_t state) { spa_mode_t mode = spa->spa_mode; spa_unload(spa); spa_deactivate(spa); spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1; spa_activate(spa, mode); spa_async_suspend(spa); spa_load_note(spa, "spa_load_retry: rewind, max txg: %llu", (u_longlong_t)spa->spa_load_max_txg); return (spa_load(spa, state, SPA_IMPORT_EXISTING)); } /* * If spa_load() fails this function will try loading prior txg's. If * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this * function will not rewind the pool and will return the same error as * spa_load(). */ static int spa_load_best(spa_t *spa, spa_load_state_t state, uint64_t max_request, int rewind_flags) { nvlist_t *loadinfo = NULL; nvlist_t *config = NULL; int load_error, rewind_error; uint64_t safe_rewind_txg; uint64_t min_txg; if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) { spa->spa_load_max_txg = spa->spa_load_txg; spa_set_log_state(spa, SPA_LOG_CLEAR); } else { spa->spa_load_max_txg = max_request; if (max_request != UINT64_MAX) spa->spa_extreme_rewind = B_TRUE; } load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING); if (load_error == 0) return (0); if (load_error == ZFS_ERR_NO_CHECKPOINT) { /* * When attempting checkpoint-rewind on a pool with no * checkpoint, we should not attempt to load uberblocks * from previous txgs when spa_load fails. */ ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); spa_import_progress_remove(spa_guid(spa)); return (load_error); } if (spa->spa_root_vdev != NULL) config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg; spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp; if (rewind_flags & ZPOOL_NEVER_REWIND) { nvlist_free(config); spa_import_progress_remove(spa_guid(spa)); return (load_error); } if (state == SPA_LOAD_RECOVER) { /* Price of rolling back is discarding txgs, including log */ spa_set_log_state(spa, SPA_LOG_CLEAR); } else { /* * If we aren't rolling back save the load info from our first * import attempt so that we can restore it after attempting * to rewind. */ loadinfo = spa->spa_load_info; spa->spa_load_info = fnvlist_alloc(); } spa->spa_load_max_txg = spa->spa_last_ubsync_txg; safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE; min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ? TXG_INITIAL : safe_rewind_txg; /* * Continue as long as we're finding errors, we're still within * the acceptable rewind range, and we're still finding uberblocks */ while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg && spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) { if (spa->spa_load_max_txg < safe_rewind_txg) spa->spa_extreme_rewind = B_TRUE; rewind_error = spa_load_retry(spa, state); } spa->spa_extreme_rewind = B_FALSE; spa->spa_load_max_txg = UINT64_MAX; if (config && (rewind_error || state != SPA_LOAD_RECOVER)) spa_config_set(spa, config); else nvlist_free(config); if (state == SPA_LOAD_RECOVER) { ASSERT3P(loadinfo, ==, NULL); spa_import_progress_remove(spa_guid(spa)); return (rewind_error); } else { /* Store the rewind info as part of the initial load info */ fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO, spa->spa_load_info); /* Restore the initial load info */ fnvlist_free(spa->spa_load_info); spa->spa_load_info = loadinfo; spa_import_progress_remove(spa_guid(spa)); return (load_error); } } /* * Pool Open/Import * * The import case is identical to an open except that the configuration is sent * down from userland, instead of grabbed from the configuration cache. For the * case of an open, the pool configuration will exist in the * POOL_STATE_UNINITIALIZED state. * * The stats information (gen/count/ustats) is used to gather vdev statistics at * the same time open the pool, without having to keep around the spa_t in some * ambiguous state. */ static int spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, nvlist_t **config) { spa_t *spa; spa_load_state_t state = SPA_LOAD_OPEN; int error; int locked = B_FALSE; int firstopen = B_FALSE; *spapp = NULL; /* * As disgusting as this is, we need to support recursive calls to this * function because dsl_dir_open() is called during spa_load(), and ends * up calling spa_open() again. The real fix is to figure out how to * avoid dsl_dir_open() calling this in the first place. */ if (MUTEX_NOT_HELD(&spa_namespace_lock)) { mutex_enter(&spa_namespace_lock); locked = B_TRUE; } if ((spa = spa_lookup(pool)) == NULL) { if (locked) mutex_exit(&spa_namespace_lock); return (SET_ERROR(ENOENT)); } if (spa->spa_state == POOL_STATE_UNINITIALIZED) { zpool_load_policy_t policy; firstopen = B_TRUE; zpool_get_load_policy(nvpolicy ? nvpolicy : spa->spa_config, &policy); if (policy.zlp_rewind & ZPOOL_DO_REWIND) state = SPA_LOAD_RECOVER; spa_activate(spa, spa_mode_global); if (state != SPA_LOAD_RECOVER) spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE; zfs_dbgmsg("spa_open_common: opening %s", pool); error = spa_load_best(spa, state, policy.zlp_txg, policy.zlp_rewind); if (error == EBADF) { /* * If vdev_validate() returns failure (indicated by * EBADF), it indicates that one of the vdevs indicates * that the pool has been exported or destroyed. If * this is the case, the config cache is out of sync and * we should remove the pool from the namespace. */ spa_unload(spa); spa_deactivate(spa); spa_write_cachefile(spa, B_TRUE, B_TRUE); spa_remove(spa); if (locked) mutex_exit(&spa_namespace_lock); return (SET_ERROR(ENOENT)); } if (error) { /* * We can't open the pool, but we still have useful * information: the state of each vdev after the * attempted vdev_open(). Return this to the user. */ if (config != NULL && spa->spa_config) { VERIFY(nvlist_dup(spa->spa_config, config, KM_SLEEP) == 0); VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, spa->spa_load_info) == 0); } spa_unload(spa); spa_deactivate(spa); spa->spa_last_open_failed = error; if (locked) mutex_exit(&spa_namespace_lock); *spapp = NULL; return (error); } } spa_open_ref(spa, tag); if (config != NULL) *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); /* * If we've recovered the pool, pass back any information we * gathered while doing the load. */ if (state == SPA_LOAD_RECOVER) { VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, spa->spa_load_info) == 0); } if (locked) { spa->spa_last_open_failed = 0; spa->spa_last_ubsync_txg = 0; spa->spa_load_txg = 0; mutex_exit(&spa_namespace_lock); } if (firstopen) zvol_create_minors_recursive(spa_name(spa)); *spapp = spa; return (0); } int spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy, nvlist_t **config) { return (spa_open_common(name, spapp, tag, policy, config)); } int spa_open(const char *name, spa_t **spapp, void *tag) { return (spa_open_common(name, spapp, tag, NULL, NULL)); } /* * Lookup the given spa_t, incrementing the inject count in the process, * preventing it from being exported or destroyed. */ spa_t * spa_inject_addref(char *name) { spa_t *spa; mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(name)) == NULL) { mutex_exit(&spa_namespace_lock); return (NULL); } spa->spa_inject_ref++; mutex_exit(&spa_namespace_lock); return (spa); } void spa_inject_delref(spa_t *spa) { mutex_enter(&spa_namespace_lock); spa->spa_inject_ref--; mutex_exit(&spa_namespace_lock); } /* * Add spares device information to the nvlist. */ static void spa_add_spares(spa_t *spa, nvlist_t *config) { nvlist_t **spares; uint_t i, nspares; nvlist_t *nvroot; uint64_t guid; vdev_stat_t *vs; uint_t vsc; uint64_t pool; ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); if (spa->spa_spares.sav_count == 0) return; VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); if (nspares != 0) { VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, spares, nspares) == 0); VERIFY(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); /* * Go through and find any spares which have since been * repurposed as an active spare. If this is the case, update * their status appropriately. */ for (i = 0; i < nspares; i++) { VERIFY(nvlist_lookup_uint64(spares[i], ZPOOL_CONFIG_GUID, &guid) == 0); if (spa_spare_exists(guid, &pool, NULL) && pool != 0ULL) { VERIFY(nvlist_lookup_uint64_array( spares[i], ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) == 0); vs->vs_state = VDEV_STATE_CANT_OPEN; vs->vs_aux = VDEV_AUX_SPARED; } } } } /* * Add l2cache device information to the nvlist, including vdev stats. */ static void spa_add_l2cache(spa_t *spa, nvlist_t *config) { nvlist_t **l2cache; uint_t i, j, nl2cache; nvlist_t *nvroot; uint64_t guid; vdev_t *vd; vdev_stat_t *vs; uint_t vsc; ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); if (spa->spa_l2cache.sav_count == 0) return; VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); if (nl2cache != 0) { VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); VERIFY(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); /* * Update level 2 cache device stats. */ for (i = 0; i < nl2cache; i++) { VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID, &guid) == 0); vd = NULL; for (j = 0; j < spa->spa_l2cache.sav_count; j++) { if (guid == spa->spa_l2cache.sav_vdevs[j]->vdev_guid) { vd = spa->spa_l2cache.sav_vdevs[j]; break; } } ASSERT(vd != NULL); VERIFY(nvlist_lookup_uint64_array(l2cache[i], ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) == 0); vdev_get_stats(vd, vs); vdev_config_generate_stats(vd, l2cache[i]); } } } static void spa_feature_stats_from_disk(spa_t *spa, nvlist_t *features) { zap_cursor_t zc; zap_attribute_t za; if (spa->spa_feat_for_read_obj != 0) { for (zap_cursor_init(&zc, spa->spa_meta_objset, spa->spa_feat_for_read_obj); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { ASSERT(za.za_integer_length == sizeof (uint64_t) && za.za_num_integers == 1); VERIFY0(nvlist_add_uint64(features, za.za_name, za.za_first_integer)); } zap_cursor_fini(&zc); } if (spa->spa_feat_for_write_obj != 0) { for (zap_cursor_init(&zc, spa->spa_meta_objset, spa->spa_feat_for_write_obj); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { ASSERT(za.za_integer_length == sizeof (uint64_t) && za.za_num_integers == 1); VERIFY0(nvlist_add_uint64(features, za.za_name, za.za_first_integer)); } zap_cursor_fini(&zc); } } static void spa_feature_stats_from_cache(spa_t *spa, nvlist_t *features) { int i; for (i = 0; i < SPA_FEATURES; i++) { zfeature_info_t feature = spa_feature_table[i]; uint64_t refcount; if (feature_get_refcount(spa, &feature, &refcount) != 0) continue; VERIFY0(nvlist_add_uint64(features, feature.fi_guid, refcount)); } } /* * Store a list of pool features and their reference counts in the * config. * * The first time this is called on a spa, allocate a new nvlist, fetch * the pool features and reference counts from disk, then save the list * in the spa. In subsequent calls on the same spa use the saved nvlist * and refresh its values from the cached reference counts. This * ensures we don't block here on I/O on a suspended pool so 'zpool * clear' can resume the pool. */ static void spa_add_feature_stats(spa_t *spa, nvlist_t *config) { nvlist_t *features; ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); mutex_enter(&spa->spa_feat_stats_lock); features = spa->spa_feat_stats; if (features != NULL) { spa_feature_stats_from_cache(spa, features); } else { VERIFY0(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP)); spa->spa_feat_stats = features; spa_feature_stats_from_disk(spa, features); } VERIFY0(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS, features)); mutex_exit(&spa->spa_feat_stats_lock); } int spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) { int error; spa_t *spa; *config = NULL; error = spa_open_common(name, &spa, FTAG, NULL, config); if (spa != NULL) { /* * This still leaves a window of inconsistency where the spares * or l2cache devices could change and the config would be * self-inconsistent. */ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); if (*config != NULL) { uint64_t loadtimes[2]; loadtimes[0] = spa->spa_loaded_ts.tv_sec; loadtimes[1] = spa->spa_loaded_ts.tv_nsec; VERIFY(nvlist_add_uint64_array(*config, ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0); VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT, spa_get_errlog_size(spa)) == 0); if (spa_suspended(spa)) { VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_SUSPENDED, spa->spa_failmode) == 0); VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_SUSPENDED_REASON, spa->spa_suspended) == 0); } spa_add_spares(spa, *config); spa_add_l2cache(spa, *config); spa_add_feature_stats(spa, *config); } } /* * We want to get the alternate root even for faulted pools, so we cheat * and call spa_lookup() directly. */ if (altroot) { if (spa == NULL) { mutex_enter(&spa_namespace_lock); spa = spa_lookup(name); if (spa) spa_altroot(spa, altroot, buflen); else altroot[0] = '\0'; spa = NULL; mutex_exit(&spa_namespace_lock); } else { spa_altroot(spa, altroot, buflen); } } if (spa != NULL) { spa_config_exit(spa, SCL_CONFIG, FTAG); spa_close(spa, FTAG); } return (error); } /* * Validate that the auxiliary device array is well formed. We must have an * array of nvlists, each which describes a valid leaf vdev. If this is an * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be * specified, as long as they are well-formed. */ static int spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, spa_aux_vdev_t *sav, const char *config, uint64_t version, vdev_labeltype_t label) { nvlist_t **dev; uint_t i, ndev; vdev_t *vd; int error; ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); /* * It's acceptable to have no devs specified. */ if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0) return (0); if (ndev == 0) return (SET_ERROR(EINVAL)); /* * Make sure the pool is formatted with a version that supports this * device type. */ if (spa_version(spa) < version) return (SET_ERROR(ENOTSUP)); /* * Set the pending device list so we correctly handle device in-use * checking. */ sav->sav_pending = dev; sav->sav_npending = ndev; for (i = 0; i < ndev; i++) { if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0, mode)) != 0) goto out; if (!vd->vdev_ops->vdev_op_leaf) { vdev_free(vd); error = SET_ERROR(EINVAL); goto out; } vd->vdev_top = vd; if ((error = vdev_open(vd)) == 0 && (error = vdev_label_init(vd, crtxg, label)) == 0) { VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID, vd->vdev_guid) == 0); } vdev_free(vd); if (error && (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE)) goto out; else error = 0; } out: sav->sav_pending = NULL; sav->sav_npending = 0; return (error); } static int spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) { int error; ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode, &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES, VDEV_LABEL_SPARE)) != 0) { return (error); } return (spa_validate_aux_devs(spa, nvroot, crtxg, mode, &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE, VDEV_LABEL_L2CACHE)); } static void spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs, const char *config) { int i; if (sav->sav_config != NULL) { nvlist_t **olddevs; uint_t oldndevs; nvlist_t **newdevs; /* * Generate new dev list by concatenating with the * current dev list. */ VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config, &olddevs, &oldndevs) == 0); newdevs = kmem_alloc(sizeof (void *) * (ndevs + oldndevs), KM_SLEEP); for (i = 0; i < oldndevs; i++) VERIFY(nvlist_dup(olddevs[i], &newdevs[i], KM_SLEEP) == 0); for (i = 0; i < ndevs; i++) VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs], KM_SLEEP) == 0); VERIFY(nvlist_remove(sav->sav_config, config, DATA_TYPE_NVLIST_ARRAY) == 0); VERIFY(nvlist_add_nvlist_array(sav->sav_config, config, newdevs, ndevs + oldndevs) == 0); for (i = 0; i < oldndevs + ndevs; i++) nvlist_free(newdevs[i]); kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *)); } else { /* * Generate a new dev list. */ VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_nvlist_array(sav->sav_config, config, devs, ndevs) == 0); } } /* * Stop and drop level 2 ARC devices */ void spa_l2cache_drop(spa_t *spa) { vdev_t *vd; int i; spa_aux_vdev_t *sav = &spa->spa_l2cache; for (i = 0; i < sav->sav_count; i++) { uint64_t pool; vd = sav->sav_vdevs[i]; ASSERT(vd != NULL); if (spa_l2cache_exists(vd->vdev_guid, &pool) && pool != 0ULL && l2arc_vdev_present(vd)) l2arc_remove_vdev(vd); } } /* * Verify encryption parameters for spa creation. If we are encrypting, we must * have the encryption feature flag enabled. */ static int spa_create_check_encryption_params(dsl_crypto_params_t *dcp, boolean_t has_encryption) { if (dcp->cp_crypt != ZIO_CRYPT_OFF && dcp->cp_crypt != ZIO_CRYPT_INHERIT && !has_encryption) return (SET_ERROR(ENOTSUP)); return (dmu_objset_create_crypt_check(NULL, dcp, NULL)); } /* * Pool Creation */ int spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, nvlist_t *zplprops, dsl_crypto_params_t *dcp) { spa_t *spa; char *altroot = NULL; vdev_t *rvd; dsl_pool_t *dp; dmu_tx_t *tx; int error = 0; uint64_t txg = TXG_INITIAL; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; uint64_t version, obj; boolean_t has_features; boolean_t has_encryption; boolean_t has_allocclass; spa_feature_t feat; char *feat_name; char *poolname; nvlist_t *nvl; if (props == NULL || nvlist_lookup_string(props, "tname", &poolname) != 0) poolname = (char *)pool; /* * If this pool already exists, return failure. */ mutex_enter(&spa_namespace_lock); if (spa_lookup(poolname) != NULL) { mutex_exit(&spa_namespace_lock); return (SET_ERROR(EEXIST)); } /* * Allocate a new spa_t structure. */ nvl = fnvlist_alloc(); fnvlist_add_string(nvl, ZPOOL_CONFIG_POOL_NAME, pool); (void) nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); spa = spa_add(poolname, nvl, altroot); fnvlist_free(nvl); spa_activate(spa, spa_mode_global); if (props && (error = spa_prop_validate(spa, props))) { spa_deactivate(spa); spa_remove(spa); mutex_exit(&spa_namespace_lock); return (error); } /* * Temporary pool names should never be written to disk. */ if (poolname != pool) spa->spa_import_flags |= ZFS_IMPORT_TEMP_NAME; has_features = B_FALSE; has_encryption = B_FALSE; has_allocclass = B_FALSE; for (nvpair_t *elem = nvlist_next_nvpair(props, NULL); elem != NULL; elem = nvlist_next_nvpair(props, elem)) { if (zpool_prop_feature(nvpair_name(elem))) { has_features = B_TRUE; feat_name = strchr(nvpair_name(elem), '@') + 1; VERIFY0(zfeature_lookup_name(feat_name, &feat)); if (feat == SPA_FEATURE_ENCRYPTION) has_encryption = B_TRUE; if (feat == SPA_FEATURE_ALLOCATION_CLASSES) has_allocclass = B_TRUE; } } /* verify encryption params, if they were provided */ if (dcp != NULL) { error = spa_create_check_encryption_params(dcp, has_encryption); if (error != 0) { spa_deactivate(spa); spa_remove(spa); mutex_exit(&spa_namespace_lock); return (error); } } if (!has_allocclass && zfs_special_devs(nvroot, NULL)) { spa_deactivate(spa); spa_remove(spa); mutex_exit(&spa_namespace_lock); return (ENOTSUP); } if (has_features || nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) { version = SPA_VERSION; } ASSERT(SPA_VERSION_IS_SUPPORTED(version)); spa->spa_first_txg = txg; spa->spa_uberblock.ub_txg = txg - 1; spa->spa_uberblock.ub_version = version; spa->spa_ubsync = spa->spa_uberblock; spa->spa_load_state = SPA_LOAD_CREATE; spa->spa_removing_phys.sr_state = DSS_NONE; spa->spa_removing_phys.sr_removing_vdev = -1; spa->spa_removing_phys.sr_prev_indirect_vdev = -1; spa->spa_indirect_vdevs_loaded = B_TRUE; /* * Create "The Godfather" zio to hold all async IOs */ spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), KM_SLEEP); for (int i = 0; i < max_ncpus; i++) { spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); } /* * Create the root vdev. */ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); ASSERT(error != 0 || rvd != NULL); ASSERT(error != 0 || spa->spa_root_vdev == rvd); if (error == 0 && !zfs_allocatable_devs(nvroot)) error = SET_ERROR(EINVAL); if (error == 0 && (error = vdev_create(rvd, txg, B_FALSE)) == 0 && (error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) == 0) { /* * instantiate the metaslab groups (this will dirty the vdevs) * we can no longer error exit past this point */ for (int c = 0; error == 0 && c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; vdev_ashift_optimize(vd); vdev_metaslab_set_size(vd); vdev_expand(vd, txg); } } spa_config_exit(spa, SCL_ALL, FTAG); if (error != 0) { spa_unload(spa); spa_deactivate(spa); spa_remove(spa); mutex_exit(&spa_namespace_lock); return (error); } /* * Get the list of spares, if specified. */ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) { VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, spares, nspares) == 0); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_spares(spa); spa_config_exit(spa, SCL_ALL, FTAG); spa->spa_spares.sav_sync = B_TRUE; } /* * Get the list of level 2 cache devices, if specified. */ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0) { VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_l2cache(spa); spa_config_exit(spa, SCL_ALL, FTAG); spa->spa_l2cache.sav_sync = B_TRUE; } spa->spa_is_initializing = B_TRUE; spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, dcp, txg); spa->spa_is_initializing = B_FALSE; /* * Create DDTs (dedup tables). */ ddt_create(spa); spa_update_dspace(spa); tx = dmu_tx_create_assigned(dp, txg); /* * Create the pool's history object. */ if (version >= SPA_VERSION_ZPOOL_HISTORY && !spa->spa_history) spa_history_create_obj(spa, tx); spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_CREATE); spa_history_log_version(spa, "create", tx); /* * Create the pool config object. */ spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE, DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); if (zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { cmn_err(CE_PANIC, "failed to add pool config"); } if (zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION, sizeof (uint64_t), 1, &version, tx) != 0) { cmn_err(CE_PANIC, "failed to add pool version"); } /* Newly created pools with the right version are always deflated. */ if (version >= SPA_VERSION_RAIDZ_DEFLATE) { spa->spa_deflate = TRUE; if (zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { cmn_err(CE_PANIC, "failed to add deflate"); } } /* * Create the deferred-free bpobj. Turn off compression * because sync-to-convergence takes longer if the blocksize * keeps changing. */ obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx); dmu_object_set_compress(spa->spa_meta_objset, obj, ZIO_COMPRESS_OFF, tx); if (zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ, sizeof (uint64_t), 1, &obj, tx) != 0) { cmn_err(CE_PANIC, "failed to add bpobj"); } VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj)); /* * Generate some random noise for salted checksums to operate on. */ (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, sizeof (spa->spa_cksum_salt.zcs_bytes)); /* * Set pool properties. */ spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); spa->spa_multihost = zpool_prop_default_numeric(ZPOOL_PROP_MULTIHOST); spa->spa_autotrim = zpool_prop_default_numeric(ZPOOL_PROP_AUTOTRIM); if (props != NULL) { spa_configfile_set(spa, props, B_FALSE); spa_sync_props(props, tx); } dmu_tx_commit(tx); spa->spa_sync_on = B_TRUE; txg_sync_start(dp); mmp_thread_start(spa); txg_wait_synced(dp, txg); spa_spawn_aux_threads(spa); spa_write_cachefile(spa, B_FALSE, B_TRUE); /* * Don't count references from objsets that are already closed * and are making their way through the eviction process. */ spa_evicting_os_wait(spa); spa->spa_minref = zfs_refcount_count(&spa->spa_refcount); spa->spa_load_state = SPA_LOAD_NONE; mutex_exit(&spa_namespace_lock); return (0); } /* * Import a non-root pool into the system. */ int spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) { spa_t *spa; char *altroot = NULL; spa_load_state_t state = SPA_LOAD_IMPORT; zpool_load_policy_t policy; spa_mode_t mode = spa_mode_global; uint64_t readonly = B_FALSE; int error; nvlist_t *nvroot; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; /* * If a pool with this name exists, return failure. */ mutex_enter(&spa_namespace_lock); if (spa_lookup(pool) != NULL) { mutex_exit(&spa_namespace_lock); return (SET_ERROR(EEXIST)); } /* * Create and initialize the spa structure. */ (void) nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); (void) nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly); if (readonly) mode = SPA_MODE_READ; spa = spa_add(pool, config, altroot); spa->spa_import_flags = flags; /* * Verbatim import - Take a pool and insert it into the namespace * as if it had been loaded at boot. */ if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) { if (props != NULL) spa_configfile_set(spa, props, B_FALSE); spa_write_cachefile(spa, B_FALSE, B_TRUE); spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT); zfs_dbgmsg("spa_import: verbatim import of %s", pool); mutex_exit(&spa_namespace_lock); return (0); } spa_activate(spa, mode); /* * Don't start async tasks until we know everything is healthy. */ spa_async_suspend(spa); zpool_get_load_policy(config, &policy); if (policy.zlp_rewind & ZPOOL_DO_REWIND) state = SPA_LOAD_RECOVER; spa->spa_config_source = SPA_CONFIG_SRC_TRYIMPORT; if (state != SPA_LOAD_RECOVER) { spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; zfs_dbgmsg("spa_import: importing %s", pool); } else { zfs_dbgmsg("spa_import: importing %s, max_txg=%lld " "(RECOVERY MODE)", pool, (longlong_t)policy.zlp_txg); } error = spa_load_best(spa, state, policy.zlp_txg, policy.zlp_rewind); /* * Propagate anything learned while loading the pool and pass it * back to caller (i.e. rewind info, missing devices, etc). */ VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, spa->spa_load_info) == 0); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); /* * Toss any existing sparelist, as it doesn't have any validity * anymore, and conflicts with spa_has_spare(). */ if (spa->spa_spares.sav_config) { nvlist_free(spa->spa_spares.sav_config); spa->spa_spares.sav_config = NULL; spa_load_spares(spa); } if (spa->spa_l2cache.sav_config) { nvlist_free(spa->spa_l2cache.sav_config); spa->spa_l2cache.sav_config = NULL; spa_load_l2cache(spa); } VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); spa_config_exit(spa, SCL_ALL, FTAG); if (props != NULL) spa_configfile_set(spa, props, B_FALSE); if (error != 0 || (props && spa_writeable(spa) && (error = spa_prop_set(spa, props)))) { spa_unload(spa); spa_deactivate(spa); spa_remove(spa); mutex_exit(&spa_namespace_lock); return (error); } spa_async_resume(spa); /* * Override any spares and level 2 cache devices as specified by * the user, as these may have correct device names/devids, etc. */ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) { if (spa->spa_spares.sav_config) VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); else VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, spares, nspares) == 0); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_spares(spa); spa_config_exit(spa, SCL_ALL, FTAG); spa->spa_spares.sav_sync = B_TRUE; } if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0) { if (spa->spa_l2cache.sav_config) VERIFY(nvlist_remove(spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); else VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_l2cache(spa); spa_config_exit(spa, SCL_ALL, FTAG); spa->spa_l2cache.sav_sync = B_TRUE; } /* * Check for any removed devices. */ if (spa->spa_autoreplace) { spa_aux_check_removed(&spa->spa_spares); spa_aux_check_removed(&spa->spa_l2cache); } if (spa_writeable(spa)) { /* * Update the config cache to include the newly-imported pool. */ spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); } /* * It's possible that the pool was expanded while it was exported. * We kick off an async task to handle this for us. */ spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); spa_history_log_version(spa, "import", NULL); spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT); mutex_exit(&spa_namespace_lock); zvol_create_minors_recursive(pool); return (0); } nvlist_t * spa_tryimport(nvlist_t *tryconfig) { nvlist_t *config = NULL; char *poolname, *cachefile; spa_t *spa; uint64_t state; int error; zpool_load_policy_t policy; if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) return (NULL); if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) return (NULL); /* * Create and initialize the spa structure. */ mutex_enter(&spa_namespace_lock); spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL); spa_activate(spa, SPA_MODE_READ); /* * Rewind pool if a max txg was provided. */ zpool_get_load_policy(spa->spa_config, &policy); if (policy.zlp_txg != UINT64_MAX) { spa->spa_load_max_txg = policy.zlp_txg; spa->spa_extreme_rewind = B_TRUE; zfs_dbgmsg("spa_tryimport: importing %s, max_txg=%lld", poolname, (longlong_t)policy.zlp_txg); } else { zfs_dbgmsg("spa_tryimport: importing %s", poolname); } if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_CACHEFILE, &cachefile) == 0) { zfs_dbgmsg("spa_tryimport: using cachefile '%s'", cachefile); spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE; } else { spa->spa_config_source = SPA_CONFIG_SRC_SCAN; } error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING); /* * If 'tryconfig' was at least parsable, return the current config. */ if (spa->spa_root_vdev != NULL) { config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, poolname) == 0); VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, state) == 0); VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, spa->spa_uberblock.ub_timestamp) == 0); VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, spa->spa_load_info) == 0); VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_ERRATA, spa->spa_errata) == 0); /* * If the bootfs property exists on this pool then we * copy it out so that external consumers can tell which * pools are bootable. */ if ((!error || error == EEXIST) && spa->spa_bootfs) { char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); /* * We have to play games with the name since the * pool was opened as TRYIMPORT_NAME. */ if (dsl_dsobj_to_dsname(spa_name(spa), spa->spa_bootfs, tmpname) == 0) { char *cp; char *dsname; dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); cp = strchr(tmpname, '/'); if (cp == NULL) { (void) strlcpy(dsname, tmpname, MAXPATHLEN); } else { (void) snprintf(dsname, MAXPATHLEN, "%s/%s", poolname, ++cp); } VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_BOOTFS, dsname) == 0); kmem_free(dsname, MAXPATHLEN); } kmem_free(tmpname, MAXPATHLEN); } /* * Add the list of hot spares and level 2 cache devices. */ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); spa_add_spares(spa, config); spa_add_l2cache(spa, config); spa_config_exit(spa, SCL_CONFIG, FTAG); } spa_unload(spa); spa_deactivate(spa); spa_remove(spa); mutex_exit(&spa_namespace_lock); return (config); } /* * Pool export/destroy * * The act of destroying or exporting a pool is very simple. We make sure there * is no more pending I/O and any references to the pool are gone. Then, we * update the pool state and sync all the labels to disk, removing the * configuration from the cache afterwards. If the 'hardforce' flag is set, then * we don't sync the labels or remove the configuration cache. */ static int spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, boolean_t force, boolean_t hardforce) { spa_t *spa; if (oldconfig) *oldconfig = NULL; if (!(spa_mode_global & SPA_MODE_WRITE)) return (SET_ERROR(EROFS)); mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(pool)) == NULL) { mutex_exit(&spa_namespace_lock); return (SET_ERROR(ENOENT)); } if (spa->spa_is_exporting) { /* the pool is being exported by another thread */ mutex_exit(&spa_namespace_lock); return (SET_ERROR(ZFS_ERR_EXPORT_IN_PROGRESS)); } spa->spa_is_exporting = B_TRUE; /* * Put a hold on the pool, drop the namespace lock, stop async tasks, * reacquire the namespace lock, and see if we can export. */ spa_open_ref(spa, FTAG); mutex_exit(&spa_namespace_lock); spa_async_suspend(spa); if (spa->spa_zvol_taskq) { zvol_remove_minors(spa, spa_name(spa), B_TRUE); taskq_wait(spa->spa_zvol_taskq); } mutex_enter(&spa_namespace_lock); spa_close(spa, FTAG); if (spa->spa_state == POOL_STATE_UNINITIALIZED) goto export_spa; /* * The pool will be in core if it's openable, in which case we can * modify its state. Objsets may be open only because they're dirty, * so we have to force it to sync before checking spa_refcnt. */ if (spa->spa_sync_on) { txg_wait_synced(spa->spa_dsl_pool, 0); spa_evicting_os_wait(spa); } /* * A pool cannot be exported or destroyed if there are active * references. If we are resetting a pool, allow references by * fault injection handlers. */ if (!spa_refcount_zero(spa) || (spa->spa_inject_ref != 0 && new_state != POOL_STATE_UNINITIALIZED)) { spa_async_resume(spa); spa->spa_is_exporting = B_FALSE; mutex_exit(&spa_namespace_lock); return (SET_ERROR(EBUSY)); } if (spa->spa_sync_on) { /* * A pool cannot be exported if it has an active shared spare. * This is to prevent other pools stealing the active spare * from an exported pool. At user's own will, such pool can * be forcedly exported. */ if (!force && new_state == POOL_STATE_EXPORTED && spa_has_active_shared_spare(spa)) { spa_async_resume(spa); spa->spa_is_exporting = B_FALSE; mutex_exit(&spa_namespace_lock); return (SET_ERROR(EXDEV)); } /* * We're about to export or destroy this pool. Make sure * we stop all initialization and trim activity here before * we set the spa_final_txg. This will ensure that all * dirty data resulting from the initialization is * committed to disk before we unload the pool. */ if (spa->spa_root_vdev != NULL) { vdev_t *rvd = spa->spa_root_vdev; vdev_initialize_stop_all(rvd, VDEV_INITIALIZE_ACTIVE); vdev_trim_stop_all(rvd, VDEV_TRIM_ACTIVE); vdev_autotrim_stop_all(spa); vdev_rebuild_stop_all(spa); } /* * We want this to be reflected on every label, * so mark them all dirty. spa_unload() will do the * final sync that pushes these changes out. */ if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa->spa_state = new_state; spa->spa_final_txg = spa_last_synced_txg(spa) + TXG_DEFER_SIZE + 1; vdev_config_dirty(spa->spa_root_vdev); spa_config_exit(spa, SCL_ALL, FTAG); } } export_spa: if (new_state == POOL_STATE_DESTROYED) spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_DESTROY); else if (new_state == POOL_STATE_EXPORTED) spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_EXPORT); if (spa->spa_state != POOL_STATE_UNINITIALIZED) { spa_unload(spa); spa_deactivate(spa); } if (oldconfig && spa->spa_config) VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); if (new_state != POOL_STATE_UNINITIALIZED) { if (!hardforce) spa_write_cachefile(spa, B_TRUE, B_TRUE); spa_remove(spa); } else { /* * If spa_remove() is not called for this spa_t and * there is any possibility that it can be reused, * we make sure to reset the exporting flag. */ spa->spa_is_exporting = B_FALSE; } mutex_exit(&spa_namespace_lock); return (0); } /* * Destroy a storage pool. */ int spa_destroy(char *pool) { return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, B_FALSE, B_FALSE)); } /* * Export a storage pool. */ int spa_export(char *pool, nvlist_t **oldconfig, boolean_t force, boolean_t hardforce) { return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, force, hardforce)); } /* * Similar to spa_export(), this unloads the spa_t without actually removing it * from the namespace in any way. */ int spa_reset(char *pool) { return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, B_FALSE, B_FALSE)); } /* * ========================================================================== * Device manipulation * ========================================================================== */ /* * Add a device to a storage pool. */ int spa_vdev_add(spa_t *spa, nvlist_t *nvroot) { uint64_t txg; int error; vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd, *tvd; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; ASSERT(spa_writeable(spa)); txg = spa_vdev_enter(spa); if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, VDEV_ALLOC_ADD)) != 0) return (spa_vdev_exit(spa, NULL, txg, error)); spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) != 0) nspares = 0; if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) != 0) nl2cache = 0; if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) return (spa_vdev_exit(spa, vd, txg, EINVAL)); if (vd->vdev_children != 0 && (error = vdev_create(vd, txg, B_FALSE)) != 0) return (spa_vdev_exit(spa, vd, txg, error)); /* * We must validate the spares and l2cache devices after checking the * children. Otherwise, vdev_inuse() will blindly overwrite the spare. */ if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) return (spa_vdev_exit(spa, vd, txg, error)); /* * If we are in the middle of a device removal, we can only add * devices which match the existing devices in the pool. * If we are in the middle of a removal, or have some indirect * vdevs, we can not add raidz toplevels. */ if (spa->spa_vdev_removal != NULL || spa->spa_removing_phys.sr_prev_indirect_vdev != -1) { for (int c = 0; c < vd->vdev_children; c++) { tvd = vd->vdev_child[c]; if (spa->spa_vdev_removal != NULL && tvd->vdev_ashift != spa->spa_max_ashift) { return (spa_vdev_exit(spa, vd, txg, EINVAL)); } /* Fail if top level vdev is raidz */ if (tvd->vdev_ops == &vdev_raidz_ops) { return (spa_vdev_exit(spa, vd, txg, EINVAL)); } /* * Need the top level mirror to be * a mirror of leaf vdevs only */ if (tvd->vdev_ops == &vdev_mirror_ops) { for (uint64_t cid = 0; cid < tvd->vdev_children; cid++) { vdev_t *cvd = tvd->vdev_child[cid]; if (!cvd->vdev_ops->vdev_op_leaf) { return (spa_vdev_exit(spa, vd, txg, EINVAL)); } } } } } for (int c = 0; c < vd->vdev_children; c++) { tvd = vd->vdev_child[c]; vdev_remove_child(vd, tvd); tvd->vdev_id = rvd->vdev_children; vdev_add_child(rvd, tvd); vdev_config_dirty(tvd); } if (nspares != 0) { spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, ZPOOL_CONFIG_SPARES); spa_load_spares(spa); spa->spa_spares.sav_sync = B_TRUE; } if (nl2cache != 0) { spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, ZPOOL_CONFIG_L2CACHE); spa_load_l2cache(spa); spa->spa_l2cache.sav_sync = B_TRUE; } /* * We have to be careful when adding new vdevs to an existing pool. * If other threads start allocating from these vdevs before we * sync the config cache, and we lose power, then upon reboot we may * fail to open the pool because there are DVAs that the config cache * can't translate. Therefore, we first add the vdevs without * initializing metaslabs; sync the config cache (via spa_vdev_exit()); * and then let spa_config_update() initialize the new metaslabs. * * spa_load() checks for added-but-not-initialized vdevs, so that * if we lose power at any point in this sequence, the remaining * steps will be completed the next time we load the pool. */ (void) spa_vdev_exit(spa, vd, txg, 0); mutex_enter(&spa_namespace_lock); spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); spa_event_notify(spa, NULL, NULL, ESC_ZFS_VDEV_ADD); mutex_exit(&spa_namespace_lock); return (0); } /* * Attach a device to a mirror. The arguments are the path to any device * in the mirror, and the nvroot for the new device. If the path specifies * a device that is not mirrored, we automatically insert the mirror vdev. * * If 'replacing' is specified, the new device is intended to replace the * existing device; in this case the two devices are made into their own * mirror using the 'replacing' vdev, which is functionally identical to * the mirror vdev (it actually reuses all the same ops) but has a few * extra rules: you can't attach to it after it's been created, and upon * completion of resilvering, the first disk (the one being replaced) * is automatically detached. * * If 'rebuild' is specified, then sequential reconstruction (a.ka. rebuild) * should be performed instead of traditional healing reconstruction. From * an administrators perspective these are both resilver operations. */ int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, int rebuild) { uint64_t txg, dtl_max_txg; vdev_t *rvd = spa->spa_root_vdev; vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; vdev_ops_t *pvops; char *oldvdpath, *newvdpath; int newvd_isspare; int error; ASSERT(spa_writeable(spa)); txg = spa_vdev_enter(spa); oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); ASSERT(MUTEX_HELD(&spa_namespace_lock)); if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { error = (spa_has_checkpoint(spa)) ? ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; return (spa_vdev_exit(spa, NULL, txg, error)); } if (rebuild) { if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD)) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); if (dsl_scan_resilvering(spa_get_dsl(spa))) return (spa_vdev_exit(spa, NULL, txg, ZFS_ERR_RESILVER_IN_PROGRESS)); } else { if (vdev_rebuild_active(rvd)) return (spa_vdev_exit(spa, NULL, txg, ZFS_ERR_REBUILD_IN_PROGRESS)); } if (spa->spa_vdev_removal != NULL) return (spa_vdev_exit(spa, NULL, txg, EBUSY)); if (oldvd == NULL) return (spa_vdev_exit(spa, NULL, txg, ENODEV)); if (!oldvd->vdev_ops->vdev_op_leaf) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); pvd = oldvd->vdev_parent; if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, VDEV_ALLOC_ATTACH)) != 0) return (spa_vdev_exit(spa, NULL, txg, EINVAL)); if (newrootvd->vdev_children != 1) return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); newvd = newrootvd->vdev_child[0]; if (!newvd->vdev_ops->vdev_op_leaf) return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); if ((error = vdev_create(newrootvd, txg, replacing)) != 0) return (spa_vdev_exit(spa, newrootvd, txg, error)); /* * Spares can't replace logs */ if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); if (rebuild) { /* * For rebuilds, the parent vdev must support reconstruction * using only space maps. This means the only allowable * parents are the root vdev or a mirror vdev. */ if (pvd->vdev_ops != &vdev_mirror_ops && pvd->vdev_ops != &vdev_root_ops) { return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); } } if (!replacing) { /* * For attach, the only allowable parent is a mirror or the root * vdev. */ if (pvd->vdev_ops != &vdev_mirror_ops && pvd->vdev_ops != &vdev_root_ops) return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); pvops = &vdev_mirror_ops; } else { /* * Active hot spares can only be replaced by inactive hot * spares. */ if (pvd->vdev_ops == &vdev_spare_ops && oldvd->vdev_isspare && !spa_has_spare(spa, newvd->vdev_guid)) return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); /* * If the source is a hot spare, and the parent isn't already a * spare, then we want to create a new hot spare. Otherwise, we * want to create a replacing vdev. The user is not allowed to * attach to a spared vdev child unless the 'isspare' state is * the same (spare replaces spare, non-spare replaces * non-spare). */ if (pvd->vdev_ops == &vdev_replacing_ops && spa_version(spa) < SPA_VERSION_MULTI_REPLACE) { return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); } else if (pvd->vdev_ops == &vdev_spare_ops && newvd->vdev_isspare != oldvd->vdev_isspare) { return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); } if (newvd->vdev_isspare) pvops = &vdev_spare_ops; else pvops = &vdev_replacing_ops; } /* * Make sure the new device is big enough. */ if (newvd->vdev_asize < vdev_get_min_asize(oldvd)) return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); /* * The new device cannot have a higher alignment requirement * than the top-level vdev. */ if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); /* * If this is an in-place replacement, update oldvd's path and devid * to make it distinguishable from newvd, and unopenable from now on. */ if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { spa_strfree(oldvd->vdev_path); oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, KM_SLEEP); (void) snprintf(oldvd->vdev_path, strlen(newvd->vdev_path) + 5, "%s/%s", newvd->vdev_path, "old"); if (oldvd->vdev_devid != NULL) { spa_strfree(oldvd->vdev_devid); oldvd->vdev_devid = NULL; } } /* * If the parent is not a mirror, or if we're replacing, insert the new * mirror/replacing/spare vdev above oldvd. */ if (pvd->vdev_ops != pvops) pvd = vdev_add_parent(oldvd, pvops); ASSERT(pvd->vdev_top->vdev_parent == rvd); ASSERT(pvd->vdev_ops == pvops); ASSERT(oldvd->vdev_parent == pvd); /* * Extract the new device from its root and add it to pvd. */ vdev_remove_child(newrootvd, newvd); newvd->vdev_id = pvd->vdev_children; newvd->vdev_crtxg = oldvd->vdev_crtxg; vdev_add_child(pvd, newvd); /* * Reevaluate the parent vdev state. */ vdev_propagate_state(pvd); tvd = newvd->vdev_top; ASSERT(pvd->vdev_top == tvd); ASSERT(tvd->vdev_parent == rvd); vdev_config_dirty(tvd); /* * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account * for any dmu_sync-ed blocks. It will propagate upward when * spa_vdev_exit() calls vdev_dtl_reassess(). */ dtl_max_txg = txg + TXG_CONCURRENT_STATES; vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, dtl_max_txg - TXG_INITIAL); if (newvd->vdev_isspare) { spa_spare_activate(newvd); spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE); } oldvdpath = spa_strdup(oldvd->vdev_path); newvdpath = spa_strdup(newvd->vdev_path); newvd_isspare = newvd->vdev_isspare; /* * Mark newvd's DTL dirty in this txg. */ vdev_dirty(tvd, VDD_DTL, newvd, txg); /* * Schedule the resilver or rebuild to restart in the future. We do * this to ensure that dmu_sync-ed blocks have been stitched into the * respective datasets. */ if (rebuild) { newvd->vdev_rebuild_txg = txg; vdev_rebuild(tvd); } else { newvd->vdev_resilver_txg = txg; if (dsl_scan_resilvering(spa_get_dsl(spa)) && spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) { vdev_defer_resilver(newvd); } else { dsl_scan_restart_resilver(spa->spa_dsl_pool, dtl_max_txg); } } if (spa->spa_bootfs) spa_event_notify(spa, newvd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH); spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_ATTACH); /* * Commit the config */ (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0); spa_history_log_internal(spa, "vdev attach", NULL, "%s vdev=%s %s vdev=%s", replacing && newvd_isspare ? "spare in" : replacing ? "replace" : "attach", newvdpath, replacing ? "for" : "to", oldvdpath); spa_strfree(oldvdpath); spa_strfree(newvdpath); return (0); } /* * Detach a device from a mirror or replacing vdev. * * If 'replace_done' is specified, only detach if the parent * is a replacing vdev. */ int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) { uint64_t txg; int error; vdev_t *rvd __maybe_unused = spa->spa_root_vdev; vdev_t *vd, *pvd, *cvd, *tvd; boolean_t unspare = B_FALSE; uint64_t unspare_guid = 0; char *vdpath; ASSERT(spa_writeable(spa)); txg = spa_vdev_detach_enter(spa, guid); vd = spa_lookup_by_guid(spa, guid, B_FALSE); /* * Besides being called directly from the userland through the * ioctl interface, spa_vdev_detach() can be potentially called * at the end of spa_vdev_resilver_done(). * * In the regular case, when we have a checkpoint this shouldn't * happen as we never empty the DTLs of a vdev during the scrub * [see comment in dsl_scan_done()]. Thus spa_vdev_resilvering_done() * should never get here when we have a checkpoint. * * That said, even in a case when we checkpoint the pool exactly * as spa_vdev_resilver_done() calls this function everything * should be fine as the resilver will return right away. */ ASSERT(MUTEX_HELD(&spa_namespace_lock)); if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { error = (spa_has_checkpoint(spa)) ? ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; return (spa_vdev_exit(spa, NULL, txg, error)); } if (vd == NULL) return (spa_vdev_exit(spa, NULL, txg, ENODEV)); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); pvd = vd->vdev_parent; /* * If the parent/child relationship is not as expected, don't do it. * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing * vdev that's replacing B with C. The user's intent in replacing * is to go from M(A,B) to M(A,C). If the user decides to cancel * the replace by detaching C, the expected behavior is to end up * M(A,B). But suppose that right after deciding to detach C, * the replacement of B completes. We would have M(A,C), and then * ask to detach C, which would leave us with just A -- not what * the user wanted. To prevent this, we make sure that the * parent/child relationship hasn't changed -- in this example, * that C's parent is still the replacing vdev R. */ if (pvd->vdev_guid != pguid && pguid != 0) return (spa_vdev_exit(spa, NULL, txg, EBUSY)); /* * Only 'replacing' or 'spare' vdevs can be replaced. */ if (replace_done && pvd->vdev_ops != &vdev_replacing_ops && pvd->vdev_ops != &vdev_spare_ops) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); ASSERT(pvd->vdev_ops != &vdev_spare_ops || spa_version(spa) >= SPA_VERSION_SPARES); /* * Only mirror, replacing, and spare vdevs support detach. */ if (pvd->vdev_ops != &vdev_replacing_ops && pvd->vdev_ops != &vdev_mirror_ops && pvd->vdev_ops != &vdev_spare_ops) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); /* * If this device has the only valid copy of some data, * we cannot safely detach it. */ if (vdev_dtl_required(vd)) return (spa_vdev_exit(spa, NULL, txg, EBUSY)); ASSERT(pvd->vdev_children >= 2); /* * If we are detaching the second disk from a replacing vdev, then * check to see if we changed the original vdev's path to have "/old" * at the end in spa_vdev_attach(). If so, undo that change now. */ if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 && vd->vdev_path != NULL) { size_t len = strlen(vd->vdev_path); for (int c = 0; c < pvd->vdev_children; c++) { cvd = pvd->vdev_child[c]; if (cvd == vd || cvd->vdev_path == NULL) continue; if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && strcmp(cvd->vdev_path + len, "/old") == 0) { spa_strfree(cvd->vdev_path); cvd->vdev_path = spa_strdup(vd->vdev_path); break; } } } /* * If we are detaching the original disk from a spare, then it implies * that the spare should become a real disk, and be removed from the * active spare list for the pool. */ if (pvd->vdev_ops == &vdev_spare_ops && vd->vdev_id == 0 && pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare) unspare = B_TRUE; /* * Erase the disk labels so the disk can be used for other things. * This must be done after all other error cases are handled, * but before we disembowel vd (so we can still do I/O to it). * But if we can't do it, don't treat the error as fatal -- * it may be that the unwritability of the disk is the reason * it's being detached! */ error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); /* * Remove vd from its parent and compact the parent's children. */ vdev_remove_child(pvd, vd); vdev_compact_children(pvd); /* * Remember one of the remaining children so we can get tvd below. */ cvd = pvd->vdev_child[pvd->vdev_children - 1]; /* * If we need to remove the remaining child from the list of hot spares, * do it now, marking the vdev as no longer a spare in the process. * We must do this before vdev_remove_parent(), because that can * change the GUID if it creates a new toplevel GUID. For a similar * reason, we must remove the spare now, in the same txg as the detach; * otherwise someone could attach a new sibling, change the GUID, and * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail. */ if (unspare) { ASSERT(cvd->vdev_isspare); spa_spare_remove(cvd); unspare_guid = cvd->vdev_guid; (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); cvd->vdev_unspare = B_TRUE; } /* * If the parent mirror/replacing vdev only has one child, * the parent is no longer needed. Remove it from the tree. */ if (pvd->vdev_children == 1) { if (pvd->vdev_ops == &vdev_spare_ops) cvd->vdev_unspare = B_FALSE; vdev_remove_parent(cvd); } /* * We don't set tvd until now because the parent we just removed * may have been the previous top-level vdev. */ tvd = cvd->vdev_top; ASSERT(tvd->vdev_parent == rvd); /* * Reevaluate the parent vdev state. */ vdev_propagate_state(cvd); /* * If the 'autoexpand' property is set on the pool then automatically * try to expand the size of the pool. For example if the device we * just detached was smaller than the others, it may be possible to * add metaslabs (i.e. grow the pool). We need to reopen the vdev * first so that we can obtain the updated sizes of the leaf vdevs. */ if (spa->spa_autoexpand) { vdev_reopen(tvd); vdev_expand(tvd, txg); } vdev_config_dirty(tvd); /* * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that * vd->vdev_detached is set and free vd's DTL object in syncing context. * But first make sure we're not on any *other* txg's DTL list, to * prevent vd from being accessed after it's freed. */ vdpath = spa_strdup(vd->vdev_path ? vd->vdev_path : "none"); for (int t = 0; t < TXG_SIZE; t++) (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); vd->vdev_detached = B_TRUE; vdev_dirty(tvd, VDD_DTL, vd, txg); spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE); spa_notify_waiters(spa); /* hang on to the spa before we release the lock */ spa_open_ref(spa, FTAG); error = spa_vdev_exit(spa, vd, txg, 0); spa_history_log_internal(spa, "detach", NULL, "vdev=%s", vdpath); spa_strfree(vdpath); /* * If this was the removal of the original device in a hot spare vdev, * then we want to go through and remove the device from the hot spare * list of every other pool. */ if (unspare) { spa_t *altspa = NULL; mutex_enter(&spa_namespace_lock); while ((altspa = spa_next(altspa)) != NULL) { if (altspa->spa_state != POOL_STATE_ACTIVE || altspa == spa) continue; spa_open_ref(altspa, FTAG); mutex_exit(&spa_namespace_lock); (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE); mutex_enter(&spa_namespace_lock); spa_close(altspa, FTAG); } mutex_exit(&spa_namespace_lock); /* search the rest of the vdevs for spares to remove */ spa_vdev_resilver_done(spa); } /* all done with the spa; OK to release */ mutex_enter(&spa_namespace_lock); spa_close(spa, FTAG); mutex_exit(&spa_namespace_lock); return (error); } static int spa_vdev_initialize_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type, list_t *vd_list) { ASSERT(MUTEX_HELD(&spa_namespace_lock)); spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); /* Look up vdev and ensure it's a leaf. */ vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE); if (vd == NULL || vd->vdev_detached) { spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); return (SET_ERROR(ENODEV)); } else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) { spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); return (SET_ERROR(EINVAL)); } else if (!vdev_writeable(vd)) { spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); return (SET_ERROR(EROFS)); } mutex_enter(&vd->vdev_initialize_lock); spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); /* * When we activate an initialize action we check to see * if the vdev_initialize_thread is NULL. We do this instead * of using the vdev_initialize_state since there might be * a previous initialization process which has completed but * the thread is not exited. */ if (cmd_type == POOL_INITIALIZE_START && (vd->vdev_initialize_thread != NULL || vd->vdev_top->vdev_removing)) { mutex_exit(&vd->vdev_initialize_lock); return (SET_ERROR(EBUSY)); } else if (cmd_type == POOL_INITIALIZE_CANCEL && (vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE && vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED)) { mutex_exit(&vd->vdev_initialize_lock); return (SET_ERROR(ESRCH)); } else if (cmd_type == POOL_INITIALIZE_SUSPEND && vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE) { mutex_exit(&vd->vdev_initialize_lock); return (SET_ERROR(ESRCH)); } switch (cmd_type) { case POOL_INITIALIZE_START: vdev_initialize(vd); break; case POOL_INITIALIZE_CANCEL: vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED, vd_list); break; case POOL_INITIALIZE_SUSPEND: vdev_initialize_stop(vd, VDEV_INITIALIZE_SUSPENDED, vd_list); break; default: panic("invalid cmd_type %llu", (unsigned long long)cmd_type); } mutex_exit(&vd->vdev_initialize_lock); return (0); } int spa_vdev_initialize(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, nvlist_t *vdev_errlist) { int total_errors = 0; list_t vd_list; list_create(&vd_list, sizeof (vdev_t), offsetof(vdev_t, vdev_initialize_node)); /* * We hold the namespace lock through the whole function * to prevent any changes to the pool while we're starting or * stopping initialization. The config and state locks are held so that * we can properly assess the vdev state before we commit to * the initializing operation. */ mutex_enter(&spa_namespace_lock); for (nvpair_t *pair = nvlist_next_nvpair(nv, NULL); pair != NULL; pair = nvlist_next_nvpair(nv, pair)) { uint64_t vdev_guid = fnvpair_value_uint64(pair); int error = spa_vdev_initialize_impl(spa, vdev_guid, cmd_type, &vd_list); if (error != 0) { char guid_as_str[MAXNAMELEN]; (void) snprintf(guid_as_str, sizeof (guid_as_str), "%llu", (unsigned long long)vdev_guid); fnvlist_add_int64(vdev_errlist, guid_as_str, error); total_errors++; } } /* Wait for all initialize threads to stop. */ vdev_initialize_stop_wait(spa, &vd_list); /* Sync out the initializing state */ txg_wait_synced(spa->spa_dsl_pool, 0); mutex_exit(&spa_namespace_lock); list_destroy(&vd_list); return (total_errors); } static int spa_vdev_trim_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type, uint64_t rate, boolean_t partial, boolean_t secure, list_t *vd_list) { ASSERT(MUTEX_HELD(&spa_namespace_lock)); spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); /* Look up vdev and ensure it's a leaf. */ vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE); if (vd == NULL || vd->vdev_detached) { spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); return (SET_ERROR(ENODEV)); } else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) { spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); return (SET_ERROR(EINVAL)); } else if (!vdev_writeable(vd)) { spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); return (SET_ERROR(EROFS)); } else if (!vd->vdev_has_trim) { spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); return (SET_ERROR(EOPNOTSUPP)); } else if (secure && !vd->vdev_has_securetrim) { spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); return (SET_ERROR(EOPNOTSUPP)); } mutex_enter(&vd->vdev_trim_lock); spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); /* * When we activate a TRIM action we check to see if the * vdev_trim_thread is NULL. We do this instead of using the * vdev_trim_state since there might be a previous TRIM process * which has completed but the thread is not exited. */ if (cmd_type == POOL_TRIM_START && (vd->vdev_trim_thread != NULL || vd->vdev_top->vdev_removing)) { mutex_exit(&vd->vdev_trim_lock); return (SET_ERROR(EBUSY)); } else if (cmd_type == POOL_TRIM_CANCEL && (vd->vdev_trim_state != VDEV_TRIM_ACTIVE && vd->vdev_trim_state != VDEV_TRIM_SUSPENDED)) { mutex_exit(&vd->vdev_trim_lock); return (SET_ERROR(ESRCH)); } else if (cmd_type == POOL_TRIM_SUSPEND && vd->vdev_trim_state != VDEV_TRIM_ACTIVE) { mutex_exit(&vd->vdev_trim_lock); return (SET_ERROR(ESRCH)); } switch (cmd_type) { case POOL_TRIM_START: vdev_trim(vd, rate, partial, secure); break; case POOL_TRIM_CANCEL: vdev_trim_stop(vd, VDEV_TRIM_CANCELED, vd_list); break; case POOL_TRIM_SUSPEND: vdev_trim_stop(vd, VDEV_TRIM_SUSPENDED, vd_list); break; default: panic("invalid cmd_type %llu", (unsigned long long)cmd_type); } mutex_exit(&vd->vdev_trim_lock); return (0); } /* * Initiates a manual TRIM for the requested vdevs. This kicks off individual * TRIM threads for each child vdev. These threads pass over all of the free * space in the vdev's metaslabs and issues TRIM commands for that space. */ int spa_vdev_trim(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, uint64_t rate, boolean_t partial, boolean_t secure, nvlist_t *vdev_errlist) { int total_errors = 0; list_t vd_list; list_create(&vd_list, sizeof (vdev_t), offsetof(vdev_t, vdev_trim_node)); /* * We hold the namespace lock through the whole function * to prevent any changes to the pool while we're starting or * stopping TRIM. The config and state locks are held so that * we can properly assess the vdev state before we commit to * the TRIM operation. */ mutex_enter(&spa_namespace_lock); for (nvpair_t *pair = nvlist_next_nvpair(nv, NULL); pair != NULL; pair = nvlist_next_nvpair(nv, pair)) { uint64_t vdev_guid = fnvpair_value_uint64(pair); int error = spa_vdev_trim_impl(spa, vdev_guid, cmd_type, rate, partial, secure, &vd_list); if (error != 0) { char guid_as_str[MAXNAMELEN]; (void) snprintf(guid_as_str, sizeof (guid_as_str), "%llu", (unsigned long long)vdev_guid); fnvlist_add_int64(vdev_errlist, guid_as_str, error); total_errors++; } } /* Wait for all TRIM threads to stop. */ vdev_trim_stop_wait(spa, &vd_list); /* Sync out the TRIM state */ txg_wait_synced(spa->spa_dsl_pool, 0); mutex_exit(&spa_namespace_lock); list_destroy(&vd_list); return (total_errors); } /* * Split a set of devices from their mirrors, and create a new pool from them. */ int spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, nvlist_t *props, boolean_t exp) { int error = 0; uint64_t txg, *glist; spa_t *newspa; uint_t c, children, lastlog; nvlist_t **child, *nvl, *tmp; dmu_tx_t *tx; char *altroot = NULL; vdev_t *rvd, **vml = NULL; /* vdev modify list */ boolean_t activate_slog; ASSERT(spa_writeable(spa)); txg = spa_vdev_enter(spa); ASSERT(MUTEX_HELD(&spa_namespace_lock)); if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { error = (spa_has_checkpoint(spa)) ? ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; return (spa_vdev_exit(spa, NULL, txg, error)); } /* clear the log and flush everything up to now */ activate_slog = spa_passivate_log(spa); (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); error = spa_reset_logs(spa); txg = spa_vdev_config_enter(spa); if (activate_slog) spa_activate_log(spa); if (error != 0) return (spa_vdev_exit(spa, NULL, txg, error)); /* check new spa name before going any further */ if (spa_lookup(newname) != NULL) return (spa_vdev_exit(spa, NULL, txg, EEXIST)); /* * scan through all the children to ensure they're all mirrors */ if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) return (spa_vdev_exit(spa, NULL, txg, EINVAL)); /* first, check to ensure we've got the right child count */ rvd = spa->spa_root_vdev; lastlog = 0; for (c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; /* don't count the holes & logs as children */ if (vd->vdev_islog || (vd->vdev_ops != &vdev_indirect_ops && !vdev_is_concrete(vd))) { if (lastlog == 0) lastlog = c; continue; } lastlog = 0; } if (children != (lastlog != 0 ? lastlog : rvd->vdev_children)) return (spa_vdev_exit(spa, NULL, txg, EINVAL)); /* next, ensure no spare or cache devices are part of the split */ if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 || nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0) return (spa_vdev_exit(spa, NULL, txg, EINVAL)); vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP); glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP); /* then, loop over each vdev and validate it */ for (c = 0; c < children; c++) { uint64_t is_hole = 0; (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, &is_hole); if (is_hole != 0) { if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole || spa->spa_root_vdev->vdev_child[c]->vdev_islog) { continue; } else { error = SET_ERROR(EINVAL); break; } } /* deal with indirect vdevs */ if (spa->spa_root_vdev->vdev_child[c]->vdev_ops == &vdev_indirect_ops) continue; /* which disk is going to be split? */ if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID, &glist[c]) != 0) { error = SET_ERROR(EINVAL); break; } /* look it up in the spa */ vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE); if (vml[c] == NULL) { error = SET_ERROR(ENODEV); break; } /* make sure there's nothing stopping the split */ if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops || vml[c]->vdev_islog || !vdev_is_concrete(vml[c]) || vml[c]->vdev_isspare || vml[c]->vdev_isl2cache || !vdev_writeable(vml[c]) || vml[c]->vdev_children != 0 || vml[c]->vdev_state != VDEV_STATE_HEALTHY || c != spa->spa_root_vdev->vdev_child[c]->vdev_id) { error = SET_ERROR(EINVAL); break; } if (vdev_dtl_required(vml[c]) || vdev_resilver_needed(vml[c], NULL, NULL)) { error = SET_ERROR(EBUSY); break; } /* we need certain info from the top level */ VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY, vml[c]->vdev_top->vdev_ms_array) == 0); VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT, vml[c]->vdev_top->vdev_ms_shift) == 0); VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE, vml[c]->vdev_top->vdev_asize) == 0); VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT, vml[c]->vdev_top->vdev_ashift) == 0); /* transfer per-vdev ZAPs */ ASSERT3U(vml[c]->vdev_leaf_zap, !=, 0); VERIFY0(nvlist_add_uint64(child[c], ZPOOL_CONFIG_VDEV_LEAF_ZAP, vml[c]->vdev_leaf_zap)); ASSERT3U(vml[c]->vdev_top->vdev_top_zap, !=, 0); VERIFY0(nvlist_add_uint64(child[c], ZPOOL_CONFIG_VDEV_TOP_ZAP, vml[c]->vdev_parent->vdev_top_zap)); } if (error != 0) { kmem_free(vml, children * sizeof (vdev_t *)); kmem_free(glist, children * sizeof (uint64_t)); return (spa_vdev_exit(spa, NULL, txg, error)); } /* stop writers from using the disks */ for (c = 0; c < children; c++) { if (vml[c] != NULL) vml[c]->vdev_offline = B_TRUE; } vdev_reopen(spa->spa_root_vdev); /* * Temporarily record the splitting vdevs in the spa config. This * will disappear once the config is regenerated. */ VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, glist, children) == 0); kmem_free(glist, children * sizeof (uint64_t)); mutex_enter(&spa->spa_props_lock); VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, nvl) == 0); mutex_exit(&spa->spa_props_lock); spa->spa_config_splitting = nvl; vdev_config_dirty(spa->spa_root_vdev); /* configure and create the new pool */ VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0); VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0); VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, spa_version(spa)) == 0); VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, spa->spa_config_txg) == 0); VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, spa_generate_guid(NULL)) == 0); VERIFY0(nvlist_add_boolean(config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)); (void) nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); /* add the new pool to the namespace */ newspa = spa_add(newname, config, altroot); newspa->spa_avz_action = AVZ_ACTION_REBUILD; newspa->spa_config_txg = spa->spa_config_txg; spa_set_log_state(newspa, SPA_LOG_CLEAR); /* release the spa config lock, retaining the namespace lock */ spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); if (zio_injection_enabled) zio_handle_panic_injection(spa, FTAG, 1); spa_activate(newspa, spa_mode_global); spa_async_suspend(newspa); /* * Temporarily stop the initializing and TRIM activity. We set the * state to ACTIVE so that we know to resume initializing or TRIM * once the split has completed. */ list_t vd_initialize_list; list_create(&vd_initialize_list, sizeof (vdev_t), offsetof(vdev_t, vdev_initialize_node)); list_t vd_trim_list; list_create(&vd_trim_list, sizeof (vdev_t), offsetof(vdev_t, vdev_trim_node)); for (c = 0; c < children; c++) { if (vml[c] != NULL && vml[c]->vdev_ops != &vdev_indirect_ops) { mutex_enter(&vml[c]->vdev_initialize_lock); vdev_initialize_stop(vml[c], VDEV_INITIALIZE_ACTIVE, &vd_initialize_list); mutex_exit(&vml[c]->vdev_initialize_lock); mutex_enter(&vml[c]->vdev_trim_lock); vdev_trim_stop(vml[c], VDEV_TRIM_ACTIVE, &vd_trim_list); mutex_exit(&vml[c]->vdev_trim_lock); } } vdev_initialize_stop_wait(spa, &vd_initialize_list); vdev_trim_stop_wait(spa, &vd_trim_list); list_destroy(&vd_initialize_list); list_destroy(&vd_trim_list); newspa->spa_config_source = SPA_CONFIG_SRC_SPLIT; newspa->spa_is_splitting = B_TRUE; /* create the new pool from the disks of the original pool */ error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE); if (error) goto out; /* if that worked, generate a real config for the new pool */ if (newspa->spa_root_vdev != NULL) { VERIFY(nvlist_alloc(&newspa->spa_config_splitting, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_uint64(newspa->spa_config_splitting, ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0); spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL, B_TRUE)); } /* set the props */ if (props != NULL) { spa_configfile_set(newspa, props, B_FALSE); error = spa_prop_set(newspa, props); if (error) goto out; } /* flush everything */ txg = spa_vdev_config_enter(newspa); vdev_config_dirty(newspa->spa_root_vdev); (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG); if (zio_injection_enabled) zio_handle_panic_injection(spa, FTAG, 2); spa_async_resume(newspa); /* finally, update the original pool's config */ txg = spa_vdev_config_enter(spa); tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); error = dmu_tx_assign(tx, TXG_WAIT); if (error != 0) dmu_tx_abort(tx); for (c = 0; c < children; c++) { if (vml[c] != NULL && vml[c]->vdev_ops != &vdev_indirect_ops) { vdev_t *tvd = vml[c]->vdev_top; /* * Need to be sure the detachable VDEV is not * on any *other* txg's DTL list to prevent it * from being accessed after it's freed. */ for (int t = 0; t < TXG_SIZE; t++) { (void) txg_list_remove_this( &tvd->vdev_dtl_list, vml[c], t); } vdev_split(vml[c]); if (error == 0) spa_history_log_internal(spa, "detach", tx, "vdev=%s", vml[c]->vdev_path); vdev_free(vml[c]); } } spa->spa_avz_action = AVZ_ACTION_REBUILD; vdev_config_dirty(spa->spa_root_vdev); spa->spa_config_splitting = NULL; nvlist_free(nvl); if (error == 0) dmu_tx_commit(tx); (void) spa_vdev_exit(spa, NULL, txg, 0); if (zio_injection_enabled) zio_handle_panic_injection(spa, FTAG, 3); /* split is complete; log a history record */ spa_history_log_internal(newspa, "split", NULL, "from pool %s", spa_name(spa)); newspa->spa_is_splitting = B_FALSE; kmem_free(vml, children * sizeof (vdev_t *)); /* if we're not going to mount the filesystems in userland, export */ if (exp) error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL, B_FALSE, B_FALSE); return (error); out: spa_unload(newspa); spa_deactivate(newspa); spa_remove(newspa); txg = spa_vdev_config_enter(spa); /* re-online all offlined disks */ for (c = 0; c < children; c++) { if (vml[c] != NULL) vml[c]->vdev_offline = B_FALSE; } /* restart initializing or trimming disks as necessary */ spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART); spa_async_request(spa, SPA_ASYNC_TRIM_RESTART); spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART); vdev_reopen(spa->spa_root_vdev); nvlist_free(spa->spa_config_splitting); spa->spa_config_splitting = NULL; (void) spa_vdev_exit(spa, NULL, txg, error); kmem_free(vml, children * sizeof (vdev_t *)); return (error); } /* * Find any device that's done replacing, or a vdev marked 'unspare' that's * currently spared, so we can detach it. */ static vdev_t * spa_vdev_resilver_done_hunt(vdev_t *vd) { vdev_t *newvd, *oldvd; for (int c = 0; c < vd->vdev_children; c++) { oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); if (oldvd != NULL) return (oldvd); } /* * Check for a completed replacement. We always consider the first * vdev in the list to be the oldest vdev, and the last one to be * the newest (see spa_vdev_attach() for how that works). In * the case where the newest vdev is faulted, we will not automatically * remove it after a resilver completes. This is OK as it will require * user intervention to determine which disk the admin wishes to keep. */ if (vd->vdev_ops == &vdev_replacing_ops) { ASSERT(vd->vdev_children > 1); newvd = vd->vdev_child[vd->vdev_children - 1]; oldvd = vd->vdev_child[0]; if (vdev_dtl_empty(newvd, DTL_MISSING) && vdev_dtl_empty(newvd, DTL_OUTAGE) && !vdev_dtl_required(oldvd)) return (oldvd); } /* * Check for a completed resilver with the 'unspare' flag set. * Also potentially update faulted state. */ if (vd->vdev_ops == &vdev_spare_ops) { vdev_t *first = vd->vdev_child[0]; vdev_t *last = vd->vdev_child[vd->vdev_children - 1]; if (last->vdev_unspare) { oldvd = first; newvd = last; } else if (first->vdev_unspare) { oldvd = last; newvd = first; } else { oldvd = NULL; } if (oldvd != NULL && vdev_dtl_empty(newvd, DTL_MISSING) && vdev_dtl_empty(newvd, DTL_OUTAGE) && !vdev_dtl_required(oldvd)) return (oldvd); vdev_propagate_state(vd); /* * If there are more than two spares attached to a disk, * and those spares are not required, then we want to * attempt to free them up now so that they can be used * by other pools. Once we're back down to a single * disk+spare, we stop removing them. */ if (vd->vdev_children > 2) { newvd = vd->vdev_child[1]; if (newvd->vdev_isspare && last->vdev_isspare && vdev_dtl_empty(last, DTL_MISSING) && vdev_dtl_empty(last, DTL_OUTAGE) && !vdev_dtl_required(newvd)) return (newvd); } } return (NULL); } static void spa_vdev_resilver_done(spa_t *spa) { vdev_t *vd, *pvd, *ppvd; uint64_t guid, sguid, pguid, ppguid; spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { pvd = vd->vdev_parent; ppvd = pvd->vdev_parent; guid = vd->vdev_guid; pguid = pvd->vdev_guid; ppguid = ppvd->vdev_guid; sguid = 0; /* * If we have just finished replacing a hot spared device, then * we need to detach the parent's first child (the original hot * spare) as well. */ if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 && ppvd->vdev_children == 2) { ASSERT(pvd->vdev_ops == &vdev_replacing_ops); sguid = ppvd->vdev_child[1]->vdev_guid; } ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd)); spa_config_exit(spa, SCL_ALL, FTAG); if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0) return; if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0) return; spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); } spa_config_exit(spa, SCL_ALL, FTAG); /* * If a detach was not performed above replace waiters will not have * been notified. In which case we must do so now. */ spa_notify_waiters(spa); } /* * Update the stored path or FRU for this vdev. */ static int spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, boolean_t ispath) { vdev_t *vd; boolean_t sync = B_FALSE; ASSERT(spa_writeable(spa)); spa_vdev_state_enter(spa, SCL_ALL); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, ENOENT)); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); if (ispath) { if (strcmp(value, vd->vdev_path) != 0) { spa_strfree(vd->vdev_path); vd->vdev_path = spa_strdup(value); sync = B_TRUE; } } else { if (vd->vdev_fru == NULL) { vd->vdev_fru = spa_strdup(value); sync = B_TRUE; } else if (strcmp(value, vd->vdev_fru) != 0) { spa_strfree(vd->vdev_fru); vd->vdev_fru = spa_strdup(value); sync = B_TRUE; } } return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0)); } int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) { return (spa_vdev_set_common(spa, guid, newpath, B_TRUE)); } int spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) { return (spa_vdev_set_common(spa, guid, newfru, B_FALSE)); } /* * ========================================================================== * SPA Scanning * ========================================================================== */ int spa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t cmd) { ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); if (dsl_scan_resilvering(spa->spa_dsl_pool)) return (SET_ERROR(EBUSY)); return (dsl_scrub_set_pause_resume(spa->spa_dsl_pool, cmd)); } int spa_scan_stop(spa_t *spa) { ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); if (dsl_scan_resilvering(spa->spa_dsl_pool)) return (SET_ERROR(EBUSY)); return (dsl_scan_cancel(spa->spa_dsl_pool)); } int spa_scan(spa_t *spa, pool_scan_func_t func) { ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE) return (SET_ERROR(ENOTSUP)); if (func == POOL_SCAN_RESILVER && !spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) return (SET_ERROR(ENOTSUP)); /* * If a resilver was requested, but there is no DTL on a * writeable leaf device, we have nothing to do. */ if (func == POOL_SCAN_RESILVER && !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); return (0); } return (dsl_scan(spa->spa_dsl_pool, func)); } /* * ========================================================================== * SPA async task processing * ========================================================================== */ static void spa_async_remove(spa_t *spa, vdev_t *vd) { if (vd->vdev_remove_wanted) { vd->vdev_remove_wanted = B_FALSE; vd->vdev_delayed_close = B_FALSE; vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); /* * We want to clear the stats, but we don't want to do a full * vdev_clear() as that will cause us to throw away * degraded/faulted state as well as attempt to reopen the * device, all of which is a waste. */ vd->vdev_stat.vs_read_errors = 0; vd->vdev_stat.vs_write_errors = 0; vd->vdev_stat.vs_checksum_errors = 0; vdev_state_dirty(vd->vdev_top); } for (int c = 0; c < vd->vdev_children; c++) spa_async_remove(spa, vd->vdev_child[c]); } static void spa_async_probe(spa_t *spa, vdev_t *vd) { if (vd->vdev_probe_wanted) { vd->vdev_probe_wanted = B_FALSE; vdev_reopen(vd); /* vdev_open() does the actual probe */ } for (int c = 0; c < vd->vdev_children; c++) spa_async_probe(spa, vd->vdev_child[c]); } static void spa_async_autoexpand(spa_t *spa, vdev_t *vd) { if (!spa->spa_autoexpand) return; for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; spa_async_autoexpand(spa, cvd); } if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL) return; spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_AUTOEXPAND); } static void spa_async_thread(void *arg) { spa_t *spa = (spa_t *)arg; dsl_pool_t *dp = spa->spa_dsl_pool; int tasks; ASSERT(spa->spa_sync_on); mutex_enter(&spa->spa_async_lock); tasks = spa->spa_async_tasks; spa->spa_async_tasks = 0; mutex_exit(&spa->spa_async_lock); /* * See if the config needs to be updated. */ if (tasks & SPA_ASYNC_CONFIG_UPDATE) { uint64_t old_space, new_space; mutex_enter(&spa_namespace_lock); old_space = metaslab_class_get_space(spa_normal_class(spa)); old_space += metaslab_class_get_space(spa_special_class(spa)); old_space += metaslab_class_get_space(spa_dedup_class(spa)); spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); new_space = metaslab_class_get_space(spa_normal_class(spa)); new_space += metaslab_class_get_space(spa_special_class(spa)); new_space += metaslab_class_get_space(spa_dedup_class(spa)); mutex_exit(&spa_namespace_lock); /* * If the pool grew as a result of the config update, * then log an internal history event. */ if (new_space != old_space) { spa_history_log_internal(spa, "vdev online", NULL, "pool '%s' size: %llu(+%llu)", spa_name(spa), (u_longlong_t)new_space, (u_longlong_t)(new_space - old_space)); } } /* * See if any devices need to be marked REMOVED. */ if (tasks & SPA_ASYNC_REMOVE) { spa_vdev_state_enter(spa, SCL_NONE); spa_async_remove(spa, spa->spa_root_vdev); for (int i = 0; i < spa->spa_l2cache.sav_count; i++) spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); for (int i = 0; i < spa->spa_spares.sav_count; i++) spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); (void) spa_vdev_state_exit(spa, NULL, 0); } if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) { spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); spa_async_autoexpand(spa, spa->spa_root_vdev); spa_config_exit(spa, SCL_CONFIG, FTAG); } /* * See if any devices need to be probed. */ if (tasks & SPA_ASYNC_PROBE) { spa_vdev_state_enter(spa, SCL_NONE); spa_async_probe(spa, spa->spa_root_vdev); (void) spa_vdev_state_exit(spa, NULL, 0); } /* * If any devices are done replacing, detach them. */ if (tasks & SPA_ASYNC_RESILVER_DONE) spa_vdev_resilver_done(spa); /* * If any devices are done replacing, detach them. Then if no * top-level vdevs are rebuilding attempt to kick off a scrub. */ if (tasks & SPA_ASYNC_REBUILD_DONE) { spa_vdev_resilver_done(spa); if (!vdev_rebuild_active(spa->spa_root_vdev)) (void) dsl_scan(spa->spa_dsl_pool, POOL_SCAN_SCRUB); } /* * Kick off a resilver. */ if (tasks & SPA_ASYNC_RESILVER && !vdev_rebuild_active(spa->spa_root_vdev) && (!dsl_scan_resilvering(dp) || !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER))) dsl_scan_restart_resilver(dp, 0); if (tasks & SPA_ASYNC_INITIALIZE_RESTART) { mutex_enter(&spa_namespace_lock); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); vdev_initialize_restart(spa->spa_root_vdev); spa_config_exit(spa, SCL_CONFIG, FTAG); mutex_exit(&spa_namespace_lock); } if (tasks & SPA_ASYNC_TRIM_RESTART) { mutex_enter(&spa_namespace_lock); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); vdev_trim_restart(spa->spa_root_vdev); spa_config_exit(spa, SCL_CONFIG, FTAG); mutex_exit(&spa_namespace_lock); } if (tasks & SPA_ASYNC_AUTOTRIM_RESTART) { mutex_enter(&spa_namespace_lock); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); vdev_autotrim_restart(spa); spa_config_exit(spa, SCL_CONFIG, FTAG); mutex_exit(&spa_namespace_lock); } /* * Kick off L2 cache whole device TRIM. */ if (tasks & SPA_ASYNC_L2CACHE_TRIM) { mutex_enter(&spa_namespace_lock); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); vdev_trim_l2arc(spa); spa_config_exit(spa, SCL_CONFIG, FTAG); mutex_exit(&spa_namespace_lock); } /* * Kick off L2 cache rebuilding. */ if (tasks & SPA_ASYNC_L2CACHE_REBUILD) { mutex_enter(&spa_namespace_lock); spa_config_enter(spa, SCL_L2ARC, FTAG, RW_READER); l2arc_spa_rebuild_start(spa); spa_config_exit(spa, SCL_L2ARC, FTAG); mutex_exit(&spa_namespace_lock); } /* * Let the world know that we're done. */ mutex_enter(&spa->spa_async_lock); spa->spa_async_thread = NULL; cv_broadcast(&spa->spa_async_cv); mutex_exit(&spa->spa_async_lock); thread_exit(); } void spa_async_suspend(spa_t *spa) { mutex_enter(&spa->spa_async_lock); spa->spa_async_suspended++; while (spa->spa_async_thread != NULL) cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); mutex_exit(&spa->spa_async_lock); spa_vdev_remove_suspend(spa); zthr_t *condense_thread = spa->spa_condense_zthr; if (condense_thread != NULL) zthr_cancel(condense_thread); zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; if (discard_thread != NULL) zthr_cancel(discard_thread); zthr_t *ll_delete_thread = spa->spa_livelist_delete_zthr; if (ll_delete_thread != NULL) zthr_cancel(ll_delete_thread); zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr; if (ll_condense_thread != NULL) zthr_cancel(ll_condense_thread); } void spa_async_resume(spa_t *spa) { mutex_enter(&spa->spa_async_lock); ASSERT(spa->spa_async_suspended != 0); spa->spa_async_suspended--; mutex_exit(&spa->spa_async_lock); spa_restart_removal(spa); zthr_t *condense_thread = spa->spa_condense_zthr; if (condense_thread != NULL) zthr_resume(condense_thread); zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; if (discard_thread != NULL) zthr_resume(discard_thread); zthr_t *ll_delete_thread = spa->spa_livelist_delete_zthr; if (ll_delete_thread != NULL) zthr_resume(ll_delete_thread); zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr; if (ll_condense_thread != NULL) zthr_resume(ll_condense_thread); } static boolean_t spa_async_tasks_pending(spa_t *spa) { uint_t non_config_tasks; uint_t config_task; boolean_t config_task_suspended; non_config_tasks = spa->spa_async_tasks & ~SPA_ASYNC_CONFIG_UPDATE; config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE; if (spa->spa_ccw_fail_time == 0) { config_task_suspended = B_FALSE; } else { config_task_suspended = (gethrtime() - spa->spa_ccw_fail_time) < ((hrtime_t)zfs_ccw_retry_interval * NANOSEC); } return (non_config_tasks || (config_task && !config_task_suspended)); } static void spa_async_dispatch(spa_t *spa) { mutex_enter(&spa->spa_async_lock); if (spa_async_tasks_pending(spa) && !spa->spa_async_suspended && spa->spa_async_thread == NULL) spa->spa_async_thread = thread_create(NULL, 0, spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); mutex_exit(&spa->spa_async_lock); } void spa_async_request(spa_t *spa, int task) { zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task); mutex_enter(&spa->spa_async_lock); spa->spa_async_tasks |= task; mutex_exit(&spa->spa_async_lock); } int spa_async_tasks(spa_t *spa) { return (spa->spa_async_tasks); } /* * ========================================================================== * SPA syncing routines * ========================================================================== */ static int bpobj_enqueue_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) { bpobj_t *bpo = arg; bpobj_enqueue(bpo, bp, bp_freed, tx); return (0); } int bpobj_enqueue_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { return (bpobj_enqueue_cb(arg, bp, B_FALSE, tx)); } int bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { return (bpobj_enqueue_cb(arg, bp, B_TRUE, tx)); } static int spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { zio_t *pio = arg; zio_nowait(zio_free_sync(pio, pio->io_spa, dmu_tx_get_txg(tx), bp, pio->io_flags)); return (0); } static int bpobj_spa_free_sync_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) { ASSERT(!bp_freed); return (spa_free_sync_cb(arg, bp, tx)); } /* * Note: this simple function is not inlined to make it easier to dtrace the * amount of time spent syncing frees. */ static void spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx) { zio_t *zio = zio_root(spa, NULL, NULL, 0); bplist_iterate(bpl, spa_free_sync_cb, zio, tx); VERIFY(zio_wait(zio) == 0); } /* * Note: this simple function is not inlined to make it easier to dtrace the * amount of time spent syncing deferred frees. */ static void spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx) { if (spa_sync_pass(spa) != 1) return; /* * Note: * If the log space map feature is active, we stop deferring * frees to the next TXG and therefore running this function * would be considered a no-op as spa_deferred_bpobj should * not have any entries. * * That said we run this function anyway (instead of returning * immediately) for the edge-case scenario where we just * activated the log space map feature in this TXG but we have * deferred frees from the previous TXG. */ zio_t *zio = zio_root(spa, NULL, NULL, 0); VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj, bpobj_spa_free_sync_cb, zio, tx), ==, 0); VERIFY0(zio_wait(zio)); } static void spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) { char *packed = NULL; size_t bufsize; size_t nvsize = 0; dmu_buf_t *db; VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); /* * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration * information. This avoids the dmu_buf_will_dirty() path and * saves us a pre-read to get data we don't actually care about. */ bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE); packed = vmem_alloc(bufsize, KM_SLEEP); VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, KM_SLEEP) == 0); bzero(packed + nvsize, bufsize - nvsize); dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); vmem_free(packed, bufsize); VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); dmu_buf_will_dirty(db, tx); *(uint64_t *)db->db_data = nvsize; dmu_buf_rele(db, FTAG); } static void spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, const char *config, const char *entry) { nvlist_t *nvroot; nvlist_t **list; int i; if (!sav->sav_sync) return; /* * Update the MOS nvlist describing the list of available devices. * spa_validate_aux() will have already made sure this nvlist is * valid and the vdevs are labeled appropriately. */ if (sav->sav_object == 0) { sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); VERIFY(zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, &sav->sav_object, tx) == 0); } VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); if (sav->sav_count == 0) { VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0); } else { list = kmem_alloc(sav->sav_count*sizeof (void *), KM_SLEEP); for (i = 0; i < sav->sav_count; i++) list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], B_FALSE, VDEV_CONFIG_L2CACHE); VERIFY(nvlist_add_nvlist_array(nvroot, config, list, sav->sav_count) == 0); for (i = 0; i < sav->sav_count; i++) nvlist_free(list[i]); kmem_free(list, sav->sav_count * sizeof (void *)); } spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); nvlist_free(nvroot); sav->sav_sync = B_FALSE; } /* * Rebuild spa's all-vdev ZAP from the vdev ZAPs indicated in each vdev_t. * The all-vdev ZAP must be empty. */ static void spa_avz_build(vdev_t *vd, uint64_t avz, dmu_tx_t *tx) { spa_t *spa = vd->vdev_spa; if (vd->vdev_top_zap != 0) { VERIFY0(zap_add_int(spa->spa_meta_objset, avz, vd->vdev_top_zap, tx)); } if (vd->vdev_leaf_zap != 0) { VERIFY0(zap_add_int(spa->spa_meta_objset, avz, vd->vdev_leaf_zap, tx)); } for (uint64_t i = 0; i < vd->vdev_children; i++) { spa_avz_build(vd->vdev_child[i], avz, tx); } } static void spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) { nvlist_t *config; /* * If the pool is being imported from a pre-per-vdev-ZAP version of ZFS, * its config may not be dirty but we still need to build per-vdev ZAPs. * Similarly, if the pool is being assembled (e.g. after a split), we * need to rebuild the AVZ although the config may not be dirty. */ if (list_is_empty(&spa->spa_config_dirty_list) && spa->spa_avz_action == AVZ_ACTION_NONE) return; spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); ASSERT(spa->spa_avz_action == AVZ_ACTION_NONE || spa->spa_avz_action == AVZ_ACTION_INITIALIZE || spa->spa_all_vdev_zaps != 0); if (spa->spa_avz_action == AVZ_ACTION_REBUILD) { /* Make and build the new AVZ */ uint64_t new_avz = zap_create(spa->spa_meta_objset, DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx); spa_avz_build(spa->spa_root_vdev, new_avz, tx); /* Diff old AVZ with new one */ zap_cursor_t zc; zap_attribute_t za; for (zap_cursor_init(&zc, spa->spa_meta_objset, spa->spa_all_vdev_zaps); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { uint64_t vdzap = za.za_first_integer; if (zap_lookup_int(spa->spa_meta_objset, new_avz, vdzap) == ENOENT) { /* * ZAP is listed in old AVZ but not in new one; * destroy it */ VERIFY0(zap_destroy(spa->spa_meta_objset, vdzap, tx)); } } zap_cursor_fini(&zc); /* Destroy the old AVZ */ VERIFY0(zap_destroy(spa->spa_meta_objset, spa->spa_all_vdev_zaps, tx)); /* Replace the old AVZ in the dir obj with the new one */ VERIFY0(zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, sizeof (new_avz), 1, &new_avz, tx)); spa->spa_all_vdev_zaps = new_avz; } else if (spa->spa_avz_action == AVZ_ACTION_DESTROY) { zap_cursor_t zc; zap_attribute_t za; /* Walk through the AVZ and destroy all listed ZAPs */ for (zap_cursor_init(&zc, spa->spa_meta_objset, spa->spa_all_vdev_zaps); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { uint64_t zap = za.za_first_integer; VERIFY0(zap_destroy(spa->spa_meta_objset, zap, tx)); } zap_cursor_fini(&zc); /* Destroy and unlink the AVZ itself */ VERIFY0(zap_destroy(spa->spa_meta_objset, spa->spa_all_vdev_zaps, tx)); VERIFY0(zap_remove(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, tx)); spa->spa_all_vdev_zaps = 0; } if (spa->spa_all_vdev_zaps == 0) { spa->spa_all_vdev_zaps = zap_create_link(spa->spa_meta_objset, DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, tx); } spa->spa_avz_action = AVZ_ACTION_NONE; /* Create ZAPs for vdevs that don't have them. */ vdev_construct_zaps(spa->spa_root_vdev, tx); config = spa_config_generate(spa, spa->spa_root_vdev, dmu_tx_get_txg(tx), B_FALSE); /* * If we're upgrading the spa version then make sure that * the config object gets updated with the correct version. */ if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version) fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, spa->spa_uberblock.ub_version); spa_config_exit(spa, SCL_STATE, FTAG); nvlist_free(spa->spa_config_syncing); spa->spa_config_syncing = config; spa_sync_nvlist(spa, spa->spa_config_object, config, tx); } static void spa_sync_version(void *arg, dmu_tx_t *tx) { uint64_t *versionp = arg; uint64_t version = *versionp; spa_t *spa = dmu_tx_pool(tx)->dp_spa; /* * Setting the version is special cased when first creating the pool. */ ASSERT(tx->tx_txg != TXG_INITIAL); ASSERT(SPA_VERSION_IS_SUPPORTED(version)); ASSERT(version >= spa_version(spa)); spa->spa_uberblock.ub_version = version; vdev_config_dirty(spa->spa_root_vdev); spa_history_log_internal(spa, "set", tx, "version=%lld", (longlong_t)version); } /* * Set zpool properties. */ static void spa_sync_props(void *arg, dmu_tx_t *tx) { nvlist_t *nvp = arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; objset_t *mos = spa->spa_meta_objset; nvpair_t *elem = NULL; mutex_enter(&spa->spa_props_lock); while ((elem = nvlist_next_nvpair(nvp, elem))) { uint64_t intval; char *strval, *fname; zpool_prop_t prop; const char *propname; zprop_type_t proptype; spa_feature_t fid; switch (prop = zpool_name_to_prop(nvpair_name(elem))) { case ZPOOL_PROP_INVAL: /* * We checked this earlier in spa_prop_validate(). */ ASSERT(zpool_prop_feature(nvpair_name(elem))); fname = strchr(nvpair_name(elem), '@') + 1; VERIFY0(zfeature_lookup_name(fname, &fid)); spa_feature_enable(spa, fid, tx); spa_history_log_internal(spa, "set", tx, "%s=enabled", nvpair_name(elem)); break; case ZPOOL_PROP_VERSION: intval = fnvpair_value_uint64(elem); /* * The version is synced separately before other * properties and should be correct by now. */ ASSERT3U(spa_version(spa), >=, intval); break; case ZPOOL_PROP_ALTROOT: /* * 'altroot' is a non-persistent property. It should * have been set temporarily at creation or import time. */ ASSERT(spa->spa_root != NULL); break; case ZPOOL_PROP_READONLY: case ZPOOL_PROP_CACHEFILE: /* * 'readonly' and 'cachefile' are also non-persistent * properties. */ break; case ZPOOL_PROP_COMMENT: strval = fnvpair_value_string(elem); if (spa->spa_comment != NULL) spa_strfree(spa->spa_comment); spa->spa_comment = spa_strdup(strval); /* * We need to dirty the configuration on all the vdevs * so that their labels get updated. It's unnecessary * to do this for pool creation since the vdev's * configuration has already been dirtied. */ if (tx->tx_txg != TXG_INITIAL) vdev_config_dirty(spa->spa_root_vdev); spa_history_log_internal(spa, "set", tx, "%s=%s", nvpair_name(elem), strval); break; default: /* * Set pool property values in the poolprops mos object. */ if (spa->spa_pool_props_object == 0) { spa->spa_pool_props_object = zap_create_link(mos, DMU_OT_POOL_PROPS, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS, tx); } /* normalize the property name */ propname = zpool_prop_to_name(prop); proptype = zpool_prop_get_type(prop); if (nvpair_type(elem) == DATA_TYPE_STRING) { ASSERT(proptype == PROP_TYPE_STRING); strval = fnvpair_value_string(elem); VERIFY0(zap_update(mos, spa->spa_pool_props_object, propname, 1, strlen(strval) + 1, strval, tx)); spa_history_log_internal(spa, "set", tx, "%s=%s", nvpair_name(elem), strval); } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { intval = fnvpair_value_uint64(elem); if (proptype == PROP_TYPE_INDEX) { const char *unused; VERIFY0(zpool_prop_index_to_string( prop, intval, &unused)); } VERIFY0(zap_update(mos, spa->spa_pool_props_object, propname, 8, 1, &intval, tx)); spa_history_log_internal(spa, "set", tx, "%s=%lld", nvpair_name(elem), (longlong_t)intval); } else { ASSERT(0); /* not allowed */ } switch (prop) { case ZPOOL_PROP_DELEGATION: spa->spa_delegation = intval; break; case ZPOOL_PROP_BOOTFS: spa->spa_bootfs = intval; break; case ZPOOL_PROP_FAILUREMODE: spa->spa_failmode = intval; break; case ZPOOL_PROP_AUTOTRIM: spa->spa_autotrim = intval; spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART); break; case ZPOOL_PROP_AUTOEXPAND: spa->spa_autoexpand = intval; if (tx->tx_txg != TXG_INITIAL) spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); break; case ZPOOL_PROP_MULTIHOST: spa->spa_multihost = intval; break; default: break; } } } mutex_exit(&spa->spa_props_lock); } /* * Perform one-time upgrade on-disk changes. spa_version() does not * reflect the new version this txg, so there must be no changes this * txg to anything that the upgrade code depends on after it executes. * Therefore this must be called after dsl_pool_sync() does the sync * tasks. */ static void spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx) { if (spa_sync_pass(spa) != 1) return; dsl_pool_t *dp = spa->spa_dsl_pool; rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN && spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) { dsl_pool_create_origin(dp, tx); /* Keeping the origin open increases spa_minref */ spa->spa_minref += 3; } if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES && spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) { dsl_pool_upgrade_clones(dp, tx); } if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES && spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) { dsl_pool_upgrade_dir_clones(dp, tx); /* Keeping the freedir open increases spa_minref */ spa->spa_minref += 3; } if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES && spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { spa_feature_create_zap_objects(spa, tx); } /* * LZ4_COMPRESS feature's behaviour was changed to activate_on_enable * when possibility to use lz4 compression for metadata was added * Old pools that have this feature enabled must be upgraded to have * this feature active */ if (spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { boolean_t lz4_en = spa_feature_is_enabled(spa, SPA_FEATURE_LZ4_COMPRESS); boolean_t lz4_ac = spa_feature_is_active(spa, SPA_FEATURE_LZ4_COMPRESS); if (lz4_en && !lz4_ac) spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx); } /* * If we haven't written the salt, do so now. Note that the * feature may not be activated yet, but that's fine since * the presence of this ZAP entry is backwards compatible. */ if (zap_contains(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT) == ENOENT) { VERIFY0(zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT, 1, sizeof (spa->spa_cksum_salt.zcs_bytes), spa->spa_cksum_salt.zcs_bytes, tx)); } rrw_exit(&dp->dp_config_rwlock, FTAG); } static void vdev_indirect_state_sync_verify(vdev_t *vd) { vdev_indirect_mapping_t *vim __maybe_unused = vd->vdev_indirect_mapping; vdev_indirect_births_t *vib __maybe_unused = vd->vdev_indirect_births; if (vd->vdev_ops == &vdev_indirect_ops) { ASSERT(vim != NULL); ASSERT(vib != NULL); } uint64_t obsolete_sm_object = 0; ASSERT0(vdev_obsolete_sm_object(vd, &obsolete_sm_object)); if (obsolete_sm_object != 0) { ASSERT(vd->vdev_obsolete_sm != NULL); ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops); ASSERT(vdev_indirect_mapping_num_entries(vim) > 0); ASSERT(vdev_indirect_mapping_bytes_mapped(vim) > 0); ASSERT3U(obsolete_sm_object, ==, space_map_object(vd->vdev_obsolete_sm)); ASSERT3U(vdev_indirect_mapping_bytes_mapped(vim), >=, space_map_allocated(vd->vdev_obsolete_sm)); } ASSERT(vd->vdev_obsolete_segments != NULL); /* * Since frees / remaps to an indirect vdev can only * happen in syncing context, the obsolete segments * tree must be empty when we start syncing. */ ASSERT0(range_tree_space(vd->vdev_obsolete_segments)); } /* * Set the top-level vdev's max queue depth. Evaluate each top-level's * async write queue depth in case it changed. The max queue depth will * not change in the middle of syncing out this txg. */ static void spa_sync_adjust_vdev_max_queue_depth(spa_t *spa) { ASSERT(spa_writeable(spa)); vdev_t *rvd = spa->spa_root_vdev; uint32_t max_queue_depth = zfs_vdev_async_write_max_active * zfs_vdev_queue_depth_pct / 100; metaslab_class_t *normal = spa_normal_class(spa); metaslab_class_t *special = spa_special_class(spa); metaslab_class_t *dedup = spa_dedup_class(spa); uint64_t slots_per_allocator = 0; for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; metaslab_group_t *mg = tvd->vdev_mg; if (mg == NULL || !metaslab_group_initialized(mg)) continue; metaslab_class_t *mc = mg->mg_class; if (mc != normal && mc != special && mc != dedup) continue; /* * It is safe to do a lock-free check here because only async * allocations look at mg_max_alloc_queue_depth, and async * allocations all happen from spa_sync(). */ for (int i = 0; i < mg->mg_allocators; i++) { ASSERT0(zfs_refcount_count( &(mg->mg_allocator[i].mga_alloc_queue_depth))); } mg->mg_max_alloc_queue_depth = max_queue_depth; for (int i = 0; i < mg->mg_allocators; i++) { mg->mg_allocator[i].mga_cur_max_alloc_queue_depth = zfs_vdev_def_queue_depth; } slots_per_allocator += zfs_vdev_def_queue_depth; } for (int i = 0; i < spa->spa_alloc_count; i++) { ASSERT0(zfs_refcount_count(&normal->mc_alloc_slots[i])); ASSERT0(zfs_refcount_count(&special->mc_alloc_slots[i])); ASSERT0(zfs_refcount_count(&dedup->mc_alloc_slots[i])); normal->mc_alloc_max_slots[i] = slots_per_allocator; special->mc_alloc_max_slots[i] = slots_per_allocator; dedup->mc_alloc_max_slots[i] = slots_per_allocator; } normal->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; special->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; dedup->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; } static void spa_sync_condense_indirect(spa_t *spa, dmu_tx_t *tx) { ASSERT(spa_writeable(spa)); vdev_t *rvd = spa->spa_root_vdev; for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; vdev_indirect_state_sync_verify(vd); if (vdev_indirect_should_condense(vd)) { spa_condense_indirect_start_sync(vd, tx); break; } } } static void spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx) { objset_t *mos = spa->spa_meta_objset; dsl_pool_t *dp = spa->spa_dsl_pool; uint64_t txg = tx->tx_txg; bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK]; do { int pass = ++spa->spa_sync_pass; spa_sync_config_object(spa, tx); spa_sync_aux_dev(spa, &spa->spa_spares, tx, ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES); spa_sync_aux_dev(spa, &spa->spa_l2cache, tx, ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE); spa_errlog_sync(spa, txg); dsl_pool_sync(dp, txg); if (pass < zfs_sync_pass_deferred_free || spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) { /* * If the log space map feature is active we don't * care about deferred frees and the deferred bpobj * as the log space map should effectively have the * same results (i.e. appending only to one object). */ spa_sync_frees(spa, free_bpl, tx); } else { /* * We can not defer frees in pass 1, because * we sync the deferred frees later in pass 1. */ ASSERT3U(pass, >, 1); bplist_iterate(free_bpl, bpobj_enqueue_alloc_cb, &spa->spa_deferred_bpobj, tx); } ddt_sync(spa, txg); dsl_scan_sync(dp, tx); svr_sync(spa, tx); spa_sync_upgrades(spa, tx); spa_flush_metaslabs(spa, tx); vdev_t *vd = NULL; while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) != NULL) vdev_sync(vd, txg); /* * Note: We need to check if the MOS is dirty because we could * have marked the MOS dirty without updating the uberblock * (e.g. if we have sync tasks but no dirty user data). We need * to check the uberblock's rootbp because it is updated if we * have synced out dirty data (though in this case the MOS will * most likely also be dirty due to second order effects, we * don't want to rely on that here). */ if (pass == 1 && spa->spa_uberblock.ub_rootbp.blk_birth < txg && !dmu_objset_is_dirty(mos, txg)) { /* * Nothing changed on the first pass, therefore this * TXG is a no-op. Avoid syncing deferred frees, so * that we can keep this TXG as a no-op. */ ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg)); ASSERT(txg_list_empty(&dp->dp_early_sync_tasks, txg)); break; } spa_sync_deferred_frees(spa, tx); } while (dmu_objset_is_dirty(mos, txg)); } /* * Rewrite the vdev configuration (which includes the uberblock) to * commit the transaction group. * * If there are no dirty vdevs, we sync the uberblock to a few random * top-level vdevs that are known to be visible in the config cache * (see spa_vdev_add() for a complete description). If there *are* dirty * vdevs, sync the uberblock to all vdevs. */ static void spa_sync_rewrite_vdev_config(spa_t *spa, dmu_tx_t *tx) { vdev_t *rvd = spa->spa_root_vdev; uint64_t txg = tx->tx_txg; for (;;) { int error = 0; /* * We hold SCL_STATE to prevent vdev open/close/etc. * while we're attempting to write the vdev labels. */ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); if (list_is_empty(&spa->spa_config_dirty_list)) { vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL }; int svdcount = 0; int children = rvd->vdev_children; int c0 = spa_get_random(children); for (int c = 0; c < children; c++) { vdev_t *vd = rvd->vdev_child[(c0 + c) % children]; /* Stop when revisiting the first vdev */ if (c > 0 && svd[0] == vd) break; if (vd->vdev_ms_array == 0 || vd->vdev_islog || !vdev_is_concrete(vd)) continue; svd[svdcount++] = vd; if (svdcount == SPA_SYNC_MIN_VDEVS) break; } error = vdev_config_sync(svd, svdcount, txg); } else { error = vdev_config_sync(rvd->vdev_child, rvd->vdev_children, txg); } if (error == 0) spa->spa_last_synced_guid = rvd->vdev_guid; spa_config_exit(spa, SCL_STATE, FTAG); if (error == 0) break; zio_suspend(spa, NULL, ZIO_SUSPEND_IOERR); zio_resume_wait(spa); } } /* * Sync the specified transaction group. New blocks may be dirtied as * part of the process, so we iterate until it converges. */ void spa_sync(spa_t *spa, uint64_t txg) { vdev_t *vd = NULL; VERIFY(spa_writeable(spa)); /* * Wait for i/os issued in open context that need to complete * before this txg syncs. */ (void) zio_wait(spa->spa_txg_zio[txg & TXG_MASK]); spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); /* * Lock out configuration changes. */ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); spa->spa_syncing_txg = txg; spa->spa_sync_pass = 0; for (int i = 0; i < spa->spa_alloc_count; i++) { mutex_enter(&spa->spa_alloc_locks[i]); VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i])); mutex_exit(&spa->spa_alloc_locks[i]); } /* * If there are any pending vdev state changes, convert them * into config changes that go out with this transaction group. */ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); while (list_head(&spa->spa_state_dirty_list) != NULL) { /* * We need the write lock here because, for aux vdevs, * calling vdev_config_dirty() modifies sav_config. * This is ugly and will become unnecessary when we * eliminate the aux vdev wart by integrating all vdevs * into the root vdev tree. */ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER); while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { vdev_state_clean(vd); vdev_config_dirty(vd); } spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); } spa_config_exit(spa, SCL_STATE, FTAG); dsl_pool_t *dp = spa->spa_dsl_pool; dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg); spa->spa_sync_starttime = gethrtime(); taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq, spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() + NSEC_TO_TICK(spa->spa_deadman_synctime)); /* * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, * set spa_deflate if we have no raid-z vdevs. */ if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE && spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) { vdev_t *rvd = spa->spa_root_vdev; int i; for (i = 0; i < rvd->vdev_children; i++) { vd = rvd->vdev_child[i]; if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) break; } if (i == rvd->vdev_children) { spa->spa_deflate = TRUE; VERIFY0(zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, sizeof (uint64_t), 1, &spa->spa_deflate, tx)); } } spa_sync_adjust_vdev_max_queue_depth(spa); spa_sync_condense_indirect(spa, tx); spa_sync_iterate_to_convergence(spa, tx); #ifdef ZFS_DEBUG if (!list_is_empty(&spa->spa_config_dirty_list)) { /* * Make sure that the number of ZAPs for all the vdevs matches * the number of ZAPs in the per-vdev ZAP list. This only gets * called if the config is dirty; otherwise there may be * outstanding AVZ operations that weren't completed in * spa_sync_config_object. */ uint64_t all_vdev_zap_entry_count; ASSERT0(zap_count(spa->spa_meta_objset, spa->spa_all_vdev_zaps, &all_vdev_zap_entry_count)); ASSERT3U(vdev_count_verify_zaps(spa->spa_root_vdev), ==, all_vdev_zap_entry_count); } #endif if (spa->spa_vdev_removal != NULL) { ASSERT0(spa->spa_vdev_removal->svr_bytes_done[txg & TXG_MASK]); } spa_sync_rewrite_vdev_config(spa, tx); dmu_tx_commit(tx); taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); spa->spa_deadman_tqid = 0; /* * Clear the dirty config list. */ while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL) vdev_config_clean(vd); /* * Now that the new config has synced transactionally, * let it become visible to the config cache. */ if (spa->spa_config_syncing != NULL) { spa_config_set(spa, spa->spa_config_syncing); spa->spa_config_txg = txg; spa->spa_config_syncing = NULL; } dsl_pool_sync_done(dp, txg); for (int i = 0; i < spa->spa_alloc_count; i++) { mutex_enter(&spa->spa_alloc_locks[i]); VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i])); mutex_exit(&spa->spa_alloc_locks[i]); } /* * Update usable space statistics. */ while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) != NULL) vdev_sync_done(vd, txg); metaslab_class_evict_old(spa->spa_normal_class, txg); metaslab_class_evict_old(spa->spa_log_class, txg); spa_sync_close_syncing_log_sm(spa); spa_update_dspace(spa); /* * It had better be the case that we didn't dirty anything * since vdev_config_sync(). */ ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); while (zfs_pause_spa_sync) delay(1); spa->spa_sync_pass = 0; /* * Update the last synced uberblock here. We want to do this at * the end of spa_sync() so that consumers of spa_last_synced_txg() * will be guaranteed that all the processing associated with * that txg has been completed. */ spa->spa_ubsync = spa->spa_uberblock; spa_config_exit(spa, SCL_CONFIG, FTAG); spa_handle_ignored_writes(spa); /* * If any async tasks have been requested, kick them off. */ spa_async_dispatch(spa); } /* * Sync all pools. We don't want to hold the namespace lock across these * operations, so we take a reference on the spa_t and drop the lock during the * sync. */ void spa_sync_allpools(void) { spa_t *spa = NULL; mutex_enter(&spa_namespace_lock); while ((spa = spa_next(spa)) != NULL) { if (spa_state(spa) != POOL_STATE_ACTIVE || !spa_writeable(spa) || spa_suspended(spa)) continue; spa_open_ref(spa, FTAG); mutex_exit(&spa_namespace_lock); txg_wait_synced(spa_get_dsl(spa), 0); mutex_enter(&spa_namespace_lock); spa_close(spa, FTAG); } mutex_exit(&spa_namespace_lock); } /* * ========================================================================== * Miscellaneous routines * ========================================================================== */ /* * Remove all pools in the system. */ void spa_evict_all(void) { spa_t *spa; /* * Remove all cached state. All pools should be closed now, * so every spa in the AVL tree should be unreferenced. */ mutex_enter(&spa_namespace_lock); while ((spa = spa_next(NULL)) != NULL) { /* * Stop async tasks. The async thread may need to detach * a device that's been replaced, which requires grabbing * spa_namespace_lock, so we must drop it here. */ spa_open_ref(spa, FTAG); mutex_exit(&spa_namespace_lock); spa_async_suspend(spa); mutex_enter(&spa_namespace_lock); spa_close(spa, FTAG); if (spa->spa_state != POOL_STATE_UNINITIALIZED) { spa_unload(spa); spa_deactivate(spa); } spa_remove(spa); } mutex_exit(&spa_namespace_lock); } vdev_t * spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux) { vdev_t *vd; int i; if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL) return (vd); if (aux) { for (i = 0; i < spa->spa_l2cache.sav_count; i++) { vd = spa->spa_l2cache.sav_vdevs[i]; if (vd->vdev_guid == guid) return (vd); } for (i = 0; i < spa->spa_spares.sav_count; i++) { vd = spa->spa_spares.sav_vdevs[i]; if (vd->vdev_guid == guid) return (vd); } } return (NULL); } void spa_upgrade(spa_t *spa, uint64_t version) { ASSERT(spa_writeable(spa)); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); /* * This should only be called for a non-faulted pool, and since a * future version would result in an unopenable pool, this shouldn't be * possible. */ ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version)); ASSERT3U(version, >=, spa->spa_uberblock.ub_version); spa->spa_uberblock.ub_version = version; vdev_config_dirty(spa->spa_root_vdev); spa_config_exit(spa, SCL_ALL, FTAG); txg_wait_synced(spa_get_dsl(spa), 0); } boolean_t spa_has_spare(spa_t *spa, uint64_t guid) { int i; uint64_t spareguid; spa_aux_vdev_t *sav = &spa->spa_spares; for (i = 0; i < sav->sav_count; i++) if (sav->sav_vdevs[i]->vdev_guid == guid) return (B_TRUE); for (i = 0; i < sav->sav_npending; i++) { if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID, &spareguid) == 0 && spareguid == guid) return (B_TRUE); } return (B_FALSE); } /* * Check if a pool has an active shared spare device. * Note: reference count of an active spare is 2, as a spare and as a replace */ static boolean_t spa_has_active_shared_spare(spa_t *spa) { int i, refcnt; uint64_t pool; spa_aux_vdev_t *sav = &spa->spa_spares; for (i = 0; i < sav->sav_count; i++) { if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool, &refcnt) && pool != 0ULL && pool == spa_guid(spa) && refcnt > 2) return (B_TRUE); } return (B_FALSE); } uint64_t spa_total_metaslabs(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; uint64_t m = 0; for (uint64_t c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; if (!vdev_is_concrete(vd)) continue; m += vd->vdev_ms_count; } return (m); } /* * Notify any waiting threads that some activity has switched from being in- * progress to not-in-progress so that the thread can wake up and determine * whether it is finished waiting. */ void spa_notify_waiters(spa_t *spa) { /* * Acquiring spa_activities_lock here prevents the cv_broadcast from * happening between the waiting thread's check and cv_wait. */ mutex_enter(&spa->spa_activities_lock); cv_broadcast(&spa->spa_activities_cv); mutex_exit(&spa->spa_activities_lock); } /* * Notify any waiting threads that the pool is exporting, and then block until * they are finished using the spa_t. */ void spa_wake_waiters(spa_t *spa) { mutex_enter(&spa->spa_activities_lock); spa->spa_waiters_cancel = B_TRUE; cv_broadcast(&spa->spa_activities_cv); while (spa->spa_waiters != 0) cv_wait(&spa->spa_waiters_cv, &spa->spa_activities_lock); spa->spa_waiters_cancel = B_FALSE; mutex_exit(&spa->spa_activities_lock); } /* Whether the vdev or any of its descendants are being initialized/trimmed. */ static boolean_t spa_vdev_activity_in_progress_impl(vdev_t *vd, zpool_wait_activity_t activity) { spa_t *spa = vd->vdev_spa; ASSERT(spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_READER)); ASSERT(MUTEX_HELD(&spa->spa_activities_lock)); ASSERT(activity == ZPOOL_WAIT_INITIALIZE || activity == ZPOOL_WAIT_TRIM); kmutex_t *lock = activity == ZPOOL_WAIT_INITIALIZE ? &vd->vdev_initialize_lock : &vd->vdev_trim_lock; mutex_exit(&spa->spa_activities_lock); mutex_enter(lock); mutex_enter(&spa->spa_activities_lock); boolean_t in_progress = (activity == ZPOOL_WAIT_INITIALIZE) ? (vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) : (vd->vdev_trim_state == VDEV_TRIM_ACTIVE); mutex_exit(lock); if (in_progress) return (B_TRUE); for (int i = 0; i < vd->vdev_children; i++) { if (spa_vdev_activity_in_progress_impl(vd->vdev_child[i], activity)) return (B_TRUE); } return (B_FALSE); } /* * If use_guid is true, this checks whether the vdev specified by guid is * being initialized/trimmed. Otherwise, it checks whether any vdev in the pool * is being initialized/trimmed. The caller must hold the config lock and * spa_activities_lock. */ static int spa_vdev_activity_in_progress(spa_t *spa, boolean_t use_guid, uint64_t guid, zpool_wait_activity_t activity, boolean_t *in_progress) { mutex_exit(&spa->spa_activities_lock); spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); mutex_enter(&spa->spa_activities_lock); vdev_t *vd; if (use_guid) { vd = spa_lookup_by_guid(spa, guid, B_FALSE); if (vd == NULL || !vd->vdev_ops->vdev_op_leaf) { spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); return (EINVAL); } } else { vd = spa->spa_root_vdev; } *in_progress = spa_vdev_activity_in_progress_impl(vd, activity); spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); return (0); } /* * Locking for waiting threads * --------------------------- * * Waiting threads need a way to check whether a given activity is in progress, * and then, if it is, wait for it to complete. Each activity will have some * in-memory representation of the relevant on-disk state which can be used to * determine whether or not the activity is in progress. The in-memory state and * the locking used to protect it will be different for each activity, and may * not be suitable for use with a cvar (e.g., some state is protected by the * config lock). To allow waiting threads to wait without any races, another * lock, spa_activities_lock, is used. * * When the state is checked, both the activity-specific lock (if there is one) * and spa_activities_lock are held. In some cases, the activity-specific lock * is acquired explicitly (e.g. the config lock). In others, the locking is * internal to some check (e.g. bpobj_is_empty). After checking, the waiting * thread releases the activity-specific lock and, if the activity is in * progress, then cv_waits using spa_activities_lock. * * The waiting thread is woken when another thread, one completing some * activity, updates the state of the activity and then calls * spa_notify_waiters, which will cv_broadcast. This 'completing' thread only * needs to hold its activity-specific lock when updating the state, and this * lock can (but doesn't have to) be dropped before calling spa_notify_waiters. * * Because spa_notify_waiters acquires spa_activities_lock before broadcasting, * and because it is held when the waiting thread checks the state of the * activity, it can never be the case that the completing thread both updates * the activity state and cv_broadcasts in between the waiting thread's check * and cv_wait. Thus, a waiting thread can never miss a wakeup. * * In order to prevent deadlock, when the waiting thread does its check, in some * cases it will temporarily drop spa_activities_lock in order to acquire the * activity-specific lock. The order in which spa_activities_lock and the * activity specific lock are acquired in the waiting thread is determined by * the order in which they are acquired in the completing thread; if the * completing thread calls spa_notify_waiters with the activity-specific lock * held, then the waiting thread must also acquire the activity-specific lock * first. */ static int spa_activity_in_progress(spa_t *spa, zpool_wait_activity_t activity, boolean_t use_tag, uint64_t tag, boolean_t *in_progress) { int error = 0; ASSERT(MUTEX_HELD(&spa->spa_activities_lock)); switch (activity) { case ZPOOL_WAIT_CKPT_DISCARD: *in_progress = (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT) && zap_contains(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT) == ENOENT); break; case ZPOOL_WAIT_FREE: *in_progress = ((spa_version(spa) >= SPA_VERSION_DEADLISTS && !bpobj_is_empty(&spa->spa_dsl_pool->dp_free_bpobj)) || spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY) || spa_livelist_delete_check(spa)); break; case ZPOOL_WAIT_INITIALIZE: case ZPOOL_WAIT_TRIM: error = spa_vdev_activity_in_progress(spa, use_tag, tag, activity, in_progress); break; case ZPOOL_WAIT_REPLACE: mutex_exit(&spa->spa_activities_lock); spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); mutex_enter(&spa->spa_activities_lock); *in_progress = vdev_replace_in_progress(spa->spa_root_vdev); spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); break; case ZPOOL_WAIT_REMOVE: *in_progress = (spa->spa_removing_phys.sr_state == DSS_SCANNING); break; case ZPOOL_WAIT_RESILVER: if ((*in_progress = vdev_rebuild_active(spa->spa_root_vdev))) break; /* fall through */ case ZPOOL_WAIT_SCRUB: { boolean_t scanning, paused, is_scrub; dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; is_scrub = (scn->scn_phys.scn_func == POOL_SCAN_SCRUB); scanning = (scn->scn_phys.scn_state == DSS_SCANNING); paused = dsl_scan_is_paused_scrub(scn); *in_progress = (scanning && !paused && is_scrub == (activity == ZPOOL_WAIT_SCRUB)); break; } default: panic("unrecognized value for activity %d", activity); } return (error); } static int spa_wait_common(const char *pool, zpool_wait_activity_t activity, boolean_t use_tag, uint64_t tag, boolean_t *waited) { /* * The tag is used to distinguish between instances of an activity. * 'initialize' and 'trim' are the only activities that we use this for. * The other activities can only have a single instance in progress in a * pool at one time, making the tag unnecessary. * * There can be multiple devices being replaced at once, but since they * all finish once resilvering finishes, we don't bother keeping track * of them individually, we just wait for them all to finish. */ if (use_tag && activity != ZPOOL_WAIT_INITIALIZE && activity != ZPOOL_WAIT_TRIM) return (EINVAL); if (activity < 0 || activity >= ZPOOL_WAIT_NUM_ACTIVITIES) return (EINVAL); spa_t *spa; int error = spa_open(pool, &spa, FTAG); if (error != 0) return (error); /* * Increment the spa's waiter count so that we can call spa_close and * still ensure that the spa_t doesn't get freed before this thread is * finished with it when the pool is exported. We want to call spa_close * before we start waiting because otherwise the additional ref would * prevent the pool from being exported or destroyed throughout the * potentially long wait. */ mutex_enter(&spa->spa_activities_lock); spa->spa_waiters++; spa_close(spa, FTAG); *waited = B_FALSE; for (;;) { boolean_t in_progress; error = spa_activity_in_progress(spa, activity, use_tag, tag, &in_progress); if (error || !in_progress || spa->spa_waiters_cancel) break; *waited = B_TRUE; if (cv_wait_sig(&spa->spa_activities_cv, &spa->spa_activities_lock) == 0) { error = EINTR; break; } } spa->spa_waiters--; cv_signal(&spa->spa_waiters_cv); mutex_exit(&spa->spa_activities_lock); return (error); } /* * Wait for a particular instance of the specified activity to complete, where * the instance is identified by 'tag' */ int spa_wait_tag(const char *pool, zpool_wait_activity_t activity, uint64_t tag, boolean_t *waited) { return (spa_wait_common(pool, activity, B_TRUE, tag, waited)); } /* * Wait for all instances of the specified activity complete */ int spa_wait(const char *pool, zpool_wait_activity_t activity, boolean_t *waited) { return (spa_wait_common(pool, activity, B_FALSE, 0, waited)); } sysevent_t * spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name) { sysevent_t *ev = NULL; #ifdef _KERNEL nvlist_t *resource; resource = zfs_event_create(spa, vd, FM_SYSEVENT_CLASS, name, hist_nvl); if (resource) { ev = kmem_alloc(sizeof (sysevent_t), KM_SLEEP); ev->resource = resource; } #endif return (ev); } void spa_event_post(sysevent_t *ev) { #ifdef _KERNEL if (ev) { zfs_zevent_post(ev->resource, NULL, zfs_zevent_post_cb); kmem_free(ev, sizeof (*ev)); } #endif } /* * Post a zevent corresponding to the given sysevent. The 'name' must be one * of the event definitions in sys/sysevent/eventdefs.h. The payload will be * filled in from the spa and (optionally) the vdev. This doesn't do anything * in the userland libzpool, as we don't want consumers to misinterpret ztest * or zdb as real changes. */ void spa_event_notify(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name) { spa_event_post(spa_event_create(spa, vd, hist_nvl, name)); } /* state manipulation functions */ EXPORT_SYMBOL(spa_open); EXPORT_SYMBOL(spa_open_rewind); EXPORT_SYMBOL(spa_get_stats); EXPORT_SYMBOL(spa_create); EXPORT_SYMBOL(spa_import); EXPORT_SYMBOL(spa_tryimport); EXPORT_SYMBOL(spa_destroy); EXPORT_SYMBOL(spa_export); EXPORT_SYMBOL(spa_reset); EXPORT_SYMBOL(spa_async_request); EXPORT_SYMBOL(spa_async_suspend); EXPORT_SYMBOL(spa_async_resume); EXPORT_SYMBOL(spa_inject_addref); EXPORT_SYMBOL(spa_inject_delref); EXPORT_SYMBOL(spa_scan_stat_init); EXPORT_SYMBOL(spa_scan_get_stats); /* device manipulation */ EXPORT_SYMBOL(spa_vdev_add); EXPORT_SYMBOL(spa_vdev_attach); EXPORT_SYMBOL(spa_vdev_detach); EXPORT_SYMBOL(spa_vdev_setpath); EXPORT_SYMBOL(spa_vdev_setfru); EXPORT_SYMBOL(spa_vdev_split_mirror); /* spare statech is global across all pools) */ EXPORT_SYMBOL(spa_spare_add); EXPORT_SYMBOL(spa_spare_remove); EXPORT_SYMBOL(spa_spare_exists); EXPORT_SYMBOL(spa_spare_activate); /* L2ARC statech is global across all pools) */ EXPORT_SYMBOL(spa_l2cache_add); EXPORT_SYMBOL(spa_l2cache_remove); EXPORT_SYMBOL(spa_l2cache_exists); EXPORT_SYMBOL(spa_l2cache_activate); EXPORT_SYMBOL(spa_l2cache_drop); /* scanning */ EXPORT_SYMBOL(spa_scan); EXPORT_SYMBOL(spa_scan_stop); /* spa syncing */ EXPORT_SYMBOL(spa_sync); /* only for DMU use */ EXPORT_SYMBOL(spa_sync_allpools); /* properties */ EXPORT_SYMBOL(spa_prop_set); EXPORT_SYMBOL(spa_prop_get); EXPORT_SYMBOL(spa_prop_clear_bootfs); /* asynchronous event notification */ EXPORT_SYMBOL(spa_event_notify); /* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_shift, INT, ZMOD_RW, "log2(fraction of arc that can be used by inflight I/Os when " "verifying pool during import"); ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_metadata, INT, ZMOD_RW, "Set to traverse metadata on pool import"); ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_data, INT, ZMOD_RW, "Set to traverse data on pool import"); ZFS_MODULE_PARAM(zfs_spa, spa_, load_print_vdev_tree, INT, ZMOD_RW, "Print vdev tree to zfs_dbgmsg during pool import"); ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_pct, UINT, ZMOD_RD, "Percentage of CPUs to run an IO worker thread"); ZFS_MODULE_PARAM(zfs, zfs_, max_missing_tvds, ULONG, ZMOD_RW, "Allow importing pool with up to this number of missing top-level " "vdevs (in read-only mode)"); ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_pause, INT, ZMOD_RW, "Set the livelist condense zthr to pause"); ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, sync_pause, INT, ZMOD_RW, "Set the livelist condense synctask to pause"); ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, sync_cancel, INT, ZMOD_RW, "Whether livelist condensing was canceled in the synctask"); ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_cancel, INT, ZMOD_RW, "Whether livelist condensing was canceled in the zthr function"); ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, new_alloc, INT, ZMOD_RW, "Whether extra ALLOC blkptrs were added to a livelist entry while it " "was being condensed"); /* END CSTYLED */ diff --git a/module/zfs/spa_history.c b/module/zfs/spa_history.c index f47adb94d55b..2ab58815400a 100644 --- a/module/zfs/spa_history.c +++ b/module/zfs/spa_history.c @@ -1,629 +1,628 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2017 Joyent, Inc. */ #include #include #include #include #include #include #include #include #include #include #include #include "zfs_comutil.h" #include "zfs_gitrev.h" #ifdef _KERNEL #include #endif /* * Routines to manage the on-disk history log. * * The history log is stored as a dmu object containing * tuples. * * Where "record nvlist" is an nvlist containing uint64_ts and strings, and * "packed record length" is the packed length of the "record nvlist" stored * as a little endian uint64_t. * * The log is implemented as a ring buffer, though the original creation * of the pool ('zpool create') is never overwritten. * * The history log is tracked as object 'spa_t::spa_history'. The bonus buffer * of 'spa_history' stores the offsets for logging/retrieving history as * 'spa_history_phys_t'. 'sh_pool_create_len' is the ending offset in bytes of * where the 'zpool create' record is stored. This allows us to never * overwrite the original creation of the pool. 'sh_phys_max_off' is the * physical ending offset in bytes of the log. This tells you the length of * the buffer. 'sh_eof' is the logical EOF (in bytes). Whenever a record * is added, 'sh_eof' is incremented by the size of the record. * 'sh_eof' is never decremented. 'sh_bof' is the logical BOF (in bytes). * This is where the consumer should start reading from after reading in * the 'zpool create' portion of the log. * * 'sh_records_lost' keeps track of how many records have been overwritten * and permanently lost. */ /* convert a logical offset to physical */ static uint64_t spa_history_log_to_phys(uint64_t log_off, spa_history_phys_t *shpp) { uint64_t phys_len; phys_len = shpp->sh_phys_max_off - shpp->sh_pool_create_len; return ((log_off - shpp->sh_pool_create_len) % phys_len + shpp->sh_pool_create_len); } void spa_history_create_obj(spa_t *spa, dmu_tx_t *tx) { dmu_buf_t *dbp; spa_history_phys_t *shpp; objset_t *mos = spa->spa_meta_objset; ASSERT0(spa->spa_history); spa->spa_history = dmu_object_alloc(mos, DMU_OT_SPA_HISTORY, SPA_OLD_MAXBLOCKSIZE, DMU_OT_SPA_HISTORY_OFFSETS, sizeof (spa_history_phys_t), tx); VERIFY0(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_HISTORY, sizeof (uint64_t), 1, &spa->spa_history, tx)); VERIFY0(dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)); ASSERT3U(dbp->db_size, >=, sizeof (spa_history_phys_t)); shpp = dbp->db_data; dmu_buf_will_dirty(dbp, tx); /* * Figure out maximum size of history log. We set it at * 0.1% of pool size, with a max of 1G and min of 128KB. */ shpp->sh_phys_max_off = metaslab_class_get_dspace(spa_normal_class(spa)) / 1000; shpp->sh_phys_max_off = MIN(shpp->sh_phys_max_off, 1<<30); shpp->sh_phys_max_off = MAX(shpp->sh_phys_max_off, 128<<10); dmu_buf_rele(dbp, FTAG); } /* * Change 'sh_bof' to the beginning of the next record. */ static int spa_history_advance_bof(spa_t *spa, spa_history_phys_t *shpp) { objset_t *mos = spa->spa_meta_objset; uint64_t firstread, reclen, phys_bof; char buf[sizeof (reclen)]; int err; phys_bof = spa_history_log_to_phys(shpp->sh_bof, shpp); firstread = MIN(sizeof (reclen), shpp->sh_phys_max_off - phys_bof); if ((err = dmu_read(mos, spa->spa_history, phys_bof, firstread, buf, DMU_READ_PREFETCH)) != 0) return (err); if (firstread != sizeof (reclen)) { if ((err = dmu_read(mos, spa->spa_history, shpp->sh_pool_create_len, sizeof (reclen) - firstread, buf + firstread, DMU_READ_PREFETCH)) != 0) return (err); } reclen = LE_64(*((uint64_t *)buf)); shpp->sh_bof += reclen + sizeof (reclen); shpp->sh_records_lost++; return (0); } static int spa_history_write(spa_t *spa, void *buf, uint64_t len, spa_history_phys_t *shpp, dmu_tx_t *tx) { uint64_t firstwrite, phys_eof; objset_t *mos = spa->spa_meta_objset; int err; ASSERT(MUTEX_HELD(&spa->spa_history_lock)); /* see if we need to reset logical BOF */ while (shpp->sh_phys_max_off - shpp->sh_pool_create_len - (shpp->sh_eof - shpp->sh_bof) <= len) { if ((err = spa_history_advance_bof(spa, shpp)) != 0) { return (err); } } phys_eof = spa_history_log_to_phys(shpp->sh_eof, shpp); firstwrite = MIN(len, shpp->sh_phys_max_off - phys_eof); shpp->sh_eof += len; dmu_write(mos, spa->spa_history, phys_eof, firstwrite, buf, tx); len -= firstwrite; if (len > 0) { /* write out the rest at the beginning of physical file */ dmu_write(mos, spa->spa_history, shpp->sh_pool_create_len, len, (char *)buf + firstwrite, tx); } return (0); } /* * Post a history sysevent. * * The nvlist_t* passed into this function will be transformed into a new * nvlist where: * * 1. Nested nvlists will be flattened to a single level * 2. Keys will have their names normalized (to remove any problematic * characters, such as whitespace) * * The nvlist_t passed into this function will duplicated and should be freed * by caller. * */ static void spa_history_log_notify(spa_t *spa, nvlist_t *nvl) { nvlist_t *hist_nvl = fnvlist_alloc(); uint64_t uint64; char *string; if (nvlist_lookup_string(nvl, ZPOOL_HIST_CMD, &string) == 0) fnvlist_add_string(hist_nvl, ZFS_EV_HIST_CMD, string); if (nvlist_lookup_string(nvl, ZPOOL_HIST_INT_NAME, &string) == 0) fnvlist_add_string(hist_nvl, ZFS_EV_HIST_INT_NAME, string); if (nvlist_lookup_string(nvl, ZPOOL_HIST_ZONE, &string) == 0) fnvlist_add_string(hist_nvl, ZFS_EV_HIST_ZONE, string); if (nvlist_lookup_string(nvl, ZPOOL_HIST_HOST, &string) == 0) fnvlist_add_string(hist_nvl, ZFS_EV_HIST_HOST, string); if (nvlist_lookup_string(nvl, ZPOOL_HIST_DSNAME, &string) == 0) fnvlist_add_string(hist_nvl, ZFS_EV_HIST_DSNAME, string); if (nvlist_lookup_string(nvl, ZPOOL_HIST_INT_STR, &string) == 0) fnvlist_add_string(hist_nvl, ZFS_EV_HIST_INT_STR, string); if (nvlist_lookup_string(nvl, ZPOOL_HIST_IOCTL, &string) == 0) fnvlist_add_string(hist_nvl, ZFS_EV_HIST_IOCTL, string); if (nvlist_lookup_string(nvl, ZPOOL_HIST_INT_NAME, &string) == 0) fnvlist_add_string(hist_nvl, ZFS_EV_HIST_INT_NAME, string); if (nvlist_lookup_uint64(nvl, ZPOOL_HIST_DSID, &uint64) == 0) fnvlist_add_uint64(hist_nvl, ZFS_EV_HIST_DSID, uint64); if (nvlist_lookup_uint64(nvl, ZPOOL_HIST_TXG, &uint64) == 0) fnvlist_add_uint64(hist_nvl, ZFS_EV_HIST_TXG, uint64); if (nvlist_lookup_uint64(nvl, ZPOOL_HIST_TIME, &uint64) == 0) fnvlist_add_uint64(hist_nvl, ZFS_EV_HIST_TIME, uint64); if (nvlist_lookup_uint64(nvl, ZPOOL_HIST_WHO, &uint64) == 0) fnvlist_add_uint64(hist_nvl, ZFS_EV_HIST_WHO, uint64); if (nvlist_lookup_uint64(nvl, ZPOOL_HIST_INT_EVENT, &uint64) == 0) fnvlist_add_uint64(hist_nvl, ZFS_EV_HIST_INT_EVENT, uint64); spa_event_notify(spa, NULL, hist_nvl, ESC_ZFS_HISTORY_EVENT); nvlist_free(hist_nvl); } /* * Write out a history event. */ /*ARGSUSED*/ static void spa_history_log_sync(void *arg, dmu_tx_t *tx) { nvlist_t *nvl = arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; objset_t *mos = spa->spa_meta_objset; dmu_buf_t *dbp; spa_history_phys_t *shpp; size_t reclen; uint64_t le_len; char *record_packed = NULL; int ret; /* * If we have an older pool that doesn't have a command * history object, create it now. */ mutex_enter(&spa->spa_history_lock); if (!spa->spa_history) spa_history_create_obj(spa, tx); mutex_exit(&spa->spa_history_lock); /* * Get the offset of where we need to write via the bonus buffer. * Update the offset when the write completes. */ VERIFY0(dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)); shpp = dbp->db_data; dmu_buf_will_dirty(dbp, tx); #ifdef ZFS_DEBUG { dmu_object_info_t doi; dmu_object_info_from_db(dbp, &doi); ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_SPA_HISTORY_OFFSETS); } #endif fnvlist_add_uint64(nvl, ZPOOL_HIST_TIME, gethrestime_sec()); fnvlist_add_string(nvl, ZPOOL_HIST_HOST, utsname()->nodename); if (nvlist_exists(nvl, ZPOOL_HIST_CMD)) { zfs_dbgmsg("command: %s", fnvlist_lookup_string(nvl, ZPOOL_HIST_CMD)); } else if (nvlist_exists(nvl, ZPOOL_HIST_INT_NAME)) { if (nvlist_exists(nvl, ZPOOL_HIST_DSNAME)) { zfs_dbgmsg("txg %lld %s %s (id %llu) %s", fnvlist_lookup_uint64(nvl, ZPOOL_HIST_TXG), fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_NAME), fnvlist_lookup_string(nvl, ZPOOL_HIST_DSNAME), fnvlist_lookup_uint64(nvl, ZPOOL_HIST_DSID), fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_STR)); } else { zfs_dbgmsg("txg %lld %s %s", fnvlist_lookup_uint64(nvl, ZPOOL_HIST_TXG), fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_NAME), fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_STR)); } /* * The history sysevent is posted only for internal history * messages to show what has happened, not how it happened. For * example, the following command: * * # zfs destroy -r tank/foo * * will result in one sysevent posted per dataset that is * destroyed as a result of the command - which could be more * than one event in total. By contrast, if the sysevent was * posted as a result of the ZPOOL_HIST_CMD key being present * it would result in only one sysevent being posted with the * full command line arguments, requiring the consumer to know * how to parse and understand zfs(1M) command invocations. */ spa_history_log_notify(spa, nvl); } else if (nvlist_exists(nvl, ZPOOL_HIST_IOCTL)) { zfs_dbgmsg("ioctl %s", fnvlist_lookup_string(nvl, ZPOOL_HIST_IOCTL)); } VERIFY3U(nvlist_pack(nvl, &record_packed, &reclen, NV_ENCODE_NATIVE, KM_SLEEP), ==, 0); mutex_enter(&spa->spa_history_lock); /* write out the packed length as little endian */ le_len = LE_64((uint64_t)reclen); ret = spa_history_write(spa, &le_len, sizeof (le_len), shpp, tx); if (!ret) ret = spa_history_write(spa, record_packed, reclen, shpp, tx); /* The first command is the create, which we keep forever */ if (ret == 0 && shpp->sh_pool_create_len == 0 && nvlist_exists(nvl, ZPOOL_HIST_CMD)) { shpp->sh_pool_create_len = shpp->sh_bof = shpp->sh_eof; } mutex_exit(&spa->spa_history_lock); fnvlist_pack_free(record_packed, reclen); dmu_buf_rele(dbp, FTAG); fnvlist_free(nvl); } /* * Write out a history event. */ int spa_history_log(spa_t *spa, const char *msg) { int err; nvlist_t *nvl = fnvlist_alloc(); fnvlist_add_string(nvl, ZPOOL_HIST_CMD, msg); err = spa_history_log_nvl(spa, nvl); fnvlist_free(nvl); return (err); } int spa_history_log_nvl(spa_t *spa, nvlist_t *nvl) { int err = 0; dmu_tx_t *tx; nvlist_t *nvarg, *in_nvl = NULL; if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY || !spa_writeable(spa)) return (SET_ERROR(EINVAL)); err = nvlist_lookup_nvlist(nvl, ZPOOL_HIST_INPUT_NVL, &in_nvl); if (err == 0) { (void) nvlist_remove_all(in_nvl, ZPOOL_HIDDEN_ARGS); } tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); err = dmu_tx_assign(tx, TXG_WAIT); if (err) { dmu_tx_abort(tx); return (err); } VERIFY0(nvlist_dup(nvl, &nvarg, KM_SLEEP)); if (spa_history_zone() != NULL) { fnvlist_add_string(nvarg, ZPOOL_HIST_ZONE, spa_history_zone()); } fnvlist_add_uint64(nvarg, ZPOOL_HIST_WHO, crgetruid(CRED())); /* Kick this off asynchronously; errors are ignored. */ - dsl_sync_task_nowait(spa_get_dsl(spa), spa_history_log_sync, - nvarg, 0, ZFS_SPACE_CHECK_NONE, tx); + dsl_sync_task_nowait(spa_get_dsl(spa), spa_history_log_sync, nvarg, tx); dmu_tx_commit(tx); /* spa_history_log_sync will free nvl */ return (err); } /* * Read out the command history. */ int spa_history_get(spa_t *spa, uint64_t *offp, uint64_t *len, char *buf) { objset_t *mos = spa->spa_meta_objset; dmu_buf_t *dbp; uint64_t read_len, phys_read_off, phys_eof; uint64_t leftover = 0; spa_history_phys_t *shpp; int err; /* * If the command history doesn't exist (older pool), * that's ok, just return ENOENT. */ if (!spa->spa_history) return (SET_ERROR(ENOENT)); /* * The history is logged asynchronously, so when they request * the first chunk of history, make sure everything has been * synced to disk so that we get it. */ if (*offp == 0 && spa_writeable(spa)) txg_wait_synced(spa_get_dsl(spa), 0); if ((err = dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)) != 0) return (err); shpp = dbp->db_data; #ifdef ZFS_DEBUG { dmu_object_info_t doi; dmu_object_info_from_db(dbp, &doi); ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_SPA_HISTORY_OFFSETS); } #endif mutex_enter(&spa->spa_history_lock); phys_eof = spa_history_log_to_phys(shpp->sh_eof, shpp); if (*offp < shpp->sh_pool_create_len) { /* read in just the zpool create history */ phys_read_off = *offp; read_len = MIN(*len, shpp->sh_pool_create_len - phys_read_off); } else { /* * Need to reset passed in offset to BOF if the passed in * offset has since been overwritten. */ *offp = MAX(*offp, shpp->sh_bof); phys_read_off = spa_history_log_to_phys(*offp, shpp); /* * Read up to the minimum of what the user passed down or * the EOF (physical or logical). If we hit physical EOF, * use 'leftover' to read from the physical BOF. */ if (phys_read_off <= phys_eof) { read_len = MIN(*len, phys_eof - phys_read_off); } else { read_len = MIN(*len, shpp->sh_phys_max_off - phys_read_off); if (phys_read_off + *len > shpp->sh_phys_max_off) { leftover = MIN(*len - read_len, phys_eof - shpp->sh_pool_create_len); } } } /* offset for consumer to use next */ *offp += read_len + leftover; /* tell the consumer how much you actually read */ *len = read_len + leftover; if (read_len == 0) { mutex_exit(&spa->spa_history_lock); dmu_buf_rele(dbp, FTAG); return (0); } err = dmu_read(mos, spa->spa_history, phys_read_off, read_len, buf, DMU_READ_PREFETCH); if (leftover && err == 0) { err = dmu_read(mos, spa->spa_history, shpp->sh_pool_create_len, leftover, buf + read_len, DMU_READ_PREFETCH); } mutex_exit(&spa->spa_history_lock); dmu_buf_rele(dbp, FTAG); return (err); } /* * The nvlist will be consumed by this call. */ static void log_internal(nvlist_t *nvl, const char *operation, spa_t *spa, dmu_tx_t *tx, const char *fmt, va_list adx) { char *msg; /* * If this is part of creating a pool, not everything is * initialized yet, so don't bother logging the internal events. * Likewise if the pool is not writeable. */ if (spa_is_initializing(spa) || !spa_writeable(spa)) { fnvlist_free(nvl); return; } msg = kmem_vasprintf(fmt, adx); fnvlist_add_string(nvl, ZPOOL_HIST_INT_STR, msg); kmem_strfree(msg); fnvlist_add_string(nvl, ZPOOL_HIST_INT_NAME, operation); fnvlist_add_uint64(nvl, ZPOOL_HIST_TXG, tx->tx_txg); if (dmu_tx_is_syncing(tx)) { spa_history_log_sync(nvl, tx); } else { dsl_sync_task_nowait(spa_get_dsl(spa), - spa_history_log_sync, nvl, 0, ZFS_SPACE_CHECK_NONE, tx); + spa_history_log_sync, nvl, tx); } /* spa_history_log_sync() will free nvl */ } void spa_history_log_internal(spa_t *spa, const char *operation, dmu_tx_t *tx, const char *fmt, ...) { dmu_tx_t *htx = tx; va_list adx; /* create a tx if we didn't get one */ if (tx == NULL) { htx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); if (dmu_tx_assign(htx, TXG_WAIT) != 0) { dmu_tx_abort(htx); return; } } va_start(adx, fmt); log_internal(fnvlist_alloc(), operation, spa, htx, fmt, adx); va_end(adx); /* if we didn't get a tx from the caller, commit the one we made */ if (tx == NULL) dmu_tx_commit(htx); } void spa_history_log_internal_ds(dsl_dataset_t *ds, const char *operation, dmu_tx_t *tx, const char *fmt, ...) { va_list adx; char namebuf[ZFS_MAX_DATASET_NAME_LEN]; nvlist_t *nvl = fnvlist_alloc(); ASSERT(tx != NULL); dsl_dataset_name(ds, namebuf); fnvlist_add_string(nvl, ZPOOL_HIST_DSNAME, namebuf); fnvlist_add_uint64(nvl, ZPOOL_HIST_DSID, ds->ds_object); va_start(adx, fmt); log_internal(nvl, operation, dsl_dataset_get_spa(ds), tx, fmt, adx); va_end(adx); } void spa_history_log_internal_dd(dsl_dir_t *dd, const char *operation, dmu_tx_t *tx, const char *fmt, ...) { va_list adx; char namebuf[ZFS_MAX_DATASET_NAME_LEN]; nvlist_t *nvl = fnvlist_alloc(); ASSERT(tx != NULL); dsl_dir_name(dd, namebuf); fnvlist_add_string(nvl, ZPOOL_HIST_DSNAME, namebuf); fnvlist_add_uint64(nvl, ZPOOL_HIST_DSID, dsl_dir_phys(dd)->dd_head_dataset_obj); va_start(adx, fmt); log_internal(nvl, operation, dd->dd_pool->dp_spa, tx, fmt, adx); va_end(adx); } void spa_history_log_version(spa_t *spa, const char *operation, dmu_tx_t *tx) { utsname_t *u = utsname(); spa_history_log_internal(spa, operation, tx, "pool version %llu; software version %s; uts %s %s %s %s", (u_longlong_t)spa_version(spa), ZFS_META_GITREV, u->nodename, u->release, u->version, u->machine); } #ifndef _KERNEL const char * spa_history_zone(void) { return (NULL); } #endif #if defined(_KERNEL) EXPORT_SYMBOL(spa_history_create_obj); EXPORT_SYMBOL(spa_history_get); EXPORT_SYMBOL(spa_history_log); EXPORT_SYMBOL(spa_history_log_internal); EXPORT_SYMBOL(spa_history_log_version); #endif diff --git a/module/zfs/vdev_indirect.c b/module/zfs/vdev_indirect.c index 6bc2d917d59c..5301e0665af2 100644 --- a/module/zfs/vdev_indirect.c +++ b/module/zfs/vdev_indirect.c @@ -1,1890 +1,1889 @@ /* * CDDL HEADER START * * This file and its contents are supplied under the terms of the * Common Development and Distribution License ("CDDL"), version 1.0. * You may only use this file in accordance with the terms of version * 1.0 of the CDDL. * * A full copy of the text of the CDDL should have accompanied this * source. A copy of the CDDL is also available via the Internet at * http://www.illumos.org/license/CDDL. * * CDDL HEADER END */ /* * Copyright (c) 2014, 2017 by Delphix. All rights reserved. * Copyright (c) 2019, loli10K . All rights reserved. * Copyright (c) 2014, 2019 by Delphix. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * An indirect vdev corresponds to a vdev that has been removed. Since * we cannot rewrite block pointers of snapshots, etc., we keep a * mapping from old location on the removed device to the new location * on another device in the pool and use this mapping whenever we need * to access the DVA. Unfortunately, this mapping did not respect * logical block boundaries when it was first created, and so a DVA on * this indirect vdev may be "split" into multiple sections that each * map to a different location. As a consequence, not all DVAs can be * translated to an equivalent new DVA. Instead we must provide a * "vdev_remap" operation that executes a callback on each contiguous * segment of the new location. This function is used in multiple ways: * * - i/os to this vdev use the callback to determine where the * data is now located, and issue child i/os for each segment's new * location. * * - frees and claims to this vdev use the callback to free or claim * each mapped segment. (Note that we don't actually need to claim * log blocks on indirect vdevs, because we don't allocate to * removing vdevs. However, zdb uses zio_claim() for its leak * detection.) */ /* * "Big theory statement" for how we mark blocks obsolete. * * When a block on an indirect vdev is freed or remapped, a section of * that vdev's mapping may no longer be referenced (aka "obsolete"). We * keep track of how much of each mapping entry is obsolete. When * an entry becomes completely obsolete, we can remove it, thus reducing * the memory used by the mapping. The complete picture of obsolescence * is given by the following data structures, described below: * - the entry-specific obsolete count * - the vdev-specific obsolete spacemap * - the pool-specific obsolete bpobj * * == On disk data structures used == * * We track the obsolete space for the pool using several objects. Each * of these objects is created on demand and freed when no longer * needed, and is assumed to be empty if it does not exist. * SPA_FEATURE_OBSOLETE_COUNTS includes the count of these objects. * * - Each vic_mapping_object (associated with an indirect vdev) can * have a vimp_counts_object. This is an array of uint32_t's * with the same number of entries as the vic_mapping_object. When * the mapping is condensed, entries from the vic_obsolete_sm_object * (see below) are folded into the counts. Therefore, each * obsolete_counts entry tells us the number of bytes in the * corresponding mapping entry that were not referenced when the * mapping was last condensed. * * - Each indirect or removing vdev can have a vic_obsolete_sm_object. * This is a space map containing an alloc entry for every DVA that * has been obsoleted since the last time this indirect vdev was * condensed. We use this object in order to improve performance * when marking a DVA as obsolete. Instead of modifying an arbitrary * offset of the vimp_counts_object, we only need to append an entry * to the end of this object. When a DVA becomes obsolete, it is * added to the obsolete space map. This happens when the DVA is * freed, remapped and not referenced by a snapshot, or the last * snapshot referencing it is destroyed. * * - Each dataset can have a ds_remap_deadlist object. This is a * deadlist object containing all blocks that were remapped in this * dataset but referenced in a previous snapshot. Blocks can *only* * appear on this list if they were remapped (dsl_dataset_block_remapped); * blocks that were killed in a head dataset are put on the normal * ds_deadlist and marked obsolete when they are freed. * * - The pool can have a dp_obsolete_bpobj. This is a list of blocks * in the pool that need to be marked obsolete. When a snapshot is * destroyed, we move some of the ds_remap_deadlist to the obsolete * bpobj (see dsl_destroy_snapshot_handle_remaps()). We then * asynchronously process the obsolete bpobj, moving its entries to * the specific vdevs' obsolete space maps. * * == Summary of how we mark blocks as obsolete == * * - When freeing a block: if any DVA is on an indirect vdev, append to * vic_obsolete_sm_object. * - When remapping a block, add dva to ds_remap_deadlist (if prev snap * references; otherwise append to vic_obsolete_sm_object). * - When freeing a snapshot: move parts of ds_remap_deadlist to * dp_obsolete_bpobj (same algorithm as ds_deadlist). * - When syncing the spa: process dp_obsolete_bpobj, moving ranges to * individual vdev's vic_obsolete_sm_object. */ /* * "Big theory statement" for how we condense indirect vdevs. * * Condensing an indirect vdev's mapping is the process of determining * the precise counts of obsolete space for each mapping entry (by * integrating the obsolete spacemap into the obsolete counts) and * writing out a new mapping that contains only referenced entries. * * We condense a vdev when we expect the mapping to shrink (see * vdev_indirect_should_condense()), but only perform one condense at a * time to limit the memory usage. In addition, we use a separate * open-context thread (spa_condense_indirect_thread) to incrementally * create the new mapping object in a way that minimizes the impact on * the rest of the system. * * == Generating a new mapping == * * To generate a new mapping, we follow these steps: * * 1. Save the old obsolete space map and create a new mapping object * (see spa_condense_indirect_start_sync()). This initializes the * spa_condensing_indirect_phys with the "previous obsolete space map", * which is now read only. Newly obsolete DVAs will be added to a * new (initially empty) obsolete space map, and will not be * considered as part of this condense operation. * * 2. Construct in memory the precise counts of obsolete space for each * mapping entry, by incorporating the obsolete space map into the * counts. (See vdev_indirect_mapping_load_obsolete_{counts,spacemap}().) * * 3. Iterate through each mapping entry, writing to the new mapping any * entries that are not completely obsolete (i.e. which don't have * obsolete count == mapping length). (See * spa_condense_indirect_generate_new_mapping().) * * 4. Destroy the old mapping object and switch over to the new one * (spa_condense_indirect_complete_sync). * * == Restarting from failure == * * To restart the condense when we import/open the pool, we must start * at the 2nd step above: reconstruct the precise counts in memory, * based on the space map + counts. Then in the 3rd step, we start * iterating where we left off: at vimp_max_offset of the new mapping * object. */ int zfs_condense_indirect_vdevs_enable = B_TRUE; /* * Condense if at least this percent of the bytes in the mapping is * obsolete. With the default of 25%, the amount of space mapped * will be reduced to 1% of its original size after at most 16 * condenses. Higher values will condense less often (causing less * i/o); lower values will reduce the mapping size more quickly. */ int zfs_indirect_condense_obsolete_pct = 25; /* * Condense if the obsolete space map takes up more than this amount of * space on disk (logically). This limits the amount of disk space * consumed by the obsolete space map; the default of 1GB is small enough * that we typically don't mind "wasting" it. */ unsigned long zfs_condense_max_obsolete_bytes = 1024 * 1024 * 1024; /* * Don't bother condensing if the mapping uses less than this amount of * memory. The default of 128KB is considered a "trivial" amount of * memory and not worth reducing. */ unsigned long zfs_condense_min_mapping_bytes = 128 * 1024; /* * This is used by the test suite so that it can ensure that certain * actions happen while in the middle of a condense (which might otherwise * complete too quickly). If used to reduce the performance impact of * condensing in production, a maximum value of 1 should be sufficient. */ int zfs_condense_indirect_commit_entry_delay_ms = 0; /* * If an indirect split block contains more than this many possible unique * combinations when being reconstructed, consider it too computationally * expensive to check them all. Instead, try at most 100 randomly-selected * combinations each time the block is accessed. This allows all segment * copies to participate fairly in the reconstruction when all combinations * cannot be checked and prevents repeated use of one bad copy. */ int zfs_reconstruct_indirect_combinations_max = 4096; /* * Enable to simulate damaged segments and validate reconstruction. This * is intentionally not exposed as a module parameter. */ unsigned long zfs_reconstruct_indirect_damage_fraction = 0; /* * The indirect_child_t represents the vdev that we will read from, when we * need to read all copies of the data (e.g. for scrub or reconstruction). * For plain (non-mirror) top-level vdevs (i.e. is_vdev is not a mirror), * ic_vdev is the same as is_vdev. However, for mirror top-level vdevs, * ic_vdev is a child of the mirror. */ typedef struct indirect_child { abd_t *ic_data; vdev_t *ic_vdev; /* * ic_duplicate is NULL when the ic_data contents are unique, when it * is determined to be a duplicate it references the primary child. */ struct indirect_child *ic_duplicate; list_node_t ic_node; /* node on is_unique_child */ } indirect_child_t; /* * The indirect_split_t represents one mapped segment of an i/o to the * indirect vdev. For non-split (contiguously-mapped) blocks, there will be * only one indirect_split_t, with is_split_offset==0 and is_size==io_size. * For split blocks, there will be several of these. */ typedef struct indirect_split { list_node_t is_node; /* link on iv_splits */ /* * is_split_offset is the offset into the i/o. * This is the sum of the previous splits' is_size's. */ uint64_t is_split_offset; vdev_t *is_vdev; /* top-level vdev */ uint64_t is_target_offset; /* offset on is_vdev */ uint64_t is_size; int is_children; /* number of entries in is_child[] */ int is_unique_children; /* number of entries in is_unique_child */ list_t is_unique_child; /* * is_good_child is the child that we are currently using to * attempt reconstruction. */ indirect_child_t *is_good_child; indirect_child_t is_child[1]; /* variable-length */ } indirect_split_t; /* * The indirect_vsd_t is associated with each i/o to the indirect vdev. * It is the "Vdev-Specific Data" in the zio_t's io_vsd. */ typedef struct indirect_vsd { boolean_t iv_split_block; boolean_t iv_reconstruct; uint64_t iv_unique_combinations; uint64_t iv_attempts; uint64_t iv_attempts_max; list_t iv_splits; /* list of indirect_split_t's */ } indirect_vsd_t; static void vdev_indirect_map_free(zio_t *zio) { indirect_vsd_t *iv = zio->io_vsd; indirect_split_t *is; while ((is = list_head(&iv->iv_splits)) != NULL) { for (int c = 0; c < is->is_children; c++) { indirect_child_t *ic = &is->is_child[c]; if (ic->ic_data != NULL) abd_free(ic->ic_data); } list_remove(&iv->iv_splits, is); indirect_child_t *ic; while ((ic = list_head(&is->is_unique_child)) != NULL) list_remove(&is->is_unique_child, ic); list_destroy(&is->is_unique_child); kmem_free(is, offsetof(indirect_split_t, is_child[is->is_children])); } kmem_free(iv, sizeof (*iv)); } static const zio_vsd_ops_t vdev_indirect_vsd_ops = { .vsd_free = vdev_indirect_map_free, .vsd_cksum_report = zio_vsd_default_cksum_report }; /* * Mark the given offset and size as being obsolete. */ void vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset, uint64_t size) { spa_t *spa = vd->vdev_spa; ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, !=, 0); ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops); ASSERT(size > 0); VERIFY(vdev_indirect_mapping_entry_for_offset( vd->vdev_indirect_mapping, offset) != NULL); if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) { mutex_enter(&vd->vdev_obsolete_lock); range_tree_add(vd->vdev_obsolete_segments, offset, size); mutex_exit(&vd->vdev_obsolete_lock); vdev_dirty(vd, 0, NULL, spa_syncing_txg(spa)); } } /* * Mark the DVA vdev_id:offset:size as being obsolete in the given tx. This * wrapper is provided because the DMU does not know about vdev_t's and * cannot directly call vdev_indirect_mark_obsolete. */ void spa_vdev_indirect_mark_obsolete(spa_t *spa, uint64_t vdev_id, uint64_t offset, uint64_t size, dmu_tx_t *tx) { vdev_t *vd = vdev_lookup_top(spa, vdev_id); ASSERT(dmu_tx_is_syncing(tx)); /* The DMU can only remap indirect vdevs. */ ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); vdev_indirect_mark_obsolete(vd, offset, size); } static spa_condensing_indirect_t * spa_condensing_indirect_create(spa_t *spa) { spa_condensing_indirect_phys_t *scip = &spa->spa_condensing_indirect_phys; spa_condensing_indirect_t *sci = kmem_zalloc(sizeof (*sci), KM_SLEEP); objset_t *mos = spa->spa_meta_objset; for (int i = 0; i < TXG_SIZE; i++) { list_create(&sci->sci_new_mapping_entries[i], sizeof (vdev_indirect_mapping_entry_t), offsetof(vdev_indirect_mapping_entry_t, vime_node)); } sci->sci_new_mapping = vdev_indirect_mapping_open(mos, scip->scip_next_mapping_object); return (sci); } static void spa_condensing_indirect_destroy(spa_condensing_indirect_t *sci) { for (int i = 0; i < TXG_SIZE; i++) list_destroy(&sci->sci_new_mapping_entries[i]); if (sci->sci_new_mapping != NULL) vdev_indirect_mapping_close(sci->sci_new_mapping); kmem_free(sci, sizeof (*sci)); } boolean_t vdev_indirect_should_condense(vdev_t *vd) { vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; spa_t *spa = vd->vdev_spa; ASSERT(dsl_pool_sync_context(spa->spa_dsl_pool)); if (!zfs_condense_indirect_vdevs_enable) return (B_FALSE); /* * We can only condense one indirect vdev at a time. */ if (spa->spa_condensing_indirect != NULL) return (B_FALSE); if (spa_shutting_down(spa)) return (B_FALSE); /* * The mapping object size must not change while we are * condensing, so we can only condense indirect vdevs * (not vdevs that are still in the middle of being removed). */ if (vd->vdev_ops != &vdev_indirect_ops) return (B_FALSE); /* * If nothing new has been marked obsolete, there is no * point in condensing. */ uint64_t obsolete_sm_obj __maybe_unused; ASSERT0(vdev_obsolete_sm_object(vd, &obsolete_sm_obj)); if (vd->vdev_obsolete_sm == NULL) { ASSERT0(obsolete_sm_obj); return (B_FALSE); } ASSERT(vd->vdev_obsolete_sm != NULL); ASSERT3U(obsolete_sm_obj, ==, space_map_object(vd->vdev_obsolete_sm)); uint64_t bytes_mapped = vdev_indirect_mapping_bytes_mapped(vim); uint64_t bytes_obsolete = space_map_allocated(vd->vdev_obsolete_sm); uint64_t mapping_size = vdev_indirect_mapping_size(vim); uint64_t obsolete_sm_size = space_map_length(vd->vdev_obsolete_sm); ASSERT3U(bytes_obsolete, <=, bytes_mapped); /* * If a high percentage of the bytes that are mapped have become * obsolete, condense (unless the mapping is already small enough). * This has a good chance of reducing the amount of memory used * by the mapping. */ if (bytes_obsolete * 100 / bytes_mapped >= zfs_indirect_condense_obsolete_pct && mapping_size > zfs_condense_min_mapping_bytes) { zfs_dbgmsg("should condense vdev %llu because obsolete " "spacemap covers %d%% of %lluMB mapping", (u_longlong_t)vd->vdev_id, (int)(bytes_obsolete * 100 / bytes_mapped), (u_longlong_t)bytes_mapped / 1024 / 1024); return (B_TRUE); } /* * If the obsolete space map takes up too much space on disk, * condense in order to free up this disk space. */ if (obsolete_sm_size >= zfs_condense_max_obsolete_bytes) { zfs_dbgmsg("should condense vdev %llu because obsolete sm " "length %lluMB >= max size %lluMB", (u_longlong_t)vd->vdev_id, (u_longlong_t)obsolete_sm_size / 1024 / 1024, (u_longlong_t)zfs_condense_max_obsolete_bytes / 1024 / 1024); return (B_TRUE); } return (B_FALSE); } /* * This sync task completes (finishes) a condense, deleting the old * mapping and replacing it with the new one. */ static void spa_condense_indirect_complete_sync(void *arg, dmu_tx_t *tx) { spa_condensing_indirect_t *sci = arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; spa_condensing_indirect_phys_t *scip = &spa->spa_condensing_indirect_phys; vdev_t *vd = vdev_lookup_top(spa, scip->scip_vdev); vdev_indirect_config_t *vic = &vd->vdev_indirect_config; objset_t *mos = spa->spa_meta_objset; vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping; uint64_t old_count = vdev_indirect_mapping_num_entries(old_mapping); uint64_t new_count = vdev_indirect_mapping_num_entries(sci->sci_new_mapping); ASSERT(dmu_tx_is_syncing(tx)); ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); ASSERT3P(sci, ==, spa->spa_condensing_indirect); for (int i = 0; i < TXG_SIZE; i++) { ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i])); } ASSERT(vic->vic_mapping_object != 0); ASSERT3U(vd->vdev_id, ==, scip->scip_vdev); ASSERT(scip->scip_next_mapping_object != 0); ASSERT(scip->scip_prev_obsolete_sm_object != 0); /* * Reset vdev_indirect_mapping to refer to the new object. */ rw_enter(&vd->vdev_indirect_rwlock, RW_WRITER); vdev_indirect_mapping_close(vd->vdev_indirect_mapping); vd->vdev_indirect_mapping = sci->sci_new_mapping; rw_exit(&vd->vdev_indirect_rwlock); sci->sci_new_mapping = NULL; vdev_indirect_mapping_free(mos, vic->vic_mapping_object, tx); vic->vic_mapping_object = scip->scip_next_mapping_object; scip->scip_next_mapping_object = 0; space_map_free_obj(mos, scip->scip_prev_obsolete_sm_object, tx); spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); scip->scip_prev_obsolete_sm_object = 0; scip->scip_vdev = 0; VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONDENSING_INDIRECT, tx)); spa_condensing_indirect_destroy(spa->spa_condensing_indirect); spa->spa_condensing_indirect = NULL; zfs_dbgmsg("finished condense of vdev %llu in txg %llu: " "new mapping object %llu has %llu entries " "(was %llu entries)", vd->vdev_id, dmu_tx_get_txg(tx), vic->vic_mapping_object, new_count, old_count); vdev_config_dirty(spa->spa_root_vdev); } /* * This sync task appends entries to the new mapping object. */ static void spa_condense_indirect_commit_sync(void *arg, dmu_tx_t *tx) { spa_condensing_indirect_t *sci = arg; uint64_t txg = dmu_tx_get_txg(tx); spa_t *spa __maybe_unused = dmu_tx_pool(tx)->dp_spa; ASSERT(dmu_tx_is_syncing(tx)); ASSERT3P(sci, ==, spa->spa_condensing_indirect); vdev_indirect_mapping_add_entries(sci->sci_new_mapping, &sci->sci_new_mapping_entries[txg & TXG_MASK], tx); ASSERT(list_is_empty(&sci->sci_new_mapping_entries[txg & TXG_MASK])); } /* * Open-context function to add one entry to the new mapping. The new * entry will be remembered and written from syncing context. */ static void spa_condense_indirect_commit_entry(spa_t *spa, vdev_indirect_mapping_entry_phys_t *vimep, uint32_t count) { spa_condensing_indirect_t *sci = spa->spa_condensing_indirect; ASSERT3U(count, <, DVA_GET_ASIZE(&vimep->vimep_dst)); dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); dmu_tx_hold_space(tx, sizeof (*vimep) + sizeof (count)); VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; /* * If we are the first entry committed this txg, kick off the sync * task to write to the MOS on our behalf. */ if (list_is_empty(&sci->sci_new_mapping_entries[txgoff])) { dsl_sync_task_nowait(dmu_tx_pool(tx), - spa_condense_indirect_commit_sync, sci, - 0, ZFS_SPACE_CHECK_NONE, tx); + spa_condense_indirect_commit_sync, sci, tx); } vdev_indirect_mapping_entry_t *vime = kmem_alloc(sizeof (*vime), KM_SLEEP); vime->vime_mapping = *vimep; vime->vime_obsolete_count = count; list_insert_tail(&sci->sci_new_mapping_entries[txgoff], vime); dmu_tx_commit(tx); } static void spa_condense_indirect_generate_new_mapping(vdev_t *vd, uint32_t *obsolete_counts, uint64_t start_index, zthr_t *zthr) { spa_t *spa = vd->vdev_spa; uint64_t mapi = start_index; vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping; uint64_t old_num_entries = vdev_indirect_mapping_num_entries(old_mapping); ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); ASSERT3U(vd->vdev_id, ==, spa->spa_condensing_indirect_phys.scip_vdev); zfs_dbgmsg("starting condense of vdev %llu from index %llu", (u_longlong_t)vd->vdev_id, (u_longlong_t)mapi); while (mapi < old_num_entries) { if (zthr_iscancelled(zthr)) { zfs_dbgmsg("pausing condense of vdev %llu " "at index %llu", (u_longlong_t)vd->vdev_id, (u_longlong_t)mapi); break; } vdev_indirect_mapping_entry_phys_t *entry = &old_mapping->vim_entries[mapi]; uint64_t entry_size = DVA_GET_ASIZE(&entry->vimep_dst); ASSERT3U(obsolete_counts[mapi], <=, entry_size); if (obsolete_counts[mapi] < entry_size) { spa_condense_indirect_commit_entry(spa, entry, obsolete_counts[mapi]); /* * This delay may be requested for testing, debugging, * or performance reasons. */ hrtime_t now = gethrtime(); hrtime_t sleep_until = now + MSEC2NSEC( zfs_condense_indirect_commit_entry_delay_ms); zfs_sleep_until(sleep_until); } mapi++; } } /* ARGSUSED */ static boolean_t spa_condense_indirect_thread_check(void *arg, zthr_t *zthr) { spa_t *spa = arg; return (spa->spa_condensing_indirect != NULL); } /* ARGSUSED */ static void spa_condense_indirect_thread(void *arg, zthr_t *zthr) { spa_t *spa = arg; vdev_t *vd; ASSERT3P(spa->spa_condensing_indirect, !=, NULL); spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); vd = vdev_lookup_top(spa, spa->spa_condensing_indirect_phys.scip_vdev); ASSERT3P(vd, !=, NULL); spa_config_exit(spa, SCL_VDEV, FTAG); spa_condensing_indirect_t *sci = spa->spa_condensing_indirect; spa_condensing_indirect_phys_t *scip = &spa->spa_condensing_indirect_phys; uint32_t *counts; uint64_t start_index; vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping; space_map_t *prev_obsolete_sm = NULL; ASSERT3U(vd->vdev_id, ==, scip->scip_vdev); ASSERT(scip->scip_next_mapping_object != 0); ASSERT(scip->scip_prev_obsolete_sm_object != 0); ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); for (int i = 0; i < TXG_SIZE; i++) { /* * The list must start out empty in order for the * _commit_sync() sync task to be properly registered * on the first call to _commit_entry(); so it's wise * to double check and ensure we actually are starting * with empty lists. */ ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i])); } VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset, scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0)); counts = vdev_indirect_mapping_load_obsolete_counts(old_mapping); if (prev_obsolete_sm != NULL) { vdev_indirect_mapping_load_obsolete_spacemap(old_mapping, counts, prev_obsolete_sm); } space_map_close(prev_obsolete_sm); /* * Generate new mapping. Determine what index to continue from * based on the max offset that we've already written in the * new mapping. */ uint64_t max_offset = vdev_indirect_mapping_max_offset(sci->sci_new_mapping); if (max_offset == 0) { /* We haven't written anything to the new mapping yet. */ start_index = 0; } else { /* * Pick up from where we left off. _entry_for_offset() * returns a pointer into the vim_entries array. If * max_offset is greater than any of the mappings * contained in the table NULL will be returned and * that indicates we've exhausted our iteration of the * old_mapping. */ vdev_indirect_mapping_entry_phys_t *entry = vdev_indirect_mapping_entry_for_offset_or_next(old_mapping, max_offset); if (entry == NULL) { /* * We've already written the whole new mapping. * This special value will cause us to skip the * generate_new_mapping step and just do the sync * task to complete the condense. */ start_index = UINT64_MAX; } else { start_index = entry - old_mapping->vim_entries; ASSERT3U(start_index, <, vdev_indirect_mapping_num_entries(old_mapping)); } } spa_condense_indirect_generate_new_mapping(vd, counts, start_index, zthr); vdev_indirect_mapping_free_obsolete_counts(old_mapping, counts); /* * If the zthr has received a cancellation signal while running * in generate_new_mapping() or at any point after that, then bail * early. We don't want to complete the condense if the spa is * shutting down. */ if (zthr_iscancelled(zthr)) return; VERIFY0(dsl_sync_task(spa_name(spa), NULL, spa_condense_indirect_complete_sync, sci, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED)); } /* * Sync task to begin the condensing process. */ void spa_condense_indirect_start_sync(vdev_t *vd, dmu_tx_t *tx) { spa_t *spa = vd->vdev_spa; spa_condensing_indirect_phys_t *scip = &spa->spa_condensing_indirect_phys; ASSERT0(scip->scip_next_mapping_object); ASSERT0(scip->scip_prev_obsolete_sm_object); ASSERT0(scip->scip_vdev); ASSERT(dmu_tx_is_syncing(tx)); ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); ASSERT(spa_feature_is_active(spa, SPA_FEATURE_OBSOLETE_COUNTS)); ASSERT(vdev_indirect_mapping_num_entries(vd->vdev_indirect_mapping)); uint64_t obsolete_sm_obj; VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_obj)); ASSERT3U(obsolete_sm_obj, !=, 0); scip->scip_vdev = vd->vdev_id; scip->scip_next_mapping_object = vdev_indirect_mapping_alloc(spa->spa_meta_objset, tx); scip->scip_prev_obsolete_sm_object = obsolete_sm_obj; /* * We don't need to allocate a new space map object, since * vdev_indirect_sync_obsolete will allocate one when needed. */ space_map_close(vd->vdev_obsolete_sm); vd->vdev_obsolete_sm = NULL; VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, tx)); VERIFY0(zap_add(spa->spa_dsl_pool->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONDENSING_INDIRECT, sizeof (uint64_t), sizeof (*scip) / sizeof (uint64_t), scip, tx)); ASSERT3P(spa->spa_condensing_indirect, ==, NULL); spa->spa_condensing_indirect = spa_condensing_indirect_create(spa); zfs_dbgmsg("starting condense of vdev %llu in txg %llu: " "posm=%llu nm=%llu", vd->vdev_id, dmu_tx_get_txg(tx), (u_longlong_t)scip->scip_prev_obsolete_sm_object, (u_longlong_t)scip->scip_next_mapping_object); zthr_wakeup(spa->spa_condense_zthr); } /* * Sync to the given vdev's obsolete space map any segments that are no longer * referenced as of the given txg. * * If the obsolete space map doesn't exist yet, create and open it. */ void vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx) { spa_t *spa = vd->vdev_spa; vdev_indirect_config_t *vic __maybe_unused = &vd->vdev_indirect_config; ASSERT3U(vic->vic_mapping_object, !=, 0); ASSERT(range_tree_space(vd->vdev_obsolete_segments) > 0); ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops); ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)); uint64_t obsolete_sm_object; VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object)); if (obsolete_sm_object == 0) { obsolete_sm_object = space_map_alloc(spa->spa_meta_objset, zfs_vdev_standard_sm_blksz, tx); ASSERT(vd->vdev_top_zap != 0); VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, sizeof (obsolete_sm_object), 1, &obsolete_sm_object, tx)); ASSERT0(vdev_obsolete_sm_object(vd, &obsolete_sm_object)); ASSERT3U(obsolete_sm_object, !=, 0); spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); VERIFY0(space_map_open(&vd->vdev_obsolete_sm, spa->spa_meta_objset, obsolete_sm_object, 0, vd->vdev_asize, 0)); } ASSERT(vd->vdev_obsolete_sm != NULL); ASSERT3U(obsolete_sm_object, ==, space_map_object(vd->vdev_obsolete_sm)); space_map_write(vd->vdev_obsolete_sm, vd->vdev_obsolete_segments, SM_ALLOC, SM_NO_VDEVID, tx); range_tree_vacate(vd->vdev_obsolete_segments, NULL, NULL); } int spa_condense_init(spa_t *spa) { int error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONDENSING_INDIRECT, sizeof (uint64_t), sizeof (spa->spa_condensing_indirect_phys) / sizeof (uint64_t), &spa->spa_condensing_indirect_phys); if (error == 0) { if (spa_writeable(spa)) { spa->spa_condensing_indirect = spa_condensing_indirect_create(spa); } return (0); } else if (error == ENOENT) { return (0); } else { return (error); } } void spa_condense_fini(spa_t *spa) { if (spa->spa_condensing_indirect != NULL) { spa_condensing_indirect_destroy(spa->spa_condensing_indirect); spa->spa_condensing_indirect = NULL; } } void spa_start_indirect_condensing_thread(spa_t *spa) { ASSERT3P(spa->spa_condense_zthr, ==, NULL); spa->spa_condense_zthr = zthr_create("z_indirect_condense", spa_condense_indirect_thread_check, spa_condense_indirect_thread, spa); } /* * Gets the obsolete spacemap object from the vdev's ZAP. On success sm_obj * will contain either the obsolete spacemap object or zero if none exists. * All other errors are returned to the caller. */ int vdev_obsolete_sm_object(vdev_t *vd, uint64_t *sm_obj) { ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER)); if (vd->vdev_top_zap == 0) { *sm_obj = 0; return (0); } int error = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, sizeof (uint64_t), 1, sm_obj); if (error == ENOENT) { *sm_obj = 0; error = 0; } return (error); } /* * Gets the obsolete count are precise spacemap object from the vdev's ZAP. * On success are_precise will be set to reflect if the counts are precise. * All other errors are returned to the caller. */ int vdev_obsolete_counts_are_precise(vdev_t *vd, boolean_t *are_precise) { ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER)); if (vd->vdev_top_zap == 0) { *are_precise = B_FALSE; return (0); } uint64_t val = 0; int error = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, sizeof (val), 1, &val); if (error == 0) { *are_precise = (val != 0); } else if (error == ENOENT) { *are_precise = B_FALSE; error = 0; } return (error); } /* ARGSUSED */ static void vdev_indirect_close(vdev_t *vd) { } /* ARGSUSED */ static int vdev_indirect_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, uint64_t *logical_ashift, uint64_t *physical_ashift) { *psize = *max_psize = vd->vdev_asize + VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; *logical_ashift = vd->vdev_ashift; *physical_ashift = vd->vdev_physical_ashift; return (0); } typedef struct remap_segment { vdev_t *rs_vd; uint64_t rs_offset; uint64_t rs_asize; uint64_t rs_split_offset; list_node_t rs_node; } remap_segment_t; static remap_segment_t * rs_alloc(vdev_t *vd, uint64_t offset, uint64_t asize, uint64_t split_offset) { remap_segment_t *rs = kmem_alloc(sizeof (remap_segment_t), KM_SLEEP); rs->rs_vd = vd; rs->rs_offset = offset; rs->rs_asize = asize; rs->rs_split_offset = split_offset; return (rs); } /* * Given an indirect vdev and an extent on that vdev, it duplicates the * physical entries of the indirect mapping that correspond to the extent * to a new array and returns a pointer to it. In addition, copied_entries * is populated with the number of mapping entries that were duplicated. * * Note that the function assumes that the caller holds vdev_indirect_rwlock. * This ensures that the mapping won't change due to condensing as we * copy over its contents. * * Finally, since we are doing an allocation, it is up to the caller to * free the array allocated in this function. */ static vdev_indirect_mapping_entry_phys_t * vdev_indirect_mapping_duplicate_adjacent_entries(vdev_t *vd, uint64_t offset, uint64_t asize, uint64_t *copied_entries) { vdev_indirect_mapping_entry_phys_t *duplicate_mappings = NULL; vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; uint64_t entries = 0; ASSERT(RW_READ_HELD(&vd->vdev_indirect_rwlock)); vdev_indirect_mapping_entry_phys_t *first_mapping = vdev_indirect_mapping_entry_for_offset(vim, offset); ASSERT3P(first_mapping, !=, NULL); vdev_indirect_mapping_entry_phys_t *m = first_mapping; while (asize > 0) { uint64_t size = DVA_GET_ASIZE(&m->vimep_dst); ASSERT3U(offset, >=, DVA_MAPPING_GET_SRC_OFFSET(m)); ASSERT3U(offset, <, DVA_MAPPING_GET_SRC_OFFSET(m) + size); uint64_t inner_offset = offset - DVA_MAPPING_GET_SRC_OFFSET(m); uint64_t inner_size = MIN(asize, size - inner_offset); offset += inner_size; asize -= inner_size; entries++; m++; } size_t copy_length = entries * sizeof (*first_mapping); duplicate_mappings = kmem_alloc(copy_length, KM_SLEEP); bcopy(first_mapping, duplicate_mappings, copy_length); *copied_entries = entries; return (duplicate_mappings); } /* * Goes through the relevant indirect mappings until it hits a concrete vdev * and issues the callback. On the way to the concrete vdev, if any other * indirect vdevs are encountered, then the callback will also be called on * each of those indirect vdevs. For example, if the segment is mapped to * segment A on indirect vdev 1, and then segment A on indirect vdev 1 is * mapped to segment B on concrete vdev 2, then the callback will be called on * both vdev 1 and vdev 2. * * While the callback passed to vdev_indirect_remap() is called on every vdev * the function encounters, certain callbacks only care about concrete vdevs. * These types of callbacks should return immediately and explicitly when they * are called on an indirect vdev. * * Because there is a possibility that a DVA section in the indirect device * has been split into multiple sections in our mapping, we keep track * of the relevant contiguous segments of the new location (remap_segment_t) * in a stack. This way we can call the callback for each of the new sections * created by a single section of the indirect device. Note though, that in * this scenario the callbacks in each split block won't occur in-order in * terms of offset, so callers should not make any assumptions about that. * * For callbacks that don't handle split blocks and immediately return when * they encounter them (as is the case for remap_blkptr_cb), the caller can * assume that its callback will be applied from the first indirect vdev * encountered to the last one and then the concrete vdev, in that order. */ static void vdev_indirect_remap(vdev_t *vd, uint64_t offset, uint64_t asize, void (*func)(uint64_t, vdev_t *, uint64_t, uint64_t, void *), void *arg) { list_t stack; spa_t *spa = vd->vdev_spa; list_create(&stack, sizeof (remap_segment_t), offsetof(remap_segment_t, rs_node)); for (remap_segment_t *rs = rs_alloc(vd, offset, asize, 0); rs != NULL; rs = list_remove_head(&stack)) { vdev_t *v = rs->rs_vd; uint64_t num_entries = 0; ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); ASSERT(rs->rs_asize > 0); /* * Note: As this function can be called from open context * (e.g. zio_read()), we need the following rwlock to * prevent the mapping from being changed by condensing. * * So we grab the lock and we make a copy of the entries * that are relevant to the extent that we are working on. * Once that is done, we drop the lock and iterate over * our copy of the mapping. Once we are done with the with * the remap segment and we free it, we also free our copy * of the indirect mapping entries that are relevant to it. * * This way we don't need to wait until the function is * finished with a segment, to condense it. In addition, we * don't need a recursive rwlock for the case that a call to * vdev_indirect_remap() needs to call itself (through the * codepath of its callback) for the same vdev in the middle * of its execution. */ rw_enter(&v->vdev_indirect_rwlock, RW_READER); ASSERT3P(v->vdev_indirect_mapping, !=, NULL); vdev_indirect_mapping_entry_phys_t *mapping = vdev_indirect_mapping_duplicate_adjacent_entries(v, rs->rs_offset, rs->rs_asize, &num_entries); ASSERT3P(mapping, !=, NULL); ASSERT3U(num_entries, >, 0); rw_exit(&v->vdev_indirect_rwlock); for (uint64_t i = 0; i < num_entries; i++) { /* * Note: the vdev_indirect_mapping can not change * while we are running. It only changes while the * removal is in progress, and then only from syncing * context. While a removal is in progress, this * function is only called for frees, which also only * happen from syncing context. */ vdev_indirect_mapping_entry_phys_t *m = &mapping[i]; ASSERT3P(m, !=, NULL); ASSERT3U(rs->rs_asize, >, 0); uint64_t size = DVA_GET_ASIZE(&m->vimep_dst); uint64_t dst_offset = DVA_GET_OFFSET(&m->vimep_dst); uint64_t dst_vdev = DVA_GET_VDEV(&m->vimep_dst); ASSERT3U(rs->rs_offset, >=, DVA_MAPPING_GET_SRC_OFFSET(m)); ASSERT3U(rs->rs_offset, <, DVA_MAPPING_GET_SRC_OFFSET(m) + size); ASSERT3U(dst_vdev, !=, v->vdev_id); uint64_t inner_offset = rs->rs_offset - DVA_MAPPING_GET_SRC_OFFSET(m); uint64_t inner_size = MIN(rs->rs_asize, size - inner_offset); vdev_t *dst_v = vdev_lookup_top(spa, dst_vdev); ASSERT3P(dst_v, !=, NULL); if (dst_v->vdev_ops == &vdev_indirect_ops) { list_insert_head(&stack, rs_alloc(dst_v, dst_offset + inner_offset, inner_size, rs->rs_split_offset)); } if ((zfs_flags & ZFS_DEBUG_INDIRECT_REMAP) && IS_P2ALIGNED(inner_size, 2 * SPA_MINBLOCKSIZE)) { /* * Note: This clause exists only solely for * testing purposes. We use it to ensure that * split blocks work and that the callbacks * using them yield the same result if issued * in reverse order. */ uint64_t inner_half = inner_size / 2; func(rs->rs_split_offset + inner_half, dst_v, dst_offset + inner_offset + inner_half, inner_half, arg); func(rs->rs_split_offset, dst_v, dst_offset + inner_offset, inner_half, arg); } else { func(rs->rs_split_offset, dst_v, dst_offset + inner_offset, inner_size, arg); } rs->rs_offset += inner_size; rs->rs_asize -= inner_size; rs->rs_split_offset += inner_size; } VERIFY0(rs->rs_asize); kmem_free(mapping, num_entries * sizeof (*mapping)); kmem_free(rs, sizeof (remap_segment_t)); } list_destroy(&stack); } static void vdev_indirect_child_io_done(zio_t *zio) { zio_t *pio = zio->io_private; mutex_enter(&pio->io_lock); pio->io_error = zio_worst_error(pio->io_error, zio->io_error); mutex_exit(&pio->io_lock); abd_put(zio->io_abd); } /* * This is a callback for vdev_indirect_remap() which allocates an * indirect_split_t for each split segment and adds it to iv_splits. */ static void vdev_indirect_gather_splits(uint64_t split_offset, vdev_t *vd, uint64_t offset, uint64_t size, void *arg) { zio_t *zio = arg; indirect_vsd_t *iv = zio->io_vsd; ASSERT3P(vd, !=, NULL); if (vd->vdev_ops == &vdev_indirect_ops) return; int n = 1; if (vd->vdev_ops == &vdev_mirror_ops) n = vd->vdev_children; indirect_split_t *is = kmem_zalloc(offsetof(indirect_split_t, is_child[n]), KM_SLEEP); is->is_children = n; is->is_size = size; is->is_split_offset = split_offset; is->is_target_offset = offset; is->is_vdev = vd; list_create(&is->is_unique_child, sizeof (indirect_child_t), offsetof(indirect_child_t, ic_node)); /* * Note that we only consider multiple copies of the data for * *mirror* vdevs. We don't for "replacing" or "spare" vdevs, even * though they use the same ops as mirror, because there's only one * "good" copy under the replacing/spare. */ if (vd->vdev_ops == &vdev_mirror_ops) { for (int i = 0; i < n; i++) { is->is_child[i].ic_vdev = vd->vdev_child[i]; list_link_init(&is->is_child[i].ic_node); } } else { is->is_child[0].ic_vdev = vd; } list_insert_tail(&iv->iv_splits, is); } static void vdev_indirect_read_split_done(zio_t *zio) { indirect_child_t *ic = zio->io_private; if (zio->io_error != 0) { /* * Clear ic_data to indicate that we do not have data for this * child. */ abd_free(ic->ic_data); ic->ic_data = NULL; } } /* * Issue reads for all copies (mirror children) of all splits. */ static void vdev_indirect_read_all(zio_t *zio) { indirect_vsd_t *iv = zio->io_vsd; ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) { for (int i = 0; i < is->is_children; i++) { indirect_child_t *ic = &is->is_child[i]; if (!vdev_readable(ic->ic_vdev)) continue; /* * Note, we may read from a child whose DTL * indicates that the data may not be present here. * While this might result in a few i/os that will * likely return incorrect data, it simplifies the * code since we can treat scrub and resilver * identically. (The incorrect data will be * detected and ignored when we verify the * checksum.) */ ic->ic_data = abd_alloc_sametype(zio->io_abd, is->is_size); ic->ic_duplicate = NULL; zio_nowait(zio_vdev_child_io(zio, NULL, ic->ic_vdev, is->is_target_offset, ic->ic_data, is->is_size, zio->io_type, zio->io_priority, 0, vdev_indirect_read_split_done, ic)); } } iv->iv_reconstruct = B_TRUE; } static void vdev_indirect_io_start(zio_t *zio) { spa_t *spa __maybe_unused = zio->io_spa; indirect_vsd_t *iv = kmem_zalloc(sizeof (*iv), KM_SLEEP); list_create(&iv->iv_splits, sizeof (indirect_split_t), offsetof(indirect_split_t, is_node)); zio->io_vsd = iv; zio->io_vsd_ops = &vdev_indirect_vsd_ops; ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); if (zio->io_type != ZIO_TYPE_READ) { ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); /* * Note: this code can handle other kinds of writes, * but we don't expect them. */ ASSERT((zio->io_flags & (ZIO_FLAG_SELF_HEAL | ZIO_FLAG_RESILVER | ZIO_FLAG_INDUCE_DAMAGE)) != 0); } vdev_indirect_remap(zio->io_vd, zio->io_offset, zio->io_size, vdev_indirect_gather_splits, zio); indirect_split_t *first = list_head(&iv->iv_splits); if (first->is_size == zio->io_size) { /* * This is not a split block; we are pointing to the entire * data, which will checksum the same as the original data. * Pass the BP down so that the child i/o can verify the * checksum, and try a different location if available * (e.g. on a mirror). * * While this special case could be handled the same as the * general (split block) case, doing it this way ensures * that the vast majority of blocks on indirect vdevs * (which are not split) are handled identically to blocks * on non-indirect vdevs. This allows us to be less strict * about performance in the general (but rare) case. */ ASSERT0(first->is_split_offset); ASSERT3P(list_next(&iv->iv_splits, first), ==, NULL); zio_nowait(zio_vdev_child_io(zio, zio->io_bp, first->is_vdev, first->is_target_offset, abd_get_offset(zio->io_abd, 0), zio->io_size, zio->io_type, zio->io_priority, 0, vdev_indirect_child_io_done, zio)); } else { iv->iv_split_block = B_TRUE; if (zio->io_type == ZIO_TYPE_READ && zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) { /* * Read all copies. Note that for simplicity, * we don't bother consulting the DTL in the * resilver case. */ vdev_indirect_read_all(zio); } else { /* * If this is a read zio, we read one copy of each * split segment, from the top-level vdev. Since * we don't know the checksum of each split * individually, the child zio can't ensure that * we get the right data. E.g. if it's a mirror, * it will just read from a random (healthy) leaf * vdev. We have to verify the checksum in * vdev_indirect_io_done(). * * For write zios, the vdev code will ensure we write * to all children. */ for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) { zio_nowait(zio_vdev_child_io(zio, NULL, is->is_vdev, is->is_target_offset, abd_get_offset(zio->io_abd, is->is_split_offset), is->is_size, zio->io_type, zio->io_priority, 0, vdev_indirect_child_io_done, zio)); } } } zio_execute(zio); } /* * Report a checksum error for a child. */ static void vdev_indirect_checksum_error(zio_t *zio, indirect_split_t *is, indirect_child_t *ic) { vdev_t *vd = ic->ic_vdev; if (zio->io_flags & ZIO_FLAG_SPECULATIVE) return; mutex_enter(&vd->vdev_stat_lock); vd->vdev_stat.vs_checksum_errors++; mutex_exit(&vd->vdev_stat_lock); zio_bad_cksum_t zbc = {{{ 0 }}}; abd_t *bad_abd = ic->ic_data; abd_t *good_abd = is->is_good_child->ic_data; (void) zfs_ereport_post_checksum(zio->io_spa, vd, NULL, zio, is->is_target_offset, is->is_size, good_abd, bad_abd, &zbc); } /* * Issue repair i/os for any incorrect copies. We do this by comparing * each split segment's correct data (is_good_child's ic_data) with each * other copy of the data. If they differ, then we overwrite the bad data * with the good copy. Note that we do this without regard for the DTL's, * which simplifies this code and also issues the optimal number of writes * (based on which copies actually read bad data, as opposed to which we * think might be wrong). For the same reason, we always use * ZIO_FLAG_SELF_HEAL, to bypass the DTL check in zio_vdev_io_start(). */ static void vdev_indirect_repair(zio_t *zio) { indirect_vsd_t *iv = zio->io_vsd; enum zio_flag flags = ZIO_FLAG_IO_REPAIR; if (!(zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) flags |= ZIO_FLAG_SELF_HEAL; if (!spa_writeable(zio->io_spa)) return; for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) { for (int c = 0; c < is->is_children; c++) { indirect_child_t *ic = &is->is_child[c]; if (ic == is->is_good_child) continue; if (ic->ic_data == NULL) continue; if (ic->ic_duplicate == is->is_good_child) continue; zio_nowait(zio_vdev_child_io(zio, NULL, ic->ic_vdev, is->is_target_offset, is->is_good_child->ic_data, is->is_size, ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL, NULL, NULL)); vdev_indirect_checksum_error(zio, is, ic); } } } /* * Report checksum errors on all children that we read from. */ static void vdev_indirect_all_checksum_errors(zio_t *zio) { indirect_vsd_t *iv = zio->io_vsd; if (zio->io_flags & ZIO_FLAG_SPECULATIVE) return; for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) { for (int c = 0; c < is->is_children; c++) { indirect_child_t *ic = &is->is_child[c]; if (ic->ic_data == NULL) continue; vdev_t *vd = ic->ic_vdev; mutex_enter(&vd->vdev_stat_lock); vd->vdev_stat.vs_checksum_errors++; mutex_exit(&vd->vdev_stat_lock); (void) zfs_ereport_post_checksum(zio->io_spa, vd, NULL, zio, is->is_target_offset, is->is_size, NULL, NULL, NULL); } } } /* * Copy data from all the splits to a main zio then validate the checksum. * If then checksum is successfully validated return success. */ static int vdev_indirect_splits_checksum_validate(indirect_vsd_t *iv, zio_t *zio) { zio_bad_cksum_t zbc; for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) { ASSERT3P(is->is_good_child->ic_data, !=, NULL); ASSERT3P(is->is_good_child->ic_duplicate, ==, NULL); abd_copy_off(zio->io_abd, is->is_good_child->ic_data, is->is_split_offset, 0, is->is_size); } return (zio_checksum_error(zio, &zbc)); } /* * There are relatively few possible combinations making it feasible to * deterministically check them all. We do this by setting the good_child * to the next unique split version. If we reach the end of the list then * "carry over" to the next unique split version (like counting in base * is_unique_children, but each digit can have a different base). */ static int vdev_indirect_splits_enumerate_all(indirect_vsd_t *iv, zio_t *zio) { boolean_t more = B_TRUE; iv->iv_attempts = 0; for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) is->is_good_child = list_head(&is->is_unique_child); while (more == B_TRUE) { iv->iv_attempts++; more = B_FALSE; if (vdev_indirect_splits_checksum_validate(iv, zio) == 0) return (0); for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) { is->is_good_child = list_next(&is->is_unique_child, is->is_good_child); if (is->is_good_child != NULL) { more = B_TRUE; break; } is->is_good_child = list_head(&is->is_unique_child); } } ASSERT3S(iv->iv_attempts, <=, iv->iv_unique_combinations); return (SET_ERROR(ECKSUM)); } /* * There are too many combinations to try all of them in a reasonable amount * of time. So try a fixed number of random combinations from the unique * split versions, after which we'll consider the block unrecoverable. */ static int vdev_indirect_splits_enumerate_randomly(indirect_vsd_t *iv, zio_t *zio) { iv->iv_attempts = 0; while (iv->iv_attempts < iv->iv_attempts_max) { iv->iv_attempts++; for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) { indirect_child_t *ic = list_head(&is->is_unique_child); int children = is->is_unique_children; for (int i = spa_get_random(children); i > 0; i--) ic = list_next(&is->is_unique_child, ic); ASSERT3P(ic, !=, NULL); is->is_good_child = ic; } if (vdev_indirect_splits_checksum_validate(iv, zio) == 0) return (0); } return (SET_ERROR(ECKSUM)); } /* * This is a validation function for reconstruction. It randomly selects * a good combination, if one can be found, and then it intentionally * damages all other segment copes by zeroing them. This forces the * reconstruction algorithm to locate the one remaining known good copy. */ static int vdev_indirect_splits_damage(indirect_vsd_t *iv, zio_t *zio) { int error; /* Presume all the copies are unique for initial selection. */ for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) { is->is_unique_children = 0; for (int i = 0; i < is->is_children; i++) { indirect_child_t *ic = &is->is_child[i]; if (ic->ic_data != NULL) { is->is_unique_children++; list_insert_tail(&is->is_unique_child, ic); } } if (list_is_empty(&is->is_unique_child)) { error = SET_ERROR(EIO); goto out; } } /* * Set each is_good_child to a randomly-selected child which * is known to contain validated data. */ error = vdev_indirect_splits_enumerate_randomly(iv, zio); if (error) goto out; /* * Damage all but the known good copy by zeroing it. This will * result in two or less unique copies per indirect_child_t. * Both may need to be checked in order to reconstruct the block. * Set iv->iv_attempts_max such that all unique combinations will * enumerated, but limit the damage to at most 12 indirect splits. */ iv->iv_attempts_max = 1; for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) { for (int c = 0; c < is->is_children; c++) { indirect_child_t *ic = &is->is_child[c]; if (ic == is->is_good_child) continue; if (ic->ic_data == NULL) continue; abd_zero(ic->ic_data, abd_get_size(ic->ic_data)); } iv->iv_attempts_max *= 2; if (iv->iv_attempts_max >= (1ULL << 12)) { iv->iv_attempts_max = UINT64_MAX; break; } } out: /* Empty the unique children lists so they can be reconstructed. */ for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) { indirect_child_t *ic; while ((ic = list_head(&is->is_unique_child)) != NULL) list_remove(&is->is_unique_child, ic); is->is_unique_children = 0; } return (error); } /* * This function is called when we have read all copies of the data and need * to try to find a combination of copies that gives us the right checksum. * * If we pointed to any mirror vdevs, this effectively does the job of the * mirror. The mirror vdev code can't do its own job because we don't know * the checksum of each split segment individually. * * We have to try every unique combination of copies of split segments, until * we find one that checksums correctly. Duplicate segment copies are first * identified and latter skipped during reconstruction. This optimization * reduces the search space and ensures that of the remaining combinations * at most one is correct. * * When the total number of combinations is small they can all be checked. * For example, if we have 3 segments in the split, and each points to a * 2-way mirror with unique copies, we will have the following pieces of data: * * | mirror child * split | [0] [1] * ======|===================== * A | data_A_0 data_A_1 * B | data_B_0 data_B_1 * C | data_C_0 data_C_1 * * We will try the following (mirror children)^(number of splits) (2^3=8) * combinations, which is similar to bitwise-little-endian counting in * binary. In general each "digit" corresponds to a split segment, and the * base of each digit is is_children, which can be different for each * digit. * * "low bit" "high bit" * v v * data_A_0 data_B_0 data_C_0 * data_A_1 data_B_0 data_C_0 * data_A_0 data_B_1 data_C_0 * data_A_1 data_B_1 data_C_0 * data_A_0 data_B_0 data_C_1 * data_A_1 data_B_0 data_C_1 * data_A_0 data_B_1 data_C_1 * data_A_1 data_B_1 data_C_1 * * Note that the split segments may be on the same or different top-level * vdevs. In either case, we may need to try lots of combinations (see * zfs_reconstruct_indirect_combinations_max). This ensures that if a mirror * has small silent errors on all of its children, we can still reconstruct * the correct data, as long as those errors are at sufficiently-separated * offsets (specifically, separated by the largest block size - default of * 128KB, but up to 16MB). */ static void vdev_indirect_reconstruct_io_done(zio_t *zio) { indirect_vsd_t *iv = zio->io_vsd; boolean_t known_good = B_FALSE; int error; iv->iv_unique_combinations = 1; iv->iv_attempts_max = UINT64_MAX; if (zfs_reconstruct_indirect_combinations_max > 0) iv->iv_attempts_max = zfs_reconstruct_indirect_combinations_max; /* * If nonzero, every 1/x blocks will be damaged, in order to validate * reconstruction when there are split segments with damaged copies. * Known_good will be TRUE when reconstruction is known to be possible. */ if (zfs_reconstruct_indirect_damage_fraction != 0 && spa_get_random(zfs_reconstruct_indirect_damage_fraction) == 0) known_good = (vdev_indirect_splits_damage(iv, zio) == 0); /* * Determine the unique children for a split segment and add them * to the is_unique_child list. By restricting reconstruction * to these children, only unique combinations will be considered. * This can vastly reduce the search space when there are a large * number of indirect splits. */ for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) { is->is_unique_children = 0; for (int i = 0; i < is->is_children; i++) { indirect_child_t *ic_i = &is->is_child[i]; if (ic_i->ic_data == NULL || ic_i->ic_duplicate != NULL) continue; for (int j = i + 1; j < is->is_children; j++) { indirect_child_t *ic_j = &is->is_child[j]; if (ic_j->ic_data == NULL || ic_j->ic_duplicate != NULL) continue; if (abd_cmp(ic_i->ic_data, ic_j->ic_data) == 0) ic_j->ic_duplicate = ic_i; } is->is_unique_children++; list_insert_tail(&is->is_unique_child, ic_i); } /* Reconstruction is impossible, no valid children */ EQUIV(list_is_empty(&is->is_unique_child), is->is_unique_children == 0); if (list_is_empty(&is->is_unique_child)) { zio->io_error = EIO; vdev_indirect_all_checksum_errors(zio); zio_checksum_verified(zio); return; } iv->iv_unique_combinations *= is->is_unique_children; } if (iv->iv_unique_combinations <= iv->iv_attempts_max) error = vdev_indirect_splits_enumerate_all(iv, zio); else error = vdev_indirect_splits_enumerate_randomly(iv, zio); if (error != 0) { /* All attempted combinations failed. */ ASSERT3B(known_good, ==, B_FALSE); zio->io_error = error; vdev_indirect_all_checksum_errors(zio); } else { /* * The checksum has been successfully validated. Issue * repair I/Os to any copies of splits which don't match * the validated version. */ ASSERT0(vdev_indirect_splits_checksum_validate(iv, zio)); vdev_indirect_repair(zio); zio_checksum_verified(zio); } } static void vdev_indirect_io_done(zio_t *zio) { indirect_vsd_t *iv = zio->io_vsd; if (iv->iv_reconstruct) { /* * We have read all copies of the data (e.g. from mirrors), * either because this was a scrub/resilver, or because the * one-copy read didn't checksum correctly. */ vdev_indirect_reconstruct_io_done(zio); return; } if (!iv->iv_split_block) { /* * This was not a split block, so we passed the BP down, * and the checksum was handled by the (one) child zio. */ return; } zio_bad_cksum_t zbc; int ret = zio_checksum_error(zio, &zbc); if (ret == 0) { zio_checksum_verified(zio); return; } /* * The checksum didn't match. Read all copies of all splits, and * then we will try to reconstruct. The next time * vdev_indirect_io_done() is called, iv_reconstruct will be set. */ vdev_indirect_read_all(zio); zio_vdev_io_redone(zio); } vdev_ops_t vdev_indirect_ops = { .vdev_op_open = vdev_indirect_open, .vdev_op_close = vdev_indirect_close, .vdev_op_asize = vdev_default_asize, .vdev_op_io_start = vdev_indirect_io_start, .vdev_op_io_done = vdev_indirect_io_done, .vdev_op_state_change = NULL, .vdev_op_need_resilver = NULL, .vdev_op_hold = NULL, .vdev_op_rele = NULL, .vdev_op_remap = vdev_indirect_remap, .vdev_op_xlate = NULL, .vdev_op_type = VDEV_TYPE_INDIRECT, /* name of this vdev type */ .vdev_op_leaf = B_FALSE /* leaf vdev */ }; EXPORT_SYMBOL(spa_condense_fini); EXPORT_SYMBOL(spa_start_indirect_condensing_thread); EXPORT_SYMBOL(spa_condense_indirect_start_sync); EXPORT_SYMBOL(spa_condense_init); EXPORT_SYMBOL(spa_vdev_indirect_mark_obsolete); EXPORT_SYMBOL(vdev_indirect_mark_obsolete); EXPORT_SYMBOL(vdev_indirect_should_condense); EXPORT_SYMBOL(vdev_indirect_sync_obsolete); EXPORT_SYMBOL(vdev_obsolete_counts_are_precise); EXPORT_SYMBOL(vdev_obsolete_sm_object); /* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, indirect_vdevs_enable, INT, ZMOD_RW, "Whether to attempt condensing indirect vdev mappings"); ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, min_mapping_bytes, ULONG, ZMOD_RW, "Don't bother condensing if the mapping uses less than this amount of " "memory"); ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, max_obsolete_bytes, ULONG, ZMOD_RW, "Minimum size obsolete spacemap to attempt condensing"); ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, indirect_commit_entry_delay_ms, INT, ZMOD_RW, "Used by tests to ensure certain actions happen in the middle of a " "condense. A maximum value of 1 should be sufficient."); ZFS_MODULE_PARAM(zfs_reconstruct, zfs_reconstruct_, indirect_combinations_max, INT, ZMOD_RW, "Maximum number of combinations when reconstructing split segments"); /* END CSTYLED */ diff --git a/module/zfs/vdev_initialize.c b/module/zfs/vdev_initialize.c index ab711441d9ca..7ff7fffcc80e 100644 --- a/module/zfs/vdev_initialize.c +++ b/module/zfs/vdev_initialize.c @@ -1,750 +1,749 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2016, 2019 by Delphix. All rights reserved. */ #include #include #include #include #include #include #include #include #include /* * Value that is written to disk during initialization. */ #ifdef _ILP32 unsigned long zfs_initialize_value = 0xdeadbeefUL; #else unsigned long zfs_initialize_value = 0xdeadbeefdeadbeeeULL; #endif /* maximum number of I/Os outstanding per leaf vdev */ int zfs_initialize_limit = 1; /* size of initializing writes; default 1MiB, see zfs_remove_max_segment */ unsigned long zfs_initialize_chunk_size = 1024 * 1024; static boolean_t vdev_initialize_should_stop(vdev_t *vd) { return (vd->vdev_initialize_exit_wanted || !vdev_writeable(vd) || vd->vdev_detached || vd->vdev_top->vdev_removing); } static void vdev_initialize_zap_update_sync(void *arg, dmu_tx_t *tx) { /* * We pass in the guid instead of the vdev_t since the vdev may * have been freed prior to the sync task being processed. This * happens when a vdev is detached as we call spa_config_vdev_exit(), * stop the initializing thread, schedule the sync task, and free * the vdev. Later when the scheduled sync task is invoked, it would * find that the vdev has been freed. */ uint64_t guid = *(uint64_t *)arg; uint64_t txg = dmu_tx_get_txg(tx); kmem_free(arg, sizeof (uint64_t)); vdev_t *vd = spa_lookup_by_guid(tx->tx_pool->dp_spa, guid, B_FALSE); if (vd == NULL || vd->vdev_top->vdev_removing || !vdev_is_concrete(vd)) return; uint64_t last_offset = vd->vdev_initialize_offset[txg & TXG_MASK]; vd->vdev_initialize_offset[txg & TXG_MASK] = 0; VERIFY(vd->vdev_leaf_zap != 0); objset_t *mos = vd->vdev_spa->spa_meta_objset; if (last_offset > 0) { vd->vdev_initialize_last_offset = last_offset; VERIFY0(zap_update(mos, vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET, sizeof (last_offset), 1, &last_offset, tx)); } if (vd->vdev_initialize_action_time > 0) { uint64_t val = (uint64_t)vd->vdev_initialize_action_time; VERIFY0(zap_update(mos, vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME, sizeof (val), 1, &val, tx)); } uint64_t initialize_state = vd->vdev_initialize_state; VERIFY0(zap_update(mos, vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_STATE, sizeof (initialize_state), 1, &initialize_state, tx)); } static void vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state) { ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock)); spa_t *spa = vd->vdev_spa; if (new_state == vd->vdev_initialize_state) return; /* * Copy the vd's guid, this will be freed by the sync task. */ uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP); *guid = vd->vdev_guid; /* * If we're suspending, then preserving the original start time. */ if (vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED) { vd->vdev_initialize_action_time = gethrestime_sec(); } vd->vdev_initialize_state = new_state; dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); dsl_sync_task_nowait(spa_get_dsl(spa), vdev_initialize_zap_update_sync, - guid, 2, ZFS_SPACE_CHECK_NONE, tx); + guid, tx); switch (new_state) { case VDEV_INITIALIZE_ACTIVE: spa_history_log_internal(spa, "initialize", tx, "vdev=%s activated", vd->vdev_path); break; case VDEV_INITIALIZE_SUSPENDED: spa_history_log_internal(spa, "initialize", tx, "vdev=%s suspended", vd->vdev_path); break; case VDEV_INITIALIZE_CANCELED: spa_history_log_internal(spa, "initialize", tx, "vdev=%s canceled", vd->vdev_path); break; case VDEV_INITIALIZE_COMPLETE: spa_history_log_internal(spa, "initialize", tx, "vdev=%s complete", vd->vdev_path); break; default: panic("invalid state %llu", (unsigned long long)new_state); } dmu_tx_commit(tx); if (new_state != VDEV_INITIALIZE_ACTIVE) spa_notify_waiters(spa); } static void vdev_initialize_cb(zio_t *zio) { vdev_t *vd = zio->io_vd; mutex_enter(&vd->vdev_initialize_io_lock); if (zio->io_error == ENXIO && !vdev_writeable(vd)) { /* * The I/O failed because the vdev was unavailable; roll the * last offset back. (This works because spa_sync waits on * spa_txg_zio before it runs sync tasks.) */ uint64_t *off = &vd->vdev_initialize_offset[zio->io_txg & TXG_MASK]; *off = MIN(*off, zio->io_offset); } else { /* * Since initializing is best-effort, we ignore I/O errors and * rely on vdev_probe to determine if the errors are more * critical. */ if (zio->io_error != 0) vd->vdev_stat.vs_initialize_errors++; vd->vdev_initialize_bytes_done += zio->io_orig_size; } ASSERT3U(vd->vdev_initialize_inflight, >, 0); vd->vdev_initialize_inflight--; cv_broadcast(&vd->vdev_initialize_io_cv); mutex_exit(&vd->vdev_initialize_io_lock); spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd); } /* Takes care of physical writing and limiting # of concurrent ZIOs. */ static int vdev_initialize_write(vdev_t *vd, uint64_t start, uint64_t size, abd_t *data) { spa_t *spa = vd->vdev_spa; /* Limit inflight initializing I/Os */ mutex_enter(&vd->vdev_initialize_io_lock); while (vd->vdev_initialize_inflight >= zfs_initialize_limit) { cv_wait(&vd->vdev_initialize_io_cv, &vd->vdev_initialize_io_lock); } vd->vdev_initialize_inflight++; mutex_exit(&vd->vdev_initialize_io_lock); dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); uint64_t txg = dmu_tx_get_txg(tx); spa_config_enter(spa, SCL_STATE_ALL, vd, RW_READER); mutex_enter(&vd->vdev_initialize_lock); if (vd->vdev_initialize_offset[txg & TXG_MASK] == 0) { uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP); *guid = vd->vdev_guid; /* This is the first write of this txg. */ dsl_sync_task_nowait(spa_get_dsl(spa), - vdev_initialize_zap_update_sync, guid, 2, - ZFS_SPACE_CHECK_RESERVED, tx); + vdev_initialize_zap_update_sync, guid, tx); } /* * We know the vdev struct will still be around since all * consumers of vdev_free must stop the initialization first. */ if (vdev_initialize_should_stop(vd)) { mutex_enter(&vd->vdev_initialize_io_lock); ASSERT3U(vd->vdev_initialize_inflight, >, 0); vd->vdev_initialize_inflight--; mutex_exit(&vd->vdev_initialize_io_lock); spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd); mutex_exit(&vd->vdev_initialize_lock); dmu_tx_commit(tx); return (SET_ERROR(EINTR)); } mutex_exit(&vd->vdev_initialize_lock); vd->vdev_initialize_offset[txg & TXG_MASK] = start + size; zio_nowait(zio_write_phys(spa->spa_txg_zio[txg & TXG_MASK], vd, start, size, data, ZIO_CHECKSUM_OFF, vdev_initialize_cb, NULL, ZIO_PRIORITY_INITIALIZING, ZIO_FLAG_CANFAIL, B_FALSE)); /* vdev_initialize_cb releases SCL_STATE_ALL */ dmu_tx_commit(tx); return (0); } /* * Callback to fill each ABD chunk with zfs_initialize_value. len must be * divisible by sizeof (uint64_t), and buf must be 8-byte aligned. The ABD * allocation will guarantee these for us. */ /* ARGSUSED */ static int vdev_initialize_block_fill(void *buf, size_t len, void *unused) { ASSERT0(len % sizeof (uint64_t)); #ifdef _ILP32 for (uint64_t i = 0; i < len; i += sizeof (uint32_t)) { *(uint32_t *)((char *)(buf) + i) = zfs_initialize_value; } #else for (uint64_t i = 0; i < len; i += sizeof (uint64_t)) { *(uint64_t *)((char *)(buf) + i) = zfs_initialize_value; } #endif return (0); } static abd_t * vdev_initialize_block_alloc(void) { /* Allocate ABD for filler data */ abd_t *data = abd_alloc_for_io(zfs_initialize_chunk_size, B_FALSE); ASSERT0(zfs_initialize_chunk_size % sizeof (uint64_t)); (void) abd_iterate_func(data, 0, zfs_initialize_chunk_size, vdev_initialize_block_fill, NULL); return (data); } static void vdev_initialize_block_free(abd_t *data) { abd_free(data); } static int vdev_initialize_ranges(vdev_t *vd, abd_t *data) { range_tree_t *rt = vd->vdev_initialize_tree; zfs_btree_t *bt = &rt->rt_root; zfs_btree_index_t where; for (range_seg_t *rs = zfs_btree_first(bt, &where); rs != NULL; rs = zfs_btree_next(bt, &where, &where)) { uint64_t size = rs_get_end(rs, rt) - rs_get_start(rs, rt); /* Split range into legally-sized physical chunks */ uint64_t writes_required = ((size - 1) / zfs_initialize_chunk_size) + 1; for (uint64_t w = 0; w < writes_required; w++) { int error; error = vdev_initialize_write(vd, VDEV_LABEL_START_SIZE + rs_get_start(rs, rt) + (w * zfs_initialize_chunk_size), MIN(size - (w * zfs_initialize_chunk_size), zfs_initialize_chunk_size), data); if (error != 0) return (error); } } return (0); } static void vdev_initialize_calculate_progress(vdev_t *vd) { ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) || spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER)); ASSERT(vd->vdev_leaf_zap != 0); vd->vdev_initialize_bytes_est = 0; vd->vdev_initialize_bytes_done = 0; for (uint64_t i = 0; i < vd->vdev_top->vdev_ms_count; i++) { metaslab_t *msp = vd->vdev_top->vdev_ms[i]; mutex_enter(&msp->ms_lock); uint64_t ms_free = msp->ms_size - metaslab_allocated_space(msp); if (vd->vdev_top->vdev_ops == &vdev_raidz_ops) ms_free /= vd->vdev_top->vdev_children; /* * Convert the metaslab range to a physical range * on our vdev. We use this to determine if we are * in the middle of this metaslab range. */ range_seg64_t logical_rs, physical_rs; logical_rs.rs_start = msp->ms_start; logical_rs.rs_end = msp->ms_start + msp->ms_size; vdev_xlate(vd, &logical_rs, &physical_rs); if (vd->vdev_initialize_last_offset <= physical_rs.rs_start) { vd->vdev_initialize_bytes_est += ms_free; mutex_exit(&msp->ms_lock); continue; } else if (vd->vdev_initialize_last_offset > physical_rs.rs_end) { vd->vdev_initialize_bytes_done += ms_free; vd->vdev_initialize_bytes_est += ms_free; mutex_exit(&msp->ms_lock); continue; } /* * If we get here, we're in the middle of initializing this * metaslab. Load it and walk the free tree for more accurate * progress estimation. */ VERIFY0(metaslab_load(msp)); zfs_btree_index_t where; range_tree_t *rt = msp->ms_allocatable; for (range_seg_t *rs = zfs_btree_first(&rt->rt_root, &where); rs; rs = zfs_btree_next(&rt->rt_root, &where, &where)) { logical_rs.rs_start = rs_get_start(rs, rt); logical_rs.rs_end = rs_get_end(rs, rt); vdev_xlate(vd, &logical_rs, &physical_rs); uint64_t size = physical_rs.rs_end - physical_rs.rs_start; vd->vdev_initialize_bytes_est += size; if (vd->vdev_initialize_last_offset > physical_rs.rs_end) { vd->vdev_initialize_bytes_done += size; } else if (vd->vdev_initialize_last_offset > physical_rs.rs_start && vd->vdev_initialize_last_offset < physical_rs.rs_end) { vd->vdev_initialize_bytes_done += vd->vdev_initialize_last_offset - physical_rs.rs_start; } } mutex_exit(&msp->ms_lock); } } static int vdev_initialize_load(vdev_t *vd) { int err = 0; ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) || spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER)); ASSERT(vd->vdev_leaf_zap != 0); if (vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE || vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED) { err = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET, sizeof (vd->vdev_initialize_last_offset), 1, &vd->vdev_initialize_last_offset); if (err == ENOENT) { vd->vdev_initialize_last_offset = 0; err = 0; } } vdev_initialize_calculate_progress(vd); return (err); } /* * Convert the logical range into a physical range and add it to our * avl tree. */ static void vdev_initialize_range_add(void *arg, uint64_t start, uint64_t size) { vdev_t *vd = arg; range_seg64_t logical_rs, physical_rs; logical_rs.rs_start = start; logical_rs.rs_end = start + size; ASSERT(vd->vdev_ops->vdev_op_leaf); vdev_xlate(vd, &logical_rs, &physical_rs); IMPLY(vd->vdev_top == vd, logical_rs.rs_start == physical_rs.rs_start); IMPLY(vd->vdev_top == vd, logical_rs.rs_end == physical_rs.rs_end); /* Only add segments that we have not visited yet */ if (physical_rs.rs_end <= vd->vdev_initialize_last_offset) return; /* Pick up where we left off mid-range. */ if (vd->vdev_initialize_last_offset > physical_rs.rs_start) { zfs_dbgmsg("range write: vd %s changed (%llu, %llu) to " "(%llu, %llu)", vd->vdev_path, (u_longlong_t)physical_rs.rs_start, (u_longlong_t)physical_rs.rs_end, (u_longlong_t)vd->vdev_initialize_last_offset, (u_longlong_t)physical_rs.rs_end); ASSERT3U(physical_rs.rs_end, >, vd->vdev_initialize_last_offset); physical_rs.rs_start = vd->vdev_initialize_last_offset; } ASSERT3U(physical_rs.rs_end, >=, physical_rs.rs_start); /* * With raidz, it's possible that the logical range does not live on * this leaf vdev. We only add the physical range to this vdev's if it * has a length greater than 0. */ if (physical_rs.rs_end > physical_rs.rs_start) { range_tree_add(vd->vdev_initialize_tree, physical_rs.rs_start, physical_rs.rs_end - physical_rs.rs_start); } else { ASSERT3U(physical_rs.rs_end, ==, physical_rs.rs_start); } } static void vdev_initialize_thread(void *arg) { vdev_t *vd = arg; spa_t *spa = vd->vdev_spa; int error = 0; uint64_t ms_count = 0; ASSERT(vdev_is_concrete(vd)); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); vd->vdev_initialize_last_offset = 0; VERIFY0(vdev_initialize_load(vd)); abd_t *deadbeef = vdev_initialize_block_alloc(); vd->vdev_initialize_tree = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); for (uint64_t i = 0; !vd->vdev_detached && i < vd->vdev_top->vdev_ms_count; i++) { metaslab_t *msp = vd->vdev_top->vdev_ms[i]; boolean_t unload_when_done = B_FALSE; /* * If we've expanded the top-level vdev or it's our * first pass, calculate our progress. */ if (vd->vdev_top->vdev_ms_count != ms_count) { vdev_initialize_calculate_progress(vd); ms_count = vd->vdev_top->vdev_ms_count; } spa_config_exit(spa, SCL_CONFIG, FTAG); metaslab_disable(msp); mutex_enter(&msp->ms_lock); if (!msp->ms_loaded && !msp->ms_loading) unload_when_done = B_TRUE; VERIFY0(metaslab_load(msp)); range_tree_walk(msp->ms_allocatable, vdev_initialize_range_add, vd); mutex_exit(&msp->ms_lock); error = vdev_initialize_ranges(vd, deadbeef); metaslab_enable(msp, B_TRUE, unload_when_done); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); range_tree_vacate(vd->vdev_initialize_tree, NULL, NULL); if (error != 0) break; } spa_config_exit(spa, SCL_CONFIG, FTAG); mutex_enter(&vd->vdev_initialize_io_lock); while (vd->vdev_initialize_inflight > 0) { cv_wait(&vd->vdev_initialize_io_cv, &vd->vdev_initialize_io_lock); } mutex_exit(&vd->vdev_initialize_io_lock); range_tree_destroy(vd->vdev_initialize_tree); vdev_initialize_block_free(deadbeef); vd->vdev_initialize_tree = NULL; mutex_enter(&vd->vdev_initialize_lock); if (!vd->vdev_initialize_exit_wanted && vdev_writeable(vd)) { vdev_initialize_change_state(vd, VDEV_INITIALIZE_COMPLETE); } ASSERT(vd->vdev_initialize_thread != NULL || vd->vdev_initialize_inflight == 0); /* * Drop the vdev_initialize_lock while we sync out the * txg since it's possible that a device might be trying to * come online and must check to see if it needs to restart an * initialization. That thread will be holding the spa_config_lock * which would prevent the txg_wait_synced from completing. */ mutex_exit(&vd->vdev_initialize_lock); txg_wait_synced(spa_get_dsl(spa), 0); mutex_enter(&vd->vdev_initialize_lock); vd->vdev_initialize_thread = NULL; cv_broadcast(&vd->vdev_initialize_cv); mutex_exit(&vd->vdev_initialize_lock); thread_exit(); } /* * Initiates a device. Caller must hold vdev_initialize_lock. * Device must be a leaf and not already be initializing. */ void vdev_initialize(vdev_t *vd) { ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock)); ASSERT(vd->vdev_ops->vdev_op_leaf); ASSERT(vdev_is_concrete(vd)); ASSERT3P(vd->vdev_initialize_thread, ==, NULL); ASSERT(!vd->vdev_detached); ASSERT(!vd->vdev_initialize_exit_wanted); ASSERT(!vd->vdev_top->vdev_removing); vdev_initialize_change_state(vd, VDEV_INITIALIZE_ACTIVE); vd->vdev_initialize_thread = thread_create(NULL, 0, vdev_initialize_thread, vd, 0, &p0, TS_RUN, maxclsyspri); } /* * Wait for the initialize thread to be terminated (cancelled or stopped). */ static void vdev_initialize_stop_wait_impl(vdev_t *vd) { ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock)); while (vd->vdev_initialize_thread != NULL) cv_wait(&vd->vdev_initialize_cv, &vd->vdev_initialize_lock); ASSERT3P(vd->vdev_initialize_thread, ==, NULL); vd->vdev_initialize_exit_wanted = B_FALSE; } /* * Wait for vdev initialize threads which were either to cleanly exit. */ void vdev_initialize_stop_wait(spa_t *spa, list_t *vd_list) { vdev_t *vd; ASSERT(MUTEX_HELD(&spa_namespace_lock)); while ((vd = list_remove_head(vd_list)) != NULL) { mutex_enter(&vd->vdev_initialize_lock); vdev_initialize_stop_wait_impl(vd); mutex_exit(&vd->vdev_initialize_lock); } } /* * Stop initializing a device, with the resultant initializing state being * tgt_state. For blocking behavior pass NULL for vd_list. Otherwise, when * a list_t is provided the stopping vdev is inserted in to the list. Callers * are then required to call vdev_initialize_stop_wait() to block for all the * initialization threads to exit. The caller must hold vdev_initialize_lock * and must not be writing to the spa config, as the initializing thread may * try to enter the config as a reader before exiting. */ void vdev_initialize_stop(vdev_t *vd, vdev_initializing_state_t tgt_state, list_t *vd_list) { ASSERT(!spa_config_held(vd->vdev_spa, SCL_CONFIG|SCL_STATE, RW_WRITER)); ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock)); ASSERT(vd->vdev_ops->vdev_op_leaf); ASSERT(vdev_is_concrete(vd)); /* * Allow cancel requests to proceed even if the initialize thread * has stopped. */ if (vd->vdev_initialize_thread == NULL && tgt_state != VDEV_INITIALIZE_CANCELED) { return; } vdev_initialize_change_state(vd, tgt_state); vd->vdev_initialize_exit_wanted = B_TRUE; if (vd_list == NULL) { vdev_initialize_stop_wait_impl(vd); } else { ASSERT(MUTEX_HELD(&spa_namespace_lock)); list_insert_tail(vd_list, vd); } } static void vdev_initialize_stop_all_impl(vdev_t *vd, vdev_initializing_state_t tgt_state, list_t *vd_list) { if (vd->vdev_ops->vdev_op_leaf && vdev_is_concrete(vd)) { mutex_enter(&vd->vdev_initialize_lock); vdev_initialize_stop(vd, tgt_state, vd_list); mutex_exit(&vd->vdev_initialize_lock); return; } for (uint64_t i = 0; i < vd->vdev_children; i++) { vdev_initialize_stop_all_impl(vd->vdev_child[i], tgt_state, vd_list); } } /* * Convenience function to stop initializing of a vdev tree and set all * initialize thread pointers to NULL. */ void vdev_initialize_stop_all(vdev_t *vd, vdev_initializing_state_t tgt_state) { spa_t *spa = vd->vdev_spa; list_t vd_list; ASSERT(MUTEX_HELD(&spa_namespace_lock)); list_create(&vd_list, sizeof (vdev_t), offsetof(vdev_t, vdev_initialize_node)); vdev_initialize_stop_all_impl(vd, tgt_state, &vd_list); vdev_initialize_stop_wait(spa, &vd_list); if (vd->vdev_spa->spa_sync_on) { /* Make sure that our state has been synced to disk */ txg_wait_synced(spa_get_dsl(vd->vdev_spa), 0); } list_destroy(&vd_list); } void vdev_initialize_restart(vdev_t *vd) { ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(!spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER)); if (vd->vdev_leaf_zap != 0) { mutex_enter(&vd->vdev_initialize_lock); uint64_t initialize_state = VDEV_INITIALIZE_NONE; int err = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_STATE, sizeof (initialize_state), 1, &initialize_state); ASSERT(err == 0 || err == ENOENT); vd->vdev_initialize_state = initialize_state; uint64_t timestamp = 0; err = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME, sizeof (timestamp), 1, ×tamp); ASSERT(err == 0 || err == ENOENT); vd->vdev_initialize_action_time = timestamp; if (vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED || vd->vdev_offline) { /* load progress for reporting, but don't resume */ VERIFY0(vdev_initialize_load(vd)); } else if (vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE && vdev_writeable(vd) && !vd->vdev_top->vdev_removing && vd->vdev_initialize_thread == NULL) { vdev_initialize(vd); } mutex_exit(&vd->vdev_initialize_lock); } for (uint64_t i = 0; i < vd->vdev_children; i++) { vdev_initialize_restart(vd->vdev_child[i]); } } EXPORT_SYMBOL(vdev_initialize); EXPORT_SYMBOL(vdev_initialize_stop); EXPORT_SYMBOL(vdev_initialize_stop_all); EXPORT_SYMBOL(vdev_initialize_stop_wait); EXPORT_SYMBOL(vdev_initialize_restart); /* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs, zfs_, initialize_value, ULONG, ZMOD_RW, "Value written during zpool initialize"); ZFS_MODULE_PARAM(zfs, zfs_, initialize_chunk_size, ULONG, ZMOD_RW, "Size in bytes of writes by zpool initialize"); /* END CSTYLED */ diff --git a/module/zfs/vdev_rebuild.c b/module/zfs/vdev_rebuild.c index 85ed8afe1cf4..3362d608c037 100644 --- a/module/zfs/vdev_rebuild.c +++ b/module/zfs/vdev_rebuild.c @@ -1,1108 +1,1104 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * * Copyright (c) 2018, Intel Corporation. * Copyright (c) 2020 by Lawrence Livermore National Security, LLC. */ #include #include #include #include #include #include #include #include #include /* * This file contains the sequential reconstruction implementation for * resilvering. This form of resilvering is internally referred to as device * rebuild to avoid conflating it with the traditional healing reconstruction * performed by the dsl scan code. * * When replacing a device, or scrubbing the pool, ZFS has historically used * a process called resilvering which is a form of healing reconstruction. * This approach has the advantage that as blocks are read from disk their * checksums can be immediately verified and the data repaired. Unfortunately, * it also results in a random IO pattern to the disk even when extra care * is taken to sequentialize the IO as much as possible. This substantially * increases the time required to resilver the pool and restore redundancy. * * For mirrored devices it's possible to implement an alternate sequential * reconstruction strategy when resilvering. Sequential reconstruction * behaves like a traditional RAID rebuild and reconstructs a device in LBA * order without verifying the checksum. After this phase completes a second * scrub phase is started to verify all of the checksums. This two phase * process will take longer than the healing reconstruction described above. * However, it has that advantage that after the reconstruction first phase * completes redundancy has been restored. At this point the pool can incur * another device failure without risking data loss. * * There are a few noteworthy limitations and other advantages of resilvering * using sequential reconstruction vs healing reconstruction. * * Limitations: * * - Only supported for mirror vdev types. Due to the variable stripe * width used by raidz sequential reconstruction is not possible. * * - Block checksums are not verified during sequential reconstuction. * Similar to traditional RAID the parity/mirror data is reconstructed * but cannot be immediately double checked. For this reason when the * last active resilver completes the pool is automatically scrubbed. * * - Deferred resilvers using sequential reconstruction are not currently * supported. When adding another vdev to an active top-level resilver * it must be restarted. * * Advantages: * * - Sequential reconstuction is performed in LBA order which may be faster * than healing reconstuction particularly when using using HDDs (or * especially with SMR devices). Only allocated capacity is resilvered. * * - Sequential reconstruction is not constrained by ZFS block boundaries. * This allows it to issue larger IOs to disk which span multiple blocks * allowing all of these logical blocks to be repaired with a single IO. * * - Unlike a healing resilver or scrub which are pool wide operations, * sequential reconstruction is handled by the top-level mirror vdevs. * This allows for it to be started or canceled on a top-level vdev * without impacting any other top-level vdevs in the pool. * * - Data only referenced by a pool checkpoint will be repaired because * that space is reflected in the space maps. This differs for a * healing resilver or scrub which will not repair that data. */ /* * Maximum number of queued rebuild I/Os top-level vdev. The number of * concurrent rebuild I/Os issued to the device is controlled by the * zfs_vdev_rebuild_min_active and zfs_vdev_rebuild_max_active module * options. */ unsigned int zfs_rebuild_queue_limit = 20; /* * Size of rebuild reads; defaults to 1MiB and is capped at SPA_MAXBLOCKSIZE. */ unsigned long zfs_rebuild_max_segment = 1024 * 1024; /* * For vdev_rebuild_initiate_sync() and vdev_rebuild_reset_sync(). */ static void vdev_rebuild_thread(void *arg); /* * Clear the per-vdev rebuild bytes value for a vdev tree. */ static void clear_rebuild_bytes(vdev_t *vd) { vdev_stat_t *vs = &vd->vdev_stat; for (uint64_t i = 0; i < vd->vdev_children; i++) clear_rebuild_bytes(vd->vdev_child[i]); mutex_enter(&vd->vdev_stat_lock); vs->vs_rebuild_processed = 0; mutex_exit(&vd->vdev_stat_lock); } /* * Determines whether a vdev_rebuild_thread() should be stopped. */ static boolean_t vdev_rebuild_should_stop(vdev_t *vd) { return (!vdev_writeable(vd) || vd->vdev_removing || vd->vdev_rebuild_exit_wanted || vd->vdev_rebuild_cancel_wanted || vd->vdev_rebuild_reset_wanted); } /* * Determine if the rebuild should be canceled. This may happen when all * vdevs with MISSING DTLs are detached. */ static boolean_t vdev_rebuild_should_cancel(vdev_t *vd) { vdev_rebuild_t *vr = &vd->vdev_rebuild_config; vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; if (!vdev_resilver_needed(vd, &vrp->vrp_min_txg, &vrp->vrp_max_txg)) return (B_TRUE); return (B_FALSE); } /* * The sync task for updating the on-disk state of a rebuild. This is * scheduled by vdev_rebuild_range(). */ static void vdev_rebuild_update_sync(void *arg, dmu_tx_t *tx) { int vdev_id = (uintptr_t)arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; vdev_t *vd = vdev_lookup_top(spa, vdev_id); vdev_rebuild_t *vr = &vd->vdev_rebuild_config; vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; uint64_t txg = dmu_tx_get_txg(tx); mutex_enter(&vd->vdev_rebuild_lock); if (vr->vr_scan_offset[txg & TXG_MASK] > 0) { vrp->vrp_last_offset = vr->vr_scan_offset[txg & TXG_MASK]; vr->vr_scan_offset[txg & TXG_MASK] = 0; } vrp->vrp_scan_time_ms = vr->vr_prev_scan_time_ms + NSEC2MSEC(gethrtime() - vr->vr_pass_start_time); VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t), REBUILD_PHYS_ENTRIES, vrp, tx)); mutex_exit(&vd->vdev_rebuild_lock); } /* * Initialize the on-disk state for a new rebuild, start the rebuild thread. */ static void vdev_rebuild_initiate_sync(void *arg, dmu_tx_t *tx) { int vdev_id = (uintptr_t)arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; vdev_t *vd = vdev_lookup_top(spa, vdev_id); vdev_rebuild_t *vr = &vd->vdev_rebuild_config; vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; ASSERT(vd->vdev_rebuilding); spa_feature_incr(vd->vdev_spa, SPA_FEATURE_DEVICE_REBUILD, tx); mutex_enter(&vd->vdev_rebuild_lock); bzero(vrp, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES); vrp->vrp_rebuild_state = VDEV_REBUILD_ACTIVE; vrp->vrp_min_txg = 0; vrp->vrp_max_txg = dmu_tx_get_txg(tx); vrp->vrp_start_time = gethrestime_sec(); vrp->vrp_scan_time_ms = 0; vr->vr_prev_scan_time_ms = 0; /* * Rebuilds are currently only used when replacing a device, in which * case there must be DTL_MISSING entries. In the future, we could * allow rebuilds to be used in a way similar to a scrub. This would * be useful because it would allow us to rebuild the space used by * pool checkpoints. */ VERIFY(vdev_resilver_needed(vd, &vrp->vrp_min_txg, &vrp->vrp_max_txg)); VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t), REBUILD_PHYS_ENTRIES, vrp, tx)); spa_history_log_internal(spa, "rebuild", tx, "vdev_id=%llu vdev_guid=%llu started", (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid); ASSERT3P(vd->vdev_rebuild_thread, ==, NULL); vd->vdev_rebuild_thread = thread_create(NULL, 0, vdev_rebuild_thread, vd, 0, &p0, TS_RUN, maxclsyspri); mutex_exit(&vd->vdev_rebuild_lock); } static void vdev_rebuild_log_notify(spa_t *spa, vdev_t *vd, char *name) { nvlist_t *aux = fnvlist_alloc(); fnvlist_add_string(aux, ZFS_EV_RESILVER_TYPE, "sequential"); spa_event_notify(spa, vd, aux, name); nvlist_free(aux); } /* * Called to request that a new rebuild be started. The feature will remain * active for the duration of the rebuild, then revert to the enabled state. */ static void vdev_rebuild_initiate(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(vd->vdev_top == vd); ASSERT(MUTEX_HELD(&vd->vdev_rebuild_lock)); ASSERT(!vd->vdev_rebuilding); dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); vd->vdev_rebuilding = B_TRUE; dsl_sync_task_nowait(spa_get_dsl(spa), vdev_rebuild_initiate_sync, - (void *)(uintptr_t)vd->vdev_id, 0, ZFS_SPACE_CHECK_NONE, tx); + (void *)(uintptr_t)vd->vdev_id, tx); dmu_tx_commit(tx); vdev_rebuild_log_notify(spa, vd, ESC_ZFS_RESILVER_START); } /* * Update the on-disk state to completed when a rebuild finishes. */ static void vdev_rebuild_complete_sync(void *arg, dmu_tx_t *tx) { int vdev_id = (uintptr_t)arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; vdev_t *vd = vdev_lookup_top(spa, vdev_id); vdev_rebuild_t *vr = &vd->vdev_rebuild_config; vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; mutex_enter(&vd->vdev_rebuild_lock); vrp->vrp_rebuild_state = VDEV_REBUILD_COMPLETE; vrp->vrp_end_time = gethrestime_sec(); VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t), REBUILD_PHYS_ENTRIES, vrp, tx)); vdev_dtl_reassess(vd, tx->tx_txg, vrp->vrp_max_txg, B_TRUE, B_TRUE); spa_feature_decr(vd->vdev_spa, SPA_FEATURE_DEVICE_REBUILD, tx); spa_history_log_internal(spa, "rebuild", tx, "vdev_id=%llu vdev_guid=%llu complete", (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid); vdev_rebuild_log_notify(spa, vd, ESC_ZFS_RESILVER_FINISH); /* Handles detaching of spares */ spa_async_request(spa, SPA_ASYNC_REBUILD_DONE); vd->vdev_rebuilding = B_FALSE; mutex_exit(&vd->vdev_rebuild_lock); spa_notify_waiters(spa); cv_broadcast(&vd->vdev_rebuild_cv); } /* * Update the on-disk state to canceled when a rebuild finishes. */ static void vdev_rebuild_cancel_sync(void *arg, dmu_tx_t *tx) { int vdev_id = (uintptr_t)arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; vdev_t *vd = vdev_lookup_top(spa, vdev_id); vdev_rebuild_t *vr = &vd->vdev_rebuild_config; vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; mutex_enter(&vd->vdev_rebuild_lock); vrp->vrp_rebuild_state = VDEV_REBUILD_CANCELED; vrp->vrp_end_time = gethrestime_sec(); VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t), REBUILD_PHYS_ENTRIES, vrp, tx)); spa_feature_decr(vd->vdev_spa, SPA_FEATURE_DEVICE_REBUILD, tx); spa_history_log_internal(spa, "rebuild", tx, "vdev_id=%llu vdev_guid=%llu canceled", (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid); vdev_rebuild_log_notify(spa, vd, ESC_ZFS_RESILVER_FINISH); vd->vdev_rebuild_cancel_wanted = B_FALSE; vd->vdev_rebuilding = B_FALSE; mutex_exit(&vd->vdev_rebuild_lock); spa_notify_waiters(spa); cv_broadcast(&vd->vdev_rebuild_cv); } /* * Resets the progress of a running rebuild. This will occur when a new * vdev is added to rebuild. */ static void vdev_rebuild_reset_sync(void *arg, dmu_tx_t *tx) { int vdev_id = (uintptr_t)arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; vdev_t *vd = vdev_lookup_top(spa, vdev_id); vdev_rebuild_t *vr = &vd->vdev_rebuild_config; vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; mutex_enter(&vd->vdev_rebuild_lock); ASSERT(vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE); ASSERT3P(vd->vdev_rebuild_thread, ==, NULL); vrp->vrp_last_offset = 0; vrp->vrp_min_txg = 0; vrp->vrp_max_txg = dmu_tx_get_txg(tx); vrp->vrp_bytes_scanned = 0; vrp->vrp_bytes_issued = 0; vrp->vrp_bytes_rebuilt = 0; vrp->vrp_bytes_est = 0; vrp->vrp_scan_time_ms = 0; vr->vr_prev_scan_time_ms = 0; /* See vdev_rebuild_initiate_sync comment */ VERIFY(vdev_resilver_needed(vd, &vrp->vrp_min_txg, &vrp->vrp_max_txg)); VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t), REBUILD_PHYS_ENTRIES, vrp, tx)); spa_history_log_internal(spa, "rebuild", tx, "vdev_id=%llu vdev_guid=%llu reset", (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid); vd->vdev_rebuild_reset_wanted = B_FALSE; ASSERT(vd->vdev_rebuilding); vd->vdev_rebuild_thread = thread_create(NULL, 0, vdev_rebuild_thread, vd, 0, &p0, TS_RUN, maxclsyspri); mutex_exit(&vd->vdev_rebuild_lock); } /* * Clear the last rebuild status. */ void vdev_rebuild_clear_sync(void *arg, dmu_tx_t *tx) { int vdev_id = (uintptr_t)arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; vdev_t *vd = vdev_lookup_top(spa, vdev_id); vdev_rebuild_t *vr = &vd->vdev_rebuild_config; vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; objset_t *mos = spa_meta_objset(spa); mutex_enter(&vd->vdev_rebuild_lock); if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD) || vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE) { mutex_exit(&vd->vdev_rebuild_lock); return; } clear_rebuild_bytes(vd); bzero(vrp, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES); if (vd->vdev_top_zap != 0 && zap_contains(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_VDEV_REBUILD_PHYS) == 0) { VERIFY0(zap_update(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t), REBUILD_PHYS_ENTRIES, vrp, tx)); } mutex_exit(&vd->vdev_rebuild_lock); } /* * The zio_done_func_t callback for each rebuild I/O issued. It's responsible * for updating the rebuild stats and limiting the number of in flight I/Os. */ static void vdev_rebuild_cb(zio_t *zio) { vdev_rebuild_t *vr = zio->io_private; vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; vdev_t *vd = vr->vr_top_vdev; mutex_enter(&vd->vdev_rebuild_io_lock); if (zio->io_error == ENXIO && !vdev_writeable(vd)) { /* * The I/O failed because the top-level vdev was unavailable. * Attempt to roll back to the last completed offset, in order * resume from the correct location if the pool is resumed. * (This works because spa_sync waits on spa_txg_zio before * it runs sync tasks.) */ uint64_t *off = &vr->vr_scan_offset[zio->io_txg & TXG_MASK]; *off = MIN(*off, zio->io_offset); } else if (zio->io_error) { vrp->vrp_errors++; } abd_free(zio->io_abd); ASSERT3U(vd->vdev_rebuild_inflight, >, 0); vd->vdev_rebuild_inflight--; cv_broadcast(&vd->vdev_rebuild_io_cv); mutex_exit(&vd->vdev_rebuild_io_lock); spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd); } /* * Rebuild the data in this range by constructing a special dummy block * pointer for the given range. It has no relation to any existing blocks * in the pool. But by disabling checksum verification and issuing a scrub * I/O mirrored vdevs will replicate the block using any available mirror * leaf vdevs. */ static void vdev_rebuild_rebuild_block(vdev_rebuild_t *vr, uint64_t start, uint64_t asize, uint64_t txg) { vdev_t *vd = vr->vr_top_vdev; spa_t *spa = vd->vdev_spa; uint64_t psize = asize; ASSERT(vd->vdev_ops == &vdev_mirror_ops || vd->vdev_ops == &vdev_replacing_ops || vd->vdev_ops == &vdev_spare_ops); blkptr_t blk, *bp = &blk; BP_ZERO(bp); DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id); DVA_SET_OFFSET(&bp->blk_dva[0], start); DVA_SET_GANG(&bp->blk_dva[0], 0); DVA_SET_ASIZE(&bp->blk_dva[0], asize); BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL); BP_SET_LSIZE(bp, psize); BP_SET_PSIZE(bp, psize); BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF); BP_SET_TYPE(bp, DMU_OT_NONE); BP_SET_LEVEL(bp, 0); BP_SET_DEDUP(bp, 0); BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); /* * We increment the issued bytes by the asize rather than the psize * so the scanned and issued bytes may be directly compared. This * is consistent with the scrub/resilver issued reporting. */ vr->vr_pass_bytes_issued += asize; vr->vr_rebuild_phys.vrp_bytes_issued += asize; zio_nowait(zio_read(spa->spa_txg_zio[txg & TXG_MASK], spa, bp, abd_alloc(psize, B_FALSE), psize, vdev_rebuild_cb, vr, ZIO_PRIORITY_REBUILD, ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL | ZIO_FLAG_RESILVER, NULL)); } /* * Issues a rebuild I/O and takes care of rate limiting the number of queued * rebuild I/Os. The provided start and size must be properly aligned for the * top-level vdev type being rebuilt. */ static int vdev_rebuild_range(vdev_rebuild_t *vr, uint64_t start, uint64_t size) { uint64_t ms_id __maybe_unused = vr->vr_scan_msp->ms_id; vdev_t *vd = vr->vr_top_vdev; spa_t *spa = vd->vdev_spa; ASSERT3U(ms_id, ==, start >> vd->vdev_ms_shift); ASSERT3U(ms_id, ==, (start + size - 1) >> vd->vdev_ms_shift); vr->vr_pass_bytes_scanned += size; vr->vr_rebuild_phys.vrp_bytes_scanned += size; mutex_enter(&vd->vdev_rebuild_io_lock); /* Limit in flight rebuild I/Os */ while (vd->vdev_rebuild_inflight >= zfs_rebuild_queue_limit) cv_wait(&vd->vdev_rebuild_io_cv, &vd->vdev_rebuild_io_lock); vd->vdev_rebuild_inflight++; mutex_exit(&vd->vdev_rebuild_io_lock); dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); uint64_t txg = dmu_tx_get_txg(tx); spa_config_enter(spa, SCL_STATE_ALL, vd, RW_READER); mutex_enter(&vd->vdev_rebuild_lock); /* This is the first I/O for this txg. */ if (vr->vr_scan_offset[txg & TXG_MASK] == 0) { vr->vr_scan_offset[txg & TXG_MASK] = start; dsl_sync_task_nowait(spa_get_dsl(spa), vdev_rebuild_update_sync, - (void *)(uintptr_t)vd->vdev_id, 2, - ZFS_SPACE_CHECK_RESERVED, tx); + (void *)(uintptr_t)vd->vdev_id, tx); } /* When exiting write out our progress. */ if (vdev_rebuild_should_stop(vd)) { mutex_enter(&vd->vdev_rebuild_io_lock); vd->vdev_rebuild_inflight--; mutex_exit(&vd->vdev_rebuild_io_lock); spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd); mutex_exit(&vd->vdev_rebuild_lock); dmu_tx_commit(tx); return (SET_ERROR(EINTR)); } mutex_exit(&vd->vdev_rebuild_lock); vr->vr_scan_offset[txg & TXG_MASK] = start + size; vdev_rebuild_rebuild_block(vr, start, size, txg); dmu_tx_commit(tx); return (0); } /* * Split range into legally-sized logical chunks given the constraints of the * top-level mirror vdev type. */ static uint64_t vdev_rebuild_chunk_size(vdev_t *vd, uint64_t start, uint64_t size) { uint64_t chunk_size, max_asize, max_segment; ASSERT(vd->vdev_ops == &vdev_mirror_ops || vd->vdev_ops == &vdev_replacing_ops || vd->vdev_ops == &vdev_spare_ops); max_segment = MIN(P2ROUNDUP(zfs_rebuild_max_segment, 1 << vd->vdev_ashift), SPA_MAXBLOCKSIZE); max_asize = vdev_psize_to_asize(vd, max_segment); chunk_size = MIN(size, max_asize); return (chunk_size); } /* * Issues rebuild I/Os for all ranges in the provided vr->vr_tree range tree. */ static int vdev_rebuild_ranges(vdev_rebuild_t *vr) { vdev_t *vd = vr->vr_top_vdev; zfs_btree_t *t = &vr->vr_scan_tree->rt_root; zfs_btree_index_t idx; int error; for (range_seg_t *rs = zfs_btree_first(t, &idx); rs != NULL; rs = zfs_btree_next(t, &idx, &idx)) { uint64_t start = rs_get_start(rs, vr->vr_scan_tree); uint64_t size = rs_get_end(rs, vr->vr_scan_tree) - start; /* * zfs_scan_suspend_progress can be set to disable rebuild * progress for testing. See comment in dsl_scan_sync(). */ while (zfs_scan_suspend_progress && !vdev_rebuild_should_stop(vd)) { delay(hz); } while (size > 0) { uint64_t chunk_size; chunk_size = vdev_rebuild_chunk_size(vd, start, size); error = vdev_rebuild_range(vr, start, chunk_size); if (error != 0) return (error); size -= chunk_size; start += chunk_size; } } return (0); } /* * Calculates the estimated capacity which remains to be scanned. Since * we traverse the pool in metaslab order only allocated capacity beyond * the vrp_last_offset need be considered. All lower offsets must have * already been rebuilt and are thus already included in vrp_bytes_scanned. */ static void vdev_rebuild_update_bytes_est(vdev_t *vd, uint64_t ms_id) { vdev_rebuild_t *vr = &vd->vdev_rebuild_config; vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; uint64_t bytes_est = vrp->vrp_bytes_scanned; if (vrp->vrp_last_offset < vd->vdev_ms[ms_id]->ms_start) return; for (uint64_t i = ms_id; i < vd->vdev_ms_count; i++) { metaslab_t *msp = vd->vdev_ms[i]; mutex_enter(&msp->ms_lock); bytes_est += metaslab_allocated_space(msp); mutex_exit(&msp->ms_lock); } vrp->vrp_bytes_est = bytes_est; } /* * Load from disk the top-level vdev's rebuild information. */ int vdev_rebuild_load(vdev_t *vd) { vdev_rebuild_t *vr = &vd->vdev_rebuild_config; vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; spa_t *spa = vd->vdev_spa; int err = 0; mutex_enter(&vd->vdev_rebuild_lock); vd->vdev_rebuilding = B_FALSE; if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD)) { bzero(vrp, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES); mutex_exit(&vd->vdev_rebuild_lock); return (SET_ERROR(ENOTSUP)); } ASSERT(vd->vdev_top == vd); err = zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t), REBUILD_PHYS_ENTRIES, vrp); /* * A missing or damaged VDEV_TOP_ZAP_VDEV_REBUILD_PHYS should * not prevent a pool from being imported. Clear the rebuild * status allowing a new resilver/rebuild to be started. */ if (err == ENOENT || err == EOVERFLOW || err == ECKSUM) { bzero(vrp, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES); } else if (err) { mutex_exit(&vd->vdev_rebuild_lock); return (err); } vr->vr_prev_scan_time_ms = vrp->vrp_scan_time_ms; vr->vr_top_vdev = vd; mutex_exit(&vd->vdev_rebuild_lock); return (0); } /* * Each scan thread is responsible for rebuilding a top-level vdev. The * rebuild progress in tracked on-disk in VDEV_TOP_ZAP_VDEV_REBUILD_PHYS. */ static void vdev_rebuild_thread(void *arg) { vdev_t *vd = arg; spa_t *spa = vd->vdev_spa; int error = 0; /* * If there's a scrub in process request that it be stopped. This * is not required for a correct rebuild, but we do want rebuilds to * emulate the resilver behavior as much as possible. */ dsl_pool_t *dsl = spa_get_dsl(spa); if (dsl_scan_scrubbing(dsl)) dsl_scan_cancel(dsl); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); mutex_enter(&vd->vdev_rebuild_lock); ASSERT3P(vd->vdev_top, ==, vd); ASSERT3P(vd->vdev_rebuild_thread, !=, NULL); ASSERT(vd->vdev_rebuilding); ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REBUILD)); ASSERT3B(vd->vdev_rebuild_cancel_wanted, ==, B_FALSE); ASSERT3B(vd->vdev_rebuild_reset_wanted, ==, B_FALSE); vdev_rebuild_t *vr = &vd->vdev_rebuild_config; vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; vr->vr_top_vdev = vd; vr->vr_scan_msp = NULL; vr->vr_scan_tree = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); vr->vr_pass_start_time = gethrtime(); vr->vr_pass_bytes_scanned = 0; vr->vr_pass_bytes_issued = 0; uint64_t update_est_time = gethrtime(); vdev_rebuild_update_bytes_est(vd, 0); clear_rebuild_bytes(vr->vr_top_vdev); mutex_exit(&vd->vdev_rebuild_lock); /* * Systematically walk the metaslabs and issue rebuild I/Os for * all ranges in the allocated space map. */ for (uint64_t i = 0; i < vd->vdev_ms_count; i++) { metaslab_t *msp = vd->vdev_ms[i]; vr->vr_scan_msp = msp; /* * Removal of vdevs from the vdev tree may eliminate the need * for the rebuild, in which case it should be canceled. The * vdev_rebuild_cancel_wanted flag is set until the sync task * completes. This may be after the rebuild thread exits. */ if (vdev_rebuild_should_cancel(vd)) { vd->vdev_rebuild_cancel_wanted = B_TRUE; error = EINTR; break; } ASSERT0(range_tree_space(vr->vr_scan_tree)); /* * Disable any new allocations to this metaslab and wait * for any writes inflight to complete. This is needed to * ensure all allocated ranges are rebuilt. */ metaslab_disable(msp); spa_config_exit(spa, SCL_CONFIG, FTAG); txg_wait_synced(dsl, 0); mutex_enter(&msp->ms_sync_lock); mutex_enter(&msp->ms_lock); /* * When a metaslab has been allocated from read its allocated * ranges from the space map object in to the vr_scan_tree. * Then add inflight / unflushed ranges and remove inflight / * unflushed frees. This is the minimum range to be rebuilt. */ if (msp->ms_sm != NULL) { VERIFY0(space_map_load(msp->ms_sm, vr->vr_scan_tree, SM_ALLOC)); for (int i = 0; i < TXG_SIZE; i++) { ASSERT0(range_tree_space( msp->ms_allocating[i])); } range_tree_walk(msp->ms_unflushed_allocs, range_tree_add, vr->vr_scan_tree); range_tree_walk(msp->ms_unflushed_frees, range_tree_remove, vr->vr_scan_tree); /* * Remove ranges which have already been rebuilt based * on the last offset. This can happen when restarting * a scan after exporting and re-importing the pool. */ range_tree_clear(vr->vr_scan_tree, 0, vrp->vrp_last_offset); } mutex_exit(&msp->ms_lock); mutex_exit(&msp->ms_sync_lock); /* * To provide an accurate estimate re-calculate the estimated * size every 5 minutes to account for recent allocations and * frees made space maps which have not yet been rebuilt. */ if (gethrtime() > update_est_time + SEC2NSEC(300)) { update_est_time = gethrtime(); vdev_rebuild_update_bytes_est(vd, i); } /* * Walk the allocated space map and issue the rebuild I/O. */ error = vdev_rebuild_ranges(vr); range_tree_vacate(vr->vr_scan_tree, NULL, NULL); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); metaslab_enable(msp, B_FALSE, B_FALSE); if (error != 0) break; } range_tree_destroy(vr->vr_scan_tree); spa_config_exit(spa, SCL_CONFIG, FTAG); /* Wait for any remaining rebuild I/O to complete */ mutex_enter(&vd->vdev_rebuild_io_lock); while (vd->vdev_rebuild_inflight > 0) cv_wait(&vd->vdev_rebuild_io_cv, &vd->vdev_rebuild_io_lock); mutex_exit(&vd->vdev_rebuild_io_lock); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); dsl_pool_t *dp = spa_get_dsl(spa); dmu_tx_t *tx = dmu_tx_create_dd(dp->dp_mos_dir); VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); mutex_enter(&vd->vdev_rebuild_lock); if (error == 0) { /* * After a successful rebuild clear the DTLs of all ranges * which were missing when the rebuild was started. These * ranges must have been rebuilt as a consequence of rebuilding * all allocated space. Note that unlike a scrub or resilver * the rebuild operation will reconstruct data only referenced * by a pool checkpoint. See the dsl_scan_done() comments. */ dsl_sync_task_nowait(dp, vdev_rebuild_complete_sync, - (void *)(uintptr_t)vd->vdev_id, 0, - ZFS_SPACE_CHECK_NONE, tx); + (void *)(uintptr_t)vd->vdev_id, tx); } else if (vd->vdev_rebuild_cancel_wanted) { /* * The rebuild operation was canceled. This will occur when * a device participating in the rebuild is detached. */ dsl_sync_task_nowait(dp, vdev_rebuild_cancel_sync, - (void *)(uintptr_t)vd->vdev_id, 0, - ZFS_SPACE_CHECK_NONE, tx); + (void *)(uintptr_t)vd->vdev_id, tx); } else if (vd->vdev_rebuild_reset_wanted) { /* * Reset the running rebuild without canceling and restarting * it. This will occur when a new device is attached and must * participate in the rebuild. */ dsl_sync_task_nowait(dp, vdev_rebuild_reset_sync, - (void *)(uintptr_t)vd->vdev_id, 0, - ZFS_SPACE_CHECK_NONE, tx); + (void *)(uintptr_t)vd->vdev_id, tx); } else { /* * The rebuild operation should be suspended. This may occur * when detaching a child vdev or when exporting the pool. The * rebuild is left in the active state so it will be resumed. */ ASSERT(vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE); vd->vdev_rebuilding = B_FALSE; } dmu_tx_commit(tx); vd->vdev_rebuild_thread = NULL; mutex_exit(&vd->vdev_rebuild_lock); spa_config_exit(spa, SCL_CONFIG, FTAG); cv_broadcast(&vd->vdev_rebuild_cv); thread_exit(); } /* * Returns B_TRUE if any top-level vdev are rebuilding. */ boolean_t vdev_rebuild_active(vdev_t *vd) { spa_t *spa = vd->vdev_spa; boolean_t ret = B_FALSE; if (vd == spa->spa_root_vdev) { for (uint64_t i = 0; i < vd->vdev_children; i++) { ret = vdev_rebuild_active(vd->vdev_child[i]); if (ret) return (ret); } } else if (vd->vdev_top_zap != 0) { vdev_rebuild_t *vr = &vd->vdev_rebuild_config; vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; mutex_enter(&vd->vdev_rebuild_lock); ret = (vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE); mutex_exit(&vd->vdev_rebuild_lock); } return (ret); } /* * Start a rebuild operation. The rebuild may be restarted when the * top-level vdev is currently actively rebuilding. */ void vdev_rebuild(vdev_t *vd) { vdev_rebuild_t *vr = &vd->vdev_rebuild_config; vdev_rebuild_phys_t *vrp __maybe_unused = &vr->vr_rebuild_phys; ASSERT(vd->vdev_top == vd); ASSERT(vdev_is_concrete(vd)); ASSERT(!vd->vdev_removing); ASSERT(spa_feature_is_enabled(vd->vdev_spa, SPA_FEATURE_DEVICE_REBUILD)); mutex_enter(&vd->vdev_rebuild_lock); if (vd->vdev_rebuilding) { ASSERT3U(vrp->vrp_rebuild_state, ==, VDEV_REBUILD_ACTIVE); /* * Signal a running rebuild operation that it should restart * from the beginning because a new device was attached. The * vdev_rebuild_reset_wanted flag is set until the sync task * completes. This may be after the rebuild thread exits. */ if (!vd->vdev_rebuild_reset_wanted) vd->vdev_rebuild_reset_wanted = B_TRUE; } else { vdev_rebuild_initiate(vd); } mutex_exit(&vd->vdev_rebuild_lock); } static void vdev_rebuild_restart_impl(vdev_t *vd) { spa_t *spa = vd->vdev_spa; if (vd == spa->spa_root_vdev) { for (uint64_t i = 0; i < vd->vdev_children; i++) vdev_rebuild_restart_impl(vd->vdev_child[i]); } else if (vd->vdev_top_zap != 0) { vdev_rebuild_t *vr = &vd->vdev_rebuild_config; vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; mutex_enter(&vd->vdev_rebuild_lock); if (vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE && vdev_writeable(vd) && !vd->vdev_rebuilding) { ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REBUILD)); vd->vdev_rebuilding = B_TRUE; vd->vdev_rebuild_thread = thread_create(NULL, 0, vdev_rebuild_thread, vd, 0, &p0, TS_RUN, maxclsyspri); } mutex_exit(&vd->vdev_rebuild_lock); } } /* * Conditionally restart all of the vdev_rebuild_thread's for a pool. The * feature flag must be active and the rebuild in the active state. This * cannot be used to start a new rebuild. */ void vdev_rebuild_restart(spa_t *spa) { ASSERT(MUTEX_HELD(&spa_namespace_lock)); vdev_rebuild_restart_impl(spa->spa_root_vdev); } /* * Stop and wait for all of the vdev_rebuild_thread's associated with the * vdev tree provide to be terminated (canceled or stopped). */ void vdev_rebuild_stop_wait(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(MUTEX_HELD(&spa_namespace_lock)); if (vd == spa->spa_root_vdev) { for (uint64_t i = 0; i < vd->vdev_children; i++) vdev_rebuild_stop_wait(vd->vdev_child[i]); } else if (vd->vdev_top_zap != 0) { ASSERT(vd == vd->vdev_top); mutex_enter(&vd->vdev_rebuild_lock); if (vd->vdev_rebuild_thread != NULL) { vd->vdev_rebuild_exit_wanted = B_TRUE; while (vd->vdev_rebuilding) { cv_wait(&vd->vdev_rebuild_cv, &vd->vdev_rebuild_lock); } vd->vdev_rebuild_exit_wanted = B_FALSE; } mutex_exit(&vd->vdev_rebuild_lock); } } /* * Stop all rebuild operations but leave them in the active state so they * will be resumed when importing the pool. */ void vdev_rebuild_stop_all(spa_t *spa) { vdev_rebuild_stop_wait(spa->spa_root_vdev); } /* * Rebuild statistics reported per top-level vdev. */ int vdev_rebuild_get_stats(vdev_t *tvd, vdev_rebuild_stat_t *vrs) { spa_t *spa = tvd->vdev_spa; if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD)) return (SET_ERROR(ENOTSUP)); if (tvd != tvd->vdev_top || tvd->vdev_top_zap == 0) return (SET_ERROR(EINVAL)); int error = zap_contains(spa_meta_objset(spa), tvd->vdev_top_zap, VDEV_TOP_ZAP_VDEV_REBUILD_PHYS); if (error == ENOENT) { bzero(vrs, sizeof (vdev_rebuild_stat_t)); vrs->vrs_state = VDEV_REBUILD_NONE; error = 0; } else if (error == 0) { vdev_rebuild_t *vr = &tvd->vdev_rebuild_config; vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; mutex_enter(&tvd->vdev_rebuild_lock); vrs->vrs_state = vrp->vrp_rebuild_state; vrs->vrs_start_time = vrp->vrp_start_time; vrs->vrs_end_time = vrp->vrp_end_time; vrs->vrs_scan_time_ms = vrp->vrp_scan_time_ms; vrs->vrs_bytes_scanned = vrp->vrp_bytes_scanned; vrs->vrs_bytes_issued = vrp->vrp_bytes_issued; vrs->vrs_bytes_rebuilt = vrp->vrp_bytes_rebuilt; vrs->vrs_bytes_est = vrp->vrp_bytes_est; vrs->vrs_errors = vrp->vrp_errors; vrs->vrs_pass_time_ms = NSEC2MSEC(gethrtime() - vr->vr_pass_start_time); vrs->vrs_pass_bytes_scanned = vr->vr_pass_bytes_scanned; vrs->vrs_pass_bytes_issued = vr->vr_pass_bytes_issued; mutex_exit(&tvd->vdev_rebuild_lock); } return (error); } /* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs, zfs_, rebuild_max_segment, ULONG, ZMOD_RW, "Max segment size in bytes of rebuild reads"); /* END CSTYLED */ diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c index 56e420871f61..fdeca7ab3418 100644 --- a/module/zfs/vdev_removal.c +++ b/module/zfs/vdev_removal.c @@ -1,2340 +1,2339 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2019 by Delphix. All rights reserved. * Copyright (c) 2019, loli10K . All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * This file contains the necessary logic to remove vdevs from a * storage pool. Currently, the only devices that can be removed * are log, cache, and spare devices; and top level vdevs from a pool * w/o raidz or mirrors. (Note that members of a mirror can be removed * by the detach operation.) * * Log vdevs are removed by evacuating them and then turning the vdev * into a hole vdev while holding spa config locks. * * Top level vdevs are removed and converted into an indirect vdev via * a multi-step process: * * - Disable allocations from this device (spa_vdev_remove_top). * * - From a new thread (spa_vdev_remove_thread), copy data from * the removing vdev to a different vdev. The copy happens in open * context (spa_vdev_copy_impl) and issues a sync task * (vdev_mapping_sync) so the sync thread can update the partial * indirect mappings in core and on disk. * * - If a free happens during a removal, it is freed from the * removing vdev, and if it has already been copied, from the new * location as well (free_from_removing_vdev). * * - After the removal is completed, the copy thread converts the vdev * into an indirect vdev (vdev_remove_complete) before instructing * the sync thread to destroy the space maps and finish the removal * (spa_finish_removal). */ typedef struct vdev_copy_arg { metaslab_t *vca_msp; uint64_t vca_outstanding_bytes; uint64_t vca_read_error_bytes; uint64_t vca_write_error_bytes; kcondvar_t vca_cv; kmutex_t vca_lock; } vdev_copy_arg_t; /* * The maximum amount of memory we can use for outstanding i/o while * doing a device removal. This determines how much i/o we can have * in flight concurrently. */ int zfs_remove_max_copy_bytes = 64 * 1024 * 1024; /* * The largest contiguous segment that we will attempt to allocate when * removing a device. This can be no larger than SPA_MAXBLOCKSIZE. If * there is a performance problem with attempting to allocate large blocks, * consider decreasing this. * * See also the accessor function spa_remove_max_segment(). */ int zfs_remove_max_segment = SPA_MAXBLOCKSIZE; /* * Ignore hard IO errors during device removal. When set if a device * encounters hard IO error during the removal process the removal will * not be cancelled. This can result in a normally recoverable block * becoming permanently damaged and is not recommended. */ int zfs_removal_ignore_errors = 0; /* * Allow a remap segment to span free chunks of at most this size. The main * impact of a larger span is that we will read and write larger, more * contiguous chunks, with more "unnecessary" data -- trading off bandwidth * for iops. The value here was chosen to align with * zfs_vdev_read_gap_limit, which is a similar concept when doing regular * reads (but there's no reason it has to be the same). * * Additionally, a higher span will have the following relatively minor * effects: * - the mapping will be smaller, since one entry can cover more allocated * segments * - more of the fragmentation in the removing device will be preserved * - we'll do larger allocations, which may fail and fall back on smaller * allocations */ int vdev_removal_max_span = 32 * 1024; /* * This is used by the test suite so that it can ensure that certain * actions happen while in the middle of a removal. */ int zfs_removal_suspend_progress = 0; #define VDEV_REMOVAL_ZAP_OBJS "lzap" static void spa_vdev_remove_thread(void *arg); static int spa_vdev_remove_cancel_impl(spa_t *spa); static void spa_sync_removing_state(spa_t *spa, dmu_tx_t *tx) { VERIFY0(zap_update(spa->spa_dsl_pool->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_REMOVING, sizeof (uint64_t), sizeof (spa->spa_removing_phys) / sizeof (uint64_t), &spa->spa_removing_phys, tx)); } static nvlist_t * spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid) { for (int i = 0; i < count; i++) { uint64_t guid = fnvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID); if (guid == target_guid) return (nvpp[i]); } return (NULL); } static void spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count, nvlist_t *dev_to_remove) { nvlist_t **newdev = NULL; if (count > 1) newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP); for (int i = 0, j = 0; i < count; i++) { if (dev[i] == dev_to_remove) continue; VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0); } VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0); VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0); for (int i = 0; i < count - 1; i++) nvlist_free(newdev[i]); if (count > 1) kmem_free(newdev, (count - 1) * sizeof (void *)); } static spa_vdev_removal_t * spa_vdev_removal_create(vdev_t *vd) { spa_vdev_removal_t *svr = kmem_zalloc(sizeof (*svr), KM_SLEEP); mutex_init(&svr->svr_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&svr->svr_cv, NULL, CV_DEFAULT, NULL); svr->svr_allocd_segs = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); svr->svr_vdev_id = vd->vdev_id; for (int i = 0; i < TXG_SIZE; i++) { svr->svr_frees[i] = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); list_create(&svr->svr_new_segments[i], sizeof (vdev_indirect_mapping_entry_t), offsetof(vdev_indirect_mapping_entry_t, vime_node)); } return (svr); } void spa_vdev_removal_destroy(spa_vdev_removal_t *svr) { for (int i = 0; i < TXG_SIZE; i++) { ASSERT0(svr->svr_bytes_done[i]); ASSERT0(svr->svr_max_offset_to_sync[i]); range_tree_destroy(svr->svr_frees[i]); list_destroy(&svr->svr_new_segments[i]); } range_tree_destroy(svr->svr_allocd_segs); mutex_destroy(&svr->svr_lock); cv_destroy(&svr->svr_cv); kmem_free(svr, sizeof (*svr)); } /* * This is called as a synctask in the txg in which we will mark this vdev * as removing (in the config stored in the MOS). * * It begins the evacuation of a toplevel vdev by: * - initializing the spa_removing_phys which tracks this removal * - computing the amount of space to remove for accounting purposes * - dirtying all dbufs in the spa_config_object * - creating the spa_vdev_removal * - starting the spa_vdev_remove_thread */ static void vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx) { int vdev_id = (uintptr_t)arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; vdev_t *vd = vdev_lookup_top(spa, vdev_id); vdev_indirect_config_t *vic = &vd->vdev_indirect_config; objset_t *mos = spa->spa_dsl_pool->dp_meta_objset; spa_vdev_removal_t *svr = NULL; uint64_t txg __maybe_unused = dmu_tx_get_txg(tx); ASSERT3P(vd->vdev_ops, !=, &vdev_raidz_ops); svr = spa_vdev_removal_create(vd); ASSERT(vd->vdev_removing); ASSERT3P(vd->vdev_indirect_mapping, ==, NULL); spa_feature_incr(spa, SPA_FEATURE_DEVICE_REMOVAL, tx); if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) { /* * By activating the OBSOLETE_COUNTS feature, we prevent * the pool from being downgraded and ensure that the * refcounts are precise. */ spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); uint64_t one = 1; VERIFY0(zap_add(spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, sizeof (one), 1, &one, tx)); boolean_t are_precise __maybe_unused; ASSERT0(vdev_obsolete_counts_are_precise(vd, &are_precise)); ASSERT3B(are_precise, ==, B_TRUE); } vic->vic_mapping_object = vdev_indirect_mapping_alloc(mos, tx); vd->vdev_indirect_mapping = vdev_indirect_mapping_open(mos, vic->vic_mapping_object); vic->vic_births_object = vdev_indirect_births_alloc(mos, tx); vd->vdev_indirect_births = vdev_indirect_births_open(mos, vic->vic_births_object); spa->spa_removing_phys.sr_removing_vdev = vd->vdev_id; spa->spa_removing_phys.sr_start_time = gethrestime_sec(); spa->spa_removing_phys.sr_end_time = 0; spa->spa_removing_phys.sr_state = DSS_SCANNING; spa->spa_removing_phys.sr_to_copy = 0; spa->spa_removing_phys.sr_copied = 0; /* * Note: We can't use vdev_stat's vs_alloc for sr_to_copy, because * there may be space in the defer tree, which is free, but still * counted in vs_alloc. */ for (uint64_t i = 0; i < vd->vdev_ms_count; i++) { metaslab_t *ms = vd->vdev_ms[i]; if (ms->ms_sm == NULL) continue; spa->spa_removing_phys.sr_to_copy += metaslab_allocated_space(ms); /* * Space which we are freeing this txg does not need to * be copied. */ spa->spa_removing_phys.sr_to_copy -= range_tree_space(ms->ms_freeing); ASSERT0(range_tree_space(ms->ms_freed)); for (int t = 0; t < TXG_SIZE; t++) ASSERT0(range_tree_space(ms->ms_allocating[t])); } /* * Sync tasks are called before metaslab_sync(), so there should * be no already-synced metaslabs in the TXG_CLEAN list. */ ASSERT3P(txg_list_head(&vd->vdev_ms_list, TXG_CLEAN(txg)), ==, NULL); spa_sync_removing_state(spa, tx); /* * All blocks that we need to read the most recent mapping must be * stored on concrete vdevs. Therefore, we must dirty anything that * is read before spa_remove_init(). Specifically, the * spa_config_object. (Note that although we already modified the * spa_config_object in spa_sync_removing_state, that may not have * modified all blocks of the object.) */ dmu_object_info_t doi; VERIFY0(dmu_object_info(mos, DMU_POOL_DIRECTORY_OBJECT, &doi)); for (uint64_t offset = 0; offset < doi.doi_max_offset; ) { dmu_buf_t *dbuf; VERIFY0(dmu_buf_hold(mos, DMU_POOL_DIRECTORY_OBJECT, offset, FTAG, &dbuf, 0)); dmu_buf_will_dirty(dbuf, tx); offset += dbuf->db_size; dmu_buf_rele(dbuf, FTAG); } /* * Now that we've allocated the im_object, dirty the vdev to ensure * that the object gets written to the config on disk. */ vdev_config_dirty(vd); zfs_dbgmsg("starting removal thread for vdev %llu (%px) in txg %llu " "im_obj=%llu", vd->vdev_id, vd, dmu_tx_get_txg(tx), vic->vic_mapping_object); spa_history_log_internal(spa, "vdev remove started", tx, "%s vdev %llu %s", spa_name(spa), (u_longlong_t)vd->vdev_id, (vd->vdev_path != NULL) ? vd->vdev_path : "-"); /* * Setting spa_vdev_removal causes subsequent frees to call * free_from_removing_vdev(). Note that we don't need any locking * because we are the sync thread, and metaslab_free_impl() is only * called from syncing context (potentially from a zio taskq thread, * but in any case only when there are outstanding free i/os, which * there are not). */ ASSERT3P(spa->spa_vdev_removal, ==, NULL); spa->spa_vdev_removal = svr; svr->svr_thread = thread_create(NULL, 0, spa_vdev_remove_thread, spa, 0, &p0, TS_RUN, minclsyspri); } /* * When we are opening a pool, we must read the mapping for each * indirect vdev in order from most recently removed to least * recently removed. We do this because the blocks for the mapping * of older indirect vdevs may be stored on more recently removed vdevs. * In order to read each indirect mapping object, we must have * initialized all more recently removed vdevs. */ int spa_remove_init(spa_t *spa) { int error; error = zap_lookup(spa->spa_dsl_pool->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_REMOVING, sizeof (uint64_t), sizeof (spa->spa_removing_phys) / sizeof (uint64_t), &spa->spa_removing_phys); if (error == ENOENT) { spa->spa_removing_phys.sr_state = DSS_NONE; spa->spa_removing_phys.sr_removing_vdev = -1; spa->spa_removing_phys.sr_prev_indirect_vdev = -1; spa->spa_indirect_vdevs_loaded = B_TRUE; return (0); } else if (error != 0) { return (error); } if (spa->spa_removing_phys.sr_state == DSS_SCANNING) { /* * We are currently removing a vdev. Create and * initialize a spa_vdev_removal_t from the bonus * buffer of the removing vdevs vdev_im_object, and * initialize its partial mapping. */ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); vdev_t *vd = vdev_lookup_top(spa, spa->spa_removing_phys.sr_removing_vdev); if (vd == NULL) { spa_config_exit(spa, SCL_STATE, FTAG); return (EINVAL); } vdev_indirect_config_t *vic = &vd->vdev_indirect_config; ASSERT(vdev_is_concrete(vd)); spa_vdev_removal_t *svr = spa_vdev_removal_create(vd); ASSERT3U(svr->svr_vdev_id, ==, vd->vdev_id); ASSERT(vd->vdev_removing); vd->vdev_indirect_mapping = vdev_indirect_mapping_open( spa->spa_meta_objset, vic->vic_mapping_object); vd->vdev_indirect_births = vdev_indirect_births_open( spa->spa_meta_objset, vic->vic_births_object); spa_config_exit(spa, SCL_STATE, FTAG); spa->spa_vdev_removal = svr; } spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); uint64_t indirect_vdev_id = spa->spa_removing_phys.sr_prev_indirect_vdev; while (indirect_vdev_id != UINT64_MAX) { vdev_t *vd = vdev_lookup_top(spa, indirect_vdev_id); vdev_indirect_config_t *vic = &vd->vdev_indirect_config; ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); vd->vdev_indirect_mapping = vdev_indirect_mapping_open( spa->spa_meta_objset, vic->vic_mapping_object); vd->vdev_indirect_births = vdev_indirect_births_open( spa->spa_meta_objset, vic->vic_births_object); indirect_vdev_id = vic->vic_prev_indirect_vdev; } spa_config_exit(spa, SCL_STATE, FTAG); /* * Now that we've loaded all the indirect mappings, we can allow * reads from other blocks (e.g. via predictive prefetch). */ spa->spa_indirect_vdevs_loaded = B_TRUE; return (0); } void spa_restart_removal(spa_t *spa) { spa_vdev_removal_t *svr = spa->spa_vdev_removal; if (svr == NULL) return; /* * In general when this function is called there is no * removal thread running. The only scenario where this * is not true is during spa_import() where this function * is called twice [once from spa_import_impl() and * spa_async_resume()]. Thus, in the scenario where we * import a pool that has an ongoing removal we don't * want to spawn a second thread. */ if (svr->svr_thread != NULL) return; if (!spa_writeable(spa)) return; zfs_dbgmsg("restarting removal of %llu", svr->svr_vdev_id); svr->svr_thread = thread_create(NULL, 0, spa_vdev_remove_thread, spa, 0, &p0, TS_RUN, minclsyspri); } /* * Process freeing from a device which is in the middle of being removed. * We must handle this carefully so that we attempt to copy freed data, * and we correctly free already-copied data. */ void free_from_removing_vdev(vdev_t *vd, uint64_t offset, uint64_t size) { spa_t *spa = vd->vdev_spa; spa_vdev_removal_t *svr = spa->spa_vdev_removal; vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; uint64_t txg = spa_syncing_txg(spa); uint64_t max_offset_yet = 0; ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0); ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, ==, vdev_indirect_mapping_object(vim)); ASSERT3U(vd->vdev_id, ==, svr->svr_vdev_id); mutex_enter(&svr->svr_lock); /* * Remove the segment from the removing vdev's spacemap. This * ensures that we will not attempt to copy this space (if the * removal thread has not yet visited it), and also ensures * that we know what is actually allocated on the new vdevs * (needed if we cancel the removal). * * Note: we must do the metaslab_free_concrete() with the svr_lock * held, so that the remove_thread can not load this metaslab and then * visit this offset between the time that we metaslab_free_concrete() * and when we check to see if it has been visited. * * Note: The checkpoint flag is set to false as having/taking * a checkpoint and removing a device can't happen at the same * time. */ ASSERT(!spa_has_checkpoint(spa)); metaslab_free_concrete(vd, offset, size, B_FALSE); uint64_t synced_size = 0; uint64_t synced_offset = 0; uint64_t max_offset_synced = vdev_indirect_mapping_max_offset(vim); if (offset < max_offset_synced) { /* * The mapping for this offset is already on disk. * Free from the new location. * * Note that we use svr_max_synced_offset because it is * updated atomically with respect to the in-core mapping. * By contrast, vim_max_offset is not. * * This block may be split between a synced entry and an * in-flight or unvisited entry. Only process the synced * portion of it here. */ synced_size = MIN(size, max_offset_synced - offset); synced_offset = offset; ASSERT3U(max_offset_yet, <=, max_offset_synced); max_offset_yet = max_offset_synced; DTRACE_PROBE3(remove__free__synced, spa_t *, spa, uint64_t, offset, uint64_t, synced_size); size -= synced_size; offset += synced_size; } /* * Look at all in-flight txgs starting from the currently syncing one * and see if a section of this free is being copied. By starting from * this txg and iterating forward, we might find that this region * was copied in two different txgs and handle it appropriately. */ for (int i = 0; i < TXG_CONCURRENT_STATES; i++) { int txgoff = (txg + i) & TXG_MASK; if (size > 0 && offset < svr->svr_max_offset_to_sync[txgoff]) { /* * The mapping for this offset is in flight, and * will be synced in txg+i. */ uint64_t inflight_size = MIN(size, svr->svr_max_offset_to_sync[txgoff] - offset); DTRACE_PROBE4(remove__free__inflight, spa_t *, spa, uint64_t, offset, uint64_t, inflight_size, uint64_t, txg + i); /* * We copy data in order of increasing offset. * Therefore the max_offset_to_sync[] must increase * (or be zero, indicating that nothing is being * copied in that txg). */ if (svr->svr_max_offset_to_sync[txgoff] != 0) { ASSERT3U(svr->svr_max_offset_to_sync[txgoff], >=, max_offset_yet); max_offset_yet = svr->svr_max_offset_to_sync[txgoff]; } /* * We've already committed to copying this segment: * we have allocated space elsewhere in the pool for * it and have an IO outstanding to copy the data. We * cannot free the space before the copy has * completed, or else the copy IO might overwrite any * new data. To free that space, we record the * segment in the appropriate svr_frees tree and free * the mapped space later, in the txg where we have * completed the copy and synced the mapping (see * vdev_mapping_sync). */ range_tree_add(svr->svr_frees[txgoff], offset, inflight_size); size -= inflight_size; offset += inflight_size; /* * This space is already accounted for as being * done, because it is being copied in txg+i. * However, if i!=0, then it is being copied in * a future txg. If we crash after this txg * syncs but before txg+i syncs, then the space * will be free. Therefore we must account * for the space being done in *this* txg * (when it is freed) rather than the future txg * (when it will be copied). */ ASSERT3U(svr->svr_bytes_done[txgoff], >=, inflight_size); svr->svr_bytes_done[txgoff] -= inflight_size; svr->svr_bytes_done[txg & TXG_MASK] += inflight_size; } } ASSERT0(svr->svr_max_offset_to_sync[TXG_CLEAN(txg) & TXG_MASK]); if (size > 0) { /* * The copy thread has not yet visited this offset. Ensure * that it doesn't. */ DTRACE_PROBE3(remove__free__unvisited, spa_t *, spa, uint64_t, offset, uint64_t, size); if (svr->svr_allocd_segs != NULL) range_tree_clear(svr->svr_allocd_segs, offset, size); /* * Since we now do not need to copy this data, for * accounting purposes we have done our job and can count * it as completed. */ svr->svr_bytes_done[txg & TXG_MASK] += size; } mutex_exit(&svr->svr_lock); /* * Now that we have dropped svr_lock, process the synced portion * of this free. */ if (synced_size > 0) { vdev_indirect_mark_obsolete(vd, synced_offset, synced_size); /* * Note: this can only be called from syncing context, * and the vdev_indirect_mapping is only changed from the * sync thread, so we don't need svr_lock while doing * metaslab_free_impl_cb. */ boolean_t checkpoint = B_FALSE; vdev_indirect_ops.vdev_op_remap(vd, synced_offset, synced_size, metaslab_free_impl_cb, &checkpoint); } } /* * Stop an active removal and update the spa_removing phys. */ static void spa_finish_removal(spa_t *spa, dsl_scan_state_t state, dmu_tx_t *tx) { spa_vdev_removal_t *svr = spa->spa_vdev_removal; ASSERT3U(dmu_tx_get_txg(tx), ==, spa_syncing_txg(spa)); /* Ensure the removal thread has completed before we free the svr. */ spa_vdev_remove_suspend(spa); ASSERT(state == DSS_FINISHED || state == DSS_CANCELED); if (state == DSS_FINISHED) { spa_removing_phys_t *srp = &spa->spa_removing_phys; vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); vdev_indirect_config_t *vic = &vd->vdev_indirect_config; if (srp->sr_prev_indirect_vdev != -1) { vdev_t *pvd; pvd = vdev_lookup_top(spa, srp->sr_prev_indirect_vdev); ASSERT3P(pvd->vdev_ops, ==, &vdev_indirect_ops); } vic->vic_prev_indirect_vdev = srp->sr_prev_indirect_vdev; srp->sr_prev_indirect_vdev = vd->vdev_id; } spa->spa_removing_phys.sr_state = state; spa->spa_removing_phys.sr_end_time = gethrestime_sec(); spa->spa_vdev_removal = NULL; spa_vdev_removal_destroy(svr); spa_sync_removing_state(spa, tx); spa_notify_waiters(spa); vdev_config_dirty(spa->spa_root_vdev); } static void free_mapped_segment_cb(void *arg, uint64_t offset, uint64_t size) { vdev_t *vd = arg; vdev_indirect_mark_obsolete(vd, offset, size); boolean_t checkpoint = B_FALSE; vdev_indirect_ops.vdev_op_remap(vd, offset, size, metaslab_free_impl_cb, &checkpoint); } /* * On behalf of the removal thread, syncs an incremental bit more of * the indirect mapping to disk and updates the in-memory mapping. * Called as a sync task in every txg that the removal thread makes progress. */ static void vdev_mapping_sync(void *arg, dmu_tx_t *tx) { spa_vdev_removal_t *svr = arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); vdev_indirect_config_t *vic __maybe_unused = &vd->vdev_indirect_config; uint64_t txg = dmu_tx_get_txg(tx); vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; ASSERT(vic->vic_mapping_object != 0); ASSERT3U(txg, ==, spa_syncing_txg(spa)); vdev_indirect_mapping_add_entries(vim, &svr->svr_new_segments[txg & TXG_MASK], tx); vdev_indirect_births_add_entry(vd->vdev_indirect_births, vdev_indirect_mapping_max_offset(vim), dmu_tx_get_txg(tx), tx); /* * Free the copied data for anything that was freed while the * mapping entries were in flight. */ mutex_enter(&svr->svr_lock); range_tree_vacate(svr->svr_frees[txg & TXG_MASK], free_mapped_segment_cb, vd); ASSERT3U(svr->svr_max_offset_to_sync[txg & TXG_MASK], >=, vdev_indirect_mapping_max_offset(vim)); svr->svr_max_offset_to_sync[txg & TXG_MASK] = 0; mutex_exit(&svr->svr_lock); spa_sync_removing_state(spa, tx); } typedef struct vdev_copy_segment_arg { spa_t *vcsa_spa; dva_t *vcsa_dest_dva; uint64_t vcsa_txg; range_tree_t *vcsa_obsolete_segs; } vdev_copy_segment_arg_t; static void unalloc_seg(void *arg, uint64_t start, uint64_t size) { vdev_copy_segment_arg_t *vcsa = arg; spa_t *spa = vcsa->vcsa_spa; blkptr_t bp = { { { {0} } } }; BP_SET_BIRTH(&bp, TXG_INITIAL, TXG_INITIAL); BP_SET_LSIZE(&bp, size); BP_SET_PSIZE(&bp, size); BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF); BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_OFF); BP_SET_TYPE(&bp, DMU_OT_NONE); BP_SET_LEVEL(&bp, 0); BP_SET_DEDUP(&bp, 0); BP_SET_BYTEORDER(&bp, ZFS_HOST_BYTEORDER); DVA_SET_VDEV(&bp.blk_dva[0], DVA_GET_VDEV(vcsa->vcsa_dest_dva)); DVA_SET_OFFSET(&bp.blk_dva[0], DVA_GET_OFFSET(vcsa->vcsa_dest_dva) + start); DVA_SET_ASIZE(&bp.blk_dva[0], size); zio_free(spa, vcsa->vcsa_txg, &bp); } /* * All reads and writes associated with a call to spa_vdev_copy_segment() * are done. */ static void spa_vdev_copy_segment_done(zio_t *zio) { vdev_copy_segment_arg_t *vcsa = zio->io_private; range_tree_vacate(vcsa->vcsa_obsolete_segs, unalloc_seg, vcsa); range_tree_destroy(vcsa->vcsa_obsolete_segs); kmem_free(vcsa, sizeof (*vcsa)); spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa); } /* * The write of the new location is done. */ static void spa_vdev_copy_segment_write_done(zio_t *zio) { vdev_copy_arg_t *vca = zio->io_private; abd_free(zio->io_abd); mutex_enter(&vca->vca_lock); vca->vca_outstanding_bytes -= zio->io_size; if (zio->io_error != 0) vca->vca_write_error_bytes += zio->io_size; cv_signal(&vca->vca_cv); mutex_exit(&vca->vca_lock); } /* * The read of the old location is done. The parent zio is the write to * the new location. Allow it to start. */ static void spa_vdev_copy_segment_read_done(zio_t *zio) { vdev_copy_arg_t *vca = zio->io_private; if (zio->io_error != 0) { mutex_enter(&vca->vca_lock); vca->vca_read_error_bytes += zio->io_size; mutex_exit(&vca->vca_lock); } zio_nowait(zio_unique_parent(zio)); } /* * If the old and new vdevs are mirrors, we will read both sides of the old * mirror, and write each copy to the corresponding side of the new mirror. * If the old and new vdevs have a different number of children, we will do * this as best as possible. Since we aren't verifying checksums, this * ensures that as long as there's a good copy of the data, we'll have a * good copy after the removal, even if there's silent damage to one side * of the mirror. If we're removing a mirror that has some silent damage, * we'll have exactly the same damage in the new location (assuming that * the new location is also a mirror). * * We accomplish this by creating a tree of zio_t's, with as many writes as * there are "children" of the new vdev (a non-redundant vdev counts as one * child, a 2-way mirror has 2 children, etc). Each write has an associated * read from a child of the old vdev. Typically there will be the same * number of children of the old and new vdevs. However, if there are more * children of the new vdev, some child(ren) of the old vdev will be issued * multiple reads. If there are more children of the old vdev, some copies * will be dropped. * * For example, the tree of zio_t's for a 2-way mirror is: * * null * / \ * write(new vdev, child 0) write(new vdev, child 1) * | | * read(old vdev, child 0) read(old vdev, child 1) * * Child zio's complete before their parents complete. However, zio's * created with zio_vdev_child_io() may be issued before their children * complete. In this case we need to make sure that the children (reads) * complete before the parents (writes) are *issued*. We do this by not * calling zio_nowait() on each write until its corresponding read has * completed. * * The spa_config_lock must be held while zio's created by * zio_vdev_child_io() are in progress, to ensure that the vdev tree does * not change (e.g. due to a concurrent "zpool attach/detach"). The "null" * zio is needed to release the spa_config_lock after all the reads and * writes complete. (Note that we can't grab the config lock for each read, * because it is not reentrant - we could deadlock with a thread waiting * for a write lock.) */ static void spa_vdev_copy_one_child(vdev_copy_arg_t *vca, zio_t *nzio, vdev_t *source_vd, uint64_t source_offset, vdev_t *dest_child_vd, uint64_t dest_offset, int dest_id, uint64_t size) { ASSERT3U(spa_config_held(nzio->io_spa, SCL_ALL, RW_READER), !=, 0); /* * If the destination child in unwritable then there is no point * in issuing the source reads which cannot be written. */ if (!vdev_writeable(dest_child_vd)) return; mutex_enter(&vca->vca_lock); vca->vca_outstanding_bytes += size; mutex_exit(&vca->vca_lock); abd_t *abd = abd_alloc_for_io(size, B_FALSE); vdev_t *source_child_vd = NULL; if (source_vd->vdev_ops == &vdev_mirror_ops && dest_id != -1) { /* * Source and dest are both mirrors. Copy from the same * child id as we are copying to (wrapping around if there * are more dest children than source children). If the * preferred source child is unreadable select another. */ for (int i = 0; i < source_vd->vdev_children; i++) { source_child_vd = source_vd->vdev_child[ (dest_id + i) % source_vd->vdev_children]; if (vdev_readable(source_child_vd)) break; } } else { source_child_vd = source_vd; } /* * There should always be at least one readable source child or * the pool would be in a suspended state. Somehow selecting an * unreadable child would result in IO errors, the removal process * being cancelled, and the pool reverting to its pre-removal state. */ ASSERT3P(source_child_vd, !=, NULL); zio_t *write_zio = zio_vdev_child_io(nzio, NULL, dest_child_vd, dest_offset, abd, size, ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL, spa_vdev_copy_segment_write_done, vca); zio_nowait(zio_vdev_child_io(write_zio, NULL, source_child_vd, source_offset, abd, size, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL, spa_vdev_copy_segment_read_done, vca)); } /* * Allocate a new location for this segment, and create the zio_t's to * read from the old location and write to the new location. */ static int spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs, uint64_t maxalloc, uint64_t txg, vdev_copy_arg_t *vca, zio_alloc_list_t *zal) { metaslab_group_t *mg = vd->vdev_mg; spa_t *spa = vd->vdev_spa; spa_vdev_removal_t *svr = spa->spa_vdev_removal; vdev_indirect_mapping_entry_t *entry; dva_t dst = {{ 0 }}; uint64_t start = range_tree_min(segs); ASSERT0(P2PHASE(start, 1 << spa->spa_min_ashift)); ASSERT3U(maxalloc, <=, SPA_MAXBLOCKSIZE); ASSERT0(P2PHASE(maxalloc, 1 << spa->spa_min_ashift)); uint64_t size = range_tree_span(segs); if (range_tree_span(segs) > maxalloc) { /* * We can't allocate all the segments. Prefer to end * the allocation at the end of a segment, thus avoiding * additional split blocks. */ range_seg_max_t search; zfs_btree_index_t where; rs_set_start(&search, segs, start + maxalloc); rs_set_end(&search, segs, start + maxalloc); (void) zfs_btree_find(&segs->rt_root, &search, &where); range_seg_t *rs = zfs_btree_prev(&segs->rt_root, &where, &where); if (rs != NULL) { size = rs_get_end(rs, segs) - start; } else { /* * There are no segments that end before maxalloc. * I.e. the first segment is larger than maxalloc, * so we must split it. */ size = maxalloc; } } ASSERT3U(size, <=, maxalloc); ASSERT0(P2PHASE(size, 1 << spa->spa_min_ashift)); /* * An allocation class might not have any remaining vdevs or space */ metaslab_class_t *mc = mg->mg_class; if (mc != spa_normal_class(spa) && mc->mc_groups <= 1) mc = spa_normal_class(spa); int error = metaslab_alloc_dva(spa, mc, size, &dst, 0, NULL, txg, 0, zal, 0); if (error == ENOSPC && mc != spa_normal_class(spa)) { error = metaslab_alloc_dva(spa, spa_normal_class(spa), size, &dst, 0, NULL, txg, 0, zal, 0); } if (error != 0) return (error); /* * Determine the ranges that are not actually needed. Offsets are * relative to the start of the range to be copied (i.e. relative to the * local variable "start"). */ range_tree_t *obsolete_segs = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); zfs_btree_index_t where; range_seg_t *rs = zfs_btree_first(&segs->rt_root, &where); ASSERT3U(rs_get_start(rs, segs), ==, start); uint64_t prev_seg_end = rs_get_end(rs, segs); while ((rs = zfs_btree_next(&segs->rt_root, &where, &where)) != NULL) { if (rs_get_start(rs, segs) >= start + size) { break; } else { range_tree_add(obsolete_segs, prev_seg_end - start, rs_get_start(rs, segs) - prev_seg_end); } prev_seg_end = rs_get_end(rs, segs); } /* We don't end in the middle of an obsolete range */ ASSERT3U(start + size, <=, prev_seg_end); range_tree_clear(segs, start, size); /* * We can't have any padding of the allocated size, otherwise we will * misunderstand what's allocated, and the size of the mapping. We * prevent padding by ensuring that all devices in the pool have the * same ashift, and the allocation size is a multiple of the ashift. */ VERIFY3U(DVA_GET_ASIZE(&dst), ==, size); entry = kmem_zalloc(sizeof (vdev_indirect_mapping_entry_t), KM_SLEEP); DVA_MAPPING_SET_SRC_OFFSET(&entry->vime_mapping, start); entry->vime_mapping.vimep_dst = dst; if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) { entry->vime_obsolete_count = range_tree_space(obsolete_segs); } vdev_copy_segment_arg_t *vcsa = kmem_zalloc(sizeof (*vcsa), KM_SLEEP); vcsa->vcsa_dest_dva = &entry->vime_mapping.vimep_dst; vcsa->vcsa_obsolete_segs = obsolete_segs; vcsa->vcsa_spa = spa; vcsa->vcsa_txg = txg; /* * See comment before spa_vdev_copy_one_child(). */ spa_config_enter(spa, SCL_STATE, spa, RW_READER); zio_t *nzio = zio_null(spa->spa_txg_zio[txg & TXG_MASK], spa, NULL, spa_vdev_copy_segment_done, vcsa, 0); vdev_t *dest_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dst)); if (dest_vd->vdev_ops == &vdev_mirror_ops) { for (int i = 0; i < dest_vd->vdev_children; i++) { vdev_t *child = dest_vd->vdev_child[i]; spa_vdev_copy_one_child(vca, nzio, vd, start, child, DVA_GET_OFFSET(&dst), i, size); } } else { spa_vdev_copy_one_child(vca, nzio, vd, start, dest_vd, DVA_GET_OFFSET(&dst), -1, size); } zio_nowait(nzio); list_insert_tail(&svr->svr_new_segments[txg & TXG_MASK], entry); ASSERT3U(start + size, <=, vd->vdev_ms_count << vd->vdev_ms_shift); vdev_dirty(vd, 0, NULL, txg); return (0); } /* * Complete the removal of a toplevel vdev. This is called as a * synctask in the same txg that we will sync out the new config (to the * MOS object) which indicates that this vdev is indirect. */ static void vdev_remove_complete_sync(void *arg, dmu_tx_t *tx) { spa_vdev_removal_t *svr = arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); for (int i = 0; i < TXG_SIZE; i++) { ASSERT0(svr->svr_bytes_done[i]); } ASSERT3U(spa->spa_removing_phys.sr_copied, ==, spa->spa_removing_phys.sr_to_copy); vdev_destroy_spacemaps(vd, tx); /* destroy leaf zaps, if any */ ASSERT3P(svr->svr_zaplist, !=, NULL); for (nvpair_t *pair = nvlist_next_nvpair(svr->svr_zaplist, NULL); pair != NULL; pair = nvlist_next_nvpair(svr->svr_zaplist, pair)) { vdev_destroy_unlink_zap(vd, fnvpair_value_uint64(pair), tx); } fnvlist_free(svr->svr_zaplist); spa_finish_removal(dmu_tx_pool(tx)->dp_spa, DSS_FINISHED, tx); /* vd->vdev_path is not available here */ spa_history_log_internal(spa, "vdev remove completed", tx, "%s vdev %llu", spa_name(spa), (u_longlong_t)vd->vdev_id); } static void vdev_remove_enlist_zaps(vdev_t *vd, nvlist_t *zlist) { ASSERT3P(zlist, !=, NULL); ASSERT3P(vd->vdev_ops, !=, &vdev_raidz_ops); if (vd->vdev_leaf_zap != 0) { char zkey[32]; (void) snprintf(zkey, sizeof (zkey), "%s-%llu", VDEV_REMOVAL_ZAP_OBJS, (u_longlong_t)vd->vdev_leaf_zap); fnvlist_add_uint64(zlist, zkey, vd->vdev_leaf_zap); } for (uint64_t id = 0; id < vd->vdev_children; id++) { vdev_remove_enlist_zaps(vd->vdev_child[id], zlist); } } static void vdev_remove_replace_with_indirect(vdev_t *vd, uint64_t txg) { vdev_t *ivd; dmu_tx_t *tx; spa_t *spa = vd->vdev_spa; spa_vdev_removal_t *svr = spa->spa_vdev_removal; /* * First, build a list of leaf zaps to be destroyed. * This is passed to the sync context thread, * which does the actual unlinking. */ svr->svr_zaplist = fnvlist_alloc(); vdev_remove_enlist_zaps(vd, svr->svr_zaplist); ivd = vdev_add_parent(vd, &vdev_indirect_ops); ivd->vdev_removing = 0; vd->vdev_leaf_zap = 0; vdev_remove_child(ivd, vd); vdev_compact_children(ivd); ASSERT(!list_link_active(&vd->vdev_state_dirty_node)); mutex_enter(&svr->svr_lock); svr->svr_thread = NULL; cv_broadcast(&svr->svr_cv); mutex_exit(&svr->svr_lock); /* After this, we can not use svr. */ tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); - dsl_sync_task_nowait(spa->spa_dsl_pool, vdev_remove_complete_sync, svr, - 0, ZFS_SPACE_CHECK_NONE, tx); + dsl_sync_task_nowait(spa->spa_dsl_pool, + vdev_remove_complete_sync, svr, tx); dmu_tx_commit(tx); } /* * Complete the removal of a toplevel vdev. This is called in open * context by the removal thread after we have copied all vdev's data. */ static void vdev_remove_complete(spa_t *spa) { uint64_t txg; /* * Wait for any deferred frees to be synced before we call * vdev_metaslab_fini() */ txg_wait_synced(spa->spa_dsl_pool, 0); txg = spa_vdev_enter(spa); vdev_t *vd = vdev_lookup_top(spa, spa->spa_vdev_removal->svr_vdev_id); ASSERT3P(vd->vdev_initialize_thread, ==, NULL); ASSERT3P(vd->vdev_trim_thread, ==, NULL); ASSERT3P(vd->vdev_autotrim_thread, ==, NULL); sysevent_t *ev = spa_event_create(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE_DEV); zfs_dbgmsg("finishing device removal for vdev %llu in txg %llu", vd->vdev_id, txg); /* * Discard allocation state. */ if (vd->vdev_mg != NULL) { vdev_metaslab_fini(vd); metaslab_group_destroy(vd->vdev_mg); vd->vdev_mg = NULL; spa_log_sm_set_blocklimit(spa); } ASSERT0(vd->vdev_stat.vs_space); ASSERT0(vd->vdev_stat.vs_dspace); vdev_remove_replace_with_indirect(vd, txg); /* * We now release the locks, allowing spa_sync to run and finish the * removal via vdev_remove_complete_sync in syncing context. * * Note that we hold on to the vdev_t that has been replaced. Since * it isn't part of the vdev tree any longer, it can't be concurrently * manipulated, even while we don't have the config lock. */ (void) spa_vdev_exit(spa, NULL, txg, 0); /* * Top ZAP should have been transferred to the indirect vdev in * vdev_remove_replace_with_indirect. */ ASSERT0(vd->vdev_top_zap); /* * Leaf ZAP should have been moved in vdev_remove_replace_with_indirect. */ ASSERT0(vd->vdev_leaf_zap); txg = spa_vdev_enter(spa); (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); /* * Request to update the config and the config cachefile. */ vdev_config_dirty(spa->spa_root_vdev); (void) spa_vdev_exit(spa, vd, txg, 0); if (ev != NULL) spa_event_post(ev); } /* * Evacuates a segment of size at most max_alloc from the vdev * via repeated calls to spa_vdev_copy_segment. If an allocation * fails, the pool is probably too fragmented to handle such a * large size, so decrease max_alloc so that the caller will not try * this size again this txg. */ static void spa_vdev_copy_impl(vdev_t *vd, spa_vdev_removal_t *svr, vdev_copy_arg_t *vca, uint64_t *max_alloc, dmu_tx_t *tx) { uint64_t txg = dmu_tx_get_txg(tx); spa_t *spa = dmu_tx_pool(tx)->dp_spa; mutex_enter(&svr->svr_lock); /* * Determine how big of a chunk to copy. We can allocate up * to max_alloc bytes, and we can span up to vdev_removal_max_span * bytes of unallocated space at a time. "segs" will track the * allocated segments that we are copying. We may also be copying * free segments (of up to vdev_removal_max_span bytes). */ range_tree_t *segs = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); for (;;) { range_tree_t *rt = svr->svr_allocd_segs; range_seg_t *rs = range_tree_first(rt); if (rs == NULL) break; uint64_t seg_length; if (range_tree_is_empty(segs)) { /* need to truncate the first seg based on max_alloc */ seg_length = MIN(rs_get_end(rs, rt) - rs_get_start(rs, rt), *max_alloc); } else { if (rs_get_start(rs, rt) - range_tree_max(segs) > vdev_removal_max_span) { /* * Including this segment would cause us to * copy a larger unneeded chunk than is allowed. */ break; } else if (rs_get_end(rs, rt) - range_tree_min(segs) > *max_alloc) { /* * This additional segment would extend past * max_alloc. Rather than splitting this * segment, leave it for the next mapping. */ break; } else { seg_length = rs_get_end(rs, rt) - rs_get_start(rs, rt); } } range_tree_add(segs, rs_get_start(rs, rt), seg_length); range_tree_remove(svr->svr_allocd_segs, rs_get_start(rs, rt), seg_length); } if (range_tree_is_empty(segs)) { mutex_exit(&svr->svr_lock); range_tree_destroy(segs); return; } if (svr->svr_max_offset_to_sync[txg & TXG_MASK] == 0) { dsl_sync_task_nowait(dmu_tx_pool(tx), vdev_mapping_sync, - svr, 0, ZFS_SPACE_CHECK_NONE, tx); + svr, tx); } svr->svr_max_offset_to_sync[txg & TXG_MASK] = range_tree_max(segs); /* * Note: this is the amount of *allocated* space * that we are taking care of each txg. */ svr->svr_bytes_done[txg & TXG_MASK] += range_tree_space(segs); mutex_exit(&svr->svr_lock); zio_alloc_list_t zal; metaslab_trace_init(&zal); uint64_t thismax = SPA_MAXBLOCKSIZE; while (!range_tree_is_empty(segs)) { int error = spa_vdev_copy_segment(vd, segs, thismax, txg, vca, &zal); if (error == ENOSPC) { /* * Cut our segment in half, and don't try this * segment size again this txg. Note that the * allocation size must be aligned to the highest * ashift in the pool, so that the allocation will * not be padded out to a multiple of the ashift, * which could cause us to think that this mapping * is larger than we intended. */ ASSERT3U(spa->spa_max_ashift, >=, SPA_MINBLOCKSHIFT); ASSERT3U(spa->spa_max_ashift, ==, spa->spa_min_ashift); uint64_t attempted = MIN(range_tree_span(segs), thismax); thismax = P2ROUNDUP(attempted / 2, 1 << spa->spa_max_ashift); /* * The minimum-size allocation can not fail. */ ASSERT3U(attempted, >, 1 << spa->spa_max_ashift); *max_alloc = attempted - (1 << spa->spa_max_ashift); } else { ASSERT0(error); /* * We've performed an allocation, so reset the * alloc trace list. */ metaslab_trace_fini(&zal); metaslab_trace_init(&zal); } } metaslab_trace_fini(&zal); range_tree_destroy(segs); } /* * The size of each removal mapping is limited by the tunable * zfs_remove_max_segment, but we must adjust this to be a multiple of the * pool's ashift, so that we don't try to split individual sectors regardless * of the tunable value. (Note that device removal requires that all devices * have the same ashift, so there's no difference between spa_min_ashift and * spa_max_ashift.) The raw tunable should not be used elsewhere. */ uint64_t spa_remove_max_segment(spa_t *spa) { return (P2ROUNDUP(zfs_remove_max_segment, 1 << spa->spa_max_ashift)); } /* * The removal thread operates in open context. It iterates over all * allocated space in the vdev, by loading each metaslab's spacemap. * For each contiguous segment of allocated space (capping the segment * size at SPA_MAXBLOCKSIZE), we: * - Allocate space for it on another vdev. * - Create a new mapping from the old location to the new location * (as a record in svr_new_segments). * - Initiate a physical read zio to get the data off the removing disk. * - In the read zio's done callback, initiate a physical write zio to * write it to the new vdev. * Note that all of this will take effect when a particular TXG syncs. * The sync thread ensures that all the phys reads and writes for the syncing * TXG have completed (see spa_txg_zio) and writes the new mappings to disk * (see vdev_mapping_sync()). */ static void spa_vdev_remove_thread(void *arg) { spa_t *spa = arg; spa_vdev_removal_t *svr = spa->spa_vdev_removal; vdev_copy_arg_t vca; uint64_t max_alloc = spa_remove_max_segment(spa); uint64_t last_txg = 0; spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; uint64_t start_offset = vdev_indirect_mapping_max_offset(vim); ASSERT3P(vd->vdev_ops, !=, &vdev_indirect_ops); ASSERT(vdev_is_concrete(vd)); ASSERT(vd->vdev_removing); ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0); ASSERT(vim != NULL); mutex_init(&vca.vca_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&vca.vca_cv, NULL, CV_DEFAULT, NULL); vca.vca_outstanding_bytes = 0; vca.vca_read_error_bytes = 0; vca.vca_write_error_bytes = 0; mutex_enter(&svr->svr_lock); /* * Start from vim_max_offset so we pick up where we left off * if we are restarting the removal after opening the pool. */ uint64_t msi; for (msi = start_offset >> vd->vdev_ms_shift; msi < vd->vdev_ms_count && !svr->svr_thread_exit; msi++) { metaslab_t *msp = vd->vdev_ms[msi]; ASSERT3U(msi, <=, vd->vdev_ms_count); ASSERT0(range_tree_space(svr->svr_allocd_segs)); mutex_enter(&msp->ms_sync_lock); mutex_enter(&msp->ms_lock); /* * Assert nothing in flight -- ms_*tree is empty. */ for (int i = 0; i < TXG_SIZE; i++) { ASSERT0(range_tree_space(msp->ms_allocating[i])); } /* * If the metaslab has ever been allocated from (ms_sm!=NULL), * read the allocated segments from the space map object * into svr_allocd_segs. Since we do this while holding * svr_lock and ms_sync_lock, concurrent frees (which * would have modified the space map) will wait for us * to finish loading the spacemap, and then take the * appropriate action (see free_from_removing_vdev()). */ if (msp->ms_sm != NULL) { VERIFY0(space_map_load(msp->ms_sm, svr->svr_allocd_segs, SM_ALLOC)); range_tree_walk(msp->ms_unflushed_allocs, range_tree_add, svr->svr_allocd_segs); range_tree_walk(msp->ms_unflushed_frees, range_tree_remove, svr->svr_allocd_segs); range_tree_walk(msp->ms_freeing, range_tree_remove, svr->svr_allocd_segs); /* * When we are resuming from a paused removal (i.e. * when importing a pool with a removal in progress), * discard any state that we have already processed. */ range_tree_clear(svr->svr_allocd_segs, 0, start_offset); } mutex_exit(&msp->ms_lock); mutex_exit(&msp->ms_sync_lock); vca.vca_msp = msp; zfs_dbgmsg("copying %llu segments for metaslab %llu", zfs_btree_numnodes(&svr->svr_allocd_segs->rt_root), msp->ms_id); while (!svr->svr_thread_exit && !range_tree_is_empty(svr->svr_allocd_segs)) { mutex_exit(&svr->svr_lock); /* * We need to periodically drop the config lock so that * writers can get in. Additionally, we can't wait * for a txg to sync while holding a config lock * (since a waiting writer could cause a 3-way deadlock * with the sync thread, which also gets a config * lock for reader). So we can't hold the config lock * while calling dmu_tx_assign(). */ spa_config_exit(spa, SCL_CONFIG, FTAG); /* * This delay will pause the removal around the point * specified by zfs_removal_suspend_progress. We do this * solely from the test suite or during debugging. */ uint64_t bytes_copied = spa->spa_removing_phys.sr_copied; for (int i = 0; i < TXG_SIZE; i++) bytes_copied += svr->svr_bytes_done[i]; while (zfs_removal_suspend_progress && !svr->svr_thread_exit) delay(hz); mutex_enter(&vca.vca_lock); while (vca.vca_outstanding_bytes > zfs_remove_max_copy_bytes) { cv_wait(&vca.vca_cv, &vca.vca_lock); } mutex_exit(&vca.vca_lock); dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); uint64_t txg = dmu_tx_get_txg(tx); /* * Reacquire the vdev_config lock. The vdev_t * that we're removing may have changed, e.g. due * to a vdev_attach or vdev_detach. */ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); vd = vdev_lookup_top(spa, svr->svr_vdev_id); if (txg != last_txg) max_alloc = spa_remove_max_segment(spa); last_txg = txg; spa_vdev_copy_impl(vd, svr, &vca, &max_alloc, tx); dmu_tx_commit(tx); mutex_enter(&svr->svr_lock); } mutex_enter(&vca.vca_lock); if (zfs_removal_ignore_errors == 0 && (vca.vca_read_error_bytes > 0 || vca.vca_write_error_bytes > 0)) { svr->svr_thread_exit = B_TRUE; } mutex_exit(&vca.vca_lock); } mutex_exit(&svr->svr_lock); spa_config_exit(spa, SCL_CONFIG, FTAG); /* * Wait for all copies to finish before cleaning up the vca. */ txg_wait_synced(spa->spa_dsl_pool, 0); ASSERT0(vca.vca_outstanding_bytes); mutex_destroy(&vca.vca_lock); cv_destroy(&vca.vca_cv); if (svr->svr_thread_exit) { mutex_enter(&svr->svr_lock); range_tree_vacate(svr->svr_allocd_segs, NULL, NULL); svr->svr_thread = NULL; cv_broadcast(&svr->svr_cv); mutex_exit(&svr->svr_lock); /* * During the removal process an unrecoverable read or write * error was encountered. The removal process must be * cancelled or this damage may become permanent. */ if (zfs_removal_ignore_errors == 0 && (vca.vca_read_error_bytes > 0 || vca.vca_write_error_bytes > 0)) { zfs_dbgmsg("canceling removal due to IO errors: " "[read_error_bytes=%llu] [write_error_bytes=%llu]", vca.vca_read_error_bytes, vca.vca_write_error_bytes); spa_vdev_remove_cancel_impl(spa); } } else { ASSERT0(range_tree_space(svr->svr_allocd_segs)); vdev_remove_complete(spa); } thread_exit(); } void spa_vdev_remove_suspend(spa_t *spa) { spa_vdev_removal_t *svr = spa->spa_vdev_removal; if (svr == NULL) return; mutex_enter(&svr->svr_lock); svr->svr_thread_exit = B_TRUE; while (svr->svr_thread != NULL) cv_wait(&svr->svr_cv, &svr->svr_lock); svr->svr_thread_exit = B_FALSE; mutex_exit(&svr->svr_lock); } /* ARGSUSED */ static int spa_vdev_remove_cancel_check(void *arg, dmu_tx_t *tx) { spa_t *spa = dmu_tx_pool(tx)->dp_spa; if (spa->spa_vdev_removal == NULL) return (ENOTACTIVE); return (0); } /* * Cancel a removal by freeing all entries from the partial mapping * and marking the vdev as no longer being removing. */ /* ARGSUSED */ static void spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx) { spa_t *spa = dmu_tx_pool(tx)->dp_spa; spa_vdev_removal_t *svr = spa->spa_vdev_removal; vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); vdev_indirect_config_t *vic = &vd->vdev_indirect_config; vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; objset_t *mos = spa->spa_meta_objset; ASSERT3P(svr->svr_thread, ==, NULL); spa_feature_decr(spa, SPA_FEATURE_DEVICE_REMOVAL, tx); boolean_t are_precise; VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise)); if (are_precise) { spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, tx)); } uint64_t obsolete_sm_object; VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object)); if (obsolete_sm_object != 0) { ASSERT(vd->vdev_obsolete_sm != NULL); ASSERT3U(obsolete_sm_object, ==, space_map_object(vd->vdev_obsolete_sm)); space_map_free(vd->vdev_obsolete_sm, tx); VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, tx)); space_map_close(vd->vdev_obsolete_sm); vd->vdev_obsolete_sm = NULL; spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); } for (int i = 0; i < TXG_SIZE; i++) { ASSERT(list_is_empty(&svr->svr_new_segments[i])); ASSERT3U(svr->svr_max_offset_to_sync[i], <=, vdev_indirect_mapping_max_offset(vim)); } for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) { metaslab_t *msp = vd->vdev_ms[msi]; if (msp->ms_start >= vdev_indirect_mapping_max_offset(vim)) break; ASSERT0(range_tree_space(svr->svr_allocd_segs)); mutex_enter(&msp->ms_lock); /* * Assert nothing in flight -- ms_*tree is empty. */ for (int i = 0; i < TXG_SIZE; i++) ASSERT0(range_tree_space(msp->ms_allocating[i])); for (int i = 0; i < TXG_DEFER_SIZE; i++) ASSERT0(range_tree_space(msp->ms_defer[i])); ASSERT0(range_tree_space(msp->ms_freed)); if (msp->ms_sm != NULL) { mutex_enter(&svr->svr_lock); VERIFY0(space_map_load(msp->ms_sm, svr->svr_allocd_segs, SM_ALLOC)); range_tree_walk(msp->ms_unflushed_allocs, range_tree_add, svr->svr_allocd_segs); range_tree_walk(msp->ms_unflushed_frees, range_tree_remove, svr->svr_allocd_segs); range_tree_walk(msp->ms_freeing, range_tree_remove, svr->svr_allocd_segs); /* * Clear everything past what has been synced, * because we have not allocated mappings for it yet. */ uint64_t syncd = vdev_indirect_mapping_max_offset(vim); uint64_t sm_end = msp->ms_sm->sm_start + msp->ms_sm->sm_size; if (sm_end > syncd) range_tree_clear(svr->svr_allocd_segs, syncd, sm_end - syncd); mutex_exit(&svr->svr_lock); } mutex_exit(&msp->ms_lock); mutex_enter(&svr->svr_lock); range_tree_vacate(svr->svr_allocd_segs, free_mapped_segment_cb, vd); mutex_exit(&svr->svr_lock); } /* * Note: this must happen after we invoke free_mapped_segment_cb, * because it adds to the obsolete_segments. */ range_tree_vacate(vd->vdev_obsolete_segments, NULL, NULL); ASSERT3U(vic->vic_mapping_object, ==, vdev_indirect_mapping_object(vd->vdev_indirect_mapping)); vdev_indirect_mapping_close(vd->vdev_indirect_mapping); vd->vdev_indirect_mapping = NULL; vdev_indirect_mapping_free(mos, vic->vic_mapping_object, tx); vic->vic_mapping_object = 0; ASSERT3U(vic->vic_births_object, ==, vdev_indirect_births_object(vd->vdev_indirect_births)); vdev_indirect_births_close(vd->vdev_indirect_births); vd->vdev_indirect_births = NULL; vdev_indirect_births_free(mos, vic->vic_births_object, tx); vic->vic_births_object = 0; /* * We may have processed some frees from the removing vdev in this * txg, thus increasing svr_bytes_done; discard that here to * satisfy the assertions in spa_vdev_removal_destroy(). * Note that future txg's can not have any bytes_done, because * future TXG's are only modified from open context, and we have * already shut down the copying thread. */ svr->svr_bytes_done[dmu_tx_get_txg(tx) & TXG_MASK] = 0; spa_finish_removal(spa, DSS_CANCELED, tx); vd->vdev_removing = B_FALSE; vdev_config_dirty(vd); zfs_dbgmsg("canceled device removal for vdev %llu in %llu", vd->vdev_id, dmu_tx_get_txg(tx)); spa_history_log_internal(spa, "vdev remove canceled", tx, "%s vdev %llu %s", spa_name(spa), (u_longlong_t)vd->vdev_id, (vd->vdev_path != NULL) ? vd->vdev_path : "-"); } static int spa_vdev_remove_cancel_impl(spa_t *spa) { uint64_t vdid = spa->spa_vdev_removal->svr_vdev_id; int error = dsl_sync_task(spa->spa_name, spa_vdev_remove_cancel_check, spa_vdev_remove_cancel_sync, NULL, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED); if (error == 0) { spa_config_enter(spa, SCL_ALLOC | SCL_VDEV, FTAG, RW_WRITER); vdev_t *vd = vdev_lookup_top(spa, vdid); metaslab_group_activate(vd->vdev_mg); spa_config_exit(spa, SCL_ALLOC | SCL_VDEV, FTAG); } return (error); } int spa_vdev_remove_cancel(spa_t *spa) { spa_vdev_remove_suspend(spa); if (spa->spa_vdev_removal == NULL) return (ENOTACTIVE); return (spa_vdev_remove_cancel_impl(spa)); } void svr_sync(spa_t *spa, dmu_tx_t *tx) { spa_vdev_removal_t *svr = spa->spa_vdev_removal; int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; if (svr == NULL) return; /* * This check is necessary so that we do not dirty the * DIRECTORY_OBJECT via spa_sync_removing_state() when there * is nothing to do. Dirtying it every time would prevent us * from syncing-to-convergence. */ if (svr->svr_bytes_done[txgoff] == 0) return; /* * Update progress accounting. */ spa->spa_removing_phys.sr_copied += svr->svr_bytes_done[txgoff]; svr->svr_bytes_done[txgoff] = 0; spa_sync_removing_state(spa, tx); } static void vdev_remove_make_hole_and_free(vdev_t *vd) { uint64_t id = vd->vdev_id; spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); vdev_free(vd); vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops); vdev_add_child(rvd, vd); vdev_config_dirty(rvd); /* * Reassess the health of our root vdev. */ vdev_reopen(rvd); } /* * Remove a log device. The config lock is held for the specified TXG. */ static int spa_vdev_remove_log(vdev_t *vd, uint64_t *txg) { metaslab_group_t *mg = vd->vdev_mg; spa_t *spa = vd->vdev_spa; int error = 0; ASSERT(vd->vdev_islog); ASSERT(vd == vd->vdev_top); ASSERT(MUTEX_HELD(&spa_namespace_lock)); /* * Stop allocating from this vdev. */ metaslab_group_passivate(mg); /* * Wait for the youngest allocations and frees to sync, * and then wait for the deferral of those frees to finish. */ spa_vdev_config_exit(spa, NULL, *txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); /* * Cancel any initialize or TRIM which was in progress. */ vdev_initialize_stop_all(vd, VDEV_INITIALIZE_CANCELED); vdev_trim_stop_all(vd, VDEV_TRIM_CANCELED); vdev_autotrim_stop_wait(vd); /* * Evacuate the device. We don't hold the config lock as * writer since we need to do I/O but we do keep the * spa_namespace_lock held. Once this completes the device * should no longer have any blocks allocated on it. */ ASSERT(MUTEX_HELD(&spa_namespace_lock)); if (vd->vdev_stat.vs_alloc != 0) error = spa_reset_logs(spa); *txg = spa_vdev_config_enter(spa); if (error != 0) { metaslab_group_activate(mg); return (error); } ASSERT0(vd->vdev_stat.vs_alloc); /* * The evacuation succeeded. Remove any remaining MOS metadata * associated with this vdev, and wait for these changes to sync. */ vd->vdev_removing = B_TRUE; vdev_dirty_leaves(vd, VDD_DTL, *txg); vdev_config_dirty(vd); /* * When the log space map feature is enabled we look at * the vdev's top_zap to find the on-disk flush data of * the metaslab we just flushed. Thus, while removing a * log vdev we make sure to call vdev_metaslab_fini() * first, which removes all metaslabs of this vdev from * spa_metaslabs_by_flushed before vdev_remove_empty() * destroys the top_zap of this log vdev. * * This avoids the scenario where we flush a metaslab * from the log vdev being removed that doesn't have a * top_zap and end up failing to lookup its on-disk flush * data. * * We don't call metaslab_group_destroy() right away * though (it will be called in vdev_free() later) as * during metaslab_sync() of metaslabs from other vdevs * we may touch the metaslab group of this vdev through * metaslab_class_histogram_verify() */ vdev_metaslab_fini(vd); spa_log_sm_set_blocklimit(spa); spa_vdev_config_exit(spa, NULL, *txg, 0, FTAG); *txg = spa_vdev_config_enter(spa); sysevent_t *ev = spa_event_create(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE_DEV); ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); /* The top ZAP should have been destroyed by vdev_remove_empty. */ ASSERT0(vd->vdev_top_zap); /* The leaf ZAP should have been destroyed by vdev_dtl_sync. */ ASSERT0(vd->vdev_leaf_zap); (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); if (list_link_active(&vd->vdev_state_dirty_node)) vdev_state_clean(vd); if (list_link_active(&vd->vdev_config_dirty_node)) vdev_config_clean(vd); ASSERT0(vd->vdev_stat.vs_alloc); /* * Clean up the vdev namespace. */ vdev_remove_make_hole_and_free(vd); if (ev != NULL) spa_event_post(ev); return (0); } static int spa_vdev_remove_top_check(vdev_t *vd) { spa_t *spa = vd->vdev_spa; if (vd != vd->vdev_top) return (SET_ERROR(ENOTSUP)); if (!vdev_is_concrete(vd)) return (SET_ERROR(ENOTSUP)); if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REMOVAL)) return (SET_ERROR(ENOTSUP)); /* available space in the pool's normal class */ uint64_t available = dsl_dir_space_available( spa->spa_dsl_pool->dp_root_dir, NULL, 0, B_TRUE); metaslab_class_t *mc = vd->vdev_mg->mg_class; /* * When removing a vdev from an allocation class that has * remaining vdevs, include available space from the class. */ if (mc != spa_normal_class(spa) && mc->mc_groups > 1) { uint64_t class_avail = metaslab_class_get_space(mc) - metaslab_class_get_alloc(mc); /* add class space, adjusted for overhead */ available += (class_avail * 94) / 100; } /* * There has to be enough free space to remove the * device and leave double the "slop" space (i.e. we * must leave at least 3% of the pool free, in addition to * the normal slop space). */ if (available < vd->vdev_stat.vs_dspace + spa_get_slop_space(spa)) { return (SET_ERROR(ENOSPC)); } /* * There can not be a removal in progress. */ if (spa->spa_removing_phys.sr_state == DSS_SCANNING) return (SET_ERROR(EBUSY)); /* * The device must have all its data. */ if (!vdev_dtl_empty(vd, DTL_MISSING) || !vdev_dtl_empty(vd, DTL_OUTAGE)) return (SET_ERROR(EBUSY)); /* * The device must be healthy. */ if (!vdev_readable(vd)) return (SET_ERROR(EIO)); /* * All vdevs in normal class must have the same ashift. */ if (spa->spa_max_ashift != spa->spa_min_ashift) { return (SET_ERROR(EINVAL)); } /* * All vdevs in normal class must have the same ashift * and not be raidz. */ vdev_t *rvd = spa->spa_root_vdev; int num_indirect = 0; for (uint64_t id = 0; id < rvd->vdev_children; id++) { vdev_t *cvd = rvd->vdev_child[id]; if (cvd->vdev_ashift != 0 && !cvd->vdev_islog) ASSERT3U(cvd->vdev_ashift, ==, spa->spa_max_ashift); if (cvd->vdev_ops == &vdev_indirect_ops) num_indirect++; if (!vdev_is_concrete(cvd)) continue; if (cvd->vdev_ops == &vdev_raidz_ops) return (SET_ERROR(EINVAL)); /* * Need the mirror to be mirror of leaf vdevs only */ if (cvd->vdev_ops == &vdev_mirror_ops) { for (uint64_t cid = 0; cid < cvd->vdev_children; cid++) { if (!cvd->vdev_child[cid]->vdev_ops-> vdev_op_leaf) return (SET_ERROR(EINVAL)); } } } return (0); } /* * Initiate removal of a top-level vdev, reducing the total space in the pool. * The config lock is held for the specified TXG. Once initiated, * evacuation of all allocated space (copying it to other vdevs) happens * in the background (see spa_vdev_remove_thread()), and can be canceled * (see spa_vdev_remove_cancel()). If successful, the vdev will * be transformed to an indirect vdev (see spa_vdev_remove_complete()). */ static int spa_vdev_remove_top(vdev_t *vd, uint64_t *txg) { spa_t *spa = vd->vdev_spa; int error; /* * Check for errors up-front, so that we don't waste time * passivating the metaslab group and clearing the ZIL if there * are errors. */ error = spa_vdev_remove_top_check(vd); if (error != 0) return (error); /* * Stop allocating from this vdev. Note that we must check * that this is not the only device in the pool before * passivating, otherwise we will not be able to make * progress because we can't allocate from any vdevs. * The above check for sufficient free space serves this * purpose. */ metaslab_group_t *mg = vd->vdev_mg; metaslab_group_passivate(mg); /* * Wait for the youngest allocations and frees to sync, * and then wait for the deferral of those frees to finish. */ spa_vdev_config_exit(spa, NULL, *txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); /* * We must ensure that no "stubby" log blocks are allocated * on the device to be removed. These blocks could be * written at any time, including while we are in the middle * of copying them. */ error = spa_reset_logs(spa); /* * We stop any initializing and TRIM that is currently in progress * but leave the state as "active". This will allow the process to * resume if the removal is canceled sometime later. */ vdev_initialize_stop_all(vd, VDEV_INITIALIZE_ACTIVE); vdev_trim_stop_all(vd, VDEV_TRIM_ACTIVE); vdev_autotrim_stop_wait(vd); *txg = spa_vdev_config_enter(spa); /* * Things might have changed while the config lock was dropped * (e.g. space usage). Check for errors again. */ if (error == 0) error = spa_vdev_remove_top_check(vd); if (error != 0) { metaslab_group_activate(mg); spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART); spa_async_request(spa, SPA_ASYNC_TRIM_RESTART); spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART); return (error); } vd->vdev_removing = B_TRUE; vdev_dirty_leaves(vd, VDD_DTL, *txg); vdev_config_dirty(vd); dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, *txg); dsl_sync_task_nowait(spa->spa_dsl_pool, - vdev_remove_initiate_sync, - (void *)(uintptr_t)vd->vdev_id, 0, ZFS_SPACE_CHECK_NONE, tx); + vdev_remove_initiate_sync, (void *)(uintptr_t)vd->vdev_id, tx); dmu_tx_commit(tx); return (0); } /* * Remove a device from the pool. * * Removing a device from the vdev namespace requires several steps * and can take a significant amount of time. As a result we use * the spa_vdev_config_[enter/exit] functions which allow us to * grab and release the spa_config_lock while still holding the namespace * lock. During each step the configuration is synced out. */ int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) { vdev_t *vd; nvlist_t **spares, **l2cache, *nv; uint64_t txg = 0; uint_t nspares, nl2cache; int error = 0, error_log; boolean_t locked = MUTEX_HELD(&spa_namespace_lock); sysevent_t *ev = NULL; char *vd_type = NULL, *vd_path = NULL; ASSERT(spa_writeable(spa)); if (!locked) txg = spa_vdev_enter(spa); ASSERT(MUTEX_HELD(&spa_namespace_lock)); if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { error = (spa_has_checkpoint(spa)) ? ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; if (!locked) return (spa_vdev_exit(spa, NULL, txg, error)); return (error); } vd = spa_lookup_by_guid(spa, guid, B_FALSE); if (spa->spa_spares.sav_vdevs != NULL && nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 && (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) { /* * Only remove the hot spare if it's not currently in use * in this pool. */ if (vd == NULL || unspare) { if (vd == NULL) vd = spa_lookup_by_guid(spa, guid, B_TRUE); ev = spa_event_create(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE_AUX); vd_type = VDEV_TYPE_SPARE; vd_path = spa_strdup(fnvlist_lookup_string( nv, ZPOOL_CONFIG_PATH)); spa_vdev_remove_aux(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, spares, nspares, nv); spa_load_spares(spa); spa->spa_spares.sav_sync = B_TRUE; } else { error = SET_ERROR(EBUSY); } } else if (spa->spa_l2cache.sav_vdevs != NULL && nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 && (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) { vd_type = VDEV_TYPE_L2CACHE; vd_path = spa_strdup(fnvlist_lookup_string( nv, ZPOOL_CONFIG_PATH)); /* * Cache devices can always be removed. */ vd = spa_lookup_by_guid(spa, guid, B_TRUE); /* * Stop trimming the cache device. We need to release the * config lock to allow the syncing of TRIM transactions * without releasing the spa_namespace_lock. The same * strategy is employed in spa_vdev_remove_top(). */ spa_vdev_config_exit(spa, NULL, txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); mutex_enter(&vd->vdev_trim_lock); vdev_trim_stop(vd, VDEV_TRIM_CANCELED, NULL); mutex_exit(&vd->vdev_trim_lock); txg = spa_vdev_config_enter(spa); ev = spa_event_create(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE_AUX); spa_vdev_remove_aux(spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv); spa_load_l2cache(spa); spa->spa_l2cache.sav_sync = B_TRUE; } else if (vd != NULL && vd->vdev_islog) { ASSERT(!locked); vd_type = VDEV_TYPE_LOG; vd_path = spa_strdup((vd->vdev_path != NULL) ? vd->vdev_path : "-"); error = spa_vdev_remove_log(vd, &txg); } else if (vd != NULL) { ASSERT(!locked); error = spa_vdev_remove_top(vd, &txg); } else { /* * There is no vdev of any kind with the specified guid. */ error = SET_ERROR(ENOENT); } error_log = error; if (!locked) error = spa_vdev_exit(spa, NULL, txg, error); /* * Logging must be done outside the spa config lock. Otherwise, * this code path could end up holding the spa config lock while * waiting for a txg_sync so it can write to the internal log. * Doing that would prevent the txg sync from actually happening, * causing a deadlock. */ if (error_log == 0 && vd_type != NULL && vd_path != NULL) { spa_history_log_internal(spa, "vdev remove", NULL, "%s vdev (%s) %s", spa_name(spa), vd_type, vd_path); } if (vd_path != NULL) spa_strfree(vd_path); if (ev != NULL) spa_event_post(ev); return (error); } int spa_removal_get_stats(spa_t *spa, pool_removal_stat_t *prs) { prs->prs_state = spa->spa_removing_phys.sr_state; if (prs->prs_state == DSS_NONE) return (SET_ERROR(ENOENT)); prs->prs_removing_vdev = spa->spa_removing_phys.sr_removing_vdev; prs->prs_start_time = spa->spa_removing_phys.sr_start_time; prs->prs_end_time = spa->spa_removing_phys.sr_end_time; prs->prs_to_copy = spa->spa_removing_phys.sr_to_copy; prs->prs_copied = spa->spa_removing_phys.sr_copied; prs->prs_mapping_memory = 0; uint64_t indirect_vdev_id = spa->spa_removing_phys.sr_prev_indirect_vdev; while (indirect_vdev_id != -1) { vdev_t *vd = spa->spa_root_vdev->vdev_child[indirect_vdev_id]; vdev_indirect_config_t *vic = &vd->vdev_indirect_config; vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); prs->prs_mapping_memory += vdev_indirect_mapping_size(vim); indirect_vdev_id = vic->vic_prev_indirect_vdev; } return (0); } /* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs_vdev, zfs_, removal_ignore_errors, INT, ZMOD_RW, "Ignore hard IO errors when removing device"); ZFS_MODULE_PARAM(zfs_vdev, zfs_, remove_max_segment, INT, ZMOD_RW, "Largest contiguous segment to allocate when removing device"); ZFS_MODULE_PARAM(zfs_vdev, vdev_, removal_max_span, INT, ZMOD_RW, "Largest span of free chunks a remap segment can span"); ZFS_MODULE_PARAM(zfs_vdev, zfs_, removal_suspend_progress, INT, ZMOD_RW, "Pause device removal after this many bytes are copied " "(debug use only - causes removal to hang)"); /* END CSTYLED */ EXPORT_SYMBOL(free_from_removing_vdev); EXPORT_SYMBOL(spa_removal_get_stats); EXPORT_SYMBOL(spa_remove_init); EXPORT_SYMBOL(spa_restart_removal); EXPORT_SYMBOL(spa_vdev_removal_destroy); EXPORT_SYMBOL(spa_vdev_remove); EXPORT_SYMBOL(spa_vdev_remove_cancel); EXPORT_SYMBOL(spa_vdev_remove_suspend); EXPORT_SYMBOL(svr_sync); diff --git a/module/zfs/vdev_trim.c b/module/zfs/vdev_trim.c index 7383e1493371..02b42ddd5a6c 100644 --- a/module/zfs/vdev_trim.c +++ b/module/zfs/vdev_trim.c @@ -1,1701 +1,1700 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2016 by Delphix. All rights reserved. * Copyright (c) 2019 by Lawrence Livermore National Security, LLC. */ #include #include #include #include #include #include #include #include #include #include /* * TRIM is a feature which is used to notify a SSD that some previously * written space is no longer allocated by the pool. This is useful because * writes to a SSD must be performed to blocks which have first been erased. * Ensuring the SSD always has a supply of erased blocks for new writes * helps prevent the performance from deteriorating. * * There are two supported TRIM methods; manual and automatic. * * Manual TRIM: * * A manual TRIM is initiated by running the 'zpool trim' command. A single * 'vdev_trim' thread is created for each leaf vdev, and it is responsible for * managing that vdev TRIM process. This involves iterating over all the * metaslabs, calculating the unallocated space ranges, and then issuing the * required TRIM I/Os. * * While a metaslab is being actively trimmed it is not eligible to perform * new allocations. After traversing all of the metaslabs the thread is * terminated. Finally, both the requested options and current progress of * the TRIM are regularly written to the pool. This allows the TRIM to be * suspended and resumed as needed. * * Automatic TRIM: * * An automatic TRIM is enabled by setting the 'autotrim' pool property * to 'on'. When enabled, a `vdev_autotrim' thread is created for each * top-level (not leaf) vdev in the pool. These threads perform the same * core TRIM process as a manual TRIM, but with a few key differences. * * 1) Automatic TRIM happens continuously in the background and operates * solely on recently freed blocks (ms_trim not ms_allocatable). * * 2) Each thread is associated with a top-level (not leaf) vdev. This has * the benefit of simplifying the threading model, it makes it easier * to coordinate administrative commands, and it ensures only a single * metaslab is disabled at a time. Unlike manual TRIM, this means each * 'vdev_autotrim' thread is responsible for issuing TRIM I/Os for its * children. * * 3) There is no automatic TRIM progress information stored on disk, nor * is it reported by 'zpool status'. * * While the automatic TRIM process is highly effective it is more likely * than a manual TRIM to encounter tiny ranges. Ranges less than or equal to * 'zfs_trim_extent_bytes_min' (32k) are considered too small to efficiently * TRIM and are skipped. This means small amounts of freed space may not * be automatically trimmed. * * Furthermore, devices with attached hot spares and devices being actively * replaced are skipped. This is done to avoid adding additional stress to * a potentially unhealthy device and to minimize the required rebuild time. * * For this reason it may be beneficial to occasionally manually TRIM a pool * even when automatic TRIM is enabled. */ /* * Maximum size of TRIM I/O, ranges will be chunked in to 128MiB lengths. */ unsigned int zfs_trim_extent_bytes_max = 128 * 1024 * 1024; /* * Minimum size of TRIM I/O, extents smaller than 32Kib will be skipped. */ unsigned int zfs_trim_extent_bytes_min = 32 * 1024; /* * Skip uninitialized metaslabs during the TRIM process. This option is * useful for pools constructed from large thinly-provisioned devices where * TRIM operations are slow. As a pool ages an increasing fraction of * the pools metaslabs will be initialized progressively degrading the * usefulness of this option. This setting is stored when starting a * manual TRIM and will persist for the duration of the requested TRIM. */ unsigned int zfs_trim_metaslab_skip = 0; /* * Maximum number of queued TRIM I/Os per leaf vdev. The number of * concurrent TRIM I/Os issued to the device is controlled by the * zfs_vdev_trim_min_active and zfs_vdev_trim_max_active module options. */ unsigned int zfs_trim_queue_limit = 10; /* * The minimum number of transaction groups between automatic trims of a * metaslab. This setting represents a trade-off between issuing more * efficient TRIM operations, by allowing them to be aggregated longer, * and issuing them promptly so the trimmed space is available. Note * that this value is a minimum; metaslabs can be trimmed less frequently * when there are a large number of ranges which need to be trimmed. * * Increasing this value will allow frees to be aggregated for a longer * time. This can result is larger TRIM operations, and increased memory * usage in order to track the ranges to be trimmed. Decreasing this value * has the opposite effect. The default value of 32 was determined though * testing to be a reasonable compromise. */ unsigned int zfs_trim_txg_batch = 32; /* * The trim_args are a control structure which describe how a leaf vdev * should be trimmed. The core elements are the vdev, the metaslab being * trimmed and a range tree containing the extents to TRIM. All provided * ranges must be within the metaslab. */ typedef struct trim_args { /* * These fields are set by the caller of vdev_trim_ranges(). */ vdev_t *trim_vdev; /* Leaf vdev to TRIM */ metaslab_t *trim_msp; /* Disabled metaslab */ range_tree_t *trim_tree; /* TRIM ranges (in metaslab) */ trim_type_t trim_type; /* Manual or auto TRIM */ uint64_t trim_extent_bytes_max; /* Maximum TRIM I/O size */ uint64_t trim_extent_bytes_min; /* Minimum TRIM I/O size */ enum trim_flag trim_flags; /* TRIM flags (secure) */ /* * These fields are updated by vdev_trim_ranges(). */ hrtime_t trim_start_time; /* Start time */ uint64_t trim_bytes_done; /* Bytes trimmed */ } trim_args_t; /* * Determines whether a vdev_trim_thread() should be stopped. */ static boolean_t vdev_trim_should_stop(vdev_t *vd) { return (vd->vdev_trim_exit_wanted || !vdev_writeable(vd) || vd->vdev_detached || vd->vdev_top->vdev_removing); } /* * Determines whether a vdev_autotrim_thread() should be stopped. */ static boolean_t vdev_autotrim_should_stop(vdev_t *tvd) { return (tvd->vdev_autotrim_exit_wanted || !vdev_writeable(tvd) || tvd->vdev_removing || spa_get_autotrim(tvd->vdev_spa) == SPA_AUTOTRIM_OFF); } /* * The sync task for updating the on-disk state of a manual TRIM. This * is scheduled by vdev_trim_change_state(). */ static void vdev_trim_zap_update_sync(void *arg, dmu_tx_t *tx) { /* * We pass in the guid instead of the vdev_t since the vdev may * have been freed prior to the sync task being processed. This * happens when a vdev is detached as we call spa_config_vdev_exit(), * stop the trimming thread, schedule the sync task, and free * the vdev. Later when the scheduled sync task is invoked, it would * find that the vdev has been freed. */ uint64_t guid = *(uint64_t *)arg; uint64_t txg = dmu_tx_get_txg(tx); kmem_free(arg, sizeof (uint64_t)); vdev_t *vd = spa_lookup_by_guid(tx->tx_pool->dp_spa, guid, B_FALSE); if (vd == NULL || vd->vdev_top->vdev_removing || !vdev_is_concrete(vd)) return; uint64_t last_offset = vd->vdev_trim_offset[txg & TXG_MASK]; vd->vdev_trim_offset[txg & TXG_MASK] = 0; VERIFY3U(vd->vdev_leaf_zap, !=, 0); objset_t *mos = vd->vdev_spa->spa_meta_objset; if (last_offset > 0 || vd->vdev_trim_last_offset == UINT64_MAX) { if (vd->vdev_trim_last_offset == UINT64_MAX) last_offset = 0; vd->vdev_trim_last_offset = last_offset; VERIFY0(zap_update(mos, vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_LAST_OFFSET, sizeof (last_offset), 1, &last_offset, tx)); } if (vd->vdev_trim_action_time > 0) { uint64_t val = (uint64_t)vd->vdev_trim_action_time; VERIFY0(zap_update(mos, vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_ACTION_TIME, sizeof (val), 1, &val, tx)); } if (vd->vdev_trim_rate > 0) { uint64_t rate = (uint64_t)vd->vdev_trim_rate; if (rate == UINT64_MAX) rate = 0; VERIFY0(zap_update(mos, vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_RATE, sizeof (rate), 1, &rate, tx)); } uint64_t partial = vd->vdev_trim_partial; if (partial == UINT64_MAX) partial = 0; VERIFY0(zap_update(mos, vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_PARTIAL, sizeof (partial), 1, &partial, tx)); uint64_t secure = vd->vdev_trim_secure; if (secure == UINT64_MAX) secure = 0; VERIFY0(zap_update(mos, vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_SECURE, sizeof (secure), 1, &secure, tx)); uint64_t trim_state = vd->vdev_trim_state; VERIFY0(zap_update(mos, vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_STATE, sizeof (trim_state), 1, &trim_state, tx)); } /* * Update the on-disk state of a manual TRIM. This is called to request * that a TRIM be started/suspended/canceled, or to change one of the * TRIM options (partial, secure, rate). */ static void vdev_trim_change_state(vdev_t *vd, vdev_trim_state_t new_state, uint64_t rate, boolean_t partial, boolean_t secure) { ASSERT(MUTEX_HELD(&vd->vdev_trim_lock)); spa_t *spa = vd->vdev_spa; if (new_state == vd->vdev_trim_state) return; /* * Copy the vd's guid, this will be freed by the sync task. */ uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP); *guid = vd->vdev_guid; /* * If we're suspending, then preserve the original start time. */ if (vd->vdev_trim_state != VDEV_TRIM_SUSPENDED) { vd->vdev_trim_action_time = gethrestime_sec(); } /* * If we're activating, then preserve the requested rate and trim * method. Setting the last offset and rate to UINT64_MAX is used * as a sentinel to indicate they should be reset to default values. */ if (new_state == VDEV_TRIM_ACTIVE) { if (vd->vdev_trim_state == VDEV_TRIM_COMPLETE || vd->vdev_trim_state == VDEV_TRIM_CANCELED) { vd->vdev_trim_last_offset = UINT64_MAX; vd->vdev_trim_rate = UINT64_MAX; vd->vdev_trim_partial = UINT64_MAX; vd->vdev_trim_secure = UINT64_MAX; } if (rate != 0) vd->vdev_trim_rate = rate; if (partial != 0) vd->vdev_trim_partial = partial; if (secure != 0) vd->vdev_trim_secure = secure; } boolean_t resumed = !!(vd->vdev_trim_state == VDEV_TRIM_SUSPENDED); vd->vdev_trim_state = new_state; dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); dsl_sync_task_nowait(spa_get_dsl(spa), vdev_trim_zap_update_sync, - guid, 2, ZFS_SPACE_CHECK_NONE, tx); + guid, tx); switch (new_state) { case VDEV_TRIM_ACTIVE: spa_event_notify(spa, vd, NULL, resumed ? ESC_ZFS_TRIM_RESUME : ESC_ZFS_TRIM_START); spa_history_log_internal(spa, "trim", tx, "vdev=%s activated", vd->vdev_path); break; case VDEV_TRIM_SUSPENDED: spa_event_notify(spa, vd, NULL, ESC_ZFS_TRIM_SUSPEND); spa_history_log_internal(spa, "trim", tx, "vdev=%s suspended", vd->vdev_path); break; case VDEV_TRIM_CANCELED: spa_event_notify(spa, vd, NULL, ESC_ZFS_TRIM_CANCEL); spa_history_log_internal(spa, "trim", tx, "vdev=%s canceled", vd->vdev_path); break; case VDEV_TRIM_COMPLETE: spa_event_notify(spa, vd, NULL, ESC_ZFS_TRIM_FINISH); spa_history_log_internal(spa, "trim", tx, "vdev=%s complete", vd->vdev_path); break; default: panic("invalid state %llu", (unsigned long long)new_state); } dmu_tx_commit(tx); if (new_state != VDEV_TRIM_ACTIVE) spa_notify_waiters(spa); } /* * The zio_done_func_t done callback for each manual TRIM issued. It is * responsible for updating the TRIM stats, reissuing failed TRIM I/Os, * and limiting the number of in flight TRIM I/Os. */ static void vdev_trim_cb(zio_t *zio) { vdev_t *vd = zio->io_vd; mutex_enter(&vd->vdev_trim_io_lock); if (zio->io_error == ENXIO && !vdev_writeable(vd)) { /* * The I/O failed because the vdev was unavailable; roll the * last offset back. (This works because spa_sync waits on * spa_txg_zio before it runs sync tasks.) */ uint64_t *offset = &vd->vdev_trim_offset[zio->io_txg & TXG_MASK]; *offset = MIN(*offset, zio->io_offset); } else { if (zio->io_error != 0) { vd->vdev_stat.vs_trim_errors++; spa_iostats_trim_add(vd->vdev_spa, TRIM_TYPE_MANUAL, 0, 0, 0, 0, 1, zio->io_orig_size); } else { spa_iostats_trim_add(vd->vdev_spa, TRIM_TYPE_MANUAL, 1, zio->io_orig_size, 0, 0, 0, 0); } vd->vdev_trim_bytes_done += zio->io_orig_size; } ASSERT3U(vd->vdev_trim_inflight[TRIM_TYPE_MANUAL], >, 0); vd->vdev_trim_inflight[TRIM_TYPE_MANUAL]--; cv_broadcast(&vd->vdev_trim_io_cv); mutex_exit(&vd->vdev_trim_io_lock); spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd); } /* * The zio_done_func_t done callback for each automatic TRIM issued. It * is responsible for updating the TRIM stats and limiting the number of * in flight TRIM I/Os. Automatic TRIM I/Os are best effort and are * never reissued on failure. */ static void vdev_autotrim_cb(zio_t *zio) { vdev_t *vd = zio->io_vd; mutex_enter(&vd->vdev_trim_io_lock); if (zio->io_error != 0) { vd->vdev_stat.vs_trim_errors++; spa_iostats_trim_add(vd->vdev_spa, TRIM_TYPE_AUTO, 0, 0, 0, 0, 1, zio->io_orig_size); } else { spa_iostats_trim_add(vd->vdev_spa, TRIM_TYPE_AUTO, 1, zio->io_orig_size, 0, 0, 0, 0); } ASSERT3U(vd->vdev_trim_inflight[TRIM_TYPE_AUTO], >, 0); vd->vdev_trim_inflight[TRIM_TYPE_AUTO]--; cv_broadcast(&vd->vdev_trim_io_cv); mutex_exit(&vd->vdev_trim_io_lock); spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd); } /* * The zio_done_func_t done callback for each TRIM issued via * vdev_trim_simple(). It is responsible for updating the TRIM stats and * limiting the number of in flight TRIM I/Os. Simple TRIM I/Os are best * effort and are never reissued on failure. */ static void vdev_trim_simple_cb(zio_t *zio) { vdev_t *vd = zio->io_vd; mutex_enter(&vd->vdev_trim_io_lock); if (zio->io_error != 0) { vd->vdev_stat.vs_trim_errors++; spa_iostats_trim_add(vd->vdev_spa, TRIM_TYPE_SIMPLE, 0, 0, 0, 0, 1, zio->io_orig_size); } else { spa_iostats_trim_add(vd->vdev_spa, TRIM_TYPE_SIMPLE, 1, zio->io_orig_size, 0, 0, 0, 0); } ASSERT3U(vd->vdev_trim_inflight[TRIM_TYPE_SIMPLE], >, 0); vd->vdev_trim_inflight[TRIM_TYPE_SIMPLE]--; cv_broadcast(&vd->vdev_trim_io_cv); mutex_exit(&vd->vdev_trim_io_lock); spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd); } /* * Returns the average trim rate in bytes/sec for the ta->trim_vdev. */ static uint64_t vdev_trim_calculate_rate(trim_args_t *ta) { return (ta->trim_bytes_done * 1000 / (NSEC2MSEC(gethrtime() - ta->trim_start_time) + 1)); } /* * Issues a physical TRIM and takes care of rate limiting (bytes/sec) * and number of concurrent TRIM I/Os. */ static int vdev_trim_range(trim_args_t *ta, uint64_t start, uint64_t size) { vdev_t *vd = ta->trim_vdev; spa_t *spa = vd->vdev_spa; void *cb; mutex_enter(&vd->vdev_trim_io_lock); /* * Limit manual TRIM I/Os to the requested rate. This does not * apply to automatic TRIM since no per vdev rate can be specified. */ if (ta->trim_type == TRIM_TYPE_MANUAL) { while (vd->vdev_trim_rate != 0 && !vdev_trim_should_stop(vd) && vdev_trim_calculate_rate(ta) > vd->vdev_trim_rate) { cv_timedwait_idle(&vd->vdev_trim_io_cv, &vd->vdev_trim_io_lock, ddi_get_lbolt() + MSEC_TO_TICK(10)); } } ta->trim_bytes_done += size; /* Limit in flight trimming I/Os */ while (vd->vdev_trim_inflight[0] + vd->vdev_trim_inflight[1] + vd->vdev_trim_inflight[2] >= zfs_trim_queue_limit) { cv_wait(&vd->vdev_trim_io_cv, &vd->vdev_trim_io_lock); } vd->vdev_trim_inflight[ta->trim_type]++; mutex_exit(&vd->vdev_trim_io_lock); dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); uint64_t txg = dmu_tx_get_txg(tx); spa_config_enter(spa, SCL_STATE_ALL, vd, RW_READER); mutex_enter(&vd->vdev_trim_lock); if (ta->trim_type == TRIM_TYPE_MANUAL && vd->vdev_trim_offset[txg & TXG_MASK] == 0) { uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP); *guid = vd->vdev_guid; /* This is the first write of this txg. */ dsl_sync_task_nowait(spa_get_dsl(spa), - vdev_trim_zap_update_sync, guid, 2, - ZFS_SPACE_CHECK_RESERVED, tx); + vdev_trim_zap_update_sync, guid, tx); } /* * We know the vdev_t will still be around since all consumers of * vdev_free must stop the trimming first. */ if ((ta->trim_type == TRIM_TYPE_MANUAL && vdev_trim_should_stop(vd)) || (ta->trim_type == TRIM_TYPE_AUTO && vdev_autotrim_should_stop(vd->vdev_top))) { mutex_enter(&vd->vdev_trim_io_lock); vd->vdev_trim_inflight[ta->trim_type]--; mutex_exit(&vd->vdev_trim_io_lock); spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd); mutex_exit(&vd->vdev_trim_lock); dmu_tx_commit(tx); return (SET_ERROR(EINTR)); } mutex_exit(&vd->vdev_trim_lock); if (ta->trim_type == TRIM_TYPE_MANUAL) vd->vdev_trim_offset[txg & TXG_MASK] = start + size; if (ta->trim_type == TRIM_TYPE_MANUAL) { cb = vdev_trim_cb; } else if (ta->trim_type == TRIM_TYPE_AUTO) { cb = vdev_autotrim_cb; } else { cb = vdev_trim_simple_cb; } zio_nowait(zio_trim(spa->spa_txg_zio[txg & TXG_MASK], vd, start, size, cb, NULL, ZIO_PRIORITY_TRIM, ZIO_FLAG_CANFAIL, ta->trim_flags)); /* vdev_trim_cb and vdev_autotrim_cb release SCL_STATE_ALL */ dmu_tx_commit(tx); return (0); } /* * Issues TRIM I/Os for all ranges in the provided ta->trim_tree range tree. * Additional parameters describing how the TRIM should be performed must * be set in the trim_args structure. See the trim_args definition for * additional information. */ static int vdev_trim_ranges(trim_args_t *ta) { vdev_t *vd = ta->trim_vdev; zfs_btree_t *t = &ta->trim_tree->rt_root; zfs_btree_index_t idx; uint64_t extent_bytes_max = ta->trim_extent_bytes_max; uint64_t extent_bytes_min = ta->trim_extent_bytes_min; spa_t *spa = vd->vdev_spa; ta->trim_start_time = gethrtime(); ta->trim_bytes_done = 0; for (range_seg_t *rs = zfs_btree_first(t, &idx); rs != NULL; rs = zfs_btree_next(t, &idx, &idx)) { uint64_t size = rs_get_end(rs, ta->trim_tree) - rs_get_start(rs, ta->trim_tree); if (extent_bytes_min && size < extent_bytes_min) { spa_iostats_trim_add(spa, ta->trim_type, 0, 0, 1, size, 0, 0); continue; } /* Split range into legally-sized physical chunks */ uint64_t writes_required = ((size - 1) / extent_bytes_max) + 1; for (uint64_t w = 0; w < writes_required; w++) { int error; error = vdev_trim_range(ta, VDEV_LABEL_START_SIZE + rs_get_start(rs, ta->trim_tree) + (w *extent_bytes_max), MIN(size - (w * extent_bytes_max), extent_bytes_max)); if (error != 0) { return (error); } } } return (0); } /* * Calculates the completion percentage of a manual TRIM. */ static void vdev_trim_calculate_progress(vdev_t *vd) { ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) || spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER)); ASSERT(vd->vdev_leaf_zap != 0); vd->vdev_trim_bytes_est = 0; vd->vdev_trim_bytes_done = 0; for (uint64_t i = 0; i < vd->vdev_top->vdev_ms_count; i++) { metaslab_t *msp = vd->vdev_top->vdev_ms[i]; mutex_enter(&msp->ms_lock); uint64_t ms_free = msp->ms_size - metaslab_allocated_space(msp); if (vd->vdev_top->vdev_ops == &vdev_raidz_ops) ms_free /= vd->vdev_top->vdev_children; /* * Convert the metaslab range to a physical range * on our vdev. We use this to determine if we are * in the middle of this metaslab range. */ range_seg64_t logical_rs, physical_rs; logical_rs.rs_start = msp->ms_start; logical_rs.rs_end = msp->ms_start + msp->ms_size; vdev_xlate(vd, &logical_rs, &physical_rs); if (vd->vdev_trim_last_offset <= physical_rs.rs_start) { vd->vdev_trim_bytes_est += ms_free; mutex_exit(&msp->ms_lock); continue; } else if (vd->vdev_trim_last_offset > physical_rs.rs_end) { vd->vdev_trim_bytes_done += ms_free; vd->vdev_trim_bytes_est += ms_free; mutex_exit(&msp->ms_lock); continue; } /* * If we get here, we're in the middle of trimming this * metaslab. Load it and walk the free tree for more * accurate progress estimation. */ VERIFY0(metaslab_load(msp)); range_tree_t *rt = msp->ms_allocatable; zfs_btree_t *bt = &rt->rt_root; zfs_btree_index_t idx; for (range_seg_t *rs = zfs_btree_first(bt, &idx); rs != NULL; rs = zfs_btree_next(bt, &idx, &idx)) { logical_rs.rs_start = rs_get_start(rs, rt); logical_rs.rs_end = rs_get_end(rs, rt); vdev_xlate(vd, &logical_rs, &physical_rs); uint64_t size = physical_rs.rs_end - physical_rs.rs_start; vd->vdev_trim_bytes_est += size; if (vd->vdev_trim_last_offset >= physical_rs.rs_end) { vd->vdev_trim_bytes_done += size; } else if (vd->vdev_trim_last_offset > physical_rs.rs_start && vd->vdev_trim_last_offset <= physical_rs.rs_end) { vd->vdev_trim_bytes_done += vd->vdev_trim_last_offset - physical_rs.rs_start; } } mutex_exit(&msp->ms_lock); } } /* * Load from disk the vdev's manual TRIM information. This includes the * state, progress, and options provided when initiating the manual TRIM. */ static int vdev_trim_load(vdev_t *vd) { int err = 0; ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) || spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER)); ASSERT(vd->vdev_leaf_zap != 0); if (vd->vdev_trim_state == VDEV_TRIM_ACTIVE || vd->vdev_trim_state == VDEV_TRIM_SUSPENDED) { err = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_LAST_OFFSET, sizeof (vd->vdev_trim_last_offset), 1, &vd->vdev_trim_last_offset); if (err == ENOENT) { vd->vdev_trim_last_offset = 0; err = 0; } if (err == 0) { err = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_RATE, sizeof (vd->vdev_trim_rate), 1, &vd->vdev_trim_rate); if (err == ENOENT) { vd->vdev_trim_rate = 0; err = 0; } } if (err == 0) { err = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_PARTIAL, sizeof (vd->vdev_trim_partial), 1, &vd->vdev_trim_partial); if (err == ENOENT) { vd->vdev_trim_partial = 0; err = 0; } } if (err == 0) { err = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_SECURE, sizeof (vd->vdev_trim_secure), 1, &vd->vdev_trim_secure); if (err == ENOENT) { vd->vdev_trim_secure = 0; err = 0; } } } vdev_trim_calculate_progress(vd); return (err); } /* * Convert the logical range into a physical range and add it to the * range tree passed in the trim_args_t. */ static void vdev_trim_range_add(void *arg, uint64_t start, uint64_t size) { trim_args_t *ta = arg; vdev_t *vd = ta->trim_vdev; range_seg64_t logical_rs, physical_rs; logical_rs.rs_start = start; logical_rs.rs_end = start + size; /* * Every range to be trimmed must be part of ms_allocatable. * When ZFS_DEBUG_TRIM is set load the metaslab to verify this * is always the case. */ if (zfs_flags & ZFS_DEBUG_TRIM) { metaslab_t *msp = ta->trim_msp; VERIFY0(metaslab_load(msp)); VERIFY3B(msp->ms_loaded, ==, B_TRUE); VERIFY(range_tree_contains(msp->ms_allocatable, start, size)); } ASSERT(vd->vdev_ops->vdev_op_leaf); vdev_xlate(vd, &logical_rs, &physical_rs); IMPLY(vd->vdev_top == vd, logical_rs.rs_start == physical_rs.rs_start); IMPLY(vd->vdev_top == vd, logical_rs.rs_end == physical_rs.rs_end); /* * Only a manual trim will be traversing the vdev sequentially. * For an auto trim all valid ranges should be added. */ if (ta->trim_type == TRIM_TYPE_MANUAL) { /* Only add segments that we have not visited yet */ if (physical_rs.rs_end <= vd->vdev_trim_last_offset) return; /* Pick up where we left off mid-range. */ if (vd->vdev_trim_last_offset > physical_rs.rs_start) { ASSERT3U(physical_rs.rs_end, >, vd->vdev_trim_last_offset); physical_rs.rs_start = vd->vdev_trim_last_offset; } } ASSERT3U(physical_rs.rs_end, >=, physical_rs.rs_start); /* * With raidz, it's possible that the logical range does not live on * this leaf vdev. We only add the physical range to this vdev's if it * has a length greater than 0. */ if (physical_rs.rs_end > physical_rs.rs_start) { range_tree_add(ta->trim_tree, physical_rs.rs_start, physical_rs.rs_end - physical_rs.rs_start); } else { ASSERT3U(physical_rs.rs_end, ==, physical_rs.rs_start); } } /* * Each manual TRIM thread is responsible for trimming the unallocated * space for each leaf vdev. This is accomplished by sequentially iterating * over its top-level metaslabs and issuing TRIM I/O for the space described * by its ms_allocatable. While a metaslab is undergoing trimming it is * not eligible for new allocations. */ static void vdev_trim_thread(void *arg) { vdev_t *vd = arg; spa_t *spa = vd->vdev_spa; trim_args_t ta; int error = 0; /* * The VDEV_LEAF_ZAP_TRIM_* entries may have been updated by * vdev_trim(). Wait for the updated values to be reflected * in the zap in order to start with the requested settings. */ txg_wait_synced(spa_get_dsl(vd->vdev_spa), 0); ASSERT(vdev_is_concrete(vd)); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); vd->vdev_trim_last_offset = 0; vd->vdev_trim_rate = 0; vd->vdev_trim_partial = 0; vd->vdev_trim_secure = 0; VERIFY0(vdev_trim_load(vd)); ta.trim_vdev = vd; ta.trim_extent_bytes_max = zfs_trim_extent_bytes_max; ta.trim_extent_bytes_min = zfs_trim_extent_bytes_min; ta.trim_tree = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); ta.trim_type = TRIM_TYPE_MANUAL; ta.trim_flags = 0; /* * When a secure TRIM has been requested infer that the intent * is that everything must be trimmed. Override the default * minimum TRIM size to prevent ranges from being skipped. */ if (vd->vdev_trim_secure) { ta.trim_flags |= ZIO_TRIM_SECURE; ta.trim_extent_bytes_min = SPA_MINBLOCKSIZE; } uint64_t ms_count = 0; for (uint64_t i = 0; !vd->vdev_detached && i < vd->vdev_top->vdev_ms_count; i++) { metaslab_t *msp = vd->vdev_top->vdev_ms[i]; /* * If we've expanded the top-level vdev or it's our * first pass, calculate our progress. */ if (vd->vdev_top->vdev_ms_count != ms_count) { vdev_trim_calculate_progress(vd); ms_count = vd->vdev_top->vdev_ms_count; } spa_config_exit(spa, SCL_CONFIG, FTAG); metaslab_disable(msp); mutex_enter(&msp->ms_lock); VERIFY0(metaslab_load(msp)); /* * If a partial TRIM was requested skip metaslabs which have * never been initialized and thus have never been written. */ if (msp->ms_sm == NULL && vd->vdev_trim_partial) { mutex_exit(&msp->ms_lock); metaslab_enable(msp, B_FALSE, B_FALSE); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); vdev_trim_calculate_progress(vd); continue; } ta.trim_msp = msp; range_tree_walk(msp->ms_allocatable, vdev_trim_range_add, &ta); range_tree_vacate(msp->ms_trim, NULL, NULL); mutex_exit(&msp->ms_lock); error = vdev_trim_ranges(&ta); metaslab_enable(msp, B_TRUE, B_FALSE); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); range_tree_vacate(ta.trim_tree, NULL, NULL); if (error != 0) break; } spa_config_exit(spa, SCL_CONFIG, FTAG); mutex_enter(&vd->vdev_trim_io_lock); while (vd->vdev_trim_inflight[0] > 0) { cv_wait(&vd->vdev_trim_io_cv, &vd->vdev_trim_io_lock); } mutex_exit(&vd->vdev_trim_io_lock); range_tree_destroy(ta.trim_tree); mutex_enter(&vd->vdev_trim_lock); if (!vd->vdev_trim_exit_wanted && vdev_writeable(vd)) { vdev_trim_change_state(vd, VDEV_TRIM_COMPLETE, vd->vdev_trim_rate, vd->vdev_trim_partial, vd->vdev_trim_secure); } ASSERT(vd->vdev_trim_thread != NULL || vd->vdev_trim_inflight[0] == 0); /* * Drop the vdev_trim_lock while we sync out the txg since it's * possible that a device might be trying to come online and must * check to see if it needs to restart a trim. That thread will be * holding the spa_config_lock which would prevent the txg_wait_synced * from completing. */ mutex_exit(&vd->vdev_trim_lock); txg_wait_synced(spa_get_dsl(spa), 0); mutex_enter(&vd->vdev_trim_lock); vd->vdev_trim_thread = NULL; cv_broadcast(&vd->vdev_trim_cv); mutex_exit(&vd->vdev_trim_lock); thread_exit(); } /* * Initiates a manual TRIM for the vdev_t. Callers must hold vdev_trim_lock, * the vdev_t must be a leaf and cannot already be manually trimming. */ void vdev_trim(vdev_t *vd, uint64_t rate, boolean_t partial, boolean_t secure) { ASSERT(MUTEX_HELD(&vd->vdev_trim_lock)); ASSERT(vd->vdev_ops->vdev_op_leaf); ASSERT(vdev_is_concrete(vd)); ASSERT3P(vd->vdev_trim_thread, ==, NULL); ASSERT(!vd->vdev_detached); ASSERT(!vd->vdev_trim_exit_wanted); ASSERT(!vd->vdev_top->vdev_removing); vdev_trim_change_state(vd, VDEV_TRIM_ACTIVE, rate, partial, secure); vd->vdev_trim_thread = thread_create(NULL, 0, vdev_trim_thread, vd, 0, &p0, TS_RUN, maxclsyspri); } /* * Wait for the trimming thread to be terminated (canceled or stopped). */ static void vdev_trim_stop_wait_impl(vdev_t *vd) { ASSERT(MUTEX_HELD(&vd->vdev_trim_lock)); while (vd->vdev_trim_thread != NULL) cv_wait(&vd->vdev_trim_cv, &vd->vdev_trim_lock); ASSERT3P(vd->vdev_trim_thread, ==, NULL); vd->vdev_trim_exit_wanted = B_FALSE; } /* * Wait for vdev trim threads which were listed to cleanly exit. */ void vdev_trim_stop_wait(spa_t *spa, list_t *vd_list) { vdev_t *vd; ASSERT(MUTEX_HELD(&spa_namespace_lock)); while ((vd = list_remove_head(vd_list)) != NULL) { mutex_enter(&vd->vdev_trim_lock); vdev_trim_stop_wait_impl(vd); mutex_exit(&vd->vdev_trim_lock); } } /* * Stop trimming a device, with the resultant trimming state being tgt_state. * For blocking behavior pass NULL for vd_list. Otherwise, when a list_t is * provided the stopping vdev is inserted in to the list. Callers are then * required to call vdev_trim_stop_wait() to block for all the trim threads * to exit. The caller must hold vdev_trim_lock and must not be writing to * the spa config, as the trimming thread may try to enter the config as a * reader before exiting. */ void vdev_trim_stop(vdev_t *vd, vdev_trim_state_t tgt_state, list_t *vd_list) { ASSERT(!spa_config_held(vd->vdev_spa, SCL_CONFIG|SCL_STATE, RW_WRITER)); ASSERT(MUTEX_HELD(&vd->vdev_trim_lock)); ASSERT(vd->vdev_ops->vdev_op_leaf); ASSERT(vdev_is_concrete(vd)); /* * Allow cancel requests to proceed even if the trim thread has * stopped. */ if (vd->vdev_trim_thread == NULL && tgt_state != VDEV_TRIM_CANCELED) return; vdev_trim_change_state(vd, tgt_state, 0, 0, 0); vd->vdev_trim_exit_wanted = B_TRUE; if (vd_list == NULL) { vdev_trim_stop_wait_impl(vd); } else { ASSERT(MUTEX_HELD(&spa_namespace_lock)); list_insert_tail(vd_list, vd); } } /* * Requests that all listed vdevs stop trimming. */ static void vdev_trim_stop_all_impl(vdev_t *vd, vdev_trim_state_t tgt_state, list_t *vd_list) { if (vd->vdev_ops->vdev_op_leaf && vdev_is_concrete(vd)) { mutex_enter(&vd->vdev_trim_lock); vdev_trim_stop(vd, tgt_state, vd_list); mutex_exit(&vd->vdev_trim_lock); return; } for (uint64_t i = 0; i < vd->vdev_children; i++) { vdev_trim_stop_all_impl(vd->vdev_child[i], tgt_state, vd_list); } } /* * Convenience function to stop trimming of a vdev tree and set all trim * thread pointers to NULL. */ void vdev_trim_stop_all(vdev_t *vd, vdev_trim_state_t tgt_state) { spa_t *spa = vd->vdev_spa; list_t vd_list; vdev_t *vd_l2cache; ASSERT(MUTEX_HELD(&spa_namespace_lock)); list_create(&vd_list, sizeof (vdev_t), offsetof(vdev_t, vdev_trim_node)); vdev_trim_stop_all_impl(vd, tgt_state, &vd_list); /* * Iterate over cache devices and request stop trimming the * whole device in case we export the pool or remove the cache * device prematurely. */ for (int i = 0; i < spa->spa_l2cache.sav_count; i++) { vd_l2cache = spa->spa_l2cache.sav_vdevs[i]; vdev_trim_stop_all_impl(vd_l2cache, tgt_state, &vd_list); } vdev_trim_stop_wait(spa, &vd_list); if (vd->vdev_spa->spa_sync_on) { /* Make sure that our state has been synced to disk */ txg_wait_synced(spa_get_dsl(vd->vdev_spa), 0); } list_destroy(&vd_list); } /* * Conditionally restarts a manual TRIM given its on-disk state. */ void vdev_trim_restart(vdev_t *vd) { ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(!spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER)); if (vd->vdev_leaf_zap != 0) { mutex_enter(&vd->vdev_trim_lock); uint64_t trim_state = VDEV_TRIM_NONE; int err = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_STATE, sizeof (trim_state), 1, &trim_state); ASSERT(err == 0 || err == ENOENT); vd->vdev_trim_state = trim_state; uint64_t timestamp = 0; err = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_ACTION_TIME, sizeof (timestamp), 1, ×tamp); ASSERT(err == 0 || err == ENOENT); vd->vdev_trim_action_time = timestamp; if (vd->vdev_trim_state == VDEV_TRIM_SUSPENDED || vd->vdev_offline) { /* load progress for reporting, but don't resume */ VERIFY0(vdev_trim_load(vd)); } else if (vd->vdev_trim_state == VDEV_TRIM_ACTIVE && vdev_writeable(vd) && !vd->vdev_top->vdev_removing && vd->vdev_trim_thread == NULL) { VERIFY0(vdev_trim_load(vd)); vdev_trim(vd, vd->vdev_trim_rate, vd->vdev_trim_partial, vd->vdev_trim_secure); } mutex_exit(&vd->vdev_trim_lock); } for (uint64_t i = 0; i < vd->vdev_children; i++) { vdev_trim_restart(vd->vdev_child[i]); } } /* * Used by the automatic TRIM when ZFS_DEBUG_TRIM is set to verify that * every TRIM range is contained within ms_allocatable. */ static void vdev_trim_range_verify(void *arg, uint64_t start, uint64_t size) { trim_args_t *ta = arg; metaslab_t *msp = ta->trim_msp; VERIFY3B(msp->ms_loaded, ==, B_TRUE); VERIFY3U(msp->ms_disabled, >, 0); VERIFY(range_tree_contains(msp->ms_allocatable, start, size)); } /* * Each automatic TRIM thread is responsible for managing the trimming of a * top-level vdev in the pool. No automatic TRIM state is maintained on-disk. * * N.B. This behavior is different from a manual TRIM where a thread * is created for each leaf vdev, instead of each top-level vdev. */ static void vdev_autotrim_thread(void *arg) { vdev_t *vd = arg; spa_t *spa = vd->vdev_spa; int shift = 0; mutex_enter(&vd->vdev_autotrim_lock); ASSERT3P(vd->vdev_top, ==, vd); ASSERT3P(vd->vdev_autotrim_thread, !=, NULL); mutex_exit(&vd->vdev_autotrim_lock); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); uint64_t extent_bytes_max = zfs_trim_extent_bytes_max; uint64_t extent_bytes_min = zfs_trim_extent_bytes_min; while (!vdev_autotrim_should_stop(vd)) { int txgs_per_trim = MAX(zfs_trim_txg_batch, 1); boolean_t issued_trim = B_FALSE; /* * All of the metaslabs are divided in to groups of size * num_metaslabs / zfs_trim_txg_batch. Each of these groups * is composed of metaslabs which are spread evenly over the * device. * * For example, when zfs_trim_txg_batch = 32 (default) then * group 0 will contain metaslabs 0, 32, 64, ...; * group 1 will contain metaslabs 1, 33, 65, ...; * group 2 will contain metaslabs 2, 34, 66, ...; and so on. * * On each pass through the while() loop one of these groups * is selected. This is accomplished by using a shift value * to select the starting metaslab, then striding over the * metaslabs using the zfs_trim_txg_batch size. This is * done to accomplish two things. * * 1) By dividing the metaslabs in to groups, and making sure * that each group takes a minimum of one txg to process. * Then zfs_trim_txg_batch controls the minimum number of * txgs which must occur before a metaslab is revisited. * * 2) Selecting non-consecutive metaslabs distributes the * TRIM commands for a group evenly over the entire device. * This can be advantageous for certain types of devices. */ for (uint64_t i = shift % txgs_per_trim; i < vd->vdev_ms_count; i += txgs_per_trim) { metaslab_t *msp = vd->vdev_ms[i]; range_tree_t *trim_tree; spa_config_exit(spa, SCL_CONFIG, FTAG); metaslab_disable(msp); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); mutex_enter(&msp->ms_lock); /* * Skip the metaslab when it has never been allocated * or when there are no recent frees to trim. */ if (msp->ms_sm == NULL || range_tree_is_empty(msp->ms_trim)) { mutex_exit(&msp->ms_lock); metaslab_enable(msp, B_FALSE, B_FALSE); continue; } /* * Skip the metaslab when it has already been disabled. * This may happen when a manual TRIM or initialize * operation is running concurrently. In the case * of a manual TRIM, the ms_trim tree will have been * vacated. Only ranges added after the manual TRIM * disabled the metaslab will be included in the tree. * These will be processed when the automatic TRIM * next revisits this metaslab. */ if (msp->ms_disabled > 1) { mutex_exit(&msp->ms_lock); metaslab_enable(msp, B_FALSE, B_FALSE); continue; } /* * Allocate an empty range tree which is swapped in * for the existing ms_trim tree while it is processed. */ trim_tree = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); range_tree_swap(&msp->ms_trim, &trim_tree); ASSERT(range_tree_is_empty(msp->ms_trim)); /* * There are two cases when constructing the per-vdev * trim trees for a metaslab. If the top-level vdev * has no children then it is also a leaf and should * be trimmed. Otherwise our children are the leaves * and a trim tree should be constructed for each. */ trim_args_t *tap; uint64_t children = vd->vdev_children; if (children == 0) { children = 1; tap = kmem_zalloc(sizeof (trim_args_t) * children, KM_SLEEP); tap[0].trim_vdev = vd; } else { tap = kmem_zalloc(sizeof (trim_args_t) * children, KM_SLEEP); for (uint64_t c = 0; c < children; c++) { tap[c].trim_vdev = vd->vdev_child[c]; } } for (uint64_t c = 0; c < children; c++) { trim_args_t *ta = &tap[c]; vdev_t *cvd = ta->trim_vdev; ta->trim_msp = msp; ta->trim_extent_bytes_max = extent_bytes_max; ta->trim_extent_bytes_min = extent_bytes_min; ta->trim_type = TRIM_TYPE_AUTO; ta->trim_flags = 0; if (cvd->vdev_detached || !vdev_writeable(cvd) || !cvd->vdev_has_trim || cvd->vdev_trim_thread != NULL) { continue; } /* * When a device has an attached hot spare, or * is being replaced it will not be trimmed. * This is done to avoid adding additional * stress to a potentially unhealthy device, * and to minimize the required rebuild time. */ if (!cvd->vdev_ops->vdev_op_leaf) continue; ta->trim_tree = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); range_tree_walk(trim_tree, vdev_trim_range_add, ta); } mutex_exit(&msp->ms_lock); spa_config_exit(spa, SCL_CONFIG, FTAG); /* * Issue the TRIM I/Os for all ranges covered by the * TRIM trees. These ranges are safe to TRIM because * no new allocations will be performed until the call * to metaslab_enabled() below. */ for (uint64_t c = 0; c < children; c++) { trim_args_t *ta = &tap[c]; /* * Always yield to a manual TRIM if one has * been started for the child vdev. */ if (ta->trim_tree == NULL || ta->trim_vdev->vdev_trim_thread != NULL) { continue; } /* * After this point metaslab_enable() must be * called with the sync flag set. This is done * here because vdev_trim_ranges() is allowed * to be interrupted (EINTR) before issuing all * of the required TRIM I/Os. */ issued_trim = B_TRUE; int error = vdev_trim_ranges(ta); if (error) break; } /* * Verify every range which was trimmed is still * contained within the ms_allocatable tree. */ if (zfs_flags & ZFS_DEBUG_TRIM) { mutex_enter(&msp->ms_lock); VERIFY0(metaslab_load(msp)); VERIFY3P(tap[0].trim_msp, ==, msp); range_tree_walk(trim_tree, vdev_trim_range_verify, &tap[0]); mutex_exit(&msp->ms_lock); } range_tree_vacate(trim_tree, NULL, NULL); range_tree_destroy(trim_tree); metaslab_enable(msp, issued_trim, B_FALSE); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); for (uint64_t c = 0; c < children; c++) { trim_args_t *ta = &tap[c]; if (ta->trim_tree == NULL) continue; range_tree_vacate(ta->trim_tree, NULL, NULL); range_tree_destroy(ta->trim_tree); } kmem_free(tap, sizeof (trim_args_t) * children); } spa_config_exit(spa, SCL_CONFIG, FTAG); /* * After completing the group of metaslabs wait for the next * open txg. This is done to make sure that a minimum of * zfs_trim_txg_batch txgs will occur before these metaslabs * are trimmed again. */ txg_wait_open(spa_get_dsl(spa), 0, issued_trim); shift++; spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); } for (uint64_t c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; mutex_enter(&cvd->vdev_trim_io_lock); while (cvd->vdev_trim_inflight[1] > 0) { cv_wait(&cvd->vdev_trim_io_cv, &cvd->vdev_trim_io_lock); } mutex_exit(&cvd->vdev_trim_io_lock); } spa_config_exit(spa, SCL_CONFIG, FTAG); /* * When exiting because the autotrim property was set to off, then * abandon any unprocessed ms_trim ranges to reclaim the memory. */ if (spa_get_autotrim(spa) == SPA_AUTOTRIM_OFF) { for (uint64_t i = 0; i < vd->vdev_ms_count; i++) { metaslab_t *msp = vd->vdev_ms[i]; mutex_enter(&msp->ms_lock); range_tree_vacate(msp->ms_trim, NULL, NULL); mutex_exit(&msp->ms_lock); } } mutex_enter(&vd->vdev_autotrim_lock); ASSERT(vd->vdev_autotrim_thread != NULL); vd->vdev_autotrim_thread = NULL; cv_broadcast(&vd->vdev_autotrim_cv); mutex_exit(&vd->vdev_autotrim_lock); thread_exit(); } /* * Starts an autotrim thread, if needed, for each top-level vdev which can be * trimmed. A top-level vdev which has been evacuated will never be trimmed. */ void vdev_autotrim(spa_t *spa) { vdev_t *root_vd = spa->spa_root_vdev; for (uint64_t i = 0; i < root_vd->vdev_children; i++) { vdev_t *tvd = root_vd->vdev_child[i]; mutex_enter(&tvd->vdev_autotrim_lock); if (vdev_writeable(tvd) && !tvd->vdev_removing && tvd->vdev_autotrim_thread == NULL) { ASSERT3P(tvd->vdev_top, ==, tvd); tvd->vdev_autotrim_thread = thread_create(NULL, 0, vdev_autotrim_thread, tvd, 0, &p0, TS_RUN, maxclsyspri); ASSERT(tvd->vdev_autotrim_thread != NULL); } mutex_exit(&tvd->vdev_autotrim_lock); } } /* * Wait for the vdev_autotrim_thread associated with the passed top-level * vdev to be terminated (canceled or stopped). */ void vdev_autotrim_stop_wait(vdev_t *tvd) { mutex_enter(&tvd->vdev_autotrim_lock); if (tvd->vdev_autotrim_thread != NULL) { tvd->vdev_autotrim_exit_wanted = B_TRUE; while (tvd->vdev_autotrim_thread != NULL) { cv_wait(&tvd->vdev_autotrim_cv, &tvd->vdev_autotrim_lock); } ASSERT3P(tvd->vdev_autotrim_thread, ==, NULL); tvd->vdev_autotrim_exit_wanted = B_FALSE; } mutex_exit(&tvd->vdev_autotrim_lock); } /* * Wait for all of the vdev_autotrim_thread associated with the pool to * be terminated (canceled or stopped). */ void vdev_autotrim_stop_all(spa_t *spa) { vdev_t *root_vd = spa->spa_root_vdev; for (uint64_t i = 0; i < root_vd->vdev_children; i++) vdev_autotrim_stop_wait(root_vd->vdev_child[i]); } /* * Conditionally restart all of the vdev_autotrim_thread's for the pool. */ void vdev_autotrim_restart(spa_t *spa) { ASSERT(MUTEX_HELD(&spa_namespace_lock)); if (spa->spa_autotrim) vdev_autotrim(spa); } static void vdev_trim_l2arc_thread(void *arg) { vdev_t *vd = arg; spa_t *spa = vd->vdev_spa; l2arc_dev_t *dev = l2arc_vdev_get(vd); trim_args_t ta; range_seg64_t physical_rs; ASSERT(vdev_is_concrete(vd)); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); vd->vdev_trim_last_offset = 0; vd->vdev_trim_rate = 0; vd->vdev_trim_partial = 0; vd->vdev_trim_secure = 0; bzero(&ta, sizeof (ta)); ta.trim_vdev = vd; ta.trim_tree = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); ta.trim_type = TRIM_TYPE_MANUAL; ta.trim_extent_bytes_max = zfs_trim_extent_bytes_max; ta.trim_extent_bytes_min = SPA_MINBLOCKSIZE; ta.trim_flags = 0; physical_rs.rs_start = vd->vdev_trim_bytes_done = 0; physical_rs.rs_end = vd->vdev_trim_bytes_est = vdev_get_min_asize(vd); range_tree_add(ta.trim_tree, physical_rs.rs_start, physical_rs.rs_end - physical_rs.rs_start); mutex_enter(&vd->vdev_trim_lock); vdev_trim_change_state(vd, VDEV_TRIM_ACTIVE, 0, 0, 0); mutex_exit(&vd->vdev_trim_lock); (void) vdev_trim_ranges(&ta); spa_config_exit(spa, SCL_CONFIG, FTAG); mutex_enter(&vd->vdev_trim_io_lock); while (vd->vdev_trim_inflight[TRIM_TYPE_MANUAL] > 0) { cv_wait(&vd->vdev_trim_io_cv, &vd->vdev_trim_io_lock); } mutex_exit(&vd->vdev_trim_io_lock); range_tree_vacate(ta.trim_tree, NULL, NULL); range_tree_destroy(ta.trim_tree); mutex_enter(&vd->vdev_trim_lock); if (!vd->vdev_trim_exit_wanted && vdev_writeable(vd)) { vdev_trim_change_state(vd, VDEV_TRIM_COMPLETE, vd->vdev_trim_rate, vd->vdev_trim_partial, vd->vdev_trim_secure); } ASSERT(vd->vdev_trim_thread != NULL || vd->vdev_trim_inflight[TRIM_TYPE_MANUAL] == 0); /* * Drop the vdev_trim_lock while we sync out the txg since it's * possible that a device might be trying to come online and * must check to see if it needs to restart a trim. That thread * will be holding the spa_config_lock which would prevent the * txg_wait_synced from completing. Same strategy as in * vdev_trim_thread(). */ mutex_exit(&vd->vdev_trim_lock); txg_wait_synced(spa_get_dsl(vd->vdev_spa), 0); mutex_enter(&vd->vdev_trim_lock); /* * Update the header of the cache device here, before * broadcasting vdev_trim_cv which may lead to the removal * of the device. The same applies for setting l2ad_trim_all to * false. */ spa_config_enter(vd->vdev_spa, SCL_L2ARC, vd, RW_READER); bzero(dev->l2ad_dev_hdr, dev->l2ad_dev_hdr_asize); l2arc_dev_hdr_update(dev); spa_config_exit(vd->vdev_spa, SCL_L2ARC, vd); vd->vdev_trim_thread = NULL; if (vd->vdev_trim_state == VDEV_TRIM_COMPLETE) dev->l2ad_trim_all = B_FALSE; cv_broadcast(&vd->vdev_trim_cv); mutex_exit(&vd->vdev_trim_lock); thread_exit(); } /* * Punches out TRIM threads for the L2ARC devices in a spa and assigns them * to vd->vdev_trim_thread variable. This facilitates the management of * trimming the whole cache device using TRIM_TYPE_MANUAL upon addition * to a pool or pool creation or when the header of the device is invalid. */ void vdev_trim_l2arc(spa_t *spa) { ASSERT(MUTEX_HELD(&spa_namespace_lock)); /* * Locate the spa's l2arc devices and kick off TRIM threads. */ for (int i = 0; i < spa->spa_l2cache.sav_count; i++) { vdev_t *vd = spa->spa_l2cache.sav_vdevs[i]; l2arc_dev_t *dev = l2arc_vdev_get(vd); if (dev == NULL || !dev->l2ad_trim_all) { /* * Don't attempt TRIM if the vdev is UNAVAIL or if the * cache device was not marked for whole device TRIM * (ie l2arc_trim_ahead = 0, or the L2ARC device header * is valid with trim_state = VDEV_TRIM_COMPLETE and * l2ad_log_entries > 0). */ continue; } mutex_enter(&vd->vdev_trim_lock); ASSERT(vd->vdev_ops->vdev_op_leaf); ASSERT(vdev_is_concrete(vd)); ASSERT3P(vd->vdev_trim_thread, ==, NULL); ASSERT(!vd->vdev_detached); ASSERT(!vd->vdev_trim_exit_wanted); ASSERT(!vd->vdev_top->vdev_removing); vdev_trim_change_state(vd, VDEV_TRIM_ACTIVE, 0, 0, 0); vd->vdev_trim_thread = thread_create(NULL, 0, vdev_trim_l2arc_thread, vd, 0, &p0, TS_RUN, maxclsyspri); mutex_exit(&vd->vdev_trim_lock); } } /* * A wrapper which calls vdev_trim_ranges(). It is intended to be called * on leaf vdevs. */ int vdev_trim_simple(vdev_t *vd, uint64_t start, uint64_t size) { trim_args_t ta; range_seg64_t physical_rs; int error; physical_rs.rs_start = start; physical_rs.rs_end = start + size; ASSERT(vdev_is_concrete(vd)); ASSERT(vd->vdev_ops->vdev_op_leaf); ASSERT(!vd->vdev_detached); ASSERT(!vd->vdev_top->vdev_removing); bzero(&ta, sizeof (ta)); ta.trim_vdev = vd; ta.trim_tree = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); ta.trim_type = TRIM_TYPE_SIMPLE; ta.trim_extent_bytes_max = zfs_trim_extent_bytes_max; ta.trim_extent_bytes_min = SPA_MINBLOCKSIZE; ta.trim_flags = 0; ASSERT3U(physical_rs.rs_end, >=, physical_rs.rs_start); if (physical_rs.rs_end > physical_rs.rs_start) { range_tree_add(ta.trim_tree, physical_rs.rs_start, physical_rs.rs_end - physical_rs.rs_start); } else { ASSERT3U(physical_rs.rs_end, ==, physical_rs.rs_start); } error = vdev_trim_ranges(&ta); mutex_enter(&vd->vdev_trim_io_lock); while (vd->vdev_trim_inflight[TRIM_TYPE_SIMPLE] > 0) { cv_wait(&vd->vdev_trim_io_cv, &vd->vdev_trim_io_lock); } mutex_exit(&vd->vdev_trim_io_lock); range_tree_vacate(ta.trim_tree, NULL, NULL); range_tree_destroy(ta.trim_tree); return (error); } EXPORT_SYMBOL(vdev_trim); EXPORT_SYMBOL(vdev_trim_stop); EXPORT_SYMBOL(vdev_trim_stop_all); EXPORT_SYMBOL(vdev_trim_stop_wait); EXPORT_SYMBOL(vdev_trim_restart); EXPORT_SYMBOL(vdev_autotrim); EXPORT_SYMBOL(vdev_autotrim_stop_all); EXPORT_SYMBOL(vdev_autotrim_stop_wait); EXPORT_SYMBOL(vdev_autotrim_restart); EXPORT_SYMBOL(vdev_trim_l2arc); EXPORT_SYMBOL(vdev_trim_simple); /* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs_trim, zfs_trim_, extent_bytes_max, UINT, ZMOD_RW, "Max size of TRIM commands, larger will be split"); ZFS_MODULE_PARAM(zfs_trim, zfs_trim_, extent_bytes_min, UINT, ZMOD_RW, "Min size of TRIM commands, smaller will be skipped"); ZFS_MODULE_PARAM(zfs_trim, zfs_trim_, metaslab_skip, UINT, ZMOD_RW, "Skip metaslabs which have never been initialized"); ZFS_MODULE_PARAM(zfs_trim, zfs_trim_, txg_batch, UINT, ZMOD_RW, "Min number of txgs to aggregate frees before issuing TRIM"); ZFS_MODULE_PARAM(zfs_trim, zfs_trim_, queue_limit, UINT, ZMOD_RW, "Max queued TRIMs outstanding per leaf vdev"); /* END CSTYLED */