diff --git a/include/sys/dsl_deadlist.h b/include/sys/dsl_deadlist.h index bb8248a667b1..64358bb5fc0b 100644 --- a/include/sys/dsl_deadlist.h +++ b/include/sys/dsl_deadlist.h @@ -1,116 +1,128 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2018, 2019 by Delphix. All rights reserved. 
*/ #ifndef _SYS_DSL_DEADLIST_H #define _SYS_DSL_DEADLIST_H #include #include #include #ifdef __cplusplus extern "C" { #endif struct dmu_buf; struct dsl_pool; struct dsl_dataset; typedef struct dsl_deadlist_phys { uint64_t dl_used; uint64_t dl_comp; uint64_t dl_uncomp; uint64_t dl_pad[37]; /* pad out to 320b for future expansion */ } dsl_deadlist_phys_t; typedef struct dsl_deadlist { objset_t *dl_os; uint64_t dl_object; - avl_tree_t dl_tree; + avl_tree_t dl_tree; /* contains dsl_deadlist_entry_t */ + avl_tree_t dl_cache; /* contains dsl_deadlist_cache_entry_t */ boolean_t dl_havetree; + boolean_t dl_havecache; struct dmu_buf *dl_dbuf; dsl_deadlist_phys_t *dl_phys; kmutex_t dl_lock; /* if it's the old on-disk format: */ bpobj_t dl_bpobj; boolean_t dl_oldfmt; } dsl_deadlist_t; +typedef struct dsl_deadlist_cache_entry { + avl_node_t dlce_node; + uint64_t dlce_mintxg; + uint64_t dlce_bpobj; + uint64_t dlce_bytes; + uint64_t dlce_comp; + uint64_t dlce_uncomp; +} dsl_deadlist_cache_entry_t; + typedef struct dsl_deadlist_entry { avl_node_t dle_node; uint64_t dle_mintxg; bpobj_t dle_bpobj; } dsl_deadlist_entry_t; typedef struct livelist_condense_entry { struct dsl_dataset *ds; dsl_deadlist_entry_t *first; dsl_deadlist_entry_t *next; boolean_t syncing; boolean_t cancelled; } livelist_condense_entry_t; extern unsigned long zfs_livelist_max_entries; extern int zfs_livelist_min_percent_shared; typedef int deadlist_iter_t(void *args, dsl_deadlist_entry_t *dle); void dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object); void dsl_deadlist_close(dsl_deadlist_t *dl); void dsl_deadlist_iterate(dsl_deadlist_t *dl, deadlist_iter_t func, void *arg); uint64_t dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx); void dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx); void dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, boolean_t free, dmu_tx_t *tx); int dsl_deadlist_insert_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx); int 
dsl_deadlist_insert_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx); void dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx); void dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx); void dsl_deadlist_remove_entry(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx); dsl_deadlist_entry_t *dsl_deadlist_first(dsl_deadlist_t *dl); dsl_deadlist_entry_t *dsl_deadlist_last(dsl_deadlist_t *dl); uint64_t dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg, uint64_t mrs_obj, dmu_tx_t *tx); void dsl_deadlist_space(dsl_deadlist_t *dl, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); void dsl_deadlist_space_range(dsl_deadlist_t *dl, uint64_t mintxg, uint64_t maxtxg, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); void dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx); void dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg, dmu_tx_t *tx); boolean_t dsl_deadlist_is_open(dsl_deadlist_t *dl); int dsl_process_sub_livelist(bpobj_t *bpobj, struct bplist *to_free, zthr_t *t, uint64_t *size); void dsl_deadlist_clear_entry(dsl_deadlist_entry_t *dle, dsl_deadlist_t *dl, dmu_tx_t *tx); +void dsl_deadlist_discard_tree(dsl_deadlist_t *dl); #ifdef __cplusplus } #endif #endif /* _SYS_DSL_DEADLIST_H */ diff --git a/module/zfs/dsl_deadlist.c b/module/zfs/dsl_deadlist.c index 25878f0ea42c..15a59315c27d 100644 --- a/module/zfs/dsl_deadlist.c +++ b/module/zfs/dsl_deadlist.c @@ -1,881 +1,1008 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. 
* * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2019 by Delphix. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ #include #include #include #include #include #include /* * Deadlist concurrency: * * Deadlists can only be modified from the syncing thread. * * Except for dsl_deadlist_insert(), it can only be modified with the * dp_config_rwlock held with RW_WRITER. * * The accessors (dsl_deadlist_space() and dsl_deadlist_space_range()) can * be called concurrently, from open context, with the dl_config_rwlock held * with RW_READER. * * Therefore, we only need to provide locking between dsl_deadlist_insert() and * the accessors, protecting: * dl_phys->dl_used,comp,uncomp * and protecting the dl_tree from being loaded. * The locking is provided by dl_lock. Note that locking on the bpobj_t * provides its own locking, and dl_oldfmt is immutable. */ /* * Livelist Overview * ================ * * Livelists use the same 'deadlist_t' struct as deadlists and are also used * to track blkptrs over the lifetime of a dataset. Livelists however, belong * to clones and track the blkptrs that are clone-specific (were born after * the clone's creation). The exception is embedded block pointers which are * not included in livelists because they do not need to be freed. * * When it comes time to delete the clone, the livelist provides a quick * reference as to what needs to be freed. For this reason, livelists also track * when clone-specific blkptrs are freed before deletion to prevent double * frees. 
Each blkptr in a livelist is marked as a FREE or an ALLOC and the * deletion algorithm iterates backwards over the livelist, matching * FREE/ALLOC pairs and then freeing those ALLOCs which remain. livelists * are also updated in the case when blkptrs are remapped: the old version * of the blkptr is cancelled out with a FREE and the new version is tracked * with an ALLOC. * * To bound the amount of memory required for deletion, livelists over a * certain size are spread over multiple entries. Entries are grouped by * birth txg so we can be sure the ALLOC/FREE pair for a given blkptr will * be in the same entry. This allows us to delete livelists incrementally * over multiple syncs, one entry at a time. * * During the lifetime of the clone, livelists can get extremely large. * Their size is managed by periodic condensing (preemptively cancelling out * FREE/ALLOC pairs). Livelists are disabled when a clone is promoted or when * the shared space between the clone and its origin is so small that it * doesn't make sense to use livelists anymore. */ /* * The threshold sublist size at which we create a new sub-livelist for the * next txg. However, since blkptrs of the same transaction group must be in * the same sub-list, the actual sublist size may exceed this. When picking the * size we had to balance the fact that larger sublists mean fewer sublists * (decreasing the cost of insertion) against the consideration that sublists * will be loaded into memory and shouldn't take up an inordinate amount of * space. We settled on ~500000 entries, corresponding to roughly 128M. */ unsigned long zfs_livelist_max_entries = 500000; /* * We can approximate how much of a performance gain a livelist will give us * based on the percentage of blocks shared between the clone and its origin. * 0 percent shared means that the clone has completely diverged and that the * old method is maximally effective: every read from the block tree will * result in lots of frees. 
Livelists give us gains when they track blocks * scattered across the tree, when one read in the old method might only * result in a few frees. Once the clone has been overwritten enough, * writes are no longer sparse and we'll no longer get much of a benefit from * tracking them with a livelist. We chose a lower limit of 75 percent shared * (25 percent overwritten). This means that 1/4 of all block pointers will be * freed (e.g. each read frees 256, out of a max of 1024) so we expect livelists * to make deletion 4x faster. Once the amount of shared space drops below this * threshold, the clone will revert to the old deletion method. */ int zfs_livelist_min_percent_shared = 75; - static int dsl_deadlist_compare(const void *arg1, const void *arg2) { - const dsl_deadlist_entry_t *dle1 = (const dsl_deadlist_entry_t *)arg1; - const dsl_deadlist_entry_t *dle2 = (const dsl_deadlist_entry_t *)arg2; + const dsl_deadlist_entry_t *dle1 = arg1; + const dsl_deadlist_entry_t *dle2 = arg2; return (AVL_CMP(dle1->dle_mintxg, dle2->dle_mintxg)); } +static int +dsl_deadlist_cache_compare(const void *arg1, const void *arg2) +{ + const dsl_deadlist_cache_entry_t *dlce1 = arg1; + const dsl_deadlist_cache_entry_t *dlce2 = arg2; + + return (AVL_CMP(dlce1->dlce_mintxg, dlce2->dlce_mintxg)); +} + static void dsl_deadlist_load_tree(dsl_deadlist_t *dl) { zap_cursor_t zc; zap_attribute_t za; ASSERT(MUTEX_HELD(&dl->dl_lock)); ASSERT(!dl->dl_oldfmt); + if (dl->dl_havecache) { + /* + * After loading the tree, the caller may modify the tree, + * e.g. to add or remove nodes, or to make a node no longer + * refer to the empty_bpobj. These changes would make the + * dl_cache incorrect. Therefore we discard the cache here, + * so that it can't become incorrect. 
+ */ + dsl_deadlist_cache_entry_t *dlce; + void *cookie = NULL; + while ((dlce = avl_destroy_nodes(&dl->dl_cache, &cookie)) + != NULL) { + kmem_free(dlce, sizeof (*dlce)); + } + avl_destroy(&dl->dl_cache); + dl->dl_havecache = B_FALSE; + } if (dl->dl_havetree) return; avl_create(&dl->dl_tree, dsl_deadlist_compare, sizeof (dsl_deadlist_entry_t), offsetof(dsl_deadlist_entry_t, dle_node)); for (zap_cursor_init(&zc, dl->dl_os, dl->dl_object); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { dsl_deadlist_entry_t *dle = kmem_alloc(sizeof (*dle), KM_SLEEP); dle->dle_mintxg = zfs_strtonum(za.za_name, NULL); - VERIFY0(bpobj_open(&dle->dle_bpobj, dl->dl_os, - za.za_first_integer)); + + /* + * Prefetch all the bpobj's so that we do that i/o + * in parallel. Then open them all in a second pass. + */ + dle->dle_bpobj.bpo_object = za.za_first_integer; + dmu_prefetch(dl->dl_os, dle->dle_bpobj.bpo_object, + 0, 0, 0, ZIO_PRIORITY_SYNC_READ); + avl_add(&dl->dl_tree, dle); } zap_cursor_fini(&zc); + + for (dsl_deadlist_entry_t *dle = avl_first(&dl->dl_tree); + dle != NULL; dle = AVL_NEXT(&dl->dl_tree, dle)) { + VERIFY0(bpobj_open(&dle->dle_bpobj, dl->dl_os, + dle->dle_bpobj.bpo_object)); + } dl->dl_havetree = B_TRUE; } +/* + * Load only the non-empty bpobj's into the dl_cache. The cache is an analog + * of the dl_tree, but contains only non-empty_bpobj nodes from the ZAP. It + * is used only for gathering space statistics. The dl_cache has two + * advantages over the dl_tree: + * + * 1. Loading the dl_cache is ~5x faster than loading the dl_tree (if it's + * mostly empty_bpobj's), due to less CPU overhead to open the empty_bpobj + * many times and to inquire about its (zero) space stats many times. + * + * 2. The dl_cache uses less memory than the dl_tree. 
We only need to load + * the dl_tree of snapshots when deleting a snapshot, after which we free the + * dl_tree with dsl_deadlist_discard_tree + */ +static void +dsl_deadlist_load_cache(dsl_deadlist_t *dl) +{ + zap_cursor_t zc; + zap_attribute_t za; + + ASSERT(MUTEX_HELD(&dl->dl_lock)); + + ASSERT(!dl->dl_oldfmt); + if (dl->dl_havecache) + return; + + uint64_t empty_bpobj = dmu_objset_pool(dl->dl_os)->dp_empty_bpobj; + + avl_create(&dl->dl_cache, dsl_deadlist_cache_compare, + sizeof (dsl_deadlist_cache_entry_t), + offsetof(dsl_deadlist_cache_entry_t, dlce_node)); + for (zap_cursor_init(&zc, dl->dl_os, dl->dl_object); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + if (za.za_first_integer == empty_bpobj) + continue; + dsl_deadlist_cache_entry_t *dlce = + kmem_zalloc(sizeof (*dlce), KM_SLEEP); + dlce->dlce_mintxg = zfs_strtonum(za.za_name, NULL); + + /* + * Prefetch all the bpobj's so that we do that i/o + * in parallel. Then open them all in a second pass. + */ + dlce->dlce_bpobj = za.za_first_integer; + dmu_prefetch(dl->dl_os, dlce->dlce_bpobj, + 0, 0, 0, ZIO_PRIORITY_SYNC_READ); + avl_add(&dl->dl_cache, dlce); + } + zap_cursor_fini(&zc); + + for (dsl_deadlist_cache_entry_t *dlce = avl_first(&dl->dl_cache); + dlce != NULL; dlce = AVL_NEXT(&dl->dl_cache, dlce)) { + bpobj_t bpo; + VERIFY0(bpobj_open(&bpo, dl->dl_os, dlce->dlce_bpobj)); + + VERIFY0(bpobj_space(&bpo, + &dlce->dlce_bytes, &dlce->dlce_comp, &dlce->dlce_uncomp)); + bpobj_close(&bpo); + } + dl->dl_havecache = B_TRUE; +} + +/* + * Discard the tree to save memory. 
+ */ +void +dsl_deadlist_discard_tree(dsl_deadlist_t *dl) +{ + mutex_enter(&dl->dl_lock); + + if (!dl->dl_havetree) { + mutex_exit(&dl->dl_lock); + return; + } + dsl_deadlist_entry_t *dle; + void *cookie = NULL; + while ((dle = avl_destroy_nodes(&dl->dl_tree, &cookie)) != NULL) { + bpobj_close(&dle->dle_bpobj); + kmem_free(dle, sizeof (*dle)); + } + avl_destroy(&dl->dl_tree); + + dl->dl_havetree = B_FALSE; + mutex_exit(&dl->dl_lock); +} + void dsl_deadlist_iterate(dsl_deadlist_t *dl, deadlist_iter_t func, void *args) { dsl_deadlist_entry_t *dle; ASSERT(dsl_deadlist_is_open(dl)); mutex_enter(&dl->dl_lock); dsl_deadlist_load_tree(dl); mutex_exit(&dl->dl_lock); for (dle = avl_first(&dl->dl_tree); dle != NULL; dle = AVL_NEXT(&dl->dl_tree, dle)) { if (func(args, dle) != 0) break; } } void dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object) { dmu_object_info_t doi; ASSERT(!dsl_deadlist_is_open(dl)); mutex_init(&dl->dl_lock, NULL, MUTEX_DEFAULT, NULL); dl->dl_os = os; dl->dl_object = object; VERIFY0(dmu_bonus_hold(os, object, dl, &dl->dl_dbuf)); dmu_object_info_from_db(dl->dl_dbuf, &doi); if (doi.doi_type == DMU_OT_BPOBJ) { dmu_buf_rele(dl->dl_dbuf, dl); dl->dl_dbuf = NULL; dl->dl_oldfmt = B_TRUE; VERIFY0(bpobj_open(&dl->dl_bpobj, os, object)); return; } dl->dl_oldfmt = B_FALSE; dl->dl_phys = dl->dl_dbuf->db_data; dl->dl_havetree = B_FALSE; + dl->dl_havecache = B_FALSE; } boolean_t dsl_deadlist_is_open(dsl_deadlist_t *dl) { return (dl->dl_os != NULL); } void dsl_deadlist_close(dsl_deadlist_t *dl) { - void *cookie = NULL; - dsl_deadlist_entry_t *dle; - ASSERT(dsl_deadlist_is_open(dl)); mutex_destroy(&dl->dl_lock); if (dl->dl_oldfmt) { dl->dl_oldfmt = B_FALSE; bpobj_close(&dl->dl_bpobj); dl->dl_os = NULL; dl->dl_object = 0; return; } if (dl->dl_havetree) { + dsl_deadlist_entry_t *dle; + void *cookie = NULL; while ((dle = avl_destroy_nodes(&dl->dl_tree, &cookie)) != NULL) { bpobj_close(&dle->dle_bpobj); kmem_free(dle, sizeof (*dle)); } 
avl_destroy(&dl->dl_tree); } + if (dl->dl_havecache) { + dsl_deadlist_cache_entry_t *dlce; + void *cookie = NULL; + while ((dlce = avl_destroy_nodes(&dl->dl_cache, &cookie)) + != NULL) { + kmem_free(dlce, sizeof (*dlce)); + } + avl_destroy(&dl->dl_cache); + } dmu_buf_rele(dl->dl_dbuf, dl); dl->dl_dbuf = NULL; dl->dl_phys = NULL; dl->dl_os = NULL; dl->dl_object = 0; } uint64_t dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx) { if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS) return (bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx)); return (zap_create(os, DMU_OT_DEADLIST, DMU_OT_DEADLIST_HDR, sizeof (dsl_deadlist_phys_t), tx)); } void dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx) { dmu_object_info_t doi; zap_cursor_t zc; zap_attribute_t za; VERIFY0(dmu_object_info(os, dlobj, &doi)); if (doi.doi_type == DMU_OT_BPOBJ) { bpobj_free(os, dlobj, tx); return; } for (zap_cursor_init(&zc, os, dlobj); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { uint64_t obj = za.za_first_integer; if (obj == dmu_objset_pool(os)->dp_empty_bpobj) bpobj_decr_empty(os, tx); else bpobj_free(os, obj, tx); } zap_cursor_fini(&zc); VERIFY0(dmu_object_free(os, dlobj, tx)); } static void dle_enqueue(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) { ASSERT(MUTEX_HELD(&dl->dl_lock)); if (dle->dle_bpobj.bpo_object == dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) { uint64_t obj = bpobj_alloc(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx); bpobj_close(&dle->dle_bpobj); bpobj_decr_empty(dl->dl_os, tx); VERIFY0(bpobj_open(&dle->dle_bpobj, dl->dl_os, obj)); VERIFY0(zap_update_int_key(dl->dl_os, dl->dl_object, dle->dle_mintxg, obj, tx)); } bpobj_enqueue(&dle->dle_bpobj, bp, bp_freed, tx); } static void dle_enqueue_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle, uint64_t obj, dmu_tx_t *tx) { ASSERT(MUTEX_HELD(&dl->dl_lock)); if (dle->dle_bpobj.bpo_object != dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) { 
bpobj_enqueue_subobj(&dle->dle_bpobj, obj, tx); } else { bpobj_close(&dle->dle_bpobj); bpobj_decr_empty(dl->dl_os, tx); VERIFY0(bpobj_open(&dle->dle_bpobj, dl->dl_os, obj)); VERIFY0(zap_update_int_key(dl->dl_os, dl->dl_object, dle->dle_mintxg, obj, tx)); } } void dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) { dsl_deadlist_entry_t dle_tofind; dsl_deadlist_entry_t *dle; avl_index_t where; if (dl->dl_oldfmt) { bpobj_enqueue(&dl->dl_bpobj, bp, bp_freed, tx); return; } mutex_enter(&dl->dl_lock); dsl_deadlist_load_tree(dl); dmu_buf_will_dirty(dl->dl_dbuf, tx); int sign = bp_freed ? -1 : +1; dl->dl_phys->dl_used += sign * bp_get_dsize_sync(dmu_objset_spa(dl->dl_os), bp); dl->dl_phys->dl_comp += sign * BP_GET_PSIZE(bp); dl->dl_phys->dl_uncomp += sign * BP_GET_UCSIZE(bp); dle_tofind.dle_mintxg = bp->blk_birth; dle = avl_find(&dl->dl_tree, &dle_tofind, &where); if (dle == NULL) dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE); else dle = AVL_PREV(&dl->dl_tree, dle); if (dle == NULL) { zfs_panic_recover("blkptr at %p has invalid BLK_BIRTH %llu", bp, (longlong_t)bp->blk_birth); dle = avl_first(&dl->dl_tree); } ASSERT3P(dle, !=, NULL); dle_enqueue(dl, dle, bp, bp_freed, tx); mutex_exit(&dl->dl_lock); } int dsl_deadlist_insert_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { dsl_deadlist_t *dl = arg; dsl_deadlist_insert(dl, bp, B_FALSE, tx); return (0); } int dsl_deadlist_insert_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { dsl_deadlist_t *dl = arg; dsl_deadlist_insert(dl, bp, B_TRUE, tx); return (0); } /* * Insert new key in deadlist, which must be > all current entries. * mintxg is not inclusive. 
*/ void dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx) { uint64_t obj; dsl_deadlist_entry_t *dle; if (dl->dl_oldfmt) return; dle = kmem_alloc(sizeof (*dle), KM_SLEEP); dle->dle_mintxg = mintxg; mutex_enter(&dl->dl_lock); dsl_deadlist_load_tree(dl); obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx); VERIFY0(bpobj_open(&dle->dle_bpobj, dl->dl_os, obj)); avl_add(&dl->dl_tree, dle); VERIFY0(zap_add_int_key(dl->dl_os, dl->dl_object, mintxg, obj, tx)); mutex_exit(&dl->dl_lock); } /* * Remove this key, merging its entries into the previous key. */ void dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx) { dsl_deadlist_entry_t dle_tofind; dsl_deadlist_entry_t *dle, *dle_prev; if (dl->dl_oldfmt) return; mutex_enter(&dl->dl_lock); dsl_deadlist_load_tree(dl); dle_tofind.dle_mintxg = mintxg; dle = avl_find(&dl->dl_tree, &dle_tofind, NULL); ASSERT3P(dle, !=, NULL); dle_prev = AVL_PREV(&dl->dl_tree, dle); dle_enqueue_subobj(dl, dle_prev, dle->dle_bpobj.bpo_object, tx); avl_remove(&dl->dl_tree, dle); bpobj_close(&dle->dle_bpobj); kmem_free(dle, sizeof (*dle)); VERIFY0(zap_remove_int(dl->dl_os, dl->dl_object, mintxg, tx)); mutex_exit(&dl->dl_lock); } /* * Remove a deadlist entry and all of its contents by removing the entry from * the deadlist's avl tree, freeing the entry's bpobj and adjusting the * deadlist's space accounting accordingly. 
*/ void dsl_deadlist_remove_entry(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx) { uint64_t used, comp, uncomp; dsl_deadlist_entry_t dle_tofind; dsl_deadlist_entry_t *dle; objset_t *os = dl->dl_os; if (dl->dl_oldfmt) return; mutex_enter(&dl->dl_lock); dsl_deadlist_load_tree(dl); dle_tofind.dle_mintxg = mintxg; dle = avl_find(&dl->dl_tree, &dle_tofind, NULL); VERIFY3P(dle, !=, NULL); avl_remove(&dl->dl_tree, dle); VERIFY0(zap_remove_int(os, dl->dl_object, mintxg, tx)); VERIFY0(bpobj_space(&dle->dle_bpobj, &used, &comp, &uncomp)); + dmu_buf_will_dirty(dl->dl_dbuf, tx); dl->dl_phys->dl_used -= used; dl->dl_phys->dl_comp -= comp; dl->dl_phys->dl_uncomp -= uncomp; if (dle->dle_bpobj.bpo_object == dmu_objset_pool(os)->dp_empty_bpobj) { bpobj_decr_empty(os, tx); } else { bpobj_free(os, dle->dle_bpobj.bpo_object, tx); } bpobj_close(&dle->dle_bpobj); kmem_free(dle, sizeof (*dle)); mutex_exit(&dl->dl_lock); } /* * Clear out the contents of a deadlist_entry by freeing its bpobj, * replacing it with an empty bpobj and adjusting the deadlist's * space accounting */ void dsl_deadlist_clear_entry(dsl_deadlist_entry_t *dle, dsl_deadlist_t *dl, dmu_tx_t *tx) { uint64_t new_obj, used, comp, uncomp; objset_t *os = dl->dl_os; mutex_enter(&dl->dl_lock); VERIFY0(zap_remove_int(os, dl->dl_object, dle->dle_mintxg, tx)); VERIFY0(bpobj_space(&dle->dle_bpobj, &used, &comp, &uncomp)); + dmu_buf_will_dirty(dl->dl_dbuf, tx); dl->dl_phys->dl_used -= used; dl->dl_phys->dl_comp -= comp; dl->dl_phys->dl_uncomp -= uncomp; if (dle->dle_bpobj.bpo_object == dmu_objset_pool(os)->dp_empty_bpobj) bpobj_decr_empty(os, tx); else bpobj_free(os, dle->dle_bpobj.bpo_object, tx); bpobj_close(&dle->dle_bpobj); new_obj = bpobj_alloc_empty(os, SPA_OLD_MAXBLOCKSIZE, tx); VERIFY0(bpobj_open(&dle->dle_bpobj, os, new_obj)); VERIFY0(zap_add_int_key(os, dl->dl_object, dle->dle_mintxg, new_obj, tx)); ASSERT(bpobj_is_empty(&dle->dle_bpobj)); mutex_exit(&dl->dl_lock); } /* * Return the first entry in deadlist's avl tree 
*/ dsl_deadlist_entry_t * dsl_deadlist_first(dsl_deadlist_t *dl) { dsl_deadlist_entry_t *dle; mutex_enter(&dl->dl_lock); dsl_deadlist_load_tree(dl); dle = avl_first(&dl->dl_tree); mutex_exit(&dl->dl_lock); return (dle); } /* * Return the last entry in deadlist's avl tree */ dsl_deadlist_entry_t * dsl_deadlist_last(dsl_deadlist_t *dl) { dsl_deadlist_entry_t *dle; mutex_enter(&dl->dl_lock); dsl_deadlist_load_tree(dl); dle = avl_last(&dl->dl_tree); mutex_exit(&dl->dl_lock); return (dle); } /* * Walk ds's snapshots to regenerate generate ZAP & AVL. */ static void dsl_deadlist_regenerate(objset_t *os, uint64_t dlobj, uint64_t mrs_obj, dmu_tx_t *tx) { dsl_deadlist_t dl = { 0 }; dsl_pool_t *dp = dmu_objset_pool(os); dsl_deadlist_open(&dl, os, dlobj); if (dl.dl_oldfmt) { dsl_deadlist_close(&dl); return; } while (mrs_obj != 0) { dsl_dataset_t *ds; VERIFY0(dsl_dataset_hold_obj(dp, mrs_obj, FTAG, &ds)); dsl_deadlist_add_key(&dl, dsl_dataset_phys(ds)->ds_prev_snap_txg, tx); mrs_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; dsl_dataset_rele(ds, FTAG); } dsl_deadlist_close(&dl); } uint64_t dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg, uint64_t mrs_obj, dmu_tx_t *tx) { dsl_deadlist_entry_t *dle; uint64_t newobj; newobj = dsl_deadlist_alloc(dl->dl_os, tx); if (dl->dl_oldfmt) { dsl_deadlist_regenerate(dl->dl_os, newobj, mrs_obj, tx); return (newobj); } mutex_enter(&dl->dl_lock); dsl_deadlist_load_tree(dl); for (dle = avl_first(&dl->dl_tree); dle; dle = AVL_NEXT(&dl->dl_tree, dle)) { uint64_t obj; if (dle->dle_mintxg >= maxtxg) break; obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx); VERIFY0(zap_add_int_key(dl->dl_os, newobj, dle->dle_mintxg, obj, tx)); } mutex_exit(&dl->dl_lock); return (newobj); } void dsl_deadlist_space(dsl_deadlist_t *dl, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) { ASSERT(dsl_deadlist_is_open(dl)); if (dl->dl_oldfmt) { VERIFY0(bpobj_space(&dl->dl_bpobj, usedp, compp, uncompp)); return; } mutex_enter(&dl->dl_lock); *usedp = 
dl->dl_phys->dl_used; *compp = dl->dl_phys->dl_comp; *uncompp = dl->dl_phys->dl_uncomp; mutex_exit(&dl->dl_lock); } /* * return space used in the range (mintxg, maxtxg]. * Includes maxtxg, does not include mintxg. * mintxg and maxtxg must both be keys in the deadlist (unless maxtxg is * UINT64_MAX). */ void dsl_deadlist_space_range(dsl_deadlist_t *dl, uint64_t mintxg, uint64_t maxtxg, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) { - dsl_deadlist_entry_t *dle; - dsl_deadlist_entry_t dle_tofind; + dsl_deadlist_cache_entry_t *dlce; + dsl_deadlist_cache_entry_t dlce_tofind; avl_index_t where; if (dl->dl_oldfmt) { VERIFY0(bpobj_space_range(&dl->dl_bpobj, mintxg, maxtxg, usedp, compp, uncompp)); return; } *usedp = *compp = *uncompp = 0; mutex_enter(&dl->dl_lock); - dsl_deadlist_load_tree(dl); - dle_tofind.dle_mintxg = mintxg; - dle = avl_find(&dl->dl_tree, &dle_tofind, &where); + dsl_deadlist_load_cache(dl); + dlce_tofind.dlce_mintxg = mintxg; + dlce = avl_find(&dl->dl_cache, &dlce_tofind, &where); + /* - * If we don't find this mintxg, there shouldn't be anything - * after it either. + * If this mintxg doesn't exist, it may be an empty_bpobj which + * is omitted from the sparse tree. Start at the next non-empty + * entry. */ - ASSERT(dle != NULL || - avl_nearest(&dl->dl_tree, where, AVL_AFTER) == NULL); - - for (; dle && dle->dle_mintxg < maxtxg; - dle = AVL_NEXT(&dl->dl_tree, dle)) { - uint64_t used, comp, uncomp; - - VERIFY0(bpobj_space(&dle->dle_bpobj, - &used, &comp, &uncomp)); - - *usedp += used; - *compp += comp; - *uncompp += uncomp; + if (dlce == NULL) + dlce = avl_nearest(&dl->dl_cache, where, AVL_AFTER); + + for (; dlce && dlce->dlce_mintxg < maxtxg; + dlce = AVL_NEXT(&dl->dl_cache, dlce)) { + *usedp += dlce->dlce_bytes; + *compp += dlce->dlce_comp; + *uncompp += dlce->dlce_uncomp; } - /* - * This assertion ensures that the maxtxg is a key in the deadlist - * (unless it's UINT64_MAX). 
- */ - ASSERT(maxtxg == UINT64_MAX || - (dle != NULL && dle->dle_mintxg == maxtxg)); mutex_exit(&dl->dl_lock); } static void dsl_deadlist_insert_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth, dmu_tx_t *tx) { dsl_deadlist_entry_t dle_tofind; dsl_deadlist_entry_t *dle; avl_index_t where; uint64_t used, comp, uncomp; bpobj_t bpo; ASSERT(MUTEX_HELD(&dl->dl_lock)); VERIFY0(bpobj_open(&bpo, dl->dl_os, obj)); VERIFY0(bpobj_space(&bpo, &used, &comp, &uncomp)); bpobj_close(&bpo); dsl_deadlist_load_tree(dl); dmu_buf_will_dirty(dl->dl_dbuf, tx); dl->dl_phys->dl_used += used; dl->dl_phys->dl_comp += comp; dl->dl_phys->dl_uncomp += uncomp; dle_tofind.dle_mintxg = birth; dle = avl_find(&dl->dl_tree, &dle_tofind, &where); if (dle == NULL) dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE); dle_enqueue_subobj(dl, dle, obj, tx); } static int dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) { dsl_deadlist_t *dl = arg; dsl_deadlist_insert(dl, bp, bp_freed, tx); return (0); } /* * Merge the deadlist pointed to by 'obj' into dl. obj will be left as * an empty deadlist. 
*/ void dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx) { zap_cursor_t zc; zap_attribute_t za; dmu_buf_t *bonus; dsl_deadlist_phys_t *dlp; dmu_object_info_t doi; VERIFY0(dmu_object_info(dl->dl_os, obj, &doi)); if (doi.doi_type == DMU_OT_BPOBJ) { bpobj_t bpo; VERIFY0(bpobj_open(&bpo, dl->dl_os, obj)); VERIFY0(bpobj_iterate(&bpo, dsl_deadlist_insert_cb, dl, tx)); bpobj_close(&bpo); return; } mutex_enter(&dl->dl_lock); for (zap_cursor_init(&zc, dl->dl_os, obj); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { uint64_t mintxg = zfs_strtonum(za.za_name, NULL); dsl_deadlist_insert_bpobj(dl, za.za_first_integer, mintxg, tx); VERIFY0(zap_remove_int(dl->dl_os, obj, mintxg, tx)); } zap_cursor_fini(&zc); VERIFY0(dmu_bonus_hold(dl->dl_os, obj, FTAG, &bonus)); dlp = bonus->db_data; dmu_buf_will_dirty(bonus, tx); bzero(dlp, sizeof (*dlp)); dmu_buf_rele(bonus, FTAG); mutex_exit(&dl->dl_lock); } /* * Remove entries on dl that are born > mintxg, and put them on the bpobj. 
*/ void dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg, dmu_tx_t *tx) { dsl_deadlist_entry_t dle_tofind; dsl_deadlist_entry_t *dle; avl_index_t where; ASSERT(!dl->dl_oldfmt); mutex_enter(&dl->dl_lock); dmu_buf_will_dirty(dl->dl_dbuf, tx); dsl_deadlist_load_tree(dl); dle_tofind.dle_mintxg = mintxg; dle = avl_find(&dl->dl_tree, &dle_tofind, &where); if (dle == NULL) dle = avl_nearest(&dl->dl_tree, where, AVL_AFTER); while (dle) { uint64_t used, comp, uncomp; dsl_deadlist_entry_t *dle_next; bpobj_enqueue_subobj(bpo, dle->dle_bpobj.bpo_object, tx); VERIFY0(bpobj_space(&dle->dle_bpobj, &used, &comp, &uncomp)); ASSERT3U(dl->dl_phys->dl_used, >=, used); ASSERT3U(dl->dl_phys->dl_comp, >=, comp); ASSERT3U(dl->dl_phys->dl_uncomp, >=, uncomp); dl->dl_phys->dl_used -= used; dl->dl_phys->dl_comp -= comp; dl->dl_phys->dl_uncomp -= uncomp; VERIFY0(zap_remove_int(dl->dl_os, dl->dl_object, dle->dle_mintxg, tx)); dle_next = AVL_NEXT(&dl->dl_tree, dle); avl_remove(&dl->dl_tree, dle); bpobj_close(&dle->dle_bpobj); kmem_free(dle, sizeof (*dle)); dle = dle_next; } mutex_exit(&dl->dl_lock); } typedef struct livelist_entry { const blkptr_t *le_bp; avl_node_t le_node; } livelist_entry_t; static int livelist_compare(const void *larg, const void *rarg) { const blkptr_t *l = ((livelist_entry_t *)larg)->le_bp; const blkptr_t *r = ((livelist_entry_t *)rarg)->le_bp; /* Sort them according to dva[0] */ uint64_t l_dva0_vdev = DVA_GET_VDEV(&l->blk_dva[0]); uint64_t r_dva0_vdev = DVA_GET_VDEV(&r->blk_dva[0]); if (l_dva0_vdev != r_dva0_vdev) return (AVL_CMP(l_dva0_vdev, r_dva0_vdev)); /* if vdevs are equal, sort by offsets. 
*/ uint64_t l_dva0_offset = DVA_GET_OFFSET(&l->blk_dva[0]); uint64_t r_dva0_offset = DVA_GET_OFFSET(&r->blk_dva[0]); if (l_dva0_offset == r_dva0_offset) ASSERT3U(l->blk_birth, ==, r->blk_birth); return (AVL_CMP(l_dva0_offset, r_dva0_offset)); } struct livelist_iter_arg { avl_tree_t *avl; bplist_t *to_free; zthr_t *t; }; /* * Expects an AVL tree which is incrementally filled will FREE blkptrs * and used to match up ALLOC/FREE pairs. ALLOC'd blkptrs without a * corresponding FREE are stored in the supplied bplist. */ static int dsl_livelist_iterate(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) { struct livelist_iter_arg *lia = arg; avl_tree_t *avl = lia->avl; bplist_t *to_free = lia->to_free; zthr_t *t = lia->t; ASSERT(tx == NULL); if ((t != NULL) && (zthr_has_waiters(t) || zthr_iscancelled(t))) return (SET_ERROR(EINTR)); if (bp_freed) { livelist_entry_t *node = kmem_alloc(sizeof (livelist_entry_t), KM_SLEEP); blkptr_t *temp_bp = kmem_alloc(sizeof (blkptr_t), KM_SLEEP); *temp_bp = *bp; node->le_bp = temp_bp; avl_add(avl, node); } else { livelist_entry_t node; node.le_bp = bp; livelist_entry_t *found = avl_find(avl, &node, NULL); if (found != NULL) { avl_remove(avl, found); kmem_free((blkptr_t *)found->le_bp, sizeof (blkptr_t)); kmem_free(found, sizeof (livelist_entry_t)); } else { bplist_append(to_free, bp); } } return (0); } /* * Accepts a bpobj and a bplist. 
Will insert into the bplist the blkptrs * which have an ALLOC entry but no matching FREE */ int dsl_process_sub_livelist(bpobj_t *bpobj, bplist_t *to_free, zthr_t *t, uint64_t *size) { avl_tree_t avl; avl_create(&avl, livelist_compare, sizeof (livelist_entry_t), offsetof(livelist_entry_t, le_node)); /* process the sublist */ struct livelist_iter_arg arg = { .avl = &avl, .to_free = to_free, .t = t }; int err = bpobj_iterate_nofree(bpobj, dsl_livelist_iterate, &arg, size); avl_destroy(&avl); return (err); } #if defined(_KERNEL) /* CSTYLED */ module_param(zfs_livelist_max_entries, ulong, 0644); MODULE_PARM_DESC(zfs_livelist_max_entries, "Size to start the next sub-livelist in a livelist"); module_param(zfs_livelist_min_percent_shared, int, 0644); MODULE_PARM_DESC(zfs_livelist_min_percent_shared, "Threshold at which livelist is disabled"); #endif diff --git a/module/zfs/dsl_destroy.c b/module/zfs/dsl_destroy.c index 5c483c5dd961..788753bdccdb 100644 --- a/module/zfs/dsl_destroy.c +++ b/module/zfs/dsl_destroy.c @@ -1,1260 +1,1267 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved. 
* Copyright (c) 2013 Steven Hartland. All rights reserved.
 * Copyright (c) 2013 by Joyent, Inc. All rights reserved.
 * Copyright (c) 2016 Actifio, Inc. All rights reserved.
 */

/*
 * NOTE(review): the header names that followed each #include below are
 * missing from this copy of the file -- restore them from the original.
 */
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

/*
 * Decide whether the snapshot 'ds' may be destroyed.
 *
 * Returns EINVAL if ds is not a snapshot and EBUSY if it is long-held.
 * For a deferred destroy, returns ENOTSUP when the pool version is below
 * SPA_VERSION_USERREFS and 0 otherwise. For an immediate destroy, returns
 * EBUSY when ds has user references, EEXIST when it has more than one
 * child, and 0 otherwise.
 */
int
dsl_destroy_snapshot_check_impl(dsl_dataset_t *ds, boolean_t defer)
{
	if (!ds->ds_is_snapshot)
		return (SET_ERROR(EINVAL));

	if (dsl_dataset_long_held(ds))
		return (SET_ERROR(EBUSY));

	/*
	 * Only allow deferred destroy on pools that support it.
	 * NOTE: deferred destroy is only supported on snapshots.
	 */
	if (defer) {
		if (spa_version(ds->ds_dir->dd_pool->dp_spa) <
		    SPA_VERSION_USERREFS)
			return (SET_ERROR(ENOTSUP));
		return (0);
	}

	/*
	 * If this snapshot has an elevated user reference count,
	 * we can't destroy it yet.
	 */
	if (ds->ds_userrefs > 0)
		return (SET_ERROR(EBUSY));

	/*
	 * Can't delete a branch point.
	 */
	if (dsl_dataset_phys(ds)->ds_num_children > 1)
		return (SET_ERROR(EEXIST));

	return (0);
}

/*
 * Check function taking a dsl_destroy_snapshot_arg_t: holds the named
 * snapshot, runs dsl_destroy_snapshot_check_impl() on it and releases it.
 * A snapshot that does not exist (ENOENT) is reported as success; any
 * other hold error is returned as is.
 */
int
dsl_destroy_snapshot_check(void *arg, dmu_tx_t *tx)
{
	dsl_destroy_snapshot_arg_t *ddsa = arg;
	const char *dsname = ddsa->ddsa_name;
	boolean_t defer = ddsa->ddsa_defer;

	dsl_pool_t *dp = dmu_tx_pool(tx);
	int error = 0;
	dsl_dataset_t *ds;

	error = dsl_dataset_hold(dp, dsname, FTAG, &ds);

	/*
	 * If the snapshot does not exist, silently ignore it, and
	 * dsl_destroy_snapshot_sync() will be a no-op
	 * (it's "already destroyed").
	 */
	if (error == ENOENT)
		return (0);

	if (error == 0) {
		error = dsl_destroy_snapshot_check_impl(ds, defer);
		dsl_dataset_rele(ds, FTAG);
	}

	return (error);
}

/* State shared between process_old_deadlist() and process_old_cb(). */
struct process_old_arg {
	dsl_dataset_t *ds;		/* snapshot being destroyed */
	dsl_dataset_t *ds_prev;		/* its previous snapshot, or NULL */
	boolean_t after_branch_point;
	zio_t *pio;			/* parent zio for the frees issued */
	uint64_t used, comp, uncomp;	/* totals of the blocks freed */
};

/*
 * bpobj_iterate() callback used by process_old_deadlist().
 *
 * A block born at or before ds's previous snapshot is inserted into ds's
 * deadlist; if it was also born after ds_prev's previous snapshot (and we
 * are not after a branch point) its size is added to ds_prev's unique
 * bytes. Any other block is added to the used/comp/uncomp totals and
 * freed under poa->pio. Always returns 0.
 */
static int
process_old_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx)
{
	struct process_old_arg *poa = arg;
	dsl_pool_t *dp = poa->ds->ds_dir->dd_pool;

	ASSERT(!BP_IS_HOLE(bp));

	if (bp->blk_birth <= dsl_dataset_phys(poa->ds)->ds_prev_snap_txg) {
		dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, bp_freed, tx);
		if (poa->ds_prev && !poa->after_branch_point &&
		    bp->blk_birth >
		    dsl_dataset_phys(poa->ds_prev)->ds_prev_snap_txg) {
			dsl_dataset_phys(poa->ds_prev)->ds_unique_bytes +=
			    bp_get_dsize_sync(dp->dp_spa, bp);
		}
	} else {
		poa->used += bp_get_dsize_sync(dp->dp_spa, bp);
		poa->comp += BP_GET_PSIZE(bp);
		poa->uncomp += BP_GET_UCSIZE(bp);
		dsl_free_sync(poa->pio, dp, tx->tx_txg, bp);
	}
	return (0);
}

/*
 * Old-format deadlist path of snapshot destruction. Runs process_old_cb()
 * over ds_next's deadlist bpobj, waits for the resulting frees, subtracts
 * the freed space from DD_USED_SNAP, and then swaps the deadlist objects
 * of ds and ds_next (closing and re-opening both deadlists).
 */
static void
process_old_deadlist(dsl_dataset_t *ds, dsl_dataset_t *ds_prev,
    dsl_dataset_t *ds_next, boolean_t after_branch_point, dmu_tx_t *tx)
{
	struct process_old_arg poa = { 0 };
	dsl_pool_t *dp = ds->ds_dir->dd_pool;
	objset_t *mos = dp->dp_meta_objset;
	uint64_t deadlist_obj;

	ASSERT(ds->ds_deadlist.dl_oldfmt);
	ASSERT(ds_next->ds_deadlist.dl_oldfmt);

	poa.ds = ds;
	poa.ds_prev = ds_prev;
	poa.after_branch_point = after_branch_point;
	poa.pio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
	VERIFY0(bpobj_iterate(&ds_next->ds_deadlist.dl_bpobj,
	    process_old_cb, &poa, tx));
	VERIFY0(zio_wait(poa.pio));
	ASSERT3U(poa.used, ==, dsl_dataset_phys(ds)->ds_unique_bytes);

	/* change snapused */
	dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
	    -poa.used, -poa.comp, -poa.uncomp, tx);

	/* swap next's deadlist to our deadlist */
	dsl_deadlist_close(&ds->ds_deadlist);
	dsl_deadlist_close(&ds_next->ds_deadlist);
	deadlist_obj = dsl_dataset_phys(ds)->ds_deadlist_obj;
	dsl_dataset_phys(ds)->ds_deadlist_obj =
	    dsl_dataset_phys(ds_next)->ds_deadlist_obj;
	dsl_dataset_phys(ds_next)->ds_deadlist_obj = deadlist_obj;
	dsl_deadlist_open(&ds->ds_deadlist, mos,
	    dsl_dataset_phys(ds)->ds_deadlist_obj);
	dsl_deadlist_open(&ds_next->ds_deadlist, mos,
	    dsl_dataset_phys(ds_next)->ds_deadlist_obj);
}

/* Work-list element: a held clone whose own clones remain to be visited. */
typedef struct remaining_clones_key {
	dsl_dataset_t *rck_clone;
	list_node_t rck_node;
} remaining_clones_key_t;

/* Allocate a work-list element that refers to 'clone'. */
static remaining_clones_key_t *
rck_alloc(dsl_dataset_t *clone)
{
	remaining_clones_key_t *rck = kmem_alloc(sizeof (*rck), KM_SLEEP);
	rck->rck_clone = clone;
	return (rck);
}

/*
 * Visit every clone listed in dd's dd_clones ZAP. For a clone whose
 * origin txg is greater than mintxg, remove the mintxg key from its
 * deadlist (and from its remap deadlist if it has one) and push it, still
 * held with 'tag', onto 'stack' so that the caller can visit its clones.
 * Every other clone is released immediately.
 */
static void
dsl_dir_remove_clones_key_impl(dsl_dir_t *dd, uint64_t mintxg, dmu_tx_t *tx,
    list_t *stack, void *tag)
{
	objset_t *mos = dd->dd_pool->dp_meta_objset;

	/*
	 * If it is the old version, dd_clones doesn't exist so we can't
	 * find the clones, but dsl_deadlist_remove_key() is a no-op so it
	 * doesn't matter.
	 */
	if (dsl_dir_phys(dd)->dd_clones == 0)
		return;

	zap_cursor_t *zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP);
	zap_attribute_t *za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);

	for (zap_cursor_init(zc, mos, dsl_dir_phys(dd)->dd_clones);
	    zap_cursor_retrieve(zc, za) == 0;
	    zap_cursor_advance(zc)) {
		dsl_dataset_t *clone;

		VERIFY0(dsl_dataset_hold_obj(dd->dd_pool,
		    za->za_first_integer, tag, &clone));

		if (clone->ds_dir->dd_origin_txg > mintxg) {
			dsl_deadlist_remove_key(&clone->ds_deadlist,
			    mintxg, tx);

			if (dsl_dataset_remap_deadlist_exists(clone)) {
				dsl_deadlist_remove_key(
				    &clone->ds_remap_deadlist, mintxg, tx);
			}

			/* The hold on 'clone' is dropped by the caller. */
			list_insert_head(stack, rck_alloc(clone));
		} else {
			dsl_dataset_rele(clone, tag);
		}
	}
	zap_cursor_fini(zc);

	kmem_free(za, sizeof (zap_attribute_t));
	kmem_free(zc, sizeof (zap_cursor_t));
}

/*
 * Remove the mintxg key from the deadlists of the clones of top_dd and of
 * their clones in turn. The traversal uses an explicit work list rather
 * than recursion: each popped clone has its own clones processed and is
 * then released.
 */
void
dsl_dir_remove_clones_key(dsl_dir_t *top_dd, uint64_t mintxg, dmu_tx_t *tx)
{
	list_t stack;

	list_create(&stack, sizeof (remaining_clones_key_t),
	    offsetof(remaining_clones_key_t, rck_node));

	dsl_dir_remove_clones_key_impl(top_dd, mintxg, tx, &stack, FTAG);
	for (remaining_clones_key_t *rck = list_remove_head(&stack);
	    rck != NULL; rck = list_remove_head(&stack)) {
		dsl_dataset_t *clone = rck->rck_clone;
		dsl_dir_t *clone_dir = clone->ds_dir;

		kmem_free(rck, sizeof (*rck));

		dsl_dir_remove_clones_key_impl(clone_dir, mintxg, tx,
		    &stack, FTAG);
		dsl_dataset_rele(clone, FTAG);
	}

	list_destroy(&stack);
}

/*
 * Remap-deadlist handling for the destruction of snapshot 'ds', whose
 * next snapshot is 'ds_next': move the relevant part of ds_next's remap
 * deadlist to the pool's obsolete bpobj, then merge ds's remap deadlist
 * (if any) into ds_next's and destroy it.
 */
static void
dsl_destroy_snapshot_handle_remaps(dsl_dataset_t *ds, dsl_dataset_t *ds_next,
    dmu_tx_t *tx)
{
	dsl_pool_t *dp = ds->ds_dir->dd_pool;

	/* Move blocks to be obsoleted to pool's obsolete list. */
	if (dsl_dataset_remap_deadlist_exists(ds_next)) {
		if (!bpobj_is_open(&dp->dp_obsolete_bpobj))
			dsl_pool_create_obsolete_bpobj(dp, tx);

		dsl_deadlist_move_bpobj(&ds_next->ds_remap_deadlist,
		    &dp->dp_obsolete_bpobj,
		    dsl_dataset_phys(ds)->ds_prev_snap_txg, tx);
	}

	/* Merge our deadlist into next's and free it. */
	if (dsl_dataset_remap_deadlist_exists(ds)) {
		uint64_t remap_deadlist_object =
		    dsl_dataset_get_remap_deadlist_object(ds);
		ASSERT(remap_deadlist_object != 0);

		/* Create ds_next's remap deadlist on demand, under its lock. */
		mutex_enter(&ds_next->ds_remap_deadlist_lock);
		if (!dsl_dataset_remap_deadlist_exists(ds_next))
			dsl_dataset_create_remap_deadlist(ds_next, tx);
		mutex_exit(&ds_next->ds_remap_deadlist_lock);

		dsl_deadlist_merge(&ds_next->ds_remap_deadlist,
		    remap_deadlist_object, tx);
		dsl_dataset_destroy_remap_deadlist(ds, tx);
	}
}

/*
 * Destroy the snapshot 'ds' in syncing context.
 *
 * With 'defer' set and user references or more than one child present,
 * the snapshot is only flagged DS_FLAG_DEFER_DESTROY and the function
 * returns. Otherwise the snapshot is unlinked from its previous and next
 * datasets, space accounting and deadlists are adjusted, its name is
 * removed from the head dataset's snapshot namespace, and its objects are
 * freed. The caller must hold dp_config_rwlock as writer and ds must have
 * no long holds (both are asserted).
 */
void
dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx)
{
	int after_branch_point = FALSE;
	dsl_pool_t *dp = ds->ds_dir->dd_pool;
	objset_t *mos = dp->dp_meta_objset;
	dsl_dataset_t *ds_prev = NULL;
	uint64_t obj;

	ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
	rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
	ASSERT3U(dsl_dataset_phys(ds)->ds_bp.blk_birth, <=, tx->tx_txg);
	rrw_exit(&ds->ds_bp_rwlock, FTAG);
	ASSERT(zfs_refcount_is_zero(&ds->ds_longholds));

	if (defer &&
	    (ds->ds_userrefs > 0 ||
	    dsl_dataset_phys(ds)->ds_num_children > 1)) {
		ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
		dmu_buf_will_dirty(ds->ds_dbuf, tx);
		dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_DEFER_DESTROY;
		spa_history_log_internal_ds(ds, "defer_destroy", tx, "");
		return;
	}

	ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1);

	/* We need to log before removing it from the namespace. */
	spa_history_log_internal_ds(ds, "destroy", tx, "");

	dsl_scan_ds_destroyed(ds, tx);

	obj = ds->ds_object;

	boolean_t book_exists = dsl_bookmark_ds_destroyed(ds, tx);

	for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
		if (dsl_dataset_feature_is_active(ds, f))
			dsl_dataset_deactivate_feature(ds, f, tx);
	}
	if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
		ASSERT3P(ds->ds_prev, ==, NULL);
		VERIFY0(dsl_dataset_hold_obj(dp,
		    dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &ds_prev));
		/* True when ds is not the direct successor of ds_prev. */
		after_branch_point =
		    (dsl_dataset_phys(ds_prev)->ds_next_snap_obj != obj);

		dmu_buf_will_dirty(ds_prev->ds_dbuf, tx);
		if (after_branch_point &&
		    dsl_dataset_phys(ds_prev)->ds_next_clones_obj != 0) {
			dsl_dataset_remove_from_next_clones(ds_prev, obj, tx);
			if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0) {
				VERIFY0(zap_add_int(mos,
				    dsl_dataset_phys(ds_prev)->
				    ds_next_clones_obj,
				    dsl_dataset_phys(ds)->ds_next_snap_obj,
				    tx));
			}
		}
		if (!after_branch_point) {
			dsl_dataset_phys(ds_prev)->ds_next_snap_obj =
			    dsl_dataset_phys(ds)->ds_next_snap_obj;
		}
	}

	dsl_dataset_t *ds_next;
	uint64_t old_unique;
	uint64_t used = 0, comp = 0, uncomp = 0;

	VERIFY0(dsl_dataset_hold_obj(dp,
	    dsl_dataset_phys(ds)->ds_next_snap_obj, FTAG, &ds_next));
	ASSERT3U(dsl_dataset_phys(ds_next)->ds_prev_snap_obj, ==, obj);

	old_unique = dsl_dataset_phys(ds_next)->ds_unique_bytes;

	/* Make ds_next point past ds, at ds's own predecessor. */
	dmu_buf_will_dirty(ds_next->ds_dbuf, tx);
	dsl_dataset_phys(ds_next)->ds_prev_snap_obj =
	    dsl_dataset_phys(ds)->ds_prev_snap_obj;
	dsl_dataset_phys(ds_next)->ds_prev_snap_txg =
	    dsl_dataset_phys(ds)->ds_prev_snap_txg;
	ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, ==,
	    ds_prev ? dsl_dataset_phys(ds_prev)->ds_creation_txg : 0);

	if (ds_next->ds_deadlist.dl_oldfmt) {
		process_old_deadlist(ds, ds_prev, ds_next,
		    after_branch_point, tx);
	} else {
		/* Adjust prev's unique space. */
		if (ds_prev && !after_branch_point) {
			dsl_deadlist_space_range(&ds_next->ds_deadlist,
			    dsl_dataset_phys(ds_prev)->ds_prev_snap_txg,
			    dsl_dataset_phys(ds)->ds_prev_snap_txg,
			    &used, &comp, &uncomp);
			dsl_dataset_phys(ds_prev)->ds_unique_bytes += used;
		}

		/* Adjust snapused. */
		dsl_deadlist_space_range(&ds_next->ds_deadlist,
		    dsl_dataset_phys(ds)->ds_prev_snap_txg, UINT64_MAX,
		    &used, &comp, &uncomp);
		dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
		    -used, -comp, -uncomp, tx);

		/* Move blocks to be freed to pool's free list. */
		dsl_deadlist_move_bpobj(&ds_next->ds_deadlist,
		    &dp->dp_free_bpobj, dsl_dataset_phys(ds)->ds_prev_snap_txg,
		    tx);
		dsl_dir_diduse_space(tx->tx_pool->dp_free_dir,
		    DD_USED_HEAD, used, comp, uncomp, tx);

		/* Merge our deadlist into next's and free it. */
		dsl_deadlist_merge(&ds_next->ds_deadlist,
		    dsl_dataset_phys(ds)->ds_deadlist_obj, tx);
+
+		/*
+		 * We are done with the deadlist tree (generated/used
+		 * by dsl_deadlist_move_bpobj() and dsl_deadlist_merge()).
+		 * Discard it to save memory.
+		 */
+		dsl_deadlist_discard_tree(&ds_next->ds_deadlist);
	}

	dsl_deadlist_close(&ds->ds_deadlist);
	dsl_deadlist_free(mos, dsl_dataset_phys(ds)->ds_deadlist_obj, tx);
	dmu_buf_will_dirty(ds->ds_dbuf, tx);
	dsl_dataset_phys(ds)->ds_deadlist_obj = 0;

	dsl_destroy_snapshot_handle_remaps(ds, ds_next, tx);

	if (!book_exists) {
		/* Collapse range in clone heads */
		dsl_dir_remove_clones_key(ds->ds_dir,
		    dsl_dataset_phys(ds)->ds_creation_txg, tx);
	}

	if (ds_next->ds_is_snapshot) {
		dsl_dataset_t *ds_nextnext;

		/*
		 * Update next's unique to include blocks which
		 * were previously shared by only this snapshot
		 * and it.  Those blocks will be born after the
		 * prev snap and before this snap, and will have
		 * died after the next snap and before the one
		 * after that (ie. be on the snap after next's
		 * deadlist).
		 */
		VERIFY0(dsl_dataset_hold_obj(dp,
		    dsl_dataset_phys(ds_next)->ds_next_snap_obj,
		    FTAG, &ds_nextnext));
		dsl_deadlist_space_range(&ds_nextnext->ds_deadlist,
		    dsl_dataset_phys(ds)->ds_prev_snap_txg,
		    dsl_dataset_phys(ds)->ds_creation_txg,
		    &used, &comp, &uncomp);
		dsl_dataset_phys(ds_next)->ds_unique_bytes += used;
		dsl_dataset_rele(ds_nextnext, FTAG);
		ASSERT3P(ds_next->ds_prev, ==, NULL);

		/* Collapse range in this head. */
		dsl_dataset_t *hds;
		VERIFY0(dsl_dataset_hold_obj(dp,
		    dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj,
		    FTAG, &hds));
		if (!book_exists) {
			/* Collapse range in this head. */
			dsl_deadlist_remove_key(&hds->ds_deadlist,
			    dsl_dataset_phys(ds)->ds_creation_txg, tx);
		}
		if (dsl_dataset_remap_deadlist_exists(hds)) {
			dsl_deadlist_remove_key(&hds->ds_remap_deadlist,
			    dsl_dataset_phys(ds)->ds_creation_txg, tx);
		}
		dsl_dataset_rele(hds, FTAG);

	} else {
		/* ds_next holds ds as its ds_prev; re-point that hold. */
		ASSERT3P(ds_next->ds_prev, ==, ds);
		dsl_dataset_rele(ds_next->ds_prev, ds_next);
		ds_next->ds_prev = NULL;
		if (ds_prev) {
			VERIFY0(dsl_dataset_hold_obj(dp,
			    dsl_dataset_phys(ds)->ds_prev_snap_obj,
			    ds_next, &ds_next->ds_prev));
		}

		dsl_dataset_recalc_head_uniq(ds_next);

		/*
		 * Reduce the amount of our unconsumed refreservation
		 * being charged to our parent by the amount of
		 * new unique data we have gained.
		 */
		if (old_unique < ds_next->ds_reserved) {
			int64_t mrsdelta;
			uint64_t new_unique =
			    dsl_dataset_phys(ds_next)->ds_unique_bytes;

			ASSERT(old_unique <= new_unique);
			mrsdelta = MIN(new_unique - old_unique,
			    ds_next->ds_reserved - old_unique);
			dsl_dir_diduse_space(ds->ds_dir,
			    DD_USED_REFRSRV, -mrsdelta, 0, 0, tx);
		}
	}
	dsl_dataset_rele(ds_next, FTAG);

	/*
	 * This must be done after the dsl_traverse(), because it will
	 * re-open the objset.
	 */
	if (ds->ds_objset) {
		dmu_objset_evict(ds->ds_objset);
		ds->ds_objset = NULL;
	}

	/* remove from snapshot namespace */
	dsl_dataset_t *ds_head;
	ASSERT(dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0);
	VERIFY0(dsl_dataset_hold_obj(dp,
	    dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj, FTAG, &ds_head));
	VERIFY0(dsl_dataset_get_snapname(ds));
#ifdef ZFS_DEBUG
	{
		/* Debug builds: the name must currently map to this object. */
		uint64_t val;
		int err;

		err = dsl_dataset_snap_lookup(ds_head,
		    ds->ds_snapname, &val);
		ASSERT0(err);
		ASSERT3U(val, ==, obj);
	}
#endif
	VERIFY0(dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx, B_TRUE));
	dsl_dataset_rele(ds_head, FTAG);

	if (ds_prev != NULL)
		dsl_dataset_rele(ds_prev, FTAG);

	spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx);

	if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) {
		ASSERTV(uint64_t count);
		/*
		 * NOTE(review): as written, the asserted expression is 0
		 * whenever zap_count() returns 0, so 'count' is not actually
		 * checked on success -- confirm the intended condition.
		 */
		ASSERT0(zap_count(mos,
		    dsl_dataset_phys(ds)->ds_next_clones_obj, &count) &&
		    count == 0);
		VERIFY0(dmu_object_free(mos,
		    dsl_dataset_phys(ds)->ds_next_clones_obj, tx));
	}
	if (dsl_dataset_phys(ds)->ds_props_obj != 0)
		VERIFY0(zap_destroy(mos, dsl_dataset_phys(ds)->ds_props_obj,
		    tx));
	if (dsl_dataset_phys(ds)->ds_userrefs_obj != 0)
		VERIFY0(zap_destroy(mos, dsl_dataset_phys(ds)->ds_userrefs_obj,
		    tx));
	dsl_dir_rele(ds->ds_dir, ds);
	ds->ds_dir = NULL;
	dmu_object_free_zapified(mos, obj, tx);
}

/*
 * Sync function taking a dsl_destroy_snapshot_arg_t: holds the named
 * snapshot, destroys it with dsl_destroy_snapshot_sync_impl(), removes
 * its zvol minors and releases it. Does nothing if the snapshot does not
 * exist; any other hold error is asserted not to happen.
 */
void
dsl_destroy_snapshot_sync(void *arg, dmu_tx_t *tx)
{
	dsl_destroy_snapshot_arg_t *ddsa = arg;
	const char *dsname = ddsa->ddsa_name;
	boolean_t defer = ddsa->ddsa_defer;

	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dataset_t *ds;

	int error = dsl_dataset_hold(dp, dsname, FTAG, &ds);
	if (error == ENOENT)
		return;
	ASSERT0(error);
	dsl_destroy_snapshot_sync_impl(ds, defer, tx);
	zvol_remove_minors(dp->dp_spa, dsname, B_TRUE);
	dsl_dataset_rele(ds, FTAG);
}

/*
 * The semantics of this function are described in the comment above
 * lzc_destroy_snaps().  To summarize:
 *
 * The snapshots must all be in the same pool.
 *
 * Snapshots that don't exist will be silently ignored (considered to be
 * "already deleted").
* * On success, all snaps will be destroyed and this will return 0. * On failure, no snaps will be destroyed, the errlist will be filled in, * and this will return an errno. */ int dsl_destroy_snapshots_nvl(nvlist_t *snaps, boolean_t defer, nvlist_t *errlist) { if (nvlist_next_nvpair(snaps, NULL) == NULL) return (0); /* * lzc_destroy_snaps() is documented to take an nvlist whose * values "don't matter". We need to convert that nvlist to * one that we know can be converted to LUA. We also don't * care about any duplicate entries because the nvlist will * be converted to a LUA table which should take care of this. */ nvlist_t *snaps_normalized; VERIFY0(nvlist_alloc(&snaps_normalized, 0, KM_SLEEP)); for (nvpair_t *pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(snaps, pair)) { fnvlist_add_boolean_value(snaps_normalized, nvpair_name(pair), B_TRUE); } nvlist_t *arg; VERIFY0(nvlist_alloc(&arg, 0, KM_SLEEP)); fnvlist_add_nvlist(arg, "snaps", snaps_normalized); fnvlist_free(snaps_normalized); fnvlist_add_boolean_value(arg, "defer", defer); nvlist_t *wrapper; VERIFY0(nvlist_alloc(&wrapper, 0, KM_SLEEP)); fnvlist_add_nvlist(wrapper, ZCP_ARG_ARGLIST, arg); fnvlist_free(arg); const char *program = "arg = ...\n" "snaps = arg['snaps']\n" "defer = arg['defer']\n" "errors = { }\n" "has_errors = false\n" "for snap, v in pairs(snaps) do\n" " errno = zfs.check.destroy{snap, defer=defer}\n" " zfs.debug('snap: ' .. snap .. ' errno: ' .. 
errno)\n" " if errno == ENOENT then\n" " snaps[snap] = nil\n" " elseif errno ~= 0 then\n" " errors[snap] = errno\n" " has_errors = true\n" " end\n" "end\n" "if has_errors then\n" " return errors\n" "end\n" "for snap, v in pairs(snaps) do\n" " errno = zfs.sync.destroy{snap, defer=defer}\n" " assert(errno == 0)\n" "end\n" "return { }\n"; nvlist_t *result = fnvlist_alloc(); int error = zcp_eval(nvpair_name(nvlist_next_nvpair(snaps, NULL)), program, B_TRUE, 0, zfs_lua_max_memlimit, nvlist_next_nvpair(wrapper, NULL), result); if (error != 0) { char *errorstr = NULL; (void) nvlist_lookup_string(result, ZCP_RET_ERROR, &errorstr); if (errorstr != NULL) { zfs_dbgmsg(errorstr); } return (error); } fnvlist_free(wrapper); /* * lzc_destroy_snaps() is documented to fill the errlist with * int32 values, so we need to covert the int64 values that are * returned from LUA. */ int rv = 0; nvlist_t *errlist_raw = fnvlist_lookup_nvlist(result, ZCP_RET_RETURN); for (nvpair_t *pair = nvlist_next_nvpair(errlist_raw, NULL); pair != NULL; pair = nvlist_next_nvpair(errlist_raw, pair)) { int32_t val = (int32_t)fnvpair_value_int64(pair); if (rv == 0) rv = val; fnvlist_add_int32(errlist, nvpair_name(pair), val); } fnvlist_free(result); return (rv); } int dsl_destroy_snapshot(const char *name, boolean_t defer) { int error; nvlist_t *nvl = fnvlist_alloc(); nvlist_t *errlist = fnvlist_alloc(); fnvlist_add_boolean(nvl, name); error = dsl_destroy_snapshots_nvl(nvl, defer, errlist); fnvlist_free(errlist); fnvlist_free(nvl); return (error); } struct killarg { dsl_dataset_t *ds; dmu_tx_t *tx; }; /* ARGSUSED */ static int kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) { struct killarg *ka = arg; dmu_tx_t *tx = ka->tx; if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) return (0); if (zb->zb_level == ZB_ZIL_LEVEL) { ASSERT(zilog != NULL); /* * It's a block in the intent log. 
It has no * accounting, so just free it. */ dsl_free(ka->tx->tx_pool, ka->tx->tx_txg, bp); } else { ASSERT(zilog == NULL); ASSERT3U(bp->blk_birth, >, dsl_dataset_phys(ka->ds)->ds_prev_snap_txg); (void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE); } return (0); } static void old_synchronous_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx) { struct killarg ka; /* * Free everything that we point to (that's born after * the previous snapshot, if we are a clone) * * NB: this should be very quick, because we already * freed all the objects in open context. */ ka.ds = ds; ka.tx = tx; VERIFY0(traverse_dataset(ds, dsl_dataset_phys(ds)->ds_prev_snap_txg, TRAVERSE_POST | TRAVERSE_NO_DECRYPT, kill_blkptr, &ka)); ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || dsl_dataset_phys(ds)->ds_unique_bytes == 0); } int dsl_destroy_head_check_impl(dsl_dataset_t *ds, int expected_holds) { int error; uint64_t count; objset_t *mos; ASSERT(!ds->ds_is_snapshot); if (ds->ds_is_snapshot) return (SET_ERROR(EINVAL)); if (zfs_refcount_count(&ds->ds_longholds) != expected_holds) return (SET_ERROR(EBUSY)); mos = ds->ds_dir->dd_pool->dp_meta_objset; /* * Can't delete a head dataset if there are snapshots of it. * (Except if the only snapshots are from the branch we cloned * from.) */ if (ds->ds_prev != NULL && dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj == ds->ds_object) return (SET_ERROR(EBUSY)); /* * Can't delete if there are children of this fs. */ error = zap_count(mos, dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, &count); if (error != 0) return (error); if (count != 0) return (SET_ERROR(EEXIST)); if (dsl_dir_is_clone(ds->ds_dir) && DS_IS_DEFER_DESTROY(ds->ds_prev) && dsl_dataset_phys(ds->ds_prev)->ds_num_children == 2 && ds->ds_prev->ds_userrefs == 0) { /* We need to remove the origin snapshot as well. 
*/ if (!zfs_refcount_is_zero(&ds->ds_prev->ds_longholds)) return (SET_ERROR(EBUSY)); } return (0); } int dsl_destroy_head_check(void *arg, dmu_tx_t *tx) { dsl_destroy_head_arg_t *ddha = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds; int error; error = dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds); if (error != 0) return (error); error = dsl_destroy_head_check_impl(ds, 0); dsl_dataset_rele(ds, FTAG); return (error); } static void dsl_dir_destroy_sync(uint64_t ddobj, dmu_tx_t *tx) { dsl_dir_t *dd; dsl_pool_t *dp = dmu_tx_pool(tx); objset_t *mos = dp->dp_meta_objset; dd_used_t t; ASSERT(RRW_WRITE_HELD(&dmu_tx_pool(tx)->dp_config_rwlock)); VERIFY0(dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd)); ASSERT0(dsl_dir_phys(dd)->dd_head_dataset_obj); /* Decrement the filesystem count for all parent filesystems. */ if (dd->dd_parent != NULL) dsl_fs_ss_count_adjust(dd->dd_parent, -1, DD_FIELD_FILESYSTEM_COUNT, tx); /* * Remove our reservation. The impl() routine avoids setting the * actual property, which would require the (already destroyed) ds. 
*/ dsl_dir_set_reservation_sync_impl(dd, 0, tx); ASSERT0(dsl_dir_phys(dd)->dd_used_bytes); ASSERT0(dsl_dir_phys(dd)->dd_reserved); for (t = 0; t < DD_USED_NUM; t++) ASSERT0(dsl_dir_phys(dd)->dd_used_breakdown[t]); if (dd->dd_crypto_obj != 0) { dsl_crypto_key_destroy_sync(dd->dd_crypto_obj, tx); (void) spa_keystore_unload_wkey_impl(dp->dp_spa, dd->dd_object); } VERIFY0(zap_destroy(mos, dsl_dir_phys(dd)->dd_child_dir_zapobj, tx)); VERIFY0(zap_destroy(mos, dsl_dir_phys(dd)->dd_props_zapobj, tx)); if (dsl_dir_phys(dd)->dd_clones != 0) VERIFY0(zap_destroy(mos, dsl_dir_phys(dd)->dd_clones, tx)); VERIFY0(dsl_deleg_destroy(mos, dsl_dir_phys(dd)->dd_deleg_zapobj, tx)); VERIFY0(zap_remove(mos, dsl_dir_phys(dd->dd_parent)->dd_child_dir_zapobj, dd->dd_myname, tx)); dsl_dir_rele(dd, FTAG); dmu_object_free_zapified(mos, ddobj, tx); } static void dsl_clone_destroy_assert(dsl_dir_t *dd) { uint64_t used, comp, uncomp; ASSERT(dsl_dir_is_clone(dd)); dsl_deadlist_space(&dd->dd_livelist, &used, &comp, &uncomp); ASSERT3U(dsl_dir_phys(dd)->dd_used_bytes, ==, used); ASSERT3U(dsl_dir_phys(dd)->dd_compressed_bytes, ==, comp); /* * Greater than because we do not track embedded block pointers in * the livelist */ ASSERT3U(dsl_dir_phys(dd)->dd_uncompressed_bytes, >=, uncomp); ASSERT(list_is_empty(&dd->dd_pending_allocs.bpl_list)); ASSERT(list_is_empty(&dd->dd_pending_frees.bpl_list)); } /* * Start the delete process for a clone. Free its zil, verify the space usage * and queue the blkptrs for deletion by adding the livelist to the pool-wide * delete queue. 
*/
static void
dsl_async_clone_destroy(dsl_dataset_t *ds, dmu_tx_t *tx)
{
	uint64_t zap_obj, to_delete, used, comp, uncomp;
	objset_t *os;
	dsl_dir_t *dd = ds->ds_dir;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	objset_t *mos = dp->dp_meta_objset;
	spa_t *spa = dp->dp_spa;
	VERIFY0(dmu_objset_from_ds(ds, &os));

	/* Check that the clone is in a correct state to be deleted */
	dsl_clone_destroy_assert(dd);

	/* Destroy the zil */
	zil_destroy_sync(dmu_objset_zil(os), tx);

	/* Look up the object number of this clone's livelist. */
	VERIFY0(zap_lookup(mos, dd->dd_object,
	    DD_FIELD_LIVELIST, sizeof (uint64_t), 1, &to_delete));
	/* Initialize deleted_clones entry to track livelists to cleanup */
	int error = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_DELETED_CLONES, sizeof (uint64_t), 1, &zap_obj);
	if (error == ENOENT) {
		/* First clone deleted this way: create the tracking ZAP. */
		zap_obj = zap_create(mos, DMU_OTN_ZAP_METADATA,
		    DMU_OT_NONE, 0, tx);
		VERIFY0(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_DELETED_CLONES, sizeof (uint64_t), 1,
		    &(zap_obj), tx));
		spa->spa_livelists_to_delete = zap_obj;
	} else if (error != 0) {
		/* Pass the error so that the %d in the message has a value. */
		zfs_panic_recover("zfs: error %d was returned while looking "
		    "up DMU_POOL_DELETED_CLONES in the zap", error);
		return;
	}
	VERIFY0(zap_add_int(mos, zap_obj, to_delete, tx));

	/* Clone is no longer using space, now tracked by dp_free_dir */
	dsl_deadlist_space(&dd->dd_livelist, &used, &comp, &uncomp);
	dsl_dir_diduse_space(dd, DD_USED_HEAD,
	    -used, -comp, -dsl_dir_phys(dd)->dd_uncompressed_bytes,
	    tx);
	dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD,
	    used, comp, uncomp, tx);
	dsl_dir_remove_livelist(dd, tx, B_FALSE);
	/* Wake the thread that processes the queued livelists. */
	zthr_wakeup(spa->spa_livelist_delete_zthr);
}

/*
 * Move the bptree into the pool's list of trees to clean up, update space
 * accounting information and destroy the zil.
*/
void
dsl_async_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx)
{
	uint64_t used, comp, uncomp;
	objset_t *os;

	VERIFY0(dmu_objset_from_ds(ds, &os));
	dsl_pool_t *dp = dmu_tx_pool(tx);
	objset_t *mos = dp->dp_meta_objset;

	zil_destroy_sync(dmu_objset_zil(os), tx);

	/* First async destroy in this pool: activate it, create the bptree. */
	if (!spa_feature_is_active(dp->dp_spa,
	    SPA_FEATURE_ASYNC_DESTROY)) {
		dsl_scan_t *scn = dp->dp_scan;
		spa_feature_incr(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY,
		    tx);
		dp->dp_bptree_obj = bptree_alloc(mos, tx);
		VERIFY0(zap_add(mos,
		    DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
		    &dp->dp_bptree_obj, tx));
		ASSERT(!scn->scn_async_destroying);
		scn->scn_async_destroying = B_TRUE;
	}

	used = dsl_dir_phys(ds->ds_dir)->dd_used_bytes;
	comp = dsl_dir_phys(ds->ds_dir)->dd_compressed_bytes;
	uncomp = dsl_dir_phys(ds->ds_dir)->dd_uncompressed_bytes;

	ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
	    dsl_dataset_phys(ds)->ds_unique_bytes == used);

	/* ds_bp is read under ds_bp_rwlock. */
	rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
	bptree_add(mos, dp->dp_bptree_obj,
	    &dsl_dataset_phys(ds)->ds_bp,
	    dsl_dataset_phys(ds)->ds_prev_snap_txg,
	    used, comp, uncomp, tx);
	rrw_exit(&ds->ds_bp_rwlock, FTAG);
	/* Move the space charge from the dataset's dir to dp_free_dir. */
	dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
	    -used, -comp, -uncomp, tx);
	dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD,
	    used, comp, uncomp, tx);
}

/*
 * Destroy the head dataset 'ds' in syncing context: drop its reservation
 * and active features, detach it from its origin if it is a clone, free
 * its deadlist(s), dispose of its blocks through one of the three destroy
 * paths, free its bookmarks and remaining objects, and destroy its
 * dsl_dir. If the origin snapshot was deferred-destroyed and this clone
 * was its last remaining child (see rmorigin), the origin is destroyed
 * as well. The caller must hold dp_config_rwlock as writer (asserted).
 */
void
dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx)
{
	dsl_pool_t *dp = dmu_tx_pool(tx);
	objset_t *mos = dp->dp_meta_objset;
	uint64_t obj, ddobj, prevobj = 0;
	boolean_t rmorigin;

	ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1);
	ASSERT(ds->ds_prev == NULL ||
	    dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj != ds->ds_object);
	rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
	ASSERT3U(dsl_dataset_phys(ds)->ds_bp.blk_birth, <=, tx->tx_txg);
	rrw_exit(&ds->ds_bp_rwlock, FTAG);
	ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));

	/* We need to log before removing it from the namespace. */
	spa_history_log_internal_ds(ds, "destroy", tx, "");

	/*
	 * Evaluated up front because ds->ds_prev is released and its
	 * ds_num_children is decremented further down.
	 */
	rmorigin = (dsl_dir_is_clone(ds->ds_dir) &&
	    DS_IS_DEFER_DESTROY(ds->ds_prev) &&
	    dsl_dataset_phys(ds->ds_prev)->ds_num_children == 2 &&
	    ds->ds_prev->ds_userrefs == 0);

	/* Remove our reservation. */
	if (ds->ds_reserved != 0) {
		dsl_dataset_set_refreservation_sync_impl(ds,
		    (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED),
		    0, tx);
		ASSERT0(ds->ds_reserved);
	}

	obj = ds->ds_object;

	for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
		if (dsl_dataset_feature_is_active(ds, f))
			dsl_dataset_deactivate_feature(ds, f, tx);
	}

	dsl_scan_ds_destroyed(ds, tx);

	if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
		/* This is a clone */
		ASSERT(ds->ds_prev != NULL);
		ASSERT3U(dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj, !=,
		    obj);
		ASSERT0(dsl_dataset_phys(ds)->ds_next_snap_obj);

		dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
		if (dsl_dataset_phys(ds->ds_prev)->ds_next_clones_obj != 0) {
			dsl_dataset_remove_from_next_clones(ds->ds_prev,
			    obj, tx);
		}

		ASSERT3U(dsl_dataset_phys(ds->ds_prev)->ds_num_children, >, 1);
		dsl_dataset_phys(ds->ds_prev)->ds_num_children--;
	}

	/*
	 * Destroy the deadlist.  Unless it's a clone, the
	 * deadlist should be empty since the dataset has no snapshots.
	 * (If it's a clone, it's safe to ignore the deadlist contents
	 * since they are still referenced by the origin snapshot.)
	 */
	dsl_deadlist_close(&ds->ds_deadlist);
	dsl_deadlist_free(mos, dsl_dataset_phys(ds)->ds_deadlist_obj, tx);
	dmu_buf_will_dirty(ds->ds_dbuf, tx);
	dsl_dataset_phys(ds)->ds_deadlist_obj = 0;

	if (dsl_dataset_remap_deadlist_exists(ds))
		dsl_dataset_destroy_remap_deadlist(ds, tx);

	/*
	 * Each destroy is responsible for both destroying (enqueuing
	 * to be destroyed) the blkptrs comprising the dataset as well as
	 * those belonging to the zil.
	 */
	if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist)) {
		dsl_async_clone_destroy(ds, tx);
	} else if (spa_feature_is_enabled(dp->dp_spa,
	    SPA_FEATURE_ASYNC_DESTROY)) {
		dsl_async_dataset_destroy(ds, tx);
	} else {
		old_synchronous_dataset_destroy(ds, tx);
	}

	if (ds->ds_prev != NULL) {
		if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
			VERIFY0(zap_remove_int(mos,
			    dsl_dir_phys(ds->ds_prev->ds_dir)->dd_clones,
			    ds->ds_object, tx));
		}
		/* Remember the origin's object for the rmorigin step. */
		prevobj = ds->ds_prev->ds_object;
		dsl_dataset_rele(ds->ds_prev, ds);
		ds->ds_prev = NULL;
	}

	/*
	 * This must be done after the dsl_traverse(), because it will
	 * re-open the objset.
	 */
	if (ds->ds_objset) {
		dmu_objset_evict(ds->ds_objset);
		ds->ds_objset = NULL;
	}

	/* Erase the link in the dir */
	dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
	dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj = 0;
	ddobj = ds->ds_dir->dd_object;
	ASSERT(dsl_dataset_phys(ds)->ds_snapnames_zapobj != 0);
	VERIFY0(zap_destroy(mos,
	    dsl_dataset_phys(ds)->ds_snapnames_zapobj, tx));

	if (ds->ds_bookmarks_obj != 0) {
		void *cookie = NULL;
		dsl_bookmark_node_t *dbn;

		/*
		 * Tear down the in-core bookmark tree, freeing redaction
		 * objects and dropping feature refcounts along the way.
		 */
		while ((dbn = avl_destroy_nodes(&ds->ds_bookmarks, &cookie)) !=
		    NULL) {
			if (dbn->dbn_phys.zbm_redaction_obj != 0) {
				VERIFY0(dmu_object_free(mos,
				    dbn->dbn_phys.zbm_redaction_obj, tx));
				spa_feature_decr(dmu_objset_spa(mos),
				    SPA_FEATURE_REDACTION_BOOKMARKS, tx);
			}
			if (dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN) {
				spa_feature_decr(dmu_objset_spa(mos),
				    SPA_FEATURE_BOOKMARK_WRITTEN, tx);
			}
			spa_strfree(dbn->dbn_name);
			mutex_destroy(&dbn->dbn_lock);
			kmem_free(dbn, sizeof (*dbn));
		}
		avl_destroy(&ds->ds_bookmarks);
		VERIFY0(zap_destroy(mos, ds->ds_bookmarks_obj, tx));
		spa_feature_decr(dp->dp_spa, SPA_FEATURE_BOOKMARKS, tx);
	}

	spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx);

	ASSERT0(dsl_dataset_phys(ds)->ds_next_clones_obj);
	ASSERT0(dsl_dataset_phys(ds)->ds_props_obj);
	ASSERT0(dsl_dataset_phys(ds)->ds_userrefs_obj);
	dsl_dir_rele(ds->ds_dir, ds);
	ds->ds_dir = NULL;
	dmu_object_free_zapified(mos, obj, tx);

	dsl_dir_destroy_sync(ddobj, tx);

	if (rmorigin) {
		dsl_dataset_t *prev;
		VERIFY0(dsl_dataset_hold_obj(dp, prevobj, FTAG, &prev));
		dsl_destroy_snapshot_sync_impl(prev, B_FALSE, tx);
		dsl_dataset_rele(prev, FTAG);
	}
}

/*
 * Sync function taking a dsl_destroy_head_arg_t: holds the named dataset,
 * destroys it with dsl_destroy_head_sync_impl(), removes its zvol minors
 * and releases it.
 */
void
dsl_destroy_head_sync(void *arg, dmu_tx_t *tx)
{
	dsl_destroy_head_arg_t *ddha = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dataset_t *ds;

	VERIFY0(dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds));
	dsl_destroy_head_sync_impl(ds, tx);
	zvol_remove_minors(dp->dp_spa, ddha->ddha_name, B_TRUE);
	dsl_dataset_rele(ds, FTAG);
}

/*
 * Sync function used by dsl_destroy_head() when async destroy is not
 * enabled: flags the dataset DS_FLAG_INCONSISTENT and logs
 * "destroy begin".
 */
static void
dsl_destroy_head_begin_sync(void *arg, dmu_tx_t *tx)
{
	dsl_destroy_head_arg_t *ddha = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dataset_t *ds;

	VERIFY0(dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds));

	/* Mark it as inconsistent on-disk, in case we crash */
	dmu_buf_will_dirty(ds->ds_dbuf, tx);
	dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_INCONSISTENT;

	spa_history_log_internal_ds(ds, "destroy begin", tx, "");
	dsl_dataset_rele(ds, FTAG);
}

/*
 * Destroy the head dataset 'name'.
 *
 * When SPA_FEATURE_ASYNC_DESTROY is not enabled, the dataset is first
 * marked inconsistent and, if it can be owned, its objects are freed from
 * open context. In every case the destroy itself is then carried out by a
 * sync task running dsl_destroy_head_check() and dsl_destroy_head_sync().
 * Returns 0 or an errno from spa_open() or dsl_sync_task().
 */
int
dsl_destroy_head(const char *name)
{
	dsl_destroy_head_arg_t ddha;
	int error;
	spa_t *spa;
	boolean_t isenabled;

#ifdef _KERNEL
	zfs_destroy_unmount_origin(name);
#endif

	error = spa_open(name, &spa, FTAG);
	if (error != 0)
		return (error);
	isenabled = spa_feature_is_enabled(spa, SPA_FEATURE_ASYNC_DESTROY);
	spa_close(spa, FTAG);

	ddha.ddha_name = name;

	if (!isenabled) {
		objset_t *os;

		error = dsl_sync_task(name, dsl_destroy_head_check,
		    dsl_destroy_head_begin_sync, &ddha,
		    0, ZFS_SPACE_CHECK_DESTROY);
		if (error != 0)
			return (error);

		/*
		 * Head deletion is processed in one txg on old pools;
		 * remove the objects from open context so that the txg sync
		 * is not too long. This optimization can only work for
		 * encrypted datasets if the wrapping key is loaded.
		 */
		error = dmu_objset_own(name, DMU_OST_ANY, B_FALSE, B_TRUE,
		    FTAG, &os);
		if (error == 0) {
			uint64_t prev_snap_txg =
			    dsl_dataset_phys(dmu_objset_ds(os))->
			    ds_prev_snap_txg;
			/*
			 * The loop ends when dmu_object_next() returns
			 * non-zero; individual free errors are ignored.
			 */
			for (uint64_t obj = 0; error == 0;
			    error = dmu_object_next(os, &obj, FALSE,
			    prev_snap_txg))
				(void) dmu_free_long_object(os, obj);
			/* sync out all frees */
			txg_wait_synced(dmu_objset_pool(os), 0);
			dmu_objset_disown(os, B_TRUE, FTAG);
		}
	}

	return (dsl_sync_task(name, dsl_destroy_head_check,
	    dsl_destroy_head_sync, &ddha, 0, ZFS_SPACE_CHECK_DESTROY));
}

/*
 * Note, this function is used as the callback for dmu_objset_find().  We
 * always return 0 so that we will continue to find and process
 * inconsistent datasets, even if we encounter an error trying to
 * process one of them.
 */
/* ARGSUSED */
int
dsl_destroy_inconsistent(const char *dsname, void *arg)
{
	objset_t *os;

	if (dmu_objset_hold(dsname, FTAG, &os) == 0) {
		boolean_t need_destroy = DS_IS_INCONSISTENT(dmu_objset_ds(os));

		/*
		 * If the dataset is inconsistent because a resumable receive
		 * has failed, then do not destroy it.
		 */
		if (dsl_dataset_has_resume_receive_state(dmu_objset_ds(os)))
			need_destroy = B_FALSE;

		/* Drop the hold before attempting the destroy. */
		dmu_objset_rele(os, FTAG);
		if (need_destroy)
			(void) dsl_destroy_head(dsname);
	}
	return (0);
}


#if defined(_KERNEL)
EXPORT_SYMBOL(dsl_destroy_head);
EXPORT_SYMBOL(dsl_destroy_head_sync_impl);
EXPORT_SYMBOL(dsl_dataset_user_hold_check_one);
EXPORT_SYMBOL(dsl_destroy_snapshot_sync_impl);
EXPORT_SYMBOL(dsl_destroy_inconsistent);
EXPORT_SYMBOL(dsl_dataset_user_release_tmp);
EXPORT_SYMBOL(dsl_destroy_head_check_impl);
#endif