diff --git a/include/sys/bpobj.h b/include/sys/bpobj.h index 16e403526cff..2bca0a82e5eb 100644 --- a/include/sys/bpobj.h +++ b/include/sys/bpobj.h @@ -1,106 +1,107 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2015, 2019 by Delphix. All rights reserved. */ #ifndef _SYS_BPOBJ_H #define _SYS_BPOBJ_H #include #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif typedef struct bpobj_phys { /* * This is the bonus buffer for the dead lists. The object's * contents is an array of bpo_entries blkptr_t's, representing * a total of bpo_bytes physical space. 
*/ uint64_t bpo_num_blkptrs; uint64_t bpo_bytes; uint64_t bpo_comp; uint64_t bpo_uncomp; uint64_t bpo_subobjs; uint64_t bpo_num_subobjs; uint64_t bpo_num_freed; } bpobj_phys_t; #define BPOBJ_SIZE_V0 (2 * sizeof (uint64_t)) #define BPOBJ_SIZE_V1 (4 * sizeof (uint64_t)) #define BPOBJ_SIZE_V2 (6 * sizeof (uint64_t)) typedef struct bpobj { kmutex_t bpo_lock; objset_t *bpo_os; uint64_t bpo_object; int bpo_epb; uint8_t bpo_havecomp; uint8_t bpo_havesubobj; uint8_t bpo_havefreed; bpobj_phys_t *bpo_phys; dmu_buf_t *bpo_dbuf; dmu_buf_t *bpo_cached_dbuf; } bpobj_t; typedef int bpobj_itor_t(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx); uint64_t bpobj_alloc(objset_t *mos, int blocksize, dmu_tx_t *tx); uint64_t bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx); void bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx); void bpobj_decr_empty(objset_t *os, dmu_tx_t *tx); int bpobj_open(bpobj_t *bpo, objset_t *mos, uint64_t object); void bpobj_close(bpobj_t *bpo); boolean_t bpobj_is_open(const bpobj_t *bpo); int bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx); int bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *, uint64_t *); int livelist_bpobj_iterate_from_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg, int64_t start); void bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx); +void bpobj_prefetch_subobj(bpobj_t *bpo, uint64_t subobj); void bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx); int bpobj_space(bpobj_t *bpo, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); int bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); boolean_t bpobj_is_empty(bpobj_t *bpo); int bplist_append_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx); #ifdef __cplusplus } #endif #endif /* _SYS_BPOBJ_H */ diff --git a/module/zfs/bpobj.c b/module/zfs/bpobj.c index 68f534c6b197..a8e9309d284b 100644 --- 
a/module/zfs/bpobj.c +++ b/module/zfs/bpobj.c @@ -1,943 +1,1004 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2017 Datto Inc. */ #include #include #include #include #include #include /* * Return an empty bpobj, preferably the empty dummy one (dp_empty_bpobj). 
*/ uint64_t bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx) { spa_t *spa = dmu_objset_spa(os); dsl_pool_t *dp = dmu_objset_pool(os); if (spa_feature_is_enabled(spa, SPA_FEATURE_EMPTY_BPOBJ)) { if (!spa_feature_is_active(spa, SPA_FEATURE_EMPTY_BPOBJ)) { ASSERT0(dp->dp_empty_bpobj); dp->dp_empty_bpobj = bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx); VERIFY(zap_add(os, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1, &dp->dp_empty_bpobj, tx) == 0); } spa_feature_incr(spa, SPA_FEATURE_EMPTY_BPOBJ, tx); ASSERT(dp->dp_empty_bpobj != 0); return (dp->dp_empty_bpobj); } else { return (bpobj_alloc(os, blocksize, tx)); } } void bpobj_decr_empty(objset_t *os, dmu_tx_t *tx) { dsl_pool_t *dp = dmu_objset_pool(os); spa_feature_decr(dmu_objset_spa(os), SPA_FEATURE_EMPTY_BPOBJ, tx); if (!spa_feature_is_active(dmu_objset_spa(os), SPA_FEATURE_EMPTY_BPOBJ)) { VERIFY3U(0, ==, zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_EMPTY_BPOBJ, tx)); VERIFY3U(0, ==, dmu_object_free(os, dp->dp_empty_bpobj, tx)); dp->dp_empty_bpobj = 0; } } uint64_t bpobj_alloc(objset_t *os, int blocksize, dmu_tx_t *tx) { int size; if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_BPOBJ_ACCOUNT) size = BPOBJ_SIZE_V0; else if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS) size = BPOBJ_SIZE_V1; else if (!spa_feature_is_active(dmu_objset_spa(os), SPA_FEATURE_LIVELIST)) size = BPOBJ_SIZE_V2; else size = sizeof (bpobj_phys_t); return (dmu_object_alloc(os, DMU_OT_BPOBJ, blocksize, DMU_OT_BPOBJ_HDR, size, tx)); } void bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx) { int64_t i; bpobj_t bpo; dmu_object_info_t doi; int epb; dmu_buf_t *dbuf = NULL; ASSERT(obj != dmu_objset_pool(os)->dp_empty_bpobj); VERIFY3U(0, ==, bpobj_open(&bpo, os, obj)); mutex_enter(&bpo.bpo_lock); if (!bpo.bpo_havesubobj || bpo.bpo_phys->bpo_subobjs == 0) goto out; VERIFY3U(0, ==, dmu_object_info(os, bpo.bpo_phys->bpo_subobjs, &doi)); epb = doi.doi_data_block_size / sizeof (uint64_t); 
for (i = bpo.bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) { uint64_t *objarray; uint64_t offset, blkoff; offset = i * sizeof (uint64_t); blkoff = P2PHASE(i, epb); if (dbuf == NULL || dbuf->db_offset > offset) { if (dbuf) dmu_buf_rele(dbuf, FTAG); VERIFY3U(0, ==, dmu_buf_hold(os, bpo.bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0)); } ASSERT3U(offset, >=, dbuf->db_offset); ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size); objarray = dbuf->db_data; bpobj_free(os, objarray[blkoff], tx); } if (dbuf) { dmu_buf_rele(dbuf, FTAG); dbuf = NULL; } VERIFY3U(0, ==, dmu_object_free(os, bpo.bpo_phys->bpo_subobjs, tx)); out: mutex_exit(&bpo.bpo_lock); bpobj_close(&bpo); VERIFY3U(0, ==, dmu_object_free(os, obj, tx)); } int bpobj_open(bpobj_t *bpo, objset_t *os, uint64_t object) { dmu_object_info_t doi; int err; err = dmu_object_info(os, object, &doi); if (err) return (err); bzero(bpo, sizeof (*bpo)); mutex_init(&bpo->bpo_lock, NULL, MUTEX_DEFAULT, NULL); ASSERT(bpo->bpo_dbuf == NULL); ASSERT(bpo->bpo_phys == NULL); ASSERT(object != 0); ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ); ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPOBJ_HDR); err = dmu_bonus_hold(os, object, bpo, &bpo->bpo_dbuf); if (err) return (err); bpo->bpo_os = os; bpo->bpo_object = object; bpo->bpo_epb = doi.doi_data_block_size >> SPA_BLKPTRSHIFT; bpo->bpo_havecomp = (doi.doi_bonus_size > BPOBJ_SIZE_V0); bpo->bpo_havesubobj = (doi.doi_bonus_size > BPOBJ_SIZE_V1); bpo->bpo_havefreed = (doi.doi_bonus_size > BPOBJ_SIZE_V2); bpo->bpo_phys = bpo->bpo_dbuf->db_data; return (0); } boolean_t bpobj_is_open(const bpobj_t *bpo) { return (bpo->bpo_object != 0); } void bpobj_close(bpobj_t *bpo) { /* Lame workaround for closing a bpobj that was never opened. 
*/ if (bpo->bpo_object == 0) return; dmu_buf_rele(bpo->bpo_dbuf, bpo); if (bpo->bpo_cached_dbuf != NULL) dmu_buf_rele(bpo->bpo_cached_dbuf, bpo); bpo->bpo_dbuf = NULL; bpo->bpo_phys = NULL; bpo->bpo_cached_dbuf = NULL; bpo->bpo_object = 0; mutex_destroy(&bpo->bpo_lock); } static boolean_t bpobj_is_empty_impl(bpobj_t *bpo) { ASSERT(MUTEX_HELD(&bpo->bpo_lock)); return (bpo->bpo_phys->bpo_num_blkptrs == 0 && (!bpo->bpo_havesubobj || bpo->bpo_phys->bpo_num_subobjs == 0)); } boolean_t bpobj_is_empty(bpobj_t *bpo) { mutex_enter(&bpo->bpo_lock); boolean_t is_empty = bpobj_is_empty_impl(bpo); mutex_exit(&bpo->bpo_lock); return (is_empty); } /* * A recursive iteration of the bpobjs would be nice here but we run the risk * of overflowing function stack space. Instead, find each subobj and add it * to the head of our list so it can be scanned for subjobjs. Like a * recursive implementation, the "deepest" subobjs will be freed first. * When a subobj is found to have no additional subojs, free it. */ typedef struct bpobj_info { bpobj_t *bpi_bpo; /* * This object is a subobj of bpi_parent, * at bpi_index in its subobj array. */ struct bpobj_info *bpi_parent; uint64_t bpi_index; /* How many of our subobj's are left to process. */ uint64_t bpi_unprocessed_subobjs; /* True after having visited this bpo's directly referenced BPs. */ boolean_t bpi_visited; list_node_t bpi_node; } bpobj_info_t; static bpobj_info_t * bpi_alloc(bpobj_t *bpo, bpobj_info_t *parent, uint64_t index) { bpobj_info_t *bpi = kmem_zalloc(sizeof (bpobj_info_t), KM_SLEEP); bpi->bpi_bpo = bpo; bpi->bpi_parent = parent; bpi->bpi_index = index; if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) { bpi->bpi_unprocessed_subobjs = bpo->bpo_phys->bpo_num_subobjs; } return (bpi); } /* * Update bpobj and all of its parents with new space accounting. 
*/ static void propagate_space_reduction(bpobj_info_t *bpi, int64_t freed, int64_t comp_freed, int64_t uncomp_freed, dmu_tx_t *tx) { for (; bpi != NULL; bpi = bpi->bpi_parent) { bpobj_t *p = bpi->bpi_bpo; ASSERT(dmu_buf_is_dirty(p->bpo_dbuf, tx)); p->bpo_phys->bpo_bytes -= freed; ASSERT3S(p->bpo_phys->bpo_bytes, >=, 0); if (p->bpo_havecomp) { p->bpo_phys->bpo_comp -= comp_freed; p->bpo_phys->bpo_uncomp -= uncomp_freed; } } } static int bpobj_iterate_blkptrs(bpobj_info_t *bpi, bpobj_itor_t func, void *arg, int64_t start, dmu_tx_t *tx, boolean_t free) { int err = 0; int64_t freed = 0, comp_freed = 0, uncomp_freed = 0; dmu_buf_t *dbuf = NULL; bpobj_t *bpo = bpi->bpi_bpo; for (int64_t i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= start; i--) { uint64_t offset = i * sizeof (blkptr_t); uint64_t blkoff = P2PHASE(i, bpo->bpo_epb); if (dbuf == NULL || dbuf->db_offset > offset) { if (dbuf) dmu_buf_rele(dbuf, FTAG); err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, offset, FTAG, &dbuf, 0); if (err) break; } ASSERT3U(offset, >=, dbuf->db_offset); ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size); blkptr_t *bparray = dbuf->db_data; blkptr_t *bp = &bparray[blkoff]; boolean_t bp_freed = BP_GET_FREE(bp); err = func(arg, bp, bp_freed, tx); if (err) break; if (free) { int sign = bp_freed ? 
-1 : +1; spa_t *spa = dmu_objset_spa(bpo->bpo_os); freed += sign * bp_get_dsize_sync(spa, bp); comp_freed += sign * BP_GET_PSIZE(bp); uncomp_freed += sign * BP_GET_UCSIZE(bp); ASSERT(dmu_buf_is_dirty(bpo->bpo_dbuf, tx)); bpo->bpo_phys->bpo_num_blkptrs--; ASSERT3S(bpo->bpo_phys->bpo_num_blkptrs, >=, 0); if (bp_freed) { ASSERT(bpo->bpo_havefreed); bpo->bpo_phys->bpo_num_freed--; ASSERT3S(bpo->bpo_phys->bpo_num_freed, >=, 0); } } } if (free) { propagate_space_reduction(bpi, freed, comp_freed, uncomp_freed, tx); VERIFY0(dmu_free_range(bpo->bpo_os, bpo->bpo_object, bpo->bpo_phys->bpo_num_blkptrs * sizeof (blkptr_t), DMU_OBJECT_END, tx)); } if (dbuf) { dmu_buf_rele(dbuf, FTAG); dbuf = NULL; } return (err); } /* * Given an initial bpo, start by freeing the BPs that are directly referenced * by that bpo. If the bpo has subobjs, read in its last subobj and push the * subobj to our stack. By popping items off our stack, eventually we will * encounter a bpo that has no subobjs. We can free its bpobj_info_t, and if * requested also free the now-empty bpo from disk and decrement * its parent's subobj count. We continue popping each subobj from our stack, * visiting its last subobj until they too have no more subobjs, and so on. */ static int bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx, boolean_t free, uint64_t *bpobj_size) { list_t stack; bpobj_info_t *bpi; int err = 0; /* * Create a "stack" for us to work with without worrying about * stack overflows. Initialize it with the initial_bpo. 
*/ list_create(&stack, sizeof (bpobj_info_t), offsetof(bpobj_info_t, bpi_node)); mutex_enter(&initial_bpo->bpo_lock); if (bpobj_size != NULL) *bpobj_size = initial_bpo->bpo_phys->bpo_num_blkptrs; list_insert_head(&stack, bpi_alloc(initial_bpo, NULL, 0)); while ((bpi = list_head(&stack)) != NULL) { bpobj_t *bpo = bpi->bpi_bpo; ASSERT3P(bpo, !=, NULL); ASSERT(MUTEX_HELD(&bpo->bpo_lock)); ASSERT(bpobj_is_open(bpo)); if (free) dmu_buf_will_dirty(bpo->bpo_dbuf, tx); if (bpi->bpi_visited == B_FALSE) { err = bpobj_iterate_blkptrs(bpi, func, arg, 0, tx, free); bpi->bpi_visited = B_TRUE; if (err != 0) break; } /* * We've finished with this bpo's directly-referenced BP's and * it has no more unprocessed subobjs. We can free its * bpobj_info_t (unless it is the topmost, initial_bpo). * If we are freeing from disk, we can also do that. */ if (bpi->bpi_unprocessed_subobjs == 0) { /* * If there are no entries, there should * be no bytes. */ if (bpobj_is_empty_impl(bpo)) { ASSERT0(bpo->bpo_phys->bpo_bytes); ASSERT0(bpo->bpo_phys->bpo_comp); ASSERT0(bpo->bpo_phys->bpo_uncomp); } /* The initial_bpo has no parent and is not closed. 
*/ if (bpi->bpi_parent != NULL) { if (free) { bpobj_t *p = bpi->bpi_parent->bpi_bpo; ASSERT0(bpo->bpo_phys->bpo_num_blkptrs); ASSERT3U(p->bpo_phys->bpo_num_subobjs, >, 0); ASSERT3U(bpi->bpi_index, ==, p->bpo_phys->bpo_num_subobjs - 1); ASSERT(dmu_buf_is_dirty(bpo->bpo_dbuf, tx)); p->bpo_phys->bpo_num_subobjs--; VERIFY0(dmu_free_range(p->bpo_os, p->bpo_phys->bpo_subobjs, bpi->bpi_index * sizeof (uint64_t), sizeof (uint64_t), tx)); /* eliminate the empty subobj list */ if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) { ASSERT0(bpo->bpo_phys-> bpo_num_subobjs); err = dmu_object_free( bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, tx); if (err) break; bpo->bpo_phys->bpo_subobjs = 0; } err = dmu_object_free(p->bpo_os, bpo->bpo_object, tx); if (err) break; } mutex_exit(&bpo->bpo_lock); bpobj_close(bpo); kmem_free(bpo, sizeof (bpobj_t)); } else { mutex_exit(&bpo->bpo_lock); } /* * Finished processing this bpo. Unlock, and free * our "stack" info. */ list_remove_head(&stack); kmem_free(bpi, sizeof (bpobj_info_t)); } else { /* * We have unprocessed subobjs. Process the next one. */ ASSERT(bpo->bpo_havecomp); ASSERT3P(bpobj_size, ==, NULL); /* Add the last subobj to stack. */ int64_t i = bpi->bpi_unprocessed_subobjs - 1; uint64_t offset = i * sizeof (uint64_t); uint64_t obj_from_sublist; err = dmu_read(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, offset, sizeof (uint64_t), &obj_from_sublist, DMU_READ_PREFETCH); if (err) break; bpobj_t *sublist = kmem_alloc(sizeof (bpobj_t), KM_SLEEP); err = bpobj_open(sublist, bpo->bpo_os, obj_from_sublist); if (err) break; list_insert_head(&stack, bpi_alloc(sublist, bpi, i)); mutex_enter(&sublist->bpo_lock); bpi->bpi_unprocessed_subobjs--; } } /* * Cleanup anything left on the "stack" after we left the loop. * Every bpo on the stack is locked so we must remember to undo * that now (in LIFO order). 
*/ while ((bpi = list_remove_head(&stack)) != NULL) { bpobj_t *bpo = bpi->bpi_bpo; ASSERT(err != 0); ASSERT3P(bpo, !=, NULL); mutex_exit(&bpo->bpo_lock); /* do not free the initial_bpo */ if (bpi->bpi_parent != NULL) { bpobj_close(bpi->bpi_bpo); kmem_free(bpi->bpi_bpo, sizeof (bpobj_t)); } kmem_free(bpi, sizeof (bpobj_info_t)); } list_destroy(&stack); return (err); } /* * Iterate and remove the entries. If func returns nonzero, iteration * will stop and that entry will not be removed. */ int bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx) { return (bpobj_iterate_impl(bpo, func, arg, tx, B_TRUE, NULL)); } /* * Iterate the entries. If func returns nonzero, iteration will stop. * * If there are no subobjs: * * *bpobj_size can be used to return the number of block pointers in the * bpobj. Note that this may be different from the number of block pointers * that are iterated over, if iteration is terminated early (e.g. by the func * returning nonzero). * * If there are concurrent (or subsequent) modifications to the bpobj then the * returned *bpobj_size can be passed as "start" to * livelist_bpobj_iterate_from_nofree() to iterate the newly added entries. */ int bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg, uint64_t *bpobj_size) { return (bpobj_iterate_impl(bpo, func, arg, NULL, B_FALSE, bpobj_size)); } /* * Iterate over the blkptrs in the bpobj beginning at index start. If func * returns nonzero, iteration will stop. This is a livelist specific function * since it assumes that there are no subobjs present. */ int livelist_bpobj_iterate_from_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg, int64_t start) { if (bpo->bpo_havesubobj) VERIFY0(bpo->bpo_phys->bpo_subobjs); bpobj_info_t *bpi = bpi_alloc(bpo, NULL, 0); int err = bpobj_iterate_blkptrs(bpi, func, arg, start, NULL, B_FALSE); kmem_free(bpi, sizeof (bpobj_info_t)); return (err); } /* * Logically add subobj's contents to the parent bpobj. 
* * In the most general case, this is accomplished in constant time by adding * a reference to subobj. This case is used when enqueuing a large subobj: * +--------------+ +--------------+ * | bpobj |----------------------->| subobj list | * +----+----+----+----+----+ +-----+-----+--+--+ * | bp | bp | bp | bp | bp | | obj | obj | obj | * +----+----+----+----+----+ +-----+-----+-----+ * * +--------------+ +--------------+ * | sub-bpobj |----------------------> | subsubobj | * +----+----+----+----+---------+----+ +-----+-----+--+--------+-----+ * | bp | bp | bp | bp | ... | bp | | obj | obj | ... | obj | * +----+----+----+----+---------+----+ +-----+-----+-----------+-----+ * * Result: sub-bpobj added to parent's subobj list. * +--------------+ +--------------+ * | bpobj |----------------------->| subobj list | * +----+----+----+----+----+ +-----+-----+--+--+-----+ * | bp | bp | bp | bp | bp | | obj | obj | obj | OBJ | * +----+----+----+----+----+ +-----+-----+-----+--|--+ * | * /-----------------------------------------------------/ * v * +--------------+ +--------------+ * | sub-bpobj |----------------------> | subsubobj | * +----+----+----+----+---------+----+ +-----+-----+--+--------+-----+ * | bp | bp | bp | bp | ... | bp | | obj | obj | ... | obj | * +----+----+----+----+---------+----+ +-----+-----+-----------+-----+ * * * In a common case, the subobj is small: its bp's and its list of subobj's * are each stored in a single block. 
In this case we copy the subobj's * contents to the parent: * +--------------+ +--------------+ * | bpobj |----------------------->| subobj list | * +----+----+----+----+----+ +-----+-----+--+--+ * | bp | bp | bp | bp | bp | | obj | obj | obj | * +----+----+----+----+----+ +-----+-----+-----+ * ^ ^ * +--------------+ | +--------------+ | * | sub-bpobj |---------^------------> | subsubobj | ^ * +----+----+----+ | +-----+-----+--+ | * | BP | BP |-->-->-->-->-/ | OBJ | OBJ |-->-/ * +----+----+ +-----+-----+ * * Result: subobj destroyed, contents copied to parent: * +--------------+ +--------------+ * | bpobj |----------------------->| subobj list | * +----+----+----+----+----+----+----+ +-----+-----+--+--+-----+-----+ * | bp | bp | bp | bp | bp | BP | BP | | obj | obj | obj | OBJ | OBJ | * +----+----+----+----+----+----+----+ +-----+-----+-----+-----+-----+ * * * If the subobj has many BP's but few subobj's, we can copy the sub-subobj's * but retain the sub-bpobj: * +--------------+ +--------------+ * | bpobj |----------------------->| subobj list | * +----+----+----+----+----+ +-----+-----+--+--+ * | bp | bp | bp | bp | bp | | obj | obj | obj | * +----+----+----+----+----+ +-----+-----+-----+ * ^ * +--------------+ +--------------+ | * | sub-bpobj |----------------------> | subsubobj | ^ * +----+----+----+----+---------+----+ +-----+-----+--+ | * | bp | bp | bp | bp | ... | bp | | OBJ | OBJ |-->-/ * +----+----+----+----+---------+----+ +-----+-----+ * * Result: sub-sub-bpobjs and subobj added to parent's subobj list. 
* +--------------+ +--------------+ * | bpobj |-------------------->| subobj list | * +----+----+----+----+----+ +-----+-----+--+--+-----+-----+------+ * | bp | bp | bp | bp | bp | | obj | obj | obj | OBJ | OBJ | OBJ* | * +----+----+----+----+----+ +-----+-----+-----+-----+-----+--|---+ * | * /--------------------------------------------------------------/ * v * +--------------+ * | sub-bpobj | * +----+----+----+----+---------+----+ * | bp | bp | bp | bp | ... | bp | * +----+----+----+----+---------+----+ */ void bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx) { bpobj_t subbpo; uint64_t used, comp, uncomp, subsubobjs; boolean_t copy_subsub = B_TRUE; boolean_t copy_bps = B_TRUE; ASSERT(bpobj_is_open(bpo)); ASSERT(subobj != 0); ASSERT(bpo->bpo_havesubobj); ASSERT(bpo->bpo_havecomp); ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj); if (subobj == dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj) { bpobj_decr_empty(bpo->bpo_os, tx); return; } VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj)); - VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp)); - if (bpobj_is_empty(&subbpo)) { /* No point in having an empty subobj. */ bpobj_close(&subbpo); bpobj_free(bpo->bpo_os, subobj, tx); return; } + VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp)); mutex_enter(&bpo->bpo_lock); dmu_buf_will_dirty(bpo->bpo_dbuf, tx); dmu_object_info_t doi; if (bpo->bpo_phys->bpo_subobjs != 0) { ASSERT0(dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, &doi)); ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ_SUBOBJ); } /* * If subobj has only one block of subobjs, then move subobj's * subobjs to bpo's subobj list directly. This reduces recursion in * bpobj_iterate due to nested subobjs. 
*/ subsubobjs = subbpo.bpo_phys->bpo_subobjs; if (subsubobjs != 0) { VERIFY0(dmu_object_info(bpo->bpo_os, subsubobjs, &doi)); if (doi.doi_max_offset > doi.doi_data_block_size) { copy_subsub = B_FALSE; } } /* * If, in addition to having only one block of subobj's, subobj has * only one block of bp's, then move subobj's bp's to bpo's bp list * directly. This reduces recursion in bpobj_iterate due to nested * subobjs. */ VERIFY3U(0, ==, dmu_object_info(bpo->bpo_os, subobj, &doi)); if (doi.doi_max_offset > doi.doi_data_block_size || !copy_subsub) { copy_bps = B_FALSE; } if (copy_subsub && subsubobjs != 0) { dmu_buf_t *subdb; uint64_t numsubsub = subbpo.bpo_phys->bpo_num_subobjs; VERIFY0(dmu_buf_hold(bpo->bpo_os, subsubobjs, 0, FTAG, &subdb, 0)); /* * Make sure that we are not asking dmu_write() * to write more data than we have in our buffer. */ VERIFY3U(subdb->db_size, >=, numsubsub * sizeof (subobj)); if (bpo->bpo_phys->bpo_subobjs == 0) { bpo->bpo_phys->bpo_subobjs = dmu_object_alloc(bpo->bpo_os, DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE, DMU_OT_NONE, 0, tx); } dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), numsubsub * sizeof (subobj), subdb->db_data, tx); dmu_buf_rele(subdb, FTAG); bpo->bpo_phys->bpo_num_subobjs += numsubsub; dmu_buf_will_dirty(subbpo.bpo_dbuf, tx); subbpo.bpo_phys->bpo_subobjs = 0; VERIFY0(dmu_object_free(bpo->bpo_os, subsubobjs, tx)); } if (copy_bps) { dmu_buf_t *bps; uint64_t numbps = subbpo.bpo_phys->bpo_num_blkptrs; ASSERT(copy_subsub); VERIFY0(dmu_buf_hold(bpo->bpo_os, subobj, 0, FTAG, &bps, 0)); /* * Make sure that we are not asking dmu_write() * to write more data than we have in our buffer. 
*/ VERIFY3U(bps->db_size, >=, numbps * sizeof (blkptr_t)); dmu_write(bpo->bpo_os, bpo->bpo_object, bpo->bpo_phys->bpo_num_blkptrs * sizeof (blkptr_t), numbps * sizeof (blkptr_t), bps->db_data, tx); dmu_buf_rele(bps, FTAG); bpo->bpo_phys->bpo_num_blkptrs += numbps; bpobj_close(&subbpo); VERIFY0(dmu_object_free(bpo->bpo_os, subobj, tx)); } else { bpobj_close(&subbpo); if (bpo->bpo_phys->bpo_subobjs == 0) { bpo->bpo_phys->bpo_subobjs = dmu_object_alloc(bpo->bpo_os, DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE, DMU_OT_NONE, 0, tx); } dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), sizeof (subobj), &subobj, tx); bpo->bpo_phys->bpo_num_subobjs++; } bpo->bpo_phys->bpo_bytes += used; bpo->bpo_phys->bpo_comp += comp; bpo->bpo_phys->bpo_uncomp += uncomp; mutex_exit(&bpo->bpo_lock); } +/* + * Prefetch metadata required for bpobj_enqueue_subobj(). + */ +void +bpobj_prefetch_subobj(bpobj_t *bpo, uint64_t subobj) +{ + dmu_object_info_t doi; + bpobj_t subbpo; + uint64_t subsubobjs; + boolean_t copy_subsub = B_TRUE; + boolean_t copy_bps = B_TRUE; + + ASSERT(bpobj_is_open(bpo)); + ASSERT(subobj != 0); + + if (subobj == dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj) + return; + + if (bpobj_open(&subbpo, bpo->bpo_os, subobj) != 0) + return; + if (bpobj_is_empty(&subbpo)) { + bpobj_close(&subbpo); + return; + } + subsubobjs = subbpo.bpo_phys->bpo_subobjs; + bpobj_close(&subbpo); + + if (subsubobjs != 0) { + if (dmu_object_info(bpo->bpo_os, subsubobjs, &doi) != 0) + return; + if (doi.doi_max_offset > doi.doi_data_block_size) + copy_subsub = B_FALSE; + } + + if (dmu_object_info(bpo->bpo_os, subobj, &doi) != 0) + return; + if (doi.doi_max_offset > doi.doi_data_block_size || !copy_subsub) + copy_bps = B_FALSE; + + if (copy_subsub && subsubobjs != 0) { + if (bpo->bpo_phys->bpo_subobjs) { + dmu_prefetch(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, 0, + bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), 1, + ZIO_PRIORITY_ASYNC_READ); + } + 
dmu_prefetch(bpo->bpo_os, subsubobjs, 0, 0, 1, + ZIO_PRIORITY_ASYNC_READ); + } + + if (copy_bps) { + dmu_prefetch(bpo->bpo_os, bpo->bpo_object, 0, + bpo->bpo_phys->bpo_num_blkptrs * sizeof (blkptr_t), 1, + ZIO_PRIORITY_ASYNC_READ); + dmu_prefetch(bpo->bpo_os, subobj, 0, 0, 1, + ZIO_PRIORITY_ASYNC_READ); + } else if (bpo->bpo_phys->bpo_subobjs) { + dmu_prefetch(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, 0, + bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), 1, + ZIO_PRIORITY_ASYNC_READ); + } +} + void bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) { blkptr_t stored_bp = *bp; uint64_t offset; int blkoff; blkptr_t *bparray; ASSERT(bpobj_is_open(bpo)); ASSERT(!BP_IS_HOLE(bp)); ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj); if (BP_IS_EMBEDDED(bp)) { /* * The bpobj will compress better without the payload. * * Note that we store EMBEDDED bp's because they have an * uncompressed size, which must be accounted for. An * alternative would be to add their size to bpo_uncomp * without storing the bp, but that would create additional * complications: bpo_uncomp would be inconsistent with the * set of BP's stored, and bpobj_iterate() wouldn't visit * all the space accounted for in the bpobj. 
*/ bzero(&stored_bp, sizeof (stored_bp)); stored_bp.blk_prop = bp->blk_prop; stored_bp.blk_birth = bp->blk_birth; } else if (!BP_GET_DEDUP(bp)) { /* The bpobj will compress better without the checksum */ bzero(&stored_bp.blk_cksum, sizeof (stored_bp.blk_cksum)); } stored_bp.blk_fill = 0; BP_SET_FREE(&stored_bp, bp_freed); mutex_enter(&bpo->bpo_lock); offset = bpo->bpo_phys->bpo_num_blkptrs * sizeof (stored_bp); blkoff = P2PHASE(bpo->bpo_phys->bpo_num_blkptrs, bpo->bpo_epb); if (bpo->bpo_cached_dbuf == NULL || offset < bpo->bpo_cached_dbuf->db_offset || offset >= bpo->bpo_cached_dbuf->db_offset + bpo->bpo_cached_dbuf->db_size) { if (bpo->bpo_cached_dbuf) dmu_buf_rele(bpo->bpo_cached_dbuf, bpo); VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, offset, bpo, &bpo->bpo_cached_dbuf, 0)); } dmu_buf_will_dirty(bpo->bpo_cached_dbuf, tx); bparray = bpo->bpo_cached_dbuf->db_data; bparray[blkoff] = stored_bp; dmu_buf_will_dirty(bpo->bpo_dbuf, tx); bpo->bpo_phys->bpo_num_blkptrs++; int sign = bp_freed ? 
-1 : +1; bpo->bpo_phys->bpo_bytes += sign * bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp); if (bpo->bpo_havecomp) { bpo->bpo_phys->bpo_comp += sign * BP_GET_PSIZE(bp); bpo->bpo_phys->bpo_uncomp += sign * BP_GET_UCSIZE(bp); } if (bp_freed) { ASSERT(bpo->bpo_havefreed); bpo->bpo_phys->bpo_num_freed++; } mutex_exit(&bpo->bpo_lock); } struct space_range_arg { spa_t *spa; uint64_t mintxg; uint64_t maxtxg; uint64_t used; uint64_t comp; uint64_t uncomp; }; static int space_range_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) { (void) bp_freed, (void) tx; struct space_range_arg *sra = arg; if (bp->blk_birth > sra->mintxg && bp->blk_birth <= sra->maxtxg) { if (dsl_pool_sync_context(spa_get_dsl(sra->spa))) sra->used += bp_get_dsize_sync(sra->spa, bp); else sra->used += bp_get_dsize(sra->spa, bp); sra->comp += BP_GET_PSIZE(bp); sra->uncomp += BP_GET_UCSIZE(bp); } return (0); } int bpobj_space(bpobj_t *bpo, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) { ASSERT(bpobj_is_open(bpo)); mutex_enter(&bpo->bpo_lock); *usedp = bpo->bpo_phys->bpo_bytes; if (bpo->bpo_havecomp) { *compp = bpo->bpo_phys->bpo_comp; *uncompp = bpo->bpo_phys->bpo_uncomp; mutex_exit(&bpo->bpo_lock); return (0); } else { mutex_exit(&bpo->bpo_lock); return (bpobj_space_range(bpo, 0, UINT64_MAX, usedp, compp, uncompp)); } } /* * Return the amount of space in the bpobj which is: * mintxg < blk_birth <= maxtxg */ int bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) { struct space_range_arg sra = { 0 }; int err; ASSERT(bpobj_is_open(bpo)); /* * As an optimization, if they want the whole txg range, just * get bpo_bytes rather than iterating over the bps. 
*/ if (mintxg < TXG_INITIAL && maxtxg == UINT64_MAX && bpo->bpo_havecomp) return (bpobj_space(bpo, usedp, compp, uncompp)); sra.spa = dmu_objset_spa(bpo->bpo_os); sra.mintxg = mintxg; sra.maxtxg = maxtxg; err = bpobj_iterate_nofree(bpo, space_range_cb, &sra, NULL); *usedp = sra.used; *compp = sra.comp; *uncompp = sra.uncomp; return (err); } /* * A bpobj_itor_t to append blkptrs to a bplist. Note that while blkptrs in a * bpobj are designated as free or allocated that information is not preserved * in bplists. */ int bplist_append_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) { (void) bp_freed, (void) tx; bplist_t *bpl = arg; bplist_append(bpl, bp); return (0); } diff --git a/module/zfs/dsl_deadlist.c b/module/zfs/dsl_deadlist.c index 7681b735ec70..1b2d8b92f288 100644 --- a/module/zfs/dsl_deadlist.c +++ b/module/zfs/dsl_deadlist.c @@ -1,1048 +1,1111 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2019 by Delphix. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 
*/ #include #include #include #include #include /* * Deadlist concurrency: * * Deadlists can only be modified from the syncing thread. * * Except for dsl_deadlist_insert(), it can only be modified with the * dp_config_rwlock held with RW_WRITER. * * The accessors (dsl_deadlist_space() and dsl_deadlist_space_range()) can * be called concurrently, from open context, with the dl_config_rwlock held * with RW_READER. * * Therefore, we only need to provide locking between dsl_deadlist_insert() and * the accessors, protecting: * dl_phys->dl_used,comp,uncomp * and protecting the dl_tree from being loaded. * The locking is provided by dl_lock. Note that locking on the bpobj_t * provides its own locking, and dl_oldfmt is immutable. */ /* * Livelist Overview * ================ * * Livelists use the same 'deadlist_t' struct as deadlists and are also used * to track blkptrs over the lifetime of a dataset. Livelists however, belong * to clones and track the blkptrs that are clone-specific (were born after * the clone's creation). The exception is embedded block pointers which are * not included in livelists because they do not need to be freed. * * When it comes time to delete the clone, the livelist provides a quick * reference as to what needs to be freed. For this reason, livelists also track * when clone-specific blkptrs are freed before deletion to prevent double * frees. Each blkptr in a livelist is marked as a FREE or an ALLOC and the * deletion algorithm iterates backwards over the livelist, matching * FREE/ALLOC pairs and then freeing those ALLOCs which remain. livelists * are also updated in the case when blkptrs are remapped: the old version * of the blkptr is cancelled out with a FREE and the new version is tracked * with an ALLOC. * * To bound the amount of memory required for deletion, livelists over a * certain size are spread over multiple entries. 
Entries are grouped by * birth txg so we can be sure the ALLOC/FREE pair for a given blkptr will * be in the same entry. This allows us to delete livelists incrementally * over multiple syncs, one entry at a time. * * During the lifetime of the clone, livelists can get extremely large. * Their size is managed by periodic condensing (preemptively cancelling out * FREE/ALLOC pairs). Livelists are disabled when a clone is promoted or when * the shared space between the clone and its origin is so small that it * doesn't make sense to use livelists anymore. */ /* * The threshold sublist size at which we create a new sub-livelist for the * next txg. However, since blkptrs of the same transaction group must be in * the same sub-list, the actual sublist size may exceed this. When picking the * size we had to balance the fact that larger sublists mean fewer sublists * (decreasing the cost of insertion) against the consideration that sublists * will be loaded into memory and shouldn't take up an inordinate amount of * space. We settled on ~500000 entries, corresponding to roughly 128M. */ unsigned long zfs_livelist_max_entries = 500000; /* * We can approximate how much of a performance gain a livelist will give us * based on the percentage of blocks shared between the clone and its origin. * 0 percent shared means that the clone has completely diverged and that the * old method is maximally effective: every read from the block tree will * result in lots of frees. Livelists give us gains when they track blocks * scattered across the tree, when one read in the old method might only * result in a few frees. Once the clone has been overwritten enough, * writes are no longer sparse and we'll no longer get much of a benefit from * tracking them with a livelist. We chose a lower limit of 75 percent shared * (25 percent overwritten). This means that 1/4 of all block pointers will be * freed (e.g. 
each read frees 256, out of a max of 1024) so we expect livelists * to make deletion 4x faster. Once the amount of shared space drops below this * threshold, the clone will revert to the old deletion method. */ int zfs_livelist_min_percent_shared = 75; static int dsl_deadlist_compare(const void *arg1, const void *arg2) { const dsl_deadlist_entry_t *dle1 = arg1; const dsl_deadlist_entry_t *dle2 = arg2; return (TREE_CMP(dle1->dle_mintxg, dle2->dle_mintxg)); } static int dsl_deadlist_cache_compare(const void *arg1, const void *arg2) { const dsl_deadlist_cache_entry_t *dlce1 = arg1; const dsl_deadlist_cache_entry_t *dlce2 = arg2; return (TREE_CMP(dlce1->dlce_mintxg, dlce2->dlce_mintxg)); } static void dsl_deadlist_load_tree(dsl_deadlist_t *dl) { zap_cursor_t zc; zap_attribute_t za; int error; ASSERT(MUTEX_HELD(&dl->dl_lock)); ASSERT(!dl->dl_oldfmt); if (dl->dl_havecache) { /* * After loading the tree, the caller may modify the tree, * e.g. to add or remove nodes, or to make a node no longer * refer to the empty_bpobj. These changes would make the * dl_cache incorrect. Therefore we discard the cache here, * so that it can't become incorrect. */ dsl_deadlist_cache_entry_t *dlce; void *cookie = NULL; while ((dlce = avl_destroy_nodes(&dl->dl_cache, &cookie)) != NULL) { kmem_free(dlce, sizeof (*dlce)); } avl_destroy(&dl->dl_cache); dl->dl_havecache = B_FALSE; } if (dl->dl_havetree) return; avl_create(&dl->dl_tree, dsl_deadlist_compare, sizeof (dsl_deadlist_entry_t), offsetof(dsl_deadlist_entry_t, dle_node)); for (zap_cursor_init(&zc, dl->dl_os, dl->dl_object); (error = zap_cursor_retrieve(&zc, &za)) == 0; zap_cursor_advance(&zc)) { dsl_deadlist_entry_t *dle = kmem_alloc(sizeof (*dle), KM_SLEEP); dle->dle_mintxg = zfs_strtonum(za.za_name, NULL); /* * Prefetch all the bpobj's so that we do that i/o * in parallel. Then open them all in a second pass. 
*/ dle->dle_bpobj.bpo_object = za.za_first_integer; dmu_prefetch(dl->dl_os, dle->dle_bpobj.bpo_object, 0, 0, 0, ZIO_PRIORITY_SYNC_READ); avl_add(&dl->dl_tree, dle); } VERIFY3U(error, ==, ENOENT); zap_cursor_fini(&zc); for (dsl_deadlist_entry_t *dle = avl_first(&dl->dl_tree); dle != NULL; dle = AVL_NEXT(&dl->dl_tree, dle)) { VERIFY0(bpobj_open(&dle->dle_bpobj, dl->dl_os, dle->dle_bpobj.bpo_object)); } dl->dl_havetree = B_TRUE; } /* * Load only the non-empty bpobj's into the dl_cache. The cache is an analog * of the dl_tree, but contains only non-empty_bpobj nodes from the ZAP. It * is used only for gathering space statistics. The dl_cache has two * advantages over the dl_tree: * * 1. Loading the dl_cache is ~5x faster than loading the dl_tree (if it's * mostly empty_bpobj's), due to less CPU overhead to open the empty_bpobj * many times and to inquire about its (zero) space stats many times. * * 2. The dl_cache uses less memory than the dl_tree. We only need to load * the dl_tree of snapshots when deleting a snapshot, after which we free the * dl_tree with dsl_deadlist_discard_tree */ static void dsl_deadlist_load_cache(dsl_deadlist_t *dl) { zap_cursor_t zc; zap_attribute_t za; int error; ASSERT(MUTEX_HELD(&dl->dl_lock)); ASSERT(!dl->dl_oldfmt); if (dl->dl_havecache) return; uint64_t empty_bpobj = dmu_objset_pool(dl->dl_os)->dp_empty_bpobj; avl_create(&dl->dl_cache, dsl_deadlist_cache_compare, sizeof (dsl_deadlist_cache_entry_t), offsetof(dsl_deadlist_cache_entry_t, dlce_node)); for (zap_cursor_init(&zc, dl->dl_os, dl->dl_object); (error = zap_cursor_retrieve(&zc, &za)) == 0; zap_cursor_advance(&zc)) { if (za.za_first_integer == empty_bpobj) continue; dsl_deadlist_cache_entry_t *dlce = kmem_zalloc(sizeof (*dlce), KM_SLEEP); dlce->dlce_mintxg = zfs_strtonum(za.za_name, NULL); /* * Prefetch all the bpobj's so that we do that i/o * in parallel. Then open them all in a second pass. 
*/ dlce->dlce_bpobj = za.za_first_integer; dmu_prefetch(dl->dl_os, dlce->dlce_bpobj, 0, 0, 0, ZIO_PRIORITY_SYNC_READ); avl_add(&dl->dl_cache, dlce); } VERIFY3U(error, ==, ENOENT); zap_cursor_fini(&zc); for (dsl_deadlist_cache_entry_t *dlce = avl_first(&dl->dl_cache); dlce != NULL; dlce = AVL_NEXT(&dl->dl_cache, dlce)) { bpobj_t bpo; VERIFY0(bpobj_open(&bpo, dl->dl_os, dlce->dlce_bpobj)); VERIFY0(bpobj_space(&bpo, &dlce->dlce_bytes, &dlce->dlce_comp, &dlce->dlce_uncomp)); bpobj_close(&bpo); } dl->dl_havecache = B_TRUE; } /* * Discard the tree to save memory. */ void dsl_deadlist_discard_tree(dsl_deadlist_t *dl) { mutex_enter(&dl->dl_lock); if (!dl->dl_havetree) { mutex_exit(&dl->dl_lock); return; } dsl_deadlist_entry_t *dle; void *cookie = NULL; while ((dle = avl_destroy_nodes(&dl->dl_tree, &cookie)) != NULL) { bpobj_close(&dle->dle_bpobj); kmem_free(dle, sizeof (*dle)); } avl_destroy(&dl->dl_tree); dl->dl_havetree = B_FALSE; mutex_exit(&dl->dl_lock); } void dsl_deadlist_iterate(dsl_deadlist_t *dl, deadlist_iter_t func, void *args) { dsl_deadlist_entry_t *dle; ASSERT(dsl_deadlist_is_open(dl)); mutex_enter(&dl->dl_lock); dsl_deadlist_load_tree(dl); mutex_exit(&dl->dl_lock); for (dle = avl_first(&dl->dl_tree); dle != NULL; dle = AVL_NEXT(&dl->dl_tree, dle)) { if (func(args, dle) != 0) break; } } void dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object) { dmu_object_info_t doi; ASSERT(!dsl_deadlist_is_open(dl)); mutex_init(&dl->dl_lock, NULL, MUTEX_DEFAULT, NULL); dl->dl_os = os; dl->dl_object = object; VERIFY0(dmu_bonus_hold(os, object, dl, &dl->dl_dbuf)); dmu_object_info_from_db(dl->dl_dbuf, &doi); if (doi.doi_type == DMU_OT_BPOBJ) { dmu_buf_rele(dl->dl_dbuf, dl); dl->dl_dbuf = NULL; dl->dl_oldfmt = B_TRUE; VERIFY0(bpobj_open(&dl->dl_bpobj, os, object)); return; } dl->dl_oldfmt = B_FALSE; dl->dl_phys = dl->dl_dbuf->db_data; dl->dl_havetree = B_FALSE; dl->dl_havecache = B_FALSE; } boolean_t dsl_deadlist_is_open(dsl_deadlist_t *dl) { return (dl->dl_os 
!= NULL); } void dsl_deadlist_close(dsl_deadlist_t *dl) { ASSERT(dsl_deadlist_is_open(dl)); mutex_destroy(&dl->dl_lock); if (dl->dl_oldfmt) { dl->dl_oldfmt = B_FALSE; bpobj_close(&dl->dl_bpobj); dl->dl_os = NULL; dl->dl_object = 0; return; } if (dl->dl_havetree) { dsl_deadlist_entry_t *dle; void *cookie = NULL; while ((dle = avl_destroy_nodes(&dl->dl_tree, &cookie)) != NULL) { bpobj_close(&dle->dle_bpobj); kmem_free(dle, sizeof (*dle)); } avl_destroy(&dl->dl_tree); } if (dl->dl_havecache) { dsl_deadlist_cache_entry_t *dlce; void *cookie = NULL; while ((dlce = avl_destroy_nodes(&dl->dl_cache, &cookie)) != NULL) { kmem_free(dlce, sizeof (*dlce)); } avl_destroy(&dl->dl_cache); } dmu_buf_rele(dl->dl_dbuf, dl); dl->dl_dbuf = NULL; dl->dl_phys = NULL; dl->dl_os = NULL; dl->dl_object = 0; } uint64_t dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx) { if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS) return (bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx)); return (zap_create(os, DMU_OT_DEADLIST, DMU_OT_DEADLIST_HDR, sizeof (dsl_deadlist_phys_t), tx)); } void dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx) { dmu_object_info_t doi; zap_cursor_t zc; zap_attribute_t za; int error; VERIFY0(dmu_object_info(os, dlobj, &doi)); if (doi.doi_type == DMU_OT_BPOBJ) { bpobj_free(os, dlobj, tx); return; } for (zap_cursor_init(&zc, os, dlobj); (error = zap_cursor_retrieve(&zc, &za)) == 0; zap_cursor_advance(&zc)) { uint64_t obj = za.za_first_integer; if (obj == dmu_objset_pool(os)->dp_empty_bpobj) bpobj_decr_empty(os, tx); else bpobj_free(os, obj, tx); } VERIFY3U(error, ==, ENOENT); zap_cursor_fini(&zc); VERIFY0(dmu_object_free(os, dlobj, tx)); } static void dle_enqueue(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) { ASSERT(MUTEX_HELD(&dl->dl_lock)); if (dle->dle_bpobj.bpo_object == dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) { uint64_t obj = bpobj_alloc(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx); 
bpobj_close(&dle->dle_bpobj); bpobj_decr_empty(dl->dl_os, tx); VERIFY0(bpobj_open(&dle->dle_bpobj, dl->dl_os, obj)); VERIFY0(zap_update_int_key(dl->dl_os, dl->dl_object, dle->dle_mintxg, obj, tx)); } bpobj_enqueue(&dle->dle_bpobj, bp, bp_freed, tx); } static void dle_enqueue_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle, uint64_t obj, dmu_tx_t *tx) { ASSERT(MUTEX_HELD(&dl->dl_lock)); if (dle->dle_bpobj.bpo_object != dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) { bpobj_enqueue_subobj(&dle->dle_bpobj, obj, tx); } else { bpobj_close(&dle->dle_bpobj); bpobj_decr_empty(dl->dl_os, tx); VERIFY0(bpobj_open(&dle->dle_bpobj, dl->dl_os, obj)); VERIFY0(zap_update_int_key(dl->dl_os, dl->dl_object, dle->dle_mintxg, obj, tx)); } } +/* + * Prefetch metadata required for dle_enqueue_subobj(). + */ +static void +dle_prefetch_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle, + uint64_t obj) +{ + if (dle->dle_bpobj.bpo_object != + dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) + bpobj_prefetch_subobj(&dle->dle_bpobj, obj); +} + void dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) { dsl_deadlist_entry_t dle_tofind; dsl_deadlist_entry_t *dle; avl_index_t where; if (dl->dl_oldfmt) { bpobj_enqueue(&dl->dl_bpobj, bp, bp_freed, tx); return; } mutex_enter(&dl->dl_lock); dsl_deadlist_load_tree(dl); dmu_buf_will_dirty(dl->dl_dbuf, tx); int sign = bp_freed ? 
-1 : +1; dl->dl_phys->dl_used += sign * bp_get_dsize_sync(dmu_objset_spa(dl->dl_os), bp); dl->dl_phys->dl_comp += sign * BP_GET_PSIZE(bp); dl->dl_phys->dl_uncomp += sign * BP_GET_UCSIZE(bp); dle_tofind.dle_mintxg = bp->blk_birth; dle = avl_find(&dl->dl_tree, &dle_tofind, &where); if (dle == NULL) dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE); else dle = AVL_PREV(&dl->dl_tree, dle); if (dle == NULL) { zfs_panic_recover("blkptr at %p has invalid BLK_BIRTH %llu", bp, (longlong_t)bp->blk_birth); dle = avl_first(&dl->dl_tree); } ASSERT3P(dle, !=, NULL); dle_enqueue(dl, dle, bp, bp_freed, tx); mutex_exit(&dl->dl_lock); } int dsl_deadlist_insert_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { dsl_deadlist_t *dl = arg; dsl_deadlist_insert(dl, bp, B_FALSE, tx); return (0); } int dsl_deadlist_insert_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { dsl_deadlist_t *dl = arg; dsl_deadlist_insert(dl, bp, B_TRUE, tx); return (0); } /* * Insert new key in deadlist, which must be > all current entries. * mintxg is not inclusive. */ void dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx) { uint64_t obj; dsl_deadlist_entry_t *dle; if (dl->dl_oldfmt) return; dle = kmem_alloc(sizeof (*dle), KM_SLEEP); dle->dle_mintxg = mintxg; mutex_enter(&dl->dl_lock); dsl_deadlist_load_tree(dl); obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx); VERIFY0(bpobj_open(&dle->dle_bpobj, dl->dl_os, obj)); avl_add(&dl->dl_tree, dle); VERIFY0(zap_add_int_key(dl->dl_os, dl->dl_object, mintxg, obj, tx)); mutex_exit(&dl->dl_lock); } /* * Remove this key, merging its entries into the previous key. 
*/ void dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx) { dsl_deadlist_entry_t dle_tofind; dsl_deadlist_entry_t *dle, *dle_prev; if (dl->dl_oldfmt) return; mutex_enter(&dl->dl_lock); dsl_deadlist_load_tree(dl); dle_tofind.dle_mintxg = mintxg; dle = avl_find(&dl->dl_tree, &dle_tofind, NULL); ASSERT3P(dle, !=, NULL); dle_prev = AVL_PREV(&dl->dl_tree, dle); dle_enqueue_subobj(dl, dle_prev, dle->dle_bpobj.bpo_object, tx); avl_remove(&dl->dl_tree, dle); bpobj_close(&dle->dle_bpobj); kmem_free(dle, sizeof (*dle)); VERIFY0(zap_remove_int(dl->dl_os, dl->dl_object, mintxg, tx)); mutex_exit(&dl->dl_lock); } /* * Remove a deadlist entry and all of its contents by removing the entry from * the deadlist's avl tree, freeing the entry's bpobj and adjusting the * deadlist's space accounting accordingly. */ void dsl_deadlist_remove_entry(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx) { uint64_t used, comp, uncomp; dsl_deadlist_entry_t dle_tofind; dsl_deadlist_entry_t *dle; objset_t *os = dl->dl_os; if (dl->dl_oldfmt) return; mutex_enter(&dl->dl_lock); dsl_deadlist_load_tree(dl); dle_tofind.dle_mintxg = mintxg; dle = avl_find(&dl->dl_tree, &dle_tofind, NULL); VERIFY3P(dle, !=, NULL); avl_remove(&dl->dl_tree, dle); VERIFY0(zap_remove_int(os, dl->dl_object, mintxg, tx)); VERIFY0(bpobj_space(&dle->dle_bpobj, &used, &comp, &uncomp)); dmu_buf_will_dirty(dl->dl_dbuf, tx); dl->dl_phys->dl_used -= used; dl->dl_phys->dl_comp -= comp; dl->dl_phys->dl_uncomp -= uncomp; if (dle->dle_bpobj.bpo_object == dmu_objset_pool(os)->dp_empty_bpobj) { bpobj_decr_empty(os, tx); } else { bpobj_free(os, dle->dle_bpobj.bpo_object, tx); } bpobj_close(&dle->dle_bpobj); kmem_free(dle, sizeof (*dle)); mutex_exit(&dl->dl_lock); } /* * Clear out the contents of a deadlist_entry by freeing its bpobj, * replacing it with an empty bpobj and adjusting the deadlist's * space accounting */ void dsl_deadlist_clear_entry(dsl_deadlist_entry_t *dle, dsl_deadlist_t *dl, dmu_tx_t *tx) { uint64_t 
new_obj, used, comp, uncomp; objset_t *os = dl->dl_os; mutex_enter(&dl->dl_lock); VERIFY0(zap_remove_int(os, dl->dl_object, dle->dle_mintxg, tx)); VERIFY0(bpobj_space(&dle->dle_bpobj, &used, &comp, &uncomp)); dmu_buf_will_dirty(dl->dl_dbuf, tx); dl->dl_phys->dl_used -= used; dl->dl_phys->dl_comp -= comp; dl->dl_phys->dl_uncomp -= uncomp; if (dle->dle_bpobj.bpo_object == dmu_objset_pool(os)->dp_empty_bpobj) bpobj_decr_empty(os, tx); else bpobj_free(os, dle->dle_bpobj.bpo_object, tx); bpobj_close(&dle->dle_bpobj); new_obj = bpobj_alloc_empty(os, SPA_OLD_MAXBLOCKSIZE, tx); VERIFY0(bpobj_open(&dle->dle_bpobj, os, new_obj)); VERIFY0(zap_add_int_key(os, dl->dl_object, dle->dle_mintxg, new_obj, tx)); ASSERT(bpobj_is_empty(&dle->dle_bpobj)); mutex_exit(&dl->dl_lock); } /* * Return the first entry in deadlist's avl tree */ dsl_deadlist_entry_t * dsl_deadlist_first(dsl_deadlist_t *dl) { dsl_deadlist_entry_t *dle; mutex_enter(&dl->dl_lock); dsl_deadlist_load_tree(dl); dle = avl_first(&dl->dl_tree); mutex_exit(&dl->dl_lock); return (dle); } /* * Return the last entry in deadlist's avl tree */ dsl_deadlist_entry_t * dsl_deadlist_last(dsl_deadlist_t *dl) { dsl_deadlist_entry_t *dle; mutex_enter(&dl->dl_lock); dsl_deadlist_load_tree(dl); dle = avl_last(&dl->dl_tree); mutex_exit(&dl->dl_lock); return (dle); } /* * Walk ds's snapshots to regenerate generate ZAP & AVL. 
*/ static void dsl_deadlist_regenerate(objset_t *os, uint64_t dlobj, uint64_t mrs_obj, dmu_tx_t *tx) { dsl_deadlist_t dl = { 0 }; dsl_pool_t *dp = dmu_objset_pool(os); dsl_deadlist_open(&dl, os, dlobj); if (dl.dl_oldfmt) { dsl_deadlist_close(&dl); return; } while (mrs_obj != 0) { dsl_dataset_t *ds; VERIFY0(dsl_dataset_hold_obj(dp, mrs_obj, FTAG, &ds)); dsl_deadlist_add_key(&dl, dsl_dataset_phys(ds)->ds_prev_snap_txg, tx); mrs_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; dsl_dataset_rele(ds, FTAG); } dsl_deadlist_close(&dl); } uint64_t dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg, uint64_t mrs_obj, dmu_tx_t *tx) { dsl_deadlist_entry_t *dle; uint64_t newobj; newobj = dsl_deadlist_alloc(dl->dl_os, tx); if (dl->dl_oldfmt) { dsl_deadlist_regenerate(dl->dl_os, newobj, mrs_obj, tx); return (newobj); } mutex_enter(&dl->dl_lock); dsl_deadlist_load_tree(dl); for (dle = avl_first(&dl->dl_tree); dle; dle = AVL_NEXT(&dl->dl_tree, dle)) { uint64_t obj; if (dle->dle_mintxg >= maxtxg) break; obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx); VERIFY0(zap_add_int_key(dl->dl_os, newobj, dle->dle_mintxg, obj, tx)); } mutex_exit(&dl->dl_lock); return (newobj); } void dsl_deadlist_space(dsl_deadlist_t *dl, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) { ASSERT(dsl_deadlist_is_open(dl)); if (dl->dl_oldfmt) { VERIFY0(bpobj_space(&dl->dl_bpobj, usedp, compp, uncompp)); return; } mutex_enter(&dl->dl_lock); *usedp = dl->dl_phys->dl_used; *compp = dl->dl_phys->dl_comp; *uncompp = dl->dl_phys->dl_uncomp; mutex_exit(&dl->dl_lock); } /* * return space used in the range (mintxg, maxtxg]. * Includes maxtxg, does not include mintxg. * mintxg and maxtxg must both be keys in the deadlist (unless maxtxg is * UINT64_MAX). 
*/ void dsl_deadlist_space_range(dsl_deadlist_t *dl, uint64_t mintxg, uint64_t maxtxg, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) { dsl_deadlist_cache_entry_t *dlce; dsl_deadlist_cache_entry_t dlce_tofind; avl_index_t where; if (dl->dl_oldfmt) { VERIFY0(bpobj_space_range(&dl->dl_bpobj, mintxg, maxtxg, usedp, compp, uncompp)); return; } *usedp = *compp = *uncompp = 0; mutex_enter(&dl->dl_lock); dsl_deadlist_load_cache(dl); dlce_tofind.dlce_mintxg = mintxg; dlce = avl_find(&dl->dl_cache, &dlce_tofind, &where); /* * If this mintxg doesn't exist, it may be an empty_bpobj which * is omitted from the sparse tree. Start at the next non-empty * entry. */ if (dlce == NULL) dlce = avl_nearest(&dl->dl_cache, where, AVL_AFTER); /* * Sum the cached stats of each entry below maxtxg. Walk dl_cache, * the tree these nodes are linked into; only the cache (not dl_tree) * is loaded on this path. */ for (; dlce && dlce->dlce_mintxg < maxtxg; dlce = AVL_NEXT(&dl->dl_cache, dlce)) { *usedp += dlce->dlce_bytes; *compp += dlce->dlce_comp; *uncompp += dlce->dlce_uncomp; } mutex_exit(&dl->dl_lock); } static void dsl_deadlist_insert_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth, dmu_tx_t *tx) { dsl_deadlist_entry_t dle_tofind; dsl_deadlist_entry_t *dle; avl_index_t where; uint64_t used, comp, uncomp; bpobj_t bpo; ASSERT(MUTEX_HELD(&dl->dl_lock)); VERIFY0(bpobj_open(&bpo, dl->dl_os, obj)); VERIFY0(bpobj_space(&bpo, &used, &comp, &uncomp)); bpobj_close(&bpo); dsl_deadlist_load_tree(dl); dmu_buf_will_dirty(dl->dl_dbuf, tx); dl->dl_phys->dl_used += used; dl->dl_phys->dl_comp += comp; dl->dl_phys->dl_uncomp += uncomp; dle_tofind.dle_mintxg = birth; dle = avl_find(&dl->dl_tree, &dle_tofind, &where); if (dle == NULL) dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE); dle_enqueue_subobj(dl, dle, obj, tx); } +/* + * Prefetch metadata required for dsl_deadlist_insert_bpobj(). 
+ */ +static void +dsl_deadlist_prefetch_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth) +{ + dsl_deadlist_entry_t dle_tofind; + dsl_deadlist_entry_t *dle; + avl_index_t where; + + ASSERT(MUTEX_HELD(&dl->dl_lock)); + + dsl_deadlist_load_tree(dl); + + dle_tofind.dle_mintxg = birth; + dle = avl_find(&dl->dl_tree, &dle_tofind, &where); + if (dle == NULL) + dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE); + dle_prefetch_subobj(dl, dle, obj); +} + static int dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) { dsl_deadlist_t *dl = arg; dsl_deadlist_insert(dl, bp, bp_freed, tx); return (0); } /* * Merge the deadlist pointed to by 'obj' into dl. obj will be left as * an empty deadlist. */ void dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx) { - zap_cursor_t zc; - zap_attribute_t za; + zap_cursor_t zc, pzc; + zap_attribute_t za, pza; dmu_buf_t *bonus; dsl_deadlist_phys_t *dlp; dmu_object_info_t doi; - int error; + int error, perror, i; VERIFY0(dmu_object_info(dl->dl_os, obj, &doi)); if (doi.doi_type == DMU_OT_BPOBJ) { bpobj_t bpo; VERIFY0(bpobj_open(&bpo, dl->dl_os, obj)); VERIFY0(bpobj_iterate(&bpo, dsl_deadlist_insert_cb, dl, tx)); bpobj_close(&bpo); return; } mutex_enter(&dl->dl_lock); + /* + * Prefetch up to 128 deadlists first and then more as we progress. + * The limit is a balance between ARC use and diminishing returns. 
+ */ + for (zap_cursor_init(&pzc, dl->dl_os, obj), i = 0; + (perror = zap_cursor_retrieve(&pzc, &pza)) == 0 && i < 128; + zap_cursor_advance(&pzc), i++) { + dsl_deadlist_prefetch_bpobj(dl, pza.za_first_integer, + zfs_strtonum(pza.za_name, NULL)); + } for (zap_cursor_init(&zc, dl->dl_os, obj); (error = zap_cursor_retrieve(&zc, &za)) == 0; zap_cursor_advance(&zc)) { uint64_t mintxg = zfs_strtonum(za.za_name, NULL); dsl_deadlist_insert_bpobj(dl, za.za_first_integer, mintxg, tx); VERIFY0(zap_remove_int(dl->dl_os, obj, mintxg, tx)); + if (perror == 0) { + dsl_deadlist_prefetch_bpobj(dl, pza.za_first_integer, + zfs_strtonum(pza.za_name, NULL)); + zap_cursor_advance(&pzc); + perror = zap_cursor_retrieve(&pzc, &pza); + } } VERIFY3U(error, ==, ENOENT); zap_cursor_fini(&zc); + zap_cursor_fini(&pzc); VERIFY0(dmu_bonus_hold(dl->dl_os, obj, FTAG, &bonus)); dlp = bonus->db_data; dmu_buf_will_dirty(bonus, tx); bzero(dlp, sizeof (*dlp)); dmu_buf_rele(bonus, FTAG); mutex_exit(&dl->dl_lock); } /* * Remove entries on dl that are born > mintxg, and put them on the bpobj. */ void dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg, dmu_tx_t *tx) { dsl_deadlist_entry_t dle_tofind; - dsl_deadlist_entry_t *dle; + dsl_deadlist_entry_t *dle, *pdle; avl_index_t where; + int i; ASSERT(!dl->dl_oldfmt); mutex_enter(&dl->dl_lock); dmu_buf_will_dirty(dl->dl_dbuf, tx); dsl_deadlist_load_tree(dl); dle_tofind.dle_mintxg = mintxg; dle = avl_find(&dl->dl_tree, &dle_tofind, &where); if (dle == NULL) dle = avl_nearest(&dl->dl_tree, where, AVL_AFTER); + /* + * Prefetch up to 128 deadlists first and then more as we progress. + * The limit is a balance between ARC use and diminishing returns. 
+ */ + for (pdle = dle, i = 0; pdle && i < 128; i++) { + bpobj_prefetch_subobj(bpo, pdle->dle_bpobj.bpo_object); + pdle = AVL_NEXT(&dl->dl_tree, pdle); + } while (dle) { uint64_t used, comp, uncomp; dsl_deadlist_entry_t *dle_next; bpobj_enqueue_subobj(bpo, dle->dle_bpobj.bpo_object, tx); + if (pdle) { + bpobj_prefetch_subobj(bpo, pdle->dle_bpobj.bpo_object); + pdle = AVL_NEXT(&dl->dl_tree, pdle); + } VERIFY0(bpobj_space(&dle->dle_bpobj, &used, &comp, &uncomp)); ASSERT3U(dl->dl_phys->dl_used, >=, used); ASSERT3U(dl->dl_phys->dl_comp, >=, comp); ASSERT3U(dl->dl_phys->dl_uncomp, >=, uncomp); dl->dl_phys->dl_used -= used; dl->dl_phys->dl_comp -= comp; dl->dl_phys->dl_uncomp -= uncomp; VERIFY0(zap_remove_int(dl->dl_os, dl->dl_object, dle->dle_mintxg, tx)); dle_next = AVL_NEXT(&dl->dl_tree, dle); avl_remove(&dl->dl_tree, dle); bpobj_close(&dle->dle_bpobj); kmem_free(dle, sizeof (*dle)); dle = dle_next; } mutex_exit(&dl->dl_lock); } typedef struct livelist_entry { blkptr_t le_bp; uint32_t le_refcnt; avl_node_t le_node; } livelist_entry_t; static int livelist_compare(const void *larg, const void *rarg) { const blkptr_t *l = &((livelist_entry_t *)larg)->le_bp; const blkptr_t *r = &((livelist_entry_t *)rarg)->le_bp; /* Sort them according to dva[0] */ uint64_t l_dva0_vdev = DVA_GET_VDEV(&l->blk_dva[0]); uint64_t r_dva0_vdev = DVA_GET_VDEV(&r->blk_dva[0]); if (l_dva0_vdev != r_dva0_vdev) return (TREE_CMP(l_dva0_vdev, r_dva0_vdev)); /* if vdevs are equal, sort by offsets. */ uint64_t l_dva0_offset = DVA_GET_OFFSET(&l->blk_dva[0]); uint64_t r_dva0_offset = DVA_GET_OFFSET(&r->blk_dva[0]); if (l_dva0_offset == r_dva0_offset) ASSERT3U(l->blk_birth, ==, r->blk_birth); return (TREE_CMP(l_dva0_offset, r_dva0_offset)); } struct livelist_iter_arg { avl_tree_t *avl; bplist_t *to_free; zthr_t *t; }; /* * Expects an AVL tree which is incrementally filled will FREE blkptrs * and used to match up ALLOC/FREE pairs. 
ALLOC'd blkptrs without a * corresponding FREE are stored in the supplied bplist. * * Note that multiple FREE and ALLOC entries for the same blkptr may * be encountered when dedup is involved. For this reason we keep a * refcount for all the FREE entries of each blkptr and ensure that * each of those FREE entries has a corresponding ALLOC preceding it. */ static int dsl_livelist_iterate(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) { struct livelist_iter_arg *lia = arg; avl_tree_t *avl = lia->avl; bplist_t *to_free = lia->to_free; zthr_t *t = lia->t; ASSERT(tx == NULL); if ((t != NULL) && (zthr_has_waiters(t) || zthr_iscancelled(t))) return (SET_ERROR(EINTR)); livelist_entry_t node; node.le_bp = *bp; livelist_entry_t *found = avl_find(avl, &node, NULL); if (bp_freed) { if (found == NULL) { /* first free entry for this blkptr */ livelist_entry_t *e = kmem_alloc(sizeof (livelist_entry_t), KM_SLEEP); e->le_bp = *bp; e->le_refcnt = 1; avl_add(avl, e); } else { /* dedup block free */ ASSERT(BP_GET_DEDUP(bp)); ASSERT3U(BP_GET_CHECKSUM(bp), ==, BP_GET_CHECKSUM(&found->le_bp)); ASSERT3U(found->le_refcnt + 1, >, found->le_refcnt); found->le_refcnt++; } } else { if (found == NULL) { /* block is currently marked as allocated */ bplist_append(to_free, bp); } else { /* alloc matches a free entry */ ASSERT3U(found->le_refcnt, !=, 0); found->le_refcnt--; if (found->le_refcnt == 0) { /* all tracked free pairs have been matched */ avl_remove(avl, found); kmem_free(found, sizeof (livelist_entry_t)); } else { /* * This is definitely a deduped blkptr so * let's validate it. */ ASSERT(BP_GET_DEDUP(bp)); ASSERT3U(BP_GET_CHECKSUM(bp), ==, BP_GET_CHECKSUM(&found->le_bp)); } } } return (0); } /* * Accepts a bpobj and a bplist. 
Will insert into the bplist the blkptrs * which have an ALLOC entry but no matching FREE */ int dsl_process_sub_livelist(bpobj_t *bpobj, bplist_t *to_free, zthr_t *t, uint64_t *size) { avl_tree_t avl; avl_create(&avl, livelist_compare, sizeof (livelist_entry_t), offsetof(livelist_entry_t, le_node)); /* process the sublist */ struct livelist_iter_arg arg = { .avl = &avl, .to_free = to_free, .t = t }; int err = bpobj_iterate_nofree(bpobj, dsl_livelist_iterate, &arg, size); VERIFY(err != 0 || avl_numnodes(&avl) == 0); void *cookie = NULL; livelist_entry_t *le = NULL; while ((le = avl_destroy_nodes(&avl, &cookie)) != NULL) { kmem_free(le, sizeof (livelist_entry_t)); } avl_destroy(&avl); return (err); } /* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs_livelist, zfs_livelist_, max_entries, ULONG, ZMOD_RW, "Size to start the next sub-livelist in a livelist"); ZFS_MODULE_PARAM(zfs_livelist, zfs_livelist_, min_percent_shared, INT, ZMOD_RW, "Threshold at which livelist is disabled"); /* END CSTYLED */