diff --git a/include/sys/btree.h b/include/sys/btree.h index a901d654ef1c..883abb5181c9 100644 --- a/include/sys/btree.h +++ b/include/sys/btree.h @@ -1,251 +1,252 @@ /* * CDDL HEADER START * * This file and its contents are supplied under the terms of the * Common Development and Distribution License ("CDDL"), version 1.0. * You may only use this file in accordance with the terms of version * 1.0 of the CDDL. * * A full copy of the text of the CDDL should have accompanied this * source. A copy of the CDDL is also available via the Internet at * http://www.illumos.org/license/CDDL. * * CDDL HEADER END */ /* * Copyright (c) 2019 by Delphix. All rights reserved. */ #ifndef _BTREE_H #define _BTREE_H #ifdef __cplusplus extern "C" { #endif #include /* * This file defines the interface for a B-Tree implementation for ZFS. The * tree can be used to store arbitrary sortable data types with low overhead * and good operation performance. In addition the tree intelligently * optimizes bulk in-order insertions to improve memory use and performance. * * Note that for all B-Tree functions, the values returned are pointers to the * internal copies of the data in the tree. The internal data can only be * safely mutated if the changes cannot change the ordering of the element * with respect to any other elements in the tree. * * The major drawback of the B-Tree is that any returned elements or indexes * are only valid until a side-effectful operation occurs, since these can * result in reallocation or relocation of data. Side effectful operations are * defined as insertion, removal, and zfs_btree_destroy_nodes. * * The B-Tree has two types of nodes: core nodes, and leaf nodes. Core * nodes have an array of children pointing to other nodes, and an array of * elements that act as separators between the elements of the subtrees rooted * at its children. Leaf nodes only contain data elements, and form the bottom * layer of the tree. 
Unlike B+ Trees, in this B-Tree implementation the * elements in the core nodes are not copies of or references to leaf node * elements. Each element occurs only once in the tree, no matter what kind * of node it is in. * * The tree's height is the same throughout, unlike many other forms of search * tree. Each node (except for the root) must be between half minus one and * completely full of elements (and children) at all times. Any operation that * would put the node outside of that range results in a rebalancing operation * (taking, merging, or splitting). * * This tree was implemented using descriptions from Wikipedia's articles on * B-Trees and B+ Trees. */ /* * Decreasing these values results in smaller memmove operations, but more of * them, and increased memory overhead. Increasing these values results in * higher variance in operation time, and reduces memory overhead. */ -#define BTREE_CORE_ELEMS 128 +#define BTREE_CORE_ELEMS 126 #define BTREE_LEAF_SIZE 4096 extern kmem_cache_t *zfs_btree_leaf_cache; typedef struct zfs_btree_hdr { struct zfs_btree_core *bth_parent; /* * Set to -1 to indicate core nodes. Other values represent first * valid element offset for leaf nodes. */ uint32_t bth_first; /* * For both leaf and core nodes, represents the number of elements in * the node. For core nodes, they will have bth_count + 1 children. */ uint32_t bth_count; } zfs_btree_hdr_t; typedef struct zfs_btree_core { zfs_btree_hdr_t btc_hdr; zfs_btree_hdr_t *btc_children[BTREE_CORE_ELEMS + 1]; uint8_t btc_elems[]; } zfs_btree_core_t; typedef struct zfs_btree_leaf { zfs_btree_hdr_t btl_hdr; uint8_t btl_elems[]; } zfs_btree_leaf_t; -#define BTREE_LEAF_ESIZE (BTREE_LEAF_SIZE - \ - offsetof(zfs_btree_leaf_t, btl_elems)) - typedef struct zfs_btree_index { zfs_btree_hdr_t *bti_node; uint32_t bti_offset; /* * True if the location is before the list offset, false if it's at * the listed offset. 
*/ boolean_t bti_before; } zfs_btree_index_t; typedef struct btree { - zfs_btree_hdr_t *bt_root; - int64_t bt_height; + int (*bt_compar) (const void *, const void *); size_t bt_elem_size; + size_t bt_leaf_size; uint32_t bt_leaf_cap; + int32_t bt_height; uint64_t bt_num_elems; uint64_t bt_num_nodes; + zfs_btree_hdr_t *bt_root; zfs_btree_leaf_t *bt_bulk; // non-null if bulk loading - int (*bt_compar) (const void *, const void *); } zfs_btree_t; /* * Allocate and deallocate caches for btree nodes. */ void zfs_btree_init(void); void zfs_btree_fini(void); /* * Initialize an B-Tree. Arguments are: * * tree - the tree to be initialized * compar - function to compare two nodes, it must return exactly: -1, 0, or +1 * -1 for <, 0 for ==, and +1 for > * size - the value of sizeof(struct my_type) + * lsize - custom leaf size */ void zfs_btree_create(zfs_btree_t *, int (*) (const void *, const void *), size_t); +void zfs_btree_create_custom(zfs_btree_t *, int (*)(const void *, const void *), + size_t, size_t); /* * Find a node with a matching value in the tree. Returns the matching node * found. If not found, it returns NULL and then if "where" is not NULL it sets * "where" for use with zfs_btree_add_idx() or zfs_btree_nearest(). * * node - node that has the value being looked for * where - position for use with zfs_btree_nearest() or zfs_btree_add_idx(), * may be NULL */ void *zfs_btree_find(zfs_btree_t *, const void *, zfs_btree_index_t *); /* * Insert a node into the tree. * * node - the node to insert * where - position as returned from zfs_btree_find() */ void zfs_btree_add_idx(zfs_btree_t *, const void *, const zfs_btree_index_t *); /* * Return the first or last valued node in the tree. Will return NULL if the * tree is empty. The index can be NULL if the location of the first or last * element isn't required. 
*/ void *zfs_btree_first(zfs_btree_t *, zfs_btree_index_t *); void *zfs_btree_last(zfs_btree_t *, zfs_btree_index_t *); /* * Return the next or previous valued node in the tree. The second index can * safely be NULL, if the location of the next or previous value isn't * required. */ void *zfs_btree_next(zfs_btree_t *, const zfs_btree_index_t *, zfs_btree_index_t *); void *zfs_btree_prev(zfs_btree_t *, const zfs_btree_index_t *, zfs_btree_index_t *); /* * Get a value from a tree and an index. */ void *zfs_btree_get(zfs_btree_t *, zfs_btree_index_t *); /* * Add a single value to the tree. The value must not compare equal to any * other node already in the tree. Note that the value will be copied out, not * inserted directly. It is safe to free or destroy the value once this * function returns. */ void zfs_btree_add(zfs_btree_t *, const void *); /* * Remove a single value from the tree. The value must be in the tree. The * pointer passed in may be a pointer into a tree-controlled buffer, but it * need not be. */ void zfs_btree_remove(zfs_btree_t *, const void *); /* * Remove the value at the given location from the tree. */ void zfs_btree_remove_idx(zfs_btree_t *, zfs_btree_index_t *); /* * Return the number of nodes in the tree */ ulong_t zfs_btree_numnodes(zfs_btree_t *); /* * Used to destroy any remaining nodes in a tree. The cookie argument should * be initialized to NULL before the first call. Returns a node that has been * removed from the tree and may be free()'d. Returns NULL when the tree is * empty. * * Once you call zfs_btree_destroy_nodes(), you can only continuing calling it * and finally zfs_btree_destroy(). No other B-Tree routines will be valid. 
* * cookie - an index used to save state between calls to * zfs_btree_destroy_nodes() * * EXAMPLE: * zfs_btree_t *tree; * struct my_data *node; * zfs_btree_index_t *cookie; * * cookie = NULL; * while ((node = zfs_btree_destroy_nodes(tree, &cookie)) != NULL) * data_destroy(node); * zfs_btree_destroy(tree); */ void *zfs_btree_destroy_nodes(zfs_btree_t *, zfs_btree_index_t **); /* * Destroys all nodes in the tree quickly. This doesn't give the caller an * opportunity to iterate over each node and do its own cleanup; for that, use * zfs_btree_destroy_nodes(). */ void zfs_btree_clear(zfs_btree_t *); /* * Final destroy of an B-Tree. Arguments are: * * tree - the empty tree to destroy */ void zfs_btree_destroy(zfs_btree_t *tree); /* Runs a variety of self-checks on the btree to verify integrity. */ void zfs_btree_verify(zfs_btree_t *tree); #ifdef __cplusplus } #endif #endif /* _BTREE_H */ diff --git a/include/sys/zap_impl.h b/include/sys/zap_impl.h index 250dde3ce235..3c83448caa2b 100644 --- a/include/sys/zap_impl.h +++ b/include/sys/zap_impl.h @@ -1,240 +1,239 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright (c) 2013, 2016 by Delphix. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. */ #ifndef _SYS_ZAP_IMPL_H #define _SYS_ZAP_IMPL_H #include #include #include #ifdef __cplusplus extern "C" { #endif extern int fzap_default_block_shift; #define ZAP_MAGIC 0x2F52AB2ABULL #define FZAP_BLOCK_SHIFT(zap) ((zap)->zap_f.zap_block_shift) #define MZAP_ENT_LEN 64 #define MZAP_NAME_LEN (MZAP_ENT_LEN - 8 - 4 - 2) #define MZAP_MAX_BLKSZ SPA_OLD_MAXBLOCKSIZE #define ZAP_NEED_CD (-1U) typedef struct mzap_ent_phys { uint64_t mze_value; uint32_t mze_cd; uint16_t mze_pad; /* in case we want to chain them someday */ char mze_name[MZAP_NAME_LEN]; } mzap_ent_phys_t; typedef struct mzap_phys { uint64_t mz_block_type; /* ZBT_MICRO */ uint64_t mz_salt; uint64_t mz_normflags; uint64_t mz_pad[5]; mzap_ent_phys_t mz_chunk[1]; /* actually variable size depending on block size */ } mzap_phys_t; typedef struct mzap_ent { - avl_node_t mze_node; - int mze_chunkid; - uint64_t mze_hash; - uint32_t mze_cd; /* copy from mze_phys->mze_cd */ + uint32_t mze_hash; + uint16_t mze_cd; /* copy from mze_phys->mze_cd */ + uint16_t mze_chunkid; } mzap_ent_t; #define MZE_PHYS(zap, mze) \ (&zap_m_phys(zap)->mz_chunk[(mze)->mze_chunkid]) /* * The (fat) zap is stored in one object. It is an array of * 1<= 6] [zap_leaf_t] [ptrtbl] ... * */ struct dmu_buf; struct zap_leaf; #define ZBT_LEAF ((1ULL << 63) + 0) #define ZBT_HEADER ((1ULL << 63) + 1) #define ZBT_MICRO ((1ULL << 63) + 3) /* any other values are ptrtbl blocks */ /* * the embedded pointer table takes up half a block: * block size / entry size (2^3) / 2 */ #define ZAP_EMBEDDED_PTRTBL_SHIFT(zap) (FZAP_BLOCK_SHIFT(zap) - 3 - 1) /* * The embedded pointer table starts half-way through the block. 
Since * the pointer table itself is half the block, it starts at (64-bit) * word number (1<zap_dbuf->db_data); } static inline mzap_phys_t * zap_m_phys(zap_t *zap) { return (zap->zap_dbuf->db_data); } typedef struct zap_name { zap_t *zn_zap; int zn_key_intlen; const void *zn_key_orig; int zn_key_orig_numints; const void *zn_key_norm; int zn_key_norm_numints; uint64_t zn_hash; matchtype_t zn_matchtype; int zn_normflags; char zn_normbuf[ZAP_MAXNAMELEN]; } zap_name_t; #define zap_f zap_u.zap_fat #define zap_m zap_u.zap_micro boolean_t zap_match(zap_name_t *zn, const char *matchname); int zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp); void zap_unlockdir(zap_t *zap, void *tag); void zap_evict_sync(void *dbu); -zap_name_t *zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt); +zap_name_t *zap_name_alloc_str(zap_t *zap, const char *key, matchtype_t mt); void zap_name_free(zap_name_t *zn); int zap_hashbits(zap_t *zap); uint32_t zap_maxcd(zap_t *zap); uint64_t zap_getflags(zap_t *zap); #define ZAP_HASH_IDX(hash, n) (((n) == 0) ? 
0 : ((hash) >> (64 - (n)))) void fzap_byteswap(void *buf, size_t size); int fzap_count(zap_t *zap, uint64_t *count); int fzap_lookup(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers, void *buf, char *realname, int rn_len, boolean_t *normalization_conflictp); void fzap_prefetch(zap_name_t *zn); int fzap_add(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers, const void *val, void *tag, dmu_tx_t *tx); int fzap_update(zap_name_t *zn, int integer_size, uint64_t num_integers, const void *val, void *tag, dmu_tx_t *tx); int fzap_length(zap_name_t *zn, uint64_t *integer_size, uint64_t *num_integers); int fzap_remove(zap_name_t *zn, dmu_tx_t *tx); int fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za); void fzap_get_stats(zap_t *zap, zap_stats_t *zs); void zap_put_leaf(struct zap_leaf *l); int fzap_add_cd(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers, const void *val, uint32_t cd, void *tag, dmu_tx_t *tx); void fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags); #ifdef __cplusplus } #endif #endif /* _SYS_ZAP_IMPL_H */ diff --git a/module/zfs/btree.c b/module/zfs/btree.c index e16c4ebef6ba..28ab3fcdcc3c 100644 --- a/module/zfs/btree.c +++ b/module/zfs/btree.c @@ -1,2179 +1,2207 @@ /* * CDDL HEADER START * * This file and its contents are supplied under the terms of the * Common Development and Distribution License ("CDDL"), version 1.0. * You may only use this file in accordance with the terms of version * 1.0 of the CDDL. * * A full copy of the text of the CDDL should have accompanied this * source. A copy of the CDDL is also available via the Internet at * http://www.illumos.org/license/CDDL. * * CDDL HEADER END */ /* * Copyright (c) 2019 by Delphix. All rights reserved. */ #include #include #include kmem_cache_t *zfs_btree_leaf_cache; /* * Control the extent of the verification that occurs when zfs_btree_verify is * called. Primarily used for debugging when extending the btree logic and * functionality. 
As the intensity is increased, new verification steps are * added. These steps are cumulative; intensity = 3 includes the intensity = 1 * and intensity = 2 steps as well. * * Intensity 1: Verify that the tree's height is consistent throughout. * Intensity 2: Verify that a core node's children's parent pointers point * to the core node. * Intensity 3: Verify that the total number of elements in the tree matches the * sum of the number of elements in each node. Also verifies that each node's * count obeys the invariants (less than or equal to maximum value, greater than * or equal to half the maximum minus one). * Intensity 4: Verify that each element compares less than the element * immediately after it and greater than the one immediately before it using the * comparator function. For core nodes, also checks that each element is greater * than the last element in the first of the two nodes it separates, and less * than the first element in the second of the two nodes. * Intensity 5: Verifies, if ZFS_DEBUG is defined, that all unused memory inside * of each node is poisoned appropriately. Note that poisoning always occurs if * ZFS_DEBUG is set, so it is safe to set the intensity to 5 during normal * operation. * * Intensity 4 and 5 are particularly expensive to perform; the previous levels * are a few memory operations per node, while these levels require multiple * operations per element. In addition, when creating large btrees, these * operations are called at every step, resulting in extremely slow operation * (while the asymptotic complexity of the other steps is the same, the * importance of the constant factors cannot be denied). */ uint_t zfs_btree_verify_intensity = 0; /* * Convenience functions to silence warnings from memcpy/memmove's * return values and change argument order to src, dest. 
*/ static void bcpy(const void *src, void *dest, size_t size) { (void) memcpy(dest, src, size); } static void bmov(const void *src, void *dest, size_t size) { (void) memmove(dest, src, size); } static boolean_t zfs_btree_is_core(struct zfs_btree_hdr *hdr) { return (hdr->bth_first == -1); } #ifdef _ILP32 #define BTREE_POISON 0xabadb10c #else #define BTREE_POISON 0xabadb10cdeadbeef #endif static void zfs_btree_poison_node(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) { #ifdef ZFS_DEBUG size_t size = tree->bt_elem_size; if (zfs_btree_is_core(hdr)) { zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; for (uint32_t i = hdr->bth_count + 1; i <= BTREE_CORE_ELEMS; i++) { node->btc_children[i] = (zfs_btree_hdr_t *)BTREE_POISON; } (void) memset(node->btc_elems + hdr->bth_count * size, 0x0f, (BTREE_CORE_ELEMS - hdr->bth_count) * size); } else { zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; (void) memset(leaf->btl_elems, 0x0f, hdr->bth_first * size); (void) memset(leaf->btl_elems + (hdr->bth_first + hdr->bth_count) * size, 0x0f, - BTREE_LEAF_ESIZE - + tree->bt_leaf_size - offsetof(zfs_btree_leaf_t, btl_elems) - (hdr->bth_first + hdr->bth_count) * size); } #endif } static inline void zfs_btree_poison_node_at(zfs_btree_t *tree, zfs_btree_hdr_t *hdr, uint32_t idx, uint32_t count) { #ifdef ZFS_DEBUG size_t size = tree->bt_elem_size; if (zfs_btree_is_core(hdr)) { ASSERT3U(idx, >=, hdr->bth_count); ASSERT3U(idx, <=, BTREE_CORE_ELEMS); ASSERT3U(idx + count, <=, BTREE_CORE_ELEMS); zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; for (uint32_t i = 1; i <= count; i++) { node->btc_children[idx + i] = (zfs_btree_hdr_t *)BTREE_POISON; } (void) memset(node->btc_elems + idx * size, 0x0f, count * size); } else { ASSERT3U(idx, <=, tree->bt_leaf_cap); ASSERT3U(idx + count, <=, tree->bt_leaf_cap); zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; (void) memset(leaf->btl_elems + (hdr->bth_first + idx) * size, 0x0f, count * size); } #endif } static inline void zfs_btree_verify_poison_at(zfs_btree_t 
*tree, zfs_btree_hdr_t *hdr, uint32_t idx) { #ifdef ZFS_DEBUG size_t size = tree->bt_elem_size; if (zfs_btree_is_core(hdr)) { ASSERT3U(idx, <, BTREE_CORE_ELEMS); zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; zfs_btree_hdr_t *cval = (zfs_btree_hdr_t *)BTREE_POISON; VERIFY3P(node->btc_children[idx + 1], ==, cval); for (size_t i = 0; i < size; i++) VERIFY3U(node->btc_elems[idx * size + i], ==, 0x0f); } else { ASSERT3U(idx, <, tree->bt_leaf_cap); zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; if (idx >= tree->bt_leaf_cap - hdr->bth_first) return; for (size_t i = 0; i < size; i++) { VERIFY3U(leaf->btl_elems[(hdr->bth_first + idx) * size + i], ==, 0x0f); } } #endif } void zfs_btree_init(void) { zfs_btree_leaf_cache = kmem_cache_create("zfs_btree_leaf_cache", BTREE_LEAF_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0); } void zfs_btree_fini(void) { kmem_cache_destroy(zfs_btree_leaf_cache); } +static void * +zfs_btree_leaf_alloc(zfs_btree_t *tree) +{ + if (tree->bt_leaf_size == BTREE_LEAF_SIZE) + return (kmem_cache_alloc(zfs_btree_leaf_cache, KM_SLEEP)); + else + return (kmem_alloc(tree->bt_leaf_size, KM_SLEEP)); +} + +static void +zfs_btree_leaf_free(zfs_btree_t *tree, void *ptr) +{ + if (tree->bt_leaf_size == BTREE_LEAF_SIZE) + return (kmem_cache_free(zfs_btree_leaf_cache, ptr)); + else + return (kmem_free(ptr, tree->bt_leaf_size)); +} + void zfs_btree_create(zfs_btree_t *tree, int (*compar) (const void *, const void *), size_t size) { - ASSERT3U(size, <=, BTREE_LEAF_ESIZE / 2); + zfs_btree_create_custom(tree, compar, size, BTREE_LEAF_SIZE); +} + +void +zfs_btree_create_custom(zfs_btree_t *tree, + int (*compar) (const void *, const void *), + size_t size, size_t lsize) +{ + size_t esize = lsize - offsetof(zfs_btree_leaf_t, btl_elems); - bzero(tree, sizeof (*tree)); + ASSERT3U(size, <=, esize / 2); + memset(tree, 0, sizeof (*tree)); tree->bt_compar = compar; tree->bt_elem_size = size; - tree->bt_leaf_cap = P2ALIGN(BTREE_LEAF_ESIZE / size, 2); + tree->bt_leaf_size = lsize; 
+ tree->bt_leaf_cap = P2ALIGN(esize / size, 2); tree->bt_height = -1; tree->bt_bulk = NULL; } /* * Find value in the array of elements provided. Uses a simple binary search. */ static void * zfs_btree_find_in_buf(zfs_btree_t *tree, uint8_t *buf, uint32_t nelems, const void *value, zfs_btree_index_t *where) { uint32_t max = nelems; uint32_t min = 0; while (max > min) { uint32_t idx = (min + max) / 2; uint8_t *cur = buf + idx * tree->bt_elem_size; int comp = tree->bt_compar(cur, value); if (comp < 0) { min = idx + 1; } else if (comp > 0) { max = idx; } else { where->bti_offset = idx; where->bti_before = B_FALSE; return (cur); } } where->bti_offset = max; where->bti_before = B_TRUE; return (NULL); } /* * Find the given value in the tree. where may be passed as null to use as a * membership test or if the btree is being used as a map. */ void * zfs_btree_find(zfs_btree_t *tree, const void *value, zfs_btree_index_t *where) { if (tree->bt_height == -1) { if (where != NULL) { where->bti_node = NULL; where->bti_offset = 0; } ASSERT0(tree->bt_num_elems); return (NULL); } /* * If we're in bulk-insert mode, we check the last spot in the tree * and the last leaf in the tree before doing the normal search, * because for most workloads the vast majority of finds in * bulk-insert mode are to insert new elements. */ zfs_btree_index_t idx; size_t size = tree->bt_elem_size; if (tree->bt_bulk != NULL) { zfs_btree_leaf_t *last_leaf = tree->bt_bulk; int comp = tree->bt_compar(last_leaf->btl_elems + (last_leaf->btl_hdr.bth_first + last_leaf->btl_hdr.bth_count - 1) * size, value); if (comp < 0) { /* * If what they're looking for is after the last * element, it's not in the tree. 
*/ if (where != NULL) { where->bti_node = (zfs_btree_hdr_t *)last_leaf; where->bti_offset = last_leaf->btl_hdr.bth_count; where->bti_before = B_TRUE; } return (NULL); } else if (comp == 0) { if (where != NULL) { where->bti_node = (zfs_btree_hdr_t *)last_leaf; where->bti_offset = last_leaf->btl_hdr.bth_count - 1; where->bti_before = B_FALSE; } return (last_leaf->btl_elems + (last_leaf->btl_hdr.bth_first + last_leaf->btl_hdr.bth_count - 1) * size); } if (tree->bt_compar(last_leaf->btl_elems + last_leaf->btl_hdr.bth_first * size, value) <= 0) { /* * If what they're looking for is after the first * element in the last leaf, it's in the last leaf or * it's not in the tree. */ void *d = zfs_btree_find_in_buf(tree, last_leaf->btl_elems + last_leaf->btl_hdr.bth_first * size, last_leaf->btl_hdr.bth_count, value, &idx); if (where != NULL) { idx.bti_node = (zfs_btree_hdr_t *)last_leaf; *where = idx; } return (d); } } zfs_btree_core_t *node = NULL; uint32_t child = 0; - uint64_t depth = 0; + uint32_t depth = 0; /* * Iterate down the tree, finding which child the value should be in * by comparing with the separators. */ for (node = (zfs_btree_core_t *)tree->bt_root; depth < tree->bt_height; node = (zfs_btree_core_t *)node->btc_children[child], depth++) { ASSERT3P(node, !=, NULL); void *d = zfs_btree_find_in_buf(tree, node->btc_elems, node->btc_hdr.bth_count, value, &idx); EQUIV(d != NULL, !idx.bti_before); if (d != NULL) { if (where != NULL) { idx.bti_node = (zfs_btree_hdr_t *)node; *where = idx; } return (d); } ASSERT(idx.bti_before); child = idx.bti_offset; } /* * The value is in this leaf, or it would be if it were in the * tree. Find its proper location and return it. */ zfs_btree_leaf_t *leaf = (depth == 0 ? 
(zfs_btree_leaf_t *)tree->bt_root : (zfs_btree_leaf_t *)node); void *d = zfs_btree_find_in_buf(tree, leaf->btl_elems + leaf->btl_hdr.bth_first * size, leaf->btl_hdr.bth_count, value, &idx); if (where != NULL) { idx.bti_node = (zfs_btree_hdr_t *)leaf; *where = idx; } return (d); } /* * To explain the following functions, it is useful to understand the four * kinds of shifts used in btree operation. First, a shift is a movement of * elements within a node. It is used to create gaps for inserting new * elements and children, or cover gaps created when things are removed. A * shift has two fundamental properties, each of which can be one of two * values, making four types of shifts. There is the direction of the shift * (left or right) and the shape of the shift (parallelogram or isoceles * trapezoid (shortened to trapezoid hereafter)). The shape distinction only * applies to shifts of core nodes. * * The names derive from the following imagining of the layout of a node: * * Elements: * * * * * * * ... * * * * Children: * * * * * * * * ... * * * * * This layout follows from the fact that the elements act as separators * between pairs of children, and that children root subtrees "below" the * current node. A left and right shift are fairly self-explanatory; a left * shift moves things to the left, while a right shift moves things to the * right. A parallelogram shift is a shift with the same number of elements * and children being moved, while a trapezoid shift is a shift that moves one * more children than elements. An example follows: * * A parallelogram shift could contain the following: * _______________ * \* * * * \ * * * ... * * * * * \ * * * *\ * * * ... * * * * --------------- * A trapezoid shift could contain the following: * ___________ * * / * * * \ * * * ... * * * * * / * * * *\ * * * ... 
* * * * --------------- * * Note that a parallelogram shift is always shaped like a "left-leaning" * parallelogram, where the starting index of the children being moved is * always one higher than the starting index of the elements being moved. No * "right-leaning" parallelogram shifts are needed (shifts where the starting * element index and starting child index being moved are the same) to achieve * any btree operations, so we ignore them. */ enum bt_shift_shape { BSS_TRAPEZOID, BSS_PARALLELOGRAM }; enum bt_shift_direction { BSD_LEFT, BSD_RIGHT }; /* * Shift elements and children in the provided core node by off spots. The * first element moved is idx, and count elements are moved. The shape of the * shift is determined by shape. The direction is determined by dir. */ static inline void bt_shift_core(zfs_btree_t *tree, zfs_btree_core_t *node, uint32_t idx, uint32_t count, uint32_t off, enum bt_shift_shape shape, enum bt_shift_direction dir) { size_t size = tree->bt_elem_size; ASSERT(zfs_btree_is_core(&node->btc_hdr)); uint8_t *e_start = node->btc_elems + idx * size; uint8_t *e_out = (dir == BSD_LEFT ? e_start - off * size : e_start + off * size); bmov(e_start, e_out, count * size); zfs_btree_hdr_t **c_start = node->btc_children + idx + (shape == BSS_TRAPEZOID ? 0 : 1); zfs_btree_hdr_t **c_out = (dir == BSD_LEFT ? c_start - off : c_start + off); uint32_t c_count = count + (shape == BSS_TRAPEZOID ? 1 : 0); bmov(c_start, c_out, c_count * sizeof (*c_start)); } /* * Shift elements and children in the provided core node left by one spot. * The first element moved is idx, and count elements are moved. The * shape of the shift is determined by trap; true if the shift is a trapezoid, * false if it is a parallelogram. 
*/ static inline void bt_shift_core_left(zfs_btree_t *tree, zfs_btree_core_t *node, uint32_t idx, uint32_t count, enum bt_shift_shape shape) { bt_shift_core(tree, node, idx, count, 1, shape, BSD_LEFT); } /* * Shift elements and children in the provided core node right by one spot. * Starts with elements[idx] and children[idx] and one more child than element. */ static inline void bt_shift_core_right(zfs_btree_t *tree, zfs_btree_core_t *node, uint32_t idx, uint32_t count, enum bt_shift_shape shape) { bt_shift_core(tree, node, idx, count, 1, shape, BSD_RIGHT); } /* * Shift elements and children in the provided leaf node by off spots. * The first element moved is idx, and count elements are moved. The direction * is determined by left. */ static inline void bt_shift_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *node, uint32_t idx, uint32_t count, uint32_t off, enum bt_shift_direction dir) { size_t size = tree->bt_elem_size; zfs_btree_hdr_t *hdr = &node->btl_hdr; ASSERT(!zfs_btree_is_core(hdr)); if (count == 0) return; uint8_t *start = node->btl_elems + (hdr->bth_first + idx) * size; uint8_t *out = (dir == BSD_LEFT ? start - off * size : start + off * size); bmov(start, out, count * size); } /* * Grow leaf for n new elements before idx. */ static void bt_grow_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, uint32_t idx, uint32_t n) { zfs_btree_hdr_t *hdr = &leaf->btl_hdr; ASSERT(!zfs_btree_is_core(hdr)); ASSERT3U(idx, <=, hdr->bth_count); uint32_t capacity = tree->bt_leaf_cap; ASSERT3U(hdr->bth_count + n, <=, capacity); boolean_t cl = (hdr->bth_first >= n); boolean_t cr = (hdr->bth_first + hdr->bth_count + n <= capacity); if (cl && (!cr || idx <= hdr->bth_count / 2)) { /* Grow left. */ hdr->bth_first -= n; bt_shift_leaf(tree, leaf, n, idx, n, BSD_LEFT); } else if (cr) { /* Grow right. */ bt_shift_leaf(tree, leaf, idx, hdr->bth_count - idx, n, BSD_RIGHT); } else { /* Grow both ways. 
*/ uint32_t fn = hdr->bth_first - (capacity - (hdr->bth_count + n)) / 2; hdr->bth_first -= fn; bt_shift_leaf(tree, leaf, fn, idx, fn, BSD_LEFT); bt_shift_leaf(tree, leaf, fn + idx, hdr->bth_count - idx, n - fn, BSD_RIGHT); } hdr->bth_count += n; } /* * Shrink leaf for count elements starting from idx. */ static void bt_shrink_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, uint32_t idx, uint32_t n) { zfs_btree_hdr_t *hdr = &leaf->btl_hdr; ASSERT(!zfs_btree_is_core(hdr)); ASSERT3U(idx, <=, hdr->bth_count); ASSERT3U(idx + n, <=, hdr->bth_count); if (idx <= (hdr->bth_count - n) / 2) { bt_shift_leaf(tree, leaf, 0, idx, n, BSD_RIGHT); zfs_btree_poison_node_at(tree, hdr, 0, n); hdr->bth_first += n; } else { bt_shift_leaf(tree, leaf, idx + n, hdr->bth_count - idx - n, n, BSD_LEFT); zfs_btree_poison_node_at(tree, hdr, hdr->bth_count - n, n); } hdr->bth_count -= n; } /* * Move children and elements from one core node to another. The shape * parameter behaves the same as it does in the shift logic. */ static inline void bt_transfer_core(zfs_btree_t *tree, zfs_btree_core_t *source, uint32_t sidx, uint32_t count, zfs_btree_core_t *dest, uint32_t didx, enum bt_shift_shape shape) { size_t size = tree->bt_elem_size; ASSERT(zfs_btree_is_core(&source->btc_hdr)); ASSERT(zfs_btree_is_core(&dest->btc_hdr)); bcpy(source->btc_elems + sidx * size, dest->btc_elems + didx * size, count * size); uint32_t c_count = count + (shape == BSS_TRAPEZOID ? 1 : 0); bcpy(source->btc_children + sidx + (shape == BSS_TRAPEZOID ? 0 : 1), dest->btc_children + didx + (shape == BSS_TRAPEZOID ? 
0 : 1), c_count * sizeof (*source->btc_children)); } static inline void bt_transfer_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *source, uint32_t sidx, uint32_t count, zfs_btree_leaf_t *dest, uint32_t didx) { size_t size = tree->bt_elem_size; ASSERT(!zfs_btree_is_core(&source->btl_hdr)); ASSERT(!zfs_btree_is_core(&dest->btl_hdr)); bcpy(source->btl_elems + (source->btl_hdr.bth_first + sidx) * size, dest->btl_elems + (dest->btl_hdr.bth_first + didx) * size, count * size); } /* * Find the first element in the subtree rooted at hdr, return its value and * put its location in where if non-null. */ static void * zfs_btree_first_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr, zfs_btree_index_t *where) { zfs_btree_hdr_t *node; for (node = hdr; zfs_btree_is_core(node); node = ((zfs_btree_core_t *)node)->btc_children[0]) ; ASSERT(!zfs_btree_is_core(node)); zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)node; if (where != NULL) { where->bti_node = node; where->bti_offset = 0; where->bti_before = B_FALSE; } return (&leaf->btl_elems[node->bth_first * tree->bt_elem_size]); } /* Insert an element and a child into a core node at the given offset. */ static void zfs_btree_insert_core_impl(zfs_btree_t *tree, zfs_btree_core_t *parent, uint32_t offset, zfs_btree_hdr_t *new_node, void *buf) { size_t size = tree->bt_elem_size; zfs_btree_hdr_t *par_hdr = &parent->btc_hdr; ASSERT3P(par_hdr, ==, new_node->bth_parent); ASSERT3U(par_hdr->bth_count, <, BTREE_CORE_ELEMS); if (zfs_btree_verify_intensity >= 5) { zfs_btree_verify_poison_at(tree, par_hdr, par_hdr->bth_count); } /* Shift existing elements and children */ uint32_t count = par_hdr->bth_count - offset; bt_shift_core_right(tree, parent, offset, count, BSS_PARALLELOGRAM); /* Insert new values */ parent->btc_children[offset + 1] = new_node; bcpy(buf, parent->btc_elems + offset * size, size); par_hdr->bth_count++; } /* * Insert new_node into the parent of old_node directly after old_node, with * buf as the dividing element between the two. 
 */
static void
zfs_btree_insert_into_parent(zfs_btree_t *tree, zfs_btree_hdr_t *old_node,
    zfs_btree_hdr_t *new_node, void *buf)
{
	ASSERT3P(old_node->bth_parent, ==, new_node->bth_parent);
	size_t size = tree->bt_elem_size;
	zfs_btree_core_t *parent = old_node->bth_parent;
	/* Address-of only; parent may be NULL and is checked just below. */
	zfs_btree_hdr_t *par_hdr = &parent->btc_hdr;

	/*
	 * If this is the root node we were splitting, we create a new root
	 * and increase the height of the tree.
	 */
	if (parent == NULL) {
		ASSERT3P(old_node, ==, tree->bt_root);
		tree->bt_num_nodes++;
		zfs_btree_core_t *new_root =
		    kmem_alloc(sizeof (zfs_btree_core_t) + BTREE_CORE_ELEMS *
		    size, KM_SLEEP);
		zfs_btree_hdr_t *new_root_hdr = &new_root->btc_hdr;
		new_root_hdr->bth_parent = NULL;
		new_root_hdr->bth_first = -1;
		new_root_hdr->bth_count = 1;

		old_node->bth_parent = new_node->bth_parent = new_root;
		new_root->btc_children[0] = old_node;
		new_root->btc_children[1] = new_node;
		bcpy(buf, new_root->btc_elems, size);

		tree->bt_height++;
		tree->bt_root = new_root_hdr;
		zfs_btree_poison_node(tree, new_root_hdr);
		return;
	}

	/*
	 * Since we have the new separator, binary search for where to put
	 * new_node.
	 */
	zfs_btree_index_t idx;
	ASSERT(zfs_btree_is_core(par_hdr));
	VERIFY3P(zfs_btree_find_in_buf(tree, parent->btc_elems,
	    par_hdr->bth_count, buf, &idx), ==, NULL);
	ASSERT(idx.bti_before);
	uint32_t offset = idx.bti_offset;
	ASSERT3U(offset, <=, par_hdr->bth_count);
	ASSERT3P(parent->btc_children[offset], ==, old_node);

	/*
	 * If the parent isn't full, shift things to accommodate our insertions
	 * and return.
	 */
	if (par_hdr->bth_count != BTREE_CORE_ELEMS) {
		zfs_btree_insert_core_impl(tree, parent, offset, new_node,
		    buf);
		return;
	}

	/*
	 * We need to split this core node into two. Currently there are
	 * BTREE_CORE_ELEMS + 1 child nodes, and we are adding one for
	 * BTREE_CORE_ELEMS + 2. Some of the children will be part of the
	 * current node, and the others will be moved to the new core node.
	 * There are BTREE_CORE_ELEMS + 1 elements including the new one. One
	 * will be used as the new separator in our parent, and the others
	 * will be split among the two core nodes.
	 *
	 * Usually we will split the node in half evenly, with
	 * BTREE_CORE_ELEMS/2 elements in each node. If we're bulk loading, we
	 * instead move only about a quarter of the elements (and children) to
	 * the new node. Since the average state after a long time is a 3/4
	 * full node, shortcutting directly to that state improves efficiency.
	 *
	 * We do this in two stages: first we split into two nodes, and then we
	 * reuse our existing logic to insert the new element and child.
	 */
	uint32_t move_count = MAX((BTREE_CORE_ELEMS / (tree->bt_bulk == NULL ?
	    2 : 4)) - 1, 2);
	uint32_t keep_count = BTREE_CORE_ELEMS - move_count - 1;
	ASSERT3U(BTREE_CORE_ELEMS - move_count, >=, 2);
	tree->bt_num_nodes++;
	zfs_btree_core_t *new_parent = kmem_alloc(sizeof (zfs_btree_core_t) +
	    BTREE_CORE_ELEMS * size, KM_SLEEP);
	zfs_btree_hdr_t *new_par_hdr = &new_parent->btc_hdr;
	new_par_hdr->bth_parent = par_hdr->bth_parent;
	new_par_hdr->bth_first = -1;
	new_par_hdr->bth_count = move_count;
	zfs_btree_poison_node(tree, new_par_hdr);

	par_hdr->bth_count = keep_count;

	bt_transfer_core(tree, parent, keep_count + 1, move_count, new_parent,
	    0, BSS_TRAPEZOID);

	/* Store the new separator in a buffer. */
	uint8_t *tmp_buf = kmem_alloc(size, KM_SLEEP);
	bcpy(parent->btc_elems + keep_count * size, tmp_buf, size);
	zfs_btree_poison_node(tree, par_hdr);

	if (offset < keep_count) {
		/* Insert the new node into the left half */
		zfs_btree_insert_core_impl(tree, parent, offset, new_node,
		    buf);

		/*
		 * Move the new separator to the existing buffer.
		 */
		bcpy(tmp_buf, buf, size);
	} else if (offset > keep_count) {
		/* Insert the new node into the right half */
		new_node->bth_parent = new_parent;
		zfs_btree_insert_core_impl(tree, new_parent,
		    offset - keep_count - 1, new_node, buf);

		/*
		 * Move the new separator to the existing buffer.
		 */
		bcpy(tmp_buf, buf, size);
	} else {
		/*
		 * Move the new separator into the right half, and replace it
		 * with buf. We also need to shift back the elements in the
		 * right half to accommodate new_node.
		 */
		bt_shift_core_right(tree, new_parent, 0, move_count,
		    BSS_TRAPEZOID);
		new_parent->btc_children[0] = new_node;
		bcpy(tmp_buf, new_parent->btc_elems, size);
		new_par_hdr->bth_count++;
	}
	kmem_free(tmp_buf, size);
	zfs_btree_poison_node(tree, par_hdr);

	/* The transferred children must now point at the new parent. */
	for (uint32_t i = 0; i <= new_parent->btc_hdr.bth_count; i++)
		new_parent->btc_children[i]->bth_parent = new_parent;

	for (uint32_t i = 0; i <= parent->btc_hdr.bth_count; i++)
		ASSERT3P(parent->btc_children[i]->bth_parent, ==, parent);

	/*
	 * Now that the node is split, we need to insert the new node into its
	 * parent. This may cause further splitting.
	 */
	zfs_btree_insert_into_parent(tree, &parent->btc_hdr,
	    &new_parent->btc_hdr, buf);
}

/* Insert an element into a leaf node at the given offset. */
static void
zfs_btree_insert_leaf_impl(zfs_btree_t *tree, zfs_btree_leaf_t *leaf,
    uint32_t idx, const void *value)
{
	size_t size = tree->bt_elem_size;
	zfs_btree_hdr_t *hdr = &leaf->btl_hdr;
	ASSERT3U(leaf->btl_hdr.bth_count, <, tree->bt_leaf_cap);

	if (zfs_btree_verify_intensity >= 5) {
		zfs_btree_verify_poison_at(tree, &leaf->btl_hdr,
		    leaf->btl_hdr.bth_count);
	}

	bt_grow_leaf(tree, leaf, idx, 1);
	uint8_t *start = leaf->btl_elems + (hdr->bth_first + idx) * size;
	bcpy(value, start, size);
}

static void
zfs_btree_verify_order_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr);

/* Helper function for inserting a new value into leaf at the given index. */
static void
zfs_btree_insert_into_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf,
    const void *value, uint32_t idx)
{
	size_t size = tree->bt_elem_size;
	uint32_t capacity = tree->bt_leaf_cap;

	/*
	 * If the leaf isn't full, shift the elements after idx and insert
	 * value.
	 */
	if (leaf->btl_hdr.bth_count != capacity) {
		zfs_btree_insert_leaf_impl(tree, leaf, idx, value);
		return;
	}

	/*
	 * Otherwise, we split the leaf node into two nodes. If we're not bulk
	 * inserting, each is of size (capacity / 2). If we are bulk
	 * inserting, we move a quarter of the elements to the new node so
	 * inserts into the old node don't cause immediate splitting but the
	 * tree stays relatively dense. Since the average state after a long
	 * time is a 3/4 full node, shortcutting directly to that state
	 * improves efficiency. At the end of the bulk insertion process
	 * we'll need to go through and fix up any nodes (the last leaf and
	 * its ancestors, potentially) that are below the minimum.
	 *
	 * In either case, we're left with one extra element. The leftover
	 * element will become the new dividing element between the two nodes.
	 */
	uint32_t move_count = MAX(capacity / (tree->bt_bulk ? 4 : 2), 1) - 1;
	uint32_t keep_count = capacity - move_count - 1;
	ASSERT3U(keep_count, >=, 1);
	/* If we insert on the left, move one more to keep leaves balanced. */
	if (idx < keep_count) {
		keep_count--;
		move_count++;
	}
	tree->bt_num_nodes++;
	zfs_btree_leaf_t *new_leaf = zfs_btree_leaf_alloc(tree);
	zfs_btree_hdr_t *new_hdr = &new_leaf->btl_hdr;
	new_hdr->bth_parent = leaf->btl_hdr.bth_parent;
	new_hdr->bth_first = (tree->bt_bulk ? 0 : capacity / 4) +
	    (idx >= keep_count && idx <= keep_count + move_count / 2);
	new_hdr->bth_count = move_count;
	zfs_btree_poison_node(tree, new_hdr);

	/* If we split the bulk-insert leaf, the new right leaf takes over. */
	if (tree->bt_bulk != NULL && leaf == tree->bt_bulk)
		tree->bt_bulk = new_leaf;

	/* Copy the back part to the new leaf. */
	bt_transfer_leaf(tree, leaf, keep_count + 1, move_count, new_leaf, 0);

	/* We store the new separator in a buffer we control for simplicity. */
	uint8_t *buf = kmem_alloc(size, KM_SLEEP);
	bcpy(leaf->btl_elems + (leaf->btl_hdr.bth_first + keep_count) * size,
	    buf, size);

	bt_shrink_leaf(tree, leaf, keep_count, 1 + move_count);

	if (idx < keep_count) {
		/* Insert into the existing leaf. */
		zfs_btree_insert_leaf_impl(tree, leaf, idx, value);
	} else if (idx > keep_count) {
		/* Insert into the new leaf. */
		zfs_btree_insert_leaf_impl(tree, new_leaf, idx - keep_count -
		    1, value);
	} else {
		/*
		 * Insert the planned separator into the new leaf, and use
		 * the new value as the new separator.
		 */
		zfs_btree_insert_leaf_impl(tree, new_leaf, 0, buf);
		bcpy(value, buf, size);
	}

	/*
	 * Now that the node is split, we need to insert the new node into its
	 * parent. This may cause further splitting, but only of core nodes.
	 */
	zfs_btree_insert_into_parent(tree, &leaf->btl_hdr, &new_leaf->btl_hdr,
	    buf);
	kmem_free(buf, size);
}

/*
 * Return the index of hdr within its parent's child array, found by binary
 * searching for hdr's first element among the parent's separators.
 */
static uint32_t
zfs_btree_find_parent_idx(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
{
	void *buf;
	if (zfs_btree_is_core(hdr)) {
		buf = ((zfs_btree_core_t *)hdr)->btc_elems;
	} else {
		buf = ((zfs_btree_leaf_t *)hdr)->btl_elems +
		    hdr->bth_first * tree->bt_elem_size;
	}
	zfs_btree_index_t idx;
	zfs_btree_core_t *parent = hdr->bth_parent;
	VERIFY3P(zfs_btree_find_in_buf(tree, parent->btc_elems,
	    parent->btc_hdr.bth_count, buf, &idx), ==, NULL);
	ASSERT(idx.bti_before);
	ASSERT3U(idx.bti_offset, <=, parent->btc_hdr.bth_count);
	ASSERT3P(parent->btc_children[idx.bti_offset], ==, hdr);
	return (idx.bti_offset);
}

/*
 * Take the b-tree out of bulk insert mode. During bulk-insert mode, some
 * nodes may violate the invariant that non-root nodes must be at least half
 * full. All nodes violating this invariant should be the last node in their
 * particular level. To correct the invariant, we take values from their left
 * neighbor until they are half full. They must have a left neighbor at their
 * level because the last node at a level is not the first node unless it's
 * the root.
 */
static void
zfs_btree_bulk_finish(zfs_btree_t *tree)
{
	ASSERT3P(tree->bt_bulk, !=, NULL);
	ASSERT3P(tree->bt_root, !=, NULL);
	zfs_btree_leaf_t *leaf = tree->bt_bulk;
	zfs_btree_hdr_t *hdr = &leaf->btl_hdr;
	zfs_btree_core_t *parent = hdr->bth_parent;
	size_t size = tree->bt_elem_size;
	uint32_t capacity = tree->bt_leaf_cap;

	/*
	 * The invariant doesn't apply to the root node, if that's the only
	 * node in the tree we're done.
	 */
	if (parent == NULL) {
		tree->bt_bulk = NULL;
		return;
	}

	/* First, take elements to rebalance the leaf node. */
	if (hdr->bth_count < capacity / 2) {
		/*
		 * First, find the left neighbor. The simplest way to do this
		 * is to call zfs_btree_prev twice; the first time finds some
		 * ancestor of this node, and the second time finds the left
		 * neighbor. The ancestor found is the lowest common ancestor
		 * of leaf and the neighbor.
		 */
		zfs_btree_index_t idx = {
			.bti_node = hdr,
			.bti_offset = 0
		};
		VERIFY3P(zfs_btree_prev(tree, &idx, &idx), !=, NULL);
		ASSERT(zfs_btree_is_core(idx.bti_node));
		zfs_btree_core_t *common = (zfs_btree_core_t *)idx.bti_node;
		uint32_t common_idx = idx.bti_offset;

		VERIFY3P(zfs_btree_prev(tree, &idx, &idx), !=, NULL);
		ASSERT(!zfs_btree_is_core(idx.bti_node));
		zfs_btree_leaf_t *l_neighbor =
		    (zfs_btree_leaf_t *)idx.bti_node;
		zfs_btree_hdr_t *l_hdr = idx.bti_node;
		uint32_t move_count = (capacity / 2) - hdr->bth_count;
		ASSERT3U(l_neighbor->btl_hdr.bth_count - move_count, >=,
		    capacity / 2);

		if (zfs_btree_verify_intensity >= 5) {
			for (uint32_t i = 0; i < move_count; i++) {
				zfs_btree_verify_poison_at(tree, hdr,
				    leaf->btl_hdr.bth_count + i);
			}
		}

		/* First, shift elements in leaf back. */
		bt_grow_leaf(tree, leaf, 0, move_count);

		/*
		 * Next, move the separator from the common ancestor to leaf.
		 */
		uint8_t *separator = common->btc_elems + common_idx * size;
		uint8_t *out = leaf->btl_elems +
		    (hdr->bth_first + move_count - 1) * size;
		bcpy(separator, out, size);

		/*
		 * Now we move elements from the tail of the left neighbor to
		 * fill the remaining spots in leaf.
		 */
		bt_transfer_leaf(tree, l_neighbor, l_hdr->bth_count -
		    (move_count - 1), move_count - 1, leaf, 0);

		/*
		 * Finally, move the new last element in the left neighbor to
		 * the separator.
		 */
		bcpy(l_neighbor->btl_elems + (l_hdr->bth_first +
		    l_hdr->bth_count - move_count) * size, separator, size);

		/* Adjust the node's counts, and we're done. */
		bt_shrink_leaf(tree, l_neighbor, l_hdr->bth_count -
		    move_count, move_count);
		ASSERT3U(l_hdr->bth_count, >=, capacity / 2);
		ASSERT3U(hdr->bth_count, >=, capacity / 2);
	}

	/*
	 * Now we have to rebalance any ancestors of leaf that may also
	 * violate the invariant.  From here on, "capacity" means core-node
	 * capacity, not leaf capacity.
	 */
	capacity = BTREE_CORE_ELEMS;
	while (parent->btc_hdr.bth_parent != NULL) {
		zfs_btree_core_t *cur = parent;
		zfs_btree_hdr_t *hdr = &cur->btc_hdr;
		parent = hdr->bth_parent;
		/*
		 * If the invariant isn't violated, move on to the next
		 * ancestor.
		 */
		if (hdr->bth_count >= capacity / 2)
			continue;

		/*
		 * Because the smallest number of nodes we can move when
		 * splitting is 2, we never need to worry about not having a
		 * left sibling (a sibling is a neighbor with the same
		 * parent).
		 */
		uint32_t parent_idx = zfs_btree_find_parent_idx(tree, hdr);
		ASSERT3U(parent_idx, >, 0);
		zfs_btree_core_t *l_neighbor =
		    (zfs_btree_core_t *)parent->btc_children[parent_idx - 1];
		uint32_t move_count = (capacity / 2) - hdr->bth_count;
		ASSERT3U(l_neighbor->btc_hdr.bth_count - move_count, >=,
		    capacity / 2);

		if (zfs_btree_verify_intensity >= 5) {
			for (uint32_t i = 0; i < move_count; i++) {
				zfs_btree_verify_poison_at(tree, hdr,
				    hdr->bth_count + i);
			}
		}
		/* First, shift things in the right node back. */
		bt_shift_core(tree, cur, 0, hdr->bth_count, move_count,
		    BSS_TRAPEZOID, BSD_RIGHT);

		/* Next, move the separator to the right node. */
		uint8_t *separator = parent->btc_elems + ((parent_idx - 1) *
		    size);
		uint8_t *e_out = cur->btc_elems + ((move_count - 1) * size);
		bcpy(separator, e_out, size);

		/*
		 * Now, move elements and children from the left node to the
		 * right. We move one more child than elements.
		 */
		move_count--;
		uint32_t move_idx = l_neighbor->btc_hdr.bth_count -
		    move_count;
		bt_transfer_core(tree, l_neighbor, move_idx, move_count, cur,
		    0, BSS_TRAPEZOID);

		/*
		 * Finally, move the last element in the left node to the
		 * separator's position.
		 */
		move_idx--;
		bcpy(l_neighbor->btc_elems + move_idx * size, separator,
		    size);

		l_neighbor->btc_hdr.bth_count -= move_count + 1;
		hdr->bth_count += move_count + 1;

		ASSERT3U(l_neighbor->btc_hdr.bth_count, >=, capacity / 2);
		ASSERT3U(hdr->bth_count, >=, capacity / 2);

		zfs_btree_poison_node(tree, &l_neighbor->btc_hdr);

		/* The children moved in from the left need reparenting. */
		for (uint32_t i = 0; i <= hdr->bth_count; i++)
			cur->btc_children[i]->bth_parent = cur;
	}

	tree->bt_bulk = NULL;
	zfs_btree_verify(tree);
}

/*
 * Insert value into tree at the location specified by where.
 */
void
zfs_btree_add_idx(zfs_btree_t *tree, const void *value,
    const zfs_btree_index_t *where)
{
	zfs_btree_index_t idx = {0};

	/* If we're not inserting in the last leaf, end bulk insert mode. */
	if (tree->bt_bulk != NULL) {
		if (where->bti_node != &tree->bt_bulk->btl_hdr) {
			zfs_btree_bulk_finish(tree);
			VERIFY3P(zfs_btree_find(tree, value, &idx), ==, NULL);
			where = &idx;
		}
	}

	tree->bt_num_elems++;

	/*
	 * If this is the first element in the tree, create a leaf root node
	 * and add the value to it.
	 */
	if (where->bti_node == NULL) {
		ASSERT3U(tree->bt_num_elems, ==, 1);
		ASSERT3S(tree->bt_height, ==, -1);
		ASSERT3P(tree->bt_root, ==, NULL);
		ASSERT0(where->bti_offset);

		tree->bt_num_nodes++;
		zfs_btree_leaf_t *leaf = zfs_btree_leaf_alloc(tree);
		tree->bt_root = &leaf->btl_hdr;
		tree->bt_height++;

		zfs_btree_hdr_t *hdr = &leaf->btl_hdr;
		hdr->bth_parent = NULL;
		hdr->bth_first = 0;
		hdr->bth_count = 0;
		zfs_btree_poison_node(tree, hdr);

		zfs_btree_insert_into_leaf(tree, leaf, value, 0);
		tree->bt_bulk = leaf;
	} else if (!zfs_btree_is_core(where->bti_node)) {
		/*
		 * If we're inserting into a leaf, go directly to the helper
		 * function.
		 */
		zfs_btree_insert_into_leaf(tree,
		    (zfs_btree_leaf_t *)where->bti_node, value,
		    where->bti_offset);
	} else {
		/*
		 * If we're inserting into a core node, we can't just shift
		 * the existing element in that slot in the same node without
		 * breaking our ordering invariants. Instead we place the new
		 * value in the node at that spot and then insert the old
		 * separator into the first slot in the subtree to the right.
		 */
		zfs_btree_core_t *node = (zfs_btree_core_t *)where->bti_node;

		/*
		 * We can ignore bti_before, because either way the value
		 * should end up in bti_offset.
		 */
		uint32_t off = where->bti_offset;
		zfs_btree_hdr_t *subtree = node->btc_children[off + 1];
		size_t size = tree->bt_elem_size;
		uint8_t *buf = kmem_alloc(size, KM_SLEEP);
		bcpy(node->btc_elems + off * size, buf, size);
		bcpy(value, node->btc_elems + off * size, size);

		/*
		 * Find the first slot in the subtree to the right, insert
		 * there.
		 */
		zfs_btree_index_t new_idx;
		VERIFY3P(zfs_btree_first_helper(tree, subtree, &new_idx), !=,
		    NULL);
		ASSERT0(new_idx.bti_offset);
		ASSERT(!zfs_btree_is_core(new_idx.bti_node));
		zfs_btree_insert_into_leaf(tree,
		    (zfs_btree_leaf_t *)new_idx.bti_node, buf, 0);
		kmem_free(buf, size);
	}
	zfs_btree_verify(tree);
}

/*
 * Return the first element in the tree, and put its location in where if
 * non-null.
 */
void *
zfs_btree_first(zfs_btree_t *tree, zfs_btree_index_t *where)
{
	if (tree->bt_height == -1) {
		ASSERT0(tree->bt_num_elems);
		return (NULL);
	}
	return (zfs_btree_first_helper(tree, tree->bt_root, where));
}

/*
 * Find the last element in the subtree rooted at hdr, return its value and
 * put its location in where if non-null.
 */
static void *
zfs_btree_last_helper(zfs_btree_t *btree, zfs_btree_hdr_t *hdr,
    zfs_btree_index_t *where)
{
	zfs_btree_hdr_t *node;

	/* Descend the rightmost spine until we reach a leaf. */
	for (node = hdr; zfs_btree_is_core(node); node =
	    ((zfs_btree_core_t *)node)->btc_children[node->bth_count])
		;

	zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)node;
	if (where != NULL) {
		where->bti_node = node;
		where->bti_offset = node->bth_count - 1;
		where->bti_before = B_FALSE;
	}
	return (leaf->btl_elems + (node->bth_first + node->bth_count - 1) *
	    btree->bt_elem_size);
}

/*
 * Return the last element in the tree, and put its location in where if
 * non-null.
 */
void *
zfs_btree_last(zfs_btree_t *tree, zfs_btree_index_t *where)
{
	if (tree->bt_height == -1) {
		ASSERT0(tree->bt_num_elems);
		return (NULL);
	}
	return (zfs_btree_last_helper(tree, tree->bt_root, where));
}

/*
 * This function contains the logic to find the next node in the tree. A
 * helper function is used because there are multiple internal consumers of
 * this logic. The done_func is used by zfs_btree_destroy_nodes to clean up
 * each node after we've finished with it.
 */
static void *
zfs_btree_next_helper(zfs_btree_t *tree, const zfs_btree_index_t *idx,
    zfs_btree_index_t *out_idx,
    void (*done_func)(zfs_btree_t *, zfs_btree_hdr_t *))
{
	if (idx->bti_node == NULL) {
		ASSERT3S(tree->bt_height, ==, -1);
		return (NULL);
	}

	uint32_t offset = idx->bti_offset;
	if (!zfs_btree_is_core(idx->bti_node)) {
		/*
		 * When finding the next element of an element in a leaf,
		 * there are two cases. If the element isn't the last one in
		 * the leaf, in which case we just return the next element in
		 * the leaf. Otherwise, we need to traverse up our parents
		 * until we find one where our ancestor isn't the last child
		 * of its parent. Once we do, the next element is the
		 * separator after our ancestor in its parent.
		 */
		zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)idx->bti_node;
		uint32_t new_off = offset + (idx->bti_before ? 0 : 1);
		if (leaf->btl_hdr.bth_count > new_off) {
			out_idx->bti_node = &leaf->btl_hdr;
			out_idx->bti_offset = new_off;
			out_idx->bti_before = B_FALSE;
			return (leaf->btl_elems + (leaf->btl_hdr.bth_first +
			    new_off) * tree->bt_elem_size);
		}

		zfs_btree_hdr_t *prev = &leaf->btl_hdr;
		for (zfs_btree_core_t *node = leaf->btl_hdr.bth_parent;
		    node != NULL; node = node->btc_hdr.bth_parent) {
			zfs_btree_hdr_t *hdr = &node->btc_hdr;
			ASSERT(zfs_btree_is_core(hdr));
			uint32_t i = zfs_btree_find_parent_idx(tree, prev);
			/* done_func must run before prev may be abandoned. */
			if (done_func != NULL)
				done_func(tree, prev);
			if (i == hdr->bth_count) {
				prev = hdr;
				continue;
			}
			out_idx->bti_node = hdr;
			out_idx->bti_offset = i;
			out_idx->bti_before = B_FALSE;
			return (node->btc_elems + i * tree->bt_elem_size);
		}
		if (done_func != NULL)
			done_func(tree, prev);
		/*
		 * We've traversed all the way up and been at the end of the
		 * node every time, so this was the last element in the tree.
		 */
		return (NULL);
	}

	/* If we were before an element in a core node, return that element. */
	ASSERT(zfs_btree_is_core(idx->bti_node));
	zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node;
	if (idx->bti_before) {
		out_idx->bti_before = B_FALSE;
		return (node->btc_elems + offset * tree->bt_elem_size);
	}

	/*
	 * The next element from one in a core node is the first element in
	 * the subtree just to the right of the separator.
	 */
	zfs_btree_hdr_t *child = node->btc_children[offset + 1];
	return (zfs_btree_first_helper(tree, child, out_idx));
}

/*
 * Return the next valued node in the tree. The same address can be safely
 * passed for idx and out_idx.
 */
void *
zfs_btree_next(zfs_btree_t *tree, const zfs_btree_index_t *idx,
    zfs_btree_index_t *out_idx)
{
	return (zfs_btree_next_helper(tree, idx, out_idx, NULL));
}

/*
 * Return the previous valued node in the tree. The same value can be safely
 * passed for idx and out_idx.
 */
void *
zfs_btree_prev(zfs_btree_t *tree, const zfs_btree_index_t *idx,
    zfs_btree_index_t *out_idx)
{
	if (idx->bti_node == NULL) {
		ASSERT3S(tree->bt_height, ==, -1);
		return (NULL);
	}

	uint32_t offset = idx->bti_offset;
	if (!zfs_btree_is_core(idx->bti_node)) {
		/*
		 * When finding the previous element of an element in a leaf,
		 * there are two cases. If the element isn't the first one in
		 * the leaf, in which case we just return the previous element
		 * in the leaf. Otherwise, we need to traverse up our parents
		 * until we find one where our previous ancestor isn't the
		 * first child. Once we do, the previous element is the
		 * separator after our previous ancestor.
		 */
		zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)idx->bti_node;
		if (offset != 0) {
			out_idx->bti_node = &leaf->btl_hdr;
			out_idx->bti_offset = offset - 1;
			out_idx->bti_before = B_FALSE;
			return (leaf->btl_elems + (leaf->btl_hdr.bth_first +
			    offset - 1) * tree->bt_elem_size);
		}
		zfs_btree_hdr_t *prev = &leaf->btl_hdr;
		for (zfs_btree_core_t *node = leaf->btl_hdr.bth_parent;
		    node != NULL; node = node->btc_hdr.bth_parent) {
			zfs_btree_hdr_t *hdr = &node->btc_hdr;
			ASSERT(zfs_btree_is_core(hdr));
			uint32_t i = zfs_btree_find_parent_idx(tree, prev);
			if (i == 0) {
				prev = hdr;
				continue;
			}
			out_idx->bti_node = hdr;
			out_idx->bti_offset = i - 1;
			out_idx->bti_before = B_FALSE;
			return (node->btc_elems + (i - 1) *
			    tree->bt_elem_size);
		}
		/*
		 * We've traversed all the way up and been at the start of the
		 * node every time, so this was the first node in the tree.
		 */
		return (NULL);
	}

	/*
	 * The previous element from one in a core node is the last element in
	 * the subtree just to the left of the separator.
	 */
	ASSERT(zfs_btree_is_core(idx->bti_node));
	zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node;
	zfs_btree_hdr_t *child = node->btc_children[offset];
	return (zfs_btree_last_helper(tree, child, out_idx));
}

/*
 * Get the value at the provided index in the tree.
 *
 * Note that the value returned from this function can be mutated, but only
 * if it will not change the ordering of the element with respect to any other
 * elements that could be in the tree.
 */
void *
zfs_btree_get(zfs_btree_t *tree, zfs_btree_index_t *idx)
{
	ASSERT(!idx->bti_before);
	size_t size = tree->bt_elem_size;
	if (!zfs_btree_is_core(idx->bti_node)) {
		zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)idx->bti_node;
		return (leaf->btl_elems + (leaf->btl_hdr.bth_first +
		    idx->bti_offset) * size);
	}
	zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node;
	return (node->btc_elems + idx->bti_offset * size);
}

/* Add the given value to the tree. Must not already be in the tree. */
void
zfs_btree_add(zfs_btree_t *tree, const void *node)
{
	zfs_btree_index_t where = {0};
	VERIFY3P(zfs_btree_find(tree, node, &where), ==, NULL);
	zfs_btree_add_idx(tree, node, &where);
}

/* Helper function to free a tree node. */
static void
zfs_btree_node_destroy(zfs_btree_t *tree, zfs_btree_hdr_t *node)
{
	tree->bt_num_nodes--;
	if (!zfs_btree_is_core(node)) {
		zfs_btree_leaf_free(tree, node);
	} else {
		kmem_free(node, sizeof (zfs_btree_core_t) +
		    BTREE_CORE_ELEMS * tree->bt_elem_size);
	}
}

/*
 * Remove the rm_hdr and the separator to its left from the parent node. The
 * buffer that rm_hdr was stored in may already be freed, so its contents
 * cannot be accessed.
 */
static void
zfs_btree_remove_from_node(zfs_btree_t *tree, zfs_btree_core_t *node,
    zfs_btree_hdr_t *rm_hdr)
{
	size_t size = tree->bt_elem_size;
	uint32_t min_count = (BTREE_CORE_ELEMS / 2) - 1;
	zfs_btree_hdr_t *hdr = &node->btc_hdr;
	/*
	 * If the node is the root node and rm_hdr is one of two children,
	 * promote the other child to the root.
	 */
	if (hdr->bth_parent == NULL && hdr->bth_count <= 1) {
		ASSERT3U(hdr->bth_count, ==, 1);
		ASSERT3P(tree->bt_root, ==, node);
		ASSERT3P(node->btc_children[1], ==, rm_hdr);
		tree->bt_root = node->btc_children[0];
		node->btc_children[0]->bth_parent = NULL;
		zfs_btree_node_destroy(tree, hdr);
		tree->bt_height--;
		return;
	}

	/* Locate rm_hdr among our children. */
	uint32_t idx;
	for (idx = 0; idx <= hdr->bth_count; idx++) {
		if (node->btc_children[idx] == rm_hdr)
			break;
	}
	ASSERT3U(idx, <=, hdr->bth_count);

	/*
	 * If the node is the root or it has more than the minimum number of
	 * children, just remove the child and separator, and return.
	 */
	if (hdr->bth_parent == NULL ||
	    hdr->bth_count > min_count) {
		/*
		 * Shift the element and children to the right of rm_hdr to
		 * the left by one spot.
		 */
		bt_shift_core_left(tree, node, idx, hdr->bth_count - idx,
		    BSS_PARALLELOGRAM);
		hdr->bth_count--;
		zfs_btree_poison_node_at(tree, hdr, hdr->bth_count, 1);
		return;
	}

	ASSERT3U(hdr->bth_count, ==, min_count);

	/*
	 * Now we try to take a node from a neighbor. We check left, then
	 * right. If the neighbor exists and has more than the minimum number
	 * of elements, we move the separator between us and them to our
	 * node, move their closest element (last for left, first for right)
	 * to the separator, and move their closest child to our node. Along
	 * the way we need to collapse the gap made by idx, and (for our right
	 * neighbor) the gap made by removing their first element and child.
	 *
	 * Note: this logic currently doesn't support taking from a neighbor
	 * that isn't a sibling (i.e. a neighbor with a different
	 * parent). This isn't critical functionality, but may be worth
	 * implementing in the future for completeness' sake.
	 */
	zfs_btree_core_t *parent = hdr->bth_parent;
	uint32_t parent_idx = zfs_btree_find_parent_idx(tree, hdr);

	zfs_btree_hdr_t *l_hdr = (parent_idx == 0 ? NULL :
	    parent->btc_children[parent_idx - 1]);
	if (l_hdr != NULL && l_hdr->bth_count > min_count) {
		/* We can take a node from the left neighbor. */
		ASSERT(zfs_btree_is_core(l_hdr));
		zfs_btree_core_t *neighbor = (zfs_btree_core_t *)l_hdr;

		/*
		 * Start by shifting the elements and children in the current
		 * node to the right by one spot.
		 */
		bt_shift_core_right(tree, node, 0, idx - 1, BSS_TRAPEZOID);

		/*
		 * Move the separator between node and neighbor to the first
		 * element slot in the current node.
		 */
		uint8_t *separator = parent->btc_elems + (parent_idx - 1) *
		    size;
		bcpy(separator, node->btc_elems, size);

		/* Move the last child of neighbor to our first child slot. */
		node->btc_children[0] =
		    neighbor->btc_children[l_hdr->bth_count];
		node->btc_children[0]->bth_parent = node;

		/* Move the last element of neighbor to the separator spot. */
		uint8_t *take_elem = neighbor->btc_elems +
		    (l_hdr->bth_count - 1) * size;
		bcpy(take_elem, separator, size);
		l_hdr->bth_count--;
		zfs_btree_poison_node_at(tree, l_hdr, l_hdr->bth_count, 1);
		return;
	}

	zfs_btree_hdr_t *r_hdr = (parent_idx == parent->btc_hdr.bth_count ?
	    NULL : parent->btc_children[parent_idx + 1]);
	if (r_hdr != NULL && r_hdr->bth_count > min_count) {
		/* We can take a node from the right neighbor. */
		ASSERT(zfs_btree_is_core(r_hdr));
		zfs_btree_core_t *neighbor = (zfs_btree_core_t *)r_hdr;

		/*
		 * Shift elements in node left by one spot to overwrite rm_hdr
		 * and the separator before it.
		 */
		bt_shift_core_left(tree, node, idx, hdr->bth_count - idx,
		    BSS_PARALLELOGRAM);

		/*
		 * Move the separator between node and neighbor to the last
		 * element spot in node.
		 */
		uint8_t *separator = parent->btc_elems + parent_idx * size;
		bcpy(separator, node->btc_elems + (hdr->bth_count - 1) *
		    size, size);

		/*
		 * Move the first child of neighbor to the last child spot in
		 * node.
		 */
		node->btc_children[hdr->bth_count] =
		    neighbor->btc_children[0];
		node->btc_children[hdr->bth_count]->bth_parent = node;

		/* Move the first element of neighbor to the separator spot. */
		uint8_t *take_elem = neighbor->btc_elems;
		bcpy(take_elem, separator, size);
		r_hdr->bth_count--;

		/*
		 * Shift the elements and children of neighbor to cover the
		 * stolen elements.
		 */
		bt_shift_core_left(tree, neighbor, 1, r_hdr->bth_count,
		    BSS_TRAPEZOID);
		zfs_btree_poison_node_at(tree, r_hdr, r_hdr->bth_count, 1);
		return;
	}

	/*
	 * In this case, neither of our neighbors can spare an element, so we
	 * need to merge with one of them. We prefer the left one,
	 * arbitrarily. Move the separator into the leftmost merging node
	 * (which may be us or the left neighbor), and then move the right
	 * merging node's elements. Once that's done, we go back and delete
	 * the element we're removing. Finally, go into the parent and delete
	 * the right merging node and the separator. This may cause further
	 * merging.
	 */
	zfs_btree_hdr_t *new_rm_hdr, *keep_hdr;
	uint32_t new_idx = idx;
	if (l_hdr != NULL) {
		keep_hdr = l_hdr;
		new_rm_hdr = hdr;
		new_idx += keep_hdr->bth_count + 1;
	} else {
		ASSERT3P(r_hdr, !=, NULL);
		keep_hdr = hdr;
		new_rm_hdr = r_hdr;
		parent_idx++;
	}

	ASSERT(zfs_btree_is_core(keep_hdr));
	ASSERT(zfs_btree_is_core(new_rm_hdr));

	zfs_btree_core_t *keep = (zfs_btree_core_t *)keep_hdr;
	zfs_btree_core_t *rm = (zfs_btree_core_t *)new_rm_hdr;

	if (zfs_btree_verify_intensity >= 5) {
		for (uint32_t i = 0; i < new_rm_hdr->bth_count + 1; i++) {
			zfs_btree_verify_poison_at(tree, keep_hdr,
			    keep_hdr->bth_count + i);
		}
	}

	/* Move the separator into the left node. */
	uint8_t *e_out = keep->btc_elems + keep_hdr->bth_count * size;
	uint8_t *separator = parent->btc_elems + (parent_idx - 1) * size;
	bcpy(separator, e_out, size);
	keep_hdr->bth_count++;

	/* Move all our elements and children into the left node. */
	bt_transfer_core(tree, rm, 0, new_rm_hdr->bth_count, keep,
	    keep_hdr->bth_count, BSS_TRAPEZOID);

	uint32_t old_count = keep_hdr->bth_count;

	/* Update bookkeeping */
	keep_hdr->bth_count += new_rm_hdr->bth_count;
	ASSERT3U(keep_hdr->bth_count, ==, (min_count * 2) + 1);

	/*
	 * Shift the element and children to the right of rm_hdr to
	 * the left by one spot.
	 */
	ASSERT3P(keep->btc_children[new_idx], ==, rm_hdr);
	bt_shift_core_left(tree, keep, new_idx, keep_hdr->bth_count -
	    new_idx, BSS_PARALLELOGRAM);
	keep_hdr->bth_count--;

	/* Reparent all our children to point to the left node. */
	zfs_btree_hdr_t **new_start = keep->btc_children + old_count - 1;
	for (uint32_t i = 0; i < new_rm_hdr->bth_count + 1; i++)
		new_start[i]->bth_parent = keep;
	for (uint32_t i = 0; i <= keep_hdr->bth_count; i++) {
		ASSERT3P(keep->btc_children[i]->bth_parent, ==, keep);
		ASSERT3P(keep->btc_children[i], !=, rm_hdr);
	}
	zfs_btree_poison_node_at(tree, keep_hdr, keep_hdr->bth_count, 1);

	new_rm_hdr->bth_count = 0;
	zfs_btree_remove_from_node(tree, parent, new_rm_hdr);
	zfs_btree_node_destroy(tree, new_rm_hdr);
}

/* Remove the element at the specific location. */
void
zfs_btree_remove_idx(zfs_btree_t *tree, zfs_btree_index_t *where)
{
	size_t size = tree->bt_elem_size;
	zfs_btree_hdr_t *hdr = where->bti_node;
	uint32_t idx = where->bti_offset;
	ASSERT(!where->bti_before);
	if (tree->bt_bulk != NULL) {
		/*
		 * Leave bulk insert mode. Note that our index would be
		 * invalid after we correct the tree, so we copy the value
		 * we're planning to remove and find it again after
		 * bulk_finish.
		 */
		uint8_t *value = zfs_btree_get(tree, where);
		uint8_t *tmp = kmem_alloc(size, KM_SLEEP);
		bcpy(value, tmp, size);
		zfs_btree_bulk_finish(tree);
		VERIFY3P(zfs_btree_find(tree, tmp, where), !=, NULL);
		kmem_free(tmp, size);
		hdr = where->bti_node;
		idx = where->bti_offset;
	}

	tree->bt_num_elems--;
	/*
	 * If the element happens to be in a core node, we move a leaf node's
	 * element into its place and then remove the leaf node element.
This * makes the rebalance logic not need to be recursive both upwards and * downwards. */ if (zfs_btree_is_core(hdr)) { zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; zfs_btree_hdr_t *left_subtree = node->btc_children[idx]; void *new_value = zfs_btree_last_helper(tree, left_subtree, where); ASSERT3P(new_value, !=, NULL); bcpy(new_value, node->btc_elems + idx * size, size); hdr = where->bti_node; idx = where->bti_offset; ASSERT(!where->bti_before); } /* * First, we'll update the leaf's metadata. Then, we shift any * elements after the idx to the left. After that, we rebalance if * needed. */ ASSERT(!zfs_btree_is_core(hdr)); zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; ASSERT3U(hdr->bth_count, >, 0); uint32_t min_count = (tree->bt_leaf_cap / 2) - 1; /* * If we're over the minimum size or this is the root, just overwrite * the value and return. */ if (hdr->bth_count > min_count || hdr->bth_parent == NULL) { bt_shrink_leaf(tree, leaf, idx, 1); if (hdr->bth_parent == NULL) { ASSERT0(tree->bt_height); if (hdr->bth_count == 0) { tree->bt_root = NULL; tree->bt_height--; zfs_btree_node_destroy(tree, &leaf->btl_hdr); } } zfs_btree_verify(tree); return; } ASSERT3U(hdr->bth_count, ==, min_count); /* * Now we try to take a node from a sibling. We check left, then * right. If they exist and have more than the minimum number of * elements, we move the separator between us and them to our node * and move their closest element (last for left, first for right) to * the separator. Along the way we need to collapse the gap made by * idx, and (for our right neighbor) the gap made by removing their * first element. * * Note: this logic currently doesn't support taking from a neighbor * that isn't a sibling. This isn't critical functionality, but may be * worth implementing in the future for completeness' sake. */ zfs_btree_core_t *parent = hdr->bth_parent; uint32_t parent_idx = zfs_btree_find_parent_idx(tree, hdr); zfs_btree_hdr_t *l_hdr = (parent_idx == 0 ? 
NULL : parent->btc_children[parent_idx - 1]); if (l_hdr != NULL && l_hdr->bth_count > min_count) { /* We can take a node from the left neighbor. */ ASSERT(!zfs_btree_is_core(l_hdr)); zfs_btree_leaf_t *neighbor = (zfs_btree_leaf_t *)l_hdr; /* * Move our elements back by one spot to make room for the * stolen element and overwrite the element being removed. */ bt_shift_leaf(tree, leaf, 0, idx, 1, BSD_RIGHT); /* Move the separator to our first spot. */ uint8_t *separator = parent->btc_elems + (parent_idx - 1) * size; bcpy(separator, leaf->btl_elems + hdr->bth_first * size, size); /* Move our neighbor's last element to the separator. */ uint8_t *take_elem = neighbor->btl_elems + (l_hdr->bth_first + l_hdr->bth_count - 1) * size; bcpy(take_elem, separator, size); /* Delete our neighbor's last element. */ bt_shrink_leaf(tree, neighbor, l_hdr->bth_count - 1, 1); zfs_btree_verify(tree); return; } zfs_btree_hdr_t *r_hdr = (parent_idx == parent->btc_hdr.bth_count ? NULL : parent->btc_children[parent_idx + 1]); if (r_hdr != NULL && r_hdr->bth_count > min_count) { /* We can take a node from the right neighbor. */ ASSERT(!zfs_btree_is_core(r_hdr)); zfs_btree_leaf_t *neighbor = (zfs_btree_leaf_t *)r_hdr; /* * Move our elements after the element being removed forwards * by one spot to make room for the stolen element and * overwrite the element being removed. */ bt_shift_leaf(tree, leaf, idx + 1, hdr->bth_count - idx - 1, 1, BSD_LEFT); /* Move the separator between us to our last spot. */ uint8_t *separator = parent->btc_elems + parent_idx * size; bcpy(separator, leaf->btl_elems + (hdr->bth_first + hdr->bth_count - 1) * size, size); /* Move our neighbor's first element to the separator. */ uint8_t *take_elem = neighbor->btl_elems + r_hdr->bth_first * size; bcpy(take_elem, separator, size); /* Delete our neighbor's first element. 
*/ bt_shrink_leaf(tree, neighbor, 0, 1); zfs_btree_verify(tree); return; } /* * In this case, neither of our neighbors can spare an element, so we * need to merge with one of them. We prefer the left one, arbitrarily. * After remove we move the separator into the leftmost merging node * (which may be us or the left neighbor), and then move the right * merging node's elements. Once that's done, we go back and delete * the element we're removing. Finally, go into the parent and delete * the right merging node and the separator. This may cause further * merging. */ zfs_btree_hdr_t *rm_hdr, *k_hdr; if (l_hdr != NULL) { k_hdr = l_hdr; rm_hdr = hdr; } else { ASSERT3P(r_hdr, !=, NULL); k_hdr = hdr; rm_hdr = r_hdr; parent_idx++; } ASSERT(!zfs_btree_is_core(k_hdr)); ASSERT(!zfs_btree_is_core(rm_hdr)); ASSERT3U(k_hdr->bth_count, ==, min_count); ASSERT3U(rm_hdr->bth_count, ==, min_count); zfs_btree_leaf_t *keep = (zfs_btree_leaf_t *)k_hdr; zfs_btree_leaf_t *rm = (zfs_btree_leaf_t *)rm_hdr; if (zfs_btree_verify_intensity >= 5) { for (uint32_t i = 0; i < rm_hdr->bth_count + 1; i++) { zfs_btree_verify_poison_at(tree, k_hdr, k_hdr->bth_count + i); } } /* * Remove the value from the node. It will go below the minimum, * but we'll fix it in no time. */ bt_shrink_leaf(tree, leaf, idx, 1); /* Prepare space for elements to be moved from the right. */ uint32_t k_count = k_hdr->bth_count; bt_grow_leaf(tree, keep, k_count, 1 + rm_hdr->bth_count); ASSERT3U(k_hdr->bth_count, ==, min_count * 2); /* Move the separator into the first open spot. */ uint8_t *out = keep->btl_elems + (k_hdr->bth_first + k_count) * size; uint8_t *separator = parent->btc_elems + (parent_idx - 1) * size; bcpy(separator, out, size); /* Move our elements to the left neighbor. */ bt_transfer_leaf(tree, rm, 0, rm_hdr->bth_count, keep, k_count + 1); /* Remove the emptied node from the parent. 
 */
	zfs_btree_remove_from_node(tree, parent, rm_hdr);
	zfs_btree_node_destroy(tree, rm_hdr);

	zfs_btree_verify(tree);
}

/* Remove the given value from the tree. */
void
zfs_btree_remove(zfs_btree_t *tree, const void *value)
{
	zfs_btree_index_t where = {0};
	VERIFY3P(zfs_btree_find(tree, value, &where), !=, NULL);
	zfs_btree_remove_idx(tree, &where);
}

/* Return the number of elements in the tree. */
ulong_t
zfs_btree_numnodes(zfs_btree_t *tree)
{
	return (tree->bt_num_elems);
}

/*
 * This function is used to visit all the elements in the tree before
 * destroying the tree. This allows the calling code to perform any cleanup it
 * needs to do. This is more efficient than just removing the first element
 * over and over, because it removes all rebalancing. Once the destroy_nodes()
 * function has been called, no other btree operations are valid until it
 * returns NULL, at which point the only valid operation is
 * zfs_btree_destroy().
 *
 * example:
 *
 *      zfs_btree_index_t *cookie = NULL;
 *      my_data_t *node;
 *
 *      while ((node = zfs_btree_destroy_nodes(tree, &cookie)) != NULL)
 *              free(node->ptr);
 *      zfs_btree_destroy(tree);
 *
 */
void *
zfs_btree_destroy_nodes(zfs_btree_t *tree, zfs_btree_index_t **cookie)
{
	if (*cookie == NULL) {
		/* First call: allocate the cursor and start at the front. */
		if (tree->bt_height == -1)
			return (NULL);
		*cookie = kmem_alloc(sizeof (**cookie), KM_SLEEP);
		return (zfs_btree_first(tree, *cookie));
	}

	/*
	 * Advance the cursor, destroying each node once we have stepped
	 * past its last element.
	 */
	void *rval = zfs_btree_next_helper(tree, *cookie, *cookie,
	    zfs_btree_node_destroy);
	if (rval == NULL) {
		/* Traversal finished; reset the tree to its empty state. */
		tree->bt_root = NULL;
		tree->bt_height = -1;
		tree->bt_num_elems = 0;
		kmem_free(*cookie, sizeof (**cookie));
		tree->bt_bulk = NULL;
	}
	return (rval);
}

/* Recursively destroy this node and all of its children. */
static void
zfs_btree_clear_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
{
	if (zfs_btree_is_core(hdr)) {
		zfs_btree_core_t *btc = (zfs_btree_core_t *)hdr;
		for (uint32_t i = 0; i <= hdr->bth_count; i++)
			zfs_btree_clear_helper(tree, btc->btc_children[i]);
	}

	zfs_btree_node_destroy(tree, hdr);
}

/* Remove all elements and nodes, leaving an empty (but usable) tree. */
void
zfs_btree_clear(zfs_btree_t *tree)
{
	if (tree->bt_root == NULL) {
ASSERT0(tree->bt_num_elems); return; } zfs_btree_clear_helper(tree, tree->bt_root); tree->bt_num_elems = 0; tree->bt_root = NULL; tree->bt_num_nodes = 0; tree->bt_height = -1; tree->bt_bulk = NULL; } void zfs_btree_destroy(zfs_btree_t *tree) { ASSERT0(tree->bt_num_elems); ASSERT3P(tree->bt_root, ==, NULL); } /* Verify that every child of this node has the correct parent pointer. */ static void zfs_btree_verify_pointers_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) { if (!zfs_btree_is_core(hdr)) return; zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; for (uint32_t i = 0; i <= hdr->bth_count; i++) { VERIFY3P(node->btc_children[i]->bth_parent, ==, hdr); zfs_btree_verify_pointers_helper(tree, node->btc_children[i]); } } /* Verify that every node has the correct parent pointer. */ static void zfs_btree_verify_pointers(zfs_btree_t *tree) { if (tree->bt_height == -1) { VERIFY3P(tree->bt_root, ==, NULL); return; } VERIFY3P(tree->bt_root->bth_parent, ==, NULL); zfs_btree_verify_pointers_helper(tree, tree->bt_root); } /* * Verify that all the current node and its children satisfy the count * invariants, and return the total count in the subtree rooted in this node. */ static uint64_t zfs_btree_verify_counts_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) { if (!zfs_btree_is_core(hdr)) { if (tree->bt_root != hdr && tree->bt_bulk && hdr != &tree->bt_bulk->btl_hdr) { VERIFY3U(hdr->bth_count, >=, tree->bt_leaf_cap / 2 - 1); } return (hdr->bth_count); } else { zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; uint64_t ret = hdr->bth_count; if (tree->bt_root != hdr && tree->bt_bulk == NULL) VERIFY3P(hdr->bth_count, >=, BTREE_CORE_ELEMS / 2 - 1); for (uint32_t i = 0; i <= hdr->bth_count; i++) { ret += zfs_btree_verify_counts_helper(tree, node->btc_children[i]); } return (ret); } } /* * Verify that all nodes satisfy the invariants and that the total number of * elements is correct. 
*/ static void zfs_btree_verify_counts(zfs_btree_t *tree) { EQUIV(tree->bt_num_elems == 0, tree->bt_height == -1); if (tree->bt_height == -1) { return; } VERIFY3P(zfs_btree_verify_counts_helper(tree, tree->bt_root), ==, tree->bt_num_elems); } /* * Check that the subtree rooted at this node has a uniform height. Returns * the number of nodes under this node, to help verify bt_num_nodes. */ static uint64_t zfs_btree_verify_height_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr, - int64_t height) + int32_t height) { if (!zfs_btree_is_core(hdr)) { VERIFY0(height); return (1); } zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; uint64_t ret = 1; for (uint32_t i = 0; i <= hdr->bth_count; i++) { ret += zfs_btree_verify_height_helper(tree, node->btc_children[i], height - 1); } return (ret); } /* * Check that the tree rooted at this node has a uniform height, and that the * bt_height in the tree is correct. */ static void zfs_btree_verify_height(zfs_btree_t *tree) { EQUIV(tree->bt_height == -1, tree->bt_root == NULL); if (tree->bt_height == -1) { return; } VERIFY3U(zfs_btree_verify_height_helper(tree, tree->bt_root, tree->bt_height), ==, tree->bt_num_nodes); } /* * Check that the elements in this node are sorted, and that if this is a core * node, the separators are properly between the subtrees they separaate and * that the children also satisfy this requirement. 
*/ static void zfs_btree_verify_order_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) { size_t size = tree->bt_elem_size; if (!zfs_btree_is_core(hdr)) { zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; for (uint32_t i = 1; i < hdr->bth_count; i++) { VERIFY3S(tree->bt_compar(leaf->btl_elems + (hdr->bth_first + i - 1) * size, leaf->btl_elems + (hdr->bth_first + i) * size), ==, -1); } return; } zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; for (uint32_t i = 1; i < hdr->bth_count; i++) { VERIFY3S(tree->bt_compar(node->btc_elems + (i - 1) * size, node->btc_elems + i * size), ==, -1); } for (uint32_t i = 0; i < hdr->bth_count; i++) { uint8_t *left_child_last = NULL; zfs_btree_hdr_t *left_child_hdr = node->btc_children[i]; if (zfs_btree_is_core(left_child_hdr)) { zfs_btree_core_t *left_child = (zfs_btree_core_t *)left_child_hdr; left_child_last = left_child->btc_elems + (left_child_hdr->bth_count - 1) * size; } else { zfs_btree_leaf_t *left_child = (zfs_btree_leaf_t *)left_child_hdr; left_child_last = left_child->btl_elems + (left_child_hdr->bth_first + left_child_hdr->bth_count - 1) * size; } int comp = tree->bt_compar(node->btc_elems + i * size, left_child_last); if (comp <= 0) { panic("btree: compar returned %d (expected 1) at " "%px %d: compar(%px, %px)", comp, node, i, node->btc_elems + i * size, left_child_last); } uint8_t *right_child_first = NULL; zfs_btree_hdr_t *right_child_hdr = node->btc_children[i + 1]; if (zfs_btree_is_core(right_child_hdr)) { zfs_btree_core_t *right_child = (zfs_btree_core_t *)right_child_hdr; right_child_first = right_child->btc_elems; } else { zfs_btree_leaf_t *right_child = (zfs_btree_leaf_t *)right_child_hdr; right_child_first = right_child->btl_elems + right_child_hdr->bth_first * size; } comp = tree->bt_compar(node->btc_elems + i * size, right_child_first); if (comp >= 0) { panic("btree: compar returned %d (expected -1) at " "%px %d: compar(%px, %px)", comp, node, i, node->btc_elems + i * size, right_child_first); } } for 
(uint32_t i = 0; i <= hdr->bth_count; i++) zfs_btree_verify_order_helper(tree, node->btc_children[i]); } /* Check that all elements in the tree are in sorted order. */ static void zfs_btree_verify_order(zfs_btree_t *tree) { EQUIV(tree->bt_height == -1, tree->bt_root == NULL); if (tree->bt_height == -1) { return; } zfs_btree_verify_order_helper(tree, tree->bt_root); } #ifdef ZFS_DEBUG /* Check that all unused memory is poisoned correctly. */ static void zfs_btree_verify_poison_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) { size_t size = tree->bt_elem_size; if (!zfs_btree_is_core(hdr)) { zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; for (size_t i = 0; i < hdr->bth_first * size; i++) VERIFY3U(leaf->btl_elems[i], ==, 0x0f); + size_t esize = tree->bt_leaf_size - + offsetof(zfs_btree_leaf_t, btl_elems); for (size_t i = (hdr->bth_first + hdr->bth_count) * size; - i < BTREE_LEAF_ESIZE; i++) + i < esize; i++) VERIFY3U(leaf->btl_elems[i], ==, 0x0f); } else { zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; for (size_t i = hdr->bth_count * size; i < BTREE_CORE_ELEMS * size; i++) VERIFY3U(node->btc_elems[i], ==, 0x0f); for (uint32_t i = hdr->bth_count + 1; i <= BTREE_CORE_ELEMS; i++) { VERIFY3P(node->btc_children[i], ==, (zfs_btree_hdr_t *)BTREE_POISON); } for (uint32_t i = 0; i <= hdr->bth_count; i++) { zfs_btree_verify_poison_helper(tree, node->btc_children[i]); } } } #endif /* Check that unused memory in the tree is still poisoned. 
*/ static void zfs_btree_verify_poison(zfs_btree_t *tree) { #ifdef ZFS_DEBUG if (tree->bt_height == -1) return; zfs_btree_verify_poison_helper(tree, tree->bt_root); #endif } void zfs_btree_verify(zfs_btree_t *tree) { if (zfs_btree_verify_intensity == 0) return; zfs_btree_verify_height(tree); if (zfs_btree_verify_intensity == 1) return; zfs_btree_verify_pointers(tree); if (zfs_btree_verify_intensity == 2) return; zfs_btree_verify_counts(tree); if (zfs_btree_verify_intensity == 3) return; zfs_btree_verify_order(tree); if (zfs_btree_verify_intensity == 4) return; zfs_btree_verify_poison(tree); } /* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs, zfs_, btree_verify_intensity, UINT, ZMOD_RW, "Enable btree verification. Levels above 4 require ZFS be built " "with debugging"); /* END CSTYLED */ diff --git a/module/zfs/zap_leaf.c b/module/zfs/zap_leaf.c index aad923d512df..fc25344ea8a8 100644 --- a/module/zfs/zap_leaf.c +++ b/module/zfs/zap_leaf.c @@ -1,847 +1,847 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013, 2016 by Delphix. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. 
*/ /* * The 512-byte leaf is broken into 32 16-byte chunks. * chunk number n means l_chunk[n], even though the header precedes it. * the names are stored null-terminated. */ #include #include #include #include #include #include #include #include #include static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry); #define CHAIN_END 0xffff /* end of the chunk chain */ #define LEAF_HASH(l, h) \ ((ZAP_LEAF_HASH_NUMENTRIES(l)-1) & \ ((h) >> \ (64 - ZAP_LEAF_HASH_SHIFT(l) - zap_leaf_phys(l)->l_hdr.lh_prefix_len))) #define LEAF_HASH_ENTPTR(l, h) (&zap_leaf_phys(l)->l_hash[LEAF_HASH(l, h)]) static void zap_memset(void *a, int c, size_t n) { char *cp = a; char *cpend = cp + n; while (cp < cpend) *cp++ = c; } static void stv(int len, void *addr, uint64_t value) { switch (len) { case 1: *(uint8_t *)addr = value; return; case 2: *(uint16_t *)addr = value; return; case 4: *(uint32_t *)addr = value; return; case 8: *(uint64_t *)addr = value; return; default: cmn_err(CE_PANIC, "bad int len %d", len); } } static uint64_t ldv(int len, const void *addr) { switch (len) { case 1: return (*(uint8_t *)addr); case 2: return (*(uint16_t *)addr); case 4: return (*(uint32_t *)addr); case 8: return (*(uint64_t *)addr); default: cmn_err(CE_PANIC, "bad int len %d", len); } return (0xFEEDFACEDEADBEEFULL); } void zap_leaf_byteswap(zap_leaf_phys_t *buf, int size) { zap_leaf_t l; dmu_buf_t l_dbuf; l_dbuf.db_data = buf; l.l_bs = highbit64(size) - 1; l.l_dbuf = &l_dbuf; buf->l_hdr.lh_block_type = BSWAP_64(buf->l_hdr.lh_block_type); buf->l_hdr.lh_prefix = BSWAP_64(buf->l_hdr.lh_prefix); buf->l_hdr.lh_magic = BSWAP_32(buf->l_hdr.lh_magic); buf->l_hdr.lh_nfree = BSWAP_16(buf->l_hdr.lh_nfree); buf->l_hdr.lh_nentries = BSWAP_16(buf->l_hdr.lh_nentries); buf->l_hdr.lh_prefix_len = BSWAP_16(buf->l_hdr.lh_prefix_len); buf->l_hdr.lh_freelist = BSWAP_16(buf->l_hdr.lh_freelist); for (int i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(&l); i++) buf->l_hash[i] = BSWAP_16(buf->l_hash[i]); for (int i = 0; i < 
ZAP_LEAF_NUMCHUNKS(&l); i++) { zap_leaf_chunk_t *lc = &ZAP_LEAF_CHUNK(&l, i); struct zap_leaf_entry *le; switch (lc->l_free.lf_type) { case ZAP_CHUNK_ENTRY: le = &lc->l_entry; le->le_type = BSWAP_8(le->le_type); le->le_value_intlen = BSWAP_8(le->le_value_intlen); le->le_next = BSWAP_16(le->le_next); le->le_name_chunk = BSWAP_16(le->le_name_chunk); le->le_name_numints = BSWAP_16(le->le_name_numints); le->le_value_chunk = BSWAP_16(le->le_value_chunk); le->le_value_numints = BSWAP_16(le->le_value_numints); le->le_cd = BSWAP_32(le->le_cd); le->le_hash = BSWAP_64(le->le_hash); break; case ZAP_CHUNK_FREE: lc->l_free.lf_type = BSWAP_8(lc->l_free.lf_type); lc->l_free.lf_next = BSWAP_16(lc->l_free.lf_next); break; case ZAP_CHUNK_ARRAY: lc->l_array.la_type = BSWAP_8(lc->l_array.la_type); lc->l_array.la_next = BSWAP_16(lc->l_array.la_next); /* la_array doesn't need swapping */ break; default: cmn_err(CE_PANIC, "bad leaf type %d", lc->l_free.lf_type); } } } void zap_leaf_init(zap_leaf_t *l, boolean_t sort) { l->l_bs = highbit64(l->l_dbuf->db_size) - 1; zap_memset(&zap_leaf_phys(l)->l_hdr, 0, sizeof (struct zap_leaf_header)); zap_memset(zap_leaf_phys(l)->l_hash, CHAIN_END, 2*ZAP_LEAF_HASH_NUMENTRIES(l)); for (int i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) { ZAP_LEAF_CHUNK(l, i).l_free.lf_type = ZAP_CHUNK_FREE; ZAP_LEAF_CHUNK(l, i).l_free.lf_next = i+1; } ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)-1).l_free.lf_next = CHAIN_END; zap_leaf_phys(l)->l_hdr.lh_block_type = ZBT_LEAF; zap_leaf_phys(l)->l_hdr.lh_magic = ZAP_LEAF_MAGIC; zap_leaf_phys(l)->l_hdr.lh_nfree = ZAP_LEAF_NUMCHUNKS(l); if (sort) zap_leaf_phys(l)->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED; } /* * Routines which manipulate leaf chunks (l_chunk[]). 
*/ static uint16_t zap_leaf_chunk_alloc(zap_leaf_t *l) { ASSERT(zap_leaf_phys(l)->l_hdr.lh_nfree > 0); int chunk = zap_leaf_phys(l)->l_hdr.lh_freelist; ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_free.lf_type, ==, ZAP_CHUNK_FREE); zap_leaf_phys(l)->l_hdr.lh_freelist = ZAP_LEAF_CHUNK(l, chunk).l_free.lf_next; zap_leaf_phys(l)->l_hdr.lh_nfree--; return (chunk); } static void zap_leaf_chunk_free(zap_leaf_t *l, uint16_t chunk) { struct zap_leaf_free *zlf = &ZAP_LEAF_CHUNK(l, chunk).l_free; ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_nfree, <, ZAP_LEAF_NUMCHUNKS(l)); ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); ASSERT(zlf->lf_type != ZAP_CHUNK_FREE); zlf->lf_type = ZAP_CHUNK_FREE; zlf->lf_next = zap_leaf_phys(l)->l_hdr.lh_freelist; bzero(zlf->lf_pad, sizeof (zlf->lf_pad)); /* help it to compress */ zap_leaf_phys(l)->l_hdr.lh_freelist = chunk; zap_leaf_phys(l)->l_hdr.lh_nfree++; } /* * Routines which manipulate leaf arrays (zap_leaf_array type chunks). */ static uint16_t zap_leaf_array_create(zap_leaf_t *l, const char *buf, int integer_size, int num_integers) { uint16_t chunk_head; uint16_t *chunkp = &chunk_head; int byten = 0; uint64_t value = 0; int shift = (integer_size - 1) * 8; int len = num_integers; ASSERT3U(num_integers * integer_size, <=, ZAP_MAXVALUELEN); while (len > 0) { uint16_t chunk = zap_leaf_chunk_alloc(l); struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; la->la_type = ZAP_CHUNK_ARRAY; for (int i = 0; i < ZAP_LEAF_ARRAY_BYTES; i++) { if (byten == 0) value = ldv(integer_size, buf); la->la_array[i] = value >> shift; value <<= 8; if (++byten == integer_size) { byten = 0; buf += integer_size; if (--len == 0) break; } } *chunkp = chunk; chunkp = &la->la_next; } *chunkp = CHAIN_END; return (chunk_head); } static void zap_leaf_array_free(zap_leaf_t *l, uint16_t *chunkp) { uint16_t chunk = *chunkp; *chunkp = CHAIN_END; while (chunk != CHAIN_END) { int nextchunk = ZAP_LEAF_CHUNK(l, chunk).l_array.la_next; 
ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_array.la_type, ==, ZAP_CHUNK_ARRAY); zap_leaf_chunk_free(l, chunk); chunk = nextchunk; } } /* array_len and buf_len are in integers, not bytes */ static void zap_leaf_array_read(zap_leaf_t *l, uint16_t chunk, int array_int_len, int array_len, int buf_int_len, uint64_t buf_len, void *buf) { int len = MIN(array_len, buf_len); int byten = 0; uint64_t value = 0; char *p = buf; ASSERT3U(array_int_len, <=, buf_int_len); /* Fast path for one 8-byte integer */ if (array_int_len == 8 && buf_int_len == 8 && len == 1) { struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; uint8_t *ip = la->la_array; uint64_t *buf64 = buf; *buf64 = (uint64_t)ip[0] << 56 | (uint64_t)ip[1] << 48 | (uint64_t)ip[2] << 40 | (uint64_t)ip[3] << 32 | (uint64_t)ip[4] << 24 | (uint64_t)ip[5] << 16 | (uint64_t)ip[6] << 8 | (uint64_t)ip[7]; return; } /* Fast path for an array of 1-byte integers (eg. the entry name) */ if (array_int_len == 1 && buf_int_len == 1 && buf_len > array_len + ZAP_LEAF_ARRAY_BYTES) { while (chunk != CHAIN_END) { struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; bcopy(la->la_array, p, ZAP_LEAF_ARRAY_BYTES); p += ZAP_LEAF_ARRAY_BYTES; chunk = la->la_next; } return; } while (len > 0) { struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); for (int i = 0; i < ZAP_LEAF_ARRAY_BYTES && len > 0; i++) { value = (value << 8) | la->la_array[i]; byten++; if (byten == array_int_len) { stv(buf_int_len, p, value); byten = 0; len--; if (len == 0) return; p += buf_int_len; } } chunk = la->la_next; } } static boolean_t zap_leaf_array_match(zap_leaf_t *l, zap_name_t *zn, int chunk, int array_numints) { int bseen = 0; if (zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY) { uint64_t *thiskey = kmem_alloc(array_numints * sizeof (*thiskey), KM_SLEEP); ASSERT(zn->zn_key_intlen == sizeof (*thiskey)); zap_leaf_array_read(l, chunk, sizeof (*thiskey), array_numints, sizeof (*thiskey), array_numints, 
thiskey); boolean_t match = bcmp(thiskey, zn->zn_key_orig, array_numints * sizeof (*thiskey)) == 0; kmem_free(thiskey, array_numints * sizeof (*thiskey)); return (match); } ASSERT(zn->zn_key_intlen == 1); if (zn->zn_matchtype & MT_NORMALIZE) { char *thisname = kmem_alloc(array_numints, KM_SLEEP); zap_leaf_array_read(l, chunk, sizeof (char), array_numints, sizeof (char), array_numints, thisname); boolean_t match = zap_match(zn, thisname); kmem_free(thisname, array_numints); return (match); } /* * Fast path for exact matching. * First check that the lengths match, so that we don't read * past the end of the zn_key_orig array. */ if (array_numints != zn->zn_key_orig_numints) return (B_FALSE); while (bseen < array_numints) { struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; int toread = MIN(array_numints - bseen, ZAP_LEAF_ARRAY_BYTES); ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); if (bcmp(la->la_array, (char *)zn->zn_key_orig + bseen, toread)) break; chunk = la->la_next; bseen += toread; } return (bseen == array_numints); } /* * Routines which manipulate leaf entries. */ int zap_leaf_lookup(zap_leaf_t *l, zap_name_t *zn, zap_entry_handle_t *zeh) { struct zap_leaf_entry *le; ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC); for (uint16_t *chunkp = LEAF_HASH_ENTPTR(l, zn->zn_hash); *chunkp != CHAIN_END; chunkp = &le->le_next) { uint16_t chunk = *chunkp; le = ZAP_LEAF_ENTRY(l, chunk); ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); if (le->le_hash != zn->zn_hash) continue; /* * NB: the entry chain is always sorted by cd on * normalized zap objects, so this will find the * lowest-cd match for MT_NORMALIZE. 
*/ ASSERT((zn->zn_matchtype == 0) || (zap_leaf_phys(l)->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED)); if (zap_leaf_array_match(l, zn, le->le_name_chunk, le->le_name_numints)) { zeh->zeh_num_integers = le->le_value_numints; zeh->zeh_integer_size = le->le_value_intlen; zeh->zeh_cd = le->le_cd; zeh->zeh_hash = le->le_hash; zeh->zeh_chunkp = chunkp; zeh->zeh_leaf = l; return (0); } } return (SET_ERROR(ENOENT)); } /* Return (h1,cd1 >= h2,cd2) */ #define HCD_GTEQ(h1, cd1, h2, cd2) \ ((h1 > h2) ? TRUE : ((h1 == h2 && cd1 >= cd2) ? TRUE : FALSE)) int zap_leaf_lookup_closest(zap_leaf_t *l, uint64_t h, uint32_t cd, zap_entry_handle_t *zeh) { uint64_t besth = -1ULL; uint32_t bestcd = -1U; uint16_t bestlh = ZAP_LEAF_HASH_NUMENTRIES(l)-1; struct zap_leaf_entry *le; ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC); for (uint16_t lh = LEAF_HASH(l, h); lh <= bestlh; lh++) { for (uint16_t chunk = zap_leaf_phys(l)->l_hash[lh]; chunk != CHAIN_END; chunk = le->le_next) { le = ZAP_LEAF_ENTRY(l, chunk); ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); if (HCD_GTEQ(le->le_hash, le->le_cd, h, cd) && HCD_GTEQ(besth, bestcd, le->le_hash, le->le_cd)) { ASSERT3U(bestlh, >=, lh); bestlh = lh; besth = le->le_hash; bestcd = le->le_cd; zeh->zeh_num_integers = le->le_value_numints; zeh->zeh_integer_size = le->le_value_intlen; zeh->zeh_cd = le->le_cd; zeh->zeh_hash = le->le_hash; zeh->zeh_fakechunk = chunk; zeh->zeh_chunkp = &zeh->zeh_fakechunk; zeh->zeh_leaf = l; } } } return (bestcd == -1U ? 
SET_ERROR(ENOENT) : 0); } int zap_entry_read(const zap_entry_handle_t *zeh, uint8_t integer_size, uint64_t num_integers, void *buf) { struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(zeh->zeh_leaf, *zeh->zeh_chunkp); ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); if (le->le_value_intlen > integer_size) return (SET_ERROR(EINVAL)); zap_leaf_array_read(zeh->zeh_leaf, le->le_value_chunk, le->le_value_intlen, le->le_value_numints, integer_size, num_integers, buf); if (zeh->zeh_num_integers > num_integers) return (SET_ERROR(EOVERFLOW)); return (0); } int zap_entry_read_name(zap_t *zap, const zap_entry_handle_t *zeh, uint16_t buflen, char *buf) { struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(zeh->zeh_leaf, *zeh->zeh_chunkp); ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) { zap_leaf_array_read(zeh->zeh_leaf, le->le_name_chunk, 8, le->le_name_numints, 8, buflen / 8, buf); } else { zap_leaf_array_read(zeh->zeh_leaf, le->le_name_chunk, 1, le->le_name_numints, 1, buflen, buf); } if (le->le_name_numints > buflen) return (SET_ERROR(EOVERFLOW)); return (0); } int zap_entry_update(zap_entry_handle_t *zeh, uint8_t integer_size, uint64_t num_integers, const void *buf) { zap_leaf_t *l = zeh->zeh_leaf; struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, *zeh->zeh_chunkp); int delta_chunks = ZAP_LEAF_ARRAY_NCHUNKS(num_integers * integer_size) - ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_numints * le->le_value_intlen); if ((int)zap_leaf_phys(l)->l_hdr.lh_nfree < delta_chunks) return (SET_ERROR(EAGAIN)); zap_leaf_array_free(l, &le->le_value_chunk); le->le_value_chunk = zap_leaf_array_create(l, buf, integer_size, num_integers); le->le_value_numints = num_integers; le->le_value_intlen = integer_size; return (0); } void zap_entry_remove(zap_entry_handle_t *zeh) { zap_leaf_t *l = zeh->zeh_leaf; ASSERT3P(zeh->zeh_chunkp, !=, &zeh->zeh_fakechunk); uint16_t entry_chunk = *zeh->zeh_chunkp; struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry_chunk); ASSERT3U(le->le_type, ==, 
ZAP_CHUNK_ENTRY); zap_leaf_array_free(l, &le->le_name_chunk); zap_leaf_array_free(l, &le->le_value_chunk); *zeh->zeh_chunkp = le->le_next; zap_leaf_chunk_free(l, entry_chunk); zap_leaf_phys(l)->l_hdr.lh_nentries--; } int zap_entry_create(zap_leaf_t *l, zap_name_t *zn, uint32_t cd, uint8_t integer_size, uint64_t num_integers, const void *buf, zap_entry_handle_t *zeh) { uint16_t chunk; struct zap_leaf_entry *le; uint64_t h = zn->zn_hash; uint64_t valuelen = integer_size * num_integers; int numchunks = 1 + ZAP_LEAF_ARRAY_NCHUNKS(zn->zn_key_orig_numints * zn->zn_key_intlen) + ZAP_LEAF_ARRAY_NCHUNKS(valuelen); if (numchunks > ZAP_LEAF_NUMCHUNKS(l)) return (SET_ERROR(E2BIG)); if (cd == ZAP_NEED_CD) { /* find the lowest unused cd */ if (zap_leaf_phys(l)->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED) { cd = 0; for (chunk = *LEAF_HASH_ENTPTR(l, h); chunk != CHAIN_END; chunk = le->le_next) { le = ZAP_LEAF_ENTRY(l, chunk); if (le->le_cd > cd) break; if (le->le_hash == h) { ASSERT3U(cd, ==, le->le_cd); cd++; } } } else { /* old unsorted format; do it the O(n^2) way */ for (cd = 0; ; cd++) { for (chunk = *LEAF_HASH_ENTPTR(l, h); chunk != CHAIN_END; chunk = le->le_next) { le = ZAP_LEAF_ENTRY(l, chunk); if (le->le_hash == h && le->le_cd == cd) { break; } } /* If this cd is not in use, we are good. */ if (chunk == CHAIN_END) break; } } /* * We would run out of space in a block before we could * store enough entries to run out of CD values. 
*/ ASSERT3U(cd, <, zap_maxcd(zn->zn_zap)); } if (zap_leaf_phys(l)->l_hdr.lh_nfree < numchunks) return (SET_ERROR(EAGAIN)); /* make the entry */ chunk = zap_leaf_chunk_alloc(l); le = ZAP_LEAF_ENTRY(l, chunk); le->le_type = ZAP_CHUNK_ENTRY; le->le_name_chunk = zap_leaf_array_create(l, zn->zn_key_orig, zn->zn_key_intlen, zn->zn_key_orig_numints); le->le_name_numints = zn->zn_key_orig_numints; le->le_value_chunk = zap_leaf_array_create(l, buf, integer_size, num_integers); le->le_value_numints = num_integers; le->le_value_intlen = integer_size; le->le_hash = h; le->le_cd = cd; /* link it into the hash chain */ /* XXX if we did the search above, we could just use that */ uint16_t *chunkp = zap_leaf_rehash_entry(l, chunk); zap_leaf_phys(l)->l_hdr.lh_nentries++; zeh->zeh_leaf = l; zeh->zeh_num_integers = num_integers; zeh->zeh_integer_size = le->le_value_intlen; zeh->zeh_cd = le->le_cd; zeh->zeh_hash = le->le_hash; zeh->zeh_chunkp = chunkp; return (0); } /* * Determine if there is another entry with the same normalized form. * For performance purposes, either zn or name must be provided (the * other can be NULL). Note, there usually won't be any hash * conflicts, in which case we don't need the concatenated/normalized * form of the name. But all callers have one of these on hand anyway, * so might as well take advantage. A cleaner but slower interface * would accept neither argument, and compute the normalized name as - * needed (using zap_name_alloc(zap_entry_read_name(zeh))). + * needed (using zap_name_alloc_str(zap_entry_read_name(zeh))). 
*/ boolean_t zap_entry_normalization_conflict(zap_entry_handle_t *zeh, zap_name_t *zn, const char *name, zap_t *zap) { struct zap_leaf_entry *le; boolean_t allocdzn = B_FALSE; if (zap->zap_normflags == 0) return (B_FALSE); for (uint16_t chunk = *LEAF_HASH_ENTPTR(zeh->zeh_leaf, zeh->zeh_hash); chunk != CHAIN_END; chunk = le->le_next) { le = ZAP_LEAF_ENTRY(zeh->zeh_leaf, chunk); if (le->le_hash != zeh->zeh_hash) continue; if (le->le_cd == zeh->zeh_cd) continue; if (zn == NULL) { - zn = zap_name_alloc(zap, name, MT_NORMALIZE); + zn = zap_name_alloc_str(zap, name, MT_NORMALIZE); allocdzn = B_TRUE; } if (zap_leaf_array_match(zeh->zeh_leaf, zn, le->le_name_chunk, le->le_name_numints)) { if (allocdzn) zap_name_free(zn); return (B_TRUE); } } if (allocdzn) zap_name_free(zn); return (B_FALSE); } /* * Routines for transferring entries between leafs. */ static uint16_t * zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry) { struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry); struct zap_leaf_entry *le2; uint16_t *chunkp; /* * keep the entry chain sorted by cd * NB: this will not cause problems for unsorted leafs, though * it is unnecessary there. 
*/ for (chunkp = LEAF_HASH_ENTPTR(l, le->le_hash); *chunkp != CHAIN_END; chunkp = &le2->le_next) { le2 = ZAP_LEAF_ENTRY(l, *chunkp); if (le2->le_cd > le->le_cd) break; } le->le_next = *chunkp; *chunkp = entry; return (chunkp); } static uint16_t zap_leaf_transfer_array(zap_leaf_t *l, uint16_t chunk, zap_leaf_t *nl) { uint16_t new_chunk; uint16_t *nchunkp = &new_chunk; while (chunk != CHAIN_END) { uint16_t nchunk = zap_leaf_chunk_alloc(nl); struct zap_leaf_array *nla = &ZAP_LEAF_CHUNK(nl, nchunk).l_array; struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; int nextchunk = la->la_next; ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); ASSERT3U(nchunk, <, ZAP_LEAF_NUMCHUNKS(l)); *nla = *la; /* structure assignment */ zap_leaf_chunk_free(l, chunk); chunk = nextchunk; *nchunkp = nchunk; nchunkp = &nla->la_next; } *nchunkp = CHAIN_END; return (new_chunk); } static void zap_leaf_transfer_entry(zap_leaf_t *l, int entry, zap_leaf_t *nl) { struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry); ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); uint16_t chunk = zap_leaf_chunk_alloc(nl); struct zap_leaf_entry *nle = ZAP_LEAF_ENTRY(nl, chunk); *nle = *le; /* structure assignment */ (void) zap_leaf_rehash_entry(nl, chunk); nle->le_name_chunk = zap_leaf_transfer_array(l, le->le_name_chunk, nl); nle->le_value_chunk = zap_leaf_transfer_array(l, le->le_value_chunk, nl); zap_leaf_chunk_free(l, entry); zap_leaf_phys(l)->l_hdr.lh_nentries--; zap_leaf_phys(nl)->l_hdr.lh_nentries++; } /* * Transfer the entries whose hash prefix ends in 1 to the new leaf. 
*/ void zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort) { int bit = 64 - 1 - zap_leaf_phys(l)->l_hdr.lh_prefix_len; /* set new prefix and prefix_len */ zap_leaf_phys(l)->l_hdr.lh_prefix <<= 1; zap_leaf_phys(l)->l_hdr.lh_prefix_len++; zap_leaf_phys(nl)->l_hdr.lh_prefix = zap_leaf_phys(l)->l_hdr.lh_prefix | 1; zap_leaf_phys(nl)->l_hdr.lh_prefix_len = zap_leaf_phys(l)->l_hdr.lh_prefix_len; /* break existing hash chains */ zap_memset(zap_leaf_phys(l)->l_hash, CHAIN_END, 2*ZAP_LEAF_HASH_NUMENTRIES(l)); if (sort) zap_leaf_phys(l)->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED; /* * Transfer entries whose hash bit 'bit' is set to nl; rehash * the remaining entries * * NB: We could find entries via the hashtable instead. That * would be O(hashents+numents) rather than O(numblks+numents), * but this accesses memory more sequentially, and when we're * called, the block is usually pretty full. */ for (int i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) { struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, i); if (le->le_type != ZAP_CHUNK_ENTRY) continue; if (le->le_hash & (1ULL << bit)) zap_leaf_transfer_entry(l, i, nl); else (void) zap_leaf_rehash_entry(l, i); } } void zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs) { int n = zap_f_phys(zap)->zap_ptrtbl.zt_shift - zap_leaf_phys(l)->l_hdr.lh_prefix_len; n = MIN(n, ZAP_HISTOGRAM_SIZE-1); zs->zs_leafs_with_2n_pointers[n]++; n = zap_leaf_phys(l)->l_hdr.lh_nentries/5; n = MIN(n, ZAP_HISTOGRAM_SIZE-1); zs->zs_blocks_with_n5_entries[n]++; n = ((1<l_hdr.lh_nfree * (ZAP_LEAF_ARRAY_BYTES+1))*10 / (1<zs_blocks_n_tenths_full[n]++; for (int i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(l); i++) { int nentries = 0; int chunk = zap_leaf_phys(l)->l_hash[i]; while (chunk != CHAIN_END) { struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, chunk); n = 1 + ZAP_LEAF_ARRAY_NCHUNKS(le->le_name_numints) + ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_numints * le->le_value_intlen); n = MIN(n, ZAP_HISTOGRAM_SIZE-1); zs->zs_entries_using_n_chunks[n]++; chunk = le->le_next; 
nentries++; } n = nentries; n = MIN(n, ZAP_HISTOGRAM_SIZE-1); zs->zs_buckets_with_n_entries[n]++; } } diff --git a/module/zfs/zap_micro.c b/module/zfs/zap_micro.c index 516d46ac7f31..e3dadf130413 100644 --- a/module/zfs/zap_micro.c +++ b/module/zfs/zap_micro.c @@ -1,1698 +1,1725 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2017 Nexenta Systems, Inc. 
*/ #include #include #include #include #include #include #include -#include +#include #include #include #ifdef _KERNEL #include #endif static int mzap_upgrade(zap_t **zapp, void *tag, dmu_tx_t *tx, zap_flags_t flags); uint64_t zap_getflags(zap_t *zap) { if (zap->zap_ismicro) return (0); return (zap_f_phys(zap)->zap_flags); } int zap_hashbits(zap_t *zap) { if (zap_getflags(zap) & ZAP_FLAG_HASH64) return (48); else return (28); } uint32_t zap_maxcd(zap_t *zap) { if (zap_getflags(zap) & ZAP_FLAG_HASH64) return ((1<<16)-1); else return (-1U); } static uint64_t zap_hash(zap_name_t *zn) { zap_t *zap = zn->zn_zap; uint64_t h = 0; if (zap_getflags(zap) & ZAP_FLAG_PRE_HASHED_KEY) { ASSERT(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY); h = *(uint64_t *)zn->zn_key_orig; } else { h = zap->zap_salt; ASSERT(h != 0); ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) { const uint64_t *wp = zn->zn_key_norm; ASSERT(zn->zn_key_intlen == 8); for (int i = 0; i < zn->zn_key_norm_numints; wp++, i++) { uint64_t word = *wp; - for (int j = 0; j < zn->zn_key_intlen; j++) { + for (int j = 0; j < 8; j++) { h = (h >> 8) ^ zfs_crc64_table[(h ^ word) & 0xFF]; word >>= NBBY; } } } else { const uint8_t *cp = zn->zn_key_norm; /* * We previously stored the terminating null on * disk, but didn't hash it, so we need to * continue to not hash it. (The * zn_key_*_numints includes the terminating * null for non-binary keys.) */ int len = zn->zn_key_norm_numints - 1; ASSERT(zn->zn_key_intlen == 1); for (int i = 0; i < len; cp++, i++) { h = (h >> 8) ^ zfs_crc64_table[(h ^ *cp) & 0xFF]; } } } /* * Don't use all 64 bits, since we need some in the cookie for * the collision differentiator. We MUST use the high bits, * since those are the ones that we first pay attention to when * choosing the bucket. 
*/ h &= ~((1ULL << (64 - zap_hashbits(zap))) - 1); return (h); } static int zap_normalize(zap_t *zap, const char *name, char *namenorm, int normflags) { ASSERT(!(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY)); size_t inlen = strlen(name) + 1; size_t outlen = ZAP_MAXNAMELEN; int err = 0; (void) u8_textprep_str((char *)name, &inlen, namenorm, &outlen, normflags | U8_TEXTPREP_IGNORE_NULL | U8_TEXTPREP_IGNORE_INVALID, U8_UNICODE_LATEST, &err); return (err); } boolean_t zap_match(zap_name_t *zn, const char *matchname) { ASSERT(!(zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY)); if (zn->zn_matchtype & MT_NORMALIZE) { char norm[ZAP_MAXNAMELEN]; if (zap_normalize(zn->zn_zap, matchname, norm, zn->zn_normflags) != 0) return (B_FALSE); return (strcmp(zn->zn_key_norm, norm) == 0); } else { return (strcmp(zn->zn_key_orig, matchname) == 0); } } +static zap_name_t * +zap_name_alloc(zap_t *zap) +{ + zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP); + zn->zn_zap = zap; + return (zn); +} + void zap_name_free(zap_name_t *zn) { kmem_free(zn, sizeof (zap_name_t)); } -zap_name_t * -zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt) +static int +zap_name_init_str(zap_name_t *zn, const char *key, matchtype_t mt) { - zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP); + zap_t *zap = zn->zn_zap; - zn->zn_zap = zap; zn->zn_key_intlen = sizeof (*key); zn->zn_key_orig = key; zn->zn_key_orig_numints = strlen(zn->zn_key_orig) + 1; zn->zn_matchtype = mt; zn->zn_normflags = zap->zap_normflags; /* * If we're dealing with a case sensitive lookup on a mixed or * insensitive fs, remove U8_TEXTPREP_TOUPPER or the lookup * will fold case to all caps overriding the lookup request. */ if (mt & MT_MATCH_CASE) zn->zn_normflags &= ~U8_TEXTPREP_TOUPPER; if (zap->zap_normflags) { /* * We *must* use zap_normflags because this normalization is * what the hash is computed from. 
*/ if (zap_normalize(zap, key, zn->zn_normbuf, - zap->zap_normflags) != 0) { - zap_name_free(zn); - return (NULL); - } + zap->zap_normflags) != 0) + return (SET_ERROR(ENOTSUP)); zn->zn_key_norm = zn->zn_normbuf; zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1; } else { - if (mt != 0) { - zap_name_free(zn); - return (NULL); - } + if (mt != 0) + return (SET_ERROR(ENOTSUP)); zn->zn_key_norm = zn->zn_key_orig; zn->zn_key_norm_numints = zn->zn_key_orig_numints; } zn->zn_hash = zap_hash(zn); if (zap->zap_normflags != zn->zn_normflags) { /* * We *must* use zn_normflags because this normalization is * what the matching is based on. (Not the hash!) */ if (zap_normalize(zap, key, zn->zn_normbuf, - zn->zn_normflags) != 0) { - zap_name_free(zn); - return (NULL); - } + zn->zn_normflags) != 0) + return (SET_ERROR(ENOTSUP)); zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1; } + return (0); +} + +zap_name_t * +zap_name_alloc_str(zap_t *zap, const char *key, matchtype_t mt) +{ + zap_name_t *zn = zap_name_alloc(zap); + if (zap_name_init_str(zn, key, mt) != 0) { + zap_name_free(zn); + return (NULL); + } return (zn); } static zap_name_t * zap_name_alloc_uint64(zap_t *zap, const uint64_t *key, int numints) { zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP); ASSERT(zap->zap_normflags == 0); zn->zn_zap = zap; zn->zn_key_intlen = sizeof (*key); zn->zn_key_orig = zn->zn_key_norm = key; zn->zn_key_orig_numints = zn->zn_key_norm_numints = numints; zn->zn_matchtype = 0; zn->zn_hash = zap_hash(zn); return (zn); } static void mzap_byteswap(mzap_phys_t *buf, size_t size) { buf->mz_block_type = BSWAP_64(buf->mz_block_type); buf->mz_salt = BSWAP_64(buf->mz_salt); buf->mz_normflags = BSWAP_64(buf->mz_normflags); int max = (size / MZAP_ENT_LEN) - 1; for (int i = 0; i < max; i++) { buf->mz_chunk[i].mze_value = BSWAP_64(buf->mz_chunk[i].mze_value); buf->mz_chunk[i].mze_cd = BSWAP_32(buf->mz_chunk[i].mze_cd); } } void zap_byteswap(void *buf, size_t size) { uint64_t block_type = 
*(uint64_t *)buf; if (block_type == ZBT_MICRO || block_type == BSWAP_64(ZBT_MICRO)) { /* ASSERT(magic == ZAP_LEAF_MAGIC); */ mzap_byteswap(buf, size); } else { fzap_byteswap(buf, size); } } static int mze_compare(const void *arg1, const void *arg2) { const mzap_ent_t *mze1 = arg1; const mzap_ent_t *mze2 = arg2; - int cmp = TREE_CMP(mze1->mze_hash, mze2->mze_hash); - if (likely(cmp)) - return (cmp); - - return (TREE_CMP(mze1->mze_cd, mze2->mze_cd)); + return (TREE_CMP((uint64_t)(mze1->mze_hash) << 32 | mze1->mze_cd, + (uint64_t)(mze2->mze_hash) << 32 | mze2->mze_cd)); } static void -mze_insert(zap_t *zap, int chunkid, uint64_t hash) +mze_insert(zap_t *zap, uint16_t chunkid, uint64_t hash) { + mzap_ent_t mze; + ASSERT(zap->zap_ismicro); ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); - mzap_ent_t *mze = kmem_alloc(sizeof (mzap_ent_t), KM_SLEEP); - mze->mze_chunkid = chunkid; - mze->mze_hash = hash; - mze->mze_cd = MZE_PHYS(zap, mze)->mze_cd; - ASSERT(MZE_PHYS(zap, mze)->mze_name[0] != 0); - avl_add(&zap->zap_m.zap_avl, mze); + mze.mze_chunkid = chunkid; + ASSERT0(hash & 0xffffffff); + mze.mze_hash = hash >> 32; + ASSERT3U(MZE_PHYS(zap, &mze)->mze_cd, <=, 0xffff); + mze.mze_cd = (uint16_t)MZE_PHYS(zap, &mze)->mze_cd; + ASSERT(MZE_PHYS(zap, &mze)->mze_name[0] != 0); + zfs_btree_add(&zap->zap_m.zap_tree, &mze); } static mzap_ent_t * -mze_find(zap_name_t *zn) +mze_find(zap_name_t *zn, zfs_btree_index_t *idx) { mzap_ent_t mze_tofind; mzap_ent_t *mze; - avl_index_t idx; - avl_tree_t *avl = &zn->zn_zap->zap_m.zap_avl; + zfs_btree_t *tree = &zn->zn_zap->zap_m.zap_tree; ASSERT(zn->zn_zap->zap_ismicro); ASSERT(RW_LOCK_HELD(&zn->zn_zap->zap_rwlock)); - mze_tofind.mze_hash = zn->zn_hash; + ASSERT0(zn->zn_hash & 0xffffffff); + mze_tofind.mze_hash = zn->zn_hash >> 32; mze_tofind.mze_cd = 0; - mze = avl_find(avl, &mze_tofind, &idx); + mze = zfs_btree_find(tree, &mze_tofind, idx); if (mze == NULL) - mze = avl_nearest(avl, idx, AVL_AFTER); - for (; mze && mze->mze_hash == zn->zn_hash; mze = 
AVL_NEXT(avl, mze)) { + mze = zfs_btree_next(tree, idx, idx); + for (; mze && mze->mze_hash == mze_tofind.mze_hash; + mze = zfs_btree_next(tree, idx, idx)) { ASSERT3U(mze->mze_cd, ==, MZE_PHYS(zn->zn_zap, mze)->mze_cd); if (zap_match(zn, MZE_PHYS(zn->zn_zap, mze)->mze_name)) return (mze); } return (NULL); } static uint32_t mze_find_unused_cd(zap_t *zap, uint64_t hash) { mzap_ent_t mze_tofind; - avl_index_t idx; - avl_tree_t *avl = &zap->zap_m.zap_avl; + zfs_btree_index_t idx; + zfs_btree_t *tree = &zap->zap_m.zap_tree; ASSERT(zap->zap_ismicro); ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); + ASSERT0(hash & 0xffffffff); + hash >>= 32; mze_tofind.mze_hash = hash; mze_tofind.mze_cd = 0; uint32_t cd = 0; - for (mzap_ent_t *mze = avl_find(avl, &mze_tofind, &idx); - mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) { + for (mzap_ent_t *mze = zfs_btree_find(tree, &mze_tofind, &idx); + mze && mze->mze_hash == hash; + mze = zfs_btree_next(tree, &idx, &idx)) { if (mze->mze_cd != cd) break; cd++; } return (cd); } /* * Each mzap entry requires at max : 4 chunks * 3 chunks for names + 1 chunk for value. */ #define MZAP_ENT_CHUNKS (1 + ZAP_LEAF_ARRAY_NCHUNKS(MZAP_NAME_LEN) + \ ZAP_LEAF_ARRAY_NCHUNKS(sizeof (uint64_t))) /* * Check if the current entry keeps the colliding entries under the fatzap leaf * size. 
*/ static boolean_t mze_canfit_fzap_leaf(zap_name_t *zn, uint64_t hash) { zap_t *zap = zn->zn_zap; mzap_ent_t mze_tofind; - mzap_ent_t *mze; - avl_index_t idx; - avl_tree_t *avl = &zap->zap_m.zap_avl; + zfs_btree_index_t idx; + zfs_btree_t *tree = &zap->zap_m.zap_tree; uint32_t mzap_ents = 0; + ASSERT0(hash & 0xffffffff); + hash >>= 32; mze_tofind.mze_hash = hash; mze_tofind.mze_cd = 0; - for (mze = avl_find(avl, &mze_tofind, &idx); - mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) { + for (mzap_ent_t *mze = zfs_btree_find(tree, &mze_tofind, &idx); + mze && mze->mze_hash == hash; + mze = zfs_btree_next(tree, &idx, &idx)) { mzap_ents++; } /* Include the new entry being added */ mzap_ents++; return (ZAP_LEAF_NUMCHUNKS_DEF > (mzap_ents * MZAP_ENT_CHUNKS)); } -static void -mze_remove(zap_t *zap, mzap_ent_t *mze) -{ - ASSERT(zap->zap_ismicro); - ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); - - avl_remove(&zap->zap_m.zap_avl, mze); - kmem_free(mze, sizeof (mzap_ent_t)); -} - static void mze_destroy(zap_t *zap) { - mzap_ent_t *mze; - void *avlcookie = NULL; - - while ((mze = avl_destroy_nodes(&zap->zap_m.zap_avl, &avlcookie))) - kmem_free(mze, sizeof (mzap_ent_t)); - avl_destroy(&zap->zap_m.zap_avl); + zfs_btree_clear(&zap->zap_m.zap_tree); + zfs_btree_destroy(&zap->zap_m.zap_tree); } static zap_t * mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db) { zap_t *winner; uint64_t *zap_hdr = (uint64_t *)db->db_data; uint64_t zap_block_type = zap_hdr[0]; uint64_t zap_magic = zap_hdr[1]; ASSERT3U(MZAP_ENT_LEN, ==, sizeof (mzap_ent_phys_t)); zap_t *zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP); rw_init(&zap->zap_rwlock, NULL, RW_DEFAULT, NULL); rw_enter(&zap->zap_rwlock, RW_WRITER); zap->zap_objset = os; zap->zap_object = obj; zap->zap_dbuf = db; if (zap_block_type != ZBT_MICRO) { mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, MUTEX_DEFAULT, 0); zap->zap_f.zap_block_shift = highbit64(db->db_size) - 1; if (zap_block_type != ZBT_HEADER || zap_magic != ZAP_MAGIC) { winner = NULL; 
/* No actual winner here... */ goto handle_winner; } } else { zap->zap_ismicro = TRUE; } /* * Make sure that zap_ismicro is set before we let others see * it, because zap_lockdir() checks zap_ismicro without the lock * held. */ dmu_buf_init_user(&zap->zap_dbu, zap_evict_sync, NULL, &zap->zap_dbuf); winner = dmu_buf_set_user(db, &zap->zap_dbu); if (winner != NULL) goto handle_winner; if (zap->zap_ismicro) { zap->zap_salt = zap_m_phys(zap)->mz_salt; zap->zap_normflags = zap_m_phys(zap)->mz_normflags; zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1; - avl_create(&zap->zap_m.zap_avl, mze_compare, - sizeof (mzap_ent_t), offsetof(mzap_ent_t, mze_node)); - for (int i = 0; i < zap->zap_m.zap_num_chunks; i++) { + /* + * Reduce B-tree leaf from 4KB to 512 bytes to reduce memmove() + * overhead on massive inserts below. It still allows to store + * 62 entries before we have to add 2KB B-tree core node. + */ + zfs_btree_create_custom(&zap->zap_m.zap_tree, mze_compare, + sizeof (mzap_ent_t), 512); + + zap_name_t *zn = zap_name_alloc(zap); + for (uint16_t i = 0; i < zap->zap_m.zap_num_chunks; i++) { mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i]; if (mze->mze_name[0]) { - zap_name_t *zn; - zap->zap_m.zap_num_entries++; - zn = zap_name_alloc(zap, mze->mze_name, 0); + zap_name_init_str(zn, mze->mze_name, 0); mze_insert(zap, i, zn->zn_hash); - zap_name_free(zn); } } + zap_name_free(zn); } else { zap->zap_salt = zap_f_phys(zap)->zap_salt; zap->zap_normflags = zap_f_phys(zap)->zap_normflags; ASSERT3U(sizeof (struct zap_leaf_header), ==, 2*ZAP_LEAF_CHUNKSIZE); /* * The embedded pointer table should not overlap the * other members. 
*/ ASSERT3P(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), >, &zap_f_phys(zap)->zap_salt); /* * The embedded pointer table should end at the end of * the block */ ASSERT3U((uintptr_t)&ZAP_EMBEDDED_PTRTBL_ENT(zap, 1<zap_dbuf->db_size); } rw_exit(&zap->zap_rwlock); return (zap); handle_winner: rw_exit(&zap->zap_rwlock); rw_destroy(&zap->zap_rwlock); if (!zap->zap_ismicro) mutex_destroy(&zap->zap_f.zap_num_entries_mtx); kmem_free(zap, sizeof (zap_t)); return (winner); } /* * This routine "consumes" the caller's hold on the dbuf, which must * have the specified tag. */ static int zap_lockdir_impl(dmu_buf_t *db, void *tag, dmu_tx_t *tx, krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp) { ASSERT0(db->db_offset); objset_t *os = dmu_buf_get_objset(db); uint64_t obj = db->db_object; dmu_object_info_t doi; *zapp = NULL; dmu_object_info_from_db(db, &doi); if (DMU_OT_BYTESWAP(doi.doi_type) != DMU_BSWAP_ZAP) return (SET_ERROR(EINVAL)); zap_t *zap = dmu_buf_get_user(db); if (zap == NULL) { zap = mzap_open(os, obj, db); if (zap == NULL) { /* * mzap_open() didn't like what it saw on-disk. * Check for corruption! */ return (SET_ERROR(EIO)); } } /* * We're checking zap_ismicro without the lock held, in order to * tell what type of lock we want. Once we have some sort of * lock, see if it really is the right type. In practice this * can only be different if it was upgraded from micro to fat, * and micro wanted WRITER but fat only needs READER. */ krw_t lt = (!zap->zap_ismicro && fatreader) ? RW_READER : lti; rw_enter(&zap->zap_rwlock, lt); if (lt != ((!zap->zap_ismicro && fatreader) ? RW_READER : lti)) { /* it was upgraded, now we only need reader */ ASSERT(lt == RW_WRITER); ASSERT(RW_READER == ((!zap->zap_ismicro && fatreader) ? 
RW_READER : lti)); rw_downgrade(&zap->zap_rwlock); lt = RW_READER; } zap->zap_objset = os; if (lt == RW_WRITER) dmu_buf_will_dirty(db, tx); ASSERT3P(zap->zap_dbuf, ==, db); ASSERT(!zap->zap_ismicro || zap->zap_m.zap_num_entries <= zap->zap_m.zap_num_chunks); if (zap->zap_ismicro && tx && adding && zap->zap_m.zap_num_entries == zap->zap_m.zap_num_chunks) { uint64_t newsz = db->db_size + SPA_MINBLOCKSIZE; if (newsz > MZAP_MAX_BLKSZ) { dprintf("upgrading obj %llu: num_entries=%u\n", (u_longlong_t)obj, zap->zap_m.zap_num_entries); *zapp = zap; int err = mzap_upgrade(zapp, tag, tx, 0); if (err != 0) rw_exit(&zap->zap_rwlock); return (err); } VERIFY0(dmu_object_set_blocksize(os, obj, newsz, 0, tx)); zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1; } *zapp = zap; return (0); } static int zap_lockdir_by_dnode(dnode_t *dn, dmu_tx_t *tx, krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp) { dmu_buf_t *db; int err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH); if (err != 0) { return (err); } #ifdef ZFS_DEBUG { dmu_object_info_t doi; dmu_object_info_from_db(db, &doi); ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP); } #endif err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp); if (err != 0) { dmu_buf_rele(db, tag); } return (err); } int zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp) { dmu_buf_t *db; int err = dmu_buf_hold(os, obj, 0, tag, &db, DMU_READ_NO_PREFETCH); if (err != 0) return (err); #ifdef ZFS_DEBUG { dmu_object_info_t doi; dmu_object_info_from_db(db, &doi); ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP); } #endif err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp); if (err != 0) dmu_buf_rele(db, tag); return (err); } void zap_unlockdir(zap_t *zap, void *tag) { rw_exit(&zap->zap_rwlock); dmu_buf_rele(zap->zap_dbuf, tag); } static int mzap_upgrade(zap_t **zapp, void *tag, dmu_tx_t *tx, 
zap_flags_t flags) { int err = 0; zap_t *zap = *zapp; ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); int sz = zap->zap_dbuf->db_size; mzap_phys_t *mzp = vmem_alloc(sz, KM_SLEEP); bcopy(zap->zap_dbuf->db_data, mzp, sz); int nchunks = zap->zap_m.zap_num_chunks; if (!flags) { err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object, 1ULL << fzap_default_block_shift, 0, tx); if (err != 0) { vmem_free(mzp, sz); return (err); } } dprintf("upgrading obj=%llu with %u chunks\n", (u_longlong_t)zap->zap_object, nchunks); - /* XXX destroy the avl later, so we can use the stored hash value */ + /* XXX destroy the tree later, so we can use the stored hash value */ mze_destroy(zap); fzap_upgrade(zap, tx, flags); + zap_name_t *zn = zap_name_alloc(zap); for (int i = 0; i < nchunks; i++) { mzap_ent_phys_t *mze = &mzp->mz_chunk[i]; if (mze->mze_name[0] == 0) continue; dprintf("adding %s=%llu\n", mze->mze_name, (u_longlong_t)mze->mze_value); - zap_name_t *zn = zap_name_alloc(zap, mze->mze_name, 0); + zap_name_init_str(zn, mze->mze_name, 0); /* If we fail here, we would end up losing entries */ VERIFY0(fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd, tag, tx)); zap = zn->zn_zap; /* fzap_add_cd() may change zap */ - zap_name_free(zn); } + zap_name_free(zn); vmem_free(mzp, sz); *zapp = zap; return (0); } /* * The "normflags" determine the behavior of the matchtype_t which is * passed to zap_lookup_norm(). Names which have the same normalized * version will be stored with the same hash value, and therefore we can * perform normalization-insensitive lookups. We can be Unicode form- * insensitive and/or case-insensitive. The following flags are valid for * "normflags": * * U8_TEXTPREP_NFC * U8_TEXTPREP_NFD * U8_TEXTPREP_NFKC * U8_TEXTPREP_NFKD * U8_TEXTPREP_TOUPPER * * The *_NF* (Normalization Form) flags are mutually exclusive; at most one * of them may be supplied. 
*/ void mzap_create_impl(dnode_t *dn, int normflags, zap_flags_t flags, dmu_tx_t *tx) { dmu_buf_t *db; VERIFY0(dmu_buf_hold_by_dnode(dn, 0, FTAG, &db, DMU_READ_NO_PREFETCH)); dmu_buf_will_dirty(db, tx); mzap_phys_t *zp = db->db_data; zp->mz_block_type = ZBT_MICRO; zp->mz_salt = ((uintptr_t)db ^ (uintptr_t)tx ^ (dn->dn_object << 1)) | 1ULL; zp->mz_normflags = normflags; if (flags != 0) { zap_t *zap; /* Only fat zap supports flags; upgrade immediately. */ VERIFY0(zap_lockdir_impl(db, FTAG, tx, RW_WRITER, B_FALSE, B_FALSE, &zap)); VERIFY0(mzap_upgrade(&zap, FTAG, tx, flags)); zap_unlockdir(zap, FTAG); } else { dmu_buf_rele(db, FTAG); } } static uint64_t zap_create_impl(objset_t *os, int normflags, zap_flags_t flags, dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, int dnodesize, dnode_t **allocated_dnode, void *tag, dmu_tx_t *tx) { uint64_t obj; ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP); if (allocated_dnode == NULL) { dnode_t *dn; obj = dmu_object_alloc_hold(os, ot, 1ULL << leaf_blockshift, indirect_blockshift, bonustype, bonuslen, dnodesize, &dn, FTAG, tx); mzap_create_impl(dn, normflags, flags, tx); dnode_rele(dn, FTAG); } else { obj = dmu_object_alloc_hold(os, ot, 1ULL << leaf_blockshift, indirect_blockshift, bonustype, bonuslen, dnodesize, allocated_dnode, tag, tx); mzap_create_impl(*allocated_dnode, normflags, flags, tx); } return (obj); } int zap_create_claim(objset_t *os, uint64_t obj, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { return (zap_create_claim_dnsize(os, obj, ot, bonustype, bonuslen, 0, tx)); } int zap_create_claim_dnsize(objset_t *os, uint64_t obj, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) { return (zap_create_claim_norm_dnsize(os, obj, 0, ot, bonustype, bonuslen, dnodesize, tx)); } int zap_create_claim_norm(objset_t *os, uint64_t obj, int normflags, dmu_object_type_t ot, dmu_object_type_t 
bonustype, int bonuslen, dmu_tx_t *tx) { return (zap_create_claim_norm_dnsize(os, obj, normflags, ot, bonustype, bonuslen, 0, tx)); } int zap_create_claim_norm_dnsize(objset_t *os, uint64_t obj, int normflags, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) { dnode_t *dn; int error; ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP); error = dmu_object_claim_dnsize(os, obj, ot, 0, bonustype, bonuslen, dnodesize, tx); if (error != 0) return (error); error = dnode_hold(os, obj, FTAG, &dn); if (error != 0) return (error); mzap_create_impl(dn, normflags, 0, tx); dnode_rele(dn, FTAG); return (0); } uint64_t zap_create(objset_t *os, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { return (zap_create_norm(os, 0, ot, bonustype, bonuslen, tx)); } uint64_t zap_create_dnsize(objset_t *os, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) { return (zap_create_norm_dnsize(os, 0, ot, bonustype, bonuslen, dnodesize, tx)); } uint64_t zap_create_norm(objset_t *os, int normflags, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { return (zap_create_norm_dnsize(os, normflags, ot, bonustype, bonuslen, 0, tx)); } uint64_t zap_create_norm_dnsize(objset_t *os, int normflags, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) { return (zap_create_impl(os, normflags, 0, ot, 0, 0, bonustype, bonuslen, dnodesize, NULL, NULL, tx)); } uint64_t zap_create_flags(objset_t *os, int normflags, zap_flags_t flags, dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { return (zap_create_flags_dnsize(os, normflags, flags, ot, leaf_blockshift, indirect_blockshift, bonustype, bonuslen, 0, tx)); } uint64_t zap_create_flags_dnsize(objset_t *os, int normflags, zap_flags_t flags, dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, 
dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) { return (zap_create_impl(os, normflags, flags, ot, leaf_blockshift, indirect_blockshift, bonustype, bonuslen, dnodesize, NULL, NULL, tx)); } /* * Create a zap object and return a pointer to the newly allocated dnode via * the allocated_dnode argument. The returned dnode will be held and the * caller is responsible for releasing the hold by calling dnode_rele(). */ uint64_t zap_create_hold(objset_t *os, int normflags, zap_flags_t flags, dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, int dnodesize, dnode_t **allocated_dnode, void *tag, dmu_tx_t *tx) { return (zap_create_impl(os, normflags, flags, ot, leaf_blockshift, indirect_blockshift, bonustype, bonuslen, dnodesize, allocated_dnode, tag, tx)); } int zap_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx) { /* * dmu_object_free will free the object number and free the * data. Freeing the data will cause our pageout function to be * called, which will destroy our data (zap_leaf_t's and zap_t). */ return (dmu_object_free(os, zapobj, tx)); } void zap_evict_sync(void *dbu) { zap_t *zap = dbu; rw_destroy(&zap->zap_rwlock); if (zap->zap_ismicro) mze_destroy(zap); else mutex_destroy(&zap->zap_f.zap_num_entries_mtx); kmem_free(zap, sizeof (zap_t)); } int zap_count(objset_t *os, uint64_t zapobj, uint64_t *count) { zap_t *zap; int err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); if (err != 0) return (err); if (!zap->zap_ismicro) { err = fzap_count(zap, count); } else { *count = zap->zap_m.zap_num_entries; } zap_unlockdir(zap, FTAG); return (err); } /* * zn may be NULL; if not specified, it will be computed if needed. * See also the comment above zap_entry_normalization_conflict(). 
*/ static boolean_t -mzap_normalization_conflict(zap_t *zap, zap_name_t *zn, mzap_ent_t *mze) +mzap_normalization_conflict(zap_t *zap, zap_name_t *zn, mzap_ent_t *mze, + zfs_btree_index_t *idx) { - int direction = AVL_BEFORE; boolean_t allocdzn = B_FALSE; + mzap_ent_t *other; + zfs_btree_index_t oidx; if (zap->zap_normflags == 0) return (B_FALSE); -again: - for (mzap_ent_t *other = avl_walk(&zap->zap_m.zap_avl, mze, direction); + for (other = zfs_btree_prev(&zap->zap_m.zap_tree, idx, &oidx); other && other->mze_hash == mze->mze_hash; - other = avl_walk(&zap->zap_m.zap_avl, other, direction)) { + other = zfs_btree_prev(&zap->zap_m.zap_tree, &oidx, &oidx)) { if (zn == NULL) { - zn = zap_name_alloc(zap, MZE_PHYS(zap, mze)->mze_name, - MT_NORMALIZE); + zn = zap_name_alloc_str(zap, + MZE_PHYS(zap, mze)->mze_name, MT_NORMALIZE); allocdzn = B_TRUE; } if (zap_match(zn, MZE_PHYS(zap, other)->mze_name)) { if (allocdzn) zap_name_free(zn); return (B_TRUE); } } - if (direction == AVL_BEFORE) { - direction = AVL_AFTER; - goto again; + for (other = zfs_btree_next(&zap->zap_m.zap_tree, idx, &oidx); + other && other->mze_hash == mze->mze_hash; + other = zfs_btree_next(&zap->zap_m.zap_tree, &oidx, &oidx)) { + + if (zn == NULL) { + zn = zap_name_alloc_str(zap, + MZE_PHYS(zap, mze)->mze_name, MT_NORMALIZE); + allocdzn = B_TRUE; + } + if (zap_match(zn, MZE_PHYS(zap, other)->mze_name)) { + if (allocdzn) + zap_name_free(zn); + return (B_TRUE); + } } if (allocdzn) zap_name_free(zn); return (B_FALSE); } /* * Routines for manipulating attributes. 
*/ int zap_lookup(objset_t *os, uint64_t zapobj, const char *name, uint64_t integer_size, uint64_t num_integers, void *buf) { return (zap_lookup_norm(os, zapobj, name, integer_size, num_integers, buf, 0, NULL, 0, NULL)); } static int zap_lookup_impl(zap_t *zap, const char *name, uint64_t integer_size, uint64_t num_integers, void *buf, matchtype_t mt, char *realname, int rn_len, boolean_t *ncp) { int err = 0; - zap_name_t *zn = zap_name_alloc(zap, name, mt); + zap_name_t *zn = zap_name_alloc_str(zap, name, mt); if (zn == NULL) return (SET_ERROR(ENOTSUP)); if (!zap->zap_ismicro) { err = fzap_lookup(zn, integer_size, num_integers, buf, realname, rn_len, ncp); } else { - mzap_ent_t *mze = mze_find(zn); + zfs_btree_index_t idx; + mzap_ent_t *mze = mze_find(zn, &idx); if (mze == NULL) { err = SET_ERROR(ENOENT); } else { if (num_integers < 1) { err = SET_ERROR(EOVERFLOW); } else if (integer_size != 8) { err = SET_ERROR(EINVAL); } else { *(uint64_t *)buf = MZE_PHYS(zap, mze)->mze_value; if (realname != NULL) (void) strlcpy(realname, MZE_PHYS(zap, mze)->mze_name, rn_len); if (ncp) { *ncp = mzap_normalization_conflict(zap, - zn, mze); + zn, mze, &idx); } } } } zap_name_free(zn); return (err); } int zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name, uint64_t integer_size, uint64_t num_integers, void *buf, matchtype_t mt, char *realname, int rn_len, boolean_t *ncp) { zap_t *zap; int err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); if (err != 0) return (err); err = zap_lookup_impl(zap, name, integer_size, num_integers, buf, mt, realname, rn_len, ncp); zap_unlockdir(zap, FTAG); return (err); } int zap_prefetch(objset_t *os, uint64_t zapobj, const char *name) { zap_t *zap; int err; zap_name_t *zn; err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); if (err) return (err); - zn = zap_name_alloc(zap, name, 0); + zn = zap_name_alloc_str(zap, name, 0); if (zn == NULL) { zap_unlockdir(zap, FTAG); return (SET_ERROR(ENOTSUP)); } 
fzap_prefetch(zn); zap_name_free(zn); zap_unlockdir(zap, FTAG); return (err); } int zap_lookup_by_dnode(dnode_t *dn, const char *name, uint64_t integer_size, uint64_t num_integers, void *buf) { return (zap_lookup_norm_by_dnode(dn, name, integer_size, num_integers, buf, 0, NULL, 0, NULL)); } int zap_lookup_norm_by_dnode(dnode_t *dn, const char *name, uint64_t integer_size, uint64_t num_integers, void *buf, matchtype_t mt, char *realname, int rn_len, boolean_t *ncp) { zap_t *zap; int err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); if (err != 0) return (err); err = zap_lookup_impl(zap, name, integer_size, num_integers, buf, mt, realname, rn_len, ncp); zap_unlockdir(zap, FTAG); return (err); } int zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints) { zap_t *zap; int err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); if (err != 0) return (err); zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); if (zn == NULL) { zap_unlockdir(zap, FTAG); return (SET_ERROR(ENOTSUP)); } fzap_prefetch(zn); zap_name_free(zn); zap_unlockdir(zap, FTAG); return (err); } int zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf) { zap_t *zap; int err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); if (err != 0) return (err); zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); if (zn == NULL) { zap_unlockdir(zap, FTAG); return (SET_ERROR(ENOTSUP)); } err = fzap_lookup(zn, integer_size, num_integers, buf, NULL, 0, NULL); zap_name_free(zn); zap_unlockdir(zap, FTAG); return (err); } int zap_contains(objset_t *os, uint64_t zapobj, const char *name) { int err = zap_lookup_norm(os, zapobj, name, 0, 0, NULL, 0, NULL, 0, NULL); if (err == EOVERFLOW || err == EINVAL) err = 0; /* found, but skipped reading the value */ return (err); } int zap_length(objset_t *os, uint64_t zapobj, const char 
*name, uint64_t *integer_size, uint64_t *num_integers) { zap_t *zap; int err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); if (err != 0) return (err); - zap_name_t *zn = zap_name_alloc(zap, name, 0); + zap_name_t *zn = zap_name_alloc_str(zap, name, 0); if (zn == NULL) { zap_unlockdir(zap, FTAG); return (SET_ERROR(ENOTSUP)); } if (!zap->zap_ismicro) { err = fzap_length(zn, integer_size, num_integers); } else { - mzap_ent_t *mze = mze_find(zn); + zfs_btree_index_t idx; + mzap_ent_t *mze = mze_find(zn, &idx); if (mze == NULL) { err = SET_ERROR(ENOENT); } else { if (integer_size) *integer_size = 8; if (num_integers) *num_integers = 1; } } zap_name_free(zn); zap_unlockdir(zap, FTAG); return (err); } int zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, uint64_t *integer_size, uint64_t *num_integers) { zap_t *zap; int err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); if (err != 0) return (err); zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); if (zn == NULL) { zap_unlockdir(zap, FTAG); return (SET_ERROR(ENOTSUP)); } err = fzap_length(zn, integer_size, num_integers); zap_name_free(zn); zap_unlockdir(zap, FTAG); return (err); } static void mzap_addent(zap_name_t *zn, uint64_t value) { zap_t *zap = zn->zn_zap; - int start = zap->zap_m.zap_alloc_next; + uint16_t start = zap->zap_m.zap_alloc_next; ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); #ifdef ZFS_DEBUG for (int i = 0; i < zap->zap_m.zap_num_chunks; i++) { mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i]; ASSERT(strcmp(zn->zn_key_orig, mze->mze_name) != 0); } #endif uint32_t cd = mze_find_unused_cd(zap, zn->zn_hash); /* given the limited size of the microzap, this can't happen */ ASSERT(cd < zap_maxcd(zap)); again: - for (int i = start; i < zap->zap_m.zap_num_chunks; i++) { + for (uint16_t i = start; i < zap->zap_m.zap_num_chunks; i++) { mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i]; if (mze->mze_name[0] == 0) { 
mze->mze_value = value; mze->mze_cd = cd; (void) strlcpy(mze->mze_name, zn->zn_key_orig, sizeof (mze->mze_name)); zap->zap_m.zap_num_entries++; zap->zap_m.zap_alloc_next = i+1; if (zap->zap_m.zap_alloc_next == zap->zap_m.zap_num_chunks) zap->zap_m.zap_alloc_next = 0; mze_insert(zap, i, zn->zn_hash); return; } } if (start != 0) { start = 0; goto again; } cmn_err(CE_PANIC, "out of entries!"); } static int zap_add_impl(zap_t *zap, const char *key, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx, void *tag) { const uint64_t *intval = val; int err = 0; - zap_name_t *zn = zap_name_alloc(zap, key, 0); + zap_name_t *zn = zap_name_alloc_str(zap, key, 0); if (zn == NULL) { zap_unlockdir(zap, tag); return (SET_ERROR(ENOTSUP)); } if (!zap->zap_ismicro) { err = fzap_add(zn, integer_size, num_integers, val, tag, tx); zap = zn->zn_zap; /* fzap_add() may change zap */ } else if (integer_size != 8 || num_integers != 1 || strlen(key) >= MZAP_NAME_LEN || !mze_canfit_fzap_leaf(zn, zn->zn_hash)) { err = mzap_upgrade(&zn->zn_zap, tag, tx, 0); if (err == 0) { err = fzap_add(zn, integer_size, num_integers, val, tag, tx); } zap = zn->zn_zap; /* fzap_add() may change zap */ } else { - if (mze_find(zn) != NULL) { + zfs_btree_index_t idx; + if (mze_find(zn, &idx) != NULL) { err = SET_ERROR(EEXIST); } else { mzap_addent(zn, *intval); } } ASSERT(zap == zn->zn_zap); zap_name_free(zn); if (zap != NULL) /* may be NULL if fzap_add() failed */ zap_unlockdir(zap, tag); return (err); } int zap_add(objset_t *os, uint64_t zapobj, const char *key, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) { zap_t *zap; int err; err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); if (err != 0) return (err); err = zap_add_impl(zap, key, integer_size, num_integers, val, tx, FTAG); /* zap_add_impl() calls zap_unlockdir() */ return (err); } int zap_add_by_dnode(dnode_t *dn, const char *key, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t 
*tx) { zap_t *zap; int err; err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); if (err != 0) return (err); err = zap_add_impl(zap, key, integer_size, num_integers, val, tx, FTAG); /* zap_add_impl() calls zap_unlockdir() */ return (err); } int zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) { zap_t *zap; int err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); if (err != 0) return (err); zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); if (zn == NULL) { zap_unlockdir(zap, FTAG); return (SET_ERROR(ENOTSUP)); } err = fzap_add(zn, integer_size, num_integers, val, FTAG, tx); zap = zn->zn_zap; /* fzap_add() may change zap */ zap_name_free(zn); if (zap != NULL) /* may be NULL if fzap_add() failed */ zap_unlockdir(zap, FTAG); return (err); } int zap_update(objset_t *os, uint64_t zapobj, const char *name, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) { zap_t *zap; const uint64_t *intval = val; int err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); if (err != 0) return (err); - zap_name_t *zn = zap_name_alloc(zap, name, 0); + zap_name_t *zn = zap_name_alloc_str(zap, name, 0); if (zn == NULL) { zap_unlockdir(zap, FTAG); return (SET_ERROR(ENOTSUP)); } if (!zap->zap_ismicro) { err = fzap_update(zn, integer_size, num_integers, val, FTAG, tx); zap = zn->zn_zap; /* fzap_update() may change zap */ } else if (integer_size != 8 || num_integers != 1 || strlen(name) >= MZAP_NAME_LEN) { dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n", (u_longlong_t)zapobj, integer_size, (u_longlong_t)num_integers, name); err = mzap_upgrade(&zn->zn_zap, FTAG, tx, 0); if (err == 0) { err = fzap_update(zn, integer_size, num_integers, val, FTAG, tx); } zap = zn->zn_zap; /* fzap_update() may change zap */ } else { - mzap_ent_t *mze = mze_find(zn); + zfs_btree_index_t idx; + mzap_ent_t *mze = 
mze_find(zn, &idx); if (mze != NULL) { MZE_PHYS(zap, mze)->mze_value = *intval; } else { mzap_addent(zn, *intval); } } ASSERT(zap == zn->zn_zap); zap_name_free(zn); if (zap != NULL) /* may be NULL if fzap_upgrade() failed */ zap_unlockdir(zap, FTAG); return (err); } int zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) { zap_t *zap; int err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); if (err != 0) return (err); zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); if (zn == NULL) { zap_unlockdir(zap, FTAG); return (SET_ERROR(ENOTSUP)); } err = fzap_update(zn, integer_size, num_integers, val, FTAG, tx); zap = zn->zn_zap; /* fzap_update() may change zap */ zap_name_free(zn); if (zap != NULL) /* may be NULL if fzap_upgrade() failed */ zap_unlockdir(zap, FTAG); return (err); } int zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx) { return (zap_remove_norm(os, zapobj, name, 0, tx)); } static int zap_remove_impl(zap_t *zap, const char *name, matchtype_t mt, dmu_tx_t *tx) { int err = 0; - zap_name_t *zn = zap_name_alloc(zap, name, mt); + zap_name_t *zn = zap_name_alloc_str(zap, name, mt); if (zn == NULL) return (SET_ERROR(ENOTSUP)); if (!zap->zap_ismicro) { err = fzap_remove(zn, tx); } else { - mzap_ent_t *mze = mze_find(zn); + zfs_btree_index_t idx; + mzap_ent_t *mze = mze_find(zn, &idx); if (mze == NULL) { err = SET_ERROR(ENOENT); } else { zap->zap_m.zap_num_entries--; - bzero(&zap_m_phys(zap)->mz_chunk[mze->mze_chunkid], - sizeof (mzap_ent_phys_t)); - mze_remove(zap, mze); + memset(MZE_PHYS(zap, mze), 0, sizeof (mzap_ent_phys_t)); + zfs_btree_remove_idx(&zap->zap_m.zap_tree, &idx); } } zap_name_free(zn); return (err); } int zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name, matchtype_t mt, dmu_tx_t *tx) { zap_t *zap; int err; err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, 
&zap); if (err) return (err); err = zap_remove_impl(zap, name, mt, tx); zap_unlockdir(zap, FTAG); return (err); } int zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx) { zap_t *zap; int err; err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap); if (err) return (err); err = zap_remove_impl(zap, name, 0, tx); zap_unlockdir(zap, FTAG); return (err); } int zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, dmu_tx_t *tx) { zap_t *zap; int err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap); if (err != 0) return (err); zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); if (zn == NULL) { zap_unlockdir(zap, FTAG); return (SET_ERROR(ENOTSUP)); } err = fzap_remove(zn, tx); zap_name_free(zn); zap_unlockdir(zap, FTAG); return (err); } /* * Routines for iterating over the attributes. */ static void zap_cursor_init_impl(zap_cursor_t *zc, objset_t *os, uint64_t zapobj, uint64_t serialized, boolean_t prefetch) { zc->zc_objset = os; zc->zc_zap = NULL; zc->zc_leaf = NULL; zc->zc_zapobj = zapobj; zc->zc_serialized = serialized; zc->zc_hash = 0; zc->zc_cd = 0; zc->zc_prefetch = prefetch; } void zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj, uint64_t serialized) { zap_cursor_init_impl(zc, os, zapobj, serialized, B_TRUE); } /* * Initialize a cursor at the beginning of the ZAP object. The entire * ZAP object will be prefetched. */ void zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj) { zap_cursor_init_impl(zc, os, zapobj, 0, B_TRUE); } /* * Initialize a cursor at the beginning, but request that we not prefetch * the entire ZAP object. 
*/ void zap_cursor_init_noprefetch(zap_cursor_t *zc, objset_t *os, uint64_t zapobj) { zap_cursor_init_impl(zc, os, zapobj, 0, B_FALSE); } void zap_cursor_fini(zap_cursor_t *zc) { if (zc->zc_zap) { rw_enter(&zc->zc_zap->zap_rwlock, RW_READER); zap_unlockdir(zc->zc_zap, NULL); zc->zc_zap = NULL; } if (zc->zc_leaf) { rw_enter(&zc->zc_leaf->l_rwlock, RW_READER); zap_put_leaf(zc->zc_leaf); zc->zc_leaf = NULL; } zc->zc_objset = NULL; } uint64_t zap_cursor_serialize(zap_cursor_t *zc) { if (zc->zc_hash == -1ULL) return (-1ULL); if (zc->zc_zap == NULL) return (zc->zc_serialized); ASSERT((zc->zc_hash & zap_maxcd(zc->zc_zap)) == 0); ASSERT(zc->zc_cd < zap_maxcd(zc->zc_zap)); /* * We want to keep the high 32 bits of the cursor zero if we can, so * that 32-bit programs can access this. So usually use a small * (28-bit) hash value so we can fit 4 bits of cd into the low 32-bits * of the cursor. * * [ collision differentiator | zap_hashbits()-bit hash value ] */ return ((zc->zc_hash >> (64 - zap_hashbits(zc->zc_zap))) | ((uint64_t)zc->zc_cd << zap_hashbits(zc->zc_zap))); } int zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za) { int err; if (zc->zc_hash == -1ULL) return (SET_ERROR(ENOENT)); if (zc->zc_zap == NULL) { int hb; err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL, RW_READER, TRUE, FALSE, NULL, &zc->zc_zap); if (err != 0) return (err); /* * To support zap_cursor_init_serialized, advance, retrieve, * we must add to the existing zc_cd, which may already * be 1 due to the zap_cursor_advance. 
*/ ASSERT(zc->zc_hash == 0); hb = zap_hashbits(zc->zc_zap); zc->zc_hash = zc->zc_serialized << (64 - hb); zc->zc_cd += zc->zc_serialized >> hb; if (zc->zc_cd >= zap_maxcd(zc->zc_zap)) /* corrupt serialized */ zc->zc_cd = 0; } else { rw_enter(&zc->zc_zap->zap_rwlock, RW_READER); } if (!zc->zc_zap->zap_ismicro) { err = fzap_cursor_retrieve(zc->zc_zap, zc, za); } else { - avl_index_t idx; + zfs_btree_index_t idx; mzap_ent_t mze_tofind; - mze_tofind.mze_hash = zc->zc_hash; + mze_tofind.mze_hash = zc->zc_hash >> 32; mze_tofind.mze_cd = zc->zc_cd; - mzap_ent_t *mze = - avl_find(&zc->zc_zap->zap_m.zap_avl, &mze_tofind, &idx); + mzap_ent_t *mze = zfs_btree_find(&zc->zc_zap->zap_m.zap_tree, + &mze_tofind, &idx); if (mze == NULL) { - mze = avl_nearest(&zc->zc_zap->zap_m.zap_avl, - idx, AVL_AFTER); + mze = zfs_btree_next(&zc->zc_zap->zap_m.zap_tree, + &idx, &idx); } if (mze) { mzap_ent_phys_t *mzep = MZE_PHYS(zc->zc_zap, mze); ASSERT3U(mze->mze_cd, ==, mzep->mze_cd); za->za_normalization_conflict = - mzap_normalization_conflict(zc->zc_zap, NULL, mze); + mzap_normalization_conflict(zc->zc_zap, NULL, + mze, &idx); za->za_integer_length = 8; za->za_num_integers = 1; za->za_first_integer = mzep->mze_value; (void) strlcpy(za->za_name, mzep->mze_name, sizeof (za->za_name)); - zc->zc_hash = mze->mze_hash; + zc->zc_hash = (uint64_t)mze->mze_hash << 32; zc->zc_cd = mze->mze_cd; err = 0; } else { zc->zc_hash = -1ULL; err = SET_ERROR(ENOENT); } } rw_exit(&zc->zc_zap->zap_rwlock); return (err); } void zap_cursor_advance(zap_cursor_t *zc) { if (zc->zc_hash == -1ULL) return; zc->zc_cd++; } int zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs) { zap_t *zap; int err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); if (err != 0) return (err); bzero(zs, sizeof (zap_stats_t)); if (zap->zap_ismicro) { zs->zs_blocksize = zap->zap_dbuf->db_size; zs->zs_num_entries = zap->zap_m.zap_num_entries; zs->zs_num_blocks = 1; } else { fzap_get_stats(zap, zs); } 
zap_unlockdir(zap, FTAG); return (0); } #if defined(_KERNEL) EXPORT_SYMBOL(zap_create); EXPORT_SYMBOL(zap_create_dnsize); EXPORT_SYMBOL(zap_create_norm); EXPORT_SYMBOL(zap_create_norm_dnsize); EXPORT_SYMBOL(zap_create_flags); EXPORT_SYMBOL(zap_create_flags_dnsize); EXPORT_SYMBOL(zap_create_claim); EXPORT_SYMBOL(zap_create_claim_norm); EXPORT_SYMBOL(zap_create_claim_norm_dnsize); EXPORT_SYMBOL(zap_create_hold); EXPORT_SYMBOL(zap_destroy); EXPORT_SYMBOL(zap_lookup); EXPORT_SYMBOL(zap_lookup_by_dnode); EXPORT_SYMBOL(zap_lookup_norm); EXPORT_SYMBOL(zap_lookup_uint64); EXPORT_SYMBOL(zap_contains); EXPORT_SYMBOL(zap_prefetch); EXPORT_SYMBOL(zap_prefetch_uint64); EXPORT_SYMBOL(zap_add); EXPORT_SYMBOL(zap_add_by_dnode); EXPORT_SYMBOL(zap_add_uint64); EXPORT_SYMBOL(zap_update); EXPORT_SYMBOL(zap_update_uint64); EXPORT_SYMBOL(zap_length); EXPORT_SYMBOL(zap_length_uint64); EXPORT_SYMBOL(zap_remove); EXPORT_SYMBOL(zap_remove_by_dnode); EXPORT_SYMBOL(zap_remove_norm); EXPORT_SYMBOL(zap_remove_uint64); EXPORT_SYMBOL(zap_count); EXPORT_SYMBOL(zap_value_search); EXPORT_SYMBOL(zap_join); EXPORT_SYMBOL(zap_join_increment); EXPORT_SYMBOL(zap_add_int); EXPORT_SYMBOL(zap_remove_int); EXPORT_SYMBOL(zap_lookup_int); EXPORT_SYMBOL(zap_increment_int); EXPORT_SYMBOL(zap_add_int_key); EXPORT_SYMBOL(zap_lookup_int_key); EXPORT_SYMBOL(zap_increment); EXPORT_SYMBOL(zap_cursor_init); EXPORT_SYMBOL(zap_cursor_fini); EXPORT_SYMBOL(zap_cursor_retrieve); EXPORT_SYMBOL(zap_cursor_advance); EXPORT_SYMBOL(zap_cursor_serialize); EXPORT_SYMBOL(zap_cursor_init_serialized); EXPORT_SYMBOL(zap_get_stats); #endif