diff --git a/sys/kern/subr_pctrie.c b/sys/kern/subr_pctrie.c
index 4cd7f4b085ba..6d45d9762a78 100644
--- a/sys/kern/subr_pctrie.c
+++ b/sys/kern/subr_pctrie.c
@@ -1,735 +1,728 @@
/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2013 EMC Corp.
 * Copyright (c) 2011 Jeffrey Roberson
 * Copyright (c) 2008 Mayur Shardul
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

/*
 * Path-compressed radix trie implementation.
 *
 * The implementation takes into account the following rationale:
 * - Size of the nodes should be as small as possible but still big enough
 *   to avoid a large maximum depth for the trie.  This is a balance
 *   between the necessity to not wire too much physical memory for the nodes
 *   and the necessity to avoid too much cache pollution during the trie
 *   operations.
 * - There is not a huge bias toward the number of lookup operations over
 *   the number of insert and remove operations.  This basically implies
 *   that optimizations supposedly helping one operation but hurting the
 *   other must be carefully evaluated.
 * - On average not many nodes are expected to be fully populated, hence
 *   level compression may just complicate things.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_ddb.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/libkern.h>
#include <sys/pctrie.h>
#include <sys/proc.h>	/* smr.h depends on struct thread. */
#include <sys/smr.h>
#include <sys/smr_types.h>

#ifdef DDB
#include <ddb/ddb.h>
#endif

#define	PCTRIE_MASK	(PCTRIE_COUNT - 1)
#define	PCTRIE_LIMIT	(howmany(sizeof(uint64_t) * NBBY, PCTRIE_WIDTH) - 1)

#if PCTRIE_WIDTH == 3
typedef uint8_t pn_popmap_t;
#elif PCTRIE_WIDTH == 4
typedef uint16_t pn_popmap_t;
#elif PCTRIE_WIDTH == 5
typedef uint32_t pn_popmap_t;
#else
#error Unsupported width
#endif
_Static_assert(sizeof(pn_popmap_t) <= sizeof(int), "pn_popmap_t too wide");

-/* Flag bits stored in node pointers. */
-#define	PCTRIE_ISLEAF	0x1
-#define	PCTRIE_FLAGS	0x1
+/* Set of all flag bits stored in node pointers. */
+#define	PCTRIE_FLAGS	(PCTRIE_ISLEAF)
#define	PCTRIE_PAD	PCTRIE_FLAGS

/* Returns one unit associated with specified level. */
#define	PCTRIE_UNITLEVEL(lev)					\
	((uint64_t)1 << ((lev) * PCTRIE_WIDTH))

struct pctrie_node;
typedef SMR_POINTER(struct pctrie_node *) smr_pctnode_t;

struct pctrie_node {
	uint64_t	pn_owner;		/* Owner of record. */
	pn_popmap_t	pn_popmap;		/* Valid children. */
	uint8_t		pn_clev;		/* Current level.
*/ smr_pctnode_t pn_child[PCTRIE_COUNT]; /* Child nodes. */ }; enum pctrie_access { PCTRIE_SMR, PCTRIE_LOCKED, PCTRIE_UNSERIALIZED }; static __inline void pctrie_node_store(smr_pctnode_t *p, void *val, enum pctrie_access access); /* * Return the position in the array for a given level. */ static __inline int pctrie_slot(uint64_t index, uint16_t level) { return ((index >> (level * PCTRIE_WIDTH)) & PCTRIE_MASK); } /* Computes the key (index) with the low-order 'level' radix-digits zeroed. */ static __inline uint64_t pctrie_trimkey(uint64_t index, uint16_t level) { return (index & -PCTRIE_UNITLEVEL(level)); } /* * Allocate a node. Pre-allocation should ensure that the request * will always be satisfied. */ static struct pctrie_node * pctrie_node_get(struct pctrie *ptree, pctrie_alloc_t allocfn, uint64_t index, uint16_t clevel) { struct pctrie_node *node; node = allocfn(ptree); if (node == NULL) return (NULL); /* * We want to clear the last child pointer after the final section * has exited so lookup can not return false negatives. It is done * here because it will be cache-cold in the dtor callback. */ if (node->pn_popmap != 0) { pctrie_node_store(&node->pn_child[ffs(node->pn_popmap) - 1], - NULL, PCTRIE_UNSERIALIZED); + PCTRIE_NULL, PCTRIE_UNSERIALIZED); node->pn_popmap = 0; } node->pn_owner = pctrie_trimkey(index, clevel + 1); node->pn_clev = clevel; return (node); } /* * Free radix node. */ static __inline void pctrie_node_put(struct pctrie *ptree, struct pctrie_node *node, pctrie_free_t freefn) { #ifdef INVARIANTS int slot; KASSERT(powerof2(node->pn_popmap), ("pctrie_node_put: node %p has too many children %04x", node, node->pn_popmap)); for (slot = 0; slot < PCTRIE_COUNT; slot++) { if ((node->pn_popmap & (1 << slot)) != 0) continue; KASSERT(smr_unserialized_load(&node->pn_child[slot], true) == - NULL, ("pctrie_node_put: node %p has a child", node)); + PCTRIE_NULL, + ("pctrie_node_put: node %p has a child", node)); } #endif freefn(ptree, node); } /* * Fetch a node pointer from a slot. */ static __inline struct pctrie_node * pctrie_node_load(smr_pctnode_t *p, smr_t smr, enum pctrie_access access) { switch (access) { case PCTRIE_UNSERIALIZED: return (smr_unserialized_load(p, true)); case PCTRIE_LOCKED: return (smr_serialized_load(p, true)); case PCTRIE_SMR: return (smr_entered_load(p, smr)); } __assert_unreachable(); } static __inline void pctrie_node_store(smr_pctnode_t *p, void *v, enum pctrie_access access) { switch (access) { case PCTRIE_UNSERIALIZED: smr_unserialized_store(p, v, true); break; case PCTRIE_LOCKED: smr_serialized_store(p, v, true); break; case PCTRIE_SMR: panic("%s: Not supported in SMR section.", __func__); break; default: __assert_unreachable(); break; } } /* * Get the root node for a tree. */ static __inline struct pctrie_node * pctrie_root_load(struct pctrie *ptree, smr_t smr, enum pctrie_access access) { return (pctrie_node_load((smr_pctnode_t *)&ptree->pt_root, smr, access)); } /* * Set the root node for a tree. */ static __inline void pctrie_root_store(struct pctrie *ptree, struct pctrie_node *node, enum pctrie_access access) { pctrie_node_store((smr_pctnode_t *)&ptree->pt_root, node, access); } /* * Returns TRUE if the specified node is a leaf and FALSE otherwise. */ static __inline bool pctrie_isleaf(struct pctrie_node *node) { return (((uintptr_t)node & PCTRIE_ISLEAF) != 0); } /* * Returns val with leaf bit set. 
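 * For example (addresses hypothetical): with PCTRIE_ISLEAF == 0x1, a
 * 4-byte-aligned value pointer 0x1000 becomes the leaf 0x1001, and
 * pctrie_toval() masks the flag off again to recover 0x1000.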
*/ static __inline void * pctrie_toleaf(uint64_t *val) { return ((void *)((uintptr_t)val | PCTRIE_ISLEAF)); } /* * Returns the associated val extracted from node. */ static __inline uint64_t * pctrie_toval(struct pctrie_node *node) { return ((uint64_t *)((uintptr_t)node & ~PCTRIE_FLAGS)); } /* * Make 'child' a child of 'node'. */ static __inline void pctrie_addnode(struct pctrie_node *node, uint64_t index, uint16_t clev, struct pctrie_node *child, enum pctrie_access access) { int slot; slot = pctrie_slot(index, clev); pctrie_node_store(&node->pn_child[slot], child, access); node->pn_popmap ^= 1 << slot; KASSERT((node->pn_popmap & (1 << slot)) != 0, ("%s: bad popmap slot %d in node %p", __func__, slot, node)); } /* * Returns the level where two keys differ. * It cannot accept 2 equal keys. */ static __inline uint16_t pctrie_keydiff(uint64_t index1, uint64_t index2) { KASSERT(index1 != index2, ("%s: passing the same key value %jx", __func__, (uintmax_t)index1)); CTASSERT(sizeof(long long) >= sizeof(uint64_t)); /* * From the highest-order bit where the indexes differ, * compute the highest level in the trie where they differ. */ return ((flsll(index1 ^ index2) - 1) / PCTRIE_WIDTH); } /* * Returns TRUE if it can be determined that key does not belong to the * specified node. Otherwise, returns FALSE. */ static __inline bool pctrie_keybarr(struct pctrie_node *node, uint64_t idx) { if (node->pn_clev < PCTRIE_LIMIT) { idx = pctrie_trimkey(idx, node->pn_clev + 1); return (idx != node->pn_owner); } return (false); } /* * Internal helper for pctrie_reclaim_allnodes(). * This function is recursive. */ static void pctrie_reclaim_allnodes_int(struct pctrie *ptree, struct pctrie_node *node, pctrie_free_t freefn) { struct pctrie_node *child; int slot; while (node->pn_popmap != 0) { slot = ffs(node->pn_popmap) - 1; child = pctrie_node_load(&node->pn_child[slot], NULL, PCTRIE_UNSERIALIZED); - KASSERT(child != NULL, ("%s: bad popmap slot %d in node %p", + KASSERT(child != PCTRIE_NULL, + ("%s: bad popmap slot %d in node %p", __func__, slot, node)); if (!pctrie_isleaf(child)) pctrie_reclaim_allnodes_int(ptree, child, freefn); node->pn_popmap ^= 1 << slot; - pctrie_node_store(&node->pn_child[slot], NULL, + pctrie_node_store(&node->pn_child[slot], PCTRIE_NULL, PCTRIE_UNSERIALIZED); } pctrie_node_put(ptree, node, freefn); } /* * pctrie node zone initializer. */ int pctrie_zone_init(void *mem, int size __unused, int flags __unused) { struct pctrie_node *node; node = mem; node->pn_popmap = 0; - memset(node->pn_child, 0, sizeof(node->pn_child)); + for (int i = 0; i < nitems(node->pn_child); i++) + pctrie_node_store(&node->pn_child[i], PCTRIE_NULL, + PCTRIE_UNSERIALIZED); return (0); } size_t pctrie_node_size(void) { return (sizeof(struct pctrie_node)); } /* * Inserts the key-value pair into the trie. * Panics if the key already exists. */ int pctrie_insert(struct pctrie *ptree, uint64_t *val, pctrie_alloc_t allocfn) { uint64_t index, newind; - struct pctrie_node *leaf, *node, *tmp; + struct pctrie_node *leaf, *node, *parent; smr_pctnode_t *parentp; int slot; uint16_t clev; index = *val; leaf = pctrie_toleaf(val); /* * The owner of record for root is not really important because it * will never be used. 
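 *
 * For example, with PCTRIE_WIDTH == 4, inserting keys 0x25 and 0x29 into an
 * empty trie stores the first leaf directly in the root; the second insert
 * computes clev = pctrie_keydiff(0x25, 0x29) == 0, allocates one node with
 * pn_owner == 0x20, and files the two leaves under slots 5 and 9.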
*/ node = pctrie_root_load(ptree, NULL, PCTRIE_LOCKED); - if (node == NULL) { - ptree->pt_root = (uintptr_t)leaf; - return (0); - } - for (parentp = (smr_pctnode_t *)&ptree->pt_root;; node = tmp) { + parent = NULL; + for (;;) { if (pctrie_isleaf(node)) { + if (node == PCTRIE_NULL) { + if (parent == NULL) + ptree->pt_root = leaf; + else + pctrie_addnode(parent, index, + parent->pn_clev, leaf, + PCTRIE_LOCKED); + return (0); + } newind = *pctrie_toval(node); if (newind == index) panic("%s: key %jx is already present", __func__, (uintmax_t)index); break; - } else if (pctrie_keybarr(node, index)) { + } + if (pctrie_keybarr(node, index)) { newind = node->pn_owner; break; } slot = pctrie_slot(index, node->pn_clev); - parentp = &node->pn_child[slot]; - tmp = pctrie_node_load(parentp, NULL, PCTRIE_LOCKED); - if (tmp == NULL) { - pctrie_addnode(node, index, node->pn_clev, leaf, - PCTRIE_LOCKED); - return (0); - } + parent = node; + node = pctrie_node_load(&node->pn_child[slot], NULL, + PCTRIE_LOCKED); } /* * A new node is needed because the right insertion level is reached. * Setup the new intermediate node and add the 2 children: the * new object and the older edge or object. */ + parentp = (parent != NULL) ? &parent->pn_child[slot]: + (smr_pctnode_t *)&ptree->pt_root; clev = pctrie_keydiff(newind, index); - tmp = pctrie_node_get(ptree, allocfn, index, clev); - if (tmp == NULL) + parent = pctrie_node_get(ptree, allocfn, index, clev); + if (parent == NULL) return (ENOMEM); /* These writes are not yet visible due to ordering. */ - pctrie_addnode(tmp, index, clev, leaf, PCTRIE_UNSERIALIZED); - pctrie_addnode(tmp, newind, clev, node, PCTRIE_UNSERIALIZED); + pctrie_addnode(parent, index, clev, leaf, PCTRIE_UNSERIALIZED); + pctrie_addnode(parent, newind, clev, node, PCTRIE_UNSERIALIZED); /* Synchronize to make the above visible. */ - pctrie_node_store(parentp, tmp, PCTRIE_LOCKED); + pctrie_node_store(parentp, parent, PCTRIE_LOCKED); return (0); } /* * Returns the value stored at the index. If the index is not present, * NULL is returned. */ static __always_inline uint64_t * _pctrie_lookup(struct pctrie *ptree, uint64_t index, smr_t smr, enum pctrie_access access) { struct pctrie_node *node; uint64_t *m; int slot; node = pctrie_root_load(ptree, smr, access); - while (node != NULL) { + for (;;) { if (pctrie_isleaf(node)) { - m = pctrie_toval(node); - if (*m == index) + if ((m = pctrie_toval(node)) != NULL && *m == index) return (m); break; } if (pctrie_keybarr(node, index)) break; slot = pctrie_slot(index, node->pn_clev); node = pctrie_node_load(&node->pn_child[slot], smr, access); } return (NULL); } /* * Returns the value stored at the index, assuming access is externally * synchronized by a lock. * * If the index is not present, NULL is returned. */ uint64_t * pctrie_lookup(struct pctrie *ptree, uint64_t index) { return (_pctrie_lookup(ptree, index, NULL, PCTRIE_LOCKED)); } /* * Returns the value stored at the index without requiring an external lock. * * If the index is not present, NULL is returned. */ uint64_t * pctrie_lookup_unlocked(struct pctrie *ptree, uint64_t index, smr_t smr) { uint64_t *res; smr_enter(smr); res = _pctrie_lookup(ptree, index, smr, PCTRIE_SMR); smr_exit(smr); return (res); } /* * Returns the value with the least index that is greater than or equal to the * specified index, or NULL if there are no such values. * * Requires that access be externally synchronized by a lock. 
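 *
 * A consumer generated by PCTRIE_DEFINE() typically brackets the call with
 * its own lock; a sketch with a hypothetical "FOO" consumer:
 *
 *	mtx_lock(&sc->sc_lock);
 *	rec = FOO_PCTRIE_LOOKUP_GE(&sc->sc_trie, key);
 *	mtx_unlock(&sc->sc_lock);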
*/ uint64_t * pctrie_lookup_ge(struct pctrie *ptree, uint64_t index) { struct pctrie_node *node, *succ; uint64_t *m; int slot; /* * Descend the trie as if performing an ordinary lookup for the * specified value. However, unlike an ordinary lookup, as we descend * the trie, we use "succ" to remember the last branching-off point, * that is, the interior node under which the least value that is both * outside our current path down the trie and greater than the specified * index resides. (The node's popmap makes it fast and easy to * recognize a branching-off point.) If our ordinary lookup fails to * yield a value that is greater than or equal to the specified index, * then we will exit this loop and perform a lookup starting from * "succ". If "succ" is not NULL, then that lookup is guaranteed to * succeed. */ node = pctrie_root_load(ptree, NULL, PCTRIE_LOCKED); succ = NULL; - while (node != NULL) { + for (;;) { if (pctrie_isleaf(node)) { - m = pctrie_toval(node); - if (*m >= index) + if ((m = pctrie_toval(node)) != NULL && *m >= index) return (m); break; } if (pctrie_keybarr(node, index)) { /* * If all values in this subtree are > index, then the * least value in this subtree is the answer. */ if (node->pn_owner > index) succ = node; break; } slot = pctrie_slot(index, node->pn_clev); /* * Just in case the next search step leads to a subtree of all * values < index, check popmap to see if a next bigger step, to * a subtree of all pages with values > index, is available. If * so, remember to restart the search here. */ if ((node->pn_popmap >> slot) > 1) succ = node; node = pctrie_node_load(&node->pn_child[slot], NULL, PCTRIE_LOCKED); } /* * Restart the search from the last place visited in the subtree that * included some values > index, if there was such a place. */ if (succ == NULL) return (NULL); if (succ != node) { /* * Take a step to the next bigger sibling of the node chosen * last time. In that subtree, all values > index. */ slot = pctrie_slot(index, succ->pn_clev) + 1; KASSERT((succ->pn_popmap >> slot) != 0, ("%s: no popmap siblings past slot %d in node %p", __func__, slot, succ)); slot += ffs(succ->pn_popmap >> slot) - 1; succ = pctrie_node_load(&succ->pn_child[slot], NULL, PCTRIE_LOCKED); } /* * Find the value in the subtree rooted at "succ" with the least index. */ while (!pctrie_isleaf(succ)) { KASSERT(succ->pn_popmap != 0, ("%s: no popmap children in node %p", __func__, succ)); slot = ffs(succ->pn_popmap) - 1; succ = pctrie_node_load(&succ->pn_child[slot], NULL, PCTRIE_LOCKED); } return (pctrie_toval(succ)); } /* * Returns the value with the greatest index that is less than or equal to the * specified index, or NULL if there are no such values. * * Requires that access be externally synchronized by a lock. */ uint64_t * pctrie_lookup_le(struct pctrie *ptree, uint64_t index) { struct pctrie_node *node, *pred; uint64_t *m; int slot; /* * Mirror the implementation of pctrie_lookup_ge, described above. 
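 *
 * For example, at a node with pn_popmap == 0x0062 (slots 1, 5 and 6
 * populated) and slot == 5, pn_popmap & 0x1f == 0x02, so a lesser sibling
 * exists and the node is remembered in "pred"; if the search returns here,
 * fls(0x02) - 1 == 1 selects slot 1 as the subtree of lesser values.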
*/ node = pctrie_root_load(ptree, NULL, PCTRIE_LOCKED); pred = NULL; - while (node != NULL) { + for (;;) { if (pctrie_isleaf(node)) { - m = pctrie_toval(node); - if (*m <= index) + if ((m = pctrie_toval(node)) != NULL && *m <= index) return (m); break; } if (pctrie_keybarr(node, index)) { if (node->pn_owner < index) pred = node; break; } slot = pctrie_slot(index, node->pn_clev); if ((node->pn_popmap & ((1 << slot) - 1)) != 0) pred = node; node = pctrie_node_load(&node->pn_child[slot], NULL, PCTRIE_LOCKED); } if (pred == NULL) return (NULL); if (pred != node) { slot = pctrie_slot(index, pred->pn_clev); KASSERT((pred->pn_popmap & ((1 << slot) - 1)) != 0, ("%s: no popmap siblings before slot %d in node %p", __func__, slot, pred)); slot = fls(pred->pn_popmap & ((1 << slot) - 1)) - 1; pred = pctrie_node_load(&pred->pn_child[slot], NULL, PCTRIE_LOCKED); } while (!pctrie_isleaf(pred)) { KASSERT(pred->pn_popmap != 0, ("%s: no popmap children in node %p", __func__, pred)); slot = fls(pred->pn_popmap) - 1; pred = pctrie_node_load(&pred->pn_child[slot], NULL, PCTRIE_LOCKED); } return (pctrie_toval(pred)); } /* * Remove the specified index from the tree. * Panics if the key is not present. */ void pctrie_remove(struct pctrie *ptree, uint64_t index, pctrie_free_t freefn) { - struct pctrie_node *node, *parent, *tmp; + struct pctrie_node *child, *node, *parent; uint64_t *m; int slot; - node = pctrie_root_load(ptree, NULL, PCTRIE_LOCKED); - if (pctrie_isleaf(node)) { - m = pctrie_toval(node); - if (*m != index) - panic("%s: invalid key found", __func__); - pctrie_root_store(ptree, NULL, PCTRIE_LOCKED); - return; - } - parent = NULL; + node = NULL; + child = pctrie_root_load(ptree, NULL, PCTRIE_LOCKED); for (;;) { - if (node == NULL) - panic("pctrie_remove: impossible to locate the key"); - slot = pctrie_slot(index, node->pn_clev); - tmp = pctrie_node_load(&node->pn_child[slot], NULL, - PCTRIE_LOCKED); - if (pctrie_isleaf(tmp)) { - m = pctrie_toval(tmp); - if (*m != index) - panic("%s: invalid key found", __func__); - KASSERT((node->pn_popmap & (1 << slot)) != 0, - ("%s: bad popmap slot %d in node %p", - __func__, slot, node)); - node->pn_popmap ^= 1 << slot; - pctrie_node_store(&node->pn_child[slot], NULL, - PCTRIE_LOCKED); - if (!powerof2(node->pn_popmap)) - break; - KASSERT(node->pn_popmap != 0, - ("%s: bad popmap all zeroes", __func__)); - slot = ffs(node->pn_popmap) - 1; - tmp = pctrie_node_load(&node->pn_child[slot], - NULL, PCTRIE_LOCKED); - KASSERT(tmp != NULL, - ("%s: bad popmap slot %d in node %p", - __func__, slot, node)); - if (parent == NULL) - pctrie_root_store(ptree, tmp, PCTRIE_LOCKED); - else { - slot = pctrie_slot(index, parent->pn_clev); - KASSERT(pctrie_node_load( - &parent->pn_child[slot], NULL, - PCTRIE_LOCKED) == node, - ("%s: invalid child value", __func__)); - pctrie_node_store(&parent->pn_child[slot], tmp, - PCTRIE_LOCKED); - } - /* - * The child is still valid and we can not zero the - * pointer until all SMR references are gone. 
- */ - pctrie_node_put(ptree, node, freefn); + if (pctrie_isleaf(child)) break; - } parent = node; - node = tmp; + node = child; + slot = pctrie_slot(index, node->pn_clev); + child = pctrie_node_load(&node->pn_child[slot], NULL, + PCTRIE_LOCKED); + } + if ((m = pctrie_toval(child)) == NULL || *m != index) + panic("%s: key not found", __func__); + if (node == NULL) { + pctrie_root_store(ptree, PCTRIE_NULL, PCTRIE_LOCKED); + return; + } + KASSERT((node->pn_popmap & (1 << slot)) != 0, + ("%s: bad popmap slot %d in node %p", + __func__, slot, node)); + node->pn_popmap ^= 1 << slot; + pctrie_node_store(&node->pn_child[slot], PCTRIE_NULL, PCTRIE_LOCKED); + if (!powerof2(node->pn_popmap)) + return; + KASSERT(node->pn_popmap != 0, ("%s: bad popmap all zeroes", __func__)); + slot = ffs(node->pn_popmap) - 1; + child = pctrie_node_load(&node->pn_child[slot], NULL, PCTRIE_LOCKED); + KASSERT(child != PCTRIE_NULL, + ("%s: bad popmap slot %d in node %p", __func__, slot, node)); + if (parent == NULL) + pctrie_root_store(ptree, child, PCTRIE_LOCKED); + else { + slot = pctrie_slot(index, parent->pn_clev); + KASSERT(node == + pctrie_node_load(&parent->pn_child[slot], NULL, + PCTRIE_LOCKED), ("%s: invalid child value", __func__)); + pctrie_node_store(&parent->pn_child[slot], child, + PCTRIE_LOCKED); } + /* + * The child is still valid and we can not zero the + * pointer until all SMR references are gone. + */ + pctrie_node_put(ptree, node, freefn); } /* * Remove and free all the nodes from the tree. * This function is recursive but there is a tight control on it as the * maximum depth of the tree is fixed. */ void pctrie_reclaim_allnodes(struct pctrie *ptree, pctrie_free_t freefn) { struct pctrie_node *root; root = pctrie_root_load(ptree, NULL, PCTRIE_LOCKED); - if (root == NULL) + if (root == PCTRIE_NULL) return; - pctrie_root_store(ptree, NULL, PCTRIE_UNSERIALIZED); + pctrie_root_store(ptree, PCTRIE_NULL, PCTRIE_UNSERIALIZED); if (!pctrie_isleaf(root)) pctrie_reclaim_allnodes_int(ptree, root, freefn); } #ifdef DDB /* * Show details about the given node. */ DB_SHOW_COMMAND(pctrienode, db_show_pctrienode) { struct pctrie_node *node, *tmp; int slot; pn_popmap_t popmap; if (!have_addr) return; node = (struct pctrie_node *)addr; db_printf("node %p, owner %jx, children popmap %04x, level %u:\n", (void *)node, (uintmax_t)node->pn_owner, node->pn_popmap, node->pn_clev); for (popmap = node->pn_popmap; popmap != 0; popmap ^= 1 << slot) { slot = ffs(popmap) - 1; tmp = pctrie_node_load(&node->pn_child[slot], NULL, PCTRIE_UNSERIALIZED); db_printf("slot: %d, val: %p, value: %p, clev: %d\n", slot, (void *)tmp, pctrie_isleaf(tmp) ? pctrie_toval(tmp) : NULL, node->pn_clev); } } #endif /* DDB */ diff --git a/sys/sys/_pctrie.h b/sys/sys/_pctrie.h index b7947889e176..bc2f6a781066 100644 --- a/sys/sys/_pctrie.h +++ b/sys/sys/_pctrie.h @@ -1,43 +1,48 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2013 EMC Corp. * Copyright (c) 2011 Jeffrey Roberson * Copyright (c) 2008 Mayur Shardul * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#ifndef __SYS_PCTRIE_H_
#define __SYS_PCTRIE_H_

+/*
+ * Radix tree node.
+ */
+struct pctrie_node;
+
/*
 * Radix tree root.
 */
struct pctrie {
-	uintptr_t	pt_root;
+	struct pctrie_node	*pt_root;
};

#endif /* !__SYS_PCTRIE_H_ */
diff --git a/sys/sys/pctrie.h b/sys/sys/pctrie.h
index 1968cb521fd6..687236cdd872 100644
--- a/sys/sys/pctrie.h
+++ b/sys/sys/pctrie.h
@@ -1,166 +1,172 @@
/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2013 EMC Corp.
 * Copyright (c) 2011 Jeffrey Roberson
 * Copyright (c) 2008 Mayur Shardul
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#ifndef _SYS_PCTRIE_H_
#define _SYS_PCTRIE_H_

#include <sys/_pctrie.h>
#include <sys/_smr.h>

#ifdef _KERNEL

#define	PCTRIE_DEFINE_SMR(name, type, field, allocfn, freefn, smr)	\
    PCTRIE_DEFINE(name, type, field, allocfn, freefn)			\
									\
static __inline struct type *						\
name##_PCTRIE_LOOKUP_UNLOCKED(struct pctrie *ptree, uint64_t key)	\
{									\
									\
	return name##_PCTRIE_VAL2PTR(pctrie_lookup_unlocked(ptree,	\
	    key, (smr)));						\
}									\

#define	PCTRIE_DEFINE(name, type, field, allocfn, freefn)		\
									\
CTASSERT(sizeof(((struct type *)0)->field) == sizeof(uint64_t));	\
/*									\
 * XXX This assert protects flag bits; it does not enforce natural	\
 * alignment.  32-bit architectures do not naturally align 64-bit fields.
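 * With 4-byte alignment the two low bits of &ptr->field are known to	\
 * be zero, so bit 0 can safely carry PCTRIE_ISLEAF in a leaf pointer.	\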
\ */ \ CTASSERT((__offsetof(struct type, field) & (sizeof(uint32_t) - 1)) == 0); \ \ static __inline struct type * \ name##_PCTRIE_VAL2PTR(uint64_t *val) \ { \ \ if (val == NULL) \ return (NULL); \ return (struct type *) \ ((uintptr_t)val - __offsetof(struct type, field)); \ } \ \ static __inline uint64_t * \ name##_PCTRIE_PTR2VAL(struct type *ptr) \ { \ \ return &ptr->field; \ } \ \ static __inline int \ name##_PCTRIE_INSERT(struct pctrie *ptree, struct type *ptr) \ { \ \ return pctrie_insert(ptree, name##_PCTRIE_PTR2VAL(ptr), \ allocfn); \ } \ \ static __inline struct type * \ name##_PCTRIE_LOOKUP(struct pctrie *ptree, uint64_t key) \ { \ \ return name##_PCTRIE_VAL2PTR(pctrie_lookup(ptree, key)); \ } \ \ static __inline __unused struct type * \ name##_PCTRIE_LOOKUP_LE(struct pctrie *ptree, uint64_t key) \ { \ \ return name##_PCTRIE_VAL2PTR(pctrie_lookup_le(ptree, key)); \ } \ \ static __inline __unused struct type * \ name##_PCTRIE_LOOKUP_GE(struct pctrie *ptree, uint64_t key) \ { \ \ return name##_PCTRIE_VAL2PTR(pctrie_lookup_ge(ptree, key)); \ } \ \ static __inline __unused void \ name##_PCTRIE_RECLAIM(struct pctrie *ptree) \ { \ \ pctrie_reclaim_allnodes(ptree, freefn); \ } \ \ static __inline void \ name##_PCTRIE_REMOVE(struct pctrie *ptree, uint64_t key) \ { \ \ pctrie_remove(ptree, key, freefn); \ } typedef void *(*pctrie_alloc_t)(struct pctrie *ptree); typedef void (*pctrie_free_t)(struct pctrie *ptree, void *node); int pctrie_insert(struct pctrie *ptree, uint64_t *val, pctrie_alloc_t allocfn); uint64_t *pctrie_lookup(struct pctrie *ptree, uint64_t key); uint64_t *pctrie_lookup_ge(struct pctrie *ptree, uint64_t key); uint64_t *pctrie_lookup_le(struct pctrie *ptree, uint64_t key); uint64_t *pctrie_lookup_unlocked(struct pctrie *ptree, uint64_t key, smr_t smr); void pctrie_reclaim_allnodes(struct pctrie *ptree, pctrie_free_t freefn); void pctrie_remove(struct pctrie *ptree, uint64_t key, pctrie_free_t freefn); size_t pctrie_node_size(void); int pctrie_zone_init(void *mem, int size, int flags); +/* + * Each search path in the trie terminates at a leaf, which is a pointer to a + * value marked with a set 1-bit. A leaf may be associated with a null pointer + * to indicate no value there. + */ +#define PCTRIE_ISLEAF 0x1 +#define PCTRIE_NULL (struct pctrie_node *)PCTRIE_ISLEAF + static __inline void pctrie_init(struct pctrie *ptree) { - - ptree->pt_root = 0; + ptree->pt_root = PCTRIE_NULL; } static __inline bool pctrie_is_empty(struct pctrie *ptree) { - - return (ptree->pt_root == 0); + return (ptree->pt_root == PCTRIE_NULL); } /* * These widths should allow the pointers to a node's children to fit within * a single cache line. The extra levels from a narrow width should not be * a problem thanks to path compression. */ #ifdef __LP64__ #define PCTRIE_WIDTH 4 #else #define PCTRIE_WIDTH 3 #endif #define PCTRIE_COUNT (1 << PCTRIE_WIDTH) #endif /* _KERNEL */ #endif /* !_SYS_PCTRIE_H_ */ diff --git a/sys/vm/_vm_radix.h b/sys/vm/_vm_radix.h index 77e364d1d238..26f112ee3ee8 100644 --- a/sys/vm/_vm_radix.h +++ b/sys/vm/_vm_radix.h @@ -1,43 +1,48 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2013 EMC Corp. * Copyright (c) 2011 Jeffrey Roberson * Copyright (c) 2008 Mayur Shardul * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. 
* 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef __VM_RADIX_H_ #define __VM_RADIX_H_ +/* + * Radix tree node. + */ +struct vm_radix_node; + /* * Radix tree root. */ struct vm_radix { - uintptr_t rt_root; + struct vm_radix_node *rt_root; }; #endif /* !__VM_RADIX_H_ */ diff --git a/sys/vm/vm_radix.c b/sys/vm/vm_radix.c index 6504d5031460..3c4ea9568108 100644 --- a/sys/vm/vm_radix.c +++ b/sys/vm/vm_radix.c @@ -1,814 +1,821 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2013 EMC Corp. * Copyright (c) 2011 Jeffrey Roberson * Copyright (c) 2008 Mayur Shardul * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ /* * Path-compressed radix trie implementation. * The following code is not generalized into a general purpose library * because there are way too many parameters embedded that should really * be decided by the library consumers. At the same time, consumers * of this code must achieve highest possible performance. * * The implementation takes into account the following rationale: * - Size of the nodes should be as small as possible but still big enough * to avoid a large maximum depth for the trie. This is a balance * between the necessity to not wire too much physical memory for the nodes * and the necessity to avoid too much cache pollution during the trie * operations. 
* - There is not a huge bias toward the number of lookup operations over * the number of insert and remove operations. This basically implies * that optimizations supposedly helping one operation but hurting the * other might be carefully evaluated. * - On average not many nodes are expected to be fully populated, hence * level compression may just complicate things. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif /* * These widths should allow the pointers to a node's children to fit within * a single cache line. The extra levels from a narrow width should not be * a problem thanks to path compression. */ #ifdef __LP64__ #define VM_RADIX_WIDTH 4 #else #define VM_RADIX_WIDTH 3 #endif #define VM_RADIX_COUNT (1 << VM_RADIX_WIDTH) #define VM_RADIX_MASK (VM_RADIX_COUNT - 1) #define VM_RADIX_LIMIT \ (howmany(sizeof(vm_pindex_t) * NBBY, VM_RADIX_WIDTH) - 1) #if VM_RADIX_WIDTH == 3 typedef uint8_t rn_popmap_t; #elif VM_RADIX_WIDTH == 4 typedef uint16_t rn_popmap_t; #elif VM_RADIX_WIDTH == 5 typedef uint32_t rn_popmap_t; #else #error Unsupported width #endif _Static_assert(sizeof(rn_popmap_t) <= sizeof(int), "rn_popmap_t too wide"); -/* Flag bits stored in node pointers. */ -#define VM_RADIX_ISLEAF 0x1 -#define VM_RADIX_FLAGS 0x1 +/* Set of all flag bits stored in node pointers. */ +#define VM_RADIX_FLAGS (VM_RADIX_ISLEAF) #define VM_RADIX_PAD VM_RADIX_FLAGS /* Returns one unit associated with specified level. */ #define VM_RADIX_UNITLEVEL(lev) \ ((vm_pindex_t)1 << ((lev) * VM_RADIX_WIDTH)) enum vm_radix_access { SMR, LOCKED, UNSERIALIZED }; struct vm_radix_node; typedef SMR_POINTER(struct vm_radix_node *) smrnode_t; struct vm_radix_node { vm_pindex_t rn_owner; /* Owner of record. */ rn_popmap_t rn_popmap; /* Valid children. */ uint8_t rn_clev; /* Current level. */ smrnode_t rn_child[VM_RADIX_COUNT]; /* Child nodes. */ }; static uma_zone_t vm_radix_node_zone; static smr_t vm_radix_smr; static void vm_radix_node_store(smrnode_t *p, struct vm_radix_node *v, enum vm_radix_access access); /* * Return the position in the array for a given level. */ static __inline int vm_radix_slot(vm_pindex_t index, uint16_t level) { return ((index >> (level * VM_RADIX_WIDTH)) & VM_RADIX_MASK); } /* Computes the key (index) with the low-order 'level' radix-digits zeroed. */ static __inline vm_pindex_t vm_radix_trimkey(vm_pindex_t index, uint16_t level) { return (index & -VM_RADIX_UNITLEVEL(level)); } /* * Allocate a radix node. */ static struct vm_radix_node * vm_radix_node_get(vm_pindex_t index, uint16_t clevel) { struct vm_radix_node *rnode; rnode = uma_zalloc_smr(vm_radix_node_zone, M_NOWAIT); if (rnode == NULL) return (NULL); /* * We want to clear the last child pointer after the final section * has exited so lookup can not return false negatives. It is done * here because it will be cache-cold in the dtor callback. */ if (rnode->rn_popmap != 0) { vm_radix_node_store(&rnode->rn_child[ffs(rnode->rn_popmap) - 1], - NULL, UNSERIALIZED); + VM_RADIX_NULL, UNSERIALIZED); rnode->rn_popmap = 0; } rnode->rn_owner = vm_radix_trimkey(index, clevel + 1); rnode->rn_clev = clevel; return (rnode); } /* * Free radix node. 
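 *
 * The node may still be visible to concurrent SMR readers at this point;
 * uma_zfree_smr() defers reuse of the memory until those readers drain.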
*/ static __inline void vm_radix_node_put(struct vm_radix_node *rnode) { #ifdef INVARIANTS int slot; KASSERT(powerof2(rnode->rn_popmap), ("vm_radix_node_put: rnode %p has too many children %04x", rnode, rnode->rn_popmap)); for (slot = 0; slot < VM_RADIX_COUNT; slot++) { if ((rnode->rn_popmap & (1 << slot)) != 0) continue; KASSERT(smr_unserialized_load(&rnode->rn_child[slot], true) == - NULL, ("vm_radix_node_put: rnode %p has a child", rnode)); + VM_RADIX_NULL, + ("vm_radix_node_put: rnode %p has a child", rnode)); } #endif uma_zfree_smr(vm_radix_node_zone, rnode); } /* * Fetch a node pointer from a slot in another node. */ static __inline struct vm_radix_node * vm_radix_node_load(smrnode_t *p, enum vm_radix_access access) { switch (access) { case UNSERIALIZED: return (smr_unserialized_load(p, true)); case LOCKED: return (smr_serialized_load(p, true)); case SMR: return (smr_entered_load(p, vm_radix_smr)); } __assert_unreachable(); } static __inline void vm_radix_node_store(smrnode_t *p, struct vm_radix_node *v, enum vm_radix_access access) { switch (access) { case UNSERIALIZED: smr_unserialized_store(p, v, true); break; case LOCKED: smr_serialized_store(p, v, true); break; case SMR: panic("vm_radix_node_store: Not supported in smr section."); } } /* * Get the root node for a radix tree. */ static __inline struct vm_radix_node * vm_radix_root_load(struct vm_radix *rtree, enum vm_radix_access access) { return (vm_radix_node_load((smrnode_t *)&rtree->rt_root, access)); } /* * Set the root node for a radix tree. */ static __inline void vm_radix_root_store(struct vm_radix *rtree, struct vm_radix_node *rnode, enum vm_radix_access access) { vm_radix_node_store((smrnode_t *)&rtree->rt_root, rnode, access); } /* * Returns TRUE if the specified radix node is a leaf and FALSE otherwise. */ static __inline bool vm_radix_isleaf(struct vm_radix_node *rnode) { return (((uintptr_t)rnode & VM_RADIX_ISLEAF) != 0); } /* * Returns page cast to radix node with leaf bit set. */ static __inline struct vm_radix_node * vm_radix_toleaf(vm_page_t page) { return ((struct vm_radix_node *)((uintptr_t)page | VM_RADIX_ISLEAF)); } /* * Returns the associated page extracted from rnode. */ static __inline vm_page_t vm_radix_topage(struct vm_radix_node *rnode) { return ((vm_page_t)((uintptr_t)rnode & ~VM_RADIX_FLAGS)); } /* * Make 'child' a child of 'rnode'. */ static __inline void vm_radix_addnode(struct vm_radix_node *rnode, vm_pindex_t index, uint16_t clev, struct vm_radix_node *child, enum vm_radix_access access) { int slot; slot = vm_radix_slot(index, clev); vm_radix_node_store(&rnode->rn_child[slot], child, access); rnode->rn_popmap ^= 1 << slot; KASSERT((rnode->rn_popmap & (1 << slot)) != 0, ("%s: bad popmap slot %d in rnode %p", __func__, slot, rnode)); } /* * Returns the level where two keys differ. * It cannot accept 2 equal keys. */ static __inline uint16_t vm_radix_keydiff(vm_pindex_t index1, vm_pindex_t index2) { KASSERT(index1 != index2, ("%s: passing the same key value %jx", __func__, (uintmax_t)index1)); CTASSERT(sizeof(long long) >= sizeof(vm_pindex_t)); /* * From the highest-order bit where the indexes differ, * compute the highest level in the trie where they differ. */ return ((flsll(index1 ^ index2) - 1) / VM_RADIX_WIDTH); } /* * Returns TRUE if it can be determined that key does not belong to the * specified rnode. Otherwise, returns FALSE. 
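 *
 * For example, with VM_RADIX_WIDTH == 4, a node with rn_clev == 0 and
 * rn_owner == 0x20 covers indexes 0x20 through 0x2f; idx == 0x31 trims to
 * 0x30, which differs from the owner, so the key cannot be under this node.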
*/ static __inline bool vm_radix_keybarr(struct vm_radix_node *rnode, vm_pindex_t idx) { if (rnode->rn_clev < VM_RADIX_LIMIT) { idx = vm_radix_trimkey(idx, rnode->rn_clev + 1); return (idx != rnode->rn_owner); } return (false); } /* * Internal helper for vm_radix_reclaim_allnodes(). * This function is recursive. */ static void vm_radix_reclaim_allnodes_int(struct vm_radix_node *rnode) { struct vm_radix_node *child; int slot; while (rnode->rn_popmap != 0) { slot = ffs(rnode->rn_popmap) - 1; child = vm_radix_node_load(&rnode->rn_child[slot], UNSERIALIZED); - KASSERT(child != NULL, ("%s: bad popmap slot %d in rnode %p", + KASSERT(child != VM_RADIX_NULL, + ("%s: bad popmap slot %d in rnode %p", __func__, slot, rnode)); if (!vm_radix_isleaf(child)) vm_radix_reclaim_allnodes_int(child); rnode->rn_popmap ^= 1 << slot; - vm_radix_node_store(&rnode->rn_child[slot], NULL, + vm_radix_node_store(&rnode->rn_child[slot], VM_RADIX_NULL, UNSERIALIZED); } vm_radix_node_put(rnode); } +/* + * radix node zone initializer. + */ +static int +vm_radix_zone_init(void *mem, int size, int flags) +{ + struct vm_radix_node *rnode; + + rnode = mem; + rnode->rn_popmap = 0; + for (int i = 0; i < nitems(rnode->rn_child); i++) + vm_radix_node_store(&rnode->rn_child[i], VM_RADIX_NULL, + UNSERIALIZED); + return (0); +} + #ifndef UMA_MD_SMALL_ALLOC void vm_radix_reserve_kva(void); /* * Reserve the KVA necessary to satisfy the node allocation. * This is mandatory in architectures not supporting direct * mapping as they will need otherwise to carve into the kernel maps for * every node allocation, resulting into deadlocks for consumers already * working with kernel maps. */ void vm_radix_reserve_kva(void) { /* * Calculate the number of reserved nodes, discounting the pages that * are needed to store them. */ if (!uma_zone_reserve_kva(vm_radix_node_zone, ((vm_paddr_t)vm_cnt.v_page_count * PAGE_SIZE) / (PAGE_SIZE + sizeof(struct vm_radix_node)))) panic("%s: unable to reserve KVA", __func__); } #endif /* * Initialize the UMA slab zone. */ void vm_radix_zinit(void) { vm_radix_node_zone = uma_zcreate("RADIX NODE", - sizeof(struct vm_radix_node), NULL, NULL, NULL, NULL, - VM_RADIX_PAD, UMA_ZONE_VM | UMA_ZONE_SMR | UMA_ZONE_ZINIT); + sizeof(struct vm_radix_node), NULL, NULL, vm_radix_zone_init, NULL, + VM_RADIX_PAD, UMA_ZONE_VM | UMA_ZONE_SMR); vm_radix_smr = uma_zone_get_smr(vm_radix_node_zone); } /* * Inserts the key-value pair into the trie. * Panics if the key already exists. */ int vm_radix_insert(struct vm_radix *rtree, vm_page_t page) { vm_pindex_t index, newind; - struct vm_radix_node *leaf, *rnode, *tmp; + struct vm_radix_node *leaf, *parent, *rnode; smrnode_t *parentp; int slot; uint16_t clev; index = page->pindex; leaf = vm_radix_toleaf(page); /* * The owner of record for root is not really important because it * will never be used. 
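 *
 * Note that the leaf check below also catches the VM_RADIX_NULL sentinel,
 * which has the leaf bit set, so an empty root and an empty child slot are
 * handled by the same path.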
*/ rnode = vm_radix_root_load(rtree, LOCKED); - if (rnode == NULL) { - rtree->rt_root = (uintptr_t)leaf; - return (0); - } - for (parentp = (smrnode_t *)&rtree->rt_root;; rnode = tmp) { + parent = NULL; + for (;;) { if (vm_radix_isleaf(rnode)) { + if (rnode == VM_RADIX_NULL) { + if (parent == NULL) + rtree->rt_root = leaf; + else + vm_radix_addnode(parent, index, + parent->rn_clev, leaf, LOCKED); + return (0); + } newind = vm_radix_topage(rnode)->pindex; if (newind == index) panic("%s: key %jx is already present", __func__, (uintmax_t)index); break; - } else if (vm_radix_keybarr(rnode, index)) { + } + if (vm_radix_keybarr(rnode, index)) { newind = rnode->rn_owner; break; } slot = vm_radix_slot(index, rnode->rn_clev); - parentp = &rnode->rn_child[slot]; - tmp = vm_radix_node_load(parentp, LOCKED); - if (tmp == NULL) { - vm_radix_addnode(rnode, index, rnode->rn_clev, leaf, - LOCKED); - return (0); - } + parent = rnode; + rnode = vm_radix_node_load(&rnode->rn_child[slot], LOCKED); } /* * A new node is needed because the right insertion level is reached. * Setup the new intermediate node and add the 2 children: the * new object and the older edge or object. */ + parentp = (parent != NULL) ? &parent->rn_child[slot]: + (smrnode_t *)&rtree->rt_root; clev = vm_radix_keydiff(newind, index); - tmp = vm_radix_node_get(index, clev); - if (tmp == NULL) + parent = vm_radix_node_get(index, clev); + if (parent == NULL) return (ENOMEM); /* These writes are not yet visible due to ordering. */ - vm_radix_addnode(tmp, index, clev, leaf, UNSERIALIZED); - vm_radix_addnode(tmp, newind, clev, rnode, UNSERIALIZED); + vm_radix_addnode(parent, index, clev, leaf, UNSERIALIZED); + vm_radix_addnode(parent, newind, clev, rnode, UNSERIALIZED); /* Serializing write to make the above visible. */ - vm_radix_node_store(parentp, tmp, LOCKED); + vm_radix_node_store(parentp, parent, LOCKED); return (0); } /* * Returns the value stored at the index. If the index is not present, * NULL is returned. */ static __always_inline vm_page_t _vm_radix_lookup(struct vm_radix *rtree, vm_pindex_t index, enum vm_radix_access access) { struct vm_radix_node *rnode; vm_page_t m; int slot; rnode = vm_radix_root_load(rtree, access); - while (rnode != NULL) { + for (;;) { if (vm_radix_isleaf(rnode)) { - m = vm_radix_topage(rnode); - if (m->pindex == index) + if ((m = vm_radix_topage(rnode)) != NULL && + m->pindex == index) return (m); break; } if (vm_radix_keybarr(rnode, index)) break; slot = vm_radix_slot(index, rnode->rn_clev); rnode = vm_radix_node_load(&rnode->rn_child[slot], access); } return (NULL); } /* * Returns the value stored at the index assuming there is an external lock. * * If the index is not present, NULL is returned. */ vm_page_t vm_radix_lookup(struct vm_radix *rtree, vm_pindex_t index) { return _vm_radix_lookup(rtree, index, LOCKED); } /* * Returns the value stored at the index without requiring an external lock. * * If the index is not present, NULL is returned. */ vm_page_t vm_radix_lookup_unlocked(struct vm_radix *rtree, vm_pindex_t index) { vm_page_t m; smr_enter(vm_radix_smr); m = _vm_radix_lookup(rtree, index, SMR); smr_exit(vm_radix_smr); return (m); } /* * Returns the page with the least pindex that is greater than or equal to the * specified pindex, or NULL if there are no such pages. * * Requires that access be externally synchronized by a lock. 
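 *
 * For example, at a node with rn_popmap == 0x0062 (slots 1, 5 and 6
 * populated) and slot == 5, rn_popmap >> 5 == 0x3 > 1, so a bigger sibling
 * (slot 6) exists and the node is recorded in "succ" as a restart point.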
*/ vm_page_t vm_radix_lookup_ge(struct vm_radix *rtree, vm_pindex_t index) { struct vm_radix_node *rnode, *succ; vm_page_t m; int slot; /* * Descend the trie as if performing an ordinary lookup for the page * with the specified pindex. However, unlike an ordinary lookup, as we * descend the trie, we use "succ" to remember the last branching-off * point, that is, the interior node under which the page with the least * pindex that is both outside our current path down the trie and more * than the specified pindex resides. (The node's popmap makes it fast * and easy to recognize a branching-off point.) If our ordinary lookup * fails to yield a page with a pindex that is greater than or equal to * the specified pindex, then we will exit this loop and perform a * lookup starting from "succ". If "succ" is not NULL, then that lookup * is guaranteed to succeed. */ rnode = vm_radix_root_load(rtree, LOCKED); succ = NULL; - while (rnode != NULL) { + for (;;) { if (vm_radix_isleaf(rnode)) { - m = vm_radix_topage(rnode); - if (m->pindex >= index) + if ((m = vm_radix_topage(rnode)) != NULL && + m->pindex >= index) return (m); break; } if (vm_radix_keybarr(rnode, index)) { /* * If all pages in this subtree have pindex > index, * then the page in this subtree with the least pindex * is the answer. */ if (rnode->rn_owner > index) succ = rnode; break; } slot = vm_radix_slot(index, rnode->rn_clev); /* * Just in case the next search step leads to a subtree of all * pages with pindex < index, check popmap to see if a next * bigger step, to a subtree of all pages with pindex > index, * is available. If so, remember to restart the search here. */ if ((rnode->rn_popmap >> slot) > 1) succ = rnode; rnode = vm_radix_node_load(&rnode->rn_child[slot], LOCKED); } /* * Restart the search from the last place visited in the subtree that * included some pages with pindex > index, if there was such a place. */ if (succ == NULL) return (NULL); if (succ != rnode) { /* * Take a step to the next bigger sibling of the node chosen * last time. In that subtree, all pages have pindex > index. */ slot = vm_radix_slot(index, succ->rn_clev) + 1; KASSERT((succ->rn_popmap >> slot) != 0, ("%s: no popmap siblings past slot %d in node %p", __func__, slot, succ)); slot += ffs(succ->rn_popmap >> slot) - 1; succ = vm_radix_node_load(&succ->rn_child[slot], LOCKED); } /* * Find the page in the subtree rooted at "succ" with the least pindex. */ while (!vm_radix_isleaf(succ)) { KASSERT(succ->rn_popmap != 0, ("%s: no popmap children in node %p", __func__, succ)); slot = ffs(succ->rn_popmap) - 1; succ = vm_radix_node_load(&succ->rn_child[slot], LOCKED); } return (vm_radix_topage(succ)); } /* * Returns the page with the greatest pindex that is less than or equal to the * specified pindex, or NULL if there are no such pages. * * Requires that access be externally synchronized by a lock. */ vm_page_t vm_radix_lookup_le(struct vm_radix *rtree, vm_pindex_t index) { struct vm_radix_node *pred, *rnode; vm_page_t m; int slot; /* * Mirror the implementation of vm_radix_lookup_ge, described above. 
*/ rnode = vm_radix_root_load(rtree, LOCKED); pred = NULL; - while (rnode != NULL) { + for (;;) { if (vm_radix_isleaf(rnode)) { - m = vm_radix_topage(rnode); - if (m->pindex <= index) + if ((m = vm_radix_topage(rnode)) != NULL && + m->pindex <= index) return (m); break; } if (vm_radix_keybarr(rnode, index)) { if (rnode->rn_owner < index) pred = rnode; break; } slot = vm_radix_slot(index, rnode->rn_clev); if ((rnode->rn_popmap & ((1 << slot) - 1)) != 0) pred = rnode; rnode = vm_radix_node_load(&rnode->rn_child[slot], LOCKED); } if (pred == NULL) return (NULL); if (pred != rnode) { slot = vm_radix_slot(index, pred->rn_clev); KASSERT((pred->rn_popmap & ((1 << slot) - 1)) != 0, ("%s: no popmap siblings before slot %d in node %p", __func__, slot, pred)); slot = fls(pred->rn_popmap & ((1 << slot) - 1)) - 1; pred = vm_radix_node_load(&pred->rn_child[slot], LOCKED); } while (!vm_radix_isleaf(pred)) { KASSERT(pred->rn_popmap != 0, ("%s: no popmap children in node %p", __func__, pred)); slot = fls(pred->rn_popmap) - 1; pred = vm_radix_node_load(&pred->rn_child[slot], LOCKED); } return (vm_radix_topage(pred)); } /* * Remove the specified index from the trie, and return the value stored at * that index. If the index is not present, return NULL. */ vm_page_t vm_radix_remove(struct vm_radix *rtree, vm_pindex_t index) { - struct vm_radix_node *rnode, *parent, *tmp; + struct vm_radix_node *child, *parent, *rnode; vm_page_t m; int slot; - rnode = vm_radix_root_load(rtree, LOCKED); - if (vm_radix_isleaf(rnode)) { - m = vm_radix_topage(rnode); - if (m->pindex != index) - return (NULL); - vm_radix_root_store(rtree, NULL, LOCKED); - return (m); - } - parent = NULL; + rnode = NULL; + child = vm_radix_root_load(rtree, LOCKED); for (;;) { - if (rnode == NULL) - return (NULL); - slot = vm_radix_slot(index, rnode->rn_clev); - tmp = vm_radix_node_load(&rnode->rn_child[slot], LOCKED); - if (vm_radix_isleaf(tmp)) { - m = vm_radix_topage(tmp); - if (m->pindex != index) - return (NULL); - KASSERT((rnode->rn_popmap & (1 << slot)) != 0, - ("%s: bad popmap slot %d in rnode %p", - __func__, slot, rnode)); - rnode->rn_popmap ^= 1 << slot; - vm_radix_node_store( - &rnode->rn_child[slot], NULL, LOCKED); - if (!powerof2(rnode->rn_popmap)) - return (m); - KASSERT(rnode->rn_popmap != 0, - ("%s: bad popmap all zeroes", __func__)); - slot = ffs(rnode->rn_popmap) - 1; - tmp = vm_radix_node_load(&rnode->rn_child[slot], LOCKED); - KASSERT(tmp != NULL, - ("%s: bad popmap slot %d in rnode %p", - __func__, slot, rnode)); - if (parent == NULL) - vm_radix_root_store(rtree, tmp, LOCKED); - else { - slot = vm_radix_slot(index, parent->rn_clev); - KASSERT(vm_radix_node_load( - &parent->rn_child[slot], LOCKED) == rnode, - ("%s: invalid child value", __func__)); - vm_radix_node_store(&parent->rn_child[slot], - tmp, LOCKED); - } - /* - * The child is still valid and we can not zero the - * pointer until all smr references are gone. 
- */ - vm_radix_node_put(rnode); - return (m); - } + if (vm_radix_isleaf(child)) + break; parent = rnode; - rnode = tmp; + rnode = child; + slot = vm_radix_slot(index, rnode->rn_clev); + child = vm_radix_node_load(&rnode->rn_child[slot], LOCKED); + } + if ((m = vm_radix_topage(child)) == NULL || m->pindex != index) + return (NULL); + if (rnode == NULL) { + vm_radix_root_store(rtree, VM_RADIX_NULL, LOCKED); + return (m); + } + KASSERT((rnode->rn_popmap & (1 << slot)) != 0, + ("%s: bad popmap slot %d in rnode %p", __func__, slot, rnode)); + rnode->rn_popmap ^= 1 << slot; + vm_radix_node_store(&rnode->rn_child[slot], VM_RADIX_NULL, LOCKED); + if (!powerof2(rnode->rn_popmap)) + return (m); + KASSERT(rnode->rn_popmap != 0, ("%s: bad popmap all zeroes", __func__)); + slot = ffs(rnode->rn_popmap) - 1; + child = vm_radix_node_load(&rnode->rn_child[slot], LOCKED); + KASSERT(child != VM_RADIX_NULL, + ("%s: bad popmap slot %d in rnode %p", __func__, slot, rnode)); + if (parent == NULL) + vm_radix_root_store(rtree, child, LOCKED); + else { + slot = vm_radix_slot(index, parent->rn_clev); + KASSERT(rnode == + vm_radix_node_load(&parent->rn_child[slot], LOCKED), + ("%s: invalid child value", __func__)); + vm_radix_node_store(&parent->rn_child[slot], child, LOCKED); } + /* + * The child is still valid and we can not zero the + * pointer until all smr references are gone. + */ + vm_radix_node_put(rnode); + return (m); } /* * Remove and free all the nodes from the radix tree. * This function is recursive but there is a tight control on it as the * maximum depth of the tree is fixed. */ void vm_radix_reclaim_allnodes(struct vm_radix *rtree) { struct vm_radix_node *root; root = vm_radix_root_load(rtree, LOCKED); - if (root == NULL) + if (root == VM_RADIX_NULL) return; - vm_radix_root_store(rtree, NULL, UNSERIALIZED); + vm_radix_root_store(rtree, VM_RADIX_NULL, UNSERIALIZED); if (!vm_radix_isleaf(root)) vm_radix_reclaim_allnodes_int(root); } /* * Replace an existing page in the trie with another one. * Panics if there is not an old page in the trie at the new page's index. 
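 *
 * The caller must externally synchronize access to the trie, and the new
 * page must carry the same pindex as the page it replaces.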
*/ vm_page_t vm_radix_replace(struct vm_radix *rtree, vm_page_t newpage) { - struct vm_radix_node *rnode, *tmp; + struct vm_radix_node *leaf, *parent, *rnode; vm_page_t m; vm_pindex_t index; int slot; + leaf = vm_radix_toleaf(newpage); index = newpage->pindex; rnode = vm_radix_root_load(rtree, LOCKED); - if (rnode == NULL) - panic("%s: replacing page on an empty trie", __func__); - if (vm_radix_isleaf(rnode)) { - m = vm_radix_topage(rnode); - if (m->pindex != index) - panic("%s: original replacing root key not found", - __func__); - rtree->rt_root = (uintptr_t)vm_radix_toleaf(newpage); - return (m); - } + parent = NULL; for (;;) { - slot = vm_radix_slot(index, rnode->rn_clev); - tmp = vm_radix_node_load(&rnode->rn_child[slot], LOCKED); - if (vm_radix_isleaf(tmp)) { - m = vm_radix_topage(tmp); - if (m->pindex != index) - break; - vm_radix_node_store(&rnode->rn_child[slot], - vm_radix_toleaf(newpage), LOCKED); - return (m); - } else if (tmp == NULL || vm_radix_keybarr(tmp, index)) + if (vm_radix_isleaf(rnode)) { + if ((m = vm_radix_topage(rnode)) != NULL && + m->pindex == index) { + if (parent == NULL) + rtree->rt_root = leaf; + else + vm_radix_node_store( + &parent->rn_child[slot], leaf, + LOCKED); + return (m); + } + break; + } + if (vm_radix_keybarr(rnode, index)) break; - rnode = tmp; + slot = vm_radix_slot(index, rnode->rn_clev); + parent = rnode; + rnode = vm_radix_node_load(&rnode->rn_child[slot], LOCKED); } panic("%s: original replacing page not found", __func__); } void vm_radix_wait(void) { uma_zwait(vm_radix_node_zone); } #ifdef DDB /* * Show details about the given radix node. */ DB_SHOW_COMMAND(radixnode, db_show_radixnode) { struct vm_radix_node *rnode, *tmp; int slot; rn_popmap_t popmap; if (!have_addr) return; rnode = (struct vm_radix_node *)addr; db_printf("radixnode %p, owner %jx, children popmap %04x, level %u:\n", (void *)rnode, (uintmax_t)rnode->rn_owner, rnode->rn_popmap, rnode->rn_clev); for (popmap = rnode->rn_popmap; popmap != 0; popmap ^= 1 << slot) { slot = ffs(popmap) - 1; tmp = vm_radix_node_load(&rnode->rn_child[slot], UNSERIALIZED); db_printf("slot: %d, val: %p, page: %p, clev: %d\n", slot, (void *)tmp, vm_radix_isleaf(tmp) ? vm_radix_topage(tmp) : NULL, rnode->rn_clev); } } #endif /* DDB */ diff --git a/sys/vm/vm_radix.h b/sys/vm/vm_radix.h index b478cf4b734b..becfbaedccf5 100644 --- a/sys/vm/vm_radix.h +++ b/sys/vm/vm_radix.h @@ -1,66 +1,72 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2013 EMC Corp. * Copyright (c) 2011 Jeffrey Roberson * Copyright (c) 2008 Mayur Shardul * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#ifndef _VM_RADIX_H_
#define _VM_RADIX_H_

#include <vm/_vm_radix.h>

#ifdef _KERNEL

int vm_radix_insert(struct vm_radix *rtree, vm_page_t page);
void vm_radix_wait(void);
vm_page_t vm_radix_lookup(struct vm_radix *rtree, vm_pindex_t index);
vm_page_t vm_radix_lookup_ge(struct vm_radix *rtree, vm_pindex_t index);
vm_page_t vm_radix_lookup_le(struct vm_radix *rtree, vm_pindex_t index);
vm_page_t vm_radix_lookup_unlocked(struct vm_radix *rtree, vm_pindex_t index);
void vm_radix_reclaim_allnodes(struct vm_radix *rtree);
vm_page_t vm_radix_remove(struct vm_radix *rtree, vm_pindex_t index);
vm_page_t vm_radix_replace(struct vm_radix *rtree, vm_page_t newpage);
void vm_radix_zinit(void);

+/*
+ * Each search path in the trie terminates at a leaf, which is a pointer to a
+ * page marked with a set 1-bit.  A leaf may be associated with a null pointer
+ * to indicate no page there.
+ */
+#define	VM_RADIX_ISLEAF	0x1
+#define	VM_RADIX_NULL	(struct vm_radix_node *)VM_RADIX_ISLEAF
+
static __inline void
vm_radix_init(struct vm_radix *rtree)
{
-
-	rtree->rt_root = 0;
+	rtree->rt_root = VM_RADIX_NULL;
}

static __inline bool
vm_radix_is_empty(struct vm_radix *rtree)
{
-
-	return (rtree->rt_root == 0);
+	return (rtree->rt_root == VM_RADIX_NULL);
}

#endif /* _KERNEL */
#endif /* !_VM_RADIX_H_ */
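
As a usage illustration of the interface after this change, here is a minimal
sketch of a PCTRIE_DEFINE() consumer; every identifier in it (xrec, xr_key,
xtrie_*) is hypothetical, but the pattern matches existing consumers such as
the buf trie: the node zone is created with pctrie_zone_init() so that fresh
nodes already hold PCTRIE_NULL children, and pctrie_init() now seeds the root
with the PCTRIE_NULL sentinel rather than 0.

#include <sys/param.h>
#include <sys/malloc.h>
#include <sys/pctrie.h>
#include <vm/uma.h>

struct xrec {
	uint64_t	xr_key;	/* 64-bit key; satisfies the CTASSERT. */
};

static uma_zone_t xtrie_zone;

static void *
xtrie_alloc(struct pctrie *ptree)
{
	/* May return NULL; pctrie_insert() then fails with ENOMEM. */
	return (uma_zalloc(xtrie_zone, M_NOWAIT));
}

static void
xtrie_free(struct pctrie *ptree, void *node)
{
	uma_zfree(xtrie_zone, node);
}

PCTRIE_DEFINE(XREC, xrec, xr_key, xtrie_alloc, xtrie_free);

static struct pctrie xtrie;

static void
xtrie_setup(void)
{
	/* pctrie_zone_init() pre-fills all child slots with PCTRIE_NULL. */
	xtrie_zone = uma_zcreate("xtrie node", pctrie_node_size(), NULL, NULL,
	    pctrie_zone_init, NULL, UMA_ALIGN_PTR, 0);
	pctrie_init(&xtrie);	/* Root starts as PCTRIE_NULL. */
}

Insertion, lookup and removal then go through the generated wrappers, e.g.
XREC_PCTRIE_INSERT(&xtrie, rec) and XREC_PCTRIE_LOOKUP(&xtrie, key), with the
caller providing serialization.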