Index: head/sys/sys/_smr.h =================================================================== --- head/sys/sys/_smr.h (revision 358727) +++ head/sys/sys/_smr.h (revision 358728) @@ -1,38 +1,50 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2019, 2020 Jeffrey Roberson * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ * */ #ifndef _SYS__SMR_H_ #define _SYS__SMR_H_ typedef uint32_t smr_seq_t; typedef int32_t smr_delta_t; typedef struct smr *smr_t; +#define SMR_ENTERED(smr) \ + (curthread->td_critnest != 0 && zpcpu_get((smr))->c_seq != SMR_SEQ_INVALID) + +#define SMR_ASSERT_ENTERED(smr) \ + KASSERT(SMR_ENTERED(smr), ("Not in smr section")) + +#define SMR_ASSERT_NOT_ENTERED(smr) \ + KASSERT(!SMR_ENTERED(smr), ("In smr section.")); + +#define SMR_ASSERT(ex, fn) \ + KASSERT((ex), (fn ": Assertion " #ex " failed at %s:%d", __FILE__, __LINE__)) + #endif /* __SYS_SMR_H_ */ Index: head/sys/sys/smr.h =================================================================== --- head/sys/sys/smr.h (revision 358727) +++ head/sys/sys/smr.h (revision 358728) @@ -1,366 +1,261 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2019, 2020 Jeffrey Roberson * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ * */ #ifndef _SYS_SMR_H_ #define _SYS_SMR_H_ #include /* * Safe memory reclamation. See subr_smr.c for a description of the - * algorithm. + * algorithm, and smr_types.h for macros to define and access SMR-protected + * data structures. * * Readers synchronize with smr_enter()/exit() and writers may either * free directly to a SMR UMA zone or use smr_synchronize or wait. */ /* * Modular arithmetic for comparing sequence numbers that have * potentially wrapped. Copied from tcp_seq.h. */ #define SMR_SEQ_LT(a, b) ((smr_delta_t)((a)-(b)) < 0) #define SMR_SEQ_LEQ(a, b) ((smr_delta_t)((a)-(b)) <= 0) #define SMR_SEQ_GT(a, b) ((smr_delta_t)((a)-(b)) > 0) #define SMR_SEQ_GEQ(a, b) ((smr_delta_t)((a)-(b)) >= 0) #define SMR_SEQ_DELTA(a, b) ((smr_delta_t)((a)-(b))) #define SMR_SEQ_MIN(a, b) (SMR_SEQ_LT((a), (b)) ? (a) : (b)) #define SMR_SEQ_MAX(a, b) (SMR_SEQ_GT((a), (b)) ? (a) : (b)) #define SMR_SEQ_INVALID 0 /* Shared SMR state. */ union s_wr { struct { smr_seq_t seq; /* Current write sequence #. */ int ticks; /* tick of last update (LAZY) */ }; uint64_t _pair; }; struct smr_shared { const char *s_name; /* Name for debugging/reporting. */ union s_wr s_wr; /* Write sequence */ smr_seq_t s_rd_seq; /* Minimum observed read sequence. */ }; typedef struct smr_shared *smr_shared_t; /* Per-cpu SMR state. */ struct smr { smr_seq_t c_seq; /* Current observed sequence. */ smr_shared_t c_shared; /* Shared SMR state. */ int c_deferred; /* Deferred advance counter. */ int c_limit; /* Deferred advance limit. */ int c_flags; /* SMR Configuration */ }; #define SMR_LAZY 0x0001 /* Higher latency write, fast read. */ #define SMR_DEFERRED 0x0002 /* Aggregate updates to wr_seq. */ - -#define SMR_ENTERED(smr) \ - (curthread->td_critnest != 0 && zpcpu_get((smr))->c_seq != SMR_SEQ_INVALID) - -#define SMR_ASSERT_ENTERED(smr) \ - KASSERT(SMR_ENTERED(smr), ("Not in smr section")) - -#define SMR_ASSERT_NOT_ENTERED(smr) \ - KASSERT(!SMR_ENTERED(smr), ("In smr section.")); - -#define SMR_ASSERT(ex, fn) \ - KASSERT((ex), (fn ": Assertion " #ex " failed at %s:%d", __FILE__, __LINE__)) - -/* - * SMR Accessors are meant to provide safe access to SMR protected - * pointers and prevent misuse and accidental access. - * - * Accessors are grouped by type: - * entered - Use while in a read section (between smr_enter/smr_exit()) - * serialized - Use while holding a lock that serializes writers. Updates - * are synchronized with readers via included barriers. - * unserialized - Use after the memory is out of scope and not visible to - * readers. - * - * All acceses include a parameter for an assert to verify the required - * synchronization. For example, a writer might use: - * - * smr_serialized_store(pointer, value, mtx_owned(&writelock)); - * - * These are only enabled in INVARIANTS kernels. - */ - -/* Type restricting pointer access to force smr accessors. */ -#define SMR_TYPE_DECLARE(smrtype, type) \ -typedef struct { \ - type __ptr; /* Do not access directly */ \ -} smrtype - -/* - * Read from an SMR protected pointer while in a read section. - */ -#define smr_entered_load(p, smr) ({ \ - SMR_ASSERT(SMR_ENTERED((smr)), "smr_entered_load"); \ - (__typeof((p)->__ptr))atomic_load_acq_ptr((uintptr_t *)&(p)->__ptr); \ -}) - -/* - * Read from an SMR protected pointer while serialized by an - * external mechanism. 'ex' should contain an assert that the - * external mechanism is held. i.e. mtx_owned() - */ -#define smr_serialized_load(p, ex) ({ \ - SMR_ASSERT(ex, "smr_serialized_load"); \ - (__typeof((p)->__ptr))atomic_load_ptr(&(p)->__ptr); \ -}) - -/* - * Store 'v' to an SMR protected pointer while serialized by an - * external mechanism. 'ex' should contain an assert that the - * external mechanism is held. i.e. mtx_owned() - * - * Writers that are serialized with mutual exclusion or on a single - * thread should use smr_serialized_store() rather than swap. - */ -#define smr_serialized_store(p, v, ex) do { \ - SMR_ASSERT(ex, "smr_serialized_store"); \ - __typeof((p)->__ptr) _v = (v); \ - atomic_store_rel_ptr((uintptr_t *)&(p)->__ptr, (uintptr_t)_v); \ -} while (0) - -/* - * swap 'v' with an SMR protected pointer and return the old value - * while serialized by an external mechanism. 'ex' should contain - * an assert that the external mechanism is provided. i.e. mtx_owned() - * - * Swap permits multiple writers to update a pointer concurrently. - */ -#define smr_serialized_swap(p, v, ex) ({ \ - SMR_ASSERT(ex, "smr_serialized_swap"); \ - __typeof((p)->__ptr) _v = (v); \ - /* Release barrier guarantees contents are visible to reader */ \ - atomic_thread_fence_rel(); \ - (__typeof((p)->__ptr))atomic_swap_ptr( \ - (uintptr_t *)&(p)->__ptr, (uintptr_t)_v); \ -}) - -/* - * Read from an SMR protected pointer when no serialization is required - * such as in the destructor callback or when the caller guarantees other - * synchronization. - */ -#define smr_unserialized_load(p, ex) ({ \ - SMR_ASSERT(ex, "smr_unserialized_load"); \ - (__typeof((p)->__ptr))atomic_load_ptr(&(p)->__ptr); \ -}) - -/* - * Store to an SMR protected pointer when no serialiation is required - * such as in the destructor callback or when the caller guarantees other - * synchronization. - */ -#define smr_unserialized_store(p, v, ex) do { \ - SMR_ASSERT(ex, "smr_unserialized_store"); \ - __typeof((p)->__ptr) _v = (v); \ - atomic_store_ptr((uintptr_t *)&(p)->__ptr, (uintptr_t)_v); \ -} while (0) /* * Return the current write sequence number. This is not the same as the * current goal which may be in the future. */ static inline smr_seq_t smr_shared_current(smr_shared_t s) { return (atomic_load_int(&s->s_wr.seq)); } static inline smr_seq_t smr_current(smr_t smr) { return (smr_shared_current(zpcpu_get(smr)->c_shared)); } /* * Enter a read section. */ static inline void smr_enter(smr_t smr) { critical_enter(); smr = zpcpu_get(smr); KASSERT((smr->c_flags & SMR_LAZY) == 0, ("smr_enter(%s) lazy smr.", smr->c_shared->s_name)); KASSERT(smr->c_seq == 0, ("smr_enter(%s) does not support recursion.", smr->c_shared->s_name)); /* * Store the current observed write sequence number in our * per-cpu state so that it can be queried via smr_poll(). * Frees that are newer than this stored value will be * deferred until we call smr_exit(). * * An acquire barrier is used to synchronize with smr_exit() * and smr_poll(). * * It is possible that a long delay between loading the wr_seq * and storing the c_seq could create a situation where the * rd_seq advances beyond our stored c_seq. In this situation * only the observed wr_seq is stale, the fence still orders * the load. See smr_poll() for details on how this condition * is detected and handled there. */ /* This is an add because we do not have atomic_store_acq_int */ atomic_add_acq_int(&smr->c_seq, smr_shared_current(smr->c_shared)); } /* * Exit a read section. */ static inline void smr_exit(smr_t smr) { smr = zpcpu_get(smr); CRITICAL_ASSERT(curthread); KASSERT((smr->c_flags & SMR_LAZY) == 0, ("smr_exit(%s) lazy smr.", smr->c_shared->s_name)); KASSERT(smr->c_seq != SMR_SEQ_INVALID, ("smr_exit(%s) not in a smr section.", smr->c_shared->s_name)); /* * Clear the recorded sequence number. This allows poll() to * detect CPUs not in read sections. * * Use release semantics to retire any stores before the sequence * number is cleared. */ atomic_store_rel_int(&smr->c_seq, SMR_SEQ_INVALID); critical_exit(); } /* * Enter a lazy smr section. This is used for read-mostly state that * can tolerate a high free latency. */ static inline void smr_lazy_enter(smr_t smr) { critical_enter(); smr = zpcpu_get(smr); KASSERT((smr->c_flags & SMR_LAZY) != 0, ("smr_lazy_enter(%s) non-lazy smr.", smr->c_shared->s_name)); KASSERT(smr->c_seq == 0, ("smr_lazy_enter(%s) does not support recursion.", smr->c_shared->s_name)); /* * This needs no serialization. If an interrupt occurs before we * assign sr_seq to c_seq any speculative loads will be discarded. * If we assign a stale wr_seq value due to interrupt we use the * same algorithm that renders smr_enter() safe. */ atomic_store_int(&smr->c_seq, smr_shared_current(smr->c_shared)); } /* * Exit a lazy smr section. This is used for read-mostly state that * can tolerate a high free latency. */ static inline void smr_lazy_exit(smr_t smr) { smr = zpcpu_get(smr); CRITICAL_ASSERT(curthread); KASSERT((smr->c_flags & SMR_LAZY) != 0, ("smr_lazy_enter(%s) non-lazy smr.", smr->c_shared->s_name)); KASSERT(smr->c_seq != SMR_SEQ_INVALID, ("smr_lazy_exit(%s) not in a smr section.", smr->c_shared->s_name)); /* * All loads/stores must be retired before the sequence becomes * visible. The fence compiles away on amd64. Another * alternative would be to omit the fence but store the exit * time and wait 1 tick longer. */ atomic_thread_fence_rel(); atomic_store_int(&smr->c_seq, SMR_SEQ_INVALID); critical_exit(); } /* * Advances the write sequence number. Returns the sequence number * required to ensure that all modifications are visible to readers. */ smr_seq_t smr_advance(smr_t smr); /* * Returns true if a goal sequence has been reached. If * wait is true this will busy loop until success. */ bool smr_poll(smr_t smr, smr_seq_t goal, bool wait); /* Create a new SMR context. */ smr_t smr_create(const char *name, int limit, int flags); /* Destroy the context. */ void smr_destroy(smr_t smr); /* * Blocking wait for all readers to observe 'goal'. */ static inline bool smr_wait(smr_t smr, smr_seq_t goal) { return (smr_poll(smr, goal, true)); } /* * Synchronize advances the write sequence and returns when all * readers have observed it. * * If your application can cache a sequence number returned from * smr_advance() and poll or wait at a later time there will * be less chance of busy looping while waiting for readers. */ static inline void smr_synchronize(smr_t smr) { smr_wait(smr, smr_advance(smr)); } /* Only at startup. */ void smr_init(void); #endif /* _SYS_SMR_H_ */ Index: head/sys/sys/smr_types.h =================================================================== --- head/sys/sys/smr_types.h (nonexistent) +++ head/sys/sys/smr_types.h (revision 358728) @@ -0,0 +1,138 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2019, 2020 Jeffrey Roberson + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SYS_SMR_TYPES_H_ +#define _SYS_SMR_TYPES_H_ + +#include + +/* + * SMR Accessors are meant to provide safe access to SMR protected + * pointers and prevent misuse and accidental access. + * + * Accessors are grouped by type: + * entered - Use while in a read section (between smr_enter/smr_exit()) + * serialized - Use while holding a lock that serializes writers. Updates + * are synchronized with readers via included barriers. + * unserialized - Use after the memory is out of scope and not visible to + * readers. + * + * All acceses include a parameter for an assert to verify the required + * synchronization. For example, a writer might use: + * + * smr_serialized_store(pointer, value, mtx_owned(&writelock)); + * + * These are only enabled in INVARIANTS kernels. + */ + +/* Type restricting pointer access to force smr accessors. */ +#define SMR_POINTER(type) \ +struct { \ + type __ptr; /* Do not access directly */ \ +} + +/* + * Read from an SMR protected pointer while in a read section. + */ +#define smr_entered_load(p, smr) ({ \ + SMR_ASSERT(SMR_ENTERED((smr)), "smr_entered_load"); \ + (__typeof((p)->__ptr))atomic_load_acq_ptr((uintptr_t *)&(p)->__ptr); \ +}) + +/* + * Read from an SMR protected pointer while serialized by an + * external mechanism. 'ex' should contain an assert that the + * external mechanism is held. i.e. mtx_owned() + */ +#define smr_serialized_load(p, ex) ({ \ + SMR_ASSERT(ex, "smr_serialized_load"); \ + (__typeof((p)->__ptr))atomic_load_ptr(&(p)->__ptr); \ +}) + +/* + * Store 'v' to an SMR protected pointer while serialized by an + * external mechanism. 'ex' should contain an assert that the + * external mechanism is held. i.e. mtx_owned() + * + * Writers that are serialized with mutual exclusion or on a single + * thread should use smr_serialized_store() rather than swap. + */ +#define smr_serialized_store(p, v, ex) do { \ + SMR_ASSERT(ex, "smr_serialized_store"); \ + __typeof((p)->__ptr) _v = (v); \ + atomic_store_rel_ptr((uintptr_t *)&(p)->__ptr, (uintptr_t)_v); \ +} while (0) + +/* + * swap 'v' with an SMR protected pointer and return the old value + * while serialized by an external mechanism. 'ex' should contain + * an assert that the external mechanism is provided. i.e. mtx_owned() + * + * Swap permits multiple writers to update a pointer concurrently. + */ +#define smr_serialized_swap(p, v, ex) ({ \ + SMR_ASSERT(ex, "smr_serialized_swap"); \ + __typeof((p)->__ptr) _v = (v); \ + /* Release barrier guarantees contents are visible to reader */ \ + atomic_thread_fence_rel(); \ + (__typeof((p)->__ptr))atomic_swap_ptr( \ + (uintptr_t *)&(p)->__ptr, (uintptr_t)_v); \ +}) + +/* + * Read from an SMR protected pointer when no serialization is required + * such as in the destructor callback or when the caller guarantees other + * synchronization. + */ +#define smr_unserialized_load(p, ex) ({ \ + SMR_ASSERT(ex, "smr_unserialized_load"); \ + (__typeof((p)->__ptr))atomic_load_ptr(&(p)->__ptr); \ +}) + +/* + * Store to an SMR protected pointer when no serialiation is required + * such as in the destructor callback or when the caller guarantees other + * synchronization. + */ +#define smr_unserialized_store(p, v, ex) do { \ + SMR_ASSERT(ex, "smr_unserialized_store"); \ + __typeof((p)->__ptr) _v = (v); \ + atomic_store_ptr((uintptr_t *)&(p)->__ptr, (uintptr_t)_v); \ +} while (0) + +#ifndef _KERNEL + +/* + * Load an SMR protected pointer when accessing kernel data structures through + * libkvm. + */ +#define smr_kvm_load(p) ((p)->__ptr) + +#endif /* !_KERNEL */ +#endif /* !_SYS_SMR_TYPES_H_ */ Property changes on: head/sys/sys/smr_types.h ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Index: head/sys/vm/vm_radix.c =================================================================== --- head/sys/vm/vm_radix.c (revision 358727) +++ head/sys/vm/vm_radix.c (revision 358728) @@ -1,910 +1,911 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2013 EMC Corp. * Copyright (c) 2011 Jeffrey Roberson * Copyright (c) 2008 Mayur Shardul * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ /* * Path-compressed radix trie implementation. * The following code is not generalized into a general purpose library * because there are way too many parameters embedded that should really * be decided by the library consumers. At the same time, consumers * of this code must achieve highest possible performance. * * The implementation takes into account the following rationale: * - Size of the nodes should be as small as possible but still big enough * to avoid a large maximum depth for the trie. This is a balance * between the necessity to not wire too much physical memory for the nodes * and the necessity to avoid too much cache pollution during the trie * operations. * - There is not a huge bias toward the number of lookup operations over * the number of insert and remove operations. This basically implies * that optimizations supposedly helping one operation but hurting the * other might be carefully evaluated. * - On average not many nodes are expected to be fully populated, hence * level compression may just complicate things. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include #include #include #include #include #include +#include #include #include #include #include #include #include #ifdef DDB #include #endif /* * These widths should allow the pointers to a node's children to fit within * a single cache line. The extra levels from a narrow width should not be * a problem thanks to path compression. */ #ifdef __LP64__ #define VM_RADIX_WIDTH 4 #else #define VM_RADIX_WIDTH 3 #endif #define VM_RADIX_COUNT (1 << VM_RADIX_WIDTH) #define VM_RADIX_MASK (VM_RADIX_COUNT - 1) #define VM_RADIX_LIMIT \ (howmany(sizeof(vm_pindex_t) * NBBY, VM_RADIX_WIDTH) - 1) /* Flag bits stored in node pointers. */ #define VM_RADIX_ISLEAF 0x1 #define VM_RADIX_FLAGS 0x1 #define VM_RADIX_PAD VM_RADIX_FLAGS /* Returns one unit associated with specified level. */ #define VM_RADIX_UNITLEVEL(lev) \ ((vm_pindex_t)1 << ((lev) * VM_RADIX_WIDTH)) enum vm_radix_access { SMR, LOCKED, UNSERIALIZED }; struct vm_radix_node; -SMR_TYPE_DECLARE(smrnode_t, struct vm_radix_node *); +typedef SMR_POINTER(struct vm_radix_node *) smrnode_t; struct vm_radix_node { vm_pindex_t rn_owner; /* Owner of record. */ uint16_t rn_count; /* Valid children. */ uint8_t rn_clev; /* Current level. */ int8_t rn_last; /* zero last ptr. */ smrnode_t rn_child[VM_RADIX_COUNT]; /* Child nodes. */ }; static uma_zone_t vm_radix_node_zone; static smr_t vm_radix_smr; static void vm_radix_node_store(smrnode_t *p, struct vm_radix_node *v, enum vm_radix_access access); /* * Allocate a radix node. */ static struct vm_radix_node * vm_radix_node_get(vm_pindex_t owner, uint16_t count, uint16_t clevel) { struct vm_radix_node *rnode; rnode = uma_zalloc_smr(vm_radix_node_zone, M_NOWAIT); if (rnode == NULL) return (NULL); /* * We want to clear the last child pointer after the final section * has exited so lookup can not return false negatives. It is done * here because it will be cache-cold in the dtor callback. */ if (rnode->rn_last != 0) { vm_radix_node_store(&rnode->rn_child[rnode->rn_last - 1], NULL, UNSERIALIZED); rnode->rn_last = 0; } rnode->rn_owner = owner; rnode->rn_count = count; rnode->rn_clev = clevel; return (rnode); } /* * Free radix node. */ static __inline void vm_radix_node_put(struct vm_radix_node *rnode, int8_t last) { #ifdef INVARIANTS int slot; KASSERT(rnode->rn_count == 0, ("vm_radix_node_put: rnode %p has %d children", rnode, rnode->rn_count)); for (slot = 0; slot < VM_RADIX_COUNT; slot++) { if (slot == last) continue; KASSERT(smr_unserialized_load(&rnode->rn_child[slot], true) == NULL, ("vm_radix_node_put: rnode %p has a child", rnode)); } #endif /* Off by one so a freshly zero'd node is not assigned to. */ rnode->rn_last = last + 1; uma_zfree_smr(vm_radix_node_zone, rnode); } /* * Return the position in the array for a given level. */ static __inline int vm_radix_slot(vm_pindex_t index, uint16_t level) { return ((index >> (level * VM_RADIX_WIDTH)) & VM_RADIX_MASK); } /* Trims the key after the specified level. */ static __inline vm_pindex_t vm_radix_trimkey(vm_pindex_t index, uint16_t level) { vm_pindex_t ret; ret = index; if (level > 0) { ret >>= level * VM_RADIX_WIDTH; ret <<= level * VM_RADIX_WIDTH; } return (ret); } /* * Fetch a node pointer from a slot in another node. */ static __inline struct vm_radix_node * vm_radix_node_load(smrnode_t *p, enum vm_radix_access access) { switch (access) { case UNSERIALIZED: return (smr_unserialized_load(p, true)); case LOCKED: return (smr_serialized_load(p, true)); case SMR: return (smr_entered_load(p, vm_radix_smr)); } __unreachable(); } static __inline void vm_radix_node_store(smrnode_t *p, struct vm_radix_node *v, enum vm_radix_access access) { switch (access) { case UNSERIALIZED: smr_unserialized_store(p, v, true); break; case LOCKED: smr_serialized_store(p, v, true); break; case SMR: panic("vm_radix_node_store: Not supported in smr section."); } } /* * Get the root node for a radix tree. */ static __inline struct vm_radix_node * vm_radix_root_load(struct vm_radix *rtree, enum vm_radix_access access) { return (vm_radix_node_load((smrnode_t *)&rtree->rt_root, access)); } /* * Set the root node for a radix tree. */ static __inline void vm_radix_root_store(struct vm_radix *rtree, struct vm_radix_node *rnode, enum vm_radix_access access) { vm_radix_node_store((smrnode_t *)&rtree->rt_root, rnode, access); } /* * Returns TRUE if the specified radix node is a leaf and FALSE otherwise. */ static __inline boolean_t vm_radix_isleaf(struct vm_radix_node *rnode) { return (((uintptr_t)rnode & VM_RADIX_ISLEAF) != 0); } /* * Returns the associated page extracted from rnode. */ static __inline vm_page_t vm_radix_topage(struct vm_radix_node *rnode) { return ((vm_page_t)((uintptr_t)rnode & ~VM_RADIX_FLAGS)); } /* * Adds the page as a child of the provided node. */ static __inline void vm_radix_addpage(struct vm_radix_node *rnode, vm_pindex_t index, uint16_t clev, vm_page_t page, enum vm_radix_access access) { int slot; slot = vm_radix_slot(index, clev); vm_radix_node_store(&rnode->rn_child[slot], (struct vm_radix_node *)((uintptr_t)page | VM_RADIX_ISLEAF), access); } /* * Returns the slot where two keys differ. * It cannot accept 2 equal keys. */ static __inline uint16_t vm_radix_keydiff(vm_pindex_t index1, vm_pindex_t index2) { uint16_t clev; KASSERT(index1 != index2, ("%s: passing the same key value %jx", __func__, (uintmax_t)index1)); index1 ^= index2; for (clev = VM_RADIX_LIMIT;; clev--) if (vm_radix_slot(index1, clev) != 0) return (clev); } /* * Returns TRUE if it can be determined that key does not belong to the * specified rnode. Otherwise, returns FALSE. */ static __inline boolean_t vm_radix_keybarr(struct vm_radix_node *rnode, vm_pindex_t idx) { if (rnode->rn_clev < VM_RADIX_LIMIT) { idx = vm_radix_trimkey(idx, rnode->rn_clev + 1); return (idx != rnode->rn_owner); } return (FALSE); } /* * Internal helper for vm_radix_reclaim_allnodes(). * This function is recursive. */ static void vm_radix_reclaim_allnodes_int(struct vm_radix_node *rnode) { struct vm_radix_node *child; int slot; KASSERT(rnode->rn_count <= VM_RADIX_COUNT, ("vm_radix_reclaim_allnodes_int: bad count in rnode %p", rnode)); for (slot = 0; rnode->rn_count != 0; slot++) { child = vm_radix_node_load(&rnode->rn_child[slot], UNSERIALIZED); if (child == NULL) continue; if (!vm_radix_isleaf(child)) vm_radix_reclaim_allnodes_int(child); vm_radix_node_store(&rnode->rn_child[slot], NULL, UNSERIALIZED); rnode->rn_count--; } vm_radix_node_put(rnode, -1); } #ifndef UMA_MD_SMALL_ALLOC void vm_radix_reserve_kva(void); /* * Reserve the KVA necessary to satisfy the node allocation. * This is mandatory in architectures not supporting direct * mapping as they will need otherwise to carve into the kernel maps for * every node allocation, resulting into deadlocks for consumers already * working with kernel maps. */ void vm_radix_reserve_kva(void) { /* * Calculate the number of reserved nodes, discounting the pages that * are needed to store them. */ if (!uma_zone_reserve_kva(vm_radix_node_zone, ((vm_paddr_t)vm_cnt.v_page_count * PAGE_SIZE) / (PAGE_SIZE + sizeof(struct vm_radix_node)))) panic("%s: unable to reserve KVA", __func__); } #endif /* * Initialize the UMA slab zone. */ void vm_radix_zinit(void) { vm_radix_node_zone = uma_zcreate("RADIX NODE", sizeof(struct vm_radix_node), NULL, NULL, NULL, NULL, VM_RADIX_PAD, UMA_ZONE_VM | UMA_ZONE_SMR | UMA_ZONE_ZINIT); vm_radix_smr = uma_zone_get_smr(vm_radix_node_zone); } /* * Inserts the key-value pair into the trie. * Panics if the key already exists. */ int vm_radix_insert(struct vm_radix *rtree, vm_page_t page) { vm_pindex_t index, newind; struct vm_radix_node *rnode, *tmp; smrnode_t *parentp; vm_page_t m; int slot; uint16_t clev; index = page->pindex; /* * The owner of record for root is not really important because it * will never be used. */ rnode = vm_radix_root_load(rtree, LOCKED); if (rnode == NULL) { rtree->rt_root = (uintptr_t)page | VM_RADIX_ISLEAF; return (0); } parentp = (smrnode_t *)&rtree->rt_root; for (;;) { if (vm_radix_isleaf(rnode)) { m = vm_radix_topage(rnode); if (m->pindex == index) panic("%s: key %jx is already present", __func__, (uintmax_t)index); clev = vm_radix_keydiff(m->pindex, index); tmp = vm_radix_node_get(vm_radix_trimkey(index, clev + 1), 2, clev); if (tmp == NULL) return (ENOMEM); /* These writes are not yet visible due to ordering. */ vm_radix_addpage(tmp, index, clev, page, UNSERIALIZED); vm_radix_addpage(tmp, m->pindex, clev, m, UNSERIALIZED); /* Synchronize to make leaf visible. */ vm_radix_node_store(parentp, tmp, LOCKED); return (0); } else if (vm_radix_keybarr(rnode, index)) break; slot = vm_radix_slot(index, rnode->rn_clev); parentp = &rnode->rn_child[slot]; tmp = vm_radix_node_load(parentp, LOCKED); if (tmp == NULL) { rnode->rn_count++; vm_radix_addpage(rnode, index, rnode->rn_clev, page, LOCKED); return (0); } rnode = tmp; } /* * A new node is needed because the right insertion level is reached. * Setup the new intermediate node and add the 2 children: the * new object and the older edge. */ newind = rnode->rn_owner; clev = vm_radix_keydiff(newind, index); tmp = vm_radix_node_get(vm_radix_trimkey(index, clev + 1), 2, clev); if (tmp == NULL) return (ENOMEM); slot = vm_radix_slot(newind, clev); /* These writes are not yet visible due to ordering. */ vm_radix_addpage(tmp, index, clev, page, UNSERIALIZED); vm_radix_node_store(&tmp->rn_child[slot], rnode, UNSERIALIZED); /* Serializing write to make the above visible. */ vm_radix_node_store(parentp, tmp, LOCKED); return (0); } /* * Returns TRUE if the specified radix tree contains a single leaf and FALSE * otherwise. */ boolean_t vm_radix_is_singleton(struct vm_radix *rtree) { struct vm_radix_node *rnode; rnode = vm_radix_root_load(rtree, LOCKED); if (rnode == NULL) return (FALSE); return (vm_radix_isleaf(rnode)); } /* * Returns the value stored at the index. If the index is not present, * NULL is returned. */ static __always_inline vm_page_t _vm_radix_lookup(struct vm_radix *rtree, vm_pindex_t index, enum vm_radix_access access) { struct vm_radix_node *rnode; vm_page_t m; int slot; rnode = vm_radix_root_load(rtree, access); while (rnode != NULL) { if (vm_radix_isleaf(rnode)) { m = vm_radix_topage(rnode); if (m->pindex == index) return (m); break; } if (vm_radix_keybarr(rnode, index)) break; slot = vm_radix_slot(index, rnode->rn_clev); rnode = vm_radix_node_load(&rnode->rn_child[slot], access); } return (NULL); } /* * Returns the value stored at the index assuming there is an external lock. * * If the index is not present, NULL is returned. */ vm_page_t vm_radix_lookup(struct vm_radix *rtree, vm_pindex_t index) { return _vm_radix_lookup(rtree, index, LOCKED); } /* * Returns the value stored at the index without requiring an external lock. * * If the index is not present, NULL is returned. */ vm_page_t vm_radix_lookup_unlocked(struct vm_radix *rtree, vm_pindex_t index) { vm_page_t m; smr_enter(vm_radix_smr); m = _vm_radix_lookup(rtree, index, SMR); smr_exit(vm_radix_smr); return (m); } /* * Look up the nearest entry at a position greater than or equal to index. */ vm_page_t vm_radix_lookup_ge(struct vm_radix *rtree, vm_pindex_t index) { struct vm_radix_node *stack[VM_RADIX_LIMIT]; vm_pindex_t inc; vm_page_t m; struct vm_radix_node *child, *rnode; #ifdef INVARIANTS int loops = 0; #endif int slot, tos; rnode = vm_radix_root_load(rtree, LOCKED); if (rnode == NULL) return (NULL); else if (vm_radix_isleaf(rnode)) { m = vm_radix_topage(rnode); if (m->pindex >= index) return (m); else return (NULL); } tos = 0; for (;;) { /* * If the keys differ before the current bisection node, * then the search key might rollback to the earliest * available bisection node or to the smallest key * in the current node (if the owner is greater than the * search key). */ if (vm_radix_keybarr(rnode, index)) { if (index > rnode->rn_owner) { ascend: KASSERT(++loops < 1000, ("vm_radix_lookup_ge: too many loops")); /* * Pop nodes from the stack until either the * stack is empty or a node that could have a * matching descendant is found. */ do { if (tos == 0) return (NULL); rnode = stack[--tos]; } while (vm_radix_slot(index, rnode->rn_clev) == (VM_RADIX_COUNT - 1)); /* * The following computation cannot overflow * because index's slot at the current level * is less than VM_RADIX_COUNT - 1. */ index = vm_radix_trimkey(index, rnode->rn_clev); index += VM_RADIX_UNITLEVEL(rnode->rn_clev); } else index = rnode->rn_owner; KASSERT(!vm_radix_keybarr(rnode, index), ("vm_radix_lookup_ge: keybarr failed")); } slot = vm_radix_slot(index, rnode->rn_clev); child = vm_radix_node_load(&rnode->rn_child[slot], LOCKED); if (vm_radix_isleaf(child)) { m = vm_radix_topage(child); if (m->pindex >= index) return (m); } else if (child != NULL) goto descend; /* * Look for an available edge or page within the current * bisection node. */ if (slot < (VM_RADIX_COUNT - 1)) { inc = VM_RADIX_UNITLEVEL(rnode->rn_clev); index = vm_radix_trimkey(index, rnode->rn_clev); do { index += inc; slot++; child = vm_radix_node_load(&rnode->rn_child[slot], LOCKED); if (vm_radix_isleaf(child)) { m = vm_radix_topage(child); if (m->pindex >= index) return (m); } else if (child != NULL) goto descend; } while (slot < (VM_RADIX_COUNT - 1)); } KASSERT(child == NULL || vm_radix_isleaf(child), ("vm_radix_lookup_ge: child is radix node")); /* * If a page or edge greater than the search slot is not found * in the current node, ascend to the next higher-level node. */ goto ascend; descend: KASSERT(rnode->rn_clev > 0, ("vm_radix_lookup_ge: pushing leaf's parent")); KASSERT(tos < VM_RADIX_LIMIT, ("vm_radix_lookup_ge: stack overflow")); stack[tos++] = rnode; rnode = child; } } /* * Look up the nearest entry at a position less than or equal to index. */ vm_page_t vm_radix_lookup_le(struct vm_radix *rtree, vm_pindex_t index) { struct vm_radix_node *stack[VM_RADIX_LIMIT]; vm_pindex_t inc; vm_page_t m; struct vm_radix_node *child, *rnode; #ifdef INVARIANTS int loops = 0; #endif int slot, tos; rnode = vm_radix_root_load(rtree, LOCKED); if (rnode == NULL) return (NULL); else if (vm_radix_isleaf(rnode)) { m = vm_radix_topage(rnode); if (m->pindex <= index) return (m); else return (NULL); } tos = 0; for (;;) { /* * If the keys differ before the current bisection node, * then the search key might rollback to the earliest * available bisection node or to the largest key * in the current node (if the owner is smaller than the * search key). */ if (vm_radix_keybarr(rnode, index)) { if (index > rnode->rn_owner) { index = rnode->rn_owner + VM_RADIX_COUNT * VM_RADIX_UNITLEVEL(rnode->rn_clev); } else { ascend: KASSERT(++loops < 1000, ("vm_radix_lookup_le: too many loops")); /* * Pop nodes from the stack until either the * stack is empty or a node that could have a * matching descendant is found. */ do { if (tos == 0) return (NULL); rnode = stack[--tos]; } while (vm_radix_slot(index, rnode->rn_clev) == 0); /* * The following computation cannot overflow * because index's slot at the current level * is greater than 0. */ index = vm_radix_trimkey(index, rnode->rn_clev); } index--; KASSERT(!vm_radix_keybarr(rnode, index), ("vm_radix_lookup_le: keybarr failed")); } slot = vm_radix_slot(index, rnode->rn_clev); child = vm_radix_node_load(&rnode->rn_child[slot], LOCKED); if (vm_radix_isleaf(child)) { m = vm_radix_topage(child); if (m->pindex <= index) return (m); } else if (child != NULL) goto descend; /* * Look for an available edge or page within the current * bisection node. */ if (slot > 0) { inc = VM_RADIX_UNITLEVEL(rnode->rn_clev); index |= inc - 1; do { index -= inc; slot--; child = vm_radix_node_load(&rnode->rn_child[slot], LOCKED); if (vm_radix_isleaf(child)) { m = vm_radix_topage(child); if (m->pindex <= index) return (m); } else if (child != NULL) goto descend; } while (slot > 0); } KASSERT(child == NULL || vm_radix_isleaf(child), ("vm_radix_lookup_le: child is radix node")); /* * If a page or edge smaller than the search slot is not found * in the current node, ascend to the next higher-level node. */ goto ascend; descend: KASSERT(rnode->rn_clev > 0, ("vm_radix_lookup_le: pushing leaf's parent")); KASSERT(tos < VM_RADIX_LIMIT, ("vm_radix_lookup_le: stack overflow")); stack[tos++] = rnode; rnode = child; } } /* * Remove the specified index from the trie, and return the value stored at * that index. If the index is not present, return NULL. */ vm_page_t vm_radix_remove(struct vm_radix *rtree, vm_pindex_t index) { struct vm_radix_node *rnode, *parent, *tmp; vm_page_t m; int i, slot; rnode = vm_radix_root_load(rtree, LOCKED); if (vm_radix_isleaf(rnode)) { m = vm_radix_topage(rnode); if (m->pindex != index) return (NULL); vm_radix_root_store(rtree, NULL, LOCKED); return (m); } parent = NULL; for (;;) { if (rnode == NULL) return (NULL); slot = vm_radix_slot(index, rnode->rn_clev); tmp = vm_radix_node_load(&rnode->rn_child[slot], LOCKED); if (vm_radix_isleaf(tmp)) { m = vm_radix_topage(tmp); if (m->pindex != index) return (NULL); vm_radix_node_store(&rnode->rn_child[slot], NULL, LOCKED); rnode->rn_count--; if (rnode->rn_count > 1) return (m); for (i = 0; i < VM_RADIX_COUNT; i++) if (vm_radix_node_load(&rnode->rn_child[i], LOCKED) != NULL) break; KASSERT(i != VM_RADIX_COUNT, ("%s: invalid node configuration", __func__)); tmp = vm_radix_node_load(&rnode->rn_child[i], LOCKED); if (parent == NULL) vm_radix_root_store(rtree, tmp, LOCKED); else { slot = vm_radix_slot(index, parent->rn_clev); KASSERT(vm_radix_node_load( &parent->rn_child[slot], LOCKED) == rnode, ("%s: invalid child value", __func__)); vm_radix_node_store(&parent->rn_child[slot], tmp, LOCKED); } /* * The child is still valid and we can not zero the * pointer until all smr references are gone. */ rnode->rn_count--; vm_radix_node_put(rnode, i); return (m); } parent = rnode; rnode = tmp; } } /* * Remove and free all the nodes from the radix tree. * This function is recursive but there is a tight control on it as the * maximum depth of the tree is fixed. */ void vm_radix_reclaim_allnodes(struct vm_radix *rtree) { struct vm_radix_node *root; root = vm_radix_root_load(rtree, LOCKED); if (root == NULL) return; vm_radix_root_store(rtree, NULL, UNSERIALIZED); if (!vm_radix_isleaf(root)) vm_radix_reclaim_allnodes_int(root); } /* * Replace an existing page in the trie with another one. * Panics if there is not an old page in the trie at the new page's index. */ vm_page_t vm_radix_replace(struct vm_radix *rtree, vm_page_t newpage) { struct vm_radix_node *rnode, *tmp; vm_page_t m; vm_pindex_t index; int slot; index = newpage->pindex; rnode = vm_radix_root_load(rtree, LOCKED); if (rnode == NULL) panic("%s: replacing page on an empty trie", __func__); if (vm_radix_isleaf(rnode)) { m = vm_radix_topage(rnode); if (m->pindex != index) panic("%s: original replacing root key not found", __func__); rtree->rt_root = (uintptr_t)newpage | VM_RADIX_ISLEAF; return (m); } for (;;) { slot = vm_radix_slot(index, rnode->rn_clev); tmp = vm_radix_node_load(&rnode->rn_child[slot], LOCKED); if (vm_radix_isleaf(tmp)) { m = vm_radix_topage(tmp); if (m->pindex == index) { vm_radix_node_store(&rnode->rn_child[slot], (struct vm_radix_node *)((uintptr_t)newpage | VM_RADIX_ISLEAF), LOCKED); return (m); } else break; } else if (tmp == NULL || vm_radix_keybarr(tmp, index)) break; rnode = tmp; } panic("%s: original replacing page not found", __func__); } void vm_radix_wait(void) { uma_zwait(vm_radix_node_zone); } #ifdef DDB /* * Show details about the given radix node. */ DB_SHOW_COMMAND(radixnode, db_show_radixnode) { struct vm_radix_node *rnode, *tmp; int i; if (!have_addr) return; rnode = (struct vm_radix_node *)addr; db_printf("radixnode %p, owner %jx, children count %u, level %u:\n", (void *)rnode, (uintmax_t)rnode->rn_owner, rnode->rn_count, rnode->rn_clev); for (i = 0; i < VM_RADIX_COUNT; i++) { tmp = vm_radix_node_load(&rnode->rn_child[i], UNSERIALIZED); if (tmp != NULL) db_printf("slot: %d, val: %p, page: %p, clev: %d\n", i, (void *)tmp, vm_radix_isleaf(tmp) ? vm_radix_topage(tmp) : NULL, rnode->rn_clev); } } #endif /* DDB */