Index: head/sys/compat/linuxkpi/common/src/linux_rcu.c =================================================================== --- head/sys/compat/linuxkpi/common/src/linux_rcu.c (revision 316664) +++ head/sys/compat/linuxkpi/common/src/linux_rcu.c (revision 316665) @@ -1,375 +1,375 @@ /*- * Copyright (c) 2016 Matt Macy (mmacy@nextbsd.org) * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct callback_head; struct writer_epoch_record { ck_epoch_record_t epoch_record; struct mtx head_lock; struct mtx sync_lock; struct task task; STAILQ_HEAD(, callback_head) head; } __aligned(CACHE_LINE_SIZE); struct callback_head { STAILQ_ENTRY(callback_head) entry; rcu_callback_t func; }; struct srcu_epoch_record { ck_epoch_record_t epoch_record; struct mtx read_lock; struct mtx sync_lock; }; /* * Verify that "struct rcu_head" is big enough to hold "struct * callback_head". This has been done to avoid having to add special * compile flags for including ck_epoch.h to all clients of the * LinuxKPI. 
*/ CTASSERT(sizeof(struct rcu_head) >= sizeof(struct callback_head)); /* * Verify that "epoch_record" is at beginning of "struct * writer_epoch_record": */ CTASSERT(offsetof(struct writer_epoch_record, epoch_record) == 0); /* * Verify that "epoch_record" is at beginning of "struct * srcu_epoch_record": */ CTASSERT(offsetof(struct srcu_epoch_record, epoch_record) == 0); static ck_epoch_t linux_epoch; static MALLOC_DEFINE(M_LRCU, "lrcu", "Linux RCU"); static DPCPU_DEFINE(ck_epoch_record_t *, linux_reader_epoch_record); static DPCPU_DEFINE(struct writer_epoch_record *, linux_writer_epoch_record); static void linux_rcu_cleaner_func(void *, int); static void linux_rcu_runtime_init(void *arg __unused) { int i; ck_epoch_init(&linux_epoch); /* setup reader records */ CPU_FOREACH(i) { ck_epoch_record_t *record; record = malloc(sizeof(*record), M_LRCU, M_WAITOK | M_ZERO); - ck_epoch_register(&linux_epoch, record); + ck_epoch_register(&linux_epoch, record, NULL); DPCPU_ID_SET(i, linux_reader_epoch_record, record); } /* setup writer records */ CPU_FOREACH(i) { struct writer_epoch_record *record; record = malloc(sizeof(*record), M_LRCU, M_WAITOK | M_ZERO); - ck_epoch_register(&linux_epoch, &record->epoch_record); + ck_epoch_register(&linux_epoch, &record->epoch_record, NULL); mtx_init(&record->head_lock, "LRCU-HEAD", NULL, MTX_DEF); mtx_init(&record->sync_lock, "LRCU-SYNC", NULL, MTX_DEF); TASK_INIT(&record->task, 0, linux_rcu_cleaner_func, record); STAILQ_INIT(&record->head); DPCPU_ID_SET(i, linux_writer_epoch_record, record); } } SYSINIT(linux_rcu_runtime, SI_SUB_LOCK, SI_ORDER_SECOND, linux_rcu_runtime_init, NULL); static void linux_rcu_runtime_uninit(void *arg __unused) { ck_stack_entry_t *cursor; ck_stack_entry_t *next; int i; /* make sure all callbacks have been called */ linux_rcu_barrier(); /* destroy all writer record mutexes */ CPU_FOREACH(i) { struct writer_epoch_record *record; record = DPCPU_ID_GET(i, linux_writer_epoch_record); mtx_destroy(&record->head_lock); mtx_destroy(&record->sync_lock); } /* free all registered reader and writer records */ CK_STACK_FOREACH_SAFE(&linux_epoch.records, cursor, next) { ck_epoch_record_t *record; record = container_of(cursor, struct ck_epoch_record, record_next); free(record, M_LRCU); } } SYSUNINIT(linux_rcu_runtime, SI_SUB_LOCK, SI_ORDER_SECOND, linux_rcu_runtime_uninit, NULL); static inline struct srcu_epoch_record * linux_srcu_get_record(void) { struct srcu_epoch_record *record; WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "linux_srcu_get_record() might sleep"); /* * NOTE: The only records that are unregistered and can be * recycled are srcu_epoch_records. 
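Editor's note: the NULL now passed to ck_epoch_register() above (and to ck_epoch_recycle() just below) is the new per-record context pointer introduced by this revision; the LinuxKPI has no use for it yet. A consumer that does want a context might acquire records along the following lines. This is only a sketch: the my_pcpu_stats type, my_epoch and the helper names are hypothetical and not part of this change.

#include <stdlib.h>

#include <ck_epoch.h>

/* Hypothetical per-record context a consumer might wish to attach. */
struct my_pcpu_stats {
	unsigned long lookups;
	unsigned long retires;
};

static ck_epoch_t my_epoch;

static ck_epoch_record_t *
my_record_acquire(void)
{
	struct my_pcpu_stats *stats;
	ck_epoch_record_t *record;

	stats = calloc(1, sizeof(*stats));
	if (stats == NULL)
		return (NULL);

	/* Prefer recycling an unused record; its context pointer is replaced. */
	record = ck_epoch_recycle(&my_epoch, stats);
	if (record != NULL)
		return (record);

	record = malloc(sizeof(*record));
	if (record == NULL) {
		free(stats);
		return (NULL);
	}
	ck_epoch_register(&my_epoch, record, stats);
	return (record);
}

/* The context is retrievable later, e.g. from a wait callback. */
static struct my_pcpu_stats *
my_record_stats(const ck_epoch_record_t *record)
{

	return (ck_epoch_record_ct(record));
}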
*/ - record = (struct srcu_epoch_record *)ck_epoch_recycle(&linux_epoch); + record = (struct srcu_epoch_record *)ck_epoch_recycle(&linux_epoch, NULL); if (__predict_true(record != NULL)) return (record); record = malloc(sizeof(*record), M_LRCU, M_WAITOK | M_ZERO); mtx_init(&record->read_lock, "SRCU-READ", NULL, MTX_DEF | MTX_NOWITNESS); mtx_init(&record->sync_lock, "SRCU-SYNC", NULL, MTX_DEF | MTX_NOWITNESS); - ck_epoch_register(&linux_epoch, &record->epoch_record); + ck_epoch_register(&linux_epoch, &record->epoch_record, NULL); return (record); } static inline void linux_rcu_synchronize_sub(struct writer_epoch_record *record) { /* protect access to epoch_record */ mtx_lock(&record->sync_lock); ck_epoch_synchronize(&record->epoch_record); mtx_unlock(&record->sync_lock); } static void linux_rcu_cleaner_func(void *context, int pending __unused) { struct writer_epoch_record *record; struct callback_head *rcu; STAILQ_HEAD(, callback_head) head; record = context; /* move current callbacks into own queue */ mtx_lock(&record->head_lock); STAILQ_INIT(&head); STAILQ_CONCAT(&head, &record->head); mtx_unlock(&record->head_lock); /* synchronize */ linux_rcu_synchronize_sub(record); /* dispatch all callbacks, if any */ while ((rcu = STAILQ_FIRST(&head)) != NULL) { uintptr_t offset; STAILQ_REMOVE_HEAD(&head, entry); offset = (uintptr_t)rcu->func; if (offset < LINUX_KFREE_RCU_OFFSET_MAX) kfree((char *)rcu - offset); else rcu->func((struct rcu_head *)rcu); } } void linux_rcu_read_lock(void) { ck_epoch_record_t *record; /* * Pin thread to current CPU so that the unlock code gets the * same per-CPU reader epoch record: */ sched_pin(); record = DPCPU_GET(linux_reader_epoch_record); /* * Use a critical section to prevent recursion inside * ck_epoch_begin(). Else this function supports recursion. */ critical_enter(); ck_epoch_begin(record, NULL); critical_exit(); } void linux_rcu_read_unlock(void) { ck_epoch_record_t *record; record = DPCPU_GET(linux_reader_epoch_record); /* * Use a critical section to prevent recursion inside * ck_epoch_end(). Else this function supports recursion. 
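Editor's note: together with linux_call_rcu() further down, these read-side routines back the usual Linux-style consumer pattern. A minimal sketch follows; struct foo, foo_head and the helpers are hypothetical, and it assumes the rcu_read_lock()/rcu_read_unlock()/call_rcu() macros in the LinuxKPI's linux/rcupdate.h resolve to the linux_* functions in this file.

#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct foo {
	int key;
	struct foo *next;
	struct rcu_head rcu;
};

static struct foo *foo_head;		/* modified only under a writer lock */

static bool
foo_present(int key)
{
	struct foo *f;
	bool found = false;

	rcu_read_lock();		/* pins the CPU and enters the epoch */
	for (f = foo_head; f != NULL; f = f->next) {
		if (f->key == key) {
			found = true;
			break;
		}
	}
	rcu_read_unlock();
	return (found);
}

static void
foo_free_cb(struct rcu_head *head)
{
	struct foo *f = container_of(head, struct foo, rcu);

	kfree(f);
}

static void
foo_retire(struct foo *f)
{
	/* Runs from taskqueue_fast once a grace period has elapsed. */
	call_rcu(&f->rcu, foo_free_cb);
}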
*/ critical_enter(); ck_epoch_end(record, NULL); critical_exit(); sched_unpin(); } void linux_synchronize_rcu(void) { linux_rcu_synchronize_sub(DPCPU_GET(linux_writer_epoch_record)); } void linux_rcu_barrier(void) { int i; CPU_FOREACH(i) { struct writer_epoch_record *record; record = DPCPU_ID_GET(i, linux_writer_epoch_record); linux_rcu_synchronize_sub(record); /* wait for callbacks to complete */ taskqueue_drain(taskqueue_fast, &record->task); } } void linux_call_rcu(struct rcu_head *context, rcu_callback_t func) { struct callback_head *rcu = (struct callback_head *)context; struct writer_epoch_record *record; record = DPCPU_GET(linux_writer_epoch_record); mtx_lock(&record->head_lock); rcu->func = func; STAILQ_INSERT_TAIL(&record->head, rcu, entry); taskqueue_enqueue(taskqueue_fast, &record->task); mtx_unlock(&record->head_lock); } int init_srcu_struct(struct srcu_struct *srcu) { struct srcu_epoch_record *record; record = linux_srcu_get_record(); srcu->ss_epoch_record = record; return (0); } void cleanup_srcu_struct(struct srcu_struct *srcu) { struct srcu_epoch_record *record; record = srcu->ss_epoch_record; srcu->ss_epoch_record = NULL; ck_epoch_unregister(&record->epoch_record); } int srcu_read_lock(struct srcu_struct *srcu) { struct srcu_epoch_record *record; record = srcu->ss_epoch_record; mtx_lock(&record->read_lock); ck_epoch_begin(&record->epoch_record, NULL); mtx_unlock(&record->read_lock); return (0); } void srcu_read_unlock(struct srcu_struct *srcu, int key __unused) { struct srcu_epoch_record *record; record = srcu->ss_epoch_record; mtx_lock(&record->read_lock); ck_epoch_end(&record->epoch_record, NULL); mtx_unlock(&record->read_lock); } void synchronize_srcu(struct srcu_struct *srcu) { struct srcu_epoch_record *record; record = srcu->ss_epoch_record; mtx_lock(&record->sync_lock); ck_epoch_synchronize(&record->epoch_record); mtx_unlock(&record->sync_lock); } void srcu_barrier(struct srcu_struct *srcu) { struct srcu_epoch_record *record; record = srcu->ss_epoch_record; mtx_lock(&record->sync_lock); ck_epoch_barrier(&record->epoch_record); mtx_unlock(&record->sync_lock); } Index: head/sys/contrib/ck/include/ck_epoch.h =================================================================== --- head/sys/contrib/ck/include/ck_epoch.h (revision 316664) +++ head/sys/contrib/ck/include/ck_epoch.h (revision 316665) @@ -1,207 +1,280 @@ /* * Copyright 2011-2015 Samy Al Bahra. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef CK_EPOCH_H #define CK_EPOCH_H /* * The implementation here is inspired from the work described in: * Fraser, K. 2004. Practical Lock-Freedom. PhD Thesis, University * of Cambridge Computing Laboratory. */ #include #include #include #include #include #ifndef CK_EPOCH_LENGTH #define CK_EPOCH_LENGTH 4 #endif /* * This is used for sense detection with-respect to concurrent * epoch sections. */ #define CK_EPOCH_SENSE (2) struct ck_epoch_entry; typedef struct ck_epoch_entry ck_epoch_entry_t; typedef void ck_epoch_cb_t(ck_epoch_entry_t *); /* * This should be embedded into objects you wish to be the target of * ck_epoch_cb_t functions (with ck_epoch_call). */ struct ck_epoch_entry { ck_epoch_cb_t *function; ck_stack_entry_t stack_entry; }; /* * A section object may be passed to every begin-end pair to allow for * forward progress guarantees with-in prolonged active sections. */ struct ck_epoch_section { unsigned int bucket; }; typedef struct ck_epoch_section ck_epoch_section_t; /* * Return pointer to ck_epoch_entry container object. */ #define CK_EPOCH_CONTAINER(T, M, N) \ CK_CC_CONTAINER(struct ck_epoch_entry, T, M, N) struct ck_epoch_ref { unsigned int epoch; unsigned int count; }; struct ck_epoch_record { + ck_stack_entry_t record_next; struct ck_epoch *global; unsigned int state; unsigned int epoch; unsigned int active; struct { struct ck_epoch_ref bucket[CK_EPOCH_SENSE]; } local CK_CC_CACHELINE; unsigned int n_pending; unsigned int n_peak; - unsigned long n_dispatch; + unsigned int n_dispatch; + void *ct; ck_stack_t pending[CK_EPOCH_LENGTH]; - ck_stack_entry_t record_next; } CK_CC_CACHELINE; typedef struct ck_epoch_record ck_epoch_record_t; struct ck_epoch { unsigned int epoch; - char pad[CK_MD_CACHELINE - sizeof(unsigned int)]; - ck_stack_t records; unsigned int n_free; + ck_stack_t records; }; typedef struct ck_epoch ck_epoch_t; /* * Internal functions. */ void _ck_epoch_addref(ck_epoch_record_t *, ck_epoch_section_t *); -void _ck_epoch_delref(ck_epoch_record_t *, ck_epoch_section_t *); +bool _ck_epoch_delref(ck_epoch_record_t *, ck_epoch_section_t *); +CK_CC_FORCE_INLINE static void * +ck_epoch_record_ct(const ck_epoch_record_t *record) +{ + + return ck_pr_load_ptr(&record->ct); +} + /* * Marks the beginning of an epoch-protected section. */ CK_CC_FORCE_INLINE static void ck_epoch_begin(ck_epoch_record_t *record, ck_epoch_section_t *section) { struct ck_epoch *epoch = record->global; /* * Only observe new epoch if thread is not recursing into a read * section. */ if (record->active == 0) { unsigned int g_epoch; /* * It is possible for loads to be re-ordered before the store * is committed into the caller's epoch and active fields. * For this reason, store to load serialization is necessary. */ #if defined(CK_MD_TSO) ck_pr_fas_uint(&record->active, 1); ck_pr_fence_atomic_load(); #else ck_pr_store_uint(&record->active, 1); ck_pr_fence_memory(); #endif /* * This load is allowed to be re-ordered prior to setting * active flag due to monotonic nature of the global epoch. 
* However, stale values lead to measurable performance * degradation in some torture tests so we disallow early load * of global epoch. */ g_epoch = ck_pr_load_uint(&epoch->epoch); ck_pr_store_uint(&record->epoch, g_epoch); } else { ck_pr_store_uint(&record->active, record->active + 1); } if (section != NULL) _ck_epoch_addref(record, section); return; } /* - * Marks the end of an epoch-protected section. + * Marks the end of an epoch-protected section. Returns true if no more + * sections exist for the caller. */ -CK_CC_FORCE_INLINE static void +CK_CC_FORCE_INLINE static bool ck_epoch_end(ck_epoch_record_t *record, ck_epoch_section_t *section) { ck_pr_fence_release(); ck_pr_store_uint(&record->active, record->active - 1); if (section != NULL) - _ck_epoch_delref(record, section); + return _ck_epoch_delref(record, section); - return; + return record->active == 0; } /* * Defers the execution of the function pointed to by the "cb" * argument until an epoch counter loop. This allows for a * non-blocking deferral. + * + * We can get away without a fence here due to the monotonic nature + * of the epoch counter. Worst case, this will result in some delays + * before object destruction. */ CK_CC_FORCE_INLINE static void ck_epoch_call(ck_epoch_record_t *record, ck_epoch_entry_t *entry, ck_epoch_cb_t *function) { struct ck_epoch *epoch = record->global; unsigned int e = ck_pr_load_uint(&epoch->epoch); unsigned int offset = e & (CK_EPOCH_LENGTH - 1); record->n_pending++; entry->function = function; ck_stack_push_spnc(&record->pending[offset], &entry->stack_entry); return; } +/* + * Same as ck_epoch_call, but allows for records to be shared and is reentrant. + */ +CK_CC_FORCE_INLINE static void +ck_epoch_call_strict(ck_epoch_record_t *record, + ck_epoch_entry_t *entry, + ck_epoch_cb_t *function) +{ + struct ck_epoch *epoch = record->global; + unsigned int e = ck_pr_load_uint(&epoch->epoch); + unsigned int offset = e & (CK_EPOCH_LENGTH - 1); + + ck_pr_inc_uint(&record->n_pending); + entry->function = function; + + /* Store fence is implied by push operation. */ + ck_stack_push_upmc(&record->pending[offset], &entry->stack_entry); + return; +} + +/* + * This callback is used for synchronize_wait to allow for custom blocking + * behavior. + */ +typedef void ck_epoch_wait_cb_t(ck_epoch_t *, ck_epoch_record_t *, + void *); + +/* + * Return latest epoch value. This operation provides load ordering. + */ +CK_CC_FORCE_INLINE static unsigned int +ck_epoch_value(const ck_epoch_t *ep) +{ + + ck_pr_fence_load(); + return ck_pr_load_uint(&ep->epoch); +} + void ck_epoch_init(ck_epoch_t *); -ck_epoch_record_t *ck_epoch_recycle(ck_epoch_t *); -void ck_epoch_register(ck_epoch_t *, ck_epoch_record_t *); + +/* + * Attempts to recycle an unused epoch record. If one is successfully + * allocated, the record context pointer is also updated. + */ +ck_epoch_record_t *ck_epoch_recycle(ck_epoch_t *, void *); + +/* + * Registers an epoch record. An optional context pointer may be passed that + * is retrievable with ck_epoch_record_ct. + */ +void ck_epoch_register(ck_epoch_t *, ck_epoch_record_t *, void *); + +/* + * Marks a record as available for re-use by a subsequent recycle operation. + * Note that the record cannot be physically destroyed. 
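Editor's note: for the deferral interfaces above, the usual consumer pattern embeds a ck_epoch_entry_t in the object and maps it back with CK_EPOCH_CONTAINER. A short sketch with a hypothetical struct node: ck_epoch_call() is sufficient for a thread-private record, while ck_epoch_call_strict() would be the variant for a record shared between threads.

#include <stdlib.h>

#include <ck_epoch.h>

struct node {
	int key;
	ck_epoch_entry_t epoch_entry;
};

CK_EPOCH_CONTAINER(struct node, epoch_entry, node_container)

static void
node_destroy(ck_epoch_entry_t *entry)
{

	free(node_container(entry));
}

/* Defer destruction of a node already unlinked from the shared structure. */
static void
node_retire(ck_epoch_record_t *record, struct node *n)
{

	ck_epoch_call(record, &n->epoch_entry, node_destroy);
}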
+ */ void ck_epoch_unregister(ck_epoch_record_t *); + bool ck_epoch_poll(ck_epoch_record_t *); void ck_epoch_synchronize(ck_epoch_record_t *); +void ck_epoch_synchronize_wait(ck_epoch_t *, ck_epoch_wait_cb_t *, void *); void ck_epoch_barrier(ck_epoch_record_t *); +void ck_epoch_barrier_wait(ck_epoch_record_t *, ck_epoch_wait_cb_t *, void *); + +/* + * Reclaim entries associated with a record. This is safe to call only on + * the caller's record or records that are using call_strict. + */ void ck_epoch_reclaim(ck_epoch_record_t *); #endif /* CK_EPOCH_H */ Index: head/sys/contrib/ck/src/ck_epoch.c =================================================================== --- head/sys/contrib/ck/src/ck_epoch.c (revision 316664) +++ head/sys/contrib/ck/src/ck_epoch.c (revision 316665) @@ -1,545 +1,585 @@ /* * Copyright 2011-2015 Samy Al Bahra. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * The implementation here is inspired from the work described in: * Fraser, K. 2004. Practical Lock-Freedom. PhD Thesis, University * of Cambridge Computing Laboratory. */ #include #include #include #include #include #include #include /* * Only three distinct values are used for reclamation, but reclamation occurs * at e+2 rather than e+1. Any thread in a "critical section" would have * acquired some snapshot (e) of the global epoch value (e_g) and set an active * flag. Any hazardous references will only occur after a full memory barrier. * For example, assume an initial e_g value of 1, e value of 0 and active value * of 0. * * ck_epoch_begin(...) * e = e_g * active = 1 * memory_barrier(); * * Any serialized reads may observe e = 0 or e = 1 with active = 0, or e = 0 or * e = 1 with active = 1. The e_g value can only go from 1 to 2 if every thread * has already observed the value of "1" (or the value we are incrementing * from). This guarantees us that for any given value e_g, any threads with-in * critical sections (referred to as "active" threads from here on) would have * an e value of e_g-1 or e_g. This also means that hazardous references may be * shared in both e_g-1 and e_g even if they are logically deleted in e_g. * * For example, assume all threads have an e value of e_g. Another thread may * increment to e_g to e_g+1. 
Older threads may have a reference to an object * which is only deleted in e_g+1. It could be that reader threads are * executing some hash table look-ups, while some other writer thread (which * causes epoch counter tick) actually deletes the same items that reader * threads are looking up (this writer thread having an e value of e_g+1). * This is possible if the writer thread re-observes the epoch after the * counter tick. * * Pseudo-code for writer: * ck_epoch_begin() * ht_delete(x) * ck_epoch_end() * ck_epoch_begin() * ht_delete(x) * ck_epoch_end() * * Pseudo-code for reader: * for (;;) { * x = ht_lookup(x) * ck_pr_inc(&x->value); * } * * Of course, it is also possible for references logically deleted at e_g-1 to * still be accessed at e_g as threads are "active" at the same time * (real-world time) mutating shared objects. * * Now, if the epoch counter is ticked to e_g+1, then no new hazardous * references could exist to objects logically deleted at e_g-1. The reason for * this is that at e_g+1, all epoch read-side critical sections started at * e_g-1 must have been completed. If any epoch read-side critical sections at * e_g-1 were still active, then we would never increment to e_g+1 (active != 0 * ^ e != e_g). Additionally, e_g may still have hazardous references to * objects logically deleted at e_g-1 which means objects logically deleted at * e_g-1 cannot be deleted at e_g+1 unless all threads have observed e_g+1 * (since it is valid for active threads to be at e_g and threads at e_g still * require safe memory accesses). * * However, at e_g+2, all active threads must be either at e_g+1 or e_g+2. * Though e_g+2 may share hazardous references with e_g+1, and e_g+1 shares * hazardous references to e_g, no active threads are at e_g or e_g-1. This * means no hazardous references could exist to objects deleted at e_g-1 (at * e_g+2). * * To summarize these important points, * 1) Active threads will always have a value of e_g or e_g-1. * 2) Items that are logically deleted at e_g or e_g-1 cannot be physically * deleted. * 3) Objects logically deleted at e_g-1 can be physically destroyed at e_g+2 * or at e_g+1 if no threads are at e_g. * * Last but not least, if we are at e_g+2, then no active thread is at e_g * which means it is safe to apply modulo-3 arithmetic to e_g value in order to * re-use e_g to represent the e_g+3 state. This means it is sufficient to * represent e_g using only the values 0, 1 or 2. Every time a thread re-visits * an e_g (which can be determined with a non-empty deferral list) it can assume * objects in the e_g deferral list involved at least three e_g transitions and * are thus safe for physical deletion. * * Blocking semantics for epoch reclamation have additional restrictions. * Though we only require three deferral lists, reasonable blocking semantics * must be able to more gracefully handle bursty write work-loads which could * easily cause e_g wrap-around if modulo-3 arithmetic is used. This allows for * easy-to-trigger live-lock situations. The work-around to this is to not * apply modulo arithmetic to e_g but only to deferral list indexing.
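Editor's note: a small, self-contained illustration of that last point, assuming the default CK_EPOCH_LENGTH of 4 from ck_epoch.h. The unbounded epoch value is reduced only when picking a deferral bucket, so a bucket is not revisited until at least three epoch transitions have passed.

#include <stdio.h>

#define CK_EPOCH_LENGTH 4		/* default from ck_epoch.h */

int
main(void)
{
	unsigned int e;

	/*
	 * Epochs 0..7 map onto buckets 0 1 2 3 0 1 2 3.  Bucket (e & 3) is
	 * only reused at epoch e + 4, by which point the grace-period
	 * argument above guarantees entries deferred at e can be dispatched.
	 */
	for (e = 0; e < 8; e++)
		printf("epoch %u -> bucket %u\n", e, e & (CK_EPOCH_LENGTH - 1));
	return (0);
}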
*/ #define CK_EPOCH_GRACE 3U enum { CK_EPOCH_STATE_USED = 0, CK_EPOCH_STATE_FREE = 1 }; CK_STACK_CONTAINER(struct ck_epoch_record, record_next, ck_epoch_record_container) CK_STACK_CONTAINER(struct ck_epoch_entry, stack_entry, ck_epoch_entry_container) #define CK_EPOCH_SENSE_MASK (CK_EPOCH_SENSE - 1) -void +bool _ck_epoch_delref(struct ck_epoch_record *record, struct ck_epoch_section *section) { struct ck_epoch_ref *current, *other; unsigned int i = section->bucket; current = &record->local.bucket[i]; current->count--; if (current->count > 0) - return; + return false; /* * If the current bucket no longer has any references, then * determine whether we have already transitioned into a newer * epoch. If so, then make sure to update our shared snapshot * to allow for forward progress. * * If no other active bucket exists, then the record will go * inactive in order to allow for forward progress. */ - other = &record->local.bucket[(i + 1) & - CK_EPOCH_SENSE_MASK]; + other = &record->local.bucket[(i + 1) & CK_EPOCH_SENSE_MASK]; if (other->count > 0 && ((int)(current->epoch - other->epoch) < 0)) { /* * The other epoch value is actually the newest, * transition to it. */ ck_pr_store_uint(&record->epoch, other->epoch); } - return; + return true; } void _ck_epoch_addref(struct ck_epoch_record *record, struct ck_epoch_section *section) { struct ck_epoch *global = record->global; struct ck_epoch_ref *ref; unsigned int epoch, i; epoch = ck_pr_load_uint(&global->epoch); i = epoch & CK_EPOCH_SENSE_MASK; ref = &record->local.bucket[i]; if (ref->count++ == 0) { #ifndef CK_MD_TSO struct ck_epoch_ref *previous; /* * The system has already ticked. If another non-zero bucket * exists, make sure to order our observations with respect * to it. Otherwise, it is possible to acquire a reference * from the previous epoch generation. * * On TSO architectures, the monotonicity of the global counter * and load-{store, load} ordering are sufficient to guarantee * this ordering. */ previous = &record->local.bucket[(i + 1) & CK_EPOCH_SENSE_MASK]; if (previous->count > 0) ck_pr_fence_acqrel(); #endif /* !CK_MD_TSO */ /* * If this is a new reference into the current * bucket then cache the associated epoch value. */ ref->epoch = epoch; } section->bucket = i; return; } void ck_epoch_init(struct ck_epoch *global) { ck_stack_init(&global->records); global->epoch = 1; global->n_free = 0; ck_pr_fence_store(); return; } struct ck_epoch_record * -ck_epoch_recycle(struct ck_epoch *global) +ck_epoch_recycle(struct ck_epoch *global, void *ct) { struct ck_epoch_record *record; ck_stack_entry_t *cursor; unsigned int state; if (ck_pr_load_uint(&global->n_free) == 0) return NULL; CK_STACK_FOREACH(&global->records, cursor) { record = ck_epoch_record_container(cursor); if (ck_pr_load_uint(&record->state) == CK_EPOCH_STATE_FREE) { /* Serialize with respect to deferral list clean-up. */ ck_pr_fence_load(); state = ck_pr_fas_uint(&record->state, CK_EPOCH_STATE_USED); if (state == CK_EPOCH_STATE_FREE) { ck_pr_dec_uint(&global->n_free); + ck_pr_store_ptr(&record->ct, ct); + + /* + * The context pointer is ordered by a + * subsequent protected section.
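Editor's note: the sense buckets updated by _ck_epoch_addref() and _ck_epoch_delref() above are what keep prolonged read-side sections from stalling writer forward progress. A reader opts in by passing a section object; a brief sketch, with a hypothetical long_reader() and stop flag.

#include <ck_epoch.h>
#include <ck_pr.h>

/*
 * A long-running reader passes a section object so the record can migrate
 * to a newer epoch while the section is still open, instead of holding
 * writers back for the whole duration.
 */
static void
long_reader(ck_epoch_record_t *record, unsigned int *stop)
{
	ck_epoch_section_t section;

	ck_epoch_begin(record, &section);
	while (ck_pr_load_uint(stop) == 0) {
		/* ... prolonged read-side work ... */
		ck_pr_stall();
	}
	(void)ck_epoch_end(record, &section);
}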
+ */ return record; } } } return NULL; } void -ck_epoch_register(struct ck_epoch *global, struct ck_epoch_record *record) +ck_epoch_register(struct ck_epoch *global, struct ck_epoch_record *record, + void *ct) { size_t i; record->global = global; record->state = CK_EPOCH_STATE_USED; record->active = 0; record->epoch = 0; record->n_dispatch = 0; record->n_peak = 0; record->n_pending = 0; + record->ct = ct; memset(&record->local, 0, sizeof record->local); for (i = 0; i < CK_EPOCH_LENGTH; i++) ck_stack_init(&record->pending[i]); ck_pr_fence_store(); ck_stack_push_upmc(&global->records, &record->record_next); return; } void ck_epoch_unregister(struct ck_epoch_record *record) { struct ck_epoch *global = record->global; size_t i; record->active = 0; record->epoch = 0; record->n_dispatch = 0; record->n_peak = 0; record->n_pending = 0; memset(&record->local, 0, sizeof record->local); for (i = 0; i < CK_EPOCH_LENGTH; i++) ck_stack_init(&record->pending[i]); + ck_pr_store_ptr(&record->ct, NULL); ck_pr_fence_store(); ck_pr_store_uint(&record->state, CK_EPOCH_STATE_FREE); ck_pr_inc_uint(&global->n_free); return; } static struct ck_epoch_record * ck_epoch_scan(struct ck_epoch *global, struct ck_epoch_record *cr, unsigned int epoch, bool *af) { ck_stack_entry_t *cursor; if (cr == NULL) { cursor = CK_STACK_FIRST(&global->records); *af = false; } else { cursor = &cr->record_next; *af = true; } while (cursor != NULL) { unsigned int state, active; cr = ck_epoch_record_container(cursor); state = ck_pr_load_uint(&cr->state); if (state & CK_EPOCH_STATE_FREE) { cursor = CK_STACK_NEXT(cursor); continue; } active = ck_pr_load_uint(&cr->active); *af |= active; if (active != 0 && ck_pr_load_uint(&cr->epoch) != epoch) return cr; cursor = CK_STACK_NEXT(cursor); } return NULL; } static void ck_epoch_dispatch(struct ck_epoch_record *record, unsigned int e) { unsigned int epoch = e & (CK_EPOCH_LENGTH - 1); ck_stack_entry_t *head, *next, *cursor; + unsigned int n_pending, n_peak; unsigned int i = 0; - head = CK_STACK_FIRST(&record->pending[epoch]); - ck_stack_init(&record->pending[epoch]); - + head = ck_stack_batch_pop_upmc(&record->pending[epoch]); for (cursor = head; cursor != NULL; cursor = next) { struct ck_epoch_entry *entry = ck_epoch_entry_container(cursor); next = CK_STACK_NEXT(cursor); entry->function(entry); i++; } - if (record->n_pending > record->n_peak) - record->n_peak = record->n_pending; + n_peak = ck_pr_load_uint(&record->n_peak); + n_pending = ck_pr_load_uint(&record->n_pending); - record->n_dispatch += i; - record->n_pending -= i; + /* We don't require accuracy around peak calculation. */ + if (n_pending > n_peak) + ck_pr_store_uint(&record->n_peak, n_peak); + + if (i > 0) { + ck_pr_add_uint(&record->n_dispatch, i); + ck_pr_sub_uint(&record->n_pending, i); + } + return; } /* * Reclaim all objects associated with a record. */ void ck_epoch_reclaim(struct ck_epoch_record *record) { unsigned int epoch; for (epoch = 0; epoch < CK_EPOCH_LENGTH; epoch++) ck_epoch_dispatch(record, epoch); return; } +CK_CC_FORCE_INLINE static void +epoch_block(struct ck_epoch *global, struct ck_epoch_record *cr, + ck_epoch_wait_cb_t *cb, void *ct) +{ + + if (cb != NULL) + cb(global, cr, ct); + + return; +} + /* * This function must not be called with-in read section. 
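Editor's note: the wait callback threaded through epoch_block() is what lets a caller block politely (or instrument stalls) instead of spinning. A sketch of caller-side usage; my_wait_cb and my_synchronize are hypothetical, and sched_yield() merely stands in for whatever blocking primitive the environment provides.

#include <sched.h>

#include <ck_epoch.h>

/* Called each time the grace-period scan is blocked on record "cr". */
static void
my_wait_cb(ck_epoch_t *epoch, ck_epoch_record_t *cr, void *ct)
{

	(void)epoch;
	(void)cr;
	(void)ct;
	sched_yield();	/* a kernel consumer might sleep or cv_wait() here */
}

static void
my_synchronize(ck_epoch_t *epoch)
{

	/* Must not be called from within a read-side section. */
	ck_epoch_synchronize_wait(epoch, my_wait_cb, NULL);
}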
*/ void -ck_epoch_synchronize(struct ck_epoch_record *record) +ck_epoch_synchronize_wait(struct ck_epoch *global, + ck_epoch_wait_cb_t *cb, void *ct) { - struct ck_epoch *global = record->global; struct ck_epoch_record *cr; unsigned int delta, epoch, goal, i; bool active; ck_pr_fence_memory(); /* * The observation of the global epoch must be ordered with respect to * all prior operations. The re-ordering of loads is permitted given * monotonicity of global epoch counter. * * If UINT_MAX concurrent mutations were to occur then it is possible * to encounter an ABA-issue. If this is a concern, consider tuning * write-side concurrency. */ delta = epoch = ck_pr_load_uint(&global->epoch); goal = epoch + CK_EPOCH_GRACE; for (i = 0, cr = NULL; i < CK_EPOCH_GRACE - 1; cr = NULL, i++) { bool r; /* * Determine whether all threads have observed the current * epoch with respect to the updates on invocation. */ while (cr = ck_epoch_scan(global, cr, delta, &active), cr != NULL) { unsigned int e_d; ck_pr_stall(); /* * Another writer may have already observed a grace * period. */ e_d = ck_pr_load_uint(&global->epoch); - if (e_d != delta) { - delta = e_d; - goto reload; + if (e_d == delta) { + epoch_block(global, cr, cb, ct); + continue; } + + /* + * If the epoch has been updated, we may have already + * met our goal. + */ + delta = e_d; + if ((goal > epoch) & (delta >= goal)) + goto leave; + + epoch_block(global, cr, cb, ct); + + /* + * If the epoch has been updated, then a grace period + * requires that all threads are observed idle at the + * same epoch. + */ + cr = NULL; } /* * If we have observed all threads as inactive, then we assume * we are at a grace period. */ if (active == false) break; /* * Increment current epoch. CAS semantics are used to eliminate * increment operations for synchronization that occurs for the * same global epoch value snapshot. * * If we can guarantee there will only be one active barrier or * epoch tick at a given time, then it is sufficient to use an * increment operation. In a multi-barrier workload, however, * it is possible to overflow the epoch value if we apply * modulo-3 arithmetic. */ r = ck_pr_cas_uint_value(&global->epoch, delta, delta + 1, &delta); /* Order subsequent thread active checks. */ ck_pr_fence_atomic_load(); /* * If CAS has succeeded, then set delta to latest snapshot. * Otherwise, we have just acquired latest snapshot. */ delta = delta + r; - continue; - -reload: - if ((goal > epoch) & (delta >= goal)) { - /* - * Right now, epoch overflow is handled as an edge - * case. If we have already observed an epoch - * generation, then we can be sure no hazardous - * references exist to objects from this generation. We - * can actually avoid an additional scan step at this - * point. - */ - break; - } } /* * A majority of use-cases will not require full barrier semantics. * However, if non-temporal instructions are used, full barrier * semantics are necessary.
*/ +leave: ck_pr_fence_memory(); - record->epoch = delta; return; } void +ck_epoch_synchronize(struct ck_epoch_record *record) +{ + + ck_epoch_synchronize_wait(record->global, NULL, NULL); + return; +} + +void ck_epoch_barrier(struct ck_epoch_record *record) { ck_epoch_synchronize(record); ck_epoch_reclaim(record); return; } +void +ck_epoch_barrier_wait(struct ck_epoch_record *record, ck_epoch_wait_cb_t *cb, + void *ct) +{ + + ck_epoch_synchronize_wait(record->global, cb, ct); + ck_epoch_reclaim(record); + return; +} + /* * It may be worth it to actually apply these deferral semantics to an epoch * that was observed at ck_epoch_call time. The problem is that the latter * would require a full fence. * * ck_epoch_call will dispatch to the latest epoch snapshot that was observed. * There are cases where it will fail to reclaim as early as it could. If this * becomes a problem, we could actually use a heap for epoch buckets but that * is far from ideal too. */ bool ck_epoch_poll(struct ck_epoch_record *record) { bool active; unsigned int epoch; - unsigned int snapshot; struct ck_epoch_record *cr = NULL; struct ck_epoch *global = record->global; epoch = ck_pr_load_uint(&global->epoch); /* Serialize epoch snapshots with respect to global epoch. */ ck_pr_fence_memory(); cr = ck_epoch_scan(global, cr, epoch, &active); if (cr != NULL) { record->epoch = epoch; return false; } /* We are at a grace period if all threads are inactive. */ if (active == false) { record->epoch = epoch; for (epoch = 0; epoch < CK_EPOCH_LENGTH; epoch++) ck_epoch_dispatch(record, epoch); return true; } /* If an active thread exists, rely on epoch observation. */ - if (ck_pr_cas_uint_value(&global->epoch, epoch, epoch + 1, - &snapshot) == false) { - record->epoch = snapshot; - } else { - record->epoch = epoch + 1; - } + (void)ck_pr_cas_uint(&global->epoch, epoch, epoch + 1); ck_epoch_dispatch(record, epoch + 1); return true; }
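Editor's note: finally, a sketch of how the non-blocking and blocking interfaces touched by this revision are typically combined by a consumer; writer_quiesce()/writer_teardown() are hypothetical names and assume a registered, thread-private record.

#include <ck_epoch.h>

/*
 * Opportunistic reclamation on the writer fast path: ck_epoch_poll() never
 * blocks and returns true when it was able to dispatch deferred callbacks.
 */
static void
writer_quiesce(ck_epoch_record_t *record)
{

	(void)ck_epoch_poll(record);
}

/*
 * Teardown: wait for a full grace period, run every pending callback, then
 * make the record available for recycling.  Its storage must remain valid.
 */
static void
writer_teardown(ck_epoch_record_t *record)
{

	ck_epoch_barrier(record);	/* ck_epoch_synchronize + ck_epoch_reclaim */
	ck_epoch_unregister(record);
}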