Changeset View
Standalone View
sys/kern/subr_smr.c
| Show All 35 Lines | |||||
| #include <sys/proc.h> | #include <sys/proc.h> | ||||
| #include <sys/smp.h> | #include <sys/smp.h> | ||||
| #include <sys/smr.h> | #include <sys/smr.h> | ||||
| #include <sys/sysctl.h> | #include <sys/sysctl.h> | ||||
| #include <vm/uma.h> | #include <vm/uma.h> | ||||
| /* | /* | ||||
| * Global Unbounded Sequences (GUS) | |||||
| * | |||||
| * This is a novel safe memory reclamation technique inspired by | * This is a novel safe memory reclamation technique inspired by | ||||
| * epoch based reclamation from Samy Al Bahra's concurrency kit which | * epoch based reclamation from Samy Al Bahra's concurrency kit which | ||||
| * in turn was based on work described in: | * in turn was based on work described in: | ||||
| * Fraser, K. 2004. Practical Lock-Freedom. PhD Thesis, University | * Fraser, K. 2004. Practical Lock-Freedom. PhD Thesis, University | ||||
| * of Cambridge Computing Laboratory. | * of Cambridge Computing Laboratory. | ||||
| * And shares some similarities with: | * And shares some similarities with: | ||||
| * Wang, Stamler, Parmer. 2016 Parallel Sections: Scaling System-Level | * Wang, Stamler, Parmer. 2016 Parallel Sections: Scaling System-Level | ||||
| * Data-Structures | * Data-Structures | ||||
| * | * | ||||
| * This is not an implementation of hazard pointers or related | * This is not an implementation of hazard pointers or related | ||||
| * techniques. The term safe memory reclamation is used as a | * techniques. The term safe memory reclamation is used as a | ||||
| * generic descriptor for algorithms that defer frees to avoid | * generic descriptor for algorithms that defer frees to avoid | ||||
| * use-after-free errors with lockless datastructures. | * use-after-free errors with lockless datastructures or as | ||||
| * a mechanism to detect quiescence for writer synchronization. | |||||
| * | * | ||||
| * The basic approach is to maintain a monotonic write sequence | * The basic approach is to maintain a monotonic write sequence | ||||
| * number that is updated on some application defined granularity. | * number that is updated on some application defined granularity. | ||||
| * Readers record the most recent write sequence number they have | * Readers record the most recent write sequence number they have | ||||
| * observed. A shared read sequence number records the lowest | * observed. A shared read sequence number records the lowest | ||||
| * sequence number observed by any reader as of the last poll. Any | * sequence number observed by any reader as of the last poll. Any | ||||
| * write older than this value has been observed by all readers | * write older than this value has been observed by all readers | ||||
| * and memory can be reclaimed. Like Epoch we also detect idle | * and memory can be reclaimed. Like Epoch we also detect idle | ||||
| * readers by storing an invalid sequence number in the per-cpu | * readers by storing an invalid sequence number in the per-cpu | ||||
| * state when the read section exits. Like Parsec we establish | * state when the read section exits. Like Parsec we establish | ||||
| * a global write clock that is used to mark memory on free. | * a global write clock that is used to mark memory on free. | ||||
| * | * | ||||
| * The write and read sequence numbers can be thought of as a two | * The write and read sequence numbers can be thought of as a two | ||||
| * handed clock with readers always advancing towards writers. SMR | * handed clock with readers always advancing towards writers. GUS | ||||
| * maintains the invariant that all readers can safely access memory | * maintains the invariant that all readers can safely access memory | ||||
| * that was visible at the time they loaded their copy of the sequence | * that was visible at the time they loaded their copy of the sequence | ||||
| * number. Periodically the read sequence or hand is polled and | * number. Periodically the read sequence or hand is polled and | ||||
| * advanced as far towards the write sequence as active readers allow. | * advanced as far towards the write sequence as active readers allow. | ||||
| * Memory which was freed between the old and new global read sequence | * Memory which was freed between the old and new global read sequence | ||||
| * number can now be reclaimed. When the system is idle the two hands | * number can now be reclaimed. When the system is idle the two hands | ||||
| * meet and no deferred memory is outstanding. Readers never advance | * meet and no deferred memory is outstanding. Readers never advance | ||||
| * any sequence number, they only observe them. The shared read | * any sequence number, they only observe them. The shared read | ||||
| * sequence number is consequently never higher than the write sequence. | * sequence number is consequently never higher than the write sequence. | ||||
| * A stored sequence number that falls outside of this range has expired | * A stored sequence number that falls outside of this range has expired | ||||
| * and needs no scan to reclaim. | * and needs no scan to reclaim. | ||||
| * | * | ||||
| * A notable distinction between this SMR and Epoch, qsbr, rcu, etc. is | * A notable distinction between GUS and Epoch, qsbr, rcu, etc. is | ||||
| * that advancing the sequence number is decoupled from detecting its | * that advancing the sequence number is decoupled from detecting its | ||||
| * observation. This results in a more granular assignment of sequence | * observation. That is to say, the delta between read and write | ||||
| * sequence numbers is not bound. This can be thought of as a more | |||||
| * generalized form of epoch which requires them to be at most one step | |||||
| * apart. This results in a more granular assignment of sequence | |||||
| * numbers even as read latencies prohibit all or some expiration. | * numbers even as read latencies prohibit all or some expiration. | ||||
| * It also allows writers to advance the sequence number and save the | * It also allows writers to advance the sequence number and save the | ||||
| * poll for expiration until a later time when it is likely to | * poll for expiration until a later time when it is likely to | ||||
| * complete without waiting. The batch granularity and free-to-use | * complete without waiting. The batch granularity and free-to-use | ||||
| * latency is dynamic and can be significantly smaller than in more | * latency is dynamic and can be significantly smaller than in more | ||||
| * strict systems. | * strict systems. | ||||
| * | * | ||||
| * This mechanism is primarily intended to be used in coordination with | * This mechanism is primarily intended to be used in coordination with | ||||
| ▲ Show 20 Lines • Show All 65 Lines • ▼ Show 20 Lines | |||||
| /* We want to test the wrapping feature in invariants kernels. */ | /* We want to test the wrapping feature in invariants kernels. */ | ||||
| #define SMR_SEQ_INCR (UINT_MAX / 10000) | #define SMR_SEQ_INCR (UINT_MAX / 10000) | ||||
| #define SMR_SEQ_INIT (UINT_MAX - 100000) | #define SMR_SEQ_INIT (UINT_MAX - 100000) | ||||
| /* Force extra polls to test the integer overflow detection. */ | /* Force extra polls to test the integer overflow detection. */ | ||||
| #define SMR_SEQ_MAX_DELTA (SMR_SEQ_INCR * 32) | #define SMR_SEQ_MAX_DELTA (SMR_SEQ_INCR * 32) | ||||
| #define SMR_SEQ_MAX_ADVANCE SMR_SEQ_MAX_DELTA / 2 | #define SMR_SEQ_MAX_ADVANCE SMR_SEQ_MAX_DELTA / 2 | ||||
| #endif | #endif | ||||
| /* | |||||
| * The grace period for lazy (tick based) SMR. | |||||
| * | |||||
| * Hardclock is responsible for advancing ticks on a single CPU while every | |||||
| * CPU receives a regular clock interrupt. The clock interrupts are flushing | |||||
| * the store buffers and any speculative loads that may violate our invariants. | |||||
| * Because these interrupts are not synchronized we must wait one additional | |||||
| * tick in the future to be certain that all processors have had their state | |||||
| * synchronized by an interrupt. | |||||
| * | |||||
| * This assumes that the clock interrupt will only be delayed by other causes | |||||
| * that will flush the store buffer or prevent access to the section protected | |||||
| * data. For example, an idle processor, or a system management interrupt, | |||||
| * or a vm exit. | |||||
| * | |||||
| * We must wait one additional tick if we are around the wrap condition | |||||
| * because the write seq will move forward by two with one interrupt. | |||||
| */ | |||||
| #define SMR_LAZY_GRACE 2 | |||||
| #define SMR_LAZY_GRACE_MAX (SMR_LAZY_GRACE + 1) | |||||
| /* | |||||
| * The maximum sequence number ahead of wr_seq that may still be valid. The | |||||
| * sequence may not be advanced on write for lazy or deferred SMRs. In this | |||||
| * case poll needs to attempt to forward the sequence number if the goal is | |||||
| * within wr_seq + SMR_SEQ_ADVANCE. | |||||
| */ | |||||
| #define SMR_SEQ_ADVANCE MAX(SMR_SEQ_INCR, SMR_LAZY_GRACE_MAX) | |||||
rlibby: Unused... | |||||
| static SYSCTL_NODE(_debug, OID_AUTO, smr, CTLFLAG_RW, NULL, "SMR Stats"); | static SYSCTL_NODE(_debug, OID_AUTO, smr, CTLFLAG_RW, NULL, "SMR Stats"); | ||||
| static counter_u64_t advance = EARLY_COUNTER; | static counter_u64_t advance = EARLY_COUNTER; | ||||
| SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, advance, CTLFLAG_RD, &advance, ""); | SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, advance, CTLFLAG_RW, &advance, ""); | ||||
Not Done Inline ActionsWR? Did you mean RW, so they could be reset? rlibby: WR? Did you mean RW, so they could be reset? | |||||
| static counter_u64_t advance_wait = EARLY_COUNTER; | static counter_u64_t advance_wait = EARLY_COUNTER; | ||||
| SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, advance_wait, CTLFLAG_RD, &advance_wait, ""); | SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, advance_wait, CTLFLAG_RW, &advance_wait, ""); | ||||
| static counter_u64_t poll = EARLY_COUNTER; | static counter_u64_t poll = EARLY_COUNTER; | ||||
| SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, poll, CTLFLAG_RD, &poll, ""); | SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, poll, CTLFLAG_RW, &poll, ""); | ||||
| static counter_u64_t poll_scan = EARLY_COUNTER; | static counter_u64_t poll_scan = EARLY_COUNTER; | ||||
| SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, poll_scan, CTLFLAG_RD, &poll_scan, ""); | SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, poll_scan, CTLFLAG_RW, &poll_scan, ""); | ||||
| static counter_u64_t poll_fail = EARLY_COUNTER; | |||||
| SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, poll_fail, CTLFLAG_RW, &poll_fail, ""); | |||||
| /* | /* | ||||
| * Advance the write sequence and return the new value for use as the | * Advance a lazy write sequence number. These move forward at the rate of | ||||
| * wait goal. This guarantees that any changes made by the calling | * ticks. Grace is two ticks in the future. lazy write sequence numbers can | ||||
| * thread prior to this call will be visible to all threads after | * be even but not SMR_SEQ_INVALID so we pause time for a tick when we wrap. | ||||
| * rd_seq meets or exceeds the return value. | |||||
| * | * | ||||
| * This function may busy loop if the readers are roughly 1 billion | * This returns the _current_ write sequence number. The lazy goal sequence | ||||
| * sequence numbers behind the writers. | * number is SMR_LAZY_GRACE ticks ahead. | ||||
| */ | */ | ||||
| smr_seq_t | static smr_seq_t | ||||
| smr_advance(smr_t smr) | smr_lazy_advance(smr_t smr, smr_shared_t s) | ||||
| { | { | ||||
| smr_shared_t s; | smr_seq_t s_rd_seq, s_wr_seq, goal; | ||||
| smr_seq_t goal, s_rd_seq; | int t; | ||||
| CRITICAL_ASSERT(curthread); | |||||
| /* | /* | ||||
| * It is illegal to enter while in an smr section. | * Load s_wr_seq prior to ticks to ensure that the thread that | ||||
| * observes the largest value wins. | |||||
| */ | */ | ||||
| SMR_ASSERT_NOT_ENTERED(smr); | s_wr_seq = atomic_load_acq_int(&s->s_wr_seq); | ||||
| /* | /* | ||||
| * Modifications not done in a smr section need to be visible | * We must not allow a zero tick value. We go back in time one tick | ||||
| * before advancing the seq. | * and advance the grace period forward one tick around zero. | ||||
| */ | */ | ||||
| atomic_thread_fence_rel(); | t = ticks; | ||||
| if (t == SMR_SEQ_INVALID) | |||||
| t--; | |||||
Not Done Inline ActionsIsn't there an smr_lazy_advance() race here, where
and is it a problem? If so, I think we need to read ticks "after" s->s_wr_seq. I don't think we could just use SMR_SEQ_LEQ(t, s_wr_seq) as I think that would lead to the idle wrap problem. rlibby: Isn't there an smr_lazy_advance() race here, where
- A reads ticks=X
- B reads ticks=X+1… | |||||
Done Inline ActionsI think you are right. Originally I didn't consider this but it would happen if ticks was ahead of wr_seq on an otherwise idle smr. And an acq barrier with a rel on the cmpset should resolve it. sending wr_seq backwards and immediately polling would leave us considering a goal ticks calculated from the larger value as already expired. jeff: I think you are right. Originally I didn't consider this but it would happen if ticks was… | |||||
Done Inline ActionsI also wanted to add, I tried to write this as: if s->s_ticks == ticks return s_wr_seq I could not work out a way to do this safely with only barriers. I could use a 64bit cmpxchg. I could also use 64bit cmpxchg to fix s_rd_seq in a single atomic. I would prefer to keep the flexibility to make the counters 64bit instead though. I think the only s_rd_seq race below would harmlessly rewind it a little bit. If this were userspace and I could make fewer guarantees I would use the 64bit cmpxchg trick. jeff: I also wanted to add, I tried to write this as:
if s->s_ticks == ticks return s_wr_seq… | |||||
| /* | /* | ||||
| * Load the current read seq before incrementing the goal so | * The most probable condition that the update already took place. | ||||
| * we are guaranteed it is always < goal. | |||||
| */ | */ | ||||
| s = zpcpu_get(smr)->c_shared; | if (__predict_true(t == s_wr_seq)) | ||||
| s_rd_seq = atomic_load_acq_int(&s->s_rd_seq); | goto out; | ||||
| /* | /* | ||||
| * Increment the shared write sequence by 2. Since it is | * After long idle periods the read sequence may fall too far | ||||
| * initialized to 1 this means the only valid values are | * behind write. Prevent poll from ever seeing this condition | ||||
| * odd and an observed value of 0 in a particular CPU means | * by updating the stale rd_seq. This assumes that there can | ||||
| * it is not currently in a read section. | * be no valid section 2bn ticks old. The rd_seq update must | ||||
| * be visible before wr_seq to avoid races with other advance | |||||
| * callers. | |||||
| */ | */ | ||||
| goal = atomic_fetchadd_int(&s->s_wr_seq, SMR_SEQ_INCR) + SMR_SEQ_INCR; | s_rd_seq = atomic_load_int(&s->s_rd_seq); | ||||
| if (SMR_SEQ_GT(s_rd_seq, t)) | |||||
| atomic_cmpset_rel_int(&s->s_rd_seq, s_rd_seq, t); | |||||
| /* | |||||
| * Release to synchronize with the wr_seq load above. Ignore | |||||
| * cmpset failures from simultaneous updates. | |||||
| */ | |||||
| atomic_cmpset_rel_int(&s->s_wr_seq, s_wr_seq, t); | |||||
| counter_u64_add(advance, 1); | counter_u64_add(advance, 1); | ||||
| /* If we lost either update race another thread did it. */ | |||||
| s_wr_seq = t; | |||||
| out: | |||||
| goal = s_wr_seq + SMR_LAZY_GRACE; | |||||
| /* Skip over the SMR_SEQ_INVALID tick. */ | |||||
| if (goal < SMR_LAZY_GRACE) | |||||
| goal++; | |||||
| return (goal); | |||||
| } | |||||
| /* | /* | ||||
| * Increment the shared write sequence by 2. Since it is initialized | |||||
| * to 1 this means the only valid values are odd and an observed value | |||||
| * of 0 in a particular CPU means it is not currently in a read section. | |||||
| */ | |||||
| static smr_seq_t | |||||
| smr_shared_advance(smr_shared_t s) | |||||
| { | |||||
| return (atomic_fetchadd_int(&s->s_wr_seq, SMR_SEQ_INCR) + SMR_SEQ_INCR); | |||||
| } | |||||
| /* | |||||
| * Advance the write sequence number for a normal smr section. If the | |||||
| * write sequence is too far behind the read sequence we have to poll | |||||
| * to advance rd_seq and prevent undetectable wraps. | |||||
| */ | |||||
| static smr_seq_t | |||||
| smr_default_advance(smr_t smr, smr_shared_t s) | |||||
| { | |||||
| smr_seq_t goal, s_rd_seq; | |||||
| CRITICAL_ASSERT(curthread); | |||||
| KASSERT((zpcpu_get(smr)->c_flags & SMR_LAZY) == 0, | |||||
| ("smr_default_advance: called with lazy smr.")); | |||||
| /* | |||||
| * Load the current read seq before incrementing the goal so | |||||
| * we are guaranteed it is always < goal. | |||||
| */ | |||||
| s_rd_seq = atomic_load_acq_int(&s->s_rd_seq); | |||||
| goal = smr_shared_advance(s); | |||||
| /* | |||||
| * Force a synchronization here if the goal is getting too | * Force a synchronization here if the goal is getting too | ||||
| * far ahead of the read sequence number. This keeps the | * far ahead of the read sequence number. This keeps the | ||||
| * wrap detecting arithmetic working in pathological cases. | * wrap detecting arithmetic working in pathological cases. | ||||
| */ | */ | ||||
| if (SMR_SEQ_DELTA(goal, s_rd_seq) >= SMR_SEQ_MAX_DELTA) { | if (SMR_SEQ_DELTA(goal, s_rd_seq) >= SMR_SEQ_MAX_DELTA) { | ||||
| counter_u64_add(advance_wait, 1); | counter_u64_add(advance_wait, 1); | ||||
| smr_wait(smr, goal - SMR_SEQ_MAX_ADVANCE); | smr_wait(smr, goal - SMR_SEQ_MAX_ADVANCE); | ||||
| } | } | ||||
| counter_u64_add(advance, 1); | |||||
rlibbyUnsubmitted Not Done Inline ActionsWe grew a critical section around smr_default_advance (I assume you're okay with that...), so we can apply Mark's comments from the earlier review and s/counter_u64_add/counter_u64_add_protected/ (all three instances under advance). rlibby: We grew a critical section around smr_default_advance (I assume you're okay with that...), so… | |||||
jeffAuthorUnsubmitted Done Inline ActionsThat is a good point. I am a little uncertain as to whether advance will continue to use critical. I will evaluate this again once I decide. jeff: That is a good point. I am a little uncertain as to whether advance will continue to use… | |||||
| return (goal); | return (goal); | ||||
| } | } | ||||
| /* | |||||
| * Deferred SMRs conditionally update s_wr_seq based on a | |||||
| * cpu local interval count. | |||||
| */ | |||||
| static smr_seq_t | |||||
| smr_deferred_advance(smr_t smr, smr_shared_t s, smr_t self) | |||||
| { | |||||
| if (++self->c_deferred < self->c_limit) | |||||
| return (smr_shared_current(s) + SMR_SEQ_INCR); | |||||
| self->c_deferred = 0; | |||||
| return (smr_default_advance(smr, s)); | |||||
| } | |||||
| /* | |||||
| * Advance the write sequence and return the value for use as the | |||||
| * wait goal. This guarantees that any changes made by the calling | |||||
| * thread prior to this call will be visible to all threads after | |||||
| * rd_seq meets or exceeds the return value. | |||||
| * | |||||
| * This function may busy loop if the readers are roughly 1 billion | |||||
| * sequence numbers behind the writers. | |||||
| * | |||||
| * Lazy SMRs will not busy loop and the wrap happens every 49.6 days | |||||
| * at 1khz and 119 hours at 10khz. Readers can block for no longer | |||||
| * than half of this for SMR_SEQ_ macros to continue working. | |||||
| */ | |||||
| smr_seq_t | smr_seq_t | ||||
| smr_advance_deferred(smr_t smr, int limit) | smr_advance(smr_t smr) | ||||
| { | { | ||||
| smr_t self; | |||||
| smr_shared_t s; | |||||
| smr_seq_t goal; | smr_seq_t goal; | ||||
| smr_t csmr; | int flags; | ||||
| /* | |||||
| * It is illegal to enter while in an smr section. | |||||
| */ | |||||
| SMR_ASSERT_NOT_ENTERED(smr); | SMR_ASSERT_NOT_ENTERED(smr); | ||||
| /* | |||||
| * Modifications not done in a smr section need to be visible | |||||
| * before advancing the seq. | |||||
| */ | |||||
| atomic_thread_fence_rel(); | |||||
| critical_enter(); | critical_enter(); | ||||
| csmr = zpcpu_get(smr); | /* Try to touch the line once. */ | ||||
| if (++csmr->c_deferred >= limit) { | self = zpcpu_get(smr); | ||||
| s = self->c_shared; | |||||
| flags = self->c_flags; | |||||
| goal = SMR_SEQ_INVALID; | goal = SMR_SEQ_INVALID; | ||||
| csmr->c_deferred = 0; | if ((flags & (SMR_LAZY | SMR_DEFERRED)) == 0) | ||||
| } else | goal = smr_default_advance(smr, s); | ||||
| goal = smr_shared_current(csmr->c_shared) + SMR_SEQ_INCR; | else if ((flags & SMR_LAZY) != 0) | ||||
| goal = smr_lazy_advance(smr, s); | |||||
| else if ((flags & SMR_DEFERRED) != 0) | |||||
| goal = smr_deferred_advance(smr, s, self); | |||||
| critical_exit(); | critical_exit(); | ||||
| if (goal != SMR_SEQ_INVALID) | |||||
| return (goal); | return (goal); | ||||
| } | |||||
| return (smr_advance(smr)); | /* | ||||
| * Poll to determine the currently observed sequence number on a cpu | |||||
| * and spinwait if the 'wait' argument is true. | |||||
| */ | |||||
| static smr_seq_t | |||||
| smr_poll_cpu(smr_t c, smr_seq_t s_rd_seq, smr_seq_t goal, bool wait) | |||||
| { | |||||
| smr_seq_t c_seq; | |||||
| c_seq = SMR_SEQ_INVALID; | |||||
| for (;;) { | |||||
| c_seq = atomic_load_int(&c->c_seq); | |||||
| if (c_seq == SMR_SEQ_INVALID) | |||||
| break; | |||||
| /* | |||||
| * There is a race described in smr.h:smr_enter that | |||||
| * can lead to a stale seq value but not stale data | |||||
| * access. If we find a value out of range here we | |||||
| * pin it to the current min to prevent it from | |||||
| * advancing until that stale section has expired. | |||||
| * | |||||
| * The race is created when a cpu loads the s_wr_seq | |||||
| * value in a local register and then another thread | |||||
| * advances s_wr_seq and calls smr_poll() which will | |||||
| * observe no value yet in c_seq and advance s_rd_seq | |||||
| * up to s_wr_seq which is beyond the register | |||||
| * cached value. This is only likely to happen on | |||||
| * hypervisor or with a system management interrupt. | |||||
| */ | |||||
| if (SMR_SEQ_LT(c_seq, s_rd_seq)) | |||||
| c_seq = s_rd_seq; | |||||
| /* | |||||
| * If the sequence number meets the goal we are done | |||||
| * with this cpu. | |||||
| */ | |||||
| if (SMR_SEQ_LEQ(goal, c_seq)) | |||||
| break; | |||||
| if (!wait) | |||||
| break; | |||||
| cpu_spinwait(); | |||||
| } | } | ||||
| return (c_seq); | |||||
| } | |||||
| /* | /* | ||||
| * Loop until all cores have observed the goal sequence or have | |||||
| * gone inactive. Returns the oldest sequence currently active; | |||||
| * | |||||
| * This function assumes a snapshot of sequence values has | |||||
| * been obtained and validated by smr_poll(). | |||||
| */ | |||||
| static smr_seq_t | |||||
| smr_poll_scan(smr_t smr, smr_shared_t s, smr_seq_t s_rd_seq, | |||||
| smr_seq_t s_wr_seq, smr_seq_t goal, bool wait) | |||||
| { | |||||
| smr_seq_t rd_seq, c_seq; | |||||
| int i; | |||||
| CRITICAL_ASSERT(curthread); | |||||
| counter_u64_add_protected(poll_scan, 1); | |||||
| /* | |||||
| * The read sequence can be no larger than the write sequence at | |||||
| * the start of the poll. | |||||
| */ | |||||
| rd_seq = s_wr_seq; | |||||
| CPU_FOREACH(i) { | |||||
| /* | |||||
| * Query the active sequence on this cpu. If we're not | |||||
| * waiting and we don't meet the goal we will still scan | |||||
| * the rest of the cpus to update s_rd_seq before returning | |||||
| * failure. | |||||
| */ | |||||
| c_seq = smr_poll_cpu(zpcpu_get_cpu(smr, i), s_rd_seq, goal, | |||||
| wait); | |||||
| /* | |||||
| * Limit the minimum observed rd_seq whether we met the goal | |||||
| * or not. | |||||
| */ | |||||
| if (c_seq != SMR_SEQ_INVALID) | |||||
| rd_seq = SMR_SEQ_MIN(rd_seq, c_seq); | |||||
| } | |||||
| /* | |||||
| * Advance the rd_seq as long as we observed a more recent value. | |||||
| */ | |||||
| s_rd_seq = atomic_load_int(&s->s_rd_seq); | |||||
| if (SMR_SEQ_GEQ(rd_seq, s_rd_seq)) { | |||||
rlibbyUnsubmitted Not Done Inline ActionsGT rlibby: GT | |||||
| atomic_cmpset_int(&s->s_rd_seq, s_rd_seq, rd_seq); | |||||
| s_rd_seq = rd_seq; | |||||
| } | |||||
| return (s_rd_seq); | |||||
| } | |||||
| /* | |||||
| * Poll to determine whether all readers have observed the 'goal' write | * Poll to determine whether all readers have observed the 'goal' write | ||||
| * sequence number. | * sequence number. | ||||
| * | * | ||||
| * If wait is true this will spin until the goal is met. | * If wait is true this will spin until the goal is met. | ||||
| * | * | ||||
| * This routine will update the minimum observed read sequence number in | * This routine will update the minimum observed read sequence number in | ||||
| * s_rd_seq if it does a scan. It may not do a scan if another call has | * s_rd_seq if it does a scan. It may not do a scan if another call has | ||||
| * advanced s_rd_seq beyond the callers goal already. | * advanced s_rd_seq beyond the callers goal already. | ||||
| * | * | ||||
| * Returns true if the goal is met and false if not. | * Returns true if the goal is met and false if not. | ||||
| */ | */ | ||||
| bool | bool | ||||
| smr_poll(smr_t smr, smr_seq_t goal, bool wait) | smr_poll(smr_t smr, smr_seq_t goal, bool wait) | ||||
| { | { | ||||
| smr_shared_t s; | smr_shared_t s; | ||||
| smr_t c; | smr_t self; | ||||
| smr_seq_t s_wr_seq, s_rd_seq, rd_seq, c_seq; | smr_seq_t s_wr_seq, s_rd_seq; | ||||
| int i; | smr_delta_t delta; | ||||
| int flags; | |||||
| bool success; | bool success; | ||||
| /* | /* | ||||
| * It is illegal to enter while in an smr section. | * It is illegal to enter while in an smr section. | ||||
| */ | */ | ||||
| KASSERT(!wait || !SMR_ENTERED(smr), | KASSERT(!wait || !SMR_ENTERED(smr), | ||||
| ("smr_poll: Blocking not allowed in a SMR section.")); | ("smr_poll: Blocking not allowed in a SMR section.")); | ||||
| KASSERT(!wait || (zpcpu_get(smr)->c_flags & SMR_LAZY) == 0, | |||||
| ("smr_poll: Blocking not allowed on lazy smrs.")); | |||||
| /* | /* | ||||
| * Use a critical section so that we can avoid ABA races | * Use a critical section so that we can avoid ABA races | ||||
| * caused by long preemption sleeps. | * caused by long preemption sleeps. | ||||
| */ | */ | ||||
| success = true; | success = true; | ||||
| critical_enter(); | critical_enter(); | ||||
| s = zpcpu_get(smr)->c_shared; | /* Attempt to load from self only once. */ | ||||
| self = zpcpu_get(smr); | |||||
| s = self->c_shared; | |||||
| flags = self->c_flags; | |||||
| counter_u64_add_protected(poll, 1); | counter_u64_add_protected(poll, 1); | ||||
| /* | /* | ||||
| * Conditionally advance the lazy write clock on any writer | |||||
| * activity. This may reset s_rd_seq. | |||||
| */ | |||||
| if ((flags & SMR_LAZY) != 0) | |||||
| smr_lazy_advance(smr, s); | |||||
rlibbyUnsubmitted Not Done Inline ActionsI don't understand this. Shouldn't the writer have called smr_advance()? Where did the goal come from if not? rlibby: I don't understand this. Shouldn't the writer have called smr_advance()? Where did the goal… | |||||
jeffAuthorUnsubmitted Done Inline Actionslazy only increments wr_seq lazily after ticks does. The goal is always given as later than the current ticks. So it's always in the future. If no other writers follow wr_seq won't be advanced and the goal will never be satisfied. This makes sure the clock is always moving as long as someone cares. jeff: lazy only increments wr_seq lazily after ticks does. The goal is always given as later than… | |||||
| /* | |||||
| * Acquire barrier loads s_wr_seq after s_rd_seq so that we can not | * Acquire barrier loads s_wr_seq after s_rd_seq so that we can not | ||||
| * observe an updated read sequence that is larger than write. | * observe an updated read sequence that is larger than write. | ||||
| */ | */ | ||||
| s_rd_seq = atomic_load_acq_int(&s->s_rd_seq); | s_rd_seq = atomic_load_acq_int(&s->s_rd_seq); | ||||
| /* | /* | ||||
| * wr_seq must be loaded prior to any c_seq value so that a stale | * If we have already observed the sequence number we can immediately | ||||
| * c_seq can only reference time after this wr_seq. | * return success. Most polls should meet this criterion. | ||||
| */ | */ | ||||
| s_wr_seq = atomic_load_acq_int(&s->s_wr_seq); | if (SMR_SEQ_LEQ(goal, s_rd_seq)) | ||||
| goto out; | |||||
| /* | /* | ||||
| * This may have come from a deferred advance. Consider one | * wr_seq must be loaded prior to any c_seq value so that a | ||||
| * increment past the current wr_seq valid and make sure we | * stale c_seq can only reference time after this wr_seq. | ||||
| * have advanced far enough to succeed. We simply add to avoid | |||||
| * an additional fence. | |||||
| */ | */ | ||||
| if (goal == s_wr_seq + SMR_SEQ_INCR) { | s_wr_seq = atomic_load_acq_int(&s->s_wr_seq); | ||||
| atomic_add_int(&s->s_wr_seq, SMR_SEQ_INCR); | |||||
| s_wr_seq = goal; | |||||
| } | |||||
| /* | /* | ||||
| * Detect whether the goal is valid and has already been observed. | * This is the distance from s_wr_seq to goal. Positive values | ||||
| * | * are in the future. | ||||
| * The goal must be in the range of s_wr_seq >= goal >= s_rd_seq for | |||||
| * it to be valid. If it is not then the caller held on to it and | |||||
| * the integer wrapped. If we wrapped back within range the caller | |||||
| * will harmlessly scan. | |||||
| * | |||||
| * A valid goal must be greater than s_rd_seq or we have not verified | |||||
| * that it has been observed and must fall through to polling. | |||||
| */ | */ | ||||
| if (SMR_SEQ_GEQ(s_rd_seq, goal) || SMR_SEQ_LT(s_wr_seq, goal)) | delta = SMR_SEQ_DELTA(goal, s_wr_seq); | ||||
| goto out; | |||||
| /* | /* | ||||
| * Loop until all cores have observed the goal sequence or have | * Detect a stale wr_seq. | ||||
| * gone inactive. Keep track of the oldest sequence currently | |||||
| * active as rd_seq. | |||||
| */ | |||||
| counter_u64_add_protected(poll_scan, 1); | |||||
| rd_seq = s_wr_seq; | |||||
| CPU_FOREACH(i) { | |||||
| c = zpcpu_get_cpu(smr, i); | |||||
| c_seq = SMR_SEQ_INVALID; | |||||
| for (;;) { | |||||
| c_seq = atomic_load_int(&c->c_seq); | |||||
| if (c_seq == SMR_SEQ_INVALID) | |||||
| break; | |||||
| /* | |||||
| * There is a race described in smr.h:smr_enter that | |||||
| * can lead to a stale seq value but not stale data | |||||
| * access. If we find a value out of range here we | |||||
| * pin it to the current min to prevent it from | |||||
| * advancing until that stale section has expired. | |||||
| * | * | ||||
| * The race is created when a cpu loads the s_wr_seq | * This goal may have come from a deferred advance or a lazy | ||||
| * value in a local register and then another thread | * smr. If we are not blocking we can not succeed but the | ||||
| * advances s_wr_seq and calls smr_poll() which will | * sequence number is valid. | ||||
| * observe no value yet in c_seq and advance s_rd_seq | |||||
| * up to s_wr_seq which is beyond the register | |||||
| * cached value. This is only likely to happen on | |||||
| * hypervisor or with a system management interrupt. | |||||
| */ | */ | ||||
| if (SMR_SEQ_LT(c_seq, s_rd_seq)) | if (delta > 0 && delta <= SMR_SEQ_MAX_ADVANCE && | ||||
rlibbyUnsubmitted Not Done Inline Actionss/SMR_SEQ_MAX_ADVANCE/SMR_SEQ_ADVANCE/ ? rlibby: s/SMR_SEQ_MAX_ADVANCE/SMR_SEQ_ADVANCE/ ? | |||||
jeffAuthorUnsubmitted Done Inline ActionsYes thank you. jeff: Yes thank you. | |||||
| c_seq = s_rd_seq; | (flags & (SMR_LAZY | SMR_DEFERRED)) != 0) { | ||||
| /* | |||||
| * If the sequence number meets the goal we are | |||||
| * done with this cpu. | |||||
| */ | |||||
| if (SMR_SEQ_GEQ(c_seq, goal)) | |||||
| break; | |||||
| /* | |||||
| * If we're not waiting we will still scan the rest | |||||
| * of the cpus and update s_rd_seq before returning | |||||
| * an error. | |||||
| */ | |||||
| if (!wait) { | if (!wait) { | ||||
| success = false; | success = false; | ||||
| break; | goto out; | ||||
| } | } | ||||
| cpu_spinwait(); | /* LAZY is always !wait. */ | ||||
| s_wr_seq = smr_shared_advance(s); | |||||
rlibbyUnsubmitted Not Done Inline ActionsHmm. I think perhaps this comment should be expanded. I think what you're saying is, for SMR_DEFERRED, we can only be behind a valid goal by SMR_SEQ_INCR. For SMR_LAZY, we can be further behind, but we never have to catch up because we're always !wait. Is that right? Because it does seem possible that we drop out of this with goal > s_wr_seq for the SMR_LAZY case. rlibby: Hmm. I think perhaps this comment should be expanded. I think what you're saying is, for… | |||||
jeffAuthorUnsubmitted Done Inline ActionsKASSERT(!wait || (zpcpu_get(smr)->c_flags & SMR_LAZY) == 0, ("smr_poll: Blocking not allowed on lazy smrs."));There is an assert at the top of the function. I'm considering a way to decouple the wr_seq from directly using ticks that would enable me to use IPI to advance if there is a blocking wait for LAZY. This may impact the critcal sections in advance. jeff: KASSERT(!wait || (zpcpu_get(smr)->c_flags & SMR_LAZY) == 0,
("smr_poll: Blocking not… | |||||
| delta = 0; | |||||
| } | } | ||||
| /* | /* | ||||
| * Limit the minimum observed rd_seq whether we met the goal | * Detect an invalid goal. | ||||
| * or not. | * | ||||
| * The goal must be in the range of s_wr_seq >= goal >= s_rd_seq for | |||||
| * it to be valid. If it is not then the caller held on to it and | |||||
| * the integer wrapped. If we wrapped back within range the caller | |||||
| * will harmlessly scan. | |||||
| */ | */ | ||||
| if (c_seq != SMR_SEQ_INVALID && SMR_SEQ_GT(rd_seq, c_seq)) | if (delta > 0) | ||||
| rd_seq = c_seq; | |||||
| } | |||||
| /* | |||||
| * Advance the rd_seq as long as we observed the most recent one. | |||||
| */ | |||||
| s_rd_seq = atomic_load_int(&s->s_rd_seq); | |||||
| do { | |||||
| if (SMR_SEQ_LEQ(rd_seq, s_rd_seq)) | |||||
| goto out; | goto out; | ||||
| } while (atomic_fcmpset_int(&s->s_rd_seq, &s_rd_seq, rd_seq) == 0); | |||||
| /* Determine the lowest visible sequence number. */ | |||||
| s_rd_seq = smr_poll_scan(smr, s, s_rd_seq, s_wr_seq, goal, wait); | |||||
| success = SMR_SEQ_LEQ(goal, s_rd_seq); | |||||
| out: | out: | ||||
| if (!success) | |||||
| counter_u64_add_protected(poll_fail, 1); | |||||
| critical_exit(); | critical_exit(); | ||||
| /* | /* | ||||
| * Serialize with smr_advance()/smr_exit(). The caller is now free | * Serialize with smr_advance()/smr_exit(). The caller is now free | ||||
| * to modify memory as expected. | * to modify memory as expected. | ||||
| */ | */ | ||||
| atomic_thread_fence_acq(); | atomic_thread_fence_acq(); | ||||
| return (success); | return (success); | ||||
| } | } | ||||
| smr_t | smr_t | ||||
| smr_create(const char *name) | smr_create(const char *name, int limit, int flags) | ||||
| { | { | ||||
| smr_t smr, c; | smr_t smr, c; | ||||
| smr_shared_t s; | smr_shared_t s; | ||||
| int i; | int i; | ||||
| s = uma_zalloc(smr_shared_zone, M_WAITOK); | s = uma_zalloc(smr_shared_zone, M_WAITOK); | ||||
| smr = uma_zalloc_pcpu(smr_zone, M_WAITOK); | smr = uma_zalloc_pcpu(smr_zone, M_WAITOK); | ||||
| s->s_name = name; | s->s_name = name; | ||||
| if ((flags & SMR_LAZY) == 0) | |||||
| s->s_rd_seq = s->s_wr_seq = SMR_SEQ_INIT; | s->s_rd_seq = s->s_wr_seq = SMR_SEQ_INIT; | ||||
| else | |||||
| s->s_rd_seq = s->s_wr_seq = ticks; | |||||
| /* Initialize all CPUS, not just those running. */ | /* Initialize all CPUS, not just those running. */ | ||||
| for (i = 0; i <= mp_maxid; i++) { | for (i = 0; i <= mp_maxid; i++) { | ||||
| c = zpcpu_get_cpu(smr, i); | c = zpcpu_get_cpu(smr, i); | ||||
| c->c_seq = SMR_SEQ_INVALID; | c->c_seq = SMR_SEQ_INVALID; | ||||
| c->c_shared = s; | c->c_shared = s; | ||||
| c->c_deferred = 0; | |||||
| c->c_limit = limit; | |||||
| c->c_flags = flags; | |||||
| } | } | ||||
| atomic_thread_fence_seq_cst(); | atomic_thread_fence_seq_cst(); | ||||
| return (smr); | return (smr); | ||||
| } | } | ||||
| void | void | ||||
| smr_destroy(smr_t smr) | smr_destroy(smr_t smr) | ||||
| Show All 20 Lines | |||||
| static void | static void | ||||
| smr_init_counters(void *unused) | smr_init_counters(void *unused) | ||||
| { | { | ||||
| advance = counter_u64_alloc(M_WAITOK); | advance = counter_u64_alloc(M_WAITOK); | ||||
| advance_wait = counter_u64_alloc(M_WAITOK); | advance_wait = counter_u64_alloc(M_WAITOK); | ||||
| poll = counter_u64_alloc(M_WAITOK); | poll = counter_u64_alloc(M_WAITOK); | ||||
| poll_scan = counter_u64_alloc(M_WAITOK); | poll_scan = counter_u64_alloc(M_WAITOK); | ||||
| poll_fail = counter_u64_alloc(M_WAITOK); | |||||
| } | } | ||||
| SYSINIT(smr_counters, SI_SUB_CPU, SI_ORDER_ANY, smr_init_counters, NULL); | SYSINIT(smr_counters, SI_SUB_CPU, SI_ORDER_ANY, smr_init_counters, NULL); | ||||
Unused...