sys/kern/subr_smr.c
Show First 20 Lines • Show All 177 Lines • ▼ Show 20 Lines
 * the store buffers and any speculative loads that may violate our invariants.
 * Because these interrupts are not synchronized we must wait one additional
 * tick in the future to be certain that all processors have had their state
 * synchronized by an interrupt.
 *
 * This assumes that the clock interrupt will only be delayed by other causes
 * that will flush the store buffer or prevent access to the section protected
 * data.  For example, an idle processor, or a system management interrupt,
 * or a vm exit.
 */

jeff: This would have a bug if you acquired a spinlock and blocked the interrupt for 2 ticks without the processor flushing the store buffer.  This seems really unlikely but perhaps needs an assert or some other protection.

#define SMR_LAZY_GRACE          2
#define SMR_LAZY_INCR           (SMR_LAZY_GRACE * SMR_SEQ_INCR)

/*
 * The maximum sequence number ahead of wr_seq that may still be valid.  The
 * sequence may not be advanced on write for lazy or deferred SMRs.  In this
 * case poll needs to attempt to forward the sequence number if the goal is
 * within wr_seq + SMR_SEQ_ADVANCE.
 */
#define SMR_SEQ_ADVANCE         SMR_LAZY_INCR
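
The SMR_SEQ_ helpers used throughout this file (SMR_SEQ_INCR, SMR_SEQ_DELTA, SMR_SEQ_GT, and so on) live in sys/sys/smr.h rather than in this hunk.  As a reading aid only, the sketch below shows the wrap-safe, signed-delta style of comparison they are assumed to be built on; the stand-in names and exact definitions are illustrative, not the tree's.

/* --- illustrative sketch, not part of subr_smr.c --- */
#include <stdint.h>

typedef uint32_t xsmr_seq_t;            /* hypothetical stand-ins */
typedef int32_t  xsmr_delta_t;

#define XSMR_SEQ_DELTA(a, b)    ((xsmr_delta_t)((a) - (b)))
#define XSMR_SEQ_GT(a, b)       (XSMR_SEQ_DELTA(a, b) > 0)
#define XSMR_SEQ_LEQ(a, b)      (XSMR_SEQ_DELTA(a, b) <= 0)

/*
 * Because the comparison goes through a signed 32-bit delta, a goal just
 * past the unsigned wrap still compares as "newer" than a value just
 * before it, e.g. XSMR_SEQ_GT(0x00000002, 0xfffffffe) evaluates to 1.
 */
/* --- end sketch --- */
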
static SYSCTL_NODE(_debug, OID_AUTO, smr, CTLFLAG_RW, NULL, "SMR Stats");
static counter_u64_t advance = EARLY_COUNTER;
SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, advance, CTLFLAG_RW, &advance, "");
static counter_u64_t advance_wait = EARLY_COUNTER;
SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, advance_wait, CTLFLAG_RW, &advance_wait, "");
static counter_u64_t poll = EARLY_COUNTER;
SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, poll, CTLFLAG_RW, &poll, "");
static counter_u64_t poll_scan = EARLY_COUNTER;
SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, poll_scan, CTLFLAG_RW, &poll_scan, "");
static counter_u64_t poll_fail = EARLY_COUNTER;
SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, poll_fail, CTLFLAG_RW, &poll_fail, "");

/*
 * Advance a lazy write sequence number.  These move forward at the rate of
 * ticks.  Grace is SMR_LAZY_INCR (2 ticks) in the future.
 *
 * This returns the goal write sequence number.
 */
static smr_seq_t
smr_lazy_advance(smr_t smr, smr_shared_t s)
{
        union s_wr s_wr, old;
        int t, d;

        CRITICAL_ASSERT(curthread);

        /*
         * Load the stored ticks value before the current one.  This way the
         * current value can only be the same or larger.
         */
        old._pair = s_wr._pair = atomic_load_acq_64(&s->s_wr._pair);
        t = ticks;

        /*
         * The most probable condition that the update already took place.
         */
        d = t - s_wr.ticks;
        if (__predict_true(d == 0))
                goto out;
        /* Cap the rate of advancement and handle long idle periods. */
        if (d > SMR_LAZY_GRACE || d < 0)
                d = SMR_LAZY_GRACE;
        s_wr.ticks = t;
        s_wr.seq += d * SMR_SEQ_INCR;

        /*
         * This can only fail if another thread races to call advance().
         * Strong cmpset semantics mean we are guaranteed that the update
         * happened.
         */
        atomic_cmpset_64(&s->s_wr._pair, old._pair, s_wr._pair);

rlibby: (While looking at the generated code, I noticed amd64 atomic_cmpset forces us to populate a register with the return value of cmpset, even though we don't care.  I'll see what kib thinks of a possible fix for that...)

out:
        return (s_wr.seq + SMR_LAZY_INCR);
}
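
The s_wr pair manipulated above is declared in sys/sys/smr.h, outside this hunk.  The sketch below shows the assumed shape, purely to make the code readable here: the tick stamp and the write sequence share one 64-bit word, so the single atomic_cmpset_64() publishes both fields together.  Field names and ordering are assumptions.

/* --- illustrative sketch, not part of subr_smr.c --- */
#include <stdint.h>

typedef uint32_t xsmr_seq_t;            /* hypothetical stand-in */

union xs_wr {
        struct {
                xsmr_seq_t seq;         /* current write sequence */
                int        ticks;       /* tick of the last advance */
        };
        uint64_t _pair;                 /* both fields as one atomic word */
};
/* --- end sketch --- */
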
/*
 * Increment the shared write sequence by 2.  Since it is initialized
 * to 1 this means the only valid values are odd and an observed value
 * of 0 in a particular CPU means it is not currently in a read section.
 */
static smr_seq_t
smr_shared_advance(smr_shared_t s)
{

        return (atomic_fetchadd_int(&s->s_wr.seq, SMR_SEQ_INCR) + SMR_SEQ_INCR);
}
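
A quick standalone check of the invariant described in that comment: starting from an odd value and always stepping by 2, the sequence stays odd across the full unsigned wrap, so it can never collide with the invalid value 0.  The constant names below are stand-ins, not the smr.h definitions.

/* --- illustrative sketch, not part of subr_smr.c --- */
#include <assert.h>
#include <stdint.h>

#define XSMR_SEQ_INVALID        0u
#define XSMR_SEQ_INCR           2u

int
main(void)
{
        uint32_t seq = UINT32_MAX;      /* an odd value just below the wrap */
        int i;

        for (i = 0; i < 4; i++) {
                assert((seq & 1) != 0);                 /* always odd ... */
                assert(seq != XSMR_SEQ_INVALID);        /* ... never 0 */
                seq += XSMR_SEQ_INCR;                   /* 0xffffffff -> 1 -> 3 -> 5 */
        }
        return (0);
}
/* --- end sketch --- */
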
/*
 * Advance the write sequence number for a normal smr section.  If the
 * write sequence is too far behind the read sequence we have to poll
 * to advance rd_seq and prevent undetectable wraps.
 */
static smr_seq_t
▲ Show 20 Lines • Show All 44 Lines • ▼ Show 20 Lines
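
The "undetectable wraps" mentioned in the comment above are a limitation of the signed-delta comparison: once two sequence numbers drift 2^31 or more apart, the delta changes sign and a very stale value suddenly compares as newer.  A tiny standalone illustration, using the same stand-in macro as the earlier sketch:

/* --- illustrative sketch, not part of subr_smr.c --- */
#include <stdint.h>
#include <stdio.h>

#define XSMR_SEQ_GT(a, b)       ((int32_t)((a) - (b)) > 0)

int
main(void)
{
        uint32_t rd = 1;                        /* a very stale reader */
        uint32_t wr = 1 + (1u << 31) + 2;       /* writer more than 2^31 ahead */

        /* The delta has wrapped, so the stale reader now looks "newer": prints 1. */
        printf("XSMR_SEQ_GT(rd, wr) = %d\n", XSMR_SEQ_GT(rd, wr));
        return (0);
}
/* --- end sketch --- */
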
/*
 * Advance the write sequence and return the value for use as the
 * wait goal.  This guarantees that any changes made by the calling
 * thread prior to this call will be visible to all threads after
 * rd_seq meets or exceeds the return value.
 *
 * This function may busy loop if the readers are roughly 1 billion
 * sequence numbers behind the writers.
 *
 * Lazy SMRs will not busy loop and the wrap happens every 25 days
 * at 1khz and 60 hours at 10khz.  Readers can block for no longer
 * than half of this for SMR_SEQ_ macros to continue working.
 */
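
As an aside, the wrap figures quoted above check out if a lazy sequence advances by SMR_SEQ_INCR (2) once per tick, i.e. one trip around the 32-bit sequence space takes 2^31 ticks.  A back-of-envelope verification, assuming exactly that:

/* --- illustrative sketch, not part of subr_smr.c --- */
#include <stdio.h>

int
main(void)
{
        double ticks_per_wrap = 4294967296.0 / 2.0;     /* 2^32 / SMR_SEQ_INCR */

        printf("1 kHz:  %.1f days\n", ticks_per_wrap / 1000.0 / 86400.0);   /* ~24.9 */
        printf("10 kHz: %.1f hours\n", ticks_per_wrap / 10000.0 / 3600.0);  /* ~59.7 */
        return (0);
}
/* --- end sketch --- */
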
smr_seq_t
smr_advance(smr_t smr)
{
        smr_t self;
        smr_shared_t s;
        smr_seq_t goal;

▲ Show 20 Lines • Show All 114 Lines • ▼ Show 20 Lines | CPU_FOREACH(i) {
                if (c_seq != SMR_SEQ_INVALID)
                        rd_seq = SMR_SEQ_MIN(rd_seq, c_seq);
        }

        /*
         * Advance the rd_seq as long as we observed a more recent value.
         */
        s_rd_seq = atomic_load_int(&s->s_rd_seq);
        if (SMR_SEQ_GT(rd_seq, s_rd_seq)) {
                atomic_cmpset_int(&s->s_rd_seq, s_rd_seq, rd_seq);
                s_rd_seq = rd_seq;
        }

        return (s_rd_seq);
}
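
The cmpset result just above is deliberately ignored: if it fails, another scanner has already installed a value at least as recent, which serves the same purpose.  A minimal standalone sketch of that "publish only if it moves forward, losers do nothing" pattern, written with C11 atomics and made-up names rather than the kernel's atomic_cmpset_int():

/* --- illustrative sketch, not part of subr_smr.c --- */
#include <stdatomic.h>
#include <stdint.h>

/* Publish 'observed' only if it advances the shared read sequence. */
static void
publish_rd_seq(_Atomic uint32_t *shared, uint32_t observed)
{
        uint32_t cur = atomic_load_explicit(shared, memory_order_relaxed);

        /* Signed delta keeps the comparison wrap-safe (see the first sketch). */
        if ((int32_t)(observed - cur) > 0)
                /* A failure means another thread already advanced it. */
                atomic_compare_exchange_strong(shared, &cur, observed);
}
/* --- end sketch --- */
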
/*
 * Poll to determine whether all readers have observed the 'goal' write
 * sequence number.
 *
 * If wait is true this will spin until the goal is met.
 *
 * This routine will update the minimum observed read sequence number in
 * s_rd_seq if it does a scan.  It may not do a scan if another call has
 * advanced s_rd_seq beyond the caller's goal already.
 *
 * Returns true if the goal is met and false if not.
 */
bool
smr_poll(smr_t smr, smr_seq_t goal, bool wait)

jeff (author): I think ultimately poll will have some top-level driver function that handles the sleep/advance for deferred and lazy and this function can be simplified slightly.  So many polls don't require actual looping that it may be advantageous to make a small poll pre-check and return fast-path.

{
        smr_shared_t s;
        smr_t self;
        smr_seq_t s_wr_seq, s_rd_seq;
        smr_delta_t delta;
        int flags;
        bool success;

Show All 14 Lines | smr_poll(smr_t smr, smr_seq_t goal, bool wait)
        /* Attempt to load from self only once. */
        self = zpcpu_get(smr);
        s = self->c_shared;
        flags = self->c_flags;
        counter_u64_add_protected(poll, 1);

        /*
         * Conditionally advance the lazy write clock on any writer
         * activity.
         */
        if ((flags & SMR_LAZY) != 0)
                smr_lazy_advance(smr, s);

        /*
         * Acquire barrier loads s_wr_seq after s_rd_seq so that we can not
         * observe an updated read sequence that is larger than write.
         */
        s_rd_seq = atomic_load_acq_int(&s->s_rd_seq);

        /*
         * If we have already observed the sequence number we can immediately
         * return success.  Most polls should meet this criterion.
         */
        if (SMR_SEQ_LEQ(goal, s_rd_seq))
                goto out;

        /*
         * wr_seq must be loaded prior to any c_seq value so that a
         * stale c_seq can only reference time after this wr_seq.
         */
        s_wr_seq = atomic_load_acq_int(&s->s_wr.seq);

        /*
         * This is the distance from s_wr_seq to goal.  Positive values
         * are in the future.
         */
        delta = SMR_SEQ_DELTA(goal, s_wr_seq);

        /*
         * Detect a stale wr_seq.
         *
         * This goal may have come from a deferred advance or a lazy
         * smr.  If we are not blocking we can not succeed but the
         * sequence number is valid.
         */
        if (delta > 0 && delta <= SMR_SEQ_ADVANCE &&
            (flags & (SMR_LAZY | SMR_DEFERRED)) != 0) {
                if (!wait) {
                        success = false;
                        goto out;
                }
                /* LAZY is always !wait. */
                s_wr_seq = smr_shared_advance(s);
                delta = 0;
Show All 33 Lines | smr_create(const char *name, int limit, int flags)
        smr_t smr, c;
        smr_shared_t s;
        int i;

        s = uma_zalloc(smr_shared_zone, M_WAITOK);
        smr = uma_zalloc_pcpu(smr_zone, M_WAITOK);
        s->s_name = name;
        s->s_rd_seq = s->s_wr.seq = SMR_SEQ_INIT;
        s->s_wr.ticks = ticks;

        /* Initialize all CPUs, not just those running. */
        for (i = 0; i <= mp_maxid; i++) {
                c = zpcpu_get_cpu(smr, i);
                c->c_seq = SMR_SEQ_INVALID;
                c->c_shared = s;
                c->c_deferred = 0;
                c->c_limit = limit;
Show All 40 Lines
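
Finally, a rough usage sketch, not taken from this changeset, of how a writer might drive the interfaces reviewed above: smr_advance() stamps a goal after an object has been unlinked, and a later non-blocking smr_poll() decides whether it is now safe to free.  The obj structure, zone arguments, and malloc type below are invented for illustration.

/* --- illustrative sketch, not part of subr_smr.c --- */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/smr.h>

struct obj {
        struct obj      *next;
        smr_seq_t       free_seq;       /* goal recorded at retire time */
};

static smr_t obj_smr;

static void
obj_init(void)
{
        /* Assumed arguments: no deferral limit, default (non-lazy) flags. */
        obj_smr = smr_create("obj smr", 0, 0);
}

static void
obj_retire(struct obj *o)
{
        /* 'o' must already be unlinked from the SMR-protected structure. */
        o->free_seq = smr_advance(obj_smr);
}

static bool
obj_try_free(struct obj *o)
{
        /* Non-blocking poll; free only once all readers have moved past. */
        if (!smr_poll(obj_smr, o->free_seq, false))
                return (false);
        free(o, M_TEMP);
        return (true);
}
/* --- end sketch --- */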