sys/kern/subr_smr.c
Show First 20 Lines • Show All 177 Lines • ▼ Show 20 Lines
 * the store buffers and any speculative loads that may violate our invariants.
 * Because these interrupts are not synchronized we must wait one additional
 * tick in the future to be certain that all processors have had their state
 * synchronized by an interrupt.
 *
 * This assumes that the clock interrupt will only be delayed by other causes
 * that will flush the store buffer or prevent access to the section protected
 * data.  For example, an idle processor, or a system management interrupt,
 * or a vm exit.
 */

jeff: This would have a bug if you acquired a spinlock and blocked the interrupt for 2 ticks without the processor flushing the store buffer.  This seems really unlikely but perhaps needs an assert or some other protection.

#define SMR_LAZY_GRACE          2
#define SMR_LAZY_INCR           (SMR_LAZY_GRACE * SMR_SEQ_INCR)

/*
 * The maximum sequence number ahead of wr_seq that may still be valid.  The
 * sequence may not be advanced on write for lazy or deferred SMRs.  In this
 * case poll needs to attempt to forward the sequence number if the goal is
 * within wr_seq + SMR_SEQ_ADVANCE.
 */
#define SMR_SEQ_ADVANCE         SMR_LAZY_INCR
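
The SMR_SEQ_ helpers used throughout this file (SMR_SEQ_INCR, SMR_SEQ_DELTA, SMR_SEQ_GT, and so on) live in sys/sys/smr.h rather than in this hunk.  As a reading aid only, the sketch below shows the wrap-safe, signed-delta style of comparison they are assumed to be built on; the stand-in names and exact definitions are illustrative, not the tree's.

/* --- illustrative sketch, not part of subr_smr.c --- */
#include <stdint.h>

typedef uint32_t xsmr_seq_t;            /* hypothetical stand-ins */
typedef int32_t  xsmr_delta_t;

#define XSMR_SEQ_DELTA(a, b)    ((xsmr_delta_t)((a) - (b)))
#define XSMR_SEQ_GT(a, b)       (XSMR_SEQ_DELTA(a, b) > 0)
#define XSMR_SEQ_LEQ(a, b)      (XSMR_SEQ_DELTA(a, b) <= 0)

/*
 * Because the comparison goes through a signed 32-bit delta, a goal just
 * past the unsigned wrap still compares as "newer" than a value just
 * before it, e.g. XSMR_SEQ_GT(0x00000002, 0xfffffffe) evaluates to 1.
 */
/* --- end sketch --- */
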
static SYSCTL_NODE(_debug, OID_AUTO, smr, CTLFLAG_RW, NULL, "SMR Stats");
static counter_u64_t advance = EARLY_COUNTER;
SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, advance, CTLFLAG_RW, &advance, "");
static counter_u64_t advance_wait = EARLY_COUNTER;
SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, advance_wait, CTLFLAG_RW, &advance_wait, "");
static counter_u64_t poll = EARLY_COUNTER;
SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, poll, CTLFLAG_RW, &poll, "");
static counter_u64_t poll_scan = EARLY_COUNTER;
SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, poll_scan, CTLFLAG_RW, &poll_scan, "");
static counter_u64_t poll_fail = EARLY_COUNTER;
SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, poll_fail, CTLFLAG_RW, &poll_fail, "");

/*
 * Advance a lazy write sequence number.  These move forward at the rate of
 * ticks.  Grace is SMR_LAZY_INCR (2 ticks) in the future.
 *
 * This returns the goal write sequence number.
 */
static smr_seq_t
smr_lazy_advance(smr_t smr, smr_shared_t s)
{
        union s_wr s_wr, old;
        int t, d;

        CRITICAL_ASSERT(curthread);

        /*
         * Load the stored ticks value before the current one.  This way the
         * current value can only be the same or larger.
         */
        old._pair = s_wr._pair = atomic_load_acq_64(&s->s_wr._pair);
        t = ticks;

        /*
         * The most probable condition that the update already took place.
         */
        d = t - s_wr.ticks;
        if (__predict_true(d == 0))
                goto out;
        /* Cap the rate of advancement and handle long idle periods. */
        if (d > SMR_LAZY_GRACE || d < 0)
                d = SMR_LAZY_GRACE;
        s_wr.ticks = t;
        s_wr.seq += d * SMR_SEQ_INCR;

        /*
         * This can only fail if another thread races to call advance().
         * Strong cmpset semantics mean we are guaranteed that the update
         * happened.
         */
        atomic_cmpset_64(&s->s_wr._pair, old._pair, s_wr._pair);

rlibby: (While looking at the generated code, I noticed amd64 atomic_cmpset forces us to populate a register with the return value of cmpset, even though we don't care.  I'll see what kib thinks of a possible fix for that...)

out:
        return (s_wr.seq + SMR_LAZY_INCR);
}
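
The s_wr pair manipulated above is declared in sys/sys/smr.h, outside this hunk.  The sketch below shows the assumed shape, purely to make the code readable here: the tick stamp and the write sequence share one 64-bit word, so the single atomic_cmpset_64() publishes both fields together.  Field names and ordering are assumptions.

/* --- illustrative sketch, not part of subr_smr.c --- */
#include <stdint.h>

typedef uint32_t xsmr_seq_t;            /* hypothetical stand-in */

union xs_wr {
        struct {
                xsmr_seq_t seq;         /* current write sequence */
                int        ticks;       /* tick of the last advance */
        };
        uint64_t _pair;                 /* both fields as one atomic word */
};
/* --- end sketch --- */
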
/*
 * Increment the shared write sequence by 2.  Since it is initialized
 * to 1 this means the only valid values are odd and an observed value
 * of 0 in a particular CPU means it is not currently in a read section.
 */
static smr_seq_t
smr_shared_advance(smr_shared_t s)
{

        return (atomic_fetchadd_int(&s->s_wr.seq, SMR_SEQ_INCR) + SMR_SEQ_INCR);
}
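
A quick standalone check of the invariant described in that comment: starting from an odd value and always stepping by 2, the sequence stays odd across the full unsigned wrap, so it can never collide with the invalid value 0.  The constant names below are stand-ins, not the smr.h definitions.

/* --- illustrative sketch, not part of subr_smr.c --- */
#include <assert.h>
#include <stdint.h>

#define XSMR_SEQ_INVALID        0u
#define XSMR_SEQ_INCR           2u

int
main(void)
{
        uint32_t seq = UINT32_MAX;      /* an odd value just below the wrap */
        int i;

        for (i = 0; i < 4; i++) {
                assert((seq & 1) != 0);                 /* always odd ... */
                assert(seq != XSMR_SEQ_INVALID);        /* ... never 0 */
                seq += XSMR_SEQ_INCR;                   /* 0xffffffff -> 1 -> 3 -> 5 */
        }
        return (0);
}
/* --- end sketch --- */
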
/*
 * Advance the write sequence number for a normal smr section.  If the
 * write sequence is too far behind the read sequence we have to poll
 * to advance rd_seq and prevent undetectable wraps.
 */
static smr_seq_t
▲ Show 20 Lines • Show All 44 Lines • ▼ Show 20 Lines
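
The "undetectable wraps" mentioned in the comment above are a limitation of the signed-delta comparison: once two sequence numbers drift 2^31 or more apart, the delta changes sign and a very stale value suddenly compares as newer.  A tiny standalone illustration, using the same stand-in macro as the earlier sketch:

/* --- illustrative sketch, not part of subr_smr.c --- */
#include <stdint.h>
#include <stdio.h>

#define XSMR_SEQ_GT(a, b)       ((int32_t)((a) - (b)) > 0)

int
main(void)
{
        uint32_t rd = 1;                        /* a very stale reader */
        uint32_t wr = 1 + (1u << 31) + 2;       /* writer more than 2^31 ahead */

        /* The delta has wrapped, so the stale reader now looks "newer": prints 1. */
        printf("XSMR_SEQ_GT(rd, wr) = %d\n", XSMR_SEQ_GT(rd, wr));
        return (0);
}
/* --- end sketch --- */
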
/*
 * Advance the write sequence and return the value for use as the
 * wait goal.  This guarantees that any changes made by the calling
 * thread prior to this call will be visible to all threads after
 * rd_seq meets or exceeds the return value.
 *
 * This function may busy loop if the readers are roughly 1 billion
 * sequence numbers behind the writers.
 *
 * Lazy SMRs will not busy loop and the wrap happens every 25 days
 * at 1khz and 60 hours at 10khz.  Readers can block for no longer
 * than half of this for SMR_SEQ_ macros to continue working.
 */
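
As an aside, the wrap figures quoted above check out if a lazy sequence advances by SMR_SEQ_INCR (2) once per tick, i.e. one trip around the 32-bit sequence space takes 2^31 ticks.  A back-of-envelope verification, assuming exactly that:

/* --- illustrative sketch, not part of subr_smr.c --- */
#include <stdio.h>

int
main(void)
{
        double ticks_per_wrap = 4294967296.0 / 2.0;     /* 2^32 / SMR_SEQ_INCR */

        printf("1 kHz:  %.1f days\n", ticks_per_wrap / 1000.0 / 86400.0);   /* ~24.9 */
        printf("10 kHz: %.1f hours\n", ticks_per_wrap / 10000.0 / 3600.0);  /* ~59.7 */
        return (0);
}
/* --- end sketch --- */
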
smr_seq_t
smr_advance(smr_t smr)
{
        smr_t self;
        smr_shared_t s;
        smr_seq_t goal;

▲ Show 20 Lines • Show All 114 Lines • ▼ Show 20 Lines | CPU_FOREACH(i) {
                if (c_seq != SMR_SEQ_INVALID)
                        rd_seq = SMR_SEQ_MIN(rd_seq, c_seq);
        }

        /*
         * Advance the rd_seq as long as we observed a more recent value.
         */
        s_rd_seq = atomic_load_int(&s->s_rd_seq);
        if (SMR_SEQ_GT(rd_seq, s_rd_seq)) {
                atomic_cmpset_int(&s->s_rd_seq, s_rd_seq, rd_seq);
                s_rd_seq = rd_seq;
        }

        return (s_rd_seq);
}
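
The cmpset result just above is deliberately ignored: if it fails, another scanner has already installed a value at least as recent, which serves the same purpose.  A minimal standalone sketch of that "publish only if it moves forward, losers do nothing" pattern, written with C11 atomics and made-up names rather than the kernel's atomic_cmpset_int():

/* --- illustrative sketch, not part of subr_smr.c --- */
#include <stdatomic.h>
#include <stdint.h>

/* Publish 'observed' only if it advances the shared read sequence. */
static void
publish_rd_seq(_Atomic uint32_t *shared, uint32_t observed)
{
        uint32_t cur = atomic_load_explicit(shared, memory_order_relaxed);

        /* Signed delta keeps the comparison wrap-safe (see the first sketch). */
        if ((int32_t)(observed - cur) > 0)
                /* A failure means another thread already advanced it. */
                atomic_compare_exchange_strong(shared, &cur, observed);
}
/* --- end sketch --- */
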
/*
 * Poll to determine whether all readers have observed the 'goal' write
 * sequence number.
 *
 * If wait is true this will spin until the goal is met.
 *
 * This routine will update the minimum observed read sequence number in
 * s_rd_seq if it does a scan.  It may not do a scan if another call has
 * advanced s_rd_seq beyond the caller's goal already.
 *
 * Returns true if the goal is met and false if not.
 */
bool
smr_poll(smr_t smr, smr_seq_t goal, bool wait)

jeff (author): I think ultimately poll will have some top-level driver function that handles the sleep/advance for deferred and lazy and this function can be simplified slightly.  So many polls don't require actual looping that it may be advantageous to make a small poll pre-check and return fast-path.

{
        smr_shared_t s;
        smr_t self;
        smr_seq_t s_wr_seq, s_rd_seq;
        smr_delta_t delta;
        int flags;
        bool success;

Show All 14 Lines | smr_poll(smr_t smr, smr_seq_t goal, bool wait)
        /* Attempt to load from self only once. */
        self = zpcpu_get(smr);
        s = self->c_shared;
        flags = self->c_flags;
        counter_u64_add_protected(poll, 1);

        /*
         * Conditionally advance the lazy write clock on any writer
         * activity.
         */
        if ((flags & SMR_LAZY) != 0)
                smr_lazy_advance(smr, s);

        /*
         * Acquire barrier loads s_wr_seq after s_rd_seq so that we can not
         * observe an updated read sequence that is larger than write.
         */
        s_rd_seq = atomic_load_acq_int(&s->s_rd_seq);

        /*
         * If we have already observed the sequence number we can immediately
         * return success.  Most polls should meet this criterion.
         */
        if (SMR_SEQ_LEQ(goal, s_rd_seq))
                goto out;

        /*
         * wr_seq must be loaded prior to any c_seq value so that a
         * stale c_seq can only reference time after this wr_seq.
         */
        s_wr_seq = atomic_load_acq_int(&s->s_wr.seq);

        /*
         * This is the distance from s_wr_seq to goal.  Positive values
         * are in the future.
         */
        delta = SMR_SEQ_DELTA(goal, s_wr_seq);

        /*
         * Detect a stale wr_seq.
         *
         * This goal may have come from a deferred advance or a lazy
         * smr.  If we are not blocking we can not succeed but the
         * sequence number is valid.
         */
        if (delta > 0 && delta <= SMR_SEQ_ADVANCE &&
            (flags & (SMR_LAZY | SMR_DEFERRED)) != 0) {
                if (!wait) {
                        success = false;
                        goto out;
                }
                /* LAZY is always !wait. */
                s_wr_seq = smr_shared_advance(s);
                delta = 0;
Show All 33 Lines | smr_create(const char *name, int limit, int flags)
        smr_t smr, c;
        smr_shared_t s;
        int i;

        s = uma_zalloc(smr_shared_zone, M_WAITOK);
        smr = uma_zalloc_pcpu(smr_zone, M_WAITOK);
        s->s_name = name;
        s->s_rd_seq = s->s_wr.seq = SMR_SEQ_INIT;
        s->s_wr.ticks = ticks;

        /* Initialize all CPUs, not just those running. */
        for (i = 0; i <= mp_maxid; i++) {
                c = zpcpu_get_cpu(smr, i);
                c->c_seq = SMR_SEQ_INVALID;
                c->c_shared = s;
                c->c_deferred = 0;
                c->c_limit = limit;
Show All 40 Lines
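
Finally, a rough usage sketch, not taken from this changeset, of how a writer might drive the interfaces reviewed above: smr_advance() stamps a goal after an object has been unlinked, and a later non-blocking smr_poll() decides whether it is now safe to free.  The obj structure, zone arguments, and malloc type below are invented for illustration.

/* --- illustrative sketch, not part of subr_smr.c --- */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/smr.h>

struct obj {
        struct obj      *next;
        smr_seq_t       free_seq;       /* goal recorded at retire time */
};

static smr_t obj_smr;

static void
obj_init(void)
{
        /* Assumed arguments: no deferral limit, default (non-lazy) flags. */
        obj_smr = smr_create("obj smr", 0, 0);
}

static void
obj_retire(struct obj *o)
{
        /* 'o' must already be unlinked from the SMR-protected structure. */
        o->free_seq = smr_advance(obj_smr);
}

static bool
obj_try_free(struct obj *o)
{
        /* Non-blocking poll; free only once all readers have moved past. */
        if (!smr_poll(obj_smr, o->free_seq, false))
                return (false);
        free(o, M_TEMP);
        return (true);
}
/* --- end sketch --- */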