Page Menu
Home
FreeBSD
Search
Configure Global Search
Log In
Files
F146204474
D23738.id68514.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Flag For Later
Award Token
Size
27 KB
Referenced Files
None
Subscribers
None
D23738.id68514.diff
View Options
Index: sys/kern/subr_smr.c
===================================================================
--- sys/kern/subr_smr.c
+++ sys/kern/subr_smr.c
@@ -164,58 +164,133 @@
#define SMR_SEQ_MAX_ADVANCE SMR_SEQ_MAX_DELTA / 2
#endif
+/*
+ * The grace period for lazy (tick based) SMR.
+ *
+ * Hardclock is responsible for advancing ticks on a single CPU while every
+ * CPU receives a regular clock interrupt. The clock interrupts are flushing
+ * the store buffers and any speculative loads that may violate our invariants.
+ * Because these interrupts are not synchronized we must wait one additional
+ * tick in the future to be certain that all processors have had their state
+ * synchronized by an interrupt.
+ *
+ * This assumes that the clock interrupt will only be delayed by other causes
+ * that will flush the store buffer or prevent access to the section protected
+ * data. For example, an idle processor, or a system management interrupt,
+ * or a vm exit.
+ *
+ * We must wait one additional tick if we are around the wrap condition
+ * because the write seq will move forward by two with one interrupt.
+ */
+#define SMR_LAZY_GRACE 2
+#define SMR_LAZY_GRACE_MAX (SMR_LAZY_GRACE + 1)
+
+/*
+ * The maximum sequence number ahead of wr_seq that may still be valid. The
+ * sequence may not be advanced on write for lazy or deferred SMRs. In this
+ * case poll needs to attempt to forward the sequence number if the goal is
+ * within wr_seq + SMR_SEQ_ADVANCE.
+ */
+#define SMR_SEQ_ADVANCE MAX(SMR_SEQ_INCR, SMR_LAZY_GRACE_MAX)
+
static SYSCTL_NODE(_debug, OID_AUTO, smr, CTLFLAG_RW, NULL, "SMR Stats");
static counter_u64_t advance = EARLY_COUNTER;
-SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, advance, CTLFLAG_RD, &advance, "");
+SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, advance, CTLFLAG_RD, &advance, "");
static counter_u64_t advance_wait = EARLY_COUNTER;
-SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, advance_wait, CTLFLAG_RD, &advance_wait, "");
+SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, advance_wait, CTLFLAG_RD, &advance_wait, "");
static counter_u64_t poll = EARLY_COUNTER;
-SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, poll, CTLFLAG_RD, &poll, "");
+SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, poll, CTLFLAG_RD, &poll, "");
static counter_u64_t poll_scan = EARLY_COUNTER;
-SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, poll_scan, CTLFLAG_RD, &poll_scan, "");
-
+SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, poll_scan, CTLFLAG_RD, &poll_scan, "");
+static counter_u64_t poll_fail = EARLY_COUNTER;
+SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, poll_fail, CTLFLAG_RD, &poll_fail, "");
/*
- * Advance the write sequence and return the new value for use as the
- * wait goal. This guarantees that any changes made by the calling
- * thread prior to this call will be visible to all threads after
- * rd_seq meets or exceeds the return value.
+ * Advance a lazy write sequence number. These move forward at the rate of
+ * ticks. Grace is two ticks in the future. Lazy write sequence numbers can
+ * be even but not SMR_SEQ_INVALID so we pause time for a tick when we wrap.
*
- * This function may busy loop if the readers are roughly 1 billion
- * sequence numbers behind the writers.
+ * This returns the _current_ write sequence number. The lazy goal sequence
+ * number is SMR_LAZY_GRACE ticks ahead.
*/
-smr_seq_t
-smr_advance(smr_t smr)
+static smr_seq_t
+smr_lazy_advance(smr_t smr, smr_shared_t s)
{
- smr_shared_t s;
- smr_seq_t goal, s_rd_seq;
+ smr_seq_t s_rd_seq, s_wr_seq, goal;
+ int t;
+
+ CRITICAL_ASSERT(curthread);
/*
- * It is illegal to enter while in an smr section.
+ * We must not allow a zero tick value. We go back in time one tick
+ * and advance the grace period forward one tick around zero.
*/
- SMR_ASSERT_NOT_ENTERED(smr);
+ t = ticks;
+ if (t == SMR_SEQ_INVALID)
+ t--;
/*
- * Modifications not done in a smr section need to be visible
- * before advancing the seq.
+ * The most probable condition that the update already took place.
*/
- atomic_thread_fence_rel();
+ s_wr_seq = atomic_load_int(&s->s_wr_seq);
+ if (__predict_true(t == s_wr_seq))
+ goto out;
/*
- * Load the current read seq before incrementing the goal so
- * we are guaranteed it is always < goal.
+ * After long idle periods the read sequence may fall too far
+ * behind write. Prevent poll from ever seeing this condition
+ * by updating the stale rd_seq. This assumes that there can
+ * be no valid section 2bn ticks old. The rd_seq update must
+ * be visible before wr_seq to avoid races with other advance
+ * callers.
*/
- s = zpcpu_get(smr)->c_shared;
- s_rd_seq = atomic_load_acq_int(&s->s_rd_seq);
+ s_rd_seq = atomic_load_int(&s->s_rd_seq);
+ if (SMR_SEQ_GT(s_rd_seq, t))
+ atomic_cmpset_rel_int(&s->s_rd_seq, s_rd_seq, t);
+ atomic_cmpset_int(&s->s_wr_seq, s_wr_seq, t);
+ counter_u64_add(advance, 1);
+ /* If we lost either update race another thread did it. */
+ s_wr_seq = t;
+out:
+ goal = s_wr_seq + SMR_LAZY_GRACE;
+ /* Skip over the SMR_SEQ_INVALID tick. */
+ if (goal < SMR_LAZY_GRACE)
+ goal++;
+ return (goal);
+}
+
+/*
+ * Increment the shared write sequence by 2. Since it is initialized
+ * to 1 this means the only valid values are odd and an observed value
+ * of 0 in a particular CPU means it is not currently in a read section.
+ */
+static smr_seq_t
+smr_shared_advance(smr_shared_t s)
+{
+
+ return (atomic_fetchadd_int(&s->s_wr_seq, SMR_SEQ_INCR) + SMR_SEQ_INCR);
+}
+
+/*
+ * Advance the write sequence number for a normal smr section. If the
+ * write sequence is too far behind the read sequence we have to poll
+ * to advance rd_seq and prevent undetectable wraps.
+ */
+static smr_seq_t
+smr_default_advance(smr_t smr, smr_shared_t s)
+{
+ smr_seq_t goal, s_rd_seq;
+
+ CRITICAL_ASSERT(curthread);
+ KASSERT((zpcpu_get(smr)->c_flags & SMR_LAZY) == 0,
+ ("smr_default_advance: called with lazy smr."));
/*
- * Increment the shared write sequence by 2. Since it is
- * initialized to 1 this means the only valid values are
- * odd and an observed value of 0 in a particular CPU means
- * it is not currently in a read section.
+ * Load the current read seq before incrementing the goal so
+ * we are guaranteed it is always < goal.
*/
- goal = atomic_fetchadd_int(&s->s_wr_seq, SMR_SEQ_INCR) + SMR_SEQ_INCR;
- counter_u64_add(advance, 1);
+ s_rd_seq = atomic_load_acq_int(&s->s_rd_seq);
+ goal = smr_shared_advance(s);
/*
* Force a synchronization here if the goal is getting too
@@ -226,30 +301,172 @@
counter_u64_add(advance_wait, 1);
smr_wait(smr, goal - SMR_SEQ_MAX_ADVANCE);
}
+ counter_u64_add(advance, 1);
return (goal);
}
+/*
+ * Deferred SMRs conditionally update s_wr_seq based on a
+ * cpu local interval count.
+ */
+static smr_seq_t
+smr_deferred_advance(smr_t smr, smr_shared_t s, smr_t self)
+{
+
+ if (++self->c_deferred < self->c_limit)
+ return (smr_shared_current(s) + SMR_SEQ_INCR);
+ self->c_deferred = 0;
+ return (smr_default_advance(smr, s));
+}
+
+/*
+ * Advance the write sequence and return the value for use as the
+ * wait goal. This guarantees that any changes made by the calling
+ * thread prior to this call will be visible to all threads after
+ * rd_seq meets or exceeds the return value.
+ *
+ * This function may busy loop if the readers are roughly 1 billion
+ * sequence numbers behind the writers.
+ *
+ * Lazy SMRs will not busy loop and the wrap happens every 49.6 days
+ * at 1khz and 119 hours at 10khz. Readers can block for no longer
+ * than half of this for SMR_SEQ_ macros to continue working.
+ */
smr_seq_t
-smr_advance_deferred(smr_t smr, int limit)
+smr_advance(smr_t smr)
{
+ smr_t self;
+ smr_shared_t s;
smr_seq_t goal;
- smr_t csmr;
+ int flags;
+ /*
+ * It is illegal to enter while in an smr section.
+ */
SMR_ASSERT_NOT_ENTERED(smr);
+ /*
+ * Modifications not done in a smr section need to be visible
+ * before advancing the seq.
+ */
+ atomic_thread_fence_rel();
+
critical_enter();
- csmr = zpcpu_get(smr);
- if (++csmr->c_deferred >= limit) {
- goal = SMR_SEQ_INVALID;
- csmr->c_deferred = 0;
- } else
- goal = smr_shared_current(csmr->c_shared) + SMR_SEQ_INCR;
+ /* Try to touch the line once. */
+ self = zpcpu_get(smr);
+ s = self->c_shared;
+ flags = self->c_flags;
+ goal = SMR_SEQ_INVALID;
+ if ((flags & (SMR_LAZY | SMR_DEFERRED)) == 0)
+ goal = smr_default_advance(smr, s);
+ else if ((flags & SMR_LAZY) != 0)
+ goal = smr_lazy_advance(smr, s);
+ else if ((flags & SMR_DEFERRED) != 0)
+ goal = smr_deferred_advance(smr, s, self);
critical_exit();
- if (goal != SMR_SEQ_INVALID)
- return (goal);
- return (smr_advance(smr));
+ return (goal);
+}
+
+/*
+ * Poll to determine the currently observed sequence number on a cpu
+ * and spinwait if the 'wait' argument is true.
+ */
+static smr_seq_t
+smr_poll_cpu(smr_t c, smr_seq_t s_rd_seq, smr_seq_t goal, bool wait)
+{
+ smr_seq_t c_seq;
+
+ c_seq = SMR_SEQ_INVALID;
+ for (;;) {
+ c_seq = atomic_load_int(&c->c_seq);
+ if (c_seq == SMR_SEQ_INVALID)
+ break;
+
+ /*
+ * There is a race described in smr.h:smr_enter that
+ * can lead to a stale seq value but not stale data
+ * access. If we find a value out of range here we
+ * pin it to the current min to prevent it from
+ * advancing until that stale section has expired.
+ *
+ * The race is created when a cpu loads the s_wr_seq
+ * value in a local register and then another thread
+ * advances s_wr_seq and calls smr_poll() which will
+ * observe no value yet in c_seq and advance s_rd_seq
+ * up to s_wr_seq which is beyond the register
+ * cached value. This is only likely to happen on
+ * hypervisor or with a system management interrupt.
+ */
+ if (SMR_SEQ_LT(c_seq, s_rd_seq))
+ c_seq = s_rd_seq;
+
+ /*
+ * If the sequence number meets the goal we are done
+ * with this cpu.
+ */
+ if (SMR_SEQ_LEQ(goal, c_seq))
+ break;
+
+ if (!wait)
+ break;
+ cpu_spinwait();
+ }
+
+ return (c_seq);
+}
+
+/*
+ * Loop until all cores have observed the goal sequence or have
+ * gone inactive. Returns the oldest sequence currently active.
+ *
+ * This function assumes a snapshot of sequence values has
+ * been obtained and validated by smr_poll().
+ */
+static smr_seq_t
+smr_poll_scan(smr_t smr, smr_shared_t s, smr_seq_t s_rd_seq,
+ smr_seq_t s_wr_seq, smr_seq_t goal, bool wait)
+{
+ smr_seq_t rd_seq, c_seq;
+ int i;
+
+ CRITICAL_ASSERT(curthread);
+ counter_u64_add_protected(poll_scan, 1);
+
+ /*
+ * The read sequence can be no larger than the write sequence at
+ * the start of the poll.
+ */
+ rd_seq = s_wr_seq;
+ CPU_FOREACH(i) {
+ /*
+ * Query the active sequence on this cpu. If we're not
+ * waiting and we don't meet the goal we will still scan
+ * the rest of the cpus to update s_rd_seq before returning
+ * failure.
+ */
+ c_seq = smr_poll_cpu(zpcpu_get_cpu(smr, i), s_rd_seq, goal,
+ wait);
+
+ /*
+ * Limit the minimum observed rd_seq whether we met the goal
+ * or not.
+ */
+ if (c_seq != SMR_SEQ_INVALID)
+ rd_seq = SMR_SEQ_MIN(rd_seq, c_seq);
+ }
+
+ /*
+ * Advance the rd_seq as long as we observed a more recent value.
+ */
+ s_rd_seq = atomic_load_int(&s->s_rd_seq);
+ if (SMR_SEQ_GEQ(rd_seq, s_rd_seq)) {
+ atomic_cmpset_int(&s->s_rd_seq, s_rd_seq, rd_seq);
+ s_rd_seq = rd_seq;
+ }
+
+ return (s_rd_seq);
}
/*
@@ -268,9 +485,10 @@
smr_poll(smr_t smr, smr_seq_t goal, bool wait)
{
smr_shared_t s;
- smr_t c;
- smr_seq_t s_wr_seq, s_rd_seq, rd_seq, c_seq;
- int i;
+ smr_t self;
+ smr_seq_t s_wr_seq, s_rd_seq;
+ smr_delta_t delta;
+ int flags;
bool success;
/*
@@ -278,6 +496,8 @@
*/
KASSERT(!wait || !SMR_ENTERED(smr),
("smr_poll: Blocking not allowed in a SMR section."));
+ KASSERT(!wait || (zpcpu_get(smr)->c_flags & SMR_LAZY) == 0,
+ ("smr_poll: Blocking not allowed on lazy smrs."));
/*
* Use a critical section so that we can avoid ABA races
@@ -285,9 +505,19 @@
*/
success = true;
critical_enter();
- s = zpcpu_get(smr)->c_shared;
+ /* Attempt to load from self only once. */
+ self = zpcpu_get(smr);
+ s = self->c_shared;
+ flags = self->c_flags;
counter_u64_add_protected(poll, 1);
+ /*
+ * Conditionally advance the lazy write clock on any writer
+ * activity. This may reset s_rd_seq.
+ */
+ if ((flags & SMR_LAZY) != 0)
+ smr_lazy_advance(smr, s);
+
/*
* Acquire barrier loads s_wr_seq after s_rd_seq so that we can not
* observe an updated read sequence that is larger than write.
@@ -295,106 +525,59 @@
s_rd_seq = atomic_load_acq_int(&s->s_rd_seq);
/*
- * wr_seq must be loaded prior to any c_seq value so that a stale
- * c_seq can only reference time after this wr_seq.
+ * If we have already observed the sequence number we can immediately
+ * return success. Most polls should meet this criterion.
*/
- s_wr_seq = atomic_load_acq_int(&s->s_wr_seq);
+ if (SMR_SEQ_LEQ(goal, s_rd_seq))
+ goto out;
/*
- * This may have come from a deferred advance. Consider one
- * increment past the current wr_seq valid and make sure we
- * have advanced far enough to succeed. We simply add to avoid
- * an additional fence.
+ * wr_seq must be loaded prior to any c_seq value so that a
+ * stale c_seq can only reference time after this wr_seq.
*/
- if (goal == s_wr_seq + SMR_SEQ_INCR) {
- atomic_add_int(&s->s_wr_seq, SMR_SEQ_INCR);
- s_wr_seq = goal;
- }
+ s_wr_seq = atomic_load_acq_int(&s->s_wr_seq);
/*
- * Detect whether the goal is valid and has already been observed.
- *
- * The goal must be in the range of s_wr_seq >= goal >= s_rd_seq for
- * it to be valid. If it is not then the caller held on to it and
- * the integer wrapped. If we wrapped back within range the caller
- * will harmlessly scan.
- *
- * A valid goal must be greater than s_rd_seq or we have not verified
- * that it has been observed and must fall through to polling.
+ * This is the distance from s_wr_seq to goal. Positive values
+ * are in the future.
*/
- if (SMR_SEQ_GEQ(s_rd_seq, goal) || SMR_SEQ_LT(s_wr_seq, goal))
- goto out;
+ delta = SMR_SEQ_DELTA(goal, s_wr_seq);
/*
- * Loop until all cores have observed the goal sequence or have
- * gone inactive. Keep track of the oldest sequence currently
- * active as rd_seq.
+ * Detect a stale wr_seq.
+ *
+ * This goal may have come from a deferred advance or a lazy
+ * smr. If we are not blocking we can not succeed but the
+ * sequence number is valid.
*/
- counter_u64_add_protected(poll_scan, 1);
- rd_seq = s_wr_seq;
- CPU_FOREACH(i) {
- c = zpcpu_get_cpu(smr, i);
- c_seq = SMR_SEQ_INVALID;
- for (;;) {
- c_seq = atomic_load_int(&c->c_seq);
- if (c_seq == SMR_SEQ_INVALID)
- break;
-
- /*
- * There is a race described in smr.h:smr_enter that
- * can lead to a stale seq value but not stale data
- * access. If we find a value out of range here we
- * pin it to the current min to prevent it from
- * advancing until that stale section has expired.
- *
- * The race is created when a cpu loads the s_wr_seq
- * value in a local register and then another thread
- * advances s_wr_seq and calls smr_poll() which will
- * oberve no value yet in c_seq and advance s_rd_seq
- * up to s_wr_seq which is beyond the register
- * cached value. This is only likely to happen on
- * hypervisor or with a system management interrupt.
- */
- if (SMR_SEQ_LT(c_seq, s_rd_seq))
- c_seq = s_rd_seq;
-
- /*
- * If the sequence number meets the goal we are
- * done with this cpu.
- */
- if (SMR_SEQ_GEQ(c_seq, goal))
- break;
-
- /*
- * If we're not waiting we will still scan the rest
- * of the cpus and update s_rd_seq before returning
- * an error.
- */
- if (!wait) {
- success = false;
- break;
- }
- cpu_spinwait();
+ if (delta > 0 && delta <= SMR_SEQ_MAX_ADVANCE &&
+ (flags & (SMR_LAZY | SMR_DEFERRED)) != 0) {
+ if (!wait) {
+ success = false;
+ goto out;
}
-
- /*
- * Limit the minimum observed rd_seq whether we met the goal
- * or not.
- */
- if (c_seq != SMR_SEQ_INVALID && SMR_SEQ_GT(rd_seq, c_seq))
- rd_seq = c_seq;
+ /* LAZY is always !wait. */
+ s_wr_seq = smr_shared_advance(s);
+ delta = 0;
}
/*
- * Advance the rd_seq as long as we observed the most recent one.
+ * Detect an invalid goal.
+ *
+ * The goal must be in the range of s_wr_seq >= goal >= s_rd_seq for
+ * it to be valid. If it is not then the caller held on to it and
+ * the integer wrapped. If we wrapped back within range the caller
+ * will harmlessly scan.
*/
- s_rd_seq = atomic_load_int(&s->s_rd_seq);
- do {
- if (SMR_SEQ_LEQ(rd_seq, s_rd_seq))
- goto out;
- } while (atomic_fcmpset_int(&s->s_rd_seq, &s_rd_seq, rd_seq) == 0);
+ if (delta > 0)
+ goto out;
+ /* Determine the lowest visible sequence number. */
+ s_rd_seq = smr_poll_scan(smr, s, s_rd_seq, s_wr_seq, goal, wait);
+ success = SMR_SEQ_LEQ(goal, s_rd_seq);
out:
+ if (!success)
+ counter_u64_add_protected(poll_fail, 1);
critical_exit();
/*
@@ -407,7 +590,7 @@
}
smr_t
-smr_create(const char *name)
+smr_create(const char *name, int limit, int flags)
{
smr_t smr, c;
smr_shared_t s;
@@ -417,13 +600,19 @@
smr = uma_zalloc_pcpu(smr_zone, M_WAITOK);
s->s_name = name;
- s->s_rd_seq = s->s_wr_seq = SMR_SEQ_INIT;
+ if ((flags & SMR_LAZY) == 0)
+ s->s_rd_seq = s->s_wr_seq = SMR_SEQ_INIT;
+ else
+ s->s_rd_seq = s->s_wr_seq = ticks;
/* Initialize all CPUS, not just those running. */
for (i = 0; i <= mp_maxid; i++) {
c = zpcpu_get_cpu(smr, i);
c->c_seq = SMR_SEQ_INVALID;
c->c_shared = s;
+ c->c_deferred = 0;
+ c->c_limit = limit;
+ c->c_flags = flags;
}
atomic_thread_fence_seq_cst();
@@ -460,5 +649,6 @@
advance_wait = counter_u64_alloc(M_WAITOK);
poll = counter_u64_alloc(M_WAITOK);
poll_scan = counter_u64_alloc(M_WAITOK);
+ poll_fail = counter_u64_alloc(M_WAITOK);
}
SYSINIT(smr_counters, SI_SUB_CPU, SI_ORDER_ANY, smr_init_counters, NULL);
Index: sys/sys/_smr.h
===================================================================
--- sys/sys/_smr.h
+++ sys/sys/_smr.h
@@ -32,6 +32,7 @@
#define _SYS__SMR_H_
typedef uint32_t smr_seq_t;
+typedef int32_t smr_delta_t;
typedef struct smr *smr_t;
#endif /* __SYS_SMR_H_ */
Index: sys/sys/smr.h
===================================================================
--- sys/sys/smr.h
+++ sys/sys/smr.h
@@ -45,11 +45,13 @@
* Modular arithmetic for comparing sequence numbers that have
* potentially wrapped. Copied from tcp_seq.h.
*/
-#define SMR_SEQ_LT(a, b) ((int32_t)((a)-(b)) < 0)
-#define SMR_SEQ_LEQ(a, b) ((int32_t)((a)-(b)) <= 0)
-#define SMR_SEQ_GT(a, b) ((int32_t)((a)-(b)) > 0)
-#define SMR_SEQ_GEQ(a, b) ((int32_t)((a)-(b)) >= 0)
-#define SMR_SEQ_DELTA(a, b) ((int32_t)((a)-(b)))
+#define SMR_SEQ_LT(a, b) ((smr_delta_t)((a)-(b)) < 0)
+#define SMR_SEQ_LEQ(a, b) ((smr_delta_t)((a)-(b)) <= 0)
+#define SMR_SEQ_GT(a, b) ((smr_delta_t)((a)-(b)) > 0)
+#define SMR_SEQ_GEQ(a, b) ((smr_delta_t)((a)-(b)) >= 0)
+#define SMR_SEQ_DELTA(a, b) ((smr_delta_t)((a)-(b)))
+#define SMR_SEQ_MIN(a, b) (SMR_SEQ_LT(a, b) ? a : b)
+#define SMR_SEQ_MAX(a, b) (SMR_SEQ_GT(a, b) ? a : b)
#define SMR_SEQ_INVALID 0
@@ -66,8 +68,13 @@
smr_seq_t c_seq; /* Current observed sequence. */
smr_shared_t c_shared; /* Shared SMR state. */
int c_deferred; /* Deferred advance counter. */
+ int c_limit; /* Deferred advance limit. */
+ int c_flags; /* SMR Configuration */
};
+#define SMR_LAZY 0x0001 /* Higher latency write, fast read. */
+#define SMR_DEFERRED 0x0002 /* Aggregate updates to wr_seq. */
+
#define SMR_ENTERED(smr) \
(curthread->td_critnest != 0 && zpcpu_get((smr))->c_seq != SMR_SEQ_INVALID)
@@ -170,7 +177,8 @@
} while (0)
/*
- * Return the current write sequence number.
+ * Return the current write sequence number. This is not the same as the
+ * current goal which may be in the future.
*/
static inline smr_seq_t
smr_shared_current(smr_shared_t s)
@@ -195,6 +203,8 @@
critical_enter();
smr = zpcpu_get(smr);
+ KASSERT((smr->c_flags & SMR_LAZY) == 0,
+ ("smr_enter(%s) lazy smr.", smr->c_shared->s_name));
KASSERT(smr->c_seq == 0,
("smr_enter(%s) does not support recursion.",
smr->c_shared->s_name));
@@ -228,6 +238,8 @@
smr = zpcpu_get(smr);
CRITICAL_ASSERT(curthread);
+ KASSERT((smr->c_flags & SMR_LAZY) == 0,
+ ("smr_exit(%s) lazy smr.", smr->c_shared->s_name));
KASSERT(smr->c_seq != SMR_SEQ_INVALID,
("smr_exit(%s) not in a smr section.", smr->c_shared->s_name));
@@ -243,17 +255,61 @@
}
/*
- * Advances the write sequence number. Returns the sequence number
- * required to ensure that all modifications are visible to readers.
+ * Enter a lazy smr section. This is used for read-mostly state that
+ * can tolerate a high free latency.
*/
-smr_seq_t smr_advance(smr_t smr);
+static inline void
+smr_lazy_enter(smr_t smr)
+{
+
+ critical_enter();
+ smr = zpcpu_get(smr);
+ KASSERT((smr->c_flags & SMR_LAZY) != 0,
+ ("smr_lazy_enter(%s) non-lazy smr.", smr->c_shared->s_name));
+ KASSERT(smr->c_seq == 0,
+ ("smr_lazy_enter(%s) does not support recursion.",
+ smr->c_shared->s_name));
+
+ /*
+ * This needs no serialization. If an interrupt occurs before we
+ * assign wr_seq to c_seq any speculative loads will be discarded.
+ * If we assign a stale wr_seq value due to interrupt we use the
+ * same algorithm that renders smr_enter() safe.
+ */
+ smr->c_seq = smr_shared_current(smr->c_shared);
+}
/*
- * Advances the write sequence number only after N calls. Returns
- * the correct goal for a wr_seq that has not yet occurred. Used to
- * minimize shared cacheline invalidations for frequent writers.
+ * Exit a lazy smr section. This is used for read-mostly state that
+ * can tolerate a high free latency.
*/
-smr_seq_t smr_advance_deferred(smr_t smr, int limit);
+static inline void
+smr_lazy_exit(smr_t smr)
+{
+
+ smr = zpcpu_get(smr);
+ CRITICAL_ASSERT(curthread);
+ KASSERT((smr->c_flags & SMR_LAZY) != 0,
+ ("smr_lazy_exit(%s) non-lazy smr.", smr->c_shared->s_name));
+ KASSERT(smr->c_seq != SMR_SEQ_INVALID,
+ ("smr_lazy_exit(%s) not in a smr section.", smr->c_shared->s_name));
+
+ /*
+ * All loads/stores must be retired before the sequence becomes
+ * visible. The fence compiles away on amd64. Another
+ * alternative would be to omit the fence but store the exit
+ * time and wait 1 tick longer.
+ */
+ atomic_thread_fence_rel();
+ smr->c_seq = SMR_SEQ_INVALID;
+ critical_exit();
+}
+
+/*
+ * Advances the write sequence number. Returns the sequence number
+ * required to ensure that all modifications are visible to readers.
+ */
+smr_seq_t smr_advance(smr_t smr);
/*
* Returns true if a goal sequence has been reached. If
@@ -262,7 +318,9 @@
bool smr_poll(smr_t smr, smr_seq_t goal, bool wait);
/* Create a new SMR context. */
-smr_t smr_create(const char *name);
+smr_t smr_create(const char *name, int limit, int flags);
+
+/* Destroy the context. */
void smr_destroy(smr_t smr);
/*
Index: sys/tools/umaperf/umaperf.c
===================================================================
--- sys/tools/umaperf/umaperf.c
+++ sys/tools/umaperf/umaperf.c
@@ -230,6 +230,9 @@
case DEFER_SMR:
smr_enter(umaperf_smr);
break;
+ case LAZY_SMR:
+ smr_lazy_enter(umaperf_smr);
+ break;
case EPOCH:
epoch_enter(umaperf_epoch);
break;
@@ -256,6 +259,9 @@
case DEFER_SMR:
smr_exit(umaperf_smr);
break;
+ case LAZY_SMR:
+ smr_lazy_exit(umaperf_smr);
+ break;
case EPOCH:
epoch_exit(umaperf_epoch);
break;
@@ -618,7 +624,7 @@
switch (umaperf_type) {
#ifdef __FreeBSD__
case PLAIN:
- flags = UMA_ZONE_ROUNDROBIN;
+ flags = UMA_ZONE_FIRSTTOUCH;
break;
case SMR:
flags = UMA_ZONE_ROUNDROBIN | UMA_ZONE_SMR;
@@ -652,7 +658,8 @@
umaperf_zone = uma_zcreate("umaperf", umaperf_zone_size,
NULL, NULL, NULL, NULL, UMA_ALIGN_CACHE, flags);
#ifdef __FreeBSD__
- umaperf_smr = uma_zone_get_smr(umaperf_zone);
+ if (umaperf_smr != 0)
+ uma_zone_set_smr(umaperf_zone, umaperf_smr);
#endif
umaperf_init_cpus();
}
Index: sys/vm/uma_core.c
===================================================================
--- sys/vm/uma_core.c
+++ sys/vm/uma_core.c
@@ -1168,7 +1168,6 @@
* Returns:
* Nothing
*/
-
static void
bucket_drain(uma_zone_t zone, uma_bucket_t bucket)
{
@@ -1228,7 +1227,7 @@
*/
seq = SMR_SEQ_INVALID;
if ((zone->uz_flags & UMA_ZONE_SMR) != 0)
- seq = smr_current(zone->uz_smr);
+ seq = smr_advance(zone->uz_smr);
CPU_FOREACH(cpu) {
cache = &zone->uz_cpu[cpu];
bucket = cache_bucket_unload_alloc(cache);
@@ -2707,7 +2706,7 @@
/* Caller requests a private SMR context. */
if ((zone->uz_flags & UMA_ZONE_SMR) != 0)
- zone->uz_smr = smr_create(zone->uz_name);
+ zone->uz_smr = smr_create(zone->uz_name, 0, 0);
KASSERT((arg->flags & (UMA_ZONE_MAXBUCKET | UMA_ZONE_NOBUCKET)) !=
(UMA_ZONE_MAXBUCKET | UMA_ZONE_NOBUCKET),
@@ -4180,22 +4179,21 @@
"uma_zfree: zone %s(%p) draining cross bucket %p",
zone->uz_name, zone, bucket);
- STAILQ_INIT(&fullbuckets);
+ /*
+ * It is possible for buckets to arrive here out of order so we fetch
+ * the current smr seq rather than accepting the bucket's.
+ */
+ seq = SMR_SEQ_INVALID;
+ if ((zone->uz_flags & UMA_ZONE_SMR) != 0)
+ seq = smr_advance(zone->uz_smr);
/*
* To avoid having ndomain * ndomain buckets for sorting we have a
* lock on the current crossfree bucket. A full matrix with
* per-domain locking could be used if necessary.
*/
+ STAILQ_INIT(&fullbuckets);
ZONE_CROSS_LOCK(zone);
-
- /*
- * It is possible for buckets to arrive here out of order so we fetch
- * the current smr seq rather than accepting the bucket's.
- */
- seq = SMR_SEQ_INVALID;
- if ((zone->uz_flags & UMA_ZONE_SMR) != 0)
- seq = smr_current(zone->uz_smr);
while (bucket->ub_cnt > 0) {
item = bucket->ub_bucket[bucket->ub_cnt - 1];
domain = _vm_phys_domain(pmap_kextract((vm_offset_t)item));
Index: tools/uma/smrstress/smrstress.c
===================================================================
--- tools/uma/smrstress/smrstress.c
+++ tools/uma/smrstress/smrstress.c
@@ -64,12 +64,14 @@
static void
smrs_error(struct smrs *smrs, const char *fmt, ...)
{
+ smr_t self;
va_list ap;
+ self = zpcpu_get(smrs_smr);
atomic_add_int(&smrs_failures, 1);
printf("SMR ERROR: wr_seq %d, rd_seq %d, c_seq %d, generation %d, count %d ",
- smrs_smr->c_shared->s_wr_seq, smrs_smr->c_shared->s_rd_seq,
- zpcpu_get(smrs_smr)->c_seq, smrs->generation, smrs->count);
+ smr_current(smrs_smr), self->c_shared->s_rd_seq, self->c_seq,
+ smrs->generation, smrs->count);
va_start(ap, fmt);
(void)vprintf(fmt, ap);
va_end(ap);
@@ -83,7 +85,7 @@
/* Wait for the writer to exit. */
while (smrs_completed == 0) {
- smr_enter(smrs_smr);
+ smr_lazy_enter(smrs_smr);
cur = (void *)atomic_load_acq_ptr(&smrs_current);
if (cur->generation == -1)
smrs_error(cur, "read early: Use after free!\n");
@@ -94,7 +96,7 @@
smrs_error(cur, "read late: Use after free!\n");
else if (cnt <= 0)
smrs_error(cur, "Invalid ref\n");
- smr_exit(smrs_smr);
+ smr_lazy_exit(smrs_smr);
maybe_yield();
}
}
@@ -190,8 +192,9 @@
smrs_zone = uma_zcreate("smrs", sizeof(struct smrs),
smrs_ctor, smrs_dtor, NULL, NULL, UMA_ALIGN_PTR,
- UMA_ZONE_SMR | UMA_ZONE_ZINIT);
- smrs_smr = uma_zone_get_smr(smrs_zone);
+ UMA_ZONE_ZINIT);
+ smrs_smr = smr_create("smrs", 0, SMR_LAZY);
+ uma_zone_set_smr(smrs_zone, smrs_smr);
}
static void
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Sun, Mar 1, 5:58 PM (8 h, 50 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
29121657
Default Alt Text
D23738.id68514.diff (27 KB)
Attached To
Mode
D23738: Experimental ticks based SMR.
Attached
Detach File
Event Timeline
Log In to Comment