Page MenuHomeFreeBSD

D23738.id68514.diff
No OneTemporary

D23738.id68514.diff

Index: sys/kern/subr_smr.c
===================================================================
--- sys/kern/subr_smr.c
+++ sys/kern/subr_smr.c
@@ -164,58 +164,133 @@
#define SMR_SEQ_MAX_ADVANCE SMR_SEQ_MAX_DELTA / 2
#endif
+/*
+ * The grace period for lazy (tick based) SMR.
+ *
+ * Hardclock is responsible for advancing ticks on a single CPU while every
+ * CPU receives a regular clock interrupt. The clock interrupts are flushing
+ * the store buffers and any speculative loads that may violate our invariants.
+ * Because these interrupts are not synchronized we must wait one additional
+ * tick in the future to be certain that all processors have had their state
+ * synchronized by an interrupt.
+ *
+ * This assumes that the clock interrupt will only be delayed by other causes
+ * that will flush the store buffer or prevent access to the section protected
+ * data. For example, an idle processor, or a system management interrupt,
+ * or a vm exit.
+ *
+ * We must wait one additional tick if we are around the wrap condition
+ * because the write seq will move forward by two with one interrupt.
+ */
+#define SMR_LAZY_GRACE 2
+#define SMR_LAZY_GRACE_MAX (SMR_LAZY_GRACE + 1)
+
+/*
+ * The maximum sequence number ahead of wr_seq that may still be valid. The
+ * sequence may not be advanced on write for lazy or deferred SMRs. In this
+ * case poll needs to attempt to forward the sequence number if the goal is
+ * within wr_seq + SMR_SEQ_ADVANCE.
+ */
+#define SMR_SEQ_ADVANCE MAX(SMR_SEQ_INCR, SMR_LAZY_GRACE_MAX)
+
static SYSCTL_NODE(_debug, OID_AUTO, smr, CTLFLAG_RW, NULL, "SMR Stats");
static counter_u64_t advance = EARLY_COUNTER;
-SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, advance, CTLFLAG_RD, &advance, "");
+SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, advance, CTLFLAG_WR, &advance, "");
static counter_u64_t advance_wait = EARLY_COUNTER;
-SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, advance_wait, CTLFLAG_RD, &advance_wait, "");
+SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, advance_wait, CTLFLAG_WR, &advance_wait, "");
static counter_u64_t poll = EARLY_COUNTER;
-SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, poll, CTLFLAG_RD, &poll, "");
+SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, poll, CTLFLAG_WR, &poll, "");
static counter_u64_t poll_scan = EARLY_COUNTER;
-SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, poll_scan, CTLFLAG_RD, &poll_scan, "");
-
+SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, poll_scan, CTLFLAG_WR, &poll_scan, "");
+static counter_u64_t poll_fail = EARLY_COUNTER;
+SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, poll_fail, CTLFLAG_WR, &poll_fail, "");
/*
- * Advance the write sequence and return the new value for use as the
- * wait goal. This guarantees that any changes made by the calling
- * thread prior to this call will be visible to all threads after
- * rd_seq meets or exceeds the return value.
+ * Advance a lazy write sequence number. These move forward at the rate of
+ * ticks. Grace is two ticks in the future. Lazy write sequence numbers can
+ * be even but not SMR_SEQ_INVALID so we pause time for a tick when we wrap.
*
- * This function may busy loop if the readers are roughly 1 billion
- * sequence numbers behind the writers.
+ * This returns the _current_ write sequence number. The lazy goal sequence
+ * number is SMR_LAZY_GRACE ticks ahead.
*/
-smr_seq_t
-smr_advance(smr_t smr)
+static smr_seq_t
+smr_lazy_advance(smr_t smr, smr_shared_t s)
{
- smr_shared_t s;
- smr_seq_t goal, s_rd_seq;
+ smr_seq_t s_rd_seq, s_wr_seq, goal;
+ int t;
+
+ CRITICAL_ASSERT(curthread);
/*
- * It is illegal to enter while in an smr section.
+ * We must not allow a zero tick value. We go back in time one tick
+ * and advance the grace period forward one tick around zero.
*/
- SMR_ASSERT_NOT_ENTERED(smr);
+ t = ticks;
+ if (t == SMR_SEQ_INVALID)
+ t--;
/*
- * Modifications not done in a smr section need to be visible
- * before advancing the seq.
+ * The most probable condition is that the update already took place.
*/
- atomic_thread_fence_rel();
+ s_wr_seq = atomic_load_int(&s->s_wr_seq);
+ if (__predict_true(t == s_wr_seq))
+ goto out;
/*
- * Load the current read seq before incrementing the goal so
- * we are guaranteed it is always < goal.
+ * After long idle periods the read sequence may fall too far
+ * behind write. Prevent poll from ever seeing this condition
+ * by updating the stale rd_seq. This assumes that there can
+ * be no valid section 2bn ticks old. The rd_seq update must
+ * be visible before wr_seq to avoid races with other advance
+ * callers.
*/
- s = zpcpu_get(smr)->c_shared;
- s_rd_seq = atomic_load_acq_int(&s->s_rd_seq);
+ s_rd_seq = atomic_load_int(&s->s_rd_seq);
+ if (SMR_SEQ_GT(s_rd_seq, t))
+ atomic_cmpset_rel_int(&s->s_rd_seq, s_rd_seq, t);
+ atomic_cmpset_int(&s->s_wr_seq, s_wr_seq, t);
+ counter_u64_add(advance, 1);
+ /* If we lost either update race another thread did it. */
+ s_wr_seq = t;
+out:
+ goal = s_wr_seq + SMR_LAZY_GRACE;
+ /* Skip over the SMR_SEQ_INVALID tick. */
+ if (goal < SMR_LAZY_GRACE)
+ goal++;
+ return (goal);
+}
+
+/*
+ * Increment the shared write sequence by 2. Since it is initialized
+ * to 1 this means the only valid values are odd and an observed value
+ * of 0 in a particular CPU means it is not currently in a read section.
+ */
+static smr_seq_t
+smr_shared_advance(smr_shared_t s)
+{
+
+ return (atomic_fetchadd_int(&s->s_wr_seq, SMR_SEQ_INCR) + SMR_SEQ_INCR);
+}
+
+/*
+ * Advance the write sequence number for a normal smr section. If the
+ * write sequence is too far behind the read sequence we have to poll
+ * to advance rd_seq and prevent undetectable wraps.
+ */
+static smr_seq_t
+smr_default_advance(smr_t smr, smr_shared_t s)
+{
+ smr_seq_t goal, s_rd_seq;
+
+ CRITICAL_ASSERT(curthread);
+ KASSERT((zpcpu_get(smr)->c_flags & SMR_LAZY) == 0,
+ ("smr_default_advance: called with lazy smr."));
/*
- * Increment the shared write sequence by 2. Since it is
- * initialized to 1 this means the only valid values are
- * odd and an observed value of 0 in a particular CPU means
- * it is not currently in a read section.
+ * Load the current read seq before incrementing the goal so
+ * we are guaranteed it is always < goal.
*/
- goal = atomic_fetchadd_int(&s->s_wr_seq, SMR_SEQ_INCR) + SMR_SEQ_INCR;
- counter_u64_add(advance, 1);
+ s_rd_seq = atomic_load_acq_int(&s->s_rd_seq);
+ goal = smr_shared_advance(s);
/*
* Force a synchronization here if the goal is getting too
@@ -226,30 +301,172 @@
counter_u64_add(advance_wait, 1);
smr_wait(smr, goal - SMR_SEQ_MAX_ADVANCE);
}
+ counter_u64_add(advance, 1);
return (goal);
}
+/*
+ * Deferred SMRs conditionally update s_wr_seq based on a
+ * cpu local interval count.
+ */
+static smr_seq_t
+smr_deferred_advance(smr_t smr, smr_shared_t s, smr_t self)
+{
+
+ if (++self->c_deferred < self->c_limit)
+ return (smr_shared_current(s) + SMR_SEQ_INCR);
+ self->c_deferred = 0;
+ return (smr_default_advance(smr, s));
+}
+
+/*
+ * Advance the write sequence and return the value for use as the
+ * wait goal. This guarantees that any changes made by the calling
+ * thread prior to this call will be visible to all threads after
+ * rd_seq meets or exceeds the return value.
+ *
+ * This function may busy loop if the readers are roughly 1 billion
+ * sequence numbers behind the writers.
+ *
+ * Lazy SMRs will not busy loop and the wrap happens every 49.6 days
+ * at 1khz and 119 hours at 10khz. Readers can block for no longer
+ * than half of this for SMR_SEQ_ macros to continue working.
+ */
smr_seq_t
-smr_advance_deferred(smr_t smr, int limit)
+smr_advance(smr_t smr)
{
+ smr_t self;
+ smr_shared_t s;
smr_seq_t goal;
- smr_t csmr;
+ int flags;
+ /*
+ * It is illegal to enter while in an smr section.
+ */
SMR_ASSERT_NOT_ENTERED(smr);
+ /*
+ * Modifications not done in a smr section need to be visible
+ * before advancing the seq.
+ */
+ atomic_thread_fence_rel();
+
critical_enter();
- csmr = zpcpu_get(smr);
- if (++csmr->c_deferred >= limit) {
- goal = SMR_SEQ_INVALID;
- csmr->c_deferred = 0;
- } else
- goal = smr_shared_current(csmr->c_shared) + SMR_SEQ_INCR;
+ /* Try to touch the line once. */
+ self = zpcpu_get(smr);
+ s = self->c_shared;
+ flags = self->c_flags;
+ goal = SMR_SEQ_INVALID;
+ if ((flags & (SMR_LAZY | SMR_DEFERRED)) == 0)
+ goal = smr_default_advance(smr, s);
+ else if ((flags & SMR_LAZY) != 0)
+ goal = smr_lazy_advance(smr, s);
+ else if ((flags & SMR_DEFERRED) != 0)
+ goal = smr_deferred_advance(smr, s, self);
critical_exit();
- if (goal != SMR_SEQ_INVALID)
- return (goal);
- return (smr_advance(smr));
+ return (goal);
+}
+
+/*
+ * Poll to determine the currently observed sequence number on a cpu
+ * and spinwait if the 'wait' argument is true.
+ */
+static smr_seq_t
+smr_poll_cpu(smr_t c, smr_seq_t s_rd_seq, smr_seq_t goal, bool wait)
+{
+ smr_seq_t c_seq;
+
+ c_seq = SMR_SEQ_INVALID;
+ for (;;) {
+ c_seq = atomic_load_int(&c->c_seq);
+ if (c_seq == SMR_SEQ_INVALID)
+ break;
+
+ /*
+ * There is a race described in smr.h:smr_enter that
+ * can lead to a stale seq value but not stale data
+ * access. If we find a value out of range here we
+ * pin it to the current min to prevent it from
+ * advancing until that stale section has expired.
+ *
+ * The race is created when a cpu loads the s_wr_seq
+ * value in a local register and then another thread
+ * advances s_wr_seq and calls smr_poll() which will
+ * observe no value yet in c_seq and advance s_rd_seq
+ * up to s_wr_seq which is beyond the register
+ * cached value. This is only likely to happen on
+ * hypervisor or with a system management interrupt.
+ */
+ if (SMR_SEQ_LT(c_seq, s_rd_seq))
+ c_seq = s_rd_seq;
+
+ /*
+ * If the sequence number meets the goal we are done
+ * with this cpu.
+ */
+ if (SMR_SEQ_LEQ(goal, c_seq))
+ break;
+
+ if (!wait)
+ break;
+ cpu_spinwait();
+ }
+
+ return (c_seq);
+}
+
+/*
+ * Loop until all cores have observed the goal sequence or have
+ * gone inactive. Returns the oldest sequence currently active.
+ *
+ * This function assumes a snapshot of sequence values has
+ * been obtained and validated by smr_poll().
+ */
+static smr_seq_t
+smr_poll_scan(smr_t smr, smr_shared_t s, smr_seq_t s_rd_seq,
+ smr_seq_t s_wr_seq, smr_seq_t goal, bool wait)
+{
+ smr_seq_t rd_seq, c_seq;
+ int i;
+
+ CRITICAL_ASSERT(curthread);
+ counter_u64_add_protected(poll_scan, 1);
+
+ /*
+ * The read sequence can be no larger than the write sequence at
+ * the start of the poll.
+ */
+ rd_seq = s_wr_seq;
+ CPU_FOREACH(i) {
+ /*
+ * Query the active sequence on this cpu. If we're not
+ * waiting and we don't meet the goal we will still scan
+ * the rest of the cpus to update s_rd_seq before returning
+ * failure.
+ */
+ c_seq = smr_poll_cpu(zpcpu_get_cpu(smr, i), s_rd_seq, goal,
+ wait);
+
+ /*
+ * Limit the minimum observed rd_seq whether we met the goal
+ * or not.
+ */
+ if (c_seq != SMR_SEQ_INVALID)
+ rd_seq = SMR_SEQ_MIN(rd_seq, c_seq);
+ }
+
+ /*
+ * Advance the rd_seq as long as we observed a more recent value.
+ */
+ s_rd_seq = atomic_load_int(&s->s_rd_seq);
+ if (SMR_SEQ_GEQ(rd_seq, s_rd_seq)) {
+ atomic_cmpset_int(&s->s_rd_seq, s_rd_seq, rd_seq);
+ s_rd_seq = rd_seq;
+ }
+
+ return (s_rd_seq);
}
/*
@@ -268,9 +485,10 @@
smr_poll(smr_t smr, smr_seq_t goal, bool wait)
{
smr_shared_t s;
- smr_t c;
- smr_seq_t s_wr_seq, s_rd_seq, rd_seq, c_seq;
- int i;
+ smr_t self;
+ smr_seq_t s_wr_seq, s_rd_seq;
+ smr_delta_t delta;
+ int flags;
bool success;
/*
@@ -278,6 +496,8 @@
*/
KASSERT(!wait || !SMR_ENTERED(smr),
("smr_poll: Blocking not allowed in a SMR section."));
+ KASSERT(!wait || (zpcpu_get(smr)->c_flags & SMR_LAZY) == 0,
+ ("smr_poll: Blocking not allowed on lazy smrs."));
/*
* Use a critical section so that we can avoid ABA races
@@ -285,9 +505,19 @@
*/
success = true;
critical_enter();
- s = zpcpu_get(smr)->c_shared;
+ /* Attempt to load from self only once. */
+ self = zpcpu_get(smr);
+ s = self->c_shared;
+ flags = self->c_flags;
counter_u64_add_protected(poll, 1);
+ /*
+ * Conditionally advance the lazy write clock on any writer
+ * activity. This may reset s_rd_seq.
+ */
+ if ((flags & SMR_LAZY) != 0)
+ smr_lazy_advance(smr, s);
+
/*
* Acquire barrier loads s_wr_seq after s_rd_seq so that we can not
* observe an updated read sequence that is larger than write.
@@ -295,106 +525,59 @@
s_rd_seq = atomic_load_acq_int(&s->s_rd_seq);
/*
- * wr_seq must be loaded prior to any c_seq value so that a stale
- * c_seq can only reference time after this wr_seq.
+ * If we have already observed the sequence number we can immediately
+ * return success. Most polls should meet this criterion.
*/
- s_wr_seq = atomic_load_acq_int(&s->s_wr_seq);
+ if (SMR_SEQ_LEQ(goal, s_rd_seq))
+ goto out;
/*
- * This may have come from a deferred advance. Consider one
- * increment past the current wr_seq valid and make sure we
- * have advanced far enough to succeed. We simply add to avoid
- * an additional fence.
+ * wr_seq must be loaded prior to any c_seq value so that a
+ * stale c_seq can only reference time after this wr_seq.
*/
- if (goal == s_wr_seq + SMR_SEQ_INCR) {
- atomic_add_int(&s->s_wr_seq, SMR_SEQ_INCR);
- s_wr_seq = goal;
- }
+ s_wr_seq = atomic_load_acq_int(&s->s_wr_seq);
/*
- * Detect whether the goal is valid and has already been observed.
- *
- * The goal must be in the range of s_wr_seq >= goal >= s_rd_seq for
- * it to be valid. If it is not then the caller held on to it and
- * the integer wrapped. If we wrapped back within range the caller
- * will harmlessly scan.
- *
- * A valid goal must be greater than s_rd_seq or we have not verified
- * that it has been observed and must fall through to polling.
+ * This is the distance from s_wr_seq to goal. Positive values
+ * are in the future.
*/
- if (SMR_SEQ_GEQ(s_rd_seq, goal) || SMR_SEQ_LT(s_wr_seq, goal))
- goto out;
+ delta = SMR_SEQ_DELTA(goal, s_wr_seq);
/*
- * Loop until all cores have observed the goal sequence or have
- * gone inactive. Keep track of the oldest sequence currently
- * active as rd_seq.
+ * Detect a stale wr_seq.
+ *
+ * This goal may have come from a deferred advance or a lazy
+ * smr. If we are not blocking we can not succeed but the
+ * sequence number is valid.
*/
- counter_u64_add_protected(poll_scan, 1);
- rd_seq = s_wr_seq;
- CPU_FOREACH(i) {
- c = zpcpu_get_cpu(smr, i);
- c_seq = SMR_SEQ_INVALID;
- for (;;) {
- c_seq = atomic_load_int(&c->c_seq);
- if (c_seq == SMR_SEQ_INVALID)
- break;
-
- /*
- * There is a race described in smr.h:smr_enter that
- * can lead to a stale seq value but not stale data
- * access. If we find a value out of range here we
- * pin it to the current min to prevent it from
- * advancing until that stale section has expired.
- *
- * The race is created when a cpu loads the s_wr_seq
- * value in a local register and then another thread
- * advances s_wr_seq and calls smr_poll() which will
- * oberve no value yet in c_seq and advance s_rd_seq
- * up to s_wr_seq which is beyond the register
- * cached value. This is only likely to happen on
- * hypervisor or with a system management interrupt.
- */
- if (SMR_SEQ_LT(c_seq, s_rd_seq))
- c_seq = s_rd_seq;
-
- /*
- * If the sequence number meets the goal we are
- * done with this cpu.
- */
- if (SMR_SEQ_GEQ(c_seq, goal))
- break;
-
- /*
- * If we're not waiting we will still scan the rest
- * of the cpus and update s_rd_seq before returning
- * an error.
- */
- if (!wait) {
- success = false;
- break;
- }
- cpu_spinwait();
+ if (delta > 0 && delta <= SMR_SEQ_MAX_ADVANCE &&
+ (flags & (SMR_LAZY | SMR_DEFERRED)) != 0) {
+ if (!wait) {
+ success = false;
+ goto out;
}
-
- /*
- * Limit the minimum observed rd_seq whether we met the goal
- * or not.
- */
- if (c_seq != SMR_SEQ_INVALID && SMR_SEQ_GT(rd_seq, c_seq))
- rd_seq = c_seq;
+ /* LAZY is always !wait. */
+ s_wr_seq = smr_shared_advance(s);
+ delta = 0;
}
/*
- * Advance the rd_seq as long as we observed the most recent one.
+ * Detect an invalid goal.
+ *
+ * The goal must be in the range of s_wr_seq >= goal >= s_rd_seq for
+ * it to be valid. If it is not then the caller held on to it and
+ * the integer wrapped. If we wrapped back within range the caller
+ * will harmlessly scan.
*/
- s_rd_seq = atomic_load_int(&s->s_rd_seq);
- do {
- if (SMR_SEQ_LEQ(rd_seq, s_rd_seq))
- goto out;
- } while (atomic_fcmpset_int(&s->s_rd_seq, &s_rd_seq, rd_seq) == 0);
+ if (delta > 0)
+ goto out;
+ /* Determine the lowest visible sequence number. */
+ s_rd_seq = smr_poll_scan(smr, s, s_rd_seq, s_wr_seq, goal, wait);
+ success = SMR_SEQ_LEQ(goal, s_rd_seq);
out:
+ if (!success)
+ counter_u64_add_protected(poll_fail, 1);
critical_exit();
/*
@@ -407,7 +590,7 @@
}
smr_t
-smr_create(const char *name)
+smr_create(const char *name, int limit, int flags)
{
smr_t smr, c;
smr_shared_t s;
@@ -417,13 +600,19 @@
smr = uma_zalloc_pcpu(smr_zone, M_WAITOK);
s->s_name = name;
- s->s_rd_seq = s->s_wr_seq = SMR_SEQ_INIT;
+ if ((flags & SMR_LAZY) == 0)
+ s->s_rd_seq = s->s_wr_seq = SMR_SEQ_INIT;
+ else
+ s->s_rd_seq = s->s_wr_seq = ticks;
/* Initialize all CPUS, not just those running. */
for (i = 0; i <= mp_maxid; i++) {
c = zpcpu_get_cpu(smr, i);
c->c_seq = SMR_SEQ_INVALID;
c->c_shared = s;
+ c->c_deferred = 0;
+ c->c_limit = limit;
+ c->c_flags = flags;
}
atomic_thread_fence_seq_cst();
@@ -460,5 +649,6 @@
advance_wait = counter_u64_alloc(M_WAITOK);
poll = counter_u64_alloc(M_WAITOK);
poll_scan = counter_u64_alloc(M_WAITOK);
+ poll_fail = counter_u64_alloc(M_WAITOK);
}
SYSINIT(smr_counters, SI_SUB_CPU, SI_ORDER_ANY, smr_init_counters, NULL);
Index: sys/sys/_smr.h
===================================================================
--- sys/sys/_smr.h
+++ sys/sys/_smr.h
@@ -32,6 +32,7 @@
#define _SYS__SMR_H_
typedef uint32_t smr_seq_t;
+typedef int32_t smr_delta_t;
typedef struct smr *smr_t;
#endif /* __SYS_SMR_H_ */
Index: sys/sys/smr.h
===================================================================
--- sys/sys/smr.h
+++ sys/sys/smr.h
@@ -45,11 +45,13 @@
* Modular arithmetic for comparing sequence numbers that have
* potentially wrapped. Copied from tcp_seq.h.
*/
-#define SMR_SEQ_LT(a, b) ((int32_t)((a)-(b)) < 0)
-#define SMR_SEQ_LEQ(a, b) ((int32_t)((a)-(b)) <= 0)
-#define SMR_SEQ_GT(a, b) ((int32_t)((a)-(b)) > 0)
-#define SMR_SEQ_GEQ(a, b) ((int32_t)((a)-(b)) >= 0)
-#define SMR_SEQ_DELTA(a, b) ((int32_t)((a)-(b)))
+#define SMR_SEQ_LT(a, b) ((smr_delta_t)((a)-(b)) < 0)
+#define SMR_SEQ_LEQ(a, b) ((smr_delta_t)((a)-(b)) <= 0)
+#define SMR_SEQ_GT(a, b) ((smr_delta_t)((a)-(b)) > 0)
+#define SMR_SEQ_GEQ(a, b) ((smr_delta_t)((a)-(b)) >= 0)
+#define SMR_SEQ_DELTA(a, b) ((smr_delta_t)((a)-(b)))
+#define SMR_SEQ_MIN(a, b) (SMR_SEQ_LT(a, b) ? a : b)
+#define SMR_SEQ_MAX(a, b) (SMR_SEQ_GT(a, b) ? a : b)
#define SMR_SEQ_INVALID 0
@@ -66,8 +68,13 @@
smr_seq_t c_seq; /* Current observed sequence. */
smr_shared_t c_shared; /* Shared SMR state. */
int c_deferred; /* Deferred advance counter. */
+ int c_limit; /* Deferred advance limit. */
+ int c_flags; /* SMR Configuration */
};
+#define SMR_LAZY 0x0001 /* Higher latency write, fast read. */
+#define SMR_DEFERRED 0x0002 /* Aggregate updates to wr_seq. */
+
#define SMR_ENTERED(smr) \
(curthread->td_critnest != 0 && zpcpu_get((smr))->c_seq != SMR_SEQ_INVALID)
@@ -170,7 +177,8 @@
} while (0)
/*
- * Return the current write sequence number.
+ * Return the current write sequence number. This is not the same as the
+ * current goal which may be in the future.
*/
static inline smr_seq_t
smr_shared_current(smr_shared_t s)
@@ -195,6 +203,8 @@
critical_enter();
smr = zpcpu_get(smr);
+ KASSERT((smr->c_flags & SMR_LAZY) == 0,
+ ("smr_enter(%s) lazy smr.", smr->c_shared->s_name));
KASSERT(smr->c_seq == 0,
("smr_enter(%s) does not support recursion.",
smr->c_shared->s_name));
@@ -228,6 +238,8 @@
smr = zpcpu_get(smr);
CRITICAL_ASSERT(curthread);
+ KASSERT((smr->c_flags & SMR_LAZY) == 0,
+ ("smr_exit(%s) lazy smr.", smr->c_shared->s_name));
KASSERT(smr->c_seq != SMR_SEQ_INVALID,
("smr_exit(%s) not in a smr section.", smr->c_shared->s_name));
@@ -243,17 +255,61 @@
}
/*
- * Advances the write sequence number. Returns the sequence number
- * required to ensure that all modifications are visible to readers.
+ * Enter a lazy smr section. This is used for read-mostly state that
+ * can tolerate a high free latency.
*/
-smr_seq_t smr_advance(smr_t smr);
+static inline void
+smr_lazy_enter(smr_t smr)
+{
+
+ critical_enter();
+ smr = zpcpu_get(smr);
+ KASSERT((smr->c_flags & SMR_LAZY) != 0,
+ ("smr_lazy_enter(%s) non-lazy smr.", smr->c_shared->s_name));
+ KASSERT(smr->c_seq == 0,
+ ("smr_lazy_enter(%s) does not support recursion.",
+ smr->c_shared->s_name));
+
+ /*
+ * This needs no serialization. If an interrupt occurs before we
+ * assign wr_seq to c_seq any speculative loads will be discarded.
+ * If we assign a stale wr_seq value due to interrupt we use the
+ * same algorithm that renders smr_enter() safe.
+ */
+ smr->c_seq = smr_shared_current(smr->c_shared);
+}
/*
- * Advances the write sequence number only after N calls. Returns
- * the correct goal for a wr_seq that has not yet occurred. Used to
- * minimize shared cacheline invalidations for frequent writers.
+ * Exit a lazy smr section. This is used for read-mostly state that
+ * can tolerate a high free latency.
*/
-smr_seq_t smr_advance_deferred(smr_t smr, int limit);
+static inline void
+smr_lazy_exit(smr_t smr)
+{
+
+ smr = zpcpu_get(smr);
+ CRITICAL_ASSERT(curthread);
+ KASSERT((smr->c_flags & SMR_LAZY) != 0,
+ ("smr_lazy_enter(%s) non-lazy smr.", smr->c_shared->s_name));
+ KASSERT(smr->c_seq != SMR_SEQ_INVALID,
+ ("smr_lazy_exit(%s) not in a smr section.", smr->c_shared->s_name));
+
+ /*
+ * All loads/stores must be retired before the sequence becomes
+ * visible. The fence compiles away on amd64. Another
+ * alternative would be to omit the fence but store the exit
+ * time and wait 1 tick longer.
+ */
+ atomic_thread_fence_rel();
+ smr->c_seq = SMR_SEQ_INVALID;
+ critical_exit();
+}
+
+/*
+ * Advances the write sequence number. Returns the sequence number
+ * required to ensure that all modifications are visible to readers.
+ */
+smr_seq_t smr_advance(smr_t smr);
/*
* Returns true if a goal sequence has been reached. If
@@ -262,7 +318,9 @@
bool smr_poll(smr_t smr, smr_seq_t goal, bool wait);
/* Create a new SMR context. */
-smr_t smr_create(const char *name);
+smr_t smr_create(const char *name, int limit, int flags);
+
+/* Destroy the context. */
void smr_destroy(smr_t smr);
/*
Index: sys/tools/umaperf/umaperf.c
===================================================================
--- sys/tools/umaperf/umaperf.c
+++ sys/tools/umaperf/umaperf.c
@@ -230,6 +230,9 @@
case DEFER_SMR:
smr_enter(umaperf_smr);
break;
+ case LAZY_SMR:
+ smr_lazy_enter(umaperf_smr);
+ break;
case EPOCH:
epoch_enter(umaperf_epoch);
break;
@@ -256,6 +259,9 @@
case DEFER_SMR:
smr_exit(umaperf_smr);
break;
+ case LAZY_SMR:
+ smr_lazy_exit(umaperf_smr);
+ break;
case EPOCH:
epoch_exit(umaperf_epoch);
break;
@@ -618,7 +624,7 @@
switch (umaperf_type) {
#ifdef __FreeBSD__
case PLAIN:
- flags = UMA_ZONE_ROUNDROBIN;
+ flags = UMA_ZONE_FIRSTTOUCH;
break;
case SMR:
flags = UMA_ZONE_ROUNDROBIN | UMA_ZONE_SMR;
@@ -652,7 +658,8 @@
umaperf_zone = uma_zcreate("umaperf", umaperf_zone_size,
NULL, NULL, NULL, NULL, UMA_ALIGN_CACHE, flags);
#ifdef __FreeBSD__
- umaperf_smr = uma_zone_get_smr(umaperf_zone);
+ if (umaperf_smr != 0)
+ uma_zone_set_smr(umaperf_zone, umaperf_smr);
#endif
umaperf_init_cpus();
}
Index: sys/vm/uma_core.c
===================================================================
--- sys/vm/uma_core.c
+++ sys/vm/uma_core.c
@@ -1168,7 +1168,6 @@
* Returns:
* Nothing
*/
-
static void
bucket_drain(uma_zone_t zone, uma_bucket_t bucket)
{
@@ -1228,7 +1227,7 @@
*/
seq = SMR_SEQ_INVALID;
if ((zone->uz_flags & UMA_ZONE_SMR) != 0)
- seq = smr_current(zone->uz_smr);
+ seq = smr_advance(zone->uz_smr);
CPU_FOREACH(cpu) {
cache = &zone->uz_cpu[cpu];
bucket = cache_bucket_unload_alloc(cache);
@@ -2707,7 +2706,7 @@
/* Caller requests a private SMR context. */
if ((zone->uz_flags & UMA_ZONE_SMR) != 0)
- zone->uz_smr = smr_create(zone->uz_name);
+ zone->uz_smr = smr_create(zone->uz_name, 0, 0);
KASSERT((arg->flags & (UMA_ZONE_MAXBUCKET | UMA_ZONE_NOBUCKET)) !=
(UMA_ZONE_MAXBUCKET | UMA_ZONE_NOBUCKET),
@@ -4180,22 +4179,21 @@
"uma_zfree: zone %s(%p) draining cross bucket %p",
zone->uz_name, zone, bucket);
- STAILQ_INIT(&fullbuckets);
+ /*
+ * It is possible for buckets to arrive here out of order so we fetch
+ * the current smr seq rather than accepting the bucket's.
+ */
+ seq = SMR_SEQ_INVALID;
+ if ((zone->uz_flags & UMA_ZONE_SMR) != 0)
+ seq = smr_advance(zone->uz_smr);
/*
* To avoid having ndomain * ndomain buckets for sorting we have a
* lock on the current crossfree bucket. A full matrix with
* per-domain locking could be used if necessary.
*/
+ STAILQ_INIT(&fullbuckets);
ZONE_CROSS_LOCK(zone);
-
- /*
- * It is possible for buckets to arrive here out of order so we fetch
- * the current smr seq rather than accepting the bucket's.
- */
- seq = SMR_SEQ_INVALID;
- if ((zone->uz_flags & UMA_ZONE_SMR) != 0)
- seq = smr_current(zone->uz_smr);
while (bucket->ub_cnt > 0) {
item = bucket->ub_bucket[bucket->ub_cnt - 1];
domain = _vm_phys_domain(pmap_kextract((vm_offset_t)item));
Index: tools/uma/smrstress/smrstress.c
===================================================================
--- tools/uma/smrstress/smrstress.c
+++ tools/uma/smrstress/smrstress.c
@@ -64,12 +64,14 @@
static void
smrs_error(struct smrs *smrs, const char *fmt, ...)
{
+ smr_t self;
va_list ap;
+ self = zpcpu_get(smrs_smr);
atomic_add_int(&smrs_failures, 1);
printf("SMR ERROR: wr_seq %d, rd_seq %d, c_seq %d, generation %d, count %d ",
- smrs_smr->c_shared->s_wr_seq, smrs_smr->c_shared->s_rd_seq,
- zpcpu_get(smrs_smr)->c_seq, smrs->generation, smrs->count);
+ smr_current(smrs_smr), self->c_shared->s_rd_seq, self->c_seq,
+ smrs->generation, smrs->count);
va_start(ap, fmt);
(void)vprintf(fmt, ap);
va_end(ap);
@@ -83,7 +85,7 @@
/* Wait for the writer to exit. */
while (smrs_completed == 0) {
- smr_enter(smrs_smr);
+ smr_lazy_enter(smrs_smr);
cur = (void *)atomic_load_acq_ptr(&smrs_current);
if (cur->generation == -1)
smrs_error(cur, "read early: Use after free!\n");
@@ -94,7 +96,7 @@
smrs_error(cur, "read late: Use after free!\n");
else if (cnt <= 0)
smrs_error(cur, "Invalid ref\n");
- smr_exit(smrs_smr);
+ smr_lazy_exit(smrs_smr);
maybe_yield();
}
}
@@ -190,8 +192,9 @@
smrs_zone = uma_zcreate("smrs", sizeof(struct smrs),
smrs_ctor, smrs_dtor, NULL, NULL, UMA_ALIGN_PTR,
- UMA_ZONE_SMR | UMA_ZONE_ZINIT);
- smrs_smr = uma_zone_get_smr(smrs_zone);
+ UMA_ZONE_ZINIT);
+ smrs_smr = smr_create("smrs", SMR_LAZY);
+ uma_zone_set_smr(smrs_zone, smrs_smr);
}
static void

File Metadata

Mime Type
text/plain
Expires
Sun, Mar 1, 5:58 PM (8 h, 50 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
29121657
Default Alt Text
D23738.id68514.diff (27 KB)

Event Timeline