Page MenuHomeFreeBSD

D23738.id68514.diff
No OneTemporary

D23738.id68514.diff

Index: sys/kern/subr_smr.c
===================================================================
--- sys/kern/subr_smr.c
+++ sys/kern/subr_smr.c
@@ -164,58 +164,133 @@
#define SMR_SEQ_MAX_ADVANCE SMR_SEQ_MAX_DELTA / 2
#endif
+/*
+ * The grace period for lazy (tick based) SMR.
+ *
+ * Hardclock is responsible for advancing ticks on a single CPU while every
+ * CPU receives a regular clock interrupt. The clock interrupts are flushing
+ * the store buffers and any speculative loads that may violate our invariants.
+ * Because these interrupts are not synchronized we must wait one additional
+ * tick in the future to be certain that all processors have had their state
+ * synchronized by an interrupt.
+ *
+ * This assumes that the clock interrupt will only be delayed by other causes
+ * that will flush the store buffer or prevent access to the section protected
+ * data. For example, an idle processor, or a system management interrupt,
+ * or a vm exit.
+ *
+ * We must wait one additional tick if we are around the wrap condition
+ * because the write seq will move forward by two with one interrupt.
+ */
+#define SMR_LAZY_GRACE 2
+#define SMR_LAZY_GRACE_MAX (SMR_LAZY_GRACE + 1)
+
+/*
+ * The maximum sequence number ahead of wr_seq that may still be valid. The
+ * sequence may not be advanced on write for lazy or deferred SMRs. In this
+ * case poll needs to attempt to forward the sequence number if the goal is
+ * within wr_seq + SMR_SEQ_ADVANCE.
+ */
+#define SMR_SEQ_ADVANCE MAX(SMR_SEQ_INCR, SMR_LAZY_GRACE_MAX)
+
static SYSCTL_NODE(_debug, OID_AUTO, smr, CTLFLAG_RW, NULL, "SMR Stats");
static counter_u64_t advance = EARLY_COUNTER;
-SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, advance, CTLFLAG_RD, &advance, "");
+SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, advance, CTLFLAG_WR, &advance, "");
static counter_u64_t advance_wait = EARLY_COUNTER;
-SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, advance_wait, CTLFLAG_RD, &advance_wait, "");
+SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, advance_wait, CTLFLAG_WR, &advance_wait, "");
static counter_u64_t poll = EARLY_COUNTER;
-SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, poll, CTLFLAG_RD, &poll, "");
+SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, poll, CTLFLAG_WR, &poll, "");
static counter_u64_t poll_scan = EARLY_COUNTER;
-SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, poll_scan, CTLFLAG_RD, &poll_scan, "");
-
+SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, poll_scan, CTLFLAG_WR, &poll_scan, "");
+static counter_u64_t poll_fail = EARLY_COUNTER;
+SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, poll_fail, CTLFLAG_WR, &poll_fail, "");
/*
- * Advance the write sequence and return the new value for use as the
- * wait goal. This guarantees that any changes made by the calling
- * thread prior to this call will be visible to all threads after
- * rd_seq meets or exceeds the return value.
+ * Advance a lazy write sequence number. These move forward at the rate of
+ * ticks. Grace is two ticks in the future. Lazy write sequence numbers can
+ * be even but not SMR_SEQ_INVALID so we pause time for a tick when we wrap.
*
- * This function may busy loop if the readers are roughly 1 billion
- * sequence numbers behind the writers.
+ * This returns the _current_ write sequence number. The lazy goal sequence
+ * number is SMR_LAZY_GRACE ticks ahead.
*/
-smr_seq_t
-smr_advance(smr_t smr)
+static smr_seq_t
+smr_lazy_advance(smr_t smr, smr_shared_t s)
{
- smr_shared_t s;
- smr_seq_t goal, s_rd_seq;
+ smr_seq_t s_rd_seq, s_wr_seq, goal;
+ int t;
+
+ CRITICAL_ASSERT(curthread);
/*
- * It is illegal to enter while in an smr section.
+ * We must not allow a zero tick value. We go back in time one tick
+ * and advance the grace period forward one tick around zero.
*/
- SMR_ASSERT_NOT_ENTERED(smr);
+ t = ticks;
+ if (t == SMR_SEQ_INVALID)
+ t--;
/*
- * Modifications not done in a smr section need to be visible
- * before advancing the seq.
+ * The most probable condition is that the update already took place.
*/
- atomic_thread_fence_rel();
+ s_wr_seq = atomic_load_int(&s->s_wr_seq);
+ if (__predict_true(t == s_wr_seq))
+ goto out;
/*
- * Load the current read seq before incrementing the goal so
- * we are guaranteed it is always < goal.
+ * After long idle periods the read sequence may fall too far
+ * behind write. Prevent poll from ever seeing this condition
+ * by updating the stale rd_seq. This assumes that there can
+ * be no valid section 2bn ticks old. The rd_seq update must
+ * be visible before wr_seq to avoid races with other advance
+ * callers.
*/
- s = zpcpu_get(smr)->c_shared;
- s_rd_seq = atomic_load_acq_int(&s->s_rd_seq);
+ s_rd_seq = atomic_load_int(&s->s_rd_seq);
+ if (SMR_SEQ_GT(s_rd_seq, t))
+ atomic_cmpset_rel_int(&s->s_rd_seq, s_rd_seq, t);
+ atomic_cmpset_int(&s->s_wr_seq, s_wr_seq, t);
+ counter_u64_add(advance, 1);
+ /* If we lost either update race another thread did it. */
+ s_wr_seq = t;
+out:
+ goal = s_wr_seq + SMR_LAZY_GRACE;
+ /* Skip over the SMR_SEQ_INVALID tick. */
+ if (goal < SMR_LAZY_GRACE)
+ goal++;
+ return (goal);
+}
+
+/*
+ * Increment the shared write sequence by 2. Since it is initialized
+ * to 1 this means the only valid values are odd and an observed value
+ * of 0 in a particular CPU means it is not currently in a read section.
+ */
+static smr_seq_t
+smr_shared_advance(smr_shared_t s)
+{
+
+ return (atomic_fetchadd_int(&s->s_wr_seq, SMR_SEQ_INCR) + SMR_SEQ_INCR);
+}
+
+/*
+ * Advance the write sequence number for a normal smr section. If the
+ * write sequence is too far behind the read sequence we have to poll
+ * to advance rd_seq and prevent undetectable wraps.
+ */
+static smr_seq_t
+smr_default_advance(smr_t smr, smr_shared_t s)
+{
+ smr_seq_t goal, s_rd_seq;
+
+ CRITICAL_ASSERT(curthread);
+ KASSERT((zpcpu_get(smr)->c_flags & SMR_LAZY) == 0,
+ ("smr_default_advance: called with lazy smr."));
/*
- * Increment the shared write sequence by 2. Since it is
- * initialized to 1 this means the only valid values are
- * odd and an observed value of 0 in a particular CPU means
- * it is not currently in a read section.
+ * Load the current read seq before incrementing the goal so
+ * we are guaranteed it is always < goal.
*/
- goal = atomic_fetchadd_int(&s->s_wr_seq, SMR_SEQ_INCR) + SMR_SEQ_INCR;
- counter_u64_add(advance, 1);
+ s_rd_seq = atomic_load_acq_int(&s->s_rd_seq);
+ goal = smr_shared_advance(s);
/*
* Force a synchronization here if the goal is getting too
@@ -226,30 +301,172 @@
counter_u64_add(advance_wait, 1);
smr_wait(smr, goal - SMR_SEQ_MAX_ADVANCE);
}
+ counter_u64_add(advance, 1);
return (goal);
}
+/*
+ * Deferred SMRs conditionally update s_wr_seq based on a
+ * cpu local interval count.
+ */
+static smr_seq_t
+smr_deferred_advance(smr_t smr, smr_shared_t s, smr_t self)
+{
+
+ if (++self->c_deferred < self->c_limit)
+ return (smr_shared_current(s) + SMR_SEQ_INCR);
+ self->c_deferred = 0;
+ return (smr_default_advance(smr, s));
+}
+
+/*
+ * Advance the write sequence and return the value for use as the
+ * wait goal. This guarantees that any changes made by the calling
+ * thread prior to this call will be visible to all threads after
+ * rd_seq meets or exceeds the return value.
+ *
+ * This function may busy loop if the readers are roughly 1 billion
+ * sequence numbers behind the writers.
+ *
+ * Lazy SMRs will not busy loop and the wrap happens every 49.6 days
+ * at 1khz and 119 hours at 10khz. Readers can block for no longer
+ * than half of this for SMR_SEQ_ macros to continue working.
+ */
smr_seq_t
-smr_advance_deferred(smr_t smr, int limit)
+smr_advance(smr_t smr)
{
+ smr_t self;
+ smr_shared_t s;
smr_seq_t goal;
- smr_t csmr;
+ int flags;
+ /*
+ * It is illegal to enter while in an smr section.
+ */
SMR_ASSERT_NOT_ENTERED(smr);
+ /*
+ * Modifications not done in a smr section need to be visible
+ * before advancing the seq.
+ */
+ atomic_thread_fence_rel();
+
critical_enter();
- csmr = zpcpu_get(smr);
- if (++csmr->c_deferred >= limit) {
- goal = SMR_SEQ_INVALID;
- csmr->c_deferred = 0;
- } else
- goal = smr_shared_current(csmr->c_shared) + SMR_SEQ_INCR;
+ /* Try to touch the line once. */
+ self = zpcpu_get(smr);
+ s = self->c_shared;
+ flags = self->c_flags;
+ goal = SMR_SEQ_INVALID;
+ if ((flags & (SMR_LAZY | SMR_DEFERRED)) == 0)
+ goal = smr_default_advance(smr, s);
+ else if ((flags & SMR_LAZY) != 0)
+ goal = smr_lazy_advance(smr, s);
+ else if ((flags & SMR_DEFERRED) != 0)
+ goal = smr_deferred_advance(smr, s, self);
critical_exit();
- if (goal != SMR_SEQ_INVALID)
- return (goal);
- return (smr_advance(smr));
+ return (goal);
+}
+
+/*
+ * Poll to determine the currently observed sequence number on a cpu
+ * and spinwait if the 'wait' argument is true.
+ */
+static smr_seq_t
+smr_poll_cpu(smr_t c, smr_seq_t s_rd_seq, smr_seq_t goal, bool wait)
+{
+ smr_seq_t c_seq;
+
+ c_seq = SMR_SEQ_INVALID;
+ for (;;) {
+ c_seq = atomic_load_int(&c->c_seq);
+ if (c_seq == SMR_SEQ_INVALID)
+ break;
+
+ /*
+ * There is a race described in smr.h:smr_enter that
+ * can lead to a stale seq value but not stale data
+ * access. If we find a value out of range here we
+ * pin it to the current min to prevent it from
+ * advancing until that stale section has expired.
+ *
+ * The race is created when a cpu loads the s_wr_seq
+ * value in a local register and then another thread
+ * advances s_wr_seq and calls smr_poll() which will
+ * observe no value yet in c_seq and advance s_rd_seq
+ * up to s_wr_seq which is beyond the register
+ * cached value. This is only likely to happen on
+ * hypervisor or with a system management interrupt.
+ */
+ if (SMR_SEQ_LT(c_seq, s_rd_seq))
+ c_seq = s_rd_seq;
+
+ /*
+ * If the sequence number meets the goal we are done
+ * with this cpu.
+ */
+ if (SMR_SEQ_LEQ(goal, c_seq))
+ break;
+
+ if (!wait)
+ break;
+ cpu_spinwait();
+ }
+
+ return (c_seq);
+}
+
+/*
+ * Loop until all cores have observed the goal sequence or have
+ * gone inactive. Returns the oldest sequence currently active.
+ *
+ * This function assumes a snapshot of sequence values has
+ * been obtained and validated by smr_poll().
+ */
+static smr_seq_t
+smr_poll_scan(smr_t smr, smr_shared_t s, smr_seq_t s_rd_seq,
+ smr_seq_t s_wr_seq, smr_seq_t goal, bool wait)
+{
+ smr_seq_t rd_seq, c_seq;
+ int i;
+
+ CRITICAL_ASSERT(curthread);
+ counter_u64_add_protected(poll_scan, 1);
+
+ /*
+ * The read sequence can be no larger than the write sequence at
+ * the start of the poll.
+ */
+ rd_seq = s_wr_seq;
+ CPU_FOREACH(i) {
+ /*
+ * Query the active sequence on this cpu. If we're not
+ * waiting and we don't meet the goal we will still scan
+ * the rest of the cpus to update s_rd_seq before returning
+ * failure.
+ */
+ c_seq = smr_poll_cpu(zpcpu_get_cpu(smr, i), s_rd_seq, goal,
+ wait);
+
+ /*
+ * Limit the minimum observed rd_seq whether we met the goal
+ * or not.
+ */
+ if (c_seq != SMR_SEQ_INVALID)
+ rd_seq = SMR_SEQ_MIN(rd_seq, c_seq);
+ }
+
+ /*
+ * Advance the rd_seq as long as we observed a more recent value.
+ */
+ s_rd_seq = atomic_load_int(&s->s_rd_seq);
+ if (SMR_SEQ_GEQ(rd_seq, s_rd_seq)) {
+ atomic_cmpset_int(&s->s_rd_seq, s_rd_seq, rd_seq);
+ s_rd_seq = rd_seq;
+ }
+
+ return (s_rd_seq);
}
/*
@@ -268,9 +485,10 @@
smr_poll(smr_t smr, smr_seq_t goal, bool wait)
{
smr_shared_t s;
- smr_t c;
- smr_seq_t s_wr_seq, s_rd_seq, rd_seq, c_seq;
- int i;
+ smr_t self;
+ smr_seq_t s_wr_seq, s_rd_seq;
+ smr_delta_t delta;
+ int flags;
bool success;
/*
@@ -278,6 +496,8 @@
*/
KASSERT(!wait || !SMR_ENTERED(smr),
("smr_poll: Blocking not allowed in a SMR section."));
+ KASSERT(!wait || (zpcpu_get(smr)->c_flags & SMR_LAZY) == 0,
+ ("smr_poll: Blocking not allowed on lazy smrs."));
/*
* Use a critical section so that we can avoid ABA races
@@ -285,9 +505,19 @@
*/
success = true;
critical_enter();
- s = zpcpu_get(smr)->c_shared;
+ /* Attempt to load from self only once. */
+ self = zpcpu_get(smr);
+ s = self->c_shared;
+ flags = self->c_flags;
counter_u64_add_protected(poll, 1);
+ /*
+ * Conditionally advance the lazy write clock on any writer
+ * activity. This may reset s_rd_seq.
+ */
+ if ((flags & SMR_LAZY) != 0)
+ smr_lazy_advance(smr, s);
+
/*
* Acquire barrier loads s_wr_seq after s_rd_seq so that we can not
* observe an updated read sequence that is larger than write.
@@ -295,106 +525,59 @@
s_rd_seq = atomic_load_acq_int(&s->s_rd_seq);
/*
- * wr_seq must be loaded prior to any c_seq value so that a stale
- * c_seq can only reference time after this wr_seq.
+ * If we have already observed the sequence number we can immediately
+ * return success. Most polls should meet this criterion.
*/
- s_wr_seq = atomic_load_acq_int(&s->s_wr_seq);
+ if (SMR_SEQ_LEQ(goal, s_rd_seq))
+ goto out;
/*
- * This may have come from a deferred advance. Consider one
- * increment past the current wr_seq valid and make sure we
- * have advanced far enough to succeed. We simply add to avoid
- * an additional fence.
+ * wr_seq must be loaded prior to any c_seq value so that a
+ * stale c_seq can only reference time after this wr_seq.
*/
- if (goal == s_wr_seq + SMR_SEQ_INCR) {
- atomic_add_int(&s->s_wr_seq, SMR_SEQ_INCR);
- s_wr_seq = goal;
- }
+ s_wr_seq = atomic_load_acq_int(&s->s_wr_seq);
/*
- * Detect whether the goal is valid and has already been observed.
- *
- * The goal must be in the range of s_wr_seq >= goal >= s_rd_seq for
- * it to be valid. If it is not then the caller held on to it and
- * the integer wrapped. If we wrapped back within range the caller
- * will harmlessly scan.
- *
- * A valid goal must be greater than s_rd_seq or we have not verified
- * that it has been observed and must fall through to polling.
+ * This is the distance from s_wr_seq to goal. Positive values
+ * are in the future.
*/
- if (SMR_SEQ_GEQ(s_rd_seq, goal) || SMR_SEQ_LT(s_wr_seq, goal))
- goto out;
+ delta = SMR_SEQ_DELTA(goal, s_wr_seq);
/*
- * Loop until all cores have observed the goal sequence or have
- * gone inactive. Keep track of the oldest sequence currently
- * active as rd_seq.
+ * Detect a stale wr_seq.
+ *
+ * This goal may have come from a deferred advance or a lazy
+ * smr. If we are not blocking we can not succeed but the
+ * sequence number is valid.
*/
- counter_u64_add_protected(poll_scan, 1);
- rd_seq = s_wr_seq;
- CPU_FOREACH(i) {
- c = zpcpu_get_cpu(smr, i);
- c_seq = SMR_SEQ_INVALID;
- for (;;) {
- c_seq = atomic_load_int(&c->c_seq);
- if (c_seq == SMR_SEQ_INVALID)
- break;
-
- /*
- * There is a race described in smr.h:smr_enter that
- * can lead to a stale seq value but not stale data
- * access. If we find a value out of range here we
- * pin it to the current min to prevent it from
- * advancing until that stale section has expired.
- *
- * The race is created when a cpu loads the s_wr_seq
- * value in a local register and then another thread
- * advances s_wr_seq and calls smr_poll() which will
- * oberve no value yet in c_seq and advance s_rd_seq
- * up to s_wr_seq which is beyond the register
- * cached value. This is only likely to happen on
- * hypervisor or with a system management interrupt.
- */
- if (SMR_SEQ_LT(c_seq, s_rd_seq))
- c_seq = s_rd_seq;
-
- /*
- * If the sequence number meets the goal we are
- * done with this cpu.
- */
- if (SMR_SEQ_GEQ(c_seq, goal))
- break;
-
- /*
- * If we're not waiting we will still scan the rest
- * of the cpus and update s_rd_seq before returning
- * an error.
- */
- if (!wait) {
- success = false;
- break;
- }
- cpu_spinwait();
+ if (delta > 0 && delta <= SMR_SEQ_MAX_ADVANCE &&
+ (flags & (SMR_LAZY | SMR_DEFERRED)) != 0) {
+ if (!wait) {
+ success = false;
+ goto out;
}
-
- /*
- * Limit the minimum observed rd_seq whether we met the goal
- * or not.
- */
- if (c_seq != SMR_SEQ_INVALID && SMR_SEQ_GT(rd_seq, c_seq))
- rd_seq = c_seq;
+ /* LAZY is always !wait. */
+ s_wr_seq = smr_shared_advance(s);
+ delta = 0;
}
/*
- * Advance the rd_seq as long as we observed the most recent one.
+ * Detect an invalid goal.
+ *
+ * The goal must be in the range of s_wr_seq >= goal >= s_rd_seq for
+ * it to be valid. If it is not then the caller held on to it and
+ * the integer wrapped. If we wrapped back within range the caller
+ * will harmlessly scan.
*/
- s_rd_seq = atomic_load_int(&s->s_rd_seq);
- do {
- if (SMR_SEQ_LEQ(rd_seq, s_rd_seq))
- goto out;
- } while (atomic_fcmpset_int(&s->s_rd_seq, &s_rd_seq, rd_seq) == 0);
+ if (delta > 0)
+ goto out;
+ /* Determine the lowest visible sequence number. */
+ s_rd_seq = smr_poll_scan(smr, s, s_rd_seq, s_wr_seq, goal, wait);
+ success = SMR_SEQ_LEQ(goal, s_rd_seq);
out:
+ if (!success)
+ counter_u64_add_protected(poll_fail, 1);
critical_exit();
/*
@@ -407,7 +590,7 @@
}
smr_t
-smr_create(const char *name)
+smr_create(const char *name, int limit, int flags)
{
smr_t smr, c;
smr_shared_t s;
@@ -417,13 +600,19 @@
smr = uma_zalloc_pcpu(smr_zone, M_WAITOK);
s->s_name = name;
- s->s_rd_seq = s->s_wr_seq = SMR_SEQ_INIT;
+ if ((flags & SMR_LAZY) == 0)
+ s->s_rd_seq = s->s_wr_seq = SMR_SEQ_INIT;
+ else
+ s->s_rd_seq = s->s_wr_seq = ticks;
/* Initialize all CPUS, not just those running. */
for (i = 0; i <= mp_maxid; i++) {
c = zpcpu_get_cpu(smr, i);
c->c_seq = SMR_SEQ_INVALID;
c->c_shared = s;
+ c->c_deferred = 0;
+ c->c_limit = limit;
+ c->c_flags = flags;
}
atomic_thread_fence_seq_cst();
@@ -460,5 +649,6 @@
advance_wait = counter_u64_alloc(M_WAITOK);
poll = counter_u64_alloc(M_WAITOK);
poll_scan = counter_u64_alloc(M_WAITOK);
+ poll_fail = counter_u64_alloc(M_WAITOK);
}
SYSINIT(smr_counters, SI_SUB_CPU, SI_ORDER_ANY, smr_init_counters, NULL);
Index: sys/sys/_smr.h
===================================================================
--- sys/sys/_smr.h
+++ sys/sys/_smr.h
@@ -32,6 +32,7 @@
#define _SYS__SMR_H_
typedef uint32_t smr_seq_t;
+typedef int32_t smr_delta_t;
typedef struct smr *smr_t;
#endif /* __SYS_SMR_H_ */
Index: sys/sys/smr.h
===================================================================
--- sys/sys/smr.h
+++ sys/sys/smr.h
@@ -45,11 +45,13 @@
* Modular arithmetic for comparing sequence numbers that have
* potentially wrapped. Copied from tcp_seq.h.
*/
-#define SMR_SEQ_LT(a, b) ((int32_t)((a)-(b)) < 0)
-#define SMR_SEQ_LEQ(a, b) ((int32_t)((a)-(b)) <= 0)
-#define SMR_SEQ_GT(a, b) ((int32_t)((a)-(b)) > 0)
-#define SMR_SEQ_GEQ(a, b) ((int32_t)((a)-(b)) >= 0)
-#define SMR_SEQ_DELTA(a, b) ((int32_t)((a)-(b)))
+#define SMR_SEQ_LT(a, b) ((smr_delta_t)((a)-(b)) < 0)
+#define SMR_SEQ_LEQ(a, b) ((smr_delta_t)((a)-(b)) <= 0)
+#define SMR_SEQ_GT(a, b) ((smr_delta_t)((a)-(b)) > 0)
+#define SMR_SEQ_GEQ(a, b) ((smr_delta_t)((a)-(b)) >= 0)
+#define SMR_SEQ_DELTA(a, b) ((smr_delta_t)((a)-(b)))
+#define SMR_SEQ_MIN(a, b) (SMR_SEQ_LT(a, b) ? a : b)
+#define SMR_SEQ_MAX(a, b) (SMR_SEQ_GT(a, b) ? a : b)
#define SMR_SEQ_INVALID 0
@@ -66,8 +68,13 @@
smr_seq_t c_seq; /* Current observed sequence. */
smr_shared_t c_shared; /* Shared SMR state. */
int c_deferred; /* Deferred advance counter. */
+ int c_limit; /* Deferred advance limit. */
+ int c_flags; /* SMR Configuration */
};
+#define SMR_LAZY 0x0001 /* Higher latency write, fast read. */
+#define SMR_DEFERRED 0x0002 /* Aggregate updates to wr_seq. */
+
#define SMR_ENTERED(smr) \
(curthread->td_critnest != 0 && zpcpu_get((smr))->c_seq != SMR_SEQ_INVALID)
@@ -170,7 +177,8 @@
} while (0)
/*
- * Return the current write sequence number.
+ * Return the current write sequence number. This is not the same as the
+ * current goal which may be in the future.
*/
static inline smr_seq_t
smr_shared_current(smr_shared_t s)
@@ -195,6 +203,8 @@
critical_enter();
smr = zpcpu_get(smr);
+ KASSERT((smr->c_flags & SMR_LAZY) == 0,
+ ("smr_enter(%s) lazy smr.", smr->c_shared->s_name));
KASSERT(smr->c_seq == 0,
("smr_enter(%s) does not support recursion.",
smr->c_shared->s_name));
@@ -228,6 +238,8 @@
smr = zpcpu_get(smr);
CRITICAL_ASSERT(curthread);
+ KASSERT((smr->c_flags & SMR_LAZY) == 0,
+ ("smr_exit(%s) lazy smr.", smr->c_shared->s_name));
KASSERT(smr->c_seq != SMR_SEQ_INVALID,
("smr_exit(%s) not in a smr section.", smr->c_shared->s_name));
@@ -243,17 +255,61 @@
}
/*
- * Advances the write sequence number. Returns the sequence number
- * required to ensure that all modifications are visible to readers.
+ * Enter a lazy smr section. This is used for read-mostly state that
+ * can tolerate a high free latency.
*/
-smr_seq_t smr_advance(smr_t smr);
+static inline void
+smr_lazy_enter(smr_t smr)
+{
+
+ critical_enter();
+ smr = zpcpu_get(smr);
+ KASSERT((smr->c_flags & SMR_LAZY) != 0,
+ ("smr_lazy_enter(%s) non-lazy smr.", smr->c_shared->s_name));
+ KASSERT(smr->c_seq == 0,
+ ("smr_lazy_enter(%s) does not support recursion.",
+ smr->c_shared->s_name));
+
+ /*
+ * This needs no serialization. If an interrupt occurs before we
+ * assign wr_seq to c_seq any speculative loads will be discarded.
+ * If we assign a stale wr_seq value due to interrupt we use the
+ * same algorithm that renders smr_enter() safe.
+ */
+ smr->c_seq = smr_shared_current(smr->c_shared);
+}
/*
- * Advances the write sequence number only after N calls. Returns
- * the correct goal for a wr_seq that has not yet occurred. Used to
- * minimize shared cacheline invalidations for frequent writers.
+ * Exit a lazy smr section. This is used for read-mostly state that
+ * can tolerate a high free latency.
*/
-smr_seq_t smr_advance_deferred(smr_t smr, int limit);
+static inline void
+smr_lazy_exit(smr_t smr)
+{
+
+ smr = zpcpu_get(smr);
+ CRITICAL_ASSERT(curthread);
+ KASSERT((smr->c_flags & SMR_LAZY) != 0,
+ ("smr_lazy_enter(%s) non-lazy smr.", smr->c_shared->s_name));
+ KASSERT(smr->c_seq != SMR_SEQ_INVALID,
+ ("smr_lazy_exit(%s) not in a smr section.", smr->c_shared->s_name));
+
+ /*
+ * All loads/stores must be retired before the sequence becomes
+ * visible. The fence compiles away on amd64. Another
+ * alternative would be to omit the fence but store the exit
+ * time and wait 1 tick longer.
+ */
+ atomic_thread_fence_rel();
+ smr->c_seq = SMR_SEQ_INVALID;
+ critical_exit();
+}
+
+/*
+ * Advances the write sequence number. Returns the sequence number
+ * required to ensure that all modifications are visible to readers.
+ */
+smr_seq_t smr_advance(smr_t smr);
/*
* Returns true if a goal sequence has been reached. If
@@ -262,7 +318,9 @@
bool smr_poll(smr_t smr, smr_seq_t goal, bool wait);
/* Create a new SMR context. */
-smr_t smr_create(const char *name);
+smr_t smr_create(const char *name, int limit, int flags);
+
+/* Destroy the context. */
void smr_destroy(smr_t smr);
/*
Index: sys/tools/umaperf/umaperf.c
===================================================================
--- sys/tools/umaperf/umaperf.c
+++ sys/tools/umaperf/umaperf.c
@@ -230,6 +230,9 @@
case DEFER_SMR:
smr_enter(umaperf_smr);
break;
+ case LAZY_SMR:
+ smr_lazy_enter(umaperf_smr);
+ break;
case EPOCH:
epoch_enter(umaperf_epoch);
break;
@@ -256,6 +259,9 @@
case DEFER_SMR:
smr_exit(umaperf_smr);
break;
+ case LAZY_SMR:
+ smr_lazy_exit(umaperf_smr);
+ break;
case EPOCH:
epoch_exit(umaperf_epoch);
break;
@@ -618,7 +624,7 @@
switch (umaperf_type) {
#ifdef __FreeBSD__
case PLAIN:
- flags = UMA_ZONE_ROUNDROBIN;
+ flags = UMA_ZONE_FIRSTTOUCH;
break;
case SMR:
flags = UMA_ZONE_ROUNDROBIN | UMA_ZONE_SMR;
@@ -652,7 +658,8 @@
umaperf_zone = uma_zcreate("umaperf", umaperf_zone_size,
NULL, NULL, NULL, NULL, UMA_ALIGN_CACHE, flags);
#ifdef __FreeBSD__
- umaperf_smr = uma_zone_get_smr(umaperf_zone);
+ if (umaperf_smr != 0)
+ uma_zone_set_smr(umaperf_zone, umaperf_smr);
#endif
umaperf_init_cpus();
}
Index: sys/vm/uma_core.c
===================================================================
--- sys/vm/uma_core.c
+++ sys/vm/uma_core.c
@@ -1168,7 +1168,6 @@
* Returns:
* Nothing
*/
-
static void
bucket_drain(uma_zone_t zone, uma_bucket_t bucket)
{
@@ -1228,7 +1227,7 @@
*/
seq = SMR_SEQ_INVALID;
if ((zone->uz_flags & UMA_ZONE_SMR) != 0)
- seq = smr_current(zone->uz_smr);
+ seq = smr_advance(zone->uz_smr);
CPU_FOREACH(cpu) {
cache = &zone->uz_cpu[cpu];
bucket = cache_bucket_unload_alloc(cache);
@@ -2707,7 +2706,7 @@
/* Caller requests a private SMR context. */
if ((zone->uz_flags & UMA_ZONE_SMR) != 0)
- zone->uz_smr = smr_create(zone->uz_name);
+ zone->uz_smr = smr_create(zone->uz_name, 0, 0);
KASSERT((arg->flags & (UMA_ZONE_MAXBUCKET | UMA_ZONE_NOBUCKET)) !=
(UMA_ZONE_MAXBUCKET | UMA_ZONE_NOBUCKET),
@@ -4180,22 +4179,21 @@
"uma_zfree: zone %s(%p) draining cross bucket %p",
zone->uz_name, zone, bucket);
- STAILQ_INIT(&fullbuckets);
+ /*
+ * It is possible for buckets to arrive here out of order so we fetch
+ * the current smr seq rather than accepting the bucket's.
+ */
+ seq = SMR_SEQ_INVALID;
+ if ((zone->uz_flags & UMA_ZONE_SMR) != 0)
+ seq = smr_advance(zone->uz_smr);
/*
* To avoid having ndomain * ndomain buckets for sorting we have a
* lock on the current crossfree bucket. A full matrix with
* per-domain locking could be used if necessary.
*/
+ STAILQ_INIT(&fullbuckets);
ZONE_CROSS_LOCK(zone);
-
- /*
- * It is possible for buckets to arrive here out of order so we fetch
- * the current smr seq rather than accepting the bucket's.
- */
- seq = SMR_SEQ_INVALID;
- if ((zone->uz_flags & UMA_ZONE_SMR) != 0)
- seq = smr_current(zone->uz_smr);
while (bucket->ub_cnt > 0) {
item = bucket->ub_bucket[bucket->ub_cnt - 1];
domain = _vm_phys_domain(pmap_kextract((vm_offset_t)item));
Index: tools/uma/smrstress/smrstress.c
===================================================================
--- tools/uma/smrstress/smrstress.c
+++ tools/uma/smrstress/smrstress.c
@@ -64,12 +64,14 @@
static void
smrs_error(struct smrs *smrs, const char *fmt, ...)
{
+ smr_t self;
va_list ap;
+ self = zpcpu_get(smrs_smr);
atomic_add_int(&smrs_failures, 1);
printf("SMR ERROR: wr_seq %d, rd_seq %d, c_seq %d, generation %d, count %d ",
- smrs_smr->c_shared->s_wr_seq, smrs_smr->c_shared->s_rd_seq,
- zpcpu_get(smrs_smr)->c_seq, smrs->generation, smrs->count);
+ smr_current(smrs_smr), self->c_shared->s_rd_seq, self->c_seq,
+ smrs->generation, smrs->count);
va_start(ap, fmt);
(void)vprintf(fmt, ap);
va_end(ap);
@@ -83,7 +85,7 @@
/* Wait for the writer to exit. */
while (smrs_completed == 0) {
- smr_enter(smrs_smr);
+ smr_lazy_enter(smrs_smr);
cur = (void *)atomic_load_acq_ptr(&smrs_current);
if (cur->generation == -1)
smrs_error(cur, "read early: Use after free!\n");
@@ -94,7 +96,7 @@
smrs_error(cur, "read late: Use after free!\n");
else if (cnt <= 0)
smrs_error(cur, "Invalid ref\n");
- smr_exit(smrs_smr);
+ smr_lazy_exit(smrs_smr);
maybe_yield();
}
}
@@ -190,8 +192,9 @@
smrs_zone = uma_zcreate("smrs", sizeof(struct smrs),
smrs_ctor, smrs_dtor, NULL, NULL, UMA_ALIGN_PTR,
- UMA_ZONE_SMR | UMA_ZONE_ZINIT);
- smrs_smr = uma_zone_get_smr(smrs_zone);
+ UMA_ZONE_ZINIT);
+ smrs_smr = smr_create("smrs", SMR_LAZY);
+ uma_zone_set_smr(smrs_zone, smrs_smr);
}
static void

File Metadata

Mime Type
text/plain
Expires
Sun, Mar 1, 5:58 PM (8 h, 50 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
29121657
Default Alt Text
D23738.id68514.diff (27 KB)

Event Timeline