Index: sys/kern/subr_smr.c =================================================================== --- sys/kern/subr_smr.c +++ sys/kern/subr_smr.c @@ -164,58 +164,133 @@ #define SMR_SEQ_MAX_ADVANCE SMR_SEQ_MAX_DELTA / 2 #endif +/* + * The grace period for lazy (tick based) SMR. + * + * Hardclock is responsible for advancing ticks on a single CPU while every + * CPU receives a regular clock interrupt. The clock interrupts flush the + * store buffers and any speculative loads that may violate our invariants. + * Because these interrupts are not synchronized we must wait one additional + * tick in the future to be certain that all processors have had their state + * synchronized by an interrupt. + * + * This assumes that the clock interrupt will only be delayed by other causes + * that will flush the store buffer or prevent access to the section protected + * data. For example, an idle processor, a system management interrupt, + * or a vm exit. + * + * We must wait one additional tick if we are around the wrap condition + * because the write seq will move forward by two with one interrupt. + */ +#define SMR_LAZY_GRACE 2 +#define SMR_LAZY_GRACE_MAX (SMR_LAZY_GRACE + 1) + +/* + * The maximum sequence number ahead of wr_seq that may still be valid. The + * sequence may not be advanced on write for lazy or deferred SMRs. In this + * case poll needs to attempt to forward the sequence number if the goal is + * within wr_seq + SMR_SEQ_ADVANCE. + */ +#define SMR_SEQ_ADVANCE MAX(SMR_SEQ_INCR, SMR_LAZY_GRACE_MAX) + static SYSCTL_NODE(_debug, OID_AUTO, smr, CTLFLAG_RW, NULL, "SMR Stats"); static counter_u64_t advance = EARLY_COUNTER; -SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, advance, CTLFLAG_RD, &advance, ""); +SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, advance, CTLFLAG_RW, &advance, ""); static counter_u64_t advance_wait = EARLY_COUNTER; -SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, advance_wait, CTLFLAG_RD, &advance_wait, ""); +SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, advance_wait, CTLFLAG_RW, &advance_wait, ""); static counter_u64_t poll = EARLY_COUNTER; -SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, poll, CTLFLAG_RD, &poll, ""); +SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, poll, CTLFLAG_RW, &poll, ""); static counter_u64_t poll_scan = EARLY_COUNTER; -SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, poll_scan, CTLFLAG_RD, &poll_scan, ""); - +SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, poll_scan, CTLFLAG_RW, &poll_scan, ""); +static counter_u64_t poll_fail = EARLY_COUNTER; +SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, poll_fail, CTLFLAG_RW, &poll_fail, ""); /* - * Advance the write sequence and return the new value for use as the - * wait goal. This guarantees that any changes made by the calling - * thread prior to this call will be visible to all threads after - * rd_seq meets or exceeds the return value. + * Advance a lazy write sequence number. These move forward at the rate of + * ticks. Grace is two ticks in the future. Lazy write sequence numbers can + * be even but not SMR_SEQ_INVALID so we pause time for a tick when we wrap. * - * This function may busy loop if the readers are roughly 1 billion - * sequence numbers behind the writers. + * This returns the _current_ write sequence number. The lazy goal sequence + * number is SMR_LAZY_GRACE ticks ahead. */
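The grace and wrap handling above depends entirely on the modular compare macros: sequence numbers are only meaningful relative to one another. A minimal userspace sketch, reusing the typedefs and macros from the sys/sys/smr.h hunk further down (the test values are arbitrary and only illustrate the wrap):

	#include <assert.h>
	#include <stdint.h>

	typedef uint32_t smr_seq_t;
	typedef int32_t smr_delta_t;

	#define SMR_SEQ_LT(a, b)	((smr_delta_t)((a)-(b)) < 0)
	#define SMR_SEQ_GEQ(a, b)	((smr_delta_t)((a)-(b)) >= 0)

	int
	main(void)
	{
		smr_seq_t wr_seq, goal;

		/* A goal computed just before the counter wraps is still "ahead". */
		wr_seq = 0xfffffffe;
		goal = wr_seq + 3;		/* wraps around to 1 */
		assert(SMR_SEQ_GEQ(goal, wr_seq));
		assert(SMR_SEQ_LT(wr_seq, goal));
		return (0);
	}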
-smr_seq_t -smr_advance(smr_t smr) +static smr_seq_t +smr_lazy_advance(smr_t smr, smr_shared_t s) { - smr_shared_t s; - smr_seq_t goal, s_rd_seq; + smr_seq_t s_rd_seq, s_wr_seq, goal; + int t; + + CRITICAL_ASSERT(curthread); /* - * It is illegal to enter while in an smr section. + * We must not allow a zero tick value. We go back in time one tick + * and advance the grace period forward one tick around zero. */ - SMR_ASSERT_NOT_ENTERED(smr); + t = ticks; + if (t == SMR_SEQ_INVALID) + t--; /* - * Modifications not done in a smr section need to be visible - * before advancing the seq. + * The most probable condition is that the update already took place. */ - atomic_thread_fence_rel(); + s_wr_seq = atomic_load_int(&s->s_wr_seq); + if (__predict_true(t == s_wr_seq)) + goto out; /* - * Load the current read seq before incrementing the goal so - * we are guaranteed it is always < goal. + * After long idle periods the read sequence may fall too far + * behind write. Prevent poll from ever seeing this condition + * by updating the stale rd_seq. This assumes that there can + * be no valid section 2bn ticks old. The rd_seq update must + * be visible before wr_seq to avoid races with other advance + * callers. */ - s = zpcpu_get(smr)->c_shared; - s_rd_seq = atomic_load_acq_int(&s->s_rd_seq); + s_rd_seq = atomic_load_int(&s->s_rd_seq); + if (SMR_SEQ_GT(s_rd_seq, t)) + atomic_cmpset_rel_int(&s->s_rd_seq, s_rd_seq, t); + atomic_cmpset_int(&s->s_wr_seq, s_wr_seq, t); + counter_u64_add(advance, 1); + /* If we lost either update race another thread did it. */ + s_wr_seq = t; +out: + goal = s_wr_seq + SMR_LAZY_GRACE; + /* Skip over the SMR_SEQ_INVALID tick. */ + if (goal < SMR_LAZY_GRACE) + goal++; + return (goal); +} + +/* + * Increment the shared write sequence by 2. Since it is initialized + * to 1 this means the only valid values are odd and an observed value + * of 0 in a particular CPU means it is not currently in a read section. + */ +static smr_seq_t +smr_shared_advance(smr_shared_t s) +{ + + return (atomic_fetchadd_int(&s->s_wr_seq, SMR_SEQ_INCR) + SMR_SEQ_INCR); +} + +/* + * Advance the write sequence number for a normal smr section. If the + * write sequence gets too far ahead of the read sequence we have to poll + * to advance rd_seq and prevent undetectable wraps. + */ +static smr_seq_t +smr_default_advance(smr_t smr, smr_shared_t s) +{ + smr_seq_t goal, s_rd_seq; + + CRITICAL_ASSERT(curthread); + KASSERT((zpcpu_get(smr)->c_flags & SMR_LAZY) == 0, + ("smr_default_advance: called with lazy smr.")); /* - * Increment the shared write sequence by 2. Since it is - * initialized to 1 this means the only valid values are - * odd and an observed value of 0 in a particular CPU means - * it is not currently in a read section. + * Load the current read seq before incrementing the goal so + * we are guaranteed it is always < goal. */ - goal = atomic_fetchadd_int(&s->s_wr_seq, SMR_SEQ_INCR) + SMR_SEQ_INCR; - counter_u64_add(advance, 1); + s_rd_seq = atomic_load_acq_int(&s->s_rd_seq); + goal = smr_shared_advance(s); /* * Force a synchronization here if the goal is getting too @@ -226,30 +301,172 @@ counter_u64_add(advance_wait, 1); smr_wait(smr, goal - SMR_SEQ_MAX_ADVANCE); } + counter_u64_add(advance, 1); return (goal); } +/* + * Deferred SMRs conditionally update s_wr_seq based on a + * CPU-local interval count. + */
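A caller chooses among the three advance strategies when creating the context; smr_create()'s limit and flags arguments are introduced by this change, while the context names and the limit of 100 below are purely illustrative:

	smr_t frequent_smr, batched_smr, lazy_smr;

	/* Default: every smr_advance() atomically bumps the shared wr_seq. */
	frequent_smr = smr_create("frequent", 0, 0);

	/* Deferred: only every 100th advance dirties the shared cacheline. */
	batched_smr = smr_create("batched", 100, SMR_DEFERRED);

	/*
	 * Lazy: wr_seq follows ticks, readers use smr_lazy_enter()/exit(),
	 * and a free may not become safe for a couple of clock ticks.
	 */
	lazy_smr = smr_create("lazy", 0, SMR_LAZY);

For the deferred and lazy cases the returned goal can sit ahead of the published wr_seq, which is why smr_poll() below has to be able to forward the sequence itself.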
+static smr_seq_t +smr_deferred_advance(smr_t smr, smr_shared_t s, smr_t self) +{ + + if (++self->c_deferred < self->c_limit) + return (smr_shared_current(s) + SMR_SEQ_INCR); + self->c_deferred = 0; + return (smr_default_advance(smr, s)); +} + +/* + * Advance the write sequence and return the value for use as the + * wait goal. This guarantees that any changes made by the calling + * thread prior to this call will be visible to all threads after + * rd_seq meets or exceeds the return value. + * + * This function may busy loop if the readers are roughly 1 billion + * sequence numbers behind the writers. + * + * Lazy SMRs will not busy loop and the wrap happens every 49.6 days + * at 1 kHz and 119 hours at 10 kHz. Readers can block for no longer + * than half of this for SMR_SEQ_ macros to continue working. + */ smr_seq_t -smr_advance_deferred(smr_t smr, int limit) +smr_advance(smr_t smr) { + smr_t self; + smr_shared_t s; smr_seq_t goal; - smr_t csmr; + int flags; + /* + * It is illegal to enter while in an smr section. + */ SMR_ASSERT_NOT_ENTERED(smr); + /* + * Modifications not done in a smr section need to be visible + * before advancing the seq. + */ + atomic_thread_fence_rel(); + critical_enter(); - csmr = zpcpu_get(smr); - if (++csmr->c_deferred >= limit) { - goal = SMR_SEQ_INVALID; - csmr->c_deferred = 0; - } else - goal = smr_shared_current(csmr->c_shared) + SMR_SEQ_INCR; + /* Try to touch the line once. */ + self = zpcpu_get(smr); + s = self->c_shared; + flags = self->c_flags; + goal = SMR_SEQ_INVALID; + if ((flags & (SMR_LAZY | SMR_DEFERRED)) == 0) + goal = smr_default_advance(smr, s); + else if ((flags & SMR_LAZY) != 0) + goal = smr_lazy_advance(smr, s); + else if ((flags & SMR_DEFERRED) != 0) + goal = smr_deferred_advance(smr, s, self); critical_exit(); - if (goal != SMR_SEQ_INVALID) - return (goal); - return (smr_advance(smr)); + return (goal); +} + +/* + * Poll to determine the currently observed sequence number on a cpu + * and spinwait if the 'wait' argument is true. + */ +static smr_seq_t +smr_poll_cpu(smr_t c, smr_seq_t s_rd_seq, smr_seq_t goal, bool wait) +{ + smr_seq_t c_seq; + + c_seq = SMR_SEQ_INVALID; + for (;;) { + c_seq = atomic_load_int(&c->c_seq); + if (c_seq == SMR_SEQ_INVALID) + break; + + /* + * There is a race described in smr.h:smr_enter that + * can lead to a stale seq value but not stale data + * access. If we find a value out of range here we + * pin it to the current min to prevent it from + * advancing until that stale section has expired. + * + * The race is created when a cpu loads the s_wr_seq + * value in a local register and then another thread + * advances s_wr_seq and calls smr_poll() which will + * observe no value yet in c_seq and advance s_rd_seq + * up to s_wr_seq which is beyond the register + * cached value. This is only likely to happen on + * a hypervisor or with a system management interrupt. + */ + if (SMR_SEQ_LT(c_seq, s_rd_seq)) + c_seq = s_rd_seq; + + /* + * If the sequence number meets the goal we are done + * with this cpu. + */ + if (SMR_SEQ_LEQ(goal, c_seq)) + break; + + if (!wait) + break; + cpu_spinwait(); + } + + return (c_seq); +} + +/* + * Loop until all cores have observed the goal sequence or have + * gone inactive. Returns the oldest sequence currently active. + * + * This function assumes a snapshot of sequence values has + * been obtained and validated by smr_poll().
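The smr_advance()/smr_poll() pair above supports the usual deferred-free pattern: tag each retired object with the goal returned by smr_advance() and free it once a non-blocking smr_poll() reports that the goal has been reached. A sketch (struct foo, foo_smr, foo_limbo, and M_FOO are hypothetical, and locking for the limbo list is omitted):

	struct foo {
		LIST_ENTRY(foo)	f_link;
		smr_seq_t	f_goal;
	};

	static LIST_HEAD(, foo) foo_limbo = LIST_HEAD_INITIALIZER(foo_limbo);
	static smr_t foo_smr;

	/* Called after 'f' has been unlinked from the visible structure. */
	static void
	foo_retire(struct foo *f)
	{

		f->f_goal = smr_advance(foo_smr);
		LIST_INSERT_HEAD(&foo_limbo, f, f_link);
	}

	/* Free whatever all readers have moved past; never blocks. */
	static void
	foo_reclaim(void)
	{
		struct foo *f, *tmp;

		LIST_FOREACH_SAFE(f, &foo_limbo, f_link, tmp) {
			if (!smr_poll(foo_smr, f->f_goal, false))
				continue;
			LIST_REMOVE(f, f_link);
			free(f, M_FOO);
		}
	}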
+ */ +static smr_seq_t +smr_poll_scan(smr_t smr, smr_shared_t s, smr_seq_t s_rd_seq, + smr_seq_t s_wr_seq, smr_seq_t goal, bool wait) +{ + smr_seq_t rd_seq, c_seq; + int i; + + CRITICAL_ASSERT(curthread); + counter_u64_add_protected(poll_scan, 1); + + /* + * The read sequence can be no larger than the write sequence at + * the start of the poll. + */ + rd_seq = s_wr_seq; + CPU_FOREACH(i) { + /* + * Query the active sequence on this cpu. If we're not + * waiting and we don't meet the goal we will still scan + * the rest of the cpus to update s_rd_seq before returning + * failure. + */ + c_seq = smr_poll_cpu(zpcpu_get_cpu(smr, i), s_rd_seq, goal, + wait); + + /* + * Limit the minimum observed rd_seq whether we met the goal + * or not. + */ + if (c_seq != SMR_SEQ_INVALID) + rd_seq = SMR_SEQ_MIN(rd_seq, c_seq); + } + + /* + * Advance the rd_seq as long as we observed a more recent value. + */ + s_rd_seq = atomic_load_int(&s->s_rd_seq); + if (SMR_SEQ_GEQ(rd_seq, s_rd_seq)) { + atomic_cmpset_int(&s->s_rd_seq, s_rd_seq, rd_seq); + s_rd_seq = rd_seq; + } + + return (s_rd_seq); } /* @@ -268,9 +485,10 @@ smr_poll(smr_t smr, smr_seq_t goal, bool wait) { smr_shared_t s; - smr_t c; - smr_seq_t s_wr_seq, s_rd_seq, rd_seq, c_seq; - int i; + smr_t self; + smr_seq_t s_wr_seq, s_rd_seq; + smr_delta_t delta; + int flags; bool success; /* @@ -278,6 +496,8 @@ */ KASSERT(!wait || !SMR_ENTERED(smr), ("smr_poll: Blocking not allowed in a SMR section.")); + KASSERT(!wait || (zpcpu_get(smr)->c_flags & SMR_LAZY) == 0, + ("smr_poll: Blocking not allowed on lazy smrs.")); /* * Use a critical section so that we can avoid ABA races @@ -285,9 +505,19 @@ */ success = true; critical_enter(); - s = zpcpu_get(smr)->c_shared; + /* Attempt to load from self only once. */ + self = zpcpu_get(smr); + s = self->c_shared; + flags = self->c_flags; counter_u64_add_protected(poll, 1); + /* + * Conditionally advance the lazy write clock on any writer + * activity. This may reset s_rd_seq. + */ + if ((flags & SMR_LAZY) != 0) + smr_lazy_advance(smr, s); + /* * Acquire barrier loads s_wr_seq after s_rd_seq so that we can not * observe an updated read sequence that is larger than write. @@ -295,106 +525,59 @@ s_rd_seq = atomic_load_acq_int(&s->s_rd_seq); /* - * wr_seq must be loaded prior to any c_seq value so that a stale - * c_seq can only reference time after this wr_seq. + * If we have already observed the sequence number we can immediately + * return success. Most polls should meet this criterion. */ - s_wr_seq = atomic_load_acq_int(&s->s_wr_seq); + if (SMR_SEQ_LEQ(goal, s_rd_seq)) + goto out; /* - * This may have come from a deferred advance. Consider one - * increment past the current wr_seq valid and make sure we - * have advanced far enough to succeed. We simply add to avoid - * an additional fence. + * wr_seq must be loaded prior to any c_seq value so that a + * stale c_seq can only reference time after this wr_seq. */ - if (goal == s_wr_seq + SMR_SEQ_INCR) { - atomic_add_int(&s->s_wr_seq, SMR_SEQ_INCR); - s_wr_seq = goal; - } + s_wr_seq = atomic_load_acq_int(&s->s_wr_seq); /* - * Detect whether the goal is valid and has already been observed. - * - * The goal must be in the range of s_wr_seq >= goal >= s_rd_seq for - * it to be valid. If it is not then the caller held on to it and - * the integer wrapped. If we wrapped back within range the caller - * will harmlessly scan. - * - * A valid goal must be greater than s_rd_seq or we have not verified - * that it has been observed and must fall through to polling. 
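Stripped of the stale-seq pinning and of the cmpset that publishes the result, smr_poll_scan() above boils down to taking the minimum sequence over all CPUs that are currently in a section. A standalone model with arbitrary values:

	#include <assert.h>
	#include <stdint.h>

	typedef uint32_t smr_seq_t;
	typedef int32_t smr_delta_t;

	#define SMR_SEQ_INVALID		0
	#define SMR_SEQ_LT(a, b)	((smr_delta_t)((a)-(b)) < 0)
	#define SMR_SEQ_MIN(a, b)	(SMR_SEQ_LT(a, b) ? a : b)

	static smr_seq_t
	scan(const smr_seq_t *c_seq, int ncpu, smr_seq_t s_wr_seq)
	{
		smr_seq_t rd_seq;
		int i;

		rd_seq = s_wr_seq;	/* rd_seq can be no larger than wr_seq. */
		for (i = 0; i < ncpu; i++)
			if (c_seq[i] != SMR_SEQ_INVALID)
				rd_seq = SMR_SEQ_MIN(rd_seq, c_seq[i]);
		return (rd_seq);
	}

	int
	main(void)
	{
		/* CPUs outside of a section (0) do not hold rd_seq back. */
		smr_seq_t seqs[4] = { SMR_SEQ_INVALID, 9, SMR_SEQ_INVALID, 7 };

		assert(scan(seqs, 4, 11) == 7);
		return (0);
	}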
+ * This is the distance from s_wr_seq to goal. Positive values + * are in the future. */ - if (SMR_SEQ_GEQ(s_rd_seq, goal) || SMR_SEQ_LT(s_wr_seq, goal)) - goto out; + delta = SMR_SEQ_DELTA(goal, s_wr_seq); /* - * Loop until all cores have observed the goal sequence or have - * gone inactive. Keep track of the oldest sequence currently - * active as rd_seq. + * Detect a stale wr_seq. + * + * This goal may have come from a deferred advance or a lazy + * smr. If we are not blocking we can not succeed but the + * sequence number is valid. */ - counter_u64_add_protected(poll_scan, 1); - rd_seq = s_wr_seq; - CPU_FOREACH(i) { - c = zpcpu_get_cpu(smr, i); - c_seq = SMR_SEQ_INVALID; - for (;;) { - c_seq = atomic_load_int(&c->c_seq); - if (c_seq == SMR_SEQ_INVALID) - break; - - /* - * There is a race described in smr.h:smr_enter that - * can lead to a stale seq value but not stale data - * access. If we find a value out of range here we - * pin it to the current min to prevent it from - * advancing until that stale section has expired. - * - * The race is created when a cpu loads the s_wr_seq - * value in a local register and then another thread - * advances s_wr_seq and calls smr_poll() which will - * oberve no value yet in c_seq and advance s_rd_seq - * up to s_wr_seq which is beyond the register - * cached value. This is only likely to happen on - * hypervisor or with a system management interrupt. - */ - if (SMR_SEQ_LT(c_seq, s_rd_seq)) - c_seq = s_rd_seq; - - /* - * If the sequence number meets the goal we are - * done with this cpu. - */ - if (SMR_SEQ_GEQ(c_seq, goal)) - break; - - /* - * If we're not waiting we will still scan the rest - * of the cpus and update s_rd_seq before returning - * an error. - */ - if (!wait) { - success = false; - break; - } - cpu_spinwait(); + if (delta > 0 && delta <= SMR_SEQ_MAX_ADVANCE && + (flags & (SMR_LAZY | SMR_DEFERRED)) != 0) { + if (!wait) { + success = false; + goto out; } - - /* - * Limit the minimum observed rd_seq whether we met the goal - * or not. - */ - if (c_seq != SMR_SEQ_INVALID && SMR_SEQ_GT(rd_seq, c_seq)) - rd_seq = c_seq; + /* LAZY is always !wait. */ + s_wr_seq = smr_shared_advance(s); + delta = 0; } /* - * Advance the rd_seq as long as we observed the most recent one. + * Detect an invalid goal. + * + * The goal must be in the range of s_wr_seq >= goal >= s_rd_seq for + * it to be valid. If it is not then the caller held on to it and + * the integer wrapped. If we wrapped back within range the caller + * will harmlessly scan. */ - s_rd_seq = atomic_load_int(&s->s_rd_seq); - do { - if (SMR_SEQ_LEQ(rd_seq, s_rd_seq)) - goto out; - } while (atomic_fcmpset_int(&s->s_rd_seq, &s_rd_seq, rd_seq) == 0); + if (delta > 0) + goto out; + /* Determine the lowest visible sequence number. */ + s_rd_seq = smr_poll_scan(smr, s, s_rd_seq, s_wr_seq, goal, wait); + success = SMR_SEQ_LEQ(goal, s_rd_seq); out: + if (!success) + counter_u64_add_protected(poll_fail, 1); critical_exit(); /* @@ -407,7 +590,7 @@ } smr_t -smr_create(const char *name) +smr_create(const char *name, int limit, int flags) { smr_t smr, c; smr_shared_t s; @@ -417,13 +600,19 @@ smr = uma_zalloc_pcpu(smr_zone, M_WAITOK); s->s_name = name; - s->s_rd_seq = s->s_wr_seq = SMR_SEQ_INIT; + if ((flags & SMR_LAZY) == 0) + s->s_rd_seq = s->s_wr_seq = SMR_SEQ_INIT; + else + s->s_rd_seq = s->s_wr_seq = ticks; /* Initialize all CPUS, not just those running. 
*/ for (i = 0; i <= mp_maxid; i++) { c = zpcpu_get_cpu(smr, i); c->c_seq = SMR_SEQ_INVALID; c->c_shared = s; + c->c_deferred = 0; + c->c_limit = limit; + c->c_flags = flags; } atomic_thread_fence_seq_cst(); @@ -460,5 +649,6 @@ advance_wait = counter_u64_alloc(M_WAITOK); poll = counter_u64_alloc(M_WAITOK); poll_scan = counter_u64_alloc(M_WAITOK); + poll_fail = counter_u64_alloc(M_WAITOK); } SYSINIT(smr_counters, SI_SUB_CPU, SI_ORDER_ANY, smr_init_counters, NULL); Index: sys/sys/_smr.h =================================================================== --- sys/sys/_smr.h +++ sys/sys/_smr.h @@ -32,6 +32,7 @@ #define _SYS__SMR_H_ typedef uint32_t smr_seq_t; +typedef int32_t smr_delta_t; typedef struct smr *smr_t; #endif /* __SYS_SMR_H_ */ Index: sys/sys/smr.h =================================================================== --- sys/sys/smr.h +++ sys/sys/smr.h @@ -45,11 +45,13 @@ * Modular arithmetic for comparing sequence numbers that have * potentially wrapped. Copied from tcp_seq.h. */ -#define SMR_SEQ_LT(a, b) ((int32_t)((a)-(b)) < 0) -#define SMR_SEQ_LEQ(a, b) ((int32_t)((a)-(b)) <= 0) -#define SMR_SEQ_GT(a, b) ((int32_t)((a)-(b)) > 0) -#define SMR_SEQ_GEQ(a, b) ((int32_t)((a)-(b)) >= 0) -#define SMR_SEQ_DELTA(a, b) ((int32_t)((a)-(b))) +#define SMR_SEQ_LT(a, b) ((smr_delta_t)((a)-(b)) < 0) +#define SMR_SEQ_LEQ(a, b) ((smr_delta_t)((a)-(b)) <= 0) +#define SMR_SEQ_GT(a, b) ((smr_delta_t)((a)-(b)) > 0) +#define SMR_SEQ_GEQ(a, b) ((smr_delta_t)((a)-(b)) >= 0) +#define SMR_SEQ_DELTA(a, b) ((smr_delta_t)((a)-(b))) +#define SMR_SEQ_MIN(a, b) (SMR_SEQ_LT(a, b) ? a : b) +#define SMR_SEQ_MAX(a, b) (SMR_SEQ_GT(a, b) ? a : b) #define SMR_SEQ_INVALID 0 @@ -66,8 +68,13 @@ smr_seq_t c_seq; /* Current observed sequence. */ smr_shared_t c_shared; /* Shared SMR state. */ int c_deferred; /* Deferred advance counter. */ + int c_limit; /* Deferred advance limit. */ + int c_flags; /* SMR Configuration */ }; +#define SMR_LAZY 0x0001 /* Higher latency write, fast read. */ +#define SMR_DEFERRED 0x0002 /* Aggregate updates to wr_seq. */ + #define SMR_ENTERED(smr) \ (curthread->td_critnest != 0 && zpcpu_get((smr))->c_seq != SMR_SEQ_INVALID) @@ -170,7 +177,8 @@ } while (0) /* - * Return the current write sequence number. + * Return the current write sequence number. This is not the same as the + * current goal which may be in the future. */ static inline smr_seq_t smr_shared_current(smr_shared_t s) @@ -195,6 +203,8 @@ critical_enter(); smr = zpcpu_get(smr); + KASSERT((smr->c_flags & SMR_LAZY) == 0, + ("smr_enter(%s) lazy smr.", smr->c_shared->s_name)); KASSERT(smr->c_seq == 0, ("smr_enter(%s) does not support recursion.", smr->c_shared->s_name)); @@ -228,6 +238,8 @@ smr = zpcpu_get(smr); CRITICAL_ASSERT(curthread); + KASSERT((smr->c_flags & SMR_LAZY) == 0, + ("smr_exit(%s) lazy smr.", smr->c_shared->s_name)); KASSERT(smr->c_seq != SMR_SEQ_INVALID, ("smr_exit(%s) not in a smr section.", smr->c_shared->s_name)); @@ -243,17 +255,61 @@ } /* - * Advances the write sequence number. Returns the sequence number - * required to ensure that all modifications are visible to readers. + * Enter a lazy smr section. This is used for read-mostly state that + * can tolerate a high free latency. 
*/ -smr_seq_t smr_advance(smr_t smr); +static inline void +smr_lazy_enter(smr_t smr) +{ + + critical_enter(); + smr = zpcpu_get(smr); + KASSERT((smr->c_flags & SMR_LAZY) != 0, + ("smr_lazy_enter(%s) non-lazy smr.", smr->c_shared->s_name)); + KASSERT(smr->c_seq == 0, + ("smr_lazy_enter(%s) does not support recursion.", + smr->c_shared->s_name)); + + /* + * This needs no serialization. If an interrupt occurs before we + * assign wr_seq to c_seq any speculative loads will be discarded. + * If we assign a stale wr_seq value due to interrupt we use the + * same algorithm that renders smr_enter() safe. + */ + smr->c_seq = smr_shared_current(smr->c_shared); +} /* - * Advances the write sequence number only after N calls. Returns - * the correct goal for a wr_seq that has not yet occurred. Used to - * minimize shared cacheline invalidations for frequent writers. + * Exit a lazy smr section. This is used for read-mostly state that + * can tolerate a high free latency. */ -smr_seq_t smr_advance_deferred(smr_t smr, int limit); +static inline void +smr_lazy_exit(smr_t smr) +{ + + smr = zpcpu_get(smr); + CRITICAL_ASSERT(curthread); + KASSERT((smr->c_flags & SMR_LAZY) != 0, + ("smr_lazy_exit(%s) non-lazy smr.", smr->c_shared->s_name)); + KASSERT(smr->c_seq != SMR_SEQ_INVALID, + ("smr_lazy_exit(%s) not in a smr section.", smr->c_shared->s_name)); + + /* + * All loads/stores must be retired before the sequence becomes + * visible. The fence compiles away on amd64. An + * alternative would be to omit the fence but store the exit + * time and wait 1 tick longer. + */ + atomic_thread_fence_rel(); + smr->c_seq = SMR_SEQ_INVALID; + critical_exit(); +} + +/* + * Advances the write sequence number. Returns the sequence number + * required to ensure that all modifications are visible to readers. + */ +smr_seq_t smr_advance(smr_t smr); /* * Returns true if a goal sequence has been reached. If @@ -262,7 +318,9 @@ bool smr_poll(smr_t smr, smr_seq_t goal, bool wait); /* Create a new SMR context. */ -smr_t smr_create(const char *name); +smr_t smr_create(const char *name, int limit, int flags); + +/* Destroy the context.
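Read-side usage of the two flavors, and the non-blocking wait a writer must use on a lazy context, side by side. This is only a sketch: foo_smr is assumed to be a default SMR, lazy_smr an SMR_LAZY one, and foo_ptr, struct foo, and foo_use() are hypothetical; the pointer is assumed to be published with atomic_store_rel_ptr().

	static smr_t foo_smr, lazy_smr;
	static uintptr_t foo_ptr;	/* published with atomic_store_rel_ptr() */

	static void
	foo_read(void)
	{
		struct foo *p;

		/* Standard section: entry orders the c_seq store. */
		smr_enter(foo_smr);
		p = (struct foo *)atomic_load_acq_ptr(&foo_ptr);
		if (p != NULL)
			foo_use(p);
		smr_exit(foo_smr);
	}

	static void
	foo_read_lazy(void)
	{
		struct foo *p;

		/* Lazy section: no barrier on entry; hardclock provides ordering. */
		smr_lazy_enter(lazy_smr);
		p = (struct foo *)atomic_load_acq_ptr(&foo_ptr);
		if (p != NULL)
			foo_use(p);
		smr_lazy_exit(lazy_smr);
	}

	/* Writers may not block in smr_poll() on a lazy SMR; retry instead. */
	static void
	foo_wait_lazy(smr_seq_t goal)
	{

		while (!smr_poll(lazy_smr, goal, false))
			pause("foosmr", 1);
	}

A real consumer would pick a single flavor per data structure; the two read paths are shown together only for contrast.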
*/ void smr_destroy(smr_t smr); /* Index: sys/tools/umaperf/umaperf.c =================================================================== --- sys/tools/umaperf/umaperf.c +++ sys/tools/umaperf/umaperf.c @@ -230,6 +230,9 @@ case DEFER_SMR: smr_enter(umaperf_smr); break; + case LAZY_SMR: + smr_lazy_enter(umaperf_smr); + break; case EPOCH: epoch_enter(umaperf_epoch); break; @@ -256,6 +259,9 @@ case DEFER_SMR: smr_exit(umaperf_smr); break; + case LAZY_SMR: + smr_lazy_exit(umaperf_smr); + break; case EPOCH: epoch_exit(umaperf_epoch); break; @@ -618,7 +624,7 @@ switch (umaperf_type) { #ifdef __FreeBSD__ case PLAIN: - flags = UMA_ZONE_ROUNDROBIN; + flags = UMA_ZONE_FIRSTTOUCH; break; case SMR: flags = UMA_ZONE_ROUNDROBIN | UMA_ZONE_SMR; @@ -652,7 +658,8 @@ umaperf_zone = uma_zcreate("umaperf", umaperf_zone_size, NULL, NULL, NULL, NULL, UMA_ALIGN_CACHE, flags); #ifdef __FreeBSD__ - umaperf_smr = uma_zone_get_smr(umaperf_zone); + if (umaperf_smr != 0) + uma_zone_set_smr(umaperf_zone, umaperf_smr); #endif umaperf_init_cpus(); } Index: sys/vm/uma_core.c =================================================================== --- sys/vm/uma_core.c +++ sys/vm/uma_core.c @@ -1168,7 +1168,6 @@ * Returns: * Nothing */ - static void bucket_drain(uma_zone_t zone, uma_bucket_t bucket) { @@ -1228,7 +1227,7 @@ */ seq = SMR_SEQ_INVALID; if ((zone->uz_flags & UMA_ZONE_SMR) != 0) - seq = smr_current(zone->uz_smr); + seq = smr_advance(zone->uz_smr); CPU_FOREACH(cpu) { cache = &zone->uz_cpu[cpu]; bucket = cache_bucket_unload_alloc(cache); @@ -2707,7 +2706,7 @@ /* Caller requests a private SMR context. */ if ((zone->uz_flags & UMA_ZONE_SMR) != 0) - zone->uz_smr = smr_create(zone->uz_name); + zone->uz_smr = smr_create(zone->uz_name, 0, 0); KASSERT((arg->flags & (UMA_ZONE_MAXBUCKET | UMA_ZONE_NOBUCKET)) != (UMA_ZONE_MAXBUCKET | UMA_ZONE_NOBUCKET), @@ -4180,22 +4179,21 @@ "uma_zfree: zone %s(%p) draining cross bucket %p", zone->uz_name, zone, bucket); - STAILQ_INIT(&fullbuckets); + /* + * It is possible for buckets to arrive here out of order so we fetch + * the current smr seq rather than accepting the bucket's. + */ + seq = SMR_SEQ_INVALID; + if ((zone->uz_flags & UMA_ZONE_SMR) != 0) + seq = smr_advance(zone->uz_smr); /* * To avoid having ndomain * ndomain buckets for sorting we have a * lock on the current crossfree bucket. A full matrix with * per-domain locking could be used if necessary. */ + STAILQ_INIT(&fullbuckets); ZONE_CROSS_LOCK(zone); - - /* - * It is possible for buckets to arrive here out of order so we fetch - * the current smr seq rather than accepting the bucket's. - */ - seq = SMR_SEQ_INVALID; - if ((zone->uz_flags & UMA_ZONE_SMR) != 0) - seq = smr_current(zone->uz_smr); while (bucket->ub_cnt > 0) { item = bucket->ub_bucket[bucket->ub_cnt - 1]; domain = _vm_phys_domain(pmap_kextract((vm_offset_t)item)); Index: tools/uma/smrstress/smrstress.c =================================================================== --- tools/uma/smrstress/smrstress.c +++ tools/uma/smrstress/smrstress.c @@ -64,12 +64,14 @@ static void smrs_error(struct smrs *smrs, const char *fmt, ...) 
{ + smr_t self; va_list ap; + self = zpcpu_get(smrs_smr); atomic_add_int(&smrs_failures, 1); printf("SMR ERROR: wr_seq %d, rd_seq %d, c_seq %d, generation %d, count %d ", - smrs_smr->c_shared->s_wr_seq, smrs_smr->c_shared->s_rd_seq, - zpcpu_get(smrs_smr)->c_seq, smrs->generation, smrs->count); + smr_current(smrs_smr), self->c_shared->s_rd_seq, self->c_seq, + smrs->generation, smrs->count); va_start(ap, fmt); (void)vprintf(fmt, ap); va_end(ap); @@ -83,7 +85,7 @@ /* Wait for the writer to exit. */ while (smrs_completed == 0) { - smr_enter(smrs_smr); + smr_lazy_enter(smrs_smr); cur = (void *)atomic_load_acq_ptr(&smrs_current); if (cur->generation == -1) smrs_error(cur, "read early: Use after free!\n"); @@ -94,7 +96,7 @@ smrs_error(cur, "read late: Use after free!\n"); else if (cnt <= 0) smrs_error(cur, "Invalid ref\n"); - smr_exit(smrs_smr); + smr_lazy_exit(smrs_smr); maybe_yield(); } } @@ -190,8 +192,9 @@ smrs_zone = uma_zcreate("smrs", sizeof(struct smrs), smrs_ctor, smrs_dtor, NULL, NULL, UMA_ALIGN_PTR, - UMA_ZONE_SMR | UMA_ZONE_ZINIT); - smrs_smr = uma_zone_get_smr(smrs_zone); + UMA_ZONE_ZINIT); + smrs_smr = smr_create("smrs", 0, SMR_LAZY); + uma_zone_set_smr(smrs_zone, smrs_smr); } static void
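The smrstress change above switches to the new ownership model: the test creates its own lazy SMR and attaches it to the zone rather than asking the zone for one. The same pattern for a hypothetical consumer (struct widget and the widget_* names are illustrative):

	static uma_zone_t widget_zone;
	static smr_t widget_smr;

	static void
	widget_init(void)
	{

		widget_zone = uma_zcreate("widget", sizeof(struct widget),
		    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
		widget_smr = smr_create("widget", 0, SMR_LAZY);
		uma_zone_set_smr(widget_zone, widget_smr);
	}

Readers then bracket their accesses with smr_lazy_enter()/smr_lazy_exit() exactly as the smrstress reader loop above does.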