Index: sys/kern/subr_smr.c
===================================================================
--- sys/kern/subr_smr.c
+++ sys/kern/subr_smr.c
@@ -174,6 +174,25 @@
 static counter_u64_t poll_scan = EARLY_COUNTER;
 SYSCTL_COUNTER_U64(_debug_smr, OID_AUTO, poll_scan, CTLFLAG_RD,
     &poll_scan, "");
 
+/*
+ * Advance a lazy write sequence number.  These move forward at the rate of
+ * ticks.  Grace is two ticks in the future.  Lazy write sequence numbers can
+ * be odd but not SMR_SEQ_INVALID, so we pause time for a tick when we wrap.
+ */
+static smr_seq_t
+smr_lazy_advance(smr_shared_t s)
+{
+        smr_seq_t wr_seq;
+        int t;
+
+        t = ticks;
+        wr_seq = atomic_load_int(&s->s_wr_seq);
+        if (t != SMR_SEQ_INVALID && SMR_SEQ_GT(t, wr_seq) &&
+            atomic_cmpset_int(&s->s_wr_seq, wr_seq, t))
+                wr_seq = t;
+
+        return (wr_seq + SMR_LAZY_GRACE);
+}
+
 /*
  * Advance the write sequence and return the new value for use as the
@@ -188,26 +207,34 @@
 smr_advance(smr_t smr)
 {
         smr_shared_t s;
+        smr_t self;
         smr_seq_t goal, s_rd_seq;
 
         /*
          * It is illegal to enter while in an smr section.
          */
         SMR_ASSERT_NOT_ENTERED(smr);
 
+        self = zpcpu_get(smr);
+        s = self->c_shared;
+
         /*
-         * Modifications not done in a smr section need to be visible
-         * before advancing the seq.
+         * Lazy SMRs simply return a goal one grace period ahead.
          */
-        atomic_thread_fence_rel();
+        if ((self->c_flags & SMR_LAZY) != 0)
+                return (smr_lazy_advance(s));
 
         /*
          * Load the current read seq before incrementing the goal so
          * we are guaranteed it is always < goal.
          */
-        s = zpcpu_get(smr)->c_shared;
         s_rd_seq = atomic_load_acq_int(&s->s_rd_seq);
 
+        /*
+         * Modifications not done in a smr section need to be visible
+         * before advancing the seq.
+         */
+        atomic_thread_fence_rel();
+
         /*
          * Increment the shared write sequence by 2.  Since it is
          * initialized to 1 this means the only valid values are
@@ -234,17 +261,17 @@
 smr_advance_deferred(smr_t smr, int limit)
 {
         smr_seq_t goal;
-        smr_t csmr;
+        smr_t self;
 
         SMR_ASSERT_NOT_ENTERED(smr);
 
         critical_enter();
-        csmr = zpcpu_get(smr);
-        if (++csmr->c_deferred >= limit) {
-                goal = SMR_SEQ_INVALID;
-                csmr->c_deferred = 0;
-        } else
-                goal = smr_shared_current(csmr->c_shared) + SMR_SEQ_INCR;
+        self = zpcpu_get(smr);
+        goal = SMR_SEQ_INVALID;
+        if (++self->c_deferred >= limit)
+                self->c_deferred = 0;
+        else if ((self->c_flags & SMR_LAZY) == 0)
+                goal = smr_shared_current(self->c_shared) + SMR_SEQ_INCR;
         critical_exit();
         if (goal != SMR_SEQ_INVALID)
                 return (goal);
@@ -268,7 +295,7 @@
 smr_poll(smr_t smr, smr_seq_t goal, bool wait)
 {
         smr_shared_t s;
-        smr_t c;
+        smr_t c, self;
         smr_seq_t s_wr_seq, s_rd_seq, rd_seq, c_seq;
         int i;
         bool success;
@@ -278,6 +305,8 @@
          */
         KASSERT(!wait || !SMR_ENTERED(smr),
             ("smr_poll: Blocking not allowed in a SMR section."));
+        KASSERT(!wait || (zpcpu_get(smr)->c_flags & SMR_LAZY) == 0,
+            ("smr_poll: Blocking not allowed on lazy smrs."));
 
         /*
          * Use a critical section so that we can avoid ABA races
@@ -285,7 +314,8 @@
          */
         success = true;
         critical_enter();
-        s = zpcpu_get(smr)->c_shared;
+        self = zpcpu_get(smr);
+        s = self->c_shared;
         counter_u64_add_protected(poll, 1);
 
         /*
@@ -295,20 +325,35 @@
         s_rd_seq = atomic_load_acq_int(&s->s_rd_seq);
 
         /*
-         * wr_seq must be loaded prior to any c_seq value so that a stale
-         * c_seq can only reference time after this wr_seq.
+         * wr_seq must be loaded prior to any c_seq value so that a
+         * stale c_seq can only reference time after this wr_seq.
          */
         s_wr_seq = atomic_load_acq_int(&s->s_wr_seq);
 
-        /*
-         * This may have come from a deferred advance.  Consider one
-         * increment past the current wr_seq valid and make sure we
-         * have advanced far enough to succeed.  We simply add to avoid
-         * an additional fence.
-         */
-        if (goal == s_wr_seq + SMR_SEQ_INCR) {
-                atomic_add_int(&s->s_wr_seq, SMR_SEQ_INCR);
-                s_wr_seq = goal;
+        if ((self->c_flags & SMR_LAZY) == 0) {
+                /*
+                 * This may have come from a deferred advance.  Consider one
+                 * increment past the current wr_seq valid and make sure we
+                 * have advanced far enough to succeed.  We simply add to avoid
+                 * an additional fence.
+                 */
+                if (SMR_SEQ_DELTA(goal, s_wr_seq) == SMR_SEQ_INCR) {
+                        atomic_add_int(&s->s_wr_seq, SMR_SEQ_INCR);
+                        s_wr_seq = goal;
+                }
+        } else {
+                /*
+                 * If this goal is in the future we can't succeed.  We
+                 * assume that read sections are frequent enough that the
+                 * idle test will not be fruitful.
+                 */
+                if (SMR_SEQ_LT(s_wr_seq, ticks))
+                        s_wr_seq = smr_lazy_advance(s) - SMR_LAZY_GRACE;
+                if (SMR_SEQ_GT(goal, s_wr_seq) &&
+                    SMR_SEQ_DELTA(goal, s_wr_seq) <= SMR_LAZY_GRACE) {
+                        success = false;
+                        goto out;
+                }
         }
 
         /*
@@ -407,7 +452,7 @@
 }
 
 smr_t
-smr_create(const char *name)
+smr_create(const char *name, int flags)
 {
         smr_t smr, c;
         smr_shared_t s;
@@ -417,13 +462,18 @@
         smr = uma_zalloc_pcpu(smr_zone, M_WAITOK);
 
         s->s_name = name;
-        s->s_rd_seq = s->s_wr_seq = SMR_SEQ_INIT;
+        if ((flags & SMR_LAZY) == 0)
+                s->s_rd_seq = s->s_wr_seq = SMR_SEQ_INIT;
+        else
+                s->s_rd_seq = s->s_wr_seq = ticks;
 
         /* Initialize all CPUS, not just those running. */
         for (i = 0; i <= mp_maxid; i++) {
                 c = zpcpu_get_cpu(smr, i);
                 c->c_seq = SMR_SEQ_INVALID;
                 c->c_shared = s;
+                c->c_deferred = 0;
+                c->c_flags = flags;
         }
         atomic_thread_fence_seq_cst();
Index: sys/sys/smr.h
===================================================================
--- sys/sys/smr.h
+++ sys/sys/smr.h
@@ -66,8 +66,13 @@
         smr_seq_t       c_seq;          /* Current observed sequence. */
         smr_shared_t    c_shared;       /* Shared SMR state. */
         int             c_deferred;     /* Deferred advance counter. */
+        int             c_flags;        /* SMR configuration. */
 };
 
+#define SMR_LAZY        0x0001          /* Higher latency write, fast read. */
+
+#define SMR_LAZY_GRACE  2               /* Grace period for lazy SMR. */
+
 #define SMR_ENTERED(smr)                                                \
     (curthread->td_critnest != 0 && zpcpu_get((smr))->c_seq != SMR_SEQ_INVALID)
@@ -170,7 +175,8 @@
 } while (0)
 
 /*
- * Return the current write sequence number.
+ * Return the current write sequence number.  This is not the same as the
+ * current goal, which may be in the future.
  */
 static inline smr_seq_t
 smr_shared_current(smr_shared_t s)
@@ -195,6 +201,8 @@
 
         critical_enter();
         smr = zpcpu_get(smr);
+        KASSERT((smr->c_flags & SMR_LAZY) == 0,
+            ("smr_enter(%s) lazy smr.", smr->c_shared->s_name));
         KASSERT(smr->c_seq == 0,
             ("smr_enter(%s) does not support recursion.",
             smr->c_shared->s_name));
@@ -228,6 +236,8 @@
 
         smr = zpcpu_get(smr);
         CRITICAL_ASSERT(curthread);
+        KASSERT((smr->c_flags & SMR_LAZY) == 0,
+            ("smr_exit(%s) lazy smr.", smr->c_shared->s_name));
         KASSERT(smr->c_seq != SMR_SEQ_INVALID,
             ("smr_exit(%s) not in a smr section.", smr->c_shared->s_name));
@@ -242,6 +252,57 @@
         critical_exit();
 }
 
+/*
+ * Enter a lazy smr section.  This is used for read-mostly state that
+ * can tolerate a high free latency.
+ */
+static inline void
+smr_lazy_enter(smr_t smr)
+{
+
+        critical_enter();
+        smr = zpcpu_get(smr);
+        KASSERT((smr->c_flags & SMR_LAZY) != 0,
+            ("smr_lazy_enter(%s) non-lazy smr.", smr->c_shared->s_name));
+        KASSERT(smr->c_seq == 0,
+            ("smr_lazy_enter(%s) does not support recursion.",
+            smr->c_shared->s_name));
+
+        /*
+         * This needs no serialization.  If an interrupt occurs before we
+         * assign wr_seq to c_seq any speculative loads will be discarded.
+         * If we assign a stale wr_seq value due to interrupt we use the
+         * same algorithm that renders smr_enter() safe.
+         */
+        smr->c_seq = smr_shared_current(smr->c_shared);
+}
+
+/*
+ * Exit a lazy smr section.  This is used for read-mostly state that
+ * can tolerate a high free latency.
+ */
+static inline void
+smr_lazy_exit(smr_t smr)
+{
+
+        smr = zpcpu_get(smr);
+        CRITICAL_ASSERT(curthread);
+        KASSERT((smr->c_flags & SMR_LAZY) != 0,
+            ("smr_lazy_exit(%s) non-lazy smr.", smr->c_shared->s_name));
+        KASSERT(smr->c_seq != SMR_SEQ_INVALID,
+            ("smr_lazy_exit(%s) not in a smr section.", smr->c_shared->s_name));
+
+        /*
+         * All loads/stores must be retired before the sequence becomes
+         * visible.  The fence compiles away on amd64.  Another
+         * alternative would be to omit the fence but store the exit
+         * time and wait 1 tick longer.
+         */
+        atomic_thread_fence_rel();
+        smr->c_seq = SMR_SEQ_INVALID;
+        critical_exit();
+}
+
 /*
  * Advances the write sequence number.  Returns the sequence number
  * required to ensure that all modifications are visible to readers.
@@ -262,7 +323,9 @@
 bool smr_poll(smr_t smr, smr_seq_t goal, bool wait);
 
 /* Create a new SMR context. */
-smr_t smr_create(const char *name);
+smr_t smr_create(const char *name, int flags);
+
+/* Destroy the context. */
 void smr_destroy(smr_t smr);
 
 /*
Index: sys/tools/umaperf/umaperf.c
===================================================================
--- sys/tools/umaperf/umaperf.c
+++ sys/tools/umaperf/umaperf.c
@@ -164,6 +164,7 @@
         PLAIN,
 #ifdef __FreeBSD__
         SMR,
+        LAZY_SMR,
         EPOCH,
         EPOCH_PRE,
 #else
@@ -176,6 +177,7 @@
         [PLAIN] = "PLAIN",
 #ifdef __FreeBSD__
         [SMR] = "SMR",
+        [LAZY_SMR] = "SMR_LAZY",
         [EPOCH] = "EPOCH",
         [EPOCH_PRE] = "EPOCH_PREEMPT"
 #else
@@ -224,6 +226,9 @@
         case SMR:
                 smr_enter(umaperf_smr);
                 break;
+        case LAZY_SMR:
+                smr_lazy_enter(umaperf_smr);
+                break;
         case EPOCH:
                 epoch_enter(umaperf_epoch);
                 break;
@@ -249,6 +254,9 @@
         case SMR:
                 smr_exit(umaperf_smr);
                 break;
+        case LAZY_SMR:
+                smr_lazy_exit(umaperf_smr);
+                break;
         case EPOCH:
                 epoch_exit(umaperf_epoch);
                 break;
@@ -280,6 +288,7 @@
                 break;
 #ifdef __FreeBSD__
         case SMR:
+        case LAZY_SMR:
                 uma_zfree_smr(umaperf_zone, p);
                 break;
         case EPOCH:
@@ -301,6 +310,7 @@
         switch (umaperf_type) {
 #ifdef __FreeBSD__
         case SMR:
+        case LAZY_SMR:
                 return uma_zalloc_smr(umaperf_zone, M_WAITOK);
         case EPOCH:
         case EPOCH_PRE:
@@ -606,10 +616,15 @@
         switch (umaperf_type) {
 #ifdef __FreeBSD__
         case PLAIN:
-                flags = UMA_ZONE_ROUNDROBIN;
+                flags = UMA_ZONE_FIRSTTOUCH;
                 break;
         case SMR:
-                flags = UMA_ZONE_ROUNDROBIN | UMA_ZONE_SMR;
+                umaperf_smr = smr_create("umaperf", 0);
+                flags = UMA_ZONE_FIRSTTOUCH;
+                break;
+        case LAZY_SMR:
+                umaperf_smr = smr_create("umaperf", SMR_LAZY);
+                flags = UMA_ZONE_FIRSTTOUCH;
                 break;
         case EPOCH:
                 umaperf_epoch = epoch_alloc("umaperf", 0);
@@ -629,7 +644,8 @@
         umaperf_zone = uma_zcreate("umaperf", umaperf_zone_size,
             NULL, NULL, NULL, NULL, UMA_ALIGN_CACHE, flags);
 #ifdef __FreeBSD__
-        umaperf_smr = uma_zone_get_smr(umaperf_zone);
+        if (umaperf_smr != NULL)
+                uma_zone_set_smr(umaperf_zone, umaperf_smr);
 #endif
         umaperf_init_cpus();
 }
Index: sys/vm/uma_core.c
===================================================================
--- sys/vm/uma_core.c
+++ sys/vm/uma_core.c
@@ -2707,7 +2707,7 @@
 
         /* Caller requests a private SMR context. */
         if ((zone->uz_flags & UMA_ZONE_SMR) != 0)
-                zone->uz_smr = smr_create(zone->uz_name);
+                zone->uz_smr = smr_create(zone->uz_name, 0);
 
         KASSERT((arg->flags & (UMA_ZONE_MAXBUCKET | UMA_ZONE_NOBUCKET)) !=
             (UMA_ZONE_MAXBUCKET | UMA_ZONE_NOBUCKET),
Index: tools/uma/smrstress/smrstress.c
===================================================================
--- tools/uma/smrstress/smrstress.c
+++ tools/uma/smrstress/smrstress.c
@@ -64,12 +64,14 @@
 static void
 smrs_error(struct smrs *smrs, const char *fmt, ...)
 {
+        smr_t self;
         va_list ap;
 
+        self = zpcpu_get(smrs_smr);
         atomic_add_int(&smrs_failures, 1);
         printf("SMR ERROR: wr_seq %d, rd_seq %d, c_seq %d, generation %d, count %d ",
-            smrs_smr->c_shared->s_wr_seq, smrs_smr->c_shared->s_rd_seq,
-            zpcpu_get(smrs_smr)->c_seq, smrs->generation, smrs->count);
+            smr_current(smrs_smr), self->c_shared->s_rd_seq, self->c_seq,
+            smrs->generation, smrs->count);
         va_start(ap, fmt);
         (void)vprintf(fmt, ap);
         va_end(ap);
@@ -83,7 +85,7 @@
 
         /* Wait for the writer to exit. */
         while (smrs_completed == 0) {
-                smr_enter(smrs_smr);
+                smr_lazy_enter(smrs_smr);
                 cur = (void *)atomic_load_acq_ptr(&smrs_current);
                 if (cur->generation == -1)
                         smrs_error(cur, "read early: Use after free!\n");
@@ -94,7 +96,7 @@
                         smrs_error(cur, "read late: Use after free!\n");
                 else if (cnt <= 0)
                         smrs_error(cur, "Invalid ref\n");
-                smr_exit(smrs_smr);
+                smr_lazy_exit(smrs_smr);
                 maybe_yield();
         }
 }
@@ -190,8 +192,9 @@
 
         smrs_zone = uma_zcreate("smrs", sizeof(struct smrs),
             smrs_ctor, smrs_dtor, NULL, NULL, UMA_ALIGN_PTR,
-            UMA_ZONE_SMR | UMA_ZONE_ZINIT);
-        smrs_smr = uma_zone_get_smr(smrs_zone);
+            UMA_ZONE_ZINIT);
+        smrs_smr = smr_create("smrs", SMR_LAZY);
+        uma_zone_set_smr(smrs_zone, smrs_smr);
 }
 
 static void
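
Usage sketch: the fragment below shows how a consumer might wire the SMR_LAZY API above
into a UMA zone, following the same pattern as the smrstress.c hunks.  The struct foo type,
the foo_* names, and the single published pointer are hypothetical illustrations; only
smr_create(..., SMR_LAZY), smr_lazy_enter()/smr_lazy_exit(), uma_zone_set_smr(), and
uma_zalloc_smr()/uma_zfree_smr() come from this patch.

/*
 * Hypothetical consumer of the lazy SMR API: a single published record
 * that is read frequently and replaced rarely.  Readers pay only a
 * critical section and a per-CPU sequence store; freed records are
 * recycled by UMA once the two-tick grace period has expired.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/smr.h>

#include <vm/uma.h>

struct foo {
        int     f_key;
        int     f_value;
};

static uma_zone_t foo_zone;
static smr_t foo_smr;
static uintptr_t foo_current;           /* Points at the live struct foo. */

static void
foo_init(void)
{

        foo_zone = uma_zcreate("foo", sizeof(struct foo),
            NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
        /* Attach a private lazy context rather than using UMA_ZONE_SMR. */
        foo_smr = smr_create("foo", SMR_LAZY);
        uma_zone_set_smr(foo_zone, foo_smr);
}

/* Reader: no fences or atomic read-modify-writes on this path. */
static int
foo_lookup(int key, int *valuep)
{
        struct foo *p;
        int found;

        smr_lazy_enter(foo_smr);
        p = (struct foo *)atomic_load_acq_ptr(&foo_current);
        found = (p != NULL && p->f_key == key);
        if (found)
                *valuep = p->f_value;
        smr_lazy_exit(foo_smr);

        return (found);
}

/* Writer: publish a replacement and defer the free to the zone's SMR. */
static void
foo_replace(int key, int value)
{
        struct foo *new, *old;

        new = uma_zalloc_smr(foo_zone, M_WAITOK);
        new->f_key = key;
        new->f_value = value;
        old = (struct foo *)foo_current;
        atomic_store_rel_ptr(&foo_current, (uintptr_t)new);
        if (old != NULL)
                uma_zfree_smr(foo_zone, old);
}

Readers stay on the cheap smr_lazy_enter()/smr_lazy_exit() path, while frees ride the
zone's SMR state and are held back until the lazy grace period (SMR_LAZY_GRACE ticks)
has passed, trading free latency for read-side cost.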