sys/kern/subr_smr.c
- This file was added.
/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2019 Jeffrey Roberson <jeff@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/limits.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/smp.h>
#include <sys/smr.h>

#include <vm/uma.h>

/*
 * This is a novel safe memory reclamation technique inspired by
 * epoch based reclamation from Samy Al Bahra's concurrency kit which
 * in turn was based on work described in:
 *	Fraser, K. 2004. Practical Lock-Freedom. PhD Thesis, University
 *	of Cambridge Computing Laboratory.
 *
 * This is not an implementation of hazard pointers or related
 * techniques.  The term safe memory reclamation is used as a
 * generic descriptor for algorithms that defer frees to avoid
 * use-after-free errors with lockless datastructures.
 *
 * The basic approach is to maintain a monotonic write sequence
 * number that is updated on some application defined granularity.
 * Readers record the most recent write sequence number they have
 * observed.  A global read sequence number records the lowest
 * sequence number observed by any reader.  Any write older than
markj: You might note that the global rd_seq is lazily updated, so it is really just a lower bound.
rlibby: "a the"
 * this value has been observed by all readers and memory can be
 * reclaimed.  Like epoch we also detect idle readers by storing
 * an invalid sequence number in the per-cpu state when the read
 * section is exited.
 *
 * The write sequence and read sequence number can be thought of as
 * a two handed clock.  SMR maintains the invariant that all readers
 * can only see state that existed between these two values.  Periodically
 * the read sequence or hand is advanced towards the write sequence
jeff: The language describing the invariant is not very good.
 * and memory is reclaimed.  When the system is idle the two hands
 * meet.  Readers never advance the sequence number and the global
 * read sequence number is never higher than the write sequence.
 * A stored sequence number that falls outside of this range has
 * expired and needs no scan to reclaim.
 *
 * A notable distinction between this SMR and epoch, qsbr, rcu, etc. is
 * that advancing the sequence number is decoupled from detecting its
rlibby: "Readers never advance [either] sequence number"
 * observation.  This results in a more granular assignment of sequence
 * numbers even as read latencies prohibit all or some expiration.
 * It also allows writers to advance the sequence number and save the
 * poll for expiration until a later time when it is likely to
 * complete without waiting.  The batch granularity and free-to-use
 * latency is dynamic and can be significantly smaller than in more
 * strict systems.
 *
 * This mechanism is primarily intended to be used in coordination with
 * UMA.  By integrating with the allocator we avoid all of the callout
 * queue machinery and are provided with an efficient way to batch
 * sequence advancement and waiting.  The allocator accumulates a full
 * per-cpu cache of memory before advancing the sequence.  It then
 * delays waiting for this sequence to expire until the memory is
 * selected for reuse.  In this way we only increment the sequence
 * value once for n=cache-size frees and the waits are done long
 * after the sequence has been expired so they need only be verified
 * to account for pathological conditions and to advance the read
 * sequence.  Tying the sequence number to the bucket size has the
 * nice property that as the zone gets busier the buckets get larger
 * and the sequence writes become fewer.  If the coherency of advancing
 * the write sequence number becomes too costly we can advance
 * it for every N buckets in exchange for higher free-to-use
 * latency and consequently higher memory consumption.
 *
 * If the read overhead of accessing the global cacheline becomes
 * especially burdensome an invariant TSC could be used in place of the
 * sequence.  The algorithm would then only need to maintain the minimum
 * observed tsc.  This would trade potential cache synchronization
 * overhead for local serialization and cpu timestamp overhead.
 */
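
/*
 * Illustrative sketch, not part of the change under review: one way a
 * consumer might pair a lockless reader with a deferred free using the
 * interfaces declared in sys/smr.h (smr_enter, smr_exit, smr_advance,
 * smr_poll).  The 'struct foo' type and foo_* names are hypothetical,
 * free(9)/M_TEMP would need sys/malloc.h, and a real consumer would
 * publish foo_head with atomic loads and stores.
 */
#if 0
struct foo {
	int	f_value;
};

static struct foo *foo_head;	/* Published to lockless readers. */
static smr_t foo_smr;		/* Created with smr_create(). */

static int
foo_read(void)
{
	struct foo *f;
	int value;

	value = -1;
	smr_enter(foo_smr);		/* Open a read section. */
	f = foo_head;
	if (f != NULL)
		value = f->f_value;
	smr_exit(foo_smr);		/* Mark this cpu idle again. */

	return (value);
}

static void
foo_retire(struct foo *f)
{
	smr_seq_t goal;

	foo_head = NULL;		/* Unlink; no new readers see f. */
	goal = smr_advance(foo_smr);	/* Tag the free with a new wr_seq. */
	(void)smr_poll(foo_smr, goal, true); /* Wait for readers to drain. */
	free(f, M_TEMP);		/* Now safe to reclaim. */
}
#endif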

/*
 * A simplified diagram:
 *
 * 0                                                          UINT_MAX
 * | -------------------- sequence number space -------------------- |
 *              ^ rd seq                            ^ wr seq
 *              | ----- valid sequence numbers ---- |
 *                ^cpuA                     ^cpuC
 * | -- free -- | --------- deferred frees -------- | ---- free ---- |
 *
 *
 * In this example cpuA has the lowest sequence number and poll can
 * advance rd seq.  cpuB is not running and is considered to observe
 * wr seq.
 *
 * Freed memory that is tagged with a sequence number between rd seq and
 * wr seq can not be safely reclaimed because cpuA may hold a reference to
 * it.  Any other memory is guaranteed to be unreferenced.
 *
 * Any writer is free to advance wr seq at any time.
markj: ... but may need to spin until the difference wr_seq - rd_seq is smaller than SMR_SEQ_MAX_DELTA.
 */
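
/*
 * The SMR_SEQ_LT/LEQ/GT/GEQ comparisons used throughout this file come
 * from sys/smr.h.  A sketch of the presumed idea: because the sequence
 * space wraps around UINT_MAX, values are compared by their signed
 * difference, so any two sequence numbers within INT_MAX of each other
 * order correctly regardless of wrap.  Roughly:
 *
 *	#define SMR_SEQ_LT(a, b)	((int)((a) - (b)) < 0)
 *	#define SMR_SEQ_GEQ(a, b)	((int)((a) - (b)) >= 0)
 *
 * Keeping wr_seq - rd_seq below SMR_SEQ_MAX_DELTA preserves this
 * property.
 */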

static uma_zone_t smr_g_zone;
static uma_zone_t smr_zone;

#ifndef INVARIANTS
#define	SMR_SEQ_INIT	1		/* All valid sequence numbers are odd. */
#define	SMR_SEQ_INCR	2

/*
 * The maximum distance between rd_seq and wr_seq.  For the modular
 * arithmetic to work a value of UINT_MAX / 2 would be possible but
 * it is checked after increment so this gives us UINT_MAX / 4 threads
 * executing before we could actually overflow.
 */
#define	SMR_SEQ_MAX_DELTA	(UINT_MAX / 4)
#else
/* We want to test the wrapping feature in invariants kernels. */
#define	SMR_SEQ_INCR	(UINT_MAX / 10000)
#define	SMR_SEQ_INIT	(UINT_MAX - 100000)
/* Force extra polls to test the integer overflow detection. */
#define	SMR_SEQ_MAX_DELTA	(1000)
#endif
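
/*
 * A worked example of the INVARIANTS values above: the write sequence
 * starts 100000 below UINT_MAX and each advance adds UINT_MAX / 10000
 * (about 429496), so the very first smr_advance() already wraps past
 * zero and every INVARIANTS kernel exercises the modular comparison
 * paths immediately.
 */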

/*
 * Advance the write sequence and return the new value for use as the
 * wait goal.  This guarantees that any changes made by the calling
 * thread prior to this call will be visible to all threads after
 * rd_seq meets or exceeds the return value.
 */
smr_seq_t
smr_advance(smr_t smr)
{
	smr_g_t g;
	smr_seq_t goal;

	/*
	 * Modifications not done in a smr section need to be visible
	 * before advancing the seq.
	 */
	atomic_thread_fence_rel();

	/*
	 * Increment the global write sequence by 2.  Since it is
	 * initialized to 1 this means the only valid values are
	 * odd and an observed value of 0 in a particular CPU means
	 * it is not currently in a read section.
	 */
	g = smr->c_global;
	goal = atomic_fetchadd_int(&g->g_wr_seq, SMR_SEQ_INCR) + SMR_SEQ_INCR;
rlibby: Why not just atomic_add?

	/*
	 * Force a synchronization here if the goal is getting too
	 * far ahead of the read sequence number.  This keeps the
	 * wrap detecting arithmetic working in pathological cases.
	 */
	if (goal - atomic_load_int(&g->g_rd_seq) >= SMR_SEQ_MAX_DELTA)
		smr_wait(smr, goal);
rlibby: Is it legal to smr_advance() inside an SMR section? I think no, and we should KASSERT. Otherwise, there's a potential deadlock here if a CPU is in an SMR section and tries to advance too far, where it would attempt to wait on itself. Separately, can you state the rationale for waiting until goal (completely caught up)? A couple other options might be goal - SMR_SEQ_MAX_DELTA + 1 (minimally caught up), or goal - SMR_SEQ_MAX_DELTA/2 (somewhere in between). I think a minimum wait may be better; after all, it wouldn't restrict how much we actually advance the read hand.
markj: Would it be worth adding a counter for instances of this scenario?
jeff: Definitely counters & sysctl are on my todo. I have been using dtrace and hwpmc to verify.

	return (goal);
}
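
/*
 * Illustrative sketch (hypothetical names): the goal returned by
 * smr_advance() is typically recorded next to a batch of deferred
 * frees and only polled much later, decoupling the advance from the
 * wait as described in the comment at the top of this file.
 */
#if 0
struct foo_batch {
	smr_seq_t	fb_seq;		/* Goal recorded at free time. */
	void		*fb_items[64];	/* The deferred items. */
};

static void
foo_batch_retire(smr_t smr, struct foo_batch *fb)
{

	/* One sequence advance covers the entire batch. */
	fb->fb_seq = smr_advance(smr);
}

static bool
foo_batch_reusable(smr_t smr, struct foo_batch *fb)
{

	/* Checked lazily; this rarely spins because time has passed. */
	return (smr_poll(smr, fb->fb_seq, false));
}
#endif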

/*
 * Poll to determine whether all readers have observed the 'goal' write
 * sequence number.
 *
 * If wait is true this will spin until the goal is met.
 *
 * This routine will update the minimum observed read sequence number in
 * g_rd_seq if it does a scan.  It may not do a scan if another call has
 * advanced g_rd_seq beyond the caller's goal already.
 *
 * Returns true if the goal is met and false if not.
 */
bool
smr_poll(smr_t smr, smr_seq_t goal, bool wait)
{
	smr_g_t g;
	smr_t c;
	smr_seq_t g_wr_seq, g_rd_seq, rd_seq, c_seq;
	int i;
	bool success;

	/*
	 * Use a critical section so that we can avoid ABA races
	 * caused by long preemption sleeps.
	 */
	success = true;
	critical_enter();
	g = smr->c_global;

	/*
	 * Acquire barrier loads g_wr_seq after g_rd_seq so that we can not
	 * observe an updated read sequence that is larger than write.
	 */
	g_rd_seq = atomic_load_acq_int(&g->g_rd_seq);
	g_wr_seq = atomic_load_int(&g->g_wr_seq);

	/*
	 * Detect whether the goal has already been observed.
	 *
	 * The goal must be in the range of g_wr_seq >= goal >= g_rd_seq for
	 * it to be valid.  If it is not then the caller held on to it and
	 * the integer wrapped.  If we wrapped back within range the caller
	 * will harmlessly scan.
	 */
	if (SMR_SEQ_GEQ(g_rd_seq, goal) || SMR_SEQ_LT(g_wr_seq, goal))
rlibby: SMR_SEQ_GEQ(g_rd_seq, goal) (vs GT) reads like an off-by-one according to the comment, but I think it's actually just that you are combining the separate case for s_rd_seq == goal, which is in range (and therefore not skippable), but means that all readers have observed it. I think for clarity it would be good to either call this out or have a separate comment and test for it.
		goto out;

	/*
	 * Loop until all cores have observed the goal sequence or have
	 * gone inactive.  Keep track of the oldest sequence currently
	 * active as rd_seq.
	 */
	success = true;
rlibby: Redundant, success is already true.
	rd_seq = g_wr_seq;
	CPU_FOREACH(i) {
		c = zpcpu_get_cpu(smr, i);
		c_seq = 0;
markj: This is a dead store.
jeff: I just wanted to be sure the compiler wouldn't confuse it for an uninitialized value. It is accessed after the for loop.
rlibby: Spell it c_seq = SMR_SEQ_INVALID?
		for (;;) {
			c_seq = atomic_load_int(&c->c_seq);
			if (c_seq == SMR_SEQ_INVALID)
				break;

			/*
			 * There is a race described in smr.h:smr_enter that
			 * can lead to a stale seq value but not stale data
			 * access.  If we find a value out of range here we
			 * pin it to the current min to prevent it from
			 * advancing until that stale section has expired.
			 *
			 * The race is created when a cpu loads the g_wr_seq
			 * value in a local register and then another thread
			 * advances g_wr_seq and calls smr_poll() which will
			 * observe no value yet in c_seq and advance g_rd_seq
			 * up to g_wr_seq which is beyond the register
			 * cached value.  This is only likely to happen on
			 * a hypervisor or with a system management interrupt.
			 */
			if (SMR_SEQ_LT(c_seq, g_rd_seq))
				c_seq = g_rd_seq;
rlibby: I need to think about this some more... I think this may still be a problem. Isn't it possible that the same race results in us observing SMR_SEQ_INVALID here instead of the stale value? I guess this is done this way to avoid a loop in smr_enter, but would a __predict_false loop be so ...
jeff: I still believe this is safe. The race would be that the cpu had loaded a value that is now stale and has not stored it yet. Imagine a hypervisor thread was preempted for a long time. However, in this case, we are guaranteed that no user data has been loaded yet. Which means that the local copy of g_wr_seq which was fetched before c_seq is guaranteed to be less than or equal to the sequence number of the actual data accesses that follow. Since we don't allow rd_seq to advance beyond the wr_seq that we observed at the start of the function we can not advance unsafely. I had looked at a loop but it has other implications and I think as long as I can prove it's not strictly necessary it will just have to be a quirk with a lot of comments. It does bring to mind that I have not thought extensively about what happens if the thread running poll() is preempted and looking at very stale values. That is perhaps more concerning. I need to re-evaluate with this in mind and possibly reload the global values after spinwaits.
rlibby: Never mind about SMR_SEQ_INVALID, I see why that doesn't matter. I think this is okay, I get that we'd rather handle this case in smr_poll() than by looping in smr_enter().

			/*
			 * If the sequence number meets the goal we are
			 * done with this cpu.
			 */
			if (SMR_SEQ_GEQ(c_seq, goal))
				break;

			/*
			 * If we're not waiting we will still scan the rest
			 * of the cpus and update g_rd_seq before returning
			 * an error.
			 */
			if (!wait) {
				success = false;
				break;
			}
			cpu_spinwait();
		}

		/*
		 * Limit the minimum observed rd_seq whether we met the goal
		 * or not.
		 */
		if (c_seq != SMR_SEQ_INVALID && SMR_SEQ_GT(rd_seq, c_seq))
			rd_seq = c_seq;
	}
rlibby: This would be easier to read with a SMR_SEQ_MIN(x, y) macro. Ditto above.

	/*
	 * Advance the rd_seq as long as we observed the most recent one.
	 */
	g_rd_seq = atomic_load_int(&g->g_rd_seq);
	do {
		if (SMR_SEQ_LEQ(rd_seq, g_rd_seq))
			break;
	} while (atomic_fcmpset_int(&g->g_rd_seq, &g_rd_seq, rd_seq) == 0);
out:
	critical_exit();

	return (success);
}
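
/*
 * The blocking wrappers used elsewhere in this file presumably reduce
 * to smr_poll() along these lines (they live in sys/smr.h):
 *
 *	smr_wait(smr, goal)	is	(void)smr_poll(smr, goal, true)
 *	smr_synchronize(smr)	is	smr_wait(smr, smr_advance(smr))
 *
 * That is, smr_synchronize() advances the write hand and then spins
 * until every cpu has observed the new value or gone idle.
 */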

smr_t
smr_create(const char *name)
{
	smr_t smr, c;
	smr_g_t g;
	int i;

	g = uma_zalloc(smr_g_zone, M_WAITOK);
	smr = uma_zalloc(smr_zone, M_WAITOK);
	g->g_name = name;
	g->g_rd_seq = g->g_wr_seq = SMR_SEQ_INIT;

	/* Initialize all CPUs, not just those running. */
	for (i = 0; i <= mp_maxid; i++) {
		c = zpcpu_get_cpu(smr, i);
		c->c_seq = SMR_SEQ_INVALID;
		c->c_global = g;
	}
	atomic_thread_fence_seq_cst();

	return (smr);
}
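
/*
 * Continuing the hypothetical foo consumer sketched earlier in this
 * file: creation and teardown of its smr instance.
 */
#if 0
static void
foo_init(void)
{

	foo_smr = smr_create("foo");
}

static void
foo_fini(void)
{

	/* smr_destroy() synchronizes before freeing the per-cpu state. */
	smr_destroy(foo_smr);
}
#endif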

void
smr_destroy(smr_t smr)
{

	smr_synchronize(smr);
	uma_zfree(smr_g_zone, smr->c_global);
	uma_zfree(smr_zone, smr);
}

/*
 * Initialize the UMA slab zone.
 */
void
smr_init(void)
{

	smr_g_zone = uma_zcreate("SMR GLOBAL", sizeof(struct smr_g),
	    NULL, NULL, NULL, NULL, (CACHE_LINE_SIZE * 2) - 1, 0);
	smr_zone = uma_zcreate("SMR CPU", sizeof(struct smr),
	    NULL, NULL, NULL, NULL, (CACHE_LINE_SIZE * 2) - 1, UMA_ZONE_PCPU);
}