Changeset View
Standalone View
sys/kern/kern_sharedpage.c
Show All 35 Lines | |||||
#include "opt_vm.h" | #include "opt_vm.h" | ||||
#include <sys/param.h> | #include <sys/param.h> | ||||
#include <sys/systm.h> | #include <sys/systm.h> | ||||
#include <sys/kernel.h> | #include <sys/kernel.h> | ||||
#include <sys/lock.h> | #include <sys/lock.h> | ||||
#include <sys/malloc.h> | #include <sys/malloc.h> | ||||
#include <sys/rwlock.h> | #include <sys/rwlock.h> | ||||
#include <sys/stddef.h> | |||||
#include <sys/sysent.h> | #include <sys/sysent.h> | ||||
#include <sys/sysctl.h> | #include <sys/sysctl.h> | ||||
#include <sys/vdso.h> | #include <sys/vdso.h> | ||||
#include <vm/vm.h> | #include <vm/vm.h> | ||||
#include <vm/vm_param.h> | #include <vm/vm_param.h> | ||||
#include <vm/pmap.h> | #include <vm/pmap.h> | ||||
#include <vm/vm_extern.h> | #include <vm/vm_extern.h> | ||||
#include <vm/vm_kern.h> | #include <vm/vm_kern.h> | ||||
#include <vm/vm_map.h> | #include <vm/vm_map.h> | ||||
#include <vm/vm_object.h> | #include <vm/vm_object.h> | ||||
#include <vm/vm_page.h> | #include <vm/vm_page.h> | ||||
#include <vm/vm_pager.h> | #include <vm/vm_pager.h> | ||||
static struct sx shared_page_alloc_sx; | static struct sx shared_page_alloc_sx; | ||||
static vm_object_t shared_page_obj; | static vm_object_t shared_page_obj; | ||||
static int shared_page_free; | static int shared_page_free; | ||||
char *shared_page_mapping; | char *shared_page_mapping; | ||||
#ifdef RANDOM_FENESTRASX | |||||
static struct vdso_fxrng_generation *fxrng_shpage_mapping; | |||||
static bool fxrng_enabled = true; | |||||
SYSCTL_BOOL(_debug, OID_AUTO, fxrng_vdso_enable, CTLFLAG_RWTUN, &fxrng_enabled, | |||||
0, "Enable FXRNG VDSO"); | |||||
#endif | |||||
void | void | ||||
shared_page_write(int base, int size, const void *data) | shared_page_write(int base, int size, const void *data) | ||||
{ | { | ||||
bcopy(data, shared_page_mapping + base, size); | bcopy(data, shared_page_mapping + base, size); | ||||
} | } | ||||
static int | static int | ||||
▲ Show 20 Lines • Show All 180 Lines • ▼ Show 20 Lines | alloc_sv_tk_compat32(void) | ||||
shared_page_write(tk_base + offsetof(struct vdso_timekeep32, | shared_page_write(tk_base + offsetof(struct vdso_timekeep32, | ||||
tk_ver), sizeof(uint32_t), &tk_ver); | tk_ver), sizeof(uint32_t), &tk_ver); | ||||
svtk->sv_timekeep_off = tk_base; | svtk->sv_timekeep_off = tk_base; | ||||
timekeep_push_vdso(); | timekeep_push_vdso(); | ||||
return (svtk); | return (svtk); | ||||
} | } | ||||
#endif | #endif | ||||
#ifdef RANDOM_FENESTRASX | |||||
void | void | ||||
fxrng_push_seed_generation(uint64_t gen) | |||||
{ | |||||
if (fxrng_shpage_mapping == NULL || !fxrng_enabled) | |||||
return; | |||||
kib: On ILP32 you need to store low, then high words, separated by the _rel fence. Both to ensure… | |||||
Not Done Inline ActionsThis is nonsense, it cannot work this way. Even assuming sequential consistency, it is trivial to find executions which would allow the reader to see inconsistent high/low pair if write is teared. This raises the question, why do you need the generation to be 64-bit value ? You said that the update frequency is once in a hour. Then 32bit counter should stay unwrapped for 500 billions of years, it seems. kib: This is nonsense, it cannot work this way. Even assuming sequential consistency, it is trivial… | |||||
Done Inline ActionsI don't understand why the rel fence isn't adequate to provide reciprocal fence to the acq fence(s) on the userspace read side. Re: writing individual words, I think to match the existing read pattern, I would store MSW then LSW. But, ILP32 platforms all claim to implement atomic_store_rel_64(); if that can tear, it seeems like a violation of semantics.
I might misunderstand what "atomic_store" means, but how can an atomic store tear?
That's a good question. The whitepaper is very explicit about the value being 64-bit. I don't know if that's just an artifact of Windows being somewhat picky about the architectures it runs on (maybe they've truly abandoned IA-32 now and only run on arm64). The counter is also bumped in a handful of other circumstances but none that essentially change the hour math. I believe you are correct that a 32-bit counter should be sufficient for a long time (I get "only" 456,000 years). cem: I don't understand why the rel fence isn't adequate to provide reciprocal fence to the acq… | |||||
Done Inline ActionsAre you sure about store_rel_64() on all ILP32 ? For instance, on PPC ? Even if they implement it, it would be with some global lock, which is probably fine for C11 semantic of atomic, but not for this use, where subwords of atomic are used, and which is undefined behavior for C11. And if the code uses two atomic stores to implement 64bit write, aba recheck does not help. Yes, I miscalculated the years by using 1<<64 instead of 1<<32 as the number of hours. My bad. kib: Are you sure about store_rel_64() on all ILP32 ? For instance, on PPC ? Even if they implement… | |||||
Done Inline ActionsSo I think we have two options for ILP32 userspace, including e.g. compat32 on amd64:
I think (2) seems adequate, although I'm concerned that maybe I don't understand why the whitepaper chose a 64-bit seed version explicitly. (1) would also not be too difficult. Does either (1) or (2) seem like a reasonable option to you, @kib? Is there a better approach I'm missing? Do you have a preference, and if you prefer something like (1), does the implementation in (A) below seem correct to you? Thanks, (A). E.g., shpage structure: { uint64_t seed; uint32_t gen; } writer (locked/serialized externally): atomic_store_32(&shpage->gen, shpage->gen + 1); atomic_thread_fence_rel(); store_64(&shpage->seed, new_seed_version); atomic_store_rel_32(&shpage->gen, shpage->gen + 1); reader: u32 gen1, gen2; u64 seedversion; do { // odd generation denotes concurrent writer; spin until writer is done. do { gen1 = atomic_load_acq_32(&shpage->gen); } while (gen1 % 2 == 1); seedversion = load_64(&shpage->seed); atomic_thread_fence_rel(); gen2 = atomic_load_32(&shpage->gen); } while (gen1 != gen2); cem: So I think we have two options for ILP32 userspace, including e.g. compat32 on amd64:
1. Wrap… | |||||
Done Inline Actions(2) looks completely adequate to me, and after all, this is exactly what I agitated for before. Custom seqlock you wrote should work as well, but IMO is overkill if plain 32bit seed is enough. kib: (2) looks completely adequate to me, and after all, this is exactly what I agitated for before. | |||||
Done Inline Actions(2) it will be. cem: (2) it will be. | |||||
KASSERT(gen < INT32_MAX, | |||||
("fxrng seed version shouldn't roll over a 32-bit counter " | |||||
"for approximately 456,000 years")); | |||||
atomic_store_rel_32(&fxrng_shpage_mapping->fx_generation32, | |||||
(uint32_t)gen); | |||||
} | |||||
static void | |||||
alloc_sv_fxrng_generation(void) | |||||
{ | |||||
int base; | |||||
/* | |||||
* Allocate a full cache line for the fxrng root generation (64-bit | |||||
* counter, or truncated 32-bit counter on ILP32 userspace). It is | |||||
* important that the line is not shared with frequently dirtied data, | |||||
* and the shared page allocator lacks a __read_mostly mechanism. | |||||
* However, PAGE_SIZE is typically large relative to the amount of | |||||
* stuff we've got in it so far, so maybe the possible waste isn't an | |||||
* issue. | |||||
*/ | |||||
base = shared_page_alloc(CACHE_LINE_SIZE, CACHE_LINE_SIZE); | |||||
KASSERT(base != -1, ("%s: base allocation failed", __func__)); | |||||
fxrng_shpage_mapping = (void *)(shared_page_mapping + base); | |||||
*fxrng_shpage_mapping = (struct vdso_fxrng_generation) { | |||||
.fx_vdso_version = VDSO_FXRNG_VER_CURR, | |||||
}; | |||||
} | |||||
#endif /* RANDOM_FENESTRASX */ | |||||
Done Inline ActionsWhy do you need this persistent 'zero' structure with single use ? You can copy from zero_region and save some memory. kib: Why do you need this persistent 'zero' structure with single use ? You can copy from… | |||||
Done Inline ActionsSure, that would be fine. cem: Sure, that would be fine. | |||||
void | |||||
exec_sysvec_init(void *param) | exec_sysvec_init(void *param) | ||||
{ | { | ||||
struct sysentvec *sv; | struct sysentvec *sv; | ||||
#ifdef RANDOM_FENESTRASX | |||||
ptrdiff_t base; | |||||
#endif | |||||
sv = (struct sysentvec *)param; | sv = (struct sysentvec *)param; | ||||
if ((sv->sv_flags & SV_SHP) == 0) | if ((sv->sv_flags & SV_SHP) == 0) | ||||
return; | return; | ||||
sv->sv_shared_page_obj = shared_page_obj; | sv->sv_shared_page_obj = shared_page_obj; | ||||
sv->sv_sigcode_base = sv->sv_shared_page_base + | sv->sv_sigcode_base = sv->sv_shared_page_base + | ||||
shared_page_fill(*(sv->sv_szsigcode), 16, sv->sv_sigcode); | shared_page_fill(*(sv->sv_szsigcode), 16, sv->sv_sigcode); | ||||
if ((sv->sv_flags & SV_ABI_MASK) != SV_ABI_FREEBSD) | if ((sv->sv_flags & SV_ABI_MASK) != SV_ABI_FREEBSD) | ||||
return; | return; | ||||
Not Done Inline ActionsThis is very strange 'disable'. It only makes newly exec-ed processes to ignore the fxrng_shpage, and if process saw _DISABLED then it is forever (until exec or exit). I really do not see this 'disable' as useful, it provides very non-obvious behavior. Timecounter provides the 'disable' switch as an emergency measure which appeared useful more than once in field debugging and making people machines operational. Main point of it was that all instances of libc reacted to it right away. kib: This is very strange 'disable'. It only makes newly exec-ed processes to ignore the… | |||||
Done Inline ActionsI really don't know what you want in a disable mechanism. I don't see how it can possibly be useful for this feature. Just as a reminder, RANDOM_FENESTRASX will land completely compiled-out of the GENERIC kernel anyway. I've tried to add a disable mechanism because you want it, but now you are complaining that it can't be re-enabled for processes that saw it was disabled. I don't understand the concern on either end. I don't think this needs a disable mechanism, nor do I think it is especially significant how quickly the feature is re-enabled if the user already turned it off once. I've been running this code in kernel and libc on my workstation and test VMs for something like two years now without issue. cem: I really don't know what you want in a disable mechanism. I don't see how it can possibly be… | |||||
Done Inline ActionsOne year* cem: One year* | |||||
Done Inline ActionsNormally disable in vdso causes all currently running FreeBSD-ABI processes to stop using corresponding vdso facility. In reverse, enable causes them to start using it at the moment of enablement. This is not true for your variant of disable, which only acts once for the program, at exec time. I am sure that if somebody sees potential issues with the patch the issue is fixed in advance, the 'disable' knob is for emergency and debug. It helped me to save several user machines for timecounter. Anyway, I do not want to block the feature on that disagreement. Please fix AT_FXRNG numeric value and go ahead. kib: Normally disable in vdso causes all currently running FreeBSD-ABI processes to stop using… | |||||
if ((sv->sv_flags & SV_TIMEKEEP) != 0) { | if ((sv->sv_flags & SV_TIMEKEEP) != 0) { | ||||
#ifdef COMPAT_FREEBSD32 | #ifdef COMPAT_FREEBSD32 | ||||
if ((sv->sv_flags & SV_ILP32) != 0) { | if ((sv->sv_flags & SV_ILP32) != 0) { | ||||
KASSERT(compat32_svtk == NULL, | KASSERT(compat32_svtk == NULL, | ||||
("Compat32 already registered")); | ("Compat32 already registered")); | ||||
compat32_svtk = alloc_sv_tk_compat32(); | compat32_svtk = alloc_sv_tk_compat32(); | ||||
sv->sv_timekeep_base = sv->sv_shared_page_base + | sv->sv_timekeep_base = sv->sv_shared_page_base + | ||||
compat32_svtk->sv_timekeep_off; | compat32_svtk->sv_timekeep_off; | ||||
} else { | } else { | ||||
#endif | #endif | ||||
KASSERT(host_svtk == NULL, ("Host already registered")); | KASSERT(host_svtk == NULL, ("Host already registered")); | ||||
host_svtk = alloc_sv_tk(); | host_svtk = alloc_sv_tk(); | ||||
sv->sv_timekeep_base = sv->sv_shared_page_base + | sv->sv_timekeep_base = sv->sv_shared_page_base + | ||||
host_svtk->sv_timekeep_off; | host_svtk->sv_timekeep_off; | ||||
#ifdef COMPAT_FREEBSD32 | #ifdef COMPAT_FREEBSD32 | ||||
} | } | ||||
#endif | #endif | ||||
} | } | ||||
#ifdef RANDOM_FENESTRASX | |||||
if ((sv->sv_flags & SV_RNG_SEED_VER) != 0) { | |||||
/* | |||||
Done Inline ActionsPlease move the declaration to the start of the function. kib: Please move the declaration to the start of the function. | |||||
Done Inline ActionsOk cem: Ok | |||||
* Only allocate a single VDSO entry for multiple sysentvecs, | |||||
* i.e., native and COMPAT32. | |||||
*/ | |||||
if (fxrng_shpage_mapping == NULL) | |||||
alloc_sv_fxrng_generation(); | |||||
base = (char *)fxrng_shpage_mapping - shared_page_mapping; | |||||
sv->sv_fxrng_gen_base = sv->sv_shared_page_base + base; | |||||
} | } | ||||
#endif | |||||
} | |||||
void | void | ||||
exec_sysvec_init_secondary(struct sysentvec *sv, struct sysentvec *sv2) | exec_sysvec_init_secondary(struct sysentvec *sv, struct sysentvec *sv2) | ||||
Done Inline ActionsThis function needs update to init sv2->sv_handle fxrng_gen_base. kib: This function needs update to init sv2->sv_handle fxrng_gen_base. | |||||
Done Inline ActionsThanks, will investigate. cem: Thanks, will investigate. | |||||
{ | { | ||||
MPASS((sv2->sv_flags & SV_ABI_MASK) == (sv->sv_flags & SV_ABI_MASK)); | MPASS((sv2->sv_flags & SV_ABI_MASK) == (sv->sv_flags & SV_ABI_MASK)); | ||||
MPASS((sv2->sv_flags & SV_TIMEKEEP) == (sv->sv_flags & SV_TIMEKEEP)); | MPASS((sv2->sv_flags & SV_TIMEKEEP) == (sv->sv_flags & SV_TIMEKEEP)); | ||||
MPASS((sv2->sv_flags & SV_SHP) != 0 && (sv->sv_flags & SV_SHP) != 0); | MPASS((sv2->sv_flags & SV_SHP) != 0 && (sv->sv_flags & SV_SHP) != 0); | ||||
Done Inline ActionsAssert should be added that secondary sysvec has the same value of SV_RNG_SEED_VER flag as the primary. kib: Assert should be added that secondary sysvec has the same value of SV_RNG_SEED_VER flag as the… | |||||
Done Inline ActionsDoes this assert and assignment to base below need #ifdef braces ? If support is not compiled in, base should be NULL. Flag should be consistent anyway. kib: Does this assert and assignment to base below need #ifdef braces ? If support is not compiled… | |||||
Done Inline ActionsNo, I guess it doesn't. Will fix. cem: No, I guess it doesn't. Will fix. | |||||
sv2->sv_shared_page_obj = sv->sv_shared_page_obj; | sv2->sv_shared_page_obj = sv->sv_shared_page_obj; | ||||
sv2->sv_sigcode_base = sv2->sv_shared_page_base + | sv2->sv_sigcode_base = sv2->sv_shared_page_base + | ||||
(sv->sv_sigcode_base - sv->sv_shared_page_base); | (sv->sv_sigcode_base - sv->sv_shared_page_base); | ||||
if ((sv2->sv_flags & SV_ABI_MASK) != SV_ABI_FREEBSD) | if ((sv2->sv_flags & SV_ABI_MASK) != SV_ABI_FREEBSD) | ||||
return; | return; | ||||
if ((sv2->sv_flags & SV_TIMEKEEP) != 0) { | if ((sv2->sv_flags & SV_TIMEKEEP) != 0) { | ||||
sv2->sv_timekeep_base = sv2->sv_shared_page_base + | sv2->sv_timekeep_base = sv2->sv_shared_page_base + | ||||
(sv->sv_timekeep_base - sv->sv_shared_page_base); | (sv->sv_timekeep_base - sv->sv_shared_page_base); | ||||
} | } | ||||
#ifdef RANDOM_FENESTRASX | |||||
if ((sv2->sv_flags & SV_RNG_SEED_VER) != 0) { | |||||
sv2->sv_fxrng_gen_base = sv2->sv_shared_page_base + | |||||
(sv->sv_fxrng_gen_base - sv->sv_shared_page_base); | |||||
} | |||||
#endif | |||||
} | } |
On ILP32 you need to store low, then high words, separated by the _rel fence. Both to ensure the ordering of writes and to give the reciprocal fence to the _acq fence of reader.