Changeset View
Standalone View
lib/libc/x86/sys/__vdso_gettc.c
Show All 39 Lines | |||||
#include <sys/time.h> | #include <sys/time.h> | ||||
#include <sys/vdso.h> | #include <sys/vdso.h> | ||||
#include <errno.h> | #include <errno.h> | ||||
#include <string.h> | #include <string.h> | ||||
#include <unistd.h> | #include <unistd.h> | ||||
#include "un-namespace.h" | #include "un-namespace.h" | ||||
#include <machine/atomic.h> | #include <machine/atomic.h> | ||||
#include <machine/cpufunc.h> | #include <machine/cpufunc.h> | ||||
#include <machine/pvclock.h> | |||||
#include <machine/specialreg.h> | #include <machine/specialreg.h> | ||||
#include <dev/acpica/acpi_hpet.h> | #include <dev/acpica/acpi_hpet.h> | ||||
#ifdef WANT_HYPERV | #ifdef WANT_HYPERV | ||||
#include <dev/hyperv/hyperv.h> | #include <dev/hyperv/hyperv.h> | ||||
#endif | #endif | ||||
#include <x86/ifunc.h> | #include <x86/ifunc.h> | ||||
#include "libc_private.h" | #include "libc_private.h" | ||||
Show All 39 Lines | |||||
static u_int | static u_int | ||||
rdtsc32_mb_lfence(void) | rdtsc32_mb_lfence(void) | ||||
{ | { | ||||
lfence(); | lfence(); | ||||
return (rdtsc32()); | return (rdtsc32()); | ||||
} | } | ||||
static u_int | static u_int | ||||
royger: I think the inline here is IMO misleading, as you end up taking a pointer to the function when… | |||||
rdtsc32_mb_mfence(void) | rdtsc32_mb_mfence(void) | ||||
{ | { | ||||
mfence(); | mfence(); | ||||
return (rdtsc32()); | return (rdtsc32()); | ||||
} | } | ||||
static u_int | static u_int | ||||
rdtsc32_mb_none(void) | rdtsc32_mb_none(void) | ||||
▲ Show 20 Lines • Show All 195 Lines • ▼ Show 20 Lines | while ((seq = atomic_load_acq_int(&tsc_ref->tsc_seq)) != 0) { | ||||
/* Sequence changed; re-sync. */ | /* Sequence changed; re-sync. */ | ||||
} | } | ||||
return (ENOSYS); | return (ENOSYS); | ||||
} | } | ||||
#endif /* WANT_HYPERV */ | #endif /* WANT_HYPERV */ | ||||
static struct pvclock_vcpu_time_info *pvclock_timeinfos; | |||||
static int | |||||
__vdso_pvclock_gettc(const struct vdso_timehands *th, u_int *tc) | |||||
{ | |||||
uint64_t delta, ns, tsc; | |||||
struct pvclock_vcpu_time_info *ti; | |||||
uint32_t cpuid_ti, cpuid_tsc, version; | |||||
Done Inline ActionsI wonder whether it would be possible to place the scale_delta helper in the pvclock header and have a single implementation shared between the kernel and libc, having duplicated copies of this helper is cumbersome. royger: I wonder whether it would be possible to place the scale_delta helper in the pvclock header and… | |||||
bool stable; | |||||
do { | |||||
ti = &pvclock_timeinfos[0]; | |||||
version = atomic_load_acq_32(&ti->version); | |||||
stable = (ti->flags & th->th_x86_pvc_stable_mask) != 0; | |||||
if (stable) { | |||||
tsc = rdtscp(); | |||||
} else { | |||||
(void)rdtscp_aux(&cpuid_ti); | |||||
ti = &pvclock_timeinfos[cpuid_ti]; | |||||
Done Inline ActionsI think it would be enough to just say that the TSC_STABLE is mandatory because TSCs must be synchronized across CPUs or else using vcpu time info of CPU 0 on for all CPUs would be wrong. royger: I think it would be enough to just say that the TSC_STABLE is mandatory because TSCs must be… | |||||
Done Inline ActionsI decided to just remove this comment altogether. It was basically just attempting to point out how this flag can theoretically change at any time, which would explain how we could arrive at this location from binuptime() even though binuptime(), indirectly via tk_enabled, just observed PVCLOCK_FLAG_TSC_STABLE as set. But I don't think it's as subtle of a point as I must have when I originally wrote this, and I can see how this comment might confuse if __vdso_pvclock_tsc() is read separately from binuptime(). adam_fenn.io: I decided to just remove this comment altogether.
It was basically just attempting to point… | |||||
version = atomic_load_acq_32(&ti->version); | |||||
tsc = rdtscp_aux(&cpuid_tsc); | |||||
} | |||||
delta = tsc - ti->tsc_timestamp; | |||||
ns = ti->system_time + pvclock_scale_delta(delta, | |||||
Done Inline ActionsI do not quite follow this. Why do you need to set pvclock_vcpu_info to MAP_FAILED in advance, during init? Basically, this introduces somewhat unpleasant and very hard to diagnose case, where two threads happen to execute e.g. gettimeofday(), and then one of them falls back to syscall. Unpleasant part is that on the next gettimeofday() call this thread would use vdso path, and kernel vs. usermode timecounters might be relatively off. I tried to avoid this with HPET. Also note use of acq/rel atomics to not rely implicitly on x86 TSO. kib: I do not quite follow this. Why do you need to set pvclock_vcpu_info to MAP_FAILED in advance… | |||||
Done Inline Actionshm... Yeah, this was assuming that it's always "ok"---in that a thread will never see a syscall-based time reading that is less than a previous syscall- or vDSO-based time reading---for a thread to fall back to the syscall path and, based on that assumption, was deliberately allowing the described race in exchange for simplified init code. If it can't be assumed to be safe to switch back and forth between the vDSO and syscall codepaths, then maybe we can't currently support the vDSO codepath for this clock source as the timekeeping code presently stands. (I'll think about this more). But doesn't the hyperv PV TSC vDSO codepath make this same assumption, since a thread can end up switching between the vDSO and syscall codepaths at any time based on whether TscSequence == 0 (hyperv_ref_tsc->tsc_seq == 0)? I want to make sure I have a correct and detailed understanding of the specific vDSO/syscall mismatch problem(s) you're referencing. Would you be able to provide example(s) of the specific way(s) that syscall-based and vDSO-based time values can differ that are being referenced here? I think I can see a case where two calls to clock_gettime() for CLOCK_MONOTONIC_FAST or CLOCK_UPTIME_FAST---which only use th_offset without adding the subsequent delta---could lead to the second call's value being less than the first. This would happen if a thread managed to make a syscall-based reading followed by a vDSO-based reading that both occurred after the in-kernel timehands update but before the corresponding vDSO-based timehands update. Is this an example of the sorts of cases you're thinking of? Am I understanding correctly if I'm thinking that this same scenario but for CLOCK_{MONOTONIC,UPTIME}{,_PRECISE}---which use th_offset plus a th_offset_count-relative delta---would not have this problem because the th_offset_count-based delta that would be included in each reading would be relative to its respective th_offset value? adam_fenn.io: hm...
Yeah, this was assuming that it's always "ok"---in that a thread will never see a… | |||||
ti->tsc_to_system_mul, ti->tsc_shift); | |||||
atomic_thread_fence_acq(); | |||||
} while ((ti->version & 1) != 0 || ti->version != version || | |||||
(!stable && cpuid_ti != cpuid_tsc)); | |||||
*tc = MAX(ns, th->th_x86_pvc_last_systime); | |||||
return (0); | |||||
} | |||||
static void | |||||
__vdso_init_pvclock_timeinfos(void) | |||||
{ | |||||
struct pvclock_vcpu_time_info *timeinfos; | |||||
size_t len; | |||||
int fd, ncpus; | |||||
unsigned int mode; | |||||
timeinfos = MAP_FAILED; | |||||
if (_elf_aux_info(AT_NCPUS, &ncpus, sizeof(ncpus)) != 0 || | |||||
(cap_getmode(&mode) == 0 && mode != 0) || | |||||
(fd = _open("/dev/" PVCLOCK_CDEVNAME, O_RDONLY | O_CLOEXEC)) < 0) | |||||
goto leave; | |||||
len = ncpus * sizeof(*pvclock_timeinfos); | |||||
timeinfos = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0); | |||||
_close(fd); | |||||
leave: | |||||
if (atomic_cmpset_rel_ptr( | |||||
(volatile uintptr_t *)&pvclock_timeinfos, (uintptr_t)NULL, | |||||
(uintptr_t)timeinfos) == 0 && timeinfos != MAP_FAILED) | |||||
(void)munmap((void *)timeinfos, len); | |||||
return; | |||||
} | |||||
#pragma weak __vdso_gettc | #pragma weak __vdso_gettc | ||||
int | int | ||||
__vdso_gettc(const struct vdso_timehands *th, u_int *tc) | __vdso_gettc(const struct vdso_timehands *th, u_int *tc) | ||||
{ | { | ||||
volatile char *map; | volatile char *map; | ||||
uint32_t idx; | uint32_t idx; | ||||
switch (th->th_algo) { | switch (th->th_algo) { | ||||
Show All 19 Lines | |||||
#ifdef WANT_HYPERV | #ifdef WANT_HYPERV | ||||
case VDSO_TH_ALGO_X86_HVTSC: | case VDSO_TH_ALGO_X86_HVTSC: | ||||
if (hyperv_ref_tsc == NULL) | if (hyperv_ref_tsc == NULL) | ||||
__vdso_init_hyperv_tsc(); | __vdso_init_hyperv_tsc(); | ||||
if (hyperv_ref_tsc == MAP_FAILED) | if (hyperv_ref_tsc == MAP_FAILED) | ||||
return (ENOSYS); | return (ENOSYS); | ||||
return (__vdso_hyperv_tsc(hyperv_ref_tsc, tc)); | return (__vdso_hyperv_tsc(hyperv_ref_tsc, tc)); | ||||
#endif | #endif | ||||
case VDSO_TH_ALGO_X86_PVCLK: | |||||
if (pvclock_timeinfos == NULL) | |||||
__vdso_init_pvclock_timeinfos(); | |||||
if (pvclock_timeinfos == MAP_FAILED) | |||||
return (ENOSYS); | |||||
return (__vdso_pvclock_gettc(th, tc)); | |||||
default: | default: | ||||
return (ENOSYS); | return (ENOSYS); | ||||
} | } | ||||
} | } | ||||
#pragma weak __vdso_gettimekeep | #pragma weak __vdso_gettimekeep | ||||
int | int | ||||
__vdso_gettimekeep(struct vdso_timekeep **tk) | __vdso_gettimekeep(struct vdso_timekeep **tk) | ||||
{ | { | ||||
return (_elf_aux_info(AT_TIMEKEEP, tk, sizeof(*tk))); | return (_elf_aux_info(AT_TIMEKEEP, tk, sizeof(*tk))); | ||||
} | } |
I think the inline here is IMO misleading, as you end up taking a pointer to the function when filling the tsc_selector struct, so the inline is just dropped.