Changeset View
Standalone View
lib/libc/x86/sys/__vdso_gettc.c
Show All 39 Lines | |||||
#include <sys/time.h> | #include <sys/time.h> | ||||
#include <sys/vdso.h> | #include <sys/vdso.h> | ||||
#include <errno.h> | #include <errno.h> | ||||
#include <string.h> | #include <string.h> | ||||
#include <unistd.h> | #include <unistd.h> | ||||
#include "un-namespace.h" | #include "un-namespace.h" | ||||
#include <machine/atomic.h> | #include <machine/atomic.h> | ||||
#include <machine/cpufunc.h> | #include <machine/cpufunc.h> | ||||
#include <machine/pvclock.h> | |||||
#include <machine/specialreg.h> | #include <machine/specialreg.h> | ||||
#include <dev/acpica/acpi_hpet.h> | #include <dev/acpica/acpi_hpet.h> | ||||
#ifdef WANT_HYPERV | #ifdef WANT_HYPERV | ||||
#include <dev/hyperv/hyperv.h> | #include <dev/hyperv/hyperv.h> | ||||
#endif | #endif | ||||
#include <x86/ifunc.h> | #include <x86/ifunc.h> | ||||
#include "libc_private.h" | #include "libc_private.h" | ||||
Show All 32 Lines | |||||
} | } | ||||
static u_int | static u_int | ||||
rdtsc_low_mb_none(const struct vdso_timehands *th) | rdtsc_low_mb_none(const struct vdso_timehands *th) | ||||
{ | { | ||||
return (rdtsc_low(th)); | return (rdtsc_low(th)); | ||||
} | } | ||||
static inline uint64_t | |||||
rdtsc_mb_lfence(void) | |||||
{ | |||||
lfence(); | |||||
return (rdtsc()); | |||||
} | |||||
static inline uint64_t | |||||
royger: I think the inline here is IMO misleading, as you end up taking a pointer to the function when… | |||||
rdtsc_mb_mfence(void) | |||||
{ | |||||
mfence(); | |||||
return (rdtsc()); | |||||
} | |||||
static u_int | static u_int | ||||
rdtsc32_mb_lfence(void) | rdtsc32_mb_lfence(void) | ||||
{ | { | ||||
lfence(); | lfence(); | ||||
return (rdtsc32()); | return (rdtsc32()); | ||||
} | } | ||||
static u_int | static u_int | ||||
Show All 11 Lines | |||||
static u_int | static u_int | ||||
rdtscp32_(void) | rdtscp32_(void) | ||||
{ | { | ||||
return (rdtscp32()); | return (rdtscp32()); | ||||
} | } | ||||
/*
 * One set of TSC read routines; the appropriate entry of tsc_selector[]
 * is picked at ifunc resolution time based on CPU features (see
 * tsc_selector_idx()).
 */
struct tsc_selector_tag {
	uint64_t (*ts_rdtsc)(void);	/* Full 64-bit TSC read. */
	u_int (*ts_rdtsc32)(void);	/* Low 32 bits of the TSC. */
	u_int (*ts_rdtsc_low)(const struct vdso_timehands *);
					/* TSC read for vdso timehands. */
};
/*
 * TSC read routine sets, indexed by the value returned by
 * tsc_selector_idx().  RDTSCP (entry 3) is preferred when available;
 * otherwise the vendor-appropriate fence precedes a plain RDTSC, and
 * CPUs without SSE2 (entry 2) get no fence at all.
 */
static const struct tsc_selector_tag tsc_selector[] = {
	[0] = {	/* Intel, LFENCE */
		.ts_rdtsc = rdtsc_mb_lfence,
		.ts_rdtsc32 = rdtsc32_mb_lfence,
		.ts_rdtsc_low = rdtsc_low_mb_lfence,
	},
	[1] = {	/* AMD, MFENCE */
		.ts_rdtsc = rdtsc_mb_mfence,
		.ts_rdtsc32 = rdtsc32_mb_mfence,
		.ts_rdtsc_low = rdtsc_low_mb_mfence,
	},
	[2] = {	/* No SSE2 */
		.ts_rdtsc = rdtsc,
		.ts_rdtsc32 = rdtsc32_mb_none,
		.ts_rdtsc_low = rdtsc_low_mb_none,
	},
	[3] = {	/* RDTSCP */
		.ts_rdtsc = rdtscp,
		.ts_rdtsc32 = rdtscp32_,
		.ts_rdtsc_low = rdtscp_low,
	},
};
static int | static int | ||||
tsc_selector_idx(u_int cpu_feature) | tsc_selector_idx(u_int cpu_feature) | ||||
{ | { | ||||
Show All 30 Lines | tsc_selector_idx(u_int cpu_feature) | ||||
if ((amd_feature & AMDID_RDTSCP) != 0) | if ((amd_feature & AMDID_RDTSCP) != 0) | ||||
return (3); | return (3); | ||||
if ((cpu_feature & CPUID_SSE2) == 0) | if ((cpu_feature & CPUID_SSE2) == 0) | ||||
return (2); | return (2); | ||||
return (amd_cpu ? 1 : 0); | return (amd_cpu ? 1 : 0); | ||||
} | } | ||||
/*
 * ifunc resolver for the 64-bit TSC reader: pick the routine matching
 * this CPU's fence requirements once, at first use.
 */
DEFINE_UIFUNC(static, uint64_t, __vdso_gettc_rdtsc, (void))
{
	int idx;

	idx = tsc_selector_idx(cpu_feature);
	return (tsc_selector[idx].ts_rdtsc);
}
DEFINE_UIFUNC(static, u_int, __vdso_gettc_rdtsc_low, | DEFINE_UIFUNC(static, u_int, __vdso_gettc_rdtsc_low, | ||||
(const struct vdso_timehands *th)) | (const struct vdso_timehands *th)) | ||||
{ | { | ||||
return (tsc_selector[tsc_selector_idx(cpu_feature)].ts_rdtsc_low); | return (tsc_selector[tsc_selector_idx(cpu_feature)].ts_rdtsc_low); | ||||
} | } | ||||
DEFINE_UIFUNC(static, u_int, __vdso_gettc_rdtsc32, (void)) | DEFINE_UIFUNC(static, u_int, __vdso_gettc_rdtsc32, (void)) | ||||
{ | { | ||||
▲ Show 20 Lines • Show All 112 Lines • ▼ Show 20 Lines | while ((seq = atomic_load_acq_int(&tsc_ref->tsc_seq)) != 0) { | ||||
/* Sequence changed; re-sync. */ | /* Sequence changed; re-sync. */ | ||||
} | } | ||||
return (ENOSYS); | return (ENOSYS); | ||||
} | } | ||||
#endif /* WANT_HYPERV */ | #endif /* WANT_HYPERV */ | ||||
/*
 * Shared read-only mapping of vCPU 0's pvclock time info page.  NULL
 * until the first use; set to MAP_FAILED while initialization is in
 * progress or after it has failed (see __vdso_init_pvclock_tsc()), in
 * which case callers fall back to the syscall path.
 */
static struct pvclock_vcpu_time_info *pvclock_vcpu0_info;
/*
 * Scale a 64-bit delta by shifting it by 'shift' (negative values
 * shift right) and multiplying by the 32-bit fraction 'mul_frac',
 * i.e. compute ((delta << shift) * mul_frac) >> 32 as a 64-bit result.
 * The 64x32 multiply is done in assembly because plain 64-bit C
 * arithmetic would lose the high bits of the intermediate product.
 */
static inline uint64_t
__vdso_pvclock_scale_delta(uint64_t delta, uint32_t mul_frac, int shift)
{
	uint64_t product;

	if (shift < 0)
		delta >>= -shift;
	else
		delta <<= shift;
#if defined(__i386__)
	{
		uint32_t tmp1, tmp2;

		/*
		 * For i386, the formula looks like:
		 *
		 *	lower = (mul_frac * (delta & UINT_MAX)) >> 32
		 *	upper = mul_frac * (delta >> 32)
		 *	product = lower + upper
		 */
		__asm__ (
			"mul %5 ; "
			"mov %4,%%eax ; "
			"mov %%edx,%4 ; "
			"mul %5 ; "
			"xor %5,%5 ; "
			"add %4,%%eax ; "
			"adc %5,%%edx ; "
			: "=A" (product), "=r" (tmp1), "=r" (tmp2)
			: "a" ((uint32_t)delta), "1" ((uint32_t)(delta >> 32)),
			"2" (mul_frac) );
	}
#elif defined(__amd64__)
	{
		unsigned long tmp;

		/* 64x64 multiply, then take bits 32..95 of the product. */
		__asm__ (
			"mulq %[mul_frac] ; shrd $32, %[hi], %[lo]"
			: [lo]"=a" (product), [hi]"=d" (tmp)
			: "0" (delta), [mul_frac]"rm"((uint64_t)mul_frac));
	}
#else
#error "pvclock: unsupported x86 architecture?"
#endif
	return (product);
}
static inline uint64_t | |||||
__vdso_pvclock_get_nsec_offset(struct pvclock_vcpu_time_info *ti) | |||||
{ | |||||
uint64_t delta; | |||||
delta = __vdso_gettc_rdtsc() - ti->tsc_timestamp; | |||||
return (__vdso_pvclock_scale_delta(delta, ti->tsc_to_system_mul, | |||||
ti->tsc_shift)); | |||||
} | |||||
/*
 * Read a time counter value from the pvclock page into '*tc'.
 *
 * The read loop follows the pvclock version protocol: retry while
 * 'version' is odd (an update is in progress) or changed across the
 * reads.  Returns 0 on success, or ENOSYS when the TSC is not flagged
 * stable so that the caller falls back to the syscall path.
 */
static int
__vdso_pvclock_tsc(struct pvclock_vcpu_time_info *ti, u_int *tc)
{
	uint64_t cycles;
	uint32_t version;
	uint8_t flags;

	do {
		version = ti->version;
		rmb();	/* Read the fields only after sampling 'version'. */
		cycles = ti->system_time + __vdso_pvclock_get_nsec_offset(ti);
		flags = ti->flags;
		rmb();	/* Order the reads before re-checking 'version'. */
	} while ((ti->version & 1) != 0 || ti->version != version);
	if ((flags & PVCLOCK_FLAG_TSC_STABLE) == 0) {
		/*
		 * The 'PVCLOCK_FLAG_TSC_STABLE' bit must have transitioned from
		 * set to unset between the time that the current (from the
		 * perspective of this context's call to binuptime) 'struct
		 * vdso_timehands' structure and the 'struct vdso_timekeep'
		 * 'tk_enabled' flag were updated and now.  Return 'ENOSYS' to
		 * fall-back to the syscall codepath:
		 */
		return (ENOSYS);
	}
	*tc = cycles;
	return (0);
}
/*
 * Lazily establish the shared mapping of vCPU 0's pvclock time info
 * page from /dev/PVCLOCK_CDEVNAME.
 *
 * The cmpset from NULL to MAP_FAILED ensures only one thread attempts
 * the mapping; if that attempt bails out (capability mode, open
 * failure) or mmap() fails, the pointer remains MAP_FAILED, so the
 * attempt is never retried and callers keep the syscall fallback.
 *
 * NOTE(review): a thread observing MAP_FAILED while the winning thread
 * is still mapping takes the syscall path once and may switch to the
 * vDSO path on its next call — confirm that mixing syscall- and
 * vDSO-based readings this way is acceptable (see review discussion).
 */
static void
__vdso_init_pvclock_tsc(void)
{
	int fd;
	unsigned int mode;

	if (atomic_cmpset_ptr((volatile uintptr_t *)&pvclock_vcpu0_info,
	    (uintptr_t)NULL, (uintptr_t)MAP_FAILED) != 0) {
		/* Capsicum capability mode: cannot open device nodes. */
		if (cap_getmode(&mode) == 0 && mode != 0)
			return;
		fd = _open("/dev/" PVCLOCK_CDEVNAME, O_RDONLY);
		if (fd < 0)
			return;
		pvclock_vcpu0_info = mmap(NULL, sizeof(*pvclock_vcpu0_info),
		    PROT_READ, MAP_SHARED, fd, 0);
		_close(fd);
	}
}
#pragma weak __vdso_gettc | #pragma weak __vdso_gettc | ||||
int | int | ||||
__vdso_gettc(const struct vdso_timehands *th, u_int *tc) | __vdso_gettc(const struct vdso_timehands *th, u_int *tc) | ||||
{ | { | ||||
volatile char *map; | volatile char *map; | ||||
uint32_t idx; | uint32_t idx; | ||||
switch (th->th_algo) { | switch (th->th_algo) { | ||||
Show All 19 Lines | |||||
#ifdef WANT_HYPERV | #ifdef WANT_HYPERV | ||||
case VDSO_TH_ALGO_X86_HVTSC: | case VDSO_TH_ALGO_X86_HVTSC: | ||||
if (hyperv_ref_tsc == NULL) | if (hyperv_ref_tsc == NULL) | ||||
__vdso_init_hyperv_tsc(); | __vdso_init_hyperv_tsc(); | ||||
if (hyperv_ref_tsc == MAP_FAILED) | if (hyperv_ref_tsc == MAP_FAILED) | ||||
return (ENOSYS); | return (ENOSYS); | ||||
return (__vdso_hyperv_tsc(hyperv_ref_tsc, tc)); | return (__vdso_hyperv_tsc(hyperv_ref_tsc, tc)); | ||||
#endif | #endif | ||||
case VDSO_TH_ALGO_X86_PVCLK: | |||||
if (pvclock_vcpu0_info == NULL) | |||||
__vdso_init_pvclock_tsc(); | |||||
if (pvclock_vcpu0_info == MAP_FAILED) | |||||
return (ENOSYS); | |||||
return (__vdso_pvclock_tsc(pvclock_vcpu0_info, tc)); | |||||
default: | default: | ||||
return (ENOSYS); | return (ENOSYS); | ||||
} | } | ||||
} | } | ||||
#pragma weak __vdso_gettimekeep | #pragma weak __vdso_gettimekeep | ||||
int | int | ||||
__vdso_gettimekeep(struct vdso_timekeep **tk) | __vdso_gettimekeep(struct vdso_timekeep **tk) | ||||
{ | { | ||||
return (_elf_aux_info(AT_TIMEKEEP, tk, sizeof(*tk))); | return (_elf_aux_info(AT_TIMEKEEP, tk, sizeof(*tk))); | ||||
} | } |
I think the inline here is misleading, as you end up taking a pointer to the function when filling the tsc_selector struct, so the inline is effectively dropped.