Changeset View
Standalone View
sys/x86/x86/pvclock.c
Show All 25 Lines | |||||
* SUCH DAMAGE. | * SUCH DAMAGE. | ||||
*/ | */ | ||||
#include <sys/cdefs.h> | #include <sys/cdefs.h> | ||||
__FBSDID("$FreeBSD$"); | __FBSDID("$FreeBSD$"); | ||||
#include <sys/param.h> | #include <sys/param.h> | ||||
#include <sys/systm.h> | #include <sys/systm.h> | ||||
#include <sys/bus.h> | |||||
#include <sys/clock.h> | |||||
#include <sys/conf.h> | |||||
#include <sys/fcntl.h> | |||||
#include <sys/limits.h> | |||||
#include <sys/mman.h> | |||||
#include <sys/proc.h> | #include <sys/proc.h> | ||||
#include <sys/smp.h> | |||||
#include <sys/sysctl.h> | |||||
#include <sys/vdso.h> | |||||
#include <machine/cpufunc.h> | #include <vm/vm.h> | ||||
#include <machine/cpu.h> | #include <vm/pmap.h> | ||||
#include <machine/atomic.h> | #include <machine/atomic.h> | ||||
#include <machine/md_var.h> | |||||
#include <machine/pvclock.h> | #include <machine/pvclock.h> | ||||
#include <x86/rdtsc_ordered.h> | |||||
/* | /* | ||||
* Last time; this guarantees a monotonically increasing clock for when | * Last system time. This is used to guarantee a monotonically non-decreasing | ||||
* a stable TSC is not provided. | * clock for the kernel codepath and approximate the same for the vDSO codepath. | ||||
* In theory, this should be unnecessary absent hypervisor bug(s) and/or what | |||||
* should be rare cases where TSC jitter may still be visible despite the | |||||
* hypervisor's best efforts. | |||||
*/ | */ | ||||
static volatile uint64_t pvclock_last_cycles; | static volatile uint64_t pvclock_last_systime; | ||||
static uint64_t pvclock_getsystime(struct pvclock *pvc); | |||||
static void pvclock_read_time_info( | |||||
struct pvclock_vcpu_time_info *ti, uint64_t *ns, uint8_t *flags); | |||||
static void pvclock_read_wall_clock(struct pvclock_wall_clock *wc, | |||||
struct timespec *ts); | |||||
static u_int pvclock_tc_get_timecount(struct timecounter *tc); | |||||
static uint32_t pvclock_tc_vdso_timehands( | |||||
struct vdso_timehands *vdso_th, struct timecounter *tc); | |||||
#ifdef COMPAT_FREEBSD32 | |||||
static uint32_t pvclock_tc_vdso_timehands32( | |||||
struct vdso_timehands32 *vdso_th, struct timecounter *tc); | |||||
#endif | |||||
static d_open_t pvclock_cdev_open; | |||||
static d_mmap_t pvclock_cdev_mmap; | |||||
static struct cdevsw pvclock_cdev_cdevsw = { | |||||
.d_version = D_VERSION, | |||||
.d_name = PVCLOCK_CDEVNAME, | |||||
.d_open = pvclock_cdev_open, | |||||
.d_mmap = pvclock_cdev_mmap, | |||||
}; | |||||
void | void | ||||
pvclock_resume(void) | pvclock_resume(void) | ||||
{ | { | ||||
atomic_store_rel_64(&pvclock_last_systime, 0); | |||||
atomic_store_rel_64(&pvclock_last_cycles, 0); | |||||
} | } | ||||
uint64_t | uint64_t | ||||
pvclock_get_last_cycles(void) | |||||
{ | |||||
return (atomic_load_acq_64(&pvclock_last_cycles)); | |||||
} | |||||
uint64_t | |||||
pvclock_tsc_freq(struct pvclock_vcpu_time_info *ti) | pvclock_tsc_freq(struct pvclock_vcpu_time_info *ti) | ||||
{ | { | ||||
uint64_t freq; | uint64_t freq; | ||||
freq = (1000000000ULL << 32) / ti->tsc_to_system_mul; | freq = (1000000000ULL << 32) / ti->tsc_to_system_mul; | ||||
if (ti->tsc_shift < 0) | if (ti->tsc_shift < 0) | ||||
freq <<= -ti->tsc_shift; | freq <<= -ti->tsc_shift; | ||||
else | else | ||||
freq >>= ti->tsc_shift; | freq >>= ti->tsc_shift; | ||||
return (freq); | return (freq); | ||||
} | } | ||||
/* | |||||
* Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, | |||||
* yielding a 64-bit result. | |||||
*/ | |||||
static inline uint64_t | |||||
pvclock_scale_delta(uint64_t delta, uint32_t mul_frac, int shift) | |||||
{ | |||||
uint64_t product; | |||||
if (shift < 0) | |||||
delta >>= -shift; | |||||
else | |||||
delta <<= shift; | |||||
#if defined(__i386__) | |||||
{ | |||||
uint32_t tmp1, tmp2; | |||||
/** | |||||
* For i386, the formula looks like: | |||||
* | |||||
* lower = (mul_frac * (delta & UINT_MAX)) >> 32 | |||||
* upper = mul_frac * (delta >> 32) | |||||
* product = lower + upper | |||||
*/ | |||||
__asm__ ( | |||||
"mul %5 ; " | |||||
"mov %4,%%eax ; " | |||||
"mov %%edx,%4 ; " | |||||
"mul %5 ; " | |||||
"xor %5,%5 ; " | |||||
"add %4,%%eax ; " | |||||
"adc %5,%%edx ; " | |||||
: "=A" (product), "=r" (tmp1), "=r" (tmp2) | |||||
: "a" ((uint32_t)delta), "1" ((uint32_t)(delta >> 32)), | |||||
"2" (mul_frac) ); | |||||
} | |||||
#elif defined(__amd64__) | |||||
{ | |||||
unsigned long tmp; | |||||
__asm__ ( | |||||
"mulq %[mul_frac] ; shrd $32, %[hi], %[lo]" | |||||
: [lo]"=a" (product), [hi]"=d" (tmp) | |||||
: "0" (delta), [mul_frac]"rm"((uint64_t)mul_frac)); | |||||
} | |||||
#else | |||||
#error "pvclock: unsupported x86 architecture?" | |||||
#endif | |||||
return (product); | |||||
} | |||||
static uint64_t | |||||
pvclock_get_nsec_offset(struct pvclock_vcpu_time_info *ti) | |||||
{ | |||||
uint64_t delta; | |||||
delta = rdtsc() - ti->tsc_timestamp; | |||||
return (pvclock_scale_delta(delta, ti->tsc_to_system_mul, | |||||
ti->tsc_shift)); | |||||
} | |||||
static void | static void | ||||
pvclock_read_time_info(struct pvclock_vcpu_time_info *ti, | pvclock_read_time_info(struct pvclock_vcpu_time_info *ti, | ||||
uint64_t *cycles, uint8_t *flags) | uint64_t *ns, uint8_t *flags) | ||||
{ | { | ||||
uint64_t delta; | |||||
uint32_t version; | uint32_t version; | ||||
do { | do { | ||||
version = ti->version; | version = atomic_load_acq_32(&ti->version); | ||||
rmb(); | delta = rdtsc_ordered() - ti->tsc_timestamp; | ||||
*cycles = ti->system_time + pvclock_get_nsec_offset(ti); | *ns = ti->system_time + pvclock_scale_delta(delta, | ||||
ti->tsc_to_system_mul, ti->tsc_shift); | |||||
*flags = ti->flags; | *flags = ti->flags; | ||||
rmb(); | atomic_thread_fence_acq(); | ||||
} while ((ti->version & 1) != 0 || ti->version != version); | } while ((ti->version & 1) != 0 || ti->version != version); | ||||
} | } | ||||
static void | static void | ||||
pvclock_read_wall_clock(struct pvclock_wall_clock *wc, uint32_t *sec, | pvclock_read_wall_clock(struct pvclock_wall_clock *wc, struct timespec *ts) | ||||
uint32_t *nsec) | |||||
{ | { | ||||
uint32_t version; | uint32_t version; | ||||
do { | do { | ||||
version = wc->version; | version = atomic_load_acq_32(&wc->version); | ||||
rmb(); | ts->tv_sec = wc->sec; | ||||
*sec = wc->sec; | ts->tv_nsec = wc->nsec; | ||||
*nsec = wc->nsec; | atomic_thread_fence_acq(); | ||||
rmb(); | |||||
} while ((wc->version & 1) != 0 || wc->version != version); | } while ((wc->version & 1) != 0 || wc->version != version); | ||||
} | } | ||||
static uint64_t | |||||
pvclock_getsystime(struct pvclock *pvc) | |||||
{ | |||||
uint64_t now, last, ret; | |||||
uint8_t flags; | |||||
critical_enter(); | |||||
pvclock_read_time_info(&pvc->timeinfos[curcpu], &now, &flags); | |||||
if (flags & PVCLOCK_FLAG_TSC_STABLE) | |||||
goto leave_ret_now; | |||||
do { | |||||
last = atomic_load_acq_64(&pvclock_last_systime); | |||||
if (last > now) { | |||||
ret = last; | |||||
goto leave; | |||||
} | |||||
} while (!atomic_cmpset_64(&pvclock_last_systime, last, now)); | |||||
leave_ret_now: | |||||
ret = now; | |||||
leave: | |||||
critical_exit(); | |||||
return (ret); | |||||
} | |||||
/* | |||||
* NOTE: Transitional-only; this should be removed after 'dev/xen/timer/timer.c' | |||||
* has been migrated to the 'struct pvclock' API. | |||||
*/ | |||||
uint64_t | uint64_t | ||||
pvclock_get_timecount(struct pvclock_vcpu_time_info *ti) | pvclock_get_timecount(struct pvclock_vcpu_time_info *ti) | ||||
{ | { | ||||
uint64_t now, last; | uint64_t now, last; | ||||
uint8_t flags; | uint8_t flags; | ||||
pvclock_read_time_info(ti, &now, &flags); | pvclock_read_time_info(ti, &now, &flags); | ||||
if (flags & PVCLOCK_FLAG_TSC_STABLE) | if (flags & PVCLOCK_FLAG_TSC_STABLE) | ||||
return (now); | return (now); | ||||
/* | |||||
* Enforce a monotonically increasing clock time across all VCPUs. | |||||
* If our time is too old, use the last time and return. Otherwise, | |||||
* try to update the last time. | |||||
*/ | |||||
do { | do { | ||||
last = atomic_load_acq_64(&pvclock_last_cycles); | last = atomic_load_acq_64(&pvclock_last_systime); | ||||
if (last > now) | if (last > now) | ||||
return (last); | return (last); | ||||
} while (!atomic_cmpset_64(&pvclock_last_cycles, last, now)); | } while (!atomic_cmpset_64(&pvclock_last_systime, last, now)); | ||||
return (now); | return (now); | ||||
} | } | ||||
/* | |||||
* NOTE: Transitional-only; this should be removed after 'dev/xen/timer/timer.c' | |||||
* has been migrated to the 'struct pvclock' API. | |||||
*/ | |||||
void | void | ||||
pvclock_get_wallclock(struct pvclock_wall_clock *wc, struct timespec *ts) | pvclock_get_wallclock(struct pvclock_wall_clock *wc, struct timespec *ts) | ||||
{ | { | ||||
uint32_t sec, nsec; | pvclock_read_wall_clock(wc, ts); | ||||
} | |||||
pvclock_read_wall_clock(wc, &sec, &nsec); | static int | ||||
ts->tv_sec = sec; | pvclock_cdev_open(struct cdev *dev, int oflags, int devtype, struct thread *td) | ||||
kib: Please do not use these Linuxisms.
I suspect you need load_acq(&wc->version) instead of these two lines ... | |||||
ts->tv_nsec = nsec; | { | ||||
if (oflags & FWRITE) | |||||
return (EPERM); | |||||
Done Inline Actions - kib: ... and then atomic_thread_fence_acq() instead of this rmb(). But where is the writer? | |||||
Done Inline Actions - royger: The writer is the hypervisor. | |||||
return (0); | |||||
} | |||||
static int | |||||
pvclock_cdev_mmap(struct cdev *dev, vm_ooffset_t offset, vm_paddr_t *paddr, | |||||
int nprot, vm_memattr_t *memattr) | |||||
{ | |||||
if (offset >= mp_ncpus * sizeof(struct pvclock_vcpu_time_info)) | |||||
return (EINVAL); | |||||
if (PROT_EXTRACT(nprot) != PROT_READ) | |||||
return (EACCES); | |||||
*paddr = vtophys((uintptr_t)dev->si_drv1 + offset); | |||||
*memattr = VM_MEMATTR_DEFAULT; | |||||
return (0); | |||||
} | |||||
static u_int | |||||
pvclock_tc_get_timecount(struct timecounter *tc) | |||||
{ | |||||
struct pvclock *pvc = tc->tc_priv; | |||||
Done Inline Actions - royger: I think this should return EACCES, as the file only allows opening in read-only mode. | |||||
return (pvclock_getsystime(pvc) & UINT_MAX); | |||||
} | |||||
static uint32_t | |||||
pvclock_tc_vdso_timehands(struct vdso_timehands *vdso_th, | |||||
struct timecounter *tc) | |||||
{ | |||||
struct pvclock *pvc = tc->tc_priv; | |||||
vdso_th->th_algo = VDSO_TH_ALGO_X86_PVCLK; | |||||
vdso_th->th_x86_shift = 0; | |||||
vdso_th->th_x86_hpet_idx = 0; | |||||
vdso_th->th_x86_pvc_last_systime = | |||||
atomic_load_acq_64(&pvclock_last_systime); | |||||
vdso_th->th_x86_pvc_stable_mask = !pvc->vdso_force_unstable && | |||||
pvc->stable_flag_supported ? PVCLOCK_FLAG_TSC_STABLE : 0; | |||||
bzero(vdso_th->th_res, sizeof(vdso_th->th_res)); | |||||
return (pvc->cdev != NULL && amd_feature & AMDID_RDTSCP); | |||||
} | |||||
#ifdef COMPAT_FREEBSD32 | |||||
static uint32_t | |||||
pvclock_tc_vdso_timehands32(struct vdso_timehands32 *vdso_th, | |||||
struct timecounter *tc) | |||||
{ | |||||
struct pvclock *pvc = tc->tc_priv; | |||||
vdso_th->th_algo = VDSO_TH_ALGO_X86_PVCLK; | |||||
vdso_th->th_x86_shift = 0; | |||||
vdso_th->th_x86_hpet_idx = 0; | |||||
vdso_th->th_x86_pvc_last_systime = | |||||
atomic_load_acq_64(&pvclock_last_systime); | |||||
vdso_th->th_x86_pvc_stable_mask = !pvc->vdso_force_unstable && | |||||
pvc->stable_flag_supported ? PVCLOCK_FLAG_TSC_STABLE : 0; | |||||
bzero(vdso_th->th_res, sizeof(vdso_th->th_res)); | |||||
return (pvc->cdev != NULL && amd_feature & AMDID_RDTSCP); | |||||
} | |||||
#endif | |||||
void | |||||
pvclock_gettime(struct pvclock *pvc, struct timespec *ts) | |||||
{ | |||||
struct timespec system_ts; | |||||
uint64_t system_ns; | |||||
pvclock_read_wall_clock(pvc->get_wallclock(pvc->get_wallclock_arg), ts); | |||||
system_ns = pvclock_getsystime(pvc); | |||||
system_ts.tv_sec = system_ns / 1000000000ULL; | |||||
system_ts.tv_nsec = system_ns % 1000000000ULL; | |||||
Done Inline Actions - royger: Couldn't the vDSO become functional at some later point (i.e., when migrated to a different host) even if the current vcpu_time_info reports non-stable? I wonder whether we should just check if the stable flag is supported instead of also checking if it's currently set. | |||||
Done Inline Actions - adam_fenn.io: Right. But pvclock_tc_vdso_timehands{,32}() is called with each vDSO timehands update, so such scenarios should be supported, no? I think the (pvc->ti_vcpu0_page->flags & PVCLOCK_FLAG_TSC_STABLE) != 0 clause could, indeed, be dropped. But I included it thinking that, in the unstable case, we might as well short-circuit the vDSO codepath sooner rather than later: with this clause, the vDSO codepath decides to fall back to the syscall codepath after it looks at tk->tk_enabled in binuptime(), whereas, without this clause, this decision will happen after the PVCLOCK_FLAG_TSC_STABLE flag check in __vdso_pvclock_tsc(). I obtained a rough measurement of the delta between these two versions of the unstable TSC codepath by inverting the PVCLOCK_FLAG_TSC_STABLE check in __vdso_pvclock_tsc() and then looking at syscall_timing gettimeofday numbers for (1) kern.timecounter.fast_gettime=0 and (2) kern.timecounter.fast_gettime=1. I figure, on my PVCLOCK_FLAG_TSC_STABLE test systems, (1) should roughly simulate the version where the code is left as-is and (2) should roughly simulate the version where this clause is dropped. These syscall_timing gettimeofday numbers from (1) to (2) regressed from 116ns -> 146ns = +30ns on my AMD HW and 119ns -> 141ns = +22ns on my Intel HW. So the difference is potentially non-negligible, and it makes sense to keep this clause in place (or achieve the same outcome in some other way)? What do you think? | |||||
timespecadd(ts, &system_ts, ts); | |||||
} | |||||
void | |||||
pvclock_init(struct pvclock *pvc, device_t dev, const char *tc_name, | |||||
int tc_quality, u_int tc_flags) | |||||
{ | |||||
struct make_dev_args mda; | |||||
int err; | |||||
KASSERT(((uintptr_t)pvc->timeinfos & PAGE_MASK) == 0, | |||||
("Specified time info page(s) address is not page-aligned.")); | |||||
/* Set up vDSO stable-flag suppression test facility: */ | |||||
pvc->vdso_force_unstable = false; | |||||
SYSCTL_ADD_BOOL(device_get_sysctl_ctx(dev), | |||||
SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, | |||||
"vdso_force_unstable", CTLFLAG_RW, &pvc->vdso_force_unstable, 0, | |||||
"Forcibly deassert stable flag in vDSO codepath"); | |||||
/* Set up timecounter and timecounter-supporting members: */ | |||||
pvc->tc.tc_get_timecount = pvclock_tc_get_timecount; | |||||
pvc->tc.tc_poll_pps = NULL; | |||||
pvc->tc.tc_counter_mask = ~0U; | |||||
pvc->tc.tc_frequency = 1000000000ULL; | |||||
pvc->tc.tc_name = tc_name; | |||||
pvc->tc.tc_quality = tc_quality; | |||||
pvc->tc.tc_flags = tc_flags; | |||||
pvc->tc.tc_priv = pvc; | |||||
pvc->tc.tc_fill_vdso_timehands = pvclock_tc_vdso_timehands; | |||||
#ifdef COMPAT_FREEBSD32 | |||||
pvc->tc.tc_fill_vdso_timehands32 = pvclock_tc_vdso_timehands32; | |||||
#endif | |||||
/* Set up cdev for userspace mmapping of vCPU 0 time info page: */ | |||||
make_dev_args_init(&mda); | |||||
mda.mda_devsw = &pvclock_cdev_cdevsw; | |||||
mda.mda_uid = UID_ROOT; | |||||
mda.mda_gid = GID_WHEEL; | |||||
mda.mda_mode = 0444; | |||||
mda.mda_si_drv1 = pvc->timeinfos; | |||||
err = make_dev_s(&mda, &pvc->cdev, PVCLOCK_CDEVNAME); | |||||
if (err != 0) { | |||||
device_printf(dev, "Could not create /dev/%s, error %d. Fast " | |||||
"time of day will be unavailable for this timecounter.\n", | |||||
PVCLOCK_CDEVNAME, err); | |||||
KASSERT(pvc->cdev == NULL, | |||||
("Failed make_dev_s() unexpectedly inited cdev.")); | |||||
} | |||||
/* Register timecounter: */ | |||||
Done Inline Actions - royger: Given the list of arguments here, I think we should consider creating a structure and passing a pointer to it, or rather have the caller fill in some of the pvclock struct fields? You could split the fields of pvclock between the public ones to be filled by the caller and the private ones. | |||||
tc_init(&pvc->tc); | |||||
/* | |||||
* Register wallclock: | |||||
* The RTC registration API expects a resolution in microseconds; | |||||
* pvclock's 1ns resolution is rounded up to 1us. | |||||
Done Inline Actions - royger: Nit: I usually try to avoid splitting such log messages because grepping for them afterwards is not possible. I would rather start the message on a new line, as you won't have to split it then. | |||||
*/ | |||||
clock_register(dev, 1); | |||||
} | |||||
int | |||||
pvclock_destroy(struct pvclock *pvc) | |||||
{ | |||||
/* | |||||
* Not currently possible since there is no teardown counterpart of | |||||
* 'tc_init()'. | |||||
*/ | |||||
return (EBUSY); | |||||
} | } | ||||
Done Inline Actions - royger: Given that the pvclock has a fixed resolution of 1ns, I think it would be fine to just hardcode this as 1us. It's not like PVCLOCK_RESOLUTION_NS can or will be modified anyway? |
Please do not use these linuxisms.
I suspect you need load_acq(&wc->version) instead of these two lines ...