diff --git a/sys/x86/include/pvclock.h b/sys/x86/include/pvclock.h index 023acdb80d9c..1306f11fc960 100644 --- a/sys/x86/include/pvclock.h +++ b/sys/x86/include/pvclock.h @@ -1,146 +1,147 @@ /*- * Copyright (c) 2014, Bryan Venteicher * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*
 * $FreeBSD$
 */

#ifndef X86_PVCLOCK
#define X86_PVCLOCK

/* NOTE(review): the header name after this #include is missing in this copy. */
#include
#ifdef _KERNEL
/* NOTE(review): the header name after this #include is missing in this copy. */
#include
#endif /* _KERNEL */

/*
 * Name of the pvclock character device.  pvclock.c uses it both as the
 * cdevsw 'd_name' and as the node name passed to 'make_dev_s()'.
 */
#define	PVCLOCK_CDEVNAME	"pvclock"

/*
 * Per-vCPU time information record.
 *
 * 'pvclock_read_time_info()' in pvclock.c samples this record in a retry
 * loop: the read is repeated while 'version' is odd or differs from the value
 * loaded at the start of the read.  The sampled time is computed as
 * 'system_time' plus the TSC delta since 'tsc_timestamp', scaled by
 * 'pvclock_scale_delta()' with 'tsc_to_system_mul' and 'tsc_shift'.
 * 'flags' is tested against the PVCLOCK_FLAG_* bits below.
 */
struct pvclock_vcpu_time_info {
	uint32_t	version;
	uint32_t	pad0;
	uint64_t	tsc_timestamp;
	uint64_t	system_time;
	uint32_t	tsc_to_system_mul;
	int8_t		tsc_shift;
	uint8_t		flags;
	uint8_t		pad[2];
};

/*
 * Bits of 'pvclock_vcpu_time_info.flags'.
 *
 * When PVCLOCK_FLAG_TSC_STABLE is clear, the kernel read paths in pvclock.c
 * clamp the returned time against the last value they handed out.
 *
 * NOTE(review): "PASUED" looks like a misspelling of "PAUSED"; the macro name
 * is part of the header interface and is left unchanged here.
 */
#define	PVCLOCK_FLAG_TSC_STABLE		0x01
#define	PVCLOCK_FLAG_GUEST_PASUED	0x02

/*
 * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
 * yielding a 64-bit result.
 *
 * 'delta' is first shifted by 'shift' bits (right when 'shift' is negative,
 * left otherwise).  The shifted value is then multiplied by 'mul_frac' and
 * the product is shifted right by 32 bits, so 'mul_frac' behaves as a 32-bit
 * binary fraction.
 *
 * NOTE(review): 'shift' is not range-checked before being used as a shift
 * count.
 */
static inline uint64_t
pvclock_scale_delta(uint64_t delta, uint32_t mul_frac, int shift)
{
	uint64_t product;

	if (shift < 0)
		delta >>= -shift;
	else
		delta <<= shift;

#if defined(__i386__)
	{
		uint32_t tmp1, tmp2;

		/**
		 * For i386, the formula looks like:
		 *
		 *   lower   = (mul_frac * (delta & UINT_MAX)) >> 32
		 *   upper   = mul_frac * (delta >> 32)
		 *   product = lower + upper
		 */
		__asm__ (
			"mul %5 ; "
			"mov %4,%%eax ; "
			"mov %%edx,%4 ; "
			"mul %5 ; "
			"xor %5,%5 ; "
			"add %4,%%eax ; "
			"adc %5,%%edx ; "
			: "=A" (product), "=r" (tmp1), "=r" (tmp2)
			: "a" ((uint32_t)delta), "1" ((uint32_t)(delta >> 32)),
			  "2" (mul_frac) );
	}
#elif defined(__amd64__)
	{
		unsigned long tmp;

		/*
		 * The multiply leaves the wide product split across the 'hi'
		 * and 'lo' operands; the double-precision shift then moves
		 * 'lo' right by 32 bits while filling from 'hi'.
		 */
		__asm__ (
			"mulq %[mul_frac] ; shrd $32, %[hi], %[lo]"
			: [lo]"=a" (product), [hi]"=d" (tmp)
			: "0" (delta), [mul_frac]"rm"((uint64_t)mul_frac));
	}
#else
#error "pvclock: unsupported x86 architecture?"
#endif

	return (product);
}

#ifdef _KERNEL

/*
 * Callback type stored in 'struct pvclock'.  'pvclock_gettime()' calls it
 * with 'get_wallclock_arg' and reads the wall clock record it returns.
 */
typedef struct pvclock_wall_clock *pvclock_get_wallclock_t(void *arg);

/*
 * Wall clock record.  'pvclock_read_wall_clock()' copies 'sec' and 'nsec'
 * into a timespec, retrying while 'version' is odd or changes during the
 * read.
 */
struct pvclock_wall_clock {
	uint32_t	version;
	uint32_t	sec;
	uint32_t	nsec;
};

/*
 * State of one pvclock instance, shared between the caller of
 * 'pvclock_init()' and the pvclock code itself.
 */
struct pvclock {
	/* Public; initialized by the caller of 'pvclock_init()': */
	pvclock_get_wallclock_t		*get_wallclock;
	void				*get_wallclock_arg;
	struct pvclock_vcpu_time_info	*timeinfos;
	bool				 stable_flag_supported;

	/* Private; initialized by the 'pvclock' API: */
	/* Both booleans below are exported as sysctls by 'pvclock_init()'. */
	bool				 vdso_force_unstable;
+	bool				 vdso_enable_without_rdtscp;
	struct timecounter		 tc;
	struct cdev			*cdev;
};

/*
 * NOTE: 'pvclock_get_timecount()' and 'pvclock_get_wallclock()' are purely
 * transitional; they should be removed after 'dev/xen/timer/timer.c' has been
 * migrated to the 'struct pvclock' API.
 */
void		pvclock_resume(void);
uint64_t	pvclock_tsc_freq(struct pvclock_vcpu_time_info *ti);
uint64_t	pvclock_get_timecount(struct pvclock_vcpu_time_info *ti);
void		pvclock_get_wallclock(struct pvclock_wall_clock *wc,
		    struct timespec *ts);

void		pvclock_init(struct pvclock *pvc, device_t dev,
		    const char *tc_name, int tc_quality, u_int tc_flags);
void		pvclock_gettime(struct pvclock *pvc, struct timespec *ts);
int		pvclock_destroy(struct pvclock *pvc);

#endif /* _KERNEL */

#endif
diff --git a/sys/x86/x86/pvclock.c b/sys/x86/x86/pvclock.c
index cc2377bdbcf0..3da3373bb2ee 100644
--- a/sys/x86/x86/pvclock.c
+++ b/sys/x86/x86/pvclock.c
@@ -1,336 +1,358 @@
/*-
 * Copyright (c) 2009 Adrian Chadd
 * Copyright (c) 2012 Spectra Logic Corporation
 * Copyright (c) 2014 Bryan Venteicher
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2.
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Last system time. This is used to guarantee a monotonically non-decreasing * clock for the kernel codepath and approximate the same for the vDSO codepath. * In theory, this should be unnecessary absent hypervisor bug(s) and/or what * should be rare cases where TSC jitter may still be visible despite the * hypervisor's best efforts. 
*/
static volatile uint64_t pvclock_last_systime;

/* Forward declarations of the file-local helpers and callbacks. */
static uint64_t		 pvclock_getsystime(struct pvclock *pvc);
static void		 pvclock_read_time_info(
    struct pvclock_vcpu_time_info *ti, uint64_t *ns, uint8_t *flags);
static void		 pvclock_read_wall_clock(struct pvclock_wall_clock *wc,
    struct timespec *ts);
static u_int		 pvclock_tc_get_timecount(struct timecounter *tc);
static uint32_t		 pvclock_tc_vdso_timehands(
    struct vdso_timehands *vdso_th, struct timecounter *tc);
#ifdef COMPAT_FREEBSD32
static uint32_t		 pvclock_tc_vdso_timehands32(
    struct vdso_timehands32 *vdso_th, struct timecounter *tc);
#endif

static d_open_t		 pvclock_cdev_open;
static d_mmap_t		 pvclock_cdev_mmap;

/* Character device switch for the pvclock node: open and mmap only. */
static struct cdevsw pvclock_cdev_cdevsw = {
	.d_version = D_VERSION,
	.d_name = PVCLOCK_CDEVNAME,
	.d_open = pvclock_cdev_open,
	.d_mmap = pvclock_cdev_mmap,
};

/*
 * Reset the recorded last system time to zero, so that the clamping done by
 * the read paths below starts over.
 */
void
pvclock_resume(void)
{
	atomic_store_rel_64(&pvclock_last_systime, 0);
}

/*
 * Derive a frequency from the scaling parameters in 'ti': start from
 * (1000000000 << 32) / tsc_to_system_mul and then apply 'tsc_shift' in the
 * opposite direction to 'pvclock_scale_delta()' (left when negative, right
 * otherwise).
 *
 * NOTE(review): 'tsc_to_system_mul' is used as a divisor without a zero
 * check.
 */
uint64_t
pvclock_tsc_freq(struct pvclock_vcpu_time_info *ti)
{
	uint64_t freq;

	freq = (1000000000ULL << 32) / ti->tsc_to_system_mul;
	if (ti->tsc_shift < 0)
		freq <<= -ti->tsc_shift;
	else
		freq >>= ti->tsc_shift;
	return (freq);
}

/*
 * Take a consistent sample of the time info record 'ti'.
 *
 * On return '*ns' holds 'system_time' plus the scaled TSC delta since
 * 'tsc_timestamp', and '*flags' holds the record flags.  The sample is
 * retried while 'version' is odd or differs from the value loaded before the
 * fields were read.
 */
static void
pvclock_read_time_info(struct pvclock_vcpu_time_info *ti,
    uint64_t *ns, uint8_t *flags)
{
	uint64_t delta;
	uint32_t version;

	do {
		version = atomic_load_acq_32(&ti->version);
		delta = rdtsc_ordered() - ti->tsc_timestamp;
		*ns = ti->system_time + pvclock_scale_delta(delta,
		    ti->tsc_to_system_mul, ti->tsc_shift);
		*flags = ti->flags;
		/* Order the field reads before the version re-check. */
		atomic_thread_fence_acq();
	} while ((ti->version & 1) != 0 || ti->version != version);
}

/*
 * Copy the wall clock record 'wc' into 'ts' ('sec' to 'tv_sec', 'nsec' to
 * 'tv_nsec'), retrying while 'version' is odd or changes during the copy.
 */
static void
pvclock_read_wall_clock(struct pvclock_wall_clock *wc, struct timespec *ts)
{
	uint32_t version;

	do {
		version = atomic_load_acq_32(&wc->version);
		ts->tv_sec = wc->sec;
		ts->tv_nsec = wc->nsec;
		/* Order the field reads before the version re-check. */
		atomic_thread_fence_acq();
	} while ((wc->version & 1) != 0 || wc->version != version);
}

/*
 * Return the current system time for 'pvc', read from the time info entry
 * indexed by 'curcpu' inside a critical section.
 *
 * When the sampled flags do not have PVCLOCK_FLAG_TSC_STABLE set, the result
 * is clamped so that it is never below 'pvclock_last_systime'; otherwise the
 * loop attempts to publish the new sample as the last system time.
 */
static uint64_t
pvclock_getsystime(struct pvclock *pvc)
{
	uint64_t now, last, ret;
	uint8_t flags;

	critical_enter();
	pvclock_read_time_info(&pvc->timeinfos[curcpu], &now, &flags);
	ret = now;
	if ((flags & PVCLOCK_FLAG_TSC_STABLE) == 0) {
		last = atomic_load_acq_64(&pvclock_last_systime);
		do {
			/* A later time was already returned; reuse it. */
			if (last > now) {
				ret = last;
				break;
			}
		} while (!atomic_fcmpset_rel_64(&pvclock_last_systime, &last,
		    now));
	}
	critical_exit();
	return (ret);
}

/*
 * NOTE: Transitional-only; this should be removed after 'dev/xen/timer/timer.c'
 * has been migrated to the 'struct pvclock' API.
 *
 * Same sampling and clamping as 'pvclock_getsystime()', but the caller passes
 * the time info record directly and no critical section is entered here.
 */
uint64_t
pvclock_get_timecount(struct pvclock_vcpu_time_info *ti)
{
	uint64_t now, last, ret;
	uint8_t flags;

	pvclock_read_time_info(ti, &now, &flags);
	ret = now;
	if ((flags & PVCLOCK_FLAG_TSC_STABLE) == 0) {
		last = atomic_load_acq_64(&pvclock_last_systime);
		do {
			/* A later time was already returned; reuse it. */
			if (last > now) {
				ret = last;
				break;
			}
		} while (!atomic_fcmpset_rel_64(&pvclock_last_systime, &last,
		    now));
	}
	return (ret);
}

/*
 * NOTE: Transitional-only; this should be removed after 'dev/xen/timer/timer.c'
 * has been migrated to the 'struct pvclock' API.
 *
 * Public wrapper around 'pvclock_read_wall_clock()'.
 */
void
pvclock_get_wallclock(struct pvclock_wall_clock *wc, struct timespec *ts)
{
	pvclock_read_wall_clock(wc, ts);
}

/*
 * Open handler for the pvclock device: fails with EPERM when FWRITE is set
 * in 'oflags', succeeds otherwise.
 */
static int
pvclock_cdev_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
{
	if (oflags & FWRITE)
		return (EPERM);
	return (0);
}

/*
 * Mmap handler for the pvclock device.
 *
 * Returns EINVAL when 'offset' is at or beyond 'mp_ncpus' time info records,
 * and EACCES when the requested protection is anything other than PROT_READ.
 * Otherwise stores the physical address of 'si_drv1' plus 'offset' in
 * '*paddr', sets '*memattr' to VM_MEMATTR_DEFAULT and returns 0.
 */
static int
pvclock_cdev_mmap(struct cdev *dev, vm_ooffset_t offset, vm_paddr_t *paddr,
    int nprot, vm_memattr_t *memattr)
{
	if (offset >= mp_ncpus * sizeof(struct pvclock_vcpu_time_info))
		return (EINVAL);
	if (PROT_EXTRACT(nprot) != PROT_READ)
		return (EACCES);
	*paddr = vtophys((uintptr_t)dev->si_drv1 + offset);
	*memattr = VM_MEMATTR_DEFAULT;
	return (0);
}

/*
 * Timecounter read callback: the system time of the owning pvclock instance
 * ('tc_priv'), masked with UINT_MAX.
 */
static u_int
pvclock_tc_get_timecount(struct timecounter *tc)
{
	struct pvclock *pvc = tc->tc_priv;

	return (pvclock_getsystime(pvc) & UINT_MAX);
}

/*
 * Fill 'vdso_th' for the pvclock vDSO algorithm.
 *
 * With this change the function returns 0 right away when no cdev was
 * created.  Otherwise it fills in the algorithm, the last system time and the
 * stable mask (PVCLOCK_FLAG_TSC_STABLE only when 'vdso_force_unstable' is
 * clear and 'stable_flag_supported' is set), and returns nonzero when
 * AMDID_RDTSCP is set in 'amd_feature', or when the stable mask is set and
 * 'vdso_enable_without_rdtscp' is true.
 */
static uint32_t
pvclock_tc_vdso_timehands(struct vdso_timehands *vdso_th,
    struct timecounter *tc)
{
	struct pvclock *pvc = tc->tc_priv;

+	if (pvc->cdev == NULL)
+		return (0);
+
	vdso_th->th_algo = VDSO_TH_ALGO_X86_PVCLK;
	vdso_th->th_x86_shift = 0;
	vdso_th->th_x86_hpet_idx = 0;
	vdso_th->th_x86_pvc_last_systime =
	    atomic_load_acq_64(&pvclock_last_systime);
	vdso_th->th_x86_pvc_stable_mask = !pvc->vdso_force_unstable &&
	    pvc->stable_flag_supported ? PVCLOCK_FLAG_TSC_STABLE : 0;
	bzero(vdso_th->th_res, sizeof(vdso_th->th_res));
-	return (pvc->cdev != NULL && amd_feature & AMDID_RDTSCP);
+	return ((amd_feature & AMDID_RDTSCP) != 0 ||
+	    ((vdso_th->th_x86_pvc_stable_mask & PVCLOCK_FLAG_TSC_STABLE) != 0 &&
+	    pvc->vdso_enable_without_rdtscp));
}

#ifdef COMPAT_FREEBSD32
/*
 * Variant of 'pvclock_tc_vdso_timehands()' operating on
 * 'struct vdso_timehands32'; the statements and the return condition are the
 * same.
 */
static uint32_t
pvclock_tc_vdso_timehands32(struct vdso_timehands32 *vdso_th,
    struct timecounter *tc)
{
	struct pvclock *pvc = tc->tc_priv;

+	if (pvc->cdev == NULL)
+		return (0);
+
	vdso_th->th_algo = VDSO_TH_ALGO_X86_PVCLK;
	vdso_th->th_x86_shift = 0;
	vdso_th->th_x86_hpet_idx = 0;
	vdso_th->th_x86_pvc_last_systime =
	    atomic_load_acq_64(&pvclock_last_systime);
	vdso_th->th_x86_pvc_stable_mask = !pvc->vdso_force_unstable &&
	    pvc->stable_flag_supported ? PVCLOCK_FLAG_TSC_STABLE : 0;
	bzero(vdso_th->th_res, sizeof(vdso_th->th_res));
-	return (pvc->cdev != NULL && amd_feature & AMDID_RDTSCP);
+	return ((amd_feature & AMDID_RDTSCP) != 0 ||
+	    ((vdso_th->th_x86_pvc_stable_mask & PVCLOCK_FLAG_TSC_STABLE) != 0 &&
+	    pvc->vdso_enable_without_rdtscp));
}
#endif

/*
 * Store in 'ts' the wall clock value obtained through the 'get_wallclock'
 * callback plus the current system time of 'pvc' (split into seconds and
 * nanoseconds).
 */
void
pvclock_gettime(struct pvclock *pvc, struct timespec *ts)
{
	struct timespec system_ts;
	uint64_t system_ns;

	pvclock_read_wall_clock(pvc->get_wallclock(pvc->get_wallclock_arg), ts);
	system_ns = pvclock_getsystime(pvc);
	system_ts.tv_sec = system_ns / 1000000000ULL;
	system_ts.tv_nsec = system_ns % 1000000000ULL;
	timespecadd(ts, &system_ts, ts);
}

/*
 * Initialize the private part of 'pvc' and register it.
 *
 * Asserts that 'pvc->timeinfos' is page-aligned, adds the sysctl knobs under
 * the sysctl tree of 'dev', fills in the timecounter using 'tc_name',
 * 'tc_quality' and 'tc_flags', tries to create the character device (a
 * failure is only reported with device_printf), and finally registers the
 * timecounter and the clock.
 */
void
pvclock_init(struct pvclock *pvc, device_t dev, const char *tc_name,
    int tc_quality, u_int tc_flags)
{
	struct make_dev_args mda;
	int err;

	KASSERT(((uintptr_t)pvc->timeinfos & PAGE_MASK) == 0,
	    ("Specified time info page(s) address is not page-aligned."));

	/* Set up vDSO stable-flag suppression test facility: */
	pvc->vdso_force_unstable = false;
	SYSCTL_ADD_BOOL(device_get_sysctl_ctx(dev),
	    SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO,
	    "vdso_force_unstable", CTLFLAG_RW, &pvc->vdso_force_unstable, 0,
	    "Forcibly deassert stable flag in vDSO codepath");

+	/*
+	 * Make it possible to use the vDSO page even when the hypervisor does
+	 * not support the rdtscp instruction.  This is disabled by default for
+	 * compatibility with old libc.
+	 */
+	pvc->vdso_enable_without_rdtscp = false;
+	SYSCTL_ADD_BOOL(device_get_sysctl_ctx(dev),
+	    SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO,
+	    "vdso_enable_without_rdtscp", CTLFLAG_RWTUN,
+	    &pvc->vdso_enable_without_rdtscp, 0,
+	    "Allow the use of a vDSO when rdtscp is not available");
+
	/* Set up timecounter and timecounter-supporting members: */
	pvc->tc.tc_get_timecount = pvclock_tc_get_timecount;
	pvc->tc.tc_poll_pps = NULL;
	pvc->tc.tc_counter_mask = ~0U;
	pvc->tc.tc_frequency = 1000000000ULL;
	pvc->tc.tc_name = tc_name;
	pvc->tc.tc_quality = tc_quality;
	pvc->tc.tc_flags = tc_flags;
	pvc->tc.tc_priv = pvc;
	pvc->tc.tc_fill_vdso_timehands = pvclock_tc_vdso_timehands;
#ifdef COMPAT_FREEBSD32
	pvc->tc.tc_fill_vdso_timehands32 = pvclock_tc_vdso_timehands32;
#endif

	/* Set up cdev for userspace mmapping of vCPU 0 time info page: */
	make_dev_args_init(&mda);
	mda.mda_devsw = &pvclock_cdev_cdevsw;
	mda.mda_uid = UID_ROOT;
	mda.mda_gid = GID_WHEEL;
	mda.mda_mode = 0444;
	mda.mda_si_drv1 = pvc->timeinfos;
	err = make_dev_s(&mda, &pvc->cdev, PVCLOCK_CDEVNAME);
	if (err != 0) {
		device_printf(dev, "Could not create /dev/%s, error %d. Fast "
		    "time of day will be unavailable for this timecounter.\n",
		    PVCLOCK_CDEVNAME, err);
		/* The vDSO fill callbacks rely on cdev being NULL here. */
		KASSERT(pvc->cdev == NULL,
		    ("Failed make_dev_s() unexpectedly inited cdev."));
	}

	/* Register timecounter: */
	tc_init(&pvc->tc);

	/*
	 * Register wallclock:
	 *     The RTC registration API expects a resolution in microseconds;
	 *     pvclock's 1ns resolution is rounded up to 1us.
	 */
	clock_register(dev, 1);
}

/*
 * Teardown entry point; always returns EBUSY.
 */
int
pvclock_destroy(struct pvclock *pvc)
{
	/*
	 * Not currently possible since there is no teardown counterpart of
	 * 'tc_init()'.
	 */
	return (EBUSY);
}