Index: lib/libc/x86/gen/Makefile.inc =================================================================== --- lib/libc/x86/gen/Makefile.inc +++ lib/libc/x86/gen/Makefile.inc @@ -3,4 +3,5 @@ .PATH: ${LIBC_SRCTOP}/x86/gen SRCS+= \ - getcontextx.c + getcontextx.c \ + rdtsc_ordered.c Index: lib/libc/x86/gen/rdtsc_ordered.c =================================================================== --- lib/libc/x86/gen/rdtsc_ordered.c +++ lib/libc/x86/gen/rdtsc_ordered.c @@ -1,6 +1,8 @@ /*- - * Copyright (c) 2014, Bryan Venteicher - * All rights reserved. + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2021 Juniper Networks, Inc. + * Copyright (c) 2021 Klara, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -22,38 +24,11 @@ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
- *
- * $FreeBSD$
  */
 
-#ifndef X86_PVCLOCK
-#define X86_PVCLOCK
-
-struct pvclock_vcpu_time_info {
-	uint32_t	version;
-	uint32_t	pad0;
-	uint64_t	tsc_timestamp;
-	uint64_t	system_time;
-	uint32_t	tsc_to_system_mul;
-	int8_t		tsc_shift;
-	uint8_t		flags;
-	uint8_t		pad[2];
-};
-
-#define PVCLOCK_FLAG_TSC_STABLE		0x01
-#define PVCLOCK_FLAG_GUEST_PASUED	0x02
-
-struct pvclock_wall_clock {
-	uint32_t	version;
-	uint32_t	sec;
-	uint32_t	nsec;
-};
+#include
+__FBSDID("$FreeBSD$");
 
-void		pvclock_resume(void);
-uint64_t	pvclock_get_last_cycles(void);
-uint64_t	pvclock_tsc_freq(struct pvclock_vcpu_time_info *ti);
-uint64_t	pvclock_get_timecount(struct pvclock_vcpu_time_info *ti);
-void		pvclock_get_wallclock(struct pvclock_wall_clock *wc,
-		    struct timespec *ts);
+#include
 
-#endif
+DEFINE_RDTSC_ORDERED();
Index: lib/libc/x86/sys/__vdso_gettc.c
===================================================================
--- lib/libc/x86/sys/__vdso_gettc.c
+++ lib/libc/x86/sys/__vdso_gettc.c
@@ -45,6 +45,7 @@
 #include "un-namespace.h"
 #include
 #include
+#include
 #include
 #include
 #ifdef WANT_HYPERV
@@ -312,6 +313,54 @@
 
 #endif /* WANT_HYPERV */
 
+/*
+ * Userspace mapping of the vCPU 0 time info page: NULL until the first
+ * lookup, MAP_FAILED while being set up or when unavailable.
+ */
+static struct pvclock_vcpu_time_info *pvclock_vcpu0_info;
+
+/*
+ * Read the pvclock time through 'ti' and store it, truncated to u_int,
+ * in '*tc'.  Returns ENOSYS unless PVCLOCK_FLAG_TSC_STABLE is set.
+ */
+static int
+__vdso_pvclock_tsc(struct pvclock_vcpu_time_info *ti, u_int *tc)
+{
+	uint64_t ns;
+	uint8_t flags;
+
+	pvclock_read_time_info(ti, &ns, &flags);
+	if ((flags & PVCLOCK_FLAG_TSC_STABLE) == 0)
+		return (ENOSYS);
+	*tc = ns;
+	return (0);
+}
+
+/*
+ * One-time setup: the thread that swaps NULL for MAP_FAILED goes on to
+ * map /dev/PVCLOCK_CDEVNAME read-only.  Every early return leaves
+ * MAP_FAILED in place, which __vdso_gettc() reports as ENOSYS.
+ */
+static void
+__vdso_init_pvclock_tsc(void)
+{
+	int fd;
+	unsigned int mode;
+
+	if (atomic_cmpset_acq_ptr((volatile uintptr_t *)&pvclock_vcpu0_info,
+	    (uintptr_t)NULL, (uintptr_t)MAP_FAILED) != 0) {
+		if (cap_getmode(&mode) == 0 && mode != 0)
+			return;
+		/* O_CLOEXEC: do not leak the fd into a concurrent exec. */
+		fd = _open("/dev/" PVCLOCK_CDEVNAME, O_RDONLY | O_CLOEXEC);
+		if (fd < 0)
+			return;
+		pvclock_vcpu0_info = mmap(NULL, sizeof(*pvclock_vcpu0_info),
+		    PROT_READ, MAP_SHARED, fd, 0);
+		_close(fd);
+	}
+}
+
 #pragma weak __vdso_gettc
 int
 __vdso_gettc(const struct vdso_timehands *th, u_int *tc)
@@ -347,6 +396,12 @@
 			return (ENOSYS);
 		return (__vdso_hyperv_tsc(hyperv_ref_tsc, tc));
 #endif
+	case VDSO_TH_ALGO_X86_PVCLK:
+		if (pvclock_vcpu0_info == NULL)
+			__vdso_init_pvclock_tsc();
+		if (pvclock_vcpu0_info == MAP_FAILED)
+			return (ENOSYS);
+		return (__vdso_pvclock_tsc(pvclock_vcpu0_info, tc));
 	default:
 		return (ENOSYS);
 	}
Index: sys/amd64/conf/GENERIC
===================================================================
--- sys/amd64/conf/GENERIC
+++ sys/amd64/conf/GENERIC
@@ -376,6 +376,9 @@
 device		virtio_scsi		# VirtIO SCSI device
 device		virtio_balloon		# VirtIO Memory Balloon device
 
+# Linux KVM paravirtualization support
+device		kvm_clock		# KVM paravirtual clock driver
+
 # HyperV drivers and enhancement support
 device		hyperv			# HyperV drivers
 
Index: sys/amd64/conf/MINIMAL
===================================================================
--- sys/amd64/conf/MINIMAL
+++ sys/amd64/conf/MINIMAL
@@ -144,6 +144,9 @@
 # Note that 'bpf' is required for DHCP.
 device		bpf			# Berkeley packet filter
 
+# Linux KVM paravirtualization support
+device		kvm_clock		# KVM paravirtual clock driver
+
 # Xen HVM Guest Optimizations
 # NOTE: XENHVM depends on xenpci. They must be added or removed together.
options 	XENHVM			# Xen HVM kernel infrastructure
Index: sys/amd64/conf/NOTES
===================================================================
--- sys/amd64/conf/NOTES
+++ sys/amd64/conf/NOTES
@@ -499,6 +499,9 @@
 device		virtio_random		# VirtIO Entropy device
 device		virtio_console		# VirtIO Console device
 
+# Linux KVM paravirtualization support
+device		kvm_clock		# KVM paravirtual clock driver
+
 # Microsoft Hyper-V enhancement support
 device		hyperv			# HyperV drivers
 
Index: sys/conf/files.x86
===================================================================
--- sys/conf/files.x86
+++ sys/conf/files.x86
@@ -276,6 +276,7 @@
 dev/isci/scil/scif_sas_task_request_states.c			optional isci
 dev/isci/scil/scif_sas_timer.c					optional isci
 dev/itwd/itwd.c			optional	itwd
+dev/kvm_clock/kvm_clock.c	optional	kvm_clock
 dev/qat/qat.c			optional	qat
 dev/qat/qat_ae.c		optional	qat
 dev/qat/qat_c2xxx.c		optional	qat
@@ -332,6 +333,7 @@
 x86/x86/mp_watchdog.c		optional	mp_watchdog smp
 x86/x86/nexus.c			standard
 x86/x86/pvclock.c		standard
+x86/x86/rdtsc_ordered.c		standard
 x86/x86/stack_machdep.c		optional	ddb | stack
 x86/x86/tsc.c			standard
 x86/x86/ucode.c			standard
Index: sys/dev/kvm_clock/kvm_clock.c
===================================================================
--- /dev/null
+++ sys/dev/kvm_clock/kvm_clock.c
@@ -0,0 +1,267 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2014 Bryan Venteicher
+ * Copyright (c) 2021 Mathieu Chouquet-Stringer
+ * Copyright (c) 2021 Juniper Networks, Inc.
+ * Copyright (c) 2021 Klara, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Linux KVM paravirtual clock support
+ *
+ * References:
+ *     - [1] https://www.kernel.org/doc/html/latest/virt/kvm/cpuid.html
+ *     - [2] https://www.kernel.org/doc/html/latest/virt/kvm/msr.html
+ */
+
+#include
+__FBSDID("$FreeBSD$");
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+#include
+
+#include "clock_if.h"
+
+#define	KVM_CLOCK_DEVNAME		"kvmclock"
+/*
+ * Note: Chosen to be (1) above HPET's value (always 950), (2) above the TSC's
+ * default value of 800, and (3) below the TSC's value when it supports the
+ * "Invariant TSC" feature and is believed to be synchronized across all CPUs.
+ */
+#define	KVM_CLOCK_TC_QUALITY		975
+
+struct kvm_clock_softc {
+	struct pvclock			 pvc;
+	struct pvclock_wall_clock	 wc;
+	struct pvclock_vcpu_time_info	*timeinfos;
+	u_int				 msr_tc;
+	u_int				 msr_wc;
+};
+
+static devclass_t	kvm_clock_devclass;
+
+static struct pvclock_vcpu_time_info *kvm_clock_get_curcpu_timeinfo(void *arg);
+static struct pvclock_wall_clock *kvm_clock_get_wallclock(void *arg);
+static void	kvm_clock_system_time_enable(struct kvm_clock_softc *sc);
+static void	kvm_clock_system_time_enable_pcpu(void *arg);
+
+/* pvclock callback: the current CPU's slot in the time info array 'arg'. */
+static struct pvclock_vcpu_time_info *
+kvm_clock_get_curcpu_timeinfo(void *arg)
+{
+	struct pvclock_vcpu_time_info *timeinfos = arg;
+
+	return (&timeinfos[curcpu]);
+}
+
+/*
+ * pvclock callback: write the physical address of the softc's wall clock
+ * structure to the wall clock MSR, then return that structure.
+ */
+static struct pvclock_wall_clock *
+kvm_clock_get_wallclock(void *arg)
+{
+	struct kvm_clock_softc *sc = arg;
+
+	wrmsr(sc->msr_wc, vtophys(&sc->wc));
+	return (&sc->wc);
+}
+
+/* Run kvm_clock_system_time_enable_pcpu() through smp_rendezvous(). */
+static void
+kvm_clock_system_time_enable(struct kvm_clock_softc *sc)
+{
+	smp_rendezvous(NULL, kvm_clock_system_time_enable_pcpu, NULL, sc);
+}
+
+/* Write the current CPU's time info address and enable bit to the MSR. */
+static void
+kvm_clock_system_time_enable_pcpu(void *arg)
+{
+	struct kvm_clock_softc *sc = arg;
+
+	/*
+	 * See [2]; the lsb of this MSR is the system time enable bit.
+	 */
+	wrmsr(sc->msr_tc, vtophys(&(sc->timeinfos)[curcpu]) | 1);
+}
+
+/*
+ * Add a "kvmclock" child to 'parent' if either KVM clocksource feature is
+ * advertised and no such child exists yet.
+ */
+static void
+kvm_clock_identify(driver_t *driver, device_t parent)
+{
+	u_int regs[4];
+
+	kvm_cpuid_get_features(regs);
+	if ((regs[0] & KVM_FEATURE_CLOCKSOURCE2) == 0 &&
+	    (regs[0] & KVM_FEATURE_CLOCKSOURCE) == 0)
+		return;
+	if (device_find_child(parent, KVM_CLOCK_DEVNAME, -1) != NULL)
+		return;
+	BUS_ADD_CHILD(parent, 0, KVM_CLOCK_DEVNAME, 0);
+}
+
+static int
+kvm_clock_probe(device_t dev)
+{
+	device_set_desc(dev, "KVM paravirtual clock");
+	return (BUS_PROBE_DEFAULT);
+}
+
+/*
+ * Select the MSR pair matching the advertised clocksource feature, allocate
+ * and enable the per-CPU time info array, and register with pvclock.
+ */
+static int
+kvm_clock_attach(device_t dev)
+{
+	u_int regs[4];
+	struct kvm_clock_softc *sc = device_get_softc(dev);
+	bool stable_flag_supported;
+
+	/* Process KVM "features" CPUID leaf content: */
+	kvm_cpuid_get_features(regs);
+	if ((regs[0] & KVM_FEATURE_CLOCKSOURCE2) != 0) {
+		sc->msr_tc = KVM_MSR_SYSTEM_TIME_NEW;
+		sc->msr_wc = KVM_MSR_WALL_CLOCK_NEW;
+	} else if ((regs[0] & KVM_FEATURE_CLOCKSOURCE) != 0) {
+		sc->msr_tc = KVM_MSR_SYSTEM_TIME;
+		sc->msr_wc = KVM_MSR_WALL_CLOCK;
+	} else
+		return (ENXIO);
+	stable_flag_supported =
+	    ((regs[0] & KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) != 0);
+
+	/* Set up 'struct pvclock_vcpu_time_info' page(s): */
+	sc->timeinfos = malloc_domainset_aligned(round_page(mp_ncpus *
+	    sizeof(struct pvclock_vcpu_time_info)), PAGE_SIZE, M_DEVBUF,
+	    DOMAINSET_RR(), M_WAITOK | M_ZERO);
+	kvm_clock_system_time_enable(sc);
+
+	/*
+	 * Init pvclock; register KVM clock wall clock, register KVM clock
+	 * timecounter, and set up the requisite infrastructure for vDSO access
+	 * to this timecounter.
+	 *     Regarding 'tc_flags': Since the KVM MSR documentation does not
+	 *     specifically discuss suspend/resume scenarios, conservatively
+	 *     leave 'TC_FLAGS_SUSPEND_SAFE' cleared and assume that the system
+	 *     time must be re-inited in such cases.
+	 */
+	sc->pvc.get_curcpu_ti = kvm_clock_get_curcpu_timeinfo;
+	sc->pvc.get_curcpu_ti_arg = sc->timeinfos;
+	sc->pvc.get_wallclock = kvm_clock_get_wallclock;
+	sc->pvc.get_wallclock_arg = sc;
+	sc->pvc.ti_vcpu0_page = sc->timeinfos;
+	sc->pvc.stable_flag_supported = stable_flag_supported;
+	pvclock_init(&sc->pvc, dev, KVM_CLOCK_DEVNAME, KVM_CLOCK_TC_QUALITY, 0);
+	return (0);
+}
+
+/* Detach succeeds only if pvclock_destroy() does. */
+static int
+kvm_clock_detach(device_t dev)
+{
+	struct kvm_clock_softc *sc = device_get_softc(dev);
+
+	return (pvclock_destroy(&sc->pvc));
+}
+
+/* Nothing to do here; kvm_clock_resume() re-enables the system time. */
+static int
+kvm_clock_suspend(device_t dev)
+{
+	return (0);
+}
+
+static int
+kvm_clock_resume(device_t dev)
+{
+	/*
+	 * See note in 'kvm_clock_attach()' regarding 'TC_FLAGS_SUSPEND_SAFE';
+	 * conservatively assume that the system time must be re-inited in
+	 * suspend/resume scenarios.
+	 */
+	kvm_clock_system_time_enable(device_get_softc(dev));
+	pvclock_resume();
+	inittodr(time_second);
+	return (0);
+}
+
+/* clock_gettime method: the time computed by pvclock_gettime(). */
+static int
+kvm_clock_gettime(device_t dev, struct timespec *ts)
+{
+	struct kvm_clock_softc *sc = device_get_softc(dev);
+
+	pvclock_gettime(&sc->pvc, ts);
+	return (0);
+}
+
+static int
+kvm_clock_settime(device_t dev, struct timespec *ts)
+{
+	/*
+	 * Even though it is not possible to set the KVM clock's wall clock, to
+	 * avoid the possibility of periodic benign error messages from
+	 * 'settime_task_func()', report success rather than, e.g., 'ENODEV'.
+	 */
+	return (0);
+}
+
+static device_method_t kvm_clock_methods[] = {
+	DEVMETHOD(device_identify,	kvm_clock_identify),
+	DEVMETHOD(device_probe,		kvm_clock_probe),
+	DEVMETHOD(device_attach,	kvm_clock_attach),
+	DEVMETHOD(device_detach,	kvm_clock_detach),
+	DEVMETHOD(device_suspend,	kvm_clock_suspend),
+	DEVMETHOD(device_resume,	kvm_clock_resume),
+	/* clock interface */
+	DEVMETHOD(clock_gettime,	kvm_clock_gettime),
+	DEVMETHOD(clock_settime,	kvm_clock_settime),
+
+	DEVMETHOD_END
+};
+
+static driver_t kvm_clock_driver = {
+	KVM_CLOCK_DEVNAME,
+	kvm_clock_methods,
+	sizeof(struct kvm_clock_softc),
+};
+
+DRIVER_MODULE(kvm_clock, nexus, kvm_clock_driver, kvm_clock_devclass, 0, 0);
Index: sys/i386/conf/GENERIC
===================================================================
--- sys/i386/conf/GENERIC
+++ sys/i386/conf/GENERIC
@@ -338,6 +338,9 @@
 device		virtio_scsi		# VirtIO SCSI device
 device		virtio_balloon		# VirtIO Memory Balloon device
 
+# Linux KVM paravirtualization support
+device		kvm_clock		# KVM paravirtual clock driver
+
 # HyperV drivers and enhancement support
 device		hyperv			# HyperV drivers
 
Index: sys/i386/conf/MINIMAL
===================================================================
--- sys/i386/conf/MINIMAL
+++ sys/i386/conf/MINIMAL
@@ -145,6 +145,9 @@
 # Note that 'bpf' is required for DHCP.
 device		bpf			# Berkeley packet filter
 
+# Linux KVM paravirtualization support
+device		kvm_clock		# KVM paravirtual clock driver
+
 # Xen HVM Guest Optimizations
 # NOTE: XENHVM depends on xenpci. They must be added or removed together.
options 	XENHVM			# Xen HVM kernel infrastructure
Index: sys/i386/conf/NOTES
===================================================================
--- sys/i386/conf/NOTES
+++ sys/i386/conf/NOTES
@@ -719,6 +719,9 @@
 device		virtio_random		# VirtIO Entropy device
 device		virtio_console		# VirtIO Console device
 
+# Linux KVM paravirtualization support
+device		kvm_clock		# KVM paravirtual clock driver
+
 device		hyperv			# HyperV drivers
 
 #####################################################################
Index: sys/x86/include/_rdtsc_ordered.h
===================================================================
--- /dev/null
+++ sys/x86/include/_rdtsc_ordered.h
@@ -0,0 +1,129 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2021 Juniper Networks, Inc.
+ * Copyright (c) 2021 Klara, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _X86__RDTSC_ORDERED_H_
+#define	_X86__RDTSC_ORDERED_H_
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#ifdef _KERNEL
+#include
+#include
+#else /* !_KERNEL */
+#include
+#include
+#endif /* _KERNEL */
+
+/*
+ * Defines the fenced RDTSC helpers and rdtsc_ordered_select(), which picks
+ * the TSC read routine: rdtscp when AMDID_RDTSCP is set; otherwise, with
+ * CPUID_SSE2, rdtsc preceded by mfence if 'cpu_is_amd' or by lfence if not;
+ * otherwise plain rdtsc.
+ */
+#define	DEFINE_RDTSC_ORDERED_COMMON()					\
+	static uint64_t							\
+	rdtsc_ordered_lfence(void)					\
+	{								\
+		lfence();						\
+		return (rdtsc());					\
+	}								\
+	static uint64_t							\
+	rdtsc_ordered_mfence(void)					\
+	{								\
+		mfence();						\
+		return (rdtsc());					\
+	}								\
+	static inline uint64_t (*rdtsc_ordered_select(u_int amd_feature,	\
+	    u_int cpu_feature, bool cpu_is_amd))(void)			\
+	{								\
+		if ((amd_feature & AMDID_RDTSCP) != 0)			\
+			return (rdtscp);				\
+		if ((cpu_feature & CPUID_SSE2) == 0)			\
+			return (rdtsc);					\
+		if (cpu_is_amd)						\
+			return (rdtsc_ordered_mfence);			\
+		return (rdtsc_ordered_lfence);				\
+	}
+
+#ifdef _KERNEL
+/* Kernel: the resolver uses cpu_vendor_id, amd_feature and cpu_feature. */
+#define	DEFINE_RDTSC_ORDERED()						\
+	DEFINE_RDTSC_ORDERED_COMMON()					\
+	DEFINE_IFUNC(, uint64_t, rdtsc_ordered, (void))			\
+	{								\
+		bool cpu_is_amd = cpu_vendor_id == CPU_VENDOR_AMD ||	\
+		    cpu_vendor_id == CPU_VENDOR_HYGON;			\
+									\
+		return (rdtsc_ordered_select(amd_feature, cpu_feature,	\
+		    cpu_is_amd));					\
+	}
+#else /* !_KERNEL */
+/*
+ * Userland: the resolver compares the CPUID leaf 0 vendor string with the
+ * AMD and Hygon IDs and reads 'amd_feature' from leaf 0x80000001 when the
+ * highest extended leaf reported by 0x80000000 allows it.
+ */
+#define	DEFINE_RDTSC_ORDERED()						\
+	DEFINE_RDTSC_ORDERED_COMMON()					\
+	DEFINE_UIFUNC(, uint64_t, rdtsc_ordered, (void))		\
+	{								\
+		u_int amd_feature, cpu_exthigh, p[4], v[3];		\
+		static const char amd_id[] = "AuthenticAMD";		\
+		static const char hygon_id[] = "HygonGenuine";		\
+		bool cpu_is_amd;					\
+									\
+		do_cpuid(0, p);						\
+		v[0] = p[1];						\
+		v[1] = p[3];						\
+		v[2] = p[2];						\
+		cpu_is_amd = memcmp(v, amd_id, sizeof(amd_id) - 1) == 0 ||	\
+		    memcmp(v, hygon_id, sizeof(hygon_id) - 1) == 0;	\
+		if (cpu_feature != 0) {					\
+			do_cpuid(0x80000000, p);			\
+			cpu_exthigh = p[0];				\
+		} else {						\
+			cpu_exthigh = 0;				\
+		}							\
+		if (cpu_exthigh >= 0x80000001) {			\
+			do_cpuid(0x80000001, p);			\
+			amd_feature = p[3];				\
+		} else {						\
+			amd_feature = 0;				\
+		}							\
+		return (rdtsc_ordered_select(amd_feature, cpu_feature,	\
+		    cpu_is_amd));					\
+	}
+#endif /* _KERNEL */
+
+#endif /* !_X86__RDTSC_ORDERED_H_ */
Index: sys/x86/include/kvm.h
===================================================================
--- /dev/null
+++ sys/x86/include/kvm.h
@@ -0,0 +1,80 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2014 Bryan Venteicher
+ * Copyright (c) 2021 Mathieu Chouquet-Stringer
+ * Copyright (c) 2021 Juniper Networks, Inc.
+ * Copyright (c) 2021 Klara, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * Linux KVM paravirtualization: common definitions
+ *
+ * References:
+ *     - [1] https://www.kernel.org/doc/html/latest/virt/kvm/cpuid.html
+ *     - [2] https://www.kernel.org/doc/html/latest/virt/kvm/msr.html
+ */
+
+#ifndef _X86_KVM_H_
+#define	_X86_KVM_H_
+
+#include
+#include
+
+#include
+
+#define	KVM_CPUID_SIGNATURE			0x40000000
+#define	KVM_CPUID_FEATURES_LEAF			0x40000001
+
+#define	KVM_FEATURE_CLOCKSOURCE			0x00000001
+#define	KVM_FEATURE_CLOCKSOURCE2		0x00000008
+#define	KVM_FEATURE_CLOCKSOURCE_STABLE_BIT	0x01000000
+
+/* Deprecated: for the CLOCKSOURCE feature. */
+#define	KVM_MSR_WALL_CLOCK			0x11
+#define	KVM_MSR_SYSTEM_TIME			0x12
+
+#define	KVM_MSR_WALL_CLOCK_NEW			0x4b564d00
+#define	KVM_MSR_SYSTEM_TIME_NEW			0x4b564d01
+
+static inline bool
+kvm_cpuid_features_leaf_supported(void)
+{
+	return (vm_guest == VM_GUEST_KVM &&
+	    KVM_CPUID_FEATURES_LEAF > hv_base &&
+	    KVM_CPUID_FEATURES_LEAF <= hv_high);
+}
+
+static inline void
+kvm_cpuid_get_features(u_int *regs)
+{
+	if (!kvm_cpuid_features_leaf_supported())
+		regs[0] = regs[1] = regs[2] = regs[3] = 0;
+	else
+		do_cpuid(KVM_CPUID_FEATURES_LEAF, regs);
+}
+
+#endif /* !_X86_KVM_H_ */
Index: sys/x86/include/pvclock.h
===================================================================
--- sys/x86/include/pvclock.h
+++ sys/x86/include/pvclock.h
@@ -29,6 +29,16 @@
 #ifndef X86_PVCLOCK
 #define X86_PVCLOCK
 
+#include
+#include
+#include
+
+#ifdef _KERNEL
+#include
+#endif /* _KERNEL */
+
+#define	PVCLOCK_CDEVNAME		"pvclock"
+
 struct pvclock_vcpu_time_info {
 	uint32_t	version;
 	uint32_t	pad0;
@@ -43,12 +53,114 @@
 #define PVCLOCK_FLAG_TSC_STABLE		0x01
 #define PVCLOCK_FLAG_GUEST_PASUED	0x02
 
+/*
+ * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
+ * yielding a 64-bit result.
+ */
+static inline uint64_t
+pvclock_scale_delta(uint64_t delta, uint32_t mul_frac, int shift)
+{
+	uint64_t product;
+
+	if (shift < 0)
+		delta >>= -shift;
+	else
+		delta <<= shift;
+#if defined(__i386__)
+	{
+		uint32_t tmp1, tmp2;
+
+		/**
+		 * For i386, the formula looks like:
+		 *
+		 *   lower = (mul_frac * (delta & UINT_MAX)) >> 32
+		 *   upper = mul_frac * (delta >> 32)
+		 *   product = lower + upper
+		 */
+		__asm__ (
+			"mul  %5       ; "
+			"mov  %4,%%eax ; "
+			"mov  %%edx,%4 ; "
+			"mul  %5       ; "
+			"xor  %5,%5    ; "
+			"add  %4,%%eax ; "
+			"adc  %5,%%edx ; "
+			: "=A" (product), "=r" (tmp1), "=r" (tmp2)
+			: "a" ((uint32_t)delta), "1" ((uint32_t)(delta >> 32)),
+			  "2" (mul_frac) );
+	}
+#elif defined(__amd64__)
+	{
+		unsigned long tmp;
+
+		__asm__ (
+			"mulq %[mul_frac] ; shrd $32, %[hi], %[lo]"
+			: [lo]"=a" (product), [hi]"=d" (tmp)
+			: "0" (delta), [mul_frac]"rm"((uint64_t)mul_frac));
+	}
+#else
+#error "pvclock: unsupported x86 architecture?"
+#endif
+	return (product);
+}
+
+/*
+ * TSC delta since 'ti->tsc_timestamp', converted with 'tsc_to_system_mul'
+ * and 'tsc_shift' by pvclock_scale_delta().
+ */
+static inline uint64_t
+pvclock_get_nsec_offset(struct pvclock_vcpu_time_info *ti)
+{
+	uint64_t delta;
+
+	delta = rdtsc_ordered() - ti->tsc_timestamp;
+	return (pvclock_scale_delta(delta, ti->tsc_to_system_mul,
+	    ti->tsc_shift));
+}
+
+/*
+ * Take a consistent snapshot of 'ti': compute '*ns' and copy '*flags',
+ * retrying if the version read first is odd or differs when read again.
+ */
+static inline void
+pvclock_read_time_info(struct pvclock_vcpu_time_info *ti,
+    uint64_t *ns, uint8_t *flags)
+{
+	uint32_t version;
+
+	do {
+		version = atomic_load_acq_32(&ti->version);
+		*ns = ti->system_time + pvclock_get_nsec_offset(ti);
+		*flags = ti->flags;
+		atomic_thread_fence_acq();
+	} while ((version & 1) != 0 || ti->version != version);
+}
+
+#ifdef _KERNEL
+
+typedef struct pvclock_vcpu_time_info *pvclock_get_curcpu_timeinfo_t(void *arg);
+typedef struct pvclock_wall_clock *pvclock_get_wallclock_t(void *arg);
+
 struct pvclock_wall_clock {
 	uint32_t	version;
 	uint32_t	sec;
 	uint32_t	nsec;
 };
 
+struct pvclock {
+	/* Public; initialized by the caller of 'pvclock_init()': */
+	pvclock_get_curcpu_timeinfo_t	*get_curcpu_ti;
+	void				*get_curcpu_ti_arg;
+	pvclock_get_wallclock_t		*get_wallclock;
+	void				*get_wallclock_arg;
+	struct pvclock_vcpu_time_info	*ti_vcpu0_page;
+	bool				 stable_flag_supported;
+
+	/* Private; initialized by the 'pvclock' API: */
+	struct timecounter		 tc;
+	struct cdev			*cdev;
+};
+
 void		pvclock_resume(void);
 uint64_t	pvclock_get_last_cycles(void);
 uint64_t	pvclock_tsc_freq(struct pvclock_vcpu_time_info *ti);
@@ -56,4 +168,11 @@
 void		pvclock_get_wallclock(struct pvclock_wall_clock *wc,
 		    struct timespec *ts);
 
+void		pvclock_init(struct pvclock *pvc, device_t dev,
+		    const char *tc_name, int tc_quality, u_int tc_flags);
+void		pvclock_gettime(struct pvclock *pvc, struct timespec *ts);
+int		pvclock_destroy(struct pvclock *pvc);
+
+#endif /* _KERNEL */
+
 #endif
Index: sys/x86/include/rdtsc_ordered.h
===================================================================
--- sys/x86/include/rdtsc_ordered.h
+++ sys/x86/include/rdtsc_ordered.h
@@ -1,6 +1,8 @@
 /*-
- * Copyright (c) 2014, Bryan Venteicher
- * All rights reserved.
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2021 Juniper Networks, Inc.
+ * Copyright (c) 2021 Klara, Inc.
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -26,34 +28,11 @@ * $FreeBSD$ */ -#ifndef X86_PVCLOCK -#define X86_PVCLOCK - -struct pvclock_vcpu_time_info { - uint32_t version; - uint32_t pad0; - uint64_t tsc_timestamp; - uint64_t system_time; - uint32_t tsc_to_system_mul; - int8_t tsc_shift; - uint8_t flags; - uint8_t pad[2]; -}; - -#define PVCLOCK_FLAG_TSC_STABLE 0x01 -#define PVCLOCK_FLAG_GUEST_PASUED 0x02 +#ifndef _X86_RDTSC_ORDERED_H_ +#define _X86_RDTSC_ORDERED_H_ -struct pvclock_wall_clock { - uint32_t version; - uint32_t sec; - uint32_t nsec; -}; +#include -void pvclock_resume(void); -uint64_t pvclock_get_last_cycles(void); -uint64_t pvclock_tsc_freq(struct pvclock_vcpu_time_info *ti); -uint64_t pvclock_get_timecount(struct pvclock_vcpu_time_info *ti); -void pvclock_get_wallclock(struct pvclock_wall_clock *wc, - struct timespec *ts); +uint64_t rdtsc_ordered(void); -#endif +#endif /* !_X86_RDTSC_ORDERED_H_ */ Index: sys/x86/include/vdso.h =================================================================== --- sys/x86/include/vdso.h +++ sys/x86/include/vdso.h @@ -42,6 +42,7 @@ #define VDSO_TH_ALGO_X86_TSC VDSO_TH_ALGO_1 #define VDSO_TH_ALGO_X86_HPET VDSO_TH_ALGO_2 #define VDSO_TH_ALGO_X86_HVTSC VDSO_TH_ALGO_3 /* Hyper-V ref. 
TSC */
+#define	VDSO_TH_ALGO_X86_PVCLK	VDSO_TH_ALGO_4	/* KVM/XEN paravirtual clock */
 
 #ifdef _KERNEL
 #ifdef COMPAT_FREEBSD32
Index: sys/x86/x86/pvclock.c
===================================================================
--- sys/x86/x86/pvclock.c
+++ sys/x86/x86/pvclock.c
@@ -31,10 +31,18 @@
 
 #include
 #include
+#include
+#include
+#include
+#include
+#include
+#include
 #include
+#include
+
+#include
+#include
 
-#include
-#include
 #include
 #include
 
@@ -44,17 +52,33 @@
  */
 static volatile uint64_t pvclock_last_cycles;
 
+static u_int		pvclock_tc_get_timecount(struct timecounter *tc);
+static uint32_t		pvclock_tc_vdso_timehands(
+    struct vdso_timehands *vdso_th, struct timecounter *tc);
+#ifdef COMPAT_FREEBSD32
+static uint32_t		pvclock_tc_vdso_timehands32(
+    struct vdso_timehands32 *vdso_th, struct timecounter *tc);
+#endif
+
+static d_open_t		pvclock_cdev_open;
+static d_mmap_t		pvclock_cdev_mmap;
+
+static struct cdevsw	pvclock_cdev_cdevsw = {
+	.d_version = D_VERSION,
+	.d_name = PVCLOCK_CDEVNAME,
+	.d_open = pvclock_cdev_open,
+	.d_mmap = pvclock_cdev_mmap,
+};
+
 void
 pvclock_resume(void)
 {
-
 	atomic_store_rel_64(&pvclock_last_cycles, 0);
 }
 
 uint64_t
 pvclock_get_last_cycles(void)
 {
-
 	return (atomic_load_acq_64(&pvclock_last_cycles));
 }
 
@@ -64,109 +88,13 @@
 	uint64_t freq;
 
 	freq = (1000000000ULL << 32) / ti->tsc_to_system_mul;
-
 	if (ti->tsc_shift < 0)
 		freq <<= -ti->tsc_shift;
 	else
 		freq >>= ti->tsc_shift;
-
 	return (freq);
 }
 
-/*
- * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
- * yielding a 64-bit result.
- */
-static inline uint64_t
-pvclock_scale_delta(uint64_t delta, uint32_t mul_frac, int shift)
-{
-	uint64_t product;
-
-	if (shift < 0)
-		delta >>= -shift;
-	else
-		delta <<= shift;
-
-#if defined(__i386__)
-	{
-		uint32_t tmp1, tmp2;
-
-		/**
-		 * For i386, the formula looks like:
-		 *
-		 *   lower = (mul_frac * (delta & UINT_MAX)) >> 32
-		 *   upper = mul_frac * (delta >> 32)
-		 *   product = lower + upper
-		 */
-		__asm__ (
-			"mul  %5       ; "
-			"mov  %4,%%eax ; "
-			"mov  %%edx,%4 ; "
-			"mul  %5       ; "
-			"xor  %5,%5    ; "
-			"add  %4,%%eax ; "
-			"adc  %5,%%edx ; "
-			: "=A" (product), "=r" (tmp1), "=r" (tmp2)
-			: "a" ((uint32_t)delta), "1" ((uint32_t)(delta >> 32)),
-			  "2" (mul_frac) );
-	}
-#elif defined(__amd64__)
-	{
-		unsigned long tmp;
-
-		__asm__ (
-			"mulq %[mul_frac] ; shrd $32, %[hi], %[lo]"
-			: [lo]"=a" (product), [hi]"=d" (tmp)
-			: "0" (delta), [mul_frac]"rm"((uint64_t)mul_frac));
-	}
-#else
-#error "pvclock: unsupported x86 architecture?"
-#endif
-
-	return (product);
-}
-
-static uint64_t
-pvclock_get_nsec_offset(struct pvclock_vcpu_time_info *ti)
-{
-	uint64_t delta;
-
-	delta = rdtsc() - ti->tsc_timestamp;
-
-	return (pvclock_scale_delta(delta, ti->tsc_to_system_mul,
-	    ti->tsc_shift));
-}
-
-static void
-pvclock_read_time_info(struct pvclock_vcpu_time_info *ti,
-    uint64_t *cycles, uint8_t *flags)
-{
-	uint32_t version;
-
-	do {
-		version = ti->version;
-		rmb();
-		*cycles = ti->system_time + pvclock_get_nsec_offset(ti);
-		*flags = ti->flags;
-		rmb();
-	} while ((ti->version & 1) != 0 || ti->version != version);
-}
-
-static void
-pvclock_read_wall_clock(struct pvclock_wall_clock *wc, uint32_t *sec,
-    uint32_t *nsec)
-{
-	uint32_t version;
-
-	do {
-		version = wc->version;
-		rmb();
-		*sec = wc->sec;
-		*nsec = wc->nsec;
-		rmb();
-	} while ((wc->version & 1) != 0 || wc->version != version);
-}
-
 uint64_t
 pvclock_get_timecount(struct pvclock_vcpu_time_info *ti)
 {
@@ -174,7 +102,6 @@
 	uint8_t flags;
 
 	pvclock_read_time_info(ti, &now, &flags);
-
 	if (flags & PVCLOCK_FLAG_TSC_STABLE)
 		return (now);
 
@@ -188,16 +115,179 @@
 		if (last > now)
 			return (last);
 	} while (!atomic_cmpset_64(&pvclock_last_cycles, last, now));
-
 	return (now);
 }
 
 void
 pvclock_get_wallclock(struct pvclock_wall_clock *wc, struct timespec *ts)
 {
-	uint32_t sec, nsec;
+	uint32_t version;
+
+	do {
+		version = atomic_load_acq_32(&wc->version);
+		ts->tv_sec = wc->sec;
+		ts->tv_nsec = wc->nsec;
+		atomic_thread_fence_acq();
+	} while ((version & 1) != 0 || wc->version != version);
+}
+
+/* cdev open: reject opens that request write access with EPERM. */
+static int
+pvclock_cdev_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
+{
+	if ((oflags & FWRITE) != 0)
+		return (EPERM);
+	return (0);
+}
+
+/*
+ * cdev mmap: only offset 0 with read-only protection is accepted; the
+ * physical address returned is that of the memory behind 'si_drv1'.
+ */
+static int
+pvclock_cdev_mmap(struct cdev *dev, vm_ooffset_t offset, vm_paddr_t *paddr,
+    int nprot, vm_memattr_t *memattr)
+{
+	if (offset != 0)
+		return (EINVAL);
+	if (PROT_EXTRACT(nprot) != PROT_READ)
+		return (EACCES);
+	*paddr = vtophys(dev->si_drv1);
+	*memattr = VM_MEMATTR_DEFAULT;
+	return (0);
+}
+
+/*
+ * Timecounter read: the current CPU's pvclock time, truncated to u_int.
+ * The critical section covers the per-CPU time info lookup and read.
+ */
+static u_int
+pvclock_tc_get_timecount(struct timecounter *tc)
+{
+	struct pvclock *pvc = tc->tc_priv;
+	uint64_t ns;
+
+	critical_enter();
+	ns = pvclock_get_timecount(pvc->get_curcpu_ti(pvc->get_curcpu_ti_arg));
+	critical_exit();
+	return (ns & UINT_MAX);
+}
+
+/*
+ * Fill in the vDSO timehands for the pvclock algorithm.  The return value
+ * is nonzero only if the cdev exists, the stable flag is supported, and
+ * vCPU 0's time info currently has PVCLOCK_FLAG_TSC_STABLE set.
+ */
+static uint32_t
+pvclock_tc_vdso_timehands(struct vdso_timehands *vdso_th,
+    struct timecounter *tc)
+{
+	struct pvclock *pvc = tc->tc_priv;
+
+	vdso_th->th_algo = VDSO_TH_ALGO_X86_PVCLK;
+	vdso_th->th_x86_shift = 0;
+	vdso_th->th_x86_hpet_idx = 0;
+	bzero(vdso_th->th_res, sizeof(vdso_th->th_res));
+	return (pvc->cdev != NULL && pvc->stable_flag_supported &&
+	    (pvc->ti_vcpu0_page->flags & PVCLOCK_FLAG_TSC_STABLE) != 0);
+}
+
+#ifdef COMPAT_FREEBSD32
+/* 32-bit compat counterpart of pvclock_tc_vdso_timehands(). */
+static uint32_t
+pvclock_tc_vdso_timehands32(struct vdso_timehands32 *vdso_th,
+    struct timecounter *tc)
+{
+	struct pvclock *pvc = tc->tc_priv;
+
+	vdso_th->th_algo = VDSO_TH_ALGO_X86_PVCLK;
+	vdso_th->th_x86_shift = 0;
+	vdso_th->th_x86_hpet_idx = 0;
+	bzero(vdso_th->th_res, sizeof(vdso_th->th_res));
+	return (pvc->cdev != NULL && pvc->stable_flag_supported &&
+	    (pvc->ti_vcpu0_page->flags & PVCLOCK_FLAG_TSC_STABLE) != 0);
+}
+#endif
+
+/* Compute 'ts' as the wall clock value plus the pvclock system time. */
+void
+pvclock_gettime(struct pvclock *pvc, struct timespec *ts)
+{
+	struct timespec system_ts;
+	uint64_t system_ns;
+
+	pvclock_get_wallclock(pvc->get_wallclock(pvc->get_wallclock_arg), ts);
+	critical_enter();
+	system_ns =
+	    pvclock_get_timecount(pvc->get_curcpu_ti(pvc->get_curcpu_ti_arg));
+	critical_exit();
+	system_ts.tv_sec = system_ns / 1000000000ULL;
+	system_ts.tv_nsec = system_ns % 1000000000ULL;
+	timespecadd(ts, &system_ts, ts);
+}
+
+/*
+ * Set up and register the timecounter, create the /dev/PVCLOCK_CDEVNAME
+ * node backed by the vCPU 0 time info page, and register 'dev' as a clock.
+ * A cdev creation failure is only logged.
+ */
+void
+pvclock_init(struct pvclock *pvc, device_t dev, const char *tc_name,
+    int tc_quality, u_int tc_flags)
+{
+	struct make_dev_args mda;
+	int err;
+
+	KASSERT(((uintptr_t)pvc->ti_vcpu0_page & PAGE_MASK) == 0,
+	    ("Specified vCPU 0 time info page address not page-aligned."));
+
+	/* Set up timecounter and timecounter-supporting members: */
+	pvc->tc.tc_get_timecount = pvclock_tc_get_timecount;
+	pvc->tc.tc_poll_pps = NULL;
+	pvc->tc.tc_counter_mask = ~0U;
+	pvc->tc.tc_frequency = 1000000000ULL;
+	pvc->tc.tc_name = tc_name;
+	pvc->tc.tc_quality = tc_quality;
+	pvc->tc.tc_flags = tc_flags;
+	pvc->tc.tc_priv = pvc;
+	pvc->tc.tc_fill_vdso_timehands = pvclock_tc_vdso_timehands;
+#ifdef COMPAT_FREEBSD32
+	pvc->tc.tc_fill_vdso_timehands32 = pvclock_tc_vdso_timehands32;
+#endif
 
-	pvclock_read_wall_clock(wc, &sec, &nsec);
-	ts->tv_sec = sec;
-	ts->tv_nsec = nsec;
+	/* Set up cdev for userspace mmapping of vCPU 0 time info page: */
+	make_dev_args_init(&mda);
+	mda.mda_devsw = &pvclock_cdev_cdevsw;
+	mda.mda_uid = UID_ROOT;
+	mda.mda_gid = GID_WHEEL;
+	mda.mda_mode = 0444;
+	mda.mda_si_drv1 = pvc->ti_vcpu0_page;
+	err = make_dev_s(&mda, &pvc->cdev, PVCLOCK_CDEVNAME);
+	if (err != 0) {
+		device_printf(dev, "Could not create /dev/%s, error %d. Fast "
+		    "time of day will be unavailable for this timecounter.\n",
+		    PVCLOCK_CDEVNAME, err);
+		KASSERT(pvc->cdev == NULL,
+		    ("Failed make_dev_s() unexpectedly inited cdev."));
+	}
+
+	/* Register timecounter: */
+	tc_init(&pvc->tc);
+
+	/*
+	 * Register wallclock:
+	 *     The RTC registration API expects a resolution in microseconds;
+	 *     pvclock's 1ns resolution is rounded up to 1us.
+	 */
+	clock_register(dev, 1);
+}
+
+int
+pvclock_destroy(struct pvclock *pvc)
+{
+	/*
+	 * Not currently possible since there is no teardown counterpart of
+	 * 'tc_init()'.
+	 */
+	return (EBUSY);
 }
Index: sys/x86/x86/rdtsc_ordered.c
===================================================================
--- sys/x86/x86/rdtsc_ordered.c
+++ sys/x86/x86/rdtsc_ordered.c
@@ -1,6 +1,8 @@
 /*-
- * Copyright (c) 2014, Bryan Venteicher
- * All rights reserved.
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2021 Juniper Networks, Inc.
+ * Copyright (c) 2021 Klara, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -22,38 +24,11 @@
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
- * - * $FreeBSD$ */ -#ifndef X86_PVCLOCK -#define X86_PVCLOCK - -struct pvclock_vcpu_time_info { - uint32_t version; - uint32_t pad0; - uint64_t tsc_timestamp; - uint64_t system_time; - uint32_t tsc_to_system_mul; - int8_t tsc_shift; - uint8_t flags; - uint8_t pad[2]; -}; - -#define PVCLOCK_FLAG_TSC_STABLE 0x01 -#define PVCLOCK_FLAG_GUEST_PASUED 0x02 - -struct pvclock_wall_clock { - uint32_t version; - uint32_t sec; - uint32_t nsec; -}; +#include +__FBSDID("$FreeBSD$"); -void pvclock_resume(void); -uint64_t pvclock_get_last_cycles(void); -uint64_t pvclock_tsc_freq(struct pvclock_vcpu_time_info *ti); -uint64_t pvclock_get_timecount(struct pvclock_vcpu_time_info *ti); -void pvclock_get_wallclock(struct pvclock_wall_clock *wc, - struct timespec *ts); +#include -#endif +DEFINE_RDTSC_ORDERED();