diff --git a/sys/amd64/conf/GENERIC b/sys/amd64/conf/GENERIC --- a/sys/amd64/conf/GENERIC +++ b/sys/amd64/conf/GENERIC @@ -376,6 +376,9 @@ device virtio_scsi # VirtIO SCSI device device virtio_balloon # VirtIO Memory Balloon device +# Linux KVM paravirtualization support +device kvm_clock # KVM paravirtual clock driver + # HyperV drivers and enhancement support device hyperv # HyperV drivers diff --git a/sys/amd64/conf/MINIMAL b/sys/amd64/conf/MINIMAL --- a/sys/amd64/conf/MINIMAL +++ b/sys/amd64/conf/MINIMAL @@ -131,6 +131,9 @@ # Note that 'bpf' is required for DHCP. device bpf # Berkeley packet filter +# Linux KVM paravirtualization support +device kvm_clock # KVM paravirtual clock driver + # Xen HVM Guest Optimizations # NOTE: XENHVM depends on xenpci and xentimer. # They must be added or removed together. diff --git a/sys/amd64/conf/NOTES b/sys/amd64/conf/NOTES --- a/sys/amd64/conf/NOTES +++ b/sys/amd64/conf/NOTES @@ -498,6 +498,9 @@ device virtio_random # VirtIO Entropy device device virtio_console # VirtIO Console device +# Linux KVM paravirtualization support +device kvm_clock # KVM paravirtual clock driver + # Microsoft Hyper-V enhancement support device hyperv # HyperV drivers diff --git a/sys/conf/files.x86 b/sys/conf/files.x86 --- a/sys/conf/files.x86 +++ b/sys/conf/files.x86 @@ -263,6 +263,7 @@ dev/isci/scil/scif_sas_task_request_states.c optional isci dev/isci/scil/scif_sas_timer.c optional isci dev/itwd/itwd.c optional itwd +dev/kvm_clock/kvm_clock.c optional kvm_clock dev/qat/qat.c optional qat dev/qat/qat_ae.c optional qat dev/qat/qat_c2xxx.c optional qat @@ -318,7 +319,7 @@ x86/x86/mp_x86.c optional smp x86/x86/mp_watchdog.c optional mp_watchdog smp x86/x86/nexus.c standard -x86/x86/pvclock.c optional xenhvm +x86/x86/pvclock.c optional kvm_clock | xenhvm x86/x86/stack_machdep.c optional ddb | stack x86/x86/tsc.c standard x86/x86/ucode.c standard diff --git a/sys/dev/kvm_clock/kvm_clock.c b/sys/dev/kvm_clock/kvm_clock.c new file mode 100644 --- 
/dev/null +++ b/sys/dev/kvm_clock/kvm_clock.c @@ -0,0 +1,240 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014 Bryan Venteicher + * Copyright (c) 2021 Mathieu Chouquet-Stringer + * Copyright (c) 2021 Juniper Networks, Inc. + * Copyright (c) 2021 Klara, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */
+
+/*
+ * Linux KVM paravirtual clock support
+ *
+ * References:
+ * - [1] https://www.kernel.org/doc/html/latest/virt/kvm/cpuid.html
+ * - [2] https://www.kernel.org/doc/html/latest/virt/kvm/msr.html
+ */
+/* NOTE(review): #include operands are missing in this copy; restore them. */
+#include
+__FBSDID("$FreeBSD$");
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#include
+#include
+
+#include "clock_if.h"
+
+#define	KVM_CLOCK_DEVNAME		"kvmclock"
+/*
+ * Note: Chosen to be (1) above HPET's value (always 950), (2) above the TSC's
+ * default value of 800, and (3) below the TSC's value when it supports the
+ * "Invariant TSC" feature and is believed to be synchronized across all CPUs.
+ */
+#define	KVM_CLOCK_TC_QUALITY		975
+/* Per-device state; its size is recorded in kvm_clock_driver below. */
+struct kvm_clock_softc {
+	struct pvclock pvc;		/* passed to pvclock_init() */
+	struct pvclock_wall_clock wc;	/* phys addr goes to msr_wc */
+	struct pvclock_vcpu_time_info *timeinfos; /* mp_ncpus entries */
+	u_int msr_tc;			/* system time MSR number */
+	u_int msr_wc;			/* wall clock MSR number */
+};
+
+static devclass_t kvm_clock_devclass;
+
+static struct pvclock_wall_clock *kvm_clock_get_wallclock(void *arg);
+static void kvm_clock_system_time_enable(struct kvm_clock_softc *sc);
+static void kvm_clock_system_time_enable_pcpu(void *arg);
+/* Program msr_wc with the physical address of sc->wc; return &sc->wc. */
+static struct pvclock_wall_clock *
+kvm_clock_get_wallclock(void *arg)
+{
+	struct kvm_clock_softc *sc = arg;
+
+	wrmsr(sc->msr_wc, vtophys(&sc->wc));
+	return (&sc->wc);
+}
+/* Run kvm_clock_system_time_enable_pcpu(sc) through smp_rendezvous(). */
+static void
+kvm_clock_system_time_enable(struct kvm_clock_softc *sc)
+{
+	smp_rendezvous(NULL, kvm_clock_system_time_enable_pcpu, NULL, sc);
+}
+/* Write the calling CPU's timeinfos entry address, lsb set, to msr_tc. */
+static void
+kvm_clock_system_time_enable_pcpu(void *arg)
+{
+	struct kvm_clock_softc *sc = arg;
+
+	/*
+	 * See [2]; the lsb of this MSR is the system time enable bit.
+	 */
+	wrmsr(sc->msr_tc, vtophys(&(sc->timeinfos)[curcpu]) | 1);
+}
+/* Add a "kvmclock" child unless no clocksource bit is set or one exists. */
+static void
+kvm_clock_identify(driver_t *driver, device_t parent)
+{
+	u_int regs[4];
+
+	kvm_cpuid_get_features(regs);
+	if ((regs[0] &
+	    (KVM_FEATURE_CLOCKSOURCE2 | KVM_FEATURE_CLOCKSOURCE)) == 0)
+		return;
+	if (device_find_child(parent, KVM_CLOCK_DEVNAME, -1))
+		return;
+	BUS_ADD_CHILD(parent, 0, KVM_CLOCK_DEVNAME, 0);
+}
+/* Set the description string and return BUS_PROBE_DEFAULT. */
+static int
+kvm_clock_probe(device_t dev)
+{
+	device_set_desc(dev, "KVM paravirtual clock");
+	return (BUS_PROBE_DEFAULT);
+}
+/* Select MSRs from the feature bits, enable system time, init pvclock. */
+static int
+kvm_clock_attach(device_t dev)
+{
+	u_int regs[4];
+	struct kvm_clock_softc *sc = device_get_softc(dev);
+	bool stable_flag_supported;
+
+	/* Process KVM "features" CPUID leaf content: */
+	kvm_cpuid_get_features(regs);
+	if ((regs[0] & KVM_FEATURE_CLOCKSOURCE2) != 0) {
+		sc->msr_tc = KVM_MSR_SYSTEM_TIME_NEW;
+		sc->msr_wc = KVM_MSR_WALL_CLOCK_NEW;
+	} else {
+		KASSERT((regs[0] & KVM_FEATURE_CLOCKSOURCE) != 0,
+		    ("Clocksource feature flags disappeared since "
+		    "kvm_clock_identify: regs[0] %#0x.", regs[0]));
+		sc->msr_tc = KVM_MSR_SYSTEM_TIME;
+		sc->msr_wc = KVM_MSR_WALL_CLOCK;
+	}
+	stable_flag_supported =
+	    (regs[0] & KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) != 0;
+	/* NOTE(review): indexed by curcpu; confirm CPU IDs are < mp_ncpus. */
+	/* Set up 'struct pvclock_vcpu_time_info' page(s): */
+	sc->timeinfos = (struct pvclock_vcpu_time_info *)kmem_malloc(mp_ncpus *
+	    sizeof(struct pvclock_vcpu_time_info), M_WAITOK | M_ZERO);
+	kvm_clock_system_time_enable(sc);
+
+	/*
+	 * Init pvclock; register KVM clock wall clock, register KVM clock
+	 * timecounter, and set up the requisite infrastructure for vDSO access
+	 * to this timecounter.
+	 * Regarding 'tc_flags': Since the KVM MSR documentation does not
+	 * specifically discuss suspend/resume scenarios, conservatively
+	 * leave 'TC_FLAGS_SUSPEND_SAFE' cleared and assume that the system
+	 * time must be re-inited in such cases.
+	 */
+	sc->pvc.get_wallclock = kvm_clock_get_wallclock;
+	sc->pvc.get_wallclock_arg = sc;
+	sc->pvc.timeinfos = sc->timeinfos;
+	sc->pvc.stable_flag_supported = stable_flag_supported;
+	pvclock_init(&sc->pvc, dev, KVM_CLOCK_DEVNAME, KVM_CLOCK_TC_QUALITY, 0);
+	return (0);
+}
+/* Return whatever pvclock_destroy() reports for this device's pvclock. */
+static int
+kvm_clock_detach(device_t dev)
+{
+	struct kvm_clock_softc *sc = device_get_softc(dev);
+
+	return (pvclock_destroy(&sc->pvc));
+}
+/* Nothing to do on suspend; always returns 0. */
+static int
+kvm_clock_suspend(device_t dev)
+{
+	return (0);
+}
+/* Re-enable system time, then call pvclock_resume() and inittodr(). */
+static int
+kvm_clock_resume(device_t dev)
+{
+	/*
+	 * See note in 'kvm_clock_attach()' regarding 'TC_FLAGS_SUSPEND_SAFE';
+	 * conservatively assume that the system time must be re-inited in
+	 * suspend/resume scenarios.
+	 */
+	kvm_clock_system_time_enable(device_get_softc(dev));
+	pvclock_resume();
+	inittodr(time_second);
+	return (0);
+}
+/* clock_gettime method: fill *ts via pvclock_gettime(); returns 0. */
+static int
+kvm_clock_gettime(device_t dev, struct timespec *ts)
+{
+	struct kvm_clock_softc *sc = device_get_softc(dev);
+
+	pvclock_gettime(&sc->pvc, ts);
+	return (0);
+}
+/* clock_settime method: deliberately a no-op that reports success. */
+static int
+kvm_clock_settime(device_t dev, struct timespec *ts)
+{
+	/*
+	 * Even though it is not possible to set the KVM clock's wall clock, to
+	 * avoid the possibility of periodic benign error messages from
+	 * 'settime_task_func()', report success rather than, e.g., 'ENODEV'.
+	 */
+	return (0);
+}
+/* Method table: device lifecycle methods plus the clock interface. */
+static device_method_t kvm_clock_methods[] = {
+	DEVMETHOD(device_identify,	kvm_clock_identify),
+	DEVMETHOD(device_probe,		kvm_clock_probe),
+	DEVMETHOD(device_attach,	kvm_clock_attach),
+	DEVMETHOD(device_detach,	kvm_clock_detach),
+	DEVMETHOD(device_suspend,	kvm_clock_suspend),
+	DEVMETHOD(device_resume,	kvm_clock_resume),
+	/* clock interface */
+	DEVMETHOD(clock_gettime,	kvm_clock_gettime),
+	DEVMETHOD(clock_settime,	kvm_clock_settime),
+
+	DEVMETHOD_END
+};
+/* Driver description: name, method table, and softc size. */
+static driver_t kvm_clock_driver = {
+	KVM_CLOCK_DEVNAME,
+	kvm_clock_methods,
+	sizeof(struct kvm_clock_softc),
+};
+/* Register the driver under the nexus bus. */
+DRIVER_MODULE(kvm_clock, nexus, kvm_clock_driver, kvm_clock_devclass, 0, 0);
diff --git a/sys/i386/conf/GENERIC b/sys/i386/conf/GENERIC
--- a/sys/i386/conf/GENERIC
+++ b/sys/i386/conf/GENERIC
@@ -337,6 +337,9 @@
 device		virtio_scsi		# VirtIO SCSI device
 device		virtio_balloon		# VirtIO Memory Balloon device
 
+# Linux KVM paravirtualization support
+device		kvm_clock		# KVM paravirtual clock driver
+
 # HyperV drivers and enhancement support
 # NOTE: HYPERV depends on hyperv. They must be added or removed together.
 options 	HYPERV			# Kernel support for HyperV drivers
diff --git a/sys/i386/conf/MINIMAL b/sys/i386/conf/MINIMAL
--- a/sys/i386/conf/MINIMAL
+++ b/sys/i386/conf/MINIMAL
@@ -145,6 +145,9 @@
 # Note that 'bpf' is required for DHCP.
 device		bpf			# Berkeley packet filter
 
+# Linux KVM paravirtualization support
+device		kvm_clock		# KVM paravirtual clock driver
+
 # Xen HVM Guest Optimizations
 # NOTE: XENHVM depends on xenpci. They must be added or removed together.
options XENHVM # Xen HVM kernel infrastructure diff --git a/sys/i386/conf/NOTES b/sys/i386/conf/NOTES --- a/sys/i386/conf/NOTES +++ b/sys/i386/conf/NOTES @@ -719,6 +719,9 @@ device virtio_random # VirtIO Entropy device device virtio_console # VirtIO Console device +# Linux KVM paravirtualization support +device kvm_clock # KVM paravirtual clock driver + options HYPERV device hyperv # HyperV drivers diff --git a/sys/x86/include/kvm.h b/sys/x86/include/kvm.h new file mode 100644 --- /dev/null +++ b/sys/x86/include/kvm.h @@ -0,0 +1,80 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014 Bryan Venteicher + * Copyright (c) 2021 Mathieu Chouquet-Stringer + * Copyright (c) 2021 Juniper Networks, Inc. + * Copyright (c) 2021 Klara, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * Linux KVM paravirtualization: common definitions
+ *
+ * References:
+ * - [1] https://www.kernel.org/doc/html/latest/virt/kvm/cpuid.html
+ * - [2] https://www.kernel.org/doc/html/latest/virt/kvm/msr.html
+ */
+
+#ifndef _X86_KVM_H_
+#define _X86_KVM_H_
+/* NOTE(review): #include operands are missing in this copy; restore them. */
+#include
+#include
+
+#include
+/* CPUID leaf numbers; only the features leaf is queried in this header. */
+#define	KVM_CPUID_SIGNATURE		0x40000000
+#define	KVM_CPUID_FEATURES_LEAF		0x40000001
+/* Feature flags; callers test them against regs[0] of the features leaf. */
+#define	KVM_FEATURE_CLOCKSOURCE		0x00000001
+#define	KVM_FEATURE_CLOCKSOURCE2	0x00000008
+#define	KVM_FEATURE_CLOCKSOURCE_STABLE_BIT 0x01000000
+
+/* Deprecated: for the CLOCKSOURCE feature. */
+#define	KVM_MSR_WALL_CLOCK		0x11
+#define	KVM_MSR_SYSTEM_TIME		0x12
+/* MSR numbers selected when KVM_FEATURE_CLOCKSOURCE2 is advertised. */
+#define	KVM_MSR_WALL_CLOCK_NEW		0x4b564d00
+#define	KVM_MSR_SYSTEM_TIME_NEW		0x4b564d01
+/* True for a KVM guest whose (hv_base, hv_high] range covers the leaf. */
+static inline bool
+kvm_cpuid_features_leaf_supported(void)
+{
+	return (vm_guest == VM_GUEST_KVM &&
+	    KVM_CPUID_FEATURES_LEAF > hv_base &&
+	    KVM_CPUID_FEATURES_LEAF <= hv_high);
+}
+/* Fill regs[0..3] from the features leaf, or zero them if unsupported. */
+static inline void
+kvm_cpuid_get_features(u_int *regs)
+{
+	if (!kvm_cpuid_features_leaf_supported())
+		regs[0] = regs[1] = regs[2] = regs[3] = 0;
+	else
+		do_cpuid(KVM_CPUID_FEATURES_LEAF, regs);
+}
+
+#endif /* !_X86_KVM_H_ */