Page Menu
Home
FreeBSD
Search
Configure Global Search
Log In
Files
F103210643
D29733.id91478.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Flag For Later
Award Token
Size
29 KB
Referenced Files
None
Subscribers
None
D29733.id91478.diff
View Options
Index: lib/libc/x86/sys/__vdso_gettc.c
===================================================================
--- lib/libc/x86/sys/__vdso_gettc.c
+++ lib/libc/x86/sys/__vdso_gettc.c
@@ -45,6 +45,7 @@
#include "un-namespace.h"
#include <machine/atomic.h>
#include <machine/cpufunc.h>
+#include <machine/pvclock.h>
#include <machine/specialreg.h>
#include <dev/acpica/acpi_hpet.h>
#ifdef WANT_HYPERV
@@ -93,6 +94,20 @@
return (rdtsc_low(th));
}
+/*
+ * Read the full 64-bit TSC, issuing lfence() immediately before rdtsc().
+ * Installed as 'ts_rdtsc' in the "Intel, LFENCE" tsc_selector[] slot.
+ */
+static uint64_t
+rdtsc_mb_lfence(void)
+{
+ lfence();
+ return (rdtsc());
+}
+
+/*
+ * Read the full 64-bit TSC, issuing mfence() immediately before rdtsc().
+ * Installed as 'ts_rdtsc' in the "AMD, MFENCE" tsc_selector[] slot.
+ */
+static uint64_t
+rdtsc_mb_mfence(void)
+{
+ mfence();
+ return (rdtsc());
+}
+
static u_int
rdtsc32_mb_lfence(void)
{
@@ -120,24 +135,29 @@
}
struct tsc_selector_tag {
+ uint64_t (*ts_rdtsc)(void); /* reader returning the full 64-bit TSC */
u_int (*ts_rdtsc32)(void);
u_int (*ts_rdtsc_low)(const struct vdso_timehands *);
};
static const struct tsc_selector_tag tsc_selector[] = {
[0] = { /* Intel, LFENCE */
+ .ts_rdtsc = rdtsc_mb_lfence,
.ts_rdtsc32 = rdtsc32_mb_lfence,
.ts_rdtsc_low = rdtsc_low_mb_lfence,
},
[1] = { /* AMD, MFENCE */
+ .ts_rdtsc = rdtsc_mb_mfence,
.ts_rdtsc32 = rdtsc32_mb_mfence,
.ts_rdtsc_low = rdtsc_low_mb_mfence,
},
[2] = { /* No SSE2 */
+ .ts_rdtsc = rdtsc,
.ts_rdtsc32 = rdtsc32_mb_none,
.ts_rdtsc_low = rdtsc_low_mb_none,
},
[3] = { /* RDTSCP */
+ .ts_rdtsc = rdtscp,
.ts_rdtsc32 = rdtscp32_,
.ts_rdtsc_low = rdtscp_low,
},
@@ -184,6 +204,11 @@
return (amd_cpu ? 1 : 0);
}
+/*
+ * DEFINE_UIFUNC body for __vdso_gettc_rdtsc(): yields the 'ts_rdtsc' function
+ * pointer of the tsc_selector[] slot picked by tsc_selector_idx(cpu_feature).
+ * NOTE(review): no caller of __vdso_gettc_rdtsc() is visible in this diff --
+ * confirm it is actually used.
+ */
+DEFINE_UIFUNC(static, uint64_t, __vdso_gettc_rdtsc, (void))
+{
+ return (tsc_selector[tsc_selector_idx(cpu_feature)].ts_rdtsc);
+}
+
DEFINE_UIFUNC(static, u_int, __vdso_gettc_rdtsc_low,
(const struct vdso_timehands *th))
{
@@ -312,6 +337,45 @@
#endif /* WANT_HYPERV */
+/*
+ * Userspace mapping of the time info page exported through
+ * "/dev/" PVCLOCK_CDEVNAME.  NULL until __vdso_init_pvclock_tsc() has been
+ * entered; MAP_FAILED while that setup is running or after it has failed.
+ */
+static struct pvclock_vcpu_time_info *pvclock_vcpu0_info;
+
+/*
+ * Produce a timecounter value from the pvclock time info 'ti'.
+ *
+ * Returns ENOSYS, leaving '*tc' untouched, when the sampled flags lack
+ * PVCLOCK_FLAG_TSC_STABLE.  Otherwise stores the nanosecond reading in '*tc'
+ * (the 64-bit value is narrowed to u_int by the assignment) and returns 0.
+ */
+static int
+__vdso_pvclock_tsc(struct pvclock_vcpu_time_info *ti, u_int *tc)
+{
+ uint64_t ns;
+ uint8_t flags;
+
+ pvclock_read_time_info(ti, &ns, &flags);
+
+ if ((flags & PVCLOCK_FLAG_TSC_STABLE) == 0)
+ return (ENOSYS);
+
+ *tc = ns;
+ return (0);
+}
+
+/*
+ * One-time setup of 'pvclock_vcpu0_info'.
+ *
+ * The caller that atomically replaces NULL with MAP_FAILED owns the
+ * initialization.  MAP_FAILED doubles as the failure value: every early
+ * return below leaves the pointer at MAP_FAILED, so __vdso_gettc() keeps
+ * returning ENOSYS.  The device is not opened when cap_getmode() reports
+ * that the process is in capability mode.
+ */
+static void
+__vdso_init_pvclock_tsc(void)
+{
+ int fd;
+ unsigned int mode;
+
+ if (atomic_cmpset_ptr((volatile uintptr_t *)&pvclock_vcpu0_info,
+ (uintptr_t)NULL, (uintptr_t)MAP_FAILED) != 0) {
+ if (cap_getmode(&mode) == 0 && mode != 0)
+ return;
+
+ /*
+ * O_CLOEXEC: keep the descriptor from leaking into a child if
+ * another thread forks and execs before the _close() below.
+ */
+ fd = _open("/dev/" PVCLOCK_CDEVNAME, O_RDONLY | O_CLOEXEC);
+ if (fd < 0)
+ return;
+
+ /* On failure mmap() itself yields MAP_FAILED. */
+ pvclock_vcpu0_info = mmap(NULL, sizeof(*pvclock_vcpu0_info),
+ PROT_READ, MAP_SHARED, fd, 0);
+
+ _close(fd);
+ }
+}
+
#pragma weak __vdso_gettc
int
__vdso_gettc(const struct vdso_timehands *th, u_int *tc)
@@ -347,6 +411,12 @@
return (ENOSYS);
return (__vdso_hyperv_tsc(hyperv_ref_tsc, tc));
#endif
+ case VDSO_TH_ALGO_X86_PVCLK:
+ if (pvclock_vcpu0_info == NULL)
+ __vdso_init_pvclock_tsc();
+ if (pvclock_vcpu0_info == MAP_FAILED)
+ return (ENOSYS);
+ return (__vdso_pvclock_tsc(pvclock_vcpu0_info, tc));
default:
return (ENOSYS);
}
Index: sys/amd64/conf/GENERIC
===================================================================
--- sys/amd64/conf/GENERIC
+++ sys/amd64/conf/GENERIC
@@ -376,6 +376,9 @@
device virtio_scsi # VirtIO SCSI device
device virtio_balloon # VirtIO Memory Balloon device
+# Linux KVM paravirtualization support
+device kvm_clock # KVM paravirtual clock driver
+
# HyperV drivers and enhancement support
device hyperv # HyperV drivers
Index: sys/amd64/conf/MINIMAL
===================================================================
--- sys/amd64/conf/MINIMAL
+++ sys/amd64/conf/MINIMAL
@@ -144,6 +144,9 @@
# Note that 'bpf' is required for DHCP.
device bpf # Berkeley packet filter
+# Linux KVM paravirtualization support
+device kvm_clock # KVM paravirtual clock driver
+
# Xen HVM Guest Optimizations
# NOTE: XENHVM depends on xenpci. They must be added or removed together.
options XENHVM # Xen HVM kernel infrastructure
Index: sys/amd64/conf/NOTES
===================================================================
--- sys/amd64/conf/NOTES
+++ sys/amd64/conf/NOTES
@@ -499,6 +499,9 @@
device virtio_random # VirtIO Entropy device
device virtio_console # VirtIO Console device
+# Linux KVM paravirtualization support
+device kvm_clock # KVM paravirtual clock driver
+
# Microsoft Hyper-V enhancement support
device hyperv # HyperV drivers
Index: sys/conf/files.x86
===================================================================
--- sys/conf/files.x86
+++ sys/conf/files.x86
@@ -276,6 +276,7 @@
dev/isci/scil/scif_sas_task_request_states.c optional isci
dev/isci/scil/scif_sas_timer.c optional isci
dev/itwd/itwd.c optional itwd
+dev/kvm_clock/kvm_clock.c optional kvm_clock
dev/qat/qat.c optional qat
dev/qat/qat_ae.c optional qat
dev/qat/qat_c2xxx.c optional qat
Index: sys/dev/kvm_clock/kvm_clock.c
===================================================================
--- /dev/null
+++ sys/dev/kvm_clock/kvm_clock.c
@@ -0,0 +1,334 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2014 Bryan Venteicher <bryanv@FreeBSD.org>
+ * Copyright (c) 2021 Mathieu Chouquet-Stringer
+ * Copyright (c) 2021 Juniper Networks, Inc.
+ * Copyright (c) 2021 Klara, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Linux KVM paravirtual clock support
+ *
+ * References:
+ * - [1] https://www.kernel.org/doc/html/latest/virt/kvm/cpuid.html
+ * - [2] https://www.kernel.org/doc/html/latest/virt/kvm/msr.html
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/smp.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+#include <machine/bus.h>
+#include <machine/pvclock.h>
+#include <x86/kvm.h>
+
+#include "clock_if.h"
+
+#define KVM_CLOCK_DEVNAME "kvmclock"
+/*
+ * Note: Chosen to be (1) above HPET's value (always 950), (2) above the TSC's
+ * default value of 800, and (3) below the TSC's value when it supports the
+ * "Invariant TSC" feature and is believed to be synchronized across all CPUs.
+ */
+#define KVM_CLOCK_TC_QUALITY 975
+
+/* Per-device state of the KVM clock driver. */
+struct kvm_clock_softc {
+ struct pvclock pvc; /* state passed to the pvclock API */
+ struct pvclock_wall_clock wc; /* its physical address is written to 'msr_wc' */
+ struct pvclock_vcpu_time_info *timeinfos; /* array indexed by CPU id */
+ bus_dmamap_t timeinfos_dmamap;
+ bus_dma_tag_t timeinfos_dmatag;
+ u_int msr_tc; /* system time MSR number, chosen in attach */
+ u_int msr_wc; /* wall clock MSR number, chosen in attach */
+};
+
+static devclass_t kvm_clock_devclass;
+
+static struct pvclock_vcpu_time_info *kvm_clock_get_curcpu_timeinfo(void *arg);
+static struct pvclock_wall_clock *kvm_clock_get_wallclock(void *arg);
+static void kvm_clock_system_time_enable(struct kvm_clock_softc *sc);
+static void kvm_clock_system_time_enable_pcpu(void *arg);
+static int kvm_clock_timeinfos_alloc(struct kvm_clock_softc *sc);
+static void kvm_clock_timeinfos_alloc_cb(void *timeinfos,
+ bus_dma_segment_t *segs, int nseg, int error);
+
+/*
+ * pvclock 'get_curcpu_ti' hook: 'arg' is the base of the per-CPU time info
+ * array; hand back the element belonging to the CPU we are running on.
+ */
+static struct pvclock_vcpu_time_info *
+kvm_clock_get_curcpu_timeinfo(void *arg)
+{
+ struct pvclock_vcpu_time_info *base;
+
+ base = arg;
+ return (base + curcpu);
+}
+
+/*
+ * pvclock 'get_wallclock' hook: write the physical address of the softc's
+ * wall clock structure to the wall clock MSR, then return a pointer to that
+ * structure.
+ * NOTE(review): presumably the hypervisor fills in 'sc->wc' in response to
+ * the MSR write (see [2]) -- confirm.
+ */
+static struct pvclock_wall_clock *
+kvm_clock_get_wallclock(void *arg)
+{
+ struct kvm_clock_softc *sc = arg;
+
+ wrmsr(sc->msr_wc, vtophys(&sc->wc));
+
+ return (&sc->wc);
+}
+
+/*
+ * Run kvm_clock_system_time_enable_pcpu() through smp_rendezvous(), passing
+ * 'sc' as its argument and no setup or teardown function.
+ */
+static void
+kvm_clock_system_time_enable(struct kvm_clock_softc *sc)
+{
+ smp_rendezvous(NULL, kvm_clock_system_time_enable_pcpu, NULL, sc);
+}
+
+/*
+ * smp_rendezvous() action: point the system time MSR at the executing CPU's
+ * slot of the time info array and set the enable bit.  'arg' is the softc.
+ */
+static void
+kvm_clock_system_time_enable_pcpu(void *arg)
+{
+ struct kvm_clock_softc *sc = arg;
+
+ /*
+ * See [2]; the lsb of this MSR is the system time enable bit.
+ */
+ wrmsr(sc->msr_tc, vtophys(&(sc->timeinfos)[curcpu]) | 1);
+}
+
+/*
+ * Allocate, zero and DMA-load the page-rounded array that holds one
+ * 'struct pvclock_vcpu_time_info' per CPU.
+ *
+ * Returns 0 on success with 'sc->timeinfos', 'sc->timeinfos_dmamap' and
+ * 'sc->timeinfos_dmatag' set up; otherwise returns the busdma error after
+ * releasing whatever had already been acquired.
+ */
+static int
+kvm_clock_timeinfos_alloc(struct kvm_clock_softc *sc)
+{
+ bus_size_t len;
+ int error;
+
+ len = round_page(mp_ncpus * sizeof(struct pvclock_vcpu_time_info));
+
+ error = bus_dma_tag_create(NULL, PAGE_SIZE, 0, BUS_SPACE_MAXADDR,
+ BUS_SPACE_MAXADDR, NULL, NULL, len, len / PAGE_SIZE, PAGE_SIZE, 0,
+ NULL, NULL, &sc->timeinfos_dmatag);
+ if (error != 0)
+ goto out;
+
+ error = bus_dmamem_alloc(sc->timeinfos_dmatag, (void **)&sc->timeinfos,
+ BUS_DMA_WAITOK | BUS_DMA_ZERO, &sc->timeinfos_dmamap);
+ if (error != 0)
+ goto out_tag;
+
+ error = bus_dmamap_load(sc->timeinfos_dmatag, sc->timeinfos_dmamap,
+ sc->timeinfos, len, kvm_clock_timeinfos_alloc_cb, sc->timeinfos,
+ BUS_DMA_NOWAIT);
+ if (error != 0)
+ goto out_mem;
+
+ return (0);
+
+out_mem:
+ bus_dmamem_free(sc->timeinfos_dmatag, sc->timeinfos,
+ sc->timeinfos_dmamap);
+out_tag:
+ (void)bus_dma_tag_destroy(sc->timeinfos_dmatag);
+out:
+ return (error);
+}
+
+/*
+ * bus_dmamap_load() callback.  Does nothing when 'error' is set.  Otherwise
+ * it only sanity-checks, via KASSERT, that the load produced one
+ * page-aligned, page-sized segment per page of the time info array and that
+ * each segment address equals vtophys() of the matching virtual page.
+ */
+static void
+kvm_clock_timeinfos_alloc_cb(void *timeinfos, bus_dma_segment_t *segs,
+ int nseg, int error)
+{
+ vm_paddr_t paddr;
+ int i, npages;
+
+
+ if (error != 0)
+ return;
+
+ npages = round_page(mp_ncpus * sizeof(struct pvclock_vcpu_time_info)) /
+ PAGE_SIZE;
+
+ KASSERT(nseg == npages, ("num segs %d is not num pages %d", nseg,
+ npages));
+
+ for (i = 0; i < nseg; i++) {
+ paddr = vtophys((uintptr_t)timeinfos + i * PAGE_SIZE);
+
+ KASSERT((segs[i].ds_addr & PAGE_MASK) == 0,
+ ("Segment %d address %#jx not page-aligned.", i,
+ segs[i].ds_addr));
+ KASSERT(segs[i].ds_len == PAGE_SIZE,
+ ("seg %i has non-page-sized len %ju", i, segs[i].ds_len));
+ KASSERT(paddr == segs[i].ds_addr,
+ ("seg paddr %#jx should be vtophys(vaddr) %#jx",
+ segs[i].ds_addr, paddr));
+ }
+}
+
+/*
+ * Bus identify method: add a KVM_CLOCK_DEVNAME child to 'parent' when the KVM
+ * features leaf advertises either clocksource feature and no such child
+ * exists yet.
+ */
+static void
+kvm_clock_identify(driver_t *driver, device_t parent)
+{
+ u_int features[4];
+ u_int clocksources;
+
+ kvm_cpuid_get_features(features);
+ clocksources = features[0] &
+ (KVM_FEATURE_CLOCKSOURCE2 | KVM_FEATURE_CLOCKSOURCE);
+
+ if (clocksources != 0 &&
+ device_find_child(parent, KVM_CLOCK_DEVNAME, -1) == NULL)
+ BUS_ADD_CHILD(parent, 0, KVM_CLOCK_DEVNAME, 0);
+}
+
+/*
+ * Probe method: set the device description and unconditionally report
+ * BUS_PROBE_DEFAULT.
+ */
+static int
+kvm_clock_probe(device_t dev)
+{
+ device_set_desc(dev, "KVM paravirtual clock");
+ return (BUS_PROBE_DEFAULT);
+}
+
+/*
+ * Attach method.  Selects the MSR pair from the KVM features leaf (the
+ * CLOCKSOURCE2 MSRs are preferred over the CLOCKSOURCE ones), allocates the
+ * per-CPU time info array, enables the system time on the CPUs and then
+ * hands everything to pvclock_init().
+ *
+ * Returns ENXIO when neither clocksource feature is advertised, the error
+ * from kvm_clock_timeinfos_alloc() when the allocation fails, and 0
+ * otherwise.
+ */
+static int
+kvm_clock_attach(device_t dev)
+{
+ u_int regs[4];
+ struct kvm_clock_softc *sc;
+ int err;
+ bool stable_flag_supported;
+
+ sc = device_get_softc(dev);
+
+ /* Process KVM "features" CPUID leaf content: */
+ kvm_cpuid_get_features(regs);
+ if ((regs[0] & KVM_FEATURE_CLOCKSOURCE2) != 0) {
+ sc->msr_tc = KVM_MSR_SYSTEM_TIME_NEW;
+ sc->msr_wc = KVM_MSR_WALL_CLOCK_NEW;
+ } else if ((regs[0] & KVM_FEATURE_CLOCKSOURCE) != 0) {
+ sc->msr_tc = KVM_MSR_SYSTEM_TIME;
+ sc->msr_wc = KVM_MSR_WALL_CLOCK;
+ } else
+ return (ENXIO);
+
+ stable_flag_supported =
+ ((regs[0] & KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) != 0);
+
+ /* Set up 'struct pvclock_vcpu_time_info' page(s): */
+ err = kvm_clock_timeinfos_alloc(sc);
+ if (err != 0)
+ return (err);
+
+ kvm_clock_system_time_enable(sc);
+
+ /*
+ * Init pvclock; register KVM clock wall clock, register KVM clock
+ * timecounter, and set up the requisite infrastructure for vDSO access
+ * to this timecounter.
+ * Regarding 'tc_flags': Since the KVM MSR documentation does not
+ * specifically discuss suspend/resume scenarios, conservatively
+ * leave 'TC_FLAGS_SUSPEND_SAFE' cleared and assume that the system
+ * time must be re-inited in such cases.
+ */
+ sc->pvc.get_curcpu_ti = kvm_clock_get_curcpu_timeinfo;
+ sc->pvc.get_curcpu_ti_arg = sc->timeinfos;
+ sc->pvc.get_wallclock = kvm_clock_get_wallclock;
+ sc->pvc.get_wallclock_arg = sc;
+ /* The array base, i.e. CPU 0's slot, is what userspace gets to mmap. */
+ sc->pvc.ti_vcpu0_page = sc->timeinfos;
+ sc->pvc.stable_flag_supported = stable_flag_supported;
+ pvclock_init(&sc->pvc, dev, KVM_CLOCK_DEVNAME, KVM_CLOCK_TC_QUALITY, 0);
+
+ return (0);
+}
+
+/*
+ * Detach method: defers to pvclock_destroy() and returns its result.
+ */
+static int
+kvm_clock_detach(device_t dev)
+{
+ struct kvm_clock_softc *sc;
+
+ sc = device_get_softc(dev);
+
+ return (pvclock_destroy(&sc->pvc));
+}
+
+/*
+ * Suspend method: nothing to do; always returns 0.
+ */
+static int
+kvm_clock_suspend(device_t dev)
+{
+ return (0);
+}
+
+/*
+ * Resume method: re-enable the system time via the MSRs, call
+ * pvclock_resume() and re-initialize the time of day through inittodr().
+ * Always returns 0.
+ */
+static int
+kvm_clock_resume(device_t dev)
+{
+ /*
+ * See note in 'kvm_clock_attach()' regarding 'TC_FLAGS_SUSPEND_SAFE';
+ * conservatively assume that the system time must be re-inited in
+ * suspend/resume scenarios.
+ */
+ kvm_clock_system_time_enable(device_get_softc(dev));
+ pvclock_resume();
+ inittodr(time_second);
+
+ return (0);
+}
+
+/*
+ * clock_gettime method: fill '*ts' through pvclock_gettime(); always
+ * returns 0.
+ */
+static int
+kvm_clock_gettime(device_t dev, struct timespec *ts)
+{
+ struct kvm_clock_softc *sc;
+
+ sc = device_get_softc(dev);
+
+ pvclock_gettime(&sc->pvc, ts);
+
+ return (0);
+}
+
+/*
+ * clock_settime method: deliberately a no-op that ignores 'ts' and reports
+ * success.
+ */
+static int
+kvm_clock_settime(device_t dev, struct timespec *ts)
+{
+ /*
+ * Even though it is not possible to set the KVM clock's wall clock, to
+ * avoid the possibility of periodic benign error messages from
+ * 'settime_task_func()', report success rather than, e.g., 'ENODEV'.
+ */
+ return (0);
+}
+
+/* Method table binding the device and clock interfaces to this driver. */
+static device_method_t kvm_clock_methods[] = {
+ /* device interface */
+ DEVMETHOD(device_identify, kvm_clock_identify),
+ DEVMETHOD(device_probe, kvm_clock_probe),
+ DEVMETHOD(device_attach, kvm_clock_attach),
+ DEVMETHOD(device_detach, kvm_clock_detach),
+ DEVMETHOD(device_suspend, kvm_clock_suspend),
+ DEVMETHOD(device_resume, kvm_clock_resume),
+ /* clock interface */
+ DEVMETHOD(clock_gettime, kvm_clock_gettime),
+ DEVMETHOD(clock_settime, kvm_clock_settime),
+
+ DEVMETHOD_END
+};
+
+/* Driver description: name, method table and softc size. */
+static driver_t kvm_clock_driver = {
+ KVM_CLOCK_DEVNAME,
+ kvm_clock_methods,
+ sizeof(struct kvm_clock_softc),
+};
+
+/* Register the driver as a child of the nexus bus. */
+DRIVER_MODULE(kvm_clock, nexus, kvm_clock_driver, kvm_clock_devclass, 0, 0);
Index: sys/i386/conf/GENERIC
===================================================================
--- sys/i386/conf/GENERIC
+++ sys/i386/conf/GENERIC
@@ -338,6 +338,9 @@
device virtio_scsi # VirtIO SCSI device
device virtio_balloon # VirtIO Memory Balloon device
+# Linux KVM paravirtualization support
+device kvm_clock # KVM paravirtual clock driver
+
# HyperV drivers and enhancement support
device hyperv # HyperV drivers
Index: sys/i386/conf/MINIMAL
===================================================================
--- sys/i386/conf/MINIMAL
+++ sys/i386/conf/MINIMAL
@@ -145,6 +145,9 @@
# Note that 'bpf' is required for DHCP.
device bpf # Berkeley packet filter
+# Linux KVM paravirtualization support
+device kvm_clock # KVM paravirtual clock driver
+
# Xen HVM Guest Optimizations
# NOTE: XENHVM depends on xenpci. They must be added or removed together.
options XENHVM # Xen HVM kernel infrastructure
Index: sys/i386/conf/NOTES
===================================================================
--- sys/i386/conf/NOTES
+++ sys/i386/conf/NOTES
@@ -719,6 +719,9 @@
device virtio_random # VirtIO Entropy device
device virtio_console # VirtIO Console device
+# Linux KVM paravirtualization support
+device kvm_clock # KVM paravirtual clock driver
+
device hyperv # HyperV drivers
#####################################################################
Index: sys/x86/include/kvm.h
===================================================================
--- /dev/null
+++ sys/x86/include/kvm.h
@@ -0,0 +1,80 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2014 Bryan Venteicher <bryanv@FreeBSD.org>
+ * Copyright (c) 2021 Mathieu Chouquet-Stringer
+ * Copyright (c) 2021 Juniper Networks, Inc.
+ * Copyright (c) 2021 Klara, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * Linux KVM paravirtualization: common definitions
+ *
+ * References:
+ * - [1] https://www.kernel.org/doc/html/latest/virt/kvm/cpuid.html
+ * - [2] https://www.kernel.org/doc/html/latest/virt/kvm/msr.html
+ */
+
+#ifndef _X86_KVM_H_
+#define _X86_KVM_H_
+
+#include <sys/types.h>
+#include <sys/systm.h>
+
+#include <machine/md_var.h>
+
+#define KVM_CPUID_SIGNATURE 0x40000000
+#define KVM_CPUID_FEATURES_LEAF 0x40000001
+
+#define KVM_FEATURE_CLOCKSOURCE 0x00000001
+#define KVM_FEATURE_CLOCKSOURCE2 0x00000008
+#define KVM_FEATURE_CLOCKSOURCE_STABLE_BIT 0x01000000
+
+/* Deprecated: for the CLOCKSOURCE feature. */
+#define KVM_MSR_WALL_CLOCK 0x11
+#define KVM_MSR_SYSTEM_TIME 0x12
+
+#define KVM_MSR_WALL_CLOCK_NEW 0x4b564d00
+#define KVM_MSR_SYSTEM_TIME_NEW 0x4b564d01
+
+/*
+ * Report whether the KVM features CPUID leaf may be queried: the guest type
+ * must be KVM and the leaf number must lie in the range (hv_base, hv_high].
+ */
+static inline bool
+kvm_cpuid_features_leaf_supported(void)
+{
+ if (vm_guest != VM_GUEST_KVM)
+ return (false);
+
+ return (hv_base < KVM_CPUID_FEATURES_LEAF &&
+ hv_high >= KVM_CPUID_FEATURES_LEAF);
+}
+
+/*
+ * Fill the four-element array 'regs' with the KVM features CPUID leaf, or
+ * with zeroes when that leaf is not available.
+ */
+static inline void
+kvm_cpuid_get_features(u_int *regs)
+{
+ if (kvm_cpuid_features_leaf_supported()) {
+ do_cpuid(KVM_CPUID_FEATURES_LEAF, regs);
+ return;
+ }
+
+ regs[0] = 0;
+ regs[1] = 0;
+ regs[2] = 0;
+ regs[3] = 0;
+}
+
+#endif /* !_X86_KVM_H_ */
Index: sys/x86/include/pvclock.h
===================================================================
--- sys/x86/include/pvclock.h
+++ sys/x86/include/pvclock.h
@@ -29,6 +29,12 @@
#ifndef X86_PVCLOCK
#define X86_PVCLOCK
+#ifdef _KERNEL
+#include <sys/timetc.h>
+#endif /* _KERNEL */
+
+#define PVCLOCK_CDEVNAME "pvclock"
+
struct pvclock_vcpu_time_info {
uint32_t version;
uint32_t pad0;
@@ -43,12 +49,110 @@
#define PVCLOCK_FLAG_TSC_STABLE 0x01
#define PVCLOCK_FLAG_GUEST_PASUED 0x02
+/*
+ * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
+ * yielding a 64-bit result.
+ *
+ * 'delta' is first shifted left by 'shift' bits (right by '-shift' bits when
+ * 'shift' is negative) and then multiplied by 'mul_frac' / 2^32.
+ */
+static inline uint64_t
+pvclock_scale_delta(uint64_t delta, uint32_t mul_frac, int shift)
+{
+ uint64_t product;
+
+ if (shift < 0)
+ delta >>= -shift;
+ else
+ delta <<= shift;
+
+#if defined(__i386__)
+ {
+ uint32_t tmp1, tmp2;
+
+ /**
+ * For i386, the formula looks like:
+ *
+ * lower = (mul_frac * (delta & UINT_MAX)) >> 32
+ * upper = mul_frac * (delta >> 32)
+ * product = lower + upper
+ */
+ __asm__ (
+ "mul %5 ; "
+ "mov %4,%%eax ; "
+ "mov %%edx,%4 ; "
+ "mul %5 ; "
+ "xor %5,%5 ; "
+ "add %4,%%eax ; "
+ "adc %5,%%edx ; "
+ : "=A" (product), "=r" (tmp1), "=r" (tmp2)
+ : "a" ((uint32_t)delta), "1" ((uint32_t)(delta >> 32)),
+ "2" (mul_frac) );
+ }
+#elif defined(__amd64__)
+ {
+ unsigned long tmp;
+
+ /* Full 64x64 multiply, then keep bits 32..95 of the result. */
+ __asm__ (
+ "mulq %[mul_frac] ; shrd $32, %[hi], %[lo]"
+ : [lo]"=a" (product), [hi]"=d" (tmp)
+ : "0" (delta), [mul_frac]"rm"((uint64_t)mul_frac));
+ }
+#else
+#error "pvclock: unsupported x86 architecture?"
+#endif
+
+ return (product);
+}
+
+/*
+ * Return the TSC ticks elapsed since 'ti->tsc_timestamp', scaled by
+ * 'ti->tsc_to_system_mul' and 'ti->tsc_shift' via pvclock_scale_delta().
+ * Callers in this file treat the result as nanoseconds.
+ */
+static inline uint64_t
+pvclock_get_nsec_offset(struct pvclock_vcpu_time_info *ti)
+{
+ uint64_t delta;
+
+ delta = rdtsc() - ti->tsc_timestamp;
+
+ return (pvclock_scale_delta(delta, ti->tsc_to_system_mul,
+ ti->tsc_shift));
+}
+
+/*
+ * Take a consistent snapshot of 'ti': '*ns' receives 'system_time' plus the
+ * scaled TSC offset and '*flags' receives the flags byte.  The read is
+ * retried while the version field is odd or differs from the value sampled
+ * at the start of the attempt.
+ * NOTE(review): presumably an odd version marks an update in progress by the
+ * hypervisor -- confirm against the pvclock ABI.
+ */
+static inline void
+pvclock_read_time_info(struct pvclock_vcpu_time_info *ti,
+ uint64_t *ns, uint8_t *flags)
+{
+ uint32_t version;
+
+ do {
+ version = ti->version;
+ rmb();
+ *ns = ti->system_time + pvclock_get_nsec_offset(ti);
+ *flags = ti->flags;
+ rmb();
+ } while ((ti->version & 1) != 0 || ti->version != version);
+}
+
+#ifdef _KERNEL
+
+/*
+ * Hook types supplied by a pvclock consumer; each is called with the
+ * matching '*_arg' member of 'struct pvclock'.
+ */
+typedef struct pvclock_vcpu_time_info *pvclock_get_curcpu_timeinfo_t(void *arg);
+typedef struct pvclock_wall_clock *pvclock_get_wallclock_t(void *arg);
+
struct pvclock_wall_clock {
uint32_t version;
uint32_t sec;
uint32_t nsec;
};
+/*
+ * State of one pvclock-backed timecounter / wall clock, shared between the
+ * hypervisor-specific driver and the common pvclock code.
+ */
+struct pvclock {
+ /* Public; initialized by the caller of 'pvclock_init()': */
+ pvclock_get_curcpu_timeinfo_t *get_curcpu_ti;
+ void *get_curcpu_ti_arg;
+ pvclock_get_wallclock_t *get_wallclock;
+ void *get_wallclock_arg;
+ struct pvclock_vcpu_time_info *ti_vcpu0_page; /* must be page-aligned */
+ bool stable_flag_supported;
+
+ /* Private; initialized by the 'pvclock' API: */
+ struct timecounter tc;
+ struct cdev *cdev;
+};
+
void pvclock_resume(void);
uint64_t pvclock_get_last_cycles(void);
uint64_t pvclock_tsc_freq(struct pvclock_vcpu_time_info *ti);
@@ -56,4 +160,11 @@
void pvclock_get_wallclock(struct pvclock_wall_clock *wc,
struct timespec *ts);
+/*
+ * Create the /dev/PVCLOCK_CDEVNAME node and register the timecounter and
+ * wall clock described by 'pvc'.
+ */
+void pvclock_init(struct pvclock *pvc, device_t dev,
+ const char *tc_name, int tc_quality, u_int tc_flags);
+/* Store the wall clock time plus the current system time in '*ts'. */
+void pvclock_gettime(struct pvclock *pvc, struct timespec *ts);
+/* Teardown counterpart of pvclock_init(); returns an errno value. */
+int pvclock_destroy(struct pvclock *pvc);
+
+#endif /* _KERNEL */
+
#endif
Index: sys/x86/include/vdso.h
===================================================================
--- sys/x86/include/vdso.h
+++ sys/x86/include/vdso.h
@@ -42,6 +42,7 @@
#define VDSO_TH_ALGO_X86_TSC VDSO_TH_ALGO_1
#define VDSO_TH_ALGO_X86_HPET VDSO_TH_ALGO_2
#define VDSO_TH_ALGO_X86_HVTSC VDSO_TH_ALGO_3 /* Hyper-V ref. TSC */
+#define VDSO_TH_ALGO_X86_PVCLK VDSO_TH_ALGO_4 /* KVM/XEN paravirtual clock */
#ifdef _KERNEL
#ifdef COMPAT_FREEBSD32
Index: sys/x86/x86/pvclock.c
===================================================================
--- sys/x86/x86/pvclock.c
+++ sys/x86/x86/pvclock.c
@@ -31,7 +31,17 @@
#include <sys/param.h>
#include <sys/systm.h>
+#include <sys/bus.h>
+#include <sys/clock.h>
+#include <sys/conf.h>
+#include <sys/fcntl.h>
+#include <sys/limits.h>
+#include <sys/mman.h>
#include <sys/proc.h>
+#include <sys/vdso.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
#include <machine/cpufunc.h>
#include <machine/cpu.h>
@@ -44,6 +54,24 @@
*/
static volatile uint64_t pvclock_last_cycles;
+static u_int pvclock_tc_get_timecount(struct timecounter *tc);
+static uint32_t pvclock_tc_vdso_timehands(
+ struct vdso_timehands *vdso_th, struct timecounter *tc);
+#ifdef COMPAT_FREEBSD32
+static uint32_t pvclock_tc_vdso_timehands32(
+ struct vdso_timehands32 *vdso_th, struct timecounter *tc);
+#endif
+
+static d_open_t pvclock_cdev_open;
+static d_mmap_t pvclock_cdev_mmap;
+
+/* Character device switch for /dev/PVCLOCK_CDEVNAME: open and mmap only. */
+static struct cdevsw pvclock_cdev_cdevsw = {
+ .d_version = D_VERSION,
+ .d_name = PVCLOCK_CDEVNAME,
+ .d_open = pvclock_cdev_open,
+ .d_mmap = pvclock_cdev_mmap,
+};
+
void
pvclock_resume(void)
{
@@ -73,100 +101,6 @@
return (freq);
}
-/*
- * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
- * yielding a 64-bit result.
- */
-static inline uint64_t
-pvclock_scale_delta(uint64_t delta, uint32_t mul_frac, int shift)
-{
- uint64_t product;
-
- if (shift < 0)
- delta >>= -shift;
- else
- delta <<= shift;
-
-#if defined(__i386__)
- {
- uint32_t tmp1, tmp2;
-
- /**
- * For i386, the formula looks like:
- *
- * lower = (mul_frac * (delta & UINT_MAX)) >> 32
- * upper = mul_frac * (delta >> 32)
- * product = lower + upper
- */
- __asm__ (
- "mul %5 ; "
- "mov %4,%%eax ; "
- "mov %%edx,%4 ; "
- "mul %5 ; "
- "xor %5,%5 ; "
- "add %4,%%eax ; "
- "adc %5,%%edx ; "
- : "=A" (product), "=r" (tmp1), "=r" (tmp2)
- : "a" ((uint32_t)delta), "1" ((uint32_t)(delta >> 32)),
- "2" (mul_frac) );
- }
-#elif defined(__amd64__)
- {
- unsigned long tmp;
-
- __asm__ (
- "mulq %[mul_frac] ; shrd $32, %[hi], %[lo]"
- : [lo]"=a" (product), [hi]"=d" (tmp)
- : "0" (delta), [mul_frac]"rm"((uint64_t)mul_frac));
- }
-#else
-#error "pvclock: unsupported x86 architecture?"
-#endif
-
- return (product);
-}
-
-static uint64_t
-pvclock_get_nsec_offset(struct pvclock_vcpu_time_info *ti)
-{
- uint64_t delta;
-
- delta = rdtsc() - ti->tsc_timestamp;
-
- return (pvclock_scale_delta(delta, ti->tsc_to_system_mul,
- ti->tsc_shift));
-}
-
-static void
-pvclock_read_time_info(struct pvclock_vcpu_time_info *ti,
- uint64_t *cycles, uint8_t *flags)
-{
- uint32_t version;
-
- do {
- version = ti->version;
- rmb();
- *cycles = ti->system_time + pvclock_get_nsec_offset(ti);
- *flags = ti->flags;
- rmb();
- } while ((ti->version & 1) != 0 || ti->version != version);
-}
-
-static void
-pvclock_read_wall_clock(struct pvclock_wall_clock *wc, uint32_t *sec,
- uint32_t *nsec)
-{
- uint32_t version;
-
- do {
- version = wc->version;
- rmb();
- *sec = wc->sec;
- *nsec = wc->nsec;
- rmb();
- } while ((wc->version & 1) != 0 || wc->version != version);
-}
-
uint64_t
pvclock_get_timecount(struct pvclock_vcpu_time_info *ti)
{
@@ -195,9 +129,164 @@
void
pvclock_get_wallclock(struct pvclock_wall_clock *wc, struct timespec *ts)
{
- uint32_t sec, nsec;
+ uint32_t version;
+
+ do {
+ version = wc->version;
+ rmb();
+ ts->tv_sec = wc->sec;
+ ts->tv_nsec = wc->nsec;
+ rmb();
+ } while ((wc->version & 1) != 0 || wc->version != version);
+}
- pvclock_read_wall_clock(wc, &sec, &nsec);
- ts->tv_sec = sec;
- ts->tv_nsec = nsec;
+/*
+ * d_open handler: the device is read-only, so an open that requests write
+ * access is refused with EPERM; any other open succeeds.
+ */
+static int
+pvclock_cdev_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
+{
+ return ((oflags & FWRITE) != 0 ? EPERM : 0);
+}
+
+/*
+ * d_mmap handler: only offset 0 may be mapped, and only with exactly
+ * PROT_READ.  On success '*paddr' is the physical address of the page stored
+ * in 'si_drv1' (set to 'ti_vcpu0_page' by pvclock_init()) and '*memattr' is
+ * VM_MEMATTR_DEFAULT.
+ *
+ * Returns EINVAL for a non-zero offset, EACCES for any other protection and
+ * 0 otherwise.
+ */
+static int
+pvclock_cdev_mmap(struct cdev *dev, vm_ooffset_t offset, vm_paddr_t *paddr,
+ int nprot, vm_memattr_t *memattr)
+{
+ if (offset != 0)
+ return (EINVAL);
+
+ if (PROT_EXTRACT(nprot) != PROT_READ)
+ return (EACCES);
+
+ *paddr = vtophys(dev->si_drv1);
+ *memattr = VM_MEMATTR_DEFAULT;
+
+ return (0);
+}
+
+/*
+ * Timecounter read method: sample the current CPU's time info inside a
+ * critical section and return the low bits (masked with UINT_MAX) of the
+ * 64-bit reading.
+ */
+static u_int
+pvclock_tc_get_timecount(struct timecounter *tc)
+{
+ struct pvclock *pvc = tc->tc_priv;
+ uint64_t ns;
+
+ critical_enter();
+ ns = pvclock_get_timecount(pvc->get_curcpu_ti(pvc->get_curcpu_ti_arg));
+ critical_exit();
+
+ return (ns & UINT_MAX);
+}
+
+/*
+ * 'tc_fill_vdso_timehands' method: tag 'vdso_th' with the
+ * VDSO_TH_ALGO_X86_PVCLK algorithm and clear the remaining x86 fields.
+ *
+ * Returns 1 only when the cdev was created, the stable flag is supported and
+ * the vCPU 0 time info currently carries PVCLOCK_FLAG_TSC_STABLE; 0
+ * otherwise.
+ * NOTE(review): presumably a zero return tells the timecounter code not to
+ * offer this algorithm to userspace -- confirm against the caller.
+ */
+static uint32_t
+pvclock_tc_vdso_timehands(struct vdso_timehands *vdso_th,
+ struct timecounter *tc)
+{
+ struct pvclock *pvc = tc->tc_priv;
+
+ vdso_th->th_algo = VDSO_TH_ALGO_X86_PVCLK;
+ vdso_th->th_x86_shift = 0;
+ vdso_th->th_x86_hpet_idx = 0;
+ bzero(vdso_th->th_res, sizeof(vdso_th->th_res));
+
+ return (pvc->cdev != NULL && pvc->stable_flag_supported &&
+ (pvc->ti_vcpu0_page->flags & PVCLOCK_FLAG_TSC_STABLE) != 0);
+}
+
+#ifdef COMPAT_FREEBSD32
+/*
+ * 'tc_fill_vdso_timehands32' method: same field setup and return condition
+ * as pvclock_tc_vdso_timehands(), applied to 'struct vdso_timehands32'.
+ */
+static uint32_t
+pvclock_tc_vdso_timehands32(struct vdso_timehands32 *vdso_th,
+ struct timecounter *tc)
+{
+ struct pvclock *pvc = tc->tc_priv;
+
+ vdso_th->th_algo = VDSO_TH_ALGO_X86_PVCLK;
+ vdso_th->th_x86_shift = 0;
+ vdso_th->th_x86_hpet_idx = 0;
+ bzero(vdso_th->th_res, sizeof(vdso_th->th_res));
+
+ return (pvc->cdev != NULL && pvc->stable_flag_supported &&
+ (pvc->ti_vcpu0_page->flags & PVCLOCK_FLAG_TSC_STABLE) != 0);
+}
+#endif
+
+/*
+ * Compute the current time of day into '*ts': read the wall clock obtained
+ * from the 'get_wallclock' hook, then add the current system time (sampled
+ * on the current CPU inside a critical section) converted from nanoseconds
+ * to a timespec.
+ */
+void
+pvclock_gettime(struct pvclock *pvc, struct timespec *ts)
+{
+ struct timespec system_ts;
+ uint64_t system_ns;
+
+ pvclock_get_wallclock(pvc->get_wallclock(pvc->get_wallclock_arg),
+ ts);
+
+ critical_enter();
+ system_ns =
+ pvclock_get_timecount(pvc->get_curcpu_ti(pvc->get_curcpu_ti_arg));
+ critical_exit();
+
+ system_ts.tv_sec = system_ns / 1000000000ULL;
+ system_ts.tv_nsec = system_ns % 1000000000ULL;
+
+ timespecadd(ts, &system_ts, ts);
+}
+
+/*
+ * Bring up a pvclock instance whose public members were filled in by the
+ * caller: set up the 1 GHz (nanosecond) timecounter named 'tc_name' with the
+ * given quality and flags, create the read-only /dev/PVCLOCK_CDEVNAME node
+ * backed by 'pvc->ti_vcpu0_page', register the timecounter and register
+ * 'dev' as a wall clock.
+ *
+ * A failure to create the device node is only logged; the timecounter and
+ * wall clock are registered regardless.
+ */
+void
+pvclock_init(struct pvclock *pvc, device_t dev, const char *tc_name,
+ int tc_quality, u_int tc_flags)
+{
+ struct make_dev_args mda;
+ int err;
+
+ KASSERT(((uintptr_t)pvc->ti_vcpu0_page & PAGE_MASK) == 0,
+ ("Specified vCPU 0 time info page address not page-aligned."));
+
+ /* Set up timecounter and timecounter-supporting members: */
+ pvc->tc.tc_get_timecount = pvclock_tc_get_timecount;
+ pvc->tc.tc_poll_pps = NULL;
+ pvc->tc.tc_counter_mask = ~0U;
+ pvc->tc.tc_frequency = 1000000000ULL;
+ pvc->tc.tc_name = tc_name;
+ pvc->tc.tc_quality = tc_quality;
+ pvc->tc.tc_flags = tc_flags;
+ pvc->tc.tc_priv = pvc;
+ pvc->tc.tc_fill_vdso_timehands = pvclock_tc_vdso_timehands;
+#ifdef COMPAT_FREEBSD32
+ pvc->tc.tc_fill_vdso_timehands32 = pvclock_tc_vdso_timehands32;
+#endif
+
+ /* Set up cdev for userspace mmapping of vCPU 0 time info page: */
+ make_dev_args_init(&mda);
+ mda.mda_devsw = &pvclock_cdev_cdevsw;
+ mda.mda_uid = UID_ROOT;
+ mda.mda_gid = GID_WHEEL;
+ mda.mda_mode = 0444;
+ mda.mda_si_drv1 = pvc->ti_vcpu0_page;
+ err = make_dev_s(&mda, &pvc->cdev, PVCLOCK_CDEVNAME);
+ if (err != 0) {
+ device_printf(dev, "Could not create /dev/%s, error %d. Fast "
+ "time of day will be unavailable for this timecounter.\n",
+ PVCLOCK_CDEVNAME, err);
+ KASSERT(pvc->cdev == NULL,
+ ("Failed make_dev_s() unexpectedly inited cdev."));
+ }
+
+ /* Register timecounter: */
+ tc_init(&pvc->tc);
+
+ /*
+ * Register wallclock:
+ * The RTC registration API expects a resolution in microseconds;
+ * pvclock's 1ns resolution is rounded up to 1us.
+ */
+ clock_register(dev, 1);
+}
+
+/*
+ * Teardown counterpart of pvclock_init().  Currently a stub that ignores
+ * 'pvc' and always fails with EBUSY.
+ */
+int
+pvclock_destroy(struct pvclock *pvc)
+{
+ /*
+ * Not currently possible since there is no teardown counterpart of
+ * 'tc_init()'.
+ */
+ return (EBUSY);
}
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Sat, Nov 23, 6:50 AM (18 h, 6 s)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
14792734
Default Alt Text
D29733.id91478.diff (29 KB)
Attached To
Mode
D29733: kvmclock driver with vDSO support
Attached
Detach File
Event Timeline
Log In to Comment