Changeset View
Standalone View
sys/dev/kvm_clock/kvm_clock.c
- This file was added.
/*- | |||||
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD | |||||
* | |||||
* Copyright (c) 2014 Bryan Venteicher <bryanv@FreeBSD.org> | |||||
* Copyright (c) 2021 Mathieu Chouquet-Stringer | |||||
* Copyright (c) 2021 Juniper Networks, Inc. | |||||
* Copyright (c) 2021 Klara, Inc. | |||||
* | |||||
* Redistribution and use in source and binary forms, with or without | |||||
* modification, are permitted provided that the following conditions | |||||
* are met: | |||||
* 1. Redistributions of source code must retain the above copyright | |||||
* notice, this list of conditions and the following disclaimer. | |||||
* 2. Redistributions in binary form must reproduce the above copyright | |||||
* notice, this list of conditions and the following disclaimer in the | |||||
* documentation and/or other materials provided with the distribution. | |||||
* | |||||
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND | |||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE | |||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | |||||
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |||||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | |||||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | |||||
* SUCH DAMAGE. | |||||
*/ | |||||
/* | |||||
* Linux KVM paravirtual clock support | |||||
* | |||||
* References: | |||||
* - [1] https://www.kernel.org/doc/html/latest/virt/kvm/cpuid.html | |||||
* - [2] https://www.kernel.org/doc/html/latest/virt/kvm/msr.html | |||||
*/ | |||||
#include <sys/cdefs.h> | |||||
__FBSDID("$FreeBSD$"); | |||||
#include <sys/param.h> | |||||
#include <sys/bus.h> | |||||
#include <sys/domainset.h> | |||||
#include <sys/kernel.h> | |||||
#include <sys/malloc.h> | |||||
#include <sys/module.h> | |||||
#include <sys/smp.h> | |||||
#include <vm/vm.h> | |||||
#include <vm/pmap.h> | |||||
#include <machine/pvclock.h> | |||||
#include <x86/kvm.h> | |||||
#include "clock_if.h" | |||||
#define KVM_CLOCK_DEVNAME "kvmclock" | |||||
/* | |||||
* Note: Chosen to be (1) above HPET's value (always 950), (2) above the TSC's | |||||
* default value of 800, and (3) below the TSC's value when it supports the | |||||
* "Invariant TSC" feature and is believed to be synchronized across all CPUs. | |||||
*/ | |||||
#define KVM_CLOCK_TC_QUALITY 975 | |||||
struct kvm_clock_softc { | |||||
struct pvclock pvc; | |||||
struct pvclock_wall_clock wc; | |||||
struct pvclock_vcpu_time_info *timeinfos; | |||||
u_int msr_tc; | |||||
u_int msr_wc; | |||||
}; | |||||
static devclass_t kvm_clock_devclass; | |||||
static struct pvclock_wall_clock *kvm_clock_get_wallclock(void *arg); | |||||
static void kvm_clock_system_time_enable(struct kvm_clock_softc *sc); | |||||
static void kvm_clock_system_time_enable_pcpu(void *arg); | |||||
static struct pvclock_wall_clock * | |||||
kvm_clock_get_wallclock(void *arg) | |||||
{ | |||||
struct kvm_clock_softc *sc = arg; | |||||
wrmsr(sc->msr_wc, vtophys(&sc->wc)); | |||||
return (&sc->wc); | |||||
} | |||||
static void | |||||
kvm_clock_system_time_enable(struct kvm_clock_softc *sc) | |||||
{ | |||||
smp_rendezvous(NULL, kvm_clock_system_time_enable_pcpu, NULL, sc); | |||||
} | |||||
static void | |||||
kvm_clock_system_time_enable_pcpu(void *arg) | |||||
kib: This blank line is excessive. | |||||
{ | |||||
struct kvm_clock_softc *sc = arg; | |||||
/* | |||||
* See [2]; the lsb of this MSR is the system time enable bit. | |||||
*/ | |||||
wrmsr(sc->msr_tc, vtophys(&(sc->timeinfos)[curcpu]) | 1); | |||||
} | |||||
static void | |||||
kvm_clock_identify(driver_t *driver, device_t parent) | |||||
{ | |||||
u_int regs[4]; | |||||
kvm_cpuid_get_features(regs); | |||||
if ((regs[0] & KVM_FEATURE_CLOCKSOURCE2) == 0 && | |||||
kibUnsubmitted Done Inline Actionsif ((regs[0] & (KVM_FEATURE_CLOCKSOURCE | KVM_FEATURE_CLOCKSOURCE2)) == 0) kib: ```
if ((regs[0] & (KVM_FEATURE_CLOCKSOURCE | KVM_FEATURE_CLOCKSOURCE2)) == 0)
``` | |||||
(regs[0] & KVM_FEATURE_CLOCKSOURCE) == 0) | |||||
return; | |||||
if (device_find_child(parent, KVM_CLOCK_DEVNAME, -1)) | |||||
return; | |||||
BUS_ADD_CHILD(parent, 0, KVM_CLOCK_DEVNAME, 0); | |||||
} | |||||
static int | |||||
kvm_clock_probe(device_t dev) | |||||
{ | |||||
device_set_desc(dev, "KVM paravirtual clock"); | |||||
return (BUS_PROBE_DEFAULT); | |||||
} | |||||
static int | |||||
kvm_clock_attach(device_t dev) | |||||
{ | |||||
u_int regs[4]; | |||||
struct kvm_clock_softc *sc = device_get_softc(dev); | |||||
bool stable_flag_supported; | |||||
/* Process KVM "features" CPUID leaf content: */ | |||||
kvm_cpuid_get_features(regs); | |||||
if ((regs[0] & KVM_FEATURE_CLOCKSOURCE2) != 0) { | |||||
sc->msr_tc = KVM_MSR_SYSTEM_TIME_NEW; | |||||
sc->msr_wc = KVM_MSR_WALL_CLOCK_NEW; | |||||
} else if ((regs[0] & KVM_FEATURE_CLOCKSOURCE) != 0) { | |||||
sc->msr_tc = KVM_MSR_SYSTEM_TIME; | |||||
sc->msr_wc = KVM_MSR_WALL_CLOCK; | |||||
} else | |||||
return (ENXIO); | |||||
kibUnsubmitted Done Inline ActionsShould this case be assert? Is it possible that _CLOCKSOURCE feature disappears? kib: Should this case be assert? Is it possible that _CLOCKSOURCE feature disappears? | |||||
stable_flag_supported = | |||||
((regs[0] & KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) != 0); | |||||
/* Set up 'struct pvclock_vcpu_time_info' page(s): */ | |||||
sc->timeinfos = malloc_aligned(round_page(mp_ncpus * | |||||
kibUnsubmitted Done Inline Actionsmalloc() for PAGE_SIZE and larger is a strange choice. Why not use kmem_malloc()? For instance, ldt allocation on x86 uses kmem_malloc(), and it is quite similar to kvm clock case. kib: malloc() for PAGE_SIZE and larger is a strange choice. Why not use kmem_malloc()?
For… | |||||
adam_fenn.ioAuthorUnsubmitted Done Inline ActionsI see that kmem_malloc(), internally, always returns page-aligned allocations, but does it formally guarantee this condition at the API level like malloc_{,domainset_}aligned(9) do? There also seem to be so few uses of it relative to malloc(9), especially among drivers---should we maintain that precedent here? adam_fenn.io: I see that `kmem_malloc()`, internally, always returns page-aligned allocations, but does it… | |||||
kibUnsubmitted Done Inline ActionsYes, the kmem_malloc() KPI fundamental property is that it operates on the page granularity. As I said, the intent there (kvmclock) matches the KPI purpose perfectly. It is one-off allocation from the platform-specific driver, which is tightly integrated with the rest of the system. kib: Yes, the kmem_malloc() KPI fundamental property is that it operates on the page granularity. | |||||
adam_fenn.ioAuthorUnsubmitted Done Inline ActionsOk, sounds good, then. Thanks for the clarification! adam_fenn.io: Ok, sounds good, then. Thanks for the clarification! | |||||
sizeof(struct pvclock_vcpu_time_info)), PAGE_SIZE, M_DEVBUF, | |||||
Done Inline ActionsOuter () are not needed. kib: Outer () are not needed. | |||||
M_WAITOK | M_ZERO); | |||||
kvm_clock_system_time_enable(sc); | |||||
Done Inline ActionsDo you really need to use the dma subsystem for this? Isn't it enough to just malloc a region of memory and use it? You could then also use M_WAITOK maybe? royger: Do you really need to use the dma subsystem for this? Isn't it enough to just malloc a region… | |||||
Done Inline Actions
I'd decided against malloc() since it doesn't provide API-level alignment guarantees; while, implementation-wise, it does look like >= PAGE_SIZE-ed malloc(9) allocations will be page-aligned, my read of malloc(9) (particularly section IMPLEMENTATION NOTES) is that this should not be assumed by callers. (FWIW, I had also noted (1) the precedent of hyperv's use of BUS_DMA(9) for its corresponding PV clock in-memory data and (2) internally, this draft's use of BUS_DMA(9) will ultimately do the same thing that the first draft was doing with its direct use of the physical memory pager). Just recapping, the intention is that timeinfos be readily mmap-able---the allocation containing timeinfos should start and end on a page boundary (with any unused portion of the last page zeroed). Does this choice make sense or am I missing something?
This choice was considering use cases where the driver is loaded during boot (whether compiled in or via loader.conf(5)). My thinking was that a failure to satisfy a relatively small allocation like this during boot would be an indication of far greater problems and, as such, it wouldn't be worth adding any blocking+reclaims+retries to the boot process, especially since the system can still function without kvmclock. But I suppose it's also ok to allow the reclaims+retries, so, I've switched bus_dmamem_alloc() from BUS_DMA_NOWAIT to BUS_DMA_WAITOK in this version as per this suggestion. adam_fenn.io: > Do you really need to use the dma subsystem for this? Isn't it enough to just malloc a region… | |||||
Done Inline Actions
Right, TBH I think malloc returning and address not aligned to a page boundary when allocating a region >= PAGE_SIZE would likely be a bug in the implementation.
Well, I think it's overly complicated (and cumbersome due to all the initialization involved) to use bus_dma just to allocate a memory region that's page aligned. I'm sure there are already instances in-kernel that assume that malloc(PAGE_SIZE) returns a page. I wouldn't be opposed to using: sc->timeinfos = malloc(size, ...); if ((sc->timeinfos & PAGE_MASK) != 0) { device_printf(..., "failed to allocate aligned memory region"); KASSERT(0, ("Failed to allocate page aligned region")); return (ENOMEM); } But I understand others might have different opinions. Maybe @kib could provide some insight on whether there's another way to allocate aligned memory regions without having to resort to bus_dma, I don't have that much insight on the VM subsystem.
Likely, if we fail to satisfy such small allocation it's likely the kernel won't be able to finish booting anyway. Adding a comment as to why BUS_DMA_NOWAIT is used (ie: not because the context prevents using BUS_DMA_WAITOK) would be OK also.
royger: > > Do you really need to use the dma subsystem for this? Isn't it enough to just malloc a… | |||||
Done Inline ActionsWhile malloc(9) can't give these guarantees, uma(8) can. Creating the tag, allocating and loading it are basically creating a uma zone behind the scenes and using them directly would be better than using busdma just to get alignment. imp: While malloc(9) can't give these guarantees, uma(8) can. Creating the tag, allocating and… | |||||
Done Inline ActionsWe have malloc_domainset_aligned(), added exactly to export the alignment guarantees from the malloc subsystem. kib: We have malloc_domainset_aligned(), added exactly to export the alignment guarantees from the… | |||||
Done Inline ActionsThanks! That's exactly what's needed here. I think this function has not been added to the malloc(9) man page? Would it make sense to also add some syntactic sugar and introduce a: #define malloc_aligned(s, a, t, f) malloc_domainset_aligned(s, a, t, DOMAINSET_RR(), f) Or similar. royger: Thanks! That's exactly what's needed here. I think this function has not been added to the… | |||||
Done Inline ActionsOf course, feel free to add. There were no users for it until now. kib: Of course, feel free to add. There were no users for it until now. | |||||
/* | |||||
Done Inline ActionsI don't think you need to strictly use contigmalloc here, as each pCPU will use vtophys to get the physical address of it's vcpu_time_info area? As the size of a vcpu_time_info is 32bytes, so it should never cross a page boundary provided the first element is properly aligned. royger: I don't think you need to strictly use contigmalloc here, as each pCPU will use vtophys to get… | |||||
Done Inline Actionsround_page() is not needed, kmem_malloc() handles roundup internally kib: round_page() is not needed, kmem_malloc() handles roundup internally | |||||
* Init pvclock; register KVM clock wall clock, register KVM clock | |||||
* timecounter, and set up the requisite infrastructure for vDSO access | |||||
* to this timecounter. | |||||
* Regarding 'tc_flags': Since the KVM MSR documentation does not | |||||
* specifically discuss suspend/resume scenarios, conservatively | |||||
* leave 'TC_FLAGS_SUSPEND_SAFE' cleared and assume that the system | |||||
* time must be re-inited in such cases. | |||||
*/ | |||||
sc->pvc.get_wallclock = kvm_clock_get_wallclock; | |||||
sc->pvc.get_wallclock_arg = sc; | |||||
sc->pvc.timeinfos = sc->timeinfos; | |||||
sc->pvc.stable_flag_supported = stable_flag_supported; | |||||
pvclock_init(&sc->pvc, dev, KVM_CLOCK_DEVNAME, KVM_CLOCK_TC_QUALITY, 0); | |||||
return (0); | |||||
} | |||||
static int | |||||
kvm_clock_detach(device_t dev) | |||||
{ | |||||
struct kvm_clock_softc *sc = device_get_softc(dev); | |||||
return (pvclock_destroy(&sc->pvc)); | |||||
} | |||||
static int | |||||
kvm_clock_suspend(device_t dev) | |||||
{ | |||||
return (0); | |||||
} | |||||
static int | |||||
kvm_clock_resume(device_t dev) | |||||
{ | |||||
/* | |||||
* See note in 'kvm_clock_attach()' regarding 'TC_FLAGS_SUSPEND_SAFE'; | |||||
* conservatively assume that the system time must be re-inited in | |||||
* suspend/resume scenarios. | |||||
*/ | |||||
kvm_clock_system_time_enable(device_get_softc(dev)); | |||||
pvclock_resume(); | |||||
inittodr(time_second); | |||||
return (0); | |||||
} | |||||
static int | |||||
kvm_clock_gettime(device_t dev, struct timespec *ts) | |||||
{ | |||||
struct kvm_clock_softc *sc = device_get_softc(dev); | |||||
pvclock_gettime(&sc->pvc, ts); | |||||
return (0); | |||||
} | |||||
static int | |||||
kvm_clock_settime(device_t dev, struct timespec *ts) | |||||
{ | |||||
/* | |||||
* Even though it is not possible to set the KVM clock's wall clock, to | |||||
* avoid the possibility of periodic benign error messages from | |||||
* 'settime_task_func()', report success rather than, e.g., 'ENODEV'. | |||||
*/ | |||||
return (0); | |||||
} | |||||
static device_method_t kvm_clock_methods[] = { | |||||
DEVMETHOD(device_identify, kvm_clock_identify), | |||||
DEVMETHOD(device_probe, kvm_clock_probe), | |||||
DEVMETHOD(device_attach, kvm_clock_attach), | |||||
DEVMETHOD(device_detach, kvm_clock_detach), | |||||
DEVMETHOD(device_suspend, kvm_clock_suspend), | |||||
DEVMETHOD(device_resume, kvm_clock_resume), | |||||
/* clock interface */ | |||||
DEVMETHOD(clock_gettime, kvm_clock_gettime), | |||||
DEVMETHOD(clock_settime, kvm_clock_settime), | |||||
Done Inline ActionsExcessive blank line. kib: Excessive blank line. | |||||
DEVMETHOD_END | |||||
Done Inline ActionsAnd this one. It would be too much spam to comment about all of them. kib: And this one. It would be too much spam to comment about all of them. | |||||
}; | |||||
static driver_t kvm_clock_driver = { | |||||
KVM_CLOCK_DEVNAME, | |||||
kvm_clock_methods, | |||||
sizeof(struct kvm_clock_softc), | |||||
}; | |||||
DRIVER_MODULE(kvm_clock, nexus, kvm_clock_driver, kvm_clock_devclass, 0, 0); | |||||
Done Inline ActionsCould the cdev creation be abstracted away, so that you only need to provide it a pointer to a struct pvclock_vcpu_time_info to use? There's no KVM specific logic in there, as it's only purpose is to map the hypervisor agnostic structure into user-space. royger: Could the cdev creation be abstracted away, so that you only need to provide it a pointer to a… | |||||
Done Inline ActionsSorry, forgot to comment: why do you need to map all the vCPU time infos in the pager? user-space should only use one of those (ie: from vCPU#0?) which seems to be already what the vDSO implementation does, but the pvclock device would expose all time infos which seems unnecessary. Might be easier to just use d_mmap here, and only allow mapping the first page (if there are multiple ones) that contains the pvclock mapped? See HyperV hyperv_tsc_mma which I think it's slightly easier as it doesn't use a pager. royger: Sorry, forgot to comment: why do you need to map all the vCPU time infos in the pager? user… | |||||
Done Inline ActionsWhy do you need newbus attachment for this thing? Due to the clock kobj interface? But is it required? kib: Why do you need newbus attachment for this thing? Due to the clock kobj interface? But is it… | |||||
Done Inline ActionsRight, all of this is as per the clock/RTC subsystem (sys/clock.h, kern/subr_rtc.c), which does use the clock kobj interface's entry points. adam_fenn.io: Right, all of this is as per the clock/RTC subsystem (`sys/clock.h`, `kern/subr_rtc.c`), which… |
This blank line is excessive.