diff --git a/sys/x86/x86/local_apic.c b/sys/x86/x86/local_apic.c --- a/sys/x86/x86/local_apic.c +++ b/sys/x86/x86/local_apic.c @@ -1013,7 +1013,12 @@ enum lat_timer_mode newmode) { - if (la->la_timer_mode == newmode) + /* + * The TSC frequency may change during late calibration against other + * timecounters (HPET or ACPI PMTimer). + */ + if (la->la_timer_mode == newmode && + (newmode != LAT_MODE_DEADLINE || et->et_frequency == tsc_freq)) return; switch (newmode) { case LAT_MODE_PERIODIC: diff --git a/sys/x86/x86/tsc.c b/sys/x86/x86/tsc.c --- a/sys/x86/x86/tsc.c +++ b/sys/x86/x86/tsc.c @@ -32,12 +32,14 @@ #include "opt_clock.h" #include +#include #include #include #include #include #include -#include +#include +#include #include #include #include @@ -84,7 +86,7 @@ static int tsc_skip_calibration; SYSCTL_INT(_machdep, OID_AUTO, disable_tsc_calibration, CTLFLAG_RDTUN, &tsc_skip_calibration, 0, - "Disable TSC frequency calibration"); + "Disable early TSC frequency calibration"); static void tsc_freq_changed(void *arg, const struct cf_level *level, int status); @@ -134,14 +136,11 @@ } /* - * Calculate TSC frequency using information from the CPUID leaf 0x15 - * 'Time Stamp Counter and Nominal Core Crystal Clock'. If leaf 0x15 - * is not functional, as it is on Skylake/Kabylake, try 0x16 'Processor - * Frequency Information'. Leaf 0x16 is described in the SDM as - * informational only, but if 0x15 did not work, and TSC calibration - * is disabled, it is the best we can get at all. It should still be - * an improvement over the parsing of the CPU model name in - * tsc_freq_intel(), when available. + * Calculate TSC frequency using information from the CPUID leaf 0x15 'Time + * Stamp Counter and Nominal Core Crystal Clock'. If leaf 0x15 is not + * functional, as it is on Skylake/Kabylake, try 0x16 'Processor Frequency + * Information'. Leaf 0x16 is described in the SDM as informational only, but + * we can use this value until late calibration is complete. 
*/ static bool tsc_freq_cpuid(uint64_t *res) @@ -167,8 +166,8 @@ return (false); } -static void -tsc_freq_intel(void) +static bool +tsc_freq_intel_brand(uint64_t *res) { char brand[48]; u_int regs[4]; @@ -205,7 +204,7 @@ i = 1000000; break; default: - return; + return (false); } #define C2D(c) ((c) - '0') if (p[1] == '.') { @@ -221,17 +220,39 @@ freq *= i * 1000000; } #undef C2D - tsc_freq = freq; + *res = freq; + return (true); } } + return (false); } static void -probe_tsc_freq(void) +tsc_freq_8254(uint64_t *res) /* res: out, estimated TSC frequency in Hz */ { - uint64_t tmp_freq, tsc1, tsc2; - int no_cpuid_override; + uint64_t tsc1, tsc2; + int64_t overhead; + int count, i; + + overhead = 0; + for (i = 0, count = 8; i < count; i++) { + tsc1 = rdtsc_ordered(); + DELAY(0); + tsc2 = rdtsc_ordered(); + if (i > 0) /* the first pass is excluded from the sum */ + overhead += tsc2 - tsc1; + } + overhead /= count - 1; /* average over the count - 1 summed passes */ + + tsc1 = rdtsc_ordered(); + DELAY(100000); + tsc2 = rdtsc_ordered(); + *res = (tsc2 - tsc1 - overhead) * 10; /* 100000us sample scaled to one second; store via res, not the global */ +} +static void +probe_tsc_freq(void) +{ if (cpu_power_ecx & CPUID_PERF_STAT) { /* * XXX Some emulators expose host CPUID without actual support @@ -287,50 +308,44 @@ break; } - if (tsc_skip_calibration) { - if (tsc_freq_cpuid(&tmp_freq)) - tsc_freq = tmp_freq; - else if (cpu_vendor_id == CPU_VENDOR_INTEL) - tsc_freq_intel(); - if (tsc_freq == 0) - tsc_disabled = 1; - } else { + if (tsc_freq_cpuid(&tsc_freq)) { + /* + * If possible, use the value obtained from CPUID as the initial + * frequency. This will be refined later during boot but is + * good enough for now. The 8254 PIT is not functional on some + * newer platforms anyway, so don't delay our boot for what + * might be a garbage result. Late calibration is required if + * the initial frequency was obtained from CPUID.16H, as the + * derived value may be off by as much as 1%. + */ if (bootverbose) - printf("Calibrating TSC clock ... 
"); - tsc1 = rdtsc(); - DELAY(1000000); - tsc2 = rdtsc(); - tsc_freq = tsc2 - tsc1; - + printf("Early TSC frequency %juHz derived from CPUID\n", + (uintmax_t)tsc_freq); + } else if (tsc_skip_calibration) { /* - * If the difference between calibrated frequency and - * the frequency reported by CPUID 0x15/0x16 leafs - * differ significantly, this probably means that - * calibration is bogus. It happens on machines - * without 8254 timer. The BIOS rarely properly - * reports it in FADT boot flags, so just compare the - * frequencies directly. + * Try to parse the brand string to obtain the nominal TSC + * frequency. */ - if (tsc_freq_cpuid(&tmp_freq) && qabs(tsc_freq - tmp_freq) > - uqmin(tsc_freq, tmp_freq)) { - no_cpuid_override = 0; - TUNABLE_INT_FETCH("machdep.disable_tsc_cpuid_override", - &no_cpuid_override); - if (!no_cpuid_override) { - if (bootverbose) { - printf( - "TSC clock: calibration freq %ju Hz, CPUID freq %ju Hz%s\n", - (uintmax_t)tsc_freq, - (uintmax_t)tmp_freq, - no_cpuid_override ? "" : - ", doing CPUID override"); - } - tsc_freq = tmp_freq; - } + if (cpu_vendor_id == CPU_VENDOR_INTEL && + tsc_freq_intel_brand(&tsc_freq)) { + if (bootverbose) + printf( + "Early TSC frequency %juHz derived from brand string\n", + (uintmax_t)tsc_freq); + } else { + tsc_disabled = 1; } + } else { + /* + * Calibrate against the 8254 PIT. This estimate will be + * refined later in tsc_calib(). + */ + tsc_freq_8254(&tsc_freq); + if (bootverbose) + printf( + "Early TSC frequency %juHz calibrated from 8254 PIT\n", + (uintmax_t)tsc_freq); } - if (bootverbose) - printf("TSC clock: %ju Hz\n", (intmax_t)tsc_freq); } void @@ -372,13 +387,18 @@ break; } #endif - + probe_tsc_freq(); /* * Inform CPU accounting about our boot-time clock rate. This will * be updated if someone loads a cpufreq driver after boot that * discovers a new max frequency. 
+ * + * The frequency may also be updated after late calibration is complete; + * however, we register the TSC as the ticker now to avoid switching + * counters after much of the kernel has already booted and potentially + * sampled the CPU clock. */ if (tsc_freq != 0) set_cputicker(rdtsc, tsc_freq, !tsc_is_invariant); @@ -656,11 +676,65 @@ if (tsc_freq != 0) { tsc_timecounter.tc_frequency = tsc_freq >> shift; tsc_timecounter.tc_priv = (void *)(intptr_t)shift; - tc_init(&tsc_timecounter); + + /* + * Timecounter registration is deferred until after late + * calibration is finished. + */ } } SYSINIT(tsc_tc, SI_SUB_SMP, SI_ORDER_ANY, init_TSC_tc, NULL); +static void +tsc_update_freq(uint64_t new_freq) /* store new_freq in tsc_freq and the TSC timecounter */ +{ + atomic_store_rel_64(&tsc_freq, new_freq); + atomic_store_rel_64(&tsc_timecounter.tc_frequency, + new_freq >> (int)(intptr_t)tsc_timecounter.tc_priv); /* tc_priv holds the shift saved by init_TSC_tc() */ +} + +/* + * Perform late calibration of the TSC frequency once ACPI-based timecounters + * are available. + */ +static void +tsc_calib(void *arg __unused) +{ + sbintime_t t_start, t_end; + uint64_t freq_khz, tsc_start, tsc_end; + register_t flags; + int cpu; + + if (tsc_disabled) + return; + + flags = intr_disable(); /* take the TSC and uptime samples back to back */ + cpu = curcpu; /* remembered so the second sample uses the same CPU */ + tsc_start = rdtsc_ordered(); + t_start = sbinuptime(); + intr_restore(flags); + + DELAY(1000000); /* measurement interval between the two samples */ + + thread_lock(curthread); + sched_bind(curthread, cpu); /* bind to the CPU recorded at the first sample */ + + flags = intr_disable(); + tsc_end = rdtsc_ordered(); + t_end = sbinuptime(); + intr_restore(flags); + + sched_unbind(curthread); + thread_unlock(curthread); + + freq_khz = (SBT_1S / 1024) * (tsc_end - tsc_start) / (t_end - t_start); /* scaled down by 1024, undone below; presumably to keep the 64-bit product from overflowing - confirm */ + + tsc_update_freq(freq_khz * 1024); + tc_init(&tsc_timecounter); /* registration that init_TSC_tc() no longer performs */ + set_cputicker(rdtsc, tsc_freq, !tsc_is_invariant); /* same call as in init_TSC(), now with the recalibrated tsc_freq */ +} +SYSINIT(tsc_calib, SI_SUB_CLOCKS + 1, SI_ORDER_ANY, tsc_calib, NULL); + void resume_TSC(void) { @@ -752,9 +826,7 @@ /* Total setting for this level gives the new frequency in MHz. 
*/ freq = (uint64_t)level->total_set.freq * 1000000; - atomic_store_rel_64(&tsc_freq, freq); - tsc_timecounter.tc_frequency = - freq >> (int)(intptr_t)tsc_timecounter.tc_priv; + tsc_update_freq(freq); } static int @@ -767,14 +839,10 @@ if (freq == 0) return (EOPNOTSUPP); error = sysctl_handle_64(oidp, &freq, 0, req); - if (error == 0 && req->newptr != NULL) { - atomic_store_rel_64(&tsc_freq, freq); - atomic_store_rel_64(&tsc_timecounter.tc_frequency, - freq >> (int)(intptr_t)tsc_timecounter.tc_priv); - } + if (error == 0 && req->newptr != NULL) + tsc_update_freq(freq); return (error); } - SYSCTL_PROC(_machdep, OID_AUTO, tsc_freq, CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 0, 0, sysctl_machdep_tsc_freq, "QU",