diff --git a/sys/i386/i386/geode.c b/sys/i386/i386/geode.c index bef94a90629e..ee34c48157b0 100644 --- a/sys/i386/i386/geode.c +++ b/sys/i386/i386/geode.c @@ -1,383 +1,383 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2003-2004 Poul-Henning Kamp * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include static struct bios_oem bios_soekris = { { 0xf0000, 0xf1000 }, { { "Soekris", 0, 8 }, /* Soekris Engineering. */ { "net4", 0, 8 }, /* net45xx */ { "comBIOS", 0, 54 }, /* comBIOS ver. 1.26a 20040819 ... */ { NULL, 0, 0 }, } }; static struct bios_oem bios_soekris_55 = { { 0xf0000, 0xf1000 }, { { "Soekris", 0, 8 }, /* Soekris Engineering. */ { "net5", 0, 8 }, /* net5xxx */ { "comBIOS", 0, 54 }, /* comBIOS ver. 1.26a 20040819 ... */ { NULL, 0, 0 }, } }; static struct bios_oem bios_pcengines = { { 0xf9000, 0xfa000 }, { { "PC Engines WRAP", 0, 28 }, /* PC Engines WRAP.1C v1.03 */ { "tinyBIOS", 0, 28 }, /* tinyBIOS V1.4a (C)1997-2003 */ { NULL, 0, 0 }, } }; static struct bios_oem bios_pcengines_55 = { { 0xf9000, 0xfa000 }, { { "PC Engines ALIX", 0, 28 }, /* PC Engines ALIX */ { "tinyBIOS", 0, 28 }, /* tinyBIOS V1.4a (C)1997-2005 */ { NULL, 0, 0 }, } }; static struct bios_oem bios_advantech = { { 0xfe000, 0xff000 }, { { "**** PCM-582", 5, 33 }, /* PCM-5823 BIOS V1.12 ... */ { "GXm-Cx5530", -11, 35 }, /* 06/07/2002-GXm-Cx5530... 
*/ { NULL, 0, 0 }, } }; static unsigned cba; static unsigned gpio; static unsigned geode_counter; static struct cdev *led1, *led2, *led3; static int led1b, led2b, led3b; static void led_func(void *ptr, int onoff) { uint32_t u; int bit; bit = *(int *)ptr; if (bit < 0) { bit = -bit; onoff = !onoff; } u = inl(gpio + 4); if (onoff) u |= 1 << bit; else u &= ~(1 << bit); outl(gpio, u); } static void cs5536_led_func(void *ptr, int onoff) { int bit; uint16_t a; bit = *(int *)ptr; if (bit < 0) { bit = -bit; onoff = !onoff; } a = rdmsr(0x5140000c); if (bit >= 16) { a += 0x80; bit -= 16; } if (onoff) outl(a, 1 << bit); else outl(a, 1 << (bit + 16)); } static unsigned geode_get_timecount(struct timecounter *tc) { return (inl(geode_counter)); } static struct timecounter geode_timecounter = { geode_get_timecount, NULL, 0xffffffff, 27000000, "Geode", 1000 }; static uint64_t geode_cputicks(void) { unsigned c; static unsigned last; static uint64_t offset; c = inl(geode_counter); if (c < last) offset += (1LL << 32); last = c; return (offset | c); } /* * The GEODE watchdog runs from a 32kHz frequency. One period of that is * 31250 nanoseconds which we round down to 2^14 nanoseconds. The watchdog * consists of a power-of-two prescaler and a 16 bit counter, so the math * is quite simple. The max timeout is 14 + 16 + 13 = 2^43 nsec ~= 2h26m. */ static void geode_watchdog(void *foo __unused, u_int cmd, int *error) { u_int u, p, r; u = cmd & WD_INTERVAL; if (u >= 14 && u <= 43) { u -= 14; if (u > 16) { p = u - 16; u -= p; } else { p = 0; } if (u == 16) u = (1 << u) - 1; else u = 1 << u; r = inw(cba + 2) & 0xff00; outw(cba + 2, p | 0xf0 | r); outw(cba, u); *error = 0; } else { outw(cba, 0); } } /* * We run MFGPT0 off the 32kHz frequency and prescale by 16384 giving a * period of half a second. * Range becomes 2^30 (= 1 sec) to 2^44 (almost 5 hours) */ static void cs5536_watchdog(void *foo __unused, u_int cmd, int *error) { u_int u, p, s; uint16_t a; uint32_t m; a = rdmsr(0x5140000d); u = cmd & WD_INTERVAL; if (u >= 30 && u <= 44) { p = 1 << (u - 29); /* Set up MFGPT0, 32khz, prescaler 16k, C2 event */ outw(a + 6, 0x030e); /* set comparator 2 */ outw(a + 2, p); /* reset counter */ outw(a + 4, 0); /* Arm reset mechanism */ m = rdmsr(0x51400029); m |= (1 << 24); wrmsr(0x51400029, m); /* Start counter */ outw(a + 6, 0x8000); *error = 0; } else { /* * MFGPT_SETUP is write-once * Check if the counter has been setup */ s = inw(a + 6); if (s & (1 << 12)) { /* Stop and reset counter */ outw(a + 6, 0); outw(a + 4, 0); } } } /* * The Advantech PCM-582x watchdog expects 0x1 at I/O port 0x0443 * every 1.6 secs +/- 30%. Writing 0x0 disables the watchdog * NB: reading the I/O port enables the timer as well */ static void advantech_watchdog(void *foo __unused, u_int cmd, int *error) { u_int u; u = cmd & WD_INTERVAL; if (u > 0 && u <= WD_TO_1SEC) { outb(0x0443, 1); *error = 0; } else { outb(0x0443, 0); } } static int geode_probe(device_t self) { #define BIOS_OEM_MAXLEN 80 static u_char bios_oem[BIOS_OEM_MAXLEN] = "\0"; switch (pci_get_devid(self)) { case 0x0515100b: if (geode_counter == 0) { /* * The address of the CBA is written to this register * by the bios, see p161 in data sheet. 
*/ cba = pci_read_config(self, 0x64, 4); if (bootverbose) printf("Geode CBA@ 0x%x\n", cba); geode_counter = cba + 0x08; outl(cba + 0x0d, 2); if (bootverbose) printf("Geode rev: %02x %02x\n", inb(cba + 0x3c), inb(cba + 0x3d)); tc_init(&geode_timecounter); EVENTHANDLER_REGISTER(watchdog_list, geode_watchdog, NULL, 0); - set_cputicker(geode_cputicks, 27000000, 0); + set_cputicker(geode_cputicks, 27000000, false); } break; case 0x0510100b: gpio = pci_read_config(self, PCIR_BAR(0), 4); gpio &= ~0x1f; if (bootverbose) printf("Geode GPIO@ = %x\n", gpio); if (bios_oem_strings(&bios_soekris, bios_oem, sizeof bios_oem) > 0 ) { led1b = 20; led1 = led_create(led_func, &led1b, "error"); } else if (bios_oem_strings(&bios_pcengines, bios_oem, sizeof bios_oem) > 0 ) { led1b = -2; led2b = -3; led3b = -18; led1 = led_create(led_func, &led1b, "led1"); led2 = led_create(led_func, &led2b, "led2"); led3 = led_create(led_func, &led3b, "led3"); /* * Turn on first LED so we don't make * people think their box just died. */ led_func(&led1b, 1); } if (*bios_oem) printf("Geode %s\n", bios_oem); break; case 0x01011078: if (bios_oem_strings(&bios_advantech, bios_oem, sizeof bios_oem) > 0 ) { printf("Geode %s\n", bios_oem); EVENTHANDLER_REGISTER(watchdog_list, advantech_watchdog, NULL, 0); } break; case 0x20801022: if (bios_oem_strings(&bios_soekris_55, bios_oem, sizeof bios_oem) > 0 ) { led1b = 6; led1 = led_create(cs5536_led_func, &led1b, "error"); } else if (bios_oem_strings(&bios_pcengines_55, bios_oem, sizeof bios_oem) > 0 ) { led1b = -6; led2b = -25; led3b = -27; led1 = led_create(cs5536_led_func, &led1b, "led1"); led2 = led_create(cs5536_led_func, &led2b, "led2"); led3 = led_create(cs5536_led_func, &led3b, "led3"); /* * Turn on first LED so we don't make * people think their box just died. */ cs5536_led_func(&led1b, 1); } if (*bios_oem) printf("Geode LX: %s\n", bios_oem); if (bootverbose) printf("MFGPT bar: %jx\n", rdmsr(0x5140000d)); EVENTHANDLER_REGISTER(watchdog_list, cs5536_watchdog, NULL, 0); break; } return (ENXIO); } static int geode_attach(device_t self) { return(ENODEV); } static device_method_t geode_methods[] = { /* Device interface */ DEVMETHOD(device_probe, geode_probe), DEVMETHOD(device_attach, geode_attach), DEVMETHOD(device_suspend, bus_generic_suspend), DEVMETHOD(device_resume, bus_generic_resume), DEVMETHOD(device_shutdown, bus_generic_shutdown), {0, 0} }; static driver_t geode_driver = { "geode", geode_methods, 0, }; DRIVER_MODULE(geode, pci, geode_driver, 0, 0); diff --git a/sys/kern/kern_tc.c b/sys/kern/kern_tc.c index fcdec7a58200..37287df4d1fd 100644 --- a/sys/kern/kern_tc.c +++ b/sys/kern/kern_tc.c @@ -1,2263 +1,2263 @@ /*- * SPDX-License-Identifier: Beerware * * ---------------------------------------------------------------------------- * "THE BEER-WARE LICENSE" (Revision 42): * wrote this file. As long as you retain this notice you * can do whatever you want with this stuff. If we meet some day, and you think * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp * ---------------------------------------------------------------------------- * * Copyright (c) 2011, 2015, 2016 The FreeBSD Foundation * * Portions of this software were developed by Julien Ridoux at the University * of Melbourne under sponsorship from the FreeBSD Foundation. * * Portions of this software were developed by Konstantin Belousov * under sponsorship from the FreeBSD Foundation. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_ntp.h" #include "opt_ffclock.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * A large step happens on boot. This constant detects such steps. * It is relatively small so that ntp_update_second gets called enough * in the typical 'missed a couple of seconds' case, but doesn't loop * forever when the time step is large. */ #define LARGE_STEP 200 /* * Implement a dummy timecounter which we can use until we get a real one * in the air. This allows the console and other early stuff to use * time services. */ static u_int dummy_get_timecount(struct timecounter *tc) { static u_int now; return (++now); } static struct timecounter dummy_timecounter = { dummy_get_timecount, 0, ~0u, 1000000, "dummy", -1000000 }; struct timehands { /* These fields must be initialized by the driver. */ struct timecounter *th_counter; int64_t th_adjustment; uint64_t th_scale; u_int th_large_delta; u_int th_offset_count; struct bintime th_offset; struct bintime th_bintime; struct timeval th_microtime; struct timespec th_nanotime; struct bintime th_boottime; /* Fields not to be copied in tc_windup start with th_generation. */ u_int th_generation; struct timehands *th_next; }; static struct timehands ths[16] = { [0] = { .th_counter = &dummy_timecounter, .th_scale = (uint64_t)-1 / 1000000, .th_large_delta = 1000000, .th_offset = { .sec = 1 }, .th_generation = 1, }, }; static struct timehands *volatile timehands = &ths[0]; struct timecounter *timecounter = &dummy_timecounter; static struct timecounter *timecounters = &dummy_timecounter; /* Mutex to protect the timecounter list. */ static struct mtx tc_lock; int tc_min_ticktock_freq = 1; volatile time_t time_second = 1; volatile time_t time_uptime = 1; /* * The system time is always computed by summing the estimated boot time and the * system uptime. The timehands track boot time, but it changes when the system * time is set by the user, stepped by ntpd or adjusted when resuming. It * is set to new_time - uptime. */ static int sysctl_kern_boottime(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_kern, KERN_BOOTTIME, boottime, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_kern_boottime, "S,timeval", "Estimated system boottime"); SYSCTL_NODE(_kern, OID_AUTO, timecounter, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, ""); static SYSCTL_NODE(_kern_timecounter, OID_AUTO, tc, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, ""); static int timestepwarnings; SYSCTL_INT(_kern_timecounter, OID_AUTO, stepwarnings, CTLFLAG_RWTUN, ×tepwarnings, 0, "Log time steps"); static int timehands_count = 2; SYSCTL_INT(_kern_timecounter, OID_AUTO, timehands_count, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &timehands_count, 0, "Count of timehands in rotation"); struct bintime bt_timethreshold; struct bintime bt_tickthreshold; sbintime_t sbt_timethreshold; sbintime_t sbt_tickthreshold; struct bintime tc_tick_bt; sbintime_t tc_tick_sbt; int tc_precexp; int tc_timepercentage = TC_DEFAULTPERC; static int sysctl_kern_timecounter_adjprecision(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_kern_timecounter, OID_AUTO, alloweddeviation, CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, 0, 0, sysctl_kern_timecounter_adjprecision, "I", "Allowed time interval deviation in percents"); volatile int rtc_generation = 1; static int tc_chosen; /* Non-zero if a specific tc was chosen via sysctl. 
*/ static char tc_from_tunable[16]; static void tc_windup(struct bintime *new_boottimebin); static void cpu_tick_calibrate(int); void dtrace_getnanotime(struct timespec *tsp); void dtrace_getnanouptime(struct timespec *tsp); static int sysctl_kern_boottime(SYSCTL_HANDLER_ARGS) { struct timeval boottime; getboottime(&boottime); /* i386 is the only arch which uses a 32bits time_t */ #ifdef __amd64__ #ifdef SCTL_MASK32 int tv[2]; if (req->flags & SCTL_MASK32) { tv[0] = boottime.tv_sec; tv[1] = boottime.tv_usec; return (SYSCTL_OUT(req, tv, sizeof(tv))); } #endif #endif return (SYSCTL_OUT(req, &boottime, sizeof(boottime))); } static int sysctl_kern_timecounter_get(SYSCTL_HANDLER_ARGS) { u_int ncount; struct timecounter *tc = arg1; ncount = tc->tc_get_timecount(tc); return (sysctl_handle_int(oidp, &ncount, 0, req)); } static int sysctl_kern_timecounter_freq(SYSCTL_HANDLER_ARGS) { uint64_t freq; struct timecounter *tc = arg1; freq = tc->tc_frequency; return (sysctl_handle_64(oidp, &freq, 0, req)); } /* * Return the difference between the timehands' counter value now and what * was when we copied it to the timehands' offset_count. */ static __inline u_int tc_delta(struct timehands *th) { struct timecounter *tc; tc = th->th_counter; return ((tc->tc_get_timecount(tc) - th->th_offset_count) & tc->tc_counter_mask); } static __inline void bintime_add_tc_delta(struct bintime *bt, uint64_t scale, uint64_t large_delta, uint64_t delta) { uint64_t x; if (__predict_false(delta >= large_delta)) { /* Avoid overflow for scale * delta. */ x = (scale >> 32) * delta; bt->sec += x >> 32; bintime_addx(bt, x << 32); bintime_addx(bt, (scale & 0xffffffff) * delta); } else { bintime_addx(bt, scale * delta); } } /* * Functions for reading the time. We have to loop until we are sure that * the timehands that we operated on was not updated under our feet. See * the comment in for a description of these 12 functions. 
*/ static __inline void bintime_off(struct bintime *bt, u_int off) { struct timehands *th; struct bintime *btp; uint64_t scale; u_int delta, gen, large_delta; do { th = timehands; gen = atomic_load_acq_int(&th->th_generation); btp = (struct bintime *)((vm_offset_t)th + off); *bt = *btp; scale = th->th_scale; delta = tc_delta(th); large_delta = th->th_large_delta; atomic_thread_fence_acq(); } while (gen == 0 || gen != th->th_generation); bintime_add_tc_delta(bt, scale, large_delta, delta); } #define GETTHBINTIME(dst, member) \ do { \ _Static_assert(_Generic(((struct timehands *)NULL)->member, \ struct bintime: 1, default: 0) == 1, \ "struct timehands member is not of struct bintime type"); \ bintime_off(dst, __offsetof(struct timehands, member)); \ } while (0) static __inline void getthmember(void *out, size_t out_size, u_int off) { struct timehands *th; u_int gen; do { th = timehands; gen = atomic_load_acq_int(&th->th_generation); memcpy(out, (char *)th + off, out_size); atomic_thread_fence_acq(); } while (gen == 0 || gen != th->th_generation); } #define GETTHMEMBER(dst, member) \ do { \ _Static_assert(_Generic(*dst, \ __typeof(((struct timehands *)NULL)->member): 1, \ default: 0) == 1, \ "*dst and struct timehands member have different types"); \ getthmember(dst, sizeof(*dst), __offsetof(struct timehands, \ member)); \ } while (0) #ifdef FFCLOCK void fbclock_binuptime(struct bintime *bt) { GETTHBINTIME(bt, th_offset); } void fbclock_nanouptime(struct timespec *tsp) { struct bintime bt; fbclock_binuptime(&bt); bintime2timespec(&bt, tsp); } void fbclock_microuptime(struct timeval *tvp) { struct bintime bt; fbclock_binuptime(&bt); bintime2timeval(&bt, tvp); } void fbclock_bintime(struct bintime *bt) { GETTHBINTIME(bt, th_bintime); } void fbclock_nanotime(struct timespec *tsp) { struct bintime bt; fbclock_bintime(&bt); bintime2timespec(&bt, tsp); } void fbclock_microtime(struct timeval *tvp) { struct bintime bt; fbclock_bintime(&bt); bintime2timeval(&bt, tvp); } void fbclock_getbinuptime(struct bintime *bt) { GETTHMEMBER(bt, th_offset); } void fbclock_getnanouptime(struct timespec *tsp) { struct bintime bt; GETTHMEMBER(&bt, th_offset); bintime2timespec(&bt, tsp); } void fbclock_getmicrouptime(struct timeval *tvp) { struct bintime bt; GETTHMEMBER(&bt, th_offset); bintime2timeval(&bt, tvp); } void fbclock_getbintime(struct bintime *bt) { GETTHMEMBER(bt, th_bintime); } void fbclock_getnanotime(struct timespec *tsp) { GETTHMEMBER(tsp, th_nanotime); } void fbclock_getmicrotime(struct timeval *tvp) { GETTHMEMBER(tvp, th_microtime); } #else /* !FFCLOCK */ void binuptime(struct bintime *bt) { GETTHBINTIME(bt, th_offset); } void nanouptime(struct timespec *tsp) { struct bintime bt; binuptime(&bt); bintime2timespec(&bt, tsp); } void microuptime(struct timeval *tvp) { struct bintime bt; binuptime(&bt); bintime2timeval(&bt, tvp); } void bintime(struct bintime *bt) { GETTHBINTIME(bt, th_bintime); } void nanotime(struct timespec *tsp) { struct bintime bt; bintime(&bt); bintime2timespec(&bt, tsp); } void microtime(struct timeval *tvp) { struct bintime bt; bintime(&bt); bintime2timeval(&bt, tvp); } void getbinuptime(struct bintime *bt) { GETTHMEMBER(bt, th_offset); } void getnanouptime(struct timespec *tsp) { struct bintime bt; GETTHMEMBER(&bt, th_offset); bintime2timespec(&bt, tsp); } void getmicrouptime(struct timeval *tvp) { struct bintime bt; GETTHMEMBER(&bt, th_offset); bintime2timeval(&bt, tvp); } void getbintime(struct bintime *bt) { GETTHMEMBER(bt, th_bintime); } void getnanotime(struct timespec 
*tsp) { GETTHMEMBER(tsp, th_nanotime); } void getmicrotime(struct timeval *tvp) { GETTHMEMBER(tvp, th_microtime); } #endif /* FFCLOCK */ void getboottime(struct timeval *boottime) { struct bintime boottimebin; getboottimebin(&boottimebin); bintime2timeval(&boottimebin, boottime); } void getboottimebin(struct bintime *boottimebin) { GETTHMEMBER(boottimebin, th_boottime); } #ifdef FFCLOCK /* * Support for feed-forward synchronization algorithms. This is heavily inspired * by the timehands mechanism but kept independent from it. *_windup() functions * have some connection to avoid accessing the timecounter hardware more than * necessary. */ /* Feed-forward clock estimates kept updated by the synchronization daemon. */ struct ffclock_estimate ffclock_estimate; struct bintime ffclock_boottime; /* Feed-forward boot time estimate. */ uint32_t ffclock_status; /* Feed-forward clock status. */ int8_t ffclock_updated; /* New estimates are available. */ struct mtx ffclock_mtx; /* Mutex on ffclock_estimate. */ struct fftimehands { struct ffclock_estimate cest; struct bintime tick_time; struct bintime tick_time_lerp; ffcounter tick_ffcount; uint64_t period_lerp; volatile uint8_t gen; struct fftimehands *next; }; #define NUM_ELEMENTS(x) (sizeof(x) / sizeof(*x)) static struct fftimehands ffth[10]; static struct fftimehands *volatile fftimehands = ffth; static void ffclock_init(void) { struct fftimehands *cur; struct fftimehands *last; memset(ffth, 0, sizeof(ffth)); last = ffth + NUM_ELEMENTS(ffth) - 1; for (cur = ffth; cur < last; cur++) cur->next = cur + 1; last->next = ffth; ffclock_updated = 0; ffclock_status = FFCLOCK_STA_UNSYNC; mtx_init(&ffclock_mtx, "ffclock lock", NULL, MTX_DEF); } /* * Reset the feed-forward clock estimates. Called from inittodr() to get things * kick started and uses the timecounter nominal frequency as a first period * estimate. Note: this function may be called several time just after boot. * Note: this is the only function that sets the value of boot time for the * monotonic (i.e. uptime) version of the feed-forward clock. */ void ffclock_reset_clock(struct timespec *ts) { struct timecounter *tc; struct ffclock_estimate cest; tc = timehands->th_counter; memset(&cest, 0, sizeof(struct ffclock_estimate)); timespec2bintime(ts, &ffclock_boottime); timespec2bintime(ts, &(cest.update_time)); ffclock_read_counter(&cest.update_ffcount); cest.leapsec_next = 0; cest.period = ((1ULL << 63) / tc->tc_frequency) << 1; cest.errb_abs = 0; cest.errb_rate = 0; cest.status = FFCLOCK_STA_UNSYNC; cest.leapsec_total = 0; cest.leapsec = 0; mtx_lock(&ffclock_mtx); bcopy(&cest, &ffclock_estimate, sizeof(struct ffclock_estimate)); ffclock_updated = INT8_MAX; mtx_unlock(&ffclock_mtx); printf("ffclock reset: %s (%llu Hz), time = %ld.%09lu\n", tc->tc_name, (unsigned long long)tc->tc_frequency, (long)ts->tv_sec, (unsigned long)ts->tv_nsec); } /* * Sub-routine to convert a time interval measured in RAW counter units to time * in seconds stored in bintime format. * NOTE: bintime_mul requires u_int, but the value of the ffcounter may be * larger than the max value of u_int (on 32 bit architecture). Loop to consume * extra cycles. 
*/ static void ffclock_convert_delta(ffcounter ffdelta, uint64_t period, struct bintime *bt) { struct bintime bt2; ffcounter delta, delta_max; delta_max = (1ULL << (8 * sizeof(unsigned int))) - 1; bintime_clear(bt); do { if (ffdelta > delta_max) delta = delta_max; else delta = ffdelta; bt2.sec = 0; bt2.frac = period; bintime_mul(&bt2, (unsigned int)delta); bintime_add(bt, &bt2); ffdelta -= delta; } while (ffdelta > 0); } /* * Update the fftimehands. * Push the tick ffcount and time(s) forward based on current clock estimate. * The conversion from ffcounter to bintime relies on the difference clock * principle, whose accuracy relies on computing small time intervals. If a new * clock estimate has been passed by the synchronisation daemon, make it * current, and compute the linear interpolation for monotonic time if needed. */ static void ffclock_windup(unsigned int delta) { struct ffclock_estimate *cest; struct fftimehands *ffth; struct bintime bt, gap_lerp; ffcounter ffdelta; uint64_t frac; unsigned int polling; uint8_t forward_jump, ogen; /* * Pick the next timehand, copy current ffclock estimates and move tick * times and counter forward. */ forward_jump = 0; ffth = fftimehands->next; ogen = ffth->gen; ffth->gen = 0; cest = &ffth->cest; bcopy(&fftimehands->cest, cest, sizeof(struct ffclock_estimate)); ffdelta = (ffcounter)delta; ffth->period_lerp = fftimehands->period_lerp; ffth->tick_time = fftimehands->tick_time; ffclock_convert_delta(ffdelta, cest->period, &bt); bintime_add(&ffth->tick_time, &bt); ffth->tick_time_lerp = fftimehands->tick_time_lerp; ffclock_convert_delta(ffdelta, ffth->period_lerp, &bt); bintime_add(&ffth->tick_time_lerp, &bt); ffth->tick_ffcount = fftimehands->tick_ffcount + ffdelta; /* * Assess the status of the clock, if the last update is too old, it is * likely the synchronisation daemon is dead and the clock is free * running. */ if (ffclock_updated == 0) { ffdelta = ffth->tick_ffcount - cest->update_ffcount; ffclock_convert_delta(ffdelta, cest->period, &bt); if (bt.sec > 2 * FFCLOCK_SKM_SCALE) ffclock_status |= FFCLOCK_STA_UNSYNC; } /* * If available, grab updated clock estimates and make them current. * Recompute time at this tick using the updated estimates. The clock * estimates passed the feed-forward synchronisation daemon may result * in time conversion that is not monotonically increasing (just after * the update). time_lerp is a particular linear interpolation over the * synchronisation algo polling period that ensures monotonicity for the * clock ids requesting it. */ if (ffclock_updated > 0) { bcopy(&ffclock_estimate, cest, sizeof(struct ffclock_estimate)); ffdelta = ffth->tick_ffcount - cest->update_ffcount; ffth->tick_time = cest->update_time; ffclock_convert_delta(ffdelta, cest->period, &bt); bintime_add(&ffth->tick_time, &bt); /* ffclock_reset sets ffclock_updated to INT8_MAX */ if (ffclock_updated == INT8_MAX) ffth->tick_time_lerp = ffth->tick_time; if (bintime_cmp(&ffth->tick_time, &ffth->tick_time_lerp, >)) forward_jump = 1; else forward_jump = 0; bintime_clear(&gap_lerp); if (forward_jump) { gap_lerp = ffth->tick_time; bintime_sub(&gap_lerp, &ffth->tick_time_lerp); } else { gap_lerp = ffth->tick_time_lerp; bintime_sub(&gap_lerp, &ffth->tick_time); } /* * The reset from the RTC clock may be far from accurate, and * reducing the gap between real time and interpolated time * could take a very long time if the interpolated clock insists * on strict monotonicity. 
The clock is reset under very strict * conditions (kernel time is known to be wrong and * synchronization daemon has been restarted recently. * ffclock_boottime absorbs the jump to ensure boot time is * correct and uptime functions stay consistent. */ if (((ffclock_status & FFCLOCK_STA_UNSYNC) == FFCLOCK_STA_UNSYNC) && ((cest->status & FFCLOCK_STA_UNSYNC) == 0) && ((cest->status & FFCLOCK_STA_WARMUP) == FFCLOCK_STA_WARMUP)) { if (forward_jump) bintime_add(&ffclock_boottime, &gap_lerp); else bintime_sub(&ffclock_boottime, &gap_lerp); ffth->tick_time_lerp = ffth->tick_time; bintime_clear(&gap_lerp); } ffclock_status = cest->status; ffth->period_lerp = cest->period; /* * Compute corrected period used for the linear interpolation of * time. The rate of linear interpolation is capped to 5000PPM * (5ms/s). */ if (bintime_isset(&gap_lerp)) { ffdelta = cest->update_ffcount; ffdelta -= fftimehands->cest.update_ffcount; ffclock_convert_delta(ffdelta, cest->period, &bt); polling = bt.sec; bt.sec = 0; bt.frac = 5000000 * (uint64_t)18446744073LL; bintime_mul(&bt, polling); if (bintime_cmp(&gap_lerp, &bt, >)) gap_lerp = bt; /* Approximate 1 sec by 1-(1/2^64) to ease arithmetic */ frac = 0; if (gap_lerp.sec > 0) { frac -= 1; frac /= ffdelta / gap_lerp.sec; } frac += gap_lerp.frac / ffdelta; if (forward_jump) ffth->period_lerp += frac; else ffth->period_lerp -= frac; } ffclock_updated = 0; } if (++ogen == 0) ogen = 1; ffth->gen = ogen; fftimehands = ffth; } /* * Adjust the fftimehands when the timecounter is changed. Stating the obvious, * the old and new hardware counter cannot be read simultaneously. tc_windup() * does read the two counters 'back to back', but a few cycles are effectively * lost, and not accumulated in tick_ffcount. This is a fairly radical * operation for a feed-forward synchronization daemon, and it is its job to not * pushing irrelevant data to the kernel. Because there is no locking here, * simply force to ignore pending or next update to give daemon a chance to * realize the counter has changed. */ static void ffclock_change_tc(struct timehands *th) { struct fftimehands *ffth; struct ffclock_estimate *cest; struct timecounter *tc; uint8_t ogen; tc = th->th_counter; ffth = fftimehands->next; ogen = ffth->gen; ffth->gen = 0; cest = &ffth->cest; bcopy(&(fftimehands->cest), cest, sizeof(struct ffclock_estimate)); cest->period = ((1ULL << 63) / tc->tc_frequency ) << 1; cest->errb_abs = 0; cest->errb_rate = 0; cest->status |= FFCLOCK_STA_UNSYNC; ffth->tick_ffcount = fftimehands->tick_ffcount; ffth->tick_time_lerp = fftimehands->tick_time_lerp; ffth->tick_time = fftimehands->tick_time; ffth->period_lerp = cest->period; /* Do not lock but ignore next update from synchronization daemon. */ ffclock_updated--; if (++ogen == 0) ogen = 1; ffth->gen = ogen; fftimehands = ffth; } /* * Retrieve feed-forward counter and time of last kernel tick. */ void ffclock_last_tick(ffcounter *ffcount, struct bintime *bt, uint32_t flags) { struct fftimehands *ffth; uint8_t gen; /* * No locking but check generation has not changed. Also need to make * sure ffdelta is positive, i.e. ffcount > tick_ffcount. */ do { ffth = fftimehands; gen = ffth->gen; if ((flags & FFCLOCK_LERP) == FFCLOCK_LERP) *bt = ffth->tick_time_lerp; else *bt = ffth->tick_time; *ffcount = ffth->tick_ffcount; } while (gen == 0 || gen != ffth->gen); } /* * Absolute clock conversion. Low level function to convert ffcounter to * bintime. 
The ffcounter is converted using the current ffclock period estimate * or the "interpolated period" to ensure monotonicity. * NOTE: this conversion may have been deferred, and the clock updated since the * hardware counter has been read. */ void ffclock_convert_abs(ffcounter ffcount, struct bintime *bt, uint32_t flags) { struct fftimehands *ffth; struct bintime bt2; ffcounter ffdelta; uint8_t gen; /* * No locking but check generation has not changed. Also need to make * sure ffdelta is positive, i.e. ffcount > tick_ffcount. */ do { ffth = fftimehands; gen = ffth->gen; if (ffcount > ffth->tick_ffcount) ffdelta = ffcount - ffth->tick_ffcount; else ffdelta = ffth->tick_ffcount - ffcount; if ((flags & FFCLOCK_LERP) == FFCLOCK_LERP) { *bt = ffth->tick_time_lerp; ffclock_convert_delta(ffdelta, ffth->period_lerp, &bt2); } else { *bt = ffth->tick_time; ffclock_convert_delta(ffdelta, ffth->cest.period, &bt2); } if (ffcount > ffth->tick_ffcount) bintime_add(bt, &bt2); else bintime_sub(bt, &bt2); } while (gen == 0 || gen != ffth->gen); } /* * Difference clock conversion. * Low level function to Convert a time interval measured in RAW counter units * into bintime. The difference clock allows measuring small intervals much more * reliably than the absolute clock. */ void ffclock_convert_diff(ffcounter ffdelta, struct bintime *bt) { struct fftimehands *ffth; uint8_t gen; /* No locking but check generation has not changed. */ do { ffth = fftimehands; gen = ffth->gen; ffclock_convert_delta(ffdelta, ffth->cest.period, bt); } while (gen == 0 || gen != ffth->gen); } /* * Access to current ffcounter value. */ void ffclock_read_counter(ffcounter *ffcount) { struct timehands *th; struct fftimehands *ffth; unsigned int gen, delta; /* * ffclock_windup() called from tc_windup(), safe to rely on * th->th_generation only, for correct delta and ffcounter. */ do { th = timehands; gen = atomic_load_acq_int(&th->th_generation); ffth = fftimehands; delta = tc_delta(th); *ffcount = ffth->tick_ffcount; atomic_thread_fence_acq(); } while (gen == 0 || gen != th->th_generation); *ffcount += delta; } void binuptime(struct bintime *bt) { binuptime_fromclock(bt, sysclock_active); } void nanouptime(struct timespec *tsp) { nanouptime_fromclock(tsp, sysclock_active); } void microuptime(struct timeval *tvp) { microuptime_fromclock(tvp, sysclock_active); } void bintime(struct bintime *bt) { bintime_fromclock(bt, sysclock_active); } void nanotime(struct timespec *tsp) { nanotime_fromclock(tsp, sysclock_active); } void microtime(struct timeval *tvp) { microtime_fromclock(tvp, sysclock_active); } void getbinuptime(struct bintime *bt) { getbinuptime_fromclock(bt, sysclock_active); } void getnanouptime(struct timespec *tsp) { getnanouptime_fromclock(tsp, sysclock_active); } void getmicrouptime(struct timeval *tvp) { getmicrouptime_fromclock(tvp, sysclock_active); } void getbintime(struct bintime *bt) { getbintime_fromclock(bt, sysclock_active); } void getnanotime(struct timespec *tsp) { getnanotime_fromclock(tsp, sysclock_active); } void getmicrotime(struct timeval *tvp) { getmicrouptime_fromclock(tvp, sysclock_active); } #endif /* FFCLOCK */ /* * This is a clone of getnanotime and used for walltimestamps. * The dtrace_ prefix prevents fbt from creating probes for * it so walltimestamp can be safely used in all fbt probes. */ void dtrace_getnanotime(struct timespec *tsp) { GETTHMEMBER(tsp, th_nanotime); } /* * This is a clone of getnanouptime used for time since boot. 
* The dtrace_ prefix prevents fbt from creating probes for * it so an uptime that can be safely used in all fbt probes. */ void dtrace_getnanouptime(struct timespec *tsp) { struct bintime bt; GETTHMEMBER(&bt, th_offset); bintime2timespec(&bt, tsp); } /* * System clock currently providing time to the system. Modifiable via sysctl * when the FFCLOCK option is defined. */ int sysclock_active = SYSCLOCK_FBCK; /* Internal NTP status and error estimates. */ extern int time_status; extern long time_esterror; /* * Take a snapshot of sysclock data which can be used to compare system clocks * and generate timestamps after the fact. */ void sysclock_getsnapshot(struct sysclock_snap *clock_snap, int fast) { struct fbclock_info *fbi; struct timehands *th; struct bintime bt; unsigned int delta, gen; #ifdef FFCLOCK ffcounter ffcount; struct fftimehands *ffth; struct ffclock_info *ffi; struct ffclock_estimate cest; ffi = &clock_snap->ff_info; #endif fbi = &clock_snap->fb_info; delta = 0; do { th = timehands; gen = atomic_load_acq_int(&th->th_generation); fbi->th_scale = th->th_scale; fbi->tick_time = th->th_offset; #ifdef FFCLOCK ffth = fftimehands; ffi->tick_time = ffth->tick_time_lerp; ffi->tick_time_lerp = ffth->tick_time_lerp; ffi->period = ffth->cest.period; ffi->period_lerp = ffth->period_lerp; clock_snap->ffcount = ffth->tick_ffcount; cest = ffth->cest; #endif if (!fast) delta = tc_delta(th); atomic_thread_fence_acq(); } while (gen == 0 || gen != th->th_generation); clock_snap->delta = delta; clock_snap->sysclock_active = sysclock_active; /* Record feedback clock status and error. */ clock_snap->fb_info.status = time_status; /* XXX: Very crude estimate of feedback clock error. */ bt.sec = time_esterror / 1000000; bt.frac = ((time_esterror - bt.sec) * 1000000) * (uint64_t)18446744073709ULL; clock_snap->fb_info.error = bt; #ifdef FFCLOCK if (!fast) clock_snap->ffcount += delta; /* Record feed-forward clock leap second adjustment. */ ffi->leapsec_adjustment = cest.leapsec_total; if (clock_snap->ffcount > cest.leapsec_next) ffi->leapsec_adjustment -= cest.leapsec; /* Record feed-forward clock status and error. */ clock_snap->ff_info.status = cest.status; ffcount = clock_snap->ffcount - cest.update_ffcount; ffclock_convert_delta(ffcount, cest.period, &bt); /* 18446744073709 = int(2^64/1e12), err_bound_rate in [ps/s]. */ bintime_mul(&bt, cest.errb_rate * (uint64_t)18446744073709ULL); /* 18446744073 = int(2^64 / 1e9), since err_abs in [ns]. */ bintime_addx(&bt, cest.errb_abs * (uint64_t)18446744073ULL); clock_snap->ff_info.error = bt; #endif } /* * Convert a sysclock snapshot into a struct bintime based on the specified * clock source and flags. */ int sysclock_snap2bintime(struct sysclock_snap *cs, struct bintime *bt, int whichclock, uint32_t flags) { struct bintime boottimebin; #ifdef FFCLOCK struct bintime bt2; uint64_t period; #endif switch (whichclock) { case SYSCLOCK_FBCK: *bt = cs->fb_info.tick_time; /* If snapshot was created with !fast, delta will be >0. */ if (cs->delta > 0) bintime_addx(bt, cs->fb_info.th_scale * cs->delta); if ((flags & FBCLOCK_UPTIME) == 0) { getboottimebin(&boottimebin); bintime_add(bt, &boottimebin); } break; #ifdef FFCLOCK case SYSCLOCK_FFWD: if (flags & FFCLOCK_LERP) { *bt = cs->ff_info.tick_time_lerp; period = cs->ff_info.period_lerp; } else { *bt = cs->ff_info.tick_time; period = cs->ff_info.period; } /* If snapshot was created with !fast, delta will be >0. 
*/ if (cs->delta > 0) { ffclock_convert_delta(cs->delta, period, &bt2); bintime_add(bt, &bt2); } /* Leap second adjustment. */ if (flags & FFCLOCK_LEAPSEC) bt->sec -= cs->ff_info.leapsec_adjustment; /* Boot time adjustment, for uptime/monotonic clocks. */ if (flags & FFCLOCK_UPTIME) bintime_sub(bt, &ffclock_boottime); break; #endif default: return (EINVAL); break; } return (0); } /* * Initialize a new timecounter and possibly use it. */ void tc_init(struct timecounter *tc) { u_int u; struct sysctl_oid *tc_root; u = tc->tc_frequency / tc->tc_counter_mask; /* XXX: We need some margin here, 10% is a guess */ u *= 11; u /= 10; if (u > hz && tc->tc_quality >= 0) { tc->tc_quality = -2000; if (bootverbose) { printf("Timecounter \"%s\" frequency %ju Hz", tc->tc_name, (uintmax_t)tc->tc_frequency); printf(" -- Insufficient hz, needs at least %u\n", u); } } else if (tc->tc_quality >= 0 || bootverbose) { printf("Timecounter \"%s\" frequency %ju Hz quality %d\n", tc->tc_name, (uintmax_t)tc->tc_frequency, tc->tc_quality); } /* * Set up sysctl tree for this counter. */ tc_root = SYSCTL_ADD_NODE_WITH_LABEL(NULL, SYSCTL_STATIC_CHILDREN(_kern_timecounter_tc), OID_AUTO, tc->tc_name, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "timecounter description", "timecounter"); SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(tc_root), OID_AUTO, "mask", CTLFLAG_RD, &(tc->tc_counter_mask), 0, "mask for implemented bits"); SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(tc_root), OID_AUTO, "counter", CTLTYPE_UINT | CTLFLAG_RD | CTLFLAG_MPSAFE, tc, sizeof(*tc), sysctl_kern_timecounter_get, "IU", "current timecounter value"); SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(tc_root), OID_AUTO, "frequency", CTLTYPE_U64 | CTLFLAG_RD | CTLFLAG_MPSAFE, tc, sizeof(*tc), sysctl_kern_timecounter_freq, "QU", "timecounter frequency"); SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(tc_root), OID_AUTO, "quality", CTLFLAG_RD, &(tc->tc_quality), 0, "goodness of time counter"); mtx_lock(&tc_lock); tc->tc_next = timecounters; timecounters = tc; /* * Do not automatically switch if the current tc was specifically * chosen. Never automatically use a timecounter with negative quality. * Even though we run on the dummy counter, switching here may be * worse since this timecounter may not be monotonic. */ if (tc_chosen) goto unlock; if (tc->tc_quality < 0) goto unlock; if (tc_from_tunable[0] != '\0' && strcmp(tc->tc_name, tc_from_tunable) == 0) { tc_chosen = 1; tc_from_tunable[0] = '\0'; } else { if (tc->tc_quality < timecounter->tc_quality) goto unlock; if (tc->tc_quality == timecounter->tc_quality && tc->tc_frequency < timecounter->tc_frequency) goto unlock; } (void)tc->tc_get_timecount(tc); timecounter = tc; unlock: mtx_unlock(&tc_lock); } /* Report the frequency of the current timecounter. */ uint64_t tc_getfrequency(void) { return (timehands->th_counter->tc_frequency); } static bool sleeping_on_old_rtc(struct thread *td) { /* * td_rtcgen is modified by curthread when it is running, * and by other threads in this function. By finding the thread * on a sleepqueue and holding the lock on the sleepqueue * chain, we guarantee that the thread is not running and that * modifying td_rtcgen is safe. Setting td_rtcgen to zero informs * the thread that it was woken due to a real-time clock adjustment. * (The declaration of td_rtcgen refers to this comment.) 
*/ if (td->td_rtcgen != 0 && td->td_rtcgen != rtc_generation) { td->td_rtcgen = 0; return (true); } return (false); } static struct mtx tc_setclock_mtx; MTX_SYSINIT(tc_setclock_init, &tc_setclock_mtx, "tcsetc", MTX_SPIN); /* * Step our concept of UTC. This is done by modifying our estimate of * when we booted. */ void tc_setclock(struct timespec *ts) { struct timespec tbef, taft; struct bintime bt, bt2; timespec2bintime(ts, &bt); nanotime(&tbef); mtx_lock_spin(&tc_setclock_mtx); cpu_tick_calibrate(1); binuptime(&bt2); bintime_sub(&bt, &bt2); /* XXX fiddle all the little crinkly bits around the fiords... */ tc_windup(&bt); mtx_unlock_spin(&tc_setclock_mtx); /* Avoid rtc_generation == 0, since td_rtcgen == 0 is special. */ atomic_add_rel_int(&rtc_generation, 2); sleepq_chains_remove_matching(sleeping_on_old_rtc); if (timestepwarnings) { nanotime(&taft); log(LOG_INFO, "Time stepped from %jd.%09ld to %jd.%09ld (%jd.%09ld)\n", (intmax_t)tbef.tv_sec, tbef.tv_nsec, (intmax_t)taft.tv_sec, taft.tv_nsec, (intmax_t)ts->tv_sec, ts->tv_nsec); } } /* * Recalculate the scaling factor. We want the number of 1/2^64 * fractions of a second per period of the hardware counter, taking * into account the th_adjustment factor which the NTP PLL/adjtime(2) * processing provides us with. * * The th_adjustment is nanoseconds per second with 32 bit binary * fraction and we want 64 bit binary fraction of second: * * x = a * 2^32 / 10^9 = a * 4.294967296 * * The range of th_adjustment is +/- 5000PPM so inside a 64bit int * we can only multiply by about 850 without overflowing, that * leaves no suitably precise fractions for multiply before divide. * * Divide before multiply with a fraction of 2199/512 results in a * systematic undercompensation of 10PPM of th_adjustment. On a * 5000PPM adjustment this is a 0.05PPM error. This is acceptable. * * We happily sacrifice the lowest of the 64 bits of our result * to the goddess of code clarity. */ static void recalculate_scaling_factor_and_large_delta(struct timehands *th) { uint64_t scale; scale = (uint64_t)1 << 63; scale += (th->th_adjustment / 1024) * 2199; scale /= th->th_counter->tc_frequency; th->th_scale = scale * 2; th->th_large_delta = MIN(((uint64_t)1 << 63) / scale, UINT_MAX); } /* * Initialize the next struct timehands in the ring and make * it the active timehands. Along the way we might switch to a different * timecounter and/or do seconds processing in NTP. Slightly magic. */ static void tc_windup(struct bintime *new_boottimebin) { struct bintime bt; struct timecounter *tc; struct timehands *th, *tho; u_int delta, ncount, ogen; int i; time_t t; /* * Make the next timehands a copy of the current one, but do * not overwrite the generation or next pointer. While we * update the contents, the generation must be zero. We need * to ensure that the zero generation is visible before the * data updates become visible, which requires release fence. * For similar reasons, re-reading of the generation after the * data is read should use acquire fence. */ tho = timehands; th = tho->th_next; ogen = th->th_generation; th->th_generation = 0; atomic_thread_fence_rel(); memcpy(th, tho, offsetof(struct timehands, th_generation)); if (new_boottimebin != NULL) th->th_boottime = *new_boottimebin; /* * Capture a timecounter delta on the current timecounter and if * changing timecounters, a counter value from the new timecounter. * Update the offset fields accordingly. 
*/ tc = atomic_load_ptr(&timecounter); delta = tc_delta(th); if (th->th_counter != tc) ncount = tc->tc_get_timecount(tc); else ncount = 0; #ifdef FFCLOCK ffclock_windup(delta); #endif th->th_offset_count += delta; th->th_offset_count &= th->th_counter->tc_counter_mask; bintime_add_tc_delta(&th->th_offset, th->th_scale, th->th_large_delta, delta); /* * Hardware latching timecounters may not generate interrupts on * PPS events, so instead we poll them. There is a finite risk that * the hardware might capture a count which is later than the one we * got above, and therefore possibly in the next NTP second which might * have a different rate than the current NTP second. It doesn't * matter in practice. */ if (tho->th_counter->tc_poll_pps) tho->th_counter->tc_poll_pps(tho->th_counter); /* * Deal with NTP second processing. The loop normally * iterates at most once, but in extreme situations it might * keep NTP sane if timeouts are not run for several seconds. * At boot, the time step can be large when the TOD hardware * has been read, so on really large steps, we call * ntp_update_second only twice. We need to call it twice in * case we missed a leap second. */ bt = th->th_offset; bintime_add(&bt, &th->th_boottime); i = bt.sec - tho->th_microtime.tv_sec; if (i > 0) { if (i > LARGE_STEP) i = 2; do { t = bt.sec; ntp_update_second(&th->th_adjustment, &bt.sec); if (bt.sec != t) th->th_boottime.sec += bt.sec - t; --i; } while (i > 0); recalculate_scaling_factor_and_large_delta(th); } /* Update the UTC timestamps used by the get*() functions. */ th->th_bintime = bt; bintime2timeval(&bt, &th->th_microtime); bintime2timespec(&bt, &th->th_nanotime); /* Now is a good time to change timecounters. */ if (th->th_counter != tc) { #ifndef __arm__ if ((tc->tc_flags & TC_FLAGS_C2STOP) != 0) cpu_disable_c2_sleep++; if ((th->th_counter->tc_flags & TC_FLAGS_C2STOP) != 0) cpu_disable_c2_sleep--; #endif th->th_counter = tc; th->th_offset_count = ncount; tc_min_ticktock_freq = max(1, tc->tc_frequency / (((uint64_t)tc->tc_counter_mask + 1) / 3)); recalculate_scaling_factor_and_large_delta(th); #ifdef FFCLOCK ffclock_change_tc(th); #endif } /* * Now that the struct timehands is again consistent, set the new * generation number, making sure to not make it zero. */ if (++ogen == 0) ogen = 1; atomic_store_rel_int(&th->th_generation, ogen); /* Go live with the new struct timehands. */ #ifdef FFCLOCK switch (sysclock_active) { case SYSCLOCK_FBCK: #endif time_second = th->th_microtime.tv_sec; time_uptime = th->th_offset.sec; #ifdef FFCLOCK break; case SYSCLOCK_FFWD: time_second = fftimehands->tick_time_lerp.sec; time_uptime = fftimehands->tick_time_lerp.sec - ffclock_boottime.sec; break; } #endif timehands = th; timekeep_push_vdso(); } /* Report or change the active timecounter hardware. */ static int sysctl_kern_timecounter_hardware(SYSCTL_HANDLER_ARGS) { char newname[32]; struct timecounter *newtc, *tc; int error; mtx_lock(&tc_lock); tc = timecounter; strlcpy(newname, tc->tc_name, sizeof(newname)); mtx_unlock(&tc_lock); error = sysctl_handle_string(oidp, &newname[0], sizeof(newname), req); if (error != 0 || req->newptr == NULL) return (error); mtx_lock(&tc_lock); /* Record that the tc in use now was specifically chosen. */ tc_chosen = 1; if (strcmp(newname, tc->tc_name) == 0) { mtx_unlock(&tc_lock); return (0); } for (newtc = timecounters; newtc != NULL; newtc = newtc->tc_next) { if (strcmp(newname, newtc->tc_name) != 0) continue; /* Warm up new timecounter. 
*/ (void)newtc->tc_get_timecount(newtc); timecounter = newtc; /* * The vdso timehands update is deferred until the next * 'tc_windup()'. * * This is prudent given that 'timekeep_push_vdso()' does not * use any locking and that it can be called in hard interrupt * context via 'tc_windup()'. */ break; } mtx_unlock(&tc_lock); return (newtc != NULL ? 0 : EINVAL); } SYSCTL_PROC(_kern_timecounter, OID_AUTO, hardware, CTLTYPE_STRING | CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, 0, 0, sysctl_kern_timecounter_hardware, "A", "Timecounter hardware selected"); /* Report the available timecounter hardware. */ static int sysctl_kern_timecounter_choice(SYSCTL_HANDLER_ARGS) { struct sbuf sb; struct timecounter *tc; int error; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sbuf_new_for_sysctl(&sb, NULL, 0, req); mtx_lock(&tc_lock); for (tc = timecounters; tc != NULL; tc = tc->tc_next) { if (tc != timecounters) sbuf_putc(&sb, ' '); sbuf_printf(&sb, "%s(%d)", tc->tc_name, tc->tc_quality); } mtx_unlock(&tc_lock); error = sbuf_finish(&sb); sbuf_delete(&sb); return (error); } SYSCTL_PROC(_kern_timecounter, OID_AUTO, choice, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0, sysctl_kern_timecounter_choice, "A", "Timecounter hardware detected"); /* * RFC 2783 PPS-API implementation. */ /* * Return true if the driver is aware of the abi version extensions in the * pps_state structure, and it supports at least the given abi version number. */ static inline int abi_aware(struct pps_state *pps, int vers) { return ((pps->kcmode & KCMODE_ABIFLAG) && pps->driver_abi >= vers); } static int pps_fetch(struct pps_fetch_args *fapi, struct pps_state *pps) { int err, timo; pps_seq_t aseq, cseq; struct timeval tv; if (fapi->tsformat && fapi->tsformat != PPS_TSFMT_TSPEC) return (EINVAL); /* * If no timeout is requested, immediately return whatever values were * most recently captured. If timeout seconds is -1, that's a request * to block without a timeout. WITNESS won't let us sleep forever * without a lock (we really don't need a lock), so just repeatedly * sleep a long time. 
*/ if (fapi->timeout.tv_sec || fapi->timeout.tv_nsec) { if (fapi->timeout.tv_sec == -1) timo = 0x7fffffff; else { tv.tv_sec = fapi->timeout.tv_sec; tv.tv_usec = fapi->timeout.tv_nsec / 1000; timo = tvtohz(&tv); } aseq = atomic_load_int(&pps->ppsinfo.assert_sequence); cseq = atomic_load_int(&pps->ppsinfo.clear_sequence); while (aseq == atomic_load_int(&pps->ppsinfo.assert_sequence) && cseq == atomic_load_int(&pps->ppsinfo.clear_sequence)) { if (abi_aware(pps, 1) && pps->driver_mtx != NULL) { if (pps->flags & PPSFLAG_MTX_SPIN) { err = msleep_spin(pps, pps->driver_mtx, "ppsfch", timo); } else { err = msleep(pps, pps->driver_mtx, PCATCH, "ppsfch", timo); } } else { err = tsleep(pps, PCATCH, "ppsfch", timo); } if (err == EWOULDBLOCK) { if (fapi->timeout.tv_sec == -1) { continue; } else { return (ETIMEDOUT); } } else if (err != 0) { return (err); } } } pps->ppsinfo.current_mode = pps->ppsparam.mode; fapi->pps_info_buf = pps->ppsinfo; return (0); } int pps_ioctl(u_long cmd, caddr_t data, struct pps_state *pps) { pps_params_t *app; struct pps_fetch_args *fapi; #ifdef FFCLOCK struct pps_fetch_ffc_args *fapi_ffc; #endif #ifdef PPS_SYNC struct pps_kcbind_args *kapi; #endif KASSERT(pps != NULL, ("NULL pps pointer in pps_ioctl")); switch (cmd) { case PPS_IOC_CREATE: return (0); case PPS_IOC_DESTROY: return (0); case PPS_IOC_SETPARAMS: app = (pps_params_t *)data; if (app->mode & ~pps->ppscap) return (EINVAL); #ifdef FFCLOCK /* Ensure only a single clock is selected for ffc timestamp. */ if ((app->mode & PPS_TSCLK_MASK) == PPS_TSCLK_MASK) return (EINVAL); #endif pps->ppsparam = *app; return (0); case PPS_IOC_GETPARAMS: app = (pps_params_t *)data; *app = pps->ppsparam; app->api_version = PPS_API_VERS_1; return (0); case PPS_IOC_GETCAP: *(int*)data = pps->ppscap; return (0); case PPS_IOC_FETCH: fapi = (struct pps_fetch_args *)data; return (pps_fetch(fapi, pps)); #ifdef FFCLOCK case PPS_IOC_FETCH_FFCOUNTER: fapi_ffc = (struct pps_fetch_ffc_args *)data; if (fapi_ffc->tsformat && fapi_ffc->tsformat != PPS_TSFMT_TSPEC) return (EINVAL); if (fapi_ffc->timeout.tv_sec || fapi_ffc->timeout.tv_nsec) return (EOPNOTSUPP); pps->ppsinfo_ffc.current_mode = pps->ppsparam.mode; fapi_ffc->pps_info_buf_ffc = pps->ppsinfo_ffc; /* Overwrite timestamps if feedback clock selected. 
*/ switch (pps->ppsparam.mode & PPS_TSCLK_MASK) { case PPS_TSCLK_FBCK: fapi_ffc->pps_info_buf_ffc.assert_timestamp = pps->ppsinfo.assert_timestamp; fapi_ffc->pps_info_buf_ffc.clear_timestamp = pps->ppsinfo.clear_timestamp; break; case PPS_TSCLK_FFWD: break; default: break; } return (0); #endif /* FFCLOCK */ case PPS_IOC_KCBIND: #ifdef PPS_SYNC kapi = (struct pps_kcbind_args *)data; /* XXX Only root should be able to do this */ if (kapi->tsformat && kapi->tsformat != PPS_TSFMT_TSPEC) return (EINVAL); if (kapi->kernel_consumer != PPS_KC_HARDPPS) return (EINVAL); if (kapi->edge & ~pps->ppscap) return (EINVAL); pps->kcmode = (kapi->edge & KCMODE_EDGEMASK) | (pps->kcmode & KCMODE_ABIFLAG); return (0); #else return (EOPNOTSUPP); #endif default: return (ENOIOCTL); } } void pps_init(struct pps_state *pps) { pps->ppscap |= PPS_TSFMT_TSPEC | PPS_CANWAIT; if (pps->ppscap & PPS_CAPTUREASSERT) pps->ppscap |= PPS_OFFSETASSERT; if (pps->ppscap & PPS_CAPTURECLEAR) pps->ppscap |= PPS_OFFSETCLEAR; #ifdef FFCLOCK pps->ppscap |= PPS_TSCLK_MASK; #endif pps->kcmode &= ~KCMODE_ABIFLAG; } void pps_init_abi(struct pps_state *pps) { pps_init(pps); if (pps->driver_abi > 0) { pps->kcmode |= KCMODE_ABIFLAG; pps->kernel_abi = PPS_ABI_VERSION; } } void pps_capture(struct pps_state *pps) { struct timehands *th; KASSERT(pps != NULL, ("NULL pps pointer in pps_capture")); th = timehands; pps->capgen = atomic_load_acq_int(&th->th_generation); pps->capth = th; #ifdef FFCLOCK pps->capffth = fftimehands; #endif pps->capcount = th->th_counter->tc_get_timecount(th->th_counter); atomic_thread_fence_acq(); if (pps->capgen != th->th_generation) pps->capgen = 0; } void pps_event(struct pps_state *pps, int event) { struct bintime bt; struct timespec ts, *tsp, *osp; u_int tcount, *pcount; int foff; pps_seq_t *pseq; #ifdef FFCLOCK struct timespec *tsp_ffc; pps_seq_t *pseq_ffc; ffcounter *ffcount; #endif #ifdef PPS_SYNC int fhard; #endif KASSERT(pps != NULL, ("NULL pps pointer in pps_event")); /* Nothing to do if not currently set to capture this event type. */ if ((event & pps->ppsparam.mode) == 0) return; /* If the timecounter was wound up underneath us, bail out. */ if (pps->capgen == 0 || pps->capgen != atomic_load_acq_int(&pps->capth->th_generation)) return; /* Things would be easier with arrays. */ if (event == PPS_CAPTUREASSERT) { tsp = &pps->ppsinfo.assert_timestamp; osp = &pps->ppsparam.assert_offset; foff = pps->ppsparam.mode & PPS_OFFSETASSERT; #ifdef PPS_SYNC fhard = pps->kcmode & PPS_CAPTUREASSERT; #endif pcount = &pps->ppscount[0]; pseq = &pps->ppsinfo.assert_sequence; #ifdef FFCLOCK ffcount = &pps->ppsinfo_ffc.assert_ffcount; tsp_ffc = &pps->ppsinfo_ffc.assert_timestamp; pseq_ffc = &pps->ppsinfo_ffc.assert_sequence; #endif } else { tsp = &pps->ppsinfo.clear_timestamp; osp = &pps->ppsparam.clear_offset; foff = pps->ppsparam.mode & PPS_OFFSETCLEAR; #ifdef PPS_SYNC fhard = pps->kcmode & PPS_CAPTURECLEAR; #endif pcount = &pps->ppscount[1]; pseq = &pps->ppsinfo.clear_sequence; #ifdef FFCLOCK ffcount = &pps->ppsinfo_ffc.clear_ffcount; tsp_ffc = &pps->ppsinfo_ffc.clear_timestamp; pseq_ffc = &pps->ppsinfo_ffc.clear_sequence; #endif } /* * If the timecounter changed, we cannot compare the count values, so * we have to drop the rest of the PPS-stuff until the next event. */ if (pps->ppstc != pps->capth->th_counter) { pps->ppstc = pps->capth->th_counter; *pcount = pps->capcount; pps->ppscount[2] = pps->capcount; return; } /* Convert the count to a timespec. 
*/ tcount = pps->capcount - pps->capth->th_offset_count; tcount &= pps->capth->th_counter->tc_counter_mask; bt = pps->capth->th_bintime; bintime_addx(&bt, pps->capth->th_scale * tcount); bintime2timespec(&bt, &ts); /* If the timecounter was wound up underneath us, bail out. */ atomic_thread_fence_acq(); if (pps->capgen != pps->capth->th_generation) return; *pcount = pps->capcount; (*pseq)++; *tsp = ts; if (foff) { timespecadd(tsp, osp, tsp); if (tsp->tv_nsec < 0) { tsp->tv_nsec += 1000000000; tsp->tv_sec -= 1; } } #ifdef FFCLOCK *ffcount = pps->capffth->tick_ffcount + tcount; bt = pps->capffth->tick_time; ffclock_convert_delta(tcount, pps->capffth->cest.period, &bt); bintime_add(&bt, &pps->capffth->tick_time); bintime2timespec(&bt, &ts); (*pseq_ffc)++; *tsp_ffc = ts; #endif #ifdef PPS_SYNC if (fhard) { uint64_t scale; /* * Feed the NTP PLL/FLL. * The FLL wants to know how many (hardware) nanoseconds * elapsed since the previous event. */ tcount = pps->capcount - pps->ppscount[2]; pps->ppscount[2] = pps->capcount; tcount &= pps->capth->th_counter->tc_counter_mask; scale = (uint64_t)1 << 63; scale /= pps->capth->th_counter->tc_frequency; scale *= 2; bt.sec = 0; bt.frac = 0; bintime_addx(&bt, scale * tcount); bintime2timespec(&bt, &ts); hardpps(tsp, ts.tv_nsec + 1000000000 * ts.tv_sec); } #endif /* Wakeup anyone sleeping in pps_fetch(). */ wakeup(pps); } /* * Timecounters need to be updated every so often to prevent the hardware * counter from overflowing. Updating also recalculates the cached values * used by the get*() family of functions, so their precision depends on * the update frequency. */ static int tc_tick; SYSCTL_INT(_kern_timecounter, OID_AUTO, tick, CTLFLAG_RD, &tc_tick, 0, "Approximate number of hardclock ticks in a millisecond"); void tc_ticktock(int cnt) { static int count; if (mtx_trylock_spin(&tc_setclock_mtx)) { count += cnt; if (count >= tc_tick) { count = 0; tc_windup(NULL); } mtx_unlock_spin(&tc_setclock_mtx); } } static void __inline tc_adjprecision(void) { int t; if (tc_timepercentage > 0) { t = (99 + tc_timepercentage) / tc_timepercentage; tc_precexp = fls(t + (t >> 1)) - 1; FREQ2BT(hz / tc_tick, &bt_timethreshold); FREQ2BT(hz, &bt_tickthreshold); bintime_shift(&bt_timethreshold, tc_precexp); bintime_shift(&bt_tickthreshold, tc_precexp); } else { tc_precexp = 31; bt_timethreshold.sec = INT_MAX; bt_timethreshold.frac = ~(uint64_t)0; bt_tickthreshold = bt_timethreshold; } sbt_timethreshold = bttosbt(bt_timethreshold); sbt_tickthreshold = bttosbt(bt_tickthreshold); } static int sysctl_kern_timecounter_adjprecision(SYSCTL_HANDLER_ARGS) { int error, val; val = tc_timepercentage; error = sysctl_handle_int(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); tc_timepercentage = val; if (cold) goto done; tc_adjprecision(); done: return (0); } /* Set up the requested number of timehands. 
 */
static void
inittimehands(void *dummy)
{
    struct timehands *thp;
    int i;

    TUNABLE_INT_FETCH("kern.timecounter.timehands_count",
        &timehands_count);
    if (timehands_count < 1)
        timehands_count = 1;
    if (timehands_count > nitems(ths))
        timehands_count = nitems(ths);
    for (i = 1, thp = &ths[0]; i < timehands_count; thp = &ths[i++])
        thp->th_next = &ths[i];
    thp->th_next = &ths[0];

    TUNABLE_STR_FETCH("kern.timecounter.hardware", tc_from_tunable,
        sizeof(tc_from_tunable));

    mtx_init(&tc_lock, "tc", NULL, MTX_DEF);
}
SYSINIT(timehands, SI_SUB_TUNABLES, SI_ORDER_ANY, inittimehands, NULL);

static void
inittimecounter(void *dummy)
{
    u_int p;
    int tick_rate;

    /*
     * Set the initial timeout to
     * max(1, ).
     * People should probably not use the sysctl to set the timeout
     * to smaller than its initial value, since that value is the
     * smallest reasonable one.  If they want better timestamps they
     * should use the non-"get"* functions.
     */
    if (hz > 1000)
        tc_tick = (hz + 500) / 1000;
    else
        tc_tick = 1;
    tc_adjprecision();
    FREQ2BT(hz, &tick_bt);
    tick_sbt = bttosbt(tick_bt);
    tick_rate = hz / tc_tick;
    FREQ2BT(tick_rate, &tc_tick_bt);
    tc_tick_sbt = bttosbt(tc_tick_bt);
    p = (tc_tick * 1000000) / hz;
    printf("Timecounters tick every %d.%03u msec\n", p / 1000, p % 1000);

#ifdef FFCLOCK
    ffclock_init();
#endif

    /* warm up new timecounter (again) and get rolling. */
    (void)timecounter->tc_get_timecount(timecounter);
    mtx_lock_spin(&tc_setclock_mtx);
    tc_windup(NULL);
    mtx_unlock_spin(&tc_setclock_mtx);
}
SYSINIT(timecounter, SI_SUB_CLOCKS, SI_ORDER_SECOND, inittimecounter, NULL);

/* Cpu tick handling -------------------------------------------------*/

-static int cpu_tick_variable;
+static bool cpu_tick_variable;
static uint64_t cpu_tick_frequency;

DPCPU_DEFINE_STATIC(uint64_t, tc_cpu_ticks_base);
DPCPU_DEFINE_STATIC(unsigned, tc_cpu_ticks_last);

static uint64_t
tc_cpu_ticks(void)
{
    struct timecounter *tc;
    uint64_t res, *base;
    unsigned u, *last;

    critical_enter();
    base = DPCPU_PTR(tc_cpu_ticks_base);
    last = DPCPU_PTR(tc_cpu_ticks_last);
    tc = timehands->th_counter;
    u = tc->tc_get_timecount(tc) & tc->tc_counter_mask;
    if (u < *last)
        *base += (uint64_t)tc->tc_counter_mask + 1;
    *last = u;
    res = u + *base;
    critical_exit();
    return (res);
}

void
cpu_tick_calibration(void)
{
    static time_t last_calib;

    if (time_uptime != last_calib && !(time_uptime & 0xf)) {
        cpu_tick_calibrate(0);
        last_calib = time_uptime;
    }
}
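/*
 * Illustrative only (not part of kern_tc.c): a minimal, self-contained
 * userland sketch of the wrap-extension idea tc_cpu_ticks() uses above.
 * A wrapping hardware counter is widened into a monotonically increasing
 * 64-bit value by adding one full counter period whenever a new reading is
 * smaller than the previous one.  read_hw_counter() and the 16-bit mask are
 * hypothetical stand-ins for tc->tc_get_timecount(tc) and tc_counter_mask;
 * the kernel version additionally keeps the state per-CPU and reads it
 * inside a critical section.
 */
#include <stdint.h>
#include <stdio.h>

static uint32_t
read_hw_counter(void)
{
    static uint32_t t;

    t += 40000;                 /* pretend time advanced */
    return (t & 0xffff);        /* the simulated hardware only has 16 bits */
}

static uint64_t
extended_ticks(void)
{
    static uint64_t base;       /* accumulated full wrap periods */
    static uint32_t last;       /* previous raw reading */
    uint32_t u;

    u = read_hw_counter();
    if (u < last)               /* counter wrapped since the last call */
        base += (uint64_t)0xffff + 1;
    last = u;
    return (base + u);
}

int
main(void)
{
    /* Prints 40000, 80000, 120000, 160000 despite the 16-bit wrap. */
    for (int i = 0; i < 4; i++)
        printf("%ju\n", (uintmax_t)extended_ticks());
    return (0);
}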
*/ static void cpu_tick_calibrate(int reset) { static uint64_t c_last; uint64_t c_this, c_delta; static struct bintime t_last; struct bintime t_this, t_delta; uint32_t divi; if (reset) { /* The clock was stepped, abort & reset */ t_last.sec = 0; return; } /* we don't calibrate fixed rate cputicks */ if (!cpu_tick_variable) return; getbinuptime(&t_this); c_this = cpu_ticks(); if (t_last.sec != 0) { c_delta = c_this - c_last; t_delta = t_this; bintime_sub(&t_delta, &t_last); /* * Headroom: * 2^(64-20) / 16[s] = * 2^(44) / 16[s] = * 17.592.186.044.416 / 16 = * 1.099.511.627.776 [Hz] */ divi = t_delta.sec << 20; divi |= t_delta.frac >> (64 - 20); c_delta <<= 20; c_delta /= divi; if (c_delta > cpu_tick_frequency) { if (0 && bootverbose) printf("cpu_tick increased to %ju Hz\n", c_delta); cpu_tick_frequency = c_delta; } } c_last = c_this; t_last = t_this; } void -set_cputicker(cpu_tick_f *func, uint64_t freq, unsigned var) +set_cputicker(cpu_tick_f *func, uint64_t freq, bool isvariable) { if (func == NULL) { cpu_ticks = tc_cpu_ticks; } else { cpu_tick_frequency = freq; - cpu_tick_variable = var; + cpu_tick_variable = isvariable; cpu_ticks = func; } } uint64_t cpu_tickrate(void) { if (cpu_ticks == tc_cpu_ticks) return (tc_getfrequency()); return (cpu_tick_frequency); } /* * We need to be slightly careful converting cputicks to microseconds. * There is plenty of margin in 64 bits of microseconds (half a million * years) and in 64 bits at 4 GHz (146 years), but if we do a multiply * before divide conversion (to retain precision) we find that the * margin shrinks to 1.5 hours (one millionth of 146y). */ uint64_t cputick2usec(uint64_t tick) { uint64_t tr; tr = cpu_tickrate(); return ((tick / tr) * 1000000ULL) + ((tick % tr) * 1000000ULL) / tr; } cpu_tick_f *cpu_ticks = tc_cpu_ticks; static int vdso_th_enable = 1; static int sysctl_fast_gettime(SYSCTL_HANDLER_ARGS) { int old_vdso_th_enable, error; old_vdso_th_enable = vdso_th_enable; error = sysctl_handle_int(oidp, &old_vdso_th_enable, 0, req); if (error != 0) return (error); vdso_th_enable = old_vdso_th_enable; return (0); } SYSCTL_PROC(_kern_timecounter, OID_AUTO, fast_gettime, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0, sysctl_fast_gettime, "I", "Enable fast time of day"); uint32_t tc_fill_vdso_timehands(struct vdso_timehands *vdso_th) { struct timehands *th; uint32_t enabled; th = timehands; vdso_th->th_scale = th->th_scale; vdso_th->th_offset_count = th->th_offset_count; vdso_th->th_counter_mask = th->th_counter->tc_counter_mask; vdso_th->th_offset = th->th_offset; vdso_th->th_boottime = th->th_boottime; if (th->th_counter->tc_fill_vdso_timehands != NULL) { enabled = th->th_counter->tc_fill_vdso_timehands(vdso_th, th->th_counter); } else enabled = 0; if (!vdso_th_enable) enabled = 0; return (enabled); } #ifdef COMPAT_FREEBSD32 uint32_t tc_fill_vdso_timehands32(struct vdso_timehands32 *vdso_th32) { struct timehands *th; uint32_t enabled; th = timehands; *(uint64_t *)&vdso_th32->th_scale[0] = th->th_scale; vdso_th32->th_offset_count = th->th_offset_count; vdso_th32->th_counter_mask = th->th_counter->tc_counter_mask; vdso_th32->th_offset.sec = th->th_offset.sec; *(uint64_t *)&vdso_th32->th_offset.frac[0] = th->th_offset.frac; vdso_th32->th_boottime.sec = th->th_boottime.sec; *(uint64_t *)&vdso_th32->th_boottime.frac[0] = th->th_boottime.frac; if (th->th_counter->tc_fill_vdso_timehands32 != NULL) { enabled = th->th_counter->tc_fill_vdso_timehands32(vdso_th32, th->th_counter); } else enabled = 0; if (!vdso_th_enable) enabled = 0; return 
(enabled); } #endif #include "opt_ddb.h" #ifdef DDB #include DB_SHOW_COMMAND(timecounter, db_show_timecounter) { struct timehands *th; struct timecounter *tc; u_int val1, val2; th = timehands; tc = th->th_counter; val1 = tc->tc_get_timecount(tc); __compiler_membar(); val2 = tc->tc_get_timecount(tc); db_printf("timecounter %p %s\n", tc, tc->tc_name); db_printf(" mask %#x freq %ju qual %d flags %#x priv %p\n", tc->tc_counter_mask, (uintmax_t)tc->tc_frequency, tc->tc_quality, tc->tc_flags, tc->tc_priv); db_printf(" val %#x %#x\n", val1, val2); db_printf("timehands adj %#jx scale %#jx ldelta %d off_cnt %d gen %d\n", (uintmax_t)th->th_adjustment, (uintmax_t)th->th_scale, th->th_large_delta, th->th_offset_count, th->th_generation); db_printf(" offset %jd %jd boottime %jd %jd\n", (intmax_t)th->th_offset.sec, (uintmax_t)th->th_offset.frac, (intmax_t)th->th_boottime.sec, (uintmax_t)th->th_boottime.frac); } #endif diff --git a/sys/powerpc/powerpc/clock.c b/sys/powerpc/powerpc/clock.c index a530d6c71a6b..46495108e5f1 100644 --- a/sys/powerpc/powerpc/clock.c +++ b/sys/powerpc/powerpc/clock.c @@ -1,346 +1,346 @@ /*- * SPDX-License-Identifier: BSD-4-Clause AND BSD-2-Clause-FreeBSD * * Copyright (C) 1995, 1996 Wolfgang Solfrank. * Copyright (C) 1995, 1996 TooLs GmbH. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by TooLs GmbH. * 4. The name of TooLs GmbH may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $NetBSD: clock.c,v 1.9 2000/01/19 02:52:19 msaitoh Exp $ */ /* * Copyright (C) 2001 Benno Rice. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY Benno Rice ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Initially we assume a processor with a bus frequency of 12.5 MHz. */ static int initialized = 0; static uint64_t ps_per_tick = 80000; static u_long ticks_per_sec = 12500000; static u_long *decr_counts[MAXCPU]; static int decr_et_start(struct eventtimer *et, sbintime_t first, sbintime_t period); static int decr_et_stop(struct eventtimer *et); static timecounter_get_t decr_get_timecount; static uint32_t decr_vdso_timehands(struct vdso_timehands *vdso_th, struct timecounter *tc); #ifdef COMPAT_FREEBSD32 static uint32_t decr_vdso_timehands32(struct vdso_timehands32 *vdso_th32, struct timecounter *tc); #endif struct decr_state { int mode; /* 0 - off, 1 - periodic, 2 - one-shot. */ int32_t div; /* Periodic divisor. */ }; DPCPU_DEFINE_STATIC(struct decr_state, decr_state); static struct eventtimer decr_et; static struct timecounter decr_tc = { .tc_get_timecount = decr_get_timecount, .tc_counter_mask = ~0u, .tc_name = "timebase", .tc_quality = 1000, .tc_fill_vdso_timehands = decr_vdso_timehands, #ifdef COMPAT_FREEBSD32 .tc_fill_vdso_timehands32 = decr_vdso_timehands32, #endif }; /* * Decrementer interrupt handler. */ void decr_intr(struct trapframe *frame) { struct decr_state *s = DPCPU_PTR(decr_state); int nticks = 0; int32_t val; if (!initialized) return; (*decr_counts[curcpu])++; #ifdef BOOKE /* * Interrupt handler must reset DIS to avoid getting another * interrupt once EE is enabled. */ mtspr(SPR_TSR, TSR_DIS); #endif if (s->mode == 1) { /* * Based on the actual time delay since the last decrementer * reload, we arrange for earlier interrupt next time. */ __asm ("mfdec %0" : "=r"(val)); while (val < 0) { val += s->div; nticks++; } mtdec(val); } else if (s->mode == 2) { nticks = 1; decr_et_stop(NULL); } else if (s->mode == 0) { /* Potemkin timer ran out without an event. Just reset it. */ decr_et_stop(NULL); } while (nticks-- > 0) { if (decr_et.et_active) decr_et.et_event_cb(&decr_et, decr_et.et_arg); } } void cpu_initclocks(void) { decr_tc_init(); cpu_initclocks_bsp(); } /* * BSP early initialization. */ void decr_init(void) { struct cpuref cpu; char buf[32]; /* * Check the BSP's timebase frequency. Sometimes we can't find the BSP, * so fall back to the first CPU in this case. */ if (platform_smp_get_bsp(&cpu) != 0) platform_smp_first_cpu(&cpu); ticks_per_sec = platform_timebase_freq(&cpu); ps_per_tick = 1000000000000 / ticks_per_sec; - set_cputicker(mftb, ticks_per_sec, 0); + set_cputicker(mftb, ticks_per_sec, false); snprintf(buf, sizeof(buf), "cpu%d:decrementer", curcpu); intrcnt_add(buf, &decr_counts[curcpu]); decr_et_stop(NULL); initialized = 1; } #ifdef SMP /* * AP early initialization. 
*/ void decr_ap_init(void) { char buf[32]; snprintf(buf, sizeof(buf), "cpu%d:decrementer", curcpu); intrcnt_add(buf, &decr_counts[curcpu]); decr_et_stop(NULL); } #endif /* * Final initialization. */ void decr_tc_init(void) { decr_tc.tc_frequency = ticks_per_sec; tc_init(&decr_tc); decr_et.et_name = "decrementer"; decr_et.et_flags = ET_FLAGS_PERIODIC | ET_FLAGS_ONESHOT | ET_FLAGS_PERCPU; decr_et.et_quality = 1000; decr_et.et_frequency = ticks_per_sec; decr_et.et_min_period = (0x00000002LLU << 32) / ticks_per_sec; decr_et.et_max_period = (0x7fffffffLLU << 32) / ticks_per_sec; decr_et.et_start = decr_et_start; decr_et.et_stop = decr_et_stop; decr_et.et_priv = NULL; et_register(&decr_et); } uint32_t decr_vdso_timehands(struct vdso_timehands *vdso_th, struct timecounter *tc) { vdso_th->th_algo = VDSO_TH_ALGO_PPC_TB; bzero(vdso_th->th_res, sizeof(vdso_th->th_res)); return (initialized == 1); } #ifdef COMPAT_FREEBSD32 uint32_t decr_vdso_timehands32(struct vdso_timehands32 *vdso_th32, struct timecounter *tc) { vdso_th32->th_algo = VDSO_TH_ALGO_PPC_TB; bzero(vdso_th32->th_res, sizeof(vdso_th32->th_res)); return (initialized == 1); } #endif /* * Event timer start method. */ static int decr_et_start(struct eventtimer *et, sbintime_t first, sbintime_t period) { struct decr_state *s = DPCPU_PTR(decr_state); uint32_t fdiv; #ifdef BOOKE uint32_t tcr; #endif if (period != 0) { s->mode = 1; s->div = (decr_et.et_frequency * period) >> 32; } else { s->mode = 2; s->div = 0; } if (first != 0) fdiv = (decr_et.et_frequency * first) >> 32; else fdiv = s->div; #ifdef BOOKE tcr = mfspr(SPR_TCR); tcr |= TCR_DIE; if (s->mode == 1) { mtspr(SPR_DECAR, s->div); tcr |= TCR_ARE; } else tcr &= ~TCR_ARE; mtdec(fdiv); mtspr(SPR_TCR, tcr); #else mtdec(fdiv); #endif return (0); } /* * Event timer stop method. */ static int decr_et_stop(struct eventtimer *et) { struct decr_state *s = DPCPU_PTR(decr_state); #ifdef BOOKE uint32_t tcr; #endif s->mode = 0; s->div = 0x7fffffff; #ifdef BOOKE tcr = mfspr(SPR_TCR); tcr &= ~(TCR_DIE | TCR_ARE); mtspr(SPR_TCR, tcr); #else mtdec(s->div); #endif return (0); } /* * Timecounter get method. */ static unsigned decr_get_timecount(struct timecounter *tc) { return (mftb()); } /* * Wait for about n microseconds (at least!). */ void DELAY(int n) { volatile u_quad_t tb; u_quad_t ttb; TSENTER(); tb = mftb(); ttb = tb + howmany((uint64_t)n * 1000000, ps_per_tick); nop_prio_vlow(); while (tb < ttb) tb = mftb(); nop_prio_medium(); TSEXIT(); } diff --git a/sys/sys/systm.h b/sys/sys/systm.h index 98637c4f4838..f4b3421b76ba 100644 --- a/sys/sys/systm.h +++ b/sys/sys/systm.h @@ -1,573 +1,573 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1988, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)systm.h 8.7 (Berkeley) 3/29/95 * $FreeBSD$ */ #ifndef _SYS_SYSTM_H_ #define _SYS_SYSTM_H_ #include #include #include #include #include #include #include /* for people using printf mainly */ __NULLABILITY_PRAGMA_PUSH #ifdef _KERNEL extern int cold; /* nonzero if we are doing a cold boot */ extern int suspend_blocked; /* block suspend due to pending shutdown */ extern int rebooting; /* kern_reboot() has been called. */ extern char version[]; /* system version */ extern char compiler_version[]; /* compiler version */ extern char copyright[]; /* system copyright */ extern int kstack_pages; /* number of kernel stack pages */ extern u_long pagesizes[]; /* supported page sizes */ extern long physmem; /* physical memory */ extern long realmem; /* 'real' memory */ extern char *rootdevnames[2]; /* names of possible root devices */ extern int boothowto; /* reboot flags, from console subsystem */ extern int bootverbose; /* nonzero to print verbose messages */ extern int maxusers; /* system tune hint */ extern int ngroups_max; /* max # of supplemental groups */ extern int vm_guest; /* Running as virtual machine guest? */ extern u_long maxphys; /* max raw I/O transfer size */ /* * Detected virtual machine guest types. The intention is to expand * and/or add to the VM_GUEST_VM type if specific VM functionality is * ever implemented (e.g. vendor-specific paravirtualization features). * Keep in sync with vm_guest_sysctl_names[]. */ enum VM_GUEST { VM_GUEST_NO = 0, VM_GUEST_VM, VM_GUEST_XEN, VM_GUEST_HV, VM_GUEST_VMWARE, VM_GUEST_KVM, VM_GUEST_BHYVE, VM_GUEST_VBOX, VM_GUEST_PARALLELS, VM_LAST }; #endif /* KERNEL */ /* * Align variables. */ #define __read_mostly __section(".data.read_mostly") #define __read_frequently __section(".data.read_frequently") #define __exclusive_cache_line __aligned(CACHE_LINE_SIZE) \ __section(".data.exclusive_cache_line") #if defined(_STANDALONE) struct ucred; #endif #ifdef _KERNEL #include /* MAXCPU */ #include /* curthread */ #include extern int osreldate; extern const void *zero_region; /* address space maps to a zeroed page */ extern int unmapped_buf_allowed; #ifdef __LP64__ #define IOSIZE_MAX iosize_max() #define DEVFS_IOSIZE_MAX devfs_iosize_max() #else #define IOSIZE_MAX SSIZE_MAX #define DEVFS_IOSIZE_MAX SSIZE_MAX #endif /* * General function declarations. 
*/ struct inpcb; struct lock_object; struct malloc_type; struct mtx; struct proc; struct socket; struct thread; struct tty; struct ucred; struct uio; struct _jmp_buf; struct trapframe; struct eventtimer; int setjmp(struct _jmp_buf *) __returns_twice; void longjmp(struct _jmp_buf *, int) __dead2; int dumpstatus(vm_offset_t addr, off_t count); int nullop(void); int eopnotsupp(void); int ureadc(int, struct uio *); void hashdestroy(void *, struct malloc_type *, u_long); void *hashinit(int count, struct malloc_type *type, u_long *hashmask); void *hashinit_flags(int count, struct malloc_type *type, u_long *hashmask, int flags); #define HASH_NOWAIT 0x00000001 #define HASH_WAITOK 0x00000002 void *phashinit(int count, struct malloc_type *type, u_long *nentries); void *phashinit_flags(int count, struct malloc_type *type, u_long *nentries, int flags); void g_waitidle(void); void cpu_flush_dcache(void *, size_t); void cpu_rootconf(void); void critical_enter_KBI(void); void critical_exit_KBI(void); void critical_exit_preempt(void); void init_param1(void); void init_param2(long physpages); void init_static_kenv(char *, size_t); void tablefull(const char *); /* * Allocate per-thread "current" state in the linuxkpi */ extern int (*lkpi_alloc_current)(struct thread *, int); int linux_alloc_current_noop(struct thread *, int); #if (defined(KLD_MODULE) && !defined(KLD_TIED)) || defined(KTR_CRITICAL) || !defined(_KERNEL) || defined(GENOFFSET) #define critical_enter() critical_enter_KBI() #define critical_exit() critical_exit_KBI() #else static __inline void critical_enter(void) { struct thread_lite *td; td = (struct thread_lite *)curthread; td->td_critnest++; atomic_interrupt_fence(); } static __inline void critical_exit(void) { struct thread_lite *td; td = (struct thread_lite *)curthread; KASSERT(td->td_critnest != 0, ("critical_exit: td_critnest == 0")); atomic_interrupt_fence(); td->td_critnest--; atomic_interrupt_fence(); if (__predict_false(td->td_owepreempt)) critical_exit_preempt(); } #endif #ifdef EARLY_PRINTF typedef void early_putc_t(int ch); extern early_putc_t *early_putc; #endif int kvprintf(char const *, void (*)(int, void*), void *, int, __va_list) __printflike(1, 0); void log(int, const char *, ...) __printflike(2, 3); void log_console(struct uio *); void vlog(int, const char *, __va_list) __printflike(2, 0); int asprintf(char **ret, struct malloc_type *mtp, const char *format, ...) __printflike(3, 4); int printf(const char *, ...) __printflike(1, 2); int snprintf(char *, size_t, const char *, ...) __printflike(3, 4); int sprintf(char *buf, const char *, ...) __printflike(2, 3); int uprintf(const char *, ...) __printflike(1, 2); int vprintf(const char *, __va_list) __printflike(1, 0); int vasprintf(char **ret, struct malloc_type *mtp, const char *format, __va_list ap) __printflike(3, 0); int vsnprintf(char *, size_t, const char *, __va_list) __printflike(3, 0); int vsnrprintf(char *, size_t, int, const char *, __va_list) __printflike(4, 0); int vsprintf(char *buf, const char *, __va_list) __printflike(2, 0); int sscanf(const char *, char const * _Nonnull, ...) __scanflike(2, 3); int vsscanf(const char * _Nonnull, char const * _Nonnull, __va_list) __scanflike(2, 0); long strtol(const char *, char **, int); u_long strtoul(const char *, char **, int); quad_t strtoq(const char *, char **, int); u_quad_t strtouq(const char *, char **, int); void tprintf(struct proc *p, int pri, const char *, ...) 
__printflike(3, 4); void vtprintf(struct proc *, int, const char *, __va_list) __printflike(3, 0); void hexdump(const void *ptr, int length, const char *hdr, int flags); #define HD_COLUMN_MASK 0xff #define HD_DELIM_MASK 0xff00 #define HD_OMIT_COUNT (1 << 16) #define HD_OMIT_HEX (1 << 17) #define HD_OMIT_CHARS (1 << 18) #define ovbcopy(f, t, l) bcopy((f), (t), (l)) void explicit_bzero(void * _Nonnull, size_t); void *memset(void * _Nonnull buf, int c, size_t len); void *memcpy(void * _Nonnull to, const void * _Nonnull from, size_t len); void *memmove(void * _Nonnull dest, const void * _Nonnull src, size_t n); int memcmp(const void *b1, const void *b2, size_t len); #ifdef SAN_NEEDS_INTERCEPTORS #define SAN_INTERCEPTOR(func) \ __CONCAT(SAN_INTERCEPTOR_PREFIX, __CONCAT(_, func)) void *SAN_INTERCEPTOR(memset)(void *, int, size_t); void *SAN_INTERCEPTOR(memcpy)(void *, const void *, size_t); void *SAN_INTERCEPTOR(memmove)(void *, const void *, size_t); int SAN_INTERCEPTOR(memcmp)(const void *, const void *, size_t); #ifndef SAN_RUNTIME #define bcopy(from, to, len) SAN_INTERCEPTOR(memmove)((to), (from), (len)) #define bzero(buf, len) SAN_INTERCEPTOR(memset)((buf), 0, (len)) #define bcmp(b1, b2, len) SAN_INTERCEPTOR(memcmp)((b1), (b2), (len)) #define memset(buf, c, len) SAN_INTERCEPTOR(memset)((buf), (c), (len)) #define memcpy(to, from, len) SAN_INTERCEPTOR(memcpy)((to), (from), (len)) #define memmove(dest, src, n) SAN_INTERCEPTOR(memmove)((dest), (src), (n)) #define memcmp(b1, b2, len) SAN_INTERCEPTOR(memcmp)((b1), (b2), (len)) #endif /* !SAN_RUNTIME */ #else /* !SAN_NEEDS_INTERCEPTORS */ #define bcopy(from, to, len) __builtin_memmove((to), (from), (len)) #define bzero(buf, len) __builtin_memset((buf), 0, (len)) #define bcmp(b1, b2, len) __builtin_memcmp((b1), (b2), (len)) #define memset(buf, c, len) __builtin_memset((buf), (c), (len)) #define memcpy(to, from, len) __builtin_memcpy((to), (from), (len)) #define memmove(dest, src, n) __builtin_memmove((dest), (src), (n)) #define memcmp(b1, b2, len) __builtin_memcmp((b1), (b2), (len)) #endif /* SAN_NEEDS_INTERCEPTORS */ void *memset_early(void * _Nonnull buf, int c, size_t len); #define bzero_early(buf, len) memset_early((buf), 0, (len)) void *memcpy_early(void * _Nonnull to, const void * _Nonnull from, size_t len); void *memmove_early(void * _Nonnull dest, const void * _Nonnull src, size_t n); #define bcopy_early(from, to, len) memmove_early((to), (from), (len)) #define copystr(src, dst, len, outlen) ({ \ size_t __r, __len, *__outlen; \ \ __len = (len); \ __outlen = (outlen); \ __r = strlcpy((dst), (src), __len); \ if (__outlen != NULL) \ *__outlen = ((__r >= __len) ? __len : __r + 1); \ ((__r >= __len) ? 
ENAMETOOLONG : 0); \ }) int copyinstr(const void * __restrict udaddr, void * _Nonnull __restrict kaddr, size_t len, size_t * __restrict lencopied); int copyin(const void * __restrict udaddr, void * _Nonnull __restrict kaddr, size_t len); int copyin_nofault(const void * __restrict udaddr, void * _Nonnull __restrict kaddr, size_t len); int copyout(const void * _Nonnull __restrict kaddr, void * __restrict udaddr, size_t len); int copyout_nofault(const void * _Nonnull __restrict kaddr, void * __restrict udaddr, size_t len); #ifdef SAN_NEEDS_INTERCEPTORS int SAN_INTERCEPTOR(copyin)(const void *, void *, size_t); int SAN_INTERCEPTOR(copyinstr)(const void *, void *, size_t, size_t *); int SAN_INTERCEPTOR(copyout)(const void *, void *, size_t); #ifndef SAN_RUNTIME #define copyin(u, k, l) SAN_INTERCEPTOR(copyin)((u), (k), (l)) #define copyinstr(u, k, l, lc) SAN_INTERCEPTOR(copyinstr)((u), (k), (l), (lc)) #define copyout(k, u, l) SAN_INTERCEPTOR(copyout)((k), (u), (l)) #endif /* !SAN_RUNTIME */ #endif /* SAN_NEEDS_INTERCEPTORS */ int fubyte(volatile const void *base); long fuword(volatile const void *base); int fuword16(volatile const void *base); int32_t fuword32(volatile const void *base); int64_t fuword64(volatile const void *base); int fueword(volatile const void *base, long *val); int fueword32(volatile const void *base, int32_t *val); int fueword64(volatile const void *base, int64_t *val); int subyte(volatile void *base, int byte); int suword(volatile void *base, long word); int suword16(volatile void *base, int word); int suword32(volatile void *base, int32_t word); int suword64(volatile void *base, int64_t word); uint32_t casuword32(volatile uint32_t *base, uint32_t oldval, uint32_t newval); u_long casuword(volatile u_long *p, u_long oldval, u_long newval); int casueword32(volatile uint32_t *base, uint32_t oldval, uint32_t *oldvalp, uint32_t newval); int casueword(volatile u_long *p, u_long oldval, u_long *oldvalp, u_long newval); #if defined(SAN_NEEDS_INTERCEPTORS) && !defined(KCSAN) int SAN_INTERCEPTOR(fubyte)(volatile const void *base); int SAN_INTERCEPTOR(fuword16)(volatile const void *base); int SAN_INTERCEPTOR(fueword)(volatile const void *base, long *val); int SAN_INTERCEPTOR(fueword32)(volatile const void *base, int32_t *val); int SAN_INTERCEPTOR(fueword64)(volatile const void *base, int64_t *val); int SAN_INTERCEPTOR(subyte)(volatile void *base, int byte); int SAN_INTERCEPTOR(suword)(volatile void *base, long word); int SAN_INTERCEPTOR(suword16)(volatile void *base, int word); int SAN_INTERCEPTOR(suword32)(volatile void *base, int32_t word); int SAN_INTERCEPTOR(suword64)(volatile void *base, int64_t word); int SAN_INTERCEPTOR(casueword32)(volatile uint32_t *base, uint32_t oldval, uint32_t *oldvalp, uint32_t newval); int SAN_INTERCEPTOR(casueword)(volatile u_long *p, u_long oldval, u_long *oldvalp, u_long newval); #ifndef SAN_RUNTIME #define fubyte(b) SAN_INTERCEPTOR(fubyte)((b)) #define fuword16(b) SAN_INTERCEPTOR(fuword16)((b)) #define fueword(b, v) SAN_INTERCEPTOR(fueword)((b), (v)) #define fueword32(b, v) SAN_INTERCEPTOR(fueword32)((b), (v)) #define fueword64(b, v) SAN_INTERCEPTOR(fueword64)((b), (v)) #define subyte(b, w) SAN_INTERCEPTOR(subyte)((b), (w)) #define suword(b, w) SAN_INTERCEPTOR(suword)((b), (w)) #define suword16(b, w) SAN_INTERCEPTOR(suword16)((b), (w)) #define suword32(b, w) SAN_INTERCEPTOR(suword32)((b), (w)) #define suword64(b, w) SAN_INTERCEPTOR(suword64)((b), (w)) #define casueword32(b, o, p, n) SAN_INTERCEPTOR(casueword32)((b), (o), (p), (n)) #define 
casueword(b, o, p, n) SAN_INTERCEPTOR(casueword)((b), (o), (p), (n)) #endif /* !SAN_RUNTIME */ #endif /* SAN_NEEDS_INTERCEPTORS && !KCSAN */ void realitexpire(void *); int sysbeep(int hertz, sbintime_t duration); void hardclock(int cnt, int usermode); void hardclock_sync(int cpu); void statclock(int cnt, int usermode); void profclock(int cnt, int usermode, uintfptr_t pc); int hardclockintr(void); void startprofclock(struct proc *); void stopprofclock(struct proc *); void cpu_startprofclock(void); void cpu_stopprofclock(void); void suspendclock(void); void resumeclock(void); sbintime_t cpu_idleclock(void); void cpu_activeclock(void); void cpu_new_callout(int cpu, sbintime_t bt, sbintime_t bt_opt); void cpu_et_frequency(struct eventtimer *et, uint64_t newfreq); extern int cpu_disable_c2_sleep; extern int cpu_disable_c3_sleep; char *kern_getenv(const char *name); void freeenv(char *env); int getenv_int(const char *name, int *data); int getenv_uint(const char *name, unsigned int *data); int getenv_long(const char *name, long *data); int getenv_ulong(const char *name, unsigned long *data); int getenv_string(const char *name, char *data, int size); int getenv_int64(const char *name, int64_t *data); int getenv_uint64(const char *name, uint64_t *data); int getenv_quad(const char *name, quad_t *data); int getenv_bool(const char *name, bool *data); bool getenv_is_true(const char *name); bool getenv_is_false(const char *name); int kern_setenv(const char *name, const char *value); int kern_unsetenv(const char *name); int testenv(const char *name); int getenv_array(const char *name, void *data, int size, int *psize, int type_size, bool allow_signed); #define GETENV_UNSIGNED false /* negative numbers not allowed */ #define GETENV_SIGNED true /* negative numbers allowed */ typedef uint64_t (cpu_tick_f)(void); -void set_cputicker(cpu_tick_f *func, uint64_t freq, unsigned var); +void set_cputicker(cpu_tick_f *func, uint64_t freq, bool isvariable); extern cpu_tick_f *cpu_ticks; uint64_t cpu_tickrate(void); uint64_t cputick2usec(uint64_t tick); #include /* Initialize the world */ void consinit(void); void cpu_initclocks(void); void cpu_initclocks_bsp(void); void cpu_initclocks_ap(void); void usrinfoinit(void); /* Finalize the world */ void kern_reboot(int) __dead2; void shutdown_nice(int); /* Stubs for obsolete functions that used to be for interrupt management */ static __inline intrmask_t splhigh(void) { return 0; } static __inline intrmask_t splimp(void) { return 0; } static __inline intrmask_t splnet(void) { return 0; } static __inline intrmask_t spltty(void) { return 0; } static __inline void splx(intrmask_t ipl __unused) { return; } /* * Common `proc' functions are declared here so that proc.h can be included * less often. 
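 *
 * Usage sketch (hypothetical wait channel and wmesg): a classic tick-based
 * sleep such as
 *
 *	tsleep(&sc->sc_flag, PRIBIO, "exwait", hz);
 *
 * expands via the macros below to _sleep(..., tick_sbt * hz, 0, C_HARDCLOCK),
 * i.e. roughly a one-second timeout expressed in sbintime_t units.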
*/ int _sleep(const void * _Nonnull chan, struct lock_object *lock, int pri, const char *wmesg, sbintime_t sbt, sbintime_t pr, int flags); #define msleep(chan, mtx, pri, wmesg, timo) \ _sleep((chan), &(mtx)->lock_object, (pri), (wmesg), \ tick_sbt * (timo), 0, C_HARDCLOCK) #define msleep_sbt(chan, mtx, pri, wmesg, bt, pr, flags) \ _sleep((chan), &(mtx)->lock_object, (pri), (wmesg), (bt), (pr), \ (flags)) int msleep_spin_sbt(const void * _Nonnull chan, struct mtx *mtx, const char *wmesg, sbintime_t sbt, sbintime_t pr, int flags); #define msleep_spin(chan, mtx, wmesg, timo) \ msleep_spin_sbt((chan), (mtx), (wmesg), tick_sbt * (timo), \ 0, C_HARDCLOCK) int pause_sbt(const char *wmesg, sbintime_t sbt, sbintime_t pr, int flags); #define pause(wmesg, timo) \ pause_sbt((wmesg), tick_sbt * (timo), 0, C_HARDCLOCK) #define pause_sig(wmesg, timo) \ pause_sbt((wmesg), tick_sbt * (timo), 0, C_HARDCLOCK | C_CATCH) #define tsleep(chan, pri, wmesg, timo) \ _sleep((chan), NULL, (pri), (wmesg), tick_sbt * (timo), \ 0, C_HARDCLOCK) #define tsleep_sbt(chan, pri, wmesg, bt, pr, flags) \ _sleep((chan), NULL, (pri), (wmesg), (bt), (pr), (flags)) void wakeup(const void *chan); void wakeup_one(const void *chan); void wakeup_any(const void *chan); /* * Common `struct cdev *' stuff are declared here to avoid #include poisoning */ struct cdev; dev_t dev2udev(struct cdev *x); const char *devtoname(struct cdev *cdev); #ifdef __LP64__ size_t devfs_iosize_max(void); size_t iosize_max(void); #endif int poll_no_poll(int events); /* XXX: Should be void nanodelay(u_int nsec); */ void DELAY(int usec); /* Root mount holdback API */ struct root_hold_token { int flags; const char *who; TAILQ_ENTRY(root_hold_token) list; }; struct root_hold_token *root_mount_hold(const char *identifier); void root_mount_hold_token(const char *identifier, struct root_hold_token *h); void root_mount_rel(struct root_hold_token *h); int root_mounted(void); /* * Unit number allocation API. (kern/subr_unit.c) */ struct unrhdr; #define UNR_NO_MTX ((void *)(uintptr_t)-1) struct unrhdr *new_unrhdr(int low, int high, struct mtx *mutex); void init_unrhdr(struct unrhdr *uh, int low, int high, struct mtx *mutex); void delete_unrhdr(struct unrhdr *uh); void clear_unrhdr(struct unrhdr *uh); void clean_unrhdr(struct unrhdr *uh); void clean_unrhdrl(struct unrhdr *uh); int alloc_unr(struct unrhdr *uh); int alloc_unr_specific(struct unrhdr *uh, u_int item); int alloc_unrl(struct unrhdr *uh); void free_unr(struct unrhdr *uh, u_int item); #ifndef __LP64__ #define UNR64_LOCKED #endif struct unrhdr64 { uint64_t counter; }; static __inline void new_unrhdr64(struct unrhdr64 *unr64, uint64_t low) { unr64->counter = low; } #ifdef UNR64_LOCKED uint64_t alloc_unr64(struct unrhdr64 *); #else static __inline uint64_t alloc_unr64(struct unrhdr64 *unr64) { return (atomic_fetchadd_64(&unr64->counter, 1)); } #endif void intr_prof_stack_use(struct thread *td, struct trapframe *frame); void counted_warning(unsigned *counter, const char *msg); /* * APIs to manage deprecation and obsolescence. 
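 *
 * Usage sketch (hypothetical driver name and target release): a driver
 * slated for removal would announce that from its attach routine with
 *
 *	gone_in_dev(dev, 15, "examplebus(4) driver");
 *
 * which logs a removal notice via the helpers below; plain code paths use
 * gone_in(major, msg) the same way.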
*/ void _gone_in(int major, const char *msg); void _gone_in_dev(device_t dev, int major, const char *msg); #ifdef NO_OBSOLETE_CODE #define __gone_ok(m, msg) \ _Static_assert(m < P_OSREL_MAJOR(__FreeBSD_version)), \ "Obsolete code: " msg); #else #define __gone_ok(m, msg) #endif #define gone_in(major, msg) __gone_ok(major, msg) _gone_in(major, msg) #define gone_in_dev(dev, major, msg) __gone_ok(major, msg) _gone_in_dev(dev, major, msg) #ifdef INVARIANTS #define __diagused #else #define __diagused __unused #endif #ifdef WITNESS #define __witness_used #else #define __witness_used __unused #endif #endif /* _KERNEL */ __NULLABILITY_PRAGMA_POP #endif /* !_SYS_SYSTM_H_ */ diff --git a/sys/x86/x86/tsc.c b/sys/x86/x86/tsc.c index 96f209e4c08d..040e7ffcd3b8 100644 --- a/sys/x86/x86/tsc.c +++ b/sys/x86/x86/tsc.c @@ -1,998 +1,998 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 1998-2003 Poul-Henning Kamp * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_clock.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "cpufreq_if.h" uint64_t tsc_freq; int tsc_is_invariant; int tsc_perf_stat; static int tsc_early_calib_exact; static eventhandler_tag tsc_levels_tag, tsc_pre_tag, tsc_post_tag; SYSCTL_INT(_kern_timecounter, OID_AUTO, invariant_tsc, CTLFLAG_RDTUN, &tsc_is_invariant, 0, "Indicates whether the TSC is P-state invariant"); #ifdef SMP int smp_tsc; SYSCTL_INT(_kern_timecounter, OID_AUTO, smp_tsc, CTLFLAG_RDTUN, &smp_tsc, 0, "Indicates whether the TSC is safe to use in SMP mode"); int smp_tsc_adjust = 0; SYSCTL_INT(_kern_timecounter, OID_AUTO, smp_tsc_adjust, CTLFLAG_RDTUN, &smp_tsc_adjust, 0, "Try to adjust TSC on APs to match BSP"); #endif static int tsc_shift = 1; SYSCTL_INT(_kern_timecounter, OID_AUTO, tsc_shift, CTLFLAG_RDTUN, &tsc_shift, 0, "Shift to pre-apply for the maximum TSC frequency"); static int tsc_disabled; SYSCTL_INT(_machdep, OID_AUTO, disable_tsc, CTLFLAG_RDTUN, &tsc_disabled, 0, "Disable x86 Time Stamp Counter"); static int tsc_skip_calibration; SYSCTL_INT(_machdep, OID_AUTO, disable_tsc_calibration, CTLFLAG_RDTUN, &tsc_skip_calibration, 0, "Disable early TSC frequency calibration"); static void tsc_freq_changed(void *arg, const struct cf_level *level, int status); static void tsc_freq_changing(void *arg, const struct cf_level *level, int *status); static u_int tsc_get_timecount(struct timecounter *tc); static inline u_int tsc_get_timecount_low(struct timecounter *tc); static u_int tsc_get_timecount_lfence(struct timecounter *tc); static u_int tsc_get_timecount_low_lfence(struct timecounter *tc); static u_int tsc_get_timecount_mfence(struct timecounter *tc); static u_int tsc_get_timecount_low_mfence(struct timecounter *tc); static u_int tscp_get_timecount(struct timecounter *tc); static u_int tscp_get_timecount_low(struct timecounter *tc); static void tsc_levels_changed(void *arg, int unit); static uint32_t x86_tsc_vdso_timehands(struct vdso_timehands *vdso_th, struct timecounter *tc); #ifdef COMPAT_FREEBSD32 static uint32_t x86_tsc_vdso_timehands32(struct vdso_timehands32 *vdso_th32, struct timecounter *tc); #endif static struct timecounter tsc_timecounter = { .tc_get_timecount = tsc_get_timecount, .tc_counter_mask = ~0u, .tc_name = "TSC", .tc_quality = 800, /* adjusted in code */ .tc_fill_vdso_timehands = x86_tsc_vdso_timehands, #ifdef COMPAT_FREEBSD32 .tc_fill_vdso_timehands32 = x86_tsc_vdso_timehands32, #endif }; static int tsc_freq_cpuid_vm(void) { u_int regs[4]; if (vm_guest == VM_GUEST_NO) return (false); if (hv_high < 0x40000010) return (false); do_cpuid(0x40000010, regs); tsc_freq = (uint64_t)(regs[0]) * 1000; tsc_early_calib_exact = 1; return (true); } static void tsc_freq_vmware(void) { u_int regs[4]; vmware_hvcall(VMW_HVCMD_GETHZ, regs); if (regs[1] != UINT_MAX) tsc_freq = regs[0] | ((uint64_t)regs[1] << 32); tsc_early_calib_exact = 1; } static void tsc_freq_xen(void) { u_int regs[4]; /* * Must run *after* generic tsc_freq_cpuid_vm, so that when Xen is * emulating Viridian support the Viridian leaf is used instead. 
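 *
 * Illustrative note: the Xen time leaf returns the TSC rate in kHz in
 * %ecx, hence the multiply by 1000 below; e.g. a reported 2,500,000 kHz
 * gives tsc_freq = 2.5 GHz.  (Example value only.)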
*/ KASSERT(hv_high >= 0x40000003, ("Invalid max hypervisor leaf on Xen")); cpuid_count(0x40000003, 0, regs); tsc_freq = (uint64_t)(regs[2]) * 1000; tsc_early_calib_exact = 1; } /* * Calculate TSC frequency using information from the CPUID leaf 0x15 'Time * Stamp Counter and Nominal Core Crystal Clock'. If leaf 0x15 is not * functional, as it is on Skylake/Kabylake, try 0x16 'Processor Frequency * Information'. Leaf 0x16 is described in the SDM as informational only, but * we can use this value until late calibration is complete. */ static bool tsc_freq_cpuid(uint64_t *res) { u_int regs[4]; if (cpu_high < 0x15) return (false); do_cpuid(0x15, regs); if (regs[0] != 0 && regs[1] != 0 && regs[2] != 0) { *res = (uint64_t)regs[2] * regs[1] / regs[0]; return (true); } if (cpu_high < 0x16) return (false); do_cpuid(0x16, regs); if (regs[0] != 0) { *res = (uint64_t)regs[0] * 1000000; return (true); } return (false); } static bool tsc_freq_intel_brand(uint64_t *res) { char brand[48]; u_int regs[4]; uint64_t freq; char *p; u_int i; /* * Intel Processor Identification and the CPUID Instruction * Application Note 485. * http://www.intel.com/assets/pdf/appnote/241618.pdf */ if (cpu_exthigh >= 0x80000004) { p = brand; for (i = 0x80000002; i < 0x80000005; i++) { do_cpuid(i, regs); memcpy(p, regs, sizeof(regs)); p += sizeof(regs); } p = NULL; for (i = 0; i < sizeof(brand) - 1; i++) if (brand[i] == 'H' && brand[i + 1] == 'z') p = brand + i; if (p != NULL) { p -= 5; switch (p[4]) { case 'M': i = 1; break; case 'G': i = 1000; break; case 'T': i = 1000000; break; default: return (false); } #define C2D(c) ((c) - '0') if (p[1] == '.') { freq = C2D(p[0]) * 1000; freq += C2D(p[2]) * 100; freq += C2D(p[3]) * 10; freq *= i * 1000; } else { freq = C2D(p[0]) * 1000; freq += C2D(p[1]) * 100; freq += C2D(p[2]) * 10; freq += C2D(p[3]); freq *= i * 1000000; } #undef C2D *res = freq; return (true); } } return (false); } static void tsc_freq_tc(uint64_t *res) { uint64_t tsc1, tsc2; int64_t overhead; int count, i; overhead = 0; for (i = 0, count = 8; i < count; i++) { tsc1 = rdtsc_ordered(); DELAY(0); tsc2 = rdtsc_ordered(); if (i > 0) overhead += tsc2 - tsc1; } overhead /= count; tsc1 = rdtsc_ordered(); DELAY(100000); tsc2 = rdtsc_ordered(); tsc_freq = (tsc2 - tsc1 - overhead) * 10; } /* * Try to determine the TSC frequency using CPUID or hypercalls. If successful, * this lets use the TSC for early DELAY() calls instead of the 8254 timer, * which may be unreliable or entirely absent on contemporary systems. However, * avoid calibrating using the 8254 here so as to give hypervisors a chance to * register a timecounter that can be used instead. */ static void probe_tsc_freq_early(void) { #ifdef __i386__ /* The TSC is known to be broken on certain CPUs. 
*/ switch (cpu_vendor_id) { case CPU_VENDOR_AMD: switch (cpu_id & 0xFF0) { case 0x500: /* K5 Model 0 */ tsc_disabled = 1; return; } break; case CPU_VENDOR_CENTAUR: switch (cpu_id & 0xff0) { case 0x540: /* * http://www.centtech.com/c6_data_sheet.pdf * * I-12 RDTSC may return incoherent values in EDX:EAX * I-13 RDTSC hangs when certain event counters are used */ tsc_disabled = 1; return; } break; case CPU_VENDOR_NSC: switch (cpu_id & 0xff0) { case 0x540: if ((cpu_id & CPUID_STEPPING) == 0) { tsc_disabled = 1; return; } break; } break; } #endif switch (cpu_vendor_id) { case CPU_VENDOR_AMD: case CPU_VENDOR_HYGON: if ((amd_pminfo & AMDPM_TSC_INVARIANT) != 0 || (vm_guest == VM_GUEST_NO && CPUID_TO_FAMILY(cpu_id) >= 0x10)) tsc_is_invariant = 1; if (cpu_feature & CPUID_SSE2) { tsc_timecounter.tc_get_timecount = tsc_get_timecount_mfence; } break; case CPU_VENDOR_INTEL: if ((amd_pminfo & AMDPM_TSC_INVARIANT) != 0 || (vm_guest == VM_GUEST_NO && ((CPUID_TO_FAMILY(cpu_id) == 0x6 && CPUID_TO_MODEL(cpu_id) >= 0xe) || (CPUID_TO_FAMILY(cpu_id) == 0xf && CPUID_TO_MODEL(cpu_id) >= 0x3)))) tsc_is_invariant = 1; if (cpu_feature & CPUID_SSE2) { tsc_timecounter.tc_get_timecount = tsc_get_timecount_lfence; } break; case CPU_VENDOR_CENTAUR: if (vm_guest == VM_GUEST_NO && CPUID_TO_FAMILY(cpu_id) == 0x6 && CPUID_TO_MODEL(cpu_id) >= 0xf && (rdmsr(0x1203) & 0x100000000ULL) == 0) tsc_is_invariant = 1; if (cpu_feature & CPUID_SSE2) { tsc_timecounter.tc_get_timecount = tsc_get_timecount_lfence; } break; } if (tsc_freq_cpuid_vm()) { if (bootverbose) printf( "Early TSC frequency %juHz derived from hypervisor CPUID\n", (uintmax_t)tsc_freq); } else if (vm_guest == VM_GUEST_VMWARE) { tsc_freq_vmware(); if (bootverbose) printf( "Early TSC frequency %juHz derived from VMWare hypercall\n", (uintmax_t)tsc_freq); } else if (vm_guest == VM_GUEST_XEN) { tsc_freq_xen(); if (bootverbose) printf( "Early TSC frequency %juHz derived from Xen CPUID\n", (uintmax_t)tsc_freq); } else if (tsc_freq_cpuid(&tsc_freq)) { /* * If possible, use the value obtained from CPUID as the initial * frequency. This will be refined later during boot but is * good enough for now. The 8254 PIT is not functional on some * newer platforms anyway, so don't delay our boot for what * might be a garbage result. Late calibration is required if * the initial frequency was obtained from CPUID.16H, as the * derived value may be off by as much as 1%. */ if (bootverbose) printf("Early TSC frequency %juHz derived from CPUID\n", (uintmax_t)tsc_freq); } } /* * If we were unable to determine the TSC frequency via CPU registers, try * to calibrate against a known clock. */ static void probe_tsc_freq_late(void) { if (tsc_freq != 0) return; if (tsc_skip_calibration) { /* * Try to parse the brand string to obtain the nominal TSC * frequency. */ if (cpu_vendor_id == CPU_VENDOR_INTEL && tsc_freq_intel_brand(&tsc_freq)) { if (bootverbose) printf( "Early TSC frequency %juHz derived from brand string\n", (uintmax_t)tsc_freq); } else { tsc_disabled = 1; } } else { /* * Calibrate against a timecounter or the 8254 PIT. This * estimate will be refined later in tsc_calib(). */ tsc_freq_tc(&tsc_freq); if (bootverbose) printf( "Early TSC frequency %juHz calibrated from 8254 PIT\n", (uintmax_t)tsc_freq); } } void start_TSC(void) { if ((cpu_feature & CPUID_TSC) == 0 || tsc_disabled) return; probe_tsc_freq_late(); if (cpu_power_ecx & CPUID_PERF_STAT) { /* * XXX Some emulators expose host CPUID without actual support * for these MSRs. We must test whether they really work. 
*/ wrmsr(MSR_MPERF, 0); wrmsr(MSR_APERF, 0); DELAY(10); if (rdmsr(MSR_MPERF) > 0 && rdmsr(MSR_APERF) > 0) tsc_perf_stat = 1; } /* * Inform CPU accounting about our boot-time clock rate. This will * be updated if someone loads a cpufreq driver after boot that * discovers a new max frequency. * * The frequency may also be updated after late calibration is complete; * however, we register the TSC as the ticker now to avoid switching * counters after much of the kernel has already booted and potentially * sampled the CPU clock. */ if (tsc_freq != 0) set_cputicker(rdtsc, tsc_freq, !tsc_is_invariant); if (tsc_is_invariant) return; /* Register to find out about changes in CPU frequency. */ tsc_pre_tag = EVENTHANDLER_REGISTER(cpufreq_pre_change, tsc_freq_changing, NULL, EVENTHANDLER_PRI_FIRST); tsc_post_tag = EVENTHANDLER_REGISTER(cpufreq_post_change, tsc_freq_changed, NULL, EVENTHANDLER_PRI_FIRST); tsc_levels_tag = EVENTHANDLER_REGISTER(cpufreq_levels_changed, tsc_levels_changed, NULL, EVENTHANDLER_PRI_ANY); } #ifdef SMP /* * RDTSC is not a serializing instruction, and does not drain * instruction stream, so we need to drain the stream before executing * it. It could be fixed by use of RDTSCP, except the instruction is * not available everywhere. * * Use CPUID for draining in the boot-time SMP constistency test. The * timecounters use MFENCE for AMD CPUs, and LFENCE for others (Intel * and VIA) when SSE2 is present, and nothing on older machines which * also do not issue RDTSC prematurely. There, testing for SSE2 and * vendor is too cumbersome, and we learn about TSC presence from CPUID. * * Do not use do_cpuid(), since we do not need CPUID results, which * have to be written into memory with do_cpuid(). */ #define TSC_READ(x) \ static void \ tsc_read_##x(void *arg) \ { \ uint64_t *tsc = arg; \ u_int cpu = PCPU_GET(cpuid); \ \ __asm __volatile("cpuid" : : : "eax", "ebx", "ecx", "edx"); \ tsc[cpu * 3 + x] = rdtsc(); \ } TSC_READ(0) TSC_READ(1) TSC_READ(2) #undef TSC_READ #define N 1000 static void comp_smp_tsc(void *arg) { uint64_t *tsc; int64_t d1, d2; u_int cpu = PCPU_GET(cpuid); u_int i, j, size; size = (mp_maxid + 1) * 3; for (i = 0, tsc = arg; i < N; i++, tsc += size) CPU_FOREACH(j) { if (j == cpu) continue; d1 = tsc[cpu * 3 + 1] - tsc[j * 3]; d2 = tsc[cpu * 3 + 2] - tsc[j * 3 + 1]; if (d1 <= 0 || d2 <= 0) { smp_tsc = 0; return; } } } static void adj_smp_tsc(void *arg) { uint64_t *tsc; int64_t d, min, max; u_int cpu = PCPU_GET(cpuid); u_int first, i, size; first = CPU_FIRST(); if (cpu == first) return; min = INT64_MIN; max = INT64_MAX; size = (mp_maxid + 1) * 3; for (i = 0, tsc = arg; i < N; i++, tsc += size) { d = tsc[first * 3] - tsc[cpu * 3 + 1]; if (d > min) min = d; d = tsc[first * 3 + 1] - tsc[cpu * 3 + 2]; if (d > min) min = d; d = tsc[first * 3 + 1] - tsc[cpu * 3]; if (d < max) max = d; d = tsc[first * 3 + 2] - tsc[cpu * 3 + 1]; if (d < max) max = d; } if (min > max) return; d = min / 2 + max / 2; __asm __volatile ( "movl $0x10, %%ecx\n\t" "rdmsr\n\t" "addl %%edi, %%eax\n\t" "adcl %%esi, %%edx\n\t" "wrmsr\n" : /* No output */ : "D" ((uint32_t)d), "S" ((uint32_t)(d >> 32)) : "ax", "cx", "dx", "cc" ); } static int test_tsc(int adj_max_count) { uint64_t *data, *tsc; u_int i, size, adj; if ((!smp_tsc && !tsc_is_invariant)) return (-100); /* * Misbehavior of TSC under VirtualBox has been observed. In * particular, threads doing small (~1 second) sleeps may miss their * wakeup and hang around in sleep state, causing hangs on shutdown. 
*/ if (vm_guest == VM_GUEST_VBOX) return (0); TSENTER(); size = (mp_maxid + 1) * 3; data = malloc(sizeof(*data) * size * N, M_TEMP, M_WAITOK); adj = 0; retry: for (i = 0, tsc = data; i < N; i++, tsc += size) smp_rendezvous(tsc_read_0, tsc_read_1, tsc_read_2, tsc); smp_tsc = 1; /* XXX */ smp_rendezvous(smp_no_rendezvous_barrier, comp_smp_tsc, smp_no_rendezvous_barrier, data); if (!smp_tsc && adj < adj_max_count) { adj++; smp_rendezvous(smp_no_rendezvous_barrier, adj_smp_tsc, smp_no_rendezvous_barrier, data); goto retry; } free(data, M_TEMP); if (bootverbose) printf("SMP: %sed TSC synchronization test%s\n", smp_tsc ? "pass" : "fail", adj > 0 ? " after adjustment" : ""); TSEXIT(); if (smp_tsc && tsc_is_invariant) { switch (cpu_vendor_id) { case CPU_VENDOR_AMD: case CPU_VENDOR_HYGON: /* * Processor Programming Reference (PPR) for AMD * Family 17h states that the TSC uses a common * reference for all sockets, cores and threads. */ if (CPUID_TO_FAMILY(cpu_id) >= 0x17) return (1000); /* * Starting with Family 15h processors, TSC clock * source is in the north bridge. Check whether * we have a single-socket/multi-core platform. * XXX Need more work for complex cases. */ if (CPUID_TO_FAMILY(cpu_id) < 0x15 || (amd_feature2 & AMDID2_CMP) == 0 || smp_cpus > (cpu_procinfo2 & AMDID_CMP_CORES) + 1) break; return (1000); case CPU_VENDOR_INTEL: /* * XXX Assume Intel platforms have synchronized TSCs. */ return (1000); } return (800); } return (-100); } #undef N #endif /* SMP */ static void init_TSC_tc(void) { uint64_t max_freq; int shift; if ((cpu_feature & CPUID_TSC) == 0 || tsc_disabled) return; /* * Limit timecounter frequency to fit in an int and prevent it from * overflowing too fast. */ max_freq = UINT_MAX; /* * Intel CPUs without a C-state invariant TSC can stop the TSC * in either C2 or C3. Disable use of C2 and C3 while using * the TSC as the timecounter. The timecounter can be changed * to enable C2 and C3. * * Note that the TSC is used as the cputicker for computing * thread runtime regardless of the timecounter setting, so * using an alternate timecounter and enabling C2 or C3 can * result incorrect runtimes for kernel idle threads (but not * for any non-idle threads). */ if (cpu_vendor_id == CPU_VENDOR_INTEL && (amd_pminfo & AMDPM_TSC_INVARIANT) == 0) { tsc_timecounter.tc_flags |= TC_FLAGS_C2STOP; if (bootverbose) printf("TSC timecounter disables C2 and C3.\n"); } /* * We can not use the TSC in SMP mode unless the TSCs on all CPUs * are synchronized. If the user is sure that the system has * synchronized TSCs, set kern.timecounter.smp_tsc tunable to a * non-zero value. The TSC seems unreliable in virtualized SMP * environments, so it is set to a negative quality in those cases. */ #ifdef SMP if (mp_ncpus > 1) tsc_timecounter.tc_quality = test_tsc(smp_tsc_adjust); else #endif /* SMP */ if (tsc_is_invariant) tsc_timecounter.tc_quality = 1000; max_freq >>= tsc_shift; for (shift = 0; shift <= 31 && (tsc_freq >> shift) > max_freq; shift++) ; /* * Timecounter implementation selection, top to bottom: * - If RDTSCP is available, use RDTSCP. * - If fence instructions are provided (SSE2), use LFENCE;RDTSC * on Intel, and MFENCE;RDTSC on AMD. * - For really old CPUs, just use RDTSC. */ if ((amd_feature & AMDID_RDTSCP) != 0) { tsc_timecounter.tc_get_timecount = shift > 0 ? tscp_get_timecount_low : tscp_get_timecount; } else if ((cpu_feature & CPUID_SSE2) != 0 && mp_ncpus > 1) { if (cpu_vendor_id == CPU_VENDOR_AMD || cpu_vendor_id == CPU_VENDOR_HYGON) { tsc_timecounter.tc_get_timecount = shift > 0 ? 
tsc_get_timecount_low_mfence : tsc_get_timecount_mfence; } else { tsc_timecounter.tc_get_timecount = shift > 0 ? tsc_get_timecount_low_lfence : tsc_get_timecount_lfence; } } else { tsc_timecounter.tc_get_timecount = shift > 0 ? tsc_get_timecount_low : tsc_get_timecount; } if (shift > 0) { tsc_timecounter.tc_name = "TSC-low"; if (bootverbose) printf("TSC timecounter discards lower %d bit(s)\n", shift); } if (tsc_freq != 0) { tsc_timecounter.tc_frequency = tsc_freq >> shift; tsc_timecounter.tc_priv = (void *)(intptr_t)shift; /* * Timecounter registration is deferred until after late * calibration is finished. */ } } SYSINIT(tsc_tc, SI_SUB_SMP, SI_ORDER_ANY, init_TSC_tc, NULL); static void tsc_update_freq(uint64_t new_freq) { atomic_store_rel_64(&tsc_freq, new_freq); atomic_store_rel_64(&tsc_timecounter.tc_frequency, new_freq >> (int)(intptr_t)tsc_timecounter.tc_priv); } void tsc_init(void) { if ((cpu_feature & CPUID_TSC) == 0 || tsc_disabled) return; probe_tsc_freq_early(); } /* * Perform late calibration of the TSC frequency once ACPI-based timecounters * are available. At this point timehands are not set up, so we read the * highest-quality timecounter directly rather than using (s)binuptime(). */ void tsc_calibrate(void) { uint64_t freq; if (tsc_disabled) return; if (tsc_early_calib_exact) goto calibrated; fpu_kern_enter(curthread, NULL, FPU_KERN_NOCTX); freq = clockcalib(rdtsc_ordered, "TSC"); fpu_kern_leave(curthread, NULL); tsc_update_freq(freq); calibrated: tc_init(&tsc_timecounter); set_cputicker(rdtsc, tsc_freq, !tsc_is_invariant); } void resume_TSC(void) { #ifdef SMP int quality; /* If TSC was not good on boot, it is unlikely to become good now. */ if (tsc_timecounter.tc_quality < 0) return; /* Nothing to do with UP. */ if (mp_ncpus < 2) return; /* * If TSC was good, a single synchronization should be enough, * but honour smp_tsc_adjust if it's set. */ quality = test_tsc(MAX(smp_tsc_adjust, 1)); if (quality != tsc_timecounter.tc_quality) { printf("TSC timecounter quality changed: %d -> %d\n", tsc_timecounter.tc_quality, quality); tsc_timecounter.tc_quality = quality; } #endif /* SMP */ } /* * When cpufreq levels change, find out about the (new) max frequency. We * use this to update CPU accounting in case it got a lower estimate at boot. */ static void tsc_levels_changed(void *arg, int unit) { device_t cf_dev; struct cf_level *levels; int count, error; uint64_t max_freq; /* Only use values from the first CPU, assuming all are equal. */ if (unit != 0) return; /* Find the appropriate cpufreq device instance. */ cf_dev = devclass_get_device(devclass_find("cpufreq"), unit); if (cf_dev == NULL) { printf("tsc_levels_changed() called but no cpufreq device?\n"); return; } /* Get settings from the device and find the max frequency. */ count = 64; levels = malloc(count * sizeof(*levels), M_TEMP, M_NOWAIT); if (levels == NULL) return; error = CPUFREQ_LEVELS(cf_dev, levels, &count); if (error == 0 && count != 0) { max_freq = (uint64_t)levels[0].total_set.freq * 1000000; - set_cputicker(rdtsc, max_freq, 1); + set_cputicker(rdtsc, max_freq, true); } else printf("tsc_levels_changed: no max freq found\n"); free(levels, M_TEMP); } /* * If the TSC timecounter is in use, veto the pending change. It may be * possible in the future to handle a dynamically-changing timecounter rate. 
*/ static void tsc_freq_changing(void *arg, const struct cf_level *level, int *status) { if (*status != 0 || timecounter != &tsc_timecounter) return; printf("timecounter TSC must not be in use when " "changing frequencies; change denied\n"); *status = EBUSY; } /* Update TSC freq with the value indicated by the caller. */ static void tsc_freq_changed(void *arg, const struct cf_level *level, int status) { uint64_t freq; /* If there was an error during the transition, don't do anything. */ if (tsc_disabled || status != 0) return; /* Total setting for this level gives the new frequency in MHz. */ freq = (uint64_t)level->total_set.freq * 1000000; tsc_update_freq(freq); } static int sysctl_machdep_tsc_freq(SYSCTL_HANDLER_ARGS) { int error; uint64_t freq; freq = atomic_load_acq_64(&tsc_freq); if (freq == 0) return (EOPNOTSUPP); error = sysctl_handle_64(oidp, &freq, 0, req); if (error == 0 && req->newptr != NULL) tsc_update_freq(freq); return (error); } SYSCTL_PROC(_machdep, OID_AUTO, tsc_freq, CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0, sysctl_machdep_tsc_freq, "QU", "Time Stamp Counter frequency"); static u_int tsc_get_timecount(struct timecounter *tc __unused) { return (rdtsc32()); } static u_int tscp_get_timecount(struct timecounter *tc __unused) { return (rdtscp32()); } static inline u_int tsc_get_timecount_low(struct timecounter *tc) { uint32_t rv; __asm __volatile("rdtsc; shrd %%cl, %%edx, %0" : "=a" (rv) : "c" ((int)(intptr_t)tc->tc_priv) : "edx"); return (rv); } static u_int tscp_get_timecount_low(struct timecounter *tc) { uint32_t rv; __asm __volatile("rdtscp; movl %1, %%ecx; shrd %%cl, %%edx, %0" : "=&a" (rv) : "m" (tc->tc_priv) : "ecx", "edx"); return (rv); } static u_int tsc_get_timecount_lfence(struct timecounter *tc __unused) { lfence(); return (rdtsc32()); } static u_int tsc_get_timecount_low_lfence(struct timecounter *tc) { lfence(); return (tsc_get_timecount_low(tc)); } static u_int tsc_get_timecount_mfence(struct timecounter *tc __unused) { mfence(); return (rdtsc32()); } static u_int tsc_get_timecount_low_mfence(struct timecounter *tc) { mfence(); return (tsc_get_timecount_low(tc)); } static uint32_t x86_tsc_vdso_timehands(struct vdso_timehands *vdso_th, struct timecounter *tc) { vdso_th->th_algo = VDSO_TH_ALGO_X86_TSC; vdso_th->th_x86_shift = (int)(intptr_t)tc->tc_priv; vdso_th->th_x86_hpet_idx = 0xffffffff; vdso_th->th_x86_pvc_last_systime = 0; vdso_th->th_x86_pvc_stable_mask = 0; bzero(vdso_th->th_res, sizeof(vdso_th->th_res)); return (1); } #ifdef COMPAT_FREEBSD32 static uint32_t x86_tsc_vdso_timehands32(struct vdso_timehands32 *vdso_th32, struct timecounter *tc) { vdso_th32->th_algo = VDSO_TH_ALGO_X86_TSC; vdso_th32->th_x86_shift = (int)(intptr_t)tc->tc_priv; vdso_th32->th_x86_hpet_idx = 0xffffffff; vdso_th32->th_x86_pvc_last_systime = 0; vdso_th32->th_x86_pvc_stable_mask = 0; bzero(vdso_th32->th_res, sizeof(vdso_th32->th_res)); return (1); } #endif