diff --git a/sys/dev/acpica/acpi_hpet.c b/sys/dev/acpica/acpi_hpet.c index e35e3808a980..e6f83512feed 100644 --- a/sys/dev/acpica/acpi_hpet.c +++ b/sys/dev/acpica/acpi_hpet.c @@ -1,1013 +1,1014 @@ /*- * Copyright (c) 2005 Poul-Henning Kamp * Copyright (c) 2010 Alexander Motin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_acpi.h" #if defined(__amd64__) #define DEV_APIC #else #include "opt_apic.h" #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DEV_APIC #include "pcib_if.h" #endif #define HPET_VENDID_AMD 0x4353 #define HPET_VENDID_AMD2 0x1022 #define HPET_VENDID_HYGON 0x1d94 #define HPET_VENDID_INTEL 0x8086 #define HPET_VENDID_NVIDIA 0x10de #define HPET_VENDID_SW 0x1166 ACPI_SERIAL_DECL(hpet, "ACPI HPET support"); static devclass_t hpet_devclass; /* ACPI CA debugging */ #define _COMPONENT ACPI_TIMER ACPI_MODULE_NAME("HPET") struct hpet_softc { device_t dev; int mem_rid; int intr_rid; int irq; int useirq; int legacy_route; int per_cpu; uint32_t allowed_irqs; struct resource *mem_res; struct resource *intr_res; void *intr_handle; ACPI_HANDLE handle; uint32_t acpi_uid; uint64_t freq; uint32_t caps; struct timecounter tc; struct hpet_timer { struct eventtimer et; struct hpet_softc *sc; int num; int mode; #define TIMER_STOPPED 0 #define TIMER_PERIODIC 1 #define TIMER_ONESHOT 2 int intr_rid; int irq; int pcpu_cpu; int pcpu_misrouted; int pcpu_master; int pcpu_slaves[MAXCPU]; struct resource *intr_res; void *intr_handle; uint32_t caps; uint32_t vectors; uint32_t div; uint32_t next; char name[8]; } t[32]; int num_timers; struct cdev *pdev; int mmap_allow; int mmap_allow_write; }; static d_open_t hpet_open; static d_mmap_t hpet_mmap; static struct cdevsw hpet_cdevsw = { .d_version = D_VERSION, .d_name = "hpet", .d_open = hpet_open, .d_mmap = hpet_mmap, }; static u_int hpet_get_timecount(struct timecounter *tc); static void hpet_test(struct hpet_softc *sc); static char *hpet_ids[] = { "PNP0103", NULL }; /* Knob to disable acpi_hpet device */ bool acpi_hpet_disabled = false; static u_int hpet_get_timecount(struct timecounter *tc) { struct hpet_softc *sc; sc = tc->tc_priv; return (bus_read_4(sc->mem_res, 
HPET_MAIN_COUNTER)); } uint32_t hpet_vdso_timehands(struct vdso_timehands *vdso_th, struct timecounter *tc) { struct hpet_softc *sc; sc = tc->tc_priv; vdso_th->th_algo = VDSO_TH_ALGO_X86_HPET; vdso_th->th_x86_shift = 0; vdso_th->th_x86_hpet_idx = device_get_unit(sc->dev); vdso_th->th_x86_pvc_last_systime = 0; vdso_th->th_x86_pvc_stable_mask = 0; bzero(vdso_th->th_res, sizeof(vdso_th->th_res)); return (sc->mmap_allow != 0); } #ifdef COMPAT_FREEBSD32 uint32_t hpet_vdso_timehands32(struct vdso_timehands32 *vdso_th32, struct timecounter *tc) { struct hpet_softc *sc; sc = tc->tc_priv; vdso_th32->th_algo = VDSO_TH_ALGO_X86_HPET; vdso_th32->th_x86_shift = 0; vdso_th32->th_x86_hpet_idx = device_get_unit(sc->dev); - vdso_th32->th_x86_pvc_last_systime = 0; + vdso_th32->th_x86_pvc_last_systime[0] = 0; + vdso_th32->th_x86_pvc_last_systime[1] = 0; vdso_th32->th_x86_pvc_stable_mask = 0; bzero(vdso_th32->th_res, sizeof(vdso_th32->th_res)); return (sc->mmap_allow != 0); } #endif static void hpet_enable(struct hpet_softc *sc) { uint32_t val; val = bus_read_4(sc->mem_res, HPET_CONFIG); if (sc->legacy_route) val |= HPET_CNF_LEG_RT; else val &= ~HPET_CNF_LEG_RT; val |= HPET_CNF_ENABLE; bus_write_4(sc->mem_res, HPET_CONFIG, val); } static void hpet_disable(struct hpet_softc *sc) { uint32_t val; val = bus_read_4(sc->mem_res, HPET_CONFIG); val &= ~HPET_CNF_ENABLE; bus_write_4(sc->mem_res, HPET_CONFIG, val); } static int hpet_start(struct eventtimer *et, sbintime_t first, sbintime_t period) { struct hpet_timer *mt = (struct hpet_timer *)et->et_priv; struct hpet_timer *t; struct hpet_softc *sc = mt->sc; uint32_t fdiv, now; t = (mt->pcpu_master < 0) ? 
mt : &sc->t[mt->pcpu_slaves[curcpu]]; if (period != 0) { t->mode = TIMER_PERIODIC; t->div = (sc->freq * period) >> 32; } else { t->mode = TIMER_ONESHOT; t->div = 0; } if (first != 0) fdiv = (sc->freq * first) >> 32; else fdiv = t->div; if (t->irq < 0) bus_write_4(sc->mem_res, HPET_ISR, 1 << t->num); t->caps |= HPET_TCNF_INT_ENB; now = bus_read_4(sc->mem_res, HPET_MAIN_COUNTER); restart: t->next = now + fdiv; if (t->mode == TIMER_PERIODIC && (t->caps & HPET_TCAP_PER_INT)) { t->caps |= HPET_TCNF_TYPE; bus_write_4(sc->mem_res, HPET_TIMER_CAP_CNF(t->num), t->caps | HPET_TCNF_VAL_SET); bus_write_4(sc->mem_res, HPET_TIMER_COMPARATOR(t->num), t->next); bus_write_4(sc->mem_res, HPET_TIMER_COMPARATOR(t->num), t->div); } else { t->caps &= ~HPET_TCNF_TYPE; bus_write_4(sc->mem_res, HPET_TIMER_CAP_CNF(t->num), t->caps); bus_write_4(sc->mem_res, HPET_TIMER_COMPARATOR(t->num), t->next); } now = bus_read_4(sc->mem_res, HPET_MAIN_COUNTER); if ((int32_t)(now - t->next + HPET_MIN_CYCLES) >= 0) { fdiv *= 2; goto restart; } return (0); } static int hpet_stop(struct eventtimer *et) { struct hpet_timer *mt = (struct hpet_timer *)et->et_priv; struct hpet_timer *t; struct hpet_softc *sc = mt->sc; t = (mt->pcpu_master < 0) ? mt : &sc->t[mt->pcpu_slaves[curcpu]]; t->mode = TIMER_STOPPED; t->caps &= ~(HPET_TCNF_INT_ENB | HPET_TCNF_TYPE); bus_write_4(sc->mem_res, HPET_TIMER_CAP_CNF(t->num), t->caps); return (0); } static int hpet_intr_single(void *arg) { struct hpet_timer *t = (struct hpet_timer *)arg; struct hpet_timer *mt; struct hpet_softc *sc = t->sc; uint32_t now; if (t->mode == TIMER_STOPPED) return (FILTER_STRAY); /* Check that per-CPU timer interrupt reached right CPU. */ if (t->pcpu_cpu >= 0 && t->pcpu_cpu != curcpu) { if ((++t->pcpu_misrouted) % 32 == 0) { printf("HPET interrupt routed to the wrong CPU" " (timer %d CPU %d -> %d)!\n", t->num, t->pcpu_cpu, curcpu); } /* * Reload timer, hoping that next time may be more lucky * (system will manage proper interrupt binding). 
*/ if ((t->mode == TIMER_PERIODIC && (t->caps & HPET_TCAP_PER_INT) == 0) || t->mode == TIMER_ONESHOT) { t->next = bus_read_4(sc->mem_res, HPET_MAIN_COUNTER) + sc->freq / 8; bus_write_4(sc->mem_res, HPET_TIMER_COMPARATOR(t->num), t->next); } return (FILTER_HANDLED); } if (t->mode == TIMER_PERIODIC && (t->caps & HPET_TCAP_PER_INT) == 0) { t->next += t->div; now = bus_read_4(sc->mem_res, HPET_MAIN_COUNTER); if ((int32_t)((now + t->div / 2) - t->next) > 0) t->next = now + t->div / 2; bus_write_4(sc->mem_res, HPET_TIMER_COMPARATOR(t->num), t->next); } else if (t->mode == TIMER_ONESHOT) t->mode = TIMER_STOPPED; mt = (t->pcpu_master < 0) ? t : &sc->t[t->pcpu_master]; if (mt->et.et_active) mt->et.et_event_cb(&mt->et, mt->et.et_arg); return (FILTER_HANDLED); } static int hpet_intr(void *arg) { struct hpet_softc *sc = (struct hpet_softc *)arg; int i; uint32_t val; val = bus_read_4(sc->mem_res, HPET_ISR); if (val) { bus_write_4(sc->mem_res, HPET_ISR, val); val &= sc->useirq; for (i = 0; i < sc->num_timers; i++) { if ((val & (1 << i)) == 0) continue; hpet_intr_single(&sc->t[i]); } return (FILTER_HANDLED); } return (FILTER_STRAY); } uint32_t hpet_get_uid(device_t dev) { struct hpet_softc *sc; sc = device_get_softc(dev); return (sc->acpi_uid); } static ACPI_STATUS hpet_find(ACPI_HANDLE handle, UINT32 level, void *context, void **status) { char **ids; uint32_t id = (uint32_t)(uintptr_t)context; uint32_t uid = 0; for (ids = hpet_ids; *ids != NULL; ids++) { if (acpi_MatchHid(handle, *ids)) break; } if (*ids == NULL) return (AE_OK); if (ACPI_FAILURE(acpi_GetInteger(handle, "_UID", &uid)) || id == uid) *status = acpi_get_device(handle); return (AE_OK); } /* * Find an existing IRQ resource that matches the requested IRQ range * and return its RID. If one is not found, use a new RID. 
*/ static int hpet_find_irq_rid(device_t dev, u_long start, u_long end) { rman_res_t irq; int error, rid; for (rid = 0;; rid++) { error = bus_get_resource(dev, SYS_RES_IRQ, rid, &irq, NULL); if (error != 0 || (start <= irq && irq <= end)) return (rid); } } static int hpet_open(struct cdev *cdev, int oflags, int devtype, struct thread *td) { struct hpet_softc *sc; sc = cdev->si_drv1; if (!sc->mmap_allow) return (EPERM); else return (0); } static int hpet_mmap(struct cdev *cdev, vm_ooffset_t offset, vm_paddr_t *paddr, int nprot, vm_memattr_t *memattr) { struct hpet_softc *sc; sc = cdev->si_drv1; if (offset >= rman_get_size(sc->mem_res)) return (EINVAL); if (!sc->mmap_allow_write && (nprot & PROT_WRITE)) return (EPERM); *paddr = rman_get_start(sc->mem_res) + offset; *memattr = VM_MEMATTR_UNCACHEABLE; return (0); } /* Discover the HPET via the ACPI table of the same name. */ static void hpet_identify(driver_t *driver, device_t parent) { ACPI_TABLE_HPET *hpet; ACPI_STATUS status; device_t child; int i; /* Only one HPET device can be added. */ if (devclass_get_device(hpet_devclass, 0)) return; for (i = 1; ; i++) { /* Search for HPET table. */ status = AcpiGetTable(ACPI_SIG_HPET, i, (ACPI_TABLE_HEADER **)&hpet); if (ACPI_FAILURE(status)) return; /* Search for HPET device with same ID. */ child = NULL; AcpiWalkNamespace(ACPI_TYPE_DEVICE, ACPI_ROOT_OBJECT, 100, hpet_find, NULL, (void *)(uintptr_t)hpet->Sequence, (void *)&child); /* If found - let it be probed in normal way. */ if (child) { if (bus_get_resource(child, SYS_RES_MEMORY, 0, NULL, NULL) != 0) bus_set_resource(child, SYS_RES_MEMORY, 0, hpet->Address.Address, HPET_MEM_WIDTH); continue; } /* If not - create it from table info. 
*/ child = BUS_ADD_CHILD(parent, 2, "hpet", 0); if (child == NULL) { printf("%s: can't add child\n", __func__); continue; } bus_set_resource(child, SYS_RES_MEMORY, 0, hpet->Address.Address, HPET_MEM_WIDTH); } } static int hpet_probe(device_t dev) { int rv; ACPI_FUNCTION_TRACE((char *)(uintptr_t) __func__); if (acpi_disabled("hpet") || acpi_hpet_disabled) return (ENXIO); if (acpi_get_handle(dev) != NULL) rv = ACPI_ID_PROBE(device_get_parent(dev), dev, hpet_ids, NULL); else rv = 0; if (rv <= 0) device_set_desc(dev, "High Precision Event Timer"); return (rv); } static int hpet_attach(device_t dev) { struct hpet_softc *sc; struct hpet_timer *t; struct make_dev_args mda; int i, j, num_msi, num_timers, num_percpu_et, num_percpu_t, cur_cpu; int pcpu_master, error; rman_res_t hpet_region_size; static int maxhpetet = 0; uint32_t val, val2, cvectors, dvectors; uint16_t vendor, rev; ACPI_FUNCTION_TRACE((char *)(uintptr_t) __func__); sc = device_get_softc(dev); sc->dev = dev; sc->handle = acpi_get_handle(dev); sc->mem_rid = 0; sc->mem_res = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &sc->mem_rid, RF_ACTIVE); if (sc->mem_res == NULL) return (ENOMEM); hpet_region_size = rman_get_size(sc->mem_res); /* Validate that the region is big enough for the control registers. */ if (hpet_region_size < HPET_MEM_MIN_WIDTH) { device_printf(dev, "memory region width %jd too small\n", hpet_region_size); bus_free_resource(dev, SYS_RES_MEMORY, sc->mem_res); return (ENXIO); } /* Be sure timer is enabled. */ hpet_enable(sc); /* Read basic statistics about the timer. 
*/ val = bus_read_4(sc->mem_res, HPET_PERIOD); if (val == 0) { device_printf(dev, "invalid period\n"); hpet_disable(sc); bus_free_resource(dev, SYS_RES_MEMORY, sc->mem_res); return (ENXIO); } sc->freq = (1000000000000000LL + val / 2) / val; sc->caps = bus_read_4(sc->mem_res, HPET_CAPABILITIES); vendor = (sc->caps & HPET_CAP_VENDOR_ID) >> 16; rev = sc->caps & HPET_CAP_REV_ID; num_timers = 1 + ((sc->caps & HPET_CAP_NUM_TIM) >> 8); /* * ATI/AMD violates IA-PC HPET (High Precision Event Timers) * Specification and provides an off by one number * of timers/comparators. * Additionally, they use unregistered value in VENDOR_ID field. */ if (vendor == HPET_VENDID_AMD && rev < 0x10 && num_timers > 0) num_timers--; /* * Now validate that the region is big enough to address all counters. */ if (hpet_region_size < HPET_TIMER_CAP_CNF(num_timers)) { device_printf(dev, "memory region width %jd too small for %d timers\n", hpet_region_size, num_timers); hpet_disable(sc); bus_free_resource(dev, SYS_RES_MEMORY, sc->mem_res); return (ENXIO); } sc->num_timers = num_timers; if (bootverbose) { device_printf(dev, "vendor 0x%x, rev 0x%x, %jdHz%s, %d timers,%s\n", vendor, rev, sc->freq, (sc->caps & HPET_CAP_COUNT_SIZE) ? " 64bit" : "", num_timers, (sc->caps & HPET_CAP_LEG_RT) ? " legacy route" : ""); } for (i = 0; i < num_timers; i++) { t = &sc->t[i]; t->sc = sc; t->num = i; t->mode = TIMER_STOPPED; t->intr_rid = -1; t->irq = -1; t->pcpu_cpu = -1; t->pcpu_misrouted = 0; t->pcpu_master = -1; t->caps = bus_read_4(sc->mem_res, HPET_TIMER_CAP_CNF(i)); t->vectors = bus_read_4(sc->mem_res, HPET_TIMER_CAP_CNF(i) + 4); if (bootverbose) { device_printf(dev, " t%d: irqs 0x%08x (%d)%s%s%s\n", i, t->vectors, (t->caps & HPET_TCNF_INT_ROUTE) >> 9, (t->caps & HPET_TCAP_FSB_INT_DEL) ? ", MSI" : "", (t->caps & HPET_TCAP_SIZE) ? ", 64bit" : "", (t->caps & HPET_TCAP_PER_INT) ? ", periodic" : ""); } } if (testenv("debug.acpi.hpet_test")) hpet_test(sc); /* * Don't attach if the timer never increments. 
Since the spec * requires it to be at least 10 MHz, it has to change in 1 us. */ val = bus_read_4(sc->mem_res, HPET_MAIN_COUNTER); DELAY(1); val2 = bus_read_4(sc->mem_res, HPET_MAIN_COUNTER); if (val == val2) { device_printf(dev, "HPET never increments, disabling\n"); hpet_disable(sc); bus_free_resource(dev, SYS_RES_MEMORY, sc->mem_res); return (ENXIO); } /* Announce first HPET as timecounter. */ if (device_get_unit(dev) == 0) { sc->tc.tc_get_timecount = hpet_get_timecount, sc->tc.tc_counter_mask = ~0u, sc->tc.tc_name = "HPET", sc->tc.tc_quality = 950, sc->tc.tc_frequency = sc->freq; sc->tc.tc_priv = sc; sc->tc.tc_fill_vdso_timehands = hpet_vdso_timehands; #ifdef COMPAT_FREEBSD32 sc->tc.tc_fill_vdso_timehands32 = hpet_vdso_timehands32; #endif tc_init(&sc->tc); } /* If not disabled - setup and announce event timers. */ if (resource_int_value(device_get_name(dev), device_get_unit(dev), "clock", &i) == 0 && i == 0) return (0); /* Check whether we can and want legacy routing. */ sc->legacy_route = 0; resource_int_value(device_get_name(dev), device_get_unit(dev), "legacy_route", &sc->legacy_route); if ((sc->caps & HPET_CAP_LEG_RT) == 0) sc->legacy_route = 0; if (sc->legacy_route) { sc->t[0].vectors = 0; sc->t[1].vectors = 0; } /* Check what IRQs we want use. */ /* By default allow any PCI IRQs. */ sc->allowed_irqs = 0xffff0000; /* * HPETs in AMD chipsets before SB800 have problems with IRQs >= 16 * Lower are also not always working for different reasons. * SB800 fixed it, but seems do not implements level triggering * properly, that makes it very unreliable - it freezes after any * interrupt loss. Avoid legacy IRQs for AMD. */ if (vendor == HPET_VENDID_AMD || vendor == HPET_VENDID_AMD2 || vendor == HPET_VENDID_HYGON) sc->allowed_irqs = 0x00000000; /* * NVidia MCP5x chipsets have number of unexplained interrupt * problems. For some reason, using HPET interrupts breaks HDA sound. 
*/ if (vendor == HPET_VENDID_NVIDIA && rev <= 0x01) sc->allowed_irqs = 0x00000000; /* * ServerWorks HT1000 reported to have problems with IRQs >= 16. * Lower IRQs are working, but allowed mask is not set correctly. * Legacy_route mode works fine. */ if (vendor == HPET_VENDID_SW && rev <= 0x01) sc->allowed_irqs = 0x00000000; /* * Neither QEMU nor VirtualBox report supported IRQs correctly. * The only way to use HPET there is to specify IRQs manually * and/or use legacy_route. Legacy_route mode works on both. */ if (vm_guest) sc->allowed_irqs = 0x00000000; /* Let user override. */ resource_int_value(device_get_name(dev), device_get_unit(dev), "allowed_irqs", &sc->allowed_irqs); /* Get how much per-CPU timers we should try to provide. */ sc->per_cpu = 1; resource_int_value(device_get_name(dev), device_get_unit(dev), "per_cpu", &sc->per_cpu); num_msi = 0; sc->useirq = 0; /* Find IRQ vectors for all timers. */ cvectors = sc->allowed_irqs & 0xffff0000; dvectors = sc->allowed_irqs & 0x0000ffff; if (sc->legacy_route) dvectors &= 0x0000fefe; for (i = 0; i < num_timers; i++) { t = &sc->t[i]; if (sc->legacy_route && i < 2) t->irq = (i == 0) ? 
0 : 8; #ifdef DEV_APIC else if (t->caps & HPET_TCAP_FSB_INT_DEL) { if ((j = PCIB_ALLOC_MSIX( device_get_parent(device_get_parent(dev)), dev, &t->irq))) { device_printf(dev, "Can't allocate interrupt for t%d: %d\n", i, j); } } #endif else if (dvectors & t->vectors) { t->irq = ffs(dvectors & t->vectors) - 1; dvectors &= ~(1 << t->irq); } if (t->irq >= 0) { t->intr_rid = hpet_find_irq_rid(dev, t->irq, t->irq); t->intr_res = bus_alloc_resource(dev, SYS_RES_IRQ, &t->intr_rid, t->irq, t->irq, 1, RF_ACTIVE); if (t->intr_res == NULL) { t->irq = -1; device_printf(dev, "Can't map interrupt for t%d.\n", i); } else if (bus_setup_intr(dev, t->intr_res, INTR_TYPE_CLK, hpet_intr_single, NULL, t, &t->intr_handle) != 0) { t->irq = -1; device_printf(dev, "Can't setup interrupt for t%d.\n", i); } else { bus_describe_intr(dev, t->intr_res, t->intr_handle, "t%d", i); num_msi++; } } if (t->irq < 0 && (cvectors & t->vectors) != 0) { cvectors &= t->vectors; sc->useirq |= (1 << i); } } if (sc->legacy_route && sc->t[0].irq < 0 && sc->t[1].irq < 0) sc->legacy_route = 0; if (sc->legacy_route) hpet_enable(sc); /* Group timers for per-CPU operation. */ num_percpu_et = min(num_msi / mp_ncpus, sc->per_cpu); num_percpu_t = num_percpu_et * mp_ncpus; pcpu_master = 0; cur_cpu = CPU_FIRST(); for (i = 0; i < num_timers; i++) { t = &sc->t[i]; if (t->irq >= 0 && num_percpu_t > 0) { if (cur_cpu == CPU_FIRST()) pcpu_master = i; t->pcpu_cpu = cur_cpu; t->pcpu_master = pcpu_master; sc->t[pcpu_master]. pcpu_slaves[cur_cpu] = i; bus_bind_intr(dev, t->intr_res, cur_cpu); cur_cpu = CPU_NEXT(cur_cpu); num_percpu_t--; } else if (t->irq >= 0) bus_bind_intr(dev, t->intr_res, CPU_FIRST()); } bus_write_4(sc->mem_res, HPET_ISR, 0xffffffff); sc->irq = -1; /* If at least one timer needs legacy IRQ - set it up. 
*/ if (sc->useirq) { j = i = fls(cvectors) - 1; while (j > 0 && (cvectors & (1 << (j - 1))) != 0) j--; sc->intr_rid = hpet_find_irq_rid(dev, j, i); sc->intr_res = bus_alloc_resource(dev, SYS_RES_IRQ, &sc->intr_rid, j, i, 1, RF_SHAREABLE | RF_ACTIVE); if (sc->intr_res == NULL) device_printf(dev, "Can't map interrupt.\n"); else if (bus_setup_intr(dev, sc->intr_res, INTR_TYPE_CLK, hpet_intr, NULL, sc, &sc->intr_handle) != 0) { device_printf(dev, "Can't setup interrupt.\n"); } else { sc->irq = rman_get_start(sc->intr_res); /* Bind IRQ to BSP to avoid live migration. */ bus_bind_intr(dev, sc->intr_res, CPU_FIRST()); } } /* Program and announce event timers. */ for (i = 0; i < num_timers; i++) { t = &sc->t[i]; t->caps &= ~(HPET_TCNF_FSB_EN | HPET_TCNF_INT_ROUTE); t->caps &= ~(HPET_TCNF_VAL_SET | HPET_TCNF_INT_ENB); t->caps &= ~(HPET_TCNF_INT_TYPE); t->caps |= HPET_TCNF_32MODE; if (t->irq >= 0 && sc->legacy_route && i < 2) { /* Legacy route doesn't need more configuration. */ } else #ifdef DEV_APIC if ((t->caps & HPET_TCAP_FSB_INT_DEL) && t->irq >= 0) { uint64_t addr; uint32_t data; if (PCIB_MAP_MSI( device_get_parent(device_get_parent(dev)), dev, t->irq, &addr, &data) == 0) { bus_write_4(sc->mem_res, HPET_TIMER_FSB_ADDR(i), addr); bus_write_4(sc->mem_res, HPET_TIMER_FSB_VAL(i), data); t->caps |= HPET_TCNF_FSB_EN; } else t->irq = -2; } else #endif if (t->irq >= 0) t->caps |= (t->irq << 9); else if (sc->irq >= 0 && (t->vectors & (1 << sc->irq))) t->caps |= (sc->irq << 9) | HPET_TCNF_INT_TYPE; bus_write_4(sc->mem_res, HPET_TIMER_CAP_CNF(i), t->caps); /* Skip event timers without set up IRQ. */ if (t->irq < 0 && (sc->irq < 0 || (t->vectors & (1 << sc->irq)) == 0)) continue; /* Announce the reset. 
*/ if (maxhpetet == 0) t->et.et_name = "HPET"; else { sprintf(t->name, "HPET%d", maxhpetet); t->et.et_name = t->name; } t->et.et_flags = ET_FLAGS_PERIODIC | ET_FLAGS_ONESHOT; t->et.et_quality = 450; if (t->pcpu_master >= 0) { t->et.et_flags |= ET_FLAGS_PERCPU; t->et.et_quality += 100; } else if (mp_ncpus >= 8) t->et.et_quality -= 100; if ((t->caps & HPET_TCAP_PER_INT) == 0) t->et.et_quality -= 10; t->et.et_frequency = sc->freq; t->et.et_min_period = ((uint64_t)(HPET_MIN_CYCLES * 2) << 32) / sc->freq; t->et.et_max_period = (0xfffffffeLLU << 32) / sc->freq; t->et.et_start = hpet_start; t->et.et_stop = hpet_stop; t->et.et_priv = &sc->t[i]; if (t->pcpu_master < 0 || t->pcpu_master == i) { et_register(&t->et); maxhpetet++; } } acpi_GetInteger(sc->handle, "_UID", &sc->acpi_uid); make_dev_args_init(&mda); mda.mda_devsw = &hpet_cdevsw; mda.mda_uid = UID_ROOT; mda.mda_gid = GID_WHEEL; mda.mda_mode = 0644; mda.mda_si_drv1 = sc; error = make_dev_s(&mda, &sc->pdev, "hpet%d", device_get_unit(dev)); if (error == 0) { sc->mmap_allow = 1; TUNABLE_INT_FETCH("hw.acpi.hpet.mmap_allow", &sc->mmap_allow); sc->mmap_allow_write = 0; TUNABLE_INT_FETCH("hw.acpi.hpet.mmap_allow_write", &sc->mmap_allow_write); SYSCTL_ADD_INT(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "mmap_allow", CTLFLAG_RW, &sc->mmap_allow, 0, "Allow userland to memory map HPET"); SYSCTL_ADD_INT(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "mmap_allow_write", CTLFLAG_RW, &sc->mmap_allow_write, 0, "Allow userland write to the HPET register space"); } else { device_printf(dev, "could not create /dev/hpet%d, error %d\n", device_get_unit(dev), error); } return (0); } static int hpet_detach(device_t dev) { ACPI_FUNCTION_TRACE((char *)(uintptr_t) __func__); /* XXX Without a tc_remove() function, we can't detach. */ return (EBUSY); } static int hpet_suspend(device_t dev) { // struct hpet_softc *sc; /* * Disable the timer during suspend. 
The timer will not lose * its state in S1 or S2, but we are required to disable * it. */ // sc = device_get_softc(dev); // hpet_disable(sc); return (0); } static int hpet_resume(device_t dev) { struct hpet_softc *sc; struct hpet_timer *t; int i; /* Re-enable the timer after a resume to keep the clock advancing. */ sc = device_get_softc(dev); hpet_enable(sc); /* Restart event timers that were running on suspend. */ for (i = 0; i < sc->num_timers; i++) { t = &sc->t[i]; #ifdef DEV_APIC if (t->irq >= 0 && (sc->legacy_route == 0 || i >= 2)) { uint64_t addr; uint32_t data; if (PCIB_MAP_MSI( device_get_parent(device_get_parent(dev)), dev, t->irq, &addr, &data) == 0) { bus_write_4(sc->mem_res, HPET_TIMER_FSB_ADDR(i), addr); bus_write_4(sc->mem_res, HPET_TIMER_FSB_VAL(i), data); } } #endif if (t->mode == TIMER_STOPPED) continue; t->next = bus_read_4(sc->mem_res, HPET_MAIN_COUNTER); if (t->mode == TIMER_PERIODIC && (t->caps & HPET_TCAP_PER_INT) != 0) { t->caps |= HPET_TCNF_TYPE; t->next += t->div; bus_write_4(sc->mem_res, HPET_TIMER_CAP_CNF(t->num), t->caps | HPET_TCNF_VAL_SET); bus_write_4(sc->mem_res, HPET_TIMER_COMPARATOR(t->num), t->next); bus_read_4(sc->mem_res, HPET_TIMER_COMPARATOR(t->num)); bus_write_4(sc->mem_res, HPET_TIMER_COMPARATOR(t->num), t->div); } else { t->next += sc->freq / 1024; bus_write_4(sc->mem_res, HPET_TIMER_COMPARATOR(t->num), t->next); } bus_write_4(sc->mem_res, HPET_ISR, 1 << t->num); bus_write_4(sc->mem_res, HPET_TIMER_CAP_CNF(t->num), t->caps); } return (0); } /* Print some basic latency/rate information to assist in debugging. 
*/ static void hpet_test(struct hpet_softc *sc) { int i; uint32_t u1, u2; struct bintime b0, b1, b2; struct timespec ts; binuptime(&b0); binuptime(&b0); binuptime(&b1); u1 = bus_read_4(sc->mem_res, HPET_MAIN_COUNTER); for (i = 1; i < 1000; i++) u2 = bus_read_4(sc->mem_res, HPET_MAIN_COUNTER); binuptime(&b2); u2 = bus_read_4(sc->mem_res, HPET_MAIN_COUNTER); bintime_sub(&b2, &b1); bintime_sub(&b1, &b0); bintime_sub(&b2, &b1); bintime2timespec(&b2, &ts); device_printf(sc->dev, "%ld.%09ld: %u ... %u = %u\n", (long)ts.tv_sec, ts.tv_nsec, u1, u2, u2 - u1); device_printf(sc->dev, "time per call: %ld ns\n", ts.tv_nsec / 1000); } #ifdef DEV_APIC static int hpet_remap_intr(device_t dev, device_t child, u_int irq) { struct hpet_softc *sc = device_get_softc(dev); struct hpet_timer *t; uint64_t addr; uint32_t data; int error, i; for (i = 0; i < sc->num_timers; i++) { t = &sc->t[i]; if (t->irq != irq) continue; error = PCIB_MAP_MSI( device_get_parent(device_get_parent(dev)), dev, irq, &addr, &data); if (error) return (error); hpet_disable(sc); /* Stop timer to avoid interrupt loss. 
*/ bus_write_4(sc->mem_res, HPET_TIMER_FSB_ADDR(i), addr); bus_write_4(sc->mem_res, HPET_TIMER_FSB_VAL(i), data); hpet_enable(sc); return (0); } return (ENOENT); } #endif static device_method_t hpet_methods[] = { /* Device interface */ DEVMETHOD(device_identify, hpet_identify), DEVMETHOD(device_probe, hpet_probe), DEVMETHOD(device_attach, hpet_attach), DEVMETHOD(device_detach, hpet_detach), DEVMETHOD(device_suspend, hpet_suspend), DEVMETHOD(device_resume, hpet_resume), #ifdef DEV_APIC DEVMETHOD(bus_remap_intr, hpet_remap_intr), #endif DEVMETHOD_END }; static driver_t hpet_driver = { "hpet", hpet_methods, sizeof(struct hpet_softc), }; DRIVER_MODULE(hpet, acpi, hpet_driver, hpet_devclass, 0, 0); MODULE_DEPEND(hpet, acpi, 1, 1, 1); diff --git a/sys/x86/include/vdso.h b/sys/x86/include/vdso.h index 546a92a47301..be90e26702f7 100644 --- a/sys/x86/include/vdso.h +++ b/sys/x86/include/vdso.h @@ -1,56 +1,61 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright 2012 Konstantin Belousov . * Copyright 2016 The FreeBSD Foundation. * All rights reserved. * * Portions of this software were developed by Konstantin Belousov * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _X86_VDSO_H #define _X86_VDSO_H #define VDSO_TIMEHANDS_MD \ uint32_t th_x86_shift; \ uint32_t th_x86_hpet_idx; \ uint64_t th_x86_pvc_last_systime;\ uint8_t th_x86_pvc_stable_mask; \ uint8_t th_res[15]; #define VDSO_TH_ALGO_X86_TSC VDSO_TH_ALGO_1 #define VDSO_TH_ALGO_X86_HPET VDSO_TH_ALGO_2 #define VDSO_TH_ALGO_X86_HVTSC VDSO_TH_ALGO_3 /* Hyper-V ref. TSC */ #define VDSO_TH_ALGO_X86_PVCLK VDSO_TH_ALGO_4 /* KVM/XEN paravirtual clock */ #ifdef _KERNEL #ifdef COMPAT_FREEBSD32 -#define VDSO_TIMEHANDS_MD32 VDSO_TIMEHANDS_MD +#define VDSO_TIMEHANDS_MD32 \ + uint32_t th_x86_shift; \ + uint32_t th_x86_hpet_idx; \ + uint32_t th_x86_pvc_last_systime[2];\ + uint8_t th_x86_pvc_stable_mask; \ + uint8_t th_res[15]; #endif #endif #endif diff --git a/sys/x86/x86/pvclock.c b/sys/x86/x86/pvclock.c index 3da3373bb2ee..9d8ac99f5a8a 100644 --- a/sys/x86/x86/pvclock.c +++ b/sys/x86/x86/pvclock.c @@ -1,358 +1,358 @@ /*- * Copyright (c) 2009 Adrian Chadd * Copyright (c) 2012 Spectra Logic Corporation * Copyright (c) 2014 Bryan Venteicher * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Last system time. This is used to guarantee a monotonically non-decreasing * clock for the kernel codepath and approximate the same for the vDSO codepath. * In theory, this should be unnecessary absent hypervisor bug(s) and/or what * should be rare cases where TSC jitter may still be visible despite the * hypervisor's best efforts. 
*/ static volatile uint64_t pvclock_last_systime; static uint64_t pvclock_getsystime(struct pvclock *pvc); static void pvclock_read_time_info( struct pvclock_vcpu_time_info *ti, uint64_t *ns, uint8_t *flags); static void pvclock_read_wall_clock(struct pvclock_wall_clock *wc, struct timespec *ts); static u_int pvclock_tc_get_timecount(struct timecounter *tc); static uint32_t pvclock_tc_vdso_timehands( struct vdso_timehands *vdso_th, struct timecounter *tc); #ifdef COMPAT_FREEBSD32 static uint32_t pvclock_tc_vdso_timehands32( struct vdso_timehands32 *vdso_th, struct timecounter *tc); #endif static d_open_t pvclock_cdev_open; static d_mmap_t pvclock_cdev_mmap; static struct cdevsw pvclock_cdev_cdevsw = { .d_version = D_VERSION, .d_name = PVCLOCK_CDEVNAME, .d_open = pvclock_cdev_open, .d_mmap = pvclock_cdev_mmap, }; void pvclock_resume(void) { atomic_store_rel_64(&pvclock_last_systime, 0); } uint64_t pvclock_tsc_freq(struct pvclock_vcpu_time_info *ti) { uint64_t freq; freq = (1000000000ULL << 32) / ti->tsc_to_system_mul; if (ti->tsc_shift < 0) freq <<= -ti->tsc_shift; else freq >>= ti->tsc_shift; return (freq); } static void pvclock_read_time_info(struct pvclock_vcpu_time_info *ti, uint64_t *ns, uint8_t *flags) { uint64_t delta; uint32_t version; do { version = atomic_load_acq_32(&ti->version); delta = rdtsc_ordered() - ti->tsc_timestamp; *ns = ti->system_time + pvclock_scale_delta(delta, ti->tsc_to_system_mul, ti->tsc_shift); *flags = ti->flags; atomic_thread_fence_acq(); } while ((ti->version & 1) != 0 || ti->version != version); } static void pvclock_read_wall_clock(struct pvclock_wall_clock *wc, struct timespec *ts) { uint32_t version; do { version = atomic_load_acq_32(&wc->version); ts->tv_sec = wc->sec; ts->tv_nsec = wc->nsec; atomic_thread_fence_acq(); } while ((wc->version & 1) != 0 || wc->version != version); } static uint64_t pvclock_getsystime(struct pvclock *pvc) { uint64_t now, last, ret; uint8_t flags; critical_enter(); 
pvclock_read_time_info(&pvc->timeinfos[curcpu], &now, &flags); ret = now; if ((flags & PVCLOCK_FLAG_TSC_STABLE) == 0) { last = atomic_load_acq_64(&pvclock_last_systime); do { if (last > now) { ret = last; break; } } while (!atomic_fcmpset_rel_64(&pvclock_last_systime, &last, now)); } critical_exit(); return (ret); } /* * NOTE: Transitional-only; this should be removed after 'dev/xen/timer/timer.c' * has been migrated to the 'struct pvclock' API. */ uint64_t pvclock_get_timecount(struct pvclock_vcpu_time_info *ti) { uint64_t now, last, ret; uint8_t flags; pvclock_read_time_info(ti, &now, &flags); ret = now; if ((flags & PVCLOCK_FLAG_TSC_STABLE) == 0) { last = atomic_load_acq_64(&pvclock_last_systime); do { if (last > now) { ret = last; break; } } while (!atomic_fcmpset_rel_64(&pvclock_last_systime, &last, now)); } return (ret); } /* * NOTE: Transitional-only; this should be removed after 'dev/xen/timer/timer.c' * has been migrated to the 'struct pvclock' API. */ void pvclock_get_wallclock(struct pvclock_wall_clock *wc, struct timespec *ts) { pvclock_read_wall_clock(wc, ts); } static int pvclock_cdev_open(struct cdev *dev, int oflags, int devtype, struct thread *td) { if (oflags & FWRITE) return (EPERM); return (0); } static int pvclock_cdev_mmap(struct cdev *dev, vm_ooffset_t offset, vm_paddr_t *paddr, int nprot, vm_memattr_t *memattr) { if (offset >= mp_ncpus * sizeof(struct pvclock_vcpu_time_info)) return (EINVAL); if (PROT_EXTRACT(nprot) != PROT_READ) return (EACCES); *paddr = vtophys((uintptr_t)dev->si_drv1 + offset); *memattr = VM_MEMATTR_DEFAULT; return (0); } static u_int pvclock_tc_get_timecount(struct timecounter *tc) { struct pvclock *pvc = tc->tc_priv; return (pvclock_getsystime(pvc) & UINT_MAX); } static uint32_t pvclock_tc_vdso_timehands(struct vdso_timehands *vdso_th, struct timecounter *tc) { struct pvclock *pvc = tc->tc_priv; if (pvc->cdev == NULL) return (0); vdso_th->th_algo = VDSO_TH_ALGO_X86_PVCLK; vdso_th->th_x86_shift = 0; 
vdso_th->th_x86_hpet_idx = 0; vdso_th->th_x86_pvc_last_systime = atomic_load_acq_64(&pvclock_last_systime); vdso_th->th_x86_pvc_stable_mask = !pvc->vdso_force_unstable && pvc->stable_flag_supported ? PVCLOCK_FLAG_TSC_STABLE : 0; bzero(vdso_th->th_res, sizeof(vdso_th->th_res)); return ((amd_feature & AMDID_RDTSCP) != 0 || ((vdso_th->th_x86_pvc_stable_mask & PVCLOCK_FLAG_TSC_STABLE) != 0 && pvc->vdso_enable_without_rdtscp)); } #ifdef COMPAT_FREEBSD32 static uint32_t pvclock_tc_vdso_timehands32(struct vdso_timehands32 *vdso_th, struct timecounter *tc) { struct pvclock *pvc = tc->tc_priv; if (pvc->cdev == NULL) return (0); vdso_th->th_algo = VDSO_TH_ALGO_X86_PVCLK; vdso_th->th_x86_shift = 0; vdso_th->th_x86_hpet_idx = 0; - vdso_th->th_x86_pvc_last_systime = + *(uint64_t *)&vdso_th->th_x86_pvc_last_systime[0] = atomic_load_acq_64(&pvclock_last_systime); vdso_th->th_x86_pvc_stable_mask = !pvc->vdso_force_unstable && pvc->stable_flag_supported ? PVCLOCK_FLAG_TSC_STABLE : 0; bzero(vdso_th->th_res, sizeof(vdso_th->th_res)); return ((amd_feature & AMDID_RDTSCP) != 0 || ((vdso_th->th_x86_pvc_stable_mask & PVCLOCK_FLAG_TSC_STABLE) != 0 && pvc->vdso_enable_without_rdtscp)); } #endif void pvclock_gettime(struct pvclock *pvc, struct timespec *ts) { struct timespec system_ts; uint64_t system_ns; pvclock_read_wall_clock(pvc->get_wallclock(pvc->get_wallclock_arg), ts); system_ns = pvclock_getsystime(pvc); system_ts.tv_sec = system_ns / 1000000000ULL; system_ts.tv_nsec = system_ns % 1000000000ULL; timespecadd(ts, &system_ts, ts); } void pvclock_init(struct pvclock *pvc, device_t dev, const char *tc_name, int tc_quality, u_int tc_flags) { struct make_dev_args mda; int err; KASSERT(((uintptr_t)pvc->timeinfos & PAGE_MASK) == 0, ("Specified time info page(s) address is not page-aligned.")); /* Set up vDSO stable-flag suppression test facility: */ pvc->vdso_force_unstable = false; SYSCTL_ADD_BOOL(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, 
"vdso_force_unstable", CTLFLAG_RW, &pvc->vdso_force_unstable, 0, "Forcibly deassert stable flag in vDSO codepath"); /* * Make it possible to use the vDSO page even when the hypervisor does * not support the rdtscp instruction. This is disabled by default for * compatibility with old libc. */ pvc->vdso_enable_without_rdtscp = false; SYSCTL_ADD_BOOL(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "vdso_enable_without_rdtscp", CTLFLAG_RWTUN, &pvc->vdso_enable_without_rdtscp, 0, "Allow the use of a vDSO when rdtscp is not available"); /* Set up timecounter and timecounter-supporting members: */ pvc->tc.tc_get_timecount = pvclock_tc_get_timecount; pvc->tc.tc_poll_pps = NULL; pvc->tc.tc_counter_mask = ~0U; pvc->tc.tc_frequency = 1000000000ULL; pvc->tc.tc_name = tc_name; pvc->tc.tc_quality = tc_quality; pvc->tc.tc_flags = tc_flags; pvc->tc.tc_priv = pvc; pvc->tc.tc_fill_vdso_timehands = pvclock_tc_vdso_timehands; #ifdef COMPAT_FREEBSD32 pvc->tc.tc_fill_vdso_timehands32 = pvclock_tc_vdso_timehands32; #endif /* Set up cdev for userspace mmapping of vCPU 0 time info page: */ make_dev_args_init(&mda); mda.mda_devsw = &pvclock_cdev_cdevsw; mda.mda_uid = UID_ROOT; mda.mda_gid = GID_WHEEL; mda.mda_mode = 0444; mda.mda_si_drv1 = pvc->timeinfos; err = make_dev_s(&mda, &pvc->cdev, PVCLOCK_CDEVNAME); if (err != 0) { device_printf(dev, "Could not create /dev/%s, error %d. Fast " "time of day will be unavailable for this timecounter.\n", PVCLOCK_CDEVNAME, err); KASSERT(pvc->cdev == NULL, ("Failed make_dev_s() unexpectedly inited cdev.")); } /* Register timecounter: */ tc_init(&pvc->tc); /* * Register wallclock: * The RTC registration API expects a resolution in microseconds; * pvclock's 1ns resolution is rounded up to 1us. */ clock_register(dev, 1); } int pvclock_destroy(struct pvclock *pvc) { /* * Not currently possible since there is no teardown counterpart of * 'tc_init()'. 
*/ return (EBUSY); } diff --git a/sys/x86/x86/tsc.c b/sys/x86/x86/tsc.c index 94305dfdc82f..4eab4dc1692b 100644 --- a/sys/x86/x86/tsc.c +++ b/sys/x86/x86/tsc.c @@ -1,975 +1,976 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 1998-2003 Poul-Henning Kamp * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_clock.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "cpufreq_if.h" uint64_t tsc_freq; int tsc_is_invariant; int tsc_perf_stat; static int tsc_early_calib_exact; static eventhandler_tag tsc_levels_tag, tsc_pre_tag, tsc_post_tag; SYSCTL_INT(_kern_timecounter, OID_AUTO, invariant_tsc, CTLFLAG_RDTUN, &tsc_is_invariant, 0, "Indicates whether the TSC is P-state invariant"); #ifdef SMP int smp_tsc; SYSCTL_INT(_kern_timecounter, OID_AUTO, smp_tsc, CTLFLAG_RDTUN, &smp_tsc, 0, "Indicates whether the TSC is safe to use in SMP mode"); int smp_tsc_adjust = 0; SYSCTL_INT(_kern_timecounter, OID_AUTO, smp_tsc_adjust, CTLFLAG_RDTUN, &smp_tsc_adjust, 0, "Try to adjust TSC on APs to match BSP"); #endif static int tsc_shift = 1; SYSCTL_INT(_kern_timecounter, OID_AUTO, tsc_shift, CTLFLAG_RDTUN, &tsc_shift, 0, "Shift to pre-apply for the maximum TSC frequency"); static int tsc_disabled; SYSCTL_INT(_machdep, OID_AUTO, disable_tsc, CTLFLAG_RDTUN, &tsc_disabled, 0, "Disable x86 Time Stamp Counter"); static int tsc_skip_calibration; SYSCTL_INT(_machdep, OID_AUTO, disable_tsc_calibration, CTLFLAG_RDTUN, &tsc_skip_calibration, 0, "Disable early TSC frequency calibration"); static void tsc_freq_changed(void *arg, const struct cf_level *level, int status); static void tsc_freq_changing(void *arg, const struct cf_level *level, int *status); static u_int tsc_get_timecount(struct timecounter *tc); static inline u_int tsc_get_timecount_low(struct timecounter *tc); static u_int tsc_get_timecount_lfence(struct timecounter *tc); static u_int tsc_get_timecount_low_lfence(struct timecounter *tc); static u_int tsc_get_timecount_mfence(struct timecounter *tc); static u_int tsc_get_timecount_low_mfence(struct timecounter *tc); static u_int tscp_get_timecount(struct timecounter *tc); 
static u_int tscp_get_timecount_low(struct timecounter *tc); static void tsc_levels_changed(void *arg, int unit); static uint32_t x86_tsc_vdso_timehands(struct vdso_timehands *vdso_th, struct timecounter *tc); #ifdef COMPAT_FREEBSD32 static uint32_t x86_tsc_vdso_timehands32(struct vdso_timehands32 *vdso_th32, struct timecounter *tc); #endif static struct timecounter tsc_timecounter = { .tc_get_timecount = tsc_get_timecount, .tc_counter_mask = ~0u, .tc_name = "TSC", .tc_quality = 800, /* adjusted in code */ .tc_fill_vdso_timehands = x86_tsc_vdso_timehands, #ifdef COMPAT_FREEBSD32 .tc_fill_vdso_timehands32 = x86_tsc_vdso_timehands32, #endif }; static int tsc_freq_cpuid_vm(void) { u_int regs[4]; if (vm_guest == VM_GUEST_NO) return (false); if (hv_high < 0x40000010) return (false); do_cpuid(0x40000010, regs); tsc_freq = (uint64_t)(regs[0]) * 1000; tsc_early_calib_exact = 1; return (true); } static void tsc_freq_vmware(void) { u_int regs[4]; vmware_hvcall(VMW_HVCMD_GETHZ, regs); if (regs[1] != UINT_MAX) tsc_freq = regs[0] | ((uint64_t)regs[1] << 32); tsc_early_calib_exact = 1; } /* * Calculate TSC frequency using information from the CPUID leaf 0x15 'Time * Stamp Counter and Nominal Core Crystal Clock'. If leaf 0x15 is not * functional, as it is on Skylake/Kabylake, try 0x16 'Processor Frequency * Information'. Leaf 0x16 is described in the SDM as informational only, but * we can use this value until late calibration is complete. 
*/ static bool tsc_freq_cpuid(uint64_t *res) { u_int regs[4]; if (cpu_high < 0x15) return (false); do_cpuid(0x15, regs); if (regs[0] != 0 && regs[1] != 0 && regs[2] != 0) { *res = (uint64_t)regs[2] * regs[1] / regs[0]; return (true); } if (cpu_high < 0x16) return (false); do_cpuid(0x16, regs); if (regs[0] != 0) { *res = (uint64_t)regs[0] * 1000000; return (true); } return (false); } static bool tsc_freq_intel_brand(uint64_t *res) { char brand[48]; u_int regs[4]; uint64_t freq; char *p; u_int i; /* * Intel Processor Identification and the CPUID Instruction * Application Note 485. * http://www.intel.com/assets/pdf/appnote/241618.pdf */ if (cpu_exthigh >= 0x80000004) { p = brand; for (i = 0x80000002; i < 0x80000005; i++) { do_cpuid(i, regs); memcpy(p, regs, sizeof(regs)); p += sizeof(regs); } p = NULL; for (i = 0; i < sizeof(brand) - 1; i++) if (brand[i] == 'H' && brand[i + 1] == 'z') p = brand + i; if (p != NULL) { p -= 5; switch (p[4]) { case 'M': i = 1; break; case 'G': i = 1000; break; case 'T': i = 1000000; break; default: return (false); } #define C2D(c) ((c) - '0') if (p[1] == '.') { freq = C2D(p[0]) * 1000; freq += C2D(p[2]) * 100; freq += C2D(p[3]) * 10; freq *= i * 1000; } else { freq = C2D(p[0]) * 1000; freq += C2D(p[1]) * 100; freq += C2D(p[2]) * 10; freq += C2D(p[3]); freq *= i * 1000000; } #undef C2D *res = freq; return (true); } } return (false); } static void tsc_freq_tc(uint64_t *res) { uint64_t tsc1, tsc2; int64_t overhead; int count, i; overhead = 0; for (i = 0, count = 8; i < count; i++) { tsc1 = rdtsc_ordered(); DELAY(0); tsc2 = rdtsc_ordered(); if (i > 0) overhead += tsc2 - tsc1; } overhead /= count; tsc1 = rdtsc_ordered(); DELAY(100000); tsc2 = rdtsc_ordered(); tsc_freq = (tsc2 - tsc1 - overhead) * 10; } /* * Try to determine the TSC frequency using CPUID or hypercalls. If successful, * this lets use the TSC for early DELAY() calls instead of the 8254 timer, * which may be unreliable or entirely absent on contemporary systems. 
However, * avoid calibrating using the 8254 here so as to give hypervisors a chance to * register a timecounter that can be used instead. */ static void probe_tsc_freq_early(void) { #ifdef __i386__ /* The TSC is known to be broken on certain CPUs. */ switch (cpu_vendor_id) { case CPU_VENDOR_AMD: switch (cpu_id & 0xFF0) { case 0x500: /* K5 Model 0 */ tsc_disabled = 1; return; } break; case CPU_VENDOR_CENTAUR: switch (cpu_id & 0xff0) { case 0x540: /* * http://www.centtech.com/c6_data_sheet.pdf * * I-12 RDTSC may return incoherent values in EDX:EAX * I-13 RDTSC hangs when certain event counters are used */ tsc_disabled = 1; return; } break; case CPU_VENDOR_NSC: switch (cpu_id & 0xff0) { case 0x540: if ((cpu_id & CPUID_STEPPING) == 0) { tsc_disabled = 1; return; } break; } break; } #endif switch (cpu_vendor_id) { case CPU_VENDOR_AMD: case CPU_VENDOR_HYGON: if ((amd_pminfo & AMDPM_TSC_INVARIANT) != 0 || (vm_guest == VM_GUEST_NO && CPUID_TO_FAMILY(cpu_id) >= 0x10)) tsc_is_invariant = 1; if (cpu_feature & CPUID_SSE2) { tsc_timecounter.tc_get_timecount = tsc_get_timecount_mfence; } break; case CPU_VENDOR_INTEL: if ((amd_pminfo & AMDPM_TSC_INVARIANT) != 0 || (vm_guest == VM_GUEST_NO && ((CPUID_TO_FAMILY(cpu_id) == 0x6 && CPUID_TO_MODEL(cpu_id) >= 0xe) || (CPUID_TO_FAMILY(cpu_id) == 0xf && CPUID_TO_MODEL(cpu_id) >= 0x3)))) tsc_is_invariant = 1; if (cpu_feature & CPUID_SSE2) { tsc_timecounter.tc_get_timecount = tsc_get_timecount_lfence; } break; case CPU_VENDOR_CENTAUR: if (vm_guest == VM_GUEST_NO && CPUID_TO_FAMILY(cpu_id) == 0x6 && CPUID_TO_MODEL(cpu_id) >= 0xf && (rdmsr(0x1203) & 0x100000000ULL) == 0) tsc_is_invariant = 1; if (cpu_feature & CPUID_SSE2) { tsc_timecounter.tc_get_timecount = tsc_get_timecount_lfence; } break; } if (tsc_freq_cpuid_vm()) { if (bootverbose) printf( "Early TSC frequency %juHz derived from hypervisor CPUID\n", (uintmax_t)tsc_freq); } else if (vm_guest == VM_GUEST_VMWARE) { tsc_freq_vmware(); if (bootverbose) printf( "Early TSC frequency %juHz 
derived from VMWare hypercall\n", (uintmax_t)tsc_freq); } else if (tsc_freq_cpuid(&tsc_freq)) { /* * If possible, use the value obtained from CPUID as the initial * frequency. This will be refined later during boot but is * good enough for now. The 8254 PIT is not functional on some * newer platforms anyway, so don't delay our boot for what * might be a garbage result. Late calibration is required if * the initial frequency was obtained from CPUID.16H, as the * derived value may be off by as much as 1%. */ if (bootverbose) printf("Early TSC frequency %juHz derived from CPUID\n", (uintmax_t)tsc_freq); } } /* * If we were unable to determine the TSC frequency via CPU registers, try * to calibrate against a known clock. */ static void probe_tsc_freq_late(void) { if (tsc_freq != 0) return; if (tsc_skip_calibration) { /* * Try to parse the brand string to obtain the nominal TSC * frequency. */ if (cpu_vendor_id == CPU_VENDOR_INTEL && tsc_freq_intel_brand(&tsc_freq)) { if (bootverbose) printf( "Early TSC frequency %juHz derived from brand string\n", (uintmax_t)tsc_freq); } else { tsc_disabled = 1; } } else { /* * Calibrate against a timecounter or the 8254 PIT. This * estimate will be refined later in tsc_calib(). */ tsc_freq_tc(&tsc_freq); if (bootverbose) printf( "Early TSC frequency %juHz calibrated from 8254 PIT\n", (uintmax_t)tsc_freq); } } void start_TSC(void) { if ((cpu_feature & CPUID_TSC) == 0 || tsc_disabled) return; probe_tsc_freq_late(); if (cpu_power_ecx & CPUID_PERF_STAT) { /* * XXX Some emulators expose host CPUID without actual support * for these MSRs. We must test whether they really work. */ wrmsr(MSR_MPERF, 0); wrmsr(MSR_APERF, 0); DELAY(10); if (rdmsr(MSR_MPERF) > 0 && rdmsr(MSR_APERF) > 0) tsc_perf_stat = 1; } /* * Inform CPU accounting about our boot-time clock rate. This will * be updated if someone loads a cpufreq driver after boot that * discovers a new max frequency. 
* * The frequency may also be updated after late calibration is complete; * however, we register the TSC as the ticker now to avoid switching * counters after much of the kernel has already booted and potentially * sampled the CPU clock. */ if (tsc_freq != 0) set_cputicker(rdtsc, tsc_freq, !tsc_is_invariant); if (tsc_is_invariant) return; /* Register to find out about changes in CPU frequency. */ tsc_pre_tag = EVENTHANDLER_REGISTER(cpufreq_pre_change, tsc_freq_changing, NULL, EVENTHANDLER_PRI_FIRST); tsc_post_tag = EVENTHANDLER_REGISTER(cpufreq_post_change, tsc_freq_changed, NULL, EVENTHANDLER_PRI_FIRST); tsc_levels_tag = EVENTHANDLER_REGISTER(cpufreq_levels_changed, tsc_levels_changed, NULL, EVENTHANDLER_PRI_ANY); } #ifdef SMP /* * RDTSC is not a serializing instruction, and does not drain * instruction stream, so we need to drain the stream before executing * it. It could be fixed by use of RDTSCP, except the instruction is * not available everywhere. * * Use CPUID for draining in the boot-time SMP consistency test. The * timecounters use MFENCE for AMD CPUs, and LFENCE for others (Intel * and VIA) when SSE2 is present, and nothing on older machines which * also do not issue RDTSC prematurely. There, testing for SSE2 and * vendor is too cumbersome, and we learn about TSC presence from CPUID. * * Do not use do_cpuid(), since we do not need CPUID results, which * have to be written into memory with do_cpuid(). 
*/ #define TSC_READ(x) \ static void \ tsc_read_##x(void *arg) \ { \ uint64_t *tsc = arg; \ u_int cpu = PCPU_GET(cpuid); \ \ __asm __volatile("cpuid" : : : "eax", "ebx", "ecx", "edx"); \ tsc[cpu * 3 + x] = rdtsc(); \ } TSC_READ(0) TSC_READ(1) TSC_READ(2) #undef TSC_READ #define N 1000 static void comp_smp_tsc(void *arg) { uint64_t *tsc; int64_t d1, d2; u_int cpu = PCPU_GET(cpuid); u_int i, j, size; size = (mp_maxid + 1) * 3; for (i = 0, tsc = arg; i < N; i++, tsc += size) CPU_FOREACH(j) { if (j == cpu) continue; d1 = tsc[cpu * 3 + 1] - tsc[j * 3]; d2 = tsc[cpu * 3 + 2] - tsc[j * 3 + 1]; if (d1 <= 0 || d2 <= 0) { smp_tsc = 0; return; } } } static void adj_smp_tsc(void *arg) { uint64_t *tsc; int64_t d, min, max; u_int cpu = PCPU_GET(cpuid); u_int first, i, size; first = CPU_FIRST(); if (cpu == first) return; min = INT64_MIN; max = INT64_MAX; size = (mp_maxid + 1) * 3; for (i = 0, tsc = arg; i < N; i++, tsc += size) { d = tsc[first * 3] - tsc[cpu * 3 + 1]; if (d > min) min = d; d = tsc[first * 3 + 1] - tsc[cpu * 3 + 2]; if (d > min) min = d; d = tsc[first * 3 + 1] - tsc[cpu * 3]; if (d < max) max = d; d = tsc[first * 3 + 2] - tsc[cpu * 3 + 1]; if (d < max) max = d; } if (min > max) return; d = min / 2 + max / 2; __asm __volatile ( "movl $0x10, %%ecx\n\t" "rdmsr\n\t" "addl %%edi, %%eax\n\t" "adcl %%esi, %%edx\n\t" "wrmsr\n" : /* No output */ : "D" ((uint32_t)d), "S" ((uint32_t)(d >> 32)) : "ax", "cx", "dx", "cc" ); } static int test_tsc(int adj_max_count) { uint64_t *data, *tsc; u_int i, size, adj; if ((!smp_tsc && !tsc_is_invariant)) return (-100); /* * Misbehavior of TSC under VirtualBox has been observed. In * particular, threads doing small (~1 second) sleeps may miss their * wakeup and hang around in sleep state, causing hangs on shutdown. 
*/ if (vm_guest == VM_GUEST_VBOX) return (0); size = (mp_maxid + 1) * 3; data = malloc(sizeof(*data) * size * N, M_TEMP, M_WAITOK); adj = 0; retry: for (i = 0, tsc = data; i < N; i++, tsc += size) smp_rendezvous(tsc_read_0, tsc_read_1, tsc_read_2, tsc); smp_tsc = 1; /* XXX */ smp_rendezvous(smp_no_rendezvous_barrier, comp_smp_tsc, smp_no_rendezvous_barrier, data); if (!smp_tsc && adj < adj_max_count) { adj++; smp_rendezvous(smp_no_rendezvous_barrier, adj_smp_tsc, smp_no_rendezvous_barrier, data); goto retry; } free(data, M_TEMP); if (bootverbose) printf("SMP: %sed TSC synchronization test%s\n", smp_tsc ? "pass" : "fail", adj > 0 ? " after adjustment" : ""); if (smp_tsc && tsc_is_invariant) { switch (cpu_vendor_id) { case CPU_VENDOR_AMD: case CPU_VENDOR_HYGON: /* * Processor Programming Reference (PPR) for AMD * Family 17h states that the TSC uses a common * reference for all sockets, cores and threads. */ if (CPUID_TO_FAMILY(cpu_id) >= 0x17) return (1000); /* * Starting with Family 15h processors, TSC clock * source is in the north bridge. Check whether * we have a single-socket/multi-core platform. * XXX Need more work for complex cases. */ if (CPUID_TO_FAMILY(cpu_id) < 0x15 || (amd_feature2 & AMDID2_CMP) == 0 || smp_cpus > (cpu_procinfo2 & AMDID_CMP_CORES) + 1) break; return (1000); case CPU_VENDOR_INTEL: /* * XXX Assume Intel platforms have synchronized TSCs. */ return (1000); } return (800); } return (-100); } #undef N #endif /* SMP */ static void init_TSC_tc(void) { uint64_t max_freq; int shift; if ((cpu_feature & CPUID_TSC) == 0 || tsc_disabled) return; /* * Limit timecounter frequency to fit in an int and prevent it from * overflowing too fast. */ max_freq = UINT_MAX; /* * Intel CPUs without a C-state invariant TSC can stop the TSC * in either C2 or C3. Disable use of C2 and C3 while using * the TSC as the timecounter. The timecounter can be changed * to enable C2 and C3. 
* * Note that the TSC is used as the cputicker for computing * thread runtime regardless of the timecounter setting, so * using an alternate timecounter and enabling C2 or C3 can * result in incorrect runtimes for kernel idle threads (but not * for any non-idle threads). */ if (cpu_vendor_id == CPU_VENDOR_INTEL && (amd_pminfo & AMDPM_TSC_INVARIANT) == 0) { tsc_timecounter.tc_flags |= TC_FLAGS_C2STOP; if (bootverbose) printf("TSC timecounter disables C2 and C3.\n"); } /* * We can not use the TSC in SMP mode unless the TSCs on all CPUs * are synchronized. If the user is sure that the system has * synchronized TSCs, set kern.timecounter.smp_tsc tunable to a * non-zero value. The TSC seems unreliable in virtualized SMP * environments, so it is set to a negative quality in those cases. */ #ifdef SMP if (mp_ncpus > 1) tsc_timecounter.tc_quality = test_tsc(smp_tsc_adjust); else #endif /* SMP */ if (tsc_is_invariant) tsc_timecounter.tc_quality = 1000; max_freq >>= tsc_shift; for (shift = 0; shift <= 31 && (tsc_freq >> shift) > max_freq; shift++) ; /* * Timecounter implementation selection, top to bottom: * - If RDTSCP is available, use RDTSCP. * - If fence instructions are provided (SSE2), use LFENCE;RDTSC * on Intel, and MFENCE;RDTSC on AMD. * - For really old CPUs, just use RDTSC. */ if ((amd_feature & AMDID_RDTSCP) != 0) { tsc_timecounter.tc_get_timecount = shift > 0 ? tscp_get_timecount_low : tscp_get_timecount; } else if ((cpu_feature & CPUID_SSE2) != 0 && mp_ncpus > 1) { if (cpu_vendor_id == CPU_VENDOR_AMD || cpu_vendor_id == CPU_VENDOR_HYGON) { tsc_timecounter.tc_get_timecount = shift > 0 ? tsc_get_timecount_low_mfence : tsc_get_timecount_mfence; } else { tsc_timecounter.tc_get_timecount = shift > 0 ? tsc_get_timecount_low_lfence : tsc_get_timecount_lfence; } } else { tsc_timecounter.tc_get_timecount = shift > 0 ? 
tsc_get_timecount_low : tsc_get_timecount; } if (shift > 0) { tsc_timecounter.tc_name = "TSC-low"; if (bootverbose) printf("TSC timecounter discards lower %d bit(s)\n", shift); } if (tsc_freq != 0) { tsc_timecounter.tc_frequency = tsc_freq >> shift; tsc_timecounter.tc_priv = (void *)(intptr_t)shift; /* * Timecounter registration is deferred until after late * calibration is finished. */ } } SYSINIT(tsc_tc, SI_SUB_SMP, SI_ORDER_ANY, init_TSC_tc, NULL); static void tsc_update_freq(uint64_t new_freq) { atomic_store_rel_64(&tsc_freq, new_freq); atomic_store_rel_64(&tsc_timecounter.tc_frequency, new_freq >> (int)(intptr_t)tsc_timecounter.tc_priv); } void tsc_init(void) { if ((cpu_feature & CPUID_TSC) == 0 || tsc_disabled) return; probe_tsc_freq_early(); } /* * Perform late calibration of the TSC frequency once ACPI-based timecounters * are available. At this point timehands are not set up, so we read the * highest-quality timecounter directly rather than using (s)binuptime(). */ void tsc_calibrate(void) { uint64_t freq; if (tsc_disabled) return; if (tsc_early_calib_exact) goto calibrated; fpu_kern_enter(curthread, NULL, FPU_KERN_NOCTX); freq = clockcalib(rdtsc_ordered, "TSC"); fpu_kern_leave(curthread, NULL); tsc_update_freq(freq); calibrated: tc_init(&tsc_timecounter); set_cputicker(rdtsc, tsc_freq, !tsc_is_invariant); } void resume_TSC(void) { #ifdef SMP int quality; /* If TSC was not good on boot, it is unlikely to become good now. */ if (tsc_timecounter.tc_quality < 0) return; /* Nothing to do with UP. */ if (mp_ncpus < 2) return; /* * If TSC was good, a single synchronization should be enough, * but honour smp_tsc_adjust if it's set. */ quality = test_tsc(MAX(smp_tsc_adjust, 1)); if (quality != tsc_timecounter.tc_quality) { printf("TSC timecounter quality changed: %d -> %d\n", tsc_timecounter.tc_quality, quality); tsc_timecounter.tc_quality = quality; } #endif /* SMP */ } /* * When cpufreq levels change, find out about the (new) max frequency. 
We * use this to update CPU accounting in case it got a lower estimate at boot. */ static void tsc_levels_changed(void *arg, int unit) { device_t cf_dev; struct cf_level *levels; int count, error; uint64_t max_freq; /* Only use values from the first CPU, assuming all are equal. */ if (unit != 0) return; /* Find the appropriate cpufreq device instance. */ cf_dev = devclass_get_device(devclass_find("cpufreq"), unit); if (cf_dev == NULL) { printf("tsc_levels_changed() called but no cpufreq device?\n"); return; } /* Get settings from the device and find the max frequency. */ count = 64; levels = malloc(count * sizeof(*levels), M_TEMP, M_NOWAIT); if (levels == NULL) return; error = CPUFREQ_LEVELS(cf_dev, levels, &count); if (error == 0 && count != 0) { max_freq = (uint64_t)levels[0].total_set.freq * 1000000; set_cputicker(rdtsc, max_freq, true); } else printf("tsc_levels_changed: no max freq found\n"); free(levels, M_TEMP); } /* * If the TSC timecounter is in use, veto the pending change. It may be * possible in the future to handle a dynamically-changing timecounter rate. */ static void tsc_freq_changing(void *arg, const struct cf_level *level, int *status) { if (*status != 0 || timecounter != &tsc_timecounter) return; printf("timecounter TSC must not be in use when " "changing frequencies; change denied\n"); *status = EBUSY; } /* Update TSC freq with the value indicated by the caller. */ static void tsc_freq_changed(void *arg, const struct cf_level *level, int status) { uint64_t freq; /* If there was an error during the transition, don't do anything. */ if (tsc_disabled || status != 0) return; /* Total setting for this level gives the new frequency in MHz. 
*/ freq = (uint64_t)level->total_set.freq * 1000000; tsc_update_freq(freq); } static int sysctl_machdep_tsc_freq(SYSCTL_HANDLER_ARGS) { int error; uint64_t freq; freq = atomic_load_acq_64(&tsc_freq); if (freq == 0) return (EOPNOTSUPP); error = sysctl_handle_64(oidp, &freq, 0, req); if (error == 0 && req->newptr != NULL) tsc_update_freq(freq); return (error); } SYSCTL_PROC(_machdep, OID_AUTO, tsc_freq, CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0, sysctl_machdep_tsc_freq, "QU", "Time Stamp Counter frequency"); static u_int tsc_get_timecount(struct timecounter *tc __unused) { return (rdtsc32()); } static u_int tscp_get_timecount(struct timecounter *tc __unused) { return (rdtscp32()); } static inline u_int tsc_get_timecount_low(struct timecounter *tc) { uint32_t rv; __asm __volatile("rdtsc; shrd %%cl, %%edx, %0" : "=a" (rv) : "c" ((int)(intptr_t)tc->tc_priv) : "edx"); return (rv); } static u_int tscp_get_timecount_low(struct timecounter *tc) { uint32_t rv; __asm __volatile("rdtscp; movl %1, %%ecx; shrd %%cl, %%edx, %0" : "=&a" (rv) : "m" (tc->tc_priv) : "ecx", "edx"); return (rv); } static u_int tsc_get_timecount_lfence(struct timecounter *tc __unused) { lfence(); return (rdtsc32()); } static u_int tsc_get_timecount_low_lfence(struct timecounter *tc) { lfence(); return (tsc_get_timecount_low(tc)); } static u_int tsc_get_timecount_mfence(struct timecounter *tc __unused) { mfence(); return (rdtsc32()); } static u_int tsc_get_timecount_low_mfence(struct timecounter *tc) { mfence(); return (tsc_get_timecount_low(tc)); } static uint32_t x86_tsc_vdso_timehands(struct vdso_timehands *vdso_th, struct timecounter *tc) { vdso_th->th_algo = VDSO_TH_ALGO_X86_TSC; vdso_th->th_x86_shift = (int)(intptr_t)tc->tc_priv; vdso_th->th_x86_hpet_idx = 0xffffffff; vdso_th->th_x86_pvc_last_systime = 0; vdso_th->th_x86_pvc_stable_mask = 0; bzero(vdso_th->th_res, sizeof(vdso_th->th_res)); return (1); } #ifdef COMPAT_FREEBSD32 static uint32_t x86_tsc_vdso_timehands32(struct vdso_timehands32 
*vdso_th32, struct timecounter *tc) { vdso_th32->th_algo = VDSO_TH_ALGO_X86_TSC; vdso_th32->th_x86_shift = (int)(intptr_t)tc->tc_priv; vdso_th32->th_x86_hpet_idx = 0xffffffff; - vdso_th32->th_x86_pvc_last_systime = 0; + vdso_th32->th_x86_pvc_last_systime[0] = 0; + vdso_th32->th_x86_pvc_last_systime[1] = 0; vdso_th32->th_x86_pvc_stable_mask = 0; bzero(vdso_th32->th_res, sizeof(vdso_th32->th_res)); return (1); } #endif