Index: head/sys/dev/acpica/acpi_cpu.c =================================================================== --- head/sys/dev/acpica/acpi_cpu.c (revision 314210) +++ head/sys/dev/acpica/acpi_cpu.c (revision 314211) @@ -1,1536 +1,1529 @@ /*- * Copyright (c) 2003-2005 Nate Lawson (SDG) * Copyright (c) 2001 Michael Smith * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_acpi.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(__amd64__) || defined(__i386__) #include #include #include #endif #include #include #include #include /* * Support for ACPI Processor devices, including C[1-3] sleep states. */ /* Hooks for the ACPI CA debugging infrastructure */ #define _COMPONENT ACPI_PROCESSOR ACPI_MODULE_NAME("PROCESSOR") struct acpi_cx { struct resource *p_lvlx; /* Register to read to enter state. */ uint32_t type; /* C1-3 (C4 and up treated as C3). */ uint32_t trans_lat; /* Transition latency (usec). */ uint32_t power; /* Power consumed (mW). */ int res_type; /* Resource type for p_lvlx. */ int res_rid; /* Resource ID for p_lvlx. */ bool do_mwait; uint32_t mwait_hint; bool mwait_hw_coord; bool mwait_bm_avoidance; }; #define MAX_CX_STATES 8 struct acpi_cpu_softc { device_t cpu_dev; ACPI_HANDLE cpu_handle; struct pcpu *cpu_pcpu; uint32_t cpu_acpi_id; /* ACPI processor id */ uint32_t cpu_p_blk; /* ACPI P_BLK location */ uint32_t cpu_p_blk_len; /* P_BLK length (must be 6). */ struct acpi_cx cpu_cx_states[MAX_CX_STATES]; int cpu_cx_count; /* Number of valid Cx states. */ int cpu_prev_sleep;/* Last idle sleep duration. */ int cpu_features; /* Child driver supported features. */ /* Runtime state. */ int cpu_non_c2; /* Index of lowest non-C2 state. */ int cpu_non_c3; /* Index of lowest non-C3 state. */ u_int cpu_cx_stats[MAX_CX_STATES];/* Cx usage history. */ /* Values for sysctl. */ struct sysctl_ctx_list cpu_sysctl_ctx; struct sysctl_oid *cpu_sysctl_tree; int cpu_cx_lowest; int cpu_cx_lowest_lim; int cpu_disable_idle; /* Disable entry to idle function */ char cpu_cx_supported[64]; }; struct acpi_cpu_device { struct resource_list ad_rl; }; #define CPU_GET_REG(reg, width) \ (bus_space_read_ ## width(rman_get_bustag((reg)), \ rman_get_bushandle((reg)), 0)) #define CPU_SET_REG(reg, width, val) \ (bus_space_write_ ## width(rman_get_bustag((reg)), \ rman_get_bushandle((reg)), 0, (val))) #define PM_USEC(x) ((x) >> 2) /* ~4 clocks per usec (3.57955 Mhz) */ #define ACPI_NOTIFY_CX_STATES 0x81 /* _CST changed. */ #define CPU_QUIRK_NO_C3 (1<<0) /* C3-type states are not usable. */ #define CPU_QUIRK_NO_BM_CTRL (1<<2) /* No bus mastering control. */ #define PCI_VENDOR_INTEL 0x8086 #define PCI_DEVICE_82371AB_3 0x7113 /* PIIX4 chipset for quirks. */ #define PCI_REVISION_A_STEP 0 #define PCI_REVISION_B_STEP 1 #define PCI_REVISION_4E 2 #define PCI_REVISION_4M 3 #define PIIX4_DEVACTB_REG 0x58 #define PIIX4_BRLD_EN_IRQ0 (1<<0) #define PIIX4_BRLD_EN_IRQ (1<<1) #define PIIX4_BRLD_EN_IRQ8 (1<<5) #define PIIX4_STOP_BREAK_MASK (PIIX4_BRLD_EN_IRQ0 | PIIX4_BRLD_EN_IRQ | PIIX4_BRLD_EN_IRQ8) #define PIIX4_PCNTRL_BST_EN (1<<10) #define CST_FFH_VENDOR_INTEL 1 #define CST_FFH_INTEL_CL_C1IO 1 #define CST_FFH_INTEL_CL_MWAIT 2 #define CST_FFH_MWAIT_HW_COORD 0x0001 #define CST_FFH_MWAIT_BM_AVOID 0x0002 /* Allow users to ignore processor orders in MADT. */ static int cpu_unordered; SYSCTL_INT(_debug_acpi, OID_AUTO, cpu_unordered, CTLFLAG_RDTUN, &cpu_unordered, 0, "Do not use the MADT to match ACPI Processor objects to CPUs."); /* Knob to disable acpi_cpu devices */ bool acpi_cpu_disabled = false; /* Platform hardware resource information. */ static uint32_t cpu_smi_cmd; /* Value to write to SMI_CMD. */ static uint8_t cpu_cst_cnt; /* Indicate we are _CST aware. */ static int cpu_quirks; /* Indicate any hardware bugs. */ /* Values for sysctl. */ static struct sysctl_ctx_list cpu_sysctl_ctx; static struct sysctl_oid *cpu_sysctl_tree; static int cpu_cx_generic; static int cpu_cx_lowest_lim; static device_t *cpu_devices; static int cpu_ndevices; static struct acpi_cpu_softc **cpu_softc; ACPI_SERIAL_DECL(cpu, "ACPI CPU"); static int acpi_cpu_probe(device_t dev); static int acpi_cpu_attach(device_t dev); static int acpi_cpu_suspend(device_t dev); static int acpi_cpu_resume(device_t dev); static int acpi_pcpu_get_id(device_t dev, uint32_t *acpi_id, uint32_t *cpu_id); static struct resource_list *acpi_cpu_get_rlist(device_t dev, device_t child); static device_t acpi_cpu_add_child(device_t dev, u_int order, const char *name, int unit); static int acpi_cpu_read_ivar(device_t dev, device_t child, int index, uintptr_t *result); static int acpi_cpu_shutdown(device_t dev); static void acpi_cpu_cx_probe(struct acpi_cpu_softc *sc); static void acpi_cpu_generic_cx_probe(struct acpi_cpu_softc *sc); static int acpi_cpu_cx_cst(struct acpi_cpu_softc *sc); static void acpi_cpu_startup(void *arg); static void acpi_cpu_startup_cx(struct acpi_cpu_softc *sc); static void acpi_cpu_cx_list(struct acpi_cpu_softc *sc); #if defined(__i386__) || defined(__amd64__) static void acpi_cpu_idle(sbintime_t sbt); #endif static void acpi_cpu_notify(ACPI_HANDLE h, UINT32 notify, void *context); static void acpi_cpu_quirks(void); static void acpi_cpu_quirks_piix4(void); static int acpi_cpu_usage_sysctl(SYSCTL_HANDLER_ARGS); static int acpi_cpu_usage_counters_sysctl(SYSCTL_HANDLER_ARGS); static int acpi_cpu_set_cx_lowest(struct acpi_cpu_softc *sc); static int acpi_cpu_cx_lowest_sysctl(SYSCTL_HANDLER_ARGS); static int acpi_cpu_global_cx_lowest_sysctl(SYSCTL_HANDLER_ARGS); #if defined(__i386__) || defined(__amd64__) static int acpi_cpu_method_sysctl(SYSCTL_HANDLER_ARGS); #endif static device_method_t acpi_cpu_methods[] = { /* Device interface */ DEVMETHOD(device_probe, acpi_cpu_probe), DEVMETHOD(device_attach, acpi_cpu_attach), DEVMETHOD(device_detach, bus_generic_detach), DEVMETHOD(device_shutdown, acpi_cpu_shutdown), DEVMETHOD(device_suspend, acpi_cpu_suspend), DEVMETHOD(device_resume, acpi_cpu_resume), /* Bus interface */ DEVMETHOD(bus_add_child, acpi_cpu_add_child), DEVMETHOD(bus_read_ivar, acpi_cpu_read_ivar), DEVMETHOD(bus_get_resource_list, acpi_cpu_get_rlist), DEVMETHOD(bus_get_resource, bus_generic_rl_get_resource), DEVMETHOD(bus_set_resource, bus_generic_rl_set_resource), DEVMETHOD(bus_alloc_resource, bus_generic_rl_alloc_resource), DEVMETHOD(bus_release_resource, bus_generic_rl_release_resource), DEVMETHOD(bus_activate_resource, bus_generic_activate_resource), DEVMETHOD(bus_deactivate_resource, bus_generic_deactivate_resource), DEVMETHOD(bus_setup_intr, bus_generic_setup_intr), DEVMETHOD(bus_teardown_intr, bus_generic_teardown_intr), DEVMETHOD_END }; static driver_t acpi_cpu_driver = { "cpu", acpi_cpu_methods, sizeof(struct acpi_cpu_softc), }; static devclass_t acpi_cpu_devclass; DRIVER_MODULE(cpu, acpi, acpi_cpu_driver, acpi_cpu_devclass, 0, 0); MODULE_DEPEND(cpu, acpi, 1, 1, 1); static int acpi_cpu_probe(device_t dev) { int acpi_id, cpu_id; ACPI_BUFFER buf; ACPI_HANDLE handle; ACPI_OBJECT *obj; ACPI_STATUS status; if (acpi_disabled("cpu") || acpi_get_type(dev) != ACPI_TYPE_PROCESSOR || acpi_cpu_disabled) return (ENXIO); handle = acpi_get_handle(dev); if (cpu_softc == NULL) cpu_softc = malloc(sizeof(struct acpi_cpu_softc *) * (mp_maxid + 1), M_TEMP /* XXX */, M_WAITOK | M_ZERO); /* Get our Processor object. */ buf.Pointer = NULL; buf.Length = ACPI_ALLOCATE_BUFFER; status = AcpiEvaluateObject(handle, NULL, NULL, &buf); if (ACPI_FAILURE(status)) { device_printf(dev, "probe failed to get Processor obj - %s\n", AcpiFormatException(status)); return (ENXIO); } obj = (ACPI_OBJECT *)buf.Pointer; if (obj->Type != ACPI_TYPE_PROCESSOR) { device_printf(dev, "Processor object has bad type %d\n", obj->Type); AcpiOsFree(obj); return (ENXIO); } /* * Find the processor associated with our unit. We could use the * ProcId as a key, however, some boxes do not have the same values * in their Processor object as the ProcId values in the MADT. */ acpi_id = obj->Processor.ProcId; AcpiOsFree(obj); if (acpi_pcpu_get_id(dev, &acpi_id, &cpu_id) != 0) return (ENXIO); /* * Check if we already probed this processor. We scan the bus twice * so it's possible we've already seen this one. */ if (cpu_softc[cpu_id] != NULL) return (ENXIO); /* Mark this processor as in-use and save our derived id for attach. */ cpu_softc[cpu_id] = (void *)1; acpi_set_private(dev, (void*)(intptr_t)cpu_id); device_set_desc(dev, "ACPI CPU"); return (0); } static int acpi_cpu_attach(device_t dev) { ACPI_BUFFER buf; ACPI_OBJECT arg, *obj; ACPI_OBJECT_LIST arglist; struct pcpu *pcpu_data; struct acpi_cpu_softc *sc; struct acpi_softc *acpi_sc; ACPI_STATUS status; u_int features; int cpu_id, drv_count, i; driver_t **drivers; uint32_t cap_set[3]; /* UUID needed by _OSC evaluation */ static uint8_t cpu_oscuuid[16] = { 0x16, 0xA6, 0x77, 0x40, 0x0C, 0x29, 0xBE, 0x47, 0x9E, 0xBD, 0xD8, 0x70, 0x58, 0x71, 0x39, 0x53 }; ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__); sc = device_get_softc(dev); sc->cpu_dev = dev; sc->cpu_handle = acpi_get_handle(dev); cpu_id = (int)(intptr_t)acpi_get_private(dev); cpu_softc[cpu_id] = sc; pcpu_data = pcpu_find(cpu_id); pcpu_data->pc_device = dev; sc->cpu_pcpu = pcpu_data; cpu_smi_cmd = AcpiGbl_FADT.SmiCommand; cpu_cst_cnt = AcpiGbl_FADT.CstControl; buf.Pointer = NULL; buf.Length = ACPI_ALLOCATE_BUFFER; status = AcpiEvaluateObject(sc->cpu_handle, NULL, NULL, &buf); if (ACPI_FAILURE(status)) { device_printf(dev, "attach failed to get Processor obj - %s\n", AcpiFormatException(status)); return (ENXIO); } obj = (ACPI_OBJECT *)buf.Pointer; sc->cpu_p_blk = obj->Processor.PblkAddress; sc->cpu_p_blk_len = obj->Processor.PblkLength; sc->cpu_acpi_id = obj->Processor.ProcId; AcpiOsFree(obj); ACPI_DEBUG_PRINT((ACPI_DB_INFO, "acpi_cpu%d: P_BLK at %#x/%d\n", device_get_unit(dev), sc->cpu_p_blk, sc->cpu_p_blk_len)); /* * If this is the first cpu we attach, create and initialize the generic * resources that will be used by all acpi cpu devices. */ if (device_get_unit(dev) == 0) { /* Assume we won't be using generic Cx mode by default */ cpu_cx_generic = FALSE; /* Install hw.acpi.cpu sysctl tree */ acpi_sc = acpi_device_get_parent_softc(dev); sysctl_ctx_init(&cpu_sysctl_ctx); cpu_sysctl_tree = SYSCTL_ADD_NODE(&cpu_sysctl_ctx, SYSCTL_CHILDREN(acpi_sc->acpi_sysctl_tree), OID_AUTO, "cpu", CTLFLAG_RD, 0, "node for CPU children"); } /* * Before calling any CPU methods, collect child driver feature hints * and notify ACPI of them. We support unified SMP power control * so advertise this ourselves. Note this is not the same as independent * SMP control where each CPU can have different settings. */ sc->cpu_features = ACPI_CAP_SMP_SAME | ACPI_CAP_SMP_SAME_C3 | ACPI_CAP_C1_IO_HALT; #if defined(__i386__) || defined(__amd64__) /* * Ask for MWAIT modes if not disabled and interrupts work * reasonable with MWAIT. */ if (!acpi_disabled("mwait") && cpu_mwait_usable()) sc->cpu_features |= ACPI_CAP_SMP_C1_NATIVE | ACPI_CAP_SMP_C3_NATIVE; #endif if (devclass_get_drivers(acpi_cpu_devclass, &drivers, &drv_count) == 0) { for (i = 0; i < drv_count; i++) { if (ACPI_GET_FEATURES(drivers[i], &features) == 0) sc->cpu_features |= features; } free(drivers, M_TEMP); } /* * CPU capabilities are specified in * Intel Processor Vendor-Specific ACPI Interface Specification. */ if (sc->cpu_features) { cap_set[1] = sc->cpu_features; status = acpi_EvaluateOSC(sc->cpu_handle, cpu_oscuuid, 1, 2, cap_set, cap_set, false); if (ACPI_SUCCESS(status)) { if (cap_set[0] != 0) device_printf(dev, "_OSC returned status %#x\n", cap_set[0]); } else { arglist.Pointer = &arg; arglist.Count = 1; arg.Type = ACPI_TYPE_BUFFER; arg.Buffer.Length = sizeof(cap_set); arg.Buffer.Pointer = (uint8_t *)cap_set; cap_set[0] = 1; /* revision */ cap_set[1] = 1; /* number of capabilities integers */ cap_set[2] = sc->cpu_features; AcpiEvaluateObject(sc->cpu_handle, "_PDC", &arglist, NULL); } } /* Probe for Cx state support. */ acpi_cpu_cx_probe(sc); return (0); } static void acpi_cpu_postattach(void *unused __unused) { device_t *devices; int err; int i, n; int attached; err = devclass_get_devices(acpi_cpu_devclass, &devices, &n); if (err != 0) { printf("devclass_get_devices(acpi_cpu_devclass) failed\n"); return; } attached = 0; for (i = 0; i < n; i++) if (device_is_attached(devices[i]) && device_get_driver(devices[i]) == &acpi_cpu_driver) attached = 1; for (i = 0; i < n; i++) bus_generic_probe(devices[i]); for (i = 0; i < n; i++) bus_generic_attach(devices[i]); free(devices, M_TEMP); if (attached) { #ifdef EARLY_AP_STARTUP acpi_cpu_startup(NULL); #else /* Queue post cpu-probing task handler */ AcpiOsExecute(OSL_NOTIFY_HANDLER, acpi_cpu_startup, NULL); #endif } } SYSINIT(acpi_cpu, SI_SUB_CONFIGURE, SI_ORDER_MIDDLE, acpi_cpu_postattach, NULL); static void disable_idle(struct acpi_cpu_softc *sc) { cpuset_t cpuset; CPU_SETOF(sc->cpu_pcpu->pc_cpuid, &cpuset); sc->cpu_disable_idle = TRUE; /* * Ensure that the CPU is not in idle state or in acpi_cpu_idle(). * Note that this code depends on the fact that the rendezvous IPI * can not penetrate context where interrupts are disabled and acpi_cpu_idle * is called and executed in such a context with interrupts being re-enabled * right before return. */ smp_rendezvous_cpus(cpuset, smp_no_rendevous_barrier, NULL, smp_no_rendevous_barrier, NULL); } static void enable_idle(struct acpi_cpu_softc *sc) { sc->cpu_disable_idle = FALSE; } #if defined(__i386__) || defined(__amd64__) static int is_idle_disabled(struct acpi_cpu_softc *sc) { return (sc->cpu_disable_idle); } #endif /* * Disable any entry to the idle function during suspend and re-enable it * during resume. */ static int acpi_cpu_suspend(device_t dev) { int error; error = bus_generic_suspend(dev); if (error) return (error); disable_idle(device_get_softc(dev)); return (0); } static int acpi_cpu_resume(device_t dev) { enable_idle(device_get_softc(dev)); return (bus_generic_resume(dev)); } /* * Find the processor associated with a given ACPI ID. By default, * use the MADT to map ACPI IDs to APIC IDs and use that to locate a * processor. Some systems have inconsistent ASL and MADT however. * For these systems the cpu_unordered tunable can be set in which * case we assume that Processor objects are listed in the same order * in both the MADT and ASL. */ static int acpi_pcpu_get_id(device_t dev, uint32_t *acpi_id, uint32_t *cpu_id) { struct pcpu *pc; uint32_t i, idx; KASSERT(acpi_id != NULL, ("Null acpi_id")); KASSERT(cpu_id != NULL, ("Null cpu_id")); idx = device_get_unit(dev); /* * If pc_acpi_id for CPU 0 is not initialized (e.g. a non-APIC * UP box) use the ACPI ID from the first processor we find. */ if (idx == 0 && mp_ncpus == 1) { pc = pcpu_find(0); if (pc->pc_acpi_id == 0xffffffff) pc->pc_acpi_id = *acpi_id; *cpu_id = 0; return (0); } CPU_FOREACH(i) { pc = pcpu_find(i); KASSERT(pc != NULL, ("no pcpu data for %d", i)); if (cpu_unordered) { if (idx-- == 0) { /* * If pc_acpi_id doesn't match the ACPI ID from the * ASL, prefer the MADT-derived value. */ if (pc->pc_acpi_id != *acpi_id) *acpi_id = pc->pc_acpi_id; *cpu_id = pc->pc_cpuid; return (0); } } else { if (pc->pc_acpi_id == *acpi_id) { if (bootverbose) device_printf(dev, "Processor %s (ACPI ID %u) -> APIC ID %d\n", acpi_name(acpi_get_handle(dev)), *acpi_id, pc->pc_cpuid); *cpu_id = pc->pc_cpuid; return (0); } } } if (bootverbose) printf("ACPI: Processor %s (ACPI ID %u) ignored\n", acpi_name(acpi_get_handle(dev)), *acpi_id); return (ESRCH); } static struct resource_list * acpi_cpu_get_rlist(device_t dev, device_t child) { struct acpi_cpu_device *ad; ad = device_get_ivars(child); if (ad == NULL) return (NULL); return (&ad->ad_rl); } static device_t acpi_cpu_add_child(device_t dev, u_int order, const char *name, int unit) { struct acpi_cpu_device *ad; device_t child; if ((ad = malloc(sizeof(*ad), M_TEMP, M_NOWAIT | M_ZERO)) == NULL) return (NULL); resource_list_init(&ad->ad_rl); child = device_add_child_ordered(dev, order, name, unit); if (child != NULL) device_set_ivars(child, ad); else free(ad, M_TEMP); return (child); } static int acpi_cpu_read_ivar(device_t dev, device_t child, int index, uintptr_t *result) { struct acpi_cpu_softc *sc; sc = device_get_softc(dev); switch (index) { case ACPI_IVAR_HANDLE: *result = (uintptr_t)sc->cpu_handle; break; case CPU_IVAR_PCPU: *result = (uintptr_t)sc->cpu_pcpu; break; #if defined(__amd64__) || defined(__i386__) case CPU_IVAR_NOMINAL_MHZ: if (tsc_is_invariant) { *result = (uintptr_t)(atomic_load_acq_64(&tsc_freq) / 1000000); break; } /* FALLTHROUGH */ #endif default: return (ENOENT); } return (0); } static int acpi_cpu_shutdown(device_t dev) { ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__); /* Allow children to shutdown first. */ bus_generic_shutdown(dev); /* * Disable any entry to the idle function. */ disable_idle(device_get_softc(dev)); /* * CPU devices are not truly detached and remain referenced, * so their resources are not freed. */ return_VALUE (0); } static void acpi_cpu_cx_probe(struct acpi_cpu_softc *sc) { ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__); /* Use initial sleep value of 1 sec. to start with lowest idle state. */ sc->cpu_prev_sleep = 1000000; sc->cpu_cx_lowest = 0; sc->cpu_cx_lowest_lim = 0; /* * Check for the ACPI 2.0 _CST sleep states object. If we can't find * any, we'll revert to generic FADT/P_BLK Cx control method which will * be handled by acpi_cpu_startup. We need to defer to after having * probed all the cpus in the system before probing for generic Cx * states as we may already have found cpus with valid _CST packages */ if (!cpu_cx_generic && acpi_cpu_cx_cst(sc) != 0) { /* * We were unable to find a _CST package for this cpu or there * was an error parsing it. Switch back to generic mode. */ cpu_cx_generic = TRUE; if (bootverbose) device_printf(sc->cpu_dev, "switching to generic Cx mode\n"); } /* * TODO: _CSD Package should be checked here. */ } static void acpi_cpu_generic_cx_probe(struct acpi_cpu_softc *sc) { ACPI_GENERIC_ADDRESS gas; struct acpi_cx *cx_ptr; sc->cpu_cx_count = 0; cx_ptr = sc->cpu_cx_states; /* Use initial sleep value of 1 sec. to start with lowest idle state. */ sc->cpu_prev_sleep = 1000000; /* C1 has been required since just after ACPI 1.0 */ cx_ptr->type = ACPI_STATE_C1; cx_ptr->trans_lat = 0; cx_ptr++; sc->cpu_non_c2 = sc->cpu_cx_count; sc->cpu_non_c3 = sc->cpu_cx_count; sc->cpu_cx_count++; - cpu_deepest_sleep = 1; /* * The spec says P_BLK must be 6 bytes long. However, some systems * use it to indicate a fractional set of features present so we * take 5 as C2. Some may also have a value of 7 to indicate * another C3 but most use _CST for this (as required) and having * "only" C1-C3 is not a hardship. */ if (sc->cpu_p_blk_len < 5) return; /* Validate and allocate resources for C2 (P_LVL2). */ gas.SpaceId = ACPI_ADR_SPACE_SYSTEM_IO; gas.BitWidth = 8; if (AcpiGbl_FADT.C2Latency <= 100) { gas.Address = sc->cpu_p_blk + 4; cx_ptr->res_rid = 0; acpi_bus_alloc_gas(sc->cpu_dev, &cx_ptr->res_type, &cx_ptr->res_rid, &gas, &cx_ptr->p_lvlx, RF_SHAREABLE); if (cx_ptr->p_lvlx != NULL) { cx_ptr->type = ACPI_STATE_C2; cx_ptr->trans_lat = AcpiGbl_FADT.C2Latency; cx_ptr++; sc->cpu_non_c3 = sc->cpu_cx_count; sc->cpu_cx_count++; - cpu_deepest_sleep = 2; } } if (sc->cpu_p_blk_len < 6) return; /* Validate and allocate resources for C3 (P_LVL3). */ if (AcpiGbl_FADT.C3Latency <= 1000 && !(cpu_quirks & CPU_QUIRK_NO_C3)) { gas.Address = sc->cpu_p_blk + 5; cx_ptr->res_rid = 1; acpi_bus_alloc_gas(sc->cpu_dev, &cx_ptr->res_type, &cx_ptr->res_rid, &gas, &cx_ptr->p_lvlx, RF_SHAREABLE); if (cx_ptr->p_lvlx != NULL) { cx_ptr->type = ACPI_STATE_C3; cx_ptr->trans_lat = AcpiGbl_FADT.C3Latency; cx_ptr++; sc->cpu_cx_count++; - cpu_deepest_sleep = 3; } } } #if defined(__i386__) || defined(__amd64__) static void acpi_cpu_cx_cst_mwait(struct acpi_cx *cx_ptr, uint64_t address, int accsize) { cx_ptr->do_mwait = true; cx_ptr->mwait_hint = address & 0xffffffff; cx_ptr->mwait_hw_coord = (accsize & CST_FFH_MWAIT_HW_COORD) != 0; cx_ptr->mwait_bm_avoidance = (accsize & CST_FFH_MWAIT_BM_AVOID) != 0; } #endif static void acpi_cpu_cx_cst_free_plvlx(device_t cpu_dev, struct acpi_cx *cx_ptr) { if (cx_ptr->p_lvlx == NULL) return; bus_release_resource(cpu_dev, cx_ptr->res_type, cx_ptr->res_rid, cx_ptr->p_lvlx); cx_ptr->p_lvlx = NULL; } /* * Parse a _CST package and set up its Cx states. Since the _CST object * can change dynamically, our notify handler may call this function * to clean up and probe the new _CST package. */ static int acpi_cpu_cx_cst(struct acpi_cpu_softc *sc) { struct acpi_cx *cx_ptr; ACPI_STATUS status; ACPI_BUFFER buf; ACPI_OBJECT *top; ACPI_OBJECT *pkg; uint32_t count; int i; #if defined(__i386__) || defined(__amd64__) uint64_t address; int vendor, class, accsize; #endif ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__); buf.Pointer = NULL; buf.Length = ACPI_ALLOCATE_BUFFER; status = AcpiEvaluateObject(sc->cpu_handle, "_CST", NULL, &buf); if (ACPI_FAILURE(status)) return (ENXIO); /* _CST is a package with a count and at least one Cx package. */ top = (ACPI_OBJECT *)buf.Pointer; if (!ACPI_PKG_VALID(top, 2) || acpi_PkgInt32(top, 0, &count) != 0) { device_printf(sc->cpu_dev, "invalid _CST package\n"); AcpiOsFree(buf.Pointer); return (ENXIO); } if (count != top->Package.Count - 1) { device_printf(sc->cpu_dev, "invalid _CST state count (%d != %d)\n", count, top->Package.Count - 1); count = top->Package.Count - 1; } if (count > MAX_CX_STATES) { device_printf(sc->cpu_dev, "_CST has too many states (%d)\n", count); count = MAX_CX_STATES; } sc->cpu_non_c2 = 0; sc->cpu_non_c3 = 0; sc->cpu_cx_count = 0; cx_ptr = sc->cpu_cx_states; /* * C1 has been required since just after ACPI 1.0. * Reserve the first slot for it. */ cx_ptr->type = ACPI_STATE_C0; cx_ptr++; sc->cpu_cx_count++; - cpu_deepest_sleep = 1; /* Set up all valid states. */ for (i = 0; i < count; i++) { pkg = &top->Package.Elements[i + 1]; if (!ACPI_PKG_VALID(pkg, 4) || acpi_PkgInt32(pkg, 1, &cx_ptr->type) != 0 || acpi_PkgInt32(pkg, 2, &cx_ptr->trans_lat) != 0 || acpi_PkgInt32(pkg, 3, &cx_ptr->power) != 0) { device_printf(sc->cpu_dev, "skipping invalid Cx state package\n"); continue; } /* Validate the state to see if we should use it. */ switch (cx_ptr->type) { case ACPI_STATE_C1: acpi_cpu_cx_cst_free_plvlx(sc->cpu_dev, cx_ptr); #if defined(__i386__) || defined(__amd64__) if (acpi_PkgFFH_IntelCpu(pkg, 0, &vendor, &class, &address, &accsize) == 0 && vendor == CST_FFH_VENDOR_INTEL) { if (class == CST_FFH_INTEL_CL_C1IO) { /* C1 I/O then Halt */ cx_ptr->res_rid = sc->cpu_cx_count; bus_set_resource(sc->cpu_dev, SYS_RES_IOPORT, cx_ptr->res_rid, address, 1); cx_ptr->p_lvlx = bus_alloc_resource_any(sc->cpu_dev, SYS_RES_IOPORT, &cx_ptr->res_rid, RF_ACTIVE | RF_SHAREABLE); if (cx_ptr->p_lvlx == NULL) { bus_delete_resource(sc->cpu_dev, SYS_RES_IOPORT, cx_ptr->res_rid); device_printf(sc->cpu_dev, "C1 I/O failed to allocate port %d, " "degrading to C1 Halt", (int)address); } } else if (class == CST_FFH_INTEL_CL_MWAIT) { acpi_cpu_cx_cst_mwait(cx_ptr, address, accsize); } } #endif if (sc->cpu_cx_states[0].type == ACPI_STATE_C0) { /* This is the first C1 state. Use the reserved slot. */ sc->cpu_cx_states[0] = *cx_ptr; } else { sc->cpu_non_c2 = sc->cpu_cx_count; sc->cpu_non_c3 = sc->cpu_cx_count; cx_ptr++; sc->cpu_cx_count++; } continue; case ACPI_STATE_C2: sc->cpu_non_c3 = sc->cpu_cx_count; - if (cpu_deepest_sleep < 2) - cpu_deepest_sleep = 2; break; case ACPI_STATE_C3: default: if ((cpu_quirks & CPU_QUIRK_NO_C3) != 0) { ACPI_DEBUG_PRINT((ACPI_DB_INFO, "acpi_cpu%d: C3[%d] not available.\n", device_get_unit(sc->cpu_dev), i)); continue; - } else - cpu_deepest_sleep = 3; + } break; } /* Free up any previous register. */ acpi_cpu_cx_cst_free_plvlx(sc->cpu_dev, cx_ptr); /* Allocate the control register for C2 or C3. */ #if defined(__i386__) || defined(__amd64__) if (acpi_PkgFFH_IntelCpu(pkg, 0, &vendor, &class, &address, &accsize) == 0 && vendor == CST_FFH_VENDOR_INTEL && class == CST_FFH_INTEL_CL_MWAIT) { /* Native C State Instruction use (mwait) */ acpi_cpu_cx_cst_mwait(cx_ptr, address, accsize); ACPI_DEBUG_PRINT((ACPI_DB_INFO, "acpi_cpu%d: Got C%d/mwait - %d latency\n", device_get_unit(sc->cpu_dev), cx_ptr->type, cx_ptr->trans_lat)); cx_ptr++; sc->cpu_cx_count++; } else #endif { cx_ptr->res_rid = sc->cpu_cx_count; acpi_PkgGas(sc->cpu_dev, pkg, 0, &cx_ptr->res_type, &cx_ptr->res_rid, &cx_ptr->p_lvlx, RF_SHAREABLE); if (cx_ptr->p_lvlx) { ACPI_DEBUG_PRINT((ACPI_DB_INFO, "acpi_cpu%d: Got C%d - %d latency\n", device_get_unit(sc->cpu_dev), cx_ptr->type, cx_ptr->trans_lat)); cx_ptr++; sc->cpu_cx_count++; } } } AcpiOsFree(buf.Pointer); /* If C1 state was not found, we need one now. */ cx_ptr = sc->cpu_cx_states; if (cx_ptr->type == ACPI_STATE_C0) { cx_ptr->type = ACPI_STATE_C1; cx_ptr->trans_lat = 0; } return (0); } /* * Call this *after* all CPUs have been attached. */ static void acpi_cpu_startup(void *arg) { struct acpi_cpu_softc *sc; int i; /* Get set of CPU devices */ devclass_get_devices(acpi_cpu_devclass, &cpu_devices, &cpu_ndevices); /* * Setup any quirks that might necessary now that we have probed * all the CPUs */ acpi_cpu_quirks(); if (cpu_cx_generic) { /* * We are using generic Cx mode, probe for available Cx states * for all processors. */ for (i = 0; i < cpu_ndevices; i++) { sc = device_get_softc(cpu_devices[i]); acpi_cpu_generic_cx_probe(sc); } } else { /* * We are using _CST mode, remove C3 state if necessary. * As we now know for sure that we will be using _CST mode * install our notify handler. */ for (i = 0; i < cpu_ndevices; i++) { sc = device_get_softc(cpu_devices[i]); if (cpu_quirks & CPU_QUIRK_NO_C3) { sc->cpu_cx_count = min(sc->cpu_cx_count, sc->cpu_non_c3 + 1); } AcpiInstallNotifyHandler(sc->cpu_handle, ACPI_DEVICE_NOTIFY, acpi_cpu_notify, sc); } } /* Perform Cx final initialization. */ for (i = 0; i < cpu_ndevices; i++) { sc = device_get_softc(cpu_devices[i]); acpi_cpu_startup_cx(sc); } /* Add a sysctl handler to handle global Cx lowest setting */ SYSCTL_ADD_PROC(&cpu_sysctl_ctx, SYSCTL_CHILDREN(cpu_sysctl_tree), OID_AUTO, "cx_lowest", CTLTYPE_STRING | CTLFLAG_RW, NULL, 0, acpi_cpu_global_cx_lowest_sysctl, "A", "Global lowest Cx sleep state to use"); /* Take over idling from cpu_idle_default(). */ cpu_cx_lowest_lim = 0; for (i = 0; i < cpu_ndevices; i++) { sc = device_get_softc(cpu_devices[i]); enable_idle(sc); } #if defined(__i386__) || defined(__amd64__) cpu_idle_hook = acpi_cpu_idle; #endif } static void acpi_cpu_cx_list(struct acpi_cpu_softc *sc) { struct sbuf sb; int i; /* * Set up the list of Cx states */ sbuf_new(&sb, sc->cpu_cx_supported, sizeof(sc->cpu_cx_supported), SBUF_FIXEDLEN); for (i = 0; i < sc->cpu_cx_count; i++) sbuf_printf(&sb, "C%d/%d/%d ", i + 1, sc->cpu_cx_states[i].type, sc->cpu_cx_states[i].trans_lat); sbuf_trim(&sb); sbuf_finish(&sb); } static void acpi_cpu_startup_cx(struct acpi_cpu_softc *sc) { acpi_cpu_cx_list(sc); SYSCTL_ADD_STRING(&sc->cpu_sysctl_ctx, SYSCTL_CHILDREN(device_get_sysctl_tree(sc->cpu_dev)), OID_AUTO, "cx_supported", CTLFLAG_RD, sc->cpu_cx_supported, 0, "Cx/microsecond values for supported Cx states"); SYSCTL_ADD_PROC(&sc->cpu_sysctl_ctx, SYSCTL_CHILDREN(device_get_sysctl_tree(sc->cpu_dev)), OID_AUTO, "cx_lowest", CTLTYPE_STRING | CTLFLAG_RW, (void *)sc, 0, acpi_cpu_cx_lowest_sysctl, "A", "lowest Cx sleep state to use"); SYSCTL_ADD_PROC(&sc->cpu_sysctl_ctx, SYSCTL_CHILDREN(device_get_sysctl_tree(sc->cpu_dev)), OID_AUTO, "cx_usage", CTLTYPE_STRING | CTLFLAG_RD, (void *)sc, 0, acpi_cpu_usage_sysctl, "A", "percent usage for each Cx state"); SYSCTL_ADD_PROC(&sc->cpu_sysctl_ctx, SYSCTL_CHILDREN(device_get_sysctl_tree(sc->cpu_dev)), OID_AUTO, "cx_usage_counters", CTLTYPE_STRING | CTLFLAG_RD, (void *)sc, 0, acpi_cpu_usage_counters_sysctl, "A", "Cx sleep state counters"); #if defined(__i386__) || defined(__amd64__) SYSCTL_ADD_PROC(&sc->cpu_sysctl_ctx, SYSCTL_CHILDREN(device_get_sysctl_tree(sc->cpu_dev)), OID_AUTO, "cx_method", CTLTYPE_STRING | CTLFLAG_RD, (void *)sc, 0, acpi_cpu_method_sysctl, "A", "Cx entrance methods"); #endif /* Signal platform that we can handle _CST notification. */ if (!cpu_cx_generic && cpu_cst_cnt != 0) { ACPI_LOCK(acpi); AcpiOsWritePort(cpu_smi_cmd, cpu_cst_cnt, 8); ACPI_UNLOCK(acpi); } } #if defined(__i386__) || defined(__amd64__) /* * Idle the CPU in the lowest state possible. This function is called with * interrupts disabled. Note that once it re-enables interrupts, a task * switch can occur so do not access shared data (i.e. the softc) after * interrupts are re-enabled. */ static void acpi_cpu_idle(sbintime_t sbt) { struct acpi_cpu_softc *sc; struct acpi_cx *cx_next; uint64_t cputicks; uint32_t start_time, end_time; ACPI_STATUS status; int bm_active, cx_next_idx, i, us; /* * Look up our CPU id to get our softc. If it's NULL, we'll use C1 * since there is no ACPI processor object for this CPU. This occurs * for logical CPUs in the HTT case. */ sc = cpu_softc[PCPU_GET(cpuid)]; if (sc == NULL) { acpi_cpu_c1(); return; } /* If disabled, take the safe path. */ if (is_idle_disabled(sc)) { acpi_cpu_c1(); return; } /* Find the lowest state that has small enough latency. */ us = sc->cpu_prev_sleep; if (sbt >= 0 && us > (sbt >> 12)) us = (sbt >> 12); cx_next_idx = 0; if (cpu_disable_c2_sleep) i = min(sc->cpu_cx_lowest, sc->cpu_non_c2); else if (cpu_disable_c3_sleep) i = min(sc->cpu_cx_lowest, sc->cpu_non_c3); else i = sc->cpu_cx_lowest; for (; i >= 0; i--) { if (sc->cpu_cx_states[i].trans_lat * 3 <= us) { cx_next_idx = i; break; } } /* * Check for bus master activity. If there was activity, clear * the bit and use the lowest non-C3 state. Note that the USB * driver polling for new devices keeps this bit set all the * time if USB is loaded. */ if ((cpu_quirks & CPU_QUIRK_NO_BM_CTRL) == 0 && cx_next_idx > sc->cpu_non_c3) { status = AcpiReadBitRegister(ACPI_BITREG_BUS_MASTER_STATUS, &bm_active); if (ACPI_SUCCESS(status) && bm_active != 0) { AcpiWriteBitRegister(ACPI_BITREG_BUS_MASTER_STATUS, 1); cx_next_idx = sc->cpu_non_c3; } } /* Select the next state and update statistics. */ cx_next = &sc->cpu_cx_states[cx_next_idx]; sc->cpu_cx_stats[cx_next_idx]++; KASSERT(cx_next->type != ACPI_STATE_C0, ("acpi_cpu_idle: C0 sleep")); /* * Execute HLT (or equivalent) and wait for an interrupt. We can't * precisely calculate the time spent in C1 since the place we wake up * is an ISR. Assume we slept no more then half of quantum, unless * we are called inside critical section, delaying context switch. */ if (cx_next->type == ACPI_STATE_C1) { cputicks = cpu_ticks(); if (cx_next->p_lvlx != NULL) { /* C1 I/O then Halt */ CPU_GET_REG(cx_next->p_lvlx, 1); } if (cx_next->do_mwait) acpi_cpu_idle_mwait(cx_next->mwait_hint); else acpi_cpu_c1(); end_time = ((cpu_ticks() - cputicks) << 20) / cpu_tickrate(); if (curthread->td_critnest == 0) end_time = min(end_time, 500000 / hz); /* acpi_cpu_c1() returns with interrupts enabled. */ if (cx_next->do_mwait) ACPI_ENABLE_IRQS(); sc->cpu_prev_sleep = (sc->cpu_prev_sleep * 3 + end_time) / 4; return; } /* * For C3, disable bus master arbitration and enable bus master wake * if BM control is available, otherwise flush the CPU cache. */ if (cx_next->type == ACPI_STATE_C3 || cx_next->mwait_bm_avoidance) { if ((cpu_quirks & CPU_QUIRK_NO_BM_CTRL) == 0) { AcpiWriteBitRegister(ACPI_BITREG_ARB_DISABLE, 1); AcpiWriteBitRegister(ACPI_BITREG_BUS_MASTER_RLD, 1); } else ACPI_FLUSH_CPU_CACHE(); } /* * Read from P_LVLx to enter C2(+), checking time spent asleep. * Use the ACPI timer for measuring sleep time. Since we need to * get the time very close to the CPU start/stop clock logic, this * is the only reliable time source. */ if (cx_next->type == ACPI_STATE_C3) { AcpiHwRead(&start_time, &AcpiGbl_FADT.XPmTimerBlock); cputicks = 0; } else { start_time = 0; cputicks = cpu_ticks(); } if (cx_next->do_mwait) acpi_cpu_idle_mwait(cx_next->mwait_hint); else CPU_GET_REG(cx_next->p_lvlx, 1); /* * Read the end time twice. Since it may take an arbitrary time * to enter the idle state, the first read may be executed before * the processor has stopped. Doing it again provides enough * margin that we are certain to have a correct value. */ AcpiHwRead(&end_time, &AcpiGbl_FADT.XPmTimerBlock); if (cx_next->type == ACPI_STATE_C3) { AcpiHwRead(&end_time, &AcpiGbl_FADT.XPmTimerBlock); end_time = acpi_TimerDelta(end_time, start_time); } else end_time = ((cpu_ticks() - cputicks) << 20) / cpu_tickrate(); /* Enable bus master arbitration and disable bus master wakeup. */ if ((cx_next->type == ACPI_STATE_C3 || cx_next->mwait_bm_avoidance) && (cpu_quirks & CPU_QUIRK_NO_BM_CTRL) == 0) { AcpiWriteBitRegister(ACPI_BITREG_ARB_DISABLE, 0); AcpiWriteBitRegister(ACPI_BITREG_BUS_MASTER_RLD, 0); } ACPI_ENABLE_IRQS(); sc->cpu_prev_sleep = (sc->cpu_prev_sleep * 3 + PM_USEC(end_time)) / 4; } #endif /* * Re-evaluate the _CST object when we are notified that it changed. */ static void acpi_cpu_notify(ACPI_HANDLE h, UINT32 notify, void *context) { struct acpi_cpu_softc *sc = (struct acpi_cpu_softc *)context; if (notify != ACPI_NOTIFY_CX_STATES) return; /* * C-state data for target CPU is going to be in flux while we execute * acpi_cpu_cx_cst, so disable entering acpi_cpu_idle. * Also, it may happen that multiple ACPI taskqueues may concurrently * execute notifications for the same CPU. ACPI_SERIAL is used to * protect against that. */ ACPI_SERIAL_BEGIN(cpu); disable_idle(sc); /* Update the list of Cx states. */ acpi_cpu_cx_cst(sc); acpi_cpu_cx_list(sc); acpi_cpu_set_cx_lowest(sc); enable_idle(sc); ACPI_SERIAL_END(cpu); acpi_UserNotify("PROCESSOR", sc->cpu_handle, notify); } static void acpi_cpu_quirks(void) { ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__); /* * Bus mastering arbitration control is needed to keep caches coherent * while sleeping in C3. If it's not present but a working flush cache * instruction is present, flush the caches before entering C3 instead. * Otherwise, just disable C3 completely. */ if (AcpiGbl_FADT.Pm2ControlBlock == 0 || AcpiGbl_FADT.Pm2ControlLength == 0) { if ((AcpiGbl_FADT.Flags & ACPI_FADT_WBINVD) && (AcpiGbl_FADT.Flags & ACPI_FADT_WBINVD_FLUSH) == 0) { cpu_quirks |= CPU_QUIRK_NO_BM_CTRL; ACPI_DEBUG_PRINT((ACPI_DB_INFO, "acpi_cpu: no BM control, using flush cache method\n")); } else { cpu_quirks |= CPU_QUIRK_NO_C3; ACPI_DEBUG_PRINT((ACPI_DB_INFO, "acpi_cpu: no BM control, C3 not available\n")); } } /* * If we are using generic Cx mode, C3 on multiple CPUs requires using * the expensive flush cache instruction. */ if (cpu_cx_generic && mp_ncpus > 1) { cpu_quirks |= CPU_QUIRK_NO_BM_CTRL; ACPI_DEBUG_PRINT((ACPI_DB_INFO, "acpi_cpu: SMP, using flush cache mode for C3\n")); } /* Look for various quirks of the PIIX4 part. */ acpi_cpu_quirks_piix4(); } static void acpi_cpu_quirks_piix4(void) { #ifdef __i386__ device_t acpi_dev; uint32_t val; ACPI_STATUS status; acpi_dev = pci_find_device(PCI_VENDOR_INTEL, PCI_DEVICE_82371AB_3); if (acpi_dev != NULL) { switch (pci_get_revid(acpi_dev)) { /* * Disable C3 support for all PIIX4 chipsets. Some of these parts * do not report the BMIDE status to the BM status register and * others have a livelock bug if Type-F DMA is enabled. Linux * works around the BMIDE bug by reading the BM status directly * but we take the simpler approach of disabling C3 for these * parts. * * See erratum #18 ("C3 Power State/BMIDE and Type-F DMA * Livelock") from the January 2002 PIIX4 specification update. * Applies to all PIIX4 models. * * Also, make sure that all interrupts cause a "Stop Break" * event to exit from C2 state. * Also, BRLD_EN_BM (ACPI_BITREG_BUS_MASTER_RLD in ACPI-speak) * should be set to zero, otherwise it causes C2 to short-sleep. * PIIX4 doesn't properly support C3 and bus master activity * need not break out of C2. */ case PCI_REVISION_A_STEP: case PCI_REVISION_B_STEP: case PCI_REVISION_4E: case PCI_REVISION_4M: cpu_quirks |= CPU_QUIRK_NO_C3; ACPI_DEBUG_PRINT((ACPI_DB_INFO, "acpi_cpu: working around PIIX4 bug, disabling C3\n")); val = pci_read_config(acpi_dev, PIIX4_DEVACTB_REG, 4); if ((val & PIIX4_STOP_BREAK_MASK) != PIIX4_STOP_BREAK_MASK) { ACPI_DEBUG_PRINT((ACPI_DB_INFO, "acpi_cpu: PIIX4: enabling IRQs to generate Stop Break\n")); val |= PIIX4_STOP_BREAK_MASK; pci_write_config(acpi_dev, PIIX4_DEVACTB_REG, val, 4); } status = AcpiReadBitRegister(ACPI_BITREG_BUS_MASTER_RLD, &val); if (ACPI_SUCCESS(status) && val != 0) { ACPI_DEBUG_PRINT((ACPI_DB_INFO, "acpi_cpu: PIIX4: reset BRLD_EN_BM\n")); AcpiWriteBitRegister(ACPI_BITREG_BUS_MASTER_RLD, 0); } break; default: break; } } #endif } static int acpi_cpu_usage_sysctl(SYSCTL_HANDLER_ARGS) { struct acpi_cpu_softc *sc; struct sbuf sb; char buf[128]; int i; uintmax_t fract, sum, whole; sc = (struct acpi_cpu_softc *) arg1; sum = 0; for (i = 0; i < sc->cpu_cx_count; i++) sum += sc->cpu_cx_stats[i]; sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN); for (i = 0; i < sc->cpu_cx_count; i++) { if (sum > 0) { whole = (uintmax_t)sc->cpu_cx_stats[i] * 100; fract = (whole % sum) * 100; sbuf_printf(&sb, "%u.%02u%% ", (u_int)(whole / sum), (u_int)(fract / sum)); } else sbuf_printf(&sb, "0.00%% "); } sbuf_printf(&sb, "last %dus", sc->cpu_prev_sleep); sbuf_trim(&sb); sbuf_finish(&sb); sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req); sbuf_delete(&sb); return (0); } /* * XXX TODO: actually add support to count each entry/exit * from the Cx states. */ static int acpi_cpu_usage_counters_sysctl(SYSCTL_HANDLER_ARGS) { struct acpi_cpu_softc *sc; struct sbuf sb; char buf[128]; int i; sc = (struct acpi_cpu_softc *) arg1; /* Print out the raw counters */ sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN); for (i = 0; i < sc->cpu_cx_count; i++) { sbuf_printf(&sb, "%u ", sc->cpu_cx_stats[i]); } sbuf_trim(&sb); sbuf_finish(&sb); sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req); sbuf_delete(&sb); return (0); } #if defined(__i386__) || defined(__amd64__) static int acpi_cpu_method_sysctl(SYSCTL_HANDLER_ARGS) { struct acpi_cpu_softc *sc; struct acpi_cx *cx; struct sbuf sb; char buf[128]; int i; sc = (struct acpi_cpu_softc *)arg1; sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN); for (i = 0; i < sc->cpu_cx_count; i++) { cx = &sc->cpu_cx_states[i]; sbuf_printf(&sb, "C%d/", i + 1); if (cx->do_mwait) { sbuf_cat(&sb, "mwait"); if (cx->mwait_hw_coord) sbuf_cat(&sb, "/hwc"); if (cx->mwait_bm_avoidance) sbuf_cat(&sb, "/bma"); } else if (cx->type == ACPI_STATE_C1) { sbuf_cat(&sb, "hlt"); } else { sbuf_cat(&sb, "io"); } if (cx->type == ACPI_STATE_C1 && cx->p_lvlx != NULL) sbuf_cat(&sb, "/iohlt"); sbuf_putc(&sb, ' '); } sbuf_trim(&sb); sbuf_finish(&sb); sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req); sbuf_delete(&sb); return (0); } #endif static int acpi_cpu_set_cx_lowest(struct acpi_cpu_softc *sc) { int i; ACPI_SERIAL_ASSERT(cpu); sc->cpu_cx_lowest = min(sc->cpu_cx_lowest_lim, sc->cpu_cx_count - 1); /* If not disabling, cache the new lowest non-C3 state. */ sc->cpu_non_c3 = 0; for (i = sc->cpu_cx_lowest; i >= 0; i--) { if (sc->cpu_cx_states[i].type < ACPI_STATE_C3) { sc->cpu_non_c3 = i; break; } } /* Reset the statistics counters. */ bzero(sc->cpu_cx_stats, sizeof(sc->cpu_cx_stats)); return (0); } static int acpi_cpu_cx_lowest_sysctl(SYSCTL_HANDLER_ARGS) { struct acpi_cpu_softc *sc; char state[8]; int val, error; sc = (struct acpi_cpu_softc *) arg1; snprintf(state, sizeof(state), "C%d", sc->cpu_cx_lowest_lim + 1); error = sysctl_handle_string(oidp, state, sizeof(state), req); if (error != 0 || req->newptr == NULL) return (error); if (strlen(state) < 2 || toupper(state[0]) != 'C') return (EINVAL); if (strcasecmp(state, "Cmax") == 0) val = MAX_CX_STATES; else { val = (int) strtol(state + 1, NULL, 10); if (val < 1 || val > MAX_CX_STATES) return (EINVAL); } ACPI_SERIAL_BEGIN(cpu); sc->cpu_cx_lowest_lim = val - 1; acpi_cpu_set_cx_lowest(sc); ACPI_SERIAL_END(cpu); return (0); } static int acpi_cpu_global_cx_lowest_sysctl(SYSCTL_HANDLER_ARGS) { struct acpi_cpu_softc *sc; char state[8]; int val, error, i; snprintf(state, sizeof(state), "C%d", cpu_cx_lowest_lim + 1); error = sysctl_handle_string(oidp, state, sizeof(state), req); if (error != 0 || req->newptr == NULL) return (error); if (strlen(state) < 2 || toupper(state[0]) != 'C') return (EINVAL); if (strcasecmp(state, "Cmax") == 0) val = MAX_CX_STATES; else { val = (int) strtol(state + 1, NULL, 10); if (val < 1 || val > MAX_CX_STATES) return (EINVAL); } /* Update the new lowest useable Cx state for all CPUs. */ ACPI_SERIAL_BEGIN(cpu); cpu_cx_lowest_lim = val - 1; for (i = 0; i < cpu_ndevices; i++) { sc = device_get_softc(cpu_devices[i]); sc->cpu_cx_lowest_lim = cpu_cx_lowest_lim; acpi_cpu_set_cx_lowest(sc); } ACPI_SERIAL_END(cpu); return (0); } Index: head/sys/kern/kern_clocksource.c =================================================================== --- head/sys/kern/kern_clocksource.c (revision 314210) +++ head/sys/kern/kern_clocksource.c (revision 314211) @@ -1,966 +1,965 @@ /*- * Copyright (c) 2010-2013 Alexander Motin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer, * without modification, immediately at the beginning of the file. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * Common routines to manage event timers hardware. */ #include "opt_device_polling.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include -int cpu_deepest_sleep = 0; /* Deepest Cx state available. */ int cpu_disable_c2_sleep = 0; /* Timer dies in C2. */ int cpu_disable_c3_sleep = 0; /* Timer dies in C3. */ static void setuptimer(void); static void loadtimer(sbintime_t now, int first); static int doconfigtimer(void); static void configtimer(int start); static int round_freq(struct eventtimer *et, int freq); static sbintime_t getnextcpuevent(int idle); static sbintime_t getnextevent(void); static int handleevents(sbintime_t now, int fake); static struct mtx et_hw_mtx; #define ET_HW_LOCK(state) \ { \ if (timer->et_flags & ET_FLAGS_PERCPU) \ mtx_lock_spin(&(state)->et_hw_mtx); \ else \ mtx_lock_spin(&et_hw_mtx); \ } #define ET_HW_UNLOCK(state) \ { \ if (timer->et_flags & ET_FLAGS_PERCPU) \ mtx_unlock_spin(&(state)->et_hw_mtx); \ else \ mtx_unlock_spin(&et_hw_mtx); \ } static struct eventtimer *timer = NULL; static sbintime_t timerperiod; /* Timer period for periodic mode. */ static sbintime_t statperiod; /* statclock() events period. */ static sbintime_t profperiod; /* profclock() events period. */ static sbintime_t nexttick; /* Next global timer tick time. */ static u_int busy = 1; /* Reconfiguration is in progress. */ static int profiling; /* Profiling events enabled. */ static char timername[32]; /* Wanted timer. */ TUNABLE_STR("kern.eventtimer.timer", timername, sizeof(timername)); static int singlemul; /* Multiplier for periodic mode. */ SYSCTL_INT(_kern_eventtimer, OID_AUTO, singlemul, CTLFLAG_RWTUN, &singlemul, 0, "Multiplier for periodic mode"); static u_int idletick; /* Run periodic events when idle. */ SYSCTL_UINT(_kern_eventtimer, OID_AUTO, idletick, CTLFLAG_RWTUN, &idletick, 0, "Run periodic events when idle"); static int periodic; /* Periodic or one-shot mode. */ static int want_periodic; /* What mode to prefer. */ TUNABLE_INT("kern.eventtimer.periodic", &want_periodic); struct pcpu_state { struct mtx et_hw_mtx; /* Per-CPU timer mutex. */ u_int action; /* Reconfiguration requests. */ u_int handle; /* Immediate handle resuests. */ sbintime_t now; /* Last tick time. */ sbintime_t nextevent; /* Next scheduled event on this CPU. */ sbintime_t nexttick; /* Next timer tick time. */ sbintime_t nexthard; /* Next hardclock() event. */ sbintime_t nextstat; /* Next statclock() event. */ sbintime_t nextprof; /* Next profclock() event. */ sbintime_t nextcall; /* Next callout event. */ sbintime_t nextcallopt; /* Next optional callout event. */ int ipi; /* This CPU needs IPI. */ int idle; /* This CPU is in idle mode. */ }; static DPCPU_DEFINE(struct pcpu_state, timerstate); DPCPU_DEFINE(sbintime_t, hardclocktime); /* * Timer broadcast IPI handler. */ int hardclockintr(void) { sbintime_t now; struct pcpu_state *state; int done; if (doconfigtimer() || busy) return (FILTER_HANDLED); state = DPCPU_PTR(timerstate); now = state->now; CTR3(KTR_SPARE2, "ipi at %d: now %d.%08x", curcpu, (int)(now >> 32), (u_int)(now & 0xffffffff)); done = handleevents(now, 0); return (done ? FILTER_HANDLED : FILTER_STRAY); } /* * Handle all events for specified time on this CPU */ static int handleevents(sbintime_t now, int fake) { sbintime_t t, *hct; struct trapframe *frame; struct pcpu_state *state; int usermode; int done, runs; CTR3(KTR_SPARE2, "handle at %d: now %d.%08x", curcpu, (int)(now >> 32), (u_int)(now & 0xffffffff)); done = 0; if (fake) { frame = NULL; usermode = 0; } else { frame = curthread->td_intr_frame; usermode = TRAPF_USERMODE(frame); } state = DPCPU_PTR(timerstate); runs = 0; while (now >= state->nexthard) { state->nexthard += tick_sbt; runs++; } if (runs) { hct = DPCPU_PTR(hardclocktime); *hct = state->nexthard - tick_sbt; if (fake < 2) { hardclock_cnt(runs, usermode); done = 1; } } runs = 0; while (now >= state->nextstat) { state->nextstat += statperiod; runs++; } if (runs && fake < 2) { statclock_cnt(runs, usermode); done = 1; } if (profiling) { runs = 0; while (now >= state->nextprof) { state->nextprof += profperiod; runs++; } if (runs && !fake) { profclock_cnt(runs, usermode, TRAPF_PC(frame)); done = 1; } } else state->nextprof = state->nextstat; if (now >= state->nextcallopt || now >= state->nextcall) { state->nextcall = state->nextcallopt = SBT_MAX; callout_process(now); } t = getnextcpuevent(0); ET_HW_LOCK(state); if (!busy) { state->idle = 0; state->nextevent = t; loadtimer(now, (fake == 2) && (timer->et_flags & ET_FLAGS_PERCPU)); } ET_HW_UNLOCK(state); return (done); } /* * Schedule binuptime of the next event on current CPU. */ static sbintime_t getnextcpuevent(int idle) { sbintime_t event; struct pcpu_state *state; u_int hardfreq; state = DPCPU_PTR(timerstate); /* Handle hardclock() events, skipping some if CPU is idle. */ event = state->nexthard; if (idle) { hardfreq = (u_int)hz / 2; if (tc_min_ticktock_freq > 2 #ifdef SMP && curcpu == CPU_FIRST() #endif ) hardfreq = hz / tc_min_ticktock_freq; if (hardfreq > 1) event += tick_sbt * (hardfreq - 1); } /* Handle callout events. */ if (event > state->nextcall) event = state->nextcall; if (!idle) { /* If CPU is active - handle other types of events. */ if (event > state->nextstat) event = state->nextstat; if (profiling && event > state->nextprof) event = state->nextprof; } return (event); } /* * Schedule binuptime of the next event on all CPUs. */ static sbintime_t getnextevent(void) { struct pcpu_state *state; sbintime_t event; #ifdef SMP int cpu; #endif int c; state = DPCPU_PTR(timerstate); event = state->nextevent; c = -1; #ifdef SMP if ((timer->et_flags & ET_FLAGS_PERCPU) == 0) { CPU_FOREACH(cpu) { state = DPCPU_ID_PTR(cpu, timerstate); if (event > state->nextevent) { event = state->nextevent; c = cpu; } } } #endif CTR4(KTR_SPARE2, "next at %d: next %d.%08x by %d", curcpu, (int)(event >> 32), (u_int)(event & 0xffffffff), c); return (event); } /* Hardware timer callback function. */ static void timercb(struct eventtimer *et, void *arg) { sbintime_t now; sbintime_t *next; struct pcpu_state *state; #ifdef SMP int cpu, bcast; #endif /* Do not touch anything if somebody reconfiguring timers. */ if (busy) return; /* Update present and next tick times. */ state = DPCPU_PTR(timerstate); if (et->et_flags & ET_FLAGS_PERCPU) { next = &state->nexttick; } else next = &nexttick; now = sbinuptime(); if (periodic) *next = now + timerperiod; else *next = -1; /* Next tick is not scheduled yet. */ state->now = now; CTR3(KTR_SPARE2, "intr at %d: now %d.%08x", curcpu, (int)(now >> 32), (u_int)(now & 0xffffffff)); #ifdef SMP #ifdef EARLY_AP_STARTUP MPASS(mp_ncpus == 1 || smp_started); #endif /* Prepare broadcasting to other CPUs for non-per-CPU timers. */ bcast = 0; #ifdef EARLY_AP_STARTUP if ((et->et_flags & ET_FLAGS_PERCPU) == 0) { #else if ((et->et_flags & ET_FLAGS_PERCPU) == 0 && smp_started) { #endif CPU_FOREACH(cpu) { state = DPCPU_ID_PTR(cpu, timerstate); ET_HW_LOCK(state); state->now = now; if (now >= state->nextevent) { state->nextevent += SBT_1S; if (curcpu != cpu) { state->ipi = 1; bcast = 1; } } ET_HW_UNLOCK(state); } } #endif /* Handle events for this time on this CPU. */ handleevents(now, 0); #ifdef SMP /* Broadcast interrupt to other CPUs for non-per-CPU timers. */ if (bcast) { CPU_FOREACH(cpu) { if (curcpu == cpu) continue; state = DPCPU_ID_PTR(cpu, timerstate); if (state->ipi) { state->ipi = 0; ipi_cpu(cpu, IPI_HARDCLOCK); } } } #endif } /* * Load new value into hardware timer. */ static void loadtimer(sbintime_t now, int start) { struct pcpu_state *state; sbintime_t new; sbintime_t *next; uint64_t tmp; int eq; if (timer->et_flags & ET_FLAGS_PERCPU) { state = DPCPU_PTR(timerstate); next = &state->nexttick; } else next = &nexttick; if (periodic) { if (start) { /* * Try to start all periodic timers aligned * to period to make events synchronous. */ tmp = now % timerperiod; new = timerperiod - tmp; if (new < tmp) /* Left less then passed. */ new += timerperiod; CTR5(KTR_SPARE2, "load p at %d: now %d.%08x first in %d.%08x", curcpu, (int)(now >> 32), (u_int)(now & 0xffffffff), (int)(new >> 32), (u_int)(new & 0xffffffff)); *next = new + now; et_start(timer, new, timerperiod); } } else { new = getnextevent(); eq = (new == *next); CTR4(KTR_SPARE2, "load at %d: next %d.%08x eq %d", curcpu, (int)(new >> 32), (u_int)(new & 0xffffffff), eq); if (!eq) { *next = new; et_start(timer, new - now, 0); } } } /* * Prepare event timer parameters after configuration changes. */ static void setuptimer(void) { int freq; if (periodic && (timer->et_flags & ET_FLAGS_PERIODIC) == 0) periodic = 0; else if (!periodic && (timer->et_flags & ET_FLAGS_ONESHOT) == 0) periodic = 1; singlemul = MIN(MAX(singlemul, 1), 20); freq = hz * singlemul; while (freq < (profiling ? profhz : stathz)) freq += hz; freq = round_freq(timer, freq); timerperiod = SBT_1S / freq; } /* * Reconfigure specified per-CPU timer on other CPU. Called from IPI handler. */ static int doconfigtimer(void) { sbintime_t now; struct pcpu_state *state; state = DPCPU_PTR(timerstate); switch (atomic_load_acq_int(&state->action)) { case 1: now = sbinuptime(); ET_HW_LOCK(state); loadtimer(now, 1); ET_HW_UNLOCK(state); state->handle = 0; atomic_store_rel_int(&state->action, 0); return (1); case 2: ET_HW_LOCK(state); et_stop(timer); ET_HW_UNLOCK(state); state->handle = 0; atomic_store_rel_int(&state->action, 0); return (1); } if (atomic_readandclear_int(&state->handle) && !busy) { now = sbinuptime(); handleevents(now, 0); return (1); } return (0); } /* * Reconfigure specified timer. * For per-CPU timers use IPI to make other CPUs to reconfigure. */ static void configtimer(int start) { sbintime_t now, next; struct pcpu_state *state; int cpu; if (start) { setuptimer(); now = sbinuptime(); } else now = 0; critical_enter(); ET_HW_LOCK(DPCPU_PTR(timerstate)); if (start) { /* Initialize time machine parameters. */ next = now + timerperiod; if (periodic) nexttick = next; else nexttick = -1; #ifdef EARLY_AP_STARTUP MPASS(mp_ncpus == 1 || smp_started); #endif CPU_FOREACH(cpu) { state = DPCPU_ID_PTR(cpu, timerstate); state->now = now; #ifndef EARLY_AP_STARTUP if (!smp_started && cpu != CPU_FIRST()) state->nextevent = SBT_MAX; else #endif state->nextevent = next; if (periodic) state->nexttick = next; else state->nexttick = -1; state->nexthard = next; state->nextstat = next; state->nextprof = next; state->nextcall = next; state->nextcallopt = next; hardclock_sync(cpu); } busy = 0; /* Start global timer or per-CPU timer of this CPU. */ loadtimer(now, 1); } else { busy = 1; /* Stop global timer or per-CPU timer of this CPU. */ et_stop(timer); } ET_HW_UNLOCK(DPCPU_PTR(timerstate)); #ifdef SMP #ifdef EARLY_AP_STARTUP /* If timer is global we are done. */ if ((timer->et_flags & ET_FLAGS_PERCPU) == 0) { #else /* If timer is global or there is no other CPUs yet - we are done. */ if ((timer->et_flags & ET_FLAGS_PERCPU) == 0 || !smp_started) { #endif critical_exit(); return; } /* Set reconfigure flags for other CPUs. */ CPU_FOREACH(cpu) { state = DPCPU_ID_PTR(cpu, timerstate); atomic_store_rel_int(&state->action, (cpu == curcpu) ? 0 : ( start ? 1 : 2)); } /* Broadcast reconfigure IPI. */ ipi_all_but_self(IPI_HARDCLOCK); /* Wait for reconfiguration completed. */ restart: cpu_spinwait(); CPU_FOREACH(cpu) { if (cpu == curcpu) continue; state = DPCPU_ID_PTR(cpu, timerstate); if (atomic_load_acq_int(&state->action)) goto restart; } #endif critical_exit(); } /* * Calculate nearest frequency supported by hardware timer. */ static int round_freq(struct eventtimer *et, int freq) { uint64_t div; if (et->et_frequency != 0) { div = lmax((et->et_frequency + freq / 2) / freq, 1); if (et->et_flags & ET_FLAGS_POW2DIV) div = 1 << (flsl(div + div / 2) - 1); freq = (et->et_frequency + div / 2) / div; } if (et->et_min_period > SBT_1S) panic("Event timer \"%s\" doesn't support sub-second periods!", et->et_name); else if (et->et_min_period != 0) freq = min(freq, SBT2FREQ(et->et_min_period)); if (et->et_max_period < SBT_1S && et->et_max_period != 0) freq = max(freq, SBT2FREQ(et->et_max_period)); return (freq); } /* * Configure and start event timers (BSP part). */ void cpu_initclocks_bsp(void) { struct pcpu_state *state; int base, div, cpu; mtx_init(&et_hw_mtx, "et_hw_mtx", NULL, MTX_SPIN); CPU_FOREACH(cpu) { state = DPCPU_ID_PTR(cpu, timerstate); mtx_init(&state->et_hw_mtx, "et_hw_mtx", NULL, MTX_SPIN); state->nextcall = SBT_MAX; state->nextcallopt = SBT_MAX; } periodic = want_periodic; /* Grab requested timer or the best of present. */ if (timername[0]) timer = et_find(timername, 0, 0); if (timer == NULL && periodic) { timer = et_find(NULL, ET_FLAGS_PERIODIC, ET_FLAGS_PERIODIC); } if (timer == NULL) { timer = et_find(NULL, ET_FLAGS_ONESHOT, ET_FLAGS_ONESHOT); } if (timer == NULL && !periodic) { timer = et_find(NULL, ET_FLAGS_PERIODIC, ET_FLAGS_PERIODIC); } if (timer == NULL) panic("No usable event timer found!"); et_init(timer, timercb, NULL, NULL); /* Adapt to timer capabilities. */ if (periodic && (timer->et_flags & ET_FLAGS_PERIODIC) == 0) periodic = 0; else if (!periodic && (timer->et_flags & ET_FLAGS_ONESHOT) == 0) periodic = 1; if (timer->et_flags & ET_FLAGS_C3STOP) cpu_disable_c3_sleep++; /* * We honor the requested 'hz' value. * We want to run stathz in the neighborhood of 128hz. * We would like profhz to run as often as possible. */ if (singlemul <= 0 || singlemul > 20) { if (hz >= 1500 || (hz % 128) == 0) singlemul = 1; else if (hz >= 750) singlemul = 2; else singlemul = 4; } if (periodic) { base = round_freq(timer, hz * singlemul); singlemul = max((base + hz / 2) / hz, 1); hz = (base + singlemul / 2) / singlemul; if (base <= 128) stathz = base; else { div = base / 128; if (div >= singlemul && (div % singlemul) == 0) div++; stathz = base / div; } profhz = stathz; while ((profhz + stathz) <= 128 * 64) profhz += stathz; profhz = round_freq(timer, profhz); } else { hz = round_freq(timer, hz); stathz = round_freq(timer, 127); profhz = round_freq(timer, stathz * 64); } tick = 1000000 / hz; tick_sbt = SBT_1S / hz; tick_bt = sbttobt(tick_sbt); statperiod = SBT_1S / stathz; profperiod = SBT_1S / profhz; ET_LOCK(); configtimer(1); ET_UNLOCK(); } /* * Start per-CPU event timers on APs. */ void cpu_initclocks_ap(void) { sbintime_t now; struct pcpu_state *state; struct thread *td; state = DPCPU_PTR(timerstate); now = sbinuptime(); ET_HW_LOCK(state); state->now = now; hardclock_sync(curcpu); spinlock_enter(); ET_HW_UNLOCK(state); td = curthread; td->td_intr_nesting_level++; handleevents(state->now, 2); td->td_intr_nesting_level--; spinlock_exit(); } /* * Switch to profiling clock rates. */ void cpu_startprofclock(void) { ET_LOCK(); if (profiling == 0) { if (periodic) { configtimer(0); profiling = 1; configtimer(1); } else profiling = 1; } else profiling++; ET_UNLOCK(); } /* * Switch to regular clock rates. */ void cpu_stopprofclock(void) { ET_LOCK(); if (profiling == 1) { if (periodic) { configtimer(0); profiling = 0; configtimer(1); } else profiling = 0; } else profiling--; ET_UNLOCK(); } /* * Switch to idle mode (all ticks handled). */ sbintime_t cpu_idleclock(void) { sbintime_t now, t; struct pcpu_state *state; if (idletick || busy || (periodic && (timer->et_flags & ET_FLAGS_PERCPU)) #ifdef DEVICE_POLLING || curcpu == CPU_FIRST() #endif ) return (-1); state = DPCPU_PTR(timerstate); if (periodic) now = state->now; else now = sbinuptime(); CTR3(KTR_SPARE2, "idle at %d: now %d.%08x", curcpu, (int)(now >> 32), (u_int)(now & 0xffffffff)); t = getnextcpuevent(1); ET_HW_LOCK(state); state->idle = 1; state->nextevent = t; if (!periodic) loadtimer(now, 0); ET_HW_UNLOCK(state); return (MAX(t - now, 0)); } /* * Switch to active mode (skip empty ticks). */ void cpu_activeclock(void) { sbintime_t now; struct pcpu_state *state; struct thread *td; state = DPCPU_PTR(timerstate); if (state->idle == 0 || busy) return; if (periodic) now = state->now; else now = sbinuptime(); CTR3(KTR_SPARE2, "active at %d: now %d.%08x", curcpu, (int)(now >> 32), (u_int)(now & 0xffffffff)); spinlock_enter(); td = curthread; td->td_intr_nesting_level++; handleevents(now, 1); td->td_intr_nesting_level--; spinlock_exit(); } /* * Change the frequency of the given timer. This changes et->et_frequency and * if et is the active timer it reconfigures the timer on all CPUs. This is * intended to be a private interface for the use of et_change_frequency() only. */ void cpu_et_frequency(struct eventtimer *et, uint64_t newfreq) { ET_LOCK(); if (et == timer) { configtimer(0); et->et_frequency = newfreq; configtimer(1); } else et->et_frequency = newfreq; ET_UNLOCK(); } void cpu_new_callout(int cpu, sbintime_t bt, sbintime_t bt_opt) { struct pcpu_state *state; /* Do not touch anything if somebody reconfiguring timers. */ if (busy) return; CTR6(KTR_SPARE2, "new co at %d: on %d at %d.%08x - %d.%08x", curcpu, cpu, (int)(bt_opt >> 32), (u_int)(bt_opt & 0xffffffff), (int)(bt >> 32), (u_int)(bt & 0xffffffff)); state = DPCPU_ID_PTR(cpu, timerstate); ET_HW_LOCK(state); /* * If there is callout time already set earlier -- do nothing. * This check may appear redundant because we check already in * callout_process() but this double check guarantees we're safe * with respect to race conditions between interrupts execution * and scheduling. */ state->nextcallopt = bt_opt; if (bt >= state->nextcall) goto done; state->nextcall = bt; /* If there is some other event set earlier -- do nothing. */ if (bt >= state->nextevent) goto done; state->nextevent = bt; /* If timer is periodic -- there is nothing to reprogram. */ if (periodic) goto done; /* If timer is global or of the current CPU -- reprogram it. */ if ((timer->et_flags & ET_FLAGS_PERCPU) == 0 || cpu == curcpu) { loadtimer(sbinuptime(), 0); done: ET_HW_UNLOCK(state); return; } /* Otherwise make other CPU to reprogram it. */ state->handle = 1; ET_HW_UNLOCK(state); #ifdef SMP ipi_cpu(cpu, IPI_HARDCLOCK); #endif } /* * Report or change the active event timers hardware. */ static int sysctl_kern_eventtimer_timer(SYSCTL_HANDLER_ARGS) { char buf[32]; struct eventtimer *et; int error; ET_LOCK(); et = timer; snprintf(buf, sizeof(buf), "%s", et->et_name); ET_UNLOCK(); error = sysctl_handle_string(oidp, buf, sizeof(buf), req); ET_LOCK(); et = timer; if (error != 0 || req->newptr == NULL || strcasecmp(buf, et->et_name) == 0) { ET_UNLOCK(); return (error); } et = et_find(buf, 0, 0); if (et == NULL) { ET_UNLOCK(); return (ENOENT); } configtimer(0); et_free(timer); if (et->et_flags & ET_FLAGS_C3STOP) cpu_disable_c3_sleep++; if (timer->et_flags & ET_FLAGS_C3STOP) cpu_disable_c3_sleep--; periodic = want_periodic; timer = et; et_init(timer, timercb, NULL, NULL); configtimer(1); ET_UNLOCK(); return (error); } SYSCTL_PROC(_kern_eventtimer, OID_AUTO, timer, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0, sysctl_kern_eventtimer_timer, "A", "Chosen event timer"); /* * Report or change the active event timer periodicity. */ static int sysctl_kern_eventtimer_periodic(SYSCTL_HANDLER_ARGS) { int error, val; val = periodic; error = sysctl_handle_int(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); ET_LOCK(); configtimer(0); periodic = want_periodic = val; configtimer(1); ET_UNLOCK(); return (error); } SYSCTL_PROC(_kern_eventtimer, OID_AUTO, periodic, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0, sysctl_kern_eventtimer_periodic, "I", "Enable event timer periodic mode"); #include "opt_ddb.h" #ifdef DDB #include DB_SHOW_COMMAND(clocksource, db_show_clocksource) { struct pcpu_state *st; int c; CPU_FOREACH(c) { st = DPCPU_ID_PTR(c, timerstate); db_printf( "CPU %2d: action %d handle %d ipi %d idle %d\n" " now %#jx nevent %#jx (%jd)\n" " ntick %#jx (%jd) nhard %#jx (%jd)\n" " nstat %#jx (%jd) nprof %#jx (%jd)\n" " ncall %#jx (%jd) ncallopt %#jx (%jd)\n", c, st->action, st->handle, st->ipi, st->idle, (uintmax_t)st->now, (uintmax_t)st->nextevent, (uintmax_t)(st->nextevent - st->now) / tick_sbt, (uintmax_t)st->nexttick, (uintmax_t)(st->nexttick - st->now) / tick_sbt, (uintmax_t)st->nexthard, (uintmax_t)(st->nexthard - st->now) / tick_sbt, (uintmax_t)st->nextstat, (uintmax_t)(st->nextstat - st->now) / tick_sbt, (uintmax_t)st->nextprof, (uintmax_t)(st->nextprof - st->now) / tick_sbt, (uintmax_t)st->nextcall, (uintmax_t)(st->nextcall - st->now) / tick_sbt, (uintmax_t)st->nextcallopt, (uintmax_t)(st->nextcallopt - st->now) / tick_sbt); } } #endif Index: head/sys/sys/systm.h =================================================================== --- head/sys/sys/systm.h (revision 314210) +++ head/sys/sys/systm.h (revision 314211) @@ -1,462 +1,461 @@ /*- * Copyright (c) 1982, 1988, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)systm.h 8.7 (Berkeley) 3/29/95 * $FreeBSD$ */ #ifndef _SYS_SYSTM_H_ #define _SYS_SYSTM_H_ #include #include #include #include #include #include /* for people using printf mainly */ __NULLABILITY_PRAGMA_PUSH extern int cold; /* nonzero if we are doing a cold boot */ extern int suspend_blocked; /* block suspend due to pending shutdown */ extern int rebooting; /* kern_reboot() has been called. */ extern const char *panicstr; /* panic message */ extern char version[]; /* system version */ extern char compiler_version[]; /* compiler version */ extern char copyright[]; /* system copyright */ extern int kstack_pages; /* number of kernel stack pages */ extern u_long pagesizes[]; /* supported page sizes */ extern long physmem; /* physical memory */ extern long realmem; /* 'real' memory */ extern char *rootdevnames[2]; /* names of possible root devices */ extern int boothowto; /* reboot flags, from console subsystem */ extern int bootverbose; /* nonzero to print verbose messages */ extern int maxusers; /* system tune hint */ extern int ngroups_max; /* max # of supplemental groups */ extern int vm_guest; /* Running as virtual machine guest? */ /* * Detected virtual machine guest types. The intention is to expand * and/or add to the VM_GUEST_VM type if specific VM functionality is * ever implemented (e.g. vendor-specific paravirtualization features). * Keep in sync with vm_guest_sysctl_names[]. */ enum VM_GUEST { VM_GUEST_NO = 0, VM_GUEST_VM, VM_GUEST_XEN, VM_GUEST_HV, VM_GUEST_VMWARE, VM_GUEST_KVM, VM_LAST }; #if defined(WITNESS) || defined(INVARIANT_SUPPORT) void kassert_panic(const char *fmt, ...) __printflike(1, 2); #endif #ifdef INVARIANTS /* The option is always available */ #define KASSERT(exp,msg) do { \ if (__predict_false(!(exp))) \ kassert_panic msg; \ } while (0) #define VNASSERT(exp, vp, msg) do { \ if (__predict_false(!(exp))) { \ vn_printf(vp, "VNASSERT failed\n"); \ kassert_panic msg; \ } \ } while (0) #else #define KASSERT(exp,msg) do { \ } while (0) #define VNASSERT(exp, vp, msg) do { \ } while (0) #endif #ifndef CTASSERT /* Allow lint to override */ #define CTASSERT(x) _Static_assert(x, "compile-time assertion failed") #endif /* * Assert that a pointer can be loaded from memory atomically. * * This assertion enforces stronger alignment than necessary. For example, * on some architectures, atomicity for unaligned loads will depend on * whether or not the load spans multiple cache lines. */ #define ASSERT_ATOMIC_LOAD_PTR(var, msg) \ KASSERT(sizeof(var) == sizeof(void *) && \ ((uintptr_t)&(var) & (sizeof(void *) - 1)) == 0, msg) /* * Assert that a thread is in critical(9) section. */ #define CRITICAL_ASSERT(td) \ KASSERT((td)->td_critnest >= 1, ("Not in critical section")); /* * If we have already panic'd and this is the thread that called * panic(), then don't block on any mutexes but silently succeed. * Otherwise, the kernel will deadlock since the scheduler isn't * going to run the thread that holds any lock we need. */ #define SCHEDULER_STOPPED_TD(td) ({ \ MPASS((td) == curthread); \ __predict_false((td)->td_stopsched); \ }) #define SCHEDULER_STOPPED() SCHEDULER_STOPPED_TD(curthread) /* * Align variables. */ #define __read_mostly __section(".data.read_mostly") #define __exclusive_cache_line __aligned(CACHE_LINE_SIZE) \ __section(".data.exclusive_cache_line") /* * XXX the hints declarations are even more misplaced than most declarations * in this file, since they are needed in one file (per arch) and only used * in two files. * XXX most of these variables should be const. */ extern int osreldate; extern int envmode; extern int hintmode; /* 0 = off. 1 = config, 2 = fallback */ extern int dynamic_kenv; extern struct mtx kenv_lock; extern char *kern_envp; extern char static_env[]; extern char static_hints[]; /* by config for now */ extern char **kenvp; extern const void *zero_region; /* address space maps to a zeroed page */ extern int unmapped_buf_allowed; #ifdef __LP64__ #define IOSIZE_MAX iosize_max() #define DEVFS_IOSIZE_MAX devfs_iosize_max() #else #define IOSIZE_MAX SSIZE_MAX #define DEVFS_IOSIZE_MAX SSIZE_MAX #endif /* * General function declarations. */ struct inpcb; struct lock_object; struct malloc_type; struct mtx; struct proc; struct socket; struct thread; struct tty; struct ucred; struct uio; struct _jmp_buf; struct trapframe; struct eventtimer; int setjmp(struct _jmp_buf *) __returns_twice; void longjmp(struct _jmp_buf *, int) __dead2; int dumpstatus(vm_offset_t addr, off_t count); int nullop(void); int eopnotsupp(void); int ureadc(int, struct uio *); void hashdestroy(void *, struct malloc_type *, u_long); void *hashinit(int count, struct malloc_type *type, u_long *hashmask); void *hashinit_flags(int count, struct malloc_type *type, u_long *hashmask, int flags); #define HASH_NOWAIT 0x00000001 #define HASH_WAITOK 0x00000002 void *phashinit(int count, struct malloc_type *type, u_long *nentries); void *phashinit_flags(int count, struct malloc_type *type, u_long *nentries, int flags); void g_waitidle(void); void panic(const char *, ...) __dead2 __printflike(1, 2); void vpanic(const char *, __va_list) __dead2 __printflike(1, 0); void cpu_boot(int); void cpu_flush_dcache(void *, size_t); void cpu_rootconf(void); void critical_enter(void); void critical_exit(void); void init_param1(void); void init_param2(long physpages); void init_static_kenv(char *, size_t); void tablefull(const char *); #ifdef EARLY_PRINTF typedef void early_putc_t(int ch); extern early_putc_t *early_putc; #endif int kvprintf(char const *, void (*)(int, void*), void *, int, __va_list) __printflike(1, 0); void log(int, const char *, ...) __printflike(2, 3); void log_console(struct uio *); void vlog(int, const char *, __va_list) __printflike(2, 0); int asprintf(char **ret, struct malloc_type *mtp, const char *format, ...) __printflike(3, 4); int printf(const char *, ...) __printflike(1, 2); int snprintf(char *, size_t, const char *, ...) __printflike(3, 4); int sprintf(char *buf, const char *, ...) __printflike(2, 3); int uprintf(const char *, ...) __printflike(1, 2); int vprintf(const char *, __va_list) __printflike(1, 0); int vasprintf(char **ret, struct malloc_type *mtp, const char *format, __va_list ap) __printflike(3, 0); int vsnprintf(char *, size_t, const char *, __va_list) __printflike(3, 0); int vsnrprintf(char *, size_t, int, const char *, __va_list) __printflike(4, 0); int vsprintf(char *buf, const char *, __va_list) __printflike(2, 0); int ttyprintf(struct tty *, const char *, ...) __printflike(2, 3); int sscanf(const char *, char const * _Nonnull, ...) __scanflike(2, 3); int vsscanf(const char * _Nonnull, char const * _Nonnull, __va_list) __scanflike(2, 0); long strtol(const char *, char **, int); u_long strtoul(const char *, char **, int); quad_t strtoq(const char *, char **, int); u_quad_t strtouq(const char *, char **, int); void tprintf(struct proc *p, int pri, const char *, ...) __printflike(3, 4); void vtprintf(struct proc *, int, const char *, __va_list) __printflike(3, 0); void hexdump(const void *ptr, int length, const char *hdr, int flags); #define HD_COLUMN_MASK 0xff #define HD_DELIM_MASK 0xff00 #define HD_OMIT_COUNT (1 << 16) #define HD_OMIT_HEX (1 << 17) #define HD_OMIT_CHARS (1 << 18) #define ovbcopy(f, t, l) bcopy((f), (t), (l)) void bcopy(const void * _Nonnull from, void * _Nonnull to, size_t len); void bzero(void * _Nonnull buf, size_t len); void explicit_bzero(void * _Nonnull, size_t); void *memcpy(void * _Nonnull to, const void * _Nonnull from, size_t len); void *memmove(void * _Nonnull dest, const void * _Nonnull src, size_t n); int copystr(const void * _Nonnull __restrict kfaddr, void * _Nonnull __restrict kdaddr, size_t len, size_t * __restrict lencopied); int copyinstr(const void * __restrict udaddr, void * _Nonnull __restrict kaddr, size_t len, size_t * __restrict lencopied); int copyin(const void * _Nonnull __restrict udaddr, void * _Nonnull __restrict kaddr, size_t len); int copyin_nofault(const void * _Nonnull __restrict udaddr, void * _Nonnull __restrict kaddr, size_t len); int copyout(const void * _Nonnull __restrict kaddr, void * _Nonnull __restrict udaddr, size_t len); int copyout_nofault(const void * _Nonnull __restrict kaddr, void * _Nonnull __restrict udaddr, size_t len); int fubyte(volatile const void *base); long fuword(volatile const void *base); int fuword16(volatile const void *base); int32_t fuword32(volatile const void *base); int64_t fuword64(volatile const void *base); int fueword(volatile const void *base, long *val); int fueword32(volatile const void *base, int32_t *val); int fueword64(volatile const void *base, int64_t *val); int subyte(volatile void *base, int byte); int suword(volatile void *base, long word); int suword16(volatile void *base, int word); int suword32(volatile void *base, int32_t word); int suword64(volatile void *base, int64_t word); uint32_t casuword32(volatile uint32_t *base, uint32_t oldval, uint32_t newval); u_long casuword(volatile u_long *p, u_long oldval, u_long newval); int casueword32(volatile uint32_t *base, uint32_t oldval, uint32_t *oldvalp, uint32_t newval); int casueword(volatile u_long *p, u_long oldval, u_long *oldvalp, u_long newval); void realitexpire(void *); int sysbeep(int hertz, int period); void hardclock(int usermode, uintfptr_t pc); void hardclock_cnt(int cnt, int usermode); void hardclock_cpu(int usermode); void hardclock_sync(int cpu); void softclock(void *); void statclock(int usermode); void statclock_cnt(int cnt, int usermode); void profclock(int usermode, uintfptr_t pc); void profclock_cnt(int cnt, int usermode, uintfptr_t pc); int hardclockintr(void); void startprofclock(struct proc *); void stopprofclock(struct proc *); void cpu_startprofclock(void); void cpu_stopprofclock(void); sbintime_t cpu_idleclock(void); void cpu_activeclock(void); void cpu_new_callout(int cpu, sbintime_t bt, sbintime_t bt_opt); void cpu_et_frequency(struct eventtimer *et, uint64_t newfreq); -extern int cpu_deepest_sleep; extern int cpu_disable_c2_sleep; extern int cpu_disable_c3_sleep; char *kern_getenv(const char *name); void freeenv(char *env); int getenv_int(const char *name, int *data); int getenv_uint(const char *name, unsigned int *data); int getenv_long(const char *name, long *data); int getenv_ulong(const char *name, unsigned long *data); int getenv_string(const char *name, char *data, int size); int getenv_int64(const char *name, int64_t *data); int getenv_uint64(const char *name, uint64_t *data); int getenv_quad(const char *name, quad_t *data); int kern_setenv(const char *name, const char *value); int kern_unsetenv(const char *name); int testenv(const char *name); typedef uint64_t (cpu_tick_f)(void); void set_cputicker(cpu_tick_f *func, uint64_t freq, unsigned var); extern cpu_tick_f *cpu_ticks; uint64_t cpu_tickrate(void); uint64_t cputick2usec(uint64_t tick); #ifdef APM_FIXUP_CALLTODO struct timeval; void adjust_timeout_calltodo(struct timeval *time_change); #endif /* APM_FIXUP_CALLTODO */ #include /* Initialize the world */ void consinit(void); void cpu_initclocks(void); void cpu_initclocks_bsp(void); void cpu_initclocks_ap(void); void usrinfoinit(void); /* Finalize the world */ void kern_reboot(int) __dead2; void shutdown_nice(int); /* Timeouts */ typedef void timeout_t(void *); /* timeout function type */ #define CALLOUT_HANDLE_INITIALIZER(handle) \ { NULL } void callout_handle_init(struct callout_handle *); struct callout_handle timeout(timeout_t *, void *, int); void untimeout(timeout_t *, void *, struct callout_handle); /* Stubs for obsolete functions that used to be for interrupt management */ static __inline intrmask_t splbio(void) { return 0; } static __inline intrmask_t splcam(void) { return 0; } static __inline intrmask_t splclock(void) { return 0; } static __inline intrmask_t splhigh(void) { return 0; } static __inline intrmask_t splimp(void) { return 0; } static __inline intrmask_t splnet(void) { return 0; } static __inline intrmask_t spltty(void) { return 0; } static __inline void splx(intrmask_t ipl __unused) { return; } /* * Common `proc' functions are declared here so that proc.h can be included * less often. */ int _sleep(void * _Nonnull chan, struct lock_object *lock, int pri, const char *wmesg, sbintime_t sbt, sbintime_t pr, int flags); #define msleep(chan, mtx, pri, wmesg, timo) \ _sleep((chan), &(mtx)->lock_object, (pri), (wmesg), \ tick_sbt * (timo), 0, C_HARDCLOCK) #define msleep_sbt(chan, mtx, pri, wmesg, bt, pr, flags) \ _sleep((chan), &(mtx)->lock_object, (pri), (wmesg), (bt), (pr), \ (flags)) int msleep_spin_sbt(void * _Nonnull chan, struct mtx *mtx, const char *wmesg, sbintime_t sbt, sbintime_t pr, int flags); #define msleep_spin(chan, mtx, wmesg, timo) \ msleep_spin_sbt((chan), (mtx), (wmesg), tick_sbt * (timo), \ 0, C_HARDCLOCK) int pause_sbt(const char *wmesg, sbintime_t sbt, sbintime_t pr, int flags); #define pause(wmesg, timo) \ pause_sbt((wmesg), tick_sbt * (timo), 0, C_HARDCLOCK) #define tsleep(chan, pri, wmesg, timo) \ _sleep((chan), NULL, (pri), (wmesg), tick_sbt * (timo), \ 0, C_HARDCLOCK) #define tsleep_sbt(chan, pri, wmesg, bt, pr, flags) \ _sleep((chan), NULL, (pri), (wmesg), (bt), (pr), (flags)) void wakeup(void * chan); void wakeup_one(void * chan); /* * Common `struct cdev *' stuff are declared here to avoid #include poisoning */ struct cdev; dev_t dev2udev(struct cdev *x); const char *devtoname(struct cdev *cdev); #ifdef __LP64__ size_t devfs_iosize_max(void); size_t iosize_max(void); #endif int poll_no_poll(int events); /* XXX: Should be void nanodelay(u_int nsec); */ void DELAY(int usec); /* Root mount holdback API */ struct root_hold_token; struct root_hold_token *root_mount_hold(const char *identifier); void root_mount_rel(struct root_hold_token *h); int root_mounted(void); /* * Unit number allocation API. (kern/subr_unit.c) */ struct unrhdr; struct unrhdr *new_unrhdr(int low, int high, struct mtx *mutex); void init_unrhdr(struct unrhdr *uh, int low, int high, struct mtx *mutex); void delete_unrhdr(struct unrhdr *uh); void clean_unrhdr(struct unrhdr *uh); void clean_unrhdrl(struct unrhdr *uh); int alloc_unr(struct unrhdr *uh); int alloc_unr_specific(struct unrhdr *uh, u_int item); int alloc_unrl(struct unrhdr *uh); void free_unr(struct unrhdr *uh, u_int item); void intr_prof_stack_use(struct thread *td, struct trapframe *frame); extern void (*softdep_ast_cleanup)(void); void counted_warning(unsigned *counter, const char *msg); __NULLABILITY_PRAGMA_POP #endif /* !_SYS_SYSTM_H_ */ Index: head/sys/x86/x86/tsc.c =================================================================== --- head/sys/x86/x86/tsc.c (revision 314210) +++ head/sys/x86/x86/tsc.c (revision 314211) @@ -1,759 +1,759 @@ /*- * Copyright (c) 1998-2003 Poul-Henning Kamp * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_clock.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "cpufreq_if.h" uint64_t tsc_freq; int tsc_is_invariant; int tsc_perf_stat; static eventhandler_tag tsc_levels_tag, tsc_pre_tag, tsc_post_tag; SYSCTL_INT(_kern_timecounter, OID_AUTO, invariant_tsc, CTLFLAG_RDTUN, &tsc_is_invariant, 0, "Indicates whether the TSC is P-state invariant"); #ifdef SMP int smp_tsc; SYSCTL_INT(_kern_timecounter, OID_AUTO, smp_tsc, CTLFLAG_RDTUN, &smp_tsc, 0, "Indicates whether the TSC is safe to use in SMP mode"); int smp_tsc_adjust = 0; SYSCTL_INT(_kern_timecounter, OID_AUTO, smp_tsc_adjust, CTLFLAG_RDTUN, &smp_tsc_adjust, 0, "Try to adjust TSC on APs to match BSP"); #endif static int tsc_shift = 1; SYSCTL_INT(_kern_timecounter, OID_AUTO, tsc_shift, CTLFLAG_RDTUN, &tsc_shift, 0, "Shift to pre-apply for the maximum TSC frequency"); static int tsc_disabled; SYSCTL_INT(_machdep, OID_AUTO, disable_tsc, CTLFLAG_RDTUN, &tsc_disabled, 0, "Disable x86 Time Stamp Counter"); static int tsc_skip_calibration; SYSCTL_INT(_machdep, OID_AUTO, disable_tsc_calibration, CTLFLAG_RDTUN, &tsc_skip_calibration, 0, "Disable TSC frequency calibration"); static void tsc_freq_changed(void *arg, const struct cf_level *level, int status); static void tsc_freq_changing(void *arg, const struct cf_level *level, int *status); static unsigned tsc_get_timecount(struct timecounter *tc); static inline unsigned tsc_get_timecount_low(struct timecounter *tc); static unsigned tsc_get_timecount_lfence(struct timecounter *tc); static unsigned tsc_get_timecount_low_lfence(struct timecounter *tc); static unsigned tsc_get_timecount_mfence(struct timecounter *tc); static unsigned tsc_get_timecount_low_mfence(struct timecounter *tc); static void tsc_levels_changed(void *arg, int unit); static uint32_t x86_tsc_vdso_timehands(struct vdso_timehands *vdso_th, struct timecounter *tc); #ifdef COMPAT_FREEBSD32 static uint32_t x86_tsc_vdso_timehands32(struct vdso_timehands32 *vdso_th32, struct timecounter *tc); #endif static struct timecounter tsc_timecounter = { .tc_get_timecount = tsc_get_timecount, .tc_counter_mask = ~0u, .tc_name = "TSC", .tc_quality = 800, /* adjusted in code */ .tc_fill_vdso_timehands = x86_tsc_vdso_timehands, #ifdef COMPAT_FREEBSD32 .tc_fill_vdso_timehands32 = x86_tsc_vdso_timehands32, #endif }; static void tsc_freq_vmware(void) { u_int regs[4]; if (hv_high >= 0x40000010) { do_cpuid(0x40000010, regs); tsc_freq = regs[0] * 1000; } else { vmware_hvcall(VMW_HVCMD_GETHZ, regs); if (regs[1] != UINT_MAX) tsc_freq = regs[0] | ((uint64_t)regs[1] << 32); } tsc_is_invariant = 1; } static void tsc_freq_intel(void) { char brand[48]; u_int regs[4]; uint64_t freq; char *p; u_int i; /* * Intel Processor Identification and the CPUID Instruction * Application Note 485. * http://www.intel.com/assets/pdf/appnote/241618.pdf */ if (cpu_exthigh >= 0x80000004) { p = brand; for (i = 0x80000002; i < 0x80000005; i++) { do_cpuid(i, regs); memcpy(p, regs, sizeof(regs)); p += sizeof(regs); } p = NULL; for (i = 0; i < sizeof(brand) - 1; i++) if (brand[i] == 'H' && brand[i + 1] == 'z') p = brand + i; if (p != NULL) { p -= 5; switch (p[4]) { case 'M': i = 1; break; case 'G': i = 1000; break; case 'T': i = 1000000; break; default: return; } #define C2D(c) ((c) - '0') if (p[1] == '.') { freq = C2D(p[0]) * 1000; freq += C2D(p[2]) * 100; freq += C2D(p[3]) * 10; freq *= i * 1000; } else { freq = C2D(p[0]) * 1000; freq += C2D(p[1]) * 100; freq += C2D(p[2]) * 10; freq += C2D(p[3]); freq *= i * 1000000; } #undef C2D tsc_freq = freq; } } } static void probe_tsc_freq(void) { u_int regs[4]; uint64_t tsc1, tsc2; if (cpu_high >= 6) { do_cpuid(6, regs); if ((regs[2] & CPUID_PERF_STAT) != 0) { /* * XXX Some emulators expose host CPUID without actual * support for these MSRs. We must test whether they * really work. */ wrmsr(MSR_MPERF, 0); wrmsr(MSR_APERF, 0); DELAY(10); if (rdmsr(MSR_MPERF) > 0 && rdmsr(MSR_APERF) > 0) tsc_perf_stat = 1; } } if (vm_guest == VM_GUEST_VMWARE) { tsc_freq_vmware(); return; } switch (cpu_vendor_id) { case CPU_VENDOR_AMD: if ((amd_pminfo & AMDPM_TSC_INVARIANT) != 0 || (vm_guest == VM_GUEST_NO && CPUID_TO_FAMILY(cpu_id) >= 0x10)) tsc_is_invariant = 1; if (cpu_feature & CPUID_SSE2) { tsc_timecounter.tc_get_timecount = tsc_get_timecount_mfence; } break; case CPU_VENDOR_INTEL: if ((amd_pminfo & AMDPM_TSC_INVARIANT) != 0 || (vm_guest == VM_GUEST_NO && ((CPUID_TO_FAMILY(cpu_id) == 0x6 && CPUID_TO_MODEL(cpu_id) >= 0xe) || (CPUID_TO_FAMILY(cpu_id) == 0xf && CPUID_TO_MODEL(cpu_id) >= 0x3)))) tsc_is_invariant = 1; if (cpu_feature & CPUID_SSE2) { tsc_timecounter.tc_get_timecount = tsc_get_timecount_lfence; } break; case CPU_VENDOR_CENTAUR: if (vm_guest == VM_GUEST_NO && CPUID_TO_FAMILY(cpu_id) == 0x6 && CPUID_TO_MODEL(cpu_id) >= 0xf && (rdmsr(0x1203) & 0x100000000ULL) == 0) tsc_is_invariant = 1; if (cpu_feature & CPUID_SSE2) { tsc_timecounter.tc_get_timecount = tsc_get_timecount_lfence; } break; } if (tsc_skip_calibration) { if (cpu_vendor_id == CPU_VENDOR_INTEL) tsc_freq_intel(); return; } if (bootverbose) printf("Calibrating TSC clock ... "); tsc1 = rdtsc(); DELAY(1000000); tsc2 = rdtsc(); tsc_freq = tsc2 - tsc1; if (bootverbose) printf("TSC clock: %ju Hz\n", (intmax_t)tsc_freq); } void init_TSC(void) { if ((cpu_feature & CPUID_TSC) == 0 || tsc_disabled) return; #ifdef __i386__ /* The TSC is known to be broken on certain CPUs. */ switch (cpu_vendor_id) { case CPU_VENDOR_AMD: switch (cpu_id & 0xFF0) { case 0x500: /* K5 Model 0 */ return; } break; case CPU_VENDOR_CENTAUR: switch (cpu_id & 0xff0) { case 0x540: /* * http://www.centtech.com/c6_data_sheet.pdf * * I-12 RDTSC may return incoherent values in EDX:EAX * I-13 RDTSC hangs when certain event counters are used */ return; } break; case CPU_VENDOR_NSC: switch (cpu_id & 0xff0) { case 0x540: if ((cpu_id & CPUID_STEPPING) == 0) return; break; } break; } #endif probe_tsc_freq(); /* * Inform CPU accounting about our boot-time clock rate. This will * be updated if someone loads a cpufreq driver after boot that * discovers a new max frequency. */ if (tsc_freq != 0) set_cputicker(rdtsc, tsc_freq, !tsc_is_invariant); if (tsc_is_invariant) return; /* Register to find out about changes in CPU frequency. */ tsc_pre_tag = EVENTHANDLER_REGISTER(cpufreq_pre_change, tsc_freq_changing, NULL, EVENTHANDLER_PRI_FIRST); tsc_post_tag = EVENTHANDLER_REGISTER(cpufreq_post_change, tsc_freq_changed, NULL, EVENTHANDLER_PRI_FIRST); tsc_levels_tag = EVENTHANDLER_REGISTER(cpufreq_levels_changed, tsc_levels_changed, NULL, EVENTHANDLER_PRI_ANY); } #ifdef SMP /* * RDTSC is not a serializing instruction, and does not drain * instruction stream, so we need to drain the stream before executing * it. It could be fixed by use of RDTSCP, except the instruction is * not available everywhere. * * Use CPUID for draining in the boot-time SMP constistency test. The * timecounters use MFENCE for AMD CPUs, and LFENCE for others (Intel * and VIA) when SSE2 is present, and nothing on older machines which * also do not issue RDTSC prematurely. There, testing for SSE2 and * vendor is too cumbersome, and we learn about TSC presence from CPUID. * * Do not use do_cpuid(), since we do not need CPUID results, which * have to be written into memory with do_cpuid(). */ #define TSC_READ(x) \ static void \ tsc_read_##x(void *arg) \ { \ uint64_t *tsc = arg; \ u_int cpu = PCPU_GET(cpuid); \ \ __asm __volatile("cpuid" : : : "eax", "ebx", "ecx", "edx"); \ tsc[cpu * 3 + x] = rdtsc(); \ } TSC_READ(0) TSC_READ(1) TSC_READ(2) #undef TSC_READ #define N 1000 static void comp_smp_tsc(void *arg) { uint64_t *tsc; int64_t d1, d2; u_int cpu = PCPU_GET(cpuid); u_int i, j, size; size = (mp_maxid + 1) * 3; for (i = 0, tsc = arg; i < N; i++, tsc += size) CPU_FOREACH(j) { if (j == cpu) continue; d1 = tsc[cpu * 3 + 1] - tsc[j * 3]; d2 = tsc[cpu * 3 + 2] - tsc[j * 3 + 1]; if (d1 <= 0 || d2 <= 0) { smp_tsc = 0; return; } } } static void adj_smp_tsc(void *arg) { uint64_t *tsc; int64_t d, min, max; u_int cpu = PCPU_GET(cpuid); u_int first, i, size; first = CPU_FIRST(); if (cpu == first) return; min = INT64_MIN; max = INT64_MAX; size = (mp_maxid + 1) * 3; for (i = 0, tsc = arg; i < N; i++, tsc += size) { d = tsc[first * 3] - tsc[cpu * 3 + 1]; if (d > min) min = d; d = tsc[first * 3 + 1] - tsc[cpu * 3 + 2]; if (d > min) min = d; d = tsc[first * 3 + 1] - tsc[cpu * 3]; if (d < max) max = d; d = tsc[first * 3 + 2] - tsc[cpu * 3 + 1]; if (d < max) max = d; } if (min > max) return; d = min / 2 + max / 2; __asm __volatile ( "movl $0x10, %%ecx\n\t" "rdmsr\n\t" "addl %%edi, %%eax\n\t" "adcl %%esi, %%edx\n\t" "wrmsr\n" : /* No output */ : "D" ((uint32_t)d), "S" ((uint32_t)(d >> 32)) : "ax", "cx", "dx", "cc" ); } static int test_tsc(void) { uint64_t *data, *tsc; u_int i, size, adj; if ((!smp_tsc && !tsc_is_invariant) || vm_guest) return (-100); size = (mp_maxid + 1) * 3; data = malloc(sizeof(*data) * size * N, M_TEMP, M_WAITOK); adj = 0; retry: for (i = 0, tsc = data; i < N; i++, tsc += size) smp_rendezvous(tsc_read_0, tsc_read_1, tsc_read_2, tsc); smp_tsc = 1; /* XXX */ smp_rendezvous(smp_no_rendevous_barrier, comp_smp_tsc, smp_no_rendevous_barrier, data); if (!smp_tsc && adj < smp_tsc_adjust) { adj++; smp_rendezvous(smp_no_rendevous_barrier, adj_smp_tsc, smp_no_rendevous_barrier, data); goto retry; } free(data, M_TEMP); if (bootverbose) printf("SMP: %sed TSC synchronization test%s\n", smp_tsc ? "pass" : "fail", adj > 0 ? " after adjustment" : ""); if (smp_tsc && tsc_is_invariant) { switch (cpu_vendor_id) { case CPU_VENDOR_AMD: /* * Starting with Family 15h processors, TSC clock * source is in the north bridge. Check whether * we have a single-socket/multi-core platform. * XXX Need more work for complex cases. */ if (CPUID_TO_FAMILY(cpu_id) < 0x15 || (amd_feature2 & AMDID2_CMP) == 0 || smp_cpus > (cpu_procinfo2 & AMDID_CMP_CORES) + 1) break; return (1000); case CPU_VENDOR_INTEL: /* * XXX Assume Intel platforms have synchronized TSCs. */ return (1000); } return (800); } return (-100); } #undef N #else /* * The function is not called, it is provided to avoid linking failure * on uniprocessor kernel. */ static int test_tsc(void) { return (0); } #endif /* SMP */ static void init_TSC_tc(void) { uint64_t max_freq; int shift; if ((cpu_feature & CPUID_TSC) == 0 || tsc_disabled) return; /* * Limit timecounter frequency to fit in an int and prevent it from * overflowing too fast. */ max_freq = UINT_MAX; /* * We can not use the TSC if we support APM. Precise timekeeping * on an APM'ed machine is at best a fools pursuit, since * any and all of the time spent in various SMM code can't * be reliably accounted for. Reading the RTC is your only * source of reliable time info. The i8254 loses too, of course, * but we need to have some kind of time... * We don't know at this point whether APM is going to be used * or not, nor when it might be activated. Play it safe. */ if (power_pm_get_type() == POWER_PM_TYPE_APM) { tsc_timecounter.tc_quality = -1000; if (bootverbose) printf("TSC timecounter disabled: APM enabled.\n"); goto init; } /* * Intel CPUs without a C-state invariant TSC can stop the TSC * in either C2 or C3. Disable use of C2 and C3 while using * the TSC as the timecounter. The timecounter can be changed * to enable C2 and C3. * * Note that the TSC is used as the cputicker for computing * thread runtime regardless of the timecounter setting, so * using an alternate timecounter and enabling C2 or C3 can * result incorrect runtimes for kernel idle threads (but not * for any non-idle threads). */ - if (cpu_deepest_sleep >= 2 && cpu_vendor_id == CPU_VENDOR_INTEL && + if (cpu_vendor_id == CPU_VENDOR_INTEL && (amd_pminfo & AMDPM_TSC_INVARIANT) == 0) { tsc_timecounter.tc_flags |= TC_FLAGS_C2STOP; if (bootverbose) printf("TSC timecounter disables C2 and C3.\n"); } /* * We can not use the TSC in SMP mode unless the TSCs on all CPUs * are synchronized. If the user is sure that the system has * synchronized TSCs, set kern.timecounter.smp_tsc tunable to a * non-zero value. The TSC seems unreliable in virtualized SMP * environments, so it is set to a negative quality in those cases. */ if (mp_ncpus > 1) tsc_timecounter.tc_quality = test_tsc(); else if (tsc_is_invariant) tsc_timecounter.tc_quality = 1000; max_freq >>= tsc_shift; init: for (shift = 0; shift <= 31 && (tsc_freq >> shift) > max_freq; shift++) ; if ((cpu_feature & CPUID_SSE2) != 0 && mp_ncpus > 1) { if (cpu_vendor_id == CPU_VENDOR_AMD) { tsc_timecounter.tc_get_timecount = shift > 0 ? tsc_get_timecount_low_mfence : tsc_get_timecount_mfence; } else { tsc_timecounter.tc_get_timecount = shift > 0 ? tsc_get_timecount_low_lfence : tsc_get_timecount_lfence; } } else { tsc_timecounter.tc_get_timecount = shift > 0 ? tsc_get_timecount_low : tsc_get_timecount; } if (shift > 0) { tsc_timecounter.tc_name = "TSC-low"; if (bootverbose) printf("TSC timecounter discards lower %d bit(s)\n", shift); } if (tsc_freq != 0) { tsc_timecounter.tc_frequency = tsc_freq >> shift; tsc_timecounter.tc_priv = (void *)(intptr_t)shift; tc_init(&tsc_timecounter); } } SYSINIT(tsc_tc, SI_SUB_SMP, SI_ORDER_ANY, init_TSC_tc, NULL); /* * When cpufreq levels change, find out about the (new) max frequency. We * use this to update CPU accounting in case it got a lower estimate at boot. */ static void tsc_levels_changed(void *arg, int unit) { device_t cf_dev; struct cf_level *levels; int count, error; uint64_t max_freq; /* Only use values from the first CPU, assuming all are equal. */ if (unit != 0) return; /* Find the appropriate cpufreq device instance. */ cf_dev = devclass_get_device(devclass_find("cpufreq"), unit); if (cf_dev == NULL) { printf("tsc_levels_changed() called but no cpufreq device?\n"); return; } /* Get settings from the device and find the max frequency. */ count = 64; levels = malloc(count * sizeof(*levels), M_TEMP, M_NOWAIT); if (levels == NULL) return; error = CPUFREQ_LEVELS(cf_dev, levels, &count); if (error == 0 && count != 0) { max_freq = (uint64_t)levels[0].total_set.freq * 1000000; set_cputicker(rdtsc, max_freq, 1); } else printf("tsc_levels_changed: no max freq found\n"); free(levels, M_TEMP); } /* * If the TSC timecounter is in use, veto the pending change. It may be * possible in the future to handle a dynamically-changing timecounter rate. */ static void tsc_freq_changing(void *arg, const struct cf_level *level, int *status) { if (*status != 0 || timecounter != &tsc_timecounter) return; printf("timecounter TSC must not be in use when " "changing frequencies; change denied\n"); *status = EBUSY; } /* Update TSC freq with the value indicated by the caller. */ static void tsc_freq_changed(void *arg, const struct cf_level *level, int status) { uint64_t freq; /* If there was an error during the transition, don't do anything. */ if (tsc_disabled || status != 0) return; /* Total setting for this level gives the new frequency in MHz. */ freq = (uint64_t)level->total_set.freq * 1000000; atomic_store_rel_64(&tsc_freq, freq); tsc_timecounter.tc_frequency = freq >> (int)(intptr_t)tsc_timecounter.tc_priv; } static int sysctl_machdep_tsc_freq(SYSCTL_HANDLER_ARGS) { int error; uint64_t freq; freq = atomic_load_acq_64(&tsc_freq); if (freq == 0) return (EOPNOTSUPP); error = sysctl_handle_64(oidp, &freq, 0, req); if (error == 0 && req->newptr != NULL) { atomic_store_rel_64(&tsc_freq, freq); atomic_store_rel_64(&tsc_timecounter.tc_frequency, freq >> (int)(intptr_t)tsc_timecounter.tc_priv); } return (error); } SYSCTL_PROC(_machdep, OID_AUTO, tsc_freq, CTLTYPE_U64 | CTLFLAG_RW, 0, 0, sysctl_machdep_tsc_freq, "QU", "Time Stamp Counter frequency"); static u_int tsc_get_timecount(struct timecounter *tc __unused) { return (rdtsc32()); } static inline u_int tsc_get_timecount_low(struct timecounter *tc) { uint32_t rv; __asm __volatile("rdtsc; shrd %%cl, %%edx, %0" : "=a" (rv) : "c" ((int)(intptr_t)tc->tc_priv) : "edx"); return (rv); } static u_int tsc_get_timecount_lfence(struct timecounter *tc __unused) { lfence(); return (rdtsc32()); } static u_int tsc_get_timecount_low_lfence(struct timecounter *tc) { lfence(); return (tsc_get_timecount_low(tc)); } static u_int tsc_get_timecount_mfence(struct timecounter *tc __unused) { mfence(); return (rdtsc32()); } static u_int tsc_get_timecount_low_mfence(struct timecounter *tc) { mfence(); return (tsc_get_timecount_low(tc)); } static uint32_t x86_tsc_vdso_timehands(struct vdso_timehands *vdso_th, struct timecounter *tc) { vdso_th->th_algo = VDSO_TH_ALGO_X86_TSC; vdso_th->th_x86_shift = (int)(intptr_t)tc->tc_priv; vdso_th->th_x86_hpet_idx = 0xffffffff; bzero(vdso_th->th_res, sizeof(vdso_th->th_res)); return (1); } #ifdef COMPAT_FREEBSD32 static uint32_t x86_tsc_vdso_timehands32(struct vdso_timehands32 *vdso_th32, struct timecounter *tc) { vdso_th32->th_algo = VDSO_TH_ALGO_X86_TSC; vdso_th32->th_x86_shift = (int)(intptr_t)tc->tc_priv; vdso_th32->th_x86_hpet_idx = 0xffffffff; bzero(vdso_th32->th_res, sizeof(vdso_th32->th_res)); return (1); } #endif