diff --git a/design/cpufreq.adoc b/design/cpufreq.adoc new file mode 100644 --- /dev/null +++ b/design/cpufreq.adoc @@ -0,0 +1,270 @@ += cpufreq Device Driver Design + +This is a quick note on changes to cpufreq drivers. + +== Motivation + +Currently, all cpu frequency control drivers are added using identify routines. +These routines check different characteristics of the CPU directly or using the device tree, or the ACPI properties of the CPU. +When they can arrange for exactly one node to be added, things work well. +x86, however, has several different cpu frequency controls, and it can be a challenge to ensure only one. +esp and p4tcc can 'race' to see which one attaches based on a detail of how the loader sorts data sets. +A number of hacks are in place with the different drivers to 'arbitrate' which driver to attach, many of which can be fragile due to ordering issues. +It can also be hard to add new drivers, especially ones that may conflict with others. + +== High Level Design + +Turn the current cpufreq device which is added by other drivers into being the base-class for those drivers. +With this, then the cpu drivers in the tree can add a "cpufreq" device, and then probe and attach that. +The identify routines would be eliminated. +When the "cpufreq" device is probed, all the drivers will "bid" on the device. +When multiple drivers could handle the CPU, then one that returns the highest priority will get the device. +Drivers would need to be modified to be a subclass, and the registration routines would need to change a bit. +Add explicit disabling checks. + +== Notes + +We have to explicitly check in the probe routine if the device is disabled. +Normally, we probe the device, allow that to succed, then disable the device just before we attach it. +This works well because usually there's only one driver that will attach to a device. +However, for devices that are subclasses, multiple drivers are bidding for that one device. +To get disable to mean 'don't use it' as it normally would, the probe routines would need to do it. +Perhaps we need to fix the probe and attach stuff to not 'accept' a bid from a disablfed device that's disabled, but maybe we need to accept it if it's the only choice. + +It's unclear if we need both `acpi_throttle` and `esp`, but it's quite likely that we don't. + +We have to have a design that allows other kinds of devices attached to cpu. +We attach hw performance, temperature monitoring, etc nodes to the CPU, so we either have to work around it, or have a design that creates another psuedo bus for these devices. +Subclassing seems simpler than an extra layer. + +Currently, `est_probe` does this crazy thing: +[source] +---- + /* + * If the ACPI perf driver has attached and is not just offering + * info, let it manage things. + */ + perf_dev = device_find_child(device_get_parent(dev), "acpi_perf", -1); + if (perf_dev && device_is_attached(perf_dev)) { + error = CPUFREQ_DRV_TYPE(perf_dev, &type); + if (error == 0 && (type & CPUFREQ_FLAG_INFO_ONLY) == 0) + return (ENXIO); + } +---- +So we need to move the info only stuff up into `acpi_cpu` and have `acpi_perf` +only attach if it isn't info only. + +Also, `est_probe` tries to enable SpeedStep and fails to probe if it can't. +It's unclear if this belongs in the probe, or if we should defer that to attach. +In the `identify` world we check to see if `acpi_perf` is attached before we try. +In the newbus priority world, though, I think we should defer this to attach and then we'll only do it if we're selected. +This is safer, and I think should just work. + +`hwpstate_amd` has a comment `Only hwpstate0. It goes well with acpi_throttle.` and then enforces only one instance. what does this mean? +Also, hwpstate_amd has interlock with acpi_perf in strange ways. +This needs to be disentabgled. +[source] +---- + /* + * Check if acpi_perf has INFO only flag. + */ + perf_dev = device_find_child(device_get_parent(dev), "acpi_perf", -1); + error = TRUE; + if (perf_dev && device_is_attached(perf_dev)) { + error = CPUFREQ_DRV_TYPE(perf_dev, &type); + if (error == 0) { + if ((type & CPUFREQ_FLAG_INFO_ONLY) == 0) { + /* + * If acpi_perf doesn't have INFO_ONLY flag, + * it will take care of pstate transitions. + */ + HWPSTATE_DEBUG(dev, "acpi_perf will take care of pstate transitions.\n"); + return (ENXIO); + } else { + /* + * If acpi_perf has INFO_ONLY flag, (_PCT has FFixedHW) + * we can get _PSS info from acpi_perf + * without going into ACPI. + */ + HWPSTATE_DEBUG(dev, "going to fetch info from acpi_perf\n"); + error = hwpstate_get_info_from_acpi_perf(dev, perf_dev); + } + } + } + + if (error == 0) { + /* + * Now we get _PSS info from acpi_perf without error. + * Let's check it. + */ + msr = rdmsr(MSR_AMD_10H_11H_LIMIT); + if (sc->cfnum != 1 + AMD_10H_11H_GET_PSTATE_MAX_VAL(msr)) { + HWPSTATE_DEBUG(dev, "MSR (%jd) and ACPI _PSS (%d)" + " count mismatch\n", (intmax_t)msr, sc->cfnum); + error = TRUE; + } + } +---- + +smist has familiar coupling with `acpi_perf`, but also `ichss` +[source] +---- + /* + * If the ACPI perf or ICH SpeedStep drivers have attached and not + * just offering info, let them manage things. + */ + perf_dev = device_find_child(device_get_parent(dev), "acpi_perf", -1); + if (perf_dev && device_is_attached(perf_dev)) { + rv = CPUFREQ_DRV_TYPE(perf_dev, &type); + if (rv == 0 && (type & CPUFREQ_FLAG_INFO_ONLY) == 0) + return (ENXIO); + } + ichss_dev = device_find_child(device_get_parent(dev), "ichss", -1); + if (ichss_dev && device_is_attached(ichss_dev)) + return (ENXIO); +---- + +`ichss` does a lot of filtering for very old devices, released 2002. +Maybe we need to thin this one. + +`acpi_perf` probes and attaches itself in `identify`. +Thankfully, this is disappearing, as it rightly should. + +So we have the following partial ordering in the code: +* `est` > `ichss` +* `acpi_perf` > `ichss` +* `ichss` > `smist` +* `acpi_perf` > `smist` +* `acpi_perf` > `hwpstate_amd` +* `acpi_perf` > `est` +* `hwpstate_intel` > `est` +* `est` > `p4tcc` +* `p4tcc` > `acpi_throttle` + +.Partial Ordering Table +|================================ +|hwpstate_intel | > est | > p4tcc | > acpi_throttle | +|acpi_perf | > est | > ichss | > smist | +|acpi_perf | | > ichss | > smist | +|acpi_perf | > hwpstate_amd | | +|================================ + +.Priorities +|================================ +|smist | BUS_PROBE_DEFAULT - 15 | +|acpi_throttle | BUS_PROBE_DEFAULT - 15 | +|ichss | BUS_PROBE_DEFAULT - 10 | +|p4tcc | BUS_PROBE_DEFAULT - 10 | +|est | BUS_PROBE_DEFAULT - 5 | +|hwpstate_amd | BUS_PROBE_DEFAULT - 5 | +|acpi_perf | BUS_PROBE_DEFAULT | +|hwpstate_intel | BUS_PROBE_DEFAULT | +|powernow | BUS_PROBE_DEFAULT | +|================================ + +Should acpi_perf be higher or lower than hwpstate_intel? +Or the same? +Normally, when we have acpi and non-acpi we have a tie-breaker. + +Note: +Changed hwpstate_intel from BUS_PROBE_NOWILDCARD, which has magic, to BUS_PROBE_DEFAULT. +Changed smist from -1500 to BUS_PROBE_DEFAULT - 15. +Changed ichss from -1000 to BUS_PROBE_DEFAULT - 10. + +cpufreq_dt returns BUS_PROBE_GENERIC. +Should it do something else like BUS_PROBE_DEFAULT? +Right now, with the identify will ensure it always attaches, so it's hard to know what it should defer to. + +acpi_perf claims a device when there's a _PCT method, but PkgGas fails... +Should we fail the probe of the device instead? + +== Tasks +Many of the checklist items are nops, but we need to audit each driver for each of these things. +There are 18 drivers in the tree that register as cpufreq drivers. + +* [x] Declare cpufreq to be a subclass +* [x] Add explicit checks to all current cpu frequency control devices for being disable +* [*] Make all the `probe` routines accept only the conditions that `identify` accepts. +** [*] mpc85xx_jog +** [*] dfs +** [*] pcr +** [*] pmufreq +** [*] pmcr +** [*] tegra210_cpufreq +** [*] acpi_throttle +** [*] acpi_perf +** [*] cpufreq_dt +** [*] ichss +** [*] smist +** [*] hwpstate_intel +** [*] hwpstate_amd +** [*] powernow +** [*] p4tcc +** [*] est +** [*] bcm2835_cpufreq +** [*] tegra124_cpufreq +* [*] Make all the `probe` routines return PROBE_BUS_DEFAULT +/- bias. +** [*] mpc85xx_jog +** [*] dfs +** [*] pcr +** [*] pmufreq +** [*] pmcr +** [*] tegra210_cpufreq +** [*] acpi_throttle +** [*] acpi_perf +** [*] cpufreq_dt +** [*] ichss +** [*] smist +** [*] hwpstate_intel +** [*] hwpstate_amd +** [*] powernow +** [*] p4tcc +** [*] est +** [*] bcm2835_cpufreq +** [*] tegra124_cpufreq +* [ ] Create `cpufreq_ctor` to replace `cpufreq_register` +* [ ] Create `cpufreq_dtor` to replace `cpufreq_unregister` +* [ ] Migrate to subclass with `cpufreq_ctor` and `cpufreq_dtor` +** [ ] mpc85xx_jog +** [ ] dfs +** [ ] pcr +** [ ] pmufreq +** [ ] pmcr +** [ ] tegra210_cpufreq +** [ ] acpi_throttle +** [ ] acpi_perf +** [ ] cpufreq_dt +** [ ] ichss +** [ ] smist +** [ ] hwpstate_intel +** [ ] hwpstate_amd +** [ ] powernow +** [ ] p4tcc +** [ ] est +** [ ] bcm2835_cpufreq +** [ ] tegra124_cpufreq +* [ ] Split out the CPUFREQ_FLAG_INFO_ONLY stuff from acpi_perf and move it into acpi_cpu. +* [ ] Remove now-useless `identify` routine +** [ ] mpc85xx_jog +** [ ] dfs +** [ ] pcr +** [ ] pmufreq +** [ ] pmcr +** [ ] tegra210_cpufreq +** [ ] acpi_throttle +** [ ] acpi_perf +** [ ] cpufreq_dt +** [ ] ichss +** [ ] smist +** [ ] hwpstate_intel +** [ ] hwpstate_amd +** [ ] powernow +** [ ] p4tcc +** [ ] est +** [ ] bcm2835_cpufreq +** [ ] tegra124_cpufreq +* [ ] Update cpu drivers to explicitly add "cpufreq" child +** [ ] ofw_cpu +** [ ] legacy +** [ ] acpi +* [ ] Remove `cpufreq_register` and `cpufreq_unregister`