Index: head/sys/dev/acpica/acpi_perf.c =================================================================== --- head/sys/dev/acpica/acpi_perf.c (revision 299352) +++ head/sys/dev/acpica/acpi_perf.c (revision 299353) @@ -1,597 +1,595 @@ /*- * Copyright (c) 2003-2005 Nate Lawson (SDG) * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_acpi.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "cpufreq_if.h" /* * Support for ACPI processor performance states (Px) according to * section 8.3.3 of the ACPI 2.0c specification. */ struct acpi_px { uint32_t core_freq; uint32_t power; uint32_t trans_lat; uint32_t bm_lat; uint32_t ctrl_val; uint32_t sts_val; }; /* Offsets in struct cf_setting array for storing driver-specific values. */ #define PX_SPEC_CONTROL 0 #define PX_SPEC_STATUS 1 #define MAX_PX_STATES 16 struct acpi_perf_softc { device_t dev; ACPI_HANDLE handle; struct resource *perf_ctrl; /* Set new performance state. */ int perf_ctrl_type; /* Resource type for perf_ctrl. */ struct resource *perf_status; /* Check that transition succeeded. */ int perf_sts_type; /* Resource type for perf_status. */ struct acpi_px *px_states; /* ACPI perf states. */ uint32_t px_count; /* Total number of perf states. */ uint32_t px_max_avail; /* Lowest index state available. */ int px_curr_state; /* Active state index. */ int px_rid; int info_only; /* Can we set new states? */ }; #define PX_GET_REG(reg) \ (bus_space_read_4(rman_get_bustag((reg)), \ rman_get_bushandle((reg)), 0)) #define PX_SET_REG(reg, val) \ (bus_space_write_4(rman_get_bustag((reg)), \ rman_get_bushandle((reg)), 0, (val))) #define ACPI_NOTIFY_PERF_STATES 0x80 /* _PSS changed. */ static void acpi_perf_identify(driver_t *driver, device_t parent); static int acpi_perf_probe(device_t dev); static int acpi_perf_attach(device_t dev); static int acpi_perf_detach(device_t dev); static int acpi_perf_evaluate(device_t dev); static int acpi_px_to_set(device_t dev, struct acpi_px *px, struct cf_setting *set); static void acpi_px_available(struct acpi_perf_softc *sc); static void acpi_px_startup(void *arg); static void acpi_px_notify(ACPI_HANDLE h, UINT32 notify, void *context); static int acpi_px_settings(device_t dev, struct cf_setting *sets, int *count); static int acpi_px_set(device_t dev, const struct cf_setting *set); static int acpi_px_get(device_t dev, struct cf_setting *set); static int acpi_px_type(device_t dev, int *type); static device_method_t acpi_perf_methods[] = { /* Device interface */ DEVMETHOD(device_identify, acpi_perf_identify), DEVMETHOD(device_probe, acpi_perf_probe), DEVMETHOD(device_attach, acpi_perf_attach), DEVMETHOD(device_detach, acpi_perf_detach), /* cpufreq interface */ DEVMETHOD(cpufreq_drv_set, acpi_px_set), DEVMETHOD(cpufreq_drv_get, acpi_px_get), DEVMETHOD(cpufreq_drv_type, acpi_px_type), DEVMETHOD(cpufreq_drv_settings, acpi_px_settings), DEVMETHOD_END }; static driver_t acpi_perf_driver = { "acpi_perf", acpi_perf_methods, sizeof(struct acpi_perf_softc), }; static devclass_t acpi_perf_devclass; DRIVER_MODULE(acpi_perf, cpu, acpi_perf_driver, acpi_perf_devclass, 0, 0); MODULE_DEPEND(acpi_perf, acpi, 1, 1, 1); static MALLOC_DEFINE(M_ACPIPERF, "acpi_perf", "ACPI Performance states"); static void acpi_perf_identify(driver_t *driver, device_t parent) { ACPI_HANDLE handle; device_t dev; /* Make sure we're not being doubly invoked. */ if (device_find_child(parent, "acpi_perf", -1) != NULL) return; /* Get the handle for the Processor object and check for perf states. */ handle = acpi_get_handle(parent); if (handle == NULL) return; if (ACPI_FAILURE(AcpiEvaluateObject(handle, "_PSS", NULL, NULL))) return; /* * Add a child to every CPU that has the right methods. In future * versions of the ACPI spec, CPUs can have different settings. * We probe this child now so that other devices that depend * on it (i.e., for info about supported states) will see it. */ if ((dev = BUS_ADD_CHILD(parent, 0, "acpi_perf", -1)) != NULL) device_probe_and_attach(dev); else device_printf(parent, "add acpi_perf child failed\n"); } static int acpi_perf_probe(device_t dev) { ACPI_HANDLE handle; ACPI_OBJECT *pkg; struct resource *res; ACPI_BUFFER buf; int error, rid, type; if (resource_disabled("acpi_perf", 0)) return (ENXIO); /* * Check the performance state registers. If they are of type * "functional fixed hardware", we attach quietly since we will * only be providing information on settings to other drivers. */ error = ENXIO; handle = acpi_get_handle(dev); buf.Pointer = NULL; buf.Length = ACPI_ALLOCATE_BUFFER; if (ACPI_FAILURE(AcpiEvaluateObject(handle, "_PCT", NULL, &buf))) return (error); pkg = (ACPI_OBJECT *)buf.Pointer; if (ACPI_PKG_VALID(pkg, 2)) { rid = 0; error = acpi_PkgGas(dev, pkg, 0, &type, &rid, &res, 0); switch (error) { case 0: bus_release_resource(dev, type, rid, res); bus_delete_resource(dev, type, rid); device_set_desc(dev, "ACPI CPU Frequency Control"); break; case EOPNOTSUPP: device_quiet(dev); error = 0; break; } } AcpiOsFree(buf.Pointer); return (error); } static int acpi_perf_attach(device_t dev) { struct acpi_perf_softc *sc; sc = device_get_softc(dev); sc->dev = dev; sc->handle = acpi_get_handle(dev); sc->px_max_avail = 0; sc->px_curr_state = CPUFREQ_VAL_UNKNOWN; if (acpi_perf_evaluate(dev) != 0) return (ENXIO); AcpiOsExecute(OSL_NOTIFY_HANDLER, acpi_px_startup, NULL); if (!sc->info_only) cpufreq_register(dev); return (0); } static int acpi_perf_detach(device_t dev) { /* TODO: teardown registers, remove notify handler. */ return (ENXIO); } /* Probe and setup any valid performance states (Px). */ static int acpi_perf_evaluate(device_t dev) { struct acpi_perf_softc *sc; ACPI_BUFFER buf; ACPI_OBJECT *pkg, *res; ACPI_STATUS status; int count, error, i, j; static int once = 1; uint32_t *p; /* Get the control values and parameters for each state. */ error = ENXIO; sc = device_get_softc(dev); buf.Pointer = NULL; buf.Length = ACPI_ALLOCATE_BUFFER; status = AcpiEvaluateObject(sc->handle, "_PSS", NULL, &buf); if (ACPI_FAILURE(status)) return (ENXIO); pkg = (ACPI_OBJECT *)buf.Pointer; if (!ACPI_PKG_VALID(pkg, 1)) { device_printf(dev, "invalid top level _PSS package\n"); goto out; } sc->px_count = pkg->Package.Count; sc->px_states = malloc(sc->px_count * sizeof(struct acpi_px), M_ACPIPERF, M_WAITOK | M_ZERO); - if (sc->px_states == NULL) - goto out; /* * Each state is a package of {CoreFreq, Power, TransitionLatency, * BusMasterLatency, ControlVal, StatusVal}, sorted from highest * performance to lowest. */ count = 0; for (i = 0; i < sc->px_count; i++) { res = &pkg->Package.Elements[i]; if (!ACPI_PKG_VALID(res, 6)) { if (once) { once = 0; device_printf(dev, "invalid _PSS package\n"); } continue; } /* Parse the rest of the package into the struct. */ p = &sc->px_states[count].core_freq; for (j = 0; j < 6; j++, p++) acpi_PkgInt32(res, j, p); /* * Check for some impossible frequencies that some systems * use to indicate they don't actually support this Px state. */ if (sc->px_states[count].core_freq == 0 || sc->px_states[count].core_freq == 9999 || sc->px_states[count].core_freq == 0x9999 || sc->px_states[count].core_freq >= 0xffff) continue; /* Check for duplicate entries */ if (count > 0 && sc->px_states[count - 1].core_freq == sc->px_states[count].core_freq) continue; count++; } sc->px_count = count; /* No valid Px state found so give up. */ if (count == 0) goto out; AcpiOsFree(buf.Pointer); /* Get the control and status registers (one of each). */ buf.Pointer = NULL; buf.Length = ACPI_ALLOCATE_BUFFER; status = AcpiEvaluateObject(sc->handle, "_PCT", NULL, &buf); if (ACPI_FAILURE(status)) goto out; /* Check the package of two registers, each a Buffer in GAS format. */ pkg = (ACPI_OBJECT *)buf.Pointer; if (!ACPI_PKG_VALID(pkg, 2)) { device_printf(dev, "invalid perf register package\n"); goto out; } error = acpi_PkgGas(sc->dev, pkg, 0, &sc->perf_ctrl_type, &sc->px_rid, &sc->perf_ctrl, 0); if (error) { /* * If the register is of type FFixedHW, we can only return * info, we can't get or set new settings. */ if (error == EOPNOTSUPP) { sc->info_only = TRUE; error = 0; } else device_printf(dev, "failed in PERF_CTL attach\n"); goto out; } sc->px_rid++; error = acpi_PkgGas(sc->dev, pkg, 1, &sc->perf_sts_type, &sc->px_rid, &sc->perf_status, 0); if (error) { if (error == EOPNOTSUPP) { sc->info_only = TRUE; error = 0; } else device_printf(dev, "failed in PERF_STATUS attach\n"); goto out; } sc->px_rid++; /* Get our current limit and register for notifies. */ acpi_px_available(sc); AcpiInstallNotifyHandler(sc->handle, ACPI_DEVICE_NOTIFY, acpi_px_notify, sc); error = 0; out: if (error) { if (sc->px_states) { free(sc->px_states, M_ACPIPERF); sc->px_states = NULL; } if (sc->perf_ctrl) { bus_release_resource(sc->dev, sc->perf_ctrl_type, 0, sc->perf_ctrl); bus_delete_resource(sc->dev, sc->perf_ctrl_type, 0); sc->perf_ctrl = NULL; } if (sc->perf_status) { bus_release_resource(sc->dev, sc->perf_sts_type, 1, sc->perf_status); bus_delete_resource(sc->dev, sc->perf_sts_type, 1); sc->perf_status = NULL; } sc->px_rid = 0; sc->px_count = 0; } if (buf.Pointer) AcpiOsFree(buf.Pointer); return (error); } static void acpi_px_startup(void *arg) { /* Signal to the platform that we are taking over CPU control. */ if (AcpiGbl_FADT.PstateControl == 0) return; ACPI_LOCK(acpi); AcpiOsWritePort(AcpiGbl_FADT.SmiCommand, AcpiGbl_FADT.PstateControl, 8); ACPI_UNLOCK(acpi); } static void acpi_px_notify(ACPI_HANDLE h, UINT32 notify, void *context) { struct acpi_perf_softc *sc; sc = context; if (notify != ACPI_NOTIFY_PERF_STATES) return; acpi_px_available(sc); /* TODO: Implement notification when frequency changes. */ } /* * Find the highest currently-supported performance state. * This can be called at runtime (e.g., due to a docking event) at * the request of a Notify on the processor object. */ static void acpi_px_available(struct acpi_perf_softc *sc) { ACPI_STATUS status; struct cf_setting set; status = acpi_GetInteger(sc->handle, "_PPC", &sc->px_max_avail); /* If the old state is too high, set current state to the new max. */ if (ACPI_SUCCESS(status)) { if (sc->px_curr_state != CPUFREQ_VAL_UNKNOWN && sc->px_curr_state > sc->px_max_avail) { acpi_px_to_set(sc->dev, &sc->px_states[sc->px_max_avail], &set); acpi_px_set(sc->dev, &set); } } else sc->px_max_avail = 0; } static int acpi_px_to_set(device_t dev, struct acpi_px *px, struct cf_setting *set) { if (px == NULL || set == NULL) return (EINVAL); set->freq = px->core_freq; set->power = px->power; /* XXX Include BM latency too? */ set->lat = px->trans_lat; set->volts = CPUFREQ_VAL_UNKNOWN; set->dev = dev; set->spec[PX_SPEC_CONTROL] = px->ctrl_val; set->spec[PX_SPEC_STATUS] = px->sts_val; return (0); } static int acpi_px_settings(device_t dev, struct cf_setting *sets, int *count) { struct acpi_perf_softc *sc; int x, y; sc = device_get_softc(dev); if (sets == NULL || count == NULL) return (EINVAL); if (*count < sc->px_count - sc->px_max_avail) return (E2BIG); /* Return a list of settings that are currently valid. */ y = 0; for (x = sc->px_max_avail; x < sc->px_count; x++, y++) acpi_px_to_set(dev, &sc->px_states[x], &sets[y]); *count = sc->px_count - sc->px_max_avail; return (0); } static int acpi_px_set(device_t dev, const struct cf_setting *set) { struct acpi_perf_softc *sc; int i, status, sts_val, tries; if (set == NULL) return (EINVAL); sc = device_get_softc(dev); /* If we can't set new states, return immediately. */ if (sc->info_only) return (ENXIO); /* Look up appropriate state, based on frequency. */ for (i = sc->px_max_avail; i < sc->px_count; i++) { if (CPUFREQ_CMP(set->freq, sc->px_states[i].core_freq)) break; } if (i == sc->px_count) return (EINVAL); /* Write the appropriate value to the register. */ PX_SET_REG(sc->perf_ctrl, sc->px_states[i].ctrl_val); /* * Try for up to 10 ms to verify the desired state was selected. * This is longer than the standard says (1 ms) but in some modes, * systems may take longer to respond. */ sts_val = sc->px_states[i].sts_val; for (tries = 0; tries < 1000; tries++) { status = PX_GET_REG(sc->perf_status); /* * If we match the status or the desired status is 8 bits * and matches the relevant bits, assume we succeeded. It * appears some systems (IBM R32) expect byte-wide access * even though the standard says the register is 32-bit. */ if (status == sts_val || ((sts_val & ~0xff) == 0 && (status & 0xff) == sts_val)) break; DELAY(10); } if (tries == 1000) { device_printf(dev, "Px transition to %d failed\n", sc->px_states[i].core_freq); return (ENXIO); } sc->px_curr_state = i; return (0); } static int acpi_px_get(device_t dev, struct cf_setting *set) { struct acpi_perf_softc *sc; uint64_t rate; int i; struct pcpu *pc; if (set == NULL) return (EINVAL); sc = device_get_softc(dev); /* If we can't get new states, return immediately. */ if (sc->info_only) return (ENXIO); /* If we've set the rate before, use the cached value. */ if (sc->px_curr_state != CPUFREQ_VAL_UNKNOWN) { acpi_px_to_set(dev, &sc->px_states[sc->px_curr_state], set); return (0); } /* Otherwise, estimate and try to match against our settings. */ pc = cpu_get_pcpu(dev); if (pc == NULL) return (ENXIO); cpu_est_clockrate(pc->pc_cpuid, &rate); rate /= 1000000; for (i = 0; i < sc->px_count; i++) { if (CPUFREQ_CMP(sc->px_states[i].core_freq, rate)) { sc->px_curr_state = i; acpi_px_to_set(dev, &sc->px_states[i], set); break; } } /* No match, give up. */ if (i == sc->px_count) { sc->px_curr_state = CPUFREQ_VAL_UNKNOWN; set->freq = CPUFREQ_VAL_UNKNOWN; } return (0); } static int acpi_px_type(device_t dev, int *type) { struct acpi_perf_softc *sc; if (type == NULL) return (EINVAL); sc = device_get_softc(dev); *type = CPUFREQ_TYPE_ABSOLUTE; if (sc->info_only) *type |= CPUFREQ_FLAG_INFO_ONLY; return (0); } Index: head/sys/dev/hwpmc/hwpmc_piv.c =================================================================== --- head/sys/dev/hwpmc/hwpmc_piv.c (revision 299352) +++ head/sys/dev/hwpmc/hwpmc_piv.c (revision 299353) @@ -1,1701 +1,1698 @@ /*- * Copyright (c) 2003-2007 Joseph Koshy * Copyright (c) 2007 The FreeBSD Foundation * All rights reserved. * * Portions of this software were developed by A. Joseph Koshy under * sponsorship from the FreeBSD Foundation and Google, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #if (__FreeBSD_version >= 1100000) #include #else #include #endif #include #include #include #include #include /* * PENTIUM 4 SUPPORT * * The P4 has 18 PMCs, divided into 4 groups with 4,4,4 and 6 PMCs * respectively. Each PMC comprises of two model specific registers: * a counter configuration control register (CCCR) and a counter * register that holds the actual event counts. * * Configuring an event requires the use of one of 45 event selection * control registers (ESCR). Events are associated with specific * ESCRs. Each PMC group has a set of ESCRs it can use. * * - The BPU counter group (4 PMCs) can use the 16 ESCRs: * BPU_ESCR{0,1}, IS_ESCR{0,1}, MOB_ESCR{0,1}, ITLB_ESCR{0,1}, * PMH_ESCR{0,1}, IX_ESCR{0,1}, FSB_ESCR{0,}, BSU_ESCR{0,1}. * * - The MS counter group (4 PMCs) can use the 6 ESCRs: MS_ESCR{0,1}, * TC_ESCR{0,1}, TBPU_ESCR{0,1}. * * - The FLAME counter group (4 PMCs) can use the 10 ESCRs: * FLAME_ESCR{0,1}, FIRM_ESCR{0,1}, SAAT_ESCR{0,1}, U2L_ESCR{0,1}, * DAC_ESCR{0,1}. * * - The IQ counter group (6 PMCs) can use the 13 ESCRs: IQ_ESCR{0,1}, * ALF_ESCR{0,1}, RAT_ESCR{0,1}, SSU_ESCR0, CRU_ESCR{0,1,2,3,4,5}. * * Even-numbered ESCRs can be used with counters 0, 1 and 4 (if * present) of a counter group. Odd-numbers ESCRs can be used with * counters 2, 3 and 5 (if present) of a counter group. The * 'p4_escrs[]' table describes these restrictions in a form that * function 'p4_allocate()' uses for making allocation decisions. * * SYSTEM-MODE AND THREAD-MODE ALLOCATION * * In addition to remembering the state of PMC rows * ('FREE','STANDALONE', or 'THREAD'), we similar need to track the * state of ESCR rows. If an ESCR is allocated to a system-mode PMC * on a CPU we cannot allocate this to a thread-mode PMC. On a * multi-cpu (multiple physical CPUs) system, ESCR allocation on each * CPU is tracked by the pc_escrs[] array. * * Each system-mode PMC that is using an ESCR records its row-index in * the appropriate entry and system-mode allocation attempts check * that an ESCR is available using this array. Process-mode PMCs do * not use the pc_escrs[] array, since ESCR row itself would have been * marked as in 'THREAD' mode. * * HYPERTHREADING SUPPORT * * When HTT is enabled, the FreeBSD kernel treats the two 'logical' * cpus as independent CPUs and can schedule kernel threads on them * independently. However, the two logical CPUs share the same set of * PMC resources. We need to ensure that: * - PMCs that use the PMC_F_DESCENDANTS semantics are handled correctly, * and, * - Threads of multi-threaded processes that get scheduled on the same * physical CPU are handled correctly. * * HTT Detection * * Not all HTT capable systems will have HTT enabled. We detect the * presence of HTT by detecting if 'p4_init()' was called for a secondary * CPU in a HTT pair. * * Note that hwpmc(4) cannot currently deal with a change in HTT status once * loaded. * * Handling HTT READ / WRITE / START / STOP * * PMC resources are shared across the CPUs in an HTT pair. We * designate the lower numbered CPU in a HTT pair as the 'primary' * CPU. In each primary CPU's state we keep track of a 'runcount' * which reflects the number of PMC-using processes that have been * scheduled on its secondary CPU. Process-mode PMC operations will * actually 'start' or 'stop' hardware only if these are the first or * last processes respectively to use the hardware. PMC values * written by a 'write' operation are saved and are transferred to * hardware at PMC 'start' time if the runcount is 0. If the runcount * is greater than 0 at the time of a 'start' operation, we keep track * of the actual hardware value at the time of the 'start' operation * and use this to adjust the final readings at PMC 'stop' or 'read' * time. * * Execution sequences: * * Case 1: CPUx +...- (no overlap) * CPUy +...- * RC 0 1 0 1 0 * * Case 2: CPUx +........- (partial overlap) * CPUy +........- * RC 0 1 2 1 0 * * Case 3: CPUx +..............- (fully overlapped) * CPUy +.....- * RC 0 1 2 1 0 * * Key: * 'CPU[xy]' : one of the two logical processors on a HTT CPU. * 'RC' : run count (#threads per physical core). * '+' : point in time when a thread is put on a CPU. * '-' : point in time where a thread is taken off a CPU. * * Handling HTT CONFIG * * Different processes attached to the same PMC may get scheduled on * the two logical processors in the package. We keep track of config * and de-config operations using the CFGFLAGS fields of the per-physical * cpu state. */ #define P4_PMCS() \ P4_PMC(BPU_COUNTER0) \ P4_PMC(BPU_COUNTER1) \ P4_PMC(BPU_COUNTER2) \ P4_PMC(BPU_COUNTER3) \ P4_PMC(MS_COUNTER0) \ P4_PMC(MS_COUNTER1) \ P4_PMC(MS_COUNTER2) \ P4_PMC(MS_COUNTER3) \ P4_PMC(FLAME_COUNTER0) \ P4_PMC(FLAME_COUNTER1) \ P4_PMC(FLAME_COUNTER2) \ P4_PMC(FLAME_COUNTER3) \ P4_PMC(IQ_COUNTER0) \ P4_PMC(IQ_COUNTER1) \ P4_PMC(IQ_COUNTER2) \ P4_PMC(IQ_COUNTER3) \ P4_PMC(IQ_COUNTER4) \ P4_PMC(IQ_COUNTER5) \ P4_PMC(NONE) enum pmc_p4pmc { #undef P4_PMC #define P4_PMC(N) P4_PMC_##N , P4_PMCS() }; /* * P4 ESCR descriptors */ #define P4_ESCRS() \ P4_ESCR(BSU_ESCR0, 0x3A0, BPU_COUNTER0, BPU_COUNTER1, NONE) \ P4_ESCR(BSU_ESCR1, 0x3A1, BPU_COUNTER2, BPU_COUNTER3, NONE) \ P4_ESCR(FSB_ESCR0, 0x3A2, BPU_COUNTER0, BPU_COUNTER1, NONE) \ P4_ESCR(FSB_ESCR1, 0x3A3, BPU_COUNTER2, BPU_COUNTER3, NONE) \ P4_ESCR(FIRM_ESCR0, 0x3A4, FLAME_COUNTER0, FLAME_COUNTER1, NONE) \ P4_ESCR(FIRM_ESCR1, 0x3A5, FLAME_COUNTER2, FLAME_COUNTER3, NONE) \ P4_ESCR(FLAME_ESCR0, 0x3A6, FLAME_COUNTER0, FLAME_COUNTER1, NONE) \ P4_ESCR(FLAME_ESCR1, 0x3A7, FLAME_COUNTER2, FLAME_COUNTER3, NONE) \ P4_ESCR(DAC_ESCR0, 0x3A8, FLAME_COUNTER0, FLAME_COUNTER1, NONE) \ P4_ESCR(DAC_ESCR1, 0x3A9, FLAME_COUNTER2, FLAME_COUNTER3, NONE) \ P4_ESCR(MOB_ESCR0, 0x3AA, BPU_COUNTER0, BPU_COUNTER1, NONE) \ P4_ESCR(MOB_ESCR1, 0x3AB, BPU_COUNTER2, BPU_COUNTER3, NONE) \ P4_ESCR(PMH_ESCR0, 0x3AC, BPU_COUNTER0, BPU_COUNTER1, NONE) \ P4_ESCR(PMH_ESCR1, 0x3AD, BPU_COUNTER2, BPU_COUNTER3, NONE) \ P4_ESCR(SAAT_ESCR0, 0x3AE, FLAME_COUNTER0, FLAME_COUNTER1, NONE) \ P4_ESCR(SAAT_ESCR1, 0x3AF, FLAME_COUNTER2, FLAME_COUNTER3, NONE) \ P4_ESCR(U2L_ESCR0, 0x3B0, FLAME_COUNTER0, FLAME_COUNTER1, NONE) \ P4_ESCR(U2L_ESCR1, 0x3B1, FLAME_COUNTER2, FLAME_COUNTER3, NONE) \ P4_ESCR(BPU_ESCR0, 0x3B2, BPU_COUNTER0, BPU_COUNTER1, NONE) \ P4_ESCR(BPU_ESCR1, 0x3B3, BPU_COUNTER2, BPU_COUNTER3, NONE) \ P4_ESCR(IS_ESCR0, 0x3B4, BPU_COUNTER0, BPU_COUNTER1, NONE) \ P4_ESCR(IS_ESCR1, 0x3B5, BPU_COUNTER2, BPU_COUNTER3, NONE) \ P4_ESCR(ITLB_ESCR0, 0x3B6, BPU_COUNTER0, BPU_COUNTER1, NONE) \ P4_ESCR(ITLB_ESCR1, 0x3B7, BPU_COUNTER2, BPU_COUNTER3, NONE) \ P4_ESCR(CRU_ESCR0, 0x3B8, IQ_COUNTER0, IQ_COUNTER1, IQ_COUNTER4) \ P4_ESCR(CRU_ESCR1, 0x3B9, IQ_COUNTER2, IQ_COUNTER3, IQ_COUNTER5) \ P4_ESCR(IQ_ESCR0, 0x3BA, IQ_COUNTER0, IQ_COUNTER1, IQ_COUNTER4) \ P4_ESCR(IQ_ESCR1, 0x3BB, IQ_COUNTER1, IQ_COUNTER3, IQ_COUNTER5) \ P4_ESCR(RAT_ESCR0, 0x3BC, IQ_COUNTER0, IQ_COUNTER1, IQ_COUNTER4) \ P4_ESCR(RAT_ESCR1, 0x3BD, IQ_COUNTER2, IQ_COUNTER3, IQ_COUNTER5) \ P4_ESCR(SSU_ESCR0, 0x3BE, IQ_COUNTER0, IQ_COUNTER2, IQ_COUNTER4) \ P4_ESCR(MS_ESCR0, 0x3C0, MS_COUNTER0, MS_COUNTER1, NONE) \ P4_ESCR(MS_ESCR1, 0x3C1, MS_COUNTER2, MS_COUNTER3, NONE) \ P4_ESCR(TBPU_ESCR0, 0x3C2, MS_COUNTER0, MS_COUNTER1, NONE) \ P4_ESCR(TBPU_ESCR1, 0x3C3, MS_COUNTER2, MS_COUNTER3, NONE) \ P4_ESCR(TC_ESCR0, 0x3C4, MS_COUNTER0, MS_COUNTER1, NONE) \ P4_ESCR(TC_ESCR1, 0x3C5, MS_COUNTER2, MS_COUNTER3, NONE) \ P4_ESCR(IX_ESCR0, 0x3C8, BPU_COUNTER0, BPU_COUNTER1, NONE) \ P4_ESCR(IX_ESCR1, 0x3C9, BPU_COUNTER2, BPU_COUNTER3, NONE) \ P4_ESCR(ALF_ESCR0, 0x3CA, IQ_COUNTER0, IQ_COUNTER1, IQ_COUNTER4) \ P4_ESCR(ALF_ESCR1, 0x3CB, IQ_COUNTER2, IQ_COUNTER3, IQ_COUNTER5) \ P4_ESCR(CRU_ESCR2, 0x3CC, IQ_COUNTER0, IQ_COUNTER1, IQ_COUNTER4) \ P4_ESCR(CRU_ESCR3, 0x3CD, IQ_COUNTER2, IQ_COUNTER3, IQ_COUNTER5) \ P4_ESCR(CRU_ESCR4, 0x3E0, IQ_COUNTER0, IQ_COUNTER1, IQ_COUNTER4) \ P4_ESCR(CRU_ESCR5, 0x3E1, IQ_COUNTER2, IQ_COUNTER3, IQ_COUNTER5) \ P4_ESCR(NONE, ~0, NONE, NONE, NONE) enum pmc_p4escr { #define P4_ESCR(N, MSR, P1, P2, P3) P4_ESCR_##N , P4_ESCRS() #undef P4_ESCR }; struct pmc_p4escr_descr { const char pm_escrname[PMC_NAME_MAX]; u_short pm_escr_msr; const enum pmc_p4pmc pm_pmcs[P4_MAX_PMC_PER_ESCR]; }; static struct pmc_p4escr_descr p4_escrs[] = { #define P4_ESCR(N, MSR, P1, P2, P3) \ { \ .pm_escrname = #N, \ .pm_escr_msr = (MSR), \ .pm_pmcs = \ { \ P4_PMC_##P1, \ P4_PMC_##P2, \ P4_PMC_##P3 \ } \ } , P4_ESCRS() #undef P4_ESCR }; /* * P4 Event descriptor */ struct p4_event_descr { const enum pmc_event pm_event; const uint32_t pm_escr_eventselect; const uint32_t pm_cccr_select; const char pm_is_ti_event; enum pmc_p4escr pm_escrs[P4_MAX_ESCR_PER_EVENT]; }; static struct p4_event_descr p4_events[] = { #define P4_EVDESCR(NAME, ESCREVENTSEL, CCCRSEL, TI_EVENT, ESCR0, ESCR1) \ { \ .pm_event = PMC_EV_P4_##NAME, \ .pm_escr_eventselect = (ESCREVENTSEL), \ .pm_cccr_select = (CCCRSEL), \ .pm_is_ti_event = (TI_EVENT), \ .pm_escrs = \ { \ P4_ESCR_##ESCR0, \ P4_ESCR_##ESCR1 \ } \ } P4_EVDESCR(TC_DELIVER_MODE, 0x01, 0x01, TRUE, TC_ESCR0, TC_ESCR1), P4_EVDESCR(BPU_FETCH_REQUEST, 0x03, 0x00, FALSE, BPU_ESCR0, BPU_ESCR1), P4_EVDESCR(ITLB_REFERENCE, 0x18, 0x03, FALSE, ITLB_ESCR0, ITLB_ESCR1), P4_EVDESCR(MEMORY_CANCEL, 0x02, 0x05, FALSE, DAC_ESCR0, DAC_ESCR1), P4_EVDESCR(MEMORY_COMPLETE, 0x08, 0x02, FALSE, SAAT_ESCR0, SAAT_ESCR1), P4_EVDESCR(LOAD_PORT_REPLAY, 0x04, 0x02, FALSE, SAAT_ESCR0, SAAT_ESCR1), P4_EVDESCR(STORE_PORT_REPLAY, 0x05, 0x02, FALSE, SAAT_ESCR0, SAAT_ESCR1), P4_EVDESCR(MOB_LOAD_REPLAY, 0x03, 0x02, FALSE, MOB_ESCR0, MOB_ESCR1), P4_EVDESCR(PAGE_WALK_TYPE, 0x01, 0x04, TRUE, PMH_ESCR0, PMH_ESCR1), P4_EVDESCR(BSQ_CACHE_REFERENCE, 0x0C, 0x07, FALSE, BSU_ESCR0, BSU_ESCR1), P4_EVDESCR(IOQ_ALLOCATION, 0x03, 0x06, FALSE, FSB_ESCR0, FSB_ESCR1), P4_EVDESCR(IOQ_ACTIVE_ENTRIES, 0x1A, 0x06, FALSE, FSB_ESCR1, NONE), P4_EVDESCR(FSB_DATA_ACTIVITY, 0x17, 0x06, TRUE, FSB_ESCR0, FSB_ESCR1), P4_EVDESCR(BSQ_ALLOCATION, 0x05, 0x07, FALSE, BSU_ESCR0, NONE), P4_EVDESCR(BSQ_ACTIVE_ENTRIES, 0x06, 0x07, FALSE, BSU_ESCR1, NONE), /* BSQ_ACTIVE_ENTRIES inherits CPU specificity from BSQ_ALLOCATION */ P4_EVDESCR(SSE_INPUT_ASSIST, 0x34, 0x01, TRUE, FIRM_ESCR0, FIRM_ESCR1), P4_EVDESCR(PACKED_SP_UOP, 0x08, 0x01, TRUE, FIRM_ESCR0, FIRM_ESCR1), P4_EVDESCR(PACKED_DP_UOP, 0x0C, 0x01, TRUE, FIRM_ESCR0, FIRM_ESCR1), P4_EVDESCR(SCALAR_SP_UOP, 0x0A, 0x01, TRUE, FIRM_ESCR0, FIRM_ESCR1), P4_EVDESCR(SCALAR_DP_UOP, 0x0E, 0x01, TRUE, FIRM_ESCR0, FIRM_ESCR1), P4_EVDESCR(64BIT_MMX_UOP, 0x02, 0x01, TRUE, FIRM_ESCR0, FIRM_ESCR1), P4_EVDESCR(128BIT_MMX_UOP, 0x1A, 0x01, TRUE, FIRM_ESCR0, FIRM_ESCR1), P4_EVDESCR(X87_FP_UOP, 0x04, 0x01, TRUE, FIRM_ESCR0, FIRM_ESCR1), P4_EVDESCR(X87_SIMD_MOVES_UOP, 0x2E, 0x01, TRUE, FIRM_ESCR0, FIRM_ESCR1), P4_EVDESCR(GLOBAL_POWER_EVENTS, 0x13, 0x06, FALSE, FSB_ESCR0, FSB_ESCR1), P4_EVDESCR(TC_MS_XFER, 0x05, 0x00, FALSE, MS_ESCR0, MS_ESCR1), P4_EVDESCR(UOP_QUEUE_WRITES, 0x09, 0x00, FALSE, MS_ESCR0, MS_ESCR1), P4_EVDESCR(RETIRED_MISPRED_BRANCH_TYPE, 0x05, 0x02, FALSE, TBPU_ESCR0, TBPU_ESCR1), P4_EVDESCR(RETIRED_BRANCH_TYPE, 0x04, 0x02, FALSE, TBPU_ESCR0, TBPU_ESCR1), P4_EVDESCR(RESOURCE_STALL, 0x01, 0x01, FALSE, ALF_ESCR0, ALF_ESCR1), P4_EVDESCR(WC_BUFFER, 0x05, 0x05, TRUE, DAC_ESCR0, DAC_ESCR1), P4_EVDESCR(B2B_CYCLES, 0x16, 0x03, TRUE, FSB_ESCR0, FSB_ESCR1), P4_EVDESCR(BNR, 0x08, 0x03, TRUE, FSB_ESCR0, FSB_ESCR1), P4_EVDESCR(SNOOP, 0x06, 0x03, TRUE, FSB_ESCR0, FSB_ESCR1), P4_EVDESCR(RESPONSE, 0x04, 0x03, TRUE, FSB_ESCR0, FSB_ESCR1), P4_EVDESCR(FRONT_END_EVENT, 0x08, 0x05, FALSE, CRU_ESCR2, CRU_ESCR3), P4_EVDESCR(EXECUTION_EVENT, 0x0C, 0x05, FALSE, CRU_ESCR2, CRU_ESCR3), P4_EVDESCR(REPLAY_EVENT, 0x09, 0x05, FALSE, CRU_ESCR2, CRU_ESCR3), P4_EVDESCR(INSTR_RETIRED, 0x02, 0x04, FALSE, CRU_ESCR0, CRU_ESCR1), P4_EVDESCR(UOPS_RETIRED, 0x01, 0x04, FALSE, CRU_ESCR0, CRU_ESCR1), P4_EVDESCR(UOP_TYPE, 0x02, 0x02, FALSE, RAT_ESCR0, RAT_ESCR1), P4_EVDESCR(BRANCH_RETIRED, 0x06, 0x05, FALSE, CRU_ESCR2, CRU_ESCR3), P4_EVDESCR(MISPRED_BRANCH_RETIRED, 0x03, 0x04, FALSE, CRU_ESCR0, CRU_ESCR1), P4_EVDESCR(X87_ASSIST, 0x03, 0x05, FALSE, CRU_ESCR2, CRU_ESCR3), P4_EVDESCR(MACHINE_CLEAR, 0x02, 0x05, FALSE, CRU_ESCR2, CRU_ESCR3) #undef P4_EVDESCR }; #define P4_EVENT_IS_TI(E) ((E)->pm_is_ti_event == TRUE) #define P4_NEVENTS (PMC_EV_P4_LAST - PMC_EV_P4_FIRST + 1) /* * P4 PMC descriptors */ struct p4pmc_descr { struct pmc_descr pm_descr; /* common information */ enum pmc_p4pmc pm_pmcnum; /* PMC number */ uint32_t pm_pmc_msr; /* PERFCTR MSR address */ uint32_t pm_cccr_msr; /* CCCR MSR address */ }; static struct p4pmc_descr p4_pmcdesc[P4_NPMCS] = { #define P4_PMC_CAPS (PMC_CAP_INTERRUPT | PMC_CAP_USER | PMC_CAP_SYSTEM | \ PMC_CAP_EDGE | PMC_CAP_THRESHOLD | PMC_CAP_READ | PMC_CAP_WRITE | \ PMC_CAP_INVERT | PMC_CAP_QUALIFIER | PMC_CAP_PRECISE | \ PMC_CAP_TAGGING | PMC_CAP_CASCADE) #define P4_PMCDESCR(N, PMC, CCCR) \ { \ .pm_descr = \ { \ .pd_name = #N, \ .pd_class = PMC_CLASS_P4, \ .pd_caps = P4_PMC_CAPS, \ .pd_width = 40 \ }, \ .pm_pmcnum = P4_PMC_##N, \ .pm_cccr_msr = (CCCR), \ .pm_pmc_msr = (PMC) \ } P4_PMCDESCR(BPU_COUNTER0, 0x300, 0x360), P4_PMCDESCR(BPU_COUNTER1, 0x301, 0x361), P4_PMCDESCR(BPU_COUNTER2, 0x302, 0x362), P4_PMCDESCR(BPU_COUNTER3, 0x303, 0x363), P4_PMCDESCR(MS_COUNTER0, 0x304, 0x364), P4_PMCDESCR(MS_COUNTER1, 0x305, 0x365), P4_PMCDESCR(MS_COUNTER2, 0x306, 0x366), P4_PMCDESCR(MS_COUNTER3, 0x307, 0x367), P4_PMCDESCR(FLAME_COUNTER0, 0x308, 0x368), P4_PMCDESCR(FLAME_COUNTER1, 0x309, 0x369), P4_PMCDESCR(FLAME_COUNTER2, 0x30A, 0x36A), P4_PMCDESCR(FLAME_COUNTER3, 0x30B, 0x36B), P4_PMCDESCR(IQ_COUNTER0, 0x30C, 0x36C), P4_PMCDESCR(IQ_COUNTER1, 0x30D, 0x36D), P4_PMCDESCR(IQ_COUNTER2, 0x30E, 0x36E), P4_PMCDESCR(IQ_COUNTER3, 0x30F, 0x36F), P4_PMCDESCR(IQ_COUNTER4, 0x310, 0x370), P4_PMCDESCR(IQ_COUNTER5, 0x311, 0x371), #undef P4_PMCDESCR }; /* HTT support */ #define P4_NHTT 2 /* logical processors/chip */ static int p4_system_has_htt; /* * Per-CPU data structure for P4 class CPUs * * [19 struct pmc_hw structures] * [45 ESCRs status bytes] * [per-cpu spin mutex] * [19 flag fields for holding config flags and a runcount] * [19*2 hw value fields] (Thread mode PMC support) * or * [19*2 EIP values] (Sampling mode PMCs) * [19*2 pmc value fields] (Thread mode PMC support)) */ struct p4_cpu { struct pmc_hw pc_p4pmcs[P4_NPMCS]; char pc_escrs[P4_NESCR]; struct mtx pc_mtx; /* spin lock */ uint32_t pc_intrflag; /* NMI handler flags */ unsigned int pc_intrlock; /* NMI handler spin lock */ unsigned char pc_flags[P4_NPMCS]; /* 4 bits each: {cfg,run}count */ union { pmc_value_t pc_hw[P4_NPMCS * P4_NHTT]; uintptr_t pc_ip[P4_NPMCS * P4_NHTT]; } pc_si; pmc_value_t pc_pmc_values[P4_NPMCS * P4_NHTT]; }; static struct p4_cpu **p4_pcpu; #define P4_PCPU_PMC_VALUE(PC,RI,CPU) (PC)->pc_pmc_values[(RI)*((CPU) & 1)] #define P4_PCPU_HW_VALUE(PC,RI,CPU) (PC)->pc_si.pc_hw[(RI)*((CPU) & 1)] #define P4_PCPU_SAVED_IP(PC,RI,CPU) (PC)->pc_si.pc_ip[(RI)*((CPU) & 1)] #define P4_PCPU_GET_FLAGS(PC,RI,MASK) ((PC)->pc_flags[(RI)] & (MASK)) #define P4_PCPU_SET_FLAGS(PC,RI,MASK,VAL) do { \ char _tmp; \ _tmp = (PC)->pc_flags[(RI)]; \ _tmp &= ~(MASK); \ _tmp |= (VAL) & (MASK); \ (PC)->pc_flags[(RI)] = _tmp; \ } while (0) #define P4_PCPU_GET_RUNCOUNT(PC,RI) P4_PCPU_GET_FLAGS(PC,RI,0x0F) #define P4_PCPU_SET_RUNCOUNT(PC,RI,V) P4_PCPU_SET_FLAGS(PC,RI,0x0F,V) #define P4_PCPU_GET_CFGFLAGS(PC,RI) (P4_PCPU_GET_FLAGS(PC,RI,0xF0) >> 4) #define P4_PCPU_SET_CFGFLAGS(PC,RI,C) P4_PCPU_SET_FLAGS(PC,RI,0xF0,((C) <<4)) #define P4_CPU_TO_FLAG(C) (P4_CPU_IS_HTT_SECONDARY(cpu) ? 0x2 : 0x1) #define P4_PCPU_GET_INTRFLAG(PC,I) ((PC)->pc_intrflag & (1 << (I))) #define P4_PCPU_SET_INTRFLAG(PC,I,V) do { \ uint32_t __mask; \ __mask = 1 << (I); \ if ((V)) \ (PC)->pc_intrflag |= __mask; \ else \ (PC)->pc_intrflag &= ~__mask; \ } while (0) /* * A minimal spin lock implementation for use inside the NMI handler. * * We don't want to use a regular spin lock here, because curthread * may not be consistent at the time the handler is invoked. */ #define P4_PCPU_ACQ_INTR_SPINLOCK(PC) do { \ while (!atomic_cmpset_acq_int(&pc->pc_intrlock, 0, 1)) \ ia32_pause(); \ } while (0) #define P4_PCPU_REL_INTR_SPINLOCK(PC) \ atomic_store_rel_int(&pc->pc_intrlock, 0); /* ESCR row disposition */ static int p4_escrdisp[P4_NESCR]; #define P4_ESCR_ROW_DISP_IS_THREAD(E) (p4_escrdisp[(E)] > 0) #define P4_ESCR_ROW_DISP_IS_STANDALONE(E) (p4_escrdisp[(E)] < 0) #define P4_ESCR_ROW_DISP_IS_FREE(E) (p4_escrdisp[(E)] == 0) #define P4_ESCR_MARK_ROW_STANDALONE(E) do { \ KASSERT(p4_escrdisp[(E)] <= 0, ("[p4,%d] row disposition error",\ __LINE__)); \ atomic_add_int(&p4_escrdisp[(E)], -1); \ KASSERT(p4_escrdisp[(E)] >= (-pmc_cpu_max_active()), \ ("[p4,%d] row disposition error", __LINE__)); \ } while (0) #define P4_ESCR_UNMARK_ROW_STANDALONE(E) do { \ atomic_add_int(&p4_escrdisp[(E)], 1); \ KASSERT(p4_escrdisp[(E)] <= 0, ("[p4,%d] row disposition error",\ __LINE__)); \ } while (0) #define P4_ESCR_MARK_ROW_THREAD(E) do { \ KASSERT(p4_escrdisp[(E)] >= 0, ("[p4,%d] row disposition error", \ __LINE__)); \ atomic_add_int(&p4_escrdisp[(E)], 1); \ } while (0) #define P4_ESCR_UNMARK_ROW_THREAD(E) do { \ atomic_add_int(&p4_escrdisp[(E)], -1); \ KASSERT(p4_escrdisp[(E)] >= 0, ("[p4,%d] row disposition error", \ __LINE__)); \ } while (0) #define P4_PMC_IS_STOPPED(cccr) ((rdmsr(cccr) & P4_CCCR_ENABLE) == 0) #define P4_CPU_IS_HTT_SECONDARY(cpu) \ (p4_system_has_htt ? ((cpu) & 1) : 0) #define P4_TO_HTT_PRIMARY(cpu) \ (p4_system_has_htt ? ((cpu) & ~1) : (cpu)) #define P4_CCCR_Tx_MASK (~(P4_CCCR_OVF_PMI_T0|P4_CCCR_OVF_PMI_T1| \ P4_CCCR_ENABLE|P4_CCCR_OVF)) #define P4_ESCR_Tx_MASK (~(P4_ESCR_T0_OS|P4_ESCR_T0_USR|P4_ESCR_T1_OS| \ P4_ESCR_T1_USR)) /* * support routines */ static struct p4_event_descr * p4_find_event(enum pmc_event ev) { int n; for (n = 0; n < P4_NEVENTS; n++) if (p4_events[n].pm_event == ev) break; if (n == P4_NEVENTS) return (NULL); return (&p4_events[n]); } /* * Initialize per-cpu state */ static int p4_pcpu_init(struct pmc_mdep *md, int cpu) { char *pescr; int n, first_ri, phycpu; struct pmc_hw *phw; struct p4_cpu *p4c; struct pmc_cpu *pc, *plc; KASSERT(cpu >= 0 && cpu < pmc_cpu_max(), ("[p4,%d] insane cpu number %d", __LINE__, cpu)); PMCDBG2(MDP,INI,0, "p4-init cpu=%d is-primary=%d", cpu, pmc_cpu_is_primary(cpu) != 0); first_ri = md->pmd_classdep[PMC_MDEP_CLASS_INDEX_P4].pcd_ri; /* * The two CPUs in an HT pair share their per-cpu state. * * For HT capable CPUs, we assume that the two logical * processors in the HT pair get two consecutive CPU ids * starting with an even id #. * * The primary CPU (the even numbered CPU of the pair) would * have been initialized prior to the initialization for the * secondary. */ if (!pmc_cpu_is_primary(cpu) && (cpu & 1)) { p4_system_has_htt = 1; phycpu = P4_TO_HTT_PRIMARY(cpu); pc = pmc_pcpu[phycpu]; plc = pmc_pcpu[cpu]; KASSERT(plc != pc, ("[p4,%d] per-cpu config error", __LINE__)); PMCDBG3(MDP,INI,1, "p4-init cpu=%d phycpu=%d pc=%p", cpu, phycpu, pc); KASSERT(pc, ("[p4,%d] Null Per-Cpu state cpu=%d phycpu=%d", __LINE__, cpu, phycpu)); /* PMCs are shared with the physical CPU. */ for (n = 0; n < P4_NPMCS; n++) plc->pc_hwpmcs[n + first_ri] = pc->pc_hwpmcs[n + first_ri]; return (0); } p4c = malloc(sizeof(struct p4_cpu), M_PMC, M_WAITOK|M_ZERO); - if (p4c == NULL) - return (ENOMEM); - pc = pmc_pcpu[cpu]; KASSERT(pc != NULL, ("[p4,%d] cpu %d null per-cpu", __LINE__, cpu)); p4_pcpu[cpu] = p4c; phw = p4c->pc_p4pmcs; for (n = 0; n < P4_NPMCS; n++, phw++) { phw->phw_state = PMC_PHW_FLAG_IS_ENABLED | PMC_PHW_CPU_TO_STATE(cpu) | PMC_PHW_INDEX_TO_STATE(n); phw->phw_pmc = NULL; pc->pc_hwpmcs[n + first_ri] = phw; } pescr = p4c->pc_escrs; for (n = 0; n < P4_NESCR; n++) *pescr++ = P4_INVALID_PMC_INDEX; mtx_init(&p4c->pc_mtx, "p4-pcpu", "pmc-leaf", MTX_SPIN); return (0); } /* * Destroy per-cpu state. */ static int p4_pcpu_fini(struct pmc_mdep *md, int cpu) { int first_ri, i; struct p4_cpu *p4c; struct pmc_cpu *pc; PMCDBG1(MDP,INI,0, "p4-cleanup cpu=%d", cpu); pc = pmc_pcpu[cpu]; first_ri = md->pmd_classdep[PMC_MDEP_CLASS_INDEX_P4].pcd_ri; for (i = 0; i < P4_NPMCS; i++) pc->pc_hwpmcs[i + first_ri] = NULL; if (!pmc_cpu_is_primary(cpu) && (cpu & 1)) return (0); p4c = p4_pcpu[cpu]; KASSERT(p4c != NULL, ("[p4,%d] NULL pcpu", __LINE__)); /* Turn off all PMCs on this CPU */ for (i = 0; i < P4_NPMCS - 1; i++) wrmsr(P4_CCCR_MSR_FIRST + i, rdmsr(P4_CCCR_MSR_FIRST + i) & ~P4_CCCR_ENABLE); mtx_destroy(&p4c->pc_mtx); free(p4c, M_PMC); p4_pcpu[cpu] = NULL; return (0); } /* * Read a PMC */ static int p4_read_pmc(int cpu, int ri, pmc_value_t *v) { struct pmc *pm; pmc_value_t tmp; struct p4_cpu *pc; enum pmc_mode mode; struct p4pmc_descr *pd; KASSERT(cpu >= 0 && cpu < pmc_cpu_max(), ("[p4,%d] illegal CPU value %d", __LINE__, cpu)); KASSERT(ri >= 0 && ri < P4_NPMCS, ("[p4,%d] illegal row-index %d", __LINE__, ri)); pc = p4_pcpu[P4_TO_HTT_PRIMARY(cpu)]; pm = pc->pc_p4pmcs[ri].phw_pmc; pd = &p4_pmcdesc[ri]; KASSERT(pm != NULL, ("[p4,%d] No owner for HWPMC [cpu%d,pmc%d]", __LINE__, cpu, ri)); KASSERT(pd->pm_descr.pd_class == PMC_TO_CLASS(pm), ("[p4,%d] class mismatch pd %d != id class %d", __LINE__, pd->pm_descr.pd_class, PMC_TO_CLASS(pm))); mode = PMC_TO_MODE(pm); PMCDBG3(MDP,REA,1, "p4-read cpu=%d ri=%d mode=%d", cpu, ri, mode); KASSERT(pd->pm_descr.pd_class == PMC_CLASS_P4, ("[p4,%d] unknown PMC class %d", __LINE__, pd->pm_descr.pd_class)); tmp = rdmsr(p4_pmcdesc[ri].pm_pmc_msr); if (PMC_IS_VIRTUAL_MODE(mode)) { if (tmp < P4_PCPU_HW_VALUE(pc,ri,cpu)) /* 40 bit overflow */ tmp += (P4_PERFCTR_MASK + 1) - P4_PCPU_HW_VALUE(pc,ri,cpu); else tmp -= P4_PCPU_HW_VALUE(pc,ri,cpu); tmp += P4_PCPU_PMC_VALUE(pc,ri,cpu); } if (PMC_IS_SAMPLING_MODE(mode)) /* undo transformation */ *v = P4_PERFCTR_VALUE_TO_RELOAD_COUNT(tmp); else *v = tmp; PMCDBG1(MDP,REA,2, "p4-read -> %jx", *v); return (0); } /* * Write a PMC */ static int p4_write_pmc(int cpu, int ri, pmc_value_t v) { enum pmc_mode mode; struct pmc *pm; struct p4_cpu *pc; const struct pmc_hw *phw; const struct p4pmc_descr *pd; KASSERT(cpu >= 0 && cpu < pmc_cpu_max(), ("[amd,%d] illegal CPU value %d", __LINE__, cpu)); KASSERT(ri >= 0 && ri < P4_NPMCS, ("[amd,%d] illegal row-index %d", __LINE__, ri)); pc = p4_pcpu[P4_TO_HTT_PRIMARY(cpu)]; phw = &pc->pc_p4pmcs[ri]; pm = phw->phw_pmc; pd = &p4_pmcdesc[ri]; KASSERT(pm != NULL, ("[p4,%d] No owner for HWPMC [cpu%d,pmc%d]", __LINE__, cpu, ri)); mode = PMC_TO_MODE(pm); PMCDBG4(MDP,WRI,1, "p4-write cpu=%d ri=%d mode=%d v=%jx", cpu, ri, mode, v); /* * write the PMC value to the register/saved value: for * sampling mode PMCs, the value to be programmed into the PMC * counter is -(C+1) where 'C' is the requested sample rate. */ if (PMC_IS_SAMPLING_MODE(mode)) v = P4_RELOAD_COUNT_TO_PERFCTR_VALUE(v); if (PMC_IS_SYSTEM_MODE(mode)) wrmsr(pd->pm_pmc_msr, v); else P4_PCPU_PMC_VALUE(pc,ri,cpu) = v; return (0); } /* * Configure a PMC 'pm' on the given CPU and row-index. * * 'pm' may be NULL to indicate de-configuration. * * On HTT systems, a PMC may get configured twice, once for each * "logical" CPU. We track this using the CFGFLAGS field of the * per-cpu state; this field is a bit mask with one bit each for * logical CPUs 0 & 1. */ static int p4_config_pmc(int cpu, int ri, struct pmc *pm) { struct pmc_hw *phw; struct p4_cpu *pc; int cfgflags, cpuflag; KASSERT(cpu >= 0 && cpu < pmc_cpu_max(), ("[p4,%d] illegal CPU %d", __LINE__, cpu)); KASSERT(ri >= 0 && ri < P4_NPMCS, ("[p4,%d] illegal row-index %d", __LINE__, ri)); PMCDBG3(MDP,CFG,1, "cpu=%d ri=%d pm=%p", cpu, ri, pm); pc = p4_pcpu[P4_TO_HTT_PRIMARY(cpu)]; phw = &pc->pc_p4pmcs[ri]; KASSERT(pm == NULL || phw->phw_pmc == NULL || (p4_system_has_htt && phw->phw_pmc == pm), ("[p4,%d] hwpmc not unconfigured before re-config", __LINE__)); mtx_lock_spin(&pc->pc_mtx); cfgflags = P4_PCPU_GET_CFGFLAGS(pc,ri); KASSERT(cfgflags >= 0 || cfgflags <= 3, ("[p4,%d] illegal cfgflags cfg=%d on cpu=%d ri=%d", __LINE__, cfgflags, cpu, ri)); KASSERT(cfgflags == 0 || phw->phw_pmc, ("[p4,%d] cpu=%d ri=%d pmc configured with zero cfg count", __LINE__, cpu, ri)); cpuflag = P4_CPU_TO_FLAG(cpu); if (pm) { /* config */ if (cfgflags == 0) phw->phw_pmc = pm; KASSERT(phw->phw_pmc == pm, ("[p4,%d] cpu=%d ri=%d config %p != hw %p", __LINE__, cpu, ri, pm, phw->phw_pmc)); cfgflags |= cpuflag; } else { /* unconfig */ cfgflags &= ~cpuflag; if (cfgflags == 0) phw->phw_pmc = NULL; } KASSERT(cfgflags >= 0 || cfgflags <= 3, ("[p4,%d] illegal runcount cfg=%d on cpu=%d ri=%d", __LINE__, cfgflags, cpu, ri)); P4_PCPU_SET_CFGFLAGS(pc,ri,cfgflags); mtx_unlock_spin(&pc->pc_mtx); return (0); } /* * Retrieve a configured PMC pointer from hardware state. */ static int p4_get_config(int cpu, int ri, struct pmc **ppm) { int cfgflags; struct p4_cpu *pc; KASSERT(cpu >= 0 && cpu < pmc_cpu_max(), ("[p4,%d] illegal CPU %d", __LINE__, cpu)); KASSERT(ri >= 0 && ri < P4_NPMCS, ("[p4,%d] illegal row-index %d", __LINE__, ri)); pc = p4_pcpu[P4_TO_HTT_PRIMARY(cpu)]; mtx_lock_spin(&pc->pc_mtx); cfgflags = P4_PCPU_GET_CFGFLAGS(pc,ri); mtx_unlock_spin(&pc->pc_mtx); if (cfgflags & P4_CPU_TO_FLAG(cpu)) *ppm = pc->pc_p4pmcs[ri].phw_pmc; /* PMC config'ed on this CPU */ else *ppm = NULL; return 0; } /* * Allocate a PMC. * * The allocation strategy differs between HTT and non-HTT systems. * * The non-HTT case: * - Given the desired event and the PMC row-index, lookup the * list of valid ESCRs for the event. * - For each valid ESCR: * - Check if the ESCR is free and the ESCR row is in a compatible * mode (i.e., system or process)) * - Check if the ESCR is usable with a P4 PMC at the desired row-index. * If everything matches, we determine the appropriate bit values for the * ESCR and CCCR registers. * * The HTT case: * * - Process mode PMCs require special care. The FreeBSD scheduler could * schedule any two processes on the same physical CPU. We need to ensure * that a given PMC row-index is never allocated to two different * PMCs owned by different user-processes. * This is ensured by always allocating a PMC from a 'FREE' PMC row * if the system has HTT active. * - A similar check needs to be done for ESCRs; we do not want two PMCs * using the same ESCR to be scheduled at the same time. Thus ESCR * allocation is also restricted to FREE rows if the system has HTT * enabled. * - Thirdly, some events are 'thread-independent' terminology, i.e., * the PMC hardware cannot distinguish between events caused by * different logical CPUs. This makes it impossible to assign events * to a given thread of execution. If the system has HTT enabled, * these events are not allowed for process-mode PMCs. */ static int p4_allocate_pmc(int cpu, int ri, struct pmc *pm, const struct pmc_op_pmcallocate *a) { int found, n, m; uint32_t caps, cccrvalue, escrvalue, tflags; enum pmc_p4escr escr; struct p4_cpu *pc; struct p4_event_descr *pevent; const struct p4pmc_descr *pd; KASSERT(cpu >= 0 && cpu < pmc_cpu_max(), ("[p4,%d] illegal CPU %d", __LINE__, cpu)); KASSERT(ri >= 0 && ri < P4_NPMCS, ("[p4,%d] illegal row-index value %d", __LINE__, ri)); pd = &p4_pmcdesc[ri]; PMCDBG4(MDP,ALL,1, "p4-allocate ri=%d class=%d pmccaps=0x%x " "reqcaps=0x%x", ri, pd->pm_descr.pd_class, pd->pm_descr.pd_caps, pm->pm_caps); /* check class */ if (pd->pm_descr.pd_class != a->pm_class) return (EINVAL); /* check requested capabilities */ caps = a->pm_caps; if ((pd->pm_descr.pd_caps & caps) != caps) return (EPERM); /* * If the system has HTT enabled, and the desired allocation * mode is process-private, and the PMC row disposition is not * FREE (0), decline the allocation. */ if (p4_system_has_htt && PMC_IS_VIRTUAL_MODE(PMC_TO_MODE(pm)) && pmc_getrowdisp(ri) != 0) return (EBUSY); KASSERT(pd->pm_descr.pd_class == PMC_CLASS_P4, ("[p4,%d] unknown PMC class %d", __LINE__, pd->pm_descr.pd_class)); if (pm->pm_event < PMC_EV_P4_FIRST || pm->pm_event > PMC_EV_P4_LAST) return (EINVAL); if ((pevent = p4_find_event(pm->pm_event)) == NULL) return (ESRCH); PMCDBG4(MDP,ALL,2, "pevent={ev=%d,escrsel=0x%x,cccrsel=0x%x,isti=%d}", pevent->pm_event, pevent->pm_escr_eventselect, pevent->pm_cccr_select, pevent->pm_is_ti_event); /* * Some PMC events are 'thread independent'and therefore * cannot be used for process-private modes if HTT is being * used. */ if (P4_EVENT_IS_TI(pevent) && PMC_IS_VIRTUAL_MODE(PMC_TO_MODE(pm)) && p4_system_has_htt) return (EINVAL); pc = p4_pcpu[P4_TO_HTT_PRIMARY(cpu)]; found = 0; /* look for a suitable ESCR for this event */ for (n = 0; n < P4_MAX_ESCR_PER_EVENT && !found; n++) { if ((escr = pevent->pm_escrs[n]) == P4_ESCR_NONE) break; /* out of ESCRs */ /* * Check ESCR row disposition. * * If the request is for a system-mode PMC, then the * ESCR row should not be in process-virtual mode, and * should also be free on the current CPU. */ if (PMC_IS_SYSTEM_MODE(PMC_TO_MODE(pm))) { if (P4_ESCR_ROW_DISP_IS_THREAD(escr) || pc->pc_escrs[escr] != P4_INVALID_PMC_INDEX) continue; } /* * If the request is for a process-virtual PMC, and if * HTT is not enabled, we can use an ESCR row that is * either FREE or already in process mode. * * If HTT is enabled, then we need to ensure that a * given ESCR is never allocated to two PMCS that * could run simultaneously on the two logical CPUs of * a CPU package. We ensure this be only allocating * ESCRs from rows marked as 'FREE'. */ if (PMC_IS_VIRTUAL_MODE(PMC_TO_MODE(pm))) { if (p4_system_has_htt) { if (!P4_ESCR_ROW_DISP_IS_FREE(escr)) continue; } else if (P4_ESCR_ROW_DISP_IS_STANDALONE(escr)) continue; } /* * We found a suitable ESCR for this event. Now check if * this escr can work with the PMC at row-index 'ri'. */ for (m = 0; m < P4_MAX_PMC_PER_ESCR; m++) if (p4_escrs[escr].pm_pmcs[m] == pd->pm_pmcnum) { found = 1; break; } } if (found == 0) return (ESRCH); KASSERT((int) escr >= 0 && escr < P4_NESCR, ("[p4,%d] illegal ESCR value %d", __LINE__, escr)); /* mark ESCR row mode */ if (PMC_IS_SYSTEM_MODE(PMC_TO_MODE(pm))) { pc->pc_escrs[escr] = ri; /* mark ESCR as in use on this cpu */ P4_ESCR_MARK_ROW_STANDALONE(escr); } else { KASSERT(pc->pc_escrs[escr] == P4_INVALID_PMC_INDEX, ("[p4,%d] escr[%d] already in use", __LINE__, escr)); P4_ESCR_MARK_ROW_THREAD(escr); } pm->pm_md.pm_p4.pm_p4_escrmsr = p4_escrs[escr].pm_escr_msr; pm->pm_md.pm_p4.pm_p4_escr = escr; cccrvalue = P4_CCCR_TO_ESCR_SELECT(pevent->pm_cccr_select); escrvalue = P4_ESCR_TO_EVENT_SELECT(pevent->pm_escr_eventselect); /* CCCR fields */ if (caps & PMC_CAP_THRESHOLD) cccrvalue |= (a->pm_md.pm_p4.pm_p4_cccrconfig & P4_CCCR_THRESHOLD_MASK) | P4_CCCR_COMPARE; if (caps & PMC_CAP_EDGE) cccrvalue |= P4_CCCR_EDGE; if (caps & PMC_CAP_INVERT) cccrvalue |= P4_CCCR_COMPLEMENT; if (p4_system_has_htt) cccrvalue |= a->pm_md.pm_p4.pm_p4_cccrconfig & P4_CCCR_ACTIVE_THREAD_MASK; else /* no HTT; thread field should be '11b' */ cccrvalue |= P4_CCCR_TO_ACTIVE_THREAD(0x3); if (caps & PMC_CAP_CASCADE) cccrvalue |= P4_CCCR_CASCADE; /* On HTT systems the PMI T0 field may get moved to T1 at pmc start */ if (caps & PMC_CAP_INTERRUPT) cccrvalue |= P4_CCCR_OVF_PMI_T0; /* ESCR fields */ if (caps & PMC_CAP_QUALIFIER) escrvalue |= a->pm_md.pm_p4.pm_p4_escrconfig & P4_ESCR_EVENT_MASK_MASK; if (caps & PMC_CAP_TAGGING) escrvalue |= (a->pm_md.pm_p4.pm_p4_escrconfig & P4_ESCR_TAG_VALUE_MASK) | P4_ESCR_TAG_ENABLE; if (caps & PMC_CAP_QUALIFIER) escrvalue |= (a->pm_md.pm_p4.pm_p4_escrconfig & P4_ESCR_EVENT_MASK_MASK); /* HTT: T0_{OS,USR} bits may get moved to T1 at pmc start */ tflags = 0; if (caps & PMC_CAP_SYSTEM) tflags |= P4_ESCR_T0_OS; if (caps & PMC_CAP_USER) tflags |= P4_ESCR_T0_USR; if (tflags == 0) tflags = (P4_ESCR_T0_OS|P4_ESCR_T0_USR); escrvalue |= tflags; pm->pm_md.pm_p4.pm_p4_cccrvalue = cccrvalue; pm->pm_md.pm_p4.pm_p4_escrvalue = escrvalue; PMCDBG5(MDP,ALL,2, "p4-allocate cccrsel=0x%x cccrval=0x%x " "escr=%d escrmsr=0x%x escrval=0x%x", pevent->pm_cccr_select, cccrvalue, escr, pm->pm_md.pm_p4.pm_p4_escrmsr, escrvalue); return (0); } /* * release a PMC. */ static int p4_release_pmc(int cpu, int ri, struct pmc *pm) { enum pmc_p4escr escr; struct p4_cpu *pc; KASSERT(ri >= 0 && ri < P4_NPMCS, ("[p4,%d] illegal row-index %d", __LINE__, ri)); escr = pm->pm_md.pm_p4.pm_p4_escr; PMCDBG3(MDP,REL,1, "p4-release cpu=%d ri=%d escr=%d", cpu, ri, escr); if (PMC_IS_SYSTEM_MODE(PMC_TO_MODE(pm))) { pc = p4_pcpu[P4_TO_HTT_PRIMARY(cpu)]; KASSERT(pc->pc_p4pmcs[ri].phw_pmc == NULL, ("[p4,%d] releasing configured PMC ri=%d", __LINE__, ri)); P4_ESCR_UNMARK_ROW_STANDALONE(escr); KASSERT(pc->pc_escrs[escr] == ri, ("[p4,%d] escr[%d] not allocated to ri %d", __LINE__, escr, ri)); pc->pc_escrs[escr] = P4_INVALID_PMC_INDEX; /* mark as free */ } else P4_ESCR_UNMARK_ROW_THREAD(escr); return (0); } /* * Start a PMC */ static int p4_start_pmc(int cpu, int ri) { int rc; struct pmc *pm; struct p4_cpu *pc; struct p4pmc_descr *pd; uint32_t cccrvalue, cccrtbits, escrvalue, escrmsr, escrtbits; KASSERT(cpu >= 0 && cpu < pmc_cpu_max(), ("[p4,%d] illegal CPU value %d", __LINE__, cpu)); KASSERT(ri >= 0 && ri < P4_NPMCS, ("[p4,%d] illegal row-index %d", __LINE__, ri)); pc = p4_pcpu[P4_TO_HTT_PRIMARY(cpu)]; pm = pc->pc_p4pmcs[ri].phw_pmc; pd = &p4_pmcdesc[ri]; KASSERT(pm != NULL, ("[p4,%d] starting cpu%d,pmc%d with null pmc", __LINE__, cpu, ri)); PMCDBG2(MDP,STA,1, "p4-start cpu=%d ri=%d", cpu, ri); KASSERT(pd->pm_descr.pd_class == PMC_CLASS_P4, ("[p4,%d] wrong PMC class %d", __LINE__, pd->pm_descr.pd_class)); /* retrieve the desired CCCR/ESCR values from the PMC */ cccrvalue = pm->pm_md.pm_p4.pm_p4_cccrvalue; escrvalue = pm->pm_md.pm_p4.pm_p4_escrvalue; escrmsr = pm->pm_md.pm_p4.pm_p4_escrmsr; /* extract and zero the logical processor selection bits */ cccrtbits = cccrvalue & P4_CCCR_OVF_PMI_T0; escrtbits = escrvalue & (P4_ESCR_T0_OS|P4_ESCR_T0_USR); cccrvalue &= ~P4_CCCR_OVF_PMI_T0; escrvalue &= ~(P4_ESCR_T0_OS|P4_ESCR_T0_USR); if (P4_CPU_IS_HTT_SECONDARY(cpu)) { /* shift T0 bits to T1 position */ cccrtbits <<= 1; escrtbits >>= 2; } /* start system mode PMCs directly */ if (PMC_IS_SYSTEM_MODE(PMC_TO_MODE(pm))) { wrmsr(escrmsr, escrvalue | escrtbits); wrmsr(pd->pm_cccr_msr, cccrvalue | cccrtbits | P4_CCCR_ENABLE); return 0; } /* * Thread mode PMCs * * On HTT machines, the same PMC could be scheduled on the * same physical CPU twice (once for each logical CPU), for * example, if two threads of a multi-threaded process get * scheduled on the same CPU. * */ mtx_lock_spin(&pc->pc_mtx); rc = P4_PCPU_GET_RUNCOUNT(pc,ri); KASSERT(rc == 0 || rc == 1, ("[p4,%d] illegal runcount cpu=%d ri=%d rc=%d", __LINE__, cpu, ri, rc)); if (rc == 0) { /* 1st CPU and the non-HTT case */ KASSERT(P4_PMC_IS_STOPPED(pd->pm_cccr_msr), ("[p4,%d] cpu=%d ri=%d cccr=0x%x not stopped", __LINE__, cpu, ri, pd->pm_cccr_msr)); /* write out the low 40 bits of the saved value to hardware */ wrmsr(pd->pm_pmc_msr, P4_PCPU_PMC_VALUE(pc,ri,cpu) & P4_PERFCTR_MASK); } else if (rc == 1) { /* 2nd CPU */ /* * Stop the PMC and retrieve the CCCR and ESCR values * from their MSRs, and turn on the additional T[0/1] * bits for the 2nd CPU. */ cccrvalue = rdmsr(pd->pm_cccr_msr); wrmsr(pd->pm_cccr_msr, cccrvalue & ~P4_CCCR_ENABLE); /* check that the configuration bits read back match the PMC */ KASSERT((cccrvalue & P4_CCCR_Tx_MASK) == (pm->pm_md.pm_p4.pm_p4_cccrvalue & P4_CCCR_Tx_MASK), ("[p4,%d] Extra CCCR bits cpu=%d rc=%d ri=%d " "cccr=0x%x PMC=0x%x", __LINE__, cpu, rc, ri, cccrvalue & P4_CCCR_Tx_MASK, pm->pm_md.pm_p4.pm_p4_cccrvalue & P4_CCCR_Tx_MASK)); KASSERT(cccrvalue & P4_CCCR_ENABLE, ("[p4,%d] 2nd cpu rc=%d cpu=%d ri=%d not running", __LINE__, rc, cpu, ri)); KASSERT((cccrvalue & cccrtbits) == 0, ("[p4,%d] CCCR T0/T1 mismatch rc=%d cpu=%d ri=%d" "cccrvalue=0x%x tbits=0x%x", __LINE__, rc, cpu, ri, cccrvalue, cccrtbits)); escrvalue = rdmsr(escrmsr); KASSERT((escrvalue & P4_ESCR_Tx_MASK) == (pm->pm_md.pm_p4.pm_p4_escrvalue & P4_ESCR_Tx_MASK), ("[p4,%d] Extra ESCR bits cpu=%d rc=%d ri=%d " "escr=0x%x pm=0x%x", __LINE__, cpu, rc, ri, escrvalue & P4_ESCR_Tx_MASK, pm->pm_md.pm_p4.pm_p4_escrvalue & P4_ESCR_Tx_MASK)); KASSERT((escrvalue & escrtbits) == 0, ("[p4,%d] ESCR T0/T1 mismatch rc=%d cpu=%d ri=%d " "escrmsr=0x%x escrvalue=0x%x tbits=0x%x", __LINE__, rc, cpu, ri, escrmsr, escrvalue, escrtbits)); } /* Enable the correct bits for this CPU. */ escrvalue |= escrtbits; cccrvalue |= cccrtbits | P4_CCCR_ENABLE; /* Save HW value at the time of starting hardware */ P4_PCPU_HW_VALUE(pc,ri,cpu) = rdmsr(pd->pm_pmc_msr); /* Program the ESCR and CCCR and start the PMC */ wrmsr(escrmsr, escrvalue); wrmsr(pd->pm_cccr_msr, cccrvalue); ++rc; P4_PCPU_SET_RUNCOUNT(pc,ri,rc); mtx_unlock_spin(&pc->pc_mtx); PMCDBG6(MDP,STA,2,"p4-start cpu=%d rc=%d ri=%d escr=%d " "escrmsr=0x%x escrvalue=0x%x", cpu, rc, ri, pm->pm_md.pm_p4.pm_p4_escr, escrmsr, escrvalue); PMCDBG2(MDP,STA,2,"cccr_config=0x%x v=%jx", cccrvalue, P4_PCPU_HW_VALUE(pc,ri,cpu)); return (0); } /* * Stop a PMC. */ static int p4_stop_pmc(int cpu, int ri) { int rc; uint32_t cccrvalue, cccrtbits, escrvalue, escrmsr, escrtbits; struct pmc *pm; struct p4_cpu *pc; struct p4pmc_descr *pd; pmc_value_t tmp; KASSERT(cpu >= 0 && cpu < pmc_cpu_max(), ("[p4,%d] illegal CPU value %d", __LINE__, cpu)); KASSERT(ri >= 0 && ri < P4_NPMCS, ("[p4,%d] illegal row index %d", __LINE__, ri)); pd = &p4_pmcdesc[ri]; pc = p4_pcpu[P4_TO_HTT_PRIMARY(cpu)]; pm = pc->pc_p4pmcs[ri].phw_pmc; KASSERT(pm != NULL, ("[p4,%d] null pmc for cpu%d, ri%d", __LINE__, cpu, ri)); PMCDBG2(MDP,STO,1, "p4-stop cpu=%d ri=%d", cpu, ri); if (PMC_IS_SYSTEM_MODE(PMC_TO_MODE(pm))) { wrmsr(pd->pm_cccr_msr, pm->pm_md.pm_p4.pm_p4_cccrvalue & ~P4_CCCR_ENABLE); return (0); } /* * Thread mode PMCs. * * On HTT machines, this PMC may be in use by two threads * running on two logical CPUS. Thus we look at the * 'runcount' field and only turn off the appropriate TO/T1 * bits (and keep the PMC running) if two logical CPUs were * using the PMC. * */ /* bits to mask */ cccrtbits = P4_CCCR_OVF_PMI_T0; escrtbits = P4_ESCR_T0_OS | P4_ESCR_T0_USR; if (P4_CPU_IS_HTT_SECONDARY(cpu)) { cccrtbits <<= 1; escrtbits >>= 2; } mtx_lock_spin(&pc->pc_mtx); rc = P4_PCPU_GET_RUNCOUNT(pc,ri); KASSERT(rc == 2 || rc == 1, ("[p4,%d] illegal runcount cpu=%d ri=%d rc=%d", __LINE__, cpu, ri, rc)); --rc; P4_PCPU_SET_RUNCOUNT(pc,ri,rc); /* Stop this PMC */ cccrvalue = rdmsr(pd->pm_cccr_msr); wrmsr(pd->pm_cccr_msr, cccrvalue & ~P4_CCCR_ENABLE); escrmsr = pm->pm_md.pm_p4.pm_p4_escrmsr; escrvalue = rdmsr(escrmsr); /* The current CPU should be running on this PMC */ KASSERT(escrvalue & escrtbits, ("[p4,%d] ESCR T0/T1 mismatch cpu=%d rc=%d ri=%d escrmsr=0x%x " "escrvalue=0x%x tbits=0x%x", __LINE__, cpu, rc, ri, escrmsr, escrvalue, escrtbits)); KASSERT(PMC_IS_COUNTING_MODE(PMC_TO_MODE(pm)) || (cccrvalue & cccrtbits), ("[p4,%d] CCCR T0/T1 mismatch cpu=%d ri=%d cccrvalue=0x%x " "tbits=0x%x", __LINE__, cpu, ri, cccrvalue, cccrtbits)); /* get the current hardware reading */ tmp = rdmsr(pd->pm_pmc_msr); if (rc == 1) { /* need to keep the PMC running */ escrvalue &= ~escrtbits; cccrvalue &= ~cccrtbits; wrmsr(escrmsr, escrvalue); wrmsr(pd->pm_cccr_msr, cccrvalue); } mtx_unlock_spin(&pc->pc_mtx); PMCDBG5(MDP,STO,2, "p4-stop cpu=%d rc=%d ri=%d escrmsr=0x%x " "escrval=0x%x", cpu, rc, ri, escrmsr, escrvalue); PMCDBG2(MDP,STO,2, "cccrval=0x%x v=%jx", cccrvalue, tmp); if (tmp < P4_PCPU_HW_VALUE(pc,ri,cpu)) /* 40 bit counter overflow */ tmp += (P4_PERFCTR_MASK + 1) - P4_PCPU_HW_VALUE(pc,ri,cpu); else tmp -= P4_PCPU_HW_VALUE(pc,ri,cpu); P4_PCPU_PMC_VALUE(pc,ri,cpu) += tmp; return 0; } /* * Handle an interrupt. * * The hardware sets the CCCR_OVF whenever a counter overflow occurs, * so the handler examines all the 18 CCCR registers, processing the * counters that have overflowed. * * On HTT machines, the CCCR register is shared and will interrupt * both logical processors if so configured. Thus multiple logical * CPUs could enter the NMI service routine at the same time. These * will get serialized using a per-cpu spinlock dedicated for use in * the NMI handler. */ static int p4_intr(int cpu, struct trapframe *tf) { uint32_t cccrval, ovf_mask, ovf_partner; int did_interrupt, error, ri; struct p4_cpu *pc; struct pmc *pm; pmc_value_t v; PMCDBG3(MDP,INT, 1, "cpu=%d tf=0x%p um=%d", cpu, (void *) tf, TRAPF_USERMODE(tf)); pc = p4_pcpu[P4_TO_HTT_PRIMARY(cpu)]; ovf_mask = P4_CPU_IS_HTT_SECONDARY(cpu) ? P4_CCCR_OVF_PMI_T1 : P4_CCCR_OVF_PMI_T0; ovf_mask |= P4_CCCR_OVF; if (p4_system_has_htt) ovf_partner = P4_CPU_IS_HTT_SECONDARY(cpu) ? P4_CCCR_OVF_PMI_T0 : P4_CCCR_OVF_PMI_T1; else ovf_partner = 0; did_interrupt = 0; if (p4_system_has_htt) P4_PCPU_ACQ_INTR_SPINLOCK(pc); /* * Loop through all CCCRs, looking for ones that have * interrupted this CPU. */ for (ri = 0; ri < P4_NPMCS; ri++) { /* * Check if our partner logical CPU has already marked * this PMC has having interrupted it. If so, reset * the flag and process the interrupt, but leave the * hardware alone. */ if (p4_system_has_htt && P4_PCPU_GET_INTRFLAG(pc,ri)) { P4_PCPU_SET_INTRFLAG(pc,ri,0); did_interrupt = 1; /* * Ignore de-configured or stopped PMCs. * Ignore PMCs not in sampling mode. */ pm = pc->pc_p4pmcs[ri].phw_pmc; if (pm == NULL || pm->pm_state != PMC_STATE_RUNNING || !PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm))) { continue; } (void) pmc_process_interrupt(cpu, PMC_HR, pm, tf, TRAPF_USERMODE(tf)); continue; } /* * Fresh interrupt. Look for the CCCR_OVF bit * and the OVF_Tx bit for this logical * processor being set. */ cccrval = rdmsr(P4_CCCR_MSR_FIRST + ri); if ((cccrval & ovf_mask) != ovf_mask) continue; /* * If the other logical CPU would also have been * interrupted due to the PMC being shared, record * this fact in the per-cpu saved interrupt flag * bitmask. */ if (p4_system_has_htt && (cccrval & ovf_partner)) P4_PCPU_SET_INTRFLAG(pc, ri, 1); v = rdmsr(P4_PERFCTR_MSR_FIRST + ri); PMCDBG2(MDP,INT, 2, "ri=%d v=%jx", ri, v); /* Stop the counter, and reset the overflow bit */ cccrval &= ~(P4_CCCR_OVF | P4_CCCR_ENABLE); wrmsr(P4_CCCR_MSR_FIRST + ri, cccrval); did_interrupt = 1; /* * Ignore de-configured or stopped PMCs. Ignore PMCs * not in sampling mode. */ pm = pc->pc_p4pmcs[ri].phw_pmc; if (pm == NULL || pm->pm_state != PMC_STATE_RUNNING || !PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm))) { continue; } /* * Process the interrupt. Re-enable the PMC if * processing was successful. */ error = pmc_process_interrupt(cpu, PMC_HR, pm, tf, TRAPF_USERMODE(tf)); /* * Only the first processor executing the NMI handler * in a HTT pair will restart a PMC, and that too * only if there were no errors. */ v = P4_RELOAD_COUNT_TO_PERFCTR_VALUE( pm->pm_sc.pm_reloadcount); wrmsr(P4_PERFCTR_MSR_FIRST + ri, v); if (error == 0) wrmsr(P4_CCCR_MSR_FIRST + ri, cccrval | P4_CCCR_ENABLE); } /* allow the other CPU to proceed */ if (p4_system_has_htt) P4_PCPU_REL_INTR_SPINLOCK(pc); /* * On Intel P4 CPUs, the PMC 'pcint' entry in the LAPIC gets * masked when a PMC interrupts the CPU. We need to unmask * the interrupt source explicitly. */ if (did_interrupt) lapic_reenable_pmc(); atomic_add_int(did_interrupt ? &pmc_stats.pm_intr_processed : &pmc_stats.pm_intr_ignored, 1); return (did_interrupt); } /* * Describe a CPU's PMC state. */ static int p4_describe(int cpu, int ri, struct pmc_info *pi, struct pmc **ppmc) { int error; size_t copied; const struct p4pmc_descr *pd; KASSERT(cpu >= 0 && cpu < pmc_cpu_max(), ("[p4,%d] illegal CPU %d", __LINE__, cpu)); KASSERT(ri >= 0 && ri < P4_NPMCS, ("[p4,%d] row-index %d out of range", __LINE__, ri)); PMCDBG2(MDP,OPS,1,"p4-describe cpu=%d ri=%d", cpu, ri); if (P4_CPU_IS_HTT_SECONDARY(cpu)) return (EINVAL); pd = &p4_pmcdesc[ri]; if ((error = copystr(pd->pm_descr.pd_name, pi->pm_name, PMC_NAME_MAX, &copied)) != 0) return (error); pi->pm_class = pd->pm_descr.pd_class; if (p4_pcpu[cpu]->pc_p4pmcs[ri].phw_state & PMC_PHW_FLAG_IS_ENABLED) { pi->pm_enabled = TRUE; *ppmc = p4_pcpu[cpu]->pc_p4pmcs[ri].phw_pmc; } else { pi->pm_enabled = FALSE; *ppmc = NULL; } return (0); } /* * Get MSR# for use with RDPMC. */ static int p4_get_msr(int ri, uint32_t *msr) { KASSERT(ri >= 0 && ri < P4_NPMCS, ("[p4,%d] ri %d out of range", __LINE__, ri)); *msr = p4_pmcdesc[ri].pm_pmc_msr - P4_PERFCTR_MSR_FIRST; PMCDBG2(MDP,OPS, 1, "ri=%d getmsr=0x%x", ri, *msr); return 0; } int pmc_p4_initialize(struct pmc_mdep *md, int ncpus) { struct pmc_classdep *pcd; struct p4_event_descr *pe; KASSERT(md != NULL, ("[p4,%d] md is NULL", __LINE__)); KASSERT(cpu_vendor_id == CPU_VENDOR_INTEL, ("[p4,%d] Initializing non-intel processor", __LINE__)); PMCDBG0(MDP,INI,1, "p4-initialize"); /* Allocate space for pointers to per-cpu descriptors. */ p4_pcpu = malloc(sizeof(*p4_pcpu) * ncpus, M_PMC, M_ZERO | M_WAITOK); /* Fill in the class dependent descriptor. */ pcd = &md->pmd_classdep[PMC_MDEP_CLASS_INDEX_P4]; switch (md->pmd_cputype) { case PMC_CPU_INTEL_PIV: pcd->pcd_caps = P4_PMC_CAPS; pcd->pcd_class = PMC_CLASS_P4; pcd->pcd_num = P4_NPMCS; pcd->pcd_ri = md->pmd_npmc; pcd->pcd_width = 40; pcd->pcd_allocate_pmc = p4_allocate_pmc; pcd->pcd_config_pmc = p4_config_pmc; pcd->pcd_describe = p4_describe; pcd->pcd_get_config = p4_get_config; pcd->pcd_get_msr = p4_get_msr; pcd->pcd_pcpu_fini = p4_pcpu_fini; pcd->pcd_pcpu_init = p4_pcpu_init; pcd->pcd_read_pmc = p4_read_pmc; pcd->pcd_release_pmc = p4_release_pmc; pcd->pcd_start_pmc = p4_start_pmc; pcd->pcd_stop_pmc = p4_stop_pmc; pcd->pcd_write_pmc = p4_write_pmc; md->pmd_pcpu_fini = NULL; md->pmd_pcpu_init = NULL; md->pmd_intr = p4_intr; md->pmd_npmc += P4_NPMCS; /* model specific configuration */ if ((cpu_id & 0xFFF) < 0xF27) { /* * On P4 and Xeon with CPUID < (Family 15, * Model 2, Stepping 7), only one ESCR is * available for the IOQ_ALLOCATION event. */ pe = p4_find_event(PMC_EV_P4_IOQ_ALLOCATION); pe->pm_escrs[1] = P4_ESCR_NONE; } break; default: KASSERT(0,("[p4,%d] Unknown CPU type", __LINE__)); return ENOSYS; } return (0); } void pmc_p4_finalize(struct pmc_mdep *md) { #if defined(INVARIANTS) int i, ncpus; #endif KASSERT(p4_pcpu != NULL, ("[p4,%d] NULL p4_pcpu", __LINE__)); #if defined(INVARIANTS) ncpus = pmc_cpu_max(); for (i = 0; i < ncpus; i++) KASSERT(p4_pcpu[i] == NULL, ("[p4,%d] non-null pcpu %d", __LINE__, i)); #endif free(p4_pcpu, M_PMC); p4_pcpu = NULL; } Index: head/sys/dev/ioat/ioat.c =================================================================== --- head/sys/dev/ioat/ioat.c (revision 299352) +++ head/sys/dev/ioat/ioat.c (revision 299353) @@ -1,2087 +1,2085 @@ /*- * Copyright (C) 2012 Intel Corporation * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "ioat.h" #include "ioat_hw.h" #include "ioat_internal.h" #ifndef BUS_SPACE_MAXADDR_40BIT #define BUS_SPACE_MAXADDR_40BIT 0xFFFFFFFFFFULL #endif #define IOAT_INTR_TIMO (hz / 10) #define IOAT_REFLK (&ioat->submit_lock) static int ioat_probe(device_t device); static int ioat_attach(device_t device); static int ioat_detach(device_t device); static int ioat_setup_intr(struct ioat_softc *ioat); static int ioat_teardown_intr(struct ioat_softc *ioat); static int ioat3_attach(device_t device); static int ioat_start_channel(struct ioat_softc *ioat); static int ioat_map_pci_bar(struct ioat_softc *ioat); static void ioat_dmamap_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error); static void ioat_interrupt_handler(void *arg); static boolean_t ioat_model_resets_msix(struct ioat_softc *ioat); static int chanerr_to_errno(uint32_t); static void ioat_process_events(struct ioat_softc *ioat); static inline uint32_t ioat_get_active(struct ioat_softc *ioat); static inline uint32_t ioat_get_ring_space(struct ioat_softc *ioat); static void ioat_free_ring(struct ioat_softc *, uint32_t size, struct ioat_descriptor **); static void ioat_free_ring_entry(struct ioat_softc *ioat, struct ioat_descriptor *desc); static struct ioat_descriptor *ioat_alloc_ring_entry(struct ioat_softc *, int mflags); static int ioat_reserve_space(struct ioat_softc *, uint32_t, int mflags); static struct ioat_descriptor *ioat_get_ring_entry(struct ioat_softc *ioat, uint32_t index); static struct ioat_descriptor **ioat_prealloc_ring(struct ioat_softc *, uint32_t size, boolean_t need_dscr, int mflags); static int ring_grow(struct ioat_softc *, uint32_t oldorder, struct ioat_descriptor **); static int ring_shrink(struct ioat_softc *, uint32_t oldorder, struct ioat_descriptor **); static void ioat_halted_debug(struct ioat_softc *, uint32_t); static void ioat_timer_callback(void *arg); static void dump_descriptor(void *hw_desc); static void ioat_submit_single(struct ioat_softc *ioat); static void ioat_comp_update_map(void *arg, bus_dma_segment_t *seg, int nseg, int error); static int ioat_reset_hw(struct ioat_softc *ioat); static void ioat_reset_hw_task(void *, int); static void ioat_setup_sysctl(device_t device); static int sysctl_handle_reset(SYSCTL_HANDLER_ARGS); static inline struct ioat_softc *ioat_get(struct ioat_softc *, enum ioat_ref_kind); static inline void ioat_put(struct ioat_softc *, enum ioat_ref_kind); static inline void _ioat_putn(struct ioat_softc *, uint32_t, enum ioat_ref_kind, boolean_t); static inline void ioat_putn(struct ioat_softc *, uint32_t, enum ioat_ref_kind); static inline void ioat_putn_locked(struct ioat_softc *, uint32_t, enum ioat_ref_kind); static void ioat_drain_locked(struct ioat_softc *); #define ioat_log_message(v, ...) do { \ if ((v) <= g_ioat_debug_level) { \ device_printf(ioat->device, __VA_ARGS__); \ } \ } while (0) MALLOC_DEFINE(M_IOAT, "ioat", "ioat driver memory allocations"); SYSCTL_NODE(_hw, OID_AUTO, ioat, CTLFLAG_RD, 0, "ioat node"); static int g_force_legacy_interrupts; SYSCTL_INT(_hw_ioat, OID_AUTO, force_legacy_interrupts, CTLFLAG_RDTUN, &g_force_legacy_interrupts, 0, "Set to non-zero to force MSI-X disabled"); int g_ioat_debug_level = 0; SYSCTL_INT(_hw_ioat, OID_AUTO, debug_level, CTLFLAG_RWTUN, &g_ioat_debug_level, 0, "Set log level (0-3) for ioat(4). Higher is more verbose."); /* * OS <-> Driver interface structures */ static device_method_t ioat_pci_methods[] = { /* Device interface */ DEVMETHOD(device_probe, ioat_probe), DEVMETHOD(device_attach, ioat_attach), DEVMETHOD(device_detach, ioat_detach), DEVMETHOD_END }; static driver_t ioat_pci_driver = { "ioat", ioat_pci_methods, sizeof(struct ioat_softc), }; static devclass_t ioat_devclass; DRIVER_MODULE(ioat, pci, ioat_pci_driver, ioat_devclass, 0, 0); MODULE_VERSION(ioat, 1); /* * Private data structures */ static struct ioat_softc *ioat_channel[IOAT_MAX_CHANNELS]; static int ioat_channel_index = 0; SYSCTL_INT(_hw_ioat, OID_AUTO, channels, CTLFLAG_RD, &ioat_channel_index, 0, "Number of IOAT channels attached"); static struct _pcsid { u_int32_t type; const char *desc; } pci_ids[] = { { 0x34308086, "TBG IOAT Ch0" }, { 0x34318086, "TBG IOAT Ch1" }, { 0x34328086, "TBG IOAT Ch2" }, { 0x34338086, "TBG IOAT Ch3" }, { 0x34298086, "TBG IOAT Ch4" }, { 0x342a8086, "TBG IOAT Ch5" }, { 0x342b8086, "TBG IOAT Ch6" }, { 0x342c8086, "TBG IOAT Ch7" }, { 0x37108086, "JSF IOAT Ch0" }, { 0x37118086, "JSF IOAT Ch1" }, { 0x37128086, "JSF IOAT Ch2" }, { 0x37138086, "JSF IOAT Ch3" }, { 0x37148086, "JSF IOAT Ch4" }, { 0x37158086, "JSF IOAT Ch5" }, { 0x37168086, "JSF IOAT Ch6" }, { 0x37178086, "JSF IOAT Ch7" }, { 0x37188086, "JSF IOAT Ch0 (RAID)" }, { 0x37198086, "JSF IOAT Ch1 (RAID)" }, { 0x3c208086, "SNB IOAT Ch0" }, { 0x3c218086, "SNB IOAT Ch1" }, { 0x3c228086, "SNB IOAT Ch2" }, { 0x3c238086, "SNB IOAT Ch3" }, { 0x3c248086, "SNB IOAT Ch4" }, { 0x3c258086, "SNB IOAT Ch5" }, { 0x3c268086, "SNB IOAT Ch6" }, { 0x3c278086, "SNB IOAT Ch7" }, { 0x3c2e8086, "SNB IOAT Ch0 (RAID)" }, { 0x3c2f8086, "SNB IOAT Ch1 (RAID)" }, { 0x0e208086, "IVB IOAT Ch0" }, { 0x0e218086, "IVB IOAT Ch1" }, { 0x0e228086, "IVB IOAT Ch2" }, { 0x0e238086, "IVB IOAT Ch3" }, { 0x0e248086, "IVB IOAT Ch4" }, { 0x0e258086, "IVB IOAT Ch5" }, { 0x0e268086, "IVB IOAT Ch6" }, { 0x0e278086, "IVB IOAT Ch7" }, { 0x0e2e8086, "IVB IOAT Ch0 (RAID)" }, { 0x0e2f8086, "IVB IOAT Ch1 (RAID)" }, { 0x2f208086, "HSW IOAT Ch0" }, { 0x2f218086, "HSW IOAT Ch1" }, { 0x2f228086, "HSW IOAT Ch2" }, { 0x2f238086, "HSW IOAT Ch3" }, { 0x2f248086, "HSW IOAT Ch4" }, { 0x2f258086, "HSW IOAT Ch5" }, { 0x2f268086, "HSW IOAT Ch6" }, { 0x2f278086, "HSW IOAT Ch7" }, { 0x2f2e8086, "HSW IOAT Ch0 (RAID)" }, { 0x2f2f8086, "HSW IOAT Ch1 (RAID)" }, { 0x0c508086, "BWD IOAT Ch0" }, { 0x0c518086, "BWD IOAT Ch1" }, { 0x0c528086, "BWD IOAT Ch2" }, { 0x0c538086, "BWD IOAT Ch3" }, { 0x6f508086, "BDXDE IOAT Ch0" }, { 0x6f518086, "BDXDE IOAT Ch1" }, { 0x6f528086, "BDXDE IOAT Ch2" }, { 0x6f538086, "BDXDE IOAT Ch3" }, { 0x6f208086, "BDX IOAT Ch0" }, { 0x6f218086, "BDX IOAT Ch1" }, { 0x6f228086, "BDX IOAT Ch2" }, { 0x6f238086, "BDX IOAT Ch3" }, { 0x6f248086, "BDX IOAT Ch4" }, { 0x6f258086, "BDX IOAT Ch5" }, { 0x6f268086, "BDX IOAT Ch6" }, { 0x6f278086, "BDX IOAT Ch7" }, { 0x6f2e8086, "BDX IOAT Ch0 (RAID)" }, { 0x6f2f8086, "BDX IOAT Ch1 (RAID)" }, { 0x00000000, NULL } }; /* * OS <-> Driver linkage functions */ static int ioat_probe(device_t device) { struct _pcsid *ep; u_int32_t type; type = pci_get_devid(device); for (ep = pci_ids; ep->type; ep++) { if (ep->type == type) { device_set_desc(device, ep->desc); return (0); } } return (ENXIO); } static int ioat_attach(device_t device) { struct ioat_softc *ioat; int error; ioat = DEVICE2SOFTC(device); ioat->device = device; error = ioat_map_pci_bar(ioat); if (error != 0) goto err; ioat->version = ioat_read_cbver(ioat); if (ioat->version < IOAT_VER_3_0) { error = ENODEV; goto err; } error = ioat3_attach(device); if (error != 0) goto err; error = pci_enable_busmaster(device); if (error != 0) goto err; error = ioat_setup_intr(ioat); if (error != 0) goto err; error = ioat_reset_hw(ioat); if (error != 0) goto err; ioat_process_events(ioat); ioat_setup_sysctl(device); ioat->chan_idx = ioat_channel_index; ioat_channel[ioat_channel_index++] = ioat; ioat_test_attach(); err: if (error != 0) ioat_detach(device); return (error); } static int ioat_detach(device_t device) { struct ioat_softc *ioat; ioat = DEVICE2SOFTC(device); ioat_test_detach(); taskqueue_drain(taskqueue_thread, &ioat->reset_task); mtx_lock(IOAT_REFLK); ioat->quiescing = TRUE; ioat->destroying = TRUE; wakeup(&ioat->quiescing); ioat_channel[ioat->chan_idx] = NULL; ioat_drain_locked(ioat); mtx_unlock(IOAT_REFLK); ioat_teardown_intr(ioat); callout_drain(&ioat->timer); pci_disable_busmaster(device); if (ioat->pci_resource != NULL) bus_release_resource(device, SYS_RES_MEMORY, ioat->pci_resource_id, ioat->pci_resource); if (ioat->ring != NULL) ioat_free_ring(ioat, 1 << ioat->ring_size_order, ioat->ring); if (ioat->comp_update != NULL) { bus_dmamap_unload(ioat->comp_update_tag, ioat->comp_update_map); bus_dmamem_free(ioat->comp_update_tag, ioat->comp_update, ioat->comp_update_map); bus_dma_tag_destroy(ioat->comp_update_tag); } bus_dma_tag_destroy(ioat->hw_desc_tag); return (0); } static int ioat_teardown_intr(struct ioat_softc *ioat) { if (ioat->tag != NULL) bus_teardown_intr(ioat->device, ioat->res, ioat->tag); if (ioat->res != NULL) bus_release_resource(ioat->device, SYS_RES_IRQ, rman_get_rid(ioat->res), ioat->res); pci_release_msi(ioat->device); return (0); } static int ioat_start_channel(struct ioat_softc *ioat) { uint64_t status; uint32_t chanerr; int i; ioat_acquire(&ioat->dmaengine); ioat_null(&ioat->dmaengine, NULL, NULL, 0); ioat_release(&ioat->dmaengine); for (i = 0; i < 100; i++) { DELAY(1); status = ioat_get_chansts(ioat); if (is_ioat_idle(status)) return (0); } chanerr = ioat_read_4(ioat, IOAT_CHANERR_OFFSET); ioat_log_message(0, "could not start channel: " "status = %#jx error = %b\n", (uintmax_t)status, (int)chanerr, IOAT_CHANERR_STR); return (ENXIO); } /* * Initialize Hardware */ static int ioat3_attach(device_t device) { struct ioat_softc *ioat; struct ioat_descriptor **ring; struct ioat_descriptor *next; struct ioat_dma_hw_descriptor *dma_hw_desc; int i, num_descriptors; int error; uint8_t xfercap; error = 0; ioat = DEVICE2SOFTC(device); ioat->capabilities = ioat_read_dmacapability(ioat); ioat_log_message(1, "Capabilities: %b\n", (int)ioat->capabilities, IOAT_DMACAP_STR); xfercap = ioat_read_xfercap(ioat); ioat->max_xfer_size = 1 << xfercap; ioat->intrdelay_supported = (ioat_read_2(ioat, IOAT_INTRDELAY_OFFSET) & IOAT_INTRDELAY_SUPPORTED) != 0; if (ioat->intrdelay_supported) ioat->intrdelay_max = IOAT_INTRDELAY_US_MASK; /* TODO: need to check DCA here if we ever do XOR/PQ */ mtx_init(&ioat->submit_lock, "ioat_submit", NULL, MTX_DEF); mtx_init(&ioat->cleanup_lock, "ioat_cleanup", NULL, MTX_DEF); callout_init(&ioat->timer, 1); TASK_INIT(&ioat->reset_task, 0, ioat_reset_hw_task, ioat); /* Establish lock order for Witness */ mtx_lock(&ioat->submit_lock); mtx_lock(&ioat->cleanup_lock); mtx_unlock(&ioat->cleanup_lock); mtx_unlock(&ioat->submit_lock); ioat->is_resize_pending = FALSE; ioat->is_completion_pending = FALSE; ioat->is_reset_pending = FALSE; ioat->is_channel_running = FALSE; bus_dma_tag_create(bus_get_dma_tag(ioat->device), sizeof(uint64_t), 0x0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, sizeof(uint64_t), 1, sizeof(uint64_t), 0, NULL, NULL, &ioat->comp_update_tag); error = bus_dmamem_alloc(ioat->comp_update_tag, (void **)&ioat->comp_update, BUS_DMA_ZERO, &ioat->comp_update_map); if (ioat->comp_update == NULL) return (ENOMEM); error = bus_dmamap_load(ioat->comp_update_tag, ioat->comp_update_map, ioat->comp_update, sizeof(uint64_t), ioat_comp_update_map, ioat, 0); if (error != 0) return (error); ioat->ring_size_order = IOAT_MIN_ORDER; num_descriptors = 1 << ioat->ring_size_order; bus_dma_tag_create(bus_get_dma_tag(ioat->device), 0x40, 0x0, BUS_SPACE_MAXADDR_40BIT, BUS_SPACE_MAXADDR, NULL, NULL, sizeof(struct ioat_dma_hw_descriptor), 1, sizeof(struct ioat_dma_hw_descriptor), 0, NULL, NULL, &ioat->hw_desc_tag); ioat->ring = malloc(num_descriptors * sizeof(*ring), M_IOAT, M_ZERO | M_WAITOK); - if (ioat->ring == NULL) - return (ENOMEM); ring = ioat->ring; for (i = 0; i < num_descriptors; i++) { ring[i] = ioat_alloc_ring_entry(ioat, M_WAITOK); if (ring[i] == NULL) return (ENOMEM); ring[i]->id = i; } for (i = 0; i < num_descriptors - 1; i++) { next = ring[i + 1]; dma_hw_desc = ring[i]->u.dma; dma_hw_desc->next = next->hw_desc_bus_addr; } ring[i]->u.dma->next = ring[0]->hw_desc_bus_addr; ioat->head = ioat->hw_head = 0; ioat->tail = 0; ioat->last_seen = 0; return (0); } static int ioat_map_pci_bar(struct ioat_softc *ioat) { ioat->pci_resource_id = PCIR_BAR(0); ioat->pci_resource = bus_alloc_resource_any(ioat->device, SYS_RES_MEMORY, &ioat->pci_resource_id, RF_ACTIVE); if (ioat->pci_resource == NULL) { ioat_log_message(0, "unable to allocate pci resource\n"); return (ENODEV); } ioat->pci_bus_tag = rman_get_bustag(ioat->pci_resource); ioat->pci_bus_handle = rman_get_bushandle(ioat->pci_resource); return (0); } static void ioat_comp_update_map(void *arg, bus_dma_segment_t *seg, int nseg, int error) { struct ioat_softc *ioat = arg; KASSERT(error == 0, ("%s: error:%d", __func__, error)); ioat->comp_update_bus_addr = seg[0].ds_addr; } static void ioat_dmamap_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error) { bus_addr_t *baddr; KASSERT(error == 0, ("%s: error:%d", __func__, error)); baddr = arg; *baddr = segs->ds_addr; } /* * Interrupt setup and handlers */ static int ioat_setup_intr(struct ioat_softc *ioat) { uint32_t num_vectors; int error; boolean_t use_msix; boolean_t force_legacy_interrupts; use_msix = FALSE; force_legacy_interrupts = FALSE; if (!g_force_legacy_interrupts && pci_msix_count(ioat->device) >= 1) { num_vectors = 1; pci_alloc_msix(ioat->device, &num_vectors); if (num_vectors == 1) use_msix = TRUE; } if (use_msix) { ioat->rid = 1; ioat->res = bus_alloc_resource_any(ioat->device, SYS_RES_IRQ, &ioat->rid, RF_ACTIVE); } else { ioat->rid = 0; ioat->res = bus_alloc_resource_any(ioat->device, SYS_RES_IRQ, &ioat->rid, RF_SHAREABLE | RF_ACTIVE); } if (ioat->res == NULL) { ioat_log_message(0, "bus_alloc_resource failed\n"); return (ENOMEM); } ioat->tag = NULL; error = bus_setup_intr(ioat->device, ioat->res, INTR_MPSAFE | INTR_TYPE_MISC, NULL, ioat_interrupt_handler, ioat, &ioat->tag); if (error != 0) { ioat_log_message(0, "bus_setup_intr failed\n"); return (error); } ioat_write_intrctrl(ioat, IOAT_INTRCTRL_MASTER_INT_EN); return (0); } static boolean_t ioat_model_resets_msix(struct ioat_softc *ioat) { u_int32_t pciid; pciid = pci_get_devid(ioat->device); switch (pciid) { /* BWD: */ case 0x0c508086: case 0x0c518086: case 0x0c528086: case 0x0c538086: /* BDXDE: */ case 0x6f508086: case 0x6f518086: case 0x6f528086: case 0x6f538086: return (TRUE); } return (FALSE); } static void ioat_interrupt_handler(void *arg) { struct ioat_softc *ioat = arg; ioat->stats.interrupts++; ioat_process_events(ioat); } static int chanerr_to_errno(uint32_t chanerr) { if (chanerr == 0) return (0); if ((chanerr & (IOAT_CHANERR_XSADDERR | IOAT_CHANERR_XDADDERR)) != 0) return (EFAULT); if ((chanerr & (IOAT_CHANERR_RDERR | IOAT_CHANERR_WDERR)) != 0) return (EIO); /* This one is probably our fault: */ if ((chanerr & IOAT_CHANERR_NDADDERR) != 0) return (EIO); return (EIO); } static void ioat_process_events(struct ioat_softc *ioat) { struct ioat_descriptor *desc; struct bus_dmadesc *dmadesc; uint64_t comp_update, status; uint32_t completed, chanerr; int error; mtx_lock(&ioat->cleanup_lock); completed = 0; comp_update = *ioat->comp_update; status = comp_update & IOAT_CHANSTS_COMPLETED_DESCRIPTOR_MASK; CTR0(KTR_IOAT, __func__); if (status == ioat->last_seen) { /* * If we landed in process_events and nothing has been * completed, check for a timeout due to channel halt. */ comp_update = ioat_get_chansts(ioat); goto out; } while (1) { desc = ioat_get_ring_entry(ioat, ioat->tail); dmadesc = &desc->bus_dmadesc; CTR1(KTR_IOAT, "completing desc %d", ioat->tail); if (dmadesc->callback_fn != NULL) dmadesc->callback_fn(dmadesc->callback_arg, 0); completed++; ioat->tail++; if (desc->hw_desc_bus_addr == status) break; } ioat->last_seen = desc->hw_desc_bus_addr; if (ioat->head == ioat->tail) { ioat->is_completion_pending = FALSE; callout_reset(&ioat->timer, IOAT_INTR_TIMO, ioat_timer_callback, ioat); } ioat->stats.descriptors_processed += completed; out: ioat_write_chanctrl(ioat, IOAT_CHANCTRL_RUN); mtx_unlock(&ioat->cleanup_lock); if (completed != 0) { ioat_putn(ioat, completed, IOAT_ACTIVE_DESCR_REF); wakeup(&ioat->tail); } if (!is_ioat_halted(comp_update) && !is_ioat_suspended(comp_update)) return; ioat->stats.channel_halts++; /* * Fatal programming error on this DMA channel. Flush any outstanding * work with error status and restart the engine. */ ioat_log_message(0, "Channel halted due to fatal programming error\n"); mtx_lock(&ioat->submit_lock); mtx_lock(&ioat->cleanup_lock); ioat->quiescing = TRUE; chanerr = ioat_read_4(ioat, IOAT_CHANERR_OFFSET); ioat_halted_debug(ioat, chanerr); ioat->stats.last_halt_chanerr = chanerr; while (ioat_get_active(ioat) > 0) { desc = ioat_get_ring_entry(ioat, ioat->tail); dmadesc = &desc->bus_dmadesc; CTR1(KTR_IOAT, "completing err desc %d", ioat->tail); if (dmadesc->callback_fn != NULL) dmadesc->callback_fn(dmadesc->callback_arg, chanerr_to_errno(chanerr)); ioat_putn_locked(ioat, 1, IOAT_ACTIVE_DESCR_REF); ioat->tail++; ioat->stats.descriptors_processed++; ioat->stats.descriptors_error++; } /* Clear error status */ ioat_write_4(ioat, IOAT_CHANERR_OFFSET, chanerr); mtx_unlock(&ioat->cleanup_lock); mtx_unlock(&ioat->submit_lock); ioat_log_message(0, "Resetting channel to recover from error\n"); error = taskqueue_enqueue(taskqueue_thread, &ioat->reset_task); KASSERT(error == 0, ("%s: taskqueue_enqueue failed: %d", __func__, error)); } static void ioat_reset_hw_task(void *ctx, int pending __unused) { struct ioat_softc *ioat; int error; ioat = ctx; ioat_log_message(1, "%s: Resetting channel\n", __func__); error = ioat_reset_hw(ioat); KASSERT(error == 0, ("%s: reset failed: %d", __func__, error)); (void)error; } /* * User API functions */ bus_dmaengine_t ioat_get_dmaengine(uint32_t index, int flags) { struct ioat_softc *ioat; KASSERT((flags & ~(M_NOWAIT | M_WAITOK)) == 0, ("invalid flags: 0x%08x", flags)); KASSERT((flags & (M_NOWAIT | M_WAITOK)) != (M_NOWAIT | M_WAITOK), ("invalid wait | nowait")); if (index >= ioat_channel_index) return (NULL); ioat = ioat_channel[index]; if (ioat == NULL || ioat->destroying) return (NULL); if (ioat->quiescing) { if ((flags & M_NOWAIT) != 0) return (NULL); mtx_lock(IOAT_REFLK); while (ioat->quiescing && !ioat->destroying) msleep(&ioat->quiescing, IOAT_REFLK, 0, "getdma", 0); mtx_unlock(IOAT_REFLK); if (ioat->destroying) return (NULL); } /* * There's a race here between the quiescing check and HW reset or * module destroy. */ return (&ioat_get(ioat, IOAT_DMAENGINE_REF)->dmaengine); } void ioat_put_dmaengine(bus_dmaengine_t dmaengine) { struct ioat_softc *ioat; ioat = to_ioat_softc(dmaengine); ioat_put(ioat, IOAT_DMAENGINE_REF); } int ioat_get_hwversion(bus_dmaengine_t dmaengine) { struct ioat_softc *ioat; ioat = to_ioat_softc(dmaengine); return (ioat->version); } size_t ioat_get_max_io_size(bus_dmaengine_t dmaengine) { struct ioat_softc *ioat; ioat = to_ioat_softc(dmaengine); return (ioat->max_xfer_size); } int ioat_set_interrupt_coalesce(bus_dmaengine_t dmaengine, uint16_t delay) { struct ioat_softc *ioat; ioat = to_ioat_softc(dmaengine); if (!ioat->intrdelay_supported) return (ENODEV); if (delay > ioat->intrdelay_max) return (ERANGE); ioat_write_2(ioat, IOAT_INTRDELAY_OFFSET, delay); ioat->cached_intrdelay = ioat_read_2(ioat, IOAT_INTRDELAY_OFFSET) & IOAT_INTRDELAY_US_MASK; return (0); } uint16_t ioat_get_max_coalesce_period(bus_dmaengine_t dmaengine) { struct ioat_softc *ioat; ioat = to_ioat_softc(dmaengine); return (ioat->intrdelay_max); } void ioat_acquire(bus_dmaengine_t dmaengine) { struct ioat_softc *ioat; ioat = to_ioat_softc(dmaengine); mtx_lock(&ioat->submit_lock); CTR0(KTR_IOAT, __func__); } int ioat_acquire_reserve(bus_dmaengine_t dmaengine, unsigned n, int mflags) { struct ioat_softc *ioat; int error; ioat = to_ioat_softc(dmaengine); ioat_acquire(dmaengine); error = ioat_reserve_space(ioat, n, mflags); if (error != 0) ioat_release(dmaengine); return (error); } void ioat_release(bus_dmaengine_t dmaengine) { struct ioat_softc *ioat; ioat = to_ioat_softc(dmaengine); CTR0(KTR_IOAT, __func__); ioat_write_2(ioat, IOAT_DMACOUNT_OFFSET, (uint16_t)ioat->hw_head); mtx_unlock(&ioat->submit_lock); } static struct ioat_descriptor * ioat_op_generic(struct ioat_softc *ioat, uint8_t op, uint32_t size, uint64_t src, uint64_t dst, bus_dmaengine_callback_t callback_fn, void *callback_arg, uint32_t flags) { struct ioat_generic_hw_descriptor *hw_desc; struct ioat_descriptor *desc; int mflags; mtx_assert(&ioat->submit_lock, MA_OWNED); KASSERT((flags & ~_DMA_GENERIC_FLAGS) == 0, ("Unrecognized flag(s): %#x", flags & ~_DMA_GENERIC_FLAGS)); if ((flags & DMA_NO_WAIT) != 0) mflags = M_NOWAIT; else mflags = M_WAITOK; if (size > ioat->max_xfer_size) { ioat_log_message(0, "%s: max_xfer_size = %d, requested = %u\n", __func__, ioat->max_xfer_size, (unsigned)size); return (NULL); } if (ioat_reserve_space(ioat, 1, mflags) != 0) return (NULL); desc = ioat_get_ring_entry(ioat, ioat->head); hw_desc = desc->u.generic; hw_desc->u.control_raw = 0; hw_desc->u.control_generic.op = op; hw_desc->u.control_generic.completion_update = 1; if ((flags & DMA_INT_EN) != 0) hw_desc->u.control_generic.int_enable = 1; if ((flags & DMA_FENCE) != 0) hw_desc->u.control_generic.fence = 1; hw_desc->size = size; hw_desc->src_addr = src; hw_desc->dest_addr = dst; desc->bus_dmadesc.callback_fn = callback_fn; desc->bus_dmadesc.callback_arg = callback_arg; return (desc); } struct bus_dmadesc * ioat_null(bus_dmaengine_t dmaengine, bus_dmaengine_callback_t callback_fn, void *callback_arg, uint32_t flags) { struct ioat_dma_hw_descriptor *hw_desc; struct ioat_descriptor *desc; struct ioat_softc *ioat; CTR0(KTR_IOAT, __func__); ioat = to_ioat_softc(dmaengine); desc = ioat_op_generic(ioat, IOAT_OP_COPY, 8, 0, 0, callback_fn, callback_arg, flags); if (desc == NULL) return (NULL); hw_desc = desc->u.dma; hw_desc->u.control.null = 1; ioat_submit_single(ioat); return (&desc->bus_dmadesc); } struct bus_dmadesc * ioat_copy(bus_dmaengine_t dmaengine, bus_addr_t dst, bus_addr_t src, bus_size_t len, bus_dmaengine_callback_t callback_fn, void *callback_arg, uint32_t flags) { struct ioat_dma_hw_descriptor *hw_desc; struct ioat_descriptor *desc; struct ioat_softc *ioat; CTR0(KTR_IOAT, __func__); ioat = to_ioat_softc(dmaengine); if (((src | dst) & (0xffffull << 48)) != 0) { ioat_log_message(0, "%s: High 16 bits of src/dst invalid\n", __func__); return (NULL); } desc = ioat_op_generic(ioat, IOAT_OP_COPY, len, src, dst, callback_fn, callback_arg, flags); if (desc == NULL) return (NULL); hw_desc = desc->u.dma; if (g_ioat_debug_level >= 3) dump_descriptor(hw_desc); ioat_submit_single(ioat); return (&desc->bus_dmadesc); } struct bus_dmadesc * ioat_copy_8k_aligned(bus_dmaengine_t dmaengine, bus_addr_t dst1, bus_addr_t dst2, bus_addr_t src1, bus_addr_t src2, bus_dmaengine_callback_t callback_fn, void *callback_arg, uint32_t flags) { struct ioat_dma_hw_descriptor *hw_desc; struct ioat_descriptor *desc; struct ioat_softc *ioat; CTR0(KTR_IOAT, __func__); ioat = to_ioat_softc(dmaengine); if (((src1 | src2 | dst1 | dst2) & (0xffffull << 48)) != 0) { ioat_log_message(0, "%s: High 16 bits of src/dst invalid\n", __func__); return (NULL); } if (((src1 | src2 | dst1 | dst2) & PAGE_MASK) != 0) { ioat_log_message(0, "%s: Addresses must be page-aligned\n", __func__); return (NULL); } desc = ioat_op_generic(ioat, IOAT_OP_COPY, 2 * PAGE_SIZE, src1, dst1, callback_fn, callback_arg, flags); if (desc == NULL) return (NULL); hw_desc = desc->u.dma; if (src2 != src1 + PAGE_SIZE) { hw_desc->u.control.src_page_break = 1; hw_desc->next_src_addr = src2; } if (dst2 != dst1 + PAGE_SIZE) { hw_desc->u.control.dest_page_break = 1; hw_desc->next_dest_addr = dst2; } if (g_ioat_debug_level >= 3) dump_descriptor(hw_desc); ioat_submit_single(ioat); return (&desc->bus_dmadesc); } struct bus_dmadesc * ioat_copy_crc(bus_dmaengine_t dmaengine, bus_addr_t dst, bus_addr_t src, bus_size_t len, uint32_t *initialseed, bus_addr_t crcptr, bus_dmaengine_callback_t callback_fn, void *callback_arg, uint32_t flags) { struct ioat_crc32_hw_descriptor *hw_desc; struct ioat_descriptor *desc; struct ioat_softc *ioat; uint32_t teststore; uint8_t op; CTR0(KTR_IOAT, __func__); ioat = to_ioat_softc(dmaengine); if ((ioat->capabilities & IOAT_DMACAP_MOVECRC) == 0) { ioat_log_message(0, "%s: Device lacks MOVECRC capability\n", __func__); return (NULL); } if (((src | dst) & (0xffffffull << 40)) != 0) { ioat_log_message(0, "%s: High 24 bits of src/dst invalid\n", __func__); return (NULL); } teststore = (flags & _DMA_CRC_TESTSTORE); if (teststore == _DMA_CRC_TESTSTORE) { ioat_log_message(0, "%s: TEST and STORE invalid\n", __func__); return (NULL); } if (teststore == 0 && (flags & DMA_CRC_INLINE) != 0) { ioat_log_message(0, "%s: INLINE invalid without TEST or STORE\n", __func__); return (NULL); } switch (teststore) { case DMA_CRC_STORE: op = IOAT_OP_MOVECRC_STORE; break; case DMA_CRC_TEST: op = IOAT_OP_MOVECRC_TEST; break; default: KASSERT(teststore == 0, ("bogus")); op = IOAT_OP_MOVECRC; break; } if ((flags & DMA_CRC_INLINE) == 0 && (crcptr & (0xffffffull << 40)) != 0) { ioat_log_message(0, "%s: High 24 bits of crcptr invalid\n", __func__); return (NULL); } desc = ioat_op_generic(ioat, op, len, src, dst, callback_fn, callback_arg, flags & ~_DMA_CRC_FLAGS); if (desc == NULL) return (NULL); hw_desc = desc->u.crc32; if ((flags & DMA_CRC_INLINE) == 0) hw_desc->crc_address = crcptr; else hw_desc->u.control.crc_location = 1; if (initialseed != NULL) { hw_desc->u.control.use_seed = 1; hw_desc->seed = *initialseed; } if (g_ioat_debug_level >= 3) dump_descriptor(hw_desc); ioat_submit_single(ioat); return (&desc->bus_dmadesc); } struct bus_dmadesc * ioat_crc(bus_dmaengine_t dmaengine, bus_addr_t src, bus_size_t len, uint32_t *initialseed, bus_addr_t crcptr, bus_dmaengine_callback_t callback_fn, void *callback_arg, uint32_t flags) { struct ioat_crc32_hw_descriptor *hw_desc; struct ioat_descriptor *desc; struct ioat_softc *ioat; uint32_t teststore; uint8_t op; CTR0(KTR_IOAT, __func__); ioat = to_ioat_softc(dmaengine); if ((ioat->capabilities & IOAT_DMACAP_CRC) == 0) { ioat_log_message(0, "%s: Device lacks CRC capability\n", __func__); return (NULL); } if ((src & (0xffffffull << 40)) != 0) { ioat_log_message(0, "%s: High 24 bits of src invalid\n", __func__); return (NULL); } teststore = (flags & _DMA_CRC_TESTSTORE); if (teststore == _DMA_CRC_TESTSTORE) { ioat_log_message(0, "%s: TEST and STORE invalid\n", __func__); return (NULL); } if (teststore == 0 && (flags & DMA_CRC_INLINE) != 0) { ioat_log_message(0, "%s: INLINE invalid without TEST or STORE\n", __func__); return (NULL); } switch (teststore) { case DMA_CRC_STORE: op = IOAT_OP_CRC_STORE; break; case DMA_CRC_TEST: op = IOAT_OP_CRC_TEST; break; default: KASSERT(teststore == 0, ("bogus")); op = IOAT_OP_CRC; break; } if ((flags & DMA_CRC_INLINE) == 0 && (crcptr & (0xffffffull << 40)) != 0) { ioat_log_message(0, "%s: High 24 bits of crcptr invalid\n", __func__); return (NULL); } desc = ioat_op_generic(ioat, op, len, src, 0, callback_fn, callback_arg, flags & ~_DMA_CRC_FLAGS); if (desc == NULL) return (NULL); hw_desc = desc->u.crc32; if ((flags & DMA_CRC_INLINE) == 0) hw_desc->crc_address = crcptr; else hw_desc->u.control.crc_location = 1; if (initialseed != NULL) { hw_desc->u.control.use_seed = 1; hw_desc->seed = *initialseed; } if (g_ioat_debug_level >= 3) dump_descriptor(hw_desc); ioat_submit_single(ioat); return (&desc->bus_dmadesc); } struct bus_dmadesc * ioat_blockfill(bus_dmaengine_t dmaengine, bus_addr_t dst, uint64_t fillpattern, bus_size_t len, bus_dmaengine_callback_t callback_fn, void *callback_arg, uint32_t flags) { struct ioat_fill_hw_descriptor *hw_desc; struct ioat_descriptor *desc; struct ioat_softc *ioat; CTR0(KTR_IOAT, __func__); ioat = to_ioat_softc(dmaengine); if ((ioat->capabilities & IOAT_DMACAP_BFILL) == 0) { ioat_log_message(0, "%s: Device lacks BFILL capability\n", __func__); return (NULL); } if ((dst & (0xffffull << 48)) != 0) { ioat_log_message(0, "%s: High 16 bits of dst invalid\n", __func__); return (NULL); } desc = ioat_op_generic(ioat, IOAT_OP_FILL, len, fillpattern, dst, callback_fn, callback_arg, flags); if (desc == NULL) return (NULL); hw_desc = desc->u.fill; if (g_ioat_debug_level >= 3) dump_descriptor(hw_desc); ioat_submit_single(ioat); return (&desc->bus_dmadesc); } /* * Ring Management */ static inline uint32_t ioat_get_active(struct ioat_softc *ioat) { return ((ioat->head - ioat->tail) & ((1 << ioat->ring_size_order) - 1)); } static inline uint32_t ioat_get_ring_space(struct ioat_softc *ioat) { return ((1 << ioat->ring_size_order) - ioat_get_active(ioat) - 1); } static struct ioat_descriptor * ioat_alloc_ring_entry(struct ioat_softc *ioat, int mflags) { struct ioat_generic_hw_descriptor *hw_desc; struct ioat_descriptor *desc; int error, busdmaflag; error = ENOMEM; hw_desc = NULL; if ((mflags & M_WAITOK) != 0) busdmaflag = BUS_DMA_WAITOK; else busdmaflag = BUS_DMA_NOWAIT; desc = malloc(sizeof(*desc), M_IOAT, mflags); if (desc == NULL) goto out; bus_dmamem_alloc(ioat->hw_desc_tag, (void **)&hw_desc, BUS_DMA_ZERO | busdmaflag, &ioat->hw_desc_map); if (hw_desc == NULL) goto out; memset(&desc->bus_dmadesc, 0, sizeof(desc->bus_dmadesc)); desc->u.generic = hw_desc; error = bus_dmamap_load(ioat->hw_desc_tag, ioat->hw_desc_map, hw_desc, sizeof(*hw_desc), ioat_dmamap_cb, &desc->hw_desc_bus_addr, busdmaflag); if (error) goto out; out: if (error) { ioat_free_ring_entry(ioat, desc); return (NULL); } return (desc); } static void ioat_free_ring_entry(struct ioat_softc *ioat, struct ioat_descriptor *desc) { if (desc == NULL) return; if (desc->u.generic) bus_dmamem_free(ioat->hw_desc_tag, desc->u.generic, ioat->hw_desc_map); free(desc, M_IOAT); } /* * Reserves space in this IOAT descriptor ring by ensuring enough slots remain * for 'num_descs'. * * If mflags contains M_WAITOK, blocks until enough space is available. * * Returns zero on success, or an errno on error. If num_descs is beyond the * maximum ring size, returns EINVAl; if allocation would block and mflags * contains M_NOWAIT, returns EAGAIN. * * Must be called with the submit_lock held; returns with the lock held. The * lock may be dropped to allocate the ring. * * (The submit_lock is needed to add any entries to the ring, so callers are * assured enough room is available.) */ static int ioat_reserve_space(struct ioat_softc *ioat, uint32_t num_descs, int mflags) { struct ioat_descriptor **new_ring; uint32_t order; int error; mtx_assert(&ioat->submit_lock, MA_OWNED); error = 0; if (num_descs < 1 || num_descs > (1 << IOAT_MAX_ORDER)) { error = EINVAL; goto out; } if (ioat->quiescing) { error = ENXIO; goto out; } for (;;) { if (ioat_get_ring_space(ioat) >= num_descs) goto out; order = ioat->ring_size_order; if (ioat->is_resize_pending || order == IOAT_MAX_ORDER) { if ((mflags & M_WAITOK) != 0) { msleep(&ioat->tail, &ioat->submit_lock, 0, "ioat_rsz", 0); continue; } error = EAGAIN; break; } ioat->is_resize_pending = TRUE; for (;;) { mtx_unlock(&ioat->submit_lock); new_ring = ioat_prealloc_ring(ioat, 1 << (order + 1), TRUE, mflags); mtx_lock(&ioat->submit_lock); KASSERT(ioat->ring_size_order == order, ("is_resize_pending should protect order")); if (new_ring == NULL) { KASSERT((mflags & M_WAITOK) == 0, ("allocation failed")); error = EAGAIN; break; } error = ring_grow(ioat, order, new_ring); if (error == 0) break; } ioat->is_resize_pending = FALSE; wakeup(&ioat->tail); if (error) break; } out: mtx_assert(&ioat->submit_lock, MA_OWNED); return (error); } static struct ioat_descriptor ** ioat_prealloc_ring(struct ioat_softc *ioat, uint32_t size, boolean_t need_dscr, int mflags) { struct ioat_descriptor **ring; uint32_t i; int error; KASSERT(size > 0 && powerof2(size), ("bogus size")); ring = malloc(size * sizeof(*ring), M_IOAT, M_ZERO | mflags); if (ring == NULL) return (NULL); if (need_dscr) { error = ENOMEM; for (i = size / 2; i < size; i++) { ring[i] = ioat_alloc_ring_entry(ioat, mflags); if (ring[i] == NULL) goto out; ring[i]->id = i; } } error = 0; out: if (error != 0 && ring != NULL) { ioat_free_ring(ioat, size, ring); ring = NULL; } return (ring); } static void ioat_free_ring(struct ioat_softc *ioat, uint32_t size, struct ioat_descriptor **ring) { uint32_t i; for (i = 0; i < size; i++) { if (ring[i] != NULL) ioat_free_ring_entry(ioat, ring[i]); } free(ring, M_IOAT); } static struct ioat_descriptor * ioat_get_ring_entry(struct ioat_softc *ioat, uint32_t index) { return (ioat->ring[index % (1 << ioat->ring_size_order)]); } static int ring_grow(struct ioat_softc *ioat, uint32_t oldorder, struct ioat_descriptor **newring) { struct ioat_descriptor *tmp, *next; struct ioat_dma_hw_descriptor *hw; uint32_t oldsize, newsize, head, tail, i, end; int error; CTR0(KTR_IOAT, __func__); mtx_assert(&ioat->submit_lock, MA_OWNED); if (oldorder != ioat->ring_size_order || oldorder >= IOAT_MAX_ORDER) { error = EINVAL; goto out; } oldsize = (1 << oldorder); newsize = (1 << (oldorder + 1)); mtx_lock(&ioat->cleanup_lock); head = ioat->head & (oldsize - 1); tail = ioat->tail & (oldsize - 1); /* Copy old descriptors to new ring */ for (i = 0; i < oldsize; i++) newring[i] = ioat->ring[i]; /* * If head has wrapped but tail hasn't, we must swap some descriptors * around so that tail can increment directly to head. */ if (head < tail) { for (i = 0; i <= head; i++) { tmp = newring[oldsize + i]; newring[oldsize + i] = newring[i]; newring[oldsize + i]->id = oldsize + i; newring[i] = tmp; newring[i]->id = i; } head += oldsize; } KASSERT(head >= tail, ("invariants")); /* Head didn't wrap; we only need to link in oldsize..newsize */ if (head < oldsize) { i = oldsize - 1; end = newsize; } else { /* Head did wrap; link newhead..newsize and 0..oldhead */ i = head; end = newsize + (head - oldsize) + 1; } /* * Fix up hardware ring, being careful not to trample the active * section (tail -> head). */ for (; i < end; i++) { KASSERT((i & (newsize - 1)) < tail || (i & (newsize - 1)) >= head, ("trampling snake")); next = newring[(i + 1) & (newsize - 1)]; hw = newring[i & (newsize - 1)]->u.dma; hw->next = next->hw_desc_bus_addr; } free(ioat->ring, M_IOAT); ioat->ring = newring; ioat->ring_size_order = oldorder + 1; ioat->tail = tail; ioat->head = head; error = 0; mtx_unlock(&ioat->cleanup_lock); out: if (error) ioat_free_ring(ioat, (1 << (oldorder + 1)), newring); return (error); } static int ring_shrink(struct ioat_softc *ioat, uint32_t oldorder, struct ioat_descriptor **newring) { struct ioat_dma_hw_descriptor *hw; struct ioat_descriptor *ent, *next; uint32_t oldsize, newsize, current_idx, new_idx, i; int error; CTR0(KTR_IOAT, __func__); mtx_assert(&ioat->submit_lock, MA_OWNED); if (oldorder != ioat->ring_size_order || oldorder <= IOAT_MIN_ORDER) { error = EINVAL; goto out_unlocked; } oldsize = (1 << oldorder); newsize = (1 << (oldorder - 1)); mtx_lock(&ioat->cleanup_lock); /* Can't shrink below current active set! */ if (ioat_get_active(ioat) >= newsize) { error = ENOMEM; goto out; } /* * Copy current descriptors to the new ring, dropping the removed * descriptors. */ for (i = 0; i < newsize; i++) { current_idx = (ioat->tail + i) & (oldsize - 1); new_idx = (ioat->tail + i) & (newsize - 1); newring[new_idx] = ioat->ring[current_idx]; newring[new_idx]->id = new_idx; } /* Free deleted descriptors */ for (i = newsize; i < oldsize; i++) { ent = ioat_get_ring_entry(ioat, ioat->tail + i); ioat_free_ring_entry(ioat, ent); } /* Fix up hardware ring. */ hw = newring[(ioat->tail + newsize - 1) & (newsize - 1)]->u.dma; next = newring[(ioat->tail + newsize) & (newsize - 1)]; hw->next = next->hw_desc_bus_addr; free(ioat->ring, M_IOAT); ioat->ring = newring; ioat->ring_size_order = oldorder - 1; error = 0; out: mtx_unlock(&ioat->cleanup_lock); out_unlocked: if (error) ioat_free_ring(ioat, (1 << (oldorder - 1)), newring); return (error); } static void ioat_halted_debug(struct ioat_softc *ioat, uint32_t chanerr) { struct ioat_descriptor *desc; ioat_log_message(0, "Channel halted (%b)\n", (int)chanerr, IOAT_CHANERR_STR); if (chanerr == 0) return; mtx_assert(&ioat->cleanup_lock, MA_OWNED); desc = ioat_get_ring_entry(ioat, ioat->tail + 0); dump_descriptor(desc->u.raw); desc = ioat_get_ring_entry(ioat, ioat->tail + 1); dump_descriptor(desc->u.raw); } static void ioat_timer_callback(void *arg) { struct ioat_descriptor **newring; struct ioat_softc *ioat; uint32_t order; ioat = arg; ioat_log_message(1, "%s\n", __func__); if (ioat->is_completion_pending) { ioat_process_events(ioat); return; } /* Slowly scale the ring down if idle. */ mtx_lock(&ioat->submit_lock); order = ioat->ring_size_order; if (ioat->is_resize_pending || order == IOAT_MIN_ORDER) { mtx_unlock(&ioat->submit_lock); goto out; } ioat->is_resize_pending = TRUE; mtx_unlock(&ioat->submit_lock); newring = ioat_prealloc_ring(ioat, 1 << (order - 1), FALSE, M_NOWAIT); mtx_lock(&ioat->submit_lock); KASSERT(ioat->ring_size_order == order, ("resize_pending protects order")); if (newring != NULL) ring_shrink(ioat, order, newring); ioat->is_resize_pending = FALSE; mtx_unlock(&ioat->submit_lock); out: if (ioat->ring_size_order > IOAT_MIN_ORDER) callout_reset(&ioat->timer, 10 * hz, ioat_timer_callback, ioat); } /* * Support Functions */ static void ioat_submit_single(struct ioat_softc *ioat) { ioat_get(ioat, IOAT_ACTIVE_DESCR_REF); atomic_add_rel_int(&ioat->head, 1); atomic_add_rel_int(&ioat->hw_head, 1); if (!ioat->is_completion_pending) { ioat->is_completion_pending = TRUE; callout_reset(&ioat->timer, IOAT_INTR_TIMO, ioat_timer_callback, ioat); } ioat->stats.descriptors_submitted++; } static int ioat_reset_hw(struct ioat_softc *ioat) { uint64_t status; uint32_t chanerr; unsigned timeout; int error; mtx_lock(IOAT_REFLK); ioat->quiescing = TRUE; ioat_drain_locked(ioat); mtx_unlock(IOAT_REFLK); status = ioat_get_chansts(ioat); if (is_ioat_active(status) || is_ioat_idle(status)) ioat_suspend(ioat); /* Wait at most 20 ms */ for (timeout = 0; (is_ioat_active(status) || is_ioat_idle(status)) && timeout < 20; timeout++) { DELAY(1000); status = ioat_get_chansts(ioat); } if (timeout == 20) { error = ETIMEDOUT; goto out; } KASSERT(ioat_get_active(ioat) == 0, ("active after quiesce")); chanerr = ioat_read_4(ioat, IOAT_CHANERR_OFFSET); ioat_write_4(ioat, IOAT_CHANERR_OFFSET, chanerr); /* * IOAT v3 workaround - CHANERRMSK_INT with 3E07h to masks out errors * that can cause stability issues for IOAT v3. */ pci_write_config(ioat->device, IOAT_CFG_CHANERRMASK_INT_OFFSET, 0x3e07, 4); chanerr = pci_read_config(ioat->device, IOAT_CFG_CHANERR_INT_OFFSET, 4); pci_write_config(ioat->device, IOAT_CFG_CHANERR_INT_OFFSET, chanerr, 4); /* * BDXDE and BWD models reset MSI-X registers on device reset. * Save/restore their contents manually. */ if (ioat_model_resets_msix(ioat)) { ioat_log_message(1, "device resets MSI-X registers; saving\n"); pci_save_state(ioat->device); } ioat_reset(ioat); /* Wait at most 20 ms */ for (timeout = 0; ioat_reset_pending(ioat) && timeout < 20; timeout++) DELAY(1000); if (timeout == 20) { error = ETIMEDOUT; goto out; } if (ioat_model_resets_msix(ioat)) { ioat_log_message(1, "device resets registers; restored\n"); pci_restore_state(ioat->device); } /* Reset attempts to return the hardware to "halted." */ status = ioat_get_chansts(ioat); if (is_ioat_active(status) || is_ioat_idle(status)) { /* So this really shouldn't happen... */ ioat_log_message(0, "Device is active after a reset?\n"); ioat_write_chanctrl(ioat, IOAT_CHANCTRL_RUN); error = 0; goto out; } chanerr = ioat_read_4(ioat, IOAT_CHANERR_OFFSET); if (chanerr != 0) { mtx_lock(&ioat->cleanup_lock); ioat_halted_debug(ioat, chanerr); mtx_unlock(&ioat->cleanup_lock); error = EIO; goto out; } /* * Bring device back online after reset. Writing CHAINADDR brings the * device back to active. * * The internal ring counter resets to zero, so we have to start over * at zero as well. */ ioat->tail = ioat->head = ioat->hw_head = 0; ioat->last_seen = 0; ioat_write_chanctrl(ioat, IOAT_CHANCTRL_RUN); ioat_write_chancmp(ioat, ioat->comp_update_bus_addr); ioat_write_chainaddr(ioat, ioat->ring[0]->hw_desc_bus_addr); error = 0; out: mtx_lock(IOAT_REFLK); ioat->quiescing = FALSE; wakeup(&ioat->quiescing); mtx_unlock(IOAT_REFLK); if (error == 0) error = ioat_start_channel(ioat); return (error); } static int sysctl_handle_chansts(SYSCTL_HANDLER_ARGS) { struct ioat_softc *ioat; struct sbuf sb; uint64_t status; int error; ioat = arg1; status = ioat_get_chansts(ioat) & IOAT_CHANSTS_STATUS; sbuf_new_for_sysctl(&sb, NULL, 256, req); switch (status) { case IOAT_CHANSTS_ACTIVE: sbuf_printf(&sb, "ACTIVE"); break; case IOAT_CHANSTS_IDLE: sbuf_printf(&sb, "IDLE"); break; case IOAT_CHANSTS_SUSPENDED: sbuf_printf(&sb, "SUSPENDED"); break; case IOAT_CHANSTS_HALTED: sbuf_printf(&sb, "HALTED"); break; case IOAT_CHANSTS_ARMED: sbuf_printf(&sb, "ARMED"); break; default: sbuf_printf(&sb, "UNKNOWN"); break; } error = sbuf_finish(&sb); sbuf_delete(&sb); if (error != 0 || req->newptr == NULL) return (error); return (EINVAL); } static int sysctl_handle_dpi(SYSCTL_HANDLER_ARGS) { struct ioat_softc *ioat; struct sbuf sb; #define PRECISION "1" const uintmax_t factor = 10; uintmax_t rate; int error; ioat = arg1; sbuf_new_for_sysctl(&sb, NULL, 16, req); if (ioat->stats.interrupts == 0) { sbuf_printf(&sb, "NaN"); goto out; } rate = ioat->stats.descriptors_processed * factor / ioat->stats.interrupts; sbuf_printf(&sb, "%ju.%." PRECISION "ju", rate / factor, rate % factor); #undef PRECISION out: error = sbuf_finish(&sb); sbuf_delete(&sb); if (error != 0 || req->newptr == NULL) return (error); return (EINVAL); } static int sysctl_handle_error(SYSCTL_HANDLER_ARGS) { struct ioat_descriptor *desc; struct ioat_softc *ioat; int error, arg; ioat = arg1; arg = 0; error = SYSCTL_OUT(req, &arg, sizeof(arg)); if (error != 0 || req->newptr == NULL) return (error); error = SYSCTL_IN(req, &arg, sizeof(arg)); if (error != 0) return (error); if (arg != 0) { ioat_acquire(&ioat->dmaengine); desc = ioat_op_generic(ioat, IOAT_OP_COPY, 1, 0xffff000000000000ull, 0xffff000000000000ull, NULL, NULL, 0); if (desc == NULL) error = ENOMEM; else ioat_submit_single(ioat); ioat_release(&ioat->dmaengine); } return (error); } static int sysctl_handle_reset(SYSCTL_HANDLER_ARGS) { struct ioat_softc *ioat; int error, arg; ioat = arg1; arg = 0; error = SYSCTL_OUT(req, &arg, sizeof(arg)); if (error != 0 || req->newptr == NULL) return (error); error = SYSCTL_IN(req, &arg, sizeof(arg)); if (error != 0) return (error); if (arg != 0) error = ioat_reset_hw(ioat); return (error); } static void dump_descriptor(void *hw_desc) { int i, j; for (i = 0; i < 2; i++) { for (j = 0; j < 8; j++) printf("%08x ", ((uint32_t *)hw_desc)[i * 8 + j]); printf("\n"); } } static void ioat_setup_sysctl(device_t device) { struct sysctl_oid_list *par, *statpar, *state, *hammer; struct sysctl_ctx_list *ctx; struct sysctl_oid *tree, *tmp; struct ioat_softc *ioat; ioat = DEVICE2SOFTC(device); ctx = device_get_sysctl_ctx(device); tree = device_get_sysctl_tree(device); par = SYSCTL_CHILDREN(tree); SYSCTL_ADD_INT(ctx, par, OID_AUTO, "version", CTLFLAG_RD, &ioat->version, 0, "HW version (0xMM form)"); SYSCTL_ADD_UINT(ctx, par, OID_AUTO, "max_xfer_size", CTLFLAG_RD, &ioat->max_xfer_size, 0, "HW maximum transfer size"); SYSCTL_ADD_INT(ctx, par, OID_AUTO, "intrdelay_supported", CTLFLAG_RD, &ioat->intrdelay_supported, 0, "Is INTRDELAY supported"); SYSCTL_ADD_U16(ctx, par, OID_AUTO, "intrdelay_max", CTLFLAG_RD, &ioat->intrdelay_max, 0, "Maximum configurable INTRDELAY on this channel (microseconds)"); tmp = SYSCTL_ADD_NODE(ctx, par, OID_AUTO, "state", CTLFLAG_RD, NULL, "IOAT channel internal state"); state = SYSCTL_CHILDREN(tmp); SYSCTL_ADD_UINT(ctx, state, OID_AUTO, "ring_size_order", CTLFLAG_RD, &ioat->ring_size_order, 0, "SW descriptor ring size order"); SYSCTL_ADD_UINT(ctx, state, OID_AUTO, "head", CTLFLAG_RD, &ioat->head, 0, "SW descriptor head pointer index"); SYSCTL_ADD_UINT(ctx, state, OID_AUTO, "tail", CTLFLAG_RD, &ioat->tail, 0, "SW descriptor tail pointer index"); SYSCTL_ADD_UINT(ctx, state, OID_AUTO, "hw_head", CTLFLAG_RD, &ioat->hw_head, 0, "HW DMACOUNT"); SYSCTL_ADD_UQUAD(ctx, state, OID_AUTO, "last_completion", CTLFLAG_RD, ioat->comp_update, "HW addr of last completion"); SYSCTL_ADD_INT(ctx, state, OID_AUTO, "is_resize_pending", CTLFLAG_RD, &ioat->is_resize_pending, 0, "resize pending"); SYSCTL_ADD_INT(ctx, state, OID_AUTO, "is_completion_pending", CTLFLAG_RD, &ioat->is_completion_pending, 0, "completion pending"); SYSCTL_ADD_INT(ctx, state, OID_AUTO, "is_reset_pending", CTLFLAG_RD, &ioat->is_reset_pending, 0, "reset pending"); SYSCTL_ADD_INT(ctx, state, OID_AUTO, "is_channel_running", CTLFLAG_RD, &ioat->is_channel_running, 0, "channel running"); SYSCTL_ADD_PROC(ctx, state, OID_AUTO, "chansts", CTLTYPE_STRING | CTLFLAG_RD, ioat, 0, sysctl_handle_chansts, "A", "String of the channel status"); SYSCTL_ADD_U16(ctx, state, OID_AUTO, "intrdelay", CTLFLAG_RD, &ioat->cached_intrdelay, 0, "Current INTRDELAY on this channel (cached, microseconds)"); tmp = SYSCTL_ADD_NODE(ctx, par, OID_AUTO, "hammer", CTLFLAG_RD, NULL, "Big hammers (mostly for testing)"); hammer = SYSCTL_CHILDREN(tmp); SYSCTL_ADD_PROC(ctx, hammer, OID_AUTO, "force_hw_reset", CTLTYPE_INT | CTLFLAG_RW, ioat, 0, sysctl_handle_reset, "I", "Set to non-zero to reset the hardware"); SYSCTL_ADD_PROC(ctx, hammer, OID_AUTO, "force_hw_error", CTLTYPE_INT | CTLFLAG_RW, ioat, 0, sysctl_handle_error, "I", "Set to non-zero to inject a recoverable hardware error"); tmp = SYSCTL_ADD_NODE(ctx, par, OID_AUTO, "stats", CTLFLAG_RD, NULL, "IOAT channel statistics"); statpar = SYSCTL_CHILDREN(tmp); SYSCTL_ADD_UQUAD(ctx, statpar, OID_AUTO, "interrupts", CTLFLAG_RW, &ioat->stats.interrupts, "Number of interrupts processed on this channel"); SYSCTL_ADD_UQUAD(ctx, statpar, OID_AUTO, "descriptors", CTLFLAG_RW, &ioat->stats.descriptors_processed, "Number of descriptors processed on this channel"); SYSCTL_ADD_UQUAD(ctx, statpar, OID_AUTO, "submitted", CTLFLAG_RW, &ioat->stats.descriptors_submitted, "Number of descriptors submitted to this channel"); SYSCTL_ADD_UQUAD(ctx, statpar, OID_AUTO, "errored", CTLFLAG_RW, &ioat->stats.descriptors_error, "Number of descriptors failed by channel errors"); SYSCTL_ADD_U32(ctx, statpar, OID_AUTO, "halts", CTLFLAG_RW, &ioat->stats.channel_halts, 0, "Number of times the channel has halted"); SYSCTL_ADD_U32(ctx, statpar, OID_AUTO, "last_halt_chanerr", CTLFLAG_RW, &ioat->stats.last_halt_chanerr, 0, "The raw CHANERR when the channel was last halted"); SYSCTL_ADD_PROC(ctx, statpar, OID_AUTO, "desc_per_interrupt", CTLTYPE_STRING | CTLFLAG_RD, ioat, 0, sysctl_handle_dpi, "A", "Descriptors per interrupt"); } static inline struct ioat_softc * ioat_get(struct ioat_softc *ioat, enum ioat_ref_kind kind) { uint32_t old; KASSERT(kind < IOAT_NUM_REF_KINDS, ("bogus")); old = atomic_fetchadd_32(&ioat->refcnt, 1); KASSERT(old < UINT32_MAX, ("refcnt overflow")); #ifdef INVARIANTS old = atomic_fetchadd_32(&ioat->refkinds[kind], 1); KASSERT(old < UINT32_MAX, ("refcnt kind overflow")); #endif return (ioat); } static inline void ioat_putn(struct ioat_softc *ioat, uint32_t n, enum ioat_ref_kind kind) { _ioat_putn(ioat, n, kind, FALSE); } static inline void ioat_putn_locked(struct ioat_softc *ioat, uint32_t n, enum ioat_ref_kind kind) { _ioat_putn(ioat, n, kind, TRUE); } static inline void _ioat_putn(struct ioat_softc *ioat, uint32_t n, enum ioat_ref_kind kind, boolean_t locked) { uint32_t old; KASSERT(kind < IOAT_NUM_REF_KINDS, ("bogus")); if (n == 0) return; #ifdef INVARIANTS old = atomic_fetchadd_32(&ioat->refkinds[kind], -n); KASSERT(old >= n, ("refcnt kind underflow")); #endif /* Skip acquiring the lock if resulting refcnt > 0. */ for (;;) { old = ioat->refcnt; if (old <= n) break; if (atomic_cmpset_32(&ioat->refcnt, old, old - n)) return; } if (locked) mtx_assert(IOAT_REFLK, MA_OWNED); else mtx_lock(IOAT_REFLK); old = atomic_fetchadd_32(&ioat->refcnt, -n); KASSERT(old >= n, ("refcnt error")); if (old == n) wakeup(IOAT_REFLK); if (!locked) mtx_unlock(IOAT_REFLK); } static inline void ioat_put(struct ioat_softc *ioat, enum ioat_ref_kind kind) { ioat_putn(ioat, 1, kind); } static void ioat_drain_locked(struct ioat_softc *ioat) { mtx_assert(IOAT_REFLK, MA_OWNED); while (ioat->refcnt > 0) msleep(IOAT_REFLK, IOAT_REFLK, 0, "ioat_drain", 0); } Index: head/sys/dev/xen/evtchn/evtchn_dev.c =================================================================== --- head/sys/dev/xen/evtchn/evtchn_dev.c (revision 299352) +++ head/sys/dev/xen/evtchn/evtchn_dev.c (revision 299353) @@ -1,607 +1,605 @@ /****************************************************************************** * evtchn.c * * Driver for receiving and demuxing event-channel signals. * * Copyright (c) 2004-2005, K A Fraser * Multi-process extensions Copyright (c) 2004, Steven Smith * FreeBSD port Copyright (c) 2014, Roger Pau Monné * Fetched from git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git * File: drivers/xen/evtchn.c * Git commit: 0dc0064add422bc0ef5165ebe9ece3052bbd457d * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version 2 * as published by the Free Software Foundation; or, when distributed * separately from the Linux kernel or incorporated into other * software packages, subject to the following license: * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this source file (the "Software"), to deal in the Software without * restriction, including without limitation the rights to use, copy, modify, * merge, publish, distribute, sublicense, and/or sell copies of the Software, * and to permit persons to whom the Software is furnished to do so, subject to * the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS * IN THE SOFTWARE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include MALLOC_DEFINE(M_EVTCHN, "evtchn_dev", "Xen event channel user-space device"); struct user_evtchn; static int evtchn_cmp(struct user_evtchn *u1, struct user_evtchn *u2); RB_HEAD(evtchn_tree, user_evtchn); struct per_user_data { struct mtx bind_mutex; /* serialize bind/unbind operations */ struct evtchn_tree evtchns; /* Notification ring, accessed via /dev/xen/evtchn. */ #define EVTCHN_RING_SIZE (PAGE_SIZE / sizeof(evtchn_port_t)) #define EVTCHN_RING_MASK(_i) ((_i)&(EVTCHN_RING_SIZE-1)) evtchn_port_t *ring; unsigned int ring_cons, ring_prod, ring_overflow; struct sx ring_cons_mutex; /* protect against concurrent readers */ struct mtx ring_prod_mutex; /* product against concurrent interrupts */ struct selinfo ev_rsel; }; struct user_evtchn { RB_ENTRY(user_evtchn) node; struct per_user_data *user; evtchn_port_t port; xen_intr_handle_t handle; bool enabled; }; RB_GENERATE_STATIC(evtchn_tree, user_evtchn, node, evtchn_cmp); static device_t evtchn_dev; static d_read_t evtchn_read; static d_write_t evtchn_write; static d_ioctl_t evtchn_ioctl; static d_poll_t evtchn_poll; static d_open_t evtchn_open; static void evtchn_release(void *arg); static struct cdevsw evtchn_devsw = { .d_version = D_VERSION, .d_open = evtchn_open, .d_read = evtchn_read, .d_write = evtchn_write, .d_ioctl = evtchn_ioctl, .d_poll = evtchn_poll, .d_name = "evtchn", }; /*------------------------- Red-black tree helpers ---------------------------*/ static int evtchn_cmp(struct user_evtchn *u1, struct user_evtchn *u2) { return (u1->port - u2->port); } static struct user_evtchn * find_evtchn(struct per_user_data *u, evtchn_port_t port) { struct user_evtchn tmp = { .port = port, }; return (RB_FIND(evtchn_tree, &u->evtchns, &tmp)); } /*--------------------------- Interrupt handlers -----------------------------*/ static int evtchn_filter(void *arg) { struct user_evtchn *evtchn; evtchn = arg; if (!evtchn->enabled && bootverbose) { device_printf(evtchn_dev, "Received upcall for disabled event channel %d\n", evtchn->port); } evtchn_mask_port(evtchn->port); evtchn->enabled = false; return (FILTER_SCHEDULE_THREAD); } static void evtchn_interrupt(void *arg) { struct user_evtchn *evtchn; struct per_user_data *u; evtchn = arg; u = evtchn->user; /* * Protect against concurrent events using this handler * on different CPUs. */ mtx_lock(&u->ring_prod_mutex); if ((u->ring_prod - u->ring_cons) < EVTCHN_RING_SIZE) { u->ring[EVTCHN_RING_MASK(u->ring_prod)] = evtchn->port; wmb(); /* Ensure ring contents visible */ if (u->ring_cons == u->ring_prod++) { wakeup(u); selwakeup(&u->ev_rsel); } } else u->ring_overflow = 1; mtx_unlock(&u->ring_prod_mutex); } /*------------------------- Character device methods -------------------------*/ static int evtchn_open(struct cdev *dev, int flag, int otyp, struct thread *td) { struct per_user_data *u; int error; u = malloc(sizeof(*u), M_EVTCHN, M_WAITOK | M_ZERO); u->ring = malloc(PAGE_SIZE, M_EVTCHN, M_WAITOK | M_ZERO); /* Initialize locks */ mtx_init(&u->bind_mutex, "evtchn_bind_mutex", NULL, MTX_DEF); sx_init(&u->ring_cons_mutex, "evtchn_ringc_sx"); mtx_init(&u->ring_prod_mutex, "evtchn_ringp_mutex", NULL, MTX_DEF); /* Initialize red-black tree. */ RB_INIT(&u->evtchns); /* Assign the allocated per_user_data to this open instance. */ error = devfs_set_cdevpriv(u, evtchn_release); if (error != 0) { mtx_destroy(&u->bind_mutex); mtx_destroy(&u->ring_prod_mutex); sx_destroy(&u->ring_cons_mutex); free(u->ring, M_EVTCHN); free(u, M_EVTCHN); } return (error); } static void evtchn_release(void *arg) { struct per_user_data *u; struct user_evtchn *evtchn, *tmp; u = arg; seldrain(&u->ev_rsel); RB_FOREACH_SAFE(evtchn, evtchn_tree, &u->evtchns, tmp) { xen_intr_unbind(&evtchn->handle); RB_REMOVE(evtchn_tree, &u->evtchns, evtchn); free(evtchn, M_EVTCHN); } mtx_destroy(&u->bind_mutex); mtx_destroy(&u->ring_prod_mutex); sx_destroy(&u->ring_cons_mutex); free(u->ring, M_EVTCHN); free(u, M_EVTCHN); } static int evtchn_read(struct cdev *dev, struct uio *uio, int ioflag) { int error, count; unsigned int c, p, bytes1 = 0, bytes2 = 0; struct per_user_data *u; error = devfs_get_cdevpriv((void **)&u); if (error != 0) return (EINVAL); /* Whole number of ports. */ count = uio->uio_resid; count &= ~(sizeof(evtchn_port_t)-1); if (count == 0) return (0); if (count > PAGE_SIZE) count = PAGE_SIZE; sx_xlock(&u->ring_cons_mutex); for (;;) { error = EFBIG; if (u->ring_overflow) goto unlock_out; c = u->ring_cons; p = u->ring_prod; if (c != p) break; if (ioflag & IO_NDELAY) { sx_xunlock(&u->ring_cons_mutex); return (EWOULDBLOCK); } error = sx_sleep(u, &u->ring_cons_mutex, PCATCH, "evtchw", 0); if ((error != 0) && (error != EWOULDBLOCK)) return (error); } /* Byte lengths of two chunks. Chunk split (if any) is at ring wrap. */ if (((c ^ p) & EVTCHN_RING_SIZE) != 0) { bytes1 = (EVTCHN_RING_SIZE - EVTCHN_RING_MASK(c)) * sizeof(evtchn_port_t); bytes2 = EVTCHN_RING_MASK(p) * sizeof(evtchn_port_t); } else { bytes1 = (p - c) * sizeof(evtchn_port_t); bytes2 = 0; } /* Truncate chunks according to caller's maximum byte count. */ if (bytes1 > count) { bytes1 = count; bytes2 = 0; } else if ((bytes1 + bytes2) > count) { bytes2 = count - bytes1; } error = EFAULT; rmb(); /* Ensure that we see the port before we copy it. */ if (uiomove(&u->ring[EVTCHN_RING_MASK(c)], bytes1, uio) || ((bytes2 != 0) && uiomove(&u->ring[0], bytes2, uio))) goto unlock_out; u->ring_cons += (bytes1 + bytes2) / sizeof(evtchn_port_t); error = 0; unlock_out: sx_xunlock(&u->ring_cons_mutex); return (error); } static int evtchn_write(struct cdev *dev, struct uio *uio, int ioflag) { int error, i, count; evtchn_port_t *kbuf; struct per_user_data *u; error = devfs_get_cdevpriv((void **)&u); if (error != 0) return (EINVAL); kbuf = malloc(PAGE_SIZE, M_EVTCHN, M_WAITOK); - if (kbuf == NULL) - return (ENOMEM); count = uio->uio_resid; /* Whole number of ports. */ count &= ~(sizeof(evtchn_port_t)-1); error = 0; if (count == 0) goto out; if (count > PAGE_SIZE) count = PAGE_SIZE; error = uiomove(kbuf, count, uio); if (error != 0) goto out; mtx_lock(&u->bind_mutex); for (i = 0; i < (count/sizeof(evtchn_port_t)); i++) { evtchn_port_t port = kbuf[i]; struct user_evtchn *evtchn; evtchn = find_evtchn(u, port); if (evtchn && !evtchn->enabled) { evtchn->enabled = true; evtchn_unmask_port(evtchn->port); } } mtx_unlock(&u->bind_mutex); error = 0; out: free(kbuf, M_EVTCHN); return (error); } static inline int evtchn_bind_user_port(struct per_user_data *u, struct user_evtchn *evtchn) { int error; evtchn->port = xen_intr_port(evtchn->handle); evtchn->user = u; evtchn->enabled = true; mtx_lock(&u->bind_mutex); RB_INSERT(evtchn_tree, &u->evtchns, evtchn); mtx_unlock(&u->bind_mutex); error = xen_intr_add_handler(evtchn_dev, evtchn_filter, evtchn_interrupt, evtchn, INTR_TYPE_MISC | INTR_MPSAFE, evtchn->handle); if (error != 0) { xen_intr_unbind(&evtchn->handle); mtx_lock(&u->bind_mutex); RB_REMOVE(evtchn_tree, &u->evtchns, evtchn); mtx_unlock(&u->bind_mutex); free(evtchn, M_EVTCHN); } return (error); } static int evtchn_ioctl(struct cdev *dev, unsigned long cmd, caddr_t arg, int mode, struct thread *td __unused) { struct per_user_data *u; int error; error = devfs_get_cdevpriv((void **)&u); if (error != 0) return (EINVAL); switch (cmd) { case IOCTL_EVTCHN_BIND_VIRQ: { struct ioctl_evtchn_bind_virq *bind; struct user_evtchn *evtchn; evtchn = malloc(sizeof(*evtchn), M_EVTCHN, M_WAITOK | M_ZERO); bind = (struct ioctl_evtchn_bind_virq *)arg; error = xen_intr_bind_virq(evtchn_dev, bind->virq, 0, NULL, NULL, NULL, 0, &evtchn->handle); if (error != 0) { free(evtchn, M_EVTCHN); break; } error = evtchn_bind_user_port(u, evtchn); if (error != 0) break; bind->port = evtchn->port; break; } case IOCTL_EVTCHN_BIND_INTERDOMAIN: { struct ioctl_evtchn_bind_interdomain *bind; struct user_evtchn *evtchn; evtchn = malloc(sizeof(*evtchn), M_EVTCHN, M_WAITOK | M_ZERO); bind = (struct ioctl_evtchn_bind_interdomain *)arg; error = xen_intr_bind_remote_port(evtchn_dev, bind->remote_domain, bind->remote_port, NULL, NULL, NULL, 0, &evtchn->handle); if (error != 0) { free(evtchn, M_EVTCHN); break; } error = evtchn_bind_user_port(u, evtchn); if (error != 0) break; bind->port = evtchn->port; break; } case IOCTL_EVTCHN_BIND_UNBOUND_PORT: { struct ioctl_evtchn_bind_unbound_port *bind; struct user_evtchn *evtchn; evtchn = malloc(sizeof(*evtchn), M_EVTCHN, M_WAITOK | M_ZERO); bind = (struct ioctl_evtchn_bind_unbound_port *)arg; error = xen_intr_alloc_and_bind_local_port(evtchn_dev, bind->remote_domain, NULL, NULL, NULL, 0, &evtchn->handle); if (error != 0) { free(evtchn, M_EVTCHN); break; } error = evtchn_bind_user_port(u, evtchn); if (error != 0) break; bind->port = evtchn->port; break; } case IOCTL_EVTCHN_UNBIND: { struct ioctl_evtchn_unbind *unbind; struct user_evtchn *evtchn; unbind = (struct ioctl_evtchn_unbind *)arg; mtx_lock(&u->bind_mutex); evtchn = find_evtchn(u, unbind->port); if (evtchn == NULL) { error = ENOTCONN; break; } xen_intr_unbind(&evtchn->handle); RB_REMOVE(evtchn_tree, &u->evtchns, evtchn); mtx_unlock(&u->bind_mutex); free(evtchn, M_EVTCHN); error = 0; break; } case IOCTL_EVTCHN_NOTIFY: { struct ioctl_evtchn_notify *notify; struct user_evtchn *evtchn; notify = (struct ioctl_evtchn_notify *)arg; mtx_lock(&u->bind_mutex); evtchn = find_evtchn(u, notify->port); if (evtchn == NULL) { error = ENOTCONN; break; } xen_intr_signal(evtchn->handle); mtx_unlock(&u->bind_mutex); error = 0; break; } case IOCTL_EVTCHN_RESET: { /* Initialise the ring to empty. Clear errors. */ sx_xlock(&u->ring_cons_mutex); mtx_lock(&u->ring_prod_mutex); u->ring_cons = u->ring_prod = u->ring_overflow = 0; mtx_unlock(&u->ring_prod_mutex); sx_xunlock(&u->ring_cons_mutex); error = 0; break; } case FIONBIO: case FIOASYNC: /* Handled in an upper layer */ error = 0; break; default: error = ENOTTY; break; } return (error); } static int evtchn_poll(struct cdev *dev, int events, struct thread *td) { struct per_user_data *u; int error, mask; error = devfs_get_cdevpriv((void **)&u); if (error != 0) return (POLLERR); /* we can always write */ mask = events & (POLLOUT | POLLWRNORM); mtx_lock(&u->ring_prod_mutex); if (events & (POLLIN | POLLRDNORM)) { if (u->ring_cons != u->ring_prod) { mask |= events & (POLLIN | POLLRDNORM); } else { /* Record that someone is waiting */ selrecord(td, &u->ev_rsel); } } mtx_unlock(&u->ring_prod_mutex); return (mask); } /*------------------ Private Device Attachment Functions --------------------*/ static void evtchn_identify(driver_t *driver, device_t parent) { KASSERT((xen_domain()), ("Trying to attach evtchn device on non Xen domain")); evtchn_dev = BUS_ADD_CHILD(parent, 0, "evtchn", 0); if (evtchn_dev == NULL) panic("unable to attach evtchn user-space device"); } static int evtchn_probe(device_t dev) { device_set_desc(dev, "Xen event channel user-space device"); return (BUS_PROBE_NOWILDCARD); } static int evtchn_attach(device_t dev) { make_dev_credf(MAKEDEV_ETERNAL, &evtchn_devsw, 0, NULL, UID_ROOT, GID_WHEEL, 0600, "xen/evtchn"); return (0); } /*-------------------- Private Device Attachment Data -----------------------*/ static device_method_t evtchn_methods[] = { DEVMETHOD(device_identify, evtchn_identify), DEVMETHOD(device_probe, evtchn_probe), DEVMETHOD(device_attach, evtchn_attach), DEVMETHOD_END }; static driver_t evtchn_driver = { "evtchn", evtchn_methods, 0, }; devclass_t evtchn_devclass; DRIVER_MODULE(evtchn, xenpv, evtchn_driver, evtchn_devclass, 0, 0); MODULE_DEPEND(evtchn, xenpv, 1, 1, 1); Index: head/sys/x86/xen/pv.c =================================================================== --- head/sys/x86/xen/pv.c (revision 299352) +++ head/sys/x86/xen/pv.c (revision 299353) @@ -1,441 +1,439 @@ /* * Copyright (c) 2004 Christian Limpach. * Copyright (c) 2004-2006,2008 Kip Macy * Copyright (c) 2008 The NetBSD Foundation, Inc. * Copyright (c) 2013 Roger Pau Monné * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_kstack_pages.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif /* Native initial function */ extern u_int64_t hammer_time(u_int64_t, u_int64_t); /* Xen initial function */ uint64_t hammer_time_xen(start_info_t *, uint64_t); #define MAX_E820_ENTRIES 128 /*--------------------------- Forward Declarations ---------------------------*/ static caddr_t xen_pv_parse_preload_data(u_int64_t); static void xen_pv_parse_memmap(caddr_t, vm_paddr_t *, int *); #ifdef SMP static int xen_pv_start_all_aps(void); #endif /*---------------------------- Extern Declarations ---------------------------*/ #ifdef SMP /* Variables used by amd64 mp_machdep to start APs */ extern char *doublefault_stack; extern char *nmi_stack; #endif /* * Placed by the linker at the end of the bss section, which is the last * section loaded by Xen before loading the symtab and strtab. */ extern uint32_t end; /*-------------------------------- Global Data -------------------------------*/ /* Xen init_ops implementation. */ struct init_ops xen_init_ops = { .parse_preload_data = xen_pv_parse_preload_data, .early_clock_source_init = xen_clock_init, .early_delay = xen_delay, .parse_memmap = xen_pv_parse_memmap, #ifdef SMP .start_all_aps = xen_pv_start_all_aps, #endif .msi_init = xen_msi_init, }; static struct bios_smap xen_smap[MAX_E820_ENTRIES]; /*-------------------------------- Xen PV init -------------------------------*/ /* * First function called by the Xen PVH boot sequence. * * Set some Xen global variables and prepare the environment so it is * as similar as possible to what native FreeBSD init function expects. */ uint64_t hammer_time_xen(start_info_t *si, uint64_t xenstack) { uint64_t physfree; uint64_t *PT4 = (u_int64_t *)xenstack; uint64_t *PT3 = (u_int64_t *)(xenstack + PAGE_SIZE); uint64_t *PT2 = (u_int64_t *)(xenstack + 2 * PAGE_SIZE); int i; xen_domain_type = XEN_PV_DOMAIN; vm_guest = VM_GUEST_XEN; if ((si == NULL) || (xenstack == 0)) { xc_printf("ERROR: invalid start_info or xen stack, halting\n"); HYPERVISOR_shutdown(SHUTDOWN_crash); } xc_printf("FreeBSD PVH running on %s\n", si->magic); /* We use 3 pages of xen stack for the boot pagetables */ physfree = xenstack + 3 * PAGE_SIZE - KERNBASE; /* Setup Xen global variables */ HYPERVISOR_start_info = si; HYPERVISOR_shared_info = (shared_info_t *)(si->shared_info + KERNBASE); /* * Setup some misc global variables for Xen devices * * XXX: Devices that need these specific variables should * be rewritten to fetch this info by themselves from the * start_info page. */ xen_store = (struct xenstore_domain_interface *) (ptoa(si->store_mfn) + KERNBASE); console_page = (char *)(ptoa(si->console.domU.mfn) + KERNBASE); /* * Use the stack Xen gives us to build the page tables * as native FreeBSD expects to find them (created * by the boot trampoline). */ for (i = 0; i < (PAGE_SIZE / sizeof(uint64_t)); i++) { /* * Each slot of the level 4 pages points * to the same level 3 page */ PT4[i] = ((uint64_t)&PT3[0]) - KERNBASE; PT4[i] |= PG_V | PG_RW | PG_U; /* * Each slot of the level 3 pages points * to the same level 2 page */ PT3[i] = ((uint64_t)&PT2[0]) - KERNBASE; PT3[i] |= PG_V | PG_RW | PG_U; /* * The level 2 page slots are mapped with * 2MB pages for 1GB. */ PT2[i] = i * (2 * 1024 * 1024); PT2[i] |= PG_V | PG_RW | PG_PS | PG_U; } load_cr3(((uint64_t)&PT4[0]) - KERNBASE); /* Set the hooks for early functions that diverge from bare metal */ init_ops = xen_init_ops; apic_ops = xen_apic_ops; /* Now we can jump into the native init function */ return (hammer_time(0, physfree)); } /*-------------------------------- PV specific -------------------------------*/ #ifdef SMP static bool start_xen_ap(int cpu) { struct vcpu_guest_context *ctxt; int ms, cpus = mp_naps; const size_t stacksize = kstack_pages * PAGE_SIZE; /* allocate and set up an idle stack data page */ bootstacks[cpu] = (void *)kmem_malloc(kernel_arena, stacksize, M_WAITOK | M_ZERO); doublefault_stack = (char *)kmem_malloc(kernel_arena, PAGE_SIZE, M_WAITOK | M_ZERO); nmi_stack = (char *)kmem_malloc(kernel_arena, PAGE_SIZE, M_WAITOK | M_ZERO); dpcpu = (void *)kmem_malloc(kernel_arena, DPCPU_SIZE, M_WAITOK | M_ZERO); bootSTK = (char *)bootstacks[cpu] + kstack_pages * PAGE_SIZE - 8; bootAP = cpu; ctxt = malloc(sizeof(*ctxt), M_TEMP, M_WAITOK | M_ZERO); - if (ctxt == NULL) - panic("unable to allocate memory"); ctxt->flags = VGCF_IN_KERNEL; ctxt->user_regs.rip = (unsigned long) init_secondary; ctxt->user_regs.rsp = (unsigned long) bootSTK; /* Set the AP to use the same page tables */ ctxt->ctrlreg[3] = KPML4phys; if (HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, ctxt)) panic("unable to initialize AP#%d", cpu); free(ctxt, M_TEMP); /* Launch the vCPU */ if (HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL)) panic("unable to start AP#%d", cpu); /* Wait up to 5 seconds for it to start. */ for (ms = 0; ms < 5000; ms++) { if (mp_naps > cpus) return (true); DELAY(1000); } return (false); } static int xen_pv_start_all_aps(void) { int cpu; mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN); for (cpu = 1; cpu < mp_ncpus; cpu++) { /* attempt to start the Application Processor */ if (!start_xen_ap(cpu)) panic("AP #%d failed to start!", cpu); CPU_SET(cpu, &all_cpus); /* record AP in CPU map */ } return (mp_naps); } #endif /* SMP */ /* * Functions to convert the "extra" parameters passed by Xen * into FreeBSD boot options. */ static void xen_pv_set_env(void) { char *cmd_line_next, *cmd_line; size_t env_size; cmd_line = HYPERVISOR_start_info->cmd_line; env_size = sizeof(HYPERVISOR_start_info->cmd_line); /* Skip leading spaces */ for (; isspace(*cmd_line) && (env_size != 0); cmd_line++) env_size--; /* Replace ',' with '\0' */ for (cmd_line_next = cmd_line; strsep(&cmd_line_next, ",") != NULL;) ; init_static_kenv(cmd_line, 0); } static void xen_pv_set_boothowto(void) { int i; char *env; /* get equivalents from the environment */ for (i = 0; howto_names[i].ev != NULL; i++) { if ((env = kern_getenv(howto_names[i].ev)) != NULL) { boothowto |= howto_names[i].mask; freeenv(env); } } } #ifdef DDB /* * The way Xen loads the symtab is different from the native boot loader, * because it's tailored for NetBSD. So we have to adapt and use the same * method as NetBSD. Portions of the code below have been picked from NetBSD: * sys/kern/kern_ksyms.c CVS Revision 1.71. */ static void xen_pv_parse_symtab(void) { Elf_Ehdr *ehdr; Elf_Shdr *shdr; vm_offset_t sym_end; uint32_t size; int i, j; size = end; sym_end = HYPERVISOR_start_info->mod_start != 0 ? HYPERVISOR_start_info->mod_start : HYPERVISOR_start_info->mfn_list; /* * Make sure the size is right headed, sym_end is just a * high boundary, but at least allows us to fail earlier. */ if ((vm_offset_t)&end + size > sym_end) { xc_printf("Unable to load ELF symtab: size mismatch\n"); return; } ehdr = (Elf_Ehdr *)(&end + 1); if (memcmp(ehdr->e_ident, ELFMAG, SELFMAG) || ehdr->e_ident[EI_CLASS] != ELF_TARG_CLASS || ehdr->e_version > 1) { xc_printf("Unable to load ELF symtab: invalid symbol table\n"); return; } shdr = (Elf_Shdr *)((uint8_t *)ehdr + ehdr->e_shoff); /* Find the symbol table and the corresponding string table. */ for (i = 1; i < ehdr->e_shnum; i++) { if (shdr[i].sh_type != SHT_SYMTAB) continue; if (shdr[i].sh_offset == 0) continue; ksymtab = (uintptr_t)((uint8_t *)ehdr + shdr[i].sh_offset); ksymtab_size = shdr[i].sh_size; j = shdr[i].sh_link; if (shdr[j].sh_offset == 0) continue; /* Can this happen? */ kstrtab = (uintptr_t)((uint8_t *)ehdr + shdr[j].sh_offset); break; } if (ksymtab == 0 || kstrtab == 0) { xc_printf( "Unable to load ELF symtab: could not find symtab or strtab\n"); return; } } #endif static caddr_t xen_pv_parse_preload_data(u_int64_t modulep) { caddr_t kmdp; vm_ooffset_t off; vm_paddr_t metadata; char *envp; if (HYPERVISOR_start_info->mod_start != 0) { preload_metadata = (caddr_t)(HYPERVISOR_start_info->mod_start); kmdp = preload_search_by_type("elf kernel"); if (kmdp == NULL) kmdp = preload_search_by_type("elf64 kernel"); KASSERT(kmdp != NULL, ("unable to find kernel")); /* * Xen has relocated the metadata and the modules, * so we need to recalculate it's position. This is * done by saving the original modulep address and * then calculating the offset with mod_start, * which contains the relocated modulep address. */ metadata = MD_FETCH(kmdp, MODINFOMD_MODULEP, vm_paddr_t); off = HYPERVISOR_start_info->mod_start - metadata; preload_bootstrap_relocate(off); boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int); envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *); if (envp != NULL) envp += off; init_static_kenv(envp, 0); } else { /* Parse the extra boot information given by Xen */ xen_pv_set_env(); xen_pv_set_boothowto(); kmdp = NULL; } #ifdef DDB xen_pv_parse_symtab(); #endif return (kmdp); } static void xen_pv_parse_memmap(caddr_t kmdp, vm_paddr_t *physmap, int *physmap_idx) { struct xen_memory_map memmap; u_int32_t size; int rc; /* Fetch the E820 map from Xen */ memmap.nr_entries = MAX_E820_ENTRIES; set_xen_guest_handle(memmap.buffer, xen_smap); rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap); if (rc) panic("unable to fetch Xen E820 memory map"); size = memmap.nr_entries * sizeof(xen_smap[0]); bios_add_smap_entries(xen_smap, size, physmap, physmap_idx); }