diff --git a/sys/dev/hwpmc/hwpmc_amd.c b/sys/dev/hwpmc/hwpmc_amd.c index c35f5fd4f095..43812951a8e5 100644 --- a/sys/dev/hwpmc/hwpmc_amd.c +++ b/sys/dev/hwpmc/hwpmc_amd.c @@ -1,1240 +1,1238 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2003-2008 Joseph Koshy * Copyright (c) 2007 The FreeBSD Foundation * All rights reserved. * * Portions of this software were developed by A. Joseph Koshy under * sponsorship from the FreeBSD Foundation and Google, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* Support for the AMD K7 and later processors */ #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HWPMC_DEBUG enum pmc_class amd_pmc_class; #endif #define OVERFLOW_WAIT_COUNT 50 DPCPU_DEFINE_STATIC(uint32_t, nmi_counter); /* AMD K7 & K8 PMCs */ struct amd_descr { struct pmc_descr pm_descr; /* "base class" */ uint32_t pm_evsel; /* address of EVSEL register */ uint32_t pm_perfctr; /* address of PERFCTR register */ }; static struct amd_descr amd_pmcdesc[AMD_NPMCS] = { { .pm_descr = { .pd_name = "", .pd_class = -1, .pd_caps = AMD_PMC_CAPS, .pd_width = 48 }, .pm_evsel = AMD_PMC_EVSEL_0, .pm_perfctr = AMD_PMC_PERFCTR_0 }, { .pm_descr = { .pd_name = "", .pd_class = -1, .pd_caps = AMD_PMC_CAPS, .pd_width = 48 }, .pm_evsel = AMD_PMC_EVSEL_1, .pm_perfctr = AMD_PMC_PERFCTR_1 }, { .pm_descr = { .pd_name = "", .pd_class = -1, .pd_caps = AMD_PMC_CAPS, .pd_width = 48 }, .pm_evsel = AMD_PMC_EVSEL_2, .pm_perfctr = AMD_PMC_PERFCTR_2 }, { .pm_descr = { .pd_name = "", .pd_class = -1, .pd_caps = AMD_PMC_CAPS, .pd_width = 48 }, .pm_evsel = AMD_PMC_EVSEL_3, .pm_perfctr = AMD_PMC_PERFCTR_3 }, { .pm_descr = { .pd_name = "", .pd_class = -1, .pd_caps = AMD_PMC_CAPS, .pd_width = 48 }, .pm_evsel = AMD_PMC_EVSEL_4, .pm_perfctr = AMD_PMC_PERFCTR_4 }, { .pm_descr = { .pd_name = "", .pd_class = -1, .pd_caps = AMD_PMC_CAPS, .pd_width = 48 }, .pm_evsel = AMD_PMC_EVSEL_5, .pm_perfctr = AMD_PMC_PERFCTR_5 }, { .pm_descr = { .pd_name = "", .pd_class = -1, .pd_caps = AMD_PMC_CAPS, .pd_width = 48 }, .pm_evsel = AMD_PMC_EVSEL_EP_L3_0, .pm_perfctr = AMD_PMC_PERFCTR_EP_L3_0 }, { .pm_descr = { .pd_name = "", .pd_class = -1, .pd_caps = AMD_PMC_CAPS, .pd_width = 48 }, .pm_evsel = AMD_PMC_EVSEL_EP_L3_1, .pm_perfctr = AMD_PMC_PERFCTR_EP_L3_1 }, { .pm_descr = { .pd_name = "", .pd_class = -1, .pd_caps = AMD_PMC_CAPS, .pd_width = 48 }, .pm_evsel = AMD_PMC_EVSEL_EP_L3_2, .pm_perfctr = AMD_PMC_PERFCTR_EP_L3_2 }, { .pm_descr = { .pd_name = "", .pd_class = -1, .pd_caps = AMD_PMC_CAPS, .pd_width = 48 }, .pm_evsel = AMD_PMC_EVSEL_EP_L3_3, .pm_perfctr = AMD_PMC_PERFCTR_EP_L3_3 }, { .pm_descr = { .pd_name = "", .pd_class = -1, .pd_caps = AMD_PMC_CAPS, .pd_width = 48 }, .pm_evsel = AMD_PMC_EVSEL_EP_L3_4, .pm_perfctr = AMD_PMC_PERFCTR_EP_L3_4 }, { .pm_descr = { .pd_name = "", .pd_class = -1, .pd_caps = AMD_PMC_CAPS, .pd_width = 48 }, .pm_evsel = AMD_PMC_EVSEL_EP_L3_5, .pm_perfctr = AMD_PMC_PERFCTR_EP_L3_5 }, { .pm_descr = { .pd_name = "", .pd_class = -1, .pd_caps = AMD_PMC_CAPS, .pd_width = 48 }, .pm_evsel = AMD_PMC_EVSEL_EP_DF_0, .pm_perfctr = AMD_PMC_PERFCTR_EP_DF_0 }, { .pm_descr = { .pd_name = "", .pd_class = -1, .pd_caps = AMD_PMC_CAPS, .pd_width = 48 }, .pm_evsel = AMD_PMC_EVSEL_EP_DF_1, .pm_perfctr = AMD_PMC_PERFCTR_EP_DF_1 }, { .pm_descr = { .pd_name = "", .pd_class = -1, .pd_caps = AMD_PMC_CAPS, .pd_width = 48 }, .pm_evsel = AMD_PMC_EVSEL_EP_DF_2, .pm_perfctr = AMD_PMC_PERFCTR_EP_DF_2 }, { .pm_descr = { .pd_name = "", .pd_class = -1, .pd_caps = AMD_PMC_CAPS, .pd_width = 48 }, .pm_evsel = AMD_PMC_EVSEL_EP_DF_3, .pm_perfctr = AMD_PMC_PERFCTR_EP_DF_3 } }; struct amd_event_code_map { enum pmc_event pe_ev; /* enum value */ uint16_t pe_code; /* encoded event mask */ uint8_t pe_mask; /* bits allowed in unit mask */ }; const struct amd_event_code_map amd_event_codes[] = { #if defined(__i386__) /* 32 bit Athlon (K7) only */ { PMC_EV_K7_DC_ACCESSES, 0x40, 0 }, { PMC_EV_K7_DC_MISSES, 0x41, 0 }, { PMC_EV_K7_DC_REFILLS_FROM_L2, 0x42, AMD_PMC_UNITMASK_MOESI }, { PMC_EV_K7_DC_REFILLS_FROM_SYSTEM, 0x43, AMD_PMC_UNITMASK_MOESI }, { PMC_EV_K7_DC_WRITEBACKS, 0x44, AMD_PMC_UNITMASK_MOESI }, { PMC_EV_K7_L1_DTLB_MISS_AND_L2_DTLB_HITS, 0x45, 0 }, { PMC_EV_K7_L1_AND_L2_DTLB_MISSES, 0x46, 0 }, { PMC_EV_K7_MISALIGNED_REFERENCES, 0x47, 0 }, { PMC_EV_K7_IC_FETCHES, 0x80, 0 }, { PMC_EV_K7_IC_MISSES, 0x81, 0 }, { PMC_EV_K7_L1_ITLB_MISSES, 0x84, 0 }, { PMC_EV_K7_L1_L2_ITLB_MISSES, 0x85, 0 }, { PMC_EV_K7_RETIRED_INSTRUCTIONS, 0xC0, 0 }, { PMC_EV_K7_RETIRED_OPS, 0xC1, 0 }, { PMC_EV_K7_RETIRED_BRANCHES, 0xC2, 0 }, { PMC_EV_K7_RETIRED_BRANCHES_MISPREDICTED, 0xC3, 0 }, { PMC_EV_K7_RETIRED_TAKEN_BRANCHES, 0xC4, 0 }, { PMC_EV_K7_RETIRED_TAKEN_BRANCHES_MISPREDICTED, 0xC5, 0 }, { PMC_EV_K7_RETIRED_FAR_CONTROL_TRANSFERS, 0xC6, 0 }, { PMC_EV_K7_RETIRED_RESYNC_BRANCHES, 0xC7, 0 }, { PMC_EV_K7_INTERRUPTS_MASKED_CYCLES, 0xCD, 0 }, { PMC_EV_K7_INTERRUPTS_MASKED_WHILE_PENDING_CYCLES, 0xCE, 0 }, { PMC_EV_K7_HARDWARE_INTERRUPTS, 0xCF, 0 }, #endif { PMC_EV_K8_FP_DISPATCHED_FPU_OPS, 0x00, 0x3F }, { PMC_EV_K8_FP_CYCLES_WITH_NO_FPU_OPS_RETIRED, 0x01, 0x00 }, { PMC_EV_K8_FP_DISPATCHED_FPU_FAST_FLAG_OPS, 0x02, 0x00 }, { PMC_EV_K8_LS_SEGMENT_REGISTER_LOAD, 0x20, 0x7F }, { PMC_EV_K8_LS_MICROARCHITECTURAL_RESYNC_BY_SELF_MODIFYING_CODE, 0x21, 0x00 }, { PMC_EV_K8_LS_MICROARCHITECTURAL_RESYNC_BY_SNOOP, 0x22, 0x00 }, { PMC_EV_K8_LS_BUFFER2_FULL, 0x23, 0x00 }, { PMC_EV_K8_LS_LOCKED_OPERATION, 0x24, 0x07 }, { PMC_EV_K8_LS_MICROARCHITECTURAL_LATE_CANCEL, 0x25, 0x00 }, { PMC_EV_K8_LS_RETIRED_CFLUSH_INSTRUCTIONS, 0x26, 0x00 }, { PMC_EV_K8_LS_RETIRED_CPUID_INSTRUCTIONS, 0x27, 0x00 }, { PMC_EV_K8_DC_ACCESS, 0x40, 0x00 }, { PMC_EV_K8_DC_MISS, 0x41, 0x00 }, { PMC_EV_K8_DC_REFILL_FROM_L2, 0x42, 0x1F }, { PMC_EV_K8_DC_REFILL_FROM_SYSTEM, 0x43, 0x1F }, { PMC_EV_K8_DC_COPYBACK, 0x44, 0x1F }, { PMC_EV_K8_DC_L1_DTLB_MISS_AND_L2_DTLB_HIT, 0x45, 0x00 }, { PMC_EV_K8_DC_L1_DTLB_MISS_AND_L2_DTLB_MISS, 0x46, 0x00 }, { PMC_EV_K8_DC_MISALIGNED_DATA_REFERENCE, 0x47, 0x00 }, { PMC_EV_K8_DC_MICROARCHITECTURAL_LATE_CANCEL, 0x48, 0x00 }, { PMC_EV_K8_DC_MICROARCHITECTURAL_EARLY_CANCEL, 0x49, 0x00 }, { PMC_EV_K8_DC_ONE_BIT_ECC_ERROR, 0x4A, 0x03 }, { PMC_EV_K8_DC_DISPATCHED_PREFETCH_INSTRUCTIONS, 0x4B, 0x07 }, { PMC_EV_K8_DC_DCACHE_ACCESSES_BY_LOCKS, 0x4C, 0x03 }, { PMC_EV_K8_BU_CPU_CLK_UNHALTED, 0x76, 0x00 }, { PMC_EV_K8_BU_INTERNAL_L2_REQUEST, 0x7D, 0x1F }, { PMC_EV_K8_BU_FILL_REQUEST_L2_MISS, 0x7E, 0x07 }, { PMC_EV_K8_BU_FILL_INTO_L2, 0x7F, 0x03 }, { PMC_EV_K8_IC_FETCH, 0x80, 0x00 }, { PMC_EV_K8_IC_MISS, 0x81, 0x00 }, { PMC_EV_K8_IC_REFILL_FROM_L2, 0x82, 0x00 }, { PMC_EV_K8_IC_REFILL_FROM_SYSTEM, 0x83, 0x00 }, { PMC_EV_K8_IC_L1_ITLB_MISS_AND_L2_ITLB_HIT, 0x84, 0x00 }, { PMC_EV_K8_IC_L1_ITLB_MISS_AND_L2_ITLB_MISS, 0x85, 0x00 }, { PMC_EV_K8_IC_MICROARCHITECTURAL_RESYNC_BY_SNOOP, 0x86, 0x00 }, { PMC_EV_K8_IC_INSTRUCTION_FETCH_STALL, 0x87, 0x00 }, { PMC_EV_K8_IC_RETURN_STACK_HIT, 0x88, 0x00 }, { PMC_EV_K8_IC_RETURN_STACK_OVERFLOW, 0x89, 0x00 }, { PMC_EV_K8_FR_RETIRED_X86_INSTRUCTIONS, 0xC0, 0x00 }, { PMC_EV_K8_FR_RETIRED_UOPS, 0xC1, 0x00 }, { PMC_EV_K8_FR_RETIRED_BRANCHES, 0xC2, 0x00 }, { PMC_EV_K8_FR_RETIRED_BRANCHES_MISPREDICTED, 0xC3, 0x00 }, { PMC_EV_K8_FR_RETIRED_TAKEN_BRANCHES, 0xC4, 0x00 }, { PMC_EV_K8_FR_RETIRED_TAKEN_BRANCHES_MISPREDICTED, 0xC5, 0x00 }, { PMC_EV_K8_FR_RETIRED_FAR_CONTROL_TRANSFERS, 0xC6, 0x00 }, { PMC_EV_K8_FR_RETIRED_RESYNCS, 0xC7, 0x00 }, { PMC_EV_K8_FR_RETIRED_NEAR_RETURNS, 0xC8, 0x00 }, { PMC_EV_K8_FR_RETIRED_NEAR_RETURNS_MISPREDICTED, 0xC9, 0x00 }, { PMC_EV_K8_FR_RETIRED_TAKEN_BRANCHES_MISPREDICTED_BY_ADDR_MISCOMPARE, 0xCA, 0x00 }, { PMC_EV_K8_FR_RETIRED_FPU_INSTRUCTIONS, 0xCB, 0x0F }, { PMC_EV_K8_FR_RETIRED_FASTPATH_DOUBLE_OP_INSTRUCTIONS, 0xCC, 0x07 }, { PMC_EV_K8_FR_INTERRUPTS_MASKED_CYCLES, 0xCD, 0x00 }, { PMC_EV_K8_FR_INTERRUPTS_MASKED_WHILE_PENDING_CYCLES, 0xCE, 0x00 }, { PMC_EV_K8_FR_TAKEN_HARDWARE_INTERRUPTS, 0xCF, 0x00 }, { PMC_EV_K8_FR_DECODER_EMPTY, 0xD0, 0x00 }, { PMC_EV_K8_FR_DISPATCH_STALLS, 0xD1, 0x00 }, { PMC_EV_K8_FR_DISPATCH_STALL_FROM_BRANCH_ABORT_TO_RETIRE, 0xD2, 0x00 }, { PMC_EV_K8_FR_DISPATCH_STALL_FOR_SERIALIZATION, 0xD3, 0x00 }, { PMC_EV_K8_FR_DISPATCH_STALL_FOR_SEGMENT_LOAD, 0xD4, 0x00 }, { PMC_EV_K8_FR_DISPATCH_STALL_WHEN_REORDER_BUFFER_IS_FULL, 0xD5, 0x00 }, { PMC_EV_K8_FR_DISPATCH_STALL_WHEN_RESERVATION_STATIONS_ARE_FULL, 0xD6, 0x00 }, { PMC_EV_K8_FR_DISPATCH_STALL_WHEN_FPU_IS_FULL, 0xD7, 0x00 }, { PMC_EV_K8_FR_DISPATCH_STALL_WHEN_LS_IS_FULL, 0xD8, 0x00 }, { PMC_EV_K8_FR_DISPATCH_STALL_WHEN_WAITING_FOR_ALL_TO_BE_QUIET, 0xD9, 0x00 }, { PMC_EV_K8_FR_DISPATCH_STALL_WHEN_FAR_XFER_OR_RESYNC_BRANCH_PENDING, 0xDA, 0x00 }, { PMC_EV_K8_FR_FPU_EXCEPTIONS, 0xDB, 0x0F }, { PMC_EV_K8_FR_NUMBER_OF_BREAKPOINTS_FOR_DR0, 0xDC, 0x00 }, { PMC_EV_K8_FR_NUMBER_OF_BREAKPOINTS_FOR_DR1, 0xDD, 0x00 }, { PMC_EV_K8_FR_NUMBER_OF_BREAKPOINTS_FOR_DR2, 0xDE, 0x00 }, { PMC_EV_K8_FR_NUMBER_OF_BREAKPOINTS_FOR_DR3, 0xDF, 0x00 }, { PMC_EV_K8_NB_MEMORY_CONTROLLER_PAGE_ACCESS_EVENT, 0xE0, 0x7 }, { PMC_EV_K8_NB_MEMORY_CONTROLLER_PAGE_TABLE_OVERFLOW, 0xE1, 0x00 }, { PMC_EV_K8_NB_MEMORY_CONTROLLER_DRAM_COMMAND_SLOTS_MISSED, 0xE2, 0x00 }, { PMC_EV_K8_NB_MEMORY_CONTROLLER_TURNAROUND, 0xE3, 0x07 }, { PMC_EV_K8_NB_MEMORY_CONTROLLER_BYPASS_SATURATION, 0xE4, 0x0F }, { PMC_EV_K8_NB_SIZED_COMMANDS, 0xEB, 0x7F }, { PMC_EV_K8_NB_PROBE_RESULT, 0xEC, 0x0F }, { PMC_EV_K8_NB_HT_BUS0_BANDWIDTH, 0xF6, 0x0F }, { PMC_EV_K8_NB_HT_BUS1_BANDWIDTH, 0xF7, 0x0F }, { PMC_EV_K8_NB_HT_BUS2_BANDWIDTH, 0xF8, 0x0F } }; const int amd_event_codes_size = nitems(amd_event_codes); /* * Per-processor information */ struct amd_cpu { struct pmc_hw pc_amdpmcs[AMD_NPMCS]; }; static struct amd_cpu **amd_pcpu; /* * read a pmc register */ static int amd_read_pmc(int cpu, int ri, pmc_value_t *v) { enum pmc_mode mode; const struct amd_descr *pd; struct pmc *pm; pmc_value_t tmp; KASSERT(cpu >= 0 && cpu < pmc_cpu_max(), ("[amd,%d] illegal CPU value %d", __LINE__, cpu)); KASSERT(ri >= 0 && ri < AMD_NPMCS, ("[amd,%d] illegal row-index %d", __LINE__, ri)); KASSERT(amd_pcpu[cpu], ("[amd,%d] null per-cpu, cpu %d", __LINE__, cpu)); pm = amd_pcpu[cpu]->pc_amdpmcs[ri].phw_pmc; pd = &amd_pmcdesc[ri]; KASSERT(pm != NULL, ("[amd,%d] No owner for HWPMC [cpu%d,pmc%d]", __LINE__, cpu, ri)); mode = PMC_TO_MODE(pm); PMCDBG2(MDP,REA,1,"amd-read id=%d class=%d", ri, pd->pm_descr.pd_class); #ifdef HWPMC_DEBUG KASSERT(pd->pm_descr.pd_class == amd_pmc_class, ("[amd,%d] unknown PMC class (%d)", __LINE__, pd->pm_descr.pd_class)); #endif tmp = rdmsr(pd->pm_perfctr); /* RDMSR serializes */ PMCDBG2(MDP,REA,2,"amd-read (pre-munge) id=%d -> %jd", ri, tmp); if (PMC_IS_SAMPLING_MODE(mode)) { /* * Clamp value to 0 if the counter just overflowed, * otherwise the returned reload count would wrap to a * huge value. */ if ((tmp & (1ULL << 47)) == 0) tmp = 0; else { /* Sign extend 48 bit value to 64 bits. */ tmp = (pmc_value_t) ((int64_t)(tmp << 16) >> 16); tmp = AMD_PERFCTR_VALUE_TO_RELOAD_COUNT(tmp); } } *v = tmp; PMCDBG2(MDP,REA,2,"amd-read (post-munge) id=%d -> %jd", ri, *v); return 0; } /* * Write a PMC MSR. */ static int amd_write_pmc(int cpu, int ri, pmc_value_t v) { const struct amd_descr *pd; enum pmc_mode mode; struct pmc *pm; KASSERT(cpu >= 0 && cpu < pmc_cpu_max(), ("[amd,%d] illegal CPU value %d", __LINE__, cpu)); KASSERT(ri >= 0 && ri < AMD_NPMCS, ("[amd,%d] illegal row-index %d", __LINE__, ri)); pm = amd_pcpu[cpu]->pc_amdpmcs[ri].phw_pmc; pd = &amd_pmcdesc[ri]; KASSERT(pm != NULL, ("[amd,%d] PMC not owned (cpu%d,pmc%d)", __LINE__, cpu, ri)); mode = PMC_TO_MODE(pm); #ifdef HWPMC_DEBUG KASSERT(pd->pm_descr.pd_class == amd_pmc_class, ("[amd,%d] unknown PMC class (%d)", __LINE__, pd->pm_descr.pd_class)); #endif /* use 2's complement of the count for sampling mode PMCs */ if (PMC_IS_SAMPLING_MODE(mode)) v = AMD_RELOAD_COUNT_TO_PERFCTR_VALUE(v); PMCDBG3(MDP,WRI,1,"amd-write cpu=%d ri=%d v=%jx", cpu, ri, v); /* write the PMC value */ wrmsr(pd->pm_perfctr, v); return 0; } /* * configure hardware pmc according to the configuration recorded in * pmc 'pm'. */ static int amd_config_pmc(int cpu, int ri, struct pmc *pm) { struct pmc_hw *phw; PMCDBG3(MDP,CFG,1, "cpu=%d ri=%d pm=%p", cpu, ri, pm); KASSERT(cpu >= 0 && cpu < pmc_cpu_max(), ("[amd,%d] illegal CPU value %d", __LINE__, cpu)); KASSERT(ri >= 0 && ri < AMD_NPMCS, ("[amd,%d] illegal row-index %d", __LINE__, ri)); phw = &amd_pcpu[cpu]->pc_amdpmcs[ri]; KASSERT(pm == NULL || phw->phw_pmc == NULL, ("[amd,%d] pm=%p phw->pm=%p hwpmc not unconfigured", __LINE__, pm, phw->phw_pmc)); phw->phw_pmc = pm; return 0; } /* * Retrieve a configured PMC pointer from hardware state. */ static int amd_get_config(int cpu, int ri, struct pmc **ppm) { *ppm = amd_pcpu[cpu]->pc_amdpmcs[ri].phw_pmc; return 0; } /* * Machine dependent actions taken during the context switch in of a * thread. */ static int amd_switch_in(struct pmc_cpu *pc, struct pmc_process *pp) { (void) pc; PMCDBG3(MDP,SWI,1, "pc=%p pp=%p enable-msr=%d", pc, pp, (pp->pp_flags & PMC_PP_ENABLE_MSR_ACCESS) != 0); /* enable the RDPMC instruction if needed */ if (pp->pp_flags & PMC_PP_ENABLE_MSR_ACCESS) load_cr4(rcr4() | CR4_PCE); return 0; } /* * Machine dependent actions taken during the context switch out of a * thread. */ static int amd_switch_out(struct pmc_cpu *pc, struct pmc_process *pp) { (void) pc; (void) pp; /* can be NULL */ PMCDBG3(MDP,SWO,1, "pc=%p pp=%p enable-msr=%d", pc, pp, pp ? (pp->pp_flags & PMC_PP_ENABLE_MSR_ACCESS) == 1 : 0); /* always turn off the RDPMC instruction */ load_cr4(rcr4() & ~CR4_PCE); return 0; } /* * Check if a given allocation is feasible. */ static int amd_allocate_pmc(int cpu, int ri, struct pmc *pm, const struct pmc_op_pmcallocate *a) { int i; uint64_t allowed_unitmask, caps, config, unitmask; enum pmc_event pe; const struct pmc_descr *pd; (void) cpu; KASSERT(cpu >= 0 && cpu < pmc_cpu_max(), ("[amd,%d] illegal CPU value %d", __LINE__, cpu)); KASSERT(ri >= 0 && ri < AMD_NPMCS, ("[amd,%d] illegal row index %d", __LINE__, ri)); pd = &amd_pmcdesc[ri].pm_descr; /* check class match */ if (pd->pd_class != a->pm_class) return EINVAL; caps = pm->pm_caps; PMCDBG2(MDP,ALL,1,"amd-allocate ri=%d caps=0x%x", ri, caps); if((ri >= 0 && ri < 6) && !(a->pm_md.pm_amd.pm_amd_sub_class == PMC_AMD_SUB_CLASS_CORE)) return EINVAL; if((ri >= 6 && ri < 12) && !(a->pm_md.pm_amd.pm_amd_sub_class == PMC_AMD_SUB_CLASS_L3_CACHE)) return EINVAL; if((ri >= 12 && ri < 16) && !(a->pm_md.pm_amd.pm_amd_sub_class == PMC_AMD_SUB_CLASS_DATA_FABRIC)) return EINVAL; if (strlen(pmc_cpuid) != 0) { pm->pm_md.pm_amd.pm_amd_evsel = a->pm_md.pm_amd.pm_amd_config; PMCDBG2(MDP,ALL,2,"amd-allocate ri=%d -> config=0x%x", ri, a->pm_md.pm_amd.pm_amd_config); return (0); } pe = a->pm_ev; /* map ev to the correct event mask code */ config = allowed_unitmask = 0; for (i = 0; i < amd_event_codes_size; i++) if (amd_event_codes[i].pe_ev == pe) { config = AMD_PMC_TO_EVENTMASK(amd_event_codes[i].pe_code); allowed_unitmask = AMD_PMC_TO_UNITMASK(amd_event_codes[i].pe_mask); break; } if (i == amd_event_codes_size) return EINVAL; unitmask = a->pm_md.pm_amd.pm_amd_config & AMD_PMC_UNITMASK; if (unitmask & ~allowed_unitmask) /* disallow reserved bits */ return EINVAL; if (unitmask && (caps & PMC_CAP_QUALIFIER)) config |= unitmask; if (caps & PMC_CAP_THRESHOLD) config |= a->pm_md.pm_amd.pm_amd_config & AMD_PMC_COUNTERMASK; /* set at least one of the 'usr' or 'os' caps */ if (caps & PMC_CAP_USER) config |= AMD_PMC_USR; if (caps & PMC_CAP_SYSTEM) config |= AMD_PMC_OS; if ((caps & (PMC_CAP_USER|PMC_CAP_SYSTEM)) == 0) config |= (AMD_PMC_USR|AMD_PMC_OS); if (caps & PMC_CAP_EDGE) config |= AMD_PMC_EDGE; if (caps & PMC_CAP_INVERT) config |= AMD_PMC_INVERT; if (caps & PMC_CAP_INTERRUPT) config |= AMD_PMC_INT; pm->pm_md.pm_amd.pm_amd_evsel = config; /* save config value */ PMCDBG2(MDP,ALL,2,"amd-allocate ri=%d -> config=0x%x", ri, config); return 0; } /* * Release machine dependent state associated with a PMC. This is a * no-op on this architecture. * */ /* ARGSUSED0 */ static int amd_release_pmc(int cpu, int ri, struct pmc *pmc) { #ifdef HWPMC_DEBUG const struct amd_descr *pd; #endif struct pmc_hw *phw __diagused; (void) pmc; KASSERT(cpu >= 0 && cpu < pmc_cpu_max(), ("[amd,%d] illegal CPU value %d", __LINE__, cpu)); KASSERT(ri >= 0 && ri < AMD_NPMCS, ("[amd,%d] illegal row-index %d", __LINE__, ri)); phw = &amd_pcpu[cpu]->pc_amdpmcs[ri]; KASSERT(phw->phw_pmc == NULL, ("[amd,%d] PHW pmc %p non-NULL", __LINE__, phw->phw_pmc)); #ifdef HWPMC_DEBUG pd = &amd_pmcdesc[ri]; if (pd->pm_descr.pd_class == amd_pmc_class) KASSERT(AMD_PMC_IS_STOPPED(pd->pm_evsel), ("[amd,%d] PMC %d released while active", __LINE__, ri)); #endif return 0; } /* * start a PMC. */ static int amd_start_pmc(int cpu, int ri) { uint64_t config; struct pmc *pm; struct pmc_hw *phw; const struct amd_descr *pd; KASSERT(cpu >= 0 && cpu < pmc_cpu_max(), ("[amd,%d] illegal CPU value %d", __LINE__, cpu)); KASSERT(ri >= 0 && ri < AMD_NPMCS, ("[amd,%d] illegal row-index %d", __LINE__, ri)); phw = &amd_pcpu[cpu]->pc_amdpmcs[ri]; pm = phw->phw_pmc; pd = &amd_pmcdesc[ri]; KASSERT(pm != NULL, ("[amd,%d] starting cpu%d,pmc%d with null pmc record", __LINE__, cpu, ri)); PMCDBG2(MDP,STA,1,"amd-start cpu=%d ri=%d", cpu, ri); KASSERT(AMD_PMC_IS_STOPPED(pd->pm_evsel), ("[amd,%d] pmc%d,cpu%d: Starting active PMC \"%s\"", __LINE__, ri, cpu, pd->pm_descr.pd_name)); /* turn on the PMC ENABLE bit */ config = pm->pm_md.pm_amd.pm_amd_evsel | AMD_PMC_ENABLE; PMCDBG1(MDP,STA,2,"amd-start config=0x%x", config); wrmsr(pd->pm_evsel, config); return 0; } /* * Stop a PMC. */ static int amd_stop_pmc(int cpu, int ri) { struct pmc *pm; struct pmc_hw *phw; const struct amd_descr *pd; uint64_t config; int i; KASSERT(cpu >= 0 && cpu < pmc_cpu_max(), ("[amd,%d] illegal CPU value %d", __LINE__, cpu)); KASSERT(ri >= 0 && ri < AMD_NPMCS, ("[amd,%d] illegal row-index %d", __LINE__, ri)); phw = &amd_pcpu[cpu]->pc_amdpmcs[ri]; pm = phw->phw_pmc; pd = &amd_pmcdesc[ri]; KASSERT(pm != NULL, ("[amd,%d] cpu%d,pmc%d no PMC to stop", __LINE__, cpu, ri)); KASSERT(!AMD_PMC_IS_STOPPED(pd->pm_evsel), ("[amd,%d] PMC%d, CPU%d \"%s\" already stopped", __LINE__, ri, cpu, pd->pm_descr.pd_name)); PMCDBG1(MDP,STO,1,"amd-stop ri=%d", ri); /* turn off the PMC ENABLE bit */ config = pm->pm_md.pm_amd.pm_amd_evsel & ~AMD_PMC_ENABLE; wrmsr(pd->pm_evsel, config); /* * Due to NMI latency on newer AMD processors * NMI interrupts are ignored, which leads to * panic or messages based on kernel configuration */ /* Wait for the count to be reset */ for (i = 0; i < OVERFLOW_WAIT_COUNT; i++) { if (rdmsr(pd->pm_perfctr) & (1 << (pd->pm_descr.pd_width - 1))) break; DELAY(1); } return 0; } /* * Interrupt handler. This function needs to return '1' if the * interrupt was this CPU's PMCs or '0' otherwise. It is not allowed * to sleep or do anything a 'fast' interrupt handler is not allowed * to do. */ static int amd_intr(struct trapframe *tf) { int i, error, retval, cpu; uint64_t config, evsel, perfctr; struct pmc *pm; struct amd_cpu *pac; pmc_value_t v; uint32_t active = 0, count = 0; cpu = curcpu; KASSERT(cpu >= 0 && cpu < pmc_cpu_max(), ("[amd,%d] out of range CPU %d", __LINE__, cpu)); PMCDBG3(MDP,INT,1, "cpu=%d tf=%p um=%d", cpu, (void *) tf, TRAPF_USERMODE(tf)); retval = 0; pac = amd_pcpu[cpu]; /* * look for all PMCs that have interrupted: * - look for a running, sampling PMC which has overflowed * and which has a valid 'struct pmc' association * * If found, we call a helper to process the interrupt. * * PMCs interrupting at the same time are collapsed into * a single interrupt. Check all the valid pmcs for * overflow. */ for (i = 0; i < AMD_CORE_NPMCS; i++) { if ((pm = pac->pc_amdpmcs[i].phw_pmc) == NULL || !PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm))) { continue; } /* Consider pmc with valid handle as active */ active++; if (!AMD_PMC_HAS_OVERFLOWED(i)) continue; retval = 1; /* Found an interrupting PMC. */ if (pm->pm_state != PMC_STATE_RUNNING) continue; /* Stop the PMC, reload count. */ evsel = amd_pmcdesc[i].pm_evsel; perfctr = amd_pmcdesc[i].pm_perfctr; v = pm->pm_sc.pm_reloadcount; config = rdmsr(evsel); KASSERT((config & ~AMD_PMC_ENABLE) == (pm->pm_md.pm_amd.pm_amd_evsel & ~AMD_PMC_ENABLE), ("[amd,%d] config mismatch reg=0x%jx pm=0x%jx", __LINE__, (uintmax_t)config, (uintmax_t)pm->pm_md.pm_amd.pm_amd_evsel)); wrmsr(evsel, config & ~AMD_PMC_ENABLE); wrmsr(perfctr, AMD_RELOAD_COUNT_TO_PERFCTR_VALUE(v)); /* Restart the counter if logging succeeded. */ error = pmc_process_interrupt(PMC_HR, pm, tf); if (error == 0) wrmsr(evsel, config); } /* * Due to NMI latency, there can be a scenario in which * multiple pmcs gets serviced in an earlier NMI and we * do not find an overflow in the subsequent NMI. * * For such cases we keep a per-cpu count of active NMIs * and compare it with min(active pmcs, 2) to determine * if this NMI was for a pmc overflow which was serviced * in an earlier request or should be ignored. */ if (retval) { DPCPU_SET(nmi_counter, min(2, active)); } else { if ((count = DPCPU_GET(nmi_counter))) { retval = 1; DPCPU_SET(nmi_counter, --count); } } if (retval) counter_u64_add(pmc_stats.pm_intr_processed, 1); else counter_u64_add(pmc_stats.pm_intr_ignored, 1); PMCDBG1(MDP,INT,2, "retval=%d", retval); return (retval); } /* * describe a PMC */ static int amd_describe(int cpu, int ri, struct pmc_info *pi, struct pmc **ppmc) { const struct amd_descr *pd; struct pmc_hw *phw; KASSERT(cpu >= 0 && cpu < pmc_cpu_max(), ("[amd,%d] illegal CPU %d", __LINE__, cpu)); KASSERT(ri >= 0 && ri < AMD_NPMCS, ("[amd,%d] row-index %d out of range", __LINE__, ri)); phw = &amd_pcpu[cpu]->pc_amdpmcs[ri]; pd = &amd_pmcdesc[ri]; strlcpy(pi->pm_name, pd->pm_descr.pd_name, sizeof(pi->pm_name)); pi->pm_class = pd->pm_descr.pd_class; if (phw->phw_state & PMC_PHW_FLAG_IS_ENABLED) { pi->pm_enabled = TRUE; *ppmc = phw->phw_pmc; } else { pi->pm_enabled = FALSE; *ppmc = NULL; } return 0; } /* * i386 specific entry points */ /* * return the MSR address of the given PMC. */ static int amd_get_msr(int ri, uint32_t *msr) { KASSERT(ri >= 0 && ri < AMD_NPMCS, ("[amd,%d] ri %d out of range", __LINE__, ri)); *msr = amd_pmcdesc[ri].pm_perfctr - AMD_PMC_PERFCTR_0; return (0); } /* * processor dependent initialization. */ static int amd_pcpu_init(struct pmc_mdep *md, int cpu) { int classindex, first_ri, n; struct pmc_cpu *pc; struct amd_cpu *pac; struct pmc_hw *phw; KASSERT(cpu >= 0 && cpu < pmc_cpu_max(), ("[amd,%d] insane cpu number %d", __LINE__, cpu)); PMCDBG1(MDP,INI,1,"amd-init cpu=%d", cpu); amd_pcpu[cpu] = pac = malloc(sizeof(struct amd_cpu), M_PMC, M_WAITOK|M_ZERO); /* * Set the content of the hardware descriptors to a known * state and initialize pointers in the MI per-cpu descriptor. */ pc = pmc_pcpu[cpu]; #if defined(__amd64__) classindex = PMC_MDEP_CLASS_INDEX_K8; #elif defined(__i386__) classindex = md->pmd_cputype == PMC_CPU_AMD_K8 ? PMC_MDEP_CLASS_INDEX_K8 : PMC_MDEP_CLASS_INDEX_K7; #endif first_ri = md->pmd_classdep[classindex].pcd_ri; KASSERT(pc != NULL, ("[amd,%d] NULL per-cpu pointer", __LINE__)); for (n = 0, phw = pac->pc_amdpmcs; n < AMD_NPMCS; n++, phw++) { phw->phw_state = PMC_PHW_FLAG_IS_ENABLED | PMC_PHW_CPU_TO_STATE(cpu) | PMC_PHW_INDEX_TO_STATE(n); phw->phw_pmc = NULL; pc->pc_hwpmcs[n + first_ri] = phw; } return (0); } /* * processor dependent cleanup prior to the KLD * being unloaded */ static int amd_pcpu_fini(struct pmc_mdep *md, int cpu) { int classindex, first_ri, i; uint32_t evsel; struct pmc_cpu *pc; struct amd_cpu *pac; KASSERT(cpu >= 0 && cpu < pmc_cpu_max(), ("[amd,%d] insane cpu number (%d)", __LINE__, cpu)); PMCDBG1(MDP,INI,1,"amd-cleanup cpu=%d", cpu); /* * First, turn off all PMCs on this CPU. */ for (i = 0; i < 4; i++) { /* XXX this loop is now not needed */ evsel = rdmsr(AMD_PMC_EVSEL_0 + i); evsel &= ~AMD_PMC_ENABLE; wrmsr(AMD_PMC_EVSEL_0 + i, evsel); } /* * Next, free up allocated space. */ if ((pac = amd_pcpu[cpu]) == NULL) return (0); amd_pcpu[cpu] = NULL; #ifdef HWPMC_DEBUG for (i = 0; i < AMD_NPMCS; i++) { KASSERT(pac->pc_amdpmcs[i].phw_pmc == NULL, ("[amd,%d] CPU%d/PMC%d in use", __LINE__, cpu, i)); KASSERT(AMD_PMC_IS_STOPPED(AMD_PMC_EVSEL_0 + i), ("[amd,%d] CPU%d/PMC%d not stopped", __LINE__, cpu, i)); } #endif pc = pmc_pcpu[cpu]; KASSERT(pc != NULL, ("[amd,%d] NULL per-cpu state", __LINE__)); #if defined(__amd64__) classindex = PMC_MDEP_CLASS_INDEX_K8; #elif defined(__i386__) classindex = md->pmd_cputype == PMC_CPU_AMD_K8 ? PMC_MDEP_CLASS_INDEX_K8 : PMC_MDEP_CLASS_INDEX_K7; #endif first_ri = md->pmd_classdep[classindex].pcd_ri; /* * Reset pointers in the MI 'per-cpu' state. */ for (i = 0; i < AMD_NPMCS; i++) { pc->pc_hwpmcs[i + first_ri] = NULL; } free(pac, M_PMC); return (0); } /* * Initialize ourselves. */ struct pmc_mdep * pmc_amd_initialize(void) { int classindex, error, i, ncpus; struct pmc_classdep *pcd; enum pmc_cputype cputype; struct pmc_mdep *pmc_mdep; enum pmc_class class; int family, model, stepping; char *name; /* * The presence of hardware performance counters on the AMD * Athlon, Duron or later processors, is _not_ indicated by * any of the processor feature flags set by the 'CPUID' * instruction, so we only check the 'instruction family' * field returned by CPUID for instruction family >= 6. */ name = NULL; family = CPUID_TO_FAMILY(cpu_id); model = CPUID_TO_MODEL(cpu_id); stepping = CPUID_TO_STEPPING(cpu_id); if (family == 0x18) snprintf(pmc_cpuid, sizeof(pmc_cpuid), "HygonGenuine-%d-%02X-%X", family, model, stepping); else snprintf(pmc_cpuid, sizeof(pmc_cpuid), "AuthenticAMD-%d-%02X-%X", family, model, stepping); switch (cpu_id & 0xF00) { #if defined(__i386__) case 0x600: /* Athlon(tm) processor */ classindex = PMC_MDEP_CLASS_INDEX_K7; cputype = PMC_CPU_AMD_K7; class = PMC_CLASS_K7; name = "K7"; break; #endif case 0xF00: /* Athlon64/Opteron processor */ classindex = PMC_MDEP_CLASS_INDEX_K8; cputype = PMC_CPU_AMD_K8; class = PMC_CLASS_K8; name = "K8"; break; default: (void) printf("pmc: Unknown AMD CPU %x %d-%d.\n", cpu_id, (cpu_id & 0xF00) >> 8, model); return NULL; } #ifdef HWPMC_DEBUG amd_pmc_class = class; #endif /* * Allocate space for pointers to PMC HW descriptors and for * the MDEP structure used by MI code. */ amd_pcpu = malloc(sizeof(struct amd_cpu *) * pmc_cpu_max(), M_PMC, M_WAITOK|M_ZERO); /* * These processors have two classes of PMCs: the TSC and * programmable PMCs. */ pmc_mdep = pmc_mdep_alloc(2); pmc_mdep->pmd_cputype = cputype; ncpus = pmc_cpu_max(); /* Initialize the TSC. */ error = pmc_tsc_initialize(pmc_mdep, ncpus); if (error) goto error; /* Initialize AMD K7 and K8 PMC handling. */ pcd = &pmc_mdep->pmd_classdep[classindex]; pcd->pcd_caps = AMD_PMC_CAPS; pcd->pcd_class = class; pcd->pcd_num = AMD_NPMCS; pcd->pcd_ri = pmc_mdep->pmd_npmc; pcd->pcd_width = 48; /* fill in the correct pmc name and class */ for (i = 0; i < AMD_NPMCS; i++) { (void) snprintf(amd_pmcdesc[i].pm_descr.pd_name, sizeof(amd_pmcdesc[i].pm_descr.pd_name), "%s-%d", name, i); amd_pmcdesc[i].pm_descr.pd_class = class; } pcd->pcd_allocate_pmc = amd_allocate_pmc; pcd->pcd_config_pmc = amd_config_pmc; pcd->pcd_describe = amd_describe; pcd->pcd_get_config = amd_get_config; pcd->pcd_get_msr = amd_get_msr; pcd->pcd_pcpu_fini = amd_pcpu_fini; pcd->pcd_pcpu_init = amd_pcpu_init; pcd->pcd_read_pmc = amd_read_pmc; pcd->pcd_release_pmc = amd_release_pmc; pcd->pcd_start_pmc = amd_start_pmc; pcd->pcd_stop_pmc = amd_stop_pmc; pcd->pcd_write_pmc = amd_write_pmc; - pmc_mdep->pmd_pcpu_init = NULL; - pmc_mdep->pmd_pcpu_fini = NULL; pmc_mdep->pmd_intr = amd_intr; pmc_mdep->pmd_switch_in = amd_switch_in; pmc_mdep->pmd_switch_out = amd_switch_out; pmc_mdep->pmd_npmc += AMD_NPMCS; PMCDBG0(MDP,INI,0,"amd-initialize"); return (pmc_mdep); error: if (error) { free(pmc_mdep, M_PMC); pmc_mdep = NULL; } return (NULL); } /* * Finalization code for AMD CPUs. */ void pmc_amd_finalize(struct pmc_mdep *md) { #if defined(INVARIANTS) int classindex, i, ncpus, pmcclass; #endif pmc_tsc_finalize(md); KASSERT(amd_pcpu != NULL, ("[amd,%d] NULL per-cpu array pointer", __LINE__)); #if defined(INVARIANTS) switch (md->pmd_cputype) { #if defined(__i386__) case PMC_CPU_AMD_K7: classindex = PMC_MDEP_CLASS_INDEX_K7; pmcclass = PMC_CLASS_K7; break; #endif default: classindex = PMC_MDEP_CLASS_INDEX_K8; pmcclass = PMC_CLASS_K8; } KASSERT(md->pmd_classdep[classindex].pcd_class == pmcclass, ("[amd,%d] pmc class mismatch", __LINE__)); ncpus = pmc_cpu_max(); for (i = 0; i < ncpus; i++) KASSERT(amd_pcpu[i] == NULL, ("[amd,%d] non-null pcpu", __LINE__)); #endif free(amd_pcpu, M_PMC); amd_pcpu = NULL; } diff --git a/sys/dev/hwpmc/hwpmc_core.c b/sys/dev/hwpmc/hwpmc_core.c index ad9d323bdb9a..76ffbdb9b0d2 100644 --- a/sys/dev/hwpmc/hwpmc_core.c +++ b/sys/dev/hwpmc/hwpmc_core.c @@ -1,1310 +1,1307 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2008 Joseph Koshy * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Intel Core PMCs. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #define CORE_CPUID_REQUEST 0xA #define CORE_CPUID_REQUEST_SIZE 0x4 #define CORE_CPUID_EAX 0x0 #define CORE_CPUID_EBX 0x1 #define CORE_CPUID_ECX 0x2 #define CORE_CPUID_EDX 0x3 #define IAF_PMC_CAPS \ (PMC_CAP_READ | PMC_CAP_WRITE | PMC_CAP_INTERRUPT | \ PMC_CAP_USER | PMC_CAP_SYSTEM) #define IAF_RI_TO_MSR(RI) ((RI) + (1 << 30)) #define IAP_PMC_CAPS (PMC_CAP_INTERRUPT | PMC_CAP_USER | PMC_CAP_SYSTEM | \ PMC_CAP_EDGE | PMC_CAP_THRESHOLD | PMC_CAP_READ | PMC_CAP_WRITE | \ PMC_CAP_INVERT | PMC_CAP_QUALIFIER | PMC_CAP_PRECISE) #define EV_IS_NOTARCH 0 #define EV_IS_ARCH_SUPP 1 #define EV_IS_ARCH_NOTSUPP -1 /* * "Architectural" events defined by Intel. The values of these * symbols correspond to positions in the bitmask returned by * the CPUID.0AH instruction. */ enum core_arch_events { CORE_AE_BRANCH_INSTRUCTION_RETIRED = 5, CORE_AE_BRANCH_MISSES_RETIRED = 6, CORE_AE_INSTRUCTION_RETIRED = 1, CORE_AE_LLC_MISSES = 4, CORE_AE_LLC_REFERENCE = 3, CORE_AE_UNHALTED_REFERENCE_CYCLES = 2, CORE_AE_UNHALTED_CORE_CYCLES = 0 }; static enum pmc_cputype core_cputype; static int core_version; struct core_cpu { volatile uint32_t pc_iafctrl; /* Fixed function control. */ volatile uint64_t pc_globalctrl; /* Global control register. */ struct pmc_hw pc_corepmcs[]; }; static struct core_cpu **core_pcpu; static uint32_t core_architectural_events; static uint64_t core_pmcmask; static int core_iaf_ri; /* relative index of fixed counters */ static int core_iaf_width; static int core_iaf_npmc; static int core_iap_width; static int core_iap_npmc; static int core_iap_wroffset; static u_int pmc_alloc_refs; static bool pmc_tsx_force_abort_set; static int core_pcpu_noop(struct pmc_mdep *md, int cpu) { (void) md; (void) cpu; return (0); } static int core_pcpu_init(struct pmc_mdep *md, int cpu) { struct pmc_cpu *pc; struct core_cpu *cc; struct pmc_hw *phw; int core_ri, n, npmc; KASSERT(cpu >= 0 && cpu < pmc_cpu_max(), ("[iaf,%d] insane cpu number %d", __LINE__, cpu)); PMCDBG1(MDP,INI,1,"core-init cpu=%d", cpu); core_ri = md->pmd_classdep[PMC_MDEP_CLASS_INDEX_IAP].pcd_ri; npmc = md->pmd_classdep[PMC_MDEP_CLASS_INDEX_IAP].pcd_num; if (core_version >= 2) npmc += md->pmd_classdep[PMC_MDEP_CLASS_INDEX_IAF].pcd_num; cc = malloc(sizeof(struct core_cpu) + npmc * sizeof(struct pmc_hw), M_PMC, M_WAITOK | M_ZERO); core_pcpu[cpu] = cc; pc = pmc_pcpu[cpu]; KASSERT(pc != NULL && cc != NULL, ("[core,%d] NULL per-cpu structures cpu=%d", __LINE__, cpu)); for (n = 0, phw = cc->pc_corepmcs; n < npmc; n++, phw++) { phw->phw_state = PMC_PHW_FLAG_IS_ENABLED | PMC_PHW_CPU_TO_STATE(cpu) | PMC_PHW_INDEX_TO_STATE(n + core_ri); phw->phw_pmc = NULL; pc->pc_hwpmcs[n + core_ri] = phw; } if (core_version >= 2 && vm_guest == VM_GUEST_NO) { /* Enable Freezing PMCs on PMI. */ wrmsr(MSR_DEBUGCTLMSR, rdmsr(MSR_DEBUGCTLMSR) | 0x1000); } return (0); } static int core_pcpu_fini(struct pmc_mdep *md, int cpu) { int core_ri, n, npmc; struct pmc_cpu *pc; struct core_cpu *cc; KASSERT(cpu >= 0 && cpu < pmc_cpu_max(), ("[core,%d] insane cpu number (%d)", __LINE__, cpu)); PMCDBG1(MDP,INI,1,"core-pcpu-fini cpu=%d", cpu); if ((cc = core_pcpu[cpu]) == NULL) return (0); core_pcpu[cpu] = NULL; pc = pmc_pcpu[cpu]; KASSERT(pc != NULL, ("[core,%d] NULL per-cpu %d state", __LINE__, cpu)); npmc = md->pmd_classdep[PMC_MDEP_CLASS_INDEX_IAP].pcd_num; core_ri = md->pmd_classdep[PMC_MDEP_CLASS_INDEX_IAP].pcd_ri; for (n = 0; n < npmc; n++) wrmsr(IAP_EVSEL0 + n, 0); if (core_version >= 2) { wrmsr(IAF_CTRL, 0); npmc += md->pmd_classdep[PMC_MDEP_CLASS_INDEX_IAF].pcd_num; } for (n = 0; n < npmc; n++) pc->pc_hwpmcs[n + core_ri] = NULL; free(cc, M_PMC); return (0); } /* * Fixed function counters. */ static pmc_value_t iaf_perfctr_value_to_reload_count(pmc_value_t v) { /* If the PMC has overflowed, return a reload count of zero. */ if ((v & (1ULL << (core_iaf_width - 1))) == 0) return (0); v &= (1ULL << core_iaf_width) - 1; return (1ULL << core_iaf_width) - v; } static pmc_value_t iaf_reload_count_to_perfctr_value(pmc_value_t rlc) { return (1ULL << core_iaf_width) - rlc; } static int iaf_allocate_pmc(int cpu, int ri, struct pmc *pm, const struct pmc_op_pmcallocate *a) { uint8_t ev, umask; uint32_t caps; uint64_t config, flags; const struct pmc_md_iap_op_pmcallocate *iap; KASSERT(cpu >= 0 && cpu < pmc_cpu_max(), ("[core,%d] illegal CPU %d", __LINE__, cpu)); PMCDBG2(MDP,ALL,1, "iaf-allocate ri=%d reqcaps=0x%x", ri, pm->pm_caps); if (ri < 0 || ri > core_iaf_npmc) return (EINVAL); if (a->pm_class != PMC_CLASS_IAF) return (EINVAL); iap = &a->pm_md.pm_iap; config = iap->pm_iap_config; ev = IAP_EVSEL_GET(config); umask = IAP_UMASK_GET(config); if (ev == 0x0) { if (umask != ri + 1) return (EINVAL); } else { switch (ri) { case 0: /* INST_RETIRED.ANY */ if (ev != 0xC0 || umask != 0x00) return (EINVAL); break; case 1: /* CPU_CLK_UNHALTED.THREAD */ if (ev != 0x3C || umask != 0x00) return (EINVAL); break; case 2: /* CPU_CLK_UNHALTED.REF */ if (ev != 0x3C || umask != 0x01) return (EINVAL); break; case 3: /* TOPDOWN.SLOTS */ if (ev != 0xA4 || umask != 0x01) return (EINVAL); break; default: return (EINVAL); } } pmc_alloc_refs++; if ((cpu_stdext_feature3 & CPUID_STDEXT3_TSXFA) != 0 && !pmc_tsx_force_abort_set) { pmc_tsx_force_abort_set = true; x86_msr_op(MSR_TSX_FORCE_ABORT, MSR_OP_RENDEZVOUS_ALL | MSR_OP_WRITE, 1, NULL); } flags = 0; if (config & IAP_OS) flags |= IAF_OS; if (config & IAP_USR) flags |= IAF_USR; if (config & IAP_ANY) flags |= IAF_ANY; if (config & IAP_INT) flags |= IAF_PMI; caps = a->pm_caps; if (caps & PMC_CAP_INTERRUPT) flags |= IAF_PMI; if (caps & PMC_CAP_SYSTEM) flags |= IAF_OS; if (caps & PMC_CAP_USER) flags |= IAF_USR; if ((caps & (PMC_CAP_USER | PMC_CAP_SYSTEM)) == 0) flags |= (IAF_OS | IAF_USR); pm->pm_md.pm_iaf.pm_iaf_ctrl = (flags << (ri * 4)); PMCDBG1(MDP,ALL,2, "iaf-allocate config=0x%jx", (uintmax_t) pm->pm_md.pm_iaf.pm_iaf_ctrl); return (0); } static int iaf_config_pmc(int cpu, int ri, struct pmc *pm) { KASSERT(cpu >= 0 && cpu < pmc_cpu_max(), ("[core,%d] illegal CPU %d", __LINE__, cpu)); KASSERT(ri >= 0 && ri < core_iaf_npmc, ("[core,%d] illegal row-index %d", __LINE__, ri)); PMCDBG3(MDP,CFG,1, "iaf-config cpu=%d ri=%d pm=%p", cpu, ri, pm); KASSERT(core_pcpu[cpu] != NULL, ("[core,%d] null per-cpu %d", __LINE__, cpu)); core_pcpu[cpu]->pc_corepmcs[ri + core_iaf_ri].phw_pmc = pm; return (0); } static int iaf_describe(int cpu, int ri, struct pmc_info *pi, struct pmc **ppmc) { struct pmc_hw *phw; phw = &core_pcpu[cpu]->pc_corepmcs[ri + core_iaf_ri]; snprintf(pi->pm_name, sizeof(pi->pm_name), "IAF-%d", ri); pi->pm_class = PMC_CLASS_IAF; if (phw->phw_state & PMC_PHW_FLAG_IS_ENABLED) { pi->pm_enabled = TRUE; *ppmc = phw->phw_pmc; } else { pi->pm_enabled = FALSE; *ppmc = NULL; } return (0); } static int iaf_get_config(int cpu, int ri, struct pmc **ppm) { *ppm = core_pcpu[cpu]->pc_corepmcs[ri + core_iaf_ri].phw_pmc; return (0); } static int iaf_get_msr(int ri, uint32_t *msr) { KASSERT(ri >= 0 && ri < core_iaf_npmc, ("[iaf,%d] ri %d out of range", __LINE__, ri)); *msr = IAF_RI_TO_MSR(ri); return (0); } static int iaf_read_pmc(int cpu, int ri, pmc_value_t *v) { struct pmc *pm; pmc_value_t tmp; KASSERT(cpu >= 0 && cpu < pmc_cpu_max(), ("[core,%d] illegal cpu value %d", __LINE__, cpu)); KASSERT(ri >= 0 && ri < core_iaf_npmc, ("[core,%d] illegal row-index %d", __LINE__, ri)); pm = core_pcpu[cpu]->pc_corepmcs[ri + core_iaf_ri].phw_pmc; KASSERT(pm, ("[core,%d] cpu %d ri %d(%d) pmc not configured", __LINE__, cpu, ri, ri + core_iaf_ri)); tmp = rdpmc(IAF_RI_TO_MSR(ri)); if (PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm))) *v = iaf_perfctr_value_to_reload_count(tmp); else *v = tmp & ((1ULL << core_iaf_width) - 1); PMCDBG4(MDP,REA,1, "iaf-read cpu=%d ri=%d msr=0x%x -> v=%jx", cpu, ri, IAF_RI_TO_MSR(ri), *v); return (0); } static int iaf_release_pmc(int cpu, int ri, struct pmc *pmc) { PMCDBG3(MDP,REL,1, "iaf-release cpu=%d ri=%d pm=%p", cpu, ri, pmc); KASSERT(cpu >= 0 && cpu < pmc_cpu_max(), ("[core,%d] illegal CPU value %d", __LINE__, cpu)); KASSERT(ri >= 0 && ri < core_iaf_npmc, ("[core,%d] illegal row-index %d", __LINE__, ri)); KASSERT(core_pcpu[cpu]->pc_corepmcs[ri + core_iaf_ri].phw_pmc == NULL, ("[core,%d] PHW pmc non-NULL", __LINE__)); MPASS(pmc_alloc_refs > 0); if (pmc_alloc_refs-- == 1 && pmc_tsx_force_abort_set) { pmc_tsx_force_abort_set = false; x86_msr_op(MSR_TSX_FORCE_ABORT, MSR_OP_RENDEZVOUS_ALL | MSR_OP_WRITE, 0, NULL); } return (0); } static int iaf_start_pmc(int cpu, int ri) { struct pmc *pm; struct core_cpu *cc; KASSERT(cpu >= 0 && cpu < pmc_cpu_max(), ("[core,%d] illegal CPU value %d", __LINE__, cpu)); KASSERT(ri >= 0 && ri < core_iaf_npmc, ("[core,%d] illegal row-index %d", __LINE__, ri)); PMCDBG2(MDP,STA,1,"iaf-start cpu=%d ri=%d", cpu, ri); cc = core_pcpu[cpu]; pm = cc->pc_corepmcs[ri + core_iaf_ri].phw_pmc; cc->pc_iafctrl |= pm->pm_md.pm_iaf.pm_iaf_ctrl; wrmsr(IAF_CTRL, cc->pc_iafctrl); cc->pc_globalctrl |= (1ULL << (ri + IAF_OFFSET)); wrmsr(IA_GLOBAL_CTRL, cc->pc_globalctrl); PMCDBG4(MDP,STA,1,"iafctrl=%x(%x) globalctrl=%jx(%jx)", cc->pc_iafctrl, (uint32_t) rdmsr(IAF_CTRL), cc->pc_globalctrl, rdmsr(IA_GLOBAL_CTRL)); return (0); } static int iaf_stop_pmc(int cpu, int ri) { struct core_cpu *cc; KASSERT(cpu >= 0 && cpu < pmc_cpu_max(), ("[core,%d] illegal CPU value %d", __LINE__, cpu)); KASSERT(ri >= 0 && ri < core_iaf_npmc, ("[core,%d] illegal row-index %d", __LINE__, ri)); PMCDBG2(MDP,STA,1,"iaf-stop cpu=%d ri=%d", cpu, ri); cc = core_pcpu[cpu]; cc->pc_iafctrl &= ~(IAF_MASK << (ri * 4)); wrmsr(IAF_CTRL, cc->pc_iafctrl); /* Don't need to write IA_GLOBAL_CTRL, one disable is enough. */ PMCDBG4(MDP,STO,1,"iafctrl=%x(%x) globalctrl=%jx(%jx)", cc->pc_iafctrl, (uint32_t) rdmsr(IAF_CTRL), cc->pc_globalctrl, rdmsr(IA_GLOBAL_CTRL)); return (0); } static int iaf_write_pmc(int cpu, int ri, pmc_value_t v) { struct core_cpu *cc; struct pmc *pm; KASSERT(cpu >= 0 && cpu < pmc_cpu_max(), ("[core,%d] illegal cpu value %d", __LINE__, cpu)); KASSERT(ri >= 0 && ri < core_iaf_npmc, ("[core,%d] illegal row-index %d", __LINE__, ri)); cc = core_pcpu[cpu]; pm = cc->pc_corepmcs[ri + core_iaf_ri].phw_pmc; KASSERT(pm, ("[core,%d] cpu %d ri %d pmc not configured", __LINE__, cpu, ri)); if (PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm))) v = iaf_reload_count_to_perfctr_value(v); /* Turn off the fixed counter */ wrmsr(IAF_CTRL, cc->pc_iafctrl & ~(IAF_MASK << (ri * 4))); wrmsr(IAF_CTR0 + ri, v & ((1ULL << core_iaf_width) - 1)); /* Turn on fixed counters */ wrmsr(IAF_CTRL, cc->pc_iafctrl); PMCDBG6(MDP,WRI,1, "iaf-write cpu=%d ri=%d msr=0x%x v=%jx iafctrl=%jx " "pmc=%jx", cpu, ri, IAF_RI_TO_MSR(ri), v, (uintmax_t) rdmsr(IAF_CTRL), (uintmax_t) rdpmc(IAF_RI_TO_MSR(ri))); return (0); } static void iaf_initialize(struct pmc_mdep *md, int maxcpu, int npmc, int pmcwidth) { struct pmc_classdep *pcd; KASSERT(md != NULL, ("[iaf,%d] md is NULL", __LINE__)); PMCDBG0(MDP,INI,1, "iaf-initialize"); pcd = &md->pmd_classdep[PMC_MDEP_CLASS_INDEX_IAF]; pcd->pcd_caps = IAF_PMC_CAPS; pcd->pcd_class = PMC_CLASS_IAF; pcd->pcd_num = npmc; pcd->pcd_ri = md->pmd_npmc; pcd->pcd_width = pmcwidth; pcd->pcd_allocate_pmc = iaf_allocate_pmc; pcd->pcd_config_pmc = iaf_config_pmc; pcd->pcd_describe = iaf_describe; pcd->pcd_get_config = iaf_get_config; pcd->pcd_get_msr = iaf_get_msr; pcd->pcd_pcpu_fini = core_pcpu_noop; pcd->pcd_pcpu_init = core_pcpu_noop; pcd->pcd_read_pmc = iaf_read_pmc; pcd->pcd_release_pmc = iaf_release_pmc; pcd->pcd_start_pmc = iaf_start_pmc; pcd->pcd_stop_pmc = iaf_stop_pmc; pcd->pcd_write_pmc = iaf_write_pmc; md->pmd_npmc += npmc; } /* * Intel programmable PMCs. */ /* Sub fields of UMASK that this event supports. */ #define IAP_M_CORE (1 << 0) /* Core specificity */ #define IAP_M_AGENT (1 << 1) /* Agent specificity */ #define IAP_M_PREFETCH (1 << 2) /* Prefetch */ #define IAP_M_MESI (1 << 3) /* MESI */ #define IAP_M_SNOOPRESPONSE (1 << 4) /* Snoop response */ #define IAP_M_SNOOPTYPE (1 << 5) /* Snoop type */ #define IAP_M_TRANSITION (1 << 6) /* Transition */ #define IAP_F_CORE (0x3 << 14) /* Core specificity */ #define IAP_F_AGENT (0x1 << 13) /* Agent specificity */ #define IAP_F_PREFETCH (0x3 << 12) /* Prefetch */ #define IAP_F_MESI (0xF << 8) /* MESI */ #define IAP_F_SNOOPRESPONSE (0xB << 8) /* Snoop response */ #define IAP_F_SNOOPTYPE (0x3 << 8) /* Snoop type */ #define IAP_F_TRANSITION (0x1 << 12) /* Transition */ #define IAP_PREFETCH_RESERVED (0x2 << 12) #define IAP_CORE_THIS (0x1 << 14) #define IAP_CORE_ALL (0x3 << 14) #define IAP_F_CMASK 0xFF000000 static pmc_value_t iap_perfctr_value_to_reload_count(pmc_value_t v) { /* If the PMC has overflowed, return a reload count of zero. */ if ((v & (1ULL << (core_iap_width - 1))) == 0) return (0); v &= (1ULL << core_iap_width) - 1; return (1ULL << core_iap_width) - v; } static pmc_value_t iap_reload_count_to_perfctr_value(pmc_value_t rlc) { return (1ULL << core_iap_width) - rlc; } static int iap_pmc_has_overflowed(int ri) { uint64_t v; /* * We treat a Core (i.e., Intel architecture v1) PMC as has * having overflowed if its MSB is zero. */ v = rdpmc(ri); return ((v & (1ULL << (core_iap_width - 1))) == 0); } static int iap_event_corei7_ok_on_counter(uint8_t evsel, int ri) { uint32_t mask; switch (evsel) { /* Events valid only on counter 0, 1. */ case 0x40: case 0x41: case 0x42: case 0x43: case 0x4C: case 0x4E: case 0x51: case 0x52: case 0x53: case 0x63: mask = 0x3; break; /* Any row index is ok. */ default: mask = ~0; } return (mask & (1 << ri)); } static int iap_event_westmere_ok_on_counter(uint8_t evsel, int ri) { uint32_t mask; switch (evsel) { /* Events valid only on counter 0. */ case 0x60: case 0xB3: mask = 0x1; break; /* Events valid only on counter 0, 1. */ case 0x4C: case 0x4E: case 0x51: case 0x52: case 0x63: mask = 0x3; break; /* Any row index is ok. */ default: mask = ~0; } return (mask & (1 << ri)); } static int iap_event_sb_sbx_ib_ibx_ok_on_counter(uint8_t evsel, int ri) { uint32_t mask; switch (evsel) { /* Events valid only on counter 0. */ case 0xB7: mask = 0x1; break; /* Events valid only on counter 1. */ case 0xC0: mask = 0x2; break; /* Events valid only on counter 2. */ case 0x48: case 0xA2: case 0xA3: mask = 0x4; break; /* Events valid only on counter 3. */ case 0xBB: case 0xCD: mask = 0x8; break; /* Any row index is ok. */ default: mask = ~0; } return (mask & (1 << ri)); } static int iap_event_core_ok_on_counter(uint8_t evsel, int ri) { uint32_t mask; switch (evsel) { /* * Events valid only on counter 0. */ case 0x10: case 0x14: case 0x18: case 0xB3: case 0xC1: case 0xCB: mask = (1 << 0); break; /* * Events valid only on counter 1. */ case 0x11: case 0x12: case 0x13: mask = (1 << 1); break; default: mask = ~0; /* Any row index is ok. */ } return (mask & (1 << ri)); } static int iap_allocate_pmc(int cpu, int ri, struct pmc *pm, const struct pmc_op_pmcallocate *a) { uint8_t ev; const struct pmc_md_iap_op_pmcallocate *iap; KASSERT(cpu >= 0 && cpu < pmc_cpu_max(), ("[core,%d] illegal CPU %d", __LINE__, cpu)); KASSERT(ri >= 0 && ri < core_iap_npmc, ("[core,%d] illegal row-index value %d", __LINE__, ri)); if (a->pm_class != PMC_CLASS_IAP) return (EINVAL); iap = &a->pm_md.pm_iap; ev = IAP_EVSEL_GET(iap->pm_iap_config); switch (core_cputype) { case PMC_CPU_INTEL_CORE: case PMC_CPU_INTEL_CORE2: case PMC_CPU_INTEL_CORE2EXTREME: if (iap_event_core_ok_on_counter(ev, ri) == 0) return (EINVAL); case PMC_CPU_INTEL_COREI7: case PMC_CPU_INTEL_NEHALEM_EX: if (iap_event_corei7_ok_on_counter(ev, ri) == 0) return (EINVAL); break; case PMC_CPU_INTEL_WESTMERE: case PMC_CPU_INTEL_WESTMERE_EX: if (iap_event_westmere_ok_on_counter(ev, ri) == 0) return (EINVAL); break; case PMC_CPU_INTEL_SANDYBRIDGE: case PMC_CPU_INTEL_SANDYBRIDGE_XEON: case PMC_CPU_INTEL_IVYBRIDGE: case PMC_CPU_INTEL_IVYBRIDGE_XEON: case PMC_CPU_INTEL_HASWELL: case PMC_CPU_INTEL_HASWELL_XEON: case PMC_CPU_INTEL_BROADWELL: case PMC_CPU_INTEL_BROADWELL_XEON: if (iap_event_sb_sbx_ib_ibx_ok_on_counter(ev, ri) == 0) return (EINVAL); break; case PMC_CPU_INTEL_ATOM: case PMC_CPU_INTEL_ATOM_SILVERMONT: case PMC_CPU_INTEL_ATOM_GOLDMONT: case PMC_CPU_INTEL_ATOM_GOLDMONT_P: case PMC_CPU_INTEL_ATOM_TREMONT: case PMC_CPU_INTEL_SKYLAKE: case PMC_CPU_INTEL_SKYLAKE_XEON: case PMC_CPU_INTEL_ICELAKE: case PMC_CPU_INTEL_ICELAKE_XEON: case PMC_CPU_INTEL_ALDERLAKE: default: break; } pm->pm_md.pm_iap.pm_iap_evsel = iap->pm_iap_config; return (0); } static int iap_config_pmc(int cpu, int ri, struct pmc *pm) { KASSERT(cpu >= 0 && cpu < pmc_cpu_max(), ("[core,%d] illegal CPU %d", __LINE__, cpu)); KASSERT(ri >= 0 && ri < core_iap_npmc, ("[core,%d] illegal row-index %d", __LINE__, ri)); PMCDBG3(MDP,CFG,1, "iap-config cpu=%d ri=%d pm=%p", cpu, ri, pm); KASSERT(core_pcpu[cpu] != NULL, ("[core,%d] null per-cpu %d", __LINE__, cpu)); core_pcpu[cpu]->pc_corepmcs[ri].phw_pmc = pm; return (0); } static int iap_describe(int cpu, int ri, struct pmc_info *pi, struct pmc **ppmc) { struct pmc_hw *phw; phw = &core_pcpu[cpu]->pc_corepmcs[ri]; snprintf(pi->pm_name, sizeof(pi->pm_name), "IAP-%d", ri); pi->pm_class = PMC_CLASS_IAP; if (phw->phw_state & PMC_PHW_FLAG_IS_ENABLED) { pi->pm_enabled = TRUE; *ppmc = phw->phw_pmc; } else { pi->pm_enabled = FALSE; *ppmc = NULL; } return (0); } static int iap_get_config(int cpu, int ri, struct pmc **ppm) { *ppm = core_pcpu[cpu]->pc_corepmcs[ri].phw_pmc; return (0); } static int iap_get_msr(int ri, uint32_t *msr) { KASSERT(ri >= 0 && ri < core_iap_npmc, ("[iap,%d] ri %d out of range", __LINE__, ri)); *msr = ri; return (0); } static int iap_read_pmc(int cpu, int ri, pmc_value_t *v) { struct pmc *pm; pmc_value_t tmp; KASSERT(cpu >= 0 && cpu < pmc_cpu_max(), ("[core,%d] illegal cpu value %d", __LINE__, cpu)); KASSERT(ri >= 0 && ri < core_iap_npmc, ("[core,%d] illegal row-index %d", __LINE__, ri)); pm = core_pcpu[cpu]->pc_corepmcs[ri].phw_pmc; KASSERT(pm, ("[core,%d] cpu %d ri %d pmc not configured", __LINE__, cpu, ri)); tmp = rdpmc(ri); if (PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm))) *v = iap_perfctr_value_to_reload_count(tmp); else *v = tmp & ((1ULL << core_iap_width) - 1); PMCDBG4(MDP,REA,1, "iap-read cpu=%d ri=%d msr=0x%x -> v=%jx", cpu, ri, IAP_PMC0 + ri, *v); return (0); } static int iap_release_pmc(int cpu, int ri, struct pmc *pm) { (void) pm; PMCDBG3(MDP,REL,1, "iap-release cpu=%d ri=%d pm=%p", cpu, ri, pm); KASSERT(cpu >= 0 && cpu < pmc_cpu_max(), ("[core,%d] illegal CPU value %d", __LINE__, cpu)); KASSERT(ri >= 0 && ri < core_iap_npmc, ("[core,%d] illegal row-index %d", __LINE__, ri)); KASSERT(core_pcpu[cpu]->pc_corepmcs[ri].phw_pmc == NULL, ("[core,%d] PHW pmc non-NULL", __LINE__)); return (0); } static int iap_start_pmc(int cpu, int ri) { struct pmc *pm; uint64_t evsel; struct core_cpu *cc; KASSERT(cpu >= 0 && cpu < pmc_cpu_max(), ("[core,%d] illegal CPU value %d", __LINE__, cpu)); KASSERT(ri >= 0 && ri < core_iap_npmc, ("[core,%d] illegal row-index %d", __LINE__, ri)); cc = core_pcpu[cpu]; pm = cc->pc_corepmcs[ri].phw_pmc; KASSERT(pm, ("[core,%d] starting cpu%d,ri%d with no pmc configured", __LINE__, cpu, ri)); PMCDBG2(MDP,STA,1, "iap-start cpu=%d ri=%d", cpu, ri); evsel = pm->pm_md.pm_iap.pm_iap_evsel; PMCDBG4(MDP,STA,2, "iap-start/2 cpu=%d ri=%d evselmsr=0x%x evsel=0x%x", cpu, ri, IAP_EVSEL0 + ri, evsel); /* Event specific configuration. */ switch (IAP_EVSEL_GET(evsel)) { case 0xB7: wrmsr(IA_OFFCORE_RSP0, pm->pm_md.pm_iap.pm_iap_rsp); break; case 0xBB: wrmsr(IA_OFFCORE_RSP1, pm->pm_md.pm_iap.pm_iap_rsp); break; default: break; } wrmsr(IAP_EVSEL0 + ri, evsel | IAP_EN); if (core_version >= 2) { cc->pc_globalctrl |= (1ULL << ri); wrmsr(IA_GLOBAL_CTRL, cc->pc_globalctrl); } return (0); } static int iap_stop_pmc(int cpu, int ri) { struct pmc *pm __diagused; struct core_cpu *cc; KASSERT(cpu >= 0 && cpu < pmc_cpu_max(), ("[core,%d] illegal cpu value %d", __LINE__, cpu)); KASSERT(ri >= 0 && ri < core_iap_npmc, ("[core,%d] illegal row index %d", __LINE__, ri)); cc = core_pcpu[cpu]; pm = cc->pc_corepmcs[ri].phw_pmc; KASSERT(pm, ("[core,%d] cpu%d ri%d no configured PMC to stop", __LINE__, cpu, ri)); PMCDBG2(MDP,STO,1, "iap-stop cpu=%d ri=%d", cpu, ri); wrmsr(IAP_EVSEL0 + ri, 0); /* Don't need to write IA_GLOBAL_CTRL, one disable is enough. */ return (0); } static int iap_write_pmc(int cpu, int ri, pmc_value_t v) { struct pmc *pm; struct core_cpu *cc; KASSERT(cpu >= 0 && cpu < pmc_cpu_max(), ("[core,%d] illegal cpu value %d", __LINE__, cpu)); KASSERT(ri >= 0 && ri < core_iap_npmc, ("[core,%d] illegal row index %d", __LINE__, ri)); cc = core_pcpu[cpu]; pm = cc->pc_corepmcs[ri].phw_pmc; KASSERT(pm, ("[core,%d] cpu%d ri%d no configured PMC to stop", __LINE__, cpu, ri)); if (PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm))) v = iap_reload_count_to_perfctr_value(v); v &= (1ULL << core_iap_width) - 1; PMCDBG4(MDP,WRI,1, "iap-write cpu=%d ri=%d msr=0x%x v=%jx", cpu, ri, IAP_PMC0 + ri, v); /* * Write the new value to the counter (or it's alias). The * counter will be in a stopped state when the pcd_write() * entry point is called. */ wrmsr(core_iap_wroffset + IAP_PMC0 + ri, v); return (0); } static void iap_initialize(struct pmc_mdep *md, int maxcpu, int npmc, int pmcwidth, int flags) { struct pmc_classdep *pcd; KASSERT(md != NULL, ("[iap,%d] md is NULL", __LINE__)); PMCDBG0(MDP,INI,1, "iap-initialize"); /* Remember the set of architectural events supported. */ core_architectural_events = ~flags; pcd = &md->pmd_classdep[PMC_MDEP_CLASS_INDEX_IAP]; pcd->pcd_caps = IAP_PMC_CAPS; pcd->pcd_class = PMC_CLASS_IAP; pcd->pcd_num = npmc; pcd->pcd_ri = md->pmd_npmc; pcd->pcd_width = pmcwidth; pcd->pcd_allocate_pmc = iap_allocate_pmc; pcd->pcd_config_pmc = iap_config_pmc; pcd->pcd_describe = iap_describe; pcd->pcd_get_config = iap_get_config; pcd->pcd_get_msr = iap_get_msr; pcd->pcd_pcpu_fini = core_pcpu_fini; pcd->pcd_pcpu_init = core_pcpu_init; pcd->pcd_read_pmc = iap_read_pmc; pcd->pcd_release_pmc = iap_release_pmc; pcd->pcd_start_pmc = iap_start_pmc; pcd->pcd_stop_pmc = iap_stop_pmc; pcd->pcd_write_pmc = iap_write_pmc; md->pmd_npmc += npmc; } static int core_intr(struct trapframe *tf) { pmc_value_t v; struct pmc *pm; struct core_cpu *cc; int error, found_interrupt, ri; PMCDBG3(MDP,INT, 1, "cpu=%d tf=%p um=%d", curcpu, (void *) tf, TRAPF_USERMODE(tf)); found_interrupt = 0; cc = core_pcpu[curcpu]; for (ri = 0; ri < core_iap_npmc; ri++) { if ((pm = cc->pc_corepmcs[ri].phw_pmc) == NULL || !PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm))) continue; if (!iap_pmc_has_overflowed(ri)) continue; found_interrupt = 1; if (pm->pm_state != PMC_STATE_RUNNING) continue; error = pmc_process_interrupt(PMC_HR, pm, tf); v = pm->pm_sc.pm_reloadcount; v = iap_reload_count_to_perfctr_value(v); /* * Stop the counter, reload it but only restart it if * the PMC is not stalled. */ wrmsr(IAP_EVSEL0 + ri, pm->pm_md.pm_iap.pm_iap_evsel); wrmsr(core_iap_wroffset + IAP_PMC0 + ri, v); if (__predict_false(error)) continue; wrmsr(IAP_EVSEL0 + ri, pm->pm_md.pm_iap.pm_iap_evsel | IAP_EN); } if (found_interrupt) counter_u64_add(pmc_stats.pm_intr_processed, 1); else counter_u64_add(pmc_stats.pm_intr_ignored, 1); if (found_interrupt) lapic_reenable_pmc(); return (found_interrupt); } static int core2_intr(struct trapframe *tf) { int error, found_interrupt = 0, n, cpu; uint64_t flag, intrstatus, intrdisable = 0; struct pmc *pm; struct core_cpu *cc; pmc_value_t v; cpu = curcpu; PMCDBG3(MDP,INT, 1, "cpu=%d tf=0x%p um=%d", cpu, (void *) tf, TRAPF_USERMODE(tf)); /* * The IA_GLOBAL_STATUS (MSR 0x38E) register indicates which * PMCs have a pending PMI interrupt. We take a 'snapshot' of * the current set of interrupting PMCs and process these * after stopping them. */ intrstatus = rdmsr(IA_GLOBAL_STATUS); PMCDBG2(MDP,INT, 1, "cpu=%d intrstatus=%jx", cpu, (uintmax_t) intrstatus); /* * Stop PMCs unless hardware already done it. */ if ((intrstatus & IA_GLOBAL_STATUS_FLAG_CTR_FRZ) == 0) wrmsr(IA_GLOBAL_CTRL, 0); cc = core_pcpu[cpu]; KASSERT(cc != NULL, ("[core,%d] null pcpu", __LINE__)); /* * Look for interrupts from fixed function PMCs. */ for (n = 0, flag = (1ULL << IAF_OFFSET); n < core_iaf_npmc; n++, flag <<= 1) { if ((intrstatus & flag) == 0) continue; found_interrupt = 1; pm = cc->pc_corepmcs[n + core_iaf_ri].phw_pmc; if (pm == NULL || pm->pm_state != PMC_STATE_RUNNING || !PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm))) continue; error = pmc_process_interrupt(PMC_HR, pm, tf); if (__predict_false(error)) intrdisable |= flag; v = iaf_reload_count_to_perfctr_value(pm->pm_sc.pm_reloadcount); /* Reload sampling count. */ wrmsr(IAF_CTR0 + n, v); PMCDBG4(MDP,INT, 1, "iaf-intr cpu=%d error=%d v=%jx(%jx)", curcpu, error, (uintmax_t) v, (uintmax_t) rdpmc(IAF_RI_TO_MSR(n))); } /* * Process interrupts from the programmable counters. */ for (n = 0, flag = 1; n < core_iap_npmc; n++, flag <<= 1) { if ((intrstatus & flag) == 0) continue; found_interrupt = 1; pm = cc->pc_corepmcs[n].phw_pmc; if (pm == NULL || pm->pm_state != PMC_STATE_RUNNING || !PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm))) continue; error = pmc_process_interrupt(PMC_HR, pm, tf); if (__predict_false(error)) intrdisable |= flag; v = iap_reload_count_to_perfctr_value(pm->pm_sc.pm_reloadcount); PMCDBG3(MDP,INT, 1, "iap-intr cpu=%d error=%d v=%jx", cpu, error, (uintmax_t) v); /* Reload sampling count. */ wrmsr(core_iap_wroffset + IAP_PMC0 + n, v); } if (found_interrupt) counter_u64_add(pmc_stats.pm_intr_processed, 1); else counter_u64_add(pmc_stats.pm_intr_ignored, 1); if (found_interrupt) lapic_reenable_pmc(); /* * Reenable all non-stalled PMCs. */ if ((intrstatus & IA_GLOBAL_STATUS_FLAG_CTR_FRZ) == 0) { wrmsr(IA_GLOBAL_OVF_CTRL, intrstatus); cc->pc_globalctrl &= ~intrdisable; wrmsr(IA_GLOBAL_CTRL, cc->pc_globalctrl); } else { if (__predict_false(intrdisable)) { cc->pc_globalctrl &= ~intrdisable; wrmsr(IA_GLOBAL_CTRL, cc->pc_globalctrl); } wrmsr(IA_GLOBAL_OVF_CTRL, intrstatus); } PMCDBG4(MDP, INT, 1, "cpu=%d fixedctrl=%jx globalctrl=%jx status=%jx", cpu, (uintmax_t) rdmsr(IAF_CTRL), (uintmax_t) rdmsr(IA_GLOBAL_CTRL), (uintmax_t) rdmsr(IA_GLOBAL_STATUS)); return (found_interrupt); } int pmc_core_initialize(struct pmc_mdep *md, int maxcpu, int version_override) { int cpuid[CORE_CPUID_REQUEST_SIZE]; int flags, nflags; do_cpuid(CORE_CPUID_REQUEST, cpuid); core_cputype = md->pmd_cputype; core_version = (version_override > 0) ? version_override : cpuid[CORE_CPUID_EAX] & 0xFF; PMCDBG3(MDP,INI,1,"core-init cputype=%d ncpu=%d version=%d", core_cputype, maxcpu, core_version); if (core_version < 1 || core_version > 5 || (core_cputype != PMC_CPU_INTEL_CORE && core_version == 1)) { /* Unknown PMC architecture. */ printf("hwpmc_core: unknown PMC architecture: %d\n", core_version); return (EPROGMISMATCH); } core_iap_wroffset = 0; if (cpu_feature2 & CPUID2_PDCM) { if (rdmsr(IA32_PERF_CAPABILITIES) & PERFCAP_FW_WRITE) { PMCDBG0(MDP, INI, 1, "core-init full-width write supported"); core_iap_wroffset = IAP_A_PMC0 - IAP_PMC0; } else PMCDBG0(MDP, INI, 1, "core-init full-width write NOT supported"); } else PMCDBG0(MDP, INI, 1, "core-init pdcm not supported"); core_pmcmask = 0; /* * Initialize programmable counters. */ core_iap_npmc = (cpuid[CORE_CPUID_EAX] >> 8) & 0xFF; core_iap_width = (cpuid[CORE_CPUID_EAX] >> 16) & 0xFF; core_pmcmask |= ((1ULL << core_iap_npmc) - 1); nflags = (cpuid[CORE_CPUID_EAX] >> 24) & 0xFF; flags = cpuid[CORE_CPUID_EBX] & ((1 << nflags) - 1); iap_initialize(md, maxcpu, core_iap_npmc, core_iap_width, flags); /* * Initialize fixed function counters, if present. */ if (core_version >= 2) { core_iaf_ri = core_iap_npmc; core_iaf_npmc = cpuid[CORE_CPUID_EDX] & 0x1F; core_iaf_width = (cpuid[CORE_CPUID_EDX] >> 5) & 0xFF; iaf_initialize(md, maxcpu, core_iaf_npmc, core_iaf_width); core_pmcmask |= ((1ULL << core_iaf_npmc) - 1) << IAF_OFFSET; } PMCDBG2(MDP,INI,1,"core-init pmcmask=0x%jx iafri=%d", core_pmcmask, core_iaf_ri); core_pcpu = malloc(sizeof(*core_pcpu) * maxcpu, M_PMC, M_ZERO | M_WAITOK); /* * Choose the appropriate interrupt handler. */ if (core_version >= 2) md->pmd_intr = core2_intr; else md->pmd_intr = core_intr; - md->pmd_pcpu_fini = NULL; - md->pmd_pcpu_init = NULL; - return (0); } void pmc_core_finalize(struct pmc_mdep *md) { PMCDBG0(MDP,INI,1, "core-finalize"); free(core_pcpu, M_PMC); core_pcpu = NULL; } diff --git a/sys/dev/hwpmc/hwpmc_mod.c b/sys/dev/hwpmc/hwpmc_mod.c index e5a12b666e1b..d5de1874c59b 100644 --- a/sys/dev/hwpmc/hwpmc_mod.c +++ b/sys/dev/hwpmc/hwpmc_mod.c @@ -1,5982 +1,5976 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2003-2008 Joseph Koshy * Copyright (c) 2007 The FreeBSD Foundation * Copyright (c) 2018 Matthew Macy * All rights reserved. * * Portions of this software were developed by A. Joseph Koshy under * sponsorship from the FreeBSD Foundation and Google, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* needs to be after */ #include #include #include #include #include #include #include #include "hwpmc_soft.h" #define PMC_EPOCH_ENTER() struct epoch_tracker pmc_et; epoch_enter_preempt(global_epoch_preempt, &pmc_et) #define PMC_EPOCH_EXIT() epoch_exit_preempt(global_epoch_preempt, &pmc_et) /* * Types */ enum pmc_flags { PMC_FLAG_NONE = 0x00, /* do nothing */ PMC_FLAG_REMOVE = 0x01, /* atomically remove entry from hash */ PMC_FLAG_ALLOCATE = 0x02, /* add entry to hash if not found */ PMC_FLAG_NOWAIT = 0x04, /* do not wait for mallocs */ }; /* * The offset in sysent where the syscall is allocated. */ static int pmc_syscall_num = NO_SYSCALL; struct pmc_cpu **pmc_pcpu; /* per-cpu state */ pmc_value_t *pmc_pcpu_saved; /* saved PMC values: CSW handling */ #define PMC_PCPU_SAVED(C,R) pmc_pcpu_saved[(R) + md->pmd_npmc*(C)] struct mtx_pool *pmc_mtxpool; static int *pmc_pmcdisp; /* PMC row dispositions */ #define PMC_ROW_DISP_IS_FREE(R) (pmc_pmcdisp[(R)] == 0) #define PMC_ROW_DISP_IS_THREAD(R) (pmc_pmcdisp[(R)] > 0) #define PMC_ROW_DISP_IS_STANDALONE(R) (pmc_pmcdisp[(R)] < 0) #define PMC_MARK_ROW_FREE(R) do { \ pmc_pmcdisp[(R)] = 0; \ } while (0) #define PMC_MARK_ROW_STANDALONE(R) do { \ KASSERT(pmc_pmcdisp[(R)] <= 0, ("[pmc,%d] row disposition error", \ __LINE__)); \ atomic_add_int(&pmc_pmcdisp[(R)], -1); \ KASSERT(pmc_pmcdisp[(R)] >= (-pmc_cpu_max_active()), \ ("[pmc,%d] row disposition error", __LINE__)); \ } while (0) #define PMC_UNMARK_ROW_STANDALONE(R) do { \ atomic_add_int(&pmc_pmcdisp[(R)], 1); \ KASSERT(pmc_pmcdisp[(R)] <= 0, ("[pmc,%d] row disposition error", \ __LINE__)); \ } while (0) #define PMC_MARK_ROW_THREAD(R) do { \ KASSERT(pmc_pmcdisp[(R)] >= 0, ("[pmc,%d] row disposition error", \ __LINE__)); \ atomic_add_int(&pmc_pmcdisp[(R)], 1); \ } while (0) #define PMC_UNMARK_ROW_THREAD(R) do { \ atomic_add_int(&pmc_pmcdisp[(R)], -1); \ KASSERT(pmc_pmcdisp[(R)] >= 0, ("[pmc,%d] row disposition error", \ __LINE__)); \ } while (0) /* various event handlers */ static eventhandler_tag pmc_exit_tag, pmc_fork_tag, pmc_kld_load_tag, pmc_kld_unload_tag; /* Module statistics */ struct pmc_driverstats pmc_stats; /* Machine/processor dependent operations */ static struct pmc_mdep *md; /* * Hash tables mapping owner processes and target threads to PMCs. */ struct mtx pmc_processhash_mtx; /* spin mutex */ static u_long pmc_processhashmask; static LIST_HEAD(pmc_processhash, pmc_process) *pmc_processhash; /* * Hash table of PMC owner descriptors. This table is protected by * the shared PMC "sx" lock. */ static u_long pmc_ownerhashmask; static LIST_HEAD(pmc_ownerhash, pmc_owner) *pmc_ownerhash; /* * List of PMC owners with system-wide sampling PMCs. */ static CK_LIST_HEAD(, pmc_owner) pmc_ss_owners; /* * List of free thread entries. This is protected by the spin * mutex. */ static struct mtx pmc_threadfreelist_mtx; /* spin mutex */ static LIST_HEAD(, pmc_thread) pmc_threadfreelist; static int pmc_threadfreelist_entries=0; #define THREADENTRY_SIZE \ (sizeof(struct pmc_thread) + (md->pmd_npmc * sizeof(struct pmc_threadpmcstate))) /* * Task to free thread descriptors */ static struct task free_task; /* * A map of row indices to classdep structures. */ static struct pmc_classdep **pmc_rowindex_to_classdep; /* * Prototypes */ #ifdef HWPMC_DEBUG static int pmc_debugflags_sysctl_handler(SYSCTL_HANDLER_ARGS); static int pmc_debugflags_parse(char *newstr, char *fence); #endif static int load(struct module *module, int cmd, void *arg); static int pmc_add_sample(ring_type_t ring, struct pmc *pm, struct trapframe *tf); static void pmc_add_thread_descriptors_from_proc(struct proc *p, struct pmc_process *pp); static int pmc_attach_process(struct proc *p, struct pmc *pm); static struct pmc *pmc_allocate_pmc_descriptor(void); static struct pmc_owner *pmc_allocate_owner_descriptor(struct proc *p); static int pmc_attach_one_process(struct proc *p, struct pmc *pm); static int pmc_can_allocate_rowindex(struct proc *p, unsigned int ri, int cpu); static int pmc_can_attach(struct pmc *pm, struct proc *p); static void pmc_capture_user_callchain(int cpu, int soft, struct trapframe *tf); static void pmc_cleanup(void); static int pmc_detach_process(struct proc *p, struct pmc *pm); static int pmc_detach_one_process(struct proc *p, struct pmc *pm, int flags); static void pmc_destroy_owner_descriptor(struct pmc_owner *po); static void pmc_destroy_pmc_descriptor(struct pmc *pm); static void pmc_destroy_process_descriptor(struct pmc_process *pp); static struct pmc_owner *pmc_find_owner_descriptor(struct proc *p); static int pmc_find_pmc(pmc_id_t pmcid, struct pmc **pm); static struct pmc *pmc_find_pmc_descriptor_in_process(struct pmc_owner *po, pmc_id_t pmc); static struct pmc_process *pmc_find_process_descriptor(struct proc *p, uint32_t mode); static struct pmc_thread *pmc_find_thread_descriptor(struct pmc_process *pp, struct thread *td, uint32_t mode); static void pmc_force_context_switch(void); static void pmc_link_target_process(struct pmc *pm, struct pmc_process *pp); static void pmc_log_all_process_mappings(struct pmc_owner *po); static void pmc_log_kernel_mappings(struct pmc *pm); static void pmc_log_process_mappings(struct pmc_owner *po, struct proc *p); static void pmc_maybe_remove_owner(struct pmc_owner *po); static void pmc_process_csw_in(struct thread *td); static void pmc_process_csw_out(struct thread *td); static void pmc_process_exit(void *arg, struct proc *p); static void pmc_process_fork(void *arg, struct proc *p1, struct proc *p2, int n); static void pmc_process_samples(int cpu, ring_type_t soft); static void pmc_release_pmc_descriptor(struct pmc *pmc); static void pmc_process_thread_add(struct thread *td); static void pmc_process_thread_delete(struct thread *td); static void pmc_process_thread_userret(struct thread *td); static void pmc_remove_owner(struct pmc_owner *po); static void pmc_remove_process_descriptor(struct pmc_process *pp); static int pmc_start(struct pmc *pm); static int pmc_stop(struct pmc *pm); static int pmc_syscall_handler(struct thread *td, void *syscall_args); static struct pmc_thread *pmc_thread_descriptor_pool_alloc(void); static void pmc_thread_descriptor_pool_drain(void); static void pmc_thread_descriptor_pool_free(struct pmc_thread *pt); static void pmc_unlink_target_process(struct pmc *pmc, struct pmc_process *pp); static int generic_switch_in(struct pmc_cpu *pc, struct pmc_process *pp); static int generic_switch_out(struct pmc_cpu *pc, struct pmc_process *pp); static struct pmc_mdep *pmc_generic_cpu_initialize(void); static void pmc_generic_cpu_finalize(struct pmc_mdep *md); static void pmc_post_callchain_callback(void); static void pmc_process_threadcreate(struct thread *td); static void pmc_process_threadexit(struct thread *td); static void pmc_process_proccreate(struct proc *p); static void pmc_process_allproc(struct pmc *pm); /* * Kernel tunables and sysctl(8) interface. */ SYSCTL_DECL(_kern_hwpmc); SYSCTL_NODE(_kern_hwpmc, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "HWPMC stats"); /* Stats. */ SYSCTL_COUNTER_U64(_kern_hwpmc_stats, OID_AUTO, intr_ignored, CTLFLAG_RW, &pmc_stats.pm_intr_ignored, "# of interrupts ignored"); SYSCTL_COUNTER_U64(_kern_hwpmc_stats, OID_AUTO, intr_processed, CTLFLAG_RW, &pmc_stats.pm_intr_processed, "# of interrupts processed"); SYSCTL_COUNTER_U64(_kern_hwpmc_stats, OID_AUTO, intr_bufferfull, CTLFLAG_RW, &pmc_stats.pm_intr_bufferfull, "# of interrupts where buffer was full"); SYSCTL_COUNTER_U64(_kern_hwpmc_stats, OID_AUTO, syscalls, CTLFLAG_RW, &pmc_stats.pm_syscalls, "# of syscalls"); SYSCTL_COUNTER_U64(_kern_hwpmc_stats, OID_AUTO, syscall_errors, CTLFLAG_RW, &pmc_stats.pm_syscall_errors, "# of syscall_errors"); SYSCTL_COUNTER_U64(_kern_hwpmc_stats, OID_AUTO, buffer_requests, CTLFLAG_RW, &pmc_stats.pm_buffer_requests, "# of buffer requests"); SYSCTL_COUNTER_U64(_kern_hwpmc_stats, OID_AUTO, buffer_requests_failed, CTLFLAG_RW, &pmc_stats.pm_buffer_requests_failed, "# of buffer requests which failed"); SYSCTL_COUNTER_U64(_kern_hwpmc_stats, OID_AUTO, log_sweeps, CTLFLAG_RW, &pmc_stats.pm_log_sweeps, "# of times samples were processed"); SYSCTL_COUNTER_U64(_kern_hwpmc_stats, OID_AUTO, merges, CTLFLAG_RW, &pmc_stats.pm_merges, "# of times kernel stack was found for user trace"); SYSCTL_COUNTER_U64(_kern_hwpmc_stats, OID_AUTO, overwrites, CTLFLAG_RW, &pmc_stats.pm_overwrites, "# of times a sample was overwritten before being logged"); static int pmc_callchaindepth = PMC_CALLCHAIN_DEPTH; SYSCTL_INT(_kern_hwpmc, OID_AUTO, callchaindepth, CTLFLAG_RDTUN, &pmc_callchaindepth, 0, "depth of call chain records"); char pmc_cpuid[PMC_CPUID_LEN]; SYSCTL_STRING(_kern_hwpmc, OID_AUTO, cpuid, CTLFLAG_RD, pmc_cpuid, 0, "cpu version string"); #ifdef HWPMC_DEBUG struct pmc_debugflags pmc_debugflags = PMC_DEBUG_DEFAULT_FLAGS; char pmc_debugstr[PMC_DEBUG_STRSIZE]; TUNABLE_STR(PMC_SYSCTL_NAME_PREFIX "debugflags", pmc_debugstr, sizeof(pmc_debugstr)); SYSCTL_PROC(_kern_hwpmc, OID_AUTO, debugflags, CTLTYPE_STRING | CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, 0, 0, pmc_debugflags_sysctl_handler, "A", "debug flags"); #endif /* * kern.hwpmc.hashrows -- determines the number of rows in the * of the hash table used to look up threads */ static int pmc_hashsize = PMC_HASH_SIZE; SYSCTL_INT(_kern_hwpmc, OID_AUTO, hashsize, CTLFLAG_RDTUN, &pmc_hashsize, 0, "rows in hash tables"); /* * kern.hwpmc.nsamples --- number of PC samples/callchain stacks per CPU */ static int pmc_nsamples = PMC_NSAMPLES; SYSCTL_INT(_kern_hwpmc, OID_AUTO, nsamples, CTLFLAG_RDTUN, &pmc_nsamples, 0, "number of PC samples per CPU"); static uint64_t pmc_sample_mask = PMC_NSAMPLES-1; /* * kern.hwpmc.mtxpoolsize -- number of mutexes in the mutex pool. */ static int pmc_mtxpool_size = PMC_MTXPOOL_SIZE; SYSCTL_INT(_kern_hwpmc, OID_AUTO, mtxpoolsize, CTLFLAG_RDTUN, &pmc_mtxpool_size, 0, "size of spin mutex pool"); /* * kern.hwpmc.threadfreelist_entries -- number of free entries */ SYSCTL_INT(_kern_hwpmc, OID_AUTO, threadfreelist_entries, CTLFLAG_RD, &pmc_threadfreelist_entries, 0, "number of available thread entries"); /* * kern.hwpmc.threadfreelist_max -- maximum number of free entries */ static int pmc_threadfreelist_max = PMC_THREADLIST_MAX; SYSCTL_INT(_kern_hwpmc, OID_AUTO, threadfreelist_max, CTLFLAG_RW, &pmc_threadfreelist_max, 0, "maximum number of available thread entries before freeing some"); /* * kern.hwpmc.mincount -- minimum sample count */ static u_int pmc_mincount = 1000; SYSCTL_INT(_kern_hwpmc, OID_AUTO, mincount, CTLFLAG_RWTUN, &pmc_mincount, 0, "minimum count for sampling counters"); /* * security.bsd.unprivileged_syspmcs -- allow non-root processes to * allocate system-wide PMCs. * * Allowing unprivileged processes to allocate system PMCs is convenient * if system-wide measurements need to be taken concurrently with other * per-process measurements. This feature is turned off by default. */ static int pmc_unprivileged_syspmcs = 0; SYSCTL_INT(_security_bsd, OID_AUTO, unprivileged_syspmcs, CTLFLAG_RWTUN, &pmc_unprivileged_syspmcs, 0, "allow unprivileged process to allocate system PMCs"); /* * Hash function. Discard the lower 2 bits of the pointer since * these are always zero for our uses. The hash multiplier is * round((2^LONG_BIT) * ((sqrt(5)-1)/2)). */ #if LONG_BIT == 64 #define _PMC_HM 11400714819323198486u #elif LONG_BIT == 32 #define _PMC_HM 2654435769u #else #error Must know the size of 'long' to compile #endif #define PMC_HASH_PTR(P,M) ((((unsigned long) (P) >> 2) * _PMC_HM) & (M)) /* * Syscall structures */ /* The `sysent' for the new syscall */ static struct sysent pmc_sysent = { .sy_narg = 2, .sy_call = pmc_syscall_handler, }; static struct syscall_module_data pmc_syscall_mod = { .chainevh = load, .chainarg = NULL, .offset = &pmc_syscall_num, .new_sysent = &pmc_sysent, .old_sysent = { .sy_narg = 0, .sy_call = NULL }, .flags = SY_THR_STATIC_KLD, }; static moduledata_t pmc_mod = { .name = PMC_MODULE_NAME, .evhand = syscall_module_handler, .priv = &pmc_syscall_mod, }; #ifdef EARLY_AP_STARTUP DECLARE_MODULE(pmc, pmc_mod, SI_SUB_SYSCALLS, SI_ORDER_ANY); #else DECLARE_MODULE(pmc, pmc_mod, SI_SUB_SMP, SI_ORDER_ANY); #endif MODULE_VERSION(pmc, PMC_VERSION); #ifdef HWPMC_DEBUG enum pmc_dbgparse_state { PMCDS_WS, /* in whitespace */ PMCDS_MAJOR, /* seen a major keyword */ PMCDS_MINOR }; static int pmc_debugflags_parse(char *newstr, char *fence) { char c, *p, *q; struct pmc_debugflags *tmpflags; int error, found, *newbits, tmp; size_t kwlen; tmpflags = malloc(sizeof(*tmpflags), M_PMC, M_WAITOK|M_ZERO); p = newstr; error = 0; for (; p < fence && (c = *p); p++) { /* skip white space */ if (c == ' ' || c == '\t') continue; /* look for a keyword followed by "=" */ for (q = p; p < fence && (c = *p) && c != '='; p++) ; if (c != '=') { error = EINVAL; goto done; } kwlen = p - q; newbits = NULL; /* lookup flag group name */ #define DBG_SET_FLAG_MAJ(S,F) \ if (kwlen == sizeof(S)-1 && strncmp(q, S, kwlen) == 0) \ newbits = &tmpflags->pdb_ ## F; DBG_SET_FLAG_MAJ("cpu", CPU); DBG_SET_FLAG_MAJ("csw", CSW); DBG_SET_FLAG_MAJ("logging", LOG); DBG_SET_FLAG_MAJ("module", MOD); DBG_SET_FLAG_MAJ("md", MDP); DBG_SET_FLAG_MAJ("owner", OWN); DBG_SET_FLAG_MAJ("pmc", PMC); DBG_SET_FLAG_MAJ("process", PRC); DBG_SET_FLAG_MAJ("sampling", SAM); if (newbits == NULL) { error = EINVAL; goto done; } p++; /* skip the '=' */ /* Now parse the individual flags */ tmp = 0; newflag: for (q = p; p < fence && (c = *p); p++) if (c == ' ' || c == '\t' || c == ',') break; /* p == fence or c == ws or c == "," or c == 0 */ if ((kwlen = p - q) == 0) { *newbits = tmp; continue; } found = 0; #define DBG_SET_FLAG_MIN(S,F) \ if (kwlen == sizeof(S)-1 && strncmp(q, S, kwlen) == 0) \ tmp |= found = (1 << PMC_DEBUG_MIN_ ## F) /* a '*' denotes all possible flags in the group */ if (kwlen == 1 && *q == '*') tmp = found = ~0; /* look for individual flag names */ DBG_SET_FLAG_MIN("allocaterow", ALR); DBG_SET_FLAG_MIN("allocate", ALL); DBG_SET_FLAG_MIN("attach", ATT); DBG_SET_FLAG_MIN("bind", BND); DBG_SET_FLAG_MIN("config", CFG); DBG_SET_FLAG_MIN("exec", EXC); DBG_SET_FLAG_MIN("exit", EXT); DBG_SET_FLAG_MIN("find", FND); DBG_SET_FLAG_MIN("flush", FLS); DBG_SET_FLAG_MIN("fork", FRK); DBG_SET_FLAG_MIN("getbuf", GTB); DBG_SET_FLAG_MIN("hook", PMH); DBG_SET_FLAG_MIN("init", INI); DBG_SET_FLAG_MIN("intr", INT); DBG_SET_FLAG_MIN("linktarget", TLK); DBG_SET_FLAG_MIN("mayberemove", OMR); DBG_SET_FLAG_MIN("ops", OPS); DBG_SET_FLAG_MIN("read", REA); DBG_SET_FLAG_MIN("register", REG); DBG_SET_FLAG_MIN("release", REL); DBG_SET_FLAG_MIN("remove", ORM); DBG_SET_FLAG_MIN("sample", SAM); DBG_SET_FLAG_MIN("scheduleio", SIO); DBG_SET_FLAG_MIN("select", SEL); DBG_SET_FLAG_MIN("signal", SIG); DBG_SET_FLAG_MIN("swi", SWI); DBG_SET_FLAG_MIN("swo", SWO); DBG_SET_FLAG_MIN("start", STA); DBG_SET_FLAG_MIN("stop", STO); DBG_SET_FLAG_MIN("syscall", PMS); DBG_SET_FLAG_MIN("unlinktarget", TUL); DBG_SET_FLAG_MIN("write", WRI); if (found == 0) { /* unrecognized flag name */ error = EINVAL; goto done; } if (c == 0 || c == ' ' || c == '\t') { /* end of flag group */ *newbits = tmp; continue; } p++; goto newflag; } /* save the new flag set */ bcopy(tmpflags, &pmc_debugflags, sizeof(pmc_debugflags)); done: free(tmpflags, M_PMC); return error; } static int pmc_debugflags_sysctl_handler(SYSCTL_HANDLER_ARGS) { char *fence, *newstr; int error; unsigned int n; (void) arg1; (void) arg2; /* unused parameters */ n = sizeof(pmc_debugstr); newstr = malloc(n, M_PMC, M_WAITOK|M_ZERO); (void) strlcpy(newstr, pmc_debugstr, n); error = sysctl_handle_string(oidp, newstr, n, req); /* if there is a new string, parse and copy it */ if (error == 0 && req->newptr != NULL) { fence = newstr + (n < req->newlen ? n : req->newlen + 1); if ((error = pmc_debugflags_parse(newstr, fence)) == 0) (void) strlcpy(pmc_debugstr, newstr, sizeof(pmc_debugstr)); } free(newstr, M_PMC); return error; } #endif /* * Map a row index to a classdep structure and return the adjusted row * index for the PMC class index. */ static struct pmc_classdep * pmc_ri_to_classdep(struct pmc_mdep *md, int ri, int *adjri) { struct pmc_classdep *pcd; (void) md; KASSERT(ri >= 0 && ri < md->pmd_npmc, ("[pmc,%d] illegal row-index %d", __LINE__, ri)); pcd = pmc_rowindex_to_classdep[ri]; KASSERT(pcd != NULL, ("[pmc,%d] ri %d null pcd", __LINE__, ri)); *adjri = ri - pcd->pcd_ri; KASSERT(*adjri >= 0 && *adjri < pcd->pcd_num, ("[pmc,%d] adjusted row-index %d", __LINE__, *adjri)); return (pcd); } /* * Concurrency Control * * The driver manages the following data structures: * * - target process descriptors, one per target process * - owner process descriptors (and attached lists), one per owner process * - lookup hash tables for owner and target processes * - PMC descriptors (and attached lists) * - per-cpu hardware state * - the 'hook' variable through which the kernel calls into * this module * - the machine hardware state (managed by the MD layer) * * These data structures are accessed from: * * - thread context-switch code * - interrupt handlers (possibly on multiple cpus) * - kernel threads on multiple cpus running on behalf of user * processes doing system calls * - this driver's private kernel threads * * = Locks and Locking strategy = * * The driver uses four locking strategies for its operation: * * - The global SX lock "pmc_sx" is used to protect internal * data structures. * * Calls into the module by syscall() start with this lock being * held in exclusive mode. Depending on the requested operation, * the lock may be downgraded to 'shared' mode to allow more * concurrent readers into the module. Calls into the module from * other parts of the kernel acquire the lock in shared mode. * * This SX lock is held in exclusive mode for any operations that * modify the linkages between the driver's internal data structures. * * The 'pmc_hook' function pointer is also protected by this lock. * It is only examined with the sx lock held in exclusive mode. The * kernel module is allowed to be unloaded only with the sx lock held * in exclusive mode. In normal syscall handling, after acquiring the * pmc_sx lock we first check that 'pmc_hook' is non-null before * proceeding. This prevents races between the thread unloading the module * and other threads seeking to use the module. * * - Lookups of target process structures and owner process structures * cannot use the global "pmc_sx" SX lock because these lookups need * to happen during context switches and in other critical sections * where sleeping is not allowed. We protect these lookup tables * with their own private spin-mutexes, "pmc_processhash_mtx" and * "pmc_ownerhash_mtx". * * - Interrupt handlers work in a lock free manner. At interrupt * time, handlers look at the PMC pointer (phw->phw_pmc) configured * when the PMC was started. If this pointer is NULL, the interrupt * is ignored after updating driver statistics. We ensure that this * pointer is set (using an atomic operation if necessary) before the * PMC hardware is started. Conversely, this pointer is unset atomically * only after the PMC hardware is stopped. * * We ensure that everything needed for the operation of an * interrupt handler is available without it needing to acquire any * locks. We also ensure that a PMC's software state is destroyed only * after the PMC is taken off hardware (on all CPUs). * * - Context-switch handling with process-private PMCs needs more * care. * * A given process may be the target of multiple PMCs. For example, * PMCATTACH and PMCDETACH may be requested by a process on one CPU * while the target process is running on another. A PMC could also * be getting released because its owner is exiting. We tackle * these situations in the following manner: * * - each target process structure 'pmc_process' has an array * of 'struct pmc *' pointers, one for each hardware PMC. * * - At context switch IN time, each "target" PMC in RUNNING state * gets started on hardware and a pointer to each PMC is copied into * the per-cpu phw array. The 'runcount' for the PMC is * incremented. * * - At context switch OUT time, all process-virtual PMCs are stopped * on hardware. The saved value is added to the PMCs value field * only if the PMC is in a non-deleted state (the PMCs state could * have changed during the current time slice). * * Note that since in-between a switch IN on a processor and a switch * OUT, the PMC could have been released on another CPU. Therefore * context switch OUT always looks at the hardware state to turn * OFF PMCs and will update a PMC's saved value only if reachable * from the target process record. * * - OP PMCRELEASE could be called on a PMC at any time (the PMC could * be attached to many processes at the time of the call and could * be active on multiple CPUs). * * We prevent further scheduling of the PMC by marking it as in * state 'DELETED'. If the runcount of the PMC is non-zero then * this PMC is currently running on a CPU somewhere. The thread * doing the PMCRELEASE operation waits by repeatedly doing a * pause() till the runcount comes to zero. * * The contents of a PMC descriptor (struct pmc) are protected using * a spin-mutex. In order to save space, we use a mutex pool. * * In terms of lock types used by witness(4), we use: * - Type "pmc-sx", used by the global SX lock. * - Type "pmc-sleep", for sleep mutexes used by logger threads. * - Type "pmc-per-proc", for protecting PMC owner descriptors. * - Type "pmc-leaf", used for all other spin mutexes. */ /* * save the cpu binding of the current kthread */ void pmc_save_cpu_binding(struct pmc_binding *pb) { PMCDBG0(CPU,BND,2, "save-cpu"); thread_lock(curthread); pb->pb_bound = sched_is_bound(curthread); pb->pb_cpu = curthread->td_oncpu; pb->pb_priority = curthread->td_priority; thread_unlock(curthread); PMCDBG1(CPU,BND,2, "save-cpu cpu=%d", pb->pb_cpu); } /* * restore the cpu binding of the current thread */ void pmc_restore_cpu_binding(struct pmc_binding *pb) { PMCDBG2(CPU,BND,2, "restore-cpu curcpu=%d restore=%d", curthread->td_oncpu, pb->pb_cpu); thread_lock(curthread); sched_bind(curthread, pb->pb_cpu); if (!pb->pb_bound) sched_unbind(curthread); sched_prio(curthread, pb->pb_priority); thread_unlock(curthread); PMCDBG0(CPU,BND,2, "restore-cpu done"); } /* * move execution over the specified cpu and bind it there. */ void pmc_select_cpu(int cpu) { KASSERT(cpu >= 0 && cpu < pmc_cpu_max(), ("[pmc,%d] bad cpu number %d", __LINE__, cpu)); /* Never move to an inactive CPU. */ KASSERT(pmc_cpu_is_active(cpu), ("[pmc,%d] selecting inactive " "CPU %d", __LINE__, cpu)); PMCDBG1(CPU,SEL,2, "select-cpu cpu=%d", cpu); thread_lock(curthread); sched_prio(curthread, PRI_MIN); sched_bind(curthread, cpu); thread_unlock(curthread); KASSERT(curthread->td_oncpu == cpu, ("[pmc,%d] CPU not bound [cpu=%d, curr=%d]", __LINE__, cpu, curthread->td_oncpu)); PMCDBG1(CPU,SEL,2, "select-cpu cpu=%d ok", cpu); } /* * Force a context switch. * * We do this by pause'ing for 1 tick -- invoking mi_switch() is not * guaranteed to force a context switch. */ static void pmc_force_context_switch(void) { pause("pmcctx", 1); } uint64_t pmc_rdtsc(void) { #if defined(__i386__) || defined(__amd64__) if (__predict_true(amd_feature & AMDID_RDTSCP)) return rdtscp(); else return rdtsc(); #else return get_cyclecount(); #endif } /* * Get the file name for an executable. This is a simple wrapper * around vn_fullpath(9). */ static void pmc_getfilename(struct vnode *v, char **fullpath, char **freepath) { *fullpath = "unknown"; *freepath = NULL; vn_fullpath(v, fullpath, freepath); } /* * remove an process owning PMCs */ void pmc_remove_owner(struct pmc_owner *po) { struct pmc *pm, *tmp; sx_assert(&pmc_sx, SX_XLOCKED); PMCDBG1(OWN,ORM,1, "remove-owner po=%p", po); /* Remove descriptor from the owner hash table */ LIST_REMOVE(po, po_next); /* release all owned PMC descriptors */ LIST_FOREACH_SAFE(pm, &po->po_pmcs, pm_next, tmp) { PMCDBG1(OWN,ORM,2, "pmc=%p", pm); KASSERT(pm->pm_owner == po, ("[pmc,%d] owner %p != po %p", __LINE__, pm->pm_owner, po)); pmc_release_pmc_descriptor(pm); /* will unlink from the list */ pmc_destroy_pmc_descriptor(pm); } KASSERT(po->po_sscount == 0, ("[pmc,%d] SS count not zero", __LINE__)); KASSERT(LIST_EMPTY(&po->po_pmcs), ("[pmc,%d] PMC list not empty", __LINE__)); /* de-configure the log file if present */ if (po->po_flags & PMC_PO_OWNS_LOGFILE) pmclog_deconfigure_log(po); } /* * remove an owner process record if all conditions are met. */ static void pmc_maybe_remove_owner(struct pmc_owner *po) { PMCDBG1(OWN,OMR,1, "maybe-remove-owner po=%p", po); /* * Remove owner record if * - this process does not own any PMCs * - this process has not allocated a system-wide sampling buffer */ if (LIST_EMPTY(&po->po_pmcs) && ((po->po_flags & PMC_PO_OWNS_LOGFILE) == 0)) { pmc_remove_owner(po); pmc_destroy_owner_descriptor(po); } } /* * Add an association between a target process and a PMC. */ static void pmc_link_target_process(struct pmc *pm, struct pmc_process *pp) { int ri; struct pmc_target *pt; #ifdef INVARIANTS struct pmc_thread *pt_td; #endif sx_assert(&pmc_sx, SX_XLOCKED); KASSERT(pm != NULL && pp != NULL, ("[pmc,%d] Null pm %p or pp %p", __LINE__, pm, pp)); KASSERT(PMC_IS_VIRTUAL_MODE(PMC_TO_MODE(pm)), ("[pmc,%d] Attaching a non-process-virtual pmc=%p to pid=%d", __LINE__, pm, pp->pp_proc->p_pid)); KASSERT(pp->pp_refcnt >= 0 && pp->pp_refcnt <= ((int) md->pmd_npmc - 1), ("[pmc,%d] Illegal reference count %d for process record %p", __LINE__, pp->pp_refcnt, (void *) pp)); ri = PMC_TO_ROWINDEX(pm); PMCDBG3(PRC,TLK,1, "link-target pmc=%p ri=%d pmc-process=%p", pm, ri, pp); #ifdef HWPMC_DEBUG LIST_FOREACH(pt, &pm->pm_targets, pt_next) if (pt->pt_process == pp) KASSERT(0, ("[pmc,%d] pp %p already in pmc %p targets", __LINE__, pp, pm)); #endif pt = malloc(sizeof(struct pmc_target), M_PMC, M_WAITOK|M_ZERO); pt->pt_process = pp; LIST_INSERT_HEAD(&pm->pm_targets, pt, pt_next); atomic_store_rel_ptr((uintptr_t *)&pp->pp_pmcs[ri].pp_pmc, (uintptr_t)pm); if (pm->pm_owner->po_owner == pp->pp_proc) pm->pm_flags |= PMC_F_ATTACHED_TO_OWNER; /* * Initialize the per-process values at this row index. */ pp->pp_pmcs[ri].pp_pmcval = PMC_TO_MODE(pm) == PMC_MODE_TS ? pm->pm_sc.pm_reloadcount : 0; pp->pp_refcnt++; #ifdef INVARIANTS /* Confirm that the per-thread values at this row index are cleared. */ if (PMC_TO_MODE(pm) == PMC_MODE_TS) { mtx_lock_spin(pp->pp_tdslock); LIST_FOREACH(pt_td, &pp->pp_tds, pt_next) { KASSERT(pt_td->pt_pmcs[ri].pt_pmcval == (pmc_value_t) 0, ("[pmc,%d] pt_pmcval not cleared for pid=%d at " "ri=%d", __LINE__, pp->pp_proc->p_pid, ri)); } mtx_unlock_spin(pp->pp_tdslock); } #endif } /* * Removes the association between a target process and a PMC. */ static void pmc_unlink_target_process(struct pmc *pm, struct pmc_process *pp) { int ri; struct proc *p; struct pmc_target *ptgt; struct pmc_thread *pt; sx_assert(&pmc_sx, SX_XLOCKED); KASSERT(pm != NULL && pp != NULL, ("[pmc,%d] Null pm %p or pp %p", __LINE__, pm, pp)); KASSERT(pp->pp_refcnt >= 1 && pp->pp_refcnt <= (int) md->pmd_npmc, ("[pmc,%d] Illegal ref count %d on process record %p", __LINE__, pp->pp_refcnt, (void *) pp)); ri = PMC_TO_ROWINDEX(pm); PMCDBG3(PRC,TUL,1, "unlink-target pmc=%p ri=%d pmc-process=%p", pm, ri, pp); KASSERT(pp->pp_pmcs[ri].pp_pmc == pm, ("[pmc,%d] PMC ri %d mismatch pmc %p pp->[ri] %p", __LINE__, ri, pm, pp->pp_pmcs[ri].pp_pmc)); pp->pp_pmcs[ri].pp_pmc = NULL; pp->pp_pmcs[ri].pp_pmcval = (pmc_value_t) 0; /* Clear the per-thread values at this row index. */ if (PMC_TO_MODE(pm) == PMC_MODE_TS) { mtx_lock_spin(pp->pp_tdslock); LIST_FOREACH(pt, &pp->pp_tds, pt_next) pt->pt_pmcs[ri].pt_pmcval = (pmc_value_t) 0; mtx_unlock_spin(pp->pp_tdslock); } /* Remove owner-specific flags */ if (pm->pm_owner->po_owner == pp->pp_proc) { pp->pp_flags &= ~PMC_PP_ENABLE_MSR_ACCESS; pm->pm_flags &= ~PMC_F_ATTACHED_TO_OWNER; } pp->pp_refcnt--; /* Remove the target process from the PMC structure */ LIST_FOREACH(ptgt, &pm->pm_targets, pt_next) if (ptgt->pt_process == pp) break; KASSERT(ptgt != NULL, ("[pmc,%d] process %p (pp: %p) not found " "in pmc %p", __LINE__, pp->pp_proc, pp, pm)); LIST_REMOVE(ptgt, pt_next); free(ptgt, M_PMC); /* if the PMC now lacks targets, send the owner a SIGIO */ if (LIST_EMPTY(&pm->pm_targets)) { p = pm->pm_owner->po_owner; PROC_LOCK(p); kern_psignal(p, SIGIO); PROC_UNLOCK(p); PMCDBG2(PRC,SIG,2, "signalling proc=%p signal=%d", p, SIGIO); } } /* * Check if PMC 'pm' may be attached to target process 't'. */ static int pmc_can_attach(struct pmc *pm, struct proc *t) { struct proc *o; /* pmc owner */ struct ucred *oc, *tc; /* owner, target credentials */ int decline_attach, i; /* * A PMC's owner can always attach that PMC to itself. */ if ((o = pm->pm_owner->po_owner) == t) return 0; PROC_LOCK(o); oc = o->p_ucred; crhold(oc); PROC_UNLOCK(o); PROC_LOCK(t); tc = t->p_ucred; crhold(tc); PROC_UNLOCK(t); /* * The effective uid of the PMC owner should match at least one * of the {effective,real,saved} uids of the target process. */ decline_attach = oc->cr_uid != tc->cr_uid && oc->cr_uid != tc->cr_svuid && oc->cr_uid != tc->cr_ruid; /* * Every one of the target's group ids, must be in the owner's * group list. */ for (i = 0; !decline_attach && i < tc->cr_ngroups; i++) decline_attach = !groupmember(tc->cr_groups[i], oc); /* check the read and saved gids too */ if (decline_attach == 0) decline_attach = !groupmember(tc->cr_rgid, oc) || !groupmember(tc->cr_svgid, oc); crfree(tc); crfree(oc); return !decline_attach; } /* * Attach a process to a PMC. */ static int pmc_attach_one_process(struct proc *p, struct pmc *pm) { int ri, error; char *fullpath, *freepath; struct pmc_process *pp; sx_assert(&pmc_sx, SX_XLOCKED); PMCDBG5(PRC,ATT,2, "attach-one pm=%p ri=%d proc=%p (%d, %s)", pm, PMC_TO_ROWINDEX(pm), p, p->p_pid, p->p_comm); /* * Locate the process descriptor corresponding to process 'p', * allocating space as needed. * * Verify that rowindex 'pm_rowindex' is free in the process * descriptor. * * If not, allocate space for a descriptor and link the * process descriptor and PMC. */ ri = PMC_TO_ROWINDEX(pm); /* mark process as using HWPMCs */ PROC_LOCK(p); p->p_flag |= P_HWPMC; PROC_UNLOCK(p); if ((pp = pmc_find_process_descriptor(p, PMC_FLAG_ALLOCATE)) == NULL) { error = ENOMEM; goto fail; } if (pp->pp_pmcs[ri].pp_pmc == pm) {/* already present at slot [ri] */ error = EEXIST; goto fail; } if (pp->pp_pmcs[ri].pp_pmc != NULL) { error = EBUSY; goto fail; } pmc_link_target_process(pm, pp); if (PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm)) && (pm->pm_flags & PMC_F_ATTACHED_TO_OWNER) == 0) pm->pm_flags |= PMC_F_NEEDS_LOGFILE; pm->pm_flags |= PMC_F_ATTACH_DONE; /* mark as attached */ /* issue an attach event to a configured log file */ if (pm->pm_owner->po_flags & PMC_PO_OWNS_LOGFILE) { if (p->p_flag & P_KPROC) { fullpath = kernelname; freepath = NULL; } else { pmc_getfilename(p->p_textvp, &fullpath, &freepath); pmclog_process_pmcattach(pm, p->p_pid, fullpath); } free(freepath, M_TEMP); if (PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm))) pmc_log_process_mappings(pm->pm_owner, p); } return (0); fail: PROC_LOCK(p); p->p_flag &= ~P_HWPMC; PROC_UNLOCK(p); return (error); } /* * Attach a process and optionally its children */ static int pmc_attach_process(struct proc *p, struct pmc *pm) { int error; struct proc *top; sx_assert(&pmc_sx, SX_XLOCKED); PMCDBG5(PRC,ATT,1, "attach pm=%p ri=%d proc=%p (%d, %s)", pm, PMC_TO_ROWINDEX(pm), p, p->p_pid, p->p_comm); /* * If this PMC successfully allowed a GETMSR operation * in the past, disallow further ATTACHes. */ if ((pm->pm_flags & PMC_PP_ENABLE_MSR_ACCESS) != 0) return EPERM; if ((pm->pm_flags & PMC_F_DESCENDANTS) == 0) return pmc_attach_one_process(p, pm); /* * Traverse all child processes, attaching them to * this PMC. */ sx_slock(&proctree_lock); top = p; for (;;) { if ((error = pmc_attach_one_process(p, pm)) != 0) break; if (!LIST_EMPTY(&p->p_children)) p = LIST_FIRST(&p->p_children); else for (;;) { if (p == top) goto done; if (LIST_NEXT(p, p_sibling)) { p = LIST_NEXT(p, p_sibling); break; } p = p->p_pptr; } } if (error) (void) pmc_detach_process(top, pm); done: sx_sunlock(&proctree_lock); return error; } /* * Detach a process from a PMC. If there are no other PMCs tracking * this process, remove the process structure from its hash table. If * 'flags' contains PMC_FLAG_REMOVE, then free the process structure. */ static int pmc_detach_one_process(struct proc *p, struct pmc *pm, int flags) { int ri; struct pmc_process *pp; sx_assert(&pmc_sx, SX_XLOCKED); KASSERT(pm != NULL, ("[pmc,%d] null pm pointer", __LINE__)); ri = PMC_TO_ROWINDEX(pm); PMCDBG6(PRC,ATT,2, "detach-one pm=%p ri=%d proc=%p (%d, %s) flags=0x%x", pm, ri, p, p->p_pid, p->p_comm, flags); if ((pp = pmc_find_process_descriptor(p, 0)) == NULL) return ESRCH; if (pp->pp_pmcs[ri].pp_pmc != pm) return EINVAL; pmc_unlink_target_process(pm, pp); /* Issue a detach entry if a log file is configured */ if (pm->pm_owner->po_flags & PMC_PO_OWNS_LOGFILE) pmclog_process_pmcdetach(pm, p->p_pid); /* * If there are no PMCs targeting this process, we remove its * descriptor from the target hash table and unset the P_HWPMC * flag in the struct proc. */ KASSERT(pp->pp_refcnt >= 0 && pp->pp_refcnt <= (int) md->pmd_npmc, ("[pmc,%d] Illegal refcnt %d for process struct %p", __LINE__, pp->pp_refcnt, pp)); if (pp->pp_refcnt != 0) /* still a target of some PMC */ return 0; pmc_remove_process_descriptor(pp); if (flags & PMC_FLAG_REMOVE) pmc_destroy_process_descriptor(pp); PROC_LOCK(p); p->p_flag &= ~P_HWPMC; PROC_UNLOCK(p); return 0; } /* * Detach a process and optionally its descendants from a PMC. */ static int pmc_detach_process(struct proc *p, struct pmc *pm) { struct proc *top; sx_assert(&pmc_sx, SX_XLOCKED); PMCDBG5(PRC,ATT,1, "detach pm=%p ri=%d proc=%p (%d, %s)", pm, PMC_TO_ROWINDEX(pm), p, p->p_pid, p->p_comm); if ((pm->pm_flags & PMC_F_DESCENDANTS) == 0) return pmc_detach_one_process(p, pm, PMC_FLAG_REMOVE); /* * Traverse all children, detaching them from this PMC. We * ignore errors since we could be detaching a PMC from a * partially attached proc tree. */ sx_slock(&proctree_lock); top = p; for (;;) { (void) pmc_detach_one_process(p, pm, PMC_FLAG_REMOVE); if (!LIST_EMPTY(&p->p_children)) p = LIST_FIRST(&p->p_children); else for (;;) { if (p == top) goto done; if (LIST_NEXT(p, p_sibling)) { p = LIST_NEXT(p, p_sibling); break; } p = p->p_pptr; } } done: sx_sunlock(&proctree_lock); if (LIST_EMPTY(&pm->pm_targets)) pm->pm_flags &= ~PMC_F_ATTACH_DONE; return 0; } /* * Thread context switch IN */ static void pmc_process_csw_in(struct thread *td) { int cpu; unsigned int adjri, ri; struct pmc *pm; struct proc *p; struct pmc_cpu *pc; struct pmc_hw *phw __diagused; pmc_value_t newvalue; struct pmc_process *pp; struct pmc_thread *pt; struct pmc_classdep *pcd; p = td->td_proc; pt = NULL; if ((pp = pmc_find_process_descriptor(p, PMC_FLAG_NONE)) == NULL) return; KASSERT(pp->pp_proc == td->td_proc, ("[pmc,%d] not my thread state", __LINE__)); critical_enter(); /* no preemption from this point */ cpu = PCPU_GET(cpuid); /* td->td_oncpu is invalid */ PMCDBG5(CSW,SWI,1, "cpu=%d proc=%p (%d, %s) pp=%p", cpu, p, p->p_pid, p->p_comm, pp); KASSERT(cpu >= 0 && cpu < pmc_cpu_max(), ("[pmc,%d] weird CPU id %d", __LINE__, cpu)); pc = pmc_pcpu[cpu]; for (ri = 0; ri < md->pmd_npmc; ri++) { if ((pm = pp->pp_pmcs[ri].pp_pmc) == NULL) continue; KASSERT(PMC_IS_VIRTUAL_MODE(PMC_TO_MODE(pm)), ("[pmc,%d] Target PMC in non-virtual mode (%d)", __LINE__, PMC_TO_MODE(pm))); KASSERT(PMC_TO_ROWINDEX(pm) == ri, ("[pmc,%d] Row index mismatch pmc %d != ri %d", __LINE__, PMC_TO_ROWINDEX(pm), ri)); /* * Only PMCs that are marked as 'RUNNING' need * be placed on hardware. */ if (pm->pm_state != PMC_STATE_RUNNING) continue; KASSERT(counter_u64_fetch(pm->pm_runcount) >= 0, ("[pmc,%d] pm=%p runcount %ld", __LINE__, (void *) pm, (unsigned long)counter_u64_fetch(pm->pm_runcount))); /* increment PMC runcount */ counter_u64_add(pm->pm_runcount, 1); /* configure the HWPMC we are going to use. */ pcd = pmc_ri_to_classdep(md, ri, &adjri); pcd->pcd_config_pmc(cpu, adjri, pm); phw = pc->pc_hwpmcs[ri]; KASSERT(phw != NULL, ("[pmc,%d] null hw pointer", __LINE__)); KASSERT(phw->phw_pmc == pm, ("[pmc,%d] hw->pmc %p != pmc %p", __LINE__, phw->phw_pmc, pm)); /* * Write out saved value and start the PMC. * * Sampling PMCs use a per-thread value, while * counting mode PMCs use a per-pmc value that is * inherited across descendants. */ if (PMC_TO_MODE(pm) == PMC_MODE_TS) { if (pt == NULL) pt = pmc_find_thread_descriptor(pp, td, PMC_FLAG_NONE); KASSERT(pt != NULL, ("[pmc,%d] No thread found for td=%p", __LINE__, td)); mtx_pool_lock_spin(pmc_mtxpool, pm); /* * If we have a thread descriptor, use the per-thread * counter in the descriptor. If not, we will use * a per-process counter. * * TODO: Remove the per-process "safety net" once * we have thoroughly tested that we don't hit the * above assert. */ if (pt != NULL) { if (pt->pt_pmcs[ri].pt_pmcval > 0) newvalue = pt->pt_pmcs[ri].pt_pmcval; else newvalue = pm->pm_sc.pm_reloadcount; } else { /* * Use the saved value calculated after the most * recent time a thread using the shared counter * switched out. Reset the saved count in case * another thread from this process switches in * before any threads switch out. */ newvalue = pp->pp_pmcs[ri].pp_pmcval; pp->pp_pmcs[ri].pp_pmcval = pm->pm_sc.pm_reloadcount; } mtx_pool_unlock_spin(pmc_mtxpool, pm); KASSERT(newvalue > 0 && newvalue <= pm->pm_sc.pm_reloadcount, ("[pmc,%d] pmcval outside of expected range cpu=%d " "ri=%d pmcval=%jx pm_reloadcount=%jx", __LINE__, cpu, ri, newvalue, pm->pm_sc.pm_reloadcount)); } else { KASSERT(PMC_TO_MODE(pm) == PMC_MODE_TC, ("[pmc,%d] illegal mode=%d", __LINE__, PMC_TO_MODE(pm))); mtx_pool_lock_spin(pmc_mtxpool, pm); newvalue = PMC_PCPU_SAVED(cpu, ri) = pm->pm_gv.pm_savedvalue; mtx_pool_unlock_spin(pmc_mtxpool, pm); } PMCDBG3(CSW,SWI,1,"cpu=%d ri=%d new=%jd", cpu, ri, newvalue); pcd->pcd_write_pmc(cpu, adjri, newvalue); /* If a sampling mode PMC, reset stalled state. */ if (PMC_TO_MODE(pm) == PMC_MODE_TS) pm->pm_pcpu_state[cpu].pps_stalled = 0; /* Indicate that we desire this to run. */ pm->pm_pcpu_state[cpu].pps_cpustate = 1; /* Start the PMC. */ pcd->pcd_start_pmc(cpu, adjri); } /* * perform any other architecture/cpu dependent thread * switch-in actions. */ (void) (*md->pmd_switch_in)(pc, pp); critical_exit(); } /* * Thread context switch OUT. */ static void pmc_process_csw_out(struct thread *td) { int cpu; int64_t tmp; struct pmc *pm; struct proc *p; enum pmc_mode mode; struct pmc_cpu *pc; pmc_value_t newvalue; unsigned int adjri, ri; struct pmc_process *pp; struct pmc_thread *pt = NULL; struct pmc_classdep *pcd; /* * Locate our process descriptor; this may be NULL if * this process is exiting and we have already removed * the process from the target process table. * * Note that due to kernel preemption, multiple * context switches may happen while the process is * exiting. * * Note also that if the target process cannot be * found we still need to deconfigure any PMCs that * are currently running on hardware. */ p = td->td_proc; pp = pmc_find_process_descriptor(p, PMC_FLAG_NONE); /* * save PMCs */ critical_enter(); cpu = PCPU_GET(cpuid); /* td->td_oncpu is invalid */ PMCDBG5(CSW,SWO,1, "cpu=%d proc=%p (%d, %s) pp=%p", cpu, p, p->p_pid, p->p_comm, pp); KASSERT(cpu >= 0 && cpu < pmc_cpu_max(), ("[pmc,%d weird CPU id %d", __LINE__, cpu)); pc = pmc_pcpu[cpu]; /* * When a PMC gets unlinked from a target PMC, it will * be removed from the target's pp_pmc[] array. * * However, on a MP system, the target could have been * executing on another CPU at the time of the unlink. * So, at context switch OUT time, we need to look at * the hardware to determine if a PMC is scheduled on * it. */ for (ri = 0; ri < md->pmd_npmc; ri++) { pcd = pmc_ri_to_classdep(md, ri, &adjri); pm = NULL; (void) (*pcd->pcd_get_config)(cpu, adjri, &pm); if (pm == NULL) /* nothing at this row index */ continue; mode = PMC_TO_MODE(pm); if (!PMC_IS_VIRTUAL_MODE(mode)) continue; /* not a process virtual PMC */ KASSERT(PMC_TO_ROWINDEX(pm) == ri, ("[pmc,%d] ri mismatch pmc(%d) ri(%d)", __LINE__, PMC_TO_ROWINDEX(pm), ri)); /* * Change desired state, and then stop if not stalled. * This two-step dance should avoid race conditions where * an interrupt re-enables the PMC after this code has * already checked the pm_stalled flag. */ pm->pm_pcpu_state[cpu].pps_cpustate = 0; if (pm->pm_pcpu_state[cpu].pps_stalled == 0) pcd->pcd_stop_pmc(cpu, adjri); KASSERT(counter_u64_fetch(pm->pm_runcount) > 0, ("[pmc,%d] pm=%p runcount %ld", __LINE__, (void *) pm, (unsigned long)counter_u64_fetch(pm->pm_runcount))); /* reduce this PMC's runcount */ counter_u64_add(pm->pm_runcount, -1); /* * If this PMC is associated with this process, * save the reading. */ if (pm->pm_state != PMC_STATE_DELETED && pp != NULL && pp->pp_pmcs[ri].pp_pmc != NULL) { KASSERT(pm == pp->pp_pmcs[ri].pp_pmc, ("[pmc,%d] pm %p != pp_pmcs[%d] %p", __LINE__, pm, ri, pp->pp_pmcs[ri].pp_pmc)); KASSERT(pp->pp_refcnt > 0, ("[pmc,%d] pp refcnt = %d", __LINE__, pp->pp_refcnt)); pcd->pcd_read_pmc(cpu, adjri, &newvalue); if (mode == PMC_MODE_TS) { PMCDBG3(CSW,SWO,1,"cpu=%d ri=%d val=%jd (samp)", cpu, ri, newvalue); if (pt == NULL) pt = pmc_find_thread_descriptor(pp, td, PMC_FLAG_NONE); KASSERT(pt != NULL, ("[pmc,%d] No thread found for td=%p", __LINE__, td)); mtx_pool_lock_spin(pmc_mtxpool, pm); /* * If we have a thread descriptor, save the * per-thread counter in the descriptor. If not, * we will update the per-process counter. * * TODO: Remove the per-process "safety net" * once we have thoroughly tested that we * don't hit the above assert. */ if (pt != NULL) pt->pt_pmcs[ri].pt_pmcval = newvalue; else { /* * For sampling process-virtual PMCs, * newvalue is the number of events to * be seen until the next sampling * interrupt. We can just add the events * left from this invocation to the * counter, then adjust in case we * overflow our range. * * (Recall that we reload the counter * every time we use it.) */ pp->pp_pmcs[ri].pp_pmcval += newvalue; if (pp->pp_pmcs[ri].pp_pmcval > pm->pm_sc.pm_reloadcount) pp->pp_pmcs[ri].pp_pmcval -= pm->pm_sc.pm_reloadcount; } mtx_pool_unlock_spin(pmc_mtxpool, pm); } else { tmp = newvalue - PMC_PCPU_SAVED(cpu,ri); PMCDBG3(CSW,SWO,1,"cpu=%d ri=%d tmp=%jd (count)", cpu, ri, tmp); /* * For counting process-virtual PMCs, * we expect the count to be * increasing monotonically, modulo a 64 * bit wraparound. */ KASSERT(tmp >= 0, ("[pmc,%d] negative increment cpu=%d " "ri=%d newvalue=%jx saved=%jx " "incr=%jx", __LINE__, cpu, ri, newvalue, PMC_PCPU_SAVED(cpu,ri), tmp)); mtx_pool_lock_spin(pmc_mtxpool, pm); pm->pm_gv.pm_savedvalue += tmp; pp->pp_pmcs[ri].pp_pmcval += tmp; mtx_pool_unlock_spin(pmc_mtxpool, pm); if (pm->pm_flags & PMC_F_LOG_PROCCSW) pmclog_process_proccsw(pm, pp, tmp, td); } } /* mark hardware as free */ pcd->pcd_config_pmc(cpu, adjri, NULL); } /* * perform any other architecture/cpu dependent thread * switch out functions. */ (void) (*md->pmd_switch_out)(pc, pp); critical_exit(); } /* * A new thread for a process. */ static void pmc_process_thread_add(struct thread *td) { struct pmc_process *pmc; pmc = pmc_find_process_descriptor(td->td_proc, PMC_FLAG_NONE); if (pmc != NULL) pmc_find_thread_descriptor(pmc, td, PMC_FLAG_ALLOCATE); } /* * A thread delete for a process. */ static void pmc_process_thread_delete(struct thread *td) { struct pmc_process *pmc; pmc = pmc_find_process_descriptor(td->td_proc, PMC_FLAG_NONE); if (pmc != NULL) pmc_thread_descriptor_pool_free(pmc_find_thread_descriptor(pmc, td, PMC_FLAG_REMOVE)); } /* * A userret() call for a thread. */ static void pmc_process_thread_userret(struct thread *td) { sched_pin(); pmc_capture_user_callchain(curcpu, PMC_UR, td->td_frame); sched_unpin(); } /* * A mapping change for a process. */ static void pmc_process_mmap(struct thread *td, struct pmckern_map_in *pkm) { int ri; pid_t pid; char *fullpath, *freepath; const struct pmc *pm; struct pmc_owner *po; const struct pmc_process *pp; freepath = fullpath = NULL; MPASS(!in_epoch(global_epoch_preempt)); pmc_getfilename((struct vnode *) pkm->pm_file, &fullpath, &freepath); pid = td->td_proc->p_pid; PMC_EPOCH_ENTER(); /* Inform owners of all system-wide sampling PMCs. */ CK_LIST_FOREACH(po, &pmc_ss_owners, po_ssnext) if (po->po_flags & PMC_PO_OWNS_LOGFILE) pmclog_process_map_in(po, pid, pkm->pm_address, fullpath); if ((pp = pmc_find_process_descriptor(td->td_proc, 0)) == NULL) goto done; /* * Inform sampling PMC owners tracking this process. */ for (ri = 0; ri < md->pmd_npmc; ri++) if ((pm = pp->pp_pmcs[ri].pp_pmc) != NULL && PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm))) pmclog_process_map_in(pm->pm_owner, pid, pkm->pm_address, fullpath); done: if (freepath) free(freepath, M_TEMP); PMC_EPOCH_EXIT(); } /* * Log an munmap request. */ static void pmc_process_munmap(struct thread *td, struct pmckern_map_out *pkm) { int ri; pid_t pid; struct pmc_owner *po; const struct pmc *pm; const struct pmc_process *pp; pid = td->td_proc->p_pid; PMC_EPOCH_ENTER(); CK_LIST_FOREACH(po, &pmc_ss_owners, po_ssnext) if (po->po_flags & PMC_PO_OWNS_LOGFILE) pmclog_process_map_out(po, pid, pkm->pm_address, pkm->pm_address + pkm->pm_size); PMC_EPOCH_EXIT(); if ((pp = pmc_find_process_descriptor(td->td_proc, 0)) == NULL) return; for (ri = 0; ri < md->pmd_npmc; ri++) if ((pm = pp->pp_pmcs[ri].pp_pmc) != NULL && PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm))) pmclog_process_map_out(pm->pm_owner, pid, pkm->pm_address, pkm->pm_address + pkm->pm_size); } /* * Log mapping information about the kernel. */ static void pmc_log_kernel_mappings(struct pmc *pm) { struct pmc_owner *po; struct pmckern_map_in *km, *kmbase; MPASS(in_epoch(global_epoch_preempt) || sx_xlocked(&pmc_sx)); KASSERT(PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm)), ("[pmc,%d] non-sampling PMC (%p) desires mapping information", __LINE__, (void *) pm)); po = pm->pm_owner; if (po->po_flags & PMC_PO_INITIAL_MAPPINGS_DONE) return; if (PMC_TO_MODE(pm) == PMC_MODE_SS) pmc_process_allproc(pm); /* * Log the current set of kernel modules. */ kmbase = linker_hwpmc_list_objects(); for (km = kmbase; km->pm_file != NULL; km++) { PMCDBG2(LOG,REG,1,"%s %p", (char *) km->pm_file, (void *) km->pm_address); pmclog_process_map_in(po, (pid_t) -1, km->pm_address, km->pm_file); } free(kmbase, M_LINKER); po->po_flags |= PMC_PO_INITIAL_MAPPINGS_DONE; } /* * Log the mappings for a single process. */ static void pmc_log_process_mappings(struct pmc_owner *po, struct proc *p) { vm_map_t map; struct vnode *vp; struct vmspace *vm; vm_map_entry_t entry; vm_offset_t last_end; u_int last_timestamp; struct vnode *last_vp; vm_offset_t start_addr; vm_object_t obj, lobj, tobj; char *fullpath, *freepath; last_vp = NULL; last_end = (vm_offset_t) 0; fullpath = freepath = NULL; if ((vm = vmspace_acquire_ref(p)) == NULL) return; map = &vm->vm_map; vm_map_lock_read(map); VM_MAP_ENTRY_FOREACH(entry, map) { if (entry == NULL) { PMCDBG2(LOG,OPS,2, "hwpmc: vm_map entry unexpectedly " "NULL! pid=%d vm_map=%p\n", p->p_pid, map); break; } /* * We only care about executable map entries. */ if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) || !(entry->protection & VM_PROT_EXECUTE) || (entry->object.vm_object == NULL)) { continue; } obj = entry->object.vm_object; VM_OBJECT_RLOCK(obj); /* * Walk the backing_object list to find the base * (non-shadowed) vm_object. */ for (lobj = tobj = obj; tobj != NULL; tobj = tobj->backing_object) { if (tobj != obj) VM_OBJECT_RLOCK(tobj); if (lobj != obj) VM_OBJECT_RUNLOCK(lobj); lobj = tobj; } /* * At this point lobj is the base vm_object and it is locked. */ if (lobj == NULL) { PMCDBG3(LOG,OPS,2, "hwpmc: lobj unexpectedly NULL! pid=%d " "vm_map=%p vm_obj=%p\n", p->p_pid, map, obj); VM_OBJECT_RUNLOCK(obj); continue; } vp = vm_object_vnode(lobj); if (vp == NULL) { if (lobj != obj) VM_OBJECT_RUNLOCK(lobj); VM_OBJECT_RUNLOCK(obj); continue; } /* * Skip contiguous regions that point to the same * vnode, so we don't emit redundant MAP-IN * directives. */ if (entry->start == last_end && vp == last_vp) { last_end = entry->end; if (lobj != obj) VM_OBJECT_RUNLOCK(lobj); VM_OBJECT_RUNLOCK(obj); continue; } /* * We don't want to keep the proc's vm_map or this * vm_object locked while we walk the pathname, since * vn_fullpath() can sleep. However, if we drop the * lock, it's possible for concurrent activity to * modify the vm_map list. To protect against this, * we save the vm_map timestamp before we release the * lock, and check it after we reacquire the lock * below. */ start_addr = entry->start; last_end = entry->end; last_timestamp = map->timestamp; vm_map_unlock_read(map); vref(vp); if (lobj != obj) VM_OBJECT_RUNLOCK(lobj); VM_OBJECT_RUNLOCK(obj); freepath = NULL; pmc_getfilename(vp, &fullpath, &freepath); last_vp = vp; vrele(vp); vp = NULL; pmclog_process_map_in(po, p->p_pid, start_addr, fullpath); if (freepath) free(freepath, M_TEMP); vm_map_lock_read(map); /* * If our saved timestamp doesn't match, this means * that the vm_map was modified out from under us and * we can't trust our current "entry" pointer. Do a * new lookup for this entry. If there is no entry * for this address range, vm_map_lookup_entry() will * return the previous one, so we always want to go to * the next entry on the next loop iteration. * * There is an edge condition here that can occur if * there is no entry at or before this address. In * this situation, vm_map_lookup_entry returns * &map->header, which would cause our loop to abort * without processing the rest of the map. However, * in practice this will never happen for process * vm_map. This is because the executable's text * segment is the first mapping in the proc's address * space, and this mapping is never removed until the * process exits, so there will always be a non-header * entry at or before the requested address for * vm_map_lookup_entry to return. */ if (map->timestamp != last_timestamp) vm_map_lookup_entry(map, last_end - 1, &entry); } vm_map_unlock_read(map); vmspace_free(vm); return; } /* * Log mappings for all processes in the system. */ static void pmc_log_all_process_mappings(struct pmc_owner *po) { struct proc *p, *top; sx_assert(&pmc_sx, SX_XLOCKED); if ((p = pfind(1)) == NULL) panic("[pmc,%d] Cannot find init", __LINE__); PROC_UNLOCK(p); sx_slock(&proctree_lock); top = p; for (;;) { pmc_log_process_mappings(po, p); if (!LIST_EMPTY(&p->p_children)) p = LIST_FIRST(&p->p_children); else for (;;) { if (p == top) goto done; if (LIST_NEXT(p, p_sibling)) { p = LIST_NEXT(p, p_sibling); break; } p = p->p_pptr; } } done: sx_sunlock(&proctree_lock); } /* * The 'hook' invoked from the kernel proper */ #ifdef HWPMC_DEBUG const char *pmc_hooknames[] = { /* these strings correspond to PMC_FN_* in */ "", "EXEC", "CSW-IN", "CSW-OUT", "SAMPLE", "UNUSED1", "UNUSED2", "MMAP", "MUNMAP", "CALLCHAIN-NMI", "CALLCHAIN-SOFT", "SOFTSAMPLING", "THR-CREATE", "THR-EXIT", "THR-USERRET", "THR-CREATE-LOG", "THR-EXIT-LOG", "PROC-CREATE-LOG" }; #endif static int pmc_hook_handler(struct thread *td, int function, void *arg) { int cpu; PMCDBG4(MOD,PMH,1, "hook td=%p func=%d \"%s\" arg=%p", td, function, pmc_hooknames[function], arg); switch (function) { /* * Process exec() */ case PMC_FN_PROCESS_EXEC: { char *fullpath, *freepath; unsigned int ri; int is_using_hwpmcs; struct pmc *pm; struct proc *p; struct pmc_owner *po; struct pmc_process *pp; struct pmckern_procexec *pk; sx_assert(&pmc_sx, SX_XLOCKED); p = td->td_proc; pmc_getfilename(p->p_textvp, &fullpath, &freepath); pk = (struct pmckern_procexec *) arg; PMC_EPOCH_ENTER(); /* Inform owners of SS mode PMCs of the exec event. */ CK_LIST_FOREACH(po, &pmc_ss_owners, po_ssnext) if (po->po_flags & PMC_PO_OWNS_LOGFILE) pmclog_process_procexec(po, PMC_ID_INVALID, p->p_pid, pk->pm_entryaddr, fullpath); PMC_EPOCH_EXIT(); PROC_LOCK(p); is_using_hwpmcs = p->p_flag & P_HWPMC; PROC_UNLOCK(p); if (!is_using_hwpmcs) { if (freepath) free(freepath, M_TEMP); break; } /* * PMCs are not inherited across an exec(): remove any * PMCs that this process is the owner of. */ if ((po = pmc_find_owner_descriptor(p)) != NULL) { pmc_remove_owner(po); pmc_destroy_owner_descriptor(po); } /* * If the process being exec'ed is not the target of any * PMC, we are done. */ if ((pp = pmc_find_process_descriptor(p, 0)) == NULL) { if (freepath) free(freepath, M_TEMP); break; } /* * Log the exec event to all monitoring owners. Skip * owners who have already received the event because * they had system sampling PMCs active. */ for (ri = 0; ri < md->pmd_npmc; ri++) if ((pm = pp->pp_pmcs[ri].pp_pmc) != NULL) { po = pm->pm_owner; if (po->po_sscount == 0 && po->po_flags & PMC_PO_OWNS_LOGFILE) pmclog_process_procexec(po, pm->pm_id, p->p_pid, pk->pm_entryaddr, fullpath); } if (freepath) free(freepath, M_TEMP); PMCDBG4(PRC,EXC,1, "exec proc=%p (%d, %s) cred-changed=%d", p, p->p_pid, p->p_comm, pk->pm_credentialschanged); if (pk->pm_credentialschanged == 0) /* no change */ break; /* * If the newly exec()'ed process has a different credential * than before, allow it to be the target of a PMC only if * the PMC's owner has sufficient privilege. */ for (ri = 0; ri < md->pmd_npmc; ri++) if ((pm = pp->pp_pmcs[ri].pp_pmc) != NULL) if (pmc_can_attach(pm, td->td_proc) != 0) pmc_detach_one_process(td->td_proc, pm, PMC_FLAG_NONE); KASSERT(pp->pp_refcnt >= 0 && pp->pp_refcnt <= (int) md->pmd_npmc, ("[pmc,%d] Illegal ref count %d on pp %p", __LINE__, pp->pp_refcnt, pp)); /* * If this process is no longer the target of any * PMCs, we can remove the process entry and free * up space. */ if (pp->pp_refcnt == 0) { pmc_remove_process_descriptor(pp); pmc_destroy_process_descriptor(pp); break; } } break; case PMC_FN_CSW_IN: pmc_process_csw_in(td); break; case PMC_FN_CSW_OUT: pmc_process_csw_out(td); break; /* * Process accumulated PC samples. * * This function is expected to be called by hardclock() for * each CPU that has accumulated PC samples. * * This function is to be executed on the CPU whose samples * are being processed. */ case PMC_FN_DO_SAMPLES: /* * Clear the cpu specific bit in the CPU mask before * do the rest of the processing. If the NMI handler * gets invoked after the "atomic_clear_int()" call * below but before "pmc_process_samples()" gets * around to processing the interrupt, then we will * come back here at the next hardclock() tick (and * may find nothing to do if "pmc_process_samples()" * had already processed the interrupt). We don't * lose the interrupt sample. */ DPCPU_SET(pmc_sampled, 0); cpu = PCPU_GET(cpuid); pmc_process_samples(cpu, PMC_HR); pmc_process_samples(cpu, PMC_SR); pmc_process_samples(cpu, PMC_UR); break; case PMC_FN_MMAP: pmc_process_mmap(td, (struct pmckern_map_in *) arg); break; case PMC_FN_MUNMAP: MPASS(in_epoch(global_epoch_preempt) || sx_xlocked(&pmc_sx)); pmc_process_munmap(td, (struct pmckern_map_out *) arg); break; case PMC_FN_PROC_CREATE_LOG: pmc_process_proccreate((struct proc *)arg); break; case PMC_FN_USER_CALLCHAIN: /* * Record a call chain. */ KASSERT(td == curthread, ("[pmc,%d] td != curthread", __LINE__)); pmc_capture_user_callchain(PCPU_GET(cpuid), PMC_HR, (struct trapframe *) arg); KASSERT(td->td_pinned == 1, ("[pmc,%d] invalid td_pinned value", __LINE__)); sched_unpin(); /* Can migrate safely now. */ td->td_pflags &= ~TDP_CALLCHAIN; break; case PMC_FN_USER_CALLCHAIN_SOFT: /* * Record a call chain. */ KASSERT(td == curthread, ("[pmc,%d] td != curthread", __LINE__)); cpu = PCPU_GET(cpuid); pmc_capture_user_callchain(cpu, PMC_SR, (struct trapframe *) arg); KASSERT(td->td_pinned == 1, ("[pmc,%d] invalid td_pinned value", __LINE__)); sched_unpin(); /* Can migrate safely now. */ td->td_pflags &= ~TDP_CALLCHAIN; break; case PMC_FN_SOFT_SAMPLING: /* * Call soft PMC sampling intr. */ pmc_soft_intr((struct pmckern_soft *) arg); break; case PMC_FN_THR_CREATE: pmc_process_thread_add(td); pmc_process_threadcreate(td); break; case PMC_FN_THR_CREATE_LOG: pmc_process_threadcreate(td); break; case PMC_FN_THR_EXIT: KASSERT(td == curthread, ("[pmc,%d] td != curthread", __LINE__)); pmc_process_thread_delete(td); pmc_process_threadexit(td); break; case PMC_FN_THR_EXIT_LOG: pmc_process_threadexit(td); break; case PMC_FN_THR_USERRET: KASSERT(td == curthread, ("[pmc,%d] td != curthread", __LINE__)); pmc_process_thread_userret(td); break; default: #ifdef HWPMC_DEBUG KASSERT(0, ("[pmc,%d] unknown hook %d\n", __LINE__, function)); #endif break; } return 0; } /* * allocate a 'struct pmc_owner' descriptor in the owner hash table. */ static struct pmc_owner * pmc_allocate_owner_descriptor(struct proc *p) { uint32_t hindex; struct pmc_owner *po; struct pmc_ownerhash *poh; hindex = PMC_HASH_PTR(p, pmc_ownerhashmask); poh = &pmc_ownerhash[hindex]; /* allocate space for N pointers and one descriptor struct */ po = malloc(sizeof(struct pmc_owner), M_PMC, M_WAITOK|M_ZERO); po->po_owner = p; LIST_INSERT_HEAD(poh, po, po_next); /* insert into hash table */ TAILQ_INIT(&po->po_logbuffers); mtx_init(&po->po_mtx, "pmc-owner-mtx", "pmc-per-proc", MTX_SPIN); PMCDBG4(OWN,ALL,1, "allocate-owner proc=%p (%d, %s) pmc-owner=%p", p, p->p_pid, p->p_comm, po); return po; } static void pmc_destroy_owner_descriptor(struct pmc_owner *po) { PMCDBG4(OWN,REL,1, "destroy-owner po=%p proc=%p (%d, %s)", po, po->po_owner, po->po_owner->p_pid, po->po_owner->p_comm); mtx_destroy(&po->po_mtx); free(po, M_PMC); } /* * Allocate a thread descriptor from the free pool. * * NOTE: This *can* return NULL. */ static struct pmc_thread * pmc_thread_descriptor_pool_alloc(void) { struct pmc_thread *pt; mtx_lock_spin(&pmc_threadfreelist_mtx); if ((pt = LIST_FIRST(&pmc_threadfreelist)) != NULL) { LIST_REMOVE(pt, pt_next); pmc_threadfreelist_entries--; } mtx_unlock_spin(&pmc_threadfreelist_mtx); return (pt); } /* * Add a thread descriptor to the free pool. We use this instead of free() * to maintain a cache of free entries. Additionally, we can safely call * this function when we cannot call free(), such as in a critical section. * */ static void pmc_thread_descriptor_pool_free(struct pmc_thread *pt) { if (pt == NULL) return; memset(pt, 0, THREADENTRY_SIZE); mtx_lock_spin(&pmc_threadfreelist_mtx); LIST_INSERT_HEAD(&pmc_threadfreelist, pt, pt_next); pmc_threadfreelist_entries++; if (pmc_threadfreelist_entries > pmc_threadfreelist_max) taskqueue_enqueue(taskqueue_fast, &free_task); mtx_unlock_spin(&pmc_threadfreelist_mtx); } /* * An asynchronous task to manage the free list. */ static void pmc_thread_descriptor_pool_free_task(void *arg __unused, int pending __unused) { struct pmc_thread *pt; LIST_HEAD(, pmc_thread) tmplist; int delta; LIST_INIT(&tmplist); /* Determine what changes, if any, we need to make. */ mtx_lock_spin(&pmc_threadfreelist_mtx); delta = pmc_threadfreelist_entries - pmc_threadfreelist_max; while (delta > 0 && (pt = LIST_FIRST(&pmc_threadfreelist)) != NULL) { delta--; pmc_threadfreelist_entries--; LIST_REMOVE(pt, pt_next); LIST_INSERT_HEAD(&tmplist, pt, pt_next); } mtx_unlock_spin(&pmc_threadfreelist_mtx); /* If there are entries to free, free them. */ while (!LIST_EMPTY(&tmplist)) { pt = LIST_FIRST(&tmplist); LIST_REMOVE(pt, pt_next); free(pt, M_PMC); } } /* * Drain the thread free pool, freeing all allocations. */ static void pmc_thread_descriptor_pool_drain(void) { struct pmc_thread *pt, *next; LIST_FOREACH_SAFE(pt, &pmc_threadfreelist, pt_next, next) { LIST_REMOVE(pt, pt_next); free(pt, M_PMC); } } /* * find the descriptor corresponding to thread 'td', adding or removing it * as specified by 'mode'. * * Note that this supports additional mode flags in addition to those * supported by pmc_find_process_descriptor(): * PMC_FLAG_NOWAIT: Causes the function to not wait for mallocs. * This makes it safe to call while holding certain other locks. */ static struct pmc_thread * pmc_find_thread_descriptor(struct pmc_process *pp, struct thread *td, uint32_t mode) { struct pmc_thread *pt = NULL, *ptnew = NULL; int wait_flag; KASSERT(td != NULL, ("[pmc,%d] called to add NULL td", __LINE__)); /* * Pre-allocate memory in the PMC_FLAG_ALLOCATE case prior to * acquiring the lock. */ if (mode & PMC_FLAG_ALLOCATE) { if ((ptnew = pmc_thread_descriptor_pool_alloc()) == NULL) { wait_flag = M_WAITOK; if ((mode & PMC_FLAG_NOWAIT) || in_epoch(global_epoch_preempt)) wait_flag = M_NOWAIT; ptnew = malloc(THREADENTRY_SIZE, M_PMC, wait_flag|M_ZERO); } } mtx_lock_spin(pp->pp_tdslock); LIST_FOREACH(pt, &pp->pp_tds, pt_next) if (pt->pt_td == td) break; if ((mode & PMC_FLAG_REMOVE) && pt != NULL) LIST_REMOVE(pt, pt_next); if ((mode & PMC_FLAG_ALLOCATE) && pt == NULL && ptnew != NULL) { pt = ptnew; ptnew = NULL; pt->pt_td = td; LIST_INSERT_HEAD(&pp->pp_tds, pt, pt_next); } mtx_unlock_spin(pp->pp_tdslock); if (ptnew != NULL) { free(ptnew, M_PMC); } return pt; } /* * Try to add thread descriptors for each thread in a process. */ static void pmc_add_thread_descriptors_from_proc(struct proc *p, struct pmc_process *pp) { struct thread *curtd; struct pmc_thread **tdlist; int i, tdcnt, tdlistsz; KASSERT(!PROC_LOCKED(p), ("[pmc,%d] proc unexpectedly locked", __LINE__)); tdcnt = 32; restart: tdlistsz = roundup2(tdcnt, 32); tdcnt = 0; tdlist = malloc(sizeof(struct pmc_thread*) * tdlistsz, M_TEMP, M_WAITOK); PROC_LOCK(p); FOREACH_THREAD_IN_PROC(p, curtd) tdcnt++; if (tdcnt >= tdlistsz) { PROC_UNLOCK(p); free(tdlist, M_TEMP); goto restart; } /* * Try to add each thread to the list without sleeping. If unable, * add to a queue to retry after dropping the process lock. */ tdcnt = 0; FOREACH_THREAD_IN_PROC(p, curtd) { tdlist[tdcnt] = pmc_find_thread_descriptor(pp, curtd, PMC_FLAG_ALLOCATE|PMC_FLAG_NOWAIT); if (tdlist[tdcnt] == NULL) { PROC_UNLOCK(p); for (i = 0; i <= tdcnt; i++) pmc_thread_descriptor_pool_free(tdlist[i]); free(tdlist, M_TEMP); goto restart; } tdcnt++; } PROC_UNLOCK(p); free(tdlist, M_TEMP); } /* * find the descriptor corresponding to process 'p', adding or removing it * as specified by 'mode'. */ static struct pmc_process * pmc_find_process_descriptor(struct proc *p, uint32_t mode) { uint32_t hindex; struct pmc_process *pp, *ppnew; struct pmc_processhash *pph; hindex = PMC_HASH_PTR(p, pmc_processhashmask); pph = &pmc_processhash[hindex]; ppnew = NULL; /* * Pre-allocate memory in the PMC_FLAG_ALLOCATE case since we * cannot call malloc(9) once we hold a spin lock. */ if (mode & PMC_FLAG_ALLOCATE) ppnew = malloc(sizeof(struct pmc_process) + md->pmd_npmc * sizeof(struct pmc_targetstate), M_PMC, M_WAITOK|M_ZERO); mtx_lock_spin(&pmc_processhash_mtx); LIST_FOREACH(pp, pph, pp_next) if (pp->pp_proc == p) break; if ((mode & PMC_FLAG_REMOVE) && pp != NULL) LIST_REMOVE(pp, pp_next); if ((mode & PMC_FLAG_ALLOCATE) && pp == NULL && ppnew != NULL) { ppnew->pp_proc = p; LIST_INIT(&ppnew->pp_tds); ppnew->pp_tdslock = mtx_pool_find(pmc_mtxpool, ppnew); LIST_INSERT_HEAD(pph, ppnew, pp_next); mtx_unlock_spin(&pmc_processhash_mtx); pp = ppnew; ppnew = NULL; /* Add thread descriptors for this process' current threads. */ pmc_add_thread_descriptors_from_proc(p, pp); } else mtx_unlock_spin(&pmc_processhash_mtx); if (ppnew != NULL) free(ppnew, M_PMC); return pp; } /* * remove a process descriptor from the process hash table. */ static void pmc_remove_process_descriptor(struct pmc_process *pp) { KASSERT(pp->pp_refcnt == 0, ("[pmc,%d] Removing process descriptor %p with count %d", __LINE__, pp, pp->pp_refcnt)); mtx_lock_spin(&pmc_processhash_mtx); LIST_REMOVE(pp, pp_next); mtx_unlock_spin(&pmc_processhash_mtx); } /* * destroy a process descriptor. */ static void pmc_destroy_process_descriptor(struct pmc_process *pp) { struct pmc_thread *pmc_td; while ((pmc_td = LIST_FIRST(&pp->pp_tds)) != NULL) { LIST_REMOVE(pmc_td, pt_next); pmc_thread_descriptor_pool_free(pmc_td); } free(pp, M_PMC); } /* * find an owner descriptor corresponding to proc 'p' */ static struct pmc_owner * pmc_find_owner_descriptor(struct proc *p) { uint32_t hindex; struct pmc_owner *po; struct pmc_ownerhash *poh; hindex = PMC_HASH_PTR(p, pmc_ownerhashmask); poh = &pmc_ownerhash[hindex]; po = NULL; LIST_FOREACH(po, poh, po_next) if (po->po_owner == p) break; PMCDBG5(OWN,FND,1, "find-owner proc=%p (%d, %s) hindex=0x%x -> " "pmc-owner=%p", p, p->p_pid, p->p_comm, hindex, po); return po; } /* * pmc_allocate_pmc_descriptor * * Allocate a pmc descriptor and initialize its * fields. */ static struct pmc * pmc_allocate_pmc_descriptor(void) { struct pmc *pmc; pmc = malloc(sizeof(struct pmc), M_PMC, M_WAITOK|M_ZERO); pmc->pm_runcount = counter_u64_alloc(M_WAITOK); pmc->pm_pcpu_state = malloc(sizeof(struct pmc_pcpu_state)*mp_ncpus, M_PMC, M_WAITOK|M_ZERO); PMCDBG1(PMC,ALL,1, "allocate-pmc -> pmc=%p", pmc); return pmc; } /* * Destroy a pmc descriptor. */ static void pmc_destroy_pmc_descriptor(struct pmc *pm) { KASSERT(pm->pm_state == PMC_STATE_DELETED || pm->pm_state == PMC_STATE_FREE, ("[pmc,%d] destroying non-deleted PMC", __LINE__)); KASSERT(LIST_EMPTY(&pm->pm_targets), ("[pmc,%d] destroying pmc with targets", __LINE__)); KASSERT(pm->pm_owner == NULL, ("[pmc,%d] destroying pmc attached to an owner", __LINE__)); KASSERT(counter_u64_fetch(pm->pm_runcount) == 0, ("[pmc,%d] pmc has non-zero run count %ld", __LINE__, (unsigned long)counter_u64_fetch(pm->pm_runcount))); counter_u64_free(pm->pm_runcount); free(pm->pm_pcpu_state, M_PMC); free(pm, M_PMC); } static void pmc_wait_for_pmc_idle(struct pmc *pm) { #ifdef INVARIANTS volatile int maxloop; maxloop = 100 * pmc_cpu_max(); #endif /* * Loop (with a forced context switch) till the PMC's runcount * comes down to zero. */ pmclog_flush(pm->pm_owner, 1); while (counter_u64_fetch(pm->pm_runcount) > 0) { pmclog_flush(pm->pm_owner, 1); #ifdef INVARIANTS maxloop--; KASSERT(maxloop > 0, ("[pmc,%d] (ri%d, rc%ld) waiting too long for " "pmc to be free", __LINE__, PMC_TO_ROWINDEX(pm), (unsigned long)counter_u64_fetch(pm->pm_runcount))); #endif pmc_force_context_switch(); } } /* * This function does the following things: * * - detaches the PMC from hardware * - unlinks all target threads that were attached to it * - removes the PMC from its owner's list * - destroys the PMC private mutex * * Once this function completes, the given pmc pointer can be freed by * calling pmc_destroy_pmc_descriptor(). */ static void pmc_release_pmc_descriptor(struct pmc *pm) { enum pmc_mode mode; struct pmc_hw *phw __diagused; u_int adjri, ri, cpu; struct pmc_owner *po; struct pmc_binding pb; struct pmc_process *pp; struct pmc_classdep *pcd; struct pmc_target *ptgt, *tmp; sx_assert(&pmc_sx, SX_XLOCKED); KASSERT(pm, ("[pmc,%d] null pmc", __LINE__)); ri = PMC_TO_ROWINDEX(pm); pcd = pmc_ri_to_classdep(md, ri, &adjri); mode = PMC_TO_MODE(pm); PMCDBG3(PMC,REL,1, "release-pmc pmc=%p ri=%d mode=%d", pm, ri, mode); /* * First, we take the PMC off hardware. */ cpu = 0; if (PMC_IS_SYSTEM_MODE(mode)) { /* * A system mode PMC runs on a specific CPU. Switch * to this CPU and turn hardware off. */ pmc_save_cpu_binding(&pb); cpu = PMC_TO_CPU(pm); pmc_select_cpu(cpu); /* switch off non-stalled CPUs */ pm->pm_pcpu_state[cpu].pps_cpustate = 0; if (pm->pm_state == PMC_STATE_RUNNING && pm->pm_pcpu_state[cpu].pps_stalled == 0) { phw = pmc_pcpu[cpu]->pc_hwpmcs[ri]; KASSERT(phw->phw_pmc == pm, ("[pmc, %d] pmc ptr ri(%d) hw(%p) pm(%p)", __LINE__, ri, phw->phw_pmc, pm)); PMCDBG2(PMC,REL,2, "stopping cpu=%d ri=%d", cpu, ri); critical_enter(); pcd->pcd_stop_pmc(cpu, adjri); critical_exit(); } PMCDBG2(PMC,REL,2, "decfg cpu=%d ri=%d", cpu, ri); critical_enter(); pcd->pcd_config_pmc(cpu, adjri, NULL); critical_exit(); /* adjust the global and process count of SS mode PMCs */ if (mode == PMC_MODE_SS && pm->pm_state == PMC_STATE_RUNNING) { po = pm->pm_owner; po->po_sscount--; if (po->po_sscount == 0) { atomic_subtract_rel_int(&pmc_ss_count, 1); CK_LIST_REMOVE(po, po_ssnext); epoch_wait_preempt(global_epoch_preempt); } } pm->pm_state = PMC_STATE_DELETED; pmc_restore_cpu_binding(&pb); /* * We could have references to this PMC structure in * the per-cpu sample queues. Wait for the queue to * drain. */ pmc_wait_for_pmc_idle(pm); } else if (PMC_IS_VIRTUAL_MODE(mode)) { /* * A virtual PMC could be running on multiple CPUs at * a given instant. * * By marking its state as DELETED, we ensure that * this PMC is never further scheduled on hardware. * * Then we wait till all CPUs are done with this PMC. */ pm->pm_state = PMC_STATE_DELETED; /* Wait for the PMCs runcount to come to zero. */ pmc_wait_for_pmc_idle(pm); /* * At this point the PMC is off all CPUs and cannot be * freshly scheduled onto a CPU. It is now safe to * unlink all targets from this PMC. If a * process-record's refcount falls to zero, we remove * it from the hash table. The module-wide SX lock * protects us from races. */ LIST_FOREACH_SAFE(ptgt, &pm->pm_targets, pt_next, tmp) { pp = ptgt->pt_process; pmc_unlink_target_process(pm, pp); /* frees 'ptgt' */ PMCDBG1(PMC,REL,3, "pp->refcnt=%d", pp->pp_refcnt); /* * If the target process record shows that no * PMCs are attached to it, reclaim its space. */ if (pp->pp_refcnt == 0) { pmc_remove_process_descriptor(pp); pmc_destroy_process_descriptor(pp); } } cpu = curthread->td_oncpu; /* setup cpu for pmd_release() */ } /* * Release any MD resources */ (void) pcd->pcd_release_pmc(cpu, adjri, pm); /* * Update row disposition */ if (PMC_IS_SYSTEM_MODE(PMC_TO_MODE(pm))) PMC_UNMARK_ROW_STANDALONE(ri); else PMC_UNMARK_ROW_THREAD(ri); /* unlink from the owner's list */ if (pm->pm_owner) { LIST_REMOVE(pm, pm_next); pm->pm_owner = NULL; } } /* * Register an owner and a pmc. */ static int pmc_register_owner(struct proc *p, struct pmc *pmc) { struct pmc_owner *po; sx_assert(&pmc_sx, SX_XLOCKED); if ((po = pmc_find_owner_descriptor(p)) == NULL) if ((po = pmc_allocate_owner_descriptor(p)) == NULL) return ENOMEM; KASSERT(pmc->pm_owner == NULL, ("[pmc,%d] attempting to own an initialized PMC", __LINE__)); pmc->pm_owner = po; LIST_INSERT_HEAD(&po->po_pmcs, pmc, pm_next); PROC_LOCK(p); p->p_flag |= P_HWPMC; PROC_UNLOCK(p); if (po->po_flags & PMC_PO_OWNS_LOGFILE) pmclog_process_pmcallocate(pmc); PMCDBG2(PMC,REG,1, "register-owner pmc-owner=%p pmc=%p", po, pmc); return 0; } /* * Return the current row disposition: * == 0 => FREE * > 0 => PROCESS MODE * < 0 => SYSTEM MODE */ int pmc_getrowdisp(int ri) { return pmc_pmcdisp[ri]; } /* * Check if a PMC at row index 'ri' can be allocated to the current * process. * * Allocation can fail if: * - the current process is already being profiled by a PMC at index 'ri', * attached to it via OP_PMCATTACH. * - the current process has already allocated a PMC at index 'ri' * via OP_ALLOCATE. */ static int pmc_can_allocate_rowindex(struct proc *p, unsigned int ri, int cpu) { enum pmc_mode mode; struct pmc *pm; struct pmc_owner *po; struct pmc_process *pp; PMCDBG5(PMC,ALR,1, "can-allocate-rowindex proc=%p (%d, %s) ri=%d " "cpu=%d", p, p->p_pid, p->p_comm, ri, cpu); /* * We shouldn't have already allocated a process-mode PMC at * row index 'ri'. * * We shouldn't have allocated a system-wide PMC on the same * CPU and same RI. */ if ((po = pmc_find_owner_descriptor(p)) != NULL) LIST_FOREACH(pm, &po->po_pmcs, pm_next) { if (PMC_TO_ROWINDEX(pm) == ri) { mode = PMC_TO_MODE(pm); if (PMC_IS_VIRTUAL_MODE(mode)) return EEXIST; if (PMC_IS_SYSTEM_MODE(mode) && (int) PMC_TO_CPU(pm) == cpu) return EEXIST; } } /* * We also shouldn't be the target of any PMC at this index * since otherwise a PMC_ATTACH to ourselves will fail. */ if ((pp = pmc_find_process_descriptor(p, 0)) != NULL) if (pp->pp_pmcs[ri].pp_pmc) return EEXIST; PMCDBG4(PMC,ALR,2, "can-allocate-rowindex proc=%p (%d, %s) ri=%d ok", p, p->p_pid, p->p_comm, ri); return 0; } /* * Check if a given PMC at row index 'ri' can be currently used in * mode 'mode'. */ static int pmc_can_allocate_row(int ri, enum pmc_mode mode) { enum pmc_disp disp; sx_assert(&pmc_sx, SX_XLOCKED); PMCDBG2(PMC,ALR,1, "can-allocate-row ri=%d mode=%d", ri, mode); if (PMC_IS_SYSTEM_MODE(mode)) disp = PMC_DISP_STANDALONE; else disp = PMC_DISP_THREAD; /* * check disposition for PMC row 'ri': * * Expected disposition Row-disposition Result * * STANDALONE STANDALONE or FREE proceed * STANDALONE THREAD fail * THREAD THREAD or FREE proceed * THREAD STANDALONE fail */ if (!PMC_ROW_DISP_IS_FREE(ri) && !(disp == PMC_DISP_THREAD && PMC_ROW_DISP_IS_THREAD(ri)) && !(disp == PMC_DISP_STANDALONE && PMC_ROW_DISP_IS_STANDALONE(ri))) return EBUSY; /* * All OK */ PMCDBG2(PMC,ALR,2, "can-allocate-row ri=%d mode=%d ok", ri, mode); return 0; } /* * Find a PMC descriptor with user handle 'pmcid' for thread 'td'. */ static struct pmc * pmc_find_pmc_descriptor_in_process(struct pmc_owner *po, pmc_id_t pmcid) { struct pmc *pm; KASSERT(PMC_ID_TO_ROWINDEX(pmcid) < md->pmd_npmc, ("[pmc,%d] Illegal pmc index %d (max %d)", __LINE__, PMC_ID_TO_ROWINDEX(pmcid), md->pmd_npmc)); LIST_FOREACH(pm, &po->po_pmcs, pm_next) if (pm->pm_id == pmcid) return pm; return NULL; } static int pmc_find_pmc(pmc_id_t pmcid, struct pmc **pmc) { struct pmc *pm, *opm; struct pmc_owner *po; struct pmc_process *pp; PMCDBG1(PMC,FND,1, "find-pmc id=%d", pmcid); if (PMC_ID_TO_ROWINDEX(pmcid) >= md->pmd_npmc) return (EINVAL); if ((po = pmc_find_owner_descriptor(curthread->td_proc)) == NULL) { /* * In case of PMC_F_DESCENDANTS child processes we will not find * the current process in the owners hash list. Find the owner * process first and from there lookup the po. */ if ((pp = pmc_find_process_descriptor(curthread->td_proc, PMC_FLAG_NONE)) == NULL) { return ESRCH; } else { opm = pp->pp_pmcs[PMC_ID_TO_ROWINDEX(pmcid)].pp_pmc; if (opm == NULL) return ESRCH; if ((opm->pm_flags & (PMC_F_ATTACHED_TO_OWNER| PMC_F_DESCENDANTS)) != (PMC_F_ATTACHED_TO_OWNER| PMC_F_DESCENDANTS)) return ESRCH; po = opm->pm_owner; } } if ((pm = pmc_find_pmc_descriptor_in_process(po, pmcid)) == NULL) return EINVAL; PMCDBG2(PMC,FND,2, "find-pmc id=%d -> pmc=%p", pmcid, pm); *pmc = pm; return 0; } /* * Start a PMC. */ static int pmc_start(struct pmc *pm) { enum pmc_mode mode; struct pmc_owner *po; struct pmc_binding pb; struct pmc_classdep *pcd; int adjri, error, cpu, ri; KASSERT(pm != NULL, ("[pmc,%d] null pm", __LINE__)); mode = PMC_TO_MODE(pm); ri = PMC_TO_ROWINDEX(pm); pcd = pmc_ri_to_classdep(md, ri, &adjri); error = 0; PMCDBG3(PMC,OPS,1, "start pmc=%p mode=%d ri=%d", pm, mode, ri); po = pm->pm_owner; /* * Disallow PMCSTART if a logfile is required but has not been * configured yet. */ if ((pm->pm_flags & PMC_F_NEEDS_LOGFILE) && (po->po_flags & PMC_PO_OWNS_LOGFILE) == 0) return (EDOOFUS); /* programming error */ /* * If this is a sampling mode PMC, log mapping information for * the kernel modules that are currently loaded. */ if (PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm))) pmc_log_kernel_mappings(pm); if (PMC_IS_VIRTUAL_MODE(mode)) { /* * If a PMCATTACH has never been done on this PMC, * attach it to its owner process. */ if (LIST_EMPTY(&pm->pm_targets)) error = (pm->pm_flags & PMC_F_ATTACH_DONE) ? ESRCH : pmc_attach_process(po->po_owner, pm); /* * If the PMC is attached to its owner, then force a context * switch to ensure that the MD state gets set correctly. */ if (error == 0) { pm->pm_state = PMC_STATE_RUNNING; if (pm->pm_flags & PMC_F_ATTACHED_TO_OWNER) pmc_force_context_switch(); } return (error); } /* * A system-wide PMC. * * Add the owner to the global list if this is a system-wide * sampling PMC. */ if (mode == PMC_MODE_SS) { /* * Log mapping information for all existing processes in the * system. Subsequent mappings are logged as they happen; * see pmc_process_mmap(). */ if (po->po_logprocmaps == 0) { pmc_log_all_process_mappings(po); po->po_logprocmaps = 1; } po->po_sscount++; if (po->po_sscount == 1) { atomic_add_rel_int(&pmc_ss_count, 1); CK_LIST_INSERT_HEAD(&pmc_ss_owners, po, po_ssnext); PMCDBG1(PMC,OPS,1, "po=%p in global list", po); } } /* * Move to the CPU associated with this * PMC, and start the hardware. */ pmc_save_cpu_binding(&pb); cpu = PMC_TO_CPU(pm); if (!pmc_cpu_is_active(cpu)) return (ENXIO); pmc_select_cpu(cpu); /* * global PMCs are configured at allocation time * so write out the initial value and start the PMC. */ pm->pm_state = PMC_STATE_RUNNING; critical_enter(); if ((error = pcd->pcd_write_pmc(cpu, adjri, PMC_IS_SAMPLING_MODE(mode) ? pm->pm_sc.pm_reloadcount : pm->pm_sc.pm_initial)) == 0) { /* If a sampling mode PMC, reset stalled state. */ if (PMC_IS_SAMPLING_MODE(mode)) pm->pm_pcpu_state[cpu].pps_stalled = 0; /* Indicate that we desire this to run. Start it. */ pm->pm_pcpu_state[cpu].pps_cpustate = 1; error = pcd->pcd_start_pmc(cpu, adjri); } critical_exit(); pmc_restore_cpu_binding(&pb); return (error); } /* * Stop a PMC. */ static int pmc_stop(struct pmc *pm) { struct pmc_owner *po; struct pmc_binding pb; struct pmc_classdep *pcd; int adjri, cpu, error, ri; KASSERT(pm != NULL, ("[pmc,%d] null pmc", __LINE__)); PMCDBG3(PMC,OPS,1, "stop pmc=%p mode=%d ri=%d", pm, PMC_TO_MODE(pm), PMC_TO_ROWINDEX(pm)); pm->pm_state = PMC_STATE_STOPPED; /* * If the PMC is a virtual mode one, changing the state to * non-RUNNING is enough to ensure that the PMC never gets * scheduled. * * If this PMC is current running on a CPU, then it will * handled correctly at the time its target process is context * switched out. */ if (PMC_IS_VIRTUAL_MODE(PMC_TO_MODE(pm))) return 0; /* * A system-mode PMC. Move to the CPU associated with * this PMC, and stop the hardware. We update the * 'initial count' so that a subsequent PMCSTART will * resume counting from the current hardware count. */ pmc_save_cpu_binding(&pb); cpu = PMC_TO_CPU(pm); KASSERT(cpu >= 0 && cpu < pmc_cpu_max(), ("[pmc,%d] illegal cpu=%d", __LINE__, cpu)); if (!pmc_cpu_is_active(cpu)) return ENXIO; pmc_select_cpu(cpu); ri = PMC_TO_ROWINDEX(pm); pcd = pmc_ri_to_classdep(md, ri, &adjri); pm->pm_pcpu_state[cpu].pps_cpustate = 0; critical_enter(); if ((error = pcd->pcd_stop_pmc(cpu, adjri)) == 0) error = pcd->pcd_read_pmc(cpu, adjri, &pm->pm_sc.pm_initial); critical_exit(); pmc_restore_cpu_binding(&pb); po = pm->pm_owner; /* remove this owner from the global list of SS PMC owners */ if (PMC_TO_MODE(pm) == PMC_MODE_SS) { po->po_sscount--; if (po->po_sscount == 0) { atomic_subtract_rel_int(&pmc_ss_count, 1); CK_LIST_REMOVE(po, po_ssnext); epoch_wait_preempt(global_epoch_preempt); PMCDBG1(PMC,OPS,2,"po=%p removed from global list", po); } } return (error); } static struct pmc_classdep * pmc_class_to_classdep(enum pmc_class class) { int n; for (n = 0; n < md->pmd_nclass; n++) if (md->pmd_classdep[n].pcd_class == class) return (&md->pmd_classdep[n]); return (NULL); } #if defined(HWPMC_DEBUG) && defined(KTR) static const char *pmc_op_to_name[] = { #undef __PMC_OP #define __PMC_OP(N, D) #N , __PMC_OPS() NULL }; #endif /* * The syscall interface */ #define PMC_GET_SX_XLOCK(...) do { \ sx_xlock(&pmc_sx); \ if (pmc_hook == NULL) { \ sx_xunlock(&pmc_sx); \ return __VA_ARGS__; \ } \ } while (0) #define PMC_DOWNGRADE_SX() do { \ sx_downgrade(&pmc_sx); \ is_sx_downgraded = 1; \ } while (0) static int pmc_syscall_handler(struct thread *td, void *syscall_args) { int error, is_sx_downgraded, op; struct pmc_syscall_args *c; void *pmclog_proc_handle; void *arg; c = (struct pmc_syscall_args *)syscall_args; op = c->pmop_code; arg = c->pmop_data; /* PMC isn't set up yet */ if (pmc_hook == NULL) return (EINVAL); if (op == PMC_OP_CONFIGURELOG) { /* * We cannot create the logging process inside * pmclog_configure_log() because there is a LOR * between pmc_sx and process structure locks. * Instead, pre-create the process and ignite the loop * if everything is fine, otherwise direct the process * to exit. */ error = pmclog_proc_create(td, &pmclog_proc_handle); if (error != 0) goto done_syscall; } PMC_GET_SX_XLOCK(ENOSYS); is_sx_downgraded = 0; PMCDBG3(MOD,PMS,1, "syscall op=%d \"%s\" arg=%p", op, pmc_op_to_name[op], arg); error = 0; counter_u64_add(pmc_stats.pm_syscalls, 1); switch (op) { /* * Configure a log file. * * XXX This OP will be reworked. */ case PMC_OP_CONFIGURELOG: { struct proc *p; struct pmc *pm; struct pmc_owner *po; struct pmc_op_configurelog cl; if ((error = copyin(arg, &cl, sizeof(cl))) != 0) { pmclog_proc_ignite(pmclog_proc_handle, NULL); break; } /* mark this process as owning a log file */ p = td->td_proc; if ((po = pmc_find_owner_descriptor(p)) == NULL) if ((po = pmc_allocate_owner_descriptor(p)) == NULL) { pmclog_proc_ignite(pmclog_proc_handle, NULL); error = ENOMEM; break; } /* * If a valid fd was passed in, try to configure that, * otherwise if 'fd' was less than zero and there was * a log file configured, flush its buffers and * de-configure it. */ if (cl.pm_logfd >= 0) { error = pmclog_configure_log(md, po, cl.pm_logfd); pmclog_proc_ignite(pmclog_proc_handle, error == 0 ? po : NULL); } else if (po->po_flags & PMC_PO_OWNS_LOGFILE) { pmclog_proc_ignite(pmclog_proc_handle, NULL); error = pmclog_close(po); if (error == 0) { LIST_FOREACH(pm, &po->po_pmcs, pm_next) if (pm->pm_flags & PMC_F_NEEDS_LOGFILE && pm->pm_state == PMC_STATE_RUNNING) pmc_stop(pm); error = pmclog_deconfigure_log(po); } } else { pmclog_proc_ignite(pmclog_proc_handle, NULL); error = EINVAL; } } break; /* * Flush a log file. */ case PMC_OP_FLUSHLOG: { struct pmc_owner *po; sx_assert(&pmc_sx, SX_XLOCKED); if ((po = pmc_find_owner_descriptor(td->td_proc)) == NULL) { error = EINVAL; break; } error = pmclog_flush(po, 0); } break; /* * Close a log file. */ case PMC_OP_CLOSELOG: { struct pmc_owner *po; sx_assert(&pmc_sx, SX_XLOCKED); if ((po = pmc_find_owner_descriptor(td->td_proc)) == NULL) { error = EINVAL; break; } error = pmclog_close(po); } break; /* * Retrieve hardware configuration. */ case PMC_OP_GETCPUINFO: /* CPU information */ { struct pmc_op_getcpuinfo gci; struct pmc_classinfo *pci; struct pmc_classdep *pcd; int cl; memset(&gci, 0, sizeof(gci)); gci.pm_cputype = md->pmd_cputype; gci.pm_ncpu = pmc_cpu_max(); gci.pm_npmc = md->pmd_npmc; gci.pm_nclass = md->pmd_nclass; pci = gci.pm_classes; pcd = md->pmd_classdep; for (cl = 0; cl < md->pmd_nclass; cl++, pci++, pcd++) { pci->pm_caps = pcd->pcd_caps; pci->pm_class = pcd->pcd_class; pci->pm_width = pcd->pcd_width; pci->pm_num = pcd->pcd_num; } error = copyout(&gci, arg, sizeof(gci)); } break; /* * Retrieve soft events list. */ case PMC_OP_GETDYNEVENTINFO: { enum pmc_class cl; enum pmc_event ev; struct pmc_op_getdyneventinfo *gei; struct pmc_dyn_event_descr dev; struct pmc_soft *ps; uint32_t nevent; sx_assert(&pmc_sx, SX_LOCKED); gei = (struct pmc_op_getdyneventinfo *) arg; if ((error = copyin(&gei->pm_class, &cl, sizeof(cl))) != 0) break; /* Only SOFT class is dynamic. */ if (cl != PMC_CLASS_SOFT) { error = EINVAL; break; } nevent = 0; for (ev = PMC_EV_SOFT_FIRST; (int)ev <= PMC_EV_SOFT_LAST; ev++) { ps = pmc_soft_ev_acquire(ev); if (ps == NULL) continue; bcopy(&ps->ps_ev, &dev, sizeof(dev)); pmc_soft_ev_release(ps); error = copyout(&dev, &gei->pm_events[nevent], sizeof(struct pmc_dyn_event_descr)); if (error != 0) break; nevent++; } if (error != 0) break; error = copyout(&nevent, &gei->pm_nevent, sizeof(nevent)); } break; /* * Get module statistics */ case PMC_OP_GETDRIVERSTATS: { struct pmc_op_getdriverstats gms; #define CFETCH(a, b, field) a.field = counter_u64_fetch(b.field) CFETCH(gms, pmc_stats, pm_intr_ignored); CFETCH(gms, pmc_stats, pm_intr_processed); CFETCH(gms, pmc_stats, pm_intr_bufferfull); CFETCH(gms, pmc_stats, pm_syscalls); CFETCH(gms, pmc_stats, pm_syscall_errors); CFETCH(gms, pmc_stats, pm_buffer_requests); CFETCH(gms, pmc_stats, pm_buffer_requests_failed); CFETCH(gms, pmc_stats, pm_log_sweeps); #undef CFETCH error = copyout(&gms, arg, sizeof(gms)); } break; /* * Retrieve module version number */ case PMC_OP_GETMODULEVERSION: { uint32_t cv, modv; /* retrieve the client's idea of the ABI version */ if ((error = copyin(arg, &cv, sizeof(uint32_t))) != 0) break; /* don't service clients newer than our driver */ modv = PMC_VERSION; if ((cv & 0xFFFF0000) > (modv & 0xFFFF0000)) { error = EPROGMISMATCH; break; } error = copyout(&modv, arg, sizeof(int)); } break; /* * Retrieve the state of all the PMCs on a given * CPU. */ case PMC_OP_GETPMCINFO: { int ari; struct pmc *pm; size_t pmcinfo_size; uint32_t cpu, n, npmc; struct pmc_owner *po; struct pmc_binding pb; struct pmc_classdep *pcd; struct pmc_info *p, *pmcinfo; struct pmc_op_getpmcinfo *gpi; PMC_DOWNGRADE_SX(); gpi = (struct pmc_op_getpmcinfo *) arg; if ((error = copyin(&gpi->pm_cpu, &cpu, sizeof(cpu))) != 0) break; if (cpu >= pmc_cpu_max()) { error = EINVAL; break; } if (!pmc_cpu_is_active(cpu)) { error = ENXIO; break; } /* switch to CPU 'cpu' */ pmc_save_cpu_binding(&pb); pmc_select_cpu(cpu); npmc = md->pmd_npmc; pmcinfo_size = npmc * sizeof(struct pmc_info); pmcinfo = malloc(pmcinfo_size, M_PMC, M_WAITOK | M_ZERO); p = pmcinfo; for (n = 0; n < md->pmd_npmc; n++, p++) { pcd = pmc_ri_to_classdep(md, n, &ari); KASSERT(pcd != NULL, ("[pmc,%d] null pcd ri=%d", __LINE__, n)); if ((error = pcd->pcd_describe(cpu, ari, p, &pm)) != 0) break; if (PMC_ROW_DISP_IS_STANDALONE(n)) p->pm_rowdisp = PMC_DISP_STANDALONE; else if (PMC_ROW_DISP_IS_THREAD(n)) p->pm_rowdisp = PMC_DISP_THREAD; else p->pm_rowdisp = PMC_DISP_FREE; p->pm_ownerpid = -1; if (pm == NULL) /* no PMC associated */ continue; po = pm->pm_owner; KASSERT(po->po_owner != NULL, ("[pmc,%d] pmc_owner had a null proc pointer", __LINE__)); p->pm_ownerpid = po->po_owner->p_pid; p->pm_mode = PMC_TO_MODE(pm); p->pm_event = pm->pm_event; p->pm_flags = pm->pm_flags; if (PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm))) p->pm_reloadcount = pm->pm_sc.pm_reloadcount; } pmc_restore_cpu_binding(&pb); /* now copy out the PMC info collected */ if (error == 0) error = copyout(pmcinfo, &gpi->pm_pmcs, pmcinfo_size); free(pmcinfo, M_PMC); } break; /* * Set the administrative state of a PMC. I.e. whether * the PMC is to be used or not. */ case PMC_OP_PMCADMIN: { int cpu, ri; enum pmc_state request; struct pmc_cpu *pc; struct pmc_hw *phw; struct pmc_op_pmcadmin pma; struct pmc_binding pb; sx_assert(&pmc_sx, SX_XLOCKED); KASSERT(td == curthread, ("[pmc,%d] td != curthread", __LINE__)); error = priv_check(td, PRIV_PMC_MANAGE); if (error) break; if ((error = copyin(arg, &pma, sizeof(pma))) != 0) break; cpu = pma.pm_cpu; if (cpu < 0 || cpu >= (int) pmc_cpu_max()) { error = EINVAL; break; } if (!pmc_cpu_is_active(cpu)) { error = ENXIO; break; } request = pma.pm_state; if (request != PMC_STATE_DISABLED && request != PMC_STATE_FREE) { error = EINVAL; break; } ri = pma.pm_pmc; /* pmc id == row index */ if (ri < 0 || ri >= (int) md->pmd_npmc) { error = EINVAL; break; } /* * We can't disable a PMC with a row-index allocated * for process virtual PMCs. */ if (PMC_ROW_DISP_IS_THREAD(ri) && request == PMC_STATE_DISABLED) { error = EBUSY; break; } /* * otherwise, this PMC on this CPU is either free or * in system-wide mode. */ pmc_save_cpu_binding(&pb); pmc_select_cpu(cpu); pc = pmc_pcpu[cpu]; phw = pc->pc_hwpmcs[ri]; /* * XXX do we need some kind of 'forced' disable? */ if (phw->phw_pmc == NULL) { if (request == PMC_STATE_DISABLED && (phw->phw_state & PMC_PHW_FLAG_IS_ENABLED)) { phw->phw_state &= ~PMC_PHW_FLAG_IS_ENABLED; PMC_MARK_ROW_STANDALONE(ri); } else if (request == PMC_STATE_FREE && (phw->phw_state & PMC_PHW_FLAG_IS_ENABLED) == 0) { phw->phw_state |= PMC_PHW_FLAG_IS_ENABLED; PMC_UNMARK_ROW_STANDALONE(ri); } /* other cases are a no-op */ } else error = EBUSY; pmc_restore_cpu_binding(&pb); } break; /* * Allocate a PMC. */ case PMC_OP_PMCALLOCATE: { int adjri, n; u_int cpu; uint32_t caps; struct pmc *pmc; enum pmc_mode mode; struct pmc_hw *phw; struct pmc_binding pb; struct pmc_classdep *pcd; struct pmc_op_pmcallocate pa; if ((error = copyin(arg, &pa, sizeof(pa))) != 0) break; caps = pa.pm_caps; mode = pa.pm_mode; cpu = pa.pm_cpu; if ((mode != PMC_MODE_SS && mode != PMC_MODE_SC && mode != PMC_MODE_TS && mode != PMC_MODE_TC) || (cpu != (u_int) PMC_CPU_ANY && cpu >= pmc_cpu_max())) { error = EINVAL; break; } /* * Virtual PMCs should only ask for a default CPU. * System mode PMCs need to specify a non-default CPU. */ if ((PMC_IS_VIRTUAL_MODE(mode) && cpu != (u_int) PMC_CPU_ANY) || (PMC_IS_SYSTEM_MODE(mode) && cpu == (u_int) PMC_CPU_ANY)) { error = EINVAL; break; } /* * Check that an inactive CPU is not being asked for. */ if (PMC_IS_SYSTEM_MODE(mode) && !pmc_cpu_is_active(cpu)) { error = ENXIO; break; } /* * Refuse an allocation for a system-wide PMC if this * process has been jailed, or if this process lacks * super-user credentials and the sysctl tunable * 'security.bsd.unprivileged_syspmcs' is zero. */ if (PMC_IS_SYSTEM_MODE(mode)) { if (jailed(curthread->td_ucred)) { error = EPERM; break; } if (!pmc_unprivileged_syspmcs) { error = priv_check(curthread, PRIV_PMC_SYSTEM); if (error) break; } } /* * Look for valid values for 'pm_flags' */ if ((pa.pm_flags & ~(PMC_F_DESCENDANTS | PMC_F_LOG_PROCCSW | PMC_F_LOG_PROCEXIT | PMC_F_CALLCHAIN | PMC_F_USERCALLCHAIN)) != 0) { error = EINVAL; break; } /* PMC_F_USERCALLCHAIN is only valid with PMC_F_CALLCHAIN */ if ((pa.pm_flags & (PMC_F_CALLCHAIN | PMC_F_USERCALLCHAIN)) == PMC_F_USERCALLCHAIN) { error = EINVAL; break; } /* PMC_F_USERCALLCHAIN is only valid for sampling mode */ if (pa.pm_flags & PMC_F_USERCALLCHAIN && mode != PMC_MODE_TS && mode != PMC_MODE_SS) { error = EINVAL; break; } /* process logging options are not allowed for system PMCs */ if (PMC_IS_SYSTEM_MODE(mode) && (pa.pm_flags & (PMC_F_LOG_PROCCSW | PMC_F_LOG_PROCEXIT))) { error = EINVAL; break; } /* * All sampling mode PMCs need to be able to interrupt the * CPU. */ if (PMC_IS_SAMPLING_MODE(mode)) caps |= PMC_CAP_INTERRUPT; /* A valid class specifier should have been passed in. */ pcd = pmc_class_to_classdep(pa.pm_class); if (pcd == NULL) { error = EINVAL; break; } /* The requested PMC capabilities should be feasible. */ if ((pcd->pcd_caps & caps) != caps) { error = EOPNOTSUPP; break; } PMCDBG4(PMC,ALL,2, "event=%d caps=0x%x mode=%d cpu=%d", pa.pm_ev, caps, mode, cpu); pmc = pmc_allocate_pmc_descriptor(); pmc->pm_id = PMC_ID_MAKE_ID(cpu,pa.pm_mode,pa.pm_class, PMC_ID_INVALID); pmc->pm_event = pa.pm_ev; pmc->pm_state = PMC_STATE_FREE; pmc->pm_caps = caps; pmc->pm_flags = pa.pm_flags; /* XXX set lower bound on sampling for process counters */ if (PMC_IS_SAMPLING_MODE(mode)) { /* * Don't permit requested sample rate to be * less than pmc_mincount. */ if (pa.pm_count < MAX(1, pmc_mincount)) log(LOG_WARNING, "pmcallocate: passed sample " "rate %ju - setting to %u\n", (uintmax_t)pa.pm_count, MAX(1, pmc_mincount)); pmc->pm_sc.pm_reloadcount = MAX(MAX(1, pmc_mincount), pa.pm_count); } else pmc->pm_sc.pm_initial = pa.pm_count; /* switch thread to CPU 'cpu' */ pmc_save_cpu_binding(&pb); #define PMC_IS_SHAREABLE_PMC(cpu, n) \ (pmc_pcpu[(cpu)]->pc_hwpmcs[(n)]->phw_state & \ PMC_PHW_FLAG_IS_SHAREABLE) #define PMC_IS_UNALLOCATED(cpu, n) \ (pmc_pcpu[(cpu)]->pc_hwpmcs[(n)]->phw_pmc == NULL) if (PMC_IS_SYSTEM_MODE(mode)) { pmc_select_cpu(cpu); for (n = pcd->pcd_ri; n < (int) md->pmd_npmc; n++) { pcd = pmc_ri_to_classdep(md, n, &adjri); if (pmc_can_allocate_row(n, mode) == 0 && pmc_can_allocate_rowindex( curthread->td_proc, n, cpu) == 0 && (PMC_IS_UNALLOCATED(cpu, n) || PMC_IS_SHAREABLE_PMC(cpu, n)) && pcd->pcd_allocate_pmc(cpu, adjri, pmc, &pa) == 0) break; } } else { /* Process virtual mode */ for (n = pcd->pcd_ri; n < (int) md->pmd_npmc; n++) { pcd = pmc_ri_to_classdep(md, n, &adjri); if (pmc_can_allocate_row(n, mode) == 0 && pmc_can_allocate_rowindex( curthread->td_proc, n, PMC_CPU_ANY) == 0 && pcd->pcd_allocate_pmc(curthread->td_oncpu, adjri, pmc, &pa) == 0) break; } } #undef PMC_IS_UNALLOCATED #undef PMC_IS_SHAREABLE_PMC pmc_restore_cpu_binding(&pb); if (n == (int) md->pmd_npmc) { pmc_destroy_pmc_descriptor(pmc); pmc = NULL; error = EINVAL; break; } /* Fill in the correct value in the ID field */ pmc->pm_id = PMC_ID_MAKE_ID(cpu,mode,pa.pm_class,n); PMCDBG5(PMC,ALL,2, "ev=%d class=%d mode=%d n=%d -> pmcid=%x", pmc->pm_event, pa.pm_class, mode, n, pmc->pm_id); /* Process mode PMCs with logging enabled need log files */ if (pmc->pm_flags & (PMC_F_LOG_PROCEXIT | PMC_F_LOG_PROCCSW)) pmc->pm_flags |= PMC_F_NEEDS_LOGFILE; /* All system mode sampling PMCs require a log file */ if (PMC_IS_SAMPLING_MODE(mode) && PMC_IS_SYSTEM_MODE(mode)) pmc->pm_flags |= PMC_F_NEEDS_LOGFILE; /* * Configure global pmc's immediately */ if (PMC_IS_SYSTEM_MODE(PMC_TO_MODE(pmc))) { pmc_save_cpu_binding(&pb); pmc_select_cpu(cpu); phw = pmc_pcpu[cpu]->pc_hwpmcs[n]; pcd = pmc_ri_to_classdep(md, n, &adjri); if ((phw->phw_state & PMC_PHW_FLAG_IS_ENABLED) == 0 || (error = pcd->pcd_config_pmc(cpu, adjri, pmc)) != 0) { (void) pcd->pcd_release_pmc(cpu, adjri, pmc); pmc_destroy_pmc_descriptor(pmc); pmc = NULL; pmc_restore_cpu_binding(&pb); error = EPERM; break; } pmc_restore_cpu_binding(&pb); } pmc->pm_state = PMC_STATE_ALLOCATED; pmc->pm_class = pa.pm_class; /* * mark row disposition */ if (PMC_IS_SYSTEM_MODE(mode)) PMC_MARK_ROW_STANDALONE(n); else PMC_MARK_ROW_THREAD(n); /* * Register this PMC with the current thread as its owner. */ if ((error = pmc_register_owner(curthread->td_proc, pmc)) != 0) { pmc_release_pmc_descriptor(pmc); pmc_destroy_pmc_descriptor(pmc); pmc = NULL; break; } /* * Return the allocated index. */ pa.pm_pmcid = pmc->pm_id; error = copyout(&pa, arg, sizeof(pa)); } break; /* * Attach a PMC to a process. */ case PMC_OP_PMCATTACH: { struct pmc *pm; struct proc *p; struct pmc_op_pmcattach a; sx_assert(&pmc_sx, SX_XLOCKED); if ((error = copyin(arg, &a, sizeof(a))) != 0) break; if (a.pm_pid < 0) { error = EINVAL; break; } else if (a.pm_pid == 0) a.pm_pid = td->td_proc->p_pid; if ((error = pmc_find_pmc(a.pm_pmc, &pm)) != 0) break; if (PMC_IS_SYSTEM_MODE(PMC_TO_MODE(pm))) { error = EINVAL; break; } /* PMCs may be (re)attached only when allocated or stopped */ if (pm->pm_state == PMC_STATE_RUNNING) { error = EBUSY; break; } else if (pm->pm_state != PMC_STATE_ALLOCATED && pm->pm_state != PMC_STATE_STOPPED) { error = EINVAL; break; } /* lookup pid */ if ((p = pfind(a.pm_pid)) == NULL) { error = ESRCH; break; } /* * Ignore processes that are working on exiting. */ if (p->p_flag & P_WEXIT) { error = ESRCH; PROC_UNLOCK(p); /* pfind() returns a locked process */ break; } /* * we are allowed to attach a PMC to a process if * we can debug it. */ error = p_candebug(curthread, p); PROC_UNLOCK(p); if (error == 0) error = pmc_attach_process(p, pm); } break; /* * Detach an attached PMC from a process. */ case PMC_OP_PMCDETACH: { struct pmc *pm; struct proc *p; struct pmc_op_pmcattach a; if ((error = copyin(arg, &a, sizeof(a))) != 0) break; if (a.pm_pid < 0) { error = EINVAL; break; } else if (a.pm_pid == 0) a.pm_pid = td->td_proc->p_pid; if ((error = pmc_find_pmc(a.pm_pmc, &pm)) != 0) break; if ((p = pfind(a.pm_pid)) == NULL) { error = ESRCH; break; } /* * Treat processes that are in the process of exiting * as if they were not present. */ if (p->p_flag & P_WEXIT) error = ESRCH; PROC_UNLOCK(p); /* pfind() returns a locked process */ if (error == 0) error = pmc_detach_process(p, pm); } break; /* * Retrieve the MSR number associated with the counter * 'pmc_id'. This allows processes to directly use RDPMC * instructions to read their PMCs, without the overhead of a * system call. */ case PMC_OP_PMCGETMSR: { int adjri, ri; struct pmc *pm; struct pmc_target *pt; struct pmc_op_getmsr gm; struct pmc_classdep *pcd; PMC_DOWNGRADE_SX(); if ((error = copyin(arg, &gm, sizeof(gm))) != 0) break; if ((error = pmc_find_pmc(gm.pm_pmcid, &pm)) != 0) break; /* * The allocated PMC has to be a process virtual PMC, * i.e., of type MODE_T[CS]. Global PMCs can only be * read using the PMCREAD operation since they may be * allocated on a different CPU than the one we could * be running on at the time of the RDPMC instruction. * * The GETMSR operation is not allowed for PMCs that * are inherited across processes. */ if (!PMC_IS_VIRTUAL_MODE(PMC_TO_MODE(pm)) || (pm->pm_flags & PMC_F_DESCENDANTS)) { error = EINVAL; break; } /* * It only makes sense to use a RDPMC (or its * equivalent instruction on non-x86 architectures) on * a process that has allocated and attached a PMC to * itself. Conversely the PMC is only allowed to have * one process attached to it -- its owner. */ if ((pt = LIST_FIRST(&pm->pm_targets)) == NULL || LIST_NEXT(pt, pt_next) != NULL || pt->pt_process->pp_proc != pm->pm_owner->po_owner) { error = EINVAL; break; } ri = PMC_TO_ROWINDEX(pm); pcd = pmc_ri_to_classdep(md, ri, &adjri); /* PMC class has no 'GETMSR' support */ if (pcd->pcd_get_msr == NULL) { error = ENOSYS; break; } if ((error = (*pcd->pcd_get_msr)(adjri, &gm.pm_msr)) < 0) break; if ((error = copyout(&gm, arg, sizeof(gm))) < 0) break; /* * Mark our process as using MSRs. Update machine * state using a forced context switch. */ pt->pt_process->pp_flags |= PMC_PP_ENABLE_MSR_ACCESS; pmc_force_context_switch(); } break; /* * Release an allocated PMC */ case PMC_OP_PMCRELEASE: { pmc_id_t pmcid; struct pmc *pm; struct pmc_owner *po; struct pmc_op_simple sp; /* * Find PMC pointer for the named PMC. * * Use pmc_release_pmc_descriptor() to switch off the * PMC, remove all its target threads, and remove the * PMC from its owner's list. * * Remove the owner record if this is the last PMC * owned. * * Free up space. */ if ((error = copyin(arg, &sp, sizeof(sp))) != 0) break; pmcid = sp.pm_pmcid; if ((error = pmc_find_pmc(pmcid, &pm)) != 0) break; po = pm->pm_owner; pmc_release_pmc_descriptor(pm); pmc_maybe_remove_owner(po); pmc_destroy_pmc_descriptor(pm); } break; /* * Read and/or write a PMC. */ case PMC_OP_PMCRW: { int adjri; struct pmc *pm; uint32_t cpu, ri; pmc_value_t oldvalue; struct pmc_binding pb; struct pmc_op_pmcrw prw; struct pmc_classdep *pcd; struct pmc_op_pmcrw *pprw; PMC_DOWNGRADE_SX(); if ((error = copyin(arg, &prw, sizeof(prw))) != 0) break; PMCDBG2(PMC,OPS,1, "rw id=%d flags=0x%x", prw.pm_pmcid, prw.pm_flags); /* must have at least one flag set */ if ((prw.pm_flags & (PMC_F_OLDVALUE|PMC_F_NEWVALUE)) == 0) { error = EINVAL; break; } /* locate pmc descriptor */ if ((error = pmc_find_pmc(prw.pm_pmcid, &pm)) != 0) break; /* Can't read a PMC that hasn't been started. */ if (pm->pm_state != PMC_STATE_ALLOCATED && pm->pm_state != PMC_STATE_STOPPED && pm->pm_state != PMC_STATE_RUNNING) { error = EINVAL; break; } /* writing a new value is allowed only for 'STOPPED' pmcs */ if (pm->pm_state == PMC_STATE_RUNNING && (prw.pm_flags & PMC_F_NEWVALUE)) { error = EBUSY; break; } if (PMC_IS_VIRTUAL_MODE(PMC_TO_MODE(pm))) { /* * If this PMC is attached to its owner (i.e., * the process requesting this operation) and * is running, then attempt to get an * upto-date reading from hardware for a READ. * Writes are only allowed when the PMC is * stopped, so only update the saved value * field. * * If the PMC is not running, or is not * attached to its owner, read/write to the * savedvalue field. */ ri = PMC_TO_ROWINDEX(pm); pcd = pmc_ri_to_classdep(md, ri, &adjri); mtx_pool_lock_spin(pmc_mtxpool, pm); cpu = curthread->td_oncpu; if (prw.pm_flags & PMC_F_OLDVALUE) { if ((pm->pm_flags & PMC_F_ATTACHED_TO_OWNER) && (pm->pm_state == PMC_STATE_RUNNING)) error = (*pcd->pcd_read_pmc)(cpu, adjri, &oldvalue); else oldvalue = pm->pm_gv.pm_savedvalue; } if (prw.pm_flags & PMC_F_NEWVALUE) pm->pm_gv.pm_savedvalue = prw.pm_value; mtx_pool_unlock_spin(pmc_mtxpool, pm); } else { /* System mode PMCs */ cpu = PMC_TO_CPU(pm); ri = PMC_TO_ROWINDEX(pm); pcd = pmc_ri_to_classdep(md, ri, &adjri); if (!pmc_cpu_is_active(cpu)) { error = ENXIO; break; } /* move this thread to CPU 'cpu' */ pmc_save_cpu_binding(&pb); pmc_select_cpu(cpu); critical_enter(); /* save old value */ if (prw.pm_flags & PMC_F_OLDVALUE) if ((error = (*pcd->pcd_read_pmc)(cpu, adjri, &oldvalue))) goto error; /* write out new value */ if (prw.pm_flags & PMC_F_NEWVALUE) error = (*pcd->pcd_write_pmc)(cpu, adjri, prw.pm_value); error: critical_exit(); pmc_restore_cpu_binding(&pb); if (error) break; } pprw = (struct pmc_op_pmcrw *) arg; #ifdef HWPMC_DEBUG if (prw.pm_flags & PMC_F_NEWVALUE) PMCDBG3(PMC,OPS,2, "rw id=%d new %jx -> old %jx", ri, prw.pm_value, oldvalue); else if (prw.pm_flags & PMC_F_OLDVALUE) PMCDBG2(PMC,OPS,2, "rw id=%d -> old %jx", ri, oldvalue); #endif /* return old value if requested */ if (prw.pm_flags & PMC_F_OLDVALUE) if ((error = copyout(&oldvalue, &pprw->pm_value, sizeof(prw.pm_value)))) break; } break; /* * Set the sampling rate for a sampling mode PMC and the * initial count for a counting mode PMC. */ case PMC_OP_PMCSETCOUNT: { struct pmc *pm; struct pmc_op_pmcsetcount sc; PMC_DOWNGRADE_SX(); if ((error = copyin(arg, &sc, sizeof(sc))) != 0) break; if ((error = pmc_find_pmc(sc.pm_pmcid, &pm)) != 0) break; if (pm->pm_state == PMC_STATE_RUNNING) { error = EBUSY; break; } if (PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm))) { /* * Don't permit requested sample rate to be * less than pmc_mincount. */ if (sc.pm_count < MAX(1, pmc_mincount)) log(LOG_WARNING, "pmcsetcount: passed sample " "rate %ju - setting to %u\n", (uintmax_t)sc.pm_count, MAX(1, pmc_mincount)); pm->pm_sc.pm_reloadcount = MAX(MAX(1, pmc_mincount), sc.pm_count); } else pm->pm_sc.pm_initial = sc.pm_count; } break; /* * Start a PMC. */ case PMC_OP_PMCSTART: { pmc_id_t pmcid; struct pmc *pm; struct pmc_op_simple sp; sx_assert(&pmc_sx, SX_XLOCKED); if ((error = copyin(arg, &sp, sizeof(sp))) != 0) break; pmcid = sp.pm_pmcid; if ((error = pmc_find_pmc(pmcid, &pm)) != 0) break; KASSERT(pmcid == pm->pm_id, ("[pmc,%d] pmcid %x != id %x", __LINE__, pm->pm_id, pmcid)); if (pm->pm_state == PMC_STATE_RUNNING) /* already running */ break; else if (pm->pm_state != PMC_STATE_STOPPED && pm->pm_state != PMC_STATE_ALLOCATED) { error = EINVAL; break; } error = pmc_start(pm); } break; /* * Stop a PMC. */ case PMC_OP_PMCSTOP: { pmc_id_t pmcid; struct pmc *pm; struct pmc_op_simple sp; PMC_DOWNGRADE_SX(); if ((error = copyin(arg, &sp, sizeof(sp))) != 0) break; pmcid = sp.pm_pmcid; /* * Mark the PMC as inactive and invoke the MD stop * routines if needed. */ if ((error = pmc_find_pmc(pmcid, &pm)) != 0) break; KASSERT(pmcid == pm->pm_id, ("[pmc,%d] pmc id %x != pmcid %x", __LINE__, pm->pm_id, pmcid)); if (pm->pm_state == PMC_STATE_STOPPED) /* already stopped */ break; else if (pm->pm_state != PMC_STATE_RUNNING) { error = EINVAL; break; } error = pmc_stop(pm); } break; /* * Write a user supplied value to the log file. */ case PMC_OP_WRITELOG: { struct pmc_op_writelog wl; struct pmc_owner *po; PMC_DOWNGRADE_SX(); if ((error = copyin(arg, &wl, sizeof(wl))) != 0) break; if ((po = pmc_find_owner_descriptor(td->td_proc)) == NULL) { error = EINVAL; break; } if ((po->po_flags & PMC_PO_OWNS_LOGFILE) == 0) { error = EINVAL; break; } error = pmclog_process_userlog(po, &wl); } break; default: error = EINVAL; break; } if (is_sx_downgraded) sx_sunlock(&pmc_sx); else sx_xunlock(&pmc_sx); done_syscall: if (error) counter_u64_add(pmc_stats.pm_syscall_errors, 1); return (error); } /* * Helper functions */ /* * Mark the thread as needing callchain capture and post an AST. The * actual callchain capture will be done in a context where it is safe * to take page faults. */ static void pmc_post_callchain_callback(void) { struct thread *td; td = curthread; /* * If there is multiple PMCs for the same interrupt ignore new post */ if (td->td_pflags & TDP_CALLCHAIN) return; /* * Mark this thread as needing callchain capture. * `td->td_pflags' will be safe to touch because this thread * was in user space when it was interrupted. */ td->td_pflags |= TDP_CALLCHAIN; /* * Don't let this thread migrate between CPUs until callchain * capture completes. */ sched_pin(); return; } /* * Find a free slot in the per-cpu array of samples and capture the * current callchain there. If a sample was successfully added, a bit * is set in mask 'pmc_cpumask' denoting that the DO_SAMPLES hook * needs to be invoked from the clock handler. * * This function is meant to be called from an NMI handler. It cannot * use any of the locking primitives supplied by the OS. */ static int pmc_add_sample(ring_type_t ring, struct pmc *pm, struct trapframe *tf) { int error, cpu, callchaindepth, inuserspace; struct thread *td; struct pmc_sample *ps; struct pmc_samplebuffer *psb; error = 0; /* * Allocate space for a sample buffer. */ cpu = curcpu; psb = pmc_pcpu[cpu]->pc_sb[ring]; inuserspace = TRAPF_USERMODE(tf); ps = PMC_PROD_SAMPLE(psb); if (psb->ps_considx != psb->ps_prodidx && ps->ps_nsamples) { /* in use, reader hasn't caught up */ pm->pm_pcpu_state[cpu].pps_stalled = 1; counter_u64_add(pmc_stats.pm_intr_bufferfull, 1); PMCDBG6(SAM,INT,1,"(spc) cpu=%d pm=%p tf=%p um=%d wr=%d rd=%d", cpu, pm, (void *) tf, inuserspace, (int) (psb->ps_prodidx & pmc_sample_mask), (int) (psb->ps_considx & pmc_sample_mask)); callchaindepth = 1; error = ENOMEM; goto done; } /* Fill in entry. */ PMCDBG6(SAM,INT,1,"cpu=%d pm=%p tf=%p um=%d wr=%d rd=%d", cpu, pm, (void *) tf, inuserspace, (int) (psb->ps_prodidx & pmc_sample_mask), (int) (psb->ps_considx & pmc_sample_mask)); td = curthread; ps->ps_pmc = pm; ps->ps_td = td; ps->ps_pid = td->td_proc->p_pid; ps->ps_tid = td->td_tid; ps->ps_tsc = pmc_rdtsc(); ps->ps_ticks = ticks; ps->ps_cpu = cpu; ps->ps_flags = inuserspace ? PMC_CC_F_USERSPACE : 0; callchaindepth = (pm->pm_flags & PMC_F_CALLCHAIN) ? pmc_callchaindepth : 1; MPASS(ps->ps_pc != NULL); if (callchaindepth == 1) ps->ps_pc[0] = PMC_TRAPFRAME_TO_PC(tf); else { /* * Kernel stack traversals can be done immediately, * while we defer to an AST for user space traversals. */ if (!inuserspace) { callchaindepth = pmc_save_kernel_callchain(ps->ps_pc, callchaindepth, tf); } else { pmc_post_callchain_callback(); callchaindepth = PMC_USER_CALLCHAIN_PENDING; } } ps->ps_nsamples = callchaindepth; /* mark entry as in use */ if (ring == PMC_UR) { ps->ps_nsamples_actual = callchaindepth; /* mark entry as in use */ ps->ps_nsamples = PMC_USER_CALLCHAIN_PENDING; } else ps->ps_nsamples = callchaindepth; /* mark entry as in use */ KASSERT(counter_u64_fetch(pm->pm_runcount) >= 0, ("[pmc,%d] pm=%p runcount %ld", __LINE__, (void *) pm, (unsigned long)counter_u64_fetch(pm->pm_runcount))); counter_u64_add(pm->pm_runcount, 1); /* hold onto PMC */ /* increment write pointer */ psb->ps_prodidx++; done: /* mark CPU as needing processing */ if (callchaindepth != PMC_USER_CALLCHAIN_PENDING) DPCPU_SET(pmc_sampled, 1); return (error); } /* * Interrupt processing. * * This function is meant to be called from an NMI handler. It cannot * use any of the locking primitives supplied by the OS. */ int pmc_process_interrupt(int ring, struct pmc *pm, struct trapframe *tf) { struct thread *td; td = curthread; if ((pm->pm_flags & PMC_F_USERCALLCHAIN) && (td->td_proc->p_flag & P_KPROC) == 0 && !TRAPF_USERMODE(tf)) { atomic_add_int(&td->td_pmcpend, 1); return (pmc_add_sample(PMC_UR, pm, tf)); } return (pmc_add_sample(ring, pm, tf)); } /* * Capture a user call chain. This function will be called from ast() * before control returns to userland and before the process gets * rescheduled. */ static void pmc_capture_user_callchain(int cpu, int ring, struct trapframe *tf) { struct pmc *pm; struct thread *td; struct pmc_sample *ps; struct pmc_samplebuffer *psb; uint64_t considx, prodidx; int nsamples, nrecords, pass, iter; #ifdef INVARIANTS int start_ticks = ticks; #endif psb = pmc_pcpu[cpu]->pc_sb[ring]; td = curthread; KASSERT(td->td_pflags & TDP_CALLCHAIN, ("[pmc,%d] Retrieving callchain for thread that doesn't want it", __LINE__)); nrecords = INT_MAX; pass = 0; restart: if (ring == PMC_UR) nrecords = atomic_readandclear_32(&td->td_pmcpend); for (iter = 0, considx = psb->ps_considx, prodidx = psb->ps_prodidx; considx < prodidx && iter < pmc_nsamples; considx++, iter++) { ps = PMC_CONS_SAMPLE_OFF(psb, considx); /* * Iterate through all deferred callchain requests. * Walk from the current read pointer to the current * write pointer. */ #ifdef INVARIANTS if (ps->ps_nsamples == PMC_SAMPLE_FREE) { continue; } #endif if (ps->ps_td != td || ps->ps_nsamples != PMC_USER_CALLCHAIN_PENDING || ps->ps_pmc->pm_state != PMC_STATE_RUNNING) continue; KASSERT(ps->ps_cpu == cpu, ("[pmc,%d] cpu mismatch ps_cpu=%d pcpu=%d", __LINE__, ps->ps_cpu, PCPU_GET(cpuid))); pm = ps->ps_pmc; KASSERT(pm->pm_flags & PMC_F_CALLCHAIN, ("[pmc,%d] Retrieving callchain for PMC that doesn't " "want it", __LINE__)); KASSERT(counter_u64_fetch(pm->pm_runcount) > 0, ("[pmc,%d] runcount %ld", __LINE__, (unsigned long)counter_u64_fetch(pm->pm_runcount))); if (ring == PMC_UR) { nsamples = ps->ps_nsamples_actual; counter_u64_add(pmc_stats.pm_merges, 1); } else nsamples = 0; /* * Retrieve the callchain and mark the sample buffer * as 'processable' by the timer tick sweep code. */ if (__predict_true(nsamples < pmc_callchaindepth - 1)) nsamples += pmc_save_user_callchain(ps->ps_pc + nsamples, pmc_callchaindepth - nsamples - 1, tf); /* * We have to prevent hardclock from potentially overwriting * this sample between when we read the value and when we set * it */ spinlock_enter(); /* * Verify that the sample hasn't been dropped in the meantime */ if (ps->ps_nsamples == PMC_USER_CALLCHAIN_PENDING) { ps->ps_nsamples = nsamples; /* * If we couldn't get a sample, simply drop the reference */ if (nsamples == 0) counter_u64_add(pm->pm_runcount, -1); } spinlock_exit(); if (nrecords-- == 1) break; } if (__predict_false(ring == PMC_UR && td->td_pmcpend)) { if (pass == 0) { pass = 1; goto restart; } /* only collect samples for this part once */ td->td_pmcpend = 0; } #ifdef INVARIANTS if ((ticks - start_ticks) > hz) log(LOG_ERR, "%s took %d ticks\n", __func__, (ticks - start_ticks)); #endif /* mark CPU as needing processing */ DPCPU_SET(pmc_sampled, 1); } /* * Process saved PC samples. */ static void pmc_process_samples(int cpu, ring_type_t ring) { struct pmc *pm; int adjri, n; struct thread *td; struct pmc_owner *po; struct pmc_sample *ps; struct pmc_classdep *pcd; struct pmc_samplebuffer *psb; uint64_t delta __diagused; KASSERT(PCPU_GET(cpuid) == cpu, ("[pmc,%d] not on the correct CPU pcpu=%d cpu=%d", __LINE__, PCPU_GET(cpuid), cpu)); psb = pmc_pcpu[cpu]->pc_sb[ring]; delta = psb->ps_prodidx - psb->ps_considx; MPASS(delta <= pmc_nsamples); MPASS(psb->ps_considx <= psb->ps_prodidx); for (n = 0; psb->ps_considx < psb->ps_prodidx; psb->ps_considx++, n++) { ps = PMC_CONS_SAMPLE(psb); if (__predict_false(ps->ps_nsamples == PMC_SAMPLE_FREE)) continue; pm = ps->ps_pmc; /* skip non-running samples */ if (pm->pm_state != PMC_STATE_RUNNING) goto entrydone; KASSERT(counter_u64_fetch(pm->pm_runcount) > 0, ("[pmc,%d] pm=%p runcount %ld", __LINE__, (void *) pm, (unsigned long)counter_u64_fetch(pm->pm_runcount))); po = pm->pm_owner; KASSERT(PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm)), ("[pmc,%d] pmc=%p non-sampling mode=%d", __LINE__, pm, PMC_TO_MODE(pm))); /* If there is a pending AST wait for completion */ if (ps->ps_nsamples == PMC_USER_CALLCHAIN_PENDING) { /* if we've been waiting more than 1 tick to * collect a callchain for this record then * drop it and move on. */ if (ticks - ps->ps_ticks > 1) { /* * track how often we hit this as it will * preferentially lose user samples * for long running system calls */ counter_u64_add(pmc_stats.pm_overwrites, 1); goto entrydone; } /* Need a rescan at a later time. */ DPCPU_SET(pmc_sampled, 1); break; } PMCDBG6(SAM,OPS,1,"cpu=%d pm=%p n=%d fl=%x wr=%d rd=%d", cpu, pm, ps->ps_nsamples, ps->ps_flags, (int) (psb->ps_prodidx & pmc_sample_mask), (int) (psb->ps_considx & pmc_sample_mask)); /* * If this is a process-mode PMC that is attached to * its owner, and if the PC is in user mode, update * profiling statistics like timer-based profiling * would have done. * * Otherwise, this is either a sampling-mode PMC that * is attached to a different process than its owner, * or a system-wide sampling PMC. Dispatch a log * entry to the PMC's owner process. */ if (pm->pm_flags & PMC_F_ATTACHED_TO_OWNER) { if (ps->ps_flags & PMC_CC_F_USERSPACE) { td = FIRST_THREAD_IN_PROC(po->po_owner); addupc_intr(td, ps->ps_pc[0], 1); } } else pmclog_process_callchain(pm, ps); entrydone: ps->ps_nsamples = 0; /* mark entry as free */ KASSERT(counter_u64_fetch(pm->pm_runcount) > 0, ("[pmc,%d] pm=%p runcount %ld", __LINE__, (void *) pm, (unsigned long)counter_u64_fetch(pm->pm_runcount))); counter_u64_add(pm->pm_runcount, -1); } counter_u64_add(pmc_stats.pm_log_sweeps, 1); /* Do not re-enable stalled PMCs if we failed to process any samples */ if (n == 0) return; /* * Restart any stalled sampling PMCs on this CPU. * * If the NMI handler sets the pm_stalled field of a PMC after * the check below, we'll end up processing the stalled PMC at * the next hardclock tick. */ for (n = 0; n < md->pmd_npmc; n++) { pcd = pmc_ri_to_classdep(md, n, &adjri); KASSERT(pcd != NULL, ("[pmc,%d] null pcd ri=%d", __LINE__, n)); (void) (*pcd->pcd_get_config)(cpu,adjri,&pm); if (pm == NULL || /* !cfg'ed */ pm->pm_state != PMC_STATE_RUNNING || /* !active */ !PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm)) || /* !sampling */ !pm->pm_pcpu_state[cpu].pps_cpustate || /* !desired */ !pm->pm_pcpu_state[cpu].pps_stalled) /* !stalled */ continue; pm->pm_pcpu_state[cpu].pps_stalled = 0; (*pcd->pcd_start_pmc)(cpu, adjri); } } /* * Event handlers. */ /* * Handle a process exit. * * Remove this process from all hash tables. If this process * owned any PMCs, turn off those PMCs and deallocate them, * removing any associations with target processes. * * This function will be called by the last 'thread' of a * process. * * XXX This eventhandler gets called early in the exit process. * Consider using a 'hook' invocation from thread_exit() or equivalent * spot. Another negative is that kse_exit doesn't seem to call * exit1() [??]. * */ static void pmc_process_exit(void *arg __unused, struct proc *p) { struct pmc *pm; int adjri, cpu; unsigned int ri; int is_using_hwpmcs; struct pmc_owner *po; struct pmc_process *pp; struct pmc_classdep *pcd; pmc_value_t newvalue, tmp; PROC_LOCK(p); is_using_hwpmcs = p->p_flag & P_HWPMC; PROC_UNLOCK(p); /* * Log a sysexit event to all SS PMC owners. */ PMC_EPOCH_ENTER(); CK_LIST_FOREACH(po, &pmc_ss_owners, po_ssnext) if (po->po_flags & PMC_PO_OWNS_LOGFILE) pmclog_process_sysexit(po, p->p_pid); PMC_EPOCH_EXIT(); if (!is_using_hwpmcs) return; PMC_GET_SX_XLOCK(); PMCDBG3(PRC,EXT,1,"process-exit proc=%p (%d, %s)", p, p->p_pid, p->p_comm); /* * Since this code is invoked by the last thread in an exiting * process, we would have context switched IN at some prior * point. However, with PREEMPTION, kernel mode context * switches may happen any time, so we want to disable a * context switch OUT till we get any PMCs targeting this * process off the hardware. * * We also need to atomically remove this process' * entry from our target process hash table, using * PMC_FLAG_REMOVE. */ PMCDBG3(PRC,EXT,1, "process-exit proc=%p (%d, %s)", p, p->p_pid, p->p_comm); critical_enter(); /* no preemption */ cpu = curthread->td_oncpu; if ((pp = pmc_find_process_descriptor(p, PMC_FLAG_REMOVE)) != NULL) { PMCDBG2(PRC,EXT,2, "process-exit proc=%p pmc-process=%p", p, pp); /* * The exiting process could the target of * some PMCs which will be running on * currently executing CPU. * * We need to turn these PMCs off like we * would do at context switch OUT time. */ for (ri = 0; ri < md->pmd_npmc; ri++) { /* * Pick up the pmc pointer from hardware * state similar to the CSW_OUT code. */ pm = NULL; pcd = pmc_ri_to_classdep(md, ri, &adjri); (void) (*pcd->pcd_get_config)(cpu, adjri, &pm); PMCDBG2(PRC,EXT,2, "ri=%d pm=%p", ri, pm); if (pm == NULL || !PMC_IS_VIRTUAL_MODE(PMC_TO_MODE(pm))) continue; PMCDBG4(PRC,EXT,2, "ppmcs[%d]=%p pm=%p " "state=%d", ri, pp->pp_pmcs[ri].pp_pmc, pm, pm->pm_state); KASSERT(PMC_TO_ROWINDEX(pm) == ri, ("[pmc,%d] ri mismatch pmc(%d) ri(%d)", __LINE__, PMC_TO_ROWINDEX(pm), ri)); KASSERT(pm == pp->pp_pmcs[ri].pp_pmc, ("[pmc,%d] pm %p != pp_pmcs[%d] %p", __LINE__, pm, ri, pp->pp_pmcs[ri].pp_pmc)); KASSERT(counter_u64_fetch(pm->pm_runcount) > 0, ("[pmc,%d] bad runcount ri %d rc %ld", __LINE__, ri, (unsigned long)counter_u64_fetch(pm->pm_runcount))); /* * Change desired state, and then stop if not * stalled. This two-step dance should avoid * race conditions where an interrupt re-enables * the PMC after this code has already checked * the pm_stalled flag. */ if (pm->pm_pcpu_state[cpu].pps_cpustate) { pm->pm_pcpu_state[cpu].pps_cpustate = 0; if (!pm->pm_pcpu_state[cpu].pps_stalled) { (void) pcd->pcd_stop_pmc(cpu, adjri); if (PMC_TO_MODE(pm) == PMC_MODE_TC) { pcd->pcd_read_pmc(cpu, adjri, &newvalue); tmp = newvalue - PMC_PCPU_SAVED(cpu,ri); mtx_pool_lock_spin(pmc_mtxpool, pm); pm->pm_gv.pm_savedvalue += tmp; pp->pp_pmcs[ri].pp_pmcval += tmp; mtx_pool_unlock_spin( pmc_mtxpool, pm); } } } KASSERT((int64_t) counter_u64_fetch(pm->pm_runcount) > 0, ("[pmc,%d] runcount is %d", __LINE__, ri)); counter_u64_add(pm->pm_runcount, -1); (void) pcd->pcd_config_pmc(cpu, adjri, NULL); } /* * Inform the MD layer of this pseudo "context switch * out" */ (void) md->pmd_switch_out(pmc_pcpu[cpu], pp); critical_exit(); /* ok to be pre-empted now */ /* * Unlink this process from the PMCs that are * targeting it. This will send a signal to * all PMC owner's whose PMCs are orphaned. * * Log PMC value at exit time if requested. */ for (ri = 0; ri < md->pmd_npmc; ri++) if ((pm = pp->pp_pmcs[ri].pp_pmc) != NULL) { if (pm->pm_flags & PMC_F_NEEDS_LOGFILE && PMC_IS_COUNTING_MODE(PMC_TO_MODE(pm))) pmclog_process_procexit(pm, pp); pmc_unlink_target_process(pm, pp); } free(pp, M_PMC); } else critical_exit(); /* pp == NULL */ /* * If the process owned PMCs, free them up and free up * memory. */ if ((po = pmc_find_owner_descriptor(p)) != NULL) { pmc_remove_owner(po); pmc_destroy_owner_descriptor(po); } sx_xunlock(&pmc_sx); } /* * Handle a process fork. * * If the parent process 'p1' is under HWPMC monitoring, then copy * over any attached PMCs that have 'do_descendants' semantics. */ static void pmc_process_fork(void *arg __unused, struct proc *p1, struct proc *newproc, int flags) { int is_using_hwpmcs; unsigned int ri; uint32_t do_descendants; struct pmc *pm; struct pmc_owner *po; struct pmc_process *ppnew, *ppold; (void) flags; /* unused parameter */ PROC_LOCK(p1); is_using_hwpmcs = p1->p_flag & P_HWPMC; PROC_UNLOCK(p1); /* * If there are system-wide sampling PMCs active, we need to * log all fork events to their owner's logs. */ PMC_EPOCH_ENTER(); CK_LIST_FOREACH(po, &pmc_ss_owners, po_ssnext) if (po->po_flags & PMC_PO_OWNS_LOGFILE) { pmclog_process_procfork(po, p1->p_pid, newproc->p_pid); pmclog_process_proccreate(po, newproc, 1); } PMC_EPOCH_EXIT(); if (!is_using_hwpmcs) return; PMC_GET_SX_XLOCK(); PMCDBG4(PMC,FRK,1, "process-fork proc=%p (%d, %s) -> %p", p1, p1->p_pid, p1->p_comm, newproc); /* * If the parent process (curthread->td_proc) is a * target of any PMCs, look for PMCs that are to be * inherited, and link these into the new process * descriptor. */ if ((ppold = pmc_find_process_descriptor(curthread->td_proc, PMC_FLAG_NONE)) == NULL) goto done; /* nothing to do */ do_descendants = 0; for (ri = 0; ri < md->pmd_npmc; ri++) if ((pm = ppold->pp_pmcs[ri].pp_pmc) != NULL) do_descendants |= pm->pm_flags & PMC_F_DESCENDANTS; if (do_descendants == 0) /* nothing to do */ goto done; /* * Now mark the new process as being tracked by this driver. */ PROC_LOCK(newproc); newproc->p_flag |= P_HWPMC; PROC_UNLOCK(newproc); /* allocate a descriptor for the new process */ if ((ppnew = pmc_find_process_descriptor(newproc, PMC_FLAG_ALLOCATE)) == NULL) goto done; /* * Run through all PMCs that were targeting the old process * and which specified F_DESCENDANTS and attach them to the * new process. * * Log the fork event to all owners of PMCs attached to this * process, if not already logged. */ for (ri = 0; ri < md->pmd_npmc; ri++) if ((pm = ppold->pp_pmcs[ri].pp_pmc) != NULL && (pm->pm_flags & PMC_F_DESCENDANTS)) { pmc_link_target_process(pm, ppnew); po = pm->pm_owner; if (po->po_sscount == 0 && po->po_flags & PMC_PO_OWNS_LOGFILE) pmclog_process_procfork(po, p1->p_pid, newproc->p_pid); } done: sx_xunlock(&pmc_sx); } static void pmc_process_threadcreate(struct thread *td) { struct pmc_owner *po; PMC_EPOCH_ENTER(); CK_LIST_FOREACH(po, &pmc_ss_owners, po_ssnext) if (po->po_flags & PMC_PO_OWNS_LOGFILE) pmclog_process_threadcreate(po, td, 1); PMC_EPOCH_EXIT(); } static void pmc_process_threadexit(struct thread *td) { struct pmc_owner *po; PMC_EPOCH_ENTER(); CK_LIST_FOREACH(po, &pmc_ss_owners, po_ssnext) if (po->po_flags & PMC_PO_OWNS_LOGFILE) pmclog_process_threadexit(po, td); PMC_EPOCH_EXIT(); } static void pmc_process_proccreate(struct proc *p) { struct pmc_owner *po; PMC_EPOCH_ENTER(); CK_LIST_FOREACH(po, &pmc_ss_owners, po_ssnext) if (po->po_flags & PMC_PO_OWNS_LOGFILE) pmclog_process_proccreate(po, p, 1 /* sync */); PMC_EPOCH_EXIT(); } static void pmc_process_allproc(struct pmc *pm) { struct pmc_owner *po; struct thread *td; struct proc *p; po = pm->pm_owner; if ((po->po_flags & PMC_PO_OWNS_LOGFILE) == 0) return; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { pmclog_process_proccreate(po, p, 0 /* sync */); PROC_LOCK(p); FOREACH_THREAD_IN_PROC(p, td) pmclog_process_threadcreate(po, td, 0 /* sync */); PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); pmclog_flush(po, 0); } static void pmc_kld_load(void *arg __unused, linker_file_t lf) { struct pmc_owner *po; /* * Notify owners of system sampling PMCs about KLD operations. */ PMC_EPOCH_ENTER(); CK_LIST_FOREACH(po, &pmc_ss_owners, po_ssnext) if (po->po_flags & PMC_PO_OWNS_LOGFILE) pmclog_process_map_in(po, (pid_t) -1, (uintfptr_t) lf->address, lf->filename); PMC_EPOCH_EXIT(); /* * TODO: Notify owners of (all) process-sampling PMCs too. */ } static void pmc_kld_unload(void *arg __unused, const char *filename __unused, caddr_t address, size_t size) { struct pmc_owner *po; PMC_EPOCH_ENTER(); CK_LIST_FOREACH(po, &pmc_ss_owners, po_ssnext) if (po->po_flags & PMC_PO_OWNS_LOGFILE) pmclog_process_map_out(po, (pid_t) -1, (uintfptr_t) address, (uintfptr_t) address + size); PMC_EPOCH_EXIT(); /* * TODO: Notify owners of process-sampling PMCs. */ } /* * initialization */ static const char * pmc_name_of_pmcclass(enum pmc_class class) { switch (class) { #undef __PMC_CLASS #define __PMC_CLASS(S,V,D) \ case PMC_CLASS_##S: \ return #S; __PMC_CLASSES(); default: return (""); } } /* * Base class initializer: allocate structure and set default classes. */ struct pmc_mdep * pmc_mdep_alloc(int nclasses) { struct pmc_mdep *md; int n; /* SOFT + md classes */ n = 1 + nclasses; md = malloc(sizeof(struct pmc_mdep) + n * sizeof(struct pmc_classdep), M_PMC, M_WAITOK|M_ZERO); md->pmd_nclass = n; /* Add base class. */ pmc_soft_initialize(md); return md; } void pmc_mdep_free(struct pmc_mdep *md) { pmc_soft_finalize(md); free(md, M_PMC); } static int generic_switch_in(struct pmc_cpu *pc, struct pmc_process *pp) { (void) pc; (void) pp; return (0); } static int generic_switch_out(struct pmc_cpu *pc, struct pmc_process *pp) { (void) pc; (void) pp; return (0); } static struct pmc_mdep * pmc_generic_cpu_initialize(void) { struct pmc_mdep *md; md = pmc_mdep_alloc(0); md->pmd_cputype = PMC_CPU_GENERIC; - md->pmd_pcpu_init = NULL; - md->pmd_pcpu_fini = NULL; md->pmd_switch_in = generic_switch_in; md->pmd_switch_out = generic_switch_out; return (md); } static void pmc_generic_cpu_finalize(struct pmc_mdep *md) { (void) md; } static int pmc_initialize(void) { int c, cpu, error, n, ri; unsigned int maxcpu, domain; struct pcpu *pc; struct pmc_binding pb; struct pmc_sample *ps; struct pmc_classdep *pcd; struct pmc_samplebuffer *sb; md = NULL; error = 0; pmc_stats.pm_intr_ignored = counter_u64_alloc(M_WAITOK); pmc_stats.pm_intr_processed = counter_u64_alloc(M_WAITOK); pmc_stats.pm_intr_bufferfull = counter_u64_alloc(M_WAITOK); pmc_stats.pm_syscalls = counter_u64_alloc(M_WAITOK); pmc_stats.pm_syscall_errors = counter_u64_alloc(M_WAITOK); pmc_stats.pm_buffer_requests = counter_u64_alloc(M_WAITOK); pmc_stats.pm_buffer_requests_failed = counter_u64_alloc(M_WAITOK); pmc_stats.pm_log_sweeps = counter_u64_alloc(M_WAITOK); pmc_stats.pm_merges = counter_u64_alloc(M_WAITOK); pmc_stats.pm_overwrites = counter_u64_alloc(M_WAITOK); #ifdef HWPMC_DEBUG /* parse debug flags first */ if (TUNABLE_STR_FETCH(PMC_SYSCTL_NAME_PREFIX "debugflags", pmc_debugstr, sizeof(pmc_debugstr))) pmc_debugflags_parse(pmc_debugstr, pmc_debugstr+strlen(pmc_debugstr)); #endif PMCDBG1(MOD,INI,0, "PMC Initialize (version %x)", PMC_VERSION); /* check kernel version */ if (pmc_kernel_version != PMC_VERSION) { if (pmc_kernel_version == 0) printf("hwpmc: this kernel has not been compiled with " "'options HWPMC_HOOKS'.\n"); else printf("hwpmc: kernel version (0x%x) does not match " "module version (0x%x).\n", pmc_kernel_version, PMC_VERSION); return EPROGMISMATCH; } /* * check sysctl parameters */ if (pmc_hashsize <= 0) { (void) printf("hwpmc: tunable \"hashsize\"=%d must be " "greater than zero.\n", pmc_hashsize); pmc_hashsize = PMC_HASH_SIZE; } if (pmc_nsamples <= 0 || pmc_nsamples > 65535) { (void) printf("hwpmc: tunable \"nsamples\"=%d out of " "range.\n", pmc_nsamples); pmc_nsamples = PMC_NSAMPLES; } pmc_sample_mask = pmc_nsamples-1; if (pmc_callchaindepth <= 0 || pmc_callchaindepth > PMC_CALLCHAIN_DEPTH_MAX) { (void) printf("hwpmc: tunable \"callchaindepth\"=%d out of " "range - using %d.\n", pmc_callchaindepth, PMC_CALLCHAIN_DEPTH_MAX); pmc_callchaindepth = PMC_CALLCHAIN_DEPTH_MAX; } md = pmc_md_initialize(); if (md == NULL) { /* Default to generic CPU. */ md = pmc_generic_cpu_initialize(); if (md == NULL) return (ENOSYS); } /* * Refresh classes base ri. Optional classes may come in different * order. */ for (ri = c = 0; c < md->pmd_nclass; c++) { pcd = &md->pmd_classdep[c]; pcd->pcd_ri = ri; ri += pcd->pcd_num; } KASSERT(md->pmd_nclass >= 1 && md->pmd_npmc >= 1, ("[pmc,%d] no classes or pmcs", __LINE__)); /* Compute the map from row-indices to classdep pointers. */ pmc_rowindex_to_classdep = malloc(sizeof(struct pmc_classdep *) * md->pmd_npmc, M_PMC, M_WAITOK|M_ZERO); for (n = 0; n < md->pmd_npmc; n++) pmc_rowindex_to_classdep[n] = NULL; for (ri = c = 0; c < md->pmd_nclass; c++) { pcd = &md->pmd_classdep[c]; for (n = 0; n < pcd->pcd_num; n++, ri++) pmc_rowindex_to_classdep[ri] = pcd; } KASSERT(ri == md->pmd_npmc, ("[pmc,%d] npmc miscomputed: ri=%d, md->npmc=%d", __LINE__, ri, md->pmd_npmc)); maxcpu = pmc_cpu_max(); /* allocate space for the per-cpu array */ pmc_pcpu = malloc(maxcpu * sizeof(struct pmc_cpu *), M_PMC, M_WAITOK|M_ZERO); /* per-cpu 'saved values' for managing process-mode PMCs */ pmc_pcpu_saved = malloc(sizeof(pmc_value_t) * maxcpu * md->pmd_npmc, M_PMC, M_WAITOK); /* Perform CPU-dependent initialization. */ pmc_save_cpu_binding(&pb); error = 0; for (cpu = 0; error == 0 && cpu < maxcpu; cpu++) { if (!pmc_cpu_is_active(cpu)) continue; pmc_select_cpu(cpu); pmc_pcpu[cpu] = malloc(sizeof(struct pmc_cpu) + md->pmd_npmc * sizeof(struct pmc_hw *), M_PMC, M_WAITOK|M_ZERO); - if (md->pmd_pcpu_init) - error = md->pmd_pcpu_init(md, cpu); for (n = 0; error == 0 && n < md->pmd_nclass; n++) if (md->pmd_classdep[n].pcd_num > 0) error = md->pmd_classdep[n].pcd_pcpu_init(md, cpu); } pmc_restore_cpu_binding(&pb); if (error) return (error); /* allocate space for the sample array */ for (cpu = 0; cpu < maxcpu; cpu++) { if (!pmc_cpu_is_active(cpu)) continue; pc = pcpu_find(cpu); domain = pc->pc_domain; sb = malloc_domainset(sizeof(struct pmc_samplebuffer) + pmc_nsamples * sizeof(struct pmc_sample), M_PMC, DOMAINSET_PREF(domain), M_WAITOK | M_ZERO); KASSERT(pmc_pcpu[cpu] != NULL, ("[pmc,%d] cpu=%d Null per-cpu data", __LINE__, cpu)); sb->ps_callchains = malloc_domainset(pmc_callchaindepth * pmc_nsamples * sizeof(uintptr_t), M_PMC, DOMAINSET_PREF(domain), M_WAITOK | M_ZERO); for (n = 0, ps = sb->ps_samples; n < pmc_nsamples; n++, ps++) ps->ps_pc = sb->ps_callchains + (n * pmc_callchaindepth); pmc_pcpu[cpu]->pc_sb[PMC_HR] = sb; sb = malloc_domainset(sizeof(struct pmc_samplebuffer) + pmc_nsamples * sizeof(struct pmc_sample), M_PMC, DOMAINSET_PREF(domain), M_WAITOK | M_ZERO); sb->ps_callchains = malloc_domainset(pmc_callchaindepth * pmc_nsamples * sizeof(uintptr_t), M_PMC, DOMAINSET_PREF(domain), M_WAITOK | M_ZERO); for (n = 0, ps = sb->ps_samples; n < pmc_nsamples; n++, ps++) ps->ps_pc = sb->ps_callchains + (n * pmc_callchaindepth); pmc_pcpu[cpu]->pc_sb[PMC_SR] = sb; sb = malloc_domainset(sizeof(struct pmc_samplebuffer) + pmc_nsamples * sizeof(struct pmc_sample), M_PMC, DOMAINSET_PREF(domain), M_WAITOK | M_ZERO); sb->ps_callchains = malloc_domainset(pmc_callchaindepth * pmc_nsamples * sizeof(uintptr_t), M_PMC, DOMAINSET_PREF(domain), M_WAITOK | M_ZERO); for (n = 0, ps = sb->ps_samples; n < pmc_nsamples; n++, ps++) ps->ps_pc = sb->ps_callchains + n * pmc_callchaindepth; pmc_pcpu[cpu]->pc_sb[PMC_UR] = sb; } /* allocate space for the row disposition array */ pmc_pmcdisp = malloc(sizeof(enum pmc_mode) * md->pmd_npmc, M_PMC, M_WAITOK|M_ZERO); /* mark all PMCs as available */ for (n = 0; n < (int) md->pmd_npmc; n++) PMC_MARK_ROW_FREE(n); /* allocate thread hash tables */ pmc_ownerhash = hashinit(pmc_hashsize, M_PMC, &pmc_ownerhashmask); pmc_processhash = hashinit(pmc_hashsize, M_PMC, &pmc_processhashmask); mtx_init(&pmc_processhash_mtx, "pmc-process-hash", "pmc-leaf", MTX_SPIN); CK_LIST_INIT(&pmc_ss_owners); pmc_ss_count = 0; /* allocate a pool of spin mutexes */ pmc_mtxpool = mtx_pool_create("pmc-leaf", pmc_mtxpool_size, MTX_SPIN); PMCDBG4(MOD,INI,1, "pmc_ownerhash=%p, mask=0x%lx " "targethash=%p mask=0x%lx", pmc_ownerhash, pmc_ownerhashmask, pmc_processhash, pmc_processhashmask); /* Initialize a spin mutex for the thread free list. */ mtx_init(&pmc_threadfreelist_mtx, "pmc-threadfreelist", "pmc-leaf", MTX_SPIN); /* Initialize the task to prune the thread free list. */ TASK_INIT(&free_task, 0, pmc_thread_descriptor_pool_free_task, NULL); /* register process {exit,fork,exec} handlers */ pmc_exit_tag = EVENTHANDLER_REGISTER(process_exit, pmc_process_exit, NULL, EVENTHANDLER_PRI_ANY); pmc_fork_tag = EVENTHANDLER_REGISTER(process_fork, pmc_process_fork, NULL, EVENTHANDLER_PRI_ANY); /* register kld event handlers */ pmc_kld_load_tag = EVENTHANDLER_REGISTER(kld_load, pmc_kld_load, NULL, EVENTHANDLER_PRI_ANY); pmc_kld_unload_tag = EVENTHANDLER_REGISTER(kld_unload, pmc_kld_unload, NULL, EVENTHANDLER_PRI_ANY); /* initialize logging */ pmclog_initialize(); /* set hook functions */ pmc_intr = md->pmd_intr; wmb(); pmc_hook = pmc_hook_handler; if (error == 0) { printf(PMC_MODULE_NAME ":"); for (n = 0; n < (int) md->pmd_nclass; n++) { if (md->pmd_classdep[n].pcd_num == 0) continue; pcd = &md->pmd_classdep[n]; printf(" %s/%d/%d/0x%b", pmc_name_of_pmcclass(pcd->pcd_class), pcd->pcd_num, pcd->pcd_width, pcd->pcd_caps, "\20" "\1INT\2USR\3SYS\4EDG\5THR" "\6REA\7WRI\10INV\11QUA\12PRC" "\13TAG\14CSC"); } printf("\n"); } return (error); } /* prepare to be unloaded */ static void pmc_cleanup(void) { int c, cpu; unsigned int maxcpu; struct pmc_ownerhash *ph; struct pmc_owner *po, *tmp; struct pmc_binding pb; #ifdef HWPMC_DEBUG struct pmc_processhash *prh; #endif PMCDBG0(MOD,INI,0, "cleanup"); /* switch off sampling */ CPU_FOREACH(cpu) DPCPU_ID_SET(cpu, pmc_sampled, 0); pmc_intr = NULL; sx_xlock(&pmc_sx); if (pmc_hook == NULL) { /* being unloaded already */ sx_xunlock(&pmc_sx); return; } pmc_hook = NULL; /* prevent new threads from entering module */ /* deregister event handlers */ EVENTHANDLER_DEREGISTER(process_fork, pmc_fork_tag); EVENTHANDLER_DEREGISTER(process_exit, pmc_exit_tag); EVENTHANDLER_DEREGISTER(kld_load, pmc_kld_load_tag); EVENTHANDLER_DEREGISTER(kld_unload, pmc_kld_unload_tag); /* send SIGBUS to all owner threads, free up allocations */ if (pmc_ownerhash) for (ph = pmc_ownerhash; ph <= &pmc_ownerhash[pmc_ownerhashmask]; ph++) { LIST_FOREACH_SAFE(po, ph, po_next, tmp) { pmc_remove_owner(po); /* send SIGBUS to owner processes */ PMCDBG3(MOD,INI,2, "cleanup signal proc=%p " "(%d, %s)", po->po_owner, po->po_owner->p_pid, po->po_owner->p_comm); PROC_LOCK(po->po_owner); kern_psignal(po->po_owner, SIGBUS); PROC_UNLOCK(po->po_owner); pmc_destroy_owner_descriptor(po); } } /* reclaim allocated data structures */ taskqueue_drain(taskqueue_fast, &free_task); mtx_destroy(&pmc_threadfreelist_mtx); pmc_thread_descriptor_pool_drain(); if (pmc_mtxpool) mtx_pool_destroy(&pmc_mtxpool); mtx_destroy(&pmc_processhash_mtx); if (pmc_processhash) { #ifdef HWPMC_DEBUG struct pmc_process *pp; PMCDBG0(MOD,INI,3, "destroy process hash"); for (prh = pmc_processhash; prh <= &pmc_processhash[pmc_processhashmask]; prh++) LIST_FOREACH(pp, prh, pp_next) PMCDBG1(MOD,INI,3, "pid=%d", pp->pp_proc->p_pid); #endif hashdestroy(pmc_processhash, M_PMC, pmc_processhashmask); pmc_processhash = NULL; } if (pmc_ownerhash) { PMCDBG0(MOD,INI,3, "destroy owner hash"); hashdestroy(pmc_ownerhash, M_PMC, pmc_ownerhashmask); pmc_ownerhash = NULL; } KASSERT(CK_LIST_EMPTY(&pmc_ss_owners), ("[pmc,%d] Global SS owner list not empty", __LINE__)); KASSERT(pmc_ss_count == 0, ("[pmc,%d] Global SS count not empty", __LINE__)); /* do processor and pmc-class dependent cleanup */ maxcpu = pmc_cpu_max(); PMCDBG0(MOD,INI,3, "md cleanup"); if (md) { pmc_save_cpu_binding(&pb); for (cpu = 0; cpu < maxcpu; cpu++) { PMCDBG2(MOD,INI,1,"pmc-cleanup cpu=%d pcs=%p", cpu, pmc_pcpu[cpu]); if (!pmc_cpu_is_active(cpu) || pmc_pcpu[cpu] == NULL) continue; pmc_select_cpu(cpu); for (c = 0; c < md->pmd_nclass; c++) if (md->pmd_classdep[c].pcd_num > 0) md->pmd_classdep[c].pcd_pcpu_fini(md, cpu); - if (md->pmd_pcpu_fini) - md->pmd_pcpu_fini(md, cpu); } if (md->pmd_cputype == PMC_CPU_GENERIC) pmc_generic_cpu_finalize(md); else pmc_md_finalize(md); pmc_mdep_free(md); md = NULL; pmc_restore_cpu_binding(&pb); } /* Free per-cpu descriptors. */ for (cpu = 0; cpu < maxcpu; cpu++) { if (!pmc_cpu_is_active(cpu)) continue; KASSERT(pmc_pcpu[cpu]->pc_sb[PMC_HR] != NULL, ("[pmc,%d] Null hw cpu sample buffer cpu=%d", __LINE__, cpu)); KASSERT(pmc_pcpu[cpu]->pc_sb[PMC_SR] != NULL, ("[pmc,%d] Null sw cpu sample buffer cpu=%d", __LINE__, cpu)); KASSERT(pmc_pcpu[cpu]->pc_sb[PMC_UR] != NULL, ("[pmc,%d] Null userret cpu sample buffer cpu=%d", __LINE__, cpu)); free(pmc_pcpu[cpu]->pc_sb[PMC_HR]->ps_callchains, M_PMC); free(pmc_pcpu[cpu]->pc_sb[PMC_HR], M_PMC); free(pmc_pcpu[cpu]->pc_sb[PMC_SR]->ps_callchains, M_PMC); free(pmc_pcpu[cpu]->pc_sb[PMC_SR], M_PMC); free(pmc_pcpu[cpu]->pc_sb[PMC_UR]->ps_callchains, M_PMC); free(pmc_pcpu[cpu]->pc_sb[PMC_UR], M_PMC); free(pmc_pcpu[cpu], M_PMC); } free(pmc_pcpu, M_PMC); pmc_pcpu = NULL; free(pmc_pcpu_saved, M_PMC); pmc_pcpu_saved = NULL; if (pmc_pmcdisp) { free(pmc_pmcdisp, M_PMC); pmc_pmcdisp = NULL; } if (pmc_rowindex_to_classdep) { free(pmc_rowindex_to_classdep, M_PMC); pmc_rowindex_to_classdep = NULL; } pmclog_shutdown(); counter_u64_free(pmc_stats.pm_intr_ignored); counter_u64_free(pmc_stats.pm_intr_processed); counter_u64_free(pmc_stats.pm_intr_bufferfull); counter_u64_free(pmc_stats.pm_syscalls); counter_u64_free(pmc_stats.pm_syscall_errors); counter_u64_free(pmc_stats.pm_buffer_requests); counter_u64_free(pmc_stats.pm_buffer_requests_failed); counter_u64_free(pmc_stats.pm_log_sweeps); counter_u64_free(pmc_stats.pm_merges); counter_u64_free(pmc_stats.pm_overwrites); sx_xunlock(&pmc_sx); /* we are done */ } /* * The function called at load/unload. */ static int load (struct module *module __unused, int cmd, void *arg __unused) { int error; error = 0; switch (cmd) { case MOD_LOAD : /* initialize the subsystem */ error = pmc_initialize(); if (error != 0) break; PMCDBG2(MOD,INI,1, "syscall=%d maxcpu=%d", pmc_syscall_num, pmc_cpu_max()); break; case MOD_UNLOAD : case MOD_SHUTDOWN: pmc_cleanup(); PMCDBG0(MOD,INI,1, "unloaded"); break; default : error = EINVAL; /* XXX should panic(9) */ break; } return error; } diff --git a/sys/sys/pmc.h b/sys/sys/pmc.h index 558fe4b20d39..3a6052594fda 100644 --- a/sys/sys/pmc.h +++ b/sys/sys/pmc.h @@ -1,1223 +1,1219 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2003-2008, Joseph Koshy * Copyright (c) 2007 The FreeBSD Foundation * All rights reserved. * * Portions of this software were developed by A. Joseph Koshy under * sponsorship from the FreeBSD Foundation and Google, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _SYS_PMC_H_ #define _SYS_PMC_H_ #include #include #include #include #include #ifdef _KERNEL #include #include #endif #define PMC_MODULE_NAME "hwpmc" #define PMC_NAME_MAX 64 /* HW counter name size */ #define PMC_CLASS_MAX 8 /* max #classes of PMCs per-system */ /* * Kernel<->userland API version number [MMmmpppp] * * Major numbers are to be incremented when an incompatible change to * the ABI occurs that older clients will not be able to handle. * * Minor numbers are incremented when a backwards compatible change * occurs that allows older correct programs to run unchanged. For * example, when support for a new PMC type is added. * * The patch version is incremented for every bug fix. */ #define PMC_VERSION_MAJOR 0x09 #define PMC_VERSION_MINOR 0x03 #define PMC_VERSION_PATCH 0x0000 #define PMC_VERSION (PMC_VERSION_MAJOR << 24 | \ PMC_VERSION_MINOR << 16 | PMC_VERSION_PATCH) #define PMC_CPUID_LEN 64 /* cpu model name for pmu lookup */ extern char pmc_cpuid[PMC_CPUID_LEN]; /* * Kinds of CPUs known. * * We keep track of CPU variants that need to be distinguished in * some way for PMC operations. CPU names are grouped by manufacturer * and numbered sparsely in order to minimize changes to the ABI involved * when new CPUs are added. * * Please keep the pmc(3) manual page in sync with this list. */ #define __PMC_CPUS() \ __PMC_CPU(AMD_K7, 0x00, "AMD K7") \ __PMC_CPU(AMD_K8, 0x01, "AMD K8") \ __PMC_CPU(INTEL_CORE, 0x87, "Intel Core Solo/Duo") \ __PMC_CPU(INTEL_CORE2, 0x88, "Intel Core2") \ __PMC_CPU(INTEL_CORE2EXTREME, 0x89, "Intel Core2 Extreme") \ __PMC_CPU(INTEL_ATOM, 0x8A, "Intel Atom") \ __PMC_CPU(INTEL_COREI7, 0x8B, "Intel Core i7") \ __PMC_CPU(INTEL_WESTMERE, 0x8C, "Intel Westmere") \ __PMC_CPU(INTEL_SANDYBRIDGE, 0x8D, "Intel Sandy Bridge") \ __PMC_CPU(INTEL_IVYBRIDGE, 0x8E, "Intel Ivy Bridge") \ __PMC_CPU(INTEL_SANDYBRIDGE_XEON, 0x8F, "Intel Sandy Bridge Xeon") \ __PMC_CPU(INTEL_IVYBRIDGE_XEON, 0x90, "Intel Ivy Bridge Xeon") \ __PMC_CPU(INTEL_HASWELL, 0x91, "Intel Haswell") \ __PMC_CPU(INTEL_ATOM_SILVERMONT, 0x92, "Intel Atom Silvermont") \ __PMC_CPU(INTEL_NEHALEM_EX, 0x93, "Intel Nehalem Xeon 7500") \ __PMC_CPU(INTEL_WESTMERE_EX, 0x94, "Intel Westmere Xeon E7") \ __PMC_CPU(INTEL_HASWELL_XEON, 0x95, "Intel Haswell Xeon E5 v3") \ __PMC_CPU(INTEL_BROADWELL, 0x96, "Intel Broadwell") \ __PMC_CPU(INTEL_BROADWELL_XEON, 0x97, "Intel Broadwell Xeon") \ __PMC_CPU(INTEL_SKYLAKE, 0x98, "Intel Skylake") \ __PMC_CPU(INTEL_SKYLAKE_XEON, 0x99, "Intel Skylake Xeon") \ __PMC_CPU(INTEL_ATOM_GOLDMONT, 0x9A, "Intel Atom Goldmont") \ __PMC_CPU(INTEL_ICELAKE, 0x9B, "Intel Icelake") \ __PMC_CPU(INTEL_ICELAKE_XEON, 0x9C, "Intel Icelake Xeon") \ __PMC_CPU(INTEL_ALDERLAKE, 0x9D, "Intel Alderlake") \ __PMC_CPU(INTEL_ATOM_GOLDMONT_P, 0x9E, "Intel Atom Goldmont Plus") \ __PMC_CPU(INTEL_ATOM_TREMONT, 0x9F, "Intel Atom Tremont") \ __PMC_CPU(INTEL_XSCALE, 0x100, "Intel XScale") \ __PMC_CPU(PPC_7450, 0x300, "PowerPC MPC7450") \ __PMC_CPU(PPC_E500, 0x340, "PowerPC e500 Core") \ __PMC_CPU(PPC_970, 0x380, "IBM PowerPC 970") \ __PMC_CPU(PPC_POWER8, 0x390, "IBM POWER8") \ __PMC_CPU(GENERIC, 0x400, "Generic") \ __PMC_CPU(ARMV7_CORTEX_A5, 0x500, "ARMv7 Cortex A5") \ __PMC_CPU(ARMV7_CORTEX_A7, 0x501, "ARMv7 Cortex A7") \ __PMC_CPU(ARMV7_CORTEX_A8, 0x502, "ARMv7 Cortex A8") \ __PMC_CPU(ARMV7_CORTEX_A9, 0x503, "ARMv7 Cortex A9") \ __PMC_CPU(ARMV7_CORTEX_A15, 0x504, "ARMv7 Cortex A15") \ __PMC_CPU(ARMV7_CORTEX_A17, 0x505, "ARMv7 Cortex A17") \ __PMC_CPU(ARMV8_CORTEX_A53, 0x600, "ARMv8 Cortex A53") \ __PMC_CPU(ARMV8_CORTEX_A57, 0x601, "ARMv8 Cortex A57") \ __PMC_CPU(ARMV8_CORTEX_A76, 0x602, "ARMv8 Cortex A76") enum pmc_cputype { #undef __PMC_CPU #define __PMC_CPU(S,V,D) PMC_CPU_##S = V, __PMC_CPUS() }; #define PMC_CPU_FIRST PMC_CPU_AMD_K7 #define PMC_CPU_LAST PMC_CPU_ARMV8_CORTEX_A76 /* * Classes of PMCs */ #define __PMC_CLASSES() \ __PMC_CLASS(TSC, 0x00, "CPU Timestamp counter") \ __PMC_CLASS(K7, 0x01, "AMD K7 performance counters") \ __PMC_CLASS(K8, 0x02, "AMD K8 performance counters") \ __PMC_CLASS(IAF, 0x06, "Intel Core2/Atom, fixed function") \ __PMC_CLASS(IAP, 0x07, "Intel Core...Atom, programmable") \ __PMC_CLASS(UCF, 0x08, "Intel Uncore fixed function") \ __PMC_CLASS(UCP, 0x09, "Intel Uncore programmable") \ __PMC_CLASS(XSCALE, 0x0A, "Intel XScale counters") \ __PMC_CLASS(PPC7450, 0x0D, "Motorola MPC7450 class") \ __PMC_CLASS(PPC970, 0x0E, "IBM PowerPC 970 class") \ __PMC_CLASS(SOFT, 0x0F, "Software events") \ __PMC_CLASS(ARMV7, 0x10, "ARMv7") \ __PMC_CLASS(ARMV8, 0x11, "ARMv8") \ __PMC_CLASS(E500, 0x13, "Freescale e500 class") \ __PMC_CLASS(POWER8, 0x15, "IBM POWER8 class") \ __PMC_CLASS(DMC620_PMU_CD2, 0x16, "ARM DMC620 Memory Controller PMU CLKDIV2") \ __PMC_CLASS(DMC620_PMU_C, 0x17, "ARM DMC620 Memory Controller PMU CLK") \ __PMC_CLASS(CMN600_PMU, 0x18, "Arm CoreLink CMN600 Coherent Mesh Network PMU") enum pmc_class { #undef __PMC_CLASS #define __PMC_CLASS(S,V,D) PMC_CLASS_##S = V, __PMC_CLASSES() }; #define PMC_CLASS_FIRST PMC_CLASS_TSC #define PMC_CLASS_LAST PMC_CLASS_CMN600_PMU /* * A PMC can be in the following states: * * Hardware states: * DISABLED -- administratively prohibited from being used. * FREE -- HW available for use * Software states: * ALLOCATED -- allocated * STOPPED -- allocated, but not counting events * RUNNING -- allocated, and in operation; 'pm_runcount' * holds the number of CPUs using this PMC at * a given instant * DELETED -- being destroyed */ #define __PMC_HWSTATES() \ __PMC_STATE(DISABLED) \ __PMC_STATE(FREE) #define __PMC_SWSTATES() \ __PMC_STATE(ALLOCATED) \ __PMC_STATE(STOPPED) \ __PMC_STATE(RUNNING) \ __PMC_STATE(DELETED) #define __PMC_STATES() \ __PMC_HWSTATES() \ __PMC_SWSTATES() enum pmc_state { #undef __PMC_STATE #define __PMC_STATE(S) PMC_STATE_##S, __PMC_STATES() __PMC_STATE(MAX) }; #define PMC_STATE_FIRST PMC_STATE_DISABLED #define PMC_STATE_LAST PMC_STATE_DELETED /* * An allocated PMC may used as a 'global' counter or as a * 'thread-private' one. Each such mode of use can be in either * statistical sampling mode or in counting mode. Thus a PMC in use * * SS i.e., SYSTEM STATISTICAL -- system-wide statistical profiling * SC i.e., SYSTEM COUNTER -- system-wide counting mode * TS i.e., THREAD STATISTICAL -- thread virtual, statistical profiling * TC i.e., THREAD COUNTER -- thread virtual, counting mode * * Statistical profiling modes rely on the PMC periodically delivering * a interrupt to the CPU (when the configured number of events have * been measured), so the PMC must have the ability to generate * interrupts. * * In counting modes, the PMC counts its configured events, with the * value of the PMC being read whenever needed by its owner process. * * The thread specific modes "virtualize" the PMCs -- the PMCs appear * to be thread private and count events only when the profiled thread * actually executes on the CPU. * * The system-wide "global" modes keep the PMCs running all the time * and are used to measure the behaviour of the whole system. */ #define __PMC_MODES() \ __PMC_MODE(SS, 0) \ __PMC_MODE(SC, 1) \ __PMC_MODE(TS, 2) \ __PMC_MODE(TC, 3) enum pmc_mode { #undef __PMC_MODE #define __PMC_MODE(M,N) PMC_MODE_##M = N, __PMC_MODES() }; #define PMC_MODE_FIRST PMC_MODE_SS #define PMC_MODE_LAST PMC_MODE_TC #define PMC_IS_COUNTING_MODE(mode) \ ((mode) == PMC_MODE_SC || (mode) == PMC_MODE_TC) #define PMC_IS_SYSTEM_MODE(mode) \ ((mode) == PMC_MODE_SS || (mode) == PMC_MODE_SC) #define PMC_IS_SAMPLING_MODE(mode) \ ((mode) == PMC_MODE_SS || (mode) == PMC_MODE_TS) #define PMC_IS_VIRTUAL_MODE(mode) \ ((mode) == PMC_MODE_TS || (mode) == PMC_MODE_TC) /* * PMC row disposition */ #define __PMC_DISPOSITIONS(N) \ __PMC_DISP(STANDALONE) /* global/disabled counters */ \ __PMC_DISP(FREE) /* free/available */ \ __PMC_DISP(THREAD) /* thread-virtual PMCs */ \ __PMC_DISP(UNKNOWN) /* sentinel */ enum pmc_disp { #undef __PMC_DISP #define __PMC_DISP(D) PMC_DISP_##D , __PMC_DISPOSITIONS() }; #define PMC_DISP_FIRST PMC_DISP_STANDALONE #define PMC_DISP_LAST PMC_DISP_THREAD /* * Counter capabilities * * __PMC_CAPS(NAME, VALUE, DESCRIPTION) */ #define __PMC_CAPS() \ __PMC_CAP(INTERRUPT, 0, "generate interrupts") \ __PMC_CAP(USER, 1, "count user-mode events") \ __PMC_CAP(SYSTEM, 2, "count system-mode events") \ __PMC_CAP(EDGE, 3, "do edge detection of events") \ __PMC_CAP(THRESHOLD, 4, "ignore events below a threshold") \ __PMC_CAP(READ, 5, "read PMC counter") \ __PMC_CAP(WRITE, 6, "reprogram PMC counter") \ __PMC_CAP(INVERT, 7, "invert comparison sense") \ __PMC_CAP(QUALIFIER, 8, "further qualify monitored events") \ __PMC_CAP(PRECISE, 9, "perform precise sampling") \ __PMC_CAP(TAGGING, 10, "tag upstream events") \ __PMC_CAP(CASCADE, 11, "cascade counters") \ __PMC_CAP(SYSWIDE, 12, "system wide counter") \ __PMC_CAP(DOMWIDE, 13, "NUMA domain wide counter") enum pmc_caps { #undef __PMC_CAP #define __PMC_CAP(NAME, VALUE, DESCR) PMC_CAP_##NAME = (1 << VALUE) , __PMC_CAPS() }; #define PMC_CAP_FIRST PMC_CAP_INTERRUPT #define PMC_CAP_LAST PMC_CAP_DOMWIDE /* * PMC Event Numbers * * These are generated from the definitions in "dev/hwpmc/pmc_events.h". */ enum pmc_event { #undef __PMC_EV #undef __PMC_EV_BLOCK #define __PMC_EV_BLOCK(C,V) PMC_EV_ ## C ## __BLOCK_START = (V) - 1 , #define __PMC_EV(C,N) PMC_EV_ ## C ## _ ## N , __PMC_EVENTS() }; /* * PMC SYSCALL INTERFACE */ /* * "PMC_OPS" -- these are the commands recognized by the kernel * module, and are used when performing a system call from userland. */ #define __PMC_OPS() \ __PMC_OP(CONFIGURELOG, "Set log file") \ __PMC_OP(FLUSHLOG, "Flush log file") \ __PMC_OP(GETCPUINFO, "Get system CPU information") \ __PMC_OP(GETDRIVERSTATS, "Get driver statistics") \ __PMC_OP(GETMODULEVERSION, "Get module version") \ __PMC_OP(GETPMCINFO, "Get per-cpu PMC information") \ __PMC_OP(PMCADMIN, "Set PMC state") \ __PMC_OP(PMCALLOCATE, "Allocate and configure a PMC") \ __PMC_OP(PMCATTACH, "Attach a PMC to a process") \ __PMC_OP(PMCDETACH, "Detach a PMC from a process") \ __PMC_OP(PMCGETMSR, "Get a PMC's hardware address") \ __PMC_OP(PMCRELEASE, "Release a PMC") \ __PMC_OP(PMCRW, "Read/Set a PMC") \ __PMC_OP(PMCSETCOUNT, "Set initial count/sampling rate") \ __PMC_OP(PMCSTART, "Start a PMC") \ __PMC_OP(PMCSTOP, "Stop a PMC") \ __PMC_OP(WRITELOG, "Write a cookie to the log file") \ __PMC_OP(CLOSELOG, "Close log file") \ __PMC_OP(GETDYNEVENTINFO, "Get dynamic events list") enum pmc_ops { #undef __PMC_OP #define __PMC_OP(N, D) PMC_OP_##N, __PMC_OPS() }; /* * Flags used in operations on PMCs. */ #define PMC_F_UNUSED1 0x00000001 /* unused */ #define PMC_F_DESCENDANTS 0x00000002 /*OP ALLOCATE track descendants */ #define PMC_F_LOG_PROCCSW 0x00000004 /*OP ALLOCATE track ctx switches */ #define PMC_F_LOG_PROCEXIT 0x00000008 /*OP ALLOCATE log proc exits */ #define PMC_F_NEWVALUE 0x00000010 /*OP RW write new value */ #define PMC_F_OLDVALUE 0x00000020 /*OP RW get old value */ /* V2 API */ #define PMC_F_CALLCHAIN 0x00000080 /*OP ALLOCATE capture callchains */ #define PMC_F_USERCALLCHAIN 0x00000100 /*OP ALLOCATE use userspace stack */ /* internal flags */ #define PMC_F_ATTACHED_TO_OWNER 0x00010000 /*attached to owner*/ #define PMC_F_NEEDS_LOGFILE 0x00020000 /*needs log file */ #define PMC_F_ATTACH_DONE 0x00040000 /*attached at least once */ #define PMC_CALLCHAIN_DEPTH_MAX 512 #define PMC_CC_F_USERSPACE 0x01 /*userspace callchain*/ /* * Cookies used to denote allocated PMCs, and the values of PMCs. */ typedef uint32_t pmc_id_t; typedef uint64_t pmc_value_t; #define PMC_ID_INVALID (~ (pmc_id_t) 0) /* * PMC IDs have the following format: * * +-----------------------+-------+-----------+ * | CPU | PMC MODE | CLASS | ROW INDEX | * +-----------------------+-------+-----------+ * * where CPU is 12 bits, MODE 4, CLASS 8, and ROW INDEX 8 Field 'CPU' * is set to the requested CPU for system-wide PMCs or PMC_CPU_ANY for * process-mode PMCs. Field 'PMC MODE' is the allocated PMC mode. * Field 'PMC CLASS' is the class of the PMC. Field 'ROW INDEX' is the * row index for the PMC. * * The 'ROW INDEX' ranges over 0..NWPMCS where NHWPMCS is the total * number of hardware PMCs on this cpu. */ #define PMC_ID_TO_ROWINDEX(ID) ((ID) & 0xFF) #define PMC_ID_TO_CLASS(ID) (((ID) & 0xFF00) >> 8) #define PMC_ID_TO_MODE(ID) (((ID) & 0xF0000) >> 16) #define PMC_ID_TO_CPU(ID) (((ID) & 0xFFF00000) >> 20) #define PMC_ID_MAKE_ID(CPU,MODE,CLASS,ROWINDEX) \ ((((CPU) & 0xFFF) << 20) | (((MODE) & 0xF) << 16) | \ (((CLASS) & 0xFF) << 8) | ((ROWINDEX) & 0xFF)) /* * Data structures for system calls supported by the pmc driver. */ /* * OP PMCALLOCATE * * Allocate a PMC on the named CPU. */ #define PMC_CPU_ANY ~0 struct pmc_op_pmcallocate { uint32_t pm_caps; /* PMC_CAP_* */ uint32_t pm_cpu; /* CPU number or PMC_CPU_ANY */ enum pmc_class pm_class; /* class of PMC desired */ enum pmc_event pm_ev; /* [enum pmc_event] desired */ uint32_t pm_flags; /* additional modifiers PMC_F_* */ enum pmc_mode pm_mode; /* desired mode */ pmc_id_t pm_pmcid; /* [return] process pmc id */ pmc_value_t pm_count; /* initial/sample count */ union pmc_md_op_pmcallocate pm_md; /* MD layer extensions */ }; /* * OP PMCADMIN * * Set the administrative state (i.e., whether enabled or disabled) of * a PMC 'pm_pmc' on CPU 'pm_cpu'. Note that 'pm_pmc' specifies an * absolute PMC number and need not have been first allocated by the * calling process. */ struct pmc_op_pmcadmin { int pm_cpu; /* CPU# */ uint32_t pm_flags; /* flags */ int pm_pmc; /* PMC# */ enum pmc_state pm_state; /* desired state */ }; /* * OP PMCATTACH / OP PMCDETACH * * Attach/detach a PMC and a process. */ struct pmc_op_pmcattach { pmc_id_t pm_pmc; /* PMC to attach to */ pid_t pm_pid; /* target process */ }; /* * OP PMCSETCOUNT * * Set the sampling rate (i.e., the reload count) for statistical counters. * 'pm_pmcid' need to have been previously allocated using PMCALLOCATE. */ struct pmc_op_pmcsetcount { pmc_value_t pm_count; /* initial/sample count */ pmc_id_t pm_pmcid; /* PMC id to set */ }; /* * OP PMCRW * * Read the value of a PMC named by 'pm_pmcid'. 'pm_pmcid' needs * to have been previously allocated using PMCALLOCATE. */ struct pmc_op_pmcrw { uint32_t pm_flags; /* PMC_F_{OLD,NEW}VALUE*/ pmc_id_t pm_pmcid; /* pmc id */ pmc_value_t pm_value; /* new&returned value */ }; /* * OP GETPMCINFO * * retrieve PMC state for a named CPU. The caller is expected to * allocate 'npmc' * 'struct pmc_info' bytes of space for the return * values. */ struct pmc_info { char pm_name[PMC_NAME_MAX]; /* pmc name */ enum pmc_class pm_class; /* enum pmc_class */ int pm_enabled; /* whether enabled */ enum pmc_disp pm_rowdisp; /* FREE, THREAD or STANDLONE */ pid_t pm_ownerpid; /* owner, or -1 */ enum pmc_mode pm_mode; /* current mode [enum pmc_mode] */ enum pmc_event pm_event; /* current event */ uint32_t pm_flags; /* current flags */ pmc_value_t pm_reloadcount; /* sampling counters only */ }; struct pmc_op_getpmcinfo { int32_t pm_cpu; /* 0 <= cpu < mp_maxid */ struct pmc_info pm_pmcs[]; /* space for 'npmc' structures */ }; /* * OP GETCPUINFO * * Retrieve system CPU information. */ struct pmc_classinfo { enum pmc_class pm_class; /* class id */ uint32_t pm_caps; /* counter capabilities */ uint32_t pm_width; /* width of the PMC */ uint32_t pm_num; /* number of PMCs in class */ }; struct pmc_op_getcpuinfo { enum pmc_cputype pm_cputype; /* what kind of CPU */ uint32_t pm_ncpu; /* max CPU number */ uint32_t pm_npmc; /* #PMCs per CPU */ uint32_t pm_nclass; /* #classes of PMCs */ struct pmc_classinfo pm_classes[PMC_CLASS_MAX]; }; /* * OP CONFIGURELOG * * Configure a log file for writing system-wide statistics to. */ struct pmc_op_configurelog { int pm_flags; int pm_logfd; /* logfile fd (or -1) */ }; /* * OP GETDRIVERSTATS * * Retrieve pmc(4) driver-wide statistics. */ #ifdef _KERNEL struct pmc_driverstats { counter_u64_t pm_intr_ignored; /* #interrupts ignored */ counter_u64_t pm_intr_processed; /* #interrupts processed */ counter_u64_t pm_intr_bufferfull; /* #interrupts with ENOSPC */ counter_u64_t pm_syscalls; /* #syscalls */ counter_u64_t pm_syscall_errors; /* #syscalls with errors */ counter_u64_t pm_buffer_requests; /* #buffer requests */ counter_u64_t pm_buffer_requests_failed; /* #failed buffer requests */ counter_u64_t pm_log_sweeps; /* #sample buffer processing passes */ counter_u64_t pm_merges; /* merged k+u */ counter_u64_t pm_overwrites; /* UR overwrites */ }; #endif struct pmc_op_getdriverstats { unsigned int pm_intr_ignored; /* #interrupts ignored */ unsigned int pm_intr_processed; /* #interrupts processed */ unsigned int pm_intr_bufferfull; /* #interrupts with ENOSPC */ unsigned int pm_syscalls; /* #syscalls */ unsigned int pm_syscall_errors; /* #syscalls with errors */ unsigned int pm_buffer_requests; /* #buffer requests */ unsigned int pm_buffer_requests_failed; /* #failed buffer requests */ unsigned int pm_log_sweeps; /* #sample buffer processing passes */ }; /* * OP RELEASE / OP START / OP STOP * * Simple operations on a PMC id. */ struct pmc_op_simple { pmc_id_t pm_pmcid; }; /* * OP WRITELOG * * Flush the current log buffer and write 4 bytes of user data to it. */ struct pmc_op_writelog { uint32_t pm_userdata; }; /* * OP GETMSR * * Retrieve the machine specific address associated with the allocated * PMC. This number can be used subsequently with a read-performance-counter * instruction. */ struct pmc_op_getmsr { uint32_t pm_msr; /* machine specific address */ pmc_id_t pm_pmcid; /* allocated pmc id */ }; /* * OP GETDYNEVENTINFO * * Retrieve a PMC dynamic class events list. */ struct pmc_dyn_event_descr { char pm_ev_name[PMC_NAME_MAX]; enum pmc_event pm_ev_code; }; struct pmc_op_getdyneventinfo { enum pmc_class pm_class; unsigned int pm_nevent; struct pmc_dyn_event_descr pm_events[PMC_EV_DYN_COUNT]; }; #ifdef _KERNEL #include #include #include #include #define PMC_HASH_SIZE 1024 #define PMC_MTXPOOL_SIZE 2048 #define PMC_LOG_BUFFER_SIZE 256 #define PMC_NLOGBUFFERS_PCPU 32 #define PMC_NSAMPLES 256 #define PMC_CALLCHAIN_DEPTH 128 #define PMC_THREADLIST_MAX 128 #define PMC_SYSCTL_NAME_PREFIX "kern." PMC_MODULE_NAME "." /* * Locking keys * * (b) - pmc_bufferlist_mtx (spin lock) * (k) - pmc_kthread_mtx (sleep lock) * (o) - po->po_mtx (spin lock) * (g) - global_epoch_preempt (epoch) * (p) - pmc_sx (sx) */ /* * PMC commands */ struct pmc_syscall_args { register_t pmop_code; /* one of PMC_OP_* */ void *pmop_data; /* syscall parameter */ }; /* * Interface to processor specific s1tuff */ /* * struct pmc_descr * * Machine independent (i.e., the common parts) of a human readable * PMC description. */ struct pmc_descr { char pd_name[PMC_NAME_MAX]; /* name */ uint32_t pd_caps; /* capabilities */ enum pmc_class pd_class; /* class of the PMC */ uint32_t pd_width; /* width in bits */ }; /* * struct pmc_target * * This structure records all the target processes associated with a * PMC. */ struct pmc_target { LIST_ENTRY(pmc_target) pt_next; struct pmc_process *pt_process; /* target descriptor */ }; /* * struct pmc * * Describes each allocated PMC. * * Each PMC has precisely one owner, namely the process that allocated * the PMC. * * A PMC may be attached to multiple target processes. The * 'pm_targets' field links all the target processes being monitored * by this PMC. * * The 'pm_savedvalue' field is protected by a mutex. * * On a multi-cpu machine, multiple target threads associated with a * process-virtual PMC could be concurrently executing on different * CPUs. The 'pm_runcount' field is atomically incremented every time * the PMC gets scheduled on a CPU and atomically decremented when it * get descheduled. Deletion of a PMC is only permitted when this * field is '0'. * */ struct pmc_pcpu_state { uint32_t pps_overflowcnt; /* count overflow interrupts */ uint8_t pps_stalled; uint8_t pps_cpustate; } __aligned(CACHE_LINE_SIZE); struct pmc { LIST_HEAD(,pmc_target) pm_targets; /* list of target processes */ LIST_ENTRY(pmc) pm_next; /* owner's list */ /* * System-wide PMCs are allocated on a CPU and are not moved * around. For system-wide PMCs we record the CPU the PMC was * allocated on in the 'CPU' field of the pmc ID. * * Virtual PMCs run on whichever CPU is currently executing * their targets' threads. For these PMCs we need to save * their current PMC counter values when they are taken off * CPU. */ union { pmc_value_t pm_savedvalue; /* Virtual PMCS */ } pm_gv; /* * For sampling mode PMCs, we keep track of the PMC's "reload * count", which is the counter value to be loaded in when * arming the PMC for the next counting session. For counting * modes on PMCs that are read-only (e.g., the x86 TSC), we * keep track of the initial value at the start of * counting-mode operation. */ union { pmc_value_t pm_reloadcount; /* sampling PMC modes */ pmc_value_t pm_initial; /* counting PMC modes */ } pm_sc; struct pmc_pcpu_state *pm_pcpu_state; volatile cpuset_t pm_cpustate; /* CPUs where PMC should be active */ uint32_t pm_caps; /* PMC capabilities */ enum pmc_event pm_event; /* event being measured */ uint32_t pm_flags; /* additional flags PMC_F_... */ struct pmc_owner *pm_owner; /* owner thread state */ counter_u64_t pm_runcount; /* #cpus currently on */ enum pmc_state pm_state; /* current PMC state */ /* * The PMC ID field encodes the row-index for the PMC, its * mode, class and the CPU# associated with the PMC. */ pmc_id_t pm_id; /* allocated PMC id */ enum pmc_class pm_class; /* md extensions */ union pmc_md_pmc pm_md; }; /* * Accessor macros for 'struct pmc' */ #define PMC_TO_MODE(P) PMC_ID_TO_MODE((P)->pm_id) #define PMC_TO_CLASS(P) PMC_ID_TO_CLASS((P)->pm_id) #define PMC_TO_ROWINDEX(P) PMC_ID_TO_ROWINDEX((P)->pm_id) #define PMC_TO_CPU(P) PMC_ID_TO_CPU((P)->pm_id) /* * struct pmc_threadpmcstate * * Record per-PMC, per-thread state. */ struct pmc_threadpmcstate { pmc_value_t pt_pmcval; /* per-thread reload count */ }; /* * struct pmc_thread * * Record a 'target' thread being profiled. */ struct pmc_thread { LIST_ENTRY(pmc_thread) pt_next; /* linked list */ struct thread *pt_td; /* target thread */ struct pmc_threadpmcstate pt_pmcs[]; /* per-PMC state */ }; /* * struct pmc_process * * Record a 'target' process being profiled. * * The target process being profiled could be different from the owner * process which allocated the PMCs. Each target process descriptor * is associated with NHWPMC 'struct pmc *' pointers. Each PMC at a * given hardware row-index 'n' will use slot 'n' of the 'pp_pmcs[]' * array. The size of this structure is thus PMC architecture * dependent. * */ struct pmc_targetstate { struct pmc *pp_pmc; /* target PMC */ pmc_value_t pp_pmcval; /* per-process value */ }; struct pmc_process { LIST_ENTRY(pmc_process) pp_next; /* hash chain */ LIST_HEAD(,pmc_thread) pp_tds; /* list of threads */ struct mtx *pp_tdslock; /* lock on pp_tds thread list */ int pp_refcnt; /* reference count */ uint32_t pp_flags; /* flags PMC_PP_* */ struct proc *pp_proc; /* target process */ struct pmc_targetstate pp_pmcs[]; /* NHWPMCs */ }; #define PMC_PP_ENABLE_MSR_ACCESS 0x00000001 /* * struct pmc_owner * * We associate a PMC with an 'owner' process. * * A process can be associated with 0..NCPUS*NHWPMC PMCs during its * lifetime, where NCPUS is the numbers of CPUS in the system and * NHWPMC is the number of hardware PMCs per CPU. These are * maintained in the list headed by the 'po_pmcs' to save on space. * */ struct pmc_owner { LIST_ENTRY(pmc_owner) po_next; /* hash chain */ CK_LIST_ENTRY(pmc_owner) po_ssnext; /* (g/p) list of SS PMC owners */ LIST_HEAD(, pmc) po_pmcs; /* owned PMC list */ TAILQ_HEAD(, pmclog_buffer) po_logbuffers; /* (o) logbuffer list */ struct mtx po_mtx; /* spin lock for (o) */ struct proc *po_owner; /* owner proc */ uint32_t po_flags; /* (k) flags PMC_PO_* */ struct proc *po_kthread; /* (k) helper kthread */ struct file *po_file; /* file reference */ int po_error; /* recorded error */ short po_sscount; /* # SS PMCs owned */ short po_logprocmaps; /* global mappings done */ struct pmclog_buffer *po_curbuf[MAXCPU]; /* current log buffer */ }; #define PMC_PO_OWNS_LOGFILE 0x00000001 /* has a log file */ #define PMC_PO_SHUTDOWN 0x00000010 /* in the process of shutdown */ #define PMC_PO_INITIAL_MAPPINGS_DONE 0x00000020 /* * struct pmc_hw -- describe the state of the PMC hardware * * When in use, a HW PMC is associated with one allocated 'struct pmc' * pointed to by field 'phw_pmc'. When inactive, this field is NULL. * * On an SMP box, one or more HW PMC's in process virtual mode with * the same 'phw_pmc' could be executing on different CPUs. In order * to handle this case correctly, we need to ensure that only * incremental counts get added to the saved value in the associated * 'struct pmc'. The 'phw_save' field is used to keep the saved PMC * value at the time the hardware is started during this context * switch (i.e., the difference between the new (hardware) count and * the saved count is atomically added to the count field in 'struct * pmc' at context switch time). * */ struct pmc_hw { uint32_t phw_state; /* see PHW_* macros below */ struct pmc *phw_pmc; /* current thread PMC */ }; #define PMC_PHW_RI_MASK 0x000000FF #define PMC_PHW_CPU_SHIFT 8 #define PMC_PHW_CPU_MASK 0x0000FF00 #define PMC_PHW_FLAGS_SHIFT 16 #define PMC_PHW_FLAGS_MASK 0xFFFF0000 #define PMC_PHW_INDEX_TO_STATE(ri) ((ri) & PMC_PHW_RI_MASK) #define PMC_PHW_STATE_TO_INDEX(state) ((state) & PMC_PHW_RI_MASK) #define PMC_PHW_CPU_TO_STATE(cpu) (((cpu) << PMC_PHW_CPU_SHIFT) & \ PMC_PHW_CPU_MASK) #define PMC_PHW_STATE_TO_CPU(state) (((state) & PMC_PHW_CPU_MASK) >> \ PMC_PHW_CPU_SHIFT) #define PMC_PHW_FLAGS_TO_STATE(flags) (((flags) << PMC_PHW_FLAGS_SHIFT) & \ PMC_PHW_FLAGS_MASK) #define PMC_PHW_STATE_TO_FLAGS(state) (((state) & PMC_PHW_FLAGS_MASK) >> \ PMC_PHW_FLAGS_SHIFT) #define PMC_PHW_FLAG_IS_ENABLED (PMC_PHW_FLAGS_TO_STATE(0x01)) #define PMC_PHW_FLAG_IS_SHAREABLE (PMC_PHW_FLAGS_TO_STATE(0x02)) /* * struct pmc_sample * * Space for N (tunable) PC samples and associated control data. */ struct pmc_sample { uint16_t ps_nsamples; /* callchain depth */ uint16_t ps_nsamples_actual; uint16_t ps_cpu; /* cpu number */ uint16_t ps_flags; /* other flags */ lwpid_t ps_tid; /* thread id */ pid_t ps_pid; /* process PID or -1 */ int ps_ticks; /* ticks at sample time */ /* pad */ struct thread *ps_td; /* which thread */ struct pmc *ps_pmc; /* interrupting PMC */ uintptr_t *ps_pc; /* (const) callchain start */ uint64_t ps_tsc; /* tsc value */ }; #define PMC_SAMPLE_FREE ((uint16_t) 0) #define PMC_USER_CALLCHAIN_PENDING ((uint16_t) 0xFFFF) struct pmc_samplebuffer { volatile uint64_t ps_prodidx; /* producer index */ volatile uint64_t ps_considx; /* consumer index */ uintptr_t *ps_callchains; /* all saved call chains */ struct pmc_sample ps_samples[]; /* array of sample entries */ }; #define PMC_CONS_SAMPLE(psb) \ (&(psb)->ps_samples[(psb)->ps_considx & pmc_sample_mask]) #define PMC_CONS_SAMPLE_OFF(psb, off) \ (&(psb)->ps_samples[(off) & pmc_sample_mask]) #define PMC_PROD_SAMPLE(psb) \ (&(psb)->ps_samples[(psb)->ps_prodidx & pmc_sample_mask]) /* * struct pmc_cpustate * * A CPU is modelled as a collection of HW PMCs with space for additional * flags. */ struct pmc_cpu { uint32_t pc_state; /* physical cpu number + flags */ struct pmc_samplebuffer *pc_sb[3]; /* space for samples */ struct pmc_hw *pc_hwpmcs[]; /* 'npmc' pointers */ }; #define PMC_PCPU_CPU_MASK 0x000000FF #define PMC_PCPU_FLAGS_MASK 0xFFFFFF00 #define PMC_PCPU_FLAGS_SHIFT 8 #define PMC_PCPU_STATE_TO_CPU(S) ((S) & PMC_PCPU_CPU_MASK) #define PMC_PCPU_STATE_TO_FLAGS(S) (((S) & PMC_PCPU_FLAGS_MASK) >> PMC_PCPU_FLAGS_SHIFT) #define PMC_PCPU_FLAGS_TO_STATE(F) (((F) << PMC_PCPU_FLAGS_SHIFT) & PMC_PCPU_FLAGS_MASK) #define PMC_PCPU_CPU_TO_STATE(C) ((C) & PMC_PCPU_CPU_MASK) #define PMC_PCPU_FLAG_HTT (PMC_PCPU_FLAGS_TO_STATE(0x1)) /* * struct pmc_binding * * CPU binding information. */ struct pmc_binding { int pb_bound; /* is bound? */ int pb_cpu; /* if so, to which CPU */ u_char pb_priority; /* Thread active priority. */ }; struct pmc_mdep; /* * struct pmc_classdep * * PMC class-dependent operations. */ struct pmc_classdep { uint32_t pcd_caps; /* class capabilities */ enum pmc_class pcd_class; /* class id */ int pcd_num; /* number of PMCs */ int pcd_ri; /* row index of the first PMC in class */ int pcd_width; /* width of the PMC */ /* configuring/reading/writing the hardware PMCs */ int (*pcd_config_pmc)(int _cpu, int _ri, struct pmc *_pm); int (*pcd_get_config)(int _cpu, int _ri, struct pmc **_ppm); int (*pcd_read_pmc)(int _cpu, int _ri, pmc_value_t *_value); int (*pcd_write_pmc)(int _cpu, int _ri, pmc_value_t _value); /* pmc allocation/release */ int (*pcd_allocate_pmc)(int _cpu, int _ri, struct pmc *_t, const struct pmc_op_pmcallocate *_a); int (*pcd_release_pmc)(int _cpu, int _ri, struct pmc *_pm); /* starting and stopping PMCs */ int (*pcd_start_pmc)(int _cpu, int _ri); int (*pcd_stop_pmc)(int _cpu, int _ri); /* description */ int (*pcd_describe)(int _cpu, int _ri, struct pmc_info *_pi, struct pmc **_ppmc); /* class-dependent initialization & finalization */ int (*pcd_pcpu_init)(struct pmc_mdep *_md, int _cpu); int (*pcd_pcpu_fini)(struct pmc_mdep *_md, int _cpu); /* machine-specific interface */ int (*pcd_get_msr)(int _ri, uint32_t *_msr); }; /* * struct pmc_mdep * * Machine dependent bits needed per CPU type. */ struct pmc_mdep { uint32_t pmd_cputype; /* from enum pmc_cputype */ uint32_t pmd_npmc; /* number of PMCs per CPU */ uint32_t pmd_nclass; /* number of PMC classes present */ /* * Machine dependent methods. */ - /* per-cpu initialization and finalization */ - int (*pmd_pcpu_init)(struct pmc_mdep *_md, int _cpu); - int (*pmd_pcpu_fini)(struct pmc_mdep *_md, int _cpu); - /* thread context switch in/out */ int (*pmd_switch_in)(struct pmc_cpu *_p, struct pmc_process *_pp); int (*pmd_switch_out)(struct pmc_cpu *_p, struct pmc_process *_pp); /* handle a PMC interrupt */ int (*pmd_intr)(struct trapframe *_tf); /* * PMC class dependent information. */ struct pmc_classdep pmd_classdep[]; }; /* * Per-CPU state. This is an array of 'mp_ncpu' pointers * to struct pmc_cpu descriptors. */ extern struct pmc_cpu **pmc_pcpu; /* driver statistics */ extern struct pmc_driverstats pmc_stats; #if defined(HWPMC_DEBUG) #include /* debug flags, major flag groups */ struct pmc_debugflags { int pdb_CPU; int pdb_CSW; int pdb_LOG; int pdb_MDP; int pdb_MOD; int pdb_OWN; int pdb_PMC; int pdb_PRC; int pdb_SAM; }; extern struct pmc_debugflags pmc_debugflags; #define KTR_PMC KTR_SUBSYS #define PMC_DEBUG_STRSIZE 128 #define PMC_DEBUG_DEFAULT_FLAGS { 0, 0, 0, 0, 0, 0, 0, 0, 0 } #define PMCDBG0(M, N, L, F) do { \ if (pmc_debugflags.pdb_ ## M & (1 << PMC_DEBUG_MIN_ ## N)) \ CTR0(KTR_PMC, #M ":" #N ":" #L ": " F); \ } while (0) #define PMCDBG1(M, N, L, F, p1) do { \ if (pmc_debugflags.pdb_ ## M & (1 << PMC_DEBUG_MIN_ ## N)) \ CTR1(KTR_PMC, #M ":" #N ":" #L ": " F, p1); \ } while (0) #define PMCDBG2(M, N, L, F, p1, p2) do { \ if (pmc_debugflags.pdb_ ## M & (1 << PMC_DEBUG_MIN_ ## N)) \ CTR2(KTR_PMC, #M ":" #N ":" #L ": " F, p1, p2); \ } while (0) #define PMCDBG3(M, N, L, F, p1, p2, p3) do { \ if (pmc_debugflags.pdb_ ## M & (1 << PMC_DEBUG_MIN_ ## N)) \ CTR3(KTR_PMC, #M ":" #N ":" #L ": " F, p1, p2, p3); \ } while (0) #define PMCDBG4(M, N, L, F, p1, p2, p3, p4) do { \ if (pmc_debugflags.pdb_ ## M & (1 << PMC_DEBUG_MIN_ ## N)) \ CTR4(KTR_PMC, #M ":" #N ":" #L ": " F, p1, p2, p3, p4);\ } while (0) #define PMCDBG5(M, N, L, F, p1, p2, p3, p4, p5) do { \ if (pmc_debugflags.pdb_ ## M & (1 << PMC_DEBUG_MIN_ ## N)) \ CTR5(KTR_PMC, #M ":" #N ":" #L ": " F, p1, p2, p3, p4, \ p5); \ } while (0) #define PMCDBG6(M, N, L, F, p1, p2, p3, p4, p5, p6) do { \ if (pmc_debugflags.pdb_ ## M & (1 << PMC_DEBUG_MIN_ ## N)) \ CTR6(KTR_PMC, #M ":" #N ":" #L ": " F, p1, p2, p3, p4, \ p5, p6); \ } while (0) /* Major numbers */ #define PMC_DEBUG_MAJ_CPU 0 /* cpu switches */ #define PMC_DEBUG_MAJ_CSW 1 /* context switches */ #define PMC_DEBUG_MAJ_LOG 2 /* logging */ #define PMC_DEBUG_MAJ_MDP 3 /* machine dependent */ #define PMC_DEBUG_MAJ_MOD 4 /* misc module infrastructure */ #define PMC_DEBUG_MAJ_OWN 5 /* owner */ #define PMC_DEBUG_MAJ_PMC 6 /* pmc management */ #define PMC_DEBUG_MAJ_PRC 7 /* processes */ #define PMC_DEBUG_MAJ_SAM 8 /* sampling */ /* Minor numbers */ /* Common (8 bits) */ #define PMC_DEBUG_MIN_ALL 0 /* allocation */ #define PMC_DEBUG_MIN_REL 1 /* release */ #define PMC_DEBUG_MIN_OPS 2 /* ops: start, stop, ... */ #define PMC_DEBUG_MIN_INI 3 /* init */ #define PMC_DEBUG_MIN_FND 4 /* find */ /* MODULE */ #define PMC_DEBUG_MIN_PMH 14 /* pmc_hook */ #define PMC_DEBUG_MIN_PMS 15 /* pmc_syscall */ /* OWN */ #define PMC_DEBUG_MIN_ORM 8 /* owner remove */ #define PMC_DEBUG_MIN_OMR 9 /* owner maybe remove */ /* PROCESSES */ #define PMC_DEBUG_MIN_TLK 8 /* link target */ #define PMC_DEBUG_MIN_TUL 9 /* unlink target */ #define PMC_DEBUG_MIN_EXT 10 /* process exit */ #define PMC_DEBUG_MIN_EXC 11 /* process exec */ #define PMC_DEBUG_MIN_FRK 12 /* process fork */ #define PMC_DEBUG_MIN_ATT 13 /* attach/detach */ #define PMC_DEBUG_MIN_SIG 14 /* signalling */ /* CONTEXT SWITCHES */ #define PMC_DEBUG_MIN_SWI 8 /* switch in */ #define PMC_DEBUG_MIN_SWO 9 /* switch out */ /* PMC */ #define PMC_DEBUG_MIN_REG 8 /* pmc register */ #define PMC_DEBUG_MIN_ALR 9 /* allocate row */ /* MACHINE DEPENDENT LAYER */ #define PMC_DEBUG_MIN_REA 8 /* read */ #define PMC_DEBUG_MIN_WRI 9 /* write */ #define PMC_DEBUG_MIN_CFG 10 /* config */ #define PMC_DEBUG_MIN_STA 11 /* start */ #define PMC_DEBUG_MIN_STO 12 /* stop */ #define PMC_DEBUG_MIN_INT 13 /* interrupts */ /* CPU */ #define PMC_DEBUG_MIN_BND 8 /* bind */ #define PMC_DEBUG_MIN_SEL 9 /* select */ /* LOG */ #define PMC_DEBUG_MIN_GTB 8 /* get buf */ #define PMC_DEBUG_MIN_SIO 9 /* schedule i/o */ #define PMC_DEBUG_MIN_FLS 10 /* flush */ #define PMC_DEBUG_MIN_SAM 11 /* sample */ #define PMC_DEBUG_MIN_CLO 12 /* close */ #else #define PMCDBG0(M, N, L, F) /* nothing */ #define PMCDBG1(M, N, L, F, p1) #define PMCDBG2(M, N, L, F, p1, p2) #define PMCDBG3(M, N, L, F, p1, p2, p3) #define PMCDBG4(M, N, L, F, p1, p2, p3, p4) #define PMCDBG5(M, N, L, F, p1, p2, p3, p4, p5) #define PMCDBG6(M, N, L, F, p1, p2, p3, p4, p5, p6) #endif /* declare a dedicated memory pool */ MALLOC_DECLARE(M_PMC); /* * Functions */ struct pmc_mdep *pmc_md_initialize(void); /* MD init function */ void pmc_md_finalize(struct pmc_mdep *_md); /* MD fini function */ int pmc_getrowdisp(int _ri); int pmc_process_interrupt(int _ring, struct pmc *_pm, struct trapframe *_tf); int pmc_save_kernel_callchain(uintptr_t *_cc, int _maxsamples, struct trapframe *_tf); int pmc_save_user_callchain(uintptr_t *_cc, int _maxsamples, struct trapframe *_tf); void pmc_restore_cpu_binding(struct pmc_binding *pb); void pmc_save_cpu_binding(struct pmc_binding *pb); void pmc_select_cpu(int cpu); struct pmc_mdep *pmc_mdep_alloc(int nclasses); void pmc_mdep_free(struct pmc_mdep *md); uint64_t pmc_rdtsc(void); #endif /* _KERNEL */ #endif /* _SYS_PMC_H_ */