Index: stable/8/lib/libpmc/Makefile =================================================================== --- stable/8/lib/libpmc/Makefile (revision 206701) +++ stable/8/lib/libpmc/Makefile (revision 206702) @@ -1,65 +1,70 @@ # $FreeBSD$ LIB= pmc SRCS= libpmc.c pmclog.c INCS= pmc.h pmclog.h WARNS?= 6 MAN= pmc.3 MAN+= pmc_allocate.3 MAN+= pmc_attach.3 MAN+= pmc_capabilities.3 MAN+= pmc_configure_logfile.3 MAN+= pmc_disable.3 MAN+= pmc_event_names_of_class.3 MAN+= pmc_get_driver_stats.3 MAN+= pmc_get_msr.3 MAN+= pmc_init.3 MAN+= pmc_name_of_capability.3 MAN+= pmc_read.3 MAN+= pmc_set.3 MAN+= pmc_start.3 MAN+= pmclog.3 # PMC-dependent manual pages MAN+= pmc.atom.3 MAN+= pmc.core.3 MAN+= pmc.core2.3 MAN+= pmc.iaf.3 +MAN+= pmc.ucf.3 MAN+= pmc.k7.3 MAN+= pmc.k8.3 MAN+= pmc.p4.3 MAN+= pmc.p5.3 MAN+= pmc.p6.3 +MAN+= pmc.corei7.3 +MAN+= pmc.corei7uc.3 +MAN+= pmc.westmere.3 +MAN+= pmc.westmereuc.3 MAN+= pmc.tsc.3 MLINKS+= \ pmc_allocate.3 pmc_release.3 \ pmc_attach.3 pmc_detach.3 \ pmc_capabilities.3 pmc_ncpu.3 \ pmc_capabilities.3 pmc_npmc.3 \ pmc_capabilities.3 pmc_pmcinfo.3 \ pmc_capabilities.3 pmc_cpuinfo.3 \ pmc_capabilities.3 pmc_width.3 \ pmc_configure_logfile.3 pmc_flush_logfile.3 \ pmc_configure_logfile.3 pmc_writelog.3 \ pmc_disable.3 pmc_enable.3 \ pmc_name_of_capability.3 pmc_name_of_class.3 \ pmc_name_of_capability.3 pmc_name_of_cputype.3 \ pmc_name_of_capability.3 pmc_name_of_disposition.3 \ pmc_name_of_capability.3 pmc_name_of_event.3 \ pmc_name_of_capability.3 pmc_name_of_mode.3 \ pmc_name_of_capability.3 pmc_name_of_state.3 \ pmc_read.3 pmc_rw.3 \ pmc_read.3 pmc_write.3 \ pmc_start.3 pmc_stop.3 MLINKS+= \ pmclog.3 pmclog_open.3 \ pmclog.3 pmclog_close.3 \ pmclog.3 pmclog_feed.3 \ pmclog.3 pmclog_read.3 .include Index: stable/8/lib/libpmc/libpmc.c =================================================================== --- stable/8/lib/libpmc/libpmc.c (revision 206701) +++ stable/8/lib/libpmc/libpmc.c (revision 206702) @@ -1,2845 +1,3012 @@ /*- * Copyright (c) 
2003-2008 Joseph Koshy * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include "libpmcinternal.h" /* Function prototypes */ #if defined(__i386__) static int k7_allocate_pmc(enum pmc_event _pe, char *_ctrspec, struct pmc_op_pmcallocate *_pmc_config); #endif #if defined(__amd64__) || defined(__i386__) static int iaf_allocate_pmc(enum pmc_event _pe, char *_ctrspec, struct pmc_op_pmcallocate *_pmc_config); static int iap_allocate_pmc(enum pmc_event _pe, char *_ctrspec, struct pmc_op_pmcallocate *_pmc_config); +static int ucf_allocate_pmc(enum pmc_event _pe, char *_ctrspec, + struct pmc_op_pmcallocate *_pmc_config); +static int ucp_allocate_pmc(enum pmc_event _pe, char *_ctrspec, + struct pmc_op_pmcallocate *_pmc_config); static int k8_allocate_pmc(enum pmc_event _pe, char *_ctrspec, struct pmc_op_pmcallocate *_pmc_config); static int p4_allocate_pmc(enum pmc_event _pe, char *_ctrspec, struct pmc_op_pmcallocate *_pmc_config); #endif #if defined(__i386__) static int p5_allocate_pmc(enum pmc_event _pe, char *_ctrspec, struct pmc_op_pmcallocate *_pmc_config); static int p6_allocate_pmc(enum pmc_event _pe, char *_ctrspec, struct pmc_op_pmcallocate *_pmc_config); #endif #if defined(__amd64__) || defined(__i386__) static int tsc_allocate_pmc(enum pmc_event _pe, char *_ctrspec, struct pmc_op_pmcallocate *_pmc_config); #endif #define PMC_CALL(cmd, params) \ syscall(pmc_syscall, PMC_OP_##cmd, (params)) /* * Event aliases provide a way for the user to ask for generic events * like "cache-misses", or "instructions-retired". These aliases are * mapped to the appropriate canonical event descriptions using a * lookup table. */ struct pmc_event_alias { const char *pm_alias; const char *pm_spec; }; static const struct pmc_event_alias *pmc_mdep_event_aliases; /* * The pmc_event_descr structure maps symbolic names known to the user * to integer codes used by the PMC KLD. 
*/ struct pmc_event_descr { const char *pm_ev_name; enum pmc_event pm_ev_code; }; /* * The pmc_class_descr structure maps class name prefixes for * event names to event tables and other PMC class data. */ struct pmc_class_descr { const char *pm_evc_name; size_t pm_evc_name_size; enum pmc_class pm_evc_class; const struct pmc_event_descr *pm_evc_event_table; size_t pm_evc_event_table_size; int (*pm_evc_allocate_pmc)(enum pmc_event _pe, char *_ctrspec, struct pmc_op_pmcallocate *_pa); }; #define PMC_TABLE_SIZE(N) (sizeof(N)/sizeof(N[0])) #define PMC_EVENT_TABLE_SIZE(N) PMC_TABLE_SIZE(N##_event_table) #undef __PMC_EV #define __PMC_EV(C,N) { #N, PMC_EV_ ## C ## _ ## N }, /* * PMC_CLASSDEP_TABLE(NAME, CLASS) * * Define a table mapping event names and aliases to HWPMC event IDs. */ #define PMC_CLASSDEP_TABLE(N, C) \ static const struct pmc_event_descr N##_event_table[] = \ { \ __PMC_EV_##C() \ } PMC_CLASSDEP_TABLE(iaf, IAF); PMC_CLASSDEP_TABLE(k7, K7); PMC_CLASSDEP_TABLE(k8, K8); PMC_CLASSDEP_TABLE(p4, P4); PMC_CLASSDEP_TABLE(p5, P5); PMC_CLASSDEP_TABLE(p6, P6); +PMC_CLASSDEP_TABLE(ucf, UCF); #undef __PMC_EV_ALIAS #define __PMC_EV_ALIAS(N,CODE) { N, PMC_EV_##CODE }, static const struct pmc_event_descr atom_event_table[] = { __PMC_EV_ALIAS_ATOM() }; static const struct pmc_event_descr core_event_table[] = { __PMC_EV_ALIAS_CORE() }; static const struct pmc_event_descr core2_event_table[] = { __PMC_EV_ALIAS_CORE2() }; static const struct pmc_event_descr corei7_event_table[] = { __PMC_EV_ALIAS_COREI7() }; +static const struct pmc_event_descr westmere_event_table[] = +{ + __PMC_EV_ALIAS_WESTMERE() +}; + +static const struct pmc_event_descr corei7uc_event_table[] = +{ + __PMC_EV_ALIAS_COREI7UC() +}; + +static const struct pmc_event_descr westmereuc_event_table[] = +{ + __PMC_EV_ALIAS_WESTMEREUC() +}; + /* * PMC_MDEP_TABLE(NAME, PRIMARYCLASS, ADDITIONAL_CLASSES...) * * Map a CPU to the PMC classes it supports. */ #define PMC_MDEP_TABLE(N,C,...) 
\ static const enum pmc_class N##_pmc_classes[] = { \ PMC_CLASS_##C, __VA_ARGS__ \ } PMC_MDEP_TABLE(atom, IAP, PMC_CLASS_IAF, PMC_CLASS_TSC); PMC_MDEP_TABLE(core, IAP, PMC_CLASS_TSC); PMC_MDEP_TABLE(core2, IAP, PMC_CLASS_IAF, PMC_CLASS_TSC); -PMC_MDEP_TABLE(corei7, IAP, PMC_CLASS_IAF, PMC_CLASS_TSC); +PMC_MDEP_TABLE(corei7, IAP, PMC_CLASS_IAF, PMC_CLASS_TSC, PMC_CLASS_UCF, PMC_CLASS_UCP); +PMC_MDEP_TABLE(westmere, IAP, PMC_CLASS_IAF, PMC_CLASS_TSC, PMC_CLASS_UCF, PMC_CLASS_UCP); PMC_MDEP_TABLE(k7, K7, PMC_CLASS_TSC); PMC_MDEP_TABLE(k8, K8, PMC_CLASS_TSC); PMC_MDEP_TABLE(p4, P4, PMC_CLASS_TSC); PMC_MDEP_TABLE(p5, P5, PMC_CLASS_TSC); PMC_MDEP_TABLE(p6, P6, PMC_CLASS_TSC); static const struct pmc_event_descr tsc_event_table[] = { __PMC_EV_TSC() }; #undef PMC_CLASS_TABLE_DESC #define PMC_CLASS_TABLE_DESC(NAME, CLASS, EVENTS, ALLOCATOR) \ static const struct pmc_class_descr NAME##_class_table_descr = \ { \ .pm_evc_name = #CLASS "-", \ .pm_evc_name_size = sizeof(#CLASS "-") - 1, \ .pm_evc_class = PMC_CLASS_##CLASS , \ .pm_evc_event_table = EVENTS##_event_table , \ .pm_evc_event_table_size = \ PMC_EVENT_TABLE_SIZE(EVENTS), \ .pm_evc_allocate_pmc = ALLOCATOR##_allocate_pmc \ } #if defined(__i386__) || defined(__amd64__) PMC_CLASS_TABLE_DESC(iaf, IAF, iaf, iaf); PMC_CLASS_TABLE_DESC(atom, IAP, atom, iap); PMC_CLASS_TABLE_DESC(core, IAP, core, iap); PMC_CLASS_TABLE_DESC(core2, IAP, core2, iap); PMC_CLASS_TABLE_DESC(corei7, IAP, corei7, iap); +PMC_CLASS_TABLE_DESC(westmere, IAP, westmere, iap); +PMC_CLASS_TABLE_DESC(ucf, UCF, ucf, ucf); +PMC_CLASS_TABLE_DESC(corei7uc, UCP, corei7uc, ucp); +PMC_CLASS_TABLE_DESC(westmereuc, UCP, westmereuc, ucp); #endif #if defined(__i386__) PMC_CLASS_TABLE_DESC(k7, K7, k7, k7); #endif #if defined(__i386__) || defined(__amd64__) PMC_CLASS_TABLE_DESC(k8, K8, k8, k8); PMC_CLASS_TABLE_DESC(p4, P4, p4, p4); #endif #if defined(__i386__) PMC_CLASS_TABLE_DESC(p5, P5, p5, p5); PMC_CLASS_TABLE_DESC(p6, P6, p6, p6); #endif #if defined(__i386__) || 
defined(__amd64__) PMC_CLASS_TABLE_DESC(tsc, TSC, tsc, tsc); #endif #undef PMC_CLASS_TABLE_DESC static const struct pmc_class_descr **pmc_class_table; #define PMC_CLASS_TABLE_SIZE cpu_info.pm_nclass static const enum pmc_class *pmc_mdep_class_list; static size_t pmc_mdep_class_list_size; /* * Mapping tables, mapping enumeration values to human readable * strings. */ static const char * pmc_capability_names[] = { #undef __PMC_CAP #define __PMC_CAP(N,V,D) #N , __PMC_CAPS() }; static const char * pmc_class_names[] = { #undef __PMC_CLASS #define __PMC_CLASS(C) #C , __PMC_CLASSES() }; struct pmc_cputype_map { enum pmc_class pm_cputype; const char *pm_name; }; static const struct pmc_cputype_map pmc_cputype_names[] = { #undef __PMC_CPU #define __PMC_CPU(S, V, D) { .pm_cputype = PMC_CPU_##S, .pm_name = #S } , __PMC_CPUS() }; static const char * pmc_disposition_names[] = { #undef __PMC_DISP #define __PMC_DISP(D) #D , __PMC_DISPOSITIONS() }; static const char * pmc_mode_names[] = { #undef __PMC_MODE #define __PMC_MODE(M,N) #M , __PMC_MODES() }; static const char * pmc_state_names[] = { #undef __PMC_STATE #define __PMC_STATE(S) #S , __PMC_STATES() }; static int pmc_syscall = -1; /* filled in by pmc_init() */ static struct pmc_cpuinfo cpu_info; /* filled in by pmc_init() */ /* Event masks for events */ struct pmc_masks { const char *pm_name; const uint32_t pm_value; }; #define PMCMASK(N,V) { .pm_name = #N, .pm_value = (V) } -#define NULLMASK PMCMASK(NULL,0) +#define NULLMASK { .pm_name = NULL } #if defined(__amd64__) || defined(__i386__) static int pmc_parse_mask(const struct pmc_masks *pmask, char *p, uint32_t *evmask) { const struct pmc_masks *pm; char *q, *r; int c; if (pmask == NULL) /* no mask keywords */ return (-1); q = strchr(p, '='); /* skip '=' */ if (*++q == '\0') /* no more data */ return (-1); c = 0; /* count of mask keywords seen */ while ((r = strsep(&q, "+")) != NULL) { for (pm = pmask; pm->pm_name && strcasecmp(r, pm->pm_name); pm++) ; if (pm->pm_name == 
NULL) /* not found */ return (-1); *evmask |= pm->pm_value; c++; } return (c); } #endif #define KWMATCH(p,kw) (strcasecmp((p), (kw)) == 0) #define KWPREFIXMATCH(p,kw) (strncasecmp((p), (kw), sizeof((kw)) - 1) == 0) #define EV_ALIAS(N,S) { .pm_alias = N, .pm_spec = S } #if defined(__i386__) /* * AMD K7 (Athlon) CPUs. */ static struct pmc_event_alias k7_aliases[] = { EV_ALIAS("branches", "k7-retired-branches"), EV_ALIAS("branch-mispredicts", "k7-retired-branches-mispredicted"), EV_ALIAS("cycles", "tsc"), EV_ALIAS("dc-misses", "k7-dc-misses"), EV_ALIAS("ic-misses", "k7-ic-misses"), EV_ALIAS("instructions", "k7-retired-instructions"), EV_ALIAS("interrupts", "k7-hardware-interrupts"), EV_ALIAS(NULL, NULL) }; #define K7_KW_COUNT "count" #define K7_KW_EDGE "edge" #define K7_KW_INV "inv" #define K7_KW_OS "os" #define K7_KW_UNITMASK "unitmask" #define K7_KW_USR "usr" static int k7_allocate_pmc(enum pmc_event pe, char *ctrspec, struct pmc_op_pmcallocate *pmc_config) { char *e, *p, *q; int c, has_unitmask; uint32_t count, unitmask; pmc_config->pm_md.pm_amd.pm_amd_config = 0; pmc_config->pm_caps |= (PMC_CAP_READ | PMC_CAP_WRITE); if (pe == PMC_EV_K7_DC_REFILLS_FROM_L2 || pe == PMC_EV_K7_DC_REFILLS_FROM_SYSTEM || pe == PMC_EV_K7_DC_WRITEBACKS) { has_unitmask = 1; unitmask = AMD_PMC_UNITMASK_MOESI; } else unitmask = has_unitmask = 0; while ((p = strsep(&ctrspec, ",")) != NULL) { if (KWPREFIXMATCH(p, K7_KW_COUNT "=")) { q = strchr(p, '='); if (*++q == '\0') /* skip '=' */ return (-1); count = strtol(q, &e, 0); if (e == q || *e != '\0') return (-1); pmc_config->pm_caps |= PMC_CAP_THRESHOLD; pmc_config->pm_md.pm_amd.pm_amd_config |= AMD_PMC_TO_COUNTER(count); } else if (KWMATCH(p, K7_KW_EDGE)) { pmc_config->pm_caps |= PMC_CAP_EDGE; } else if (KWMATCH(p, K7_KW_INV)) { pmc_config->pm_caps |= PMC_CAP_INVERT; } else if (KWMATCH(p, K7_KW_OS)) { pmc_config->pm_caps |= PMC_CAP_SYSTEM; } else if (KWPREFIXMATCH(p, K7_KW_UNITMASK "=")) { if (has_unitmask == 0) return (-1); unitmask = 0; q = 
strchr(p, '='); if (*++q == '\0') /* skip '=' */ return (-1); while ((c = tolower(*q++)) != 0) if (c == 'm') unitmask |= AMD_PMC_UNITMASK_M; else if (c == 'o') unitmask |= AMD_PMC_UNITMASK_O; else if (c == 'e') unitmask |= AMD_PMC_UNITMASK_E; else if (c == 's') unitmask |= AMD_PMC_UNITMASK_S; else if (c == 'i') unitmask |= AMD_PMC_UNITMASK_I; else if (c == '+') continue; else return (-1); if (unitmask == 0) return (-1); } else if (KWMATCH(p, K7_KW_USR)) { pmc_config->pm_caps |= PMC_CAP_USER; } else return (-1); } if (has_unitmask) { pmc_config->pm_caps |= PMC_CAP_QUALIFIER; pmc_config->pm_md.pm_amd.pm_amd_config |= AMD_PMC_TO_UNITMASK(unitmask); } return (0); } #endif #if defined(__amd64__) || defined(__i386__) /* * Intel Core (Family 6, Model E) PMCs. */ static struct pmc_event_alias core_aliases[] = { EV_ALIAS("branches", "iap-br-instr-ret"), EV_ALIAS("branch-mispredicts", "iap-br-mispred-ret"), EV_ALIAS("cycles", "tsc-tsc"), EV_ALIAS("ic-misses", "iap-icache-misses"), EV_ALIAS("instructions", "iap-instr-ret"), EV_ALIAS("interrupts", "iap-core-hw-int-rx"), EV_ALIAS("unhalted-cycles", "iap-unhalted-core-cycles"), EV_ALIAS(NULL, NULL) }; /* * Intel Core2 (Family 6, Model F), Core2Extreme (Family 6, Model 17H) * and Atom (Family 6, model 1CH) PMCs. * * We map aliases to events on the fixed-function counters if these * are present. Note that not all CPUs in this family contain fixed-function * counters. 
*/ static struct pmc_event_alias core2_aliases[] = { EV_ALIAS("branches", "iap-br-inst-retired.any"), EV_ALIAS("branch-mispredicts", "iap-br-inst-retired.mispred"), EV_ALIAS("cycles", "tsc-tsc"), EV_ALIAS("ic-misses", "iap-l1i-misses"), EV_ALIAS("instructions", "iaf-instr-retired.any"), EV_ALIAS("interrupts", "iap-hw-int-rcv"), EV_ALIAS("unhalted-cycles", "iaf-cpu-clk-unhalted.core"), EV_ALIAS(NULL, NULL) }; static struct pmc_event_alias core2_aliases_without_iaf[] = { EV_ALIAS("branches", "iap-br-inst-retired.any"), EV_ALIAS("branch-mispredicts", "iap-br-inst-retired.mispred"), EV_ALIAS("cycles", "tsc-tsc"), EV_ALIAS("ic-misses", "iap-l1i-misses"), EV_ALIAS("instructions", "iap-inst-retired.any_p"), EV_ALIAS("interrupts", "iap-hw-int-rcv"), EV_ALIAS("unhalted-cycles", "iap-cpu-clk-unhalted.core_p"), EV_ALIAS(NULL, NULL) }; #define atom_aliases core2_aliases #define atom_aliases_without_iaf core2_aliases_without_iaf #define corei7_aliases core2_aliases #define corei7_aliases_without_iaf core2_aliases_without_iaf +#define westmere_aliases core2_aliases +#define westmere_aliases_without_iaf core2_aliases_without_iaf #define IAF_KW_OS "os" #define IAF_KW_USR "usr" #define IAF_KW_ANYTHREAD "anythread" /* * Parse an event specifier for Intel fixed function counters. */ static int iaf_allocate_pmc(enum pmc_event pe, char *ctrspec, struct pmc_op_pmcallocate *pmc_config) { char *p; (void) pe; pmc_config->pm_caps |= (PMC_CAP_READ | PMC_CAP_WRITE); pmc_config->pm_md.pm_iaf.pm_iaf_flags = 0; while ((p = strsep(&ctrspec, ",")) != NULL) { if (KWMATCH(p, IAF_KW_OS)) pmc_config->pm_caps |= PMC_CAP_SYSTEM; else if (KWMATCH(p, IAF_KW_USR)) pmc_config->pm_caps |= PMC_CAP_USER; else if (KWMATCH(p, IAF_KW_ANYTHREAD)) pmc_config->pm_md.pm_iaf.pm_iaf_flags |= IAF_ANY; else return (-1); } return (0); } /* * Core/Core2 support. 
*/ #define IAP_KW_AGENT "agent" #define IAP_KW_ANYTHREAD "anythread" #define IAP_KW_CACHESTATE "cachestate" #define IAP_KW_CMASK "cmask" #define IAP_KW_CORE "core" #define IAP_KW_EDGE "edge" #define IAP_KW_INV "inv" #define IAP_KW_OS "os" #define IAP_KW_PREFETCH "prefetch" #define IAP_KW_SNOOPRESPONSE "snoopresponse" #define IAP_KW_SNOOPTYPE "snooptype" #define IAP_KW_TRANSITION "trans" #define IAP_KW_USR "usr" +#define IAP_KW_RSP "rsp" static struct pmc_masks iap_core_mask[] = { PMCMASK(all, (0x3 << 14)), PMCMASK(this, (0x1 << 14)), NULLMASK }; static struct pmc_masks iap_agent_mask[] = { PMCMASK(this, 0), PMCMASK(any, (0x1 << 13)), NULLMASK }; static struct pmc_masks iap_prefetch_mask[] = { PMCMASK(both, (0x3 << 12)), PMCMASK(only, (0x1 << 12)), PMCMASK(exclude, 0), NULLMASK }; static struct pmc_masks iap_cachestate_mask[] = { PMCMASK(i, (1 << 8)), PMCMASK(s, (1 << 9)), PMCMASK(e, (1 << 10)), PMCMASK(m, (1 << 11)), NULLMASK }; static struct pmc_masks iap_snoopresponse_mask[] = { PMCMASK(clean, (1 << 8)), PMCMASK(hit, (1 << 9)), PMCMASK(hitm, (1 << 11)), NULLMASK }; static struct pmc_masks iap_snooptype_mask[] = { PMCMASK(cmp2s, (1 << 8)), PMCMASK(cmp2i, (1 << 9)), NULLMASK }; static struct pmc_masks iap_transition_mask[] = { PMCMASK(any, 0x00), PMCMASK(frequency, 0x10), NULLMASK }; +static struct pmc_masks iap_rsp_mask[] = { + PMCMASK(DMND_DATA_RD, (1 << 0)), + PMCMASK(DMND_RFO, (1 << 1)), + PMCMASK(DMND_IFETCH, (1 << 2)), + PMCMASK(WB, (1 << 3)), + PMCMASK(PF_DATA_RD, (1 << 4)), + PMCMASK(PF_RFO, (1 << 5)), + PMCMASK(PF_IFETCH, (1 << 6)), + PMCMASK(OTHER, (1 << 7)), + PMCMASK(UNCORE_HIT, (1 << 8)), + PMCMASK(OTHER_CORE_HIT_SNP, (1 << 9)), + PMCMASK(OTHER_CORE_HITM, (1 << 10)), + PMCMASK(REMOTE_CACHE_FWD, (1 << 12)), + PMCMASK(REMOTE_DRAM, (1 << 13)), + PMCMASK(LOCAL_DRAM, (1 << 14)), + PMCMASK(NON_DRAM, (1 << 15)), + NULLMASK +}; + static int iap_allocate_pmc(enum pmc_event pe, char *ctrspec, struct pmc_op_pmcallocate *pmc_config) { char *e, *p, *q; - uint32_t 
cachestate, evmask; + uint32_t cachestate, evmask, rsp; int count, n; pmc_config->pm_caps |= (PMC_CAP_READ | PMC_CAP_WRITE | PMC_CAP_QUALIFIER); pmc_config->pm_md.pm_iap.pm_iap_config = 0; - cachestate = evmask = 0; + cachestate = evmask = rsp = 0; /* Parse additional modifiers if present */ while ((p = strsep(&ctrspec, ",")) != NULL) { n = 0; if (KWPREFIXMATCH(p, IAP_KW_CMASK "=")) { q = strchr(p, '='); if (*++q == '\0') /* skip '=' */ return (-1); count = strtol(q, &e, 0); if (e == q || *e != '\0') return (-1); pmc_config->pm_caps |= PMC_CAP_THRESHOLD; pmc_config->pm_md.pm_iap.pm_iap_config |= IAP_CMASK(count); } else if (KWMATCH(p, IAP_KW_EDGE)) { pmc_config->pm_caps |= PMC_CAP_EDGE; } else if (KWMATCH(p, IAP_KW_INV)) { pmc_config->pm_caps |= PMC_CAP_INVERT; } else if (KWMATCH(p, IAP_KW_OS)) { pmc_config->pm_caps |= PMC_CAP_SYSTEM; } else if (KWMATCH(p, IAP_KW_USR)) { pmc_config->pm_caps |= PMC_CAP_USER; } else if (KWMATCH(p, IAP_KW_ANYTHREAD)) { pmc_config->pm_md.pm_iap.pm_iap_config |= IAP_ANY; } else if (KWPREFIXMATCH(p, IAP_KW_CORE "=")) { n = pmc_parse_mask(iap_core_mask, p, &evmask); if (n != 1) return (-1); } else if (KWPREFIXMATCH(p, IAP_KW_AGENT "=")) { n = pmc_parse_mask(iap_agent_mask, p, &evmask); if (n != 1) return (-1); } else if (KWPREFIXMATCH(p, IAP_KW_PREFETCH "=")) { n = pmc_parse_mask(iap_prefetch_mask, p, &evmask); if (n != 1) return (-1); } else if (KWPREFIXMATCH(p, IAP_KW_CACHESTATE "=")) { n = pmc_parse_mask(iap_cachestate_mask, p, &cachestate); } else if (cpu_info.pm_cputype == PMC_CPU_INTEL_CORE && KWPREFIXMATCH(p, IAP_KW_TRANSITION "=")) { n = pmc_parse_mask(iap_transition_mask, p, &evmask); if (n != 1) return (-1); } else if (cpu_info.pm_cputype == PMC_CPU_INTEL_ATOM || cpu_info.pm_cputype == PMC_CPU_INTEL_CORE2 || - cpu_info.pm_cputype == PMC_CPU_INTEL_CORE2EXTREME || - cpu_info.pm_cputype == PMC_CPU_INTEL_COREI7) { + cpu_info.pm_cputype == PMC_CPU_INTEL_CORE2EXTREME) { if (KWPREFIXMATCH(p, IAP_KW_SNOOPRESPONSE "=")) { n = 
pmc_parse_mask(iap_snoopresponse_mask, p, &evmask); } else if (KWPREFIXMATCH(p, IAP_KW_SNOOPTYPE "=")) { n = pmc_parse_mask(iap_snooptype_mask, p, &evmask); } else return (-1); + } else if (cpu_info.pm_cputype == PMC_CPU_INTEL_COREI7 || + cpu_info.pm_cputype == PMC_CPU_INTEL_WESTMERE) { + if (KWPREFIXMATCH(p, IAP_KW_RSP "=")) { + n = pmc_parse_mask(iap_rsp_mask, p, &rsp); + } else + return (-1); } else return (-1); if (n < 0) /* Parsing failed. */ return (-1); } pmc_config->pm_md.pm_iap.pm_iap_config |= evmask; /* * If the event requires a 'cachestate' qualifier but was not * specified by the user, use a sensible default. */ switch (pe) { case PMC_EV_IAP_EVENT_28H: /* Core, Core2, Atom */ case PMC_EV_IAP_EVENT_29H: /* Core, Core2, Atom */ case PMC_EV_IAP_EVENT_2AH: /* Core, Core2, Atom */ case PMC_EV_IAP_EVENT_2BH: /* Atom, Core2 */ case PMC_EV_IAP_EVENT_2EH: /* Core, Core2, Atom */ case PMC_EV_IAP_EVENT_30H: /* Core, Core2, Atom */ case PMC_EV_IAP_EVENT_32H: /* Core */ case PMC_EV_IAP_EVENT_40H: /* Core */ case PMC_EV_IAP_EVENT_41H: /* Core */ case PMC_EV_IAP_EVENT_42H: /* Core, Core2, Atom */ case PMC_EV_IAP_EVENT_77H: /* Core */ if (cachestate == 0) cachestate = (0xF << 8); default: break; } pmc_config->pm_md.pm_iap.pm_iap_config |= cachestate; + pmc_config->pm_md.pm_iap.pm_iap_rsp = rsp; return (0); } /* + * Intel Uncore. 
+ */ + +static int +ucf_allocate_pmc(enum pmc_event pe, char *ctrspec, + struct pmc_op_pmcallocate *pmc_config) +{ + (void) pe; + (void) ctrspec; + + pmc_config->pm_caps |= (PMC_CAP_READ | PMC_CAP_WRITE); + pmc_config->pm_md.pm_ucf.pm_ucf_flags = 0; + + return (0); +} + +#define UCP_KW_CMASK "cmask" +#define UCP_KW_EDGE "edge" +#define UCP_KW_INV "inv" + +static int +ucp_allocate_pmc(enum pmc_event pe, char *ctrspec, + struct pmc_op_pmcallocate *pmc_config) +{ + char *e, *p, *q; + int count, n; + + (void) pe; + + pmc_config->pm_caps |= (PMC_CAP_READ | PMC_CAP_WRITE | + PMC_CAP_QUALIFIER); + pmc_config->pm_md.pm_ucp.pm_ucp_config = 0; + + /* Parse additional modifiers if present */ + while ((p = strsep(&ctrspec, ",")) != NULL) { + + n = 0; + if (KWPREFIXMATCH(p, UCP_KW_CMASK "=")) { + q = strchr(p, '='); + if (*++q == '\0') /* skip '=' */ + return (-1); + count = strtol(q, &e, 0); + if (e == q || *e != '\0') + return (-1); + pmc_config->pm_caps |= PMC_CAP_THRESHOLD; + pmc_config->pm_md.pm_ucp.pm_ucp_config |= + UCP_CMASK(count); + } else if (KWMATCH(p, UCP_KW_EDGE)) { + pmc_config->pm_caps |= PMC_CAP_EDGE; + } else if (KWMATCH(p, UCP_KW_INV)) { + pmc_config->pm_caps |= PMC_CAP_INVERT; + } else + return (-1); + + if (n < 0) /* Parsing failed. */ + return (-1); + } + + return (0); +} + +/* * AMD K8 PMCs. * * These are very similar to AMD K7 PMCs, but support more kinds of * events. 
*/ static struct pmc_event_alias k8_aliases[] = { EV_ALIAS("branches", "k8-fr-retired-taken-branches"), EV_ALIAS("branch-mispredicts", "k8-fr-retired-taken-branches-mispredicted"), EV_ALIAS("cycles", "tsc"), EV_ALIAS("dc-misses", "k8-dc-miss"), EV_ALIAS("ic-misses", "k8-ic-miss"), EV_ALIAS("instructions", "k8-fr-retired-x86-instructions"), EV_ALIAS("interrupts", "k8-fr-taken-hardware-interrupts"), EV_ALIAS("unhalted-cycles", "k8-bu-cpu-clk-unhalted"), EV_ALIAS(NULL, NULL) }; #define __K8MASK(N,V) PMCMASK(N,(1 << (V))) /* * Parsing tables */ /* fp dispatched fpu ops */ static const struct pmc_masks k8_mask_fdfo[] = { __K8MASK(add-pipe-excluding-junk-ops, 0), __K8MASK(multiply-pipe-excluding-junk-ops, 1), __K8MASK(store-pipe-excluding-junk-ops, 2), __K8MASK(add-pipe-junk-ops, 3), __K8MASK(multiply-pipe-junk-ops, 4), __K8MASK(store-pipe-junk-ops, 5), NULLMASK }; /* ls segment register loads */ static const struct pmc_masks k8_mask_lsrl[] = { __K8MASK(es, 0), __K8MASK(cs, 1), __K8MASK(ss, 2), __K8MASK(ds, 3), __K8MASK(fs, 4), __K8MASK(gs, 5), __K8MASK(hs, 6), NULLMASK }; /* ls locked operation */ static const struct pmc_masks k8_mask_llo[] = { __K8MASK(locked-instructions, 0), __K8MASK(cycles-in-request, 1), __K8MASK(cycles-to-complete, 2), NULLMASK }; /* dc refill from {l2,system} and dc copyback */ static const struct pmc_masks k8_mask_dc[] = { __K8MASK(invalid, 0), __K8MASK(shared, 1), __K8MASK(exclusive, 2), __K8MASK(owner, 3), __K8MASK(modified, 4), NULLMASK }; /* dc one bit ecc error */ static const struct pmc_masks k8_mask_dobee[] = { __K8MASK(scrubber, 0), __K8MASK(piggyback, 1), NULLMASK }; /* dc dispatched prefetch instructions */ static const struct pmc_masks k8_mask_ddpi[] = { __K8MASK(load, 0), __K8MASK(store, 1), __K8MASK(nta, 2), NULLMASK }; /* dc dcache accesses by locks */ static const struct pmc_masks k8_mask_dabl[] = { __K8MASK(accesses, 0), __K8MASK(misses, 1), NULLMASK }; /* bu internal l2 request */ static const struct pmc_masks k8_mask_bilr[] = { 
__K8MASK(ic-fill, 0), __K8MASK(dc-fill, 1), __K8MASK(tlb-reload, 2), __K8MASK(tag-snoop, 3), __K8MASK(cancelled, 4), NULLMASK }; /* bu fill request l2 miss */ static const struct pmc_masks k8_mask_bfrlm[] = { __K8MASK(ic-fill, 0), __K8MASK(dc-fill, 1), __K8MASK(tlb-reload, 2), NULLMASK }; /* bu fill into l2 */ static const struct pmc_masks k8_mask_bfil[] = { __K8MASK(dirty-l2-victim, 0), __K8MASK(victim-from-l2, 1), NULLMASK }; /* fr retired fpu instructions */ static const struct pmc_masks k8_mask_frfi[] = { __K8MASK(x87, 0), __K8MASK(mmx-3dnow, 1), __K8MASK(packed-sse-sse2, 2), __K8MASK(scalar-sse-sse2, 3), NULLMASK }; /* fr retired fastpath double op instructions */ static const struct pmc_masks k8_mask_frfdoi[] = { __K8MASK(low-op-pos-0, 0), __K8MASK(low-op-pos-1, 1), __K8MASK(low-op-pos-2, 2), NULLMASK }; /* fr fpu exceptions */ static const struct pmc_masks k8_mask_ffe[] = { __K8MASK(x87-reclass-microfaults, 0), __K8MASK(sse-retype-microfaults, 1), __K8MASK(sse-reclass-microfaults, 2), __K8MASK(sse-and-x87-microtraps, 3), NULLMASK }; /* nb memory controller page access event */ static const struct pmc_masks k8_mask_nmcpae[] = { __K8MASK(page-hit, 0), __K8MASK(page-miss, 1), __K8MASK(page-conflict, 2), NULLMASK }; /* nb memory controller turnaround */ static const struct pmc_masks k8_mask_nmct[] = { __K8MASK(dimm-turnaround, 0), __K8MASK(read-to-write-turnaround, 1), __K8MASK(write-to-read-turnaround, 2), NULLMASK }; /* nb memory controller bypass saturation */ static const struct pmc_masks k8_mask_nmcbs[] = { __K8MASK(memory-controller-hi-pri-bypass, 0), __K8MASK(memory-controller-lo-pri-bypass, 1), __K8MASK(dram-controller-interface-bypass, 2), __K8MASK(dram-controller-queue-bypass, 3), NULLMASK }; /* nb sized commands */ static const struct pmc_masks k8_mask_nsc[] = { __K8MASK(nonpostwrszbyte, 0), __K8MASK(nonpostwrszdword, 1), __K8MASK(postwrszbyte, 2), __K8MASK(postwrszdword, 3), __K8MASK(rdszbyte, 4), __K8MASK(rdszdword, 5), __K8MASK(rdmodwr, 6), 
NULLMASK }; /* nb probe result */ static const struct pmc_masks k8_mask_npr[] = { __K8MASK(probe-miss, 0), __K8MASK(probe-hit, 1), __K8MASK(probe-hit-dirty-no-memory-cancel, 2), __K8MASK(probe-hit-dirty-with-memory-cancel, 3), NULLMASK }; /* nb hypertransport bus bandwidth */ static const struct pmc_masks k8_mask_nhbb[] = { /* HT bus bandwidth */ __K8MASK(command, 0), __K8MASK(data, 1), __K8MASK(buffer-release, 2), __K8MASK(nop, 3), NULLMASK }; #undef __K8MASK #define K8_KW_COUNT "count" #define K8_KW_EDGE "edge" #define K8_KW_INV "inv" #define K8_KW_MASK "mask" #define K8_KW_OS "os" #define K8_KW_USR "usr" static int k8_allocate_pmc(enum pmc_event pe, char *ctrspec, struct pmc_op_pmcallocate *pmc_config) { char *e, *p, *q; int n; uint32_t count, evmask; const struct pmc_masks *pm, *pmask; pmc_config->pm_caps |= (PMC_CAP_READ | PMC_CAP_WRITE); pmc_config->pm_md.pm_amd.pm_amd_config = 0; pmask = NULL; evmask = 0; #define __K8SETMASK(M) pmask = k8_mask_##M /* setup parsing tables */ switch (pe) { case PMC_EV_K8_FP_DISPATCHED_FPU_OPS: __K8SETMASK(fdfo); break; case PMC_EV_K8_LS_SEGMENT_REGISTER_LOAD: __K8SETMASK(lsrl); break; case PMC_EV_K8_LS_LOCKED_OPERATION: __K8SETMASK(llo); break; case PMC_EV_K8_DC_REFILL_FROM_L2: case PMC_EV_K8_DC_REFILL_FROM_SYSTEM: case PMC_EV_K8_DC_COPYBACK: __K8SETMASK(dc); break; case PMC_EV_K8_DC_ONE_BIT_ECC_ERROR: __K8SETMASK(dobee); break; case PMC_EV_K8_DC_DISPATCHED_PREFETCH_INSTRUCTIONS: __K8SETMASK(ddpi); break; case PMC_EV_K8_DC_DCACHE_ACCESSES_BY_LOCKS: __K8SETMASK(dabl); break; case PMC_EV_K8_BU_INTERNAL_L2_REQUEST: __K8SETMASK(bilr); break; case PMC_EV_K8_BU_FILL_REQUEST_L2_MISS: __K8SETMASK(bfrlm); break; case PMC_EV_K8_BU_FILL_INTO_L2: __K8SETMASK(bfil); break; case PMC_EV_K8_FR_RETIRED_FPU_INSTRUCTIONS: __K8SETMASK(frfi); break; case PMC_EV_K8_FR_RETIRED_FASTPATH_DOUBLE_OP_INSTRUCTIONS: __K8SETMASK(frfdoi); break; case PMC_EV_K8_FR_FPU_EXCEPTIONS: __K8SETMASK(ffe); break; case PMC_EV_K8_NB_MEMORY_CONTROLLER_PAGE_ACCESS_EVENT: 
__K8SETMASK(nmcpae); break; case PMC_EV_K8_NB_MEMORY_CONTROLLER_TURNAROUND: __K8SETMASK(nmct); break; case PMC_EV_K8_NB_MEMORY_CONTROLLER_BYPASS_SATURATION: __K8SETMASK(nmcbs); break; case PMC_EV_K8_NB_SIZED_COMMANDS: __K8SETMASK(nsc); break; case PMC_EV_K8_NB_PROBE_RESULT: __K8SETMASK(npr); break; case PMC_EV_K8_NB_HT_BUS0_BANDWIDTH: case PMC_EV_K8_NB_HT_BUS1_BANDWIDTH: case PMC_EV_K8_NB_HT_BUS2_BANDWIDTH: __K8SETMASK(nhbb); break; default: break; /* no options defined */ } while ((p = strsep(&ctrspec, ",")) != NULL) { if (KWPREFIXMATCH(p, K8_KW_COUNT "=")) { q = strchr(p, '='); if (*++q == '\0') /* skip '=' */ return (-1); count = strtol(q, &e, 0); if (e == q || *e != '\0') return (-1); pmc_config->pm_caps |= PMC_CAP_THRESHOLD; pmc_config->pm_md.pm_amd.pm_amd_config |= AMD_PMC_TO_COUNTER(count); } else if (KWMATCH(p, K8_KW_EDGE)) { pmc_config->pm_caps |= PMC_CAP_EDGE; } else if (KWMATCH(p, K8_KW_INV)) { pmc_config->pm_caps |= PMC_CAP_INVERT; } else if (KWPREFIXMATCH(p, K8_KW_MASK "=")) { if ((n = pmc_parse_mask(pmask, p, &evmask)) < 0) return (-1); pmc_config->pm_caps |= PMC_CAP_QUALIFIER; } else if (KWMATCH(p, K8_KW_OS)) { pmc_config->pm_caps |= PMC_CAP_SYSTEM; } else if (KWMATCH(p, K8_KW_USR)) { pmc_config->pm_caps |= PMC_CAP_USER; } else return (-1); } /* other post processing */ switch (pe) { case PMC_EV_K8_FP_DISPATCHED_FPU_OPS: case PMC_EV_K8_FP_CYCLES_WITH_NO_FPU_OPS_RETIRED: case PMC_EV_K8_FP_DISPATCHED_FPU_FAST_FLAG_OPS: case PMC_EV_K8_FR_RETIRED_FASTPATH_DOUBLE_OP_INSTRUCTIONS: case PMC_EV_K8_FR_RETIRED_FPU_INSTRUCTIONS: case PMC_EV_K8_FR_FPU_EXCEPTIONS: /* XXX only available in rev B and later */ break; case PMC_EV_K8_DC_DCACHE_ACCESSES_BY_LOCKS: /* XXX only available in rev C and later */ break; case PMC_EV_K8_LS_LOCKED_OPERATION: /* XXX CPU Rev A,B evmask is to be zero */ if (evmask & (evmask - 1)) /* > 1 bit set */ return (-1); if (evmask == 0) { evmask = 0x01; /* Rev C and later: #instrs */ pmc_config->pm_caps |= PMC_CAP_QUALIFIER; } break; 
default:
		/*
		 * No explicit mask was parsed: default to the OR of every
		 * mask value listed for this event.
		 */
		if (evmask == 0 && pmask != NULL) {
			for (pm = pmask; pm->pm_name; pm++)
				evmask |= pm->pm_value;
			pmc_config->pm_caps |= PMC_CAP_QUALIFIER;
		}
	}

	/*
	 * NOTE(review): this is a plain assignment, so the bits OR-ed into
	 * pm_amd_config by the "count=" keyword earlier in this function are
	 * overwritten whenever a qualifier is present -- confirm whether
	 * '|=' was intended.
	 */
	if (pmc_config->pm_caps & PMC_CAP_QUALIFIER)
		pmc_config->pm_md.pm_amd.pm_amd_config =
		    AMD_PMC_TO_UNITMASK(evmask);

	return (0);
}

#endif

#if defined(__amd64__) || defined(__i386__)

/*
 * Intel P4 PMCs
 */

/* Generic event names and the P4 event specifiers they expand to. */
static struct pmc_event_alias p4_aliases[] = {
	EV_ALIAS("branches",		"p4-branch-retired,mask=mmtp+mmtm"),
	EV_ALIAS("branch-mispredicts",	"p4-mispred-branch-retired"),
	EV_ALIAS("cycles",		"tsc"),
	EV_ALIAS("instructions",
	    "p4-instr-retired,mask=nbogusntag+nbogustag"),
	EV_ALIAS("unhalted-cycles",	"p4-global-power-events"),
	EV_ALIAS(NULL, NULL)
};

/* Keywords accepted in a P4 event specifier. */
#define	P4_KW_ACTIVE	"active"
#define	P4_KW_ACTIVE_ANY "any"
#define	P4_KW_ACTIVE_BOTH "both"
#define	P4_KW_ACTIVE_NONE "none"
#define	P4_KW_ACTIVE_SINGLE "single"
#define	P4_KW_BUSREQTYPE "busreqtype"
#define	P4_KW_CASCADE	"cascade"
#define	P4_KW_EDGE	"edge"
#define	P4_KW_INV	"complement"
#define	P4_KW_OS	"os"
#define	P4_KW_MASK	"mask"
#define	P4_KW_PRECISE	"precise"
#define	P4_KW_TAG	"tag"
#define	P4_KW_THRESHOLD	"threshold"
#define	P4_KW_USR	"usr"

/*
 * Mask table entry for qualifier name N whose value is the single
 * bit (1 << V).  Every table below ends with NULLMASK.
 */
#define	__P4MASK(N,V) PMCMASK(N, (1 << (V)))

static const struct pmc_masks p4_mask_tcdm[] = { /* tc deliver mode */
	__P4MASK(dd, 0),
	__P4MASK(db, 1),
	__P4MASK(di, 2),
	__P4MASK(bd, 3),
	__P4MASK(bb, 4),
	__P4MASK(bi, 5),
	__P4MASK(id, 6),
	__P4MASK(ib, 7),
	NULLMASK
};

static const struct pmc_masks p4_mask_bfr[] = { /* bpu fetch request */
	__P4MASK(tcmiss, 0),
	NULLMASK,
};

static const struct pmc_masks p4_mask_ir[] = { /* itlb reference */
	__P4MASK(hit, 0),
	__P4MASK(miss, 1),
	__P4MASK(hit-uc, 2),
	NULLMASK
};

static const struct pmc_masks p4_mask_memcan[] = { /* memory cancel */
	__P4MASK(st-rb-full, 2),
	__P4MASK(64k-conf, 3),
	NULLMASK
};

static const struct pmc_masks p4_mask_memcomp[] = { /* memory complete */
	__P4MASK(lsc, 0),
	__P4MASK(ssc, 1),
	NULLMASK
};

static const struct pmc_masks p4_mask_lpr[] = { /* load port replay */
	__P4MASK(split-ld, 1),
	NULLMASK
};

static const struct pmc_masks p4_mask_spr[] = { /* store port replay */
	__P4MASK(split-st, 1),
	NULLMASK
};

static const struct pmc_masks p4_mask_mlr[] = { /* mob load replay */
	__P4MASK(no-sta, 1),
	__P4MASK(no-std, 3),
	__P4MASK(partial-data, 4),
	__P4MASK(unalgn-addr, 5),
	NULLMASK
};

static const struct pmc_masks p4_mask_pwt[] = { /* page walk type */
	__P4MASK(dtmiss, 0),
	__P4MASK(itmiss, 1),
	NULLMASK
};

static const struct pmc_masks p4_mask_bcr[] = { /* bsq cache reference */
	__P4MASK(rd-2ndl-hits, 0),
	__P4MASK(rd-2ndl-hite, 1),
	__P4MASK(rd-2ndl-hitm, 2),
	__P4MASK(rd-3rdl-hits, 3),
	__P4MASK(rd-3rdl-hite, 4),
	__P4MASK(rd-3rdl-hitm, 5),
	__P4MASK(rd-2ndl-miss, 8),
	__P4MASK(rd-3rdl-miss, 9),
	__P4MASK(wr-2ndl-miss, 10),
	NULLMASK
};

static const struct pmc_masks p4_mask_ia[] = { /* ioq allocation */
	__P4MASK(all-read, 5),
	__P4MASK(all-write, 6),
	__P4MASK(mem-uc, 7),
	__P4MASK(mem-wc, 8),
	__P4MASK(mem-wt, 9),
	__P4MASK(mem-wp, 10),
	__P4MASK(mem-wb, 11),
	__P4MASK(own, 13),
	__P4MASK(other, 14),
	__P4MASK(prefetch, 15),
	NULLMASK
};

static const struct pmc_masks p4_mask_iae[] = { /* ioq active entries */
	__P4MASK(all-read, 5),
	__P4MASK(all-write, 6),
	__P4MASK(mem-uc, 7),
	__P4MASK(mem-wc, 8),
	__P4MASK(mem-wt, 9),
	__P4MASK(mem-wp, 10),
	__P4MASK(mem-wb, 11),
	__P4MASK(own, 13),
	__P4MASK(other, 14),
	__P4MASK(prefetch, 15),
	NULLMASK
};

static const struct pmc_masks p4_mask_fda[] = { /* fsb data activity */
	__P4MASK(drdy-drv, 0),
	__P4MASK(drdy-own, 1),
	__P4MASK(drdy-other, 2),
	__P4MASK(dbsy-drv, 3),
	__P4MASK(dbsy-own, 4),
	__P4MASK(dbsy-other, 5),
	NULLMASK
};

static const struct pmc_masks p4_mask_ba[] = { /* bsq allocation */
	__P4MASK(req-type0, 0),
	__P4MASK(req-type1, 1),
	__P4MASK(req-len0, 2),
	__P4MASK(req-len1, 3),
	__P4MASK(req-io-type, 5),
	__P4MASK(req-lock-type, 6),
	__P4MASK(req-cache-type, 7),
	__P4MASK(req-split-type, 8),
	__P4MASK(req-dem-type, 9),
	__P4MASK(req-ord-type, 10),
	__P4MASK(mem-type0, 11),
	__P4MASK(mem-type1, 12),
	__P4MASK(mem-type2,
13), NULLMASK }; static const struct pmc_masks p4_mask_sia[] = { /* sse input assist */ __P4MASK(all, 15), NULLMASK }; static const struct pmc_masks p4_mask_psu[] = { /* packed sp uop */ __P4MASK(all, 15), NULLMASK }; static const struct pmc_masks p4_mask_pdu[] = { /* packed dp uop */ __P4MASK(all, 15), NULLMASK }; static const struct pmc_masks p4_mask_ssu[] = { /* scalar sp uop */ __P4MASK(all, 15), NULLMASK }; static const struct pmc_masks p4_mask_sdu[] = { /* scalar dp uop */ __P4MASK(all, 15), NULLMASK }; static const struct pmc_masks p4_mask_64bmu[] = { /* 64 bit mmx uop */ __P4MASK(all, 15), NULLMASK }; static const struct pmc_masks p4_mask_128bmu[] = { /* 128 bit mmx uop */ __P4MASK(all, 15), NULLMASK }; static const struct pmc_masks p4_mask_xfu[] = { /* X87 fp uop */ __P4MASK(all, 15), NULLMASK }; static const struct pmc_masks p4_mask_xsmu[] = { /* x87 simd moves uop */ __P4MASK(allp0, 3), __P4MASK(allp2, 4), NULLMASK }; static const struct pmc_masks p4_mask_gpe[] = { /* global power events */ __P4MASK(running, 0), NULLMASK }; static const struct pmc_masks p4_mask_tmx[] = { /* TC ms xfer */ __P4MASK(cisc, 0), NULLMASK }; static const struct pmc_masks p4_mask_uqw[] = { /* uop queue writes */ __P4MASK(from-tc-build, 0), __P4MASK(from-tc-deliver, 1), __P4MASK(from-rom, 2), NULLMASK }; static const struct pmc_masks p4_mask_rmbt[] = { /* retired mispred branch type */ __P4MASK(conditional, 1), __P4MASK(call, 2), __P4MASK(return, 3), __P4MASK(indirect, 4), NULLMASK }; static const struct pmc_masks p4_mask_rbt[] = { /* retired branch type */ __P4MASK(conditional, 1), __P4MASK(call, 2), __P4MASK(retired, 3), __P4MASK(indirect, 4), NULLMASK }; static const struct pmc_masks p4_mask_rs[] = { /* resource stall */ __P4MASK(sbfull, 5), NULLMASK }; static const struct pmc_masks p4_mask_wb[] = { /* WC buffer */ __P4MASK(wcb-evicts, 0), __P4MASK(wcb-full-evict, 1), NULLMASK }; static const struct pmc_masks p4_mask_fee[] = { /* front end event */ __P4MASK(nbogus, 0), 
__P4MASK(bogus, 1), NULLMASK }; static const struct pmc_masks p4_mask_ee[] = { /* execution event */ __P4MASK(nbogus0, 0), __P4MASK(nbogus1, 1), __P4MASK(nbogus2, 2), __P4MASK(nbogus3, 3), __P4MASK(bogus0, 4), __P4MASK(bogus1, 5), __P4MASK(bogus2, 6), __P4MASK(bogus3, 7), NULLMASK }; static const struct pmc_masks p4_mask_re[] = { /* replay event */ __P4MASK(nbogus, 0), __P4MASK(bogus, 1), NULLMASK }; static const struct pmc_masks p4_mask_insret[] = { /* instr retired */ __P4MASK(nbogusntag, 0), __P4MASK(nbogustag, 1), __P4MASK(bogusntag, 2), __P4MASK(bogustag, 3), NULLMASK }; static const struct pmc_masks p4_mask_ur[] = { /* uops retired */ __P4MASK(nbogus, 0), __P4MASK(bogus, 1), NULLMASK }; static const struct pmc_masks p4_mask_ut[] = { /* uop type */ __P4MASK(tagloads, 1), __P4MASK(tagstores, 2), NULLMASK }; static const struct pmc_masks p4_mask_br[] = { /* branch retired */ __P4MASK(mmnp, 0), __P4MASK(mmnm, 1), __P4MASK(mmtp, 2), __P4MASK(mmtm, 3), NULLMASK }; static const struct pmc_masks p4_mask_mbr[] = { /* mispred branch retired */ __P4MASK(nbogus, 0), NULLMASK }; static const struct pmc_masks p4_mask_xa[] = { /* x87 assist */ __P4MASK(fpsu, 0), __P4MASK(fpso, 1), __P4MASK(poao, 2), __P4MASK(poau, 3), __P4MASK(prea, 4), NULLMASK }; static const struct pmc_masks p4_mask_machclr[] = { /* machine clear */ __P4MASK(clear, 0), __P4MASK(moclear, 2), __P4MASK(smclear, 3), NULLMASK }; /* P4 event parser */ static int p4_allocate_pmc(enum pmc_event pe, char *ctrspec, struct pmc_op_pmcallocate *pmc_config) { char *e, *p, *q; int count, has_tag, has_busreqtype, n; uint32_t evmask, cccractivemask; const struct pmc_masks *pm, *pmask; pmc_config->pm_caps |= (PMC_CAP_READ | PMC_CAP_WRITE); pmc_config->pm_md.pm_p4.pm_p4_cccrconfig = pmc_config->pm_md.pm_p4.pm_p4_escrconfig = 0; pmask = NULL; evmask = 0; cccractivemask = 0x3; has_tag = has_busreqtype = 0; #define __P4SETMASK(M) do { \ pmask = p4_mask_##M; \ } while (0) switch (pe) { case PMC_EV_P4_TC_DELIVER_MODE: 
__P4SETMASK(tcdm); break; case PMC_EV_P4_BPU_FETCH_REQUEST: __P4SETMASK(bfr); break; case PMC_EV_P4_ITLB_REFERENCE: __P4SETMASK(ir); break; case PMC_EV_P4_MEMORY_CANCEL: __P4SETMASK(memcan); break; case PMC_EV_P4_MEMORY_COMPLETE: __P4SETMASK(memcomp); break; case PMC_EV_P4_LOAD_PORT_REPLAY: __P4SETMASK(lpr); break; case PMC_EV_P4_STORE_PORT_REPLAY: __P4SETMASK(spr); break; case PMC_EV_P4_MOB_LOAD_REPLAY: __P4SETMASK(mlr); break; case PMC_EV_P4_PAGE_WALK_TYPE: __P4SETMASK(pwt); break; case PMC_EV_P4_BSQ_CACHE_REFERENCE: __P4SETMASK(bcr); break; case PMC_EV_P4_IOQ_ALLOCATION: __P4SETMASK(ia); has_busreqtype = 1; break; case PMC_EV_P4_IOQ_ACTIVE_ENTRIES: __P4SETMASK(iae); has_busreqtype = 1; break; case PMC_EV_P4_FSB_DATA_ACTIVITY: __P4SETMASK(fda); break; case PMC_EV_P4_BSQ_ALLOCATION: __P4SETMASK(ba); break; case PMC_EV_P4_SSE_INPUT_ASSIST: __P4SETMASK(sia); break; case PMC_EV_P4_PACKED_SP_UOP: __P4SETMASK(psu); break; case PMC_EV_P4_PACKED_DP_UOP: __P4SETMASK(pdu); break; case PMC_EV_P4_SCALAR_SP_UOP: __P4SETMASK(ssu); break; case PMC_EV_P4_SCALAR_DP_UOP: __P4SETMASK(sdu); break; case PMC_EV_P4_64BIT_MMX_UOP: __P4SETMASK(64bmu); break; case PMC_EV_P4_128BIT_MMX_UOP: __P4SETMASK(128bmu); break; case PMC_EV_P4_X87_FP_UOP: __P4SETMASK(xfu); break; case PMC_EV_P4_X87_SIMD_MOVES_UOP: __P4SETMASK(xsmu); break; case PMC_EV_P4_GLOBAL_POWER_EVENTS: __P4SETMASK(gpe); break; case PMC_EV_P4_TC_MS_XFER: __P4SETMASK(tmx); break; case PMC_EV_P4_UOP_QUEUE_WRITES: __P4SETMASK(uqw); break; case PMC_EV_P4_RETIRED_MISPRED_BRANCH_TYPE: __P4SETMASK(rmbt); break; case PMC_EV_P4_RETIRED_BRANCH_TYPE: __P4SETMASK(rbt); break; case PMC_EV_P4_RESOURCE_STALL: __P4SETMASK(rs); break; case PMC_EV_P4_WC_BUFFER: __P4SETMASK(wb); break; case PMC_EV_P4_BSQ_ACTIVE_ENTRIES: case PMC_EV_P4_B2B_CYCLES: case PMC_EV_P4_BNR: case PMC_EV_P4_SNOOP: case PMC_EV_P4_RESPONSE: break; case PMC_EV_P4_FRONT_END_EVENT: __P4SETMASK(fee); break; case PMC_EV_P4_EXECUTION_EVENT: __P4SETMASK(ee); break; case 
PMC_EV_P4_REPLAY_EVENT: __P4SETMASK(re); break; case PMC_EV_P4_INSTR_RETIRED: __P4SETMASK(insret); break; case PMC_EV_P4_UOPS_RETIRED: __P4SETMASK(ur); break; case PMC_EV_P4_UOP_TYPE: __P4SETMASK(ut); break; case PMC_EV_P4_BRANCH_RETIRED: __P4SETMASK(br); break; case PMC_EV_P4_MISPRED_BRANCH_RETIRED: __P4SETMASK(mbr); break; case PMC_EV_P4_X87_ASSIST: __P4SETMASK(xa); break; case PMC_EV_P4_MACHINE_CLEAR: __P4SETMASK(machclr); break; default: return (-1); } /* process additional flags */ while ((p = strsep(&ctrspec, ",")) != NULL) { if (KWPREFIXMATCH(p, P4_KW_ACTIVE)) { q = strchr(p, '='); if (*++q == '\0') /* skip '=' */ return (-1); if (strcasecmp(q, P4_KW_ACTIVE_NONE) == 0) cccractivemask = 0x0; else if (strcasecmp(q, P4_KW_ACTIVE_SINGLE) == 0) cccractivemask = 0x1; else if (strcasecmp(q, P4_KW_ACTIVE_BOTH) == 0) cccractivemask = 0x2; else if (strcasecmp(q, P4_KW_ACTIVE_ANY) == 0) cccractivemask = 0x3; else return (-1); } else if (KWPREFIXMATCH(p, P4_KW_BUSREQTYPE)) { if (has_busreqtype == 0) return (-1); q = strchr(p, '='); if (*++q == '\0') /* skip '=' */ return (-1); count = strtol(q, &e, 0); if (e == q || *e != '\0') return (-1); evmask = (evmask & ~0x1F) | (count & 0x1F); } else if (KWMATCH(p, P4_KW_CASCADE)) pmc_config->pm_caps |= PMC_CAP_CASCADE; else if (KWMATCH(p, P4_KW_EDGE)) pmc_config->pm_caps |= PMC_CAP_EDGE; else if (KWMATCH(p, P4_KW_INV)) pmc_config->pm_caps |= PMC_CAP_INVERT; else if (KWPREFIXMATCH(p, P4_KW_MASK "=")) { if ((n = pmc_parse_mask(pmask, p, &evmask)) < 0) return (-1); pmc_config->pm_caps |= PMC_CAP_QUALIFIER; } else if (KWMATCH(p, P4_KW_OS)) pmc_config->pm_caps |= PMC_CAP_SYSTEM; else if (KWMATCH(p, P4_KW_PRECISE)) pmc_config->pm_caps |= PMC_CAP_PRECISE; else if (KWPREFIXMATCH(p, P4_KW_TAG "=")) { if (has_tag == 0) return (-1); q = strchr(p, '='); if (*++q == '\0') /* skip '=' */ return (-1); count = strtol(q, &e, 0); if (e == q || *e != '\0') return (-1); pmc_config->pm_caps |= PMC_CAP_TAGGING; 
pmc_config->pm_md.pm_p4.pm_p4_escrconfig |= P4_ESCR_TO_TAG_VALUE(count); } else if (KWPREFIXMATCH(p, P4_KW_THRESHOLD "=")) { q = strchr(p, '='); if (*++q == '\0') /* skip '=' */ return (-1); count = strtol(q, &e, 0); if (e == q || *e != '\0') return (-1); pmc_config->pm_caps |= PMC_CAP_THRESHOLD; pmc_config->pm_md.pm_p4.pm_p4_cccrconfig &= ~P4_CCCR_THRESHOLD_MASK; pmc_config->pm_md.pm_p4.pm_p4_cccrconfig |= P4_CCCR_TO_THRESHOLD(count); } else if (KWMATCH(p, P4_KW_USR)) pmc_config->pm_caps |= PMC_CAP_USER; else return (-1); } /* other post processing */ if (pe == PMC_EV_P4_IOQ_ALLOCATION || pe == PMC_EV_P4_FSB_DATA_ACTIVITY || pe == PMC_EV_P4_BSQ_ALLOCATION) pmc_config->pm_caps |= PMC_CAP_EDGE; /* fill in thread activity mask */ pmc_config->pm_md.pm_p4.pm_p4_cccrconfig |= P4_CCCR_TO_ACTIVE_THREAD(cccractivemask); if (evmask) pmc_config->pm_caps |= PMC_CAP_QUALIFIER; switch (pe) { case PMC_EV_P4_FSB_DATA_ACTIVITY: if ((evmask & 0x06) == 0x06 || (evmask & 0x18) == 0x18) return (-1); /* can't have own+other bits together */ if (evmask == 0) /* default:drdy-{drv,own}+dbsy{drv,own} */ evmask = 0x1D; break; case PMC_EV_P4_MACHINE_CLEAR: /* only one bit is allowed to be set */ if ((evmask & (evmask - 1)) != 0) return (-1); if (evmask == 0) { evmask = 0x1; /* 'CLEAR' */ pmc_config->pm_caps |= PMC_CAP_QUALIFIER; } break; default: if (evmask == 0 && pmask) { for (pm = pmask; pm->pm_name; pm++) evmask |= pm->pm_value; pmc_config->pm_caps |= PMC_CAP_QUALIFIER; } } pmc_config->pm_md.pm_p4.pm_p4_escrconfig = P4_ESCR_TO_EVENT_MASK(evmask); return (0); } #endif #if defined(__i386__) /* * Pentium style PMCs */ static struct pmc_event_alias p5_aliases[] = { EV_ALIAS("branches", "p5-taken-branches"), EV_ALIAS("cycles", "tsc"), EV_ALIAS("dc-misses", "p5-data-read-miss-or-write-miss"), EV_ALIAS("ic-misses", "p5-code-cache-miss"), EV_ALIAS("instructions", "p5-instructions-executed"), EV_ALIAS("interrupts", "p5-hardware-interrupts"), EV_ALIAS("unhalted-cycles", 
"p5-number-of-cycles-not-in-halt-state"), EV_ALIAS(NULL, NULL) }; static int p5_allocate_pmc(enum pmc_event pe, char *ctrspec, struct pmc_op_pmcallocate *pmc_config) { return (-1 || pe || ctrspec || pmc_config); /* shut up gcc */ } /* * Pentium Pro style PMCs. These PMCs are found in Pentium II, Pentium III, * and Pentium M CPUs. */ static struct pmc_event_alias p6_aliases[] = { EV_ALIAS("branches", "p6-br-inst-retired"), EV_ALIAS("branch-mispredicts", "p6-br-miss-pred-retired"), EV_ALIAS("cycles", "tsc"), EV_ALIAS("dc-misses", "p6-dcu-lines-in"), EV_ALIAS("ic-misses", "p6-ifu-fetch-miss"), EV_ALIAS("instructions", "p6-inst-retired"), EV_ALIAS("interrupts", "p6-hw-int-rx"), EV_ALIAS("unhalted-cycles", "p6-cpu-clk-unhalted"), EV_ALIAS(NULL, NULL) }; #define P6_KW_CMASK "cmask" #define P6_KW_EDGE "edge" #define P6_KW_INV "inv" #define P6_KW_OS "os" #define P6_KW_UMASK "umask" #define P6_KW_USR "usr" static struct pmc_masks p6_mask_mesi[] = { PMCMASK(m, 0x01), PMCMASK(e, 0x02), PMCMASK(s, 0x04), PMCMASK(i, 0x08), NULLMASK }; static struct pmc_masks p6_mask_mesihw[] = { PMCMASK(m, 0x01), PMCMASK(e, 0x02), PMCMASK(s, 0x04), PMCMASK(i, 0x08), PMCMASK(nonhw, 0x00), PMCMASK(hw, 0x10), PMCMASK(both, 0x30), NULLMASK }; static struct pmc_masks p6_mask_hw[] = { PMCMASK(nonhw, 0x00), PMCMASK(hw, 0x10), PMCMASK(both, 0x30), NULLMASK }; static struct pmc_masks p6_mask_any[] = { PMCMASK(self, 0x00), PMCMASK(any, 0x20), NULLMASK }; static struct pmc_masks p6_mask_ekp[] = { PMCMASK(nta, 0x00), PMCMASK(t1, 0x01), PMCMASK(t2, 0x02), PMCMASK(wos, 0x03), NULLMASK }; static struct pmc_masks p6_mask_pps[] = { PMCMASK(packed-and-scalar, 0x00), PMCMASK(scalar, 0x01), NULLMASK }; static struct pmc_masks p6_mask_mite[] = { PMCMASK(packed-multiply, 0x01), PMCMASK(packed-shift, 0x02), PMCMASK(pack, 0x04), PMCMASK(unpack, 0x08), PMCMASK(packed-logical, 0x10), PMCMASK(packed-arithmetic, 0x20), NULLMASK }; static struct pmc_masks p6_mask_fmt[] = { PMCMASK(mmxtofp, 0x00), PMCMASK(fptommx, 0x01), 
NULLMASK }; static struct pmc_masks p6_mask_sr[] = { PMCMASK(es, 0x01), PMCMASK(ds, 0x02), PMCMASK(fs, 0x04), PMCMASK(gs, 0x08), NULLMASK }; static struct pmc_masks p6_mask_eet[] = { PMCMASK(all, 0x00), PMCMASK(freq, 0x02), NULLMASK }; static struct pmc_masks p6_mask_efur[] = { PMCMASK(all, 0x00), PMCMASK(loadop, 0x01), PMCMASK(stdsta, 0x02), NULLMASK }; static struct pmc_masks p6_mask_essir[] = { PMCMASK(sse-packed-single, 0x00), PMCMASK(sse-packed-single-scalar-single, 0x01), PMCMASK(sse2-packed-double, 0x02), PMCMASK(sse2-scalar-double, 0x03), NULLMASK }; static struct pmc_masks p6_mask_esscir[] = { PMCMASK(sse-packed-single, 0x00), PMCMASK(sse-scalar-single, 0x01), PMCMASK(sse2-packed-double, 0x02), PMCMASK(sse2-scalar-double, 0x03), NULLMASK }; /* P6 event parser */ static int p6_allocate_pmc(enum pmc_event pe, char *ctrspec, struct pmc_op_pmcallocate *pmc_config) { char *e, *p, *q; uint32_t evmask; int count, n; const struct pmc_masks *pm, *pmask; pmc_config->pm_caps |= (PMC_CAP_READ | PMC_CAP_WRITE); pmc_config->pm_md.pm_ppro.pm_ppro_config = 0; evmask = 0; #define P6MASKSET(M) pmask = p6_mask_ ## M switch(pe) { case PMC_EV_P6_L2_IFETCH: P6MASKSET(mesi); break; case PMC_EV_P6_L2_LD: P6MASKSET(mesi); break; case PMC_EV_P6_L2_ST: P6MASKSET(mesi); break; case PMC_EV_P6_L2_RQSTS: P6MASKSET(mesi); break; case PMC_EV_P6_BUS_DRDY_CLOCKS: case PMC_EV_P6_BUS_LOCK_CLOCKS: case PMC_EV_P6_BUS_TRAN_BRD: case PMC_EV_P6_BUS_TRAN_RFO: case PMC_EV_P6_BUS_TRANS_WB: case PMC_EV_P6_BUS_TRAN_IFETCH: case PMC_EV_P6_BUS_TRAN_INVAL: case PMC_EV_P6_BUS_TRAN_PWR: case PMC_EV_P6_BUS_TRANS_P: case PMC_EV_P6_BUS_TRANS_IO: case PMC_EV_P6_BUS_TRAN_DEF: case PMC_EV_P6_BUS_TRAN_BURST: case PMC_EV_P6_BUS_TRAN_ANY: case PMC_EV_P6_BUS_TRAN_MEM: P6MASKSET(any); break; case PMC_EV_P6_EMON_KNI_PREF_DISPATCHED: case PMC_EV_P6_EMON_KNI_PREF_MISS: P6MASKSET(ekp); break; case PMC_EV_P6_EMON_KNI_INST_RETIRED: case PMC_EV_P6_EMON_KNI_COMP_INST_RET: P6MASKSET(pps); break; case 
PMC_EV_P6_MMX_INSTR_TYPE_EXEC: P6MASKSET(mite); break; case PMC_EV_P6_FP_MMX_TRANS: P6MASKSET(fmt); break; case PMC_EV_P6_SEG_RENAME_STALLS: case PMC_EV_P6_SEG_REG_RENAMES: P6MASKSET(sr); break; case PMC_EV_P6_EMON_EST_TRANS: P6MASKSET(eet); break; case PMC_EV_P6_EMON_FUSED_UOPS_RET: P6MASKSET(efur); break; case PMC_EV_P6_EMON_SSE_SSE2_INST_RETIRED: P6MASKSET(essir); break; case PMC_EV_P6_EMON_SSE_SSE2_COMP_INST_RETIRED: P6MASKSET(esscir); break; default: pmask = NULL; break; } /* Pentium M PMCs have a few events with different semantics */ if (cpu_info.pm_cputype == PMC_CPU_INTEL_PM) { if (pe == PMC_EV_P6_L2_LD || pe == PMC_EV_P6_L2_LINES_IN || pe == PMC_EV_P6_L2_LINES_OUT) P6MASKSET(mesihw); else if (pe == PMC_EV_P6_L2_M_LINES_OUTM) P6MASKSET(hw); } /* Parse additional modifiers if present */ while ((p = strsep(&ctrspec, ",")) != NULL) { if (KWPREFIXMATCH(p, P6_KW_CMASK "=")) { q = strchr(p, '='); if (*++q == '\0') /* skip '=' */ return (-1); count = strtol(q, &e, 0); if (e == q || *e != '\0') return (-1); pmc_config->pm_caps |= PMC_CAP_THRESHOLD; pmc_config->pm_md.pm_ppro.pm_ppro_config |= P6_EVSEL_TO_CMASK(count); } else if (KWMATCH(p, P6_KW_EDGE)) { pmc_config->pm_caps |= PMC_CAP_EDGE; } else if (KWMATCH(p, P6_KW_INV)) { pmc_config->pm_caps |= PMC_CAP_INVERT; } else if (KWMATCH(p, P6_KW_OS)) { pmc_config->pm_caps |= PMC_CAP_SYSTEM; } else if (KWPREFIXMATCH(p, P6_KW_UMASK "=")) { evmask = 0; if ((n = pmc_parse_mask(pmask, p, &evmask)) < 0) return (-1); if ((pe == PMC_EV_P6_BUS_DRDY_CLOCKS || pe == PMC_EV_P6_BUS_LOCK_CLOCKS || pe == PMC_EV_P6_BUS_TRAN_BRD || pe == PMC_EV_P6_BUS_TRAN_RFO || pe == PMC_EV_P6_BUS_TRAN_IFETCH || pe == PMC_EV_P6_BUS_TRAN_INVAL || pe == PMC_EV_P6_BUS_TRAN_PWR || pe == PMC_EV_P6_BUS_TRAN_DEF || pe == PMC_EV_P6_BUS_TRAN_BURST || pe == PMC_EV_P6_BUS_TRAN_ANY || pe == PMC_EV_P6_BUS_TRAN_MEM || pe == PMC_EV_P6_BUS_TRANS_IO || pe == PMC_EV_P6_BUS_TRANS_P || pe == PMC_EV_P6_BUS_TRANS_WB || pe == PMC_EV_P6_EMON_EST_TRANS || pe == 
PMC_EV_P6_EMON_FUSED_UOPS_RET || pe == PMC_EV_P6_EMON_KNI_COMP_INST_RET || pe == PMC_EV_P6_EMON_KNI_INST_RETIRED || pe == PMC_EV_P6_EMON_KNI_PREF_DISPATCHED || pe == PMC_EV_P6_EMON_KNI_PREF_MISS || pe == PMC_EV_P6_EMON_SSE_SSE2_COMP_INST_RETIRED || pe == PMC_EV_P6_EMON_SSE_SSE2_INST_RETIRED || pe == PMC_EV_P6_FP_MMX_TRANS) && (n > 1)) /* Only one mask keyword is allowed. */ return (-1); pmc_config->pm_caps |= PMC_CAP_QUALIFIER; } else if (KWMATCH(p, P6_KW_USR)) { pmc_config->pm_caps |= PMC_CAP_USER; } else return (-1); } /* post processing */ switch (pe) { /* * The following events default to an evmask of 0 */ /* default => 'self' */ case PMC_EV_P6_BUS_DRDY_CLOCKS: case PMC_EV_P6_BUS_LOCK_CLOCKS: case PMC_EV_P6_BUS_TRAN_BRD: case PMC_EV_P6_BUS_TRAN_RFO: case PMC_EV_P6_BUS_TRANS_WB: case PMC_EV_P6_BUS_TRAN_IFETCH: case PMC_EV_P6_BUS_TRAN_INVAL: case PMC_EV_P6_BUS_TRAN_PWR: case PMC_EV_P6_BUS_TRANS_P: case PMC_EV_P6_BUS_TRANS_IO: case PMC_EV_P6_BUS_TRAN_DEF: case PMC_EV_P6_BUS_TRAN_BURST: case PMC_EV_P6_BUS_TRAN_ANY: case PMC_EV_P6_BUS_TRAN_MEM: /* default => 'nta' */ case PMC_EV_P6_EMON_KNI_PREF_DISPATCHED: case PMC_EV_P6_EMON_KNI_PREF_MISS: /* default => 'packed and scalar' */ case PMC_EV_P6_EMON_KNI_INST_RETIRED: case PMC_EV_P6_EMON_KNI_COMP_INST_RET: /* default => 'mmx to fp transitions' */ case PMC_EV_P6_FP_MMX_TRANS: /* default => 'SSE Packed Single' */ case PMC_EV_P6_EMON_SSE_SSE2_INST_RETIRED: case PMC_EV_P6_EMON_SSE_SSE2_COMP_INST_RETIRED: /* default => 'all fused micro-ops' */ case PMC_EV_P6_EMON_FUSED_UOPS_RET: /* default => 'all transitions' */ case PMC_EV_P6_EMON_EST_TRANS: break; case PMC_EV_P6_MMX_UOPS_EXEC: evmask = 0x0F; /* only value allowed */ break; default: /* * For all other events, set the default event mask * to a logical OR of all the allowed event mask bits. 
*/ if (evmask == 0 && pmask) { for (pm = pmask; pm->pm_name; pm++) evmask |= pm->pm_value; pmc_config->pm_caps |= PMC_CAP_QUALIFIER; } break; } if (pmc_config->pm_caps & PMC_CAP_QUALIFIER) pmc_config->pm_md.pm_ppro.pm_ppro_config |= P6_EVSEL_TO_UMASK(evmask); return (0); } #endif #if defined(__i386__) || defined(__amd64__) static int tsc_allocate_pmc(enum pmc_event pe, char *ctrspec, struct pmc_op_pmcallocate *pmc_config) { if (pe != PMC_EV_TSC_TSC) return (-1); /* TSC events must be unqualified. */ if (ctrspec && *ctrspec != '\0') return (-1); pmc_config->pm_md.pm_amd.pm_amd_config = 0; pmc_config->pm_caps |= PMC_CAP_READ; return (0); } #endif /* * Match an event name `name' with its canonical form. * * Matches are case insensitive and spaces, periods, underscores and * hyphen characters are considered to match each other. * * Returns 1 for a match, 0 otherwise. */ static int pmc_match_event_name(const char *name, const char *canonicalname) { int cc, nc; const unsigned char *c, *n; c = (const unsigned char *) canonicalname; n = (const unsigned char *) name; for (; (nc = *n) && (cc = *c); n++, c++) { if ((nc == ' ' || nc == '_' || nc == '-' || nc == '.') && (cc == ' ' || cc == '_' || cc == '-' || cc == '.')) continue; if (toupper(nc) == toupper(cc)) continue; return (0); } if (*n == '\0' && *c == '\0') return (1); return (0); } /* * Match an event name against all the event named supported by a * PMC class. * * Returns an event descriptor pointer on match or NULL otherwise. 
*/ static const struct pmc_event_descr * pmc_match_event_class(const char *name, const struct pmc_class_descr *pcd) { size_t n; const struct pmc_event_descr *ev; ev = pcd->pm_evc_event_table; for (n = 0; n < pcd->pm_evc_event_table_size; n++, ev++) if (pmc_match_event_name(name, ev->pm_ev_name)) return (ev); return (NULL); } static int pmc_mdep_is_compatible_class(enum pmc_class pc) { size_t n; for (n = 0; n < pmc_mdep_class_list_size; n++) if (pmc_mdep_class_list[n] == pc) return (1); return (0); } /* * API entry points */ int pmc_allocate(const char *ctrspec, enum pmc_mode mode, uint32_t flags, int cpu, pmc_id_t *pmcid) { size_t n; int retval; char *r, *spec_copy; const char *ctrname; const struct pmc_event_descr *ev; const struct pmc_event_alias *alias; struct pmc_op_pmcallocate pmc_config; const struct pmc_class_descr *pcd; spec_copy = NULL; retval = -1; if (mode != PMC_MODE_SS && mode != PMC_MODE_TS && mode != PMC_MODE_SC && mode != PMC_MODE_TC) { errno = EINVAL; goto out; } /* replace an event alias with the canonical event specifier */ if (pmc_mdep_event_aliases) for (alias = pmc_mdep_event_aliases; alias->pm_alias; alias++) if (!strcasecmp(ctrspec, alias->pm_alias)) { spec_copy = strdup(alias->pm_spec); break; } if (spec_copy == NULL) spec_copy = strdup(ctrspec); r = spec_copy; ctrname = strsep(&r, ","); /* * If a explicit class prefix was given by the user, restrict the * search for the event to the specified PMC class. */ ev = NULL; for (n = 0; n < PMC_CLASS_TABLE_SIZE; n++) { pcd = pmc_class_table[n]; if (pmc_mdep_is_compatible_class(pcd->pm_evc_class) && strncasecmp(ctrname, pcd->pm_evc_name, pcd->pm_evc_name_size) == 0) { if ((ev = pmc_match_event_class(ctrname + pcd->pm_evc_name_size, pcd)) == NULL) { errno = EINVAL; goto out; } break; } } /* * Otherwise, search for this event in all compatible PMC * classes. 
*/ for (n = 0; ev == NULL && n < PMC_CLASS_TABLE_SIZE; n++) { pcd = pmc_class_table[n]; if (pmc_mdep_is_compatible_class(pcd->pm_evc_class)) ev = pmc_match_event_class(ctrname, pcd); } if (ev == NULL) { errno = EINVAL; goto out; } bzero(&pmc_config, sizeof(pmc_config)); pmc_config.pm_ev = ev->pm_ev_code; pmc_config.pm_class = pcd->pm_evc_class; pmc_config.pm_cpu = cpu; pmc_config.pm_mode = mode; pmc_config.pm_flags = flags; if (PMC_IS_SAMPLING_MODE(mode)) pmc_config.pm_caps |= PMC_CAP_INTERRUPT; if (pcd->pm_evc_allocate_pmc(ev->pm_ev_code, r, &pmc_config) < 0) { errno = EINVAL; goto out; } if (PMC_CALL(PMCALLOCATE, &pmc_config) < 0) goto out; *pmcid = pmc_config.pm_pmcid; retval = 0; out: if (spec_copy) free(spec_copy); return (retval); } int pmc_attach(pmc_id_t pmc, pid_t pid) { struct pmc_op_pmcattach pmc_attach_args; pmc_attach_args.pm_pmc = pmc; pmc_attach_args.pm_pid = pid; return (PMC_CALL(PMCATTACH, &pmc_attach_args)); } int pmc_capabilities(pmc_id_t pmcid, uint32_t *caps) { unsigned int i; enum pmc_class cl; cl = PMC_ID_TO_CLASS(pmcid); for (i = 0; i < cpu_info.pm_nclass; i++) if (cpu_info.pm_classes[i].pm_class == cl) { *caps = cpu_info.pm_classes[i].pm_caps; return (0); } errno = EINVAL; return (-1); } int pmc_configure_logfile(int fd) { struct pmc_op_configurelog cla; cla.pm_logfd = fd; if (PMC_CALL(CONFIGURELOG, &cla) < 0) return (-1); return (0); } int pmc_cpuinfo(const struct pmc_cpuinfo **pci) { if (pmc_syscall == -1) { errno = ENXIO; return (-1); } *pci = &cpu_info; return (0); } int pmc_detach(pmc_id_t pmc, pid_t pid) { struct pmc_op_pmcattach pmc_detach_args; pmc_detach_args.pm_pmc = pmc; pmc_detach_args.pm_pid = pid; return (PMC_CALL(PMCDETACH, &pmc_detach_args)); } int pmc_disable(int cpu, int pmc) { struct pmc_op_pmcadmin ssa; ssa.pm_cpu = cpu; ssa.pm_pmc = pmc; ssa.pm_state = PMC_STATE_DISABLED; return (PMC_CALL(PMCADMIN, &ssa)); } int pmc_enable(int cpu, int pmc) { struct pmc_op_pmcadmin ssa; ssa.pm_cpu = cpu; ssa.pm_pmc = pmc; ssa.pm_state = 
PMC_STATE_FREE; return (PMC_CALL(PMCADMIN, &ssa)); } /* * Return a list of events known to a given PMC class. 'cl' is the * PMC class identifier, 'eventnames' is the returned list of 'const * char *' pointers pointing to the names of the events. 'nevents' is * the number of event name pointers returned. * * The space for 'eventnames' is allocated using malloc(3). The caller * is responsible for freeing this space when done. */ int pmc_event_names_of_class(enum pmc_class cl, const char ***eventnames, int *nevents) { int count; const char **names; const struct pmc_event_descr *ev; switch (cl) { case PMC_CLASS_IAF: ev = iaf_event_table; count = PMC_EVENT_TABLE_SIZE(iaf); break; case PMC_CLASS_IAP: /* * Return the most appropriate set of event name * spellings for the current CPU. */ switch (cpu_info.pm_cputype) { default: case PMC_CPU_INTEL_ATOM: ev = atom_event_table; count = PMC_EVENT_TABLE_SIZE(atom); break; case PMC_CPU_INTEL_CORE: ev = core_event_table; count = PMC_EVENT_TABLE_SIZE(core); break; case PMC_CPU_INTEL_CORE2: case PMC_CPU_INTEL_CORE2EXTREME: ev = core2_event_table; count = PMC_EVENT_TABLE_SIZE(core2); break; case PMC_CPU_INTEL_COREI7: ev = corei7_event_table; count = PMC_EVENT_TABLE_SIZE(corei7); break; + case PMC_CPU_INTEL_WESTMERE: + ev = westmere_event_table; + count = PMC_EVENT_TABLE_SIZE(westmere); + break; } break; + case PMC_CLASS_UCF: + ev = ucf_event_table; + count = PMC_EVENT_TABLE_SIZE(ucf); + break; + case PMC_CLASS_UCP: + /* + * Return the most appropriate set of event name + * spellings for the current CPU. 
+ */ + switch (cpu_info.pm_cputype) { + default: + case PMC_CPU_INTEL_COREI7: + ev = corei7uc_event_table; + count = PMC_EVENT_TABLE_SIZE(corei7uc); + break; + case PMC_CPU_INTEL_WESTMERE: + ev = westmereuc_event_table; + count = PMC_EVENT_TABLE_SIZE(westmereuc); + break; + } + break; case PMC_CLASS_TSC: ev = tsc_event_table; count = PMC_EVENT_TABLE_SIZE(tsc); break; case PMC_CLASS_K7: ev = k7_event_table; count = PMC_EVENT_TABLE_SIZE(k7); break; case PMC_CLASS_K8: ev = k8_event_table; count = PMC_EVENT_TABLE_SIZE(k8); break; case PMC_CLASS_P4: ev = p4_event_table; count = PMC_EVENT_TABLE_SIZE(p4); break; case PMC_CLASS_P5: ev = p5_event_table; count = PMC_EVENT_TABLE_SIZE(p5); break; case PMC_CLASS_P6: ev = p6_event_table; count = PMC_EVENT_TABLE_SIZE(p6); break; default: errno = EINVAL; return (-1); } if ((names = malloc(count * sizeof(const char *))) == NULL) return (-1); *eventnames = names; *nevents = count; for (;count--; ev++, names++) *names = ev->pm_ev_name; return (0); } int pmc_flush_logfile(void) { return (PMC_CALL(FLUSHLOG,0)); } int pmc_get_driver_stats(struct pmc_driverstats *ds) { struct pmc_op_getdriverstats gms; if (PMC_CALL(GETDRIVERSTATS, &gms) < 0) return (-1); /* copy out fields in the current userland<->library interface */ ds->pm_intr_ignored = gms.pm_intr_ignored; ds->pm_intr_processed = gms.pm_intr_processed; ds->pm_intr_bufferfull = gms.pm_intr_bufferfull; ds->pm_syscalls = gms.pm_syscalls; ds->pm_syscall_errors = gms.pm_syscall_errors; ds->pm_buffer_requests = gms.pm_buffer_requests; ds->pm_buffer_requests_failed = gms.pm_buffer_requests_failed; ds->pm_log_sweeps = gms.pm_log_sweeps; return (0); } int pmc_get_msr(pmc_id_t pmc, uint32_t *msr) { struct pmc_op_getmsr gm; gm.pm_pmcid = pmc; if (PMC_CALL(PMCGETMSR, &gm) < 0) return (-1); *msr = gm.pm_msr; return (0); } int pmc_init(void) { int error, pmc_mod_id; unsigned int n; uint32_t abi_version; struct module_stat pmc_modstat; struct pmc_op_getcpuinfo op_cpu_info; #if defined(__amd64__) 
|| defined(__i386__) int cpu_has_iaf_counters; unsigned int t; #endif if (pmc_syscall != -1) /* already inited */ return (0); /* retrieve the system call number from the KLD */ if ((pmc_mod_id = modfind(PMC_MODULE_NAME)) < 0) return (-1); pmc_modstat.version = sizeof(struct module_stat); if ((error = modstat(pmc_mod_id, &pmc_modstat)) < 0) return (-1); pmc_syscall = pmc_modstat.data.intval; /* check the kernel module's ABI against our compiled-in version */ abi_version = PMC_VERSION; if (PMC_CALL(GETMODULEVERSION, &abi_version) < 0) return (pmc_syscall = -1); /* ignore patch & minor numbers for the comparision */ if ((abi_version & 0xFF000000) != (PMC_VERSION & 0xFF000000)) { errno = EPROGMISMATCH; return (pmc_syscall = -1); } if (PMC_CALL(GETCPUINFO, &op_cpu_info) < 0) return (pmc_syscall = -1); cpu_info.pm_cputype = op_cpu_info.pm_cputype; cpu_info.pm_ncpu = op_cpu_info.pm_ncpu; cpu_info.pm_npmc = op_cpu_info.pm_npmc; cpu_info.pm_nclass = op_cpu_info.pm_nclass; for (n = 0; n < cpu_info.pm_nclass; n++) cpu_info.pm_classes[n] = op_cpu_info.pm_classes[n]; pmc_class_table = malloc(PMC_CLASS_TABLE_SIZE * sizeof(struct pmc_class_descr *)); if (pmc_class_table == NULL) return (-1); for (n = 0; n < PMC_CLASS_TABLE_SIZE; n++) pmc_class_table[n] = NULL; /* * Fill in the class table. */ n = 0; #if defined(__amd64__) || defined(__i386__) pmc_class_table[n++] = &tsc_class_table_descr; /* * Check if this CPU has fixed function counters. 
*/ cpu_has_iaf_counters = 0; for (t = 0; t < cpu_info.pm_nclass; t++) if (cpu_info.pm_classes[t].pm_class == PMC_CLASS_IAF) cpu_has_iaf_counters = 1; #endif #define PMC_MDEP_INIT(C) do { \ pmc_mdep_event_aliases = C##_aliases; \ pmc_mdep_class_list = C##_pmc_classes; \ pmc_mdep_class_list_size = \ PMC_TABLE_SIZE(C##_pmc_classes); \ } while (0) #define PMC_MDEP_INIT_INTEL_V2(C) do { \ PMC_MDEP_INIT(C); \ if (cpu_has_iaf_counters) \ pmc_class_table[n++] = &iaf_class_table_descr; \ else \ pmc_mdep_event_aliases = \ C##_aliases_without_iaf; \ pmc_class_table[n] = &C##_class_table_descr; \ } while (0) /* Configure the event name parser. */ switch (cpu_info.pm_cputype) { #if defined(__i386__) case PMC_CPU_AMD_K7: PMC_MDEP_INIT(k7); pmc_class_table[n] = &k7_class_table_descr; break; case PMC_CPU_INTEL_P5: PMC_MDEP_INIT(p5); pmc_class_table[n] = &p5_class_table_descr; break; case PMC_CPU_INTEL_P6: /* P6 ... Pentium M CPUs have */ case PMC_CPU_INTEL_PII: /* similar PMCs. */ case PMC_CPU_INTEL_PIII: case PMC_CPU_INTEL_PM: PMC_MDEP_INIT(p6); pmc_class_table[n] = &p6_class_table_descr; break; #endif #if defined(__amd64__) || defined(__i386__) case PMC_CPU_AMD_K8: PMC_MDEP_INIT(k8); pmc_class_table[n] = &k8_class_table_descr; break; case PMC_CPU_INTEL_ATOM: PMC_MDEP_INIT_INTEL_V2(atom); break; case PMC_CPU_INTEL_CORE: PMC_MDEP_INIT(core); pmc_class_table[n] = &core_class_table_descr; break; case PMC_CPU_INTEL_CORE2: case PMC_CPU_INTEL_CORE2EXTREME: PMC_MDEP_INIT_INTEL_V2(core2); break; case PMC_CPU_INTEL_COREI7: + pmc_class_table[n++] = &ucf_class_table_descr; + pmc_class_table[n++] = &corei7uc_class_table_descr; PMC_MDEP_INIT_INTEL_V2(corei7); break; + case PMC_CPU_INTEL_WESTMERE: + pmc_class_table[n++] = &ucf_class_table_descr; + pmc_class_table[n++] = &westmereuc_class_table_descr; + PMC_MDEP_INIT_INTEL_V2(westmere); + break; case PMC_CPU_INTEL_PIV: PMC_MDEP_INIT(p4); pmc_class_table[n] = &p4_class_table_descr; break; #endif default: /* * Some kind of CPU this version of the 
library knows nothing * about. This shouldn't happen since the abi version check * should have caught this. */ errno = ENXIO; return (pmc_syscall = -1); } return (0); } const char * pmc_name_of_capability(enum pmc_caps cap) { int i; /* * 'cap' should have a single bit set and should be in * range. */ if ((cap & (cap - 1)) || cap < PMC_CAP_FIRST || cap > PMC_CAP_LAST) { errno = EINVAL; return (NULL); } i = ffs(cap); return (pmc_capability_names[i - 1]); } const char * pmc_name_of_class(enum pmc_class pc) { if ((int) pc >= PMC_CLASS_FIRST && pc <= PMC_CLASS_LAST) return (pmc_class_names[pc]); errno = EINVAL; return (NULL); } const char * pmc_name_of_cputype(enum pmc_cputype cp) { size_t n; for (n = 0; n < PMC_TABLE_SIZE(pmc_cputype_names); n++) if (cp == pmc_cputype_names[n].pm_cputype) return (pmc_cputype_names[n].pm_name); errno = EINVAL; return (NULL); } const char * pmc_name_of_disposition(enum pmc_disp pd) { if ((int) pd >= PMC_DISP_FIRST && pd <= PMC_DISP_LAST) return (pmc_disposition_names[pd]); errno = EINVAL; return (NULL); } const char * _pmc_name_of_event(enum pmc_event pe, enum pmc_cputype cpu) { const struct pmc_event_descr *ev, *evfence; ev = evfence = NULL; if (pe >= PMC_EV_IAF_FIRST && pe <= PMC_EV_IAF_LAST) { ev = iaf_event_table; evfence = iaf_event_table + PMC_EVENT_TABLE_SIZE(iaf); } else if (pe >= PMC_EV_IAP_FIRST && pe <= PMC_EV_IAP_LAST) { switch (cpu) { case PMC_CPU_INTEL_ATOM: ev = atom_event_table; evfence = atom_event_table + PMC_EVENT_TABLE_SIZE(atom); break; case PMC_CPU_INTEL_CORE: ev = core_event_table; evfence = core_event_table + PMC_EVENT_TABLE_SIZE(core); break; case PMC_CPU_INTEL_CORE2: case PMC_CPU_INTEL_CORE2EXTREME: ev = core2_event_table; evfence = core2_event_table + PMC_EVENT_TABLE_SIZE(core2); break; case PMC_CPU_INTEL_COREI7: ev = corei7_event_table; evfence = corei7_event_table + PMC_EVENT_TABLE_SIZE(corei7); break; + case PMC_CPU_INTEL_WESTMERE: + ev = westmere_event_table; + evfence = westmere_event_table + 
PMC_EVENT_TABLE_SIZE(westmere); + break; default: /* Unknown CPU type. */ break; } - } if (pe >= PMC_EV_K7_FIRST && pe <= PMC_EV_K7_LAST) { + } else if (pe >= PMC_EV_UCF_FIRST && pe <= PMC_EV_UCF_LAST) { + ev = ucf_event_table; + evfence = ucf_event_table + PMC_EVENT_TABLE_SIZE(ucf); + } else if (pe >= PMC_EV_UCP_FIRST && pe <= PMC_EV_UCP_LAST) { + switch (cpu) { + case PMC_CPU_INTEL_COREI7: + ev = corei7uc_event_table; + evfence = corei7uc_event_table + PMC_EVENT_TABLE_SIZE(corei7uc); + break; + case PMC_CPU_INTEL_WESTMERE: + ev = westmereuc_event_table; + evfence = westmereuc_event_table + PMC_EVENT_TABLE_SIZE(westmereuc); + break; + default: /* Unknown CPU type. */ + break; + } + } else if (pe >= PMC_EV_K7_FIRST && pe <= PMC_EV_K7_LAST) { ev = k7_event_table; evfence = k7_event_table + PMC_EVENT_TABLE_SIZE(k7); } else if (pe >= PMC_EV_K8_FIRST && pe <= PMC_EV_K8_LAST) { ev = k8_event_table; evfence = k8_event_table + PMC_EVENT_TABLE_SIZE(k8); } else if (pe >= PMC_EV_P4_FIRST && pe <= PMC_EV_P4_LAST) { ev = p4_event_table; evfence = p4_event_table + PMC_EVENT_TABLE_SIZE(p4); } else if (pe >= PMC_EV_P5_FIRST && pe <= PMC_EV_P5_LAST) { ev = p5_event_table; evfence = p5_event_table + PMC_EVENT_TABLE_SIZE(p5); } else if (pe >= PMC_EV_P6_FIRST && pe <= PMC_EV_P6_LAST) { ev = p6_event_table; evfence = p6_event_table + PMC_EVENT_TABLE_SIZE(p6); } else if (pe == PMC_EV_TSC_TSC) { ev = tsc_event_table; evfence = tsc_event_table + PMC_EVENT_TABLE_SIZE(tsc); } for (; ev != evfence; ev++) if (pe == ev->pm_ev_code) return (ev->pm_ev_name); return (NULL); } const char * pmc_name_of_event(enum pmc_event pe) { const char *n; if ((n = _pmc_name_of_event(pe, cpu_info.pm_cputype)) != NULL) return (n); errno = EINVAL; return (NULL); } const char * pmc_name_of_mode(enum pmc_mode pm) { if ((int) pm >= PMC_MODE_FIRST && pm <= PMC_MODE_LAST) return (pmc_mode_names[pm]); errno = EINVAL; return (NULL); } const char * pmc_name_of_state(enum pmc_state ps) { if ((int) ps >= PMC_STATE_FIRST 
&& ps <= PMC_STATE_LAST) return (pmc_state_names[ps]); errno = EINVAL; return (NULL); } int pmc_ncpu(void) { if (pmc_syscall == -1) { errno = ENXIO; return (-1); } return (cpu_info.pm_ncpu); } int pmc_npmc(int cpu) { if (pmc_syscall == -1) { errno = ENXIO; return (-1); } if (cpu < 0 || cpu >= (int) cpu_info.pm_ncpu) { errno = EINVAL; return (-1); } return (cpu_info.pm_npmc); } int pmc_pmcinfo(int cpu, struct pmc_pmcinfo **ppmci) { int nbytes, npmc; struct pmc_op_getpmcinfo *pmci; if ((npmc = pmc_npmc(cpu)) < 0) return (-1); nbytes = sizeof(struct pmc_op_getpmcinfo) + npmc * sizeof(struct pmc_info); if ((pmci = calloc(1, nbytes)) == NULL) return (-1); pmci->pm_cpu = cpu; if (PMC_CALL(GETPMCINFO, pmci) < 0) { free(pmci); return (-1); } /* kernel<->library, library<->userland interfaces are identical */ *ppmci = (struct pmc_pmcinfo *) pmci; return (0); } int pmc_read(pmc_id_t pmc, pmc_value_t *value) { struct pmc_op_pmcrw pmc_read_op; pmc_read_op.pm_pmcid = pmc; pmc_read_op.pm_flags = PMC_F_OLDVALUE; pmc_read_op.pm_value = -1; if (PMC_CALL(PMCRW, &pmc_read_op) < 0) return (-1); *value = pmc_read_op.pm_value; return (0); } int pmc_release(pmc_id_t pmc) { struct pmc_op_simple pmc_release_args; pmc_release_args.pm_pmcid = pmc; return (PMC_CALL(PMCRELEASE, &pmc_release_args)); } int pmc_rw(pmc_id_t pmc, pmc_value_t newvalue, pmc_value_t *oldvaluep) { struct pmc_op_pmcrw pmc_rw_op; pmc_rw_op.pm_pmcid = pmc; pmc_rw_op.pm_flags = PMC_F_NEWVALUE | PMC_F_OLDVALUE; pmc_rw_op.pm_value = newvalue; if (PMC_CALL(PMCRW, &pmc_rw_op) < 0) return (-1); *oldvaluep = pmc_rw_op.pm_value; return (0); } int pmc_set(pmc_id_t pmc, pmc_value_t value) { struct pmc_op_pmcsetcount sc; sc.pm_pmcid = pmc; sc.pm_count = value; if (PMC_CALL(PMCSETCOUNT, &sc) < 0) return (-1); return (0); } int pmc_start(pmc_id_t pmc) { struct pmc_op_simple pmc_start_args; pmc_start_args.pm_pmcid = pmc; return (PMC_CALL(PMCSTART, &pmc_start_args)); } int pmc_stop(pmc_id_t pmc) { struct pmc_op_simple pmc_stop_args; 
pmc_stop_args.pm_pmcid = pmc; return (PMC_CALL(PMCSTOP, &pmc_stop_args)); } int pmc_width(pmc_id_t pmcid, uint32_t *width) { unsigned int i; enum pmc_class cl; cl = PMC_ID_TO_CLASS(pmcid); for (i = 0; i < cpu_info.pm_nclass; i++) if (cpu_info.pm_classes[i].pm_class == cl) { *width = cpu_info.pm_classes[i].pm_width; return (0); } errno = EINVAL; return (-1); } int pmc_write(pmc_id_t pmc, pmc_value_t value) { struct pmc_op_pmcrw pmc_write_op; pmc_write_op.pm_pmcid = pmc; pmc_write_op.pm_flags = PMC_F_NEWVALUE; pmc_write_op.pm_value = value; return (PMC_CALL(PMCRW, &pmc_write_op)); } int pmc_writelog(uint32_t userdata) { struct pmc_op_writelog wl; wl.pm_userdata = userdata; return (PMC_CALL(WRITELOG, &wl)); } Index: stable/8/lib/libpmc/pmc.corei7.3 =================================================================== --- stable/8/lib/libpmc/pmc.corei7.3 (nonexistent) +++ stable/8/lib/libpmc/pmc.corei7.3 (revision 206702) @@ -0,0 +1,1581 @@ +.\" Copyright (c) 2010 Fabien Thomas. All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" This software is provided by Joseph Koshy ``as is'' and +.\" any express or implied warranties, including, but not limited to, the +.\" implied warranties of merchantability and fitness for a particular purpose +.\" are disclaimed. 
in no event shall Joseph Koshy be liable +.\" for any direct, indirect, incidental, special, exemplary, or consequential +.\" damages (including, but not limited to, procurement of substitute goods +.\" or services; loss of use, data, or profits; or business interruption) +.\" however caused and on any theory of liability, whether in contract, strict +.\" liability, or tort (including negligence or otherwise) arising in any way +.\" out of the use of this software, even if advised of the possibility of +.\" such damage. +.\" +.\" $FreeBSD$ +.\" +.Dd March 24, 2010 +.Os +.Dt PMC.COREI7 3 +.Sh NAME +.Nm pmc.corei7 +.Nd measurement events for +.Tn Intel +.Tn Core i7 and Xeon 5500 +family CPUs +.Sh LIBRARY +.Lb libpmc +.Sh SYNOPSIS +.In pmc.h +.Sh DESCRIPTION +.Tn Intel +.Tn "Core i7" +CPUs contain PMCs conforming to version 2 of the +.Tn Intel +performance measurement architecture. +These CPUs may contain up to three classes of PMCs: +.Bl -tag -width "Li PMC_CLASS_IAP" +.It Li PMC_CLASS_IAF +Fixed-function counters that count only one hardware event per counter. +.It Li PMC_CLASS_IAP +Programmable counters that may be configured to count one of a defined +set of hardware events. +.El +.Pp +The number of PMCs available in each class and their widths need to be +determined at run time by calling +.Xr pmc_cpuinfo 3 . +.Pp +Intel Core i7 and Xeon 5500 PMCs are documented in +.Rs +.%B "Intel(R) 64 and IA-32 Architectures Software Developer's Manual" +.%T "Volume 3B: System Programming Guide, Part 2" +.%N "Order Number: 253669-033US" +.%D December 2009 +.%Q "Intel Corporation" +.Re +.Ss COREI7 AND XEON 5500 FIXED FUNCTION PMCS +These PMCs and their supported events are documented in +.Xr pmc.iaf 3 . +Not all CPUs in this family implement fixed-function counters.
+.Ss COREI7 AND XEON 5500 PROGRAMMABLE PMCS +The programmable PMCs support the following capabilities: +.Bl -column "PMC_CAP_INTERRUPT" "Support" +.It Em Capability Ta Em Support +.It PMC_CAP_CASCADE Ta \&No +.It PMC_CAP_EDGE Ta Yes +.It PMC_CAP_INTERRUPT Ta Yes +.It PMC_CAP_INVERT Ta Yes +.It PMC_CAP_READ Ta Yes +.It PMC_CAP_PRECISE Ta \&No +.It PMC_CAP_SYSTEM Ta Yes +.It PMC_CAP_TAGGING Ta \&No +.It PMC_CAP_THRESHOLD Ta Yes +.It PMC_CAP_USER Ta Yes +.It PMC_CAP_WRITE Ta Yes +.El +.Ss Event Qualifiers +Event specifiers for these PMCs support the following common +qualifiers: +.Bl -tag -width indent +.It Li rsp= Ns Ar value +Configure the Off-core Response bits. +.Bl -tag -width indent +.It Li DMND_DATA_RD +Counts the number of demand and DCU prefetch data reads of full +and partial cachelines as well as demand data page table entry +cacheline reads. Does not count L2 data read prefetches or +instruction fetches. +.It Li DMND_RFO +Counts the number of demand and DCU prefetch reads for ownership +(RFO) requests generated by a write to data cacheline. Does not +count L2 RFO. +.It Li DMND_IFETCH +Counts the number of demand and DCU prefetch instruction cacheline +reads. Does not count L2 code read prefetches. +.It Li WB +Counts the number of writeback (modified to exclusive) transactions. +.It Li PF_DATA_RD +Counts the number of data cacheline reads generated by L2 prefetchers. +.It Li PF_RFO +Counts the number of RFO requests generated by L2 prefetchers. +.It Li PF_IFETCH +Counts the number of code reads generated by L2 prefetchers. +.It Li OTHER +Counts one of the following transaction types, including L3 invalidate, +I/O, full or partial writes, WC or non-temporal stores, CLFLUSH, Fences, +lock, unlock, split lock. +.It Li UNCORE_HIT +L3 Hit: local or remote home requests that hit L3 cache in the uncore +with no coherency actions required (snooping).
+.It Li OTHER_CORE_HIT_SNP +L3 Hit: local or remote home requests that hit L3 cache in the uncore +and was serviced by another core with a cross core snoop where no modified +copies were found (clean). +.It Li OTHER_CORE_HITM +L3 Hit: local or remote home requests that hit L3 cache in the uncore +and was serviced by another core with a cross core snoop where modified +copies were found (HITM). +.It Li REMOTE_CACHE_FWD +L3 Miss: local homed requests that missed the L3 cache and was serviced +by forwarded data following a cross package snoop where no modified +copies found. (Remote home requests are not counted) +.It Li REMOTE_DRAM +L3 Miss: remote home requests that missed the L3 cache and were serviced +by remote DRAM. +.It Li LOCAL_DRAM +L3 Miss: local home requests that missed the L3 cache and were serviced +by local DRAM. +.It Li NON_DRAM +Non-DRAM requests that were serviced by IOH. +.El +.It Li cmask= Ns Ar value +Configure the PMC to increment only if the number of configured +events measured in a cycle is greater than or equal to +.Ar value . +.It Li edge +Configure the PMC to count the number of de-asserted to asserted +transitions of the conditions expressed by the other qualifiers. +If specified, the counter will increment only once whenever a +condition becomes true, irrespective of the number of clocks during +which the condition remains true. +.It Li inv +Invert the sense of comparison when the +.Dq Li cmask +qualifier is present, making the counter increment when the number of +events per cycle is less than the value specified by the +.Dq Li cmask +qualifier. +.It Li os +Configure the PMC to count events happening at processor privilege +level 0. +.It Li usr +Configure the PMC to count events occurring at privilege levels 1, 2 +or 3. +.El +.Pp +If neither of the +.Dq Li os +or +.Dq Li usr +qualifiers are specified, the default is to enable both. 
+.Ss Event Specifiers (Programmable PMCs) +Core i7 and Xeon 5500 programmable PMCs support the following events: +.Bl -tag -width indent +.It Li SB_DRAIN.ANY +.Pq Event 04H , Umask 07H +Counts the number of store buffer drains. +.It Li STORE_BLOCKS.AT_RET +.Pq Event 06H , Umask 04H +Counts number of loads delayed with at-Retirement block code. The following +loads need to be executed at retirement and wait for all senior stores on +the same thread to be drained: load splitting across 4K boundary (page +split), load accessing uncacheable (UC or USWC) memory, load lock, and load +with page table in UC or USWC memory region. +.It Li STORE_BLOCKS.L1D_BLOCK +.Pq Event 06H , Umask 08H +Cacheable loads delayed with L1D block code +.It Li PARTIAL_ADDRESS_ALIAS +.Pq Event 07H , Umask 01H +Counts false dependency due to partial address aliasing +.It Li DTLB_LOAD_MISSES.ANY +.Pq Event 08H , Umask 01H +Counts all load misses that cause a page walk +.It Li DTLB_LOAD_MISSES.WALK_COMPLETED +.Pq Event 08H , Umask 02H +Counts number of completed page walks due to load miss in the STLB. +.It Li DTLB_LOAD_MISSES.STLB_HIT +.Pq Event 08H , Umask 10H +Number of cache load STLB hits +.It Li DTLB_LOAD_MISSES.PDE_MISS +.Pq Event 08H , Umask 20H +Number of DTLB cache load misses where the low part of the linear to +physical address translation was missed. +.It Li DTLB_LOAD_MISSES.PDP_MISS +.Pq Event 08H , Umask 40H +Number of DTLB cache load misses where the high part of the linear to +physical address translation was missed. +.It Li DTLB_LOAD_MISSES.LARGE_WALK_COMPLETED +.Pq Event 08H , Umask 80H +Counts number of completed large page walks due to load miss in the STLB. +.It Li MEM_INST_RETIRED.LOADS +.Pq Event 0BH , Umask 01H +Counts the number of instructions with an architecturally-visible load +retired on the architected path.
+In conjunction with ld_lat facility +.It Li MEM_INST_RETIRED.STORES +.Pq Event 0BH , Umask 02H +Counts the number of instructions with an architecturally-visible store +retired on the architected path. +In conjunction with ld_lat facility +.It Li MEM_INST_RETIRED.LATENCY_ABOVE_THRESHOLD +.Pq Event 0BH , Umask 10H +Counts the number of instructions exceeding the latency specified with +ld_lat facility. +In conjunction with ld_lat facility +.It Li MEM_STORE_RETIRED.DTLB_MISS +.Pq Event 0CH , Umask 01H +The event counts the number of retired stores that missed the DTLB. The DTLB +miss is not counted if the store operation causes a fault. Does not count +prefetches. Counts both primary and secondary misses to the TLB +.It Li UOPS_ISSUED.ANY +.Pq Event 0EH , Umask 01H +Counts the number of Uops issued by the Register Allocation Table to the +Reservation Station, i.e. the UOPs issued from the front end to the back +end. +.It Li UOPS_ISSUED.STALLED_CYCLES +.Pq Event 0EH , Umask 01H +Counts the number of cycles no Uops are issued by the Register Allocation Table +to the Reservation Station, i.e. the UOPs issued from the front end to the +back end. +Set invert=1, cmask=1. +.It Li UOPS_ISSUED.FUSED +.Pq Event 0EH , Umask 02H +Counts the number of fused Uops that were issued from the Register +Allocation Table to the Reservation Station. +.It Li MEM_UNCORE_RETIRED.L3_DATA_MISS_UNKNOWN +.Pq Event 0FH , Umask 01H +Counts number of memory load instructions retired where the memory reference +missed L3 and data source is unknown. +Available only for CPUID signature 06_2EH +.It Li MEM_UNCORE_RETIRED.OTHER_CORE_L2_HITM +.Pq Event 0FH , Umask 02H +Counts number of memory load instructions retired where the memory reference +hit modified data in a sibling core residing on the same socket.
+.It Li MEM_UNCORE_RETIRED.REMOTE_CACHE_LOCAL_HOME_HIT +.Pq Event 0FH , Umask 08H +Counts number of memory load instructions retired where the memory reference +missed the L1, L2 and L3 caches and HIT in a remote socket's cache. Only +counts locally homed lines. +.It Li MEM_UNCORE_RETIRED.REMOTE_DRAM +.Pq Event 0FH , Umask 10H +Counts number of memory load instructions retired where the memory reference +missed the L1, L2 and L3 caches and was remotely homed. This includes both +DRAM access and HITM in a remote socket's cache for remotely homed lines. +.It Li MEM_UNCORE_RETIRED.LOCAL_DRAM +.Pq Event 0FH , Umask 20H +Counts number of memory load instructions retired where the memory reference +missed the L1, L2 and L3 caches and required a local socket memory +reference. This includes locally homed cachelines that were in a modified +state in another socket. +.It Li MEM_UNCORE_RETIRED.UNCACHEABLE +.Pq Event 0FH , Umask 80H +Counts number of memory load instructions retired where the memory reference +missed the L1, L2 and L3 caches and to perform I/O. +Available only for CPUID signature 06_2EH +.It Li FP_COMP_OPS_EXE.X87 +.Pq Event 10H , Umask 01H +Counts the number of FP Computational Uops Executed. The number of FADD, +FSUB, FCOM, FMULs, integer MULs and IMULs, FDIVs, FPREMs, FSQRTS, integer +DIVs, and IDIVs. This event does not distinguish an FADD used in the middle +of a transcendental flow from a separate FADD instruction. +.It Li FP_COMP_OPS_EXE.MMX +.Pq Event 10H , Umask 02H +Counts number of MMX Uops executed. +.It Li FP_COMP_OPS_EXE.SSE_FP +.Pq Event 10H , Umask 04H +Counts number of SSE and SSE2 FP uops executed. +.It Li FP_COMP_OPS_EXE.SSE2_INTEGER +.Pq Event 10H , Umask 08H +Counts number of SSE2 integer uops executed. +.It Li FP_COMP_OPS_EXE.SSE_FP_PACKED +.Pq Event 10H , Umask 10H +Counts number of SSE FP packed uops executed. +.It Li FP_COMP_OPS_EXE.SSE_FP_SCALAR +.Pq Event 10H , Umask 20H +Counts number of SSE FP scalar uops executed.
+.It Li FP_COMP_OPS_EXE.SSE_SINGLE_PRECISION +.Pq Event 10H , Umask 40H +Counts number of SSE* FP single precision uops executed. +.It Li FP_COMP_OPS_EXE.SSE_DOUBLE_PRECISION +.Pq Event 10H , Umask 80H +Counts number of SSE* FP double precision uops executed. +.It Li SIMD_INT_128.PACKED_MPY +.Pq Event 12H , Umask 01H +Counts number of 128 bit SIMD integer multiply operations. +.It Li SIMD_INT_128.PACKED_SHIFT +.Pq Event 12H , Umask 02H +Counts number of 128 bit SIMD integer shift operations. +.It Li SIMD_INT_128.PACK +.Pq Event 12H , Umask 04H +Counts number of 128 bit SIMD integer pack operations. +.It Li SIMD_INT_128.UNPACK +.Pq Event 12H , Umask 08H +Counts number of 128 bit SIMD integer unpack operations. +.It Li SIMD_INT_128.PACKED_LOGICAL +.Pq Event 12H , Umask 10H +Counts number of 128 bit SIMD integer logical operations. +.It Li SIMD_INT_128.PACKED_ARITH +.Pq Event 12H , Umask 20H +Counts number of 128 bit SIMD integer arithmetic operations. +.It Li SIMD_INT_128.SHUFFLE_MOVE +.Pq Event 12H , Umask 40H +Counts number of 128 bit SIMD integer shuffle and move operations. +.It Li LOAD_DISPATCH.RS +.Pq Event 13H , Umask 01H +Counts number of loads dispatched from the Reservation Station that bypass +the Memory Order Buffer. +.It Li LOAD_DISPATCH.RS_DELAYED +.Pq Event 13H , Umask 02H +Counts the number of delayed RS dispatches at the stage latch. If an RS +dispatch can not bypass to LB, it has another chance to dispatch from the +one-cycle delayed staging latch before it is written into the LB. +.It Li LOAD_DISPATCH.MOB +.Pq Event 13H , Umask 04H +Counts the number of loads dispatched from the Reservation Station to the +Memory Order Buffer. +.It Li LOAD_DISPATCH.ANY +.Pq Event 13H , Umask 07H +Counts all loads dispatched from the Reservation Station. +.It Li ARITH.CYCLES_DIV_BUSY +.Pq Event 14H , Umask 01H +Counts the number of cycles the divider is busy executing divide or square +root operations. 
The divide can be integer, X87 or Streaming SIMD Extensions +(SSE). The square root operation can be either X87 or SSE. +Set 'edge=1, invert=1, cmask=1' to count the number of divides. +Count may be incorrect when SMT is on. +.It Li ARITH.MUL +.Pq Event 14H , Umask 02H +Counts the number of multiply operations executed. This includes integer as +well as floating point multiply operations but excludes DPPS mul and MPSAD. +Count may be incorrect when SMT is on +.It Li INST_QUEUE_WRITES +.Pq Event 17H , Umask 01H +Counts the number of instructions written into the instruction queue every +cycle. +.It Li INST_DECODED.DEC0 +.Pq Event 18H , Umask 01H +Counts number of instructions that require decoder 0 to be decoded. Usually, +this means that the instruction maps to more than 1 uop +.It Li TWO_UOP_INSTS_DECODED +.Pq Event 19H , Umask 01H +An instruction that generates two uops was decoded +.It Li INST_QUEUE_WRITE_CYCLES +.Pq Event 1EH , Umask 01H +This event counts the number of cycles during which instructions are written +to the instruction queue. Dividing this counter by the number of +instructions written to the instruction queue (INST_QUEUE_WRITES) yields the +average number of instructions decoded each cycle. If this number is less +than four and the pipe stalls, this indicates that the decoder is failing to +decode enough instructions per cycle to sustain the 4-wide pipeline. +If SSE* instructions that are 6 bytes or longer arrive one after another, +then front end throughput may limit execution speed. In such case, +.It Li LSD_OVERFLOW +.Pq Event 20H , Umask 01H +Counts number of loops that can't stream from the instruction queue. +.It Li L2_RQSTS.LD_HIT +.Pq Event 24H , Umask 01H +Counts number of loads that hit the L2 cache. L2 loads include both L1D +demand misses as well as L1D prefetches. L2 loads can be rejected for +various reasons. Only non rejected loads are counted.
+.It Li L2_RQSTS.LD_MISS +.Pq Event 24H , Umask 02H +Counts the number of loads that miss the L2 cache. L2 loads include both L1D +demand misses as well as L1D prefetches. +.It Li L2_RQSTS.LOADS +.Pq Event 24H , Umask 03H +Counts all L2 load requests. L2 loads include both L1D demand misses as well +as L1D prefetches. +.It Li L2_RQSTS.RFO_HIT +.Pq Event 24H , Umask 04H +Counts the number of store RFO requests that hit the L2 cache. L2 RFO +requests include both L1D demand RFO misses as well as L1D RFO prefetches. +Count includes WC memory requests, where the data is not fetched but the +permission to write the line is required. +.It Li L2_RQSTS.RFO_MISS +.Pq Event 24H , Umask 08H +Counts the number of store RFO requests that miss the L2 cache. L2 RFO +requests include both L1D demand RFO misses as well as L1D RFO prefetches. +.It Li L2_RQSTS.RFOS +.Pq Event 24H , Umask 0CH +Counts all L2 store RFO requests. L2 RFO requests include both L1D demand +RFO misses as well as L1D RFO prefetches. +.It Li L2_RQSTS.IFETCH_HIT +.Pq Event 24H , Umask 10H +Counts number of instruction fetches that hit the L2 cache. L2 instruction +fetches include both L1I demand misses as well as L1I instruction +prefetches. +.It Li L2_RQSTS.IFETCH_MISS +.Pq Event 24H , Umask 20H +Counts number of instruction fetches that miss the L2 cache. L2 instruction +fetches include both L1I demand misses as well as L1I instruction +prefetches. +.It Li L2_RQSTS.IFETCHES +.Pq Event 24H , Umask 30H +Counts all instruction fetches. L2 instruction fetches include both L1I +demand misses as well as L1I instruction prefetches. +.It Li L2_RQSTS.PREFETCH_HIT +.Pq Event 24H , Umask 40H +Counts L2 prefetch hits for both code and data. +.It Li L2_RQSTS.PREFETCH_MISS +.Pq Event 24H , Umask 80H +Counts L2 prefetch misses for both code and data. +.It Li L2_RQSTS.PREFETCHES +.Pq Event 24H , Umask C0H +Counts all L2 prefetches for both code and data. 
+.It Li L2_RQSTS.MISS +.Pq Event 24H , Umask AAH +Counts all L2 misses for both code and data. +.It Li L2_RQSTS.REFERENCES +.Pq Event 24H , Umask FFH +Counts all L2 requests for both code and data. +.It Li L2_DATA_RQSTS.DEMAND.I_STATE +.Pq Event 26H , Umask 01H +Counts number of L2 data demand loads where the cache line to be loaded is +in the I (invalid) state, i.e. a cache miss. L2 demand loads are both L1D +demand misses and L1D prefetches. +.It Li L2_DATA_RQSTS.DEMAND.S_STATE +.Pq Event 26H , Umask 02H +Counts number of L2 data demand loads where the cache line to be loaded is +in the S (shared) state. L2 demand loads are both L1D demand misses and L1D +prefetches. +.It Li L2_DATA_RQSTS.DEMAND.E_STATE +.Pq Event 26H , Umask 04H +Counts number of L2 data demand loads where the cache line to be loaded is +in the E (exclusive) state. L2 demand loads are both L1D demand misses and +L1D prefetches. +.It Li L2_DATA_RQSTS.DEMAND.M_STATE +.Pq Event 26H , Umask 08H +Counts number of L2 data demand loads where the cache line to be loaded is +in the M (modified) state. L2 demand loads are both L1D demand misses and +L1D prefetches. +.It Li L2_DATA_RQSTS.DEMAND.MESI +.Pq Event 26H , Umask 0FH +Counts all L2 data demand requests. L2 demand loads are both L1D demand +misses and L1D prefetches. +.It Li L2_DATA_RQSTS.PREFETCH.I_STATE +.Pq Event 26H , Umask 10H +Counts number of L2 prefetch data loads where the cache line to be loaded is +in the I (invalid) state, i.e. a cache miss. +.It Li L2_DATA_RQSTS.PREFETCH.S_STATE +.Pq Event 26H , Umask 20H +Counts number of L2 prefetch data loads where the cache line to be loaded is +in the S (shared) state. A prefetch RFO will miss on an S state line, while +a prefetch read will hit on an S state line. +.It Li L2_DATA_RQSTS.PREFETCH.E_STATE +.Pq Event 26H , Umask 40H +Counts number of L2 prefetch data loads where the cache line to be loaded is +in the E (exclusive) state. 
+.It Li L2_DATA_RQSTS.PREFETCH.M_STATE +.Pq Event 26H , Umask 80H +Counts number of L2 prefetch data loads where the cache line to be loaded is +in the M (modified) state. +.It Li L2_DATA_RQSTS.PREFETCH.MESI +.Pq Event 26H , Umask F0H +Counts all L2 prefetch requests. +.It Li L2_DATA_RQSTS.ANY +.Pq Event 26H , Umask FFH +Counts all L2 data requests. +.It Li L2_WRITE.RFO.I_STATE +.Pq Event 27H , Umask 01H +Counts number of L2 demand store RFO requests where the cache line to be +loaded is in the I (invalid) state, i.e. a cache miss. The L1D prefetcher +does not issue a RFO prefetch. +This is a demand RFO request +.It Li L2_WRITE.RFO.S_STATE +.Pq Event 27H , Umask 02H +Counts number of L2 store RFO requests where the cache line to be loaded is +in the S (shared) state. The L1D prefetcher does not issue a RFO prefetch. +This is a demand RFO request +.It Li L2_WRITE.RFO.M_STATE +.Pq Event 27H , Umask 08H +Counts number of L2 store RFO requests where the cache line to be loaded is +in the M (modified) state. The L1D prefetcher does not issue a RFO prefetch. +This is a demand RFO request +.It Li L2_WRITE.RFO.HIT +.Pq Event 27H , Umask 0EH +Counts number of L2 store RFO requests where the cache line to be loaded is +in either the S, E or M states. The L1D prefetcher does not issue a RFO +prefetch. +This is a demand RFO request +.It Li L2_WRITE.RFO.MESI +.Pq Event 27H , Umask 0FH +Counts all L2 store RFO requests. The L1D prefetcher does not issue a RFO +prefetch. +This is a demand RFO request +.It Li L2_WRITE.LOCK.I_STATE +.Pq Event 27H , Umask 10H +Counts number of L2 demand lock RFO requests where the cache line to be +loaded is in the I (invalid) state, i.e. a cache miss. +.It Li L2_WRITE.LOCK.S_STATE +.Pq Event 27H , Umask 20H +Counts number of L2 lock RFO requests where the cache line to be loaded is +in the S (shared) state.
+.It Li L2_WRITE.LOCK.E_STATE +.Pq Event 27H , Umask 40H +Counts number of L2 demand lock RFO requests where the cache line to be +loaded is in the E (exclusive) state. +.It Li L2_WRITE.LOCK.M_STATE +.Pq Event 27H , Umask 80H +Counts number of L2 demand lock RFO requests where the cache line to be +loaded is in the M (modified) state. +.It Li L2_WRITE.LOCK.HIT +.Pq Event 27H , Umask E0H +Counts number of L2 demand lock RFO requests where the cache line to be +loaded is in either the S, E, or M state. +.It Li L2_WRITE.LOCK.MESI +.Pq Event 27H , Umask F0H +Counts all L2 demand lock RFO requests. +.It Li L1D_WB_L2.I_STATE +.Pq Event 28H , Umask 01H +Counts number of L1 writebacks to the L2 where the cache line to be written +is in the I (invalid) state, i.e. a cache miss. +.It Li L1D_WB_L2.S_STATE +.Pq Event 28H , Umask 02H +Counts number of L1 writebacks to the L2 where the cache line to be written +is in the S state. +.It Li L1D_WB_L2.E_STATE +.Pq Event 28H , Umask 04H +Counts number of L1 writebacks to the L2 where the cache line to be written +is in the E (exclusive) state. +.It Li L1D_WB_L2.M_STATE +.Pq Event 28H , Umask 08H +Counts number of L1 writebacks to the L2 where the cache line to be written +is in the M (modified) state. +.It Li L1D_WB_L2.MESI +.Pq Event 28H , Umask 0FH +Counts all L1 writebacks to the L2. +.It Li L3_LAT_CACHE.REFERENCE +.Pq Event 2EH , Umask 4FH +This event counts requests originating from the core that reference a cache +line in the last level cache. The event count includes speculative traffic +but excludes cache line fills due to a L2 hardware-prefetch. Because cache +hierarchy, cache sizes and other implementation-specific characteristics; +value comparison to estimate performance differences is not recommended. +see Table A-1 +.It Li L3_LAT_CACHE.MISS +.Pq Event 2EH , Umask 41H +This event counts each cache miss condition for references to the last level +cache. 
The event count may include speculative traffic but excludes cache +line fills due to L2 hardware-prefetches. Because cache hierarchy, cache +sizes and other implementation-specific characteristics; value comparison to +estimate performance differences is not recommended. +see Table A-1 +.It Li CPU_CLK_UNHALTED.THREAD_P +.Pq Event 3CH , Umask 00H +Counts the number of thread cycles while the thread is not in a halt state. +The thread enters the halt state when it is running the HLT instruction. The +core frequency may change from time to time due to power or thermal +throttling. +see Table A-1 +.It Li CPU_CLK_UNHALTED.REF_P +.Pq Event 3CH , Umask 01H +Increments at the frequency of TSC when not halted. +see Table A-1 +.It Li L1D_CACHE_LD.I_STATE +.Pq Event 40H , Umask 01H +Counts L1 data cache read requests where the cache line to be loaded is in +the I (invalid) state, i.e. the read request missed the cache. +Counter 0, 1 only +.It Li L1D_CACHE_LD.S_STATE +.Pq Event 40H , Umask 02H +Counts L1 data cache read requests where the cache line to be loaded is in +the S (shared) state. +Counter 0, 1 only +.It Li L1D_CACHE_LD.E_STATE +.Pq Event 40H , Umask 04H +Counts L1 data cache read requests where the cache line to be loaded is in +the E (exclusive) state. +Counter 0, 1 only +.It Li L1D_CACHE_LD.M_STATE +.Pq Event 40H , Umask 08H +Counts L1 data cache read requests where the cache line to be loaded is in +the M (modified) state. +Counter 0, 1 only +.It Li L1D_CACHE_LD.MESI +.Pq Event 40H , Umask 0FH +Counts L1 data cache read requests. +Counter 0, 1 only +.It Li L1D_CACHE_ST.S_STATE +.Pq Event 41H , Umask 02H +Counts L1 data cache store RFO requests where the cache line to be loaded is +in the S (shared) state. +Counter 0, 1 only +.It Li L1D_CACHE_ST.E_STATE +.Pq Event 41H , Umask 04H +Counts L1 data cache store RFO requests where the cache line to be loaded is +in the E (exclusive) state. 
+Counter 0, 1 only +.It Li L1D_CACHE_ST.M_STATE +.Pq Event 41H , Umask 08H +Counts L1 data cache store RFO requests where cache line to be loaded is in +the M (modified) state. +Counter 0, 1 only +.It Li L1D_CACHE_LOCK.HIT +.Pq Event 42H , Umask 01H +Counts retired load locks that hit in the L1 data cache or hit in an already +allocated fill buffer. The lock portion of the load lock transaction must +hit in the L1D. +The initial load will pull the lock into the L1 data cache. Counter 0, 1 +only +.It Li L1D_CACHE_LOCK.S_STATE +.Pq Event 42H , Umask 02H +Counts L1 data cache retired load locks that hit the target cache line in +the shared state. +Counter 0, 1 only +.It Li L1D_CACHE_LOCK.E_STATE +.Pq Event 42H , Umask 04H +Counts L1 data cache retired load locks that hit the target cache line in +the exclusive state. +Counter 0, 1 only +.It Li L1D_CACHE_LOCK.M_STATE +.Pq Event 42H , Umask 08H +Counts L1 data cache retired load locks that hit the target cache line in +the modified state. +Counter 0, 1 only +.It Li L1D_ALL_REF.ANY +.Pq Event 43H , Umask 01H +Counts all references (uncached, speculated and retired) to the L1 data +cache, including all loads and stores with any memory types. The event +counts memory accesses only when they are actually performed. For example, a +load blocked by unknown store address and later performed is only counted +once. +The event does not include non- memory accesses, such as I/O accesses. +Counter 0, 1 only +.It Li L1D_ALL_REF.CACHEABLE +.Pq Event 43H , Umask 02H +Counts all data reads and writes (speculated and retired) from cacheable +memory, including locked operations. +Counter 0, 1 only +.It Li L1D_PEND_MISS.LOAD_BUFFERS_FULL +.Pq Event 48H , Umask 02H +Counts cycles of L1 data cache load fill buffers full. +Counter 0, 1 only +.It Li DTLB_MISSES.ANY +.Pq Event 49H , Umask 01H +Counts the number of misses in the STLB which causes a page walk. 
+.It Li DTLB_MISSES.WALK_COMPLETED +.Pq Event 49H , Umask 02H +Counts number of misses in the STLB which resulted in a completed page walk. +.It Li DTLB_MISSES.STLB_HIT +.Pq Event 49H , Umask 10H +Counts the number of DTLB first level misses that hit in the second level +TLB. This event is only relevant if the core contains multiple DTLB levels. +.It Li LOAD_HIT_PRE +.Pq Event 4CH , Umask 01H +Counts load operations sent to the L1 data cache while a previous SSE +prefetch instruction to the same cache line has started prefetching but has +not yet finished. +.It Li L1D_PREFETCH.REQUESTS +.Pq Event 4EH , Umask 01H +Counts number of hardware prefetch requests dispatched out of the prefetch +FIFO. +.It Li L1D_PREFETCH.MISS +.Pq Event 4EH , Umask 02H +Counts number of hardware prefetch requests that miss the L1D. There are two +prefetchers in the L1D. A streamer, which predicts lines sequentially after +this one should be fetched, and the IP prefetcher that remembers access +patterns for the current instruction. The streamer prefetcher stops on an +L1D hit, while the IP prefetcher does not. +.It Li L1D_PREFETCH.TRIGGERS +.Pq Event 4EH , Umask 04H +Counts number of prefetch requests triggered by the Finite State Machine and +pushed into the prefetch FIFO. Some of the prefetch requests are dropped due +to overwrites or competition between the IP index prefetcher and streamer +prefetcher. The prefetch FIFO contains 4 entries. +.It Li L1D.REPL +.Pq Event 51H , Umask 01H +Counts the number of lines brought into the L1 data cache. +Counter 0, 1 only +.It Li L1D.M_REPL +.Pq Event 51H , Umask 02H +Counts the number of modified lines brought into the L1 data cache. +Counter 0, 1 only +.It Li L1D.M_EVICT +.Pq Event 51H , Umask 04H +Counts the number of modified lines evicted from the L1 data cache due to +replacement. 
+Counter 0, 1 only +.It Li L1D.M_SNOOP_EVICT +.Pq Event 51H , Umask 08H +Counts the number of modified lines evicted from the L1 data cache due to +snoop HITM intervention. +Counter 0, 1 only +.It Li L1D_CACHE_PREFETCH_LOCK_FB_HIT +.Pq Event 52H , Umask 01H +Counts the number of cacheable load lock speculated instructions accepted +into the fill buffer. +.It Li L1D_CACHE_LOCK_FB_HIT +.Pq Event 53H , Umask 01H +Counts the number of cacheable load lock speculated or retired instructions +accepted into the fill buffer. +.It Li CACHE_LOCK_CYCLES.L1D_L2 +.Pq Event 63H , Umask 01H +Cycle count during which the L1D and L2 are locked. A lock is asserted when +there is a locked memory access, due to uncacheable memory, a locked +operation that spans two cache lines, or a page walk from an uncacheable +page table. +Counter 0, 1 only. L1D and L2 locks have a very high performance penalty and +it is highly recommended to avoid such accesses. +.It Li CACHE_LOCK_CYCLES.L1D +.Pq Event 63H , Umask 02H +Counts the number of cycles that cacheline in the L1 data cache unit is +locked. +Counter 0, 1 only. +.It Li IO_TRANSACTIONS +.Pq Event 6CH , Umask 01H +Counts the number of completed I/O transactions. +.It Li L1I.HITS +.Pq Event 80H , Umask 01H +Counts all instruction fetches that hit the L1 instruction cache. +.It Li L1I.MISSES +.Pq Event 80H , Umask 02H +Counts all instruction fetches that miss the L1I cache. This includes +instruction cache misses, streaming buffer misses, victim cache misses and +uncacheable fetches. An instruction fetch miss is counted only once and not +once for every cycle it is outstanding. +.It Li L1I.READS +.Pq Event 80H , Umask 03H +Counts all instruction fetches, including uncacheable fetches that bypass +the L1I. +.It Li L1I.CYCLES_STALLED +.Pq Event 80H , Umask 04H +Cycle counts for which an instruction fetch stalls due to a L1I cache miss, +ITLB miss or ITLB fault. +.It Li LARGE_ITLB.HIT +.Pq Event 82H , Umask 01H +Counts number of large ITLB hits. 
+.It Li ITLB_MISSES.ANY +.Pq Event 85H , Umask 01H +Counts the number of misses in all levels of the ITLB which causes a page +walk. +.It Li ITLB_MISSES.WALK_COMPLETED +.Pq Event 85H , Umask 02H +Counts number of misses in all levels of the ITLB which resulted in a +completed page walk. +.It Li ILD_STALL.LCP +.Pq Event 87H , Umask 01H +Cycles Instruction Length Decoder stalls due to length changing prefixes: +66, 67 or REX.W (for EM64T) instructions which change the length of the +decoded instruction. +.It Li ILD_STALL.MRU +.Pq Event 87H , Umask 02H +Instruction Length Decoder stall cycles due to Branch Prediction Unit (BPU) +Most Recently Used (MRU) bypass. +.It Li ILD_STALL.IQ_FULL +.Pq Event 87H , Umask 04H +Stall cycles due to a full instruction queue. +.It Li ILD_STALL.REGEN +.Pq Event 87H , Umask 08H +Counts the number of regen stalls. +.It Li ILD_STALL.ANY +.Pq Event 87H , Umask 0FH +Counts any cycles the Instruction Length Decoder is stalled. +.It Li BR_INST_EXEC.COND +.Pq Event 88H , Umask 01H +Counts the number of conditional near branch instructions executed, but not +necessarily retired. +.It Li BR_INST_EXEC.DIRECT +.Pq Event 88H , Umask 02H +Counts all unconditional near branch instructions excluding calls and +indirect branches. +.It Li BR_INST_EXEC.INDIRECT_NON_CALL +.Pq Event 88H , Umask 04H +Counts the number of executed indirect near branch instructions that are not +calls. +.It Li BR_INST_EXEC.NON_CALLS +.Pq Event 88H , Umask 07H +Counts all non call near branch instructions executed, but not necessarily +retired. +.It Li BR_INST_EXEC.RETURN_NEAR +.Pq Event 88H , Umask 08H +Counts indirect near branches that have a return mnemonic. +.It Li BR_INST_EXEC.DIRECT_NEAR_CALL +.Pq Event 88H , Umask 10H +Counts unconditional near call branch instructions, excluding non call +branch, executed. +.It Li BR_INST_EXEC.INDIRECT_NEAR_CALL +.Pq Event 88H , Umask 20H +Counts indirect near calls, including both register and memory indirect, +executed.
+.It Li BR_INST_EXEC.NEAR_CALLS +.Pq Event 88H , Umask 30H +Counts all near call branches executed, but not necessarily retired. +.It Li BR_INST_EXEC.TAKEN +.Pq Event 88H , Umask 40H +Counts taken near branches executed, but not necessarily retired. +.It Li BR_INST_EXEC.ANY +.Pq Event 88H , Umask 7FH +Counts all near executed branches (not necessarily retired). This includes +only instructions and not micro-op branches. Frequent branching is not +necessarily a major performance issue. However frequent branch +mispredictions may be a problem. +.It Li BR_MISP_EXEC.COND +.Pq Event 89H , Umask 01H +Counts the number of mispredicted conditional near branch instructions +executed, but not necessarily retired. +.It Li BR_MISP_EXEC.DIRECT +.Pq Event 89H , Umask 02H +Counts mispredicted macro unconditional near branch instructions, excluding +calls and indirect branches (should always be 0). +.It Li BR_MISP_EXEC.INDIRECT_NON_CALL +.Pq Event 89H , Umask 04H +Counts the number of executed mispredicted indirect near branch instructions +that are not calls. +.It Li BR_MISP_EXEC.NON_CALLS +.Pq Event 89H , Umask 07H +Counts mispredicted non call near branches executed, but not necessarily +retired. +.It Li BR_MISP_EXEC.RETURN_NEAR +.Pq Event 89H , Umask 08H +Counts mispredicted indirect branches that have a near return mnemonic. +.It Li BR_MISP_EXEC.DIRECT_NEAR_CALL +.Pq Event 89H , Umask 10H +Counts mispredicted non-indirect near calls executed, (should always be 0). +.It Li BR_MISP_EXEC.INDIRECT_NEAR_CALL +.Pq Event 89H , Umask 20H +Counts mispredicted indirect near calls executed, including both register +and memory indirect. +.It Li BR_MISP_EXEC.NEAR_CALLS +.Pq Event 89H , Umask 30H +Counts all mispredicted near call branches executed, but not necessarily +retired. +.It Li BR_MISP_EXEC.TAKEN +.Pq Event 89H , Umask 40H +Counts executed mispredicted near branches that are taken, but not +necessarily retired.
+.It Li BR_MISP_EXEC.ANY +.Pq Event 89H , Umask 7FH +Counts the number of mispredicted near branch instructions that were +executed, but not necessarily retired. +.It Li RESOURCE_STALLS.ANY +.Pq Event A2H , Umask 01H +Counts the number of Allocator resource related stalls. Includes register +renaming buffer entries, memory buffer entries. In addition to resource +related stalls, this event counts some other events. Includes stalls arising +during branch misprediction recovery, such as if retirement of the +mispredicted branch is delayed and stalls arising while store buffer is +draining from synchronizing operations. +Does not include stalls due to SuperQ (off core) queue full, too many cache +misses, etc. +.It Li RESOURCE_STALLS.LOAD +.Pq Event A2H , Umask 02H +Counts the cycles of stall due to lack of load buffer for load operation. +.It Li RESOURCE_STALLS.RS_FULL +.Pq Event A2H , Umask 04H +This event counts the number of cycles when the number of instructions in +the pipeline waiting for execution reaches the limit the processor can +handle. A high count of this event indicates that there are long latency +operations in the pipe (possibly load and store operations that miss the L2 +cache, or instructions dependent upon instructions further down the pipeline +that have yet to retire). +When RS is full, new instructions can not enter the reservation station and +start execution. +.It Li RESOURCE_STALLS.STORE +.Pq Event A2H , Umask 08H +This event counts the number of cycles that a resource related stall will +occur due to the number of store instructions reaching the limit of the +pipeline, (i.e. all store buffers are used). The stall ends when a store +instruction commits its data to the cache or memory. +.It Li RESOURCE_STALLS.ROB_FULL +.Pq Event A2H , Umask 10H +Counts the cycles of stall due to re-order buffer full.
+.It Li RESOURCE_STALLS.FPCW +.Pq Event A2H , Umask 20H +Counts the number of cycles while execution was stalled due to writing the +floating-point unit (FPU) control word. +.It Li RESOURCE_STALLS.MXCSR +.Pq Event A2H , Umask 40H +Stalls due to the MXCSR register rename occurring too close to a previous +MXCSR rename. The MXCSR provides control and status for the MMX registers. +.It Li RESOURCE_STALLS.OTHER +.Pq Event A2H , Umask 80H +Counts the number of cycles while execution was stalled due to other +resource issues. +.It Li MACRO_INSTS.FUSIONS_DECODED +.Pq Event A6H , Umask 01H +Counts the number of instructions decoded that are macro-fused but not +necessarily executed or retired. +.It Li BACLEAR_FORCE_IQ +.Pq Event A7H , Umask 01H +Counts number of times a BACLEAR was forced by the Instruction Queue. The IQ +is also responsible for providing conditional branch prediction direction +based on a static scheme and dynamic data provided by the L2 Branch +Prediction Unit. If the conditional branch target is not found in the Target +Array and the IQ predicts that the branch is taken, then the IQ will force +the Branch Address Calculator to issue a BACLEAR. Each BACLEAR asserted by +the BAC generates approximately an 8 cycle bubble in the instruction fetch +pipeline. +.It Li LSD.UOPS +.Pq Event A8H , Umask 01H +Counts the number of micro-ops delivered by loop stream detector +Use cmask=1 and invert to count cycles +.It Li ITLB_FLUSH +.Pq Event AEH , Umask 01H +Counts the number of ITLB flushes +.It Li OFFCORE_REQUESTS.L1D_WRITEBACK +.Pq Event B0H , Umask 40H +Counts number of L1D writebacks to the uncore. +.It Li UOPS_EXECUTED.PORT0 +.Pq Event B1H , Umask 01H +Counts number of Uops executed that were issued on port 0. Port 0 handles +integer arithmetic, SIMD and FP add Uops. +.It Li UOPS_EXECUTED.PORT1 +.Pq Event B1H , Umask 02H +Counts number of Uops executed that were issued on port 1.
Port 1 handles +integer arithmetic, SIMD, integer shift, FP multiply and FP divide Uops. +.It Li UOPS_EXECUTED.PORT2_CORE +.Pq Event B1H , Umask 04H +Counts number of Uops executed that were issued on port 2. Port 2 handles +the load Uops. This is a core count only and can not be collected per +thread. +.It Li UOPS_EXECUTED.PORT3_CORE +.Pq Event B1H , Umask 08H +Counts number of Uops executed that were issued on port 3. Port 3 handles +store Uops. This is a core count only and can not be collected per thread. +.It Li UOPS_EXECUTED.PORT4_CORE +.Pq Event B1H , Umask 10H +Counts number of Uops executed that where issued on port 4. Port 4 handles +the value to be stored for the store Uops issued on port 3. This is a core +count only and can not be collected per thread. +.It Li UOPS_EXECUTED.CORE_ACTIVE_CYCLES_NO_PORT5 +.Pq Event B1H , Umask 1FH +Counts cycles when the Uops executed were issued from any ports except port +5. Use Cmask=1 for active cycles; Cmask=0 for weighted cycles; Use CMask=1, +Invert=1 to count P0-4 stalled cycles Use Cmask=1, Edge=1, Invert=1 to count +P0-4 stalls. +.It Li UOPS_EXECUTED.PORT5 +.Pq Event B1H , Umask 20H +Counts number of Uops executed that where issued on port 5. +.It Li UOPS_EXECUTED.CORE_ACTIVE_CYCLES +.Pq Event B1H , Umask 3FH +Counts cycles when the Uops are executing. Use Cmask=1 for active cycles; +Cmask=0 for weighted cycles; Use CMask=1, Invert=1 to count P0-4 stalled +cycles Use Cmask=1, Edge=1, Invert=1 to count P0-4 stalls. +.It Li UOPS_EXECUTED.PORT015 +.Pq Event B1H , Umask 40H +Counts number of Uops executed that where issued on port 0, 1, or 5. +use cmask=1, invert=1 to count stall cycles +.It Li UOPS_EXECUTED.PORT234 +.Pq Event B1H , Umask 80H +Counts number of Uops executed that where issued on port 2, 3, or 4. +.It Li OFFCORE_REQUESTS_SQ_FULL +.Pq Event B2H , Umask 01H +Counts number of cycles the SQ is full to handle off-core requests. 
+.It Li OFF_CORE_RESPONSE_0 +.Pq Event B7H , Umask 01H +see Section 30.6.1.3, Off-core Response Performance Monitoring in the +Processor Core +Requires programming MSR 01A6H +.It Li SNOOP_RESPONSE.HIT +.Pq Event B8H , Umask 01H +Counts HIT snoop response sent by this thread in response to a snoop +request. +.It Li SNOOP_RESPONSE.HITE +.Pq Event B8H , Umask 02H +Counts HIT E snoop response sent by this thread in response to a snoop +request. +.It Li SNOOP_RESPONSE.HITM +.Pq Event B8H , Umask 04H +Counts HIT M snoop response sent by this thread in response to a snoop +request. +.It Li OFF_CORE_RESPONSE_1 +.Pq Event BBH , Umask 01H +see Section 30.6.1.3, Off-core Response Performance Monitoring in the +Processor Core +Requires programming MSR 01A7H +.It Li INST_RETIRED.ANY_P +.Pq Event C0H , Umask 01H +See Table A-1 +Notes: INST_RETIRED.ANY is counted by a designated fixed counter. +INST_RETIRED.ANY_P is counted by a programmable counter and is an +architectural performance event. Event is supported if CPUID.A.EBX[1] = 0. +Counting: Faulting executions of GETSEC/VM entry/VM Exit/MWait will not +count as retired instructions. +.It Li INST_RETIRED.X87 +.Pq Event C0H , Umask 02H +Counts the number of floating point computational operations retired: +floating point computational operations executed by the assist handler and +sub-operations of complex floating point instructions like transcendental +instructions. +.It Li INST_RETIRED.MMX +.Pq Event C0H , Umask 04H +Counts the number of MMX instructions retired. +.It Li UOPS_RETIRED.ANY +.Pq Event C2H , Umask 01H +Counts the number of micro-ops retired, (macro-fused=1, micro-fused=2, +others=1; maximum count of 8 per cycle). Most instructions are composed of +one or two micro-ops. Some instructions are decoded into longer sequences +such as repeat instructions, floating point transcendental instructions, and +assists.
+Use cmask=1 and invert to count active cycles or stalled cycles +.It Li UOPS_RETIRED.RETIRE_SLOTS +.Pq Event C2H , Umask 02H +Counts the number of retirement slots used each cycle +.It Li UOPS_RETIRED.MACRO_FUSED +.Pq Event C2H , Umask 04H +Counts number of macro-fused uops retired. +.It Li MACHINE_CLEARS.CYCLES +.Pq Event C3H , Umask 01H +Counts the cycles machine clear is asserted. +.It Li MACHINE_CLEARS.MEM_ORDER +.Pq Event C3H , Umask 02H +Counts the number of machine clears due to memory order conflicts. +.It Li MACHINE_CLEARS.SMC +.Pq Event C3H , Umask 04H +Counts the number of times that a program writes to a code section. +Self-modifying code causes a severe penalty in all Intel 64 and IA-32 +processors. The modified cache line is written back to the L2 and L3 caches. +.It Li BR_INST_RETIRED.ALL_BRANCHES +.Pq Event C4H , Umask 00H +See Table A-1 +.It Li BR_INST_RETIRED.CONDITIONAL +.Pq Event C4H , Umask 01H +Counts the number of conditional branch instructions retired. +.It Li BR_INST_RETIRED.NEAR_CALL +.Pq Event C4H , Umask 02H +Counts the number of direct & indirect near unconditional calls retired +.It Li BR_INST_RETIRED.ALL_BRANCHES +.Pq Event C4H , Umask 04H +Counts the number of branch instructions retired +.It Li BR_MISP_RETIRED.ALL_BRANCHES +.Pq Event C5H , Umask 00H +See Table A-1 +.It Li BR_MISP_RETIRED.NEAR_CALL +.Pq Event C5H , Umask 02H +Counts mispredicted direct & indirect near unconditional retired calls. +.It Li SSEX_UOPS_RETIRED.PACKED_SINGLE +.Pq Event C7H , Umask 01H +Counts SIMD packed single-precision floating point Uops retired. +.It Li SSEX_UOPS_RETIRED.SCALAR_SINGLE +.Pq Event C7H , Umask 02H +Counts SIMD scalar single-precision floating point Uops retired. +.It Li SSEX_UOPS_RETIRED.PACKED_DOUBLE +.Pq Event C7H , Umask 04H +Counts SIMD packed double-precision floating point Uops retired. +.It Li SSEX_UOPS_RETIRED.SCALAR_DOUBLE +.Pq Event C7H , Umask 08H +Counts SIMD scalar double-precision floating point Uops retired.
+.It Li SSEX_UOPS_RETIRED.VECTOR_INTEGER +.Pq Event C7H , Umask 10H +Counts 128-bit SIMD vector integer Uops retired. +.It Li ITLB_MISS_RETIRED +.Pq Event C8H , Umask 20H +Counts the number of retired instructions that missed the ITLB when the +instruction was fetched. +.It Li MEM_LOAD_RETIRED.L1D_HIT +.Pq Event CBH , Umask 01H +Counts number of retired loads that hit the L1 data cache. +.It Li MEM_LOAD_RETIRED.L2_HIT +.Pq Event CBH , Umask 02H +Counts number of retired loads that hit the L2 data cache. +.It Li MEM_LOAD_RETIRED.L3_UNSHARED_HIT +.Pq Event CBH , Umask 04H +Counts number of retired loads that hit their own, unshared lines in the L3 +cache. +.It Li MEM_LOAD_RETIRED.OTHER_CORE_L2_HIT_HITM +.Pq Event CBH , Umask 08H +Counts number of retired loads that hit in a sibling core's L2 (on die +core). Since the L3 is inclusive of all cores on the package, this is an L3 +hit. This counts both clean or modified hits. +.It Li MEM_LOAD_RETIRED.L3_MISS +.Pq Event CBH , Umask 10H +Counts number of retired loads that miss the L3 cache. The load was +satisfied by a remote socket, local memory or an IOH. +.It Li MEM_LOAD_RETIRED.HIT_LFB +.Pq Event CBH , Umask 40H +Counts number of retired loads that miss the L1D and the address is located +in an allocated line fill buffer and will soon be committed to cache. This +is counting secondary L1D misses. +.It Li MEM_LOAD_RETIRED.DTLB_MISS +.Pq Event CBH , Umask 80H +Counts the number of retired loads that missed the DTLB. The DTLB miss is +not counted if the load operation causes a fault. This event counts loads +from cacheable memory only. The event does not count loads by software +prefetches. Counts both primary and secondary misses to the TLB. +.It Li FP_MMX_TRANS.TO_FP +.Pq Event CCH , Umask 01H +Counts the first floating-point instruction following any MMX instruction. +You can use this event to estimate the penalties for the transitions between +floating-point and MMX technology states. 
+.It Li FP_MMX_TRANS.TO_MMX +.Pq Event CCH , Umask 02H +Counts the first MMX instruction following a floating-point instruction. You +can use this event to estimate the penalties for the transitions between +floating-point and MMX technology states. +.It Li FP_MMX_TRANS.ANY +.Pq Event CCH , Umask 03H +Counts all transitions from floating point to MMX instructions and from MMX +instructions to floating point instructions. You can use this event to +estimate the penalties for the transitions between floating-point and MMX +technology states. +.It Li MACRO_INSTS.DECODED +.Pq Event D0H , Umask 01H +Counts the number of instructions decoded, (but not necessarily executed or +retired). +.It Li UOPS_DECODED.MS +.Pq Event D1H , Umask 02H +Counts the number of Uops decoded by the Microcode Sequencer, MS. The MS +delivers uops when the instruction is more than 4 uops long or a microcode +assist is occurring. +.It Li UOPS_DECODED.ESP_FOLDING +.Pq Event D1H , Umask 04H +Counts number of stack pointer (ESP) instructions decoded: push , pop , call +, ret, etc. ESP instructions do not generate a Uop to increment or decrement +ESP. Instead, they update an ESP_Offset register that keeps track of the +delta to the current value of the ESP register. +.It Li UOPS_DECODED.ESP_SYNC +.Pq Event D1H , Umask 08H +Counts number of stack pointer (ESP) sync operations where an ESP +instruction is corrected by adding the ESP offset register to the current +value of the ESP register. +.It Li RAT_STALLS.FLAGS +.Pq Event D2H , Umask 01H +Counts the number of cycles during which execution stalled due to several +reasons, one of which is a partial flag register stall. A partial register +stall may occur when two conditions are met: 1) an instruction modifies +some, but not all, of the flags in the flag register and 2) the next +instruction, which depends on flags, depends on flags that were not modified +by this instruction. 
+.It Li RAT_STALLS.REGISTERS +.Pq Event D2H , Umask 02H +This event counts the number of cycles instruction execution latency became +longer than the defined latency because the instruction used a register that +was partially written by previous instruction. +.It Li RAT_STALLS.ROB_READ_PORT +.Pq Event D2H , Umask 04H +Counts the number of cycles when ROB read port stalls occurred, which did +not allow new micro-ops to enter the out-of-order pipeline. Note that, at +this stage in the pipeline, additional stalls may occur at the same cycle +and prevent the stalled micro-ops from entering the pipe. In such a case, +micro-ops retry entering the execution pipe in the next cycle and the +ROB-read port stall is counted again. +.It Li RAT_STALLS.SCOREBOARD +.Pq Event D2H , Umask 08H +Counts the cycles where we stall due to microarchitecturally required +serialization. Microcode scoreboarding stalls. +.It Li RAT_STALLS.ANY +.Pq Event D2H , Umask 0FH +Counts all Register Allocation Table stall cycles due to: Cycles when ROB +read port stalls occurred, which did not allow new micro-ops to enter the +execution pipe. Cycles when partial register stalls occurred Cycles when +flag stalls occurred Cycles floating-point unit (FPU) status word stalls +occurred. To count each of these conditions separately use the events: +RAT_STALLS.ROB_READ_PORT, RAT_STALLS.PARTIAL, RAT_STALLS.FLAGS, and +RAT_STALLS.FPSW. +.It Li SEG_RENAME_STALLS +.Pq Event D4H , Umask 01H +Counts the number of stall cycles due to the lack of renaming resources for +the ES, DS, FS, and GS segment registers. If a segment is renamed but not +retired and a second update to the same segment occurs, a stall occurs in +the front-end of the pipeline until the renamed segment retires. +.It Li ES_REG_RENAMES +.Pq Event D5H , Umask 01H +Counts the number of times the ES segment register is renamed. +.It Li UOP_UNFUSION +.Pq Event DBH , Umask 01H +Counts unfusion events due to floating point exception to a fused uop. 
+.It Li BR_INST_DECODED +.Pq Event E0H , Umask 01H +Counts the number of branch instructions decoded. +.It Li BPU_MISSED_CALL_RET +.Pq Event E5H , Umask 01H +Counts number of times the Branch Prediction Unit missed predicting a call +or return branch. +.It Li BACLEAR.CLEAR +.Pq Event E6H , Umask 01H +Counts the number of times the front end is resteered, mainly when the +Branch Prediction Unit cannot provide a correct prediction and this is +corrected by the Branch Address Calculator at the front end. This can occur +if the code has many branches such that they cannot be consumed by the BPU. +Each BACLEAR asserted by the BAC generates approximately an 8 cycle bubble +in the instruction fetch pipeline. The effect on total execution time +depends on the surrounding code. +.It Li BACLEAR.BAD_TARGET +.Pq Event E6H , Umask 02H +Counts number of Branch Address Calculator clears (BACLEAR) asserted due to +conditional branch instructions in which there was a target hit but the +direction was wrong. Each BACLEAR asserted by the BAC generates +approximately an 8 cycle bubble in the instruction fetch pipeline. +.It Li BPU_CLEARS.EARLY +.Pq Event E8H , Umask 01H +Counts early (normal) Branch Prediction Unit clears: BPU predicted a taken +branch after incorrectly assuming that it was not taken. +The BPU clear leads to 2 cycle bubble in the Front End. +.It Li BPU_CLEARS.LATE +.Pq Event E8H , Umask 02H +Counts late Branch Prediction Unit clears due to Most Recently Used +conflicts. The BPU clear leads to a 3 cycle bubble in the Front End. +.It Li BPU_CLEARS.ANY +.Pq Event E8H , Umask 03H +Counts all BPU clears. +.It Li L2_TRANSACTIONS.LOAD +.Pq Event F0H , Umask 01H +Counts L2 load operations due to HW prefetch or demand loads. +.It Li L2_TRANSACTIONS.RFO +.Pq Event F0H , Umask 02H +Counts L2 RFO operations due to HW prefetch or demand RFOs. +.It Li L2_TRANSACTIONS.IFETCH +.Pq Event F0H , Umask 04H +Counts L2 instruction fetch operations due to HW prefetch or demand ifetch.
+.It Li L2_TRANSACTIONS.PREFETCH +.Pq Event F0H , Umask 08H +Counts L2 prefetch operations. +.It Li L2_TRANSACTIONS.L1D_WB +.Pq Event F0H , Umask 10H +Counts L1D writeback operations to the L2. +.It Li L2_TRANSACTIONS.FILL +.Pq Event F0H , Umask 20H +Counts L2 cache line fill operations due to load, RFO, L1D writeback or +prefetch. +.It Li L2_TRANSACTIONS.WB +.Pq Event F0H , Umask 40H +Counts L2 writeback operations to the L3. +.It Li L2_TRANSACTIONS.ANY +.Pq Event F0H , Umask 80H +Counts all L2 cache operations. +.It Li L2_LINES_IN.S_STATE +.Pq Event F1H , Umask 02H +Counts the number of cache lines allocated in the L2 cache in the S (shared) +state. +.It Li L2_LINES_IN.E_STATE +.Pq Event F1H , Umask 04H +Counts the number of cache lines allocated in the L2 cache in the E +(exclusive) state. +.It Li L2_LINES_IN.ANY +.Pq Event F1H , Umask 07H +Counts the number of cache lines allocated in the L2 cache. +.It Li L2_LINES_OUT.DEMAND_CLEAN +.Pq Event F2H , Umask 01H +Counts L2 clean cache lines evicted by a demand request. +.It Li L2_LINES_OUT.DEMAND_DIRTY +.Pq Event F2H , Umask 02H +Counts L2 dirty (modified) cache lines evicted by a demand request. +.It Li L2_LINES_OUT.PREFETCH_CLEAN +.Pq Event F2H , Umask 04H +Counts L2 clean cache line evicted by a prefetch request. +.It Li L2_LINES_OUT.PREFETCH_DIRTY +.Pq Event F2H , Umask 08H +Counts L2 modified cache line evicted by a prefetch request. +.It Li L2_LINES_OUT.ANY +.Pq Event F2H , Umask 0FH +Counts all L2 cache lines evicted for any reason. +.It Li SQ_MISC.SPLIT_LOCK +.Pq Event F4H , Umask 10H +Counts the number of SQ lock splits across a cache line. +.It Li SQ_FULL_STALL_CYCLES +.Pq Event F6H , Umask 01H +Counts cycles the Super Queue is full. Neither of the threads on this core +will be able to access the uncore. +.It Li FP_ASSIST.ALL +.Pq Event F7H , Umask 01H +Counts the number of floating point operations executed that required +micro-code assist intervention. 
Assists are required in the following cases: +SSE instructions, (Denormal input when the DAZ flag is off or Underflow +result when the FTZ flag is off): x87 instructions, (NaN or denormal are +loaded to a register or used as input from memory, Division by 0 or +Underflow output). +.It Li FP_ASSIST.OUTPUT +.Pq Event F7H , Umask 02H +Counts number of floating point micro-code assist when the output value +(destination register) is invalid. +.It Li FP_ASSIST.INPUT +.Pq Event F7H , Umask 04H +Counts number of floating point micro-code assist when the input value (one +of the source operands to an FP instruction) is invalid. +.It Li SIMD_INT_64.PACKED_MPY +.Pq Event FDH , Umask 01H +Counts number of SIMD integer 64 bit packed multiply operations. +.It Li SIMD_INT_64.PACKED_SHIFT +.Pq Event FDH , Umask 02H +Counts number of SIMD integer 64 bit packed shift operations. +.It Li SIMD_INT_64.PACK +.Pq Event FDH , Umask 04H +Counts number of SIMD integer 64 bit pack operations. +.It Li SIMD_INT_64.UNPACK +.Pq Event FDH , Umask 08H +Counts number of SIMD integer 64 bit unpack operations. +.It Li SIMD_INT_64.PACKED_LOGICAL +.Pq Event FDH , Umask 10H +Counts number of SIMD integer 64 bit logical operations. +.It Li SIMD_INT_64.PACKED_ARITH +.Pq Event FDH , Umask 20H +Counts number of SIMD integer 64 bit arithmetic operations. +.It Li SIMD_INT_64.SHUFFLE_MOVE +.Pq Event FDH , Umask 40H +Counts number of SIMD integer 64 bit shift or move operations. +.El +.Ss Event Specifiers (Programmable PMCs) +Core i7 and Xeon 5500 programmable PMCs support the following events as of the +June 2009 document (removed in December 2009): +.Bl -tag -width indent +.It Li SB_FORWARD.ANY +.Pq Event 02H , Umask 01H +Counts the number of store forwards. +.It Li LOAD_BLOCK.STD +.Pq Event 03H , Umask 01H +Counts the number of loads blocked by a preceding store with unknown data. +.It Li LOAD_BLOCK.ADDRESS_OFFSET +.Pq Event 03H , Umask 04H +Counts the number of loads blocked by a preceding store address.
+.It Li LOAD_BLOCK.ADDRESS_OFFSET +.Pq Event 01H , Umask 04H +Counts the cycles of store buffer drains. +.It Li MISALIGN_MEM_REF.LOAD +.Pq Event 05H , Umask 01H +Counts the number of misaligned load references +.It Li MISALIGN_MEM_REF.STORE +.Pq Event 05H , Umask 02H +Counts the number of misaligned store references +.It Li MISALIGN_MEM_REF.ANY +.Pq Event 05H , Umask 03H +Counts the number of misaligned memory references +.It Li STORE_BLOCKS.NOT_STA +.Pq Event 06H , Umask 01H +This event counts the number of load operations delayed caused by preceding +stores whose addresses are known but whose data is unknown, and preceding +stores that conflict with the load but which incompletely overlap the load. +.It Li STORE_BLOCKS.STA +.Pq Event 06H , Umask 02H +This event counts load operations delayed caused by preceding stores whose +addresses are unknown (STA block). +.It Li STORE_BLOCKS.ANY +.Pq Event 06H , Umask 0FH +All loads delayed due to store blocks +.It Li MEMORY_DISAMBIGURATION.RESET +.Pq Event 09H , Umask 01H +Counts memory disambiguration reset cycles +.It Li MEMORY_DISAMBIGURATION.SUCCESS +.Pq Event 09H , Umask 02H +Counts the number of loads that memory disambiguration succeeded +.It Li MEMORY_DISAMBIGURATION.WATCHDOG +.Pq Event 09H , Umask 04H +Counts the number of times the memory disambiguration watchdog kicked in. +.It Li MEMORY_DISAMBIGURATION.WATCH_CYCLES +.Pq Event 09H , Umask 08H +Counts the cycles that the memory disambiguration watchdog is active. +set invert=1, cmask = 1 +.It Li HW_INT.RCV +.Pq Event 1DH , Umask 01H +Number of interrupt received +.It Li HW_INT.CYCLES_MASKED +.Pq Event 1DH , Umask 02H +Number of cycles interrupt are masked +.It Li HW_INT.CYCLES_PENDING_AND_MASKED +.Pq Event 1DH , Umask 04H +Number of cycles interrupts are pending and masked +.It Li HW_INT.CYCLES_PENDING_AND_MASKED +.Pq Event 04H , Umask 04H +Counts number of L2 store RFO requests where the cache line to be loaded is +in the E (exclusive) state. 
The L1D prefetcher does not issue a RFO +prefetch. +This is a demand RFO request +.It Li HW_INT.CYCLES_PENDING_AND_MASKED +.Pq Event 27H , Umask 04H +LONGEST_LAT_CACH E.MISS +.It Li UOPS_DECODED.DEC0 +.Pq Event 3DH , Umask 01H +Counts micro-ops decoded by decoder 0. +.It Li UOPS_DECODED.DEC0 +.Pq Event 01H , Umask 01H +Counts L1 data cache store RFO requests where the cache line to be loaded is +in the I state. +Counter 0, 1 only +.It Li 0FH +.Pq Event 41H , Umask 41H +L1D_CACHE_ST.MESI +Counts L1 data cache store RFO requests. +Counter 0, 1 only +.It Li DTLB_MISSES.PDE_MISS +.Pq Event 49H , Umask 20H +Number of DTLB cache misses where the low part of the linear to physical +address translation was missed. +.It Li DTLB_MISSES.PDP_MISS +.Pq Event 49H , Umask 40H +Number of DTLB misses where the high part of the linear to physical address +translation was missed. +.It Li DTLB_MISSES.LARGE_WALK_COMPLETED +.Pq Event 49H , Umask 80H +Counts number of completed large page walks due to misses in the STLB. +.It Li SSE_MEM_EXEC.NTA +.Pq Event 4BH , Umask 01H +Counts number of SSE NTA prefetch/weakly-ordered instructions which missed +the L1 data cache. +.It Li SSE_MEM_EXEC.STREAMING_STORES +.Pq Event 4BH , Umask 08H +Counts number of SSE non temporal stores +.It Li SFENCE_CYCLES +.Pq Event 4DH , Umask 01H +Counts store fence cycles +.It Li EPT.EPDE_MISS +.Pq Event 4FH , Umask 02H +Counts Extended Page Directory Entry misses. The Extended Page Directory +cache is used by Virtual Machine operating systems while the guest operating +systems use the standard TLB caches. +.It Li EPT.EPDPE_HIT +.Pq Event 4FH , Umask 04H +Counts Extended Page Directory Pointer Entry hits. +.It Li EPT.EPDPE_MISS +.Pq Event 4FH , Umask 08H +Counts Extended Page Directory Pointer Entry misses. T +.It Li OFFCORE_REQUESTS_OUTSTANDING.DEMAND.READ_DATA +.Pq Event 60H , Umask 01H +Counts weighted cycles of offcore demand data read requests. Does not +include L2 prefetch requests. 
+counter 0 +.It Li OFFCORE_REQUESTS_OUTSTANDING.DEMAND.READ_CODE +.Pq Event 60H , Umask 02H +Counts weighted cycles of offcore demand code read requests. Does not +include L2 prefetch requests. +counter 0 +.It Li OFFCORE_REQUESTS_OUTSTANDING.DEMAND.RFO +.Pq Event 60H , Umask 04H +Counts weighted cycles of offcore demand RFO requests. Does not include L2 +prefetch requests. +counter 0 +.It Li OFFCORE_REQUESTS_OUTSTANDING.ANY.READ +.Pq Event 60H , Umask 08H +Counts weighted cycles of offcore read requests of any kind. Includes L2 +prefetch requests. +counter 0 +.It Li IFU_IVC.FULL +.Pq Event 81H , Umask 01H +Instruction Fetch unit victim cache full. +.It Li IFU_IVC.L1I_EVICTION +.Pq Event 81H , Umask 02H +L1 Instruction cache evictions. +.It Li L1I_OPPORTUNISTIC_HITS +.Pq Event 83H , Umask 01H +Opportunistic hits in streaming. +.It Li ITLB_MISSES.WALK_CYCLES +.Pq Event 85H , Umask 04H +Counts ITLB miss page walk cycles. +.It Li ITLB_MISSES.PMH_BUSY_CYCLES +.Pq Event 85H , Umask 04H +Counts PMH busy cycles. +.It Li ITLB_MISSES.STLB_HIT +.Pq Event 85H , Umask 10H +Counts the number of ITLB misses that hit in the second level TLB. +.It Li ITLB_MISSES.PDE_MISS +.Pq Event 85H , Umask 20H +Number of ITLB misses where the low part of the linear to physical address +translation was missed. +.It Li ITLB_MISSES.PDP_MISS +.Pq Event 85H , Umask 40H +Number of ITLB misses where the high part of the linear to physical address +translation was missed. +.It Li ITLB_MISSES.LARGE_WALK_COMPLETED +.Pq Event 85H , Umask 80H +Counts number of completed large page walks due to misses in the STLB. +.It Li ITLB_MISSES.LARGE_WALK_COMPLETED +.Pq Event 01H , Umask 80H +Counts number of offcore demand data read requests. Does not count L2 +prefetch requests. +.It Li OFFCORE_REQUESTS.DEMAND.READ_CODE +.Pq Event B0H , Umask 02H +Counts number of offcore demand code read requests. Does not count L2 +prefetch requests.
+.It Li OFFCORE_REQUESTS.DEMAND.RFO +.Pq Event B0H , Umask 04H +Counts number of offcore demand RFO requests. Does not count L2 prefetch +requests. +.It Li OFFCORE_REQUESTS.ANY.READ +.Pq Event B0H , Umask 08H +Counts number of offcore read requests. Includes L2 prefetch requests. +.It Li OFFCORE_REQUESTS.ANY.RFO +.Pq Event B0H , Umask 10H +Counts number of offcore RFO requests. Includes L2 prefetch requests. +.It Li OFFCORE_REQUESTS.UNCACHED_MEM +.Pq Event B0H , Umask 20H +Counts number of offcore uncached memory requests. +.It Li OFFCORE_REQUESTS.ANY +.Pq Event B0H , Umask 80H +Counts all offcore requests. +.It Li SNOOPQ_REQUESTS_OUTSTANDING.DATA +.Pq Event B3H , Umask 01H +Counts weighted cycles of snoopq requests for data. Counter 0 only +Use cmask=1 to count cycles not empty. +.It Li SNOOPQ_REQUESTS_OUTSTANDING.INVALIDATE +.Pq Event B3H , Umask 02H +Counts weighted cycles of snoopq invalidate requests. Counter 0 only +Use cmask=1 to count cycles not empty. +.It Li SNOOPQ_REQUESTS_OUTSTANDING.CODE +.Pq Event B3H , Umask 04H +Counts weighted cycles of snoopq requests for code. Counter 0 only +Use cmask=1 to count cycles not empty. +.It Li SNOOPQ_REQUESTS_OUTSTANDING.CODE +.Pq Event BAH , Umask 04H +Counts number of TPR reads +.It Li PIC_ACCESSES.TPR_WRITES +.Pq Event BAH , Umask 02H +Counts number of TPR writes +one or two micro-ops. Some instructions are decoded into longer sequences +.It Li MACHINE_CLEARS.FUSION_ASSIST +.Pq Event C3H , Umask 10H +Counts the number of macro-fusion assists +Counts SIMD packed single- precision floating point Uops retired. +.It Li BOGUS_BR +.Pq Event E4H , Umask 01H +Counts the number of bogus branches. 
+.It Li L2_HW_PREFETCH.HIT +.Pq Event F3H , Umask 01H +Count L2 HW prefetcher detector hits +.It Li L2_HW_PREFETCH.ALLOC +.Pq Event F3H , Umask 02H +Count L2 HW prefetcher allocations +.It Li L2_HW_PREFETCH.DATA_TRIGGER +.Pq Event F3H , Umask 04H +Count L2 HW data prefetcher triggered +.It Li L2_HW_PREFETCH.CODE_TRIGGER +.Pq Event F3H , Umask 08H +Count L2 HW code prefetcher triggered +.It Li L2_HW_PREFETCH.DCA_TRIGGER +.Pq Event F3H , Umask 10H +Count L2 HW DCA prefetcher triggered +.It Li L2_HW_PREFETCH.KICK_START +.Pq Event F3H , Umask 20H +Count L2 HW prefetcher kick started +.It Li SQ_MISC.PROMOTION +.Pq Event F4H , Umask 01H +Counts the number of L2 secondary misses that hit the Super Queue. +.It Li SQ_MISC.PROMOTION_POST_GO +.Pq Event F4H , Umask 02H +Counts the number of L2 secondary misses during the Super Queue filling L2. +.It Li SQ_MISC.LRU_HINTS +.Pq Event F4H , Umask 04H +Counts number of Super Queue LRU hints sent to L3. +.It Li SQ_MISC.FILL_DROPPED +.Pq Event F4H , Umask 08H +Counts the number of SQ L2 fills dropped due to L2 busy. +.It Li SEGMENT_REG_LOADS +.Pq Event F8H , Umask 01H +Counts number of segment register loads. +.El +.Sh SEE ALSO +.Xr pmc 3 , +.Xr pmc.atom 3 , +.Xr pmc.core 3 , +.Xr pmc.iaf 3 , +.Xr pmc.ucf 3 , +.Xr pmc.k7 3 , +.Xr pmc.k8 3 , +.Xr pmc.p4 3 , +.Xr pmc.p5 3 , +.Xr pmc.p6 3 , +.Xr pmc.corei7uc 3 , +.Xr pmc.westmere 3 , +.Xr pmc.westmereuc 3 , +.Xr pmc.tsc 3 , +.Xr pmc_cpuinfo 3 , +.Xr pmclog 3 , +.Xr hwpmc 4 +.Sh HISTORY +The +.Nm pmc +library first appeared in +.Fx 6.0 . +.Sh AUTHORS +The +.Lb libpmc +library was written by +.An "Joseph Koshy" +.Aq jkoshy@FreeBSD.org . 
Property changes on: stable/8/lib/libpmc/pmc.corei7.3 ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Index: stable/8/lib/libpmc/pmc.corei7uc.3 =================================================================== --- stable/8/lib/libpmc/pmc.corei7uc.3 (nonexistent) +++ stable/8/lib/libpmc/pmc.corei7uc.3 (revision 206702) @@ -0,0 +1,880 @@ +.\" Copyright (c) 2010 Fabien Thomas. All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" This software is provided by Joseph Koshy ``as is'' and +.\" any express or implied warranties, including, but not limited to, the +.\" implied warranties of merchantability and fitness for a particular purpose +.\" are disclaimed. in no event shall Joseph Koshy be liable +.\" for any direct, indirect, incidental, special, exemplary, or consequential +.\" damages (including, but not limited to, procurement of substitute goods +.\" or services; loss of use, data, or profits; or business interruption) +.\" however caused and on any theory of liability, whether in contract, strict +.\" liability, or tort (including negligence or otherwise) arising in any way +.\" out of the use of this software, even if advised of the possibility of +.\" such damage. 
+.\" +.\" $FreeBSD$ +.\" +.Dd March 24, 2010 +.Os +.Dt PMC.COREI7UC 3 +.Sh NAME +.Nm pmc.corei7uc +.Nd uncore measurement events for +.Tn Intel +.Tn Core i7 and Xeon 5500 +family CPUs +.Sh LIBRARY +.Lb libpmc +.Sh SYNOPSIS +.In pmc.h +.Sh DESCRIPTION +.Tn Intel +.Tn "Core i7" +CPUs contain PMCs conforming to version 2 of the +.Tn Intel +performance measurement architecture. +These CPUs contain 2 classes of PMCs: +.Bl -tag -width "Li PMC_CLASS_UCP" +.It Li PMC_CLASS_UCF +Fixed-function counters that count only one hardware event per counter. +.It Li PMC_CLASS_UCP +Programmable counters that may be configured to count one of a defined +set of hardware events. +.El +.Pp +The number of PMCs available in each class and their widths need to be +determined at run time by calling +.Xr pmc_cpuinfo 3 . +.Pp +Intel Core i7 and Xeon 5500 PMCs are documented in +.Rs +.%B "Intel(R) 64 and IA-32 Architectures Software Developer's Manual" +.%T "Volume 3B: System Programming Guide, Part 2" +.%N "Order Number: 253669-033US" +.%D December 2009 +.%Q "Intel Corporation" +.Re +.Ss COREI7 AND XEON 5500 UNCORE FIXED FUNCTION PMCS +These PMCs and their supported events are documented in +.Xr pmc.ucf 3 . +.Ss COREI7 AND XEON 5500 UNCORE PROGRAMMABLE PMCS +The programmable PMCs support the following capabilities: +.Bl -column "PMC_CAP_INTERRUPT" "Support" +.It Em Capability Ta Em Support +.It PMC_CAP_CASCADE Ta \&No +.It PMC_CAP_EDGE Ta Yes +.It PMC_CAP_INTERRUPT Ta \&No +.It PMC_CAP_INVERT Ta Yes +.It PMC_CAP_READ Ta Yes +.It PMC_CAP_PRECISE Ta \&No +.It PMC_CAP_SYSTEM Ta \&No +.It PMC_CAP_TAGGING Ta \&No +.It PMC_CAP_THRESHOLD Ta Yes +.It PMC_CAP_USER Ta \&No +.It PMC_CAP_WRITE Ta Yes +.El +.Ss Event Qualifiers +Event specifiers for these PMCs support the following common +qualifiers: +.Bl -tag -width indent +.It Li cmask= Ns Ar value +Configure the PMC to increment only if the number of configured +events measured in a cycle is greater than or equal to +.Ar value .
+.It Li edge +Configure the PMC to count the number of de-asserted to asserted +transitions of the conditions expressed by the other qualifiers. +If specified, the counter will increment only once whenever a +condition becomes true, irrespective of the number of clocks during +which the condition remains true. +.It Li inv +Invert the sense of comparison when the +.Dq Li cmask +qualifier is present, making the counter increment when the number of +events per cycle is less than the value specified by the +.Dq Li cmask +qualifier. +.El +.Ss Event Specifiers (Programmable PMCs) +Core i7 and Xeon 5500 uncore programmable PMCs support the following events: +.Bl -tag -width indent +.It Li GQ_CYCLES_FULL.READ_TRACKER +.Pq Event 00H , Umask 01H +Uncore cycles Global Queue read tracker is full. +.It Li GQ_CYCLES_FULL.WRITE_TRACKER +.Pq Event 00H , Umask 02H +Uncore cycles Global Queue write tracker is full. +.It Li GQ_CYCLES_FULL.PEER_PROBE_TRACKER +.Pq Event 00H , Umask 04H +Uncore cycles Global Queue peer probe tracker is full. The peer probe +tracker queue tracks snoops from the IOH and remote sockets. +.It Li GQ_CYCLES_NOT_EMPTY.READ_TRACKER +.Pq Event 01H , Umask 01H +Uncore cycles where Global Queue read tracker has at least one valid entry. +.It Li GQ_CYCLES_NOT_EMPTY.WRITE_TRACKER +.Pq Event 01H , Umask 02H +Uncore cycles where Global Queue write tracker has at least one valid entry. +.It Li GQ_CYCLES_NOT_EMPTY.PEER_PROBE_TRACKER +.Pq Event 01H , Umask 04H +Uncore cycles where Global Queue peer probe tracker has at least one valid +entry. The peer probe tracker queue tracks IOH and remote socket snoops. +.It Li GQ_ALLOC.READ_TRACKER +.Pq Event 03H , Umask 01H +Counts the number of read tracker allocate to deallocate entries. The GQ +read tracker allocate to deallocate occupancy count is divided by the count +to obtain the average read tracker latency.
+.It Li GQ_ALLOC.RT_L3_MISS +.Pq Event 03H , Umask 02H +Counts the number of GQ read tracker entries for which a full cache line read +has missed the L3. The GQ read tracker L3 miss to fill occupancy count is +divided by this count to obtain the average cache line read L3 miss latency. +The latency represents the time after which the L3 has determined that the +cache line has missed. The time between a GQ read tracker allocation and the +L3 determining that the cache line has missed is the average L3 hit latency. +The total L3 cache line read miss latency is the hit latency + L3 miss +latency. +.It Li GQ_ALLOC.RT_TO_L3_RESP +.Pq Event 03H , Umask 04H +Counts the number of GQ read tracker entries that are allocated in the read +tracker queue that hit or miss the L3. The GQ read tracker L3 hit occupancy +count is divided by this count to obtain the average L3 hit latency. +.It Li GQ_ALLOC.RT_TO_RTID_ACQUIRED +.Pq Event 03H , Umask 08H +Counts the number of GQ read tracker entries that are allocated in the read +tracker, have missed in the L3 and have not acquired a Request Transaction +ID. The GQ read tracker L3 miss to RTID acquired occupancy count is +divided by this count to obtain the average latency for a read L3 miss to +acquire an RTID. +.It Li GQ_ALLOC.WT_TO_RTID_ACQUIRED +.Pq Event 03H , Umask 10H +Counts the number of GQ write tracker entries that are allocated in the +write tracker, have missed in the L3 and have not acquired a Request +Transaction ID. The GQ write tracker L3 miss to RTID occupancy count is +divided by this count to obtain the average latency for a write L3 miss to +acquire an RTID. +.It Li GQ_ALLOC.WRITE_TRACKER +.Pq Event 03H , Umask 20H +Counts the number of GQ write tracker entries that are allocated in the +write tracker queue that miss the L3. The GQ write tracker occupancy count +is divided by this count to obtain the average L3 write miss latency.
+.It Li GQ_ALLOC.PEER_PROBE_TRACKER +.Pq Event 03H , Umask 40H +Counts the number of GQ peer probe tracker (snoop) entries that are +allocated in the peer probe tracker queue that miss the L3. The GQ peer +probe occupancy count is divided by this count to obtain the average L3 peer +probe miss latency. +.It Li GQ_DATA.FROM_QPI +.Pq Event 04H , Umask 01H +Cycles Global Queue Quickpath Interface input data port is busy importing +data from the Quickpath Interface. Each cycle the input port can transfer 8 +or 16 bytes of data. +.It Li GQ_DATA.FROM_QMC +.Pq Event 04H , Umask 02H +Cycles Global Queue Quickpath Memory Interface input data port is busy +importing data from the Quickpath Memory Interface. Each cycle the input +port can transfer 8 or 16 bytes of data. +.It Li GQ_DATA.FROM_L3 +.Pq Event 04H , Umask 04H +Cycles GQ L3 input data port is busy importing data from the Last Level +Cache. Each cycle the input port can transfer 32 bytes of data. +.It Li GQ_DATA.FROM_CORES_02 +.Pq Event 04H , Umask 08H +Cycles GQ Core 0 and 2 input data port is busy importing data from processor +cores 0 and 2. Each cycle the input port can transfer 32 bytes of data. +.It Li GQ_DATA.FROM_CORES_13 +.Pq Event 04H , Umask 10H +Cycles GQ Core 1 and 3 input data port is busy importing data from processor +cores 1 and 3. Each cycle the input port can transfer 32 bytes of data. +.It Li GQ_DATA.TO_QPI_QMC +.Pq Event 05H , Umask 01H +Cycles GQ QPI and QMC output data port is busy sending data to the Quickpath +Interface or Quickpath Memory Interface. Each cycle the output port can +transfer 32 bytes of data. +.It Li GQ_DATA.TO_L3 +.Pq Event 05H , Umask 02H +Cycles GQ L3 output data port is busy sending data to the Last Level Cache. +Each cycle the output port can transfer 32 bytes of data. +.It Li GQ_DATA.TO_CORES +.Pq Event 05H , Umask 04H +Cycles GQ Core output data port is busy sending data to the Cores. Each +cycle the output port can transfer 32 bytes of data. 
+.It Li SNP_RESP_TO_LOCAL_HOME.I_STATE +.Pq Event 06H , Umask 01H +Number of snoop responses to the local home that L3 does not have the +referenced cache line. +.It Li SNP_RESP_TO_LOCAL_HOME.S_STATE +.Pq Event 06H , Umask 02H +Number of snoop responses to the local home that L3 has the referenced line +cached in the S state. +.It Li SNP_RESP_TO_LOCAL_HOME.FWD_S_STATE +.Pq Event 06H , Umask 04H +Number of responses to code or data read snoops to the local home that the +L3 has the referenced cache line in the E state. The L3 cache line state is +changed to the S state and the line is forwarded to the local home in the S +state. +.It Li SNP_RESP_TO_LOCAL_HOME.FWD_I_STATE +.Pq Event 06H , Umask 08H +Number of responses to read invalidate snoops to the local home that the L3 +has the referenced cache line in the M state. The L3 cache line state is +invalidated and the line is forwarded to the local home in the M state. +.It Li SNP_RESP_TO_LOCAL_HOME.CONFLICT +.Pq Event 06H , Umask 10H +Number of conflict snoop responses sent to the local home. +.It Li SNP_RESP_TO_LOCAL_HOME.WB +.Pq Event 06H , Umask 20H +Number of responses to code or data read snoops to the local home that the +L3 has the referenced line cached in the M state. +.It Li SNP_RESP_TO_REMOTE_HOME.I_STATE +.Pq Event 07H , Umask 01H +Number of snoop responses to a remote home that L3 does not have the +referenced cache line. +.It Li SNP_RESP_TO_REMOTE_HOME.S_STATE +.Pq Event 07H , Umask 02H +Number of snoop responses to a remote home that L3 has the referenced line +cached in the S state. +.It Li SNP_RESP_TO_REMOTE_HOME.FWD_S_STATE +.Pq Event 07H , Umask 04H +Number of responses to code or data read snoops to a remote home that the L3 +has the referenced cache line in the E state. The L3 cache line state is +changed to the S state and the line is forwarded to the remote home in the S +state. 
+.It Li SNP_RESP_TO_REMOTE_HOME.FWD_I_STATE +.Pq Event 07H , Umask 08H +Number of responses to read invalidate snoops to a remote home that the L3 +has the referenced cache line in the M state. The L3 cache line state is +invalidated and the line is forwarded to the remote home in the M state. +.It Li SNP_RESP_TO_REMOTE_HOME.CONFLICT +.Pq Event 07H , Umask 10H +Number of conflict snoop responses sent to the local home. +.It Li SNP_RESP_TO_REMOTE_HOME.WB +.Pq Event 07H , Umask 20H +Number of responses to code or data read snoops to a remote home that the L3 +has the referenced line cached in the M state. +.It Li SNP_RESP_TO_REMOTE_HOME.HITM +.Pq Event 07H , Umask 24H +Number of HITM snoop responses to a remote home +.It Li L3_HITS.READ +.Pq Event 08H , Umask 01H +Number of code read, data read and RFO requests that hit in the L3 +.It Li L3_HITS.WRITE +.Pq Event 08H , Umask 02H +Number of writeback requests that hit in the L3. Writebacks from the cores +will always result in L3 hits due to the inclusive property of the L3. +.It Li L3_HITS.PROBE +.Pq Event 08H , Umask 04H +Number of snoops from IOH or remote sockets that hit in the L3. +.It Li L3_HITS.ANY +.Pq Event 08H , Umask 03H +Number of reads and writes that hit the L3. +.It Li L3_MISS.READ +.Pq Event 09H , Umask 01H +Number of code read, data read and RFO requests that miss the L3. +.It Li L3_MISS.WRITE +.Pq Event 09H , Umask 02H +Number of writeback requests that miss the L3. Should always be zero as +writebacks from the cores will always result in L3 hits due to the inclusive +property of the L3. +.It Li L3_MISS.PROBE +.Pq Event 09H , Umask 04H +Number of snoops from IOH or remote sockets that miss the L3. +.It Li L3_MISS.ANY +.Pq Event 09H , Umask 03H +Number of reads and writes that miss the L3. +.It Li L3_LINES_IN.M_STATE +.Pq Event 0AH , Umask 01H +Counts the number of L3 lines allocated in M state. 
The only time a cache +line is allocated in the M state is when the line was forwarded in M state +due to a Snoop Read Invalidate Own request. +.It Li L3_LINES_IN.E_STATE +.Pq Event 0AH , Umask 02H +Counts the number of L3 lines allocated in E state. +.It Li L3_LINES_IN.S_STATE +.Pq Event 0AH , Umask 04H +Counts the number of L3 lines allocated in S state. +.It Li L3_LINES_IN.F_STATE +.Pq Event 0AH , Umask 08H +Counts the number of L3 lines allocated in F state. +.It Li L3_LINES_IN.ANY +.Pq Event 0AH , Umask 0FH +Counts the number of L3 lines allocated in any state. +.It Li L3_LINES_OUT.M_STATE +.Pq Event 0BH , Umask 01H +Counts the number of L3 lines victimized that were in the M state. When the +victim cache line is in M state, the line is written to its home cache agent +which can be either local or remote. +.It Li L3_LINES_OUT.E_STATE +.Pq Event 0BH , Umask 02H +Counts the number of L3 lines victimized that were in the E state. +.It Li L3_LINES_OUT.S_STATE +.Pq Event 0BH , Umask 04H +Counts the number of L3 lines victimized that were in the S state. +.It Li L3_LINES_OUT.I_STATE +.Pq Event 0BH , Umask 08H +Counts the number of L3 lines victimized that were in the I state. +.It Li L3_LINES_OUT.F_STATE +.Pq Event 0BH , Umask 10H +Counts the number of L3 lines victimized that were in the F state. +.It Li L3_LINES_OUT.ANY +.Pq Event 0BH , Umask 1FH +Counts the number of L3 lines victimized in any state. +.It Li QHL_REQUESTS.IOH_READS +.Pq Event 20H , Umask 01H +Counts number of Quickpath Home Logic read requests from the IOH. +.It Li QHL_REQUESTS.IOH_WRITES +.Pq Event 20H , Umask 02H +Counts number of Quickpath Home Logic write requests from the IOH. +.It Li QHL_REQUESTS.REMOTE_READS +.Pq Event 20H , Umask 04H +Counts number of Quickpath Home Logic read requests from a remote socket. +.It Li QHL_REQUESTS.REMOTE_WRITES +.Pq Event 20H , Umask 08H +Counts number of Quickpath Home Logic write requests from a remote socket.
+.It Li QHL_REQUESTS.LOCAL_READS +.Pq Event 20H , Umask 10H +Counts number of Quickpath Home Logic read requests from the local socket. +.It Li QHL_REQUESTS.LOCAL_WRITES +.Pq Event 20H , Umask 20H +Counts number of Quickpath Home Logic write requests from the local socket. +.It Li QHL_CYCLES_FULL.IOH +.Pq Event 21H , Umask 01H +Counts uclk cycles all entries in the Quickpath Home Logic IOH are full. +.It Li QHL_CYCLES_FULL.REMOTE +.Pq Event 21H , Umask 02H +Counts uclk cycles all entries in the Quickpath Home Logic remote tracker +are full. +.It Li QHL_CYCLES_FULL.LOCAL +.Pq Event 21H , Umask 04H +Counts uclk cycles all entries in the Quickpath Home Logic local tracker are +full. +.It Li QHL_CYCLES_NOT_EMPTY.IOH +.Pq Event 22H , Umask 01H +Counts uclk cycles all entries in the Quickpath Home Logic IOH is busy. +.It Li QHL_CYCLES_NOT_EMPTY.REMOTE +.Pq Event 22H , Umask 02H +Counts uclk cycles all entries in the Quickpath Home Logic remote tracker is +busy. +.It Li QHL_CYCLES_NOT_EMPTY.LOCAL +.Pq Event 22H , Umask 04H +Counts uclk cycles all entries in the Quickpath Home Logic local tracker is +busy. +.It Li QHL_OCCUPANCY.IOH +.Pq Event 23H , Umask 01H +QHL IOH tracker allocate to deallocate read occupancy. +.It Li QHL_OCCUPANCY.REMOTE +.Pq Event 23H , Umask 02H +QHL remote tracker allocate to deallocate read occupancy. +.It Li QHL_OCCUPANCY.LOCAL +.Pq Event 23H , Umask 04H +QHL local tracker allocate to deallocate read occupancy. +.It Li QHL_ADDRESS_CONFLICTS.2WAY +.Pq Event 24H , Umask 02H +Counts number of QHL Active Address Table (AAT) entries that saw a max of 2 +conflicts. The AAT is a structure that tracks requests that are in conflict. +The requests themselves are in the home tracker entries. The count is +reported when an AAT entry deallocates. +.It Li QHL_ADDRESS_CONFLICTS.3WAY +.Pq Event 24H , Umask 04H +Counts number of QHL Active Address Table (AAT) entries that saw a max of 3 +conflicts. The AAT is a structure that tracks requests that are in conflict. 
+The requests themselves are in the home tracker entries. The count is +reported when an AAT entry deallocates. +.It Li QHL_CONFLICT_CYCLES.IOH +.Pq Event 25H , Umask 01H +Counts cycles the Quickpath Home Logic IOH Tracker contains two or more +requests with an address conflict. A max of 3 requests can be in conflict. +.It Li QHL_CONFLICT_CYCLES.REMOTE +.Pq Event 25H , Umask 02H +Counts cycles the Quickpath Home Logic Remote Tracker contains two or more +requests with an address conflict. A max of 3 requests can be in conflict. +.It Li QHL_CONFLICT_CYCLES.LOCAL +.Pq Event 25H , Umask 04H +Counts cycles the Quickpath Home Logic Local Tracker contains two or more +requests with an address conflict. A max of 3 requests can be in conflict. +.It Li QHL_TO_QMC_BYPASS +.Pq Event 26H , Umask 01H +Counts number of requests to the Quickpath Memory Controller that bypass the +Quickpath Home Logic. All local accesses can be bypassed. For remote +requests, only read requests can be bypassed. +.It Li QMC_NORMAL_FULL.READ.CH0 +.Pq Event 27H , Umask 01H +Uncore cycles all the entries in the DRAM channel 0 medium or low priority +queue are occupied with read requests. +.It Li QMC_NORMAL_FULL.READ.CH1 +.Pq Event 27H , Umask 02H +Uncore cycles all the entries in the DRAM channel 1 medium or low priority +queue are occupied with read requests. +.It Li QMC_NORMAL_FULL.READ.CH2 +.Pq Event 27H , Umask 04H +Uncore cycles all the entries in the DRAM channel 2 medium or low priority +queue are occupied with read requests. +.It Li QMC_NORMAL_FULL.WRITE.CH0 +.Pq Event 27H , Umask 08H +Uncore cycles all the entries in the DRAM channel 0 medium or low priority +queue are occupied with write requests. +.It Li QMC_NORMAL_FULL.WRITE.CH1 +.Pq Event 27H , Umask 10H +Counts cycles all the entries in the DRAM channel 1 medium or low priority +queue are occupied with write requests.
+.It Li QMC_NORMAL_FULL.WRITE.CH2 +.Pq Event 27H , Umask 20H +Uncore cycles all the entries in the DRAM channel 2 medium or low priority +queue are occupied with write requests. +.It Li QMC_ISOC_FULL.READ.CH0 +.Pq Event 28H , Umask 01H +Counts cycles all the entries in the DRAM channel 0 high priority queue are +occupied with isochronous read requests. +.It Li QMC_ISOC_FULL.READ.CH1 +.Pq Event 28H , Umask 02H +Counts cycles all the entries in the DRAM channel 1 high priority queue are +occupied with isochronous read requests. +.It Li QMC_ISOC_FULL.READ.CH2 +.Pq Event 28H , Umask 04H +Counts cycles all the entries in the DRAM channel 2 high priority queue are +occupied with isochronous read requests. +.It Li QMC_ISOC_FULL.WRITE.CH0 +.Pq Event 28H , Umask 08H +Counts cycles all the entries in the DRAM channel 0 high priority queue are +occupied with isochronous write requests. +.It Li QMC_ISOC_FULL.WRITE.CH1 +.Pq Event 28H , Umask 10H +Counts cycles all the entries in the DRAM channel 1 high priority queue are +occupied with isochronous write requests. +.It Li QMC_ISOC_FULL.WRITE.CH2 +.Pq Event 28H , Umask 20H +Counts cycles all the entries in the DRAM channel 2 high priority queue are +occupied with isochronous write requests. +.It Li QMC_BUSY.READ.CH0 +.Pq Event 29H , Umask 01H +Counts cycles where Quickpath Memory Controller has at least 1 outstanding +read request to DRAM channel 0. +.It Li QMC_BUSY.READ.CH1 +.Pq Event 29H , Umask 02H +Counts cycles where Quickpath Memory Controller has at least 1 outstanding +read request to DRAM channel 1. +.It Li QMC_BUSY.READ.CH2 +.Pq Event 29H , Umask 04H +Counts cycles where Quickpath Memory Controller has at least 1 outstanding +read request to DRAM channel 2. +.It Li QMC_BUSY.WRITE.CH0 +.Pq Event 29H , Umask 08H +Counts cycles where Quickpath Memory Controller has at least 1 outstanding +write request to DRAM channel 0.
+.It Li QMC_BUSY.WRITE.CH1 +.Pq Event 29H , Umask 10H +Counts cycles where Quickpath Memory Controller has at least 1 outstanding +write request to DRAM channel 1. +.It Li QMC_BUSY.WRITE.CH2 +.Pq Event 29H , Umask 20H +Counts cycles where Quickpath Memory Controller has at least 1 outstanding +write request to DRAM channel 2. +.It Li QMC_OCCUPANCY.CH0 +.Pq Event 2AH , Umask 01H +IMC channel 0 normal read request occupancy. +.It Li QMC_OCCUPANCY.CH1 +.Pq Event 2AH , Umask 02H +IMC channel 1 normal read request occupancy. +.It Li QMC_OCCUPANCY.CH2 +.Pq Event 2AH , Umask 04H +IMC channel 2 normal read request occupancy. +.It Li QMC_ISSOC_OCCUPANCY.CH0 +.Pq Event 2BH , Umask 01H +IMC channel 0 issoc read request occupancy. +.It Li QMC_ISSOC_OCCUPANCY.CH1 +.Pq Event 2BH , Umask 02H +IMC channel 1 issoc read request occupancy. +.It Li QMC_ISSOC_OCCUPANCY.CH2 +.Pq Event 2BH , Umask 04H +IMC channel 2 issoc read request occupancy. +.It Li QMC_ISSOC_READS.ANY +.Pq Event 2BH , Umask 07H +IMC issoc read request occupancy. +.It Li QMC_NORMAL_READS.CH0 +.Pq Event 2CH , Umask 01H +Counts the number of Quickpath Memory Controller channel 0 medium and low +priority read requests. The QMC channel 0 normal read occupancy divided by +this count provides the average QMC channel 0 read latency. +.It Li QMC_NORMAL_READS.CH1 +.Pq Event 2CH , Umask 02H +Counts the number of Quickpath Memory Controller channel 1 medium and low +priority read requests. The QMC channel 1 normal read occupancy divided by +this count provides the average QMC channel 1 read latency. +.It Li QMC_NORMAL_READS.CH2 +.Pq Event 2CH , Umask 04H +Counts the number of Quickpath Memory Controller channel 2 medium and low +priority read requests. The QMC channel 2 normal read occupancy divided by +this count provides the average QMC channel 2 read latency. +.It Li QMC_NORMAL_READS.ANY +.Pq Event 2CH , Umask 07H +Counts the number of Quickpath Memory Controller medium and low priority +read requests. 
The QMC normal read occupancy divided by this count provides +the average QMC read latency. +.It Li QMC_HIGH_PRIORITY_READS.CH0 +.Pq Event 2DH , Umask 01H +Counts the number of Quickpath Memory Controller channel 0 high priority +isochronous read requests. +.It Li QMC_HIGH_PRIORITY_READS.CH1 +.Pq Event 2DH , Umask 02H +Counts the number of Quickpath Memory Controller channel 1 high priority +isochronous read requests. +.It Li QMC_HIGH_PRIORITY_READS.CH2 +.Pq Event 2DH , Umask 04H +Counts the number of Quickpath Memory Controller channel 2 high priority +isochronous read requests. +.It Li QMC_HIGH_PRIORITY_READS.ANY +.Pq Event 2DH , Umask 07H +Counts the number of Quickpath Memory Controller high priority isochronous +read requests. +.It Li QMC_CRITICAL_PRIORITY_READS.CH0 +.Pq Event 2EH , Umask 01H +Counts the number of Quickpath Memory Controller channel 0 critical priority +isochronous read requests. +.It Li QMC_CRITICAL_PRIORITY_READS.CH1 +.Pq Event 2EH , Umask 02H +Counts the number of Quickpath Memory Controller channel 1 critical priority +isochronous read requests. +.It Li QMC_CRITICAL_PRIORITY_READS.CH2 +.Pq Event 2EH , Umask 04H +Counts the number of Quickpath Memory Controller channel 2 critical priority +isochronous read requests. +.It Li QMC_CRITICAL_PRIORITY_READS.ANY +.Pq Event 2EH , Umask 07H +Counts the number of Quickpath Memory Controller critical priority +isochronous read requests. +.It Li QMC_WRITES.FULL.CH0 +.Pq Event 2FH , Umask 01H +Counts number of full cache line writes to DRAM channel 0. +.It Li QMC_WRITES.FULL.CH1 +.Pq Event 2FH , Umask 02H +Counts number of full cache line writes to DRAM channel 1. +.It Li QMC_WRITES.FULL.CH2 +.Pq Event 2FH , Umask 04H +Counts number of full cache line writes to DRAM channel 2. +.It Li QMC_WRITES.FULL.ANY +.Pq Event 2FH , Umask 07H +Counts number of full cache line writes to DRAM. +.It Li QMC_WRITES.PARTIAL.CH0 +.Pq Event 2FH , Umask 08H +Counts number of partial cache line writes to DRAM channel 0. 
+.It Li QMC_WRITES.PARTIAL.CH1 +.Pq Event 2FH , Umask 10H +Counts number of partial cache line writes to DRAM channel 1. +.It Li QMC_WRITES.PARTIAL.CH2 +.Pq Event 2FH , Umask 20H +Counts number of partial cache line writes to DRAM channel 2. +.It Li QMC_WRITES.PARTIAL.ANY +.Pq Event 2FH , Umask 38H +Counts number of partial cache line writes to DRAM. +.It Li QMC_CANCEL.CH0 +.Pq Event 30H , Umask 01H +Counts number of DRAM channel 0 cancel requests. +.It Li QMC_CANCEL.CH1 +.Pq Event 30H , Umask 02H +Counts number of DRAM channel 1 cancel requests. +.It Li QMC_CANCEL.CH2 +.Pq Event 30H , Umask 04H +Counts number of DRAM channel 2 cancel requests. +.It Li QMC_CANCEL.ANY +.Pq Event 30H , Umask 07H +Counts number of DRAM cancel requests. +.It Li QMC_PRIORITY_UPDATES.CH0 +.Pq Event 31H , Umask 01H +Counts number of DRAM channel 0 priority updates. A priority update occurs +when an ISOC high or critical request is received by the QHL and there is a +matching request with normal priority that has already been issued to the +QMC. In this instance, the QHL will send a priority update to QMC to +expedite the request. +.It Li QMC_PRIORITY_UPDATES.CH1 +.Pq Event 31H , Umask 02H +Counts number of DRAM channel 1 priority updates. A priority update occurs +when an ISOC high or critical request is received by the QHL and there is a +matching request with normal priority that has already been issued to the +QMC. In this instance, the QHL will send a priority update to QMC to +expedite the request. +.It Li QMC_PRIORITY_UPDATES.CH2 +.Pq Event 31H , Umask 04H +Counts number of DRAM channel 2 priority updates. A priority update occurs +when an ISOC high or critical request is received by the QHL and there is a +matching request with normal priority that has already been issued to the +QMC. In this instance, the QHL will send a priority update to QMC to +expedite the request. +.It Li QMC_PRIORITY_UPDATES.ANY +.Pq Event 31H , Umask 07H +Counts number of DRAM priority updates. 
A priority update occurs when an +ISOC high or critical request is received by the QHL and there is a matching +request with normal priority that has already been issued to the QMC. In +this instance, the QHL will send a priority update to QMC to expedite the +request. +.It Li QHL_FRC_ACK_CNFLTS.LOCAL +.Pq Event 33H , Umask 04H +Counts number of Force Acknowledge Conflict messages sent by the Quickpath +Home Logic to the local home. +.It Li QPI_TX_STALLED_SINGLE_FLIT.HOME.LINK_0 +.Pq Event 40H , Umask 01H +Counts cycles the Quickpath outbound link 0 HOME virtual channel is stalled +due to lack of a VNA and VN0 credit. Note that this event does not filter +out when a flit would not have been selected for arbitration because another +virtual channel is getting arbitrated. +.It Li QPI_TX_STALLED_SINGLE_FLIT.SNOOP.LINK_0 +.Pq Event 40H , Umask 02H +Counts cycles the Quickpath outbound link 0 SNOOP virtual channel is stalled +due to lack of a VNA and VN0 credit. Note that this event does not filter +out when a flit would not have been selected for arbitration because another +virtual channel is getting arbitrated. +.It Li QPI_TX_STALLED_SINGLE_FLIT.NDR.LINK_0 +.Pq Event 40H , Umask 04H +Counts cycles the Quickpath outbound link 0 non-data response virtual +channel is stalled due to lack of a VNA and VN0 credit. Note that this event +does not filter out when a flit would not have been selected for arbitration +because another virtual channel is getting arbitrated. +.It Li QPI_TX_STALLED_SINGLE_FLIT.HOME.LINK_1 +.Pq Event 40H , Umask 08H +Counts cycles the Quickpath outbound link 1 HOME virtual channel is stalled +due to lack of a VNA and VN0 credit. Note that this event does not filter +out when a flit would not have been selected for arbitration because another +virtual channel is getting arbitrated. 
+.It Li QPI_TX_STALLED_SINGLE_FLIT.SNOOP.LINK_1 +.Pq Event 40H , Umask 10H +Counts cycles the Quickpath outbound link 1 SNOOP virtual channel is stalled +due to lack of a VNA and VN0 credit. Note that this event does not filter +out when a flit would not have been selected for arbitration because another +virtual channel is getting arbitrated. +.It Li QPI_TX_STALLED_SINGLE_FLIT.NDR.LINK_1 +.Pq Event 40H , Umask 20H +Counts cycles the Quickpath outbound link 1 non-data response virtual +channel is stalled due to lack of a VNA and VN0 credit. Note that this event +does not filter out when a flit would not have been selected for arbitration +because another virtual channel is getting arbitrated. +.It Li QPI_TX_STALLED_SINGLE_FLIT.LINK_0 +.Pq Event 40H , Umask 07H +Counts cycles the Quickpath outbound link 0 virtual channels are stalled due +to lack of a VNA and VN0 credit. Note that this event does not filter out +when a flit would not have been selected for arbitration because another +virtual channel is getting arbitrated. +.It Li QPI_TX_STALLED_SINGLE_FLIT.LINK_1 +.Pq Event 40H , Umask 38H +Counts cycles the Quickpath outbound link 1 virtual channels are stalled due +to lack of a VNA and VN0 credit. Note that this event does not filter out +when a flit would not have been selected for arbitration because another +virtual channel is getting arbitrated. +.It Li QPI_TX_STALLED_MULTI_FLIT.DRS.LINK_0 +.Pq Event 41H , Umask 01H +Counts cycles the Quickpath outbound link 0 Data ResponSe virtual channel is +stalled due to lack of VNA and VN0 credits. Note that this event does not +filter out when a flit would not have been selected for arbitration because +another virtual channel is getting arbitrated. +.It Li QPI_TX_STALLED_MULTI_FLIT.NCB.LINK_0 +.Pq Event 41H , Umask 02H +Counts cycles the Quickpath outbound link 0 Non-Coherent Bypass virtual +channel is stalled due to lack of VNA and VN0 credits. 
Note that this event +does not filter out when a flit would not have been selected for arbitration +because another virtual channel is getting arbitrated. +.It Li QPI_TX_STALLED_MULTI_FLIT.NCS.LINK_0 +.Pq Event 41H , Umask 04H +Counts cycles the Quickpath outbound link 0 Non-Coherent Standard virtual +channel is stalled due to lack of VNA and VN0 credits. Note that this event +does not filter out when a flit would not have been selected for arbitration +because another virtual channel is getting arbitrated. +.It Li QPI_TX_STALLED_MULTI_FLIT.DRS.LINK_1 +.Pq Event 41H , Umask 08H +Counts cycles the Quickpath outbound link 1 Data ResponSe virtual channel is +stalled due to lack of VNA and VN0 credits. Note that this event does not +filter out when a flit would not have been selected for arbitration because +another virtual channel is getting arbitrated. +.It Li QPI_TX_STALLED_MULTI_FLIT.NCB.LINK_1 +.Pq Event 41H , Umask 10H +Counts cycles the Quickpath outbound link 1 Non-Coherent Bypass virtual +channel is stalled due to lack of VNA and VN0 credits. Note that this event +does not filter out when a flit would not have been selected for arbitration +because another virtual channel is getting arbitrated. +.It Li QPI_TX_STALLED_MULTI_FLIT.NCS.LINK_1 +.Pq Event 41H , Umask 20H +Counts cycles the Quickpath outbound link 1 Non-Coherent Standard virtual +channel is stalled due to lack of VNA and VN0 credits. Note that this event +does not filter out when a flit would not have been selected for arbitration +because another virtual channel is getting arbitrated. +.It Li QPI_TX_STALLED_MULTI_FLIT.LINK_0 +.Pq Event 41H , Umask 07H +Counts cycles the Quickpath outbound link 0 virtual channels are stalled due +to lack of VNA and VN0 credits. Note that this event does not filter out +when a flit would not have been selected for arbitration because another +virtual channel is getting arbitrated. 
+.It Li QPI_TX_STALLED_MULTI_FLIT.LINK_1 +.Pq Event 41H , Umask 38H +Counts cycles the Quickpath outbound link 1 virtual channels are stalled due +to lack of VNA and VN0 credits. Note that this event does not filter out +when a flit would not have been selected for arbitration because another +virtual channel is getting arbitrated. +.It Li QPI_TX_HEADER.BUSY.LINK_0 +.Pq Event 42H , Umask 02H +Number of cycles that the header buffer in the Quickpath Interface outbound +link 0 is busy. +.It Li QPI_TX_HEADER.BUSY.LINK_1 +.Pq Event 42H , Umask 08H +Number of cycles that the header buffer in the Quickpath Interface outbound +link 1 is busy. +.It Li QPI_RX_NO_PPT_CREDIT.STALLS.LINK_0 +.Pq Event 43H , Umask 01H +Number of cycles that snoop packets incoming to the Quickpath Interface link +0 are stalled and not sent to the GQ because the GQ Peer Probe Tracker (PPT) +does not have any available entries. +.It Li QPI_RX_NO_PPT_CREDIT.STALLS.LINK_1 +.Pq Event 43H , Umask 02H +Number of cycles that snoop packets incoming to the Quickpath Interface link +1 are stalled and not sent to the GQ because the GQ Peer Probe Tracker (PPT) +does not have any available entries. +.It Li DRAM_OPEN.CH0 +.Pq Event 60H , Umask 01H +Counts number of DRAM Channel 0 open commands issued either for read or +write. To read or write data, the referenced DRAM page must first be opened. +.It Li DRAM_OPEN.CH1 +.Pq Event 60H , Umask 02H +Counts number of DRAM Channel 1 open commands issued either for read or +write. To read or write data, the referenced DRAM page must first be opened. +.It Li DRAM_OPEN.CH2 +.Pq Event 60H , Umask 04H +Counts number of DRAM Channel 2 open commands issued either for read or +write. To read or write data, the referenced DRAM page must first be opened. +.It Li DRAM_PAGE_CLOSE.CH0 +.Pq Event 61H , Umask 01H +DRAM channel 0 command issued to CLOSE a page due to page idle timer +expiration. Closing a page is done by issuing a precharge. 
+.It Li DRAM_PAGE_CLOSE.CH1 +.Pq Event 61H , Umask 02H +DRAM channel 1 command issued to CLOSE a page due to page idle timer +expiration. Closing a page is done by issuing a precharge. +.It Li DRAM_PAGE_CLOSE.CH2 +.Pq Event 61H , Umask 04H +DRAM channel 2 command issued to CLOSE a page due to page idle timer +expiration. Closing a page is done by issuing a precharge. +.It Li DRAM_PAGE_MISS.CH0 +.Pq Event 62H , Umask 01H +Counts the number of precharges (PRE) that were issued to DRAM channel 0 +because there was a page miss. A page miss refers to a situation in which a +page is currently open and another page from the same bank needs to be +opened. The new page experiences a page miss. Closing of the old page is +done by issuing a precharge. +.It Li DRAM_PAGE_MISS.CH1 +.Pq Event 62H , Umask 02H +Counts the number of precharges (PRE) that were issued to DRAM channel 1 +because there was a page miss. A page miss refers to a situation in which a +page is currently open and another page from the same bank needs to be +opened. The new page experiences a page miss. Closing of the old page is +done by issuing a precharge. +.It Li DRAM_PAGE_MISS.CH2 +.Pq Event 62H , Umask 04H +Counts the number of precharges (PRE) that were issued to DRAM channel 2 +because there was a page miss. A page miss refers to a situation in which a +page is currently open and another page from the same bank needs to be +opened. The new page experiences a page miss. Closing of the old page is +done by issuing a precharge. +.It Li DRAM_READ_CAS.CH0 +.Pq Event 63H , Umask 01H +Counts the number of times a read CAS command was issued on DRAM channel 0. +.It Li DRAM_READ_CAS.AUTOPRE_CH0 +.Pq Event 63H , Umask 02H +Counts the number of times a read CAS command was issued on DRAM channel 0 +where the command issued used the auto-precharge (auto page close) mode. +.It Li DRAM_READ_CAS.CH1 +.Pq Event 63H , Umask 04H +Counts the number of times a read CAS command was issued on DRAM channel 1. 
+.It Li DRAM_READ_CAS.AUTOPRE_CH1 +.Pq Event 63H , Umask 08H +Counts the number of times a read CAS command was issued on DRAM channel 1 +where the command issued used the auto-precharge (auto page close) mode. +.It Li DRAM_READ_CAS.CH2 +.Pq Event 63H , Umask 10H +Counts the number of times a read CAS command was issued on DRAM channel 2. +.It Li DRAM_READ_CAS.AUTOPRE_CH2 +.Pq Event 63H , Umask 20H +Counts the number of times a read CAS command was issued on DRAM channel 2 +where the command issued used the auto-precharge (auto page close) mode. +.It Li DRAM_WRITE_CAS.CH0 +.Pq Event 64H , Umask 01H +Counts the number of times a write CAS command was issued on DRAM channel 0. +.It Li DRAM_WRITE_CAS.AUTOPRE_CH0 +.Pq Event 64H , Umask 02H +Counts the number of times a write CAS command was issued on DRAM channel 0 +where the command issued used the auto-precharge (auto page close) mode. +.It Li DRAM_WRITE_CAS.CH1 +.Pq Event 64H , Umask 04H +Counts the number of times a write CAS command was issued on DRAM channel 1. +.It Li DRAM_WRITE_CAS.AUTOPRE_CH1 +.Pq Event 64H , Umask 08H +Counts the number of times a write CAS command was issued on DRAM channel 1 +where the command issued used the auto-precharge (auto page close) mode. +.It Li DRAM_WRITE_CAS.CH2 +.Pq Event 64H , Umask 10H +Counts the number of times a write CAS command was issued on DRAM channel 2. +.It Li DRAM_WRITE_CAS.AUTOPRE_CH2 +.Pq Event 64H , Umask 20H +Counts the number of times a write CAS command was issued on DRAM channel 2 +where the command issued used the auto-precharge (auto page close) mode. +.It Li DRAM_REFRESH.CH0 +.Pq Event 65H , Umask 01H +Counts number of DRAM channel 0 refresh commands. DRAM loses data content +over time. In order to keep correct data content, the data values have to be +refreshed periodically. +.It Li DRAM_REFRESH.CH1 +.Pq Event 65H , Umask 02H +Counts number of DRAM channel 1 refresh commands. DRAM loses data content +over time. 
In order to keep correct data content, the data values have to be +refreshed periodically. +.It Li DRAM_REFRESH.CH2 +.Pq Event 65H , Umask 04H +Counts number of DRAM channel 2 refresh commands. DRAM loses data content +over time. In order to keep correct data content, the data values have to be +refreshed periodically. +.It Li DRAM_PRE_ALL.CH0 +.Pq Event 66H , Umask 01H +Counts number of DRAM Channel 0 precharge-all (PREALL) commands that close +all open pages in a rank. PREALL is issued when the DRAM needs to be +refreshed or needs to go into a power down mode. +.It Li DRAM_PRE_ALL.CH1 +.Pq Event 66H , Umask 02H +Counts number of DRAM Channel 1 precharge-all (PREALL) commands that close +all open pages in a rank. PREALL is issued when the DRAM needs to be +refreshed or needs to go into a power down mode. +.It Li DRAM_PRE_ALL.CH2 +.Pq Event 66H , Umask 04H +Counts number of DRAM Channel 2 precharge-all (PREALL) commands that close +all open pages in a rank. PREALL is issued when the DRAM needs to be +refreshed or needs to go into a power down mode. +.El +.Sh SEE ALSO +.Xr pmc 3 , +.Xr pmc.atom 3 , +.Xr pmc.core 3 , +.Xr pmc.iaf 3 , +.Xr pmc.ucf 3 , +.Xr pmc.k7 3 , +.Xr pmc.k8 3 , +.Xr pmc.p4 3 , +.Xr pmc.p5 3 , +.Xr pmc.p6 3 , +.Xr pmc.corei7 3 , +.Xr pmc.westmere 3 , +.Xr pmc.westmereuc 3 , +.Xr pmc.tsc 3 , +.Xr pmc_cpuinfo 3 , +.Xr pmclog 3 , +.Xr hwpmc 4 +.Sh HISTORY +The +.Nm pmc +library first appeared in +.Fx 6.0 . +.Sh AUTHORS +The +.Lb libpmc +library was written by +.An "Joseph Koshy" +.Aq jkoshy@FreeBSD.org . 
Property changes on: stable/8/lib/libpmc/pmc.corei7uc.3 ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Index: stable/8/lib/libpmc/pmc.ucf.3 =================================================================== --- stable/8/lib/libpmc/pmc.ucf.3 (nonexistent) +++ stable/8/lib/libpmc/pmc.ucf.3 (revision 206702) @@ -0,0 +1,115 @@ +.\" Copyright (c) 2010 Fabien Thomas. All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" This software is provided by Joseph Koshy ``as is'' and +.\" any express or implied warranties, including, but not limited to, the +.\" implied warranties of merchantability and fitness for a particular purpose +.\" are disclaimed. in no event shall Joseph Koshy be liable +.\" for any direct, indirect, incidental, special, exemplary, or consequential +.\" damages (including, but not limited to, procurement of substitute goods +.\" or services; loss of use, data, or profits; or business interruption) +.\" however caused and on any theory of liability, whether in contract, strict +.\" liability, or tort (including negligence or otherwise) arising in any way +.\" out of the use of this software, even if advised of the possibility of +.\" such damage. +.\" +.\" $FreeBSD$ +.\" +.Dd March 30, 2010 +.Os +.Dt PMC.UCF 3 +.Sh NAME +.Nm pmc.ucf +.Nd measurement events for +.Tn Intel +uncore fixed function performance counters
+.Sh LIBRARY +.Lb libpmc +.Sh SYNOPSIS +.In pmc.h +.Sh DESCRIPTION +Each fixed-function PMC measures a specific hardware event. +The number of fixed-function PMCs implemented in a CPU can vary. +The number of fixed-function PMCs present can be determined at runtime +by using function +.Xr pmc_cpuinfo 3 . +.Pp +Intel uncore fixed-function PMCs are documented in +.Rs +.%B "Intel(R) 64 and IA-32 Architectures Software Developer's Manual" +.%T "Volume 3B: System Programming Guide, Part 2" +.%N "Order Number: 253669-033US" +.%D December 2009 +.%Q "Intel Corporation" +.Re +.Pp +.Ss PMC Capabilities +Fixed-function PMCs support the following capabilities: +.Bl -column "PMC_CAP_INTERRUPT" "Support" +.It Em Capability Ta Em Support +.It PMC_CAP_CASCADE Ta \&No +.It PMC_CAP_EDGE Ta \&No +.It PMC_CAP_INTERRUPT Ta \&No +.It PMC_CAP_INVERT Ta \&No +.It PMC_CAP_READ Ta Yes +.It PMC_CAP_PRECISE Ta \&No +.It PMC_CAP_SYSTEM Ta \&No +.It PMC_CAP_TAGGING Ta \&No +.It PMC_CAP_THRESHOLD Ta \&No +.It PMC_CAP_USER Ta \&No +.It PMC_CAP_WRITE Ta Yes +.El +.Ss Class Name Prefix +These PMCs are named using a class name prefix of +.Dq Li ucf- . +.Ss Event Specifiers (Fixed Function PMCs) +The fixed function PMCs are selectable using the following +event names: +.Bl -tag -width indent +.It Li UCLOCK +.Pq Fixed Function Counter 0 +The fixed-function uncore counter increments at the rate of the U-clock. +The frequency of the uncore clock domain can be determined from the uncore +clock ratio which is available in the PCI configuration space register at +offset C0H under device number 0 and Function 0. +.El +.Sh SEE ALSO +.Xr pmc 3 , +.Xr pmc.atom 3 , +.Xr pmc.core 3 , +.Xr pmc.core2 3 , +.Xr pmc.iaf 3 , +.Xr pmc.k7 3 , +.Xr pmc.k8 3 , +.Xr pmc.p4 3 , +.Xr pmc.p5 3 , +.Xr pmc.p6 3 , +.Xr pmc.corei7 3 , +.Xr pmc.corei7uc 3 , +.Xr pmc.westmere 3 , +.Xr pmc.westmereuc 3 , +.Xr pmc.tsc 3 , +.Xr pmc_cpuinfo 3 , +.Xr pmclog 3 , +.Xr hwpmc 4 +.Sh HISTORY +The +.Nm pmc +library first appeared in +.Fx 6.0 .
+.Sh AUTHORS +The +.Lb libpmc +library was written by +.An "Joseph Koshy" +.Aq jkoshy@FreeBSD.org . + + Property changes on: stable/8/lib/libpmc/pmc.ucf.3 ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Index: stable/8/lib/libpmc/pmc.westmere.3 =================================================================== --- stable/8/lib/libpmc/pmc.westmere.3 (nonexistent) +++ stable/8/lib/libpmc/pmc.westmere.3 (revision 206702) @@ -0,0 +1,1329 @@ +.\" Copyright (c) 2010 Fabien Thomas. All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" This software is provided by Joseph Koshy ``as is'' and +.\" any express or implied warranties, including, but not limited to, the +.\" implied warranties of merchantability and fitness for a particular purpose +.\" are disclaimed. in no event shall Joseph Koshy be liable +.\" for any direct, indirect, incidental, special, exemplary, or consequential +.\" damages (including, but not limited to, procurement of substitute goods +.\" or services; loss of use, data, or profits; or business interruption) +.\" however caused and on any theory of liability, whether in contract, strict +.\" liability, or tort (including negligence or otherwise) arising in any way +.\" out of the use of this software, even if advised of the possibility of +.\" such damage. 
+.\" +.\" $FreeBSD$ +.\" +.Dd March 24, 2010 +.Os +.Dt PMC.WESTMERE 3 +.Sh NAME +.Nm pmc.westmere +.Nd measurement events for +.Tn Intel +.Tn Westmere +family CPUs +.Sh LIBRARY +.Lb libpmc +.Sh SYNOPSIS +.In pmc.h +.Sh DESCRIPTION +.Tn Intel +.Tn "Westmere" +CPUs contain PMCs conforming to version 2 of the +.Tn Intel +performance measurement architecture. +These CPUs may contain up to three classes of PMCs: +.Bl -tag -width "Li PMC_CLASS_IAP" +.It Li PMC_CLASS_IAF +Fixed-function counters that count only one hardware event per counter. +.It Li PMC_CLASS_IAP +Programmable counters that may be configured to count one of a defined +set of hardware events. +.El +.Pp +The number of PMCs available in each class and their widths need to be +determined at run time by calling +.Xr pmc_cpuinfo 3 . +.Pp +Intel Westmere PMCs are documented in +.Rs +.%B "Intel(R) 64 and IA-32 Architectures Software Developer's Manual" +.%T "Volume 3B: System Programming Guide, Part 2" +.%N "Order Number: 253669-033US" +.%D December 2009 +.%Q "Intel Corporation" +.Re +.Ss WESTMERE FIXED FUNCTION PMCS +These PMCs and their supported events are documented in +.Xr pmc.iaf 3 . +.Ss WESTMERE PROGRAMMABLE PMCS +The programmable PMCs support the following capabilities: +.Bl -column "PMC_CAP_INTERRUPT" "Support" +.It Em Capability Ta Em Support +.It PMC_CAP_CASCADE Ta \&No +.It PMC_CAP_EDGE Ta Yes +.It PMC_CAP_INTERRUPT Ta Yes +.It PMC_CAP_INVERT Ta Yes +.It PMC_CAP_READ Ta Yes +.It PMC_CAP_PRECISE Ta \&No +.It PMC_CAP_SYSTEM Ta Yes +.It PMC_CAP_TAGGING Ta \&No +.It PMC_CAP_THRESHOLD Ta Yes +.It PMC_CAP_USER Ta Yes +.It PMC_CAP_WRITE Ta Yes +.El +.Ss Event Qualifiers +Event specifiers for these PMCs support the following common +qualifiers: +.Bl -tag -width indent +.It Li rsp= Ns Ar value +Configure the Off-core Response bits.
+.Bl -tag -width indent +.It Li DMND_DATA_RD +Counts the number of demand and DCU prefetch data reads of full +and partial cachelines as well as demand data page table entry +cacheline reads. Does not count L2 data read prefetches or +instruction fetches. +.It Li DMND_RFO +Counts the number of demand and DCU prefetch reads for ownership +(RFO) requests generated by a write to data cacheline. Does not +count L2 RFO. +.It Li DMND_IFETCH +Counts the number of demand and DCU prefetch instruction cacheline +reads. Does not count L2 code read prefetches. +.It Li WB +Counts the number of writeback (modified to exclusive) transactions. +.It Li PF_DATA_RD +Counts the number of data cacheline reads generated by L2 prefetchers. +.It Li PF_RFO +Counts the number of RFO requests generated by L2 prefetchers. +.It Li PF_IFETCH +Counts the number of code reads generated by L2 prefetchers. +.It Li OTHER +Counts one of the following transaction types, including L3 invalidate, +I/O, full or partial writes, WC or non-temporal stores, CLFLUSH, Fences, +lock, unlock, split lock. +.It Li UNCORE_HIT +L3 Hit: local or remote home requests that hit L3 cache in the uncore +with no coherency actions required (snooping). +.It Li OTHER_CORE_HIT_SNP +L3 Hit: local or remote home requests that hit L3 cache in the uncore +and was serviced by another core with a cross core snoop where no modified +copies were found (clean). +.It Li OTHER_CORE_HITM +L3 Hit: local or remote home requests that hit L3 cache in the uncore +and was serviced by another core with a cross core snoop where modified +copies were found (HITM). +.It Li REMOTE_CACHE_FWD +L3 Miss: local homed requests that missed the L3 cache and was serviced +by forwarded data following a cross package snoop where no modified +copies found. (Remote home requests are not counted) +.It Li REMOTE_DRAM +L3 Miss: remote home requests that missed the L3 cache and were serviced +by remote DRAM.
+.It Li LOCAL_DRAM +L3 Miss: local home requests that missed the L3 cache and were serviced +by local DRAM. +.It Li NON_DRAM +Non-DRAM requests that were serviced by IOH. +.El +.It Li cmask= Ns Ar value +Configure the PMC to increment only if the number of configured +events measured in a cycle is greater than or equal to +.Ar value . +.It Li edge +Configure the PMC to count the number of de-asserted to asserted +transitions of the conditions expressed by the other qualifiers. +If specified, the counter will increment only once whenever a +condition becomes true, irrespective of the number of clocks during +which the condition remains true. +.It Li inv +Invert the sense of comparison when the +.Dq Li cmask +qualifier is present, making the counter increment when the number of +events per cycle is less than the value specified by the +.Dq Li cmask +qualifier. +.It Li os +Configure the PMC to count events happening at processor privilege +level 0. +.It Li usr +Configure the PMC to count events occurring at privilege levels 1, 2 +or 3. +.El +.Pp +If neither of the +.Dq Li os +or +.Dq Li usr +qualifiers are specified, the default is to enable both. +.Ss Event Specifiers (Programmable PMCs) +Westmere programmable PMCs support the following events: +.Bl -tag -width indent +.It Li LOAD_BLOCK.OVERLAP_STORE +.Pq Event 03H , Umask 02H +Loads that partially overlap an earlier store +.It Li SB_DRAIN.ANY +.Pq Event 04H , Umask 07H +All Store buffer stall cycles +.It Li MISALIGN_MEMORY.STORE +.Pq Event 05H , Umask 02H +All store referenced with misaligned address +.It Li STORE_BLOCKS.AT_RET +.Pq Event 06H , Umask 04H +Counts number of loads delayed with at-Retirement block code. The following +loads need to be executed at retirement and wait for all senior stores on +the same thread to be drained: load splitting across 4K boundary (page +split), load accessing uncacheable (UC or USWC) memory, load lock, and load +with page table in UC or USWC memory region. 
+.It Li STORE_BLOCKS.L1D_BLOCK +.Pq Event 06H , Umask 08H +Cacheable loads delayed with L1D block code +.It Li PARTIAL_ADDRESS_ALIAS +.Pq Event 07H , Umask 01H +Counts false dependency due to partial address aliasing +.It Li DTLB_LOAD_MISSES.ANY +.Pq Event 08H , Umask 01H +Counts all load misses that cause a page walk +.It Li DTLB_LOAD_MISSES.WALK_COMPLETED +.Pq Event 08H , Umask 02H +Counts number of completed page walks due to load miss in the STLB. +.It Li DTLB_LOAD_MISSES.WALK_CYCLES +.Pq Event 08H , Umask 04H +Cycles PMH is busy with a page walk due to a load miss in the STLB. +.It Li DTLB_LOAD_MISSES.STLB_HIT +.Pq Event 08H , Umask 10H +Number of cache load STLB hits +.It Li DTLB_LOAD_MISSES.PDE_MISS +.Pq Event 08H , Umask 20H +Number of DTLB cache load misses where the low part of the linear to +physical address translation was missed. +.It Li MEM_INST_RETIRED.LOADS +.Pq Event 0BH , Umask 01H +Counts the number of instructions with an architecturally-visible load +retired on the architected path. +In conjunction with ld_lat facility +.It Li MEM_INST_RETIRED.STORES +.Pq Event 0BH , Umask 02H +Counts the number of instructions with an architecturally-visible store +retired on the architected path. +In conjunction with ld_lat facility +.It Li MEM_INST_RETIRED.LATENCY_ABOVE_THRESHOLD +.Pq Event 0BH , Umask 10H +Counts the number of instructions exceeding the latency specified with +ld_lat facility. +In conjunction with ld_lat facility +.It Li MEM_STORE_RETIRED.DTLB_MISS +.Pq Event 0CH , Umask 01H +The event counts the number of retired stores that missed the DTLB. The DTLB +miss is not counted if the store operation causes a fault. Does not count +prefetches. Counts both primary and secondary misses to the TLB +.It Li UOPS_ISSUED.ANY +.Pq Event 0EH , Umask 01H +Counts the number of Uops issued by the Register Allocation Table to the +Reservation Station, i.e. the UOPs issued from the front end to the back +end.
+.It Li UOPS_ISSUED.STALLED_CYCLES +.Pq Event 0EH , Umask 01H +Counts the number of cycles no Uops issued by the Register Allocation Table +to the Reservation Station, i.e. the UOPs issued from the front end to the +back end. +set invert=1, cmask = 1 +.It Li UOPS_ISSUED.FUSED +.Pq Event 0EH , Umask 02H +Counts the number of fused Uops that were issued from the Register +Allocation Table to the Reservation Station. +.It Li MEM_UNCORE_RETIRED.LOCAL_HITM +.Pq Event 0FH , Umask 02H +Load instructions retired that HIT modified data in sibling core (Precise +Event) +.It Li MEM_UNCORE_RETIRED.LOCAL_DRAM_AND_REMOTE_CACHE_HIT +.Pq Event 0FH , Umask 08H +Load instructions retired local dram and remote cache HIT data sources +(Precise Event) +.It Li MEM_UNCORE_RETIRED.LOCAL_DRAM +.Pq Event 0FH , Umask 10H +Load instructions retired with a data source of local DRAM or locally homed +remote cache HITM (Precise Event) +.It Li MEM_UNCORE_RETIRED.REMOTE_DRAM +.Pq Event 0FH , Umask 20H +Load instructions retired remote DRAM and remote home-remote cache HITM +(Precise Event) +.It Li MEM_UNCORE_RETIRED.UNCACHEABLE +.Pq Event 0FH , Umask 80H +Load instructions retired I/O (Precise Event) +.It Li FP_COMP_OPS_EXE.X87 +.Pq Event 10H , Umask 01H +Counts the number of FP Computational Uops Executed. The number of FADD, +FSUB, FCOM, FMULs, integer MULs and IMULs, FDIVs, FPREMs, FSQRTS, integer +DIVs, and IDIVs. This event does not distinguish an FADD used in the middle +of a transcendental flow from a separate FADD instruction. +.It Li FP_COMP_OPS_EXE.MMX +.Pq Event 10H , Umask 02H +Counts number of MMX Uops executed. +.It Li FP_COMP_OPS_EXE.SSE_FP +.Pq Event 10H , Umask 04H +Counts number of SSE and SSE2 FP uops executed. +.It Li FP_COMP_OPS_EXE.SSE2_INTEGER +.Pq Event 10H , Umask 08H +Counts number of SSE2 integer uops executed. +.It Li FP_COMP_OPS_EXE.SSE_FP_PACKED +.Pq Event 10H , Umask 10H +Counts number of SSE FP packed uops executed.
+.It Li FP_COMP_OPS_EXE.SSE_FP_SCALAR +.Pq Event 10H , Umask 20H +Counts number of SSE FP scalar uops executed. +.It Li FP_COMP_OPS_EXE.SSE_SINGLE_PRECISION +.Pq Event 10H , Umask 40H +Counts number of SSE* FP single precision uops executed. +.It Li FP_COMP_OPS_EXE.SSE_DOUBLE_PRECISION +.Pq Event 10H , Umask 80H +Counts number of SSE* FP double precision uops executed. +.It Li SIMD_INT_128.PACKED_MPY +.Pq Event 12H , Umask 01H +Counts number of 128 bit SIMD integer multiply operations. +.It Li SIMD_INT_128.PACKED_SHIFT +.Pq Event 12H , Umask 02H +Counts number of 128 bit SIMD integer shift operations. +.It Li SIMD_INT_128.PACK +.Pq Event 12H , Umask 04H +Counts number of 128 bit SIMD integer pack operations. +.It Li SIMD_INT_128.UNPACK +.Pq Event 12H , Umask 08H +Counts number of 128 bit SIMD integer unpack operations. +.It Li SIMD_INT_128.PACKED_LOGICAL +.Pq Event 12H , Umask 10H +Counts number of 128 bit SIMD integer logical operations. +.It Li SIMD_INT_128.PACKED_ARITH +.Pq Event 12H , Umask 20H +Counts number of 128 bit SIMD integer arithmetic operations. +.It Li SIMD_INT_128.SHUFFLE_MOVE +.Pq Event 12H , Umask 40H +Counts number of 128 bit SIMD integer shuffle and move operations. +.It Li LOAD_DISPATCH.RS +.Pq Event 13H , Umask 01H +Counts number of loads dispatched from the Reservation Station that bypass +the Memory Order Buffer. +.It Li LOAD_DISPATCH.RS_DELAYED +.Pq Event 13H , Umask 02H +Counts the number of delayed RS dispatches at the stage latch. If an RS +dispatch can not bypass to LB, it has another chance to dispatch from the +one-cycle delayed staging latch before it is written into the LB. +.It Li LOAD_DISPATCH.MOB +.Pq Event 13H , Umask 04H +Counts the number of loads dispatched from the Reservation Station to the +Memory Order Buffer. +.It Li LOAD_DISPATCH.ANY +.Pq Event 13H , Umask 07H +Counts all loads dispatched from the Reservation Station. 
+.It Li ARITH.CYCLES_DIV_BUSY +.Pq Event 14H , Umask 01H +Counts the number of cycles the divider is busy executing divide or square +root operations. The divide can be integer, X87 or Streaming SIMD Extensions +(SSE). The square root operation can be either X87 or SSE. +Set 'edge =1, invert=1, cmask=1' to count the number of divides. +Count may be incorrect When SMT is on +.It Li ARITH.MUL +.Pq Event 14H , Umask 02H +Counts the number of multiply operations executed. This includes integer as +well as floating point multiply operations but excludes DPPS mul and MPSAD. +Count may be incorrect When SMT is on +.It Li INST_QUEUE_WRITES +.Pq Event 17H , Umask 01H +Counts the number of instructions written into the instruction queue every +cycle. +.It Li INST_DECODED.DEC0 +.Pq Event 18H , Umask 01H +Counts number of instructions that require decoder 0 to be decoded. Usually, +this means that the instruction maps to more than 1 uop +.It Li TWO_UOP_INSTS_DECODED +.Pq Event 19H , Umask 01H +An instruction that generates two uops was decoded +.It Li INST_QUEUE_WRITE_CYCLES +.Pq Event 1EH , Umask 01H +This event counts the number of cycles during which instructions are written +to the instruction queue. Dividing this counter by the number of +instructions written to the instruction queue (INST_QUEUE_WRITES) yields the +average number of instructions decoded each cycle. If this number is less +than four and the pipe stalls, this indicates that the decoder is failing to +decode enough instructions per cycle to sustain the 4-wide pipeline. +If SSE* instructions that are 6 bytes or longer arrive one after another, +then front end throughput may limit execution speed. In such case, +.It Li LSD_OVERFLOW +.Pq Event 20H , Umask 01H +Number of loops that can not stream from the instruction queue. +.It Li L2_RQSTS.LD_HIT +.Pq Event 24H , Umask 01H +Counts number of loads that hit the L2 cache. L2 loads include both L1D +demand misses as well as L1D prefetches. 
 L2 loads can be rejected for +various reasons. Only non rejected loads are counted. +.It Li L2_RQSTS.LD_MISS +.Pq Event 24H , Umask 02H +Counts the number of loads that miss the L2 cache. L2 loads include both L1D +demand misses as well as L1D prefetches. +.It Li L2_RQSTS.LOADS +.Pq Event 24H , Umask 03H +Counts all L2 load requests. L2 loads include both L1D demand misses as well +as L1D prefetches. +.It Li L2_RQSTS.RFO_HIT +.Pq Event 24H , Umask 04H +Counts the number of store RFO requests that hit the L2 cache. L2 RFO +requests include both L1D demand RFO misses as well as L1D RFO prefetches. +Count includes WC memory requests, where the data is not fetched but the +permission to write the line is required. +.It Li L2_RQSTS.RFO_MISS +.Pq Event 24H , Umask 08H +Counts the number of store RFO requests that miss the L2 cache. L2 RFO +requests include both L1D demand RFO misses as well as L1D RFO prefetches. +.It Li L2_RQSTS.RFOS +.Pq Event 24H , Umask 0CH +Counts all L2 store RFO requests. L2 RFO requests include both L1D demand +RFO misses as well as L1D RFO prefetches. +.It Li L2_RQSTS.IFETCH_HIT +.Pq Event 24H , Umask 10H +Counts number of instruction fetches that hit the L2 cache. L2 instruction +fetches include both L1I demand misses as well as L1I instruction +prefetches. +.It Li L2_RQSTS.IFETCH_MISS +.Pq Event 24H , Umask 20H +Counts number of instruction fetches that miss the L2 cache. L2 instruction +fetches include both L1I demand misses as well as L1I instruction +prefetches. +.It Li L2_RQSTS.IFETCHES +.Pq Event 24H , Umask 30H +Counts all instruction fetches. L2 instruction fetches include both L1I +demand misses as well as L1I instruction prefetches. +.It Li L2_RQSTS.PREFETCH_HIT +.Pq Event 24H , Umask 40H +Counts L2 prefetch hits for both code and data. +.It Li L2_RQSTS.PREFETCH_MISS +.Pq Event 24H , Umask 80H +Counts L2 prefetch misses for both code and data. 
+.It Li L2_RQSTS.PREFETCHES +.Pq Event 24H , Umask C0H +Counts all L2 prefetches for both code and data. +.It Li L2_RQSTS.MISS +.Pq Event 24H , Umask AAH +Counts all L2 misses for both code and data. +.It Li L2_RQSTS.REFERENCES +.Pq Event 24H , Umask FFH +Counts all L2 requests for both code and data. +.It Li L2_DATA_RQSTS.DEMAND.I_STATE +.Pq Event 26H , Umask 01H +Counts number of L2 data demand loads where the cache line to be loaded is +in the I (invalid) state, i.e. a cache miss. L2 demand loads are both L1D +demand misses and L1D prefetches. +.It Li L2_DATA_RQSTS.DEMAND.S_STATE +.Pq Event 26H , Umask 02H +Counts number of L2 data demand loads where the cache line to be loaded is +in the S (shared) state. L2 demand loads are both L1D demand misses and L1D +prefetches. +.It Li L2_DATA_RQSTS.DEMAND.E_STATE +.Pq Event 26H , Umask 04H +Counts number of L2 data demand loads where the cache line to be loaded is +in the E (exclusive) state. L2 demand loads are both L1D demand misses and +L1D prefetches. +.It Li L2_DATA_RQSTS.DEMAND.M_STATE +.Pq Event 26H , Umask 08H +Counts number of L2 data demand loads where the cache line to be loaded is +in the M (modified) state. L2 demand loads are both L1D demand misses and +L1D prefetches. +.It Li L2_DATA_RQSTS.DEMAND.MESI +.Pq Event 26H , Umask 0FH +Counts all L2 data demand requests. L2 demand loads are both L1D demand +misses and L1D prefetches. +.It Li L2_DATA_RQSTS.PREFETCH.I_STATE +.Pq Event 26H , Umask 10H +Counts number of L2 prefetch data loads where the cache line to be loaded is +in the I (invalid) state, i.e. a cache miss. +.It Li L2_DATA_RQSTS.PREFETCH.S_STATE +.Pq Event 26H , Umask 20H +Counts number of L2 prefetch data loads where the cache line to be loaded is +in the S (shared) state. A prefetch RFO will miss on an S state line, while +a prefetch read will hit on an S state line. 
+.It Li L2_DATA_RQSTS.PREFETCH.E_STATE +.Pq Event 26H , Umask 40H +Counts number of L2 prefetch data loads where the cache line to be loaded is +in the E (exclusive) state. +.It Li L2_DATA_RQSTS.PREFETCH.M_STATE +.Pq Event 26H , Umask 80H +Counts number of L2 prefetch data loads where the cache line to be loaded is +in the M (modified) state. +.It Li L2_DATA_RQSTS.PREFETCH.MESI +.Pq Event 26H , Umask F0H +Counts all L2 prefetch requests. +.It Li L2_DATA_RQSTS.ANY +.Pq Event 26H , Umask FFH +Counts all L2 data requests. +.It Li L2_WRITE.RFO.I_STATE +.Pq Event 27H , Umask 01H +Counts number of L2 demand store RFO requests where the cache line to be +loaded is in the I (invalid) state, i.e. a cache miss. The L1D prefetcher +does not issue a RFO prefetch. +This is a demand RFO request +.It Li L2_WRITE.RFO.S_STATE +.Pq Event 27H , Umask 02H +Counts number of L2 store RFO requests where the cache line to be loaded is +in the S (shared) state. The L1D prefetcher does not issue a RFO prefetch. +This is a demand RFO request +.It Li L2_WRITE.RFO.M_STATE +.Pq Event 27H , Umask 08H +Counts number of L2 store RFO requests where the cache line to be loaded is +in the M (modified) state. The L1D prefetcher does not issue a RFO prefetch. +This is a demand RFO request +.It Li L2_WRITE.RFO.HIT +.Pq Event 27H , Umask 0EH +Counts number of L2 store RFO requests where the cache line to be loaded is +in either the S, E or M states. The L1D prefetcher does not issue a RFO +prefetch. +This is a demand RFO request +.It Li L2_WRITE.RFO.MESI +.Pq Event 27H , Umask 0FH +Counts all L2 store RFO requests. The L1D prefetcher does not issue a RFO +prefetch. +This is a demand RFO request +.It Li L2_WRITE.LOCK.I_STATE +.Pq Event 27H , Umask 10H +Counts number of L2 demand lock RFO requests where the cache line to be +loaded is in the I (invalid) state, i.e. a cache miss. 
+.It Li L2_WRITE.LOCK.S_STATE +.Pq Event 27H , Umask 20H +Counts number of L2 lock RFO requests where the cache line to be loaded is +in the S (shared) state. +.It Li L2_WRITE.LOCK.E_STATE +.Pq Event 27H , Umask 40H +Counts number of L2 demand lock RFO requests where the cache line to be +loaded is in the E (exclusive) state. +.It Li L2_WRITE.LOCK.M_STATE +.Pq Event 27H , Umask 80H +Counts number of L2 demand lock RFO requests where the cache line to be +loaded is in the M (modified) state. +.It Li L2_WRITE.LOCK.HIT +.Pq Event 27H , Umask E0H +Counts number of L2 demand lock RFO requests where the cache line to be +loaded is in either the S, E, or M state. +.It Li L2_WRITE.LOCK.MESI +.Pq Event 27H , Umask F0H +Counts all L2 demand lock RFO requests. +.It Li L1D_WB_L2.I_STATE +.Pq Event 28H , Umask 01H +Counts number of L1 writebacks to the L2 where the cache line to be written +is in the I (invalid) state, i.e. a cache miss. +.It Li L1D_WB_L2.S_STATE +.Pq Event 28H , Umask 02H +Counts number of L1 writebacks to the L2 where the cache line to be written +is in the S state. +.It Li L1D_WB_L2.E_STATE +.Pq Event 28H , Umask 04H +Counts number of L1 writebacks to the L2 where the cache line to be written +is in the E (exclusive) state. +.It Li L1D_WB_L2.M_STATE +.Pq Event 28H , Umask 08H +Counts number of L1 writebacks to the L2 where the cache line to be written +is in the M (modified) state. +.It Li L1D_WB_L2.MESI +.Pq Event 28H , Umask 0FH +Counts all L1 writebacks to the L2. +.It Li L3_LAT_CACHE.REFERENCE +.Pq Event 2EH , Umask 02H +Counts uncore Last Level Cache references. Because cache hierarchy, cache +sizes and other implementation-specific characteristics; value comparison to +estimate performance differences is not recommended. +see Table A-1 +.It Li L3_LAT_CACHE.MISS +.Pq Event 2EH , Umask 01H +Counts uncore Last Level Cache misses. 
Because cache hierarchy, cache sizes +and other implementation-specific characteristics; value comparison to +estimate performance differences is not recommended. +see Table A-1 +.It Li CPU_CLK_UNHALTED.THREAD_P +.Pq Event 3CH , Umask 00H +Counts the number of thread cycles while the thread is not in a halt state. +The thread enters the halt state when it is running the HLT instruction. The +core frequency may change from time to time due to power or thermal +throttling. +see Table A-1 +.It Li CPU_CLK_UNHALTED.REF_P +.Pq Event 3CH , Umask 01H +Increments at the frequency of TSC when not halted. +see Table A-1 +.It Li DTLB_MISSES.ANY +.Pq Event 49H , Umask 01H +Counts the number of misses in the STLB which causes a page walk. +.It Li DTLB_MISSES.WALK_COMPLETED +.Pq Event 49H , Umask 02H +Counts number of misses in the STLB which resulted in a completed page walk. +.It Li DTLB_MISSES.WALK_CYCLES +.Pq Event 49H , Umask 04H +Counts cycles of page walk due to misses in the STLB. +.It Li DTLB_MISSES.STLB_HIT +.Pq Event 49H , Umask 10H +Counts the number of DTLB first level misses that hit in the second level +TLB. This event is only relevant if the core contains multiple DTLB levels. +.It Li DTLB_MISSES.LARGE_WALK_COMPLETED +.Pq Event 49H , Umask 80H +Counts number of completed large page walks due to misses in the STLB. +.It Li LOAD_HIT_PRE +.Pq Event 4CH , Umask 01H +Counts load operations sent to the L1 data cache while a previous SSE +prefetch instruction to the same cache line has started prefetching but has +not yet finished. +.It Li L1D_PREFETCH.REQUESTS +.Pq Event 4EH , Umask 01H +Counts number of hardware prefetch requests dispatched out of the prefetch +FIFO. +.It Li L1D_PREFETCH.MISS +.Pq Event 4EH , Umask 02H +Counts number of hardware prefetch requests that miss the L1D. There are two +prefetchers in the L1D. 
A streamer, which predicts lines sequentially after +this one should be fetched, and the IP prefetcher that remembers access +patterns for the current instruction. The streamer prefetcher stops on an +L1D hit, while the IP prefetcher does not. +.It Li L1D_PREFETCH.TRIGGERS +.Pq Event 4EH , Umask 04H +Counts number of prefetch requests triggered by the Finite State Machine and +pushed into the prefetch FIFO. Some of the prefetch requests are dropped due +to overwrites or competition between the IP index prefetcher and streamer +prefetcher. The prefetch FIFO contains 4 entries. +.It Li EPT.WALK_CYCLES +.Pq Event 4FH , Umask 10H +Counts Extended Page walk cycles. +.It Li L1D.REPL +.Pq Event 51H , Umask 01H +Counts the number of lines brought into the L1 data cache. +Counter 0, 1 only +.It Li L1D.M_REPL +.Pq Event 51H , Umask 02H +Counts the number of modified lines brought into the L1 data cache. +Counter 0, 1 only +.It Li L1D.M_EVICT +.Pq Event 51H , Umask 04H +Counts the number of modified lines evicted from the L1 data cache due to +replacement. +Counter 0, 1 only +.It Li L1D.M_SNOOP_EVICT +.Pq Event 51H , Umask 08H +Counts the number of modified lines evicted from the L1 data cache due to +snoop HITM intervention. +Counter 0, 1 only +.It Li L1D_CACHE_PREFETCH_LOCK_FB_HIT +.Pq Event 52H , Umask 01H +Counts the number of cacheable load lock speculated instructions accepted +into the fill buffer. +.It Li L1D_CACHE_LOCK_FB_HIT +.Pq Event 53H , Umask 01H +Counts the number of cacheable load lock speculated or retired instructions +accepted into the fill buffer. +.It Li OFFCORE_REQUESTS_OUTSTANDING.DEMAND.READ_DATA +.Pq Event 60H , Umask 01H +Counts weighted cycles of offcore demand data read requests. Does not +include L2 prefetch requests. +counter 0 +.It Li OFFCORE_REQUESTS_OUTSTANDING.DEMAND.READ_CODE +.Pq Event 60H , Umask 02H +Counts weighted cycles of offcore demand code read requests. Does not +include L2 prefetch requests. 
+counter 0 +.It Li OFFCORE_REQUESTS_OUTSTANDING.DEMAND.RFO +.Pq Event 60H , Umask 04H +Counts weighted cycles of offcore demand RFO requests. Does not include L2 +prefetch requests. +counter 0 +.It Li OFFCORE_REQUESTS_OUTSTANDING.ANY.READ +.Pq Event 60H , Umask 08H +Counts weighted cycles of offcore read requests of any kind. Include L2 +prefetch requests. +counter 0 +.It Li CACHE_LOCK_CYCLES.L1D_L2 +.Pq Event 63H , Umask 01H +Cycle count during which the L1D and L2 are locked. A lock is asserted when +there is a locked memory access, due to uncacheable memory, a locked +operation that spans two cache lines, or a page walk from an uncacheable +page table. +Counter 0, 1 only. L1D and L2 locks have a very high performance penalty and +it is highly recommended to avoid such accesses. +.It Li CACHE_LOCK_CYCLES.L1D +.Pq Event 63H , Umask 02H +Counts the number of cycles that cacheline in the L1 data cache unit is +locked. +Counter 0, 1 only. +.It Li IO_TRANSACTIONS +.Pq Event 6CH , Umask 01H +Counts the number of completed I/O transactions. +.It Li L1I.HITS +.Pq Event 80H , Umask 01H +Counts all instruction fetches that hit the L1 instruction cache. +.It Li L1I.MISSES +.Pq Event 80H , Umask 02H +Counts all instruction fetches that miss the L1I cache. This includes +instruction cache misses, streaming buffer misses, victim cache misses and +uncacheable fetches. An instruction fetch miss is counted only once and not +once for every cycle it is outstanding. +.It Li L1I.READS +.Pq Event 80H , Umask 03H +Counts all instruction fetches, including uncacheable fetches that bypass +the L1I. +.It Li L1I.CYCLES_STALLED +.Pq Event 80H , Umask 04H +Cycle counts for which an instruction fetch stalls due to a L1I cache miss, +ITLB miss or ITLB fault. +.It Li LARGE_ITLB.HIT +.Pq Event 82H , Umask 01H +Counts number of large ITLB hits. +.It Li ITLB_MISSES.ANY +.Pq Event 85H , Umask 01H +Counts the number of misses in all levels of the ITLB which causes a page +walk. 
+.It Li ITLB_MISSES.WALK_COMPLETED +.Pq Event 85H , Umask 02H +Counts number of misses in all levels of the ITLB which resulted in a +completed page walk. +.It Li ITLB_MISSES.WALK_CYCLES +.Pq Event 85H , Umask 04H +Counts ITLB miss page walk cycles. +.It Li ITLB_MISSES.LARGE_WALK_COMPLETED +.Pq Event 85H , Umask 80H +Counts number of completed large page walks due to misses in the STLB. +.It Li ILD_STALL.LCP +.Pq Event 87H , Umask 01H +Cycles Instruction Length Decoder stalls due to length changing prefixes: +66, 67 or REX.W (for EM64T) instructions which change the length of the +decoded instruction. +.It Li ILD_STALL.MRU +.Pq Event 87H , Umask 02H +Instruction Length Decoder stall cycles due to Branch Prediction Unit (BPU) +Most Recently Used (MRU) bypass. +.It Li ILD_STALL.IQ_FULL +.Pq Event 87H , Umask 04H +Stall cycles due to a full instruction queue. +.It Li ILD_STALL.REGEN +.Pq Event 87H , Umask 08H +Counts the number of regen stalls. +.It Li ILD_STALL.ANY +.Pq Event 87H , Umask 0FH +Counts any cycles the Instruction Length Decoder is stalled. +.It Li BR_INST_EXEC.COND +.Pq Event 88H , Umask 01H +Counts the number of conditional near branch instructions executed, but not +necessarily retired. +.It Li BR_INST_EXEC.DIRECT +.Pq Event 88H , Umask 02H +Counts all unconditional near branch instructions excluding calls and +indirect branches. +.It Li BR_INST_EXEC.INDIRECT_NON_CALL +.Pq Event 88H , Umask 04H +Counts the number of executed indirect near branch instructions that are not +calls. +.It Li BR_INST_EXEC.NON_CALLS +.Pq Event 88H , Umask 07H +Counts all non call near branch instructions executed, but not necessarily +retired. +.It Li BR_INST_EXEC.RETURN_NEAR +.Pq Event 88H , Umask 08H +Counts indirect near branches that have a return mnemonic. +.It Li BR_INST_EXEC.DIRECT_NEAR_CALL +.Pq Event 88H , Umask 10H +Counts unconditional near call branch instructions, excluding non call +branch, executed. 
+.It Li BR_INST_EXEC.INDIRECT_NEAR_CALL +.Pq Event 88H , Umask 20H +Counts indirect near calls, including both register and memory indirect, +executed. +.It Li BR_INST_EXEC.NEAR_CALLS +.Pq Event 88H , Umask 30H +Counts all near call branches executed, but not necessarily retired. +.It Li BR_INST_EXEC.TAKEN +.Pq Event 88H , Umask 40H +Counts taken near branches executed, but not necessarily retired. +.It Li BR_INST_EXEC.ANY +.Pq Event 88H , Umask 7FH +Counts all near executed branches (not necessarily retired). This includes +only instructions and not micro-op branches. Frequent branching is not +necessarily a major performance issue. However frequent branch +mispredictions may be a problem. +.It Li BR_MISP_EXEC.COND +.Pq Event 89H , Umask 01H +Counts the number of mispredicted conditional near branch instructions +executed, but not necessarily retired. +.It Li BR_MISP_EXEC.DIRECT +.Pq Event 89H , Umask 02H +Counts mispredicted macro unconditional near branch instructions, excluding +calls and indirect branches (should always be 0). +.It Li BR_MISP_EXEC.INDIRECT_NON_CALL +.Pq Event 89H , Umask 04H +Counts the number of executed mispredicted indirect near branch instructions +that are not calls. +.It Li BR_MISP_EXEC.NON_CALLS +.Pq Event 89H , Umask 07H +Counts mispredicted non call near branches executed, but not necessarily +retired. +.It Li BR_MISP_EXEC.RETURN_NEAR +.Pq Event 89H , Umask 08H +Counts mispredicted indirect branches that have a near return mnemonic. +.It Li BR_MISP_EXEC.DIRECT_NEAR_CALL +.Pq Event 89H , Umask 10H +Counts mispredicted non-indirect near calls executed, (should always be 0). +.It Li BR_MISP_EXEC.INDIRECT_NEAR_CALL +.Pq Event 89H , Umask 20H +Counts mispredicted indirect near calls executed, including both register +and memory indirect. +.It Li BR_MISP_EXEC.NEAR_CALLS +.Pq Event 89H , Umask 30H +Counts all mispredicted near call branches executed, but not necessarily +retired. 
+.It Li BR_MISP_EXEC.TAKEN +.Pq Event 89H , Umask 40H +Counts executed mispredicted near branches that are taken, but not +necessarily retired. +.It Li BR_MISP_EXEC.ANY +.Pq Event 89H , Umask 7FH +Counts the number of mispredicted near branch instructions that were +executed, but not necessarily retired. +.It Li RESOURCE_STALLS.ANY +.Pq Event A2H , Umask 01H +Counts the number of Allocator resource related stalls. Includes register +renaming buffer entries, memory buffer entries. In addition to resource +related stalls, this event counts some other events. Includes stalls arising +during branch misprediction recovery, such as if retirement of the +mispredicted branch is delayed and stalls arising while store buffer is +draining from synchronizing operations. +Does not include stalls due to SuperQ (off core) queue full, too many cache +misses, etc. +.It Li RESOURCE_STALLS.LOAD +.Pq Event A2H , Umask 02H +Counts the cycles of stall due to lack of load buffer for load operation. +.It Li RESOURCE_STALLS.RS_FULL +.Pq Event A2H , Umask 04H +This event counts the number of cycles when the number of instructions in +the pipeline waiting for execution reaches the limit the processor can +handle. A high count of this event indicates that there are long latency +operations in the pipe (possibly load and store operations that miss the L2 +cache, or instructions dependent upon instructions further down the pipeline +that have yet to retire). +When RS is full, new instructions can not enter the reservation station and +start execution. +.It Li RESOURCE_STALLS.STORE +.Pq Event A2H , Umask 08H +This event counts the number of cycles that a resource related stall will +occur due to the number of store instructions reaching the limit of the +pipeline, (i.e. all store buffers are used). The stall ends when a store +instruction commits its data to the cache or memory. +.It Li RESOURCE_STALLS.ROB_FULL +.Pq Event A2H , Umask 10H +Counts the cycles of stall due to re-order buffer full. 
+.It Li RESOURCE_STALLS.FPCW +.Pq Event A2H , Umask 20H +Counts the number of cycles while execution was stalled due to writing the +floating-point unit (FPU) control word. +.It Li RESOURCE_STALLS.MXCSR +.Pq Event A2H , Umask 40H +Stalls due to the MXCSR register rename occurring too close to a previous +MXCSR rename. The MXCSR provides control and status for the MMX registers. +.It Li RESOURCE_STALLS.OTHER +.Pq Event A2H , Umask 80H +Counts the number of cycles while execution was stalled due to other +resource issues. +.It Li MACRO_INSTS.FUSIONS_DECODED +.Pq Event A6H , Umask 01H +Counts the number of instructions decoded that are macro-fused but not +necessarily executed or retired. +.It Li BACLEAR_FORCE_IQ +.Pq Event A7H , Umask 01H +Counts number of times a BACLEAR was forced by the Instruction Queue. The IQ +is also responsible for providing conditional branch prediction direction +based on a static scheme and dynamic data provided by the L2 Branch +Prediction Unit. If the conditional branch target is not found in the Target +Array and the IQ predicts that the branch is taken, then the IQ will force +the Branch Address Calculator to issue a BACLEAR. Each BACLEAR asserted by +the BAC generates approximately an 8 cycle bubble in the instruction fetch +pipeline. +.It Li LSD.UOPS +.Pq Event A8H , Umask 01H +Counts the number of micro-ops delivered by loop stream detector +Use cmask=1 and invert to count cycles +.It Li ITLB_FLUSH +.Pq Event AEH , Umask 01H +Counts the number of ITLB flushes +.It Li OFFCORE_REQUESTS.DEMAND.READ_DATA +.Pq Event B0H , Umask 01H +Counts number of offcore demand data read requests. Does not count L2 +prefetch requests. +.It Li OFFCORE_REQUESTS.DEMAND.READ_CODE +.Pq Event B0H , Umask 02H +Counts number of offcore demand code read requests. Does not count L2 +prefetch requests. +.It Li OFFCORE_REQUESTS.DEMAND.RFO +.Pq Event B0H , Umask 04H +Counts number of offcore demand RFO requests. Does not count L2 prefetch +requests. 
+.It Li OFFCORE_REQUESTS.ANY.READ +.Pq Event B0H , Umask 08H +Counts number of offcore read requests. Includes L2 prefetch requests. +.It Li OFFCORE_REQUESTS.ANY.RFO +.Pq Event B0H , Umask 10H +Counts number of offcore RFO requests. Includes L2 prefetch requests. +.It Li OFFCORE_REQUESTS.L1D_WRITEBACK +.Pq Event B0H , Umask 40H +Counts number of L1D writebacks to the uncore. +.It Li OFFCORE_REQUESTS.ANY +.Pq Event B0H , Umask 80H +Counts all offcore requests. +.It Li UOPS_EXECUTED.PORT0 +.Pq Event B1H , Umask 01H +Counts number of Uops executed that were issued on port 0. Port 0 handles +integer arithmetic, SIMD and FP add Uops. +.It Li UOPS_EXECUTED.PORT1 +.Pq Event B1H , Umask 02H +Counts number of Uops executed that were issued on port 1. Port 1 handles +integer arithmetic, SIMD, integer shift, FP multiply and FP divide Uops. +.It Li UOPS_EXECUTED.PORT2_CORE +.Pq Event B1H , Umask 04H +Counts number of Uops executed that were issued on port 2. Port 2 handles +the load Uops. This is a core count only and can not be collected per +thread. +.It Li UOPS_EXECUTED.PORT3_CORE +.Pq Event B1H , Umask 08H +Counts number of Uops executed that were issued on port 3. Port 3 handles +store Uops. This is a core count only and can not be collected per thread. +.It Li UOPS_EXECUTED.PORT4_CORE +.Pq Event B1H , Umask 10H +Counts number of Uops executed that were issued on port 4. Port 4 handles +the value to be stored for the store Uops issued on port 3. This is a core +count only and can not be collected per thread. +.It Li UOPS_EXECUTED.CORE_ACTIVE_CYCLES_NO_PORT5 +.Pq Event B1H , Umask 1FH +Counts number of cycles there are one or more uops being executed and were +issued on ports 0-4. This is a core count only and can not be collected per +thread. +.It Li UOPS_EXECUTED.PORT5 +.Pq Event B1H , Umask 20H +Counts number of Uops executed that were issued on port 5. 
+.It Li UOPS_EXECUTED.CORE_ACTIVE_CYCLES +.Pq Event B1H , Umask 3FH +Counts number of cycles there are one or more uops being executed on any +ports. This is a core count only and can not be collected per thread. +.It Li UOPS_EXECUTED.PORT015 +.Pq Event B1H , Umask 40H +Counts number of Uops executed that were issued on port 0, 1, or 5. +use cmask=1, invert=1 to count stall cycles +.It Li UOPS_EXECUTED.PORT234 +.Pq Event B1H , Umask 80H +Counts number of Uops executed that were issued on port 2, 3, or 4. +.It Li OFFCORE_REQUESTS_SQ_FULL +.Pq Event B2H , Umask 01H +Counts number of cycles the SQ is full to handle off-core requests. +.It Li SNOOPQ_REQUESTS_OUTSTANDING.DATA +.Pq Event B3H , Umask 01H +Counts weighted cycles of snoopq requests for data. Counter 0 only +Use cmask=1 to count cycles not empty. +.It Li SNOOPQ_REQUESTS_OUTSTANDING.INVALIDATE +.Pq Event B3H , Umask 02H +Counts weighted cycles of snoopq invalidate requests. Counter 0 only +Use cmask=1 to count cycles not empty. +.It Li SNOOPQ_REQUESTS_OUTSTANDING.CODE +.Pq Event B3H , Umask 04H +Counts weighted cycles of snoopq requests for code. Counter 0 only +Use cmask=1 to count cycles not empty. +.It Li SNOOPQ_REQUESTS.CODE +.Pq Event B4H , Umask 01H +Counts the number of snoop code requests +.It Li SNOOPQ_REQUESTS.DATA +.Pq Event B4H , Umask 02H +Counts the number of snoop data requests +.It Li SNOOPQ_REQUESTS.INVALIDATE +.Pq Event B4H , Umask 04H +Counts the number of snoop invalidate requests +.It Li OFF_CORE_RESPONSE_0 +.Pq Event B7H , Umask 01H +see Section 30.6.1.3, Off-core Response Performance Monitoring in the +Processor Core. +Requires programming MSR 01A6H +.It Li SNOOP_RESPONSE.HIT +.Pq Event B8H , Umask 01H +Counts HIT snoop response sent by this thread in response to a snoop +request. +.It Li SNOOP_RESPONSE.HITE +.Pq Event B8H , Umask 02H +Counts HIT E snoop response sent by this thread in response to a snoop +request. 
+.It Li SNOOP_RESPONSE.HITM +.Pq Event B8H , Umask 04H +Counts HIT M snoop response sent by this thread in response to a snoop +request. +.It Li OFF_CORE_RESPONSE_1 +.Pq Event BBH , Umask 01H +see Section 30.6.1.3, Off-core Response Performance Monitoring in the +Processor Core +Use MSR 01A7H +.It Li INST_RETIRED.ANY_P +.Pq Event C0H , Umask 01H +See Table A-1 +Notes: INST_RETIRED.ANY is counted by a designated fixed counter. +INST_RETIRED.ANY_P is counted by a programmable counter and is an +architectural performance event. Event is supported if CPUID.A.EBX[1] = 0. +Counting: Faulting executions of GETSEC/VM entry/VM Exit/MWait will not +count as retired instructions. +.It Li INST_RETIRED.X87 +.Pq Event C0H , Umask 02H +Counts the number of floating point computational operations retired: +floating point computational operations executed by the assist handler and +sub-operations of complex floating point instructions like transcendental +instructions. +.It Li INST_RETIRED.MMX +.Pq Event C0H , Umask 04H +Counts the number of retired: MMX instructions. +.It Li UOPS_RETIRED.ANY +.Pq Event C2H , Umask 01H +Counts the number of micro-ops retired, (macro-fused=1, micro-fused=2, +others=1; maximum count of 8 per cycle). Most instructions are composed of +one or two micro-ops. Some instructions are decoded into longer sequences +such as repeat instructions, floating point transcendental instructions, and +assists. +Use cmask=1 and invert to count active cycles or stalled cycles +.It Li UOPS_RETIRED.RETIRE_SLOTS +.Pq Event C2H , Umask 02H +Counts the number of retirement slots used each cycle +.It Li UOPS_RETIRED.MACRO_FUSED +.Pq Event C2H , Umask 04H +Counts number of macro-fused uops retired. +.It Li MACHINE_CLEARS.CYCLES +.Pq Event C3H , Umask 01H +Counts the cycles machine clear is asserted. +.It Li MACHINE_CLEARS.MEM_ORDER +.Pq Event C3H , Umask 02H +Counts the number of machine clears due to memory order conflicts. 
+.It Li MACHINE_CLEARS.SMC +.Pq Event C3H , Umask 04H +Counts the number of times that a program writes to a code section. +Self-modifying code causes a severe penalty in all Intel 64 and IA-32 +processors. The modified cache line is written back to the L2 and L3 caches. +.It Li BR_INST_RETIRED.ALL_BRANCHES +.Pq Event C4H , Umask 00H +See Table A-1 +.It Li BR_INST_RETIRED.CONDITIONAL +.Pq Event C4H , Umask 01H +Counts the number of conditional branch instructions retired. +.It Li BR_INST_RETIRED.NEAR_CALL +.Pq Event C4H , Umask 02H +Counts the number of direct & indirect near unconditional calls retired +.It Li BR_INST_RETIRED.ALL_BRANCHES +.Pq Event C4H , Umask 04H +Counts the number of branch instructions retired +.It Li BR_MISP_RETIRED.ALL_BRANCHES +.Pq Event C5H , Umask 00H +See Table A-1 +.It Li BR_MISP_RETIRED.CONDITIONAL +.Pq Event C5H , Umask 01H +Counts mispredicted conditional retired calls. +.It Li BR_MISP_RETIRED.NEAR_CALL +.Pq Event C5H , Umask 02H +Counts mispredicted direct & indirect near unconditional retired calls. +.It Li BR_MISP_RETIRED.ALL_BRANCHES +.Pq Event C5H , Umask 04H +Counts all mispredicted retired calls. +.It Li SSEX_UOPS_RETIRED.PACKED_SINGLE +.Pq Event C7H , Umask 01H +Counts SIMD packed single-precision floating point Uops retired. +.It Li SSEX_UOPS_RETIRED.SCALAR_SINGLE +.Pq Event C7H , Umask 02H +Counts SIMD scalar single-precision floating point Uops retired. +.It Li SSEX_UOPS_RETIRED.PACKED_DOUBLE +.Pq Event C7H , Umask 04H +Counts SIMD packed double-precision floating point Uops retired. +.It Li SSEX_UOPS_RETIRED.SCALAR_DOUBLE +.Pq Event C7H , Umask 08H +Counts SIMD scalar double-precision floating point Uops retired. +.It Li SSEX_UOPS_RETIRED.VECTOR_INTEGER +.Pq Event C7H , Umask 10H +Counts 128-bit SIMD vector integer Uops retired. +.It Li ITLB_MISS_RETIRED +.Pq Event C8H , Umask 20H +Counts the number of retired instructions that missed the ITLB when the +instruction was fetched. 
+.It Li MEM_LOAD_RETIRED.L1D_HIT +.Pq Event CBH , Umask 01H +Counts number of retired loads that hit the L1 data cache. +.It Li MEM_LOAD_RETIRED.L2_HIT +.Pq Event CBH , Umask 02H +Counts number of retired loads that hit the L2 data cache. +.It Li MEM_LOAD_RETIRED.L3_UNSHARED_HIT +.Pq Event CBH , Umask 04H +Counts number of retired loads that hit their own, unshared lines in the L3 +cache. +.It Li MEM_LOAD_RETIRED.OTHER_CORE_L2_HIT_HITM +.Pq Event CBH , Umask 08H +Counts number of retired loads that hit in a sibling core's L2 (on die +core). Since the L3 is inclusive of all cores on the package, this is an L3 +hit. This counts both clean or modified hits. +.It Li MEM_LOAD_RETIRED.L3_MISS +.Pq Event CBH , Umask 10H +Counts number of retired loads that miss the L3 cache. The load was +satisfied by a remote socket, local memory or an IOH. +.It Li MEM_LOAD_RETIRED.HIT_LFB +.Pq Event CBH , Umask 40H +Counts number of retired loads that miss the L1D and the address is located +in an allocated line fill buffer and will soon be committed to cache. This +is counting secondary L1D misses. +.It Li MEM_LOAD_RETIRED.DTLB_MISS +.Pq Event CBH , Umask 80H +Counts the number of retired loads that missed the DTLB. The DTLB miss is +not counted if the load operation causes a fault. This event counts loads +from cacheable memory only. The event does not count loads by software +prefetches. Counts both primary and secondary misses to the TLB. +.It Li FP_MMX_TRANS.TO_FP +.Pq Event CCH , Umask 01H +Counts the first floating-point instruction following any MMX instruction. +You can use this event to estimate the penalties for the transitions between +floating-point and MMX technology states. +.It Li FP_MMX_TRANS.TO_MMX +.Pq Event CCH , Umask 02H +Counts the first MMX instruction following a floating-point instruction. You +can use this event to estimate the penalties for the transitions between +floating-point and MMX technology states. 
+.It Li FP_MMX_TRANS.ANY +.Pq Event CCH , Umask 03H +Counts all transitions from floating point to MMX instructions and from MMX +instructions to floating point instructions. You can use this event to +estimate the penalties for the transitions between floating-point and MMX +technology states. +.It Li MACRO_INSTS.DECODED +.Pq Event D0H , Umask 01H +Counts the number of instructions decoded, (but not necessarily executed or +retired). +.It Li UOPS_DECODED.STALL_CYCLES +.Pq Event D1H , Umask 01H +Counts the cycles of decoder stalls. +.It Li UOPS_DECODED.MS +.Pq Event D1H , Umask 02H +Counts the number of Uops decoded by the Microcode Sequencer, MS. The MS +delivers uops when the instruction is more than 4 uops long or a microcode +assist is occurring. +.It Li UOPS_DECODED.ESP_FOLDING +.Pq Event D1H , Umask 04H +Counts number of stack pointer (ESP) instructions decoded: push , pop , call +, ret, etc. ESP instructions do not generate a Uop to increment or decrement +ESP. Instead, they update an ESP_Offset register that keeps track of the +delta to the current value of the ESP register. +.It Li UOPS_DECODED.ESP_SYNC +.Pq Event D1H , Umask 08H +Counts number of stack pointer (ESP) sync operations where an ESP +instruction is corrected by adding the ESP offset register to the current +value of the ESP register. +.It Li RAT_STALLS.FLAGS +.Pq Event D2H , Umask 01H +Counts the number of cycles during which execution stalled due to several +reasons, one of which is a partial flag register stall. A partial register +stall may occur when two conditions are met: 1) an instruction modifies +some, but not all, of the flags in the flag register and 2) the next +instruction, which depends on flags, depends on flags that were not modified +by this instruction. 
+.It Li RAT_STALLS.REGISTERS +.Pq Event D2H , Umask 02H +This event counts the number of cycles instruction execution latency became +longer than the defined latency because the instruction used a register that +was partially written by a previous instruction. +.It Li RAT_STALLS.ROB_READ_PORT +.Pq Event D2H , Umask 04H +Counts the number of cycles when ROB read port stalls occurred, which did +not allow new micro-ops to enter the out-of-order pipeline. Note that, at +this stage in the pipeline, additional stalls may occur at the same cycle +and prevent the stalled micro-ops from entering the pipe. In such a case, +micro-ops retry entering the execution pipe in the next cycle and the +ROB-read port stall is counted again. +.It Li RAT_STALLS.SCOREBOARD +.Pq Event D2H , Umask 08H +Counts the cycles where we stall due to microarchitecturally required +serialization. Microcode scoreboarding stalls. +.It Li RAT_STALLS.ANY +.Pq Event D2H , Umask 0FH +Counts all Register Allocation Table stall cycles due to: cycles when ROB +read port stalls occurred, which did not allow new micro-ops to enter the +execution pipe; cycles when partial register stalls occurred; cycles when +flag stalls occurred; and cycles when floating-point unit (FPU) status word stalls +occurred. To count each of these conditions separately use the events: +RAT_STALLS.ROB_READ_PORT, RAT_STALLS.PARTIAL, RAT_STALLS.FLAGS, and +RAT_STALLS.FPSW. +.It Li SEG_RENAME_STALLS +.Pq Event D4H , Umask 01H +Counts the number of stall cycles due to the lack of renaming resources for +the ES, DS, FS, and GS segment registers. If a segment is renamed but not +retired and a second update to the same segment occurs, a stall occurs in +the front-end of the pipeline until the renamed segment retires. +.It Li ES_REG_RENAMES +.Pq Event D5H , Umask 01H +Counts the number of times the ES segment register is renamed. +.It Li UOP_UNFUSION +.Pq Event DBH , Umask 01H +Counts unfusion events due to floating point exception to a fused uop.
+.It Li BR_INST_DECODED +.Pq Event E0H , Umask 01H +Counts the number of branch instructions decoded. +.It Li BPU_MISSED_CALL_RET +.Pq Event E5H , Umask 01H +Counts number of times the Branch Prediction Unit missed predicting a call +or return branch. +.It Li BACLEAR.CLEAR +.Pq Event E6H , Umask 01H +Counts the number of times the front end is resteered, mainly when the +Branch Prediction Unit cannot provide a correct prediction and this is +corrected by the Branch Address Calculator at the front end. This can occur +if the code has many branches such that they cannot be consumed by the BPU. +Each BACLEAR asserted by the BAC generates approximately an 8 cycle bubble +in the instruction fetch pipeline. The effect on total execution time +depends on the surrounding code. +.It Li BACLEAR.BAD_TARGET +.Pq Event E6H , Umask 02H +Counts number of Branch Address Calculator clears (BACLEAR) asserted due to +conditional branch instructions in which there was a target hit but the +direction was wrong. Each BACLEAR asserted by the BAC generates +approximately an 8 cycle bubble in the instruction fetch pipeline. +.It Li BPU_CLEARS.EARLY +.Pq Event E8H , Umask 01H +Counts early (normal) Branch Prediction Unit clears: BPU predicted a taken +branch after incorrectly assuming that it was not taken. +The BPU clear leads to a 2 cycle bubble in the Front End. +.It Li BPU_CLEARS.LATE +.Pq Event E8H , Umask 02H +Counts late Branch Prediction Unit clears due to Most Recently Used +conflicts. The BPU clear leads to a 3 cycle bubble in the Front End. +.It Li THREAD_ACTIVE +.Pq Event ECH , Umask 01H +Counts cycles threads are active. +.It Li L2_TRANSACTIONS.LOAD +.Pq Event F0H , Umask 01H +Counts L2 load operations due to HW prefetch or demand loads. +.It Li L2_TRANSACTIONS.RFO +.Pq Event F0H , Umask 02H +Counts L2 RFO operations due to HW prefetch or demand RFOs.
+.It Li L2_TRANSACTIONS.IFETCH +.Pq Event F0H , Umask 04H +Counts L2 instruction fetch operations due to HW prefetch or demand ifetch. +.It Li L2_TRANSACTIONS.PREFETCH +.Pq Event F0H , Umask 08H +Counts L2 prefetch operations. +.It Li L2_TRANSACTIONS.L1D_WB +.Pq Event F0H , Umask 10H +Counts L1D writeback operations to the L2. +.It Li L2_TRANSACTIONS.FILL +.Pq Event F0H , Umask 20H +Counts L2 cache line fill operations due to load, RFO, L1D writeback or +prefetch. +.It Li L2_TRANSACTIONS.WB +.Pq Event F0H , Umask 40H +Counts L2 writeback operations to the L3. +.It Li L2_TRANSACTIONS.ANY +.Pq Event F0H , Umask 80H +Counts all L2 cache operations. +.It Li L2_LINES_IN.S_STATE +.Pq Event F1H , Umask 02H +Counts the number of cache lines allocated in the L2 cache in the S (shared) +state. +.It Li L2_LINES_IN.E_STATE +.Pq Event F1H , Umask 04H +Counts the number of cache lines allocated in the L2 cache in the E +(exclusive) state. +.It Li L2_LINES_IN.ANY +.Pq Event F1H , Umask 07H +Counts the number of cache lines allocated in the L2 cache. +.It Li L2_LINES_OUT.DEMAND_CLEAN +.Pq Event F2H , Umask 01H +Counts L2 clean cache lines evicted by a demand request. +.It Li L2_LINES_OUT.DEMAND_DIRTY +.Pq Event F2H , Umask 02H +Counts L2 dirty (modified) cache lines evicted by a demand request. +.It Li L2_LINES_OUT.PREFETCH_CLEAN +.Pq Event F2H , Umask 04H +Counts L2 clean cache line evicted by a prefetch request. +.It Li L2_LINES_OUT.PREFETCH_DIRTY +.Pq Event F2H , Umask 08H +Counts L2 modified cache line evicted by a prefetch request. +.It Li L2_LINES_OUT.ANY +.Pq Event F2H , Umask 0FH +Counts all L2 cache lines evicted for any reason. +.It Li SQ_MISC.LRU_HINTS +.Pq Event F4H , Umask 04H +Counts number of Super Queue LRU hints sent to L3. +.It Li SQ_MISC.SPLIT_LOCK +.Pq Event F4H , Umask 10H +Counts the number of SQ lock splits across a cache line. +.It Li SQ_FULL_STALL_CYCLES +.Pq Event F6H , Umask 01H +Counts cycles the Super Queue is full. 
Neither of the threads on this core +will be able to access the uncore. +.It Li FP_ASSIST.ALL +.Pq Event F7H , Umask 01H +Counts the number of floating point operations executed that required +micro-code assist intervention. Assists are required in the following cases: +SSE instructions, (Denormal input when the DAZ flag is off or Underflow +result when the FTZ flag is off): x87 instructions, (NaN or denormal are +loaded to a register or used as input from memory, Division by 0 or +Underflow output). +.It Li FP_ASSIST.OUTPUT +.Pq Event F7H , Umask 02H +Counts number of floating point micro-code assist when the output value +(destination register) is invalid. +.It Li FP_ASSIST.INPUT +.Pq Event F7H , Umask 04H +Counts number of floating point micro-code assist when the input value (one +of the source operands to an FP instruction) is invalid. +.It Li SIMD_INT_64.PACKED_MPY +.Pq Event FDH , Umask 01H +Counts number of SIMD integer 64 bit packed multiply operations. +.It Li SIMD_INT_64.PACKED_SHIFT +.Pq Event FDH , Umask 02H +Counts number of SIMD integer 64 bit packed shift operations. +.It Li SIMD_INT_64.PACK +.Pq Event FDH , Umask 04H +Counts number of SIMD integer 64 bit pack operations. +.It Li SIMD_INT_64.UNPACK +.Pq Event FDH , Umask 08H +Counts number of SIMD integer 64 bit unpack operations. +.It Li SIMD_INT_64.PACKED_LOGICAL +.Pq Event FDH , Umask 10H +Counts number of SIMD integer 64 bit logical operations. +.It Li SIMD_INT_64.PACKED_ARITH +.Pq Event FDH , Umask 20H +Counts number of SIMD integer 64 bit arithmetic operations. +.It Li SIMD_INT_64.SHUFFLE_MOVE +.Pq Event FDH , Umask 40H +Counts number of SIMD integer 64 bit shift or move operations.
+.El +.Sh SEE ALSO +.Xr pmc 3 , +.Xr pmc.atom 3 , +.Xr pmc.core 3 , +.Xr pmc.iaf 3 , +.Xr pmc.ucf 3 , +.Xr pmc.k7 3 , +.Xr pmc.k8 3 , +.Xr pmc.p4 3 , +.Xr pmc.p5 3 , +.Xr pmc.p6 3 , +.Xr pmc.corei7 3 , +.Xr pmc.corei7uc 3 , +.Xr pmc.westmereuc 3 , +.Xr pmc.tsc 3 , +.Xr pmc_cpuinfo 3 , +.Xr pmclog 3 , +.Xr hwpmc 4 +.Sh HISTORY +The +.Nm pmc +library first appeared in +.Fx 6.0 . +.Sh AUTHORS +The +.Lb libpmc +library was written by +.An "Joseph Koshy" +.Aq jkoshy@FreeBSD.org . Property changes on: stable/8/lib/libpmc/pmc.westmere.3 ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Index: stable/8/lib/libpmc/pmc.westmereuc.3 =================================================================== --- stable/8/lib/libpmc/pmc.westmereuc.3 (nonexistent) +++ stable/8/lib/libpmc/pmc.westmereuc.3 (revision 206702) @@ -0,0 +1,1083 @@ +.\" Copyright (c) 2010 Fabien Thomas. All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" This software is provided by Joseph Koshy ``as is'' and +.\" any express or implied warranties, including, but not limited to, the +.\" implied warranties of merchantability and fitness for a particular purpose +.\" are disclaimed. 
in no event shall Joseph Koshy be liable +.\" for any direct, indirect, incidental, special, exemplary, or consequential +.\" damages (including, but not limited to, procurement of substitute goods +.\" or services; loss of use, data, or profits; or business interruption) +.\" however caused and on any theory of liability, whether in contract, strict +.\" liability, or tort (including negligence or otherwise) arising in any way +.\" out of the use of this software, even if advised of the possibility of +.\" such damage. +.\" +.\" $FreeBSD$ +.\" +.Dd March 24, 2010 +.Os +.Dt PMC.WESTMEREUC 3 +.Sh NAME +.Nm pmc.westmereuc +.Nd uncore measurement events for +.Tn Intel +.Tn Westmere +family CPUs +.Sh LIBRARY +.Lb libpmc +.Sh SYNOPSIS +.In pmc.h +.Sh DESCRIPTION +.Tn Intel +.Tn "Westmere" +CPUs contain PMCs conforming to version 2 of the +.Tn Intel +performance measurement architecture. +These CPUs contain two classes of PMCs: +.Bl -tag -width "Li PMC_CLASS_UCP" +.It Li PMC_CLASS_UCF +Fixed-function counters that count only one hardware event per counter. +.It Li PMC_CLASS_UCP +Programmable counters that may be configured to count one of a defined +set of hardware events. +.El +.Pp +The number of PMCs available in each class and their widths need to be +determined at run time by calling +.Xr pmc_cpuinfo 3 . +.Pp +Intel Westmere PMCs are documented in +.Rs +.%B "Intel(R) 64 and IA-32 Architectures Software Developer's Manual" +.%T "Volume 3B: System Programming Guide, Part 2" +.%N "Order Number: 253669-033US" +.%D December 2009 +.%Q "Intel Corporation" +.Re +.Ss WESTMERE UNCORE FIXED FUNCTION PMCS +These PMCs and their supported events are documented in +.Xr pmc.ucf 3 . +Not all CPUs in this family implement fixed-function counters.
+.Ss WESTMERE UNCORE PROGRAMMABLE PMCS +The programmable PMCs support the following capabilities: +.Bl -column "PMC_CAP_INTERRUPT" "Support" +.It Em Capability Ta Em Support +.It PMC_CAP_CASCADE Ta \&No +.It PMC_CAP_EDGE Ta Yes +.It PMC_CAP_INTERRUPT Ta \&No +.It PMC_CAP_INVERT Ta Yes +.It PMC_CAP_READ Ta Yes +.It PMC_CAP_PRECISE Ta \&No +.It PMC_CAP_SYSTEM Ta \&No +.It PMC_CAP_TAGGING Ta \&No +.It PMC_CAP_THRESHOLD Ta Yes +.It PMC_CAP_USER Ta \&No +.It PMC_CAP_WRITE Ta Yes +.El +.Ss Event Qualifiers +Event specifiers for these PMCs support the following common +qualifiers: +.Bl -tag -width indent +.It Li cmask= Ns Ar value +Configure the PMC to increment only if the number of configured +events measured in a cycle is greater than or equal to +.Ar value . +.It Li edge +Configure the PMC to count the number of de-asserted to asserted +transitions of the conditions expressed by the other qualifiers. +If specified, the counter will increment only once whenever a +condition becomes true, irrespective of the number of clocks during +which the condition remains true. +.It Li inv +Invert the sense of comparison when the +.Dq Li cmask +qualifier is present, making the counter increment when the number of +events per cycle is less than the value specified by the +.Dq Li cmask +qualifier. +.El +.Ss Event Specifiers (Programmable PMCs) +Westmere uncore programmable PMCs support the following events: +.Bl -tag -width indent +.It Li GQ_CYCLES_FULL.READ_TRACKER +.Pq Event 00H , Umask 01H +Uncore cycles Global Queue read tracker is full. +.It Li GQ_CYCLES_FULL.WRITE_TRACKER +.Pq Event 00H , Umask 02H +Uncore cycles Global Queue write tracker is full. +.It Li GQ_CYCLES_FULL.PEER_PROBE_TRACKER +.Pq Event 00H , Umask 04H +Uncore cycles Global Queue peer probe tracker is full. The peer probe +tracker queue tracks snoops from the IOH and remote sockets. 
+.It Li GQ_CYCLES_NOT_EMPTY.READ_TRACKER +.Pq Event 01H , Umask 01H +Uncore cycles where Global Queue read tracker has at least one valid entry. +.It Li GQ_CYCLES_NOT_EMPTY.WRITE_TRACKER +.Pq Event 01H , Umask 02H +Uncore cycles where Global Queue write tracker has at least one valid entry. +.It Li GQ_CYCLES_NOT_EMPTY.PEER_PROBE_TRACKER +.Pq Event 01H , Umask 04H +Uncore cycles where Global Queue peer probe tracker has at least one valid +entry. The peer probe tracker queue tracks IOH and remote socket snoops. +.It Li GQ_OCCUPANCY.READ_TRACKER +.Pq Event 02H , Umask 01H +Increments the number of queue entries (code read, data read, and RFOs) in +the read tracker. The GQ read tracker allocate to deallocate occupancy +count is divided by the count to obtain the average read tracker latency. +.It Li GQ_ALLOC.READ_TRACKER +.Pq Event 03H , Umask 01H +Counts the number of read tracker allocate to deallocate entries. The GQ +read tracker allocate to deallocate occupancy count is divided by the count +to obtain the average read tracker latency. +.It Li GQ_ALLOC.RT_L3_MISS +.Pq Event 03H , Umask 02H +Counts the number of GQ read tracker entries for which a full cache line read +has missed the L3. The GQ read tracker L3 miss to fill occupancy count is +divided by this count to obtain the average cache line read L3 miss latency. +The latency represents the time after which the L3 has determined that the +cache line has missed. The time between a GQ read tracker allocation and the +L3 determining that the cache line has missed is the average L3 hit latency. +The total L3 cache line read miss latency is the hit latency + L3 miss +latency. +.It Li GQ_ALLOC.RT_TO_L3_RESP +.Pq Event 03H , Umask 04H +Counts the number of GQ read tracker entries that are allocated in the read +tracker queue that hit or miss the L3. The GQ read tracker L3 hit occupancy +count is divided by this count to obtain the average L3 hit latency.
+.It Li GQ_ALLOC.RT_TO_RTID_ACQUIRED +.Pq Event 03H , Umask 08H +Counts the number of GQ read tracker entries that are allocated in the read +tracker, have missed in the L3 and have not acquired a Request Transaction +ID. The GQ read tracker L3 miss to RTID acquired occupancy count is +divided by this count to obtain the average latency for a read L3 miss to +acquire an RTID. +.It Li GQ_ALLOC.WT_TO_RTID_ACQUIRED +.Pq Event 03H , Umask 10H +Counts the number of GQ write tracker entries that are allocated in the +write tracker, have missed in the L3 and have not acquired a Request +Transaction ID. The GQ write tracker L3 miss to RTID occupancy count is +divided by this count to obtain the average latency for a write L3 miss to +acquire an RTID. +.It Li GQ_ALLOC.WRITE_TRACKER +.Pq Event 03H , Umask 20H +Counts the number of GQ write tracker entries that are allocated in the +write tracker queue that miss the L3. The GQ write tracker occupancy count +is divided by this count to obtain the average L3 write miss latency. +.It Li GQ_ALLOC.PEER_PROBE_TRACKER +.Pq Event 03H , Umask 40H +Counts the number of GQ peer probe tracker (snoop) entries that are +allocated in the peer probe tracker queue that miss the L3. The GQ peer +probe occupancy count is divided by this count to obtain the average L3 peer +probe miss latency. +.It Li GQ_DATA.FROM_QPI +.Pq Event 04H , Umask 01H +Cycles Global Queue Quickpath Interface input data port is busy importing +data from the Quickpath Interface. Each cycle the input port can transfer 8 +or 16 bytes of data. +.It Li GQ_DATA.FROM_QMC +.Pq Event 04H , Umask 02H +Cycles Global Queue Quickpath Memory Interface input data port is busy +importing data from the Quickpath Memory Interface. Each cycle the input +port can transfer 8 or 16 bytes of data. +.It Li GQ_DATA.FROM_L3 +.Pq Event 04H , Umask 04H +Cycles GQ L3 input data port is busy importing data from the Last Level +Cache. Each cycle the input port can transfer 32 bytes of data.
+.It Li GQ_DATA.FROM_CORES_02 +.Pq Event 04H , Umask 08H +Cycles GQ Core 0 and 2 input data port is busy importing data from processor +cores 0 and 2. Each cycle the input port can transfer 32 bytes of data. +.It Li GQ_DATA.FROM_CORES_13 +.Pq Event 04H , Umask 10H +Cycles GQ Core 1 and 3 input data port is busy importing data from processor +cores 1 and 3. Each cycle the input port can transfer 32 bytes of data. +.It Li GQ_DATA.TO_QPI_QMC +.Pq Event 05H , Umask 01H +Cycles GQ QPI and QMC output data port is busy sending data to the Quickpath +Interface or Quickpath Memory Interface. Each cycle the output port can +transfer 32 bytes of data. +.It Li GQ_DATA.TO_L3 +.Pq Event 05H , Umask 02H +Cycles GQ L3 output data port is busy sending data to the Last Level Cache. +Each cycle the output port can transfer 32 bytes of data. +.It Li GQ_DATA.TO_CORES +.Pq Event 05H , Umask 04H +Cycles GQ Core output data port is busy sending data to the Cores. Each +cycle the output port can transfer 32 bytes of data. +.It Li SNP_RESP_TO_LOCAL_HOME.I_STATE +.Pq Event 06H , Umask 01H +Number of snoop responses to the local home that L3 does not have the +referenced cache line. +.It Li SNP_RESP_TO_LOCAL_HOME.S_STATE +.Pq Event 06H , Umask 02H +Number of snoop responses to the local home that L3 has the referenced line +cached in the S state. +.It Li SNP_RESP_TO_LOCAL_HOME.FWD_S_STATE +.Pq Event 06H , Umask 04H +Number of responses to code or data read snoops to the local home that the +L3 has the referenced cache line in the E state. The L3 cache line state is +changed to the S state and the line is forwarded to the local home in the S +state. +.It Li SNP_RESP_TO_LOCAL_HOME.FWD_I_STATE +.Pq Event 06H , Umask 08H +Number of responses to read invalidate snoops to the local home that the L3 +has the referenced cache line in the M state. The L3 cache line state is +invalidated and the line is forwarded to the local home in the M state. 
+.It Li SNP_RESP_TO_LOCAL_HOME.CONFLICT +.Pq Event 06H , Umask 10H +Number of conflict snoop responses sent to the local home. +.It Li SNP_RESP_TO_LOCAL_HOME.WB +.Pq Event 06H , Umask 20H +Number of responses to code or data read snoops to the local home that the +L3 has the referenced line cached in the M state. +.It Li SNP_RESP_TO_REMOTE_HOME.I_STATE +.Pq Event 07H , Umask 01H +Number of snoop responses to a remote home that L3 does not have the +referenced cache line. +.It Li SNP_RESP_TO_REMOTE_HOME.S_STATE +.Pq Event 07H , Umask 02H +Number of snoop responses to a remote home that L3 has the referenced line +cached in the S state. +.It Li SNP_RESP_TO_REMOTE_HOME.FWD_S_STATE +.Pq Event 07H , Umask 04H +Number of responses to code or data read snoops to a remote home that the L3 +has the referenced cache line in the E state. The L3 cache line state is +changed to the S state and the line is forwarded to the remote home in the S +state. +.It Li SNP_RESP_TO_REMOTE_HOME.FWD_I_STATE +.Pq Event 07H , Umask 08H +Number of responses to read invalidate snoops to a remote home that the L3 +has the referenced cache line in the M state. The L3 cache line state is +invalidated and the line is forwarded to the remote home in the M state. +.It Li SNP_RESP_TO_REMOTE_HOME.CONFLICT +.Pq Event 07H , Umask 10H +Number of conflict snoop responses sent to a remote home. +.It Li SNP_RESP_TO_REMOTE_HOME.WB +.Pq Event 07H , Umask 20H +Number of responses to code or data read snoops to a remote home that the L3 +has the referenced line cached in the M state. +.It Li SNP_RESP_TO_REMOTE_HOME.HITM +.Pq Event 07H , Umask 24H +Number of HITM snoop responses to a remote home. +.It Li L3_HITS.READ +.Pq Event 08H , Umask 01H +Number of code read, data read and RFO requests that hit in the L3. +.It Li L3_HITS.WRITE +.Pq Event 08H , Umask 02H +Number of writeback requests that hit in the L3. Writebacks from the cores +will always result in L3 hits due to the inclusive property of the L3.
+.It Li L3_HITS.PROBE +.Pq Event 08H , Umask 04H +Number of snoops from IOH or remote sockets that hit in the L3. +.It Li L3_HITS.ANY +.Pq Event 08H , Umask 03H +Number of reads and writes that hit the L3. +.It Li L3_MISS.READ +.Pq Event 09H , Umask 01H +Number of code read, data read and RFO requests that miss the L3. +.It Li L3_MISS.WRITE +.Pq Event 09H , Umask 02H +Number of writeback requests that miss the L3. Should always be zero as +writebacks from the cores will always result in L3 hits due to the inclusive +property of the L3. +.It Li L3_MISS.PROBE +.Pq Event 09H , Umask 04H +Number of snoops from IOH or remote sockets that miss the L3. +.It Li L3_MISS.ANY +.Pq Event 09H , Umask 03H +Number of reads and writes that miss the L3. +.It Li L3_LINES_IN.M_STATE +.Pq Event 0AH , Umask 01H +Counts the number of L3 lines allocated in M state. The only time a cache +line is allocated in the M state is when a line in M state +is forwarded due to a Snoop Read Invalidate Own request. +.It Li L3_LINES_IN.E_STATE +.Pq Event 0AH , Umask 02H +Counts the number of L3 lines allocated in E state. +.It Li L3_LINES_IN.S_STATE +.Pq Event 0AH , Umask 04H +Counts the number of L3 lines allocated in S state. +.It Li L3_LINES_IN.F_STATE +.Pq Event 0AH , Umask 08H +Counts the number of L3 lines allocated in F state. +.It Li L3_LINES_IN.ANY +.Pq Event 0AH , Umask 0FH +Counts the number of L3 lines allocated in any state. +.It Li L3_LINES_OUT.M_STATE +.Pq Event 0BH , Umask 01H +Counts the number of L3 lines victimized that were in the M state. When the +victim cache line is in M state, the line is written to its home cache agent +which can be either local or remote. +.It Li L3_LINES_OUT.E_STATE +.Pq Event 0BH , Umask 02H +Counts the number of L3 lines victimized that were in the E state. +.It Li L3_LINES_OUT.S_STATE +.Pq Event 0BH , Umask 04H +Counts the number of L3 lines victimized that were in the S state.
+.It Li L3_LINES_OUT.I_STATE +.Pq Event 0BH , Umask 08H +Counts the number of L3 lines victimized that were in the I state. +.It Li L3_LINES_OUT.F_STATE +.Pq Event 0BH , Umask 10H +Counts the number of L3 lines victimized that were in the F state. +.It Li L3_LINES_OUT.ANY +.Pq Event 0BH , Umask 1FH +Counts the number of L3 lines victimized in any state. +.It Li GQ_SNOOP.GOTO_S +.Pq Event 0CH , Umask 01H +Counts the number of remote snoops that have requested a cache line be set +to the S state. +.It Li GQ_SNOOP.GOTO_I +.Pq Event 0CH , Umask 02H +Counts the number of remote snoops that have requested a cache line be set +to the I state. +.It Li GQ_SNOOP.GOTO_S_HIT_E +.Pq Event 0CH , Umask 04H +Counts the number of remote snoops that have requested a cache line be set +to the S state from E state. +Requires writing MSR 301H with mask = 2H +.It Li GQ_SNOOP.GOTO_S_HIT_F +.Pq Event 0CH , Umask 04H +Counts the number of remote snoops that have requested a cache line be set +to the S state from F (forward) state. +Requires writing MSR 301H with mask = 8H +.It Li GQ_SNOOP.GOTO_S_HIT_M +.Pq Event 0CH , Umask 04H +Counts the number of remote snoops that have requested a cache line be set +to the S state from M state. +Requires writing MSR 301H with mask = 1H +.It Li GQ_SNOOP.GOTO_S_HIT_S +.Pq Event 0CH , Umask 04H +Counts the number of remote snoops that have requested a cache line be set +to the S state from S state. +Requires writing MSR 301H with mask = 4H +.It Li GQ_SNOOP.GOTO_I_HIT_E +.Pq Event 0CH , Umask 08H +Counts the number of remote snoops that have requested a cache line be set +to the I state from E state. +Requires writing MSR 301H with mask = 2H +.It Li GQ_SNOOP.GOTO_I_HIT_F +.Pq Event 0CH , Umask 08H +Counts the number of remote snoops that have requested a cache line be set +to the I state from F (forward) state. 
+Requires writing MSR 301H with mask = 8H +.It Li GQ_SNOOP.GOTO_I_HIT_M +.Pq Event 0CH , Umask 08H +Counts the number of remote snoops that have requested a cache line be set +to the I state from M state. +Requires writing MSR 301H with mask = 1H +.It Li GQ_SNOOP.GOTO_I_HIT_S +.Pq Event 0CH , Umask 08H +Counts the number of remote snoops that have requested a cache line be set +to the I state from S state. +Requires writing MSR 301H with mask = 4H +.It Li QHL_REQUESTS.IOH_READS +.Pq Event 20H , Umask 01H +Counts number of Quickpath Home Logic read requests from the IOH. +.It Li QHL_REQUESTS.IOH_WRITES +.Pq Event 20H , Umask 02H +Counts number of Quickpath Home Logic write requests from the IOH. +.It Li QHL_REQUESTS.REMOTE_READS +.Pq Event 20H , Umask 04H +Counts number of Quickpath Home Logic read requests from a remote socket. +.It Li QHL_REQUESTS.REMOTE_WRITES +.Pq Event 20H , Umask 08H +Counts number of Quickpath Home Logic write requests from a remote socket. +.It Li QHL_REQUESTS.LOCAL_READS +.Pq Event 20H , Umask 10H +Counts number of Quickpath Home Logic read requests from the local socket. +.It Li QHL_REQUESTS.LOCAL_WRITES +.Pq Event 20H , Umask 20H +Counts number of Quickpath Home Logic write requests from the local socket. +.It Li QHL_CYCLES_FULL.IOH +.Pq Event 21H , Umask 01H +Counts uclk cycles all entries in the Quickpath Home Logic IOH are full. +.It Li QHL_CYCLES_FULL.REMOTE +.Pq Event 21H , Umask 02H +Counts uclk cycles all entries in the Quickpath Home Logic remote tracker +are full. +.It Li QHL_CYCLES_FULL.LOCAL +.Pq Event 21H , Umask 04H +Counts uclk cycles all entries in the Quickpath Home Logic local tracker are +full. +.It Li QHL_CYCLES_NOT_EMPTY.IOH +.Pq Event 22H , Umask 01H +Counts uclk cycles all entries in the Quickpath Home Logic IOH is busy. +.It Li QHL_CYCLES_NOT_EMPTY.REMOTE +.Pq Event 22H , Umask 02H +Counts uclk cycles all entries in the Quickpath Home Logic remote tracker is +busy. 
+.It Li QHL_CYCLES_NOT_EMPTY.LOCAL +.Pq Event 22H , Umask 04H +Counts uclk cycles all entries in the Quickpath Home Logic local tracker is +busy. +.It Li QHL_OCCUPANCY.IOH +.Pq Event 23H , Umask 01H +QHL IOH tracker allocate to deallocate read occupancy. +.It Li QHL_OCCUPANCY.REMOTE +.Pq Event 23H , Umask 02H +QHL remote tracker allocate to deallocate read occupancy. +.It Li QHL_OCCUPANCY.LOCAL +.Pq Event 23H , Umask 04H +QHL local tracker allocate to deallocate read occupancy. +.It Li QHL_ADDRESS_CONFLICTS.2WAY +.Pq Event 24H , Umask 02H +Counts number of QHL Active Address Table (AAT) entries that saw a max of 2 +conflicts. The AAT is a structure that tracks requests that are in conflict. +The requests themselves are in the home tracker entries. The count is +reported when an AAT entry deallocates. +.It Li QHL_ADDRESS_CONFLICTS.3WAY +.Pq Event 24H , Umask 04H +Counts number of QHL Active Address Table (AAT) entries that saw a max of 3 +conflicts. The AAT is a structure that tracks requests that are in conflict. +The requests themselves are in the home tracker entries. The count is +reported when an AAT entry deallocates. +.It Li QHL_CONFLICT_CYCLES.IOH +.Pq Event 25H , Umask 01H +Counts cycles the Quickpath Home Logic IOH Tracker contains two or more +requests with an address conflict. A max of 3 requests can be in conflict. +.It Li QHL_CONFLICT_CYCLES.REMOTE +.Pq Event 25H , Umask 02H +Counts cycles the Quickpath Home Logic Remote Tracker contains two or more +requests with an address conflict. A max of 3 requests can be in conflict. +.It Li QHL_CONFLICT_CYCLES.LOCAL +.Pq Event 25H , Umask 04H +Counts cycles the Quickpath Home Logic Local Tracker contains two or more +requests with an address conflict. A max of 3 requests can be in conflict. +.It Li QHL_TO_QMC_BYPASS +.Pq Event 26H , Umask 01H +Counts number of requests to the Quickpath Memory Controller that bypass the +Quickpath Home Logic. All local accesses can be bypassed.
For remote +requests, only read requests can be bypassed. +.It Li QMC_ISOC_FULL.READ.CH0 +.Pq Event 28H , Umask 01H +Counts cycles all the entries in the DRAM channel 0 high priority queue are +occupied with isochronous read requests. +.It Li QMC_ISOC_FULL.READ.CH1 +.Pq Event 28H , Umask 02H +Counts cycles all the entries in the DRAM channel 1 high priority queue are +occupied with isochronous read requests. +.It Li QMC_ISOC_FULL.READ.CH2 +.Pq Event 28H , Umask 04H +Counts cycles all the entries in the DRAM channel 2 high priority queue are +occupied with isochronous read requests. +.It Li QMC_ISOC_FULL.WRITE.CH0 +.Pq Event 28H , Umask 08H +Counts cycles all the entries in the DRAM channel 0 high priority queue are +occupied with isochronous write requests. +.It Li QMC_ISOC_FULL.WRITE.CH1 +.Pq Event 28H , Umask 10H +Counts cycles all the entries in the DRAM channel 1 high priority queue are +occupied with isochronous write requests. +.It Li QMC_ISOC_FULL.WRITE.CH2 +.Pq Event 28H , Umask 20H +Counts cycles all the entries in the DRAM channel 2 high priority queue are +occupied with isochronous write requests. +.It Li QMC_BUSY.READ.CH0 +.Pq Event 29H , Umask 01H +Counts cycles where Quickpath Memory Controller has at least 1 outstanding +read request to DRAM channel 0. +.It Li QMC_BUSY.READ.CH1 +.Pq Event 29H , Umask 02H +Counts cycles where Quickpath Memory Controller has at least 1 outstanding +read request to DRAM channel 1. +.It Li QMC_BUSY.READ.CH2 +.Pq Event 29H , Umask 04H +Counts cycles where Quickpath Memory Controller has at least 1 outstanding +read request to DRAM channel 2. +.It Li QMC_BUSY.WRITE.CH0 +.Pq Event 29H , Umask 08H +Counts cycles where Quickpath Memory Controller has at least 1 outstanding +write request to DRAM channel 0. +.It Li QMC_BUSY.WRITE.CH1 +.Pq Event 29H , Umask 10H +Counts cycles where Quickpath Memory Controller has at least 1 outstanding +write request to DRAM channel 1.
+.It Li QMC_BUSY.WRITE.CH2 +.Pq Event 29H , Umask 20H +Counts cycles where Quickpath Memory Controller has at least 1 outstanding +write request to DRAM channel 2. +.It Li QMC_OCCUPANCY.CH0 +.Pq Event 2AH , Umask 01H +IMC channel 0 normal read request occupancy. +.It Li QMC_OCCUPANCY.CH1 +.Pq Event 2AH , Umask 02H +IMC channel 1 normal read request occupancy. +.It Li QMC_OCCUPANCY.CH2 +.Pq Event 2AH , Umask 04H +IMC channel 2 normal read request occupancy. +.It Li QMC_OCCUPANCY.ANY +.Pq Event 2AH , Umask 07H +Normal read request occupancy for any channel. +.It Li QMC_ISSOC_OCCUPANCY.CH0 +.Pq Event 2BH , Umask 01H +IMC channel 0 issoc read request occupancy. +.It Li QMC_ISSOC_OCCUPANCY.CH1 +.Pq Event 2BH , Umask 02H +IMC channel 1 issoc read request occupancy. +.It Li QMC_ISSOC_OCCUPANCY.CH2 +.Pq Event 2BH , Umask 04H +IMC channel 2 issoc read request occupancy. +.It Li QMC_ISSOC_READS.ANY +.Pq Event 2BH , Umask 07H +IMC issoc read request occupancy. +.It Li QMC_NORMAL_READS.CH0 +.Pq Event 2CH , Umask 01H +Counts the number of Quickpath Memory Controller channel 0 medium and low +priority read requests. The QMC channel 0 normal read occupancy divided by +this count provides the average QMC channel 0 read latency. +.It Li QMC_NORMAL_READS.CH1 +.Pq Event 2CH , Umask 02H +Counts the number of Quickpath Memory Controller channel 1 medium and low +priority read requests. The QMC channel 1 normal read occupancy divided by +this count provides the average QMC channel 1 read latency. +.It Li QMC_NORMAL_READS.CH2 +.Pq Event 2CH , Umask 04H +Counts the number of Quickpath Memory Controller channel 2 medium and low +priority read requests. The QMC channel 2 normal read occupancy divided by +this count provides the average QMC channel 2 read latency. +.It Li QMC_NORMAL_READS.ANY +.Pq Event 2CH , Umask 07H +Counts the number of Quickpath Memory Controller medium and low priority +read requests. 
The QMC normal read occupancy divided by this count provides +the average QMC read latency. +.It Li QMC_HIGH_PRIORITY_READS.CH0 +.Pq Event 2DH , Umask 01H +Counts the number of Quickpath Memory Controller channel 0 high priority +isochronous read requests. +.It Li QMC_HIGH_PRIORITY_READS.CH1 +.Pq Event 2DH , Umask 02H +Counts the number of Quickpath Memory Controller channel 1 high priority +isochronous read requests. +.It Li QMC_HIGH_PRIORITY_READS.CH2 +.Pq Event 2DH , Umask 04H +Counts the number of Quickpath Memory Controller channel 2 high priority +isochronous read requests. +.It Li QMC_HIGH_PRIORITY_READS.ANY +.Pq Event 2DH , Umask 07H +Counts the number of Quickpath Memory Controller high priority isochronous +read requests. +.It Li QMC_CRITICAL_PRIORITY_READS.CH0 +.Pq Event 2EH , Umask 01H +Counts the number of Quickpath Memory Controller channel 0 critical priority +isochronous read requests. +.It Li QMC_CRITICAL_PRIORITY_READS.CH1 +.Pq Event 2EH , Umask 02H +Counts the number of Quickpath Memory Controller channel 1 critical priority +isochronous read requests. +.It Li QMC_CRITICAL_PRIORITY_READS.CH2 +.Pq Event 2EH , Umask 04H +Counts the number of Quickpath Memory Controller channel 2 critical priority +isochronous read requests. +.It Li QMC_CRITICAL_PRIORITY_READS.ANY +.Pq Event 2EH , Umask 07H +Counts the number of Quickpath Memory Controller critical priority +isochronous read requests. +.It Li QMC_WRITES.FULL.CH0 +.Pq Event 2FH , Umask 01H +Counts number of full cache line writes to DRAM channel 0. +.It Li QMC_WRITES.FULL.CH1 +.Pq Event 2FH , Umask 02H +Counts number of full cache line writes to DRAM channel 1. +.It Li QMC_WRITES.FULL.CH2 +.Pq Event 2FH , Umask 04H +Counts number of full cache line writes to DRAM channel 2. +.It Li QMC_WRITES.FULL.ANY +.Pq Event 2FH , Umask 07H +Counts number of full cache line writes to DRAM. +.It Li QMC_WRITES.PARTIAL.CH0 +.Pq Event 2FH , Umask 08H +Counts number of partial cache line writes to DRAM channel 0. 
+.It Li QMC_WRITES.PARTIAL.CH1 +.Pq Event 2FH , Umask 10H +Counts number of partial cache line writes to DRAM channel 1. +.It Li QMC_WRITES.PARTIAL.CH2 +.Pq Event 2FH , Umask 20H +Counts number of partial cache line writes to DRAM channel 2. +.It Li QMC_WRITES.PARTIAL.ANY +.Pq Event 2FH , Umask 38H +Counts number of partial cache line writes to DRAM. +.It Li QMC_CANCEL.CH0 +.Pq Event 30H , Umask 01H +Counts number of DRAM channel 0 cancel requests. +.It Li QMC_CANCEL.CH1 +.Pq Event 30H , Umask 02H +Counts number of DRAM channel 1 cancel requests. +.It Li QMC_CANCEL.CH2 +.Pq Event 30H , Umask 04H +Counts number of DRAM channel 2 cancel requests. +.It Li QMC_CANCEL.ANY +.Pq Event 30H , Umask 07H +Counts number of DRAM cancel requests. +.It Li QMC_PRIORITY_UPDATES.CH0 +.Pq Event 31H , Umask 01H +Counts number of DRAM channel 0 priority updates. A priority update occurs +when an ISOC high or critical request is received by the QHL and there is a +matching request with normal priority that has already been issued to the +QMC. In this instance, the QHL will send a priority update to QMC to +expedite the request. +.It Li QMC_PRIORITY_UPDATES.CH1 +.Pq Event 31H , Umask 02H +Counts number of DRAM channel 1 priority updates. A priority update occurs +when an ISOC high or critical request is received by the QHL and there is a +matching request with normal priority that has already been issued to the +QMC. In this instance, the QHL will send a priority update to QMC to +expedite the request. +.It Li QMC_PRIORITY_UPDATES.CH2 +.Pq Event 31H , Umask 04H +Counts number of DRAM channel 2 priority updates. A priority update occurs +when an ISOC high or critical request is received by the QHL and there is a +matching request with normal priority that has already been issued to the +QMC. In this instance, the QHL will send a priority update to QMC to +expedite the request. +.It Li QMC_PRIORITY_UPDATES.ANY +.Pq Event 31H , Umask 07H +Counts number of DRAM priority updates. 
A priority update occurs when an +ISOC high or critical request is received by the QHL and there is a matching +request with normal priority that has already been issued to the QMC. In +this instance, the QHL will send a priority update to QMC to expedite the +request. +.It Li IMC_RETRY.CH0 +.Pq Event 32H , Umask 01H +Counts number of IMC DRAM channel 0 retries. DRAM retry only occurs when +configured in RAS mode. +.It Li IMC_RETRY.CH1 +.Pq Event 32H , Umask 02H +Counts number of IMC DRAM channel 1 retries. DRAM retry only occurs when +configured in RAS mode. +.It Li IMC_RETRY.CH2 +.Pq Event 32H , Umask 04H +Counts number of IMC DRAM channel 2 retries. DRAM retry only occurs when +configured in RAS mode. +.It Li IMC_RETRY.ANY +.Pq Event 32H , Umask 07H +Counts number of IMC DRAM retries from any channel. DRAM retry only occurs +when configured in RAS mode. +.It Li QHL_FRC_ACK_CNFLTS.IOH +.Pq Event 33H , Umask 01H +Counts number of Force Acknowledge Conflict messages sent by the Quickpath +Home Logic to the IOH. +.It Li QHL_FRC_ACK_CNFLTS.REMOTE +.Pq Event 33H , Umask 02H +Counts number of Force Acknowledge Conflict messages sent by the Quickpath +Home Logic to the remote home. +.It Li QHL_FRC_ACK_CNFLTS.LOCAL +.Pq Event 33H , Umask 04H +Counts number of Force Acknowledge Conflict messages sent by the Quickpath +Home Logic to the local home. +.It Li QHL_FRC_ACK_CNFLTS.ANY +.Pq Event 33H , Umask 07H +Counts number of Force Acknowledge Conflict messages sent by the Quickpath +Home Logic. +.It Li QHL_SLEEPS.IOH_ORDER +.Pq Event 34H , Umask 01H +Counts number of occurrences a request was put to sleep due to IOH ordering +(write after read) conflicts. While in the sleep state, the request is not +eligible to be scheduled to the QMC. +.It Li QHL_SLEEPS.REMOTE_ORDER +.Pq Event 34H , Umask 02H +Counts number of occurrences a request was put to sleep due to remote socket +ordering (write after read) conflicts. 
While in the sleep state, the request +is not eligible to be scheduled to the QMC. +.It Li QHL_SLEEPS.LOCAL_ORDER +.Pq Event 34H , Umask 04H +Counts number of occurrences a request was put to sleep due to local socket +ordering (write after read) conflicts. While in the sleep state, the request +is not eligible to be scheduled to the QMC. +.It Li QHL_SLEEPS.IOH_CONFLICT +.Pq Event 34H , Umask 08H +Counts number of occurrences a request was put to sleep due to IOH address +conflicts. While in the sleep state, the request is not eligible to be +scheduled to the QMC. +.It Li QHL_SLEEPS.REMOTE_CONFLICT +.Pq Event 34H , Umask 10H +Counts number of occurrences a request was put to sleep due to remote socket +address conflicts. While in the sleep state, the request is not eligible to +be scheduled to the QMC. +.It Li QHL_SLEEPS.LOCAL_CONFLICT +.Pq Event 34H , Umask 20H +Counts number of occurrences a request was put to sleep due to local socket +address conflicts. While in the sleep state, the request is not eligible to +be scheduled to the QMC. +.It Li ADDR_OPCODE_MATCH.IOH +.Pq Event 35H , Umask 01H +Counts number of requests from the IOH, address/opcode of request is +qualified by mask value written to MSR 396H. The following mask values are +supported: +0: NONE 40000000_00000000H:RSPFWDI 40001A00_00000000H:RSPFWDS +40001D00_00000000H:RSPIWB +Match opcode/address by writing MSR 396H with a supported mask value. +.It Li ADDR_OPCODE_MATCH.REMOTE +.Pq Event 35H , Umask 02H +Counts number of requests from the remote socket, address/opcode of request +is qualified by mask value written to MSR 396H.
The following mask values +are supported: +0: NONE 40000000_00000000H:RSPFWDI 40001A00_00000000H:RSPFWDS +40001D00_00000000H:RSPIWB +Match opcode/address by writing MSR 396H with a supported mask value. +.It Li ADDR_OPCODE_MATCH.LOCAL +.Pq Event 35H , Umask 04H +Counts number of requests from the local socket, address/opcode of request +is qualified by mask value written to MSR 396H. The following mask values +are supported: +0: NONE 40000000_00000000H:RSPFWDI 40001A00_00000000H:RSPFWDS +40001D00_00000000H:RSPIWB +Match opcode/address by writing MSR 396H with a supported mask value. +.It Li QPI_TX_STALLED_SINGLE_FLIT.HOME.LINK_0 +.Pq Event 40H , Umask 01H +Counts cycles the Quickpath outbound link 0 HOME virtual channel is stalled +due to lack of a VNA and VN0 credit. Note that this event does not filter +out when a flit would not have been selected for arbitration because another +virtual channel is getting arbitrated. +.It Li QPI_TX_STALLED_SINGLE_FLIT.SNOOP.LINK_0 +.Pq Event 40H , Umask 02H +Counts cycles the Quickpath outbound link 0 SNOOP virtual channel is stalled +due to lack of a VNA and VN0 credit. Note that this event does not filter +out when a flit would not have been selected for arbitration because another +virtual channel is getting arbitrated. +.It Li QPI_TX_STALLED_SINGLE_FLIT.NDR.LINK_0 +.Pq Event 40H , Umask 04H +Counts cycles the Quickpath outbound link 0 non-data response virtual +channel is stalled due to lack of a VNA and VN0 credit. Note that this event +does not filter out when a flit would not have been selected for arbitration +because another virtual channel is getting arbitrated. +.It Li QPI_TX_STALLED_SINGLE_FLIT.HOME.LINK_1 +.Pq Event 40H , Umask 08H +Counts cycles the Quickpath outbound link 1 HOME virtual channel is stalled +due to lack of a VNA and VN0 credit. Note that this event does not filter +out when a flit would not have been selected for arbitration because another +virtual channel is getting arbitrated.
+.It Li QPI_TX_STALLED_SINGLE_FLIT.SNOOP.LINK_1 +.Pq Event 40H , Umask 10H +Counts cycles the Quickpath outbound link 1 SNOOP virtual channel is stalled +due to lack of a VNA and VN0 credit. Note that this event does not filter +out when a flit would not have been selected for arbitration because another +virtual channel is getting arbitrated. +.It Li QPI_TX_STALLED_SINGLE_FLIT.NDR.LINK_1 +.Pq Event 40H , Umask 20H +Counts cycles the Quickpath outbound link 1 non-data response virtual +channel is stalled due to lack of a VNA and VN0 credit. Note that this event +does not filter out when a flit would not have been selected for arbitration +because another virtual channel is getting arbitrated. +.It Li QPI_TX_STALLED_SINGLE_FLIT.LINK_0 +.Pq Event 40H , Umask 07H +Counts cycles the Quickpath outbound link 0 virtual channels are stalled due +to lack of a VNA and VN0 credit. Note that this event does not filter out +when a flit would not have been selected for arbitration because another +virtual channel is getting arbitrated. +.It Li QPI_TX_STALLED_SINGLE_FLIT.LINK_1 +.Pq Event 40H , Umask 38H +Counts cycles the Quickpath outbound link 1 virtual channels are stalled due +to lack of a VNA and VN0 credit. Note that this event does not filter out +when a flit would not have been selected for arbitration because another +virtual channel is getting arbitrated. +.It Li QPI_TX_STALLED_MULTI_FLIT.DRS.LINK_0 +.Pq Event 41H , Umask 01H +Counts cycles the Quickpath outbound link 0 Data ResponSe virtual channel is +stalled due to lack of VNA and VN0 credits. Note that this event does not +filter out when a flit would not have been selected for arbitration because +another virtual channel is getting arbitrated. +.It Li QPI_TX_STALLED_MULTI_FLIT.NCB.LINK_0 +.Pq Event 41H , Umask 02H +Counts cycles the Quickpath outbound link 0 Non-Coherent Bypass virtual +channel is stalled due to lack of VNA and VN0 credits. 
Note that this event +does not filter out when a flit would not have been selected for arbitration +because another virtual channel is getting arbitrated. +.It Li QPI_TX_STALLED_MULTI_FLIT.NCS.LINK_0 +.Pq Event 41H , Umask 04H +Counts cycles the Quickpath outbound link 0 Non-Coherent Standard virtual +channel is stalled due to lack of VNA and VN0 credits. Note that this event +does not filter out when a flit would not have been selected for arbitration +because another virtual channel is getting arbitrated. +.It Li QPI_TX_STALLED_MULTI_FLIT.DRS.LINK_1 +.Pq Event 41H , Umask 08H +Counts cycles the Quickpath outbound link 1 Data ResponSe virtual channel is +stalled due to lack of VNA and VN0 credits. Note that this event does not +filter out when a flit would not have been selected for arbitration because +another virtual channel is getting arbitrated. +.It Li QPI_TX_STALLED_MULTI_FLIT.NCB.LINK_1 +.Pq Event 41H , Umask 10H +Counts cycles the Quickpath outbound link 1 Non-Coherent Bypass virtual +channel is stalled due to lack of VNA and VN0 credits. Note that this event +does not filter out when a flit would not have been selected for arbitration +because another virtual channel is getting arbitrated. +.It Li QPI_TX_STALLED_MULTI_FLIT.NCS.LINK_1 +.Pq Event 41H , Umask 20H +Counts cycles the Quickpath outbound link 1 Non-Coherent Standard virtual +channel is stalled due to lack of VNA and VN0 credits. Note that this event +does not filter out when a flit would not have been selected for arbitration +because another virtual channel is getting arbitrated. +.It Li QPI_TX_STALLED_MULTI_FLIT.LINK_0 +.Pq Event 41H , Umask 07H +Counts cycles the Quickpath outbound link 0 virtual channels are stalled due +to lack of VNA and VN0 credits. Note that this event does not filter out +when a flit would not have been selected for arbitration because another +virtual channel is getting arbitrated. 
+.It Li QPI_TX_STALLED_MULTI_FLIT.LINK_1 +.Pq Event 41H , Umask 38H +Counts cycles the Quickpath outbound link 1 virtual channels are stalled due +to lack of VNA and VN0 credits. Note that this event does not filter out +when a flit would not have been selected for arbitration because another +virtual channel is getting arbitrated. +.It Li QPI_TX_HEADER.FULL.LINK_0 +.Pq Event 42H , Umask 01H +Number of cycles that the header buffer in the Quickpath Interface outbound +link 0 is full. +.It Li QPI_TX_HEADER.BUSY.LINK_0 +.Pq Event 42H , Umask 02H +Number of cycles that the header buffer in the Quickpath Interface outbound +link 0 is busy. +.It Li QPI_TX_HEADER.FULL.LINK_1 +.Pq Event 42H , Umask 04H +Number of cycles that the header buffer in the Quickpath Interface outbound +link 1 is full. +.It Li QPI_TX_HEADER.BUSY.LINK_1 +.Pq Event 42H , Umask 08H +Number of cycles that the header buffer in the Quickpath Interface outbound +link 1 is busy. +.It Li QPI_RX_NO_PPT_CREDIT.STALLS.LINK_0 +.Pq Event 43H , Umask 01H +Number of cycles that snoop packets incoming to the Quickpath Interface link +0 are stalled and not sent to the GQ because the GQ Peer Probe Tracker (PPT) +does not have any available entries. +.It Li QPI_RX_NO_PPT_CREDIT.STALLS.LINK_1 +.Pq Event 43H , Umask 02H +Number of cycles that snoop packets incoming to the Quickpath Interface link +1 are stalled and not sent to the GQ because the GQ Peer Probe Tracker (PPT) +does not have any available entries. +.It Li DRAM_OPEN.CH0 +.Pq Event 60H , Umask 01H +Counts number of DRAM Channel 0 open commands issued either for read or +write. To read or write data, the referenced DRAM page must first be opened. +.It Li DRAM_OPEN.CH1 +.Pq Event 60H , Umask 02H +Counts number of DRAM Channel 1 open commands issued either for read or +write. To read or write data, the referenced DRAM page must first be opened. 
+.It Li DRAM_OPEN.CH2 +.Pq Event 60H , Umask 04H +Counts number of DRAM Channel 2 open commands issued either for read or +write. To read or write data, the referenced DRAM page must first be opened. +.It Li DRAM_PAGE_CLOSE.CH0 +.Pq Event 61H , Umask 01H +DRAM channel 0 command issued to CLOSE a page due to page idle timer +expiration. Closing a page is done by issuing a precharge. +.It Li DRAM_PAGE_CLOSE.CH1 +.Pq Event 61H , Umask 02H +DRAM channel 1 command issued to CLOSE a page due to page idle timer +expiration. Closing a page is done by issuing a precharge. +.It Li DRAM_PAGE_CLOSE.CH2 +.Pq Event 61H , Umask 04H +DRAM channel 2 command issued to CLOSE a page due to page idle timer +expiration. Closing a page is done by issuing a precharge. +.It Li DRAM_PAGE_MISS.CH0 +.Pq Event 62H , Umask 01H +Counts the number of precharges (PRE) that were issued to DRAM channel 0 +because there was a page miss. A page miss refers to a situation in which a +page is currently open and another page from the same bank needs to be +opened. The new page experiences a page miss. Closing of the old page is +done by issuing a precharge. +.It Li DRAM_PAGE_MISS.CH1 +.Pq Event 62H , Umask 02H +Counts the number of precharges (PRE) that were issued to DRAM channel 1 +because there was a page miss. A page miss refers to a situation in which a +page is currently open and another page from the same bank needs to be +opened. The new page experiences a page miss. Closing of the old page is +done by issuing a precharge. +.It Li DRAM_PAGE_MISS.CH2 +.Pq Event 62H , Umask 04H +Counts the number of precharges (PRE) that were issued to DRAM channel 2 +because there was a page miss. A page miss refers to a situation in which a +page is currently open and another page from the same bank needs to be +opened. The new page experiences a page miss. Closing of the old page is +done by issuing a precharge. 
+.It Li DRAM_READ_CAS.CH0 +.Pq Event 63H , Umask 01H +Counts the number of times a read CAS command was issued on DRAM channel 0. +.It Li DRAM_READ_CAS.AUTOPRE_CH0 +.Pq Event 63H , Umask 02H +Counts the number of times a read CAS command was issued on DRAM channel 0 +where the command issued used the auto-precharge (auto page close) mode. +.It Li DRAM_READ_CAS.CH1 +.Pq Event 63H , Umask 04H +Counts the number of times a read CAS command was issued on DRAM channel 1. +.It Li DRAM_READ_CAS.AUTOPRE_CH1 +.Pq Event 63H , Umask 08H +Counts the number of times a read CAS command was issued on DRAM channel 1 +where the command issued used the auto-precharge (auto page close) mode. +.It Li DRAM_READ_CAS.CH2 +.Pq Event 63H , Umask 10H +Counts the number of times a read CAS command was issued on DRAM channel 2. +.It Li DRAM_READ_CAS.AUTOPRE_CH2 +.Pq Event 63H , Umask 20H +Counts the number of times a read CAS command was issued on DRAM channel 2 +where the command issued used the auto-precharge (auto page close) mode. +.It Li DRAM_WRITE_CAS.CH0 +.Pq Event 64H , Umask 01H +Counts the number of times a write CAS command was issued on DRAM channel 0. +.It Li DRAM_WRITE_CAS.AUTOPRE_CH0 +.Pq Event 64H , Umask 02H +Counts the number of times a write CAS command was issued on DRAM channel 0 +where the command issued used the auto-precharge (auto page close) mode. +.It Li DRAM_WRITE_CAS.CH1 +.Pq Event 64H , Umask 04H +Counts the number of times a write CAS command was issued on DRAM channel 1. +.It Li DRAM_WRITE_CAS.AUTOPRE_CH1 +.Pq Event 64H , Umask 08H +Counts the number of times a write CAS command was issued on DRAM channel 1 +where the command issued used the auto-precharge (auto page close) mode. +.It Li DRAM_WRITE_CAS.CH2 +.Pq Event 64H , Umask 10H +Counts the number of times a write CAS command was issued on DRAM channel 2. 
+.It Li DRAM_WRITE_CAS.AUTOPRE_CH2 +.Pq Event 64H , Umask 20H +Counts the number of times a write CAS command was issued on DRAM channel 2 +where the command issued used the auto-precharge (auto page close) mode. +.It Li DRAM_REFRESH.CH0 +.Pq Event 65H , Umask 01H +Counts number of DRAM channel 0 refresh commands. DRAM loses data content +over time. In order to keep correct data content, the data values have to be +refreshed periodically. +.It Li DRAM_REFRESH.CH1 +.Pq Event 65H , Umask 02H +Counts number of DRAM channel 1 refresh commands. DRAM loses data content +over time. In order to keep correct data content, the data values have to be +refreshed periodically. +.It Li DRAM_REFRESH.CH2 +.Pq Event 65H , Umask 04H +Counts number of DRAM channel 2 refresh commands. DRAM loses data content +over time. In order to keep correct data content, the data values have to be +refreshed periodically. +.It Li DRAM_PRE_ALL.CH0 +.Pq Event 66H , Umask 01H +Counts number of DRAM Channel 0 precharge-all (PREALL) commands that close +all open pages in a rank. PREALL is issued when the DRAM needs to be +refreshed or needs to go into a power down mode. +.It Li DRAM_PRE_ALL.CH1 +.Pq Event 66H , Umask 02H +Counts number of DRAM Channel 1 precharge-all (PREALL) commands that close +all open pages in a rank. PREALL is issued when the DRAM needs to be +refreshed or needs to go into a power down mode. +.It Li DRAM_PRE_ALL.CH2 +.Pq Event 66H , Umask 04H +Counts number of DRAM Channel 2 precharge-all (PREALL) commands that close +all open pages in a rank. PREALL is issued when the DRAM needs to be +refreshed or needs to go into a power down mode. +.It Li DRAM_THERMAL_THROTTLED +.Pq Event 67H , Umask 01H +Uncore cycles DRAM was throttled due to its temperature being above the +thermal throttling threshold. +.It Li THERMAL_THROTTLING_TEMP.CORE_0 +.Pq Event 80H , Umask 01H +Cycles that the PCU records that core 0 is above the thermal throttling +threshold temperature. 
+.It Li THERMAL_THROTTLING_TEMP.CORE_1 +.Pq Event 80H , Umask 02H +Cycles that the PCU records that core 1 is above the thermal throttling +threshold temperature. +.It Li THERMAL_THROTTLING_TEMP.CORE_2 +.Pq Event 80H , Umask 04H +Cycles that the PCU records that core 2 is above the thermal throttling +threshold temperature. +.It Li THERMAL_THROTTLING_TEMP.CORE_3 +.Pq Event 80H , Umask 08H +Cycles that the PCU records that core 3 is above the thermal throttling +threshold temperature. +.It Li THERMAL_THROTTLED_TEMP.CORE_0 +.Pq Event 81H , Umask 01H +Cycles that the PCU records that core 0 is in the power throttled state due +to the core's temperature being above the thermal throttling threshold. +.It Li THERMAL_THROTTLED_TEMP.CORE_1 +.Pq Event 81H , Umask 02H +Cycles that the PCU records that core 1 is in the power throttled state due +to the core's temperature being above the thermal throttling threshold. +.It Li THERMAL_THROTTLED_TEMP.CORE_2 +.Pq Event 81H , Umask 04H +Cycles that the PCU records that core 2 is in the power throttled state due +to the core's temperature being above the thermal throttling threshold. +.It Li THERMAL_THROTTLED_TEMP.CORE_3 +.Pq Event 81H , Umask 08H +Cycles that the PCU records that core 3 is in the power throttled state due +to the core's temperature being above the thermal throttling threshold. +.It Li PROCHOT_ASSERTION +.Pq Event 82H , Umask 01H +Number of system assertions of PROCHOT indicating the entire processor has +exceeded the thermal limit. +.It Li THERMAL_THROTTLING_PROCHOT.CORE_0 +.Pq Event 83H , Umask 01H +Cycles that the PCU records that core 0 is in a low power state due to the +system asserting PROCHOT, indicating the entire processor has exceeded the thermal +limit. +.It Li THERMAL_THROTTLING_PROCHOT.CORE_1 +.Pq Event 83H , Umask 02H +Cycles that the PCU records that core 1 is in a low power state due to the +system asserting PROCHOT, indicating the entire processor has exceeded the thermal +limit.
+.It Li THERMAL_THROTTLING_PROCHOT.CORE_2 +.Pq Event 83H , Umask 04H +Cycles that the PCU records that core 2 is in a low power state due to the +system asserting PROCHOT, indicating the entire processor has exceeded the thermal +limit. +.It Li THERMAL_THROTTLING_PROCHOT.CORE_3 +.Pq Event 83H , Umask 08H +Cycles that the PCU records that core 3 is in a low power state due to the +system asserting PROCHOT, indicating the entire processor has exceeded the thermal +limit. +.It Li TURBO_MODE.CORE_0 +.Pq Event 84H , Umask 01H +Uncore cycles that core 0 is operating in turbo mode. +.It Li TURBO_MODE.CORE_1 +.Pq Event 84H , Umask 02H +Uncore cycles that core 1 is operating in turbo mode. +.It Li TURBO_MODE.CORE_2 +.Pq Event 84H , Umask 04H +Uncore cycles that core 2 is operating in turbo mode. +.It Li TURBO_MODE.CORE_3 +.Pq Event 84H , Umask 08H +Uncore cycles that core 3 is operating in turbo mode. +.It Li CYCLES_UNHALTED_L3_FLL_ENABLE +.Pq Event 85H , Umask 02H +Uncore cycles that at least one core is unhalted and all L3 ways are +enabled. +.It Li CYCLES_UNHALTED_L3_FLL_DISABLE +.Pq Event 86H , Umask 01H +Uncore cycles that at least one core is unhalted and all L3 ways are +disabled. +.El +.Sh SEE ALSO +.Xr pmc 3 , +.Xr pmc.atom 3 , +.Xr pmc.core 3 , +.Xr pmc.iaf 3 , +.Xr pmc.ucf 3 , +.Xr pmc.k7 3 , +.Xr pmc.k8 3 , +.Xr pmc.p4 3 , +.Xr pmc.p5 3 , +.Xr pmc.p6 3 , +.Xr pmc.corei7 3 , +.Xr pmc.corei7uc 3 , +.Xr pmc.westmere 3 , +.Xr pmc.tsc 3 , +.Xr pmc_cpuinfo 3 , +.Xr pmclog 3 , +.Xr hwpmc 4 +.Sh HISTORY +The +.Nm pmc +library first appeared in +.Fx 6.0 . +.Sh AUTHORS +The +.Lb libpmc +library was written by +.An "Joseph Koshy" +.Aq jkoshy@FreeBSD.org .
Property changes on: stable/8/lib/libpmc/pmc.westmereuc.3 ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Index: stable/8/lib/libpmc =================================================================== --- stable/8/lib/libpmc (revision 206701) +++ stable/8/lib/libpmc (revision 206702) Property changes on: stable/8/lib/libpmc ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head/lib/libpmc:r206089