diff --git a/sys/amd64/amd64/trap.c b/sys/amd64/amd64/trap.c index 56ab882f9b14..7c8da2df7fe4 100644 --- a/sys/amd64/amd64/trap.c +++ b/sys/amd64/amd64/trap.c @@ -1,842 +1,858 @@ /*- * Copyright (C) 1994, David Greenman * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the University of Utah, and William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * from: @(#)trap.c 7.4 (Berkeley) 5/13/91 */ #include __FBSDID("$FreeBSD$"); /* * AMD64 Trap and System call handling */ #include "opt_clock.h" #include "opt_cpu.h" #include "opt_isa.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif +#ifdef HWPMC_HOOKS +#include +#endif #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #include extern void trap(struct trapframe frame); extern void syscall(struct trapframe frame); static int trap_pfault(struct trapframe *, int); static void trap_fatal(struct trapframe *, vm_offset_t); void dblfault_handler(void); #define MAX_TRAP_MSG 28 static char *trap_msg[] = { "", /* 0 unused */ "privileged instruction fault", /* 1 T_PRIVINFLT */ "", /* 2 unused */ "breakpoint instruction fault", /* 3 T_BPTFLT */ "", /* 4 unused */ "", /* 5 unused */ "arithmetic trap", /* 6 T_ARITHTRAP */ "", /* 7 unused */ "", /* 8 unused */ "general protection fault", /* 9 T_PROTFLT */ "trace trap", /* 10 T_TRCTRAP */ "", /* 11 unused */ "page fault", /* 12 T_PAGEFLT */ "", /* 13 unused */ "alignment fault", /* 14 T_ALIGNFLT */ "", /* 15 unused */ "", /* 16 unused */ "", /* 17 unused */ "integer divide fault", /* 18 T_DIVIDE */ "non-maskable interrupt trap", /* 19 T_NMI */ "overflow trap", /* 20 T_OFLOW */ "FPU bounds check fault", /* 21 T_BOUND */ "FPU device not available", /* 22 T_DNA */ "double fault", /* 23 T_DOUBLEFLT */ "FPU operand fetch fault", /* 24 T_FPOPFLT */ "invalid TSS fault", /* 25 T_TSSFLT */ "segment not present fault", /* 26 T_SEGNPFLT */ "stack fault", /* 27 T_STKFLT */ "machine check trap", /* 28 T_MCHK */ }; #ifdef KDB static int kdb_on_nmi = 1; SYSCTL_INT(_machdep, OID_AUTO, kdb_on_nmi, CTLFLAG_RW, &kdb_on_nmi, 0, "Go to KDB on NMI"); #endif static int panic_on_nmi = 1; SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RW, &panic_on_nmi, 0, "Panic on NMI"); #ifdef WITNESS extern char *syscallnames[]; #endif #ifdef DEVICE_POLLING extern u_int32_t poll_in_trap; extern int ether_poll(int count); #endif /* DEVICE_POLLING */ /* * Exception, fault, and trap interface to the FreeBSD kernel. * This common code is called from assembly language IDT gate entry * routines that prepare a suitable stack frame, and restore this * frame after the exception has been processed. */ void trap(frame) struct trapframe frame; { struct thread *td = curthread; struct proc *p = td->td_proc; u_int sticks = 0; int i = 0, ucode = 0, type, code; PCPU_LAZY_INC(cnt.v_trap); type = frame.tf_trapno; #ifdef KDB_STOP_NMI /* Handler for NMI IPIs used for debugging */ if (type == T_NMI) { if (ipi_nmi_handler() == 0) goto out; } #endif /* KDB_STOP_NMI */ #ifdef KDB if (kdb_active) { kdb_reenter(); goto out; } #endif +#ifdef HWPMC_HOOKS + /* + * CPU PMCs interrupt using an NMI. If the PMC module is + * active, pass the 'rip' value to the PMC module's interrupt + * handler. A return value of '1' from the handler means that + * the NMI was handled by it and we can return immediately. + */ + if (type == T_NMI && pmc_intr && + (*pmc_intr)(PCPU_GET(cpuid), (uintptr_t) frame.tf_rip, + TRAPF_USERMODE(&frame))) + goto out; +#endif + if ((frame.tf_rflags & PSL_I) == 0) { /* * Buggy application or kernel code has disabled * interrupts and then trapped. 
Enabling interrupts * now is wrong, but it is better than running with * interrupts disabled until they are accidentally * enabled later. */ if (ISPL(frame.tf_cs) == SEL_UPL) printf( "pid %ld (%s): trap %d with interrupts disabled\n", (long)curproc->p_pid, curproc->p_comm, type); else if (type != T_BPTFLT && type != T_TRCTRAP) { /* * XXX not quite right, since this may be for a * multiple fault in user mode. */ printf("kernel trap %d with interrupts disabled\n", type); /* * We shouldn't enable interrupts while in a critical * section. */ if (td->td_critnest == 0) enable_intr(); } } code = frame.tf_err; if (type == T_PAGEFLT) { /* * If we get a page fault while in a critical section, then * it is most likely a fatal kernel page fault. The kernel * is already going to panic trying to get a sleep lock to * do the VM lookup, so just consider it a fatal trap so the * kernel can print out a useful trap message and even get * to the debugger. */ if (td->td_critnest != 0) trap_fatal(&frame, frame.tf_addr); } #ifdef DEVICE_POLLING if (poll_in_trap) ether_poll(poll_in_trap); #endif /* DEVICE_POLLING */ if (ISPL(frame.tf_cs) == SEL_UPL) { /* user trap */ sticks = td->td_sticks; td->td_frame = &frame; if (td->td_ucred != p->p_ucred) cred_update_thread(td); switch (type) { case T_PRIVINFLT: /* privileged instruction fault */ ucode = type; i = SIGILL; break; case T_BPTFLT: /* bpt instruction fault */ case T_TRCTRAP: /* trace trap */ enable_intr(); frame.tf_rflags &= ~PSL_T; i = SIGTRAP; break; case T_ARITHTRAP: /* arithmetic trap */ ucode = fputrap(); if (ucode == -1) goto userout; i = SIGFPE; break; case T_PROTFLT: /* general protection fault */ case T_STKFLT: /* stack fault */ case T_SEGNPFLT: /* segment not present fault */ case T_TSSFLT: /* invalid TSS fault */ case T_DOUBLEFLT: /* double fault */ default: ucode = code + BUS_SEGM_FAULT ; i = SIGBUS; break; case T_PAGEFLT: /* page fault */ if (td->td_pflags & TDP_SA) thread_user_enter(td); i = trap_pfault(&frame, TRUE); if (i == -1) goto userout; if (i == 0) goto user; ucode = T_PAGEFLT; break; case T_DIVIDE: /* integer divide fault */ ucode = FPE_INTDIV; i = SIGFPE; break; #ifdef DEV_ISA case T_NMI: /* machine/parity/power fail/"kitchen sink" faults */ /* XXX Giant */ if (isa_nmi(code) == 0) { #ifdef KDB /* * NMI can be hooked up to a pushbutton * for debugging. */ if (kdb_on_nmi) { printf ("NMI ... going to debugger\n"); kdb_trap(type, 0, &frame); } #endif /* KDB */ goto userout; } else if (panic_on_nmi) panic("NMI indicates hardware failure"); break; #endif /* DEV_ISA */ case T_OFLOW: /* integer overflow fault */ ucode = FPE_INTOVF; i = SIGFPE; break; case T_BOUND: /* bounds check fault */ ucode = FPE_FLTSUB; i = SIGFPE; break; case T_DNA: /* transparent fault (due to context switch "late") */ if (fpudna()) goto userout; i = SIGFPE; ucode = FPE_FPU_NP_TRAP; break; case T_FPOPFLT: /* FPU operand fetch fault */ ucode = T_FPOPFLT; i = SIGILL; break; case T_XMMFLT: /* SIMD floating-point exception */ ucode = 0; /* XXX */ i = SIGFPE; break; } } else { /* kernel trap */ KASSERT(cold || td->td_ucred != NULL, ("kernel trap doesn't have ucred")); switch (type) { case T_PAGEFLT: /* page fault */ (void) trap_pfault(&frame, FALSE); goto out; case T_DNA: /* * The kernel is apparently using fpu for copying. * XXX this should be fatal unless the kernel has * registered such use. 
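 *
 * Stepping back to the HWPMC_HOOKS hunk added near the top of trap(): every
 * NMI is offered to the PMC driver first, and trap() returns early if the
 * driver claims it.  A minimal, self-contained sketch of that claim-or-decline
 * dispatch follows; pmc_intr is the hook name the patch uses, while
 * handle_nmi and pmc_nmi_handler are made-up stand-ins for the kernel's
 * trap() path and the driver's handler:
 *
 *      #include <stdint.h>
 *      #include <stdio.h>
 *
 *      static int (*pmc_intr)(int cpu, uintptr_t pc, int usermode);
 *
 *      static int
 *      pmc_nmi_handler(int cpu, uintptr_t pc, int usermode)
 *      {
 *              printf("pmc: cpu %d pc %#lx usermode %d\n", cpu,
 *                  (unsigned long)pc, usermode);
 *              return (1);             // 1 == "this NMI was mine"
 *      }
 *
 *      static void
 *      handle_nmi(int cpu, uintptr_t pc, int usermode)
 *      {
 *              if (pmc_intr != NULL && (*pmc_intr)(cpu, pc, usermode))
 *                      return;         // claimed by the PMC driver
 *              // otherwise fall through to the isa_nmi()/panic handling
 *      }
 *
 *      int
 *      main(void)
 *      {
 *              pmc_intr = pmc_nmi_handler;
 *              handle_nmi(0, 0x1000, 0);
 *              return (0);
 *      }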
*/ if (fpudna()) { printf("fpudna in kernel mode!\n"); goto out; } break; case T_STKFLT: /* stack fault */ break; case T_PROTFLT: /* general protection fault */ case T_SEGNPFLT: /* segment not present fault */ if (td->td_intr_nesting_level != 0) break; /* * Invalid segment selectors and out of bounds * %rip's and %rsp's can be set up in user mode. * This causes a fault in kernel mode when the * kernel tries to return to user mode. We want * to get this fault so that we can fix the * problem here and not have to check all the * selectors and pointers when the user changes * them. */ if (frame.tf_rip == (long)doreti_iret) { frame.tf_rip = (long)doreti_iret_fault; goto out; } if (PCPU_GET(curpcb)->pcb_onfault != NULL) { frame.tf_rip = (long)PCPU_GET(curpcb)->pcb_onfault; goto out; } break; case T_TSSFLT: /* * PSL_NT can be set in user mode and isn't cleared * automatically when the kernel is entered. This * causes a TSS fault when the kernel attempts to * `iret' because the TSS link is uninitialized. We * want to get this fault so that we can fix the * problem here and not every time the kernel is * entered. */ if (frame.tf_rflags & PSL_NT) { frame.tf_rflags &= ~PSL_NT; goto out; } break; case T_TRCTRAP: /* trace trap */ /* * Ignore debug register trace traps due to * accesses in the user's address space, which * can happen under several conditions such as * if a user sets a watchpoint on a buffer and * then passes that buffer to a system call. * We still want to get TRCTRAPS for addresses * in kernel space because that is useful when * debugging the kernel. */ if (user_dbreg_trap()) { /* * Reset breakpoint bits because the * processor doesn't */ /* XXX check upper bits here */ load_dr6(rdr6() & 0xfffffff0); goto out; } /* * FALLTHROUGH (TRCTRAP kernel mode, kernel address) */ case T_BPTFLT: /* * If KDB is enabled, let it handle the debugger trap. * Otherwise, debugger traps "can't happen". */ #ifdef KDB /* XXX Giant */ if (kdb_trap(type, 0, &frame)) goto out; #endif break; #ifdef DEV_ISA case T_NMI: /* XXX Giant */ /* machine/parity/power fail/"kitchen sink" faults */ if (isa_nmi(code) == 0) { #ifdef KDB /* * NMI can be hooked up to a pushbutton * for debugging. */ if (kdb_on_nmi) { printf ("NMI ... going to debugger\n"); kdb_trap(type, 0, &frame); } #endif /* KDB */ goto out; } else if (panic_on_nmi == 0) goto out; /* FALLTHROUGH */ #endif /* DEV_ISA */ } trap_fatal(&frame, 0); goto out; } /* Translate fault for emulators (e.g. Linux) */ if (*p->p_sysent->sv_transtrap) i = (*p->p_sysent->sv_transtrap)(i, type); trapsignal(td, i, ucode); #ifdef DEBUG if (type <= MAX_TRAP_MSG) { uprintf("fatal process exception: %s", trap_msg[type]); if ((type == T_PAGEFLT) || (type == T_PROTFLT)) uprintf(", fault VA = 0x%lx", frame.tf_addr); uprintf("\n"); } #endif user: userret(td, &frame, sticks); mtx_assert(&Giant, MA_NOTOWNED); userout: out: return; } static int trap_pfault(frame, usermode) struct trapframe *frame; int usermode; { vm_offset_t va; struct vmspace *vm = NULL; vm_map_t map = 0; int rv = 0; vm_prot_t ftype; struct thread *td = curthread; struct proc *p = td->td_proc; vm_offset_t eva = frame->tf_addr; va = trunc_page(eva); if (va >= KERNBASE) { /* * Don't allow user-mode faults in kernel address space. */ if (usermode) goto nogo; map = kernel_map; } else { /* * This is a fault on non-kernel virtual memory. * vm is initialized above to NULL. If curproc is NULL * or curproc->p_vmspace is NULL the fault is fatal. 
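 *
 * Just below, the protection type passed to vm_fault() is picked from the
 * hardware error code, and trap_fatal() decodes the same bits for its
 * diagnostic printout.  A small stand-alone sketch of that decoding; the
 * PGEX bit values shown here follow the usual x86 layout and are only
 * illustrative, the kernel's PGEX_* definitions are authoritative:
 *
 *      #include <stdio.h>
 *
 *      #define PGEX_P  0x01    // fault on a present page (protection)
 *      #define PGEX_W  0x02    // fault was on a write
 *      #define PGEX_U  0x04    // fault came from user mode
 *
 *      int
 *      main(void)
 *      {
 *              unsigned err = PGEX_U | PGEX_W; // sample tf_err value
 *
 *              printf("%s %s, %s\n",
 *                  (err & PGEX_U) ? "user" : "supervisor",
 *                  (err & PGEX_W) ? "write" : "read",
 *                  (err & PGEX_P) ? "protection violation" :
 *                      "page not present");
 *              return (0);
 *      }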
*/ if (p != NULL) vm = p->p_vmspace; if (vm == NULL) goto nogo; map = &vm->vm_map; } if (frame->tf_err & PGEX_W) ftype = VM_PROT_WRITE; else ftype = VM_PROT_READ; if (map != kernel_map) { /* * Keep swapout from messing with us during this * critical time. */ PROC_LOCK(p); ++p->p_lock; PROC_UNLOCK(p); /* Fault in the user page: */ rv = vm_fault(map, va, ftype, (ftype & VM_PROT_WRITE) ? VM_FAULT_DIRTY : VM_FAULT_NORMAL); PROC_LOCK(p); --p->p_lock; PROC_UNLOCK(p); } else { /* * Don't have to worry about process locking or stacks in the * kernel. */ rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL); } if (rv == KERN_SUCCESS) return (0); nogo: if (!usermode) { if (td->td_intr_nesting_level == 0 && PCPU_GET(curpcb)->pcb_onfault != NULL) { frame->tf_rip = (long)PCPU_GET(curpcb)->pcb_onfault; return (0); } trap_fatal(frame, eva); return (-1); } return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV); } static void trap_fatal(frame, eva) struct trapframe *frame; vm_offset_t eva; { int code, type, ss; long esp; struct soft_segment_descriptor softseg; code = frame->tf_err; type = frame->tf_trapno; sdtossd(&gdt[IDXSEL(frame->tf_cs & 0xffff)], &softseg); if (type <= MAX_TRAP_MSG) printf("\n\nFatal trap %d: %s while in %s mode\n", type, trap_msg[type], ISPL(frame->tf_cs) == SEL_UPL ? "user" : "kernel"); #ifdef SMP /* two separate prints in case of a trap on an unmapped page */ printf("cpuid = %d; ", PCPU_GET(cpuid)); printf("apic id = %02x\n", PCPU_GET(apic_id)); #endif if (type == T_PAGEFLT) { printf("fault virtual address = 0x%lx\n", eva); printf("fault code = %s %s, %s\n", code & PGEX_U ? "user" : "supervisor", code & PGEX_W ? "write" : "read", code & PGEX_P ? "protection violation" : "page not present"); } printf("instruction pointer = 0x%lx:0x%lx\n", frame->tf_cs & 0xffff, frame->tf_rip); if (ISPL(frame->tf_cs) == SEL_UPL) { ss = frame->tf_ss & 0xffff; esp = frame->tf_rsp; } else { ss = GSEL(GDATA_SEL, SEL_KPL); esp = (long)&frame->tf_rsp; } printf("stack pointer = 0x%x:0x%lx\n", ss, esp); printf("frame pointer = 0x%x:0x%lx\n", ss, frame->tf_rbp); printf("code segment = base 0x%lx, limit 0x%lx, type 0x%x\n", softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type); printf(" = DPL %d, pres %d, long %d, def32 %d, gran %d\n", softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_long, softseg.ssd_def32, softseg.ssd_gran); printf("processor eflags = "); if (frame->tf_rflags & PSL_T) printf("trace trap, "); if (frame->tf_rflags & PSL_I) printf("interrupt enabled, "); if (frame->tf_rflags & PSL_NT) printf("nested task, "); if (frame->tf_rflags & PSL_RF) printf("resume, "); printf("IOPL = %ld\n", (frame->tf_rflags & PSL_IOPL) >> 12); printf("current process = "); if (curproc) { printf("%lu (%s)\n", (u_long)curproc->p_pid, curproc->p_comm ? curproc->p_comm : ""); } else { printf("Idle\n"); } #ifdef KDB if ((debugger_on_panic || kdb_active) && kdb_trap(type, 0, frame)) return; #endif printf("trap number = %d\n", type); if (type <= MAX_TRAP_MSG) panic("%s", trap_msg[type]); else panic("unknown/reserved trap"); } /* * Double fault handler. Called when a fault occurs while writing * a frame for a trap/exception onto the stack. This usually occurs * when the stack overflows (such is the case with infinite recursion, * for example). 
*/ void dblfault_handler() { printf("\nFatal double fault\n"); #ifdef SMP /* two separate prints in case of a trap on an unmapped page */ printf("cpuid = %d; ", PCPU_GET(cpuid)); printf("apic id = %02x\n", PCPU_GET(apic_id)); #endif panic("double fault"); } /* * syscall - system call request C handler * * A system call is essentially treated as a trap. */ void syscall(frame) struct trapframe frame; { caddr_t params; struct sysent *callp; struct thread *td = curthread; struct proc *p = td->td_proc; register_t orig_tf_rflags; u_int sticks; int error; int narg; register_t args[8]; register_t *argp; u_int code; int reg, regcnt; /* * note: PCPU_LAZY_INC() can only be used if we can afford * occassional inaccuracy in the count. */ PCPU_LAZY_INC(cnt.v_syscall); #ifdef DIAGNOSTIC if (ISPL(frame.tf_cs) != SEL_UPL) { mtx_lock(&Giant); /* try to stabilize the system XXX */ panic("syscall"); /* NOT REACHED */ mtx_unlock(&Giant); } #endif reg = 0; regcnt = 6; sticks = td->td_sticks; td->td_frame = &frame; if (td->td_ucred != p->p_ucred) cred_update_thread(td); if (p->p_flag & P_SA) thread_user_enter(td); params = (caddr_t)frame.tf_rsp + sizeof(register_t); code = frame.tf_rax; orig_tf_rflags = frame.tf_rflags; if (p->p_sysent->sv_prepsyscall) { /* * The prep code is MP aware. */ (*p->p_sysent->sv_prepsyscall)(&frame, (int *)args, &code, ¶ms); } else { if (code == SYS_syscall || code == SYS___syscall) { code = frame.tf_rdi; reg++; regcnt--; } } if (p->p_sysent->sv_mask) code &= p->p_sysent->sv_mask; if (code >= p->p_sysent->sv_size) callp = &p->p_sysent->sv_table[0]; else callp = &p->p_sysent->sv_table[code]; narg = callp->sy_narg & SYF_ARGMASK; /* * copyin and the ktrsyscall()/ktrsysret() code is MP-aware */ KASSERT(narg <= sizeof(args) / sizeof(args[0]), ("Too many syscall arguments!")); error = 0; argp = &frame.tf_rdi; argp += reg; bcopy(argp, args, sizeof(args[0]) * regcnt); if (narg > regcnt) { KASSERT(params != NULL, ("copyin args with no params!")); error = copyin(params, &args[regcnt], (narg - regcnt) * sizeof(args[0])); } argp = &args[0]; #ifdef KTRACE if (KTRPOINT(td, KTR_SYSCALL)) ktrsyscall(code, narg, argp); #endif CTR4(KTR_SYSC, "syscall enter thread %p pid %d proc %s code %d", td, td->td_proc->p_pid, td->td_proc->p_comm, code); if (error == 0) { td->td_retval[0] = 0; td->td_retval[1] = frame.tf_rdx; STOPEVENT(p, S_SCE, narg); PTRACESTOP_SC(p, td, S_PT_SCE); if ((callp->sy_narg & SYF_MPSAFE) == 0) { mtx_lock(&Giant); error = (*callp->sy_call)(td, argp); mtx_unlock(&Giant); } else error = (*callp->sy_call)(td, argp); } switch (error) { case 0: frame.tf_rax = td->td_retval[0]; frame.tf_rdx = td->td_retval[1]; frame.tf_rflags &= ~PSL_C; break; case ERESTART: /* * Reconstruct pc, we know that 'syscall' is 2 bytes. * We have to do a full context restore so that %r10 * (which was holding the value of %rcx) is restored for * the next iteration. */ frame.tf_rip -= frame.tf_err; frame.tf_r10 = frame.tf_rcx; td->td_pcb->pcb_flags |= PCB_FULLCTX; break; case EJUSTRETURN: break; default: if (p->p_sysent->sv_errsize) { if (error >= p->p_sysent->sv_errsize) error = -1; /* XXX */ else error = p->p_sysent->sv_errtbl[error]; } frame.tf_rax = error; frame.tf_rflags |= PSL_C; break; } /* * Traced syscall. 
*/ if (orig_tf_rflags & PSL_T) { frame.tf_rflags &= ~PSL_T; trapsignal(td, SIGTRAP, 0); } /* * Handle reschedule and other end-of-syscall issues */ userret(td, &frame, sticks); CTR4(KTR_SYSC, "syscall exit thread %p pid %d proc %s code %d", td, td->td_proc->p_pid, td->td_proc->p_comm, code); #ifdef KTRACE if (KTRPOINT(td, KTR_SYSRET)) ktrsysret(code, error, td->td_retval[0]); #endif /* * This works because errno is findable through the * register set. If we ever support an emulation where this * is not the case, this code will need to be revisited. */ STOPEVENT(p, S_SCX, code); PTRACESTOP_SC(p, td, S_PT_SCX); WITNESS_WARN(WARN_PANIC, NULL, "System call %s returning", (code >= 0 && code < SYS_MAXSYSCALL) ? syscallnames[code] : "???"); mtx_assert(&sched_lock, MA_NOTOWNED); mtx_assert(&Giant, MA_NOTOWNED); } diff --git a/sys/dev/hwpmc/hwpmc_amd.c b/sys/dev/hwpmc/hwpmc_amd.c index cd3db049e09b..1eb987dba806 100644 --- a/sys/dev/hwpmc/hwpmc_amd.c +++ b/sys/dev/hwpmc/hwpmc_amd.c @@ -1,1028 +1,1030 @@ /*- * Copyright (c) 2003-2005 Joseph Koshy * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* */ #include __FBSDID("$FreeBSD$"); /* Support for the AMD K7 and later processors */ #include #include #include #include #include #include #include #include /* AMD K7 and K8 PMCs */ #define AMD_PMC_EVSEL_0 0xC0010000 #define AMD_PMC_EVSEL_1 0xC0010001 #define AMD_PMC_EVSEL_2 0xC0010002 #define AMD_PMC_EVSEL_3 0xC0010003 #define AMD_PMC_PERFCTR_0 0xC0010004 #define AMD_PMC_PERFCTR_1 0xC0010005 #define AMD_PMC_PERFCTR_2 0xC0010006 #define AMD_PMC_PERFCTR_3 0xC0010007 #define K7_VALID_EVENT_CODE(c) (((c) >= 0x40 && (c) <= 0x47) || \ ((c) >= 0x80 && (c) <= 0x85) || ((c) >= 0xC0 && (c) <= 0xC7) || \ ((c) >= 0xCD && (c) <= 0xCF)) #define AMD_PMC_CAPS (PMC_CAP_INTERRUPT | PMC_CAP_USER | \ PMC_CAP_SYSTEM | PMC_CAP_EDGE | PMC_CAP_THRESHOLD | \ PMC_CAP_READ | PMC_CAP_WRITE | PMC_CAP_INVERT | PMC_CAP_QUALIFIER) /* reserved bits include bit 21 and the top two bits of the unit mask */ #define K7_PMC_RESERVED ((1 << 21) | (3 << 13)) #define K8_PMC_RESERVED (1 << 21) #define AMD_PMC_IS_STOPPED(evsel) ((rdmsr((evsel)) & AMD_PMC_ENABLE) == 0) #define AMD_PMC_HAS_OVERFLOWED(pmc) ((rdpmc(pmc) & (1ULL << 47)) == 0) #if __i386__ #define AMD_NPMCS K7_NPMCS #define AMD_PMC_CLASS PMC_CLASS_K7 #define AMD_PMC_COUNTERMASK K7_PMC_COUNTERMASK #define AMD_PMC_TO_COUNTER(x) K7_PMC_TO_COUNTER(x) #define AMD_PMC_INVERT K7_PMC_INVERT #define AMD_PMC_ENABLE K7_PMC_ENABLE #define AMD_PMC_INT K7_PMC_INT #define AMD_PMC_PC K7_PMC_PC #define AMD_PMC_EDGE K7_PMC_EDGE #define AMD_PMC_OS K7_PMC_OS #define AMD_PMC_USR K7_PMC_USR #define AMD_PMC_UNITMASK_M K7_PMC_UNITMASK_M #define AMD_PMC_UNITMASK_O K7_PMC_UNITMASK_O #define AMD_PMC_UNITMASK_E K7_PMC_UNITMASK_E #define AMD_PMC_UNITMASK_S K7_PMC_UNITMASK_S #define AMD_PMC_UNITMASK_I K7_PMC_UNITMASK_I #define AMD_PMC_UNITMASK K7_PMC_UNITMASK #define AMD_PMC_EVENTMASK K7_PMC_EVENTMASK #define AMD_PMC_TO_UNITMASK(x) K7_PMC_TO_UNITMASK(x) #define AMD_PMC_TO_EVENTMASK(x) K7_PMC_TO_EVENTMASK(x) #define AMD_VALID_BITS K7_VALID_BITS #define AMD_PMC_CLASS_NAME "K7-" #elif __amd64__ #define AMD_NPMCS K8_NPMCS #define AMD_PMC_CLASS PMC_CLASS_K8 #define AMD_PMC_COUNTERMASK K8_PMC_COUNTERMASK #define AMD_PMC_TO_COUNTER(x) K8_PMC_TO_COUNTER(x) #define AMD_PMC_INVERT K8_PMC_INVERT #define AMD_PMC_ENABLE K8_PMC_ENABLE #define AMD_PMC_INT K8_PMC_INT #define AMD_PMC_PC K8_PMC_PC #define AMD_PMC_EDGE K8_PMC_EDGE #define AMD_PMC_OS K8_PMC_OS #define AMD_PMC_USR K8_PMC_USR #define AMD_PMC_UNITMASK_M K8_PMC_UNITMASK_M #define AMD_PMC_UNITMASK_O K8_PMC_UNITMASK_O #define AMD_PMC_UNITMASK_E K8_PMC_UNITMASK_E #define AMD_PMC_UNITMASK_S K8_PMC_UNITMASK_S #define AMD_PMC_UNITMASK_I K8_PMC_UNITMASK_I #define AMD_PMC_UNITMASK K8_PMC_UNITMASK #define AMD_PMC_EVENTMASK K8_PMC_EVENTMASK #define AMD_PMC_TO_UNITMASK(x) K8_PMC_TO_UNITMASK(x) #define AMD_PMC_TO_EVENTMASK(x) K8_PMC_TO_EVENTMASK(x) #define AMD_VALID_BITS K8_VALID_BITS #define AMD_PMC_CLASS_NAME "K8-" #else #error Unsupported architecture. 
#endif /* AMD K7 & K8 PMCs */ struct amd_descr { struct pmc_descr pm_descr; /* "base class" */ uint32_t pm_evsel; /* address of EVSEL register */ uint32_t pm_perfctr; /* address of PERFCTR register */ }; static const struct amd_descr amd_pmcdesc[AMD_NPMCS] = { { .pm_descr = { .pd_name = "TSC", .pd_class = PMC_CLASS_TSC, .pd_caps = PMC_CAP_READ, .pd_width = 64 }, .pm_evsel = MSR_TSC, .pm_perfctr = 0 /* unused */ }, { .pm_descr = { .pd_name = AMD_PMC_CLASS_NAME "0", .pd_class = AMD_PMC_CLASS, .pd_caps = AMD_PMC_CAPS, .pd_width = 48 }, .pm_evsel = AMD_PMC_EVSEL_0, .pm_perfctr = AMD_PMC_PERFCTR_0 }, { .pm_descr = { .pd_name = AMD_PMC_CLASS_NAME "1", .pd_class = AMD_PMC_CLASS, .pd_caps = AMD_PMC_CAPS, .pd_width = 48 }, .pm_evsel = AMD_PMC_EVSEL_1, .pm_perfctr = AMD_PMC_PERFCTR_1 }, { .pm_descr = { .pd_name = AMD_PMC_CLASS_NAME "2", .pd_class = AMD_PMC_CLASS, .pd_caps = AMD_PMC_CAPS, .pd_width = 48 }, .pm_evsel = AMD_PMC_EVSEL_2, .pm_perfctr = AMD_PMC_PERFCTR_2 }, { .pm_descr = { .pd_name = AMD_PMC_CLASS_NAME "3", .pd_class = AMD_PMC_CLASS, .pd_caps = AMD_PMC_CAPS, .pd_width = 48 }, .pm_evsel = AMD_PMC_EVSEL_3, .pm_perfctr = AMD_PMC_PERFCTR_3 } }; struct amd_event_code_map { enum pmc_event pe_ev; /* enum value */ uint8_t pe_code; /* encoded event mask */ uint8_t pe_mask; /* bits allowed in unit mask */ }; const struct amd_event_code_map amd_event_codes[] = { #if __i386__ { PMC_EV_K7_DC_ACCESSES, 0x40, 0 }, { PMC_EV_K7_DC_MISSES, 0x41, 0 }, { PMC_EV_K7_DC_REFILLS_FROM_L2, 0x42, K7_PMC_UNITMASK_MOESI }, { PMC_EV_K7_DC_REFILLS_FROM_SYSTEM, 0x43, K7_PMC_UNITMASK_MOESI }, { PMC_EV_K7_DC_WRITEBACKS, 0x44, K7_PMC_UNITMASK_MOESI }, { PMC_EV_K7_L1_DTLB_MISS_AND_L2_DTLB_HITS, 0x45, 0 }, { PMC_EV_K7_L1_AND_L2_DTLB_MISSES, 0x46, 0 }, { PMC_EV_K7_MISALIGNED_REFERENCES, 0x47, 0 }, { PMC_EV_K7_IC_FETCHES, 0x80, 0 }, { PMC_EV_K7_IC_MISSES, 0x81, 0 }, { PMC_EV_K7_L1_ITLB_MISSES, 0x84, 0 }, { PMC_EV_K7_L1_L2_ITLB_MISSES, 0x85, 0 }, { PMC_EV_K7_RETIRED_INSTRUCTIONS, 0xC0, 0 }, { PMC_EV_K7_RETIRED_OPS, 0xC1, 0 }, { PMC_EV_K7_RETIRED_BRANCHES, 0xC2, 0 }, { PMC_EV_K7_RETIRED_BRANCHES_MISPREDICTED, 0xC3, 0 }, { PMC_EV_K7_RETIRED_TAKEN_BRANCHES, 0xC4, 0 }, { PMC_EV_K7_RETIRED_TAKEN_BRANCHES_MISPREDICTED, 0xC5, 0 }, { PMC_EV_K7_RETIRED_FAR_CONTROL_TRANSFERS, 0xC6, 0 }, { PMC_EV_K7_RETIRED_RESYNC_BRANCHES, 0xC7, 0 }, { PMC_EV_K7_INTERRUPTS_MASKED_CYCLES, 0xCD, 0 }, { PMC_EV_K7_INTERRUPTS_MASKED_WHILE_PENDING_CYCLES, 0xCE, 0 }, { PMC_EV_K7_HARDWARE_INTERRUPTS, 0xCF, 0 } #endif #if __amd64__ { PMC_EV_K8_FP_DISPATCHED_FPU_OPS, 0x00, 0x3F }, { PMC_EV_K8_FP_CYCLES_WITH_NO_FPU_OPS_RETIRED, 0x01, 0x00 }, { PMC_EV_K8_FP_DISPATCHED_FPU_FAST_FLAG_OPS, 0x02, 0x00 }, { PMC_EV_K8_LS_SEGMENT_REGISTER_LOAD, 0x20, 0x7F }, { PMC_EV_K8_LS_MICROARCHITECTURAL_RESYNC_BY_SELF_MODIFYING_CODE, 0x21, 0x00 }, { PMC_EV_K8_LS_MICROARCHITECTURAL_RESYNC_BY_SNOOP, 0x22, 0x00 }, { PMC_EV_K8_LS_BUFFER2_FULL, 0x23, 0x00 }, { PMC_EV_K8_LS_LOCKED_OPERATION, 0x24, 0x07 }, { PMC_EV_K8_LS_MICROARCHITECTURAL_LATE_CANCEL, 0x25, 0x00 }, { PMC_EV_K8_LS_RETIRED_CFLUSH_INSTRUCTIONS, 0x26, 0x00 }, { PMC_EV_K8_LS_RETIRED_CPUID_INSTRUCTIONS, 0x27, 0x00 }, { PMC_EV_K8_DC_ACCESS, 0x40, 0x00 }, { PMC_EV_K8_DC_MISS, 0x41, 0x00 }, { PMC_EV_K8_DC_REFILL_FROM_L2, 0x42, 0x1F }, { PMC_EV_K8_DC_REFILL_FROM_SYSTEM, 0x43, 0x1F }, { PMC_EV_K8_DC_COPYBACK, 0x44, 0x1F }, { PMC_EV_K8_DC_L1_DTLB_MISS_AND_L2_DTLB_HIT, 0x45, 0x00 }, { PMC_EV_K8_DC_L1_DTLB_MISS_AND_L2_DTLB_MISS, 0x46, 0x00 }, { PMC_EV_K8_DC_MISALIGNED_DATA_REFERENCE, 0x47, 0x00 }, { 
PMC_EV_K8_DC_MICROARCHITECTURAL_LATE_CANCEL, 0x48, 0x00 }, { PMC_EV_K8_DC_MICROARCHITECTURAL_EARLY_CANCEL, 0x49, 0x00 }, { PMC_EV_K8_DC_ONE_BIT_ECC_ERROR, 0x4A, 0x03 }, { PMC_EV_K8_DC_DISPATCHED_PREFETCH_INSTRUCTIONS, 0x4B, 0x07 }, { PMC_EV_K8_DC_DCACHE_ACCESSES_BY_LOCKS, 0x4C, 0x03 }, { PMC_EV_K8_BU_CPU_CLK_UNHALTED, 0x76, 0x00 }, { PMC_EV_K8_BU_INTERNAL_L2_REQUEST, 0x7D, 0x1F }, { PMC_EV_K8_BU_FILL_REQUEST_L2_MISS, 0x7E, 0x07 }, { PMC_EV_K8_BU_FILL_INTO_L2, 0x7F, 0x03 }, { PMC_EV_K8_IC_FETCH, 0x80, 0x00 }, { PMC_EV_K8_IC_MISS, 0x81, 0x00 }, { PMC_EV_K8_IC_REFILL_FROM_L2, 0x82, 0x00 }, { PMC_EV_K8_IC_REFILL_FROM_SYSTEM, 0x83, 0x00 }, { PMC_EV_K8_IC_L1_ITLB_MISS_AND_L2_ITLB_HIT, 0x84, 0x00 }, { PMC_EV_K8_IC_L1_ITLB_MISS_AND_L2_ITLB_MISS, 0x85, 0x00 }, { PMC_EV_K8_IC_MICROARCHITECTURAL_RESYNC_BY_SNOOP, 0x86, 0x00 }, { PMC_EV_K8_IC_INSTRUCTION_FETCH_STALL, 0x87, 0x00 }, { PMC_EV_K8_IC_RETURN_STACK_HIT, 0x88, 0x00 }, { PMC_EV_K8_IC_RETURN_STACK_OVERFLOW, 0x89, 0x00 }, { PMC_EV_K8_FR_RETIRED_X86_INSTRUCTIONS, 0xC0, 0x00 }, { PMC_EV_K8_FR_RETIRED_UOPS, 0xC1, 0x00 }, { PMC_EV_K8_FR_RETIRED_BRANCHES, 0xC2, 0x00 }, { PMC_EV_K8_FR_RETIRED_BRANCHES_MISPREDICTED, 0xC3, 0x00 }, { PMC_EV_K8_FR_RETIRED_TAKEN_BRANCHES, 0xC4, 0x00 }, { PMC_EV_K8_FR_RETIRED_TAKEN_BRANCHES_MISPREDICTED, 0xC5, 0x00 }, { PMC_EV_K8_FR_RETIRED_FAR_CONTROL_TRANSFERS, 0xC6, 0x00 }, { PMC_EV_K8_FR_RETIRED_RESYNCS, 0xC7, 0x00 }, { PMC_EV_K8_FR_RETIRED_NEAR_RETURNS, 0xC8, 0x00 }, { PMC_EV_K8_FR_RETIRED_NEAR_RETURNS_MISPREDICTED, 0xC9, 0x00 }, { PMC_EV_K8_FR_RETIRED_TAKEN_BRANCHES_MISPREDICTED_BY_ADDR_MISCOMPARE, 0xCA, 0x00 }, { PMC_EV_K8_FR_RETIRED_FPU_INSTRUCTIONS, 0xCB, 0x0F }, { PMC_EV_K8_FR_RETIRED_FASTPATH_DOUBLE_OP_INSTRUCTIONS, 0xCC, 0x07 }, { PMC_EV_K8_FR_INTERRUPTS_MASKED_CYCLES, 0xCD, 0x00 }, { PMC_EV_K8_FR_INTERRUPTS_MASKED_WHILE_PENDING_CYCLES, 0xCE, 0x00 }, { PMC_EV_K8_FR_TAKEN_HARDWARE_INTERRUPTS, 0xCF, 0x00 }, { PMC_EV_K8_FR_DECODER_EMPTY, 0xD0, 0x00 }, { PMC_EV_K8_FR_DISPATCH_STALLS, 0xD1, 0x00 }, { PMC_EV_K8_FR_DISPATCH_STALL_FROM_BRANCH_ABORT_TO_RETIRE, 0xD2, 0x00 }, { PMC_EV_K8_FR_DISPATCH_STALL_FOR_SERIALIZATION, 0xD3, 0x00 }, { PMC_EV_K8_FR_DISPATCH_STALL_FOR_SEGMENT_LOAD, 0xD4, 0x00 }, { PMC_EV_K8_FR_DISPATCH_STALL_WHEN_REORDER_BUFFER_IS_FULL, 0xD5, 0x00 }, { PMC_EV_K8_FR_DISPATCH_STALL_WHEN_RESERVATION_STATIONS_ARE_FULL, 0xD6, 0x00 }, { PMC_EV_K8_FR_DISPATCH_STALL_WHEN_FPU_IS_FULL, 0xD7, 0x00 }, { PMC_EV_K8_FR_DISPATCH_STALL_WHEN_LS_IS_FULL, 0xD8, 0x00 }, { PMC_EV_K8_FR_DISPATCH_STALL_WHEN_WAITING_FOR_ALL_TO_BE_QUIET, 0xD9, 0x00 }, { PMC_EV_K8_FR_DISPATCH_STALL_WHEN_FAR_XFER_OR_RESYNC_BRANCH_PENDING, 0xDA, 0x00 }, { PMC_EV_K8_FR_FPU_EXCEPTIONS, 0xDB, 0x0F }, { PMC_EV_K8_FR_NUMBER_OF_BREAKPOINTS_FOR_DR0, 0xDC, 0x00 }, { PMC_EV_K8_FR_NUMBER_OF_BREAKPOINTS_FOR_DR1, 0xDD, 0x00 }, { PMC_EV_K8_FR_NUMBER_OF_BREAKPOINTS_FOR_DR2, 0xDE, 0x00 }, { PMC_EV_K8_FR_NUMBER_OF_BREAKPOINTS_FOR_DR3, 0xDF, 0x00 }, { PMC_EV_K8_NB_MEMORY_CONTROLLER_PAGE_ACCESS_EVENT, 0xE0, 0x7 }, { PMC_EV_K8_NB_MEMORY_CONTROLLER_PAGE_TABLE_OVERFLOW, 0xE1, 0x00 }, { PMC_EV_K8_NB_MEMORY_CONTROLLER_DRAM_COMMAND_SLOTS_MISSED, 0xE2, 0x00 }, { PMC_EV_K8_NB_MEMORY_CONTROLLER_TURNAROUND, 0xE3, 0x07 }, { PMC_EV_K8_NB_MEMORY_CONTROLLER_BYPASS_SATURATION, 0xE4, 0x0F }, { PMC_EV_K8_NB_SIZED_COMMANDS, 0xEB, 0x7F }, { PMC_EV_K8_NB_PROBE_RESULT, 0xEC, 0x0F }, { PMC_EV_K8_NB_HT_BUS0_BANDWIDTH, 0xF6, 0x0F }, { PMC_EV_K8_NB_HT_BUS1_BANDWIDTH, 0xF7, 0x0F }, { PMC_EV_K8_NB_HT_BUS2_BANDWIDTH, 0xF8, 0x0F } #endif }; const int amd_event_codes_size = 
sizeof(amd_event_codes) / sizeof(amd_event_codes[0]); /* * read a pmc register */ static int amd_read_pmc(int cpu, int ri, pmc_value_t *v) { enum pmc_mode mode; const struct amd_descr *pd; struct pmc *pm; const struct pmc_hw *phw; pmc_value_t tmp; KASSERT(cpu >= 0 && cpu < mp_ncpus, ("[amd,%d] illegal CPU value %d", __LINE__, cpu)); KASSERT(ri >= 0 && ri < AMD_NPMCS, ("[amd,%d] illegal row-index %d", __LINE__, ri)); phw = pmc_pcpu[cpu]->pc_hwpmcs[ri]; pd = &amd_pmcdesc[ri]; pm = phw->phw_pmc; KASSERT(pm != NULL, ("[amd,%d] No owner for HWPMC [cpu%d,pmc%d]", __LINE__, cpu, ri)); mode = PMC_TO_MODE(pm); PMCDBG(MDP,REA,1,"amd-read id=%d class=%d", ri, pd->pm_descr.pd_class); /* Reading the TSC is a special case */ if (pd->pm_descr.pd_class == PMC_CLASS_TSC) { KASSERT(PMC_IS_COUNTING_MODE(mode), ("[amd,%d] TSC counter in non-counting mode", __LINE__)); *v = rdtsc(); PMCDBG(MDP,REA,2,"amd-read id=%d -> %jd", ri, *v); return 0; } KASSERT(pd->pm_descr.pd_class == AMD_PMC_CLASS, ("[amd,%d] unknown PMC class (%d)", __LINE__, pd->pm_descr.pd_class)); tmp = rdmsr(pd->pm_perfctr); /* RDMSR serializes */ if (PMC_IS_SAMPLING_MODE(mode)) *v = -tmp; else *v = tmp; PMCDBG(MDP,REA,2,"amd-read id=%d -> %jd", ri, *v); return 0; } /* * Write a PMC MSR. */ static int amd_write_pmc(int cpu, int ri, pmc_value_t v) { const struct amd_descr *pd; struct pmc *pm; const struct pmc_hw *phw; enum pmc_mode mode; KASSERT(cpu >= 0 && cpu < mp_ncpus, ("[amd,%d] illegal CPU value %d", __LINE__, cpu)); KASSERT(ri >= 0 && ri < AMD_NPMCS, ("[amd,%d] illegal row-index %d", __LINE__, ri)); phw = pmc_pcpu[cpu]->pc_hwpmcs[ri]; pd = &amd_pmcdesc[ri]; pm = phw->phw_pmc; KASSERT(pm != NULL, ("[amd,%d] PMC not owned (cpu%d,pmc%d)", __LINE__, cpu, ri)); mode = PMC_TO_MODE(pm); if (pd->pm_descr.pd_class == PMC_CLASS_TSC) return 0; KASSERT(pd->pm_descr.pd_class == AMD_PMC_CLASS, ("[amd,%d] unknown PMC class (%d)", __LINE__, pd->pm_descr.pd_class)); /* use 2's complement of the count for sampling mode PMCs */ if (PMC_IS_SAMPLING_MODE(mode)) v = -v; PMCDBG(MDP,WRI,1,"amd-write cpu=%d ri=%d v=%jx", cpu, ri, v); /* write the PMC value */ wrmsr(pd->pm_perfctr, v); return 0; } /* * configure hardware pmc according to the configuration recorded in * pmc 'pm'. */ static int amd_config_pmc(int cpu, int ri, struct pmc *pm) { struct pmc_hw *phw; PMCDBG(MDP,CFG,1, "cpu=%d ri=%d pm=%p", cpu, ri, pm); KASSERT(cpu >= 0 && cpu < mp_ncpus, ("[amd,%d] illegal CPU value %d", __LINE__, cpu)); KASSERT(ri >= 0 && ri < AMD_NPMCS, ("[amd,%d] illegal row-index %d", __LINE__, ri)); phw = pmc_pcpu[cpu]->pc_hwpmcs[ri]; KASSERT(pm == NULL || phw->phw_pmc == NULL, ("[amd,%d] pm=%p phw->pm=%p hwpmc not unconfigured", __LINE__, pm, phw->phw_pmc)); phw->phw_pmc = pm; return 0; } /* * Retrieve a configured PMC pointer from hardware state. */ static int amd_get_config(int cpu, int ri, struct pmc **ppm) { *ppm = pmc_pcpu[cpu]->pc_hwpmcs[ri]->phw_pmc; return 0; } /* * Machine dependent actions taken during the context switch in of a * thread. */ static int amd_switch_in(struct pmc_cpu *pc, struct pmc_process *pp) { (void) pc; PMCDBG(MDP,SWI,1, "pc=%p pp=%p enable-msr=%d", pc, pp, (pp->pp_flags & PMC_PP_ENABLE_MSR_ACCESS) != 0); /* enable the RDPMC instruction if needed */ if (pp->pp_flags & PMC_PP_ENABLE_MSR_ACCESS) load_cr4(rcr4() | CR4_PCE); return 0; } /* * Machine dependent actions taken during the context switch out of a * thread. 
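 *
 * amd_switch_in() above and amd_switch_out() below implement a simple policy
 * for the RDPMC instruction: set CR4.PCE on switch-in only if the incoming
 * process asked for MSR access, and always clear it on switch-out.  A
 * user-space sketch of that policy; fake_cr4 stands in for the real control
 * register (the driver uses load_cr4(rcr4() | CR4_PCE)) and the flag value
 * shown is illustrative:
 *
 *      #include <stdio.h>
 *
 *      #define CR4_PCE                         0x00000100 // RDPMC at CPL 3
 *      #define PMC_PP_ENABLE_MSR_ACCESS        0x00000001
 *
 *      static unsigned fake_cr4;               // stands in for %cr4
 *
 *      static void
 *      switch_in(unsigned pp_flags)
 *      {
 *              if (pp_flags & PMC_PP_ENABLE_MSR_ACCESS)
 *                      fake_cr4 |= CR4_PCE;    // this process may use RDPMC
 *      }
 *
 *      static void
 *      switch_out(void)
 *      {
 *              fake_cr4 &= ~CR4_PCE;           // always revoke on the way out
 *      }
 *
 *      int
 *      main(void)
 *      {
 *              switch_in(PMC_PP_ENABLE_MSR_ACCESS);
 *              printf("in:  PCE=%d\n", !!(fake_cr4 & CR4_PCE));
 *              switch_out();
 *              printf("out: PCE=%d\n", !!(fake_cr4 & CR4_PCE));
 *              return (0);
 *      }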
*/ static int amd_switch_out(struct pmc_cpu *pc, struct pmc_process *pp) { (void) pc; (void) pp; /* can be NULL */ PMCDBG(MDP,SWO,1, "pc=%p pp=%p enable-msr=%d", pc, pp, pp ? (pp->pp_flags & PMC_PP_ENABLE_MSR_ACCESS) == 1 : 0); /* always turn off the RDPMC instruction */ load_cr4(rcr4() & ~CR4_PCE); return 0; } /* * Check if a given allocation is feasible. */ static int amd_allocate_pmc(int cpu, int ri, struct pmc *pm, const struct pmc_op_pmcallocate *a) { int i; uint32_t allowed_unitmask, caps, config, unitmask; enum pmc_event pe; const struct pmc_descr *pd; (void) cpu; KASSERT(cpu >= 0 && cpu < mp_ncpus, ("[amd,%d] illegal CPU value %d", __LINE__, cpu)); KASSERT(ri >= 0 && ri < AMD_NPMCS, ("[amd,%d] illegal row index %d", __LINE__, ri)); pd = &amd_pmcdesc[ri].pm_descr; /* check class match */ if (pd->pd_class != a->pm_class) return EINVAL; caps = pm->pm_caps; PMCDBG(MDP,ALL,1,"amd-allocate ri=%d caps=0x%x", ri, caps); if ((pd->pd_caps & caps) != caps) return EPERM; if (pd->pd_class == PMC_CLASS_TSC) { /* TSC's are always allocated in system-wide counting mode */ if (a->pm_ev != PMC_EV_TSC_TSC || a->pm_mode != PMC_MODE_SC) return EINVAL; return 0; } KASSERT(pd->pd_class == AMD_PMC_CLASS, ("[amd,%d] Unknown PMC class (%d)", __LINE__, pd->pd_class)); pe = a->pm_ev; /* map ev to the correct event mask code */ config = allowed_unitmask = 0; for (i = 0; i < amd_event_codes_size; i++) if (amd_event_codes[i].pe_ev == pe) { config = AMD_PMC_TO_EVENTMASK(amd_event_codes[i].pe_code); allowed_unitmask = AMD_PMC_TO_UNITMASK(amd_event_codes[i].pe_mask); break; } if (i == amd_event_codes_size) return EINVAL; unitmask = a->pm_amd_config & AMD_PMC_UNITMASK; if (unitmask & ~allowed_unitmask) /* disallow reserved bits */ return EINVAL; if (unitmask && (caps & PMC_CAP_QUALIFIER)) config |= unitmask; if (caps & PMC_CAP_THRESHOLD) config |= a->pm_amd_config & AMD_PMC_COUNTERMASK; /* set at least one of the 'usr' or 'os' caps */ if (caps & PMC_CAP_USER) config |= AMD_PMC_USR; if (caps & PMC_CAP_SYSTEM) config |= AMD_PMC_OS; if ((caps & (PMC_CAP_USER|PMC_CAP_SYSTEM)) == 0) config |= (AMD_PMC_USR|AMD_PMC_OS); if (caps & PMC_CAP_EDGE) config |= AMD_PMC_EDGE; if (caps & PMC_CAP_INVERT) config |= AMD_PMC_INVERT; if (caps & PMC_CAP_INTERRUPT) config |= AMD_PMC_INT; pm->pm_md.pm_amd.pm_amd_evsel = config; /* save config value */ PMCDBG(MDP,ALL,2,"amd-allocate ri=%d -> config=0x%x", ri, config); return 0; } /* * Release machine dependent state associated with a PMC. This is a * no-op on this architecture. * */ /* ARGSUSED0 */ static int amd_release_pmc(int cpu, int ri, struct pmc *pmc) { #if DEBUG const struct amd_descr *pd; #endif struct pmc_hw *phw; (void) pmc; KASSERT(cpu >= 0 && cpu < mp_ncpus, ("[amd,%d] illegal CPU value %d", __LINE__, cpu)); KASSERT(ri >= 0 && ri < AMD_NPMCS, ("[amd,%d] illegal row-index %d", __LINE__, ri)); phw = pmc_pcpu[cpu]->pc_hwpmcs[ri]; KASSERT(phw->phw_pmc == NULL, ("[amd,%d] PHW pmc %p non-NULL", __LINE__, phw->phw_pmc)); #if DEBUG pd = &amd_pmcdesc[ri]; if (pd->pm_descr.pd_class == AMD_PMC_CLASS) KASSERT(AMD_PMC_IS_STOPPED(pd->pm_evsel), ("[amd,%d] PMC %d released while active", __LINE__, ri)); #endif return 0; } /* * start a PMC. 
*/ static int amd_start_pmc(int cpu, int ri) { uint32_t config; struct pmc *pm; struct pmc_hw *phw; const struct amd_descr *pd; KASSERT(cpu >= 0 && cpu < mp_ncpus, ("[amd,%d] illegal CPU value %d", __LINE__, cpu)); KASSERT(ri >= 0 && ri < AMD_NPMCS, ("[amd,%d] illegal row-index %d", __LINE__, ri)); phw = pmc_pcpu[cpu]->pc_hwpmcs[ri]; pm = phw->phw_pmc; pd = &amd_pmcdesc[ri]; KASSERT(pm != NULL, ("[amd,%d] starting cpu%d,pmc%d with null pmc record", __LINE__, cpu, ri)); PMCDBG(MDP,STA,1,"amd-start cpu=%d ri=%d", cpu, ri); if (pd->pm_descr.pd_class == PMC_CLASS_TSC) return 0; /* TSCs are always running */ KASSERT(pd->pm_descr.pd_class == AMD_PMC_CLASS, ("[amd,%d] unknown PMC class (%d)", __LINE__, pd->pm_descr.pd_class)); KASSERT(AMD_PMC_IS_STOPPED(pd->pm_evsel), ("[amd,%d] pmc%d,cpu%d: Starting active PMC \"%s\"", __LINE__, ri, cpu, pd->pm_descr.pd_name)); /* turn on the PMC ENABLE bit */ config = pm->pm_md.pm_amd.pm_amd_evsel | AMD_PMC_ENABLE; PMCDBG(MDP,STA,2,"amd-start config=0x%x", config); wrmsr(pd->pm_evsel, config); return 0; } /* * Stop a PMC. */ static int amd_stop_pmc(int cpu, int ri) { struct pmc *pm; struct pmc_hw *phw; const struct amd_descr *pd; uint64_t config; KASSERT(cpu >= 0 && cpu < mp_ncpus, ("[amd,%d] illegal CPU value %d", __LINE__, cpu)); KASSERT(ri >= 0 && ri < AMD_NPMCS, ("[amd,%d] illegal row-index %d", __LINE__, ri)); phw = pmc_pcpu[cpu]->pc_hwpmcs[ri]; pm = phw->phw_pmc; pd = &amd_pmcdesc[ri]; KASSERT(pm != NULL, ("[amd,%d] cpu%d,pmc%d no PMC to stop", __LINE__, cpu, ri)); /* can't stop a TSC */ if (pd->pm_descr.pd_class == PMC_CLASS_TSC) return 0; KASSERT(pd->pm_descr.pd_class == AMD_PMC_CLASS, ("[amd,%d] unknown PMC class (%d)", __LINE__, pd->pm_descr.pd_class)); KASSERT(!AMD_PMC_IS_STOPPED(pd->pm_evsel), ("[amd,%d] PMC%d, CPU%d \"%s\" already stopped", __LINE__, ri, cpu, pd->pm_descr.pd_name)); PMCDBG(MDP,STO,1,"amd-stop ri=%d", ri); /* turn off the PMC ENABLE bit */ config = pm->pm_md.pm_amd.pm_amd_evsel & ~AMD_PMC_ENABLE; wrmsr(pd->pm_evsel, config); return 0; } /* * Interrupt handler. This function needs to return '1' if the * interrupt was this CPU's PMCs or '0' otherwise. It is not allowed * to sleep or do anything a 'fast' interrupt handler is not allowed * to do. */ static int -amd_intr(int cpu, uintptr_t eip) +amd_intr(int cpu, uintptr_t eip, int usermode) { int i, retval; enum pmc_mode mode; uint32_t perfctr; struct pmc *pm; struct pmc_cpu *pc; struct pmc_hw *phw; + (void) usermode; + KASSERT(cpu >= 0 && cpu < mp_ncpus, ("[amd,%d] out of range CPU %d", __LINE__, cpu)); retval = 0; pc = pmc_pcpu[cpu]; /* * look for all PMCs that have interrupted: * - skip over the TSC [PMC#0] * - look for a PMC with a valid 'struct pmc' association * - look for a PMC in (a) sampling mode and (b) which has * overflowed. If found, we update the process's * histogram or send it a profiling signal by calling * the appropriate helper function. 
*/ for (i = 1; i < AMD_NPMCS; i++) { phw = pc->pc_hwpmcs[i]; perfctr = amd_pmcdesc[i].pm_perfctr; KASSERT(phw != NULL, ("[amd,%d] null PHW pointer", __LINE__)); if ((pm = phw->phw_pmc) == NULL || pm->pm_state != PMC_STATE_RUNNING) { atomic_add_int(&pmc_stats.pm_intr_ignored, 1); continue; } mode = PMC_TO_MODE(pm); if (PMC_IS_SAMPLING_MODE(mode) && AMD_PMC_HAS_OVERFLOWED(perfctr)) { atomic_add_int(&pmc_stats.pm_intr_processed, 1); if (PMC_IS_SYSTEM_MODE(mode)) pmc_update_histogram(phw, eip); else if (PMC_IS_VIRTUAL_MODE(mode)) pmc_send_signal(pm); retval = 1; } } return retval; } /* * describe a PMC */ static int amd_describe(int cpu, int ri, struct pmc_info *pi, struct pmc **ppmc) { int error; size_t copied; const struct amd_descr *pd; struct pmc_hw *phw; KASSERT(cpu >= 0 && cpu < mp_ncpus, ("[amd,%d] illegal CPU %d", __LINE__, cpu)); KASSERT(ri >= 0 && ri < AMD_NPMCS, ("[amd,%d] row-index %d out of range", __LINE__, ri)); phw = pmc_pcpu[cpu]->pc_hwpmcs[ri]; pd = &amd_pmcdesc[ri]; if ((error = copystr(pd->pm_descr.pd_name, pi->pm_name, PMC_NAME_MAX, &copied)) != 0) return error; pi->pm_class = pd->pm_descr.pd_class; if (phw->phw_state & PMC_PHW_FLAG_IS_ENABLED) { pi->pm_enabled = TRUE; *ppmc = phw->phw_pmc; } else { pi->pm_enabled = FALSE; *ppmc = NULL; } return 0; } /* * i386 specific entry points */ /* * return the MSR address of the given PMC. */ static int amd_get_msr(int ri, uint32_t *msr) { KASSERT(ri >= 0 && ri < AMD_NPMCS, ("[amd,%d] ri %d out of range", __LINE__, ri)); *msr = amd_pmcdesc[ri].pm_perfctr - AMD_PMC_PERFCTR_0; return 0; } /* * processor dependent initialization. */ /* * Per-processor data structure * * [common stuff] * [5 struct pmc_hw pointers] * [5 struct pmc_hw structures] */ struct amd_cpu { struct pmc_cpu pc_common; struct pmc_hw *pc_hwpmcs[AMD_NPMCS]; struct pmc_hw pc_amdpmcs[AMD_NPMCS]; }; static int amd_init(int cpu) { int n; struct amd_cpu *pcs; struct pmc_hw *phw; KASSERT(cpu >= 0 && cpu < mp_ncpus, ("[amd,%d] insane cpu number %d", __LINE__, cpu)); PMCDBG(MDP,INI,1,"amd-init cpu=%d", cpu); MALLOC(pcs, struct amd_cpu *, sizeof(struct amd_cpu), M_PMC, M_WAITOK|M_ZERO); if (pcs == NULL) return ENOMEM; phw = &pcs->pc_amdpmcs[0]; /* * Initialize the per-cpu mutex and set the content of the * hardware descriptors to a known state. */ for (n = 0; n < AMD_NPMCS; n++, phw++) { phw->phw_state = PMC_PHW_FLAG_IS_ENABLED | PMC_PHW_CPU_TO_STATE(cpu) | PMC_PHW_INDEX_TO_STATE(n); phw->phw_pmc = NULL; pcs->pc_hwpmcs[n] = phw; } /* Mark the TSC as shareable */ pcs->pc_hwpmcs[0]->phw_state |= PMC_PHW_FLAG_IS_SHAREABLE; pmc_pcpu[cpu] = (struct pmc_cpu *) pcs; return 0; } /* * processor dependent cleanup prior to the KLD * being unloaded */ static int amd_cleanup(int cpu) { int i; uint32_t evsel; struct pmc_cpu *pcs; KASSERT(cpu >= 0 && cpu < mp_ncpus, ("[amd,%d] insane cpu number (%d)", __LINE__, cpu)); PMCDBG(MDP,INI,1,"amd-cleanup cpu=%d", cpu); /* * First, turn off all PMCs on this CPU. */ for (i = 0; i < 4; i++) { /* XXX this loop is now not needed */ evsel = rdmsr(AMD_PMC_EVSEL_0 + i); evsel &= ~AMD_PMC_ENABLE; wrmsr(AMD_PMC_EVSEL_0 + i, evsel); } /* * Next, free up allocated space. 
*/ pcs = pmc_pcpu[cpu]; #if DEBUG /* check the TSC */ KASSERT(pcs->pc_hwpmcs[0]->phw_pmc == NULL, ("[amd,%d] CPU%d,PMC0 still in use", __LINE__, cpu)); for (i = 1; i < AMD_NPMCS; i++) { KASSERT(pcs->pc_hwpmcs[i]->phw_pmc == NULL, ("[amd,%d] CPU%d/PMC%d in use", __LINE__, cpu, i)); KASSERT(AMD_PMC_IS_STOPPED(AMD_PMC_EVSEL_0 + (i-1)), ("[amd,%d] CPU%d/PMC%d not stopped", __LINE__, cpu, i)); } #endif KASSERT(pcs != NULL, ("[amd,%d] null per-cpu state pointer (cpu%d)", __LINE__, cpu)); pmc_pcpu[cpu] = NULL; FREE(pcs, M_PMC); return 0; } /* * Initialize ourselves. */ struct pmc_mdep * pmc_amd_initialize(void) { struct pmc_mdep *pmc_mdep; /* The presence of hardware performance counters on the AMD Athlon, Duron or later processors, is _not_ indicated by any of the processor feature flags set by the 'CPUID' instruction, so we only check the 'instruction family' field returned by CPUID for instruction family >= 6. This test needs to be be refined. */ if ((cpu_id & 0xF00) < 0x600) return NULL; MALLOC(pmc_mdep, struct pmc_mdep *, sizeof(struct pmc_mdep), M_PMC, M_WAITOK|M_ZERO); #if __i386__ pmc_mdep->pmd_cputype = PMC_CPU_AMD_K7; #elif __amd64__ pmc_mdep->pmd_cputype = PMC_CPU_AMD_K8; #else #error Unknown AMD CPU type. #endif pmc_mdep->pmd_npmc = AMD_NPMCS; /* this processor has two classes of usable PMCs */ pmc_mdep->pmd_nclass = 2; /* TSC */ pmc_mdep->pmd_classes[0].pm_class = PMC_CLASS_TSC; pmc_mdep->pmd_classes[0].pm_caps = PMC_CAP_READ; pmc_mdep->pmd_classes[0].pm_width = 64; /* AMD K7/K8 PMCs */ pmc_mdep->pmd_classes[1].pm_class = AMD_PMC_CLASS; pmc_mdep->pmd_classes[1].pm_caps = AMD_PMC_CAPS; pmc_mdep->pmd_classes[1].pm_width = 48; pmc_mdep->pmd_nclasspmcs[0] = 1; pmc_mdep->pmd_nclasspmcs[1] = (AMD_NPMCS-1); pmc_mdep->pmd_init = amd_init; pmc_mdep->pmd_cleanup = amd_cleanup; pmc_mdep->pmd_switch_in = amd_switch_in; pmc_mdep->pmd_switch_out = amd_switch_out; pmc_mdep->pmd_read_pmc = amd_read_pmc; pmc_mdep->pmd_write_pmc = amd_write_pmc; pmc_mdep->pmd_config_pmc = amd_config_pmc; pmc_mdep->pmd_get_config = amd_get_config; pmc_mdep->pmd_allocate_pmc = amd_allocate_pmc; pmc_mdep->pmd_release_pmc = amd_release_pmc; pmc_mdep->pmd_start_pmc = amd_start_pmc; pmc_mdep->pmd_stop_pmc = amd_stop_pmc; pmc_mdep->pmd_intr = amd_intr; pmc_mdep->pmd_describe = amd_describe; pmc_mdep->pmd_get_msr = amd_get_msr; /* i386 */ PMCDBG(MDP,INI,0,"%s","amd-initialize"); return pmc_mdep; } diff --git a/sys/dev/hwpmc/hwpmc_piv.c b/sys/dev/hwpmc/hwpmc_piv.c index e81e4e45012e..c9ade03edd05 100644 --- a/sys/dev/hwpmc/hwpmc_piv.c +++ b/sys/dev/hwpmc/hwpmc_piv.c @@ -1,1665 +1,1666 @@ /*- * Copyright (c) 2003-2005 Joseph Koshy * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include /* * PENTIUM 4 SUPPORT * * The P4 has 18 PMCs, divided into 4 groups with 4,4,4 and 6 PMCs * respectively. Each PMC comprises of two model specific registers: * a counter configuration control register (CCCR) and a counter * register that holds the actual event counts. * * Configuring an event requires the use of one of 45 event selection * control registers (ESCR). Events are associated with specific * ESCRs. Each PMC group has a set of ESCRs it can use. * * - The BPU counter group (4 PMCs) can use the 16 ESCRs: * BPU_ESCR{0,1}, IS_ESCR{0,1}, MOB_ESCR{0,1}, ITLB_ESCR{0,1}, * PMH_ESCR{0,1}, IX_ESCR{0,1}, FSB_ESCR{0,}, BSU_ESCR{0,1}. * * - The MS counter group (4 PMCs) can use the 6 ESCRs: MS_ESCR{0,1}, * TC_ESCR{0,1}, TBPU_ESCR{0,1}. * * - The FLAME counter group (4 PMCs) can use the 10 ESCRs: * FLAME_ESCR{0,1}, FIRM_ESCR{0,1}, SAAT_ESCR{0,1}, U2L_ESCR{0,1}, * DAC_ESCR{0,1}. * * - The IQ counter group (6 PMCs) can use the 13 ESCRs: IQ_ESCR{0,1}, * ALF_ESCR{0,1}, RAT_ESCR{0,1}, SSU_ESCR0, CRU_ESCR{0,1,2,3,4,5}. * * Even-numbered ESCRs can be used with counters 0, 1 and 4 (if * present) of a counter group. Odd-numbers ESCRs can be used with * counters 2, 3 and 5 (if present) of a counter group. The * 'p4_escrs[]' table describes these restrictions in a form that * function 'p4_allocate()' uses for making allocation decisions. * * SYSTEM-MODE AND THREAD-MODE ALLOCATION * * In addition to remembering the state of PMC rows * ('FREE','STANDALONE', or 'THREAD'), we similar need to track the * state of ESCR rows. If an ESCR is allocated to a system-mode PMC * on a CPU we cannot allocate this to a thread-mode PMC. On a * multi-cpu (multiple physical CPUs) system, ESCR allocation on each * CPU is tracked by the pc_escrs[] array. * * Each system-mode PMC that is using an ESCR records its row-index in * the appropriate entry and system-mode allocation attempts check * that an ESCR is available using this array. Process-mode PMCs do * not use the pc_escrs[] array, since ESCR row itself would have been * marked as in 'THREAD' mode. * * HYPERTHREADING SUPPORT * * When HTT is enabled, the FreeBSD kernel treats the two 'logical' * cpus as independent CPUs and can schedule kernel threads on them * independently. However, the two logical CPUs share the same set of * PMC resources. We need to ensure that: * - PMCs that use the PMC_F_DESCENDANTS semantics are handled correctly, * and, * - Threads of multi-threaded processes that get scheduled on the same * physical CPU are handled correctly. * * HTT Detection * * Not all HTT capable systems will have HTT enabled since users may * have turned HTT support off using the appropriate sysctls * (machdep.hlt_logical_cpus or machdep.logical_cpus_mask). We detect * the presence of HTT by remembering if 'p4_init()' was called for a * logical CPU. Note that hwpmc(4) cannot deal with a change in HTT * status once it is loaded. 
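 *
 * The runcount scheme described in the rest of this comment boils down to:
 * program and start the hardware only when the per-physical-CPU count goes
 * from 0 to 1, and stop and save it only when the count returns to 0.  A
 * stripped-down sketch of that rule (hypothetical names, no locking and no
 * saved-value adjustment):
 *
 *      #include <stdio.h>
 *
 *      static int runcount;    // PMC-using threads on this physical CPU
 *      static int hw_running;
 *
 *      static void
 *      thread_start(void)
 *      {
 *              if (runcount++ == 0)
 *                      hw_running = 1; // first user: program + start hardware
 *      }
 *
 *      static void
 *      thread_stop(void)
 *      {
 *              if (--runcount == 0)
 *                      hw_running = 0; // last user: stop + save the count
 *      }
 *
 *      int
 *      main(void)
 *      {
 *              thread_start();         // logical CPU x
 *              thread_start();         // logical CPU y, overlapping (case 2)
 *              thread_stop();
 *              printf("rc=%d running=%d\n", runcount, hw_running);
 *              thread_stop();
 *              printf("rc=%d running=%d\n", runcount, hw_running);
 *              return (0);
 *      }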
* * Handling HTT READ / WRITE / START / STOP * * PMC resources are shared across multiple logical CPUs. In each * physical CPU's state we keep track of a 'runcount' which reflects * the number of PMC-using processes that have been scheduled on the * logical CPUs of this physical CPU. Process-mode PMC operations * will actually 'start' or 'stop' hardware only if these are the * first or last processes respectively to use the hardware. PMC * values written by a 'write' operation are saved and are transferred * to hardware at PMC 'start' time if the runcount is 0. If the * runcount is greater than 0 at the time of a 'start' operation, we * keep track of the actual hardware value at the time of the 'start' * operation and use this to adjust the final readings at PMC 'stop' * or 'read' time. * * Execution sequences: * * Case 1: CPUx +...- (no overlap) * CPUy +...- * RC 0 1 0 1 0 * * Case 2: CPUx +........- (partial overlap) * CPUy +........- * RC 0 1 2 1 0 * * Case 3: CPUx +..............- (fully overlapped) * CPUy +.....- * RC 0 1 2 1 0 * * Here CPUx and CPUy are one of the two logical processors on a HTT CPU. * * Handling HTT CONFIG * * Different processes attached to the same PMC may get scheduled on * the two logical processors in the package. We keep track of config * and de-config operations using the CFGFLAGS fields of the per-physical * cpu state. */ #define P4_PMCS() \ P4_PMC(BPU_COUNTER0) \ P4_PMC(BPU_COUNTER1) \ P4_PMC(BPU_COUNTER2) \ P4_PMC(BPU_COUNTER3) \ P4_PMC(MS_COUNTER0) \ P4_PMC(MS_COUNTER1) \ P4_PMC(MS_COUNTER2) \ P4_PMC(MS_COUNTER3) \ P4_PMC(FLAME_COUNTER0) \ P4_PMC(FLAME_COUNTER1) \ P4_PMC(FLAME_COUNTER2) \ P4_PMC(FLAME_COUNTER3) \ P4_PMC(IQ_COUNTER0) \ P4_PMC(IQ_COUNTER1) \ P4_PMC(IQ_COUNTER2) \ P4_PMC(IQ_COUNTER3) \ P4_PMC(IQ_COUNTER4) \ P4_PMC(IQ_COUNTER5) \ P4_PMC(NONE) enum pmc_p4pmc { #undef P4_PMC #define P4_PMC(N) P4_PMC_##N , P4_PMCS() }; /* * P4 ESCR descriptors */ #define P4_ESCRS() \ P4_ESCR(BSU_ESCR0, 0x3A0, BPU_COUNTER0, BPU_COUNTER1, NONE) \ P4_ESCR(BSU_ESCR1, 0x3A1, BPU_COUNTER2, BPU_COUNTER3, NONE) \ P4_ESCR(FSB_ESCR0, 0x3A2, BPU_COUNTER0, BPU_COUNTER1, NONE) \ P4_ESCR(FSB_ESCR1, 0x3A3, BPU_COUNTER2, BPU_COUNTER3, NONE) \ P4_ESCR(FIRM_ESCR0, 0x3A4, FLAME_COUNTER0, FLAME_COUNTER1, NONE) \ P4_ESCR(FIRM_ESCR1, 0x3A5, FLAME_COUNTER2, FLAME_COUNTER3, NONE) \ P4_ESCR(FLAME_ESCR0, 0x3A6, FLAME_COUNTER0, FLAME_COUNTER1, NONE) \ P4_ESCR(FLAME_ESCR1, 0x3A7, FLAME_COUNTER2, FLAME_COUNTER3, NONE) \ P4_ESCR(DAC_ESCR0, 0x3A8, FLAME_COUNTER0, FLAME_COUNTER1, NONE) \ P4_ESCR(DAC_ESCR1, 0x3A9, FLAME_COUNTER2, FLAME_COUNTER3, NONE) \ P4_ESCR(MOB_ESCR0, 0x3AA, BPU_COUNTER0, BPU_COUNTER1, NONE) \ P4_ESCR(MOB_ESCR1, 0x3AB, BPU_COUNTER2, BPU_COUNTER3, NONE) \ P4_ESCR(PMH_ESCR0, 0x3AC, BPU_COUNTER0, BPU_COUNTER1, NONE) \ P4_ESCR(PMH_ESCR1, 0x3AD, BPU_COUNTER2, BPU_COUNTER3, NONE) \ P4_ESCR(SAAT_ESCR0, 0x3AE, FLAME_COUNTER0, FLAME_COUNTER1, NONE) \ P4_ESCR(SAAT_ESCR1, 0x3AF, FLAME_COUNTER2, FLAME_COUNTER3, NONE) \ P4_ESCR(U2L_ESCR0, 0x3B0, FLAME_COUNTER0, FLAME_COUNTER1, NONE) \ P4_ESCR(U2L_ESCR1, 0x3B1, FLAME_COUNTER2, FLAME_COUNTER3, NONE) \ P4_ESCR(BPU_ESCR0, 0x3B2, BPU_COUNTER0, BPU_COUNTER1, NONE) \ P4_ESCR(BPU_ESCR1, 0x3B3, BPU_COUNTER2, BPU_COUNTER3, NONE) \ P4_ESCR(IS_ESCR0, 0x3B4, BPU_COUNTER0, BPU_COUNTER1, NONE) \ P4_ESCR(IS_ESCR1, 0x3B5, BPU_COUNTER2, BPU_COUNTER3, NONE) \ P4_ESCR(ITLB_ESCR0, 0x3B6, BPU_COUNTER0, BPU_COUNTER1, NONE) \ P4_ESCR(ITLB_ESCR1, 0x3B7, BPU_COUNTER2, BPU_COUNTER3, NONE) \ P4_ESCR(CRU_ESCR0, 0x3B8, IQ_COUNTER0, IQ_COUNTER1, IQ_COUNTER4) \ 
P4_ESCR(CRU_ESCR1, 0x3B9, IQ_COUNTER2, IQ_COUNTER3, IQ_COUNTER5) \ P4_ESCR(IQ_ESCR0, 0x3BA, IQ_COUNTER0, IQ_COUNTER1, IQ_COUNTER4) \ P4_ESCR(IQ_ESCR1, 0x3BB, IQ_COUNTER1, IQ_COUNTER3, IQ_COUNTER5) \ P4_ESCR(RAT_ESCR0, 0x3BC, IQ_COUNTER0, IQ_COUNTER1, IQ_COUNTER4) \ P4_ESCR(RAT_ESCR1, 0x3BD, IQ_COUNTER2, IQ_COUNTER3, IQ_COUNTER5) \ P4_ESCR(SSU_ESCR0, 0x3BE, IQ_COUNTER0, IQ_COUNTER2, IQ_COUNTER4) \ P4_ESCR(MS_ESCR0, 0x3C0, MS_COUNTER0, MS_COUNTER1, NONE) \ P4_ESCR(MS_ESCR1, 0x3C1, MS_COUNTER2, MS_COUNTER3, NONE) \ P4_ESCR(TBPU_ESCR0, 0x3C2, MS_COUNTER0, MS_COUNTER1, NONE) \ P4_ESCR(TBPU_ESCR1, 0x3C3, MS_COUNTER2, MS_COUNTER3, NONE) \ P4_ESCR(TC_ESCR0, 0x3C4, MS_COUNTER0, MS_COUNTER1, NONE) \ P4_ESCR(TC_ESCR1, 0x3C5, MS_COUNTER2, MS_COUNTER3, NONE) \ P4_ESCR(IX_ESCR0, 0x3C8, BPU_COUNTER0, BPU_COUNTER1, NONE) \ P4_ESCR(IX_ESCR1, 0x3C9, BPU_COUNTER2, BPU_COUNTER3, NONE) \ P4_ESCR(ALF_ESCR0, 0x3CA, IQ_COUNTER0, IQ_COUNTER1, IQ_COUNTER4) \ P4_ESCR(ALF_ESCR1, 0x3CB, IQ_COUNTER2, IQ_COUNTER3, IQ_COUNTER5) \ P4_ESCR(CRU_ESCR2, 0x3CC, IQ_COUNTER0, IQ_COUNTER1, IQ_COUNTER4) \ P4_ESCR(CRU_ESCR3, 0x3CD, IQ_COUNTER2, IQ_COUNTER3, IQ_COUNTER5) \ P4_ESCR(CRU_ESCR4, 0x3E0, IQ_COUNTER0, IQ_COUNTER1, IQ_COUNTER4) \ P4_ESCR(CRU_ESCR5, 0x3E1, IQ_COUNTER2, IQ_COUNTER3, IQ_COUNTER5) \ P4_ESCR(NONE, ~0, NONE, NONE, NONE) enum pmc_p4escr { #define P4_ESCR(N, MSR, P1, P2, P3) P4_ESCR_##N , P4_ESCRS() #undef P4_ESCR }; struct pmc_p4escr_descr { const char pm_escrname[PMC_NAME_MAX]; u_short pm_escr_msr; const enum pmc_p4pmc pm_pmcs[P4_MAX_PMC_PER_ESCR]; }; static struct pmc_p4escr_descr p4_escrs[] = { #define P4_ESCR(N, MSR, P1, P2, P3) \ { \ .pm_escrname = #N, \ .pm_escr_msr = (MSR), \ .pm_pmcs = \ { \ P4_PMC_##P1, \ P4_PMC_##P2, \ P4_PMC_##P3 \ } \ } , P4_ESCRS() #undef P4_ESCR }; /* * P4 Event descriptor */ struct p4_event_descr { const enum pmc_event pm_event; const uint32_t pm_escr_eventselect; const uint32_t pm_cccr_select; const char pm_is_ti_event; enum pmc_p4escr pm_escrs[P4_MAX_ESCR_PER_EVENT]; }; static struct p4_event_descr p4_events[] = { #define P4_EVDESCR(NAME, ESCREVENTSEL, CCCRSEL, TI_EVENT, ESCR0, ESCR1) \ { \ .pm_event = PMC_EV_P4_##NAME, \ .pm_escr_eventselect = (ESCREVENTSEL), \ .pm_cccr_select = (CCCRSEL), \ .pm_is_ti_event = (TI_EVENT), \ .pm_escrs = \ { \ P4_ESCR_##ESCR0, \ P4_ESCR_##ESCR1 \ } \ } P4_EVDESCR(TC_DELIVER_MODE, 0x01, 0x01, TRUE, TC_ESCR0, TC_ESCR1), P4_EVDESCR(BPU_FETCH_REQUEST, 0x03, 0x00, FALSE, BPU_ESCR0, BPU_ESCR1), P4_EVDESCR(ITLB_REFERENCE, 0x18, 0x03, FALSE, ITLB_ESCR0, ITLB_ESCR1), P4_EVDESCR(MEMORY_CANCEL, 0x02, 0x05, FALSE, DAC_ESCR0, DAC_ESCR1), P4_EVDESCR(MEMORY_COMPLETE, 0x08, 0x02, FALSE, SAAT_ESCR0, SAAT_ESCR1), P4_EVDESCR(LOAD_PORT_REPLAY, 0x04, 0x02, FALSE, SAAT_ESCR0, SAAT_ESCR1), P4_EVDESCR(STORE_PORT_REPLAY, 0x05, 0x02, FALSE, SAAT_ESCR0, SAAT_ESCR1), P4_EVDESCR(MOB_LOAD_REPLAY, 0x03, 0x02, FALSE, MOB_ESCR0, MOB_ESCR1), P4_EVDESCR(PAGE_WALK_TYPE, 0x01, 0x04, TRUE, PMH_ESCR0, PMH_ESCR1), P4_EVDESCR(BSQ_CACHE_REFERENCE, 0x0C, 0x07, FALSE, BSU_ESCR0, BSU_ESCR1), P4_EVDESCR(IOQ_ALLOCATION, 0x03, 0x06, FALSE, FSB_ESCR0, FSB_ESCR1), P4_EVDESCR(IOQ_ACTIVE_ENTRIES, 0x1A, 0x06, FALSE, FSB_ESCR1, NONE), P4_EVDESCR(FSB_DATA_ACTIVITY, 0x17, 0x06, TRUE, FSB_ESCR0, FSB_ESCR1), P4_EVDESCR(BSQ_ALLOCATION, 0x05, 0x07, FALSE, BSU_ESCR0, NONE), P4_EVDESCR(BSQ_ACTIVE_ENTRIES, 0x06, 0x07, FALSE, BSU_ESCR1, NONE), /* BSQ_ACTIVE_ENTRIES inherits CPU specificity from BSQ_ALLOCATION */ P4_EVDESCR(SSE_INPUT_ASSIST, 0x34, 0x01, TRUE, FIRM_ESCR0, FIRM_ESCR1), P4_EVDESCR(PACKED_SP_UOP, 
0x08, 0x01, TRUE, FIRM_ESCR0, FIRM_ESCR1), P4_EVDESCR(PACKED_DP_UOP, 0x0C, 0x01, TRUE, FIRM_ESCR0, FIRM_ESCR1), P4_EVDESCR(SCALAR_SP_UOP, 0x0A, 0x01, TRUE, FIRM_ESCR0, FIRM_ESCR1), P4_EVDESCR(SCALAR_DP_UOP, 0x0E, 0x01, TRUE, FIRM_ESCR0, FIRM_ESCR1), P4_EVDESCR(64BIT_MMX_UOP, 0x02, 0x01, TRUE, FIRM_ESCR0, FIRM_ESCR1), P4_EVDESCR(128BIT_MMX_UOP, 0x1A, 0x01, TRUE, FIRM_ESCR0, FIRM_ESCR1), P4_EVDESCR(X87_FP_UOP, 0x04, 0x01, TRUE, FIRM_ESCR0, FIRM_ESCR1), P4_EVDESCR(X87_SIMD_MOVES_UOP, 0x2E, 0x01, TRUE, FIRM_ESCR0, FIRM_ESCR1), P4_EVDESCR(GLOBAL_POWER_EVENTS, 0x13, 0x06, FALSE, FSB_ESCR0, FSB_ESCR1), P4_EVDESCR(TC_MS_XFER, 0x05, 0x00, FALSE, MS_ESCR0, MS_ESCR1), P4_EVDESCR(UOP_QUEUE_WRITES, 0x09, 0x00, FALSE, MS_ESCR0, MS_ESCR1), P4_EVDESCR(RETIRED_MISPRED_BRANCH_TYPE, 0x05, 0x02, FALSE, TBPU_ESCR0, TBPU_ESCR1), P4_EVDESCR(RETIRED_BRANCH_TYPE, 0x04, 0x02, FALSE, TBPU_ESCR0, TBPU_ESCR1), P4_EVDESCR(RESOURCE_STALL, 0x01, 0x01, FALSE, ALF_ESCR0, ALF_ESCR1), P4_EVDESCR(WC_BUFFER, 0x05, 0x05, TRUE, DAC_ESCR0, DAC_ESCR1), P4_EVDESCR(B2B_CYCLES, 0x16, 0x03, TRUE, FSB_ESCR0, FSB_ESCR1), P4_EVDESCR(BNR, 0x08, 0x03, TRUE, FSB_ESCR0, FSB_ESCR1), P4_EVDESCR(SNOOP, 0x06, 0x03, TRUE, FSB_ESCR0, FSB_ESCR1), P4_EVDESCR(RESPONSE, 0x04, 0x03, TRUE, FSB_ESCR0, FSB_ESCR1), P4_EVDESCR(FRONT_END_EVENT, 0x08, 0x05, FALSE, CRU_ESCR2, CRU_ESCR3), P4_EVDESCR(EXECUTION_EVENT, 0x0C, 0x05, FALSE, CRU_ESCR2, CRU_ESCR3), P4_EVDESCR(REPLAY_EVENT, 0x09, 0x05, FALSE, CRU_ESCR2, CRU_ESCR3), P4_EVDESCR(INSTR_RETIRED, 0x02, 0x04, FALSE, CRU_ESCR0, CRU_ESCR1), P4_EVDESCR(UOPS_RETIRED, 0x01, 0x04, FALSE, CRU_ESCR0, CRU_ESCR1), P4_EVDESCR(UOP_TYPE, 0x02, 0x02, FALSE, RAT_ESCR0, RAT_ESCR1), P4_EVDESCR(BRANCH_RETIRED, 0x06, 0x05, FALSE, CRU_ESCR2, CRU_ESCR3), P4_EVDESCR(MISPRED_BRANCH_RETIRED, 0x03, 0x04, FALSE, CRU_ESCR0, CRU_ESCR1), P4_EVDESCR(X87_ASSIST, 0x03, 0x05, FALSE, CRU_ESCR2, CRU_ESCR3), P4_EVDESCR(MACHINE_CLEAR, 0x02, 0x05, FALSE, CRU_ESCR2, CRU_ESCR3) #undef P4_EVDESCR }; #define P4_EVENT_IS_TI(E) ((E)->pm_is_ti_event == TRUE) #define P4_NEVENTS (PMC_EV_P4_LAST - PMC_EV_P4_FIRST + 1) /* * P4 PMC descriptors */ struct p4pmc_descr { struct pmc_descr pm_descr; /* common information */ enum pmc_p4pmc pm_pmcnum; /* PMC number */ uint32_t pm_pmc_msr; /* PERFCTR MSR address */ uint32_t pm_cccr_msr; /* CCCR MSR address */ }; static struct p4pmc_descr p4_pmcdesc[P4_NPMCS] = { /* * TSC descriptor */ { .pm_descr = { .pd_name = "TSC", .pd_class = PMC_CLASS_TSC, .pd_caps = PMC_CAP_READ | PMC_CAP_WRITE, .pd_width = 64 }, .pm_pmcnum = ~0, .pm_cccr_msr = ~0, .pm_pmc_msr = 0x10, }, /* * P4 PMCS */ #define P4_PMC_CAPS (PMC_CAP_INTERRUPT | PMC_CAP_USER | PMC_CAP_SYSTEM | \ PMC_CAP_EDGE | PMC_CAP_THRESHOLD | PMC_CAP_READ | PMC_CAP_WRITE | \ PMC_CAP_INVERT | PMC_CAP_QUALIFIER | PMC_CAP_PRECISE | \ PMC_CAP_TAGGING | PMC_CAP_CASCADE) #define P4_PMCDESCR(N, PMC, CCCR) \ { \ .pm_descr = \ { \ .pd_name = #N, \ .pd_class = PMC_CLASS_P4, \ .pd_caps = P4_PMC_CAPS, \ .pd_width = 40 \ }, \ .pm_pmcnum = P4_PMC_##N, \ .pm_cccr_msr = (CCCR), \ .pm_pmc_msr = (PMC) \ } P4_PMCDESCR(BPU_COUNTER0, 0x300, 0x360), P4_PMCDESCR(BPU_COUNTER1, 0x301, 0x361), P4_PMCDESCR(BPU_COUNTER2, 0x302, 0x362), P4_PMCDESCR(BPU_COUNTER3, 0x303, 0x363), P4_PMCDESCR(MS_COUNTER0, 0x304, 0x364), P4_PMCDESCR(MS_COUNTER1, 0x305, 0x365), P4_PMCDESCR(MS_COUNTER2, 0x306, 0x366), P4_PMCDESCR(MS_COUNTER3, 0x307, 0x367), P4_PMCDESCR(FLAME_COUNTER0, 0x308, 0x368), P4_PMCDESCR(FLAME_COUNTER1, 0x309, 0x369), P4_PMCDESCR(FLAME_COUNTER2, 0x30A, 0x36A), P4_PMCDESCR(FLAME_COUNTER3, 0x30B, 0x36B), 
P4_PMCDESCR(IQ_COUNTER0, 0x30C, 0x36C), P4_PMCDESCR(IQ_COUNTER1, 0x30D, 0x36D), P4_PMCDESCR(IQ_COUNTER2, 0x30E, 0x36E), P4_PMCDESCR(IQ_COUNTER3, 0x30F, 0x36F), P4_PMCDESCR(IQ_COUNTER4, 0x310, 0x370), P4_PMCDESCR(IQ_COUNTER5, 0x311, 0x371), #undef P4_PMCDESCR }; /* HTT support */ #define P4_NHTT 2 /* logical processors/chip */ #define P4_HTT_CPU_INDEX_0 0 #define P4_HTT_CPU_INDEX_1 1 static int p4_system_has_htt; /* * Per-CPU data structure for P4 class CPUs * * [common stuff] * [19 struct pmc_hw pointers] * [19 struct pmc_hw structures] * [45 ESCRs status bytes] * [per-cpu spin mutex] * [19 flag fields for holding config flags and a runcount] * [19*2 hw value fields] (Thread mode PMC support) * or * [19*2 EIP values] (Sampling mode PMCs) * [19*2 pmc value fields] (Thread mode PMC support)) */ struct p4_cpu { struct pmc_cpu pc_common; struct pmc_hw *pc_hwpmcs[P4_NPMCS]; struct pmc_hw pc_p4pmcs[P4_NPMCS]; char pc_escrs[P4_NESCR]; struct mtx pc_mtx; /* spin lock */ unsigned char pc_flags[P4_NPMCS]; /* 4 bits each: {cfg,run}count */ union { pmc_value_t pc_hw[P4_NPMCS * P4_NHTT]; uintptr_t pc_ip[P4_NPMCS * P4_NHTT]; } pc_si; pmc_value_t pc_pmc_values[P4_NPMCS * P4_NHTT]; }; #define P4_PCPU_PMC_VALUE(PC,RI,CPU) (PC)->pc_pmc_values[(RI)*((CPU) & 1)] #define P4_PCPU_HW_VALUE(PC,RI,CPU) (PC)->pc_si.pc_hw[(RI)*((CPU) & 1)] #define P4_PCPU_SAVED_IP(PC,RI,CPU) (PC)->pc_si.pc_ip[(RI)*((CPU) & 1)] #define P4_PCPU_GET_FLAGS(PC,RI,MASK) ((PC)->pc_flags[(RI)] & (MASK)) #define P4_PCPU_SET_FLAGS(PC,RI,MASK,VAL) do { \ char _tmp; \ _tmp = (PC)->pc_flags[(RI)]; \ _tmp &= ~(MASK); \ _tmp |= (VAL) & (MASK); \ (PC)->pc_flags[(RI)] = _tmp; \ } while (0) #define P4_PCPU_GET_RUNCOUNT(PC,RI) P4_PCPU_GET_FLAGS(PC,RI,0x0F) #define P4_PCPU_SET_RUNCOUNT(PC,RI,V) P4_PCPU_SET_FLAGS(PC,RI,0x0F,V) #define P4_PCPU_GET_CFGFLAGS(PC,RI) (P4_PCPU_GET_FLAGS(PC,RI,0xF0) >> 4) #define P4_PCPU_SET_CFGFLAGS(PC,RI,C) P4_PCPU_SET_FLAGS(PC,RI,0xF0,((C) <<4)) #define P4_CPU_TO_FLAG(C) (pmc_cpu_is_logical(cpu) ? 0x2 : 0x1) /* ESCR row disposition */ static int p4_escrdisp[P4_NESCR]; #define P4_ESCR_ROW_DISP_IS_THREAD(E) (p4_escrdisp[(E)] > 0) #define P4_ESCR_ROW_DISP_IS_STANDALONE(E) (p4_escrdisp[(E)] < 0) #define P4_ESCR_ROW_DISP_IS_FREE(E) (p4_escrdisp[(E)] == 0) #define P4_ESCR_MARK_ROW_STANDALONE(E) do { \ KASSERT(p4_escrdisp[(E)] <= 0, ("[p4,%d] row disposition error",\ __LINE__)); \ atomic_add_int(&p4_escrdisp[(E)], -1); \ KASSERT(p4_escrdisp[(E)] >= (-mp_ncpus), ("[p4,%d] row " \ "disposition error", __LINE__)); \ } while (0) #define P4_ESCR_UNMARK_ROW_STANDALONE(E) do { \ atomic_add_int(&p4_escrdisp[(E)], 1); \ KASSERT(p4_escrdisp[(E)] <= 0, ("[p4,%d] row disposition error",\ __LINE__)); \ } while (0) #define P4_ESCR_MARK_ROW_THREAD(E) do { \ KASSERT(p4_escrdisp[(E)] >= 0, ("[p4,%d] row disposition error", \ __LINE__)); \ atomic_add_int(&p4_escrdisp[(E)], 1); \ } while (0) #define P4_ESCR_UNMARK_ROW_THREAD(E) do { \ atomic_add_int(&p4_escrdisp[(E)], -1); \ KASSERT(p4_escrdisp[(E)] >= 0, ("[p4,%d] row disposition error",\ __LINE__)); \ } while (0) #define P4_PMC_IS_STOPPED(cccr) ((rdmsr(cccr) & P4_CCCR_ENABLE) == 0) #define P4_TO_PHYSICAL_CPU(cpu) (pmc_cpu_is_logical(cpu) ? 
\ ((cpu) & ~1) : (cpu)) #define P4_CCCR_Tx_MASK (~(P4_CCCR_OVF_PMI_T0|P4_CCCR_OVF_PMI_T1| \ P4_CCCR_ENABLE|P4_CCCR_OVF)) #define P4_ESCR_Tx_MASK (~(P4_ESCR_T0_OS|P4_ESCR_T0_USR|P4_ESCR_T1_OS| \ P4_ESCR_T1_USR)) /* * support routines */ static struct p4_event_descr * p4_find_event(enum pmc_event ev) { int n; for (n = 0; n < P4_NEVENTS; n++) if (p4_events[n].pm_event == ev) break; if (n == P4_NEVENTS) return NULL; return &p4_events[n]; } /* * Initialize per-cpu state */ static int p4_init(int cpu) { int n, phycpu; char *pescr; struct p4_cpu *pcs; struct pmc_hw *phw; KASSERT(cpu >= 0 && cpu < mp_ncpus, ("[p4,%d] insane cpu number %d", __LINE__, cpu)); PMCDBG(MDP,INI,0, "p4-init cpu=%d logical=%d", cpu, pmc_cpu_is_logical(cpu) != 0); /* * A 'logical' CPU shares its per-cpu state with its physical * CPU. The physical CPU would have been initialized prior to * the initialization for this cpu. */ if (pmc_cpu_is_logical(cpu)) { phycpu = P4_TO_PHYSICAL_CPU(cpu); pcs = (struct p4_cpu *) pmc_pcpu[phycpu]; PMCDBG(MDP,INI,1, "p4-init cpu=%d phycpu=%d pcs=%p", cpu, phycpu, pcs); KASSERT(pcs, ("[p4,%d] Null Per-Cpu state cpu=%d phycpu=%d", __LINE__, cpu, phycpu)); if (pcs == NULL) /* decline to init */ return ENXIO; p4_system_has_htt = 1; pmc_pcpu[cpu] = (struct pmc_cpu *) pcs; return 0; } MALLOC(pcs, struct p4_cpu *, sizeof(struct p4_cpu), M_PMC, M_WAITOK|M_ZERO); if (pcs == NULL) return ENOMEM; phw = pcs->pc_p4pmcs; for (n = 0; n < P4_NPMCS; n++, phw++) { phw->phw_state = PMC_PHW_FLAG_IS_ENABLED | PMC_PHW_CPU_TO_STATE(cpu) | PMC_PHW_INDEX_TO_STATE(n); phw->phw_pmc = NULL; pcs->pc_hwpmcs[n] = phw; } /* Mark the TSC as shareable */ pcs->pc_hwpmcs[0]->phw_state |= PMC_PHW_FLAG_IS_SHAREABLE; pescr = pcs->pc_escrs; for (n = 0; n < P4_NESCR; n++) *pescr++ = P4_INVALID_PMC_INDEX; pmc_pcpu[cpu] = (struct pmc_cpu *) pcs; mtx_init(&pcs->pc_mtx, "p4-pcpu", "pmc", MTX_SPIN); return 0; } /* * Destroy per-cpu state. */ static int p4_cleanup(int cpu) { struct p4_cpu *pcs; PMCDBG(MDP,INI,0, "p4-cleanup cpu=%d", cpu); /* * Free up the per-cpu structure for the given cpu if * allocated, and if this is a physical CPU. */ if ((pcs = (struct p4_cpu *) pmc_pcpu[cpu]) != NULL && !pmc_cpu_is_logical(cpu)) { mtx_destroy(&pcs->pc_mtx); FREE(pcs, M_PMC); } pmc_pcpu[cpu] = NULL; return 0; } /* * Context switch in. */ static int p4_switch_in(struct pmc_cpu *pc, struct pmc_process *pp) { (void) pc; PMCDBG(MDP,SWI,1, "pc=%p pp=%p enable-msr=%d", pc, pp, (pp->pp_flags & PMC_PP_ENABLE_MSR_ACCESS) != 0); /* enable the RDPMC instruction */ if (pp->pp_flags & PMC_PP_ENABLE_MSR_ACCESS) load_cr4(rcr4() | CR4_PCE); PMCDBG(MDP,SWI,2, "cr4=0x%x", rcr4()); return 0; } /* * Context switch out. 
*/ static int p4_switch_out(struct pmc_cpu *pc, struct pmc_process *pp) { (void) pc; (void) pp; /* can be null */ PMCDBG(MDP,SWO,1, "pc=%p pp=%p", pc, pp); /* always disallow the RDPMC instruction */ load_cr4(rcr4() & ~CR4_PCE); PMCDBG(MDP,SWO,2, "cr4=0x%x", rcr4()); return 0; } /* * Read a PMC */ static int p4_read_pmc(int cpu, int ri, pmc_value_t *v) { enum pmc_mode mode; struct p4pmc_descr *pd; struct pmc *pm; struct p4_cpu *pc; struct pmc_hw *phw; pmc_value_t tmp; KASSERT(cpu >= 0 && cpu < mp_ncpus, ("[p4,%d] illegal CPU value %d", __LINE__, cpu)); KASSERT(ri >= 0 && ri < P4_NPMCS, ("[p4,%d] illegal row-index %d", __LINE__, ri)); pc = (struct p4_cpu *) pmc_pcpu[P4_TO_PHYSICAL_CPU(cpu)]; phw = pc->pc_hwpmcs[ri]; pd = &p4_pmcdesc[ri]; pm = phw->phw_pmc; KASSERT(pm != NULL, ("[p4,%d] No owner for HWPMC [cpu%d,pmc%d]", __LINE__, cpu, ri)); KASSERT(pd->pm_descr.pd_class == PMC_TO_CLASS(pm), ("[p4,%d] class mismatch pd %d != id class %d", __LINE__, pd->pm_descr.pd_class, PMC_TO_CLASS(pm))); mode = PMC_TO_MODE(pm); PMCDBG(MDP,REA,1, "p4-read cpu=%d ri=%d mode=%d", cpu, ri, mode); if (PMC_TO_CLASS(pm) == PMC_CLASS_TSC) { KASSERT(PMC_IS_COUNTING_MODE(mode), ("[p4,%d] TSC counter in non-counting mode", __LINE__)); *v = rdtsc(); PMCDBG(MDP,REA,2, "p4-read -> %jx", *v); return 0; } KASSERT(pd->pm_descr.pd_class == PMC_CLASS_P4, ("[p4,%d] unknown PMC class %d", __LINE__, pd->pm_descr.pd_class)); tmp = rdmsr(p4_pmcdesc[ri].pm_pmc_msr); if (PMC_IS_VIRTUAL_MODE(mode)) { if (tmp < P4_PCPU_HW_VALUE(pc,ri,cpu)) /* 40 bit overflow */ tmp += (P4_PERFCTR_MASK + 1) - P4_PCPU_HW_VALUE(pc,ri,cpu); else tmp -= P4_PCPU_HW_VALUE(pc,ri,cpu); tmp += P4_PCPU_PMC_VALUE(pc,ri,cpu); } if (PMC_IS_SAMPLING_MODE(mode)) /* undo transformation */ *v = P4_PERFCTR_VALUE_TO_RELOAD_COUNT(tmp); else *v = tmp; PMCDBG(MDP,REA,2, "p4-read -> %jx", *v); return 0; } /* * Write a PMC */ static int p4_write_pmc(int cpu, int ri, pmc_value_t v) { enum pmc_mode mode; struct pmc *pm; struct p4_cpu *pc; const struct pmc_hw *phw; const struct p4pmc_descr *pd; KASSERT(cpu >= 0 && cpu < mp_ncpus, ("[amd,%d] illegal CPU value %d", __LINE__, cpu)); KASSERT(ri >= 0 && ri < P4_NPMCS, ("[amd,%d] illegal row-index %d", __LINE__, ri)); pc = (struct p4_cpu *) pmc_pcpu[P4_TO_PHYSICAL_CPU(cpu)]; phw = pc->pc_hwpmcs[ri]; pm = phw->phw_pmc; pd = &p4_pmcdesc[ri]; KASSERT(pm != NULL, ("[p4,%d] No owner for HWPMC [cpu%d,pmc%d]", __LINE__, cpu, ri)); mode = PMC_TO_MODE(pm); PMCDBG(MDP,WRI,1, "p4-write cpu=%d ri=%d mode=%d v=%jx", cpu, ri, mode, v); /* * The P4's TSC register is writeable, but we don't allow a * write as changing the TSC's value could interfere with * timekeeping and other system functions. */ if (PMC_TO_CLASS(pm) == PMC_CLASS_TSC) return 0; /* * write the PMC value to the register/saved value: for * sampling mode PMCs, the value to be programmed into the PMC * counter is -(C+1) where 'C' is the requested sample rate. */ if (PMC_IS_SAMPLING_MODE(mode)) v = P4_RELOAD_COUNT_TO_PERFCTR_VALUE(v); if (PMC_IS_SYSTEM_MODE(mode)) wrmsr(pd->pm_pmc_msr, v); else P4_PCPU_PMC_VALUE(pc,ri,cpu) = v; return 0; } /* * Configure a PMC 'pm' on the given CPU and row-index. * * 'pm' may be NULL to indicate de-configuration. * * On HTT systems, a PMC may get configured twice, once for each * "logical" CPU. We track this using the CFGFLAGS field of the * per-cpu state; this field is a bit mask with one bit each for * logical CPUs 0 & 1. 
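 *
 * An illustrative trace of the CFGFLAGS bits (hypothetical call
 * sequence, not part of this change); physical CPUs map to flag 0x1
 * and logical CPUs to flag 0x2 via P4_CPU_TO_FLAG():
 *
 *   p4_config_pmc(cpu0, ri, pm);    cfgflags: 0x0 -> 0x1, phw_pmc = pm
 *   p4_config_pmc(cpu1, ri, pm);    cfgflags: 0x1 -> 0x3
 *   p4_config_pmc(cpu1, ri, NULL);  cfgflags: 0x3 -> 0x1
 *   p4_config_pmc(cpu0, ri, NULL);  cfgflags: 0x1 -> 0x0, phw_pmc = NULL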
*/ static int p4_config_pmc(int cpu, int ri, struct pmc *pm) { struct pmc_hw *phw; struct p4_cpu *pc; int cfgflags, cpuflag; KASSERT(cpu >= 0 && cpu < mp_ncpus, ("[p4,%d] illegal CPU %d", __LINE__, cpu)); KASSERT(ri >= 0 && ri < P4_NPMCS, ("[p4,%d] illegal row-index %d", __LINE__, ri)); pc = (struct p4_cpu *) pmc_pcpu[P4_TO_PHYSICAL_CPU(cpu)]; phw = pc->pc_hwpmcs[ri]; KASSERT(pm == NULL || phw->phw_pmc == NULL || (p4_system_has_htt && phw->phw_pmc == pm), ("[p4,%d] hwpmc not unconfigured before re-config", __LINE__)); mtx_lock_spin(&pc->pc_mtx); cfgflags = P4_PCPU_GET_CFGFLAGS(pc,ri); KASSERT(cfgflags >= 0 || cfgflags <= 3, ("[p4,%d] illegal cfgflags cfg=%d on cpu=%d ri=%d", __LINE__, cfgflags, cpu, ri)); KASSERT(cfgflags == 0 || phw->phw_pmc, ("[p4,%d] cpu=%d ri=%d pmc configured with zero cfg count", __LINE__, cpu, ri)); PMCDBG(MDP,CFG,1, "cpu=%d ri=%d cfg=%d pm=%p", cpu, ri, cfgflags, pm); cpuflag = P4_CPU_TO_FLAG(cpu); if (pm) { /* config */ if (cfgflags == 0) phw->phw_pmc = pm; KASSERT(phw->phw_pmc == pm, ("[p4,%d] cpu=%d ri=%d config %p != hw %p", __LINE__, cpu, ri, pm, phw->phw_pmc)); cfgflags |= cpuflag; } else { /* unconfig */ cfgflags &= ~cpuflag; if (cfgflags == 0) phw->phw_pmc = NULL; } KASSERT(cfgflags >= 0 || cfgflags <= 3, ("[p4,%d] illegal runcount cfg=%d on cpu=%d ri=%d", __LINE__, cfgflags, cpu, ri)); P4_PCPU_SET_CFGFLAGS(pc,ri,cfgflags); mtx_unlock_spin(&pc->pc_mtx); return 0; } /* * Retrieve a configured PMC pointer from hardware state. */ static int p4_get_config(int cpu, int ri, struct pmc **ppm) { struct p4_cpu *pc; struct pmc_hw *phw; int cfgflags; pc = (struct p4_cpu *) pmc_pcpu[P4_TO_PHYSICAL_CPU(cpu)]; phw = pc->pc_hwpmcs[ri]; mtx_lock_spin(&pc->pc_mtx); cfgflags = P4_PCPU_GET_CFGFLAGS(pc,ri); mtx_unlock_spin(&pc->pc_mtx); if (cfgflags & P4_CPU_TO_FLAG(cpu)) *ppm = phw->phw_pmc; /* PMC config'ed on this CPU */ else *ppm = NULL; return 0; } /* * Allocate a PMC. * * The allocation strategy differs between HTT and non-HTT systems. * * The non-HTT case: * - Given the desired event and the PMC row-index, lookup the * list of valid ESCRs for the event. * - For each valid ESCR: * - Check if the ESCR is free and the ESCR row is in a compatible * mode (i.e., system or process)) * - Check if the ESCR is usable with a P4 PMC at the desired row-index. * If everything matches, we determine the appropriate bit values for the * ESCR and CCCR registers. * * The HTT case: * * - Process mode PMCs require special care. The FreeBSD scheduler could * schedule any two processes on the same physical CPU. We need to ensure * that a given PMC row-index is never allocated to two different * PMCs owned by different user-processes. * This is ensured by always allocating a PMC from a 'FREE' PMC row * if the system has HTT active. * - A similar check needs to be done for ESCRs; we do not want two PMCs * using the same ESCR to be scheduled at the same time. Thus ESCR * allocation is also restricted to FREE rows if the system has HTT * enabled. * - Thirdly, some events are 'thread-independent' terminology, i.e., * the PMC hardware cannot distinguish between events caused by * different logical CPUs. This makes it impossible to assign events * to a given thread of execution. If the system has HTT enabled, * these events are not allowed for process-mode PMCs. 
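 *
 * In outline, the checks made by p4_allocate_pmc() below are:
 *
 * 1. Verify the PMC class and the requested capabilities.
 * 2. Treat the TSC as a system-wide counting pseudo-PMC.
 * 3. On HTT systems, refuse process-virtual allocations on PMC rows
 *    that are not FREE.
 * 4. Look up the event descriptor; reject thread-independent events
 *    for process-virtual PMCs when HTT is enabled.
 * 5. Scan the event's candidate ESCRs for one whose row disposition
 *    and PMC list are compatible with row-index 'ri', then compute
 *    the CCCR and ESCR register values.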
*/ static int p4_allocate_pmc(int cpu, int ri, struct pmc *pm, const struct pmc_op_pmcallocate *a) { int found, n, m; uint32_t caps, cccrvalue, escrvalue, tflags; enum pmc_p4escr escr; struct p4_cpu *pc; struct p4_event_descr *pevent; const struct p4pmc_descr *pd; KASSERT(cpu >= 0 && cpu < mp_ncpus, ("[p4,%d] illegal CPU %d", __LINE__, cpu)); KASSERT(ri >= 0 && ri < P4_NPMCS, ("[p4,%d] illegal row-index value %d", __LINE__, ri)); pd = &p4_pmcdesc[ri]; PMCDBG(MDP,ALL,1, "p4-allocate ri=%d class=%d pmccaps=0x%x " "reqcaps=0x%x", ri, pd->pm_descr.pd_class, pd->pm_descr.pd_caps, pm->pm_caps); /* check class */ if (pd->pm_descr.pd_class != a->pm_class) return EINVAL; /* check requested capabilities */ caps = a->pm_caps; if ((pd->pm_descr.pd_caps & caps) != caps) return EPERM; if (pd->pm_descr.pd_class == PMC_CLASS_TSC) { /* TSC's are always allocated in system-wide counting mode */ if (a->pm_ev != PMC_EV_TSC_TSC || a->pm_mode != PMC_MODE_SC) return EINVAL; return 0; } /* * If the system has HTT enabled, and the desired allocation * mode is process-private, and the PMC row disposition is not * FREE (0), decline the allocation. */ if (p4_system_has_htt && PMC_IS_VIRTUAL_MODE(PMC_TO_MODE(pm)) && pmc_getrowdisp(ri) != 0) return EBUSY; KASSERT(pd->pm_descr.pd_class == PMC_CLASS_P4, ("[p4,%d] unknown PMC class %d", __LINE__, pd->pm_descr.pd_class)); if (pm->pm_event < PMC_EV_P4_FIRST || pm->pm_event > PMC_EV_P4_LAST) return EINVAL; if ((pevent = p4_find_event(pm->pm_event)) == NULL) return ESRCH; PMCDBG(MDP,ALL,2, "pevent={ev=%d,escrsel=0x%x,cccrsel=0x%x,isti=%d}", pevent->pm_event, pevent->pm_escr_eventselect, pevent->pm_cccr_select, pevent->pm_is_ti_event); /* * Some PMC events are 'thread independent'and therefore * cannot be used for process-private modes if HTT is being * used. */ if (P4_EVENT_IS_TI(pevent) && PMC_IS_VIRTUAL_MODE(PMC_TO_MODE(pm)) && p4_system_has_htt) return EINVAL; pc = (struct p4_cpu *) pmc_pcpu[P4_TO_PHYSICAL_CPU(cpu)]; found = 0; /* look for a suitable ESCR for this event */ for (n = 0; n < P4_MAX_ESCR_PER_EVENT && !found; n++) { if ((escr = pevent->pm_escrs[n]) == P4_ESCR_NONE) break; /* out of ESCRs */ /* * Check ESCR row disposition. * * If the request is for a system-mode PMC, then the * ESCR row should not be in process-virtual mode, and * should also be free on the current CPU. */ if (PMC_IS_SYSTEM_MODE(PMC_TO_MODE(pm))) { if (P4_ESCR_ROW_DISP_IS_THREAD(escr) || pc->pc_escrs[escr] != P4_INVALID_PMC_INDEX) continue; } /* * If the request is for a process-virtual PMC, and if * HTT is not enabled, we can use an ESCR row that is * either FREE or already in process mode. * * If HTT is enabled, then we need to ensure that a * given ESCR is never allocated to two PMCS that * could run simultaneously on the two logical CPUs of * a CPU package. We ensure this be only allocating * ESCRs from rows marked as 'FREE'. */ if (PMC_IS_VIRTUAL_MODE(PMC_TO_MODE(pm))) { if (p4_system_has_htt) { if (!P4_ESCR_ROW_DISP_IS_FREE(escr)) continue; } else if (P4_ESCR_ROW_DISP_IS_STANDALONE(escr)) continue; } /* * We found a suitable ESCR for this event. Now check if * this escr can work with the PMC at row-index 'ri'. 
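 *
 * For example, the p4_escrs[] table above lists CRU_ESCR0 as usable
 * only with IQ_COUNTER0, IQ_COUNTER1 and IQ_COUNTER4; a request for
 * any other row-index falls through to the event's alternate ESCR
 * (CRU_ESCR1 in the case of the INSTR_RETIRED event).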
*/ for (m = 0; m < P4_MAX_PMC_PER_ESCR; m++) if (p4_escrs[escr].pm_pmcs[m] == pd->pm_pmcnum) { found = 1; break; } } if (found == 0) return ESRCH; KASSERT((int) escr >= 0 && escr < P4_NESCR, ("[p4,%d] illegal ESCR value %d", __LINE__, escr)); /* mark ESCR row mode */ if (PMC_IS_SYSTEM_MODE(PMC_TO_MODE(pm))) { pc->pc_escrs[escr] = ri; /* mark ESCR as in use on this cpu */ P4_ESCR_MARK_ROW_STANDALONE(escr); } else { KASSERT(pc->pc_escrs[escr] == P4_INVALID_PMC_INDEX, ("[p4,%d] escr[%d] already in use", __LINE__, escr)); P4_ESCR_MARK_ROW_THREAD(escr); } pm->pm_md.pm_p4.pm_p4_escrmsr = p4_escrs[escr].pm_escr_msr; pm->pm_md.pm_p4.pm_p4_escr = escr; cccrvalue = P4_CCCR_TO_ESCR_SELECT(pevent->pm_cccr_select); escrvalue = P4_ESCR_TO_EVENT_SELECT(pevent->pm_escr_eventselect); /* CCCR fields */ if (caps & PMC_CAP_THRESHOLD) cccrvalue |= (a->pm_p4_cccrconfig & P4_CCCR_THRESHOLD_MASK) | P4_CCCR_COMPARE; if (caps & PMC_CAP_EDGE) cccrvalue |= P4_CCCR_EDGE; if (caps & PMC_CAP_INVERT) cccrvalue |= P4_CCCR_COMPLEMENT; if (p4_system_has_htt) cccrvalue |= a->pm_p4_cccrconfig & P4_CCCR_ACTIVE_THREAD_MASK; else /* no HTT; thread field should be '11b' */ cccrvalue |= P4_CCCR_TO_ACTIVE_THREAD(0x3); if (caps & PMC_CAP_CASCADE) cccrvalue |= P4_CCCR_CASCADE; /* On HTT systems the PMI T0 field may get moved to T1 at pmc start */ if (caps & PMC_CAP_INTERRUPT) cccrvalue |= P4_CCCR_OVF_PMI_T0; /* ESCR fields */ if (caps & PMC_CAP_QUALIFIER) escrvalue |= a->pm_p4_escrconfig & P4_ESCR_EVENT_MASK_MASK; if (caps & PMC_CAP_TAGGING) escrvalue |= (a->pm_p4_escrconfig & P4_ESCR_TAG_VALUE_MASK) | P4_ESCR_TAG_ENABLE; if (caps & PMC_CAP_QUALIFIER) escrvalue |= (a->pm_p4_escrconfig & P4_ESCR_EVENT_MASK_MASK); /* HTT: T0_{OS,USR} bits may get moved to T1 at pmc start */ tflags = 0; if (caps & PMC_CAP_SYSTEM) tflags |= P4_ESCR_T0_OS; if (caps & PMC_CAP_USER) tflags |= P4_ESCR_T0_USR; if (tflags == 0) tflags = (P4_ESCR_T0_OS|P4_ESCR_T0_USR); escrvalue |= tflags; pm->pm_md.pm_p4.pm_p4_cccrvalue = cccrvalue; pm->pm_md.pm_p4.pm_p4_escrvalue = escrvalue; PMCDBG(MDP,ALL,2, "p4-allocate cccrsel=0x%x cccrval=0x%x " "escr=%d escrmsr=0x%x escrval=0x%x", pevent->pm_cccr_select, cccrvalue, escr, pm->pm_md.pm_p4.pm_p4_escrmsr, escrvalue); return 0; } /* * release a PMC. 
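 *
 * For system-mode PMCs, the ESCR reserved at allocation time is
 * returned to the per-cpu free pool; for thread-mode PMCs only the
 * ESCR row's thread-disposition count is dropped.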
*/ static int p4_release_pmc(int cpu, int ri, struct pmc *pm) { enum pmc_p4escr escr; struct pmc_hw *phw; struct p4_cpu *pc; if (p4_pmcdesc[ri].pm_descr.pd_class == PMC_CLASS_TSC) return 0; escr = pm->pm_md.pm_p4.pm_p4_escr; PMCDBG(MDP,REL,1, "p4-release cpu=%d ri=%d escr=%d", cpu, ri, escr); if (PMC_IS_SYSTEM_MODE(PMC_TO_MODE(pm))) { pc = (struct p4_cpu *) pmc_pcpu[P4_TO_PHYSICAL_CPU(cpu)]; phw = pc->pc_hwpmcs[ri]; KASSERT(phw->phw_pmc == NULL, ("[p4,%d] releasing configured PMC ri=%d", __LINE__, ri)); P4_ESCR_UNMARK_ROW_STANDALONE(escr); KASSERT(pc->pc_escrs[escr] == ri, ("[p4,%d] escr[%d] not allocated to ri %d", __LINE__, escr, ri)); pc->pc_escrs[escr] = P4_INVALID_PMC_INDEX; /* mark as free */ } else P4_ESCR_UNMARK_ROW_THREAD(escr); return 0; } /* * Start a PMC */ static int p4_start_pmc(int cpu, int ri) { int rc; uint32_t cccrvalue, cccrtbits, escrvalue, escrmsr, escrtbits; struct pmc *pm; struct p4_cpu *pc; struct pmc_hw *phw; struct p4pmc_descr *pd; KASSERT(cpu >= 0 && cpu < mp_ncpus, ("[p4,%d] illegal CPU value %d", __LINE__, cpu)); KASSERT(ri >= 0 && ri < P4_NPMCS, ("[p4,%d] illegal row-index %d", __LINE__, ri)); pc = (struct p4_cpu *) pmc_pcpu[P4_TO_PHYSICAL_CPU(cpu)]; phw = pc->pc_hwpmcs[ri]; pm = phw->phw_pmc; pd = &p4_pmcdesc[ri]; KASSERT(pm != NULL, ("[p4,%d] starting cpu%d,pmc%d with null pmc", __LINE__, cpu, ri)); PMCDBG(MDP,STA,1, "p4-start cpu=%d ri=%d", cpu, ri); if (pd->pm_descr.pd_class == PMC_CLASS_TSC) /* TSC are always on */ return 0; KASSERT(pd->pm_descr.pd_class == PMC_CLASS_P4, ("[p4,%d] wrong PMC class %d", __LINE__, pd->pm_descr.pd_class)); /* retrieve the desired CCCR/ESCR values from the PMC */ cccrvalue = pm->pm_md.pm_p4.pm_p4_cccrvalue; escrvalue = pm->pm_md.pm_p4.pm_p4_escrvalue; escrmsr = pm->pm_md.pm_p4.pm_p4_escrmsr; /* extract and zero the logical processor selection bits */ cccrtbits = cccrvalue & P4_CCCR_OVF_PMI_T0; escrtbits = escrvalue & (P4_ESCR_T0_OS|P4_ESCR_T0_USR); cccrvalue &= ~P4_CCCR_OVF_PMI_T0; escrvalue &= ~(P4_ESCR_T0_OS|P4_ESCR_T0_USR); if (pmc_cpu_is_logical(cpu)) { /* shift T0 bits to T1 position */ cccrtbits <<= 1; escrtbits >>= 2; } /* start system mode PMCs directly */ if (PMC_IS_SYSTEM_MODE(PMC_TO_MODE(pm))) { wrmsr(escrmsr, escrvalue | escrtbits); wrmsr(pd->pm_cccr_msr, cccrvalue | cccrtbits | P4_CCCR_ENABLE); return 0; } /* * Thread mode PMCs * * On HTT machines, the same PMC could be scheduled on the * same physical CPU twice (once for each logical CPU), for * example, if two threads of a multi-threaded process get * scheduled on the same CPU. * */ mtx_lock_spin(&pc->pc_mtx); rc = P4_PCPU_GET_RUNCOUNT(pc,ri); KASSERT(rc == 0 || rc == 1, ("[p4,%d] illegal runcount cpu=%d ri=%d rc=%d", __LINE__, cpu, ri, rc)); if (rc == 0) { /* 1st CPU and the non-HTT case */ KASSERT(P4_PMC_IS_STOPPED(pd->pm_cccr_msr), ("[p4,%d] cpu=%d ri=%d cccr=0x%x not stopped", __LINE__, cpu, ri, pd->pm_cccr_msr)); /* write out the low 40 bits of the saved value to hardware */ wrmsr(pd->pm_pmc_msr, P4_PCPU_PMC_VALUE(pc,ri,cpu) & P4_PERFCTR_MASK); } else if (rc == 1) { /* 2nd CPU */ /* * Stop the PMC and retrieve the CCCR and ESCR values * from their MSRs, and turn on the additional T[0/1] * bits for the 2nd CPU. 
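 *
 * (Illustrative: on an HTT pair, the first logical CPU to start the
 * PMC programs its T0 bits and sets CCCR_ENABLE; the second finds the
 * counter already running and only needs to OR in its own T0/T1 bits.)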
*/ cccrvalue = rdmsr(pd->pm_cccr_msr); wrmsr(pd->pm_cccr_msr, cccrvalue & ~P4_CCCR_ENABLE); /* check that the configuration bits read back match the PMC */ KASSERT((cccrvalue & P4_CCCR_Tx_MASK) == (pm->pm_md.pm_p4.pm_p4_cccrvalue & P4_CCCR_Tx_MASK), ("[p4,%d] Extra CCCR bits cpu=%d rc=%d ri=%d " "cccr=0x%x PMC=0x%x", __LINE__, cpu, rc, ri, cccrvalue & P4_CCCR_Tx_MASK, pm->pm_md.pm_p4.pm_p4_cccrvalue & P4_CCCR_Tx_MASK)); KASSERT(cccrvalue & P4_CCCR_ENABLE, ("[p4,%d] 2nd cpu rc=%d cpu=%d ri=%d not running", __LINE__, rc, cpu, ri)); KASSERT((cccrvalue & cccrtbits) == 0, ("[p4,%d] CCCR T0/T1 mismatch rc=%d cpu=%d ri=%d" "cccrvalue=0x%x tbits=0x%x", __LINE__, rc, cpu, ri, cccrvalue, cccrtbits)); escrvalue = rdmsr(escrmsr); KASSERT((escrvalue & P4_ESCR_Tx_MASK) == (pm->pm_md.pm_p4.pm_p4_escrvalue & P4_ESCR_Tx_MASK), ("[p4,%d] Extra ESCR bits cpu=%d rc=%d ri=%d " "escr=0x%x pm=0x%x", __LINE__, cpu, rc, ri, escrvalue & P4_ESCR_Tx_MASK, pm->pm_md.pm_p4.pm_p4_escrvalue & P4_ESCR_Tx_MASK)); KASSERT((escrvalue & escrtbits) == 0, ("[p4,%d] ESCR T0/T1 mismatch rc=%d cpu=%d ri=%d " "escrmsr=0x%x escrvalue=0x%x tbits=0x%x", __LINE__, rc, cpu, ri, escrmsr, escrvalue, escrtbits)); } /* Enable the correct bits for this CPU. */ escrvalue |= escrtbits; cccrvalue |= cccrtbits | P4_CCCR_ENABLE; /* Save HW value at the time of starting hardware */ P4_PCPU_HW_VALUE(pc,ri,cpu) = rdmsr(pd->pm_pmc_msr); /* Program the ESCR and CCCR and start the PMC */ wrmsr(escrmsr, escrvalue); wrmsr(pd->pm_cccr_msr, cccrvalue); ++rc; P4_PCPU_SET_RUNCOUNT(pc,ri,rc); mtx_unlock_spin(&pc->pc_mtx); PMCDBG(MDP,STA,2,"p4-start cpu=%d rc=%d ri=%d escr=%d " "escrmsr=0x%x escrvalue=0x%x cccr_config=0x%x v=%jx", cpu, rc, ri, pm->pm_md.pm_p4.pm_p4_escr, escrmsr, escrvalue, cccrvalue, P4_PCPU_HW_VALUE(pc,ri,cpu)); return 0; } /* * Stop a PMC. */ static int p4_stop_pmc(int cpu, int ri) { int rc; uint32_t cccrvalue, cccrtbits, escrvalue, escrmsr, escrtbits; struct pmc *pm; struct p4_cpu *pc; struct pmc_hw *phw; struct p4pmc_descr *pd; pmc_value_t tmp; KASSERT(cpu >= 0 && cpu < mp_ncpus, ("[p4,%d] illegal CPU value %d", __LINE__, cpu)); KASSERT(ri >= 0 && ri < P4_NPMCS, ("[p4,%d] illegal row index %d", __LINE__, ri)); pd = &p4_pmcdesc[ri]; if (pd->pm_descr.pd_class == PMC_CLASS_TSC) return 0; pc = (struct p4_cpu *) pmc_pcpu[P4_TO_PHYSICAL_CPU(cpu)]; phw = pc->pc_hwpmcs[ri]; KASSERT(phw != NULL, ("[p4,%d] null phw for cpu%d, ri%d", __LINE__, cpu, ri)); pm = phw->phw_pmc; KASSERT(pm != NULL, ("[p4,%d] null pmc for cpu%d, ri%d", __LINE__, cpu, ri)); PMCDBG(MDP,STO,1, "p4-stop cpu=%d ri=%d", cpu, ri); if (PMC_IS_SYSTEM_MODE(PMC_TO_MODE(pm))) { wrmsr(pd->pm_cccr_msr, pm->pm_md.pm_p4.pm_p4_cccrvalue & ~P4_CCCR_ENABLE); return 0; } /* * Thread mode PMCs. * * On HTT machines, this PMC may be in use by two threads * running on two logical CPUS. Thus we look at the * 'pm_runcount' field and only turn off the appropriate TO/T1 * bits (and keep the PMC running) if two logical CPUs were * using the PMC. 
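 *
 * Illustrative runcount transitions for the code below:
 *
 *   rc 2 -> 1: clear only this thread's T0/T1 bits in the ESCR and
 *              CCCR and re-enable the counter for the other CPU;
 *   rc 1 -> 0: leave the counter disabled.
 *
 * In both cases the hardware delta since 'start' is folded into the
 * saved 64-bit PMC value.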
* */ /* bits to mask */ cccrtbits = P4_CCCR_OVF_PMI_T0; escrtbits = P4_ESCR_T0_OS | P4_ESCR_T0_USR; if (pmc_cpu_is_logical(cpu)) { cccrtbits <<= 1; escrtbits >>= 2; } mtx_lock_spin(&pc->pc_mtx); rc = P4_PCPU_GET_RUNCOUNT(pc,ri); KASSERT(rc == 2 || rc == 1, ("[p4,%d] illegal runcount cpu=%d ri=%d rc=%d", __LINE__, cpu, ri, rc)); --rc; P4_PCPU_SET_RUNCOUNT(pc,ri,rc); /* Stop this PMC */ cccrvalue = rdmsr(pd->pm_cccr_msr); wrmsr(pd->pm_cccr_msr, cccrvalue & ~P4_CCCR_ENABLE); escrmsr = pm->pm_md.pm_p4.pm_p4_escrmsr; escrvalue = rdmsr(escrmsr); /* The current CPU should be running on this PMC */ KASSERT(escrvalue & escrtbits, ("[p4,%d] ESCR T0/T1 mismatch cpu=%d rc=%d ri=%d escrmsr=0x%x " "escrvalue=0x%x tbits=0x%x", __LINE__, cpu, rc, ri, escrmsr, escrvalue, escrtbits)); KASSERT(PMC_IS_COUNTING_MODE(PMC_TO_MODE(pm)) || (cccrvalue & cccrtbits), ("[p4,%d] CCCR T0/T1 mismatch cpu=%d ri=%d cccrvalue=0x%x " "tbits=0x%x", __LINE__, cpu, ri, cccrvalue, cccrtbits)); /* get the current hardware reading */ tmp = rdmsr(pd->pm_pmc_msr); if (rc == 1) { /* need to keep the PMC running */ escrvalue &= ~escrtbits; cccrvalue &= ~cccrtbits; wrmsr(escrmsr, escrvalue); wrmsr(pd->pm_cccr_msr, cccrvalue); } mtx_unlock_spin(&pc->pc_mtx); PMCDBG(MDP,STO,2, "p4-stop cpu=%d rc=%d ri=%d escrmsr=0x%x " "escrval=0x%x cccrval=0x%x v=%jx", cpu, rc, ri, escrmsr, escrvalue, cccrvalue, tmp); if (tmp < P4_PCPU_HW_VALUE(pc,ri,cpu)) /* 40 bit counter overflow */ tmp += (P4_PERFCTR_MASK + 1) - P4_PCPU_HW_VALUE(pc,ri,cpu); else tmp -= P4_PCPU_HW_VALUE(pc,ri,cpu); P4_PCPU_PMC_VALUE(pc,ri,cpu) += tmp; return 0; } /* * Handle an interrupt. * * The hardware sets the CCCR_OVF whenever a counter overflow occurs, so the handler * examines all the 18 CCCR registers, processing the counters that have overflowed. * * On HTT machines, multiple logical CPUs may try to enter the NMI service * routine at the same time. */ extern volatile lapic_t *lapic; static void p4_lapic_enable_pmc_interrupt(void) { uint32_t value; value = lapic->lvt_pcint; value &= ~APIC_LVT_M; lapic->lvt_pcint = value; } static int -p4_intr(int cpu, uintptr_t eip) +p4_intr(int cpu, uintptr_t eip, int usermode) { int i, pmc_interrupted; uint32_t cccrval, pmi_ovf_mask; struct p4_cpu *pc; struct pmc_hw *phw; struct pmc *pm; pmc_value_t v; (void) eip; + (void) usermode; PMCDBG(MDP,INT, 1, "cpu=%d eip=%x pcint=0x%x", cpu, eip, lapic->lvt_pcint); pmc_interrupted = 0; pc = (struct p4_cpu *) pmc_pcpu[cpu]; pmi_ovf_mask = pmc_cpu_is_logical(cpu) ? P4_CCCR_OVF_PMI_T1 : P4_CCCR_OVF_PMI_T0; pmi_ovf_mask |= P4_CCCR_OVF; /* * Loop through all CCCRs, looking for ones that have the * OVF_PMI bit set for our logical CPU. */ for (i = 1; i < P4_NPMCS; i++) { cccrval = rdmsr(P4_CCCR_MSR_FIRST + i - 1); if ((cccrval & pmi_ovf_mask) != pmi_ovf_mask) continue; v = rdmsr(P4_PERFCTR_MSR_FIRST + i - 1); pmc_interrupted = 1; PMCDBG(MDP,INT, 2, "ri=%d v=%jx", i, v); /* Stop the counter, and turn off the overflow bit */ cccrval &= ~(P4_CCCR_OVF | P4_CCCR_ENABLE); wrmsr(P4_CCCR_MSR_FIRST + i - 1, cccrval); phw = pc->pc_hwpmcs[i]; pm = phw->phw_pmc; /* * Ignore de-configured or stopped PMCs. * Also ignore counting mode PMCs that may * have overflowed their counters. */ if (pm == NULL || pm->pm_state != PMC_STATE_RUNNING || !PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm))) continue; /* * If the previous sample hasn't been read yet, the * sampling interrupt is coming in too fast for the * rest of the system to cope. Do not re-enable the * counter. 
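 *
 * (The counter stays stopped in that case; pm_intr_ignored is
 * bumped and the sample is dropped.)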
*/ if (P4_PCPU_SAVED_IP(pc,i,cpu)) { atomic_add_int(&pmc_stats.pm_intr_ignored, 1); continue; } /* * write the the reload count and restart the * hardware. */ v = P4_RELOAD_COUNT_TO_PERFCTR_VALUE( pm->pm_sc.pm_reloadcount); wrmsr(P4_PERFCTR_MSR_FIRST + i - 1, v); wrmsr(P4_CCCR_MSR_FIRST + i - 1, cccrval | P4_CCCR_ENABLE); } if (pmc_interrupted) { /* * On Intel CPUs, the PMC 'pcint' entry in the LAPIC * gets masked when a PMC interrupts the CPU. We need * to unmask this. */ p4_lapic_enable_pmc_interrupt(); /* XXX: Invoke helper (non-NMI) interrupt here */ } return pmc_interrupted; } /* * Describe a CPU's PMC state. */ static int p4_describe(int cpu, int ri, struct pmc_info *pi, struct pmc **ppmc) { int error; size_t copied; struct pmc_hw *phw; const struct p4pmc_descr *pd; KASSERT(cpu >= 0 && cpu < mp_ncpus, ("[p4,%d] illegal CPU %d", __LINE__, cpu)); KASSERT(ri >= 0 && ri < P4_NPMCS, ("[p4,%d] row-index %d out of range", __LINE__, ri)); PMCDBG(MDP,OPS,1,"p4-describe cpu=%d ri=%d", cpu, ri); if (pmc_cpu_is_logical(cpu)) return EINVAL; phw = pmc_pcpu[cpu]->pc_hwpmcs[ri]; pd = &p4_pmcdesc[ri]; if ((error = copystr(pd->pm_descr.pd_name, pi->pm_name, PMC_NAME_MAX, &copied)) != 0) return error; pi->pm_class = pd->pm_descr.pd_class; if (phw->phw_state & PMC_PHW_FLAG_IS_ENABLED) { pi->pm_enabled = TRUE; *ppmc = phw->phw_pmc; } else { pi->pm_enabled = FALSE; *ppmc = NULL; } return 0; } /* * Get MSR# for use with RDPMC. */ static int p4_get_msr(int ri, uint32_t *msr) { KASSERT(ri >= 0 && ri < P4_NPMCS, ("[p4,%d] ri %d out of range", __LINE__, ri)); *msr = p4_pmcdesc[ri].pm_pmc_msr - P4_PERFCTR_MSR_FIRST; PMCDBG(MDP,OPS, 1, "ri=%d getmsr=0x%x", ri, *msr); return 0; } int pmc_initialize_p4(struct pmc_mdep *pmc_mdep) { struct p4_event_descr *pe; KASSERT(strcmp(cpu_vendor, "GenuineIntel") == 0, ("[p4,%d] Initializing non-intel processor", __LINE__)); PMCDBG(MDP,INI,1, "%s", "p4-initialize"); switch (pmc_mdep->pmd_cputype) { case PMC_CPU_INTEL_PIV: pmc_mdep->pmd_npmc = P4_NPMCS; pmc_mdep->pmd_classes[1].pm_class = PMC_CLASS_P4; pmc_mdep->pmd_classes[1].pm_caps = P4_PMC_CAPS; pmc_mdep->pmd_classes[1].pm_width = 40; pmc_mdep->pmd_nclasspmcs[1] = 18; pmc_mdep->pmd_init = p4_init; pmc_mdep->pmd_cleanup = p4_cleanup; pmc_mdep->pmd_switch_in = p4_switch_in; pmc_mdep->pmd_switch_out = p4_switch_out; pmc_mdep->pmd_read_pmc = p4_read_pmc; pmc_mdep->pmd_write_pmc = p4_write_pmc; pmc_mdep->pmd_config_pmc = p4_config_pmc; pmc_mdep->pmd_get_config = p4_get_config; pmc_mdep->pmd_allocate_pmc = p4_allocate_pmc; pmc_mdep->pmd_release_pmc = p4_release_pmc; pmc_mdep->pmd_start_pmc = p4_start_pmc; pmc_mdep->pmd_stop_pmc = p4_stop_pmc; pmc_mdep->pmd_intr = p4_intr; pmc_mdep->pmd_describe = p4_describe; pmc_mdep->pmd_get_msr = p4_get_msr; /* i386 */ /* model specific munging */ if ((cpu_id & 0xFFF) < 0xF27) { /* * On P4 and Xeon with CPUID < (Family 15, * Model 2, Stepping 7), only one ESCR is * available for the IOQ_ALLOCATION event. */ pe = p4_find_event(PMC_EV_P4_IOQ_ALLOCATION); pe->pm_escrs[1] = P4_ESCR_NONE; } break; default: KASSERT(0,("[p4,%d] Unknown CPU type", __LINE__)); return ENOSYS; } return 0; } diff --git a/sys/dev/hwpmc/hwpmc_ppro.c b/sys/dev/hwpmc/hwpmc_ppro.c index 13f91956ca00..370e6e575ff1 100644 --- a/sys/dev/hwpmc/hwpmc_ppro.c +++ b/sys/dev/hwpmc/hwpmc_ppro.c @@ -1,772 +1,772 @@ /*- * Copyright (c) 2003-2005 Joseph Koshy * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include /* * PENTIUM PRO SUPPORT */ struct p6pmc_descr { struct pmc_descr pm_descr; /* common information */ uint32_t pm_pmc_msr; uint32_t pm_evsel_msr; }; static struct p6pmc_descr p6_pmcdesc[P6_NPMCS] = { /* TSC */ { .pm_descr = { .pd_name = "TSC", .pd_class = PMC_CLASS_TSC, .pd_caps = PMC_CAP_READ, .pd_width = 64 }, .pm_pmc_msr = 0x10, .pm_evsel_msr = ~0 }, #define P6_PMC_CAPS (PMC_CAP_INTERRUPT | PMC_CAP_USER | PMC_CAP_SYSTEM | \ PMC_CAP_EDGE | PMC_CAP_THRESHOLD | PMC_CAP_READ | PMC_CAP_WRITE | \ PMC_CAP_INVERT | PMC_CAP_QUALIFIER) /* PMC 0 */ { .pm_descr = { .pd_name ="P6-0", .pd_class = PMC_CLASS_P6, .pd_caps = P6_PMC_CAPS, .pd_width = 40 }, .pm_pmc_msr = P6_MSR_PERFCTR0, .pm_evsel_msr = P6_MSR_EVSEL0 }, /* PMC 1 */ { .pm_descr = { .pd_name ="P6-1", .pd_class = PMC_CLASS_P6, .pd_caps = P6_PMC_CAPS, .pd_width = 40 }, .pm_pmc_msr = P6_MSR_PERFCTR1, .pm_evsel_msr = P6_MSR_EVSEL1 } }; static enum pmc_cputype p6_cputype; /* * P6 Event descriptor */ struct p6_event_descr { const enum pmc_event pm_event; uint32_t pm_evsel; uint32_t pm_flags; uint32_t pm_unitmask; }; static const struct p6_event_descr p6_events[] = { #define P6_EVDESCR(NAME, EVSEL, FLAGS, UMASK) \ { \ .pm_event = PMC_EV_P6_##NAME, \ .pm_evsel = (EVSEL), \ .pm_flags = (FLAGS), \ .pm_unitmask = (UMASK) \ } #define P6F_P6 (1 << PMC_CPU_INTEL_P6) #define P6F_CL (1 << PMC_CPU_INTEL_CL) #define P6F_PII (1 << PMC_CPU_INTEL_PII) #define P6F_PIII (1 << PMC_CPU_INTEL_PIII) #define P6F_PM (1 << PMC_CPU_INTEL_PM) #define P6F_CTR0 0x0001 #define P6F_CTR1 0x0002 #define P6F_ALL_CPUS (P6F_P6 | P6F_PII | P6F_CL | P6F_PIII | P6F_PM) #define P6F_ALL_CTRS (P6F_CTR0 | P6F_CTR1) #define P6F_ALL (P6F_ALL_CPUS | P6F_ALL_CTRS) #define P6_EVENT_VALID_FOR_CPU(P,CPU) ((P)->pm_flags & (1 << (CPU))) #define P6_EVENT_VALID_FOR_CTR(P,CTR) ((P)->pm_flags & (1 << (CTR))) P6_EVDESCR(DATA_MEM_REFS, 0x43, P6F_ALL, 0x00), P6_EVDESCR(DCU_LINES_IN, 0x45, P6F_ALL, 0x00), P6_EVDESCR(DCU_M_LINES_IN, 0x46, P6F_ALL, 0x00), P6_EVDESCR(DCU_M_LINES_OUT, 0x47, P6F_ALL, 0x00), P6_EVDESCR(DCU_MISS_OUTSTANDING, 0x47, P6F_ALL, 0x00), P6_EVDESCR(IFU_FETCH, 0x80, P6F_ALL, 0x00), P6_EVDESCR(IFU_FETCH_MISS, 0x81, P6F_ALL, 0x00), P6_EVDESCR(ITLB_MISS, 0x85, P6F_ALL, 0x00), P6_EVDESCR(IFU_MEM_STALL, 0x86, P6F_ALL, 0x00), P6_EVDESCR(ILD_STALL, 0x87, P6F_ALL, 0x00), P6_EVDESCR(L2_IFETCH, 0x28, 
P6F_ALL, 0x0F), P6_EVDESCR(L2_LD, 0x29, P6F_ALL, 0x0F), P6_EVDESCR(L2_ST, 0x2A, P6F_ALL, 0x0F), P6_EVDESCR(L2_LINES_IN, 0x24, P6F_ALL, 0x0F), P6_EVDESCR(L2_LINES_OUT, 0x26, P6F_ALL, 0x0F), P6_EVDESCR(L2_M_LINES_INM, 0x25, P6F_ALL, 0x00), P6_EVDESCR(L2_M_LINES_OUTM, 0x27, P6F_ALL, 0x0F), P6_EVDESCR(L2_RQSTS, 0x2E, P6F_ALL, 0x0F), P6_EVDESCR(L2_ADS, 0x21, P6F_ALL, 0x00), P6_EVDESCR(L2_DBUS_BUSY, 0x22, P6F_ALL, 0x00), P6_EVDESCR(L2_DBUS_BUSY_RD, 0x23, P6F_ALL, 0x00), P6_EVDESCR(BUS_DRDY_CLOCKS, 0x62, P6F_ALL, 0x20), P6_EVDESCR(BUS_LOCK_CLOCKS, 0x63, P6F_ALL, 0x20), P6_EVDESCR(BUS_REQ_OUTSTANDING, 0x60, P6F_ALL, 0x00), P6_EVDESCR(BUS_TRAN_BRD, 0x65, P6F_ALL, 0x20), P6_EVDESCR(BUS_TRAN_RFO, 0x66, P6F_ALL, 0x20), P6_EVDESCR(BUS_TRANS_WB, 0x67, P6F_ALL, 0x20), P6_EVDESCR(BUS_TRAN_IFETCH, 0x68, P6F_ALL, 0x20), P6_EVDESCR(BUS_TRAN_INVAL, 0x69, P6F_ALL, 0x20), P6_EVDESCR(BUS_TRAN_PWR, 0x6A, P6F_ALL, 0x20), P6_EVDESCR(BUS_TRANS_P, 0x6B, P6F_ALL, 0x20), P6_EVDESCR(BUS_TRANS_IO, 0x6C, P6F_ALL, 0x20), P6_EVDESCR(BUS_TRAN_DEF, 0x6D, P6F_ALL, 0x20), P6_EVDESCR(BUS_TRAN_BURST, 0x6E, P6F_ALL, 0x20), P6_EVDESCR(BUS_TRAN_ANY, 0x70, P6F_ALL, 0x20), P6_EVDESCR(BUS_TRAN_MEM, 0x6F, P6F_ALL, 0x20), P6_EVDESCR(BUS_DATA_RCV, 0x64, P6F_ALL, 0x00), P6_EVDESCR(BUS_BNR_DRV, 0x61, P6F_ALL, 0x00), P6_EVDESCR(BUS_HIT_DRV, 0x7A, P6F_ALL, 0x00), P6_EVDESCR(BUS_HITM_DRV, 0x7B, P6F_ALL, 0x00), P6_EVDESCR(BUS_SNOOP_STALL, 0x7E, P6F_ALL, 0x00), P6_EVDESCR(FLOPS, 0xC1, P6F_ALL_CPUS | P6F_CTR0, 0x00), P6_EVDESCR(FP_COMPS_OPS_EXE, 0x10, P6F_ALL_CPUS | P6F_CTR0, 0x00), P6_EVDESCR(FP_ASSIST, 0x11, P6F_ALL_CPUS | P6F_CTR1, 0x00), P6_EVDESCR(MUL, 0x12, P6F_ALL_CPUS | P6F_CTR1, 0x00), P6_EVDESCR(DIV, 0x13, P6F_ALL_CPUS | P6F_CTR1, 0x00), P6_EVDESCR(CYCLES_DIV_BUSY, 0x14, P6F_ALL_CPUS | P6F_CTR0, 0x00), P6_EVDESCR(LD_BLOCKS, 0x03, P6F_ALL, 0x00), P6_EVDESCR(SB_DRAINS, 0x04, P6F_ALL, 0x00), P6_EVDESCR(MISALIGN_MEM_REF, 0x05, P6F_ALL, 0x00), P6_EVDESCR(EMON_KNI_PREF_DISPATCHED, 0x07, P6F_PIII | P6F_ALL_CTRS, 0x03), P6_EVDESCR(EMON_KNI_PREF_MISS, 0x4B, P6F_PIII | P6F_ALL_CTRS, 0x03), P6_EVDESCR(INST_RETIRED, 0xC0, P6F_ALL, 0x00), P6_EVDESCR(UOPS_RETIRED, 0xC2, P6F_ALL, 0x00), P6_EVDESCR(INST_DECODED, 0xD0, P6F_ALL, 0x00), P6_EVDESCR(EMON_KNI_INST_RETIRED, 0xD8, P6F_PIII | P6F_ALL_CTRS, 0x01), P6_EVDESCR(EMON_KNI_COMP_INST_RET, 0xD9, P6F_PIII | P6F_ALL_CTRS, 0x01), P6_EVDESCR(HW_INT_RX, 0xC8, P6F_ALL, 0x00), P6_EVDESCR(CYCLES_INT_MASKED, 0xC6, P6F_ALL, 0x00), P6_EVDESCR(CYCLES_INT_PENDING_AND_MASKED, 0xC7, P6F_ALL, 0x00), P6_EVDESCR(BR_INST_RETIRED, 0xC4, P6F_ALL, 0x00), P6_EVDESCR(BR_MISS_PRED_RETIRED, 0xC5, P6F_ALL, 0x00), P6_EVDESCR(BR_TAKEN_RETIRED, 0xC9, P6F_ALL, 0x00), P6_EVDESCR(BR_MISS_PRED_TAKEN_RET, 0xCA, P6F_ALL, 0x00), P6_EVDESCR(BR_INST_DECODED, 0xE0, P6F_ALL, 0x00), P6_EVDESCR(BTB_MISSES, 0xE2, P6F_ALL, 0x00), P6_EVDESCR(BR_BOGUS, 0xE4, P6F_ALL, 0x00), P6_EVDESCR(BACLEARS, 0xE6, P6F_ALL, 0x00), P6_EVDESCR(RESOURCE_STALLS, 0xA2, P6F_ALL, 0x00), P6_EVDESCR(PARTIAL_RAT_STALLS, 0xD2, P6F_ALL, 0x00), P6_EVDESCR(SEGMENT_REG_LOADS, 0x06, P6F_ALL, 0x00), P6_EVDESCR(CPU_CLK_UNHALTED, 0x79, P6F_ALL, 0x00), P6_EVDESCR(MMX_INSTR_EXEC, 0xB0, P6F_ALL_CTRS | P6F_CL | P6F_PII, 0x00), P6_EVDESCR(MMX_SAT_INSTR_EXEC, 0xB1, P6F_ALL_CTRS | P6F_PII | P6F_PIII, 0x00), P6_EVDESCR(MMX_UOPS_EXEC, 0xB2, P6F_ALL_CTRS | P6F_PII | P6F_PIII, 0x0F), P6_EVDESCR(MMX_INSTR_TYPE_EXEC, 0xB3, P6F_ALL_CTRS | P6F_PII | P6F_PIII, 0x3F), P6_EVDESCR(FP_MMX_TRANS, 0xCC, P6F_ALL_CTRS | P6F_PII | P6F_PIII, 0x01), P6_EVDESCR(MMX_ASSIST, 0xCD, P6F_ALL_CTRS | P6F_PII | 
P6F_PIII, 0x00), P6_EVDESCR(MMX_INSTR_RET, 0xCE, P6F_ALL_CTRS | P6F_PII, 0x00), P6_EVDESCR(SEG_RENAME_STALLS, 0xD4, P6F_ALL_CTRS | P6F_PII | P6F_PIII, 0x0F), P6_EVDESCR(SEG_REG_RENAMES, 0xD5, P6F_ALL_CTRS | P6F_PII | P6F_PIII, 0x0F), P6_EVDESCR(RET_SEG_RENAMES, 0xD6, P6F_ALL_CTRS | P6F_PII | P6F_PIII, 0x00), P6_EVDESCR(EMON_EST_TRANS, 0x58, P6F_ALL_CTRS | P6F_PM, 0x02), P6_EVDESCR(EMON_THERMAL_TRIP, 0x59, P6F_ALL_CTRS | P6F_PM, 0x00), P6_EVDESCR(BR_INST_EXEC, 0x88, P6F_ALL_CTRS | P6F_PM, 0x00), P6_EVDESCR(BR_MISSP_EXEC, 0x89, P6F_ALL_CTRS | P6F_PM, 0x00), P6_EVDESCR(BR_BAC_MISSP_EXEC, 0x8A, P6F_ALL_CTRS | P6F_PM, 0x00), P6_EVDESCR(BR_CND_EXEC, 0x8B, P6F_ALL_CTRS | P6F_PM, 0x00), P6_EVDESCR(BR_CND_MISSP_EXEC, 0x8C, P6F_ALL_CTRS | P6F_PM, 0x00), P6_EVDESCR(BR_IND_EXEC, 0x8D, P6F_ALL_CTRS | P6F_PM, 0x00), P6_EVDESCR(BR_IND_MISSP_EXEC, 0x8E, P6F_ALL_CTRS | P6F_PM, 0x00), P6_EVDESCR(BR_RET_EXEC, 0x8F, P6F_ALL_CTRS | P6F_PM, 0x00), P6_EVDESCR(BR_RET_MISSP_EXEC, 0x90, P6F_ALL_CTRS | P6F_PM, 0x00), P6_EVDESCR(BR_RET_BAC_MISSP_EXEC, 0x91, P6F_ALL_CTRS | P6F_PM, 0x00), P6_EVDESCR(BR_CALL_EXEC, 0x92, P6F_ALL_CTRS | P6F_PM, 0x00), P6_EVDESCR(BR_CALL_MISSP_EXEC, 0x93, P6F_ALL_CTRS | P6F_PM, 0x00), P6_EVDESCR(BR_IND_CALL_EXEC, 0x94, P6F_ALL_CTRS | P6F_PM, 0x00), P6_EVDESCR(EMON_SIMD_INSTR_RETIRED, 0xCE, P6F_ALL_CTRS | P6F_PM, 0x00), P6_EVDESCR(EMON_SYNCH_UOPS, 0xD3, P6F_ALL_CTRS | P6F_PM, 0x00), P6_EVDESCR(EMON_ESP_UOPS, 0xD7, P6F_ALL_CTRS | P6F_PM, 0x00), P6_EVDESCR(EMON_FUSED_UOPS_RET, 0xDA, P6F_ALL_CTRS | P6F_PM, 0x03), P6_EVDESCR(EMON_UNFUSION, 0xDB, P6F_ALL_CTRS | P6F_PM, 0x00), P6_EVDESCR(EMON_PREF_RQSTS_UP, 0xF0, P6F_ALL_CTRS | P6F_PM, 0x00), P6_EVDESCR(EMON_PREF_RQSTS_DN, 0xD8, P6F_ALL_CTRS | P6F_PM, 0x00), P6_EVDESCR(EMON_SSE_SSE2_INST_RETIRED, 0xD8, P6F_ALL_CTRS | P6F_PM, 0x03), P6_EVDESCR(EMON_SSE_SSE2_COMP_INST_RETIRED, 0xD9, P6F_ALL_CTRS | P6F_PM, 0x03) #undef P6_EVDESCR }; #define P6_NEVENTS (PMC_EV_P6_LAST - PMC_EV_P6_FIRST + 1) static const struct p6_event_descr * p6_find_event(enum pmc_event ev) { int n; for (n = 0; n < P6_NEVENTS; n++) if (p6_events[n].pm_event == ev) break; if (n == P6_NEVENTS) return NULL; return &p6_events[n]; } /* * Per-CPU data structure for P6 class CPUs * * [common stuff] * [3 struct pmc_hw pointers] * [3 struct pmc_hw structures] */ struct p6_cpu { struct pmc_cpu pc_common; struct pmc_hw *pc_hwpmcs[P6_NPMCS]; struct pmc_hw pc_p6pmcs[P6_NPMCS]; }; static int p6_init(int cpu) { int n; struct p6_cpu *pcs; struct pmc_hw *phw; KASSERT(cpu >= 0 && cpu < mp_ncpus, ("[p6,%d] bad cpu %d", __LINE__, cpu)); PMCDBG(MDP,INI,0,"p6-init cpu=%d", cpu); MALLOC(pcs, struct p6_cpu *, sizeof(struct p6_cpu), M_PMC, M_WAITOK|M_ZERO); if (pcs == NULL) return ENOMEM; phw = pcs->pc_p6pmcs; for (n = 0; n < P6_NPMCS; n++, phw++) { phw->phw_state = PMC_PHW_FLAG_IS_ENABLED | PMC_PHW_CPU_TO_STATE(cpu) | PMC_PHW_INDEX_TO_STATE(n); phw->phw_pmc = NULL; pcs->pc_hwpmcs[n] = phw; } /* Mark the TSC as shareable */ pcs->pc_hwpmcs[0]->phw_state |= PMC_PHW_FLAG_IS_SHAREABLE; pmc_pcpu[cpu] = (struct pmc_cpu *) pcs; return 0; } static int p6_cleanup(int cpu) { struct pmc_cpu *pcs; KASSERT(cpu >= 0 && cpu < mp_ncpus, ("[p6,%d] bad cpu %d", __LINE__, cpu)); PMCDBG(MDP,INI,0,"p6-cleanup cpu=%d", cpu); if ((pcs = pmc_pcpu[cpu]) != NULL) FREE(pcs, M_PMC); pmc_pcpu[cpu] = NULL; return 0; } static int p6_switch_in(struct pmc_cpu *pc, struct pmc_process *pp) { (void) pc; PMCDBG(MDP,SWI,1, "pc=%p pp=%p enable-msr=%d", pc, pp, pp->pp_flags & PMC_PP_ENABLE_MSR_ACCESS); /* allow the RDPMC instruction if needed */ 
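/* CR4.PCE allows RDPMC to be executed at any privilege level. */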
if (pp->pp_flags & PMC_PP_ENABLE_MSR_ACCESS) load_cr4(rcr4() | CR4_PCE); PMCDBG(MDP,SWI,1, "cr4=0x%x", rcr4()); return 0; } static int p6_switch_out(struct pmc_cpu *pc, struct pmc_process *pp) { (void) pc; (void) pp; /* can be NULL */ PMCDBG(MDP,SWO,1, "pc=%p pp=%p cr4=0x%x", pc, pp, rcr4()); /* always turn off the RDPMC instruction */ load_cr4(rcr4() & ~CR4_PCE); return 0; } static int p6_read_pmc(int cpu, int ri, pmc_value_t *v) { struct pmc_hw *phw; struct pmc *pm; struct p6pmc_descr *pd; pmc_value_t tmp; phw = pmc_pcpu[cpu]->pc_hwpmcs[ri]; pm = phw->phw_pmc; pd = &p6_pmcdesc[ri]; KASSERT(pm, ("[p6,%d] cpu %d ri %d pmc not configured", __LINE__, cpu, ri)); if (pd->pm_descr.pd_class == PMC_CLASS_TSC) return 0; tmp = rdmsr(pd->pm_pmc_msr) & P6_PERFCTR_MASK; if (PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm))) *v = -tmp; else *v = tmp; PMCDBG(MDP,REA,1, "p6-read cpu=%d ri=%d msr=0x%x -> v=%jx", cpu, ri, pd->pm_pmc_msr, *v); return 0; } static int p6_write_pmc(int cpu, int ri, pmc_value_t v) { struct pmc_hw *phw; struct pmc *pm; struct p6pmc_descr *pd; phw = pmc_pcpu[cpu]->pc_hwpmcs[ri]; pm = phw->phw_pmc; pd = &p6_pmcdesc[ri]; KASSERT(pm, ("[p6,%d] cpu %d ri %d pmc not configured", __LINE__, cpu, ri)); if (pd->pm_descr.pd_class == PMC_CLASS_TSC) return 0; PMCDBG(MDP,WRI,1, "p6-write cpu=%d ri=%d msr=0x%x v=%jx", cpu, ri, pd->pm_pmc_msr, v); if (PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm))) v = -v; wrmsr(pd->pm_pmc_msr, v & P6_PERFCTR_MASK); return 0; } static int p6_config_pmc(int cpu, int ri, struct pmc *pm) { struct pmc_hw *phw; PMCDBG(MDP,CFG,1, "p6-config cpu=%d ri=%d pm=%p", cpu, ri, pm); phw = pmc_pcpu[cpu]->pc_hwpmcs[ri]; phw->phw_pmc = pm; return 0; } /* * Retrieve a configured PMC pointer from hardware state. */ static int p6_get_config(int cpu, int ri, struct pmc **ppm) { *ppm = pmc_pcpu[cpu]->pc_hwpmcs[ri]->phw_pmc; return 0; } /* * A pmc may be allocated to a given row index if: * - the event is valid for this CPU * - the event is valid for this counter index */ static int p6_allocate_pmc(int cpu, int ri, struct pmc *pm, const struct pmc_op_pmcallocate *a) { uint32_t allowed_unitmask, caps, config, unitmask; const struct p6pmc_descr *pd; const struct p6_event_descr *pevent; enum pmc_event ev; (void) cpu; KASSERT(cpu >= 0 && cpu < mp_ncpus, ("[p4,%d] illegal CPU %d", __LINE__, cpu)); KASSERT(ri >= 0 && ri < P6_NPMCS, ("[p4,%d] illegal row-index value %d", __LINE__, ri)); pd = &p6_pmcdesc[ri]; PMCDBG(MDP,ALL,1, "p6-allocate ri=%d class=%d pmccaps=0x%x " "reqcaps=0x%x", ri, pd->pm_descr.pd_class, pd->pm_descr.pd_caps, pm->pm_caps); /* check class */ if (pd->pm_descr.pd_class != a->pm_class) return EINVAL; /* check requested capabilities */ caps = a->pm_caps; if ((pd->pm_descr.pd_caps & caps) != caps) return EPERM; if (pd->pm_descr.pd_class == PMC_CLASS_TSC) { /* TSC's are always allocated in system-wide counting mode */ if (a->pm_ev != PMC_EV_TSC_TSC || a->pm_mode != PMC_MODE_SC) return EINVAL; return 0; } /* * P6 class events */ ev = pm->pm_event; if (ev < PMC_EV_P6_FIRST || ev > PMC_EV_P6_LAST) return EINVAL; if ((pevent = p6_find_event(ev)) == NULL) return ESRCH; if (!P6_EVENT_VALID_FOR_CPU(pevent, p6_cputype) || !P6_EVENT_VALID_FOR_CTR(pevent, (ri-1))) return EINVAL; /* For certain events, Pentium M differs from the stock P6 */ allowed_unitmask = 0; if (p6_cputype == PMC_CPU_INTEL_PM) { if (ev == PMC_EV_P6_L2_LD || ev == PMC_EV_P6_L2_LINES_IN || ev == PMC_EV_P6_L2_LINES_OUT) allowed_unitmask = P6_EVSEL_TO_UMASK(0x3F); else if (ev == PMC_EV_P6_L2_M_LINES_OUTM) allowed_unitmask = 
P6_EVSEL_TO_UMASK(0x30); } else allowed_unitmask = P6_EVSEL_TO_UMASK(pevent->pm_unitmask); unitmask = a->pm_p6_config & P6_EVSEL_UMASK_MASK; if (unitmask & ~allowed_unitmask) /* disallow reserved bits */ return EINVAL; if (ev == PMC_EV_P6_MMX_UOPS_EXEC) /* hardcoded mask */ unitmask = P6_EVSEL_TO_UMASK(0x0F); config = 0; config |= P6_EVSEL_EVENT_SELECT(pevent->pm_evsel); if (unitmask & (caps & PMC_CAP_QUALIFIER)) config |= unitmask; if (caps & PMC_CAP_THRESHOLD) config |= a->pm_p6_config & P6_EVSEL_CMASK_MASK; /* set at least one of the 'usr' or 'os' caps */ if (caps & PMC_CAP_USER) config |= P6_EVSEL_USR; if (caps & PMC_CAP_SYSTEM) config |= P6_EVSEL_OS; if ((caps & (PMC_CAP_USER|PMC_CAP_SYSTEM)) == 0) config |= (P6_EVSEL_USR|P6_EVSEL_OS); if (caps & PMC_CAP_EDGE) config |= P6_EVSEL_E; if (caps & PMC_CAP_INVERT) config |= P6_EVSEL_INV; if (caps & PMC_CAP_INTERRUPT) config |= P6_EVSEL_INT; pm->pm_md.pm_p6.pm_p6_evsel = config; PMCDBG(MDP,ALL,2, "p6-allocate config=0x%x", config); return 0; } static int p6_release_pmc(int cpu, int ri, struct pmc *pm) { struct pmc_hw *phw; (void) pm; PMCDBG(MDP,REL,1, "p6-release cpu=%d ri=%d pm=%p", cpu, ri, pm); KASSERT(cpu >= 0 && cpu < mp_ncpus, ("[p6,%d] illegal CPU value %d", __LINE__, cpu)); KASSERT(ri >= 0 && ri < P6_NPMCS, ("[p6,%d] illegal row-index %d", __LINE__, ri)); phw = pmc_pcpu[cpu]->pc_hwpmcs[ri]; KASSERT(phw->phw_pmc == NULL, ("[p6,%d] PHW pmc %p != pmc %p", __LINE__, phw->phw_pmc, pm)); return 0; } static int p6_start_pmc(int cpu, int ri) { uint32_t config; struct pmc *pm; struct pmc_hw *phw; const struct p6pmc_descr *pd; KASSERT(cpu >= 0 && cpu < mp_ncpus, ("[p6,%d] illegal CPU value %d", __LINE__, cpu)); KASSERT(ri >= 0 && ri < P6_NPMCS, ("[p6,%d] illegal row-index %d", __LINE__, ri)); phw = pmc_pcpu[cpu]->pc_hwpmcs[ri]; pm = phw->phw_pmc; pd = &p6_pmcdesc[ri]; KASSERT(pm, ("[p6,%d] starting cpu%d,ri%d with no pmc configured", __LINE__, cpu, ri)); PMCDBG(MDP,STA,1, "p6-start cpu=%d ri=%d", cpu, ri); if (pd->pm_descr.pd_class == PMC_CLASS_TSC) return 0; /* TSC are always running */ KASSERT(pd->pm_descr.pd_class == PMC_CLASS_P6, ("[p6,%d] unknown PMC class %d", __LINE__, pd->pm_descr.pd_class)); config = pm->pm_md.pm_p6.pm_p6_evsel; PMCDBG(MDP,STA,2, "p6-start/2 cpu=%d ri=%d evselmsr=0x%x config=0x%x", cpu, ri, pd->pm_evsel_msr, config); if (pd->pm_evsel_msr == P6_MSR_EVSEL0) /* CTR 0 */ wrmsr(pd->pm_evsel_msr, config | P6_EVSEL_EN); else { /* CTR1 shares the enable bit CTR 0 */ wrmsr(pd->pm_evsel_msr, config); wrmsr(P6_MSR_EVSEL0, rdmsr(P6_MSR_EVSEL0) | P6_EVSEL_EN); } return 0; } static int p6_stop_pmc(int cpu, int ri) { uint32_t config; struct pmc *pm; struct pmc_hw *phw; struct p6pmc_descr *pd; KASSERT(cpu >= 0 && cpu < mp_ncpus, ("[p6,%d] illegal cpu value %d", __LINE__, cpu)); KASSERT(ri >= 0 && ri < P6_NPMCS, ("[p6,%d] illegal row index %d", __LINE__, ri)); phw = pmc_pcpu[cpu]->pc_hwpmcs[ri]; pm = phw->phw_pmc; pd = &p6_pmcdesc[ri]; KASSERT(pm, ("[p6,%d] cpu%d ri%d no configured PMC to stop", __LINE__, cpu, ri)); if (pd->pm_descr.pd_class == PMC_CLASS_TSC) return 0; KASSERT(pd->pm_descr.pd_class == PMC_CLASS_P6, ("[p6,%d] unknown PMC class %d", __LINE__, pd->pm_descr.pd_class)); PMCDBG(MDP,STO,1, "p6-stop cpu=%d ri=%d", cpu, ri); /* * If CTR0 is being turned off but CTR1 is active, we need * leave CTR0's EN field set. If CTR1 is being stopped, it * suffices to zero its EVSEL register. 
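 *
 * (On the P6, only EVSEL0 carries the EN bit and it gates both
 * counters, which is why stopping CTR1 only needs its EVSEL zeroed.)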
*/ if (ri == 1 && pmc_pcpu[cpu]->pc_hwpmcs[2]->phw_pmc != NULL) config = P6_EVSEL_EN; else config = 0; wrmsr(pd->pm_evsel_msr, config); PMCDBG(MDP,STO,2, "p6-stop/2 cpu=%d ri=%d config=0x%x", cpu, ri, config); return 0; } static int -p6_intr(int cpu, uintptr_t eip) +p6_intr(int cpu, uintptr_t eip, int usermode) { (void) cpu; (void) eip; return 0; } static int p6_describe(int cpu, int ri, struct pmc_info *pi, struct pmc **ppmc) { int error; size_t copied; struct pmc_hw *phw; struct p6pmc_descr *pd; phw = pmc_pcpu[cpu]->pc_hwpmcs[ri]; pd = &p6_pmcdesc[ri]; if ((error = copystr(pd->pm_descr.pd_name, pi->pm_name, PMC_NAME_MAX, &copied)) != 0) return error; pi->pm_class = pd->pm_descr.pd_class; if (phw->phw_state & PMC_PHW_FLAG_IS_ENABLED) { pi->pm_enabled = TRUE; *ppmc = phw->phw_pmc; } else { pi->pm_enabled = FALSE; *ppmc = NULL; } return 0; } static int p6_get_msr(int ri, uint32_t *msr) { KASSERT(ri >= 0 && ri < P6_NPMCS, ("[p6,%d ri %d out of range", __LINE__, ri)); *msr = p6_pmcdesc[ri].pm_pmc_msr - P6_MSR_PERFCTR0; return 0; } int pmc_initialize_p6(struct pmc_mdep *pmc_mdep) { KASSERT(strcmp(cpu_vendor, "GenuineIntel") == 0, ("[p6,%d] Initializing non-intel processor", __LINE__)); PMCDBG(MDP,INI,1, "%s", "p6-initialize"); switch (pmc_mdep->pmd_cputype) { /* * P6 Family Processors */ case PMC_CPU_INTEL_P6: case PMC_CPU_INTEL_CL: case PMC_CPU_INTEL_PII: case PMC_CPU_INTEL_PIII: case PMC_CPU_INTEL_PM: p6_cputype = pmc_mdep->pmd_cputype; pmc_mdep->pmd_npmc = P6_NPMCS; pmc_mdep->pmd_classes[1].pm_class = PMC_CLASS_P6; pmc_mdep->pmd_classes[1].pm_caps = P6_PMC_CAPS; pmc_mdep->pmd_classes[1].pm_width = 40; pmc_mdep->pmd_nclasspmcs[1] = 2; pmc_mdep->pmd_init = p6_init; pmc_mdep->pmd_cleanup = p6_cleanup; pmc_mdep->pmd_switch_in = p6_switch_in; pmc_mdep->pmd_switch_out = p6_switch_out; pmc_mdep->pmd_read_pmc = p6_read_pmc; pmc_mdep->pmd_write_pmc = p6_write_pmc; pmc_mdep->pmd_config_pmc = p6_config_pmc; pmc_mdep->pmd_get_config = p6_get_config; pmc_mdep->pmd_allocate_pmc = p6_allocate_pmc; pmc_mdep->pmd_release_pmc = p6_release_pmc; pmc_mdep->pmd_start_pmc = p6_start_pmc; pmc_mdep->pmd_stop_pmc = p6_stop_pmc; pmc_mdep->pmd_intr = p6_intr; pmc_mdep->pmd_describe = p6_describe; pmc_mdep->pmd_get_msr = p6_get_msr; /* i386 */ break; default: KASSERT(0,("[p6,%d] Unknown CPU type", __LINE__)); return ENOSYS; } return 0; } diff --git a/sys/i386/i386/trap.c b/sys/i386/i386/trap.c index 2bca06889a93..bb02052c3b5c 100644 --- a/sys/i386/i386/trap.c +++ b/sys/i386/i386/trap.c @@ -1,1033 +1,1050 @@ /*- * Copyright (C) 1994, David Greenman * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the University of Utah, and William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)trap.c 7.4 (Berkeley) 5/13/91 */ #include __FBSDID("$FreeBSD$"); /* * 386 Trap and System call handling */ #include "opt_clock.h" #include "opt_cpu.h" #include "opt_isa.h" #include "opt_ktrace.h" #include "opt_npx.h" #include "opt_trap.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif +#ifdef HWPMC_HOOKS +#include +#endif #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #include #include #ifdef POWERFAIL_NMI #include #include #endif extern void trap(struct trapframe frame); extern void syscall(struct trapframe frame); static int trap_pfault(struct trapframe *, int, vm_offset_t); static void trap_fatal(struct trapframe *, vm_offset_t); void dblfault_handler(void); extern inthand_t IDTVEC(lcall_syscall); #define MAX_TRAP_MSG 28 static char *trap_msg[] = { "", /* 0 unused */ "privileged instruction fault", /* 1 T_PRIVINFLT */ "", /* 2 unused */ "breakpoint instruction fault", /* 3 T_BPTFLT */ "", /* 4 unused */ "", /* 5 unused */ "arithmetic trap", /* 6 T_ARITHTRAP */ "", /* 7 unused */ "", /* 8 unused */ "general protection fault", /* 9 T_PROTFLT */ "trace trap", /* 10 T_TRCTRAP */ "", /* 11 unused */ "page fault", /* 12 T_PAGEFLT */ "", /* 13 unused */ "alignment fault", /* 14 T_ALIGNFLT */ "", /* 15 unused */ "", /* 16 unused */ "", /* 17 unused */ "integer divide fault", /* 18 T_DIVIDE */ "non-maskable interrupt trap", /* 19 T_NMI */ "overflow trap", /* 20 T_OFLOW */ "FPU bounds check fault", /* 21 T_BOUND */ "FPU device not available", /* 22 T_DNA */ "double fault", /* 23 T_DOUBLEFLT */ "FPU operand fetch fault", /* 24 T_FPOPFLT */ "invalid TSS fault", /* 25 T_TSSFLT */ "segment not present fault", /* 26 T_SEGNPFLT */ "stack fault", /* 27 T_STKFLT */ "machine check trap", /* 28 T_MCHK */ }; #if defined(I586_CPU) && !defined(NO_F00F_HACK) extern int has_f00f_bug; #endif #ifdef KDB static int kdb_on_nmi = 1; SYSCTL_INT(_machdep, OID_AUTO, kdb_on_nmi, CTLFLAG_RW, &kdb_on_nmi, 0, "Go to KDB on NMI"); #endif static int panic_on_nmi = 1; SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RW, &panic_on_nmi, 0, "Panic on NMI"); #ifdef WITNESS extern char *syscallnames[]; #endif #ifdef DEVICE_POLLING extern u_int32_t poll_in_trap; extern int ether_poll(int count); #endif /* DEVICE_POLLING */ /* * Exception, fault, and trap interface to the FreeBSD kernel. 
* This common code is called from assembly language IDT gate entry * routines that prepare a suitable stack frame, and restore this * frame after the exception has been processed. */ void trap(frame) struct trapframe frame; { struct thread *td = curthread; struct proc *p = td->td_proc; u_int sticks = 0; int i = 0, ucode = 0, type, code; vm_offset_t eva; #ifdef POWERFAIL_NMI static int lastalert = 0; #endif PCPU_LAZY_INC(cnt.v_trap); type = frame.tf_trapno; #ifdef KDB_STOP_NMI /* Handler for NMI IPIs used for debugging */ if (type == T_NMI) { if (ipi_nmi_handler() == 0) goto out; } #endif /* KDB_STOP_NMI */ #ifdef KDB if (kdb_active) { kdb_reenter(); goto out; } #endif +#ifdef HWPMC_HOOKS + /* + * CPU PMCs interrupt using an NMI so we check for that first. + * If the HWPMC module is active, 'pmc_intr' will point to + * the function to be called. A return value of '1' from the + * hook means that the NMI was handled by it and that we can + * return immediately. + */ + if (type == T_NMI && pmc_intr && + (*pmc_intr)(PCPU_GET(cpuid), (uintptr_t) frame.tf_eip, + TRAPF_USERMODE(&frame))) + goto out; +#endif + if ((frame.tf_eflags & PSL_I) == 0) { /* * Buggy application or kernel code has disabled * interrupts and then trapped. Enabling interrupts * now is wrong, but it is better than running with * interrupts disabled until they are accidentally * enabled later. */ if (ISPL(frame.tf_cs) == SEL_UPL || (frame.tf_eflags & PSL_VM)) printf( "pid %ld (%s): trap %d with interrupts disabled\n", (long)curproc->p_pid, curproc->p_comm, type); else if (type != T_BPTFLT && type != T_TRCTRAP && frame.tf_eip != (int)cpu_switch_load_gs) { /* * XXX not quite right, since this may be for a * multiple fault in user mode. */ printf("kernel trap %d with interrupts disabled\n", type); /* * Page faults need interrupts disabled until later, * and we shouldn't enable interrupts while in a * critical section. */ if (type != T_PAGEFLT && td->td_critnest == 0) enable_intr(); } } eva = 0; code = frame.tf_err; if (type == T_PAGEFLT) { /* * For some Cyrix CPUs, %cr2 is clobbered by * interrupts. This problem is worked around by using * an interrupt gate for the pagefault handler. We * are finally ready to read %cr2 and then must * reenable interrupts. * * If we get a page fault while in a critical section, then * it is most likely a fatal kernel page fault. The kernel * is already going to panic trying to get a sleep lock to * do the VM lookup, so just consider it a fatal trap so the * kernel can print out a useful trap message and even get * to the debugger. */ eva = rcr2(); if (td->td_critnest == 0) enable_intr(); else trap_fatal(&frame, eva); } #ifdef DEVICE_POLLING if (poll_in_trap) ether_poll(poll_in_trap); #endif /* DEVICE_POLLING */ if ((ISPL(frame.tf_cs) == SEL_UPL) || ((frame.tf_eflags & PSL_VM) && !(PCPU_GET(curpcb)->pcb_flags & PCB_VM86CALL))) { /* user trap */ sticks = td->td_sticks; td->td_frame = &frame; if (td->td_ucred != p->p_ucred) cred_update_thread(td); switch (type) { case T_PRIVINFLT: /* privileged instruction fault */ ucode = type; i = SIGILL; break; case T_BPTFLT: /* bpt instruction fault */ case T_TRCTRAP: /* trace trap */ enable_intr(); frame.tf_eflags &= ~PSL_T; i = SIGTRAP; break; case T_ARITHTRAP: /* arithmetic trap */ #ifdef DEV_NPX ucode = npxtrap(); if (ucode == -1) goto userout; #else ucode = code; #endif i = SIGFPE; break; /* * The following two traps can happen in * vm86 mode, and, if so, we want to handle * them specially. 
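 *
 * (Editorial aside -- not part of the committed patch: in vm86 mode these
 *  faults are first offered to the vm86 emulator, roughly
 *
 *      i = vm86_emulate((struct vm86frame *)&frame);
 *      if (i == 0)
 *              goto user;      -- instruction emulated, resume the vm86 task
 *
 *  and only a non-zero return value, the signal number chosen by
 *  vm86_emulate(), escapes the switch and is posted to the process
 *  through trapsignal().)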
*/ case T_PROTFLT: /* general protection fault */ case T_STKFLT: /* stack fault */ if (frame.tf_eflags & PSL_VM) { i = vm86_emulate((struct vm86frame *)&frame); if (i == 0) goto user; break; } /* FALLTHROUGH */ case T_SEGNPFLT: /* segment not present fault */ case T_TSSFLT: /* invalid TSS fault */ case T_DOUBLEFLT: /* double fault */ default: ucode = code + BUS_SEGM_FAULT ; i = SIGBUS; break; case T_PAGEFLT: /* page fault */ if (td->td_pflags & TDP_SA) thread_user_enter(td); i = trap_pfault(&frame, TRUE, eva); #if defined(I586_CPU) && !defined(NO_F00F_HACK) if (i == -2) { /* * The f00f hack workaround has triggered, so * treat the fault as an illegal instruction * (T_PRIVINFLT) instead of a page fault. */ type = frame.tf_trapno = T_PRIVINFLT; /* Proceed as in that case. */ ucode = type; i = SIGILL; break; } #endif if (i == -1) goto userout; if (i == 0) goto user; ucode = T_PAGEFLT; break; case T_DIVIDE: /* integer divide fault */ ucode = FPE_INTDIV; i = SIGFPE; break; #ifdef DEV_ISA case T_NMI: #ifdef POWERFAIL_NMI #ifndef TIMER_FREQ # define TIMER_FREQ 1193182 #endif mtx_lock(&Giant); if (time_second - lastalert > 10) { log(LOG_WARNING, "NMI: power fail\n"); sysbeep(TIMER_FREQ/880, hz); lastalert = time_second; } mtx_unlock(&Giant); goto userout; #else /* !POWERFAIL_NMI */ /* machine/parity/power fail/"kitchen sink" faults */ /* XXX Giant */ if (isa_nmi(code) == 0) { #ifdef KDB /* * NMI can be hooked up to a pushbutton * for debugging. */ if (kdb_on_nmi) { printf ("NMI ... going to debugger\n"); kdb_trap(type, 0, &frame); } #endif /* KDB */ goto userout; } else if (panic_on_nmi) panic("NMI indicates hardware failure"); break; #endif /* POWERFAIL_NMI */ #endif /* DEV_ISA */ case T_OFLOW: /* integer overflow fault */ ucode = FPE_INTOVF; i = SIGFPE; break; case T_BOUND: /* bounds check fault */ ucode = FPE_FLTSUB; i = SIGFPE; break; case T_DNA: #ifdef DEV_NPX /* transparent fault (due to context switch "late") */ if (npxdna()) goto userout; #endif i = SIGFPE; ucode = FPE_FPU_NP_TRAP; break; case T_FPOPFLT: /* FPU operand fetch fault */ ucode = T_FPOPFLT; i = SIGILL; break; case T_XMMFLT: /* SIMD floating-point exception */ ucode = 0; /* XXX */ i = SIGFPE; break; } } else { /* kernel trap */ KASSERT(cold || td->td_ucred != NULL, ("kernel trap doesn't have ucred")); switch (type) { case T_PAGEFLT: /* page fault */ (void) trap_pfault(&frame, FALSE, eva); goto out; case T_DNA: #ifdef DEV_NPX /* * The kernel is apparently using npx for copying. * XXX this should be fatal unless the kernel has * registered such use. */ if (npxdna()) goto out; #endif break; /* * The following two traps can happen in * vm86 mode, and, if so, we want to handle * them specially. */ case T_PROTFLT: /* general protection fault */ case T_STKFLT: /* stack fault */ if (frame.tf_eflags & PSL_VM) { i = vm86_emulate((struct vm86frame *)&frame); if (i != 0) /* * returns to original process */ vm86_trap((struct vm86frame *)&frame); goto out; } if (type == T_STKFLT) break; /* FALL THROUGH */ case T_SEGNPFLT: /* segment not present fault */ if (PCPU_GET(curpcb)->pcb_flags & PCB_VM86CALL) break; /* * Invalid %fs's and %gs's can be created using * procfs or PT_SETREGS or by invalidating the * underlying LDT entry. This causes a fault * in kernel mode when the kernel attempts to * switch contexts. Lose the bad context * (XXX) so that we can continue, and generate * a signal. 
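 *
 * (Editorial example -- not part of the original comment: a debugger
 *  calling ptrace(PT_SETREGS, ...) with r_gs set to a selector whose
 *  underlying LDT entry has since been invalidated creates exactly this
 *  situation; "losing" the context below simply means clearing pcb_gs,
 *  so the next reload of %gs uses a null selector instead of faulting
 *  again.)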
*/ if (frame.tf_eip == (int)cpu_switch_load_gs) { PCPU_GET(curpcb)->pcb_gs = 0; #if 0 PROC_LOCK(p); psignal(p, SIGBUS); PROC_UNLOCK(p); #endif goto out; } if (td->td_intr_nesting_level != 0) break; /* * Invalid segment selectors and out of bounds * %eip's and %esp's can be set up in user mode. * This causes a fault in kernel mode when the * kernel tries to return to user mode. We want * to get this fault so that we can fix the * problem here and not have to check all the * selectors and pointers when the user changes * them. */ if (frame.tf_eip == (int)doreti_iret) { frame.tf_eip = (int)doreti_iret_fault; goto out; } if (frame.tf_eip == (int)doreti_popl_ds) { frame.tf_eip = (int)doreti_popl_ds_fault; goto out; } if (frame.tf_eip == (int)doreti_popl_es) { frame.tf_eip = (int)doreti_popl_es_fault; goto out; } if (frame.tf_eip == (int)doreti_popl_fs) { frame.tf_eip = (int)doreti_popl_fs_fault; goto out; } if (PCPU_GET(curpcb)->pcb_onfault != NULL) { frame.tf_eip = (int)PCPU_GET(curpcb)->pcb_onfault; goto out; } break; case T_TSSFLT: /* * PSL_NT can be set in user mode and isn't cleared * automatically when the kernel is entered. This * causes a TSS fault when the kernel attempts to * `iret' because the TSS link is uninitialized. We * want to get this fault so that we can fix the * problem here and not every time the kernel is * entered. */ if (frame.tf_eflags & PSL_NT) { frame.tf_eflags &= ~PSL_NT; goto out; } break; case T_TRCTRAP: /* trace trap */ if (frame.tf_eip == (int)IDTVEC(lcall_syscall)) { /* * We've just entered system mode via the * syscall lcall. Continue single stepping * silently until the syscall handler has * saved the flags. */ goto out; } if (frame.tf_eip == (int)IDTVEC(lcall_syscall) + 1) { /* * The syscall handler has now saved the * flags. Stop single stepping it. */ frame.tf_eflags &= ~PSL_T; goto out; } /* * Ignore debug register trace traps due to * accesses in the user's address space, which * can happen under several conditions such as * if a user sets a watchpoint on a buffer and * then passes that buffer to a system call. * We still want to get TRCTRAPS for addresses * in kernel space because that is useful when * debugging the kernel. */ /* XXX Giant */ if (user_dbreg_trap() && !(PCPU_GET(curpcb)->pcb_flags & PCB_VM86CALL)) { /* * Reset breakpoint bits because the * processor doesn't */ load_dr6(rdr6() & 0xfffffff0); goto out; } /* * FALLTHROUGH (TRCTRAP kernel mode, kernel address) */ case T_BPTFLT: /* * If KDB is enabled, let it handle the debugger trap. * Otherwise, debugger traps "can't happen". */ #ifdef KDB /* XXX Giant */ if (kdb_trap(type, 0, &frame)) goto out; #endif break; #ifdef DEV_ISA case T_NMI: #ifdef POWERFAIL_NMI mtx_lock(&Giant); if (time_second - lastalert > 10) { log(LOG_WARNING, "NMI: power fail\n"); sysbeep(TIMER_FREQ/880, hz); lastalert = time_second; } mtx_unlock(&Giant); goto out; #else /* !POWERFAIL_NMI */ /* XXX Giant */ /* machine/parity/power fail/"kitchen sink" faults */ if (isa_nmi(code) == 0) { #ifdef KDB /* * NMI can be hooked up to a pushbutton * for debugging. */ if (kdb_on_nmi) { printf ("NMI ... going to debugger\n"); kdb_trap(type, 0, &frame); } #endif /* KDB */ goto out; } else if (panic_on_nmi == 0) goto out; /* FALLTHROUGH */ #endif /* POWERFAIL_NMI */ #endif /* DEV_ISA */ } trap_fatal(&frame, eva); goto out; } /* Translate fault for emulators (e.g. 
Linux) */ if (*p->p_sysent->sv_transtrap) i = (*p->p_sysent->sv_transtrap)(i, type); trapsignal(td, i, ucode); #ifdef DEBUG if (type <= MAX_TRAP_MSG) { uprintf("fatal process exception: %s", trap_msg[type]); if ((type == T_PAGEFLT) || (type == T_PROTFLT)) uprintf(", fault VA = 0x%lx", (u_long)eva); uprintf("\n"); } #endif user: userret(td, &frame, sticks); mtx_assert(&Giant, MA_NOTOWNED); userout: out: return; } static int trap_pfault(frame, usermode, eva) struct trapframe *frame; int usermode; vm_offset_t eva; { vm_offset_t va; struct vmspace *vm = NULL; vm_map_t map = 0; int rv = 0; vm_prot_t ftype; struct thread *td = curthread; struct proc *p = td->td_proc; va = trunc_page(eva); if (va >= KERNBASE) { /* * Don't allow user-mode faults in kernel address space. * An exception: if the faulting address is the invalid * instruction entry in the IDT, then the Intel Pentium * F00F bug workaround was triggered, and we need to * treat it is as an illegal instruction, and not a page * fault. */ #if defined(I586_CPU) && !defined(NO_F00F_HACK) if ((eva == (unsigned int)&idt[6]) && has_f00f_bug) return -2; #endif if (usermode) goto nogo; map = kernel_map; } else { /* * This is a fault on non-kernel virtual memory. * vm is initialized above to NULL. If curproc is NULL * or curproc->p_vmspace is NULL the fault is fatal. */ if (p != NULL) vm = p->p_vmspace; if (vm == NULL) goto nogo; map = &vm->vm_map; } if (frame->tf_err & PGEX_W) ftype = VM_PROT_WRITE; else ftype = VM_PROT_READ; if (map != kernel_map) { /* * Keep swapout from messing with us during this * critical time. */ PROC_LOCK(p); ++p->p_lock; PROC_UNLOCK(p); /* Fault in the user page: */ rv = vm_fault(map, va, ftype, (ftype & VM_PROT_WRITE) ? VM_FAULT_DIRTY : VM_FAULT_NORMAL); PROC_LOCK(p); --p->p_lock; PROC_UNLOCK(p); } else { /* * Don't have to worry about process locking or stacks in the * kernel. */ rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL); } if (rv == KERN_SUCCESS) return (0); nogo: if (!usermode) { if (td->td_intr_nesting_level == 0 && PCPU_GET(curpcb)->pcb_onfault != NULL) { frame->tf_eip = (int)PCPU_GET(curpcb)->pcb_onfault; return (0); } trap_fatal(frame, eva); return (-1); } /* kludge to pass faulting virtual address to sendsig */ frame->tf_err = eva; return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV); } static void trap_fatal(frame, eva) struct trapframe *frame; vm_offset_t eva; { int code, type, ss, esp; struct soft_segment_descriptor softseg; code = frame->tf_err; type = frame->tf_trapno; sdtossd(&gdt[IDXSEL(frame->tf_cs & 0xffff)].sd, &softseg); if (type <= MAX_TRAP_MSG) printf("\n\nFatal trap %d: %s while in %s mode\n", type, trap_msg[type], frame->tf_eflags & PSL_VM ? "vm86" : ISPL(frame->tf_cs) == SEL_UPL ? "user" : "kernel"); #ifdef SMP /* two separate prints in case of a trap on an unmapped page */ printf("cpuid = %d; ", PCPU_GET(cpuid)); printf("apic id = %02x\n", PCPU_GET(apic_id)); #endif if (type == T_PAGEFLT) { printf("fault virtual address = 0x%x\n", eva); printf("fault code = %s %s, %s\n", code & PGEX_U ? "user" : "supervisor", code & PGEX_W ? "write" : "read", code & PGEX_P ? 
"protection violation" : "page not present"); } printf("instruction pointer = 0x%x:0x%x\n", frame->tf_cs & 0xffff, frame->tf_eip); if ((ISPL(frame->tf_cs) == SEL_UPL) || (frame->tf_eflags & PSL_VM)) { ss = frame->tf_ss & 0xffff; esp = frame->tf_esp; } else { ss = GSEL(GDATA_SEL, SEL_KPL); esp = (int)&frame->tf_esp; } printf("stack pointer = 0x%x:0x%x\n", ss, esp); printf("frame pointer = 0x%x:0x%x\n", ss, frame->tf_ebp); printf("code segment = base 0x%x, limit 0x%x, type 0x%x\n", softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type); printf(" = DPL %d, pres %d, def32 %d, gran %d\n", softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_def32, softseg.ssd_gran); printf("processor eflags = "); if (frame->tf_eflags & PSL_T) printf("trace trap, "); if (frame->tf_eflags & PSL_I) printf("interrupt enabled, "); if (frame->tf_eflags & PSL_NT) printf("nested task, "); if (frame->tf_eflags & PSL_RF) printf("resume, "); if (frame->tf_eflags & PSL_VM) printf("vm86, "); printf("IOPL = %d\n", (frame->tf_eflags & PSL_IOPL) >> 12); printf("current process = "); if (curproc) { printf("%lu (%s)\n", (u_long)curproc->p_pid, curproc->p_comm ? curproc->p_comm : ""); } else { printf("Idle\n"); } #ifdef KDB if ((debugger_on_panic || kdb_active) && kdb_trap(type, 0, frame)) return; #endif printf("trap number = %d\n", type); if (type <= MAX_TRAP_MSG) panic("%s", trap_msg[type]); else panic("unknown/reserved trap"); } /* * Double fault handler. Called when a fault occurs while writing * a frame for a trap/exception onto the stack. This usually occurs * when the stack overflows (such is the case with infinite recursion, * for example). * * XXX Note that the current PTD gets replaced by IdlePTD when the * task switch occurs. This means that the stack that was active at * the time of the double fault is not available at unless * the machine was idle when the double fault occurred. The downside * of this is that "trace " in ddb won't work. */ void dblfault_handler() { printf("\nFatal double fault:\n"); printf("eip = 0x%x\n", PCPU_GET(common_tss.tss_eip)); printf("esp = 0x%x\n", PCPU_GET(common_tss.tss_esp)); printf("ebp = 0x%x\n", PCPU_GET(common_tss.tss_ebp)); #ifdef SMP /* two separate prints in case of a trap on an unmapped page */ printf("cpuid = %d; ", PCPU_GET(cpuid)); printf("apic id = %02x\n", PCPU_GET(apic_id)); #endif panic("double fault"); } /* * syscall - system call request C handler * * A system call is essentially treated as a trap. */ void syscall(frame) struct trapframe frame; { caddr_t params; struct sysent *callp; struct thread *td = curthread; struct proc *p = td->td_proc; register_t orig_tf_eflags; u_int sticks; int error; int narg; int args[8]; u_int code; /* * note: PCPU_LAZY_INC() can only be used if we can afford * occassional inaccuracy in the count. */ PCPU_LAZY_INC(cnt.v_syscall); #ifdef DIAGNOSTIC if (ISPL(frame.tf_cs) != SEL_UPL) { mtx_lock(&Giant); /* try to stabilize the system XXX */ panic("syscall"); /* NOT REACHED */ mtx_unlock(&Giant); } #endif sticks = td->td_sticks; td->td_frame = &frame; if (td->td_ucred != p->p_ucred) cred_update_thread(td); if (p->p_flag & P_SA) thread_user_enter(td); params = (caddr_t)frame.tf_esp + sizeof(int); code = frame.tf_eax; orig_tf_eflags = frame.tf_eflags; if (p->p_sysent->sv_prepsyscall) { /* * The prep code is MP aware. */ (*p->p_sysent->sv_prepsyscall)(&frame, args, &code, ¶ms); } else { /* * Need to check if this is a 32 bit or 64 bit syscall. * fuword is MP aware. */ if (code == SYS_syscall) { /* * Code is first argument, followed by actual args. 
*/ code = fuword(params); params += sizeof(int); } else if (code == SYS___syscall) { /* * Like syscall, but code is a quad, so as to maintain * quad alignment for the rest of the arguments. */ code = fuword(params); params += sizeof(quad_t); } } if (p->p_sysent->sv_mask) code &= p->p_sysent->sv_mask; if (code >= p->p_sysent->sv_size) callp = &p->p_sysent->sv_table[0]; else callp = &p->p_sysent->sv_table[code]; narg = callp->sy_narg & SYF_ARGMASK; /* * copyin and the ktrsyscall()/ktrsysret() code is MP-aware */ if (params != NULL && narg != 0) error = copyin(params, (caddr_t)args, (u_int)(narg * sizeof(int))); else error = 0; #ifdef KTRACE if (KTRPOINT(td, KTR_SYSCALL)) ktrsyscall(code, narg, args); #endif CTR4(KTR_SYSC, "syscall enter thread %p pid %d proc %s code %d", td, td->td_proc->p_pid, td->td_proc->p_comm, code); /* * Try to run the syscall without Giant if the syscall * is MP safe. */ if ((callp->sy_narg & SYF_MPSAFE) == 0) mtx_lock(&Giant); if (error == 0) { td->td_retval[0] = 0; td->td_retval[1] = frame.tf_edx; STOPEVENT(p, S_SCE, narg); PTRACESTOP_SC(p, td, S_PT_SCE); error = (*callp->sy_call)(td, args); } switch (error) { case 0: frame.tf_eax = td->td_retval[0]; frame.tf_edx = td->td_retval[1]; frame.tf_eflags &= ~PSL_C; break; case ERESTART: /* * Reconstruct pc, assuming lcall $X,y is 7 bytes, * int 0x80 is 2 bytes. We saved this in tf_err. */ frame.tf_eip -= frame.tf_err; break; case EJUSTRETURN: break; default: if (p->p_sysent->sv_errsize) { if (error >= p->p_sysent->sv_errsize) error = -1; /* XXX */ else error = p->p_sysent->sv_errtbl[error]; } frame.tf_eax = error; frame.tf_eflags |= PSL_C; break; } /* * Release Giant if we previously set it. */ if ((callp->sy_narg & SYF_MPSAFE) == 0) mtx_unlock(&Giant); /* * Traced syscall. */ if ((orig_tf_eflags & PSL_T) && !(orig_tf_eflags & PSL_VM)) { frame.tf_eflags &= ~PSL_T; trapsignal(td, SIGTRAP, 0); } /* * Handle reschedule and other end-of-syscall issues */ userret(td, &frame, sticks); CTR4(KTR_SYSC, "syscall exit thread %p pid %d proc %s code %d", td, td->td_proc->p_pid, td->td_proc->p_comm, code); #ifdef KTRACE if (KTRPOINT(td, KTR_SYSRET)) ktrsysret(code, error, td->td_retval[0]); #endif /* * This works because errno is findable through the * register set. If we ever support an emulation where this * is not the case, this code will need to be revisited. */ STOPEVENT(p, S_SCX, code); PTRACESTOP_SC(p, td, S_PT_SCX); WITNESS_WARN(WARN_PANIC, NULL, "System call %s returning", (code >= 0 && code < SYS_MAXSYSCALL) ? syscallnames[code] : "???"); mtx_assert(&sched_lock, MA_NOTOWNED); mtx_assert(&Giant, MA_NOTOWNED); } diff --git a/sys/kern/kern_clock.c b/sys/kern/kern_clock.c index 4ac211df140c..070d459cfc1e 100644 --- a/sys/kern/kern_clock.c +++ b/sys/kern/kern_clock.c @@ -1,552 +1,561 @@ /*- * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_clock.c 8.5 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_ntp.h" #include "opt_watchdog.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef GPROF #include #endif +#ifdef HWPMC_HOOKS +#include +#endif + #ifdef DEVICE_POLLING extern void hardclock_device_poll(void); #endif /* DEVICE_POLLING */ static void initclocks(void *dummy); SYSINIT(clocks, SI_SUB_CLOCKS, SI_ORDER_FIRST, initclocks, NULL) /* Some of these don't belong here, but it's easiest to concentrate them. */ long cp_time[CPUSTATES]; SYSCTL_OPAQUE(_kern, OID_AUTO, cp_time, CTLFLAG_RD, &cp_time, sizeof(cp_time), "LU", "CPU time statistics"); #ifdef SW_WATCHDOG #include static int watchdog_ticks; static int watchdog_enabled; static void watchdog_fire(void); static void watchdog_config(void *, u_int, int *); #endif /* SW_WATCHDOG */ /* * Clock handling routines. * * This code is written to operate with two timers that run independently of * each other. * * The main timer, running hz times per second, is used to trigger interval * timers, timeouts and rescheduling as needed. * * The second timer handles kernel and user profiling, * and does resource use estimation. If the second timer is programmable, * it is randomized to avoid aliasing between the two clocks. For example, * the randomization prevents an adversary from always giving up the cpu * just before its quantum expires. Otherwise, it would never accumulate * cpu ticks. The mean frequency of the second timer is stathz. * * If no second timer exists, stathz will be zero; in this case we drive * profiling and statistics off the main clock. This WILL NOT be accurate; * do not do it unless absolutely necessary. * * The statistics clock may (or may not) be run at a higher rate while * profiling. This profile clock runs at profhz. We require that profhz * be an integral multiple of stathz. * * If the statistics clock is running fast, it must be divided by the ratio * profhz/stathz for statistics. (For profiling, every tick counts.) * * Time-of-day is maintained using a "timecounter", which may or may * not be related to the hardware generating the above mentioned * interrupts. 
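 *
 * (Editorial example -- not part of the original comment: with the
 *  illustrative values stathz = 128 and profhz = 1024, initclocks()
 *  below computes psratio = profhz / stathz = 8, i.e. only one
 *  profiling tick in eight is also charged as a statistics tick.
 *  When stathz is 0, hz is used in its place and everything is driven
 *  from the main clock.)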
*/ int stathz; int profhz; int profprocs; int ticks; int psratio; /* * Initialize clock frequencies and start both clocks running. */ /* ARGSUSED*/ static void initclocks(dummy) void *dummy; { register int i; /* * Set divisors to 1 (normal case) and let the machine-specific * code do its bit. */ cpu_initclocks(); /* * Compute profhz/stathz, and fix profhz if needed. */ i = stathz ? stathz : hz; if (profhz == 0) profhz = i; psratio = profhz / i; #ifdef SW_WATCHDOG EVENTHANDLER_REGISTER(watchdog_list, watchdog_config, NULL, 0); #endif } /* * Each time the real-time timer fires, this function is called on all CPUs. * Note that hardclock() calls hardclock_process() for the boot CPU, so only * the other CPUs in the system need to call this function. */ void hardclock_process(frame) register struct clockframe *frame; { struct pstats *pstats; struct thread *td = curthread; struct proc *p = td->td_proc; /* * Run current process's virtual and profile time, as needed. */ mtx_lock_spin_flags(&sched_lock, MTX_QUIET); if (p->p_flag & P_SA) { /* XXXKSE What to do? */ } else { pstats = p->p_stats; if (CLKF_USERMODE(frame) && timevalisset(&pstats->p_timer[ITIMER_VIRTUAL].it_value) && itimerdecr(&pstats->p_timer[ITIMER_VIRTUAL], tick) == 0) { p->p_sflag |= PS_ALRMPEND; td->td_flags |= TDF_ASTPENDING; } if (timevalisset(&pstats->p_timer[ITIMER_PROF].it_value) && itimerdecr(&pstats->p_timer[ITIMER_PROF], tick) == 0) { p->p_sflag |= PS_PROFPEND; td->td_flags |= TDF_ASTPENDING; } } mtx_unlock_spin_flags(&sched_lock, MTX_QUIET); + +#ifdef HWPMC_HOOKS + if (PMC_CPU_HAS_SAMPLES(PCPU_GET(cpuid))) + PMC_CALL_HOOK_UNLOCKED(curthread, PMC_FN_DO_SAMPLES, NULL); +#endif } /* * The real-time timer, interrupting hz times per second. */ void hardclock(frame) register struct clockframe *frame; { int need_softclock = 0; CTR0(KTR_CLK, "hardclock fired"); hardclock_process(frame); tc_ticktock(); /* * If no separate statistics clock is available, run it from here. * * XXX: this only works for UP */ if (stathz == 0) { profclock(frame); statclock(frame); } #ifdef DEVICE_POLLING hardclock_device_poll(); /* this is very short and quick */ #endif /* DEVICE_POLLING */ /* * Process callouts at a very low cpu priority, so we don't keep the * relatively high clock interrupt priority any longer than necessary. */ mtx_lock_spin_flags(&callout_lock, MTX_QUIET); ticks++; if (TAILQ_FIRST(&callwheel[ticks & callwheelmask]) != NULL) { need_softclock = 1; } else if (softticks + 1 == ticks) ++softticks; mtx_unlock_spin_flags(&callout_lock, MTX_QUIET); /* * swi_sched acquires sched_lock, so we don't want to call it with * callout_lock held; incorrect locking order. */ if (need_softclock) swi_sched(softclock_ih, 0); #ifdef SW_WATCHDOG if (watchdog_enabled > 0 && --watchdog_ticks <= 0) watchdog_fire(); #endif /* SW_WATCHDOG */ } /* * Compute number of ticks in the specified amount of time. */ int tvtohz(tv) struct timeval *tv; { register unsigned long ticks; register long sec, usec; /* * If the number of usecs in the whole seconds part of the time * difference fits in a long, then the total number of usecs will * fit in an unsigned long. Compute the total and convert it to * ticks, rounding up and adding 1 to allow for the current tick * to expire. Rounding also depends on unsigned long arithmetic * to avoid overflow. * * Otherwise, if the number of ticks in the whole seconds part of * the time difference fits in a long, then convert the parts to * ticks separately and add, using similar rounding methods and * overflow avoidance. 
This method would work in the previous * case but it is slightly slower and assumes that hz is integral. * * Otherwise, round the time difference down to the maximum * representable value. * * If ints have 32 bits, then the maximum value for any timeout in * 10ms ticks is 248 days. */ sec = tv->tv_sec; usec = tv->tv_usec; if (usec < 0) { sec--; usec += 1000000; } if (sec < 0) { #ifdef DIAGNOSTIC if (usec > 0) { sec++; usec -= 1000000; } printf("tvotohz: negative time difference %ld sec %ld usec\n", sec, usec); #endif ticks = 1; } else if (sec <= LONG_MAX / 1000000) ticks = (sec * 1000000 + (unsigned long)usec + (tick - 1)) / tick + 1; else if (sec <= LONG_MAX / hz) ticks = sec * hz + ((unsigned long)usec + (tick - 1)) / tick + 1; else ticks = LONG_MAX; if (ticks > INT_MAX) ticks = INT_MAX; return ((int)ticks); } /* * Start profiling on a process. * * Kernel profiling passes proc0 which never exits and hence * keeps the profile clock running constantly. */ void startprofclock(p) register struct proc *p; { /* * XXX; Right now sched_lock protects statclock(), but perhaps * it should be protected later on by a time_lock, which would * cover psdiv, etc. as well. */ PROC_LOCK_ASSERT(p, MA_OWNED); if (p->p_flag & P_STOPPROF) return; if ((p->p_flag & P_PROFIL) == 0) { mtx_lock_spin(&sched_lock); p->p_flag |= P_PROFIL; if (++profprocs == 1) cpu_startprofclock(); mtx_unlock_spin(&sched_lock); } } /* * Stop profiling on a process. */ void stopprofclock(p) register struct proc *p; { PROC_LOCK_ASSERT(p, MA_OWNED); if (p->p_flag & P_PROFIL) { if (p->p_profthreads != 0) { p->p_flag |= P_STOPPROF; while (p->p_profthreads != 0) msleep(&p->p_profthreads, &p->p_mtx, PPAUSE, "stopprof", 0); p->p_flag &= ~P_STOPPROF; } if ((p->p_flag & P_PROFIL) == 0) return; mtx_lock_spin(&sched_lock); p->p_flag &= ~P_PROFIL; if (--profprocs == 0) cpu_stopprofclock(); mtx_unlock_spin(&sched_lock); } } /* * Statistics clock. Grab profile sample, and if divider reaches 0, * do process and kernel statistics. Most of the statistics are only * used by user-level statistics programs. The main exceptions are * ke->ke_uticks, p->p_rux.rux_sticks, p->p_rux.rux_iticks, and p->p_estcpu. * This should be called by all active processors. */ void statclock(frame) register struct clockframe *frame; { struct rusage *ru; struct vmspace *vm; struct thread *td; struct proc *p; long rss; td = curthread; p = td->td_proc; mtx_lock_spin_flags(&sched_lock, MTX_QUIET); if (CLKF_USERMODE(frame)) { /* * Charge the time as appropriate. */ if (p->p_flag & P_SA) thread_statclock(1); p->p_rux.rux_uticks++; if (p->p_nice > NZERO) cp_time[CP_NICE]++; else cp_time[CP_USER]++; } else { /* * Came from kernel mode, so we were: * - handling an interrupt, * - doing syscall or trap work on behalf of the current * user process, or * - spinning in the idle loop. * Whichever it is, charge the time as appropriate. * Note that we charge interrupts to the current process, * regardless of whether they are ``for'' that process, * so that we know how much of its real time was spent * in ``non-process'' (i.e., interrupt) work. 
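 *
 * (Editorial summary -- not part of the original comment: taken
 *  together with the user-mode branch above, each statclock tick is
 *  classified roughly as
 *
 *      user mode, p_nice > NZERO        -> cp_time[CP_NICE]
 *      user mode, otherwise             -> cp_time[CP_USER]
 *      interrupt thread / nesting >= 2  -> cp_time[CP_INTR]
 *      idle thread                      -> cp_time[CP_IDLE]
 *      any other kernel work            -> cp_time[CP_SYS]
 *
 *  which is exactly what the code below does.)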
*/ if ((td->td_ithd != NULL) || td->td_intr_nesting_level >= 2) { p->p_rux.rux_iticks++; cp_time[CP_INTR]++; } else { if (p->p_flag & P_SA) thread_statclock(0); td->td_sticks++; p->p_rux.rux_sticks++; if (p != PCPU_GET(idlethread)->td_proc) cp_time[CP_SYS]++; else cp_time[CP_IDLE]++; } } CTR4(KTR_SCHED, "statclock: %p(%s) prio %d stathz %d", td, td->td_proc->p_comm, td->td_priority, (stathz)?stathz:hz); sched_clock(td); /* Update resource usage integrals and maximums. */ MPASS(p->p_stats != NULL); MPASS(p->p_vmspace != NULL); vm = p->p_vmspace; ru = &p->p_stats->p_ru; ru->ru_ixrss += pgtok(vm->vm_tsize); ru->ru_idrss += pgtok(vm->vm_dsize); ru->ru_isrss += pgtok(vm->vm_ssize); rss = pgtok(vmspace_resident_count(vm)); if (ru->ru_maxrss < rss) ru->ru_maxrss = rss; mtx_unlock_spin_flags(&sched_lock, MTX_QUIET); } void profclock(frame) register struct clockframe *frame; { struct thread *td; #ifdef GPROF struct gmonparam *g; int i; #endif td = curthread; if (CLKF_USERMODE(frame)) { /* * Came from user mode; CPU was in user state. * If this process is being profiled, record the tick. * if there is no related user location yet, don't * bother trying to count it. */ if (td->td_proc->p_flag & P_PROFIL) addupc_intr(td, CLKF_PC(frame), 1); } #ifdef GPROF else { /* * Kernel statistics are just like addupc_intr, only easier. */ g = &_gmonparam; if (g->state == GMON_PROF_ON) { i = CLKF_PC(frame) - g->lowpc; if (i < g->textsize) { i /= HISTFRACTION * sizeof(*g->kcount); g->kcount[i]++; } } } #endif } /* * Return information about system clocks. */ static int sysctl_kern_clockrate(SYSCTL_HANDLER_ARGS) { struct clockinfo clkinfo; /* * Construct clockinfo structure. */ bzero(&clkinfo, sizeof(clkinfo)); clkinfo.hz = hz; clkinfo.tick = tick; clkinfo.profhz = profhz; clkinfo.stathz = stathz ? stathz : hz; return (sysctl_handle_opaque(oidp, &clkinfo, sizeof clkinfo, req)); } SYSCTL_PROC(_kern, KERN_CLOCKRATE, clockrate, CTLTYPE_STRUCT|CTLFLAG_RD, 0, 0, sysctl_kern_clockrate, "S,clockinfo", "Rate and period of various kernel clocks"); #ifdef SW_WATCHDOG static void watchdog_config(void *unused __unused, u_int cmd, int *err) { u_int u; u = cmd & WD_INTERVAL; if (cmd && u >= WD_TO_1SEC) { u = cmd & WD_INTERVAL; watchdog_ticks = (1 << (u - WD_TO_1SEC)) * hz; watchdog_enabled = 1; *err = 0; } else { watchdog_enabled = 0; } } /* * Handle a watchdog timeout by dumping interrupt information and * then either dropping to DDB or panicing. */ static void watchdog_fire(void) { int nintr; u_int64_t inttotal; u_long *curintr; char *curname; curintr = intrcnt; curname = intrnames; inttotal = 0; nintr = eintrcnt - intrcnt; printf("interrupt total\n"); while (--nintr >= 0) { if (*curintr) printf("%-12s %20lu\n", curname, *curintr); curname += strlen(curname) + 1; inttotal += *curintr++; } printf("Total %20ju\n", (uintmax_t)inttotal); #ifdef KDB kdb_backtrace(); kdb_enter("watchdog timeout"); #else panic("watchdog timeout"); #endif /* KDB */ } #endif /* SW_WATCHDOG */ diff --git a/sys/kern/kern_pmc.c b/sys/kern/kern_pmc.c index c6fbb45d2928..ee0f8ec6fcb5 100644 --- a/sys/kern/kern_pmc.c +++ b/sys/kern/kern_pmc.c @@ -1,77 +1,79 @@ /*- - * Copyright (c) 2003 Joseph Koshy + * Copyright (c) 2003-2005, Joseph Koshy * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include struct sx pmc_sx; /* Hook variable. */ int (*pmc_hook)(struct thread *td, int function, void *arg) = NULL; /* Interrupt handler */ -int (*pmc_intr)(int cpu, uintptr_t pc) = NULL; +int (*pmc_intr)(int cpu, uintptr_t pc, int usermode) = NULL; + +cpumask_t pmc_cpumask; /* * Since PMC(4) may not be loaded in the current kernel, the * convention followed is that a non-NULL value of 'pmc_hook' implies * the presence of this kernel module. * * This requires us to protect 'pmc_hook' with a * shared (sx) lock -- thus making the process of calling into PMC(4) * somewhat more expensive than a simple 'if' check and indirect call. */ SX_SYSINIT(pmc, &pmc_sx, "pmc shared lock"); /* * pmc_cpu_is_disabled * * return TRUE if the cpu specified has been disabled. */ int pmc_cpu_is_disabled(int cpu) { #ifdef SMP return ((hlt_cpus_mask & (1 << cpu)) != 0); #else return 0; #endif } int pmc_cpu_is_logical(int cpu) { #ifdef SMP return ((logical_cpus_mask & (1 << cpu)) != 0); #else return 0; #endif } diff --git a/sys/sys/pmc.h b/sys/sys/pmc.h index 745469d1c519..ae1d1245eb7b 100644 --- a/sys/sys/pmc.h +++ b/sys/sys/pmc.h @@ -1,1460 +1,1460 @@ /*- * Copyright (c) 2003, Joseph Koshy * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #ifndef _SYS_PMC_H_ #define _SYS_PMC_H_ #include #define PMC_MODULE_NAME "hwpmc" #define PMC_NAME_MAX 16 /* HW counter name size */ #define PMC_CLASS_MAX 4 /* #classes of PMCs in a system */ /* Kernel<->userland API version number [MMmmpppp] */ #define PMC_VERSION_MAJOR 0x01 #define PMC_VERSION_MINOR 0x01 #define PMC_VERSION_PATCH 0x0002 #define PMC_VERSION (PMC_VERSION_MAJOR << 24 | \ PMC_VERSION_MINOR << 16 | PMC_VERSION_PATCH) /* * Kinds of CPUs known */ #define __PMC_CPUS() \ __PMC_CPU(AMD_K7, "AMD K7") \ __PMC_CPU(AMD_K8, "AMD K8") \ __PMC_CPU(INTEL_P5, "Intel Pentium") \ __PMC_CPU(INTEL_P6, "Intel Pentium Pro") \ __PMC_CPU(INTEL_CL, "Intel Celeron") \ __PMC_CPU(INTEL_PII, "Intel Pentium II") \ __PMC_CPU(INTEL_PIII, "Intel Pentium III") \ __PMC_CPU(INTEL_PM, "Intel Pentium M") \ __PMC_CPU(INTEL_PIV, "Intel Pentium IV") enum pmc_cputype { #undef __PMC_CPU #define __PMC_CPU(S,D) PMC_CPU_##S , __PMC_CPUS() }; #define PMC_CPU_FIRST PMC_CPU_AMD_K7 #define PMC_CPU_LAST PMC_CPU_INTEL_PIV /* * Classes of PMCs */ #define __PMC_CLASSES() \ __PMC_CLASS(TSC) /* CPU Timestamp counter */ \ __PMC_CLASS(K7) /* AMD K7 performance counters */ \ __PMC_CLASS(K8) /* AMD K8 performance counters */ \ __PMC_CLASS(P5) /* Intel Pentium counters */ \ __PMC_CLASS(P6) /* Intel Pentium Pro counters */ \ __PMC_CLASS(P4) /* Intel Pentium-IV counters */ enum pmc_class { #undef __PMC_CLASS #define __PMC_CLASS(N) PMC_CLASS_##N , __PMC_CLASSES() }; #define PMC_CLASS_FIRST PMC_CLASS_TSC #define PMC_CLASS_LAST PMC_CLASS_P4 /* * A PMC can be in the following states: * * Hardware states: * DISABLED -- administratively prohibited from being used. * FREE -- HW available for use * Software states: * ALLOCATED -- allocated * STOPPED -- allocated, but not counting events * RUNNING -- allocated, and in operation; 'pm_runcount' * holds the number of CPUs using this PMC at * a given instant * DELETED -- being destroyed */ #define __PMC_HWSTATES() \ __PMC_STATE(DISABLED) \ __PMC_STATE(FREE) #define __PMC_SWSTATES() \ __PMC_STATE(ALLOCATED) \ __PMC_STATE(STOPPED) \ __PMC_STATE(RUNNING) \ __PMC_STATE(DELETED) #define __PMC_STATES() \ __PMC_HWSTATES() \ __PMC_SWSTATES() enum pmc_state { #undef __PMC_STATE #define __PMC_STATE(S) PMC_STATE_##S, __PMC_STATES() __PMC_STATE(MAX) }; #define PMC_STATE_FIRST PMC_STATE_DISABLED #define PMC_STATE_LAST PMC_STATE_DELETED /* * An allocated PMC may used as a 'global' counter or as a * 'thread-private' one. Each such mode of use can be in either * statistical sampling mode or in counting mode. Thus a PMC in use * * SS i.e., SYSTEM STATISTICAL -- system-wide statistical profiling * SC i.e., SYSTEM COUNTER -- system-wide counting mode * TS i.e., THREAD STATISTICAL -- thread virtual, statistical profiling * TC i.e., THREAD COUNTER -- thread virtual, counting mode * * Statistical profiling modes rely on the PMC periodically delivering * a interrupt to the CPU (when the configured number of events have * been measured), so the PMC must have the ability to generate * interrupts. * * In counting modes, the PMC counts its configured events, with the * value of the PMC being read whenever needed by its owner process. * * The thread specific modes "virtualize" the PMCs -- the PMCs appear * to be thread private and count events only when the profiled thread * actually executes on the CPU. * * The system-wide "global" modes keep the PMCs running all the time * and are used to measure the behaviour of the whole system. 
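 *
 * (Editorial example -- not part of the original header: using the
 *  predicate macros defined below, a thread-virtual sampling PMC
 *  (PMC_MODE_TS) satisfies PMC_IS_SAMPLING_MODE() and
 *  PMC_IS_VIRTUAL_MODE() but not PMC_IS_SYSTEM_MODE(), while a
 *  system-wide counting PMC (PMC_MODE_SC) satisfies
 *  PMC_IS_COUNTING_MODE() and PMC_IS_SYSTEM_MODE() only.)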
*/ #define __PMC_MODES() \ __PMC_MODE(SS, 0) \ __PMC_MODE(SC, 1) \ __PMC_MODE(TS, 2) \ __PMC_MODE(TC, 3) enum pmc_mode { #undef __PMC_MODE #define __PMC_MODE(M,N) PMC_MODE_##M = N, __PMC_MODES() }; #define PMC_MODE_FIRST PMC_MODE_SS #define PMC_MODE_LAST PMC_MODE_TC #define PMC_IS_COUNTING_MODE(mode) \ ((mode) == PMC_MODE_SC || (mode) == PMC_MODE_TC) #define PMC_IS_SYSTEM_MODE(mode) \ ((mode) == PMC_MODE_SS || (mode) == PMC_MODE_SC) #define PMC_IS_SAMPLING_MODE(mode) \ ((mode) == PMC_MODE_SS || (mode) == PMC_MODE_TS) #define PMC_IS_VIRTUAL_MODE(mode) \ ((mode) == PMC_MODE_TS || (mode) == PMC_MODE_TC) /* * PMC row disposition */ #define __PMC_DISPOSITIONS(N) \ __PMC_DISP(STANDALONE) /* global/disabled counters */ \ __PMC_DISP(FREE) /* free/available */ \ __PMC_DISP(THREAD) /* thread-virtual PMCs */ \ __PMC_DISP(UNKNOWN) /* sentinel */ enum pmc_disp { #undef __PMC_DISP #define __PMC_DISP(D) PMC_DISP_##D , __PMC_DISPOSITIONS() }; #define PMC_DISP_FIRST PMC_DISP_STANDALONE #define PMC_DISP_LAST PMC_DISP_THREAD /* * PMC event codes * * __PMC_EV(CLASS, SYMBOLIC-NAME, VALUE, READABLE-NAME) */ /* * AMD K7 Events, from "The AMD Athlon(tm) Processor x86 Code * Optimization Guide" [Doc#22007K, Feb 2002] */ #define __PMC_EV_K7() \ __PMC_EV(K7, DC_ACCESSES, k7-dc-accesses) \ __PMC_EV(K7, DC_MISSES, k7-dc-misses) \ __PMC_EV(K7, DC_REFILLS_FROM_L2, k7-dc-refills-from-l2) \ __PMC_EV(K7, DC_REFILLS_FROM_SYSTEM, k7-dc-refills-from-system) \ __PMC_EV(K7, DC_WRITEBACKS, k7-dc-writebacks) \ __PMC_EV(K7, L1_DTLB_MISS_AND_L2_DTLB_HITS, \ k7-l1-dtlb-miss-and-l2-dtlb-hits) \ __PMC_EV(K7, L1_AND_L2_DTLB_MISSES, k7-l1-and-l2-dtlb-misses) \ __PMC_EV(K7, MISALIGNED_REFERENCES, k7-misaligned-references) \ __PMC_EV(K7, IC_FETCHES, k7-ic-fetches) \ __PMC_EV(K7, IC_MISSES, k7-ic-misses) \ __PMC_EV(K7, L1_ITLB_MISSES, k7-l1-itlb-misses) \ __PMC_EV(K7, L1_L2_ITLB_MISSES, k7-l1-l2-itlb-misses) \ __PMC_EV(K7, RETIRED_INSTRUCTIONS, k7-retired-instructions) \ __PMC_EV(K7, RETIRED_OPS, k7-retired-ops) \ __PMC_EV(K7, RETIRED_BRANCHES, k7-retired-branches) \ __PMC_EV(K7, RETIRED_BRANCHES_MISPREDICTED, \ k7-retired-branches-mispredicted) \ __PMC_EV(K7, RETIRED_TAKEN_BRANCHES, k7-retired-taken-branches) \ __PMC_EV(K7, RETIRED_TAKEN_BRANCHES_MISPREDICTED, \ k7-retired-taken-branches-mispredicted) \ __PMC_EV(K7, RETIRED_FAR_CONTROL_TRANSFERS, \ k7-retired-far-control-transfers) \ __PMC_EV(K7, RETIRED_RESYNC_BRANCHES, k7-retired-resync-branches) \ __PMC_EV(K7, INTERRUPTS_MASKED_CYCLES, k7-interrupts-masked-cycles) \ __PMC_EV(K7, INTERRUPTS_MASKED_WHILE_PENDING_CYCLES, \ k7-interrupts-masked-while-pending-cycles) \ __PMC_EV(K7, HARDWARE_INTERRUPTS, k7-hardware-interrupts) #define PMC_EV_K7_FIRST PMC_EV_K7_DC_ACCESSES #define PMC_EV_K7_LAST PMC_EV_K7_HARDWARE_INTERRUPTS /* * Intel P4 Events, from "IA-32 Intel(r) Architecture Software * Developer's Manual, Volume 3: System Programming Guide" [245472-012] */ #define __PMC_EV_P4() \ __PMC_EV(P4, TC_DELIVER_MODE, p4-tc-deliver-mode) \ __PMC_EV(P4, BPU_FETCH_REQUEST, p4-bpu-fetch-request) \ __PMC_EV(P4, ITLB_REFERENCE, p4-itlb-reference) \ __PMC_EV(P4, MEMORY_CANCEL, p4-memory-cancel) \ __PMC_EV(P4, MEMORY_COMPLETE, p4-memory-complete) \ __PMC_EV(P4, LOAD_PORT_REPLAY, p4-load-port-replay) \ __PMC_EV(P4, STORE_PORT_REPLAY, p4-store-port-replay) \ __PMC_EV(P4, MOB_LOAD_REPLAY, p4-mob-load-replay) \ __PMC_EV(P4, PAGE_WALK_TYPE, p4-page-walk-type) \ __PMC_EV(P4, BSQ_CACHE_REFERENCE, p4-bsq-cache-reference) \ __PMC_EV(P4, IOQ_ALLOCATION, p4-ioq-allocation) \ __PMC_EV(P4, IOQ_ACTIVE_ENTRIES, 
p4-ioq-active-entries) \ __PMC_EV(P4, FSB_DATA_ACTIVITY, p4-fsb-data-activity) \ __PMC_EV(P4, BSQ_ALLOCATION, p4-bsq-allocation) \ __PMC_EV(P4, BSQ_ACTIVE_ENTRIES, p4-bsq-active-entries) \ __PMC_EV(P4, SSE_INPUT_ASSIST, p4-sse-input-assist) \ __PMC_EV(P4, PACKED_SP_UOP, p4-packed-sp-uop) \ __PMC_EV(P4, PACKED_DP_UOP, p4-packed-dp-uop) \ __PMC_EV(P4, SCALAR_SP_UOP, p4-scalar-sp-uop) \ __PMC_EV(P4, SCALAR_DP_UOP, p4-scalar-dp-uop) \ __PMC_EV(P4, 64BIT_MMX_UOP, p4-64bit-mmx-uop) \ __PMC_EV(P4, 128BIT_MMX_UOP, p4-128bit-mmx-uop) \ __PMC_EV(P4, X87_FP_UOP, p4-x87-fp-uop) \ __PMC_EV(P4, X87_SIMD_MOVES_UOP, p4-x87-simd-moves-uop) \ __PMC_EV(P4, GLOBAL_POWER_EVENTS, p4-global-power-events) \ __PMC_EV(P4, TC_MS_XFER, p4-tc-ms-xfer) \ __PMC_EV(P4, UOP_QUEUE_WRITES, p4-uop-queue-writes) \ __PMC_EV(P4, RETIRED_MISPRED_BRANCH_TYPE, \ p4-retired-mispred-branch-type) \ __PMC_EV(P4, RETIRED_BRANCH_TYPE, p4-retired-branch-type) \ __PMC_EV(P4, RESOURCE_STALL, p4-resource-stall) \ __PMC_EV(P4, WC_BUFFER, p4-wc-buffer) \ __PMC_EV(P4, B2B_CYCLES, p4-b2b-cycles) \ __PMC_EV(P4, BNR, p4-bnr) \ __PMC_EV(P4, SNOOP, p4-snoop) \ __PMC_EV(P4, RESPONSE, p4-response) \ __PMC_EV(P4, FRONT_END_EVENT, p4-front-end-event) \ __PMC_EV(P4, EXECUTION_EVENT, p4-execution-event) \ __PMC_EV(P4, REPLAY_EVENT, p4-replay-event) \ __PMC_EV(P4, INSTR_RETIRED, p4-instr-retired) \ __PMC_EV(P4, UOPS_RETIRED, p4-uops-retired) \ __PMC_EV(P4, UOP_TYPE, p4-uop-type) \ __PMC_EV(P4, BRANCH_RETIRED, p4-branch-retired) \ __PMC_EV(P4, MISPRED_BRANCH_RETIRED, p4-mispred-branch-retired) \ __PMC_EV(P4, X87_ASSIST, p4-x87-assist) \ __PMC_EV(P4, MACHINE_CLEAR, p4-machine-clear) #define PMC_EV_P4_FIRST PMC_EV_P4_TC_DELIVER_MODE #define PMC_EV_P4_LAST PMC_EV_P4_MACHINE_CLEAR /* Intel Pentium Pro, P-II, P-III and Pentium-M style events */ #define __PMC_EV_P6() \ __PMC_EV(P6, DATA_MEM_REFS, p6-data-mem-refs) \ __PMC_EV(P6, DCU_LINES_IN, p6-dcu-lines-in) \ __PMC_EV(P6, DCU_M_LINES_IN, p6-dcu-m-lines-in) \ __PMC_EV(P6, DCU_M_LINES_OUT, p6-dcu-m-lines-out) \ __PMC_EV(P6, DCU_MISS_OUTSTANDING, p6-dcu-miss-outstanding) \ __PMC_EV(P6, IFU_FETCH, p6-ifu-fetch) \ __PMC_EV(P6, IFU_FETCH_MISS, p6-ifu-fetch-miss) \ __PMC_EV(P6, ITLB_MISS, p6-itlb-miss) \ __PMC_EV(P6, IFU_MEM_STALL, p6-ifu-mem-stall) \ __PMC_EV(P6, ILD_STALL, p6-ild-stall) \ __PMC_EV(P6, L2_IFETCH, p6-l2-ifetch) \ __PMC_EV(P6, L2_LD, p6-l2-ld) \ __PMC_EV(P6, L2_ST, p6-l2-st) \ __PMC_EV(P6, L2_LINES_IN, p6-l2-lines-in) \ __PMC_EV(P6, L2_LINES_OUT, p6-l2-lines-out) \ __PMC_EV(P6, L2_M_LINES_INM, p6-l2-m-lines-inm) \ __PMC_EV(P6, L2_M_LINES_OUTM, p6-l2-m-lines-outm) \ __PMC_EV(P6, L2_RQSTS, p6-l2-rqsts) \ __PMC_EV(P6, L2_ADS, p6-l2-ads) \ __PMC_EV(P6, L2_DBUS_BUSY, p6-l2-dbus-busy) \ __PMC_EV(P6, L2_DBUS_BUSY_RD, p6-l2-dbus-busy-rd) \ __PMC_EV(P6, BUS_DRDY_CLOCKS, p6-bus-drdy-clocks) \ __PMC_EV(P6, BUS_LOCK_CLOCKS, p6-bus-lock-clocks) \ __PMC_EV(P6, BUS_REQ_OUTSTANDING, p6-bus-req-outstanding) \ __PMC_EV(P6, BUS_TRAN_BRD, p6-bus-tran-brd) \ __PMC_EV(P6, BUS_TRAN_RFO, p6-bus-tran-rfo) \ __PMC_EV(P6, BUS_TRANS_WB, p6-bus-trans-wb) \ __PMC_EV(P6, BUS_TRAN_IFETCH, p6-bus-tran-ifetch) \ __PMC_EV(P6, BUS_TRAN_INVAL, p6-bus-tran-inval) \ __PMC_EV(P6, BUS_TRAN_PWR, p6-bus-tran-pwr) \ __PMC_EV(P6, BUS_TRANS_P, p6-bus-trans-p) \ __PMC_EV(P6, BUS_TRANS_IO, p6-bus-trans-io) \ __PMC_EV(P6, BUS_TRAN_DEF, p6-bus-tran-def) \ __PMC_EV(P6, BUS_TRAN_BURST, p6-bus-tran-burst) \ __PMC_EV(P6, BUS_TRAN_ANY, p6-bus-tran-any) \ __PMC_EV(P6, BUS_TRAN_MEM, p6-bus-tran-mem) \ __PMC_EV(P6, BUS_DATA_RCV, p6-bus-data-rcv) \ 
__PMC_EV(P6, BUS_BNR_DRV, p6-bus-bnr-drv) \ __PMC_EV(P6, BUS_HIT_DRV, p6-bus-hit-drv) \ __PMC_EV(P6, BUS_HITM_DRV, p6-bus-hitm-drv) \ __PMC_EV(P6, BUS_SNOOP_STALL, p6-bus-snoop-stall) \ __PMC_EV(P6, FLOPS, p6-flops) \ __PMC_EV(P6, FP_COMPS_OPS_EXE, p6-fp-comps-ops-exe) \ __PMC_EV(P6, FP_ASSIST, p6-fp-assist) \ __PMC_EV(P6, MUL, p6-mul) \ __PMC_EV(P6, DIV, p6-div) \ __PMC_EV(P6, CYCLES_DIV_BUSY, p6-cycles-div-busy) \ __PMC_EV(P6, LD_BLOCKS, p6-ld-blocks) \ __PMC_EV(P6, SB_DRAINS, p6-sb-drains) \ __PMC_EV(P6, MISALIGN_MEM_REF, p6-misalign-mem-ref) \ __PMC_EV(P6, EMON_KNI_PREF_DISPATCHED, p6-emon-kni-pref-dispatched) \ __PMC_EV(P6, EMON_KNI_PREF_MISS, p6-emon-kni-pref-miss) \ __PMC_EV(P6, INST_RETIRED, p6-inst-retired) \ __PMC_EV(P6, UOPS_RETIRED, p6-uops-retired) \ __PMC_EV(P6, INST_DECODED, p6-inst-decoded) \ __PMC_EV(P6, EMON_KNI_INST_RETIRED, p6-emon-kni-inst-retired) \ __PMC_EV(P6, EMON_KNI_COMP_INST_RET, p6-emon-kni-comp-inst-ret) \ __PMC_EV(P6, HW_INT_RX, p6-hw-int-rx) \ __PMC_EV(P6, CYCLES_INT_MASKED, p6-cycles-int-masked) \ __PMC_EV(P6, CYCLES_INT_PENDING_AND_MASKED, \ p6-cycles-in-pending-and-masked) \ __PMC_EV(P6, BR_INST_RETIRED, p6-br-inst-retired) \ __PMC_EV(P6, BR_MISS_PRED_RETIRED, p6-br-miss-pred-retired) \ __PMC_EV(P6, BR_TAKEN_RETIRED, p6-br-taken-retired) \ __PMC_EV(P6, BR_MISS_PRED_TAKEN_RET, p6-br-miss-pred-taken-ret) \ __PMC_EV(P6, BR_INST_DECODED, p6-br-inst-decoded) \ __PMC_EV(P6, BTB_MISSES, p6-btb-misses) \ __PMC_EV(P6, BR_BOGUS, p6-br-bogus) \ __PMC_EV(P6, BACLEARS, p6-baclears) \ __PMC_EV(P6, RESOURCE_STALLS, p6-resource-stalls) \ __PMC_EV(P6, PARTIAL_RAT_STALLS, p6-partial-rat-stalls) \ __PMC_EV(P6, SEGMENT_REG_LOADS, p6-segment-reg-loads) \ __PMC_EV(P6, CPU_CLK_UNHALTED, p6-cpu-clk-unhalted) \ __PMC_EV(P6, MMX_INSTR_EXEC, p6-mmx-instr-exec) \ __PMC_EV(P6, MMX_SAT_INSTR_EXEC, p6-mmx-sat-instr-exec) \ __PMC_EV(P6, MMX_UOPS_EXEC, p6-mmx-uops-exec) \ __PMC_EV(P6, MMX_INSTR_TYPE_EXEC, p6-mmx-instr-type-exec) \ __PMC_EV(P6, FP_MMX_TRANS, p6-fp-mmx-trans) \ __PMC_EV(P6, MMX_ASSIST, p6-mmx-assist) \ __PMC_EV(P6, MMX_INSTR_RET, p6-mmx-instr-ret) \ __PMC_EV(P6, SEG_RENAME_STALLS, p6-seg-rename-stalls) \ __PMC_EV(P6, SEG_REG_RENAMES, p6-seg-reg-renames) \ __PMC_EV(P6, RET_SEG_RENAMES, p6-ret-seg-renames) \ __PMC_EV(P6, EMON_EST_TRANS, p6-emon-est-trans) \ __PMC_EV(P6, EMON_THERMAL_TRIP, p6-emon-thermal-trip) \ __PMC_EV(P6, BR_INST_EXEC, p6-br-inst-exec) \ __PMC_EV(P6, BR_MISSP_EXEC, p6-br-missp-exec) \ __PMC_EV(P6, BR_BAC_MISSP_EXEC, p6-br-bac-missp-exec) \ __PMC_EV(P6, BR_CND_EXEC, p6-br-cnd-exec) \ __PMC_EV(P6, BR_CND_MISSP_EXEC, p6-br-cnd-missp-exec) \ __PMC_EV(P6, BR_IND_EXEC, p6-br-ind-exec) \ __PMC_EV(P6, BR_IND_MISSP_EXEC, p6-br-ind-missp-exec) \ __PMC_EV(P6, BR_RET_EXEC, p6-br-ret-exec) \ __PMC_EV(P6, BR_RET_MISSP_EXEC, p6-br-ret-missp-exec) \ __PMC_EV(P6, BR_RET_BAC_MISSP_EXEC, p6-br-ret-bac-missp-exec) \ __PMC_EV(P6, BR_CALL_EXEC, p6-br-call-exec) \ __PMC_EV(P6, BR_CALL_MISSP_EXEC, p6-br-call-missp-exec) \ __PMC_EV(P6, BR_IND_CALL_EXEC, p6-br-ind-call-exec) \ __PMC_EV(P6, EMON_SIMD_INSTR_RETIRED, p6-emon-simd-instr-retired) \ __PMC_EV(P6, EMON_SYNCH_UOPS, p6-emon-synch-uops) \ __PMC_EV(P6, EMON_ESP_UOPS, p6-emon-esp-uops) \ __PMC_EV(P6, EMON_FUSED_UOPS_RET, p6-emon-fused-uops-ret) \ __PMC_EV(P6, EMON_UNFUSION, p6-emon-unfusion) \ __PMC_EV(P6, EMON_PREF_RQSTS_UP, p6-emon-pref-rqsts-up) \ __PMC_EV(P6, EMON_PREF_RQSTS_DN, p6-emon-pref-rqsts-dn) \ __PMC_EV(P6, EMON_SSE_SSE2_INST_RETIRED, \ p6-emon-sse-sse2-inst-retired) \ __PMC_EV(P6, EMON_SSE_SSE2_COMP_INST_RETIRED, 
\ p6-emon-sse-sse2-comp-inst-retired) #define PMC_EV_P6_FIRST PMC_EV_P6_DATA_MEM_REFS #define PMC_EV_P6_LAST PMC_EV_P6_EMON_SSE_SSE2_COMP_INST_RETIRED /* AMD K8 PMCs */ #define __PMC_EV_K8() \ __PMC_EV(K8, FP_DISPATCHED_FPU_OPS, k8-fp-dispatched-fpu-ops) \ __PMC_EV(K8, FP_CYCLES_WITH_NO_FPU_OPS_RETIRED, \ k8-fp-cycles-with-no-fpu-ops-retired) \ __PMC_EV(K8, FP_DISPATCHED_FPU_FAST_FLAG_OPS, \ k8-fp-dispatched-fpu-fast-flag-ops) \ __PMC_EV(K8, LS_SEGMENT_REGISTER_LOAD, k8-ls-segment-register-load) \ __PMC_EV(K8, LS_MICROARCHITECTURAL_RESYNC_BY_SELF_MODIFYING_CODE, \ k8-ls-microarchitectural-resync-by-self-modifying-code) \ __PMC_EV(K8, LS_MICROARCHITECTURAL_RESYNC_BY_SNOOP, \ k8-ls-microarchitectural-resync-by-snoop) \ __PMC_EV(K8, LS_BUFFER2_FULL, k8-ls-buffer2-full) \ __PMC_EV(K8, LS_LOCKED_OPERATION, k8-ls-locked-operation) \ __PMC_EV(K8, LS_MICROARCHITECTURAL_LATE_CANCEL, \ k8-ls-microarchitectural-late-cancel) \ __PMC_EV(K8, LS_RETIRED_CFLUSH_INSTRUCTIONS, \ k8-ls-retired-cflush-instructions) \ __PMC_EV(K8, LS_RETIRED_CPUID_INSTRUCTIONS, \ k8-ls-retired-cpuid-instructions) \ __PMC_EV(K8, DC_ACCESS, k8-dc-access) \ __PMC_EV(K8, DC_MISS, k8-dc-miss) \ __PMC_EV(K8, DC_REFILL_FROM_L2, k8-dc-refill-from-l2) \ __PMC_EV(K8, DC_REFILL_FROM_SYSTEM, k8-dc-refill-from-system) \ __PMC_EV(K8, DC_COPYBACK, k8-dc-copyback) \ __PMC_EV(K8, DC_L1_DTLB_MISS_AND_L2_DTLB_HIT, \ k8-dc-l1-dtlb-miss-and-l2-dtlb-hit) \ __PMC_EV(K8, DC_L1_DTLB_MISS_AND_L2_DTLB_MISS, \ k8-dc-l1-dtlb-miss-and-l2-dtlb-miss) \ __PMC_EV(K8, DC_MISALIGNED_DATA_REFERENCE, \ k8-dc-misaligned-data-reference) \ __PMC_EV(K8, DC_MICROARCHITECTURAL_LATE_CANCEL, \ k8-dc-microarchitectural-late-cancel-of-an-access) \ __PMC_EV(K8, DC_MICROARCHITECTURAL_EARLY_CANCEL, \ k8-dc-microarchitectural-early-cancel-of-an-access) \ __PMC_EV(K8, DC_ONE_BIT_ECC_ERROR, k8-dc-one-bit-ecc-error) \ __PMC_EV(K8, DC_DISPATCHED_PREFETCH_INSTRUCTIONS, \ k8-dc-dispatched-prefetch-instructions) \ __PMC_EV(K8, DC_DCACHE_ACCESSES_BY_LOCKS, \ k8-dc-dcache-accesses-by-locks) \ __PMC_EV(K8, BU_CPU_CLK_UNHALTED, k8-bu-cpu-clk-unhalted) \ __PMC_EV(K8, BU_INTERNAL_L2_REQUEST, k8-bu-internal-l2-request) \ __PMC_EV(K8, BU_FILL_REQUEST_L2_MISS, k8-bu-fill-request-l2-miss) \ __PMC_EV(K8, BU_FILL_INTO_L2, k8-bu-fill-into-l2) \ __PMC_EV(K8, IC_FETCH, k8-ic-fetch) \ __PMC_EV(K8, IC_MISS, k8-ic-miss) \ __PMC_EV(K8, IC_REFILL_FROM_L2, k8-ic-refill-from-l2) \ __PMC_EV(K8, IC_REFILL_FROM_SYSTEM, k8-ic-refill-from-system) \ __PMC_EV(K8, IC_L1_ITLB_MISS_AND_L2_ITLB_HIT, \ k8-ic-l1-itlb-miss-and-l2-itlb-hit) \ __PMC_EV(K8, IC_L1_ITLB_MISS_AND_L2_ITLB_MISS, \ k8-ic-l1-itlb-miss-and-l2-itlb-miss) \ __PMC_EV(K8, IC_MICROARCHITECTURAL_RESYNC_BY_SNOOP, \ k8-ic-microarchitectural-resync-by-snoop) \ __PMC_EV(K8, IC_INSTRUCTION_FETCH_STALL, \ k8-ic-instruction-fetch-stall) \ __PMC_EV(K8, IC_RETURN_STACK_HIT, k8-ic-return-stack-hit) \ __PMC_EV(K8, IC_RETURN_STACK_OVERFLOW, k8-ic-return-stack-overflow) \ __PMC_EV(K8, FR_RETIRED_X86_INSTRUCTIONS, \ k8-fr-retired-x86-instructions) \ __PMC_EV(K8, FR_RETIRED_UOPS, k8-fr-retired-uops) \ __PMC_EV(K8, FR_RETIRED_BRANCHES, k8-fr-retired-branches) \ __PMC_EV(K8, FR_RETIRED_BRANCHES_MISPREDICTED, \ k8-fr-retired-branches-mispredicted) \ __PMC_EV(K8, FR_RETIRED_TAKEN_BRANCHES, \ k8-fr-retired-taken-branches) \ __PMC_EV(K8, FR_RETIRED_TAKEN_BRANCHES_MISPREDICTED, \ k8-fr-retired-taken-branches-mispredicted) \ __PMC_EV(K8, FR_RETIRED_FAR_CONTROL_TRANSFERS, \ k8-fr-retired-far-control-transfers) \ __PMC_EV(K8, FR_RETIRED_RESYNCS, k8-fr-retired-resyncs) \ 
__PMC_EV(K8, FR_RETIRED_NEAR_RETURNS, k8-fr-retired-near-returns) \ __PMC_EV(K8, FR_RETIRED_NEAR_RETURNS_MISPREDICTED, \ k8-fr-retired-near-returns-mispredicted) \ __PMC_EV(K8, \ FR_RETIRED_TAKEN_BRANCHES_MISPREDICTED_BY_ADDR_MISCOMPARE, \ k8-fr-retired-taken-branches-mispredicted-by-addr-miscompare) \ __PMC_EV(K8, FR_RETIRED_FPU_INSTRUCTIONS, \ k8-fr-retired-fpu-instructions) \ __PMC_EV(K8, FR_RETIRED_FASTPATH_DOUBLE_OP_INSTRUCTIONS, \ k8-fr-retired-fastpath-double-op-instructions) \ __PMC_EV(K8, FR_INTERRUPTS_MASKED_CYCLES, \ k8-fr-interrupts-masked-cycles) \ __PMC_EV(K8, FR_INTERRUPTS_MASKED_WHILE_PENDING_CYCLES, \ k8-fr-interrupts-masked-while-pending-cycles) \ __PMC_EV(K8, FR_TAKEN_HARDWARE_INTERRUPTS, \ k8-fr-taken-hardware-interrupts) \ __PMC_EV(K8, FR_DECODER_EMPTY, k8-fr-decoder-empty) \ __PMC_EV(K8, FR_DISPATCH_STALLS, k8-fr-dispatch-stalls) \ __PMC_EV(K8, FR_DISPATCH_STALL_FROM_BRANCH_ABORT_TO_RETIRE, \ k8-fr-dispatch-stall-from-branch-abort-to-retire) \ __PMC_EV(K8, FR_DISPATCH_STALL_FOR_SERIALIZATION, \ k8-fr-dispatch-stall-for-serialization) \ __PMC_EV(K8, FR_DISPATCH_STALL_FOR_SEGMENT_LOAD, \ k8-fr-dispatch-stall-for-segment-load) \ __PMC_EV(K8, FR_DISPATCH_STALL_WHEN_REORDER_BUFFER_IS_FULL, \ k8-fr-dispatch-stall-when-reorder-buffer-is-full) \ __PMC_EV(K8, \ FR_DISPATCH_STALL_WHEN_RESERVATION_STATIONS_ARE_FULL, \ k8-fr-dispatch-stall-when-reservation-stations-are-full) \ __PMC_EV(K8, FR_DISPATCH_STALL_WHEN_FPU_IS_FULL, \ k8-fr-dispatch-stall-when-fpu-is-full) \ __PMC_EV(K8, FR_DISPATCH_STALL_WHEN_LS_IS_FULL, \ k8-fr-dispatch-stall-when-ls-is-full) \ __PMC_EV(K8, FR_DISPATCH_STALL_WHEN_WAITING_FOR_ALL_TO_BE_QUIET, \ k8-fr-dispatch-stall-when-waiting-for-all-to-be-quiet) \ __PMC_EV(K8, \ FR_DISPATCH_STALL_WHEN_FAR_XFER_OR_RESYNC_BRANCH_PENDING, \ k8-fr-dispatch-stall-when-far-xfer-or-resync-branch-pending) \ __PMC_EV(K8, FR_FPU_EXCEPTIONS, k8-fr-fpu-exceptions) \ __PMC_EV(K8, FR_NUMBER_OF_BREAKPOINTS_FOR_DR0, \ k8-fr-number-of-breakpoints-for-dr0) \ __PMC_EV(K8, FR_NUMBER_OF_BREAKPOINTS_FOR_DR1, \ k8-fr-number-of-breakpoints-for-dr1) \ __PMC_EV(K8, FR_NUMBER_OF_BREAKPOINTS_FOR_DR2, \ k8-fr-number-of-breakpoints-for-dr2) \ __PMC_EV(K8, FR_NUMBER_OF_BREAKPOINTS_FOR_DR3, \ k8-fr-number-of-breakpoints-for-dr3) \ __PMC_EV(K8, NB_MEMORY_CONTROLLER_PAGE_ACCESS_EVENT, \ k8-nb-memory-controller-page-access-event) \ __PMC_EV(K8, NB_MEMORY_CONTROLLER_PAGE_TABLE_OVERFLOW, \ k8-nb-memory-controller-page-table-overflow) \ __PMC_EV(K8, NB_MEMORY_CONTROLLER_DRAM_COMMAND_SLOTS_MISSED, \ k8-nb-memory-controller-dram-slots-missed) \ __PMC_EV(K8, NB_MEMORY_CONTROLLER_TURNAROUND, \ k8-nb-memory-controller-turnaround) \ __PMC_EV(K8, NB_MEMORY_CONTROLLER_BYPASS_SATURATION, \ k8-nb-memory-controller-bypass-saturation) \ __PMC_EV(K8, NB_SIZED_COMMANDS, k8-nb-sized-commands) \ __PMC_EV(K8, NB_PROBE_RESULT, k8-nb-probe-result) \ __PMC_EV(K8, NB_HT_BUS0_BANDWIDTH, k8-nb-ht-bus0-bandwidth) \ __PMC_EV(K8, NB_HT_BUS1_BANDWIDTH, k8-nb-ht-bus1-bandwidth) \ __PMC_EV(K8, NB_HT_BUS2_BANDWIDTH, k8-nb-ht-bus2-bandwidth) #define PMC_EV_K8_FIRST PMC_EV_K8_FP_DISPATCHED_FPU_OPS #define PMC_EV_K8_LAST PMC_EV_K8_NB_HT_BUS2_BANDWIDTH /* Intel Pentium Events */ #define __PMC_EV_P5() \ __PMC_EV(P5, DATA_READ, p5-data-read) \ __PMC_EV(P5, DATA_WRITE, p5-data-write) \ __PMC_EV(P5, DATA_TLB_MISS, p5-data-tlb-miss) \ __PMC_EV(P5, DATA_READ_MISS, p5-data-read-miss) \ __PMC_EV(P5, DATA_WRITE_MISS, p5-data-write-miss) \ __PMC_EV(P5, WRITE_HIT_TO_M_OR_E_STATE_LINES, \ p5-write-hit-to-m-or-e-state-lines) \ __PMC_EV(P5, 
DATA_CACHE_LINES_WRITTEN_BACK, \ p4-data-cache-lines-written-back) \ __PMC_EV(P5, EXTERNAL_SNOOPS, p5-external-snoops) \ __PMC_EV(P5, EXTERNAL_DATA_CACHE_SNOOP_HITS, \ p5-external-data-cache-snoop-hits) \ __PMC_EV(P5, MEMORY_ACCESSES_IN_BOTH_PIPES, \ p5-memory-accesses-in-both-pipes) \ __PMC_EV(P5, BANK_CONFLICTS, p5-bank-conflicts) \ __PMC_EV(P5, MISALIGNED_DATA_OR_IO_REFERENCES, \ p5-misaligned-data-or-io-references) \ __PMC_EV(P5, CODE_READ, p5-code-read) \ __PMC_EV(P5, CODE_TLB_MISS, p5-code-tlb-miss) \ __PMC_EV(P5, CODE_CACHE_MISS, p5-code-cache-miss) \ __PMC_EV(P5, ANY_SEGMENT_REGISTER_LOADED, \ p5-any-segment-register-loaded) \ __PMC_EV(P5, BRANCHES, p5-branches) \ __PMC_EV(P5, BTB_HITS, p5-btb-hits) \ __PMC_EV(P5, TAKEN_BRANCH_OR_BTB_HIT, \ p5-taken-branch-or-btb-hit) \ __PMC_EV(P5, PIPELINE_FLUSHES, p5-pipeline-flushes) \ __PMC_EV(P5, INSTRUCTIONS_EXECUTED, p5-instructions-executed) \ __PMC_EV(P5, INSTRUCTIONS_EXECUTED_V_PIPE, \ p5-instructions-executed-v-pipe) \ __PMC_EV(P5, BUS_CYCLE_DURATION, p5-bus-cycle-duration) \ __PMC_EV(P5, WRITE_BUFFER_FULL_STALL_DURATION, \ p5-write-buffer-full-stall-duration) \ __PMC_EV(P5, WAITING_FOR_DATA_MEMORY_READ_STALL_DURATION, \ p5-waiting-for-data-memory-read-stall-duration) \ __PMC_EV(P5, STALL_ON_WRITE_TO_AN_E_OR_M_STATE_LINE, \ p5-stall-on-write-to-an-e-or-m-state-line) \ __PMC_EV(P5, LOCKED_BUS_CYCLE, p5-locked-bus-cycle) \ __PMC_EV(P5, IO_READ_OR_WRITE_CYCLE, p5-io-read-or-write-cycle) \ __PMC_EV(P5, NONCACHEABLE_MEMORY_READS, \ p5-noncacheable-memory-reads) \ __PMC_EV(P5, PIPELINE_AGI_STALLS, p5-pipeline-agi-stalls) \ __PMC_EV(P5, FLOPS, p5-flops) \ __PMC_EV(P5, BREAKPOINT_MATCH_ON_DR0_REGISTER, \ p5-breakpoint-match-on-dr0-register) \ __PMC_EV(P5, BREAKPOINT_MATCH_ON_DR1_REGISTER, \ p5-breakpoint-match-on-dr1-register) \ __PMC_EV(P5, BREAKPOINT_MATCH_ON_DR2_REGISTER, \ p5-breakpoint-match-on-dr2-register) \ __PMC_EV(P5, BREAKPOINT_MATCH_ON_DR3_REGISTER, \ p5-breakpoint-match-on-dr3-register) \ __PMC_EV(P5, HARDWARE_INTERRUPTS, p5-hardware-interrupts) \ __PMC_EV(P5, DATA_READ_OR_WRITE, p5-data-read-or-write) \ __PMC_EV(P5, DATA_READ_MISS_OR_WRITE_MISS, \ p5-data-read-miss-or-write-miss) \ __PMC_EV(P5, BUS_OWNERSHIP_LATENCY, p5-bus-ownership-latency) \ __PMC_EV(P5, BUS_OWNERSHIP_TRANSFERS, p5-bus-ownership-transfers) \ __PMC_EV(P5, MMX_INSTRUCTIONS_EXECUTED_U_PIPE, \ p5-mmx-instructions-executed-u-pipe) \ __PMC_EV(P5, MMX_INSTRUCTIONS_EXECUTED_V_PIPE, \ p5-mmx-instructions-executed-v-pipe) \ __PMC_EV(P5, CACHE_M_LINE_SHARING, p5-cache-m-line-sharing) \ __PMC_EV(P5, CACHE_LINE_SHARING, p5-cache-line-sharing) \ __PMC_EV(P5, EMMS_INSTRUCTIONS_EXECUTED, \ p5-emms-instructions-executed) \ __PMC_EV(P5, TRANSITIONS_BETWEEN_MMX_AND_FP_INSTRUCTIONS, \ p5-transitions-between-mmx-and-fp-instructions) \ __PMC_EV(P5, BUS_UTILIZATION_DUE_TO_PROCESSOR_ACTIVITY, \ p5-bus-utilization-due-to-processor-activity) \ __PMC_EV(P5, WRITES_TO_NONCACHEABLE_MEMORY, \ p5-writes-to-noncacheable-memory) \ __PMC_EV(P5, SATURATING_MMX_INSTRUCTIONS_EXECUTED, \ p5-saturating-mmx-instructions-executed) \ __PMC_EV(P5, SATURATIONS_PERFORMED, p5-saturations-performed) \ __PMC_EV(P5, NUMBER_OF_CYCLES_NOT_IN_HALT_STATE, \ p5-number-of-cycles-not-in-halt-state) \ __PMC_EV(P5, DATA_CACHE_TLB_MISS_STALL_DURATION, \ p5-data-cache-tlb-miss-stall-duration) \ __PMC_EV(P5, MMX_INSTRUCTION_DATA_READS, \ p5-mmx-instruction-data-reads) \ __PMC_EV(P5, MMX_INSTRUCTION_DATA_READ_MISSES, \ p5-mmx-instruction-data-read-misses) \ __PMC_EV(P5, FLOATING_POINT_STALLS_DURATION, \ 
p5-floating-point-stalls-duration) \ __PMC_EV(P5, TAKEN_BRANCHES, p5-taken-branches) \ __PMC_EV(P5, D1_STARVATION_AND_FIFO_IS_EMPTY, \ p5-d1-starvation-and-fifo-is-empty) \ __PMC_EV(P5, D1_STARVATION_AND_ONLY_ONE_INSTRUCTION_IN_FIFO, \ p5-d1-starvation-and-only-instruction-in-fifo) \ __PMC_EV(P5, MMX_INSTRUCTION_DATA_WRITES, \ p5-mmx-instruction-data-writes) \ __PMC_EV(P5, MMX_INSTRUCTION_DATA_WRITE_MISSES, \ p5-mmx-instruction-data-write-misses) \ __PMC_EV(P5, PIPELINE_FLUSHES_DUE_TO_WRONG_BRANCH_PREDICTIONS, \ p5-pipeline-flushes-due-to-wrong-branch-predictions) \ __PMC_EV(P5, \ PIPELINE_FLUSHES_DUE_TO_WRONG_BRANCH_PREDICTIONS_RESOLVED_IN_WB_STAGE, \ p5-pipeline-flushes-due-to-wrong-branch-predictions-resolved-in-wb-stage) \ __PMC_EV(P5, MISALIGNED_DATA_MEMORY_REFERENCE_ON_MMX_INSTRUCTIONS, \ p5-misaligned-data-memory-reference-on-mmx-instructions) \ __PMC_EV(P5, PIPELINE_STALL_FOR_MMX_INSTRUCTION_DATA_MEMORY_READS, \ p5-pipeline-stall-for-mmx-instruction-data-memory-reads) \ __PMC_EV(P5, MISPREDICTED_OR_UNPREDICTED_RETURNS, \ p5-mispredicted-or-unpredicted-returns) \ __PMC_EV(P5, PREDICTED_RETURNS, p5-predicted-returns) \ __PMC_EV(P5, MMX_MULTIPLY_UNIT_INTERLOCK, \ p5-mmx-multiply-unit-interlock) \ __PMC_EV(P5, MOVD_MOVQ_STORE_STALL_DUE_TO_PREVIOUS_MMX_OPERATION, \ p5-movd-movq-store-stall-due-to-previous-mmx-operation) \ __PMC_EV(P5, RETURNS, p5-returns) \ __PMC_EV(P5, BTB_FALSE_ENTRIES, p5-btb-false-entries) \ __PMC_EV(P5, BTB_MISS_PREDICTION_ON_NOT_TAKEN_BRANCH, \ p5-btb-miss-prediction-on-not-taken-branch) \ __PMC_EV(P5, \ FULL_WRITE_BUFFER_STALL_DURATION_WHILE_EXECUTING_MMX_INSTRUCTIONS, \ p5-full-write-buffer-stall-duration-while-executing-mmx-instructions) \ __PMC_EV(P5, STALL_ON_MMX_INSTRUCTION_WRITE_TO_E_OR_M_STATE_LINE, \ p5-stall-on-mmx-instruction-write-to-e-o-m-state-line) #define PMC_EV_P5_FIRST PMC_EV_P5_DATA_READ #define PMC_EV_P5_LAST \ PMC_EV_P5_STALL_ON_MMX_INSTRUCTION_WRITE_TO_E_OR_M_STATE_LINE /* timestamp counters. */ #define __PMC_EV_TSC() \ __PMC_EV(TSC, TSC, tsc) /* All known PMC events */ #define __PMC_EVENTS() \ __PMC_EV_TSC() \ __PMC_EV_K7() \ __PMC_EV_P6() \ __PMC_EV_P4() \ __PMC_EV_K8() \ __PMC_EV_P5() \ enum pmc_event { #undef __PMC_EV #define __PMC_EV(C,N,D) PMC_EV_ ## C ## _ ## N , __PMC_EVENTS() }; #define PMC_EVENT_FIRST PMC_EV_TSC_TSC #define PMC_EVENT_LAST PMC_EV_P5_LAST /* * Counter capabilities * * __PMC_CAPS(NAME, VALUE, DESCRIPTION) */ #define __PMC_CAPS() \ __PMC_CAP(INTERRUPT, 0, "generate interrupts") \ __PMC_CAP(USER, 1, "count user-mode events") \ __PMC_CAP(SYSTEM, 2, "count system-mode events") \ __PMC_CAP(EDGE, 3, "do edge detection of events") \ __PMC_CAP(THRESHOLD, 4, "ignore events below a threshold") \ __PMC_CAP(READ, 5, "read PMC counter") \ __PMC_CAP(WRITE, 6, "reprogram PMC counter") \ __PMC_CAP(INVERT, 7, "invert comparision sense") \ __PMC_CAP(QUALIFIER, 8, "further qualify monitored events") \ __PMC_CAP(PRECISE, 9, "perform precise sampling") \ __PMC_CAP(TAGGING, 10, "tag upstream events") \ __PMC_CAP(CASCADE, 11, "cascade counters") enum pmc_caps { #undef __PMC_CAP #define __PMC_CAP(NAME, VALUE, DESCR) PMC_CAP_##NAME = (1 << VALUE) , __PMC_CAPS() }; #define PMC_CAP_FIRST PMC_CAP_INTERRUPT #define PMC_CAP_LAST PMC_CAP_CASCADE /* * PMC SYSCALL INTERFACE */ /* * "PMC_OPS" -- these are the commands recognized by the kernel * module, and are used when performing a system call from userland. 
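 *
 * (A side note on the __PMC_EVENTS() X-macro defined earlier: keeping the
 * whole event list in one macro lets it be expanded more than once under
 * different definitions of __PMC_EV().  The sketch below is illustrative
 * only; the pmc_event_names table is not part of this interface, and the
 * sketch assumes 'enum pmc_event' begins at the default value 0:
 *
 *	#undef	__PMC_EV
 *	#define	__PMC_EV(C, N, D)	#D ,
 *	static const char *pmc_event_names[] = {
 *		__PMC_EVENTS()
 *	};
 *
 * With these definitions, pmc_event_names[PMC_EV_K8_DC_MISS] is the string
 * "k8-dc-miss"; the table stays index-aligned with the enumeration because
 * both expand __PMC_EVENTS() in the same order.)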
*/ #define __PMC_OPS() \ __PMC_OP(CONFIGURELOG, "Set log file") \ __PMC_OP(GETCPUINFO, "Get system CPU information") \ __PMC_OP(GETDRIVERSTATS, "Get driver statistics") \ __PMC_OP(GETMODULEVERSION, "Get module version") \ __PMC_OP(GETPMCINFO, "Get per-cpu PMC information") \ __PMC_OP(PMCADMIN, "Set PMC state") \ __PMC_OP(PMCALLOCATE, "Allocate and configure a PMC") \ __PMC_OP(PMCATTACH, "Attach a PMC to a process") \ __PMC_OP(PMCDETACH, "Detach a PMC from a process") \ __PMC_OP(PMCRELEASE, "Release a PMC") \ __PMC_OP(PMCRW, "Read/Set a PMC") \ __PMC_OP(PMCSETCOUNT, "Set initial count/sampling rate") \ __PMC_OP(PMCSTART, "Start a PMC") \ __PMC_OP(PMCSTOP, "Stop a PMC") \ __PMC_OP(WRITELOG, "Write a log file entry") \ __PMC_OP(PMCX86GETMSR, "(x86 architectures) retrieve MSR") enum pmc_ops { #undef __PMC_OP #define __PMC_OP(N, D) PMC_OP_##N, __PMC_OPS() }; /* * Flags used in operations on PMCs. */ #define PMC_F_FORCE 0x00000001 /*OP ADMIN force operation */ #define PMC_F_DESCENDANTS 0x00000002 /*OP ALLOCATE track descendants */ #define PMC_F_LOG_TC_CSW 0x00000004 /*OP ALLOCATE track ctx switches */ #define PMC_F_LOG_TC_PROCEXIT 0x00000008 /*OP ALLOCATE log proc exits */ #define PMC_F_NEWVALUE 0x00000010 /*OP RW write new value */ #define PMC_F_OLDVALUE 0x00000020 /*OP RW get old value */ #define PMC_F_ATTACHED_TO_OWNER 0x00010000 /*attached to owner*/ /* * Cookies used to denote allocated PMCs, and the values of PMCs. */ typedef uint32_t pmc_id_t; typedef uint64_t pmc_value_t; #define PMC_ID_INVALID (~ (pmc_id_t) 0) /* * PMC IDs have the following format: * * +--------+----------+-----------+-----------+ * | CPU | PMC MODE | PMC CLASS | ROW INDEX | * +--------+----------+-----------+-----------+ * * where each field is 8 bits wide. Field 'CPU' is set to the * requested CPU for system-wide PMCs or PMC_CPU_ANY for process-mode * PMCs. Field 'PMC MODE' is the allocated PMC mode. Field 'PMC * CLASS' is the class of the PMC. Field 'ROW INDEX' is the row index * for the PMC. * * The 'ROW INDEX' ranges over 0..NHWPMCS where NHWPMCS is the total * number of hardware PMCs on this cpu. */ #define PMC_ID_TO_ROWINDEX(ID) ((ID) & 0xFF) #define PMC_ID_TO_CLASS(ID) (((ID) & 0xFF00) >> 8) #define PMC_ID_TO_MODE(ID) (((ID) & 0xFF0000) >> 16) #define PMC_ID_TO_CPU(ID) (((ID) & 0xFF000000) >> 24) #define PMC_ID_MAKE_ID(CPU,MODE,CLASS,ROWINDEX) \ ((((CPU) & 0xFF) << 24) | (((MODE) & 0xFF) << 16) | \ (((CLASS) & 0xFF) << 8) | ((ROWINDEX) & 0xFF)) /* * Data structures for system calls supported by the pmc driver. */ /* * OP PMCALLOCATE * * Allocate a PMC on the named CPU. */ #define PMC_CPU_ANY ~0 struct pmc_op_pmcallocate { uint32_t pm_caps; /* PMC_CAP_* */ uint32_t pm_cpu; /* CPU number or PMC_CPU_ANY */ enum pmc_class pm_class; /* class of PMC desired */ enum pmc_event pm_ev; /* [enum pmc_event] desired */ uint32_t pm_flags; /* additional modifiers PMC_F_* */ enum pmc_mode pm_mode; /* desired mode */ pmc_id_t pm_pmcid; /* [return] process pmc id */ /* * Machine dependent extensions */ #if __i386__ uint32_t pm_config1; uint32_t pm_config2; #define pm_amd_config pm_config1 #define pm_p4_cccrconfig pm_config1 #define pm_p4_escrconfig pm_config2 #define pm_p6_config pm_config1 #elif __amd64__ uint32_t pm_k8_config; #define pm_amd_config pm_k8_config #endif }; /* * OP PMCADMIN * * Set the administrative state (i.e., whether enabled or disabled) of * a PMC 'pm_pmc' on CPU 'pm_cpu'. Note that 'pm_pmc' specifies an * absolute PMC number and need not have been first allocated by the * calling process.
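 *
 * (Returning to the PMC ID layout described earlier, here is a short worked
 * example of composing and decomposing an ID.  The particular mode, class
 * and row index are illustrative, and assume the PMC_MODE_TS and
 * PMC_CLASS_K8 constants defined earlier in this header:
 *
 *	pmc_id_t id;
 *
 *	id = PMC_ID_MAKE_ID(PMC_CPU_ANY, PMC_MODE_TS, PMC_CLASS_K8, 3);
 *
 *	PMC_ID_TO_ROWINDEX(id)	yields 3
 *	PMC_ID_TO_CLASS(id)	yields PMC_CLASS_K8
 *	PMC_ID_TO_MODE(id)	yields PMC_MODE_TS
 *	PMC_ID_TO_CPU(id)	yields 0xFF, the low 8 bits of PMC_CPU_ANY
 *
 * Note that each input is masked to 8 bits, so a process-mode PMC allocated
 * with PMC_CPU_ANY reads back as CPU 0xFF rather than ~0.)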
*/ struct pmc_op_pmcadmin { int pm_cpu; /* CPU# */ uint32_t pm_flags; /* flags */ int pm_pmc; /* PMC# */ enum pmc_state pm_state; /* desired state */ }; /* * OP PMCATTACH / OP PMCDETACH * * Attach/detach a PMC and a process. */ struct pmc_op_pmcattach { pmc_id_t pm_pmc; /* PMC to attach to */ pid_t pm_pid; /* target process */ }; /* * OP PMCSETCOUNT * * Set the sampling rate (i.e., the reload count) for statistical counters. * 'pm_pmcid' needs to have been previously allocated using PMCALLOCATE. */ struct pmc_op_pmcsetcount { pmc_value_t pm_count; /* initial/sample count */ pmc_id_t pm_pmcid; /* PMC id to set */ }; /* * OP PMCRW * * Read the value of a PMC named by 'pm_pmcid'. 'pm_pmcid' needs * to have been previously allocated using PMCALLOCATE. */ struct pmc_op_pmcrw { uint32_t pm_flags; /* PMC_F_{OLD,NEW}VALUE*/ pmc_id_t pm_pmcid; /* pmc id */ pmc_value_t pm_value; /* new&returned value */ }; /* * OP GETPMCINFO * * Retrieve PMC state for a named CPU. The caller is expected to * allocate 'npmc' * 'struct pmc_info' bytes of space for the return * values. */ struct pmc_info { char pm_name[PMC_NAME_MAX]; /* pmc name */ enum pmc_class pm_class; /* enum pmc_class */ int pm_enabled; /* whether enabled */ enum pmc_disp pm_rowdisp; /* FREE, THREAD or STANDALONE */ pid_t pm_ownerpid; /* owner, or -1 */ enum pmc_mode pm_mode; /* current mode [enum pmc_mode] */ enum pmc_event pm_event; /* current event */ uint32_t pm_flags; /* current flags */ pmc_value_t pm_reloadcount; /* sampling counters only */ }; struct pmc_op_getpmcinfo { int32_t pm_cpu; /* 0 <= cpu < mp_maxid */ struct pmc_info pm_pmcs[]; /* space for 'npmc' structures */ }; /* * OP GETCPUINFO * * Retrieve system CPU information. */ struct pmc_classinfo { enum pmc_class pm_class; /* class id */ uint32_t pm_caps; /* counter capabilities */ uint32_t pm_width; /* width of the PMC */ }; struct pmc_op_getcpuinfo { enum pmc_cputype pm_cputype; /* what kind of CPU */ uint32_t pm_ncpu; /* number of CPUs */ uint32_t pm_npmc; /* #PMCs per CPU */ uint32_t pm_nclass; /* #classes of PMCs */ struct pmc_classinfo pm_classes[PMC_CLASS_MAX]; }; /* * OP CONFIGURELOG * * Configure a log file for writing system-wide statistics to. */ struct pmc_op_configurelog { int pm_flags; int pm_logfd; /* logfile fd (or -1) */ }; /* * OP GETDRIVERSTATS * * Retrieve pmc(4) driver-wide statistics. */ struct pmc_op_getdriverstats { int pm_intr_ignored; /* #interrupts ignored */ int pm_intr_processed; /* #interrupts processed */ int pm_syscalls; /* #syscalls */ int pm_syscall_errors; /* #syscalls with errors */ }; /* * OP RELEASE / OP START / OP STOP * * Simple operations on a PMC id. */ struct pmc_op_simple { pmc_id_t pm_pmcid; }; #if __i386__ || __amd64__ /* * OP X86_GETMSR * * Retrieve the model specific register associated with the * allocated PMC. This number can be used subsequently with * RDPMC instructions. */ struct pmc_op_x86_getmsr { uint32_t pm_msr; /* MSR for the PMC */ pmc_id_t pm_pmcid; /* allocated pmc id */ }; #endif #ifdef _KERNEL #include #include #define PMC_REQUEST_POOL_SIZE 128 #define PMC_HASH_SIZE 16 #define PMC_PCPU_BUFFER_SIZE 4096 #define PMC_MTXPOOL_SIZE 32 /* * PMC commands */ struct pmc_syscall_args { uint32_t pmop_code; /* one of PMC_OP_* */ void *pmop_data; /* syscall parameter */ }; /* * Interface to processor specific stuff */ /* * struct pmc_descr * * Machine independent (i.e., the common parts) of a human readable * PMC description.
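 *
 * (A brief illustration of how the operation structures above combine with
 * the PMC_F_* request flags.  The variable names are arbitrary; only the
 * structures and flags themselves come from this header.  To overwrite a
 * PMC's value while also fetching the value it held before the write, a
 * caller would prepare:
 *
 *	struct pmc_op_pmcrw rw;
 *
 *	rw.pm_pmcid = id;	an id returned by OP PMCALLOCATE
 *	rw.pm_flags = PMC_F_NEWVALUE | PMC_F_OLDVALUE;
 *	rw.pm_value = 0;	the new count to load
 *
 * and hand it to the driver as the 'pmop_data' of a struct pmc_syscall_args
 * whose 'pmop_code' is PMC_OP_PMCRW; on successful return, 'pm_value' holds
 * the counter's previous value.)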
*/ struct pmc_descr { const char pd_name[PMC_NAME_MAX]; /* name */ uint32_t pd_caps; /* capabilities */ enum pmc_class pd_class; /* class of the PMC */ uint32_t pd_width; /* width in bits */ }; /* * struct pmc_target * * This structure records all the target processes associated with a * PMC. */ struct pmc_target { LIST_ENTRY(pmc_target) pt_next; struct pmc_process *pt_process; /* target descriptor */ }; /* * struct pmc * * Describes each allocated PMC. * * Each PMC has precisely one owner, namely the process that allocated * the PMC. * * A PMC may be attached to multiple target processes. The * 'pm_targets' field links all the target processes being monitored * by this PMC. * * The 'pm_savedvalue' field is protected by a mutex. * * On a multi-cpu machine, multiple target threads associated with a * process-virtual PMC could be concurrently executing on different * CPUs. The 'pm_runcount' field is atomically incremented every time * the PMC gets scheduled on a CPU and atomically decremented when it * get descheduled. Deletion of a PMC is only permitted when this * field is '0'. * */ struct pmc { LIST_HEAD(,pmc_target) pm_targets; /* list of target processes */ /* * System-wide PMCs are allocated on a CPU and are not moved * around. For system-wide PMCs we record the CPU the PMC was * allocated on in the 'CPU' field of the pmc ID. * * Virtual PMCs run on whichever CPU is currently executing * their targets' threads. For these PMCs we need to save * their current PMC counter values when they are taken off * CPU. */ union { pmc_value_t pm_savedvalue; /* Virtual PMCS */ } pm_gv; /* * For sampling mode PMCs, we keep track of the PMC's "reload * count", which is the counter value to be loaded in when * arming the PMC for the next counting session. For counting * modes on PMCs that are read-only (e.g., the x86 TSC), we * keep track of the initial value at the start of * counting-mode operation. */ union { pmc_value_t pm_reloadcount; /* sampling PMC modes */ pmc_value_t pm_initial; /* counting PMC modes */ } pm_sc; uint32_t pm_caps; /* PMC capabilities */ enum pmc_event pm_event; /* event being measured */ uint32_t pm_flags; /* additional flags PMC_F_... */ struct pmc_owner *pm_owner; /* owner thread state */ uint32_t pm_runcount; /* #cpus currently on */ enum pmc_state pm_state; /* current PMC state */ /* * The PMC ID field encodes the row-index for the PMC, its * mode, class and the CPU# associated with the PMC. */ pmc_id_t pm_id; /* allocated PMC id */ /* md extensions */ #if __i386__ union { /* AMD Athlon counters */ struct { uint32_t pm_amd_evsel; } pm_amd; /* Intel P4 counters */ struct { uint32_t pm_p4_cccrvalue; uint32_t pm_p4_escrvalue; uint32_t pm_p4_escr; uint32_t pm_p4_escrmsr; } pm_p4; /* Intel P6 counters */ struct { uint32_t pm_p6_evsel; } pm_p6; } pm_md; #elif __amd64__ union { /* AMD Athlon counters */ struct { uint32_t pm_amd_evsel; } pm_amd; } pm_md; #endif }; /* * Accessor macros for 'struct pmc' */ #define PMC_TO_MODE(P) PMC_ID_TO_MODE((P)->pm_id) #define PMC_TO_CLASS(P) PMC_ID_TO_CLASS((P)->pm_id) #define PMC_TO_ROWINDEX(P) PMC_ID_TO_ROWINDEX((P)->pm_id) #define PMC_TO_CPU(P) PMC_ID_TO_CPU((P)->pm_id) /* * struct pmc_list * * Describes a list of PMCs. */ struct pmc_list { LIST_ENTRY(pmc_list) pl_next; struct pmc *pl_pmc; /* PMC descriptor */ }; /* * struct pmc_process * * Record a 'target' process being profiled. * * The target process being profiled could be different from the owner * process which allocated the PMCs. 
Each target process descriptor * is associated with NHWPMC 'struct pmc *' pointers. Each PMC at a * given hardware row-index 'n' will use slot 'n' of the 'pp_pmcs[]' * array. The size of this structure is thus PMC architecture * dependent. * * TODO: Only process-private counting mode PMCs may be attached to a * process different from the allocator process (since we do not have * the infrastructure to make sense of an interrupted PC value from a * 'target' process (yet)). * */ struct pmc_targetstate { struct pmc *pp_pmc; /* target PMC */ pmc_value_t pp_pmcval; /* per-process value */ }; struct pmc_process { LIST_ENTRY(pmc_process) pp_next; /* hash chain */ int pp_refcnt; /* reference count */ uint32_t pp_flags; /* flags PMC_PP_* */ struct proc *pp_proc; /* target thread */ struct pmc_targetstate pp_pmcs[]; /* NHWPMCs */ }; #define PMC_PP_ENABLE_MSR_ACCESS 0x00000001 /* * struct pmc_owner * * We associate a PMC with an 'owner' process. * * A process can be associated with 0..NCPUS*NHWPMC PMCs during its * lifetime, where NCPUS is the numbers of CPUS in the system and * NHWPMC is the number of hardware PMCs per CPU. These are * maintained in the list headed by the 'po_pmcs' to save on space. * */ struct pmc_owner { LIST_ENTRY(pmc_owner) po_next; /* hash chain */ LIST_HEAD(, pmc_list) po_pmcs; /* list of owned PMCs */ uint32_t po_flags; /* flags PMC_PO_* */ struct proc *po_owner; /* owner proc */ int po_logfd; /* XXX for now */ }; #define PMC_PO_HAS_TS_PMC 0x00000001 #define PMC_PO_OWNS_LOGFILE 0x00000002 /* * struct pmc_hw -- describe the state of the PMC hardware * * When in use, a HW PMC is associated with one allocated 'struct pmc' * pointed to by field 'phw_pmc'. When inactive, this field is NULL. * * On an SMP box, one or more HW PMC's in process virtual mode with * the same 'phw_pmc' could be executing on different CPUs. In order * to handle this case correctly, we need to ensure that only * incremental counts get added to the saved value in the associated * 'struct pmc'. The 'phw_save' field is used to keep the saved PMC * value at the time the hardware is started during this context * switch (i.e., the difference between the new (hardware) count and * the saved count is atomically added to the count field in 'struct * pmc' at context switch time). * */ struct pmc_hw { uint32_t phw_state; /* see PHW_* macros below */ struct pmc *phw_pmc; /* current thread PMC */ }; #define PMC_PHW_RI_MASK 0x000000FF #define PMC_PHW_CPU_SHIFT 8 #define PMC_PHW_CPU_MASK 0x0000FF00 #define PMC_PHW_FLAGS_SHIFT 16 #define PMC_PHW_FLAGS_MASK 0xFFFF0000 #define PMC_PHW_INDEX_TO_STATE(ri) ((ri) & PMC_PHW_RI_MASK) #define PMC_PHW_STATE_TO_INDEX(state) ((state) & PMC_PHW_RI_MASK) #define PMC_PHW_CPU_TO_STATE(cpu) (((cpu) << PMC_PHW_CPU_SHIFT) & \ PMC_PHW_CPU_MASK) #define PMC_PHW_STATE_TO_CPU(state) (((state) & PMC_PHW_CPU_MASK) >> \ PMC_PHW_CPU_SHIFT) #define PMC_PHW_FLAGS_TO_STATE(flags) (((flags) << PMC_PHW_FLAGS_SHIFT) & \ PMC_PHW_FLAGS_MASK) #define PMC_PHW_STATE_TO_FLAGS(state) (((state) & PMC_PHW_FLAGS_MASK) >> \ PMC_PHW_FLAGS_SHIFT) #define PMC_PHW_FLAG_IS_ENABLED (PMC_PHW_FLAGS_TO_STATE(0x01)) #define PMC_PHW_FLAG_IS_SHAREABLE (PMC_PHW_FLAGS_TO_STATE(0x02)) /* * struct pmc_cpustate * * A CPU is modelled as a collection of HW PMCs with space for additional * flags. 
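 *
 * (The 'phw_state' word defined above packs a row index, a CPU number and
 * flag bits into a single 32-bit value.  A short illustration, with
 * arbitrary values, of building and querying such a state word using the
 * PMC_PHW_* macros:
 *
 *	uint32_t state;
 *
 *	state = PMC_PHW_INDEX_TO_STATE(2) | PMC_PHW_CPU_TO_STATE(1) |
 *	    PMC_PHW_FLAG_IS_ENABLED;
 *
 *	PMC_PHW_STATE_TO_INDEX(state)	yields 2
 *	PMC_PHW_STATE_TO_CPU(state)	yields 1
 *	state & PMC_PHW_FLAG_IS_ENABLED	is non-zero
 *
 * The 'pc_state' field below is packed along the same lines using the
 * PMC_PCPU_* macros.)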
*/ struct pmc_cpu { uint32_t pc_state; /* physical cpu number + flags */ struct pmc_hw *pc_hwpmcs[]; /* 'npmc' pointers */ /* other machine dependent fields come here */ }; #define PMC_PCPU_CPU_MASK 0x000000FF #define PMC_PCPU_FLAGS_MASK 0xFFFFFF00 #define PMC_PCPU_FLAGS_SHIFT 8 #define PMC_PCPU_STATE_TO_CPU(S) ((S) & PMC_PCPU_CPU_MASK) #define PMC_PCPU_STATE_TO_FLAGS(S) (((S) & PMC_PCPU_FLAGS_MASK) >> PMC_PCPU_FLAGS_SHIFT) #define PMC_PCPU_FLAGS_TO_STATE(F) (((F) << PMC_PCPU_FLAGS_SHIFT) & PMC_PCPU_FLAGS_MASK) #define PMC_PCPU_CPU_TO_STATE(C) ((C) & PMC_PCPU_CPU_MASK) #define PMC_PCPU_FLAG_HTT (PMC_PCPU_FLAGS_TO_STATE(0x1)) /* * struct pmc_binding * * CPU binding information. */ struct pmc_binding { int pb_bound; /* is bound? */ int pb_cpu; /* if so, to which CPU */ }; /* * struct pmc_mdep * * Machine dependent bits needed per CPU type. */ struct pmc_mdep { uint32_t pmd_cputype; /* from enum pmc_cputype */ uint32_t pmd_npmc; /* max PMCs per CPU */ uint32_t pmd_nclass; /* # PMC classes supported */ struct pmc_classinfo pmd_classes[PMC_CLASS_MAX]; int pmd_nclasspmcs[PMC_CLASS_MAX]; /* * Methods */ int (*pmd_init)(int _cpu); /* machine dependent initialization */ int (*pmd_cleanup)(int _cpu); /* machine dependent cleanup */ /* thread context switch in/out */ int (*pmd_switch_in)(struct pmc_cpu *_p, struct pmc_process *_pp); int (*pmd_switch_out)(struct pmc_cpu *_p, struct pmc_process *_pp); /* configuring/reading/writing the hardware PMCs */ int (*pmd_config_pmc)(int _cpu, int _ri, struct pmc *_pm); int (*pmd_get_config)(int _cpu, int _ri, struct pmc **_ppm); int (*pmd_read_pmc)(int _cpu, int _ri, pmc_value_t *_value); int (*pmd_write_pmc)(int _cpu, int _ri, pmc_value_t _value); /* pmc allocation/release */ int (*pmd_allocate_pmc)(int _cpu, int _ri, struct pmc *_t, const struct pmc_op_pmcallocate *_a); int (*pmd_release_pmc)(int _cpu, int _ri, struct pmc *_pm); /* starting and stopping PMCs */ int (*pmd_start_pmc)(int _cpu, int _ri); int (*pmd_stop_pmc)(int _cpu, int _ri); /* handle a PMC interrupt */ - int (*pmd_intr)(int _cpu, uintptr_t _pc); + int (*pmd_intr)(int _cpu, uintptr_t _pc, int _usermode); int (*pmd_describe)(int _cpu, int _ri, struct pmc_info *_pi, struct pmc **_ppmc); /* Machine dependent methods */ #if __i386__ || __amd64__ int (*pmd_get_msr)(int _ri, uint32_t *_msr); #endif }; /* * Per-CPU state. This is an array of 'mp_ncpu' pointers * to struct pmc_cpu descriptors. */ extern struct pmc_cpu **pmc_pcpu; /* driver statistics */ extern struct pmc_op_getdriverstats pmc_stats; #if DEBUG /* debug flags */ extern unsigned int pmc_debugflags; /* [Maj:12bits] [Min:16bits] [level:4] */ #define PMC_DEBUG_DEFAULT_FLAGS 0 #define PMC_DEBUG_STRSIZE 128 #define __PMCDFMAJ(M) (1 << (PMC_DEBUG_MAJ_##M+20)) #define __PMCDFMIN(M) (1 << (PMC_DEBUG_MIN_##M+4)) #define __PMCDF(M,N) (__PMCDFMAJ(M) | __PMCDFMIN(N)) #define PMCDBG(M,N,L,F,...) 
do { \ if (((pmc_debugflags & __PMCDF(M,N)) == __PMCDF(M,N)) && \ ((pmc_debugflags & 0xF) > (L))) \ printf(#M ":" #N ": " F "\n", __VA_ARGS__); \ } while (0) /* Major numbers */ #define PMC_DEBUG_MAJ_MOD 0 /* misc module infrastructure */ #define PMC_DEBUG_MAJ_PMC 1 /* pmc management */ #define PMC_DEBUG_MAJ_CTX 2 /* context switches */ #define PMC_DEBUG_MAJ_OWN 3 /* owner */ #define PMC_DEBUG_MAJ_PRC 4 /* processes */ #define PMC_DEBUG_MAJ_MDP 5 /* machine dependent */ #define PMC_DEBUG_MAJ_CPU 6 /* cpu switches */ /* Minor numbers */ /* Common (8 bits) */ #define PMC_DEBUG_MIN_ALL 0 /* allocation */ #define PMC_DEBUG_MIN_REL 1 /* release */ #define PMC_DEBUG_MIN_OPS 2 /* ops: start, stop, ... */ #define PMC_DEBUG_MIN_INI 3 /* init */ #define PMC_DEBUG_MIN_FND 4 /* find */ /* MODULE */ #define PMC_DEBUG_MIN_PMH 14 /* pmc_hook */ #define PMC_DEBUG_MIN_PMS 15 /* pmc_syscall */ /* OWN */ #define PMC_DEBUG_MIN_ORM 8 /* owner remove */ #define PMC_DEBUG_MIN_OMR 9 /* owner maybe remove */ /* PROCESSES */ #define PMC_DEBUG_MIN_TLK 8 /* link target */ #define PMC_DEBUG_MIN_TUL 9 /* unlink target */ #define PMC_DEBUG_MIN_EXT 10 /* process exit */ #define PMC_DEBUG_MIN_EXC 11 /* process exec */ #define PMC_DEBUG_MIN_FRK 12 /* process fork */ #define PMC_DEBUG_MIN_ATT 13 /* attach/detach */ /* CONTEXT SWITCHES */ #define PMC_DEBUG_MIN_SWI 8 /* switch in */ #define PMC_DEBUG_MIN_SWO 9 /* switch out */ /* PMC */ #define PMC_DEBUG_MIN_REG 8 /* pmc register */ #define PMC_DEBUG_MIN_ALR 9 /* allocate row */ /* MACHINE DEPENDENT LAYER */ #define PMC_DEBUG_MIN_REA 8 /* read */ #define PMC_DEBUG_MIN_WRI 9 /* write */ #define PMC_DEBUG_MIN_CFG 10 /* config */ #define PMC_DEBUG_MIN_STA 11 /* start */ #define PMC_DEBUG_MIN_STO 12 /* stop */ #define PMC_DEBUG_MIN_INT 13 /* interrupts */ /* CPU */ #define PMC_DEBUG_MIN_BND 8 /* bind */ #define PMC_DEBUG_MIN_SEL 9 /* select */ #else #define PMCDBG(M,N,L,F,...) /* nothing */ #endif /* declare a dedicated memory pool */ MALLOC_DECLARE(M_PMC); /* * Functions */ void pmc_update_histogram(struct pmc_hw *phw, uintptr_t pc); void pmc_send_signal(struct pmc *pmc); int pmc_getrowdisp(int ri); #endif /* _KERNEL */ #endif /* _SYS_PMC_H_ */ diff --git a/sys/sys/pmckern.h b/sys/sys/pmckern.h index 3c11172554de..7a222da3b4dc 100644 --- a/sys/sys/pmckern.h +++ b/sys/sys/pmckern.h @@ -1,93 +1,102 @@ /*- - * Copyright (c) 2003, Joseph Koshy + * Copyright (c) 2003-2005, Joseph Koshy * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* * PMC interface used by the base kernel. */ #ifndef _SYS_PMCKERN_H_ #define _SYS_PMCKERN_H_ #include #include #include #include #include -#define PMC_FN_PROCESS_EXIT 1 -#define PMC_FN_PROCESS_EXEC 2 -#define PMC_FN_PROCESS_FORK 3 -#define PMC_FN_CSW_IN 4 -#define PMC_FN_CSW_OUT 5 +#define PMC_FN_PROCESS_EXEC 1 +#define PMC_FN_CSW_IN 2 +#define PMC_FN_CSW_OUT 3 +#define PMC_FN_DO_SAMPLES 4 + +#define PMC_FN_PROCESS_EXIT 5 /* obsolete */ +#define PMC_FN_PROCESS_FORK 6 /* obsolete */ /* hook */ extern int (*pmc_hook)(struct thread *_td, int _function, void *_arg); -extern int (*pmc_intr)(int cpu, uintptr_t pc); +extern int (*pmc_intr)(int _cpu, uintptr_t _pc, int _usermode); /* SX lock protecting the hook */ extern struct sx pmc_sx; -/* hook invocation; for use within the kernel */ +/* Per-cpu flags indicating availability of sampling data */ +extern cpumask_t pmc_cpumask; + +/* Hook invocation; for use within the kernel */ #define PMC_CALL_HOOK(t, cmd, arg) \ do { \ sx_slock(&pmc_sx); \ if (pmc_hook != NULL) \ (pmc_hook)((t), (cmd), (arg)); \ sx_sunlock(&pmc_sx); \ } while (0) -/* hook invocation that needs an exclusive lock */ +/* Hook invocation that needs an exclusive lock */ #define PMC_CALL_HOOK_X(t, cmd, arg) \ do { \ sx_xlock(&pmc_sx); \ if (pmc_hook != NULL) \ (pmc_hook)((t), (cmd), (arg)); \ sx_xunlock(&pmc_sx); \ } while (0) -/* context switches cannot take locks */ -#define PMC_SWITCH_CONTEXT(t, cmd) \ +/* + * Some hook invocations (e.g., from context switch and clock handling + * code) need to be lock-free. + */ +#define PMC_CALL_HOOK_UNLOCKED(t, cmd, arg) \ do { \ if (pmc_hook != NULL) \ - (pmc_hook)((t), (cmd), NULL); \ + (pmc_hook)((t), (cmd), (arg)); \ } while (0) +#define PMC_SWITCH_CONTEXT(t,cmd) PMC_CALL_HOOK_UNLOCKED(t,cmd,NULL) -/* - * check if a process is using HWPMCs. - */ - +/* Check if a process is using HWPMCs.*/ #define PMC_PROC_IS_USING_PMCS(p) \ (__predict_false(atomic_load_acq_int(&(p)->p_flag) & \ P_HWPMC)) +/* Check if a CPU has recorded samples. */ +#define PMC_CPU_HAS_SAMPLES(C) (__predict_false(pmc_cpumask & (1 << (C)))) + /* helper functions */ int pmc_cpu_is_disabled(int _cpu); int pmc_cpu_is_logical(int _cpu); #endif /* _SYS_PMCKERN_H_ */
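
To show how the sampling additions to pmckern.h are meant to fit together, here is a minimal sketch of a periodic, per-CPU code path that drains queued samples. It is illustrative only: the function sample_drain_hook() and its call site are assumptions, not part of this change; PMC_CPU_HAS_SAMPLES(), PMC_CALL_HOOK_UNLOCKED() and PMC_FN_DO_SAMPLES are the interfaces defined above.

	#include <sys/param.h>
	#include <sys/proc.h>
	#include <sys/pmckern.h>

	/*
	 * Hypothetical per-CPU periodic routine.  It may be reached from
	 * clock handling code, so it uses the lock-free hook invocation
	 * and only does work when the interrupt path (reached through
	 * the 'pmc_intr' function pointer) has flagged this CPU in
	 * pmc_cpumask.
	 */
	static void
	sample_drain_hook(struct thread *td, int cpu)
	{
		if (PMC_CPU_HAS_SAMPLES(cpu))
			PMC_CALL_HOOK_UNLOCKED(td, PMC_FN_DO_SAMPLES, NULL);
	}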