Index: sys/dev/hwpmc/hwpmc_mod.c =================================================================== --- sys/dev/hwpmc/hwpmc_mod.c +++ sys/dev/hwpmc/hwpmc_mod.c @@ -195,8 +195,11 @@ #endif static int load(struct module *module, int cmd, void *arg); +static int pmc_add_normal_sample(int cpu, int ring, struct pmc *pm, + struct trapframe *tf, int inuserspace, uint32_t count); static void pmc_add_thread_descriptors_from_proc(struct proc *p, struct pmc_process *pp); +static void pmc_add_useronly_sample(int cpu, struct pmc *pm); static int pmc_attach_process(struct proc *p, struct pmc *pm); static struct pmc *pmc_allocate_pmc_descriptor(void); static struct pmc_owner *pmc_allocate_owner_descriptor(struct proc *p); @@ -204,7 +207,7 @@ static int pmc_can_allocate_rowindex(struct proc *p, unsigned int ri, int cpu); static int pmc_can_attach(struct pmc *pm, struct proc *p); -static void pmc_capture_user_callchain(int cpu, int soft, struct trapframe *tf); +static int pmc_capture_user_callchain(int cpu, int soft, struct trapframe *tf); static void pmc_cleanup(void); static int pmc_detach_process(struct proc *p, struct pmc *pm); static int pmc_detach_one_process(struct proc *p, struct pmc *pm, @@ -236,6 +239,7 @@ static void pmc_release_pmc_descriptor(struct pmc *pmc); static void pmc_process_thread_add(struct thread *td); static void pmc_process_thread_delete(struct thread *td); +static void pmc_process_thread_userret(struct thread *td); static void pmc_remove_owner(struct pmc_owner *po); static void pmc_remove_process_descriptor(struct pmc_process *pp); static void pmc_restore_cpu_binding(struct pmc_binding *pb); @@ -898,6 +902,9 @@ KASSERT(pt_td->pt_pmcs[ri].pt_pmcval == (pmc_value_t) 0, ("[pmc,%d] pt_pmcval not cleared for pid=%d at " "ri=%d", __LINE__, pp->pp_proc->p_pid, ri)); + KASSERT(pt_td->pt_pmcs[ri].pt_pendusamples == 0, + ("[pmc,%d] pt_pendusamples not cleared for pid=%d " + "at ri=%d", __LINE__, pp->pp_proc->p_pid, ri)); } mtx_unlock_spin(pp->pp_tdslock); } @@ 
-940,8 +947,10 @@
 	/* Clear the per-thread values at this row index. */
 	if (PMC_TO_MODE(pm) == PMC_MODE_TS) {
 		mtx_lock_spin(pp->pp_tdslock);
-		LIST_FOREACH(pt, &pp->pp_tds, pt_next)
+		LIST_FOREACH(pt, &pp->pp_tds, pt_next) {
 			pt->pt_pmcs[ri].pt_pmcval = (pmc_value_t) 0;
+			pt->pt_pmcs[ri].pt_pendusamples = 0;
+		}
 		mtx_unlock_spin(pp->pp_tdslock);
 	}
 
@@ -1360,6 +1369,8 @@
 			    ("[pmc,%d] No thread found for td=%p", __LINE__,
 			    td));
 
+			phw->phw_pmcthread = pt;
+
 			mtx_pool_lock_spin(pmc_mtxpool, pm);
 
 			/*
@@ -1444,6 +1455,7 @@
 	struct proc *p;
 	enum pmc_mode mode;
 	struct pmc_cpu *pc;
+	struct pmc_hw *phw;
 	pmc_value_t newvalue;
 	unsigned int adjri, ri;
 	struct pmc_process *pp;
@@ -1512,6 +1524,8 @@
 		    ("[pmc,%d] ri mismatch pmc(%d) ri(%d)",
 			__LINE__, PMC_TO_ROWINDEX(pm), ri));
 
+		phw = pc->pc_hwpmcs[ri];
+
 		/*
 		 * Change desired state, and then stop if not stalled.
 		 * This two-step dance should avoid race conditions where
@@ -1547,8 +1561,7 @@
 				    cpu, ri, newvalue);
 
 				if (pt == NULL)
-					pt = pmc_find_thread_descriptor(pp, td,
-					    PMC_FLAG_NONE);
+					pt = phw->phw_pmcthread;
 
 				KASSERT(pt != NULL,
 				    ("[pmc,%d] No thread found for td=%p",
@@ -1660,6 +1673,95 @@
 }
 
 /*
+ * A userret() call for a thread: queue its deferred user-space samples.
+ */
+static void
+pmc_process_thread_userret(struct thread *td)
+{
+	struct pmc *pm;
+	struct pmc_process *pp;
+	struct pmc_thread *pmc_td;
+	struct pmc_classdep *pcd;
+	uint32_t num_samples;
+	int adjri, cpu, needast, ri;
+
+	needast = 0;
+
+	/* Find our process and thread; nothing to do if PMCs don't track us. */
+	pp = pmc_find_process_descriptor(td->td_proc, PMC_FLAG_NONE);
+
+	if (pp == NULL)
+		return;
+
+	pmc_td = pmc_find_thread_descriptor(pp, td, PMC_FLAG_NONE);
+
+	if (pmc_td == NULL)
+		return;
+
+	/* We may touch td->td_flags and td->td_pflags. */
+	thread_lock(td);
+
+	/* Don't switch CPUs while we are processing samples. */
+	critical_enter();
+
+	cpu = td->td_oncpu;
+
+	KASSERT(cpu >= 0 && cpu < pmc_cpu_max(),
+	    ("[pmc,%d] wierd CPU id %d", __LINE__, cpu));
+
+	/*
+	 * Loop through the current PMCs, looking for any that care about
+	 * our process.
+	 */
+	for (ri = 0; ri < md->pmd_npmc; ri++) {
+		pcd = pmc_ri_to_classdep(md, ri, &adjri);
+		KASSERT(pcd != NULL,
+		    ("[pmc,%d] null pcd ri=%d", __LINE__, ri));
+		(void) (*pcd->pcd_get_config)(cpu,adjri,&pm);
+
+		if (pm == NULL || (pm->pm_flags & PMC_F_USERCALLCHAIN) == 0)
+			continue;
+
+#ifdef INVARIANTS
+		{
+			struct pmc_target *pt;
+			int found = FALSE;
+
+			LIST_FOREACH(pt, &pm->pm_targets, pt_next) {
+				if (pt->pt_process->pp_proc == td->td_proc) {
+					found = TRUE;
+					break;
+				}
+			}
+
+			KASSERT(found,
+			    ("[pmc,%d] called to process user call chain "
+			    "for td=%p and found pm=%p not for our process",
+			    __LINE__, td, pm));
+		}
+#endif
+
+		/* Atomically claim the samples deferred by the NMI handler. */
+		num_samples = atomic_readandclear_32(
+		    &pmc_td->pt_pmcs[ri].pt_pendusamples);
+		if (num_samples == 0)
+			continue;
+
+		/* Queue one PMC_UR entry that stands for all of them. */
+		if (pmc_add_normal_sample(cpu, PMC_UR, pm, NULL, TRUE,
+		    num_samples) == 0)
+			needast++;
+	}
+
+	critical_exit();
+
+	if (needast > 0)
+		td->td_flags |= TDF_ASTPENDING;
+
+	thread_unlock(td);
+}
+
+/*
  * A mapping change for a process.
*/ @@ -1982,6 +2084,7 @@ "SOFTSAMPLING", "THR-CREATE", "THR-EXIT", + "THR-USERRET", }; #endif @@ -1989,6 +2092,9 @@ pmc_hook_handler(struct thread *td, int function, void *arg) { int cpu; +#ifdef INVARIANTS + int ncallchains; +#endif PMCDBG4(MOD,PMH,1, "hook td=%p func=%d \"%s\" arg=%p", td, function, pmc_hooknames[function], arg); @@ -2144,6 +2250,7 @@ CPU_CLR_ATOMIC(cpu, &pmc_cpumask); pmc_process_samples(cpu, PMC_HR); pmc_process_samples(cpu, PMC_SR); + pmc_process_samples(cpu, PMC_UR); break; case PMC_FN_MMAP: @@ -2163,8 +2270,22 @@ KASSERT(td == curthread, ("[pmc,%d] td != curthread", __LINE__)); - pmc_capture_user_callchain(PCPU_GET(cpuid), PMC_HR, + cpu = PCPU_GET(cpuid); +#ifdef INVARIANTS + ncallchains = +#endif + pmc_capture_user_callchain(cpu, PMC_HR, (struct trapframe *) arg); + + KASSERT(ncallchains > 0, + ("[pmc,%d] cpu %d didn't find a sample to collect", + __LINE__, cpu)); + + KASSERT(td->td_pinned == 1, + ("[pmc,%d] invalid td_pinned value", __LINE__)); + + sched_unpin(); /* Can migrate safely now. */ + td->td_pflags &= ~TDP_CALLCHAIN; break; @@ -2174,8 +2295,28 @@ */ KASSERT(td == curthread, ("[pmc,%d] td != curthread", __LINE__)); - pmc_capture_user_callchain(PCPU_GET(cpuid), PMC_SR, + + cpu = PCPU_GET(cpuid); +#ifdef INVARIANTS + ncallchains = +#endif + pmc_capture_user_callchain(cpu, PMC_SR, (struct trapframe *) arg); +#ifdef INVARIANTS + ncallchains += +#endif + pmc_capture_user_callchain(cpu, PMC_UR, + (struct trapframe *) arg); + + KASSERT(ncallchains > 0, + ("[pmc,%d] cpu %d didn't find a sample to collect", + __LINE__, cpu)); + + KASSERT(td->td_pinned == 1, + ("[pmc,%d] invalid td_pinned value", __LINE__)); + + sched_unpin(); /* Can migrate safely now. 
*/ + td->td_pflags &= ~TDP_CALLCHAIN; break; @@ -2196,6 +2337,12 @@ pmc_process_thread_delete(td); break; + case PMC_FN_THR_USERRET: + KASSERT(td == curthread, ("[pmc,%d] td != curthread", + __LINE__)); + pmc_process_thread_userret(td); + break; + default: #ifdef HWPMC_DEBUG KASSERT(0, ("[pmc,%d] unknown hook %d\n", __LINE__, function)); @@ -3711,7 +3858,22 @@ */ if ((pa.pm_flags & ~(PMC_F_DESCENDANTS | PMC_F_LOG_PROCCSW | - PMC_F_LOG_PROCEXIT | PMC_F_CALLCHAIN)) != 0) { + PMC_F_LOG_PROCEXIT | PMC_F_CALLCHAIN | + PMC_F_USERCALLCHAIN)) != 0) { + error = EINVAL; + break; + } + + /* PMC_F_USERCALLCHAIN is only valid with PMC_F_CALLCHAIN */ + if ((pa.pm_flags & (PMC_F_CALLCHAIN | PMC_F_USERCALLCHAIN)) == + PMC_F_USERCALLCHAIN) { + error = EINVAL; + break; + } + + /* PMC_F_USERCALLCHAIN is only valid with PMC_MODE_TS */ + if ((pa.pm_flags & PMC_F_USERCALLCHAIN) && + mode != PMC_MODE_TS) { error = EINVAL; break; } @@ -4443,8 +4605,6 @@ } /* - * Interrupt processing. - * * Find a free slot in the per-cpu array of samples and capture the * current callchain there. If a sample was successfully added, a bit * is set in mask 'pmc_cpumask' denoting that the DO_SAMPLES hook @@ -4454,9 +4614,9 @@ * use any of the locking primitives supplied by the OS. */ -int -pmc_process_interrupt(int cpu, int ring, struct pmc *pm, struct trapframe *tf, - int inuserspace) +static int +pmc_add_normal_sample(int cpu, int ring, struct pmc *pm, struct trapframe *tf, + int inuserspace, uint32_t count) { int error, callchaindepth; struct thread *td; @@ -4526,6 +4686,7 @@ } ps->ps_nsamples = callchaindepth; /* mark entry as in use */ + ps->ps_count = count; /* increment write pointer, modulo ring buffer size */ ps++; @@ -4543,12 +4704,78 @@ } /* + * Interrupt processing for a user call chain while in kernel space. + * + * Find the thread descriptor and increment the number of samples we + * have deferred while in kernel space. 
Just before the return to user-
+ * space, we will capture the callchain to record these.
+ *
+ * This function is meant to be called from an NMI handler.  It cannot
+ * use any of the locking primitives supplied by the OS.  However, we
+ * can use the pmc_thread pointer we saved during context switch in.
+ */
+
+static void
+pmc_add_useronly_sample(int cpu, struct pmc *pm)
+{
+	struct pmc_thread *pmc_td;
+	struct thread *td;
+	int ri;
+	uint32_t num_outstanding = 0;
+
+	ri = PMC_TO_ROWINDEX(pm);
+
+	/* Ignore user call chain requests without a valid process. */
+	if ((td = curthread) == NULL || td->td_proc == NULL) {
+		PMCDBG4(SAM,INT,1,"(notd) cpu=%d pm=%p um=%d n=%u", cpu, pm, 0,
+		    num_outstanding);
+		return;
+	}
+
+	/* Get the saved PMC thread descriptor. */
+	pmc_td = pmc_pcpu[cpu]->pc_hwpmcs[ri]->phw_pmcthread;
+
+	if (pmc_td == NULL) {
+		PMCDBG4(SAM,INT,1,"(nopmctd) cpu=%d pm=%p um=%d n=%u", cpu, pm,
+		    0, num_outstanding);
+		return;
+	}
+
+	/* Increment the number of pending user-space samples. */
+	num_outstanding = atomic_fetchadd_32(
+	    &pmc_td->pt_pmcs[ri].pt_pendusamples, 1) + 1;
+
+	PMCDBG4(SAM,INT,1,"(uspc) cpu=%d pm=%p um=%d n=%u", cpu, pm, 0,
+	    num_outstanding);
+
+	return;
+}
+
+/*
+ * Interrupt processing.  Kernel-mode hits on a PMC_F_USERCALLCHAIN PMC are
+ * only counted (and 0 is returned); all other samples are recorded now.
+ * This function is meant to be called from an NMI handler.  It cannot
+ * use any of the locking primitives supplied by the OS.
+ */
+
+int
+pmc_process_interrupt(int cpu, int ring, struct pmc *pm, struct trapframe *tf,
+    int inuserspace)
+{
+	if ((pm->pm_flags & PMC_F_USERCALLCHAIN) != 0 && !inuserspace) {
+		pmc_add_useronly_sample(cpu, pm);
+		return (0);
+	}
+	return (pmc_add_normal_sample(cpu, ring, pm, tf, inuserspace, 1));
+}
+
+/*
  * Capture a user call chain.  This function will be called from ast()
  * before control returns to userland and before the process gets
  * rescheduled.
*/ -static void +static int pmc_capture_user_callchain(int cpu, int ring, struct trapframe *tf) { struct pmc *pm; @@ -4614,18 +4841,15 @@ ps = psb->ps_samples; } while (ps != ps_end); - KASSERT(ncallchains > 0, - ("[pmc,%d] cpu %d didn't find a sample to collect", __LINE__, - cpu)); - - KASSERT(td->td_pinned == 1, - ("[pmc,%d] invalid td_pinned value", __LINE__)); - sched_unpin(); /* Can migrate safely now. */ - /* mark CPU as needing processing */ CPU_SET_ATOMIC(cpu, &pmc_cpumask); - return; +#ifdef INVARIANTS + /* The return value only matters when INVARIANTS is defined. */ + return (ncallchains); +#else + return (0); +#endif } /* @@ -4683,28 +4907,27 @@ (int) (psb->ps_write - psb->ps_samples), (int) (psb->ps_read - psb->ps_samples)); - /* - * If this is a process-mode PMC that is attached to - * its owner, and if the PC is in user mode, update - * profiling statistics like timer-based profiling - * would have done. - */ - if (pm->pm_flags & PMC_F_ATTACHED_TO_OWNER) { - if (ps->ps_flags & PMC_CC_F_USERSPACE) { - td = FIRST_THREAD_IN_PROC(po->po_owner); - addupc_intr(td, ps->ps_pc[0], 1); - } - goto entrydone; + while (ps->ps_count-- != 0) { + /* + * If this is a process-mode PMC that is attached to + * its owner, and if the PC is in user mode, update + * profiling statistics like timer-based profiling + * would have done. + * + * Otherwise, this is either a sampling-mode PMC that + * is attached to a different process than its owner, + * or a system-wide sampling PMC. Dispatch a log + * entry to the PMC's owner process. + */ + if (pm->pm_flags & PMC_F_ATTACHED_TO_OWNER) { + if (ps->ps_flags & PMC_CC_F_USERSPACE) { + td = FIRST_THREAD_IN_PROC(po->po_owner); + addupc_intr(td, ps->ps_pc[0], 1); + } + } else + pmclog_process_callchain(pm, ps); } - /* - * Otherwise, this is either a sampling mode PMC that - * is attached to a different process than its owner, - * or a system-wide sampling PMC. Dispatch a log - * entry to the PMC's owner process. 
- */ - pmclog_process_callchain(pm, ps); - entrydone: ps->ps_nsamples = 0; /* mark entry as free */ atomic_subtract_rel_int(&pm->pm_runcount, 1); @@ -5309,6 +5532,24 @@ (n * pmc_callchaindepth); pmc_pcpu[cpu]->pc_sb[PMC_SR] = sb; + + sb = malloc(sizeof(struct pmc_samplebuffer) + + pmc_nsamples * sizeof(struct pmc_sample), M_PMC, + M_WAITOK|M_ZERO); + sb->ps_read = sb->ps_write = sb->ps_samples; + sb->ps_fence = sb->ps_samples + pmc_nsamples; + + KASSERT(pmc_pcpu[cpu] != NULL, + ("[pmc,%d] cpu=%d Null per-cpu data", __LINE__, cpu)); + + sb->ps_callchains = malloc(pmc_callchaindepth * pmc_nsamples * + sizeof(uintptr_t), M_PMC, M_WAITOK|M_ZERO); + + for (n = 0, ps = sb->ps_samples; n < pmc_nsamples; n++, ps++) + ps->ps_pc = sb->ps_callchains + + (n * pmc_callchaindepth); + + pmc_pcpu[cpu]->pc_sb[PMC_UR] = sb; } /* allocate space for the row disposition array */ @@ -5524,10 +5765,15 @@ KASSERT(pmc_pcpu[cpu]->pc_sb[PMC_SR] != NULL, ("[pmc,%d] Null sw cpu sample buffer cpu=%d", __LINE__, cpu)); + KASSERT(pmc_pcpu[cpu]->pc_sb[PMC_UR] != NULL, + ("[pmc,%d] Null userret cpu sample buffer cpu=%d", __LINE__, + cpu)); free(pmc_pcpu[cpu]->pc_sb[PMC_HR]->ps_callchains, M_PMC); free(pmc_pcpu[cpu]->pc_sb[PMC_HR], M_PMC); free(pmc_pcpu[cpu]->pc_sb[PMC_SR]->ps_callchains, M_PMC); free(pmc_pcpu[cpu]->pc_sb[PMC_SR], M_PMC); + free(pmc_pcpu[cpu]->pc_sb[PMC_UR]->ps_callchains, M_PMC); + free(pmc_pcpu[cpu]->pc_sb[PMC_UR], M_PMC); free(pmc_pcpu[cpu], M_PMC); } Index: sys/kern/subr_trap.c =================================================================== --- sys/kern/subr_trap.c +++ sys/kern/subr_trap.c @@ -132,6 +132,11 @@ */ if (p->p_flag & P_PROFIL) addupc_task(td, TRAPF_PC(frame), td->td_pticks * psratio); + +#ifdef HWPMC_HOOKS + if (PMC_PROC_IS_USING_PMCS(p)) + PMC_CALL_HOOK(td, PMC_FN_THR_USERRET, NULL); +#endif /* * Let the scheduler adjust our priority etc. 
*/ Index: sys/sys/pmc.h =================================================================== --- sys/sys/pmc.h +++ sys/sys/pmc.h @@ -343,6 +343,7 @@ #define PMC_F_KGMON 0x00000040 /*OP ALLOCATE kgmon(8) profiling */ /* V2 API */ #define PMC_F_CALLCHAIN 0x00000080 /*OP ALLOCATE capture callchains */ +#define PMC_F_USERCALLCHAIN 0x00000100 /*OP ALLOCATE use userspace stack */ /* internal flags */ #define PMC_F_ATTACHED_TO_OWNER 0x00010000 /*attached to owner*/ @@ -750,7 +751,8 @@ * Record per-PMC, per-thread state. */ struct pmc_threadpmcstate { - pmc_value_t pt_pmcval; /* per-thread reload count */ + pmc_value_t pt_pmcval; /* per-thread reload count */ + uint32_t pt_pendusamples; /* pending user-space samples */ }; /* @@ -846,8 +848,9 @@ */ struct pmc_hw { - uint32_t phw_state; /* see PHW_* macros below */ - struct pmc *phw_pmc; /* current thread PMC */ + uint32_t phw_state; /* see PHW_* macros below */ + struct pmc *phw_pmc; /* current thread PMC */ + struct pmc_thread *phw_pmcthread; /* current thread state */ }; #define PMC_PHW_RI_MASK 0x000000FF @@ -883,6 +886,7 @@ struct thread *ps_td; /* which thread */ struct pmc *ps_pmc; /* interrupting PMC */ uintptr_t *ps_pc; /* (const) callchain start */ + uint32_t ps_count; /* number of entries */ }; #define PMC_SAMPLE_FREE ((uint16_t) 0) @@ -906,7 +910,7 @@ struct pmc_cpu { uint32_t pc_state; /* physical cpu number + flags */ - struct pmc_samplebuffer *pc_sb[2]; /* space for samples */ + struct pmc_samplebuffer *pc_sb[3]; /* space for samples */ struct pmc_hw *pc_hwpmcs[]; /* 'npmc' pointers */ }; @@ -1152,7 +1156,7 @@ struct pmc_mdep *pmc_md_initialize(void); /* MD init function */ void pmc_md_finalize(struct pmc_mdep *_md); /* MD fini function */ int pmc_getrowdisp(int _ri); -int pmc_process_interrupt(int _cpu, int _soft, struct pmc *_pm, +int pmc_process_interrupt(int _cpu, int _ring, struct pmc *_pm, struct trapframe *_tf, int _inuserspace); int pmc_save_kernel_callchain(uintptr_t *_cc, int _maxsamples, struct 
trapframe *_tf); Index: sys/sys/pmckern.h =================================================================== --- sys/sys/pmckern.h +++ sys/sys/pmckern.h @@ -60,9 +60,11 @@ #define PMC_FN_SOFT_SAMPLING 11 #define PMC_FN_THR_CREATE 12 #define PMC_FN_THR_EXIT 13 +#define PMC_FN_THR_USERRET 14 #define PMC_HR 0 /* Hardware ring buffer */ #define PMC_SR 1 /* Software ring buffer */ +#define PMC_UR 2 /* userret ring buffer */ struct pmckern_procexec { int pm_credentialschanged; Index: usr.sbin/pmcstat/pmcstat.8 =================================================================== --- usr.sbin/pmcstat/pmcstat.8 +++ usr.sbin/pmcstat/pmcstat.8 @@ -45,6 +45,7 @@ .Op Fl R Ar logfilename .Op Fl S Ar event-spec .Op Fl T +.Op Fl U .Op Fl W .Op Fl a Ar pathname .Op Fl c Ar cpu-spec @@ -217,6 +218,10 @@ to delta mode, 'm' merge PMCs, 'n' change view, 'p' show next PMC, ' ' pause, 'q' quit. calltree only: 'f' cost under threshold is seen as a dot. +.It Fl U +Toggle capturing user-space call traces while in kernel mode. +The default is for sampling PMCs to capture user-space callchain information +while in user-space mode, and kernel callchain information while in kernel mode. .It Fl W Toggle logging the incremental counts seen by the threads of a tracked process each time they are scheduled on a CPU. 
Index: usr.sbin/pmcstat/pmcstat.c =================================================================== --- usr.sbin/pmcstat/pmcstat.c +++ usr.sbin/pmcstat/pmcstat.c @@ -557,7 +557,7 @@ int option, npmc; int c, check_driver_stats, current_sampling_count; int do_callchain, do_descendants, do_logproccsw, do_logprocexit; - int do_print, do_read; + int do_print, do_read, do_userspace; size_t len; int graphdepth; int pipefd[2], rfd; @@ -580,6 +580,7 @@ do_descendants = 0; do_logproccsw = 0; do_logprocexit = 0; + do_userspace = 0; use_cumulative_counts = 0; graphfilename = "-"; args.pa_required = 0; @@ -628,7 +629,7 @@ CPU_COPY(&rootmask, &cpumask); while ((option = getopt(argc, argv, - "CD:EF:G:M:NO:P:R:S:TWa:c:def:gk:l:m:n:o:p:qr:s:t:vw:z:")) != -1) + "CD:EF:G:M:NO:P:R:S:TUWa:c:def:gk:l:m:n:o:p:qr:s:t:vw:z:")) != -1) switch (option) { case 'a': /* Annotate + callgraph */ args.pa_flags |= FLAG_DO_ANNOTATE; @@ -779,8 +780,11 @@ ev->ev_cpu = PMC_CPU_ANY; ev->ev_flags = 0; - if (do_callchain) + if (do_callchain) { ev->ev_flags |= PMC_F_CALLCHAIN; + if (do_userspace && option == 'P') + ev->ev_flags |= PMC_F_USERCALLCHAIN; + } if (do_descendants) ev->ev_flags |= PMC_F_DESCENDANTS; if (do_logprocexit) @@ -872,6 +876,11 @@ args.pa_printfile = stdout; break; + case 'U': /* toggle user-space callchain capture */ + do_userspace = !do_userspace; + args.pa_required |= FLAG_HAS_SAMPLING_PMCS; + break; + case 'v': /* verbose */ args.pa_verbosity++; break;