diff --git a/sys/amd64/pt/pt.c b/sys/amd64/pt/pt.c
index d96da1c1ac17..c7b75767680a 100644
--- a/sys/amd64/pt/pt.c
+++ b/sys/amd64/pt/pt.c
@@ -1,977 +1,978 @@
/*
 * Copyright (c) 2025 Bojan Novković
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */

/*
 * hwt(4) Intel Processor Trace (PT) backend
 *
 * Driver Design Overview
 *
 * - Since PT is configured on a per-core basis, the driver uses
 *   'smp_rendezvous' to start and stop tracing on each target core.
 * - PT-specific resources are stored in a 'struct pt_ctx' context structure
 *   for each traced CPU core or thread. Upon initialization, a ToPA
 *   configuration is generated for each 'pt_ctx' structure using the HWT
 *   tracing buffers. The HWT tracing buffer is split into 4K ToPA entries.
 *   Currently, each 4K ToPA entry is configured to trigger an interrupt after
 *   it is filled.
 * - The PT driver uses the XSAVE/XRSTOR PT extensions to load and save all
 *   relevant PT registers. Every time a traced thread is switched
 *   out or in, its state will be saved to or loaded from its corresponding
 *   'pt_ctx' context.
 * - When tracing starts, the PT hardware will start writing data into the
 *   tracing buffer. When a TOPA_INT entry is filled, it will trigger an
 *   interrupt before continuing. The interrupt handler will then fetch the
 *   last valid tracing buffer offset and enqueue a HWT_RECORD_BUFFER record.
 *   The driver is currently configured to use the NMI interrupt line.
 * - The userspace PT backend waits for incoming HWT_RECORD_BUFFER records
 *   and uses the offsets to decode data from the tracing buffer.
 *
 * Future improvements and limitations
 *
 * - We currently configure the PT hardware to trigger an interrupt whenever
 *   a 4K ToPA entry is filled. While this is fine when tracing smaller
 *   functions or infrequent code paths, this will generate too much interrupt
 *   traffic when tracing hotter functions. A proper solution for this issue
 *   should estimate the amount of data generated by the current configuration
 *   and use it to determine interrupt frequency.
 *
 * - Support for more tracing options and PT features.
 */

#include
#include
#include
+#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

#ifdef PT_DEBUG
#define dprintf(fmt, ...) printf(fmt, ##__VA_ARGS__)
#else
#define dprintf(fmt, ...)
#endif

#define PT_SUPPORTED_FLAGS                                          \
    (RTIT_CTL_MTCEN | RTIT_CTL_CR3FILTER | RTIT_CTL_DIS_TNT |       \
    RTIT_CTL_USER | RTIT_CTL_OS | RTIT_CTL_BRANCHEN)
#define PT_XSAVE_MASK       (XFEATURE_ENABLED_X87 | XFEATURE_ENABLED_SSE)
#define PT_XSTATE_BV        (PT_XSAVE_MASK | XFEATURE_ENABLED_PT)
#define PT_MAX_IP_RANGES    2

#define PT_TOPA_MASK_PTRS   0x7f
#define PT_TOPA_PAGE_MASK   0xffffff80
#define PT_TOPA_PAGE_SHIFT  7

#define CPUID_PT_LEAF       0x14

MALLOC_DEFINE(M_PT, "pt", "Intel Processor Trace");

SDT_PROVIDER_DEFINE(pt);
SDT_PROBE_DEFINE(pt, , , topa__intr);

TASKQUEUE_FAST_DEFINE_THREAD(pt);

static void pt_send_buffer_record(void *arg, int pending __unused);
static int pt_topa_intr(struct trapframe *tf);

/*
 * Intel Processor Trace XSAVE-managed state.
 */
struct pt_ext_area {
    uint64_t rtit_ctl;
    uint64_t rtit_output_base;
    uint64_t rtit_output_mask_ptrs;
    uint64_t rtit_status;
    uint64_t rtit_cr3_match;
    uint64_t rtit_addr0_a;
    uint64_t rtit_addr0_b;
    uint64_t rtit_addr1_a;
    uint64_t rtit_addr1_b;
};
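/*
 * Layout of a per-context 'save_area' (illustrative sketch; the actual
 * offsets are CPU-dependent and are discovered via CPUID by pt_supported()):
 *
 *    save_area + 0                      -> legacy x87/SSE region
 *    save_area + xstate_hdr_offset      -> struct xstate_hdr
 *    save_area + pt_xsave_offset        -> struct pt_ext_area
 *
 * The accessors pt_ctx_get_xstate_hdr() and pt_ctx_get_ext_area() below
 * resolve the last two regions.
 */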
struct pt_buffer {
    uint64_t *topa_hw;    /* ToPA table entries. */
    size_t size;
    struct mtx lock;      /* Lock for fields below. */
    vm_offset_t offset;
    uint64_t wrap_count;
    int curpage;
};

struct pt_ctx {
    int id;
    struct pt_buffer buf;        /* ToPA buffer metadata */
    struct task task;            /* ToPA buffer notification task */
    struct hwt_context *hwt_ctx;
    uint8_t *save_area;          /* PT XSAVE area */
};

/* PT tracing contexts used for CPU mode. */
static struct pt_ctx *pt_pcpu_ctx;

enum pt_cpu_state {
    PT_DISABLED = 0,
    PT_STOPPED,
    PT_ACTIVE
};

static struct pt_cpu {
    struct pt_ctx *ctx;      /* active PT tracing context */
    enum pt_cpu_state state; /* used as part of trace stop protocol */
} *pt_pcpu;

/*
 * PT-related CPUID bits.
 */
static struct pt_cpu_info {
    uint32_t l0_eax;
    uint32_t l0_ebx;
    uint32_t l0_ecx;
    uint32_t l1_eax;
    uint32_t l1_ebx;
    size_t xsave_area_size;
    size_t xstate_hdr_offset;
    size_t pt_xsave_offset;
} pt_info __read_mostly;

static bool initialized = false;
static int cpu_mode_ctr = 0;

static __inline enum pt_cpu_state
pt_cpu_get_state(int cpu_id)
{
    return (atomic_load_int(&pt_pcpu[cpu_id].state));
}

static __inline void
pt_cpu_set_state(int cpu_id, enum pt_cpu_state state)
{
    atomic_store_int(&pt_pcpu[cpu_id].state, state);
}

static __inline struct xstate_hdr *
pt_ctx_get_xstate_hdr(struct pt_ctx *ctx)
{
    return ((struct xstate_hdr *)(ctx->save_area +
        pt_info.xstate_hdr_offset));
}

static __inline struct pt_ext_area *
pt_ctx_get_ext_area(struct pt_ctx *ctx)
{
    return ((struct pt_ext_area *)(ctx->save_area +
        pt_info.pt_xsave_offset));
}

/*
 * Updates current trace buffer offset from the
 * ToPA MSRs. Records if the trace buffer wrapped.
 */
static __inline void
pt_update_buffer(struct pt_buffer *buf)
{
    uint64_t reg;
    int curpage;

    /* Update buffer offset. */
    reg = rdmsr(MSR_IA32_RTIT_OUTPUT_MASK_PTRS);
    curpage = (reg & PT_TOPA_PAGE_MASK) >> PT_TOPA_PAGE_SHIFT;
    mtx_lock_spin(&buf->lock);
    /* Check if the output wrapped. */
    if (buf->curpage > curpage)
        buf->wrap_count++;
    buf->curpage = curpage;
    buf->offset = reg >> 32;
    mtx_unlock_spin(&buf->lock);

    dprintf("%s: wrap_cnt: %lu, curpage: %d, offset: %zu\n", __func__,
        buf->wrap_count, buf->curpage, buf->offset);
}

static __inline void
pt_fill_buffer_record(int id, struct pt_buffer *buf,
    struct hwt_record_entry *rec)
{
    rec->record_type = HWT_RECORD_BUFFER;
    rec->buf_id = id;
    rec->curpage = buf->curpage;
    rec->offset = buf->offset + (buf->wrap_count * buf->size);
}

/*
 * Enables or disables tracing on curcpu
 * using the XSAVE/XRSTOR PT extensions.
 */
static void
pt_cpu_toggle_local(uint8_t *save_area, bool enable)
{
    u_long xcr0, cr0;
    u_long xss;

    cr0 = rcr0();
    if (cr0 & CR0_TS)
        clts();
    xcr0 = rxcr(XCR0);
    if ((xcr0 & PT_XSAVE_MASK) != PT_XSAVE_MASK)
        load_xcr(XCR0, xcr0 | PT_XSAVE_MASK);
    xss = rdmsr(MSR_IA32_XSS);
    wrmsr(MSR_IA32_XSS, xss | XFEATURE_ENABLED_PT);

    if (!enable) {
        KASSERT((rdmsr(MSR_IA32_RTIT_CTL) & RTIT_CTL_TRACEEN) != 0,
            ("%s: PT is disabled", __func__));
        xsaves(save_area, XFEATURE_ENABLED_PT);
    } else {
        KASSERT((rdmsr(MSR_IA32_RTIT_CTL) & RTIT_CTL_TRACEEN) == 0,
            ("%s: PT is enabled", __func__));
        xrstors(save_area, XFEATURE_ENABLED_PT);
    }
    wrmsr(MSR_IA32_XSS, xss);
    if ((xcr0 & PT_XSAVE_MASK) != PT_XSAVE_MASK)
        load_xcr(XCR0, xcr0);
    if (cr0 & CR0_TS)
        load_cr0(cr0);
}

/*
 * Starts PT tracing on 'curcpu'.
 */
static void
pt_cpu_start(void *dummy)
{
    struct pt_cpu *cpu;

    cpu = &pt_pcpu[curcpu];
    MPASS(cpu->ctx != NULL);

    dprintf("%s: curcpu %d\n", __func__, curcpu);
    load_cr4(rcr4() | CR4_XSAVE);
    wrmsr(MSR_IA32_RTIT_STATUS, 0);
    pt_cpu_set_state(curcpu, PT_ACTIVE);
    pt_cpu_toggle_local(cpu->ctx->save_area, true);
}
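/*
 * Example (sketch): pt_cpu_start() and pt_cpu_stop() always run on the CPU
 * they act upon. For remote CPUs they are dispatched via
 * smp_rendezvous_cpus(), as done by pt_backend_enable_smp() and
 * pt_backend_disable_smp() below:
 *
 *    smp_rendezvous_cpus(ctx->cpu_map, NULL, pt_cpu_start, NULL, NULL);
 */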
/*
 * Stops PT tracing on 'curcpu'.
 * Updates trace buffer offset to ensure
 * any data generated between the last interrupt
 * and the trace stop gets picked up by userspace.
 */
static void
pt_cpu_stop(void *dummy)
{
    struct pt_cpu *cpu;
    struct pt_ctx *ctx;

    /* Shutdown may occur before PT gets properly configured. */
    if (pt_cpu_get_state(curcpu) == PT_DISABLED)
        return;

    cpu = &pt_pcpu[curcpu];
    ctx = cpu->ctx;
    MPASS(ctx != NULL);
    dprintf("%s: curcpu %d\n", __func__, curcpu);

    pt_cpu_set_state(curcpu, PT_STOPPED);
    pt_cpu_toggle_local(cpu->ctx->save_area, false);
    pt_update_buffer(&ctx->buf);
}

/*
 * Prepares the Table of Physical Addresses (ToPA) metadata for 'pt_ctx'.
 * The HWT trace buffer is split into 4K ToPA table entries and used
 * as a circular buffer, meaning that the last ToPA entry points to
 * the first ToPA entry. Each entry is configured to raise an
 * interrupt after being filled.
 */
static int
pt_topa_prepare(struct pt_ctx *ctx, struct hwt_vm *vm)
{
    struct pt_buffer *buf;
    size_t topa_size;
    int i;

    topa_size = TOPA_SIZE_4K;
    buf = &ctx->buf;

    KASSERT(buf->topa_hw == NULL,
        ("%s: ToPA info already exists", __func__));
    buf->topa_hw = mallocarray(vm->npages + 1, sizeof(uint64_t), M_PT,
        M_ZERO | M_WAITOK);
    dprintf("%s: ToPA virt addr %p\n", __func__, buf->topa_hw);
    buf->size = vm->npages * PAGE_SIZE;
    for (i = 0; i < vm->npages; i++) {
        buf->topa_hw[i] = VM_PAGE_TO_PHYS(vm->pages[i]) | topa_size;
        /*
         * XXX: TOPA_INT should ideally be set according to
         * expected amount of incoming trace data. Too few TOPA_INT
         * entries will not trigger interrupts often enough when tracing
         * smaller functions.
         */
        buf->topa_hw[i] |= TOPA_INT;
    }
    buf->topa_hw[vm->npages] = (uint64_t)vtophys(buf->topa_hw) | TOPA_END;

    return (0);
}
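/*
 * Resulting ToPA table for an 'n'-page trace buffer (illustrative):
 *
 *    topa_hw[0]     = phys(page 0)     | TOPA_SIZE_4K | TOPA_INT
 *    ...
 *    topa_hw[n - 1] = phys(page n - 1) | TOPA_SIZE_4K | TOPA_INT
 *    topa_hw[n]     = phys(topa_hw)    | TOPA_END
 *
 * The final TOPA_END entry points back at the table itself, which is what
 * makes the output buffer circular.
 */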
/*
 * Configures IP filtering for trace generation.
 * A maximum of 2 ranges can be specified due to
 * limitations imposed by the XSAVE/XRSTOR PT extensions.
 */
static int
pt_configure_ranges(struct pt_ctx *ctx, struct pt_cpu_config *cfg)
{
    struct pt_ext_area *pt_ext;
    int nranges_supp, n, error = 0;

    pt_ext = pt_ctx_get_ext_area(ctx);
    if (pt_info.l0_ebx & CPUPT_IPF) {
        nranges_supp = (pt_info.l1_eax & CPUPT_NADDR_M) >>
            CPUPT_NADDR_S;

        if (nranges_supp > PT_IP_FILTER_MAX_RANGES)
            nranges_supp = PT_IP_FILTER_MAX_RANGES;
        n = cfg->nranges;
        if (n > nranges_supp) {
            printf("%s: %d IP filtering ranges requested, CPU "
                "supports %d, truncating\n", __func__, n, nranges_supp);
            n = nranges_supp;
        }

        switch (n) {
        case 2:
            pt_ext->rtit_ctl |= (1UL << RTIT_CTL_ADDR_CFG_S(1));
            pt_ext->rtit_addr1_a = cfg->ip_ranges[1].start;
            pt_ext->rtit_addr1_b = cfg->ip_ranges[1].end;
            /* FALLTHROUGH */
        case 1:
            pt_ext->rtit_ctl |= (1UL << RTIT_CTL_ADDR_CFG_S(0));
            pt_ext->rtit_addr0_a = cfg->ip_ranges[0].start;
            pt_ext->rtit_addr0_b = cfg->ip_ranges[0].end;
            break;
        default:
            error = EINVAL;
            break;
        }
    } else
        error = ENXIO;

    return (error);
}

static int
pt_init_ctx(struct pt_ctx *pt_ctx, struct hwt_vm *vm, int ctx_id)
{

    dprintf("%s: ctx id %d\n", __func__, ctx_id);

    KASSERT(pt_ctx->buf.topa_hw == NULL,
        ("%s: active ToPA buffer in context %p\n", __func__, pt_ctx));

    memset(pt_ctx, 0, sizeof(struct pt_ctx));
    mtx_init(&pt_ctx->buf.lock, "pttopa", NULL, MTX_SPIN);
    pt_ctx->save_area = malloc_aligned(pt_info.xsave_area_size, 64,
        M_PT, M_NOWAIT | M_ZERO);
    if (pt_ctx->save_area == NULL)
        return (ENOMEM);
    dprintf("%s: preparing ToPA buffer\n", __func__);
    if (pt_topa_prepare(pt_ctx, vm) != 0) {
        dprintf("%s: failed to prepare ToPA buffer\n", __func__);
        free(pt_ctx->save_area, M_PT);
        return (ENOMEM);
    }

    pt_ctx->id = ctx_id;
    TASK_INIT(&pt_ctx->task, 0, pt_send_buffer_record, pt_ctx);

    return (0);
}

static void
pt_deinit_ctx(struct pt_ctx *pt_ctx)
{

    if (pt_ctx->buf.topa_hw != NULL)
        free(pt_ctx->buf.topa_hw, M_PT);
    if (pt_ctx->save_area != NULL)
        free(pt_ctx->save_area, M_PT);
    memset(pt_ctx, 0, sizeof(*pt_ctx));
    pt_ctx->buf.topa_hw = NULL;
}
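/*
 * Example configuration consumed by pt_backend_configure() below
 * (illustrative sketch only; the range values are made up):
 *
 *    struct pt_cpu_config cfg = {
 *        .rtit_ctl = RTIT_CTL_BRANCHEN | RTIT_CTL_USER,
 *        .nranges = 1,
 *    };
 *    cfg.ip_ranges[0].start = start_of_traced_code;
 *    cfg.ip_ranges[0].end = end_of_traced_code;
 *
 * Unsupported rtit_ctl bits are masked off against PT_SUPPORTED_FLAGS, and
 * the IP ranges are translated into RTIT_ADDRn_A/B values by
 * pt_configure_ranges() above.
 */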
/*
 * HWT backend configuration method.
 *
 * Checks and translates the user-defined configuration to a
 * set of PT tracing features. Uses the feature set to initialize
 * the tracing context for the target CPU or thread.
 */
static int
pt_backend_configure(struct hwt_context *ctx, int cpu_id, int thread_id)
{
    struct hwt_cpu *hwt_cpu;
    struct hwt_thread *thr;
    struct pt_ctx *pt_ctx;
    struct pt_cpu_config *cfg;
    struct pt_ext_area *pt_ext;
    struct xstate_hdr *hdr;
    int error;

    dprintf("%s\n", __func__);

    cfg = (struct pt_cpu_config *)ctx->config;
    pt_ctx = NULL;

    /* Clear any flags we don't support yet. */
    cfg->rtit_ctl &= PT_SUPPORTED_FLAGS;
    if (cfg->rtit_ctl & RTIT_CTL_MTCEN) {
        if ((pt_info.l0_ebx & CPUPT_MTC) == 0) {
            printf("%s: CPU does not support generating MTC "
                "packets\n", __func__);
            return (ENXIO);
        }
    }

    if (cfg->rtit_ctl & RTIT_CTL_CR3FILTER) {
        if ((pt_info.l0_ebx & CPUPT_CR3) == 0) {
            printf("%s: CPU does not support CR3 filtering\n",
                __func__);
            return (ENXIO);
        }
    }

    if (cfg->rtit_ctl & RTIT_CTL_DIS_TNT) {
        if ((pt_info.l0_ebx & CPUPT_DIS_TNT) == 0) {
            printf("%s: CPU does not support TNT\n", __func__);
            return (ENXIO);
        }
    }
    /* TODO: support for more config bits. */

    if (ctx->mode == HWT_MODE_CPU) {
        TAILQ_FOREACH(hwt_cpu, &ctx->cpus, next) {
            if (hwt_cpu->cpu_id != cpu_id)
                continue;
            pt_ctx = &pt_pcpu_ctx[cpu_id];
            break;
        }
    } else {
        TAILQ_FOREACH(thr, &ctx->threads, next) {
            if (thr->thread_id != thread_id)
                continue;
            KASSERT(thr->private != NULL,
                ("%s: hwt thread private"
                " not set, thr %p", __func__, thr));
            pt_ctx = (struct pt_ctx *)thr->private;
            break;
        }
    }
    if (pt_ctx == NULL)
        return (ENOENT);

    dprintf("%s: preparing MSRs\n", __func__);
    pt_ext = pt_ctx_get_ext_area(pt_ctx);
    hdr = pt_ctx_get_xstate_hdr(pt_ctx);

    pt_ext->rtit_ctl |= cfg->rtit_ctl;
    if (cfg->nranges != 0) {
        dprintf("%s: preparing IPF ranges\n", __func__);
        if ((error = pt_configure_ranges(pt_ctx, cfg)) != 0)
            return (error);
    }

    pt_ctx->hwt_ctx = ctx;
    pt_ext->rtit_ctl |= RTIT_CTL_TOPA;
    pt_ext->rtit_output_base = (uint64_t)vtophys(pt_ctx->buf.topa_hw);
    pt_ext->rtit_output_mask_ptrs = PT_TOPA_MASK_PTRS;
    hdr->xstate_bv = XFEATURE_ENABLED_PT;
    hdr->xstate_xcomp_bv = XFEATURE_ENABLED_PT | XSTATE_XCOMP_BV_COMPACT;
    pt_ext->rtit_ctl |= RTIT_CTL_TRACEEN;
    pt_pcpu[cpu_id].ctx = pt_ctx;
    pt_cpu_set_state(cpu_id, PT_STOPPED);

    return (0);
}

/*
 * hwt backend trace start operation. CPU affine.
 */
static void
pt_backend_enable(struct hwt_context *ctx, int cpu_id)
{

    if (ctx->mode == HWT_MODE_CPU)
        return;

    KASSERT(curcpu == cpu_id,
        ("%s: attempting to start PT on another cpu", __func__));
    pt_cpu_start(NULL);
    CPU_SET(cpu_id, &ctx->cpu_map);
}

/*
 * hwt backend trace stop operation. CPU affine.
 */
static void
pt_backend_disable(struct hwt_context *ctx, int cpu_id)
{
    struct pt_cpu *cpu;

    if (ctx->mode == HWT_MODE_CPU)
        return;

    KASSERT(curcpu == cpu_id,
        ("%s: attempting to disable PT on another cpu", __func__));
    pt_cpu_stop(NULL);
    CPU_CLR(cpu_id, &ctx->cpu_map);
    cpu = &pt_pcpu[cpu_id];
    cpu->ctx = NULL;
}

/*
 * hwt backend trace start operation for remote CPUs.
 */
static int
pt_backend_enable_smp(struct hwt_context *ctx)
{

    dprintf("%s\n", __func__);
    if (ctx->mode == HWT_MODE_CPU &&
        atomic_swap_32(&cpu_mode_ctr, 1) != 0)
        return (-1);

    KASSERT(ctx->mode == HWT_MODE_CPU,
        ("%s: should only be used for CPU mode", __func__));
    smp_rendezvous_cpus(ctx->cpu_map, NULL, pt_cpu_start, NULL, NULL);

    return (0);
}

/*
 * hwt backend trace stop operation for remote CPUs.
 */
static int
pt_backend_disable_smp(struct hwt_context *ctx)
{

    dprintf("%s\n", __func__);
    if (ctx->mode == HWT_MODE_CPU &&
        atomic_swap_32(&cpu_mode_ctr, 0) == 0)
        return (-1);

    if (CPU_EMPTY(&ctx->cpu_map)) {
        dprintf("%s: empty cpu map\n", __func__);
        return (-1);
    }
    smp_rendezvous_cpus(ctx->cpu_map, NULL, pt_cpu_stop, NULL, NULL);

    return (0);
}
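/*
 * Note: in HWT_MODE_CPU the atomic swap on 'cpu_mode_ctr' above makes the
 * SMP enable/disable paths idempotent: a second enable, or a disable without
 * a prior enable, bails out with -1 instead of re-running the rendezvous.
 */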
/*
 * HWT backend initialization method.
 *
 * Initializes the tracing contexts used for HWT_MODE_CPU. The ToPA
 * interrupt handler itself is installed once at module load time, in
 * pt_init().
 */
static int
pt_backend_init(struct hwt_context *ctx)
{
    struct hwt_cpu *hwt_cpu;
    int error;

    dprintf("%s\n", __func__);
    if (ctx->mode == HWT_MODE_CPU) {
        TAILQ_FOREACH(hwt_cpu, &ctx->cpus, next) {
            error = pt_init_ctx(&pt_pcpu_ctx[hwt_cpu->cpu_id],
                hwt_cpu->vm, hwt_cpu->cpu_id);
            if (error)
                return (error);
        }
    }

    return (0);
}

/*
 * HWT backend teardown method.
 *
 * Stops tracing on all active CPUs and releases all previously
 * allocated ToPA metadata and tracing contexts.
 */
static int
pt_backend_deinit(struct hwt_context *ctx)
{
    struct pt_ctx *pt_ctx;
    struct hwt_thread *thr;
    int cpu_id;

    dprintf("%s\n", __func__);

    pt_backend_disable_smp(ctx);
    if (ctx->mode == HWT_MODE_THREAD) {
        TAILQ_FOREACH(thr, &ctx->threads, next) {
            KASSERT(thr->private != NULL,
                ("%s: thr->private not set", __func__));
            pt_ctx = (struct pt_ctx *)thr->private;
            pt_deinit_ctx(pt_ctx);
        }
    } else {
        CPU_FOREACH(cpu_id) {
            if (!CPU_ISSET(cpu_id, &ctx->cpu_map))
                continue;
            if (pt_pcpu[cpu_id].ctx != NULL) {
                KASSERT(pt_pcpu[cpu_id].ctx ==
                    &pt_pcpu_ctx[cpu_id],
                    ("%s: CPU mode tracing with non-cpu mode PT "
                    "context active", __func__));
                pt_pcpu[cpu_id].ctx = NULL;
            }
            pt_ctx = &pt_pcpu_ctx[cpu_id];
            pt_deinit_ctx(pt_ctx);
            memset(&pt_pcpu[cpu_id], 0, sizeof(struct pt_cpu));
        }
    }

    return (0);
}

/*
 * Fetches current offset into the tracing buffer.
 */
static int
pt_backend_read(struct hwt_vm *vm, int *curpage, vm_offset_t *curpage_offset,
    uint64_t *data)
{
    struct pt_buffer *buf;

    if (vm->ctx->mode == HWT_MODE_THREAD)
        buf = &((struct pt_ctx *)vm->thr->private)->buf;
    else
        buf = &pt_pcpu[vm->cpu->cpu_id].ctx->buf;
    mtx_lock_spin(&buf->lock);
    *curpage = buf->curpage;
    *curpage_offset = buf->offset + (buf->wrap_count * vm->ctx->bufsize);
    mtx_unlock_spin(&buf->lock);

    return (0);
}

/*
 * HWT thread creation hook.
 * Allocates and associates a 'struct pt_ctx' for a given hwt thread.
 */
static int
pt_backend_alloc_thread(struct hwt_thread *thr)
{
    struct pt_ctx *pt_ctx;
    int error;

    /* Omit M_WAITOK since this might get invoked from a non-sleepable context. */
    pt_ctx = malloc(sizeof(*pt_ctx), M_PT, M_NOWAIT | M_ZERO);
    if (pt_ctx == NULL)
        return (ENOMEM);

    error = pt_init_ctx(pt_ctx, thr->vm, thr->thread_id);
    if (error) {
        free(pt_ctx, M_PT);
        return (error);
    }

    thr->private = pt_ctx;
    return (0);
}

/*
 * HWT thread teardown hook.
 */
static void
pt_backend_free_thread(struct hwt_thread *thr)
{
    struct pt_ctx *ctx;

    ctx = (struct pt_ctx *)thr->private;

    pt_deinit_ctx(ctx);
    free(ctx, M_PT);
}

static void
pt_backend_dump(int cpu_id)
{
}

static struct hwt_backend_ops pt_ops = {
    .hwt_backend_init = pt_backend_init,
    .hwt_backend_deinit = pt_backend_deinit,
    .hwt_backend_configure = pt_backend_configure,
    .hwt_backend_enable = pt_backend_enable,
    .hwt_backend_disable = pt_backend_disable,
#ifdef SMP
    .hwt_backend_enable_smp = pt_backend_enable_smp,
    .hwt_backend_disable_smp = pt_backend_disable_smp,
#endif
    .hwt_backend_read = pt_backend_read,
    .hwt_backend_dump = pt_backend_dump,
    .hwt_backend_thread_alloc = pt_backend_alloc_thread,
    .hwt_backend_thread_free = pt_backend_free_thread,
};

static struct hwt_backend backend = {
    .ops = &pt_ops,
    .name = "pt",
    .kva_req = 1,
};

/*
 * Reads the latest valid trace buffer offset and enqueues
 * a HWT_RECORD_BUFFER record.
 * Used as a taskqueue routine from the ToPA interrupt handler.
 */
static void
pt_send_buffer_record(void *arg, int pending __unused)
{
    struct hwt_record_entry record;
    struct pt_ctx *ctx = (struct pt_ctx *)arg;

    /* Prepare buffer record. */
    mtx_lock_spin(&ctx->buf.lock);
    pt_fill_buffer_record(ctx->id, &ctx->buf, &record);
    mtx_unlock_spin(&ctx->buf.lock);
    hwt_record_ctx(ctx->hwt_ctx, &record, M_ZERO | M_NOWAIT);
}

static void
pt_topa_status_clear(void)
{
    uint64_t reg;

    reg = rdmsr(MSR_IA_GLOBAL_STATUS_RESET);
    reg &= ~GLOBAL_STATUS_FLAG_TRACETOPAPMI;
    reg |= GLOBAL_STATUS_FLAG_TRACETOPAPMI;
    wrmsr(MSR_IA_GLOBAL_STATUS_RESET, reg);
}
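/*
 * Design note: the ToPA PMI handled below is delivered on the NMI line (see
 * pt_init()), so the handler does as little work as possible in interrupt
 * context. Record creation is deferred to 'taskqueue_pt' through
 * pt_send_buffer_record() above, which only takes the buffer spin lock and
 * enqueues the record with M_NOWAIT.
 */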
/*
 * ToPA PMI handler.
 *
 * Invoked every time a ToPA entry marked with TOPA_INT is filled.
 * Uses a taskqueue to enqueue a buffer record for userspace.
 * Re-enables the performance counter interrupt line as long as tracing
 * is active.
 */
static int
pt_topa_intr(struct trapframe *tf)
{
    struct pt_buffer *buf;
    struct pt_ctx *ctx;
    uint64_t reg;

    SDT_PROBE0(pt, , , topa__intr);

    if (pt_cpu_get_state(curcpu) != PT_ACTIVE)
        return (0);

    reg = rdmsr(MSR_IA_GLOBAL_STATUS);
    if ((reg & GLOBAL_STATUS_FLAG_TRACETOPAPMI) == 0) {
        /* ACK spurious or leftover interrupt. */
        pt_topa_status_clear();
        return (1);
    }

    ctx = pt_pcpu[curcpu].ctx;
    buf = &ctx->buf;
    KASSERT(buf->topa_hw != NULL,
        ("%s: ToPA PMI interrupt with invalid buffer", __func__));

    pt_cpu_toggle_local(ctx->save_area, false);
    pt_update_buffer(buf);
    pt_topa_status_clear();
    taskqueue_enqueue_flags(taskqueue_pt, &ctx->task,
        TASKQUEUE_FAIL_IF_PENDING);

    if (pt_cpu_get_state(curcpu) == PT_ACTIVE) {
        pt_cpu_toggle_local(ctx->save_area, true);
        lapic_reenable_pcint();
    }
    return (1);
}

/*
 * Module initialization.
 *
 * Saves all PT-related cpuid info, registers itself as a HWT backend,
 * and allocates metadata required to keep track of tracing operations
 * on each CPU.
 */
static int
pt_init(void)
{
    u_int cp[4];
    int error;

    dprintf("pt: Enumerating part 1\n");
    cpuid_count(CPUID_PT_LEAF, 0, cp);
    dprintf("pt: Maximum valid sub-leaf Index: %x\n", cp[0]);
    dprintf("pt: ebx %x\n", cp[1]);
    dprintf("pt: ecx %x\n", cp[2]);

    pt_info.l0_eax = cp[0];
    pt_info.l0_ebx = cp[1];
    pt_info.l0_ecx = cp[2];

    dprintf("pt: Enumerating part 2\n");
    cpuid_count(CPUID_PT_LEAF, 1, cp);
    dprintf("pt: eax %x\n", cp[0]);
    dprintf("pt: ebx %x\n", cp[1]);

    pt_info.l1_eax = cp[0];
    pt_info.l1_ebx = cp[1];

    error = hwt_backend_register(&backend);
    if (error != 0) {
        printf("pt: unable to register hwt backend, error %d\n", error);
        return (error);
    }

    pt_pcpu = mallocarray(mp_ncpus, sizeof(struct pt_cpu), M_PT,
        M_ZERO | M_WAITOK);
    pt_pcpu_ctx = mallocarray(mp_ncpus, sizeof(struct pt_ctx), M_PT,
        M_ZERO | M_WAITOK);

    nmi_register_handler(pt_topa_intr);
    if (!lapic_enable_pcint()) {
        nmi_remove_handler(pt_topa_intr);
        hwt_backend_unregister(&backend);
        free(pt_pcpu, M_PT);
        free(pt_pcpu_ctx, M_PT);
        pt_pcpu = NULL;
        pt_pcpu_ctx = NULL;
        printf("pt: failed to setup interrupt line\n");
        return (ENXIO);
    }

    initialized = true;

    return (0);
}
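/*
 * Summary of the CPUID.(EAX=0x14) state cached above and consulted by
 * this driver:
 *
 *    sub-leaf 0, EBX (l0_ebx): CPUPT_CR3, CPUPT_MTC, CPUPT_IPF,
 *        CPUPT_DIS_TNT - per-feature capability checks in
 *        pt_backend_configure() and pt_configure_ranges()
 *    sub-leaf 0, ECX (l0_ecx): CPUPT_TOPA, CPUPT_TOPA_MULTI - ToPA output
 *        support, required by pt_supported()
 *    sub-leaf 1, EAX (l1_eax): CPUPT_NADDR_M/CPUPT_NADDR_S - number of
 *        supported address filter ranges
 */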
/*
 * Checks whether the CPU supports Intel PT and
 * initializes XSAVE area info.
 *
 * The driver relies on XSAVE/XRSTOR PT extensions,
 * Table of Physical Addresses (ToPA) support, and
 * support for multiple ToPA entries.
 */
static bool
pt_supported(void)
{
    u_int cp[4];

    if ((cpu_stdext_feature & CPUID_STDEXT_PROCTRACE) == 0) {
        printf("pt: CPU does not support Intel Processor Trace\n");
        return (false);
    }
    if ((cpu_feature2 & CPUID2_XSAVE) == 0) {
        printf("pt: XSAVE is not supported\n");
        return (false);
    }
    if (!xsave_extfeature_supported(XFEATURE_ENABLED_PT, true)) {
        printf("pt: CPU does not support managing PT state using XSAVE\n");
        return (false);
    }
    if (!xsave_extension_supported(CPUID_EXTSTATE_XSAVEC)) {
        printf("pt: XSAVE compaction is not supported\n");
        return (false);
    }
    if (!xsave_extension_supported(CPUID_EXTSTATE_XSAVES)) {
        printf("pt: CPU does not support XSAVES/XRSTORS\n");
        return (false);
    }

    /* Require ToPA support. */
    cpuid_count(CPUID_PT_LEAF, 0, cp);
    if ((cp[2] & CPUPT_TOPA) == 0) {
        printf("pt: ToPA is not supported\n");
        return (false);
    }
    if ((cp[2] & CPUPT_TOPA_MULTI) == 0) {
        printf("pt: multiple ToPA outputs are not supported\n");
        return (false);
    }

    pt_info.xstate_hdr_offset = xsave_area_hdr_offset();
    pt_info.xsave_area_size = xsave_area_size(PT_XSTATE_BV, true, true);
    pt_info.pt_xsave_offset = xsave_area_offset(PT_XSTATE_BV,
        XFEATURE_ENABLED_PT, true, true);

    return (true);
}

static void
pt_deinit(void)
{

    if (!initialized)
        return;
    nmi_remove_handler(pt_topa_intr);
    lapic_disable_pcint();
    hwt_backend_unregister(&backend);
    free(pt_pcpu, M_PT);
    free(pt_pcpu_ctx, M_PT);
    pt_pcpu = NULL;
    pt_pcpu_ctx = NULL;
    initialized = false;
}

static int
pt_modevent(module_t mod, int type, void *data)
{
    switch (type) {
    case MOD_LOAD:
        if (!pt_supported() || pt_init() != 0) {
            return (ENXIO);
        }
        break;
    case MOD_UNLOAD:
        pt_deinit();
        break;
    default:
        break;
    }

    return (0);
}

static moduledata_t pt_mod = { "intel_pt", pt_modevent, NULL };

DECLARE_MODULE(intel_pt, pt_mod, SI_SUB_DRIVERS, SI_ORDER_FIRST);
MODULE_DEPEND(intel_pt, hwt, 1, 1, 1);
MODULE_VERSION(intel_pt, 1);