diff --git a/sys/kern/kern_boottrace.c b/sys/kern/kern_boottrace.c index 1b097e7378ad..5516f3160587 100644 --- a/sys/kern/kern_boottrace.c +++ b/sys/kern/kern_boottrace.c @@ -1,616 +1,613 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2008-2022 NetApp, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #define dprintf(fmt, ...) \ do { \ if (dotrace_debugging) \ printf(fmt, ##__VA_ARGS__); \ } while (0); static MALLOC_DEFINE(M_BOOTTRACE, "boottrace", "memory for boot tracing"); #define BT_TABLE_DEFSIZE 3000 #define BT_TABLE_RUNSIZE 2000 #define BT_TABLE_SHTSIZE 1000 #define BT_TABLE_MINSIZE 500 /* * Boot-time & shutdown-time event. */ struct bt_event { uint64_t tsc; /* Timestamp */ uint64_t tick; /* Kernel tick */ uint64_t cputime; /* Microseconds of process CPU time */ uint32_t cpuid; /* CPU on which the event ran */ uint32_t inblock; /* # of blocks in */ uint32_t oublock; /* # of blocks out */ pid_t pid; /* Current PID */ char name[BT_EVENT_NAMELEN]; /* Event name */ char tdname[BT_EVENT_TDNAMELEN]; /* Thread name */ }; struct bt_table { uint32_t size; /* Trace table size */ uint32_t curr; /* Trace entry to use */ uint32_t wrap; /* Wrap-around, instead of dropping */ uint32_t drops_early; /* Trace entries dropped before init */ uint32_t drops_full; /* Trace entries dropped after full */ struct bt_event *table; }; /* Boot-time tracing */ static struct bt_table bt; /* Run-time tracing */ static struct bt_table rt; /* Shutdown-time tracing */ static struct bt_table st; /* Set when system boot is complete, and we've switched to runtime tracing. */ static bool bootdone; /* Set when system shutdown has started. */ static bool shutdown; /* Turn on dotrace() debug logging to console. */ static bool dotrace_debugging; TUNABLE_BOOL("debug.boottrace.dotrace_debugging", &dotrace_debugging); /* Trace kernel events */ static bool dotrace_kernel = true; TUNABLE_BOOL("kern.boottrace.dotrace_kernel", &dotrace_kernel); /* Trace userspace events */ static bool dotrace_user = true; TUNABLE_BOOL("kern.boottrace.dotrace_user", &dotrace_user); static int sysctl_log(SYSCTL_HANDLER_ARGS); static int sysctl_boottrace(SYSCTL_HANDLER_ARGS); static int sysctl_runtrace(SYSCTL_HANDLER_ARGS); static int sysctl_shuttrace(SYSCTL_HANDLER_ARGS); static int sysctl_boottrace_reset(SYSCTL_HANDLER_ARGS); SYSCTL_NODE(_kern, OID_AUTO, boottrace, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "boottrace statistics"); SYSCTL_PROC(_kern_boottrace, OID_AUTO, log, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_SKIP, 0, 0, sysctl_log, "A", "Print a log of the boottrace trace data"); SYSCTL_PROC(_kern_boottrace, OID_AUTO, boottrace, CTLTYPE_STRING | CTLFLAG_WR | CTLFLAG_MPSAFE, 0, 0, sysctl_boottrace, "A", "Capture a boot-time trace event"); SYSCTL_PROC(_kern_boottrace, OID_AUTO, runtrace, CTLTYPE_STRING | CTLFLAG_WR | CTLFLAG_MPSAFE, 0, 0, sysctl_runtrace, "A", "Capture a run-time trace event"); SYSCTL_PROC(_kern_boottrace, OID_AUTO, shuttrace, CTLTYPE_STRING | CTLFLAG_WR | CTLFLAG_MPSAFE, 0, 0, sysctl_shuttrace, "A", "Capture a shutdown-time trace event"); SYSCTL_PROC(_kern_boottrace, OID_AUTO, reset, CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_MPSAFE, 0, 0, sysctl_boottrace_reset, "A", "Reset run-time tracing table"); /* * Global enable. */ bool __read_mostly boottrace_enabled = false; SYSCTL_BOOL(_kern_boottrace, OID_AUTO, enabled, CTLFLAG_RDTUN, &boottrace_enabled, 0, "Boot-time and shutdown-time tracing enabled"); /* * Enable dumping of the shutdown trace entries to console. */ bool shutdown_trace = false; SYSCTL_BOOL(_kern_boottrace, OID_AUTO, shutdown_trace, CTLFLAG_RWTUN, &shutdown_trace, 0, "Enable kernel shutdown tracing to the console"); /* * Set the delta threshold (ms) below which events are ignored, used in * determining what to dump to the console. */ static int shutdown_trace_threshold; SYSCTL_INT(_kern_boottrace, OID_AUTO, shutdown_trace_threshold, CTLFLAG_RWTUN, &shutdown_trace_threshold, 0, "Tracing threshold (ms) below which tracing is ignored"); SYSCTL_UINT(_kern_boottrace, OID_AUTO, table_size, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &bt.size, 0, "Boot-time tracing table size"); /* * Dump a trace to buffer or if buffer is NULL to console. * * Non-zero delta_threshold selectively prints entries based on delta * with current and previous entry. Otherwise, delta_threshold of 0 * prints every trace entry and delta. * * Output something like this: * * CPU msecs delta process event * 11 1228262715 0 init shutdown pre sync begin * 3 1228265622 2907 init shutdown pre sync complete * 3 1228265623 0 init shutdown turned swap off * 18 1228266466 843 init shutdown unmounted all filesystems * * How to interpret: * * delta column represents the time in milliseconds between this event and the previous. * Usually that means you can take the previous event, current event, match them * up in the code, and whatever lies between is the culprit taking time. * * For example, above: Pre-Sync is taking 2907ms, and something between swap and unmount * filesystems is taking 843 milliseconds. * * An event with a delta of 0 are 'landmark' events that simply exist in the output * for the developer to know where the time measurement begins. The 0 is an arbitrary * number that can effectively be ignored. */ #define BTC_DELTA_PRINT(bte, msecs, delta) do { \ if (sbp) { \ sbuf_printf(sbp, fmt, (bte)->cpuid, msecs, delta, \ (bte)->tdname, (bte)->name, (bte)->pid, \ (bte)->cputime / 1000000, \ ((bte)->cputime % 1000000) / 10000, \ (bte)->inblock, (bte)->oublock); \ } else { \ printf(fmt, (bte)->cpuid, msecs, delta, \ (bte)->tdname, (bte)->name, (bte)->pid, \ (bte)->cputime / 1000000, \ ((bte)->cputime % 1000000) / 10000, \ (bte)->inblock, (bte)->oublock); \ } \ } while (0) /* * Print the trace entries to the message buffer, or to an sbuf, if provided. * * Entries with a difference less than dthres will not be printed. */ static void boottrace_display(struct sbuf *sbp, struct bt_table *btp, uint64_t dthres) { struct bt_event *evtp; struct bt_event *last_evtp; uint64_t msecs; uint64_t first_msecs; uint64_t last_msecs; uint64_t dmsecs; uint64_t last_dmsecs; uint64_t total_dmsecs; uint32_t i; uint32_t curr; const char *fmt = "%3u %10llu %10llu %-24s %-40s %5d %4d.%02d %5u %5u\n"; const char *hdr_fmt = "\n\n%3s %10s %10s %-24s %-40s %5s %6s %5s %5s\n"; bool printed; bool last_printed; /* Print the header */ if (sbp != NULL) sbuf_printf(sbp, hdr_fmt, "CPU", "msecs", "delta", "process", "event", "PID", "CPUtime", "IBlks", "OBlks"); else printf(hdr_fmt, "CPU", "msecs", "delta", "process", "event", "PID", "CPUtime", "IBlks", "OBlks"); first_msecs = 0; last_evtp = NULL; last_msecs = 0; last_dmsecs = 0; last_printed = false; i = curr = btp->curr; do { evtp = &btp->table[i]; if (evtp->tsc == 0) goto next; msecs = cputick2usec(evtp->tick) / 1000; dmsecs = (last_msecs != 0 && msecs > last_msecs) ? msecs - last_msecs : 0; printed = false; /* * If a threshold is defined, start filtering events by the * delta of msecs. */ if (dthres != 0 && (dmsecs > dthres)) { /* * Print the previous entry as a landmark, even if it * falls below the threshold. */ if (last_evtp != NULL && !last_printed) BTC_DELTA_PRINT(last_evtp, last_msecs, last_dmsecs); BTC_DELTA_PRINT(evtp, msecs, dmsecs); printed = true; } else if (dthres == 0) { BTC_DELTA_PRINT(evtp, msecs, dmsecs); printed = true; } if (first_msecs == 0 || msecs < first_msecs) first_msecs = msecs; last_evtp = evtp; last_msecs = msecs; last_dmsecs = dmsecs; last_printed = printed; maybe_yield(); next: i = (i + 1) % btp->size; } while (i != curr); total_dmsecs = last_msecs > first_msecs ? (last_msecs - first_msecs) : 0; if (sbp != NULL) sbuf_printf(sbp, "Total measured time: %ju msecs\n", (uintmax_t)total_dmsecs); else printf("Total measured time: %ju msecs\n", (uintmax_t)total_dmsecs); } /* * Dump trace table entries to the console, given a delta threshold. */ void boottrace_dump_console(void) { if (!boottrace_enabled) return; if (shutdown || rebooting || KERNEL_PANICKED()) { boottrace_display(NULL, &st, shutdown_trace_threshold); } else { boottrace_display(NULL, &bt, 0); boottrace_display(NULL, &rt, 0); } } /* * Records a new tracing event to the specified table. * * We don't use a lock because we want this to be callable from interrupt * context. */ static int dotrace(struct bt_table *btp, const char *eventname, const char *tdname) { uint32_t idx, nxt; struct rusage usage; MPASS(boottrace_enabled); if (tdname == NULL) tdname = (curproc->p_flag & P_SYSTEM) ? curthread->td_name : curproc->p_comm; dprintf("dotrace["); dprintf("cpu=%u, pid=%d, tsc=%ju, tick=%d, td='%s', event='%s'", PCPU_GET(cpuid), curthread->td_proc->p_pid, (uintmax_t)get_cyclecount(), ticks, tdname, eventname); if (btp->table == NULL) { btp->drops_early++; dprintf(", return=ENOSPC_1]\n"); return (ENOSPC); } /* Claim a slot in the table. */ do { idx = btp->curr; nxt = (idx + 1) % btp->size; if (nxt == 0 && btp->wrap == 0) { btp->drops_full++; dprintf(", return=ENOSPC_2]\n"); return (ENOSPC); } } while (!atomic_cmpset_int(&btp->curr, idx, nxt)); btp->table[idx].cpuid = PCPU_GET(cpuid); btp->table[idx].tsc = get_cyclecount(), btp->table[idx].tick = cpu_ticks(); btp->table[idx].pid = curthread->td_proc->p_pid; /* * Get the resource usage for the process. Don't attempt this for the * kernel proc0 or for critical section activities. */ if ((curthread->td_proc == &proc0) || (curthread->td_critnest != 0)) { btp->table[idx].cputime = 0; btp->table[idx].inblock = 0; btp->table[idx].oublock = 0; } else { kern_getrusage(curthread, RUSAGE_CHILDREN, &usage); btp->table[idx].cputime = (uint64_t)(usage.ru_utime.tv_sec * 1000000 + usage.ru_utime.tv_usec + usage.ru_stime.tv_sec * 1000000 + usage.ru_stime.tv_usec); btp->table[idx].inblock = (uint32_t)usage.ru_inblock; btp->table[idx].oublock = (uint32_t)usage.ru_oublock; } strlcpy(btp->table[idx].name, eventname, BT_EVENT_NAMELEN); strlcpy(btp->table[idx].tdname, tdname, BT_EVENT_TDNAMELEN); dprintf(", return=0]\n"); return (0); } /* * Log various boot-time events, with the trace message encoded using * printf-like arguments. */ int boottrace(const char *tdname, const char *fmt, ...) { char eventname[BT_EVENT_NAMELEN]; struct bt_table *btp; va_list ap; if (!dotrace_kernel) return (ENXIO); va_start(ap, fmt); vsnprintf(eventname, sizeof(eventname), fmt, ap); va_end(ap); btp = &bt; if (bootdone) btp = &rt; if (shutdown || rebooting || KERNEL_PANICKED()) btp = &st; return (dotrace(btp, eventname, tdname)); } /* * Log a run-time event & switch over to run-time tracing mode. */ static int runtrace(const char *eventname, const char *tdname) { bootdone = true; return (dotrace(&rt, eventname, tdname)); } /* * Parse a boottrace message from userspace. * * The input from may contain a ':' to denote tdname. If not, tdname is * inferred from the process' name. * * e.g. reboot(8):SIGINT to init(8) */ static void boottrace_parse_message(char *message, char **eventname, char **tdname) { char *delim; delim = strchr(message, ':'); if (delim != NULL) { *delim = '\0'; *tdname = message; *eventname = ++delim; } else { *tdname = curproc->p_comm; *eventname = message; } } static int _boottrace_sysctl(struct bt_table *btp, struct sysctl_oid *oidp, struct sysctl_req *req) { char message[BT_MSGLEN]; char *eventname, *tdname; int error; if (!dotrace_user) return (ENXIO); message[0] = '\0'; error = sysctl_handle_string(oidp, message, sizeof(message), req); if (error) return (error); boottrace_parse_message(message, &eventname, &tdname); error = dotrace(btp, eventname, tdname); if (error == ENOSPC) { /* Ignore table full error. */ error = 0; } return (error); } static int sysctl_log(SYSCTL_HANDLER_ARGS) { struct sbuf *sbuf; int error; if (!boottrace_enabled || req->newptr != NULL) return (0); sbuf = sbuf_new(NULL, NULL, 0, SBUF_AUTOEXTEND); boottrace_display(sbuf, &bt, 0); boottrace_display(sbuf, &rt, 0); sbuf_finish(sbuf); error = SYSCTL_OUT(req, sbuf_data(sbuf), sbuf_len(sbuf)); sbuf_delete(sbuf); return (error); } static int sysctl_boottrace(SYSCTL_HANDLER_ARGS) { if (!boottrace_enabled) return (0); /* No output */ if (req->newptr == NULL) return (0); /* Trace to rt if we have reached multi-user. */ return (_boottrace_sysctl(bootdone ? &rt : &bt, oidp, req)); } /* * Log a run-time event and switch over to run-time tracing mode. */ static int sysctl_runtrace(SYSCTL_HANDLER_ARGS) { if (!boottrace_enabled) return (0); /* No output */ if (req->newptr == NULL) return (0); bootdone = true; return (_boottrace_sysctl(&rt, oidp, req)); } /* * Log a shutdown-time event and switch over to shutdown tracing mode. */ static int sysctl_shuttrace(SYSCTL_HANDLER_ARGS) { if (!boottrace_enabled) return (0); /* No output */ if (req->newptr == NULL) return (0); shutdown = true; return (_boottrace_sysctl(&st, oidp, req)); } /* * Reset the runtime tracing buffer. */ void boottrace_reset(const char *actor) { char tmpbuf[64]; snprintf(tmpbuf, sizeof(tmpbuf), "reset: %s", actor); runtrace(tmpbuf, NULL); } /* * Note that a resize implies a reset, i.e., the index is reset to 0. * We never shrink the array; we can only increase its size. */ int boottrace_resize(u_int newsize) { if (newsize <= rt.size) { return (EINVAL); } rt.table = realloc(rt.table, newsize * sizeof(struct bt_event), M_BOOTTRACE, M_WAITOK | M_ZERO); - if (rt.table == NULL) - return (ENOMEM); - rt.size = newsize; boottrace_reset("boottrace_resize"); return (0); } static int sysctl_boottrace_reset(SYSCTL_HANDLER_ARGS) { if (!boottrace_enabled) return (0); if (req->newptr != NULL) boottrace_reset("sysctl_boottrace_reset"); return (0); } static void boottrace_init(void) { if (!boottrace_enabled) return; /* Boottime trace table */ bt.size = BT_TABLE_DEFSIZE; TUNABLE_INT_FETCH("kern.boottrace.table_size", &bt.size); bt.size = max(bt.size, BT_TABLE_MINSIZE); bt.table = malloc(bt.size * sizeof(struct bt_event), M_BOOTTRACE, M_WAITOK | M_ZERO); /* Stick in an initial entry. */ bt.table[0].cpuid = PCPU_GET(cpuid); strlcpy(bt.table[0].tdname, "boottime", BT_EVENT_TDNAMELEN); strlcpy(bt.table[0].name, "initial event", BT_EVENT_NAMELEN); bt.curr = 1; /* Run-time trace table (may wrap-around). */ rt.wrap = 1; rt.size = BT_TABLE_RUNSIZE; rt.table = malloc(rt.size * sizeof(struct bt_event), M_BOOTTRACE, M_WAITOK | M_ZERO); /* Shutdown trace table */ st.size = BT_TABLE_SHTSIZE; st.table = malloc(st.size * sizeof(struct bt_event), M_BOOTTRACE, M_WAITOK | M_ZERO); } SYSINIT(boottrace, SI_SUB_CPU, SI_ORDER_ANY, boottrace_init, NULL);