diff --git a/sys/kern/kern_cons.c b/sys/kern/kern_cons.c index 24952561449b..a8f6b689bff7 100644 --- a/sys/kern/kern_cons.c +++ b/sys/kern/kern_cons.c @@ -1,773 +1,773 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1988 University of Utah. * Copyright (c) 1991 The Regents of the University of California. * Copyright (c) 1999 Michael Smith * Copyright (c) 2005 Pawel Jakub Dawidek * * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)cons.c 7.2 (Berkeley) 5/9/91 */ #include #include "opt_ddb.h" #include "opt_syscons.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_TTYCONS, "tty console", "tty console handling"); struct cn_device { STAILQ_ENTRY(cn_device) cnd_next; struct consdev *cnd_cn; }; #define CNDEVPATHMAX 32 #define CNDEVTAB_SIZE 4 static struct cn_device cn_devtab[CNDEVTAB_SIZE]; static STAILQ_HEAD(, cn_device) cn_devlist = STAILQ_HEAD_INITIALIZER(cn_devlist); int cons_avail_mask = 0; /* Bit mask. Each registered low level console * which is currently unavailable for inpit * (i.e., if it is in graphics mode) will have * this bit cleared. */ static int cn_mute; SYSCTL_INT(_kern, OID_AUTO, consmute, CTLFLAG_RW, &cn_mute, 0, "State of the console muting"); static char *consbuf; /* buffer used by `consmsgbuf' */ static struct callout conscallout; /* callout for outputting to constty */ struct msgbuf consmsgbuf; /* message buffer for console tty */ static bool console_pausing; /* pause after each line during probe */ static const char console_pausestr[] = ""; struct tty *constty; /* pointer to console "window" tty */ static struct mtx constty_mtx; /* Mutex for constty assignment. */ MTX_SYSINIT(constty_mtx, &constty_mtx, "constty_mtx", MTX_DEF); static struct mtx cnputs_mtx; /* Mutex for cnputs(). 
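 * Declared below as a spin mutex excluded from WITNESS
 * (MTX_SPIN | MTX_NOWITNESS) so that whole strings can be
 * serialized onto the console even from low-level contexts
 * where sleepable locks are unusable.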
*/ MTX_SYSINIT(cnputs_mtx, &cnputs_mtx, "cnputs_mtx", MTX_SPIN | MTX_NOWITNESS); static void constty_timeout(void *arg); static struct consdev cons_consdev; DATA_SET(cons_set, cons_consdev); SET_DECLARE(cons_set, struct consdev); /* * Stub for configurations that don't actually have a keyboard driver. Inclusion * of kbd.c is contingent on any number of keyboard/console drivers being * present in the kernel; rather than trying to catch them all, we'll just * maintain this weak kbdinit that will be overridden by the strong version in * kbd.c if it's present. */ __weak_symbol void kbdinit(void) { } void cninit(void) { struct consdev *best_cn, *cn, **list; TSENTER(); /* * Check if we should mute the console (for security reasons perhaps) * It can be changes dynamically using sysctl kern.consmute * once we are up and going. * */ cn_mute = ((boothowto & (RB_MUTE |RB_SINGLE |RB_VERBOSE |RB_ASKNAME)) == RB_MUTE); /* * Bring up the kbd layer just in time for cnprobe. Console drivers * have a dependency on kbd being ready, so this fits nicely between the * machdep callers of cninit() and MI probing/initialization of consoles * here. */ kbdinit(); /* * Find the first console with the highest priority. */ best_cn = NULL; SET_FOREACH(list, cons_set) { cn = *list; cnremove(cn); /* Skip cons_consdev. */ if (cn->cn_ops == NULL) continue; cn->cn_ops->cn_probe(cn); if (cn->cn_pri == CN_DEAD) continue; if (best_cn == NULL || cn->cn_pri > best_cn->cn_pri) best_cn = cn; if (boothowto & RB_MULTIPLE) { /* * Initialize console, and attach to it. */ cn->cn_ops->cn_init(cn); cnadd(cn); } } if (best_cn == NULL) return; if ((boothowto & RB_MULTIPLE) == 0) { best_cn->cn_ops->cn_init(best_cn); cnadd(best_cn); } if (boothowto & RB_PAUSE) console_pausing = true; /* * Make the best console the preferred console. */ cnselect(best_cn); #ifdef EARLY_PRINTF /* * Release early console. */ early_putc = NULL; #endif TSEXIT(); } void cninit_finish(void) { console_pausing = false; } /* add a new physical console to back the virtual console */ int cnadd(struct consdev *cn) { struct cn_device *cnd; int i; STAILQ_FOREACH(cnd, &cn_devlist, cnd_next) if (cnd->cnd_cn == cn) return (0); for (i = 0; i < CNDEVTAB_SIZE; i++) { cnd = &cn_devtab[i]; if (cnd->cnd_cn == NULL) break; } if (cnd->cnd_cn != NULL) return (ENOMEM); cnd->cnd_cn = cn; if (cn->cn_name[0] == '\0') { /* XXX: it is unclear if/where this print might output */ printf("WARNING: console at %p has no name\n", cn); } STAILQ_INSERT_TAIL(&cn_devlist, cnd, cnd_next); if (STAILQ_FIRST(&cn_devlist) == cnd) ttyconsdev_select(cnd->cnd_cn->cn_name); /* Add device to the active mask. */ cnavailable(cn, (cn->cn_flags & CN_FLAG_NOAVAIL) == 0); return (0); } void cnremove(struct consdev *cn) { struct cn_device *cnd; int i; STAILQ_FOREACH(cnd, &cn_devlist, cnd_next) { if (cnd->cnd_cn != cn) continue; if (STAILQ_FIRST(&cn_devlist) == cnd) ttyconsdev_select(NULL); STAILQ_REMOVE(&cn_devlist, cnd, cn_device, cnd_next); cnd->cnd_cn = NULL; /* Remove this device from available mask. */ for (i = 0; i < CNDEVTAB_SIZE; i++) if (cnd == &cn_devtab[i]) { cons_avail_mask &= ~(1 << i); break; } #if 0 /* * XXX * syscons gets really confused if console resources are * freed after the system has initialized. 
*/ if (cn->cn_term != NULL) cn->cn_ops->cn_term(cn); #endif return; } } void cnselect(struct consdev *cn) { struct cn_device *cnd; STAILQ_FOREACH(cnd, &cn_devlist, cnd_next) { if (cnd->cnd_cn != cn) continue; if (cnd == STAILQ_FIRST(&cn_devlist)) return; STAILQ_REMOVE(&cn_devlist, cnd, cn_device, cnd_next); STAILQ_INSERT_HEAD(&cn_devlist, cnd, cnd_next); ttyconsdev_select(cnd->cnd_cn->cn_name); return; } } void cnavailable(struct consdev *cn, int available) { int i; for (i = 0; i < CNDEVTAB_SIZE; i++) { if (cn_devtab[i].cnd_cn == cn) break; } if (available) { if (i < CNDEVTAB_SIZE) cons_avail_mask |= (1 << i); cn->cn_flags &= ~CN_FLAG_NOAVAIL; } else { if (i < CNDEVTAB_SIZE) cons_avail_mask &= ~(1 << i); cn->cn_flags |= CN_FLAG_NOAVAIL; } } int cnunavailable(void) { return (cons_avail_mask == 0); } /* * sysctl_kern_console() provides output parseable in conscontrol(1). */ static int sysctl_kern_console(SYSCTL_HANDLER_ARGS) { struct cn_device *cnd; struct consdev *cp, **list; char *p; bool delete; int error; struct sbuf *sb; sb = sbuf_new(NULL, NULL, CNDEVPATHMAX * 2, SBUF_AUTOEXTEND | SBUF_INCLUDENUL); if (sb == NULL) return (ENOMEM); sbuf_clear(sb); STAILQ_FOREACH(cnd, &cn_devlist, cnd_next) sbuf_printf(sb, "%s,", cnd->cnd_cn->cn_name); - sbuf_printf(sb, "/"); + sbuf_putc(sb, '/'); SET_FOREACH(list, cons_set) { cp = *list; if (cp->cn_name[0] != '\0') sbuf_printf(sb, "%s,", cp->cn_name); } sbuf_finish(sb); error = sysctl_handle_string(oidp, sbuf_data(sb), sbuf_len(sb), req); if (error == 0 && req->newptr != NULL) { p = sbuf_data(sb); error = ENXIO; delete = false; if (*p == '-') { delete = true; p++; } SET_FOREACH(list, cons_set) { cp = *list; if (strcmp(p, cp->cn_name) != 0) continue; if (delete) { cnremove(cp); error = 0; } else { error = cnadd(cp); if (error == 0) cnselect(cp); } break; } } sbuf_delete(sb); return (error); } SYSCTL_PROC(_kern, OID_AUTO, console, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 0, 0, sysctl_kern_console, "A", "Console device control"); void cngrab(void) { struct cn_device *cnd; struct consdev *cn; STAILQ_FOREACH(cnd, &cn_devlist, cnd_next) { cn = cnd->cnd_cn; if (!kdb_active || !(cn->cn_flags & CN_FLAG_NODEBUG)) cn->cn_ops->cn_grab(cn); } } void cnungrab(void) { struct cn_device *cnd; struct consdev *cn; STAILQ_FOREACH(cnd, &cn_devlist, cnd_next) { cn = cnd->cnd_cn; if (!kdb_active || !(cn->cn_flags & CN_FLAG_NODEBUG)) cn->cn_ops->cn_ungrab(cn); } } void cnresume(void) { struct cn_device *cnd; struct consdev *cn; STAILQ_FOREACH(cnd, &cn_devlist, cnd_next) { cn = cnd->cnd_cn; if (cn->cn_ops->cn_resume != NULL) cn->cn_ops->cn_resume(cn); } } /* * Low level console routines. 
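 *
 * cngetc() spins until a character arrives and maps CR to NL, and
 * cngets() grabs the console for the duration of the read, so a rough
 * illustration of a caller (the prompt string here is made up) is:
 *
 *	char buf[64];
 *
 *	cnputs("passphrase: ");
 *	cngets(buf, sizeof(buf), GETS_ECHOPASS);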
*/ int cngetc(void) { int c; if (cn_mute) return (-1); while ((c = cncheckc()) == -1) cpu_spinwait(); if (c == '\r') c = '\n'; /* console input is always ICRNL */ return (c); } int cncheckc(void) { struct cn_device *cnd; struct consdev *cn; int c; if (cn_mute) return (-1); STAILQ_FOREACH(cnd, &cn_devlist, cnd_next) { cn = cnd->cnd_cn; if (!kdb_active || !(cn->cn_flags & CN_FLAG_NODEBUG)) { c = cn->cn_ops->cn_getc(cn); if (c != -1) return (c); } } return (-1); } void cngets(char *cp, size_t size, int visible) { char *lp, *end; int c; cngrab(); lp = cp; end = cp + size - 1; for (;;) { c = cngetc() & 0177; switch (c) { case '\n': case '\r': cnputc(c); *lp = '\0'; cnungrab(); return; case '\b': case '\177': if (lp > cp) { if (visible) cnputs("\b \b"); lp--; } continue; case '\0': continue; default: if (lp < end) { switch (visible) { case GETS_NOECHO: break; case GETS_ECHOPASS: cnputc('*'); break; default: cnputc(c); break; } *lp++ = c; } } } } void cnputc(int c) { struct cn_device *cnd; struct consdev *cn; const char *cp; #ifdef EARLY_PRINTF if (early_putc != NULL) { if (c == '\n') early_putc('\r'); early_putc(c); return; } #endif if (cn_mute || c == '\0') return; STAILQ_FOREACH(cnd, &cn_devlist, cnd_next) { cn = cnd->cnd_cn; if (!kdb_active || !(cn->cn_flags & CN_FLAG_NODEBUG)) { if (c == '\n') cn->cn_ops->cn_putc(cn, '\r'); cn->cn_ops->cn_putc(cn, c); } } if (console_pausing && c == '\n' && !kdb_active) { for (cp = console_pausestr; *cp != '\0'; cp++) cnputc(*cp); cngrab(); if (cngetc() == '.') console_pausing = false; cnungrab(); cnputc('\r'); for (cp = console_pausestr; *cp != '\0'; cp++) cnputc(' '); cnputc('\r'); } } void cnputsn(const char *p, size_t n) { size_t i; bool unlock_reqd = false; if (mtx_initialized(&cnputs_mtx)) { /* * NOTE: Debug prints and/or witness printouts in * console driver clients can cause the "cnputs_mtx" * mutex to recurse. Simply return if that happens. */ if (mtx_owned(&cnputs_mtx)) return; mtx_lock_spin(&cnputs_mtx); unlock_reqd = true; } for (i = 0; i < n; i++) cnputc(p[i]); if (unlock_reqd) mtx_unlock_spin(&cnputs_mtx); } void cnputs(const char *p) { cnputsn(p, strlen(p)); } static unsigned int consmsgbuf_size = 65536; SYSCTL_UINT(_kern, OID_AUTO, consmsgbuf_size, CTLFLAG_RWTUN, &consmsgbuf_size, 0, "Console tty buffer size"); /* * Redirect console output to a tty. */ int constty_set(struct tty *tp) { int size = consmsgbuf_size; void *buf = NULL; tty_assert_locked(tp); if (constty == tp) return (0); if (constty != NULL) return (EBUSY); if (consbuf == NULL) { tty_unlock(tp); buf = malloc(size, M_TTYCONS, M_WAITOK); tty_lock(tp); } mtx_lock(&constty_mtx); if (constty != NULL) { mtx_unlock(&constty_mtx); free(buf, M_TTYCONS); return (EBUSY); } if (consbuf == NULL) { consbuf = buf; msgbuf_init(&consmsgbuf, buf, size); } else free(buf, M_TTYCONS); constty = tp; mtx_unlock(&constty_mtx); callout_init_mtx(&conscallout, tty_getlock(tp), 0); constty_timeout(tp); return (0); } /* * Disable console redirection to a tty. */ int constty_clear(struct tty *tp) { int c; tty_assert_locked(tp); if (constty != tp) return (ENXIO); callout_stop(&conscallout); mtx_lock(&constty_mtx); constty = NULL; mtx_unlock(&constty_mtx); while ((c = msgbuf_getchar(&consmsgbuf)) != -1) cnputc(c); /* We never free consbuf because it can still be in use. */ return (0); } /* Times per second to check for pending console tty messages. 
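 * At the default of 15, the constty_timeout() callout reschedules
 * itself roughly every 66ms (SBT_1S / constty_wakeups_per_second).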
*/ static int constty_wakeups_per_second = 15; SYSCTL_INT(_kern, OID_AUTO, constty_wakeups_per_second, CTLFLAG_RW, &constty_wakeups_per_second, 0, "Times per second to check for pending console tty messages"); static void constty_timeout(void *arg) { struct tty *tp = arg; int c; tty_assert_locked(tp); while ((c = msgbuf_getchar(&consmsgbuf)) != -1) { if (tty_putchar(tp, c) < 0) { constty_clear(tp); return; } } callout_reset_sbt(&conscallout, SBT_1S / constty_wakeups_per_second, 0, constty_timeout, tp, C_PREL(1)); } /* * Sysbeep(), if we have hardware for it */ #ifdef HAS_TIMER_SPKR static bool beeping; static struct callout beeping_timer; static void sysbeepstop(void *chan) { timer_spkr_release(); beeping = false; } int sysbeep(int pitch, sbintime_t duration) { if (timer_spkr_acquire()) { if (!beeping) { /* Something else owns it. */ return (EBUSY); } } timer_spkr_setfreq(pitch); if (!beeping) { beeping = true; callout_reset_sbt(&beeping_timer, duration, 0, sysbeepstop, NULL, C_PREL(5)); } return (0); } static void sysbeep_init(void *unused) { callout_init(&beeping_timer, 1); } SYSINIT(sysbeep, SI_SUB_SOFTINTR, SI_ORDER_ANY, sysbeep_init, NULL); #else /* * No hardware, no sound */ int sysbeep(int pitch __unused, sbintime_t duration __unused) { return (ENODEV); } #endif /* * Temporary support for sc(4) to vt(4) transition. */ static unsigned vty_prefer; static char vty_name[16]; SYSCTL_STRING(_kern, OID_AUTO, vty, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, vty_name, 0, "Console vty driver"); int vty_enabled(unsigned vty) { static unsigned vty_selected = 0; if (vty_selected == 0) { TUNABLE_STR_FETCH("kern.vty", vty_name, sizeof(vty_name)); do { #if defined(DEV_SC) if (strcmp(vty_name, "sc") == 0) { vty_selected = VTY_SC; break; } #endif #if defined(DEV_VT) if (strcmp(vty_name, "vt") == 0) { vty_selected = VTY_VT; break; } #endif if (vty_prefer != 0) { vty_selected = vty_prefer; break; } #if defined(DEV_VT) vty_selected = VTY_VT; #elif defined(DEV_SC) vty_selected = VTY_SC; #endif } while (0); if (vty_selected == VTY_VT) strcpy(vty_name, "vt"); else if (vty_selected == VTY_SC) strcpy(vty_name, "sc"); } return ((vty_selected & vty) != 0); } void vty_set_preferred(unsigned vty) { vty_prefer = vty; #if !defined(DEV_SC) vty_prefer &= ~VTY_SC; #endif #if !defined(DEV_VT) vty_prefer &= ~VTY_VT; #endif } diff --git a/sys/kern/kern_exit.c b/sys/kern/kern_exit.c index f9445a481d92..112f9c7b0f33 100644 --- a/sys/kern/kern_exit.c +++ b/sys/kern/kern_exit.c @@ -1,1471 +1,1471 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_exit.c 8.7 (Berkeley) 2/12/94 */ #include #include "opt_ddb.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* for acct_process() function prototype */ #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #include #include #include #include #include #include #ifdef KDTRACE_HOOKS #include dtrace_execexit_func_t dtrace_fasttrap_exit; #endif SDT_PROVIDER_DECLARE(proc); SDT_PROBE_DEFINE1(proc, , , exit, "int"); static int kern_kill_on_dbg_exit = 1; SYSCTL_INT(_kern, OID_AUTO, kill_on_debugger_exit, CTLFLAG_RWTUN, &kern_kill_on_dbg_exit, 0, "Kill ptraced processes when debugger exits"); static bool kern_wait_dequeue_sigchld = 1; SYSCTL_BOOL(_kern, OID_AUTO, wait_dequeue_sigchld, CTLFLAG_RWTUN, &kern_wait_dequeue_sigchld, 0, "Dequeue SIGCHLD on wait(2) for live process"); struct proc * proc_realparent(struct proc *child) { struct proc *p, *parent; sx_assert(&proctree_lock, SX_LOCKED); if ((child->p_treeflag & P_TREE_ORPHANED) == 0) return (child->p_pptr->p_pid == child->p_oppid ? child->p_pptr : child->p_reaper); for (p = child; (p->p_treeflag & P_TREE_FIRST_ORPHAN) == 0;) { /* Cannot use LIST_PREV(), since the list head is not known. 
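		 * Instead, walk the le_prev back-pointers by hand: each
		 * one points at the previous orphan's le_next field (or at
		 * the list head's lh_first once the first orphan is
		 * reached), so __containerof() recovers the enclosing
		 * struct proc at every step.  The loop stops at the entry
		 * marked P_TREE_FIRST_ORPHAN, whose le_prev points into
		 * the real parent's p_orphans head.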
*/ p = __containerof(p->p_orphan.le_prev, struct proc, p_orphan.le_next); KASSERT((p->p_treeflag & P_TREE_ORPHANED) != 0, ("missing P_ORPHAN %p", p)); } parent = __containerof(p->p_orphan.le_prev, struct proc, p_orphans.lh_first); return (parent); } void reaper_abandon_children(struct proc *p, bool exiting) { struct proc *p1, *p2, *ptmp; sx_assert(&proctree_lock, SX_XLOCKED); KASSERT(p != initproc, ("reaper_abandon_children for initproc")); if ((p->p_treeflag & P_TREE_REAPER) == 0) return; p1 = p->p_reaper; LIST_FOREACH_SAFE(p2, &p->p_reaplist, p_reapsibling, ptmp) { LIST_REMOVE(p2, p_reapsibling); p2->p_reaper = p1; p2->p_reapsubtree = p->p_reapsubtree; LIST_INSERT_HEAD(&p1->p_reaplist, p2, p_reapsibling); if (exiting && p2->p_pptr == p) { PROC_LOCK(p2); proc_reparent(p2, p1, true); PROC_UNLOCK(p2); } } KASSERT(LIST_EMPTY(&p->p_reaplist), ("p_reaplist not empty")); p->p_treeflag &= ~P_TREE_REAPER; } static void reaper_clear(struct proc *p) { struct proc *p1; bool clear; sx_assert(&proctree_lock, SX_LOCKED); LIST_REMOVE(p, p_reapsibling); if (p->p_reapsubtree == 1) return; clear = true; LIST_FOREACH(p1, &p->p_reaper->p_reaplist, p_reapsibling) { if (p1->p_reapsubtree == p->p_reapsubtree) { clear = false; break; } } if (clear) proc_id_clear(PROC_ID_REAP, p->p_reapsubtree); } void proc_clear_orphan(struct proc *p) { struct proc *p1; sx_assert(&proctree_lock, SA_XLOCKED); if ((p->p_treeflag & P_TREE_ORPHANED) == 0) return; if ((p->p_treeflag & P_TREE_FIRST_ORPHAN) != 0) { p1 = LIST_NEXT(p, p_orphan); if (p1 != NULL) p1->p_treeflag |= P_TREE_FIRST_ORPHAN; p->p_treeflag &= ~P_TREE_FIRST_ORPHAN; } LIST_REMOVE(p, p_orphan); p->p_treeflag &= ~P_TREE_ORPHANED; } void exit_onexit(struct proc *p) { MPASS(p->p_numthreads == 1); umtx_thread_exit(FIRST_THREAD_IN_PROC(p)); } /* * exit -- death of process. */ int sys_exit(struct thread *td, struct exit_args *uap) { exit1(td, uap->rval, 0); __unreachable(); } void proc_set_p2_wexit(struct proc *p) { PROC_LOCK_ASSERT(p, MA_OWNED); p->p_flag2 |= P2_WEXIT; } /* * Exit: deallocate address space and other resources, change proc state to * zombie, and unlink proc from allproc and parent's lists. Save exit status * and rusage for wait(). Check for child processes and orphan them. */ void exit1(struct thread *td, int rval, int signo) { struct proc *p, *nq, *q, *t; struct thread *tdt; ksiginfo_t *ksi, *ksi1; int signal_parent; mtx_assert(&Giant, MA_NOTOWNED); KASSERT(rval == 0 || signo == 0, ("exit1 rv %d sig %d", rval, signo)); TSPROCEXIT(td->td_proc->p_pid); p = td->td_proc; /* * In case we're rebooting we just let init die in order to * work around an issues where pid 1 might get a fatal signal. * For instance, if network interface serving NFS root is * going down due to reboot, page-in requests for text are * failing. */ if (p == initproc && rebooting == 0) { printf("init died (signal %d, exit %d)\n", signo, rval); panic("Going nowhere without my init!"); } /* * Process deferred operations, designated with ASTF_KCLEAR. * For instance, we need to deref SU mp, since the thread does * not return to userspace, and wait for geom to stabilize. */ ast_kclear(td); /* * MUST abort all other threads before proceeding past here. */ PROC_LOCK(p); proc_set_p2_wexit(p); /* * First check if some other thread or external request got * here before us. If so, act appropriately: exit or suspend. * We must ensure that stop requests are handled before we set * P_WEXIT. */ thread_suspend_check(0); while (p->p_flag & P_HADTHREADS) { /* * Kill off the other threads. 
This requires * some co-operation from other parts of the kernel * so it may not be instantaneous. With this state set * any thread attempting to interruptibly * sleep will return immediately with EINTR or EWOULDBLOCK * which will hopefully force them to back out to userland * freeing resources as they go. Any thread attempting * to return to userland will thread_exit() from ast(). * thread_exit() will unsuspend us when the last of the * other threads exits. * If there is already a thread singler after resumption, * calling thread_single() will fail; in that case, we just * re-check all suspension request, the thread should * either be suspended there or exit. */ if (!thread_single(p, SINGLE_EXIT)) /* * All other activity in this process is now * stopped. Threading support has been turned * off. */ break; /* * Recheck for new stop or suspend requests which * might appear while process lock was dropped in * thread_single(). */ thread_suspend_check(0); } KASSERT(p->p_numthreads == 1, ("exit1: proc %p exiting with %d threads", p, p->p_numthreads)); racct_sub(p, RACCT_NTHR, 1); /* Let event handler change exit status */ p->p_xexit = rval; p->p_xsig = signo; /* * Ignore any pending request to stop due to a stop signal. * Once P_WEXIT is set, future requests will be ignored as * well. */ p->p_flag &= ~P_STOPPED_SIG; KASSERT(!P_SHOULDSTOP(p), ("exiting process is stopped")); /* Note that we are exiting. */ p->p_flag |= P_WEXIT; /* * Wait for any processes that have a hold on our vmspace to * release their reference. */ while (p->p_lock > 0) msleep(&p->p_lock, &p->p_mtx, PWAIT, "exithold", 0); PROC_UNLOCK(p); /* Drain the limit callout while we don't have the proc locked */ callout_drain(&p->p_limco); #ifdef AUDIT /* * The Sun BSM exit token contains two components: an exit status as * passed to exit(), and a return value to indicate what sort of exit * it was. The exit status is WEXITSTATUS(rv), but it's not clear * what the return value is. */ AUDIT_ARG_EXIT(rval, 0); AUDIT_SYSCALL_EXIT(0, td); #endif /* Are we a task leader with peers? */ if (p->p_peers != NULL && p == p->p_leader) { mtx_lock(&ppeers_lock); q = p->p_peers; while (q != NULL) { PROC_LOCK(q); kern_psignal(q, SIGKILL); PROC_UNLOCK(q); q = q->p_peers; } while (p->p_peers != NULL) msleep(p, &ppeers_lock, PWAIT, "exit1", 0); mtx_unlock(&ppeers_lock); } itimers_exit(p); /* * Check if any loadable modules need anything done at process exit. * E.g. SYSV IPC stuff. * Event handler could change exit status. * XXX what if one of these generates an error? */ EVENTHANDLER_DIRECT_INVOKE(process_exit, p); /* * If parent is waiting for us to exit or exec, * P_PPWAIT is set; we will wakeup the parent below. */ PROC_LOCK(p); stopprofclock(p); p->p_ptevents = 0; /* * Stop the real interval timer. If the handler is currently * executing, prevent it from rearming itself and let it finish. */ if (timevalisset(&p->p_realtimer.it_value) && callout_stop(&p->p_itcallout) == 0) { timevalclear(&p->p_realtimer.it_interval); PROC_UNLOCK(p); callout_drain(&p->p_itcallout); } else { PROC_UNLOCK(p); } if (p->p_sysent->sv_onexit != NULL) p->p_sysent->sv_onexit(p); seltdfini(td); /* * Reset any sigio structures pointing to us as a result of * F_SETOWN with our pid. The P_WEXIT flag interlocks with fsetown(). */ funsetownlst(&p->p_sigiolst); /* * Close open files and release open-file table. * This may block! */ pdescfree(td); fdescfree(td); /* * Remove ourself from our leader's peer list and wake our leader. 
*/ if (p->p_leader->p_peers != NULL) { mtx_lock(&ppeers_lock); if (p->p_leader->p_peers != NULL) { q = p->p_leader; while (q->p_peers != p) q = q->p_peers; q->p_peers = p->p_peers; wakeup(p->p_leader); } mtx_unlock(&ppeers_lock); } exec_free_abi_mappings(p); vmspace_exit(td); (void)acct_process(td); #ifdef KTRACE ktrprocexit(td); #endif /* * Release reference to text vnode etc */ if (p->p_textvp != NULL) { vrele(p->p_textvp); p->p_textvp = NULL; } if (p->p_textdvp != NULL) { vrele(p->p_textdvp); p->p_textdvp = NULL; } if (p->p_binname != NULL) { free(p->p_binname, M_PARGS); p->p_binname = NULL; } /* * Release our limits structure. */ lim_free(p->p_limit); p->p_limit = NULL; tidhash_remove(td); /* * Call machine-dependent code to release any * machine-dependent resources other than the address space. * The address space is released by "vmspace_exitfree(p)" in * vm_waitproc(). */ cpu_exit(td); WITNESS_WARN(WARN_PANIC, NULL, "process (pid %d) exiting", p->p_pid); /* * Remove from allproc. It still sits in the hash. */ sx_xlock(&allproc_lock); LIST_REMOVE(p, p_list); #ifdef DDB /* * Used by ddb's 'ps' command to find this process via the * pidhash. */ p->p_list.le_prev = NULL; #endif prison_proc_unlink(p->p_ucred->cr_prison, p); sx_xunlock(&allproc_lock); sx_xlock(&proctree_lock); if ((p->p_flag & (P_TRACED | P_PPWAIT | P_PPTRACE)) != 0) { PROC_LOCK(p); p->p_flag &= ~(P_TRACED | P_PPWAIT | P_PPTRACE); PROC_UNLOCK(p); } /* * killjobc() might drop and re-acquire proctree_lock to * revoke control tty if exiting process was a session leader. */ killjobc(); /* * Reparent all children processes: * - traced ones to the original parent (or init if we are that parent) * - the rest to init */ q = LIST_FIRST(&p->p_children); if (q != NULL) /* only need this if any child is S_ZOMB */ wakeup(q->p_reaper); for (; q != NULL; q = nq) { nq = LIST_NEXT(q, p_sibling); ksi = ksiginfo_alloc(M_WAITOK); PROC_LOCK(q); q->p_sigparent = SIGCHLD; if ((q->p_flag & P_TRACED) == 0) { proc_reparent(q, q->p_reaper, true); if (q->p_state == PRS_ZOMBIE) { /* * Inform reaper about the reparented * zombie, since wait(2) has something * new to report. Guarantee queueing * of the SIGCHLD signal, similar to * the _exit() behaviour, by providing * our ksiginfo. Ksi is freed by the * signal delivery. */ if (q->p_ksi == NULL) { ksi1 = NULL; } else { ksiginfo_copy(q->p_ksi, ksi); ksi->ksi_flags |= KSI_INS; ksi1 = ksi; ksi = NULL; } PROC_LOCK(q->p_reaper); pksignal(q->p_reaper, SIGCHLD, ksi1); PROC_UNLOCK(q->p_reaper); } else if (q->p_pdeathsig > 0) { /* * The child asked to received a signal * when we exit. */ kern_psignal(q, q->p_pdeathsig); } } else { /* * Traced processes are killed by default * since their existence means someone is * screwing up. */ t = proc_realparent(q); if (t == p) { proc_reparent(q, q->p_reaper, true); } else { PROC_LOCK(t); proc_reparent(q, t, true); PROC_UNLOCK(t); } /* * Since q was found on our children list, the * proc_reparent() call moved q to the orphan * list due to present P_TRACED flag. Clear * orphan link for q now while q is locked. 
*/ proc_clear_orphan(q); q->p_flag &= ~P_TRACED; q->p_flag2 &= ~P2_PTRACE_FSTP; q->p_ptevents = 0; p->p_xthread = NULL; FOREACH_THREAD_IN_PROC(q, tdt) { tdt->td_dbgflags &= ~(TDB_SUSPEND | TDB_XSIG | TDB_FSTP); tdt->td_xsig = 0; } if (kern_kill_on_dbg_exit) { q->p_flag &= ~P_STOPPED_TRACE; kern_psignal(q, SIGKILL); } else if ((q->p_flag & (P_STOPPED_TRACE | P_STOPPED_SIG)) != 0) { sigqueue_delete_proc(q, SIGTRAP); ptrace_unsuspend(q); } } PROC_UNLOCK(q); if (ksi != NULL) ksiginfo_free(ksi); } /* * Also get rid of our orphans. */ while ((q = LIST_FIRST(&p->p_orphans)) != NULL) { PROC_LOCK(q); KASSERT(q->p_oppid == p->p_pid, ("orphan %p of %p has unexpected oppid %d", q, p, q->p_oppid)); q->p_oppid = q->p_reaper->p_pid; /* * If we are the real parent of this process * but it has been reparented to a debugger, then * check if it asked for a signal when we exit. */ if (q->p_pdeathsig > 0) kern_psignal(q, q->p_pdeathsig); CTR2(KTR_PTRACE, "exit: pid %d, clearing orphan %d", p->p_pid, q->p_pid); proc_clear_orphan(q); PROC_UNLOCK(q); } #ifdef KDTRACE_HOOKS if (SDT_PROBES_ENABLED()) { int reason = CLD_EXITED; if (WCOREDUMP(signo)) reason = CLD_DUMPED; else if (WIFSIGNALED(signo)) reason = CLD_KILLED; SDT_PROBE1(proc, , , exit, reason); } #endif /* Save exit status. */ PROC_LOCK(p); p->p_xthread = td; if (p->p_sysent->sv_ontdexit != NULL) p->p_sysent->sv_ontdexit(td); #ifdef KDTRACE_HOOKS /* * Tell the DTrace fasttrap provider about the exit if it * has declared an interest. */ if (dtrace_fasttrap_exit) dtrace_fasttrap_exit(p); #endif /* * Notify interested parties of our demise. */ KNOTE_LOCKED(p->p_klist, NOTE_EXIT); /* * If this is a process with a descriptor, we may not need to deliver * a signal to the parent. proctree_lock is held over * procdesc_exit() to serialize concurrent calls to close() and * exit(). */ signal_parent = 0; if (p->p_procdesc == NULL || procdesc_exit(p)) { /* * Notify parent that we're gone. If parent has the * PS_NOCLDWAIT flag set, or if the handler is set to SIG_IGN, * notify process 1 instead (and hope it will handle this * situation). */ PROC_LOCK(p->p_pptr); mtx_lock(&p->p_pptr->p_sigacts->ps_mtx); if (p->p_pptr->p_sigacts->ps_flag & (PS_NOCLDWAIT | PS_CLDSIGIGN)) { struct proc *pp; mtx_unlock(&p->p_pptr->p_sigacts->ps_mtx); pp = p->p_pptr; PROC_UNLOCK(pp); proc_reparent(p, p->p_reaper, true); p->p_sigparent = SIGCHLD; PROC_LOCK(p->p_pptr); /* * Notify parent, so in case he was wait(2)ing or * executing waitpid(2) with our pid, he will * continue. */ wakeup(pp); } else mtx_unlock(&p->p_pptr->p_sigacts->ps_mtx); if (p->p_pptr == p->p_reaper || p->p_pptr == initproc) { signal_parent = 1; } else if (p->p_sigparent != 0) { if (p->p_sigparent == SIGCHLD) { signal_parent = 1; } else { /* LINUX thread */ signal_parent = 2; } } } else PROC_LOCK(p->p_pptr); sx_xunlock(&proctree_lock); if (signal_parent == 1) { childproc_exited(p); } else if (signal_parent == 2) { kern_psignal(p->p_pptr, p->p_sigparent); } /* Tell the prison that we are gone. */ prison_proc_free(p->p_ucred->cr_prison); /* * The state PRS_ZOMBIE prevents other processes from sending * signal to the process, to avoid memory leak, we free memory * for signal queue at the time when the state is set. */ sigqueue_flush(&p->p_sigqueue); sigqueue_flush(&td->td_sigqueue); /* * We have to wait until after acquiring all locks before * changing p_state. We need to avoid all possible context * switches (including ones from blocking on a mutex) while * marked as a zombie. 
We also have to set the zombie state * before we release the parent process' proc lock to avoid * a lost wakeup. So, we first call wakeup, then we grab the * sched lock, update the state, and release the parent process' * proc lock. */ wakeup(p->p_pptr); cv_broadcast(&p->p_pwait); sched_exit(p->p_pptr, td); PROC_SLOCK(p); p->p_state = PRS_ZOMBIE; PROC_UNLOCK(p->p_pptr); /* * Save our children's rusage information in our exit rusage. */ PROC_STATLOCK(p); ruadd(&p->p_ru, &p->p_rux, &p->p_stats->p_cru, &p->p_crux); PROC_STATUNLOCK(p); /* * Make sure the scheduler takes this thread out of its tables etc. * This will also release this thread's reference to the ucred. * Other thread parts to release include pcb bits and such. */ thread_exit(); } #ifndef _SYS_SYSPROTO_H_ struct abort2_args { char *why; int nargs; void **args; }; #endif int sys_abort2(struct thread *td, struct abort2_args *uap) { void *uargs[16]; void **uargsp; int error, nargs; nargs = uap->nargs; if (nargs < 0 || nargs > nitems(uargs)) nargs = -1; uargsp = NULL; if (nargs > 0) { if (uap->args != NULL) { error = copyin(uap->args, uargs, nargs * sizeof(void *)); if (error != 0) nargs = -1; else uargsp = uargs; } else nargs = -1; } return (kern_abort2(td, uap->why, nargs, uargsp)); } /* * kern_abort2() * Arguments: * why - user pointer to why * nargs - number of arguments copied or -1 if an error occurred in copying * args - pointer to an array of pointers in kernel format */ int kern_abort2(struct thread *td, const char *why, int nargs, void **uargs) { struct proc *p = td->td_proc; struct sbuf *sb; int error, i, sig; /* * Do it right now so we can log either proper call of abort2(), or * note, that invalid argument was passed. 512 is big enough to * handle 16 arguments' descriptions with additional comments. */ sb = sbuf_new(NULL, NULL, 512, SBUF_FIXEDLEN); sbuf_clear(sb); sbuf_printf(sb, "%s(pid %d uid %d) aborted: ", p->p_comm, p->p_pid, td->td_ucred->cr_uid); /* * Since we can't return from abort2(), send SIGKILL in cases, where * abort2() was called improperly */ sig = SIGKILL; /* Prevent from DoSes from user-space. */ if (nargs == -1) goto out; KASSERT(nargs >= 0 && nargs <= 16, ("called with too many args (%d)", nargs)); /* * Limit size of 'reason' string to 128. Will fit even when * maximal number of arguments was chosen to be logged. */ if (why != NULL) { error = sbuf_copyin(sb, why, 128); if (error < 0) goto out; } else { - sbuf_printf(sb, "(null)"); + sbuf_cat(sb, "(null)"); } if (nargs > 0) { - sbuf_printf(sb, "("); + sbuf_putc(sb, '('); for (i = 0;i < nargs; i++) sbuf_printf(sb, "%s%p", i == 0 ? "" : ", ", uargs[i]); - sbuf_printf(sb, ")"); + sbuf_putc(sb, ')'); } /* * Final stage: arguments were proper, string has been * successfully copied from userspace, and copying pointers * from user-space succeed. */ sig = SIGABRT; out: if (sig == SIGKILL) { sbuf_trim(sb); - sbuf_printf(sb, " (Reason text inaccessible)"); + sbuf_cat(sb, " (Reason text inaccessible)"); } sbuf_cat(sb, "\n"); sbuf_finish(sb); log(LOG_INFO, "%s", sbuf_data(sb)); sbuf_delete(sb); PROC_LOCK(p); sigexit(td, sig); /* NOTREACHED */ } #ifdef COMPAT_43 /* * The dirty work is handled by kern_wait(). */ int owait(struct thread *td, struct owait_args *uap __unused) { int error, status; error = kern_wait(td, WAIT_ANY, &status, 0, NULL); if (error == 0) td->td_retval[1] = status; return (error); } #endif /* COMPAT_43 */ /* * The dirty work is handled by kern_wait(). 
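 * kern_wait() translates the classic pid argument into the (idtype, id)
 * pair expected by kern_wait6(): WAIT_ANY becomes P_ALL, a negative pid
 * becomes P_PGID with id = -pid, and a positive pid becomes P_PID;
 * WAIT_MYPGRP is resolved inside kern_wait6() itself.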
*/ int sys_wait4(struct thread *td, struct wait4_args *uap) { struct rusage ru, *rup; int error, status; if (uap->rusage != NULL) rup = &ru; else rup = NULL; error = kern_wait(td, uap->pid, &status, uap->options, rup); if (uap->status != NULL && error == 0 && td->td_retval[0] != 0) error = copyout(&status, uap->status, sizeof(status)); if (uap->rusage != NULL && error == 0 && td->td_retval[0] != 0) error = copyout(&ru, uap->rusage, sizeof(struct rusage)); return (error); } int sys_wait6(struct thread *td, struct wait6_args *uap) { struct __wrusage wru, *wrup; siginfo_t si, *sip; idtype_t idtype; id_t id; int error, status; idtype = uap->idtype; id = uap->id; if (uap->wrusage != NULL) wrup = &wru; else wrup = NULL; if (uap->info != NULL) { sip = &si; bzero(sip, sizeof(*sip)); } else sip = NULL; /* * We expect all callers of wait6() to know about WEXITED and * WTRAPPED. */ error = kern_wait6(td, idtype, id, &status, uap->options, wrup, sip); if (uap->status != NULL && error == 0 && td->td_retval[0] != 0) error = copyout(&status, uap->status, sizeof(status)); if (uap->wrusage != NULL && error == 0 && td->td_retval[0] != 0) error = copyout(&wru, uap->wrusage, sizeof(wru)); if (uap->info != NULL && error == 0) error = copyout(&si, uap->info, sizeof(si)); return (error); } /* * Reap the remains of a zombie process and optionally return status and * rusage. Asserts and will release both the proctree_lock and the process * lock as part of its work. */ void proc_reap(struct thread *td, struct proc *p, int *status, int options) { struct proc *q, *t; sx_assert(&proctree_lock, SA_XLOCKED); PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT(p->p_state == PRS_ZOMBIE, ("proc_reap: !PRS_ZOMBIE")); mtx_spin_wait_unlocked(&p->p_slock); q = td->td_proc; if (status) *status = KW_EXITCODE(p->p_xexit, p->p_xsig); if (options & WNOWAIT) { /* * Only poll, returning the status. Caller does not wish to * release the proc struct just yet. */ PROC_UNLOCK(p); sx_xunlock(&proctree_lock); return; } PROC_LOCK(q); sigqueue_take(p->p_ksi); PROC_UNLOCK(q); /* * If we got the child via a ptrace 'attach', we need to give it back * to the old parent. */ if (p->p_oppid != p->p_pptr->p_pid) { PROC_UNLOCK(p); t = proc_realparent(p); PROC_LOCK(t); PROC_LOCK(p); CTR2(KTR_PTRACE, "wait: traced child %d moved back to parent %d", p->p_pid, t->p_pid); proc_reparent(p, t, false); PROC_UNLOCK(p); pksignal(t, SIGCHLD, p->p_ksi); wakeup(t); cv_broadcast(&p->p_pwait); PROC_UNLOCK(t); sx_xunlock(&proctree_lock); return; } PROC_UNLOCK(p); /* * Remove other references to this process to ensure we have an * exclusive reference. */ sx_xlock(PIDHASHLOCK(p->p_pid)); LIST_REMOVE(p, p_hash); sx_xunlock(PIDHASHLOCK(p->p_pid)); LIST_REMOVE(p, p_sibling); reaper_abandon_children(p, true); reaper_clear(p); PROC_LOCK(p); proc_clear_orphan(p); PROC_UNLOCK(p); leavepgrp(p); if (p->p_procdesc != NULL) procdesc_reap(p); sx_xunlock(&proctree_lock); proc_id_clear(PROC_ID_PID, p->p_pid); PROC_LOCK(p); knlist_detach(p->p_klist); p->p_klist = NULL; PROC_UNLOCK(p); /* * Removal from allproc list and process group list paired with * PROC_LOCK which was executed during that time should guarantee * nothing can reach this process anymore. As such further locking * is unnecessary. */ p->p_xexit = p->p_xsig = 0; /* XXX: why? */ PROC_LOCK(q); ruadd(&q->p_stats->p_cru, &q->p_crux, &p->p_ru, &p->p_rux); PROC_UNLOCK(q); /* * Decrement the count of procs running with this uid. 
*/ (void)chgproccnt(p->p_ucred->cr_ruidinfo, -1, 0); /* * Destroy resource accounting information associated with the process. */ #ifdef RACCT if (racct_enable) { PROC_LOCK(p); racct_sub(p, RACCT_NPROC, 1); PROC_UNLOCK(p); } #endif racct_proc_exit(p); /* * Free credentials, arguments, and sigacts. */ proc_unset_cred(p); pargs_drop(p->p_args); p->p_args = NULL; sigacts_free(p->p_sigacts); p->p_sigacts = NULL; /* * Do any thread-system specific cleanups. */ thread_wait(p); /* * Give vm and machine-dependent layer a chance to free anything that * cpu_exit couldn't release while still running in process context. */ vm_waitproc(p); #ifdef MAC mac_proc_destroy(p); #endif KASSERT(FIRST_THREAD_IN_PROC(p), ("proc_reap: no residual thread!")); uma_zfree(proc_zone, p); atomic_add_int(&nprocs, -1); } static int proc_to_reap(struct thread *td, struct proc *p, idtype_t idtype, id_t id, int *status, int options, struct __wrusage *wrusage, siginfo_t *siginfo, int check_only) { struct rusage *rup; sx_assert(&proctree_lock, SA_XLOCKED); PROC_LOCK(p); switch (idtype) { case P_ALL: if (p->p_procdesc == NULL || (p->p_pptr == td->td_proc && (p->p_flag & P_TRACED) != 0)) { break; } PROC_UNLOCK(p); return (0); case P_PID: if (p->p_pid != (pid_t)id) { PROC_UNLOCK(p); return (0); } break; case P_PGID: if (p->p_pgid != (pid_t)id) { PROC_UNLOCK(p); return (0); } break; case P_SID: if (p->p_session->s_sid != (pid_t)id) { PROC_UNLOCK(p); return (0); } break; case P_UID: if (p->p_ucred->cr_uid != (uid_t)id) { PROC_UNLOCK(p); return (0); } break; case P_GID: if (p->p_ucred->cr_gid != (gid_t)id) { PROC_UNLOCK(p); return (0); } break; case P_JAILID: if (p->p_ucred->cr_prison->pr_id != (int)id) { PROC_UNLOCK(p); return (0); } break; /* * It seems that the thread structures get zeroed out * at process exit. This makes it impossible to * support P_SETID, P_CID or P_CPUID. */ default: PROC_UNLOCK(p); return (0); } if (p_canwait(td, p)) { PROC_UNLOCK(p); return (0); } if (((options & WEXITED) == 0) && (p->p_state == PRS_ZOMBIE)) { PROC_UNLOCK(p); return (0); } /* * This special case handles a kthread spawned by linux_clone * (see linux_misc.c). The linux_wait4 and linux_waitpid * functions need to be able to distinguish between waiting * on a process and waiting on a thread. It is a thread if * p_sigparent is not SIGCHLD, and the WLINUXCLONE option * signifies we want to wait for threads and not processes. */ if ((p->p_sigparent != SIGCHLD) ^ ((options & WLINUXCLONE) != 0)) { PROC_UNLOCK(p); return (0); } if (siginfo != NULL) { bzero(siginfo, sizeof(*siginfo)); siginfo->si_errno = 0; /* * SUSv4 requires that the si_signo value is always * SIGCHLD. Obey it despite the rfork(2) interface * allows to request other signal for child exit * notification. */ siginfo->si_signo = SIGCHLD; /* * This is still a rough estimate. We will fix the * cases TRAPPED, STOPPED, and CONTINUED later. */ if (WCOREDUMP(p->p_xsig)) { siginfo->si_code = CLD_DUMPED; siginfo->si_status = WTERMSIG(p->p_xsig); } else if (WIFSIGNALED(p->p_xsig)) { siginfo->si_code = CLD_KILLED; siginfo->si_status = WTERMSIG(p->p_xsig); } else { siginfo->si_code = CLD_EXITED; siginfo->si_status = p->p_xexit; } siginfo->si_pid = p->p_pid; siginfo->si_uid = p->p_ucred->cr_uid; /* * The si_addr field would be useful additional * detail, but apparently the PC value may be lost * when we reach this point. bzero() above sets * siginfo->si_addr to NULL. */ } /* * There should be no reason to limit resources usage info to * exited processes only. 
A snapshot about any resources used * by a stopped process may be exactly what is needed. */ if (wrusage != NULL) { rup = &wrusage->wru_self; *rup = p->p_ru; PROC_STATLOCK(p); calcru(p, &rup->ru_utime, &rup->ru_stime); PROC_STATUNLOCK(p); rup = &wrusage->wru_children; *rup = p->p_stats->p_cru; calccru(p, &rup->ru_utime, &rup->ru_stime); } if (p->p_state == PRS_ZOMBIE && !check_only) { proc_reap(td, p, status, options); return (-1); } return (1); } int kern_wait(struct thread *td, pid_t pid, int *status, int options, struct rusage *rusage) { struct __wrusage wru, *wrup; idtype_t idtype; id_t id; int ret; /* * Translate the special pid values into the (idtype, pid) * pair for kern_wait6. The WAIT_MYPGRP case is handled by * kern_wait6() on its own. */ if (pid == WAIT_ANY) { idtype = P_ALL; id = 0; } else if (pid < 0) { idtype = P_PGID; id = (id_t)-pid; } else { idtype = P_PID; id = (id_t)pid; } if (rusage != NULL) wrup = &wru; else wrup = NULL; /* * For backward compatibility we implicitly add flags WEXITED * and WTRAPPED here. */ options |= WEXITED | WTRAPPED; ret = kern_wait6(td, idtype, id, status, options, wrup, NULL); if (rusage != NULL) *rusage = wru.wru_self; return (ret); } static void report_alive_proc(struct thread *td, struct proc *p, siginfo_t *siginfo, int *status, int options, int si_code) { bool cont; PROC_LOCK_ASSERT(p, MA_OWNED); sx_assert(&proctree_lock, SA_XLOCKED); MPASS(si_code == CLD_TRAPPED || si_code == CLD_STOPPED || si_code == CLD_CONTINUED); cont = si_code == CLD_CONTINUED; if ((options & WNOWAIT) == 0) { if (cont) p->p_flag &= ~P_CONTINUED; else p->p_flag |= P_WAITED; if (kern_wait_dequeue_sigchld && (td->td_proc->p_sysent->sv_flags & SV_SIG_WAITNDQ) == 0) { PROC_LOCK(td->td_proc); sigqueue_take(p->p_ksi); PROC_UNLOCK(td->td_proc); } } sx_xunlock(&proctree_lock); if (siginfo != NULL) { siginfo->si_code = si_code; siginfo->si_status = cont ? SIGCONT : p->p_xsig; } if (status != NULL) *status = cont ? SIGCONT : W_STOPCODE(p->p_xsig); td->td_retval[0] = p->p_pid; PROC_UNLOCK(p); } int kern_wait6(struct thread *td, idtype_t idtype, id_t id, int *status, int options, struct __wrusage *wrusage, siginfo_t *siginfo) { struct proc *p, *q; pid_t pid; int error, nfound, ret; bool report; AUDIT_ARG_VALUE((int)idtype); /* XXX - This is likely wrong! */ AUDIT_ARG_PID((pid_t)id); /* XXX - This may be wrong! */ AUDIT_ARG_VALUE(options); q = td->td_proc; if ((pid_t)id == WAIT_MYPGRP && (idtype == P_PID || idtype == P_PGID)) { PROC_LOCK(q); id = (id_t)q->p_pgid; PROC_UNLOCK(q); idtype = P_PGID; } /* If we don't know the option, just return. */ if ((options & ~(WUNTRACED | WNOHANG | WCONTINUED | WNOWAIT | WEXITED | WTRAPPED | WLINUXCLONE)) != 0) return (EINVAL); if ((options & (WEXITED | WUNTRACED | WCONTINUED | WTRAPPED)) == 0) { /* * We will be unable to find any matching processes, * because there are no known events to look for. * Prefer to return error instead of blocking * indefinitely. 
*/ return (EINVAL); } loop: if (q->p_flag & P_STATCHILD) { PROC_LOCK(q); q->p_flag &= ~P_STATCHILD; PROC_UNLOCK(q); } sx_xlock(&proctree_lock); loop_locked: nfound = 0; LIST_FOREACH(p, &q->p_children, p_sibling) { pid = p->p_pid; ret = proc_to_reap(td, p, idtype, id, status, options, wrusage, siginfo, 0); if (ret == 0) continue; else if (ret != 1) { td->td_retval[0] = pid; return (0); } nfound++; PROC_LOCK_ASSERT(p, MA_OWNED); if ((options & WTRAPPED) != 0 && (p->p_flag & P_TRACED) != 0) { PROC_SLOCK(p); report = ((p->p_flag & (P_STOPPED_TRACE | P_STOPPED_SIG)) && p->p_suspcount == p->p_numthreads && (p->p_flag & P_WAITED) == 0); PROC_SUNLOCK(p); if (report) { CTR4(KTR_PTRACE, "wait: returning trapped pid %d status %#x " "(xstat %d) xthread %d", p->p_pid, W_STOPCODE(p->p_xsig), p->p_xsig, p->p_xthread != NULL ? p->p_xthread->td_tid : -1); report_alive_proc(td, p, siginfo, status, options, CLD_TRAPPED); return (0); } } if ((options & WUNTRACED) != 0 && (p->p_flag & P_STOPPED_SIG) != 0) { PROC_SLOCK(p); report = (p->p_suspcount == p->p_numthreads && ((p->p_flag & P_WAITED) == 0)); PROC_SUNLOCK(p); if (report) { report_alive_proc(td, p, siginfo, status, options, CLD_STOPPED); return (0); } } if ((options & WCONTINUED) != 0 && (p->p_flag & P_CONTINUED) != 0) { report_alive_proc(td, p, siginfo, status, options, CLD_CONTINUED); return (0); } PROC_UNLOCK(p); } /* * Look in the orphans list too, to allow the parent to * collect it's child exit status even if child is being * debugged. * * Debugger detaches from the parent upon successful * switch-over from parent to child. At this point due to * re-parenting the parent loses the child to debugger and a * wait4(2) call would report that it has no children to wait * for. By maintaining a list of orphans we allow the parent * to successfully wait until the child becomes a zombie. */ if (nfound == 0) { LIST_FOREACH(p, &q->p_orphans, p_orphan) { ret = proc_to_reap(td, p, idtype, id, NULL, options, NULL, NULL, 1); if (ret != 0) { KASSERT(ret != -1, ("reaped an orphan (pid %d)", (int)td->td_retval[0])); PROC_UNLOCK(p); nfound++; break; } } } if (nfound == 0) { sx_xunlock(&proctree_lock); return (ECHILD); } if (options & WNOHANG) { sx_xunlock(&proctree_lock); td->td_retval[0] = 0; return (0); } PROC_LOCK(q); if (q->p_flag & P_STATCHILD) { q->p_flag &= ~P_STATCHILD; PROC_UNLOCK(q); goto loop_locked; } sx_xunlock(&proctree_lock); error = msleep(q, &q->p_mtx, PWAIT | PCATCH | PDROP, "wait", 0); if (error) return (error); goto loop; } void proc_add_orphan(struct proc *child, struct proc *parent) { sx_assert(&proctree_lock, SX_XLOCKED); KASSERT((child->p_flag & P_TRACED) != 0, ("proc_add_orphan: not traced")); if (LIST_EMPTY(&parent->p_orphans)) { child->p_treeflag |= P_TREE_FIRST_ORPHAN; LIST_INSERT_HEAD(&parent->p_orphans, child, p_orphan); } else { LIST_INSERT_AFTER(LIST_FIRST(&parent->p_orphans), child, p_orphan); } child->p_treeflag |= P_TREE_ORPHANED; } /* * Make process 'parent' the new parent of process 'child'. * Must be called with an exclusive hold of proctree lock. 
*/ void proc_reparent(struct proc *child, struct proc *parent, bool set_oppid) { sx_assert(&proctree_lock, SX_XLOCKED); PROC_LOCK_ASSERT(child, MA_OWNED); if (child->p_pptr == parent) return; PROC_LOCK(child->p_pptr); sigqueue_take(child->p_ksi); PROC_UNLOCK(child->p_pptr); LIST_REMOVE(child, p_sibling); LIST_INSERT_HEAD(&parent->p_children, child, p_sibling); proc_clear_orphan(child); if ((child->p_flag & P_TRACED) != 0) { proc_add_orphan(child, child->p_pptr); } child->p_pptr = parent; if (set_oppid) child->p_oppid = parent->p_pid; } diff --git a/sys/kern/kern_fail.c b/sys/kern/kern_fail.c index f60500b22ef4..883b664aef0d 100644 --- a/sys/kern/kern_fail.c +++ b/sys/kern/kern_fail.c @@ -1,1145 +1,1145 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2009 Isilon Inc http://www.isilon.com/ * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /** * @file * * fail(9) Facility. * * @ingroup failpoint_private */ /** * @defgroup failpoint fail(9) Facility * * Failpoints allow for injecting fake errors into running code on the fly, * without modifying code or recompiling with flags. Failpoints are always * present, and are very efficient when disabled. Failpoints are described * in man fail(9). */ /** * @defgroup failpoint_private Private fail(9) Implementation functions * * Private implementations for the actual failpoint code. * * @ingroup failpoint */ /** * @addtogroup failpoint_private * @{ */ #include #include "opt_stack.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef ILOG_DEFINE_FOR_FILE ILOG_DEFINE_FOR_FILE(L_ISI_FAIL_POINT, L_ILOG, fail_point); #endif static MALLOC_DEFINE(M_FAIL_POINT, "Fail Points", "fail points system"); #define fp_free(ptr) free(ptr, M_FAIL_POINT) #define fp_malloc(size, flags) malloc((size), M_FAIL_POINT, (flags)) #define fs_free(ptr) fp_free(ptr) #define fs_malloc() fp_malloc(sizeof(struct fail_point_setting), \ M_WAITOK | M_ZERO) /** * These define the wchans that are used for sleeping, pausing respectively. * They are chosen arbitrarily but need to be distinct to the failpoint and * the sleep/pause distinction. 
*/ #define FP_SLEEP_CHANNEL(fp) (void*)(fp) #define FP_PAUSE_CHANNEL(fp) __DEVOLATILE(void*, &fp->fp_setting) /** * Don't allow more than this many entries in a fail point set by sysctl. * The 99.99...% case is to have 1 entry. I can't imagine having this many * entries, so it should not limit us. Saves on re-mallocs while holding * a non-sleepable lock. */ #define FP_MAX_ENTRY_COUNT 20 /* Used to drain sbufs to the sysctl output */ int fail_sysctl_drain_func(void *, const char *, int); /* Head of tailq of struct fail_point_entry */ TAILQ_HEAD(fail_point_entry_queue, fail_point_entry); /** * fp entries garbage list; outstanding entries are cleaned up in the * garbage collector */ STAILQ_HEAD(fail_point_setting_garbage, fail_point_setting); static struct fail_point_setting_garbage fp_setting_garbage = STAILQ_HEAD_INITIALIZER(fp_setting_garbage); static struct mtx mtx_garbage_list; MTX_SYSINIT(mtx_garbage_list, &mtx_garbage_list, "fail point garbage mtx", MTX_SPIN); static struct sx sx_fp_set; SX_SYSINIT(sx_fp_set, &sx_fp_set, "fail point set sx"); /** * Failpoint types. * Don't change these without changing fail_type_strings in fail.c. * @ingroup failpoint_private */ enum fail_point_t { FAIL_POINT_OFF, /**< don't fail */ FAIL_POINT_PANIC, /**< panic */ FAIL_POINT_RETURN, /**< return an errorcode */ FAIL_POINT_BREAK, /**< break into the debugger */ FAIL_POINT_PRINT, /**< print a message */ FAIL_POINT_SLEEP, /**< sleep for some msecs */ FAIL_POINT_PAUSE, /**< sleep until failpoint is set to off */ FAIL_POINT_YIELD, /**< yield the cpu */ FAIL_POINT_DELAY, /**< busy wait the cpu */ FAIL_POINT_NUMTYPES, FAIL_POINT_INVALID = -1 }; static struct { const char *name; int nmlen; } fail_type_strings[] = { #define FP_TYPE_NM_LEN(s) { s, sizeof(s) - 1 } [FAIL_POINT_OFF] = FP_TYPE_NM_LEN("off"), [FAIL_POINT_PANIC] = FP_TYPE_NM_LEN("panic"), [FAIL_POINT_RETURN] = FP_TYPE_NM_LEN("return"), [FAIL_POINT_BREAK] = FP_TYPE_NM_LEN("break"), [FAIL_POINT_PRINT] = FP_TYPE_NM_LEN("print"), [FAIL_POINT_SLEEP] = FP_TYPE_NM_LEN("sleep"), [FAIL_POINT_PAUSE] = FP_TYPE_NM_LEN("pause"), [FAIL_POINT_YIELD] = FP_TYPE_NM_LEN("yield"), [FAIL_POINT_DELAY] = FP_TYPE_NM_LEN("delay"), }; #define FE_COUNT_UNTRACKED (INT_MIN) /** * Internal structure tracking a single term of a complete failpoint. * @ingroup failpoint_private */ struct fail_point_entry { volatile bool fe_stale; enum fail_point_t fe_type; /**< type of entry */ int fe_arg; /**< argument to type (e.g. return value) */ int fe_prob; /**< likelihood of firing in millionths */ int32_t fe_count; /**< number of times to fire, -1 means infinite */ pid_t fe_pid; /**< only fail for this process */ struct fail_point *fe_parent; /**< backpointer to fp */ TAILQ_ENTRY(fail_point_entry) fe_entries; /**< next entry ptr */ }; struct fail_point_setting { STAILQ_ENTRY(fail_point_setting) fs_garbage_link; struct fail_point_entry_queue fp_entry_queue; struct fail_point * fs_parent; struct mtx feq_mtx; /* Gives fail_point_pause something to do. 
*/ }; /** * Defines stating the equivalent of probablilty one (100%) */ enum { PROB_MAX = 1000000, /* probability between zero and this number */ PROB_DIGITS = 6 /* number of zero's in above number */ }; /* Get a ref on an fp's fp_setting */ static inline struct fail_point_setting *fail_point_setting_get_ref( struct fail_point *fp); /* Release a ref on an fp_setting */ static inline void fail_point_setting_release_ref(struct fail_point *fp); /* Allocate and initialize a struct fail_point_setting */ static struct fail_point_setting *fail_point_setting_new(struct fail_point *); /* Free a struct fail_point_setting */ static void fail_point_setting_destroy(struct fail_point_setting *fp_setting); /* Allocate and initialize a struct fail_point_entry */ static struct fail_point_entry *fail_point_entry_new(struct fail_point_setting *); /* Free a struct fail_point_entry */ static void fail_point_entry_destroy(struct fail_point_entry *fp_entry); /* Append fp setting to garbage list */ static inline void fail_point_setting_garbage_append( struct fail_point_setting *fp_setting); /* Swap fp's setting with fp_setting_new */ static inline struct fail_point_setting * fail_point_swap_settings(struct fail_point *fp, struct fail_point_setting *fp_setting_new); /* Free up any zero-ref setting in the garbage queue */ static void fail_point_garbage_collect(void); /* If this fail point's setting are empty, then swap it out to NULL. */ static inline void fail_point_eval_swap_out(struct fail_point *fp, struct fail_point_setting *fp_setting); bool fail_point_is_off(struct fail_point *fp) { bool return_val; struct fail_point_setting *fp_setting; struct fail_point_entry *ent; return_val = true; fp_setting = fail_point_setting_get_ref(fp); if (fp_setting != NULL) { TAILQ_FOREACH(ent, &fp_setting->fp_entry_queue, fe_entries) { if (!ent->fe_stale) { return_val = false; break; } } } fail_point_setting_release_ref(fp); return (return_val); } /* Allocate and initialize a struct fail_point_setting */ static struct fail_point_setting * fail_point_setting_new(struct fail_point *fp) { struct fail_point_setting *fs_new; fs_new = fs_malloc(); fs_new->fs_parent = fp; TAILQ_INIT(&fs_new->fp_entry_queue); mtx_init(&fs_new->feq_mtx, "fail point entries", NULL, MTX_SPIN); fail_point_setting_garbage_append(fs_new); return (fs_new); } /* Free a struct fail_point_setting */ static void fail_point_setting_destroy(struct fail_point_setting *fp_setting) { struct fail_point_entry *ent; while (!TAILQ_EMPTY(&fp_setting->fp_entry_queue)) { ent = TAILQ_FIRST(&fp_setting->fp_entry_queue); TAILQ_REMOVE(&fp_setting->fp_entry_queue, ent, fe_entries); fail_point_entry_destroy(ent); } fs_free(fp_setting); } /* Allocate and initialize a struct fail_point_entry */ static struct fail_point_entry * fail_point_entry_new(struct fail_point_setting *fp_setting) { struct fail_point_entry *fp_entry; fp_entry = fp_malloc(sizeof(struct fail_point_entry), M_WAITOK | M_ZERO); fp_entry->fe_parent = fp_setting->fs_parent; fp_entry->fe_prob = PROB_MAX; fp_entry->fe_pid = NO_PID; fp_entry->fe_count = FE_COUNT_UNTRACKED; TAILQ_INSERT_TAIL(&fp_setting->fp_entry_queue, fp_entry, fe_entries); return (fp_entry); } /* Free a struct fail_point_entry */ static void fail_point_entry_destroy(struct fail_point_entry *fp_entry) { fp_free(fp_entry); } /* Get a ref on an fp's fp_setting */ static inline struct fail_point_setting * fail_point_setting_get_ref(struct fail_point *fp) { struct fail_point_setting *fp_setting; /* Invariant: if we have a ref, our pointer to fp_setting is 
safe */ atomic_add_acq_32(&fp->fp_ref_cnt, 1); fp_setting = fp->fp_setting; return (fp_setting); } /* Release a ref on an fp_setting */ static inline void fail_point_setting_release_ref(struct fail_point *fp) { KASSERT(&fp->fp_ref_cnt > 0, ("Attempting to deref w/no refs")); atomic_subtract_rel_32(&fp->fp_ref_cnt, 1); } /* Append fp entries to fp garbage list */ static inline void fail_point_setting_garbage_append(struct fail_point_setting *fp_setting) { mtx_lock_spin(&mtx_garbage_list); STAILQ_INSERT_TAIL(&fp_setting_garbage, fp_setting, fs_garbage_link); mtx_unlock_spin(&mtx_garbage_list); } /* Swap fp's entries with fp_setting_new */ static struct fail_point_setting * fail_point_swap_settings(struct fail_point *fp, struct fail_point_setting *fp_setting_new) { struct fail_point_setting *fp_setting_old; fp_setting_old = fp->fp_setting; fp->fp_setting = fp_setting_new; return (fp_setting_old); } static inline void fail_point_eval_swap_out(struct fail_point *fp, struct fail_point_setting *fp_setting) { /* We may have already been swapped out and replaced; ignore. */ if (fp->fp_setting == fp_setting) fail_point_swap_settings(fp, NULL); } /* Free up any zero-ref entries in the garbage queue */ static void fail_point_garbage_collect(void) { struct fail_point_setting *fs_current, *fs_next; struct fail_point_setting_garbage fp_ents_free_list; /** * We will transfer the entries to free to fp_ents_free_list while holding * the spin mutex, then free it after we drop the lock. This avoids * triggering witness due to sleepable mutexes in the memory * allocator. */ STAILQ_INIT(&fp_ents_free_list); mtx_lock_spin(&mtx_garbage_list); STAILQ_FOREACH_SAFE(fs_current, &fp_setting_garbage, fs_garbage_link, fs_next) { if (fs_current->fs_parent->fp_setting != fs_current && fs_current->fs_parent->fp_ref_cnt == 0) { STAILQ_REMOVE(&fp_setting_garbage, fs_current, fail_point_setting, fs_garbage_link); STAILQ_INSERT_HEAD(&fp_ents_free_list, fs_current, fs_garbage_link); } } mtx_unlock_spin(&mtx_garbage_list); STAILQ_FOREACH_SAFE(fs_current, &fp_ents_free_list, fs_garbage_link, fs_next) fail_point_setting_destroy(fs_current); } /* Drain out all refs from this fail point */ static inline void fail_point_drain(struct fail_point *fp, int expected_ref) { struct fail_point_setting *entries; entries = fail_point_swap_settings(fp, NULL); /** * We have unpaused all threads; so we will wait no longer * than the time taken for the longest remaining sleep, or * the length of time of a long-running code block. 
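fail_point_garbage_collect() above unlinks reclaimable settings onto a private list while holding the spin mutex and frees them only after dropping it, because the allocator may sleep. A rough userland sketch of that two-phase pattern, assuming a BSD-style <sys/queue.h> (as on FreeBSD) and with a pthread mutex standing in for the spin lock; the refs/retired fields are stand-ins for the fp_ref_cnt and "no longer the active fp_setting" checks:

/* cc gc_sketch.c -lpthread */
#include <pthread.h>
#include <stdlib.h>
#include <sys/queue.h>

struct setting {
	STAILQ_ENTRY(setting) link;
	int refs;	/* stand-in for fp_ref_cnt */
	int retired;	/* stand-in for "swapped out of fp->fp_setting" */
};

static STAILQ_HEAD(, setting) garbage = STAILQ_HEAD_INITIALIZER(garbage);
static pthread_mutex_t garbage_mtx = PTHREAD_MUTEX_INITIALIZER;

/* Unlink everything reclaimable under the lock, free it after unlocking. */
static void
collect(void)
{
	STAILQ_HEAD(, setting) free_list = STAILQ_HEAD_INITIALIZER(free_list);
	struct setting *s, *tmp;

	pthread_mutex_lock(&garbage_mtx);
	STAILQ_FOREACH_SAFE(s, &garbage, link, tmp) {
		if (s->retired && s->refs == 0) {
			STAILQ_REMOVE(&garbage, s, setting, link);
			STAILQ_INSERT_HEAD(&free_list, s, link);
		}
	}
	pthread_mutex_unlock(&garbage_mtx);

	STAILQ_FOREACH_SAFE(s, &free_list, link, tmp)
		free(s);	/* may block; the lock is no longer held */
}

int
main(void)
{
	struct setting *s = calloc(1, sizeof(*s));

	pthread_mutex_lock(&garbage_mtx);
	STAILQ_INSERT_TAIL(&garbage, s, link);
	pthread_mutex_unlock(&garbage_mtx);

	s->retired = 1;		/* pretend it was swapped out... */
	collect();		/* ...so the collector frees it */
	return (0);
}

Keeping the critical section down to list surgery is what avoids the witness complaint about sleepable allocations under a spin lock that the collector's comment alludes to.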
*/ while (fp->fp_ref_cnt > expected_ref) { wakeup(FP_PAUSE_CHANNEL(fp)); tsleep(&fp, PWAIT, "fail_point_drain", hz / 100); } if (fp->fp_callout) callout_drain(fp->fp_callout); fail_point_swap_settings(fp, entries); } static inline void fail_point_pause(struct fail_point *fp, enum fail_point_return_code *pret, struct mtx *mtx_sleep) { if (fp->fp_pre_sleep_fn) fp->fp_pre_sleep_fn(fp->fp_pre_sleep_arg); msleep_spin(FP_PAUSE_CHANNEL(fp), mtx_sleep, "failpt", 0); if (fp->fp_post_sleep_fn) fp->fp_post_sleep_fn(fp->fp_post_sleep_arg); } static inline void fail_point_sleep(struct fail_point *fp, int msecs, enum fail_point_return_code *pret) { int timo; /* Convert from millisecs to ticks, rounding up */ timo = howmany((int64_t)msecs * hz, 1000L); if (timo > 0) { if (!(fp->fp_flags & FAIL_POINT_USE_TIMEOUT_PATH)) { if (fp->fp_pre_sleep_fn) fp->fp_pre_sleep_fn(fp->fp_pre_sleep_arg); tsleep(FP_SLEEP_CHANNEL(fp), PWAIT, "failpt", timo); if (fp->fp_post_sleep_fn) fp->fp_post_sleep_fn(fp->fp_post_sleep_arg); } else { if (fp->fp_pre_sleep_fn) fp->fp_pre_sleep_fn(fp->fp_pre_sleep_arg); callout_reset(fp->fp_callout, timo, fp->fp_post_sleep_fn, fp->fp_post_sleep_arg); *pret = FAIL_POINT_RC_QUEUED; } } } static char *parse_fail_point(struct fail_point_setting *, char *); static char *parse_term(struct fail_point_setting *, char *); static char *parse_number(int *out_units, int *out_decimal, char *); static char *parse_type(struct fail_point_entry *, char *); /** * Initialize a fail_point. The name is formed in a printf-like fashion * from "fmt" and subsequent arguments. This function is generally used * for custom failpoints located at odd places in the sysctl tree, and is * not explicitly needed for standard in-line-declared failpoints. * * @ingroup failpoint */ void fail_point_init(struct fail_point *fp, const char *fmt, ...) { va_list ap; char *name; int n; fp->fp_setting = NULL; fp->fp_flags = 0; /* Figure out the size of the name. */ va_start(ap, fmt); n = vsnprintf(NULL, 0, fmt, ap); va_end(ap); /* Allocate the name and fill it in. */ name = fp_malloc(n + 1, M_WAITOK); if (name != NULL) { va_start(ap, fmt); vsnprintf(name, n + 1, fmt, ap); va_end(ap); } fp->fp_name = name; fp->fp_location = ""; fp->fp_flags |= FAIL_POINT_DYNAMIC_NAME; fp->fp_pre_sleep_fn = NULL; fp->fp_pre_sleep_arg = NULL; fp->fp_post_sleep_fn = NULL; fp->fp_post_sleep_arg = NULL; } void fail_point_alloc_callout(struct fail_point *fp) { /** * This assumes that calls to fail_point_use_timeout_path() * will not race. */ if (fp->fp_callout != NULL) return; fp->fp_callout = fp_malloc(sizeof(*fp->fp_callout), M_WAITOK); callout_init(fp->fp_callout, CALLOUT_MPSAFE); } /** * Free the resources held by a fail_point, and wake any paused threads. * Thou shalt not allow threads to hit this fail point after you enter this * function, nor shall you call this multiple times for a given fp. * @ingroup failpoint */ void fail_point_destroy(struct fail_point *fp) { fail_point_drain(fp, 0); if ((fp->fp_flags & FAIL_POINT_DYNAMIC_NAME) != 0) { fp_free(__DECONST(void *, fp->fp_name)); fp->fp_name = NULL; } fp->fp_flags = 0; if (fp->fp_callout) { fp_free(fp->fp_callout); fp->fp_callout = NULL; } sx_xlock(&sx_fp_set); fail_point_garbage_collect(); sx_xunlock(&sx_fp_set); } /** * This does the real work of evaluating a fail point. If the fail point tells * us to return a value, this function returns 1 and fills in 'return_value' * (return_value is allowed to be null). If the fail point tells us to panic, * we never return. 
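fail_point_init() above sizes the dynamic name with a NULL-buffer vsnprintf() pass and then formats into a freshly allocated buffer, and fail_point_sleep() converts milliseconds to ticks with howmany(), i.e. rounding up so a non-zero request never becomes a zero-tick sleep. A self-contained sketch of the two-pass formatting idiom, with plain malloc() standing in for fp_malloc(..., M_WAITOK):

#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>

/* Format into a buffer sized by a first, counting-only vsnprintf() pass. */
static char *
format_name(const char *fmt, ...)
{
	va_list ap;
	char *name;
	int n;

	va_start(ap, fmt);
	n = vsnprintf(NULL, 0, fmt, ap);	/* just count */
	va_end(ap);
	if (n < 0)
		return (NULL);

	name = malloc(n + 1);
	if (name == NULL)
		return (NULL);

	va_start(ap, fmt);
	vsnprintf(name, n + 1, fmt, ap);	/* now fill it in */
	va_end(ap);
	return (name);
}

int
main(void)
{
	char *name = format_name("%s_%d", "my_fail_point", 42);

	if (name != NULL)
		printf("%s\n", name);	/* my_fail_point_42 */
	free(name);
	return (0);
}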
Otherwise we just return 0 after doing some work, which * means "keep going". */ enum fail_point_return_code fail_point_eval_nontrivial(struct fail_point *fp, int *return_value) { bool execute = false; struct fail_point_entry *ent; struct fail_point_setting *fp_setting; enum fail_point_return_code ret; int cont; int count; int msecs; int usecs; ret = FAIL_POINT_RC_CONTINUE; cont = 0; /* don't continue by default */ fp_setting = fail_point_setting_get_ref(fp); if (fp_setting == NULL) goto abort; TAILQ_FOREACH(ent, &fp_setting->fp_entry_queue, fe_entries) { if (ent->fe_stale) continue; if (ent->fe_prob < PROB_MAX && ent->fe_prob < random() % PROB_MAX) continue; if (ent->fe_pid != NO_PID && ent->fe_pid != curproc->p_pid) continue; if (ent->fe_count != FE_COUNT_UNTRACKED) { count = ent->fe_count; while (count > 0) { if (atomic_cmpset_32(&ent->fe_count, count, count - 1)) { count--; execute = true; break; } count = ent->fe_count; } if (execute == false) /* We lost the race; consider the entry stale and bail now */ continue; if (count == 0) ent->fe_stale = true; } switch (ent->fe_type) { case FAIL_POINT_PANIC: panic("fail point %s panicking", fp->fp_name); /* NOTREACHED */ case FAIL_POINT_RETURN: if (return_value != NULL) *return_value = ent->fe_arg; ret = FAIL_POINT_RC_RETURN; break; case FAIL_POINT_BREAK: printf("fail point %s breaking to debugger\n", fp->fp_name); breakpoint(); break; case FAIL_POINT_PRINT: printf("fail point %s executing\n", fp->fp_name); cont = ent->fe_arg; break; case FAIL_POINT_SLEEP: msecs = ent->fe_arg; if (msecs) fail_point_sleep(fp, msecs, &ret); break; case FAIL_POINT_PAUSE: /** * Pausing is inherently strange with multiple * entries given our design. That is because some * entries could be unreachable, for instance in cases like: * pause->return. We can never reach the return entry. * The sysctl layer actually truncates all entries after * a pause for this reason. */ mtx_lock_spin(&fp_setting->feq_mtx); fail_point_pause(fp, &ret, &fp_setting->feq_mtx); mtx_unlock_spin(&fp_setting->feq_mtx); break; case FAIL_POINT_YIELD: kern_yield(PRI_UNCHANGED); break; case FAIL_POINT_DELAY: usecs = ent->fe_arg; DELAY(usecs); break; default: break; } if (cont == 0) break; } if (fail_point_is_off(fp)) fail_point_eval_swap_out(fp, fp_setting); abort: fail_point_setting_release_ref(fp); return (ret); } /** * Translate internal fail_point structure into human-readable text. 
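A tracked entry in fail_point_eval_nontrivial() above only fires for the thread that wins an atomic compare-and-swap on fe_count, and the winner that takes the count to zero marks the entry stale. A hedged userland sketch of that decrement-if-positive loop, using C11 <stdatomic.h> in place of the kernel's atomic_cmpset_32 (names are illustrative):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/*
 * Consume one "firing" from a shared countdown.  Returns true if this
 * caller won the race; *now_zero is set when it consumed the last one.
 */
static bool
consume_firing(atomic_int *count, bool *now_zero)
{
	int cur = atomic_load(count);

	while (cur > 0) {
		if (atomic_compare_exchange_weak(count, &cur, cur - 1)) {
			*now_zero = (cur - 1 == 0);
			return (true);
		}
		/* CAS failure reloaded 'cur'; retry while it stays positive. */
	}
	return (false);
}

int
main(void)
{
	atomic_int count = 2;
	bool zero;

	while (consume_firing(&count, &zero))
		printf("fired, now_zero=%d\n", zero);
	return (0);
}

Losing the race simply means another thread consumed that firing, which is why the kernel code treats a lost race like a stale entry and moves on.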
*/ static void fail_point_get(struct fail_point *fp, struct sbuf *sb, bool verbose) { struct fail_point_entry *ent; struct fail_point_setting *fp_setting; struct fail_point_entry *fp_entry_cpy; int cnt_sleeping; int idx; int printed_entry_count; cnt_sleeping = 0; idx = 0; printed_entry_count = 0; fp_entry_cpy = fp_malloc(sizeof(struct fail_point_entry) * (FP_MAX_ENTRY_COUNT + 1), M_WAITOK); fp_setting = fail_point_setting_get_ref(fp); if (fp_setting != NULL) { TAILQ_FOREACH(ent, &fp_setting->fp_entry_queue, fe_entries) { if (ent->fe_stale) continue; KASSERT(printed_entry_count < FP_MAX_ENTRY_COUNT, ("FP entry list larger than allowed")); fp_entry_cpy[printed_entry_count] = *ent; ++printed_entry_count; } } fail_point_setting_release_ref(fp); /* This is our equivalent of a NULL terminator */ fp_entry_cpy[printed_entry_count].fe_type = FAIL_POINT_INVALID; while (idx < printed_entry_count) { ent = &fp_entry_cpy[idx]; ++idx; if (ent->fe_prob < PROB_MAX) { int decimal = ent->fe_prob % (PROB_MAX / 100); int units = ent->fe_prob / (PROB_MAX / 100); sbuf_printf(sb, "%d", units); if (decimal) { int digits = PROB_DIGITS - 2; while (!(decimal % 10)) { digits--; decimal /= 10; } sbuf_printf(sb, ".%0*d", digits, decimal); } sbuf_printf(sb, "%%"); } if (ent->fe_count >= 0) sbuf_printf(sb, "%d*", ent->fe_count); sbuf_printf(sb, "%s", fail_type_strings[ent->fe_type].name); if (ent->fe_arg) sbuf_printf(sb, "(%d)", ent->fe_arg); if (ent->fe_pid != NO_PID) sbuf_printf(sb, "[pid %d]", ent->fe_pid); if (TAILQ_NEXT(ent, fe_entries)) - sbuf_printf(sb, "->"); + sbuf_cat(sb, "->"); } if (!printed_entry_count) - sbuf_printf(sb, "off"); + sbuf_cat(sb, "off"); fp_free(fp_entry_cpy); if (verbose) { #ifdef STACK /* Print number of sleeping threads. queue=0 is the argument * used by msleep when sending our threads to sleep. */ - sbuf_printf(sb, "\nsleeping_thread_stacks = {\n"); + sbuf_cat(sb, "\nsleeping_thread_stacks = {\n"); sleepq_sbuf_print_stacks(sb, FP_SLEEP_CHANNEL(fp), 0, &cnt_sleeping); - sbuf_printf(sb, "},\n"); + sbuf_cat(sb, "},\n"); #endif sbuf_printf(sb, "sleeping_thread_count = %d,\n", cnt_sleeping); #ifdef STACK - sbuf_printf(sb, "paused_thread_stacks = {\n"); + sbuf_cat(sb, "paused_thread_stacks = {\n"); sleepq_sbuf_print_stacks(sb, FP_PAUSE_CHANNEL(fp), 0, &cnt_sleeping); - sbuf_printf(sb, "},\n"); + sbuf_cat(sb, "},\n"); #endif sbuf_printf(sb, "paused_thread_count = %d\n", cnt_sleeping); } } /** * Set an internal fail_point structure from a human-readable failpoint string * in a lock-safe manner. */ static int fail_point_set(struct fail_point *fp, char *buf) { struct fail_point_entry *ent, *ent_next; struct fail_point_setting *entries; bool should_wake_paused; bool should_truncate; int error; error = 0; should_wake_paused = false; should_truncate = false; /* Parse new entries. */ /** * ref protects our new malloc'd stuff from being garbage collected * before we link it. */ fail_point_setting_get_ref(fp); entries = fail_point_setting_new(fp); if (parse_fail_point(entries, buf) == NULL) { STAILQ_REMOVE(&fp_setting_garbage, entries, fail_point_setting, fs_garbage_link); fail_point_setting_destroy(entries); error = EINVAL; goto end; } /** * Transfer the entries we are going to keep to a new list. * Get rid of useless zero probability entries, and entries with hit * count 0. * If 'off' is present, and it has no hit count set, then all entries * after it are discarded since they are unreachable. 
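fail_point_get() above prints fe_prob, stored in millionths, as a percentage with trailing zeros trimmed from the fractional part. A standalone sketch of just that formatting step:

#include <stdio.h>

#define PROB_MAX	1000000	/* probabilities are stored in millionths */
#define PROB_DIGITS	6

/* Print a probability in millionths as a percentage, e.g. 12500 -> 1.25% */
static void
print_prob(int prob)
{
	int units = prob / (PROB_MAX / 100);
	int decimal = prob % (PROB_MAX / 100);

	printf("%d", units);
	if (decimal) {
		int digits = PROB_DIGITS - 2;

		while (decimal % 10 == 0) {	/* trim trailing zeros */
			digits--;
			decimal /= 10;
		}
		printf(".%0*d", digits, decimal);
	}
	printf("%%\n");
}

int
main(void)
{
	print_prob(12500);	/* 1.25% */
	print_prob(500000);	/* 50% */
	print_prob(100);	/* 0.01% */
	return (0);
}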
*/ TAILQ_FOREACH_SAFE(ent, &entries->fp_entry_queue, fe_entries, ent_next) { if (ent->fe_prob == 0 || ent->fe_count == 0) { printf("Discarding entry which cannot execute %s\n", fail_type_strings[ent->fe_type].name); TAILQ_REMOVE(&entries->fp_entry_queue, ent, fe_entries); fp_free(ent); continue; } else if (should_truncate) { printf("Discarding unreachable entry %s\n", fail_type_strings[ent->fe_type].name); TAILQ_REMOVE(&entries->fp_entry_queue, ent, fe_entries); fp_free(ent); continue; } if (ent->fe_type == FAIL_POINT_OFF) { should_wake_paused = true; if (ent->fe_count == FE_COUNT_UNTRACKED) { should_truncate = true; TAILQ_REMOVE(&entries->fp_entry_queue, ent, fe_entries); fp_free(ent); } } else if (ent->fe_type == FAIL_POINT_PAUSE) { should_truncate = true; } else if (ent->fe_type == FAIL_POINT_SLEEP && (fp->fp_flags & FAIL_POINT_NONSLEEPABLE)) { /** * If this fail point is annotated as being in a * non-sleepable ctx, convert sleep to delay and * convert the msec argument to usecs. */ printf("Sleep call request on fail point in " "non-sleepable context; using delay instead " "of sleep\n"); ent->fe_type = FAIL_POINT_DELAY; ent->fe_arg *= 1000; } } if (TAILQ_EMPTY(&entries->fp_entry_queue)) { entries = fail_point_swap_settings(fp, NULL); if (entries != NULL) wakeup(FP_PAUSE_CHANNEL(fp)); } else { if (should_wake_paused) wakeup(FP_PAUSE_CHANNEL(fp)); fail_point_swap_settings(fp, entries); } end: #ifdef IWARNING if (error) IWARNING("Failed to set %s %s to %s", fp->fp_name, fp->fp_location, buf); else INOTICE("Set %s %s to %s", fp->fp_name, fp->fp_location, buf); #endif /* IWARNING */ fail_point_setting_release_ref(fp); return (error); } #define MAX_FAIL_POINT_BUF 1023 /** * Handle kernel failpoint set/get. */ int fail_point_sysctl(SYSCTL_HANDLER_ARGS) { struct fail_point *fp; char *buf; struct sbuf sb, *sb_check; int error; buf = NULL; error = 0; fp = arg1; sb_check = sbuf_new(&sb, NULL, 1024, SBUF_AUTOEXTEND); if (sb_check != &sb) return (ENOMEM); sbuf_set_drain(&sb, (sbuf_drain_func *)fail_sysctl_drain_func, req); /* Setting */ /** * Lock protects any new entries from being garbage collected before we * can link them to the fail point. */ sx_xlock(&sx_fp_set); if (req->newptr) { if (req->newlen > MAX_FAIL_POINT_BUF) { error = EINVAL; goto out; } buf = fp_malloc(req->newlen + 1, M_WAITOK); error = SYSCTL_IN(req, buf, req->newlen); if (error) goto out; buf[req->newlen] = '\0'; error = fail_point_set(fp, buf); } fail_point_garbage_collect(); sx_xunlock(&sx_fp_set); /* Retrieving. */ fail_point_get(fp, &sb, false); out: sbuf_finish(&sb); sbuf_delete(&sb); if (buf) fp_free(buf); return (error); } int fail_point_sysctl_status(SYSCTL_HANDLER_ARGS) { struct fail_point *fp; struct sbuf sb, *sb_check; fp = arg1; sb_check = sbuf_new(&sb, NULL, 1024, SBUF_AUTOEXTEND); if (sb_check != &sb) return (ENOMEM); sbuf_set_drain(&sb, (sbuf_drain_func *)fail_sysctl_drain_func, req); /* Retrieving. */ fail_point_get(fp, &sb, true); sbuf_finish(&sb); sbuf_delete(&sb); /** * Lock protects any new entries from being garbage collected before we * can link them to the fail point. */ sx_xlock(&sx_fp_set); fail_point_garbage_collect(); sx_xunlock(&sx_fp_set); return (0); } int fail_sysctl_drain_func(void *sysctl_args, const char *buf, int len) { struct sysctl_req *sa; int error; sa = sysctl_args; error = SYSCTL_OUT(sa, buf, len); if (error == ENOMEM) return (-1); else return (len); } /** * Internal helper function to translate a human-readable failpoint string * into a internally-parsable fail_point structure. 
*/ static char * parse_fail_point(struct fail_point_setting *ents, char *p) { /* :: * ( "->" )* */ uint8_t term_count; term_count = 1; p = parse_term(ents, p); if (p == NULL) return (NULL); while (*p != '\0') { term_count++; if (p[0] != '-' || p[1] != '>' || (p = parse_term(ents, p+2)) == NULL || term_count > FP_MAX_ENTRY_COUNT) return (NULL); } return (p); } /** * Internal helper function to parse an individual term from a failpoint. */ static char * parse_term(struct fail_point_setting *ents, char *p) { struct fail_point_entry *ent; ent = fail_point_entry_new(ents); /* * :: * ( ( "%") | ( "*" ) )* * * [ "(" ")" ] * [ "[pid " "]" ] */ /* ( ( "%") | ( "*" ) )* */ while (isdigit(*p) || *p == '.') { int units, decimal; p = parse_number(&units, &decimal, p); if (p == NULL) return (NULL); if (*p == '%') { if (units > 100) /* prevent overflow early */ units = 100; ent->fe_prob = units * (PROB_MAX / 100) + decimal; if (ent->fe_prob > PROB_MAX) ent->fe_prob = PROB_MAX; } else if (*p == '*') { if (!units || units < 0 || decimal) return (NULL); ent->fe_count = units; } else return (NULL); p++; } /* */ p = parse_type(ent, p); if (p == NULL) return (NULL); if (*p == '\0') return (p); /* [ "(" ")" ] */ if (*p != '(') return (p); p++; if (!isdigit(*p) && *p != '-') return (NULL); ent->fe_arg = strtol(p, &p, 0); if (*p++ != ')') return (NULL); /* [ "[pid " "]" ] */ #define PID_STRING "[pid " if (strncmp(p, PID_STRING, sizeof(PID_STRING) - 1) != 0) return (p); p += sizeof(PID_STRING) - 1; if (!isdigit(*p)) return (NULL); ent->fe_pid = strtol(p, &p, 0); if (*p++ != ']') return (NULL); return (p); } /** * Internal helper function to parse a numeric for a failpoint term. */ static char * parse_number(int *out_units, int *out_decimal, char *p) { char *old_p; /** * :: * [ "." ] | * "." */ /* whole part */ old_p = p; *out_units = strtol(p, &p, 10); if (p == old_p && *p != '.') return (NULL); /* fractional part */ *out_decimal = 0; if (*p == '.') { int digits = 0; p++; while (isdigit(*p)) { int digit = *p - '0'; if (digits < PROB_DIGITS - 2) *out_decimal = *out_decimal * 10 + digit; else if (digits == PROB_DIGITS - 2 && digit >= 5) (*out_decimal)++; digits++; p++; } if (!digits) /* need at least one digit after '.' */ return (NULL); while (digits++ < PROB_DIGITS - 2) /* add implicit zeros */ *out_decimal *= 10; } return (p); /* success */ } /** * Internal helper function to parse an individual type for a failpoint term. */ static char * parse_type(struct fail_point_entry *ent, char *beg) { enum fail_point_t type; int len; for (type = FAIL_POINT_OFF; type < FAIL_POINT_NUMTYPES; type++) { len = fail_type_strings[type].nmlen; if (strncmp(fail_type_strings[type].name, beg, len) == 0) { ent->fe_type = type; return (beg + len); } } return (NULL); } /* The fail point sysctl tree. 
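The parser above accepts terms of the form [<probability>%][<count>*]<type>[(<arg>)][[pid <pid>]], chained with "->"; strings along the lines of "2.5%return(5)" or "1%sleep(100)->off" would be typical inputs for the sysctls under the debug.fail_point tree (the sample strings are mine, not taken from this patch). The least obvious piece is how parse_number() folds the fractional percentage into at most PROB_DIGITS - 2 digits, rounding on the first dropped digit; a runnable mirror of that step:

#include <ctype.h>
#include <stdio.h>

#define PROB_DIGITS	6

/*
 * Fractional-part handling as in parse_number(): keep at most
 * PROB_DIGITS - 2 digits after the '.', round on the first dropped
 * digit, and pad with implicit zeros so ".5" and ".5000" agree.
 */
static const char *
parse_frac(const char *p, int *out_decimal)
{
	int digits = 0;

	*out_decimal = 0;
	if (*p != '.')
		return (p);
	p++;
	while (isdigit((unsigned char)*p)) {
		int digit = *p - '0';

		if (digits < PROB_DIGITS - 2)
			*out_decimal = *out_decimal * 10 + digit;
		else if (digits == PROB_DIGITS - 2 && digit >= 5)
			(*out_decimal)++;
		digits++;
		p++;
	}
	if (digits == 0)
		return (NULL);		/* need at least one digit after '.' */
	while (digits++ < PROB_DIGITS - 2)
		*out_decimal *= 10;	/* implicit zeros */
	return (p);
}

int
main(void)
{
	int d;

	parse_frac(".25", &d);
	printf("%d\n", d);	/* 2500: ".25" scaled to four digits */
	parse_frac(".00005", &d);
	printf("%d\n", d);	/* 1: rounds up in the last kept digit */
	return (0);
}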
*/ SYSCTL_NODE(_debug, OID_AUTO, fail_point, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "fail points"); /* Debugging/testing stuff for fail point */ static int sysctl_test_fail_point(SYSCTL_HANDLER_ARGS) { KFAIL_POINT_RETURN(DEBUG_FP, test_fail_point); return (0); } SYSCTL_OID(_debug_fail_point, OID_AUTO, test_trigger_fail_point, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NEEDGIANT, NULL, 0, sysctl_test_fail_point, "A", "Trigger test fail points"); diff --git a/sys/kern/kern_rctl.c b/sys/kern/kern_rctl.c index 8e65fabeddc9..f6f781ade697 100644 --- a/sys/kern/kern_rctl.c +++ b/sys/kern/kern_rctl.c @@ -1,2246 +1,2246 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2010 The FreeBSD Foundation * * This software was developed by Edward Tomasz Napierala under sponsorship * from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifdef RCTL #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef RACCT #error "The RCTL option requires the RACCT option" #endif FEATURE(rctl, "Resource Limits"); #define HRF_DEFAULT 0 #define HRF_DONT_INHERIT 1 #define HRF_DONT_ACCUMULATE 2 #define RCTL_MAX_INBUFSIZE 4 * 1024 #define RCTL_MAX_OUTBUFSIZE 16 * 1024 * 1024 #define RCTL_LOG_BUFSIZE 128 #define RCTL_PCPU_SHIFT (10 * 1000000) static unsigned int rctl_maxbufsize = RCTL_MAX_OUTBUFSIZE; static int rctl_log_rate_limit = 10; static int rctl_devctl_rate_limit = 10; /* * Values below are initialized in rctl_init(). 
*/ static int rctl_throttle_min = -1; static int rctl_throttle_max = -1; static int rctl_throttle_pct = -1; static int rctl_throttle_pct2 = -1; static int rctl_throttle_min_sysctl(SYSCTL_HANDLER_ARGS); static int rctl_throttle_max_sysctl(SYSCTL_HANDLER_ARGS); static int rctl_throttle_pct_sysctl(SYSCTL_HANDLER_ARGS); static int rctl_throttle_pct2_sysctl(SYSCTL_HANDLER_ARGS); SYSCTL_NODE(_kern_racct, OID_AUTO, rctl, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Resource Limits"); SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, maxbufsize, CTLFLAG_RWTUN, &rctl_maxbufsize, 0, "Maximum output buffer size"); SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, log_rate_limit, CTLFLAG_RW, &rctl_log_rate_limit, 0, "Maximum number of log messages per second"); SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, devctl_rate_limit, CTLFLAG_RWTUN, &rctl_devctl_rate_limit, 0, "Maximum number of devctl messages per second"); SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_min, CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, 0, 0, &rctl_throttle_min_sysctl, "IU", "Shortest throttling duration, in hz"); TUNABLE_INT("kern.racct.rctl.throttle_min", &rctl_throttle_min); SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_max, CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, 0, 0, &rctl_throttle_max_sysctl, "IU", "Longest throttling duration, in hz"); TUNABLE_INT("kern.racct.rctl.throttle_max", &rctl_throttle_max); SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_pct, CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, 0, 0, &rctl_throttle_pct_sysctl, "IU", "Throttling penalty for process consumption, in percent"); TUNABLE_INT("kern.racct.rctl.throttle_pct", &rctl_throttle_pct); SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_pct2, CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, 0, 0, &rctl_throttle_pct2_sysctl, "IU", "Throttling penalty for container consumption, in percent"); TUNABLE_INT("kern.racct.rctl.throttle_pct2", &rctl_throttle_pct2); /* * 'rctl_rule_link' connects a rule with every racct it's related to. * For example, rule 'user:X:openfiles:deny=N/process' is linked * with uidinfo for user X, and to each process of that user. 
*/ struct rctl_rule_link { LIST_ENTRY(rctl_rule_link) rrl_next; struct rctl_rule *rrl_rule; int rrl_exceeded; }; struct dict { const char *d_name; int d_value; }; static struct dict subjectnames[] = { { "process", RCTL_SUBJECT_TYPE_PROCESS }, { "user", RCTL_SUBJECT_TYPE_USER }, { "loginclass", RCTL_SUBJECT_TYPE_LOGINCLASS }, { "jail", RCTL_SUBJECT_TYPE_JAIL }, { NULL, -1 }}; static struct dict resourcenames[] = { { "cputime", RACCT_CPU }, { "datasize", RACCT_DATA }, { "stacksize", RACCT_STACK }, { "coredumpsize", RACCT_CORE }, { "memoryuse", RACCT_RSS }, { "memorylocked", RACCT_MEMLOCK }, { "maxproc", RACCT_NPROC }, { "openfiles", RACCT_NOFILE }, { "vmemoryuse", RACCT_VMEM }, { "pseudoterminals", RACCT_NPTS }, { "swapuse", RACCT_SWAP }, { "nthr", RACCT_NTHR }, { "msgqqueued", RACCT_MSGQQUEUED }, { "msgqsize", RACCT_MSGQSIZE }, { "nmsgq", RACCT_NMSGQ }, { "nsem", RACCT_NSEM }, { "nsemop", RACCT_NSEMOP }, { "nshm", RACCT_NSHM }, { "shmsize", RACCT_SHMSIZE }, { "wallclock", RACCT_WALLCLOCK }, { "pcpu", RACCT_PCTCPU }, { "readbps", RACCT_READBPS }, { "writebps", RACCT_WRITEBPS }, { "readiops", RACCT_READIOPS }, { "writeiops", RACCT_WRITEIOPS }, { NULL, -1 }}; static struct dict actionnames[] = { { "sighup", RCTL_ACTION_SIGHUP }, { "sigint", RCTL_ACTION_SIGINT }, { "sigquit", RCTL_ACTION_SIGQUIT }, { "sigill", RCTL_ACTION_SIGILL }, { "sigtrap", RCTL_ACTION_SIGTRAP }, { "sigabrt", RCTL_ACTION_SIGABRT }, { "sigemt", RCTL_ACTION_SIGEMT }, { "sigfpe", RCTL_ACTION_SIGFPE }, { "sigkill", RCTL_ACTION_SIGKILL }, { "sigbus", RCTL_ACTION_SIGBUS }, { "sigsegv", RCTL_ACTION_SIGSEGV }, { "sigsys", RCTL_ACTION_SIGSYS }, { "sigpipe", RCTL_ACTION_SIGPIPE }, { "sigalrm", RCTL_ACTION_SIGALRM }, { "sigterm", RCTL_ACTION_SIGTERM }, { "sigurg", RCTL_ACTION_SIGURG }, { "sigstop", RCTL_ACTION_SIGSTOP }, { "sigtstp", RCTL_ACTION_SIGTSTP }, { "sigchld", RCTL_ACTION_SIGCHLD }, { "sigttin", RCTL_ACTION_SIGTTIN }, { "sigttou", RCTL_ACTION_SIGTTOU }, { "sigio", RCTL_ACTION_SIGIO }, { "sigxcpu", RCTL_ACTION_SIGXCPU }, { "sigxfsz", RCTL_ACTION_SIGXFSZ }, { "sigvtalrm", RCTL_ACTION_SIGVTALRM }, { "sigprof", RCTL_ACTION_SIGPROF }, { "sigwinch", RCTL_ACTION_SIGWINCH }, { "siginfo", RCTL_ACTION_SIGINFO }, { "sigusr1", RCTL_ACTION_SIGUSR1 }, { "sigusr2", RCTL_ACTION_SIGUSR2 }, { "sigthr", RCTL_ACTION_SIGTHR }, { "deny", RCTL_ACTION_DENY }, { "log", RCTL_ACTION_LOG }, { "devctl", RCTL_ACTION_DEVCTL }, { "throttle", RCTL_ACTION_THROTTLE }, { NULL, -1 }}; static void rctl_init(void); SYSINIT(rctl, SI_SUB_RACCT, SI_ORDER_FIRST, rctl_init, NULL); static uma_zone_t rctl_rule_zone; static uma_zone_t rctl_rule_link_zone; static int rctl_rule_fully_specified(const struct rctl_rule *rule); static void rctl_rule_to_sbuf(struct sbuf *sb, const struct rctl_rule *rule); static MALLOC_DEFINE(M_RCTL, "rctl", "Resource Limits"); static int rctl_throttle_min_sysctl(SYSCTL_HANDLER_ARGS) { int error, val = rctl_throttle_min; error = sysctl_handle_int(oidp, &val, 0, req); if (error || !req->newptr) return (error); if (val < 1 || val > rctl_throttle_max) return (EINVAL); RACCT_LOCK(); rctl_throttle_min = val; RACCT_UNLOCK(); return (0); } static int rctl_throttle_max_sysctl(SYSCTL_HANDLER_ARGS) { int error, val = rctl_throttle_max; error = sysctl_handle_int(oidp, &val, 0, req); if (error || !req->newptr) return (error); if (val < rctl_throttle_min) return (EINVAL); RACCT_LOCK(); rctl_throttle_max = val; RACCT_UNLOCK(); return (0); } static int rctl_throttle_pct_sysctl(SYSCTL_HANDLER_ARGS) { int error, val = rctl_throttle_pct; error = 
sysctl_handle_int(oidp, &val, 0, req); if (error || !req->newptr) return (error); if (val < 0) return (EINVAL); RACCT_LOCK(); rctl_throttle_pct = val; RACCT_UNLOCK(); return (0); } static int rctl_throttle_pct2_sysctl(SYSCTL_HANDLER_ARGS) { int error, val = rctl_throttle_pct2; error = sysctl_handle_int(oidp, &val, 0, req); if (error || !req->newptr) return (error); if (val < 0) return (EINVAL); RACCT_LOCK(); rctl_throttle_pct2 = val; RACCT_UNLOCK(); return (0); } static const char * rctl_subject_type_name(int subject) { int i; for (i = 0; subjectnames[i].d_name != NULL; i++) { if (subjectnames[i].d_value == subject) return (subjectnames[i].d_name); } panic("rctl_subject_type_name: unknown subject type %d", subject); } static const char * rctl_action_name(int action) { int i; for (i = 0; actionnames[i].d_name != NULL; i++) { if (actionnames[i].d_value == action) return (actionnames[i].d_name); } panic("rctl_action_name: unknown action %d", action); } const char * rctl_resource_name(int resource) { int i; for (i = 0; resourcenames[i].d_name != NULL; i++) { if (resourcenames[i].d_value == resource) return (resourcenames[i].d_name); } panic("rctl_resource_name: unknown resource %d", resource); } static struct racct * rctl_proc_rule_to_racct(const struct proc *p, const struct rctl_rule *rule) { struct ucred *cred = p->p_ucred; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); switch (rule->rr_per) { case RCTL_SUBJECT_TYPE_PROCESS: return (p->p_racct); case RCTL_SUBJECT_TYPE_USER: return (cred->cr_ruidinfo->ui_racct); case RCTL_SUBJECT_TYPE_LOGINCLASS: return (cred->cr_loginclass->lc_racct); case RCTL_SUBJECT_TYPE_JAIL: return (cred->cr_prison->pr_prison_racct->prr_racct); default: panic("%s: unknown per %d", __func__, rule->rr_per); } } /* * Return the amount of resource that can be allocated by 'p' before * hitting 'rule'. */ static int64_t rctl_available_resource(const struct proc *p, const struct rctl_rule *rule) { const struct racct *racct; int64_t available; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); racct = rctl_proc_rule_to_racct(p, rule); available = rule->rr_amount - racct->r_resources[rule->rr_resource]; return (available); } /* * Called every second for proc, uidinfo, loginclass, and jail containers. * If the limit isn't exceeded, it decreases the usage amount to zero. * Otherwise, it decreases it by the value of the limit. This way * resource consumption exceeding the limit "carries over" to the next * period. */ void rctl_throttle_decay(struct racct *racct, int resource) { struct rctl_rule *rule; struct rctl_rule_link *link; int64_t minavailable; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); minavailable = INT64_MAX; LIST_FOREACH(link, &racct->r_rule_links, rrl_next) { rule = link->rrl_rule; if (rule->rr_resource != resource) continue; if (rule->rr_action != RCTL_ACTION_THROTTLE) continue; if (rule->rr_amount < minavailable) minavailable = rule->rr_amount; } if (racct->r_resources[resource] < minavailable) { racct->r_resources[resource] = 0; } else { /* * Cap utilization counter at ten times the limit. Otherwise, * if we changed the rule lowering the allowed amount, it could * take unreasonably long time for the accumulated resource * usage to drop. */ if (racct->r_resources[resource] > minavailable * 10) racct->r_resources[resource] = minavailable * 10; racct->r_resources[resource] -= minavailable; } } /* * Special version of rctl_get_available() for the %CPU resource. * We slightly cheat here and return less than we normally would. 
*/ int64_t rctl_pcpu_available(const struct proc *p) { struct rctl_rule *rule; struct rctl_rule_link *link; int64_t available, minavailable, limit; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); minavailable = INT64_MAX; limit = 0; LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) { rule = link->rrl_rule; if (rule->rr_resource != RACCT_PCTCPU) continue; if (rule->rr_action != RCTL_ACTION_DENY) continue; available = rctl_available_resource(p, rule); if (available < minavailable) { minavailable = available; limit = rule->rr_amount; } } /* * Return slightly less than actual value of the available * %cpu resource. This makes %cpu throttling more aggressive * and lets us act sooner than the limits are already exceeded. */ if (limit != 0) { if (limit > 2 * RCTL_PCPU_SHIFT) minavailable -= RCTL_PCPU_SHIFT; else minavailable -= (limit / 2); } return (minavailable); } static uint64_t xadd(uint64_t a, uint64_t b) { uint64_t c; c = a + b; /* * Detect overflow. */ if (c < a || c < b) return (UINT64_MAX); return (c); } static uint64_t xmul(uint64_t a, uint64_t b) { if (b != 0 && a > UINT64_MAX / b) return (UINT64_MAX); return (a * b); } /* * Check whether the proc 'p' can allocate 'amount' of 'resource' in addition * to what it keeps allocated now. Returns non-zero if the allocation should * be denied, 0 otherwise. */ int rctl_enforce(struct proc *p, int resource, uint64_t amount) { static struct timeval log_lasttime, devctl_lasttime; static int log_curtime = 0, devctl_curtime = 0; struct rctl_rule *rule; struct rctl_rule_link *link; struct sbuf sb; char *buf; int64_t available; uint64_t sleep_ms, sleep_ratio; int should_deny = 0; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); /* * There may be more than one matching rule; go through all of them. * Denial should be done last, after logging and sending signals. */ LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) { rule = link->rrl_rule; if (rule->rr_resource != resource) continue; available = rctl_available_resource(p, rule); if (available >= (int64_t)amount) { link->rrl_exceeded = 0; continue; } switch (rule->rr_action) { case RCTL_ACTION_DENY: should_deny = 1; continue; case RCTL_ACTION_LOG: /* * If rrl_exceeded != 0, it means we've already * logged a warning for this process. */ if (link->rrl_exceeded != 0) continue; /* * If the process state is not fully initialized yet, * we can't access most of the required fields, e.g. * p->p_comm. This happens when called from fork1(). * Ignore this rule for now; it will be processed just * after fork, when called from racct_proc_fork_done(). 
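The xadd()/xmul() helpers defined just above saturate at UINT64_MAX instead of wrapping, so an overflowing intermediate in the throttling arithmetic degrades into "sleep the maximum" rather than into a tiny bogus value. A standalone version for experimentation (for unsigned addition a single c < a test is enough to detect the wrap):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Saturating add: clamp to UINT64_MAX instead of wrapping. */
static uint64_t
xadd(uint64_t a, uint64_t b)
{
	uint64_t c = a + b;

	return (c < a ? UINT64_MAX : c);
}

/* Saturating multiply. */
static uint64_t
xmul(uint64_t a, uint64_t b)
{
	if (b != 0 && a > UINT64_MAX / b)
		return (UINT64_MAX);
	return (a * b);
}

int
main(void)
{
	printf("%" PRIu64 "\n", xadd(UINT64_MAX - 1, 5));	/* clamps */
	printf("%" PRIu64 "\n", xmul(UINT64_MAX / 2, 3));	/* clamps */
	printf("%" PRIu64 "\n", xmul(1000, 1000));		/* 1000000 */
	return (0);
}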
*/ if (p->p_state != PRS_NORMAL) continue; if (!ppsratecheck(&log_lasttime, &log_curtime, rctl_log_rate_limit)) continue; buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT); if (buf == NULL) { printf("rctl_enforce: out of memory\n"); continue; } sbuf_new(&sb, buf, RCTL_LOG_BUFSIZE, SBUF_FIXEDLEN); rctl_rule_to_sbuf(&sb, rule); sbuf_finish(&sb); printf("rctl: rule \"%s\" matched by pid %d " "(%s), uid %d, jail %s\n", sbuf_data(&sb), p->p_pid, p->p_comm, p->p_ucred->cr_uid, p->p_ucred->cr_prison->pr_prison_racct->prr_name); sbuf_delete(&sb); free(buf, M_RCTL); link->rrl_exceeded = 1; continue; case RCTL_ACTION_DEVCTL: if (link->rrl_exceeded != 0) continue; if (p->p_state != PRS_NORMAL) continue; if (!ppsratecheck(&devctl_lasttime, &devctl_curtime, rctl_devctl_rate_limit)) continue; buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT); if (buf == NULL) { printf("rctl_enforce: out of memory\n"); continue; } sbuf_new(&sb, buf, RCTL_LOG_BUFSIZE, SBUF_FIXEDLEN); - sbuf_printf(&sb, "rule="); + sbuf_cat(&sb, "rule="); rctl_rule_to_sbuf(&sb, rule); sbuf_printf(&sb, " pid=%d ruid=%d jail=%s", p->p_pid, p->p_ucred->cr_ruid, p->p_ucred->cr_prison->pr_prison_racct->prr_name); sbuf_finish(&sb); devctl_notify("RCTL", "rule", "matched", sbuf_data(&sb)); sbuf_delete(&sb); free(buf, M_RCTL); link->rrl_exceeded = 1; continue; case RCTL_ACTION_THROTTLE: if (p->p_state != PRS_NORMAL) continue; if (rule->rr_amount == 0) { racct_proc_throttle(p, rctl_throttle_max); continue; } /* * Make the process sleep for a fraction of second * proportional to the ratio of process' resource * utilization compared to the limit. The point is * to penalize resource hogs: processes that consume * more of the available resources sleep for longer. * * We're trying to defer division until the very end, * to minimize the rounding effects. The following * calculation could have been written in a clearer * way like this: * * sleep_ms = hz * p->p_racct->r_resources[resource] / * rule->rr_amount; * sleep_ms *= rctl_throttle_pct / 100; * if (sleep_ms < rctl_throttle_min) * sleep_ms = rctl_throttle_min; * */ sleep_ms = xmul(hz, p->p_racct->r_resources[resource]); sleep_ms = xmul(sleep_ms, rctl_throttle_pct) / 100; if (sleep_ms < rctl_throttle_min * rule->rr_amount) sleep_ms = rctl_throttle_min * rule->rr_amount; /* * Multiply that by the ratio of the resource * consumption for the container compared to the limit, * squared. In other words, a process in a container * that is two times over the limit will be throttled * four times as much for hitting the same rule. The * point is to penalize processes more if the container * itself (eg certain UID or jail) is above the limit. */ if (available < 0) sleep_ratio = -available / rule->rr_amount; else sleep_ratio = 0; sleep_ratio = xmul(sleep_ratio, sleep_ratio); sleep_ratio = xmul(sleep_ratio, rctl_throttle_pct2) / 100; sleep_ms = xadd(sleep_ms, xmul(sleep_ms, sleep_ratio)); /* * Finally the division. 
*/ sleep_ms /= rule->rr_amount; if (sleep_ms > rctl_throttle_max) sleep_ms = rctl_throttle_max; #if 0 printf("%s: pid %d (%s), %jd of %jd, will sleep for %ju ms (ratio %ju, available %jd)\n", __func__, p->p_pid, p->p_comm, p->p_racct->r_resources[resource], rule->rr_amount, (uintmax_t)sleep_ms, (uintmax_t)sleep_ratio, (intmax_t)available); #endif KASSERT(sleep_ms >= rctl_throttle_min, ("%s: %ju < %d\n", __func__, (uintmax_t)sleep_ms, rctl_throttle_min)); racct_proc_throttle(p, sleep_ms); continue; default: if (link->rrl_exceeded != 0) continue; if (p->p_state != PRS_NORMAL) continue; KASSERT(rule->rr_action > 0 && rule->rr_action <= RCTL_ACTION_SIGNAL_MAX, ("rctl_enforce: unknown action %d", rule->rr_action)); /* * We're using the fact that RCTL_ACTION_SIG* values * are equal to their counterparts from sys/signal.h. */ kern_psignal(p, rule->rr_action); link->rrl_exceeded = 1; continue; } } if (should_deny) { /* * Return fake error code; the caller should change it * into one proper for the situation - EFSIZ, ENOMEM etc. */ return (EDOOFUS); } return (0); } uint64_t rctl_get_limit(struct proc *p, int resource) { struct rctl_rule *rule; struct rctl_rule_link *link; uint64_t amount = UINT64_MAX; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); /* * There may be more than one matching rule; go through all of them. * Denial should be done last, after logging and sending signals. */ LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) { rule = link->rrl_rule; if (rule->rr_resource != resource) continue; if (rule->rr_action != RCTL_ACTION_DENY) continue; if (rule->rr_amount < amount) amount = rule->rr_amount; } return (amount); } uint64_t rctl_get_available(struct proc *p, int resource) { struct rctl_rule *rule; struct rctl_rule_link *link; int64_t available, minavailable, allocated; minavailable = INT64_MAX; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); /* * There may be more than one matching rule; go through all of them. * Denial should be done last, after logging and sending signals. */ LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) { rule = link->rrl_rule; if (rule->rr_resource != resource) continue; if (rule->rr_action != RCTL_ACTION_DENY) continue; available = rctl_available_resource(p, rule); if (available < minavailable) minavailable = available; } /* * XXX: Think about this _hard_. 
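Putting the RCTL_ACTION_THROTTLE arithmetic above in one place: the penalty starts as hz * usage scaled by throttle_pct, is floored at throttle_min, inflated by the square of how far the whole container is over its limit (scaled by throttle_pct2), and only then divided by the limit and clamped to throttle_max. A hedged standalone sketch with plain arithmetic in place of the saturating helpers; the parameter names and sample numbers are mine:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Sketch of the throttle penalty from rctl_enforce(); result is in ticks. */
static uint64_t
throttle_ticks(uint64_t hz, uint64_t usage, uint64_t limit, int64_t available,
    uint64_t pct, uint64_t pct2, uint64_t min_ticks, uint64_t max_ticks)
{
	uint64_t sleep, ratio;

	/* Penalty proportional to the process' share of the limit. */
	sleep = hz * usage * pct / 100;
	if (sleep < min_ticks * limit)
		sleep = min_ticks * limit;

	/* Square of how far the whole container is over its limit. */
	ratio = (available < 0) ? (uint64_t)(-available) / limit : 0;
	ratio = ratio * ratio * pct2 / 100;
	sleep += sleep * ratio;

	sleep /= limit;			/* the deferred division */
	return (sleep > max_ticks ? max_ticks : sleep);
}

int
main(void)
{
	/* hz=1000, usage at twice the limit, container 50% over its limit. */
	printf("%" PRIu64 "\n",
	    throttle_ticks(1000, 200, 100, -50, 100, 100, 10, 10000));
	return (0);
}

With those numbers the container overshoot rounds to a ratio of 0 in integer math, so the example prints 2000: the per-process part of the penalty alone.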
*/ allocated = p->p_racct->r_resources[resource]; if (minavailable < INT64_MAX - allocated) minavailable += allocated; if (minavailable < 0) minavailable = 0; return (minavailable); } static int rctl_rule_matches(const struct rctl_rule *rule, const struct rctl_rule *filter) { ASSERT_RACCT_ENABLED(); if (filter->rr_subject_type != RCTL_SUBJECT_TYPE_UNDEFINED) { if (rule->rr_subject_type != filter->rr_subject_type) return (0); switch (filter->rr_subject_type) { case RCTL_SUBJECT_TYPE_PROCESS: if (filter->rr_subject.rs_proc != NULL && rule->rr_subject.rs_proc != filter->rr_subject.rs_proc) return (0); break; case RCTL_SUBJECT_TYPE_USER: if (filter->rr_subject.rs_uip != NULL && rule->rr_subject.rs_uip != filter->rr_subject.rs_uip) return (0); break; case RCTL_SUBJECT_TYPE_LOGINCLASS: if (filter->rr_subject.rs_loginclass != NULL && rule->rr_subject.rs_loginclass != filter->rr_subject.rs_loginclass) return (0); break; case RCTL_SUBJECT_TYPE_JAIL: if (filter->rr_subject.rs_prison_racct != NULL && rule->rr_subject.rs_prison_racct != filter->rr_subject.rs_prison_racct) return (0); break; default: panic("rctl_rule_matches: unknown subject type %d", filter->rr_subject_type); } } if (filter->rr_resource != RACCT_UNDEFINED) { if (rule->rr_resource != filter->rr_resource) return (0); } if (filter->rr_action != RCTL_ACTION_UNDEFINED) { if (rule->rr_action != filter->rr_action) return (0); } if (filter->rr_amount != RCTL_AMOUNT_UNDEFINED) { if (rule->rr_amount != filter->rr_amount) return (0); } if (filter->rr_per != RCTL_SUBJECT_TYPE_UNDEFINED) { if (rule->rr_per != filter->rr_per) return (0); } return (1); } static int str2value(const char *str, int *value, struct dict *table) { int i; if (value == NULL) return (EINVAL); for (i = 0; table[i].d_name != NULL; i++) { if (strcasecmp(table[i].d_name, str) == 0) { *value = table[i].d_value; return (0); } } return (EINVAL); } static int str2id(const char *str, id_t *value) { char *end; if (str == NULL) return (EINVAL); *value = strtoul(str, &end, 10); if ((size_t)(end - str) != strlen(str)) return (EINVAL); return (0); } static int str2int64(const char *str, int64_t *value) { char *end; if (str == NULL) return (EINVAL); *value = strtoul(str, &end, 10); if ((size_t)(end - str) != strlen(str)) return (EINVAL); if (*value < 0) return (ERANGE); return (0); } /* * Connect the rule to the racct, increasing refcount for the rule. */ static void rctl_racct_add_rule(struct racct *racct, struct rctl_rule *rule) { struct rctl_rule_link *link; ASSERT_RACCT_ENABLED(); KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified")); rctl_rule_acquire(rule); link = uma_zalloc(rctl_rule_link_zone, M_WAITOK); link->rrl_rule = rule; link->rrl_exceeded = 0; RACCT_LOCK(); LIST_INSERT_HEAD(&racct->r_rule_links, link, rrl_next); RACCT_UNLOCK(); } static int rctl_racct_add_rule_locked(struct racct *racct, struct rctl_rule *rule) { struct rctl_rule_link *link; ASSERT_RACCT_ENABLED(); KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified")); RACCT_LOCK_ASSERT(); link = uma_zalloc(rctl_rule_link_zone, M_NOWAIT); if (link == NULL) return (ENOMEM); rctl_rule_acquire(rule); link->rrl_rule = rule; link->rrl_exceeded = 0; LIST_INSERT_HEAD(&racct->r_rule_links, link, rrl_next); return (0); } /* * Remove limits for a rules matching the filter and release * the refcounts for the rules, possibly freeing them. Returns * the number of limit structures removed. 
*/ static int rctl_racct_remove_rules(struct racct *racct, const struct rctl_rule *filter) { struct rctl_rule_link *link, *linktmp; int removed = 0; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); LIST_FOREACH_SAFE(link, &racct->r_rule_links, rrl_next, linktmp) { if (!rctl_rule_matches(link->rrl_rule, filter)) continue; LIST_REMOVE(link, rrl_next); rctl_rule_release(link->rrl_rule); uma_zfree(rctl_rule_link_zone, link); removed++; } return (removed); } static void rctl_rule_acquire_subject(struct rctl_rule *rule) { ASSERT_RACCT_ENABLED(); switch (rule->rr_subject_type) { case RCTL_SUBJECT_TYPE_UNDEFINED: case RCTL_SUBJECT_TYPE_PROCESS: break; case RCTL_SUBJECT_TYPE_JAIL: if (rule->rr_subject.rs_prison_racct != NULL) prison_racct_hold(rule->rr_subject.rs_prison_racct); break; case RCTL_SUBJECT_TYPE_USER: if (rule->rr_subject.rs_uip != NULL) uihold(rule->rr_subject.rs_uip); break; case RCTL_SUBJECT_TYPE_LOGINCLASS: if (rule->rr_subject.rs_loginclass != NULL) loginclass_hold(rule->rr_subject.rs_loginclass); break; default: panic("rctl_rule_acquire_subject: unknown subject type %d", rule->rr_subject_type); } } static void rctl_rule_release_subject(struct rctl_rule *rule) { ASSERT_RACCT_ENABLED(); switch (rule->rr_subject_type) { case RCTL_SUBJECT_TYPE_UNDEFINED: case RCTL_SUBJECT_TYPE_PROCESS: break; case RCTL_SUBJECT_TYPE_JAIL: if (rule->rr_subject.rs_prison_racct != NULL) prison_racct_free(rule->rr_subject.rs_prison_racct); break; case RCTL_SUBJECT_TYPE_USER: if (rule->rr_subject.rs_uip != NULL) uifree(rule->rr_subject.rs_uip); break; case RCTL_SUBJECT_TYPE_LOGINCLASS: if (rule->rr_subject.rs_loginclass != NULL) loginclass_free(rule->rr_subject.rs_loginclass); break; default: panic("rctl_rule_release_subject: unknown subject type %d", rule->rr_subject_type); } } struct rctl_rule * rctl_rule_alloc(int flags) { struct rctl_rule *rule; ASSERT_RACCT_ENABLED(); rule = uma_zalloc(rctl_rule_zone, flags); if (rule == NULL) return (NULL); rule->rr_subject_type = RCTL_SUBJECT_TYPE_UNDEFINED; rule->rr_subject.rs_proc = NULL; rule->rr_subject.rs_uip = NULL; rule->rr_subject.rs_loginclass = NULL; rule->rr_subject.rs_prison_racct = NULL; rule->rr_per = RCTL_SUBJECT_TYPE_UNDEFINED; rule->rr_resource = RACCT_UNDEFINED; rule->rr_action = RCTL_ACTION_UNDEFINED; rule->rr_amount = RCTL_AMOUNT_UNDEFINED; refcount_init(&rule->rr_refcount, 1); return (rule); } struct rctl_rule * rctl_rule_duplicate(const struct rctl_rule *rule, int flags) { struct rctl_rule *copy; ASSERT_RACCT_ENABLED(); copy = uma_zalloc(rctl_rule_zone, flags); if (copy == NULL) return (NULL); copy->rr_subject_type = rule->rr_subject_type; copy->rr_subject.rs_proc = rule->rr_subject.rs_proc; copy->rr_subject.rs_uip = rule->rr_subject.rs_uip; copy->rr_subject.rs_loginclass = rule->rr_subject.rs_loginclass; copy->rr_subject.rs_prison_racct = rule->rr_subject.rs_prison_racct; copy->rr_per = rule->rr_per; copy->rr_resource = rule->rr_resource; copy->rr_action = rule->rr_action; copy->rr_amount = rule->rr_amount; refcount_init(©->rr_refcount, 1); rctl_rule_acquire_subject(copy); return (copy); } void rctl_rule_acquire(struct rctl_rule *rule) { ASSERT_RACCT_ENABLED(); KASSERT(rule->rr_refcount > 0, ("rule->rr_refcount <= 0")); refcount_acquire(&rule->rr_refcount); } static void rctl_rule_free(void *context, int pending) { struct rctl_rule *rule; rule = (struct rctl_rule *)context; ASSERT_RACCT_ENABLED(); KASSERT(rule->rr_refcount == 0, ("rule->rr_refcount != 0")); /* * We don't need locking here; rule is guaranteed to be inaccessible. 
*/ rctl_rule_release_subject(rule); uma_zfree(rctl_rule_zone, rule); } void rctl_rule_release(struct rctl_rule *rule) { ASSERT_RACCT_ENABLED(); KASSERT(rule->rr_refcount > 0, ("rule->rr_refcount <= 0")); if (refcount_release(&rule->rr_refcount)) { /* * rctl_rule_release() is often called when iterating * over all the uidinfo structures in the system, * holding uihashtbl_lock. Since rctl_rule_free() * might end up calling uifree(), this would lead * to lock recursion. Use taskqueue to avoid this. */ TASK_INIT(&rule->rr_task, 0, rctl_rule_free, rule); taskqueue_enqueue(taskqueue_thread, &rule->rr_task); } } static int rctl_rule_fully_specified(const struct rctl_rule *rule) { ASSERT_RACCT_ENABLED(); switch (rule->rr_subject_type) { case RCTL_SUBJECT_TYPE_UNDEFINED: return (0); case RCTL_SUBJECT_TYPE_PROCESS: if (rule->rr_subject.rs_proc == NULL) return (0); break; case RCTL_SUBJECT_TYPE_USER: if (rule->rr_subject.rs_uip == NULL) return (0); break; case RCTL_SUBJECT_TYPE_LOGINCLASS: if (rule->rr_subject.rs_loginclass == NULL) return (0); break; case RCTL_SUBJECT_TYPE_JAIL: if (rule->rr_subject.rs_prison_racct == NULL) return (0); break; default: panic("rctl_rule_fully_specified: unknown subject type %d", rule->rr_subject_type); } if (rule->rr_resource == RACCT_UNDEFINED) return (0); if (rule->rr_action == RCTL_ACTION_UNDEFINED) return (0); if (rule->rr_amount == RCTL_AMOUNT_UNDEFINED) return (0); if (rule->rr_per == RCTL_SUBJECT_TYPE_UNDEFINED) return (0); return (1); } static int rctl_string_to_rule(char *rulestr, struct rctl_rule **rulep) { struct rctl_rule *rule; char *subjectstr, *subject_idstr, *resourcestr, *actionstr, *amountstr, *perstr; id_t id; int error = 0; ASSERT_RACCT_ENABLED(); rule = rctl_rule_alloc(M_WAITOK); subjectstr = strsep(&rulestr, ":"); subject_idstr = strsep(&rulestr, ":"); resourcestr = strsep(&rulestr, ":"); actionstr = strsep(&rulestr, "=/"); amountstr = strsep(&rulestr, "/"); perstr = rulestr; if (subjectstr == NULL || subjectstr[0] == '\0') rule->rr_subject_type = RCTL_SUBJECT_TYPE_UNDEFINED; else { error = str2value(subjectstr, &rule->rr_subject_type, subjectnames); if (error != 0) goto out; } if (subject_idstr == NULL || subject_idstr[0] == '\0') { rule->rr_subject.rs_proc = NULL; rule->rr_subject.rs_uip = NULL; rule->rr_subject.rs_loginclass = NULL; rule->rr_subject.rs_prison_racct = NULL; } else { switch (rule->rr_subject_type) { case RCTL_SUBJECT_TYPE_UNDEFINED: error = EINVAL; goto out; case RCTL_SUBJECT_TYPE_PROCESS: error = str2id(subject_idstr, &id); if (error != 0) goto out; sx_assert(&allproc_lock, SA_LOCKED); rule->rr_subject.rs_proc = pfind(id); if (rule->rr_subject.rs_proc == NULL) { error = ESRCH; goto out; } PROC_UNLOCK(rule->rr_subject.rs_proc); break; case RCTL_SUBJECT_TYPE_USER: error = str2id(subject_idstr, &id); if (error != 0) goto out; rule->rr_subject.rs_uip = uifind(id); break; case RCTL_SUBJECT_TYPE_LOGINCLASS: rule->rr_subject.rs_loginclass = loginclass_find(subject_idstr); if (rule->rr_subject.rs_loginclass == NULL) { error = ENAMETOOLONG; goto out; } break; case RCTL_SUBJECT_TYPE_JAIL: rule->rr_subject.rs_prison_racct = prison_racct_find(subject_idstr); if (rule->rr_subject.rs_prison_racct == NULL) { error = ENAMETOOLONG; goto out; } break; default: panic("rctl_string_to_rule: unknown subject type %d", rule->rr_subject_type); } } if (resourcestr == NULL || resourcestr[0] == '\0') rule->rr_resource = RACCT_UNDEFINED; else { error = str2value(resourcestr, &rule->rr_resource, resourcenames); if (error != 0) goto out; } if (actionstr == 
NULL || actionstr[0] == '\0') rule->rr_action = RCTL_ACTION_UNDEFINED; else { error = str2value(actionstr, &rule->rr_action, actionnames); if (error != 0) goto out; } if (amountstr == NULL || amountstr[0] == '\0') rule->rr_amount = RCTL_AMOUNT_UNDEFINED; else { error = str2int64(amountstr, &rule->rr_amount); if (error != 0) goto out; if (RACCT_IS_IN_MILLIONS(rule->rr_resource)) { if (rule->rr_amount > INT64_MAX / 1000000) { error = ERANGE; goto out; } rule->rr_amount *= 1000000; } } if (perstr == NULL || perstr[0] == '\0') rule->rr_per = RCTL_SUBJECT_TYPE_UNDEFINED; else { error = str2value(perstr, &rule->rr_per, subjectnames); if (error != 0) goto out; } out: if (error == 0) *rulep = rule; else rctl_rule_release(rule); return (error); } /* * Link a rule with all the subjects it applies to. */ int rctl_rule_add(struct rctl_rule *rule) { struct proc *p; struct ucred *cred; struct uidinfo *uip; struct prison *pr; struct prison_racct *prr; struct loginclass *lc; struct rctl_rule *rule2; int match; ASSERT_RACCT_ENABLED(); KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified")); /* * Some rules just don't make sense, like "deny" rule for an undeniable * resource. The exception are the RSS and %CPU resources - they are * not deniable in the racct sense, but the limit is enforced in * a different way. */ if (rule->rr_action == RCTL_ACTION_DENY && !RACCT_IS_DENIABLE(rule->rr_resource) && rule->rr_resource != RACCT_RSS && rule->rr_resource != RACCT_PCTCPU) { return (EOPNOTSUPP); } if (rule->rr_action == RCTL_ACTION_THROTTLE && !RACCT_IS_DECAYING(rule->rr_resource)) { return (EOPNOTSUPP); } if (rule->rr_action == RCTL_ACTION_THROTTLE && rule->rr_resource == RACCT_PCTCPU) { return (EOPNOTSUPP); } if (rule->rr_per == RCTL_SUBJECT_TYPE_PROCESS && RACCT_IS_SLOPPY(rule->rr_resource)) { return (EOPNOTSUPP); } /* * Make sure there are no duplicated rules. Also, for the "deny" * rules, remove ones differing only by "amount". */ if (rule->rr_action == RCTL_ACTION_DENY) { rule2 = rctl_rule_duplicate(rule, M_WAITOK); rule2->rr_amount = RCTL_AMOUNT_UNDEFINED; rctl_rule_remove(rule2); rctl_rule_release(rule2); } else rctl_rule_remove(rule); switch (rule->rr_subject_type) { case RCTL_SUBJECT_TYPE_PROCESS: p = rule->rr_subject.rs_proc; KASSERT(p != NULL, ("rctl_rule_add: NULL proc")); rctl_racct_add_rule(p->p_racct, rule); /* * In case of per-process rule, we don't have anything more * to do. */ return (0); case RCTL_SUBJECT_TYPE_USER: uip = rule->rr_subject.rs_uip; KASSERT(uip != NULL, ("rctl_rule_add: NULL uip")); rctl_racct_add_rule(uip->ui_racct, rule); break; case RCTL_SUBJECT_TYPE_LOGINCLASS: lc = rule->rr_subject.rs_loginclass; KASSERT(lc != NULL, ("rctl_rule_add: NULL loginclass")); rctl_racct_add_rule(lc->lc_racct, rule); break; case RCTL_SUBJECT_TYPE_JAIL: prr = rule->rr_subject.rs_prison_racct; KASSERT(prr != NULL, ("rctl_rule_add: NULL pr")); rctl_racct_add_rule(prr->prr_racct, rule); break; default: panic("rctl_rule_add: unknown subject type %d", rule->rr_subject_type); } /* * Now go through all the processes and add the new rule to the ones * it applies to. 
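rctl_string_to_rule() above tokenizes the rule string in place with strsep(): ":" separates the first three fields, "=/" terminates the action, and "/" precedes the optional per-subject part, with empty fields meaning "undefined". A runnable sketch of just the splitting, reusing the 'user:X:openfiles:deny=N/process' shape quoted in the rctl_rule_link comment earlier in this file:

#include <stdio.h>
#include <string.h>

int
main(void)
{
	char rulestr[] = "user:1001:openfiles:deny=1024/process";
	char *p = rulestr;
	char *subject, *subject_id, *resource, *action, *amount, *per;

	subject = strsep(&p, ":");
	subject_id = strsep(&p, ":");
	resource = strsep(&p, ":");
	action = strsep(&p, "=/");	/* '=' or '/' both end the action */
	amount = strsep(&p, "/");
	per = p;			/* remainder; may be NULL */

	printf("subject=%s id=%s resource=%s action=%s amount=%s per=%s\n",
	    subject, subject_id, resource, action, amount,
	    per != NULL ? per : "(none)");
	return (0);
}

This prints subject=user id=1001 resource=openfiles action=deny amount=1024 per=process; the kernel then maps the words through the subjectnames/resourcenames/actionnames tables and scales amounts for RACCT_IS_IN_MILLIONS resources by 1000000.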
*/ sx_assert(&allproc_lock, SA_LOCKED); FOREACH_PROC_IN_SYSTEM(p) { cred = p->p_ucred; switch (rule->rr_subject_type) { case RCTL_SUBJECT_TYPE_USER: if (cred->cr_uidinfo == rule->rr_subject.rs_uip || cred->cr_ruidinfo == rule->rr_subject.rs_uip) break; continue; case RCTL_SUBJECT_TYPE_LOGINCLASS: if (cred->cr_loginclass == rule->rr_subject.rs_loginclass) break; continue; case RCTL_SUBJECT_TYPE_JAIL: match = 0; for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent) { if (pr->pr_prison_racct == rule->rr_subject.rs_prison_racct) { match = 1; break; } } if (match) break; continue; default: panic("rctl_rule_add: unknown subject type %d", rule->rr_subject_type); } rctl_racct_add_rule(p->p_racct, rule); } return (0); } static void rctl_rule_pre_callback(void) { RACCT_LOCK(); } static void rctl_rule_post_callback(void) { RACCT_UNLOCK(); } static void rctl_rule_remove_callback(struct racct *racct, void *arg2, void *arg3) { struct rctl_rule *filter = (struct rctl_rule *)arg2; int found = 0; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); found += rctl_racct_remove_rules(racct, filter); *((int *)arg3) += found; } /* * Remove all rules that match the filter. */ int rctl_rule_remove(struct rctl_rule *filter) { struct proc *p; int found = 0; ASSERT_RACCT_ENABLED(); if (filter->rr_subject_type == RCTL_SUBJECT_TYPE_PROCESS && filter->rr_subject.rs_proc != NULL) { p = filter->rr_subject.rs_proc; RACCT_LOCK(); found = rctl_racct_remove_rules(p->p_racct, filter); RACCT_UNLOCK(); if (found) return (0); return (ESRCH); } loginclass_racct_foreach(rctl_rule_remove_callback, rctl_rule_pre_callback, rctl_rule_post_callback, filter, (void *)&found); ui_racct_foreach(rctl_rule_remove_callback, rctl_rule_pre_callback, rctl_rule_post_callback, filter, (void *)&found); prison_racct_foreach(rctl_rule_remove_callback, rctl_rule_pre_callback, rctl_rule_post_callback, filter, (void *)&found); sx_assert(&allproc_lock, SA_LOCKED); RACCT_LOCK(); FOREACH_PROC_IN_SYSTEM(p) { found += rctl_racct_remove_rules(p->p_racct, filter); } RACCT_UNLOCK(); if (found) return (0); return (ESRCH); } /* * Appends a rule to the sbuf. 
*/ static void rctl_rule_to_sbuf(struct sbuf *sb, const struct rctl_rule *rule) { int64_t amount; ASSERT_RACCT_ENABLED(); sbuf_printf(sb, "%s:", rctl_subject_type_name(rule->rr_subject_type)); switch (rule->rr_subject_type) { case RCTL_SUBJECT_TYPE_PROCESS: if (rule->rr_subject.rs_proc == NULL) - sbuf_printf(sb, ":"); + sbuf_putc(sb, ':'); else sbuf_printf(sb, "%d:", rule->rr_subject.rs_proc->p_pid); break; case RCTL_SUBJECT_TYPE_USER: if (rule->rr_subject.rs_uip == NULL) - sbuf_printf(sb, ":"); + sbuf_putc(sb, ':'); else sbuf_printf(sb, "%d:", rule->rr_subject.rs_uip->ui_uid); break; case RCTL_SUBJECT_TYPE_LOGINCLASS: if (rule->rr_subject.rs_loginclass == NULL) - sbuf_printf(sb, ":"); + sbuf_putc(sb, ':'); else sbuf_printf(sb, "%s:", rule->rr_subject.rs_loginclass->lc_name); break; case RCTL_SUBJECT_TYPE_JAIL: if (rule->rr_subject.rs_prison_racct == NULL) - sbuf_printf(sb, ":"); + sbuf_putc(sb, ':'); else sbuf_printf(sb, "%s:", rule->rr_subject.rs_prison_racct->prr_name); break; default: panic("rctl_rule_to_sbuf: unknown subject type %d", rule->rr_subject_type); } amount = rule->rr_amount; if (amount != RCTL_AMOUNT_UNDEFINED && RACCT_IS_IN_MILLIONS(rule->rr_resource)) amount /= 1000000; sbuf_printf(sb, "%s:%s=%jd", rctl_resource_name(rule->rr_resource), rctl_action_name(rule->rr_action), amount); if (rule->rr_per != rule->rr_subject_type) sbuf_printf(sb, "/%s", rctl_subject_type_name(rule->rr_per)); } /* * Routine used by RCTL syscalls to read in input string. */ static int rctl_read_inbuf(char **inputstr, const char *inbufp, size_t inbuflen) { char *str; int error; ASSERT_RACCT_ENABLED(); if (inbuflen <= 0) return (EINVAL); if (inbuflen > RCTL_MAX_INBUFSIZE) return (E2BIG); str = malloc(inbuflen + 1, M_RCTL, M_WAITOK); error = copyinstr(inbufp, str, inbuflen, NULL); if (error != 0) { free(str, M_RCTL); return (error); } *inputstr = str; return (0); } /* * Routine used by RCTL syscalls to write out output string. 
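The sbuf changes in this hunk, sbuf_printf(sb, ":") becoming sbuf_putc(sb, ':') and literal strings elsewhere in the patch going through sbuf_cat(), skip the format-string machinery when nothing needs formatting, which is cheaper and removes any risk of a literal containing '%' being interpreted as a conversion. A small userland sketch, assuming FreeBSD's libsbuf counterpart of the kernel sbuf(9) API (compile with -lsbuf):

#include <sys/sbuf.h>
#include <stdio.h>

int
main(void)
{
	struct sbuf *sb = sbuf_new_auto();

	sbuf_cat(sb, "rule=");				/* literal: no format parsing */
	sbuf_printf(sb, "user:%d:openfiles", 1001);	/* formatting actually needed */
	sbuf_putc(sb, ',');				/* single separator character */
	sbuf_finish(sb);
	printf("%s\n", sbuf_data(sb));			/* rule=user:1001:openfiles, */
	sbuf_delete(sb);
	return (0);
}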
*/ static int rctl_write_outbuf(struct sbuf *outputsbuf, char *outbufp, size_t outbuflen) { int error; ASSERT_RACCT_ENABLED(); if (outputsbuf == NULL) return (0); sbuf_finish(outputsbuf); if (outbuflen < sbuf_len(outputsbuf) + 1) { sbuf_delete(outputsbuf); return (ERANGE); } error = copyout(sbuf_data(outputsbuf), outbufp, sbuf_len(outputsbuf) + 1); sbuf_delete(outputsbuf); return (error); } static struct sbuf * rctl_racct_to_sbuf(struct racct *racct, int sloppy) { struct sbuf *sb; int64_t amount; int i; ASSERT_RACCT_ENABLED(); sb = sbuf_new_auto(); for (i = 0; i <= RACCT_MAX; i++) { if (sloppy == 0 && RACCT_IS_SLOPPY(i)) continue; RACCT_LOCK(); amount = racct->r_resources[i]; RACCT_UNLOCK(); if (RACCT_IS_IN_MILLIONS(i)) amount /= 1000000; sbuf_printf(sb, "%s=%jd,", rctl_resource_name(i), amount); } sbuf_setpos(sb, sbuf_len(sb) - 1); return (sb); } int sys_rctl_get_racct(struct thread *td, struct rctl_get_racct_args *uap) { struct rctl_rule *filter; struct sbuf *outputsbuf = NULL; struct proc *p; struct uidinfo *uip; struct loginclass *lc; struct prison_racct *prr; char *inputstr; int error; if (!racct_enable) return (ENOSYS); error = priv_check(td, PRIV_RCTL_GET_RACCT); if (error != 0) return (error); error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen); if (error != 0) return (error); sx_slock(&allproc_lock); error = rctl_string_to_rule(inputstr, &filter); free(inputstr, M_RCTL); if (error != 0) { sx_sunlock(&allproc_lock); return (error); } switch (filter->rr_subject_type) { case RCTL_SUBJECT_TYPE_PROCESS: p = filter->rr_subject.rs_proc; if (p == NULL) { error = EINVAL; goto out; } outputsbuf = rctl_racct_to_sbuf(p->p_racct, 0); break; case RCTL_SUBJECT_TYPE_USER: uip = filter->rr_subject.rs_uip; if (uip == NULL) { error = EINVAL; goto out; } outputsbuf = rctl_racct_to_sbuf(uip->ui_racct, 1); break; case RCTL_SUBJECT_TYPE_LOGINCLASS: lc = filter->rr_subject.rs_loginclass; if (lc == NULL) { error = EINVAL; goto out; } outputsbuf = rctl_racct_to_sbuf(lc->lc_racct, 1); break; case RCTL_SUBJECT_TYPE_JAIL: prr = filter->rr_subject.rs_prison_racct; if (prr == NULL) { error = EINVAL; goto out; } outputsbuf = rctl_racct_to_sbuf(prr->prr_racct, 1); break; default: error = EINVAL; } out: rctl_rule_release(filter); sx_sunlock(&allproc_lock); if (error != 0) return (error); error = rctl_write_outbuf(outputsbuf, uap->outbufp, uap->outbuflen); return (error); } static void rctl_get_rules_callback(struct racct *racct, void *arg2, void *arg3) { struct rctl_rule *filter = (struct rctl_rule *)arg2; struct rctl_rule_link *link; struct sbuf *sb = (struct sbuf *)arg3; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); LIST_FOREACH(link, &racct->r_rule_links, rrl_next) { if (!rctl_rule_matches(link->rrl_rule, filter)) continue; rctl_rule_to_sbuf(sb, link->rrl_rule); - sbuf_printf(sb, ","); + sbuf_putc(sb, ','); } } int sys_rctl_get_rules(struct thread *td, struct rctl_get_rules_args *uap) { struct sbuf *sb; struct rctl_rule *filter; struct rctl_rule_link *link; struct proc *p; char *inputstr, *buf; size_t bufsize; int error; if (!racct_enable) return (ENOSYS); error = priv_check(td, PRIV_RCTL_GET_RULES); if (error != 0) return (error); error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen); if (error != 0) return (error); sx_slock(&allproc_lock); error = rctl_string_to_rule(inputstr, &filter); free(inputstr, M_RCTL); if (error != 0) { sx_sunlock(&allproc_lock); return (error); } bufsize = uap->outbuflen; if (bufsize > rctl_maxbufsize) { sx_sunlock(&allproc_lock); return (E2BIG); } buf = 
malloc(bufsize, M_RCTL, M_WAITOK); sb = sbuf_new(NULL, buf, bufsize, SBUF_FIXEDLEN); KASSERT(sb != NULL, ("sbuf_new failed")); FOREACH_PROC_IN_SYSTEM(p) { RACCT_LOCK(); LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) { /* * Non-process rules will be added to the buffer later. * Adding them here would result in duplicated output. */ if (link->rrl_rule->rr_subject_type != RCTL_SUBJECT_TYPE_PROCESS) continue; if (!rctl_rule_matches(link->rrl_rule, filter)) continue; rctl_rule_to_sbuf(sb, link->rrl_rule); - sbuf_printf(sb, ","); + sbuf_putc(sb, ','); } RACCT_UNLOCK(); } loginclass_racct_foreach(rctl_get_rules_callback, rctl_rule_pre_callback, rctl_rule_post_callback, filter, sb); ui_racct_foreach(rctl_get_rules_callback, rctl_rule_pre_callback, rctl_rule_post_callback, filter, sb); prison_racct_foreach(rctl_get_rules_callback, rctl_rule_pre_callback, rctl_rule_post_callback, filter, sb); if (sbuf_error(sb) == ENOMEM) { error = ERANGE; goto out; } /* * Remove trailing ",". */ if (sbuf_len(sb) > 0) sbuf_setpos(sb, sbuf_len(sb) - 1); error = rctl_write_outbuf(sb, uap->outbufp, uap->outbuflen); out: rctl_rule_release(filter); sx_sunlock(&allproc_lock); free(buf, M_RCTL); return (error); } int sys_rctl_get_limits(struct thread *td, struct rctl_get_limits_args *uap) { struct sbuf *sb; struct rctl_rule *filter; struct rctl_rule_link *link; char *inputstr, *buf; size_t bufsize; int error; if (!racct_enable) return (ENOSYS); error = priv_check(td, PRIV_RCTL_GET_LIMITS); if (error != 0) return (error); error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen); if (error != 0) return (error); sx_slock(&allproc_lock); error = rctl_string_to_rule(inputstr, &filter); free(inputstr, M_RCTL); if (error != 0) { sx_sunlock(&allproc_lock); return (error); } if (filter->rr_subject_type == RCTL_SUBJECT_TYPE_UNDEFINED) { rctl_rule_release(filter); sx_sunlock(&allproc_lock); return (EINVAL); } if (filter->rr_subject_type != RCTL_SUBJECT_TYPE_PROCESS) { rctl_rule_release(filter); sx_sunlock(&allproc_lock); return (EOPNOTSUPP); } if (filter->rr_subject.rs_proc == NULL) { rctl_rule_release(filter); sx_sunlock(&allproc_lock); return (EINVAL); } bufsize = uap->outbuflen; if (bufsize > rctl_maxbufsize) { rctl_rule_release(filter); sx_sunlock(&allproc_lock); return (E2BIG); } buf = malloc(bufsize, M_RCTL, M_WAITOK); sb = sbuf_new(NULL, buf, bufsize, SBUF_FIXEDLEN); KASSERT(sb != NULL, ("sbuf_new failed")); RACCT_LOCK(); LIST_FOREACH(link, &filter->rr_subject.rs_proc->p_racct->r_rule_links, rrl_next) { rctl_rule_to_sbuf(sb, link->rrl_rule); - sbuf_printf(sb, ","); + sbuf_putc(sb, ','); } RACCT_UNLOCK(); if (sbuf_error(sb) == ENOMEM) { error = ERANGE; sbuf_delete(sb); goto out; } /* * Remove trailing ",". */ if (sbuf_len(sb) > 0) sbuf_setpos(sb, sbuf_len(sb) - 1); error = rctl_write_outbuf(sb, uap->outbufp, uap->outbuflen); out: rctl_rule_release(filter); sx_sunlock(&allproc_lock); free(buf, M_RCTL); return (error); } int sys_rctl_add_rule(struct thread *td, struct rctl_add_rule_args *uap) { struct rctl_rule *rule; char *inputstr; int error; if (!racct_enable) return (ENOSYS); error = priv_check(td, PRIV_RCTL_ADD_RULE); if (error != 0) return (error); error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen); if (error != 0) return (error); sx_slock(&allproc_lock); error = rctl_string_to_rule(inputstr, &rule); free(inputstr, M_RCTL); if (error != 0) { sx_sunlock(&allproc_lock); return (error); } /* * The 'per' part of a rule is optional. 
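 * When it is left out, it defaults to the subject type below; e.g. a rule
 * submitted as "user:1001:maxproc:deny=30" (hypothetical values) behaves
 * the same as "user:1001:maxproc:deny=30/user".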
*/ if (rule->rr_per == RCTL_SUBJECT_TYPE_UNDEFINED && rule->rr_subject_type != RCTL_SUBJECT_TYPE_UNDEFINED) rule->rr_per = rule->rr_subject_type; if (!rctl_rule_fully_specified(rule)) { error = EINVAL; goto out; } error = rctl_rule_add(rule); out: rctl_rule_release(rule); sx_sunlock(&allproc_lock); return (error); } int sys_rctl_remove_rule(struct thread *td, struct rctl_remove_rule_args *uap) { struct rctl_rule *filter; char *inputstr; int error; if (!racct_enable) return (ENOSYS); error = priv_check(td, PRIV_RCTL_REMOVE_RULE); if (error != 0) return (error); error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen); if (error != 0) return (error); sx_slock(&allproc_lock); error = rctl_string_to_rule(inputstr, &filter); free(inputstr, M_RCTL); if (error != 0) { sx_sunlock(&allproc_lock); return (error); } error = rctl_rule_remove(filter); rctl_rule_release(filter); sx_sunlock(&allproc_lock); return (error); } /* * Update RCTL rule list after credential change. */ void rctl_proc_ucred_changed(struct proc *p, struct ucred *newcred) { LIST_HEAD(, rctl_rule_link) newrules; struct rctl_rule_link *link, *newlink; struct uidinfo *newuip; struct loginclass *newlc; struct prison_racct *newprr; int rulecnt, i; if (!racct_enable) return; PROC_LOCK_ASSERT(p, MA_NOTOWNED); newuip = newcred->cr_ruidinfo; newlc = newcred->cr_loginclass; newprr = newcred->cr_prison->pr_prison_racct; LIST_INIT(&newrules); again: /* * First, count the rules that apply to the process with new * credentials. */ rulecnt = 0; RACCT_LOCK(); LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) { if (link->rrl_rule->rr_subject_type == RCTL_SUBJECT_TYPE_PROCESS) rulecnt++; } LIST_FOREACH(link, &newuip->ui_racct->r_rule_links, rrl_next) rulecnt++; LIST_FOREACH(link, &newlc->lc_racct->r_rule_links, rrl_next) rulecnt++; LIST_FOREACH(link, &newprr->prr_racct->r_rule_links, rrl_next) rulecnt++; RACCT_UNLOCK(); /* * Create temporary list. We've dropped the rctl_lock in order * to use M_WAITOK. */ for (i = 0; i < rulecnt; i++) { newlink = uma_zalloc(rctl_rule_link_zone, M_WAITOK); newlink->rrl_rule = NULL; newlink->rrl_exceeded = 0; LIST_INSERT_HEAD(&newrules, newlink, rrl_next); } newlink = LIST_FIRST(&newrules); /* * Assign rules to the newly allocated list entries. */ RACCT_LOCK(); LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) { if (link->rrl_rule->rr_subject_type == RCTL_SUBJECT_TYPE_PROCESS) { if (newlink == NULL) goto goaround; rctl_rule_acquire(link->rrl_rule); newlink->rrl_rule = link->rrl_rule; newlink->rrl_exceeded = link->rrl_exceeded; newlink = LIST_NEXT(newlink, rrl_next); rulecnt--; } } LIST_FOREACH(link, &newuip->ui_racct->r_rule_links, rrl_next) { if (newlink == NULL) goto goaround; rctl_rule_acquire(link->rrl_rule); newlink->rrl_rule = link->rrl_rule; newlink->rrl_exceeded = link->rrl_exceeded; newlink = LIST_NEXT(newlink, rrl_next); rulecnt--; } LIST_FOREACH(link, &newlc->lc_racct->r_rule_links, rrl_next) { if (newlink == NULL) goto goaround; rctl_rule_acquire(link->rrl_rule); newlink->rrl_rule = link->rrl_rule; newlink->rrl_exceeded = link->rrl_exceeded; newlink = LIST_NEXT(newlink, rrl_next); rulecnt--; } LIST_FOREACH(link, &newprr->prr_racct->r_rule_links, rrl_next) { if (newlink == NULL) goto goaround; rctl_rule_acquire(link->rrl_rule); newlink->rrl_rule = link->rrl_rule; newlink->rrl_exceeded = link->rrl_exceeded; newlink = LIST_NEXT(newlink, rrl_next); rulecnt--; } if (rulecnt == 0) { /* * Free the old rule list. 
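 * Reaching this point with rulecnt == 0 means every rule that applies under
 * the new credentials was copied into the preallocated "newrules" entries
 * and the rule set did not change while the lock was dropped, so the old
 * list can be freed and the new one swapped in while RACCT_LOCK is held.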
*/ while (!LIST_EMPTY(&p->p_racct->r_rule_links)) { link = LIST_FIRST(&p->p_racct->r_rule_links); LIST_REMOVE(link, rrl_next); rctl_rule_release(link->rrl_rule); uma_zfree(rctl_rule_link_zone, link); } /* * Replace lists and we're done. * * XXX: Is there any way to switch list heads instead * of iterating here? */ while (!LIST_EMPTY(&newrules)) { newlink = LIST_FIRST(&newrules); LIST_REMOVE(newlink, rrl_next); LIST_INSERT_HEAD(&p->p_racct->r_rule_links, newlink, rrl_next); } RACCT_UNLOCK(); return; } goaround: RACCT_UNLOCK(); /* * Rule list changed while we were not holding the rctl_lock. * Free the new list and try again. */ while (!LIST_EMPTY(&newrules)) { newlink = LIST_FIRST(&newrules); LIST_REMOVE(newlink, rrl_next); if (newlink->rrl_rule != NULL) rctl_rule_release(newlink->rrl_rule); uma_zfree(rctl_rule_link_zone, newlink); } goto again; } /* * Assign RCTL rules to the newly created process. */ int rctl_proc_fork(struct proc *parent, struct proc *child) { struct rctl_rule *rule; struct rctl_rule_link *link; int error; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); KASSERT(parent->p_racct != NULL, ("process without racct; p = %p", parent)); LIST_INIT(&child->p_racct->r_rule_links); /* * Go through limits applicable to the parent and assign them * to the child. Rules with 'process' subject have to be duplicated * in order to make their rr_subject point to the new process. */ LIST_FOREACH(link, &parent->p_racct->r_rule_links, rrl_next) { if (link->rrl_rule->rr_subject_type == RCTL_SUBJECT_TYPE_PROCESS) { rule = rctl_rule_duplicate(link->rrl_rule, M_NOWAIT); if (rule == NULL) goto fail; KASSERT(rule->rr_subject.rs_proc == parent, ("rule->rr_subject.rs_proc != parent")); rule->rr_subject.rs_proc = child; error = rctl_racct_add_rule_locked(child->p_racct, rule); rctl_rule_release(rule); if (error != 0) goto fail; } else { error = rctl_racct_add_rule_locked(child->p_racct, link->rrl_rule); if (error != 0) goto fail; } } return (0); fail: while (!LIST_EMPTY(&child->p_racct->r_rule_links)) { link = LIST_FIRST(&child->p_racct->r_rule_links); LIST_REMOVE(link, rrl_next); rctl_rule_release(link->rrl_rule); uma_zfree(rctl_rule_link_zone, link); } return (EAGAIN); } /* * Release rules attached to the racct. */ void rctl_racct_release(struct racct *racct) { struct rctl_rule_link *link; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); while (!LIST_EMPTY(&racct->r_rule_links)) { link = LIST_FIRST(&racct->r_rule_links); LIST_REMOVE(link, rrl_next); rctl_rule_release(link->rrl_rule); uma_zfree(rctl_rule_link_zone, link); } } static void rctl_init(void) { if (!racct_enable) return; rctl_rule_zone = uma_zcreate("rctl_rule", sizeof(struct rctl_rule), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); rctl_rule_link_zone = uma_zcreate("rctl_rule_link", sizeof(struct rctl_rule_link), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); /* * Set default values, making sure not to overwrite the ones * fetched from tunables. Most of those could be set at the * declaration, except for the rctl_throttle_max - we cannot * set it there due to hz not being compile time constant. 
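 * For example, with the common hz = 1000 and no tunables set (illustrative
 * numbers only): rctl_throttle_min becomes 1 tick, rctl_throttle_max
 * becomes 2 * hz = 2000 ticks (two seconds), and both throttle percentages
 * default to 100.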
*/ if (rctl_throttle_min < 1) rctl_throttle_min = 1; if (rctl_throttle_max < rctl_throttle_min) rctl_throttle_max = 2 * hz; if (rctl_throttle_pct < 0) rctl_throttle_pct = 100; if (rctl_throttle_pct2 < 0) rctl_throttle_pct2 = 100; } #else /* !RCTL */ #include #include int sys_rctl_get_racct(struct thread *td, struct rctl_get_racct_args *uap) { return (ENOSYS); } int sys_rctl_get_rules(struct thread *td, struct rctl_get_rules_args *uap) { return (ENOSYS); } int sys_rctl_get_limits(struct thread *td, struct rctl_get_limits_args *uap) { return (ENOSYS); } int sys_rctl_add_rule(struct thread *td, struct rctl_add_rule_args *uap) { return (ENOSYS); } int sys_rctl_remove_rule(struct thread *td, struct rctl_remove_rule_args *uap) { return (ENOSYS); } #endif /* RCTL */ diff --git a/sys/kern/kern_sig.c b/sys/kern/kern_sig.c index c847783cd3da..8726c35e15a5 100644 --- a/sys/kern/kern_sig.c +++ b/sys/kern/kern_sig.c @@ -1,4599 +1,4599 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_sig.c 8.7 (Berkeley) 4/18/94 */ #include #include "opt_capsicum.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define ONSIG 32 /* NSIG for osig* syscalls. XXX. 
*/ SDT_PROVIDER_DECLARE(proc); SDT_PROBE_DEFINE3(proc, , , signal__send, "struct thread *", "struct proc *", "int"); SDT_PROBE_DEFINE2(proc, , , signal__clear, "int", "ksiginfo_t *"); SDT_PROBE_DEFINE3(proc, , , signal__discard, "struct thread *", "struct proc *", "int"); static int coredump(struct thread *); static int killpg1(struct thread *td, int sig, int pgid, int all, ksiginfo_t *ksi); static int issignal(struct thread *td); static void reschedule_signals(struct proc *p, sigset_t block, int flags); static int sigprop(int sig); static void tdsigwakeup(struct thread *, int, sig_t, int); static int sig_suspend_threads(struct thread *, struct proc *); static int filt_sigattach(struct knote *kn); static void filt_sigdetach(struct knote *kn); static int filt_signal(struct knote *kn, long hint); static struct thread *sigtd(struct proc *p, int sig, bool fast_sigblock); static void sigqueue_start(void); static void sigfastblock_setpend(struct thread *td, bool resched); static uma_zone_t ksiginfo_zone = NULL; struct filterops sig_filtops = { .f_isfd = 0, .f_attach = filt_sigattach, .f_detach = filt_sigdetach, .f_event = filt_signal, }; static int kern_logsigexit = 1; SYSCTL_INT(_kern, KERN_LOGSIGEXIT, logsigexit, CTLFLAG_RW, &kern_logsigexit, 0, "Log processes quitting on abnormal signals to syslog(3)"); static int kern_forcesigexit = 1; SYSCTL_INT(_kern, OID_AUTO, forcesigexit, CTLFLAG_RW, &kern_forcesigexit, 0, "Force trap signal to be handled"); static SYSCTL_NODE(_kern, OID_AUTO, sigqueue, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "POSIX real time signal"); static int max_pending_per_proc = 128; SYSCTL_INT(_kern_sigqueue, OID_AUTO, max_pending_per_proc, CTLFLAG_RW, &max_pending_per_proc, 0, "Max pending signals per proc"); static int preallocate_siginfo = 1024; SYSCTL_INT(_kern_sigqueue, OID_AUTO, preallocate, CTLFLAG_RDTUN, &preallocate_siginfo, 0, "Preallocated signal memory size"); static int signal_overflow = 0; SYSCTL_INT(_kern_sigqueue, OID_AUTO, overflow, CTLFLAG_RD, &signal_overflow, 0, "Number of signals overflew"); static int signal_alloc_fail = 0; SYSCTL_INT(_kern_sigqueue, OID_AUTO, alloc_fail, CTLFLAG_RD, &signal_alloc_fail, 0, "signals failed to be allocated"); static int kern_lognosys = 0; SYSCTL_INT(_kern, OID_AUTO, lognosys, CTLFLAG_RWTUN, &kern_lognosys, 0, "Log invalid syscalls"); static int kern_signosys = 1; SYSCTL_INT(_kern, OID_AUTO, signosys, CTLFLAG_RWTUN, &kern_signosys, 0, "Send SIGSYS on return from invalid syscall"); __read_frequently bool sigfastblock_fetch_always = false; SYSCTL_BOOL(_kern, OID_AUTO, sigfastblock_fetch_always, CTLFLAG_RWTUN, &sigfastblock_fetch_always, 0, "Fetch sigfastblock word on each syscall entry for proper " "blocking semantic"); static bool kern_sig_discard_ign = true; SYSCTL_BOOL(_kern, OID_AUTO, sig_discard_ign, CTLFLAG_RWTUN, &kern_sig_discard_ign, 0, "Discard ignored signals on delivery, otherwise queue them to " "the target queue"); SYSINIT(signal, SI_SUB_P1003_1B, SI_ORDER_FIRST+3, sigqueue_start, NULL); /* * Policy -- Can ucred cr1 send SIGIO to process cr2? * Should use cr_cansignal() once cr_cansignal() allows SIGIO and SIGURG * in the right situations. 
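 *
 * Worked example with hypothetical credentials: if cr1->cr_uid == 1001 and
 * cr2->cr_ruid == 1001, the macro below is true and SIGIO may be delivered;
 * it is likewise true whenever cr1 is root (cr_uid == 0) or any of the
 * sender's real/effective uids matches one of the target's.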
*/ #define CANSIGIO(cr1, cr2) \ ((cr1)->cr_uid == 0 || \ (cr1)->cr_ruid == (cr2)->cr_ruid || \ (cr1)->cr_uid == (cr2)->cr_ruid || \ (cr1)->cr_ruid == (cr2)->cr_uid || \ (cr1)->cr_uid == (cr2)->cr_uid) static int sugid_coredump; SYSCTL_INT(_kern, OID_AUTO, sugid_coredump, CTLFLAG_RWTUN, &sugid_coredump, 0, "Allow setuid and setgid processes to dump core"); static int capmode_coredump; SYSCTL_INT(_kern, OID_AUTO, capmode_coredump, CTLFLAG_RWTUN, &capmode_coredump, 0, "Allow processes in capability mode to dump core"); static int do_coredump = 1; SYSCTL_INT(_kern, OID_AUTO, coredump, CTLFLAG_RW, &do_coredump, 0, "Enable/Disable coredumps"); static int set_core_nodump_flag = 0; SYSCTL_INT(_kern, OID_AUTO, nodump_coredump, CTLFLAG_RW, &set_core_nodump_flag, 0, "Enable setting the NODUMP flag on coredump files"); static int coredump_devctl = 0; SYSCTL_INT(_kern, OID_AUTO, coredump_devctl, CTLFLAG_RW, &coredump_devctl, 0, "Generate a devctl notification when processes coredump"); /* * Signal properties and actions. * The array below categorizes the signals and their default actions * according to the following properties: */ #define SIGPROP_KILL 0x01 /* terminates process by default */ #define SIGPROP_CORE 0x02 /* ditto and coredumps */ #define SIGPROP_STOP 0x04 /* suspend process */ #define SIGPROP_TTYSTOP 0x08 /* ditto, from tty */ #define SIGPROP_IGNORE 0x10 /* ignore by default */ #define SIGPROP_CONT 0x20 /* continue if suspended */ static const int sigproptbl[NSIG] = { [SIGHUP] = SIGPROP_KILL, [SIGINT] = SIGPROP_KILL, [SIGQUIT] = SIGPROP_KILL | SIGPROP_CORE, [SIGILL] = SIGPROP_KILL | SIGPROP_CORE, [SIGTRAP] = SIGPROP_KILL | SIGPROP_CORE, [SIGABRT] = SIGPROP_KILL | SIGPROP_CORE, [SIGEMT] = SIGPROP_KILL | SIGPROP_CORE, [SIGFPE] = SIGPROP_KILL | SIGPROP_CORE, [SIGKILL] = SIGPROP_KILL, [SIGBUS] = SIGPROP_KILL | SIGPROP_CORE, [SIGSEGV] = SIGPROP_KILL | SIGPROP_CORE, [SIGSYS] = SIGPROP_KILL | SIGPROP_CORE, [SIGPIPE] = SIGPROP_KILL, [SIGALRM] = SIGPROP_KILL, [SIGTERM] = SIGPROP_KILL, [SIGURG] = SIGPROP_IGNORE, [SIGSTOP] = SIGPROP_STOP, [SIGTSTP] = SIGPROP_STOP | SIGPROP_TTYSTOP, [SIGCONT] = SIGPROP_IGNORE | SIGPROP_CONT, [SIGCHLD] = SIGPROP_IGNORE, [SIGTTIN] = SIGPROP_STOP | SIGPROP_TTYSTOP, [SIGTTOU] = SIGPROP_STOP | SIGPROP_TTYSTOP, [SIGIO] = SIGPROP_IGNORE, [SIGXCPU] = SIGPROP_KILL, [SIGXFSZ] = SIGPROP_KILL, [SIGVTALRM] = SIGPROP_KILL, [SIGPROF] = SIGPROP_KILL, [SIGWINCH] = SIGPROP_IGNORE, [SIGINFO] = SIGPROP_IGNORE, [SIGUSR1] = SIGPROP_KILL, [SIGUSR2] = SIGPROP_KILL, }; #define _SIG_FOREACH_ADVANCE(i, set) ({ \ int __found; \ for (;;) { \ if (__bits != 0) { \ int __sig = ffs(__bits); \ __bits &= ~(1u << (__sig - 1)); \ sig = __i * sizeof((set)->__bits[0]) * NBBY + __sig; \ __found = 1; \ break; \ } \ if (++__i == _SIG_WORDS) { \ __found = 0; \ break; \ } \ __bits = (set)->__bits[__i]; \ } \ __found != 0; \ }) #define SIG_FOREACH(i, set) \ for (int32_t __i = -1, __bits = 0; \ _SIG_FOREACH_ADVANCE(i, set); ) \ static sigset_t fastblock_mask; static void ast_sig(struct thread *td, int tda) { struct proc *p; int old_boundary, sig; bool resched_sigs; p = td->td_proc; #ifdef DIAGNOSTIC if (p->p_numthreads == 1 && (tda & (TDAI(TDA_SIG) | TDAI(TDA_AST))) == 0) { PROC_LOCK(p); thread_lock(td); /* * Note that TDA_SIG should be re-read from * td_ast, since signal might have been delivered * after we cleared td_flags above. This is one of * the reason for looping check for AST condition. * See comment in userret() about P_PPWAIT. 
*/ if ((p->p_flag & P_PPWAIT) == 0 && (td->td_pflags & TDP_SIGFASTBLOCK) == 0) { if (SIGPENDING(td) && ((tda | td->td_ast) & (TDAI(TDA_SIG) | TDAI(TDA_AST))) == 0) { thread_unlock(td); /* fix dumps */ panic( "failed2 to set signal flags for ast p %p " "td %p tda %#x td_ast %#x fl %#x", p, td, tda, td->td_ast, td->td_flags); } } thread_unlock(td); PROC_UNLOCK(p); } #endif /* * Check for signals. Unlocked reads of p_pendingcnt or * p_siglist might cause process-directed signal to be handled * later. */ if ((tda & TDAI(TDA_SIG)) != 0 || p->p_pendingcnt > 0 || !SIGISEMPTY(p->p_siglist)) { sigfastblock_fetch(td); PROC_LOCK(p); old_boundary = ~TDB_BOUNDARY | (td->td_dbgflags & TDB_BOUNDARY); td->td_dbgflags |= TDB_BOUNDARY; mtx_lock(&p->p_sigacts->ps_mtx); while ((sig = cursig(td)) != 0) { KASSERT(sig >= 0, ("sig %d", sig)); postsig(sig); } mtx_unlock(&p->p_sigacts->ps_mtx); td->td_dbgflags &= old_boundary; PROC_UNLOCK(p); resched_sigs = true; } else { resched_sigs = false; } /* * Handle deferred update of the fast sigblock value, after * the postsig() loop was performed. */ sigfastblock_setpend(td, resched_sigs); } static void ast_sigsuspend(struct thread *td, int tda __unused) { MPASS((td->td_pflags & TDP_OLDMASK) != 0); td->td_pflags &= ~TDP_OLDMASK; kern_sigprocmask(td, SIG_SETMASK, &td->td_oldsigmask, NULL, 0); } static void sigqueue_start(void) { ksiginfo_zone = uma_zcreate("ksiginfo", sizeof(ksiginfo_t), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); uma_prealloc(ksiginfo_zone, preallocate_siginfo); p31b_setcfg(CTL_P1003_1B_REALTIME_SIGNALS, _POSIX_REALTIME_SIGNALS); p31b_setcfg(CTL_P1003_1B_RTSIG_MAX, SIGRTMAX - SIGRTMIN + 1); p31b_setcfg(CTL_P1003_1B_SIGQUEUE_MAX, max_pending_per_proc); SIGFILLSET(fastblock_mask); SIG_CANTMASK(fastblock_mask); ast_register(TDA_SIG, ASTR_UNCOND, 0, ast_sig); ast_register(TDA_SIGSUSPEND, ASTR_ASTF_REQUIRED | ASTR_TDP, TDP_OLDMASK, ast_sigsuspend); } ksiginfo_t * ksiginfo_alloc(int mwait) { MPASS(mwait == M_WAITOK || mwait == M_NOWAIT); if (ksiginfo_zone == NULL) return (NULL); return (uma_zalloc(ksiginfo_zone, mwait | M_ZERO)); } void ksiginfo_free(ksiginfo_t *ksi) { uma_zfree(ksiginfo_zone, ksi); } static __inline bool ksiginfo_tryfree(ksiginfo_t *ksi) { if ((ksi->ksi_flags & KSI_EXT) == 0) { uma_zfree(ksiginfo_zone, ksi); return (true); } return (false); } void sigqueue_init(sigqueue_t *list, struct proc *p) { SIGEMPTYSET(list->sq_signals); SIGEMPTYSET(list->sq_kill); SIGEMPTYSET(list->sq_ptrace); TAILQ_INIT(&list->sq_list); list->sq_proc = p; list->sq_flags = SQ_INIT; } /* * Get a signal's ksiginfo. 
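 * The queued siginfo for the signal, if any, is dequeued into *si and the
 * matching sq_kill/sq_ptrace bits are cleared.  A typical caller looks like
 * the one in kern_sigtimedwait() later in this file:
 *
 *	if (sigqueue_get(&td->td_sigqueue, sig, ksi) != 0 ||
 *	    sigqueue_get(&p->p_sigqueue, sig, ksi) != 0)
 *		(ksi now holds the delivered signal's information)
 *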
* Return: * 0 - signal not found * others - signal number */ static int sigqueue_get(sigqueue_t *sq, int signo, ksiginfo_t *si) { struct proc *p = sq->sq_proc; struct ksiginfo *ksi, *next; int count = 0; KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited")); if (!SIGISMEMBER(sq->sq_signals, signo)) return (0); if (SIGISMEMBER(sq->sq_ptrace, signo)) { count++; SIGDELSET(sq->sq_ptrace, signo); si->ksi_flags |= KSI_PTRACE; } if (SIGISMEMBER(sq->sq_kill, signo)) { count++; if (count == 1) SIGDELSET(sq->sq_kill, signo); } TAILQ_FOREACH_SAFE(ksi, &sq->sq_list, ksi_link, next) { if (ksi->ksi_signo == signo) { if (count == 0) { TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link); ksi->ksi_sigq = NULL; ksiginfo_copy(ksi, si); if (ksiginfo_tryfree(ksi) && p != NULL) p->p_pendingcnt--; } if (++count > 1) break; } } if (count <= 1) SIGDELSET(sq->sq_signals, signo); si->ksi_signo = signo; return (signo); } void sigqueue_take(ksiginfo_t *ksi) { struct ksiginfo *kp; struct proc *p; sigqueue_t *sq; if (ksi == NULL || (sq = ksi->ksi_sigq) == NULL) return; p = sq->sq_proc; TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link); ksi->ksi_sigq = NULL; if (!(ksi->ksi_flags & KSI_EXT) && p != NULL) p->p_pendingcnt--; for (kp = TAILQ_FIRST(&sq->sq_list); kp != NULL; kp = TAILQ_NEXT(kp, ksi_link)) { if (kp->ksi_signo == ksi->ksi_signo) break; } if (kp == NULL && !SIGISMEMBER(sq->sq_kill, ksi->ksi_signo) && !SIGISMEMBER(sq->sq_ptrace, ksi->ksi_signo)) SIGDELSET(sq->sq_signals, ksi->ksi_signo); } static int sigqueue_add(sigqueue_t *sq, int signo, ksiginfo_t *si) { struct proc *p = sq->sq_proc; struct ksiginfo *ksi; int ret = 0; KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited")); /* * SIGKILL/SIGSTOP cannot be caught or masked, so take the fast path * for these signals. */ if (signo == SIGKILL || signo == SIGSTOP || si == NULL) { SIGADDSET(sq->sq_kill, signo); goto out_set_bit; } /* directly insert the ksi, don't copy it */ if (si->ksi_flags & KSI_INS) { if (si->ksi_flags & KSI_HEAD) TAILQ_INSERT_HEAD(&sq->sq_list, si, ksi_link); else TAILQ_INSERT_TAIL(&sq->sq_list, si, ksi_link); si->ksi_sigq = sq; goto out_set_bit; } if (__predict_false(ksiginfo_zone == NULL)) { SIGADDSET(sq->sq_kill, signo); goto out_set_bit; } if (p != NULL && p->p_pendingcnt >= max_pending_per_proc) { signal_overflow++; ret = EAGAIN; } else if ((ksi = ksiginfo_alloc(M_NOWAIT)) == NULL) { signal_alloc_fail++; ret = EAGAIN; } else { if (p != NULL) p->p_pendingcnt++; ksiginfo_copy(si, ksi); ksi->ksi_signo = signo; if (si->ksi_flags & KSI_HEAD) TAILQ_INSERT_HEAD(&sq->sq_list, ksi, ksi_link); else TAILQ_INSERT_TAIL(&sq->sq_list, ksi, ksi_link); ksi->ksi_sigq = sq; } if (ret != 0) { if ((si->ksi_flags & KSI_PTRACE) != 0) { SIGADDSET(sq->sq_ptrace, signo); ret = 0; goto out_set_bit; } else if ((si->ksi_flags & KSI_TRAP) != 0 || (si->ksi_flags & KSI_SIGQ) == 0) { SIGADDSET(sq->sq_kill, signo); ret = 0; goto out_set_bit; } return (ret); } out_set_bit: SIGADDSET(sq->sq_signals, signo); return (ret); } void sigqueue_flush(sigqueue_t *sq) { struct proc *p = sq->sq_proc; ksiginfo_t *ksi; KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited")); if (p != NULL) PROC_LOCK_ASSERT(p, MA_OWNED); while ((ksi = TAILQ_FIRST(&sq->sq_list)) != NULL) { TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link); ksi->ksi_sigq = NULL; if (ksiginfo_tryfree(ksi) && p != NULL) p->p_pendingcnt--; } SIGEMPTYSET(sq->sq_signals); SIGEMPTYSET(sq->sq_kill); SIGEMPTYSET(sq->sq_ptrace); } static void sigqueue_move_set(sigqueue_t *src, sigqueue_t *dst, const sigset_t *set) { sigset_t tmp; struct proc *p1, *p2; 
ksiginfo_t *ksi, *next; KASSERT(src->sq_flags & SQ_INIT, ("src sigqueue not inited")); KASSERT(dst->sq_flags & SQ_INIT, ("dst sigqueue not inited")); p1 = src->sq_proc; p2 = dst->sq_proc; /* Move siginfo to target list */ TAILQ_FOREACH_SAFE(ksi, &src->sq_list, ksi_link, next) { if (SIGISMEMBER(*set, ksi->ksi_signo)) { TAILQ_REMOVE(&src->sq_list, ksi, ksi_link); if (p1 != NULL) p1->p_pendingcnt--; TAILQ_INSERT_TAIL(&dst->sq_list, ksi, ksi_link); ksi->ksi_sigq = dst; if (p2 != NULL) p2->p_pendingcnt++; } } /* Move pending bits to target list */ tmp = src->sq_kill; SIGSETAND(tmp, *set); SIGSETOR(dst->sq_kill, tmp); SIGSETNAND(src->sq_kill, tmp); tmp = src->sq_ptrace; SIGSETAND(tmp, *set); SIGSETOR(dst->sq_ptrace, tmp); SIGSETNAND(src->sq_ptrace, tmp); tmp = src->sq_signals; SIGSETAND(tmp, *set); SIGSETOR(dst->sq_signals, tmp); SIGSETNAND(src->sq_signals, tmp); } #if 0 static void sigqueue_move(sigqueue_t *src, sigqueue_t *dst, int signo) { sigset_t set; SIGEMPTYSET(set); SIGADDSET(set, signo); sigqueue_move_set(src, dst, &set); } #endif static void sigqueue_delete_set(sigqueue_t *sq, const sigset_t *set) { struct proc *p = sq->sq_proc; ksiginfo_t *ksi, *next; KASSERT(sq->sq_flags & SQ_INIT, ("src sigqueue not inited")); /* Remove siginfo queue */ TAILQ_FOREACH_SAFE(ksi, &sq->sq_list, ksi_link, next) { if (SIGISMEMBER(*set, ksi->ksi_signo)) { TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link); ksi->ksi_sigq = NULL; if (ksiginfo_tryfree(ksi) && p != NULL) p->p_pendingcnt--; } } SIGSETNAND(sq->sq_kill, *set); SIGSETNAND(sq->sq_ptrace, *set); SIGSETNAND(sq->sq_signals, *set); } void sigqueue_delete(sigqueue_t *sq, int signo) { sigset_t set; SIGEMPTYSET(set); SIGADDSET(set, signo); sigqueue_delete_set(sq, &set); } /* Remove a set of signals for a process */ static void sigqueue_delete_set_proc(struct proc *p, const sigset_t *set) { sigqueue_t worklist; struct thread *td0; PROC_LOCK_ASSERT(p, MA_OWNED); sigqueue_init(&worklist, NULL); sigqueue_move_set(&p->p_sigqueue, &worklist, set); FOREACH_THREAD_IN_PROC(p, td0) sigqueue_move_set(&td0->td_sigqueue, &worklist, set); sigqueue_flush(&worklist); } void sigqueue_delete_proc(struct proc *p, int signo) { sigset_t set; SIGEMPTYSET(set); SIGADDSET(set, signo); sigqueue_delete_set_proc(p, &set); } static void sigqueue_delete_stopmask_proc(struct proc *p) { sigset_t set; SIGEMPTYSET(set); SIGADDSET(set, SIGSTOP); SIGADDSET(set, SIGTSTP); SIGADDSET(set, SIGTTIN); SIGADDSET(set, SIGTTOU); sigqueue_delete_set_proc(p, &set); } /* * Determine signal that should be delivered to thread td, the current * thread, 0 if none. If there is a pending stop signal with default * action, the process stops in issignal(). */ int cursig(struct thread *td) { PROC_LOCK_ASSERT(td->td_proc, MA_OWNED); mtx_assert(&td->td_proc->p_sigacts->ps_mtx, MA_OWNED); THREAD_LOCK_ASSERT(td, MA_NOTOWNED); return (SIGPENDING(td) ? issignal(td) : 0); } /* * Arrange for ast() to handle unmasked pending signals on return to user * mode. This must be called whenever a signal is added to td_sigqueue or * unmasked in td_sigmask. */ void signotify(struct thread *td) { PROC_LOCK_ASSERT(td->td_proc, MA_OWNED); if (SIGPENDING(td)) ast_sched(td, TDA_SIG); } /* * Returns 1 (true) if altstack is configured for the thread, and the * passed stack bottom address falls into the altstack range. Handles * the 43 compat special case where the alt stack size is zero. 
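 *
 * Illustrative example with made-up addresses: with td_sigstk.ss_sp ==
 * 0x7fffde000000 and ss_size == 0x20000, any sp in the half-open range
 * [0x7fffde000000, 0x7fffde020000) makes sigonstack() return true, provided
 * TDP_ALTSTACK is set for the thread.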
*/ int sigonstack(size_t sp) { struct thread *td; td = curthread; if ((td->td_pflags & TDP_ALTSTACK) == 0) return (0); #if defined(COMPAT_43) if (SV_PROC_FLAG(td->td_proc, SV_AOUT) && td->td_sigstk.ss_size == 0) return ((td->td_sigstk.ss_flags & SS_ONSTACK) != 0); #endif return (sp >= (size_t)td->td_sigstk.ss_sp && sp < td->td_sigstk.ss_size + (size_t)td->td_sigstk.ss_sp); } static __inline int sigprop(int sig) { if (sig > 0 && sig < nitems(sigproptbl)) return (sigproptbl[sig]); return (0); } static bool sigact_flag_test(const struct sigaction *act, int flag) { /* * SA_SIGINFO is reset when signal disposition is set to * ignore or default. Other flags are kept according to user * settings. */ return ((act->sa_flags & flag) != 0 && (flag != SA_SIGINFO || ((__sighandler_t *)act->sa_sigaction != SIG_IGN && (__sighandler_t *)act->sa_sigaction != SIG_DFL))); } /* * kern_sigaction * sigaction * freebsd4_sigaction * osigaction */ int kern_sigaction(struct thread *td, int sig, const struct sigaction *act, struct sigaction *oact, int flags) { struct sigacts *ps; struct proc *p = td->td_proc; if (!_SIG_VALID(sig)) return (EINVAL); if (act != NULL && act->sa_handler != SIG_DFL && act->sa_handler != SIG_IGN && (act->sa_flags & ~(SA_ONSTACK | SA_RESTART | SA_RESETHAND | SA_NOCLDSTOP | SA_NODEFER | SA_NOCLDWAIT | SA_SIGINFO)) != 0) return (EINVAL); PROC_LOCK(p); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); if (oact) { memset(oact, 0, sizeof(*oact)); oact->sa_mask = ps->ps_catchmask[_SIG_IDX(sig)]; if (SIGISMEMBER(ps->ps_sigonstack, sig)) oact->sa_flags |= SA_ONSTACK; if (!SIGISMEMBER(ps->ps_sigintr, sig)) oact->sa_flags |= SA_RESTART; if (SIGISMEMBER(ps->ps_sigreset, sig)) oact->sa_flags |= SA_RESETHAND; if (SIGISMEMBER(ps->ps_signodefer, sig)) oact->sa_flags |= SA_NODEFER; if (SIGISMEMBER(ps->ps_siginfo, sig)) { oact->sa_flags |= SA_SIGINFO; oact->sa_sigaction = (__siginfohandler_t *)ps->ps_sigact[_SIG_IDX(sig)]; } else oact->sa_handler = ps->ps_sigact[_SIG_IDX(sig)]; if (sig == SIGCHLD && ps->ps_flag & PS_NOCLDSTOP) oact->sa_flags |= SA_NOCLDSTOP; if (sig == SIGCHLD && ps->ps_flag & PS_NOCLDWAIT) oact->sa_flags |= SA_NOCLDWAIT; } if (act) { if ((sig == SIGKILL || sig == SIGSTOP) && act->sa_handler != SIG_DFL) { mtx_unlock(&ps->ps_mtx); PROC_UNLOCK(p); return (EINVAL); } /* * Change setting atomically. */ ps->ps_catchmask[_SIG_IDX(sig)] = act->sa_mask; SIG_CANTMASK(ps->ps_catchmask[_SIG_IDX(sig)]); if (sigact_flag_test(act, SA_SIGINFO)) { ps->ps_sigact[_SIG_IDX(sig)] = (__sighandler_t *)act->sa_sigaction; SIGADDSET(ps->ps_siginfo, sig); } else { ps->ps_sigact[_SIG_IDX(sig)] = act->sa_handler; SIGDELSET(ps->ps_siginfo, sig); } if (!sigact_flag_test(act, SA_RESTART)) SIGADDSET(ps->ps_sigintr, sig); else SIGDELSET(ps->ps_sigintr, sig); if (sigact_flag_test(act, SA_ONSTACK)) SIGADDSET(ps->ps_sigonstack, sig); else SIGDELSET(ps->ps_sigonstack, sig); if (sigact_flag_test(act, SA_RESETHAND)) SIGADDSET(ps->ps_sigreset, sig); else SIGDELSET(ps->ps_sigreset, sig); if (sigact_flag_test(act, SA_NODEFER)) SIGADDSET(ps->ps_signodefer, sig); else SIGDELSET(ps->ps_signodefer, sig); if (sig == SIGCHLD) { if (act->sa_flags & SA_NOCLDSTOP) ps->ps_flag |= PS_NOCLDSTOP; else ps->ps_flag &= ~PS_NOCLDSTOP; if (act->sa_flags & SA_NOCLDWAIT) { /* * Paranoia: since SA_NOCLDWAIT is implemented * by reparenting the dying child to PID 1 (and * trust it to reap the zombie), PID 1 itself * is forbidden to set SA_NOCLDWAIT. 
*/ if (p->p_pid == 1) ps->ps_flag &= ~PS_NOCLDWAIT; else ps->ps_flag |= PS_NOCLDWAIT; } else ps->ps_flag &= ~PS_NOCLDWAIT; if (ps->ps_sigact[_SIG_IDX(SIGCHLD)] == SIG_IGN) ps->ps_flag |= PS_CLDSIGIGN; else ps->ps_flag &= ~PS_CLDSIGIGN; } /* * Set bit in ps_sigignore for signals that are set to SIG_IGN, * and for signals set to SIG_DFL where the default is to * ignore. However, don't put SIGCONT in ps_sigignore, as we * have to restart the process. */ if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN || (sigprop(sig) & SIGPROP_IGNORE && ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL)) { /* never to be seen again */ sigqueue_delete_proc(p, sig); if (sig != SIGCONT) /* easier in psignal */ SIGADDSET(ps->ps_sigignore, sig); SIGDELSET(ps->ps_sigcatch, sig); } else { SIGDELSET(ps->ps_sigignore, sig); if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL) SIGDELSET(ps->ps_sigcatch, sig); else SIGADDSET(ps->ps_sigcatch, sig); } #ifdef COMPAT_FREEBSD4 if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN || ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL || (flags & KSA_FREEBSD4) == 0) SIGDELSET(ps->ps_freebsd4, sig); else SIGADDSET(ps->ps_freebsd4, sig); #endif #ifdef COMPAT_43 if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN || ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL || (flags & KSA_OSIGSET) == 0) SIGDELSET(ps->ps_osigset, sig); else SIGADDSET(ps->ps_osigset, sig); #endif } mtx_unlock(&ps->ps_mtx); PROC_UNLOCK(p); return (0); } #ifndef _SYS_SYSPROTO_H_ struct sigaction_args { int sig; struct sigaction *act; struct sigaction *oact; }; #endif int sys_sigaction(struct thread *td, struct sigaction_args *uap) { struct sigaction act, oact; struct sigaction *actp, *oactp; int error; actp = (uap->act != NULL) ? &act : NULL; oactp = (uap->oact != NULL) ? &oact : NULL; if (actp) { error = copyin(uap->act, actp, sizeof(act)); if (error) return (error); } error = kern_sigaction(td, uap->sig, actp, oactp, 0); if (oactp && !error) error = copyout(oactp, uap->oact, sizeof(oact)); return (error); } #ifdef COMPAT_FREEBSD4 #ifndef _SYS_SYSPROTO_H_ struct freebsd4_sigaction_args { int sig; struct sigaction *act; struct sigaction *oact; }; #endif int freebsd4_sigaction(struct thread *td, struct freebsd4_sigaction_args *uap) { struct sigaction act, oact; struct sigaction *actp, *oactp; int error; actp = (uap->act != NULL) ? &act : NULL; oactp = (uap->oact != NULL) ? &oact : NULL; if (actp) { error = copyin(uap->act, actp, sizeof(act)); if (error) return (error); } error = kern_sigaction(td, uap->sig, actp, oactp, KSA_FREEBSD4); if (oactp && !error) error = copyout(oactp, uap->oact, sizeof(oact)); return (error); } #endif /* COMAPT_FREEBSD4 */ #ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */ #ifndef _SYS_SYSPROTO_H_ struct osigaction_args { int signum; struct osigaction *nsa; struct osigaction *osa; }; #endif int osigaction(struct thread *td, struct osigaction_args *uap) { struct osigaction sa; struct sigaction nsa, osa; struct sigaction *nsap, *osap; int error; if (uap->signum <= 0 || uap->signum >= ONSIG) return (EINVAL); nsap = (uap->nsa != NULL) ? &nsa : NULL; osap = (uap->osa != NULL) ? 
&osa : NULL; if (nsap) { error = copyin(uap->nsa, &sa, sizeof(sa)); if (error) return (error); nsap->sa_handler = sa.sa_handler; nsap->sa_flags = sa.sa_flags; OSIG2SIG(sa.sa_mask, nsap->sa_mask); } error = kern_sigaction(td, uap->signum, nsap, osap, KSA_OSIGSET); if (osap && !error) { sa.sa_handler = osap->sa_handler; sa.sa_flags = osap->sa_flags; SIG2OSIG(osap->sa_mask, sa.sa_mask); error = copyout(&sa, uap->osa, sizeof(sa)); } return (error); } #if !defined(__i386__) /* Avoid replicating the same stub everywhere */ int osigreturn(struct thread *td, struct osigreturn_args *uap) { return (nosys(td, (struct nosys_args *)uap)); } #endif #endif /* COMPAT_43 */ /* * Initialize signal state for process 0; * set to ignore signals that are ignored by default. */ void siginit(struct proc *p) { int i; struct sigacts *ps; PROC_LOCK(p); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); for (i = 1; i <= NSIG; i++) { if (sigprop(i) & SIGPROP_IGNORE && i != SIGCONT) { SIGADDSET(ps->ps_sigignore, i); } } mtx_unlock(&ps->ps_mtx); PROC_UNLOCK(p); } /* * Reset specified signal to the default disposition. */ static void sigdflt(struct sigacts *ps, int sig) { mtx_assert(&ps->ps_mtx, MA_OWNED); SIGDELSET(ps->ps_sigcatch, sig); if ((sigprop(sig) & SIGPROP_IGNORE) != 0 && sig != SIGCONT) SIGADDSET(ps->ps_sigignore, sig); ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL; SIGDELSET(ps->ps_siginfo, sig); } /* * Reset signals for an exec of the specified process. */ void execsigs(struct proc *p) { struct sigacts *ps; struct thread *td; /* * Reset caught signals. Held signals remain held * through td_sigmask (unless they were caught, * and are now ignored by default). */ PROC_LOCK_ASSERT(p, MA_OWNED); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); sig_drop_caught(p); /* * Reset stack state to the user stack. * Clear set of signals caught on the signal stack. */ td = curthread; MPASS(td->td_proc == p); td->td_sigstk.ss_flags = SS_DISABLE; td->td_sigstk.ss_size = 0; td->td_sigstk.ss_sp = 0; td->td_pflags &= ~TDP_ALTSTACK; /* * Reset no zombies if child dies flag as Solaris does. */ ps->ps_flag &= ~(PS_NOCLDWAIT | PS_CLDSIGIGN); if (ps->ps_sigact[_SIG_IDX(SIGCHLD)] == SIG_IGN) ps->ps_sigact[_SIG_IDX(SIGCHLD)] = SIG_DFL; mtx_unlock(&ps->ps_mtx); } /* * kern_sigprocmask() * * Manipulate signal mask. */ int kern_sigprocmask(struct thread *td, int how, sigset_t *set, sigset_t *oset, int flags) { sigset_t new_block, oset1; struct proc *p; int error; p = td->td_proc; if ((flags & SIGPROCMASK_PROC_LOCKED) != 0) PROC_LOCK_ASSERT(p, MA_OWNED); else PROC_LOCK(p); mtx_assert(&p->p_sigacts->ps_mtx, (flags & SIGPROCMASK_PS_LOCKED) != 0 ? MA_OWNED : MA_NOTOWNED); if (oset != NULL) *oset = td->td_sigmask; error = 0; if (set != NULL) { switch (how) { case SIG_BLOCK: SIG_CANTMASK(*set); oset1 = td->td_sigmask; SIGSETOR(td->td_sigmask, *set); new_block = td->td_sigmask; SIGSETNAND(new_block, oset1); break; case SIG_UNBLOCK: SIGSETNAND(td->td_sigmask, *set); signotify(td); goto out; case SIG_SETMASK: SIG_CANTMASK(*set); oset1 = td->td_sigmask; if (flags & SIGPROCMASK_OLD) SIGSETLO(td->td_sigmask, *set); else td->td_sigmask = *set; new_block = td->td_sigmask; SIGSETNAND(new_block, oset1); signotify(td); break; default: error = EINVAL; goto out; } /* * The new_block set contains signals that were not previously * blocked, but are blocked now. * * In case we block any signal that was not previously blocked * for td, and process has the signal pending, try to schedule * signal delivery to some thread that does not block the * signal, possibly waking it up. 
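 *
 * Concrete (hypothetical) case: a multi-threaded process has SIGTERM
 * pending in p_sigqueue and this thread just added SIGTERM to its mask via
 * SIG_BLOCK; new_block then contains SIGTERM, and reschedule_signals()
 * below hands delivery to another thread that still has it unblocked.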
*/ if (p->p_numthreads != 1) reschedule_signals(p, new_block, flags); } out: if (!(flags & SIGPROCMASK_PROC_LOCKED)) PROC_UNLOCK(p); return (error); } #ifndef _SYS_SYSPROTO_H_ struct sigprocmask_args { int how; const sigset_t *set; sigset_t *oset; }; #endif int sys_sigprocmask(struct thread *td, struct sigprocmask_args *uap) { sigset_t set, oset; sigset_t *setp, *osetp; int error; setp = (uap->set != NULL) ? &set : NULL; osetp = (uap->oset != NULL) ? &oset : NULL; if (setp) { error = copyin(uap->set, setp, sizeof(set)); if (error) return (error); } error = kern_sigprocmask(td, uap->how, setp, osetp, 0); if (osetp && !error) { error = copyout(osetp, uap->oset, sizeof(oset)); } return (error); } #ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */ #ifndef _SYS_SYSPROTO_H_ struct osigprocmask_args { int how; osigset_t mask; }; #endif int osigprocmask(struct thread *td, struct osigprocmask_args *uap) { sigset_t set, oset; int error; OSIG2SIG(uap->mask, set); error = kern_sigprocmask(td, uap->how, &set, &oset, 1); SIG2OSIG(oset, td->td_retval[0]); return (error); } #endif /* COMPAT_43 */ int sys_sigwait(struct thread *td, struct sigwait_args *uap) { ksiginfo_t ksi; sigset_t set; int error; error = copyin(uap->set, &set, sizeof(set)); if (error) { td->td_retval[0] = error; return (0); } error = kern_sigtimedwait(td, set, &ksi, NULL); if (error) { /* * sigwait() function shall not return EINTR, but * the syscall does. Non-ancient libc provides the * wrapper which hides EINTR. Otherwise, EINTR return * is used by libthr to handle required cancellation * point in the sigwait(). */ if (error == EINTR && td->td_proc->p_osrel < P_OSREL_SIGWAIT) return (ERESTART); td->td_retval[0] = error; return (0); } error = copyout(&ksi.ksi_signo, uap->sig, sizeof(ksi.ksi_signo)); td->td_retval[0] = error; return (0); } int sys_sigtimedwait(struct thread *td, struct sigtimedwait_args *uap) { struct timespec ts; struct timespec *timeout; sigset_t set; ksiginfo_t ksi; int error; if (uap->timeout) { error = copyin(uap->timeout, &ts, sizeof(ts)); if (error) return (error); timeout = &ts; } else timeout = NULL; error = copyin(uap->set, &set, sizeof(set)); if (error) return (error); error = kern_sigtimedwait(td, set, &ksi, timeout); if (error) return (error); if (uap->info) error = copyout(&ksi.ksi_info, uap->info, sizeof(siginfo_t)); if (error == 0) td->td_retval[0] = ksi.ksi_signo; return (error); } int sys_sigwaitinfo(struct thread *td, struct sigwaitinfo_args *uap) { ksiginfo_t ksi; sigset_t set; int error; error = copyin(uap->set, &set, sizeof(set)); if (error) return (error); error = kern_sigtimedwait(td, set, &ksi, NULL); if (error) return (error); if (uap->info) error = copyout(&ksi.ksi_info, uap->info, sizeof(siginfo_t)); if (error == 0) td->td_retval[0] = ksi.ksi_signo; return (error); } static void proc_td_siginfo_capture(struct thread *td, siginfo_t *si) { struct thread *thr; FOREACH_THREAD_IN_PROC(td->td_proc, thr) { if (thr == td) thr->td_si = *si; else thr->td_si.si_signo = 0; } } int kern_sigtimedwait(struct thread *td, sigset_t waitset, ksiginfo_t *ksi, struct timespec *timeout) { struct sigacts *ps; sigset_t saved_mask, new_block; struct proc *p; int error, sig, timevalid = 0; sbintime_t sbt, precision, tsbt; struct timespec ts; bool traced; p = td->td_proc; error = 0; traced = false; /* Ensure the sigfastblock value is up to date. 
*/ sigfastblock_fetch(td); if (timeout != NULL) { if (timeout->tv_nsec >= 0 && timeout->tv_nsec < 1000000000) { timevalid = 1; ts = *timeout; if (ts.tv_sec < INT32_MAX / 2) { tsbt = tstosbt(ts); precision = tsbt; precision >>= tc_precexp; if (TIMESEL(&sbt, tsbt)) sbt += tc_tick_sbt; sbt += tsbt; } else precision = sbt = 0; } } else precision = sbt = 0; ksiginfo_init(ksi); /* Some signals can not be waited for. */ SIG_CANTMASK(waitset); ps = p->p_sigacts; PROC_LOCK(p); saved_mask = td->td_sigmask; SIGSETNAND(td->td_sigmask, waitset); if ((p->p_sysent->sv_flags & SV_SIG_DISCIGN) != 0 || !kern_sig_discard_ign) { thread_lock(td); td->td_flags |= TDF_SIGWAIT; thread_unlock(td); } for (;;) { mtx_lock(&ps->ps_mtx); sig = cursig(td); mtx_unlock(&ps->ps_mtx); KASSERT(sig >= 0, ("sig %d", sig)); if (sig != 0 && SIGISMEMBER(waitset, sig)) { if (sigqueue_get(&td->td_sigqueue, sig, ksi) != 0 || sigqueue_get(&p->p_sigqueue, sig, ksi) != 0) { error = 0; break; } } if (error != 0) break; /* * POSIX says this must be checked after looking for pending * signals. */ if (timeout != NULL && !timevalid) { error = EINVAL; break; } if (traced) { error = EINTR; break; } error = msleep_sbt(&p->p_sigacts, &p->p_mtx, PPAUSE | PCATCH, "sigwait", sbt, precision, C_ABSOLUTE); /* The syscalls can not be restarted. */ if (error == ERESTART) error = EINTR; /* * If PTRACE_SCE or PTRACE_SCX were set after * userspace entered the syscall, return spurious * EINTR after wait was done. Only do this as last * resort after rechecking for possible queued signals * and expired timeouts. */ if (error == 0 && (p->p_ptevents & PTRACE_SYSCALL) != 0) traced = true; } thread_lock(td); td->td_flags &= ~TDF_SIGWAIT; thread_unlock(td); new_block = saved_mask; SIGSETNAND(new_block, td->td_sigmask); td->td_sigmask = saved_mask; /* * Fewer signals can be delivered to us, reschedule signal * notification. */ if (p->p_numthreads != 1) reschedule_signals(p, new_block, 0); if (error == 0) { SDT_PROBE2(proc, , , signal__clear, sig, ksi); if (ksi->ksi_code == SI_TIMER) itimer_accept(p, ksi->ksi_timerid, ksi); #ifdef KTRACE if (KTRPOINT(td, KTR_PSIG)) { sig_t action; mtx_lock(&ps->ps_mtx); action = ps->ps_sigact[_SIG_IDX(sig)]; mtx_unlock(&ps->ps_mtx); ktrpsig(sig, action, &td->td_sigmask, ksi->ksi_code); } #endif if (sig == SIGKILL) { proc_td_siginfo_capture(td, &ksi->ksi_info); sigexit(td, sig); } } PROC_UNLOCK(p); return (error); } #ifndef _SYS_SYSPROTO_H_ struct sigpending_args { sigset_t *set; }; #endif int sys_sigpending(struct thread *td, struct sigpending_args *uap) { struct proc *p = td->td_proc; sigset_t pending; PROC_LOCK(p); pending = p->p_sigqueue.sq_signals; SIGSETOR(pending, td->td_sigqueue.sq_signals); PROC_UNLOCK(p); return (copyout(&pending, uap->set, sizeof(sigset_t))); } #ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */ #ifndef _SYS_SYSPROTO_H_ struct osigpending_args { int dummy; }; #endif int osigpending(struct thread *td, struct osigpending_args *uap) { struct proc *p = td->td_proc; sigset_t pending; PROC_LOCK(p); pending = p->p_sigqueue.sq_signals; SIGSETOR(pending, td->td_sigqueue.sq_signals); PROC_UNLOCK(p); SIG2OSIG(pending, td->td_retval[0]); return (0); } #endif /* COMPAT_43 */ #if defined(COMPAT_43) /* * Generalized interface signal handler, 4.3-compatible. 
*/ #ifndef _SYS_SYSPROTO_H_ struct osigvec_args { int signum; struct sigvec *nsv; struct sigvec *osv; }; #endif /* ARGSUSED */ int osigvec(struct thread *td, struct osigvec_args *uap) { struct sigvec vec; struct sigaction nsa, osa; struct sigaction *nsap, *osap; int error; if (uap->signum <= 0 || uap->signum >= ONSIG) return (EINVAL); nsap = (uap->nsv != NULL) ? &nsa : NULL; osap = (uap->osv != NULL) ? &osa : NULL; if (nsap) { error = copyin(uap->nsv, &vec, sizeof(vec)); if (error) return (error); nsap->sa_handler = vec.sv_handler; OSIG2SIG(vec.sv_mask, nsap->sa_mask); nsap->sa_flags = vec.sv_flags; nsap->sa_flags ^= SA_RESTART; /* opposite of SV_INTERRUPT */ } error = kern_sigaction(td, uap->signum, nsap, osap, KSA_OSIGSET); if (osap && !error) { vec.sv_handler = osap->sa_handler; SIG2OSIG(osap->sa_mask, vec.sv_mask); vec.sv_flags = osap->sa_flags; vec.sv_flags &= ~SA_NOCLDWAIT; vec.sv_flags ^= SA_RESTART; error = copyout(&vec, uap->osv, sizeof(vec)); } return (error); } #ifndef _SYS_SYSPROTO_H_ struct osigblock_args { int mask; }; #endif int osigblock(struct thread *td, struct osigblock_args *uap) { sigset_t set, oset; OSIG2SIG(uap->mask, set); kern_sigprocmask(td, SIG_BLOCK, &set, &oset, 0); SIG2OSIG(oset, td->td_retval[0]); return (0); } #ifndef _SYS_SYSPROTO_H_ struct osigsetmask_args { int mask; }; #endif int osigsetmask(struct thread *td, struct osigsetmask_args *uap) { sigset_t set, oset; OSIG2SIG(uap->mask, set); kern_sigprocmask(td, SIG_SETMASK, &set, &oset, 0); SIG2OSIG(oset, td->td_retval[0]); return (0); } #endif /* COMPAT_43 */ /* * Suspend calling thread until signal, providing mask to be set in the * meantime. */ #ifndef _SYS_SYSPROTO_H_ struct sigsuspend_args { const sigset_t *sigmask; }; #endif /* ARGSUSED */ int sys_sigsuspend(struct thread *td, struct sigsuspend_args *uap) { sigset_t mask; int error; error = copyin(uap->sigmask, &mask, sizeof(mask)); if (error) return (error); return (kern_sigsuspend(td, mask)); } int kern_sigsuspend(struct thread *td, sigset_t mask) { struct proc *p = td->td_proc; int has_sig, sig; /* Ensure the sigfastblock value is up to date. */ sigfastblock_fetch(td); /* * When returning from sigsuspend, we want * the old mask to be restored after the * signal handler has finished. Thus, we * save it here and mark the sigacts structure * to indicate this. */ PROC_LOCK(p); kern_sigprocmask(td, SIG_SETMASK, &mask, &td->td_oldsigmask, SIGPROCMASK_PROC_LOCKED); td->td_pflags |= TDP_OLDMASK; ast_sched(td, TDA_SIGSUSPEND); /* * Process signals now. Otherwise, we can get spurious wakeup * due to signal entered process queue, but delivered to other * thread. But sigsuspend should return only on signal * delivery. */ (p->p_sysent->sv_set_syscall_retval)(td, EINTR); for (has_sig = 0; !has_sig;) { while (msleep(&p->p_sigacts, &p->p_mtx, PPAUSE|PCATCH, "pause", 0) == 0) /* void */; thread_suspend_check(0); mtx_lock(&p->p_sigacts->ps_mtx); while ((sig = cursig(td)) != 0) { KASSERT(sig >= 0, ("sig %d", sig)); has_sig += postsig(sig); } mtx_unlock(&p->p_sigacts->ps_mtx); /* * If PTRACE_SCE or PTRACE_SCX were set after * userspace entered the syscall, return spurious * EINTR. */ if ((p->p_ptevents & PTRACE_SYSCALL) != 0) has_sig += 1; } PROC_UNLOCK(p); td->td_errno = EINTR; td->td_pflags |= TDP_NERRNO; return (EJUSTRETURN); } #ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */ /* * Compatibility sigsuspend call for old binaries. Note nonstandard calling * convention: libc stub passes mask, not pointer, to save a copyin. 
*/ #ifndef _SYS_SYSPROTO_H_ struct osigsuspend_args { osigset_t mask; }; #endif /* ARGSUSED */ int osigsuspend(struct thread *td, struct osigsuspend_args *uap) { sigset_t mask; OSIG2SIG(uap->mask, mask); return (kern_sigsuspend(td, mask)); } #endif /* COMPAT_43 */ #if defined(COMPAT_43) #ifndef _SYS_SYSPROTO_H_ struct osigstack_args { struct sigstack *nss; struct sigstack *oss; }; #endif /* ARGSUSED */ int osigstack(struct thread *td, struct osigstack_args *uap) { struct sigstack nss, oss; int error = 0; if (uap->nss != NULL) { error = copyin(uap->nss, &nss, sizeof(nss)); if (error) return (error); } oss.ss_sp = td->td_sigstk.ss_sp; oss.ss_onstack = sigonstack(cpu_getstack(td)); if (uap->nss != NULL) { td->td_sigstk.ss_sp = nss.ss_sp; td->td_sigstk.ss_size = 0; td->td_sigstk.ss_flags |= nss.ss_onstack & SS_ONSTACK; td->td_pflags |= TDP_ALTSTACK; } if (uap->oss != NULL) error = copyout(&oss, uap->oss, sizeof(oss)); return (error); } #endif /* COMPAT_43 */ #ifndef _SYS_SYSPROTO_H_ struct sigaltstack_args { stack_t *ss; stack_t *oss; }; #endif /* ARGSUSED */ int sys_sigaltstack(struct thread *td, struct sigaltstack_args *uap) { stack_t ss, oss; int error; if (uap->ss != NULL) { error = copyin(uap->ss, &ss, sizeof(ss)); if (error) return (error); } error = kern_sigaltstack(td, (uap->ss != NULL) ? &ss : NULL, (uap->oss != NULL) ? &oss : NULL); if (error) return (error); if (uap->oss != NULL) error = copyout(&oss, uap->oss, sizeof(stack_t)); return (error); } int kern_sigaltstack(struct thread *td, stack_t *ss, stack_t *oss) { struct proc *p = td->td_proc; int oonstack; oonstack = sigonstack(cpu_getstack(td)); if (oss != NULL) { *oss = td->td_sigstk; oss->ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; } if (ss != NULL) { if (oonstack) return (EPERM); if ((ss->ss_flags & ~SS_DISABLE) != 0) return (EINVAL); if (!(ss->ss_flags & SS_DISABLE)) { if (ss->ss_size < p->p_sysent->sv_minsigstksz) return (ENOMEM); td->td_sigstk = *ss; td->td_pflags |= TDP_ALTSTACK; } else { td->td_pflags &= ~TDP_ALTSTACK; } } return (0); } struct killpg1_ctx { struct thread *td; ksiginfo_t *ksi; int sig; bool sent; bool found; int ret; }; static void killpg1_sendsig_locked(struct proc *p, struct killpg1_ctx *arg) { int err; err = p_cansignal(arg->td, p, arg->sig); if (err == 0 && arg->sig != 0) pksignal(p, arg->sig, arg->ksi); if (err != ESRCH) arg->found = true; if (err == 0) arg->sent = true; else if (arg->ret == 0 && err != ESRCH && err != EPERM) arg->ret = err; } static void killpg1_sendsig(struct proc *p, bool notself, struct killpg1_ctx *arg) { if (p->p_pid <= 1 || (p->p_flag & P_SYSTEM) != 0 || (notself && p == arg->td->td_proc) || p->p_state == PRS_NEW) return; PROC_LOCK(p); killpg1_sendsig_locked(p, arg); PROC_UNLOCK(p); } static void kill_processes_prison_cb(struct proc *p, void *arg) { struct killpg1_ctx *ctx = arg; if (p->p_pid <= 1 || (p->p_flag & P_SYSTEM) != 0 || (p == ctx->td->td_proc) || p->p_state == PRS_NEW) return; killpg1_sendsig_locked(p, ctx); } /* * Common code for kill process group/broadcast kill. * td is the calling thread, as usual. 
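 *
 * For reference, kern_kill() below maps the usual kill(2) pid encodings
 * onto this helper (a sketch of the existing dispatch, not new behaviour):
 *
 *	pid > 0		signal that single process directly
 *	pid == -1	killpg1(td, signum, 0, 1, ksi)	broadcast
 *	pid == 0	killpg1(td, signum, 0, 0, ksi)	caller's process group
 *	pid < -1	killpg1(td, signum, -pid, 0, ksi)	process group -pid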
*/ static int killpg1(struct thread *td, int sig, int pgid, int all, ksiginfo_t *ksi) { struct proc *p; struct pgrp *pgrp; struct killpg1_ctx arg; arg.td = td; arg.ksi = ksi; arg.sig = sig; arg.sent = false; arg.found = false; arg.ret = 0; if (all) { /* * broadcast */ prison_proc_iterate(td->td_ucred->cr_prison, kill_processes_prison_cb, &arg); } else { again: sx_slock(&proctree_lock); if (pgid == 0) { /* * zero pgid means send to my process group. */ pgrp = td->td_proc->p_pgrp; PGRP_LOCK(pgrp); } else { pgrp = pgfind(pgid); if (pgrp == NULL) { sx_sunlock(&proctree_lock); return (ESRCH); } } sx_sunlock(&proctree_lock); if (!sx_try_xlock(&pgrp->pg_killsx)) { PGRP_UNLOCK(pgrp); sx_xlock(&pgrp->pg_killsx); sx_xunlock(&pgrp->pg_killsx); goto again; } LIST_FOREACH(p, &pgrp->pg_members, p_pglist) { killpg1_sendsig(p, false, &arg); } PGRP_UNLOCK(pgrp); sx_xunlock(&pgrp->pg_killsx); } MPASS(arg.ret != 0 || arg.found || !arg.sent); if (arg.ret == 0 && !arg.sent) arg.ret = arg.found ? EPERM : ESRCH; return (arg.ret); } #ifndef _SYS_SYSPROTO_H_ struct kill_args { int pid; int signum; }; #endif /* ARGSUSED */ int sys_kill(struct thread *td, struct kill_args *uap) { return (kern_kill(td, uap->pid, uap->signum)); } int kern_kill(struct thread *td, pid_t pid, int signum) { ksiginfo_t ksi; struct proc *p; int error; /* * A process in capability mode can send signals only to himself. * The main rationale behind this is that abort(3) is implemented as * kill(getpid(), SIGABRT). */ if (IN_CAPABILITY_MODE(td) && pid != td->td_proc->p_pid) return (ECAPMODE); AUDIT_ARG_SIGNUM(signum); AUDIT_ARG_PID(pid); if ((u_int)signum > _SIG_MAXSIG) return (EINVAL); ksiginfo_init(&ksi); ksi.ksi_signo = signum; ksi.ksi_code = SI_USER; ksi.ksi_pid = td->td_proc->p_pid; ksi.ksi_uid = td->td_ucred->cr_ruid; if (pid > 0) { /* kill single process */ if ((p = pfind_any(pid)) == NULL) return (ESRCH); AUDIT_ARG_PROCESS(p); error = p_cansignal(td, p, signum); if (error == 0 && signum) pksignal(p, signum, &ksi); PROC_UNLOCK(p); return (error); } switch (pid) { case -1: /* broadcast signal */ return (killpg1(td, signum, 0, 1, &ksi)); case 0: /* signal own process group */ return (killpg1(td, signum, 0, 0, &ksi)); default: /* negative explicit process group */ return (killpg1(td, signum, -pid, 0, &ksi)); } /* NOTREACHED */ } int sys_pdkill(struct thread *td, struct pdkill_args *uap) { struct proc *p; int error; AUDIT_ARG_SIGNUM(uap->signum); AUDIT_ARG_FD(uap->fd); if ((u_int)uap->signum > _SIG_MAXSIG) return (EINVAL); error = procdesc_find(td, uap->fd, &cap_pdkill_rights, &p); if (error) return (error); AUDIT_ARG_PROCESS(p); error = p_cansignal(td, p, uap->signum); if (error == 0 && uap->signum) kern_psignal(p, uap->signum); PROC_UNLOCK(p); return (error); } #if defined(COMPAT_43) #ifndef _SYS_SYSPROTO_H_ struct okillpg_args { int pgid; int signum; }; #endif /* ARGSUSED */ int okillpg(struct thread *td, struct okillpg_args *uap) { ksiginfo_t ksi; AUDIT_ARG_SIGNUM(uap->signum); AUDIT_ARG_PID(uap->pgid); if ((u_int)uap->signum > _SIG_MAXSIG) return (EINVAL); ksiginfo_init(&ksi); ksi.ksi_signo = uap->signum; ksi.ksi_code = SI_USER; ksi.ksi_pid = td->td_proc->p_pid; ksi.ksi_uid = td->td_ucred->cr_ruid; return (killpg1(td, uap->signum, uap->pgid, 0, &ksi)); } #endif /* COMPAT_43 */ #ifndef _SYS_SYSPROTO_H_ struct sigqueue_args { pid_t pid; int signum; /* union sigval */ void *value; }; #endif int sys_sigqueue(struct thread *td, struct sigqueue_args *uap) { union sigval sv; sv.sival_ptr = uap->value; return (kern_sigqueue(td, uap->pid, 
uap->signum, &sv)); } int kern_sigqueue(struct thread *td, pid_t pid, int signum, union sigval *value) { ksiginfo_t ksi; struct proc *p; int error; if ((u_int)signum > _SIG_MAXSIG) return (EINVAL); /* * Specification says sigqueue can only send signal to * single process. */ if (pid <= 0) return (EINVAL); if ((p = pfind_any(pid)) == NULL) return (ESRCH); error = p_cansignal(td, p, signum); if (error == 0 && signum != 0) { ksiginfo_init(&ksi); ksi.ksi_flags = KSI_SIGQ; ksi.ksi_signo = signum; ksi.ksi_code = SI_QUEUE; ksi.ksi_pid = td->td_proc->p_pid; ksi.ksi_uid = td->td_ucred->cr_ruid; ksi.ksi_value = *value; error = pksignal(p, ksi.ksi_signo, &ksi); } PROC_UNLOCK(p); return (error); } /* * Send a signal to a process group. If checktty is 1, * limit to members which have a controlling terminal. */ void pgsignal(struct pgrp *pgrp, int sig, int checkctty, ksiginfo_t *ksi) { struct proc *p; if (pgrp) { PGRP_LOCK_ASSERT(pgrp, MA_OWNED); LIST_FOREACH(p, &pgrp->pg_members, p_pglist) { PROC_LOCK(p); if (p->p_state == PRS_NORMAL && (checkctty == 0 || p->p_flag & P_CONTROLT)) pksignal(p, sig, ksi); PROC_UNLOCK(p); } } } /* * Recalculate the signal mask and reset the signal disposition after * usermode frame for delivery is formed. Should be called after * mach-specific routine, because sysent->sv_sendsig() needs correct * ps_siginfo and signal mask. */ static void postsig_done(int sig, struct thread *td, struct sigacts *ps) { sigset_t mask; mtx_assert(&ps->ps_mtx, MA_OWNED); td->td_ru.ru_nsignals++; mask = ps->ps_catchmask[_SIG_IDX(sig)]; if (!SIGISMEMBER(ps->ps_signodefer, sig)) SIGADDSET(mask, sig); kern_sigprocmask(td, SIG_BLOCK, &mask, NULL, SIGPROCMASK_PROC_LOCKED | SIGPROCMASK_PS_LOCKED); if (SIGISMEMBER(ps->ps_sigreset, sig)) sigdflt(ps, sig); } /* * Send a signal caused by a trap to the current thread. If it will be * caught immediately, deliver it with correct code. Otherwise, post it * normally. */ void trapsignal(struct thread *td, ksiginfo_t *ksi) { struct sigacts *ps; struct proc *p; sigset_t sigmask; int sig; p = td->td_proc; sig = ksi->ksi_signo; KASSERT(_SIG_VALID(sig), ("invalid signal")); sigfastblock_fetch(td); PROC_LOCK(p); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); sigmask = td->td_sigmask; if (td->td_sigblock_val != 0) SIGSETOR(sigmask, fastblock_mask); if ((p->p_flag & P_TRACED) == 0 && SIGISMEMBER(ps->ps_sigcatch, sig) && !SIGISMEMBER(sigmask, sig)) { #ifdef KTRACE if (KTRPOINT(curthread, KTR_PSIG)) ktrpsig(sig, ps->ps_sigact[_SIG_IDX(sig)], &td->td_sigmask, ksi->ksi_code); #endif (*p->p_sysent->sv_sendsig)(ps->ps_sigact[_SIG_IDX(sig)], ksi, &td->td_sigmask); postsig_done(sig, td, ps); mtx_unlock(&ps->ps_mtx); } else { /* * Avoid a possible infinite loop if the thread * masking the signal or process is ignoring the * signal. */ if (kern_forcesigexit && (SIGISMEMBER(sigmask, sig) || ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN)) { SIGDELSET(td->td_sigmask, sig); SIGDELSET(ps->ps_sigcatch, sig); SIGDELSET(ps->ps_sigignore, sig); ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL; td->td_pflags &= ~TDP_SIGFASTBLOCK; td->td_sigblock_val = 0; } mtx_unlock(&ps->ps_mtx); p->p_sig = sig; /* XXX to verify code */ tdsendsignal(p, td, sig, ksi); } PROC_UNLOCK(p); } static struct thread * sigtd(struct proc *p, int sig, bool fast_sigblock) { struct thread *td, *signal_td; PROC_LOCK_ASSERT(p, MA_OWNED); MPASS(!fast_sigblock || p == curproc); /* * Check if current thread can handle the signal without * switching context to another thread. 
*/ if (curproc == p && !SIGISMEMBER(curthread->td_sigmask, sig) && (!fast_sigblock || curthread->td_sigblock_val == 0)) return (curthread); /* Find a non-stopped thread that does not mask the signal. */ signal_td = NULL; FOREACH_THREAD_IN_PROC(p, td) { if (!SIGISMEMBER(td->td_sigmask, sig) && (!fast_sigblock || td != curthread || td->td_sigblock_val == 0) && (td->td_flags & TDF_BOUNDARY) == 0) { signal_td = td; break; } } /* Select random (first) thread if no better match was found. */ if (signal_td == NULL) signal_td = FIRST_THREAD_IN_PROC(p); return (signal_td); } /* * Send the signal to the process. If the signal has an action, the action * is usually performed by the target process rather than the caller; we add * the signal to the set of pending signals for the process. * * Exceptions: * o When a stop signal is sent to a sleeping process that takes the * default action, the process is stopped without awakening it. * o SIGCONT restarts stopped processes (or puts them back to sleep) * regardless of the signal action (eg, blocked or ignored). * * Other ignored signals are discarded immediately. * * NB: This function may be entered from the debugger via the "kill" DDB * command. There is little that can be done to mitigate the possibly messy * side effects of this unwise possibility. */ void kern_psignal(struct proc *p, int sig) { ksiginfo_t ksi; ksiginfo_init(&ksi); ksi.ksi_signo = sig; ksi.ksi_code = SI_KERNEL; (void) tdsendsignal(p, NULL, sig, &ksi); } int pksignal(struct proc *p, int sig, ksiginfo_t *ksi) { return (tdsendsignal(p, NULL, sig, ksi)); } /* Utility function for finding a thread to send signal event to. */ int sigev_findtd(struct proc *p, struct sigevent *sigev, struct thread **ttd) { struct thread *td; if (sigev->sigev_notify == SIGEV_THREAD_ID) { td = tdfind(sigev->sigev_notify_thread_id, p->p_pid); if (td == NULL) return (ESRCH); *ttd = td; } else { *ttd = NULL; PROC_LOCK(p); } return (0); } void tdsignal(struct thread *td, int sig) { ksiginfo_t ksi; ksiginfo_init(&ksi); ksi.ksi_signo = sig; ksi.ksi_code = SI_KERNEL; (void) tdsendsignal(td->td_proc, td, sig, &ksi); } void tdksignal(struct thread *td, int sig, ksiginfo_t *ksi) { (void) tdsendsignal(td->td_proc, td, sig, ksi); } static int sig_sleepq_abort(struct thread *td, int intrval) { THREAD_LOCK_ASSERT(td, MA_OWNED); if (intrval == 0 && (td->td_flags & TDF_SIGWAIT) == 0) { thread_unlock(td); return (0); } return (sleepq_abort(td, intrval)); } int tdsendsignal(struct proc *p, struct thread *td, int sig, ksiginfo_t *ksi) { sig_t action; sigqueue_t *sigqueue; int prop; struct sigacts *ps; int intrval; int ret = 0; int wakeup_swapper; MPASS(td == NULL || p == td->td_proc); PROC_LOCK_ASSERT(p, MA_OWNED); if (!_SIG_VALID(sig)) panic("%s(): invalid signal %d", __func__, sig); KASSERT(ksi == NULL || !KSI_ONQ(ksi), ("%s: ksi on queue", __func__)); /* * IEEE Std 1003.1-2001: return success when killing a zombie. */ if (p->p_state == PRS_ZOMBIE) { if (ksi != NULL && (ksi->ksi_flags & KSI_INS) != 0) ksiginfo_tryfree(ksi); return (ret); } ps = p->p_sigacts; KNOTE_LOCKED(p->p_klist, NOTE_SIGNAL | sig); prop = sigprop(sig); if (td == NULL) { td = sigtd(p, sig, false); sigqueue = &p->p_sigqueue; } else sigqueue = &td->td_sigqueue; SDT_PROBE3(proc, , , signal__send, td, p, sig); /* * If the signal is being ignored, then we forget about it * immediately, except when the target process executes * sigwait(). (Note: we don't set SIGCONT in ps_sigignore, * and if it is set to SIG_IGN, action will be SIG_DFL here.) 
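 *
 * Illustrative userland consequence (a sketch, not part of this file;
 * assumes the usual <signal.h> interfaces):
 *
 *	signal(SIGUSR1, SIG_IGN);
 *	kill(getpid(), SIGUSR1);
 *
 * The kill() succeeds, but nothing is queued or delivered unless the
 * target is waiting in sigwait(2), which is the exception handled
 * below.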
*/ mtx_lock(&ps->ps_mtx); if (SIGISMEMBER(ps->ps_sigignore, sig)) { if (kern_sig_discard_ign && (p->p_sysent->sv_flags & SV_SIG_DISCIGN) == 0) { SDT_PROBE3(proc, , , signal__discard, td, p, sig); mtx_unlock(&ps->ps_mtx); if (ksi != NULL && (ksi->ksi_flags & KSI_INS) != 0) ksiginfo_tryfree(ksi); return (ret); } else { action = SIG_CATCH; intrval = 0; } } else { if (SIGISMEMBER(td->td_sigmask, sig)) action = SIG_HOLD; else if (SIGISMEMBER(ps->ps_sigcatch, sig)) action = SIG_CATCH; else action = SIG_DFL; if (SIGISMEMBER(ps->ps_sigintr, sig)) intrval = EINTR; else intrval = ERESTART; } mtx_unlock(&ps->ps_mtx); if (prop & SIGPROP_CONT) sigqueue_delete_stopmask_proc(p); else if (prop & SIGPROP_STOP) { /* * If sending a tty stop signal to a member of an orphaned * process group, discard the signal here if the action * is default; don't stop the process below if sleeping, * and don't clear any pending SIGCONT. */ if ((prop & SIGPROP_TTYSTOP) != 0 && (p->p_pgrp->pg_flags & PGRP_ORPHANED) != 0 && action == SIG_DFL) { if (ksi != NULL && (ksi->ksi_flags & KSI_INS) != 0) ksiginfo_tryfree(ksi); return (ret); } sigqueue_delete_proc(p, SIGCONT); if (p->p_flag & P_CONTINUED) { p->p_flag &= ~P_CONTINUED; PROC_LOCK(p->p_pptr); sigqueue_take(p->p_ksi); PROC_UNLOCK(p->p_pptr); } } ret = sigqueue_add(sigqueue, sig, ksi); if (ret != 0) return (ret); signotify(td); /* * Defer further processing for signals which are held, * except that stopped processes must be continued by SIGCONT. */ if (action == SIG_HOLD && !((prop & SIGPROP_CONT) && (p->p_flag & P_STOPPED_SIG))) return (ret); wakeup_swapper = 0; /* * Some signals have a process-wide effect and a per-thread * component. Most processing occurs when the process next * tries to cross the user boundary, however there are some * times when processing needs to be done immediately, such as * waking up threads so that they can cross the user boundary. * We try to do the per-process part here. */ if (P_SHOULDSTOP(p)) { KASSERT(!(p->p_flag & P_WEXIT), ("signal to stopped but exiting process")); if (sig == SIGKILL) { /* * If traced process is already stopped, * then no further action is necessary. */ if (p->p_flag & P_TRACED) goto out; /* * SIGKILL sets process running. * It will die elsewhere. * All threads must be restarted. */ p->p_flag &= ~P_STOPPED_SIG; goto runfast; } if (prop & SIGPROP_CONT) { /* * If traced process is already stopped, * then no further action is necessary. */ if (p->p_flag & P_TRACED) goto out; /* * If SIGCONT is default (or ignored), we continue the * process but don't leave the signal in sigqueue as * it has no further action. If SIGCONT is held, we * continue the process and leave the signal in * sigqueue. If the process catches SIGCONT, let it * handle the signal itself. If it isn't waiting on * an event, it goes back to run state. * Otherwise, process goes back to sleep state. */ p->p_flag &= ~P_STOPPED_SIG; PROC_SLOCK(p); if (p->p_numthreads == p->p_suspcount) { PROC_SUNLOCK(p); p->p_flag |= P_CONTINUED; p->p_xsig = SIGCONT; PROC_LOCK(p->p_pptr); childproc_continued(p); PROC_UNLOCK(p->p_pptr); PROC_SLOCK(p); } if (action == SIG_DFL) { thread_unsuspend(p); PROC_SUNLOCK(p); sigqueue_delete(sigqueue, sig); goto out_cont; } if (action == SIG_CATCH) { /* * The process wants to catch it so it needs * to run at least one thread, but which one? */ PROC_SUNLOCK(p); goto runfast; } /* * The signal is not ignored or caught. 
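 * By elimination this is the SIG_HOLD case: SIGCONT is blocked by the
 * selected thread, so continue the stopped process but leave the
 * signal queued for later delivery.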
*/ thread_unsuspend(p); PROC_SUNLOCK(p); goto out_cont; } if (prop & SIGPROP_STOP) { /* * If traced process is already stopped, * then no further action is necessary. */ if (p->p_flag & P_TRACED) goto out; /* * Already stopped, don't need to stop again * (If we did the shell could get confused). * Just make sure the signal STOP bit set. */ p->p_flag |= P_STOPPED_SIG; sigqueue_delete(sigqueue, sig); goto out; } /* * All other kinds of signals: * If a thread is sleeping interruptibly, simulate a * wakeup so that when it is continued it will be made * runnable and can look at the signal. However, don't make * the PROCESS runnable, leave it stopped. * It may run a bit until it hits a thread_suspend_check(). */ PROC_SLOCK(p); thread_lock(td); if (TD_CAN_ABORT(td)) wakeup_swapper = sig_sleepq_abort(td, intrval); else thread_unlock(td); PROC_SUNLOCK(p); goto out; /* * Mutexes are short lived. Threads waiting on them will * hit thread_suspend_check() soon. */ } else if (p->p_state == PRS_NORMAL) { if (p->p_flag & P_TRACED || action == SIG_CATCH) { tdsigwakeup(td, sig, action, intrval); goto out; } MPASS(action == SIG_DFL); if (prop & SIGPROP_STOP) { if (p->p_flag & (P_PPWAIT|P_WEXIT)) goto out; p->p_flag |= P_STOPPED_SIG; p->p_xsig = sig; PROC_SLOCK(p); wakeup_swapper = sig_suspend_threads(td, p); if (p->p_numthreads == p->p_suspcount) { /* * only thread sending signal to another * process can reach here, if thread is sending * signal to its process, because thread does * not suspend itself here, p_numthreads * should never be equal to p_suspcount. */ thread_stopped(p); PROC_SUNLOCK(p); sigqueue_delete_proc(p, p->p_xsig); } else PROC_SUNLOCK(p); goto out; } } else { /* Not in "NORMAL" state. discard the signal. */ sigqueue_delete(sigqueue, sig); goto out; } /* * The process is not stopped so we need to apply the signal to all the * running threads. */ runfast: tdsigwakeup(td, sig, action, intrval); PROC_SLOCK(p); thread_unsuspend(p); PROC_SUNLOCK(p); out_cont: itimer_proc_continue(p); kqtimer_proc_continue(p); out: /* If we jump here, proc slock should not be owned. */ PROC_SLOCK_ASSERT(p, MA_NOTOWNED); if (wakeup_swapper) kick_proc0(); return (ret); } /* * The force of a signal has been directed against a single * thread. We need to see what we can do about knocking it * out of any sleep it may be in etc. */ static void tdsigwakeup(struct thread *td, int sig, sig_t action, int intrval) { struct proc *p = td->td_proc; int prop, wakeup_swapper; PROC_LOCK_ASSERT(p, MA_OWNED); prop = sigprop(sig); PROC_SLOCK(p); thread_lock(td); /* * Bring the priority of a thread up if we want it to get * killed in this lifetime. Be careful to avoid bumping the * priority of the idle thread, since we still allow to signal * kernel processes. */ if (action == SIG_DFL && (prop & SIGPROP_KILL) != 0 && td->td_priority > PUSER && !TD_IS_IDLETHREAD(td)) sched_prio(td, PUSER); if (TD_ON_SLEEPQ(td)) { /* * If thread is sleeping uninterruptibly * we can't interrupt the sleep... the signal will * be noticed when the process returns through * trap() or syscall(). */ if ((td->td_flags & TDF_SINTR) == 0) goto out; /* * If SIGCONT is default (or ignored) and process is * asleep, we are finished; the process should not * be awakened. */ if ((prop & SIGPROP_CONT) && action == SIG_DFL) { thread_unlock(td); PROC_SUNLOCK(p); sigqueue_delete(&p->p_sigqueue, sig); /* * It may be on either list in this state. * Remove from both for now. 
*/ sigqueue_delete(&td->td_sigqueue, sig); return; } /* * Don't awaken a sleeping thread for SIGSTOP if the * STOP signal is deferred. */ if ((prop & SIGPROP_STOP) != 0 && (td->td_flags & (TDF_SBDRY | TDF_SERESTART | TDF_SEINTR)) == TDF_SBDRY) goto out; /* * Give low priority threads a better chance to run. */ if (td->td_priority > PUSER && !TD_IS_IDLETHREAD(td)) sched_prio(td, PUSER); wakeup_swapper = sig_sleepq_abort(td, intrval); PROC_SUNLOCK(p); if (wakeup_swapper) kick_proc0(); return; } /* * Other states do nothing with the signal immediately, * other than kicking ourselves if we are running. * It will either never be noticed, or noticed very soon. */ #ifdef SMP if (TD_IS_RUNNING(td) && td != curthread) forward_signal(td); #endif out: PROC_SUNLOCK(p); thread_unlock(td); } static void ptrace_coredumpreq(struct thread *td, struct proc *p, struct thr_coredump_req *tcq) { void *rl_cookie; if (p->p_sysent->sv_coredump == NULL) { tcq->tc_error = ENOSYS; return; } rl_cookie = vn_rangelock_wlock(tcq->tc_vp, 0, OFF_MAX); tcq->tc_error = p->p_sysent->sv_coredump(td, tcq->tc_vp, tcq->tc_limit, tcq->tc_flags); vn_rangelock_unlock(tcq->tc_vp, rl_cookie); } static void ptrace_syscallreq(struct thread *td, struct proc *p, struct thr_syscall_req *tsr) { struct sysentvec *sv; struct sysent *se; register_t rv_saved[2]; int error, nerror; int sc; bool audited, sy_thr_static; sv = p->p_sysent; if (sv->sv_table == NULL || sv->sv_size < tsr->ts_sa.code) { tsr->ts_ret.sr_error = ENOSYS; return; } sc = tsr->ts_sa.code; if (sc == SYS_syscall || sc == SYS___syscall) { sc = tsr->ts_sa.args[0]; memmove(&tsr->ts_sa.args[0], &tsr->ts_sa.args[1], sizeof(register_t) * (tsr->ts_nargs - 1)); } tsr->ts_sa.callp = se = &sv->sv_table[sc]; VM_CNT_INC(v_syscall); td->td_pticks = 0; if (__predict_false(td->td_cowgen != atomic_load_int( &td->td_proc->p_cowgen))) thread_cow_update(td); #ifdef CAPABILITY_MODE if (IN_CAPABILITY_MODE(td) && (se->sy_flags & SYF_CAPENABLED) == 0) { tsr->ts_ret.sr_error = ECAPMODE; return; } #endif sy_thr_static = (se->sy_thrcnt & SY_THR_STATIC) != 0; audited = AUDIT_SYSCALL_ENTER(sc, td) != 0; if (!sy_thr_static) { error = syscall_thread_enter(td, &se); sy_thr_static = (se->sy_thrcnt & SY_THR_STATIC) != 0; if (error != 0) { tsr->ts_ret.sr_error = error; return; } } rv_saved[0] = td->td_retval[0]; rv_saved[1] = td->td_retval[1]; nerror = td->td_errno; td->td_retval[0] = 0; td->td_retval[1] = 0; #ifdef KDTRACE_HOOKS if (se->sy_entry != 0) (*systrace_probe_func)(&tsr->ts_sa, SYSTRACE_ENTRY, 0); #endif tsr->ts_ret.sr_error = se->sy_call(td, tsr->ts_sa.args); #ifdef KDTRACE_HOOKS if (se->sy_return != 0) (*systrace_probe_func)(&tsr->ts_sa, SYSTRACE_RETURN, tsr->ts_ret.sr_error != 0 ? 
-1 : td->td_retval[0]); #endif tsr->ts_ret.sr_retval[0] = td->td_retval[0]; tsr->ts_ret.sr_retval[1] = td->td_retval[1]; td->td_retval[0] = rv_saved[0]; td->td_retval[1] = rv_saved[1]; td->td_errno = nerror; if (audited) AUDIT_SYSCALL_EXIT(error, td); if (!sy_thr_static) syscall_thread_exit(td, se); } static void ptrace_remotereq(struct thread *td, int flag) { struct proc *p; MPASS(td == curthread); p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); if ((td->td_dbgflags & flag) == 0) return; KASSERT((p->p_flag & P_STOPPED_TRACE) != 0, ("not stopped")); KASSERT(td->td_remotereq != NULL, ("td_remotereq is NULL")); PROC_UNLOCK(p); switch (flag) { case TDB_COREDUMPREQ: ptrace_coredumpreq(td, p, td->td_remotereq); break; case TDB_SCREMOTEREQ: ptrace_syscallreq(td, p, td->td_remotereq); break; default: __unreachable(); } PROC_LOCK(p); MPASS((td->td_dbgflags & flag) != 0); td->td_dbgflags &= ~flag; td->td_remotereq = NULL; wakeup(p); } static int sig_suspend_threads(struct thread *td, struct proc *p) { struct thread *td2; int wakeup_swapper; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); wakeup_swapper = 0; FOREACH_THREAD_IN_PROC(p, td2) { thread_lock(td2); ast_sched_locked(td2, TDA_SUSPEND); if ((TD_IS_SLEEPING(td2) || TD_IS_SWAPPED(td2)) && (td2->td_flags & TDF_SINTR)) { if (td2->td_flags & TDF_SBDRY) { /* * Once a thread is asleep with * TDF_SBDRY and without TDF_SERESTART * or TDF_SEINTR set, it should never * become suspended due to this check. */ KASSERT(!TD_IS_SUSPENDED(td2), ("thread with deferred stops suspended")); if (TD_SBDRY_INTR(td2)) { wakeup_swapper |= sleepq_abort(td2, TD_SBDRY_ERRNO(td2)); continue; } } else if (!TD_IS_SUSPENDED(td2)) thread_suspend_one(td2); } else if (!TD_IS_SUSPENDED(td2)) { #ifdef SMP if (TD_IS_RUNNING(td2) && td2 != td) forward_signal(td2); #endif } thread_unlock(td2); } return (wakeup_swapper); } /* * Stop the process for an event deemed interesting to the debugger. If si is * non-NULL, this is a signal exchange; the new signal requested by the * debugger will be returned for handling. If si is NULL, this is some other * type of interesting event. The debugger may request a signal be delivered in * that case as well, however it will be deferred until it can be handled. */ int ptracestop(struct thread *td, int sig, ksiginfo_t *si) { struct proc *p = td->td_proc; struct thread *td2; ksiginfo_t ksi; PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT(!(p->p_flag & P_WEXIT), ("Stopping exiting process")); WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, &p->p_mtx.lock_object, "Stopping for traced signal"); td->td_xsig = sig; if (si == NULL || (si->ksi_flags & KSI_PTRACE) == 0) { td->td_dbgflags |= TDB_XSIG; CTR4(KTR_PTRACE, "ptracestop: tid %d (pid %d) flags %#x sig %d", td->td_tid, p->p_pid, td->td_dbgflags, sig); PROC_SLOCK(p); while ((p->p_flag & P_TRACED) && (td->td_dbgflags & TDB_XSIG)) { if (P_KILLED(p)) { /* * Ensure that, if we've been PT_KILLed, the * exit status reflects that. Another thread * may also be in ptracestop(), having just * received the SIGKILL, but this thread was * unsuspended first. */ td->td_dbgflags &= ~TDB_XSIG; td->td_xsig = SIGKILL; p->p_ptevents = 0; break; } if (p->p_flag & P_SINGLE_EXIT && !(td->td_dbgflags & TDB_EXIT)) { /* * Ignore ptrace stops except for thread exit * events when the process exits. */ td->td_dbgflags &= ~TDB_XSIG; PROC_SUNLOCK(p); return (0); } /* * Make wait(2) work. Ensure that right after the * attach, the thread which was decided to become the * leader of attach gets reported to the waiter. 
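 * (p_xthread records which thread's stop is being reported and p_xsig
 * carries the stop signal; the waiting debugger observes the stop once
 * the process has fully suspended.)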
* Otherwise, just avoid overwriting another thread's * assignment to p_xthread. If another thread has * already set p_xthread, the current thread will get * a chance to report itself upon the next iteration. */ if ((td->td_dbgflags & TDB_FSTP) != 0 || ((p->p_flag2 & P2_PTRACE_FSTP) == 0 && p->p_xthread == NULL)) { p->p_xsig = sig; p->p_xthread = td; /* * If we are on sleepqueue already, * let sleepqueue code decide if it * needs to go sleep after attach. */ if (td->td_wchan == NULL) td->td_dbgflags &= ~TDB_FSTP; p->p_flag2 &= ~P2_PTRACE_FSTP; p->p_flag |= P_STOPPED_SIG | P_STOPPED_TRACE; sig_suspend_threads(td, p); } if ((td->td_dbgflags & TDB_STOPATFORK) != 0) { td->td_dbgflags &= ~TDB_STOPATFORK; } stopme: td->td_dbgflags |= TDB_SSWITCH; thread_suspend_switch(td, p); td->td_dbgflags &= ~TDB_SSWITCH; if ((td->td_dbgflags & (TDB_COREDUMPREQ | TDB_SCREMOTEREQ)) != 0) { MPASS((td->td_dbgflags & (TDB_COREDUMPREQ | TDB_SCREMOTEREQ)) != (TDB_COREDUMPREQ | TDB_SCREMOTEREQ)); PROC_SUNLOCK(p); ptrace_remotereq(td, td->td_dbgflags & (TDB_COREDUMPREQ | TDB_SCREMOTEREQ)); PROC_SLOCK(p); goto stopme; } if (p->p_xthread == td) p->p_xthread = NULL; if (!(p->p_flag & P_TRACED)) break; if (td->td_dbgflags & TDB_SUSPEND) { if (p->p_flag & P_SINGLE_EXIT) break; goto stopme; } } PROC_SUNLOCK(p); } if (si != NULL && sig == td->td_xsig) { /* Parent wants us to take the original signal unchanged. */ si->ksi_flags |= KSI_HEAD; if (sigqueue_add(&td->td_sigqueue, sig, si) != 0) si->ksi_signo = 0; } else if (td->td_xsig != 0) { /* * If parent wants us to take a new signal, then it will leave * it in td->td_xsig; otherwise we just look for signals again. */ ksiginfo_init(&ksi); ksi.ksi_signo = td->td_xsig; ksi.ksi_flags |= KSI_PTRACE; td2 = sigtd(p, td->td_xsig, false); tdsendsignal(p, td2, td->td_xsig, &ksi); if (td != td2) return (0); } return (td->td_xsig); } static void reschedule_signals(struct proc *p, sigset_t block, int flags) { struct sigacts *ps; struct thread *td; int sig; bool fastblk, pslocked; PROC_LOCK_ASSERT(p, MA_OWNED); ps = p->p_sigacts; pslocked = (flags & SIGPROCMASK_PS_LOCKED) != 0; mtx_assert(&ps->ps_mtx, pslocked ? MA_OWNED : MA_NOTOWNED); if (SIGISEMPTY(p->p_siglist)) return; SIGSETAND(block, p->p_siglist); fastblk = (flags & SIGPROCMASK_FASTBLK) != 0; SIG_FOREACH(sig, &block) { td = sigtd(p, sig, fastblk); /* * If sigtd() selected us despite sigfastblock is * blocking, do not activate AST or wake us, to avoid * loop in AST handler. */ if (fastblk && td == curthread) continue; signotify(td); if (!pslocked) mtx_lock(&ps->ps_mtx); if (p->p_flag & P_TRACED || (SIGISMEMBER(ps->ps_sigcatch, sig) && !SIGISMEMBER(td->td_sigmask, sig))) { tdsigwakeup(td, sig, SIG_CATCH, (SIGISMEMBER(ps->ps_sigintr, sig) ? EINTR : ERESTART)); } if (!pslocked) mtx_unlock(&ps->ps_mtx); } } void tdsigcleanup(struct thread *td) { struct proc *p; sigset_t unblocked; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sigqueue_flush(&td->td_sigqueue); if (p->p_numthreads == 1) return; /* * Since we cannot handle signals, notify signal post code * about this by filling the sigmask. * * Also, if needed, wake up thread(s) that do not block the * same signals as the exiting thread, since the thread might * have been selected for delivery and woken up. 
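 *
 * Below, "unblocked" is the set of signals this thread did not block;
 * any pending signal in that set may have been targeted at us, so it
 * is handed back to reschedule_signals() for redelivery to the
 * remaining threads.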
*/ SIGFILLSET(unblocked); SIGSETNAND(unblocked, td->td_sigmask); SIGFILLSET(td->td_sigmask); reschedule_signals(p, unblocked, 0); } static int sigdeferstop_curr_flags(int cflags) { MPASS((cflags & (TDF_SEINTR | TDF_SERESTART)) == 0 || (cflags & TDF_SBDRY) != 0); return (cflags & (TDF_SBDRY | TDF_SEINTR | TDF_SERESTART)); } /* * Defer the delivery of SIGSTOP for the current thread, according to * the requested mode. Returns previous flags, which must be restored * by sigallowstop(). * * TDF_SBDRY, TDF_SEINTR, and TDF_SERESTART flags are only set and * cleared by the current thread, which allow the lock-less read-only * accesses below. */ int sigdeferstop_impl(int mode) { struct thread *td; int cflags, nflags; td = curthread; cflags = sigdeferstop_curr_flags(td->td_flags); switch (mode) { case SIGDEFERSTOP_NOP: nflags = cflags; break; case SIGDEFERSTOP_OFF: nflags = 0; break; case SIGDEFERSTOP_SILENT: nflags = (cflags | TDF_SBDRY) & ~(TDF_SEINTR | TDF_SERESTART); break; case SIGDEFERSTOP_EINTR: nflags = (cflags | TDF_SBDRY | TDF_SEINTR) & ~TDF_SERESTART; break; case SIGDEFERSTOP_ERESTART: nflags = (cflags | TDF_SBDRY | TDF_SERESTART) & ~TDF_SEINTR; break; default: panic("sigdeferstop: invalid mode %x", mode); break; } if (cflags == nflags) return (SIGDEFERSTOP_VAL_NCHG); thread_lock(td); td->td_flags = (td->td_flags & ~cflags) | nflags; thread_unlock(td); return (cflags); } /* * Restores the STOP handling mode, typically permitting the delivery * of SIGSTOP for the current thread. This does not immediately * suspend if a stop was posted. Instead, the thread will suspend * either via ast() or a subsequent interruptible sleep. */ void sigallowstop_impl(int prev) { struct thread *td; int cflags; KASSERT(prev != SIGDEFERSTOP_VAL_NCHG, ("failed sigallowstop")); KASSERT((prev & ~(TDF_SBDRY | TDF_SEINTR | TDF_SERESTART)) == 0, ("sigallowstop: incorrect previous mode %x", prev)); td = curthread; cflags = sigdeferstop_curr_flags(td->td_flags); if (cflags != prev) { thread_lock(td); td->td_flags = (td->td_flags & ~cflags) | prev; thread_unlock(td); } } enum sigstatus { SIGSTATUS_HANDLE, SIGSTATUS_HANDLED, SIGSTATUS_IGNORE, SIGSTATUS_SBDRY_STOP, }; /* * The thread has signal "sig" pending. Figure out what to do with it: * * _HANDLE -> the caller should handle the signal * _HANDLED -> handled internally, reload pending signal set * _IGNORE -> ignored, remove from the set of pending signals and try the * next pending signal * _SBDRY_STOP -> the signal should stop the thread but this is not * permitted in the current context */ static enum sigstatus sigprocess(struct thread *td, int sig) { struct proc *p; struct sigacts *ps; struct sigqueue *queue; ksiginfo_t ksi; int prop; KASSERT(_SIG_VALID(sig), ("%s: invalid signal %d", __func__, sig)); p = td->td_proc; ps = p->p_sigacts; mtx_assert(&ps->ps_mtx, MA_OWNED); PROC_LOCK_ASSERT(p, MA_OWNED); /* * We should allow pending but ignored signals below * if there is sigwait() active, or P_TRACED was * on when they were posted. */ if (SIGISMEMBER(ps->ps_sigignore, sig) && (p->p_flag & P_TRACED) == 0 && (td->td_flags & TDF_SIGWAIT) == 0) { return (SIGSTATUS_IGNORE); } /* * If the process is going to single-thread mode to prepare * for exit, there is no sense in delivering any signal * to usermode. Another important consequence is that * msleep(..., PCATCH, ...) now is only interruptible by a * suspend request. */ if ((p->p_flag2 & P2_WEXIT) != 0) return (SIGSTATUS_IGNORE); if ((p->p_flag & (P_TRACED | P_PPTRACE)) == P_TRACED) { /* * If traced, always stop. 
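 * The stop is reported through ptracestop(); the debugger may discard
 * the signal or substitute another one when it resumes us, e.g.
 * (illustrative) ptrace(PT_CONTINUE, pid, (caddr_t)1, 0) discards it,
 * while a nonzero data argument delivers that signal instead.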
* Remove old signal from queue before the stop. * XXX shrug off debugger, it causes siginfo to * be thrown away. */ queue = &td->td_sigqueue; ksiginfo_init(&ksi); if (sigqueue_get(queue, sig, &ksi) == 0) { queue = &p->p_sigqueue; sigqueue_get(queue, sig, &ksi); } td->td_si = ksi.ksi_info; mtx_unlock(&ps->ps_mtx); sig = ptracestop(td, sig, &ksi); mtx_lock(&ps->ps_mtx); td->td_si.si_signo = 0; /* * Keep looking if the debugger discarded or * replaced the signal. */ if (sig == 0) return (SIGSTATUS_HANDLED); /* * If the signal became masked, re-queue it. */ if (SIGISMEMBER(td->td_sigmask, sig)) { ksi.ksi_flags |= KSI_HEAD; sigqueue_add(&p->p_sigqueue, sig, &ksi); return (SIGSTATUS_HANDLED); } /* * If the traced bit got turned off, requeue the signal and * reload the set of pending signals. This ensures that p_sig* * and p_sigact are consistent. */ if ((p->p_flag & P_TRACED) == 0) { if ((ksi.ksi_flags & KSI_PTRACE) == 0) { ksi.ksi_flags |= KSI_HEAD; sigqueue_add(queue, sig, &ksi); } return (SIGSTATUS_HANDLED); } } /* * Decide whether the signal should be returned. * Return the signal's number, or fall through * to clear it from the pending mask. */ switch ((intptr_t)p->p_sigacts->ps_sigact[_SIG_IDX(sig)]) { case (intptr_t)SIG_DFL: /* * Don't take default actions on system processes. */ if (p->p_pid <= 1) { #ifdef DIAGNOSTIC /* * Are you sure you want to ignore SIGSEGV * in init? XXX */ printf("Process (pid %lu) got signal %d\n", (u_long)p->p_pid, sig); #endif return (SIGSTATUS_IGNORE); } /* * If there is a pending stop signal to process with * default action, stop here, then clear the signal. * Traced or exiting processes should ignore stops. * Additionally, a member of an orphaned process group * should ignore tty stops. */ prop = sigprop(sig); if (prop & SIGPROP_STOP) { mtx_unlock(&ps->ps_mtx); if ((p->p_flag & (P_TRACED | P_WEXIT | P_SINGLE_EXIT)) != 0 || ((p->p_pgrp-> pg_flags & PGRP_ORPHANED) != 0 && (prop & SIGPROP_TTYSTOP) != 0)) { mtx_lock(&ps->ps_mtx); return (SIGSTATUS_IGNORE); } if (TD_SBDRY_INTR(td)) { KASSERT((td->td_flags & TDF_SBDRY) != 0, ("lost TDF_SBDRY")); mtx_lock(&ps->ps_mtx); return (SIGSTATUS_SBDRY_STOP); } WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, &p->p_mtx.lock_object, "Catching SIGSTOP"); sigqueue_delete(&td->td_sigqueue, sig); sigqueue_delete(&p->p_sigqueue, sig); p->p_flag |= P_STOPPED_SIG; p->p_xsig = sig; PROC_SLOCK(p); sig_suspend_threads(td, p); thread_suspend_switch(td, p); PROC_SUNLOCK(p); mtx_lock(&ps->ps_mtx); return (SIGSTATUS_HANDLED); } else if ((prop & SIGPROP_IGNORE) != 0 && (td->td_flags & TDF_SIGWAIT) == 0) { /* * Default action is to ignore; drop it if * not in kern_sigtimedwait(). */ return (SIGSTATUS_IGNORE); } else { return (SIGSTATUS_HANDLE); } case (intptr_t)SIG_IGN: if ((td->td_flags & TDF_SIGWAIT) == 0) return (SIGSTATUS_IGNORE); else return (SIGSTATUS_HANDLE); default: /* * This signal has an action, let postsig() process it. */ return (SIGSTATUS_HANDLE); } } /* * If the current process has received a signal (should be caught or cause * termination, should interrupt current syscall), return the signal number. * Stop signals with default action are processed immediately, then cleared; * they aren't returned. This is checked after each entry to the system for * a syscall or trap (though this can usually be done without calling * issignal by checking the pending signal masks in cursig.) 
The normal call * sequence is * * while (sig = cursig(curthread)) * postsig(sig); */ static int issignal(struct thread *td) { struct proc *p; sigset_t sigpending; int sig; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); for (;;) { sigpending = td->td_sigqueue.sq_signals; SIGSETOR(sigpending, p->p_sigqueue.sq_signals); SIGSETNAND(sigpending, td->td_sigmask); if ((p->p_flag & P_PPWAIT) != 0 || (td->td_flags & (TDF_SBDRY | TDF_SERESTART | TDF_SEINTR)) == TDF_SBDRY) SIG_STOPSIGMASK(sigpending); if (SIGISEMPTY(sigpending)) /* no signal to send */ return (0); /* * Do fast sigblock if requested by usermode. Since * we do know that there was a signal pending at this * point, set the FAST_SIGBLOCK_PEND as indicator for * usermode to perform a dummy call to * FAST_SIGBLOCK_UNBLOCK, which causes immediate * delivery of postponed pending signal. */ if ((td->td_pflags & TDP_SIGFASTBLOCK) != 0) { if (td->td_sigblock_val != 0) SIGSETNAND(sigpending, fastblock_mask); if (SIGISEMPTY(sigpending)) { td->td_pflags |= TDP_SIGFASTPENDING; return (0); } } if ((p->p_flag & (P_TRACED | P_PPTRACE)) == P_TRACED && (p->p_flag2 & P2_PTRACE_FSTP) != 0 && SIGISMEMBER(sigpending, SIGSTOP)) { /* * If debugger just attached, always consume * SIGSTOP from ptrace(PT_ATTACH) first, to * execute the debugger attach ritual in * order. */ td->td_dbgflags |= TDB_FSTP; SIGEMPTYSET(sigpending); SIGADDSET(sigpending, SIGSTOP); } SIG_FOREACH(sig, &sigpending) { switch (sigprocess(td, sig)) { case SIGSTATUS_HANDLE: return (sig); case SIGSTATUS_HANDLED: goto next; case SIGSTATUS_IGNORE: sigqueue_delete(&td->td_sigqueue, sig); sigqueue_delete(&p->p_sigqueue, sig); break; case SIGSTATUS_SBDRY_STOP: return (-1); } } next:; } } void thread_stopped(struct proc *p) { int n; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); n = p->p_suspcount; if (p == curproc) n++; if ((p->p_flag & P_STOPPED_SIG) && (n == p->p_numthreads)) { PROC_SUNLOCK(p); p->p_flag &= ~P_WAITED; PROC_LOCK(p->p_pptr); childproc_stopped(p, (p->p_flag & P_TRACED) ? CLD_TRAPPED : CLD_STOPPED); PROC_UNLOCK(p->p_pptr); PROC_SLOCK(p); } } /* * Take the action for the specified signal * from the current set of pending signals. */ int postsig(int sig) { struct thread *td; struct proc *p; struct sigacts *ps; sig_t action; ksiginfo_t ksi; sigset_t returnmask; KASSERT(sig != 0, ("postsig")); td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); ps = p->p_sigacts; mtx_assert(&ps->ps_mtx, MA_OWNED); ksiginfo_init(&ksi); if (sigqueue_get(&td->td_sigqueue, sig, &ksi) == 0 && sigqueue_get(&p->p_sigqueue, sig, &ksi) == 0) return (0); ksi.ksi_signo = sig; if (ksi.ksi_code == SI_TIMER) itimer_accept(p, ksi.ksi_timerid, &ksi); action = ps->ps_sigact[_SIG_IDX(sig)]; #ifdef KTRACE if (KTRPOINT(td, KTR_PSIG)) ktrpsig(sig, action, td->td_pflags & TDP_OLDMASK ? &td->td_oldsigmask : &td->td_sigmask, ksi.ksi_code); #endif if (action == SIG_DFL) { /* * Default action, where the default is to kill * the process. (Other cases were ignored above.) */ mtx_unlock(&ps->ps_mtx); proc_td_siginfo_capture(td, &ksi.ksi_info); sigexit(td, sig); /* NOTREACHED */ } else { /* * If we get here, the signal must be caught. */ KASSERT(action != SIG_IGN, ("postsig action %p", action)); KASSERT(!SIGISMEMBER(td->td_sigmask, sig), ("postsig action: blocked sig %d", sig)); /* * Set the new mask value and also defer further * occurrences of this signal. * * Special case: user has done a sigsuspend. 
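 * (Illustrative userland shape of that case, not part of this file:
 *
 *	sigprocmask(SIG_BLOCK, &set, &oset);
 *	...
 *	sigsuspend(&oset);
 *
 * The pre-sigsuspend mask was parked in td_oldsigmask with
 * TDP_OLDMASK set.)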
Here the * current mask is not of interest, but rather the * mask from before the sigsuspend is what we want * restored after the signal processing is completed. */ if (td->td_pflags & TDP_OLDMASK) { returnmask = td->td_oldsigmask; td->td_pflags &= ~TDP_OLDMASK; } else returnmask = td->td_sigmask; if (p->p_sig == sig) { p->p_sig = 0; } (*p->p_sysent->sv_sendsig)(action, &ksi, &returnmask); postsig_done(sig, td, ps); } return (1); } int sig_ast_checksusp(struct thread *td) { struct proc *p __diagused; int ret; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); if (!td_ast_pending(td, TDA_SUSPEND)) return (0); ret = thread_suspend_check(1); MPASS(ret == 0 || ret == EINTR || ret == ERESTART); return (ret); } int sig_ast_needsigchk(struct thread *td) { struct proc *p; struct sigacts *ps; int ret, sig; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); if (!td_ast_pending(td, TDA_SIG)) return (0); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); sig = cursig(td); if (sig == -1) { mtx_unlock(&ps->ps_mtx); KASSERT((td->td_flags & TDF_SBDRY) != 0, ("lost TDF_SBDRY")); KASSERT(TD_SBDRY_INTR(td), ("lost TDF_SERESTART of TDF_SEINTR")); KASSERT((td->td_flags & (TDF_SEINTR | TDF_SERESTART)) != (TDF_SEINTR | TDF_SERESTART), ("both TDF_SEINTR and TDF_SERESTART")); ret = TD_SBDRY_ERRNO(td); } else if (sig != 0) { ret = SIGISMEMBER(ps->ps_sigintr, sig) ? EINTR : ERESTART; mtx_unlock(&ps->ps_mtx); } else { mtx_unlock(&ps->ps_mtx); ret = 0; } /* * Do not go into sleep if this thread was the ptrace(2) * attach leader. cursig() consumed SIGSTOP from PT_ATTACH, * but we usually act on the signal by interrupting sleep, and * should do that here as well. */ if ((td->td_dbgflags & TDB_FSTP) != 0) { if (ret == 0) ret = EINTR; td->td_dbgflags &= ~TDB_FSTP; } return (ret); } int sig_intr(void) { struct thread *td; struct proc *p; int ret; td = curthread; if (!td_ast_pending(td, TDA_SIG) && !td_ast_pending(td, TDA_SUSPEND)) return (0); p = td->td_proc; PROC_LOCK(p); ret = sig_ast_checksusp(td); if (ret == 0) ret = sig_ast_needsigchk(td); PROC_UNLOCK(p); return (ret); } bool curproc_sigkilled(void) { struct thread *td; struct proc *p; struct sigacts *ps; bool res; td = curthread; if (!td_ast_pending(td, TDA_SIG)) return (false); p = td->td_proc; PROC_LOCK(p); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); res = SIGISMEMBER(td->td_sigqueue.sq_signals, SIGKILL) || SIGISMEMBER(p->p_sigqueue.sq_signals, SIGKILL); mtx_unlock(&ps->ps_mtx); PROC_UNLOCK(p); return (res); } void proc_wkilled(struct proc *p) { PROC_LOCK_ASSERT(p, MA_OWNED); if ((p->p_flag & P_WKILLED) == 0) { p->p_flag |= P_WKILLED; /* * Notify swapper that there is a process to swap in. * The notification is racy, at worst it would take 10 * seconds for the swapper process to notice. */ if ((p->p_flag & (P_INMEM | P_SWAPPINGIN)) == 0) wakeup(&proc0); } } /* * Kill the current process for stated reason. */ void killproc(struct proc *p, const char *why) { PROC_LOCK_ASSERT(p, MA_OWNED); CTR3(KTR_PROC, "killproc: proc %p (pid %d, %s)", p, p->p_pid, p->p_comm); log(LOG_ERR, "pid %d (%s), jid %d, uid %d, was killed: %s\n", p->p_pid, p->p_comm, p->p_ucred->cr_prison->pr_id, p->p_ucred->cr_uid, why); proc_wkilled(p); kern_psignal(p, SIGKILL); } /* * Force the current process to exit with the specified signal, dumping core * if appropriate. We bypass the normal tests for masked and caught signals, * allowing unrecoverable failures to terminate the process without changing * signal state. Mark the accounting record with the signal termination. 
* If dumping core, save the signal number for the debugger. Calls exit and * does not return. */ void sigexit(struct thread *td, int sig) { struct proc *p = td->td_proc; const char *coreinfo; int rv; PROC_LOCK_ASSERT(p, MA_OWNED); proc_set_p2_wexit(p); p->p_acflag |= AXSIG; /* * We must be single-threading to generate a core dump. This * ensures that the registers in the core file are up-to-date. * Also, the ELF dump handler assumes that the thread list doesn't * change out from under it. * * XXX If another thread attempts to single-thread before us * (e.g. via fork()), we won't get a dump at all. */ if ((sigprop(sig) & SIGPROP_CORE) && thread_single(p, SINGLE_NO_EXIT) == 0) { p->p_sig = sig; /* * Log signals which would cause core dumps * (Log as LOG_INFO to appease those who don't want * these messages.) * XXX : Todo, as well as euid, write out ruid too * Note that coredump() drops proc lock. */ rv = coredump(td); switch (rv) { case 0: sig |= WCOREFLAG; coreinfo = " (core dumped)"; break; case EFAULT: coreinfo = " (no core dump - bad address)"; break; case EINVAL: coreinfo = " (no core dump - invalid argument)"; break; case EFBIG: coreinfo = " (no core dump - too large)"; break; default: coreinfo = " (no core dump - other error)"; break; } if (kern_logsigexit) log(LOG_INFO, "pid %d (%s), jid %d, uid %d: exited on " "signal %d%s\n", p->p_pid, p->p_comm, p->p_ucred->cr_prison->pr_id, td->td_ucred->cr_uid, sig &~ WCOREFLAG, coreinfo); } else PROC_UNLOCK(p); exit1(td, 0, sig); /* NOTREACHED */ } /* * Send queued SIGCHLD to parent when child process's state * is changed. */ static void sigparent(struct proc *p, int reason, int status) { PROC_LOCK_ASSERT(p, MA_OWNED); PROC_LOCK_ASSERT(p->p_pptr, MA_OWNED); if (p->p_ksi != NULL) { p->p_ksi->ksi_signo = SIGCHLD; p->p_ksi->ksi_code = reason; p->p_ksi->ksi_status = status; p->p_ksi->ksi_pid = p->p_pid; p->p_ksi->ksi_uid = p->p_ucred->cr_ruid; if (KSI_ONQ(p->p_ksi)) return; } pksignal(p->p_pptr, SIGCHLD, p->p_ksi); } static void childproc_jobstate(struct proc *p, int reason, int sig) { struct sigacts *ps; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_LOCK_ASSERT(p->p_pptr, MA_OWNED); /* * Wake up parent sleeping in kern_wait(), also send * SIGCHLD to parent, but SIGCHLD does not guarantee * that parent will awake, because parent may masked * the signal. */ p->p_pptr->p_flag |= P_STATCHILD; wakeup(p->p_pptr); ps = p->p_pptr->p_sigacts; mtx_lock(&ps->ps_mtx); if ((ps->ps_flag & PS_NOCLDSTOP) == 0) { mtx_unlock(&ps->ps_mtx); sigparent(p, reason, sig); } else mtx_unlock(&ps->ps_mtx); } void childproc_stopped(struct proc *p, int reason) { childproc_jobstate(p, reason, p->p_xsig); } void childproc_continued(struct proc *p) { childproc_jobstate(p, CLD_CONTINUED, SIGCONT); } void childproc_exited(struct proc *p) { int reason, status; if (WCOREDUMP(p->p_xsig)) { reason = CLD_DUMPED; status = WTERMSIG(p->p_xsig); } else if (WIFSIGNALED(p->p_xsig)) { reason = CLD_KILLED; status = WTERMSIG(p->p_xsig); } else { reason = CLD_EXITED; status = p->p_xexit; } /* * XXX avoid calling wakeup(p->p_pptr), the work is * done in exit1(). 
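 *
 * The parent sees the classification above in its SIGCHLD siginfo:
 * si_code is CLD_EXITED, CLD_KILLED or CLD_DUMPED and si_status is
 * the exit value or terminating signal (see sigparent()).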
*/ sigparent(p, reason, status); } #define MAX_NUM_CORE_FILES 100000 #ifndef NUM_CORE_FILES #define NUM_CORE_FILES 5 #endif CTASSERT(NUM_CORE_FILES >= 0 && NUM_CORE_FILES <= MAX_NUM_CORE_FILES); static int num_cores = NUM_CORE_FILES; static int sysctl_debug_num_cores_check (SYSCTL_HANDLER_ARGS) { int error; int new_val; new_val = num_cores; error = sysctl_handle_int(oidp, &new_val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (new_val > MAX_NUM_CORE_FILES) new_val = MAX_NUM_CORE_FILES; if (new_val < 0) new_val = 0; num_cores = new_val; return (0); } SYSCTL_PROC(_debug, OID_AUTO, ncores, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, sizeof(int), sysctl_debug_num_cores_check, "I", "Maximum number of generated process corefiles while using index format"); #define GZIP_SUFFIX ".gz" #define ZSTD_SUFFIX ".zst" int compress_user_cores = 0; static int sysctl_compress_user_cores(SYSCTL_HANDLER_ARGS) { int error, val; val = compress_user_cores; error = sysctl_handle_int(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (val != 0 && !compressor_avail(val)) return (EINVAL); compress_user_cores = val; return (error); } SYSCTL_PROC(_kern, OID_AUTO, compress_user_cores, CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NEEDGIANT, 0, sizeof(int), sysctl_compress_user_cores, "I", "Enable compression of user corefiles (" __XSTRING(COMPRESS_GZIP) " = gzip, " __XSTRING(COMPRESS_ZSTD) " = zstd)"); int compress_user_cores_level = 6; SYSCTL_INT(_kern, OID_AUTO, compress_user_cores_level, CTLFLAG_RWTUN, &compress_user_cores_level, 0, "Corefile compression level"); /* * Protect the access to corefilename[] by allproc_lock. */ #define corefilename_lock allproc_lock static char corefilename[MAXPATHLEN] = {"%N.core"}; TUNABLE_STR("kern.corefile", corefilename, sizeof(corefilename)); static int sysctl_kern_corefile(SYSCTL_HANDLER_ARGS) { int error; sx_xlock(&corefilename_lock); error = sysctl_handle_string(oidp, corefilename, sizeof(corefilename), req); sx_xunlock(&corefilename_lock); return (error); } SYSCTL_PROC(_kern, OID_AUTO, corefile, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0, sysctl_kern_corefile, "A", "Process corefile name format string"); static void vnode_close_locked(struct thread *td, struct vnode *vp) { VOP_UNLOCK(vp); vn_close(vp, FWRITE, td->td_ucred, td); } /* * If the core format has a %I in it, then we need to check * for existing corefiles before defining a name. * To do this we iterate over 0..ncores to find a * non-existing core file name to use. If all core files are * already used we choose the oldest one. */ static int corefile_open_last(struct thread *td, char *name, int indexpos, int indexlen, int ncores, struct vnode **vpp) { struct vnode *oldvp, *nextvp, *vp; struct vattr vattr; struct nameidata nd; int error, i, flags, oflags, cmode; char ch; struct timespec lasttime; nextvp = oldvp = NULL; cmode = S_IRUSR | S_IWUSR; oflags = VN_OPEN_NOAUDIT | VN_OPEN_NAMECACHE | (capmode_coredump ? 
VN_OPEN_NOCAPCHECK : 0); for (i = 0; i < ncores; i++) { flags = O_CREAT | FWRITE | O_NOFOLLOW; ch = name[indexpos + indexlen]; (void)snprintf(name + indexpos, indexlen + 1, "%.*u", indexlen, i); name[indexpos + indexlen] = ch; NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, name); error = vn_open_cred(&nd, &flags, cmode, oflags, td->td_ucred, NULL); if (error != 0) break; vp = nd.ni_vp; NDFREE_PNBUF(&nd); if ((flags & O_CREAT) == O_CREAT) { nextvp = vp; break; } error = VOP_GETATTR(vp, &vattr, td->td_ucred); if (error != 0) { vnode_close_locked(td, vp); break; } if (oldvp == NULL || lasttime.tv_sec > vattr.va_mtime.tv_sec || (lasttime.tv_sec == vattr.va_mtime.tv_sec && lasttime.tv_nsec >= vattr.va_mtime.tv_nsec)) { if (oldvp != NULL) vn_close(oldvp, FWRITE, td->td_ucred, td); oldvp = vp; VOP_UNLOCK(oldvp); lasttime = vattr.va_mtime; } else { vnode_close_locked(td, vp); } } if (oldvp != NULL) { if (nextvp == NULL) { if ((td->td_proc->p_flag & P_SUGID) != 0) { error = EFAULT; vn_close(oldvp, FWRITE, td->td_ucred, td); } else { nextvp = oldvp; error = vn_lock(nextvp, LK_EXCLUSIVE); if (error != 0) { vn_close(nextvp, FWRITE, td->td_ucred, td); nextvp = NULL; } } } else { vn_close(oldvp, FWRITE, td->td_ucred, td); } } if (error != 0) { if (nextvp != NULL) vnode_close_locked(td, oldvp); } else { *vpp = nextvp; } return (error); } /* * corefile_open(comm, uid, pid, td, compress, vpp, namep) * Expand the name described in corefilename, using name, uid, and pid * and open/create core file. * corefilename is a printf-like string, with three format specifiers: * %N name of process ("name") * %P process id (pid) * %U user id (uid) * For example, "%N.core" is the default; they can be disabled completely * by using "/dev/null", or all core files can be stored in "/cores/%U/%N-%P". * This is controlled by the sysctl variable kern.corefile (see above). 
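 *
 * Illustrative example (paths and ids are hypothetical): with
 *	kern.corefile=/var/coredumps/%U/%N.%P.core
 * a crash of pid 1234 of process "mydaemon" running as uid 1001 would
 * be written to /var/coredumps/1001/mydaemon.1234.core.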
*/ static int corefile_open(const char *comm, uid_t uid, pid_t pid, struct thread *td, int compress, int signum, struct vnode **vpp, char **namep) { struct sbuf sb; struct nameidata nd; const char *format; char *hostname, *name; int cmode, error, flags, i, indexpos, indexlen, oflags, ncores; hostname = NULL; format = corefilename; name = malloc(MAXPATHLEN, M_TEMP, M_WAITOK | M_ZERO); indexlen = 0; indexpos = -1; ncores = num_cores; (void)sbuf_new(&sb, name, MAXPATHLEN, SBUF_FIXEDLEN); sx_slock(&corefilename_lock); for (i = 0; format[i] != '\0'; i++) { switch (format[i]) { case '%': /* Format character */ i++; switch (format[i]) { case '%': sbuf_putc(&sb, '%'); break; case 'H': /* hostname */ if (hostname == NULL) { hostname = malloc(MAXHOSTNAMELEN, M_TEMP, M_WAITOK); } getcredhostname(td->td_ucred, hostname, MAXHOSTNAMELEN); - sbuf_printf(&sb, "%s", hostname); + sbuf_cat(&sb, hostname); break; case 'I': /* autoincrementing index */ if (indexpos != -1) { sbuf_printf(&sb, "%%I"); break; } indexpos = sbuf_len(&sb); sbuf_printf(&sb, "%u", ncores - 1); indexlen = sbuf_len(&sb) - indexpos; break; case 'N': /* process name */ sbuf_printf(&sb, "%s", comm); break; case 'P': /* process id */ sbuf_printf(&sb, "%u", pid); break; case 'S': /* signal number */ sbuf_printf(&sb, "%i", signum); break; case 'U': /* user id */ sbuf_printf(&sb, "%u", uid); break; default: log(LOG_ERR, "Unknown format character %c in " "corename `%s'\n", format[i], format); break; } break; default: sbuf_putc(&sb, format[i]); break; } } sx_sunlock(&corefilename_lock); free(hostname, M_TEMP); if (compress == COMPRESS_GZIP) - sbuf_printf(&sb, GZIP_SUFFIX); + sbuf_cat(&sb, GZIP_SUFFIX); else if (compress == COMPRESS_ZSTD) - sbuf_printf(&sb, ZSTD_SUFFIX); + sbuf_cat(&sb, ZSTD_SUFFIX); if (sbuf_error(&sb) != 0) { log(LOG_ERR, "pid %ld (%s), uid (%lu): corename is too " "long\n", (long)pid, comm, (u_long)uid); sbuf_delete(&sb); free(name, M_TEMP); return (ENOMEM); } sbuf_finish(&sb); sbuf_delete(&sb); if (indexpos != -1) { error = corefile_open_last(td, name, indexpos, indexlen, ncores, vpp); if (error != 0) { log(LOG_ERR, "pid %d (%s), uid (%u): Path `%s' failed " "on initial open test, error = %d\n", pid, comm, uid, name, error); } } else { cmode = S_IRUSR | S_IWUSR; oflags = VN_OPEN_NOAUDIT | VN_OPEN_NAMECACHE | (capmode_coredump ? VN_OPEN_NOCAPCHECK : 0); flags = O_CREAT | FWRITE | O_NOFOLLOW; if ((td->td_proc->p_flag & P_SUGID) != 0) flags |= O_EXCL; NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, name); error = vn_open_cred(&nd, &flags, cmode, oflags, td->td_ucred, NULL); if (error == 0) { *vpp = nd.ni_vp; NDFREE_PNBUF(&nd); } } if (error != 0) { #ifdef AUDIT audit_proc_coredump(td, name, error); #endif free(name, M_TEMP); return (error); } *namep = name; return (0); } /* * Dump a process' core. The main routine does some * policy checking, and creates the name of the coredump; * then it passes on a vnode and a size limit to the process-specific * coredump routine if there is one; if there _is not_ one, it returns * ENOSYS; otherwise it returns the error from the process-specific routine. 
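 *
 * The policy checks below refuse to dump when core dumps are disabled
 * (do_coredump), when the process is set-id and sugid_coredump is not
 * enabled, when tracing is disallowed (P2_NOTRACE), or when the
 * RLIMIT_CORE limit (or the racct core allowance) is zero.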
*/ static int coredump(struct thread *td) { struct proc *p = td->td_proc; struct ucred *cred = td->td_ucred; struct vnode *vp; struct flock lf; struct vattr vattr; size_t fullpathsize; int error, error1, locked; char *name; /* name of corefile */ void *rl_cookie; off_t limit; char *fullpath, *freepath = NULL; struct sbuf *sb; PROC_LOCK_ASSERT(p, MA_OWNED); MPASS((p->p_flag & P_HADTHREADS) == 0 || p->p_singlethread == td); if (!do_coredump || (!sugid_coredump && (p->p_flag & P_SUGID) != 0) || (p->p_flag2 & P2_NOTRACE) != 0) { PROC_UNLOCK(p); return (EFAULT); } /* * Note that the bulk of limit checking is done after * the corefile is created. The exception is if the limit * for corefiles is 0, in which case we don't bother * creating the corefile at all. This layout means that * a corefile is truncated instead of not being created, * if it is larger than the limit. */ limit = (off_t)lim_cur(td, RLIMIT_CORE); if (limit == 0 || racct_get_available(p, RACCT_CORE) == 0) { PROC_UNLOCK(p); return (EFBIG); } PROC_UNLOCK(p); error = corefile_open(p->p_comm, cred->cr_uid, p->p_pid, td, compress_user_cores, p->p_sig, &vp, &name); if (error != 0) return (error); /* * Don't dump to non-regular files or files with links. * Do not dump into system files. Effective user must own the corefile. */ if (vp->v_type != VREG || VOP_GETATTR(vp, &vattr, cred) != 0 || vattr.va_nlink != 1 || (vp->v_vflag & VV_SYSTEM) != 0 || vattr.va_uid != cred->cr_uid) { VOP_UNLOCK(vp); error = EFAULT; goto out; } VOP_UNLOCK(vp); /* Postpone other writers, including core dumps of other processes. */ rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX); lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_WRLCK; locked = (VOP_ADVLOCK(vp, (caddr_t)p, F_SETLK, &lf, F_FLOCK) == 0); VATTR_NULL(&vattr); vattr.va_size = 0; if (set_core_nodump_flag) vattr.va_flags = UF_NODUMP; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); VOP_SETATTR(vp, &vattr, cred); VOP_UNLOCK(vp); PROC_LOCK(p); p->p_acflag |= ACORE; PROC_UNLOCK(p); if (p->p_sysent->sv_coredump != NULL) { error = p->p_sysent->sv_coredump(td, vp, limit, 0); } else { error = ENOSYS; } if (locked) { lf.l_type = F_UNLCK; VOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &lf, F_FLOCK); } vn_rangelock_unlock(vp, rl_cookie); /* * Notify the userland helper that a process triggered a core dump. * This allows the helper to run an automated debugging session. */ if (error != 0 || coredump_devctl == 0) goto out; sb = sbuf_new_auto(); if (vn_fullpath_global(p->p_textvp, &fullpath, &freepath) != 0) goto out2; - sbuf_printf(sb, "comm=\""); + sbuf_cat(sb, "comm=\""); devctl_safe_quote_sb(sb, fullpath); free(freepath, M_TEMP); - sbuf_printf(sb, "\" core=\""); + sbuf_cat(sb, "\" core=\""); /* * We can't lookup core file vp directly. When we're replacing a core, and * other random times, we flush the name cache, so it will fail. Instead, * if the path of the core is relative, add the current dir in front if it. 
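 * The resulting notification has the form (paths hypothetical)
 *	comm="/usr/local/bin/mydaemon" core="/tmp/mydaemon.core"
 * and is posted via devctl_notify("kernel", "signal", "coredump", ...).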
*/ if (name[0] != '/') { fullpathsize = MAXPATHLEN; freepath = malloc(fullpathsize, M_TEMP, M_WAITOK); if (vn_getcwd(freepath, &fullpath, &fullpathsize) != 0) { free(freepath, M_TEMP); goto out2; } devctl_safe_quote_sb(sb, fullpath); free(freepath, M_TEMP); sbuf_putc(sb, '/'); } devctl_safe_quote_sb(sb, name); - sbuf_printf(sb, "\""); + sbuf_putc(sb, '"'); if (sbuf_finish(sb) == 0) devctl_notify("kernel", "signal", "coredump", sbuf_data(sb)); out2: sbuf_delete(sb); out: error1 = vn_close(vp, FWRITE, cred, td); if (error == 0) error = error1; #ifdef AUDIT audit_proc_coredump(td, name, error); #endif free(name, M_TEMP); return (error); } /* * Nonexistent system call-- signal process (may want to handle it). Flag * error in case process won't see signal immediately (blocked or ignored). */ #ifndef _SYS_SYSPROTO_H_ struct nosys_args { int dummy; }; #endif /* ARGSUSED */ int nosys(struct thread *td, struct nosys_args *args) { struct proc *p; p = td->td_proc; if (SV_PROC_FLAG(p, SV_SIGSYS) != 0 && kern_signosys) { PROC_LOCK(p); tdsignal(td, SIGSYS); PROC_UNLOCK(p); } if (kern_lognosys == 1 || kern_lognosys == 3) { uprintf("pid %d comm %s: nosys %d\n", p->p_pid, p->p_comm, td->td_sa.code); } if (kern_lognosys == 2 || kern_lognosys == 3 || (p->p_pid == 1 && (kern_lognosys & 3) == 0)) { printf("pid %d comm %s: nosys %d\n", p->p_pid, p->p_comm, td->td_sa.code); } return (ENOSYS); } /* * Send a SIGIO or SIGURG signal to a process or process group using stored * credentials rather than those of the current process. */ void pgsigio(struct sigio **sigiop, int sig, int checkctty) { ksiginfo_t ksi; struct sigio *sigio; ksiginfo_init(&ksi); ksi.ksi_signo = sig; ksi.ksi_code = SI_KERNEL; SIGIO_LOCK(); sigio = *sigiop; if (sigio == NULL) { SIGIO_UNLOCK(); return; } if (sigio->sio_pgid > 0) { PROC_LOCK(sigio->sio_proc); if (CANSIGIO(sigio->sio_ucred, sigio->sio_proc->p_ucred)) kern_psignal(sigio->sio_proc, sig); PROC_UNLOCK(sigio->sio_proc); } else if (sigio->sio_pgid < 0) { struct proc *p; PGRP_LOCK(sigio->sio_pgrp); LIST_FOREACH(p, &sigio->sio_pgrp->pg_members, p_pglist) { PROC_LOCK(p); if (p->p_state == PRS_NORMAL && CANSIGIO(sigio->sio_ucred, p->p_ucred) && (checkctty == 0 || (p->p_flag & P_CONTROLT))) kern_psignal(p, sig); PROC_UNLOCK(p); } PGRP_UNLOCK(sigio->sio_pgrp); } SIGIO_UNLOCK(); } static int filt_sigattach(struct knote *kn) { struct proc *p = curproc; kn->kn_ptr.p_proc = p; kn->kn_flags |= EV_CLEAR; /* automatically set */ knlist_add(p->p_klist, kn, 0); return (0); } static void filt_sigdetach(struct knote *kn) { struct proc *p = kn->kn_ptr.p_proc; knlist_remove(p->p_klist, kn, 0); } /* * signal knotes are shared with proc knotes, so we apply a mask to * the hint in order to differentiate them from process hints. This * could be avoided by using a signal-specific knote list, but probably * isn't worth the trouble. 
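 *
 * Illustrative consumer of this filter (a sketch, not part of this
 * file; assumes <sys/event.h> and an existing kqueue descriptor kq):
 *
 *	struct kevent kev;
 *	EV_SET(&kev, SIGUSR1, EVFILT_SIGNAL, EV_ADD, 0, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);
 *
 * kn_data then reports the number of SIGUSR1 deliveries since the
 * event was last retrieved (EV_CLEAR is set by filt_sigattach()).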
*/ static int filt_signal(struct knote *kn, long hint) { if (hint & NOTE_SIGNAL) { hint &= ~NOTE_SIGNAL; if (kn->kn_id == hint) kn->kn_data++; } return (kn->kn_data != 0); } struct sigacts * sigacts_alloc(void) { struct sigacts *ps; ps = malloc(sizeof(struct sigacts), M_SUBPROC, M_WAITOK | M_ZERO); refcount_init(&ps->ps_refcnt, 1); mtx_init(&ps->ps_mtx, "sigacts", NULL, MTX_DEF); return (ps); } void sigacts_free(struct sigacts *ps) { if (refcount_release(&ps->ps_refcnt) == 0) return; mtx_destroy(&ps->ps_mtx); free(ps, M_SUBPROC); } struct sigacts * sigacts_hold(struct sigacts *ps) { refcount_acquire(&ps->ps_refcnt); return (ps); } void sigacts_copy(struct sigacts *dest, struct sigacts *src) { KASSERT(dest->ps_refcnt == 1, ("sigacts_copy to shared dest")); mtx_lock(&src->ps_mtx); bcopy(src, dest, offsetof(struct sigacts, ps_refcnt)); mtx_unlock(&src->ps_mtx); } int sigacts_shared(struct sigacts *ps) { return (ps->ps_refcnt > 1); } void sig_drop_caught(struct proc *p) { int sig; struct sigacts *ps; ps = p->p_sigacts; PROC_LOCK_ASSERT(p, MA_OWNED); mtx_assert(&ps->ps_mtx, MA_OWNED); SIG_FOREACH(sig, &ps->ps_sigcatch) { sigdflt(ps, sig); if ((sigprop(sig) & SIGPROP_IGNORE) != 0) sigqueue_delete_proc(p, sig); } } static void sigfastblock_failed(struct thread *td, bool sendsig, bool write) { ksiginfo_t ksi; /* * Prevent further fetches and SIGSEGVs, allowing thread to * issue syscalls despite corruption. */ sigfastblock_clear(td); if (!sendsig) return; ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGSEGV; ksi.ksi_code = write ? SEGV_ACCERR : SEGV_MAPERR; ksi.ksi_addr = td->td_sigblock_ptr; trapsignal(td, &ksi); } static bool sigfastblock_fetch_sig(struct thread *td, bool sendsig, uint32_t *valp) { uint32_t res; if ((td->td_pflags & TDP_SIGFASTBLOCK) == 0) return (true); if (fueword32((void *)td->td_sigblock_ptr, &res) == -1) { sigfastblock_failed(td, sendsig, false); return (false); } *valp = res; td->td_sigblock_val = res & ~SIGFASTBLOCK_FLAGS; return (true); } static void sigfastblock_resched(struct thread *td, bool resched) { struct proc *p; if (resched) { p = td->td_proc; PROC_LOCK(p); reschedule_signals(p, td->td_sigmask, 0); PROC_UNLOCK(p); } ast_sched(td, TDA_SIG); } int sys_sigfastblock(struct thread *td, struct sigfastblock_args *uap) { struct proc *p; int error, res; uint32_t oldval; error = 0; p = td->td_proc; switch (uap->cmd) { case SIGFASTBLOCK_SETPTR: if ((td->td_pflags & TDP_SIGFASTBLOCK) != 0) { error = EBUSY; break; } if (((uintptr_t)(uap->ptr) & (sizeof(uint32_t) - 1)) != 0) { error = EINVAL; break; } td->td_pflags |= TDP_SIGFASTBLOCK; td->td_sigblock_ptr = uap->ptr; break; case SIGFASTBLOCK_UNBLOCK: if ((td->td_pflags & TDP_SIGFASTBLOCK) == 0) { error = EINVAL; break; } for (;;) { res = casueword32(td->td_sigblock_ptr, SIGFASTBLOCK_PEND, &oldval, 0); if (res == -1) { error = EFAULT; sigfastblock_failed(td, false, true); break; } if (res == 0) break; MPASS(res == 1); if (oldval != SIGFASTBLOCK_PEND) { error = EBUSY; break; } error = thread_check_susp(td, false); if (error != 0) break; } if (error != 0) break; /* * td_sigblock_val is cleared there, but not on a * syscall exit. The end effect is that a single * interruptible sleep, while user sigblock word is * set, might return EINTR or ERESTART to usermode * without delivering signal. All further sleeps, * until userspace clears the word and does * sigfastblock(UNBLOCK), observe current word and no * longer get interrupted. It is slight * non-conformance, with alternative to have read the * sigblock word on each syscall entry. 
*/ td->td_sigblock_val = 0; /* * Rely on normal ast mechanism to deliver pending * signals to current thread. But notify others about * fake unblock. */ sigfastblock_resched(td, error == 0 && p->p_numthreads != 1); break; case SIGFASTBLOCK_UNSETPTR: if ((td->td_pflags & TDP_SIGFASTBLOCK) == 0) { error = EINVAL; break; } if (!sigfastblock_fetch_sig(td, false, &oldval)) { error = EFAULT; break; } if (oldval != 0 && oldval != SIGFASTBLOCK_PEND) { error = EBUSY; break; } sigfastblock_clear(td); break; default: error = EINVAL; break; } return (error); } void sigfastblock_clear(struct thread *td) { bool resched; if ((td->td_pflags & TDP_SIGFASTBLOCK) == 0) return; td->td_sigblock_val = 0; resched = (td->td_pflags & TDP_SIGFASTPENDING) != 0 || SIGPENDING(td); td->td_pflags &= ~(TDP_SIGFASTBLOCK | TDP_SIGFASTPENDING); sigfastblock_resched(td, resched); } void sigfastblock_fetch(struct thread *td) { uint32_t val; (void)sigfastblock_fetch_sig(td, true, &val); } static void sigfastblock_setpend1(struct thread *td) { int res; uint32_t oldval; if ((td->td_pflags & TDP_SIGFASTPENDING) == 0) return; res = fueword32((void *)td->td_sigblock_ptr, &oldval); if (res == -1) { sigfastblock_failed(td, true, false); return; } for (;;) { res = casueword32(td->td_sigblock_ptr, oldval, &oldval, oldval | SIGFASTBLOCK_PEND); if (res == -1) { sigfastblock_failed(td, true, true); return; } if (res == 0) { td->td_sigblock_val = oldval & ~SIGFASTBLOCK_FLAGS; td->td_pflags &= ~TDP_SIGFASTPENDING; break; } MPASS(res == 1); if (thread_check_susp(td, false) != 0) break; } } static void sigfastblock_setpend(struct thread *td, bool resched) { struct proc *p; sigfastblock_setpend1(td); if (resched) { p = td->td_proc; PROC_LOCK(p); reschedule_signals(p, fastblock_mask, SIGPROCMASK_FASTBLK); PROC_UNLOCK(p); } } diff --git a/sys/kern/kern_sysctl.c b/sys/kern/kern_sysctl.c index a4bfe8e21aed..8baa78951501 100644 --- a/sys/kern/kern_sysctl.c +++ b/sys/kern/kern_sysctl.c @@ -1,3139 +1,3139 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Mike Karels at Berkeley Software Design, Inc. * * Quite extensively rewritten by Poul-Henning Kamp of the FreeBSD * project, to make these variables more userfriendly. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_sysctl.c 8.4 (Berkeley) 4/14/94 */ #include #include "opt_capsicum.h" #include "opt_ddb.h" #include "opt_ktrace.h" #include "opt_sysctl.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #ifdef DDB #include #include #endif #include #include #include #include static MALLOC_DEFINE(M_SYSCTL, "sysctl", "sysctl internal magic"); static MALLOC_DEFINE(M_SYSCTLOID, "sysctloid", "sysctl dynamic oids"); static MALLOC_DEFINE(M_SYSCTLTMP, "sysctltmp", "sysctl temp output buffer"); RB_GENERATE(sysctl_oid_list, sysctl_oid, oid_link, cmp_sysctl_oid); /* * The sysctllock protects the MIB tree. It also protects sysctl * contexts used with dynamic sysctls. The sysctl_register_oid() and * sysctl_unregister_oid() routines require the sysctllock to already * be held, so the sysctl_wlock() and sysctl_wunlock() routines are * provided for the few places in the kernel which need to use that * API rather than using the dynamic API. Use of the dynamic API is * strongly encouraged for most code. * * The sysctlmemlock is used to limit the amount of user memory wired for * sysctl requests. This is implemented by serializing any userland * sysctl requests larger than a single page via an exclusive lock. * * The sysctlstringlock is used to protect concurrent access to writable * string nodes in sysctl_handle_string(). 
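 *
 * As an illustrative aside (not part of this change), a subsystem that
 * needs the static registration API described above would follow this
 * pattern, with a hypothetical statically allocated oid:
 *
 *	sysctl_wlock();
 *	sysctl_register_oid(&example_oid);
 *	sysctl_wunlock();
 *
 * Most code should instead use the dynamic API (sysctl_add_oid() and the
 * SYSCTL_ADD_* wrappers), which takes the lock internally.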
*/ static struct rmlock sysctllock; static struct sx __exclusive_cache_line sysctlmemlock; static struct sx sysctlstringlock; #define SYSCTL_WLOCK() rm_wlock(&sysctllock) #define SYSCTL_WUNLOCK() rm_wunlock(&sysctllock) #define SYSCTL_RLOCK(tracker) rm_rlock(&sysctllock, (tracker)) #define SYSCTL_RUNLOCK(tracker) rm_runlock(&sysctllock, (tracker)) #define SYSCTL_WLOCKED() rm_wowned(&sysctllock) #define SYSCTL_ASSERT_LOCKED() rm_assert(&sysctllock, RA_LOCKED) #define SYSCTL_ASSERT_WLOCKED() rm_assert(&sysctllock, RA_WLOCKED) #define SYSCTL_ASSERT_RLOCKED() rm_assert(&sysctllock, RA_RLOCKED) #define SYSCTL_INIT() rm_init_flags(&sysctllock, "sysctl lock", \ RM_SLEEPABLE) #define SYSCTL_SLEEP(ch, wmesg, timo) \ rm_sleep(ch, &sysctllock, 0, wmesg, timo) static int sysctl_root(SYSCTL_HANDLER_ARGS); /* Root list */ struct sysctl_oid_list sysctl__children = RB_INITIALIZER(&sysctl__children); static char* sysctl_escape_name(const char*); static int sysctl_remove_oid_locked(struct sysctl_oid *oidp, int del, int recurse); static int sysctl_old_kernel(struct sysctl_req *, const void *, size_t); static int sysctl_new_kernel(struct sysctl_req *, void *, size_t); static int name2oid(const char *, int *, int *, struct sysctl_oid **); static struct sysctl_oid * sysctl_find_oidname(const char *name, struct sysctl_oid_list *list) { struct sysctl_oid *oidp; SYSCTL_ASSERT_LOCKED(); SYSCTL_FOREACH(oidp, list) { if (strcmp(oidp->oid_name, name) == 0) { return (oidp); } } return (NULL); } static struct sysctl_oid * sysctl_find_oidnamelen(const char *name, size_t len, struct sysctl_oid_list *list) { struct sysctl_oid *oidp; SYSCTL_ASSERT_LOCKED(); SYSCTL_FOREACH(oidp, list) { if (strncmp(oidp->oid_name, name, len) == 0 && oidp->oid_name[len] == '\0') return (oidp); } return (NULL); } /* * Initialization of the MIB tree. * * Order by number in each list. */ void sysctl_wlock(void) { SYSCTL_WLOCK(); } void sysctl_wunlock(void) { SYSCTL_WUNLOCK(); } static int sysctl_root_handler_locked(struct sysctl_oid *oid, void *arg1, intmax_t arg2, struct sysctl_req *req, struct rm_priotracker *tracker) { int error; if (oid->oid_kind & CTLFLAG_DYN) atomic_add_int(&oid->oid_running, 1); if (tracker != NULL) SYSCTL_RUNLOCK(tracker); else SYSCTL_WUNLOCK(); /* * Treat set CTLFLAG_NEEDGIANT and unset CTLFLAG_MPSAFE flags the same, * untill we're ready to remove all traces of Giant from sysctl(9). 
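 *
 * For illustration only (not part of this change): a handler declared
 * with CTLFLAG_MPSAFE, e.g. the hypothetical
 *
 *	SYSCTL_PROC(_kern, OID_AUTO, example, CTLTYPE_INT | CTLFLAG_RW |
 *	    CTLFLAG_MPSAFE, 0, 0, example_sysctl, "I", "example knob");
 *
 * runs here without Giant, while a handler lacking CTLFLAG_MPSAFE (or
 * marked CTLFLAG_NEEDGIANT) is bracketed by mtx_lock(&Giant) and
 * mtx_unlock(&Giant) below.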
*/ if ((oid->oid_kind & CTLFLAG_NEEDGIANT) || (!(oid->oid_kind & CTLFLAG_MPSAFE))) mtx_lock(&Giant); error = oid->oid_handler(oid, arg1, arg2, req); if ((oid->oid_kind & CTLFLAG_NEEDGIANT) || (!(oid->oid_kind & CTLFLAG_MPSAFE))) mtx_unlock(&Giant); KFAIL_POINT_ERROR(_debug_fail_point, sysctl_running, error); if (tracker != NULL) SYSCTL_RLOCK(tracker); else SYSCTL_WLOCK(); if (oid->oid_kind & CTLFLAG_DYN) { if (atomic_fetchadd_int(&oid->oid_running, -1) == 1 && (oid->oid_kind & CTLFLAG_DYING) != 0) wakeup(&oid->oid_running); } return (error); } static void sysctl_load_tunable_by_oid_locked(struct sysctl_oid *oidp) { struct sysctl_req req; struct sysctl_oid *curr; char *penv = NULL; char path[96]; ssize_t rem = sizeof(path); ssize_t len; uint8_t data[512] __aligned(sizeof(uint64_t)); int size; int error; path[--rem] = 0; for (curr = oidp; curr != NULL; curr = SYSCTL_PARENT(curr)) { len = strlen(curr->oid_name); rem -= len; if (curr != oidp) rem -= 1; if (rem < 0) { printf("OID path exceeds %d bytes\n", (int)sizeof(path)); return; } memcpy(path + rem, curr->oid_name, len); if (curr != oidp) path[rem + len] = '.'; } memset(&req, 0, sizeof(req)); req.td = curthread; req.oldfunc = sysctl_old_kernel; req.newfunc = sysctl_new_kernel; req.lock = REQ_UNWIRED; switch (oidp->oid_kind & CTLTYPE) { case CTLTYPE_INT: if (getenv_array(path + rem, data, sizeof(data), &size, sizeof(int), GETENV_SIGNED) == 0) return; req.newlen = size; req.newptr = data; break; case CTLTYPE_UINT: if (getenv_array(path + rem, data, sizeof(data), &size, sizeof(int), GETENV_UNSIGNED) == 0) return; req.newlen = size; req.newptr = data; break; case CTLTYPE_LONG: if (getenv_array(path + rem, data, sizeof(data), &size, sizeof(long), GETENV_SIGNED) == 0) return; req.newlen = size; req.newptr = data; break; case CTLTYPE_ULONG: if (getenv_array(path + rem, data, sizeof(data), &size, sizeof(long), GETENV_UNSIGNED) == 0) return; req.newlen = size; req.newptr = data; break; case CTLTYPE_S8: if (getenv_array(path + rem, data, sizeof(data), &size, sizeof(int8_t), GETENV_SIGNED) == 0) return; req.newlen = size; req.newptr = data; break; case CTLTYPE_S16: if (getenv_array(path + rem, data, sizeof(data), &size, sizeof(int16_t), GETENV_SIGNED) == 0) return; req.newlen = size; req.newptr = data; break; case CTLTYPE_S32: if (getenv_array(path + rem, data, sizeof(data), &size, sizeof(int32_t), GETENV_SIGNED) == 0) return; req.newlen = size; req.newptr = data; break; case CTLTYPE_S64: if (getenv_array(path + rem, data, sizeof(data), &size, sizeof(int64_t), GETENV_SIGNED) == 0) return; req.newlen = size; req.newptr = data; break; case CTLTYPE_U8: if (getenv_array(path + rem, data, sizeof(data), &size, sizeof(uint8_t), GETENV_UNSIGNED) == 0) return; req.newlen = size; req.newptr = data; break; case CTLTYPE_U16: if (getenv_array(path + rem, data, sizeof(data), &size, sizeof(uint16_t), GETENV_UNSIGNED) == 0) return; req.newlen = size; req.newptr = data; break; case CTLTYPE_U32: if (getenv_array(path + rem, data, sizeof(data), &size, sizeof(uint32_t), GETENV_UNSIGNED) == 0) return; req.newlen = size; req.newptr = data; break; case CTLTYPE_U64: if (getenv_array(path + rem, data, sizeof(data), &size, sizeof(uint64_t), GETENV_UNSIGNED) == 0) return; req.newlen = size; req.newptr = data; break; case CTLTYPE_STRING: penv = kern_getenv(path + rem); if (penv == NULL) return; req.newlen = strlen(penv); req.newptr = penv; break; default: return; } error = sysctl_root_handler_locked(oidp, oidp->oid_arg1, oidp->oid_arg2, &req, NULL); if (error != 0) 
printf("Setting sysctl %s failed: %d\n", path + rem, error); if (penv != NULL) freeenv(penv); } /* * Locate the path to a given oid. Returns the length of the resulting path, * or -1 if the oid was not found. nodes must have room for CTL_MAXNAME * elements. */ static int sysctl_search_oid(struct sysctl_oid **nodes, struct sysctl_oid *needle) { int indx; SYSCTL_ASSERT_LOCKED(); indx = 0; /* * Do a depth-first search of the oid tree, looking for 'needle'. Start * with the first child of the root. */ nodes[indx] = RB_MIN(sysctl_oid_list, &sysctl__children); for (;;) { if (nodes[indx] == needle) return (indx + 1); if (nodes[indx] == NULL) { /* Node has no more siblings, so back up to parent. */ if (indx-- == 0) { /* Retreat to root, so give up. */ break; } } else if ((nodes[indx]->oid_kind & CTLTYPE) == CTLTYPE_NODE) { /* Node has children. */ if (++indx == CTL_MAXNAME) { /* Max search depth reached, so give up. */ break; } /* Start with the first child. */ nodes[indx] = RB_MIN(sysctl_oid_list, &nodes[indx - 1]->oid_children); continue; } /* Consider next sibling. */ nodes[indx] = RB_NEXT(sysctl_oid_list, NULL, nodes[indx]); } return (-1); } static void sysctl_warn_reuse(const char *func, struct sysctl_oid *leaf) { struct sysctl_oid *nodes[CTL_MAXNAME]; char buf[128]; struct sbuf sb; int rc, i; (void)sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN | SBUF_INCLUDENUL); sbuf_set_drain(&sb, sbuf_printf_drain, NULL); sbuf_printf(&sb, "%s: can't re-use a leaf (", __func__); rc = sysctl_search_oid(nodes, leaf); if (rc > 0) { for (i = 0; i < rc; i++) sbuf_printf(&sb, "%s%.*s", nodes[i]->oid_name, i != (rc - 1), "."); } else { - sbuf_printf(&sb, "%s", leaf->oid_name); + sbuf_cat(&sb, leaf->oid_name); } - sbuf_printf(&sb, ")!\n"); + sbuf_cat(&sb, ")!\n"); (void)sbuf_finish(&sb); } #ifdef SYSCTL_DEBUG static int sysctl_reuse_test(SYSCTL_HANDLER_ARGS) { struct rm_priotracker tracker; SYSCTL_RLOCK(&tracker); sysctl_warn_reuse(__func__, oidp); SYSCTL_RUNLOCK(&tracker); return (0); } SYSCTL_PROC(_sysctl, OID_AUTO, reuse_test, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0, sysctl_reuse_test, "-", ""); #endif void sysctl_register_oid(struct sysctl_oid *oidp) { struct sysctl_oid_list *parent = oidp->oid_parent; struct sysctl_oid *p, key; int oid_number; int timeout = 2; /* * First check if another oid with the same name already * exists in the parent's list. */ SYSCTL_ASSERT_WLOCKED(); p = sysctl_find_oidname(oidp->oid_name, parent); if (p != NULL) { if ((p->oid_kind & CTLTYPE) == CTLTYPE_NODE) { p->oid_refcnt++; return; } else { sysctl_warn_reuse(__func__, p); return; } } /* get current OID number */ oid_number = oidp->oid_number; #if (OID_AUTO >= 0) #error "OID_AUTO is expected to be a negative value" #endif /* * Any negative OID number qualifies as OID_AUTO. Valid OID * numbers should always be positive. * * NOTE: DO NOT change the starting value here, change it in * , and make sure it is at least 256 to * accommodate e.g. net.inet.raw as a static sysctl node. */ if (oid_number < 0) { static int newoid; /* * By decrementing the next OID number we spend less * time inserting the OIDs into a sorted list. */ if (--newoid < CTL_AUTO_START) newoid = 0x7fffffff; oid_number = newoid; } /* * Insert the OID into the parent's list sorted by OID number. 
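 *
 * Illustrative aside (not part of this change): once inserted, the oid
 * may also be initialized from the kernel environment.  The CTLFLAG_TUN
 * handling below is what lets a hypothetical declaration such as
 *
 *	static int example_limit = 64;
 *	SYSCTL_INT(_kern, OID_AUTO, example_limit, CTLFLAG_RWTUN,
 *	    &example_limit, 0, "example tunable");
 *
 * pick up a loader.conf(5) line like "kern.example_limit=128" at
 * registration time via sysctl_load_tunable_by_oid_locked() above.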
*/ key.oid_number = oid_number; p = RB_NFIND(sysctl_oid_list, parent, &key); while (p != NULL && oid_number == p->oid_number) { /* get the next valid OID number */ if (oid_number < CTL_AUTO_START || oid_number == 0x7fffffff) { /* wraparound - restart */ oid_number = CTL_AUTO_START; /* don't loop forever */ if (!timeout--) panic("sysctl: Out of OID numbers\n"); key.oid_number = oid_number; p = RB_NFIND(sysctl_oid_list, parent, &key); continue; } p = RB_NEXT(sysctl_oid_list, NULL, p); oid_number++; } /* check for non-auto OID number collision */ if (oidp->oid_number >= 0 && oidp->oid_number < CTL_AUTO_START && oid_number >= CTL_AUTO_START) { printf("sysctl: OID number(%d) is already in use for '%s'\n", oidp->oid_number, oidp->oid_name); } /* update the OID number, if any */ oidp->oid_number = oid_number; RB_INSERT(sysctl_oid_list, parent, oidp); if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE && (oidp->oid_kind & CTLFLAG_TUN) != 0 && (oidp->oid_kind & CTLFLAG_NOFETCH) == 0) { #ifdef VIMAGE /* * Can fetch value multiple times for VNET loader tunables. * Only fetch once for non-VNET loader tunables. */ if ((oidp->oid_kind & CTLFLAG_VNET) == 0) #endif oidp->oid_kind |= CTLFLAG_NOFETCH; /* try to fetch value from kernel environment */ sysctl_load_tunable_by_oid_locked(oidp); } } void sysctl_register_disabled_oid(struct sysctl_oid *oidp) { /* * Mark the leaf as dormant if it's not to be immediately enabled. * We do not disable nodes as they can be shared between modules * and it is always safe to access a node. */ KASSERT((oidp->oid_kind & CTLFLAG_DORMANT) == 0, ("internal flag is set in oid_kind")); if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE) oidp->oid_kind |= CTLFLAG_DORMANT; sysctl_register_oid(oidp); } void sysctl_enable_oid(struct sysctl_oid *oidp) { SYSCTL_ASSERT_WLOCKED(); if ((oidp->oid_kind & CTLTYPE) == CTLTYPE_NODE) { KASSERT((oidp->oid_kind & CTLFLAG_DORMANT) == 0, ("sysctl node is marked as dormant")); return; } KASSERT((oidp->oid_kind & CTLFLAG_DORMANT) != 0, ("enabling already enabled sysctl oid")); oidp->oid_kind &= ~CTLFLAG_DORMANT; } void sysctl_unregister_oid(struct sysctl_oid *oidp) { int error; SYSCTL_ASSERT_WLOCKED(); if (oidp->oid_number == OID_AUTO) { error = EINVAL; } else { error = ENOENT; if (RB_REMOVE(sysctl_oid_list, oidp->oid_parent, oidp)) error = 0; } /* * This can happen when a module fails to register and is * being unloaded afterwards. It should not be a panic() * for normal use. */ if (error) { printf("%s: failed(%d) to unregister sysctl(%s)\n", __func__, error, oidp->oid_name); } } /* Initialize a new context to keep track of dynamically added sysctls. */ int sysctl_ctx_init(struct sysctl_ctx_list *c) { if (c == NULL) { return (EINVAL); } /* * No locking here, the caller is responsible for not adding * new nodes to a context until after this function has * returned. */ TAILQ_INIT(c); return (0); } /* Free the context, and destroy all dynamic oids registered in this context */ int sysctl_ctx_free(struct sysctl_ctx_list *clist) { struct sysctl_ctx_entry *e, *e1; int error; error = 0; /* * First perform a "dry run" to check if it's ok to remove oids. * XXX FIXME * XXX This algorithm is a hack. But I don't know any * XXX better solution for now... */ SYSCTL_WLOCK(); TAILQ_FOREACH(e, clist, link) { error = sysctl_remove_oid_locked(e->entry, 0, 0); if (error) break; } /* * Restore deregistered entries, either from the end, * or from the place where error occurred. 
* e contains the entry that was not unregistered */ if (error) e1 = TAILQ_PREV(e, sysctl_ctx_list, link); else e1 = TAILQ_LAST(clist, sysctl_ctx_list); while (e1 != NULL) { sysctl_register_oid(e1->entry); e1 = TAILQ_PREV(e1, sysctl_ctx_list, link); } if (error) { SYSCTL_WUNLOCK(); return(EBUSY); } /* Now really delete the entries */ e = TAILQ_FIRST(clist); while (e != NULL) { e1 = TAILQ_NEXT(e, link); error = sysctl_remove_oid_locked(e->entry, 1, 0); if (error) panic("sysctl_remove_oid: corrupt tree, entry: %s", e->entry->oid_name); free(e, M_SYSCTLOID); e = e1; } SYSCTL_WUNLOCK(); return (error); } /* Add an entry to the context */ struct sysctl_ctx_entry * sysctl_ctx_entry_add(struct sysctl_ctx_list *clist, struct sysctl_oid *oidp) { struct sysctl_ctx_entry *e; SYSCTL_ASSERT_WLOCKED(); if (clist == NULL || oidp == NULL) return(NULL); e = malloc(sizeof(struct sysctl_ctx_entry), M_SYSCTLOID, M_WAITOK); e->entry = oidp; TAILQ_INSERT_HEAD(clist, e, link); return (e); } /* Find an entry in the context */ struct sysctl_ctx_entry * sysctl_ctx_entry_find(struct sysctl_ctx_list *clist, struct sysctl_oid *oidp) { struct sysctl_ctx_entry *e; SYSCTL_ASSERT_WLOCKED(); if (clist == NULL || oidp == NULL) return(NULL); TAILQ_FOREACH(e, clist, link) { if (e->entry == oidp) return(e); } return (e); } /* * Delete an entry from the context. * NOTE: this function doesn't free oidp! You have to remove it * with sysctl_remove_oid(). */ int sysctl_ctx_entry_del(struct sysctl_ctx_list *clist, struct sysctl_oid *oidp) { struct sysctl_ctx_entry *e; if (clist == NULL || oidp == NULL) return (EINVAL); SYSCTL_WLOCK(); e = sysctl_ctx_entry_find(clist, oidp); if (e != NULL) { TAILQ_REMOVE(clist, e, link); SYSCTL_WUNLOCK(); free(e, M_SYSCTLOID); return (0); } else { SYSCTL_WUNLOCK(); return (ENOENT); } } /* * Remove dynamically created sysctl trees. * oidp - top of the tree to be removed * del - if 0 - just deregister, otherwise free up entries as well * recurse - if != 0 traverse the subtree to be deleted */ int sysctl_remove_oid(struct sysctl_oid *oidp, int del, int recurse) { int error; SYSCTL_WLOCK(); error = sysctl_remove_oid_locked(oidp, del, recurse); SYSCTL_WUNLOCK(); return (error); } int sysctl_remove_name(struct sysctl_oid *parent, const char *name, int del, int recurse) { struct sysctl_oid *p; int error; error = ENOENT; SYSCTL_WLOCK(); p = sysctl_find_oidname(name, &parent->oid_children); if (p) error = sysctl_remove_oid_locked(p, del, recurse); SYSCTL_WUNLOCK(); return (error); } /* * Duplicate the provided string, escaping any illegal characters. The result * must be freed when no longer in use. * * The list of illegal characters is ".". */ static char* sysctl_escape_name(const char* orig) { int i, s = 0, d = 0, nillegals = 0; char *new; /* First count the number of illegal characters */ for (i = 0; orig[i] != '\0'; i++) { if (orig[i] == '.') nillegals++; } /* Allocate storage for new string */ new = malloc(i + 2 * nillegals + 1, M_SYSCTLOID, M_WAITOK); /* Copy the name, escaping characters as we go */ while (orig[s] != '\0') { if (orig[s] == '.') { /* %25 is the hexadecimal representation of '.' 
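 * For example, sysctl_escape_name("a.b") produces "a%25b": the escape is
 * the literal three-character sequence "%25" substituted for each '.' in
 * a component name (strictly, 0x2e is the ASCII code for '.', while 0x25
 * is '%').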
*/ new[d++] = '%'; new[d++] = '2'; new[d++] = '5'; s++; } else { new[d++] = orig[s++]; } } /* Finally, nul-terminate */ new[d] = '\0'; return (new); } static int sysctl_remove_oid_locked(struct sysctl_oid *oidp, int del, int recurse) { struct sysctl_oid *p, *tmp; int error; SYSCTL_ASSERT_WLOCKED(); if (oidp == NULL) return(EINVAL); if ((oidp->oid_kind & CTLFLAG_DYN) == 0) { printf("Warning: can't remove non-dynamic nodes (%s)!\n", oidp->oid_name); return (EINVAL); } /* * WARNING: normal method to do this should be through * sysctl_ctx_free(). Use recursing as the last resort * method to purge your sysctl tree of leftovers... * However, if some other code still references these nodes, * it will panic. */ if ((oidp->oid_kind & CTLTYPE) == CTLTYPE_NODE) { if (oidp->oid_refcnt == 1) { for(p = RB_MIN(sysctl_oid_list, &oidp->oid_children); p != NULL; p = tmp) { if (!recurse) { printf("Warning: failed attempt to " "remove oid %s with child %s\n", oidp->oid_name, p->oid_name); return (ENOTEMPTY); } tmp = RB_NEXT(sysctl_oid_list, &oidp->oid_children, p); error = sysctl_remove_oid_locked(p, del, recurse); if (error) return (error); } } } if (oidp->oid_refcnt > 1 ) { oidp->oid_refcnt--; } else { if (oidp->oid_refcnt == 0) { printf("Warning: bad oid_refcnt=%u (%s)!\n", oidp->oid_refcnt, oidp->oid_name); return (EINVAL); } sysctl_unregister_oid(oidp); if (del) { /* * Wait for all threads running the handler to drain. * This preserves the previous behavior when the * sysctl lock was held across a handler invocation, * and is necessary for module unload correctness. */ while (oidp->oid_running > 0) { oidp->oid_kind |= CTLFLAG_DYING; SYSCTL_SLEEP(&oidp->oid_running, "oidrm", 0); } if (oidp->oid_descr) free(__DECONST(char *, oidp->oid_descr), M_SYSCTLOID); if (oidp->oid_label) free(__DECONST(char *, oidp->oid_label), M_SYSCTLOID); free(__DECONST(char *, oidp->oid_name), M_SYSCTLOID); free(oidp, M_SYSCTLOID); } } return (0); } /* * Create new sysctls at run time. * clist may point to a valid context initialized with sysctl_ctx_init(). */ struct sysctl_oid * sysctl_add_oid(struct sysctl_ctx_list *clist, struct sysctl_oid_list *parent, int number, const char *name, int kind, void *arg1, intmax_t arg2, int (*handler)(SYSCTL_HANDLER_ARGS), const char *fmt, const char *descr, const char *label) { struct sysctl_oid *oidp; char *escaped; /* You have to hook up somewhere.. 
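 *
 * As a sketch only (not part of this change), the usual consumers of
 * sysctl_add_oid() are the SYSCTL_ADD_* wrappers, e.g. from a
 * hypothetical driver attach routine:
 *
 *	struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(dev);
 *	struct sysctl_oid *tree = device_get_sysctl_tree(dev);
 *
 *	SYSCTL_ADD_INT(ctx, SYSCTL_CHILDREN(tree), OID_AUTO, "depth",
 *	    CTLFLAG_RW, &sc->depth, 0, "example queue depth");
 *
 * Destroying the context with sysctl_ctx_free() on detach then removes
 * every oid created through it.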
*/ if (parent == NULL) return(NULL); escaped = sysctl_escape_name(name); /* Check if the node already exists, otherwise create it */ SYSCTL_WLOCK(); oidp = sysctl_find_oidname(escaped, parent); if (oidp != NULL) { free(escaped, M_SYSCTLOID); if ((oidp->oid_kind & CTLTYPE) == CTLTYPE_NODE) { oidp->oid_refcnt++; /* Update the context */ if (clist != NULL) sysctl_ctx_entry_add(clist, oidp); SYSCTL_WUNLOCK(); return (oidp); } else { sysctl_warn_reuse(__func__, oidp); SYSCTL_WUNLOCK(); return (NULL); } } oidp = malloc(sizeof(struct sysctl_oid), M_SYSCTLOID, M_WAITOK|M_ZERO); oidp->oid_parent = parent; RB_INIT(&oidp->oid_children); oidp->oid_number = number; oidp->oid_refcnt = 1; oidp->oid_name = escaped; oidp->oid_handler = handler; oidp->oid_kind = CTLFLAG_DYN | kind; oidp->oid_arg1 = arg1; oidp->oid_arg2 = arg2; oidp->oid_fmt = fmt; if (descr != NULL) oidp->oid_descr = strdup(descr, M_SYSCTLOID); if (label != NULL) oidp->oid_label = strdup(label, M_SYSCTLOID); /* Update the context, if used */ if (clist != NULL) sysctl_ctx_entry_add(clist, oidp); /* Register this oid */ sysctl_register_oid(oidp); SYSCTL_WUNLOCK(); return (oidp); } /* * Rename an existing oid. */ void sysctl_rename_oid(struct sysctl_oid *oidp, const char *name) { char *newname; char *oldname; newname = strdup(name, M_SYSCTLOID); SYSCTL_WLOCK(); oldname = __DECONST(char *, oidp->oid_name); oidp->oid_name = newname; SYSCTL_WUNLOCK(); free(oldname, M_SYSCTLOID); } /* * Reparent an existing oid. */ int sysctl_move_oid(struct sysctl_oid *oid, struct sysctl_oid_list *parent) { struct sysctl_oid *oidp; SYSCTL_WLOCK(); if (oid->oid_parent == parent) { SYSCTL_WUNLOCK(); return (0); } oidp = sysctl_find_oidname(oid->oid_name, parent); if (oidp != NULL) { SYSCTL_WUNLOCK(); return (EEXIST); } sysctl_unregister_oid(oid); oid->oid_parent = parent; oid->oid_number = OID_AUTO; sysctl_register_oid(oid); SYSCTL_WUNLOCK(); return (0); } /* * Register the kernel's oids on startup. */ SET_DECLARE(sysctl_set, struct sysctl_oid); static void sysctl_register_all(void *arg) { struct sysctl_oid **oidp; sx_init(&sysctlmemlock, "sysctl mem"); sx_init(&sysctlstringlock, "sysctl string handler"); SYSCTL_INIT(); SYSCTL_WLOCK(); SET_FOREACH(oidp, sysctl_set) sysctl_register_oid(*oidp); SYSCTL_WUNLOCK(); } SYSINIT(sysctl, SI_SUB_KMEM, SI_ORDER_FIRST, sysctl_register_all, NULL); #ifdef VIMAGE static void sysctl_setenv_vnet(void *arg __unused, const char *name) { struct sysctl_oid *oidp; int oid[CTL_MAXNAME]; int error, nlen; SYSCTL_WLOCK(); error = name2oid(name, oid, &nlen, &oidp); if (error) goto out; if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE && (oidp->oid_kind & CTLFLAG_VNET) != 0 && (oidp->oid_kind & CTLFLAG_TUN) != 0 && (oidp->oid_kind & CTLFLAG_NOFETCH) == 0) { /* Update value from kernel environment */ sysctl_load_tunable_by_oid_locked(oidp); } out: SYSCTL_WUNLOCK(); } static void sysctl_unsetenv_vnet(void *arg __unused, const char *name) { struct sysctl_oid *oidp; int oid[CTL_MAXNAME]; int error, nlen; SYSCTL_WLOCK(); /* * The setenv / unsetenv event handlers are invoked by kern_setenv() / * kern_unsetenv() without exclusive locks. It is rare but still possible * that the invoke order of event handlers is different from that of * kern_setenv() and kern_unsetenv(). * Re-check environment variable string to make sure it is unset. 
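 *
 * Illustrative aside (not part of this change): the oids affected here
 * are VNET loader tunables, declared along the lines of the hypothetical
 *
 *	VNET_DEFINE(int, example_limit) = 16;
 *	SYSCTL_INT(_net, OID_AUTO, example_limit,
 *	    CTLFLAG_VNET | CTLFLAG_RWTUN, &VNET_NAME(example_limit), 0,
 *	    "per-vnet example limit");
 *
 * Setting "net.example_limit" with kenv(1) makes the setenv handler
 * above re-fetch the tunable; unsetting it has vnet_restore_init() below
 * fall back to the compiled-in initializer value.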
*/ if (testenv(name)) goto out; error = name2oid(name, oid, &nlen, &oidp); if (error) goto out; if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE && (oidp->oid_kind & CTLFLAG_VNET) != 0 && (oidp->oid_kind & CTLFLAG_TUN) != 0 && (oidp->oid_kind & CTLFLAG_NOFETCH) == 0) { size_t size; switch (oidp->oid_kind & CTLTYPE) { case CTLTYPE_INT: case CTLTYPE_UINT: size = sizeof(int); break; case CTLTYPE_LONG: case CTLTYPE_ULONG: size = sizeof(long); break; case CTLTYPE_S8: case CTLTYPE_U8: size = sizeof(int8_t); break; case CTLTYPE_S16: case CTLTYPE_U16: size = sizeof(int16_t); break; case CTLTYPE_S32: case CTLTYPE_U32: size = sizeof(int32_t); break; case CTLTYPE_S64: case CTLTYPE_U64: size = sizeof(int64_t); break; case CTLTYPE_STRING: MPASS(oidp->oid_arg2 > 0); size = oidp->oid_arg2; break; default: goto out; } vnet_restore_init(oidp->oid_arg1, size); } out: SYSCTL_WUNLOCK(); } /* * Register the kernel's setenv / unsetenv events. */ EVENTHANDLER_DEFINE(setenv, sysctl_setenv_vnet, NULL, EVENTHANDLER_PRI_ANY); EVENTHANDLER_DEFINE(unsetenv, sysctl_unsetenv_vnet, NULL, EVENTHANDLER_PRI_ANY); #endif /* * "Staff-functions" * * These functions implement a presently undocumented interface * used by the sysctl program to walk the tree, and get the type * so it can print the value. * This interface is under work and consideration, and should probably * be killed with a big axe by the first person who can find the time. * (be aware though, that the proper interface isn't as obvious as it * may seem, there are various conflicting requirements. * * {CTL_SYSCTL, CTL_SYSCTL_DEBUG} printf the entire MIB-tree. * {CTL_SYSCTL, CTL_SYSCTL_NAME, ...} return the name of the "..." * OID. * {CTL_SYSCTL, CTL_SYSCTL_NEXT, ...} return the next OID, honoring * CTLFLAG_SKIP. * {CTL_SYSCTL, CTL_SYSCTL_NAME2OID} return the OID of the name in * "new" * {CTL_SYSCTL, CTL_SYSCTL_OIDFMT, ...} return the kind & format info * for the "..." OID. * {CTL_SYSCTL, CTL_SYSCTL_OIDDESCR, ...} return the description of the * "..." OID. * {CTL_SYSCTL, CTL_SYSCTL_OIDLABEL, ...} return the aggregation label of * the "..." OID. * {CTL_SYSCTL, CTL_SYSCTL_NEXTNOSKIP, ...} return the next OID, ignoring * CTLFLAG_SKIP. */ #ifdef SYSCTL_DEBUG static void sysctl_sysctl_debug_dump_node(struct sysctl_oid_list *l, int i) { int k; struct sysctl_oid *oidp; SYSCTL_ASSERT_LOCKED(); SYSCTL_FOREACH(oidp, l) { for (k=0; koid_number, oidp->oid_name); printf("%c%c", oidp->oid_kind & CTLFLAG_RD ? 'R':' ', oidp->oid_kind & CTLFLAG_WR ? 
'W':' '); if (oidp->oid_handler) printf(" *Handler"); switch (oidp->oid_kind & CTLTYPE) { case CTLTYPE_NODE: printf(" Node\n"); if (!oidp->oid_handler) { sysctl_sysctl_debug_dump_node( SYSCTL_CHILDREN(oidp), i + 2); } break; case CTLTYPE_INT: printf(" Int\n"); break; case CTLTYPE_UINT: printf(" u_int\n"); break; case CTLTYPE_LONG: printf(" Long\n"); break; case CTLTYPE_ULONG: printf(" u_long\n"); break; case CTLTYPE_STRING: printf(" String\n"); break; case CTLTYPE_S8: printf(" int8_t\n"); break; case CTLTYPE_S16: printf(" int16_t\n"); break; case CTLTYPE_S32: printf(" int32_t\n"); break; case CTLTYPE_S64: printf(" int64_t\n"); break; case CTLTYPE_U8: printf(" uint8_t\n"); break; case CTLTYPE_U16: printf(" uint16_t\n"); break; case CTLTYPE_U32: printf(" uint32_t\n"); break; case CTLTYPE_U64: printf(" uint64_t\n"); break; case CTLTYPE_OPAQUE: printf(" Opaque/struct\n"); break; default: printf("\n"); } } } static int sysctl_sysctl_debug(SYSCTL_HANDLER_ARGS) { struct rm_priotracker tracker; int error; error = priv_check(req->td, PRIV_SYSCTL_DEBUG); if (error) return (error); SYSCTL_RLOCK(&tracker); sysctl_sysctl_debug_dump_node(&sysctl__children, 0); SYSCTL_RUNLOCK(&tracker); return (ENOENT); } SYSCTL_PROC(_sysctl, CTL_SYSCTL_DEBUG, debug, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0, sysctl_sysctl_debug, "-", ""); #endif static int sysctl_sysctl_name(SYSCTL_HANDLER_ARGS) { int *name = (int *) arg1; u_int namelen = arg2; int error; struct sysctl_oid *oid, key; struct sysctl_oid_list *lsp = &sysctl__children, *lsp2; struct rm_priotracker tracker; char buf[10]; error = sysctl_wire_old_buffer(req, 0); if (error) return (error); SYSCTL_RLOCK(&tracker); while (namelen) { if (!lsp) { snprintf(buf,sizeof(buf),"%d",*name); if (req->oldidx) error = SYSCTL_OUT(req, ".", 1); if (!error) error = SYSCTL_OUT(req, buf, strlen(buf)); if (error) goto out; namelen--; name++; continue; } lsp2 = NULL; key.oid_number = *name; oid = RB_FIND(sysctl_oid_list, lsp, &key); if (oid) { if (req->oldidx) error = SYSCTL_OUT(req, ".", 1); if (!error) error = SYSCTL_OUT(req, oid->oid_name, strlen(oid->oid_name)); if (error) goto out; namelen--; name++; if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE && !oid->oid_handler) lsp2 = SYSCTL_CHILDREN(oid); } lsp = lsp2; } error = SYSCTL_OUT(req, "", 1); out: SYSCTL_RUNLOCK(&tracker); return (error); } /* * XXXRW/JA: Shouldn't return name data for nodes that we don't permit in * capability mode. */ static SYSCTL_NODE(_sysctl, CTL_SYSCTL_NAME, name, CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_CAPRD, sysctl_sysctl_name, ""); enum sysctl_iter_action { ITER_SIBLINGS, /* Not matched, continue iterating siblings */ ITER_CHILDREN, /* Node has children we need to iterate over them */ ITER_FOUND, /* Matching node was found */ }; /* * Tries to find the next node for @name and @namelen. * * Returns next action to take. */ static enum sysctl_iter_action sysctl_sysctl_next_node(struct sysctl_oid *oidp, int *name, unsigned int namelen, bool honor_skip) { if ((oidp->oid_kind & CTLFLAG_DORMANT) != 0) return (ITER_SIBLINGS); if (honor_skip && (oidp->oid_kind & CTLFLAG_SKIP) != 0) return (ITER_SIBLINGS); if (namelen == 0) { /* * We have reached a node with a full name match and are * looking for the next oid in its children. * * For CTL_SYSCTL_NEXTNOSKIP we are done. * * For CTL_SYSCTL_NEXT we skip CTLTYPE_NODE (unless it * has a handler) and move on to the children. 
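 *
 * For orientation only (not part of this change), this is the kernel
 * half of the usual userland tree walk, roughly (with the customary
 * sys/sysctl.h and string.h includes):
 *
 *	int qoid[CTL_MAXNAME + 2] = { CTL_SYSCTL, CTL_SYSCTL_NEXT, CTL_KERN };
 *	int next[CTL_MAXNAME];
 *	u_int qlen = 3;
 *	size_t outlen;
 *
 *	for (;;) {
 *		outlen = sizeof(next);
 *		if (sysctl(qoid, qlen, next, &outlen, NULL, 0) == -1)
 *			break;
 *		memcpy(qoid + 2, next, outlen);
 *		qlen = 2 + outlen / sizeof(int);
 *	}
 *
 * ENOENT from the kernel ends the walk; each returned OID is normally
 * resolved to a name via CTL_SYSCTL_NAME before printing.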
*/ if (!honor_skip) return (ITER_FOUND); if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE) return (ITER_FOUND); /* If node does not have an iterator, treat it as leaf */ if (oidp->oid_handler) return (ITER_FOUND); /* Report oid as a node to iterate */ return (ITER_CHILDREN); } /* * No match yet. Continue seeking the given name. * * We are iterating in order by oid_number, so skip oids lower * than the one we are looking for. * * When the current oid_number is higher than the one we seek, * that means we have reached the next oid in the sequence and * should return it. * * If the oid_number matches the name at this level then we * have to find a node to continue searching at the next level. */ if (oidp->oid_number < *name) return (ITER_SIBLINGS); if (oidp->oid_number > *name) { /* * We have reached the next oid. * * For CTL_SYSCTL_NEXTNOSKIP we are done. * * For CTL_SYSCTL_NEXT we skip CTLTYPE_NODE (unless it * has a handler) and move on to the children. */ if (!honor_skip) return (ITER_FOUND); if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE) return (ITER_FOUND); /* If node does not have an iterator, treat it as leaf */ if (oidp->oid_handler) return (ITER_FOUND); return (ITER_CHILDREN); } /* match at a current level */ if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE) return (ITER_SIBLINGS); if (oidp->oid_handler) return (ITER_SIBLINGS); return (ITER_CHILDREN); } /* * Recursively walk the sysctl subtree at lsp until we find the given name. * Returns true and fills in next oid data in @next and @len if oid is found. */ static bool sysctl_sysctl_next_action(struct sysctl_oid_list *lsp, int *name, u_int namelen, int *next, int *len, int level, bool honor_skip) { struct sysctl_oid_list *next_lsp; struct sysctl_oid *oidp = NULL, key; bool success = false; enum sysctl_iter_action action; SYSCTL_ASSERT_LOCKED(); /* * Start the search at the requested oid. But if not found, then scan * through all children. */ if (namelen > 0) { key.oid_number = *name; oidp = RB_FIND(sysctl_oid_list, lsp, &key); } if (!oidp) oidp = RB_MIN(sysctl_oid_list, lsp); for(; oidp != NULL; oidp = RB_NEXT(sysctl_oid_list, lsp, oidp)) { action = sysctl_sysctl_next_node(oidp, name, namelen, honor_skip); if (action == ITER_SIBLINGS) continue; if (action == ITER_FOUND) { success = true; break; } KASSERT((action== ITER_CHILDREN), ("ret(%d)!=ITER_CHILDREN", action)); next_lsp = SYSCTL_CHILDREN(oidp); if (namelen == 0) { success = sysctl_sysctl_next_action(next_lsp, NULL, 0, next + 1, len, level + 1, honor_skip); } else { success = sysctl_sysctl_next_action(next_lsp, name + 1, namelen - 1, next + 1, len, level + 1, honor_skip); if (!success) { /* * We maintain the invariant that current node oid * is >= the oid provided in @name. * As there are no usable children at this node, * current node oid is strictly > than the requested * oid. * Hence, reduce namelen to 0 to allow for picking first * nodes/leafs in the next node in list. 
*/ namelen = 0; } } if (success) break; } if (success) { *next = oidp->oid_number; if (level > *len) *len = level; } return (success); } static int sysctl_sysctl_next(SYSCTL_HANDLER_ARGS) { int *name = (int *) arg1; u_int namelen = arg2; int len, error; bool success; struct sysctl_oid_list *lsp = &sysctl__children; struct rm_priotracker tracker; int next[CTL_MAXNAME]; len = 0; SYSCTL_RLOCK(&tracker); success = sysctl_sysctl_next_action(lsp, name, namelen, next, &len, 1, oidp->oid_number == CTL_SYSCTL_NEXT); SYSCTL_RUNLOCK(&tracker); if (!success) return (ENOENT); error = SYSCTL_OUT(req, next, len * sizeof (int)); return (error); } /* * XXXRW/JA: Shouldn't return next data for nodes that we don't permit in * capability mode. */ static SYSCTL_NODE(_sysctl, CTL_SYSCTL_NEXT, next, CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_CAPRD, sysctl_sysctl_next, ""); static SYSCTL_NODE(_sysctl, CTL_SYSCTL_NEXTNOSKIP, nextnoskip, CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_CAPRD, sysctl_sysctl_next, ""); static int name2oid(const char *name, int *oid, int *len, struct sysctl_oid **oidpp) { struct sysctl_oid *oidp; struct sysctl_oid_list *lsp = &sysctl__children; const char *n; SYSCTL_ASSERT_LOCKED(); for (*len = 0; *len < CTL_MAXNAME;) { n = strchrnul(name, '.'); oidp = sysctl_find_oidnamelen(name, n - name, lsp); if (oidp == NULL) return (ENOENT); *oid++ = oidp->oid_number; (*len)++; name = n; if (*name == '.') name++; if (*name == '\0') { if (oidpp) *oidpp = oidp; return (0); } if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE) break; if (oidp->oid_handler) break; lsp = SYSCTL_CHILDREN(oidp); } return (ENOENT); } static int sysctl_sysctl_name2oid(SYSCTL_HANDLER_ARGS) { char *p; int error, oid[CTL_MAXNAME], len = 0; struct sysctl_oid *op = NULL; struct rm_priotracker tracker; char buf[32]; if (!req->newlen) return (ENOENT); if (req->newlen >= MAXPATHLEN) /* XXX arbitrary, undocumented */ return (ENAMETOOLONG); p = buf; if (req->newlen >= sizeof(buf)) p = malloc(req->newlen+1, M_SYSCTL, M_WAITOK); error = SYSCTL_IN(req, p, req->newlen); if (error) { if (p != buf) free(p, M_SYSCTL); return (error); } p [req->newlen] = '\0'; SYSCTL_RLOCK(&tracker); error = name2oid(p, oid, &len, &op); SYSCTL_RUNLOCK(&tracker); if (p != buf) free(p, M_SYSCTL); if (error) return (error); error = SYSCTL_OUT(req, oid, len * sizeof *oid); return (error); } /* * XXXRW/JA: Shouldn't return name2oid data for nodes that we don't permit in * capability mode. 
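 *
 * (Illustrative, not part of this change: this node is what
 * sysctlnametomib(3) and kernel_sysctlbyname() drive, roughly
 *
 *	int qoid[2] = { CTL_SYSCTL, CTL_SYSCTL_NAME2OID };
 *	int oid[CTL_MAXNAME];
 *	size_t len = sizeof(oid);
 *
 *	sysctl(qoid, 2, oid, &len, "kern.ostype", strlen("kern.ostype"));
 *
 * after which oid[0 .. len / sizeof(int) - 1] holds the numeric MIB.)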
*/ SYSCTL_PROC(_sysctl, CTL_SYSCTL_NAME2OID, name2oid, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_MPSAFE | CTLFLAG_CAPRW, 0, 0, sysctl_sysctl_name2oid, "I", ""); static int sysctl_sysctl_oidfmt(SYSCTL_HANDLER_ARGS) { struct sysctl_oid *oid; struct rm_priotracker tracker; int error; error = sysctl_wire_old_buffer(req, 0); if (error) return (error); SYSCTL_RLOCK(&tracker); error = sysctl_find_oid(arg1, arg2, &oid, NULL, req); if (error) goto out; if (oid->oid_fmt == NULL) { error = ENOENT; goto out; } error = SYSCTL_OUT(req, &oid->oid_kind, sizeof(oid->oid_kind)); if (error) goto out; error = SYSCTL_OUT(req, oid->oid_fmt, strlen(oid->oid_fmt) + 1); out: SYSCTL_RUNLOCK(&tracker); return (error); } static SYSCTL_NODE(_sysctl, CTL_SYSCTL_OIDFMT, oidfmt, CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_CAPRD, sysctl_sysctl_oidfmt, ""); static int sysctl_sysctl_oiddescr(SYSCTL_HANDLER_ARGS) { struct sysctl_oid *oid; struct rm_priotracker tracker; int error; error = sysctl_wire_old_buffer(req, 0); if (error) return (error); SYSCTL_RLOCK(&tracker); error = sysctl_find_oid(arg1, arg2, &oid, NULL, req); if (error) goto out; if (oid->oid_descr == NULL) { error = ENOENT; goto out; } error = SYSCTL_OUT(req, oid->oid_descr, strlen(oid->oid_descr) + 1); out: SYSCTL_RUNLOCK(&tracker); return (error); } static SYSCTL_NODE(_sysctl, CTL_SYSCTL_OIDDESCR, oiddescr, CTLFLAG_RD | CTLFLAG_MPSAFE|CTLFLAG_CAPRD, sysctl_sysctl_oiddescr, ""); static int sysctl_sysctl_oidlabel(SYSCTL_HANDLER_ARGS) { struct sysctl_oid *oid; struct rm_priotracker tracker; int error; error = sysctl_wire_old_buffer(req, 0); if (error) return (error); SYSCTL_RLOCK(&tracker); error = sysctl_find_oid(arg1, arg2, &oid, NULL, req); if (error) goto out; if (oid->oid_label == NULL) { error = ENOENT; goto out; } error = SYSCTL_OUT(req, oid->oid_label, strlen(oid->oid_label) + 1); out: SYSCTL_RUNLOCK(&tracker); return (error); } static SYSCTL_NODE(_sysctl, CTL_SYSCTL_OIDLABEL, oidlabel, CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_CAPRD, sysctl_sysctl_oidlabel, ""); /* * Default "handler" functions. */ /* * Handle a bool. * Two cases: * a variable: point arg1 at it. * a constant: pass it in arg2. */ int sysctl_handle_bool(SYSCTL_HANDLER_ARGS) { uint8_t temp; int error; /* * Attempt to get a coherent snapshot by making a copy of the data. */ if (arg1) temp = *(bool *)arg1 ? 1 : 0; else temp = arg2 ? 1 : 0; error = SYSCTL_OUT(req, &temp, sizeof(temp)); if (error || !req->newptr) return (error); if (!arg1) error = EPERM; else { error = SYSCTL_IN(req, &temp, sizeof(temp)); if (!error) *(bool *)arg1 = temp ? 1 : 0; } return (error); } /* * Handle an int8_t, signed or unsigned. * Two cases: * a variable: point arg1 at it. * a constant: pass it in arg2. */ int sysctl_handle_8(SYSCTL_HANDLER_ARGS) { int8_t tmpout; int error = 0; /* * Attempt to get a coherent snapshot by making a copy of the data. */ if (arg1) tmpout = *(int8_t *)arg1; else tmpout = arg2; error = SYSCTL_OUT(req, &tmpout, sizeof(tmpout)); if (error || !req->newptr) return (error); if (!arg1) error = EPERM; else error = SYSCTL_IN(req, arg1, sizeof(tmpout)); return (error); } /* * Handle an int16_t, signed or unsigned. * Two cases: * a variable: point arg1 at it. * a constant: pass it in arg2. */ int sysctl_handle_16(SYSCTL_HANDLER_ARGS) { int16_t tmpout; int error = 0; /* * Attempt to get a coherent snapshot by making a copy of the data. 
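 *
 * (Aside, not part of this change: these helpers are also the usual
 * building blocks of private handlers.  A hypothetical validating
 * wrapper follows the same copy-out then copy-in shape:
 *
 *	static int example_val;
 *
 *	static int
 *	example_sysctl(SYSCTL_HANDLER_ARGS)
 *	{
 *		int error, val;
 *
 *		val = example_val;
 *		error = sysctl_handle_int(oidp, &val, 0, req);
 *		if (error != 0 || req->newptr == NULL)
 *			return (error);
 *		if (val < 0)
 *			return (EINVAL);
 *		example_val = val;
 *		return (0);
 *	}
 *
 * sysctl_msec_to_ticks() below uses exactly this pattern.)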
*/ if (arg1) tmpout = *(int16_t *)arg1; else tmpout = arg2; error = SYSCTL_OUT(req, &tmpout, sizeof(tmpout)); if (error || !req->newptr) return (error); if (!arg1) error = EPERM; else error = SYSCTL_IN(req, arg1, sizeof(tmpout)); return (error); } /* * Handle an int32_t, signed or unsigned. * Two cases: * a variable: point arg1 at it. * a constant: pass it in arg2. */ int sysctl_handle_32(SYSCTL_HANDLER_ARGS) { int32_t tmpout; int error = 0; /* * Attempt to get a coherent snapshot by making a copy of the data. */ if (arg1) tmpout = *(int32_t *)arg1; else tmpout = arg2; error = SYSCTL_OUT(req, &tmpout, sizeof(tmpout)); if (error || !req->newptr) return (error); if (!arg1) error = EPERM; else error = SYSCTL_IN(req, arg1, sizeof(tmpout)); return (error); } /* * Handle an int, signed or unsigned. * Two cases: * a variable: point arg1 at it. * a constant: pass it in arg2. */ int sysctl_handle_int(SYSCTL_HANDLER_ARGS) { int tmpout, error = 0; /* * Attempt to get a coherent snapshot by making a copy of the data. */ if (arg1) tmpout = *(int *)arg1; else tmpout = arg2; error = SYSCTL_OUT(req, &tmpout, sizeof(int)); if (error || !req->newptr) return (error); if (!arg1) error = EPERM; else error = SYSCTL_IN(req, arg1, sizeof(int)); return (error); } /* * Based on sysctl_handle_int() convert milliseconds into ticks. * Note: this is used by TCP. */ int sysctl_msec_to_ticks(SYSCTL_HANDLER_ARGS) { int error, s, tt; tt = *(int *)arg1; s = (int)((int64_t)tt * 1000 / hz); error = sysctl_handle_int(oidp, &s, 0, req); if (error || !req->newptr) return (error); tt = (int)((int64_t)s * hz / 1000); if (tt < 1) return (EINVAL); *(int *)arg1 = tt; return (0); } /* * Handle a long, signed or unsigned. * Two cases: * a variable: point arg1 at it. * a constant: pass it in arg2. */ int sysctl_handle_long(SYSCTL_HANDLER_ARGS) { int error = 0; long tmplong; #ifdef SCTL_MASK32 int tmpint; #endif /* * Attempt to get a coherent snapshot by making a copy of the data. */ if (arg1) tmplong = *(long *)arg1; else tmplong = arg2; #ifdef SCTL_MASK32 if (req->flags & SCTL_MASK32) { tmpint = tmplong; error = SYSCTL_OUT(req, &tmpint, sizeof(int)); } else #endif error = SYSCTL_OUT(req, &tmplong, sizeof(long)); if (error || !req->newptr) return (error); if (!arg1) error = EPERM; #ifdef SCTL_MASK32 else if (req->flags & SCTL_MASK32) { error = SYSCTL_IN(req, &tmpint, sizeof(int)); *(long *)arg1 = (long)tmpint; } #endif else error = SYSCTL_IN(req, arg1, sizeof(long)); return (error); } /* * Handle a 64 bit int, signed or unsigned. * Two cases: * a variable: point arg1 at it. * a constant: pass it in arg2. */ int sysctl_handle_64(SYSCTL_HANDLER_ARGS) { int error = 0; uint64_t tmpout; /* * Attempt to get a coherent snapshot by making a copy of the data. */ if (arg1) tmpout = *(uint64_t *)arg1; else tmpout = arg2; error = SYSCTL_OUT(req, &tmpout, sizeof(uint64_t)); if (error || !req->newptr) return (error); if (!arg1) error = EPERM; else error = SYSCTL_IN(req, arg1, sizeof(uint64_t)); return (error); } /* * Handle our generic '\0' terminated 'C' string. * Two cases: * a variable string: point arg1 at it, arg2 is max length. * a constant string: point arg1 at it, arg2 is zero. */ int sysctl_handle_string(SYSCTL_HANDLER_ARGS) { char *tmparg; size_t outlen; int error = 0, ro_string = 0; /* * If the sysctl isn't writable and isn't a preallocated tunable that * can be modified by kenv(2), microoptimise and treat it as a * read-only string. * A zero-length buffer indicates a fixed size read-only * string. 
In ddb, don't worry about trying to make a malloced * snapshot. */ if ((oidp->oid_kind & (CTLFLAG_WR | CTLFLAG_TUN)) == 0 || arg2 == 0 || kdb_active) { arg2 = strlen((char *)arg1) + 1; ro_string = 1; } if (req->oldptr != NULL) { if (ro_string) { tmparg = arg1; outlen = strlen(tmparg) + 1; } else { tmparg = malloc(arg2, M_SYSCTLTMP, M_WAITOK); sx_slock(&sysctlstringlock); memcpy(tmparg, arg1, arg2); sx_sunlock(&sysctlstringlock); outlen = strlen(tmparg) + 1; } error = SYSCTL_OUT(req, tmparg, outlen); if (!ro_string) free(tmparg, M_SYSCTLTMP); } else { if (!ro_string) sx_slock(&sysctlstringlock); outlen = strlen((char *)arg1) + 1; if (!ro_string) sx_sunlock(&sysctlstringlock); error = SYSCTL_OUT(req, NULL, outlen); } if (error || !req->newptr) return (error); if (req->newlen - req->newidx >= arg2 || req->newlen - req->newidx < 0) { error = EINVAL; } else if (req->newlen - req->newidx == 0) { sx_xlock(&sysctlstringlock); ((char *)arg1)[0] = '\0'; sx_xunlock(&sysctlstringlock); } else if (req->newfunc == sysctl_new_kernel) { arg2 = req->newlen - req->newidx; sx_xlock(&sysctlstringlock); error = SYSCTL_IN(req, arg1, arg2); if (error == 0) { ((char *)arg1)[arg2] = '\0'; req->newidx += arg2; } sx_xunlock(&sysctlstringlock); } else { arg2 = req->newlen - req->newidx; tmparg = malloc(arg2, M_SYSCTLTMP, M_WAITOK); error = SYSCTL_IN(req, tmparg, arg2); if (error) { free(tmparg, M_SYSCTLTMP); return (error); } sx_xlock(&sysctlstringlock); memcpy(arg1, tmparg, arg2); ((char *)arg1)[arg2] = '\0'; sx_xunlock(&sysctlstringlock); free(tmparg, M_SYSCTLTMP); req->newidx += arg2; } return (error); } /* * Handle any kind of opaque data. * arg1 points to it, arg2 is the size. */ int sysctl_handle_opaque(SYSCTL_HANDLER_ARGS) { int error, tries; u_int generation; struct sysctl_req req2; /* * Attempt to get a coherent snapshot, by using the thread * pre-emption counter updated from within mi_switch() to * determine if we were pre-empted during a bcopy() or * copyout(). Make 3 attempts at doing this before giving up. * If we encounter an error, stop immediately. */ tries = 0; req2 = *req; retry: generation = curthread->td_generation; error = SYSCTL_OUT(req, arg1, arg2); if (error) return (error); tries++; if (generation != curthread->td_generation && tries < 3) { *req = req2; goto retry; } error = SYSCTL_IN(req, arg1, arg2); return (error); } /* * Based on sysctl_handle_64() convert microseconds to a sbintime. */ int sysctl_usec_to_sbintime(SYSCTL_HANDLER_ARGS) { int error; int64_t usec; usec = sbttous(*(sbintime_t *)arg1); error = sysctl_handle_64(oidp, &usec, 0, req); if (error || !req->newptr) return (error); *(sbintime_t *)arg1 = ustosbt(usec); return (0); } /* * Based on sysctl_handle_64() convert milliseconds to a sbintime. */ int sysctl_msec_to_sbintime(SYSCTL_HANDLER_ARGS) { int error; int64_t msec; msec = sbttoms(*(sbintime_t *)arg1); error = sysctl_handle_64(oidp, &msec, 0, req); if (error || !req->newptr) return (error); *(sbintime_t *)arg1 = mstosbt(msec); return (0); } /* * Convert seconds to a struct timeval. Intended for use with * intervals and thus does not permit negative seconds. */ int sysctl_sec_to_timeval(SYSCTL_HANDLER_ARGS) { struct timeval *tv; int error, secs; tv = arg1; secs = tv->tv_sec; error = sysctl_handle_int(oidp, &secs, 0, req); if (error || req->newptr == NULL) return (error); if (secs < 0) return (EINVAL); tv->tv_sec = secs; return (0); } /* * Transfer functions to/from kernel space. 
* XXX: rather untested at this point */ static int sysctl_old_kernel(struct sysctl_req *req, const void *p, size_t l) { size_t i = 0; if (req->oldptr) { i = l; if (req->oldlen <= req->oldidx) i = 0; else if (i > req->oldlen - req->oldidx) i = req->oldlen - req->oldidx; if (i > 0) bcopy(p, (char *)req->oldptr + req->oldidx, i); } req->oldidx += l; if (req->oldptr && i != l) return (ENOMEM); return (0); } static int sysctl_new_kernel(struct sysctl_req *req, void *p, size_t l) { if (!req->newptr) return (0); if (req->newlen - req->newidx < l) return (EINVAL); bcopy((const char *)req->newptr + req->newidx, p, l); req->newidx += l; return (0); } int kernel_sysctl(struct thread *td, int *name, u_int namelen, void *old, size_t *oldlenp, void *new, size_t newlen, size_t *retval, int flags) { int error = 0; struct sysctl_req req; bzero(&req, sizeof req); req.td = td; req.flags = flags; if (oldlenp) { req.oldlen = *oldlenp; } req.validlen = req.oldlen; if (old) { req.oldptr= old; } if (new != NULL) { req.newlen = newlen; req.newptr = new; } req.oldfunc = sysctl_old_kernel; req.newfunc = sysctl_new_kernel; req.lock = REQ_UNWIRED; error = sysctl_root(0, name, namelen, &req); if (req.lock == REQ_WIRED && req.validlen > 0) vsunlock(req.oldptr, req.validlen); if (error && error != ENOMEM) return (error); if (retval) { if (req.oldptr && req.oldidx > req.validlen) *retval = req.validlen; else *retval = req.oldidx; } return (error); } int kernel_sysctlbyname(struct thread *td, char *name, void *old, size_t *oldlenp, void *new, size_t newlen, size_t *retval, int flags) { int oid[CTL_MAXNAME]; size_t oidlen, plen; int error; oid[0] = CTL_SYSCTL; oid[1] = CTL_SYSCTL_NAME2OID; oidlen = sizeof(oid); error = kernel_sysctl(td, oid, 2, oid, &oidlen, (void *)name, strlen(name), &plen, flags); if (error) return (error); error = kernel_sysctl(td, oid, plen / sizeof(int), old, oldlenp, new, newlen, retval, flags); return (error); } /* * Transfer function to/from user space. */ static int sysctl_old_user(struct sysctl_req *req, const void *p, size_t l) { size_t i, len, origidx; int error; origidx = req->oldidx; req->oldidx += l; if (req->oldptr == NULL) return (0); /* * If we have not wired the user supplied buffer and we are currently * holding locks, drop a witness warning, as it's possible that * write operations to the user page can sleep. */ if (req->lock != REQ_WIRED) WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "sysctl_old_user()"); i = l; len = req->validlen; if (len <= origidx) i = 0; else { if (i > len - origidx) i = len - origidx; if (req->lock == REQ_WIRED) { error = copyout_nofault(p, (char *)req->oldptr + origidx, i); } else error = copyout(p, (char *)req->oldptr + origidx, i); if (error != 0) return (error); } if (i < l) return (ENOMEM); return (0); } static int sysctl_new_user(struct sysctl_req *req, void *p, size_t l) { int error; if (!req->newptr) return (0); if (req->newlen - req->newidx < l) return (EINVAL); WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "sysctl_new_user()"); error = copyin((const char *)req->newptr + req->newidx, p, l); req->newidx += l; return (error); } /* * Wire the user space destination buffer. If set to a value greater than * zero, the len parameter limits the maximum amount of wired memory. */ int sysctl_wire_old_buffer(struct sysctl_req *req, size_t len) { int ret; size_t wiredlen; wiredlen = (len > 0 && len < req->oldlen) ? 
len : req->oldlen; ret = 0; if (req->lock != REQ_WIRED && req->oldptr && req->oldfunc == sysctl_old_user) { if (wiredlen != 0) { ret = vslock(req->oldptr, wiredlen); if (ret != 0) { if (ret != ENOMEM) return (ret); wiredlen = 0; } } req->lock = REQ_WIRED; req->validlen = wiredlen; } return (0); } int sysctl_find_oid(int *name, u_int namelen, struct sysctl_oid **noid, int *nindx, struct sysctl_req *req) { struct sysctl_oid_list *lsp; struct sysctl_oid *oid; struct sysctl_oid key; int indx; SYSCTL_ASSERT_LOCKED(); lsp = &sysctl__children; indx = 0; while (indx < CTL_MAXNAME) { key.oid_number = name[indx]; oid = RB_FIND(sysctl_oid_list, lsp, &key); if (oid == NULL) return (ENOENT); indx++; if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE) { if (oid->oid_handler != NULL || indx == namelen) { *noid = oid; if (nindx != NULL) *nindx = indx; KASSERT((oid->oid_kind & CTLFLAG_DYING) == 0, ("%s found DYING node %p", __func__, oid)); return (0); } lsp = SYSCTL_CHILDREN(oid); } else if (indx == namelen) { if ((oid->oid_kind & CTLFLAG_DORMANT) != 0) return (ENOENT); *noid = oid; if (nindx != NULL) *nindx = indx; KASSERT((oid->oid_kind & CTLFLAG_DYING) == 0, ("%s found DYING node %p", __func__, oid)); return (0); } else { return (ENOTDIR); } } return (ENOENT); } /* * Traverse our tree, and find the right node, execute whatever it points * to, and return the resulting error code. */ static int sysctl_root(SYSCTL_HANDLER_ARGS) { struct sysctl_oid *oid; struct rm_priotracker tracker; int error, indx, lvl; SYSCTL_RLOCK(&tracker); error = sysctl_find_oid(arg1, arg2, &oid, &indx, req); if (error) goto out; if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE) { /* * You can't call a sysctl when it's a node, but has * no handler. Inform the user that it's a node. * The indx may or may not be the same as namelen. */ if (oid->oid_handler == NULL) { error = EISDIR; goto out; } } /* Is this sysctl writable? */ if (req->newptr && !(oid->oid_kind & CTLFLAG_WR)) { error = EPERM; goto out; } KASSERT(req->td != NULL, ("sysctl_root(): req->td == NULL")); #ifdef CAPABILITY_MODE /* * If the process is in capability mode, then don't permit reading or * writing unless specifically granted for the node. */ if (IN_CAPABILITY_MODE(req->td)) { if ((req->oldptr && !(oid->oid_kind & CTLFLAG_CAPRD)) || (req->newptr && !(oid->oid_kind & CTLFLAG_CAPWR))) { error = EPERM; goto out; } } #endif /* Is this sysctl sensitive to securelevels? */ if (req->newptr && (oid->oid_kind & CTLFLAG_SECURE)) { lvl = (oid->oid_kind & CTLMASK_SECURE) >> CTLSHIFT_SECURE; error = securelevel_gt(req->td->td_ucred, lvl); if (error) goto out; } /* Is this sysctl writable by only privileged users? 
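 * For illustration (not part of this change): a node carrying
 * CTLFLAG_ANYBODY skips this check entirely, while the hypothetical
 *
 *	static int example;
 *	SYSCTL_INT(_kern, OID_AUTO, example, CTLFLAG_RW | CTLFLAG_PRISON,
 *	    &example, 0, "writable by prison root");
 *
 * only requires PRIV_SYSCTL_WRITEJAIL, which a jailed root normally
 * holds.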
*/ if (req->newptr && !(oid->oid_kind & CTLFLAG_ANYBODY)) { int priv; if (oid->oid_kind & CTLFLAG_PRISON) priv = PRIV_SYSCTL_WRITEJAIL; #ifdef VIMAGE else if ((oid->oid_kind & CTLFLAG_VNET) && prison_owns_vnet(req->td->td_ucred)) priv = PRIV_SYSCTL_WRITEJAIL; #endif else priv = PRIV_SYSCTL_WRITE; error = priv_check(req->td, priv); if (error) goto out; } if (!oid->oid_handler) { error = EINVAL; goto out; } if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE) { arg1 = (int *)arg1 + indx; arg2 -= indx; } else { arg1 = oid->oid_arg1; arg2 = oid->oid_arg2; } #ifdef MAC error = mac_system_check_sysctl(req->td->td_ucred, oid, arg1, arg2, req); if (error != 0) goto out; #endif #ifdef VIMAGE if ((oid->oid_kind & CTLFLAG_VNET) && arg1 != NULL) arg1 = (void *)(curvnet->vnet_data_base + (uintptr_t)arg1); #endif error = sysctl_root_handler_locked(oid, arg1, arg2, req, &tracker); out: SYSCTL_RUNLOCK(&tracker); return (error); } #ifndef _SYS_SYSPROTO_H_ struct __sysctl_args { int *name; u_int namelen; void *old; size_t *oldlenp; void *new; size_t newlen; }; #endif int sys___sysctl(struct thread *td, struct __sysctl_args *uap) { int error, i, name[CTL_MAXNAME]; size_t j; if (uap->namelen > CTL_MAXNAME || uap->namelen < 2) return (EINVAL); error = copyin(uap->name, &name, uap->namelen * sizeof(int)); if (error) return (error); error = userland_sysctl(td, name, uap->namelen, uap->old, uap->oldlenp, 0, uap->new, uap->newlen, &j, 0); if (error && error != ENOMEM) return (error); if (uap->oldlenp) { i = copyout(&j, uap->oldlenp, sizeof(j)); if (i) return (i); } return (error); } int kern___sysctlbyname(struct thread *td, const char *oname, size_t namelen, void *old, size_t *oldlenp, void *new, size_t newlen, size_t *retval, int flags, bool inkernel) { int oid[CTL_MAXNAME]; char namebuf[16]; char *name; size_t oidlen; int error; if (namelen > MAXPATHLEN || namelen == 0) return (EINVAL); name = namebuf; if (namelen > sizeof(namebuf)) name = malloc(namelen, M_SYSCTL, M_WAITOK); error = copyin(oname, name, namelen); if (error != 0) goto out; oid[0] = CTL_SYSCTL; oid[1] = CTL_SYSCTL_NAME2OID; oidlen = sizeof(oid); error = kernel_sysctl(td, oid, 2, oid, &oidlen, (void *)name, namelen, retval, flags); if (error != 0) goto out; error = userland_sysctl(td, oid, *retval / sizeof(int), old, oldlenp, inkernel, new, newlen, retval, flags); out: if (namelen > sizeof(namebuf)) free(name, M_SYSCTL); return (error); } #ifndef _SYS_SYSPROTO_H_ struct __sysctlbyname_args { const char *name; size_t namelen; void *old; size_t *oldlenp; void *new; size_t newlen; }; #endif int sys___sysctlbyname(struct thread *td, struct __sysctlbyname_args *uap) { size_t rv; int error; error = kern___sysctlbyname(td, uap->name, uap->namelen, uap->old, uap->oldlenp, uap->new, uap->newlen, &rv, 0, 0); if (error != 0) return (error); if (uap->oldlenp != NULL) error = copyout(&rv, uap->oldlenp, sizeof(rv)); return (error); } /* * This is used from various compatibility syscalls too. That's why name * must be in kernel space. 
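 *
 * (Not part of this change, for orientation: purely in-kernel callers
 * normally go through kernel_sysctl() or kernel_sysctlbyname() above,
 * where every pointer is a kernel pointer, e.g. the hypothetical
 *
 *	char buf[64];
 *	size_t len = sizeof(buf);
 *
 *	error = kernel_sysctlbyname(curthread, "kern.ostype", buf, &len,
 *	    NULL, 0, NULL, 0);
 *
 * Compatibility syscalls instead copyin() the numeric name and hand it,
 * already in kernel space, to userland_sysctl() below.)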
*/ int userland_sysctl(struct thread *td, int *name, u_int namelen, void *old, size_t *oldlenp, int inkernel, const void *new, size_t newlen, size_t *retval, int flags) { int error = 0, memlocked; struct sysctl_req req; bzero(&req, sizeof req); req.td = td; req.flags = flags; if (oldlenp) { if (inkernel) { req.oldlen = *oldlenp; } else { error = copyin(oldlenp, &req.oldlen, sizeof(*oldlenp)); if (error) return (error); } } req.validlen = req.oldlen; req.oldptr = old; if (new != NULL) { req.newlen = newlen; req.newptr = new; } req.oldfunc = sysctl_old_user; req.newfunc = sysctl_new_user; req.lock = REQ_UNWIRED; #ifdef KTRACE if (KTRPOINT(curthread, KTR_SYSCTL)) ktrsysctl(name, namelen); #endif memlocked = 0; if (req.oldptr && req.oldlen > 4 * PAGE_SIZE) { memlocked = 1; sx_xlock(&sysctlmemlock); } CURVNET_SET(TD_TO_VNET(td)); for (;;) { req.oldidx = 0; req.newidx = 0; error = sysctl_root(0, name, namelen, &req); if (error != EAGAIN) break; kern_yield(PRI_USER); } CURVNET_RESTORE(); if (req.lock == REQ_WIRED && req.validlen > 0) vsunlock(req.oldptr, req.validlen); if (memlocked) sx_xunlock(&sysctlmemlock); if (error && error != ENOMEM) return (error); if (retval) { if (req.oldptr && req.oldidx > req.validlen) *retval = req.validlen; else *retval = req.oldidx; } return (error); } /* * Drain into a sysctl struct. The user buffer should be wired if a page * fault would cause issue. */ static int sbuf_sysctl_drain(void *arg, const char *data, int len) { struct sysctl_req *req = arg; int error; error = SYSCTL_OUT(req, data, len); KASSERT(error >= 0, ("Got unexpected negative value %d", error)); return (error == 0 ? len : -error); } struct sbuf * sbuf_new_for_sysctl(struct sbuf *s, char *buf, int length, struct sysctl_req *req) { /* Supply a default buffer size if none given. 
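 *
 * (Illustrative only, not part of this change: the intended use is a
 * handler that formats into the sbuf and lets the drain push the data
 * out, for example the hypothetical
 *
 *	static int
 *	example_stats_sysctl(SYSCTL_HANDLER_ARGS)
 *	{
 *		struct sbuf sb;
 *		int error;
 *
 *		error = sysctl_wire_old_buffer(req, 0);
 *		if (error != 0)
 *			return (error);
 *		sbuf_new_for_sysctl(&sb, NULL, 128, req);
 *		sbuf_printf(&sb, "requests %ju\n", (uintmax_t)example_requests);
 *		error = sbuf_finish(&sb);
 *		sbuf_delete(&sb);
 *		return (error);
 *	}
 *
 * which is a common pattern for variable-length output.)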
*/ if (buf == NULL && length == 0) length = 64; s = sbuf_new(s, buf, length, SBUF_FIXEDLEN | SBUF_INCLUDENUL); sbuf_set_drain(s, sbuf_sysctl_drain, req); return (s); } #ifdef DDB /* The current OID the debugger is working with */ static struct sysctl_oid *g_ddb_oid; /* The current flags specified by the user */ static int g_ddb_sysctl_flags; /* Check to see if the last sysctl printed */ static int g_ddb_sysctl_printed; static const int ctl_sign[CTLTYPE+1] = { [CTLTYPE_INT] = 1, [CTLTYPE_LONG] = 1, [CTLTYPE_S8] = 1, [CTLTYPE_S16] = 1, [CTLTYPE_S32] = 1, [CTLTYPE_S64] = 1, }; static const int ctl_size[CTLTYPE+1] = { [CTLTYPE_INT] = sizeof(int), [CTLTYPE_UINT] = sizeof(u_int), [CTLTYPE_LONG] = sizeof(long), [CTLTYPE_ULONG] = sizeof(u_long), [CTLTYPE_S8] = sizeof(int8_t), [CTLTYPE_S16] = sizeof(int16_t), [CTLTYPE_S32] = sizeof(int32_t), [CTLTYPE_S64] = sizeof(int64_t), [CTLTYPE_U8] = sizeof(uint8_t), [CTLTYPE_U16] = sizeof(uint16_t), [CTLTYPE_U32] = sizeof(uint32_t), [CTLTYPE_U64] = sizeof(uint64_t), }; #define DB_SYSCTL_NAME_ONLY 0x001 /* Compare with -N */ #define DB_SYSCTL_VALUE_ONLY 0x002 /* Compare with -n */ #define DB_SYSCTL_OPAQUE 0x004 /* Compare with -o */ #define DB_SYSCTL_HEX 0x008 /* Compare with -x */ #define DB_SYSCTL_SAFE_ONLY 0x100 /* Only simple types */ static const char db_sysctl_modifs[] = { 'N', 'n', 'o', 'x', }; static const int db_sysctl_modif_values[] = { DB_SYSCTL_NAME_ONLY, DB_SYSCTL_VALUE_ONLY, DB_SYSCTL_OPAQUE, DB_SYSCTL_HEX, }; /* Handlers considered safe to print while recursing */ static int (* const db_safe_handlers[])(SYSCTL_HANDLER_ARGS) = { sysctl_handle_bool, sysctl_handle_8, sysctl_handle_16, sysctl_handle_32, sysctl_handle_64, sysctl_handle_int, sysctl_handle_long, sysctl_handle_string, sysctl_handle_opaque, }; /* * Use in place of sysctl_old_kernel to print sysctl values. 
* * Compare to the output handling in show_var from sbin/sysctl/sysctl.c */ static int sysctl_old_ddb(struct sysctl_req *req, const void *ptr, size_t len) { const u_char *val, *p; const char *sep1; size_t intlen, slen; uintmax_t umv; intmax_t mv; int sign, ctltype, hexlen, xflag, error; /* Suppress false-positive GCC uninitialized variable warnings */ mv = 0; umv = 0; slen = len; val = p = ptr; if (ptr == NULL) { error = 0; goto out; } /* We are going to print */ g_ddb_sysctl_printed = 1; xflag = g_ddb_sysctl_flags & DB_SYSCTL_HEX; ctltype = (g_ddb_oid->oid_kind & CTLTYPE); sign = ctl_sign[ctltype]; intlen = ctl_size[ctltype]; switch (ctltype) { case CTLTYPE_NODE: case CTLTYPE_STRING: db_printf("%.*s", (int) len, (const char *) p); error = 0; goto out; case CTLTYPE_INT: case CTLTYPE_UINT: case CTLTYPE_LONG: case CTLTYPE_ULONG: case CTLTYPE_S8: case CTLTYPE_S16: case CTLTYPE_S32: case CTLTYPE_S64: case CTLTYPE_U8: case CTLTYPE_U16: case CTLTYPE_U32: case CTLTYPE_U64: hexlen = 2 + (intlen * CHAR_BIT + 3) / 4; sep1 = ""; while (len >= intlen) { switch (ctltype) { case CTLTYPE_INT: case CTLTYPE_UINT: umv = *(const u_int *)p; mv = *(const int *)p; break; case CTLTYPE_LONG: case CTLTYPE_ULONG: umv = *(const u_long *)p; mv = *(const long *)p; break; case CTLTYPE_S8: case CTLTYPE_U8: umv = *(const uint8_t *)p; mv = *(const int8_t *)p; break; case CTLTYPE_S16: case CTLTYPE_U16: umv = *(const uint16_t *)p; mv = *(const int16_t *)p; break; case CTLTYPE_S32: case CTLTYPE_U32: umv = *(const uint32_t *)p; mv = *(const int32_t *)p; break; case CTLTYPE_S64: case CTLTYPE_U64: umv = *(const uint64_t *)p; mv = *(const int64_t *)p; break; } db_printf("%s", sep1); if (xflag) db_printf("%#0*jx", hexlen, umv); else if (!sign) db_printf("%ju", umv); else if (g_ddb_oid->oid_fmt[1] == 'K') { /* Kelvins are currently unsupported. */ error = EOPNOTSUPP; goto out; } else db_printf("%jd", mv); sep1 = " "; len -= intlen; p += intlen; } error = 0; goto out; case CTLTYPE_OPAQUE: /* TODO: Support struct functions. */ /* FALLTHROUGH */ default: db_printf("Format:%s Length:%zu Dump:0x", g_ddb_oid->oid_fmt, len); while (len-- && (xflag || p < val + 16)) db_printf("%02x", *p++); if (!xflag && len > 16) db_printf("..."); error = 0; goto out; } out: req->oldidx += slen; return (error); } /* * Avoid setting new sysctl values from the debugger */ static int sysctl_new_ddb(struct sysctl_req *req, void *p, size_t l) { if (!req->newptr) return (0); /* Changing sysctls from the debugger is currently unsupported */ return (EPERM); } /* * Run a sysctl handler with the DDB oldfunc and newfunc attached. * Instead of copying any output to a buffer we'll dump it right to * the console. 
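 *
 * For example, the "sysctl vm.v_free_min" invocation shown in
 * db_sysctl_cmd_usage() below reaches this function via db_sysctlbyname()
 * and db_show_oid(), and the value is printed by sysctl_old_ddb() rather
 * than being copied out to a buffer.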
*/ static int db_sysctl(struct sysctl_oid *oidp, int *name, u_int namelen, void *old, size_t *oldlenp, size_t *retval, int flags) { struct sysctl_req req; int error; /* Setup the request */ bzero(&req, sizeof req); req.td = kdb_thread; req.oldfunc = sysctl_old_ddb; req.newfunc = sysctl_new_ddb; req.lock = REQ_UNWIRED; if (oldlenp) { req.oldlen = *oldlenp; } req.validlen = req.oldlen; if (old) { req.oldptr = old; } /* Setup our globals for sysctl_old_ddb */ g_ddb_oid = oidp; g_ddb_sysctl_flags = flags; g_ddb_sysctl_printed = 0; error = sysctl_root(0, name, namelen, &req); /* Reset globals */ g_ddb_oid = NULL; g_ddb_sysctl_flags = 0; if (retval) { if (req.oldptr && req.oldidx > req.validlen) *retval = req.validlen; else *retval = req.oldidx; } return (error); } /* * Show a sysctl's name */ static void db_show_oid_name(int *oid, size_t nlen) { struct sysctl_oid *oidp; int qoid[CTL_MAXNAME + 2]; int error; qoid[0] = CTL_SYSCTL; qoid[1] = CTL_SYSCTL_NAME; memcpy(qoid + 2, oid, nlen * sizeof(int)); error = sysctl_find_oid(qoid, nlen + 2, &oidp, NULL, NULL); if (error) db_error("sysctl name oid"); error = db_sysctl(oidp, qoid, nlen + 2, NULL, NULL, NULL, 0); if (error) db_error("sysctl name"); } /* * Check to see if an OID is safe to print from ddb. */ static bool db_oid_safe(const struct sysctl_oid *oidp) { for (unsigned int i = 0; i < nitems(db_safe_handlers); ++i) { if (oidp->oid_handler == db_safe_handlers[i]) return (true); } return (false); } /* * Show a sysctl at a specific OID * Compare to the input handling in show_var from sbin/sysctl/sysctl.c */ static int db_show_oid(struct sysctl_oid *oidp, int *oid, size_t nlen, int flags) { int error, xflag, oflag, Nflag, nflag; size_t len; xflag = flags & DB_SYSCTL_HEX; oflag = flags & DB_SYSCTL_OPAQUE; nflag = flags & DB_SYSCTL_VALUE_ONLY; Nflag = flags & DB_SYSCTL_NAME_ONLY; if ((oidp->oid_kind & CTLTYPE) == CTLTYPE_OPAQUE && (!xflag && !oflag)) return (0); if (Nflag) { db_show_oid_name(oid, nlen); error = 0; goto out; } if (!nflag) { db_show_oid_name(oid, nlen); db_printf(": "); } if ((flags & DB_SYSCTL_SAFE_ONLY) && !db_oid_safe(oidp)) { db_printf("Skipping, unsafe to print while recursing."); error = 0; goto out; } /* Try once, and ask about the size */ len = 0; error = db_sysctl(oidp, oid, nlen, NULL, NULL, &len, flags); if (error) goto out; if (!g_ddb_sysctl_printed) /* Lie about the size */ error = db_sysctl(oidp, oid, nlen, (void *) 1, &len, NULL, flags); out: db_printf("\n"); return (error); } /* * Show all sysctls under a specific OID * Compare to sysctl_all from sbin/sysctl/sysctl.c */ static int db_show_sysctl_all(int *oid, size_t len, int flags) { struct sysctl_oid *oidp; int qoid[CTL_MAXNAME + 2], next[CTL_MAXNAME]; size_t nlen; qoid[0] = CTL_SYSCTL; qoid[1] = CTL_SYSCTL_NEXT; if (len) { nlen = len; memcpy(&qoid[2], oid, nlen * sizeof(int)); } else { nlen = 1; qoid[2] = CTL_KERN; } for (;;) { int error; size_t nextsize = sizeof(next); error = kernel_sysctl(kdb_thread, qoid, nlen + 2, next, &nextsize, NULL, 0, &nlen, 0); if (error != 0) { if (error == ENOENT) return (0); else db_error("sysctl(next)"); } nlen /= sizeof(int); if (nlen < (unsigned int)len) return (0); if (memcmp(&oid[0], &next[0], len * sizeof(int)) != 0) return (0); /* Find the OID in question */ error = sysctl_find_oid(next, nlen, &oidp, NULL, NULL); if (error) return (error); (void)db_show_oid(oidp, next, nlen, flags | DB_SYSCTL_SAFE_ONLY); if (db_pager_quit) return (0); memcpy(&qoid[2 + len], &next[len], (nlen - len) * sizeof(int)); } } /* * Show a sysctl by its 
user facing string */ static int db_sysctlbyname(const char *name, int flags) { struct sysctl_oid *oidp; int oid[CTL_MAXNAME]; int error, nlen; error = name2oid(name, oid, &nlen, &oidp); if (error) { return (error); } if ((oidp->oid_kind & CTLTYPE) == CTLTYPE_NODE) { db_show_sysctl_all(oid, nlen, flags); } else { error = db_show_oid(oidp, oid, nlen, flags); } return (error); } static void db_sysctl_cmd_usage(void) { db_printf( " sysctl [/Nnox] \n" " \n" " The name of the sysctl to show. \n" " \n" " Show a sysctl by hooking into SYSCTL_IN and SYSCTL_OUT. \n" " This will work for most sysctls, but should not be used \n" " with sysctls that are known to malloc. \n" " \n" " While recursing any \"unsafe\" sysctls will be skipped. \n" " Call sysctl directly on the sysctl to try printing the \n" " skipped sysctl. This is unsafe and may make the ddb \n" " session unusable. \n" " \n" " Arguments: \n" " /N Display only the name of the sysctl. \n" " /n Display only the value of the sysctl. \n" " /o Display opaque values. \n" " /x Display the sysctl in hex. \n" " \n" "For example: \n" "sysctl vm.v_free_min \n" "vn.v_free_min: 12669 \n" ); } /* * Show a specific sysctl similar to sysctl (8). */ DB_COMMAND_FLAGS(sysctl, db_sysctl_cmd, CS_OWN) { char name[TOK_STRING_SIZE]; int error, i, t, flags; /* Parse the modifiers */ t = db_read_token(); if (t == tSLASH || t == tMINUS) { t = db_read_token(); if (t != tIDENT) { db_printf("Bad modifier\n"); error = EINVAL; goto out; } db_strcpy(modif, db_tok_string); } else { db_unread_token(t); modif[0] = '\0'; } flags = 0; for (i = 0; i < nitems(db_sysctl_modifs); i++) { if (strchr(modif, db_sysctl_modifs[i])) { flags |= db_sysctl_modif_values[i]; } } /* Parse the sysctl names */ t = db_read_token(); if (t != tIDENT) { db_printf("Need sysctl name\n"); error = EINVAL; goto out; } /* Copy the name into a temporary buffer */ db_strcpy(name, db_tok_string); /* Ensure there is no trailing cruft */ t = db_read_token(); if (t != tEOL) { db_printf("Unexpected sysctl argument\n"); error = EINVAL; goto out; } error = db_sysctlbyname(name, flags); if (error == ENOENT) { db_printf("unknown oid: '%s'\n", db_tok_string); goto out; } else if (error) { db_printf("%s: error: %d\n", db_tok_string, error); goto out; } out: /* Ensure we eat all of our text */ db_flush_lex(); if (error == EINVAL) { db_sysctl_cmd_usage(); } } #endif /* DDB */ diff --git a/sys/kern/kern_tslog.c b/sys/kern/kern_tslog.c index a22370b85b02..7b0847d5d187 100644 --- a/sys/kern/kern_tslog.c +++ b/sys/kern/kern_tslog.c @@ -1,223 +1,223 @@ /*- * Copyright (c) 2017 Colin Percival * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #ifndef TSLOGSIZE #define TSLOGSIZE 262144 #endif static volatile long nrecs = 0; static struct timestamp { void * td; int type; const char * f; const char * s; uint64_t tsc; } timestamps[TSLOGSIZE]; void tslog(void * td, int type, const char * f, const char * s) { uint64_t tsc = get_cyclecount(); long pos; /* A NULL thread is thread0 before curthread is set. */ if (td == NULL) td = &thread0; /* Grab a slot. */ pos = atomic_fetchadd_long(&nrecs, 1); /* Store record. */ if (pos < nitems(timestamps)) { timestamps[pos].td = td; timestamps[pos].type = type; timestamps[pos].f = f; timestamps[pos].s = s; timestamps[pos].tsc = tsc; } } static int sysctl_debug_tslog(SYSCTL_HANDLER_ARGS) { int error; struct sbuf *sb; size_t i, limit; caddr_t loader_tslog; void * loader_tslog_buf; size_t loader_tslog_len; /* * This code can race against the code in tslog() which stores * records: Theoretically we could end up reading a record after * its slots have been reserved but before it has been written. * Since this code takes orders of magnitude longer to run than * tslog() takes to write a record, it is highly unlikely that * anyone will ever experience this race. */ sb = sbuf_new_for_sysctl(NULL, NULL, 1024, req); /* Get data from the boot loader, if it provided any. */ loader_tslog = preload_search_by_type("TSLOG data"); if (loader_tslog != NULL) { loader_tslog_buf = preload_fetch_addr(loader_tslog); loader_tslog_len = preload_fetch_size(loader_tslog); sbuf_bcat(sb, loader_tslog_buf, loader_tslog_len); } /* Add data logged within the kernel. */ limit = MIN(nrecs, nitems(timestamps)); for (i = 0; i < limit; i++) { sbuf_printf(sb, "%p", timestamps[i].td); sbuf_printf(sb, " %llu", (unsigned long long)timestamps[i].tsc); switch (timestamps[i].type) { case TS_ENTER: - sbuf_printf(sb, " ENTER"); + sbuf_cat(sb, " ENTER"); break; case TS_EXIT: - sbuf_printf(sb, " EXIT"); + sbuf_cat(sb, " EXIT"); break; case TS_THREAD: - sbuf_printf(sb, " THREAD"); + sbuf_cat(sb, " THREAD"); break; case TS_EVENT: - sbuf_printf(sb, " EVENT"); + sbuf_cat(sb, " EVENT"); break; } sbuf_printf(sb, " %s", timestamps[i].f ? timestamps[i].f : "(null)"); if (timestamps[i].s) sbuf_printf(sb, " %s\n", timestamps[i].s); else - sbuf_printf(sb, "\n"); + sbuf_putc(sb, '\n'); } error = sbuf_finish(sb); sbuf_delete(sb); return (error); } SYSCTL_PROC(_debug, OID_AUTO, tslog, CTLTYPE_STRING|CTLFLAG_RD|CTLFLAG_MPSAFE|CTLFLAG_SKIP, 0, 0, sysctl_debug_tslog, "", "Dump recorded event timestamps"); MALLOC_DEFINE(M_TSLOGUSER, "tsloguser", "Strings used by userland tslog"); static struct procdata { pid_t ppid; uint64_t tsc_forked; uint64_t tsc_exited; char * execname; char * namei; int reused; } procs[PID_MAX + 1]; void tslog_user(pid_t pid, pid_t ppid, const char * execname, const char * namei) { uint64_t tsc = get_cyclecount(); /* If we wrapped, do nothing. */ if (procs[pid].reused) return; /* If we have a ppid, we're recording a fork. 
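 * A ppid of (pid_t)-1 means this is not a fork record; those cases
 * (execname, namei, and exit) are handled further below.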
*/ if (ppid != (pid_t)(-1)) { /* If we have a ppid already, we wrapped. */ if (procs[pid].ppid) { procs[pid].reused = 1; return; } /* Fill in some fields. */ procs[pid].ppid = ppid; procs[pid].tsc_forked = tsc; return; } /* If we have an execname, record it. */ if (execname != NULL) { if (procs[pid].execname != NULL) free(procs[pid].execname, M_TSLOGUSER); procs[pid].execname = strdup(execname, M_TSLOGUSER); return; } /* Record the first namei for the process. */ if (namei != NULL) { if (procs[pid].namei == NULL) procs[pid].namei = strdup(namei, M_TSLOGUSER); return; } /* Otherwise we're recording an exit. */ procs[pid].tsc_exited = tsc; } static int sysctl_debug_tslog_user(SYSCTL_HANDLER_ARGS) { int error; struct sbuf *sb; pid_t pid; sb = sbuf_new_for_sysctl(NULL, NULL, 1024, req); /* Export the data we logged. */ for (pid = 0; pid <= PID_MAX; pid++) { sbuf_printf(sb, "%zu", (size_t)pid); sbuf_printf(sb, " %zu", (size_t)procs[pid].ppid); sbuf_printf(sb, " %llu", (unsigned long long)procs[pid].tsc_forked); sbuf_printf(sb, " %llu", (unsigned long long)procs[pid].tsc_exited); sbuf_printf(sb, " \"%s\"", procs[pid].execname ? procs[pid].execname : ""); sbuf_printf(sb, " \"%s\"", procs[pid].namei ? procs[pid].namei : ""); - sbuf_printf(sb, "\n"); + sbuf_putc(sb, '\n'); } error = sbuf_finish(sb); sbuf_delete(sb); return (error); } SYSCTL_PROC(_debug, OID_AUTO, tslog_user, CTLTYPE_STRING|CTLFLAG_RD|CTLFLAG_MPSAFE|CTLFLAG_SKIP, 0, 0, sysctl_debug_tslog_user, "", "Dump recorded userland event timestamps"); diff --git a/sys/kern/sched_ule.c b/sys/kern/sched_ule.c index 20515f4e430b..ebd7139fa612 100644 --- a/sys/kern/sched_ule.c +++ b/sys/kern/sched_ule.c @@ -1,3358 +1,3358 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2002-2007, Jeffrey Roberson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * This file implements the ULE scheduler. ULE supports independent CPU * run queues and fine grain locking. It has superior interactive * performance under load even on uni-processor systems. * * etymology: * ULE is the last three letters in schedule. It owes its name to a * generic user created for a scheduling system by Paul Mikesell at * Isilon Systems and a general lack of creativity on the part of the author. 
*/ #include #include "opt_hwpmc_hooks.h" #include "opt_sched.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HWPMC_HOOKS #include #endif #ifdef KDTRACE_HOOKS #include int __read_mostly dtrace_vtime_active; dtrace_vtime_switch_func_t dtrace_vtime_switch_func; #endif #include #include #define KTR_ULE 0 #define TS_NAME_LEN (MAXCOMLEN + sizeof(" td ") + sizeof(__XSTRING(UINT_MAX))) #define TDQ_NAME_LEN (sizeof("sched lock ") + sizeof(__XSTRING(MAXCPU))) #define TDQ_LOADNAME_LEN (sizeof("CPU ") + sizeof(__XSTRING(MAXCPU)) - 1 + sizeof(" load")) /* * Thread scheduler specific section. All fields are protected * by the thread lock. */ struct td_sched { struct runq *ts_runq; /* Run-queue we're queued on. */ short ts_flags; /* TSF_* flags. */ int ts_cpu; /* CPU that we have affinity for. */ int ts_rltick; /* Real last tick, for affinity. */ int ts_slice; /* Ticks of slice remaining. */ u_int ts_slptime; /* Number of ticks we vol. slept */ u_int ts_runtime; /* Number of ticks we were running */ int ts_ltick; /* Last tick that we were running on */ int ts_ftick; /* First tick that we were running on */ int ts_ticks; /* Tick count */ #ifdef KTR char ts_name[TS_NAME_LEN]; #endif }; /* flags kept in ts_flags */ #define TSF_BOUND 0x0001 /* Thread can not migrate. */ #define TSF_XFERABLE 0x0002 /* Thread was added as transferable. */ #define THREAD_CAN_MIGRATE(td) ((td)->td_pinned == 0) #define THREAD_CAN_SCHED(td, cpu) \ CPU_ISSET((cpu), &(td)->td_cpuset->cs_mask) _Static_assert(sizeof(struct thread) + sizeof(struct td_sched) <= sizeof(struct thread0_storage), "increase struct thread0_storage.t0st_sched size"); /* * Priority ranges used for interactive and non-interactive timeshare * threads. The timeshare priorities are split up into four ranges. * The first range handles interactive threads. The last three ranges * (NHALF, x, and NHALF) handle non-interactive threads with the outer * ranges supporting nice values. */ #define PRI_TIMESHARE_RANGE (PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE + 1) #define PRI_INTERACT_RANGE ((PRI_TIMESHARE_RANGE - SCHED_PRI_NRESV) / 2) #define PRI_BATCH_RANGE (PRI_TIMESHARE_RANGE - PRI_INTERACT_RANGE) #define PRI_MIN_INTERACT PRI_MIN_TIMESHARE #define PRI_MAX_INTERACT (PRI_MIN_TIMESHARE + PRI_INTERACT_RANGE - 1) #define PRI_MIN_BATCH (PRI_MIN_TIMESHARE + PRI_INTERACT_RANGE) #define PRI_MAX_BATCH PRI_MAX_TIMESHARE /* * Cpu percentage computation macros and defines. * * SCHED_TICK_SECS: Number of seconds to average the cpu usage across. * SCHED_TICK_TARG: Number of hz ticks to average the cpu usage across. * SCHED_TICK_MAX: Maximum number of ticks before scaling back. * SCHED_TICK_SHIFT: Shift factor to avoid rounding away results. * SCHED_TICK_HZ: Compute the number of hz ticks for a given ticks count. * SCHED_TICK_TOTAL: Gives the amount of time we've been recording ticks. */ #define SCHED_TICK_SECS 10 #define SCHED_TICK_TARG (hz * SCHED_TICK_SECS) #define SCHED_TICK_MAX (SCHED_TICK_TARG + hz) #define SCHED_TICK_SHIFT 10 #define SCHED_TICK_HZ(ts) ((ts)->ts_ticks >> SCHED_TICK_SHIFT) #define SCHED_TICK_TOTAL(ts) (max((ts)->ts_ltick - (ts)->ts_ftick, hz)) /* * These macros determine priorities for non-interactive threads. They are * assigned a priority based on their recent cpu utilization as expressed * by the ratio of ticks to the tick total. 
NHALF priorities at the start * and end of the MIN to MAX timeshare range are only reachable with negative * or positive nice respectively. * * PRI_RANGE: Priority range for utilization dependent priorities. * PRI_NRESV: Number of nice values. * PRI_TICKS: Compute a priority in PRI_RANGE from the ticks count and total. * PRI_NICE: Determines the part of the priority inherited from nice. */ #define SCHED_PRI_NRESV (PRIO_MAX - PRIO_MIN) #define SCHED_PRI_NHALF (SCHED_PRI_NRESV / 2) #define SCHED_PRI_MIN (PRI_MIN_BATCH + SCHED_PRI_NHALF) #define SCHED_PRI_MAX (PRI_MAX_BATCH - SCHED_PRI_NHALF) #define SCHED_PRI_RANGE (SCHED_PRI_MAX - SCHED_PRI_MIN + 1) #define SCHED_PRI_TICKS(ts) \ (SCHED_TICK_HZ((ts)) / \ (roundup(SCHED_TICK_TOTAL((ts)), SCHED_PRI_RANGE) / SCHED_PRI_RANGE)) #define SCHED_PRI_NICE(nice) (nice) /* * These determine the interactivity of a process. Interactivity differs from * cpu utilization in that it expresses the voluntary time slept vs time ran * while cpu utilization includes all time not running. This more accurately * models the intent of the thread. * * SLP_RUN_MAX: Maximum amount of sleep time + run time we'll accumulate * before throttling back. * SLP_RUN_FORK: Maximum slp+run time to inherit at fork time. * INTERACT_MAX: Maximum interactivity value. Smaller is better. * INTERACT_THRESH: Threshold for placement on the current runq. */ #define SCHED_SLP_RUN_MAX ((hz * 5) << SCHED_TICK_SHIFT) #define SCHED_SLP_RUN_FORK ((hz / 2) << SCHED_TICK_SHIFT) #define SCHED_INTERACT_MAX (100) #define SCHED_INTERACT_HALF (SCHED_INTERACT_MAX / 2) #define SCHED_INTERACT_THRESH (30) /* * These parameters determine the slice behavior for batch work. */ #define SCHED_SLICE_DEFAULT_DIVISOR 10 /* ~94 ms, 12 stathz ticks. */ #define SCHED_SLICE_MIN_DIVISOR 6 /* DEFAULT/MIN = ~16 ms. */ /* Flags kept in td_flags. */ #define TDF_PICKCPU TDF_SCHED0 /* Thread should pick new CPU. */ #define TDF_SLICEEND TDF_SCHED2 /* Thread time slice is over. */ /* * tickincr: Converts a stathz tick into a hz domain scaled by * the shift factor. Without the shift the error rate * due to rounding would be unacceptably high. * realstathz: stathz is sometimes 0 and run off of hz. * sched_slice: Runtime of each thread before rescheduling. * preempt_thresh: Priority threshold for preemption and remote IPIs. */ static u_int __read_mostly sched_interact = SCHED_INTERACT_THRESH; static int __read_mostly tickincr = 8 << SCHED_TICK_SHIFT; static int __read_mostly realstathz = 127; /* reset during boot. */ static int __read_mostly sched_slice = 10; /* reset during boot. */ static int __read_mostly sched_slice_min = 1; /* reset during boot. */ #ifdef PREEMPTION #ifdef FULL_PREEMPTION static int __read_mostly preempt_thresh = PRI_MAX_IDLE; #else static int __read_mostly preempt_thresh = PRI_MIN_KERN; #endif #else static int __read_mostly preempt_thresh = 0; #endif static int __read_mostly static_boost = PRI_MIN_BATCH; static int __read_mostly sched_idlespins = 10000; static int __read_mostly sched_idlespinthresh = -1; /* * tdq - per processor runqs and statistics. A mutex synchronizes access to * most fields. Some fields are loaded or modified without the mutex. 
* * Locking protocols: * (c) constant after initialization * (f) flag, set with the tdq lock held, cleared on local CPU * (l) all accesses are CPU-local * (ls) stores are performed by the local CPU, loads may be lockless * (t) all accesses are protected by the tdq mutex * (ts) stores are serialized by the tdq mutex, loads may be lockless */ struct tdq { /* * Ordered to improve efficiency of cpu_search() and switch(). * tdq_lock is padded to avoid false sharing with tdq_load and * tdq_cpu_idle. */ struct mtx_padalign tdq_lock; /* run queue lock. */ struct cpu_group *tdq_cg; /* (c) Pointer to cpu topology. */ struct thread *tdq_curthread; /* (t) Current executing thread. */ int tdq_load; /* (ts) Aggregate load. */ int tdq_sysload; /* (ts) For loadavg, !ITHD load. */ int tdq_cpu_idle; /* (ls) cpu_idle() is active. */ int tdq_transferable; /* (ts) Transferable thread count. */ short tdq_switchcnt; /* (l) Switches this tick. */ short tdq_oldswitchcnt; /* (l) Switches last tick. */ u_char tdq_lowpri; /* (ts) Lowest priority thread. */ u_char tdq_owepreempt; /* (f) Remote preemption pending. */ u_char tdq_idx; /* (t) Current insert index. */ u_char tdq_ridx; /* (t) Current removal index. */ int tdq_id; /* (c) cpuid. */ struct runq tdq_realtime; /* (t) real-time run queue. */ struct runq tdq_timeshare; /* (t) timeshare run queue. */ struct runq tdq_idle; /* (t) Queue of IDLE threads. */ char tdq_name[TDQ_NAME_LEN]; #ifdef KTR char tdq_loadname[TDQ_LOADNAME_LEN]; #endif }; /* Idle thread states and config. */ #define TDQ_RUNNING 1 #define TDQ_IDLE 2 /* Lockless accessors. */ #define TDQ_LOAD(tdq) atomic_load_int(&(tdq)->tdq_load) #define TDQ_TRANSFERABLE(tdq) atomic_load_int(&(tdq)->tdq_transferable) #define TDQ_SWITCHCNT(tdq) (atomic_load_short(&(tdq)->tdq_switchcnt) + \ atomic_load_short(&(tdq)->tdq_oldswitchcnt)) #define TDQ_SWITCHCNT_INC(tdq) (atomic_store_short(&(tdq)->tdq_switchcnt, \ atomic_load_short(&(tdq)->tdq_switchcnt) + 1)) #ifdef SMP struct cpu_group __read_mostly *cpu_top; /* CPU topology */ #define SCHED_AFFINITY_DEFAULT (max(1, hz / 1000)) #define SCHED_AFFINITY(ts, t) ((ts)->ts_rltick > ticks - ((t) * affinity)) /* * Run-time tunables. */ static int rebalance = 1; static int balance_interval = 128; /* Default set in sched_initticks(). */ static int __read_mostly affinity; static int __read_mostly steal_idle = 1; static int __read_mostly steal_thresh = 2; static int __read_mostly always_steal = 0; static int __read_mostly trysteal_limit = 2; /* * One thread queue per processor. 
*/ static struct tdq __read_mostly *balance_tdq; static int balance_ticks; DPCPU_DEFINE_STATIC(struct tdq, tdq); DPCPU_DEFINE_STATIC(uint32_t, randomval); #define TDQ_SELF() ((struct tdq *)PCPU_GET(sched)) #define TDQ_CPU(x) (DPCPU_ID_PTR((x), tdq)) #define TDQ_ID(x) ((x)->tdq_id) #else /* !SMP */ static struct tdq tdq_cpu; #define TDQ_ID(x) (0) #define TDQ_SELF() (&tdq_cpu) #define TDQ_CPU(x) (&tdq_cpu) #endif #define TDQ_LOCK_ASSERT(t, type) mtx_assert(TDQ_LOCKPTR((t)), (type)) #define TDQ_LOCK(t) mtx_lock_spin(TDQ_LOCKPTR((t))) #define TDQ_LOCK_FLAGS(t, f) mtx_lock_spin_flags(TDQ_LOCKPTR((t)), (f)) #define TDQ_TRYLOCK(t) mtx_trylock_spin(TDQ_LOCKPTR((t))) #define TDQ_TRYLOCK_FLAGS(t, f) mtx_trylock_spin_flags(TDQ_LOCKPTR((t)), (f)) #define TDQ_UNLOCK(t) mtx_unlock_spin(TDQ_LOCKPTR((t))) #define TDQ_LOCKPTR(t) ((struct mtx *)(&(t)->tdq_lock)) static void sched_setpreempt(int); static void sched_priority(struct thread *); static void sched_thread_priority(struct thread *, u_char); static int sched_interact_score(struct thread *); static void sched_interact_update(struct thread *); static void sched_interact_fork(struct thread *); static void sched_pctcpu_update(struct td_sched *, int); /* Operations on per processor queues */ static struct thread *tdq_choose(struct tdq *); static void tdq_setup(struct tdq *, int i); static void tdq_load_add(struct tdq *, struct thread *); static void tdq_load_rem(struct tdq *, struct thread *); static __inline void tdq_runq_add(struct tdq *, struct thread *, int); static __inline void tdq_runq_rem(struct tdq *, struct thread *); static inline int sched_shouldpreempt(int, int, int); static void tdq_print(int cpu); static void runq_print(struct runq *rq); static int tdq_add(struct tdq *, struct thread *, int); #ifdef SMP static int tdq_move(struct tdq *, struct tdq *); static int tdq_idled(struct tdq *); static void tdq_notify(struct tdq *, int lowpri); static struct thread *tdq_steal(struct tdq *, int); static struct thread *runq_steal(struct runq *, int); static int sched_pickcpu(struct thread *, int); static void sched_balance(void); static bool sched_balance_pair(struct tdq *, struct tdq *); static inline struct tdq *sched_setcpu(struct thread *, int, int); static inline void thread_unblock_switch(struct thread *, struct mtx *); static int sysctl_kern_sched_topology_spec(SYSCTL_HANDLER_ARGS); static int sysctl_kern_sched_topology_spec_internal(struct sbuf *sb, struct cpu_group *cg, int indent); #endif static void sched_setup(void *dummy); SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL); static void sched_initticks(void *dummy); SYSINIT(sched_initticks, SI_SUB_CLOCKS, SI_ORDER_THIRD, sched_initticks, NULL); SDT_PROVIDER_DEFINE(sched); SDT_PROBE_DEFINE3(sched, , , change__pri, "struct thread *", "struct proc *", "uint8_t"); SDT_PROBE_DEFINE3(sched, , , dequeue, "struct thread *", "struct proc *", "void *"); SDT_PROBE_DEFINE4(sched, , , enqueue, "struct thread *", "struct proc *", "void *", "int"); SDT_PROBE_DEFINE4(sched, , , lend__pri, "struct thread *", "struct proc *", "uint8_t", "struct thread *"); SDT_PROBE_DEFINE2(sched, , , load__change, "int", "int"); SDT_PROBE_DEFINE2(sched, , , off__cpu, "struct thread *", "struct proc *"); SDT_PROBE_DEFINE(sched, , , on__cpu); SDT_PROBE_DEFINE(sched, , , remain__cpu); SDT_PROBE_DEFINE2(sched, , , surrender, "struct thread *", "struct proc *"); /* * Print the threads waiting on a run-queue. 
*/ static void runq_print(struct runq *rq) { struct rqhead *rqh; struct thread *td; int pri; int j; int i; for (i = 0; i < RQB_LEN; i++) { printf("\t\trunq bits %d 0x%zx\n", i, rq->rq_status.rqb_bits[i]); for (j = 0; j < RQB_BPW; j++) if (rq->rq_status.rqb_bits[i] & (1ul << j)) { pri = j + (i << RQB_L2BPW); rqh = &rq->rq_queues[pri]; TAILQ_FOREACH(td, rqh, td_runq) { printf("\t\t\ttd %p(%s) priority %d rqindex %d pri %d\n", td, td->td_name, td->td_priority, td->td_rqindex, pri); } } } } /* * Print the status of a per-cpu thread queue. Should be a ddb show cmd. */ static void __unused tdq_print(int cpu) { struct tdq *tdq; tdq = TDQ_CPU(cpu); printf("tdq %d:\n", TDQ_ID(tdq)); printf("\tlock %p\n", TDQ_LOCKPTR(tdq)); printf("\tLock name: %s\n", tdq->tdq_name); printf("\tload: %d\n", tdq->tdq_load); printf("\tswitch cnt: %d\n", tdq->tdq_switchcnt); printf("\told switch cnt: %d\n", tdq->tdq_oldswitchcnt); printf("\ttimeshare idx: %d\n", tdq->tdq_idx); printf("\ttimeshare ridx: %d\n", tdq->tdq_ridx); printf("\tload transferable: %d\n", tdq->tdq_transferable); printf("\tlowest priority: %d\n", tdq->tdq_lowpri); printf("\trealtime runq:\n"); runq_print(&tdq->tdq_realtime); printf("\ttimeshare runq:\n"); runq_print(&tdq->tdq_timeshare); printf("\tidle runq:\n"); runq_print(&tdq->tdq_idle); } static inline int sched_shouldpreempt(int pri, int cpri, int remote) { /* * If the new priority is not better than the current priority there is * nothing to do. */ if (pri >= cpri) return (0); /* * Always preempt idle. */ if (cpri >= PRI_MIN_IDLE) return (1); /* * If preemption is disabled don't preempt others. */ if (preempt_thresh == 0) return (0); /* * Preempt if we exceed the threshold. */ if (pri <= preempt_thresh) return (1); /* * If we're interactive or better and there is non-interactive * or worse running preempt only remote processors. */ if (remote && pri <= PRI_MAX_INTERACT && cpri > PRI_MAX_INTERACT) return (1); return (0); } /* * Add a thread to the actual run-queue. Keeps transferable counts up to * date with what is actually on the run-queue. Selects the correct * queue position for timeshare threads. */ static __inline void tdq_runq_add(struct tdq *tdq, struct thread *td, int flags) { struct td_sched *ts; u_char pri; TDQ_LOCK_ASSERT(tdq, MA_OWNED); THREAD_LOCK_BLOCKED_ASSERT(td, MA_OWNED); pri = td->td_priority; ts = td_get_sched(td); TD_SET_RUNQ(td); if (THREAD_CAN_MIGRATE(td)) { tdq->tdq_transferable++; ts->ts_flags |= TSF_XFERABLE; } if (pri < PRI_MIN_BATCH) { ts->ts_runq = &tdq->tdq_realtime; } else if (pri <= PRI_MAX_BATCH) { ts->ts_runq = &tdq->tdq_timeshare; KASSERT(pri <= PRI_MAX_BATCH && pri >= PRI_MIN_BATCH, ("Invalid priority %d on timeshare runq", pri)); /* * This queue contains only priorities between MIN and MAX * batch. Use the whole queue to represent these values. */ if ((flags & (SRQ_BORROWING|SRQ_PREEMPTED)) == 0) { pri = RQ_NQS * (pri - PRI_MIN_BATCH) / PRI_BATCH_RANGE; pri = (pri + tdq->tdq_idx) % RQ_NQS; /* * This effectively shortens the queue by one so we * can have a one slot difference between idx and * ridx while we wait for threads to drain. */ if (tdq->tdq_ridx != tdq->tdq_idx && pri == tdq->tdq_ridx) pri = (unsigned char)(pri - 1) % RQ_NQS; } else pri = tdq->tdq_ridx; runq_add_pri(ts->ts_runq, td, pri, flags); return; } else ts->ts_runq = &tdq->tdq_idle; runq_add(ts->ts_runq, td, flags); } /* * Remove a thread from a run-queue. This typically happens when a thread * is selected to run. 
Running threads are not on the queue and the * transferable count does not reflect them. */ static __inline void tdq_runq_rem(struct tdq *tdq, struct thread *td) { struct td_sched *ts; ts = td_get_sched(td); TDQ_LOCK_ASSERT(tdq, MA_OWNED); THREAD_LOCK_BLOCKED_ASSERT(td, MA_OWNED); KASSERT(ts->ts_runq != NULL, ("tdq_runq_remove: thread %p null ts_runq", td)); if (ts->ts_flags & TSF_XFERABLE) { tdq->tdq_transferable--; ts->ts_flags &= ~TSF_XFERABLE; } if (ts->ts_runq == &tdq->tdq_timeshare) { if (tdq->tdq_idx != tdq->tdq_ridx) runq_remove_idx(ts->ts_runq, td, &tdq->tdq_ridx); else runq_remove_idx(ts->ts_runq, td, NULL); } else runq_remove(ts->ts_runq, td); } /* * Load is maintained for all threads RUNNING and ON_RUNQ. Add the load * for this thread to the referenced thread queue. */ static void tdq_load_add(struct tdq *tdq, struct thread *td) { TDQ_LOCK_ASSERT(tdq, MA_OWNED); THREAD_LOCK_BLOCKED_ASSERT(td, MA_OWNED); tdq->tdq_load++; if ((td->td_flags & TDF_NOLOAD) == 0) tdq->tdq_sysload++; KTR_COUNTER0(KTR_SCHED, "load", tdq->tdq_loadname, tdq->tdq_load); SDT_PROBE2(sched, , , load__change, (int)TDQ_ID(tdq), tdq->tdq_load); } /* * Remove the load from a thread that is transitioning to a sleep state or * exiting. */ static void tdq_load_rem(struct tdq *tdq, struct thread *td) { TDQ_LOCK_ASSERT(tdq, MA_OWNED); THREAD_LOCK_BLOCKED_ASSERT(td, MA_OWNED); KASSERT(tdq->tdq_load != 0, ("tdq_load_rem: Removing with 0 load on queue %d", TDQ_ID(tdq))); tdq->tdq_load--; if ((td->td_flags & TDF_NOLOAD) == 0) tdq->tdq_sysload--; KTR_COUNTER0(KTR_SCHED, "load", tdq->tdq_loadname, tdq->tdq_load); SDT_PROBE2(sched, , , load__change, (int)TDQ_ID(tdq), tdq->tdq_load); } /* * Bound timeshare latency by decreasing slice size as load increases. We * consider the maximum latency as the sum of the threads waiting to run * aside from curthread and target no more than sched_slice latency but * no less than sched_slice_min runtime. */ static inline int tdq_slice(struct tdq *tdq) { int load; /* * It is safe to use sys_load here because this is called from * contexts where timeshare threads are running and so there * cannot be higher priority load in the system. */ load = tdq->tdq_sysload - 1; if (load >= SCHED_SLICE_MIN_DIVISOR) return (sched_slice_min); if (load <= 1) return (sched_slice); return (sched_slice / load); } /* * Set lowpri to its exact value by searching the run-queue and * evaluating curthread. curthread may be passed as an optimization. */ static void tdq_setlowpri(struct tdq *tdq, struct thread *ctd) { struct thread *td; TDQ_LOCK_ASSERT(tdq, MA_OWNED); if (ctd == NULL) ctd = tdq->tdq_curthread; td = tdq_choose(tdq); if (td == NULL || td->td_priority > ctd->td_priority) tdq->tdq_lowpri = ctd->td_priority; else tdq->tdq_lowpri = td->td_priority; } #ifdef SMP /* * We need some randomness. Implement a classic Linear Congruential * Generator X_{n+1}=(aX_n+c) mod m. These values are optimized for * m = 2^32, a = 69069 and c = 5. We only return the upper 16 bits * of the random state (in the low bits of our answer) to keep * the maximum randomness. */ static uint32_t sched_random(void) { uint32_t *rndptr; rndptr = DPCPU_PTR(randomval); *rndptr = *rndptr * 69069 + 5; return (*rndptr >> 16); } struct cpu_search { cpuset_t *cs_mask; /* The mask of allowed CPUs to choose from. */ int cs_prefer; /* Prefer this CPU and groups including it. */ int cs_running; /* The thread is now running at cs_prefer. */ int cs_pri; /* Min priority for low. */ int cs_load; /* Max load for low, min load for high. 
*/ int cs_trans; /* Min transferable load for high. */ }; struct cpu_search_res { int csr_cpu; /* The best CPU found. */ int csr_load; /* The load of cs_cpu. */ }; /* * Search the tree of cpu_groups for the lowest or highest loaded CPU. * These routines actually compare the load on all paths through the tree * and find the least loaded cpu on the least loaded path, which may differ * from the least loaded cpu in the system. This balances work among caches * and buses. */ static int cpu_search_lowest(const struct cpu_group *cg, const struct cpu_search *s, struct cpu_search_res *r) { struct cpu_search_res lr; struct tdq *tdq; int c, bload, l, load, p, total; total = 0; bload = INT_MAX; r->csr_cpu = -1; /* Loop through children CPU groups if there are any. */ if (cg->cg_children > 0) { for (c = cg->cg_children - 1; c >= 0; c--) { load = cpu_search_lowest(&cg->cg_child[c], s, &lr); total += load; /* * When balancing do not prefer SMT groups with load >1. * It allows round-robin between SMT groups with equal * load within parent group for more fair scheduling. */ if (__predict_false(s->cs_running) && (cg->cg_child[c].cg_flags & CG_FLAG_THREAD) && load >= 128 && (load & 128) != 0) load += 128; if (lr.csr_cpu >= 0 && (load < bload || (load == bload && lr.csr_load < r->csr_load))) { bload = load; r->csr_cpu = lr.csr_cpu; r->csr_load = lr.csr_load; } } return (total); } /* Loop through children CPUs otherwise. */ for (c = cg->cg_last; c >= cg->cg_first; c--) { if (!CPU_ISSET(c, &cg->cg_mask)) continue; tdq = TDQ_CPU(c); l = TDQ_LOAD(tdq); if (c == s->cs_prefer) { if (__predict_false(s->cs_running)) l--; p = 128; } else p = 0; load = l * 256; total += load - p; /* * Check this CPU is acceptable. * If the threads is already on the CPU, don't look on the TDQ * priority, since it can be the priority of the thread itself. */ if (l > s->cs_load || (atomic_load_char(&tdq->tdq_lowpri) <= s->cs_pri && (!s->cs_running || c != s->cs_prefer)) || !CPU_ISSET(c, s->cs_mask)) continue; /* * When balancing do not prefer CPUs with load > 1. * It allows round-robin between CPUs with equal load * within the CPU group for more fair scheduling. */ if (__predict_false(s->cs_running) && l > 0) p = 0; load -= sched_random() % 128; if (bload > load - p) { bload = load - p; r->csr_cpu = c; r->csr_load = load; } } return (total); } static int cpu_search_highest(const struct cpu_group *cg, const struct cpu_search *s, struct cpu_search_res *r) { struct cpu_search_res lr; struct tdq *tdq; int c, bload, l, load, total; total = 0; bload = INT_MIN; r->csr_cpu = -1; /* Loop through children CPU groups if there are any. */ if (cg->cg_children > 0) { for (c = cg->cg_children - 1; c >= 0; c--) { load = cpu_search_highest(&cg->cg_child[c], s, &lr); total += load; if (lr.csr_cpu >= 0 && (load > bload || (load == bload && lr.csr_load > r->csr_load))) { bload = load; r->csr_cpu = lr.csr_cpu; r->csr_load = lr.csr_load; } } return (total); } /* Loop through children CPUs otherwise. */ for (c = cg->cg_last; c >= cg->cg_first; c--) { if (!CPU_ISSET(c, &cg->cg_mask)) continue; tdq = TDQ_CPU(c); l = TDQ_LOAD(tdq); load = l * 256; total += load; /* * Check this CPU is acceptable. */ if (l < s->cs_load || TDQ_TRANSFERABLE(tdq) < s->cs_trans || !CPU_ISSET(c, s->cs_mask)) continue; load -= sched_random() % 256; if (load > bload) { bload = load; r->csr_cpu = c; } } r->csr_load = bload; return (total); } /* * Find the cpu with the least load via the least loaded path that has a * lowpri greater than pri pri. 
A pri of -1 indicates any priority is * acceptable. */ static inline int sched_lowest(const struct cpu_group *cg, cpuset_t *mask, int pri, int maxload, int prefer, int running) { struct cpu_search s; struct cpu_search_res r; s.cs_prefer = prefer; s.cs_running = running; s.cs_mask = mask; s.cs_pri = pri; s.cs_load = maxload; cpu_search_lowest(cg, &s, &r); return (r.csr_cpu); } /* * Find the cpu with the highest load via the highest loaded path. */ static inline int sched_highest(const struct cpu_group *cg, cpuset_t *mask, int minload, int mintrans) { struct cpu_search s; struct cpu_search_res r; s.cs_mask = mask; s.cs_load = minload; s.cs_trans = mintrans; cpu_search_highest(cg, &s, &r); return (r.csr_cpu); } static void sched_balance_group(struct cpu_group *cg) { struct tdq *tdq; struct thread *td; cpuset_t hmask, lmask; int high, low, anylow; CPU_FILL(&hmask); for (;;) { high = sched_highest(cg, &hmask, 1, 0); /* Stop if there is no more CPU with transferrable threads. */ if (high == -1) break; CPU_CLR(high, &hmask); CPU_COPY(&hmask, &lmask); /* Stop if there is no more CPU left for low. */ if (CPU_EMPTY(&lmask)) break; tdq = TDQ_CPU(high); if (TDQ_LOAD(tdq) == 1) { /* * There is only one running thread. We can't move * it from here, so tell it to pick new CPU by itself. */ TDQ_LOCK(tdq); td = tdq->tdq_curthread; if (td->td_lock == TDQ_LOCKPTR(tdq) && (td->td_flags & TDF_IDLETD) == 0 && THREAD_CAN_MIGRATE(td)) { td->td_flags |= TDF_PICKCPU; ast_sched_locked(td, TDA_SCHED); if (high != curcpu) ipi_cpu(high, IPI_AST); } TDQ_UNLOCK(tdq); break; } anylow = 1; nextlow: if (TDQ_TRANSFERABLE(tdq) == 0) continue; low = sched_lowest(cg, &lmask, -1, TDQ_LOAD(tdq) - 1, high, 1); /* Stop if we looked well and found no less loaded CPU. */ if (anylow && low == -1) break; /* Go to next high if we found no less loaded CPU. */ if (low == -1) continue; /* Transfer thread from high to low. */ if (sched_balance_pair(tdq, TDQ_CPU(low))) { /* CPU that got thread can no longer be a donor. */ CPU_CLR(low, &hmask); } else { /* * If failed, then there is no threads on high * that can run on this low. Drop low from low * mask and look for different one. */ CPU_CLR(low, &lmask); anylow = 0; goto nextlow; } } } static void sched_balance(void) { struct tdq *tdq; balance_ticks = max(balance_interval / 2, 1) + (sched_random() % balance_interval); tdq = TDQ_SELF(); TDQ_UNLOCK(tdq); sched_balance_group(cpu_top); TDQ_LOCK(tdq); } /* * Lock two thread queues using their address to maintain lock order. */ static void tdq_lock_pair(struct tdq *one, struct tdq *two) { if (one < two) { TDQ_LOCK(one); TDQ_LOCK_FLAGS(two, MTX_DUPOK); } else { TDQ_LOCK(two); TDQ_LOCK_FLAGS(one, MTX_DUPOK); } } /* * Unlock two thread queues. Order is not important here. */ static void tdq_unlock_pair(struct tdq *one, struct tdq *two) { TDQ_UNLOCK(one); TDQ_UNLOCK(two); } /* * Transfer load between two imbalanced thread queues. Returns true if a thread * was moved between the queues, and false otherwise. */ static bool sched_balance_pair(struct tdq *high, struct tdq *low) { int cpu, lowpri; bool ret; ret = false; tdq_lock_pair(high, low); /* * Transfer a thread from high to low. */ if (high->tdq_transferable != 0 && high->tdq_load > low->tdq_load) { lowpri = tdq_move(high, low); if (lowpri != -1) { /* * In case the target isn't the current CPU notify it of * the new load, possibly sending an IPI to force it to * reschedule. Otherwise maybe schedule a preemption. 
*/ cpu = TDQ_ID(low); if (cpu != PCPU_GET(cpuid)) tdq_notify(low, lowpri); else sched_setpreempt(low->tdq_lowpri); ret = true; } } tdq_unlock_pair(high, low); return (ret); } /* * Move a thread from one thread queue to another. Returns -1 if the source * queue was empty, else returns the maximum priority of all threads in * the destination queue prior to the addition of the new thread. In the latter * case, this priority can be used to determine whether an IPI needs to be * delivered. */ static int tdq_move(struct tdq *from, struct tdq *to) { struct thread *td; int cpu; TDQ_LOCK_ASSERT(from, MA_OWNED); TDQ_LOCK_ASSERT(to, MA_OWNED); cpu = TDQ_ID(to); td = tdq_steal(from, cpu); if (td == NULL) return (-1); /* * Although the run queue is locked the thread may be * blocked. We can not set the lock until it is unblocked. */ thread_lock_block_wait(td); sched_rem(td); THREAD_LOCKPTR_ASSERT(td, TDQ_LOCKPTR(from)); td->td_lock = TDQ_LOCKPTR(to); td_get_sched(td)->ts_cpu = cpu; return (tdq_add(to, td, SRQ_YIELDING)); } /* * This tdq has idled. Try to steal a thread from another cpu and switch * to it. */ static int tdq_idled(struct tdq *tdq) { struct cpu_group *cg, *parent; struct tdq *steal; cpuset_t mask; int cpu, switchcnt, goup; if (smp_started == 0 || steal_idle == 0 || tdq->tdq_cg == NULL) return (1); CPU_FILL(&mask); CPU_CLR(PCPU_GET(cpuid), &mask); restart: switchcnt = TDQ_SWITCHCNT(tdq); for (cg = tdq->tdq_cg, goup = 0; ; ) { cpu = sched_highest(cg, &mask, steal_thresh, 1); /* * We were assigned a thread but not preempted. Returning * 0 here will cause our caller to switch to it. */ if (TDQ_LOAD(tdq)) return (0); /* * We found no CPU to steal from in this group. Escalate to * the parent and repeat. But if parent has only two children * groups we can avoid searching this group again by searching * the other one specifically and then escalating two levels. */ if (cpu == -1) { if (goup) { cg = cg->cg_parent; goup = 0; } parent = cg->cg_parent; if (parent == NULL) return (1); if (parent->cg_children == 2) { if (cg == &parent->cg_child[0]) cg = &parent->cg_child[1]; else cg = &parent->cg_child[0]; goup = 1; } else cg = parent; continue; } steal = TDQ_CPU(cpu); /* * The data returned by sched_highest() is stale and * the chosen CPU no longer has an eligible thread. * * Testing this ahead of tdq_lock_pair() only catches * this situation about 20% of the time on an 8 core * 16 thread Ryzen 7, but it still helps performance. */ if (TDQ_LOAD(steal) < steal_thresh || TDQ_TRANSFERABLE(steal) == 0) goto restart; /* * Try to lock both queues. If we are assigned a thread while * waited for the lock, switch to it now instead of stealing. * If we can't get the lock, then somebody likely got there * first so continue searching. */ TDQ_LOCK(tdq); if (tdq->tdq_load > 0) { mi_switch(SW_VOL | SWT_IDLE); return (0); } if (TDQ_TRYLOCK_FLAGS(steal, MTX_DUPOK) == 0) { TDQ_UNLOCK(tdq); CPU_CLR(cpu, &mask); continue; } /* * The data returned by sched_highest() is stale and * the chosen CPU no longer has an eligible thread, or * we were preempted and the CPU loading info may be out * of date. The latter is rare. In either case restart * the search. */ if (TDQ_LOAD(steal) < steal_thresh || TDQ_TRANSFERABLE(steal) == 0 || switchcnt != TDQ_SWITCHCNT(tdq)) { tdq_unlock_pair(tdq, steal); goto restart; } /* * Steal the thread and switch to it. */ if (tdq_move(steal, tdq) != -1) break; /* * We failed to acquire a thread even though it looked * like one was available. 
This could be due to affinity * restrictions or for other reasons. Loop again after * removing this CPU from the set. The restart logic * above does not restore this CPU to the set due to the * likelyhood of failing here again. */ CPU_CLR(cpu, &mask); tdq_unlock_pair(tdq, steal); } TDQ_UNLOCK(steal); mi_switch(SW_VOL | SWT_IDLE); return (0); } /* * Notify a remote cpu of new work. Sends an IPI if criteria are met. * * "lowpri" is the minimum scheduling priority among all threads on * the queue prior to the addition of the new thread. */ static void tdq_notify(struct tdq *tdq, int lowpri) { int cpu; TDQ_LOCK_ASSERT(tdq, MA_OWNED); KASSERT(tdq->tdq_lowpri <= lowpri, ("tdq_notify: lowpri %d > tdq_lowpri %d", lowpri, tdq->tdq_lowpri)); if (tdq->tdq_owepreempt) return; /* * Check to see if the newly added thread should preempt the one * currently running. */ if (!sched_shouldpreempt(tdq->tdq_lowpri, lowpri, 1)) return; /* * Make sure that our caller's earlier update to tdq_load is * globally visible before we read tdq_cpu_idle. Idle thread * accesses both of them without locks, and the order is important. */ atomic_thread_fence_seq_cst(); /* * Try to figure out if we can signal the idle thread instead of sending * an IPI. This check is racy; at worst, we will deliever an IPI * unnecessarily. */ cpu = TDQ_ID(tdq); if (TD_IS_IDLETHREAD(tdq->tdq_curthread) && (atomic_load_int(&tdq->tdq_cpu_idle) == 0 || cpu_idle_wakeup(cpu))) return; /* * The run queues have been updated, so any switch on the remote CPU * will satisfy the preemption request. */ tdq->tdq_owepreempt = 1; ipi_cpu(cpu, IPI_PREEMPT); } /* * Steals load from a timeshare queue. Honors the rotating queue head * index. */ static struct thread * runq_steal_from(struct runq *rq, int cpu, u_char start) { struct rqbits *rqb; struct rqhead *rqh; struct thread *td, *first; int bit; int i; rqb = &rq->rq_status; bit = start & (RQB_BPW -1); first = NULL; again: for (i = RQB_WORD(start); i < RQB_LEN; bit = 0, i++) { if (rqb->rqb_bits[i] == 0) continue; if (bit == 0) bit = RQB_FFS(rqb->rqb_bits[i]); for (; bit < RQB_BPW; bit++) { if ((rqb->rqb_bits[i] & (1ul << bit)) == 0) continue; rqh = &rq->rq_queues[bit + (i << RQB_L2BPW)]; TAILQ_FOREACH(td, rqh, td_runq) { if (first) { if (THREAD_CAN_MIGRATE(td) && THREAD_CAN_SCHED(td, cpu)) return (td); } else first = td; } } } if (start != 0) { start = 0; goto again; } if (first && THREAD_CAN_MIGRATE(first) && THREAD_CAN_SCHED(first, cpu)) return (first); return (NULL); } /* * Steals load from a standard linear queue. */ static struct thread * runq_steal(struct runq *rq, int cpu) { struct rqhead *rqh; struct rqbits *rqb; struct thread *td; int word; int bit; rqb = &rq->rq_status; for (word = 0; word < RQB_LEN; word++) { if (rqb->rqb_bits[word] == 0) continue; for (bit = 0; bit < RQB_BPW; bit++) { if ((rqb->rqb_bits[word] & (1ul << bit)) == 0) continue; rqh = &rq->rq_queues[bit + (word << RQB_L2BPW)]; TAILQ_FOREACH(td, rqh, td_runq) if (THREAD_CAN_MIGRATE(td) && THREAD_CAN_SCHED(td, cpu)) return (td); } } return (NULL); } /* * Attempt to steal a thread in priority order from a thread queue. */ static struct thread * tdq_steal(struct tdq *tdq, int cpu) { struct thread *td; TDQ_LOCK_ASSERT(tdq, MA_OWNED); if ((td = runq_steal(&tdq->tdq_realtime, cpu)) != NULL) return (td); if ((td = runq_steal_from(&tdq->tdq_timeshare, cpu, tdq->tdq_ridx)) != NULL) return (td); return (runq_steal(&tdq->tdq_idle, cpu)); } /* * Sets the thread lock and ts_cpu to match the requested cpu. 
Unlocks the * current lock and returns with the assigned queue locked. */ static inline struct tdq * sched_setcpu(struct thread *td, int cpu, int flags) { struct tdq *tdq; struct mtx *mtx; THREAD_LOCK_ASSERT(td, MA_OWNED); tdq = TDQ_CPU(cpu); td_get_sched(td)->ts_cpu = cpu; /* * If the lock matches just return the queue. */ if (td->td_lock == TDQ_LOCKPTR(tdq)) { KASSERT((flags & SRQ_HOLD) == 0, ("sched_setcpu: Invalid lock for SRQ_HOLD")); return (tdq); } /* * The hard case, migration, we need to block the thread first to * prevent order reversals with other cpus locks. */ spinlock_enter(); mtx = thread_lock_block(td); if ((flags & SRQ_HOLD) == 0) mtx_unlock_spin(mtx); TDQ_LOCK(tdq); thread_lock_unblock(td, TDQ_LOCKPTR(tdq)); spinlock_exit(); return (tdq); } SCHED_STAT_DEFINE(pickcpu_intrbind, "Soft interrupt binding"); SCHED_STAT_DEFINE(pickcpu_idle_affinity, "Picked idle cpu based on affinity"); SCHED_STAT_DEFINE(pickcpu_affinity, "Picked cpu based on affinity"); SCHED_STAT_DEFINE(pickcpu_lowest, "Selected lowest load"); SCHED_STAT_DEFINE(pickcpu_local, "Migrated to current cpu"); SCHED_STAT_DEFINE(pickcpu_migration, "Selection may have caused migration"); static int sched_pickcpu(struct thread *td, int flags) { struct cpu_group *cg, *ccg; struct td_sched *ts; struct tdq *tdq; cpuset_t *mask; int cpu, pri, r, self, intr; self = PCPU_GET(cpuid); ts = td_get_sched(td); KASSERT(!CPU_ABSENT(ts->ts_cpu), ("sched_pickcpu: Start scheduler on " "absent CPU %d for thread %s.", ts->ts_cpu, td->td_name)); if (smp_started == 0) return (self); /* * Don't migrate a running thread from sched_switch(). */ if ((flags & SRQ_OURSELF) || !THREAD_CAN_MIGRATE(td)) return (ts->ts_cpu); /* * Prefer to run interrupt threads on the processors that generate * the interrupt. */ if (td->td_priority <= PRI_MAX_ITHD && THREAD_CAN_SCHED(td, self) && curthread->td_intr_nesting_level) { tdq = TDQ_SELF(); if (tdq->tdq_lowpri >= PRI_MIN_IDLE) { SCHED_STAT_INC(pickcpu_idle_affinity); return (self); } ts->ts_cpu = self; intr = 1; cg = tdq->tdq_cg; goto llc; } else { intr = 0; tdq = TDQ_CPU(ts->ts_cpu); cg = tdq->tdq_cg; } /* * If the thread can run on the last cpu and the affinity has not * expired and it is idle, run it there. */ if (THREAD_CAN_SCHED(td, ts->ts_cpu) && atomic_load_char(&tdq->tdq_lowpri) >= PRI_MIN_IDLE && SCHED_AFFINITY(ts, CG_SHARE_L2)) { if (cg->cg_flags & CG_FLAG_THREAD) { /* Check all SMT threads for being idle. */ for (cpu = cg->cg_first; cpu <= cg->cg_last; cpu++) { pri = atomic_load_char(&TDQ_CPU(cpu)->tdq_lowpri); if (CPU_ISSET(cpu, &cg->cg_mask) && pri < PRI_MIN_IDLE) break; } if (cpu > cg->cg_last) { SCHED_STAT_INC(pickcpu_idle_affinity); return (ts->ts_cpu); } } else { SCHED_STAT_INC(pickcpu_idle_affinity); return (ts->ts_cpu); } } llc: /* * Search for the last level cache CPU group in the tree. * Skip SMT, identical groups and caches with expired affinity. * Interrupt threads affinity is explicit and never expires. */ for (ccg = NULL; cg != NULL; cg = cg->cg_parent) { if (cg->cg_flags & CG_FLAG_THREAD) continue; if (cg->cg_children == 1 || cg->cg_count == 1) continue; if (cg->cg_level == CG_SHARE_NONE || (!intr && !SCHED_AFFINITY(ts, cg->cg_level))) continue; ccg = cg; } /* Found LLC shared by all CPUs, so do a global search. */ if (ccg == cpu_top) ccg = NULL; cpu = -1; mask = &td->td_cpuset->cs_mask; pri = td->td_priority; r = TD_IS_RUNNING(td); /* * Try hard to keep interrupts within found LLC. Search the LLC for * the least loaded CPU we can run now. 
For NUMA systems it should * be within target domain, and it also reduces scheduling overhead. */ if (ccg != NULL && intr) { cpu = sched_lowest(ccg, mask, pri, INT_MAX, ts->ts_cpu, r); if (cpu >= 0) SCHED_STAT_INC(pickcpu_intrbind); } else /* Search the LLC for the least loaded idle CPU we can run now. */ if (ccg != NULL) { cpu = sched_lowest(ccg, mask, max(pri, PRI_MAX_TIMESHARE), INT_MAX, ts->ts_cpu, r); if (cpu >= 0) SCHED_STAT_INC(pickcpu_affinity); } /* Search globally for the least loaded CPU we can run now. */ if (cpu < 0) { cpu = sched_lowest(cpu_top, mask, pri, INT_MAX, ts->ts_cpu, r); if (cpu >= 0) SCHED_STAT_INC(pickcpu_lowest); } /* Search globally for the least loaded CPU. */ if (cpu < 0) { cpu = sched_lowest(cpu_top, mask, -1, INT_MAX, ts->ts_cpu, r); if (cpu >= 0) SCHED_STAT_INC(pickcpu_lowest); } KASSERT(cpu >= 0, ("sched_pickcpu: Failed to find a cpu.")); KASSERT(!CPU_ABSENT(cpu), ("sched_pickcpu: Picked absent CPU %d.", cpu)); /* * Compare the lowest loaded cpu to current cpu. */ tdq = TDQ_CPU(cpu); if (THREAD_CAN_SCHED(td, self) && TDQ_SELF()->tdq_lowpri > pri && atomic_load_char(&tdq->tdq_lowpri) < PRI_MIN_IDLE && TDQ_LOAD(TDQ_SELF()) <= TDQ_LOAD(tdq) + 1) { SCHED_STAT_INC(pickcpu_local); cpu = self; } if (cpu != ts->ts_cpu) SCHED_STAT_INC(pickcpu_migration); return (cpu); } #endif /* * Pick the highest priority task we have and return it. */ static struct thread * tdq_choose(struct tdq *tdq) { struct thread *td; TDQ_LOCK_ASSERT(tdq, MA_OWNED); td = runq_choose(&tdq->tdq_realtime); if (td != NULL) return (td); td = runq_choose_from(&tdq->tdq_timeshare, tdq->tdq_ridx); if (td != NULL) { KASSERT(td->td_priority >= PRI_MIN_BATCH, ("tdq_choose: Invalid priority on timeshare queue %d", td->td_priority)); return (td); } td = runq_choose(&tdq->tdq_idle); if (td != NULL) { KASSERT(td->td_priority >= PRI_MIN_IDLE, ("tdq_choose: Invalid priority on idle queue %d", td->td_priority)); return (td); } return (NULL); } /* * Initialize a thread queue. */ static void tdq_setup(struct tdq *tdq, int id) { if (bootverbose) printf("ULE: setup cpu %d\n", id); runq_init(&tdq->tdq_realtime); runq_init(&tdq->tdq_timeshare); runq_init(&tdq->tdq_idle); tdq->tdq_id = id; snprintf(tdq->tdq_name, sizeof(tdq->tdq_name), "sched lock %d", (int)TDQ_ID(tdq)); mtx_init(&tdq->tdq_lock, tdq->tdq_name, "sched lock", MTX_SPIN); #ifdef KTR snprintf(tdq->tdq_loadname, sizeof(tdq->tdq_loadname), "CPU %d load", (int)TDQ_ID(tdq)); #endif } #ifdef SMP static void sched_setup_smp(void) { struct tdq *tdq; int i; cpu_top = smp_topo(); CPU_FOREACH(i) { tdq = DPCPU_ID_PTR(i, tdq); tdq_setup(tdq, i); tdq->tdq_cg = smp_topo_find(cpu_top, i); if (tdq->tdq_cg == NULL) panic("Can't find cpu group for %d\n", i); DPCPU_ID_SET(i, randomval, i * 69069 + 5); } PCPU_SET(sched, DPCPU_PTR(tdq)); balance_tdq = TDQ_SELF(); } #endif /* * Setup the thread queues and initialize the topology based on MD * information. */ static void sched_setup(void *dummy) { struct tdq *tdq; #ifdef SMP sched_setup_smp(); #else tdq_setup(TDQ_SELF(), 0); #endif tdq = TDQ_SELF(); /* Add thread0's load since it's running. */ TDQ_LOCK(tdq); thread0.td_lock = TDQ_LOCKPTR(tdq); tdq_load_add(tdq, &thread0); tdq->tdq_curthread = &thread0; tdq->tdq_lowpri = thread0.td_priority; TDQ_UNLOCK(tdq); } /* * This routine determines time constants after stathz and hz are setup. */ /* ARGSUSED */ static void sched_initticks(void *dummy) { int incr; realstathz = stathz ? 
stathz : hz; sched_slice = realstathz / SCHED_SLICE_DEFAULT_DIVISOR; sched_slice_min = sched_slice / SCHED_SLICE_MIN_DIVISOR; hogticks = imax(1, (2 * hz * sched_slice + realstathz / 2) / realstathz); /* * tickincr is shifted out by 10 to avoid rounding errors due to * hz not being evenly divisible by stathz on all platforms. */ incr = (hz << SCHED_TICK_SHIFT) / realstathz; /* * This does not work for values of stathz that are more than * 1 << SCHED_TICK_SHIFT * hz. In practice this does not happen. */ if (incr == 0) incr = 1; tickincr = incr; #ifdef SMP /* * Set the default balance interval now that we know * what realstathz is. */ balance_interval = realstathz; balance_ticks = balance_interval; affinity = SCHED_AFFINITY_DEFAULT; #endif if (sched_idlespinthresh < 0) sched_idlespinthresh = 2 * max(10000, 6 * hz) / realstathz; } /* * This is the core of the interactivity algorithm. Determines a score based * on past behavior. It is the ratio of sleep time to run time scaled to * a [0, 100] integer. This is the voluntary sleep time of a process, which * differs from the cpu usage because it does not account for time spent * waiting on a run-queue. Would be prettier if we had floating point. * * When a thread's sleep time is greater than its run time the * calculation is: * * scaling factor * interactivity score = --------------------- * sleep time / run time * * * When a thread's run time is greater than its sleep time the * calculation is: * * scaling factor * interactivity score = 2 * scaling factor - --------------------- * run time / sleep time */ static int sched_interact_score(struct thread *td) { struct td_sched *ts; int div; ts = td_get_sched(td); /* * The score is only needed if this is likely to be an interactive * task. Don't go through the expense of computing it if there's * no chance. */ if (sched_interact <= SCHED_INTERACT_HALF && ts->ts_runtime >= ts->ts_slptime) return (SCHED_INTERACT_HALF); if (ts->ts_runtime > ts->ts_slptime) { div = max(1, ts->ts_runtime / SCHED_INTERACT_HALF); return (SCHED_INTERACT_HALF + (SCHED_INTERACT_HALF - (ts->ts_slptime / div))); } if (ts->ts_slptime > ts->ts_runtime) { div = max(1, ts->ts_slptime / SCHED_INTERACT_HALF); return (ts->ts_runtime / div); } /* runtime == slptime */ if (ts->ts_runtime) return (SCHED_INTERACT_HALF); /* * This can happen if slptime and runtime are 0. */ return (0); } /* * Scale the scheduling priority according to the "interactivity" of this * process. */ static void sched_priority(struct thread *td) { u_int pri, score; if (PRI_BASE(td->td_pri_class) != PRI_TIMESHARE) return; /* * If the score is interactive we place the thread in the realtime * queue with a priority that is less than kernel and interrupt * priorities. These threads are not subject to nice restrictions. * * Scores greater than this are placed on the normal timeshare queue * where the priority is partially decided by the most recent cpu * utilization and the rest is decided by nice value. * * The nice value of the process has a linear effect on the calculated * score. Negative nice values make it easier for a thread to be * considered interactive. 
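 *
 * A worked example (illustrative, assuming the stock SCHED_INTERACT_HALF
 * of 50 and an interactivity threshold of 30): a thread that sleeps four
 * times as long as it runs scores roughly 50 / 4 = 12, which with nice 0
 * stays below the threshold and is placed in the interactive range.  A
 * thread that runs four times as long as it sleeps scores roughly
 * 2 * 50 - 50 / 4 = 88 and goes to the timeshare queue, where recent CPU
 * ticks and nice pick the exact priority.
 *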
*/ score = imax(0, sched_interact_score(td) + td->td_proc->p_nice); if (score < sched_interact) { pri = PRI_MIN_INTERACT; pri += (PRI_MAX_INTERACT - PRI_MIN_INTERACT + 1) * score / sched_interact; KASSERT(pri >= PRI_MIN_INTERACT && pri <= PRI_MAX_INTERACT, ("sched_priority: invalid interactive priority %u score %u", pri, score)); } else { pri = SCHED_PRI_MIN; if (td_get_sched(td)->ts_ticks) pri += min(SCHED_PRI_TICKS(td_get_sched(td)), SCHED_PRI_RANGE - 1); pri += SCHED_PRI_NICE(td->td_proc->p_nice); KASSERT(pri >= PRI_MIN_BATCH && pri <= PRI_MAX_BATCH, ("sched_priority: invalid priority %u: nice %d, " "ticks %d ftick %d ltick %d tick pri %d", pri, td->td_proc->p_nice, td_get_sched(td)->ts_ticks, td_get_sched(td)->ts_ftick, td_get_sched(td)->ts_ltick, SCHED_PRI_TICKS(td_get_sched(td)))); } sched_user_prio(td, pri); return; } /* * This routine enforces a maximum limit on the amount of scheduling history * kept. It is called after either the slptime or runtime is adjusted. This * function is ugly due to integer math. */ static void sched_interact_update(struct thread *td) { struct td_sched *ts; u_int sum; ts = td_get_sched(td); sum = ts->ts_runtime + ts->ts_slptime; if (sum < SCHED_SLP_RUN_MAX) return; /* * This only happens from two places: * 1) We have added an unusual amount of run time from fork_exit. * 2) We have added an unusual amount of sleep time from sched_sleep(). */ if (sum > SCHED_SLP_RUN_MAX * 2) { if (ts->ts_runtime > ts->ts_slptime) { ts->ts_runtime = SCHED_SLP_RUN_MAX; ts->ts_slptime = 1; } else { ts->ts_slptime = SCHED_SLP_RUN_MAX; ts->ts_runtime = 1; } return; } /* * If we have exceeded by more than 1/5th then the algorithm below * will not bring us back into range. Dividing by two here forces * us into the range of [4/5 * SCHED_INTERACT_MAX, SCHED_INTERACT_MAX] */ if (sum > (SCHED_SLP_RUN_MAX / 5) * 6) { ts->ts_runtime /= 2; ts->ts_slptime /= 2; return; } ts->ts_runtime = (ts->ts_runtime / 5) * 4; ts->ts_slptime = (ts->ts_slptime / 5) * 4; } /* * Scale back the interactivity history when a child thread is created. The * history is inherited from the parent but the thread may behave totally * differently. For example, a shell spawning a compiler process. We want * to learn that the compiler is behaving badly very quickly. */ static void sched_interact_fork(struct thread *td) { struct td_sched *ts; int ratio; int sum; ts = td_get_sched(td); sum = ts->ts_runtime + ts->ts_slptime; if (sum > SCHED_SLP_RUN_FORK) { ratio = sum / SCHED_SLP_RUN_FORK; ts->ts_runtime /= ratio; ts->ts_slptime /= ratio; } } /* * Called from proc0_init() to setup the scheduler fields. */ void schedinit(void) { struct td_sched *ts0; /* * Set up the scheduler specific parts of thread0. */ ts0 = td_get_sched(&thread0); ts0->ts_ltick = ticks; ts0->ts_ftick = ticks; ts0->ts_slice = 0; ts0->ts_cpu = curcpu; /* set valid CPU number */ } /* * schedinit_ap() is needed prior to calling sched_throw(NULL) to ensure that * the pcpu requirements are met for any calls in the period between curthread * initialization and sched_throw(). One can safely add threads to the queue * before sched_throw(), for instance, as long as the thread lock is setup * correctly. * * TDQ_SELF() relies on the below sched pcpu setting; it may be used only * after schedinit_ap(). 
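 *
 * Illustrative AP bring-up ordering (a sketch based on the code below,
 * not a statement about any particular MD port): the AP startup path is
 * expected to call schedinit_ap() first, which publishes the per-CPU tdq
 * pointer and points the idle thread's lock at it, and only then enter
 * the scheduler via sched_ap_entry(), which immediately uses TDQ_SELF()
 * and TDQ_LOCK().
 *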
*/ void schedinit_ap(void) { #ifdef SMP PCPU_SET(sched, DPCPU_PTR(tdq)); #endif PCPU_GET(idlethread)->td_lock = TDQ_LOCKPTR(TDQ_SELF()); } /* * This is only somewhat accurate since given many processes of the same * priority they will switch when their slices run out, which will be * at most sched_slice stathz ticks. */ int sched_rr_interval(void) { /* Convert sched_slice from stathz to hz. */ return (imax(1, (sched_slice * hz + realstathz / 2) / realstathz)); } /* * Update the percent cpu tracking information when it is requested or * the total history exceeds the maximum. We keep a sliding history of * tick counts that slowly decays. This is less precise than the 4BSD * mechanism since it happens with less regular and frequent events. */ static void sched_pctcpu_update(struct td_sched *ts, int run) { int t = ticks; /* * The signed difference may be negative if the thread hasn't run for * over half of the ticks rollover period. */ if ((u_int)(t - ts->ts_ltick) >= SCHED_TICK_TARG) { ts->ts_ticks = 0; ts->ts_ftick = t - SCHED_TICK_TARG; } else if (t - ts->ts_ftick >= SCHED_TICK_MAX) { ts->ts_ticks = (ts->ts_ticks / (ts->ts_ltick - ts->ts_ftick)) * (ts->ts_ltick - (t - SCHED_TICK_TARG)); ts->ts_ftick = t - SCHED_TICK_TARG; } if (run) ts->ts_ticks += (t - ts->ts_ltick) << SCHED_TICK_SHIFT; ts->ts_ltick = t; } /* * Adjust the priority of a thread. Move it to the appropriate run-queue * if necessary. This is the back-end for several priority related * functions. */ static void sched_thread_priority(struct thread *td, u_char prio) { struct tdq *tdq; int oldpri; KTR_POINT3(KTR_SCHED, "thread", sched_tdname(td), "prio", "prio:%d", td->td_priority, "new prio:%d", prio, KTR_ATTR_LINKED, sched_tdname(curthread)); SDT_PROBE3(sched, , , change__pri, td, td->td_proc, prio); if (td != curthread && prio < td->td_priority) { KTR_POINT3(KTR_SCHED, "thread", sched_tdname(curthread), "lend prio", "prio:%d", td->td_priority, "new prio:%d", prio, KTR_ATTR_LINKED, sched_tdname(td)); SDT_PROBE4(sched, , , lend__pri, td, td->td_proc, prio, curthread); } THREAD_LOCK_ASSERT(td, MA_OWNED); if (td->td_priority == prio) return; /* * If the priority has been elevated due to priority * propagation, we may have to move ourselves to a new * queue. This could be optimized to not re-add in some * cases. */ if (TD_ON_RUNQ(td) && prio < td->td_priority) { sched_rem(td); td->td_priority = prio; sched_add(td, SRQ_BORROWING | SRQ_HOLDTD); return; } /* * If the thread is currently running we may have to adjust the lowpri * information so other cpus are aware of our current priority. */ if (TD_IS_RUNNING(td)) { tdq = TDQ_CPU(td_get_sched(td)->ts_cpu); oldpri = td->td_priority; td->td_priority = prio; if (prio < tdq->tdq_lowpri) tdq->tdq_lowpri = prio; else if (tdq->tdq_lowpri == oldpri) tdq_setlowpri(tdq, td); return; } td->td_priority = prio; } /* * Update a thread's priority when it is lent another thread's * priority. */ void sched_lend_prio(struct thread *td, u_char prio) { td->td_flags |= TDF_BORROWING; sched_thread_priority(td, prio); } /* * Restore a thread's priority when priority propagation is * over. The prio argument is the minimum priority the thread * needs to have to satisfy other possible priority lending * requests. If the thread's regular priority is less * important than prio, the thread will keep a priority boost * of prio. 
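 *
 * Illustrative example: a thread with an effective base priority of 150
 * that was lent priority 100 through a turnstile.  If the remaining
 * waiters still require priority 120, unlending with prio == 120 keeps
 * the thread boosted at 120 with TDF_BORROWING still set; once the
 * remaining requirement is no better than 150, the flag is cleared and
 * the thread falls back to its base priority.
 *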
*/ void sched_unlend_prio(struct thread *td, u_char prio) { u_char base_pri; if (td->td_base_pri >= PRI_MIN_TIMESHARE && td->td_base_pri <= PRI_MAX_TIMESHARE) base_pri = td->td_user_pri; else base_pri = td->td_base_pri; if (prio >= base_pri) { td->td_flags &= ~TDF_BORROWING; sched_thread_priority(td, base_pri); } else sched_lend_prio(td, prio); } /* * Standard entry for setting the priority to an absolute value. */ void sched_prio(struct thread *td, u_char prio) { u_char oldprio; /* First, update the base priority. */ td->td_base_pri = prio; /* * If the thread is borrowing another thread's priority, don't * ever lower the priority. */ if (td->td_flags & TDF_BORROWING && td->td_priority < prio) return; /* Change the real priority. */ oldprio = td->td_priority; sched_thread_priority(td, prio); /* * If the thread is on a turnstile, then let the turnstile update * its state. */ if (TD_ON_LOCK(td) && oldprio != prio) turnstile_adjust(td, oldprio); } /* * Set the base interrupt thread priority. */ void sched_ithread_prio(struct thread *td, u_char prio) { THREAD_LOCK_ASSERT(td, MA_OWNED); MPASS(td->td_pri_class == PRI_ITHD); td->td_base_ithread_pri = prio; sched_prio(td, prio); } /* * Set the base user priority, does not effect current running priority. */ void sched_user_prio(struct thread *td, u_char prio) { td->td_base_user_pri = prio; if (td->td_lend_user_pri <= prio) return; td->td_user_pri = prio; } void sched_lend_user_prio(struct thread *td, u_char prio) { THREAD_LOCK_ASSERT(td, MA_OWNED); td->td_lend_user_pri = prio; td->td_user_pri = min(prio, td->td_base_user_pri); if (td->td_priority > td->td_user_pri) sched_prio(td, td->td_user_pri); else if (td->td_priority != td->td_user_pri) ast_sched_locked(td, TDA_SCHED); } /* * Like the above but first check if there is anything to do. */ void sched_lend_user_prio_cond(struct thread *td, u_char prio) { if (td->td_lend_user_pri != prio) goto lend; if (td->td_user_pri != min(prio, td->td_base_user_pri)) goto lend; if (td->td_priority != td->td_user_pri) goto lend; return; lend: thread_lock(td); sched_lend_user_prio(td, prio); thread_unlock(td); } #ifdef SMP /* * This tdq is about to idle. Try to steal a thread from another CPU before * choosing the idle thread. */ static void tdq_trysteal(struct tdq *tdq) { struct cpu_group *cg, *parent; struct tdq *steal; cpuset_t mask; int cpu, i, goup; if (smp_started == 0 || steal_idle == 0 || trysteal_limit == 0 || tdq->tdq_cg == NULL) return; CPU_FILL(&mask); CPU_CLR(PCPU_GET(cpuid), &mask); /* We don't want to be preempted while we're iterating. */ spinlock_enter(); TDQ_UNLOCK(tdq); for (i = 1, cg = tdq->tdq_cg, goup = 0; ; ) { cpu = sched_highest(cg, &mask, steal_thresh, 1); /* * If a thread was added while interrupts were disabled don't * steal one here. */ if (TDQ_LOAD(tdq) > 0) { TDQ_LOCK(tdq); break; } /* * We found no CPU to steal from in this group. Escalate to * the parent and repeat. But if parent has only two children * groups we can avoid searching this group again by searching * the other one specifically and then escalating two levels. 
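 *
 * Illustrative walk: with two SMT pairs sharing an L2 group, a search
 * that fails in our own pair is redirected at the sibling pair (goup is
 * set).  If that fails as well, both children of the L2 group have been
 * covered, so the loop resumes from the L2 group's parent instead of
 * re-scanning the L2 group itself.
 *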
*/ if (cpu == -1) { if (goup) { cg = cg->cg_parent; goup = 0; } if (++i > trysteal_limit) { TDQ_LOCK(tdq); break; } parent = cg->cg_parent; if (parent == NULL) { TDQ_LOCK(tdq); break; } if (parent->cg_children == 2) { if (cg == &parent->cg_child[0]) cg = &parent->cg_child[1]; else cg = &parent->cg_child[0]; goup = 1; } else cg = parent; continue; } steal = TDQ_CPU(cpu); /* * The data returned by sched_highest() is stale and * the chosen CPU no longer has an eligible thread. * At this point unconditionally exit the loop to bound * the time spent in the critcal section. */ if (TDQ_LOAD(steal) < steal_thresh || TDQ_TRANSFERABLE(steal) == 0) continue; /* * Try to lock both queues. If we are assigned a thread while * waited for the lock, switch to it now instead of stealing. * If we can't get the lock, then somebody likely got there * first. */ TDQ_LOCK(tdq); if (tdq->tdq_load > 0) break; if (TDQ_TRYLOCK_FLAGS(steal, MTX_DUPOK) == 0) break; /* * The data returned by sched_highest() is stale and * the chosen CPU no longer has an eligible thread. */ if (TDQ_LOAD(steal) < steal_thresh || TDQ_TRANSFERABLE(steal) == 0) { TDQ_UNLOCK(steal); break; } /* * If we fail to acquire one due to affinity restrictions, * bail out and let the idle thread to a more complete search * outside of a critical section. */ if (tdq_move(steal, tdq) == -1) { TDQ_UNLOCK(steal); break; } TDQ_UNLOCK(steal); break; } spinlock_exit(); } #endif /* * Handle migration from sched_switch(). This happens only for * cpu binding. */ static struct mtx * sched_switch_migrate(struct tdq *tdq, struct thread *td, int flags) { struct tdq *tdn; #ifdef SMP int lowpri; #endif KASSERT(THREAD_CAN_MIGRATE(td) || (td_get_sched(td)->ts_flags & TSF_BOUND) != 0, ("Thread %p shouldn't migrate", td)); KASSERT(!CPU_ABSENT(td_get_sched(td)->ts_cpu), ("sched_switch_migrate: " "thread %s queued on absent CPU %d.", td->td_name, td_get_sched(td)->ts_cpu)); tdn = TDQ_CPU(td_get_sched(td)->ts_cpu); #ifdef SMP tdq_load_rem(tdq, td); /* * Do the lock dance required to avoid LOR. We have an * extra spinlock nesting from sched_switch() which will * prevent preemption while we're holding neither run-queue lock. */ TDQ_UNLOCK(tdq); TDQ_LOCK(tdn); lowpri = tdq_add(tdn, td, flags); tdq_notify(tdn, lowpri); TDQ_UNLOCK(tdn); TDQ_LOCK(tdq); #endif return (TDQ_LOCKPTR(tdn)); } /* * thread_lock_unblock() that does not assume td_lock is blocked. */ static inline void thread_unblock_switch(struct thread *td, struct mtx *mtx) { atomic_store_rel_ptr((volatile uintptr_t *)&td->td_lock, (uintptr_t)mtx); } /* * Switch threads. This function has to handle threads coming in while * blocked for some reason, running, or idle. It also must deal with * migrating a thread from one queue to another as running threads may * be assigned elsewhere via binding. 
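 *
 * In outline (an illustrative summary of the cases below): an idle
 * thread is simply marked able to run again; a still-runnable thread is
 * either put back on this CPU's run-queue or handed to
 * sched_switch_migrate() when its ts_cpu points elsewhere; any other
 * thread is assumed to be going to sleep, so its load is removed and,
 * if this queue is then empty, we try to steal work before the idle
 * thread is chosen.
 *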
*/ void sched_switch(struct thread *td, int flags) { struct thread *newtd; struct tdq *tdq; struct td_sched *ts; struct mtx *mtx; int srqflag; int cpuid, preempted; #ifdef SMP int pickcpu; #endif THREAD_LOCK_ASSERT(td, MA_OWNED); cpuid = PCPU_GET(cpuid); tdq = TDQ_SELF(); ts = td_get_sched(td); sched_pctcpu_update(ts, 1); #ifdef SMP pickcpu = (td->td_flags & TDF_PICKCPU) != 0; if (pickcpu) ts->ts_rltick = ticks - affinity * MAX_CACHE_LEVELS; else ts->ts_rltick = ticks; #endif td->td_lastcpu = td->td_oncpu; preempted = (td->td_flags & TDF_SLICEEND) == 0 && (flags & SW_PREEMPT) != 0; td->td_flags &= ~(TDF_PICKCPU | TDF_SLICEEND); ast_unsched_locked(td, TDA_SCHED); td->td_owepreempt = 0; atomic_store_char(&tdq->tdq_owepreempt, 0); if (!TD_IS_IDLETHREAD(td)) TDQ_SWITCHCNT_INC(tdq); /* * Always block the thread lock so we can drop the tdq lock early. */ mtx = thread_lock_block(td); spinlock_enter(); if (TD_IS_IDLETHREAD(td)) { MPASS(mtx == TDQ_LOCKPTR(tdq)); TD_SET_CAN_RUN(td); } else if (TD_IS_RUNNING(td)) { MPASS(mtx == TDQ_LOCKPTR(tdq)); srqflag = preempted ? SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED : SRQ_OURSELF|SRQ_YIELDING; #ifdef SMP if (THREAD_CAN_MIGRATE(td) && (!THREAD_CAN_SCHED(td, ts->ts_cpu) || pickcpu)) ts->ts_cpu = sched_pickcpu(td, 0); #endif if (ts->ts_cpu == cpuid) tdq_runq_add(tdq, td, srqflag); else mtx = sched_switch_migrate(tdq, td, srqflag); } else { /* This thread must be going to sleep. */ if (mtx != TDQ_LOCKPTR(tdq)) { mtx_unlock_spin(mtx); TDQ_LOCK(tdq); } tdq_load_rem(tdq, td); #ifdef SMP if (tdq->tdq_load == 0) tdq_trysteal(tdq); #endif } #if (KTR_COMPILE & KTR_SCHED) != 0 if (TD_IS_IDLETHREAD(td)) KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "idle", "prio:%d", td->td_priority); else KTR_STATE3(KTR_SCHED, "thread", sched_tdname(td), KTDSTATE(td), "prio:%d", td->td_priority, "wmesg:\"%s\"", td->td_wmesg, "lockname:\"%s\"", td->td_lockname); #endif /* * We enter here with the thread blocked and assigned to the * appropriate cpu run-queue or sleep-queue and with the current * thread-queue locked. */ TDQ_LOCK_ASSERT(tdq, MA_OWNED | MA_NOTRECURSED); MPASS(td == tdq->tdq_curthread); newtd = choosethread(); sched_pctcpu_update(td_get_sched(newtd), 0); TDQ_UNLOCK(tdq); /* * Call the MD code to switch contexts if necessary. */ if (td != newtd) { #ifdef HWPMC_HOOKS if (PMC_PROC_IS_USING_PMCS(td->td_proc)) PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT); #endif SDT_PROBE2(sched, , , off__cpu, newtd, newtd->td_proc); #ifdef KDTRACE_HOOKS /* * If DTrace has set the active vtime enum to anything * other than INACTIVE (0), then it should have set the * function to call. */ if (dtrace_vtime_active) (*dtrace_vtime_switch_func)(newtd); #endif td->td_oncpu = NOCPU; cpu_switch(td, newtd, mtx); cpuid = td->td_oncpu = PCPU_GET(cpuid); SDT_PROBE0(sched, , , on__cpu); #ifdef HWPMC_HOOKS if (PMC_PROC_IS_USING_PMCS(td->td_proc)) PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN); #endif } else { thread_unblock_switch(td, mtx); SDT_PROBE0(sched, , , remain__cpu); } KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count %d", curthread->td_md.md_spinlock_count)); KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "running", "prio:%d", td->td_priority); } /* * Adjust thread priorities as a result of a nice request. 
*/ void sched_nice(struct proc *p, int nice) { struct thread *td; PROC_LOCK_ASSERT(p, MA_OWNED); p->p_nice = nice; FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); sched_priority(td); sched_prio(td, td->td_base_user_pri); thread_unlock(td); } } /* * Record the sleep time for the interactivity scorer. */ void sched_sleep(struct thread *td, int prio) { THREAD_LOCK_ASSERT(td, MA_OWNED); td->td_slptick = ticks; if (TD_IS_SUSPENDED(td) || prio >= PSOCK) td->td_flags |= TDF_CANSWAP; if (PRI_BASE(td->td_pri_class) != PRI_TIMESHARE) return; if (static_boost == 1 && prio) sched_prio(td, prio); else if (static_boost && td->td_priority > static_boost) sched_prio(td, static_boost); } /* * Schedule a thread to resume execution and record how long it voluntarily * slept. We also update the pctcpu, interactivity, and priority. * * Requires the thread lock on entry, drops on exit. */ void sched_wakeup(struct thread *td, int srqflags) { struct td_sched *ts; int slptick; THREAD_LOCK_ASSERT(td, MA_OWNED); ts = td_get_sched(td); td->td_flags &= ~TDF_CANSWAP; /* * If we slept for more than a tick update our interactivity and * priority. */ slptick = td->td_slptick; td->td_slptick = 0; if (slptick && slptick != ticks) { ts->ts_slptime += (ticks - slptick) << SCHED_TICK_SHIFT; sched_interact_update(td); sched_pctcpu_update(ts, 0); } /* * When resuming an idle ithread, restore its base ithread * priority. */ if (PRI_BASE(td->td_pri_class) == PRI_ITHD && td->td_priority != td->td_base_ithread_pri) sched_prio(td, td->td_base_ithread_pri); /* * Reset the slice value since we slept and advanced the round-robin. */ ts->ts_slice = 0; sched_add(td, SRQ_BORING | srqflags); } /* * Penalize the parent for creating a new child and initialize the child's * priority. */ void sched_fork(struct thread *td, struct thread *child) { THREAD_LOCK_ASSERT(td, MA_OWNED); sched_pctcpu_update(td_get_sched(td), 1); sched_fork_thread(td, child); /* * Penalize the parent and child for forking. */ sched_interact_fork(child); sched_priority(child); td_get_sched(td)->ts_runtime += tickincr; sched_interact_update(td); sched_priority(td); } /* * Fork a new thread, may be within the same process. */ void sched_fork_thread(struct thread *td, struct thread *child) { struct td_sched *ts; struct td_sched *ts2; struct tdq *tdq; tdq = TDQ_SELF(); THREAD_LOCK_ASSERT(td, MA_OWNED); /* * Initialize child. */ ts = td_get_sched(td); ts2 = td_get_sched(child); child->td_oncpu = NOCPU; child->td_lastcpu = NOCPU; child->td_lock = TDQ_LOCKPTR(tdq); child->td_cpuset = cpuset_ref(td->td_cpuset); child->td_domain.dr_policy = td->td_cpuset->cs_domain; ts2->ts_cpu = ts->ts_cpu; ts2->ts_flags = 0; /* * Grab our parents cpu estimation information. */ ts2->ts_ticks = ts->ts_ticks; ts2->ts_ltick = ts->ts_ltick; ts2->ts_ftick = ts->ts_ftick; /* * Do not inherit any borrowed priority from the parent. */ child->td_priority = child->td_base_pri; /* * And update interactivity score. */ ts2->ts_slptime = ts->ts_slptime; ts2->ts_runtime = ts->ts_runtime; /* Attempt to quickly learn interactivity. */ ts2->ts_slice = tdq_slice(tdq) - sched_slice_min; #ifdef KTR bzero(ts2->ts_name, sizeof(ts2->ts_name)); #endif } /* * Adjust the priority class of a thread. */ void sched_class(struct thread *td, int class) { THREAD_LOCK_ASSERT(td, MA_OWNED); if (td->td_pri_class == class) return; td->td_pri_class = class; } /* * Return some of the child's priority and interactivity to the parent. 
*/ void sched_exit(struct proc *p, struct thread *child) { struct thread *td; KTR_STATE1(KTR_SCHED, "thread", sched_tdname(child), "proc exit", "prio:%d", child->td_priority); PROC_LOCK_ASSERT(p, MA_OWNED); td = FIRST_THREAD_IN_PROC(p); sched_exit_thread(td, child); } /* * Penalize another thread for the time spent on this one. This helps to * worsen the priority and interactivity of processes which schedule batch * jobs such as make. This has little effect on the make process itself but * causes new processes spawned by it to receive worse scores immediately. */ void sched_exit_thread(struct thread *td, struct thread *child) { KTR_STATE1(KTR_SCHED, "thread", sched_tdname(child), "thread exit", "prio:%d", child->td_priority); /* * Give the child's runtime to the parent without returning the * sleep time as a penalty to the parent. This causes shells that * launch expensive things to mark their children as expensive. */ thread_lock(td); td_get_sched(td)->ts_runtime += td_get_sched(child)->ts_runtime; sched_interact_update(td); sched_priority(td); thread_unlock(td); } void sched_preempt(struct thread *td) { struct tdq *tdq; int flags; SDT_PROBE2(sched, , , surrender, td, td->td_proc); thread_lock(td); tdq = TDQ_SELF(); TDQ_LOCK_ASSERT(tdq, MA_OWNED); if (td->td_priority > tdq->tdq_lowpri) { if (td->td_critnest == 1) { flags = SW_INVOL | SW_PREEMPT; flags |= TD_IS_IDLETHREAD(td) ? SWT_REMOTEWAKEIDLE : SWT_REMOTEPREEMPT; mi_switch(flags); /* Switch dropped thread lock. */ return; } td->td_owepreempt = 1; } else { tdq->tdq_owepreempt = 0; } thread_unlock(td); } /* * Fix priorities on return to user-space. Priorities may be elevated due * to static priorities in msleep() or similar. */ void sched_userret_slowpath(struct thread *td) { thread_lock(td); td->td_priority = td->td_user_pri; td->td_base_pri = td->td_user_pri; tdq_setlowpri(TDQ_SELF(), td); thread_unlock(td); } SCHED_STAT_DEFINE(ithread_demotions, "Interrupt thread priority demotions"); SCHED_STAT_DEFINE(ithread_preemptions, "Interrupt thread preemptions due to time-sharing"); /* * Return time slice for a given thread. For ithreads this is * sched_slice. For other threads it is tdq_slice(tdq). */ static inline int td_slice(struct thread *td, struct tdq *tdq) { if (PRI_BASE(td->td_pri_class) == PRI_ITHD) return (sched_slice); return (tdq_slice(tdq)); } /* * Handle a stathz tick. This is really only relevant for timeshare * and interrupt threads. */ void sched_clock(struct thread *td, int cnt) { struct tdq *tdq; struct td_sched *ts; THREAD_LOCK_ASSERT(td, MA_OWNED); tdq = TDQ_SELF(); #ifdef SMP /* * We run the long term load balancer infrequently on the first cpu. */ if (balance_tdq == tdq && smp_started != 0 && rebalance != 0 && balance_ticks != 0) { balance_ticks -= cnt; if (balance_ticks <= 0) sched_balance(); } #endif /* * Save the old switch count so we have a record of the last ticks * activity. Initialize the new switch count based on our load. * If there is some activity seed it to reflect that. */ tdq->tdq_oldswitchcnt = tdq->tdq_switchcnt; tdq->tdq_switchcnt = tdq->tdq_load; /* * Advance the insert index once for each tick to ensure that all * threads get a chance to run. 
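 *
 * Illustrative picture (see tdq_runq_add() for the insertion side):
 * batch threads are inserted at an offset from tdq_idx while
 * tdq_choose() drains the queue at tdq_ridx, so nudging tdq_idx forward
 * whenever it catches up with tdq_ridx makes newly queued threads land
 * behind those already waiting, and the removal index eventually sweeps
 * every timeshare queue.
 *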
*/ if (tdq->tdq_idx == tdq->tdq_ridx) { tdq->tdq_idx = (tdq->tdq_idx + 1) % RQ_NQS; if (TAILQ_EMPTY(&tdq->tdq_timeshare.rq_queues[tdq->tdq_ridx])) tdq->tdq_ridx = tdq->tdq_idx; } ts = td_get_sched(td); sched_pctcpu_update(ts, 1); if ((td->td_pri_class & PRI_FIFO_BIT) || TD_IS_IDLETHREAD(td)) return; if (PRI_BASE(td->td_pri_class) == PRI_TIMESHARE) { /* * We used a tick; charge it to the thread so * that we can compute our interactivity. */ td_get_sched(td)->ts_runtime += tickincr * cnt; sched_interact_update(td); sched_priority(td); } /* * Force a context switch if the current thread has used up a full * time slice (default is 100ms). */ ts->ts_slice += cnt; if (ts->ts_slice >= td_slice(td, tdq)) { ts->ts_slice = 0; /* * If an ithread uses a full quantum, demote its * priority and preempt it. */ if (PRI_BASE(td->td_pri_class) == PRI_ITHD) { SCHED_STAT_INC(ithread_preemptions); td->td_owepreempt = 1; if (td->td_base_pri + RQ_PPQ < PRI_MAX_ITHD) { SCHED_STAT_INC(ithread_demotions); sched_prio(td, td->td_base_pri + RQ_PPQ); } } else { ast_sched_locked(td, TDA_SCHED); td->td_flags |= TDF_SLICEEND; } } } u_int sched_estcpu(struct thread *td __unused) { return (0); } /* * Return whether the current CPU has runnable tasks. Used for in-kernel * cooperative idle threads. */ int sched_runnable(void) { struct tdq *tdq; int load; load = 1; tdq = TDQ_SELF(); if ((curthread->td_flags & TDF_IDLETD) != 0) { if (TDQ_LOAD(tdq) > 0) goto out; } else if (TDQ_LOAD(tdq) - 1 > 0) goto out; load = 0; out: return (load); } /* * Choose the highest priority thread to run. The thread is removed from * the run-queue while running however the load remains. */ struct thread * sched_choose(void) { struct thread *td; struct tdq *tdq; tdq = TDQ_SELF(); TDQ_LOCK_ASSERT(tdq, MA_OWNED); td = tdq_choose(tdq); if (td != NULL) { tdq_runq_rem(tdq, td); tdq->tdq_lowpri = td->td_priority; } else { tdq->tdq_lowpri = PRI_MAX_IDLE; td = PCPU_GET(idlethread); } tdq->tdq_curthread = td; return (td); } /* * Set owepreempt if the currently running thread has lower priority than "pri". * Preemption never happens directly in ULE, we always request it once we exit a * critical section. */ static void sched_setpreempt(int pri) { struct thread *ctd; int cpri; ctd = curthread; THREAD_LOCK_ASSERT(ctd, MA_OWNED); cpri = ctd->td_priority; if (pri < cpri) ast_sched_locked(ctd, TDA_SCHED); if (KERNEL_PANICKED() || pri >= cpri || cold || TD_IS_INHIBITED(ctd)) return; if (!sched_shouldpreempt(pri, cpri, 0)) return; ctd->td_owepreempt = 1; } /* * Add a thread to a thread queue. Select the appropriate runq and add the * thread to it. This is the internal function called when the tdq is * predetermined. */ static int tdq_add(struct tdq *tdq, struct thread *td, int flags) { int lowpri; TDQ_LOCK_ASSERT(tdq, MA_OWNED); THREAD_LOCK_BLOCKED_ASSERT(td, MA_OWNED); KASSERT((td->td_inhibitors == 0), ("sched_add: trying to run inhibited thread")); KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)), ("sched_add: bad thread state")); KASSERT(td->td_flags & TDF_INMEM, ("sched_add: thread swapped out")); lowpri = tdq->tdq_lowpri; if (td->td_priority < lowpri) tdq->tdq_lowpri = td->td_priority; tdq_runq_add(tdq, td, flags); tdq_load_add(tdq, td); return (lowpri); } /* * Select the target thread queue and add a thread to it. Request * preemption or IPI a remote processor if required. * * Requires the thread lock on entry, drops on exit. 
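 *
 * Illustrative SMP path (a sketch of the calls below): a wakeup goes
 * sched_wakeup() -> sched_add() -> sched_pickcpu() to choose a CPU,
 * sched_setcpu() to move the thread lock to that CPU's tdq, tdq_add()
 * to enqueue, and then either tdq_notify() to kick the remote CPU or
 * sched_setpreempt() to request local preemption.
 *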
*/ void sched_add(struct thread *td, int flags) { struct tdq *tdq; #ifdef SMP int cpu, lowpri; #endif KTR_STATE2(KTR_SCHED, "thread", sched_tdname(td), "runq add", "prio:%d", td->td_priority, KTR_ATTR_LINKED, sched_tdname(curthread)); KTR_POINT1(KTR_SCHED, "thread", sched_tdname(curthread), "wokeup", KTR_ATTR_LINKED, sched_tdname(td)); SDT_PROBE4(sched, , , enqueue, td, td->td_proc, NULL, flags & SRQ_PREEMPTED); THREAD_LOCK_ASSERT(td, MA_OWNED); /* * Recalculate the priority before we select the target cpu or * run-queue. */ if (PRI_BASE(td->td_pri_class) == PRI_TIMESHARE) sched_priority(td); #ifdef SMP /* * Pick the destination cpu and if it isn't ours transfer to the * target cpu. */ cpu = sched_pickcpu(td, flags); tdq = sched_setcpu(td, cpu, flags); lowpri = tdq_add(tdq, td, flags); if (cpu != PCPU_GET(cpuid)) tdq_notify(tdq, lowpri); else if (!(flags & SRQ_YIELDING)) sched_setpreempt(td->td_priority); #else tdq = TDQ_SELF(); /* * Now that the thread is moving to the run-queue, set the lock * to the scheduler's lock. */ if (td->td_lock != TDQ_LOCKPTR(tdq)) { TDQ_LOCK(tdq); if ((flags & SRQ_HOLD) != 0) td->td_lock = TDQ_LOCKPTR(tdq); else thread_lock_set(td, TDQ_LOCKPTR(tdq)); } (void)tdq_add(tdq, td, flags); if (!(flags & SRQ_YIELDING)) sched_setpreempt(td->td_priority); #endif if (!(flags & SRQ_HOLDTD)) thread_unlock(td); } /* * Remove a thread from a run-queue without running it. This is used * when we're stealing a thread from a remote queue. Otherwise all threads * exit by calling sched_exit_thread() and sched_throw() themselves. */ void sched_rem(struct thread *td) { struct tdq *tdq; KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "runq rem", "prio:%d", td->td_priority); SDT_PROBE3(sched, , , dequeue, td, td->td_proc, NULL); tdq = TDQ_CPU(td_get_sched(td)->ts_cpu); TDQ_LOCK_ASSERT(tdq, MA_OWNED); MPASS(td->td_lock == TDQ_LOCKPTR(tdq)); KASSERT(TD_ON_RUNQ(td), ("sched_rem: thread not on run queue")); tdq_runq_rem(tdq, td); tdq_load_rem(tdq, td); TD_SET_CAN_RUN(td); if (td->td_priority == tdq->tdq_lowpri) tdq_setlowpri(tdq, NULL); } /* * Fetch cpu utilization information. Updates on demand. */ fixpt_t sched_pctcpu(struct thread *td) { fixpt_t pctcpu; struct td_sched *ts; pctcpu = 0; ts = td_get_sched(td); THREAD_LOCK_ASSERT(td, MA_OWNED); sched_pctcpu_update(ts, TD_IS_RUNNING(td)); if (ts->ts_ticks) { int rtick; /* How many rtick per second ? */ rtick = min(SCHED_TICK_HZ(ts) / SCHED_TICK_SECS, hz); pctcpu = (FSCALE * ((FSCALE * rtick)/hz)) >> FSHIFT; } return (pctcpu); } /* * Enforce affinity settings for a thread. Called after adjustments to * cpumask. */ void sched_affinity(struct thread *td) { #ifdef SMP struct td_sched *ts; THREAD_LOCK_ASSERT(td, MA_OWNED); ts = td_get_sched(td); if (THREAD_CAN_SCHED(td, ts->ts_cpu)) return; if (TD_ON_RUNQ(td)) { sched_rem(td); sched_add(td, SRQ_BORING | SRQ_HOLDTD); return; } if (!TD_IS_RUNNING(td)) return; /* * Force a switch before returning to userspace. If the * target thread is not running locally send an ipi to force * the issue. */ ast_sched_locked(td, TDA_SCHED); if (td != curthread) ipi_cpu(ts->ts_cpu, IPI_PREEMPT); #endif } /* * Bind a thread to a target cpu. 
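 *
 * Illustrative usage sketch (the usual pattern for code that must run on
 * a particular CPU; 'cpu' is a hypothetical target).  sched_bind()
 * returns already running on 'cpu':
 *
 *	thread_lock(curthread);
 *	sched_bind(curthread, cpu);
 *	thread_unlock(curthread);
 *	... per-CPU work ...
 *	thread_lock(curthread);
 *	sched_unbind(curthread);
 *	thread_unlock(curthread);
 *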
*/ void sched_bind(struct thread *td, int cpu) { struct td_sched *ts; THREAD_LOCK_ASSERT(td, MA_OWNED|MA_NOTRECURSED); KASSERT(td == curthread, ("sched_bind: can only bind curthread")); ts = td_get_sched(td); if (ts->ts_flags & TSF_BOUND) sched_unbind(td); KASSERT(THREAD_CAN_MIGRATE(td), ("%p must be migratable", td)); ts->ts_flags |= TSF_BOUND; sched_pin(); if (PCPU_GET(cpuid) == cpu) return; ts->ts_cpu = cpu; /* When we return from mi_switch we'll be on the correct cpu. */ mi_switch(SW_VOL | SWT_BIND); thread_lock(td); } /* * Release a bound thread. */ void sched_unbind(struct thread *td) { struct td_sched *ts; THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT(td == curthread, ("sched_unbind: can only bind curthread")); ts = td_get_sched(td); if ((ts->ts_flags & TSF_BOUND) == 0) return; ts->ts_flags &= ~TSF_BOUND; sched_unpin(); } int sched_is_bound(struct thread *td) { THREAD_LOCK_ASSERT(td, MA_OWNED); return (td_get_sched(td)->ts_flags & TSF_BOUND); } /* * Basic yield call. */ void sched_relinquish(struct thread *td) { thread_lock(td); mi_switch(SW_VOL | SWT_RELINQUISH); } /* * Return the total system load. */ int sched_load(void) { #ifdef SMP int total; int i; total = 0; CPU_FOREACH(i) total += atomic_load_int(&TDQ_CPU(i)->tdq_sysload); return (total); #else return (atomic_load_int(&TDQ_SELF()->tdq_sysload)); #endif } int sched_sizeof_proc(void) { return (sizeof(struct proc)); } int sched_sizeof_thread(void) { return (sizeof(struct thread) + sizeof(struct td_sched)); } #ifdef SMP #define TDQ_IDLESPIN(tdq) \ ((tdq)->tdq_cg != NULL && ((tdq)->tdq_cg->cg_flags & CG_FLAG_THREAD) == 0) #else #define TDQ_IDLESPIN(tdq) 1 #endif /* * The actual idle process. */ void sched_idletd(void *dummy) { struct thread *td; struct tdq *tdq; int oldswitchcnt, switchcnt; int i; mtx_assert(&Giant, MA_NOTOWNED); td = curthread; tdq = TDQ_SELF(); THREAD_NO_SLEEPING(); oldswitchcnt = -1; for (;;) { if (TDQ_LOAD(tdq)) { thread_lock(td); mi_switch(SW_VOL | SWT_IDLE); } switchcnt = TDQ_SWITCHCNT(tdq); #ifdef SMP if (always_steal || switchcnt != oldswitchcnt) { oldswitchcnt = switchcnt; if (tdq_idled(tdq) == 0) continue; } switchcnt = TDQ_SWITCHCNT(tdq); #else oldswitchcnt = switchcnt; #endif /* * If we're switching very frequently, spin while checking * for load rather than entering a low power state that * may require an IPI. However, don't do any busy * loops while on SMT machines as this simply steals * cycles from cores doing useful work. */ if (TDQ_IDLESPIN(tdq) && switchcnt > sched_idlespinthresh) { for (i = 0; i < sched_idlespins; i++) { if (TDQ_LOAD(tdq)) break; cpu_spinwait(); } } /* If there was context switch during spin, restart it. */ switchcnt = TDQ_SWITCHCNT(tdq); if (TDQ_LOAD(tdq) != 0 || switchcnt != oldswitchcnt) continue; /* Run main MD idle handler. */ atomic_store_int(&tdq->tdq_cpu_idle, 1); /* * Make sure that the tdq_cpu_idle update is globally visible * before cpu_idle() reads tdq_load. The order is important * to avoid races with tdq_notify(). */ atomic_thread_fence_seq_cst(); /* * Checking for again after the fence picks up assigned * threads often enough to make it worthwhile to do so in * order to avoid calling cpu_idle(). */ if (TDQ_LOAD(tdq) != 0) { atomic_store_int(&tdq->tdq_cpu_idle, 0); continue; } cpu_idle(switchcnt * 4 > sched_idlespinthresh); atomic_store_int(&tdq->tdq_cpu_idle, 0); /* * Account thread-less hardware interrupts and * other wakeup reasons equal to context switches. 
*/ switchcnt = TDQ_SWITCHCNT(tdq); if (switchcnt != oldswitchcnt) continue; TDQ_SWITCHCNT_INC(tdq); oldswitchcnt++; } } /* * sched_throw_grab() chooses a thread from the queue to switch to * next. It returns with the tdq lock dropped in a spinlock section to * keep interrupts disabled until the CPU is running in a proper threaded * context. */ static struct thread * sched_throw_grab(struct tdq *tdq) { struct thread *newtd; newtd = choosethread(); spinlock_enter(); TDQ_UNLOCK(tdq); KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count %d", curthread->td_md.md_spinlock_count)); return (newtd); } /* * A CPU is entering for the first time. */ void sched_ap_entry(void) { struct thread *newtd; struct tdq *tdq; tdq = TDQ_SELF(); /* This should have been setup in schedinit_ap(). */ THREAD_LOCKPTR_ASSERT(curthread, TDQ_LOCKPTR(tdq)); TDQ_LOCK(tdq); /* Correct spinlock nesting. */ spinlock_exit(); PCPU_SET(switchtime, cpu_ticks()); PCPU_SET(switchticks, ticks); newtd = sched_throw_grab(tdq); /* doesn't return */ cpu_throw(NULL, newtd); } /* * A thread is exiting. */ void sched_throw(struct thread *td) { struct thread *newtd; struct tdq *tdq; tdq = TDQ_SELF(); MPASS(td != NULL); THREAD_LOCK_ASSERT(td, MA_OWNED); THREAD_LOCKPTR_ASSERT(td, TDQ_LOCKPTR(tdq)); tdq_load_rem(tdq, td); td->td_lastcpu = td->td_oncpu; td->td_oncpu = NOCPU; thread_lock_block(td); newtd = sched_throw_grab(tdq); /* doesn't return */ cpu_switch(td, newtd, TDQ_LOCKPTR(tdq)); } /* * This is called from fork_exit(). Just acquire the correct locks and * let fork do the rest of the work. */ void sched_fork_exit(struct thread *td) { struct tdq *tdq; int cpuid; /* * Finish setting up thread glue so that it begins execution in a * non-nested critical section with the scheduler lock held. */ KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count %d", curthread->td_md.md_spinlock_count)); cpuid = PCPU_GET(cpuid); tdq = TDQ_SELF(); TDQ_LOCK(tdq); spinlock_exit(); MPASS(td->td_lock == TDQ_LOCKPTR(tdq)); td->td_oncpu = cpuid; KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "running", "prio:%d", td->td_priority); SDT_PROBE0(sched, , , on__cpu); } /* * Create on first use to catch odd startup conditions. */ char * sched_tdname(struct thread *td) { #ifdef KTR struct td_sched *ts; ts = td_get_sched(td); if (ts->ts_name[0] == '\0') snprintf(ts->ts_name, sizeof(ts->ts_name), "%s tid %d", td->td_name, td->td_tid); return (ts->ts_name); #else return (td->td_name); #endif } #ifdef KTR void sched_clear_tdname(struct thread *td) { struct td_sched *ts; ts = td_get_sched(td); ts->ts_name[0] = '\0'; } #endif #ifdef SMP /* * Build the CPU topology dump string. Is recursively called to collect * the topology tree. 
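 *
 * For example, the result can be read from userland with
 * "sysctl kern.sched.topology_spec": each group is emitted with its CPU
 * count and mask, its cache level, any HTT/THREAD/SMT/NUMA flags, and
 * its child groups nested beneath it.
 *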
*/ static int sysctl_kern_sched_topology_spec_internal(struct sbuf *sb, struct cpu_group *cg, int indent) { char cpusetbuf[CPUSETBUFSIZ]; int i, first; sbuf_printf(sb, "%*s\n", indent, "", 1 + indent / 2, cg->cg_level); sbuf_printf(sb, "%*s ", indent, "", cg->cg_count, cpusetobj_strprint(cpusetbuf, &cg->cg_mask)); first = TRUE; for (i = cg->cg_first; i <= cg->cg_last; i++) { if (CPU_ISSET(i, &cg->cg_mask)) { if (!first) - sbuf_printf(sb, ", "); + sbuf_cat(sb, ", "); else first = FALSE; sbuf_printf(sb, "%d", i); } } - sbuf_printf(sb, "\n"); + sbuf_cat(sb, "\n"); if (cg->cg_flags != 0) { sbuf_printf(sb, "%*s ", indent, ""); if ((cg->cg_flags & CG_FLAG_HTT) != 0) - sbuf_printf(sb, "HTT group"); + sbuf_cat(sb, "HTT group"); if ((cg->cg_flags & CG_FLAG_THREAD) != 0) - sbuf_printf(sb, "THREAD group"); + sbuf_cat(sb, "THREAD group"); if ((cg->cg_flags & CG_FLAG_SMT) != 0) - sbuf_printf(sb, "SMT group"); + sbuf_cat(sb, "SMT group"); if ((cg->cg_flags & CG_FLAG_NODE) != 0) - sbuf_printf(sb, "NUMA node"); - sbuf_printf(sb, "\n"); + sbuf_cat(sb, "NUMA node"); + sbuf_cat(sb, "\n"); } if (cg->cg_children > 0) { sbuf_printf(sb, "%*s \n", indent, ""); for (i = 0; i < cg->cg_children; i++) sysctl_kern_sched_topology_spec_internal(sb, &cg->cg_child[i], indent+2); sbuf_printf(sb, "%*s \n", indent, ""); } sbuf_printf(sb, "%*s\n", indent, ""); return (0); } /* * Sysctl handler for retrieving topology dump. It's a wrapper for * the recursive sysctl_kern_smp_topology_spec_internal(). */ static int sysctl_kern_sched_topology_spec(SYSCTL_HANDLER_ARGS) { struct sbuf *topo; int err; KASSERT(cpu_top != NULL, ("cpu_top isn't initialized")); topo = sbuf_new_for_sysctl(NULL, NULL, 512, req); if (topo == NULL) return (ENOMEM); - sbuf_printf(topo, "\n"); + sbuf_cat(topo, "\n"); err = sysctl_kern_sched_topology_spec_internal(topo, cpu_top, 1); - sbuf_printf(topo, "\n"); + sbuf_cat(topo, "\n"); if (err == 0) { err = sbuf_finish(topo); } sbuf_delete(topo); return (err); } #endif static int sysctl_kern_quantum(SYSCTL_HANDLER_ARGS) { int error, new_val, period; period = 1000000 / realstathz; new_val = period * sched_slice; error = sysctl_handle_int(oidp, &new_val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (new_val <= 0) return (EINVAL); sched_slice = imax(1, (new_val + period / 2) / period); sched_slice_min = sched_slice / SCHED_SLICE_MIN_DIVISOR; hogticks = imax(1, (2 * hz * sched_slice + realstathz / 2) / realstathz); return (0); } SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Scheduler"); SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "ULE", 0, "Scheduler name"); SYSCTL_PROC(_kern_sched, OID_AUTO, quantum, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0, sysctl_kern_quantum, "I", "Quantum for timeshare threads in microseconds"); SYSCTL_INT(_kern_sched, OID_AUTO, slice, CTLFLAG_RW, &sched_slice, 0, "Quantum for timeshare threads in stathz ticks"); SYSCTL_UINT(_kern_sched, OID_AUTO, interact, CTLFLAG_RW, &sched_interact, 0, "Interactivity score threshold"); SYSCTL_INT(_kern_sched, OID_AUTO, preempt_thresh, CTLFLAG_RW, &preempt_thresh, 0, "Maximal (lowest) priority for preemption"); SYSCTL_INT(_kern_sched, OID_AUTO, static_boost, CTLFLAG_RW, &static_boost, 0, "Assign static kernel priorities to sleeping threads"); SYSCTL_INT(_kern_sched, OID_AUTO, idlespins, CTLFLAG_RW, &sched_idlespins, 0, "Number of times idle thread will spin waiting for new work"); SYSCTL_INT(_kern_sched, OID_AUTO, idlespinthresh, CTLFLAG_RW, &sched_idlespinthresh, 0, "Threshold before we will 
permit idle thread spinning"); #ifdef SMP SYSCTL_INT(_kern_sched, OID_AUTO, affinity, CTLFLAG_RW, &affinity, 0, "Number of hz ticks to keep thread affinity for"); SYSCTL_INT(_kern_sched, OID_AUTO, balance, CTLFLAG_RW, &rebalance, 0, "Enables the long-term load balancer"); SYSCTL_INT(_kern_sched, OID_AUTO, balance_interval, CTLFLAG_RW, &balance_interval, 0, "Average period in stathz ticks to run the long-term balancer"); SYSCTL_INT(_kern_sched, OID_AUTO, steal_idle, CTLFLAG_RW, &steal_idle, 0, "Attempts to steal work from other cores before idling"); SYSCTL_INT(_kern_sched, OID_AUTO, steal_thresh, CTLFLAG_RW, &steal_thresh, 0, "Minimum load on remote CPU before we'll steal"); SYSCTL_INT(_kern_sched, OID_AUTO, trysteal_limit, CTLFLAG_RW, &trysteal_limit, 0, "Topological distance limit for stealing threads in sched_switch()"); SYSCTL_INT(_kern_sched, OID_AUTO, always_steal, CTLFLAG_RW, &always_steal, 0, "Always run the stealer from the idle thread"); SYSCTL_PROC(_kern_sched, OID_AUTO, topology_spec, CTLTYPE_STRING | CTLFLAG_MPSAFE | CTLFLAG_RD, NULL, 0, sysctl_kern_sched_topology_spec, "A", "XML dump of detected CPU topology"); #endif /* ps compat. All cpu percentages from ULE are weighted. */ static int ccpu = 0; SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "Decay factor used for updating %CPU in 4BSD scheduler"); diff --git a/sys/kern/subr_blist.c b/sys/kern/subr_blist.c index 9b04518010e3..ac9d73ce3c6c 100644 --- a/sys/kern/subr_blist.c +++ b/sys/kern/subr_blist.c @@ -1,1177 +1,1177 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1998 Matthew Dillon. All Rights Reserved. * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * BLIST.C - Bitmap allocator/deallocator, using a radix tree with hinting * * This module implements a general bitmap allocator/deallocator. The * allocator eats around 2 bits per 'block'. The module does not * try to interpret the meaning of a 'block' other than to return * SWAPBLK_NONE on an allocation failure. 
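 *
 * Rough arithmetic behind the "around 2 bits per block" figure
 * (illustrative, assuming a 64-bit u_daddr_t, so BLIST_RADIX == 64, and
 * a 16-byte blmeta_t): tracking 2^20 blocks takes 16384 leaves plus
 * 256 + 4 + 1 interior nodes, about 16.6k nodes or roughly 260 KB,
 * i.e. close to 2 bits per block.
 *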
* * A radix tree controls access to pieces of the bitmap, and includes * auxiliary information at each interior node about the availabilty of * contiguous free blocks in the subtree rooted at that node. A radix * constant defines the size of the bitmaps contained in a leaf node * and the number of descendents of each of the meta (interior) nodes. * Each subtree is associated with a range of blocks. The root of any * subtree stores a hint field that defines an upper bound on the size * of the largest allocation that can begin in the associated block * range. A hint is an upper bound on a potential allocation, but not * necessarily a tight upper bound. * * The bitmap field in each node directs the search for available blocks. * For a leaf node, a bit is set if the corresponding block is free. For a * meta node, a bit is set if the corresponding subtree contains a free * block somewhere within it. The search at a meta node considers only * children of that node that represent a range that includes a free block. * * The hinting greatly increases code efficiency for allocations while * the general radix structure optimizes both allocations and frees. The * radix tree should be able to operate well no matter how much * fragmentation there is and no matter how large a bitmap is used. * * The blist code wires all necessary memory at creation time. Neither * allocations nor frees require interaction with the memory subsystem. * The non-blocking nature of allocations and frees is required by swap * code (vm/swap_pager.c). * * LAYOUT: The radix tree is laid out recursively using a linear array. * Each meta node is immediately followed (laid out sequentially in * memory) by BLIST_RADIX lower-level nodes. This is a recursive * structure but one that can be easily scanned through a very simple * 'skip' calculation. The memory allocation is only large enough to * cover the number of blocks requested at creation time. Nodes that * represent blocks beyond that limit, nodes that would never be read * or written, are not allocated, so that the last of the * BLIST_RADIX lower-level nodes of a some nodes may not be allocated. * * NOTE: the allocator cannot currently allocate more than * BLIST_RADIX blocks per call. It will panic with 'allocation too * large' if you try. This is an area that could use improvement. The * radix is large enough that this restriction does not effect the swap * system, though. Currently only the allocation code is affected by * this algorithmic unfeature. The freeing code can handle arbitrary * ranges. * * This code can be compiled stand-alone for debugging. */ #include #ifdef _KERNEL #include #include #include #include #include #include #include #include #include #else #ifndef BLIST_NO_DEBUG #define BLIST_DEBUG #endif #include #include #include #include #include #include #include #include #include #include #include #define bitcount64(x) __bitcount64((uint64_t)(x)) #define malloc(a,b,c) calloc(a, 1) #define free(a,b) free(a) #define ummin(a,b) ((a) < (b) ? (a) : (b)) #define imin(a,b) ((a) < (b) ? 
(a) : (b)) #define KASSERT(a,b) assert(a) #include #endif /* * static support functions */ static daddr_t blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int *count, int maxcount); static daddr_t blst_meta_alloc(blmeta_t *scan, daddr_t cursor, int *count, int maxcount, u_daddr_t radix); static void blst_leaf_free(blmeta_t *scan, daddr_t relblk, int count); static void blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, u_daddr_t radix); static void blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix, blist_t dest, daddr_t count); static daddr_t blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count); static daddr_t blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count, u_daddr_t radix); #ifndef _KERNEL static void blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int tab); #endif #ifdef _KERNEL static MALLOC_DEFINE(M_SWAP, "SWAP", "Swap space"); #endif #define BLIST_MASK (BLIST_RADIX - 1) /* * For a subtree that can represent the state of up to 'radix' blocks, the * number of leaf nodes of the subtree is L=radix/BLIST_RADIX. If 'm' * is short for BLIST_RADIX, then for a tree of height h with L=m**h * leaf nodes, the total number of tree nodes is 1 + m + m**2 + ... + m**h, * or, equivalently, (m**(h+1)-1)/(m-1). This quantity is called 'skip' * in the 'meta' functions that process subtrees. Since integer division * discards remainders, we can express this computation as * skip = (m * m**h) / (m - 1) * skip = (m * (radix / m)) / (m - 1) * skip = radix / (m - 1) * so that simple integer division by a constant can safely be used for the * calculation. */ static inline daddr_t radix_to_skip(daddr_t radix) { return (radix / BLIST_MASK); } /* * Provide a mask with count bits set, starting as position n. */ static inline u_daddr_t bitrange(int n, int count) { return (((u_daddr_t)-1 << n) & ((u_daddr_t)-1 >> (BLIST_RADIX - (n + count)))); } static inline int bitpos(u_daddr_t mask) { _Static_assert(sizeof(long long) >= sizeof(mask), "mask too big for ffsll()"); return (ffsll(mask) - 1); } /* * blist_create() - create a blist capable of handling up to the specified * number of blocks * * blocks - must be greater than 0 * flags - malloc flags * * The smallest blist consists of a single leaf node capable of * managing BLIST_RADIX blocks. */ blist_t blist_create(daddr_t blocks, int flags) { blist_t bl; u_daddr_t nodes, radix; KASSERT(blocks > 0, ("invalid block count")); /* * Calculate the radix and node count used for scanning. */ nodes = 1; for (radix = 1; (blocks - 1) / BLIST_RADIX / radix > 0; radix *= BLIST_RADIX) nodes += 1 + (blocks - 1) / BLIST_RADIX / radix; /* * Include a sentinel node to ensure that cross-leaf scans stay within * the bounds of the allocation. */ if (blocks % BLIST_RADIX == 0) nodes++; bl = malloc(offsetof(struct blist, bl_root[nodes]), M_SWAP, flags | M_ZERO); if (bl == NULL) return (NULL); bl->bl_blocks = blocks; bl->bl_radix = radix; #if defined(BLIST_DEBUG) printf( "BLIST representing %lld blocks (%lld MB of swap)" ", requiring %lldK of ram\n", (long long)bl->bl_blocks, (long long)bl->bl_blocks * 4 / 1024, (long long)(nodes * sizeof(blmeta_t) + 1023) / 1024 ); printf("BLIST raw radix tree contains %lld records\n", (long long)nodes); #endif return (bl); } void blist_destroy(blist_t bl) { free(bl, M_SWAP); } /* * blist_alloc() - reserve space in the block bitmap. Return the base * of a contiguous region or SWAPBLK_NONE if space could * not be allocated. 
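 *
 * An illustrative (hypothetical) usage sketch; 'nblocks' stands in for
 * the caller's device size.  The request below asks for 16 blocks but
 * will take up to 32, and on success *count is updated to the number of
 * blocks actually granted:
 *
 *	int count = 16;
 *	daddr_t blk;
 *	blist_t bl;
 *
 *	bl = blist_create(nblocks, M_WAITOK);
 *	blk = blist_alloc(bl, &count, 32);
 *	if (blk != SWAPBLK_NONE)
 *		blist_free(bl, blk, count);
 *	blist_destroy(bl);
 *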
*/ daddr_t blist_alloc(blist_t bl, int *count, int maxcount) { daddr_t blk, cursor; KASSERT(*count <= maxcount, ("invalid parameters %d > %d", *count, maxcount)); KASSERT(*count <= BLIST_MAX_ALLOC, ("minimum allocation too large: %d", *count)); /* * This loop iterates at most twice. An allocation failure in the * first iteration leads to a second iteration only if the cursor was * non-zero. When the cursor is zero, an allocation failure will * stop further iterations. */ for (cursor = bl->bl_cursor;; cursor = 0) { blk = blst_meta_alloc(bl->bl_root, cursor, count, maxcount, bl->bl_radix); if (blk != SWAPBLK_NONE) { bl->bl_avail -= *count; bl->bl_cursor = blk + *count; if (bl->bl_cursor == bl->bl_blocks) bl->bl_cursor = 0; return (blk); } if (cursor == 0) return (SWAPBLK_NONE); } } /* * blist_avail() - return the number of free blocks. */ daddr_t blist_avail(blist_t bl) { return (bl->bl_avail); } /* * blist_free() - free up space in the block bitmap. Return the base * of a contiguous region. */ void blist_free(blist_t bl, daddr_t blkno, daddr_t count) { KASSERT(blkno >= 0 && blkno + count <= bl->bl_blocks, ("freeing invalid range: blkno %jx, count %d, blocks %jd", (uintmax_t)blkno, (int)count, (uintmax_t)bl->bl_blocks)); blst_meta_free(bl->bl_root, blkno, count, bl->bl_radix); bl->bl_avail += count; } /* * blist_fill() - mark a region in the block bitmap as off-limits * to the allocator (i.e. allocate it), ignoring any * existing allocations. Return the number of blocks * actually filled that were free before the call. */ daddr_t blist_fill(blist_t bl, daddr_t blkno, daddr_t count) { daddr_t filled; KASSERT(blkno >= 0 && blkno + count <= bl->bl_blocks, ("filling invalid range: blkno %jx, count %d, blocks %jd", (uintmax_t)blkno, (int)count, (uintmax_t)bl->bl_blocks)); filled = blst_meta_fill(bl->bl_root, blkno, count, bl->bl_radix); bl->bl_avail -= filled; return (filled); } /* * blist_resize() - resize an existing radix tree to handle the * specified number of blocks. This will reallocate * the tree and transfer the previous bitmap to the new * one. When extending the tree you can specify whether * the new blocks are to left allocated or freed. */ void blist_resize(blist_t *pbl, daddr_t count, int freenew, int flags) { blist_t newbl = blist_create(count, flags); blist_t save = *pbl; *pbl = newbl; if (count > save->bl_blocks) count = save->bl_blocks; blst_copy(save->bl_root, 0, save->bl_radix, newbl, count); /* * If resizing upwards, should we free the new space or not? */ if (freenew && count < newbl->bl_blocks) { blist_free(newbl, count, newbl->bl_blocks - count); } blist_destroy(save); } #ifdef BLIST_DEBUG /* * blist_print() - dump radix tree */ void blist_print(blist_t bl) { printf("BLIST avail = %jd, cursor = %08jx {\n", (uintmax_t)bl->bl_avail, (uintmax_t)bl->bl_cursor); if (bl->bl_root->bm_bitmap != 0) blst_radix_print(bl->bl_root, 0, bl->bl_radix, 4); printf("}\n"); } #endif static const u_daddr_t fib[] = { 1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597, 2584, 4181, 6765, 10946, 17711, 28657, 46368, 75025, 121393, 196418, 317811, 514229, 832040, 1346269, 2178309, 3524578, }; /* * Use 'gap' to describe a maximal range of unallocated blocks/bits. 
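 *
 * For instance, with the fib[] table above a maximal free range of 7
 * blocks is counted in the bucket covering sizes 5 through 7
 * (fib[3] == 5 up to fib[4] - 1 == 7), which is how dump_gap_stats()
 * labels its rows.
 *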
*/ struct gap_stats { daddr_t start; /* current gap start, or SWAPBLK_NONE */ daddr_t num; /* number of gaps observed */ daddr_t max; /* largest gap size */ daddr_t avg; /* average gap size */ daddr_t err; /* sum - num * avg */ daddr_t histo[nitems(fib)]; /* # gaps in each size range */ int max_bucket; /* last histo elt with nonzero val */ }; /* * gap_stats_counting() - is the state 'counting 1 bits'? * or 'skipping 0 bits'? */ static inline bool gap_stats_counting(const struct gap_stats *stats) { return (stats->start != SWAPBLK_NONE); } /* * init_gap_stats() - initialize stats on gap sizes */ static inline void init_gap_stats(struct gap_stats *stats) { bzero(stats, sizeof(*stats)); stats->start = SWAPBLK_NONE; } /* * update_gap_stats() - update stats on gap sizes */ static void update_gap_stats(struct gap_stats *stats, daddr_t posn) { daddr_t size; int hi, lo, mid; if (!gap_stats_counting(stats)) { stats->start = posn; return; } size = posn - stats->start; stats->start = SWAPBLK_NONE; if (size > stats->max) stats->max = size; /* * Find the fibonacci range that contains size, * expecting to find it in an early range. */ lo = 0; hi = 1; while (hi < nitems(fib) && fib[hi] <= size) { lo = hi; hi *= 2; } if (hi >= nitems(fib)) hi = nitems(fib); while (lo + 1 != hi) { mid = (lo + hi) >> 1; if (fib[mid] <= size) lo = mid; else hi = mid; } stats->histo[lo]++; if (lo > stats->max_bucket) stats->max_bucket = lo; stats->err += size - stats->avg; stats->num++; stats->avg += stats->err / stats->num; stats->err %= stats->num; } /* * dump_gap_stats() - print stats on gap sizes */ static inline void dump_gap_stats(const struct gap_stats *stats, struct sbuf *s) { int i; sbuf_printf(s, "number of maximal free ranges: %jd\n", (intmax_t)stats->num); sbuf_printf(s, "largest free range: %jd\n", (intmax_t)stats->max); sbuf_printf(s, "average maximal free range size: %jd\n", (intmax_t)stats->avg); - sbuf_printf(s, "number of maximal free ranges of different sizes:\n"); - sbuf_printf(s, " count | size range\n"); - sbuf_printf(s, " ----- | ----------\n"); + sbuf_cat(s, "number of maximal free ranges of different sizes:\n"); + sbuf_cat(s, " count | size range\n"); + sbuf_cat(s, " ----- | ----------\n"); for (i = 0; i < stats->max_bucket; i++) { if (stats->histo[i] != 0) { sbuf_printf(s, "%20jd | ", (intmax_t)stats->histo[i]); if (fib[i] != fib[i + 1] - 1) sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i], (intmax_t)fib[i + 1] - 1); else sbuf_printf(s, "%jd\n", (intmax_t)fib[i]); } } sbuf_printf(s, "%20jd | ", (intmax_t)stats->histo[i]); if (stats->histo[i] > 1) sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i], (intmax_t)stats->max); else sbuf_printf(s, "%jd\n", (intmax_t)stats->max); } /* * blist_stats() - dump radix tree stats */ void blist_stats(blist_t bl, struct sbuf *s) { struct gap_stats gstats; struct gap_stats *stats = &gstats; daddr_t i, nodes, radix; u_daddr_t diff, mask; int digit; init_gap_stats(stats); nodes = 0; radix = bl->bl_radix; for (i = 0; i < bl->bl_blocks; ) { /* * Check for skippable subtrees starting at i. */ while (radix != 1) { if (bl->bl_root[nodes].bm_bitmap == 0) { if (gap_stats_counting(stats)) update_gap_stats(stats, i); break; } /* * Skip subtree root. */ nodes++; radix /= BLIST_RADIX; } if (radix == 1) { /* * Scan leaf. 
*/ mask = bl->bl_root[nodes].bm_bitmap; diff = mask ^ (mask << 1); if (gap_stats_counting(stats)) diff ^= 1; while (diff != 0) { digit = bitpos(diff); update_gap_stats(stats, i + digit); diff ^= bitrange(digit, 1); } } nodes += radix_to_skip(radix * BLIST_RADIX); i += radix * BLIST_RADIX; /* * Find max size subtree starting at i. */ for (radix = 1; ((i / BLIST_RADIX / radix) & BLIST_MASK) == 0; radix *= BLIST_RADIX) ; } update_gap_stats(stats, i); dump_gap_stats(stats, s); } /************************************************************************ * ALLOCATION SUPPORT FUNCTIONS * ************************************************************************ * * These support functions do all the actual work. They may seem * rather longish, but that's because I've commented them up. The * actual code is straight forward. * */ /* * BLST_NEXT_LEAF_ALLOC() - allocate the blocks starting with the next leaf. * * 'scan' is a leaf node, and its first block is at address 'start'. The * next leaf node could be adjacent, or several nodes away if the least * common ancestor of 'scan' and its neighbor is several levels up. Use * addresses to determine how many meta-nodes lie between the leaves. If * sequence of leaves starting with the next one has enough initial bits * set, clear them and clear the bits in the meta nodes on the path up to * the least common ancestor to mark any subtrees made completely empty. */ static int blst_next_leaf_alloc(blmeta_t *scan, daddr_t start, int count, int maxcount) { u_daddr_t radix; daddr_t blk; int avail, digit; start += BLIST_RADIX; for (blk = start; blk - start < maxcount; blk += BLIST_RADIX) { /* Skip meta-nodes, as long as they promise more free blocks. */ radix = BLIST_RADIX; while (((++scan)->bm_bitmap & 1) == 1 && ((blk / radix) & BLIST_MASK) == 0) radix *= BLIST_RADIX; if (~scan->bm_bitmap != 0) { /* * Either there is no next leaf with any free blocks, * or we've reached the next leaf and found that some * of its blocks are not free. In the first case, * bitpos() returns zero here. */ avail = blk - start + bitpos(~scan->bm_bitmap); if (avail < count || avail == 0) { /* * There isn't a next leaf with enough free * blocks at its beginning to bother * allocating. */ return (avail); } maxcount = imin(avail, maxcount); if (maxcount % BLIST_RADIX == 0) { /* * There was no next leaf. Back scan up to * last leaf. */ do { radix /= BLIST_RADIX; --scan; } while (radix != 1); blk -= BLIST_RADIX; } } } /* * 'scan' is the last leaf that provides blocks. Clear from 1 to * BLIST_RADIX bits to represent the allocation of those last blocks. */ if (maxcount % BLIST_RADIX != 0) scan->bm_bitmap &= ~bitrange(0, maxcount % BLIST_RADIX); else scan->bm_bitmap = 0; for (;;) { /* Back up over meta-nodes, clearing bits if necessary. */ blk -= BLIST_RADIX; for (radix = BLIST_RADIX; (digit = ((blk / radix) & BLIST_MASK)) == 0; radix *= BLIST_RADIX) { if ((scan--)->bm_bitmap == 0) scan->bm_bitmap ^= 1; } if ((scan--)->bm_bitmap == 0) scan[-digit * radix_to_skip(radix)].bm_bitmap ^= (u_daddr_t)1 << digit; if (blk == start) break; /* Clear all the bits of this leaf. */ scan->bm_bitmap = 0; } return (maxcount); } /* * BLST_LEAF_ALLOC() - allocate at a leaf in the radix tree (a bitmap). * * This function is the core of the allocator. Its execution time is * proportional to log(count), plus height of the tree if the allocation * crosses a leaf boundary. 
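 *
 * A small worked example of the mask-doubling step: for *count = 3,
 * count1 = 2 and num_shifts = fls(2) = 2, so mask is ORed with
 * mask >> 1 twice.  Any bit i still 0 in the resulting mask marks a
 * position where bits i, i+1 and i+2 of bm_bitmap are all set, i.e.
 * at least three consecutive free blocks within this leaf (crossing
 * into the next leaf is handled separately below).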
*/ static daddr_t blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int *count, int maxcount) { u_daddr_t mask; int bighint, count1, hi, lo, num_shifts; count1 = *count - 1; num_shifts = fls(count1); mask = ~scan->bm_bitmap; while ((mask & (mask + 1)) != 0 && num_shifts > 0) { /* * If bit i is 0 in mask, then bits in [i, i + (count1 >> * num_shifts)] are 1 in scan->bm_bitmap. Reduce num_shifts to * 0, while preserving this invariant. The updates to mask * leave fewer bits 0, but each bit that remains 0 represents a * longer string of consecutive 1-bits in scan->bm_bitmap. If * more updates to mask cannot set more bits, because mask is * partitioned with all 1 bits following all 0 bits, the loop * terminates immediately. */ num_shifts--; mask |= mask >> ((count1 >> num_shifts) + 1) / 2; } bighint = count1 >> num_shifts; if (~mask == 0) { /* * Update bighint. There is no allocation bigger than * count1 >> num_shifts starting in this leaf. */ scan->bm_bighint = bighint; return (SWAPBLK_NONE); } /* Discard any candidates that appear before blk. */ if ((blk & BLIST_MASK) != 0) { if ((~mask & bitrange(0, blk & BLIST_MASK)) != 0) { /* Grow bighint in case all discarded bits are set. */ bighint += blk & BLIST_MASK; mask |= bitrange(0, blk & BLIST_MASK); if (~mask == 0) { scan->bm_bighint = bighint; return (SWAPBLK_NONE); } } blk -= blk & BLIST_MASK; } /* * The least significant set bit in mask marks the start of the first * available range of sufficient size. Find its position. */ lo = bitpos(~mask); /* * Find how much space is available starting at that position. */ if ((mask & (mask + 1)) != 0) { /* Count the 1 bits starting at position lo. */ hi = bitpos(mask & (mask + 1)) + count1; if (maxcount < hi - lo) hi = lo + maxcount; *count = hi - lo; mask = ~bitrange(lo, *count); } else if (maxcount <= BLIST_RADIX - lo) { /* All the blocks we can use are available here. */ hi = lo + maxcount; *count = maxcount; mask = ~bitrange(lo, *count); if (hi == BLIST_RADIX) scan->bm_bighint = bighint; } else { /* Check next leaf for some of the blocks we want or need. */ count1 = *count - (BLIST_RADIX - lo); maxcount -= BLIST_RADIX - lo; hi = blst_next_leaf_alloc(scan, blk, count1, maxcount); if (hi < count1) /* * The next leaf cannot supply enough blocks to reach * the minimum required allocation. The hint cannot be * updated, because the same allocation request could * be satisfied later, by this leaf, if the state of * the next leaf changes, and without any changes to * this leaf. */ return (SWAPBLK_NONE); *count = BLIST_RADIX - lo + hi; scan->bm_bighint = bighint; } /* Clear the allocated bits from this leaf. */ scan->bm_bitmap &= mask; return (blk + lo); } /* * blist_meta_alloc() - allocate at a meta in the radix tree. * * Attempt to allocate at a meta node. If we can't, we update * bighint and return a failure. Updating bighint optimize future * calls that hit this node. We have to check for our collapse cases * and we have a few optimizations strewn in as well. */ static daddr_t blst_meta_alloc(blmeta_t *scan, daddr_t cursor, int *count, int maxcount, u_daddr_t radix) { daddr_t blk, i, r, skip; u_daddr_t mask; bool scan_from_start; int digit; if (radix == 1) return (blst_leaf_alloc(scan, cursor, count, maxcount)); blk = cursor & -(radix * BLIST_RADIX); scan_from_start = (cursor == blk); skip = radix_to_skip(radix); mask = scan->bm_bitmap; /* Discard any candidates that appear before cursor. 
*/ digit = (cursor / radix) & BLIST_MASK; mask &= (u_daddr_t)-1 << digit; if (mask == 0) return (SWAPBLK_NONE); /* * If the first try is for a block that includes the cursor, pre-undo * the digit * radix offset in the first call; otherwise, ignore the * cursor entirely. */ if (((mask >> digit) & 1) == 1) cursor -= digit * radix; else cursor = blk; /* * Examine the nonempty subtree associated with each bit set in mask. */ do { digit = bitpos(mask); i = 1 + digit * skip; if (*count <= scan[i].bm_bighint) { /* * The allocation might fit beginning in the i'th subtree. */ r = blst_meta_alloc(&scan[i], cursor + digit * radix, count, maxcount, radix / BLIST_RADIX); if (r != SWAPBLK_NONE) { if (scan[i].bm_bitmap == 0) scan->bm_bitmap ^= bitrange(digit, 1); return (r); } } cursor = blk; } while ((mask ^= bitrange(digit, 1)) != 0); /* * We couldn't allocate count in this subtree. If the whole tree was * scanned, and the last tree node is allocated, update bighint. */ if (scan_from_start && !(digit == BLIST_RADIX - 1 && scan[i].bm_bighint == BLIST_MAX_ALLOC)) scan->bm_bighint = *count - 1; return (SWAPBLK_NONE); } /* * BLST_LEAF_FREE() - free allocated block from leaf bitmap * */ static void blst_leaf_free(blmeta_t *scan, daddr_t blk, int count) { u_daddr_t mask; /* * free some data in this bitmap * mask=0000111111111110000 * \_________/\__/ * count n */ mask = bitrange(blk & BLIST_MASK, count); KASSERT((scan->bm_bitmap & mask) == 0, ("freeing free block: %jx, size %d, mask %jx", (uintmax_t)blk, count, (uintmax_t)scan->bm_bitmap & mask)); scan->bm_bitmap |= mask; } /* * BLST_META_FREE() - free allocated blocks from radix tree meta info * * This support routine frees a range of blocks from the bitmap. * The range must be entirely enclosed by this radix node. If a * meta node, we break the range down recursively to free blocks * in subnodes (which means that this code can free an arbitrary * range whereas the allocation code cannot allocate an arbitrary * range). */ static void blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, u_daddr_t radix) { daddr_t blk, endBlk, i, skip; int digit, endDigit; /* * We could probably do a better job here. We are required to make * bighint at least as large as the biggest allocable block of data. * If we just shoehorn it, a little extra overhead will be incurred * on the next allocation (but only that one typically). */ scan->bm_bighint = BLIST_MAX_ALLOC; if (radix == 1) return (blst_leaf_free(scan, freeBlk, count)); endBlk = freeBlk + count; blk = (freeBlk + radix * BLIST_RADIX) & -(radix * BLIST_RADIX); /* * blk is first block past the end of the range of this meta node, * or 0 in case of overflow. */ if (blk != 0) endBlk = ummin(endBlk, blk); skip = radix_to_skip(radix); blk = freeBlk & -radix; digit = (blk / radix) & BLIST_MASK; endDigit = 1 + (((endBlk - 1) / radix) & BLIST_MASK); scan->bm_bitmap |= bitrange(digit, endDigit - digit); for (i = 1 + digit * skip; blk < endBlk; i += skip) { blk += radix; count = ummin(blk, endBlk) - freeBlk; blst_meta_free(&scan[i], freeBlk, count, radix / BLIST_RADIX); freeBlk = blk; } } /* * BLST_COPY() - copy one radix tree to another * * Locates free space in the source tree and frees it in the destination * tree. The space may not already be free in the destination. 
*/ static void blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix, blist_t dest, daddr_t count) { daddr_t endBlk, i, skip; /* * Leaf node */ if (radix == 1) { u_daddr_t v = scan->bm_bitmap; if (v == (u_daddr_t)-1) { blist_free(dest, blk, count); } else if (v != 0) { int i; for (i = 0; i < count; ++i) { if (v & ((u_daddr_t)1 << i)) blist_free(dest, blk + i, 1); } } return; } /* * Meta node */ if (scan->bm_bitmap == 0) { /* * Source all allocated, leave dest allocated */ return; } endBlk = blk + count; skip = radix_to_skip(radix); for (i = 1; blk < endBlk; i += skip) { blk += radix; count = radix; if (blk >= endBlk) count -= blk - endBlk; blst_copy(&scan[i], blk - radix, radix / BLIST_RADIX, dest, count); } } /* * BLST_LEAF_FILL() - allocate specific blocks in leaf bitmap * * This routine allocates all blocks in the specified range * regardless of any existing allocations in that range. Returns * the number of blocks allocated by the call. */ static daddr_t blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count) { daddr_t nblks; u_daddr_t mask; mask = bitrange(blk & BLIST_MASK, count); /* Count the number of blocks that we are allocating. */ nblks = bitcount64(scan->bm_bitmap & mask); scan->bm_bitmap &= ~mask; return (nblks); } /* * BLIST_META_FILL() - allocate specific blocks at a meta node * * This routine allocates the specified range of blocks, * regardless of any existing allocations in the range. The * range must be within the extent of this node. Returns the * number of blocks allocated by the call. */ static daddr_t blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count, u_daddr_t radix) { daddr_t blk, endBlk, i, nblks, skip; int digit; if (radix == 1) return (blst_leaf_fill(scan, allocBlk, count)); endBlk = allocBlk + count; blk = (allocBlk + radix * BLIST_RADIX) & -(radix * BLIST_RADIX); /* * blk is first block past the end of the range of this meta node, * or 0 in case of overflow. 
*/ if (blk != 0) endBlk = ummin(endBlk, blk); skip = radix_to_skip(radix); blk = allocBlk & -radix; nblks = 0; while (blk < endBlk) { digit = (blk / radix) & BLIST_MASK; i = 1 + digit * skip; blk += radix; count = ummin(blk, endBlk) - allocBlk; nblks += blst_meta_fill(&scan[i], allocBlk, count, radix / BLIST_RADIX); if (scan[i].bm_bitmap == 0) scan->bm_bitmap &= ~((u_daddr_t)1 << digit); allocBlk = blk; } return (nblks); } #ifdef BLIST_DEBUG static void blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int tab) { daddr_t skip; u_daddr_t mask; int digit; if (radix == 1) { printf( "%*.*s(%08llx,%lld): bitmap %0*llx big=%lld\n", tab, tab, "", (long long)blk, (long long)BLIST_RADIX, (int)(1 + (BLIST_RADIX - 1) / 4), (long long)scan->bm_bitmap, (long long)scan->bm_bighint ); return; } printf( "%*.*s(%08llx): subtree (%lld/%lld) bitmap %0*llx big=%lld {\n", tab, tab, "", (long long)blk, (long long)radix * BLIST_RADIX, (long long)radix * BLIST_RADIX, (int)(1 + (BLIST_RADIX - 1) / 4), (long long)scan->bm_bitmap, (long long)scan->bm_bighint ); skip = radix_to_skip(radix); tab += 4; mask = scan->bm_bitmap; /* Examine the nonempty subtree associated with each bit set in mask */ do { digit = bitpos(mask); blst_radix_print(&scan[1 + digit * skip], blk + digit * radix, radix / BLIST_RADIX, tab); } while ((mask ^= bitrange(digit, 1)) != 0); tab -= 4; printf( "%*.*s}\n", tab, tab, "" ); } #endif #ifdef BLIST_DEBUG int main(int ac, char **av) { daddr_t size = BLIST_RADIX * BLIST_RADIX; int i; blist_t bl; struct sbuf *s; for (i = 1; i < ac; ++i) { const char *ptr = av[i]; if (*ptr != '-') { size = strtoll(ptr, NULL, 0); continue; } ptr += 2; fprintf(stderr, "Bad option: %s\n", ptr - 2); exit(1); } bl = blist_create(size, M_WAITOK); if (bl == NULL) { fprintf(stderr, "blist_create failed\n"); exit(1); } blist_free(bl, 0, size); for (;;) { char buf[1024]; long long da = 0; int count = 0, maxcount = 0; printf("%lld/%lld/%lld> ", (long long)blist_avail(bl), (long long)size, (long long)bl->bl_radix * BLIST_RADIX); fflush(stdout); if (fgets(buf, sizeof(buf), stdin) == NULL) break; switch(buf[0]) { case 'r': if (sscanf(buf + 1, "%d", &count) == 1) { blist_resize(&bl, count, 1, M_WAITOK); } else { printf("?\n"); } case 'p': blist_print(bl); break; case 's': s = sbuf_new_auto(); blist_stats(bl, s); sbuf_finish(s); printf("%s", sbuf_data(s)); sbuf_delete(s); break; case 'a': if (sscanf(buf + 1, "%d%d", &count, &maxcount) == 2) { daddr_t blk = blist_alloc(bl, &count, maxcount); printf(" R=%08llx, c=%08d\n", (long long)blk, count); } else { printf("?\n"); } break; case 'f': if (sscanf(buf + 1, "%llx %d", &da, &count) == 2) { blist_free(bl, da, count); } else { printf("?\n"); } break; case 'l': if (sscanf(buf + 1, "%llx %d", &da, &count) == 2) { printf(" n=%jd\n", (intmax_t)blist_fill(bl, da, count)); } else { printf("?\n"); } break; case '?': case 'h': puts( "p -print\n" "s -stats\n" "a %d %d -allocate\n" "f %x %d -free\n" "l %x %d -fill\n" "r %d -resize\n" "h/? -help\n" "q -quit" ); break; case 'q': break; default: printf("?\n"); break; } if (buf[0] == 'q') break; } return (0); } #endif diff --git a/sys/kern/subr_bus.c b/sys/kern/subr_bus.c index 8caab20cf709..648394abd026 100644 --- a/sys/kern/subr_bus.c +++ b/sys/kern/subr_bus.c @@ -1,5837 +1,5837 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 1997,1998,2003 Doug Rabson * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include "opt_bus.h" #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include SYSCTL_NODE(_hw, OID_AUTO, bus, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, NULL); SYSCTL_ROOT_NODE(OID_AUTO, dev, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, NULL); static bool disable_failed_devs = false; SYSCTL_BOOL(_hw_bus, OID_AUTO, disable_failed_devices, CTLFLAG_RWTUN, &disable_failed_devs, 0, "Do not retry attaching devices that return an error from DEVICE_ATTACH the first time"); /* * Used to attach drivers to devclasses. */ typedef struct driverlink *driverlink_t; struct driverlink { kobj_class_t driver; TAILQ_ENTRY(driverlink) link; /* list of drivers in devclass */ int pass; int flags; #define DL_DEFERRED_PROBE 1 /* Probe deferred on this */ TAILQ_ENTRY(driverlink) passlink; }; /* * Forward declarations */ typedef TAILQ_HEAD(devclass_list, devclass) devclass_list_t; typedef TAILQ_HEAD(driver_list, driverlink) driver_list_t; typedef TAILQ_HEAD(device_list, _device) device_list_t; struct devclass { TAILQ_ENTRY(devclass) link; devclass_t parent; /* parent in devclass hierarchy */ driver_list_t drivers; /* bus devclasses store drivers for bus */ char *name; device_t *devices; /* array of devices indexed by unit */ int maxunit; /* size of devices array */ int flags; #define DC_HAS_CHILDREN 1 struct sysctl_ctx_list sysctl_ctx; struct sysctl_oid *sysctl_tree; }; /** * @brief Implementation of _device. * * The structure is named "_device" instead of "device" to avoid type confusion * caused by other subsystems defining a (struct device). */ struct _device { /* * A device is a kernel object. The first field must be the * current ops table for the object. */ KOBJ_FIELDS; /* * Device hierarchy. */ TAILQ_ENTRY(_device) link; /**< list of devices in parent */ TAILQ_ENTRY(_device) devlink; /**< global device list membership */ device_t parent; /**< parent of this device */ device_list_t children; /**< list of child devices */ /* * Details of this device. 
*/ driver_t *driver; /**< current driver */ devclass_t devclass; /**< current device class */ int unit; /**< current unit number */ char* nameunit; /**< name+unit e.g. foodev0 */ char* desc; /**< driver specific description */ u_int busy; /**< count of calls to device_busy() */ device_state_t state; /**< current device state */ uint32_t devflags; /**< api level flags for device_get_flags() */ u_int flags; /**< internal device flags */ u_int order; /**< order from device_add_child_ordered() */ void *ivars; /**< instance variables */ void *softc; /**< current driver's variables */ struct sysctl_ctx_list sysctl_ctx; /**< state for sysctl variables */ struct sysctl_oid *sysctl_tree; /**< state for sysctl variables */ }; static MALLOC_DEFINE(M_BUS, "bus", "Bus data structures"); static MALLOC_DEFINE(M_BUS_SC, "bus-sc", "Bus data structures, softc"); EVENTHANDLER_LIST_DEFINE(device_attach); EVENTHANDLER_LIST_DEFINE(device_detach); EVENTHANDLER_LIST_DEFINE(device_nomatch); EVENTHANDLER_LIST_DEFINE(dev_lookup); static void devctl2_init(void); static bool device_frozen; #define DRIVERNAME(d) ((d)? d->name : "no driver") #define DEVCLANAME(d) ((d)? d->name : "no devclass") #ifdef BUS_DEBUG static int bus_debug = 1; SYSCTL_INT(_debug, OID_AUTO, bus_debug, CTLFLAG_RWTUN, &bus_debug, 0, "Bus debug level"); #define PDEBUG(a) if (bus_debug) {printf("%s:%d: ", __func__, __LINE__), printf a; printf("\n");} #define DEVICENAME(d) ((d)? device_get_name(d): "no device") /** * Produce the indenting, indent*2 spaces plus a '.' ahead of that to * prevent syslog from deleting initial spaces */ #define indentprintf(p) do { int iJ; printf("."); for (iJ=0; iJparent ? dc->parent->name : ""; break; default: return (EINVAL); } return (SYSCTL_OUT_STR(req, value)); } static void devclass_sysctl_init(devclass_t dc) { if (dc->sysctl_tree != NULL) return; sysctl_ctx_init(&dc->sysctl_ctx); dc->sysctl_tree = SYSCTL_ADD_NODE(&dc->sysctl_ctx, SYSCTL_STATIC_CHILDREN(_dev), OID_AUTO, dc->name, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, ""); SYSCTL_ADD_PROC(&dc->sysctl_ctx, SYSCTL_CHILDREN(dc->sysctl_tree), OID_AUTO, "%parent", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, dc, DEVCLASS_SYSCTL_PARENT, devclass_sysctl_handler, "A", "parent class"); } enum { DEVICE_SYSCTL_DESC, DEVICE_SYSCTL_DRIVER, DEVICE_SYSCTL_LOCATION, DEVICE_SYSCTL_PNPINFO, DEVICE_SYSCTL_PARENT, }; static int device_sysctl_handler(SYSCTL_HANDLER_ARGS) { struct sbuf sb; device_t dev = (device_t)arg1; int error; sbuf_new_for_sysctl(&sb, NULL, 1024, req); sbuf_clear_flags(&sb, SBUF_INCLUDENUL); bus_topo_lock(); switch (arg2) { case DEVICE_SYSCTL_DESC: sbuf_cat(&sb, dev->desc ? dev->desc : ""); break; case DEVICE_SYSCTL_DRIVER: sbuf_cat(&sb, dev->driver ? dev->driver->name : ""); break; case DEVICE_SYSCTL_LOCATION: bus_child_location(dev, &sb); break; case DEVICE_SYSCTL_PNPINFO: bus_child_pnpinfo(dev, &sb); break; case DEVICE_SYSCTL_PARENT: sbuf_cat(&sb, dev->parent ? 
dev->parent->nameunit : ""); break; default: error = EINVAL; goto out; } error = sbuf_finish(&sb); out: bus_topo_unlock(); sbuf_delete(&sb); return (error); } static void device_sysctl_init(device_t dev) { devclass_t dc = dev->devclass; int domain; if (dev->sysctl_tree != NULL) return; devclass_sysctl_init(dc); sysctl_ctx_init(&dev->sysctl_ctx); dev->sysctl_tree = SYSCTL_ADD_NODE_WITH_LABEL(&dev->sysctl_ctx, SYSCTL_CHILDREN(dc->sysctl_tree), OID_AUTO, dev->nameunit + strlen(dc->name), CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "", "device_index"); SYSCTL_ADD_PROC(&dev->sysctl_ctx, SYSCTL_CHILDREN(dev->sysctl_tree), OID_AUTO, "%desc", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, dev, DEVICE_SYSCTL_DESC, device_sysctl_handler, "A", "device description"); SYSCTL_ADD_PROC(&dev->sysctl_ctx, SYSCTL_CHILDREN(dev->sysctl_tree), OID_AUTO, "%driver", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, dev, DEVICE_SYSCTL_DRIVER, device_sysctl_handler, "A", "device driver name"); SYSCTL_ADD_PROC(&dev->sysctl_ctx, SYSCTL_CHILDREN(dev->sysctl_tree), OID_AUTO, "%location", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, dev, DEVICE_SYSCTL_LOCATION, device_sysctl_handler, "A", "device location relative to parent"); SYSCTL_ADD_PROC(&dev->sysctl_ctx, SYSCTL_CHILDREN(dev->sysctl_tree), OID_AUTO, "%pnpinfo", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, dev, DEVICE_SYSCTL_PNPINFO, device_sysctl_handler, "A", "device identification"); SYSCTL_ADD_PROC(&dev->sysctl_ctx, SYSCTL_CHILDREN(dev->sysctl_tree), OID_AUTO, "%parent", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, dev, DEVICE_SYSCTL_PARENT, device_sysctl_handler, "A", "parent device"); if (bus_get_domain(dev, &domain) == 0) SYSCTL_ADD_INT(&dev->sysctl_ctx, SYSCTL_CHILDREN(dev->sysctl_tree), OID_AUTO, "%domain", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, domain, "NUMA domain"); } static void device_sysctl_update(device_t dev) { devclass_t dc = dev->devclass; if (dev->sysctl_tree == NULL) return; sysctl_rename_oid(dev->sysctl_tree, dev->nameunit + strlen(dc->name)); } static void device_sysctl_fini(device_t dev) { if (dev->sysctl_tree == NULL) return; sysctl_ctx_free(&dev->sysctl_ctx); dev->sysctl_tree = NULL; } static struct device_list bus_data_devices; static int bus_data_generation = 1; static kobj_method_t null_methods[] = { KOBJMETHOD_END }; DEFINE_CLASS(null, null_methods, 0); void bus_topo_assert(void) { GIANT_REQUIRED; } struct mtx * bus_topo_mtx(void) { return (&Giant); } void bus_topo_lock(void) { mtx_lock(bus_topo_mtx()); } void bus_topo_unlock(void) { mtx_unlock(bus_topo_mtx()); } /* * Bus pass implementation */ static driver_list_t passes = TAILQ_HEAD_INITIALIZER(passes); int bus_current_pass = BUS_PASS_ROOT; /** * @internal * @brief Register the pass level of a new driver attachment * * Register a new driver attachment's pass level. If no driver * attachment with the same pass level has been added, then @p new * will be added to the global passes list. * * @param new the new driver attachment */ static void driver_register_pass(struct driverlink *new) { struct driverlink *dl; /* We only consider pass numbers during boot. */ if (bus_current_pass == BUS_PASS_DEFAULT) return; /* * Walk the passes list. If we already know about this pass * then there is nothing to do. If we don't, then insert this * driver link into the list. 
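 *
 * (The list is kept sorted by ascending pass number: the new link goes
 * in front of the first entry with a higher pass, or at the tail if
 * there is none, which lets bus_set_pass() walk the passes in order.)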
*/ TAILQ_FOREACH(dl, &passes, passlink) { if (dl->pass < new->pass) continue; if (dl->pass == new->pass) return; TAILQ_INSERT_BEFORE(dl, new, passlink); return; } TAILQ_INSERT_TAIL(&passes, new, passlink); } /** * @brief Raise the current bus pass * * Raise the current bus pass level to @p pass. Call the BUS_NEW_PASS() * method on the root bus to kick off a new device tree scan for each * new pass level that has at least one driver. */ void bus_set_pass(int pass) { struct driverlink *dl; if (bus_current_pass > pass) panic("Attempt to lower bus pass level"); TAILQ_FOREACH(dl, &passes, passlink) { /* Skip pass values below the current pass level. */ if (dl->pass <= bus_current_pass) continue; /* * Bail once we hit a driver with a pass level that is * too high. */ if (dl->pass > pass) break; /* * Raise the pass level to the next level and rescan * the tree. */ bus_current_pass = dl->pass; BUS_NEW_PASS(root_bus); } /* * If there isn't a driver registered for the requested pass, * then bus_current_pass might still be less than 'pass'. Set * it to 'pass' in that case. */ if (bus_current_pass < pass) bus_current_pass = pass; KASSERT(bus_current_pass == pass, ("Failed to update bus pass level")); } /* * Devclass implementation */ static devclass_list_t devclasses = TAILQ_HEAD_INITIALIZER(devclasses); /** * @internal * @brief Find or create a device class * * If a device class with the name @p classname exists, return it, * otherwise if @p create is non-zero create and return a new device * class. * * If @p parentname is non-NULL, the parent of the devclass is set to * the devclass of that name. * * @param classname the devclass name to find or create * @param parentname the parent devclass name or @c NULL * @param create non-zero to create a devclass */ static devclass_t devclass_find_internal(const char *classname, const char *parentname, int create) { devclass_t dc; PDEBUG(("looking for %s", classname)); if (!classname) return (NULL); TAILQ_FOREACH(dc, &devclasses, link) { if (!strcmp(dc->name, classname)) break; } if (create && !dc) { PDEBUG(("creating %s", classname)); dc = malloc(sizeof(struct devclass) + strlen(classname) + 1, M_BUS, M_NOWAIT | M_ZERO); if (!dc) return (NULL); dc->parent = NULL; dc->name = (char*) (dc + 1); strcpy(dc->name, classname); TAILQ_INIT(&dc->drivers); TAILQ_INSERT_TAIL(&devclasses, dc, link); bus_data_generation_update(); } /* * If a parent class is specified, then set that as our parent so * that this devclass will support drivers for the parent class as * well. If the parent class has the same name don't do this though * as it creates a cycle that can trigger an infinite loop in * device_probe_child() if a device exists for which there is no * suitable driver. */ if (parentname && dc && !dc->parent && strcmp(classname, parentname) != 0) { dc->parent = devclass_find_internal(parentname, NULL, TRUE); dc->parent->flags |= DC_HAS_CHILDREN; } return (dc); } /** * @brief Create a device class * * If a device class with the name @p classname exists, return it, * otherwise create and return a new device class. * * @param classname the devclass name to find or create */ devclass_t devclass_create(const char *classname) { return (devclass_find_internal(classname, NULL, TRUE)); } /** * @brief Find a device class * * If a device class with the name @p classname exists, return it, * otherwise return @c NULL. 
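 *
 * A minimal usage sketch (illustrative only; the "foo" class name and
 * unit number 0 are hypothetical):
 *
 *	devclass_t dc;
 *	device_t dev;
 *
 *	dc = devclass_find("foo");
 *	dev = (dc != NULL) ? devclass_get_device(dc, 0) : NULL;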
* * @param classname the devclass name to find */ devclass_t devclass_find(const char *classname) { return (devclass_find_internal(classname, NULL, FALSE)); } /** * @brief Register that a device driver has been added to a devclass * * Register that a device driver has been added to a devclass. This * is called by devclass_add_driver to accomplish the recursive * notification of all the children classes of dc, as well as dc. * Each layer will have BUS_DRIVER_ADDED() called for all instances of * the devclass. * * We do a full search here of the devclass list at each iteration * level to save storing children-lists in the devclass structure. If * we ever move beyond a few dozen devices doing this, we may need to * reevaluate... * * @param dc the devclass to edit * @param driver the driver that was just added */ static void devclass_driver_added(devclass_t dc, driver_t *driver) { devclass_t parent; int i; /* * Call BUS_DRIVER_ADDED for any existing buses in this class. */ for (i = 0; i < dc->maxunit; i++) if (dc->devices[i] && device_is_attached(dc->devices[i])) BUS_DRIVER_ADDED(dc->devices[i], driver); /* * Walk through the children classes. Since we only keep a * single parent pointer around, we walk the entire list of * devclasses looking for children. We set the * DC_HAS_CHILDREN flag when a child devclass is created on * the parent, so we only walk the list for those devclasses * that have children. */ if (!(dc->flags & DC_HAS_CHILDREN)) return; parent = dc; TAILQ_FOREACH(dc, &devclasses, link) { if (dc->parent == parent) devclass_driver_added(dc, driver); } } static void device_handle_nomatch(device_t dev) { BUS_PROBE_NOMATCH(dev->parent, dev); EVENTHANDLER_DIRECT_INVOKE(device_nomatch, dev); dev->flags |= DF_DONENOMATCH; } /** * @brief Add a device driver to a device class * * Add a device driver to a devclass. This is normally called * automatically by DRIVER_MODULE(). The BUS_DRIVER_ADDED() method of * all devices in the devclass will be called to allow them to attempt * to re-probe any unmatched children. * * @param dc the devclass to edit * @param driver the driver to register */ int devclass_add_driver(devclass_t dc, driver_t *driver, int pass, devclass_t *dcp) { driverlink_t dl; devclass_t child_dc; const char *parentname; PDEBUG(("%s", DRIVERNAME(driver))); /* Don't allow invalid pass values. */ if (pass <= BUS_PASS_ROOT) return (EINVAL); dl = malloc(sizeof *dl, M_BUS, M_NOWAIT|M_ZERO); if (!dl) return (ENOMEM); /* * Compile the driver's methods. Also increase the reference count * so that the class doesn't get freed when the last instance * goes. This means we can safely use static methods and avoids a * double-free in devclass_delete_driver. */ kobj_class_compile((kobj_class_t) driver); /* * If the driver has any base classes, make the * devclass inherit from the devclass of the driver's * first base class. This will allow the system to * search for drivers in both devclasses for children * of a device using this driver. 
*/ if (driver->baseclasses) parentname = driver->baseclasses[0]->name; else parentname = NULL; child_dc = devclass_find_internal(driver->name, parentname, TRUE); if (dcp != NULL) *dcp = child_dc; dl->driver = driver; TAILQ_INSERT_TAIL(&dc->drivers, dl, link); driver->refs++; /* XXX: kobj_mtx */ dl->pass = pass; driver_register_pass(dl); if (device_frozen) { dl->flags |= DL_DEFERRED_PROBE; } else { devclass_driver_added(dc, driver); } bus_data_generation_update(); return (0); } /** * @brief Register that a device driver has been deleted from a devclass * * Register that a device driver has been removed from a devclass. * This is called by devclass_delete_driver to accomplish the * recursive notification of all the children classes of busclass, as * well as busclass. Each layer will attempt to detach the driver * from any devices that are children of the bus's devclass. The function * will return an error if a device fails to detach. * * We do a full search here of the devclass list at each iteration * level to save storing children-lists in the devclass structure. If * we ever move beyond a few dozen devices doing this, we may need to * reevaluate... * * @param busclass the devclass of the parent bus * @param dc the devclass of the driver being deleted * @param driver the driver being deleted */ static int devclass_driver_deleted(devclass_t busclass, devclass_t dc, driver_t *driver) { devclass_t parent; device_t dev; int error, i; /* * Disassociate from any devices. We iterate through all the * devices in the devclass of the driver and detach any which are * using the driver and which have a parent in the devclass which * we are deleting from. * * Note that since a driver can be in multiple devclasses, we * should not detach devices which are not children of devices in * the affected devclass. * * If we're frozen, we don't generate NOMATCH events. Mark to * generate later. */ for (i = 0; i < dc->maxunit; i++) { if (dc->devices[i]) { dev = dc->devices[i]; if (dev->driver == driver && dev->parent && dev->parent->devclass == busclass) { if ((error = device_detach(dev)) != 0) return (error); if (device_frozen) { dev->flags &= ~DF_DONENOMATCH; dev->flags |= DF_NEEDNOMATCH; } else { device_handle_nomatch(dev); } } } } /* * Walk through the children classes. Since we only keep a * single parent pointer around, we walk the entire list of * devclasses looking for children. We set the * DC_HAS_CHILDREN flag when a child devclass is created on * the parent, so we only walk the list for those devclasses * that have children. */ if (!(busclass->flags & DC_HAS_CHILDREN)) return (0); parent = busclass; TAILQ_FOREACH(busclass, &devclasses, link) { if (busclass->parent == parent) { error = devclass_driver_deleted(busclass, dc, driver); if (error) return (error); } } return (0); } /** * @brief Delete a device driver from a device class * * Delete a device driver from a devclass. This is normally called * automatically by DRIVER_MODULE(). * * If the driver is currently attached to any devices, * devclass_delete_driver() will first attempt to detach from each * device. If one of the detach calls fails, the driver will not be * deleted. * * @param dc the devclass to edit * @param driver the driver to unregister */ int devclass_delete_driver(devclass_t busclass, driver_t *driver) { devclass_t dc = devclass_find(driver->name); driverlink_t dl; int error; PDEBUG(("%s from devclass %s", driver->name, DEVCLANAME(busclass))); if (!dc) return (0); /* * Find the link structure in the bus' list of drivers. 
*/ TAILQ_FOREACH(dl, &busclass->drivers, link) { if (dl->driver == driver) break; } if (!dl) { PDEBUG(("%s not found in %s list", driver->name, busclass->name)); return (ENOENT); } error = devclass_driver_deleted(busclass, dc, driver); if (error != 0) return (error); TAILQ_REMOVE(&busclass->drivers, dl, link); free(dl, M_BUS); /* XXX: kobj_mtx */ driver->refs--; if (driver->refs == 0) kobj_class_free((kobj_class_t) driver); bus_data_generation_update(); return (0); } /** * @brief Quiesces a set of device drivers from a device class * * Quiesce a device driver from a devclass. This is normally called * automatically by DRIVER_MODULE(). * * If the driver is currently attached to any devices, * devclass_quiesece_driver() will first attempt to quiesce each * device. * * @param dc the devclass to edit * @param driver the driver to unregister */ static int devclass_quiesce_driver(devclass_t busclass, driver_t *driver) { devclass_t dc = devclass_find(driver->name); driverlink_t dl; device_t dev; int i; int error; PDEBUG(("%s from devclass %s", driver->name, DEVCLANAME(busclass))); if (!dc) return (0); /* * Find the link structure in the bus' list of drivers. */ TAILQ_FOREACH(dl, &busclass->drivers, link) { if (dl->driver == driver) break; } if (!dl) { PDEBUG(("%s not found in %s list", driver->name, busclass->name)); return (ENOENT); } /* * Quiesce all devices. We iterate through all the devices in * the devclass of the driver and quiesce any which are using * the driver and which have a parent in the devclass which we * are quiescing. * * Note that since a driver can be in multiple devclasses, we * should not quiesce devices which are not children of * devices in the affected devclass. */ for (i = 0; i < dc->maxunit; i++) { if (dc->devices[i]) { dev = dc->devices[i]; if (dev->driver == driver && dev->parent && dev->parent->devclass == busclass) { if ((error = device_quiesce(dev)) != 0) return (error); } } } return (0); } /** * @internal */ static driverlink_t devclass_find_driver_internal(devclass_t dc, const char *classname) { driverlink_t dl; PDEBUG(("%s in devclass %s", classname, DEVCLANAME(dc))); TAILQ_FOREACH(dl, &dc->drivers, link) { if (!strcmp(dl->driver->name, classname)) return (dl); } PDEBUG(("not found")); return (NULL); } /** * @brief Return the name of the devclass */ const char * devclass_get_name(devclass_t dc) { return (dc->name); } /** * @brief Find a device given a unit number * * @param dc the devclass to search * @param unit the unit number to search for * * @returns the device with the given unit number or @c * NULL if there is no such device */ device_t devclass_get_device(devclass_t dc, int unit) { if (dc == NULL || unit < 0 || unit >= dc->maxunit) return (NULL); return (dc->devices[unit]); } /** * @brief Find the softc field of a device given a unit number * * @param dc the devclass to search * @param unit the unit number to search for * * @returns the softc field of the device with the given * unit number or @c NULL if there is no such * device */ void * devclass_get_softc(devclass_t dc, int unit) { device_t dev; dev = devclass_get_device(dc, unit); if (!dev) return (NULL); return (device_get_softc(dev)); } /** * @brief Get a list of devices in the devclass * * An array containing a list of all the devices in the given devclass * is allocated and returned in @p *devlistp. The number of devices * in the array is returned in @p *devcountp. The caller should free * the array using @c free(p, M_TEMP), even if @p *devcountp is 0. 
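 *
 * A minimal usage sketch (illustrative only; error handling beyond the
 * return value is elided):
 *
 *	device_t *devs;
 *	int i, ndevs;
 *
 *	if (devclass_get_devices(dc, &devs, &ndevs) == 0) {
 *		for (i = 0; i < ndevs; i++)
 *			device_printf(devs[i], "present\n");
 *		free(devs, M_TEMP);
 *	}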
* * @param dc the devclass to examine * @param devlistp points at location for array pointer return * value * @param devcountp points at location for array size return value * * @retval 0 success * @retval ENOMEM the array allocation failed */ int devclass_get_devices(devclass_t dc, device_t **devlistp, int *devcountp) { int count, i; device_t *list; count = devclass_get_count(dc); list = malloc(count * sizeof(device_t), M_TEMP, M_NOWAIT|M_ZERO); if (!list) return (ENOMEM); count = 0; for (i = 0; i < dc->maxunit; i++) { if (dc->devices[i]) { list[count] = dc->devices[i]; count++; } } *devlistp = list; *devcountp = count; return (0); } /** * @brief Get a list of drivers in the devclass * * An array containing a list of pointers to all the drivers in the * given devclass is allocated and returned in @p *listp. The number * of drivers in the array is returned in @p *countp. The caller should * free the array using @c free(p, M_TEMP). * * @param dc the devclass to examine * @param listp gives location for array pointer return value * @param countp gives location for number of array elements * return value * * @retval 0 success * @retval ENOMEM the array allocation failed */ int devclass_get_drivers(devclass_t dc, driver_t ***listp, int *countp) { driverlink_t dl; driver_t **list; int count; count = 0; TAILQ_FOREACH(dl, &dc->drivers, link) count++; list = malloc(count * sizeof(driver_t *), M_TEMP, M_NOWAIT); if (list == NULL) return (ENOMEM); count = 0; TAILQ_FOREACH(dl, &dc->drivers, link) { list[count] = dl->driver; count++; } *listp = list; *countp = count; return (0); } /** * @brief Get the number of devices in a devclass * * @param dc the devclass to examine */ int devclass_get_count(devclass_t dc) { int count, i; count = 0; for (i = 0; i < dc->maxunit; i++) if (dc->devices[i]) count++; return (count); } /** * @brief Get the maximum unit number used in a devclass * * Note that this is one greater than the highest currently-allocated * unit. If a null devclass_t is passed in, -1 is returned to indicate * that not even the devclass has been allocated yet. * * @param dc the devclass to examine */ int devclass_get_maxunit(devclass_t dc) { if (dc == NULL) return (-1); return (dc->maxunit); } /** * @brief Find a free unit number in a devclass * * This function searches for the first unused unit number greater * that or equal to @p unit. * * @param dc the devclass to examine * @param unit the first unit number to check */ int devclass_find_free_unit(devclass_t dc, int unit) { if (dc == NULL) return (unit); while (unit < dc->maxunit && dc->devices[unit] != NULL) unit++; return (unit); } /** * @brief Set the parent of a devclass * * The parent class is normally initialised automatically by * DRIVER_MODULE(). * * @param dc the devclass to edit * @param pdc the new parent devclass */ void devclass_set_parent(devclass_t dc, devclass_t pdc) { dc->parent = pdc; } /** * @brief Get the parent of a devclass * * @param dc the devclass to examine */ devclass_t devclass_get_parent(devclass_t dc) { return (dc->parent); } struct sysctl_ctx_list * devclass_get_sysctl_ctx(devclass_t dc) { return (&dc->sysctl_ctx); } struct sysctl_oid * devclass_get_sysctl_tree(devclass_t dc) { return (dc->sysctl_tree); } /** * @internal * @brief Allocate a unit number * * On entry, @p *unitp is the desired unit number (or @c -1 if any * will do). The allocated unit number is returned in @p *unitp. 
* @param dc the devclass to allocate from * @param unitp points at the location for the allocated unit * number * * @retval 0 success * @retval EEXIST the requested unit number is already allocated * @retval ENOMEM memory allocation failure */ static int devclass_alloc_unit(devclass_t dc, device_t dev, int *unitp) { const char *s; int unit = *unitp; PDEBUG(("unit %d in devclass %s", unit, DEVCLANAME(dc))); /* Ask the parent bus if it wants to wire this device. */ if (unit == -1) BUS_HINT_DEVICE_UNIT(device_get_parent(dev), dev, dc->name, &unit); /* If we were given a wired unit number, check for existing device */ /* XXX imp XXX */ if (unit != -1) { if (unit >= 0 && unit < dc->maxunit && dc->devices[unit] != NULL) { if (bootverbose) printf("%s: %s%d already exists; skipping it\n", dc->name, dc->name, *unitp); return (EEXIST); } } else { /* Unwired device, find the next available slot for it */ unit = 0; for (unit = 0;; unit++) { /* If this device slot is already in use, skip it. */ if (unit < dc->maxunit && dc->devices[unit] != NULL) continue; /* If there is an "at" hint for a unit then skip it. */ if (resource_string_value(dc->name, unit, "at", &s) == 0) continue; break; } } /* * We've selected a unit beyond the length of the table, so let's * extend the table to make room for all units up to and including * this one. */ if (unit >= dc->maxunit) { device_t *newlist, *oldlist; int newsize; oldlist = dc->devices; newsize = roundup((unit + 1), MAX(1, MINALLOCSIZE / sizeof(device_t))); newlist = malloc(sizeof(device_t) * newsize, M_BUS, M_NOWAIT); if (!newlist) return (ENOMEM); if (oldlist != NULL) bcopy(oldlist, newlist, sizeof(device_t) * dc->maxunit); bzero(newlist + dc->maxunit, sizeof(device_t) * (newsize - dc->maxunit)); dc->devices = newlist; dc->maxunit = newsize; if (oldlist != NULL) free(oldlist, M_BUS); } PDEBUG(("now: unit %d in devclass %s", unit, DEVCLANAME(dc))); *unitp = unit; return (0); } /** * @internal * @brief Add a device to a devclass * * A unit number is allocated for the device (using the device's * preferred unit number if any) and the device is registered in the * devclass. This allows the device to be looked up by its unit * number, e.g. by decoding a dev_t minor number. * * @param dc the devclass to add to * @param dev the device to add * * @retval 0 success * @retval EEXIST the requested unit number is already allocated * @retval ENOMEM memory allocation failure */ static int devclass_add_device(devclass_t dc, device_t dev) { int buflen, error; PDEBUG(("%s in devclass %s", DEVICENAME(dev), DEVCLANAME(dc))); buflen = snprintf(NULL, 0, "%s%d$", dc->name, INT_MAX); if (buflen < 0) return (ENOMEM); dev->nameunit = malloc(buflen, M_BUS, M_NOWAIT|M_ZERO); if (!dev->nameunit) return (ENOMEM); if ((error = devclass_alloc_unit(dc, dev, &dev->unit)) != 0) { free(dev->nameunit, M_BUS); dev->nameunit = NULL; return (error); } dc->devices[dev->unit] = dev; dev->devclass = dc; snprintf(dev->nameunit, buflen, "%s%d", dc->name, dev->unit); return (0); } /** * @internal * @brief Delete a device from a devclass * * The device is removed from the devclass's device list and its unit * number is freed. 
* @param dc the devclass to delete from * @param dev the device to delete * * @retval 0 success */ static int devclass_delete_device(devclass_t dc, device_t dev) { if (!dc || !dev) return (0); PDEBUG(("%s in devclass %s", DEVICENAME(dev), DEVCLANAME(dc))); if (dev->devclass != dc || dc->devices[dev->unit] != dev) panic("devclass_delete_device: inconsistent device class"); dc->devices[dev->unit] = NULL; if (dev->flags & DF_WILDCARD) dev->unit = -1; dev->devclass = NULL; free(dev->nameunit, M_BUS); dev->nameunit = NULL; return (0); } /** * @internal * @brief Make a new device and add it as a child of @p parent * * @param parent the parent of the new device * @param name the devclass name of the new device or @c NULL * to leave the devclass unspecified * @parem unit the unit number of the new device of @c -1 to * leave the unit number unspecified * * @returns the new device */ static device_t make_device(device_t parent, const char *name, int unit) { device_t dev; devclass_t dc; PDEBUG(("%s at %s as unit %d", name, DEVICENAME(parent), unit)); if (name) { dc = devclass_find_internal(name, NULL, TRUE); if (!dc) { printf("make_device: can't find device class %s\n", name); return (NULL); } } else { dc = NULL; } dev = malloc(sizeof(*dev), M_BUS, M_NOWAIT|M_ZERO); if (!dev) return (NULL); dev->parent = parent; TAILQ_INIT(&dev->children); kobj_init((kobj_t) dev, &null_class); dev->driver = NULL; dev->devclass = NULL; dev->unit = unit; dev->nameunit = NULL; dev->desc = NULL; dev->busy = 0; dev->devflags = 0; dev->flags = DF_ENABLED; dev->order = 0; if (unit == -1) dev->flags |= DF_WILDCARD; if (name) { dev->flags |= DF_FIXEDCLASS; if (devclass_add_device(dc, dev)) { kobj_delete((kobj_t) dev, M_BUS); return (NULL); } } if (parent != NULL && device_has_quiet_children(parent)) dev->flags |= DF_QUIET | DF_QUIET_CHILDREN; dev->ivars = NULL; dev->softc = NULL; dev->state = DS_NOTPRESENT; TAILQ_INSERT_TAIL(&bus_data_devices, dev, devlink); bus_data_generation_update(); return (dev); } /** * @internal * @brief Print a description of a device. */ static int device_print_child(device_t dev, device_t child) { int retval = 0; if (device_is_alive(child)) retval += BUS_PRINT_CHILD(dev, child); else retval += device_printf(child, " not found\n"); return (retval); } /** * @brief Create a new device * * This creates a new device and adds it as a child of an existing * parent device. The new device will be added after the last existing * child with order zero. * * @param dev the device which will be the parent of the * new child device * @param name devclass name for new device or @c NULL if not * specified * @param unit unit number for new device or @c -1 if not * specified * * @returns the new device */ device_t device_add_child(device_t dev, const char *name, int unit) { return (device_add_child_ordered(dev, 0, name, unit)); } /** * @brief Create a new device * * This creates a new device and adds it as a child of an existing * parent device. The new device will be added after the last existing * child with the same order. 
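 *
 * A typical call, sketched for illustration (the "foo" devclass name is
 * hypothetical; unit -1 asks for any free unit number):
 *
 *	device_t child;
 *
 *	child = device_add_child_ordered(dev, 0, "foo", -1);
 *	if (child == NULL)
 *		return (ENXIO);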
* * @param dev the device which will be the parent of the * new child device * @param order a value which is used to partially sort the * children of @p dev - devices created using * lower values of @p order appear first in @p * dev's list of children * @param name devclass name for new device or @c NULL if not * specified * @param unit unit number for new device or @c -1 if not * specified * * @returns the new device */ device_t device_add_child_ordered(device_t dev, u_int order, const char *name, int unit) { device_t child; device_t place; PDEBUG(("%s at %s with order %u as unit %d", name, DEVICENAME(dev), order, unit)); KASSERT(name != NULL || unit == -1, ("child device with wildcard name and specific unit number")); child = make_device(dev, name, unit); if (child == NULL) return (child); child->order = order; TAILQ_FOREACH(place, &dev->children, link) { if (place->order > order) break; } if (place) { /* * The device 'place' is the first device whose order is * greater than the new child. */ TAILQ_INSERT_BEFORE(place, child, link); } else { /* * The new child's order is greater or equal to the order of * any existing device. Add the child to the tail of the list. */ TAILQ_INSERT_TAIL(&dev->children, child, link); } bus_data_generation_update(); return (child); } /** * @brief Delete a device * * This function deletes a device along with all of its children. If * the device currently has a driver attached to it, the device is * detached first using device_detach(). * * @param dev the parent device * @param child the device to delete * * @retval 0 success * @retval non-zero a unit error code describing the error */ int device_delete_child(device_t dev, device_t child) { int error; device_t grandchild; PDEBUG(("%s from %s", DEVICENAME(child), DEVICENAME(dev))); /* detach parent before deleting children, if any */ if ((error = device_detach(child)) != 0) return (error); /* remove children second */ while ((grandchild = TAILQ_FIRST(&child->children)) != NULL) { error = device_delete_child(child, grandchild); if (error) return (error); } if (child->devclass) devclass_delete_device(child->devclass, child); if (child->parent) BUS_CHILD_DELETED(dev, child); TAILQ_REMOVE(&dev->children, child, link); TAILQ_REMOVE(&bus_data_devices, child, devlink); kobj_delete((kobj_t) child, M_BUS); bus_data_generation_update(); return (0); } /** * @brief Delete all children devices of the given device, if any. * * This function deletes all children devices of the given device, if * any, using the device_delete_child() function for each device it * finds. If a child device cannot be deleted, this function will * return an error code. * * @param dev the parent device * * @retval 0 success * @retval non-zero a device would not detach */ int device_delete_children(device_t dev) { device_t child; int error; PDEBUG(("Deleting all children of %s", DEVICENAME(dev))); error = 0; while ((child = TAILQ_FIRST(&dev->children)) != NULL) { error = device_delete_child(dev, child); if (error) { PDEBUG(("Failed deleting %s", DEVICENAME(child))); break; } } return (error); } /** * @brief Find a device given a unit number * * This is similar to devclass_get_devices() but only searches for * devices which have @p dev as a parent. * * @param dev the parent device to search * @param unit the unit number to search for. If the unit is -1, * return the first child of @p dev which has name * @p classname (that is, the one with the lowest unit.) 
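 *
 * For example, an identify routine might add a child only if one does
 * not already exist (illustrative; "foo" is a hypothetical devclass):
 *
 *	if (device_find_child(dev, "foo", -1) == NULL)
 *		device_add_child(dev, "foo", -1);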
* * @returns the device with the given unit number or @c * NULL if there is no such device */ device_t device_find_child(device_t dev, const char *classname, int unit) { devclass_t dc; device_t child; dc = devclass_find(classname); if (!dc) return (NULL); if (unit != -1) { child = devclass_get_device(dc, unit); if (child && child->parent == dev) return (child); } else { for (unit = 0; unit < devclass_get_maxunit(dc); unit++) { child = devclass_get_device(dc, unit); if (child && child->parent == dev) return (child); } } return (NULL); } /** * @internal */ static driverlink_t first_matching_driver(devclass_t dc, device_t dev) { if (dev->devclass) return (devclass_find_driver_internal(dc, dev->devclass->name)); return (TAILQ_FIRST(&dc->drivers)); } /** * @internal */ static driverlink_t next_matching_driver(devclass_t dc, device_t dev, driverlink_t last) { if (dev->devclass) { driverlink_t dl; for (dl = TAILQ_NEXT(last, link); dl; dl = TAILQ_NEXT(dl, link)) if (!strcmp(dev->devclass->name, dl->driver->name)) return (dl); return (NULL); } return (TAILQ_NEXT(last, link)); } /** * @internal */ int device_probe_child(device_t dev, device_t child) { devclass_t dc; driverlink_t best = NULL; driverlink_t dl; int result, pri = 0; /* We should preserve the devclass (or lack of) set by the bus. */ int hasclass = (child->devclass != NULL); bus_topo_assert(); dc = dev->devclass; if (!dc) panic("device_probe_child: parent device has no devclass"); /* * If the state is already probed, then return. */ if (child->state == DS_ALIVE) return (0); for (; dc; dc = dc->parent) { for (dl = first_matching_driver(dc, child); dl; dl = next_matching_driver(dc, child, dl)) { /* If this driver's pass is too high, then ignore it. */ if (dl->pass > bus_current_pass) continue; PDEBUG(("Trying %s", DRIVERNAME(dl->driver))); result = device_set_driver(child, dl->driver); if (result == ENOMEM) return (result); else if (result != 0) continue; if (!hasclass) { if (device_set_devclass(child, dl->driver->name) != 0) { char const * devname = device_get_name(child); if (devname == NULL) devname = "(unknown)"; printf("driver bug: Unable to set " "devclass (class: %s " "devname: %s)\n", dl->driver->name, devname); (void)device_set_driver(child, NULL); continue; } } /* Fetch any flags for the device before probing. */ resource_int_value(dl->driver->name, child->unit, "flags", &child->devflags); result = DEVICE_PROBE(child); /* * If the driver returns SUCCESS, there can be * no higher match for this device. */ if (result == 0) { best = dl; pri = 0; break; } /* Reset flags and devclass before the next probe. */ child->devflags = 0; if (!hasclass) (void)device_set_devclass(child, NULL); /* * Reset DF_QUIET in case this driver doesn't * end up as the best driver. */ device_verbose(child); /* * Probes that return BUS_PROBE_NOWILDCARD or lower * only match on devices whose driver was explicitly * specified. */ if (result <= BUS_PROBE_NOWILDCARD && !(child->flags & DF_FIXEDCLASS)) { result = ENXIO; } /* * The driver returned an error so it * certainly doesn't match. */ if (result > 0) { (void)device_set_driver(child, NULL); continue; } /* * A priority lower than SUCCESS, remember the * best matching driver. Initialise the value * of pri for the first match. */ if (best == NULL || result > pri) { best = dl; pri = result; continue; } } /* * If we have an unambiguous match in this devclass, * don't look in the parent. 
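 *
 * (pri == 0 means some driver's probe returned 0, an unconditional
 * claim on the device, so the drivers of the parent devclass need not
 * be consulted.)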
*/ if (best && pri == 0) break; } if (best == NULL) return (ENXIO); /* * If we found a driver, change state and initialise the devclass. */ if (pri < 0) { /* Set the winning driver, devclass, and flags. */ result = device_set_driver(child, best->driver); if (result != 0) return (result); if (!child->devclass) { result = device_set_devclass(child, best->driver->name); if (result != 0) { (void)device_set_driver(child, NULL); return (result); } } resource_int_value(best->driver->name, child->unit, "flags", &child->devflags); /* * A bit bogus. Call the probe method again to make sure * that we have the right description. */ result = DEVICE_PROBE(child); if (result > 0) { if (!hasclass) (void)device_set_devclass(child, NULL); (void)device_set_driver(child, NULL); return (result); } } child->state = DS_ALIVE; bus_data_generation_update(); return (0); } /** * @brief Return the parent of a device */ device_t device_get_parent(device_t dev) { return (dev->parent); } /** * @brief Get a list of children of a device * * An array containing a list of all the children of the given device * is allocated and returned in @p *devlistp. The number of devices * in the array is returned in @p *devcountp. The caller should free * the array using @c free(p, M_TEMP). * * @param dev the device to examine * @param devlistp points at location for array pointer return * value * @param devcountp points at location for array size return value * * @retval 0 success * @retval ENOMEM the array allocation failed */ int device_get_children(device_t dev, device_t **devlistp, int *devcountp) { int count; device_t child; device_t *list; count = 0; TAILQ_FOREACH(child, &dev->children, link) { count++; } if (count == 0) { *devlistp = NULL; *devcountp = 0; return (0); } list = malloc(count * sizeof(device_t), M_TEMP, M_NOWAIT|M_ZERO); if (!list) return (ENOMEM); count = 0; TAILQ_FOREACH(child, &dev->children, link) { list[count] = child; count++; } *devlistp = list; *devcountp = count; return (0); } /** * @brief Return the current driver for the device or @c NULL if there * is no driver currently attached */ driver_t * device_get_driver(device_t dev) { return (dev->driver); } /** * @brief Return the current devclass for the device or @c NULL if * there is none. */ devclass_t device_get_devclass(device_t dev) { return (dev->devclass); } /** * @brief Return the name of the device's devclass or @c NULL if there * is none. */ const char * device_get_name(device_t dev) { if (dev != NULL && dev->devclass) return (devclass_get_name(dev->devclass)); return (NULL); } /** * @brief Return a string containing the device's devclass name * followed by an ascii representation of the device's unit number * (e.g. @c "foo2"). */ const char * device_get_nameunit(device_t dev) { return (dev->nameunit); } /** * @brief Return the device's unit number. 
*/ int device_get_unit(device_t dev) { return (dev->unit); } /** * @brief Return the device's description string */ const char * device_get_desc(device_t dev) { return (dev->desc); } /** * @brief Return the device's flags */ uint32_t device_get_flags(device_t dev) { return (dev->devflags); } struct sysctl_ctx_list * device_get_sysctl_ctx(device_t dev) { return (&dev->sysctl_ctx); } struct sysctl_oid * device_get_sysctl_tree(device_t dev) { return (dev->sysctl_tree); } /** * @brief Print the name of the device followed by a colon and a space * * @returns the number of characters printed */ int device_print_prettyname(device_t dev) { const char *name = device_get_name(dev); if (name == NULL) return (printf("unknown: ")); return (printf("%s%d: ", name, device_get_unit(dev))); } /** * @brief Print the name of the device followed by a colon, a space * and the result of calling vprintf() with the value of @p fmt and * the following arguments. * * @returns the number of characters printed */ int device_printf(device_t dev, const char * fmt, ...) { char buf[128]; struct sbuf sb; const char *name; va_list ap; size_t retval; retval = 0; sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN); sbuf_set_drain(&sb, sbuf_printf_drain, &retval); name = device_get_name(dev); if (name == NULL) sbuf_cat(&sb, "unknown: "); else sbuf_printf(&sb, "%s%d: ", name, device_get_unit(dev)); va_start(ap, fmt); sbuf_vprintf(&sb, fmt, ap); va_end(ap); sbuf_finish(&sb); sbuf_delete(&sb); return (retval); } /** * @brief Print the name of the device followed by a colon, a space * and the result of calling log() with the value of @p fmt and * the following arguments. * * @returns the number of characters printed */ int device_log(device_t dev, int pri, const char * fmt, ...) { char buf[128]; struct sbuf sb; const char *name; va_list ap; size_t retval; retval = 0; sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN); name = device_get_name(dev); if (name == NULL) sbuf_cat(&sb, "unknown: "); else sbuf_printf(&sb, "%s%d: ", name, device_get_unit(dev)); va_start(ap, fmt); sbuf_vprintf(&sb, fmt, ap); va_end(ap); sbuf_finish(&sb); log(pri, "%.*s", (int) sbuf_len(&sb), sbuf_data(&sb)); retval = sbuf_len(&sb); sbuf_delete(&sb); return (retval); } /** * @internal */ static void device_set_desc_internal(device_t dev, const char* desc, int copy) { if (dev->desc && (dev->flags & DF_DESCMALLOCED)) { free(dev->desc, M_BUS); dev->flags &= ~DF_DESCMALLOCED; dev->desc = NULL; } if (copy && desc) { dev->desc = malloc(strlen(desc) + 1, M_BUS, M_NOWAIT); if (dev->desc) { strcpy(dev->desc, desc); dev->flags |= DF_DESCMALLOCED; } } else { /* Avoid a -Wcast-qual warning */ dev->desc = (char *)(uintptr_t) desc; } bus_data_generation_update(); } /** * @brief Set the device's description * * The value of @c desc should be a string constant that will not * change (at least until the description is changed in a subsequent * call to device_set_desc() or device_set_desc_copy()). */ void device_set_desc(device_t dev, const char* desc) { device_set_desc_internal(dev, desc, FALSE); } /** * @brief Set the device's description * * The string pointed to by @c desc is copied. Use this function if * the device description is generated, (e.g. with sprintf()). 
*/ void device_set_desc_copy(device_t dev, const char* desc) { device_set_desc_internal(dev, desc, TRUE); } /** * @brief Set the device's flags */ void device_set_flags(device_t dev, uint32_t flags) { dev->devflags = flags; } /** * @brief Return the device's softc field * * The softc is allocated and zeroed when a driver is attached, based * on the size field of the driver. */ void * device_get_softc(device_t dev) { return (dev->softc); } /** * @brief Set the device's softc field * * Most drivers do not need to use this since the softc is allocated * automatically when the driver is attached. */ void device_set_softc(device_t dev, void *softc) { if (dev->softc && !(dev->flags & DF_EXTERNALSOFTC)) free(dev->softc, M_BUS_SC); dev->softc = softc; if (dev->softc) dev->flags |= DF_EXTERNALSOFTC; else dev->flags &= ~DF_EXTERNALSOFTC; } /** * @brief Free claimed softc * * Most drivers do not need to use this since the softc is freed * automatically when the driver is detached. */ void device_free_softc(void *softc) { free(softc, M_BUS_SC); } /** * @brief Claim softc * * This function can be used to let the driver free the automatically * allocated softc using "device_free_softc()". This function is * useful when the driver is refcounting the softc and the softc * cannot be freed when the "device_detach" method is called. */ void device_claim_softc(device_t dev) { if (dev->softc) dev->flags |= DF_EXTERNALSOFTC; else dev->flags &= ~DF_EXTERNALSOFTC; } /** * @brief Get the device's ivars field * * The ivars field is used by the parent device to store per-device * state (e.g. the physical location of the device or a list of * resources). */ void * device_get_ivars(device_t dev) { KASSERT(dev != NULL, ("device_get_ivars(NULL, ...)")); return (dev->ivars); } /** * @brief Set the device's ivars field */ void device_set_ivars(device_t dev, void * ivars) { KASSERT(dev != NULL, ("device_set_ivars(NULL, ...)")); dev->ivars = ivars; } /** * @brief Return the device's state */ device_state_t device_get_state(device_t dev) { return (dev->state); } /** * @brief Set the DF_ENABLED flag for the device */ void device_enable(device_t dev) { dev->flags |= DF_ENABLED; } /** * @brief Clear the DF_ENABLED flag for the device */ void device_disable(device_t dev) { dev->flags &= ~DF_ENABLED; } /** * @brief Increment the busy counter for the device */ void device_busy(device_t dev) { /* * Mark the device as busy, recursively up the tree if this busy count * goes 0->1. */ if (refcount_acquire(&dev->busy) == 0 && dev->parent != NULL) device_busy(dev->parent); } /** * @brief Decrement the busy counter for the device */ void device_unbusy(device_t dev) { /* * Mark the device as unbsy, recursively if this is the last busy count. */ if (refcount_release(&dev->busy) && dev->parent != NULL) device_unbusy(dev->parent); } /** * @brief Set the DF_QUIET flag for the device */ void device_quiet(device_t dev) { dev->flags |= DF_QUIET; } /** * @brief Set the DF_QUIET_CHILDREN flag for the device */ void device_quiet_children(device_t dev) { dev->flags |= DF_QUIET_CHILDREN; } /** * @brief Clear the DF_QUIET flag for the device */ void device_verbose(device_t dev) { dev->flags &= ~DF_QUIET; } ssize_t device_get_property(device_t dev, const char *prop, void *val, size_t sz, device_property_type_t type) { device_t bus = device_get_parent(dev); switch (type) { case DEVICE_PROP_ANY: case DEVICE_PROP_BUFFER: case DEVICE_PROP_HANDLE: /* Size checks done in implementation. 
*/ break; case DEVICE_PROP_UINT32: if (sz % 4 != 0) return (-1); break; case DEVICE_PROP_UINT64: if (sz % 8 != 0) return (-1); break; default: return (-1); } return (BUS_GET_PROPERTY(bus, dev, prop, val, sz, type)); } bool device_has_property(device_t dev, const char *prop) { return (device_get_property(dev, prop, NULL, 0, DEVICE_PROP_ANY) >= 0); } /** * @brief Return non-zero if the DF_QUIET_CHIDLREN flag is set on the device */ int device_has_quiet_children(device_t dev) { return ((dev->flags & DF_QUIET_CHILDREN) != 0); } /** * @brief Return non-zero if the DF_QUIET flag is set on the device */ int device_is_quiet(device_t dev) { return ((dev->flags & DF_QUIET) != 0); } /** * @brief Return non-zero if the DF_ENABLED flag is set on the device */ int device_is_enabled(device_t dev) { return ((dev->flags & DF_ENABLED) != 0); } /** * @brief Return non-zero if the device was successfully probed */ int device_is_alive(device_t dev) { return (dev->state >= DS_ALIVE); } /** * @brief Return non-zero if the device currently has a driver * attached to it */ int device_is_attached(device_t dev) { return (dev->state >= DS_ATTACHED); } /** * @brief Return non-zero if the device is currently suspended. */ int device_is_suspended(device_t dev) { return ((dev->flags & DF_SUSPENDED) != 0); } /** * @brief Set the devclass of a device * @see devclass_add_device(). */ int device_set_devclass(device_t dev, const char *classname) { devclass_t dc; int error; if (!classname) { if (dev->devclass) devclass_delete_device(dev->devclass, dev); return (0); } if (dev->devclass) { printf("device_set_devclass: device class already set\n"); return (EINVAL); } dc = devclass_find_internal(classname, NULL, TRUE); if (!dc) return (ENOMEM); error = devclass_add_device(dc, dev); bus_data_generation_update(); return (error); } /** * @brief Set the devclass of a device and mark the devclass fixed. * @see device_set_devclass() */ int device_set_devclass_fixed(device_t dev, const char *classname) { int error; if (classname == NULL) return (EINVAL); error = device_set_devclass(dev, classname); if (error) return (error); dev->flags |= DF_FIXEDCLASS; return (0); } /** * @brief Query the device to determine if it's of a fixed devclass * @see device_set_devclass_fixed() */ bool device_is_devclass_fixed(device_t dev) { return ((dev->flags & DF_FIXEDCLASS) != 0); } /** * @brief Set the driver of a device * * @retval 0 success * @retval EBUSY the device already has a driver attached * @retval ENOMEM a memory allocation failure occurred */ int device_set_driver(device_t dev, driver_t *driver) { int domain; struct domainset *policy; if (dev->state >= DS_ATTACHED) return (EBUSY); if (dev->driver == driver) return (0); if (dev->softc && !(dev->flags & DF_EXTERNALSOFTC)) { free(dev->softc, M_BUS_SC); dev->softc = NULL; } device_set_desc(dev, NULL); kobj_delete((kobj_t) dev, NULL); dev->driver = driver; if (driver) { kobj_init((kobj_t) dev, (kobj_class_t) driver); if (!(dev->flags & DF_EXTERNALSOFTC) && driver->size > 0) { if (bus_get_domain(dev, &domain) == 0) policy = DOMAINSET_PREF(domain); else policy = DOMAINSET_RR(); dev->softc = malloc_domainset(driver->size, M_BUS_SC, policy, M_NOWAIT | M_ZERO); if (!dev->softc) { kobj_delete((kobj_t) dev, NULL); kobj_init((kobj_t) dev, &null_class); dev->driver = NULL; return (ENOMEM); } } } else { kobj_init((kobj_t) dev, &null_class); } bus_data_generation_update(); return (0); } /** * @brief Probe a device, and return this status. * * This function is the core of the device autoconfiguration * system. 
Its purpose is to select a suitable driver for a device and * then call that driver to initialise the hardware appropriately. The * driver is selected by calling the DEVICE_PROBE() method of a set of * candidate drivers and then choosing the driver which returned the * best value. This driver is then attached to the device using * device_attach(). * * The set of suitable drivers is taken from the list of drivers in * the parent device's devclass. If the device was originally created * with a specific class name (see device_add_child()), only drivers * with that name are probed, otherwise all drivers in the devclass * are probed. If no drivers return successful probe values in the * parent devclass, the search continues in the parent of that * devclass (see devclass_get_parent()) if any. * * @param dev the device to initialise * * @retval 0 success * @retval ENXIO no driver was found * @retval ENOMEM memory allocation failure * @retval non-zero some other unix error code * @retval -1 Device already attached */ int device_probe(device_t dev) { int error; bus_topo_assert(); if (dev->state >= DS_ALIVE) return (-1); if (!(dev->flags & DF_ENABLED)) { if (bootverbose && device_get_name(dev) != NULL) { device_print_prettyname(dev); printf("not probed (disabled)\n"); } return (-1); } if ((error = device_probe_child(dev->parent, dev)) != 0) { if (bus_current_pass == BUS_PASS_DEFAULT && !(dev->flags & DF_DONENOMATCH)) { device_handle_nomatch(dev); } return (error); } return (0); } /** * @brief Probe a device and attach a driver if possible * * calls device_probe() and attaches if that was successful. */ int device_probe_and_attach(device_t dev) { int error; bus_topo_assert(); error = device_probe(dev); if (error == -1) return (0); else if (error != 0) return (error); CURVNET_SET_QUIET(vnet0); error = device_attach(dev); CURVNET_RESTORE(); return error; } /** * @brief Attach a device driver to a device * * This function is a wrapper around the DEVICE_ATTACH() driver * method. In addition to calling DEVICE_ATTACH(), it initialises the * device's sysctl tree, optionally prints a description of the device * and queues a notification event for user-based device management * services. * * Normally this function is only called internally from * device_probe_and_attach(). * * @param dev the device to initialise * * @retval 0 success * @retval ENXIO no driver was found * @retval ENOMEM memory allocation failure * @retval non-zero some other unix error code */ int device_attach(device_t dev) { uint64_t attachtime; uint16_t attachentropy; int error; if (resource_disabled(dev->driver->name, dev->unit)) { device_disable(dev); if (bootverbose) device_printf(dev, "disabled via hints entry\n"); return (ENXIO); } device_sysctl_init(dev); if (!device_is_quiet(dev)) device_print_child(dev->parent, dev); attachtime = get_cyclecount(); dev->state = DS_ATTACHING; if ((error = DEVICE_ATTACH(dev)) != 0) { printf("device_attach: %s%d attach returned %d\n", dev->driver->name, dev->unit, error); if (disable_failed_devs) { /* * When the user has asked to disable failed devices, we * directly disable the device, but leave it in the * attaching state. It will not try to probe/attach the * device further. This leaves the device numbering * intact for other similar devices in the system. It * can be removed from this state with devctl. */ device_disable(dev); } else { /* * Otherwise, when attach fails, tear down the state * around that so we can retry when, for example, new * drivers are loaded. 
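 * The teardown below mirrors device_detach(): drop the devclass (unless it
 * was wired with device_set_devclass_fixed()), clear the driver and softc
 * via device_set_driver(NULL) and tear down the sysctl tree, leaving the
 * device in DS_NOTPRESENT.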
*/ if (!(dev->flags & DF_FIXEDCLASS)) devclass_delete_device(dev->devclass, dev); (void)device_set_driver(dev, NULL); device_sysctl_fini(dev); KASSERT(dev->busy == 0, ("attach failed but busy")); dev->state = DS_NOTPRESENT; } return (error); } dev->flags |= DF_ATTACHED_ONCE; /* * We only need the low bits of this time, but ranges from tens to thousands * have been seen, so keep 2 bytes' worth. */ attachentropy = (uint16_t)(get_cyclecount() - attachtime); random_harvest_direct(&attachentropy, sizeof(attachentropy), RANDOM_ATTACH); device_sysctl_update(dev); dev->state = DS_ATTACHED; dev->flags &= ~DF_DONENOMATCH; EVENTHANDLER_DIRECT_INVOKE(device_attach, dev); return (0); } /** * @brief Detach a driver from a device * * This function is a wrapper around the DEVICE_DETACH() driver * method. If the call to DEVICE_DETACH() succeeds, it calls * BUS_CHILD_DETACHED() for the parent of @p dev, queues a * notification event for user-based device management services and * cleans up the device's sysctl tree. * * @param dev the device to un-initialise * * @retval 0 success * @retval ENXIO no driver was found * @retval ENOMEM memory allocation failure * @retval non-zero some other unix error code */ int device_detach(device_t dev) { int error; bus_topo_assert(); PDEBUG(("%s", DEVICENAME(dev))); if (dev->busy > 0) return (EBUSY); if (dev->state == DS_ATTACHING) { device_printf(dev, "device in attaching state! Deferring detach.\n"); return (EBUSY); } if (dev->state != DS_ATTACHED) return (0); EVENTHANDLER_DIRECT_INVOKE(device_detach, dev, EVHDEV_DETACH_BEGIN); if ((error = DEVICE_DETACH(dev)) != 0) { EVENTHANDLER_DIRECT_INVOKE(device_detach, dev, EVHDEV_DETACH_FAILED); return (error); } else { EVENTHANDLER_DIRECT_INVOKE(device_detach, dev, EVHDEV_DETACH_COMPLETE); } if (!device_is_quiet(dev)) device_printf(dev, "detached\n"); if (dev->parent) BUS_CHILD_DETACHED(dev->parent, dev); if (!(dev->flags & DF_FIXEDCLASS)) devclass_delete_device(dev->devclass, dev); device_verbose(dev); dev->state = DS_NOTPRESENT; (void)device_set_driver(dev, NULL); device_sysctl_fini(dev); return (0); } /** * @brief Tells a driver to quiesce itself. * * This function is a wrapper around the DEVICE_QUIESCE() driver * method. If the call to DEVICE_QUIESCE() succeeds. * * @param dev the device to quiesce * * @retval 0 success * @retval ENXIO no driver was found * @retval ENOMEM memory allocation failure * @retval non-zero some other unix error code */ int device_quiesce(device_t dev) { PDEBUG(("%s", DEVICENAME(dev))); if (dev->busy > 0) return (EBUSY); if (dev->state != DS_ATTACHED) return (0); return (DEVICE_QUIESCE(dev)); } /** * @brief Notify a device of system shutdown * * This function calls the DEVICE_SHUTDOWN() driver method if the * device currently has an attached driver. * * @returns the value returned by DEVICE_SHUTDOWN() */ int device_shutdown(device_t dev) { if (dev->state < DS_ATTACHED) return (0); return (DEVICE_SHUTDOWN(dev)); } /** * @brief Set the unit number of a device * * This function can be used to override the unit number used for a * device (e.g. to wire a device to a pre-configured unit number). 
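 *
 * A hedged sketch (@c wired_unit is a hypothetical, hint-derived value):
 * @code
 *	if (device_set_unit(child, wired_unit) != 0)
 *		device_printf(child, "cannot wire unit %d\n", wired_unit);
 * @endcode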
*/ int device_set_unit(device_t dev, int unit) { devclass_t dc; int err; if (unit == dev->unit) return (0); dc = device_get_devclass(dev); if (unit < dc->maxunit && dc->devices[unit]) return (EBUSY); err = devclass_delete_device(dc, dev); if (err) return (err); dev->unit = unit; err = devclass_add_device(dc, dev); if (err) return (err); bus_data_generation_update(); return (0); } /*======================================*/ /* * Some useful method implementations to make life easier for bus drivers. */ void resource_init_map_request_impl(struct resource_map_request *args, size_t sz) { bzero(args, sz); args->size = sz; args->memattr = VM_MEMATTR_DEVICE; } /** * @brief Initialise a resource list. * * @param rl the resource list to initialise */ void resource_list_init(struct resource_list *rl) { STAILQ_INIT(rl); } /** * @brief Reclaim memory used by a resource list. * * This function frees the memory for all resource entries on the list * (if any). * * @param rl the resource list to free */ void resource_list_free(struct resource_list *rl) { struct resource_list_entry *rle; while ((rle = STAILQ_FIRST(rl)) != NULL) { if (rle->res) panic("resource_list_free: resource entry is busy"); STAILQ_REMOVE_HEAD(rl, link); free(rle, M_BUS); } } /** * @brief Add a resource entry. * * This function adds a resource entry using the given @p type, @p * start, @p end and @p count values. A rid value is chosen by * searching sequentially for the first unused rid starting at zero. * * @param rl the resource list to edit * @param type the resource entry type (e.g. SYS_RES_MEMORY) * @param start the start address of the resource * @param end the end address of the resource * @param count XXX end-start+1 */ int resource_list_add_next(struct resource_list *rl, int type, rman_res_t start, rman_res_t end, rman_res_t count) { int rid; rid = 0; while (resource_list_find(rl, type, rid) != NULL) rid++; resource_list_add(rl, type, rid, start, end, count); return (rid); } /** * @brief Add or modify a resource entry. * * If an existing entry exists with the same type and rid, it will be * modified using the given values of @p start, @p end and @p * count. If no entry exists, a new one will be created using the * given values. The resource list entry that matches is then returned. * * @param rl the resource list to edit * @param type the resource entry type (e.g. SYS_RES_MEMORY) * @param rid the resource identifier * @param start the start address of the resource * @param end the end address of the resource * @param count XXX end-start+1 */ struct resource_list_entry * resource_list_add(struct resource_list *rl, int type, int rid, rman_res_t start, rman_res_t end, rman_res_t count) { struct resource_list_entry *rle; rle = resource_list_find(rl, type, rid); if (!rle) { rle = malloc(sizeof(struct resource_list_entry), M_BUS, M_NOWAIT); if (!rle) panic("resource_list_add: can't record entry"); STAILQ_INSERT_TAIL(rl, rle, link); rle->type = type; rle->rid = rid; rle->res = NULL; rle->flags = 0; } if (rle->res) panic("resource_list_add: resource entry is busy"); rle->start = start; rle->end = end; rle->count = count; return (rle); } /** * @brief Determine if a resource entry is busy. * * Returns true if a resource entry is busy meaning that it has an * associated resource that is not an unallocated "reserved" resource. * * @param rl the resource list to search * @param type the resource entry type (e.g. SYS_RES_MEMORY) * @param rid the resource identifier * * @returns Non-zero if the entry is busy, zero otherwise. 
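 *
 * Usage sketch (@c rl is assumed to be the bus's resource list for a child):
 * @code
 *	if (resource_list_busy(rl, SYS_RES_MEMORY, 0))
 *		return (EBUSY);
 * @endcode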
*/ int resource_list_busy(struct resource_list *rl, int type, int rid) { struct resource_list_entry *rle; rle = resource_list_find(rl, type, rid); if (rle == NULL || rle->res == NULL) return (0); if ((rle->flags & (RLE_RESERVED | RLE_ALLOCATED)) == RLE_RESERVED) { KASSERT(!(rman_get_flags(rle->res) & RF_ACTIVE), ("reserved resource is active")); return (0); } return (1); } /** * @brief Determine if a resource entry is reserved. * * Returns true if a resource entry is reserved meaning that it has an * associated "reserved" resource. The resource can either be * allocated or unallocated. * * @param rl the resource list to search * @param type the resource entry type (e.g. SYS_RES_MEMORY) * @param rid the resource identifier * * @returns Non-zero if the entry is reserved, zero otherwise. */ int resource_list_reserved(struct resource_list *rl, int type, int rid) { struct resource_list_entry *rle; rle = resource_list_find(rl, type, rid); if (rle != NULL && rle->flags & RLE_RESERVED) return (1); return (0); } /** * @brief Find a resource entry by type and rid. * * @param rl the resource list to search * @param type the resource entry type (e.g. SYS_RES_MEMORY) * @param rid the resource identifier * * @returns the resource entry pointer or NULL if there is no such * entry. */ struct resource_list_entry * resource_list_find(struct resource_list *rl, int type, int rid) { struct resource_list_entry *rle; STAILQ_FOREACH(rle, rl, link) { if (rle->type == type && rle->rid == rid) return (rle); } return (NULL); } /** * @brief Delete a resource entry. * * @param rl the resource list to edit * @param type the resource entry type (e.g. SYS_RES_MEMORY) * @param rid the resource identifier */ void resource_list_delete(struct resource_list *rl, int type, int rid) { struct resource_list_entry *rle = resource_list_find(rl, type, rid); if (rle) { if (rle->res != NULL) panic("resource_list_delete: resource has not been released"); STAILQ_REMOVE(rl, rle, resource_list_entry, link); free(rle, M_BUS); } } /** * @brief Allocate a reserved resource * * This can be used by buses to force the allocation of resources * that are always active in the system even if they are not allocated * by a driver (e.g. PCI BARs). This function is usually called when * adding a new child to the bus. The resource is allocated from the * parent bus when it is reserved. The resource list entry is marked * with RLE_RESERVED to note that it is a reserved resource. * * Subsequent attempts to allocate the resource with * resource_list_alloc() will succeed the first time and will set * RLE_ALLOCATED to note that it has been allocated. When a reserved * resource that has been allocated is released with * resource_list_release() the resource RLE_ALLOCATED is cleared, but * the actual resource remains allocated. The resource can be released to * the parent bus by calling resource_list_unreserve(). 
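 *
 * Lifecycle sketch (illustrative only; @c bus, @c child, @c rl, @c start,
 * @c end and @c count are assumed to be set up by the caller):
 * @code
 *	rid = 0;
 *	r = resource_list_reserve(rl, bus, child, SYS_RES_MEMORY, &rid,
 *	    start, end, count, 0);		// reserved, inactive
 *	r = resource_list_alloc(rl, bus, child, SYS_RES_MEMORY, &rid,
 *	    0, ~0, 1, RF_ACTIVE);		// RLE_ALLOCATED is set
 *	resource_list_release(rl, bus, child, SYS_RES_MEMORY, rid, r);
 *						// allocated flag cleared, still reserved
 *	resource_list_unreserve(rl, bus, child, SYS_RES_MEMORY, rid);
 *						// returned to the parent bus
 * @endcode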
* * @param rl the resource list to allocate from * @param bus the parent device of @p child * @param child the device for which the resource is being reserved * @param type the type of resource to allocate * @param rid a pointer to the resource identifier * @param start hint at the start of the resource range - pass * @c 0 for any start address * @param end hint at the end of the resource range - pass * @c ~0 for any end address * @param count hint at the size of range required - pass @c 1 * for any size * @param flags any extra flags to control the resource * allocation - see @c RF_XXX flags in * for details * * @returns the resource which was allocated or @c NULL if no * resource could be allocated */ struct resource * resource_list_reserve(struct resource_list *rl, device_t bus, device_t child, int type, int *rid, rman_res_t start, rman_res_t end, rman_res_t count, u_int flags) { struct resource_list_entry *rle = NULL; int passthrough = (device_get_parent(child) != bus); struct resource *r; if (passthrough) panic( "resource_list_reserve() should only be called for direct children"); if (flags & RF_ACTIVE) panic( "resource_list_reserve() should only reserve inactive resources"); r = resource_list_alloc(rl, bus, child, type, rid, start, end, count, flags); if (r != NULL) { rle = resource_list_find(rl, type, *rid); rle->flags |= RLE_RESERVED; } return (r); } /** * @brief Helper function for implementing BUS_ALLOC_RESOURCE() * * Implement BUS_ALLOC_RESOURCE() by looking up a resource from the list * and passing the allocation up to the parent of @p bus. This assumes * that the first entry of @c device_get_ivars(child) is a struct * resource_list. This also handles 'passthrough' allocations where a * child is a remote descendant of bus by passing the allocation up to * the parent of bus. * * Typically, a bus driver would store a list of child resources * somewhere in the child device's ivars (see device_get_ivars()) and * its implementation of BUS_ALLOC_RESOURCE() would find that list and * then call resource_list_alloc() to perform the allocation. 
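 *
 * Sketch of a bus method built on this helper (hypothetical "foo" bus whose
 * per-child ivars, struct foo_devinfo, contain a struct resource_list rl):
 * @code
 *	static struct resource *
 *	foo_alloc_resource(device_t bus, device_t child, int type, int *rid,
 *	    rman_res_t start, rman_res_t end, rman_res_t count, u_int flags)
 *	{
 *		struct foo_devinfo *dinfo = device_get_ivars(child);
 *
 *		return (resource_list_alloc(&dinfo->rl, bus, child, type, rid,
 *		    start, end, count, flags));
 *	}
 * @endcode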
* * @param rl the resource list to allocate from * @param bus the parent device of @p child * @param child the device which is requesting an allocation * @param type the type of resource to allocate * @param rid a pointer to the resource identifier * @param start hint at the start of the resource range - pass * @c 0 for any start address * @param end hint at the end of the resource range - pass * @c ~0 for any end address * @param count hint at the size of range required - pass @c 1 * for any size * @param flags any extra flags to control the resource * allocation - see @c RF_XXX flags in * for details * * @returns the resource which was allocated or @c NULL if no * resource could be allocated */ struct resource * resource_list_alloc(struct resource_list *rl, device_t bus, device_t child, int type, int *rid, rman_res_t start, rman_res_t end, rman_res_t count, u_int flags) { struct resource_list_entry *rle = NULL; int passthrough = (device_get_parent(child) != bus); int isdefault = RMAN_IS_DEFAULT_RANGE(start, end); if (passthrough) { return (BUS_ALLOC_RESOURCE(device_get_parent(bus), child, type, rid, start, end, count, flags)); } rle = resource_list_find(rl, type, *rid); if (!rle) return (NULL); /* no resource of that type/rid */ if (rle->res) { if (rle->flags & RLE_RESERVED) { if (rle->flags & RLE_ALLOCATED) return (NULL); if ((flags & RF_ACTIVE) && bus_activate_resource(child, type, *rid, rle->res) != 0) return (NULL); rle->flags |= RLE_ALLOCATED; return (rle->res); } device_printf(bus, "resource entry %#x type %d for child %s is busy\n", *rid, type, device_get_nameunit(child)); return (NULL); } if (isdefault) { start = rle->start; count = ulmax(count, rle->count); end = ulmax(rle->end, start + count - 1); } rle->res = BUS_ALLOC_RESOURCE(device_get_parent(bus), child, type, rid, start, end, count, flags); /* * Record the new range. */ if (rle->res) { rle->start = rman_get_start(rle->res); rle->end = rman_get_end(rle->res); rle->count = count; } return (rle->res); } /** * @brief Helper function for implementing BUS_RELEASE_RESOURCE() * * Implement BUS_RELEASE_RESOURCE() using a resource list. Normally * used with resource_list_alloc(). 
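 *
 * The matching release method for the sketch above would be (again,
 * struct foo_devinfo is hypothetical):
 * @code
 *	return (resource_list_release(&dinfo->rl, bus, child, type, rid, r));
 * @endcode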
* * @param rl the resource list which was allocated from * @param bus the parent device of @p child * @param child the device which is requesting a release * @param type the type of resource to release * @param rid the resource identifier * @param res the resource to release * * @retval 0 success * @retval non-zero a standard unix error code indicating what * error condition prevented the operation */ int resource_list_release(struct resource_list *rl, device_t bus, device_t child, int type, int rid, struct resource *res) { struct resource_list_entry *rle = NULL; int passthrough = (device_get_parent(child) != bus); int error; if (passthrough) { return (BUS_RELEASE_RESOURCE(device_get_parent(bus), child, type, rid, res)); } rle = resource_list_find(rl, type, rid); if (!rle) panic("resource_list_release: can't find resource"); if (!rle->res) panic("resource_list_release: resource entry is not busy"); if (rle->flags & RLE_RESERVED) { if (rle->flags & RLE_ALLOCATED) { if (rman_get_flags(res) & RF_ACTIVE) { error = bus_deactivate_resource(child, type, rid, res); if (error) return (error); } rle->flags &= ~RLE_ALLOCATED; return (0); } return (EINVAL); } error = BUS_RELEASE_RESOURCE(device_get_parent(bus), child, type, rid, res); if (error) return (error); rle->res = NULL; return (0); } /** * @brief Release all active resources of a given type * * Release all active resources of a specified type. This is intended * to be used to cleanup resources leaked by a driver after detach or * a failed attach. * * @param rl the resource list which was allocated from * @param bus the parent device of @p child * @param child the device whose active resources are being released * @param type the type of resources to release * * @retval 0 success * @retval EBUSY at least one resource was active */ int resource_list_release_active(struct resource_list *rl, device_t bus, device_t child, int type) { struct resource_list_entry *rle; int error, retval; retval = 0; STAILQ_FOREACH(rle, rl, link) { if (rle->type != type) continue; if (rle->res == NULL) continue; if ((rle->flags & (RLE_RESERVED | RLE_ALLOCATED)) == RLE_RESERVED) continue; retval = EBUSY; error = resource_list_release(rl, bus, child, type, rman_get_rid(rle->res), rle->res); if (error != 0) device_printf(bus, "Failed to release active resource: %d\n", error); } return (retval); } /** * @brief Fully release a reserved resource * * Fully releases a resource reserved via resource_list_reserve(). 
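 *
 * The entry must not currently be allocated by the child; release it with
 * resource_list_release() first, otherwise EBUSY is returned.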
* * @param rl the resource list which was allocated from * @param bus the parent device of @p child * @param child the device whose reserved resource is being released * @param type the type of resource to release * @param rid the resource identifier * @param res the resource to release * * @retval 0 success * @retval non-zero a standard unix error code indicating what * error condition prevented the operation */ int resource_list_unreserve(struct resource_list *rl, device_t bus, device_t child, int type, int rid) { struct resource_list_entry *rle = NULL; int passthrough = (device_get_parent(child) != bus); if (passthrough) panic( "resource_list_unreserve() should only be called for direct children"); rle = resource_list_find(rl, type, rid); if (!rle) panic("resource_list_unreserve: can't find resource"); if (!(rle->flags & RLE_RESERVED)) return (EINVAL); if (rle->flags & RLE_ALLOCATED) return (EBUSY); rle->flags &= ~RLE_RESERVED; return (resource_list_release(rl, bus, child, type, rid, rle->res)); } /** * @brief Print a description of resources in a resource list * * Print all resources of a specified type, for use in BUS_PRINT_CHILD(). * The name is printed if at least one resource of the given type is available. * The format is used to print resource start and end. * * @param rl the resource list to print * @param name the name of @p type, e.g. @c "memory" * @param type type type of resource entry to print * @param format printf(9) format string to print resource * start and end values * * @returns the number of characters printed */ int resource_list_print_type(struct resource_list *rl, const char *name, int type, const char *format) { struct resource_list_entry *rle; int printed, retval; printed = 0; retval = 0; /* Yes, this is kinda cheating */ STAILQ_FOREACH(rle, rl, link) { if (rle->type == type) { if (printed == 0) retval += printf(" %s ", name); else retval += printf(","); printed++; retval += printf(format, rle->start); if (rle->count > 1) { retval += printf("-"); retval += printf(format, rle->start + rle->count - 1); } } } return (retval); } /** * @brief Releases all the resources in a list. * * @param rl The resource list to purge. * * @returns nothing */ void resource_list_purge(struct resource_list *rl) { struct resource_list_entry *rle; while ((rle = STAILQ_FIRST(rl)) != NULL) { if (rle->res) bus_release_resource(rman_get_device(rle->res), rle->type, rle->rid, rle->res); STAILQ_REMOVE_HEAD(rl, link); free(rle, M_BUS); } } device_t bus_generic_add_child(device_t dev, u_int order, const char *name, int unit) { return (device_add_child_ordered(dev, order, name, unit)); } /** * @brief Helper function for implementing DEVICE_PROBE() * * This function can be used to help implement the DEVICE_PROBE() for * a bus (i.e. a device which has other devices attached to it). It * calls the DEVICE_IDENTIFY() method of each driver in the device's * devclass. */ int bus_generic_probe(device_t dev) { devclass_t dc = dev->devclass; driverlink_t dl; TAILQ_FOREACH(dl, &dc->drivers, link) { /* * If this driver's pass is too high, then ignore it. * For most drivers in the default pass, this will * never be true. For early-pass drivers they will * only call the identify routines of eligible drivers * when this routine is called. Drivers for later * passes should have their identify routines called * on early-pass buses during BUS_NEW_PASS(). 
*/ if (dl->pass > bus_current_pass) continue; DEVICE_IDENTIFY(dl->driver, dev); } return (0); } /** * @brief Helper function for implementing DEVICE_ATTACH() * * This function can be used to help implement the DEVICE_ATTACH() for * a bus. It calls device_probe_and_attach() for each of the device's * children. */ int bus_generic_attach(device_t dev) { device_t child; TAILQ_FOREACH(child, &dev->children, link) { device_probe_and_attach(child); } return (0); } /** * @brief Helper function for delaying attaching children * * Many buses can't run transactions on the bus which children need to probe and * attach until after interrupts and/or timers are running. This function * delays their attach until interrupts and timers are enabled. */ int bus_delayed_attach_children(device_t dev) { /* Probe and attach the bus children when interrupts are available */ config_intrhook_oneshot((ich_func_t)bus_generic_attach, dev); return (0); } /** * @brief Helper function for implementing DEVICE_DETACH() * * This function can be used to help implement the DEVICE_DETACH() for * a bus. It calls device_detach() for each of the device's * children. */ int bus_generic_detach(device_t dev) { device_t child; int error; if (dev->state != DS_ATTACHED) return (EBUSY); /* * Detach children in the reverse order. * See bus_generic_suspend for details. */ TAILQ_FOREACH_REVERSE(child, &dev->children, device_list, link) { if ((error = device_detach(child)) != 0) return (error); } return (0); } /** * @brief Helper function for implementing DEVICE_SHUTDOWN() * * This function can be used to help implement the DEVICE_SHUTDOWN() * for a bus. It calls device_shutdown() for each of the device's * children. */ int bus_generic_shutdown(device_t dev) { device_t child; /* * Shut down children in the reverse order. * See bus_generic_suspend for details. */ TAILQ_FOREACH_REVERSE(child, &dev->children, device_list, link) { device_shutdown(child); } return (0); } /** * @brief Default function for suspending a child device. * * This function is to be used by a bus's DEVICE_SUSPEND_CHILD(). */ int bus_generic_suspend_child(device_t dev, device_t child) { int error; error = DEVICE_SUSPEND(child); if (error == 0) child->flags |= DF_SUSPENDED; return (error); } /** * @brief Default function for resuming a child device. * * This function is to be used by a bus's DEVICE_RESUME_CHILD(). */ int bus_generic_resume_child(device_t dev, device_t child) { DEVICE_RESUME(child); child->flags &= ~DF_SUSPENDED; return (0); } /** * @brief Helper function for implementing DEVICE_SUSPEND() * * This function can be used to help implement the DEVICE_SUSPEND() * for a bus. It calls DEVICE_SUSPEND() for each of the device's * children. If any call to DEVICE_SUSPEND() fails, the suspend * operation is aborted and any devices which were suspended are * resumed immediately by calling their DEVICE_RESUME() methods. */ int bus_generic_suspend(device_t dev) { int error; device_t child; /* * Suspend children in the reverse order. * For most buses all children are equal, so the order does not matter. * Other buses, such as acpi, carefully order their child devices to * express implicit dependencies between them. For such buses it is * safer to bring down devices in the reverse order. 
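 * If any child fails to suspend, the loop below walks forward from the
 * failing child's successor and resumes every device that had already been
 * suspended before returning the error.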
*/ TAILQ_FOREACH_REVERSE(child, &dev->children, device_list, link) { error = BUS_SUSPEND_CHILD(dev, child); if (error != 0) { child = TAILQ_NEXT(child, link); if (child != NULL) { TAILQ_FOREACH_FROM(child, &dev->children, link) BUS_RESUME_CHILD(dev, child); } return (error); } } return (0); } /** * @brief Helper function for implementing DEVICE_RESUME() * * This function can be used to help implement the DEVICE_RESUME() for * a bus. It calls DEVICE_RESUME() on each of the device's children. */ int bus_generic_resume(device_t dev) { device_t child; TAILQ_FOREACH(child, &dev->children, link) { BUS_RESUME_CHILD(dev, child); /* if resume fails, there's nothing we can usefully do... */ } return (0); } /** * @brief Helper function for implementing BUS_RESET_POST * * Bus can use this function to implement common operations of * re-attaching or resuming the children after the bus itself was * reset, and after restoring bus-unique state of children. * * @param dev The bus * #param flags DEVF_RESET_* */ int bus_helper_reset_post(device_t dev, int flags) { device_t child; int error, error1; error = 0; TAILQ_FOREACH(child, &dev->children,link) { BUS_RESET_POST(dev, child); error1 = (flags & DEVF_RESET_DETACH) != 0 ? device_probe_and_attach(child) : BUS_RESUME_CHILD(dev, child); if (error == 0 && error1 != 0) error = error1; } return (error); } static void bus_helper_reset_prepare_rollback(device_t dev, device_t child, int flags) { child = TAILQ_NEXT(child, link); if (child == NULL) return; TAILQ_FOREACH_FROM(child, &dev->children,link) { BUS_RESET_POST(dev, child); if ((flags & DEVF_RESET_DETACH) != 0) device_probe_and_attach(child); else BUS_RESUME_CHILD(dev, child); } } /** * @brief Helper function for implementing BUS_RESET_PREPARE * * Bus can use this function to implement common operations of * detaching or suspending the children before the bus itself is * reset, and then save bus-unique state of children that must * persists around reset. * * @param dev The bus * #param flags DEVF_RESET_* */ int bus_helper_reset_prepare(device_t dev, int flags) { device_t child; int error; if (dev->state != DS_ATTACHED) return (EBUSY); TAILQ_FOREACH_REVERSE(child, &dev->children, device_list, link) { if ((flags & DEVF_RESET_DETACH) != 0) { error = device_get_state(child) == DS_ATTACHED ? device_detach(child) : 0; } else { error = BUS_SUSPEND_CHILD(dev, child); } if (error == 0) { error = BUS_RESET_PREPARE(dev, child); if (error != 0) { if ((flags & DEVF_RESET_DETACH) != 0) device_probe_and_attach(child); else BUS_RESUME_CHILD(dev, child); } } if (error != 0) { bus_helper_reset_prepare_rollback(dev, child, flags); return (error); } } return (0); } /** * @brief Helper function for implementing BUS_PRINT_CHILD(). * * This function prints the first part of the ascii representation of * @p child, including its name, unit and description (if any - see * device_set_desc()). * * @returns the number of characters printed */ int bus_print_child_header(device_t dev, device_t child) { int retval = 0; if (device_get_desc(child)) { retval += device_printf(child, "<%s>", device_get_desc(child)); } else { retval += printf("%s", device_get_nameunit(child)); } return (retval); } /** * @brief Helper function for implementing BUS_PRINT_CHILD(). * * This function prints the last part of the ascii representation of * @p child, which consists of the string @c " on " followed by the * name and unit of the @p dev. 
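 *
 * Together with bus_print_child_header() this produces a line of the
 * (illustrative) form "foo0: <Foo controller> on bar0" for a hypothetical
 * child foo0 attached to bus bar0.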
* * @returns the number of characters printed */ int bus_print_child_footer(device_t dev, device_t child) { return (printf(" on %s\n", device_get_nameunit(dev))); } /** * @brief Helper function for implementing BUS_PRINT_CHILD(). * * This function prints out the VM domain for the given device. * * @returns the number of characters printed */ int bus_print_child_domain(device_t dev, device_t child) { int domain; /* No domain? Don't print anything */ if (BUS_GET_DOMAIN(dev, child, &domain) != 0) return (0); return (printf(" numa-domain %d", domain)); } /** * @brief Helper function for implementing BUS_PRINT_CHILD(). * * This function simply calls bus_print_child_header() followed by * bus_print_child_footer(). * * @returns the number of characters printed */ int bus_generic_print_child(device_t dev, device_t child) { int retval = 0; retval += bus_print_child_header(dev, child); retval += bus_print_child_domain(dev, child); retval += bus_print_child_footer(dev, child); return (retval); } /** * @brief Stub function for implementing BUS_READ_IVAR(). * * @returns ENOENT */ int bus_generic_read_ivar(device_t dev, device_t child, int index, uintptr_t * result) { return (ENOENT); } /** * @brief Stub function for implementing BUS_WRITE_IVAR(). * * @returns ENOENT */ int bus_generic_write_ivar(device_t dev, device_t child, int index, uintptr_t value) { return (ENOENT); } /** * @brief Helper function for implementing BUS_GET_PROPERTY(). * * This simply calls the BUS_GET_PROPERTY of the parent of dev, * until a non-default implementation is found. */ ssize_t bus_generic_get_property(device_t dev, device_t child, const char *propname, void *propvalue, size_t size, device_property_type_t type) { if (device_get_parent(dev) != NULL) return (BUS_GET_PROPERTY(device_get_parent(dev), child, propname, propvalue, size, type)); return (-1); } /** * @brief Stub function for implementing BUS_GET_RESOURCE_LIST(). * * @returns NULL */ struct resource_list * bus_generic_get_resource_list(device_t dev, device_t child) { return (NULL); } /** * @brief Helper function for implementing BUS_DRIVER_ADDED(). * * This implementation of BUS_DRIVER_ADDED() simply calls the driver's * DEVICE_IDENTIFY() method to allow it to add new children to the bus * and then calls device_probe_and_attach() for each unattached child. */ void bus_generic_driver_added(device_t dev, driver_t *driver) { device_t child; DEVICE_IDENTIFY(driver, dev); TAILQ_FOREACH(child, &dev->children, link) { if (child->state == DS_NOTPRESENT) device_probe_and_attach(child); } } /** * @brief Helper function for implementing BUS_NEW_PASS(). * * This implementing of BUS_NEW_PASS() first calls the identify * routines for any drivers that probe at the current pass. Then it * walks the list of devices for this bus. If a device is already * attached, then it calls BUS_NEW_PASS() on that device. If the * device is not already attached, it attempts to attach a driver to * it. */ void bus_generic_new_pass(device_t dev) { driverlink_t dl; devclass_t dc; device_t child; dc = dev->devclass; TAILQ_FOREACH(dl, &dc->drivers, link) { if (dl->pass == bus_current_pass) DEVICE_IDENTIFY(dl->driver, dev); } TAILQ_FOREACH(child, &dev->children, link) { if (child->state >= DS_ATTACHED) BUS_NEW_PASS(child); else if (child->state == DS_NOTPRESENT) device_probe_and_attach(child); } } /** * @brief Helper function for implementing BUS_SETUP_INTR(). * * This simple implementation of BUS_SETUP_INTR() simply calls the * BUS_SETUP_INTR() method of the parent of @p dev. 
*/ int bus_generic_setup_intr(device_t dev, device_t child, struct resource *irq, int flags, driver_filter_t *filter, driver_intr_t *intr, void *arg, void **cookiep) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent) return (BUS_SETUP_INTR(dev->parent, child, irq, flags, filter, intr, arg, cookiep)); return (EINVAL); } /** * @brief Helper function for implementing BUS_TEARDOWN_INTR(). * * This simple implementation of BUS_TEARDOWN_INTR() simply calls the * BUS_TEARDOWN_INTR() method of the parent of @p dev. */ int bus_generic_teardown_intr(device_t dev, device_t child, struct resource *irq, void *cookie) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent) return (BUS_TEARDOWN_INTR(dev->parent, child, irq, cookie)); return (EINVAL); } /** * @brief Helper function for implementing BUS_SUSPEND_INTR(). * * This simple implementation of BUS_SUSPEND_INTR() simply calls the * BUS_SUSPEND_INTR() method of the parent of @p dev. */ int bus_generic_suspend_intr(device_t dev, device_t child, struct resource *irq) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent) return (BUS_SUSPEND_INTR(dev->parent, child, irq)); return (EINVAL); } /** * @brief Helper function for implementing BUS_RESUME_INTR(). * * This simple implementation of BUS_RESUME_INTR() simply calls the * BUS_RESUME_INTR() method of the parent of @p dev. */ int bus_generic_resume_intr(device_t dev, device_t child, struct resource *irq) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent) return (BUS_RESUME_INTR(dev->parent, child, irq)); return (EINVAL); } /** * @brief Helper function for implementing BUS_ADJUST_RESOURCE(). * * This simple implementation of BUS_ADJUST_RESOURCE() simply calls the * BUS_ADJUST_RESOURCE() method of the parent of @p dev. */ int bus_generic_adjust_resource(device_t dev, device_t child, int type, struct resource *r, rman_res_t start, rman_res_t end) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent) return (BUS_ADJUST_RESOURCE(dev->parent, child, type, r, start, end)); return (EINVAL); } /* * @brief Helper function for implementing BUS_TRANSLATE_RESOURCE(). * * This simple implementation of BUS_TRANSLATE_RESOURCE() simply calls the * BUS_TRANSLATE_RESOURCE() method of the parent of @p dev. If there is no * parent, no translation happens. */ int bus_generic_translate_resource(device_t dev, int type, rman_res_t start, rman_res_t *newstart) { if (dev->parent) return (BUS_TRANSLATE_RESOURCE(dev->parent, type, start, newstart)); *newstart = start; return (0); } /** * @brief Helper function for implementing BUS_ALLOC_RESOURCE(). * * This simple implementation of BUS_ALLOC_RESOURCE() simply calls the * BUS_ALLOC_RESOURCE() method of the parent of @p dev. */ struct resource * bus_generic_alloc_resource(device_t dev, device_t child, int type, int *rid, rman_res_t start, rman_res_t end, rman_res_t count, u_int flags) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent) return (BUS_ALLOC_RESOURCE(dev->parent, child, type, rid, start, end, count, flags)); return (NULL); } /** * @brief Helper function for implementing BUS_RELEASE_RESOURCE(). * * This simple implementation of BUS_RELEASE_RESOURCE() simply calls the * BUS_RELEASE_RESOURCE() method of the parent of @p dev. */ int bus_generic_release_resource(device_t dev, device_t child, int type, int rid, struct resource *r) { /* Propagate up the bus hierarchy until someone handles it. 
*/ if (dev->parent) return (BUS_RELEASE_RESOURCE(dev->parent, child, type, rid, r)); return (EINVAL); } /** * @brief Helper function for implementing BUS_ACTIVATE_RESOURCE(). * * This simple implementation of BUS_ACTIVATE_RESOURCE() simply calls the * BUS_ACTIVATE_RESOURCE() method of the parent of @p dev. */ int bus_generic_activate_resource(device_t dev, device_t child, int type, int rid, struct resource *r) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent) return (BUS_ACTIVATE_RESOURCE(dev->parent, child, type, rid, r)); return (EINVAL); } /** * @brief Helper function for implementing BUS_DEACTIVATE_RESOURCE(). * * This simple implementation of BUS_DEACTIVATE_RESOURCE() simply calls the * BUS_DEACTIVATE_RESOURCE() method of the parent of @p dev. */ int bus_generic_deactivate_resource(device_t dev, device_t child, int type, int rid, struct resource *r) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent) return (BUS_DEACTIVATE_RESOURCE(dev->parent, child, type, rid, r)); return (EINVAL); } /** * @brief Helper function for implementing BUS_MAP_RESOURCE(). * * This simple implementation of BUS_MAP_RESOURCE() simply calls the * BUS_MAP_RESOURCE() method of the parent of @p dev. */ int bus_generic_map_resource(device_t dev, device_t child, int type, struct resource *r, struct resource_map_request *args, struct resource_map *map) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent) return (BUS_MAP_RESOURCE(dev->parent, child, type, r, args, map)); return (EINVAL); } /** * @brief Helper function for implementing BUS_UNMAP_RESOURCE(). * * This simple implementation of BUS_UNMAP_RESOURCE() simply calls the * BUS_UNMAP_RESOURCE() method of the parent of @p dev. */ int bus_generic_unmap_resource(device_t dev, device_t child, int type, struct resource *r, struct resource_map *map) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent) return (BUS_UNMAP_RESOURCE(dev->parent, child, type, r, map)); return (EINVAL); } /** * @brief Helper function for implementing BUS_BIND_INTR(). * * This simple implementation of BUS_BIND_INTR() simply calls the * BUS_BIND_INTR() method of the parent of @p dev. */ int bus_generic_bind_intr(device_t dev, device_t child, struct resource *irq, int cpu) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent) return (BUS_BIND_INTR(dev->parent, child, irq, cpu)); return (EINVAL); } /** * @brief Helper function for implementing BUS_CONFIG_INTR(). * * This simple implementation of BUS_CONFIG_INTR() simply calls the * BUS_CONFIG_INTR() method of the parent of @p dev. */ int bus_generic_config_intr(device_t dev, int irq, enum intr_trigger trig, enum intr_polarity pol) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent) return (BUS_CONFIG_INTR(dev->parent, irq, trig, pol)); return (EINVAL); } /** * @brief Helper function for implementing BUS_DESCRIBE_INTR(). * * This simple implementation of BUS_DESCRIBE_INTR() simply calls the * BUS_DESCRIBE_INTR() method of the parent of @p dev. */ int bus_generic_describe_intr(device_t dev, device_t child, struct resource *irq, void *cookie, const char *descr) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent) return (BUS_DESCRIBE_INTR(dev->parent, child, irq, cookie, descr)); return (EINVAL); } /** * @brief Helper function for implementing BUS_GET_CPUS(). 
* * This simple implementation of BUS_GET_CPUS() simply calls the * BUS_GET_CPUS() method of the parent of @p dev. */ int bus_generic_get_cpus(device_t dev, device_t child, enum cpu_sets op, size_t setsize, cpuset_t *cpuset) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent != NULL) return (BUS_GET_CPUS(dev->parent, child, op, setsize, cpuset)); return (EINVAL); } /** * @brief Helper function for implementing BUS_GET_DMA_TAG(). * * This simple implementation of BUS_GET_DMA_TAG() simply calls the * BUS_GET_DMA_TAG() method of the parent of @p dev. */ bus_dma_tag_t bus_generic_get_dma_tag(device_t dev, device_t child) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent != NULL) return (BUS_GET_DMA_TAG(dev->parent, child)); return (NULL); } /** * @brief Helper function for implementing BUS_GET_BUS_TAG(). * * This simple implementation of BUS_GET_BUS_TAG() simply calls the * BUS_GET_BUS_TAG() method of the parent of @p dev. */ bus_space_tag_t bus_generic_get_bus_tag(device_t dev, device_t child) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent != NULL) return (BUS_GET_BUS_TAG(dev->parent, child)); return ((bus_space_tag_t)0); } /** * @brief Helper function for implementing BUS_GET_RESOURCE(). * * This implementation of BUS_GET_RESOURCE() uses the * resource_list_find() function to do most of the work. It calls * BUS_GET_RESOURCE_LIST() to find a suitable resource list to * search. */ int bus_generic_rl_get_resource(device_t dev, device_t child, int type, int rid, rman_res_t *startp, rman_res_t *countp) { struct resource_list * rl = NULL; struct resource_list_entry * rle = NULL; rl = BUS_GET_RESOURCE_LIST(dev, child); if (!rl) return (EINVAL); rle = resource_list_find(rl, type, rid); if (!rle) return (ENOENT); if (startp) *startp = rle->start; if (countp) *countp = rle->count; return (0); } /** * @brief Helper function for implementing BUS_SET_RESOURCE(). * * This implementation of BUS_SET_RESOURCE() uses the * resource_list_add() function to do most of the work. It calls * BUS_GET_RESOURCE_LIST() to find a suitable resource list to * edit. */ int bus_generic_rl_set_resource(device_t dev, device_t child, int type, int rid, rman_res_t start, rman_res_t count) { struct resource_list * rl = NULL; rl = BUS_GET_RESOURCE_LIST(dev, child); if (!rl) return (EINVAL); resource_list_add(rl, type, rid, start, (start + count - 1), count); return (0); } /** * @brief Helper function for implementing BUS_DELETE_RESOURCE(). * * This implementation of BUS_DELETE_RESOURCE() uses the * resource_list_delete() function to do most of the work. It calls * BUS_GET_RESOURCE_LIST() to find a suitable resource list to * edit. */ void bus_generic_rl_delete_resource(device_t dev, device_t child, int type, int rid) { struct resource_list * rl = NULL; rl = BUS_GET_RESOURCE_LIST(dev, child); if (!rl) return; resource_list_delete(rl, type, rid); return; } /** * @brief Helper function for implementing BUS_RELEASE_RESOURCE(). * * This implementation of BUS_RELEASE_RESOURCE() uses the * resource_list_release() function to do most of the work. It calls * BUS_GET_RESOURCE_LIST() to find a suitable resource list. 
*/ int bus_generic_rl_release_resource(device_t dev, device_t child, int type, int rid, struct resource *r) { struct resource_list * rl = NULL; if (device_get_parent(child) != dev) return (BUS_RELEASE_RESOURCE(device_get_parent(dev), child, type, rid, r)); rl = BUS_GET_RESOURCE_LIST(dev, child); if (!rl) return (EINVAL); return (resource_list_release(rl, dev, child, type, rid, r)); } /** * @brief Helper function for implementing BUS_ALLOC_RESOURCE(). * * This implementation of BUS_ALLOC_RESOURCE() uses the * resource_list_alloc() function to do most of the work. It calls * BUS_GET_RESOURCE_LIST() to find a suitable resource list. */ struct resource * bus_generic_rl_alloc_resource(device_t dev, device_t child, int type, int *rid, rman_res_t start, rman_res_t end, rman_res_t count, u_int flags) { struct resource_list * rl = NULL; if (device_get_parent(child) != dev) return (BUS_ALLOC_RESOURCE(device_get_parent(dev), child, type, rid, start, end, count, flags)); rl = BUS_GET_RESOURCE_LIST(dev, child); if (!rl) return (NULL); return (resource_list_alloc(rl, dev, child, type, rid, start, end, count, flags)); } /** * @brief Helper function for implementing BUS_CHILD_PRESENT(). * * This simple implementation of BUS_CHILD_PRESENT() simply calls the * BUS_CHILD_PRESENT() method of the parent of @p dev. */ int bus_generic_child_present(device_t dev, device_t child) { return (BUS_CHILD_PRESENT(device_get_parent(dev), dev)); } int bus_generic_get_domain(device_t dev, device_t child, int *domain) { if (dev->parent) return (BUS_GET_DOMAIN(dev->parent, dev, domain)); return (ENOENT); } /** * @brief Helper function to implement normal BUS_GET_DEVICE_PATH() * * This function knows how to (a) pass the request up the tree if there's * a parent and (b) Knows how to supply a FreeBSD locator. * * @param bus bus in the walk up the tree * @param child leaf node to print information about * @param locator BUS_LOCATOR_xxx string for locator * @param sb Buffer to print information into */ int bus_generic_get_device_path(device_t bus, device_t child, const char *locator, struct sbuf *sb) { int rv = 0; device_t parent; /* * We don't recurse on ACPI since either we know the handle for the * device or we don't. And if we're in the generic routine, we don't * have a ACPI override. All other locators build up a path by having * their parents create a path and then adding the path element for this * node. That's why we recurse with parent, bus rather than the typical * parent, child: each spot in the tree is independent of what our child * will do with this path. */ parent = device_get_parent(bus); if (parent != NULL && strcmp(locator, BUS_LOCATOR_ACPI) != 0) { rv = BUS_GET_DEVICE_PATH(parent, bus, locator, sb); } if (strcmp(locator, BUS_LOCATOR_FREEBSD) == 0) { if (rv == 0) { sbuf_printf(sb, "/%s", device_get_nameunit(child)); } return (rv); } /* * Don't know what to do. So assume we do nothing. Not sure that's * the right thing, but keeps us from having a big list here. */ return (0); } /** * @brief Helper function for implementing BUS_RESCAN(). * * This null implementation of BUS_RESCAN() always fails to indicate * the bus does not support rescanning. */ int bus_null_rescan(device_t dev) { return (ENODEV); } /* * Some convenience functions to make it easier for drivers to use the * resource-management functions. All these really do is hide the * indirection through the parent's method table, making for slightly * less-wordy code. 
In the future, it might make sense for this code * to maintain some sort of a list of resources allocated by each device. */ int bus_alloc_resources(device_t dev, struct resource_spec *rs, struct resource **res) { int i; for (i = 0; rs[i].type != -1; i++) res[i] = NULL; for (i = 0; rs[i].type != -1; i++) { res[i] = bus_alloc_resource_any(dev, rs[i].type, &rs[i].rid, rs[i].flags); if (res[i] == NULL && !(rs[i].flags & RF_OPTIONAL)) { bus_release_resources(dev, rs, res); return (ENXIO); } } return (0); } void bus_release_resources(device_t dev, const struct resource_spec *rs, struct resource **res) { int i; for (i = 0; rs[i].type != -1; i++) if (res[i] != NULL) { bus_release_resource( dev, rs[i].type, rs[i].rid, res[i]); res[i] = NULL; } } /** * @brief Wrapper function for BUS_ALLOC_RESOURCE(). * * This function simply calls the BUS_ALLOC_RESOURCE() method of the * parent of @p dev. */ struct resource * bus_alloc_resource(device_t dev, int type, int *rid, rman_res_t start, rman_res_t end, rman_res_t count, u_int flags) { struct resource *res; if (dev->parent == NULL) return (NULL); res = BUS_ALLOC_RESOURCE(dev->parent, dev, type, rid, start, end, count, flags); return (res); } /** * @brief Wrapper function for BUS_ADJUST_RESOURCE(). * * This function simply calls the BUS_ADJUST_RESOURCE() method of the * parent of @p dev. */ int bus_adjust_resource(device_t dev, int type, struct resource *r, rman_res_t start, rman_res_t end) { if (dev->parent == NULL) return (EINVAL); return (BUS_ADJUST_RESOURCE(dev->parent, dev, type, r, start, end)); } /** * @brief Wrapper function for BUS_TRANSLATE_RESOURCE(). * * This function simply calls the BUS_TRANSLATE_RESOURCE() method of the * parent of @p dev. */ int bus_translate_resource(device_t dev, int type, rman_res_t start, rman_res_t *newstart) { if (dev->parent == NULL) return (EINVAL); return (BUS_TRANSLATE_RESOURCE(dev->parent, type, start, newstart)); } /** * @brief Wrapper function for BUS_ACTIVATE_RESOURCE(). * * This function simply calls the BUS_ACTIVATE_RESOURCE() method of the * parent of @p dev. */ int bus_activate_resource(device_t dev, int type, int rid, struct resource *r) { if (dev->parent == NULL) return (EINVAL); return (BUS_ACTIVATE_RESOURCE(dev->parent, dev, type, rid, r)); } /** * @brief Wrapper function for BUS_DEACTIVATE_RESOURCE(). * * This function simply calls the BUS_DEACTIVATE_RESOURCE() method of the * parent of @p dev. */ int bus_deactivate_resource(device_t dev, int type, int rid, struct resource *r) { if (dev->parent == NULL) return (EINVAL); return (BUS_DEACTIVATE_RESOURCE(dev->parent, dev, type, rid, r)); } /** * @brief Wrapper function for BUS_MAP_RESOURCE(). * * This function simply calls the BUS_MAP_RESOURCE() method of the * parent of @p dev. */ int bus_map_resource(device_t dev, int type, struct resource *r, struct resource_map_request *args, struct resource_map *map) { if (dev->parent == NULL) return (EINVAL); return (BUS_MAP_RESOURCE(dev->parent, dev, type, r, args, map)); } /** * @brief Wrapper function for BUS_UNMAP_RESOURCE(). * * This function simply calls the BUS_UNMAP_RESOURCE() method of the * parent of @p dev. */ int bus_unmap_resource(device_t dev, int type, struct resource *r, struct resource_map *map) { if (dev->parent == NULL) return (EINVAL); return (BUS_UNMAP_RESOURCE(dev->parent, dev, type, r, map)); } /** * @brief Wrapper function for BUS_RELEASE_RESOURCE(). * * This function simply calls the BUS_RELEASE_RESOURCE() method of the * parent of @p dev. 
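 *
 * Typical pairing with the allocation wrapper (illustrative only; the type
 * and rid are hypothetical):
 *
 *	int rid = 0;
 *	struct resource *res;
 *
 *	res = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, RF_ACTIVE);
 *	if (res == NULL)
 *		return (ENXIO);
 *	...
 *	bus_release_resource(dev, SYS_RES_MEMORY, rid, res);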
*/ int bus_release_resource(device_t dev, int type, int rid, struct resource *r) { int rv; if (dev->parent == NULL) return (EINVAL); rv = BUS_RELEASE_RESOURCE(dev->parent, dev, type, rid, r); return (rv); } /** * @brief Wrapper function for BUS_SETUP_INTR(). * * This function simply calls the BUS_SETUP_INTR() method of the * parent of @p dev. */ int bus_setup_intr(device_t dev, struct resource *r, int flags, driver_filter_t filter, driver_intr_t handler, void *arg, void **cookiep) { int error; if (dev->parent == NULL) return (EINVAL); error = BUS_SETUP_INTR(dev->parent, dev, r, flags, filter, handler, arg, cookiep); if (error != 0) return (error); if (handler != NULL && !(flags & INTR_MPSAFE)) device_printf(dev, "[GIANT-LOCKED]\n"); return (0); } /** * @brief Wrapper function for BUS_TEARDOWN_INTR(). * * This function simply calls the BUS_TEARDOWN_INTR() method of the * parent of @p dev. */ int bus_teardown_intr(device_t dev, struct resource *r, void *cookie) { if (dev->parent == NULL) return (EINVAL); return (BUS_TEARDOWN_INTR(dev->parent, dev, r, cookie)); } /** * @brief Wrapper function for BUS_SUSPEND_INTR(). * * This function simply calls the BUS_SUSPEND_INTR() method of the * parent of @p dev. */ int bus_suspend_intr(device_t dev, struct resource *r) { if (dev->parent == NULL) return (EINVAL); return (BUS_SUSPEND_INTR(dev->parent, dev, r)); } /** * @brief Wrapper function for BUS_RESUME_INTR(). * * This function simply calls the BUS_RESUME_INTR() method of the * parent of @p dev. */ int bus_resume_intr(device_t dev, struct resource *r) { if (dev->parent == NULL) return (EINVAL); return (BUS_RESUME_INTR(dev->parent, dev, r)); } /** * @brief Wrapper function for BUS_BIND_INTR(). * * This function simply calls the BUS_BIND_INTR() method of the * parent of @p dev. */ int bus_bind_intr(device_t dev, struct resource *r, int cpu) { if (dev->parent == NULL) return (EINVAL); return (BUS_BIND_INTR(dev->parent, dev, r, cpu)); } /** * @brief Wrapper function for BUS_DESCRIBE_INTR(). * * This function first formats the requested description into a * temporary buffer and then calls the BUS_DESCRIBE_INTR() method of * the parent of @p dev. */ int bus_describe_intr(device_t dev, struct resource *irq, void *cookie, const char *fmt, ...) { va_list ap; char descr[MAXCOMLEN + 1]; if (dev->parent == NULL) return (EINVAL); va_start(ap, fmt); vsnprintf(descr, sizeof(descr), fmt, ap); va_end(ap); return (BUS_DESCRIBE_INTR(dev->parent, dev, irq, cookie, descr)); } /** * @brief Wrapper function for BUS_SET_RESOURCE(). * * This function simply calls the BUS_SET_RESOURCE() method of the * parent of @p dev. */ int bus_set_resource(device_t dev, int type, int rid, rman_res_t start, rman_res_t count) { return (BUS_SET_RESOURCE(device_get_parent(dev), dev, type, rid, start, count)); } /** * @brief Wrapper function for BUS_GET_RESOURCE(). * * This function simply calls the BUS_GET_RESOURCE() method of the * parent of @p dev. */ int bus_get_resource(device_t dev, int type, int rid, rman_res_t *startp, rman_res_t *countp) { return (BUS_GET_RESOURCE(device_get_parent(dev), dev, type, rid, startp, countp)); } /** * @brief Wrapper function for BUS_GET_RESOURCE(). * * This function simply calls the BUS_GET_RESOURCE() method of the * parent of @p dev and returns the start value. 
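 *
 * Illustrative use (hypothetical type and rid), e.g. to inspect a range
 * recorded earlier by a hint or by bus_set_resource(); a zero count from
 * the companion bus_get_resource_count() means no entry was found:
 *
 *	rman_res_t base, len;
 *
 *	base = bus_get_resource_start(dev, SYS_RES_IOPORT, 0);
 *	len = bus_get_resource_count(dev, SYS_RES_IOPORT, 0);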
*/ rman_res_t bus_get_resource_start(device_t dev, int type, int rid) { rman_res_t start; rman_res_t count; int error; error = BUS_GET_RESOURCE(device_get_parent(dev), dev, type, rid, &start, &count); if (error) return (0); return (start); } /** * @brief Wrapper function for BUS_GET_RESOURCE(). * * This function simply calls the BUS_GET_RESOURCE() method of the * parent of @p dev and returns the count value. */ rman_res_t bus_get_resource_count(device_t dev, int type, int rid) { rman_res_t start; rman_res_t count; int error; error = BUS_GET_RESOURCE(device_get_parent(dev), dev, type, rid, &start, &count); if (error) return (0); return (count); } /** * @brief Wrapper function for BUS_DELETE_RESOURCE(). * * This function simply calls the BUS_DELETE_RESOURCE() method of the * parent of @p dev. */ void bus_delete_resource(device_t dev, int type, int rid) { BUS_DELETE_RESOURCE(device_get_parent(dev), dev, type, rid); } /** * @brief Wrapper function for BUS_CHILD_PRESENT(). * * This function simply calls the BUS_CHILD_PRESENT() method of the * parent of @p dev. */ int bus_child_present(device_t child) { return (BUS_CHILD_PRESENT(device_get_parent(child), child)); } /** * @brief Wrapper function for BUS_CHILD_PNPINFO(). * * This function simply calls the BUS_CHILD_PNPINFO() method of the parent of @p * dev. */ int bus_child_pnpinfo(device_t child, struct sbuf *sb) { device_t parent; parent = device_get_parent(child); if (parent == NULL) return (0); return (BUS_CHILD_PNPINFO(parent, child, sb)); } /** * @brief Generic implementation that does nothing for bus_child_pnpinfo * * This function has the right signature and returns 0 since the sbuf is passed * to us to append to. */ int bus_generic_child_pnpinfo(device_t dev, device_t child, struct sbuf *sb) { return (0); } /** * @brief Wrapper function for BUS_CHILD_LOCATION(). * * This function simply calls the BUS_CHILD_LOCATION() method of the parent of * @p dev. */ int bus_child_location(device_t child, struct sbuf *sb) { device_t parent; parent = device_get_parent(child); if (parent == NULL) return (0); return (BUS_CHILD_LOCATION(parent, child, sb)); } /** * @brief Generic implementation that does nothing for bus_child_location * * This function has the right signature and returns 0 since the sbuf is passed * to us to append to. */ int bus_generic_child_location(device_t dev, device_t child, struct sbuf *sb) { return (0); } /** * @brief Wrapper function for BUS_GET_CPUS(). * * This function simply calls the BUS_GET_CPUS() method of the * parent of @p dev. */ int bus_get_cpus(device_t dev, enum cpu_sets op, size_t setsize, cpuset_t *cpuset) { device_t parent; parent = device_get_parent(dev); if (parent == NULL) return (EINVAL); return (BUS_GET_CPUS(parent, dev, op, setsize, cpuset)); } /** * @brief Wrapper function for BUS_GET_DMA_TAG(). * * This function simply calls the BUS_GET_DMA_TAG() method of the * parent of @p dev. */ bus_dma_tag_t bus_get_dma_tag(device_t dev) { device_t parent; parent = device_get_parent(dev); if (parent == NULL) return (NULL); return (BUS_GET_DMA_TAG(parent, dev)); } /** * @brief Wrapper function for BUS_GET_BUS_TAG(). * * This function simply calls the BUS_GET_BUS_TAG() method of the * parent of @p dev. */ bus_space_tag_t bus_get_bus_tag(device_t dev) { device_t parent; parent = device_get_parent(dev); if (parent == NULL) return ((bus_space_tag_t)0); return (BUS_GET_BUS_TAG(parent, dev)); } /** * @brief Wrapper function for BUS_GET_DOMAIN(). 
* * This function simply calls the BUS_GET_DOMAIN() method of the * parent of @p dev. */ int bus_get_domain(device_t dev, int *domain) { return (BUS_GET_DOMAIN(device_get_parent(dev), dev, domain)); } /* Resume all devices and then notify userland that we're up again. */ static int root_resume(device_t dev) { int error; error = bus_generic_resume(dev); if (error == 0) { devctl_notify("kernel", "power", "resume", NULL); } return (error); } static int root_print_child(device_t dev, device_t child) { int retval = 0; retval += bus_print_child_header(dev, child); retval += printf("\n"); return (retval); } static int root_setup_intr(device_t dev, device_t child, struct resource *irq, int flags, driver_filter_t *filter, driver_intr_t *intr, void *arg, void **cookiep) { /* * If an interrupt mapping gets to here something bad has happened. */ panic("root_setup_intr"); } /* * If we get here, assume that the device is permanent and really is * present in the system. Removable bus drivers are expected to intercept * this call long before it gets here. We return -1 so that drivers that * really care can check vs -1 or some ERRNO returned higher in the food * chain. */ static int root_child_present(device_t dev, device_t child) { return (-1); } static int root_get_cpus(device_t dev, device_t child, enum cpu_sets op, size_t setsize, cpuset_t *cpuset) { switch (op) { case INTR_CPUS: /* Default to returning the set of all CPUs. */ if (setsize != sizeof(cpuset_t)) return (EINVAL); *cpuset = all_cpus; return (0); default: return (EINVAL); } } static kobj_method_t root_methods[] = { /* Device interface */ KOBJMETHOD(device_shutdown, bus_generic_shutdown), KOBJMETHOD(device_suspend, bus_generic_suspend), KOBJMETHOD(device_resume, root_resume), /* Bus interface */ KOBJMETHOD(bus_print_child, root_print_child), KOBJMETHOD(bus_read_ivar, bus_generic_read_ivar), KOBJMETHOD(bus_write_ivar, bus_generic_write_ivar), KOBJMETHOD(bus_setup_intr, root_setup_intr), KOBJMETHOD(bus_child_present, root_child_present), KOBJMETHOD(bus_get_cpus, root_get_cpus), KOBJMETHOD_END }; static driver_t root_driver = { "root", root_methods, 1, /* no softc */ }; device_t root_bus; devclass_t root_devclass; static int root_bus_module_handler(module_t mod, int what, void* arg) { switch (what) { case MOD_LOAD: TAILQ_INIT(&bus_data_devices); kobj_class_compile((kobj_class_t) &root_driver); root_bus = make_device(NULL, "root", 0); root_bus->desc = "System root bus"; kobj_init((kobj_t) root_bus, (kobj_class_t) &root_driver); root_bus->driver = &root_driver; root_bus->state = DS_ATTACHED; root_devclass = devclass_find_internal("root", NULL, FALSE); devctl2_init(); return (0); case MOD_SHUTDOWN: device_shutdown(root_bus); return (0); default: return (EOPNOTSUPP); } return (0); } static moduledata_t root_bus_mod = { "rootbus", root_bus_module_handler, NULL }; DECLARE_MODULE(rootbus, root_bus_mod, SI_SUB_DRIVERS, SI_ORDER_FIRST); /** * @brief Automatically configure devices * * This function begins the autoconfiguration process by calling * device_probe_and_attach() for each child of the @c root0 device. */ void root_bus_configure(void) { PDEBUG((".")); /* Eventually this will be split up, but this is sufficient for now. */ bus_set_pass(BUS_PASS_DEFAULT); } /** * @brief Module handler for registering device drivers * * This module handler is used to automatically register device * drivers when modules are loaded. 
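 *
 * Drivers do not normally call this handler directly; the DRIVER_MODULE()
 * macro family emits the moduledata_t and driver_module_data that point
 * here, e.g. (hypothetical driver and bus names; the macro's argument list
 * has varied across FreeBSD versions):
 *
 *	DRIVER_MODULE(mydrv, pci, mydrv_driver, NULL, NULL);
 *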
If @p what is MOD_LOAD, it calls * devclass_add_driver() for the driver described by the * driver_module_data structure pointed to by @p arg */ int driver_module_handler(module_t mod, int what, void *arg) { struct driver_module_data *dmd; devclass_t bus_devclass; kobj_class_t driver; int error, pass; dmd = (struct driver_module_data *)arg; bus_devclass = devclass_find_internal(dmd->dmd_busname, NULL, TRUE); error = 0; switch (what) { case MOD_LOAD: if (dmd->dmd_chainevh) error = dmd->dmd_chainevh(mod,what,dmd->dmd_chainarg); pass = dmd->dmd_pass; driver = dmd->dmd_driver; PDEBUG(("Loading module: driver %s on bus %s (pass %d)", DRIVERNAME(driver), dmd->dmd_busname, pass)); error = devclass_add_driver(bus_devclass, driver, pass, dmd->dmd_devclass); break; case MOD_UNLOAD: PDEBUG(("Unloading module: driver %s from bus %s", DRIVERNAME(dmd->dmd_driver), dmd->dmd_busname)); error = devclass_delete_driver(bus_devclass, dmd->dmd_driver); if (!error && dmd->dmd_chainevh) error = dmd->dmd_chainevh(mod,what,dmd->dmd_chainarg); break; case MOD_QUIESCE: PDEBUG(("Quiesce module: driver %s from bus %s", DRIVERNAME(dmd->dmd_driver), dmd->dmd_busname)); error = devclass_quiesce_driver(bus_devclass, dmd->dmd_driver); if (!error && dmd->dmd_chainevh) error = dmd->dmd_chainevh(mod,what,dmd->dmd_chainarg); break; default: error = EOPNOTSUPP; break; } return (error); } /** * @brief Enumerate all hinted devices for this bus. * * Walks through the hints for this bus and calls the bus_hinted_child * routine for each one it fines. It searches first for the specific * bus that's being probed for hinted children (eg isa0), and then for * generic children (eg isa). * * @param dev bus device to enumerate */ void bus_enumerate_hinted_children(device_t bus) { int i; const char *dname, *busname; int dunit; /* * enumerate all devices on the specific bus */ busname = device_get_nameunit(bus); i = 0; while (resource_find_match(&i, &dname, &dunit, "at", busname) == 0) BUS_HINTED_CHILD(bus, dname, dunit); /* * and all the generic ones. */ busname = device_get_name(bus); i = 0; while (resource_find_match(&i, &dname, &dunit, "at", busname) == 0) BUS_HINTED_CHILD(bus, dname, dunit); } #ifdef BUS_DEBUG /* the _short versions avoid iteration by not calling anything that prints * more than oneliners. I love oneliners. */ static void print_device_short(device_t dev, int indent) { if (!dev) return; indentprintf(("device %d: <%s> %sparent,%schildren,%s%s%s%s%s,%sivars,%ssoftc,busy=%d\n", dev->unit, dev->desc, (dev->parent? "":"no "), (TAILQ_EMPTY(&dev->children)? "no ":""), (dev->flags&DF_ENABLED? "enabled,":"disabled,"), (dev->flags&DF_FIXEDCLASS? "fixed,":""), (dev->flags&DF_WILDCARD? "wildcard,":""), (dev->flags&DF_DESCMALLOCED? "descmalloced,":""), (dev->flags&DF_SUSPENDED? "suspended,":""), (dev->ivars? "":"no "), (dev->softc? 
"":"no "), dev->busy)); } static void print_device(device_t dev, int indent) { if (!dev) return; print_device_short(dev, indent); indentprintf(("Parent:\n")); print_device_short(dev->parent, indent+1); indentprintf(("Driver:\n")); print_driver_short(dev->driver, indent+1); indentprintf(("Devclass:\n")); print_devclass_short(dev->devclass, indent+1); } void print_device_tree_short(device_t dev, int indent) /* print the device and all its children (indented) */ { device_t child; if (!dev) return; print_device_short(dev, indent); TAILQ_FOREACH(child, &dev->children, link) { print_device_tree_short(child, indent+1); } } void print_device_tree(device_t dev, int indent) /* print the device and all its children (indented) */ { device_t child; if (!dev) return; print_device(dev, indent); TAILQ_FOREACH(child, &dev->children, link) { print_device_tree(child, indent+1); } } static void print_driver_short(driver_t *driver, int indent) { if (!driver) return; indentprintf(("driver %s: softc size = %zd\n", driver->name, driver->size)); } static void print_driver(driver_t *driver, int indent) { if (!driver) return; print_driver_short(driver, indent); } static void print_driver_list(driver_list_t drivers, int indent) { driverlink_t driver; TAILQ_FOREACH(driver, &drivers, link) { print_driver(driver->driver, indent); } } static void print_devclass_short(devclass_t dc, int indent) { if ( !dc ) return; indentprintf(("devclass %s: max units = %d\n", dc->name, dc->maxunit)); } static void print_devclass(devclass_t dc, int indent) { int i; if ( !dc ) return; print_devclass_short(dc, indent); indentprintf(("Drivers:\n")); print_driver_list(dc->drivers, indent+1); indentprintf(("Devices:\n")); for (i = 0; i < dc->maxunit; i++) if (dc->devices[i]) print_device(dc->devices[i], indent+1); } void print_devclass_list_short(void) { devclass_t dc; printf("Short listing of devclasses, drivers & devices:\n"); TAILQ_FOREACH(dc, &devclasses, link) { print_devclass_short(dc, 0); } } void print_devclass_list(void) { devclass_t dc; printf("Full listing of devclasses, drivers & devices:\n"); TAILQ_FOREACH(dc, &devclasses, link) { print_devclass(dc, 0); } } #endif /* * User-space access to the device tree. * * We implement a small set of nodes: * * hw.bus Single integer read method to obtain the * current generation count. * hw.bus.devices Reads the entire device tree in flat space. * hw.bus.rman Resource manager interface * * We might like to add the ability to scan devclasses and/or drivers to * determine what else is currently loaded/available. */ static int sysctl_bus_info(SYSCTL_HANDLER_ARGS) { struct u_businfo ubus; ubus.ub_version = BUS_USER_VERSION; ubus.ub_generation = bus_data_generation; return (SYSCTL_OUT(req, &ubus, sizeof(ubus))); } SYSCTL_PROC(_hw_bus, OID_AUTO, info, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_bus_info, "S,u_businfo", "bus-related data"); static int sysctl_devices(SYSCTL_HANDLER_ARGS) { struct sbuf sb; int *name = (int *)arg1; u_int namelen = arg2; int index; device_t dev; struct u_device *udev; int error; if (namelen != 2) return (EINVAL); if (bus_data_generation_check(name[0])) return (EINVAL); index = name[1]; /* * Scan the list of devices, looking for the requested index. */ TAILQ_FOREACH(dev, &bus_data_devices, devlink) { if (index-- == 0) break; } if (dev == NULL) return (ENOENT); /* * Populate the return item, careful not to overflow the buffer. 
*/ udev = malloc(sizeof(*udev), M_BUS, M_WAITOK | M_ZERO); if (udev == NULL) return (ENOMEM); udev->dv_handle = (uintptr_t)dev; udev->dv_parent = (uintptr_t)dev->parent; udev->dv_devflags = dev->devflags; udev->dv_flags = dev->flags; udev->dv_state = dev->state; sbuf_new(&sb, udev->dv_fields, sizeof(udev->dv_fields), SBUF_FIXEDLEN); if (dev->nameunit != NULL) sbuf_cat(&sb, dev->nameunit); sbuf_putc(&sb, '\0'); if (dev->desc != NULL) sbuf_cat(&sb, dev->desc); sbuf_putc(&sb, '\0'); if (dev->driver != NULL) sbuf_cat(&sb, dev->driver->name); sbuf_putc(&sb, '\0'); bus_child_pnpinfo(dev, &sb); sbuf_putc(&sb, '\0'); bus_child_location(dev, &sb); sbuf_putc(&sb, '\0'); error = sbuf_finish(&sb); if (error == 0) error = SYSCTL_OUT(req, udev, sizeof(*udev)); sbuf_delete(&sb); free(udev, M_BUS); return (error); } SYSCTL_NODE(_hw_bus, OID_AUTO, devices, CTLFLAG_RD | CTLFLAG_NEEDGIANT, sysctl_devices, "system device tree"); int bus_data_generation_check(int generation) { if (generation != bus_data_generation) return (1); /* XXX generate optimised lists here? */ return (0); } void bus_data_generation_update(void) { atomic_add_int(&bus_data_generation, 1); } int bus_free_resource(device_t dev, int type, struct resource *r) { if (r == NULL) return (0); return (bus_release_resource(dev, type, rman_get_rid(r), r)); } device_t device_lookup_by_name(const char *name) { device_t dev; TAILQ_FOREACH(dev, &bus_data_devices, devlink) { if (dev->nameunit != NULL && strcmp(dev->nameunit, name) == 0) return (dev); } return (NULL); } /* * /dev/devctl2 implementation. The existing /dev/devctl device has * implicit semantics on open, so it could not be reused for this. * Another option would be to call this /dev/bus? */ static int find_device(struct devreq *req, device_t *devp) { device_t dev; /* * First, ensure that the name is nul terminated. */ if (memchr(req->dr_name, '\0', sizeof(req->dr_name)) == NULL) return (EINVAL); /* * Second, try to find an attached device whose name matches * 'name'. */ dev = device_lookup_by_name(req->dr_name); if (dev != NULL) { *devp = dev; return (0); } /* Finally, give device enumerators a chance. */ dev = NULL; EVENTHANDLER_DIRECT_INVOKE(dev_lookup, req->dr_name, &dev); if (dev == NULL) return (ENOENT); *devp = dev; return (0); } static bool driver_exists(device_t bus, const char *driver) { devclass_t dc; for (dc = bus->devclass; dc != NULL; dc = dc->parent) { if (devclass_find_driver_internal(dc, driver) != NULL) return (true); } return (false); } static void device_gen_nomatch(device_t dev) { device_t child; if (dev->flags & DF_NEEDNOMATCH && dev->state == DS_NOTPRESENT) { device_handle_nomatch(dev); } dev->flags &= ~DF_NEEDNOMATCH; TAILQ_FOREACH(child, &dev->children, link) { device_gen_nomatch(child); } } static void device_do_deferred_actions(void) { devclass_t dc; driverlink_t dl; /* * Walk through the devclasses to find all the drivers we've tagged as * deferred during the freeze and call the driver added routines. They * have already been added to the lists in the background, so the driver * added routines that trigger a probe will have all the right bidders * for the probe auction. */ TAILQ_FOREACH(dc, &devclasses, link) { TAILQ_FOREACH(dl, &dc->drivers, link) { if (dl->flags & DL_DEFERRED_PROBE) { devclass_driver_added(dc, dl->driver); dl->flags &= ~DL_DEFERRED_PROBE; } } } /* * We also defer no-match events during a freeze. Walk the tree and * generate all the pent-up events that are still relevant. 
*/ device_gen_nomatch(root_bus); bus_data_generation_update(); } static int device_get_path(device_t dev, const char *locator, struct sbuf *sb) { device_t parent; int error; KASSERT(sb != NULL, ("sb is NULL")); parent = device_get_parent(dev); if (parent == NULL) { - error = sbuf_printf(sb, "/"); + error = sbuf_putc(sb, '/'); } else { error = BUS_GET_DEVICE_PATH(parent, dev, locator, sb); if (error == 0) { error = sbuf_error(sb); if (error == 0 && sbuf_len(sb) <= 1) error = EIO; } } sbuf_finish(sb); return (error); } static int devctl2_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, struct thread *td) { struct devreq *req; device_t dev; int error, old; /* Locate the device to control. */ bus_topo_lock(); req = (struct devreq *)data; switch (cmd) { case DEV_ATTACH: case DEV_DETACH: case DEV_ENABLE: case DEV_DISABLE: case DEV_SUSPEND: case DEV_RESUME: case DEV_SET_DRIVER: case DEV_CLEAR_DRIVER: case DEV_RESCAN: case DEV_DELETE: case DEV_RESET: error = priv_check(td, PRIV_DRIVER); if (error == 0) error = find_device(req, &dev); break; case DEV_FREEZE: case DEV_THAW: error = priv_check(td, PRIV_DRIVER); break; case DEV_GET_PATH: error = find_device(req, &dev); break; default: error = ENOTTY; break; } if (error) { bus_topo_unlock(); return (error); } /* Perform the requested operation. */ switch (cmd) { case DEV_ATTACH: if (device_is_attached(dev)) error = EBUSY; else if (!device_is_enabled(dev)) error = ENXIO; else error = device_probe_and_attach(dev); break; case DEV_DETACH: if (!device_is_attached(dev)) { error = ENXIO; break; } if (!(req->dr_flags & DEVF_FORCE_DETACH)) { error = device_quiesce(dev); if (error) break; } error = device_detach(dev); break; case DEV_ENABLE: if (device_is_enabled(dev)) { error = EBUSY; break; } /* * If the device has been probed but not attached (e.g. * when it has been disabled by a loader hint), just * attach the device rather than doing a full probe. */ device_enable(dev); if (device_is_alive(dev)) { /* * If the device was disabled via a hint, clear * the hint. */ if (resource_disabled(dev->driver->name, dev->unit)) resource_unset_value(dev->driver->name, dev->unit, "disabled"); error = device_attach(dev); } else error = device_probe_and_attach(dev); break; case DEV_DISABLE: if (!device_is_enabled(dev)) { error = ENXIO; break; } if (!(req->dr_flags & DEVF_FORCE_DETACH)) { error = device_quiesce(dev); if (error) break; } /* * Force DF_FIXEDCLASS on around detach to preserve * the existing name. */ old = dev->flags; dev->flags |= DF_FIXEDCLASS; error = device_detach(dev); if (!(old & DF_FIXEDCLASS)) dev->flags &= ~DF_FIXEDCLASS; if (error == 0) device_disable(dev); break; case DEV_SUSPEND: if (device_is_suspended(dev)) { error = EBUSY; break; } if (device_get_parent(dev) == NULL) { error = EINVAL; break; } error = BUS_SUSPEND_CHILD(device_get_parent(dev), dev); break; case DEV_RESUME: if (!device_is_suspended(dev)) { error = EINVAL; break; } if (device_get_parent(dev) == NULL) { error = EINVAL; break; } error = BUS_RESUME_CHILD(device_get_parent(dev), dev); break; case DEV_SET_DRIVER: { devclass_t dc; char driver[128]; error = copyinstr(req->dr_data, driver, sizeof(driver), NULL); if (error) break; if (driver[0] == '\0') { error = EINVAL; break; } if (dev->devclass != NULL && strcmp(driver, dev->devclass->name) == 0) /* XXX: Could possibly force DF_FIXEDCLASS on? */ break; /* * Scan drivers for this device's bus looking for at * least one matching driver. 
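		 *
		 * Illustrative userland sketch of the request that reaches this
		 * case (roughly what devctl(8) "set driver" issues; the device
		 * and driver names are made up, and fd is an open descriptor
		 * for /dev/devctl2):
		 *
		 *	char new_driver[] = "bar";
		 *	struct devreq req;
		 *
		 *	memset(&req, 0, sizeof(req));
		 *	strlcpy(req.dr_name, "foo0", sizeof(req.dr_name));
		 *	req.dr_data = new_driver;
		 *	req.dr_flags = DEVF_SET_DRIVER_DETACH;
		 *	ioctl(fd, DEV_SET_DRIVER, &req);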
*/ if (dev->parent == NULL) { error = EINVAL; break; } if (!driver_exists(dev->parent, driver)) { error = ENOENT; break; } dc = devclass_create(driver); if (dc == NULL) { error = ENOMEM; break; } /* Detach device if necessary. */ if (device_is_attached(dev)) { if (req->dr_flags & DEVF_SET_DRIVER_DETACH) error = device_detach(dev); else error = EBUSY; if (error) break; } /* Clear any previously-fixed device class and unit. */ if (dev->flags & DF_FIXEDCLASS) devclass_delete_device(dev->devclass, dev); dev->flags |= DF_WILDCARD; dev->unit = -1; /* Force the new device class. */ error = devclass_add_device(dc, dev); if (error) break; dev->flags |= DF_FIXEDCLASS; error = device_probe_and_attach(dev); break; } case DEV_CLEAR_DRIVER: if (!(dev->flags & DF_FIXEDCLASS)) { error = 0; break; } if (device_is_attached(dev)) { if (req->dr_flags & DEVF_CLEAR_DRIVER_DETACH) error = device_detach(dev); else error = EBUSY; if (error) break; } dev->flags &= ~DF_FIXEDCLASS; dev->flags |= DF_WILDCARD; devclass_delete_device(dev->devclass, dev); error = device_probe_and_attach(dev); break; case DEV_RESCAN: if (!device_is_attached(dev)) { error = ENXIO; break; } error = BUS_RESCAN(dev); break; case DEV_DELETE: { device_t parent; parent = device_get_parent(dev); if (parent == NULL) { error = EINVAL; break; } if (!(req->dr_flags & DEVF_FORCE_DELETE)) { if (bus_child_present(dev) != 0) { error = EBUSY; break; } } error = device_delete_child(parent, dev); break; } case DEV_FREEZE: if (device_frozen) error = EBUSY; else device_frozen = true; break; case DEV_THAW: if (!device_frozen) error = EBUSY; else { device_do_deferred_actions(); device_frozen = false; } break; case DEV_RESET: if ((req->dr_flags & ~(DEVF_RESET_DETACH)) != 0) { error = EINVAL; break; } error = BUS_RESET_CHILD(device_get_parent(dev), dev, req->dr_flags); break; case DEV_GET_PATH: { struct sbuf *sb; char locator[64]; ssize_t len; error = copyinstr(req->dr_buffer.buffer, locator, sizeof(locator), NULL); if (error != 0) break; sb = sbuf_new(NULL, NULL, 0, SBUF_AUTOEXTEND | SBUF_INCLUDENUL /* | SBUF_WAITOK */); error = device_get_path(dev, locator, sb); if (error == 0) { len = sbuf_len(sb); if (req->dr_buffer.length < len) { error = ENAMETOOLONG; } else { error = copyout(sbuf_data(sb), req->dr_buffer.buffer, len); } req->dr_buffer.length = len; } sbuf_delete(sb); break; } } bus_topo_unlock(); return (error); } static struct cdevsw devctl2_cdevsw = { .d_version = D_VERSION, .d_ioctl = devctl2_ioctl, .d_name = "devctl2", }; static void devctl2_init(void) { make_dev_credf(MAKEDEV_ETERNAL, &devctl2_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0644, "devctl2"); } /* * For maintaining device 'at' location info to avoid recomputing it */ struct device_location_node { const char *dln_locator; const char *dln_path; TAILQ_ENTRY(device_location_node) dln_link; }; typedef TAILQ_HEAD(device_location_list, device_location_node) device_location_list_t; struct device_location_cache { device_location_list_t dlc_list; }; /* * Location cache for wired devices. 
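 *
 * Illustrative example (hypothetical hint and path): a wiring hint such as
 *
 *	hint.foo.0.at="ACPI:\_SB_.PCI0.FOO_"
 *
 * is split at the ':' into a locator ("ACPI") and a path; the path that
 * device_get_path() computes for a candidate device is cached per locator
 * here and compared against the hint's path by dev_wired_cache_match().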
*/ device_location_cache_t * dev_wired_cache_init(void) { device_location_cache_t *dcp; dcp = malloc(sizeof(*dcp), M_BUS, M_WAITOK | M_ZERO); TAILQ_INIT(&dcp->dlc_list); return (dcp); } void dev_wired_cache_fini(device_location_cache_t *dcp) { struct device_location_node *dln, *tdln; TAILQ_FOREACH_SAFE(dln, &dcp->dlc_list, dln_link, tdln) { free(dln, M_BUS); } free(dcp, M_BUS); } static struct device_location_node * dev_wired_cache_lookup(device_location_cache_t *dcp, const char *locator) { struct device_location_node *dln; TAILQ_FOREACH(dln, &dcp->dlc_list, dln_link) { if (strcmp(locator, dln->dln_locator) == 0) return (dln); } return (NULL); } static struct device_location_node * dev_wired_cache_add(device_location_cache_t *dcp, const char *locator, const char *path) { struct device_location_node *dln; size_t loclen, pathlen; loclen = strlen(locator) + 1; pathlen = strlen(path) + 1; dln = malloc(sizeof(*dln) + loclen + pathlen, M_BUS, M_WAITOK | M_ZERO); dln->dln_locator = (char *)(dln + 1); memcpy(__DECONST(char *, dln->dln_locator), locator, loclen); dln->dln_path = dln->dln_locator + loclen; memcpy(__DECONST(char *, dln->dln_path), path, pathlen); TAILQ_INSERT_HEAD(&dcp->dlc_list, dln, dln_link); return (dln); } bool dev_wired_cache_match(device_location_cache_t *dcp, device_t dev, const char *at) { struct sbuf *sb; const char *cp; char locator[32]; int error, len; struct device_location_node *res; cp = strchr(at, ':'); if (cp == NULL) return (false); len = cp - at; if (len > sizeof(locator) - 1) /* Skip too long locator */ return (false); memcpy(locator, at, len); locator[len] = '\0'; cp++; error = 0; /* maybe cache this inside device_t and look that up, but not yet */ res = dev_wired_cache_lookup(dcp, locator); if (res == NULL) { sb = sbuf_new(NULL, NULL, 0, SBUF_AUTOEXTEND | SBUF_INCLUDENUL | SBUF_NOWAIT); if (sb != NULL) { error = device_get_path(dev, locator, sb); if (error == 0) { res = dev_wired_cache_add(dcp, locator, sbuf_data(sb)); } sbuf_delete(sb); } } if (error != 0 || res == NULL || res->dln_path == NULL) return (false); return (strcmp(res->dln_path, cp) == 0); } /* * APIs to manage deprecation and obsolescence. 
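 *
 * Minimal sketch of intended use (normally reached through thin wrapper
 * macros; the release number and messages below are made up):
 *
 *	_gone_in(16, "foo_compat ioctl interface");
 *	_gone_in_dev(dev, 16, "legacy foo register map");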
*/ static int obsolete_panic = 0; SYSCTL_INT(_debug, OID_AUTO, obsolete_panic, CTLFLAG_RWTUN, &obsolete_panic, 0, "Panic when obsolete features are used (0 = never, 1 = if obsolete, " "2 = if deprecated)"); static void gone_panic(int major, int running, const char *msg) { switch (obsolete_panic) { case 0: return; case 1: if (running < major) return; /* FALLTHROUGH */ default: panic("%s", msg); } } void _gone_in(int major, const char *msg) { gone_panic(major, P_OSREL_MAJOR(__FreeBSD_version), msg); if (P_OSREL_MAJOR(__FreeBSD_version) >= major) printf("Obsolete code will be removed soon: %s\n", msg); else printf("Deprecated code (to be removed in FreeBSD %d): %s\n", major, msg); } void _gone_in_dev(device_t dev, int major, const char *msg) { gone_panic(major, P_OSREL_MAJOR(__FreeBSD_version), msg); if (P_OSREL_MAJOR(__FreeBSD_version) >= major) device_printf(dev, "Obsolete code will be removed soon: %s\n", msg); else device_printf(dev, "Deprecated code (to be removed in FreeBSD %d): %s\n", major, msg); } #ifdef DDB DB_SHOW_COMMAND(device, db_show_device) { device_t dev; if (!have_addr) return; dev = (device_t)addr; db_printf("name: %s\n", device_get_nameunit(dev)); db_printf(" driver: %s\n", DRIVERNAME(dev->driver)); db_printf(" class: %s\n", DEVCLANAME(dev->devclass)); db_printf(" addr: %p\n", dev); db_printf(" parent: %p\n", dev->parent); db_printf(" softc: %p\n", dev->softc); db_printf(" ivars: %p\n", dev->ivars); } DB_SHOW_ALL_COMMAND(devices, db_show_all_devices) { device_t dev; TAILQ_FOREACH(dev, &bus_data_devices, devlink) { db_show_device((db_expr_t)dev, true, count, modif); } } #endif diff --git a/sys/kern/subr_prf.c b/sys/kern/subr_prf.c index 5c06bf8270f6..5136ece359e5 100644 --- a/sys/kern/subr_prf.c +++ b/sys/kern/subr_prf.c @@ -1,1339 +1,1339 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1986, 1988, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)subr_prf.c 8.3 (Berkeley) 1/21/94 */ #include #ifdef _KERNEL #include "opt_ddb.h" #include "opt_printf.h" #endif /* _KERNEL */ #include #ifdef _KERNEL #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #else /* !_KERNEL */ #include #endif #include #include #ifdef DDB #include #endif /* * Note that stdarg.h and the ANSI style va_start macro is used for both * ANSI and traditional C compilers. */ #ifdef _KERNEL #include #else #include #endif /* * This is needed for sbuf_putbuf() when compiled into userland. Due to the * shared nature of this file, it's the only place to put it. */ #ifndef _KERNEL #include #endif #ifdef _KERNEL #define TOCONS 0x01 #define TOTTY 0x02 #define TOLOG 0x04 /* Max number conversion buffer length: a u_quad_t in base 2, plus NUL byte. */ #define MAXNBUF (sizeof(intmax_t) * NBBY + 1) struct putchar_arg { int flags; int pri; struct tty *tty; char *p_bufr; size_t n_bufr; char *p_next; size_t remain; }; struct snprintf_arg { char *str; size_t remain; }; extern int log_open; static void msglogchar(int c, int pri); static void msglogstr(char *str, int pri, int filter_cr); static void prf_putbuf(char *bufr, int flags, int pri); static void putchar(int ch, void *arg); static char *ksprintn(char *nbuf, uintmax_t num, int base, int *len, int upper); static void snprintf_func(int ch, void *arg); static bool msgbufmapped; /* Set when safe to use msgbuf */ int msgbuftrigger; struct msgbuf *msgbufp; #ifndef BOOT_TAG_SZ #define BOOT_TAG_SZ 32 #endif #ifndef BOOT_TAG /* Tag used to mark the start of a boot in dmesg */ #define BOOT_TAG "---<>---" #endif static char current_boot_tag[BOOT_TAG_SZ + 1] = BOOT_TAG; SYSCTL_STRING(_kern, OID_AUTO, boot_tag, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, current_boot_tag, 0, "Tag added to dmesg at start of boot"); static int log_console_output = 1; SYSCTL_INT(_kern, OID_AUTO, log_console_output, CTLFLAG_RWTUN, &log_console_output, 0, "Duplicate console output to the syslog"); /* * See the comment in log_console() below for more explanation of this. */ static int log_console_add_linefeed; SYSCTL_INT(_kern, OID_AUTO, log_console_add_linefeed, CTLFLAG_RWTUN, &log_console_add_linefeed, 0, "log_console() adds extra newlines"); static int always_console_output; SYSCTL_INT(_kern, OID_AUTO, always_console_output, CTLFLAG_RWTUN, &always_console_output, 0, "Always output to console despite TIOCCONS"); /* * Warn that a system table is full. */ void tablefull(const char *tab) { log(LOG_ERR, "%s: table is full\n", tab); } /* * Uprintf prints to the controlling terminal for the current process. */ int uprintf(const char *fmt, ...) 
{ va_list ap; struct putchar_arg pca; struct proc *p; struct thread *td; int retval; td = curthread; if (TD_IS_IDLETHREAD(td)) return (0); if (td->td_proc == initproc) { /* Produce output when we fail to load /sbin/init: */ va_start(ap, fmt); retval = vprintf(fmt, ap); va_end(ap); return (retval); } sx_slock(&proctree_lock); p = td->td_proc; PROC_LOCK(p); if ((p->p_flag & P_CONTROLT) == 0) { PROC_UNLOCK(p); sx_sunlock(&proctree_lock); return (0); } SESS_LOCK(p->p_session); pca.tty = p->p_session->s_ttyp; SESS_UNLOCK(p->p_session); PROC_UNLOCK(p); if (pca.tty == NULL) { sx_sunlock(&proctree_lock); return (0); } pca.flags = TOTTY; pca.p_bufr = NULL; va_start(ap, fmt); tty_lock(pca.tty); sx_sunlock(&proctree_lock); retval = kvprintf(fmt, putchar, &pca, 10, ap); tty_unlock(pca.tty); va_end(ap); return (retval); } /* * tprintf and vtprintf print on the controlling terminal associated with the * given session, possibly to the log as well. */ void tprintf(struct proc *p, int pri, const char *fmt, ...) { va_list ap; va_start(ap, fmt); vtprintf(p, pri, fmt, ap); va_end(ap); } void vtprintf(struct proc *p, int pri, const char *fmt, va_list ap) { struct tty *tp = NULL; int flags = 0; struct putchar_arg pca; struct session *sess = NULL; sx_slock(&proctree_lock); if (pri != -1) flags |= TOLOG; if (p != NULL) { PROC_LOCK(p); if (p->p_flag & P_CONTROLT && p->p_session->s_ttyvp) { sess = p->p_session; sess_hold(sess); PROC_UNLOCK(p); tp = sess->s_ttyp; if (tp != NULL && tty_checkoutq(tp)) flags |= TOTTY; else tp = NULL; } else PROC_UNLOCK(p); } pca.pri = pri; pca.tty = tp; pca.flags = flags; pca.p_bufr = NULL; if (pca.tty != NULL) tty_lock(pca.tty); sx_sunlock(&proctree_lock); kvprintf(fmt, putchar, &pca, 10, ap); if (pca.tty != NULL) tty_unlock(pca.tty); if (sess != NULL) sess_release(sess); msgbuftrigger = 1; } static int _vprintf(int level, int flags, const char *fmt, va_list ap) { struct putchar_arg pca; int retval; #ifdef PRINTF_BUFR_SIZE char bufr[PRINTF_BUFR_SIZE]; #endif TSENTER(); pca.tty = NULL; pca.pri = level; pca.flags = flags; #ifdef PRINTF_BUFR_SIZE pca.p_bufr = bufr; pca.p_next = pca.p_bufr; pca.n_bufr = sizeof(bufr); pca.remain = sizeof(bufr); *pca.p_next = '\0'; #else /* Don't buffer console output. */ pca.p_bufr = NULL; #endif retval = kvprintf(fmt, putchar, &pca, 10, ap); #ifdef PRINTF_BUFR_SIZE /* Write any buffered console/log output: */ if (*pca.p_bufr != '\0') prf_putbuf(pca.p_bufr, flags, level); #endif TSEXIT(); return (retval); } /* * Log writes to the log buffer, and guarantees not to sleep (so can be * called by interrupt routines). If there is no process reading the * log yet, it writes to the console also. */ void log(int level, const char *fmt, ...) { va_list ap; va_start(ap, fmt); vlog(level, fmt, ap); va_end(ap); } void vlog(int level, const char *fmt, va_list ap) { (void)_vprintf(level, log_open ? 
TOLOG : TOCONS | TOLOG, fmt, ap); msgbuftrigger = 1; } #define CONSCHUNK 128 void log_console(struct uio *uio) { int c, error, nl; char *consbuffer; int pri; if (!log_console_output) return; pri = LOG_INFO | LOG_CONSOLE; uio = cloneuio(uio); consbuffer = malloc(CONSCHUNK, M_TEMP, M_WAITOK); nl = 0; while (uio->uio_resid > 0) { c = imin(uio->uio_resid, CONSCHUNK - 1); error = uiomove(consbuffer, c, uio); if (error != 0) break; /* Make sure we're NUL-terminated */ consbuffer[c] = '\0'; if (consbuffer[c - 1] == '\n') nl = 1; else nl = 0; msglogstr(consbuffer, pri, /*filter_cr*/ 1); } /* * The previous behavior in log_console() is preserved when * log_console_add_linefeed is non-zero. For that behavior, if an * individual console write came in that was not terminated with a * line feed, it would add a line feed. * * This results in different data in the message buffer than * appears on the system console (which doesn't add extra line feed * characters). * * A number of programs and rc scripts write a line feed, or a period * and a line feed when they have completed their operation. On * the console, this looks seamless, but when displayed with * 'dmesg -a', you wind up with output that looks like this: * * Updating motd: * . * * On the console, it looks like this: * Updating motd:. * * We could add logic to detect that situation, or just not insert * the extra newlines. Set the kern.log_console_add_linefeed * sysctl/tunable variable to get the old behavior. */ if (!nl && log_console_add_linefeed) { consbuffer[0] = '\n'; consbuffer[1] = '\0'; msglogstr(consbuffer, pri, /*filter_cr*/ 1); } msgbuftrigger = 1; free(uio, M_IOV); free(consbuffer, M_TEMP); } int printf(const char *fmt, ...) { va_list ap; int retval; va_start(ap, fmt); retval = vprintf(fmt, ap); va_end(ap); return (retval); } int vprintf(const char *fmt, va_list ap) { int retval; retval = _vprintf(-1, TOCONS | TOLOG, fmt, ap); if (!KERNEL_PANICKED()) msgbuftrigger = 1; return (retval); } static void prf_putchar(int c, int flags, int pri) { if (flags & TOLOG) { msglogchar(c, pri); msgbuftrigger = 1; } if (flags & TOCONS) { if ((!KERNEL_PANICKED()) && (constty != NULL)) msgbuf_addchar(&consmsgbuf, c); if ((constty == NULL) || always_console_output) cnputc(c); } } static void prf_putbuf(char *bufr, int flags, int pri) { if (flags & TOLOG) { msglogstr(bufr, pri, /*filter_cr*/1); msgbuftrigger = 1; } if (flags & TOCONS) { if ((!KERNEL_PANICKED()) && (constty != NULL)) msgbuf_addstr(&consmsgbuf, -1, bufr, /*filter_cr*/ 0); if ((constty == NULL) || always_console_output) cnputs(bufr); } } static void putbuf(int c, struct putchar_arg *ap) { /* Check if no console output buffer was provided. */ if (ap->p_bufr == NULL) { prf_putchar(c, ap->flags, ap->pri); } else { /* Buffer the character: */ *ap->p_next++ = c; ap->remain--; /* Always leave the buffer zero terminated. */ *ap->p_next = '\0'; /* Check if the buffer needs to be flushed. */ if (ap->remain == 2 || c == '\n') { prf_putbuf(ap->p_bufr, ap->flags, ap->pri); ap->p_next = ap->p_bufr; ap->remain = ap->n_bufr; *ap->p_next = '\0'; } /* * Since we fill the buffer up one character at a time, * this should not happen. We should always catch it when * ap->remain == 2 (if not sooner due to a newline), flush * the buffer and move on. One way this could happen is * if someone sets PRINTF_BUFR_SIZE to 1 or something * similarly silly. */ KASSERT(ap->remain > 2, ("Bad buffer logic, remain = %zd", ap->remain)); } } /* * Print a character on console or users terminal. 
If destination is * the console then the last bunch of characters are saved in msgbuf for * inspection later. */ static void putchar(int c, void *arg) { struct putchar_arg *ap = (struct putchar_arg*) arg; struct tty *tp = ap->tty; int flags = ap->flags; /* Don't use the tty code after a panic or while in ddb. */ if (kdb_active) { if (c != '\0') cnputc(c); return; } if ((flags & TOTTY) && tp != NULL && !KERNEL_PANICKED()) tty_putchar(tp, c); if ((flags & (TOCONS | TOLOG)) && c != '\0') putbuf(c, ap); } /* * Scaled down version of sprintf(3). */ int sprintf(char *buf, const char *cfmt, ...) { int retval; va_list ap; va_start(ap, cfmt); retval = kvprintf(cfmt, NULL, (void *)buf, 10, ap); buf[retval] = '\0'; va_end(ap); return (retval); } /* * Scaled down version of vsprintf(3). */ int vsprintf(char *buf, const char *cfmt, va_list ap) { int retval; retval = kvprintf(cfmt, NULL, (void *)buf, 10, ap); buf[retval] = '\0'; return (retval); } /* * Scaled down version of snprintf(3). */ int snprintf(char *str, size_t size, const char *format, ...) { int retval; va_list ap; va_start(ap, format); retval = vsnprintf(str, size, format, ap); va_end(ap); return(retval); } /* * Scaled down version of vsnprintf(3). */ int vsnprintf(char *str, size_t size, const char *format, va_list ap) { struct snprintf_arg info; int retval; info.str = str; info.remain = size; retval = kvprintf(format, snprintf_func, &info, 10, ap); if (info.remain >= 1) *info.str++ = '\0'; return (retval); } /* * Kernel version which takes radix argument vsnprintf(3). */ int vsnrprintf(char *str, size_t size, int radix, const char *format, va_list ap) { struct snprintf_arg info; int retval; info.str = str; info.remain = size; retval = kvprintf(format, snprintf_func, &info, radix, ap); if (info.remain >= 1) *info.str++ = '\0'; return (retval); } static void snprintf_func(int ch, void *arg) { struct snprintf_arg *const info = arg; if (info->remain >= 2) { *info->str++ = ch; info->remain--; } } /* * Put a NUL-terminated ASCII number (base <= 36) in a buffer in reverse * order; return an optional length and a pointer to the last character * written in the buffer (i.e., the first character of the string). * The buffer pointed to by `nbuf' must have length >= MAXNBUF. */ static char * ksprintn(char *nbuf, uintmax_t num, int base, int *lenp, int upper) { char *p, c; p = nbuf; *p = '\0'; do { c = hex2ascii(num % base); *++p = upper ? toupper(c) : c; } while (num /= base); if (lenp) *lenp = p - nbuf; return (p); } /* * Scaled down version of printf(3). * * Two additional formats: * * The format %b is supported to decode error registers. * Its usage is: * * printf("reg=%b\n", regval, "<base><arg>*"); * * where <base> is the output base expressed as a control character, e.g. * \10 gives octal; \20 gives hex. Each arg is a sequence of characters, * the first of which gives the bit number to be inspected (origin 1), and * the next characters (up to a control character, i.e. a character <= 32), * give the name of the register. Thus: * * kvprintf("reg=%b\n", 3, "\10\2BITTWO\1BITONE"); * * would produce output: * * reg=3<BITTWO,BITONE> * * XXX: %D -- Hexdump, takes pointer and separator string: * ("%6D", ptr, ":") -> XX:XX:XX:XX:XX:XX * ("%*D", len, ptr, " ") -> XX XX XX XX ...
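 *
 * A typical driver use of the %b conversion described above (the register
 * layout here is made up):
 *
 *		printf("status=%b\n", status, "\20\3RXDONE\2TXDONE\1LINKUP");
 *
 * prints, e.g., "status=5<RXDONE,LINKUP>" when bits 1 and 3 are set.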
*/ int kvprintf(char const *fmt, void (*func)(int, void*), void *arg, int radix, va_list ap) { #define PCHAR(c) {int cc=(c); if (func) (*func)(cc,arg); else *d++ = cc; retval++; } char nbuf[MAXNBUF]; char *d; const char *p, *percent, *q; u_char *up; int ch, n; uintmax_t num; int base, lflag, qflag, tmp, width, ladjust, sharpflag, neg, sign, dot; int cflag, hflag, jflag, tflag, zflag; int bconv, dwidth, upper; char padc; int stop = 0, retval = 0; num = 0; q = NULL; if (!func) d = (char *) arg; else d = NULL; if (fmt == NULL) fmt = "(fmt null)\n"; if (radix < 2 || radix > 36) radix = 10; for (;;) { padc = ' '; width = 0; while ((ch = (u_char)*fmt++) != '%' || stop) { if (ch == '\0') return (retval); PCHAR(ch); } percent = fmt - 1; qflag = 0; lflag = 0; ladjust = 0; sharpflag = 0; neg = 0; sign = 0; dot = 0; bconv = 0; dwidth = 0; upper = 0; cflag = 0; hflag = 0; jflag = 0; tflag = 0; zflag = 0; reswitch: switch (ch = (u_char)*fmt++) { case '.': dot = 1; goto reswitch; case '#': sharpflag = 1; goto reswitch; case '+': sign = 1; goto reswitch; case '-': ladjust = 1; goto reswitch; case '%': PCHAR(ch); break; case '*': if (!dot) { width = va_arg(ap, int); if (width < 0) { ladjust = !ladjust; width = -width; } } else { dwidth = va_arg(ap, int); } goto reswitch; case '0': if (!dot) { padc = '0'; goto reswitch; } /* FALLTHROUGH */ case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': for (n = 0;; ++fmt) { n = n * 10 + ch - '0'; ch = *fmt; if (ch < '0' || ch > '9') break; } if (dot) dwidth = n; else width = n; goto reswitch; case 'b': ladjust = 1; bconv = 1; goto handle_nosign; case 'c': width -= 1; if (!ladjust && width > 0) while (width--) PCHAR(padc); PCHAR(va_arg(ap, int)); if (ladjust && width > 0) while (width--) PCHAR(padc); break; case 'D': up = va_arg(ap, u_char *); p = va_arg(ap, char *); if (!width) width = 16; while(width--) { PCHAR(hex2ascii(*up >> 4)); PCHAR(hex2ascii(*up & 0x0f)); up++; if (width) for (q=p;*q;q++) PCHAR(*q); } break; case 'd': case 'i': base = 10; sign = 1; goto handle_sign; case 'h': if (hflag) { hflag = 0; cflag = 1; } else hflag = 1; goto reswitch; case 'j': jflag = 1; goto reswitch; case 'l': if (lflag) { lflag = 0; qflag = 1; } else lflag = 1; goto reswitch; case 'n': /* * We do not support %n in kernel, but consume the * argument. 
*/ if (jflag) (void)va_arg(ap, intmax_t *); else if (qflag) (void)va_arg(ap, quad_t *); else if (lflag) (void)va_arg(ap, long *); else if (zflag) (void)va_arg(ap, size_t *); else if (hflag) (void)va_arg(ap, short *); else if (cflag) (void)va_arg(ap, char *); else (void)va_arg(ap, int *); break; case 'o': base = 8; goto handle_nosign; case 'p': base = 16; sharpflag = (width == 0); sign = 0; num = (uintptr_t)va_arg(ap, void *); goto number; case 'q': qflag = 1; goto reswitch; case 'r': base = radix; if (sign) goto handle_sign; goto handle_nosign; case 's': p = va_arg(ap, char *); if (p == NULL) p = "(null)"; if (!dot) n = strlen (p); else for (n = 0; n < dwidth && p[n]; n++) continue; width -= n; if (!ladjust && width > 0) while (width--) PCHAR(padc); while (n--) PCHAR(*p++); if (ladjust && width > 0) while (width--) PCHAR(padc); break; case 't': tflag = 1; goto reswitch; case 'u': base = 10; goto handle_nosign; case 'X': upper = 1; /* FALLTHROUGH */ case 'x': base = 16; goto handle_nosign; case 'y': base = 16; sign = 1; goto handle_sign; case 'z': zflag = 1; goto reswitch; handle_nosign: sign = 0; if (jflag) num = va_arg(ap, uintmax_t); else if (qflag) num = va_arg(ap, u_quad_t); else if (tflag) num = va_arg(ap, ptrdiff_t); else if (lflag) num = va_arg(ap, u_long); else if (zflag) num = va_arg(ap, size_t); else if (hflag) num = (u_short)va_arg(ap, int); else if (cflag) num = (u_char)va_arg(ap, int); else num = va_arg(ap, u_int); if (bconv) { q = va_arg(ap, char *); base = *q++; } goto number; handle_sign: if (jflag) num = va_arg(ap, intmax_t); else if (qflag) num = va_arg(ap, quad_t); else if (tflag) num = va_arg(ap, ptrdiff_t); else if (lflag) num = va_arg(ap, long); else if (zflag) num = va_arg(ap, ssize_t); else if (hflag) num = (short)va_arg(ap, int); else if (cflag) num = (char)va_arg(ap, int); else num = va_arg(ap, int); number: if (sign && (intmax_t)num < 0) { neg = 1; num = -(intmax_t)num; } p = ksprintn(nbuf, num, base, &n, upper); tmp = 0; if (sharpflag && num != 0) { if (base == 8) tmp++; else if (base == 16) tmp += 2; } if (neg) tmp++; if (!ladjust && padc == '0') dwidth = width - tmp; width -= tmp + imax(dwidth, n); dwidth -= n; if (!ladjust) while (width-- > 0) PCHAR(' '); if (neg) PCHAR('-'); if (sharpflag && num != 0) { if (base == 8) { PCHAR('0'); } else if (base == 16) { PCHAR('0'); PCHAR('x'); } } while (dwidth-- > 0) PCHAR('0'); while (*p) PCHAR(*p--); if (bconv && num != 0) { /* %b conversion flag format. */ tmp = retval; while (*q) { n = *q++; if (num & (1 << (n - 1))) { PCHAR(retval != tmp ? ',' : '<'); for (; (n = *q) > ' '; ++q) PCHAR(n); } else for (; *q > ' '; ++q) continue; } if (retval != tmp) { PCHAR('>'); width -= retval - tmp; } } if (ladjust) while (width-- > 0) PCHAR(' '); break; default: while (percent < fmt) PCHAR(*percent++); /* * Since we ignore a formatting argument it is no * longer safe to obey the remaining formatting * arguments as the arguments will no longer match * the format specs. */ stop = 1; break; } } #undef PCHAR } /* * Put character in log buffer with a particular priority. 
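 *
 * Illustration (hypothetical message): a line logged with log(LOG_ERR, ...)
 * is stored with its priority encoded in front of it, e.g.
 *
 *	<3>foo0: device timeout
 *
 * which syslogd(8) later strips and uses to route the message.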
*/ static void msglogchar(int c, int pri) { static int lastpri = -1; static int dangling; char nbuf[MAXNBUF]; char *p; if (!msgbufmapped) return; if (c == '\0' || c == '\r') return; if (pri != -1 && pri != lastpri) { if (dangling) { msgbuf_addchar(msgbufp, '\n'); dangling = 0; } msgbuf_addchar(msgbufp, '<'); for (p = ksprintn(nbuf, (uintmax_t)pri, 10, NULL, 0); *p;) msgbuf_addchar(msgbufp, *p--); msgbuf_addchar(msgbufp, '>'); lastpri = pri; } msgbuf_addchar(msgbufp, c); if (c == '\n') { dangling = 0; lastpri = -1; } else { dangling = 1; } } static void msglogstr(char *str, int pri, int filter_cr) { if (!msgbufmapped) return; msgbuf_addstr(msgbufp, pri, str, filter_cr); } void msgbufinit(void *ptr, int size) { char *cp; static struct msgbuf *oldp = NULL; bool print_boot_tag; TSENTER(); size -= sizeof(*msgbufp); cp = (char *)ptr; print_boot_tag = !msgbufmapped; /* Attempt to fetch kern.boot_tag tunable on first mapping */ if (!msgbufmapped) TUNABLE_STR_FETCH("kern.boot_tag", current_boot_tag, sizeof(current_boot_tag)); msgbufp = (struct msgbuf *)(cp + size); msgbuf_reinit(msgbufp, cp, size); if (msgbufmapped && oldp != msgbufp) msgbuf_copy(oldp, msgbufp); msgbufmapped = true; if (print_boot_tag && *current_boot_tag != '\0') printf("%s\n", current_boot_tag); oldp = msgbufp; TSEXIT(); } /* Sysctls for accessing/clearing the msgbuf */ static int sysctl_kern_msgbuf(SYSCTL_HANDLER_ARGS) { char buf[128], *bp; u_int seq; int error, len; bool wrap; error = priv_check(req->td, PRIV_MSGBUF); if (error) return (error); /* Read the whole buffer, one chunk at a time. */ mtx_lock(&msgbuf_lock); msgbuf_peekbytes(msgbufp, NULL, 0, &seq); wrap = (seq != 0); for (;;) { len = msgbuf_peekbytes(msgbufp, buf, sizeof(buf), &seq); mtx_unlock(&msgbuf_lock); if (len == 0) return (SYSCTL_OUT(req, "", 1)); /* add nulterm */ if (wrap) { /* Skip the first line, as it is probably incomplete. 
*/ bp = memchr(buf, '\n', len); if (bp == NULL) { mtx_lock(&msgbuf_lock); continue; } wrap = false; bp++; len -= bp - buf; if (len == 0) { mtx_lock(&msgbuf_lock); continue; } } else bp = buf; error = sysctl_handle_opaque(oidp, bp, len, req); if (error) return (error); mtx_lock(&msgbuf_lock); } } SYSCTL_PROC(_kern, OID_AUTO, msgbuf, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_kern_msgbuf, "A", "Contents of kernel message buffer"); static int msgbuf_clearflag; static int sysctl_kern_msgbuf_clear(SYSCTL_HANDLER_ARGS) { int error; error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req); if (!error && req->newptr) { mtx_lock(&msgbuf_lock); msgbuf_clear(msgbufp); mtx_unlock(&msgbuf_lock); msgbuf_clearflag = 0; } return (error); } SYSCTL_PROC(_kern, OID_AUTO, msgbuf_clear, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE | CTLFLAG_MPSAFE, &msgbuf_clearflag, 0, sysctl_kern_msgbuf_clear, "I", "Clear kernel message buffer"); #ifdef DDB DB_SHOW_COMMAND_FLAGS(msgbuf, db_show_msgbuf, DB_CMD_MEMSAFE) { int i, j; if (!msgbufmapped) { db_printf("msgbuf not mapped yet\n"); return; } db_printf("msgbufp = %p\n", msgbufp); db_printf("magic = %x, size = %d, r= %u, w = %u, ptr = %p, cksum= %u\n", msgbufp->msg_magic, msgbufp->msg_size, msgbufp->msg_rseq, msgbufp->msg_wseq, msgbufp->msg_ptr, msgbufp->msg_cksum); for (i = 0; i < msgbufp->msg_size && !db_pager_quit; i++) { j = MSGBUF_SEQ_TO_POS(msgbufp, i + msgbufp->msg_rseq); db_printf("%c", msgbufp->msg_ptr[j]); } db_printf("\n"); } #endif /* DDB */ void hexdump(const void *ptr, int length, const char *hdr, int flags) { int i, j, k; int cols; const unsigned char *cp; char delim; if ((flags & HD_DELIM_MASK) != 0) delim = (flags & HD_DELIM_MASK) >> 8; else delim = ' '; if ((flags & HD_COLUMN_MASK) != 0) cols = flags & HD_COLUMN_MASK; else cols = 16; cp = ptr; for (i = 0; i < length; i+= cols) { if (hdr != NULL) printf("%s", hdr); if ((flags & HD_OMIT_COUNT) == 0) printf("%04x ", i); if ((flags & HD_OMIT_HEX) == 0) { for (j = 0; j < cols; j++) { k = i + j; if (k < length) printf("%c%02x", delim, cp[k]); else printf(" "); } } if ((flags & HD_OMIT_CHARS) == 0) { printf(" |"); for (j = 0; j < cols; j++) { k = i + j; if (k >= length) printf(" "); else if (cp[k] >= ' ' && cp[k] <= '~') printf("%c", cp[k]); else printf("."); } printf("|"); } printf("\n"); } } #endif /* _KERNEL */ void sbuf_hexdump(struct sbuf *sb, const void *ptr, int length, const char *hdr, int flags) { int i, j, k; int cols; const unsigned char *cp; char delim; if ((flags & HD_DELIM_MASK) != 0) delim = (flags & HD_DELIM_MASK) >> 8; else delim = ' '; if ((flags & HD_COLUMN_MASK) != 0) cols = flags & HD_COLUMN_MASK; else cols = 16; cp = ptr; for (i = 0; i < length; i+= cols) { if (hdr != NULL) sbuf_printf(sb, "%s", hdr); if ((flags & HD_OMIT_COUNT) == 0) sbuf_printf(sb, "%04x ", i); if ((flags & HD_OMIT_HEX) == 0) { for (j = 0; j < cols; j++) { k = i + j; if (k < length) sbuf_printf(sb, "%c%02x", delim, cp[k]); else - sbuf_printf(sb, " "); + sbuf_cat(sb, " "); } } if ((flags & HD_OMIT_CHARS) == 0) { - sbuf_printf(sb, " |"); + sbuf_cat(sb, " |"); for (j = 0; j < cols; j++) { k = i + j; if (k >= length) - sbuf_printf(sb, " "); + sbuf_putc(sb, ' '); else if (cp[k] >= ' ' && cp[k] <= '~') - sbuf_printf(sb, "%c", cp[k]); + sbuf_putc(sb, cp[k]); else - sbuf_printf(sb, "."); + sbuf_putc(sb, '.'); } - sbuf_printf(sb, "|"); + sbuf_putc(sb, '|'); } - sbuf_printf(sb, "\n"); + sbuf_putc(sb, '\n'); } } #ifdef _KERNEL void counted_warning(unsigned *counter, const char *msg) { struct thread *td; 
unsigned c; for (;;) { c = *counter; if (c == 0) break; if (atomic_cmpset_int(counter, c, c - 1)) { td = curthread; log(LOG_INFO, "pid %d (%s) %s%s\n", td->td_proc->p_pid, td->td_name, msg, c > 1 ? "" : " - not logging anymore"); break; } } } #endif #ifdef _KERNEL void sbuf_putbuf(struct sbuf *sb) { prf_putbuf(sbuf_data(sb), TOLOG | TOCONS, -1); } #else void sbuf_putbuf(struct sbuf *sb) { printf("%s", sbuf_data(sb)); } #endif int sbuf_printf_drain(void *arg, const char *data, int len) { size_t *retvalptr; int r; #ifdef _KERNEL char *dataptr; char oldchr; /* * This is allowed as an extra byte is always resvered for * terminating NUL byte. Save and restore the byte because * we might be flushing a record, and there may be valid * data after the buffer. */ oldchr = data[len]; dataptr = __DECONST(char *, data); dataptr[len] = '\0'; prf_putbuf(dataptr, TOLOG | TOCONS, -1); r = len; dataptr[len] = oldchr; #else /* !_KERNEL */ r = printf("%.*s", len, data); if (r < 0) return (-errno); #endif retvalptr = arg; if (retvalptr != NULL) *retvalptr += r; return (r); } diff --git a/sys/kern/subr_sleepqueue.c b/sys/kern/subr_sleepqueue.c index bbbf753e3df0..15dbf396c557 100644 --- a/sys/kern/subr_sleepqueue.c +++ b/sys/kern/subr_sleepqueue.c @@ -1,1528 +1,1528 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2004 John Baldwin * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Implementation of sleep queues used to hold queue of threads blocked on * a wait channel. Sleep queues are different from turnstiles in that wait * channels are not owned by anyone, so there is no priority propagation. * Sleep queues can also provide a timeout and can also be interrupted by * signals. That said, there are several similarities between the turnstile * and sleep queue implementations. (Note: turnstiles were implemented * first.) For example, both use a hash table of the same size where each * bucket is referred to as a "chain" that contains both a spin lock and * a linked list of queues. An individual queue is located by using a hash * to pick a chain, locking the chain, and then walking the chain searching * for the queue. This means that a wait channel object does not need to * embed its queue head just as locks do not embed their turnstile queue * head. 
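 * (For example, sleepq_lock(wchan) maps the wait channel through SC_HASH()
 * to a sleepq_chain and takes that chain's spin lock, after which
 * sleepq_lookup(wchan) walks that chain looking for a matching queue.)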
Threads also carry around a sleep queue that they lend to the * wait channel when blocking. Just as in turnstiles, the queue includes * a free list of the sleep queues of other threads blocked on the same * wait channel in the case of multiple waiters. * * Some additional functionality provided by sleep queues include the * ability to set a timeout. The timeout is managed using a per-thread * callout that resumes a thread if it is asleep. A thread may also * catch signals while it is asleep (aka an interruptible sleep). The * signal code uses sleepq_abort() to interrupt a sleeping thread. Finally, * sleep queues also provide some extra assertions. One is not allowed to * mix the sleep/wakeup and cv APIs for a given wait channel. Also, one * must consistently use the same lock to synchronize with a wait channel, * though this check is currently only a warning for sleep/wakeup due to * pre-existing abuse of that API. The same lock must also be held when * awakening threads, though that is currently only enforced for condition * variables. */ #include #include "opt_sleepqueue_profiling.h" #include "opt_ddb.h" #include "opt_sched.h" #include "opt_stack.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef EPOCH_TRACE #include #endif #include #include #ifdef DDB #include #endif /* * Constants for the hash table of sleep queue chains. * SC_TABLESIZE must be a power of two for SC_MASK to work properly. */ #ifndef SC_TABLESIZE #define SC_TABLESIZE 256 #endif CTASSERT(powerof2(SC_TABLESIZE)); #define SC_MASK (SC_TABLESIZE - 1) #define SC_SHIFT 8 #define SC_HASH(wc) ((((uintptr_t)(wc) >> SC_SHIFT) ^ (uintptr_t)(wc)) & \ SC_MASK) #define SC_LOOKUP(wc) &sleepq_chains[SC_HASH(wc)] #define NR_SLEEPQS 2 /* * There are two different lists of sleep queues. Both lists are connected * via the sq_hash entries. The first list is the sleep queue chain list * that a sleep queue is on when it is attached to a wait channel. The * second list is the free list hung off of a sleep queue that is attached * to a wait channel. * * Each sleep queue also contains the wait channel it is attached to, the * list of threads blocked on that wait channel, flags specific to the * wait channel, and the lock used to synchronize with a wait channel. * The flags are used to catch mismatches between the various consumers * of the sleep queue API (e.g. sleep/wakeup and condition variables). * The lock pointer is only used when invariants are enabled for various * debugging checks. * * Locking key: * c - sleep queue chain lock */ struct sleepqueue { struct threadqueue sq_blocked[NR_SLEEPQS]; /* (c) Blocked threads. */ u_int sq_blockedcnt[NR_SLEEPQS]; /* (c) N. of blocked threads. */ LIST_ENTRY(sleepqueue) sq_hash; /* (c) Chain and free list. */ LIST_HEAD(, sleepqueue) sq_free; /* (c) Free queues. */ const void *sq_wchan; /* (c) Wait channel. */ int sq_type; /* (c) Queue type. */ #ifdef INVARIANTS struct lock_object *sq_lock; /* (c) Associated lock. */ #endif }; struct sleepqueue_chain { LIST_HEAD(, sleepqueue) sc_queues; /* List of sleep queues. */ struct mtx sc_lock; /* Spin lock for this chain. */ #ifdef SLEEPQUEUE_PROFILING u_int sc_depth; /* Length of sc_queues. */ u_int sc_max_depth; /* Max length of sc_queues. 
*/ #endif } __aligned(CACHE_LINE_SIZE); #ifdef SLEEPQUEUE_PROFILING static SYSCTL_NODE(_debug, OID_AUTO, sleepq, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "sleepq profiling"); static SYSCTL_NODE(_debug_sleepq, OID_AUTO, chains, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "sleepq chain stats"); static u_int sleepq_max_depth; SYSCTL_UINT(_debug_sleepq, OID_AUTO, max_depth, CTLFLAG_RD, &sleepq_max_depth, 0, "maxmimum depth achieved of a single chain"); static void sleepq_profile(const char *wmesg); static int prof_enabled; #endif static struct sleepqueue_chain sleepq_chains[SC_TABLESIZE]; static uma_zone_t sleepq_zone; /* * Prototypes for non-exported routines. */ static int sleepq_catch_signals(const void *wchan, int pri); static inline int sleepq_check_signals(void); static inline int sleepq_check_timeout(void); #ifdef INVARIANTS static void sleepq_dtor(void *mem, int size, void *arg); #endif static int sleepq_init(void *mem, int size, int flags); static int sleepq_resume_thread(struct sleepqueue *sq, struct thread *td, int pri, int srqflags); static void sleepq_remove_thread(struct sleepqueue *sq, struct thread *td); static void sleepq_switch(const void *wchan, int pri); static void sleepq_timeout(void *arg); SDT_PROBE_DECLARE(sched, , , sleep); SDT_PROBE_DECLARE(sched, , , wakeup); /* * Initialize SLEEPQUEUE_PROFILING specific sysctl nodes. * Note that it must happen after sleepinit() has been fully executed, so * it must happen after SI_SUB_KMEM SYSINIT() subsystem setup. */ #ifdef SLEEPQUEUE_PROFILING static void init_sleepqueue_profiling(void) { char chain_name[10]; struct sysctl_oid *chain_oid; u_int i; for (i = 0; i < SC_TABLESIZE; i++) { snprintf(chain_name, sizeof(chain_name), "%u", i); chain_oid = SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(_debug_sleepq_chains), OID_AUTO, chain_name, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "sleepq chain stats"); SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO, "depth", CTLFLAG_RD, &sleepq_chains[i].sc_depth, 0, NULL); SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO, "max_depth", CTLFLAG_RD, &sleepq_chains[i].sc_max_depth, 0, NULL); } } SYSINIT(sleepqueue_profiling, SI_SUB_LOCK, SI_ORDER_ANY, init_sleepqueue_profiling, NULL); #endif /* * Early initialization of sleep queues that is called from the sleepinit() * SYSINIT. */ void init_sleepqueues(void) { int i; for (i = 0; i < SC_TABLESIZE; i++) { LIST_INIT(&sleepq_chains[i].sc_queues); mtx_init(&sleepq_chains[i].sc_lock, "sleepq chain", NULL, MTX_SPIN); } sleepq_zone = uma_zcreate("SLEEPQUEUE", sizeof(struct sleepqueue), #ifdef INVARIANTS NULL, sleepq_dtor, sleepq_init, NULL, UMA_ALIGN_CACHE, 0); #else NULL, NULL, sleepq_init, NULL, UMA_ALIGN_CACHE, 0); #endif thread0.td_sleepqueue = sleepq_alloc(); } /* * Get a sleep queue for a new thread. */ struct sleepqueue * sleepq_alloc(void) { return (uma_zalloc(sleepq_zone, M_WAITOK)); } /* * Free a sleep queue when a thread is destroyed. */ void sleepq_free(struct sleepqueue *sq) { uma_zfree(sleepq_zone, sq); } /* * Lock the sleep queue chain associated with the specified wait channel. */ void sleepq_lock(const void *wchan) { struct sleepqueue_chain *sc; sc = SC_LOOKUP(wchan); mtx_lock_spin(&sc->sc_lock); } /* * Look up the sleep queue associated with a given wait channel in the hash * table locking the associated sleep queue chain. If no queue is found in * the table, NULL is returned. 
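 * The caller must already hold the chain lock for wchan, normally via a
 * prior sleepq_lock(wchan); the mtx_assert() below enforces this.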
*/ struct sleepqueue * sleepq_lookup(const void *wchan) { struct sleepqueue_chain *sc; struct sleepqueue *sq; KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__)); sc = SC_LOOKUP(wchan); mtx_assert(&sc->sc_lock, MA_OWNED); LIST_FOREACH(sq, &sc->sc_queues, sq_hash) if (sq->sq_wchan == wchan) return (sq); return (NULL); } /* * Unlock the sleep queue chain associated with a given wait channel. */ void sleepq_release(const void *wchan) { struct sleepqueue_chain *sc; sc = SC_LOOKUP(wchan); mtx_unlock_spin(&sc->sc_lock); } /* * Places the current thread on the sleep queue for the specified wait * channel. If INVARIANTS is enabled, then it associates the passed in * lock with the sleepq to make sure it is held when that sleep queue is * woken up. */ void sleepq_add(const void *wchan, struct lock_object *lock, const char *wmesg, int flags, int queue) { struct sleepqueue_chain *sc; struct sleepqueue *sq; struct thread *td; td = curthread; sc = SC_LOOKUP(wchan); mtx_assert(&sc->sc_lock, MA_OWNED); MPASS(td->td_sleepqueue != NULL); MPASS(wchan != NULL); MPASS((queue >= 0) && (queue < NR_SLEEPQS)); /* If this thread is not allowed to sleep, die a horrible death. */ if (__predict_false(!THREAD_CAN_SLEEP())) { #ifdef EPOCH_TRACE epoch_trace_list(curthread); #endif KASSERT(0, ("%s: td %p to sleep on wchan %p with sleeping prohibited", __func__, td, wchan)); } /* Look up the sleep queue associated with the wait channel 'wchan'. */ sq = sleepq_lookup(wchan); /* * If the wait channel does not already have a sleep queue, use * this thread's sleep queue. Otherwise, insert the current thread * into the sleep queue already in use by this wait channel. */ if (sq == NULL) { #ifdef INVARIANTS int i; sq = td->td_sleepqueue; for (i = 0; i < NR_SLEEPQS; i++) { KASSERT(TAILQ_EMPTY(&sq->sq_blocked[i]), ("thread's sleep queue %d is not empty", i)); KASSERT(sq->sq_blockedcnt[i] == 0, ("thread's sleep queue %d count mismatches", i)); } KASSERT(LIST_EMPTY(&sq->sq_free), ("thread's sleep queue has a non-empty free list")); KASSERT(sq->sq_wchan == NULL, ("stale sq_wchan pointer")); sq->sq_lock = lock; #endif #ifdef SLEEPQUEUE_PROFILING sc->sc_depth++; if (sc->sc_depth > sc->sc_max_depth) { sc->sc_max_depth = sc->sc_depth; if (sc->sc_max_depth > sleepq_max_depth) sleepq_max_depth = sc->sc_max_depth; } #endif sq = td->td_sleepqueue; LIST_INSERT_HEAD(&sc->sc_queues, sq, sq_hash); sq->sq_wchan = wchan; sq->sq_type = flags & SLEEPQ_TYPE; } else { MPASS(wchan == sq->sq_wchan); MPASS(lock == sq->sq_lock); MPASS((flags & SLEEPQ_TYPE) == sq->sq_type); LIST_INSERT_HEAD(&sq->sq_free, td->td_sleepqueue, sq_hash); } thread_lock(td); TAILQ_INSERT_TAIL(&sq->sq_blocked[queue], td, td_slpq); sq->sq_blockedcnt[queue]++; td->td_sleepqueue = NULL; td->td_sqqueue = queue; td->td_wchan = wchan; td->td_wmesg = wmesg; if (flags & SLEEPQ_INTERRUPTIBLE) { td->td_intrval = 0; td->td_flags |= TDF_SINTR; } td->td_flags &= ~TDF_TIMEOUT; thread_unlock(td); } /* * Sets a timeout that will remove the current thread from the * specified sleep queue at the specified time if the thread has not * already been awakened. Flags are from C_* (callout) namespace. 
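 *
 * An illustrative, simplified timed-sleep sequence (flag and queue values
 * are placeholders; real consumers such as _sleep() do additional state
 * handling):
 *
 *	sleepq_lock(wchan);
 *	sleepq_add(wchan, lock, wmesg, flags, 0);
 *	sleepq_set_timeout_sbt(wchan, sbt, pr, cflags);
 *	error = sleepq_timedwait(wchan, pri);	(0 or EWOULDBLOCK)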
*/ void sleepq_set_timeout_sbt(const void *wchan, sbintime_t sbt, sbintime_t pr, int flags) { struct sleepqueue_chain *sc __unused; struct thread *td; sbintime_t pr1; td = curthread; sc = SC_LOOKUP(wchan); mtx_assert(&sc->sc_lock, MA_OWNED); MPASS(TD_ON_SLEEPQ(td)); MPASS(td->td_sleepqueue == NULL); MPASS(wchan != NULL); if (cold && td == &thread0) panic("timed sleep before timers are working"); KASSERT(td->td_sleeptimo == 0, ("td %d %p td_sleeptimo %jx", td->td_tid, td, (uintmax_t)td->td_sleeptimo)); thread_lock(td); callout_when(sbt, pr, flags, &td->td_sleeptimo, &pr1); thread_unlock(td); callout_reset_sbt_on(&td->td_slpcallout, td->td_sleeptimo, pr1, sleepq_timeout, td, PCPU_GET(cpuid), flags | C_PRECALC | C_DIRECT_EXEC); } /* * Return the number of actual sleepers for the specified queue. */ u_int sleepq_sleepcnt(const void *wchan, int queue) { struct sleepqueue *sq; KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__)); MPASS((queue >= 0) && (queue < NR_SLEEPQS)); sq = sleepq_lookup(wchan); if (sq == NULL) return (0); return (sq->sq_blockedcnt[queue]); } static int sleepq_check_ast_sc_locked(struct thread *td, struct sleepqueue_chain *sc) { struct proc *p; int ret; mtx_assert(&sc->sc_lock, MA_OWNED); if ((td->td_pflags & TDP_WAKEUP) != 0) { td->td_pflags &= ~TDP_WAKEUP; thread_lock(td); return (EINTR); } /* * See if there are any pending signals or suspension requests for this * thread. If not, we can switch immediately. */ thread_lock(td); if (!td_ast_pending(td, TDA_SIG) && !td_ast_pending(td, TDA_SUSPEND)) return (0); thread_unlock(td); mtx_unlock_spin(&sc->sc_lock); p = td->td_proc; CTR3(KTR_PROC, "sleepq catching signals: thread %p (pid %ld, %s)", (void *)td, (long)p->p_pid, td->td_name); PROC_LOCK(p); /* * Check for suspension first. Checking for signals and then * suspending could result in a missed signal, since a signal * can be delivered while this thread is suspended. */ ret = sig_ast_checksusp(td); if (ret != 0) { PROC_UNLOCK(p); mtx_lock_spin(&sc->sc_lock); thread_lock(td); return (ret); } ret = sig_ast_needsigchk(td); /* * Lock the per-process spinlock prior to dropping the * PROC_LOCK to avoid a signal delivery race. * PROC_LOCK, PROC_SLOCK, and thread_lock() are * currently held in tdsendsignal() and thread_single(). */ PROC_SLOCK(p); mtx_lock_spin(&sc->sc_lock); PROC_UNLOCK(p); thread_lock(td); PROC_SUNLOCK(p); return (ret); } /* * Marks the pending sleep of the current thread as interruptible and * makes an initial check for pending signals before putting a thread * to sleep. Enters and exits with the thread lock held. Thread lock * may have transitioned from the sleepq lock to a run lock. */ static int sleepq_catch_signals(const void *wchan, int pri) { struct thread *td; struct sleepqueue_chain *sc; struct sleepqueue *sq; int ret; sc = SC_LOOKUP(wchan); mtx_assert(&sc->sc_lock, MA_OWNED); MPASS(wchan != NULL); td = curthread; ret = sleepq_check_ast_sc_locked(td, sc); THREAD_LOCK_ASSERT(td, MA_OWNED); mtx_assert(&sc->sc_lock, MA_OWNED); if (ret == 0) { /* * No pending signals and no suspension requests found. * Switch the thread off the cpu. */ sleepq_switch(wchan, pri); } else { /* * There were pending signals and this thread is still * on the sleep queue, remove it from the sleep queue. */ if (TD_ON_SLEEPQ(td)) { sq = sleepq_lookup(wchan); sleepq_remove_thread(sq, td); } MPASS(td->td_lock != &sc->sc_lock); mtx_unlock_spin(&sc->sc_lock); thread_unlock(td); } return (ret); } /* * Switches to another thread if we are still asleep on a sleep queue. 
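 * If td_sleepqueue is already non-NULL here, the thread has been woken in
 * the meantime and the function returns without switching.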
* Returns with thread lock. */ static void sleepq_switch(const void *wchan, int pri) { struct sleepqueue_chain *sc; struct sleepqueue *sq; struct thread *td; bool rtc_changed; td = curthread; sc = SC_LOOKUP(wchan); mtx_assert(&sc->sc_lock, MA_OWNED); THREAD_LOCK_ASSERT(td, MA_OWNED); /* * If we have a sleep queue, then we've already been woken up, so * just return. */ if (td->td_sleepqueue != NULL) { mtx_unlock_spin(&sc->sc_lock); thread_unlock(td); return; } /* * If TDF_TIMEOUT is set, then our sleep has been timed out * already but we are still on the sleep queue, so dequeue the * thread and return. * * Do the same if the real-time clock has been adjusted since this * thread calculated its timeout based on that clock. This handles * the following race: * - The Ts thread needs to sleep until an absolute real-clock time. * It copies the global rtc_generation into curthread->td_rtcgen, * reads the RTC, and calculates a sleep duration based on that time. * See umtxq_sleep() for an example. * - The Tc thread adjusts the RTC, bumps rtc_generation, and wakes * threads that are sleeping until an absolute real-clock time. * See tc_setclock() and the POSIX specification of clock_settime(). * - Ts reaches the code below. It holds the sleepqueue chain lock, * so Tc has finished waking, so this thread must test td_rtcgen. * (The declaration of td_rtcgen refers to this comment.) */ rtc_changed = td->td_rtcgen != 0 && td->td_rtcgen != rtc_generation; if ((td->td_flags & TDF_TIMEOUT) || rtc_changed) { if (rtc_changed) { td->td_rtcgen = 0; } MPASS(TD_ON_SLEEPQ(td)); sq = sleepq_lookup(wchan); sleepq_remove_thread(sq, td); mtx_unlock_spin(&sc->sc_lock); thread_unlock(td); return; } #ifdef SLEEPQUEUE_PROFILING if (prof_enabled) sleepq_profile(td->td_wmesg); #endif MPASS(td->td_sleepqueue == NULL); sched_sleep(td, pri); thread_lock_set(td, &sc->sc_lock); SDT_PROBE0(sched, , , sleep); TD_SET_SLEEPING(td); mi_switch(SW_VOL | SWT_SLEEPQ); KASSERT(TD_IS_RUNNING(td), ("running but not TDS_RUNNING")); CTR3(KTR_PROC, "sleepq resume: thread %p (pid %ld, %s)", (void *)td, (long)td->td_proc->p_pid, (void *)td->td_name); } /* * Check to see if we timed out. */ static inline int sleepq_check_timeout(void) { struct thread *td; int res; res = 0; td = curthread; if (td->td_sleeptimo != 0) { if (td->td_sleeptimo <= sbinuptime()) res = EWOULDBLOCK; td->td_sleeptimo = 0; } return (res); } /* * Check to see if we were awoken by a signal. */ static inline int sleepq_check_signals(void) { struct thread *td; td = curthread; KASSERT((td->td_flags & TDF_SINTR) == 0, ("thread %p still in interruptible sleep?", td)); return (td->td_intrval); } /* * Block the current thread until it is awakened from its sleep queue. */ void sleepq_wait(const void *wchan, int pri) { struct thread *td; td = curthread; MPASS(!(td->td_flags & TDF_SINTR)); thread_lock(td); sleepq_switch(wchan, pri); } /* * Block the current thread until it is awakened from its sleep queue * or it is interrupted by a signal. */ int sleepq_wait_sig(const void *wchan, int pri) { int rcatch; rcatch = sleepq_catch_signals(wchan, pri); if (rcatch) return (rcatch); return (sleepq_check_signals()); } /* * Block the current thread until it is awakened from its sleep queue * or it times out while waiting. 
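 * Returns 0 if the thread was awakened normally, or EWOULDBLOCK if the
 * timeout armed by sleepq_set_timeout_sbt() fired first (see
 * sleepq_check_timeout()).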
*/ int sleepq_timedwait(const void *wchan, int pri) { struct thread *td; td = curthread; MPASS(!(td->td_flags & TDF_SINTR)); thread_lock(td); sleepq_switch(wchan, pri); return (sleepq_check_timeout()); } /* * Block the current thread until it is awakened from its sleep queue, * it is interrupted by a signal, or it times out waiting to be awakened. */ int sleepq_timedwait_sig(const void *wchan, int pri) { int rcatch, rvalt, rvals; rcatch = sleepq_catch_signals(wchan, pri); /* We must always call check_timeout() to clear sleeptimo. */ rvalt = sleepq_check_timeout(); rvals = sleepq_check_signals(); if (rcatch) return (rcatch); if (rvals) return (rvals); return (rvalt); } /* * Returns the type of sleepqueue given a waitchannel. */ int sleepq_type(const void *wchan) { struct sleepqueue *sq; int type; MPASS(wchan != NULL); sq = sleepq_lookup(wchan); if (sq == NULL) return (-1); type = sq->sq_type; return (type); } /* * Removes a thread from a sleep queue and makes it * runnable. * * Requires the sc chain locked on entry. If SRQ_HOLD is specified it will * be locked on return. Returns without the thread lock held. */ static int sleepq_resume_thread(struct sleepqueue *sq, struct thread *td, int pri, int srqflags) { struct sleepqueue_chain *sc; bool drop; MPASS(td != NULL); MPASS(sq->sq_wchan != NULL); MPASS(td->td_wchan == sq->sq_wchan); sc = SC_LOOKUP(sq->sq_wchan); mtx_assert(&sc->sc_lock, MA_OWNED); /* * Avoid recursing on the chain lock. If the locks don't match we * need to acquire the thread lock which setrunnable will drop for * us. In this case we need to drop the chain lock afterwards. * * There is no race that will make td_lock equal to sc_lock because * we hold sc_lock. */ drop = false; if (!TD_IS_SLEEPING(td)) { thread_lock(td); drop = true; } else thread_lock_block_wait(td); /* Remove thread from the sleepq. */ sleepq_remove_thread(sq, td); /* If we're done with the sleepqueue release it. */ if ((srqflags & SRQ_HOLD) == 0 && drop) mtx_unlock_spin(&sc->sc_lock); /* Adjust priority if requested. */ MPASS(pri == 0 || (pri >= PRI_MIN && pri <= PRI_MAX)); if (pri != 0 && td->td_priority > pri && PRI_BASE(td->td_pri_class) == PRI_TIMESHARE) sched_prio(td, pri); /* * Note that thread td might not be sleeping if it is running * sleepq_catch_signals() on another CPU or is blocked on its * proc lock to check signals. There's no need to mark the * thread runnable in that case. */ if (TD_IS_SLEEPING(td)) { MPASS(!drop); TD_CLR_SLEEPING(td); return (setrunnable(td, srqflags)); } MPASS(drop); thread_unlock(td); return (0); } static void sleepq_remove_thread(struct sleepqueue *sq, struct thread *td) { struct sleepqueue_chain *sc __unused; MPASS(td != NULL); MPASS(sq->sq_wchan != NULL); MPASS(td->td_wchan == sq->sq_wchan); MPASS(td->td_sqqueue < NR_SLEEPQS && td->td_sqqueue >= 0); THREAD_LOCK_ASSERT(td, MA_OWNED); sc = SC_LOOKUP(sq->sq_wchan); mtx_assert(&sc->sc_lock, MA_OWNED); SDT_PROBE2(sched, , , wakeup, td, td->td_proc); /* Remove the thread from the queue. */ sq->sq_blockedcnt[td->td_sqqueue]--; TAILQ_REMOVE(&sq->sq_blocked[td->td_sqqueue], td, td_slpq); /* * Get a sleep queue for this thread. If this is the last waiter, * use the queue itself and take it out of the chain, otherwise, * remove a queue from the free list. 
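 * Under INVARIANTS the reclaimed queue's sq_wchan is cleared so that a
 * stale pointer trips the "stale sq_wchan pointer" assertion in
 * sleepq_add().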
*/ if (LIST_EMPTY(&sq->sq_free)) { td->td_sleepqueue = sq; #ifdef INVARIANTS sq->sq_wchan = NULL; #endif #ifdef SLEEPQUEUE_PROFILING sc->sc_depth--; #endif } else td->td_sleepqueue = LIST_FIRST(&sq->sq_free); LIST_REMOVE(td->td_sleepqueue, sq_hash); if ((td->td_flags & TDF_TIMEOUT) == 0 && td->td_sleeptimo != 0 && td->td_lock == &sc->sc_lock) { /* * We ignore the situation where timeout subsystem was * unable to stop our callout. The struct thread is * type-stable, the callout will use the correct * memory when running. The checks of the * td_sleeptimo value in this function and in * sleepq_timeout() ensure that the thread does not * get spurious wakeups, even if the callout was reset * or thread reused. * * We also cannot safely stop the callout if a scheduler * lock is held since softclock_thread() forces a lock * order of callout lock -> scheduler lock. The thread * lock will be a scheduler lock only if the thread is * preparing to go to sleep, so this is hopefully a rare * scenario. */ callout_stop(&td->td_slpcallout); } td->td_wmesg = NULL; td->td_wchan = NULL; td->td_flags &= ~(TDF_SINTR | TDF_TIMEOUT); CTR3(KTR_PROC, "sleepq_wakeup: thread %p (pid %ld, %s)", (void *)td, (long)td->td_proc->p_pid, td->td_name); } void sleepq_remove_nested(struct thread *td) { struct sleepqueue_chain *sc; struct sleepqueue *sq; const void *wchan; MPASS(TD_ON_SLEEPQ(td)); wchan = td->td_wchan; sc = SC_LOOKUP(wchan); mtx_lock_spin(&sc->sc_lock); sq = sleepq_lookup(wchan); MPASS(sq != NULL); thread_lock(td); sleepq_remove_thread(sq, td); mtx_unlock_spin(&sc->sc_lock); /* Returns with the thread lock owned. */ } #ifdef INVARIANTS /* * UMA zone item deallocator. */ static void sleepq_dtor(void *mem, int size, void *arg) { struct sleepqueue *sq; int i; sq = mem; for (i = 0; i < NR_SLEEPQS; i++) { MPASS(TAILQ_EMPTY(&sq->sq_blocked[i])); MPASS(sq->sq_blockedcnt[i] == 0); } } #endif /* * UMA zone item initializer. */ static int sleepq_init(void *mem, int size, int flags) { struct sleepqueue *sq; int i; bzero(mem, size); sq = mem; for (i = 0; i < NR_SLEEPQS; i++) { TAILQ_INIT(&sq->sq_blocked[i]); sq->sq_blockedcnt[i] = 0; } LIST_INIT(&sq->sq_free); return (0); } /* * Find thread sleeping on a wait channel and resume it. */ int sleepq_signal(const void *wchan, int flags, int pri, int queue) { struct sleepqueue_chain *sc; struct sleepqueue *sq; struct threadqueue *head; struct thread *td, *besttd; int wakeup_swapper; CTR2(KTR_PROC, "sleepq_signal(%p, %d)", wchan, flags); KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__)); MPASS((queue >= 0) && (queue < NR_SLEEPQS)); sq = sleepq_lookup(wchan); if (sq == NULL) { if (flags & SLEEPQ_DROP) sleepq_release(wchan); return (0); } KASSERT(sq->sq_type == (flags & SLEEPQ_TYPE), ("%s: mismatch between sleep/wakeup and cv_*", __func__)); head = &sq->sq_blocked[queue]; if (flags & SLEEPQ_UNFAIR) { /* * Find the most recently sleeping thread, but try to * skip threads still in process of context switch to * avoid spinning on the thread lock. */ sc = SC_LOOKUP(wchan); besttd = TAILQ_LAST_FAST(head, thread, td_slpq); while (besttd->td_lock != &sc->sc_lock) { td = TAILQ_PREV_FAST(besttd, head, thread, td_slpq); if (td == NULL) break; besttd = td; } } else { /* * Find the highest priority thread on the queue. If there * is a tie, use the thread that first appears in the queue * as it has been sleeping the longest since threads are * always added to the tail of sleep queues. 
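 * This is a simple linear scan; the chosen thread is then handed to
 * sleepq_resume_thread(), with the chain lock retained (SRQ_HOLD) unless
 * SLEEPQ_DROP was passed.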
*/ besttd = td = TAILQ_FIRST(head); while ((td = TAILQ_NEXT(td, td_slpq)) != NULL) { if (td->td_priority < besttd->td_priority) besttd = td; } } MPASS(besttd != NULL); wakeup_swapper = sleepq_resume_thread(sq, besttd, pri, (flags & SLEEPQ_DROP) ? 0 : SRQ_HOLD); return (wakeup_swapper); } static bool match_any(struct thread *td __unused) { return (true); } /* * Resume all threads sleeping on a specified wait channel. */ int sleepq_broadcast(const void *wchan, int flags, int pri, int queue) { struct sleepqueue *sq; CTR2(KTR_PROC, "sleepq_broadcast(%p, %d)", wchan, flags); KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__)); MPASS((queue >= 0) && (queue < NR_SLEEPQS)); sq = sleepq_lookup(wchan); if (sq == NULL) return (0); KASSERT(sq->sq_type == (flags & SLEEPQ_TYPE), ("%s: mismatch between sleep/wakeup and cv_*", __func__)); return (sleepq_remove_matching(sq, queue, match_any, pri)); } /* * Resume threads on the sleep queue that match the given predicate. */ int sleepq_remove_matching(struct sleepqueue *sq, int queue, bool (*matches)(struct thread *), int pri) { struct thread *td, *tdn; int wakeup_swapper; /* * The last thread will be given ownership of sq and may * re-enqueue itself before sleepq_resume_thread() returns, * so we must cache the "next" queue item at the beginning * of the final iteration. */ wakeup_swapper = 0; TAILQ_FOREACH_SAFE(td, &sq->sq_blocked[queue], td_slpq, tdn) { if (matches(td)) wakeup_swapper |= sleepq_resume_thread(sq, td, pri, SRQ_HOLD); } return (wakeup_swapper); } /* * Time sleeping threads out. When the timeout expires, the thread is * removed from the sleep queue and made runnable if it is still asleep. */ static void sleepq_timeout(void *arg) { struct sleepqueue_chain *sc __unused; struct sleepqueue *sq; struct thread *td; const void *wchan; int wakeup_swapper; td = arg; CTR3(KTR_PROC, "sleepq_timeout: thread %p (pid %ld, %s)", (void *)td, (long)td->td_proc->p_pid, (void *)td->td_name); thread_lock(td); if (td->td_sleeptimo == 0 || td->td_sleeptimo > td->td_slpcallout.c_time) { /* * The thread does not want a timeout (yet). */ } else if (TD_IS_SLEEPING(td) && TD_ON_SLEEPQ(td)) { /* * See if the thread is asleep and get the wait * channel if it is. */ wchan = td->td_wchan; sc = SC_LOOKUP(wchan); THREAD_LOCKPTR_ASSERT(td, &sc->sc_lock); sq = sleepq_lookup(wchan); MPASS(sq != NULL); td->td_flags |= TDF_TIMEOUT; wakeup_swapper = sleepq_resume_thread(sq, td, 0, 0); if (wakeup_swapper) kick_proc0(); return; } else if (TD_ON_SLEEPQ(td)) { /* * If the thread is on the SLEEPQ but isn't sleeping * yet, it can either be on another CPU in between * sleepq_add() and one of the sleepq_*wait*() * routines or it can be in sleepq_catch_signals(). */ td->td_flags |= TDF_TIMEOUT; } thread_unlock(td); } /* * Resumes a specific thread from the sleep queue associated with a specific * wait channel if it is on that queue. */ void sleepq_remove(struct thread *td, const void *wchan) { struct sleepqueue_chain *sc; struct sleepqueue *sq; int wakeup_swapper; /* * Look up the sleep queue for this wait channel, then re-check * that the thread is asleep on that channel, if it is not, then * bail. */ MPASS(wchan != NULL); sc = SC_LOOKUP(wchan); mtx_lock_spin(&sc->sc_lock); /* * We can not lock the thread here as it may be sleeping on a * different sleepq. However, holding the sleepq lock for this * wchan can guarantee that we do not miss a wakeup for this * channel. The asserts below will catch any false positives. 
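 * (sleepq_resume_thread() takes the thread lock itself once the thread is
 * confirmed to be asleep on this wchan.)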
*/ if (!TD_ON_SLEEPQ(td) || td->td_wchan != wchan) { mtx_unlock_spin(&sc->sc_lock); return; } /* Thread is asleep on sleep queue sq, so wake it up. */ sq = sleepq_lookup(wchan); MPASS(sq != NULL); MPASS(td->td_wchan == wchan); wakeup_swapper = sleepq_resume_thread(sq, td, 0, 0); if (wakeup_swapper) kick_proc0(); } /* * Abort a thread as if an interrupt had occurred. Only abort * interruptible waits (unfortunately it isn't safe to abort others). * * Requires thread lock on entry, releases on return. */ int sleepq_abort(struct thread *td, int intrval) { struct sleepqueue *sq; const void *wchan; THREAD_LOCK_ASSERT(td, MA_OWNED); MPASS(TD_ON_SLEEPQ(td)); MPASS(td->td_flags & TDF_SINTR); MPASS((intrval == 0 && (td->td_flags & TDF_SIGWAIT) != 0) || intrval == EINTR || intrval == ERESTART); /* * If the TDF_TIMEOUT flag is set, just leave. A * timeout is scheduled anyhow. */ if (td->td_flags & TDF_TIMEOUT) { thread_unlock(td); return (0); } CTR3(KTR_PROC, "sleepq_abort: thread %p (pid %ld, %s)", (void *)td, (long)td->td_proc->p_pid, (void *)td->td_name); td->td_intrval = intrval; /* * If the thread has not slept yet it will find the signal in * sleepq_catch_signals() and call sleepq_resume_thread. Otherwise * we have to do it here. */ if (!TD_IS_SLEEPING(td)) { thread_unlock(td); return (0); } wchan = td->td_wchan; MPASS(wchan != NULL); sq = sleepq_lookup(wchan); MPASS(sq != NULL); /* Thread is asleep on sleep queue sq, so wake it up. */ return (sleepq_resume_thread(sq, td, 0, 0)); } void sleepq_chains_remove_matching(bool (*matches)(struct thread *)) { struct sleepqueue_chain *sc; struct sleepqueue *sq, *sq1; int i, wakeup_swapper; wakeup_swapper = 0; for (sc = &sleepq_chains[0]; sc < sleepq_chains + SC_TABLESIZE; ++sc) { if (LIST_EMPTY(&sc->sc_queues)) { continue; } mtx_lock_spin(&sc->sc_lock); LIST_FOREACH_SAFE(sq, &sc->sc_queues, sq_hash, sq1) { for (i = 0; i < NR_SLEEPQS; ++i) { wakeup_swapper |= sleepq_remove_matching(sq, i, matches, 0); } } mtx_unlock_spin(&sc->sc_lock); } if (wakeup_swapper) { kick_proc0(); } } /* * Prints the stacks of all threads presently sleeping on wchan/queue to * the sbuf sb. Sets count_stacks_printed to the number of stacks actually * printed. Typically, this will equal the number of threads sleeping on the * queue, but may be less if sb overflowed before all stacks were printed. */ #ifdef STACK int sleepq_sbuf_print_stacks(struct sbuf *sb, const void *wchan, int queue, int *count_stacks_printed) { struct thread *td, *td_next; struct sleepqueue *sq; struct stack **st; struct sbuf **td_infos; int i, stack_idx, error, stacks_to_allocate; bool finished; error = 0; finished = false; KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__)); MPASS((queue >= 0) && (queue < NR_SLEEPQS)); stacks_to_allocate = 10; for (i = 0; i < 3 && !finished ; i++) { /* We cannot malloc while holding the queue's spinlock, so * we do our mallocs now, and hope it is enough. If it * isn't, we will free these, drop the lock, malloc more, * and try again, up to a point. After that point we will * give up and report ENOMEM. We also cannot write to sb * during this time since the client may have set the * SBUF_AUTOEXTEND flag on their sbuf, which could cause a * malloc as we print to it. So we defer actually printing * to sb until after we drop the spinlock. */ /* Where we will store the stacks. 
*/ st = malloc(sizeof(struct stack *) * stacks_to_allocate, M_TEMP, M_WAITOK); for (stack_idx = 0; stack_idx < stacks_to_allocate; stack_idx++) st[stack_idx] = stack_create(M_WAITOK); /* Where we will store the td name, tid, etc. */ td_infos = malloc(sizeof(struct sbuf *) * stacks_to_allocate, M_TEMP, M_WAITOK); for (stack_idx = 0; stack_idx < stacks_to_allocate; stack_idx++) td_infos[stack_idx] = sbuf_new(NULL, NULL, MAXCOMLEN + sizeof(struct thread *) * 2 + 40, SBUF_FIXEDLEN); sleepq_lock(wchan); sq = sleepq_lookup(wchan); if (sq == NULL) { /* This sleepq does not exist; exit and return ENOENT. */ error = ENOENT; finished = true; sleepq_release(wchan); goto loop_end; } stack_idx = 0; /* Save thread info */ TAILQ_FOREACH_SAFE(td, &sq->sq_blocked[queue], td_slpq, td_next) { if (stack_idx >= stacks_to_allocate) goto loop_end; /* Note the td_lock is equal to the sleepq_lock here. */ (void)stack_save_td(st[stack_idx], td); sbuf_printf(td_infos[stack_idx], "%d: %s %p", td->td_tid, td->td_name, td); ++stack_idx; } finished = true; sleepq_release(wchan); /* Print the stacks */ for (i = 0; i < stack_idx; i++) { sbuf_finish(td_infos[i]); sbuf_printf(sb, "--- thread %s: ---\n", sbuf_data(td_infos[i])); stack_sbuf_print(sb, st[i]); - sbuf_printf(sb, "\n"); + sbuf_putc(sb, '\n'); error = sbuf_error(sb); if (error == 0) *count_stacks_printed = stack_idx; } loop_end: if (!finished) sleepq_release(wchan); for (stack_idx = 0; stack_idx < stacks_to_allocate; stack_idx++) stack_destroy(st[stack_idx]); for (stack_idx = 0; stack_idx < stacks_to_allocate; stack_idx++) sbuf_delete(td_infos[stack_idx]); free(st, M_TEMP); free(td_infos, M_TEMP); stacks_to_allocate *= 10; } if (!finished && error == 0) error = ENOMEM; return (error); } #endif #ifdef SLEEPQUEUE_PROFILING #define SLEEPQ_PROF_LOCATIONS 1024 #define SLEEPQ_SBUFSIZE 512 struct sleepq_prof { LIST_ENTRY(sleepq_prof) sp_link; const char *sp_wmesg; long sp_count; }; LIST_HEAD(sqphead, sleepq_prof); struct sqphead sleepq_prof_free; struct sqphead sleepq_hash[SC_TABLESIZE]; static struct sleepq_prof sleepq_profent[SLEEPQ_PROF_LOCATIONS]; static struct mtx sleepq_prof_lock; MTX_SYSINIT(sleepq_prof_lock, &sleepq_prof_lock, "sleepq_prof", MTX_SPIN); static void sleepq_profile(const char *wmesg) { struct sleepq_prof *sp; mtx_lock_spin(&sleepq_prof_lock); if (prof_enabled == 0) goto unlock; LIST_FOREACH(sp, &sleepq_hash[SC_HASH(wmesg)], sp_link) if (sp->sp_wmesg == wmesg) goto done; sp = LIST_FIRST(&sleepq_prof_free); if (sp == NULL) goto unlock; sp->sp_wmesg = wmesg; LIST_REMOVE(sp, sp_link); LIST_INSERT_HEAD(&sleepq_hash[SC_HASH(wmesg)], sp, sp_link); done: sp->sp_count++; unlock: mtx_unlock_spin(&sleepq_prof_lock); return; } static void sleepq_prof_reset(void) { struct sleepq_prof *sp; int enabled; int i; mtx_lock_spin(&sleepq_prof_lock); enabled = prof_enabled; prof_enabled = 0; for (i = 0; i < SC_TABLESIZE; i++) LIST_INIT(&sleepq_hash[i]); LIST_INIT(&sleepq_prof_free); for (i = 0; i < SLEEPQ_PROF_LOCATIONS; i++) { sp = &sleepq_profent[i]; sp->sp_wmesg = NULL; sp->sp_count = 0; LIST_INSERT_HEAD(&sleepq_prof_free, sp, sp_link); } prof_enabled = enabled; mtx_unlock_spin(&sleepq_prof_lock); } static int enable_sleepq_prof(SYSCTL_HANDLER_ARGS) { int error, v; v = prof_enabled; error = sysctl_handle_int(oidp, &v, v, req); if (error) return (error); if (req->newptr == NULL) return (error); if (v == prof_enabled) return (0); if (v == 1) sleepq_prof_reset(); mtx_lock_spin(&sleepq_prof_lock); prof_enabled = !!v; mtx_unlock_spin(&sleepq_prof_lock); return (0); } 
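/*
 * Illustrative use of the profiling knobs on a kernel built with
 * SLEEPQUEUE_PROFILING (names follow the SYSCTL_PROC definitions below):
 *
 *	sysctl debug.sleepq.enable=1	# enable profiling (1 also resets)
 *	sysctl debug.sleepq.stats	# dump the wmesg/count table
 *	sysctl debug.sleepq.reset=1	# clear the accumulated counts
 */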
static int reset_sleepq_prof_stats(SYSCTL_HANDLER_ARGS) { int error, v; v = 0; error = sysctl_handle_int(oidp, &v, 0, req); if (error) return (error); if (req->newptr == NULL) return (error); if (v == 0) return (0); sleepq_prof_reset(); return (0); } static int dump_sleepq_prof_stats(SYSCTL_HANDLER_ARGS) { struct sleepq_prof *sp; struct sbuf *sb; int enabled; int error; int i; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sb = sbuf_new_for_sysctl(NULL, NULL, SLEEPQ_SBUFSIZE, req); - sbuf_printf(sb, "\nwmesg\tcount\n"); + sbuf_cat(sb, "\nwmesg\tcount\n"); enabled = prof_enabled; mtx_lock_spin(&sleepq_prof_lock); prof_enabled = 0; mtx_unlock_spin(&sleepq_prof_lock); for (i = 0; i < SC_TABLESIZE; i++) { LIST_FOREACH(sp, &sleepq_hash[i], sp_link) { sbuf_printf(sb, "%s\t%ld\n", sp->sp_wmesg, sp->sp_count); } } mtx_lock_spin(&sleepq_prof_lock); prof_enabled = enabled; mtx_unlock_spin(&sleepq_prof_lock); error = sbuf_finish(sb); sbuf_delete(sb); return (error); } SYSCTL_PROC(_debug_sleepq, OID_AUTO, stats, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NEEDGIANT, NULL, 0, dump_sleepq_prof_stats, "A", "Sleepqueue profiling statistics"); SYSCTL_PROC(_debug_sleepq, OID_AUTO, reset, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0, reset_sleepq_prof_stats, "I", "Reset sleepqueue profiling statistics"); SYSCTL_PROC(_debug_sleepq, OID_AUTO, enable, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0, enable_sleepq_prof, "I", "Enable sleepqueue profiling"); #endif #ifdef DDB DB_SHOW_COMMAND(sleepq, db_show_sleepqueue) { struct sleepqueue_chain *sc; struct sleepqueue *sq; #ifdef INVARIANTS struct lock_object *lock; #endif struct thread *td; void *wchan; int i; if (!have_addr) return; /* * First, see if there is an active sleep queue for the wait channel * indicated by the address. */ wchan = (void *)addr; sc = SC_LOOKUP(wchan); LIST_FOREACH(sq, &sc->sc_queues, sq_hash) if (sq->sq_wchan == wchan) goto found; /* * Second, see if there is an active sleep queue at the address * indicated. */ for (i = 0; i < SC_TABLESIZE; i++) LIST_FOREACH(sq, &sleepq_chains[i].sc_queues, sq_hash) { if (sq == (struct sleepqueue *)addr) goto found; } db_printf("Unable to locate a sleep queue via %p\n", (void *)addr); return; found: db_printf("Wait channel: %p\n", sq->sq_wchan); db_printf("Queue type: %d\n", sq->sq_type); #ifdef INVARIANTS if (sq->sq_lock) { lock = sq->sq_lock; db_printf("Associated Interlock: %p - (%s) %s\n", lock, LOCK_CLASS(lock)->lc_name, lock->lo_name); } #endif db_printf("Blocked threads:\n"); for (i = 0; i < NR_SLEEPQS; i++) { db_printf("\nQueue[%d]:\n", i); if (TAILQ_EMPTY(&sq->sq_blocked[i])) db_printf("\tempty\n"); else TAILQ_FOREACH(td, &sq->sq_blocked[i], td_slpq) { db_printf("\t%p (tid %d, pid %d, \"%s\")\n", td, td->td_tid, td->td_proc->p_pid, td->td_name); } db_printf("(expected: %u)\n", sq->sq_blockedcnt[i]); } } /* Alias 'show sleepqueue' to 'show sleepq'. */ DB_SHOW_ALIAS(sleepqueue, db_show_sleepqueue); #endif diff --git a/sys/kern/subr_stats.c b/sys/kern/subr_stats.c index 0e7d2fad5f68..6e8ec44681e7 100644 --- a/sys/kern/subr_stats.c +++ b/sys/kern/subr_stats.c @@ -1,3954 +1,3954 @@ /*- * Copyright (c) 2014-2018 Netflix, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Author: Lawrence Stewart */ #include #include #include #include #include #include #include #include #include #include #if defined(DIAGNOSTIC) #include #endif #include /* Must come after qmath.h and arb.h */ #include #include #include #ifdef _KERNEL #include #include #include #include #include #else /* ! _KERNEL */ #include #include #include #include #include #endif /* _KERNEL */ struct voistatdata_voistate { /* Previous VOI value for diff calculation. */ struct voistatdata_numeric prev; }; #define VS_VSDVALID 0x0001 /* Stat's voistatdata updated at least once. */ struct voistat { int8_t stype; /* Type of stat e.g. VS_STYPE_SUM. */ enum vsd_dtype dtype : 8; /* Data type of this stat's data. */ uint16_t data_off; /* Blob offset for this stat's data. */ uint16_t dsz; /* Size of stat's data. */ #define VS_EBITS 8 uint16_t errs : VS_EBITS;/* Non-wrapping error count. */ uint16_t flags : 16 - VS_EBITS; }; /* The voistat error count is capped to avoid wrapping. */ #define VS_INCERRS(vs) do { \ if ((vs)->errs < (1U << VS_EBITS) - 1) \ (vs)->errs++; \ } while (0) /* * Ideas for flags: * - Global or entity specific (global would imply use of counter(9)?) * - Whether to reset stats on read or not * - Signal an overflow? * - Compressed voistat array */ #define VOI_REQSTATE 0x0001 /* VOI requires VS_STYPE_VOISTATE. */ struct voi { int16_t id; /* VOI id. */ enum vsd_dtype dtype : 8; /* Data type of the VOI itself. */ int8_t voistatmaxid; /* Largest allocated voistat index. */ uint16_t stats_off; /* Blob offset for this VOIs stats. */ uint16_t flags; }; /* * Memory for the entire blob is allocated as a slab and then offsets are * maintained to carve up the slab into sections holding different data types. * * Ideas for flags: * - Compressed voi array (trade off memory usage vs search time) * - Units of offsets (default bytes, flag for e.g. vm_page/KiB/Mib) */ struct statsblobv1 { uint8_t abi; uint8_t endian; uint16_t flags; uint16_t maxsz; uint16_t cursz; /* Fields from here down are opaque to consumers. */ uint32_t tplhash; /* Base template hash ID. */ uint16_t stats_off; /* voistat array blob offset. */ uint16_t statsdata_off; /* voistatdata array blob offset. */ sbintime_t created; /* Blob creation time. */ sbintime_t lastrst; /* Time of last reset. */ struct voi vois[]; /* Array indexed by [voi_id]. 
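 * The same slab continues with the voistat array at stats_off and the
 * voistatdata region at statsdata_off; all internal references are byte
 * offsets resolved via BLOB_OFFSET().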
*/ } __aligned(sizeof(void *)); _Static_assert(offsetof(struct statsblobv1, cursz) + SIZEOF_MEMBER(struct statsblobv1, cursz) == offsetof(struct statsblob, opaque), "statsblobv1 ABI mismatch"); struct statsblobv1_tpl { struct metablob *mb; struct statsblobv1 *sb; }; /* Context passed to iterator callbacks. */ struct sb_iter_ctx { void *usrctx; /* Caller supplied context. */ uint32_t flags; /* Flags for current iteration. */ int16_t vslot; /* struct voi slot index. */ int8_t vsslot; /* struct voistat slot index. */ }; struct sb_tostrcb_ctx { struct sbuf *buf; struct statsblob_tpl *tpl; enum sb_str_fmt fmt; uint32_t flags; }; struct sb_visitcb_ctx { stats_blob_visitcb_t cb; void *usrctx; }; /* Stats blob iterator callback. */ typedef int (*stats_v1_blob_itercb_t)(struct statsblobv1 *sb, struct voi *v, struct voistat *vs, struct sb_iter_ctx *ctx); #ifdef _KERNEL static struct rwlock tpllistlock; RW_SYSINIT(stats_tpl_list, &tpllistlock, "Stat template list lock"); #define TPL_LIST_RLOCK() rw_rlock(&tpllistlock) #define TPL_LIST_RUNLOCK() rw_runlock(&tpllistlock) #define TPL_LIST_WLOCK() rw_wlock(&tpllistlock) #define TPL_LIST_WUNLOCK() rw_wunlock(&tpllistlock) #define TPL_LIST_LOCK_ASSERT() rw_assert(&tpllistlock, RA_LOCKED) #define TPL_LIST_RLOCK_ASSERT() rw_assert(&tpllistlock, RA_RLOCKED) #define TPL_LIST_WLOCK_ASSERT() rw_assert(&tpllistlock, RA_WLOCKED) MALLOC_DEFINE(M_STATS, "stats(9) related memory", "stats(9) related memory"); #define stats_free(ptr) free((ptr), M_STATS) #else /* ! _KERNEL */ static void stats_constructor(void); static void stats_destructor(void); static pthread_rwlock_t tpllistlock; #define TPL_LIST_UNLOCK() pthread_rwlock_unlock(&tpllistlock) #define TPL_LIST_RLOCK() pthread_rwlock_rdlock(&tpllistlock) #define TPL_LIST_RUNLOCK() TPL_LIST_UNLOCK() #define TPL_LIST_WLOCK() pthread_rwlock_wrlock(&tpllistlock) #define TPL_LIST_WUNLOCK() TPL_LIST_UNLOCK() #define TPL_LIST_LOCK_ASSERT() do { } while (0) #define TPL_LIST_RLOCK_ASSERT() do { } while (0) #define TPL_LIST_WLOCK_ASSERT() do { } while (0) #ifdef NDEBUG #define KASSERT(cond, msg) do {} while (0) #define stats_abort() do {} while (0) #else /* ! NDEBUG */ #define KASSERT(cond, msg) do { \ if (!(cond)) { \ panic msg; \ } \ } while (0) #define stats_abort() abort() #endif /* NDEBUG */ #define stats_free(ptr) free(ptr) #define panic(fmt, ...) do { \ fprintf(stderr, (fmt), ##__VA_ARGS__); \ stats_abort(); \ } while (0) #endif /* _KERNEL */ #define SB_V1_MAXSZ 65535 /* Obtain a blob offset pointer. */ #define BLOB_OFFSET(sb, off) ((void *)(((uint8_t *)(sb)) + (off))) /* * Number of VOIs in the blob's vois[] array. By virtue of struct voi being a * power of 2 size, we can shift instead of divide. The shift amount must be * updated if sizeof(struct voi) ever changes, which the assert should catch. */ #define NVOIS(sb) ((int32_t)((((struct statsblobv1 *)(sb))->stats_off - \ sizeof(struct statsblobv1)) >> 3)) _Static_assert(sizeof(struct voi) == 8, "statsblobv1 voi ABI mismatch"); /* Try restrict names to alphanumeric and underscore to simplify JSON compat. 
*/ const char *vs_stype2name[VS_NUM_STYPES] = { [VS_STYPE_VOISTATE] = "VOISTATE", [VS_STYPE_SUM] = "SUM", [VS_STYPE_MAX] = "MAX", [VS_STYPE_MIN] = "MIN", [VS_STYPE_HIST] = "HIST", [VS_STYPE_TDGST] = "TDGST", }; const char *vs_stype2desc[VS_NUM_STYPES] = { [VS_STYPE_VOISTATE] = "VOI related state data (not a real stat)", [VS_STYPE_SUM] = "Simple arithmetic accumulator", [VS_STYPE_MAX] = "Maximum observed VOI value", [VS_STYPE_MIN] = "Minimum observed VOI value", [VS_STYPE_HIST] = "Histogram of observed VOI values", [VS_STYPE_TDGST] = "t-digest of observed VOI values", }; const char *vsd_dtype2name[VSD_NUM_DTYPES] = { [VSD_DTYPE_VOISTATE] = "VOISTATE", [VSD_DTYPE_INT_S32] = "INT_S32", [VSD_DTYPE_INT_U32] = "INT_U32", [VSD_DTYPE_INT_S64] = "INT_S64", [VSD_DTYPE_INT_U64] = "INT_U64", [VSD_DTYPE_INT_SLONG] = "INT_SLONG", [VSD_DTYPE_INT_ULONG] = "INT_ULONG", [VSD_DTYPE_Q_S32] = "Q_S32", [VSD_DTYPE_Q_U32] = "Q_U32", [VSD_DTYPE_Q_S64] = "Q_S64", [VSD_DTYPE_Q_U64] = "Q_U64", [VSD_DTYPE_CRHIST32] = "CRHIST32", [VSD_DTYPE_DRHIST32] = "DRHIST32", [VSD_DTYPE_DVHIST32] = "DVHIST32", [VSD_DTYPE_CRHIST64] = "CRHIST64", [VSD_DTYPE_DRHIST64] = "DRHIST64", [VSD_DTYPE_DVHIST64] = "DVHIST64", [VSD_DTYPE_TDGSTCLUST32] = "TDGSTCLUST32", [VSD_DTYPE_TDGSTCLUST64] = "TDGSTCLUST64", }; const size_t vsd_dtype2size[VSD_NUM_DTYPES] = { [VSD_DTYPE_VOISTATE] = sizeof(struct voistatdata_voistate), [VSD_DTYPE_INT_S32] = sizeof(struct voistatdata_int32), [VSD_DTYPE_INT_U32] = sizeof(struct voistatdata_int32), [VSD_DTYPE_INT_S64] = sizeof(struct voistatdata_int64), [VSD_DTYPE_INT_U64] = sizeof(struct voistatdata_int64), [VSD_DTYPE_INT_SLONG] = sizeof(struct voistatdata_intlong), [VSD_DTYPE_INT_ULONG] = sizeof(struct voistatdata_intlong), [VSD_DTYPE_Q_S32] = sizeof(struct voistatdata_q32), [VSD_DTYPE_Q_U32] = sizeof(struct voistatdata_q32), [VSD_DTYPE_Q_S64] = sizeof(struct voistatdata_q64), [VSD_DTYPE_Q_U64] = sizeof(struct voistatdata_q64), [VSD_DTYPE_CRHIST32] = sizeof(struct voistatdata_crhist32), [VSD_DTYPE_DRHIST32] = sizeof(struct voistatdata_drhist32), [VSD_DTYPE_DVHIST32] = sizeof(struct voistatdata_dvhist32), [VSD_DTYPE_CRHIST64] = sizeof(struct voistatdata_crhist64), [VSD_DTYPE_DRHIST64] = sizeof(struct voistatdata_drhist64), [VSD_DTYPE_DVHIST64] = sizeof(struct voistatdata_dvhist64), [VSD_DTYPE_TDGSTCLUST32] = sizeof(struct voistatdata_tdgstclust32), [VSD_DTYPE_TDGSTCLUST64] = sizeof(struct voistatdata_tdgstclust64), }; static const bool vsd_compoundtype[VSD_NUM_DTYPES] = { [VSD_DTYPE_VOISTATE] = true, [VSD_DTYPE_INT_S32] = false, [VSD_DTYPE_INT_U32] = false, [VSD_DTYPE_INT_S64] = false, [VSD_DTYPE_INT_U64] = false, [VSD_DTYPE_INT_SLONG] = false, [VSD_DTYPE_INT_ULONG] = false, [VSD_DTYPE_Q_S32] = false, [VSD_DTYPE_Q_U32] = false, [VSD_DTYPE_Q_S64] = false, [VSD_DTYPE_Q_U64] = false, [VSD_DTYPE_CRHIST32] = true, [VSD_DTYPE_DRHIST32] = true, [VSD_DTYPE_DVHIST32] = true, [VSD_DTYPE_CRHIST64] = true, [VSD_DTYPE_DRHIST64] = true, [VSD_DTYPE_DVHIST64] = true, [VSD_DTYPE_TDGSTCLUST32] = true, [VSD_DTYPE_TDGSTCLUST64] = true, }; const struct voistatdata_numeric numeric_limits[2][VSD_DTYPE_Q_U64 + 1] = { [LIM_MIN] = { [VSD_DTYPE_VOISTATE] = {0}, [VSD_DTYPE_INT_S32] = {.int32 = {.s32 = INT32_MIN}}, [VSD_DTYPE_INT_U32] = {.int32 = {.u32 = 0}}, [VSD_DTYPE_INT_S64] = {.int64 = {.s64 = INT64_MIN}}, [VSD_DTYPE_INT_U64] = {.int64 = {.u64 = 0}}, [VSD_DTYPE_INT_SLONG] = {.intlong = {.slong = LONG_MIN}}, [VSD_DTYPE_INT_ULONG] = {.intlong = {.ulong = 0}}, [VSD_DTYPE_Q_S32] = {.q32 = {.sq32 = Q_IFMINVAL(INT32_MIN)}}, [VSD_DTYPE_Q_U32] 
= {.q32 = {.uq32 = 0}}, [VSD_DTYPE_Q_S64] = {.q64 = {.sq64 = Q_IFMINVAL(INT64_MIN)}}, [VSD_DTYPE_Q_U64] = {.q64 = {.uq64 = 0}}, }, [LIM_MAX] = { [VSD_DTYPE_VOISTATE] = {0}, [VSD_DTYPE_INT_S32] = {.int32 = {.s32 = INT32_MAX}}, [VSD_DTYPE_INT_U32] = {.int32 = {.u32 = UINT32_MAX}}, [VSD_DTYPE_INT_S64] = {.int64 = {.s64 = INT64_MAX}}, [VSD_DTYPE_INT_U64] = {.int64 = {.u64 = UINT64_MAX}}, [VSD_DTYPE_INT_SLONG] = {.intlong = {.slong = LONG_MAX}}, [VSD_DTYPE_INT_ULONG] = {.intlong = {.ulong = ULONG_MAX}}, [VSD_DTYPE_Q_S32] = {.q32 = {.sq32 = Q_IFMAXVAL(INT32_MAX)}}, [VSD_DTYPE_Q_U32] = {.q32 = {.uq32 = Q_IFMAXVAL(UINT32_MAX)}}, [VSD_DTYPE_Q_S64] = {.q64 = {.sq64 = Q_IFMAXVAL(INT64_MAX)}}, [VSD_DTYPE_Q_U64] = {.q64 = {.uq64 = Q_IFMAXVAL(UINT64_MAX)}}, } }; /* tpllistlock protects tpllist and ntpl */ static uint32_t ntpl; static struct statsblob_tpl **tpllist; static inline void * stats_realloc(void *ptr, size_t oldsz, size_t newsz, int flags); //static void stats_v1_blob_finalise(struct statsblobv1 *sb); static int stats_v1_blob_init_locked(struct statsblobv1 *sb, uint32_t tpl_id, uint32_t flags); static int stats_v1_blob_expand(struct statsblobv1 **sbpp, int newvoibytes, int newvoistatbytes, int newvoistatdatabytes); static void stats_v1_blob_iter(struct statsblobv1 *sb, stats_v1_blob_itercb_t icb, void *usrctx, uint32_t flags); static inline int stats_v1_vsd_tdgst_add(enum vsd_dtype vs_dtype, struct voistatdata_tdgst *tdgst, s64q_t x, uint64_t weight, int attempt); static inline int ctd32cmp(const struct voistatdata_tdgstctd32 *c1, const struct voistatdata_tdgstctd32 *c2) { KASSERT(Q_PRECEQ(c1->mu, c2->mu), ("%s: Q_RELPREC(c1->mu,c2->mu)=%d", __func__, Q_RELPREC(c1->mu, c2->mu))); return (Q_QLTQ(c1->mu, c2->mu) ? -1 : 1); } ARB_GENERATE_STATIC(ctdth32, voistatdata_tdgstctd32, ctdlnk, ctd32cmp); static inline int ctd64cmp(const struct voistatdata_tdgstctd64 *c1, const struct voistatdata_tdgstctd64 *c2) { KASSERT(Q_PRECEQ(c1->mu, c2->mu), ("%s: Q_RELPREC(c1->mu,c2->mu)=%d", __func__, Q_RELPREC(c1->mu, c2->mu))); return (Q_QLTQ(c1->mu, c2->mu) ? -1 : 1); } ARB_GENERATE_STATIC(ctdth64, voistatdata_tdgstctd64, ctdlnk, ctd64cmp); #ifdef DIAGNOSTIC RB_GENERATE_STATIC(rbctdth32, voistatdata_tdgstctd32, rblnk, ctd32cmp); RB_GENERATE_STATIC(rbctdth64, voistatdata_tdgstctd64, rblnk, ctd64cmp); #endif static inline sbintime_t stats_sbinuptime(void) { sbintime_t sbt; #ifdef _KERNEL sbt = sbinuptime(); #else /* ! _KERNEL */ struct timespec tp; clock_gettime(CLOCK_MONOTONIC_FAST, &tp); sbt = tstosbt(tp); #endif /* _KERNEL */ return (sbt); } static inline void * stats_realloc(void *ptr, size_t oldsz, size_t newsz, int flags) { #ifdef _KERNEL /* Default to M_NOWAIT if neither M_NOWAIT or M_WAITOK are set. */ if (!(flags & (M_WAITOK | M_NOWAIT))) flags |= M_NOWAIT; ptr = realloc(ptr, newsz, M_STATS, flags); #else /* ! 
_KERNEL */ ptr = realloc(ptr, newsz); if ((flags & M_ZERO) && ptr != NULL) { if (oldsz == 0) memset(ptr, '\0', newsz); else if (newsz > oldsz) memset(BLOB_OFFSET(ptr, oldsz), '\0', newsz - oldsz); } #endif /* _KERNEL */ return (ptr); } static inline char * stats_strdup(const char *s, #ifdef _KERNEL int flags) { char *copy; size_t len; if (!(flags & (M_WAITOK | M_NOWAIT))) flags |= M_NOWAIT; len = strlen(s) + 1; if ((copy = malloc(len, M_STATS, flags)) != NULL) bcopy(s, copy, len); return (copy); #else int flags __unused) { return (strdup(s)); #endif } static inline void stats_tpl_update_hash(struct statsblob_tpl *tpl) { TPL_LIST_WLOCK_ASSERT(); tpl->mb->tplhash = hash32_str(tpl->mb->tplname, 0); for (int voi_id = 0; voi_id < NVOIS(tpl->sb); voi_id++) { if (tpl->mb->voi_meta[voi_id].name != NULL) tpl->mb->tplhash = hash32_str( tpl->mb->voi_meta[voi_id].name, tpl->mb->tplhash); } tpl->mb->tplhash = hash32_buf(tpl->sb, tpl->sb->cursz, tpl->mb->tplhash); } static inline uint64_t stats_pow_u64(uint64_t base, uint64_t exp) { uint64_t result = 1; while (exp) { if (exp & 1) result *= base; exp >>= 1; base *= base; } return (result); } static inline int stats_vss_hist_bkt_hlpr(struct vss_hist_hlpr_info *info, uint32_t curbkt, struct voistatdata_numeric *bkt_lb, struct voistatdata_numeric *bkt_ub) { uint64_t step = 0; int error = 0; switch (info->scheme) { case BKT_LIN: step = info->lin.stepinc; break; case BKT_EXP: step = stats_pow_u64(info->exp.stepbase, info->exp.stepexp + curbkt); break; case BKT_LINEXP: { uint64_t curstepexp = 1; switch (info->voi_dtype) { case VSD_DTYPE_INT_S32: while ((int32_t)stats_pow_u64(info->linexp.stepbase, curstepexp) <= bkt_lb->int32.s32) curstepexp++; break; case VSD_DTYPE_INT_U32: while ((uint32_t)stats_pow_u64(info->linexp.stepbase, curstepexp) <= bkt_lb->int32.u32) curstepexp++; break; case VSD_DTYPE_INT_S64: while ((int64_t)stats_pow_u64(info->linexp.stepbase, curstepexp) <= bkt_lb->int64.s64) curstepexp++; break; case VSD_DTYPE_INT_U64: while ((uint64_t)stats_pow_u64(info->linexp.stepbase, curstepexp) <= bkt_lb->int64.u64) curstepexp++; break; case VSD_DTYPE_INT_SLONG: while ((long)stats_pow_u64(info->linexp.stepbase, curstepexp) <= bkt_lb->intlong.slong) curstepexp++; break; case VSD_DTYPE_INT_ULONG: while ((unsigned long)stats_pow_u64(info->linexp.stepbase, curstepexp) <= bkt_lb->intlong.ulong) curstepexp++; break; case VSD_DTYPE_Q_S32: while ((s32q_t)stats_pow_u64(info->linexp.stepbase, curstepexp) <= Q_GIVAL(bkt_lb->q32.sq32)) break; case VSD_DTYPE_Q_U32: while ((u32q_t)stats_pow_u64(info->linexp.stepbase, curstepexp) <= Q_GIVAL(bkt_lb->q32.uq32)) break; case VSD_DTYPE_Q_S64: while ((s64q_t)stats_pow_u64(info->linexp.stepbase, curstepexp) <= Q_GIVAL(bkt_lb->q64.sq64)) curstepexp++; break; case VSD_DTYPE_Q_U64: while ((u64q_t)stats_pow_u64(info->linexp.stepbase, curstepexp) <= Q_GIVAL(bkt_lb->q64.uq64)) curstepexp++; break; default: break; } step = stats_pow_u64(info->linexp.stepbase, curstepexp) / info->linexp.linstepdiv; if (step == 0) step = 1; break; } default: break; } if (info->scheme == BKT_USR) { *bkt_lb = info->usr.bkts[curbkt].lb; *bkt_ub = info->usr.bkts[curbkt].ub; } else if (step != 0) { switch (info->voi_dtype) { case VSD_DTYPE_INT_S32: bkt_ub->int32.s32 += (int32_t)step; break; case VSD_DTYPE_INT_U32: bkt_ub->int32.u32 += (uint32_t)step; break; case VSD_DTYPE_INT_S64: bkt_ub->int64.s64 += (int64_t)step; break; case VSD_DTYPE_INT_U64: bkt_ub->int64.u64 += (uint64_t)step; break; case VSD_DTYPE_INT_SLONG: bkt_ub->intlong.slong += (long)step; 
break; case VSD_DTYPE_INT_ULONG: bkt_ub->intlong.ulong += (unsigned long)step; break; case VSD_DTYPE_Q_S32: error = Q_QADDI(&bkt_ub->q32.sq32, step); break; case VSD_DTYPE_Q_U32: error = Q_QADDI(&bkt_ub->q32.uq32, step); break; case VSD_DTYPE_Q_S64: error = Q_QADDI(&bkt_ub->q64.sq64, step); break; case VSD_DTYPE_Q_U64: error = Q_QADDI(&bkt_ub->q64.uq64, step); break; default: break; } } else { /* info->scheme != BKT_USR && step == 0 */ return (EINVAL); } return (error); } static uint32_t stats_vss_hist_nbkts_hlpr(struct vss_hist_hlpr_info *info) { struct voistatdata_numeric bkt_lb, bkt_ub; uint32_t nbkts; int done; if (info->scheme == BKT_USR) { /* XXXLAS: Setting info->{lb,ub} from macro is tricky. */ info->lb = info->usr.bkts[0].lb; info->ub = info->usr.bkts[info->usr.nbkts - 1].lb; } nbkts = 0; done = 0; bkt_ub = info->lb; do { bkt_lb = bkt_ub; if (stats_vss_hist_bkt_hlpr(info, nbkts++, &bkt_lb, &bkt_ub)) return (0); if (info->scheme == BKT_USR) done = (nbkts == info->usr.nbkts); else { switch (info->voi_dtype) { case VSD_DTYPE_INT_S32: done = (bkt_ub.int32.s32 > info->ub.int32.s32); break; case VSD_DTYPE_INT_U32: done = (bkt_ub.int32.u32 > info->ub.int32.u32); break; case VSD_DTYPE_INT_S64: done = (bkt_ub.int64.s64 > info->ub.int64.s64); break; case VSD_DTYPE_INT_U64: done = (bkt_ub.int64.u64 > info->ub.int64.u64); break; case VSD_DTYPE_INT_SLONG: done = (bkt_ub.intlong.slong > info->ub.intlong.slong); break; case VSD_DTYPE_INT_ULONG: done = (bkt_ub.intlong.ulong > info->ub.intlong.ulong); break; case VSD_DTYPE_Q_S32: done = Q_QGTQ(bkt_ub.q32.sq32, info->ub.q32.sq32); break; case VSD_DTYPE_Q_U32: done = Q_QGTQ(bkt_ub.q32.uq32, info->ub.q32.uq32); break; case VSD_DTYPE_Q_S64: done = Q_QGTQ(bkt_ub.q64.sq64, info->ub.q64.sq64); break; case VSD_DTYPE_Q_U64: done = Q_QGTQ(bkt_ub.q64.uq64, info->ub.q64.uq64); break; default: return (0); } } } while (!done); if (info->flags & VSD_HIST_LBOUND_INF) nbkts++; if (info->flags & VSD_HIST_UBOUND_INF) nbkts++; return (nbkts); } int stats_vss_hist_hlpr(enum vsd_dtype voi_dtype, struct voistatspec *vss, struct vss_hist_hlpr_info *info) { struct voistatdata_hist *hist; struct voistatdata_numeric bkt_lb, bkt_ub, *lbinfbktlb, *lbinfbktub, *ubinfbktlb, *ubinfbktub; uint32_t bkt, nbkts, nloop; if (vss == NULL || info == NULL || (info->flags & (VSD_HIST_LBOUND_INF|VSD_HIST_UBOUND_INF) && (info->hist_dtype == VSD_DTYPE_DVHIST32 || info->hist_dtype == VSD_DTYPE_DVHIST64))) return (EINVAL); info->voi_dtype = voi_dtype; if ((nbkts = stats_vss_hist_nbkts_hlpr(info)) == 0) return (EINVAL); switch (info->hist_dtype) { case VSD_DTYPE_CRHIST32: vss->vsdsz = HIST_NBKTS2VSDSZ(crhist32, nbkts); break; case VSD_DTYPE_DRHIST32: vss->vsdsz = HIST_NBKTS2VSDSZ(drhist32, nbkts); break; case VSD_DTYPE_DVHIST32: vss->vsdsz = HIST_NBKTS2VSDSZ(dvhist32, nbkts); break; case VSD_DTYPE_CRHIST64: vss->vsdsz = HIST_NBKTS2VSDSZ(crhist64, nbkts); break; case VSD_DTYPE_DRHIST64: vss->vsdsz = HIST_NBKTS2VSDSZ(drhist64, nbkts); break; case VSD_DTYPE_DVHIST64: vss->vsdsz = HIST_NBKTS2VSDSZ(dvhist64, nbkts); break; default: return (EINVAL); } vss->iv = stats_realloc(NULL, 0, vss->vsdsz, M_ZERO); if (vss->iv == NULL) return (ENOMEM); hist = (struct voistatdata_hist *)vss->iv; bkt_ub = info->lb; for (bkt = (info->flags & VSD_HIST_LBOUND_INF), nloop = 0; bkt < nbkts; bkt++, nloop++) { bkt_lb = bkt_ub; if (stats_vss_hist_bkt_hlpr(info, nloop, &bkt_lb, &bkt_ub)) return (EINVAL); switch (info->hist_dtype) { case VSD_DTYPE_CRHIST32: VSD(crhist32, hist)->bkts[bkt].lb = bkt_lb; break; case 
VSD_DTYPE_DRHIST32: VSD(drhist32, hist)->bkts[bkt].lb = bkt_lb; VSD(drhist32, hist)->bkts[bkt].ub = bkt_ub; break; case VSD_DTYPE_DVHIST32: VSD(dvhist32, hist)->bkts[bkt].val = bkt_lb; break; case VSD_DTYPE_CRHIST64: VSD(crhist64, hist)->bkts[bkt].lb = bkt_lb; break; case VSD_DTYPE_DRHIST64: VSD(drhist64, hist)->bkts[bkt].lb = bkt_lb; VSD(drhist64, hist)->bkts[bkt].ub = bkt_ub; break; case VSD_DTYPE_DVHIST64: VSD(dvhist64, hist)->bkts[bkt].val = bkt_lb; break; default: return (EINVAL); } } lbinfbktlb = lbinfbktub = ubinfbktlb = ubinfbktub = NULL; switch (info->hist_dtype) { case VSD_DTYPE_CRHIST32: lbinfbktlb = &VSD(crhist32, hist)->bkts[0].lb; ubinfbktlb = &VSD(crhist32, hist)->bkts[nbkts - 1].lb; break; case VSD_DTYPE_DRHIST32: lbinfbktlb = &VSD(drhist32, hist)->bkts[0].lb; lbinfbktub = &VSD(drhist32, hist)->bkts[0].ub; ubinfbktlb = &VSD(drhist32, hist)->bkts[nbkts - 1].lb; ubinfbktub = &VSD(drhist32, hist)->bkts[nbkts - 1].ub; break; case VSD_DTYPE_CRHIST64: lbinfbktlb = &VSD(crhist64, hist)->bkts[0].lb; ubinfbktlb = &VSD(crhist64, hist)->bkts[nbkts - 1].lb; break; case VSD_DTYPE_DRHIST64: lbinfbktlb = &VSD(drhist64, hist)->bkts[0].lb; lbinfbktub = &VSD(drhist64, hist)->bkts[0].ub; ubinfbktlb = &VSD(drhist64, hist)->bkts[nbkts - 1].lb; ubinfbktub = &VSD(drhist64, hist)->bkts[nbkts - 1].ub; break; case VSD_DTYPE_DVHIST32: case VSD_DTYPE_DVHIST64: break; default: return (EINVAL); } if ((info->flags & VSD_HIST_LBOUND_INF) && lbinfbktlb) { *lbinfbktlb = numeric_limits[LIM_MIN][info->voi_dtype]; /* * Assignment from numeric_limit array for Q types assigns max * possible integral/fractional value for underlying data type, * but we must set control bits for this specific histogram per * the user's choice of fractional bits, which we extract from * info->lb. */ if (info->voi_dtype == VSD_DTYPE_Q_S32 || info->voi_dtype == VSD_DTYPE_Q_U32) { /* Signedness doesn't matter for setting control bits. */ Q_SCVAL(lbinfbktlb->q32.sq32, Q_GCVAL(info->lb.q32.sq32)); } else if (info->voi_dtype == VSD_DTYPE_Q_S64 || info->voi_dtype == VSD_DTYPE_Q_U64) { /* Signedness doesn't matter for setting control bits. 
*/ Q_SCVAL(lbinfbktlb->q64.sq64, Q_GCVAL(info->lb.q64.sq64)); } if (lbinfbktub) *lbinfbktub = info->lb; } if ((info->flags & VSD_HIST_UBOUND_INF) && ubinfbktlb) { *ubinfbktlb = bkt_lb; if (ubinfbktub) { *ubinfbktub = numeric_limits[LIM_MAX][info->voi_dtype]; if (info->voi_dtype == VSD_DTYPE_Q_S32 || info->voi_dtype == VSD_DTYPE_Q_U32) { Q_SCVAL(ubinfbktub->q32.sq32, Q_GCVAL(info->lb.q32.sq32)); } else if (info->voi_dtype == VSD_DTYPE_Q_S64 || info->voi_dtype == VSD_DTYPE_Q_U64) { Q_SCVAL(ubinfbktub->q64.sq64, Q_GCVAL(info->lb.q64.sq64)); } } } return (0); } int stats_vss_tdgst_hlpr(enum vsd_dtype voi_dtype, struct voistatspec *vss, struct vss_tdgst_hlpr_info *info) { struct voistatdata_tdgst *tdgst; struct ctdth32 *ctd32tree; struct ctdth64 *ctd64tree; struct voistatdata_tdgstctd32 *ctd32; struct voistatdata_tdgstctd64 *ctd64; info->voi_dtype = voi_dtype; switch (info->tdgst_dtype) { case VSD_DTYPE_TDGSTCLUST32: vss->vsdsz = TDGST_NCTRS2VSDSZ(tdgstclust32, info->nctds); break; case VSD_DTYPE_TDGSTCLUST64: vss->vsdsz = TDGST_NCTRS2VSDSZ(tdgstclust64, info->nctds); break; default: return (EINVAL); } vss->iv = stats_realloc(NULL, 0, vss->vsdsz, M_ZERO); if (vss->iv == NULL) return (ENOMEM); tdgst = (struct voistatdata_tdgst *)vss->iv; switch (info->tdgst_dtype) { case VSD_DTYPE_TDGSTCLUST32: ctd32tree = &VSD(tdgstclust32, tdgst)->ctdtree; ARB_INIT(ctd32, ctdlnk, ctd32tree, info->nctds) { Q_INI(&ctd32->mu, 0, 0, info->prec); } break; case VSD_DTYPE_TDGSTCLUST64: ctd64tree = &VSD(tdgstclust64, tdgst)->ctdtree; ARB_INIT(ctd64, ctdlnk, ctd64tree, info->nctds) { Q_INI(&ctd64->mu, 0, 0, info->prec); } break; default: return (EINVAL); } return (0); } int stats_vss_numeric_hlpr(enum vsd_dtype voi_dtype, struct voistatspec *vss, struct vss_numeric_hlpr_info *info) { struct voistatdata_numeric iv; switch (vss->stype) { case VS_STYPE_SUM: iv = stats_ctor_vsd_numeric(0); break; case VS_STYPE_MIN: iv = numeric_limits[LIM_MAX][voi_dtype]; break; case VS_STYPE_MAX: iv = numeric_limits[LIM_MIN][voi_dtype]; break; default: return (EINVAL); } vss->iv = stats_realloc(NULL, 0, vsd_dtype2size[voi_dtype], 0); if (vss->iv == NULL) return (ENOMEM); vss->vs_dtype = voi_dtype; vss->vsdsz = vsd_dtype2size[voi_dtype]; switch (voi_dtype) { case VSD_DTYPE_INT_S32: *((int32_t *)vss->iv) = iv.int32.s32; break; case VSD_DTYPE_INT_U32: *((uint32_t *)vss->iv) = iv.int32.u32; break; case VSD_DTYPE_INT_S64: *((int64_t *)vss->iv) = iv.int64.s64; break; case VSD_DTYPE_INT_U64: *((uint64_t *)vss->iv) = iv.int64.u64; break; case VSD_DTYPE_INT_SLONG: *((long *)vss->iv) = iv.intlong.slong; break; case VSD_DTYPE_INT_ULONG: *((unsigned long *)vss->iv) = iv.intlong.ulong; break; case VSD_DTYPE_Q_S32: *((s32q_t *)vss->iv) = Q_SCVAL(iv.q32.sq32, Q_CTRLINI(info->prec)); break; case VSD_DTYPE_Q_U32: *((u32q_t *)vss->iv) = Q_SCVAL(iv.q32.uq32, Q_CTRLINI(info->prec)); break; case VSD_DTYPE_Q_S64: *((s64q_t *)vss->iv) = Q_SCVAL(iv.q64.sq64, Q_CTRLINI(info->prec)); break; case VSD_DTYPE_Q_U64: *((u64q_t *)vss->iv) = Q_SCVAL(iv.q64.uq64, Q_CTRLINI(info->prec)); break; default: break; } return (0); } int stats_vss_hlpr_init(enum vsd_dtype voi_dtype, uint32_t nvss, struct voistatspec *vss) { int i, ret; for (i = nvss - 1; i >= 0; i--) { if (vss[i].hlpr && (ret = vss[i].hlpr(voi_dtype, &vss[i], vss[i].hlprinfo)) != 0) return (ret); } return (0); } void stats_vss_hlpr_cleanup(uint32_t nvss, struct voistatspec *vss) { int i; for (i = nvss - 1; i >= 0; i--) { if (vss[i].hlpr) { stats_free((void *)vss[i].iv); vss[i].iv = NULL; } } } int 
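/*
 * Illustrative use of the vss helper KPIs above together with the template
 * KPIs defined later in this file. This is a sketch only: "mytpl", "myvoi"
 * and the contents of vss[] are hypothetical, and error handling plus the
 * per-voistat helper wiring (.hlpr/.hlprinfo or a pre-filled .iv) are
 * omitted:
 *
 *	struct voistatspec vss[2];	(e.g. VS_STYPE_SUM and VS_STYPE_MAX)
 *	int tpl_id;
 *
 *	tpl_id = stats_v1_tpl_alloc("mytpl", 0);
 *	stats_vss_hlpr_init(VSD_DTYPE_INT_U64, nitems(vss), vss);
 *	stats_v1_tpl_add_voistats(tpl_id, 0, "myvoi", VSD_DTYPE_INT_U64,
 *	    nitems(vss), vss, 0);
 *	stats_vss_hlpr_cleanup(nitems(vss), vss);
 */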
stats_tpl_fetch(int tpl_id, struct statsblob_tpl **tpl) { int error; error = 0; TPL_LIST_WLOCK(); if (tpl_id < 0 || tpl_id >= (int)ntpl) { error = ENOENT; } else { *tpl = tpllist[tpl_id]; /* XXXLAS: Acquire refcount on tpl. */ } TPL_LIST_WUNLOCK(); return (error); } int stats_tpl_fetch_allocid(const char *name, uint32_t hash) { int i, tpl_id; tpl_id = -ESRCH; TPL_LIST_RLOCK(); for (i = ntpl - 1; i >= 0; i--) { if (name != NULL) { if (strlen(name) == strlen(tpllist[i]->mb->tplname) && strncmp(name, tpllist[i]->mb->tplname, TPL_MAX_NAME_LEN) == 0 && (!hash || hash == tpllist[i]->mb->tplhash)) { tpl_id = i; break; } } else if (hash == tpllist[i]->mb->tplhash) { tpl_id = i; break; } } TPL_LIST_RUNLOCK(); return (tpl_id); } int stats_tpl_id2name(uint32_t tpl_id, char *buf, size_t len) { int error; error = 0; TPL_LIST_RLOCK(); if (tpl_id < ntpl) { if (buf != NULL && len > strlen(tpllist[tpl_id]->mb->tplname)) strlcpy(buf, tpllist[tpl_id]->mb->tplname, len); else error = EOVERFLOW; } else error = ENOENT; TPL_LIST_RUNLOCK(); return (error); } int stats_tpl_sample_rollthedice(struct stats_tpl_sample_rate *rates, int nrates, void *seed_bytes, size_t seed_len) { uint32_t cum_pct, rnd_pct; int i; cum_pct = 0; /* * Choose a pseudorandom or seeded number in range [0,100] and use * it to make a sampling decision and template selection where required. * If no seed is supplied, a PRNG is used to generate a pseudorandom * number so that every selection is independent. If a seed is supplied, * the caller desires random selection across different seeds, but * deterministic selection given the same seed. This is achieved by * hashing the seed and using the hash as the random number source. * * XXXLAS: Characterise hash function output distribution. */ if (seed_bytes == NULL) rnd_pct = random() / (INT32_MAX / 100); else rnd_pct = hash32_buf(seed_bytes, seed_len, 0) / (UINT32_MAX / 100U); /* * We map the randomly selected percentage on to the interval [0,100] * consisting of the cumulatively summed template sampling percentages. * The difference between the cumulative sum of all template sampling * percentages and 100 is treated as a NULL assignment i.e. no stats * template will be assigned, and -1 returned instead. */ for (i = 0; i < nrates; i++) { cum_pct += rates[i].tpl_sample_pct; KASSERT(cum_pct <= 100, ("%s cum_pct %u > 100", __func__, cum_pct)); if (rnd_pct > cum_pct || rates[i].tpl_sample_pct == 0) continue; return (rates[i].tpl_slot_id); } return (-1); } int stats_v1_blob_clone(struct statsblobv1 **dst, size_t dstmaxsz, struct statsblobv1 *src, uint32_t flags) { int error; error = 0; if (src == NULL || dst == NULL || src->cursz < sizeof(struct statsblob) || ((flags & SB_CLONE_ALLOCDST) && (flags & (SB_CLONE_USRDSTNOFAULT | SB_CLONE_USRDST)))) { error = EINVAL; } else if (flags & SB_CLONE_ALLOCDST) { *dst = stats_realloc(NULL, 0, src->cursz, 0); if (*dst) (*dst)->maxsz = dstmaxsz = src->cursz; else error = ENOMEM; } else if (*dst == NULL || dstmaxsz < sizeof(struct statsblob)) { error = EINVAL; } if (!error) { size_t postcurszlen; /* * Clone src into dst except for the maxsz field. If dst is too * small to hold all of src, only copy src's header and return * EOVERFLOW. 
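	 * With SB_CLONE_USRDST or SB_CLONE_USRDSTNOFAULT set, the copy is
	 * performed with copyout() or copyout_nofault() respectively into
	 * the caller-supplied user space destination.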
*/ #ifdef _KERNEL if (flags & SB_CLONE_USRDSTNOFAULT) copyout_nofault(src, *dst, offsetof(struct statsblob, maxsz)); else if (flags & SB_CLONE_USRDST) copyout(src, *dst, offsetof(struct statsblob, maxsz)); else #endif memcpy(*dst, src, offsetof(struct statsblob, maxsz)); if (dstmaxsz >= src->cursz) { postcurszlen = src->cursz - offsetof(struct statsblob, cursz); } else { error = EOVERFLOW; postcurszlen = sizeof(struct statsblob) - offsetof(struct statsblob, cursz); } #ifdef _KERNEL if (flags & SB_CLONE_USRDSTNOFAULT) copyout_nofault(&(src->cursz), &((*dst)->cursz), postcurszlen); else if (flags & SB_CLONE_USRDST) copyout(&(src->cursz), &((*dst)->cursz), postcurszlen); else #endif memcpy(&((*dst)->cursz), &(src->cursz), postcurszlen); } return (error); } int stats_v1_tpl_alloc(const char *name, uint32_t flags __unused) { struct statsblobv1_tpl *tpl, **newtpllist; struct statsblobv1 *tpl_sb; struct metablob *tpl_mb; int tpl_id; if (name != NULL && strlen(name) > TPL_MAX_NAME_LEN) return (-EINVAL); if (name != NULL && stats_tpl_fetch_allocid(name, 0) >= 0) return (-EEXIST); tpl = stats_realloc(NULL, 0, sizeof(struct statsblobv1_tpl), M_ZERO); tpl_mb = stats_realloc(NULL, 0, sizeof(struct metablob), M_ZERO); tpl_sb = stats_realloc(NULL, 0, sizeof(struct statsblobv1), M_ZERO); if (tpl_mb != NULL && name != NULL) tpl_mb->tplname = stats_strdup(name, 0); if (tpl == NULL || tpl_sb == NULL || tpl_mb == NULL || tpl_mb->tplname == NULL) { stats_free(tpl); stats_free(tpl_sb); if (tpl_mb != NULL) { stats_free(tpl_mb->tplname); stats_free(tpl_mb); } return (-ENOMEM); } tpl->mb = tpl_mb; tpl->sb = tpl_sb; tpl_sb->abi = STATS_ABI_V1; tpl_sb->endian = #if BYTE_ORDER == LITTLE_ENDIAN SB_LE; #elif BYTE_ORDER == BIG_ENDIAN SB_BE; #else SB_UE; #endif tpl_sb->cursz = tpl_sb->maxsz = sizeof(struct statsblobv1); tpl_sb->stats_off = tpl_sb->statsdata_off = sizeof(struct statsblobv1); TPL_LIST_WLOCK(); newtpllist = stats_realloc(tpllist, ntpl * sizeof(void *), (ntpl + 1) * sizeof(void *), 0); if (newtpllist != NULL) { tpl_id = ntpl++; tpllist = (struct statsblob_tpl **)newtpllist; tpllist[tpl_id] = (struct statsblob_tpl *)tpl; stats_tpl_update_hash(tpllist[tpl_id]); } else { stats_free(tpl); stats_free(tpl_sb); if (tpl_mb != NULL) { stats_free(tpl_mb->tplname); stats_free(tpl_mb); } tpl_id = -ENOMEM; } TPL_LIST_WUNLOCK(); return (tpl_id); } int stats_v1_tpl_add_voistats(uint32_t tpl_id, int32_t voi_id, const char *voi_name, enum vsd_dtype voi_dtype, uint32_t nvss, struct voistatspec *vss, uint32_t flags) { struct voi *voi; struct voistat *tmpstat; struct statsblobv1 *tpl_sb; struct metablob *tpl_mb; int error, i, newstatdataidx, newvoibytes, newvoistatbytes, newvoistatdatabytes, newvoistatmaxid; uint32_t nbytes; if (voi_id < 0 || voi_dtype == 0 || voi_dtype >= VSD_NUM_DTYPES || nvss == 0 || vss == NULL) return (EINVAL); error = nbytes = newvoibytes = newvoistatbytes = newvoistatdatabytes = 0; newvoistatmaxid = -1; /* Calculate the number of bytes required for the new voistats. */ for (i = nvss - 1; i >= 0; i--) { if (vss[i].stype == 0 || vss[i].stype >= VS_NUM_STYPES || vss[i].vs_dtype == 0 || vss[i].vs_dtype >= VSD_NUM_DTYPES || vss[i].iv == NULL || vss[i].vsdsz == 0) return (EINVAL); if ((int)vss[i].stype > newvoistatmaxid) newvoistatmaxid = vss[i].stype; newvoistatdatabytes += vss[i].vsdsz; } if (flags & SB_VOI_RELUPDATE) { /* XXXLAS: VOI state bytes may need to vary based on stat types. 
*/ newvoistatdatabytes += sizeof(struct voistatdata_voistate); } nbytes += newvoistatdatabytes; TPL_LIST_WLOCK(); if (tpl_id < ntpl) { tpl_sb = (struct statsblobv1 *)tpllist[tpl_id]->sb; tpl_mb = tpllist[tpl_id]->mb; if (voi_id >= NVOIS(tpl_sb) || tpl_sb->vois[voi_id].id == -1) { /* Adding a new VOI and associated stats. */ if (voi_id >= NVOIS(tpl_sb)) { /* We need to grow the tpl_sb->vois array. */ newvoibytes = (voi_id - (NVOIS(tpl_sb) - 1)) * sizeof(struct voi); nbytes += newvoibytes; } newvoistatbytes = (newvoistatmaxid + 1) * sizeof(struct voistat); } else { /* Adding stats to an existing VOI. */ if (newvoistatmaxid > tpl_sb->vois[voi_id].voistatmaxid) { newvoistatbytes = (newvoistatmaxid - tpl_sb->vois[voi_id].voistatmaxid) * sizeof(struct voistat); } /* XXXLAS: KPI does not yet support expanding VOIs. */ error = EOPNOTSUPP; } nbytes += newvoistatbytes; if (!error && newvoibytes > 0) { struct voi_meta *voi_meta = tpl_mb->voi_meta; voi_meta = stats_realloc(voi_meta, voi_meta == NULL ? 0 : NVOIS(tpl_sb) * sizeof(struct voi_meta), (1 + voi_id) * sizeof(struct voi_meta), M_ZERO); if (voi_meta == NULL) error = ENOMEM; else tpl_mb->voi_meta = voi_meta; } if (!error) { /* NB: Resizing can change where tpl_sb points. */ error = stats_v1_blob_expand(&tpl_sb, newvoibytes, newvoistatbytes, newvoistatdatabytes); } if (!error) { tpl_mb->voi_meta[voi_id].name = stats_strdup(voi_name, 0); if (tpl_mb->voi_meta[voi_id].name == NULL) error = ENOMEM; } if (!error) { /* Update the template list with the resized pointer. */ tpllist[tpl_id]->sb = (struct statsblob *)tpl_sb; /* Update the template. */ voi = &tpl_sb->vois[voi_id]; if (voi->id < 0) { /* VOI is new and needs to be initialised. */ voi->id = voi_id; voi->dtype = voi_dtype; voi->stats_off = tpl_sb->stats_off; if (flags & SB_VOI_RELUPDATE) voi->flags |= VOI_REQSTATE; } else { /* * XXXLAS: When this else block is written, the * "KPI does not yet support expanding VOIs" * error earlier in this function can be * removed. What is required here is to shuffle * the voistat array such that the new stats for * the voi are contiguous, which will displace * stats for other vois that reside after the * voi being updated. The other vois then need * to have their stats_off adjusted post * shuffle. */ } voi->voistatmaxid = newvoistatmaxid; newstatdataidx = 0; if (voi->flags & VOI_REQSTATE) { /* Initialise the voistate stat in slot 0. */ tmpstat = BLOB_OFFSET(tpl_sb, voi->stats_off); tmpstat->stype = VS_STYPE_VOISTATE; tmpstat->flags = 0; tmpstat->dtype = VSD_DTYPE_VOISTATE; newstatdataidx = tmpstat->dsz = sizeof(struct voistatdata_numeric); tmpstat->data_off = tpl_sb->statsdata_off; } for (i = 0; (uint32_t)i < nvss; i++) { tmpstat = BLOB_OFFSET(tpl_sb, voi->stats_off + (vss[i].stype * sizeof(struct voistat))); KASSERT(tmpstat->stype < 0, ("voistat %p " "already initialised", tmpstat)); tmpstat->stype = vss[i].stype; tmpstat->flags = vss[i].flags; tmpstat->dtype = vss[i].vs_dtype; tmpstat->dsz = vss[i].vsdsz; tmpstat->data_off = tpl_sb->statsdata_off + newstatdataidx; memcpy(BLOB_OFFSET(tpl_sb, tmpstat->data_off), vss[i].iv, vss[i].vsdsz); newstatdataidx += vss[i].vsdsz; } /* Update the template version hash. */ stats_tpl_update_hash(tpllist[tpl_id]); /* XXXLAS: Confirm tpl name/hash pair remains unique. 
*/ } } else error = EINVAL; TPL_LIST_WUNLOCK(); return (error); } struct statsblobv1 * stats_v1_blob_alloc(uint32_t tpl_id, uint32_t flags __unused) { struct statsblobv1 *sb; int error; sb = NULL; TPL_LIST_RLOCK(); if (tpl_id < ntpl) { sb = stats_realloc(NULL, 0, tpllist[tpl_id]->sb->maxsz, 0); if (sb != NULL) { sb->maxsz = tpllist[tpl_id]->sb->maxsz; error = stats_v1_blob_init_locked(sb, tpl_id, 0); } else error = ENOMEM; if (error) { stats_free(sb); sb = NULL; } } TPL_LIST_RUNLOCK(); return (sb); } void stats_v1_blob_destroy(struct statsblobv1 *sb) { stats_free(sb); } int stats_v1_voistat_fetch_dptr(struct statsblobv1 *sb, int32_t voi_id, enum voi_stype stype, enum vsd_dtype *retdtype, struct voistatdata **retvsd, size_t *retvsdsz) { struct voi *v; struct voistat *vs; if (retvsd == NULL || sb == NULL || sb->abi != STATS_ABI_V1 || voi_id >= NVOIS(sb)) return (EINVAL); v = &sb->vois[voi_id]; if ((__typeof(v->voistatmaxid))stype > v->voistatmaxid) return (EINVAL); vs = BLOB_OFFSET(sb, v->stats_off + (stype * sizeof(struct voistat))); *retvsd = BLOB_OFFSET(sb, vs->data_off); if (retdtype != NULL) *retdtype = vs->dtype; if (retvsdsz != NULL) *retvsdsz = vs->dsz; return (0); } int stats_v1_blob_init(struct statsblobv1 *sb, uint32_t tpl_id, uint32_t flags) { int error; error = 0; TPL_LIST_RLOCK(); if (sb == NULL || tpl_id >= ntpl) { error = EINVAL; } else { error = stats_v1_blob_init_locked(sb, tpl_id, flags); } TPL_LIST_RUNLOCK(); return (error); } static inline int stats_v1_blob_init_locked(struct statsblobv1 *sb, uint32_t tpl_id, uint32_t flags __unused) { int error; TPL_LIST_RLOCK_ASSERT(); error = (sb->maxsz >= tpllist[tpl_id]->sb->cursz) ? 0 : EOVERFLOW; KASSERT(!error, ("sb %d instead of %d bytes", sb->maxsz, tpllist[tpl_id]->sb->cursz)); if (!error) { memcpy(sb, tpllist[tpl_id]->sb, tpllist[tpl_id]->sb->cursz); sb->created = sb->lastrst = stats_sbinuptime(); sb->tplhash = tpllist[tpl_id]->mb->tplhash; } return (error); } static int stats_v1_blob_expand(struct statsblobv1 **sbpp, int newvoibytes, int newvoistatbytes, int newvoistatdatabytes) { struct statsblobv1 *sb; struct voi *tmpvoi; struct voistat *tmpvoistat, *voistat_array; int error, i, idxnewvois, idxnewvoistats, nbytes, nvoistats; KASSERT(newvoibytes % sizeof(struct voi) == 0, ("Bad newvoibytes %d", newvoibytes)); KASSERT(newvoistatbytes % sizeof(struct voistat) == 0, ("Bad newvoistatbytes %d", newvoistatbytes)); error = ((newvoibytes % sizeof(struct voi) == 0) && (newvoistatbytes % sizeof(struct voistat) == 0)) ? 0 : EINVAL; sb = *sbpp; nbytes = newvoibytes + newvoistatbytes + newvoistatdatabytes; /* * XXXLAS: Required until we gain support for flags which alter the * units of size/offset fields in key structs. */ if (!error && ((((int)sb->cursz) + nbytes) > SB_V1_MAXSZ)) error = EFBIG; if (!error && (sb->cursz + nbytes > sb->maxsz)) { /* Need to expand our blob. */ sb = stats_realloc(sb, sb->maxsz, sb->cursz + nbytes, M_ZERO); if (sb != NULL) { sb->maxsz = sb->cursz + nbytes; *sbpp = sb; } else error = ENOMEM; } if (!error) { /* * Shuffle memory within the expanded blob working from the end * backwards, leaving gaps for the new voistat and voistatdata * structs at the beginning of their respective blob regions, * and for the new voi structs at the end of their blob region. 
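		 * For example, adding one new voi and one new voistat moves
		 * stats_off up by sizeof(struct voi) and statsdata_off up by
		 * sizeof(struct voi) + sizeof(struct voistat), with the
		 * offsets stored in pre-existing voi/voistat entries bumped
		 * to match.
		 */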
*/ memmove(BLOB_OFFSET(sb, sb->statsdata_off + nbytes), BLOB_OFFSET(sb, sb->statsdata_off), sb->cursz - sb->statsdata_off); memmove(BLOB_OFFSET(sb, sb->stats_off + newvoibytes + newvoistatbytes), BLOB_OFFSET(sb, sb->stats_off), sb->statsdata_off - sb->stats_off); /* First index of new voi/voistat structs to be initialised. */ idxnewvois = NVOIS(sb); idxnewvoistats = (newvoistatbytes / sizeof(struct voistat)) - 1; /* Update housekeeping variables and offsets. */ sb->cursz += nbytes; sb->stats_off += newvoibytes; sb->statsdata_off += newvoibytes + newvoistatbytes; /* XXXLAS: Zeroing not strictly needed but aids debugging. */ memset(&sb->vois[idxnewvois], '\0', newvoibytes); memset(BLOB_OFFSET(sb, sb->stats_off), '\0', newvoistatbytes); memset(BLOB_OFFSET(sb, sb->statsdata_off), '\0', newvoistatdatabytes); /* Initialise new voi array members and update offsets. */ for (i = 0; i < NVOIS(sb); i++) { tmpvoi = &sb->vois[i]; if (i >= idxnewvois) { tmpvoi->id = tmpvoi->voistatmaxid = -1; } else if (tmpvoi->id > -1) { tmpvoi->stats_off += newvoibytes + newvoistatbytes; } } /* Initialise new voistat array members and update offsets. */ nvoistats = (sb->statsdata_off - sb->stats_off) / sizeof(struct voistat); voistat_array = BLOB_OFFSET(sb, sb->stats_off); for (i = 0; i < nvoistats; i++) { tmpvoistat = &voistat_array[i]; if (i <= idxnewvoistats) { tmpvoistat->stype = -1; } else if (tmpvoistat->stype > -1) { tmpvoistat->data_off += nbytes; } } } return (error); } static void stats_v1_blob_finalise(struct statsblobv1 *sb __unused) { /* XXXLAS: Fill this in. */ } static void stats_v1_blob_iter(struct statsblobv1 *sb, stats_v1_blob_itercb_t icb, void *usrctx, uint32_t flags) { struct voi *v; struct voistat *vs; struct sb_iter_ctx ctx; int i, j, firstvoi; ctx.usrctx = usrctx; ctx.flags = SB_IT_FIRST_CB; firstvoi = 1; for (i = 0; i < NVOIS(sb); i++) { v = &sb->vois[i]; ctx.vslot = i; ctx.vsslot = -1; ctx.flags |= SB_IT_FIRST_VOISTAT; if (firstvoi) ctx.flags |= SB_IT_FIRST_VOI; else if (i == (NVOIS(sb) - 1)) ctx.flags |= SB_IT_LAST_VOI | SB_IT_LAST_CB; if (v->id < 0 && (flags & SB_IT_NULLVOI)) { if (icb(sb, v, NULL, &ctx)) return; firstvoi = 0; ctx.flags &= ~SB_IT_FIRST_CB; } /* If NULL voi, v->voistatmaxid == -1 */ for (j = 0; j <= v->voistatmaxid; j++) { vs = &((struct voistat *)BLOB_OFFSET(sb, v->stats_off))[j]; if (vs->stype < 0 && !(flags & SB_IT_NULLVOISTAT)) continue; if (j == v->voistatmaxid) { ctx.flags |= SB_IT_LAST_VOISTAT; if (i == (NVOIS(sb) - 1)) ctx.flags |= SB_IT_LAST_CB; } else ctx.flags &= ~SB_IT_LAST_CB; ctx.vsslot = j; if (icb(sb, v, vs, &ctx)) return; ctx.flags &= ~(SB_IT_FIRST_CB | SB_IT_FIRST_VOISTAT | SB_IT_LAST_VOISTAT); } ctx.flags &= ~(SB_IT_FIRST_VOI | SB_IT_LAST_VOI); } } static inline void stats_voistatdata_tdgst_tostr(enum vsd_dtype voi_dtype __unused, const struct voistatdata_tdgst *tdgst, enum vsd_dtype tdgst_dtype, size_t tdgst_dsz __unused, enum sb_str_fmt fmt, struct sbuf *buf, int objdump) { const struct ctdth32 *ctd32tree; const struct ctdth64 *ctd64tree; const struct voistatdata_tdgstctd32 *ctd32; const struct voistatdata_tdgstctd64 *ctd64; const char *fmtstr; uint64_t smplcnt, compcnt; int is32bit, qmaxstrlen; uint16_t maxctds, curctds; switch (tdgst_dtype) { case VSD_DTYPE_TDGSTCLUST32: smplcnt = CONSTVSD(tdgstclust32, tdgst)->smplcnt; compcnt = CONSTVSD(tdgstclust32, tdgst)->compcnt; maxctds = ARB_MAXNODES(&CONSTVSD(tdgstclust32, tdgst)->ctdtree); curctds = ARB_CURNODES(&CONSTVSD(tdgstclust32, tdgst)->ctdtree); ctd32tree = &CONSTVSD(tdgstclust32, tdgst)->ctdtree; 
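		/*
		 * An object dump (objdump != 0) walks centroids in array
		 * slot order starting from slot 0; a normal render walks the
		 * ARB tree in ascending mu order starting from the minimum
		 * centroid.
		 */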
ctd32 = (objdump ? ARB_CNODE(ctd32tree, 0) : ARB_CMIN(ctdth32, ctd32tree)); qmaxstrlen = (ctd32 == NULL) ? 1 : Q_MAXSTRLEN(ctd32->mu, 10); is32bit = 1; ctd64tree = NULL; ctd64 = NULL; break; case VSD_DTYPE_TDGSTCLUST64: smplcnt = CONSTVSD(tdgstclust64, tdgst)->smplcnt; compcnt = CONSTVSD(tdgstclust64, tdgst)->compcnt; maxctds = ARB_MAXNODES(&CONSTVSD(tdgstclust64, tdgst)->ctdtree); curctds = ARB_CURNODES(&CONSTVSD(tdgstclust64, tdgst)->ctdtree); ctd64tree = &CONSTVSD(tdgstclust64, tdgst)->ctdtree; ctd64 = (objdump ? ARB_CNODE(ctd64tree, 0) : ARB_CMIN(ctdth64, ctd64tree)); qmaxstrlen = (ctd64 == NULL) ? 1 : Q_MAXSTRLEN(ctd64->mu, 10); is32bit = 0; ctd32tree = NULL; ctd32 = NULL; break; default: return; } switch (fmt) { case SB_STRFMT_FREEFORM: fmtstr = "smplcnt=%ju, compcnt=%ju, maxctds=%hu, nctds=%hu"; break; case SB_STRFMT_JSON: default: fmtstr = "\"smplcnt\":%ju,\"compcnt\":%ju,\"maxctds\":%hu," "\"nctds\":%hu,\"ctds\":["; break; } sbuf_printf(buf, fmtstr, (uintmax_t)smplcnt, (uintmax_t)compcnt, maxctds, curctds); while ((is32bit ? NULL != ctd32 : NULL != ctd64)) { char qstr[qmaxstrlen]; switch (fmt) { case SB_STRFMT_FREEFORM: fmtstr = "\n\t\t\t\t"; break; case SB_STRFMT_JSON: default: fmtstr = "{"; break; } sbuf_cat(buf, fmtstr); if (objdump) { switch (fmt) { case SB_STRFMT_FREEFORM: fmtstr = "ctd[%hu]."; break; case SB_STRFMT_JSON: default: fmtstr = "\"ctd\":%hu,"; break; } sbuf_printf(buf, fmtstr, is32bit ? ARB_SELFIDX(ctd32tree, ctd32) : ARB_SELFIDX(ctd64tree, ctd64)); } switch (fmt) { case SB_STRFMT_FREEFORM: fmtstr = "{mu="; break; case SB_STRFMT_JSON: default: fmtstr = "\"mu\":"; break; } sbuf_cat(buf, fmtstr); Q_TOSTR((is32bit ? ctd32->mu : ctd64->mu), -1, 10, qstr, sizeof(qstr)); sbuf_cat(buf, qstr); switch (fmt) { case SB_STRFMT_FREEFORM: fmtstr = is32bit ? ",cnt=%u}" : ",cnt=%ju}"; break; case SB_STRFMT_JSON: default: fmtstr = is32bit ? ",\"cnt\":%u}" : ",\"cnt\":%ju}"; break; } sbuf_printf(buf, fmtstr, is32bit ? ctd32->cnt : (uintmax_t)ctd64->cnt); if (is32bit) ctd32 = (objdump ? ARB_CNODE(ctd32tree, ARB_SELFIDX(ctd32tree, ctd32) + 1) : ARB_CNEXT(ctdth32, ctd32tree, ctd32)); else ctd64 = (objdump ? ARB_CNODE(ctd64tree, ARB_SELFIDX(ctd64tree, ctd64) + 1) : ARB_CNEXT(ctdth64, ctd64tree, ctd64)); if (fmt == SB_STRFMT_JSON && (is32bit ? NULL != ctd32 : NULL != ctd64)) sbuf_putc(buf, ','); } if (fmt == SB_STRFMT_JSON) sbuf_cat(buf, "]"); } static inline void stats_voistatdata_hist_tostr(enum vsd_dtype voi_dtype, const struct voistatdata_hist *hist, enum vsd_dtype hist_dtype, size_t hist_dsz, enum sb_str_fmt fmt, struct sbuf *buf, int objdump) { const struct voistatdata_numeric *bkt_lb, *bkt_ub; const char *fmtstr; int is32bit; uint16_t i, nbkts; switch (hist_dtype) { case VSD_DTYPE_CRHIST32: nbkts = HIST_VSDSZ2NBKTS(crhist32, hist_dsz); is32bit = 1; break; case VSD_DTYPE_DRHIST32: nbkts = HIST_VSDSZ2NBKTS(drhist32, hist_dsz); is32bit = 1; break; case VSD_DTYPE_DVHIST32: nbkts = HIST_VSDSZ2NBKTS(dvhist32, hist_dsz); is32bit = 1; break; case VSD_DTYPE_CRHIST64: nbkts = HIST_VSDSZ2NBKTS(crhist64, hist_dsz); is32bit = 0; break; case VSD_DTYPE_DRHIST64: nbkts = HIST_VSDSZ2NBKTS(drhist64, hist_dsz); is32bit = 0; break; case VSD_DTYPE_DVHIST64: nbkts = HIST_VSDSZ2NBKTS(dvhist64, hist_dsz); is32bit = 0; break; default: return; } switch (fmt) { case SB_STRFMT_FREEFORM: fmtstr = "nbkts=%hu, "; break; case SB_STRFMT_JSON: default: fmtstr = "\"nbkts\":%hu,"; break; } sbuf_printf(buf, fmtstr, nbkts); switch (fmt) { case SB_STRFMT_FREEFORM: fmtstr = (is32bit ? 
"oob=%u" : "oob=%ju"); break; case SB_STRFMT_JSON: default: fmtstr = (is32bit ? "\"oob\":%u,\"bkts\":[" : "\"oob\":%ju,\"bkts\":["); break; } sbuf_printf(buf, fmtstr, is32bit ? VSD_CONSTHIST_FIELDVAL(hist, hist_dtype, oob) : (uintmax_t)VSD_CONSTHIST_FIELDVAL(hist, hist_dtype, oob)); for (i = 0; i < nbkts; i++) { switch (hist_dtype) { case VSD_DTYPE_CRHIST32: case VSD_DTYPE_CRHIST64: bkt_lb = VSD_CONSTCRHIST_FIELDPTR(hist, hist_dtype, bkts[i].lb); if (i < nbkts - 1) bkt_ub = VSD_CONSTCRHIST_FIELDPTR(hist, hist_dtype, bkts[i + 1].lb); else bkt_ub = &numeric_limits[LIM_MAX][voi_dtype]; break; case VSD_DTYPE_DRHIST32: case VSD_DTYPE_DRHIST64: bkt_lb = VSD_CONSTDRHIST_FIELDPTR(hist, hist_dtype, bkts[i].lb); bkt_ub = VSD_CONSTDRHIST_FIELDPTR(hist, hist_dtype, bkts[i].ub); break; case VSD_DTYPE_DVHIST32: case VSD_DTYPE_DVHIST64: bkt_lb = bkt_ub = VSD_CONSTDVHIST_FIELDPTR(hist, hist_dtype, bkts[i].val); break; default: break; } switch (fmt) { case SB_STRFMT_FREEFORM: fmtstr = "\n\t\t\t\t"; break; case SB_STRFMT_JSON: default: fmtstr = "{"; break; } sbuf_cat(buf, fmtstr); if (objdump) { switch (fmt) { case SB_STRFMT_FREEFORM: fmtstr = "bkt[%hu]."; break; case SB_STRFMT_JSON: default: fmtstr = "\"bkt\":%hu,"; break; } sbuf_printf(buf, fmtstr, i); } switch (fmt) { case SB_STRFMT_FREEFORM: fmtstr = "{lb="; break; case SB_STRFMT_JSON: default: fmtstr = "\"lb\":"; break; } sbuf_cat(buf, fmtstr); stats_voistatdata_tostr((const struct voistatdata *)bkt_lb, voi_dtype, voi_dtype, sizeof(struct voistatdata_numeric), fmt, buf, objdump); switch (fmt) { case SB_STRFMT_FREEFORM: fmtstr = ",ub="; break; case SB_STRFMT_JSON: default: fmtstr = ",\"ub\":"; break; } sbuf_cat(buf, fmtstr); stats_voistatdata_tostr((const struct voistatdata *)bkt_ub, voi_dtype, voi_dtype, sizeof(struct voistatdata_numeric), fmt, buf, objdump); switch (fmt) { case SB_STRFMT_FREEFORM: fmtstr = is32bit ? ",cnt=%u}" : ",cnt=%ju}"; break; case SB_STRFMT_JSON: default: fmtstr = is32bit ? ",\"cnt\":%u}" : ",\"cnt\":%ju}"; break; } sbuf_printf(buf, fmtstr, is32bit ? VSD_CONSTHIST_FIELDVAL(hist, hist_dtype, bkts[i].cnt) : (uintmax_t)VSD_CONSTHIST_FIELDVAL(hist, hist_dtype, bkts[i].cnt)); if (fmt == SB_STRFMT_JSON && i < nbkts - 1) sbuf_putc(buf, ','); } if (fmt == SB_STRFMT_JSON) sbuf_cat(buf, "]"); } int stats_voistatdata_tostr(const struct voistatdata *vsd, enum vsd_dtype voi_dtype, enum vsd_dtype vsd_dtype, size_t vsd_sz, enum sb_str_fmt fmt, struct sbuf *buf, int objdump) { const char *fmtstr; if (vsd == NULL || buf == NULL || voi_dtype >= VSD_NUM_DTYPES || vsd_dtype >= VSD_NUM_DTYPES || fmt >= SB_STRFMT_NUM_FMTS) return (EINVAL); switch (vsd_dtype) { case VSD_DTYPE_VOISTATE: switch (fmt) { case SB_STRFMT_FREEFORM: fmtstr = "prev="; break; case SB_STRFMT_JSON: default: fmtstr = "\"prev\":"; break; } sbuf_cat(buf, fmtstr); /* * Render prev by passing it as *vsd and voi_dtype as vsd_dtype. 
*/ stats_voistatdata_tostr( (const struct voistatdata *)&CONSTVSD(voistate, vsd)->prev, voi_dtype, voi_dtype, vsd_sz, fmt, buf, objdump); break; case VSD_DTYPE_INT_S32: sbuf_printf(buf, "%d", vsd->int32.s32); break; case VSD_DTYPE_INT_U32: sbuf_printf(buf, "%u", vsd->int32.u32); break; case VSD_DTYPE_INT_S64: sbuf_printf(buf, "%jd", (intmax_t)vsd->int64.s64); break; case VSD_DTYPE_INT_U64: sbuf_printf(buf, "%ju", (uintmax_t)vsd->int64.u64); break; case VSD_DTYPE_INT_SLONG: sbuf_printf(buf, "%ld", vsd->intlong.slong); break; case VSD_DTYPE_INT_ULONG: sbuf_printf(buf, "%lu", vsd->intlong.ulong); break; case VSD_DTYPE_Q_S32: { char qstr[Q_MAXSTRLEN(vsd->q32.sq32, 10)]; Q_TOSTR((s32q_t)vsd->q32.sq32, -1, 10, qstr, sizeof(qstr)); sbuf_cat(buf, qstr); } break; case VSD_DTYPE_Q_U32: { char qstr[Q_MAXSTRLEN(vsd->q32.uq32, 10)]; Q_TOSTR((u32q_t)vsd->q32.uq32, -1, 10, qstr, sizeof(qstr)); sbuf_cat(buf, qstr); } break; case VSD_DTYPE_Q_S64: { char qstr[Q_MAXSTRLEN(vsd->q64.sq64, 10)]; Q_TOSTR((s64q_t)vsd->q64.sq64, -1, 10, qstr, sizeof(qstr)); sbuf_cat(buf, qstr); } break; case VSD_DTYPE_Q_U64: { char qstr[Q_MAXSTRLEN(vsd->q64.uq64, 10)]; Q_TOSTR((u64q_t)vsd->q64.uq64, -1, 10, qstr, sizeof(qstr)); sbuf_cat(buf, qstr); } break; case VSD_DTYPE_CRHIST32: case VSD_DTYPE_DRHIST32: case VSD_DTYPE_DVHIST32: case VSD_DTYPE_CRHIST64: case VSD_DTYPE_DRHIST64: case VSD_DTYPE_DVHIST64: stats_voistatdata_hist_tostr(voi_dtype, CONSTVSD(hist, vsd), vsd_dtype, vsd_sz, fmt, buf, objdump); break; case VSD_DTYPE_TDGSTCLUST32: case VSD_DTYPE_TDGSTCLUST64: stats_voistatdata_tdgst_tostr(voi_dtype, CONSTVSD(tdgst, vsd), vsd_dtype, vsd_sz, fmt, buf, objdump); break; default: break; } return (sbuf_error(buf)); } static void stats_v1_itercb_tostr_freeform(struct statsblobv1 *sb, struct voi *v, struct voistat *vs, struct sb_iter_ctx *ctx) { struct sb_tostrcb_ctx *sctx; struct metablob *tpl_mb; struct sbuf *buf; void *vsd; uint8_t dump; sctx = ctx->usrctx; buf = sctx->buf; tpl_mb = sctx->tpl ? sctx->tpl->mb : NULL; dump = ((sctx->flags & SB_TOSTR_OBJDUMP) != 0); if (ctx->flags & SB_IT_FIRST_CB) { sbuf_printf(buf, "struct statsblobv1@%p", sb); if (dump) { sbuf_printf(buf, ", abi=%hhu, endian=%hhu, maxsz=%hu, " "cursz=%hu, created=%jd, lastrst=%jd, flags=0x%04hx, " "stats_off=%hu, statsdata_off=%hu", sb->abi, sb->endian, sb->maxsz, sb->cursz, sb->created, sb->lastrst, sb->flags, sb->stats_off, sb->statsdata_off); } sbuf_printf(buf, ", tplhash=%u", sb->tplhash); } if (ctx->flags & SB_IT_FIRST_VOISTAT) { sbuf_printf(buf, "\n\tvois[%hd]: id=%hd", ctx->vslot, v->id); if (v->id < 0) return; sbuf_printf(buf, ", name=\"%s\"", (tpl_mb == NULL) ? 
"" : tpl_mb->voi_meta[v->id].name); if (dump) sbuf_printf(buf, ", flags=0x%04hx, dtype=%s, " "voistatmaxid=%hhd, stats_off=%hu", v->flags, vsd_dtype2name[v->dtype], v->voistatmaxid, v->stats_off); } if (!dump && vs->stype <= 0) return; sbuf_printf(buf, "\n\t\tvois[%hd]stat[%hhd]: stype=", v->id, ctx->vsslot); if (vs->stype < 0) { sbuf_printf(buf, "%hhd", vs->stype); return; } else sbuf_printf(buf, "%s, errs=%hu", vs_stype2name[vs->stype], vs->errs); vsd = BLOB_OFFSET(sb, vs->data_off); if (dump) sbuf_printf(buf, ", flags=0x%04x, dtype=%s, dsz=%hu, " "data_off=%hu", vs->flags, vsd_dtype2name[vs->dtype], vs->dsz, vs->data_off); - sbuf_printf(buf, "\n\t\t\tvoistatdata: "); + sbuf_cat(buf, "\n\t\t\tvoistatdata: "); stats_voistatdata_tostr(vsd, v->dtype, vs->dtype, vs->dsz, sctx->fmt, buf, dump); } static void stats_v1_itercb_tostr_json(struct statsblobv1 *sb, struct voi *v, struct voistat *vs, struct sb_iter_ctx *ctx) { struct sb_tostrcb_ctx *sctx; struct metablob *tpl_mb; struct sbuf *buf; const char *fmtstr; void *vsd; uint8_t dump; sctx = ctx->usrctx; buf = sctx->buf; tpl_mb = sctx->tpl ? sctx->tpl->mb : NULL; dump = ((sctx->flags & SB_TOSTR_OBJDUMP) != 0); if (ctx->flags & SB_IT_FIRST_CB) { sbuf_putc(buf, '{'); if (dump) { sbuf_printf(buf, "\"abi\":%hhu,\"endian\":%hhu," "\"maxsz\":%hu,\"cursz\":%hu,\"created\":%jd," "\"lastrst\":%jd,\"flags\":%hu,\"stats_off\":%hu," "\"statsdata_off\":%hu,", sb->abi, sb->endian, sb->maxsz, sb->cursz, sb->created, sb->lastrst, sb->flags, sb->stats_off, sb->statsdata_off); } if (tpl_mb == NULL) fmtstr = "\"tplname\":%s,\"tplhash\":%u,\"vois\":{"; else fmtstr = "\"tplname\":\"%s\",\"tplhash\":%u,\"vois\":{"; sbuf_printf(buf, fmtstr, tpl_mb ? tpl_mb->tplname : "null", sb->tplhash); } if (ctx->flags & SB_IT_FIRST_VOISTAT) { if (dump) { sbuf_printf(buf, "\"[%d]\":{\"id\":%d", ctx->vslot, v->id); if (v->id < 0) { - sbuf_printf(buf, "},"); + sbuf_cat(buf, "},"); return; } if (tpl_mb == NULL) fmtstr = ",\"name\":%s,\"flags\":%hu," "\"dtype\":\"%s\",\"voistatmaxid\":%hhd," "\"stats_off\":%hu,"; else fmtstr = ",\"name\":\"%s\",\"flags\":%hu," "\"dtype\":\"%s\",\"voistatmaxid\":%hhd," "\"stats_off\":%hu,"; sbuf_printf(buf, fmtstr, tpl_mb ? tpl_mb->voi_meta[v->id].name : "null", v->flags, vsd_dtype2name[v->dtype], v->voistatmaxid, v->stats_off); } else { if (tpl_mb == NULL) { sbuf_printf(buf, "\"[%hd]\":{", v->id); } else { sbuf_printf(buf, "\"%s\":{", tpl_mb->voi_meta[v->id].name); } } sbuf_cat(buf, "\"stats\":{"); } vsd = BLOB_OFFSET(sb, vs->data_off); if (dump) { sbuf_printf(buf, "\"[%hhd]\":", ctx->vsslot); if (vs->stype < 0) { - sbuf_printf(buf, "{\"stype\":-1},"); + sbuf_cat(buf, "{\"stype\":-1},"); return; } sbuf_printf(buf, "{\"stype\":\"%s\",\"errs\":%hu,\"flags\":%hu," "\"dtype\":\"%s\",\"data_off\":%hu,\"voistatdata\":{", vs_stype2name[vs->stype], vs->errs, vs->flags, vsd_dtype2name[vs->dtype], vs->data_off); } else if (vs->stype > 0) { if (tpl_mb == NULL) sbuf_printf(buf, "\"[%hhd]\":", vs->stype); else sbuf_printf(buf, "\"%s\":", vs_stype2name[vs->stype]); } else return; if ((vs->flags & VS_VSDVALID) || dump) { if (!dump) sbuf_printf(buf, "{\"errs\":%hu,", vs->errs); /* Simple non-compound VSD types need a key. */ if (!vsd_compoundtype[vs->dtype]) sbuf_cat(buf, "\"val\":"); stats_voistatdata_tostr(vsd, v->dtype, vs->dtype, vs->dsz, sctx->fmt, buf, dump); sbuf_cat(buf, dump ? "}}" : "}"); } else sbuf_cat(buf, dump ? 
"null}" : "null"); if (ctx->flags & SB_IT_LAST_VOISTAT) sbuf_cat(buf, "}}"); if (ctx->flags & SB_IT_LAST_CB) sbuf_cat(buf, "}}"); else sbuf_putc(buf, ','); } static int stats_v1_itercb_tostr(struct statsblobv1 *sb, struct voi *v, struct voistat *vs, struct sb_iter_ctx *ctx) { struct sb_tostrcb_ctx *sctx; sctx = ctx->usrctx; switch (sctx->fmt) { case SB_STRFMT_FREEFORM: stats_v1_itercb_tostr_freeform(sb, v, vs, ctx); break; case SB_STRFMT_JSON: stats_v1_itercb_tostr_json(sb, v, vs, ctx); break; default: break; } return (sbuf_error(sctx->buf)); } int stats_v1_blob_tostr(struct statsblobv1 *sb, struct sbuf *buf, enum sb_str_fmt fmt, uint32_t flags) { struct sb_tostrcb_ctx sctx; uint32_t iflags; if (sb == NULL || sb->abi != STATS_ABI_V1 || buf == NULL || fmt >= SB_STRFMT_NUM_FMTS) return (EINVAL); sctx.buf = buf; sctx.fmt = fmt; sctx.flags = flags; if (flags & SB_TOSTR_META) { if (stats_tpl_fetch(stats_tpl_fetch_allocid(NULL, sb->tplhash), &sctx.tpl)) return (EINVAL); } else sctx.tpl = NULL; iflags = 0; if (flags & SB_TOSTR_OBJDUMP) iflags |= (SB_IT_NULLVOI | SB_IT_NULLVOISTAT); stats_v1_blob_iter(sb, stats_v1_itercb_tostr, &sctx, iflags); return (sbuf_error(buf)); } static int stats_v1_itercb_visit(struct statsblobv1 *sb, struct voi *v, struct voistat *vs, struct sb_iter_ctx *ctx) { struct sb_visitcb_ctx *vctx; struct sb_visit sbv; vctx = ctx->usrctx; sbv.tplhash = sb->tplhash; sbv.voi_id = v->id; sbv.voi_dtype = v->dtype; sbv.vs_stype = vs->stype; sbv.vs_dtype = vs->dtype; sbv.vs_dsz = vs->dsz; sbv.vs_data = BLOB_OFFSET(sb, vs->data_off); sbv.vs_errs = vs->errs; sbv.flags = ctx->flags & (SB_IT_FIRST_CB | SB_IT_LAST_CB | SB_IT_FIRST_VOI | SB_IT_LAST_VOI | SB_IT_FIRST_VOISTAT | SB_IT_LAST_VOISTAT); return (vctx->cb(&sbv, vctx->usrctx)); } int stats_v1_blob_visit(struct statsblobv1 *sb, stats_blob_visitcb_t func, void *usrctx) { struct sb_visitcb_ctx vctx; if (sb == NULL || sb->abi != STATS_ABI_V1 || func == NULL) return (EINVAL); vctx.cb = func; vctx.usrctx = usrctx; stats_v1_blob_iter(sb, stats_v1_itercb_visit, &vctx, 0); return (0); } static int stats_v1_icb_reset_voistat(struct statsblobv1 *sb, struct voi *v __unused, struct voistat *vs, struct sb_iter_ctx *ctx __unused) { void *vsd; if (vs->stype == VS_STYPE_VOISTATE) return (0); vsd = BLOB_OFFSET(sb, vs->data_off); /* Perform the stat type's default reset action. 
*/ switch (vs->stype) { case VS_STYPE_SUM: switch (vs->dtype) { case VSD_DTYPE_Q_S32: Q_SIFVAL(VSD(q32, vsd)->sq32, 0); break; case VSD_DTYPE_Q_U32: Q_SIFVAL(VSD(q32, vsd)->uq32, 0); break; case VSD_DTYPE_Q_S64: Q_SIFVAL(VSD(q64, vsd)->sq64, 0); break; case VSD_DTYPE_Q_U64: Q_SIFVAL(VSD(q64, vsd)->uq64, 0); break; default: bzero(vsd, vs->dsz); break; } break; case VS_STYPE_MAX: switch (vs->dtype) { case VSD_DTYPE_Q_S32: Q_SIFVAL(VSD(q32, vsd)->sq32, Q_IFMINVAL(VSD(q32, vsd)->sq32)); break; case VSD_DTYPE_Q_U32: Q_SIFVAL(VSD(q32, vsd)->uq32, Q_IFMINVAL(VSD(q32, vsd)->uq32)); break; case VSD_DTYPE_Q_S64: Q_SIFVAL(VSD(q64, vsd)->sq64, Q_IFMINVAL(VSD(q64, vsd)->sq64)); break; case VSD_DTYPE_Q_U64: Q_SIFVAL(VSD(q64, vsd)->uq64, Q_IFMINVAL(VSD(q64, vsd)->uq64)); break; default: memcpy(vsd, &numeric_limits[LIM_MIN][vs->dtype], vs->dsz); break; } break; case VS_STYPE_MIN: switch (vs->dtype) { case VSD_DTYPE_Q_S32: Q_SIFVAL(VSD(q32, vsd)->sq32, Q_IFMAXVAL(VSD(q32, vsd)->sq32)); break; case VSD_DTYPE_Q_U32: Q_SIFVAL(VSD(q32, vsd)->uq32, Q_IFMAXVAL(VSD(q32, vsd)->uq32)); break; case VSD_DTYPE_Q_S64: Q_SIFVAL(VSD(q64, vsd)->sq64, Q_IFMAXVAL(VSD(q64, vsd)->sq64)); break; case VSD_DTYPE_Q_U64: Q_SIFVAL(VSD(q64, vsd)->uq64, Q_IFMAXVAL(VSD(q64, vsd)->uq64)); break; default: memcpy(vsd, &numeric_limits[LIM_MAX][vs->dtype], vs->dsz); break; } break; case VS_STYPE_HIST: { /* Reset bucket counts. */ struct voistatdata_hist *hist; int i, is32bit; uint16_t nbkts; hist = VSD(hist, vsd); switch (vs->dtype) { case VSD_DTYPE_CRHIST32: nbkts = HIST_VSDSZ2NBKTS(crhist32, vs->dsz); is32bit = 1; break; case VSD_DTYPE_DRHIST32: nbkts = HIST_VSDSZ2NBKTS(drhist32, vs->dsz); is32bit = 1; break; case VSD_DTYPE_DVHIST32: nbkts = HIST_VSDSZ2NBKTS(dvhist32, vs->dsz); is32bit = 1; break; case VSD_DTYPE_CRHIST64: nbkts = HIST_VSDSZ2NBKTS(crhist64, vs->dsz); is32bit = 0; break; case VSD_DTYPE_DRHIST64: nbkts = HIST_VSDSZ2NBKTS(drhist64, vs->dsz); is32bit = 0; break; case VSD_DTYPE_DVHIST64: nbkts = HIST_VSDSZ2NBKTS(dvhist64, vs->dsz); is32bit = 0; break; default: return (0); } bzero(VSD_HIST_FIELDPTR(hist, vs->dtype, oob), is32bit ? sizeof(uint32_t) : sizeof(uint64_t)); for (i = nbkts - 1; i >= 0; i--) { bzero(VSD_HIST_FIELDPTR(hist, vs->dtype, bkts[i].cnt), is32bit ? sizeof(uint32_t) : sizeof(uint64_t)); } break; } case VS_STYPE_TDGST: { /* Reset sample count centroids array/tree. 
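		 * The centroid ARB is re-initialised with every centroid
		 * back on the free list and each centroid's cnt and mu
		 * cleared; Q_SIFVAL() zeroes mu while preserving its control
		 * (precision) bits, so the digest keeps the precision
		 * configured at template creation time.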
*/ struct voistatdata_tdgst *tdgst; struct ctdth32 *ctd32tree; struct ctdth64 *ctd64tree; struct voistatdata_tdgstctd32 *ctd32; struct voistatdata_tdgstctd64 *ctd64; tdgst = VSD(tdgst, vsd); switch (vs->dtype) { case VSD_DTYPE_TDGSTCLUST32: VSD(tdgstclust32, tdgst)->smplcnt = 0; VSD(tdgstclust32, tdgst)->compcnt = 0; ctd32tree = &VSD(tdgstclust32, tdgst)->ctdtree; ARB_INIT(ctd32, ctdlnk, ctd32tree, ARB_MAXNODES(ctd32tree)) { ctd32->cnt = 0; Q_SIFVAL(ctd32->mu, 0); } #ifdef DIAGNOSTIC RB_INIT(&VSD(tdgstclust32, tdgst)->rbctdtree); #endif break; case VSD_DTYPE_TDGSTCLUST64: VSD(tdgstclust64, tdgst)->smplcnt = 0; VSD(tdgstclust64, tdgst)->compcnt = 0; ctd64tree = &VSD(tdgstclust64, tdgst)->ctdtree; ARB_INIT(ctd64, ctdlnk, ctd64tree, ARB_MAXNODES(ctd64tree)) { ctd64->cnt = 0; Q_SIFVAL(ctd64->mu, 0); } #ifdef DIAGNOSTIC RB_INIT(&VSD(tdgstclust64, tdgst)->rbctdtree); #endif break; default: return (0); } break; } default: KASSERT(0, ("Unknown VOI stat type %d", vs->stype)); break; } vs->errs = 0; vs->flags &= ~VS_VSDVALID; return (0); } int stats_v1_blob_snapshot(struct statsblobv1 **dst, size_t dstmaxsz, struct statsblobv1 *src, uint32_t flags) { int error; if (src != NULL && src->abi == STATS_ABI_V1) { error = stats_v1_blob_clone(dst, dstmaxsz, src, flags); if (!error) { if (flags & SB_CLONE_RSTSRC) { stats_v1_blob_iter(src, stats_v1_icb_reset_voistat, NULL, 0); src->lastrst = stats_sbinuptime(); } stats_v1_blob_finalise(*dst); } } else error = EINVAL; return (error); } static inline int stats_v1_voi_update_max(enum vsd_dtype voi_dtype __unused, struct voistatdata *voival, struct voistat *vs, void *vsd) { int error; KASSERT(vs->dtype < VSD_NUM_DTYPES, ("Unknown VSD dtype %d", vs->dtype)); error = 0; switch (vs->dtype) { case VSD_DTYPE_INT_S32: if (VSD(int32, vsd)->s32 < voival->int32.s32) { VSD(int32, vsd)->s32 = voival->int32.s32; vs->flags |= VS_VSDVALID; } break; case VSD_DTYPE_INT_U32: if (VSD(int32, vsd)->u32 < voival->int32.u32) { VSD(int32, vsd)->u32 = voival->int32.u32; vs->flags |= VS_VSDVALID; } break; case VSD_DTYPE_INT_S64: if (VSD(int64, vsd)->s64 < voival->int64.s64) { VSD(int64, vsd)->s64 = voival->int64.s64; vs->flags |= VS_VSDVALID; } break; case VSD_DTYPE_INT_U64: if (VSD(int64, vsd)->u64 < voival->int64.u64) { VSD(int64, vsd)->u64 = voival->int64.u64; vs->flags |= VS_VSDVALID; } break; case VSD_DTYPE_INT_SLONG: if (VSD(intlong, vsd)->slong < voival->intlong.slong) { VSD(intlong, vsd)->slong = voival->intlong.slong; vs->flags |= VS_VSDVALID; } break; case VSD_DTYPE_INT_ULONG: if (VSD(intlong, vsd)->ulong < voival->intlong.ulong) { VSD(intlong, vsd)->ulong = voival->intlong.ulong; vs->flags |= VS_VSDVALID; } break; case VSD_DTYPE_Q_S32: if (Q_QLTQ(VSD(q32, vsd)->sq32, voival->q32.sq32) && (0 == (error = Q_QCPYVALQ(&VSD(q32, vsd)->sq32, voival->q32.sq32)))) { vs->flags |= VS_VSDVALID; } break; case VSD_DTYPE_Q_U32: if (Q_QLTQ(VSD(q32, vsd)->uq32, voival->q32.uq32) && (0 == (error = Q_QCPYVALQ(&VSD(q32, vsd)->uq32, voival->q32.uq32)))) { vs->flags |= VS_VSDVALID; } break; case VSD_DTYPE_Q_S64: if (Q_QLTQ(VSD(q64, vsd)->sq64, voival->q64.sq64) && (0 == (error = Q_QCPYVALQ(&VSD(q64, vsd)->sq64, voival->q64.sq64)))) { vs->flags |= VS_VSDVALID; } break; case VSD_DTYPE_Q_U64: if (Q_QLTQ(VSD(q64, vsd)->uq64, voival->q64.uq64) && (0 == (error = Q_QCPYVALQ(&VSD(q64, vsd)->uq64, voival->q64.uq64)))) { vs->flags |= VS_VSDVALID; } break; default: error = EINVAL; break; } return (error); } static inline int stats_v1_voi_update_min(enum vsd_dtype voi_dtype __unused, struct voistatdata 
*voival, struct voistat *vs, void *vsd) { int error; KASSERT(vs->dtype < VSD_NUM_DTYPES, ("Unknown VSD dtype %d", vs->dtype)); error = 0; switch (vs->dtype) { case VSD_DTYPE_INT_S32: if (VSD(int32, vsd)->s32 > voival->int32.s32) { VSD(int32, vsd)->s32 = voival->int32.s32; vs->flags |= VS_VSDVALID; } break; case VSD_DTYPE_INT_U32: if (VSD(int32, vsd)->u32 > voival->int32.u32) { VSD(int32, vsd)->u32 = voival->int32.u32; vs->flags |= VS_VSDVALID; } break; case VSD_DTYPE_INT_S64: if (VSD(int64, vsd)->s64 > voival->int64.s64) { VSD(int64, vsd)->s64 = voival->int64.s64; vs->flags |= VS_VSDVALID; } break; case VSD_DTYPE_INT_U64: if (VSD(int64, vsd)->u64 > voival->int64.u64) { VSD(int64, vsd)->u64 = voival->int64.u64; vs->flags |= VS_VSDVALID; } break; case VSD_DTYPE_INT_SLONG: if (VSD(intlong, vsd)->slong > voival->intlong.slong) { VSD(intlong, vsd)->slong = voival->intlong.slong; vs->flags |= VS_VSDVALID; } break; case VSD_DTYPE_INT_ULONG: if (VSD(intlong, vsd)->ulong > voival->intlong.ulong) { VSD(intlong, vsd)->ulong = voival->intlong.ulong; vs->flags |= VS_VSDVALID; } break; case VSD_DTYPE_Q_S32: if (Q_QGTQ(VSD(q32, vsd)->sq32, voival->q32.sq32) && (0 == (error = Q_QCPYVALQ(&VSD(q32, vsd)->sq32, voival->q32.sq32)))) { vs->flags |= VS_VSDVALID; } break; case VSD_DTYPE_Q_U32: if (Q_QGTQ(VSD(q32, vsd)->uq32, voival->q32.uq32) && (0 == (error = Q_QCPYVALQ(&VSD(q32, vsd)->uq32, voival->q32.uq32)))) { vs->flags |= VS_VSDVALID; } break; case VSD_DTYPE_Q_S64: if (Q_QGTQ(VSD(q64, vsd)->sq64, voival->q64.sq64) && (0 == (error = Q_QCPYVALQ(&VSD(q64, vsd)->sq64, voival->q64.sq64)))) { vs->flags |= VS_VSDVALID; } break; case VSD_DTYPE_Q_U64: if (Q_QGTQ(VSD(q64, vsd)->uq64, voival->q64.uq64) && (0 == (error = Q_QCPYVALQ(&VSD(q64, vsd)->uq64, voival->q64.uq64)))) { vs->flags |= VS_VSDVALID; } break; default: error = EINVAL; break; } return (error); } static inline int stats_v1_voi_update_sum(enum vsd_dtype voi_dtype __unused, struct voistatdata *voival, struct voistat *vs, void *vsd) { int error; KASSERT(vs->dtype < VSD_NUM_DTYPES, ("Unknown VSD dtype %d", vs->dtype)); error = 0; switch (vs->dtype) { case VSD_DTYPE_INT_S32: VSD(int32, vsd)->s32 += voival->int32.s32; break; case VSD_DTYPE_INT_U32: VSD(int32, vsd)->u32 += voival->int32.u32; break; case VSD_DTYPE_INT_S64: VSD(int64, vsd)->s64 += voival->int64.s64; break; case VSD_DTYPE_INT_U64: VSD(int64, vsd)->u64 += voival->int64.u64; break; case VSD_DTYPE_INT_SLONG: VSD(intlong, vsd)->slong += voival->intlong.slong; break; case VSD_DTYPE_INT_ULONG: VSD(intlong, vsd)->ulong += voival->intlong.ulong; break; case VSD_DTYPE_Q_S32: error = Q_QADDQ(&VSD(q32, vsd)->sq32, voival->q32.sq32); break; case VSD_DTYPE_Q_U32: error = Q_QADDQ(&VSD(q32, vsd)->uq32, voival->q32.uq32); break; case VSD_DTYPE_Q_S64: error = Q_QADDQ(&VSD(q64, vsd)->sq64, voival->q64.sq64); break; case VSD_DTYPE_Q_U64: error = Q_QADDQ(&VSD(q64, vsd)->uq64, voival->q64.uq64); break; default: error = EINVAL; break; } if (!error) vs->flags |= VS_VSDVALID; return (error); } static inline int stats_v1_voi_update_hist(enum vsd_dtype voi_dtype, struct voistatdata *voival, struct voistat *vs, struct voistatdata_hist *hist) { struct voistatdata_numeric *bkt_lb, *bkt_ub; uint64_t *oob64, *cnt64; uint32_t *oob32, *cnt32; int error, i, found, is32bit, has_ub, eq_only; error = 0; switch (vs->dtype) { case VSD_DTYPE_CRHIST32: i = HIST_VSDSZ2NBKTS(crhist32, vs->dsz); is32bit = 1; has_ub = eq_only = 0; oob32 = &VSD(crhist32, hist)->oob; break; case VSD_DTYPE_DRHIST32: i = HIST_VSDSZ2NBKTS(drhist32, vs->dsz); 
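		/*
		 * Bucket matching flags: has_ub means buckets carry an
		 * explicit upper bound (DRHIST*), eq_only means a bucket
		 * matches one discrete value (DVHIST*); CRHIST* buckets have
		 * a lower bound only, so the top-down search below places a
		 * value in the highest bucket whose lower bound it meets or
		 * exceeds.
		 */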
is32bit = has_ub = 1; eq_only = 0; oob32 = &VSD(drhist32, hist)->oob; break; case VSD_DTYPE_DVHIST32: i = HIST_VSDSZ2NBKTS(dvhist32, vs->dsz); is32bit = eq_only = 1; has_ub = 0; oob32 = &VSD(dvhist32, hist)->oob; break; case VSD_DTYPE_CRHIST64: i = HIST_VSDSZ2NBKTS(crhist64, vs->dsz); is32bit = has_ub = eq_only = 0; oob64 = &VSD(crhist64, hist)->oob; break; case VSD_DTYPE_DRHIST64: i = HIST_VSDSZ2NBKTS(drhist64, vs->dsz); is32bit = eq_only = 0; has_ub = 1; oob64 = &VSD(drhist64, hist)->oob; break; case VSD_DTYPE_DVHIST64: i = HIST_VSDSZ2NBKTS(dvhist64, vs->dsz); is32bit = has_ub = 0; eq_only = 1; oob64 = &VSD(dvhist64, hist)->oob; break; default: return (EINVAL); } i--; /* Adjust for 0-based array index. */ /* XXXLAS: Should probably use a better bucket search algorithm. ARB? */ for (found = 0; i >= 0 && !found; i--) { switch (vs->dtype) { case VSD_DTYPE_CRHIST32: bkt_lb = &VSD(crhist32, hist)->bkts[i].lb; cnt32 = &VSD(crhist32, hist)->bkts[i].cnt; break; case VSD_DTYPE_DRHIST32: bkt_lb = &VSD(drhist32, hist)->bkts[i].lb; bkt_ub = &VSD(drhist32, hist)->bkts[i].ub; cnt32 = &VSD(drhist32, hist)->bkts[i].cnt; break; case VSD_DTYPE_DVHIST32: bkt_lb = &VSD(dvhist32, hist)->bkts[i].val; cnt32 = &VSD(dvhist32, hist)->bkts[i].cnt; break; case VSD_DTYPE_CRHIST64: bkt_lb = &VSD(crhist64, hist)->bkts[i].lb; cnt64 = &VSD(crhist64, hist)->bkts[i].cnt; break; case VSD_DTYPE_DRHIST64: bkt_lb = &VSD(drhist64, hist)->bkts[i].lb; bkt_ub = &VSD(drhist64, hist)->bkts[i].ub; cnt64 = &VSD(drhist64, hist)->bkts[i].cnt; break; case VSD_DTYPE_DVHIST64: bkt_lb = &VSD(dvhist64, hist)->bkts[i].val; cnt64 = &VSD(dvhist64, hist)->bkts[i].cnt; break; default: return (EINVAL); } switch (voi_dtype) { case VSD_DTYPE_INT_S32: if (voival->int32.s32 >= bkt_lb->int32.s32) { if ((eq_only && voival->int32.s32 == bkt_lb->int32.s32) || (!eq_only && (!has_ub || voival->int32.s32 < bkt_ub->int32.s32))) found = 1; } break; case VSD_DTYPE_INT_U32: if (voival->int32.u32 >= bkt_lb->int32.u32) { if ((eq_only && voival->int32.u32 == bkt_lb->int32.u32) || (!eq_only && (!has_ub || voival->int32.u32 < bkt_ub->int32.u32))) found = 1; } break; case VSD_DTYPE_INT_S64: if (voival->int64.s64 >= bkt_lb->int64.s64) if ((eq_only && voival->int64.s64 == bkt_lb->int64.s64) || (!eq_only && (!has_ub || voival->int64.s64 < bkt_ub->int64.s64))) found = 1; break; case VSD_DTYPE_INT_U64: if (voival->int64.u64 >= bkt_lb->int64.u64) if ((eq_only && voival->int64.u64 == bkt_lb->int64.u64) || (!eq_only && (!has_ub || voival->int64.u64 < bkt_ub->int64.u64))) found = 1; break; case VSD_DTYPE_INT_SLONG: if (voival->intlong.slong >= bkt_lb->intlong.slong) if ((eq_only && voival->intlong.slong == bkt_lb->intlong.slong) || (!eq_only && (!has_ub || voival->intlong.slong < bkt_ub->intlong.slong))) found = 1; break; case VSD_DTYPE_INT_ULONG: if (voival->intlong.ulong >= bkt_lb->intlong.ulong) if ((eq_only && voival->intlong.ulong == bkt_lb->intlong.ulong) || (!eq_only && (!has_ub || voival->intlong.ulong < bkt_ub->intlong.ulong))) found = 1; break; case VSD_DTYPE_Q_S32: if (Q_QGEQ(voival->q32.sq32, bkt_lb->q32.sq32)) if ((eq_only && Q_QEQ(voival->q32.sq32, bkt_lb->q32.sq32)) || (!eq_only && (!has_ub || Q_QLTQ(voival->q32.sq32, bkt_ub->q32.sq32)))) found = 1; break; case VSD_DTYPE_Q_U32: if (Q_QGEQ(voival->q32.uq32, bkt_lb->q32.uq32)) if ((eq_only && Q_QEQ(voival->q32.uq32, bkt_lb->q32.uq32)) || (!eq_only && (!has_ub || Q_QLTQ(voival->q32.uq32, bkt_ub->q32.uq32)))) found = 1; break; case VSD_DTYPE_Q_S64: if (Q_QGEQ(voival->q64.sq64, bkt_lb->q64.sq64)) if ((eq_only && 
Q_QEQ(voival->q64.sq64, bkt_lb->q64.sq64)) || (!eq_only && (!has_ub || Q_QLTQ(voival->q64.sq64, bkt_ub->q64.sq64)))) found = 1; break; case VSD_DTYPE_Q_U64: if (Q_QGEQ(voival->q64.uq64, bkt_lb->q64.uq64)) if ((eq_only && Q_QEQ(voival->q64.uq64, bkt_lb->q64.uq64)) || (!eq_only && (!has_ub || Q_QLTQ(voival->q64.uq64, bkt_ub->q64.uq64)))) found = 1; break; default: break; } } if (found) { if (is32bit) *cnt32 += 1; else *cnt64 += 1; } else { if (is32bit) *oob32 += 1; else *oob64 += 1; } vs->flags |= VS_VSDVALID; return (error); } static inline int stats_v1_vsd_tdgst_compress(enum vsd_dtype vs_dtype, struct voistatdata_tdgst *tdgst, int attempt) { struct ctdth32 *ctd32tree; struct ctdth64 *ctd64tree; struct voistatdata_tdgstctd32 *ctd32; struct voistatdata_tdgstctd64 *ctd64; uint64_t ebits, idxmask; uint32_t bitsperidx, nebits; int error, idx, is32bit, maxctds, remctds, tmperr; error = 0; switch (vs_dtype) { case VSD_DTYPE_TDGSTCLUST32: ctd32tree = &VSD(tdgstclust32, tdgst)->ctdtree; if (!ARB_FULL(ctd32tree)) return (0); VSD(tdgstclust32, tdgst)->compcnt++; maxctds = remctds = ARB_MAXNODES(ctd32tree); ARB_RESET_TREE(ctd32tree, ctdth32, maxctds); VSD(tdgstclust32, tdgst)->smplcnt = 0; is32bit = 1; ctd64tree = NULL; ctd64 = NULL; #ifdef DIAGNOSTIC RB_INIT(&VSD(tdgstclust32, tdgst)->rbctdtree); #endif break; case VSD_DTYPE_TDGSTCLUST64: ctd64tree = &VSD(tdgstclust64, tdgst)->ctdtree; if (!ARB_FULL(ctd64tree)) return (0); VSD(tdgstclust64, tdgst)->compcnt++; maxctds = remctds = ARB_MAXNODES(ctd64tree); ARB_RESET_TREE(ctd64tree, ctdth64, maxctds); VSD(tdgstclust64, tdgst)->smplcnt = 0; is32bit = 0; ctd32tree = NULL; ctd32 = NULL; #ifdef DIAGNOSTIC RB_INIT(&VSD(tdgstclust64, tdgst)->rbctdtree); #endif break; default: return (EINVAL); } /* * Rebuild the t-digest ARB by pseudorandomly selecting centroids and * re-inserting the mu/cnt of each as a value and corresponding weight. */ /* * XXXCEM: random(9) is currently rand(3), not random(3). rand(3) * RAND_MAX happens to be approximately 31 bits (range [0, * 0x7ffffffd]), so the math kinda works out. When/if this portion of * the code is compiled in userspace, it gets the random(3) behavior, * which has expected range [0, 0x7fffffff]. */ #define bitsperrand 31 ebits = 0; nebits = 0; bitsperidx = fls(maxctds); KASSERT(bitsperidx <= sizeof(ebits) << 3, ("%s: bitsperidx=%d, ebits=%d", __func__, bitsperidx, (int)(sizeof(ebits) << 3))); idxmask = (UINT64_C(1) << bitsperidx) - 1; /* Initialise the free list with randomised centroid indices. */ for (; remctds > 0; remctds--) { while (nebits < bitsperidx) { ebits |= ((uint64_t)random()) << nebits; nebits += bitsperrand; if (nebits > (sizeof(ebits) << 3)) nebits = sizeof(ebits) << 3; } idx = ebits & idxmask; nebits -= bitsperidx; ebits >>= bitsperidx; /* * Select the next centroid to put on the ARB free list. We * start with the centroid at our randomly selected array index, * and work our way forwards until finding one (the latter * aspect reduces re-insertion randomness, but is good enough). */ do { if (idx >= maxctds) idx %= maxctds; if (is32bit) ctd32 = ARB_NODE(ctd32tree, idx); else ctd64 = ARB_NODE(ctd64tree, idx); } while ((is32bit ? ARB_ISFREE(ctd32, ctdlnk) : ARB_ISFREE(ctd64, ctdlnk)) && ++idx); /* Put the centroid on the ARB free list. */ if (is32bit) ARB_RETURNFREE(ctd32tree, ctd32, ctdlnk); else ARB_RETURNFREE(ctd64tree, ctd64, ctdlnk); } /* * The free list now contains the randomised indices of every centroid. * Walk the free list from start to end, re-inserting each centroid's * mu/cnt. 
The tdgst_add() call may or may not consume the free centroid * we re-insert values from during each loop iteration, so we must latch * the index of the next free list centroid before the re-insertion * call. The previous loop above should have left the centroid pointer * pointing to the element at the head of the free list. */ KASSERT((is32bit ? ARB_FREEIDX(ctd32tree) == ARB_SELFIDX(ctd32tree, ctd32) : ARB_FREEIDX(ctd64tree) == ARB_SELFIDX(ctd64tree, ctd64)), ("%s: t-digest ARB@%p free list bug", __func__, (is32bit ? (void *)ctd32tree : (void *)ctd64tree))); remctds = maxctds; while ((is32bit ? ctd32 != NULL : ctd64 != NULL)) { tmperr = 0; if (is32bit) { s64q_t x; idx = ARB_NEXTFREEIDX(ctd32, ctdlnk); /* Cloning a s32q_t into a s64q_t should never fail. */ tmperr = Q_QCLONEQ(&x, ctd32->mu); tmperr = tmperr ? tmperr : stats_v1_vsd_tdgst_add( vs_dtype, tdgst, x, ctd32->cnt, attempt); ctd32 = ARB_NODE(ctd32tree, idx); KASSERT(ctd32 == NULL || ARB_ISFREE(ctd32, ctdlnk), ("%s: t-digest ARB@%p free list bug", __func__, ctd32tree)); } else { idx = ARB_NEXTFREEIDX(ctd64, ctdlnk); tmperr = stats_v1_vsd_tdgst_add(vs_dtype, tdgst, ctd64->mu, ctd64->cnt, attempt); ctd64 = ARB_NODE(ctd64tree, idx); KASSERT(ctd64 == NULL || ARB_ISFREE(ctd64, ctdlnk), ("%s: t-digest ARB@%p free list bug", __func__, ctd64tree)); } /* * This process should not produce errors, bugs notwithstanding. * Just in case, latch any errors and attempt all re-insertions. */ error = tmperr ? tmperr : error; remctds--; } KASSERT(remctds == 0, ("%s: t-digest ARB@%p free list bug", __func__, (is32bit ? (void *)ctd32tree : (void *)ctd64tree))); return (error); } static inline int stats_v1_vsd_tdgst_add(enum vsd_dtype vs_dtype, struct voistatdata_tdgst *tdgst, s64q_t x, uint64_t weight, int attempt) { #ifdef DIAGNOSTIC char qstr[Q_MAXSTRLEN(x, 10)]; #endif struct ctdth32 *ctd32tree; struct ctdth64 *ctd64tree; void *closest, *cur, *lb, *ub; struct voistatdata_tdgstctd32 *ctd32; struct voistatdata_tdgstctd64 *ctd64; uint64_t cnt, smplcnt, sum, tmpsum; s64q_t k, minz, q, z; int error, is32bit, n; error = 0; minz = Q_INI(&z, 0, 0, Q_NFBITS(x)); switch (vs_dtype) { case VSD_DTYPE_TDGSTCLUST32: if ((UINT32_MAX - weight) < VSD(tdgstclust32, tdgst)->smplcnt) error = EOVERFLOW; smplcnt = VSD(tdgstclust32, tdgst)->smplcnt; ctd32tree = &VSD(tdgstclust32, tdgst)->ctdtree; is32bit = 1; ctd64tree = NULL; ctd64 = NULL; break; case VSD_DTYPE_TDGSTCLUST64: if ((UINT64_MAX - weight) < VSD(tdgstclust64, tdgst)->smplcnt) error = EOVERFLOW; smplcnt = VSD(tdgstclust64, tdgst)->smplcnt; ctd64tree = &VSD(tdgstclust64, tdgst)->ctdtree; is32bit = 0; ctd32tree = NULL; ctd32 = NULL; break; default: error = EINVAL; break; } if (error) return (error); /* * Inspired by Ted Dunning's AVLTreeDigest.java */ do { #if defined(DIAGNOSTIC) KASSERT(attempt < 5, ("%s: Too many attempts", __func__)); #endif if (attempt >= 5) return (EAGAIN); Q_SIFVAL(minz, Q_IFMAXVAL(minz)); closest = ub = NULL; sum = tmpsum = 0; if (is32bit) lb = cur = (void *)(ctd32 = ARB_MIN(ctdth32, ctd32tree)); else lb = cur = (void *)(ctd64 = ARB_MIN(ctdth64, ctd64tree)); if (lb == NULL) /* Empty tree. */ lb = (is32bit ? (void *)ARB_ROOT(ctd32tree) : (void *)ARB_ROOT(ctd64tree)); /* * Find the set of centroids with minimum distance to x and * compute the sum of counts for all centroids with mean less * than the first centroid in the set. */ for (; cur != NULL; cur = (is32bit ? 
(void *)(ctd32 = ARB_NEXT(ctdth32, ctd32tree, ctd32)) : (void *)(ctd64 = ARB_NEXT(ctdth64, ctd64tree, ctd64)))) { if (is32bit) { cnt = ctd32->cnt; KASSERT(Q_PRECEQ(ctd32->mu, x), ("%s: Q_RELPREC(mu,x)=%d", __func__, Q_RELPREC(ctd32->mu, x))); /* Ok to assign as both have same precision. */ z = ctd32->mu; } else { cnt = ctd64->cnt; KASSERT(Q_PRECEQ(ctd64->mu, x), ("%s: Q_RELPREC(mu,x)=%d", __func__, Q_RELPREC(ctd64->mu, x))); /* Ok to assign as both have same precision. */ z = ctd64->mu; } error = Q_QSUBQ(&z, x); #if defined(DIAGNOSTIC) KASSERT(!error, ("%s: unexpected error %d", __func__, error)); #endif if (error) return (error); z = Q_QABS(z); if (Q_QLTQ(z, minz)) { minz = z; lb = cur; sum = tmpsum; tmpsum += cnt; } else if (Q_QGTQ(z, minz)) { ub = cur; break; } } cur = (is32bit ? (void *)(ctd32 = (struct voistatdata_tdgstctd32 *)lb) : (void *)(ctd64 = (struct voistatdata_tdgstctd64 *)lb)); for (n = 0; cur != ub; cur = (is32bit ? (void *)(ctd32 = ARB_NEXT(ctdth32, ctd32tree, ctd32)) : (void *)(ctd64 = ARB_NEXT(ctdth64, ctd64tree, ctd64)))) { if (is32bit) cnt = ctd32->cnt; else cnt = ctd64->cnt; q = Q_CTRLINI(16); if (smplcnt == 1) error = Q_QFRACI(&q, 1, 2); else /* [ sum + ((cnt - 1) / 2) ] / (smplcnt - 1) */ error = Q_QFRACI(&q, (sum << 1) + cnt - 1, (smplcnt - 1) << 1); k = q; /* k = q x 4 x samplcnt x attempt */ error |= Q_QMULI(&k, 4 * smplcnt * attempt); /* k = k x (1 - q) */ error |= Q_QSUBI(&q, 1); q = Q_QABS(q); error |= Q_QMULQ(&k, q); #if defined(DIAGNOSTIC) #if !defined(_KERNEL) double q_dbl, k_dbl, q2d, k2d; q2d = Q_Q2D(q); k2d = Q_Q2D(k); q_dbl = smplcnt == 1 ? 0.5 : (sum + ((cnt - 1) / 2.0)) / (double)(smplcnt - 1); k_dbl = 4 * smplcnt * q_dbl * (1.0 - q_dbl) * attempt; /* * If the difference between q and q_dbl is greater than * the fractional precision of q, something is off. * NB: q is holding the value of 1 - q */ q_dbl = 1.0 - q_dbl; KASSERT((q_dbl > q2d ? q_dbl - q2d : q2d - q_dbl) < (1.05 * ((double)1 / (double)(1ULL << Q_NFBITS(q)))), ("Q-type q bad precision")); KASSERT((k_dbl > k2d ? k_dbl - k2d : k2d - k_dbl) < 1.0 + (0.01 * smplcnt), ("Q-type k bad precision")); #endif /* !_KERNEL */ KASSERT(!error, ("%s: unexpected error %d", __func__, error)); #endif /* DIAGNOSTIC */ if (error) return (error); if ((is32bit && ((ctd32->cnt + weight) <= (uint64_t)Q_GIVAL(k))) || (!is32bit && ((ctd64->cnt + weight) <= (uint64_t)Q_GIVAL(k)))) { n++; /* random() produces 31 bits. */ if (random() < (INT32_MAX / n)) closest = cur; } sum += cnt; } } while (closest == NULL && (is32bit ? ARB_FULL(ctd32tree) : ARB_FULL(ctd64tree)) && (error = stats_v1_vsd_tdgst_compress(vs_dtype, tdgst, attempt++)) == 0); if (error) return (error); if (closest != NULL) { /* Merge with an existing centroid. */ if (is32bit) { ctd32 = (struct voistatdata_tdgstctd32 *)closest; error = Q_QSUBQ(&x, ctd32->mu); /* * The following calculation "x / (cnt + weight)" * computes the amount by which to adjust the centroid's * mu value in order to merge in the VOI sample. * * It can underflow (Q_QDIVI() returns ERANGE) when the * user centroids' fractional precision (which is * inherited by 'x') is too low to represent the result. * * A sophisticated approach to dealing with this issue * would minimise accumulation of error by tracking * underflow per centroid and making an adjustment when * a LSB's worth of underflow has accumulated. * * A simpler approach is to let the result underflow * i.e. 
merge the VOI sample into the centroid without * adjusting the centroid's mu, and rely on the user to * specify their t-digest with sufficient centroid * fractional precision such that the accumulation of * error from multiple underflows is of no material * consequence to the centroid's final value of mu. * * For the moment, the latter approach is employed by * simply ignoring ERANGE here. * * XXXLAS: Per-centroid underflow tracking is likely too * onerous, but it probably makes sense to accumulate a * single underflow error variable across all centroids * and report it as part of the digest to provide * additional visibility into the digest's fidelity. */ error = error ? error : Q_QDIVI(&x, ctd32->cnt + weight); if ((error && error != ERANGE) || (error = Q_QADDQ(&ctd32->mu, x))) { #ifdef DIAGNOSTIC KASSERT(!error, ("%s: unexpected error %d", __func__, error)); #endif return (error); } ctd32->cnt += weight; error = ARB_REINSERT(ctdth32, ctd32tree, ctd32) == NULL ? 0 : EALREADY; #ifdef DIAGNOSTIC RB_REINSERT(rbctdth32, &VSD(tdgstclust32, tdgst)->rbctdtree, ctd32); #endif } else { ctd64 = (struct voistatdata_tdgstctd64 *)closest; error = Q_QSUBQ(&x, ctd64->mu); error = error ? error : Q_QDIVI(&x, ctd64->cnt + weight); /* Refer to is32bit ERANGE discussion above. */ if ((error && error != ERANGE) || (error = Q_QADDQ(&ctd64->mu, x))) { KASSERT(!error, ("%s: unexpected error %d", __func__, error)); return (error); } ctd64->cnt += weight; error = ARB_REINSERT(ctdth64, ctd64tree, ctd64) == NULL ? 0 : EALREADY; #ifdef DIAGNOSTIC RB_REINSERT(rbctdth64, &VSD(tdgstclust64, tdgst)->rbctdtree, ctd64); #endif } } else { /* * Add a new centroid. If digest compression is working * correctly, there should always be at least one free. */ if (is32bit) { ctd32 = ARB_GETFREE(ctd32tree, ctdlnk); #ifdef DIAGNOSTIC KASSERT(ctd32 != NULL, ("%s: t-digest@%p has no free centroids", __func__, tdgst)); #endif if (ctd32 == NULL) return (EAGAIN); if ((error = Q_QCPYVALQ(&ctd32->mu, x))) return (error); ctd32->cnt = weight; error = ARB_INSERT(ctdth32, ctd32tree, ctd32) == NULL ? 0 : EALREADY; #ifdef DIAGNOSTIC RB_INSERT(rbctdth32, &VSD(tdgstclust32, tdgst)->rbctdtree, ctd32); #endif } else { ctd64 = ARB_GETFREE(ctd64tree, ctdlnk); #ifdef DIAGNOSTIC KASSERT(ctd64 != NULL, ("%s: t-digest@%p has no free centroids", __func__, tdgst)); #endif if (ctd64 == NULL) /* Should not happen. */ return (EAGAIN); /* Direct assignment ok as both have same type/prec. */ ctd64->mu = x; ctd64->cnt = weight; error = ARB_INSERT(ctdth64, ctd64tree, ctd64) == NULL ? 0 : EALREADY; #ifdef DIAGNOSTIC RB_INSERT(rbctdth64, &VSD(tdgstclust64, tdgst)->rbctdtree, ctd64); #endif } } if (is32bit) VSD(tdgstclust32, tdgst)->smplcnt += weight; else { VSD(tdgstclust64, tdgst)->smplcnt += weight; #ifdef DIAGNOSTIC struct rbctdth64 *rbctdtree = &VSD(tdgstclust64, tdgst)->rbctdtree; struct voistatdata_tdgstctd64 *rbctd64; int i = 0; ARB_FOREACH(ctd64, ctdth64, ctd64tree) { rbctd64 = (i == 0 ? 
RB_MIN(rbctdth64, rbctdtree) : RB_NEXT(rbctdth64, rbctdtree, rbctd64)); if (i >= ARB_CURNODES(ctd64tree) || ctd64 != rbctd64 || ARB_MIN(ctdth64, ctd64tree) != RB_MIN(rbctdth64, rbctdtree) || ARB_MAX(ctdth64, ctd64tree) != RB_MAX(rbctdth64, rbctdtree) || ARB_LEFTIDX(ctd64, ctdlnk) != ARB_SELFIDX(ctd64tree, RB_LEFT(rbctd64, rblnk)) || ARB_RIGHTIDX(ctd64, ctdlnk) != ARB_SELFIDX(ctd64tree, RB_RIGHT(rbctd64, rblnk)) || ARB_PARENTIDX(ctd64, ctdlnk) != ARB_SELFIDX(ctd64tree, RB_PARENT(rbctd64, rblnk))) { Q_TOSTR(ctd64->mu, -1, 10, qstr, sizeof(qstr)); printf("ARB ctd=%3d p=%3d l=%3d r=%3d c=%2d " "mu=%s\n", (int)ARB_SELFIDX(ctd64tree, ctd64), ARB_PARENTIDX(ctd64, ctdlnk), ARB_LEFTIDX(ctd64, ctdlnk), ARB_RIGHTIDX(ctd64, ctdlnk), ARB_COLOR(ctd64, ctdlnk), qstr); Q_TOSTR(rbctd64->mu, -1, 10, qstr, sizeof(qstr)); struct voistatdata_tdgstctd64 *parent; parent = RB_PARENT(rbctd64, rblnk); int rb_color = parent == NULL ? 0 : RB_LEFT(parent, rblnk) == rbctd64 ? (_RB_BITSUP(parent, rblnk) & _RB_L) != 0 : (_RB_BITSUP(parent, rblnk) & _RB_R) != 0; printf(" RB ctd=%3d p=%3d l=%3d r=%3d c=%2d " "mu=%s\n", (int)ARB_SELFIDX(ctd64tree, rbctd64), (int)ARB_SELFIDX(ctd64tree, RB_PARENT(rbctd64, rblnk)), (int)ARB_SELFIDX(ctd64tree, RB_LEFT(rbctd64, rblnk)), (int)ARB_SELFIDX(ctd64tree, RB_RIGHT(rbctd64, rblnk)), rb_color, qstr); panic("RB@%p and ARB@%p trees differ\n", rbctdtree, ctd64tree); } i++; } #endif /* DIAGNOSTIC */ } return (error); } static inline int stats_v1_voi_update_tdgst(enum vsd_dtype voi_dtype, struct voistatdata *voival, struct voistat *vs, struct voistatdata_tdgst *tdgst) { s64q_t x; int error; error = 0; switch (vs->dtype) { case VSD_DTYPE_TDGSTCLUST32: /* Use same precision as the user's centroids. */ Q_INI(&x, 0, 0, Q_NFBITS( ARB_CNODE(&VSD(tdgstclust32, tdgst)->ctdtree, 0)->mu)); break; case VSD_DTYPE_TDGSTCLUST64: /* Use same precision as the user's centroids. */ Q_INI(&x, 0, 0, Q_NFBITS( ARB_CNODE(&VSD(tdgstclust64, tdgst)->ctdtree, 0)->mu)); break; default: KASSERT(vs->dtype == VSD_DTYPE_TDGSTCLUST32 || vs->dtype == VSD_DTYPE_TDGSTCLUST64, ("%s: vs->dtype(%d) != VSD_DTYPE_TDGSTCLUST<32|64>", __func__, vs->dtype)); return (EINVAL); } /* * XXXLAS: Should have both a signed and unsigned 'x' variable to avoid * returning EOVERFLOW if the voival would have fit in a u64q_t. 
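 * Concretely, a u64 (or ulong) sample that exceeds the signed integer
 * range of 'x' makes the copy below fail with EOVERFLOW, even when an
 * unsigned Q type of the same fractional precision could have held it.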
*/ switch (voi_dtype) { case VSD_DTYPE_INT_S32: error = Q_QCPYVALI(&x, voival->int32.s32); break; case VSD_DTYPE_INT_U32: error = Q_QCPYVALI(&x, voival->int32.u32); break; case VSD_DTYPE_INT_S64: error = Q_QCPYVALI(&x, voival->int64.s64); break; case VSD_DTYPE_INT_U64: error = Q_QCPYVALI(&x, voival->int64.u64); break; case VSD_DTYPE_INT_SLONG: error = Q_QCPYVALI(&x, voival->intlong.slong); break; case VSD_DTYPE_INT_ULONG: error = Q_QCPYVALI(&x, voival->intlong.ulong); break; case VSD_DTYPE_Q_S32: error = Q_QCPYVALQ(&x, voival->q32.sq32); break; case VSD_DTYPE_Q_U32: error = Q_QCPYVALQ(&x, voival->q32.uq32); break; case VSD_DTYPE_Q_S64: error = Q_QCPYVALQ(&x, voival->q64.sq64); break; case VSD_DTYPE_Q_U64: error = Q_QCPYVALQ(&x, voival->q64.uq64); break; default: error = EINVAL; break; } if (error || (error = stats_v1_vsd_tdgst_add(vs->dtype, tdgst, x, 1, 1))) return (error); vs->flags |= VS_VSDVALID; return (0); } int stats_v1_voi_update(struct statsblobv1 *sb, int32_t voi_id, enum vsd_dtype voi_dtype, struct voistatdata *voival, uint32_t flags) { struct voi *v; struct voistat *vs; void *statevsd, *vsd; int error, i, tmperr; error = 0; if (sb == NULL || sb->abi != STATS_ABI_V1 || voi_id >= NVOIS(sb) || voi_dtype == 0 || voi_dtype >= VSD_NUM_DTYPES || voival == NULL) return (EINVAL); v = &sb->vois[voi_id]; if (voi_dtype != v->dtype || v->id < 0 || ((flags & SB_VOI_RELUPDATE) && !(v->flags & VOI_REQSTATE))) return (EINVAL); vs = BLOB_OFFSET(sb, v->stats_off); if (v->flags & VOI_REQSTATE) statevsd = BLOB_OFFSET(sb, vs->data_off); else statevsd = NULL; if (flags & SB_VOI_RELUPDATE) { switch (voi_dtype) { case VSD_DTYPE_INT_S32: voival->int32.s32 += VSD(voistate, statevsd)->prev.int32.s32; break; case VSD_DTYPE_INT_U32: voival->int32.u32 += VSD(voistate, statevsd)->prev.int32.u32; break; case VSD_DTYPE_INT_S64: voival->int64.s64 += VSD(voistate, statevsd)->prev.int64.s64; break; case VSD_DTYPE_INT_U64: voival->int64.u64 += VSD(voistate, statevsd)->prev.int64.u64; break; case VSD_DTYPE_INT_SLONG: voival->intlong.slong += VSD(voistate, statevsd)->prev.intlong.slong; break; case VSD_DTYPE_INT_ULONG: voival->intlong.ulong += VSD(voistate, statevsd)->prev.intlong.ulong; break; case VSD_DTYPE_Q_S32: error = Q_QADDQ(&voival->q32.sq32, VSD(voistate, statevsd)->prev.q32.sq32); break; case VSD_DTYPE_Q_U32: error = Q_QADDQ(&voival->q32.uq32, VSD(voistate, statevsd)->prev.q32.uq32); break; case VSD_DTYPE_Q_S64: error = Q_QADDQ(&voival->q64.sq64, VSD(voistate, statevsd)->prev.q64.sq64); break; case VSD_DTYPE_Q_U64: error = Q_QADDQ(&voival->q64.uq64, VSD(voistate, statevsd)->prev.q64.uq64); break; default: KASSERT(0, ("Unknown VOI data type %d", voi_dtype)); break; } } if (error) return (error); for (i = v->voistatmaxid; i > 0; i--) { vs = &((struct voistat *)BLOB_OFFSET(sb, v->stats_off))[i]; if (vs->stype < 0) continue; vsd = BLOB_OFFSET(sb, vs->data_off); switch (vs->stype) { case VS_STYPE_MAX: tmperr = stats_v1_voi_update_max(voi_dtype, voival, vs, vsd); break; case VS_STYPE_MIN: tmperr = stats_v1_voi_update_min(voi_dtype, voival, vs, vsd); break; case VS_STYPE_SUM: tmperr = stats_v1_voi_update_sum(voi_dtype, voival, vs, vsd); break; case VS_STYPE_HIST: tmperr = stats_v1_voi_update_hist(voi_dtype, voival, vs, vsd); break; case VS_STYPE_TDGST: tmperr = stats_v1_voi_update_tdgst(voi_dtype, voival, vs, vsd); break; default: KASSERT(0, ("Unknown VOI stat type %d", vs->stype)); break; } if (tmperr) { error = tmperr; VS_INCERRS(vs); } } if (statevsd) { switch (voi_dtype) { case VSD_DTYPE_INT_S32: VSD(voistate, 
statevsd)->prev.int32.s32 = voival->int32.s32; break; case VSD_DTYPE_INT_U32: VSD(voistate, statevsd)->prev.int32.u32 = voival->int32.u32; break; case VSD_DTYPE_INT_S64: VSD(voistate, statevsd)->prev.int64.s64 = voival->int64.s64; break; case VSD_DTYPE_INT_U64: VSD(voistate, statevsd)->prev.int64.u64 = voival->int64.u64; break; case VSD_DTYPE_INT_SLONG: VSD(voistate, statevsd)->prev.intlong.slong = voival->intlong.slong; break; case VSD_DTYPE_INT_ULONG: VSD(voistate, statevsd)->prev.intlong.ulong = voival->intlong.ulong; break; case VSD_DTYPE_Q_S32: error = Q_QCPYVALQ( &VSD(voistate, statevsd)->prev.q32.sq32, voival->q32.sq32); break; case VSD_DTYPE_Q_U32: error = Q_QCPYVALQ( &VSD(voistate, statevsd)->prev.q32.uq32, voival->q32.uq32); break; case VSD_DTYPE_Q_S64: error = Q_QCPYVALQ( &VSD(voistate, statevsd)->prev.q64.sq64, voival->q64.sq64); break; case VSD_DTYPE_Q_U64: error = Q_QCPYVALQ( &VSD(voistate, statevsd)->prev.q64.uq64, voival->q64.uq64); break; default: KASSERT(0, ("Unknown VOI data type %d", voi_dtype)); break; } } return (error); } #ifdef _KERNEL static void stats_init(void *arg) { } SYSINIT(stats, SI_SUB_KDTRACE, SI_ORDER_FIRST, stats_init, NULL); /* * Sysctl handler to display the list of available stats templates. */ static int stats_tpl_list_available(SYSCTL_HANDLER_ARGS) { struct sbuf *s; int err, i; err = 0; /* We can tolerate ntpl being stale, so do not take the lock. */ s = sbuf_new(NULL, NULL, /* +1 per tpl for , */ ntpl * (STATS_TPL_MAX_STR_SPEC_LEN + 1), SBUF_FIXEDLEN); if (s == NULL) return (ENOMEM); TPL_LIST_RLOCK(); for (i = 0; i < ntpl; i++) { err = sbuf_printf(s, "%s\"%s\":%u", i ? "," : "", tpllist[i]->mb->tplname, tpllist[i]->mb->tplhash); if (err) { /* Sbuf overflow condition. */ err = EOVERFLOW; break; } } TPL_LIST_RUNLOCK(); if (!err) { sbuf_finish(s); err = sysctl_handle_string(oidp, sbuf_data(s), 0, req); } sbuf_delete(s); return (err); } /* * Called by subsystem-specific sysctls to report and/or parse the list of * templates being sampled and their sampling rates. A stats_tpl_sr_cb_t * conformant function pointer must be passed in as arg1, which is used to * interact with the subsystem's stats template sample rates list. If arg2 > 0, * a zero-initialised allocation of arg2-sized contextual memory is * heap-allocated and passed in to all subsystem callbacks made during the * operation of stats_tpl_sample_rates(). * * XXXLAS: Assumes templates are never removed, which is currently true but may * need to be reworked in future if dynamic template management becomes a * requirement e.g. to support kernel module based templates. */ int stats_tpl_sample_rates(SYSCTL_HANDLER_ARGS) { char kvpair_fmt[16], tplspec_fmt[16]; char tpl_spec[STATS_TPL_MAX_STR_SPEC_LEN]; char tpl_name[TPL_MAX_NAME_LEN + 2]; /* +2 for "" */ stats_tpl_sr_cb_t subsys_cb; void *subsys_ctx; char *buf, *new_rates_usr_str, *tpl_name_p; struct stats_tpl_sample_rate *rates; struct sbuf *s, _s; uint32_t cum_pct, pct, tpl_hash; int err, i, off, len, newlen, nrates; buf = NULL; rates = NULL; err = nrates = 0; subsys_cb = (stats_tpl_sr_cb_t)arg1; KASSERT(subsys_cb != NULL, ("%s: subsys_cb == arg1 == NULL", __func__)); if (arg2 > 0) subsys_ctx = malloc(arg2, M_TEMP, M_WAITOK | M_ZERO); else subsys_ctx = NULL; /* Grab current count of subsystem rates. 
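 * This is the first of several callback ops used here: the unlocked count
 * fetched below sizes the output buffer, TPL_SR_RLOCKED_GET/TPL_SR_RUNLOCK
 * bracket rendering of the current rates list, and TPL_SR_PUT hands a newly
 * parsed list back to the subsystem.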
*/ err = subsys_cb(TPL_SR_UNLOCKED_GET, NULL, &nrates, subsys_ctx); if (err) goto done; /* +1 to ensure we can append '\0' post copyin, +5 per rate for =nnn, */ len = max(req->newlen + 1, nrates * (STATS_TPL_MAX_STR_SPEC_LEN + 5)); if (req->oldptr != NULL || req->newptr != NULL) buf = malloc(len, M_TEMP, M_WAITOK); if (req->oldptr != NULL) { if (nrates == 0) { /* No rates, so return an empty string via oldptr. */ err = SYSCTL_OUT(req, "", 1); if (err) goto done; goto process_new; } s = sbuf_new(&_s, buf, len, SBUF_FIXEDLEN | SBUF_INCLUDENUL); /* Grab locked count of, and ptr to, subsystem rates. */ err = subsys_cb(TPL_SR_RLOCKED_GET, &rates, &nrates, subsys_ctx); if (err) goto done; TPL_LIST_RLOCK(); for (i = 0; i < nrates && !err; i++) { err = sbuf_printf(s, "%s\"%s\":%u=%u", i ? "," : "", tpllist[rates[i].tpl_slot_id]->mb->tplname, tpllist[rates[i].tpl_slot_id]->mb->tplhash, rates[i].tpl_sample_pct); } TPL_LIST_RUNLOCK(); /* Tell subsystem that we're done with its rates list. */ err = subsys_cb(TPL_SR_RUNLOCK, &rates, &nrates, subsys_ctx); if (err) goto done; err = sbuf_finish(s); if (err) goto done; /* We lost a race for buf to be too small. */ /* Return the rendered string data via oldptr. */ err = SYSCTL_OUT(req, sbuf_data(s), sbuf_len(s)); } else { /* Return the upper bound size for buffer sizing requests. */ err = SYSCTL_OUT(req, NULL, len); } process_new: if (err || req->newptr == NULL) goto done; newlen = req->newlen - req->newidx; err = SYSCTL_IN(req, buf, newlen); if (err) goto done; /* * Initialise format strings at run time. * * Write the max template spec string length into the * template_spec=percent key-value pair parsing format string as: * " %[^=]=%u %n" * * Write the max template name string length into the tplname:tplhash * parsing format string as: * "%[^:]:%u" * * Subtract 1 for \0 appended by sscanf(). */ sprintf(kvpair_fmt, " %%%zu[^=]=%%u %%n", sizeof(tpl_spec) - 1); sprintf(tplspec_fmt, "%%%zu[^:]:%%u", sizeof(tpl_name) - 1); /* * Parse each CSV key-value pair specifying a template and its sample * percentage. Whitespace either side of a key-value pair is ignored. * Templates can be specified by name, hash, or name and hash per the * following formats (chars in [] are optional): * ["]["]= * :hash=pct * ["]["]:hash= */ cum_pct = nrates = 0; rates = NULL; buf[newlen] = '\0'; /* buf is at least newlen+1 in size. */ new_rates_usr_str = buf; while (isspace(*new_rates_usr_str)) new_rates_usr_str++; /* Skip leading whitespace. */ while (*new_rates_usr_str != '\0') { tpl_name_p = tpl_name; tpl_name[0] = '\0'; tpl_hash = 0; off = 0; /* * Parse key-value pair which must perform 2 conversions, then * parse the template spec to extract either name, hash, or name * and hash depending on the three possible spec formats. The * tplspec_fmt format specifier parses name or name and hash * template specs, while the ":%u" format specifier parses * hash-only template specs. If parsing is successfull, ensure * the cumulative sampling percentage does not exceed 100. */ err = EINVAL; if (2 != sscanf(new_rates_usr_str, kvpair_fmt, tpl_spec, &pct, &off)) break; if ((1 > sscanf(tpl_spec, tplspec_fmt, tpl_name, &tpl_hash)) && (1 != sscanf(tpl_spec, ":%u", &tpl_hash))) break; if ((cum_pct += pct) > 100) break; err = 0; /* Strip surrounding "" from template name if present. */ len = strlen(tpl_name); if (len > 0) { if (tpl_name[len - 1] == '"') tpl_name[--len] = '\0'; if (tpl_name[0] == '"') { tpl_name_p++; len--; } } rates = stats_realloc(rates, 0, /* oldsz is unused in kernel. 
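 * For example (template names and hashes purely illustrative), an input of
 *   mytpl=50,"othertpl":1234567890=25,:987654321=25
 * uses the name-only, name:hash and hash-only spec formats described above
 * with a cumulative sampling percentage of 100.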
*/ (nrates + 1) * sizeof(*rates), M_WAITOK); rates[nrates].tpl_slot_id = stats_tpl_fetch_allocid(len ? tpl_name_p : NULL, tpl_hash); if (rates[nrates].tpl_slot_id < 0) { err = -rates[nrates].tpl_slot_id; break; } rates[nrates].tpl_sample_pct = pct; nrates++; new_rates_usr_str += off; if (*new_rates_usr_str != ',') break; /* End-of-input or malformed. */ new_rates_usr_str++; /* Move past comma to next pair. */ } if (!err) { if ((new_rates_usr_str - buf) < newlen) { /* Entire input has not been consumed. */ err = EINVAL; } else { /* * Give subsystem the new rates. They'll return the * appropriate rates pointer for us to garbage collect. */ err = subsys_cb(TPL_SR_PUT, &rates, &nrates, subsys_ctx); } } stats_free(rates); done: free(buf, M_TEMP); free(subsys_ctx, M_TEMP); return (err); } SYSCTL_NODE(_kern, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, "stats(9) MIB"); SYSCTL_PROC(_kern_stats, OID_AUTO, templates, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, stats_tpl_list_available, "A", "list the name/hash of all available stats(9) templates"); #else /* ! _KERNEL */ static void __attribute__ ((constructor)) stats_constructor(void) { pthread_rwlock_init(&tpllistlock, NULL); } static void __attribute__ ((destructor)) stats_destructor(void) { pthread_rwlock_destroy(&tpllistlock); } #endif /* _KERNEL */ diff --git a/sys/kern/subr_witness.c b/sys/kern/subr_witness.c index 5b9f8afd9565..aa189e8cd057 100644 --- a/sys/kern/subr_witness.c +++ b/sys/kern/subr_witness.c @@ -1,3183 +1,3183 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2008 Isilon Systems, Inc. * Copyright (c) 2008 Ilya Maykov * Copyright (c) 1998 Berkeley Software Design, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Berkeley Software Design Inc's name may not be used to endorse or * promote products derived from this software without specific prior * written permission. * * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from BSDI $Id: mutex_witness.c,v 1.1.2.20 2000/04/27 03:10:27 cp Exp $ * and BSDI $Id: synch_machdep.c,v 2.3.2.39 2000/04/27 03:10:25 cp Exp $ */ /* * Implementation of the `witness' lock verifier. Originally implemented for * mutexes in BSD/OS. Extended to handle generic lock objects and lock * classes in FreeBSD. 
*/ /* * Main Entry: witness * Pronunciation: 'wit-n&s * Function: noun * Etymology: Middle English witnesse, from Old English witnes knowledge, * testimony, witness, from 2wit * Date: before 12th century * 1 : attestation of a fact or event : TESTIMONY * 2 : one that gives evidence; specifically : one who testifies in * a cause or before a judicial tribunal * 3 : one asked to be present at a transaction so as to be able to * testify to its having taken place * 4 : one who has personal knowledge of something * 5 a : something serving as evidence or proof : SIGN * b : public affirmation by word or example of usually * religious faith or conviction * 6 capitalized : a member of the Jehovah's Witnesses */ /* * Special rules concerning Giant and lock orders: * * 1) Giant must be acquired before any other mutexes. Stated another way, * no other mutex may be held when Giant is acquired. * * 2) Giant must be released when blocking on a sleepable lock. * * This rule is less obvious, but is a result of Giant providing the same * semantics as spl(). Basically, when a thread sleeps, it must release * Giant. When a thread blocks on a sleepable lock, it sleeps. Hence rule * 2). * * 3) Giant may be acquired before or after sleepable locks. * * This rule is also not quite as obvious. Giant may be acquired after * a sleepable lock because it is a non-sleepable lock and non-sleepable * locks may always be acquired while holding a sleepable lock. The second * case, Giant before a sleepable lock, follows from rule 2) above. Suppose * you have two threads T1 and T2 and a sleepable lock X. Suppose that T1 * acquires X and blocks on Giant. Then suppose that T2 acquires Giant and * blocks on X. When T2 blocks on X, T2 will release Giant allowing T1 to * execute. Thus, acquiring Giant both before and after a sleepable lock * will not result in a lock order reversal. */ #include #include "opt_ddb.h" #include "opt_hwpmc_hooks.h" #include "opt_stack.h" #include "opt_witness.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #include #if !defined(DDB) && !defined(STACK) #error "DDB or STACK options are required for WITNESS" #endif /* Note that these traces do not work with KTR_ALQ. */ #if 0 #define KTR_WITNESS KTR_SUBSYS #else #define KTR_WITNESS 0 #endif #define LI_RECURSEMASK 0x0000ffff /* Recursion depth of lock instance. */ #define LI_EXCLUSIVE 0x00010000 /* Exclusive lock instance. */ #define LI_NORELEASE 0x00020000 /* Lock not allowed to be released. */ #define LI_SLEEPABLE 0x00040000 /* Lock may be held while sleeping. */ #ifndef WITNESS_COUNT #define WITNESS_COUNT 1536 #endif #define WITNESS_HASH_SIZE 251 /* Prime, gives load factor < 2 */ #define WITNESS_PENDLIST (512 + (MAXCPU * 4)) /* Allocate 256 KB of stack data space */ #define WITNESS_LO_DATA_COUNT 2048 /* Prime, gives load factor of ~2 at full load */ #define WITNESS_LO_HASH_SIZE 1021 /* * XXX: This is somewhat bogus, as we assume here that at most 2048 threads * will hold LOCK_NCHILDREN locks. We handle failure ok, and we should * probably be safe for the most part, but it's still a SWAG. */ #define LOCK_NCHILDREN 5 #define LOCK_CHILDCOUNT 2048 #define MAX_W_NAME 64 #define FULLGRAPH_SBUF_SIZE 512 /* * These flags go in the witness relationship matrix and describe the * relationship between any two struct witness objects. */ #define WITNESS_UNRELATED 0x00 /* No lock order relation. 
*/ #define WITNESS_PARENT 0x01 /* Parent, aka direct ancestor. */ #define WITNESS_ANCESTOR 0x02 /* Direct or indirect ancestor. */ #define WITNESS_CHILD 0x04 /* Child, aka direct descendant. */ #define WITNESS_DESCENDANT 0x08 /* Direct or indirect descendant. */ #define WITNESS_ANCESTOR_MASK (WITNESS_PARENT | WITNESS_ANCESTOR) #define WITNESS_DESCENDANT_MASK (WITNESS_CHILD | WITNESS_DESCENDANT) #define WITNESS_RELATED_MASK \ (WITNESS_ANCESTOR_MASK | WITNESS_DESCENDANT_MASK) #define WITNESS_REVERSAL 0x10 /* A lock order reversal has been * observed. */ #define WITNESS_RESERVED1 0x20 /* Unused flag, reserved. */ #define WITNESS_RESERVED2 0x40 /* Unused flag, reserved. */ #define WITNESS_LOCK_ORDER_KNOWN 0x80 /* This lock order is known. */ /* Descendant to ancestor flags */ #define WITNESS_DTOA(x) (((x) & WITNESS_RELATED_MASK) >> 2) /* Ancestor to descendant flags */ #define WITNESS_ATOD(x) (((x) & WITNESS_RELATED_MASK) << 2) #define WITNESS_INDEX_ASSERT(i) \ MPASS((i) > 0 && (i) <= w_max_used_index && (i) < witness_count) static MALLOC_DEFINE(M_WITNESS, "Witness", "Witness"); /* * Lock instances. A lock instance is the data associated with a lock while * it is held by witness. For example, a lock instance will hold the * recursion count of a lock. Lock instances are held in lists. Spin locks * are held in a per-cpu list while sleep locks are held in per-thread list. */ struct lock_instance { struct lock_object *li_lock; const char *li_file; int li_line; u_int li_flags; }; /* * A simple list type used to build the list of locks held by a thread * or CPU. We can't simply embed the list in struct lock_object since a * lock may be held by more than one thread if it is a shared lock. Locks * are added to the head of the list, so we fill up each list entry from * "the back" logically. To ease some of the arithmetic, we actually fill * in each list entry the normal way (children[0] then children[1], etc.) but * when we traverse the list we read children[count-1] as the first entry * down to children[0] as the final entry. */ struct lock_list_entry { struct lock_list_entry *ll_next; struct lock_instance ll_children[LOCK_NCHILDREN]; u_int ll_count; }; /* * The main witness structure. One of these per named lock type in the system * (for example, "vnode interlock"). */ struct witness { char w_name[MAX_W_NAME]; uint32_t w_index; /* Index in the relationship matrix */ struct lock_class *w_class; STAILQ_ENTRY(witness) w_list; /* List of all witnesses. */ STAILQ_ENTRY(witness) w_typelist; /* Witnesses of a type. */ struct witness *w_hash_next; /* Linked list in hash buckets. */ const char *w_file; /* File where last acquired */ uint32_t w_line; /* Line where last acquired */ uint32_t w_refcount; uint16_t w_num_ancestors; /* direct/indirect * ancestor count */ uint16_t w_num_descendants; /* direct/indirect * descendant count */ int16_t w_ddb_level; unsigned w_displayed:1; unsigned w_reversed:1; }; STAILQ_HEAD(witness_list, witness); /* * The witness hash table. Keys are witness names (const char *), elements are * witness objects (struct witness *). */ struct witness_hash { struct witness *wh_array[WITNESS_HASH_SIZE]; uint32_t wh_size; uint32_t wh_count; }; /* * Key type for the lock order data hash table. */ struct witness_lock_order_key { uint16_t from; uint16_t to; }; struct witness_lock_order_data { struct stack wlod_stack; struct witness_lock_order_key wlod_key; struct witness_lock_order_data *wlod_next; }; /* * The witness lock order data hash table. 
Keys are witness index tuples * (struct witness_lock_order_key), elements are lock order data objects * (struct witness_lock_order_data). */ struct witness_lock_order_hash { struct witness_lock_order_data *wloh_array[WITNESS_LO_HASH_SIZE]; u_int wloh_size; u_int wloh_count; }; struct witness_blessed { const char *b_lock1; const char *b_lock2; }; struct witness_pendhelp { const char *wh_type; struct lock_object *wh_lock; }; struct witness_order_list_entry { const char *w_name; struct lock_class *w_class; }; /* * Returns 0 if one of the locks is a spin lock and the other is not. * Returns 1 otherwise. */ static __inline int witness_lock_type_equal(struct witness *w1, struct witness *w2) { return ((w1->w_class->lc_flags & (LC_SLEEPLOCK | LC_SPINLOCK)) == (w2->w_class->lc_flags & (LC_SLEEPLOCK | LC_SPINLOCK))); } static __inline int witness_lock_order_key_equal(const struct witness_lock_order_key *a, const struct witness_lock_order_key *b) { return (a->from == b->from && a->to == b->to); } static int _isitmyx(struct witness *w1, struct witness *w2, int rmask, const char *fname); static void adopt(struct witness *parent, struct witness *child); static int blessed(struct witness *, struct witness *); static void depart(struct witness *w); static struct witness *enroll(const char *description, struct lock_class *lock_class); static struct lock_instance *find_instance(struct lock_list_entry *list, const struct lock_object *lock); static int isitmychild(struct witness *parent, struct witness *child); static int isitmydescendant(struct witness *parent, struct witness *child); static void itismychild(struct witness *parent, struct witness *child); static int sysctl_debug_witness_badstacks(SYSCTL_HANDLER_ARGS); static int sysctl_debug_witness_watch(SYSCTL_HANDLER_ARGS); static int sysctl_debug_witness_fullgraph(SYSCTL_HANDLER_ARGS); static int sysctl_debug_witness_channel(SYSCTL_HANDLER_ARGS); static void witness_add_fullgraph(struct sbuf *sb, struct witness *parent); #ifdef DDB static void witness_ddb_compute_levels(void); static void witness_ddb_display(int(*)(const char *fmt, ...)); static void witness_ddb_display_descendants(int(*)(const char *fmt, ...), struct witness *, int indent); static void witness_ddb_display_list(int(*prnt)(const char *fmt, ...), struct witness_list *list); static void witness_ddb_level_descendants(struct witness *parent, int l); static void witness_ddb_list(struct thread *td); #endif static void witness_enter_debugger(const char *msg); static void witness_debugger(int cond, const char *msg); static void witness_free(struct witness *m); static struct witness *witness_get(void); static uint32_t witness_hash_djb2(const uint8_t *key, uint32_t size); static struct witness *witness_hash_get(const char *key); static void witness_hash_put(struct witness *w); static void witness_init_hash_tables(void); static void witness_increment_graph_generation(void); static void witness_lock_list_free(struct lock_list_entry *lle); static struct lock_list_entry *witness_lock_list_get(void); static int witness_lock_order_add(struct witness *parent, struct witness *child); static int witness_lock_order_check(struct witness *parent, struct witness *child); static struct witness_lock_order_data *witness_lock_order_get( struct witness *parent, struct witness *child); static void witness_list_lock(struct lock_instance *instance, int (*prnt)(const char *fmt, ...)); static int witness_output(const char *fmt, ...) 
__printflike(1, 2); static int witness_output_drain(void *arg __unused, const char *data, int len); static int witness_voutput(const char *fmt, va_list ap) __printflike(1, 0); static void witness_setflag(struct lock_object *lock, int flag, int set); FEATURE(witness, "kernel has witness(9) support"); static SYSCTL_NODE(_debug, OID_AUTO, witness, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, "Witness Locking"); /* * If set to 0, lock order checking is disabled. If set to -1, * witness is completely disabled. Otherwise witness performs full * lock order checking for all locks. At runtime, lock order checking * may be toggled. However, witness cannot be reenabled once it is * completely disabled. */ static int witness_watch = 1; SYSCTL_PROC(_debug_witness, OID_AUTO, watch, CTLFLAG_RWTUN | CTLTYPE_INT | CTLFLAG_MPSAFE, NULL, 0, sysctl_debug_witness_watch, "I", "witness is watching lock operations"); #ifdef KDB /* * When KDB is enabled and witness_kdb is 1, it will cause the system * to drop into kdebug() when: * - a lock hierarchy violation occurs * - locks are held when going to sleep. */ #ifdef WITNESS_KDB int witness_kdb = 1; #else int witness_kdb = 0; #endif SYSCTL_INT(_debug_witness, OID_AUTO, kdb, CTLFLAG_RWTUN, &witness_kdb, 0, ""); #endif /* KDB */ #if defined(DDB) || defined(KDB) /* * When DDB or KDB is enabled and witness_trace is 1, it will cause the system * to print a stack trace: * - a lock hierarchy violation occurs * - locks are held when going to sleep. */ int witness_trace = 1; SYSCTL_INT(_debug_witness, OID_AUTO, trace, CTLFLAG_RWTUN, &witness_trace, 0, ""); #endif /* DDB || KDB */ #ifdef WITNESS_SKIPSPIN int witness_skipspin = 1; #else int witness_skipspin = 0; #endif SYSCTL_INT(_debug_witness, OID_AUTO, skipspin, CTLFLAG_RDTUN, &witness_skipspin, 0, ""); int badstack_sbuf_size; int witness_count = WITNESS_COUNT; SYSCTL_INT(_debug_witness, OID_AUTO, witness_count, CTLFLAG_RDTUN, &witness_count, 0, ""); /* * Output channel for witness messages. By default we print to the console. */ enum witness_channel { WITNESS_CONSOLE, WITNESS_LOG, WITNESS_NONE, }; static enum witness_channel witness_channel = WITNESS_CONSOLE; SYSCTL_PROC(_debug_witness, OID_AUTO, output_channel, CTLTYPE_STRING | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, NULL, 0, sysctl_debug_witness_channel, "A", "Output channel for warnings"); /* * Call this to print out the relations between locks. */ SYSCTL_PROC(_debug_witness, OID_AUTO, fullgraph, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_debug_witness_fullgraph, "A", "Show locks relation graphs"); /* * Call this to print out the witness faulty stacks. 
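 * Reached via sysctl(8), e.g. `sysctl debug.witness.badstacks' (and
 * `sysctl debug.witness.fullgraph' for the relation graph above).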
*/ SYSCTL_PROC(_debug_witness, OID_AUTO, badstacks, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_debug_witness_badstacks, "A", "Show bad witness stacks"); static struct mtx w_mtx; /* w_list */ static struct witness_list w_free = STAILQ_HEAD_INITIALIZER(w_free); static struct witness_list w_all = STAILQ_HEAD_INITIALIZER(w_all); /* w_typelist */ static struct witness_list w_spin = STAILQ_HEAD_INITIALIZER(w_spin); static struct witness_list w_sleep = STAILQ_HEAD_INITIALIZER(w_sleep); /* lock list */ static struct lock_list_entry *w_lock_list_free = NULL; static struct witness_pendhelp pending_locks[WITNESS_PENDLIST]; static u_int pending_cnt; static int w_free_cnt, w_spin_cnt, w_sleep_cnt; SYSCTL_INT(_debug_witness, OID_AUTO, free_cnt, CTLFLAG_RD, &w_free_cnt, 0, ""); SYSCTL_INT(_debug_witness, OID_AUTO, spin_cnt, CTLFLAG_RD, &w_spin_cnt, 0, ""); SYSCTL_INT(_debug_witness, OID_AUTO, sleep_cnt, CTLFLAG_RD, &w_sleep_cnt, 0, ""); static struct witness *w_data; static uint8_t **w_rmatrix; static struct lock_list_entry w_locklistdata[LOCK_CHILDCOUNT]; static struct witness_hash w_hash; /* The witness hash table. */ /* The lock order data hash */ static struct witness_lock_order_data w_lodata[WITNESS_LO_DATA_COUNT]; static struct witness_lock_order_data *w_lofree = NULL; static struct witness_lock_order_hash w_lohash; static int w_max_used_index = 0; static unsigned int w_generation = 0; static const char w_notrunning[] = "Witness not running\n"; static const char w_stillcold[] = "Witness is still cold\n"; #ifdef __i386__ static const char w_notallowed[] = "The sysctl is disabled on the arch\n"; #endif static struct witness_order_list_entry order_lists[] = { /* * sx locks */ { "proctree", &lock_class_sx }, { "allproc", &lock_class_sx }, { "allprison", &lock_class_sx }, { NULL, NULL }, /* * Various mutexes */ { "Giant", &lock_class_mtx_sleep }, { "pipe mutex", &lock_class_mtx_sleep }, { "sigio lock", &lock_class_mtx_sleep }, { "process group", &lock_class_mtx_sleep }, #ifdef HWPMC_HOOKS { "pmc-sleep", &lock_class_mtx_sleep }, #endif { "process lock", &lock_class_mtx_sleep }, { "session", &lock_class_mtx_sleep }, { "uidinfo hash", &lock_class_rw }, { "time lock", &lock_class_mtx_sleep }, { NULL, NULL }, /* * umtx */ { "umtx lock", &lock_class_mtx_sleep }, { NULL, NULL }, /* * Sockets */ { "accept", &lock_class_mtx_sleep }, { "so_snd", &lock_class_mtx_sleep }, { "so_rcv", &lock_class_mtx_sleep }, { "sellck", &lock_class_mtx_sleep }, { NULL, NULL }, /* * Routing */ { "so_rcv", &lock_class_mtx_sleep }, { "radix node head", &lock_class_rm }, { "ifaddr", &lock_class_mtx_sleep }, { NULL, NULL }, /* * IPv4 multicast: * protocol locks before interface locks, after UDP locks. */ { "in_multi_sx", &lock_class_sx }, { "udpinp", &lock_class_rw }, { "in_multi_list_mtx", &lock_class_mtx_sleep }, { "igmp_mtx", &lock_class_mtx_sleep }, { "if_addr_lock", &lock_class_mtx_sleep }, { NULL, NULL }, /* * IPv6 multicast: * protocol locks before interface locks, after UDP locks. 
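 * As with every group in this table, the entries below are listed in
 * required acquisition order: witness_startup() enrolls each entry as a
 * child of the one before it.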
*/ { "in6_multi_sx", &lock_class_sx }, { "udpinp", &lock_class_rw }, { "in6_multi_list_mtx", &lock_class_mtx_sleep }, { "mld_mtx", &lock_class_mtx_sleep }, { "if_addr_lock", &lock_class_mtx_sleep }, { NULL, NULL }, /* * UNIX Domain Sockets */ { "unp_link_rwlock", &lock_class_rw }, { "unp_list_lock", &lock_class_mtx_sleep }, { "unp", &lock_class_mtx_sleep }, { "so_snd", &lock_class_mtx_sleep }, { NULL, NULL }, /* * UDP/IP */ { "udpinp", &lock_class_rw }, { "udp", &lock_class_mtx_sleep }, { "so_snd", &lock_class_mtx_sleep }, { NULL, NULL }, /* * TCP/IP */ { "tcpinp", &lock_class_rw }, { "tcp", &lock_class_mtx_sleep }, { "so_snd", &lock_class_mtx_sleep }, { NULL, NULL }, /* * BPF */ { "bpf global lock", &lock_class_sx }, { "bpf cdev lock", &lock_class_mtx_sleep }, { NULL, NULL }, /* * NFS server */ { "nfsd_mtx", &lock_class_mtx_sleep }, { "so_snd", &lock_class_mtx_sleep }, { NULL, NULL }, /* * IEEE 802.11 */ { "802.11 com lock", &lock_class_mtx_sleep}, { NULL, NULL }, /* * Network drivers */ { "network driver", &lock_class_mtx_sleep}, { NULL, NULL }, /* * Netgraph */ { "ng_node", &lock_class_mtx_sleep }, { "ng_worklist", &lock_class_mtx_sleep }, { NULL, NULL }, /* * CDEV */ { "vm map (system)", &lock_class_mtx_sleep }, { "vnode interlock", &lock_class_mtx_sleep }, { "cdev", &lock_class_mtx_sleep }, { "devthrd", &lock_class_mtx_sleep }, { NULL, NULL }, /* * VM */ { "vm map (user)", &lock_class_sx }, { "vm object", &lock_class_rw }, { "vm page", &lock_class_mtx_sleep }, { "pmap pv global", &lock_class_rw }, { "pmap", &lock_class_mtx_sleep }, { "pmap pv list", &lock_class_rw }, { "vm page free queue", &lock_class_mtx_sleep }, { "vm pagequeue", &lock_class_mtx_sleep }, { NULL, NULL }, /* * kqueue/VFS interaction */ { "kqueue", &lock_class_mtx_sleep }, { "struct mount mtx", &lock_class_mtx_sleep }, { "vnode interlock", &lock_class_mtx_sleep }, { NULL, NULL }, /* * VFS namecache */ { "ncvn", &lock_class_mtx_sleep }, { "ncbuc", &lock_class_mtx_sleep }, { "vnode interlock", &lock_class_mtx_sleep }, { "ncneg", &lock_class_mtx_sleep }, { NULL, NULL }, /* * ZFS locking */ { "dn->dn_mtx", &lock_class_sx }, { "dr->dt.di.dr_mtx", &lock_class_sx }, { "db->db_mtx", &lock_class_sx }, { NULL, NULL }, /* * TCP log locks */ { "TCP ID tree", &lock_class_rw }, { "tcp log id bucket", &lock_class_mtx_sleep }, { "tcpinp", &lock_class_rw }, { "TCP log expireq", &lock_class_mtx_sleep }, { NULL, NULL }, /* * spin locks */ #ifdef SMP { "ap boot", &lock_class_mtx_spin }, #endif { "rm.mutex_mtx", &lock_class_mtx_spin }, #ifdef __i386__ { "cy", &lock_class_mtx_spin }, #endif { "scc_hwmtx", &lock_class_mtx_spin }, { "uart_hwmtx", &lock_class_mtx_spin }, { "fast_taskqueue", &lock_class_mtx_spin }, { "intr table", &lock_class_mtx_spin }, { "process slock", &lock_class_mtx_spin }, { "syscons video lock", &lock_class_mtx_spin }, { "sleepq chain", &lock_class_mtx_spin }, { "rm_spinlock", &lock_class_mtx_spin }, { "turnstile chain", &lock_class_mtx_spin }, { "turnstile lock", &lock_class_mtx_spin }, { "sched lock", &lock_class_mtx_spin }, { "td_contested", &lock_class_mtx_spin }, { "callout", &lock_class_mtx_spin }, { "entropy harvest mutex", &lock_class_mtx_spin }, #ifdef SMP { "smp rendezvous", &lock_class_mtx_spin }, #endif #ifdef __powerpc__ { "tlb0", &lock_class_mtx_spin }, #endif { NULL, NULL }, { "sched lock", &lock_class_mtx_spin }, #ifdef HWPMC_HOOKS { "pmc-per-proc", &lock_class_mtx_spin }, #endif { NULL, NULL }, /* * leaf locks */ { "intrcnt", &lock_class_mtx_spin }, { "icu", &lock_class_mtx_spin }, #ifdef __i386__ { 
"allpmaps", &lock_class_mtx_spin }, { "descriptor tables", &lock_class_mtx_spin }, #endif { "clk", &lock_class_mtx_spin }, { "cpuset", &lock_class_mtx_spin }, { "mprof lock", &lock_class_mtx_spin }, { "zombie lock", &lock_class_mtx_spin }, { "ALD Queue", &lock_class_mtx_spin }, #if defined(__i386__) || defined(__amd64__) { "pcicfg", &lock_class_mtx_spin }, { "NDIS thread lock", &lock_class_mtx_spin }, #endif { "tw_osl_io_lock", &lock_class_mtx_spin }, { "tw_osl_q_lock", &lock_class_mtx_spin }, { "tw_cl_io_lock", &lock_class_mtx_spin }, { "tw_cl_intr_lock", &lock_class_mtx_spin }, { "tw_cl_gen_lock", &lock_class_mtx_spin }, #ifdef HWPMC_HOOKS { "pmc-leaf", &lock_class_mtx_spin }, #endif { "blocked lock", &lock_class_mtx_spin }, { NULL, NULL }, { NULL, NULL } }; /* * Pairs of locks which have been blessed. Witness does not complain about * order problems with blessed lock pairs. Please do not add an entry to the * table without an explanatory comment. */ static struct witness_blessed blessed_list[] = { /* * See the comment in ufs_dirhash.c. Basically, a vnode lock serializes * both lock orders, so a deadlock cannot happen as a result of this * LOR. */ { "dirhash", "bufwait" }, /* * A UFS vnode may be locked in vget() while a buffer belonging to the * parent directory vnode is locked. */ { "ufs", "bufwait" }, /* * The tarfs decompression stream vnode may be locked while a * buffer belonging to a tarfs data vnode is locked. */ { "tarfs", "bufwait" }, }; /* * This global is set to 0 once it becomes safe to use the witness code. */ static int witness_cold = 1; /* * This global is set to 1 once the static lock orders have been enrolled * so that a warning can be issued for any spin locks enrolled later. */ static int witness_spin_warn = 0; /* Trim useless garbage from filenames. */ static const char * fixup_filename(const char *file) { if (file == NULL) return (NULL); while (strncmp(file, "../", 3) == 0) file += 3; return (file); } /* * Calculate the size of early witness structures. */ int witness_startup_count(void) { int sz; sz = sizeof(struct witness) * witness_count; sz += sizeof(*w_rmatrix) * (witness_count + 1); sz += sizeof(*w_rmatrix[0]) * (witness_count + 1) * (witness_count + 1); return (sz); } /* * The WITNESS-enabled diagnostic code. Note that the witness code does * assume that the early boot is single-threaded at least until after this * routine is completed. */ void witness_startup(void *mem) { struct lock_object *lock; struct witness_order_list_entry *order; struct witness *w, *w1; uintptr_t p; int i; p = (uintptr_t)mem; w_data = (void *)p; p += sizeof(struct witness) * witness_count; w_rmatrix = (void *)p; p += sizeof(*w_rmatrix) * (witness_count + 1); for (i = 0; i < witness_count + 1; i++) { w_rmatrix[i] = (void *)p; p += sizeof(*w_rmatrix[i]) * (witness_count + 1); } badstack_sbuf_size = witness_count * 256; /* * We have to release Giant before initializing its witness * structure so that WITNESS doesn't get confused. */ mtx_unlock(&Giant); mtx_assert(&Giant, MA_NOTOWNED); CTR1(KTR_WITNESS, "%s: initializing witness", __func__); mtx_init(&w_mtx, "witness lock", NULL, MTX_SPIN | MTX_QUIET | MTX_NOWITNESS | MTX_NOPROFILE); for (i = witness_count - 1; i >= 0; i--) { w = &w_data[i]; memset(w, 0, sizeof(*w)); w_data[i].w_index = i; /* Witness index never changes. */ witness_free(w); } KASSERT(STAILQ_FIRST(&w_free)->w_index == 0, ("%s: Invalid list of free witness objects", __func__)); /* Witness with index 0 is not used to aid in debugging. 
*/ STAILQ_REMOVE_HEAD(&w_free, w_list); w_free_cnt--; for (i = 0; i < witness_count; i++) { memset(w_rmatrix[i], 0, sizeof(*w_rmatrix[i]) * (witness_count + 1)); } for (i = 0; i < LOCK_CHILDCOUNT; i++) witness_lock_list_free(&w_locklistdata[i]); witness_init_hash_tables(); /* First add in all the specified order lists. */ for (order = order_lists; order->w_name != NULL; order++) { w = enroll(order->w_name, order->w_class); if (w == NULL) continue; w->w_file = "order list"; for (order++; order->w_name != NULL; order++) { w1 = enroll(order->w_name, order->w_class); if (w1 == NULL) continue; w1->w_file = "order list"; itismychild(w, w1); w = w1; } } witness_spin_warn = 1; /* Iterate through all locks and add them to witness. */ for (i = 0; pending_locks[i].wh_lock != NULL; i++) { lock = pending_locks[i].wh_lock; KASSERT(lock->lo_flags & LO_WITNESS, ("%s: lock %s is on pending list but not LO_WITNESS", __func__, lock->lo_name)); lock->lo_witness = enroll(pending_locks[i].wh_type, LOCK_CLASS(lock)); } /* Mark the witness code as being ready for use. */ witness_cold = 0; mtx_lock(&Giant); } void witness_init(struct lock_object *lock, const char *type) { struct lock_class *class; /* Various sanity checks. */ class = LOCK_CLASS(lock); if ((lock->lo_flags & LO_RECURSABLE) != 0 && (class->lc_flags & LC_RECURSABLE) == 0) kassert_panic("%s: lock (%s) %s can not be recursable", __func__, class->lc_name, lock->lo_name); if ((lock->lo_flags & LO_SLEEPABLE) != 0 && (class->lc_flags & LC_SLEEPABLE) == 0) kassert_panic("%s: lock (%s) %s can not be sleepable", __func__, class->lc_name, lock->lo_name); if ((lock->lo_flags & LO_UPGRADABLE) != 0 && (class->lc_flags & LC_UPGRADABLE) == 0) kassert_panic("%s: lock (%s) %s can not be upgradable", __func__, class->lc_name, lock->lo_name); /* * If we shouldn't watch this lock, then just clear lo_witness. * Otherwise, if witness_cold is set, then it is too early to * enroll this lock, so defer it to witness_initialize() by adding * it to the pending_locks list. If it is not too early, then enroll * the lock now. */ if (witness_watch < 1 || KERNEL_PANICKED() || (lock->lo_flags & LO_WITNESS) == 0) lock->lo_witness = NULL; else if (witness_cold) { pending_locks[pending_cnt].wh_lock = lock; pending_locks[pending_cnt++].wh_type = type; if (pending_cnt > WITNESS_PENDLIST) panic("%s: pending locks list is too small, " "increase WITNESS_PENDLIST\n", __func__); } else lock->lo_witness = enroll(type, class); } void witness_destroy(struct lock_object *lock) { struct lock_class *class; struct witness *w; class = LOCK_CLASS(lock); if (witness_cold) panic("lock (%s) %s destroyed while witness_cold", class->lc_name, lock->lo_name); /* XXX: need to verify that no one holds the lock */ if ((lock->lo_flags & LO_WITNESS) == 0 || lock->lo_witness == NULL) return; w = lock->lo_witness; mtx_lock_spin(&w_mtx); MPASS(w->w_refcount > 0); w->w_refcount--; if (w->w_refcount == 0) depart(w); mtx_unlock_spin(&w_mtx); } #ifdef DDB static void witness_ddb_compute_levels(void) { struct witness *w; /* * First clear all levels. */ STAILQ_FOREACH(w, &w_all, w_list) w->w_ddb_level = -1; /* * Look for locks with no parents and level all their descendants. */ STAILQ_FOREACH(w, &w_all, w_list) { /* If the witness has ancestors (is not a root), skip it. 
*/ if (w->w_num_ancestors > 0) continue; witness_ddb_level_descendants(w, 0); } } static void witness_ddb_level_descendants(struct witness *w, int l) { int i; if (w->w_ddb_level >= l) return; w->w_ddb_level = l; l++; for (i = 1; i <= w_max_used_index; i++) { if (w_rmatrix[w->w_index][i] & WITNESS_PARENT) witness_ddb_level_descendants(&w_data[i], l); } } static void witness_ddb_display_descendants(int(*prnt)(const char *fmt, ...), struct witness *w, int indent) { int i; for (i = 0; i < indent; i++) prnt(" "); prnt("%s (type: %s, depth: %d, active refs: %d)", w->w_name, w->w_class->lc_name, w->w_ddb_level, w->w_refcount); if (w->w_displayed) { prnt(" -- (already displayed)\n"); return; } w->w_displayed = 1; if (w->w_file != NULL && w->w_line != 0) prnt(" -- last acquired @ %s:%d\n", fixup_filename(w->w_file), w->w_line); else prnt(" -- never acquired\n"); indent++; WITNESS_INDEX_ASSERT(w->w_index); for (i = 1; i <= w_max_used_index; i++) { if (db_pager_quit) return; if (w_rmatrix[w->w_index][i] & WITNESS_PARENT) witness_ddb_display_descendants(prnt, &w_data[i], indent); } } static void witness_ddb_display_list(int(*prnt)(const char *fmt, ...), struct witness_list *list) { struct witness *w; STAILQ_FOREACH(w, list, w_typelist) { if (w->w_file == NULL || w->w_ddb_level > 0) continue; /* This lock has no anscestors - display its descendants. */ witness_ddb_display_descendants(prnt, w, 0); if (db_pager_quit) return; } } static void witness_ddb_display(int(*prnt)(const char *fmt, ...)) { struct witness *w; KASSERT(witness_cold == 0, ("%s: witness_cold", __func__)); witness_ddb_compute_levels(); /* Clear all the displayed flags. */ STAILQ_FOREACH(w, &w_all, w_list) w->w_displayed = 0; /* * First, handle sleep locks which have been acquired at least * once. */ prnt("Sleep locks:\n"); witness_ddb_display_list(prnt, &w_sleep); if (db_pager_quit) return; /* * Now do spin locks which have been acquired at least once. */ prnt("\nSpin locks:\n"); witness_ddb_display_list(prnt, &w_spin); if (db_pager_quit) return; /* * Finally, any locks which have not been acquired yet. */ prnt("\nLocks which were never acquired:\n"); STAILQ_FOREACH(w, &w_all, w_list) { if (w->w_file != NULL || w->w_refcount == 0) continue; prnt("%s (type: %s, depth: %d)\n", w->w_name, w->w_class->lc_name, w->w_ddb_level); if (db_pager_quit) return; } } #endif /* DDB */ int witness_defineorder(struct lock_object *lock1, struct lock_object *lock2) { if (witness_watch == -1 || KERNEL_PANICKED()) return (0); /* Require locks that witness knows about. */ if (lock1 == NULL || lock1->lo_witness == NULL || lock2 == NULL || lock2->lo_witness == NULL) return (EINVAL); mtx_assert(&w_mtx, MA_NOTOWNED); mtx_lock_spin(&w_mtx); /* * If we already have either an explicit or implied lock order that * is the other way around, then return an error. */ if (witness_watch && isitmydescendant(lock2->lo_witness, lock1->lo_witness)) { mtx_unlock_spin(&w_mtx); return (EDOOFUS); } /* Try to add the new order. 
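 * For example (lock names illustrative), witness_defineorder(&a->lock_object,
 * &b->lock_object) records that 'a' must always be acquired before 'b'.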
*/ CTR3(KTR_WITNESS, "%s: adding %s as a child of %s", __func__, lock2->lo_witness->w_name, lock1->lo_witness->w_name); itismychild(lock1->lo_witness, lock2->lo_witness); mtx_unlock_spin(&w_mtx); return (0); } void witness_checkorder(struct lock_object *lock, int flags, const char *file, int line, struct lock_object *interlock) { struct lock_list_entry *lock_list, *lle; struct lock_instance *lock1, *lock2, *plock; struct lock_class *class, *iclass; struct witness *w, *w1; struct thread *td; int i, j; if (witness_cold || witness_watch < 1 || lock->lo_witness == NULL || KERNEL_PANICKED()) return; w = lock->lo_witness; class = LOCK_CLASS(lock); td = curthread; if (class->lc_flags & LC_SLEEPLOCK) { /* * Since spin locks include a critical section, this check * implicitly enforces a lock order of all sleep locks before * all spin locks. */ if (td->td_critnest != 0 && !kdb_active) kassert_panic("acquiring blockable sleep lock with " "spinlock or critical section held (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); /* * If this is the first lock acquired then just return as * no order checking is needed. */ lock_list = td->td_sleeplocks; if (lock_list == NULL || lock_list->ll_count == 0) return; } else { /* * If this is the first lock, just return as no order * checking is needed. Avoid problems with thread * migration pinning the thread while checking if * spinlocks are held. If at least one spinlock is held * the thread is in a safe path and it is allowed to * unpin it. */ sched_pin(); lock_list = PCPU_GET(spinlocks); if (lock_list == NULL || lock_list->ll_count == 0) { sched_unpin(); return; } sched_unpin(); } /* * Check to see if we are recursing on a lock we already own. If * so, make sure that we don't mismatch exclusive and shared lock * acquires. */ lock1 = find_instance(lock_list, lock); if (lock1 != NULL) { if ((lock1->li_flags & LI_EXCLUSIVE) != 0 && (flags & LOP_EXCLUSIVE) == 0) { witness_output("shared lock of (%s) %s @ %s:%d\n", class->lc_name, lock->lo_name, fixup_filename(file), line); witness_output("while exclusively locked from %s:%d\n", fixup_filename(lock1->li_file), lock1->li_line); kassert_panic("excl->share"); } if ((lock1->li_flags & LI_EXCLUSIVE) == 0 && (flags & LOP_EXCLUSIVE) != 0) { witness_output("exclusive lock of (%s) %s @ %s:%d\n", class->lc_name, lock->lo_name, fixup_filename(file), line); witness_output("while share locked from %s:%d\n", fixup_filename(lock1->li_file), lock1->li_line); kassert_panic("share->excl"); } return; } /* Warn if the interlock is not locked exactly once. */ if (interlock != NULL) { iclass = LOCK_CLASS(interlock); lock1 = find_instance(lock_list, interlock); if (lock1 == NULL) kassert_panic("interlock (%s) %s not locked @ %s:%d", iclass->lc_name, interlock->lo_name, fixup_filename(file), line); else if ((lock1->li_flags & LI_RECURSEMASK) != 0) kassert_panic("interlock (%s) %s recursed @ %s:%d", iclass->lc_name, interlock->lo_name, fixup_filename(file), line); } /* * Find the previously acquired lock, but ignore interlocks. */ plock = &lock_list->ll_children[lock_list->ll_count - 1]; if (interlock != NULL && plock->li_lock == interlock) { if (lock_list->ll_count > 1) plock = &lock_list->ll_children[lock_list->ll_count - 2]; else { lle = lock_list->ll_next; /* * The interlock is the only lock we hold, so * simply return. */ if (lle == NULL) return; plock = &lle->ll_children[lle->ll_count - 1]; } } /* * Try to perform most checks without a lock. 
If this succeeds we * can skip acquiring the lock and return success. Otherwise we redo * the check with the lock held to handle races with concurrent updates. */ w1 = plock->li_lock->lo_witness; if (witness_lock_order_check(w1, w)) return; mtx_lock_spin(&w_mtx); if (witness_lock_order_check(w1, w)) { mtx_unlock_spin(&w_mtx); return; } witness_lock_order_add(w1, w); /* * Check for duplicate locks of the same type. Note that we only * have to check for this on the last lock we just acquired. Any * other cases will be caught as lock order violations. */ if (w1 == w) { i = w->w_index; if (!(lock->lo_flags & LO_DUPOK) && !(flags & LOP_DUPOK) && !(w_rmatrix[i][i] & WITNESS_REVERSAL)) { w_rmatrix[i][i] |= WITNESS_REVERSAL; w->w_reversed = 1; mtx_unlock_spin(&w_mtx); witness_output( "acquiring duplicate lock of same type: \"%s\"\n", w->w_name); witness_output(" 1st %s @ %s:%d\n", plock->li_lock->lo_name, fixup_filename(plock->li_file), plock->li_line); witness_output(" 2nd %s @ %s:%d\n", lock->lo_name, fixup_filename(file), line); witness_debugger(1, __func__); } else mtx_unlock_spin(&w_mtx); return; } mtx_assert(&w_mtx, MA_OWNED); /* * If we know that the lock we are acquiring comes after * the lock we most recently acquired in the lock order tree, * then there is no need for any further checks. */ if (isitmychild(w1, w)) goto out; for (j = 0, lle = lock_list; lle != NULL; lle = lle->ll_next) { for (i = lle->ll_count - 1; i >= 0; i--, j++) { struct stack pstack; bool pstackv, trace; MPASS(j < LOCK_CHILDCOUNT * LOCK_NCHILDREN); lock1 = &lle->ll_children[i]; /* * Ignore the interlock. */ if (interlock == lock1->li_lock) continue; /* * If this lock doesn't undergo witness checking, * then skip it. */ w1 = lock1->li_lock->lo_witness; if (w1 == NULL) { KASSERT((lock1->li_lock->lo_flags & LO_WITNESS) == 0, ("lock missing witness structure")); continue; } /* * If we are locking Giant and this is a sleepable * lock, then skip it. */ if ((lock1->li_flags & LI_SLEEPABLE) != 0 && lock == &Giant.lock_object) continue; /* * If we are locking a sleepable lock and this lock * is Giant, then skip it. */ if ((lock->lo_flags & LO_SLEEPABLE) != 0 && (flags & LOP_NOSLEEP) == 0 && lock1->li_lock == &Giant.lock_object) continue; /* * If we are locking a sleepable lock and this lock * isn't sleepable, we want to treat it as a lock * order violation to enfore a general lock order of * sleepable locks before non-sleepable locks. */ if ((lock->lo_flags & LO_SLEEPABLE) != 0 && (flags & LOP_NOSLEEP) == 0 && (lock1->li_flags & LI_SLEEPABLE) == 0) goto reversal; /* * If we are locking Giant and this is a non-sleepable * lock, then treat it as a reversal. */ if ((lock1->li_flags & LI_SLEEPABLE) == 0 && lock == &Giant.lock_object) goto reversal; /* * Check the lock order hierarchy for a reveresal. */ if (!isitmydescendant(w, w1)) continue; reversal: /* * We have a lock order violation, check to see if it * is allowed or has already been yelled about. */ /* Bail if this violation is known */ if (w_rmatrix[w1->w_index][w->w_index] & WITNESS_REVERSAL) goto out; /* Record this as a violation */ w_rmatrix[w1->w_index][w->w_index] |= WITNESS_REVERSAL; w_rmatrix[w->w_index][w1->w_index] |= WITNESS_REVERSAL; w->w_reversed = w1->w_reversed = 1; witness_increment_graph_generation(); /* * If the lock order is blessed, bail before logging * anything. We don't look for other lock order * violations though, which may be a bug. 
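/*
 * Editor's note: an illustrative userland sketch of the pattern described
 * at the top of this block: try a cheap check without the lock, and only
 * if it fails take the lock, re-check (another thread may have recorded
 * the order meanwhile), and then do the expensive work.  The cache, its
 * size, and check_order() are invented for the example; they are not
 * WITNESS interfaces.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

#define CACHE_SIZE 64

static pthread_mutex_t cache_mtx = PTHREAD_MUTEX_INITIALIZER;
static atomic_bool cache_known[CACHE_SIZE];

/* Lockless read; a stale "false" is harmless, it only costs the slow path. */
static bool
order_is_known(unsigned key)
{
	return (atomic_load(&cache_known[key % CACHE_SIZE]));
}

void
check_order(unsigned key)
{
	if (order_is_known(key))		/* fast path, no lock taken */
		return;
	pthread_mutex_lock(&cache_mtx);
	if (order_is_known(key)) {		/* re-check under the lock */
		pthread_mutex_unlock(&cache_mtx);
		return;
	}
	/* Slow path: run the full checks, then record the result. */
	atomic_store(&cache_known[key % CACHE_SIZE], true);
	pthread_mutex_unlock(&cache_mtx);
}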
*/ if (blessed(w, w1)) goto out; trace = atomic_load_int(&witness_trace); if (trace) { struct witness_lock_order_data *data; pstackv = false; data = witness_lock_order_get(w, w1); if (data != NULL) { stack_copy(&data->wlod_stack, &pstack); pstackv = true; } } mtx_unlock_spin(&w_mtx); #ifdef WITNESS_NO_VNODE /* * There are known LORs between VNODE locks. They are * not an indication of a bug. VNODE locks are flagged * as such (LO_IS_VNODE) and we don't yell if the LOR * is between 2 VNODE locks. */ if ((lock->lo_flags & LO_IS_VNODE) != 0 && (lock1->li_lock->lo_flags & LO_IS_VNODE) != 0) return; #endif /* * Ok, yell about it. */ if ((lock->lo_flags & LO_SLEEPABLE) != 0 && (flags & LOP_NOSLEEP) == 0 && (lock1->li_flags & LI_SLEEPABLE) == 0) witness_output( "lock order reversal: (sleepable after non-sleepable)\n"); else if ((lock1->li_flags & LI_SLEEPABLE) == 0 && lock == &Giant.lock_object) witness_output( "lock order reversal: (Giant after non-sleepable)\n"); else witness_output("lock order reversal:\n"); /* * Try to locate an earlier lock with * witness w in our list. */ do { lock2 = &lle->ll_children[i]; MPASS(lock2->li_lock != NULL); if (lock2->li_lock->lo_witness == w) break; if (i == 0 && lle->ll_next != NULL) { lle = lle->ll_next; i = lle->ll_count - 1; MPASS(i >= 0 && i < LOCK_NCHILDREN); } else i--; } while (i >= 0); if (i < 0) { witness_output(" 1st %p %s (%s, %s) @ %s:%d\n", lock1->li_lock, lock1->li_lock->lo_name, w1->w_name, w1->w_class->lc_name, fixup_filename(lock1->li_file), lock1->li_line); witness_output(" 2nd %p %s (%s, %s) @ %s:%d\n", lock, lock->lo_name, w->w_name, w->w_class->lc_name, fixup_filename(file), line); } else { struct witness *w2 = lock2->li_lock->lo_witness; witness_output(" 1st %p %s (%s, %s) @ %s:%d\n", lock2->li_lock, lock2->li_lock->lo_name, w2->w_name, w2->w_class->lc_name, fixup_filename(lock2->li_file), lock2->li_line); witness_output(" 2nd %p %s (%s, %s) @ %s:%d\n", lock1->li_lock, lock1->li_lock->lo_name, w1->w_name, w1->w_class->lc_name, fixup_filename(lock1->li_file), lock1->li_line); witness_output(" 3rd %p %s (%s, %s) @ %s:%d\n", lock, lock->lo_name, w->w_name, w->w_class->lc_name, fixup_filename(file), line); } if (trace) { char buf[64]; struct sbuf sb; sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN); sbuf_set_drain(&sb, witness_output_drain, NULL); if (pstackv) { sbuf_printf(&sb, "lock order %s -> %s established at:\n", w->w_name, w1->w_name); stack_sbuf_print_flags(&sb, &pstack, M_NOWAIT, STACK_SBUF_FMT_LONG); } sbuf_printf(&sb, "lock order %s -> %s attempted at:\n", w1->w_name, w->w_name); stack_save(&pstack); stack_sbuf_print_flags(&sb, &pstack, M_NOWAIT, STACK_SBUF_FMT_LONG); sbuf_finish(&sb); sbuf_delete(&sb); } witness_enter_debugger(__func__); return; } } /* * If requested, build a new lock order. However, don't build a new * relationship between a sleepable lock and Giant if it is in the * wrong direction. The correct lock order is that sleepable locks * always come before Giant. 
*/ if (flags & LOP_NEWORDER && !(plock->li_lock == &Giant.lock_object && (lock->lo_flags & LO_SLEEPABLE) != 0 && (flags & LOP_NOSLEEP) == 0)) { CTR3(KTR_WITNESS, "%s: adding %s as a child of %s", __func__, w->w_name, plock->li_lock->lo_witness->w_name); itismychild(plock->li_lock->lo_witness, w); } out: mtx_unlock_spin(&w_mtx); } void witness_lock(struct lock_object *lock, int flags, const char *file, int line) { struct lock_list_entry **lock_list, *lle; struct lock_instance *instance; struct witness *w; struct thread *td; if (witness_cold || witness_watch == -1 || lock->lo_witness == NULL || KERNEL_PANICKED()) return; w = lock->lo_witness; td = curthread; /* Determine lock list for this lock. */ if (LOCK_CLASS(lock)->lc_flags & LC_SLEEPLOCK) lock_list = &td->td_sleeplocks; else lock_list = PCPU_PTR(spinlocks); /* Check to see if we are recursing on a lock we already own. */ instance = find_instance(*lock_list, lock); if (instance != NULL) { instance->li_flags++; CTR4(KTR_WITNESS, "%s: pid %d recursed on %s r=%d", __func__, td->td_proc->p_pid, lock->lo_name, instance->li_flags & LI_RECURSEMASK); instance->li_file = file; instance->li_line = line; return; } /* Update per-witness last file and line acquire. */ w->w_file = file; w->w_line = line; /* Find the next open lock instance in the list and fill it. */ lle = *lock_list; if (lle == NULL || lle->ll_count == LOCK_NCHILDREN) { lle = witness_lock_list_get(); if (lle == NULL) return; lle->ll_next = *lock_list; CTR3(KTR_WITNESS, "%s: pid %d added lle %p", __func__, td->td_proc->p_pid, lle); *lock_list = lle; } instance = &lle->ll_children[lle->ll_count++]; instance->li_lock = lock; instance->li_line = line; instance->li_file = file; instance->li_flags = 0; if ((flags & LOP_EXCLUSIVE) != 0) instance->li_flags |= LI_EXCLUSIVE; if ((lock->lo_flags & LO_SLEEPABLE) != 0 && (flags & LOP_NOSLEEP) == 0) instance->li_flags |= LI_SLEEPABLE; CTR4(KTR_WITNESS, "%s: pid %d added %s as lle[%d]", __func__, td->td_proc->p_pid, lock->lo_name, lle->ll_count - 1); } void witness_upgrade(struct lock_object *lock, int flags, const char *file, int line) { struct lock_instance *instance; struct lock_class *class; KASSERT(witness_cold == 0, ("%s: witness_cold", __func__)); if (lock->lo_witness == NULL || witness_watch == -1 || KERNEL_PANICKED()) return; class = LOCK_CLASS(lock); if (witness_watch) { if ((lock->lo_flags & LO_UPGRADABLE) == 0) kassert_panic( "upgrade of non-upgradable lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); if ((class->lc_flags & LC_SLEEPLOCK) == 0) kassert_panic( "upgrade of non-sleep lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); } instance = find_instance(curthread->td_sleeplocks, lock); if (instance == NULL) { kassert_panic("upgrade of unlocked lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); return; } if (witness_watch) { if ((instance->li_flags & LI_EXCLUSIVE) != 0) kassert_panic( "upgrade of exclusive lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); if ((instance->li_flags & LI_RECURSEMASK) != 0) kassert_panic( "upgrade of recursed lock (%s) %s r=%d @ %s:%d", class->lc_name, lock->lo_name, instance->li_flags & LI_RECURSEMASK, fixup_filename(file), line); } instance->li_flags |= LI_EXCLUSIVE; } void witness_downgrade(struct lock_object *lock, int flags, const char *file, int line) { struct lock_instance *instance; struct lock_class *class; KASSERT(witness_cold == 0, ("%s: witness_cold", __func__)); if 
(lock->lo_witness == NULL || witness_watch == -1 || KERNEL_PANICKED()) return; class = LOCK_CLASS(lock); if (witness_watch) { if ((lock->lo_flags & LO_UPGRADABLE) == 0) kassert_panic( "downgrade of non-upgradable lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); if ((class->lc_flags & LC_SLEEPLOCK) == 0) kassert_panic( "downgrade of non-sleep lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); } instance = find_instance(curthread->td_sleeplocks, lock); if (instance == NULL) { kassert_panic("downgrade of unlocked lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); return; } if (witness_watch) { if ((instance->li_flags & LI_EXCLUSIVE) == 0) kassert_panic( "downgrade of shared lock (%s) %s @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); if ((instance->li_flags & LI_RECURSEMASK) != 0) kassert_panic( "downgrade of recursed lock (%s) %s r=%d @ %s:%d", class->lc_name, lock->lo_name, instance->li_flags & LI_RECURSEMASK, fixup_filename(file), line); } instance->li_flags &= ~LI_EXCLUSIVE; } void witness_unlock(struct lock_object *lock, int flags, const char *file, int line) { struct lock_list_entry **lock_list, *lle; struct lock_instance *instance; struct lock_class *class; struct thread *td; register_t s; int i, j; if (witness_cold || lock->lo_witness == NULL || KERNEL_PANICKED()) return; td = curthread; class = LOCK_CLASS(lock); /* Find lock instance associated with this lock. */ if (class->lc_flags & LC_SLEEPLOCK) lock_list = &td->td_sleeplocks; else lock_list = PCPU_PTR(spinlocks); lle = *lock_list; for (; *lock_list != NULL; lock_list = &(*lock_list)->ll_next) for (i = 0; i < (*lock_list)->ll_count; i++) { instance = &(*lock_list)->ll_children[i]; if (instance->li_lock == lock) goto found; } /* * When disabling WITNESS through witness_watch we could end up in * having registered locks in the td_sleeplocks queue. * We have to make sure we flush these queues, so just search for * eventual register locks and remove them. */ if (witness_watch > 0) { kassert_panic("lock (%s) %s not locked @ %s:%d", class->lc_name, lock->lo_name, fixup_filename(file), line); return; } else { return; } found: /* First, check for shared/exclusive mismatches. */ if ((instance->li_flags & LI_EXCLUSIVE) != 0 && witness_watch > 0 && (flags & LOP_EXCLUSIVE) == 0) { witness_output("shared unlock of (%s) %s @ %s:%d\n", class->lc_name, lock->lo_name, fixup_filename(file), line); witness_output("while exclusively locked from %s:%d\n", fixup_filename(instance->li_file), instance->li_line); kassert_panic("excl->ushare"); } if ((instance->li_flags & LI_EXCLUSIVE) == 0 && witness_watch > 0 && (flags & LOP_EXCLUSIVE) != 0) { witness_output("exclusive unlock of (%s) %s @ %s:%d\n", class->lc_name, lock->lo_name, fixup_filename(file), line); witness_output("while share locked from %s:%d\n", fixup_filename(instance->li_file), instance->li_line); kassert_panic("share->uexcl"); } /* If we are recursed, unrecurse. 
*/ if ((instance->li_flags & LI_RECURSEMASK) > 0) { CTR4(KTR_WITNESS, "%s: pid %d unrecursed on %s r=%d", __func__, td->td_proc->p_pid, instance->li_lock->lo_name, instance->li_flags); instance->li_flags--; return; } /* The lock is now being dropped, check for NORELEASE flag */ if ((instance->li_flags & LI_NORELEASE) != 0 && witness_watch > 0) { witness_output("forbidden unlock of (%s) %s @ %s:%d\n", class->lc_name, lock->lo_name, fixup_filename(file), line); kassert_panic("lock marked norelease"); } /* Otherwise, remove this item from the list. */ s = intr_disable(); CTR4(KTR_WITNESS, "%s: pid %d removed %s from lle[%d]", __func__, td->td_proc->p_pid, instance->li_lock->lo_name, (*lock_list)->ll_count - 1); for (j = i; j < (*lock_list)->ll_count - 1; j++) (*lock_list)->ll_children[j] = (*lock_list)->ll_children[j + 1]; (*lock_list)->ll_count--; intr_restore(s); /* * In order to reduce contention on w_mtx, we want to keep always an * head object into lists so that frequent allocation from the * free witness pool (and subsequent locking) is avoided. * In order to maintain the current code simple, when the head * object is totally unloaded it means also that we do not have * further objects in the list, so the list ownership needs to be * hand over to another object if the current head needs to be freed. */ if ((*lock_list)->ll_count == 0) { if (*lock_list == lle) { if (lle->ll_next == NULL) return; } else lle = *lock_list; *lock_list = lle->ll_next; CTR3(KTR_WITNESS, "%s: pid %d removed lle %p", __func__, td->td_proc->p_pid, lle); witness_lock_list_free(lle); } } void witness_thread_exit(struct thread *td) { struct lock_list_entry *lle; int i, n; lle = td->td_sleeplocks; if (lle == NULL || KERNEL_PANICKED()) return; if (lle->ll_count != 0) { for (n = 0; lle != NULL; lle = lle->ll_next) for (i = lle->ll_count - 1; i >= 0; i--) { if (n == 0) witness_output( "Thread %p exiting with the following locks held:\n", td); n++; witness_list_lock(&lle->ll_children[i], witness_output); } kassert_panic( "Thread %p cannot exit while holding sleeplocks\n", td); } witness_lock_list_free(lle); } /* * Warn if any locks other than 'lock' are held. Flags can be passed in to * exempt Giant and sleepable locks from the checks as well. If any * non-exempt locks are held, then a supplied message is printed to the * output channel along with a list of the offending locks. If indicated in the * flags then a failure results in a panic as well. */ int witness_warn(int flags, struct lock_object *lock, const char *fmt, ...) { struct lock_list_entry *lock_list, *lle; struct lock_instance *lock1; struct thread *td; va_list ap; int i, n; if (witness_cold || witness_watch < 1 || KERNEL_PANICKED()) return (0); n = 0; td = curthread; for (lle = td->td_sleeplocks; lle != NULL; lle = lle->ll_next) for (i = lle->ll_count - 1; i >= 0; i--) { lock1 = &lle->ll_children[i]; if (lock1->li_lock == lock) continue; if (flags & WARN_GIANTOK && lock1->li_lock == &Giant.lock_object) continue; if (flags & WARN_SLEEPOK && (lock1->li_flags & LI_SLEEPABLE) != 0) continue; if (n == 0) { va_start(ap, fmt); vprintf(fmt, ap); va_end(ap); printf(" with the following %slocks held:\n", (flags & WARN_SLEEPOK) != 0 ? "non-sleepable " : ""); } n++; witness_list_lock(lock1, printf); } /* * Pin the thread in order to avoid problems with thread migration. * Once that all verifies are passed about spinlocks ownership, * the thread is in a safe path and it can be unpinned. 
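/*
 * Editor's note: a compact userland model of the per-thread lock list that
 * witness_lock()/witness_unlock() maintain above: push an instance on
 * acquire, look it up by lock pointer, and on release close the gap by
 * shifting the tail of the array down by one.  MAX_HELD, struct held and
 * the helper names are illustrative only.
 */
#include <stddef.h>
#include <assert.h>

#define MAX_HELD 32

struct held {
	const void *lock;	/* which lock object */
	int exclusive;		/* analogue of LI_EXCLUSIVE */
};

static struct held held[MAX_HELD];
static int nheld;

static void
lock_push(const void *lock, int exclusive)
{
	assert(nheld < MAX_HELD);
	held[nheld].lock = lock;
	held[nheld].exclusive = exclusive;
	nheld++;
}

static struct held *
lock_find(const void *lock)
{
	for (int i = nheld - 1; i >= 0; i--)
		if (held[i].lock == lock)
			return (&held[i]);
	return (NULL);
}

static void
lock_remove(const void *lock)
{
	struct held *h = lock_find(lock);

	assert(h != NULL);
	/* Shift the entries above the hole down by one, as witness does. */
	for (int i = (int)(h - held); i < nheld - 1; i++)
		held[i] = held[i + 1];
	nheld--;
}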
*/ sched_pin(); lock_list = PCPU_GET(spinlocks); if (lock_list != NULL && lock_list->ll_count != 0) { sched_unpin(); /* * We should only have one spinlock and as long as * the flags cannot match for this locks class, * check if the first spinlock is the one curthread * should hold. */ lock1 = &lock_list->ll_children[lock_list->ll_count - 1]; if (lock_list->ll_count == 1 && lock_list->ll_next == NULL && lock1->li_lock == lock && n == 0) return (0); va_start(ap, fmt); vprintf(fmt, ap); va_end(ap); printf(" with the following %slocks held:\n", (flags & WARN_SLEEPOK) != 0 ? "non-sleepable " : ""); n += witness_list_locks(&lock_list, printf); } else sched_unpin(); if (flags & WARN_PANIC && n) kassert_panic("%s", __func__); else witness_debugger(n, __func__); return (n); } const char * witness_file(struct lock_object *lock) { struct witness *w; if (witness_cold || witness_watch < 1 || lock->lo_witness == NULL) return ("?"); w = lock->lo_witness; return (w->w_file); } int witness_line(struct lock_object *lock) { struct witness *w; if (witness_cold || witness_watch < 1 || lock->lo_witness == NULL) return (0); w = lock->lo_witness; return (w->w_line); } static struct witness * enroll(const char *description, struct lock_class *lock_class) { struct witness *w; MPASS(description != NULL); if (witness_watch == -1 || KERNEL_PANICKED()) return (NULL); if ((lock_class->lc_flags & LC_SPINLOCK)) { if (witness_skipspin) return (NULL); } else if ((lock_class->lc_flags & LC_SLEEPLOCK) == 0) { kassert_panic("lock class %s is not sleep or spin", lock_class->lc_name); return (NULL); } mtx_lock_spin(&w_mtx); w = witness_hash_get(description); if (w) goto found; if ((w = witness_get()) == NULL) return (NULL); MPASS(strlen(description) < MAX_W_NAME); strcpy(w->w_name, description); w->w_class = lock_class; w->w_refcount = 1; STAILQ_INSERT_HEAD(&w_all, w, w_list); if (lock_class->lc_flags & LC_SPINLOCK) { STAILQ_INSERT_HEAD(&w_spin, w, w_typelist); w_spin_cnt++; } else if (lock_class->lc_flags & LC_SLEEPLOCK) { STAILQ_INSERT_HEAD(&w_sleep, w, w_typelist); w_sleep_cnt++; } /* Insert new witness into the hash */ witness_hash_put(w); witness_increment_graph_generation(); mtx_unlock_spin(&w_mtx); return (w); found: w->w_refcount++; if (w->w_refcount == 1) w->w_class = lock_class; mtx_unlock_spin(&w_mtx); if (lock_class != w->w_class) kassert_panic( "lock (%s) %s does not match earlier (%s) lock", description, lock_class->lc_name, w->w_class->lc_name); return (w); } static void depart(struct witness *w) { MPASS(w->w_refcount == 0); if (w->w_class->lc_flags & LC_SLEEPLOCK) { w_sleep_cnt--; } else { w_spin_cnt--; } /* * Set file to NULL as it may point into a loadable module. */ w->w_file = NULL; w->w_line = 0; witness_increment_graph_generation(); } static void adopt(struct witness *parent, struct witness *child) { int pi, ci, i, j; if (witness_cold == 0) mtx_assert(&w_mtx, MA_OWNED); /* If the relationship is already known, there's no work to be done. */ if (isitmychild(parent, child)) return; /* When the structure of the graph changes, bump up the generation. */ witness_increment_graph_generation(); /* * The hard part ... create the direct relationship, then propagate all * indirect relationships. */ pi = parent->w_index; ci = child->w_index; WITNESS_INDEX_ASSERT(pi); WITNESS_INDEX_ASSERT(ci); MPASS(pi != ci); w_rmatrix[pi][ci] |= WITNESS_PARENT; w_rmatrix[ci][pi] |= WITNESS_CHILD; /* * If parent was not already an ancestor of child, * then we increment the descendant and ancestor counters. 
*/ if ((w_rmatrix[pi][ci] & WITNESS_ANCESTOR) == 0) { parent->w_num_descendants++; child->w_num_ancestors++; } /* * Find each ancestor of 'pi'. Note that 'pi' itself is counted as * an ancestor of 'pi' during this loop. */ for (i = 1; i <= w_max_used_index; i++) { if ((w_rmatrix[i][pi] & WITNESS_ANCESTOR_MASK) == 0 && (i != pi)) continue; /* Find each descendant of 'i' and mark it as a descendant. */ for (j = 1; j <= w_max_used_index; j++) { /* * Skip children that are already marked as * descendants of 'i'. */ if (w_rmatrix[i][j] & WITNESS_ANCESTOR_MASK) continue; /* * We are only interested in descendants of 'ci'. Note * that 'ci' itself is counted as a descendant of 'ci'. */ if ((w_rmatrix[ci][j] & WITNESS_ANCESTOR_MASK) == 0 && (j != ci)) continue; w_rmatrix[i][j] |= WITNESS_ANCESTOR; w_rmatrix[j][i] |= WITNESS_DESCENDANT; w_data[i].w_num_descendants++; w_data[j].w_num_ancestors++; /* * Make sure we aren't marking a node as both an * ancestor and descendant. We should have caught * this as a lock order reversal earlier. */ if ((w_rmatrix[i][j] & WITNESS_ANCESTOR_MASK) && (w_rmatrix[i][j] & WITNESS_DESCENDANT_MASK)) { printf("witness rmatrix paradox! [%d][%d]=%d " "both ancestor and descendant\n", i, j, w_rmatrix[i][j]); kdb_backtrace(); printf("Witness disabled.\n"); witness_watch = -1; } if ((w_rmatrix[j][i] & WITNESS_ANCESTOR_MASK) && (w_rmatrix[j][i] & WITNESS_DESCENDANT_MASK)) { printf("witness rmatrix paradox! [%d][%d]=%d " "both ancestor and descendant\n", j, i, w_rmatrix[j][i]); kdb_backtrace(); printf("Witness disabled.\n"); witness_watch = -1; } } } } static void itismychild(struct witness *parent, struct witness *child) { int unlocked; MPASS(child != NULL && parent != NULL); if (witness_cold == 0) mtx_assert(&w_mtx, MA_OWNED); if (!witness_lock_type_equal(parent, child)) { if (witness_cold == 0) { unlocked = 1; mtx_unlock_spin(&w_mtx); } else { unlocked = 0; } kassert_panic( "%s: parent \"%s\" (%s) and child \"%s\" (%s) are not " "the same lock type", __func__, parent->w_name, parent->w_class->lc_name, child->w_name, child->w_class->lc_name); if (unlocked) mtx_lock_spin(&w_mtx); } adopt(parent, child); } /* * Generic code for the isitmy*() functions. The rmask parameter is the * expected relationship of w1 to w2. */ static int _isitmyx(struct witness *w1, struct witness *w2, int rmask, const char *fname) { unsigned char r1, r2; int i1, i2; i1 = w1->w_index; i2 = w2->w_index; WITNESS_INDEX_ASSERT(i1); WITNESS_INDEX_ASSERT(i2); r1 = w_rmatrix[i1][i2] & WITNESS_RELATED_MASK; r2 = w_rmatrix[i2][i1] & WITNESS_RELATED_MASK; /* The flags on one better be the inverse of the flags on the other */ if (!((WITNESS_ATOD(r1) == r2 && WITNESS_DTOA(r2) == r1) || (WITNESS_DTOA(r1) == r2 && WITNESS_ATOD(r2) == r1))) { /* Don't squawk if we're potentially racing with an update. */ if (!mtx_owned(&w_mtx)) return (0); printf("%s: rmatrix mismatch between %s (index %d) and %s " "(index %d): w_rmatrix[%d][%d] == %hhx but " "w_rmatrix[%d][%d] == %hhx\n", fname, w1->w_name, i1, w2->w_name, i2, i1, i2, r1, i2, i1, r2); kdb_backtrace(); printf("Witness disabled.\n"); witness_watch = -1; } return (r1 & rmask); } /* * Checks if @child is a direct child of @parent. */ static int isitmychild(struct witness *parent, struct witness *child) { return (_isitmyx(parent, child, WITNESS_PARENT, __func__)); } /* * Checks if @descendant is a direct or indirect descendant of @ancestor.
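/*
 * Editor's note: a standalone sketch of the relationship-matrix
 * bookkeeping done by adopt()/_isitmyx() above, reduced to two bits per
 * ordered pair: when a new parent->child edge is added, every ancestor of
 * the parent also becomes an ancestor of every descendant of the child.
 * NWIT and the REL_* bits are invented stand-ins, not the kernel's flag
 * values, and no reversal detection is attempted here.
 */
#include <stdio.h>

#define NWIT		8
#define REL_ANCESTOR	0x01	/* row is acquired before column */
#define REL_DESCENDANT	0x02	/* row is acquired after column */

static unsigned char rel[NWIT][NWIT];

static int
is_ancestor(int a, int d)
{
	return ((rel[a][d] & REL_ANCESTOR) != 0);
}

static void
add_order(int parent, int child)
{
	/* Close the relation transitively, as adopt() does. */
	for (int i = 0; i < NWIT; i++) {
		if (i != parent && !is_ancestor(i, parent))
			continue;
		for (int j = 0; j < NWIT; j++) {
			if (j != child && !is_ancestor(child, j))
				continue;
			rel[i][j] |= REL_ANCESTOR;
			rel[j][i] |= REL_DESCENDANT;
		}
	}
}

int
main(void)
{
	add_order(1, 2);	/* lock 1 is taken before lock 2 */
	add_order(2, 3);	/* lock 2 is taken before lock 3 */
	/* A 3-before-1 acquisition would now show up as a reversal. */
	printf("1 before 3: %d, 3 before 1: %d\n",
	    is_ancestor(1, 3), is_ancestor(3, 1));
	return (0);
}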
*/ static int isitmydescendant(struct witness *ancestor, struct witness *descendant) { return (_isitmyx(ancestor, descendant, WITNESS_ANCESTOR_MASK, __func__)); } static int blessed(struct witness *w1, struct witness *w2) { int i; struct witness_blessed *b; for (i = 0; i < nitems(blessed_list); i++) { b = &blessed_list[i]; if (strcmp(w1->w_name, b->b_lock1) == 0) { if (strcmp(w2->w_name, b->b_lock2) == 0) return (1); continue; } if (strcmp(w1->w_name, b->b_lock2) == 0) if (strcmp(w2->w_name, b->b_lock1) == 0) return (1); } return (0); } static struct witness * witness_get(void) { struct witness *w; int index; if (witness_cold == 0) mtx_assert(&w_mtx, MA_OWNED); if (witness_watch == -1) { mtx_unlock_spin(&w_mtx); return (NULL); } if (STAILQ_EMPTY(&w_free)) { witness_watch = -1; mtx_unlock_spin(&w_mtx); printf("WITNESS: unable to allocate a new witness object\n"); return (NULL); } w = STAILQ_FIRST(&w_free); STAILQ_REMOVE_HEAD(&w_free, w_list); w_free_cnt--; index = w->w_index; MPASS(index > 0 && index == w_max_used_index+1 && index < witness_count); bzero(w, sizeof(*w)); w->w_index = index; if (index > w_max_used_index) w_max_used_index = index; return (w); } static void witness_free(struct witness *w) { STAILQ_INSERT_HEAD(&w_free, w, w_list); w_free_cnt++; } static struct lock_list_entry * witness_lock_list_get(void) { struct lock_list_entry *lle; if (witness_watch == -1) return (NULL); mtx_lock_spin(&w_mtx); lle = w_lock_list_free; if (lle == NULL) { witness_watch = -1; mtx_unlock_spin(&w_mtx); printf("%s: witness exhausted\n", __func__); return (NULL); } w_lock_list_free = lle->ll_next; mtx_unlock_spin(&w_mtx); bzero(lle, sizeof(*lle)); return (lle); } static void witness_lock_list_free(struct lock_list_entry *lle) { mtx_lock_spin(&w_mtx); lle->ll_next = w_lock_list_free; w_lock_list_free = lle; mtx_unlock_spin(&w_mtx); } static struct lock_instance * find_instance(struct lock_list_entry *list, const struct lock_object *lock) { struct lock_list_entry *lle; struct lock_instance *instance; int i; for (lle = list; lle != NULL; lle = lle->ll_next) for (i = lle->ll_count - 1; i >= 0; i--) { instance = &lle->ll_children[i]; if (instance->li_lock == lock) return (instance); } return (NULL); } static void witness_list_lock(struct lock_instance *instance, int (*prnt)(const char *fmt, ...)) { struct lock_object *lock; lock = instance->li_lock; prnt("%s %s %s", (instance->li_flags & LI_EXCLUSIVE) != 0 ? "exclusive" : "shared", LOCK_CLASS(lock)->lc_name, lock->lo_name); if (lock->lo_witness->w_name != lock->lo_name) prnt(" (%s)", lock->lo_witness->w_name); prnt(" r = %d (%p) locked @ %s:%d\n", instance->li_flags & LI_RECURSEMASK, lock, fixup_filename(instance->li_file), instance->li_line); } static int witness_output(const char *fmt, ...) 
{ va_list ap; int ret; va_start(ap, fmt); ret = witness_voutput(fmt, ap); va_end(ap); return (ret); } static int witness_voutput(const char *fmt, va_list ap) { int ret; ret = 0; switch (witness_channel) { case WITNESS_CONSOLE: ret = vprintf(fmt, ap); break; case WITNESS_LOG: vlog(LOG_NOTICE, fmt, ap); break; case WITNESS_NONE: break; } return (ret); } #ifdef DDB static int witness_thread_has_locks(struct thread *td) { if (td->td_sleeplocks == NULL) return (0); return (td->td_sleeplocks->ll_count != 0); } static int witness_proc_has_locks(struct proc *p) { struct thread *td; FOREACH_THREAD_IN_PROC(p, td) { if (witness_thread_has_locks(td)) return (1); } return (0); } #endif int witness_list_locks(struct lock_list_entry **lock_list, int (*prnt)(const char *fmt, ...)) { struct lock_list_entry *lle; int i, nheld; nheld = 0; for (lle = *lock_list; lle != NULL; lle = lle->ll_next) for (i = lle->ll_count - 1; i >= 0; i--) { witness_list_lock(&lle->ll_children[i], prnt); nheld++; } return (nheld); } /* * This is a bit risky at best. We call this function when we have timed * out acquiring a spin lock, and we assume that the other CPU is stuck * with this lock held. So, we go groveling around in the other CPU's * per-cpu data to try to find the lock instance for this spin lock to * see when it was last acquired. */ void witness_display_spinlock(struct lock_object *lock, struct thread *owner, int (*prnt)(const char *fmt, ...)) { struct lock_instance *instance; struct pcpu *pc; if (owner->td_critnest == 0 || owner->td_oncpu == NOCPU) return; pc = pcpu_find(owner->td_oncpu); instance = find_instance(pc->pc_spinlocks, lock); if (instance != NULL) witness_list_lock(instance, prnt); } void witness_save(struct lock_object *lock, const char **filep, int *linep) { struct lock_list_entry *lock_list; struct lock_instance *instance; struct lock_class *class; /* Initialize for KMSAN's benefit. */ *filep = NULL; *linep = 0; /* * This function is used independently in locking code to deal with * Giant, SCHEDULER_STOPPED() check can be removed here after Giant * is gone. */ if (SCHEDULER_STOPPED()) return; KASSERT(witness_cold == 0, ("%s: witness_cold", __func__)); if (lock->lo_witness == NULL || witness_watch == -1 || KERNEL_PANICKED()) return; class = LOCK_CLASS(lock); if (class->lc_flags & LC_SLEEPLOCK) lock_list = curthread->td_sleeplocks; else { if (witness_skipspin) return; lock_list = PCPU_GET(spinlocks); } instance = find_instance(lock_list, lock); if (instance == NULL) { kassert_panic("%s: lock (%s) %s not locked", __func__, class->lc_name, lock->lo_name); return; } *filep = instance->li_file; *linep = instance->li_line; } void witness_restore(struct lock_object *lock, const char *file, int line) { struct lock_list_entry *lock_list; struct lock_instance *instance; struct lock_class *class; /* * This function is used independently in locking code to deal with * Giant, SCHEDULER_STOPPED() check can be removed here after Giant * is gone. 
*/ if (SCHEDULER_STOPPED()) return; KASSERT(witness_cold == 0, ("%s: witness_cold", __func__)); if (lock->lo_witness == NULL || witness_watch == -1 || KERNEL_PANICKED()) return; class = LOCK_CLASS(lock); if (class->lc_flags & LC_SLEEPLOCK) lock_list = curthread->td_sleeplocks; else { if (witness_skipspin) return; lock_list = PCPU_GET(spinlocks); } instance = find_instance(lock_list, lock); if (instance == NULL) kassert_panic("%s: lock (%s) %s not locked", __func__, class->lc_name, lock->lo_name); lock->lo_witness->w_file = file; lock->lo_witness->w_line = line; if (instance == NULL) return; instance->li_file = file; instance->li_line = line; } static bool witness_find_instance(const struct lock_object *lock, struct lock_instance **instance) { #ifdef INVARIANT_SUPPORT struct lock_class *class; if (lock->lo_witness == NULL || witness_watch < 1 || KERNEL_PANICKED()) return (false); class = LOCK_CLASS(lock); if ((class->lc_flags & LC_SLEEPLOCK) != 0) { *instance = find_instance(curthread->td_sleeplocks, lock); return (true); } else if ((class->lc_flags & LC_SPINLOCK) != 0) { *instance = find_instance(PCPU_GET(spinlocks), lock); return (true); } else { kassert_panic("Lock (%s) %s is not sleep or spin!", class->lc_name, lock->lo_name); return (false); } #else return (false); #endif } void witness_assert(const struct lock_object *lock, int flags, const char *file, int line) { #ifdef INVARIANT_SUPPORT struct lock_instance *instance; struct lock_class *class; if (!witness_find_instance(lock, &instance)) return; class = LOCK_CLASS(lock); switch (flags) { case LA_UNLOCKED: if (instance != NULL) kassert_panic("Lock (%s) %s locked @ %s:%d.", class->lc_name, lock->lo_name, fixup_filename(file), line); break; case LA_LOCKED: case LA_LOCKED | LA_RECURSED: case LA_LOCKED | LA_NOTRECURSED: case LA_SLOCKED: case LA_SLOCKED | LA_RECURSED: case LA_SLOCKED | LA_NOTRECURSED: case LA_XLOCKED: case LA_XLOCKED | LA_RECURSED: case LA_XLOCKED | LA_NOTRECURSED: if (instance == NULL) { kassert_panic("Lock (%s) %s not locked @ %s:%d.", class->lc_name, lock->lo_name, fixup_filename(file), line); break; } if ((flags & LA_XLOCKED) != 0 && (instance->li_flags & LI_EXCLUSIVE) == 0) kassert_panic( "Lock (%s) %s not exclusively locked @ %s:%d.", class->lc_name, lock->lo_name, fixup_filename(file), line); if ((flags & LA_SLOCKED) != 0 && (instance->li_flags & LI_EXCLUSIVE) != 0) kassert_panic( "Lock (%s) %s exclusively locked @ %s:%d.", class->lc_name, lock->lo_name, fixup_filename(file), line); if ((flags & LA_RECURSED) != 0 && (instance->li_flags & LI_RECURSEMASK) == 0) kassert_panic("Lock (%s) %s not recursed @ %s:%d.", class->lc_name, lock->lo_name, fixup_filename(file), line); if ((flags & LA_NOTRECURSED) != 0 && (instance->li_flags & LI_RECURSEMASK) != 0) kassert_panic("Lock (%s) %s recursed @ %s:%d.", class->lc_name, lock->lo_name, fixup_filename(file), line); break; default: kassert_panic("Invalid lock assertion at %s:%d.", fixup_filename(file), line); } #endif /* INVARIANT_SUPPORT */ } /* * Checks the ownership of the lock by curthread, consulting the witness list. * Returns: * 0 if witness is disabled or did not work * -1 if not owned * 1 if owned */ int witness_is_owned(const struct lock_object *lock) { #ifdef INVARIANT_SUPPORT struct lock_instance *instance; if (!witness_find_instance(lock, &instance)) return (0); return (instance == NULL ? 
-1 : 1); #else return (0); #endif } static void witness_setflag(struct lock_object *lock, int flag, int set) { struct lock_list_entry *lock_list; struct lock_instance *instance; struct lock_class *class; if (lock->lo_witness == NULL || witness_watch == -1 || KERNEL_PANICKED()) return; class = LOCK_CLASS(lock); if (class->lc_flags & LC_SLEEPLOCK) lock_list = curthread->td_sleeplocks; else { if (witness_skipspin) return; lock_list = PCPU_GET(spinlocks); } instance = find_instance(lock_list, lock); if (instance == NULL) { kassert_panic("%s: lock (%s) %s not locked", __func__, class->lc_name, lock->lo_name); return; } if (set) instance->li_flags |= flag; else instance->li_flags &= ~flag; } void witness_norelease(struct lock_object *lock) { witness_setflag(lock, LI_NORELEASE, 1); } void witness_releaseok(struct lock_object *lock) { witness_setflag(lock, LI_NORELEASE, 0); } #ifdef DDB static void witness_ddb_list(struct thread *td) { KASSERT(witness_cold == 0, ("%s: witness_cold", __func__)); KASSERT(kdb_active, ("%s: not in the debugger", __func__)); if (witness_watch < 1) return; witness_list_locks(&td->td_sleeplocks, db_printf); /* * We only handle spinlocks if td == curthread. This is somewhat broken * if td is currently executing on some other CPU and holds spin locks * as we won't display those locks. If we had a MI way of getting * the per-cpu data for a given cpu then we could use * td->td_oncpu to get the list of spinlocks for this thread * and "fix" this. * * That still wouldn't really fix this unless we locked the scheduler * lock or stopped the other CPU to make sure it wasn't changing the * list out from under us. It is probably best to just not try to * handle threads on other CPU's for now. */ if (td == curthread && PCPU_GET(spinlocks) != NULL) witness_list_locks(PCPU_PTR(spinlocks), db_printf); } DB_SHOW_COMMAND(locks, db_witness_list) { struct thread *td; if (have_addr) td = db_lookup_thread(addr, true); else td = kdb_thread; witness_ddb_list(td); } DB_SHOW_ALL_COMMAND(locks, db_witness_list_all) { struct thread *td; struct proc *p; /* * It would be nice to list only threads and processes that actually * held sleep locks, but that information is currently not exported * by WITNESS. */ FOREACH_PROC_IN_SYSTEM(p) { if (!witness_proc_has_locks(p)) continue; FOREACH_THREAD_IN_PROC(p, td) { if (!witness_thread_has_locks(td)) continue; db_printf("Process %d (%s) thread %p (%d)\n", p->p_pid, p->p_comm, td, td->td_tid); witness_ddb_list(td); if (db_pager_quit) return; } } } DB_SHOW_ALIAS_FLAGS(alllocks, db_witness_list_all, DB_CMD_MEMSAFE); DB_SHOW_COMMAND_FLAGS(witness, db_witness_display, DB_CMD_MEMSAFE) { witness_ddb_display(db_printf); } #endif static void sbuf_print_witness_badstacks(struct sbuf *sb, size_t *oldidx) { struct witness_lock_order_data *data1, *data2, *tmp_data1, *tmp_data2; struct witness *tmp_w1, *tmp_w2, *w1, *w2; int generation, i, j; tmp_data1 = NULL; tmp_data2 = NULL; tmp_w1 = NULL; tmp_w2 = NULL; /* Allocate and init temporary storage space. 
*/ tmp_w1 = malloc(sizeof(struct witness), M_TEMP, M_WAITOK | M_ZERO); tmp_w2 = malloc(sizeof(struct witness), M_TEMP, M_WAITOK | M_ZERO); tmp_data1 = malloc(sizeof(struct witness_lock_order_data), M_TEMP, M_WAITOK | M_ZERO); tmp_data2 = malloc(sizeof(struct witness_lock_order_data), M_TEMP, M_WAITOK | M_ZERO); stack_zero(&tmp_data1->wlod_stack); stack_zero(&tmp_data2->wlod_stack); restart: mtx_lock_spin(&w_mtx); generation = w_generation; mtx_unlock_spin(&w_mtx); sbuf_printf(sb, "Number of known direct relationships is %d\n", w_lohash.wloh_count); for (i = 1; i < w_max_used_index; i++) { mtx_lock_spin(&w_mtx); if (generation != w_generation) { mtx_unlock_spin(&w_mtx); /* The graph has changed, try again. */ *oldidx = 0; sbuf_clear(sb); goto restart; } w1 = &w_data[i]; if (w1->w_reversed == 0) { mtx_unlock_spin(&w_mtx); continue; } /* Copy w1 locally so we can release the spin lock. */ *tmp_w1 = *w1; mtx_unlock_spin(&w_mtx); if (tmp_w1->w_reversed == 0) continue; for (j = 1; j < w_max_used_index; j++) { if ((w_rmatrix[i][j] & WITNESS_REVERSAL) == 0 || i > j) continue; mtx_lock_spin(&w_mtx); if (generation != w_generation) { mtx_unlock_spin(&w_mtx); /* The graph has changed, try again. */ *oldidx = 0; sbuf_clear(sb); goto restart; } w2 = &w_data[j]; data1 = witness_lock_order_get(w1, w2); data2 = witness_lock_order_get(w2, w1); /* * Copy information locally so we can release the * spin lock. */ *tmp_w2 = *w2; if (data1) { stack_zero(&tmp_data1->wlod_stack); stack_copy(&data1->wlod_stack, &tmp_data1->wlod_stack); } if (data2 && data2 != data1) { stack_zero(&tmp_data2->wlod_stack); stack_copy(&data2->wlod_stack, &tmp_data2->wlod_stack); } mtx_unlock_spin(&w_mtx); if (blessed(tmp_w1, tmp_w2)) continue; sbuf_printf(sb, "\nLock order reversal between \"%s\"(%s) and \"%s\"(%s)!\n", tmp_w1->w_name, tmp_w1->w_class->lc_name, tmp_w2->w_name, tmp_w2->w_class->lc_name); if (data1) { sbuf_printf(sb, "Lock order \"%s\"(%s) -> \"%s\"(%s) first seen at:\n", tmp_w1->w_name, tmp_w1->w_class->lc_name, tmp_w2->w_name, tmp_w2->w_class->lc_name); stack_sbuf_print(sb, &tmp_data1->wlod_stack); - sbuf_printf(sb, "\n"); + sbuf_putc(sb, '\n'); } if (data2 && data2 != data1) { sbuf_printf(sb, "Lock order \"%s\"(%s) -> \"%s\"(%s) first seen at:\n", tmp_w2->w_name, tmp_w2->w_class->lc_name, tmp_w1->w_name, tmp_w1->w_class->lc_name); stack_sbuf_print(sb, &tmp_data2->wlod_stack); - sbuf_printf(sb, "\n"); + sbuf_putc(sb, '\n'); } } } mtx_lock_spin(&w_mtx); if (generation != w_generation) { mtx_unlock_spin(&w_mtx); /* * The graph changed while we were printing stack data, * try again. */ *oldidx = 0; sbuf_clear(sb); goto restart; } mtx_unlock_spin(&w_mtx); /* Free temporary storage space. 
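/*
 * Editor's note: a userland sketch of the retry discipline used by
 * sbuf_print_witness_badstacks() above: snapshot a generation counter,
 * drop the lock for the expensive formatting, and restart from scratch if
 * the generation changed in the meantime.  All of the names below are
 * invented for the example.
 */
#include <pthread.h>
#include <string.h>

static pthread_mutex_t g_mtx = PTHREAD_MUTEX_INITIALIZER;
static int g_generation;
static char g_data[128];

/* Writers bump the generation whenever they change g_data. */
void
update_data(const char *s)
{
	pthread_mutex_lock(&g_mtx);
	strncpy(g_data, s, sizeof(g_data) - 1);
	g_generation++;
	pthread_mutex_unlock(&g_mtx);
}

/* Readers build a consistent snapshot without holding the lock throughout. */
void
snapshot_data(char *out, size_t len)
{
	int gen;

restart:
	pthread_mutex_lock(&g_mtx);
	gen = g_generation;
	pthread_mutex_unlock(&g_mtx);

	/* ... expensive formatting would happen here, lock dropped ... */

	pthread_mutex_lock(&g_mtx);
	if (gen != g_generation) {
		pthread_mutex_unlock(&g_mtx);
		goto restart;		/* the data changed underneath us */
	}
	strncpy(out, g_data, len - 1);
	out[len - 1] = '\0';
	pthread_mutex_unlock(&g_mtx);
}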
*/ free(tmp_data1, M_TEMP); free(tmp_data2, M_TEMP); free(tmp_w1, M_TEMP); free(tmp_w2, M_TEMP); } static int sysctl_debug_witness_badstacks(SYSCTL_HANDLER_ARGS) { struct sbuf *sb; int error; if (witness_watch < 1) { error = SYSCTL_OUT(req, w_notrunning, sizeof(w_notrunning)); return (error); } if (witness_cold) { error = SYSCTL_OUT(req, w_stillcold, sizeof(w_stillcold)); return (error); } error = 0; sb = sbuf_new(NULL, NULL, badstack_sbuf_size, SBUF_AUTOEXTEND); if (sb == NULL) return (ENOMEM); sbuf_print_witness_badstacks(sb, &req->oldidx); sbuf_finish(sb); error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1); sbuf_delete(sb); return (error); } #ifdef DDB static int sbuf_db_printf_drain(void *arg __unused, const char *data, int len) { return (db_printf("%.*s", len, data)); } DB_SHOW_COMMAND_FLAGS(badstacks, db_witness_badstacks, DB_CMD_MEMSAFE) { struct sbuf sb; char buffer[128]; size_t dummy; sbuf_new(&sb, buffer, sizeof(buffer), SBUF_FIXEDLEN); sbuf_set_drain(&sb, sbuf_db_printf_drain, NULL); sbuf_print_witness_badstacks(&sb, &dummy); sbuf_finish(&sb); } #endif static int sysctl_debug_witness_channel(SYSCTL_HANDLER_ARGS) { static const struct { enum witness_channel channel; const char *name; } channels[] = { { WITNESS_CONSOLE, "console" }, { WITNESS_LOG, "log" }, { WITNESS_NONE, "none" }, }; char buf[16]; u_int i; int error; buf[0] = '\0'; for (i = 0; i < nitems(channels); i++) if (witness_channel == channels[i].channel) { snprintf(buf, sizeof(buf), "%s", channels[i].name); break; } error = sysctl_handle_string(oidp, buf, sizeof(buf), req); if (error != 0 || req->newptr == NULL) return (error); error = EINVAL; for (i = 0; i < nitems(channels); i++) if (strcmp(channels[i].name, buf) == 0) { witness_channel = channels[i].channel; error = 0; break; } return (error); } static int sysctl_debug_witness_fullgraph(SYSCTL_HANDLER_ARGS) { struct witness *w; struct sbuf *sb; int error; #ifdef __i386__ error = SYSCTL_OUT(req, w_notallowed, sizeof(w_notallowed)); return (error); #endif if (witness_watch < 1) { error = SYSCTL_OUT(req, w_notrunning, sizeof(w_notrunning)); return (error); } if (witness_cold) { error = SYSCTL_OUT(req, w_stillcold, sizeof(w_stillcold)); return (error); } error = 0; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sb = sbuf_new_for_sysctl(NULL, NULL, FULLGRAPH_SBUF_SIZE, req); if (sb == NULL) return (ENOMEM); - sbuf_printf(sb, "\n"); + sbuf_putc(sb, '\n'); mtx_lock_spin(&w_mtx); STAILQ_FOREACH(w, &w_all, w_list) w->w_displayed = 0; STAILQ_FOREACH(w, &w_all, w_list) witness_add_fullgraph(sb, w); mtx_unlock_spin(&w_mtx); /* * Close the sbuf and return to userland. */ error = sbuf_finish(sb); sbuf_delete(sb); return (error); } static int sysctl_debug_witness_watch(SYSCTL_HANDLER_ARGS) { int error, value; value = witness_watch; error = sysctl_handle_int(oidp, &value, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (value > 1 || value < -1 || (witness_watch == -1 && value != witness_watch)) return (EINVAL); witness_watch = value; return (0); } static void witness_add_fullgraph(struct sbuf *sb, struct witness *w) { int i; if (w->w_displayed != 0 || (w->w_file == NULL && w->w_line == 0)) return; w->w_displayed = 1; WITNESS_INDEX_ASSERT(w->w_index); for (i = 1; i <= w_max_used_index; i++) { if (w_rmatrix[w->w_index][i] & WITNESS_PARENT) { sbuf_printf(sb, "\"%s\",\"%s\"\n", w->w_name, w_data[i].w_name); witness_add_fullgraph(sb, &w_data[i]); } } } /* * A simple hash function. Takes a key pointer and a key size. 
If size == 0, * interprets the key as a string and reads until the null * terminator. Otherwise, reads the first size bytes. Returns an unsigned 32-bit * hash value computed from the key. */ static uint32_t witness_hash_djb2(const uint8_t *key, uint32_t size) { unsigned int hash = 5381; int i; /* hash = hash * 33 + key[i] */ if (size) for (i = 0; i < size; i++) hash = ((hash << 5) + hash) + (unsigned int)key[i]; else for (i = 0; key[i] != 0; i++) hash = ((hash << 5) + hash) + (unsigned int)key[i]; return (hash); } /* * Initializes the two witness hash tables. Called exactly once from * witness_initialize(). */ static void witness_init_hash_tables(void) { int i; MPASS(witness_cold); /* Initialize the hash tables. */ for (i = 0; i < WITNESS_HASH_SIZE; i++) w_hash.wh_array[i] = NULL; w_hash.wh_size = WITNESS_HASH_SIZE; w_hash.wh_count = 0; /* Initialize the lock order data hash. */ w_lofree = NULL; for (i = 0; i < WITNESS_LO_DATA_COUNT; i++) { memset(&w_lodata[i], 0, sizeof(w_lodata[i])); w_lodata[i].wlod_next = w_lofree; w_lofree = &w_lodata[i]; } w_lohash.wloh_size = WITNESS_LO_HASH_SIZE; w_lohash.wloh_count = 0; for (i = 0; i < WITNESS_LO_HASH_SIZE; i++) w_lohash.wloh_array[i] = NULL; } static struct witness * witness_hash_get(const char *key) { struct witness *w; uint32_t hash; MPASS(key != NULL); if (witness_cold == 0) mtx_assert(&w_mtx, MA_OWNED); hash = witness_hash_djb2(key, 0) % w_hash.wh_size; w = w_hash.wh_array[hash]; while (w != NULL) { if (strcmp(w->w_name, key) == 0) goto out; w = w->w_hash_next; } out: return (w); } static void witness_hash_put(struct witness *w) { uint32_t hash; MPASS(w != NULL); MPASS(w->w_name != NULL); if (witness_cold == 0) mtx_assert(&w_mtx, MA_OWNED); KASSERT(witness_hash_get(w->w_name) == NULL, ("%s: trying to add a hash entry that already exists!", __func__)); KASSERT(w->w_hash_next == NULL, ("%s: w->w_hash_next != NULL", __func__)); hash = witness_hash_djb2(w->w_name, 0) % w_hash.wh_size; w->w_hash_next = w_hash.wh_array[hash]; w_hash.wh_array[hash] = w; w_hash.wh_count++; } static struct witness_lock_order_data * witness_lock_order_get(struct witness *parent, struct witness *child) { struct witness_lock_order_data *data = NULL; struct witness_lock_order_key key; unsigned int hash; MPASS(parent != NULL && child != NULL); key.from = parent->w_index; key.to = child->w_index; WITNESS_INDEX_ASSERT(key.from); WITNESS_INDEX_ASSERT(key.to); if ((w_rmatrix[parent->w_index][child->w_index] & WITNESS_LOCK_ORDER_KNOWN) == 0) goto out; hash = witness_hash_djb2((const char*)&key, sizeof(key)) % w_lohash.wloh_size; data = w_lohash.wloh_array[hash]; while (data != NULL) { if (witness_lock_order_key_equal(&data->wlod_key, &key)) break; data = data->wlod_next; } out: return (data); } /* * Verify that parent and child have a known relationship, are not the same, * and child is actually a child of parent. This is done without w_mtx * to avoid contention in the common case. 
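/*
 * Editor's note: witness_hash_djb2() above is the classic djb2 string hash
 * (hash = hash * 33 + byte), applied both to witness names and to
 * fixed-size binary keys such as the (from, to) lock-order pair.  Below is
 * a self-contained userland analogue for experimentation; TABLE_SIZE and
 * struct order_key are arbitrary choices for the example, not kernel
 * definitions.
 */
#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define TABLE_SIZE 251

static uint32_t
hash_djb2(const void *key, size_t size)
{
	const uint8_t *p = key;
	uint32_t hash = 5381;

	if (size == 0)
		size = strlen(key);		/* string mode: stop at NUL */
	for (size_t i = 0; i < size; i++)
		hash = ((hash << 5) + hash) + p[i];	/* hash * 33 + byte */
	return (hash);
}

struct order_key {			/* like a (from, to) index pair */
	int from;
	int to;
};

int
main(void)
{
	struct order_key k = { 3, 7 };

	printf("bucket for name:   %u\n",
	    hash_djb2("vnode interlock", 0) % TABLE_SIZE);
	printf("bucket for (3, 7): %u\n",
	    hash_djb2(&k, sizeof(k)) % TABLE_SIZE);
	return (0);
}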
*/ static int witness_lock_order_check(struct witness *parent, struct witness *child) { if (parent != child && w_rmatrix[parent->w_index][child->w_index] & WITNESS_LOCK_ORDER_KNOWN && isitmychild(parent, child)) return (1); return (0); } static int witness_lock_order_add(struct witness *parent, struct witness *child) { struct witness_lock_order_data *data = NULL; struct witness_lock_order_key key; unsigned int hash; MPASS(parent != NULL && child != NULL); key.from = parent->w_index; key.to = child->w_index; WITNESS_INDEX_ASSERT(key.from); WITNESS_INDEX_ASSERT(key.to); if (w_rmatrix[parent->w_index][child->w_index] & WITNESS_LOCK_ORDER_KNOWN) return (1); hash = witness_hash_djb2((const char*)&key, sizeof(key)) % w_lohash.wloh_size; w_rmatrix[parent->w_index][child->w_index] |= WITNESS_LOCK_ORDER_KNOWN; data = w_lofree; if (data == NULL) return (0); w_lofree = data->wlod_next; data->wlod_next = w_lohash.wloh_array[hash]; data->wlod_key = key; w_lohash.wloh_array[hash] = data; w_lohash.wloh_count++; stack_save(&data->wlod_stack); return (1); } /* Call this whenever the structure of the witness graph changes. */ static void witness_increment_graph_generation(void) { if (witness_cold == 0) mtx_assert(&w_mtx, MA_OWNED); w_generation++; } static int witness_output_drain(void *arg __unused, const char *data, int len) { witness_output("%.*s", len, data); return (len); } static void witness_debugger(int cond, const char *msg) { char buf[32]; struct sbuf sb; struct stack st; if (!cond) return; if (witness_trace) { sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN); sbuf_set_drain(&sb, witness_output_drain, NULL); stack_save(&st); witness_output("stack backtrace:\n"); stack_sbuf_print_ddb(&sb, &st); sbuf_finish(&sb); } witness_enter_debugger(msg); } static void witness_enter_debugger(const char *msg) { #ifdef KDB if (witness_kdb) kdb_enter(KDB_WHY_WITNESS, msg); #endif } diff --git a/sys/kern/tty_info.c b/sys/kern/tty_info.c index 15ba5995cea9..f54fc3a30f5e 100644 --- a/sys/kern/tty_info.c +++ b/sys/kern/tty_info.c @@ -1,408 +1,408 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1990, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Copyright (c) 2002 Networks Associates Technologies, Inc. * All rights reserved. * * Portions of this software were developed for the FreeBSD Project by * ThinkSec AS and NAI Labs, the Security Research Division of Network * Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 * ("CBOSS"), as part of the DARPA CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include "opt_stack.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Returns 1 if p2 is "better" than p1 * * The algorithm for picking the "interesting" process is thus: * * 1) Only foreground processes are eligible - implied. * 2) Runnable processes are favored over anything else. The runner * with the highest cpu utilization is picked (p_estcpu). Ties are * broken by picking the highest pid. * 3) The sleeper with the shortest sleep time is next. With ties, * we pick out just "short-term" sleepers (P_SINTR == 0). * 4) Further ties are broken by picking the highest pid. */ #define TESTAB(a, b) ((a)<<1 | (b)) #define ONLYA 2 #define ONLYB 1 #define BOTH 3 static int proc_sum(struct proc *p, fixpt_t *estcpup) { struct thread *td; int estcpu; int val; val = 0; estcpu = 0; FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); if (TD_ON_RUNQ(td) || TD_IS_RUNNING(td)) val = 1; estcpu += sched_pctcpu(td); thread_unlock(td); } *estcpup = estcpu; return (val); } static int thread_compare(struct thread *td, struct thread *td2) { int runa, runb; int slpa, slpb; fixpt_t esta, estb; if (td == NULL) return (1); /* * Fetch running stats, pctcpu usage, and interruptable flag. */ thread_lock(td); runa = TD_IS_RUNNING(td) || TD_ON_RUNQ(td); slpa = td->td_flags & TDF_SINTR; esta = sched_pctcpu(td); thread_unlock(td); thread_lock(td2); runb = TD_IS_RUNNING(td2) || TD_ON_RUNQ(td2); estb = sched_pctcpu(td2); slpb = td2->td_flags & TDF_SINTR; thread_unlock(td2); /* * see if at least one of them is runnable */ switch (TESTAB(runa, runb)) { case ONLYA: return (0); case ONLYB: return (1); case BOTH: break; } /* * favor one with highest recent cpu utilization */ if (estb > esta) return (1); if (esta > estb) return (0); /* * favor one sleeping in a non-interruptible sleep */ switch (TESTAB(slpa, slpb)) { case ONLYA: return (0); case ONLYB: return (1); case BOTH: break; } return (td < td2); } static int proc_compare(struct proc *p1, struct proc *p2) { int runa, runb; fixpt_t esta, estb; if (p1 == NULL) return (1); /* * Fetch various stats about these processes. After we drop the * lock the information could be stale but the race is unimportant. 
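/*
 * Editor's note: the TESTAB()/ONLYA/ONLYB/BOTH idiom used by
 * thread_compare() and proc_compare() below packs two boolean test results
 * into a 2-bit value so a single switch can express "only the first",
 * "only the second", or "both/neither".  This toy comparison reuses the
 * same macros; prefer_b() and its parameters are invented for the example.
 */
#include <stdio.h>

#define TESTAB(a, b)	((a)<<1 | (b))
#define ONLYA	2
#define ONLYB	1
#define BOTH	3

/* Return 1 if candidate B should be preferred over candidate A. */
static int
prefer_b(int a_runnable, int b_runnable, int a_pid, int b_pid)
{
	switch (TESTAB(a_runnable != 0, b_runnable != 0)) {
	case ONLYA:
		return (0);
	case ONLYB:
		return (1);
	case BOTH:
	default:
		break;
	}
	return (b_pid > a_pid);		/* tie: pick the highest pid */
}

int
main(void)
{
	printf("%d\n", prefer_b(0, 1, 10, 20));	/* only B runnable -> 1 */
	printf("%d\n", prefer_b(1, 1, 10, 20));	/* both runnable -> pid wins -> 1 */
	return (0);
}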
*/ PROC_LOCK(p1); runa = proc_sum(p1, &esta); PROC_UNLOCK(p1); PROC_LOCK(p2); runb = proc_sum(p2, &estb); PROC_UNLOCK(p2); /* * see if at least one of them is runnable */ switch (TESTAB(runa, runb)) { case ONLYA: return (0); case ONLYB: return (1); case BOTH: break; } /* * favor one with highest recent cpu utilization */ if (estb > esta) return (1); if (esta > estb) return (0); /* * weed out zombies */ switch (TESTAB(p1->p_state == PRS_ZOMBIE, p2->p_state == PRS_ZOMBIE)) { case ONLYA: return (1); case ONLYB: return (0); case BOTH: break; } return (p2->p_pid > p1->p_pid); /* tie - return highest pid */ } static int sbuf_tty_drain(void *a, const char *d, int len) { struct tty *tp; int rc; tp = a; if (kdb_active) { cnputsn(d, len); return (len); } if (tp != NULL && !KERNEL_PANICKED()) { rc = tty_putstrn(tp, d, len); if (rc != 0) return (-ENXIO); return (len); } return (-ENXIO); } #ifdef STACK #ifdef INVARIANTS static int tty_info_kstacks = STACK_SBUF_FMT_COMPACT; #else static int tty_info_kstacks = STACK_SBUF_FMT_NONE; #endif static int sysctl_tty_info_kstacks(SYSCTL_HANDLER_ARGS) { enum stack_sbuf_fmt val; int error; val = tty_info_kstacks; error = sysctl_handle_int(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); switch (val) { case STACK_SBUF_FMT_NONE: case STACK_SBUF_FMT_LONG: case STACK_SBUF_FMT_COMPACT: tty_info_kstacks = val; break; default: error = EINVAL; } return (error); } SYSCTL_PROC(_kern, OID_AUTO, tty_info_kstacks, CTLFLAG_RWTUN | CTLFLAG_MPSAFE | CTLTYPE_INT, NULL, 0, sysctl_tty_info_kstacks, "I", "Adjust format of kernel stack(9) traces on ^T (tty info): " "0 - disabled; 1 - long; 2 - compact"); #endif /* * Report on state of foreground process group. */ void tty_info(struct tty *tp) { struct timeval rtime, utime, stime; #ifdef STACK struct stack stack; int sterr, kstacks_val; bool print_kstacks; #endif struct proc *p, *ppick; struct thread *td, *tdpick; const char *stateprefix, *state; struct sbuf sb; long rss; int load, pctcpu; pid_t pid; char comm[MAXCOMLEN + 1]; struct rusage ru; tty_assert_locked(tp); if (tty_checkoutq(tp) == 0) return; (void)sbuf_new(&sb, tp->t_prbuf, tp->t_prbufsz, SBUF_FIXEDLEN); sbuf_set_drain(&sb, sbuf_tty_drain, tp); /* Print load average. */ load = ((int64_t)averunnable.ldavg[0] * 100 + FSCALE / 2) >> FSHIFT; sbuf_printf(&sb, "%sload: %d.%02d ", tp->t_column == 0 ? "" : "\n", load / 100, load % 100); if (tp->t_session == NULL) { - sbuf_printf(&sb, "not a controlling terminal\n"); + sbuf_cat(&sb, "not a controlling terminal\n"); goto out; } if (tp->t_pgrp == NULL) { - sbuf_printf(&sb, "no foreground process group\n"); + sbuf_cat(&sb, "no foreground process group\n"); goto out; } PGRP_LOCK(tp->t_pgrp); if (LIST_EMPTY(&tp->t_pgrp->pg_members)) { PGRP_UNLOCK(tp->t_pgrp); - sbuf_printf(&sb, "empty foreground process group\n"); + sbuf_cat(&sb, "empty foreground process group\n"); goto out; } /* * Pick the most interesting process and copy some of its * state for printing later. This operation could rely on stale * data as we can't hold the proc slock or thread locks over the * whole list. However, we're guaranteed not to reference an exited * thread or proc since we hold the tty locked. 
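/*
 * Editor's note: the load average and %CPU figures printed by tty_info()
 * are fixed-point values scaled by FSCALE (1 << FSHIFT).  The conversion
 * above multiplies first, adds FSCALE / 2 to round to nearest, and then
 * shifts the scale factor back out.  A standalone illustration, assuming
 * the common FSHIFT value of 11:
 */
#include <stdio.h>
#include <stdint.h>

#define FSHIFT	11
#define FSCALE	(1 << FSHIFT)

int
main(void)
{
	/* A load average of 1.37 stored as a fixed-point value. */
	uint32_t ldavg = (uint32_t)(1.37 * FSCALE);
	int load = (int)(((int64_t)ldavg * 100 + FSCALE / 2) >> FSHIFT);

	printf("load: %d.%02d\n", load / 100, load % 100);	/* "load: 1.37" */
	return (0);
}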
*/ p = NULL; LIST_FOREACH(ppick, &tp->t_pgrp->pg_members, p_pglist) if (proc_compare(p, ppick)) p = ppick; PROC_LOCK(p); PGRP_UNLOCK(tp->t_pgrp); td = NULL; FOREACH_THREAD_IN_PROC(p, tdpick) if (thread_compare(td, tdpick)) td = tdpick; stateprefix = ""; thread_lock(td); if (TD_IS_RUNNING(td)) state = "running"; else if (TD_ON_RUNQ(td) || TD_CAN_RUN(td)) state = "runnable"; else if (TD_IS_SLEEPING(td)) { /* XXX: If we're sleeping, are we ever not in a queue? */ if (TD_ON_SLEEPQ(td)) state = td->td_wmesg; else state = "sleeping without queue"; } else if (TD_ON_LOCK(td)) { state = td->td_lockname; stateprefix = "*"; } else if (TD_IS_SUSPENDED(td)) state = "suspended"; else if (TD_AWAITING_INTR(td)) state = "intrwait"; else if (p->p_state == PRS_ZOMBIE) state = "zombie"; else state = "unknown"; pctcpu = (sched_pctcpu(td) * 10000 + FSCALE / 2) >> FSHIFT; #ifdef STACK kstacks_val = atomic_load_int(&tty_info_kstacks); print_kstacks = (kstacks_val != STACK_SBUF_FMT_NONE); if (print_kstacks) { if (TD_IS_SWAPPED(td)) sterr = ENOENT; else sterr = stack_save_td(&stack, td); } #endif thread_unlock(td); if (p->p_state == PRS_NEW || p->p_state == PRS_ZOMBIE) rss = 0; else rss = pgtok(vmspace_resident_count(p->p_vmspace)); microuptime(&rtime); timevalsub(&rtime, &p->p_stats->p_start); rufetchcalc(p, &ru, &utime, &stime); pid = p->p_pid; strlcpy(comm, p->p_comm, sizeof comm); PROC_UNLOCK(p); /* Print command, pid, state, rtime, utime, stime, %cpu, and rss. */ sbuf_printf(&sb, " cmd: %s %d [%s%s] %ld.%02ldr %ld.%02ldu %ld.%02lds %d%% %ldk\n", comm, pid, stateprefix, state, (long)rtime.tv_sec, rtime.tv_usec / 10000, (long)utime.tv_sec, utime.tv_usec / 10000, (long)stime.tv_sec, stime.tv_usec / 10000, pctcpu / 100, rss); #ifdef STACK if (print_kstacks && sterr == 0) stack_sbuf_print_flags(&sb, &stack, M_NOWAIT, kstacks_val); #endif out: sbuf_finish(&sb); sbuf_delete(&sb); } diff --git a/sys/kern/vfs_mountroot.c b/sys/kern/vfs_mountroot.c index 7a429e6392b5..569f8560788c 100644 --- a/sys/kern/vfs_mountroot.c +++ b/sys/kern/vfs_mountroot.c @@ -1,1166 +1,1166 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2010 Marcel Moolenaar * Copyright (c) 1999-2004 Poul-Henning Kamp * Copyright (c) 1999 Michael Smith * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include "opt_rootdevname.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * The root filesystem is detailed in the kernel environment variable * vfs.root.mountfrom, which is expected to be in the general format * * :[][ :[] ...] * vfsname := the name of a VFS known to the kernel and capable * of being mounted as root * path := disk device name or other data used by the filesystem * to locate its physical store * * If the environment variable vfs.root.mountfrom is a space separated list, * each list element is tried in turn and the root filesystem will be mounted * from the first one that succeeds. * * The environment variable vfs.root.mountfrom.options is a comma delimited * set of string mount options. These mount options must be parseable * by nmount() in the kernel. */ static int parse_mount(char **); static struct mntarg *parse_mountroot_options(struct mntarg *, const char *); static int sysctl_vfs_root_mount_hold(SYSCTL_HANDLER_ARGS); static void vfs_mountroot_wait(void); static int vfs_mountroot_wait_if_neccessary(const char *fs, const char *dev); /* * The vnode of the system's root (/ in the filesystem, without chroot * active.) */ struct vnode *rootvnode; /* * Mount of the system's /dev. */ struct mount *rootdevmp; char *rootdevnames[2] = {NULL, NULL}; struct mtx root_holds_mtx; MTX_SYSINIT(root_holds, &root_holds_mtx, "root_holds", MTX_DEF); static TAILQ_HEAD(, root_hold_token) root_holds = TAILQ_HEAD_INITIALIZER(root_holds); enum action { A_CONTINUE, A_PANIC, A_REBOOT, A_RETRY }; enum rh_flags { RH_FREE, RH_ALLOC, RH_ARG, }; static enum action root_mount_onfail = A_CONTINUE; static int root_mount_mddev; static int root_mount_complete; /* By default wait up to 3 seconds for devices to appear. 
*/ static int root_mount_timeout = 3; TUNABLE_INT("vfs.mountroot.timeout", &root_mount_timeout); static int root_mount_always_wait = 0; SYSCTL_INT(_vfs, OID_AUTO, root_mount_always_wait, CTLFLAG_RDTUN, &root_mount_always_wait, 0, "Wait for root mount holds even if the root device already exists"); SYSCTL_PROC(_vfs, OID_AUTO, root_mount_hold, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_vfs_root_mount_hold, "A", "List of root mount hold tokens"); static int sysctl_vfs_root_mount_hold(SYSCTL_HANDLER_ARGS) { struct sbuf sb; struct root_hold_token *h; int error; sbuf_new(&sb, NULL, 256, SBUF_AUTOEXTEND | SBUF_INCLUDENUL); mtx_lock(&root_holds_mtx); TAILQ_FOREACH(h, &root_holds, list) { if (h != TAILQ_FIRST(&root_holds)) sbuf_putc(&sb, ' '); sbuf_printf(&sb, "%s", h->who); } mtx_unlock(&root_holds_mtx); error = sbuf_finish(&sb); if (error == 0) error = SYSCTL_OUT(req, sbuf_data(&sb), sbuf_len(&sb)); sbuf_delete(&sb); return (error); } struct root_hold_token * root_mount_hold(const char *identifier) { struct root_hold_token *h; h = malloc(sizeof *h, M_DEVBUF, M_ZERO | M_WAITOK); h->flags = RH_ALLOC; h->who = identifier; mtx_lock(&root_holds_mtx); TSHOLD("root mount"); TAILQ_INSERT_TAIL(&root_holds, h, list); mtx_unlock(&root_holds_mtx); return (h); } void root_mount_hold_token(const char *identifier, struct root_hold_token *h) { #ifdef INVARIANTS struct root_hold_token *t; #endif h->flags = RH_ARG; h->who = identifier; mtx_lock(&root_holds_mtx); #ifdef INVARIANTS TAILQ_FOREACH(t, &root_holds, list) { if (t == h) { panic("Duplicate mount hold by '%s' on %p", identifier, h); } } #endif TSHOLD("root mount"); TAILQ_INSERT_TAIL(&root_holds, h, list); mtx_unlock(&root_holds_mtx); } void root_mount_rel(struct root_hold_token *h) { if (h == NULL || h->flags == RH_FREE) return; mtx_lock(&root_holds_mtx); TAILQ_REMOVE(&root_holds, h, list); TSRELEASE("root mount"); wakeup(&root_holds); mtx_unlock(&root_holds_mtx); if (h->flags == RH_ALLOC) { free(h, M_DEVBUF); } else h->flags = RH_FREE; } int root_mounted(void) { /* No mutex is acquired here because int stores are atomic. */ return (root_mount_complete); } static void set_rootvnode(void) { if (VFS_ROOT(TAILQ_FIRST(&mountlist), LK_EXCLUSIVE, &rootvnode)) panic("set_rootvnode: Cannot find root vnode"); VOP_UNLOCK(rootvnode); pwd_set_rootvnode(); } static int vfs_mountroot_devfs(struct thread *td, struct mount **mpp) { struct vfsoptlist *opts; struct vfsconf *vfsp; struct mount *mp; int error; *mpp = NULL; if (rootdevmp != NULL) { /* * Already have /dev; this happens during rerooting. 
*/ error = vfs_busy(rootdevmp, 0); if (error != 0) return (error); *mpp = rootdevmp; } else { vfsp = vfs_byname("devfs"); KASSERT(vfsp != NULL, ("Could not find devfs by name")); if (vfsp == NULL) return (ENOENT); mp = vfs_mount_alloc(NULLVP, vfsp, "/dev", td->td_ucred); error = VFS_MOUNT(mp); KASSERT(error == 0, ("VFS_MOUNT(devfs) failed %d", error)); if (error) return (error); error = VFS_STATFS(mp, &mp->mnt_stat); KASSERT(error == 0, ("VFS_STATFS(devfs) failed %d", error)); if (error) return (error); opts = malloc(sizeof(struct vfsoptlist), M_MOUNT, M_WAITOK); TAILQ_INIT(opts); mp->mnt_opt = opts; mtx_lock(&mountlist_mtx); TAILQ_INSERT_HEAD(&mountlist, mp, mnt_list); mtx_unlock(&mountlist_mtx); *mpp = mp; rootdevmp = mp; vfs_op_exit(mp); } set_rootvnode(); error = kern_symlinkat(td, "/", AT_FDCWD, "dev", UIO_SYSSPACE); if (error) printf("kern_symlink /dev -> / returns %d\n", error); return (error); } static void vfs_mountroot_shuffle(struct thread *td, struct mount *mpdevfs) { struct nameidata nd; struct mount *mporoot, *mpnroot; struct vnode *vp, *vporoot, *vpdevfs; char *fspath; int error; mpnroot = TAILQ_NEXT(mpdevfs, mnt_list); /* Shuffle the mountlist. */ mtx_lock(&mountlist_mtx); mporoot = TAILQ_FIRST(&mountlist); TAILQ_REMOVE(&mountlist, mpdevfs, mnt_list); if (mporoot != mpdevfs) { TAILQ_REMOVE(&mountlist, mpnroot, mnt_list); TAILQ_INSERT_HEAD(&mountlist, mpnroot, mnt_list); } TAILQ_INSERT_TAIL(&mountlist, mpdevfs, mnt_list); mtx_unlock(&mountlist_mtx); cache_purgevfs(mporoot); if (mporoot != mpdevfs) cache_purgevfs(mpdevfs); if (VFS_ROOT(mporoot, LK_EXCLUSIVE, &vporoot)) panic("vfs_mountroot_shuffle: Cannot find root vnode"); VI_LOCK(vporoot); vporoot->v_iflag &= ~VI_MOUNT; vn_irflag_unset_locked(vporoot, VIRF_MOUNTPOINT); vporoot->v_mountedhere = NULL; VI_UNLOCK(vporoot); mporoot->mnt_flag &= ~MNT_ROOTFS; mporoot->mnt_vnodecovered = NULL; vput(vporoot); /* Set up the new rootvnode, and purge the cache */ mpnroot->mnt_vnodecovered = NULL; set_rootvnode(); cache_purgevfs(rootvnode->v_mount); if (mporoot != mpdevfs) { /* Remount old root under /.mount or /mnt */ fspath = "/.mount"; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspath); error = namei(&nd); if (error) { fspath = "/mnt"; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspath); error = namei(&nd); } if (!error) { NDFREE_PNBUF(&nd); vp = nd.ni_vp; error = (vp->v_type == VDIR) ? 0 : ENOTDIR; if (!error) error = vinvalbuf(vp, V_SAVE, 0, 0); if (!error) { cache_purge(vp); VI_LOCK(vp); mporoot->mnt_vnodecovered = vp; vn_irflag_set_locked(vp, VIRF_MOUNTPOINT); vp->v_mountedhere = mporoot; strlcpy(mporoot->mnt_stat.f_mntonname, fspath, MNAMELEN); VI_UNLOCK(vp); VOP_UNLOCK(vp); } else vput(vp); } if (error) printf("mountroot: unable to remount previous root " "under /.mount or /mnt (error %d)\n", error); } /* Remount devfs under /dev */ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, "/dev"); error = namei(&nd); if (!error) { NDFREE_PNBUF(&nd); vp = nd.ni_vp; error = (vp->v_type == VDIR) ? 
0 : ENOTDIR; if (!error) error = vinvalbuf(vp, V_SAVE, 0, 0); if (!error) { vpdevfs = mpdevfs->mnt_vnodecovered; if (vpdevfs != NULL) { cache_purge(vpdevfs); VI_LOCK(vpdevfs); vn_irflag_unset_locked(vpdevfs, VIRF_MOUNTPOINT); vpdevfs->v_mountedhere = NULL; VI_UNLOCK(vpdevfs); vrele(vpdevfs); } VI_LOCK(vp); mpdevfs->mnt_vnodecovered = vp; vn_irflag_set_locked(vp, VIRF_MOUNTPOINT); vp->v_mountedhere = mpdevfs; VI_UNLOCK(vp); VOP_UNLOCK(vp); } else vput(vp); } if (error) printf("mountroot: unable to remount devfs under /dev " "(error %d)\n", error); if (mporoot == mpdevfs) { vfs_unbusy(mpdevfs); /* Unlink the no longer needed /dev/dev -> / symlink */ error = kern_funlinkat(td, AT_FDCWD, "/dev/dev", FD_NONE, UIO_SYSSPACE, 0, 0); if (error) printf("mountroot: unable to unlink /dev/dev " "(error %d)\n", error); } } /* * Configuration parser. */ /* Parser character classes. */ #define CC_WHITESPACE -1 #define CC_NONWHITESPACE -2 /* Parse errors. */ #define PE_EOF -1 #define PE_EOL -2 static __inline int parse_peek(char **conf) { return (**conf); } static __inline void parse_poke(char **conf, int c) { **conf = c; } static __inline void parse_advance(char **conf) { (*conf)++; } static int parse_skipto(char **conf, int mc) { int c, match; while (1) { c = parse_peek(conf); if (c == 0) return (PE_EOF); switch (mc) { case CC_WHITESPACE: match = (c == ' ' || c == '\t' || c == '\n') ? 1 : 0; break; case CC_NONWHITESPACE: if (c == '\n') return (PE_EOL); match = (c != ' ' && c != '\t') ? 1 : 0; break; default: match = (c == mc) ? 1 : 0; break; } if (match) break; parse_advance(conf); } return (0); } static int parse_token(char **conf, char **tok) { char *p; size_t len; int error; *tok = NULL; error = parse_skipto(conf, CC_NONWHITESPACE); if (error) return (error); p = *conf; error = parse_skipto(conf, CC_WHITESPACE); len = *conf - p; *tok = malloc(len + 1, M_TEMP, M_WAITOK | M_ZERO); bcopy(p, *tok, len); return (0); } static void parse_dir_ask_printenv(const char *var) { char *val; val = kern_getenv(var); if (val != NULL) { printf(" %s=%s\n", var, val); freeenv(val); } } static int parse_dir_ask(char **conf) { char name[80]; char *mnt; int error; vfs_mountroot_wait(); printf("\nLoader variables:\n"); parse_dir_ask_printenv("vfs.root.mountfrom"); parse_dir_ask_printenv("vfs.root.mountfrom.options"); printf("\nManual root filesystem specification:\n"); printf(" : [options]\n"); printf(" Mount using filesystem \n"); printf(" and with the specified (optional) option list.\n"); printf("\n"); printf(" eg. ufs:/dev/da0s1a\n"); printf(" zfs:zroot/ROOT/default\n"); printf(" cd9660:/dev/cd0 ro\n"); printf(" (which is equivalent to: "); printf("mount -t cd9660 -o ro /dev/cd0 /)\n"); printf("\n"); printf(" ? List valid disk boot devices\n"); printf(" . Yield 1 second (for background tasks)\n"); printf(" Abort manual input\n"); do { error = EINVAL; printf("\nmountroot> "); cngets(name, sizeof(name), GETS_ECHO); if (name[0] == '\0') break; if (name[0] == '?' && name[1] == '\0') { printf("\nList of GEOM managed disk devices:\n "); g_dev_print(); continue; } if (name[0] == '.' 
&& name[1] == '\0') { pause("rmask", hz); continue; } mnt = name; error = parse_mount(&mnt); if (error == -1) printf("Invalid file system specification.\n"); } while (error != 0); return (error); } static int parse_dir_md(char **conf) { struct stat sb; struct thread *td; struct md_ioctl *mdio; char *path, *tok; int error, fd, len; td = curthread; fd = -1; error = parse_token(conf, &tok); if (error) return (error); len = strlen(tok); mdio = malloc(sizeof(*mdio) + len + 1, M_TEMP, M_WAITOK | M_ZERO); path = (void *)(mdio + 1); bcopy(tok, path, len); free(tok, M_TEMP); /* Get file status. */ error = kern_statat(td, 0, AT_FDCWD, path, UIO_SYSSPACE, &sb); if (error) goto out; /* Open /dev/mdctl so that we can attach/detach. */ error = kern_openat(td, AT_FDCWD, "/dev/" MDCTL_NAME, UIO_SYSSPACE, O_RDWR, 0); if (error) goto out; fd = td->td_retval[0]; mdio->md_version = MDIOVERSION; mdio->md_type = MD_VNODE; if (root_mount_mddev != -1) { mdio->md_unit = root_mount_mddev; (void)kern_ioctl(td, fd, MDIOCDETACH, (void *)mdio); /* Ignore errors. We don't care. */ root_mount_mddev = -1; } mdio->md_file = (void *)(mdio + 1); mdio->md_options = MD_AUTOUNIT | MD_READONLY; mdio->md_mediasize = sb.st_size; mdio->md_unit = 0; error = kern_ioctl(td, fd, MDIOCATTACH, (void *)mdio); if (error) goto out; if (mdio->md_unit > 9) { printf("rootmount: too many md units\n"); mdio->md_file = NULL; mdio->md_options = 0; mdio->md_mediasize = 0; error = kern_ioctl(td, fd, MDIOCDETACH, (void *)mdio); /* Ignore errors. We don't care. */ error = ERANGE; goto out; } root_mount_mddev = mdio->md_unit; printf(MD_NAME "%u attached to %s\n", root_mount_mddev, mdio->md_file); out: if (fd >= 0) (void)kern_close(td, fd); free(mdio, M_TEMP); return (error); } static int parse_dir_onfail(char **conf) { char *action; int error; error = parse_token(conf, &action); if (error) return (error); if (!strcmp(action, "continue")) root_mount_onfail = A_CONTINUE; else if (!strcmp(action, "panic")) root_mount_onfail = A_PANIC; else if (!strcmp(action, "reboot")) root_mount_onfail = A_REBOOT; else if (!strcmp(action, "retry")) root_mount_onfail = A_RETRY; else { printf("rootmount: %s: unknown action\n", action); error = EINVAL; } free(action, M_TEMP); return (0); } static int parse_dir_timeout(char **conf) { char *tok, *endtok; long secs; int error; error = parse_token(conf, &tok); if (error) return (error); secs = strtol(tok, &endtok, 0); error = (secs < 0 || *endtok != '\0') ? EINVAL : 0; if (!error) root_mount_timeout = secs; free(tok, M_TEMP); return (error); } static int parse_directive(char **conf) { char *dir; int error; error = parse_token(conf, &dir); if (error) return (error); if (strcmp(dir, ".ask") == 0) error = parse_dir_ask(conf); else if (strcmp(dir, ".md") == 0) error = parse_dir_md(conf); else if (strcmp(dir, ".onfail") == 0) error = parse_dir_onfail(conf); else if (strcmp(dir, ".timeout") == 0) error = parse_dir_timeout(conf); else { printf("mountroot: invalid directive `%s'\n", dir); /* Ignore the rest of the line. 
*/ (void)parse_skipto(conf, '\n'); error = EINVAL; } free(dir, M_TEMP); return (error); } static bool parse_mount_dev_present(const char *dev) { struct nameidata nd; int error; NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, dev); error = namei(&nd); if (error != 0) return (false); vrele(nd.ni_vp); NDFREE_PNBUF(&nd); return (true); } #define ERRMSGL 255 static int parse_mount(char **conf) { char *errmsg; struct mntarg *ma; char *dev, *fs, *opts, *tok; int delay, error, timeout; error = parse_token(conf, &tok); if (error) return (error); fs = tok; error = parse_skipto(&tok, ':'); if (error) { free(fs, M_TEMP); return (error); } parse_poke(&tok, '\0'); parse_advance(&tok); dev = tok; if (root_mount_mddev != -1) { /* Handle substitution for the md unit number. */ tok = strstr(dev, "md#"); if (tok != NULL) tok[2] = '0' + root_mount_mddev; } /* Parse options. */ error = parse_token(conf, &tok); opts = (error == 0) ? tok : NULL; printf("Trying to mount root from %s:%s [%s]...\n", fs, dev, (opts != NULL) ? opts : ""); errmsg = malloc(ERRMSGL, M_TEMP, M_WAITOK | M_ZERO); if (vfs_byname(fs) == NULL) { strlcpy(errmsg, "unknown file system", ERRMSGL); error = ENOENT; goto out; } error = vfs_mountroot_wait_if_neccessary(fs, dev); if (error != 0) goto out; delay = hz / 10; timeout = root_mount_timeout * hz; for (;;) { ma = NULL; ma = mount_arg(ma, "fstype", fs, -1); ma = mount_arg(ma, "fspath", "/", -1); ma = mount_arg(ma, "from", dev, -1); ma = mount_arg(ma, "errmsg", errmsg, ERRMSGL); ma = mount_arg(ma, "ro", NULL, 0); ma = parse_mountroot_options(ma, opts); error = kernel_mount(ma, MNT_ROOTFS); if (error == 0 || error == EILSEQ || timeout <= 0) break; if (root_mount_timeout * hz == timeout || (bootverbose && timeout % hz == 0)) { printf("Mounting from %s:%s failed with error %d; " "retrying for %d more second%s\n", fs, dev, error, timeout / hz, (timeout / hz > 1) ? "s" : ""); } pause("rmretry", delay); timeout -= delay; } out: if (error) { printf("Mounting from %s:%s failed with error %d", fs, dev, error); if (errmsg[0] != '\0') printf(": %s", errmsg); printf(".\n"); } free(fs, M_TEMP); free(errmsg, M_TEMP); if (opts != NULL) free(opts, M_TEMP); /* kernel_mount can return -1 on error. */ return ((error < 0) ? EDOOFUS : error); } #undef ERRMSGL static int vfs_mountroot_parse(struct sbuf *sb, struct mount *mpdevfs) { struct mount *mp; char *conf; int error; root_mount_mddev = -1; retry: conf = sbuf_data(sb); mp = TAILQ_NEXT(mpdevfs, mnt_list); error = (mp == NULL) ? 0 : EDOOFUS; root_mount_onfail = A_CONTINUE; while (mp == NULL) { error = parse_skipto(&conf, CC_NONWHITESPACE); if (error == PE_EOL) { parse_advance(&conf); continue; } if (error < 0) break; switch (parse_peek(&conf)) { case '#': error = parse_skipto(&conf, '\n'); break; case '.': error = parse_directive(&conf); break; default: error = parse_mount(&conf); if (error == -1) { printf("mountroot: invalid file system " "specification.\n"); error = 0; } break; } if (error < 0) break; /* Ignore any trailing garbage on the line. */ if (parse_peek(&conf) != '\n') { printf("mountroot: advancing to next directive...\n"); (void)parse_skipto(&conf, '\n'); } mp = TAILQ_NEXT(mpdevfs, mnt_list); } if (mp != NULL) return (0); /* * We failed to mount (a new) root. 
*/ switch (root_mount_onfail) { case A_CONTINUE: break; case A_PANIC: panic("mountroot: unable to (re-)mount root."); /* NOTREACHED */ case A_RETRY: goto retry; case A_REBOOT: kern_reboot(RB_NOSYNC); /* NOTREACHED */ } return (error); } static void vfs_mountroot_conf0(struct sbuf *sb) { char *s, *tok, *mnt, *opt; int error; - sbuf_printf(sb, ".onfail panic\n"); + sbuf_cat(sb, ".onfail panic\n"); sbuf_printf(sb, ".timeout %d\n", root_mount_timeout); if (boothowto & RB_ASKNAME) - sbuf_printf(sb, ".ask\n"); + sbuf_cat(sb, ".ask\n"); #ifdef ROOTDEVNAME if (boothowto & RB_DFLTROOT) sbuf_printf(sb, "%s\n", ROOTDEVNAME); #endif if (boothowto & RB_CDROM) { - sbuf_printf(sb, "cd9660:/dev/cd0 ro\n"); - sbuf_printf(sb, ".timeout 0\n"); - sbuf_printf(sb, "cd9660:/dev/cd1 ro\n"); + sbuf_cat(sb, "cd9660:/dev/cd0 ro\n"); + sbuf_cat(sb, ".timeout 0\n"); + sbuf_cat(sb, "cd9660:/dev/cd1 ro\n"); sbuf_printf(sb, ".timeout %d\n", root_mount_timeout); } s = kern_getenv("vfs.root.mountfrom"); if (s != NULL) { opt = kern_getenv("vfs.root.mountfrom.options"); tok = s; error = parse_token(&tok, &mnt); while (!error) { sbuf_printf(sb, "%s %s\n", mnt, (opt != NULL) ? opt : ""); free(mnt, M_TEMP); error = parse_token(&tok, &mnt); } if (opt != NULL) freeenv(opt); freeenv(s); } if (rootdevnames[0] != NULL) sbuf_printf(sb, "%s\n", rootdevnames[0]); if (rootdevnames[1] != NULL) sbuf_printf(sb, "%s\n", rootdevnames[1]); #ifdef ROOTDEVNAME if (!(boothowto & RB_DFLTROOT)) sbuf_printf(sb, "%s\n", ROOTDEVNAME); #endif if (!(boothowto & RB_ASKNAME)) - sbuf_printf(sb, ".ask\n"); + sbuf_cat(sb, ".ask\n"); } static int vfs_mountroot_readconf(struct thread *td, struct sbuf *sb) { static char buf[128]; struct nameidata nd; off_t ofs; ssize_t resid; int error, flags, len; NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, "/.mount.conf"); flags = FREAD; error = vn_open(&nd, &flags, 0, NULL); if (error) return (error); NDFREE_PNBUF(&nd); ofs = 0; len = sizeof(buf) - 1; while (1) { error = vn_rdwr(UIO_READ, nd.ni_vp, buf, len, ofs, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td); if (error) break; if (resid == len) break; buf[len - resid] = 0; sbuf_printf(sb, "%s", buf); ofs += len - resid; } VOP_UNLOCK(nd.ni_vp); vn_close(nd.ni_vp, FREAD, td->td_ucred, td); return (error); } static void vfs_mountroot_wait(void) { struct root_hold_token *h; struct thread *td; struct timeval lastfail; int curfail; TSENTER(); curfail = 0; lastfail.tv_sec = 0; eventratecheck(&lastfail, &curfail, 1); td = curthread; while (1) { g_waitidle(td); mtx_lock(&root_holds_mtx); if (TAILQ_EMPTY(&root_holds)) { mtx_unlock(&root_holds_mtx); break; } if (eventratecheck(&lastfail, &curfail, 1)) { printf("Root mount waiting for:"); TAILQ_FOREACH(h, &root_holds, list) printf(" %s", h->who); printf("\n"); } TSWAIT("root mount"); msleep(&root_holds, &root_holds_mtx, PZERO | PDROP, "roothold", hz); TSUNWAIT("root mount"); } g_waitidle(td); TSEXIT(); } static int vfs_mountroot_wait_if_neccessary(const char *fs, const char *dev) { int delay, timeout; /* * In case of ZFS and NFS we don't have a way to wait for * specific device. Also do the wait if the user forced that * behaviour by setting vfs.root_mount_always_wait=1. */ if (strcmp(fs, "zfs") == 0 || strstr(fs, "nfs") != NULL || dev[0] == '\0' || root_mount_always_wait != 0) { vfs_mountroot_wait(); return (0); } /* * Otherwise, no point in waiting if the device is already there. * Note that we must wait for GEOM to finish reconfiguring itself, * eg for geom_part(4) to finish tasting. 
*/ g_waitidle(curthread); if (parse_mount_dev_present(dev)) return (0); /* * No luck. Let's wait. This code looks weird, but it's that way * to behave exactly as it used to work before. */ vfs_mountroot_wait(); if (parse_mount_dev_present(dev)) return (0); printf("mountroot: waiting for device %s...\n", dev); delay = hz / 10; timeout = root_mount_timeout * hz; do { pause("rmdev", delay); timeout -= delay; } while (timeout > 0 && !parse_mount_dev_present(dev)); if (timeout <= 0) return (ENODEV); return (0); } void vfs_mountroot(void) { struct mount *mp; struct sbuf *sb; struct thread *td; time_t timebase; int error; mtx_assert(&Giant, MA_NOTOWNED); TSENTER(); td = curthread; sb = sbuf_new_auto(); vfs_mountroot_conf0(sb); sbuf_finish(sb); error = vfs_mountroot_devfs(td, &mp); while (!error) { error = vfs_mountroot_parse(sb, mp); if (!error) { vfs_mountroot_shuffle(td, mp); sbuf_clear(sb); error = vfs_mountroot_readconf(td, sb); sbuf_finish(sb); } } sbuf_delete(sb); /* * Iterate over all currently mounted file systems and use * the time stamp found to check and/or initialize the RTC. * Call inittodr() only once and pass it the largest of the * timestamps we encounter. */ timebase = 0; mtx_lock(&mountlist_mtx); mp = TAILQ_FIRST(&mountlist); while (mp != NULL) { if (mp->mnt_time > timebase) timebase = mp->mnt_time; mp = TAILQ_NEXT(mp, mnt_list); } mtx_unlock(&mountlist_mtx); inittodr(timebase); /* Keep prison0's root in sync with the global rootvnode. */ mtx_lock(&prison0.pr_mtx); prison0.pr_root = rootvnode; vref(prison0.pr_root); mtx_unlock(&prison0.pr_mtx); mtx_lock(&root_holds_mtx); atomic_store_rel_int(&root_mount_complete, 1); wakeup(&root_mount_complete); mtx_unlock(&root_holds_mtx); EVENTHANDLER_INVOKE(mountroot); TSEXIT(); } static struct mntarg * parse_mountroot_options(struct mntarg *ma, const char *options) { char *p; char *name, *name_arg; char *val, *val_arg; char *opts; if (options == NULL || options[0] == '\0') return (ma); p = opts = strdup(options, M_MOUNT); if (opts == NULL) { return (ma); } while((name = strsep(&p, ",")) != NULL) { if (name[0] == '\0') break; val = strchr(name, '='); if (val != NULL) { *val = '\0'; ++val; } if (strcmp(name, "rw") == 0 || strcmp(name, "noro") == 0) { /* * The first time we mount the root file system, * we need to mount 'ro', so We need to ignore * 'rw' and 'noro' mount options. */ continue; } name_arg = strdup(name, M_MOUNT); val_arg = NULL; if (val != NULL) val_arg = strdup(val, M_MOUNT); ma = mount_arg(ma, name_arg, val_arg, (val_arg != NULL ? -1 : 0)); } free(opts, M_MOUNT); return (ma); }
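
The proc_compare()/thread_compare() tail at the top of this section picks between two candidates with a two-bit TESTAB()/ONLYA/ONLYB/BOTH encoding whose macro definitions sit outside this hunk. A minimal standalone sketch of that selection pattern, assuming the conventional values (and normalizing the inputs to 0/1), is:

#include <stdio.h>

#define	TESTAB(a, b)	((((a) != 0) << 1) | ((b) != 0))
#define	ONLYA	2
#define	ONLYB	1
#define	BOTH	3

int
main(void)
{
	int runa = 1, runb = 0;		/* e.g. "is this process runnable?" */

	switch (TESTAB(runa, runb)) {
	case ONLYA:
		printf("prefer A: only A passes this test\n");
		break;
	case ONLYB:
		printf("prefer B: only B passes this test\n");
		break;
	case BOTH:
		printf("both pass; fall through to the next criterion\n");
		break;
	default:
		printf("neither passes\n");
		break;
	}
	return (0);
}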
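
tty_info() above formats its report into a fixed-size sbuf and pushes overflow out through a drain callback (sbuf_tty_drain()), and the hunks in this diff switch constant strings from sbuf_printf() to sbuf_cat() so they skip the format-string pass. Below is a userspace sketch of the same pattern built against FreeBSD's libsbuf (compile with -lsbuf); stdout_drain() is an illustrative name, and its contract mirrors sbuf_tty_drain(): return the number of bytes consumed, or a negative value on error.

#include <sys/sbuf.h>
#include <stdio.h>
#include <unistd.h>

static int
stdout_drain(void *arg, const char *data, int len)
{
	ssize_t n;

	(void)arg;
	n = write(STDOUT_FILENO, data, len);
	return (n < 0 ? -1 : (int)n);
}

int
main(void)
{
	struct sbuf sb;
	char buf[64];
	int load = 123;			/* 1.23, scaled by 100 as in tty_info() */

	(void)sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN);
	sbuf_set_drain(&sb, stdout_drain, NULL);

	sbuf_cat(&sb, "load: ");	/* constant text: no formatting needed */
	sbuf_printf(&sb, "%d.%02d\n", load / 100, load % 100);

	sbuf_finish(&sb);		/* flushes whatever is still buffered */
	sbuf_delete(&sb);
	return (0);
}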
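
parse_mount() splits each element of vfs.root.mountfrom at the first ':' into a VFS name and a device/path, and hands any trailing space-separated option string to parse_mountroot_options(), which splits it on ','. The sketch below walks that same shape in userspace over the example specifications printed by parse_dir_ask(); split_spec() is a hypothetical helper, not kernel code.

#include <stdio.h>
#include <string.h>

static void
split_spec(char *spec)
{
	char *dev, *opt, *opts;

	/* "<vfsname>:<path>": everything before the first ':' names the VFS. */
	dev = strchr(spec, ':');
	if (dev == NULL) {
		printf("invalid specification: %s\n", spec);
		return;
	}
	*dev++ = '\0';

	/* Anything after whitespace is a comma-separated option list. */
	opts = strchr(dev, ' ');
	if (opts != NULL)
		*opts++ = '\0';

	printf("fstype=%s device=%s\n", spec, dev);
	while (opts != NULL && (opt = strsep(&opts, ",")) != NULL)
		if (*opt != '\0')
			printf("  option: %s\n", opt);
}

int
main(void)
{
	/* Same shape as the examples printed by parse_dir_ask(). */
	char spec1[] = "ufs:/dev/da0s1a";
	char spec2[] = "cd9660:/dev/cd0 ro";
	char spec3[] = "zfs:zroot/ROOT/default";

	split_spec(spec1);
	split_spec(spec2);
	split_spec(spec3);
	return (0);
}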
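
Both parse_mount() and vfs_mountroot_wait_if_neccessary() handle a slow root device by polling in hz/10 slices until vfs.mountroot.timeout (3 seconds by default) runs out. A rough userspace analogue of that loop, with access(2) standing in for parse_mount_dev_present() and wait_for_device() as an illustrative name only:

#include <stdio.h>
#include <time.h>
#include <unistd.h>

static int
wait_for_device(const char *dev, int timeout_sec)
{
	struct timespec slice = { 0, 100 * 1000 * 1000 };	/* ~hz/10 */
	int remaining = timeout_sec * 10;

	while (access(dev, F_OK) != 0) {
		if (remaining-- <= 0)
			return (-1);		/* analogue of ENODEV */
		nanosleep(&slice, NULL);
	}
	return (0);
}

int
main(void)
{
	/* 3 seconds mirrors the root_mount_timeout default / tunable. */
	if (wait_for_device("/dev/da0s1a", 3) != 0)
		printf("mountroot: waiting for device timed out\n");
	else
		printf("device present\n");
	return (0);
}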